azure-ai-evaluation 1.0.0b5__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/_common/_experimental.py +4 -0
- azure/ai/evaluation/_common/math.py +62 -2
- azure/ai/evaluation/_common/rai_service.py +80 -29
- azure/ai/evaluation/_common/utils.py +50 -16
- azure/ai/evaluation/_constants.py +1 -0
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -0
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +13 -3
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +11 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +34 -10
- azure/ai/evaluation/_evaluate/_evaluate.py +59 -103
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +2 -1
- azure/ai/evaluation/_evaluate/_utils.py +6 -4
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +16 -17
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +60 -29
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +17 -5
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +4 -2
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +56 -50
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +79 -34
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +73 -34
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +74 -33
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -34
- azure/ai/evaluation/_evaluators/_eci/_eci.py +28 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +57 -26
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +13 -15
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +68 -30
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +17 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +10 -8
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -2
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +6 -2
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +10 -6
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +6 -2
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +6 -2
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +6 -2
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +57 -34
- azure/ai/evaluation/_evaluators/_qa/_qa.py +25 -37
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +63 -29
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +76 -161
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +24 -25
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +65 -67
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +26 -20
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +74 -40
- azure/ai/evaluation/_exceptions.py +2 -0
- azure/ai/evaluation/_model_configurations.py +65 -14
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +15 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +25 -34
- azure/ai/evaluation/simulator/_constants.py +11 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +16 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +11 -1
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +3 -1
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +8 -4
- azure/ai/evaluation/simulator/_simulator.py +51 -45
- azure/ai/evaluation/simulator/_utils.py +25 -7
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/METADATA +232 -324
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/RECORD +60 -61
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py

@@ -2,114 +2,31 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

-import json
 import logging
-import math
 import os
-from typing import
+from typing import Dict, List, Union
+from typing_extensions import overload, override

-from
-from
-
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
-from ..._common.math import list_mean_nan_safe
-from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score
+from azure.ai.evaluation._evaluators._common._base_prompty_eval import PromptyEvaluatorBase
+from azure.ai.evaluation._model_configurations import Conversation

 logger = logging.getLogger(__name__)

-try:
-    from .._user_agent import USER_AGENT
-except ImportError:
-    USER_AGENT = "None"

+class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+    """
+    Evaluates retrieval score for a given query and context or a multi-turn conversation, including reasoning.

-
-
-    _PROMPTY_FILE = "retrieval.prompty"
-    _LLM_CALL_TIMEOUT = 600
-    _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
+    The retrieval measure assesses the AI system's performance in retrieving information
+    for additional context (e.g. a RAG scenario).

-
-        prompty_model_config = construct_prompty_model_config(
-            validate_model_config(model_config),
-            self._DEFAULT_OPEN_API_VERSION,
-            USER_AGENT,
-        )
+    Retrieval scores range from 1 to 5, with 1 being the worst and 5 being the best.

-
-
-
-
-
-        if conversation:
-            # Extract queries, responses and contexts from conversation
-            queries = []
-            responses = []
-            contexts = []
-
-            conversation = conversation.get("messages", None)
-
-            for each_turn in conversation:
-                role = each_turn["role"]
-                if role == "user":
-                    queries.append(each_turn["content"])
-                elif role == "assistant":
-                    responses.append(each_turn["content"])
-                    if "context" in each_turn:
-                        if "citations" in each_turn["context"]:
-                            citations = json.dumps(each_turn["context"]["citations"])
-                            contexts.append(citations)
-                        elif isinstance(each_turn["context"], str):
-                            contexts.append(each_turn["context"])
-
-            # Evaluate each turn
-            per_turn_scores = []
-            per_turn_reasons = []
-            for turn_num, turn_query in enumerate(queries):
-                try:
-                    if turn_num >= len(queries):
-                        turn_query = ""
-                    context = contexts[turn_num] if turn_num < len(contexts) else ""
-
-                    llm_output = await self._flow(
-                        query=turn_query, context=context, timeout=self._LLM_CALL_TIMEOUT, **kwargs
-                    )
-                    score, reason = parse_quality_evaluator_reason_score(llm_output)
-                    per_turn_scores.append(score)
-                    per_turn_reasons.append(reason)
-
-                except Exception as e:  # pylint: disable=broad-exception-caught
-                    logger.warning(
-                        "Evaluator %s failed for turn %s with exception: %s", self.__class__.__name__, turn_num + 1, e
-                    )
-
-                    per_turn_scores.append(math.nan)
-                    per_turn_reasons.append("")
-
-            mean_per_turn_score = list_mean_nan_safe(per_turn_scores)
-
-            return {
-                "retrieval": mean_per_turn_score,
-                "gpt_retrieval": mean_per_turn_score,
-                "evaluation_per_turn": {
-                    "gpt_retrieval": per_turn_scores,
-                    "retrieval": per_turn_scores,
-                    "retrieval_reason": per_turn_reasons,
-                },
-            }
-        llm_output = await self._flow(query=query, context=context, timeout=self._LLM_CALL_TIMEOUT, **kwargs)
-        score, reason = parse_quality_evaluator_reason_score(llm_output)
-
-        return {
-            "retrieval": score,
-            "retrieval_reason": reason,
-            "gpt_retrieval": score,
-        }
-
-
-class RetrievalEvaluator:
-    """
-    Initialize an evaluator configured for a specific Azure OpenAI model.
+    High retrieval scores indicate that the AI system has successfully extracted and ranked
+    the most relevant information at the top, without introducing bias from external knowledge
+    and ignoring factual correctness. Conversely, low retrieval scores suggest that the AI system
+    has failed to surface the most relevant context chunks at the top of the list
+    and/or introduced bias and ignored factual correctness.

     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
@@ -117,45 +34,68 @@ class RetrievalEvaluator:
     :return: A function that evaluates and generates metrics for "chat" scenario.
     :rtype: Callable

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        result = chat_eval(conversation=conversation)
-
-    **Output format**
-
-    .. code-block:: python
-
-        {
-            "gpt_retrieval": 3.0,
-            "retrieval": 3.0,
-            "evaluation_per_turn": {
-                "gpt_retrieval": [1.0, 2.0, 3.0],
-                "retrieval": [1.0, 2.0, 3.0],
-                "retrieval_reason": ["<reasoning for score 1>", "<reasoning for score 2>", "<reasoning for score 3>"]
-            }
-        }
-
-    Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
-    To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
-    however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START retrieval_evaluator]
+            :end-before: [END retrieval_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a RetrievalEvaluator.
+
+    .. note::
+
+        To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+        To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
+        however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
     """

-
-
+    _PROMPTY_FILE = "retrieval.prompty"
+    _RESULT_KEY = "retrieval"
+
+    id = "azureml://registries/azureml/models/Retrieval-Evaluator/versions/1"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    @override
+    def __init__(self, model_config):  # pylint: disable=super-init-not-called
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+
+    @overload
+    def __call__(
+        self,
+        *,
+        query: str,
+        context: str,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluates retrieval for a given a query and context
+
+        :keyword query: The query to be evaluated. Mutually exclusive with `conversation` parameter.
+        :paramtype query: Optional[str]
+        :keyword context: The context to be evaluated. Mutually exclusive with `conversation` parameter.
+        :paramtype context: Optional[str]
+        :return: The scores for Chat scenario.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
+    @overload
+    def __call__(
+        self,
+        *,
+        conversation: Conversation,
+    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+        """Evaluates retrieval for a for a multi-turn evaluation. If the conversation has more than one turn,
+        the evaluator will aggregate the results of each turn.
+
+        :keyword conversation: The conversation to be evaluated.
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The scores for Chat scenario.
+        :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+        """

-
+    @override
+    def __call__(self, *args, **kwargs):  # pylint: disable=docstring-missing-param
         """Evaluates retrieval score chat scenario. Accepts either a query and context for a single evaluation,
         or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
         the evaluator will aggregate the results of each turn.
@@ -167,31 +107,6 @@ class RetrievalEvaluator:
         :keyword conversation: The conversation to be evaluated.
         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
         :return: The scores for Chat scenario.
-        :rtype: :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+        :rtype: :rtype: Dict[str, Union[float, Dict[str, List[str, float]]]]
         """
-
-            msg = "Either a pair of 'query'/'context' or 'conversation' must be provided."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                blame=ErrorBlame.USER_ERROR,
-                category=ErrorCategory.MISSING_FIELD,
-                target=ErrorTarget.RETRIEVAL_EVALUATOR,
-            )
-
-        if (query or context) and conversation:
-            msg = "Either a pair of 'query'/'context' or 'conversation' must be provided, but not both."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                blame=ErrorBlame.USER_ERROR,
-                category=ErrorCategory.INVALID_VALUE,
-                target=ErrorTarget.RETRIEVAL_EVALUATOR,
-            )
-
-        return async_run_allowing_running_loop(
-            self._async_evaluator, query=query, context=context, conversation=conversation, **kwargs
-        )
-
-    def _to_async(self):
-        return self._async_evaluator
+        return super().__call__(*args, **kwargs)
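In 1.0.1 the retrieval evaluator is a PromptyEvaluatorBase subclass constructed from a model_config and called through the query/context or conversation overloads shown above. A minimal usage sketch, assuming the package-level RetrievalEvaluator export; the endpoint, key, deployment, and sample turns below are illustrative placeholders, not taken from this diff:

    from azure.ai.evaluation import RetrievalEvaluator

    # Placeholder Azure OpenAI configuration; substitute real endpoint/deployment values.
    model_config = {
        "azure_endpoint": "https://<resource>.openai.azure.com",
        "api_key": "<api-key>",
        "azure_deployment": "<deployment-name>",
    }

    retrieval_eval = RetrievalEvaluator(model_config=model_config)

    # Single-turn overload: a query plus the retrieved context.
    single_turn = retrieval_eval(
        query="What is the capital of Japan?",
        context="Tokyo is the capital of Japan.",
    )
    # Per the docstring, both "retrieval" and the legacy "gpt_retrieval" keys are returned,
    # along with a reasoning string.

    # Conversation overload: per-turn scores are aggregated, with per-turn detail under
    # "evaluation_per_turn".
    conversation = {
        "messages": [
            {"role": "user", "content": "What is the capital of Japan?"},
            {"role": "assistant", "content": "Tokyo.", "context": "Tokyo is the capital of Japan."},
        ]
    }
    multi_turn = retrieval_eval(conversation=conversation)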
azure/ai/evaluation/_evaluators/_rouge/_rouge.py

@@ -6,10 +6,9 @@ from enum import Enum
 from promptflow._utils.async_utils import async_run_allowing_running_loop

 from azure.ai.evaluation._vendor.rouge_score import rouge_scorer
-from azure.core import CaseInsensitiveEnumMeta


-class RougeType(
+class RougeType(Enum):
     """
     Enumeration of ROUGE (Recall-Oriented Understudy for Gisting Evaluation) types.
     """
@@ -38,8 +37,8 @@ class _AsyncRougeScoreEvaluator:
         self._rouge_type = rouge_type

     async def __call__(self, *, ground_truth: str, response: str, **kwargs):
-        scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type])
-        metrics = scorer.score(ground_truth, response)[self._rouge_type]
+        scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type.value])
+        metrics = scorer.score(ground_truth, response)[self._rouge_type.value]
         return {
             "rouge_precision": metrics.precision,
             "rouge_recall": metrics.recall,
@@ -49,34 +48,34 @@ class _AsyncRougeScoreEvaluator:

 class RougeScoreEvaluator:
     """
-
+    Calculates the ROUGE score for a given response and ground truth.

-    ROUGE (Recall-Oriented Understudy for Gisting Evaluation)
-
-    ROUGE
-
-
+    The ROUGE score (Recall-Oriented Understudy for Gisting Evaluation) evaluates the similarity between the
+    generated text and reference text based on n-gram overlap, including ROUGE-N (unigram, bigram, etc.), and
+    ROUGE-L (longest common subsequence). It calculates precision, recall, and F1 scores to capture how well
+    the generated text matches the reference text. Rouge type options are "rouge1" (Unigram overlap), "rouge2"
+    (Bigram overlap), "rouge3" (Trigram overlap), "rouge4" (4-gram overlap), "rouge5" (5-gram overlap), "rougeL"
+    (L-graph overlap)

-
+    Use the ROUGE score when you need a robust evaluation metric for text summarization, machine translation, and
+    other natural language processing tasks, especially when focusing on recall and the ability to capture relevant
+    information from the reference text.

-
+    ROUGE scores range from 0 to 1, with higher scores indicating better quality.

-
-        result = eval_fn(
-            response="Tokyo is the capital of Japan.",
-            ground_truth="The capital of Japan is Tokyo.")
+    .. admonition:: Example:

-
-
-
-
-
-
-            "rouge_recall": 1.0,
-            "rouge_f1_score": 1.0
-        }
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START rouge_score_evaluator]
+            :end-before: [END rouge_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.
     """

+    id = "azureml://registries/azureml/models/Rouge-Score-Evaluator/versions/3"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     def __init__(self, rouge_type: RougeType):
         self._async_evaluator = _AsyncRougeScoreEvaluator(rouge_type)

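The `.value` fix above is the functional change: RougeType is now a plain Enum, so the member's string value must be passed to the vendored rouge_scorer rather than the member itself. A short sketch of calling the evaluator described in the new docstring; the RougeType.ROUGE_L member name is taken from the package's public enum rather than from this hunk, and the sample strings come from the removed docstring example:

    from azure.ai.evaluation import RougeScoreEvaluator, RougeType

    # Pick the ROUGE variant; "rougeL" measures longest-common-subsequence overlap.
    rouge_eval = RougeScoreEvaluator(rouge_type=RougeType.ROUGE_L)

    result = rouge_eval(
        response="Tokyo is the capital of Japan.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    # Returns "rouge_precision", "rouge_recall", and "rouge_f1_score", each in [0, 1].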
azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py

@@ -1,22 +1,26 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import
-from typing_extensions import override
+from typing import List, Union, Dict
+from typing_extensions import overload, override

 from azure.ai.evaluation._common._experimental import experimental
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
+from azure.ai.evaluation._model_configurations import Conversation


 @experimental
-class GroundednessProEvaluator(RaiServiceEvaluatorBase):
+class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
     """
-
-
+    Evaluates service-based groundedness score for a given response, context, and query or a multi-turn conversation,
+    including reasoning.

-
-
+    The groundedness measure calls Azure AI Evaluation service to assess how well the AI-generated answer is grounded
+    in the source context. Even if the responses from LLM are factually correct, they'll be considered ungrounded if
+    they can't be verified against the provided sources (such as your input source or your database).
+
+    Service-based groundedness scores are boolean values, where True indicates that the response is grounded.

     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
@@ -26,64 +30,24 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase):
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any

-
-
-    .. code-block:: python
-
-        azure_ai_project = {
-            "subscription_id": "<subscription_id>",
-            "resource_group_name": "<resource_group_name>",
-            "project_name": "<project_name>",
-        }
-        credential = DefaultAzureCredential()
-
-        eval_fn = GroundednessProEvaluator(azure_ai_project, credential)
-        result = eval_fn(query="What's the capital of France", response="Paris", context="Paris.")
-
-    **Output format**
-
-    .. code-block:: python
-
-        {
-            "groundedness_pro_label": True,
-            "reason": "'All Contents are grounded"
-        }
-
-    **Usage with conversation input**
-
-    .. code-block:: python
+    .. admonition:: Example:

-
-
-
-
-
-
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START groundedness_pro_evaluator]
+            :end-before: [END groundedness_pro_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a GroundednessProEvaluator with a query, response, and context.

-
-        conversation = {
-            "messages": [
-                {"role": "user", "content": "What is the capital of France?"},
-                {"role": "assistant", "content": "Paris.", "context": "Paris."}
-                {"role": "user", "content": "What is the capital of Germany?"},
-                {"role": "assistant", "content": "Berlin.", "context": "Berlin."}
-            ]
-        }
-        result = eval_fn(conversation=conversation)
+    .. note::

-
-
-    .. code-block:: python
-
-        {
-            "groundedness_pro_label": 1.0,
-            "evaluation_per_turn": {
-                "groundedness_pro_label": [True, True],
-                "groundedness_pro_reason": ["All contents are grounded", "All contents are grounded"]
-            }
-        }
+        If this evaluator is supplied to the `evaluate` function, the aggregated metric
+        for the groundedness pro label will be "groundedness_pro_passing_rate".
     """

+    id = "azureml://registries/azureml/models/Groundedness-Pro-Evaluator/versions/1"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     @override
     def __init__(
         self,
@@ -91,7 +55,7 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase):
         azure_ai_project,
         **kwargs,
     ):
-        self._passing_score =
+        self._passing_score = 5  # TODO update once the binarization PR is merged
         self._output_prefix = "groundedness_pro"
         super().__init__(
             eval_metric=EvaluationMetrics.GROUNDEDNESS,
@@ -100,14 +64,48 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase):
             **kwargs,
         )

-    @
+    @overload
+    def __call__(
+        self,
+        *,
+        response: str,
+        context: str,
+        query: str,
+    ) -> Dict[str, Union[str, bool]]:
+        """Evaluate groundedness for a given query/response/context
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword context: The context to be evaluated.
+        :paramtype context: str
+        :keyword query: The query to be evaluated.
+        :paramtype query: Optional[str]
+        :return: The relevance score.
+        :rtype: Dict[str, Union[str, bool]]
+        """
+
+    @overload
     def __call__(
         self,
         *,
-
-
-
-
+        conversation: Conversation,
+    ) -> Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]:
+        """Evaluate groundedness for a conversation for a multi-turn evaluation. If the conversation has
+        more than one turn, the evaluator will aggregate the results of each turn, with the per-turn results
+        available in the output under the "evaluation_per_turn" key.
+
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The relevance score.
+        :rtype: Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
         **kwargs,
     ):
         """Evaluate groundedness. Accepts either a query, response and context for a single-turn evaluation, or a
@@ -128,7 +126,7 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase):
         :return: The relevance score.
         :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]]
         """
-        return super().__call__(
+        return super().__call__(*args, **kwargs)

     @override
     async def _do_eval(self, eval_input: Dict):
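The new overloads mirror the usage that the removed docstring demonstrated. A sketch of the single-turn call, reusing the removed example's placeholders for the Azure AI project; the output key names are assumed to follow the "groundedness_pro" prefix set in __init__:

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import GroundednessProEvaluator

    # Placeholder project details; substitute your own subscription, resource group, and project.
    azure_ai_project = {
        "subscription_id": "<subscription_id>",
        "resource_group_name": "<resource_group_name>",
        "project_name": "<project_name>",
    }

    groundedness_pro_eval = GroundednessProEvaluator(
        azure_ai_project=azure_ai_project,
        credential=DefaultAzureCredential(),
    )

    # Single-turn overload: query, response, and the grounding context.
    result = groundedness_pro_eval(
        query="What's the capital of France?",
        response="Paris",
        context="Paris is the capital of France.",
    )
    # Expect a boolean "groundedness_pro_label" plus a reason string; when run through
    # `evaluate`, the aggregate appears as "groundedness_pro_passing_rate" per the note above.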
azure/ai/evaluation/_evaluators/_similarity/_similarity.py

@@ -80,36 +80,42 @@ class _AsyncSimilarityEvaluator:

 class SimilarityEvaluator:
     """
-
+    Evaluates similarity score for a given query, response, and ground truth or a multi-turn conversation.

-
-
-
+    The similarity measure evaluates the likeness between a ground truth sentence (or document) and the
+    AI model's generated prediction. This calculation involves creating sentence-level embeddings for both
+    the ground truth and the model's prediction, which are high-dimensional vector representations capturing
+    the semantic meaning and context of the sentences.

-
+    Use it when you want an objective evaluation of an AI model's performance, particularly in text generation
+    tasks where you have access to ground truth responses. Similarity enables you to assess the generated
+    text's semantic alignment with the desired content, helping to gauge the model's quality and accuracy.

-
+    Similarity scores range from 1 to 5, with 1 being the least similar and 5 being the most similar.

-
-
-
-            response="The capital of Japan is Tokyo.",
-            ground_truth="Tokyo is Japan's capital.")
+    :param model_config: Configuration for the Azure OpenAI model.
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]

-
+    .. admonition:: Example:

-
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START rouge_score_evaluator]
+            :end-before: [END rouge_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.

-
-            "similarity": 3.0,
-            "gpt_similarity": 3.0,
-        }
+    .. note::

-
-
-
+        To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+        To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
+        however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
     """

+    id = "azureml://registries/azureml/models/Similarity-Evaluator/versions/3"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     def __init__(self, model_config):
         self._async_evaluator = _AsyncSimilarityEvaluator(model_config)

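SimilarityEvaluator keeps its callable-wrapper shape but now documents the query/response/ground-truth inputs and publishes a cloud `id`. (Note that the Example admonition added in this hunk still points at the rouge_score_evaluator sample markers.) A brief sketch, assuming the package-level export and a placeholder model configuration; the response and ground-truth strings are the ones from the removed docstring example:

    from azure.ai.evaluation import SimilarityEvaluator

    # Placeholder Azure OpenAI configuration; substitute real endpoint/deployment values.
    model_config = {
        "azure_endpoint": "https://<resource>.openai.azure.com",
        "api_key": "<api-key>",
        "azure_deployment": "<deployment-name>",
    }

    similarity_eval = SimilarityEvaluator(model_config=model_config)

    result = similarity_eval(
        query="What is the capital of Japan?",
        response="The capital of Japan is Tokyo.",
        ground_truth="Tokyo is Japan's capital.",
    )
    # Returns "similarity" and the legacy "gpt_similarity" key, each on the 1-5 scale.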