azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation has been flagged as potentially problematic by the registry.
- azure/ai/evaluation/__init__.py +83 -14
- azure/ai/evaluation/_aoai/__init__.py +10 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
- azure/ai/evaluation/_aoai/label_grader.py +68 -0
- azure/ai/evaluation/_aoai/python_grader.py +86 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +204 -0
- azure/ai/evaluation/_azure/_envs.py +207 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +129 -0
- azure/ai/evaluation/_common/__init__.py +9 -1
- azure/ai/evaluation/_common/constants.py +124 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +166 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +66 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +578 -69
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +505 -27
- azure/ai/evaluation/_constants.py +148 -0
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +899 -0
- azure/ai/evaluation/_converters/_models.py +467 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +83 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
- azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
- azure/ai/evaluation/_evaluate/_utils.py +237 -42
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
- azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +427 -29
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
- azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
- azure/ai/evaluation/_exceptions.py +24 -1
- azure/ai/evaluation/_http_utils.py +7 -5
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
- azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
- azure/ai/evaluation/_version.py +2 -1
- azure/ai/evaluation/red_team/__init__.py +22 -0
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
- azure/ai/evaluation/red_team/_default_converter.py +21 -0
- azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
- azure/ai/evaluation/red_team/_red_team.py +1717 -0
- azure/ai/evaluation/red_team/_red_team_result.py +661 -0
- azure/ai/evaluation/red_team/_result_processor.py +1708 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
- azure/ai/evaluation/red_team/_utils/constants.py +72 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
- azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
- azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
- azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
- azure/ai/evaluation/simulator/_constants.py +1 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
- azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
- azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
- azure/ai/evaluation/simulator/_simulator.py +43 -19
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/METADATA +366 -27
- azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info/licenses}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_meteor/_meteor.py:

@@ -1,38 +1,17 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Dict
+
 from nltk.translate.meteor_score import meteor_score
-from
+from typing_extensions import overload, override
 
 from azure.ai.evaluation._common.utils import nltk_tokenize, ensure_nltk_data_downloaded
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
+from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 
 
-class
-    def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5):
-        self._alpha = alpha
-        self._beta = beta
-        self._gamma = gamma
-
-        ensure_nltk_data_downloaded()
-
-    async def __call__(self, *, ground_truth: str, response: str, **kwargs):
-        reference_tokens = nltk_tokenize(ground_truth)
-        hypothesis_tokens = nltk_tokenize(response)
-
-        score = meteor_score(
-            [reference_tokens],
-            hypothesis_tokens,
-            alpha=self._alpha,
-            beta=self._beta,
-            gamma=self._gamma,
-        )
-
-        return {
-            "meteor_score": score,
-        }
-
-
-class MeteorScoreEvaluator:
+class MeteorScoreEvaluator(EvaluatorBase):
     """
     Calculates the METEOR score for a given response and ground truth.
 
@@ -54,6 +33,8 @@ class MeteorScoreEvaluator:
     :type beta: float
     :param gamma: The METEOR score gamma parameter. Default is 0.5.
     :type gamma: float
+    :param threshold: The threshold for the METEOR score evaluator. Default is 0.5.
+    :type threshold: float
 
     .. admonition:: Example:
 
@@ -63,15 +44,75 @@ class MeteorScoreEvaluator:
         :language: python
         :dedent: 8
         :caption: Initialize and call a MeteorScoreEvaluator with alpha of 0.8.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START meteor_score_evaluator]
+            :end-before: [END meteor_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call MeteorScoreEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    .. admonition:: Example with Threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_meteor_score_evaluator]
+            :end-before: [END threshold_meteor_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call a MeteorScoreEvaluator.
     """
 
-    id = "
+    id = "azureai://built-in/evaluators/meteor_score"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-
-
+    @override
+    def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5, *, threshold: float = 0.5):
+        self._alpha = alpha
+        self._beta = beta
+        self._gamma = gamma
+        ensure_nltk_data_downloaded()
+        self._threshold = threshold
+        self._higher_is_better = True
+        super().__init__(threshold=threshold, _higher_is_better=self._higher_is_better)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a meteor score evaluation result.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        reference_tokens = nltk_tokenize(ground_truth)
+        hypothesis_tokens = nltk_tokenize(response)
+        score = meteor_score(
+            [reference_tokens],
+            hypothesis_tokens,
+            alpha=self._alpha,
+            beta=self._beta,
+            gamma=self._gamma,
+        )
+        binary_result = False
+        if self._higher_is_better:
+            if score >= self._threshold:
+                binary_result = True
+        else:
+            if score <= self._threshold:
+                binary_result = True
+        return {
+            "meteor_score": score,
+            "meteor_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
+            "meteor_threshold": self._threshold,
+        }
 
-
+    @overload  # type: ignore
+    def __call__(self, *, ground_truth: str, response: str) -> Dict[str, float]:
         """
         Evaluate the METEOR score between the response and the ground truth.
 
@@ -82,9 +123,21 @@ class MeteorScoreEvaluator:
         :return: The METEOR score.
         :rtype: Dict[str, float]
        """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
-        )
 
-
-
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate the METEOR score between the response and the ground truth.
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The METEOR score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
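For reference, a minimal usage sketch of the reworked evaluator. The keyword names and the returned keys come from the diff above; the sample strings and printed fields are illustrative only.

```python
from azure.ai.evaluation import MeteorScoreEvaluator

# alpha/beta/gamma still tune METEOR itself; the new threshold keyword
# drives the added pass/fail output.
evaluator = MeteorScoreEvaluator(alpha=0.8, threshold=0.5)

# __call__ remains synchronous and keyword-only, per the overload above.
result = evaluator(
    ground_truth="The capital of Japan is Tokyo.",
    response="Tokyo is the capital of Japan.",
)

# 1.13.x returns the raw score plus the new pass/fail result and threshold.
print(result["meteor_score"], result["meteor_result"], result["meteor_threshold"])
```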
azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py:

@@ -25,9 +25,9 @@ class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
 
     :param credential: The credential required for connecting to the Azure AI project.
     :type credential: ~azure.core.credentials.TokenCredential
-    :param azure_ai_project: The
-        resource group, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
 
     .. admonition:: Example:
 
@@ -37,21 +37,39 @@ class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
         :language: python
         :dedent: 8
         :caption: Initialize and call a ProtectedMaterialEvaluator.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START protected_material_evaluator]
+            :end-before: [END protected_material_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call ProtectedMaterialEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     """
 
-    id = "
+    id = "azureai://built-in/evaluators/protected_material"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
     @override
     def __init__(
         self,
         credential,
         azure_ai_project,
+        **kwargs,
     ):
+        # Set default for evaluate_query if not provided
+        if "evaluate_query" not in kwargs:
+            kwargs["evaluate_query"] = True
+
         super().__init__(
             eval_metric=EvaluationMetrics.PROTECTED_MATERIAL,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            **kwargs,
         )
 
     @overload
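A sketch of how the widened azure_ai_project parameter can be used. The endpoint URL is a placeholder, DefaultAzureCredential is one possible TokenCredential, and the query/response keywords are assumed from the evaluator's existing call signature rather than shown in this hunk.

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ProtectedMaterialEvaluator

# The project can now be given as a project-endpoint string instead of an
# AzureAIProject dict (placeholder resource/project names).
project_endpoint = "https://<resource_name>.services.ai.azure.com/api/projects/<project_name>"

evaluator = ProtectedMaterialEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project=project_endpoint,
)

# evaluate_query now defaults to True, so the query is scored as well.
result = evaluator(
    query="Share the full lyrics of a popular song.",
    response="I can't provide copyrighted lyrics, but here is a summary.",
)
print(result)
```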
azure/ai/evaluation/_evaluators/_qa/_qa.py:

@@ -2,10 +2,11 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from
-from typing import Callable, Dict, List, Union
+from typing import Union
 
-from
+from typing_extensions import overload, override
+
+from azure.ai.evaluation._evaluators._common import MultiEvaluatorBase
 
 from .._coherence import CoherenceEvaluator
 from .._f1_score import F1ScoreEvaluator
@@ -15,13 +16,25 @@ from .._relevance import RelevanceEvaluator
 from .._similarity import SimilarityEvaluator
 
 
-class QAEvaluator:
+class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
     """
     Initialize a question-answer evaluator configured for a specific Azure OpenAI model.
 
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
+    :param groundedness_threshold: The threshold for groundedness evaluation. Default is 3.
+    :type groundedness_threshold: int
+    :param relevance_threshold: The threshold for relevance evaluation. Default is 3.
+    :type relevance_threshold: int
+    :param coherence_threshold: The threshold for coherence evaluation. Default is 3.
+    :type coherence_threshold: int
+    :param fluency_threshold: The threshold for fluency evaluation. Default is 3.
+    :type fluency_threshold: int
+    :param similarity_threshold: The threshold for similarity evaluation. Default is 3.
+    :type similarity_threshold: int
+    :param f1_score_threshold: The threshold for F1 score evaluation. Default is 0.5.
+    :type f1_score_threshold: float
     :return: A callable class that evaluates and generates metrics for "question-answering" scenario.
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
@@ -35,6 +48,25 @@ class QAEvaluator:
         :dedent: 8
         :caption: Initialize and call a QAEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START qa_evaluator]
+            :end-before: [END qa_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call QAEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    .. admonition:: Example with Threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_qa_evaluator]
+            :end-before: [END threshold_qa_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call a QAEvaluator.
+
     .. note::
 
         To align with our support of a diverse set of models, keys without the `gpt_` prefix has been added.
@@ -42,22 +74,66 @@ class QAEvaluator:
         however, it is recommended to use the new keys moving forward as the old keys will be deprecated in the future.
     """
 
-    id = "qa"
+    id = "azureai://built-in/evaluators/qa"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-    def __init__(
-        self
-
-
-
-
-
-
-
-
+    def __init__(
+        self,
+        model_config,
+        *,
+        groundedness_threshold: int = 3,
+        relevance_threshold: int = 3,
+        coherence_threshold: int = 3,
+        fluency_threshold: int = 3,
+        similarity_threshold: int = 3,
+        f1_score_threshold: float = 0.5,
+        **kwargs,
+    ):
+        # Type checking
+        for name, value in [
+            ("groundedness_threshold", groundedness_threshold),
+            ("relevance_threshold", relevance_threshold),
+            ("coherence_threshold", coherence_threshold),
+            ("fluency_threshold", fluency_threshold),
+            ("similarity_threshold", similarity_threshold),
+            ("f1_score_threshold", f1_score_threshold),
+        ]:
+            if not isinstance(value, (int, float)):
+                raise TypeError(f"{name} must be an int or float, got {type(value)}")
+
+        evaluators = [
+            GroundednessEvaluator(model_config, threshold=groundedness_threshold),
+            RelevanceEvaluator(model_config, threshold=relevance_threshold),
+            CoherenceEvaluator(model_config, threshold=coherence_threshold),
+            FluencyEvaluator(model_config, threshold=fluency_threshold),
+            SimilarityEvaluator(model_config, threshold=similarity_threshold),
+            F1ScoreEvaluator(threshold=f1_score_threshold),
         ]
+        super().__init__(evaluators=evaluators, **kwargs)
+
+    @overload  # type: ignore
+    def __call__(self, *, query: str, response: str, context: str, ground_truth: str):
+        """
+        Evaluates question-answering scenario.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword context: The context to be evaluated.
+        :paramtype context: str
+        :keyword ground_truth: The ground truth to be evaluated.
+        :paramtype ground_truth: str
+        :return: The scores for QA scenario.
+        :rtype: Dict[str, Union[str, float]]
+        """
 
-
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
         """
         Evaluates question-answering scenario.
 
@@ -72,22 +148,5 @@ class QAEvaluator:
         :return: The scores for QA scenario.
         :rtype: Dict[str, Union[str, float]]
         """
-
-
-        with ThreadPoolExecutor() as executor:
-            futures = {
-                executor.submit(
-                    evaluator, query=query, response=response, context=context, ground_truth=ground_truth, **kwargs
-                ): evaluator
-                for evaluator in self._evaluators
-            }
-
-            # Collect results as they complete
-            for future in as_completed(futures):
-                results.update(future.result())
-        else:
-            for evaluator in self._evaluators:
-                result = evaluator(query=query, response=response, context=context, ground_truth=ground_truth, **kwargs)
-                results.update(result)
-
-        return results
+
+        return super().__call__(*args, **kwargs)
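A minimal sketch of the new per-metric thresholds, assuming an AzureOpenAIModelConfiguration with placeholder endpoint, deployment, and key; the sample inputs are illustrative.

```python
from azure.ai.evaluation import AzureOpenAIModelConfiguration, QAEvaluator

# Placeholder model configuration; substitute real values.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<resource>.openai.azure.com",
    azure_deployment="<deployment_name>",
    api_key="<api_key>",
)

# Each composed evaluator now takes its own pass/fail threshold.
qa_eval = QAEvaluator(
    model_config,
    groundedness_threshold=3,
    relevance_threshold=3,
    f1_score_threshold=0.5,
)

result = qa_eval(
    query="What is the boiling point of water at sea level?",
    response="Water boils at 100 degrees Celsius at sea level.",
    context="At standard atmospheric pressure, water boils at 100 °C.",
    ground_truth="100 degrees Celsius.",
)
print(result)
```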
azure/ai/evaluation/_evaluators/_relevance/_relevance.py:

@@ -1,15 +1,21 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-
+import logging
+import math
 import os
 from typing import Dict, Union, List
 
 from typing_extensions import overload, override
 
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from ..._common.utils import reformat_conversation_history, reformat_agent_response
+
 from azure.ai.evaluation._model_configurations import Conversation
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 
+logger = logging.getLogger(__name__)
+
 
 class RelevanceEvaluator(PromptyEvaluatorBase):
     """
@@ -27,6 +33,13 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
+    :param threshold: The threshold for the relevance evaluator. Default is 3.
+    :type threshold: int
+    :param credential: The credential for authenticating to Azure AI service.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool
 
     .. admonition:: Example:
 
@@ -37,6 +50,25 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
         :dedent: 8
         :caption: Initialize and call a RelevanceEvaluator with a query, response, and context.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START relevance_evaluator]
+            :end-before: [END relevance_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call RelevanceEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    .. admonition:: Example with Threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_relevance_evaluator]
+            :end-before: [END threshold_relevance_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call a RelevanceEvaluator with a query, response, and context.
+
     .. note::
 
         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
@@ -48,14 +80,22 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
     _PROMPTY_FILE = "relevance.prompty"
     _RESULT_KEY = "relevance"
 
-    id = "
+    id = "azureai://built-in/evaluators/relevance"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config):
+    def __init__(self, model_config, *, credential=None, threshold=3, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-        super().__init__(
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=threshold,
+            credential=credential,
+            _higher_is_better=True,
+            **kwargs,
+        )
 
     @overload
     def __call__(
@@ -112,3 +152,59 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
         :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]]
         """
         return super().__call__(*args, **kwargs)
+
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+        """Do a relevance evaluation.
+
+        :param eval_input: The input to the evaluator. Expected to contain
+            whatever inputs are needed for the _flow method, including context
+            and other fields depending on the child class.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        if "query" not in eval_input and "response" not in eval_input:
+            raise EvaluationException(
+                message="Only text conversation inputs are supported.",
+                internal_message="Only text conversation inputs are supported.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.CONVERSATION,
+            )
+        if not isinstance(eval_input["query"], str):
+            eval_input["query"] = reformat_conversation_history(eval_input["query"], logger)
+        if not isinstance(eval_input["response"], str):
+            eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
+        result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = result.get("llm_output")
+        score = math.nan
+
+        if isinstance(llm_output, dict):
+            score = float(llm_output.get("score", math.nan))
+            reason = llm_output.get("explanation", "")
+            # Parse out score and reason from evaluators known to possess them.
+            binary_result = self._get_binary_result(score)
+            return {
+                self._result_key: float(score),
+                f"gpt_{self._result_key}": float(score),
+                f"{self._result_key}_result": binary_result,
+                f"{self._result_key}_threshold": self._threshold,
+                f"{self._result_key}_reason": reason,
+                f"{self._result_key}_prompt_tokens": result.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": result.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": result.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": result.get("finish_reason", ""),
+                f"{self._result_key}_model": result.get("model_id", ""),
+                f"{self._result_key}_sample_input": result.get("sample_input", ""),
+                f"{self._result_key}_sample_output": result.get("sample_output", ""),
+            }
+
+        if logger:
+            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
+
+        binary_result = self._get_binary_result(score)
+        return {
+            self._result_key: float(score),
+            f"{self._result_key}_result": binary_result,
+            f"{self._result_key}_threshold": self._threshold,
+        }
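A sketch of the updated constructor and the richer per-metric output. The model configuration uses placeholder values, the query/response keywords follow the checks in _do_eval above, and the printed keys come from its return dict.

```python
from azure.ai.evaluation import AzureOpenAIModelConfiguration, RelevanceEvaluator

# Placeholder model configuration; substitute real values.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<resource>.openai.azure.com",
    azure_deployment="<deployment_name>",
    api_key="<api_key>",
)

# threshold is the new pass/fail cut-off; credential stays optional.
relevance = RelevanceEvaluator(model_config, threshold=3)

result = relevance(
    query="What is Azure AI Evaluation used for?",
    response="It is an SDK for evaluating generative AI applications.",
)

# Besides the numeric score and the legacy gpt_-prefixed key, the result now
# carries a pass/fail result, the threshold, a reason string, and token counts.
print(result["relevance"], result["relevance_result"], result["relevance_reason"])
```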