azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation has been flagged as possibly problematic.

Files changed (277)
  1. azure/ai/evaluation/__init__.py +83 -14
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/_common/constants.py +124 -2
  16. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  17. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  18. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  19. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  20. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  21. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  22. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  23. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  25. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  26. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  27. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  28. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  29. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  30. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  31. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  32. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  33. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  35. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  38. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  39. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  40. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  41. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  42. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  43. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  44. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  61. azure/ai/evaluation/_common/rai_service.py +578 -69
  62. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  63. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  64. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  65. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  66. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  67. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  68. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  69. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  70. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  71. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  73. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  74. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  76. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  77. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  78. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  79. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  80. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  81. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  82. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  83. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  84. azure/ai/evaluation/_common/utils.py +505 -27
  85. azure/ai/evaluation/_constants.py +148 -0
  86. azure/ai/evaluation/_converters/__init__.py +3 -0
  87. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  88. azure/ai/evaluation/_converters/_models.py +467 -0
  89. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  90. azure/ai/evaluation/_eval_mapping.py +83 -0
  91. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
  92. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
  95. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
  96. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
  97. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
  98. azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
  99. azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
  100. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  101. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
  102. azure/ai/evaluation/_evaluate/_utils.py +237 -42
  103. azure/ai/evaluation/_evaluator_definition.py +76 -0
  104. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
  105. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  106. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  107. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
  108. azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
  109. azure/ai/evaluation/_evaluators/_common/_base_eval.py +427 -29
  110. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  111. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
  112. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
  113. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  114. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
  115. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
  116. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
  117. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
  118. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
  119. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  120. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  121. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  122. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
  123. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
  124. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
  125. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
  126. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  127. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  128. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  129. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  130. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  131. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
  132. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
  133. azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
  134. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
  135. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
  136. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  137. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  138. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  139. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
  140. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
  141. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
  142. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
  143. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  144. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  145. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  146. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  147. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  148. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  149. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  150. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  151. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  152. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  153. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  154. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  155. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  156. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  157. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  158. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  159. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  160. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  162. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  163. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  164. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  165. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  166. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  167. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  168. azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
  169. azure/ai/evaluation/_exceptions.py +24 -1
  170. azure/ai/evaluation/_http_utils.py +7 -5
  171. azure/ai/evaluation/_legacy/__init__.py +3 -0
  172. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  173. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  174. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  175. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  176. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  177. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  178. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  179. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  180. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  181. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  182. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  183. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  184. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  185. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  186. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  187. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  188. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  189. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  190. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  191. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  192. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  197. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  198. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  199. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  200. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  201. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  202. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  203. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  204. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  205. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  206. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  207. azure/ai/evaluation/_model_configurations.py +26 -0
  208. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  209. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  210. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  211. azure/ai/evaluation/_user_agent.py +32 -1
  212. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  213. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  214. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  215. azure/ai/evaluation/_version.py +2 -1
  216. azure/ai/evaluation/red_team/__init__.py +22 -0
  217. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  218. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  219. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  220. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  221. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  222. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  223. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  224. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  225. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  226. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  227. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  228. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  229. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  230. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  231. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  232. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  233. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  234. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  235. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  236. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  237. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  238. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  239. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  240. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  241. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  242. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  243. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  244. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  245. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  246. azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
  247. azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
  248. azure/ai/evaluation/simulator/_constants.py +1 -0
  249. azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
  250. azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
  251. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  252. azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
  253. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  254. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
  255. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  256. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  257. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
  258. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
  259. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
  260. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
  261. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  262. azure/ai/evaluation/simulator/_simulator.py +43 -19
  263. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/METADATA +366 -27
  264. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  265. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  266. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  267. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  268. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  269. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  270. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  271. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  272. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  273. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  274. azure/ai/evaluation/simulator/_tracing.py +0 -89
  275. azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
  276. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info/licenses}/NOTICE.txt +0 -0
  277. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_rouge/_rouge.py

@@ -3,12 +3,16 @@
 # ---------------------------------------------------------
 from enum import Enum

-from promptflow._utils.async_utils import async_run_allowing_running_loop
+from typing import Dict, Union
+from typing_extensions import overload, override

 from azure.ai.evaluation._vendor.rouge_score import rouge_scorer
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
+from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
+import math


-class RougeType(Enum):
+class RougeType(str, Enum):
     """
     Enumeration of ROUGE (Recall-Oriented Understudy for Gisting Evaluation) types.
     """
@@ -32,21 +36,7 @@ class RougeType(Enum):
     """Overlap of L-grams (L consecutive words) between generated and reference text."""


-class _AsyncRougeScoreEvaluator:
-    def __init__(self, rouge_type: RougeType):
-        self._rouge_type = rouge_type
-
-    async def __call__(self, *, ground_truth: str, response: str, **kwargs):
-        scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type.value])
-        metrics = scorer.score(ground_truth, response)[self._rouge_type.value]
-        return {
-            "rouge_precision": metrics.precision,
-            "rouge_recall": metrics.recall,
-            "rouge_f1_score": metrics.fmeasure,
-        }
-
-
-class RougeScoreEvaluator:
+class RougeScoreEvaluator(EvaluatorBase):
     """
     Calculates the ROUGE score for a given response and ground truth.

@@ -62,6 +52,14 @@ class RougeScoreEvaluator:
     information from the reference text.

     ROUGE scores range from 0 to 1, with higher scores indicating better quality.
+    :param rouge_type: The type of ROUGE score to calculate. Default is "rouge1".
+    :type rouge_type: str
+    :param precision_threshold: The threshold value to determine if the precision evaluation passes or fails. Default is 0.5.
+    :type precision_threshold: float
+    :param recall_threshold: The threshold value to determine if the recall evaluation passes or fails. Default is 0.5.
+    :type recall_threshold: float
+    :param f1_score_threshold: The threshold value to determine if the F1 score evaluation passes or fails. Default is 0.5.
+    :type f1_score_threshold: float

     .. admonition:: Example:

@@ -71,15 +69,146 @@ class RougeScoreEvaluator:
             :language: python
             :dedent: 8
             :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START rouge_score_evaluator]
+            :end-before: [END rouge_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call RougeScoreEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    .. admonition:: Example with threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_rouge_score_evaluator]
+            :end-before: [END threshold_rouge_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with a specified threshold and call a RougeScoreEvaluator with a four-gram rouge type.
     """

-    id = "azureml://registries/azureml/models/Rouge-Score-Evaluator/versions/3"
+    id = "azureai://built-in/evaluators/rouge_score"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

-    def __init__(self, rouge_type: RougeType):
-        self._async_evaluator = _AsyncRougeScoreEvaluator(rouge_type)
+    @override
+    def __init__(
+        self,
+        rouge_type: RougeType,
+        *,
+        precision_threshold: float = 0.5,
+        recall_threshold: float = 0.5,
+        f1_score_threshold: float = 0.5,
+    ):
+        self._rouge_type = rouge_type
+        self._higher_is_better = True
+        super().__init__()
+
+        # Type checking for threshold parameters
+        for name, value in [
+            ("precision_threshold", precision_threshold),
+            ("recall_threshold", recall_threshold),
+            ("f1_score_threshold", f1_score_threshold),
+        ]:
+            if not isinstance(value, float):
+                raise TypeError(f"{name} must be a float, got {type(value)}")
+
+        self._threshold = {
+            "precision": precision_threshold,
+            "recall": recall_threshold,
+            "f1_score": f1_score_threshold,
+        }

-    def __call__(self, *, ground_truth: str, response: str, **kwargs):
+    def _get_binary_result(
+        self,
+        rouge_precision: float,
+        rouge_recall: float,
+        rouge_f1_score: float,
+    ) -> Dict[str, bool]:
+        """
+        Get binary result based on the threshold.
+
+        :param rouge_precision: The precision score.
+        :type rouge_precision: float
+        :param rouge_recall: The recall score.
+        :type rouge_recall: float
+        :param rouge_f1_score: The F1 score.
+        :type rouge_f1_score: float
+        :return: A dictionary with binary results for precision, recall, and F1 score.
+
+        """
+        # Initialize results with False for NaN values
+        results = {
+            "rouge_precision_result": False,
+            "rouge_recall_result": False,
+            "rouge_f1_score_result": False,
+        }
+
+        # Check if values are valid (not NaN) before comparison
+        precision_valid = not math.isnan(rouge_precision)
+        recall_valid = not math.isnan(rouge_recall)
+        f1_valid = not math.isnan(rouge_f1_score)
+
+        if self._higher_is_better:
+            if precision_valid:
+                results["rouge_precision_result"] = rouge_precision >= self._threshold["precision"]
+            if recall_valid:
+                results["rouge_recall_result"] = rouge_recall >= self._threshold["recall"]
+            if f1_valid:
+                results["rouge_f1_score_result"] = rouge_f1_score >= self._threshold["f1_score"]
+        else:
+            if precision_valid:
+                results["rouge_precision_result"] = rouge_precision <= self._threshold["precision"]
+            if recall_valid:
+                results["rouge_recall_result"] = rouge_recall <= self._threshold["recall"]
+            if f1_valid:
+                results["rouge_f1_score_result"] = rouge_f1_score <= self._threshold["f1_score"]
+
+        return results
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a rouge score evaluation result.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type])
+        metrics = scorer.score(ground_truth, response)[self._rouge_type]
+        binary_results = {
+            "rouge_precision_result": False,
+            "rouge_recall_result": False,
+            "rouge_f1_score_result": False,
+        }
+        # Convert metrics to floats, using nan for None or non-convertible values
+        rouge_precision = float(metrics.precision) if metrics.precision is not None else float("nan")
+        rouge_recall = float(metrics.recall) if metrics.recall is not None else float("nan")
+        rouge_f1_score = float(metrics.fmeasure) if metrics.fmeasure is not None else float("nan")
+        binary_results = self._get_binary_result(
+            rouge_precision=rouge_precision,
+            rouge_recall=rouge_recall,
+            rouge_f1_score=rouge_f1_score,
+        )
+        return {
+            "rouge_precision": rouge_precision,
+            "rouge_recall": rouge_recall,
+            "rouge_f1_score": rouge_f1_score,
+            "rouge_precision_result": EVALUATION_PASS_FAIL_MAPPING[binary_results["rouge_precision_result"]],
+            "rouge_recall_result": EVALUATION_PASS_FAIL_MAPPING[binary_results["rouge_recall_result"]],
+            "rouge_f1_score_result": EVALUATION_PASS_FAIL_MAPPING[binary_results["rouge_f1_score_result"]],
+            "rouge_precision_threshold": self._threshold["precision"],
+            "rouge_recall_threshold": self._threshold["recall"],
+            "rouge_f1_score_threshold": self._threshold["f1_score"],
+        }
+
+    @overload  # type: ignore
+    def __call__(self, *, ground_truth: str, response: str) -> Dict[str, float]:
         """
         Evaluate the ROUGE score between the response and the ground truth.

@@ -90,9 +219,20 @@ class RougeScoreEvaluator:
         :return: The ROUGE score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
-        )

-    def _to_async(self):
-        return self._async_evaluator
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate route score.
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The ROUGE score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
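
Taken together, these hunks replace the promptflow-backed async wrapper with an EvaluatorBase subclass and add per-metric pass/fail thresholds. A minimal usage sketch based only on the signatures shown above; the threshold values and input strings are illustrative, not taken from the package:

    from azure.ai.evaluation import RougeScoreEvaluator, RougeType

    # precision/recall/f1 thresholds are keyword-only floats; 0.5 is the documented default.
    rouge = RougeScoreEvaluator(
        rouge_type=RougeType.ROUGE_L,
        precision_threshold=0.6,
        recall_threshold=0.5,
        f1_score_threshold=0.55,
    )

    result = rouge(
        response="Tokyo is the capital of Japan.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    # Per _do_eval above, result holds the rouge_precision/recall/f1_score floats, the matching
    # *_result entries (booleans mapped through EVALUATION_PASS_FAIL_MAPPING), and *_threshold values.
    print(result)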
azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py

@@ -24,9 +24,11 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):

     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
-    :param azure_ai_project: The scope of the Azure AI project.
-        It contains subscription id, resource group, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
+    :param threshold: The threshold for the groundedness pro evaluator. Default is 5.
+    :type threshold: int
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any

@@ -39,28 +41,52 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
             :dedent: 8
             :caption: Initialize and call a GroundednessProEvaluator with a query, response, and context.

+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START groundedness_pro_evaluator]
+            :end-before: [END groundedness_pro_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call GroundednessProEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    .. admonition:: Example with threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_groundedness_pro_evaluator]
+            :end-before: [END threshold_groundedness_pro_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with a specified threshold and call GroundednessProEvaluator with a query, response, and context.
+
     .. note::

         If this evaluator is supplied to the `evaluate` function, the aggregated metric
         for the groundedness pro label will be "groundedness_pro_passing_rate".
     """

-    id = "azureml://registries/azureml/models/Groundedness-Pro-Evaluator/versions/1"
+    id = "azureai://built-in/evaluators/groundedness_pro"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

     @override
     def __init__(
         self,
         credential,
         azure_ai_project,
+        *,
+        threshold: int = 5,
         **kwargs,
     ):
-        self._passing_score = 5  # TODO update once the binarization PR is merged
+        self.threshold = threshold
+        self._higher_is_better = True
         self._output_prefix = "groundedness_pro"
         super().__init__(
             eval_metric=EvaluationMetrics.GROUNDEDNESS,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            threshold=self.threshold,
             **kwargs,
         )

@@ -141,8 +167,13 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
         """
         result = await super()._do_eval(eval_input)
         real_result = {}
+        real_result[self._output_prefix + "_reason"] = result[EvaluationMetrics.GROUNDEDNESS + "_reason"]
         real_result[self._output_prefix + "_label"] = (
-            result[EvaluationMetrics.GROUNDEDNESS + "_score"] >= self._passing_score
+            result[EvaluationMetrics.GROUNDEDNESS + "_score"] >= self.threshold
         )
-        real_result[self._output_prefix + "_reason"] = result[EvaluationMetrics.GROUNDEDNESS + "_reason"]
+        if self._higher_is_better:
+            real_result[self._output_prefix + "_score"] = max(result[EvaluationMetrics.GROUNDEDNESS + "_score"], 0)
+        else:
+            real_result[self._output_prefix + "_score"] = min(result[EvaluationMetrics.GROUNDEDNESS + "_score"], 1)
+
         return real_result
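
The constructor now accepts either a project endpoint URL or an AzureAIProject instance, plus a keyword-only threshold that replaces the hard-coded passing score. A minimal sketch, assuming the endpoint-string form from the docstring and DefaultAzureCredential for authentication; the query/response/context values are placeholders:

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import GroundednessProEvaluator

    groundedness_pro = GroundednessProEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project="https://<resource_name>.services.ai.azure.com/api/projects/<project_name>",
        threshold=5,  # documented default; scores at or above it yield groundedness_pro_label == True
    )

    result = groundedness_pro(
        query="Which tent is the most waterproof?",
        response="The Alpine Explorer Tent is the most waterproof.",
        context="The Alpine Explorer Tent has a rainfly waterproof rating of 3000 mm.",
    )
    # Per _do_eval above, result includes groundedness_pro_label, groundedness_pro_reason,
    # and (new in this version) groundedness_pro_score.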
azure/ai/evaluation/_evaluators/_similarity/_similarity.py

@@ -2,85 +2,17 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

-import math
 import os
-import re
+from typing import Dict

-from promptflow._utils.async_utils import async_run_allowing_running_loop
-from promptflow.core import AsyncPrompty
+from typing_extensions import overload, override

-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase

-from ..._common.utils import construct_prompty_model_config, validate_model_config

-try:
-    from ..._user_agent import USER_AGENT
-except ImportError:
-    USER_AGENT = "None"
-
-
-class _AsyncSimilarityEvaluator:
-    # Constants must be defined within eval's directory to be save/loadable
-    _PROMPTY_FILE = "similarity.prompty"
-    _LLM_CALL_TIMEOUT = 600
-    _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
-
-    def __init__(self, model_config: dict):
-        prompty_model_config = construct_prompty_model_config(
-            validate_model_config(model_config),
-            self._DEFAULT_OPEN_API_VERSION,
-            USER_AGENT,
-        )
-
-        current_dir = os.path.dirname(__file__)
-        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-        self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
-
-    async def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
-        """
-        Evaluate similarity.
-
-        :keyword query: The query to be evaluated.
-        :paramtype query: str
-        :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :keyword ground_truth: The ground truth to be evaluated.
-        :paramtype ground_truth: str
-        :return: The similarity score.
-        :rtype: Dict[str, float]
-        """
-        # Validate input parameters
-        query = str(query or "")
-        response = str(response or "")
-        ground_truth = str(ground_truth or "")
-
-        if not (query.strip() and response.strip() and ground_truth.strip()):
-            msg = "'query', 'response' and 'ground_truth' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.SIMILARITY_EVALUATOR,
-            )
-
-        # Run the evaluation flow
-        llm_output = await self._flow(
-            query=query, response=response, ground_truth=ground_truth, timeout=self._LLM_CALL_TIMEOUT, **kwargs
-        )
-
-        score = math.nan
-        if llm_output:
-            match = re.search(r"\d", llm_output)
-            if match:
-                score = float(match.group())
-
-        return {"similarity": float(score), "gpt_similarity": float(score)}
-
-
-class SimilarityEvaluator:
+class SimilarityEvaluator(PromptyEvaluatorBase):
     """
-    Evaluates similarity score for a given query, response, and ground truth or a multi-turn conversation.
+    Evaluates similarity score for a given query, response, and ground truth.

     The similarity measure evaluates the likeness between a ground truth sentence (or document) and the
     AI model's generated prediction. This calculation involves creating sentence-level embeddings for both
@@ -96,15 +28,41 @@ class SimilarityEvaluator:
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
+    :param threshold: The threshold for the similarity evaluator. Default is 3.
+    :type threshold: int
+    :param credential: The credential for authenticating to Azure AI service.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool

     .. admonition:: Example:

         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-            :start-after: [START rouge_score_evaluator]
-            :end-before: [END rouge_score_evaluator]
+            :start-after: [START similarity_evaluator]
+            :end-before: [END similarity_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.
+            :caption: Initialize and call a SimilarityEvaluator with a four-gram rouge type.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START similarity_evaluator]
+            :end-before: [END similarity_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call SimilarityEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_similarity_evaluator]
+            :end-before: [END threshold_similarity_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with a threshold and call a SimilarityEvaluator.

     .. note::

@@ -113,13 +71,37 @@ class SimilarityEvaluator:
     however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
     """

-    id = "azureml://registries/azureml/models/Similarity-Evaluator/versions/3"
+    # Constants must be defined within eval's directory to be save/loadable
+
+    _PROMPTY_FILE = "similarity.prompty"
+    _RESULT_KEY = "similarity"
+
+    id = "azureai://built-in/evaluators/similarity"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

-    def __init__(self, model_config):
-        self._async_evaluator = _AsyncSimilarityEvaluator(model_config)
+    @override
+    def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        self._threshold = threshold
+        self._higher_is_better = True
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=threshold,
+            credential=credential,
+            _higher_is_better=self._higher_is_better,
+            **kwargs,
+        )

-    def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
+    # Ignoring a mypy error about having only 1 overload function.
+    # We want to use the overload style for all evals, even single-inputs. This is both to make
+    # refactoring to multi-input styles easier, stylistic consistency consistency across evals,
+    # and due to the fact that non-overloaded syntax now causes various parsing issues that
+    # we don't want to deal with.
+    @overload  # type: ignore
+    def __call__(self, *, query: str, response: str, ground_truth: str) -> Dict[str, float]:
         """
         Evaluate similarity.

@@ -132,9 +114,23 @@ class SimilarityEvaluator:
         :return: The similarity score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, query=query, response=response, ground_truth=ground_truth, **kwargs
-        )

-    def _to_async(self):
-        return self._async_evaluator
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate similarity.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be evaluated.
+        :paramtype ground_truth: str
+        :return: The similarity score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
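
SimilarityEvaluator now derives from PromptyEvaluatorBase and accepts an integer threshold (default 3) and an optional credential. A minimal sketch, assuming an Azure OpenAI model configuration dict of the shape documented for model_config; the endpoint, deployment, and key values are placeholders:

    from azure.ai.evaluation import SimilarityEvaluator

    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",
        "azure_deployment": "<your-gpt-deployment>",
        "api_key": "<your-api-key>",
    }

    similarity = SimilarityEvaluator(model_config=model_config, threshold=3)

    result = similarity(
        query="What is the capital of Japan?",
        response="The capital of Japan is Tokyo.",
        ground_truth="Tokyo is Japan's capital city.",
    )
    # The retained docstring note indicates "similarity" is the primary output key, with the
    # legacy "gpt_similarity" key still emitted but slated for deprecation.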
azure/ai/evaluation/_evaluators/_task_adherence/__init__.py (new file)

@@ -0,0 +1,7 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._task_adherence import TaskAdherenceEvaluator
+
+__all__ = ["TaskAdherenceEvaluator"]
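
This hunk only adds the sub-package's __init__.py, so the new evaluator becomes importable from it; its constructor and call signatures live in _task_adherence.py (entry 144 above), which is not shown here. It is presumably also re-exported from the top-level azure.ai.evaluation namespace via the __init__.py change in entry 1, but that hunk is likewise not shown:

    # Import path made available by the new __init__.py shown above.
    from azure.ai.evaluation._evaluators._task_adherence import TaskAdherenceEvaluator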