azure-ai-evaluation 1.1.0__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +1 -15
- azure/ai/evaluation/_azure/_clients.py +24 -8
- azure/ai/evaluation/_azure/_models.py +2 -2
- azure/ai/evaluation/_common/utils.py +8 -8
- azure/ai/evaluation/_constants.py +21 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +2 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +3 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +74 -14
- azure/ai/evaluation/_evaluate/_utils.py +27 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +46 -25
- azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +69 -4
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +61 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +7 -1
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +5 -42
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +2 -0
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +2 -0
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +2 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +2 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +61 -68
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +45 -23
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +55 -34
- azure/ai/evaluation/_evaluators/_qa/_qa.py +32 -27
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +44 -23
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +41 -81
- azure/ai/evaluation/_exceptions.py +0 -1
- azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +640 -0
- azure/ai/evaluation/_version.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +10 -3
- azure/ai/evaluation/simulator/_conversation/__init__.py +4 -5
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -0
- azure/ai/evaluation/simulator/_simulator.py +21 -13
- {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/METADATA +77 -7
- {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/RECORD +40 -44
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/top_level.txt +0 -0
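
The rendered excerpts below cover the NLP-metric evaluators (GLEU, METEOR, ROUGE), the composite QA evaluator, the similarity evaluator, and `_exceptions.py`; removed lines whose content the diff renderer truncated are shown as `…`. Every evaluator diff follows the same refactor: the private `_Async*Evaluator` helper plus the `async_run_allowing_running_loop` bridge is replaced by a subclass of the shared `EvaluatorBase` (or `MultiEvaluatorBase` / `PromptyEvaluatorBase`) that implements an async `_do_eval` and routes `__call__` through the base class. The sketch below mirrors that shape with a hypothetical `ExactMatchEvaluator`; `EvaluatorBase` lives in a private module (`azure.ai.evaluation._evaluators._common`), so treat this as an illustration of the internal pattern, not a documented extension point.

```python
from typing import Dict

from typing_extensions import overload, override

from azure.ai.evaluation._evaluators._common import EvaluatorBase


class ExactMatchEvaluator(EvaluatorBase):
    """Hypothetical evaluator, used only to illustrate the 1.3.0 base-class pattern."""

    @override
    def __init__(self):
        super().__init__()

    @override
    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
        # eval_input carries the keyword arguments that were passed to __call__.
        ground_truth = eval_input["ground_truth"]
        response = eval_input["response"]
        return {"exact_match": 1.0 if response.strip() == ground_truth.strip() else 0.0}

    @overload  # type: ignore
    def __call__(self, *, ground_truth: str, response: str) -> Dict[str, float]:
        """Typed overload mirroring the public signature, as in the refactored evaluators below."""

    @override
    def __call__(self, *args, **kwargs):
        # Input handling and the sync/async bridge live in EvaluatorBase.
        return super().__call__(*args, **kwargs)
```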
--- azure/ai/evaluation/_evaluators/_gleu/_gleu.py (1.1.0)
+++ azure/ai/evaluation/_evaluators/_gleu/_gleu.py (1.3.0)
@@ -1,28 +1,16 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Dict
 from nltk.translate.gleu_score import sentence_gleu
-from …
+from typing_extensions import overload, override
 
 from azure.ai.evaluation._common.utils import nltk_tokenize
 
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
 
-class _AsyncGleuScoreEvaluator:
-    def __init__(self):
-        pass
-
-    async def __call__(self, *, ground_truth: str, response: str, **kwargs):
-        reference_tokens = nltk_tokenize(ground_truth)
-        hypothesis_tokens = nltk_tokenize(response)
-
-        score = sentence_gleu([reference_tokens], hypothesis_tokens)
-
-        return {
-            "gleu_score": score,
-        }
 
-
-class GleuScoreEvaluator:
+class GleuScoreEvaluator(EvaluatorBase):
     """
     Calculates the GLEU (Google-BLEU) score between a response and the ground truth.
 
@@ -47,10 +35,32 @@ class GleuScoreEvaluator:
     id = "azureml://registries/azureml/models/Gleu-Score-Evaluator/versions/3"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
+    @override
     def __init__(self):
-        …
+        super().__init__()
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a glue score evaluation result.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        reference_tokens = nltk_tokenize(ground_truth)
+        hypothesis_tokens = nltk_tokenize(response)
 
-
+        score = sentence_gleu([reference_tokens], hypothesis_tokens)
+
+        return {
+            "gleu_score": score,
+        }
+
+    @overload  # type: ignore
+    def __call__(self, *, ground_truth: str, response: str):
         """
         Evaluate the GLEU score between the response and the ground truth.
 
@@ -61,9 +71,21 @@ class GleuScoreEvaluator:
         :return: The GLEU score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
-        )
 
-
-
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate the GLEU score between the response and the ground truth.
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The GLEU score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
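
The refactor above changes only the internals: `GleuScoreEvaluator` keeps its keyword-only call signature and its `"gleu_score"` result key. A minimal usage sketch, assuming the evaluator is still exported from the package root as in 1.1.0; the sample strings are invented.

```python
from azure.ai.evaluation import GleuScoreEvaluator

gleu = GleuScoreEvaluator()
result = gleu(
    response="Tokyo is the capital of Japan.",
    ground_truth="The capital of Japan is Tokyo.",
)
print(result["gleu_score"])  # sentence-level GLEU in [0.0, 1.0]
```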
--- azure/ai/evaluation/_evaluators/_meteor/_meteor.py (1.1.0)
+++ azure/ai/evaluation/_evaluators/_meteor/_meteor.py (1.3.0)
@@ -1,38 +1,16 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Dict
+
 from nltk.translate.meteor_score import meteor_score
-from …
+from typing_extensions import overload, override
 
 from azure.ai.evaluation._common.utils import nltk_tokenize, ensure_nltk_data_downloaded
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
 
 
-class …
-    def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5):
-        self._alpha = alpha
-        self._beta = beta
-        self._gamma = gamma
-
-        ensure_nltk_data_downloaded()
-
-    async def __call__(self, *, ground_truth: str, response: str, **kwargs):
-        reference_tokens = nltk_tokenize(ground_truth)
-        hypothesis_tokens = nltk_tokenize(response)
-
-        score = meteor_score(
-            [reference_tokens],
-            hypothesis_tokens,
-            alpha=self._alpha,
-            beta=self._beta,
-            gamma=self._gamma,
-        )
-
-        return {
-            "meteor_score": score,
-        }
-
-
-class MeteorScoreEvaluator:
+class MeteorScoreEvaluator(EvaluatorBase):
     """
     Calculates the METEOR score for a given response and ground truth.
 
@@ -68,10 +46,41 @@ class MeteorScoreEvaluator:
     id = "azureml://registries/azureml/models/Meteor-Score-Evaluator/versions/3"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
+    @override
     def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5):
-        self.…
+        self._alpha = alpha
+        self._beta = beta
+        self._gamma = gamma
+        ensure_nltk_data_downloaded()
+        super().__init__()
 
-
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a meteor score evaluation result.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        reference_tokens = nltk_tokenize(ground_truth)
+        hypothesis_tokens = nltk_tokenize(response)
+        score = meteor_score(
+            [reference_tokens],
+            hypothesis_tokens,
+            alpha=self._alpha,
+            beta=self._beta,
+            gamma=self._gamma,
+        )
+
+        return {
+            "meteor_score": score,
+        }
+
+    @overload  # type: ignore
+    def __call__(self, *, ground_truth: str, response: str) -> Dict[str, float]:
         """
         Evaluate the METEOR score between the response and the ground truth.
 
@@ -82,9 +91,21 @@ class MeteorScoreEvaluator:
         :return: The METEOR score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
-        )
 
-
-
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate the METEOR score between the response and the ground truth.
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The METEOR score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
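
`MeteorScoreEvaluator` likewise keeps its public behavior: the `alpha`, `beta`, and `gamma` defaults (0.9, 3.0, 0.5) and the `"meteor_score"` key are unchanged, and the required NLTK data is still fetched at construction via `ensure_nltk_data_downloaded`. A usage sketch under the same top-level-export assumption, with invented sample text.

```python
from azure.ai.evaluation import MeteorScoreEvaluator

meteor = MeteorScoreEvaluator(alpha=0.9, beta=3.0, gamma=0.5)
result = meteor(
    response="The cat sat on the mat.",
    ground_truth="A cat was sitting on the mat.",
)
print(result["meteor_score"])
```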
--- azure/ai/evaluation/_evaluators/_qa/_qa.py (1.1.0)
+++ azure/ai/evaluation/_evaluators/_qa/_qa.py (1.3.0)
@@ -2,10 +2,11 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from …
-from typing import Callable, Dict, List, Union
+from typing import Union
 
-from …
+from typing_extensions import overload, override
+
+from azure.ai.evaluation._evaluators._common import MultiEvaluatorBase
 
 from .._coherence import CoherenceEvaluator
 from .._f1_score import F1ScoreEvaluator
@@ -15,7 +16,7 @@ from .._relevance import RelevanceEvaluator
 from .._similarity import SimilarityEvaluator
 
 
-class QAEvaluator:
+class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
     """
     Initialize a question-answer evaluator configured for a specific Azure OpenAI model.
 
@@ -46,9 +47,7 @@ class QAEvaluator:
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     def __init__(self, model_config, **kwargs):
-        …
-        …
-        self._evaluators: List[Union[Callable[..., Dict[str, Union[str, float]]], Callable[..., Dict[str, float]]]] = [
+        evaluators = [
            GroundednessEvaluator(model_config),
            RelevanceEvaluator(model_config),
            CoherenceEvaluator(model_config),
@@ -56,8 +55,31 @@ class QAEvaluator:
            SimilarityEvaluator(model_config),
            F1ScoreEvaluator(),
        ]
+        super().__init__(evaluators=evaluators, **kwargs)
+
+    @overload  # type: ignore
+    def __call__(self, *, query: str, response: str, context: str, ground_truth: str):
+        """
+        Evaluates question-answering scenario.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword context: The context to be evaluated.
+        :paramtype context: str
+        :keyword ground_truth: The ground truth to be evaluated.
+        :paramtype ground_truth: str
+        :return: The scores for QA scenario.
+        :rtype: Dict[str, Union[str, float]]
+        """
 
-
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
         """
         Evaluates question-answering scenario.
 
@@ -72,22 +94,5 @@ class QAEvaluator:
         :return: The scores for QA scenario.
         :rtype: Dict[str, Union[str, float]]
         """
-        …
-        …
-            with ThreadPoolExecutor() as executor:
-                futures = {
-                    executor.submit(
-                        evaluator, query=query, response=response, context=context, ground_truth=ground_truth, **kwargs
-                    ): evaluator
-                    for evaluator in self._evaluators
-                }
-
-                # Collect results as they complete
-                for future in as_completed(futures):
-                    results.update(future.result())
-        else:
-            for evaluator in self._evaluators:
-                result = evaluator(query=query, response=response, context=context, ground_truth=ground_truth, **kwargs)
-                results.update(result)
-
-        return results
+
+        return super().__call__(*args, **kwargs)
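
`QAEvaluator` drops its hand-rolled `ThreadPoolExecutor` fan-out and instead hands its child evaluators to `MultiEvaluatorBase`, which now owns the aggregation; the call surface is unchanged. A usage sketch: the `AzureOpenAIModelConfiguration` shape is an assumption about the package's public model-configuration type, and the endpoint, deployment, and key values are placeholders to replace with your own Azure OpenAI deployment.

```python
from azure.ai.evaluation import AzureOpenAIModelConfiguration, QAEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    azure_deployment="<your-deployment>",
    api_key="<your-api-key>",
)

qa = QAEvaluator(model_config)
scores = qa(
    query="Where is Mount Fuji?",
    response="Mount Fuji is on the island of Honshu, Japan.",
    context="Mount Fuji is an active volcano on the Japanese island of Honshu.",
    ground_truth="Mount Fuji is located on Honshu, Japan.",
)
# `scores` merges the result keys of the composed evaluators shown in the diff
# (groundedness, relevance, coherence, similarity, F1, ...).
print(scores)
```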
--- azure/ai/evaluation/_evaluators/_rouge/_rouge.py (1.1.0)
+++ azure/ai/evaluation/_evaluators/_rouge/_rouge.py (1.3.0)
@@ -3,9 +3,11 @@
 # ---------------------------------------------------------
 from enum import Enum
 
-from …
+from typing import Dict
+from typing_extensions import overload, override
 
 from azure.ai.evaluation._vendor.rouge_score import rouge_scorer
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
 
 
 class RougeType(Enum):
@@ -32,21 +34,7 @@ class RougeType(Enum):
     """Overlap of L-grams (L consecutive words) between generated and reference text."""
 
 
-class …
-    def __init__(self, rouge_type: RougeType):
-        self._rouge_type = rouge_type
-
-    async def __call__(self, *, ground_truth: str, response: str, **kwargs):
-        scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type.value])
-        metrics = scorer.score(ground_truth, response)[self._rouge_type.value]
-        return {
-            "rouge_precision": metrics.precision,
-            "rouge_recall": metrics.recall,
-            "rouge_f1_score": metrics.fmeasure,
-        }
-
-
-class RougeScoreEvaluator:
+class RougeScoreEvaluator(EvaluatorBase):
     """
     Calculates the ROUGE score for a given response and ground truth.
 
@@ -76,10 +64,32 @@ class RougeScoreEvaluator:
     id = "azureml://registries/azureml/models/Rouge-Score-Evaluator/versions/3"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
+    @override
     def __init__(self, rouge_type: RougeType):
-        self.…
+        self._rouge_type = rouge_type
+        super().__init__()
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a rouge score evaluation result.
 
-
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type.value])
+        metrics = scorer.score(ground_truth, response)[self._rouge_type.value]
+        return {
+            "rouge_precision": metrics.precision,
+            "rouge_recall": metrics.recall,
+            "rouge_f1_score": metrics.fmeasure,
+        }
+
+    @overload  # type: ignore
+    def __call__(self, *, ground_truth: str, response: str) -> Dict[str, float]:
         """
         Evaluate the ROUGE score between the response and the ground truth.
 
@@ -90,9 +100,20 @@ class RougeScoreEvaluator:
         :return: The ROUGE score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
-        )
 
-
-
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate route score.
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The ROUGE score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
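
`RougeScoreEvaluator` still takes a `RougeType` and returns precision, recall, and F1 under the same keys. A usage sketch: `RougeType.ROUGE_L` is an assumed member name (the diff only shows the tail of the enum, whose last docstring describes L-gram overlap), and the sample strings are invented.

```python
from azure.ai.evaluation import RougeScoreEvaluator, RougeType

rouge = RougeScoreEvaluator(rouge_type=RougeType.ROUGE_L)  # ROUGE_L: assumed member name
result = rouge(
    response="The quick brown fox jumps over the lazy dog.",
    ground_truth="A quick brown fox jumped over a lazy dog.",
)
print(result["rouge_precision"], result["rouge_recall"], result["rouge_f1_score"])
```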
--- azure/ai/evaluation/_evaluators/_similarity/_similarity.py (1.1.0)
+++ azure/ai/evaluation/_evaluators/_similarity/_similarity.py (1.3.0)
@@ -2,83 +2,15 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-import math
 import os
-import …
+from typing import Dict
 
-from …
-from promptflow.core import AsyncPrompty
+from typing_extensions import overload, override
 
-from azure.ai.evaluation.…
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 
-from ..._common.utils import construct_prompty_model_config, validate_model_config
 
-try:
-    from ..._user_agent import USER_AGENT
-except ImportError:
-    USER_AGENT = "None"
-
-
-class _AsyncSimilarityEvaluator:
-    # Constants must be defined within eval's directory to be save/loadable
-    _PROMPTY_FILE = "similarity.prompty"
-    _LLM_CALL_TIMEOUT = 600
-    _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
-
-    def __init__(self, model_config: dict):
-        prompty_model_config = construct_prompty_model_config(
-            validate_model_config(model_config),
-            self._DEFAULT_OPEN_API_VERSION,
-            USER_AGENT,
-        )
-
-        current_dir = os.path.dirname(__file__)
-        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-        self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
-
-    async def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
-        """
-        Evaluate similarity.
-
-        :keyword query: The query to be evaluated.
-        :paramtype query: str
-        :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :keyword ground_truth: The ground truth to be evaluated.
-        :paramtype ground_truth: str
-        :return: The similarity score.
-        :rtype: Dict[str, float]
-        """
-        # Validate input parameters
-        query = str(query or "")
-        response = str(response or "")
-        ground_truth = str(ground_truth or "")
-
-        if not (query.strip() and response.strip() and ground_truth.strip()):
-            msg = "'query', 'response' and 'ground_truth' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.SIMILARITY_EVALUATOR,
-            )
-
-        # Run the evaluation flow
-        llm_output = await self._flow(
-            query=query, response=response, ground_truth=ground_truth, timeout=self._LLM_CALL_TIMEOUT, **kwargs
-        )
-
-        score = math.nan
-        if llm_output:
-            match = re.search(r"\d", llm_output)
-            if match:
-                score = float(match.group())
-
-        return {"similarity": float(score), "gpt_similarity": float(score)}
-
-
-class SimilarityEvaluator:
+class SimilarityEvaluator(PromptyEvaluatorBase):
     """
     Evaluates similarity score for a given query, response, and ground truth.
 
@@ -113,13 +45,27 @@ class SimilarityEvaluator:
     however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
     """
 
-    …
+    # Constants must be defined within eval's directory to be save/loadable
+
+    _PROMPTY_FILE = "similarity.prompty"
+    _RESULT_KEY = "similarity"
+
+    id = "similarity"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
+    @override
     def __init__(self, model_config):
-        …
-        …
-        …
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+
+    # Ignoring a mypy error about having only 1 overload function.
+    # We want to use the overload style for all evals, even single-inputs. This is both to make
+    # refactoring to multi-input styles easier, stylistic consistency consistency across evals,
+    # and due to the fact that non-overloaded syntax now causes various parsing issues that
+    # we don't want to deal with.
+    @overload  # type: ignore
+    def __call__(self, *, query: str, response: str, ground_truth: str) -> Dict[str, float]:
         """
         Evaluate similarity.
 
@@ -132,9 +78,23 @@ class SimilarityEvaluator:
         :return: The similarity score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, query=query, response=response, ground_truth=ground_truth, **kwargs
-        )
 
-
-
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate similarity.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be evaluated.
+        :paramtype ground_truth: str
+        :return: The similarity score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
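
`SimilarityEvaluator` is now a thin `PromptyEvaluatorBase` subclass: prompty loading and result handling are delegated to the shared base, while `similarity.prompty` and the `"similarity"` result key stay put (the retained docstring note indicates the legacy key is slated for deprecation). A usage sketch with the same assumed model-configuration shape and placeholder values as the `QAEvaluator` example above.

```python
from azure.ai.evaluation import AzureOpenAIModelConfiguration, SimilarityEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    azure_deployment="<your-deployment>",
    api_key="<your-api-key>",
)

similarity = SimilarityEvaluator(model_config)
result = similarity(
    query="What does a GLEU score of 1.0 mean?",
    response="It means the response matches the ground truth exactly.",
    ground_truth="A GLEU score of 1.0 indicates a perfect match with the reference.",
)
print(result["similarity"])
```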
--- azure/ai/evaluation/_exceptions.py (1.1.0)
+++ azure/ai/evaluation/_exceptions.py (1.3.0)
@@ -63,7 +63,6 @@ class ErrorTarget(Enum):
     RAI_CLIENT = "RAIClient"
     COHERENCE_EVALUATOR = "CoherenceEvaluator"
     CONTENT_SAFETY_CHAT_EVALUATOR = "ContentSafetyEvaluator"
-    CONTENT_SAFETY_MULTIMODAL_EVALUATOR = "ContentSafetyMultimodalEvaluator"
     ECI_EVALUATOR = "ECIEvaluator"
     F1_EVALUATOR = "F1Evaluator"
     GROUNDEDNESS_EVALUATOR = "GroundednessEvaluator"