azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of azure-ai-evaluation might be problematic.

Files changed (299)
  1. azure/ai/evaluation/__init__.py +100 -5
  2. azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
  16. azure/ai/evaluation/_common/constants.py +131 -2
  17. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  18. azure/ai/evaluation/_common/math.py +89 -0
  19. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  20. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  21. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  22. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  23. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  25. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  26. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  27. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  28. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  29. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  30. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  31. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  32. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  33. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  34. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  35. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  37. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  38. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  39. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  40. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  41. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  42. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  43. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  44. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  45. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  46. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  61. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  62. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  63. azure/ai/evaluation/_common/rai_service.py +831 -142
  64. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  65. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  66. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  67. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  68. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  69. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  70. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  71. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  73. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  74. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  76. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  77. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  78. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  79. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  80. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  81. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  82. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  83. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  84. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  85. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  86. azure/ai/evaluation/_common/utils.py +870 -34
  87. azure/ai/evaluation/_constants.py +167 -6
  88. azure/ai/evaluation/_converters/__init__.py +3 -0
  89. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  90. azure/ai/evaluation/_converters/_models.py +467 -0
  91. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  92. azure/ai/evaluation/_eval_mapping.py +83 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  95. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  96. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
  97. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
  98. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
  99. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
  100. azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
  101. azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
  102. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  103. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
  104. azure/ai/evaluation/_evaluate/_utils.py +289 -40
  105. azure/ai/evaluation/_evaluator_definition.py +76 -0
  106. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
  107. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  108. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  109. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
  110. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
  111. azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
  112. azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
  113. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  114. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
  115. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
  116. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  117. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  118. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
  119. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
  120. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
  121. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
  122. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
  123. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  124. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  125. azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
  126. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
  127. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
  128. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
  129. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
  130. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
  131. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
  132. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
  133. azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
  134. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  135. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  136. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
  137. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
  138. azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
  139. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
  140. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
  141. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  142. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  143. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  144. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
  145. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
  146. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  147. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
  148. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  149. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
  150. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
  151. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  152. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  153. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  154. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  155. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  156. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  157. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  158. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  159. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  160. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  162. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  163. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  164. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  165. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  166. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  167. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  168. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  169. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  170. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  171. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  172. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  173. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  174. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  175. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  176. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  177. azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
  178. azure/ai/evaluation/_exceptions.py +51 -7
  179. azure/ai/evaluation/_http_utils.py +210 -137
  180. azure/ai/evaluation/_legacy/__init__.py +3 -0
  181. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  182. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  183. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  184. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  185. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  186. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  187. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  188. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  189. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  190. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  191. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  192. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  197. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  198. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  199. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  200. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  201. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  202. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  203. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  204. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  205. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  206. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  207. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  208. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  209. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  210. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  211. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  212. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  213. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  214. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  215. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  216. azure/ai/evaluation/_model_configurations.py +130 -8
  217. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  218. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  219. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  220. azure/ai/evaluation/_user_agent.py +32 -1
  221. azure/ai/evaluation/_vendor/__init__.py +3 -0
  222. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  223. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
  224. azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
  225. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
  226. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  227. azure/ai/evaluation/_version.py +2 -1
  228. azure/ai/evaluation/red_team/__init__.py +22 -0
  229. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  230. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  231. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  232. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  233. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  234. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  235. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  236. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  237. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  238. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  239. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  240. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  241. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  242. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  243. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  244. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  245. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  246. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  247. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  248. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  249. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  250. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  251. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  252. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  253. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  254. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  255. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  256. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  257. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  258. azure/ai/evaluation/simulator/__init__.py +2 -1
  259. azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
  260. azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
  261. azure/ai/evaluation/simulator/_constants.py +12 -1
  262. azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
  263. azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
  264. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  265. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  266. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  267. azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
  268. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  269. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  270. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
  271. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
  272. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  273. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  274. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
  275. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
  276. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
  277. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
  278. azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
  279. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
  280. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
  281. azure/ai/evaluation/simulator/_simulator.py +302 -208
  282. azure/ai/evaluation/simulator/_utils.py +31 -13
  283. azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
  284. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  285. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  286. azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
  287. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
  288. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  289. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
  290. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
  291. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  292. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
  293. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
  294. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  295. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  296. azure/ai/evaluation/simulator/_tracing.py +0 -89
  297. azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
  298. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
  299. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
@@ -2,9 +2,11 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from concurrent.futures import as_completed
+from typing import Union
 
-from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+from typing_extensions import overload, override
+
+from azure.ai.evaluation._evaluators._common import MultiEvaluatorBase
 
 from .._coherence import CoherenceEvaluator
 from .._f1_score import F1ScoreEvaluator
@@ -14,55 +16,103 @@ from .._relevance import RelevanceEvaluator
 from .._similarity import SimilarityEvaluator
 
 
-class QAEvaluator:
+class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
     """
     Initialize a question-answer evaluator configured for a specific Azure OpenAI model.
 
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
-    :return: A function that evaluates and generates metrics for "question-answering" scenario.
-    :rtype: Callable
-
-    **Usage**
-
-    .. code-block:: python
-
-        eval_fn = QAEvaluator(model_config)
-        result = qa_eval(
-            query="Tokyo is the capital of which country?",
-            response="Japan",
-            context="Tokyo is the capital of Japan.",
-            ground_truth="Japan"
-        )
-
-    **Output format**
-
-    .. code-block:: python
-
-        {
-            "gpt_groundedness": 3.5,
-            "gpt_relevance": 4.0,
-            "gpt_coherence": 1.5,
-            "gpt_fluency": 4.0,
-            "gpt_similarity": 3.0,
-            "f1_score": 0.42
-        }
+    :param groundedness_threshold: The threshold for groundedness evaluation. Default is 3.
+    :type groundedness_threshold: int
+    :param relevance_threshold: The threshold for relevance evaluation. Default is 3.
+    :type relevance_threshold: int
+    :param coherence_threshold: The threshold for coherence evaluation. Default is 3.
+    :type coherence_threshold: int
+    :param fluency_threshold: The threshold for fluency evaluation. Default is 3.
+    :type fluency_threshold: int
+    :param similarity_threshold: The threshold for similarity evaluation. Default is 3.
+    :type similarity_threshold: int
+    :param f1_score_threshold: The threshold for F1 score evaluation. Default is 0.5.
+    :type f1_score_threshold: float
+    :return: A callable class that evaluates and generates metrics for "question-answering" scenario.
+    :param kwargs: Additional arguments to pass to the evaluator.
+    :type kwargs: Any
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START qa_evaluator]
+            :end-before: [END qa_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a QAEvaluator.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START qa_evaluator]
+            :end-before: [END qa_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call QAEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    .. admonition:: Example with Threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_qa_evaluator]
+            :end-before: [END threshold_qa_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call a QAEvaluator.
+
+    .. note::
+
+        To align with our support of a diverse set of models, keys without the `gpt_` prefix has been added.
+        To maintain backwards compatibility, the old keys with the `gpt_` prefix are still be present in the output;
+        however, it is recommended to use the new keys moving forward as the old keys will be deprecated in the future.
     """
 
-    def __init__(self, model_config: dict, parallel: bool = True):
-        self._parallel = parallel
-
-        self._evaluators = [
-            GroundednessEvaluator(model_config),
-            RelevanceEvaluator(model_config),
-            CoherenceEvaluator(model_config),
-            FluencyEvaluator(model_config),
-            SimilarityEvaluator(model_config),
-            F1ScoreEvaluator(),
+    id = "azureai://built-in/evaluators/qa"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    def __init__(
+        self,
+        model_config,
+        *,
+        groundedness_threshold: int = 3,
+        relevance_threshold: int = 3,
+        coherence_threshold: int = 3,
+        fluency_threshold: int = 3,
+        similarity_threshold: int = 3,
+        f1_score_threshold: float = 0.5,
+        **kwargs,
+    ):
+        # Type checking
+        for name, value in [
+            ("groundedness_threshold", groundedness_threshold),
+            ("relevance_threshold", relevance_threshold),
+            ("coherence_threshold", coherence_threshold),
+            ("fluency_threshold", fluency_threshold),
+            ("similarity_threshold", similarity_threshold),
+            ("f1_score_threshold", f1_score_threshold),
+        ]:
+            if not isinstance(value, (int, float)):
+                raise TypeError(f"{name} must be an int or float, got {type(value)}")
+
+        evaluators = [
+            GroundednessEvaluator(model_config, threshold=groundedness_threshold),
+            RelevanceEvaluator(model_config, threshold=relevance_threshold),
+            CoherenceEvaluator(model_config, threshold=coherence_threshold),
+            FluencyEvaluator(model_config, threshold=fluency_threshold),
+            SimilarityEvaluator(model_config, threshold=similarity_threshold),
+            F1ScoreEvaluator(threshold=f1_score_threshold),
         ]
+        super().__init__(evaluators=evaluators, **kwargs)
 
-    def __call__(self, *, query: str, response: str, context: str, ground_truth: str, **kwargs):
+    @overload  # type: ignore
+    def __call__(self, *, query: str, response: str, context: str, ground_truth: str):
         """
         Evaluates question-answering scenario.
 
@@ -74,27 +124,29 @@ class QAEvaluator:
         :paramtype context: str
         :keyword ground_truth: The ground truth to be evaluated.
         :paramtype ground_truth: str
-        :keyword parallel: Whether to evaluate in parallel. Defaults to True.
-        :paramtype parallel: bool
         :return: The scores for QA scenario.
-        :rtype: dict
+        :rtype: Dict[str, Union[str, float]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
         """
-        results = {}
-        if self._parallel:
-            with ThreadPoolExecutor() as executor:
-                futures = {
-                    executor.submit(
-                        evaluator, query=query, response=response, context=context, ground_truth=ground_truth, **kwargs
-                    ): evaluator
-                    for evaluator in self._evaluators
-                }
-
-                # Collect results as they complete
-                for future in as_completed(futures):
-                    results.update(future.result())
-        else:
-            for evaluator in self._evaluators:
-                result = evaluator(query=query, response=response, context=context, ground_truth=ground_truth, **kwargs)
-                results.update(result)
-
-        return results
+        Evaluates question-answering scenario.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword context: The context to be evaluated.
+        :paramtype context: str
+        :keyword ground_truth: The ground truth to be evaluated.
+        :paramtype ground_truth: str
+        :return: The scores for QA scenario.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
+        return super().__call__(*args, **kwargs)
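
The new signature above replaces the old `parallel` flag with keyword-only, per-metric thresholds, and delegates execution to `MultiEvaluatorBase`. A minimal sketch of how the 1.13.x `QAEvaluator` might be driven, assuming a placeholder `AzureOpenAIModelConfiguration` (the endpoint, key, and deployment below are illustrative, not real values):

from azure.ai.evaluation import AzureOpenAIModelConfiguration, QAEvaluator

# Hypothetical connection details; substitute your own Azure OpenAI resource.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    api_key="<api-key>",
    azure_deployment="<deployment-name>",
)

# Thresholds are keyword-only in the signature shown in the diff.
qa_eval = QAEvaluator(model_config, groundedness_threshold=3, f1_score_threshold=0.5)

result = qa_eval(
    query="Tokyo is the capital of which country?",
    response="Japan",
    context="Tokyo is the capital of Japan.",
    ground_truth="Japan",
)
# Per the docstring note, the output carries both the legacy "gpt_"-prefixed keys
# and the new unprefixed keys (e.g. "gpt_relevance" alongside "relevance").
print(result)
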
@@ -1,126 +1,210 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-
+import logging
+import math
 import os
-import re
-
-import numpy as np
-from promptflow._utils.async_utils import async_run_allowing_running_loop
-from promptflow.core import AsyncPrompty
-
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from typing import Dict, Union, List
 
-from ..._common.utils import ensure_api_version_in_aoai_model_config, ensure_user_agent_in_aoai_model_config
+from typing_extensions import overload, override
 
-try:
-    from ..._user_agent import USER_AGENT
-except ImportError:
-    USER_AGENT = None
-
-
-class _AsyncRelevanceEvaluator:
-    # Constants must be defined within eval's directory to be save/loadable
-    PROMPTY_FILE = "relevance.prompty"
-    LLM_CALL_TIMEOUT = 600
-    DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from ..._common.utils import reformat_conversation_history, reformat_agent_response
 
-    def __init__(self, model_config: dict):
-        ensure_api_version_in_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
+from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
-
-        ensure_user_agent_in_aoai_model_config(
-            model_config,
-            prompty_model_config,
-            USER_AGENT,
-        )
-
-        current_dir = os.path.dirname(__file__)
-        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
-        self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
+logger = logging.getLogger(__name__)
 
-    async def __call__(self, *, query: str, response: str, context: str, **kwargs):
-        # Validate input parameters
-        query = str(query or "")
-        response = str(response or "")
-        context = str(context or "")
 
-        if not (query.strip() and response.strip() and context.strip()):
-            msg = "'query', 'response' and 'context' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.RELEVANCE_EVALUATOR,
-            )
-
-        # Run the evaluation flow
-        llm_output = await self._flow(
-            query=query, response=response, context=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs
-        )
-
-        score = np.nan
-        if llm_output:
-            match = re.search(r"\d", llm_output)
-            if match:
-                score = float(match.group())
-
-        return {"gpt_relevance": float(score)}
+class RelevanceEvaluator(PromptyEvaluatorBase):
+    """
+    Evaluates relevance score for a given query and response or a multi-turn conversation, including reasoning.
 
+    The relevance measure assesses the ability of answers to capture the key points of the context.
+    High relevance scores signify the AI system's understanding of the input and its capability to produce coherent
+    and contextually appropriate outputs. Conversely, low relevance scores indicate that generated responses might
+    be off-topic, lacking in context, or insufficient in addressing the user's intended queries. Use the relevance
+    metric when evaluating the AI system's performance in understanding the input and generating contextually
+    appropriate responses.
 
-class RelevanceEvaluator:
-    """
-    Initialize a relevance evaluator configured for a specific Azure OpenAI model.
+    Relevance scores range from 1 to 5, with 1 being the worst and 5 being the best.
 
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
        ~azure.ai.evaluation.OpenAIModelConfiguration]
+    :param threshold: The threshold for the relevance evaluator. Default is 3.
+    :type threshold: int
+    :param credential: The credential for authenticating to Azure AI service.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START relevance_evaluator]
+            :end-before: [END relevance_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a RelevanceEvaluator with a query, response, and context.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START relevance_evaluator]
+            :end-before: [END relevance_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call RelevanceEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    .. admonition:: Example with Threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_relevance_evaluator]
+            :end-before: [END threshold_relevance_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call a RelevanceEvaluator with a query, response, and context.
+
+    .. note::
+
+        To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+        To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
+        however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+    """
 
-    **Usage**
-
-    .. code-block:: python
-
-        eval_fn = RelevanceEvaluator(model_config)
-        result = eval_fn(
-            query="What is the capital of Japan?",
-            response="The capital of Japan is Tokyo.",
-            context="Tokyo is Japan's capital, known for its blend of traditional culture \
-            and technological advancements.")
-
-    **Output format**
-
-    .. code-block:: python
+    # Constants must be defined within eval's directory to be save/loadable
+    _PROMPTY_FILE = "relevance.prompty"
+    _RESULT_KEY = "relevance"
 
-        {
-            "gpt_relevance": 3.0
-        }
-    """
+    id = "azureai://built-in/evaluators/relevance"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-    def __init__(self, model_config: dict):
-        self._async_evaluator = _AsyncRelevanceEvaluator(model_config)
+    @override
+    def __init__(self, model_config, *, credential=None, threshold=3, **kwargs):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=threshold,
+            credential=credential,
+            _higher_is_better=True,
+            **kwargs,
+        )
 
-    def __call__(self, *, query: str, response: str, context: str, **kwargs):
-        """
-        Evaluate relevance.
+    @overload
+    def __call__(
+        self,
+        *,
+        query: str,
+        response: str,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate groundedness for given input of query, response, context
 
         :keyword query: The query to be evaluated.
         :paramtype query: str
         :keyword response: The response to be evaluated.
         :paramtype response: str
-        :keyword context: The context to be evaluated.
-        :paramtype context: str
         :return: The relevance score.
-        :rtype: dict
+        :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, query=query, response=response, context=context, **kwargs
-        )
 
-    def _to_async(self):
-        return self._async_evaluator
+    @overload
+    def __call__(
+        self,
+        *,
+        conversation: Conversation,
+    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+        """Evaluate relevance for a conversation
+
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The relevance score.
+        :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """Evaluate relevance. Accepts either a query and response for a single evaluation,
+        or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
+        the evaluator will aggregate the results of each turn.
+
+        :keyword query: The query to be evaluated. Mutually exclusive with the `conversation` parameter.
+        :paramtype query: Optional[str]
+        :keyword response: The response to be evaluated. Mutually exclusive with the `conversation` parameter.
+        :paramtype response: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The relevance score.
+        :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]]
+        """
+        return super().__call__(*args, **kwargs)
+
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+        """Do a relevance evaluation.
+
+        :param eval_input: The input to the evaluator. Expected to contain
+            whatever inputs are needed for the _flow method, including context
+            and other fields depending on the child class.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        if "query" not in eval_input and "response" not in eval_input:
+            raise EvaluationException(
+                message="Only text conversation inputs are supported.",
+                internal_message="Only text conversation inputs are supported.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.CONVERSATION,
+            )
+        if not isinstance(eval_input["query"], str):
+            eval_input["query"] = reformat_conversation_history(eval_input["query"], logger)
+        if not isinstance(eval_input["response"], str):
+            eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
+        result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = result.get("llm_output")
+        score = math.nan
+
+        if isinstance(llm_output, dict):
+            score = float(llm_output.get("score", math.nan))
+            reason = llm_output.get("explanation", "")
+            # Parse out score and reason from evaluators known to possess them.
+            binary_result = self._get_binary_result(score)
+            return {
+                self._result_key: float(score),
+                f"gpt_{self._result_key}": float(score),
+                f"{self._result_key}_result": binary_result,
+                f"{self._result_key}_threshold": self._threshold,
+                f"{self._result_key}_reason": reason,
+                f"{self._result_key}_prompt_tokens": result.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": result.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": result.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": result.get("finish_reason", ""),
+                f"{self._result_key}_model": result.get("model_id", ""),
+                f"{self._result_key}_sample_input": result.get("sample_input", ""),
+                f"{self._result_key}_sample_output": result.get("sample_output", ""),
+            }
+
+        if logger:
+            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
+
+        binary_result = self._get_binary_result(score)
+        return {
+            self._result_key: float(score),
+            f"{self._result_key}_result": binary_result,
+            f"{self._result_key}_threshold": self._threshold,
+        }
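
The rewritten evaluator drops the `context` keyword from the single-turn call, adds a conversation mode, and reports threshold and reasoning fields alongside the score. A hedged sketch of both call shapes, again with placeholder Azure OpenAI settings (endpoint, key, and deployment are illustrative only):

from azure.ai.evaluation import AzureOpenAIModelConfiguration, RelevanceEvaluator

# Hypothetical connection details; substitute your own Azure OpenAI resource.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    api_key="<api-key>",
    azure_deployment="<deployment-name>",
)

relevance_eval = RelevanceEvaluator(model_config, threshold=3)

# Single-turn call, matching the first __call__ overload (query/response only).
single = relevance_eval(
    query="What is the capital of Japan?",
    response="The capital of Japan is Tokyo.",
)
# Per _do_eval above, the result should include "relevance", "gpt_relevance",
# "relevance_result", "relevance_threshold", and "relevance_reason".

# Multi-turn call, matching the conversation overload: a dict with a "messages" list.
conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of Japan?"},
        {"role": "assistant", "content": "The capital of Japan is Tokyo."},
    ]
}
aggregated = relevance_eval(conversation=conversation)
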