azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release. This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +100 -5
- azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
- azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
- azure/ai/evaluation/_aoai/label_grader.py +68 -0
- azure/ai/evaluation/_aoai/python_grader.py +86 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +204 -0
- azure/ai/evaluation/_azure/_envs.py +207 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +129 -0
- azure/ai/evaluation/_common/__init__.py +9 -1
- azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
- azure/ai/evaluation/_common/constants.py +131 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
- azure/ai/evaluation/_common/math.py +89 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +166 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +66 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +831 -142
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +870 -34
- azure/ai/evaluation/_constants.py +167 -6
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +899 -0
- azure/ai/evaluation/_converters/_models.py +467 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +83 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
- azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
- azure/ai/evaluation/_evaluate/_utils.py +289 -40
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
- azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
- azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
- azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
- azure/ai/evaluation/_exceptions.py +51 -7
- azure/ai/evaluation/_http_utils.py +210 -137
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
- azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_model_configurations.py +130 -8
- azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +2 -1
- azure/ai/evaluation/red_team/__init__.py +22 -0
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
- azure/ai/evaluation/red_team/_default_converter.py +21 -0
- azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
- azure/ai/evaluation/red_team/_red_team.py +1717 -0
- azure/ai/evaluation/red_team/_red_team_result.py +661 -0
- azure/ai/evaluation/red_team/_result_processor.py +1708 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
- azure/ai/evaluation/red_team/_utils/constants.py +72 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
- azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
- azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
- azure/ai/evaluation/simulator/_constants.py +12 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
- azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
- azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
- azure/ai/evaluation/simulator/_simulator.py +302 -208
- azure/ai/evaluation/simulator/_utils.py +31 -13
- azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
- azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
- azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
- azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
- azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_evaluators/_gleu/_gleu.py

```diff
@@ -1,58 +1,99 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Dict
 from nltk.translate.gleu_score import sentence_gleu
-from
+from typing_extensions import overload, override
 
 from azure.ai.evaluation._common.utils import nltk_tokenize
 
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
+from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 
-class _AsyncGleuScoreEvaluator:
-    def __init__(self):
-        pass
 
-
-        reference_tokens = nltk_tokenize(ground_truth)
-        hypothesis_tokens = nltk_tokenize(response)
-
-        score = sentence_gleu([reference_tokens], hypothesis_tokens)
-
-        return {
-            "gleu_score": score,
-        }
-
-
-class GleuScoreEvaluator:
+class GleuScoreEvaluator(EvaluatorBase):
     """
-
+    Calculates the GLEU (Google-BLEU) score between a response and the ground truth.
 
     The GLEU (Google-BLEU) score evaluator measures the similarity between generated and reference texts by
     evaluating n-gram overlap, considering both precision and recall. This balanced evaluation, designed for
     sentence-level assessment, makes it ideal for detailed analysis of translation quality. GLEU is well-suited for
     use cases such as machine translation, text summarization, and text generation.
 
-
+    GLEU scores range from 0 to 1, where a value of 1 represents perfect overlap between the response and
+    the ground truth and a value of 0 indicates no overlap.
 
-
+    :param threshold: The threshold for the GLEU evaluator. Default is 0.5.
+    :type threshold: float
 
-
-        result = eval_fn(
-            response="Tokyo is the capital of Japan.",
-            ground_truth="The capital of Japan is Tokyo.")
+    .. admonition:: Example:
 
-
+    .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+        :start-after: [START gleu_score_evaluator]
+        :end-before: [END gleu_score_evaluator]
+        :language: python
+        :dedent: 8
+        :caption: Initialize and call a GleuScoreEvaluator.
 
-    ..
+    .. admonition:: Example with Threshold:
 
-
-
-
+    .. literalinclude:: ../samples/evaluation_samples_threshold.py
+        :start-after: [START threshold_gleu_score_evaluator]
+        :end-before: [END threshold_gleu_score_evaluator]
+        :language: python
+        :dedent: 8
+        :caption: Initialize with threshold and call a GleuScoreEvaluator.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+    .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+        :start-after: [START gleu_score_evaluator]
+        :end-before: [END gleu_score_evaluator]
+        :language: python
+        :dedent: 8
+        :caption: Initialize and call GleuScoreEvaluator using Azure AI Project URL in the following format
+            https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
     """
 
-
-
+    id = "azureai://built-in/evaluators/gleu_score"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-
+    @override
+    def __init__(self, *, threshold=0.5):
+        self._threshold = threshold
+        self._higher_is_better = True
+        super().__init__(threshold=threshold, _higher_is_better=self._higher_is_better)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a glue score evaluation result.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        reference_tokens = nltk_tokenize(ground_truth)
+        hypothesis_tokens = nltk_tokenize(response)
+
+        score = sentence_gleu([reference_tokens], hypothesis_tokens)
+        binary_result = False
+        if self._higher_is_better:
+            if score >= self._threshold:
+                binary_result = True
+        else:
+            if score <= self._threshold:
+                binary_result = True
+        return {
+            "gleu_score": score,
+            "gleu_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
+            "gleu_threshold": self._threshold,
+        }
+
+    @overload  # type: ignore
+    def __call__(self, *, ground_truth: str, response: str):
         """
         Evaluate the GLEU score between the response and the ground truth.
 
@@ -61,11 +102,23 @@ class GleuScoreEvaluator:
         :keyword ground_truth: The ground truth to be compared against.
         :paramtype ground_truth: str
         :return: The GLEU score.
-        :rtype:
+        :rtype: Dict[str, float]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
         """
-
-            self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
-        )
+        Evaluate the GLEU score between the response and the ground truth.
 
-
-
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The GLEU score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
```
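
The rewritten GLEU evaluator drops the internal `_AsyncGleuScoreEvaluator` wrapper, inherits from the shared `EvaluatorBase`, and adds a configurable pass/fail threshold to the output. A minimal usage sketch follows, inferred from the added code above rather than from separate documentation; the pass/fail strings assume `EVALUATION_PASS_FAIL_MAPPING` maps `True` to `"pass"` and `False` to `"fail"`:

```python
# Sketch only: exercises the GleuScoreEvaluator surface shown in this diff.
# Assumes azure-ai-evaluation >= 1.13 with its nltk dependency installed.
from azure.ai.evaluation import GleuScoreEvaluator

gleu = GleuScoreEvaluator(threshold=0.5)  # threshold keyword is new in this version

result = gleu(
    response="Tokyo is the capital of Japan.",
    ground_truth="The capital of Japan is Tokyo.",
)

print(result["gleu_score"])      # float in [0, 1]
print(result["gleu_result"])     # pass/fail label; score >= threshold passes
print(result["gleu_threshold"])  # echoes the configured threshold (0.5)
```

Where 1.0.0b2 returned only `{"gleu_score": ...}`, the call now routes through `EvaluatorBase.__call__` and emits `gleu_result` and `gleu_threshold` alongside the raw score.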
azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py

```diff
@@ -1,118 +1,354 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+import os, logging
+from typing import Dict, List, Optional, Union, Any, Tuple
 
-import
-import
+from typing_extensions import overload, override
+from azure.ai.evaluation._legacy.prompty import AsyncPrompty
 
-
-from
-from
-
-
-
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._model_configurations import Conversation
+from ..._common.utils import (
+    ErrorBlame,
+    ErrorTarget,
+    EvaluationException,
+    ErrorCategory,
+    construct_prompty_model_config,
+    validate_model_config,
+    simplify_messages,
+)
 
 try:
-    from ..._user_agent import
+    from ..._user_agent import UserAgentSingleton
 except ImportError:
-    USER_AGENT = None
-
-
-class _AsyncGroundednessEvaluator:
-    # Constants must be defined within eval's directory to be save/loadable
-    PROMPTY_FILE = "groundedness.prompty"
-    LLM_CALL_TIMEOUT = 600
-    DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
 
-
-
+    class UserAgentSingleton:
+        @property
+        def value(self) -> str:
+            return "None"
 
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
 
-
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
+logger = logging.getLogger(__name__)
 
-        ensure_user_agent_in_aoai_model_config(
-            model_config,
-            prompty_model_config,
-            USER_AGENT,
-        )
 
-
-
-
+class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+    """Evaluates groundedness score for a given query (optional), response, and context or a multi-turn conversation,
+    including reasoning.
 
-
-
-
-
+    The groundedness measure assesses the correspondence between claims in an AI-generated answer and the source
+    context, making sure that these claims are substantiated by the context. Even if the responses from LLM are
+    factually correct, they'll be considered ungrounded if they can't be verified against the provided sources
+    (such as your input source or your database). Use the groundedness metric when you need to verify that
+    AI-generated responses align with and are validated by the provided context.
 
-
-        msg = "Both 'response' and 'context' must be non-empty strings."
-        raise EvaluationException(
-            message=msg,
-            internal_message=msg,
-            error_category=ErrorCategory.MISSING_FIELD,
-            error_blame=ErrorBlame.USER_ERROR,
-            error_target=ErrorTarget.F1_EVALUATOR,
-        )
+    Groundedness scores range from 1 to 5, with 1 being the least grounded and 5 being the most grounded.
 
-
-
+    :param model_config: Configuration for the Azure OpenAI model.
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]
+    :param threshold: The threshold for the groundedness evaluator. Default is 3.
+    :type threshold: int
+    :param credential: The credential for authenticating to Azure AI service.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool
 
-
-        if llm_output:
-            match = re.search(r"\d", llm_output)
-            if match:
-                score = float(match.group())
+    .. admonition:: Example:
 
-
+    .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+        :start-after: [START groundedness_evaluator]
+        :end-before: [END groundedness_evaluator]
+        :language: python
+        :dedent: 8
+        :caption: Initialize and call a GroundednessEvaluator.
 
+    .. admonition:: Example with Threshold:
 
-
-
-
+    .. literalinclude:: ../samples/evaluation_samples_threshold.py
+        :start-after: [START threshold_groundedness_evaluator]
+        :end-before: [END threshold_groundedness_evaluator]
+        :language: python
+        :dedent: 8
+        :caption: Initialize with threshold and call a GroundednessEvaluator.
 
-
-    :type model_config: Union[~azure.ai.evalation.AzureOpenAIModelConfiguration,
-        ~azure.ai.evalation.OpenAIModelConfiguration]
+    .. admonition:: Example using Azure AI Project URL:
 
-
+    .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+        :start-after: [START groundedness_evaluator]
+        :end-before: [END groundedness_evaluator]
+        :language: python
+        :dedent: 8
+        :caption: Initialize and call GroundednessEvaluator using Azure AI Project URL in the following format
+            https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
-    ..
+    .. note::
 
-
-
-
-
-        and technological advancements.")
+        To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+        To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
+        however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+    """
 
-
+    _PROMPTY_FILE_NO_QUERY = "groundedness_without_query.prompty"
+    _PROMPTY_FILE_WITH_QUERY = "groundedness_with_query.prompty"
+    _RESULT_KEY = "groundedness"
+    _OPTIONAL_PARAMS = ["query"]
+    _SUPPORTED_TOOLS = ["file_search"]
 
-
+    id = "azureai://built-in/evaluators/groundedness"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-
-
-
-
+    @override
+    def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_NO_QUERY)  # Default to no query
 
-
-
+        self._higher_is_better = True
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=threshold,
+            credential=credential,
+            _higher_is_better=self._higher_is_better,
+            **kwargs,
+        )
+        self._model_config = model_config
+        self.threshold = threshold
+        # Needs to be set because it's used in call method to re-validate prompt if `query` is provided
 
-
-
-
+    @overload
+    def __call__(
+        self,
+        *,
+        response: str,
+        context: str,
+        query: Optional[str] = None,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate groundedness for given input of response, context
 
         :keyword response: The response to be evaluated.
         :paramtype response: str
-        :keyword context: The context
+        :keyword context: The context to be evaluated.
         :paramtype context: str
+        :keyword query: The query to be evaluated. Optional parameter for use with the `response`
+            and `context` parameters. If provided, a different prompt template will be used for evaluation.
+        :paramtype query: Optional[str]
+        :return: The groundedness score.
+        :rtype: Dict[str, float]
+        """
+
+    @overload
+    def __call__(
+        self,
+        *,
+        query: str,
+        response: List[dict],
+        tool_definitions: List[dict],
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate groundedness for agent response with tool calls. Only file_search tool is supported.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response from the agent to be evaluated.
+        :paramtype response: List[dict]
+        :keyword tool_definitions: The tool definitions used by the agent.
+        :paramtype tool_definitions: List[dict]
+        :return: The groundedness score.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
+    @overload
+    def __call__(
+        self,
+        *,
+        conversation: Conversation,
+    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+        """Evaluate groundedness for a conversation
+
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
         :return: The groundedness score.
-        :rtype:
+        :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """Evaluate groundedness. Accepts either a query, response, and context for a single evaluation,
+        or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
+        the evaluator will aggregate the results of each turn.
+
+        :keyword query: The query to be evaluated. Mutually exclusive with `conversation`. Optional parameter for use
+            with the `response` and `context` parameters. If provided, a different prompt template will be used for
+            evaluation.
+        :paramtype query: Optional[str]
+        :keyword response: The response to be evaluated. Mutually exclusive with the `conversation` parameter.
+        :paramtype response: Optional[str]
+        :keyword context: The context to be evaluated. Mutually exclusive with the `conversation` parameter.
+        :paramtype context: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The relevance score.
+        :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]]
+        """
+
+        if kwargs.get("query", None):
+            self._ensure_query_prompty_loaded()
+
+        return super().__call__(*args, **kwargs)
+
+    def _ensure_query_prompty_loaded(self):
+        """Switch to the query prompty file if not already loaded."""
+
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_WITH_QUERY)
+
+        self._prompty_file = prompty_path
+        prompty_model_config = construct_prompty_model_config(
+            validate_model_config(self._model_config),
+            self._DEFAULT_OPEN_API_VERSION,
+            UserAgentSingleton().value,
+        )
+        self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)
+
+    def _has_context(self, eval_input: dict) -> bool:
+        """
+        Return True if eval_input contains a non-empty 'context' field.
+        Treats None, empty strings, empty lists, and lists of empty strings as no context.
+        """
+        context = eval_input.get("context", None)
+        if not context:
+            return False
+        if context == "<>":  # Special marker for no context
+            return False
+        if isinstance(context, list):
+            return any(str(c).strip() for c in context)
+        if isinstance(context, str):
+            return bool(context.strip())
+        return True
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
+        if eval_input.get("query", None) is None:
+            return await super()._do_eval(eval_input)
+
+        contains_context = self._has_context(eval_input)
+
+        simplified_query = simplify_messages(eval_input["query"], drop_tool_calls=contains_context)
+        simplified_response = simplify_messages(eval_input["response"], drop_tool_calls=False)
+
+        # Build simplified input
+        simplified_eval_input = {
+            "query": simplified_query,
+            "response": simplified_response,
+            "context": eval_input["context"],
+        }
+
+        # Replace and call the parent method
+        return await super()._do_eval(simplified_eval_input)
+
+    async def _real_call(self, **kwargs):
+        """The asynchronous call where real end-to-end evaluation logic is performed.
+
+        :keyword kwargs: The inputs to evaluate.
+        :type kwargs: Dict
+        :return: The evaluation result.
+        :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
         """
-
+        # Convert inputs into list of evaluable inputs.
+        try:
+            return await super()._real_call(**kwargs)
+        except EvaluationException as ex:
+            if ex.category == ErrorCategory.NOT_APPLICABLE:
+                return {
+                    self._result_key: self._NOT_APPLICABLE_RESULT,
+                    f"{self._result_key}_result": "pass",
+                    f"{self._result_key}_threshold": self.threshold,
+                    f"{self._result_key}_reason": f"Supported tools were not called. Supported tools for groundedness are {self._SUPPORTED_TOOLS}.",
+                }
+            else:
+                raise ex
+
+    def _convert_kwargs_to_eval_input(self, **kwargs):
+        if kwargs.get("context") or kwargs.get("conversation"):
+            return super()._convert_kwargs_to_eval_input(**kwargs)
+        query = kwargs.get("query")
+        response = kwargs.get("response")
+        tool_definitions = kwargs.get("tool_definitions")
+
+        if query and self._prompty_file != self._PROMPTY_FILE_WITH_QUERY:
+            self._ensure_query_prompty_loaded()
+
+        if (not query) or (not response):  # or not tool_definitions:
+            msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided. For Agent groundedness 'query' and 'response' are required."
+            raise EvaluationException(
+                message=msg,
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
+            )
+        context = self._get_context_from_agent_response(response, tool_definitions)
+
+        filtered_response = self._filter_file_search_results(response)
+        return super()._convert_kwargs_to_eval_input(response=filtered_response, context=context, query=query)
+
+    def _filter_file_search_results(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Filter out file_search tool results from the messages."""
+        file_search_ids = self._get_file_search_tool_call_ids(messages)
+        return [
+            msg for msg in messages if not (msg.get("role") == "tool" and msg.get("tool_call_id") in file_search_ids)
+        ]
+
+    def _get_context_from_agent_response(self, response, tool_definitions):
+        """Extract context text from file_search tool results in the agent response."""
+        NO_CONTEXT = "<>"
+        context = ""
+        try:
+            logger.debug("Extracting context from response")
+            tool_calls = self._parse_tools_from_response(response=response)
+            logger.debug(f"Tool Calls parsed successfully: {tool_calls}")
+
+            if not tool_calls:
+                return NO_CONTEXT
+
+            context_lines = []
+            for tool_call in tool_calls:
+                if not isinstance(tool_call, dict) or tool_call.get("type") != "tool_call":
+                    continue
+
+                tool_name = tool_call.get("name")
+                if tool_name != "file_search":
+                    continue
+
+                # Extract tool results
+                for result in tool_call.get("tool_result", []):
+                    results = result if isinstance(result, list) else [result]
+                    for r in results:
+                        file_name = r.get("file_name", "Unknown file name")
+                        for content in r.get("content", []):
+                            text = content.get("text")
+                            if text:
+                                context_lines.append(f"{file_name}:\n- {text}---\n\n")
+
+            context = "\n".join(context_lines) if len(context_lines) > 0 else None
+
+        except Exception as ex:
+            logger.debug(f"Error extracting context from agent response : {str(ex)}")
+            context = None
+
+        context = context if context else NO_CONTEXT
+        return context
 
-    def
-
+    def _get_file_search_tool_call_ids(self, query_or_response):
+        """Return a list of tool_call_ids for file search tool calls."""
+        tool_calls = self._parse_tools_from_response(query_or_response)
+        return [tc.get("tool_call_id") for tc in tool_calls if tc.get("name") == "file_search"]
```
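
The groundedness rewrite is larger: it folds three calling patterns into one evaluator via `@overload`, swaps prompty templates when a `query` is supplied, and can extract its own `context` from `file_search` tool results in agent responses. A sketch of the three patterns, assuming the dict-shaped `AzureOpenAIModelConfiguration` from this package; the endpoint, deployment, and key values are placeholders, and the agent-mode call is shown only in outline:

```python
# Sketch only: the three GroundednessEvaluator call patterns from the
# overloads above. Requires a real Azure OpenAI deployment to actually run.
from azure.ai.evaluation import GroundednessEvaluator

model_config = {
    "azure_endpoint": "https://<resource>.openai.azure.com",  # placeholder
    "azure_deployment": "<deployment>",                       # placeholder
    "api_key": "<key>",                                       # placeholder
}

groundedness = GroundednessEvaluator(model_config, threshold=3)

# 1) Single evaluation: response + context, with an optional query. Passing
#    `query` makes the evaluator load groundedness_with_query.prompty
#    (see _ensure_query_prompty_loaded above).
result = groundedness(
    query="What is the capital of Japan?",
    response="The capital of Japan is Tokyo.",
    context="Tokyo is Japan's capital and largest city.",
)
print(result["groundedness"])  # 1-5 score; scores >= threshold count as grounded

# 2) Multi-turn conversation: turns live under "messages"; per-turn results
#    are aggregated by the base class.
conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of Japan?"},
        {"role": "assistant", "content": "Tokyo.", "context": "Tokyo is Japan's capital."},
    ]
}
print(groundedness(conversation=conversation))

# 3) Agent mode (outline only): query + agent messages + tool definitions.
#    Only file_search is supported; context is pulled from file_search results.
# groundedness(query=agent_query, response=agent_messages, tool_definitions=tools)
```

Note the behavior surfaced in `_real_call`: if an agent response never invoked a supported tool, the evaluator returns a "not applicable" result marked `pass` rather than raising, so batch runs are not aborted by tool-free turns.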