azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +100 -5
- azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
- azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
- azure/ai/evaluation/_aoai/label_grader.py +68 -0
- azure/ai/evaluation/_aoai/python_grader.py +86 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +204 -0
- azure/ai/evaluation/_azure/_envs.py +207 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +129 -0
- azure/ai/evaluation/_common/__init__.py +9 -1
- azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
- azure/ai/evaluation/_common/constants.py +131 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
- azure/ai/evaluation/_common/math.py +89 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +166 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +66 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +831 -142
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +870 -34
- azure/ai/evaluation/_constants.py +167 -6
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +899 -0
- azure/ai/evaluation/_converters/_models.py +467 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +83 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
- azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
- azure/ai/evaluation/_evaluate/_utils.py +289 -40
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
- azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
- azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
- azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
- azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
- azure/ai/evaluation/_exceptions.py +51 -7
- azure/ai/evaluation/_http_utils.py +210 -137
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
- azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_model_configurations.py +130 -8
- azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +2 -1
- azure/ai/evaluation/red_team/__init__.py +22 -0
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
- azure/ai/evaluation/red_team/_default_converter.py +21 -0
- azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
- azure/ai/evaluation/red_team/_red_team.py +1717 -0
- azure/ai/evaluation/red_team/_red_team_result.py +661 -0
- azure/ai/evaluation/red_team/_result_processor.py +1708 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
- azure/ai/evaluation/red_team/_utils/constants.py +72 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
- azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
- azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
- azure/ai/evaluation/simulator/_constants.py +12 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
- azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
- azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
- azure/ai/evaluation/simulator/_simulator.py +302 -208
- azure/ai/evaluation/simulator/_utils.py +31 -13
- azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
- azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
- azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
- azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
- azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
- {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py
@@ -0,0 +1,179 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from typing import List, Union, Dict
+from typing_extensions import overload, override
+
+from azure.ai.evaluation._common._experimental import experimental
+from azure.ai.evaluation._common.constants import EvaluationMetrics
+from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
+from azure.ai.evaluation._model_configurations import Conversation
+
+
+@experimental
+class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
+    """
+    Evaluates service-based groundedness score for a given response, context, and query or a multi-turn conversation,
+    including reasoning.
+
+    The groundedness measure calls Azure AI Evaluation service to assess how well the AI-generated answer is grounded
+    in the source context. Even if the responses from LLM are factually correct, they'll be considered ungrounded if
+    they can't be verified against the provided sources (such as your input source or your database).
+
+    Service-based groundedness scores are boolean values, where True indicates that the response is grounded.
+
+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
+    :param threshold: The threshold for the groundedness pro evaluator. Default is 5.
+    :type threshold: int
+    :param kwargs: Additional arguments to pass to the evaluator.
+    :type kwargs: Any
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START groundedness_pro_evaluator]
+            :end-before: [END groundedness_pro_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a GroundednessProEvaluator with a query, response, and context.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START groundedness_pro_evaluator]
+            :end-before: [END groundedness_pro_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call GroundednessProEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    .. admonition:: Example with threshold:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_groundedness_pro_evaluator]
+            :end-before: [END threshold_groundedness_pro_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with a specified threshold and call GroundednessProEvaluator with a query, response, and context.
+
+    .. note::
+
+        If this evaluator is supplied to the `evaluate` function, the aggregated metric
+        for the groundedness pro label will be "groundedness_pro_passing_rate".
+    """
+
+    id = "azureai://built-in/evaluators/groundedness_pro"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
+
+    @override
+    def __init__(
+        self,
+        credential,
+        azure_ai_project,
+        *,
+        threshold: int = 5,
+        **kwargs,
+    ):
+        self.threshold = threshold
+        self._higher_is_better = True
+        self._output_prefix = "groundedness_pro"
+        super().__init__(
+            eval_metric=EvaluationMetrics.GROUNDEDNESS,
+            azure_ai_project=azure_ai_project,
+            credential=credential,
+            threshold=self.threshold,
+            **kwargs,
+        )
+
+    @overload
+    def __call__(
+        self,
+        *,
+        response: str,
+        context: str,
+        query: str,
+    ) -> Dict[str, Union[str, bool]]:
+        """Evaluate groundedness for a given query/response/context
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword context: The context to be evaluated.
+        :paramtype context: str
+        :keyword query: The query to be evaluated.
+        :paramtype query: Optional[str]
+        :return: The relevance score.
+        :rtype: Dict[str, Union[str, bool]]
+        """
+
+    @overload
+    def __call__(
+        self,
+        *,
+        conversation: Conversation,
+    ) -> Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]:
+        """Evaluate groundedness for a conversation for a multi-turn evaluation. If the conversation has
+        more than one turn, the evaluator will aggregate the results of each turn, with the per-turn results
+        available in the output under the "evaluation_per_turn" key.
+
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The relevance score.
+        :rtype: Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """Evaluate groundedness. Accepts either a query, response and context for a single-turn evaluation, or a
+        or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
+        the evaluator will aggregate the results of each turn, with the per-turn results available
+        in the output under the "evaluation_per_turn" key.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: Optional[str]
+        :keyword response: The response to be evaluated.
+        :paramtype response: Optional[str]
+        :keyword context: The context to be evaluated.
+        :paramtype context: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The relevance score.
+        :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]]
+        """
+        return super().__call__(*args, **kwargs)
+
+    @override
+    async def _do_eval(self, eval_input: Dict):
+        """This evaluator has some unique post-processing that requires data that
+        the rai_service script is not currently built to handle. So we post-post-process
+        the result here to message it into the right form.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        result = await super()._do_eval(eval_input)
+        real_result = {}
+        real_result[self._output_prefix + "_reason"] = result[EvaluationMetrics.GROUNDEDNESS + "_reason"]
+        real_result[self._output_prefix + "_label"] = (
+            result[EvaluationMetrics.GROUNDEDNESS + "_score"] >= self.threshold
+        )
+        if self._higher_is_better:
+            real_result[self._output_prefix + "_score"] = max(result[EvaluationMetrics.GROUNDEDNESS + "_score"], 0)
+        else:
+            real_result[self._output_prefix + "_score"] = min(result[EvaluationMetrics.GROUNDEDNESS + "_score"], 1)
+
+        return real_result
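
For orientation, a minimal usage sketch of the new evaluator based on the constructor and overloads above; the project endpoint, credential setup, and the query/response/context strings are placeholders rather than values taken from this diff:

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import GroundednessProEvaluator

# Placeholder project endpoint, in the format shown in the docstring above.
groundedness_pro = GroundednessProEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project="https://{resource_name}.services.ai.azure.com/api/projects/{project_name}",
    threshold=5,  # default; service scores at or above this become a True groundedness_pro_label
)

result = groundedness_pro(
    query="Which tent is the most waterproof?",  # illustrative inputs
    response="The Alpine Explorer Tent is the most waterproof.",
    context="From our catalog: the Alpine Explorer Tent has the highest rainfly waterproof rating.",
)
print(result["groundedness_pro_label"], result["groundedness_pro_reason"])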
azure/ai/evaluation/_evaluators/_similarity/_similarity.py
@@ -3,108 +3,124 @@
 # ---------------------------------------------------------
 
 import os
-import
+from typing import Dict
 
-import
-from promptflow._utils.async_utils import async_run_allowing_running_loop
-from promptflow.core import AsyncPrompty
+from typing_extensions import overload, override
 
-from azure.ai.evaluation.
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 
-from ..._common.utils import ensure_api_version_in_aoai_model_config, ensure_user_agent_in_aoai_model_config
 
-
-
-
-USER_AGENT = None
-
-
-class _AsyncSimilarityEvaluator:
-    # Constants must be defined within eval's directory to be save/loadable
-    PROMPTY_FILE = "similarity.prompty"
-    LLM_CALL_TIMEOUT = 600
-    DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
-
-    def __init__(self, model_config: dict):
-        ensure_api_version_in_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
-
-        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
-
-        ensure_user_agent_in_aoai_model_config(
-            model_config,
-            prompty_model_config,
-            USER_AGENT,
-        )
-
-        current_dir = os.path.dirname(__file__)
-        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
-        self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
-
-    async def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
-        # Validate input parameters
-        query = str(query or "")
-        response = str(response or "")
-        ground_truth = str(ground_truth or "")
-
-        if not (query.strip() and response.strip() and ground_truth.strip()):
-            msg = "'query', 'response' and 'ground_truth' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.SIMILARITY_EVALUATOR,
-            )
-
-        # Run the evaluation flow
-        llm_output = await self._flow(
-            query=query, response=response, ground_truth=ground_truth, timeout=self.LLM_CALL_TIMEOUT, **kwargs
-        )
-
-        score = np.nan
-        if llm_output:
-            match = re.search(r"\d", llm_output)
-            if match:
-                score = float(match.group())
+class SimilarityEvaluator(PromptyEvaluatorBase):
+    """
+    Evaluates similarity score for a given query, response, and ground truth.
 
-
+    The similarity measure evaluates the likeness between a ground truth sentence (or document) and the
+    AI model's generated prediction. This calculation involves creating sentence-level embeddings for both
+    the ground truth and the model's prediction, which are high-dimensional vector representations capturing
+    the semantic meaning and context of the sentences.
 
+    Use it when you want an objective evaluation of an AI model's performance, particularly in text generation
+    tasks where you have access to ground truth responses. Similarity enables you to assess the generated
+    text's semantic alignment with the desired content, helping to gauge the model's quality and accuracy.
 
-
-    """
-    Initialize a similarity evaluator configured for a specific Azure OpenAI model.
+    Similarity scores range from 1 to 5, with 1 being the least similar and 5 being the most similar.
 
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
        ~azure.ai.evaluation.OpenAIModelConfiguration]
+    :param threshold: The threshold for the similarity evaluator. Default is 3.
+    :type threshold: int
+    :param credential: The credential for authenticating to Azure AI service.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START similarity_evaluator]
+            :end-before: [END similarity_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a SimilarityEvaluator with a four-gram rouge type.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START similarity_evaluator]
+            :end-before: [END similarity_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call SimilarityEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_similarity_evaluator]
+            :end-before: [END threshold_similarity_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with a threshold and call a SimilarityEvaluator.
+
+    .. note::
+
+        To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+        To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
+        however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+    """
 
-
-
-    .. code-block:: python
+    # Constants must be defined within eval's directory to be save/loadable
 
-
-
-            query="What is the capital of Japan?",
-            response="The capital of Japan is Tokyo.",
-            ground_truth="Tokyo is Japan's capital.")
+    _PROMPTY_FILE = "similarity.prompty"
+    _RESULT_KEY = "similarity"
 
-
+    id = "azureai://built-in/evaluators/similarity"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-
+    @override
+    def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        self._threshold = threshold
+        self._higher_is_better = True
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=threshold,
+            credential=credential,
+            _higher_is_better=self._higher_is_better,
+            **kwargs,
+        )
 
-
-
-
-
+    # Ignoring a mypy error about having only 1 overload function.
+    # We want to use the overload style for all evals, even single-inputs. This is both to make
+    # refactoring to multi-input styles easier, stylistic consistency consistency across evals,
+    # and due to the fact that non-overloaded syntax now causes various parsing issues that
+    # we don't want to deal with.
+    @overload  # type: ignore
+    def __call__(self, *, query: str, response: str, ground_truth: str) -> Dict[str, float]:
+        """
+        Evaluate similarity.
 
-
-
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be evaluated.
+        :paramtype ground_truth: str
+        :return: The similarity score.
+        :rtype: Dict[str, float]
+        """
 
-
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
         """
         Evaluate similarity.
 
@@ -115,11 +131,6 @@ class SimilarityEvaluator:
         :keyword ground_truth: The ground truth to be evaluated.
         :paramtype ground_truth: str
        :return: The similarity score.
-        :rtype:
+        :rtype: Dict[str, float]
         """
-        return
-            self._async_evaluator, query=query, response=response, ground_truth=ground_truth, **kwargs
-        )
-
-    def _to_async(self):
-        return self._async_evaluator
+        return super().__call__(*args, **kwargs)
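
A minimal sketch of the reworked SimilarityEvaluator call pattern; the model configuration values (endpoint, key, deployment) are placeholders, while the query/response/ground_truth strings reuse the example removed from the old docstring:

from azure.ai.evaluation import AzureOpenAIModelConfiguration, SimilarityEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://{account}.openai.azure.com",  # placeholder
    api_key="{api-key}",  # placeholder
    azure_deployment="{deployment-name}",  # placeholder
)

similarity = SimilarityEvaluator(model_config, threshold=3)
result = similarity(
    query="What is the capital of Japan?",
    response="The capital of Japan is Tokyo.",
    ground_truth="Tokyo is Japan's capital.",
)
# 1-5 score; a gpt_-prefixed key is also kept for backwards compatibility per the note above.
print(result["similarity"])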
azure/ai/evaluation/_evaluators/_similarity/similarity.prompty
@@ -3,11 +3,6 @@ name: Similarity
 description: Evaluates similarity score for QA scenario
 model:
   api: chat
-  configuration:
-    type: azure_openai
-    azure_deployment: ${env:AZURE_DEPLOYMENT}
-    api_key: ${env:AZURE_OPENAI_API_KEY}
-    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
     max_tokens: 1
azure/ai/evaluation/_evaluators/_task_adherence/__init__.py
@@ -0,0 +1,7 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._task_adherence import TaskAdherenceEvaluator
+
+__all__ = ["TaskAdherenceEvaluator"]
azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py
@@ -0,0 +1,226 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+import os
+import math
+import logging
+from typing import Dict, Union, List, Optional
+
+from typing_extensions import overload, override
+
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from ..._common.utils import (
+    reformat_conversation_history,
+    reformat_agent_response,
+)
+from azure.ai.evaluation._model_configurations import Message
+from azure.ai.evaluation._common._experimental import experimental
+
+logger = logging.getLogger(__name__)
+
+
+@experimental
+class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+    """The Task Adherence evaluator assesses whether an AI assistant's actions fully align with the user's intent
+    and fully achieve the intended goal across three dimensions:
+
+    - Goal adherence: Did the assistant achieve the user's objective within scope and constraints?
+    - Rule adherence: Did the assistant respect safety, privacy, authorization, and presentation contracts?
+    - Procedural adherence: Did the assistant follow required workflows, tool use, sequencing, and verification?
+
+    The evaluator returns a boolean flag indicating whether there was any material failure in any dimension.
+    A material failure is an issue that makes the output unusable, creates verifiable risk, violates an explicit
+    constraint, or is a critical issue as defined in the evaluation dimensions.
+
+    The evaluation includes step-by-step reasoning and a flagged boolean result.
+
+
+    :param model_config: Configuration for the Azure OpenAI model.
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START task_adherence_evaluator]
+            :end-before: [END task_adherence_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call an TaskAdherenceEvaluator with a query and response.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START task_adherence_evaluator]
+            :end-before: [END task_adherence_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call TaskAdherenceEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    """
+
+    _PROMPTY_FILE = "task_adherence.prompty"
+    _RESULT_KEY = "task_adherence"
+    _OPTIONAL_PARAMS = []
+
+    _DEFAULT_TASK_ADHERENCE_SCORE = 0
+
+    id = "azureai://built-in/evaluators/task_adherence"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    @override
+    def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE, credential=None, **kwargs):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        self.threshold = threshold  # to be removed in favor of _threshold
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            credential=credential,
+            _higher_is_better=True,
+            **kwargs,
+        )
+
+    @overload
+    def __call__(
+        self,
+        *,
+        query: Union[str, List[dict]],
+        response: Union[str, List[dict]],
+        tool_definitions: Optional[Union[dict, List[dict]]] = None,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate task adherence for a given query and response.
+        The query and response must be lists of messages in conversation format.
+
+
+        Example with list of messages:
+            evaluator = TaskAdherenceEvaluator(model_config)
+            query = [{'role': 'system', 'content': 'You are a friendly and helpful customer service agent.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Hi, I need help with the last 2 orders on my account #888. Could you please update me on their status?'}]}]
+            response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Hello! Let me quickly look up your account details.'}]}, {'createdAt': 1700000075, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_001', 'type': 'function', 'function': {'name': 'get_orders', 'arguments': {'account_number': '888'}}}}]}, {'createdAt': 1700000080, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_001', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '[{ "order_id": "123" }, { "order_id": "124" }]'}]}, {'createdAt': 1700000085, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Thanks for your patience. I see two orders on your account. Let me fetch the details for both.'}]}, {'createdAt': 1700000090, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_002', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '123'}}}}, {'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_003', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '124'}}}}]}, {'createdAt': 1700000095, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_002', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "123", "status": "shipped", "delivery_date": "2025-03-15" } }'}]}, {'createdAt': 1700000100, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_003', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "124", "status": "delayed", "expected_delivery": "2025-03-20" } }'}]}, {'createdAt': 1700000105, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'The order with ID 123 has been shipped and is expected to be delivered on March 15, 2025. However, the order with ID 124 is delayed and should now arrive by March 20, 2025. Is there anything else I can help you with?'}]}]
+
+            result = evaluator(query=query, response=response)
+
+        :keyword query: The query being evaluated, must be a list of messages including system and user messages.
+        :paramtype query: Union[str, List[dict]]
+        :keyword response: The response being evaluated, must be a list of messages (full agent response including tool calls and results)
+        :paramtype response: Union[str, List[dict]]
+        :return: A dictionary with the task adherence evaluation results including flagged (bool) and reasoning (str).
+        :rtype: Dict[str, Union[str, float, bool]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Invokes the instance using the overloaded __call__ signature.
+
+        For detailed parameter types and return value documentation, see the overloaded __call__ definition.
+        """
+        return super().__call__(*args, **kwargs)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, bool]]:  # type: ignore[override]
+        """Do Task Adherence evaluation.
+        :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        # we override the _do_eval method as we want the output to be a dictionary,
+        # which is a different schema than _base_prompty_eval.py
+        if "query" not in eval_input or "response" not in eval_input:
+            raise EvaluationException(
+                message=f"Both query and response must be provided as input to the Task Adherence evaluator.",
+                internal_message=f"Both query and response must be provided as input to the Task Adherence evaluator.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.MISSING_FIELD,
+                target=ErrorTarget.TASK_ADHERENCE_EVALUATOR,
+            )
+
+        # Reformat conversation history and extract system message
+        query_messages = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
+        system_message = ""
+        user_query = ""
+
+        # Parse query messages to extract system message and user query
+        if isinstance(query_messages, list):
+            for msg in query_messages:
+                if isinstance(msg, dict) and msg.get("role") == "system":
+                    system_message = msg.get("content", "")
+                elif isinstance(msg, dict) and msg.get("role") == "user":
+                    user_query = msg.get("content", "")
+        elif isinstance(query_messages, str):
+            user_query = query_messages
+
+        # Reformat response and separate assistant messages from tool calls
+        response_messages = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
+        assistant_response = ""
+        tool_calls = ""
+
+        # Parse response messages to extract assistant response and tool calls
+        if isinstance(response_messages, list):
+            assistant_parts = []
+            tool_parts = []
+            for msg in response_messages:
+                if isinstance(msg, dict):
+                    role = msg.get("role", "")
+                    if role == "assistant":
+                        content = msg.get("content", "")
+                        if isinstance(content, list):
+                            for item in content:
+                                if isinstance(item, dict):
+                                    if item.get("type") == "text":
+                                        assistant_parts.append(item.get("text", ""))
+                                    elif item.get("type") == "tool_call":
+                                        tool_parts.append(str(item.get("tool_call", "")))
+                        else:
+                            assistant_parts.append(str(content))
+                    elif role == "tool":
+                        tool_parts.append(str(msg))
+            assistant_response = "\n".join(assistant_parts)
+            tool_calls = "\n".join(tool_parts)
+        elif isinstance(response_messages, str):
+            assistant_response = response_messages
+
+        # Prepare inputs for prompty
+        prompty_input = {
+            "system_message": system_message,
+            "query": user_query,
+            "response": assistant_response,
+            "tool_calls": tool_calls,
+        }
+
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **prompty_input)
+        llm_output = prompty_output_dict["llm_output"]
+
+        if isinstance(llm_output, dict):
+            flagged = llm_output.get("flagged", False)
+            reasoning = llm_output.get("reasoning", "")
+            # Convert flagged to numeric score for backward compatibility (1 = pass, 0 = fail)
+            score = 0.0 if flagged else 1.0
+            score_result = "fail" if flagged else "pass"
+
+            return {
+                f"{self._result_key}": score,
+                f"{self._result_key}_result": score_result,
+                f"{self._result_key}_reason": reasoning,
+                f"{self._result_key}_details": llm_output.get("details", ""),
+                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
+                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
+                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
+                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
+            }
+
+        if logger:
+            logger.warning("LLM output is not a dictionary, returning 0 for the success.")
+
+        return {self._result_key: 0}
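
A minimal usage sketch of the new TaskAdherenceEvaluator, assuming placeholder model configuration values and messages shortened from the docstring example above:

from azure.ai.evaluation import AzureOpenAIModelConfiguration, TaskAdherenceEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://{account}.openai.azure.com",  # placeholder
    api_key="{api-key}",  # placeholder
    azure_deployment="{deployment-name}",  # placeholder
)

task_adherence = TaskAdherenceEvaluator(model_config)
query = [
    {"role": "system", "content": "You are a friendly and helpful customer service agent."},
    {"role": "user", "content": [{"type": "text", "text": "What is the status of the orders on account #888?"}]},
]
response = [
    {"role": "assistant", "content": [{"type": "text", "text": "Order 123 has shipped; order 124 is delayed until March 20, 2025."}]},
]
result = task_adherence(query=query, response=response)
print(result["task_adherence_result"], result["task_adherence_reason"])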