azure-ai-evaluation 1.2.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (134):
  1. azure/ai/evaluation/__init__.py +42 -14
  2. azure/ai/evaluation/_azure/_models.py +6 -6
  3. azure/ai/evaluation/_common/constants.py +6 -2
  4. azure/ai/evaluation/_common/rai_service.py +38 -4
  5. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  6. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  7. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  8. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  9. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  10. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  11. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  12. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  13. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  14. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  15. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  16. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  17. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  18. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  19. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  20. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  21. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  22. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  23. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  24. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1225 -0
  25. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  26. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  27. azure/ai/evaluation/_common/utils.py +30 -10
  28. azure/ai/evaluation/_constants.py +10 -0
  29. azure/ai/evaluation/_converters/__init__.py +3 -0
  30. azure/ai/evaluation/_converters/_ai_services.py +804 -0
  31. azure/ai/evaluation/_converters/_models.py +302 -0
  32. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -3
  33. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +104 -0
  34. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  35. azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
  36. azure/ai/evaluation/_evaluate/_evaluate.py +36 -4
  37. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +23 -3
  38. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  39. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +120 -0
  40. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +21 -2
  41. azure/ai/evaluation/_evaluators/_common/_base_eval.py +43 -3
  42. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +3 -1
  43. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +43 -4
  44. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +16 -4
  45. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +42 -5
  46. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +15 -0
  47. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +15 -0
  48. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +15 -0
  49. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +15 -0
  50. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +28 -4
  51. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +21 -2
  52. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +26 -3
  53. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +21 -3
  54. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  55. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +152 -0
  56. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +161 -0
  57. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +26 -3
  58. azure/ai/evaluation/_evaluators/_qa/_qa.py +51 -7
  59. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +26 -2
  60. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  61. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +157 -0
  62. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +99 -0
  63. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +21 -2
  64. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +113 -4
  65. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +23 -3
  66. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +24 -5
  67. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  68. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +148 -0
  69. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +117 -0
  70. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  71. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +292 -0
  72. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +71 -0
  73. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  74. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +103 -0
  75. azure/ai/evaluation/_evaluators/_xpia/xpia.py +2 -0
  76. azure/ai/evaluation/_exceptions.py +5 -1
  77. azure/ai/evaluation/_legacy/__init__.py +3 -0
  78. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  79. azure/ai/evaluation/_legacy/_batch_engine/_config.py +45 -0
  80. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +368 -0
  81. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  82. azure/ai/evaluation/_legacy/_batch_engine/_logging.py +292 -0
  83. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +23 -0
  84. azure/ai/evaluation/_legacy/_batch_engine/_result.py +99 -0
  85. azure/ai/evaluation/_legacy/_batch_engine/_run.py +121 -0
  86. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  87. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +217 -0
  88. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  89. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +105 -0
  90. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +82 -0
  91. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  92. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  93. azure/ai/evaluation/_legacy/prompty/_connection.py +182 -0
  94. azure/ai/evaluation/_legacy/prompty/_exceptions.py +59 -0
  95. azure/ai/evaluation/_legacy/prompty/_prompty.py +313 -0
  96. azure/ai/evaluation/_legacy/prompty/_utils.py +545 -0
  97. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  98. azure/ai/evaluation/_red_team/__init__.py +3 -0
  99. azure/ai/evaluation/_red_team/_attack_objective_generator.py +192 -0
  100. azure/ai/evaluation/_red_team/_attack_strategy.py +42 -0
  101. azure/ai/evaluation/_red_team/_callback_chat_target.py +74 -0
  102. azure/ai/evaluation/_red_team/_default_converter.py +21 -0
  103. azure/ai/evaluation/_red_team/_red_team.py +1858 -0
  104. azure/ai/evaluation/_red_team/_red_team_result.py +246 -0
  105. azure/ai/evaluation/_red_team/_utils/__init__.py +3 -0
  106. azure/ai/evaluation/_red_team/_utils/constants.py +64 -0
  107. azure/ai/evaluation/_red_team/_utils/formatting_utils.py +164 -0
  108. azure/ai/evaluation/_red_team/_utils/logging_utils.py +139 -0
  109. azure/ai/evaluation/_red_team/_utils/strategy_utils.py +188 -0
  110. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  111. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  112. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +741 -0
  113. azure/ai/evaluation/_version.py +2 -1
  114. azure/ai/evaluation/simulator/_adversarial_scenario.py +3 -1
  115. azure/ai/evaluation/simulator/_adversarial_simulator.py +61 -27
  116. azure/ai/evaluation/simulator/_conversation/__init__.py +4 -5
  117. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -0
  118. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +145 -0
  119. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -0
  120. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +71 -1
  121. {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/METADATA +75 -15
  122. azure_ai_evaluation-1.4.0.dist-info/RECORD +197 -0
  123. {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/WHEEL +1 -1
  124. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  125. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  126. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  127. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  128. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  129. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  130. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  131. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  132. azure_ai_evaluation-1.2.0.dist-info/RECORD +0 -125
  133. {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/NOTICE.txt +0 -0
  134. {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/top_level.txt +0 -0
@@ -28,15 +28,26 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
+    :param threshold: The threshold for the similarity evaluator. Default is 5.
+    :type threshold: int

     .. admonition:: Example:

         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-            :start-after: [START rouge_score_evaluator]
-            :end-before: [END rouge_score_evaluator]
+            :start-after: [START similarity_evaluator]
+            :end-before: [END similarity_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.
+            :caption: Initialize and call a SimilarityEvaluator.
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_similarity_evaluator]
+            :end-before: [END threshold_similarity_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with a threshold and call a SimilarityEvaluator.

     .. note::
@@ -54,10 +65,18 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config):
+    def __init__(self, model_config, *, threshold=3):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+        self._threshold = threshold
+        self._higher_is_better = True
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=threshold,
+            _higher_is_better=self._higher_is_better
+        )

     # Ignoring a mypy error about having only 1 overload function.
     # We want to use the overload style for all evals, even single-inputs. This is both to make
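
Editor's note: a minimal usage sketch of the new keyword-only threshold parameter follows. It is illustrative only; the model configuration values are placeholders, and the call inputs simply reuse the evaluator's existing query/response/ground_truth keywords.

# Illustrative sketch (not from this diff): constructing SimilarityEvaluator with the
# new keyword-only threshold. Endpoint, key, and deployment values are placeholders.
from azure.ai.evaluation import AzureOpenAIModelConfiguration, SimilarityEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
    api_key="<api-key>",                                        # placeholder
    azure_deployment="<deployment-name>",                       # placeholder
)

# Keyword-only threshold; the code above defaults it to 3 (the new docstring says 5).
evaluator = SimilarityEvaluator(model_config, threshold=4)
result = evaluator(
    query="What is the capital of Japan?",
    response="The capital of Japan is Tokyo.",
    ground_truth="Tokyo is the capital of Japan.",
)
print(result)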
@@ -0,0 +1,7 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._task_adherence import TaskAdherenceEvaluator
+
+__all__ = ["TaskAdherenceEvaluator"]
@@ -0,0 +1,148 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+import os
+import math
+from typing import Dict, Union, List, Optional
+
+from typing_extensions import overload, override
+
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._common.utils import parse_quality_evaluator_reason_score
+from azure.ai.evaluation._model_configurations import Message
+from azure.ai.evaluation._common._experimental import experimental
+
+@experimental
+class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+    """The Task Adherence evaluator assesses how well an AI-generated response follows the assigned task based on:
+
+    - Alignment with instructions and definitions
+    - Accuracy and clarity of the response
+    - Proper use of provided tool definitions
+
+    Scoring is based on five levels:
+    1. Fully Inadherent - Response completely ignores instructions.
+    2. Barely Adherent - Partial alignment with critical gaps.
+    3. Moderately Adherent - Meets core requirements but lacks precision.
+    4. Mostly Adherent - Clear and accurate with minor issues.
+    5. Fully Adherent - Flawless adherence to instructions.
+
+    The evaluation includes a step-by-step reasoning process, a brief explanation, and a final integer score.
+
+
+    :param model_config: Configuration for the Azure OpenAI model.
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+    .. admonition:: Example:
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START task_adherence_evaluator]
+            :end-before: [END task_adherence_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a TaskAdherenceEvaluator with a query and response.
+    """
+
+    _PROMPTY_FILE = "task_adherence.prompty"
+    _RESULT_KEY = "task_adherence"
+    _OPTIONAL_PARAMS = ["tool_definitions"]
+
+    _DEFAULT_TASK_ADHERENCE_SCORE = 3
+
+    id = None
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    @override
+    def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        self.threshold = threshold
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+
+    @overload
+    def __call__(
+        self,
+        *,
+        query: Union[str, List[dict]],
+        response: Union[str, List[dict]],
+        tool_definitions: Optional[Union[dict, List[dict]]] = None,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate task adherence for a given query, response, and optional tool definitions.
+        The query and response can be either a string or a list of messages.
+
+
+        Example with string inputs and no tools:
+            evaluator = TaskAdherenceEvaluator(model_config)
+            query = "What is the weather today?"
+            response = "The weather is sunny."
+
+            result = evaluator(query=query, response=response)
+
+        Example with list of messages:
+            evaluator = TaskAdherenceEvaluator(model_config)
+            query = [{'role': 'system', 'content': 'You are a friendly and helpful customer service agent.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Hi, I need help with the last 2 orders on my account #888. Could you please update me on their status?'}]}]
+            response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Hello! Let me quickly look up your account details.'}]}, {'createdAt': 1700000075, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_001', 'type': 'function', 'function': {'name': 'get_orders', 'arguments': {'account_number': '888'}}}}]}, {'createdAt': 1700000080, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_001', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '[{ "order_id": "123" }, { "order_id": "124" }]'}]}, {'createdAt': 1700000085, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Thanks for your patience. I see two orders on your account. Let me fetch the details for both.'}]}, {'createdAt': 1700000090, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_002', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '123'}}}}, {'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_003', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '124'}}}}]}, {'createdAt': 1700000095, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_002', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "123", "status": "shipped", "delivery_date": "2025-03-15" } }'}]}, {'createdAt': 1700000100, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_003', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "124", "status": "delayed", "expected_delivery": "2025-03-20" } }'}]}, {'createdAt': 1700000105, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'The order with ID 123 has been shipped and is expected to be delivered on March 15, 2025. However, the order with ID 124 is delayed and should now arrive by March 20, 2025. Is there anything else I can help you with?'}]}]
+            tool_definitions = [{'name': 'get_orders', 'description': 'Get the list of orders for a given account number.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to get the orders for.'}}}}, {'name': 'get_order', 'description': 'Get the details of a specific order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID to get the details for.'}}}}, {'name': 'initiate_return', 'description': 'Initiate the return process for an order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID for the return process.'}}}}, {'name': 'update_shipping_address', 'description': 'Update the shipping address for a given account.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to update.'}, 'new_address': {'type': 'string', 'description': 'The new shipping address.'}}}}]
+
+            result = evaluator(query=query, response=response, tool_definitions=tool_definitions)
+
+        :keyword query: The query being evaluated, either a string or a list of messages.
+        :paramtype query: Union[str, List[dict]]
+        :keyword response: The response being evaluated, either a string or a list of messages (full agent response potentially including tool calls).
+        :paramtype response: Union[str, List[dict]]
+        :keyword tool_definitions: An optional list of messages containing the tool definitions the agent is aware of.
+        :paramtype tool_definitions: Optional[Union[dict, List[dict]]]
+        :return: A dictionary with the task adherence evaluation results.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Invokes the instance using the overloaded __call__ signature.
+
+        For detailed parameter types and return value documentation, see the overloaded __call__ definition.
+        """
+        return super().__call__(*args, **kwargs)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+        """Do Task Adherence evaluation.
+        :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        # We override the _do_eval method as we want the output to be a dictionary,
+        # which is a different schema than _base_prompty_eval.py.
+        if "query" not in eval_input and "response" not in eval_input:
+            raise EvaluationException(
+                message="Both query and response must be provided as input to the Task Adherence evaluator.",
+                internal_message="Both query and response must be provided as input to the Task Adherence evaluator.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.MISSING_FIELD,
+                target=ErrorTarget.TASK_ADHERENCE_EVALUATOR,
+            )
+
+        llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+
+        score = math.nan
+        if llm_output:
+            score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[1-5]")
+
+            score_result = 'pass' if score >= self.threshold else 'fail'
+
+            return {
+                f"{self._result_key}": score,
+                f"{self._result_key}_result": score_result,
+                f"{self._result_key}_threshold": self.threshold,
+                f"{self._result_key}_reason": reason,
+            }
+
+        return {self._result_key: math.nan}
+
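
Editor's note: a brief usage sketch for the new TaskAdherenceEvaluator. The import path and result keys come from the diff above; the model configuration values are placeholders, so treat this as illustrative rather than authoritative.

# Sketch only: exercising TaskAdherenceEvaluator and the result keys emitted by _do_eval.
from azure.ai.evaluation import AzureOpenAIModelConfiguration
from azure.ai.evaluation._evaluators._task_adherence import TaskAdherenceEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
    api_key="<api-key>",                                        # placeholder
    azure_deployment="<deployment-name>",                       # placeholder
)

evaluator = TaskAdherenceEvaluator(model_config, threshold=3)
result = evaluator(
    query="What is a recommended weekend itinerary in Paris?",
    response="Visit the Eiffel Tower and the Louvre on Saturday, and stroll through Montmartre on Sunday.",
)

# Keys produced by _do_eval: the 1-5 score, a pass/fail verdict against the threshold,
# the threshold itself, and the model's explanation.
print(result["task_adherence"], result["task_adherence_result"])
print(result["task_adherence_reason"])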
@@ -0,0 +1,117 @@
+---
+name: TaskAdherence
+description: Evaluates Task Adherence score for QA scenario
+model:
+  api: chat
+  parameters:
+    temperature: 0.0
+    max_tokens: 800
+    top_p: 1.0
+    presence_penalty: 0
+    frequency_penalty: 0
+    response_format:
+      type: text
+
+inputs:
+  query:
+    type: string
+  response:
+    type: string
+  tool_definitions:
+    type: string
+    optional: true
+    default: "[]"
+
+---
+system:
+# Instruction
+## Context
+### You are an expert in evaluating the quality of an answer from an intelligent system based on provided definitions and data. Your goal will involve answering the questions below using the information provided.
+- **Definition**: Based on the provided query, response, and tool definitions, evaluate the agent's adherence to the assigned task.
+- **Data**: Your input data includes query, response, and tool definitions.
+- **Questions**: To complete your evaluation you will be asked to evaluate the Data in different ways.
+
+# Definition
+
+**Level 1: Fully Inadherent**
+
+**Definition:**
+Response completely ignores instructions or deviates significantly
+
+**Example:**
+**Query:** What is a recommended weekend itinerary in Paris?
+**Response:** Paris is a lovely city with a rich history.
+
+Explanation: This response completely misses the task by not providing any itinerary details. It offers a generic statement about Paris rather than a structured travel plan.
+
+
+**Level 2: Barely Adherent**
+
+**Definition:**
+Response partially aligns with instructions but has critical gaps.
+
+**Example:**
+**Query:** What is a recommended weekend itinerary in Paris?
+**Response:** Spend your weekend visiting famous places in Paris.
+
+Explanation: While the response hints at visiting well-known sites, it is extremely vague and lacks specific details, such as which sites to visit or any order of activities, leaving major gaps in the instructions.
+
+
+**Level 3: Moderately Adherent**
+
+**Definition:**
+Response meets the core requirements but lacks precision or clarity.
+
+**Example:**
+**Query:** What is a recommended weekend itinerary in Paris?
+**Response:** Visit the Eiffel Tower and the Louvre on Saturday, and stroll through Montmartre on Sunday.
+
+Explanation: This answer meets the basic requirement by naming a few key attractions and assigning them to specific days. However, it lacks additional context, such as timings, additional activities, or details to make the itinerary practical and clear.
+
+
+**Level 4: Mostly Adherent**
+
+**Definition:**
+Response is clear, accurate, and aligns with instructions with minor issues.
+
+**Example:**
+**Query:** What is a recommended weekend itinerary in Paris?
+**Response:** For a weekend in Paris, start Saturday with a morning visit to the Eiffel Tower, then head to the Louvre in the early afternoon. In the evening, enjoy a leisurely walk along the Seine. On Sunday, begin with a visit to Notre-Dame Cathedral, followed by exploring the art and cafés in Montmartre. This plan offers a mix of cultural visits and relaxing experiences.
+
+Explanation: This response is clear, structured, and provides a concrete itinerary with specific attractions and a suggested order of activities. It is accurate and useful, though it might benefit from a few more details like exact timings or restaurant suggestions to be perfect.
+
+
+**Level 5: Fully Adherent**
+
+**Definition:**
+Response is flawless, accurate, and follows instructions to the letter.
+
+**Example:**
+**Query:** What is a recommended weekend itinerary in Paris?
+**Response:** Here is a detailed weekend itinerary in Paris:
+Saturday:
+Morning: Begin your day with a visit to the Eiffel Tower to admire the views from the top.
+Early Afternoon: Head to the Louvre for a guided tour of its most famous exhibits.
+Late Afternoon: Take a relaxing walk along the Seine, stopping at local boutiques.
+Evening: Enjoy dinner at a classic Parisian bistro near the river.
+Sunday:
+Morning: Visit the Notre-Dame Cathedral to explore its architecture and history.
+Midday: Wander the charming streets of Montmartre, stopping by art galleries and cafés.
+Afternoon: Finish your trip with a scenic boat tour on the Seine.
+This itinerary balances cultural immersion, leisure, and local dining experiences, ensuring a well-rounded visit.
+
+Explanation: This response is comprehensive and meticulously follows the instructions. It provides detailed steps, timings, and a variety of activities that fully address the query, leaving no critical gaps.
+
+# Data
+Query: {{query}}
+Response: {{response}}
+Tool Definitions: {{tool_definitions}}
+
+# Tasks
+## Please provide your assessment Score for the previous answer. Your output should include the following information:
+- **ThoughtChain**: To improve the reasoning process, Think Step by Step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and Start your ThoughtChain with "Let's think step by step:".
+- **Explanation**: a very short explanation of why you think the input data should get that Score.
+- **Score**: based on your previous analysis, provide your Score. The answer you give MUST be an integer score ("1", "2", ...) based on the categories of the definitions.
+
+## Please provide your answers between the tags: <S0>your chain of thoughts</S0>, <S1>your explanation</S1>, <S2>your score</S2>.
+# Output
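
Editor's note: the prompty asks the model to wrap its chain of thought, explanation, and integer score in <S0>/<S1>/<S2> tags, which _task_adherence.py then hands to parse_quality_evaluator_reason_score. As a rough illustration of that output contract only (the regex below is an assumption, not the package's parser):

# Illustration of the <S0>/<S1>/<S2> output contract defined by the prompty above.
# parse_quality_evaluator_reason_score is the real parser; this regex is just a stand-in.
import re

sample_output = (
    "<S0>Let's think step by step: the response lists attractions per day but omits timings.</S0>"
    "<S1>Meets the core request with limited detail.</S1>"
    "<S2>3</S2>"
)

reason_match = re.search(r"<S1>(.*?)</S1>", sample_output, re.DOTALL)
score_match = re.search(r"<S2>\s*(\d)\s*</S2>", sample_output)

reason = reason_match.group(1) if reason_match else ""
score = int(score_match.group(1)) if score_match else None
print(score, reason)  # 3 Meets the core request with limited detail.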
@@ -0,0 +1,9 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._tool_call_accuracy import ToolCallAccuracyEvaluator
+
+__all__ = [
+    "ToolCallAccuracyEvaluator",
+]
@@ -0,0 +1,292 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+import math
+import os
+import logging
+import re
+from typing import Dict, List, Union, TypeVar, cast
+from typing_extensions import overload, override
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._common.utils import remove_optional_singletons, parse_quality_evaluator_reason_score
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
+from azure.ai.evaluation._common._experimental import experimental
+
+logger = logging.getLogger(__name__)
+
+T_EvalValue = TypeVar("T_EvalValue")
+
+@experimental
+class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+    """The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining:
+    - Relevance to the conversation
+    - Parameter correctness according to tool definitions
+    - Parameter value extraction from the conversation
+
+    The evaluator uses a binary scoring system (0 or 1):
+    - Score 0: The tool call is irrelevant or contains information not in the conversation/definition
+    - Score 1: The tool call is relevant with properly extracted parameters from the conversation
+
+    This evaluation focuses on measuring whether tool calls meaningfully contribute to addressing
+    user needs while properly following tool definitions and using information present in the
+    conversation history.
+
+    :param model_config: Configuration for the Azure OpenAI model.
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START tool_call_accuracy_evaluator]
+            :end-before: [END tool_call_accuracy_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a ToolCallAccuracyEvaluator.
+
+    .. note::
+
+        To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+        To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
+        however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+    """
+
+    _PROMPTY_FILE = "tool_call_accuracy.prompty"
+    _RESULT_KEY = "tool_call_accurate"
+    _AGGREGATE_RESULT_KEY = "tool_call_accuracy"
+
+    _MAX_TOOL_CALL_ACCURACY_SCORE = 1.0
+    _MIN_TOOL_CALL_ACCURACY_SCORE = 0.0
+    _DEFAULT_TOOL_CALL_ACCURACY_SCORE = 0.8
+
+    id = "id"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    @override
+    def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        self.threshold = threshold
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+
+    @overload
+    def __call__(
+        self,
+        *,
+        query: Union[str, List[dict]],
+        tool_definitions: Union[dict, List[dict]],
+        tool_calls: Union[dict, List[dict]] = None,
+        response: Union[str, List[dict]] = None
+    ) -> Dict[str, Union[str, float]]:
+        """
+        Evaluate tool call accuracy. Accepts a query, tool definitions, and tool calls for evaluation.
+
+        :keyword query: Query or chat history up to the message that has the tool call being evaluated.
+        :paramtype query: Union[str, List[dict]]
+        :keyword tool_definitions: List of tool definitions whose calls are being evaluated.
+        :paramtype tool_definitions: Union[dict, List[dict]]
+        :keyword tool_calls: Optional list of tool calls to evaluate. If not provided, response should be provided and should have
+            tool call(s) in it.
+        :paramtype tool_calls: Union[dict, List[dict]]
+        :keyword response: Optional response to be evaluated alongside the tool calls.
+            If provided, all tool calls in the response will be evaluated when the tool_calls parameter is not provided.
+            If both are provided, only the tool calls in the tool_calls parameter will be evaluated.
+            If the response has extra tool calls they will not be evaluated; the response will be used to extract any tool calls that are needed for evaluating a certain tool call.
+            Recommended to provide it when there are tool calls that depend on the output of a previous tool call.
+        :paramtype response: Union[str, List[dict]]
+        :return: The tool selection evaluation results.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
+    def _convert_kwargs_to_eval_input(self, **kwargs):
+        """Convert an arbitrary input into a list of inputs for evaluators.
+        It is assumed that evaluators generally make use of their inputs in one of two ways.
+        Either they receive a collection of keyname inputs that are all single values
+        (like a query and response), or they receive a conversation that is a list of dictionary
+        values.
+
+        The self._singleton_inputs list assigned during initialization is used to find and extract
+        singleton keywords, and self._allow_conversation_input is used to determine if a conversation
+        is a valid input.
+
+        If both conversations and singletons are allowed, the function will raise an exception if both
+        are inputted.
+
+        This function must be overridden by child classes IF they need both a conversation and
+        other inputs to be passed in.
+
+        :keyword kwargs: The inputs to convert.
+        :type kwargs: Dict
+        :return: A list of arbitrary values that are valid inputs for this evaluator's do_eval function.
+        :rtype: List
+        """
+        # TODO add warning that only tool calls of type function are supported
+        # Collect inputs
+        tool_calls = kwargs.get("tool_calls", None)
+        tool_definitions = kwargs.get("tool_definitions")
+        query = kwargs.get("query", None)
+        response = kwargs.get("response", None)
+
+        if response is None and tool_calls is None:
+            raise EvaluationException(
+                message="Either response or tool_calls must be provided.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.MISSING_FIELD,
+                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+            )
+
+        if tool_definitions is None:
+            raise EvaluationException(
+                message="Tool definitions must be provided.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.MISSING_FIELD,
+                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+            )
+
+        # TODO : Support classes that represent tool calls, messages etc. once client side definitions are available
+        if tool_calls is None:
+            # Extract tool calls from response if not provided
+            tool_calls = []
+            if isinstance(response, list):
+                for message in response:
+                    if message.get("role") == "assistant":
+                        tool_calls.extend([content for content in message.get("content")
+                                           if content.get("type") == "tool_call"])
+            if len(tool_calls) == 0:
+                raise EvaluationException(
+                    message="response does not have tool calls. Either provide tool_calls or response with tool calls.",
+                    blame=ErrorBlame.USER_ERROR,
+                    category=ErrorCategory.MISSING_FIELD,
+                    target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                )
+
+        if not isinstance(tool_calls, list):
+            tool_calls = [tool_calls]
+
+        if not isinstance(tool_definitions, list):
+            tool_definitions = [tool_definitions]
+
+        eval_inputs = []
+        # TODO : When evaluating an agent tool that depends on the output of a previous tool call,
+        # we need to provide the output of the previous tool call as part of messages.
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict) and tool_call.get("type") == "tool_call":  # TODO assuming dict here but it can be a class
+                function_name = tool_call.get("name")
+                tool_definition = [tool for tool in tool_definitions if tool.get("name") == function_name]
+                if len(tool_definition) > 0:
+                    tool_definition = tool_definition
+                else:
+                    raise EvaluationException(
+                        message="Tool definition not found",
+                        blame=ErrorBlame.USER_ERROR,
+                        category=ErrorCategory.INVALID_VALUE,
+                        target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                    )
+                eval_inputs.append({"query": query, "tool_call": tool_call, "tool_definition": tool_definition})
+            else:
+                raise EvaluationException(
+                    message="Tool definition not found",
+                    blame=ErrorBlame.USER_ERROR,
+                    category=ErrorCategory.INVALID_VALUE,
+                    target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                )
+
+        return eval_inputs
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+        """Do a tool call accuracy evaluation.
+
+        :param eval_input: The input to the evaluator. Expected to contain
+            whatever inputs are needed for the _flow method, including context
+            and other fields depending on the child class.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+
+        score = math.nan
+        if llm_output:
+            score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[0-1]")
+            return {
+                self._result_key: bool(float(score)),
+                f"{self._result_key}_reason": reason,
+                "tool_call_id": eval_input.get("tool_call").get("tool_call_id"),
+            }
+        return {self._result_key: float(score)}
+
+    async def _real_call(self, **kwargs):
+        """The asynchronous call where real end-to-end evaluation logic is performed.
+
+        :keyword kwargs: The inputs to evaluate.
+        :type kwargs: Dict
+        :return: The evaluation result.
+        :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
+        """
+        # Convert inputs into list of evaluable inputs.
+        eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
+        per_turn_results = []
+        # Evaluate all inputs.
+        for eval_input in eval_input_list:
+            per_turn_results.append(await self._do_eval(eval_input))
+
+        return self._aggregate_results(per_turn_results=per_turn_results)
+
+    def _aggregate_results(self, per_turn_results):
+        """Aggregate the evaluation results of each conversation turn into a single result.
+
+        Exact implementation might need to vary slightly depending on the results produced.
+        Default behavior is to average all number-based outputs.
+
+        :param per_turn_results: List of evaluation results for each turn in the conversation.
+        :type per_turn_results: List[Dict]
+        :return: A dictionary containing aggregated results, with numeric metrics having their
+            means as top-level values in the dictionary, and all original
+            values (including non-numerics) located under the "evaluation_per_turn" key,
+            with each sub-key being a metric and each sub-value being the list of that metric's
+            per-turn values.
+        :rtype: AggregateResult[T_EvalValue]
+        """
+
+        aggregated: Dict[str, Union[float, Dict[str, List[T_EvalValue]]]] = {}
+        evaluation_per_turn: Dict[str, List[T_EvalValue]] = {}
+
+        # Go over each turn, and rotate the results into a
+        # metric: List[values] format for the evals_per_turn dictionary.
+
+        score = sum([1 if per_turn_result.get(self._result_key) else 0 for per_turn_result in per_turn_results]) / len(per_turn_results)
+        aggregated[self._AGGREGATE_RESULT_KEY] = score
+        aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = 'pass' if score >= self.threshold else 'fail'
+        aggregated[f'{self._AGGREGATE_RESULT_KEY}_threshold'] = self.threshold
+
+        aggregated["per_tool_call_details"] = per_turn_results
+        return aggregated
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate tool call accuracy. Accepts a query, tool definitions, and tool calls for evaluation.
+
+        :keyword query: Query or chat history up to the message that has the tool call being evaluated.
+        :paramtype query: Union[str, List[dict]]
+        :keyword tool_definitions: List of tool definitions whose calls are being evaluated.
+        :paramtype tool_definitions: Union[dict, List[dict]]
+        :keyword tool_calls: Optional list of tool calls to evaluate. If not provided, response should be provided and should have
+            tool call(s) in it.
+        :paramtype tool_calls: Union[dict, List[dict]]
+        :keyword response: Optional response to be evaluated alongside the tool calls.
+            If provided, all tool calls in the response will be evaluated when the tool_calls parameter is not provided.
+            If both are provided, only the tool calls in the tool_calls parameter will be evaluated.
+            If the response has extra tool calls they will not be evaluated; the response will be used to extract any tool calls that are needed for evaluating a certain tool call.
+            Recommended to provide it when there are tool calls that depend on the output of a previous tool call.
+        :paramtype response: Union[str, List[dict]]
+        :return: The tool selection evaluation results.
+        :rtype: Dict[str, Union[str, float]]
+        """
+        return super().__call__(*args, **kwargs)
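
Editor's note: a hedged usage sketch for ToolCallAccuracyEvaluator to close the section. The flat tool-call shape is inferred from the lookups in _convert_kwargs_to_eval_input and _do_eval above, the result keys come from _aggregate_results, and the model configuration values are placeholders; treat everything here as illustrative, not as the package's documented contract.

# Sketch only: evaluating a single function-style tool call against its definition.
from azure.ai.evaluation import AzureOpenAIModelConfiguration
from azure.ai.evaluation._evaluators._tool_call_accuracy import ToolCallAccuracyEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
    api_key="<api-key>",                                        # placeholder
    azure_deployment="<deployment-name>",                       # placeholder
)

evaluator = ToolCallAccuracyEvaluator(model_config, threshold=0.8)
result = evaluator(
    query="Could you check the status of the orders on account 888?",
    tool_definitions=[{
        "name": "get_orders",
        "description": "Get the list of orders for a given account number.",
        "parameters": {"type": "object", "properties": {
            "account_number": {"type": "string", "description": "The account number to get the orders for."}}},
    }],
    tool_calls=[{
        "type": "tool_call",                       # matched by _convert_kwargs_to_eval_input
        "tool_call_id": "tool_call_20250310_001",  # echoed back by _do_eval
        "name": "get_orders",
        "arguments": {"account_number": "888"},
    }],
)

# Aggregated keys from _aggregate_results: the 0-1 accuracy, a pass/fail verdict against
# the threshold, the threshold itself, and per-tool-call details.
print(result["tool_call_accuracy"], result["tool_call_accuracy_result"])
print(result["per_tool_call_details"])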