azure-ai-evaluation 1.9.0__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation has been flagged as possibly problematic.
Files changed (85)
  1. azure/ai/evaluation/__init__.py +46 -12
  2. azure/ai/evaluation/_aoai/python_grader.py +84 -0
  3. azure/ai/evaluation/_aoai/score_model_grader.py +1 -0
  4. azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
  5. azure/ai/evaluation/_common/rai_service.py +3 -3
  6. azure/ai/evaluation/_common/utils.py +74 -17
  7. azure/ai/evaluation/_converters/_ai_services.py +60 -10
  8. azure/ai/evaluation/_converters/_models.py +75 -26
  9. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +70 -22
  10. azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
  11. azure/ai/evaluation/_evaluate/_evaluate.py +163 -44
  12. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +79 -33
  13. azure/ai/evaluation/_evaluate/_utils.py +5 -2
  14. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
  15. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +8 -1
  16. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
  17. azure/ai/evaluation/_evaluators/_common/_base_eval.py +143 -25
  18. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
  19. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +19 -9
  20. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -5
  21. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -1
  22. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -1
  23. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +5 -2
  24. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -1
  25. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +3 -0
  26. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -0
  27. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +1 -1
  28. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
  29. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
  30. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +114 -4
  31. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +9 -3
  32. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -1
  33. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +8 -1
  34. azure/ai/evaluation/_evaluators/_qa/_qa.py +1 -1
  35. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +56 -3
  36. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
  37. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +11 -3
  38. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +3 -2
  39. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
  40. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +2 -1
  41. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -2
  42. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +24 -12
  43. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
  44. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +214 -187
  45. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +126 -31
  46. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +8 -1
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -1
  48. azure/ai/evaluation/_exceptions.py +1 -0
  49. azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
  50. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +115 -30
  51. azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
  52. azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
  53. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +28 -31
  54. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +2 -0
  55. azure/ai/evaluation/_version.py +1 -1
  56. azure/ai/evaluation/red_team/__init__.py +4 -3
  57. azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
  58. azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
  59. azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
  60. azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
  61. azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
  62. azure/ai/evaluation/red_team/_red_team.py +655 -2665
  63. azure/ai/evaluation/red_team/_red_team_result.py +6 -0
  64. azure/ai/evaluation/red_team/_result_processor.py +610 -0
  65. azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
  66. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +11 -4
  67. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
  68. azure/ai/evaluation/red_team/_utils/constants.py +0 -2
  69. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  70. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  71. azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
  72. azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
  73. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  74. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  75. azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
  76. azure/ai/evaluation/simulator/_adversarial_simulator.py +14 -2
  77. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +13 -1
  78. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +21 -7
  79. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +24 -5
  80. azure/ai/evaluation/simulator/_simulator.py +12 -0
  81. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/METADATA +63 -4
  82. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/RECORD +85 -76
  83. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/WHEEL +1 -1
  84. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info/licenses}/NOTICE.txt +0 -0
  85. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/top_level.txt +0 -0
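
The most visible behavioural change in this diff is the rewrite of the tool call accuracy evaluator (file 44, shown in full below): it now issues a single LLM call per turn, scores on a 1-5 rubric with a default threshold of 3, accepts an optional credential, and returns tool_call_accuracy* keys plus a details field instead of the old per-tool-call binary results. The following is a minimal sketch of how the updated evaluator might be invoked; the endpoint, deployment, and tool-call payload are illustrative placeholders, not values taken from the package.

    # Sketch only: illustrates the 1.11.0 ToolCallAccuracyEvaluator surface described in this diff.
    # Endpoint, key, deployment, and the tool-call payload below are placeholders.
    from azure.ai.evaluation import AzureOpenAIModelConfiguration, ToolCallAccuracyEvaluator

    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
        api_key="<api-key>",                                        # placeholder
        azure_deployment="<chat-deployment>",                       # placeholder
    )

    # threshold now defaults to 3 on the new 1-5 rubric; credential is a new optional argument.
    evaluator = ToolCallAccuracyEvaluator(model_config=model_config, threshold=3)

    result = evaluator(
        query="What's the weather in Seattle?",
        tool_calls=[
            {
                "type": "tool_call",
                "tool_call_id": "call_1",
                "name": "fetch_weather",
                "arguments": {"location": "Seattle"},
            }
        ],
        tool_definitions=[
            {
                "name": "fetch_weather",
                "type": "function",
                "description": "Fetches the weather for a location.",
                "parameters": {
                    "type": "object",
                    "properties": {"location": {"type": "string"}},
                },
            }
        ],
    )

    # Per this diff, the result contains: tool_call_accuracy (1-5, or "not applicable"),
    # tool_call_accuracy_result ("pass"/"fail"), tool_call_accuracy_threshold,
    # tool_call_accuracy_reason, and details.
    print(result)

Note that when no tool calls or no tool definitions can be resolved, the new code returns a "not applicable" result with the error message as the reason rather than raising, which changes how missing-input cases surface in batch runs.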
azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
@@ -1,33 +1,75 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
+ from itertools import chain
  import math
  import os
  import logging
  import re
- from typing import Dict, List, Union, TypeVar, cast
+ from typing import Dict, List, Union, TypeVar, Optional
  from typing_extensions import overload, override
  from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
- from azure.ai.evaluation._common.utils import remove_optional_singletons, parse_quality_evaluator_reason_score
- from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
- from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
+ from azure.ai.evaluation._exceptions import (
+     ErrorBlame,
+     ErrorCategory,
+     ErrorTarget,
+     EvaluationException,
+ )
+ from ..._common.utils import check_score_is_valid
  from azure.ai.evaluation._common._experimental import experimental
+ from ..._converters._models import (
+     _BUILT_IN_DESCRIPTIONS,
+     _BUILT_IN_PARAMS,
+ )

  logger = logging.getLogger(__name__)

  T_EvalValue = TypeVar("T_EvalValue")


+ def _get_built_in_definition(tool_name: str):
+     """Get the definition for the built-in tool."""
+     if tool_name in _BUILT_IN_DESCRIPTIONS:
+         return {
+             "type": tool_name,
+             "description": _BUILT_IN_DESCRIPTIONS[tool_name],
+             "name": tool_name,
+             "parameters": _BUILT_IN_PARAMS.get(tool_name, {}),
+         }
+     return None
+
+
+ def _get_needed_built_in_definitions(tool_calls: List[Dict]) -> List[Dict]:
+     """Extract tool definitions needed for the given built-in tool calls."""
+     needed_definitions = []
+     for tool_call in tool_calls:
+         if isinstance(tool_call, dict):
+             tool_type = tool_call.get("type")
+
+             # Only support converter format: {type: "tool_call", name: "bing_custom_search", arguments: {...}}
+             if tool_type == "tool_call":
+                 tool_name = tool_call.get("name")
+                 if tool_name in _BUILT_IN_DESCRIPTIONS:
+                     built_in_def = _get_built_in_definition(tool_name)
+                     if built_in_def and built_in_def not in needed_definitions:
+                         needed_definitions.append(built_in_def)
+
+     return needed_definitions
+
+
  @experimental
  class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
      """The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining:
-         - Relevance to the conversation
-         - Parameter correctness according to tool definitions
-         - Parameter value extraction from the conversation
+         - Relevance to the conversation.
+         - Parameter correctness according to tool definitions.
+         - Parameter value extraction from the conversation.

-     The evaluator uses a binary scoring system (0 or 1):
-         - Score 0: The tool call is irrelevant or contains information not in the conversation/definition
-         - Score 1: The tool call is relevant with properly extracted parameters from the conversation
+     The evaluator uses a scoring rubric of 1 to 5:
+         - Score 1: The tool calls are irrelevant.
+         - Score 2: The tool calls are partially relevant, but not enough tools were called or the parameters were not correctly passed.
+         - Score 3: The tool calls are relevant, but there were unnecessary, excessive tool calls made.
+         - Score 4: The tool calls are relevant, but some tools returned errors and the agent retried calling them again and succeeded.
+         - Score 5: The tool calls are relevant, and all parameters were correctly passed.

      This evaluation focuses on measuring whether tool calls meaningfully contribute to addressing
      user needs while properly following tool definitions and using information present in the
@@ -64,22 +106,34 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
      """

      _PROMPTY_FILE = "tool_call_accuracy.prompty"
-     _RESULT_KEY = "tool_call_accurate"
-     _AGGREGATE_RESULT_KEY = "tool_call_accuracy"
+     _RESULT_KEY = "tool_call_accuracy"
+
+     _MAX_TOOL_CALL_ACCURACY_SCORE = 5
+     _MIN_TOOL_CALL_ACCURACY_SCORE = 1
+     _DEFAULT_TOOL_CALL_ACCURACY_SCORE = 3

-     _MAX_TOOL_CALL_ACCURACY_SCORE = 1.0
-     _MIN_TOOL_CALL_ACCURACY_SCORE = 0.0
-     _DEFAULT_TOOL_CALL_ACCURACY_SCORE = 0.8
+     _NO_TOOL_CALLS_MESSAGE = "No tool calls found in response or provided tool_calls."
+     _NO_TOOL_DEFINITIONS_MESSAGE = "Tool definitions must be provided."
+     _TOOL_DEFINITIONS_MISSING_MESSAGE = "Tool definitions for all tool calls must be provided."
+     _INVALID_SCORE_MESSAGE = "Tool call accuracy score must be between 1 and 5."

-     id = "id"
+     _LLM_SCORE_KEY = "tool_calls_success_level"
+
+     id = "azureai://built-in/evaluators/tool_call_accuracy"
      """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

      @override
-     def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, **kwargs):
+     def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, credential=None, **kwargs):
          current_dir = os.path.dirname(__file__)
          prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
          self.threshold = threshold
-         super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)
+         super().__init__(
+             model_config=model_config,
+             prompty_file=prompty_path,
+             result_key=self._RESULT_KEY,
+             credential=credential,
+             **kwargs,
+         )

      @overload
      def __call__(
@@ -134,84 +188,45 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
          """
          # TODO add warning that only tool calls of type function are supported
          # Collect inputs
-         tool_calls = kwargs.get("tool_calls", None)
-         tool_definitions = kwargs.get("tool_definitions")
-         query = kwargs.get("query", None)
-         response = kwargs.get("response", None)
-
-         if response is None and tool_calls is None:
-             raise EvaluationException(
-                 message="Either response or tool_calls must be provided.",
-                 blame=ErrorBlame.USER_ERROR,
-                 category=ErrorCategory.MISSING_FIELD,
-                 target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-             )
-
-         if tool_definitions is None:
-             raise EvaluationException(
-                 message="Tool definitions must be provided.",
-                 blame=ErrorBlame.USER_ERROR,
-                 category=ErrorCategory.MISSING_FIELD,
-                 target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-             )
-
+         tool_calls = kwargs.get("tool_calls")
+         tool_definitions = kwargs.get("tool_definitions", [])  # Default to empty list
+         query = kwargs.get("query")
+         response = kwargs.get("response")
          # TODO : Support classes that represents tool calls, messages etc once client side definitions are available
-         if tool_calls is None:
-             # Extract tool calls from response if not provided
-             tool_calls = []
-             if isinstance(response, list):
-                 for message in response:
-                     if message.get("role") == "assistant":
-                         tool_calls.extend(
-                             [content for content in message.get("content") if content.get("type") == "tool_call"]
-                         )
-             if len(tool_calls) == 0:
-                 raise EvaluationException(
-                     message="response does not have tool calls. Either provide tool_calls or response with tool calls.",
-                     blame=ErrorBlame.USER_ERROR,
-                     category=ErrorCategory.MISSING_FIELD,
-                     target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-                 )
+         if response:
+             parsed_tool_calls = self._parse_tools_from_response(response)
+             if parsed_tool_calls:
+                 tool_calls = parsed_tool_calls
+
+         if not tool_calls:
+             return {"error_message": self._NO_TOOL_CALLS_MESSAGE}

          if not isinstance(tool_calls, list):
              tool_calls = [tool_calls]
-
          if not isinstance(tool_definitions, list):
-             tool_definitions = [tool_definitions]
-
-         eval_inputs = []
-         # TODO : When evaluating an agent tool that depends on the output of a previous tool call,
-         # we need to provide the output of the previous tool call as part of messages.
-         for tool_call in tool_calls:
-             if (
-                 isinstance(tool_call, dict) and tool_call.get("type") == "tool_call"
-             ):  # TODO assuming dict here but it can be a class
-                 function_name = tool_call.get("name")
-                 tool_definition = [tool for tool in tool_definitions if tool.get("name") == function_name]
-                 if len(tool_definition) > 0:
-                     tool_definition = tool_definition
-                 else:
-                     raise EvaluationException(
-                         message="Tool definition not found",
-                         blame=ErrorBlame.USER_ERROR,
-                         category=ErrorCategory.INVALID_VALUE,
-                         target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-                     )
-                 eval_inputs.append({"query": query, "tool_call": tool_call, "tool_definition": tool_definition})
+             tool_definitions = [tool_definitions] if tool_definitions else []
+
+         try:
+             needed_tool_definitions = self._extract_needed_tool_definitions(tool_calls, tool_definitions)
+         except EvaluationException as e:
+             # Check if this is because no tool definitions were provided at all
+             if len(tool_definitions) == 0:
+                 return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
              else:
-                 raise EvaluationException(
-                     message="Tool definition not found",
-                     blame=ErrorBlame.USER_ERROR,
-                     category=ErrorCategory.INVALID_VALUE,
-                     target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-                 )
+                 return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}

-         return eval_inputs
+         if len(needed_tool_definitions) == 0:
+             return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
+
+         return {
+             "query": query,
+             "tool_calls": tool_calls,
+             "tool_definitions": needed_tool_definitions,
+         }

      @override
      async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
-         """Do a relevance evaluation.
-
+         """Do a tool call accuracy evaluation.
          :param eval_input: The input to the evaluator. Expected to contain
              whatever inputs are needed for the _flow method, including context
              and other fields depending on the child class.
@@ -219,23 +234,43 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
          :return: The evaluation result.
          :rtype: Dict
          """
+         # Single LLM call for all tool calls
          llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)

-         score = math.nan
-         if llm_output:
-             score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[0-1]")
-             if score >= 0 and score <= 1:
-                 return {
-                     self._result_key: bool(float(score)),
-                     f"{self._result_key}_reason": reason,
-                     "tool_call_id": eval_input.get("tool_call").get("tool_call_id"),
-                 }
-         raise EvaluationException(
-             message="Tool call accuracy evaluator: Invalid score returned from LLM.",
-             blame=ErrorBlame.SYSTEM_ERROR,
-             category=ErrorCategory.INVALID_VALUE,
-             target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-         )
+         if isinstance(llm_output, dict):
+             score = llm_output.get(self._LLM_SCORE_KEY, None)
+             if not score or not check_score_is_valid(
+                 score,
+                 ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE,
+                 ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE,
+             ):
+                 raise EvaluationException(
+                     message=f"Invalid score value: {score}. Expected a number in range [{ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE}, {ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE}].",
+                     internal_message="Invalid score value.",
+                     category=ErrorCategory.FAILED_EXECUTION,
+                     blame=ErrorBlame.SYSTEM_ERROR,
+                 )
+
+             # Format the output
+             reason = llm_output.get("chain_of_thought", "")
+             score = float(score)
+             score_result = "pass" if score >= self.threshold else "fail"
+             response_dict = {
+                 self._result_key: score,
+                 f"{self._result_key}_result": score_result,
+                 f"{self._result_key}_threshold": self.threshold,
+                 f"{self._result_key}_reason": reason,
+                 "details": llm_output.get("details", {}),
+             }
+             return response_dict
+
+         else:
+             raise EvaluationException(
+                 message="Tool call accuracy evaluator returned invalid output.",
+                 blame=ErrorBlame.SYSTEM_ERROR,
+                 category=ErrorCategory.FAILED_EXECUTION,
+                 target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+             )

      async def _real_call(self, **kwargs):
          """The asynchronous call where real end-to-end evaluation logic is performed.
@@ -246,106 +281,98 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
          :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
          """
          # Convert inputs into list of evaluable inputs.
-         eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
-         if len(eval_input_list) == 0:
-             return {
-                 self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
-                 f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
-                 f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
-                 f"{self._AGGREGATE_RESULT_KEY}_reason": "No tool calls were made.",
-                 "per_tool_call_details": [],
-             }
-
-         per_turn_results = []
-         # Evaluate all inputs.
-         for eval_input in eval_input_list:
-             if self._is_applicable_tool(eval_input):
-                 per_turn_results.append(await self._do_eval(eval_input))
-             else:
-                 per_turn_results.append(self._not_applicable_result(eval_input))
-
-         return self._aggregate_results(per_turn_results=per_turn_results)
-
-     def _is_applicable_tool(self, eval_input):
-         """Determine if a given tool should be evaluated, since we only evaluate tools that
-         have sufficient context available.
-
-         :type eval_input: Dict
-         :return: True if the tool call should be evaluated
-         :rtype: bool
-         """
-         tool_definition = eval_input.get("tool_definition")
-         if tool_definition is None or len(tool_definition) != 1:
-             return False
-         tool_type = tool_definition[0].get("type")
-         if tool_type is None or tool_type != "function":
-             return False
-         return True
-
-     def _not_applicable_result(self, eval_input):
+         eval_input = self._convert_kwargs_to_eval_input(**kwargs)
+         if isinstance(eval_input, dict) and eval_input.get("error_message"):
+             # If there is an error message, return not applicable result
+             return self._not_applicable_result(eval_input.get("error_message"))
+         # Do the evaluation
+         result = await self._do_eval(eval_input)
+         # Return the result
+         return result
+
+     def _not_applicable_result(self, error_message):
          """Return a result indicating that the tool call is not applicable for evaluation.
-
          :param eval_input: The input to the evaluator.
          :type eval_input: Dict
          :return: A dictionary containing the result of the evaluation.
          :rtype: Dict[str, Union[str, float]]
          """
+         # If no tool calls were made or tool call type is not supported, return not applicable result
          return {
-             f"{self._result_key}": self._NOT_APPLICABLE_RESULT,
-             f"{self._result_key}_reason": "Tool call not supported for evaluation",
-             "tool_call_id": eval_input.get("tool_call").get("tool_call_id"),
+             self._result_key: self._NOT_APPLICABLE_RESULT,
+             f"{self._result_key}_result": "pass",
+             f"{self._result_key}_threshold": self.threshold,
+             f"{self._result_key}_reason": error_message,
+             "details": {},
          }

-     def _aggregate_results(self, per_turn_results):
-         """Aggregate the evaluation results of each conversation turn into a single result.
+     def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
+         """Extract the tool definitions that are needed for the provided tool calls."""
+         needed_tool_definitions = []

-         Exact implementation might need to vary slightly depending on the results produced.
-         Default behavior is to average the all number-based outputs.
+         # Add all user-provided tool definitions
+         needed_tool_definitions.extend(tool_definitions)

-         :param per_turn_results: List of evaluation results for each turn in the conversation.
-         :type per_turn_results: List[Dict]
-         :return: A dictionary containing aggregated results, with numeric metrics having their
-             means as top-level values in the dictionary, and all original
-             values (including non-numerics) located in under the "evaluation_per_turn" key,
-             which each sub-key being a metric and each sub-value being a the list of that metric's
-             per-turn values.
-         :rtype: AggregateResult[T_EvalValue]
-         """
+         # Add the needed built-in tool definitions (if they are called)
+         built_in_definitions = _get_needed_built_in_definitions(tool_calls)
+         needed_tool_definitions.extend(built_in_definitions)

-         aggregated: Dict[str, Union[float, Dict[str, List[T_EvalValue]]]] = {}
-         evaluation_per_turn: Dict[str, List[T_EvalValue]] = {}
+         # OpenAPI tool is a collection of functions, so we need to expand it
+         tool_definitions_expanded = list(
+             chain.from_iterable(
+                 tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
+                 for tool in needed_tool_definitions
+             )
+         )

-         # Go over each turn, and rotate the results into a
-         # metric: List[values] format for the evals_per_turn dictionary.
+         # Validate that all tool calls have corresponding definitions
+         for tool_call in tool_calls:
+             if isinstance(tool_call, dict):
+                 tool_type = tool_call.get("type")
+
+                 if tool_type == "tool_call":
+                     tool_name = tool_call.get("name")
+                     if tool_name and tool_name in _BUILT_IN_DESCRIPTIONS:
+                         # This is a built-in tool from converter, already handled above
+                         continue
+                     elif tool_name:
+                         # This is a regular function tool from converter
+                         tool_definition_exists = any(
+                             tool.get("name") == tool_name and tool.get("type", "function") == "function"
+                             for tool in tool_definitions_expanded
+                         )
+                         if not tool_definition_exists:
+                             raise EvaluationException(
+                                 message=f"Tool definition for {tool_name} not found",
+                                 blame=ErrorBlame.USER_ERROR,
+                                 category=ErrorCategory.INVALID_VALUE,
+                                 target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                             )
+                     else:
+                         raise EvaluationException(
+                             message=f"Tool call missing name: {tool_call}",
+                             blame=ErrorBlame.USER_ERROR,
+                             category=ErrorCategory.INVALID_VALUE,
+                             target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                         )
+                 else:
+                     # Unsupported tool format - only converter format is supported
+                     raise EvaluationException(
+                         message=f"Unsupported tool call format. Only converter format is supported: {tool_call}",
+                         blame=ErrorBlame.USER_ERROR,
+                         category=ErrorCategory.INVALID_VALUE,
+                         target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                     )
+             else:
+                 # Tool call is not a dictionary
+                 raise EvaluationException(
+                     message=f"Tool call is not a dictionary: {tool_call}",
+                     blame=ErrorBlame.USER_ERROR,
+                     category=ErrorCategory.INVALID_VALUE,
+                     target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                 )

-         num_evaluated = len(
-             [
-                 per_turn_result
-                 for per_turn_result in per_turn_results
-                 if per_turn_result.get(self._result_key) != self._NOT_APPLICABLE_RESULT
-             ]
-         )
-         if num_evaluated == 0:
-             # None of the invoked tools were applicable, return not applicable result
-             # (If a tool fails evaluation, we'll throw an exception)
-             return {
-                 self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
-                 f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
-                 f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
-                 f"{self._AGGREGATE_RESULT_KEY}_reason": "Tool call accuracy evaluation is not yet supported for the invoked tools.",
-                 "per_tool_call_details": [],
-             }
-         # ignore not_applicable results, where the _result_key will be "not applicable"
-         score = (
-             sum([per_turn_result.get(self._result_key) == True for per_turn_result in per_turn_results]) / num_evaluated
-         )
-         aggregated[self._AGGREGATE_RESULT_KEY] = score
-         aggregated[f"{self._AGGREGATE_RESULT_KEY}_result"] = (
-             self._PASS_RESULT if score >= self.threshold else self._FAIL_RESULT
-         )
-         aggregated[f"{self._AGGREGATE_RESULT_KEY}_threshold"] = self.threshold
-         aggregated["per_tool_call_details"] = per_turn_results
-         return aggregated
+         return needed_tool_definitions

      @override
      def __call__(  # pylint: disable=docstring-missing-param