azure-ai-evaluation 1.10.0__py3-none-any.whl → 1.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that public registry.

Note: this version of azure-ai-evaluation has been flagged as a potentially problematic release.
Files changed (49)
  1. azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
  2. azure/ai/evaluation/_converters/_ai_services.py +60 -10
  3. azure/ai/evaluation/_converters/_models.py +75 -26
  4. azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
  5. azure/ai/evaluation/_evaluate/_evaluate.py +13 -4
  6. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +104 -35
  7. azure/ai/evaluation/_evaluate/_utils.py +4 -0
  8. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +2 -1
  9. azure/ai/evaluation/_evaluators/_common/_base_eval.py +113 -19
  10. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
  11. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +1 -1
  12. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -1
  13. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +113 -3
  14. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +8 -2
  15. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +2 -1
  16. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +10 -2
  17. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +2 -1
  18. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +2 -1
  19. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +8 -2
  20. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +104 -60
  21. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +58 -41
  22. azure/ai/evaluation/_exceptions.py +1 -0
  23. azure/ai/evaluation/_version.py +1 -1
  24. azure/ai/evaluation/red_team/__init__.py +2 -1
  25. azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
  26. azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
  27. azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
  28. azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
  29. azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
  30. azure/ai/evaluation/red_team/_red_team.py +697 -3067
  31. azure/ai/evaluation/red_team/_result_processor.py +610 -0
  32. azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
  33. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +3 -1
  34. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
  35. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  36. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  37. azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
  38. azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
  39. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  40. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  41. azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
  42. azure/ai/evaluation/simulator/_adversarial_simulator.py +9 -0
  43. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +19 -5
  44. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +4 -3
  45. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info}/METADATA +39 -3
  46. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info}/RECORD +49 -41
  47. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info}/WHEEL +1 -1
  48. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info/licenses}/NOTICE.txt +0 -0
  49. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-import os
+import os, logging
 from typing import Dict, List, Optional, Union

 from typing_extensions import overload, override
@@ -9,7 +9,14 @@ from azure.ai.evaluation._legacy._adapters._flows import AsyncPrompty

 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
-from ..._common.utils import construct_prompty_model_config, validate_model_config
+from ..._common.utils import (
+    ErrorBlame,
+    ErrorTarget,
+    EvaluationException,
+    ErrorCategory,
+    construct_prompty_model_config,
+    validate_model_config,
+)

 try:
     from ..._user_agent import UserAgentSingleton
@@ -21,6 +28,9 @@ except ImportError:
         return "None"


+logger = logging.getLogger(__name__)
+
+
 class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """
     Evaluates groundedness score for a given query (optional), response, and context or a multi-turn conversation,
@@ -78,12 +88,13 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     _PROMPTY_FILE_WITH_QUERY = "groundedness_with_query.prompty"
     _RESULT_KEY = "groundedness"
     _OPTIONAL_PARAMS = ["query"]
+    _SUPPORTED_TOOLS = ["file_search"]

     id = "azureai://built-in/evaluators/groundedness"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold=3, **kwargs):
+    def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_NO_QUERY)  # Default to no query

@@ -93,6 +104,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
         self._model_config = model_config
@@ -120,6 +132,26 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         :rtype: Dict[str, float]
         """

+    @overload
+    def __call__(
+        self,
+        *,
+        query: str,
+        response: List[dict],
+        tool_definitions: List[dict],
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate groundedness for agent response with tool calls. Only file_search tool is supported.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response from the agent to be evaluated.
+        :paramtype response: List[dict]
+        :keyword tool_definitions: The tool definitions used by the agent.
+        :paramtype tool_definitions: List[dict]
+        :return: The groundedness score.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
     @overload
     def __call__(
         self,
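The overload above adds an agent mode to GroundednessEvaluator; its supporting implementation appears in the next hunk. A minimal usage sketch, assuming the converter-style message shapes that the new parsing code expects (endpoint, deployment, file contents, and tool IDs below are placeholders, not values taken from this diff):

# Hedged sketch: message shapes follow the converter format the new overload parses.
from azure.ai.evaluation import GroundednessEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",  # placeholder
    "azure_deployment": "<your-deployment>",                       # placeholder
    "api_key": "<your-api-key>",                                   # placeholder
}
groundedness = GroundednessEvaluator(model_config=model_config)

query = "What is the refund window for online orders?"
response = [
    {   # assistant turn that calls the supported file_search tool
        "role": "assistant",
        "content": [{
            "type": "tool_call",
            "tool_call_id": "call_1",
            "name": "file_search",
            "arguments": {"queries": ["refund window online orders"]},
        }],
    },
    {   # tool turn carrying the file_search results used as grounding context
        "role": "tool",
        "tool_call_id": "call_1",
        "content": [{
            "type": "tool_result",
            "tool_result": [{"content": [{"text": "Orders can be refunded within 30 days of purchase."}]}],
        }],
    },
    {   # final assistant answer that is judged against the extracted context
        "role": "assistant",
        "content": [{"type": "text", "text": "You can request a refund within 30 days of purchase."}],
    },
]
tool_definitions = [
    {"name": "file_search", "type": "file_search", "description": "Search the attached vector store."}
]

result = groundedness(query=query, response=response, tool_definitions=tool_definitions)
# Expected keys, per the result_key used above: "groundedness", "groundedness_result",
# "groundedness_threshold", "groundedness_reason".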
@@ -174,3 +206,81 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)

         return super().__call__(*args, **kwargs)
+
+    async def _real_call(self, **kwargs):
+        """The asynchronous call where real end-to-end evaluation logic is performed.
+
+        :keyword kwargs: The inputs to evaluate.
+        :type kwargs: Dict
+        :return: The evaluation result.
+        :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
+        """
+        # Convert inputs into list of evaluable inputs.
+        try:
+            return await super()._real_call(**kwargs)
+        except EvaluationException as ex:
+            if ex.category == ErrorCategory.NOT_APPLICABLE:
+                return {
+                    self._result_key: self._NOT_APPLICABLE_RESULT,
+                    f"{self._result_key}_result": "pass",
+                    f"{self._result_key}_threshold": self.threshold,
+                    f"{self._result_key}_reason": f"Supported tools were not called. Supported tools for groundedness are {self._SUPPORTED_TOOLS}.",
+                }
+            else:
+                raise ex
+
+    def _convert_kwargs_to_eval_input(self, **kwargs):
+        if "context" in kwargs or "conversation" in kwargs:
+            return super()._convert_kwargs_to_eval_input(**kwargs)
+
+        query = kwargs.get("query")
+        response = kwargs.get("response")
+        tool_definitions = kwargs.get("tool_definitions")
+
+        if not query or not response or not tool_definitions:
+            msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided. For Agent groundedness 'query', 'response' and 'tool_definitions' are required."
+            raise EvaluationException(
+                message=msg,
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
+            )
+
+        context = self._get_context_from_agent_response(response, tool_definitions)
+        if not context:
+            raise EvaluationException(
+                message=f"Context could not be extracted from agent response. Supported tools for groundedness are {self._SUPPORTED_TOOLS}. If supported tools are not used groundedness is not calculated.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.NOT_APPLICABLE,
+                target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
+            )
+
+        return super()._convert_kwargs_to_eval_input(response=response[-1], context=context, query=query)
+
+    def _get_context_from_agent_response(self, response, tool_definitions):
+        context = ""
+        try:
+            logger.debug("Extracting context from response")
+            tool_calls = self._parse_tools_from_response(response=response)
+            logger.debug(f"Tool Calls parsed successfully : {tool_calls}")
+            if tool_calls:
+                for tool_call in tool_calls:
+                    if isinstance(tool_call, dict) and tool_call.get("type") == "tool_call":
+                        tool_name = tool_call.get("name")
+                        for tool in tool_definitions:
+                            if tool.get("name") == tool_name and tool.get("type") in self._SUPPORTED_TOOLS:
+                                if tool_name == "file_search":
+                                    tool_result = tool_call.get("tool_result")
+                                    if tool_result:
+                                        for result in tool_result:
+                                            content_list = result.get("content")
+                                            if content_list:
+                                                for content in content_list:
+                                                    text = content.get("text")
+                                                    if text:
+                                                        context = context + "\n" + str(text)
+        except Exception as ex:
+            logger.debug(f"Error extracting context from agent response : {str(ex)}")
+            context = ""
+
+        return context if context else None
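As _real_call above shows, a NOT_APPLICABLE error raised during context extraction is converted into a passing result with an explanatory reason rather than being surfaced as a failure. A hedged sketch of that fallback, reusing the evaluator and model_config from the previous sketch (the unsupported tool name is illustrative):

# If the agent never calls a supported tool (only file_search is supported), context
# extraction returns None, _convert_kwargs_to_eval_input raises an EvaluationException
# with category NOT_APPLICABLE, and _real_call turns that into a "pass" with a reason.
response_without_file_search = [
    {
        "role": "assistant",
        "content": [{"type": "tool_call", "tool_call_id": "call_1",
                     "name": "fetch_weather", "arguments": {"city": "Seattle"}}],
    },
    {"role": "assistant", "content": [{"type": "text", "text": "It is 18 degrees C in Seattle."}]},
]

result = groundedness(
    query="What is the weather in Seattle?",
    response=response_without_file_search,
    tool_definitions=[{"name": "fetch_weather", "type": "function", "description": "Get the weather.", "parameters": {}}],
)
# result["groundedness_result"] == "pass"; result["groundedness_reason"] explains that
# supported tools were not called; result["groundedness"] is the not-applicable sentinel
# defined on the base evaluator (its value is not part of this diff).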
@@ -61,11 +61,17 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold=_DEFAULT_INTENT_RESOLUTION_THRESHOLD, **kwargs):
+    def __init__(self, model_config, *, threshold=_DEFAULT_INTENT_RESOLUTION_THRESHOLD, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            credential=credential,
+            **kwargs,
+        )

     @overload
     def __call__(
@@ -79,7 +79,7 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, credential=None, threshold=3):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -89,6 +89,7 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )

@@ -73,11 +73,19 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD, **kwargs):
+    def __init__(
+        self, model_config, *, threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD, credential=None, **kwargs
+    ):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            credential=credential,
+            **kwargs,
+        )

     @overload
     def __call__(
@@ -78,7 +78,7 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold: float = 3):  # pylint: disable=super-init-not-called
+    def __init__(self, model_config, *, threshold: float = 3, credential=None):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -88,6 +88,7 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
        )

@@ -75,7 +75,7 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, threshold=3, credential=None):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -85,6 +85,7 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )

@@ -69,11 +69,17 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE, **kwargs):
+    def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            credential=credential,
+            **kwargs,
+        )

     @overload
     def __call__(
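The six hunks above (IntentResolution, Relevance, ResponseCompleteness, Retrieval, Similarity, TaskAdherence) all add the same credential keyword and forward it to PromptyEvaluatorBase, matching the Groundedness change earlier; the actual credential handling lives in the base-class changes listed in the files-changed table (_base_prompty_eval.py). A hedged sketch of the new keyword, assuming a token credential is used in place of an API key (endpoint and deployment values are placeholders):

# Sketch only: credential is simply forwarded to the base class.
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import IntentResolutionEvaluator, TaskAdherenceEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",  # placeholder
    "azure_deployment": "<your-deployment>",                       # placeholder
}
credential = DefaultAzureCredential()

intent_resolution = IntentResolutionEvaluator(model_config=model_config, credential=credential)
task_adherence = TaskAdherenceEvaluator(model_config=model_config, credential=credential)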
@@ -1,11 +1,12 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from itertools import chain
 import math
 import os
 import logging
 import re
-from typing import Dict, List, Union, TypeVar, cast
+from typing import Dict, List, Union, TypeVar, Optional
 from typing_extensions import overload, override
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 from azure.ai.evaluation._exceptions import (
@@ -16,12 +17,46 @@ from azure.ai.evaluation._exceptions import (
 )
 from ..._common.utils import check_score_is_valid
 from azure.ai.evaluation._common._experimental import experimental
+from ..._converters._models import (
+    _BUILT_IN_DESCRIPTIONS,
+    _BUILT_IN_PARAMS,
+)

 logger = logging.getLogger(__name__)

 T_EvalValue = TypeVar("T_EvalValue")


+def _get_built_in_definition(tool_name: str):
+    """Get the definition for the built-in tool."""
+    if tool_name in _BUILT_IN_DESCRIPTIONS:
+        return {
+            "type": tool_name,
+            "description": _BUILT_IN_DESCRIPTIONS[tool_name],
+            "name": tool_name,
+            "parameters": _BUILT_IN_PARAMS.get(tool_name, {}),
+        }
+    return None
+
+
+def _get_needed_built_in_definitions(tool_calls: List[Dict]) -> List[Dict]:
+    """Extract tool definitions needed for the given built-in tool calls."""
+    needed_definitions = []
+    for tool_call in tool_calls:
+        if isinstance(tool_call, dict):
+            tool_type = tool_call.get("type")
+
+            # Only support converter format: {type: "tool_call", name: "bing_custom_search", arguments: {...}}
+            if tool_type == "tool_call":
+                tool_name = tool_call.get("name")
+                if tool_name in _BUILT_IN_DESCRIPTIONS:
+                    built_in_def = _get_built_in_definition(tool_name)
+                    if built_in_def and built_in_def not in needed_definitions:
+                        needed_definitions.append(built_in_def)
+
+    return needed_definitions
+
+
 @experimental
 class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining:
@@ -88,7 +123,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, **kwargs):
+    def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
@@ -96,6 +131,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             model_config=model_config,
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
+            credential=credential,
             **kwargs,
         )

@@ -153,10 +189,9 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         # TODO add warning that only tool calls of type function are supported
         # Collect inputs
         tool_calls = kwargs.get("tool_calls")
-        tool_definitions = kwargs.get("tool_definitions")
+        tool_definitions = kwargs.get("tool_definitions", [])  # Default to empty list
         query = kwargs.get("query")
         response = kwargs.get("response")
-
         # TODO : Support classes that represents tool calls, messages etc once client side definitions are available
         if response:
             parsed_tool_calls = self._parse_tools_from_response(response)
@@ -165,20 +200,23 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):

         if not tool_calls:
             return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
-        if not tool_definitions or len(tool_definitions) == 0:
-            return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}

         if not isinstance(tool_calls, list):
             tool_calls = [tool_calls]
         if not isinstance(tool_definitions, list):
-            tool_definitions = [tool_definitions]
+            tool_definitions = [tool_definitions] if tool_definitions else []

         try:
             needed_tool_definitions = self._extract_needed_tool_definitions(tool_calls, tool_definitions)
         except EvaluationException as e:
-            return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
+            # Check if this is because no tool definitions were provided at all
+            if len(tool_definitions) == 0:
+                return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
+            else:
+                return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
+
         if len(needed_tool_definitions) == 0:
-            return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
+            return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}

         return {
             "query": query,
@@ -268,66 +306,72 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             "details": {},
         }

-    def _parse_tools_from_response(self, response):
-        """Parse the response to extract tool calls and results.
-        :param response: The response to parse.
-        :type response: Union[str, List[dict]]
-        :return: List of tool calls extracted from the response.
-        :rtype: List[dict]
-        """
-        tool_calls = []
-        tool_results_map = {}
-        if isinstance(response, list):
-            for message in response:
-                # Extract tool calls from assistant messages
-                if message.get("role") == "assistant" and isinstance(message.get("content"), list):
-                    for content_item in message.get("content"):
-                        if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
-                            tool_calls.append(content_item)
-
-                # Extract tool results from tool messages
-                elif message.get("role") == "tool" and message.get("tool_call_id"):
-                    tool_call_id = message.get("tool_call_id")
-                    if isinstance(message.get("content"), list) and len(message.get("content")) > 0:
-                        result_content = message.get("content")[0]
-                        if isinstance(result_content, dict) and result_content.get("type") == "tool_result":
-                            tool_results_map[tool_call_id] = result_content
-
-        # Attach results to their corresponding calls
-        for tool_call in tool_calls:
-            tool_call_id = tool_call.get("tool_call_id")
-            if tool_call_id in tool_results_map:
-                tool_call["tool_result"] = tool_results_map[tool_call_id]["tool_result"]
-
-        return tool_calls
-
     def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
-        """Extract the tool definitions that are needed for the provided tool calls.
-        :param tool_calls: List of tool calls to evaluate.
-        :type tool_calls: List[dict]
-        :param tool_definitions: List of tool definitions to use for evaluation.
-        :type tool_definitions: List[dict]
-        :return: List of tool definitions that are needed for the provided tool calls.
-        :rtype: List[dict]
-        """
+        """Extract the tool definitions that are needed for the provided tool calls."""
         needed_tool_definitions = []
+
+        # Add all user-provided tool definitions
+        needed_tool_definitions.extend(tool_definitions)
+
+        # Add the needed built-in tool definitions (if they are called)
+        built_in_definitions = _get_needed_built_in_definitions(tool_calls)
+        needed_tool_definitions.extend(built_in_definitions)
+
+        # OpenAPI tool is a collection of functions, so we need to expand it
+        tool_definitions_expanded = list(
+            chain.from_iterable(
+                tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
+                for tool in needed_tool_definitions
+            )
+        )
+
+        # Validate that all tool calls have corresponding definitions
         for tool_call in tool_calls:
-            if isinstance(tool_call, dict) and tool_call.get("type") == "tool_call":
-                tool_name = tool_call.get("name")
-                tool_definition = [
-                    tool
-                    for tool in tool_definitions
-                    if tool.get("name") == tool_name and tool.get("type", "function") == "function"
-                ]
-                if len(tool_definition) > 0:
-                    needed_tool_definitions.extend(tool_definition)
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name and tool_name in _BUILT_IN_DESCRIPTIONS:
+                        # This is a built-in tool from converter, already handled above
+                        continue
+                    elif tool_name:
+                        # This is a regular function tool from converter
+                        tool_definition_exists = any(
+                            tool.get("name") == tool_name and tool.get("type", "function") == "function"
+                            for tool in tool_definitions_expanded
+                        )
+                        if not tool_definition_exists:
+                            raise EvaluationException(
+                                message=f"Tool definition for {tool_name} not found",
+                                blame=ErrorBlame.USER_ERROR,
+                                category=ErrorCategory.INVALID_VALUE,
+                                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                            )
+                    else:
+                        raise EvaluationException(
+                            message=f"Tool call missing name: {tool_call}",
+                            blame=ErrorBlame.USER_ERROR,
+                            category=ErrorCategory.INVALID_VALUE,
+                            target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                        )
                 else:
+                    # Unsupported tool format - only converter format is supported
                     raise EvaluationException(
-                        message=f"Tool definition for {tool_name} not found",
+                        message=f"Unsupported tool call format. Only converter format is supported: {tool_call}",
                         blame=ErrorBlame.USER_ERROR,
                         category=ErrorCategory.INVALID_VALUE,
                         target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
                     )
+            else:
+                # Tool call is not a dictionary
+                raise EvaluationException(
+                    message=f"Tool call is not a dictionary: {tool_call}",
+                    blame=ErrorBlame.USER_ERROR,
+                    category=ErrorCategory.INVALID_VALUE,
+                    target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                )
+
         return needed_tool_definitions

     @override
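The rewritten _extract_needed_tool_definitions also expands "openapi" tool definitions, which bundle several functions under one entry, before validating that every non-built-in call has a matching definition. A hedged sketch of a definition shape the expansion above would accept (names are hypothetical; tool_call_accuracy is the evaluator constructed in the previous sketch):

# Illustrative shapes only: the nested "functions" of an openapi tool are flattened by
# chain.from_iterable(...) so that a converter-format call to one of them validates.
openapi_tool = {
    "type": "openapi",
    "name": "issue_tracker",  # hypothetical
    "functions": [
        {"type": "function", "name": "list_issues", "description": "List open issues.", "parameters": {}},
        {"type": "function", "name": "create_issue", "description": "Create an issue.", "parameters": {}},
    ],
}

result = tool_call_accuracy(
    query="Which issues are still open?",
    tool_calls=[{"type": "tool_call", "tool_call_id": "call_1", "name": "list_issues", "arguments": {}}],
    tool_definitions=[openapi_tool],
)
# Calls that are not dictionaries, lack a name, or do not use the converter format now
# raise EvaluationException with messages that identify the specific problem, rather
# than the generic missing-definition error used in 1.10.0.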