azure-ai-evaluation 1.9.0__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +46 -12
- azure/ai/evaluation/_aoai/python_grader.py +84 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +1 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
- azure/ai/evaluation/_common/rai_service.py +3 -3
- azure/ai/evaluation/_common/utils.py +74 -17
- azure/ai/evaluation/_converters/_ai_services.py +60 -10
- azure/ai/evaluation/_converters/_models.py +75 -26
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +70 -22
- azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +163 -44
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +79 -33
- azure/ai/evaluation/_evaluate/_utils.py +5 -2
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +8 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +143 -25
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +19 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -1
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -1
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +5 -2
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -1
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +3 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +1 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +114 -4
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +9 -3
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -1
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +8 -1
- azure/ai/evaluation/_evaluators/_qa/_qa.py +1 -1
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +56 -3
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +11 -3
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +3 -2
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +2 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -2
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +24 -12
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +214 -187
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +126 -31
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +8 -1
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -1
- azure/ai/evaluation/_exceptions.py +1 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +115 -30
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +28 -31
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +2 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +4 -3
- azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
- azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
- azure/ai/evaluation/red_team/_red_team.py +655 -2665
- azure/ai/evaluation/red_team/_red_team_result.py +6 -0
- azure/ai/evaluation/red_team/_result_processor.py +610 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +11 -4
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
- azure/ai/evaluation/red_team/_utils/constants.py +0 -2
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
- azure/ai/evaluation/simulator/_adversarial_simulator.py +14 -2
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +13 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +21 -7
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +24 -5
- azure/ai/evaluation/simulator/_simulator.py +12 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/METADATA +63 -4
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/RECORD +85 -76
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/WHEEL +1 -1
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info/licenses}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/__init__.py
CHANGED

@@ -46,6 +46,7 @@ from ._aoai.label_grader import AzureOpenAILabelGrader
 from ._aoai.string_check_grader import AzureOpenAIStringCheckGrader
 from ._aoai.text_similarity_grader import AzureOpenAITextSimilarityGrader
 from ._aoai.score_model_grader import AzureOpenAIScoreModelGrader
+from ._aoai.python_grader import AzureOpenAIPythonGrader


 _patch_all = []
@@ -53,21 +54,46 @@ _patch_all = []
 # The converter from the AI service to the evaluator schema requires a dependency on
 # ai.projects, but we also don't want to force users installing ai.evaluations to pull
 # in ai.projects. So we only import it if it's available and the user has ai.projects.
-try:
-    from ._converters._ai_services import AIAgentConverter
+# We use lazy loading to avoid printing messages during import unless the classes are actually used.
+_lazy_imports = {}

-    _patch_all.append("AIAgentConverter")
-except ImportError:
-    print(
-        "[INFO] Could not import AIAgentConverter. Please install the dependency with `pip install azure-ai-projects`."
-    )

-
-
+def _create_lazy_import(class_name, module_path, dependency_name):
+    """Create a lazy import function for optional dependencies.

-
-
-
+    Args:
+        class_name: Name of the class to import
+        module_path: Module path to import from
+        dependency_name: Name of the dependency package for error message
+
+    Returns:
+        A function that performs the lazy import when called
+    """
+
+    def lazy_import():
+        try:
+            module = __import__(module_path, fromlist=[class_name])
+            cls = getattr(module, class_name)
+            _patch_all.append(class_name)
+            return cls
+        except ImportError:
+            raise ImportError(
+                f"Could not import {class_name}. Please install the dependency with `pip install {dependency_name}`."
+            )
+
+    return lazy_import
+
+
+_lazy_imports["AIAgentConverter"] = _create_lazy_import(
+    "AIAgentConverter",
+    "azure.ai.evaluation._converters._ai_services",
+    "azure-ai-projects",
+)
+_lazy_imports["SKAgentConverter"] = _create_lazy_import(
+    "SKAgentConverter",
+    "azure.ai.evaluation._converters._sk_services",
+    "semantic-kernel",
+)

 __all__ = [
     "evaluate",
@@ -110,6 +136,14 @@ __all__ = [
     "AzureOpenAIStringCheckGrader",
     "AzureOpenAITextSimilarityGrader",
     "AzureOpenAIScoreModelGrader",
+    "AzureOpenAIPythonGrader",
 ]

 __all__.extend([p for p in _patch_all if p not in __all__])
+
+
+def __getattr__(name):
+    """Handle lazy imports for optional dependencies."""
+    if name in _lazy_imports:
+        return _lazy_imports[name]()
+    raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
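
The practical effect of the lazy-import rework above is that importing the package no longer prints an informational message when azure-ai-projects is missing; the ImportError now surfaces only when the converter is first accessed, via the module-level __getattr__ hook. A minimal sketch of the consumer-visible behavior (illustrative, not from the package's samples):

    import azure.ai.evaluation as evals  # no "[INFO] Could not import..." output anymore

    try:
        # Attribute access triggers the module-level __getattr__ hook, which
        # performs the deferred import of azure.ai.evaluation._converters._ai_services.
        converter_cls = evals.AIAgentConverter
    except ImportError as err:
        # Raised here, at first use, if azure-ai-projects is not installed.
        print(err)
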
azure/ai/evaluation/_aoai/python_grader.py
ADDED

@@ -0,0 +1,84 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from typing import Any, Dict, Union, Optional
+
+from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from openai.types.graders import PythonGrader
+from azure.ai.evaluation._common._experimental import experimental
+
+from .aoai_grader import AzureOpenAIGrader
+
+
+@experimental
+class AzureOpenAIPythonGrader(AzureOpenAIGrader):
+    """
+    Wrapper class for OpenAI's Python code graders.
+
+    Enables custom Python-based evaluation logic with flexible scoring and
+    pass/fail thresholds. The grader executes user-provided Python code
+    to evaluate outputs against custom criteria.
+
+    Supplying a PythonGrader to the `evaluate` method will cause an
+    asynchronous request to evaluate the grader via the OpenAI API. The
+    results of the evaluation will then be merged into the standard
+    evaluation results.
+
+    :param model_config: The model configuration to use for the grader.
+    :type model_config: Union[
+        ~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration
+    ]
+    :param name: The name of the grader.
+    :type name: str
+    :param image_tag: The image tag for the Python execution environment.
+    :type image_tag: str
+    :param pass_threshold: Score threshold for pass/fail classification.
+        Scores >= threshold are considered passing.
+    :type pass_threshold: float
+    :param source: Python source code containing the grade function.
+        Must define: def grade(sample: dict, item: dict) -> float
+    :type source: str
+    :param kwargs: Additional keyword arguments to pass to the grader.
+    :type kwargs: Any
+
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_common.py
+            :start-after: [START python_grader_example]
+            :end-before: [END python_grader_example]
+            :language: python
+            :dedent: 8
+            :caption: Using AzureOpenAIPythonGrader for custom evaluation logic.
+    """
+
+    id = "azureai://built-in/evaluators/azure-openai/python_grader"
+
+    def __init__(
+        self,
+        *,
+        model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
+        name: str,
+        image_tag: str,
+        pass_threshold: float,
+        source: str,
+        **kwargs: Any,
+    ):
+        # Validate pass_threshold
+        if not 0.0 <= pass_threshold <= 1.0:
+            raise ValueError("pass_threshold must be between 0.0 and 1.0")
+
+        # Store pass_threshold as instance attribute for potential future use
+        self.pass_threshold = pass_threshold
+
+        # Create OpenAI PythonGrader instance
+        grader = PythonGrader(
+            name=name,
+            image_tag=image_tag,
+            pass_threshold=pass_threshold,
+            source=source,
+            type="python",
+        )
+
+        super().__init__(model_config=model_config, grader_config=grader, **kwargs)
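
A hedged usage sketch for the new grader, based only on the constructor and docstring above; the endpoint, deployment, image tag, and data-column names are placeholders rather than values mandated by the SDK:

    from azure.ai.evaluation import AzureOpenAIPythonGrader, evaluate

    python_grader = AzureOpenAIPythonGrader(
        model_config={
            "azure_endpoint": "https://<your-resource>.openai.azure.com",
            "api_key": "<api-key>",
            "azure_deployment": "<deployment>",
        },
        name="exact_match",
        image_tag="2025-05-08",  # placeholder Python execution-environment tag
        pass_threshold=0.5,      # scores >= 0.5 are classified as passing
        source='''
    def grade(sample: dict, item: dict) -> float:
        # Score 1.0 when the response matches the reference exactly, else 0.0.
        return 1.0 if item.get("response") == item.get("ground_truth") else 0.0
    ''',
    )

    results = evaluate(data="data.jsonl", evaluators={"exact_match": python_grader})
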
azure/ai/evaluation/_aoai/score_model_grader.py
CHANGED

@@ -84,6 +84,7 @@ class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):
             grader_kwargs["range"] = range
         if sampling_params is not None:
             grader_kwargs["sampling_params"] = sampling_params
+        grader_kwargs["pass_threshold"] = self.pass_threshold

         grader = ScoreModelGrader(**grader_kwargs)

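With the one-line change above, the pass_threshold supplied to AzureOpenAIScoreModelGrader is now forwarded into the ScoreModelGrader definition sent to the service instead of being kept client-side only. A hedged sketch, assuming the grader's documented keyword arguments (all values are illustrative):

    from azure.ai.evaluation import AzureOpenAIScoreModelGrader

    helpfulness = AzureOpenAIScoreModelGrader(
        model_config={
            "azure_endpoint": "https://<your-resource>.openai.azure.com",
            "api_key": "<api-key>",
            "azure_deployment": "<deployment>",
        },
        name="helpfulness",
        model="gpt-4o",
        input=[{"role": "user", "content": "Rate the helpfulness of: {{ item.response }}"}],
        range=[0.0, 1.0],
        pass_threshold=0.7,  # now included in the grader payload, not just stored locally
    )
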
azure/ai/evaluation/_common/onedp/models/_models.py
CHANGED

@@ -1961,12 +1961,16 @@ class Message(_Model):
     :vartype role: str
     :ivar content: The content.
     :vartype content: str
+    :ivar context: The context.
+    :vartype context: str
     """

     role: Optional[str] = rest_field(name="Role", visibility=["read", "create", "update", "delete", "query"])
     """The role."""
     content: Optional[str] = rest_field(name="Content", visibility=["read", "create", "update", "delete", "query"])
     """The content."""
+    context: Optional[str] = rest_field(name="Context", visibility=["read", "create", "update", "delete", "query"])
+    """The context."""

     @overload
     def __init__(
@@ -1974,6 +1978,7 @@ class Message(_Model):
         *,
         role: Optional[str] = None,
         content: Optional[str] = None,
+        context: Optional[str] = None,
     ) -> None: ...

     @overload
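
The generated Message model now round-trips an optional Context field alongside Role and Content. An illustrative construction only; the module path is internal and may change between releases:

    from azure.ai.evaluation._common.onedp.models import Message  # internal path

    msg = Message(
        role="user",
        content="What is the capital of France?",
        context="Retrieved passage: Paris is the capital of France.",
    )
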
azure/ai/evaluation/_common/rai_service.py
CHANGED

@@ -290,7 +290,7 @@ async def submit_request_onedp(
     payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)
     headers = get_common_headers(token, evaluator_name)
     if scan_session_id:
-        headers["
+        headers["x-ms-client-request-id"] = scan_session_id
     response = client.evaluations.submit_annotation(payload, headers=headers)
     result = json.loads(response)
     operation_id = result["location"].split("/")[-1]
@@ -319,8 +319,8 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
     token = await fetch_or_reuse_token(credential, token)
     headers = get_common_headers(token)

-    async with
-        response = await client.get(url, headers=headers)
+    async with get_async_http_client() as client:
+        response = await client.get(url, headers=headers, timeout=RAIService.TIMEOUT)

     if response.status_code == 200:
         return response.json()
azure/ai/evaluation/_common/utils.py
CHANGED

@@ -6,11 +6,11 @@ import posixpath
 import re
 import math
 import threading
-from typing import Any, List, Literal, Mapping, Type, TypeVar, Tuple, Union, cast, get_args, get_origin
+from typing import Any, List, Literal, Mapping, Optional, Type, TypeVar, Tuple, Union, cast, get_args, get_origin

 import nltk
 from azure.storage.blob import ContainerClient
-from typing_extensions import NotRequired, Required, TypeGuard
+from typing_extensions import NotRequired, Required, TypeGuard, TypeIs
 from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
 from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
 from azure.ai.evaluation._exceptions import ErrorMessage, ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
@@ -127,17 +127,15 @@ def construct_prompty_model_config(
     return prompty_model_config


-def is_onedp_project(azure_ai_project: AzureAIProject) ->
+def is_onedp_project(azure_ai_project: Optional[Union[str, AzureAIProject]]) -> TypeIs[str]:
     """Check if the Azure AI project is an OneDP project.

     :param azure_ai_project: The scope of the Azure AI project.
-    :type azure_ai_project:
+    :type azure_ai_project: Optional[Union[str,~azure.ai.evaluation.AzureAIProject]]
     :return: True if the Azure AI project is an OneDP project, False otherwise.
     :rtype: bool
     """
-    if isinstance(azure_ai_project, str):
-        return True
-    return False
+    return isinstance(azure_ai_project, str)
@@ -494,14 +492,17 @@ def _extract_text_from_content(content):
     return text


-def _get_conversation_history(query):
+def _get_conversation_history(query, include_system_messages=False):
     all_user_queries = []
     cur_user_query = []
     all_agent_responses = []
     cur_agent_response = []
+    system_message = None
     for msg in query:
         if not "role" in msg:
             continue
+        if include_system_messages and msg["role"] == "system" and "content" in msg:
+            system_message = msg.get("content", "")
         if msg["role"] == "user" and "content" in msg:
             if cur_agent_response != []:
                 all_agent_responses.append(cur_agent_response)
@@ -530,13 +531,18 @@ def _get_conversation_history(query):
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
         )
-    return {"user_queries": all_user_queries, "agent_responses": all_agent_responses}
-
+    result = {"user_queries": all_user_queries, "agent_responses": all_agent_responses}
+    if include_system_messages:
+        result["system_message"] = system_message
+    return result


 def _pretty_format_conversation_history(conversation_history):
     """Formats the conversation history for better readability."""
     formatted_history = ""
+    if "system_message" in conversation_history and conversation_history["system_message"] is not None:
+        formatted_history += "SYSTEM_PROMPT:\n"
+        formatted_history += " " + conversation_history["system_message"] + "\n\n"
     for i, (user_query, agent_response) in enumerate(
         zip(conversation_history["user_queries"], conversation_history["agent_responses"] + [None])
     ):
@@ -552,10 +558,10 @@ def _pretty_format_conversation_history(conversation_history):
     return formatted_history


-def reformat_conversation_history(query, logger=None):
+def reformat_conversation_history(query, logger=None, include_system_messages=False):
     """Reformats the conversation history to a more compact representation."""
     try:
-        conversation_history = _get_conversation_history(query)
+        conversation_history = _get_conversation_history(query, include_system_messages=include_system_messages)
         return _pretty_format_conversation_history(conversation_history)
     except:
         # If the conversation history cannot be parsed for whatever reason (e.g. the converter format changed), the original query is returned
@@ -570,22 +576,53 @@ def reformat_conversation_history(query, logger=None):
     return query


-def _get_agent_response(agent_response_msgs):
-    """Extracts
+def _get_agent_response(agent_response_msgs, include_tool_messages=False):
+    """Extracts formatted agent response including text, and optionally tool calls/results."""
     agent_response_text = []
+    tool_results = {}
+
+    # First pass: collect tool results
+    if include_tool_messages:
+        for msg in agent_response_msgs:
+            if msg.get("role") == "tool" and "tool_call_id" in msg:
+                for content in msg.get("content", []):
+                    if content.get("type") == "tool_result":
+                        result = content.get("tool_result")
+                        tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}"
+
+    # Second pass: parse assistant messages and tool calls
     for msg in agent_response_msgs:
-        if "role" in msg and msg
+        if "role" in msg and msg.get("role") == "assistant" and "content" in msg:
             text = _extract_text_from_content(msg["content"])
             if text:
                 agent_response_text.extend(text)
+            if include_tool_messages:
+                for content in msg.get("content", []):
+                    # Todo: Verify if this is the correct way to handle tool calls
+                    if content.get("type") == "tool_call":
+                        if "tool_call" in content and "function" in content.get("tool_call", {}):
+                            tc = content.get("tool_call", {})
+                            func_name = tc.get("function", {}).get("name", "")
+                            args = tc.get("function", {}).get("arguments", {})
+                            tool_call_id = tc.get("id")
+                        else:
+                            tool_call_id = content.get("tool_call_id")
+                            func_name = content.get("name", "")
+                            args = content.get("arguments", {})
+                        args_str = ", ".join(f'{k}="{v}"' for k, v in args.items())
+                        call_line = f"[TOOL_CALL] {func_name}({args_str})"
+                        agent_response_text.append(call_line)
+                        if tool_call_id in tool_results:
+                            agent_response_text.append(tool_results[tool_call_id])

     return agent_response_text


-def reformat_agent_response(response, logger=None):
+def reformat_agent_response(response, logger=None, include_tool_messages=False):
     try:
         if response is None or response == []:
             return ""
-        agent_response = _get_agent_response(response)
+        agent_response = _get_agent_response(response, include_tool_messages=include_tool_messages)
         if agent_response == []:
             # If no message could be extracted, likely the format changed, fallback to the original response in that case
             if logger:
@@ -602,6 +639,26 @@ def reformat_agent_response(response, logger=None):
         return response


+def reformat_tool_definitions(tool_definitions, logger=None):
+    try:
+        output_lines = ["TOOL_DEFINITIONS:"]
+        for tool in tool_definitions:
+            name = tool.get("name", "unnamed_tool")
+            desc = tool.get("description", "").strip()
+            params = tool.get("parameters", {}).get("properties", {})
+            param_names = ", ".join(params.keys()) if params else "no parameters"
+            output_lines.append(f"- {name}: {desc} (inputs: {param_names})")
+        return "\n".join(output_lines)
+    except Exception as e:
+        # If the tool definitions cannot be parsed for whatever reason, the original tool definitions are returned
+        # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details.
+        if logger:
+            logger.warning(
+                f"Tool definitions could not be parsed, falling back to original definitions: {tool_definitions}"
+            )
+        return tool_definitions
+
+
 def upload(path: str, container_client: ContainerClient, logger=None):
     """Upload files or directories to Azure Blob Storage using a container client.

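The new reformat_tool_definitions helper flattens tool schemas into the compact text block used in evaluator prompts. A sketch of its behavior, derived directly from the code above (the module path is internal, so treat the import as illustrative):

    from azure.ai.evaluation._common.utils import reformat_tool_definitions

    tools = [
        {
            "name": "fetch_weather",
            "description": "Look up the current weather for a city.",
            "parameters": {"properties": {"city": {"type": "string"}}},
        }
    ]

    print(reformat_tool_definitions(tools))
    # TOOL_DEFINITIONS:
    # - fetch_weather: Look up the current weather for a city. (inputs: city)
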
azure/ai/evaluation/_converters/_ai_services.py
CHANGED

@@ -11,7 +11,18 @@ from azure.ai.evaluation._common._experimental import experimental
 from packaging.version import Version

 # Constants.
-from ._models import
+from ._models import (
+    _USER,
+    _AGENT,
+    _TOOL,
+    _TOOL_CALL,
+    _TOOL_CALLS,
+    _FUNCTION,
+    _BUILT_IN_DESCRIPTIONS,
+    _BUILT_IN_PARAMS,
+    _OPENAPI,
+    OpenAPIToolDefinition,
+)

 # Message instances.
 from ._models import Message, SystemMessage, UserMessage, AssistantMessage, ToolCall
@@ -93,7 +104,7 @@ class AIAgentConverter:
         return tool_calls_chronological

     @staticmethod
-    def _extract_function_tool_definitions(thread_run: object) -> List[ToolDefinition]:
+    def _extract_function_tool_definitions(thread_run: object) -> List[Union[ToolDefinition, OpenAPIToolDefinition]]:
         """
         Extracts tool definitions from a thread run.

@@ -121,6 +132,26 @@ class AIAgentConverter:
                         parameters=parameters,
                     )
                 )
+            elif tool.type == _OPENAPI:
+                openapi_tool = tool.openapi
+                tool_definition = OpenAPIToolDefinition(
+                    name=openapi_tool.name,
+                    description=openapi_tool.description,
+                    type=_OPENAPI,
+                    spec=openapi_tool.spec,
+                    auth=openapi_tool.auth.as_dict(),
+                    default_params=openapi_tool.default_params.as_dict() if openapi_tool.default_params else None,
+                    functions=[
+                        ToolDefinition(
+                            name=func.get("name"),
+                            description=func.get("description"),
+                            parameters=func.get("parameters"),
+                            type="function",
+                        )
+                        for func in openapi_tool.get("functions")
+                    ],
+                )
+                final_tools.append(tool_definition)
             else:
                 # Add limited support for built-in tools. Descriptions and parameters
                 # are not published, but we'll include placeholders.
@@ -243,16 +274,30 @@ class AIAgentConverter:
                 if len(single_turn.content) < 1:
                     continue

-
-                content
-
-                    "text":
-
+                content_list = []
+                # If content is a list, process all content items.
+                for content_item in single_turn.content:
+                    if content_item.type == "text":
+                        content_list.append(
+                            {
+                                "type": "text",
+                                "text": content_item.text.value,
+                            }
+                        )
+                    elif content_item.type == "image":
+                        content_list.append(
+                            {
+                                "type": "image",
+                                "image": {
+                                    "file_id": content_item.image_file.file_id,
+                                },
+                            }
+                        )

                 # If we have a user message, then we save it as such and since it's a human message, there is no
                 # run_id associated with it.
                 if single_turn.role == _USER:
-                    final_messages.append(UserMessage(content=
+                    final_messages.append(UserMessage(content=content_list, createdAt=single_turn.created_at))
                     continue
@@ -261,7 +306,7 @@ class AIAgentConverter:
                 if single_turn.role == _AGENT:
                     # We are required to put the run_id in the assistant message.
                     final_messages.append(
-                        AssistantMessage(content=
+                        AssistantMessage(content=content_list, run_id=single_turn.run_id, createdAt=single_turn.created_at)
                     )
                     continue
@@ -791,6 +836,7 @@ class LegacyAgentDataRetriever(AIAgentDataRetriever):
                 limit=self._AI_SERVICES_API_MAX_LIMIT,
                 order="asc",
                 after=after,
+                include=["step_details.tool_calls[*].file_search.results[*].content"],
             )
             has_more = run_steps.has_more
             after = run_steps.last_id
@@ -838,7 +884,11 @@ class FDPAgentDataRetriever(AIAgentDataRetriever):
     def _list_run_steps_chronological(self, thread_id: str, run_id: str):

         return self.project_client.agents.run_steps.list(
-            thread_id=thread_id,
+            thread_id=thread_id,
+            run_id=run_id,
+            limit=self._AI_SERVICES_API_MAX_LIMIT,
+            order="asc",
+            include=["step_details.tool_calls[*].file_search.results[*].content"],
         )

     def _list_run_ids_chronological(self, thread_id: str) -> List[str]:
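
Both retrievers now pass the same include filter when listing run steps, so file-search tool calls come back with their result contents inlined. A hedged sketch of the equivalent direct call with the azure-ai-projects client (the call shape is taken from the diff above; client construction details are illustrative):

    from azure.identity import DefaultAzureCredential
    from azure.ai.projects import AIProjectClient

    project_client = AIProjectClient(
        endpoint="https://<your-project-endpoint>",
        credential=DefaultAzureCredential(),
    )

    # Mirrors the converter change: request file-search result contents with each step.
    run_steps = project_client.agents.run_steps.list(
        thread_id="<thread-id>",
        run_id="<run-id>",
        order="asc",
        include=["step_details.tool_calls[*].file_search.results[*].content"],
    )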