PyPI - azure-ai-evaluation - Versions diffs - 1.8.0__py3-none-any.whl → 1.10.0__py3-none-any.whl - Mend

azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of azure-ai-evaluation might be problematic. Click here for more details.

Files changed (142) hide show

azure/ai/evaluation/_common/utils.py CHANGED Viewed

@@ -6,14 +6,14 @@ import posixpath
 import re
 import math
 import threading
-from typing import Any, List, Literal, Mapping, Type, TypeVar, Tuple, Union, cast, get_args, get_origin
+from typing import Any, List, Literal, Mapping, Optional, Type, TypeVar, Tuple, Union, cast, get_args, get_origin
 import nltk
 from azure.storage.blob import ContainerClient
-from typing_extensions import NotRequired, Required, TypeGuard
+from typing_extensions import NotRequired, Required, TypeGuard, TypeIs
 from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
 from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._exceptions import ErrorMessage, ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._model_configurations import (
     AzureAIProject,
     AzureOpenAIModelConfiguration,
@@ -126,17 +126,17 @@ def construct_prompty_model_config(
     return prompty_model_config
-def is_onedp_project(azure_ai_project: AzureAIProject) -> bool:
+def is_onedp_project(azure_ai_project: Optional[Union[str, AzureAIProject]]) -> TypeIs[str]:
     """Check if the Azure AI project is an OneDP project.
     :param azure_ai_project: The scope of the Azure AI project.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :type azure_ai_project: Optional[Union[str,~azure.ai.evaluation.AzureAIProject]]
     :return: True if the Azure AI project is an OneDP project, False otherwise.
     :rtype: bool
     """
-    if isinstance(azure_ai_project, str):
-        return True
-    return False
+    return isinstance(azure_ai_project, str)
 def validate_azure_ai_project(o: object) -> AzureAIProject:
     fields = {"subscription_id": str, "resource_group_name": str, "project_name": str}
@@ -291,7 +291,8 @@ def _validate_typed_dict(o: object, t: Type[T_TypedDict]) -> T_TypedDict:
     return cast(T_TypedDict, o)
-def check_score_is_valid(score: Union[str, float], min_score = 1, max_score = 5) -> bool:
+def check_score_is_valid(score: Union[str, float], min_score=1, max_score=5) -> bool:
     """Check if the score is valid, i.e. is convertable to number and is in the range [min_score, max_score].
     :param score: The score to check.
@@ -310,6 +311,7 @@ def check_score_is_valid(score: Union[str, float], min_score = 1, max_score = 5)
     return min_score <= numeric_score <= max_score
 def parse_quality_evaluator_reason_score(llm_output: str, valid_score_range: str = "[1-5]") -> Tuple[float, str]:
     """Parse the output of prompt-based quality evaluators that return a score and reason.
@@ -481,6 +483,182 @@ def validate_conversation(conversation):
             ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
         )
+def _extract_text_from_content(content):
+    text = []
+    for msg in content:
+        if "text" in msg:
+            text.append(msg["text"])
+    return text
+def _get_conversation_history(query, include_system_messages=False):
+    all_user_queries = []
+    cur_user_query = []
+    all_agent_responses = []
+    cur_agent_response = []
+    system_message = None
+    for msg in query:
+        if not "role" in msg:
+            continue
+        if include_system_messages and msg["role"] == "system" and "content" in msg:
+            system_message = msg.get("content", "")
+        if msg["role"] == "user" and "content" in msg:
+            if cur_agent_response != []:
+                all_agent_responses.append(cur_agent_response)
+                cur_agent_response = []
+            text_in_msg = _extract_text_from_content(msg["content"])
+            if text_in_msg:
+                cur_user_query.append(text_in_msg)
+        if msg["role"] == "assistant" and "content" in msg:
+            if cur_user_query != []:
+                all_user_queries.append(cur_user_query)
+                cur_user_query = []
+            text_in_msg = _extract_text_from_content(msg["content"])
+            if text_in_msg:
+                cur_agent_response.append(text_in_msg)
+    if cur_user_query != []:
+        all_user_queries.append(cur_user_query)
+    if cur_agent_response != []:
+        all_agent_responses.append(cur_agent_response)
+    if len(all_user_queries) != len(all_agent_responses) + 1:
+        raise EvaluationException(
+            message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY,
+            internal_message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY,
+            target=ErrorTarget.CONVERSATION_HISTORY_PARSING,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    result = {"user_queries": all_user_queries, "agent_responses": all_agent_responses}
+    if include_system_messages:
+        result["system_message"] = system_message
+    return result
+def _pretty_format_conversation_history(conversation_history):
+    """Formats the conversation history for better readability."""
+    formatted_history = ""
+    if "system_message" in conversation_history and conversation_history["system_message"] is not None:
+        formatted_history += "SYSTEM_PROMPT:\n"
+        formatted_history += "  " + conversation_history["system_message"] + "\n\n"
+    for i, (user_query, agent_response) in enumerate(
+        zip(conversation_history["user_queries"], conversation_history["agent_responses"] + [None])
+    ):
+        formatted_history += f"User turn {i+1}:\n"
+        for msg in user_query:
+            formatted_history += "  " + "\n  ".join(msg)
+        formatted_history += "\n\n"
+        if agent_response:
+            formatted_history += f"Agent turn {i+1}:\n"
+            for msg in agent_response:
+                formatted_history += "  " + "\n  ".join(msg)
+            formatted_history += "\n\n"
+    return formatted_history
+def reformat_conversation_history(query, logger=None, include_system_messages=False):
+    """Reformats the conversation history to a more compact representation."""
+    try:
+        conversation_history = _get_conversation_history(query, include_system_messages=include_system_messages)
+        return _pretty_format_conversation_history(conversation_history)
+    except:
+        # If the conversation history cannot be parsed for whatever reason (e.g. the converter format changed), the original query is returned
+        # This is a fallback to ensure that the evaluation can still proceed. However the accuracy of the evaluation will be affected.
+        # From our tests the negative impact on IntentResolution is:
+        #   Higher intra model variance (0.142 vs 0.046)
+        #   Higher inter model variance (0.345 vs 0.607)
+        #   Lower percentage of mode in Likert scale (73.4% vs 75.4%)
+        #   Lower pairwise agreement between LLMs (85% vs 90% at the pass/fail level with threshold of 3)
+        if logger:
+            logger.warning(f"Conversation history could not be parsed, falling back to original query: {query}")
+        return query
+def _get_agent_response(agent_response_msgs, include_tool_messages=False):
+    """Extracts formatted agent response including text, and optionally tool calls/results."""
+    agent_response_text = []
+    tool_results = {}
+    # First pass: collect tool results
+    if include_tool_messages:
+        for msg in agent_response_msgs:
+            if msg.get("role") == "tool" and "tool_call_id" in msg:
+                for content in msg.get("content", []):
+                    if content.get("type") == "tool_result":
+                        result = content.get("tool_result")
+                        tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}"
+    # Second pass: parse assistant messages and tool calls
+    for msg in agent_response_msgs:
+        if "role" in msg and msg.get("role") == "assistant" and "content" in msg:
+            text = _extract_text_from_content(msg["content"])
+            if text:
+                agent_response_text.extend(text)
+            if include_tool_messages:
+                for content in msg.get("content", []):
+                    # Todo: Verify if this is the correct way to handle tool calls
+                    if content.get("type") == "tool_call":
+                        if "tool_call" in content and "function" in content.get("tool_call", {}):
+                            tc = content.get("tool_call", {})
+                            func_name = tc.get("function", {}).get("name", "")
+                            args = tc.get("function", {}).get("arguments", {})
+                            tool_call_id = tc.get("id")
+                        else:
+                            tool_call_id = content.get("tool_call_id")
+                            func_name = content.get("name", "")
+                            args = content.get("arguments", {})
+                        args_str = ", ".join(f'{k}="{v}"' for k, v in args.items())
+                        call_line = f"[TOOL_CALL] {func_name}({args_str})"
+                        agent_response_text.append(call_line)
+                        if tool_call_id in tool_results:
+                            agent_response_text.append(tool_results[tool_call_id])
+    return agent_response_text
+def reformat_agent_response(response, logger=None, include_tool_messages=False):
+    try:
+        if response is None or response == []:
+            return ""
+        agent_response = _get_agent_response(response, include_tool_messages=include_tool_messages)
+        if agent_response == []:
+            # If no message could be extracted, likely the format changed, fallback to the original response in that case
+            if logger:
+                logger.warning(
+                    f"Empty agent response extracted, likely due to input schema change. Falling back to using the original response: {response}"
+                )
+            return response
+        return "\n".join(agent_response)
+    except:
+        # If the agent response cannot be parsed for whatever reason (e.g. the converter format changed), the original response is returned
+        # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details.
+        if logger:
+            logger.warning(f"Agent response could not be parsed, falling back to original response: {response}")
+        return response
+def reformat_tool_definitions(tool_definitions, logger=None):
+    try:
+        output_lines = ["TOOL_DEFINITIONS:"]
+        for tool in tool_definitions:
+            name = tool.get("name", "unnamed_tool")
+            desc = tool.get("description", "").strip()
+            params = tool.get("parameters", {}).get("properties", {})
+            param_names = ", ".join(params.keys()) if params else "no parameters"
+            output_lines.append(f"- {name}: {desc} (inputs: {param_names})")
+        return "\n".join(output_lines)
+    except Exception as e:
+        # If the tool definitions cannot be parsed for whatever reason, the original tool definitions are returned
+        # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details.
+        if logger:
+            logger.warning(
+                f"Tool definitions could not be parsed, falling back to original definitions: {tool_definitions}"
+            )
+        return tool_definitions
 def upload(path: str, container_client: ContainerClient, logger=None):
     """Upload files or directories to Azure Blob Storage using a container client.
@@ -509,7 +687,7 @@ def upload(path: str, container_client: ContainerClient, logger=None):
     local_paths = []
     if os.path.isdir(path):
-        for (root, _, filenames) in os.walk(path):
+        for root, _, filenames in os.walk(path):
             upload_path = ""
             if root != path:
                 rel_path = os.path.relpath(root, path)

azure/ai/evaluation/_constants.py CHANGED Viewed

@@ -81,6 +81,7 @@ class _AggregationType(enum.Enum):
     SUM = "sum"
     CUSTOM = "custom"
 class TokenScope(str, enum.Enum):
     """Defines the scope of the token used to access Azure resources."""
@@ -114,4 +115,4 @@ BINARY_AGGREGATE_SUFFIX = "binary_aggregate"
 AOAI_COLUMN_NAME = "aoai"
 DEFAULT_OAI_EVAL_RUN_NAME = "AI_SDK_EVAL_RUN"
-DEFAULT_AOAI_API_VERSION = "2025-04-01-preview" # Unfortunately relying on preview version for now.
+DEFAULT_AOAI_API_VERSION = "2025-04-01-preview"  # Unfortunately relying on preview version for now.

azure/ai/evaluation/_converters/__init__.py CHANGED Viewed

@@ -1,3 +1,3 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
-# ---------------------------------------------------------
+# ---------------------------------------------------------

azure/ai/evaluation/_converters/_ai_services.py CHANGED Viewed

@@ -718,6 +718,7 @@ class AIAgentConverter:
         return AIAgentConverter._convert_from_conversation(data, run_id)
 @experimental
 class AIAgentDataRetriever:
     # Maximum items to fetch in a single AI Services API call (imposed by the service).
@@ -748,6 +749,7 @@ class AIAgentDataRetriever:
     def _list_run_ids_chronological(self, thread_id: str) -> List[str]:
         pass
 @experimental
 class LegacyAgentDataRetriever(AIAgentDataRetriever):
@@ -768,7 +770,8 @@ class LegacyAgentDataRetriever(AIAgentDataRetriever):
         after = None
         while has_more:
             messages = self.project_client.agents.list_messages(
-            thread_id=thread_id, limit=self._AI_SERVICES_API_MAX_LIMIT, order="asc", after=after)
+                thread_id=thread_id, limit=self._AI_SERVICES_API_MAX_LIMIT, order="asc", after=after
+            )
             has_more = messages.has_more
             after = messages.last_id
             if messages.data:
@@ -812,6 +815,7 @@ class LegacyAgentDataRetriever(AIAgentDataRetriever):
     def _get_run(self, thread_id: str, run_id: str):
         return self.project_client.agents.get_run(thread_id=thread_id, run_id=run_id)
 @experimental
 class FDPAgentDataRetriever(AIAgentDataRetriever):
@@ -833,16 +837,13 @@ class FDPAgentDataRetriever(AIAgentDataRetriever):
     def _list_run_steps_chronological(self, thread_id: str, run_id: str):
-        return  self.project_client.agents.run_steps.list(
-                thread_id=thread_id,
-                run_id=run_id,
-                limit=self._AI_SERVICES_API_MAX_LIMIT,
-                order="asc"
-            )
+        return self.project_client.agents.run_steps.list(
+            thread_id=thread_id, run_id=run_id, limit=self._AI_SERVICES_API_MAX_LIMIT, order="asc"
+        )
     def _list_run_ids_chronological(self, thread_id: str) -> List[str]:
         runs = self.project_client.agents.runs.list(thread_id=thread_id, order="asc")
         return [run.id for run in runs]
     def _get_run(self, thread_id: str, run_id: str):
-        return self.project_client.agents.runs.get(thread_id=thread_id, run_id=run_id)
+        return self.project_client.agents.runs.get(thread_id=thread_id, run_id=run_id)

azure/ai/evaluation/_converters/_models.py CHANGED Viewed

@@ -20,6 +20,7 @@ _SYSTEM = "system"
 _USER = "user"
 _AGENT = "assistant"
 _TOOL = "tool"
+_DEVELOPER = "developer"  # part of the semantic kernel
 # Constant definitions for what tool details include.
 _TOOL_CALL = "tool_call"
@@ -81,6 +82,7 @@ _BUILT_IN_PARAMS = {
     },
 }
 class Message(BaseModel):
     """Represents a message in a conversation with agents, assistants, and tools. We need to export these structures
     to JSON for evaluators and we have custom fields such as createdAt, run_id, and tool_call_id, so we cannot use
@@ -123,6 +125,17 @@ class UserMessage(Message):
     role: str = _USER
+class SKDeveloperMessage(Message):
+    """Represents a developer message in a conversation with agents, assistants, and tools.
+    This is used in the context of Semantic Kernel (SK) agents.
+    :param role: The role of the message sender, which is always 'developer'.
+    :type role: str
+    """
+    role: str = _DEVELOPER
 class ToolMessage(Message):
     """Represents a tool message in a conversation with agents, assistants, and tools.
@@ -139,6 +152,19 @@ class ToolMessage(Message):
     tool_call_id: Optional[str] = None
+class SKToolMessage(Message):
+    """Represents a tool message in the context of a Semantic Kernel (SK) agent.
+    :param role: The role of the message sender, which is always 'tool'.
+    :type role: str
+    :param tool_call_id: The ID of the tool call associated with the message. Optional.
+    :type tool_call_id: Optional[str]
+    """
+    role: str = _TOOL
+    tool_call_id: Optional[str] = None
 class AssistantMessage(Message):
     """Represents an assistant message.
@@ -152,6 +178,26 @@ class AssistantMessage(Message):
     role: str = _AGENT
+class SKAssistantMessage(Message):
+    """Represents an assistant message in the context of a Semantic Kernel (SK) agent.
+    :param role: The role of the message sender, which is always 'assistant'.
+    :type role: str
+    """
+    role: str = _AGENT
+class SKAssistantMessage(Message):
+    """Represents an assistant message in the context of a Semantic Kernel (SK) agent.
+    :param role: The role of the message sender, which is always 'assistant'.
+    :type role: str
+    """
+    role: str = _AGENT
 class ToolDefinition(BaseModel):
     """Represents a tool definition that will be used in the agent.

azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

Potentially problematic release.

azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.10.0__py3-none-any.whl