azure-ai-evaluation 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

The registry flags this version of azure-ai-evaluation as potentially problematic; details are available on the registry page for the release.
Files changed (49)
  1. azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
  2. azure/ai/evaluation/_converters/_ai_services.py +60 -10
  3. azure/ai/evaluation/_converters/_models.py +75 -26
  4. azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
  5. azure/ai/evaluation/_evaluate/_evaluate.py +13 -4
  6. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +77 -33
  7. azure/ai/evaluation/_evaluate/_utils.py +4 -0
  8. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +2 -1
  9. azure/ai/evaluation/_evaluators/_common/_base_eval.py +113 -19
  10. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
  11. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +1 -1
  12. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -1
  13. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +113 -3
  14. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +8 -2
  15. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +2 -1
  16. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +10 -2
  17. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +2 -1
  18. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +2 -1
  19. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +8 -2
  20. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +104 -60
  21. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +58 -41
  22. azure/ai/evaluation/_exceptions.py +1 -0
  23. azure/ai/evaluation/_version.py +1 -1
  24. azure/ai/evaluation/red_team/__init__.py +2 -1
  25. azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
  26. azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
  27. azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
  28. azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
  29. azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
  30. azure/ai/evaluation/red_team/_red_team.py +697 -3067
  31. azure/ai/evaluation/red_team/_result_processor.py +610 -0
  32. azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
  33. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +3 -1
  34. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
  35. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  36. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  37. azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
  38. azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
  39. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  40. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  41. azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
  42. azure/ai/evaluation/simulator/_adversarial_simulator.py +9 -0
  43. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +19 -5
  44. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +4 -3
  45. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/METADATA +32 -2
  46. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/RECORD +49 -41
  47. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/WHEEL +1 -1
  48. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info/licenses}/NOTICE.txt +0 -0
  49. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_common/onedp/models/_models.py

@@ -1961,12 +1961,16 @@ class Message(_Model):
     :vartype role: str
     :ivar content: The content.
     :vartype content: str
+    :ivar context: The context.
+    :vartype context: str
     """

     role: Optional[str] = rest_field(name="Role", visibility=["read", "create", "update", "delete", "query"])
     """The role."""
     content: Optional[str] = rest_field(name="Content", visibility=["read", "create", "update", "delete", "query"])
     """The content."""
+    context: Optional[str] = rest_field(name="Context", visibility=["read", "create", "update", "delete", "query"])
+    """The context."""

     @overload
     def __init__(
@@ -1974,6 +1978,7 @@ class Message(_Model):
         *,
         role: Optional[str] = None,
         content: Optional[str] = None,
+        context: Optional[str] = None,
     ) -> None: ...

     @overload
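The snippet below is an illustration, not part of the diff: a minimal sketch of populating the new context field, assuming the generated Message model is re-exported from the internal azure.ai.evaluation._common.onedp.models package.

    # Minimal sketch: exercising the new keyword-only `context` field.
    # Assumes the generated model is importable from this internal path.
    from azure.ai.evaluation._common.onedp.models import Message

    msg = Message(
        role="user",
        content="What is the refund policy?",
        context="Customer is on the enterprise plan.",  # new in 1.11.0
    )
    print(msg.context)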
azure/ai/evaluation/_converters/_ai_services.py

@@ -11,7 +11,18 @@ from azure.ai.evaluation._common._experimental import experimental
 from packaging.version import Version

 # Constants.
-from ._models import _USER, _AGENT, _TOOL, _TOOL_CALL, _TOOL_CALLS, _FUNCTION, _BUILT_IN_DESCRIPTIONS, _BUILT_IN_PARAMS
+from ._models import (
+    _USER,
+    _AGENT,
+    _TOOL,
+    _TOOL_CALL,
+    _TOOL_CALLS,
+    _FUNCTION,
+    _BUILT_IN_DESCRIPTIONS,
+    _BUILT_IN_PARAMS,
+    _OPENAPI,
+    OpenAPIToolDefinition,
+)

 # Message instances.
 from ._models import Message, SystemMessage, UserMessage, AssistantMessage, ToolCall
@@ -93,7 +104,7 @@ class AIAgentConverter:
         return tool_calls_chronological

     @staticmethod
-    def _extract_function_tool_definitions(thread_run: object) -> List[ToolDefinition]:
+    def _extract_function_tool_definitions(thread_run: object) -> List[Union[ToolDefinition, OpenAPIToolDefinition]]:
         """
         Extracts tool definitions from a thread run.

@@ -121,6 +132,26 @@ class AIAgentConverter:
                        parameters=parameters,
                    )
                )
+            elif tool.type == _OPENAPI:
+                openapi_tool = tool.openapi
+                tool_definition = OpenAPIToolDefinition(
+                    name=openapi_tool.name,
+                    description=openapi_tool.description,
+                    type=_OPENAPI,
+                    spec=openapi_tool.spec,
+                    auth=openapi_tool.auth.as_dict(),
+                    default_params=openapi_tool.default_params.as_dict() if openapi_tool.default_params else None,
+                    functions=[
+                        ToolDefinition(
+                            name=func.get("name"),
+                            description=func.get("description"),
+                            parameters=func.get("parameters"),
+                            type="function",
+                        )
+                        for func in openapi_tool.get("functions")
+                    ],
+                )
+                final_tools.append(tool_definition)
            else:
                # Add limited support for built-in tools. Descriptions and parameters
                # are not published, but we'll include placeholders.
@@ -243,16 +274,30 @@ class AIAgentConverter:
            if len(single_turn.content) < 1:
                continue

-           # Build the content of the text message.
-           content = {
-               "type": "text",
-               "text": single_turn.content[0].text.value,
-           }
+           content_list = []
+           # If content is a list, process all content items.
+           for content_item in single_turn.content:
+               if content_item.type == "text":
+                   content_list.append(
+                       {
+                           "type": "text",
+                           "text": content_item.text.value,
+                       }
+                   )
+               elif content_item.type == "image":
+                   content_list.append(
+                       {
+                           "type": "image",
+                           "image": {
+                               "file_id": content_item.image_file.file_id,
+                           },
+                       }
+                   )

            # If we have a user message, then we save it as such and since it's a human message, there is no
            # run_id associated with it.
            if single_turn.role == _USER:
-               final_messages.append(UserMessage(content=[content], createdAt=single_turn.created_at))
+               final_messages.append(UserMessage(content=content_list, createdAt=single_turn.created_at))
                continue

            # In this case, we have an assistant message. Unfortunately, this would only have the user-facing
@@ -261,7 +306,7 @@ class AIAgentConverter:
            if single_turn.role == _AGENT:
                # We are required to put the run_id in the assistant message.
                final_messages.append(
-                   AssistantMessage(content=[content], run_id=single_turn.run_id, createdAt=single_turn.created_at)
+                   AssistantMessage(content=content_list, run_id=single_turn.run_id, createdAt=single_turn.created_at)
                )
                continue

@@ -791,6 +836,7 @@ class LegacyAgentDataRetriever(AIAgentDataRetriever):
                limit=self._AI_SERVICES_API_MAX_LIMIT,
                order="asc",
                after=after,
+               include=["step_details.tool_calls[*].file_search.results[*].content"],
            )
            has_more = run_steps.has_more
            after = run_steps.last_id
@@ -838,7 +884,11 @@ class FDPAgentDataRetriever(AIAgentDataRetriever):
    def _list_run_steps_chronological(self, thread_id: str, run_id: str):

        return self.project_client.agents.run_steps.list(
-           thread_id=thread_id, run_id=run_id, limit=self._AI_SERVICES_API_MAX_LIMIT, order="asc"
+           thread_id=thread_id,
+           run_id=run_id,
+           limit=self._AI_SERVICES_API_MAX_LIMIT,
+           order="asc",
+           include=["step_details.tool_calls[*].file_search.results[*].content"],
        )

    def _list_run_ids_chronological(self, thread_id: str) -> List[str]:
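For reference, the converted message content is now a list that can mix text and image parts, where previously only the first text item was kept. A sketch of the resulting shape, with illustrative values only:

    # Illustration only: shape of a converted message's content after this change.
    converted_content = [
        {"type": "text", "text": "Here is the chart I mentioned:"},
        {"type": "image", "image": {"file_id": "assistant-file-123"}},  # hypothetical file id
    ]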
azure/ai/evaluation/_converters/_models.py

@@ -3,17 +3,31 @@ import json

 from pydantic import BaseModel

-from typing import List, Optional, Union
+from typing import TYPE_CHECKING, Any, List, Optional, Union

 # Models moved in a later version of agents SDK, so try a few different locations
-try:
-    from azure.ai.projects.models import RunStepFunctionToolCall
-except ImportError:
-    pass
-try:
-    from azure.ai.agents.models import RunStepFunctionToolCall
-except ImportError:
-    pass
+# Only import for type checking to avoid runtime import errors
+if TYPE_CHECKING:
+    try:
+        from azure.ai.projects.models import RunStepFunctionToolCall
+    except ImportError:
+        try:
+            from azure.ai.agents.models import RunStepFunctionToolCall
+        except ImportError:
+            # Create a protocol for type checking when the real class isn't available
+            from typing import Protocol
+
+            class RunStepFunctionToolCall(Protocol):
+                """Protocol defining the expected interface for RunStepFunctionToolCall."""
+
+                id: str
+                type: str
+
+                def get(self, key: str, default: Any = None) -> Any: ...
+
+else:
+    # At runtime, we don't need the actual class since it's only used in type annotations
+    RunStepFunctionToolCall = Any

 # Message roles constants.
 _SYSTEM = "system"
@@ -33,9 +47,12 @@ _TOOL_CALLS = "tool_calls"
 # Constants to only be used internally in this file for the built-in tools.
 _CODE_INTERPRETER = "code_interpreter"
 _BING_GROUNDING = "bing_grounding"
+_BING_CUSTOM_SEARCH = "bing_custom_search"
 _FILE_SEARCH = "file_search"
 _AZURE_AI_SEARCH = "azure_ai_search"
+_SHAREPOINT_GROUNDING = "sharepoint_grounding"
 _FABRIC_DATAAGENT = "fabric_dataagent"
+_OPENAPI = "openapi"

 # Built-in tool descriptions and parameters are hidden, but we include basic descriptions
 # for evaluation purposes.
@@ -44,8 +61,10 @@ _BUILT_IN_DESCRIPTIONS = {
    + "generate code, and create graphs and charts using your data. Supports "
    + "up to 20 files.",
    _BING_GROUNDING: "Enhance model output with web data.",
-   _FILE_SEARCH: "Search for data across uploaded files.",
+   _BING_CUSTOM_SEARCH: "Enables agents to retrieve content from a curated subset of websites, enhancing relevance and reducing noise from public web searches.",
+   _FILE_SEARCH: "Search for data across uploaded files. A single call can return multiple results/files in the 'results' field.",
    _AZURE_AI_SEARCH: "Search an Azure AI Search index for relevant data.",
+   _SHAREPOINT_GROUNDING: "Allows agents to access and retrieve relevant content from Microsoft SharePoint document libraries, grounding responses in organizational knowledge.",
    _FABRIC_DATAAGENT: "Connect to Microsoft Fabric data agents to retrieve data across different data sources.",
 }

@@ -59,6 +78,15 @@ _BUILT_IN_PARAMS = {
        "type": "object",
        "properties": {"requesturl": {"type": "string", "description": "URL used in Bing Search API."}},
    },
+   _BING_CUSTOM_SEARCH: {
+       "type": "object",
+       "properties": {
+           "requesturl": {
+               "type": "string",
+               "description": "Search queries, along with pre-configured site restrictions or domain filters.",
+           }
+       },
+   },
    _FILE_SEARCH: {
        "type": "object",
        "properties": {
@@ -76,6 +104,12 @@ _BUILT_IN_PARAMS = {
        "type": "object",
        "properties": {"input": {"type": "string", "description": "Search terms to use."}},
    },
+   _SHAREPOINT_GROUNDING: {
+       "type": "object",
+       "properties": {
+           "input": {"type": "string", "description": "A natural language query to search SharePoint content."}
+       },
+   },
    _FABRIC_DATAAGENT: {
        "type": "object",
        "properties": {"input": {"type": "string", "description": "Search terms to use."}},
@@ -217,6 +251,27 @@ class ToolDefinition(BaseModel):
     parameters: dict


+class OpenAPIToolDefinition(BaseModel):
+    """Represents OpenAPI tool definition that will be used in the agent.
+    :param name: The name of the tool.
+    :type name: str
+    :param type: The type of the tool.
+    :type type: str
+    :param description: A description of the tool.
+    :type description: str
+    :param parameters: The parameters required by the tool.
+    :type parameters: dict
+    """
+
+    name: str
+    type: str
+    description: Optional[str] = None
+    spec: object
+    auth: object
+    default_params: Optional[list[str]] = None
+    functions: list[ToolDefinition]
+
+
 class ToolCall:
     """Represents a tool call, used as an intermediate step in the conversion process.

@@ -247,7 +302,7 @@ class EvaluatorData(BaseModel):

     query: List[Message]
     response: List[Message]
-    tool_definitions: List[ToolDefinition]
+    tool_definitions: List[Union[ToolDefinition, OpenAPIToolDefinition]]

     def to_json(self):
         """Converts the result to a JSON string.
@@ -277,14 +332,16 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Message]:
    # all in most of the cases, and bing would only show the API URL, without arguments or results.
    # Bing grounding would have "bing_grounding" in details with "requesturl" that will just be the API path with query.
    # TODO: Work with AI Services to add converter support for BingGrounding and CodeInterpreter.
-   if hasattr(tool_call.details, _FUNCTION):
+   if hasattr(tool_call.details, _FUNCTION) or tool_call.details.get("function"):
        # This is the internals of the content object that will be included with the tool call.
        tool_call_id = tool_call.details.id
        content_tool_call = {
            "type": _TOOL_CALL,
            "tool_call_id": tool_call_id,
-           "name": tool_call.details.function.name,
-           "arguments": safe_loads(tool_call.details.function.arguments),
+           "name": tool_call.details.get(_FUNCTION).get("name") if tool_call.details.get(_FUNCTION) else None,
+           "arguments": safe_loads(
+               tool_call.details.get(_FUNCTION).get("arguments") if tool_call.details.get(_FUNCTION) else None
+           ),
        }
    else:
        # Treat built-in tools separately. Object models may be unique so handle each case separately
@@ -322,27 +379,19 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Message]:
    # assistant's action of calling the tool.
    messages.append(AssistantMessage(run_id=run_id, content=[to_dict(content_tool_call)], createdAt=tool_call.created))

-   if hasattr(tool_call.details, _FUNCTION):
-       output = safe_loads(tool_call.details.function["output"])
+   if hasattr(tool_call.details, _FUNCTION) or tool_call.details.get("function"):
+       output = safe_loads(tool_call.details.get("function")["output"])
    else:
        try:
            # Some built-ins may have output, others may not
            # Try to retrieve it, but if we don't find anything, skip adding the message
            # Just manually converting to dicts for easy serialization for now rather than custom serializers
            if tool_call.details.type == _CODE_INTERPRETER:
-               output = tool_call.details.code_interpreter.outputs
+               output = [result.as_dict() for result in tool_call.details.code_interpreter.outputs]
            elif tool_call.details.type == _BING_GROUNDING:
                return messages  # not supported yet from bing grounding tool
            elif tool_call.details.type == _FILE_SEARCH:
-               output = [
-                   {
-                       "file_id": result.file_id,
-                       "file_name": result.file_name,
-                       "score": result.score,
-                       "content": result.content,
-                   }
-                   for result in tool_call.details.file_search.results
-               ]
+               output = [result.as_dict() for result in tool_call.details.file_search.results]
            elif tool_call.details.type == _AZURE_AI_SEARCH:
                output = tool_call.details.azure_ai_search["output"]
            elif tool_call.details.type == _FABRIC_DATAAGENT:
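A hand-built example of the new OpenAPIToolDefinition model is sketched below. It is an illustration only, assuming the class is importable from the internal azure.ai.evaluation._converters._models module; the spec and auth payloads shown are placeholders, not real values from the package.

    # Sketch only: constructing the new OpenAPIToolDefinition by hand.
    from azure.ai.evaluation._converters._models import OpenAPIToolDefinition, ToolDefinition

    weather_tool = OpenAPIToolDefinition(
        name="get_weather",
        type="openapi",
        description="Look up the current weather for a city.",
        spec={"openapi": "3.0.0", "paths": {}},  # abbreviated placeholder spec
        auth={"type": "anonymous"},              # placeholder auth payload
        default_params=None,
        functions=[
            ToolDefinition(
                name="get_weather",
                type="function",
                description="Look up the current weather for a city.",
                parameters={"type": "object", "properties": {"city": {"type": "string"}}},
            )
        ],
    )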
azure/ai/evaluation/_evaluate/_eval_run.py

@@ -81,6 +81,8 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
        ~azure.ai.evaluation._promptflow.azure._lite_azure_management_client.LiteMLClient
    :param promptflow_run: The promptflow run used by the
    :type promptflow_run: Optional[promptflow._sdk.entities.Run]
+   :param tags: A dictionary of tags to be added to the evaluation run for tracking and organization purposes.
+   :type tags: Optional[Dict[str, str]]
    """

    _MAX_RETRIES = 5
@@ -98,6 +100,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
        workspace_name: str,
        management_client: LiteMLClient,
        promptflow_run: Optional[Run] = None,
+       tags: Optional[Dict[str, str]] = None,
    ) -> None:
        self._tracking_uri: str = tracking_uri
        self._subscription_id: str = subscription_id
@@ -107,6 +110,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
        self._is_promptflow_run: bool = promptflow_run is not None
        self._run_name = run_name
        self._promptflow_run = promptflow_run
+       self._tags = tags or {}
        self._status = RunStatus.NOT_STARTED
        self._url_base: Optional[str] = None
        self._info: Optional[RunInfo] = None
@@ -173,11 +177,20 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
            )
        else:
            url = f"https://{self._url_base}/mlflow/v2.0" f"{self._get_scope()}/api/2.0/mlflow/runs/create"
+
+           # Prepare tags: start with user tags, ensure mlflow.user is set
+           run_tags = self._tags.copy()
+           if "mlflow.user" not in run_tags:
+               run_tags["mlflow.user"] = "azure-ai-evaluation"
+
+           # Convert tags to MLflow format
+           tags_list = [{"key": key, "value": value} for key, value in run_tags.items()]
+
            body = {
                "experiment_id": "0",
                "user_id": "azure-ai-evaluation",
                "start_time": int(time.time() * 1000),
-               "tags": [{"key": "mlflow.user", "value": "azure-ai-evaluation"}],
+               "tags": tags_list,
            }
            if self._run_name:
                body["run_name"] = self._run_name
azure/ai/evaluation/_evaluate/_evaluate.py

@@ -464,7 +464,7 @@ def _validate_columns_for_evaluators(
    )


-def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name):
+def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name, tags):
    if data is None:
        msg = "The 'data' parameter is required for evaluation."
        raise EvaluationException(
@@ -725,6 +725,7 @@ def evaluate(
    azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
    output_path: Optional[Union[str, os.PathLike]] = None,
    fail_on_evaluator_errors: bool = False,
+   tags: Optional[Dict[str, str]] = None,
    **kwargs,
 ) -> EvaluationResult:
    """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
@@ -757,6 +758,10 @@ def evaluate(
        Defaults to false, which means that evaluations will continue regardless of failures.
        If such failures occur, metrics may be missing, and evidence of failures can be found in the evaluation's logs.
    :paramtype fail_on_evaluator_errors: bool
+   :keyword tags: A dictionary of tags to be added to the evaluation run for tracking and organization purposes.
+       Keys and values must be strings. For more information about tag limits, see:
+       https://learn.microsoft.com/en-us/azure/machine-learning/resource-limits-capacity?view=azureml-api-2#runs
+   :paramtype tags: Optional[Dict[str, str]]
    :keyword user_agent: A string to append to the default user-agent sent with evaluation http requests
    :paramtype user_agent: Optional[str]
    :return: Evaluation results.
@@ -793,6 +798,7 @@ def evaluate(
            azure_ai_project=azure_ai_project,
            output_path=output_path,
            fail_on_evaluator_errors=fail_on_evaluator_errors,
+           tags=tags,
            **kwargs,
        )
    except Exception as e:
@@ -861,6 +867,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
    azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
    output_path: Optional[Union[str, os.PathLike]] = None,
    fail_on_evaluator_errors: bool = False,
+   tags: Optional[Dict[str, str]] = None,
    **kwargs,
 ) -> EvaluationResult:
    if fail_on_evaluator_errors:
@@ -877,6 +884,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
        azure_ai_project=azure_ai_project,
        evaluation_name=evaluation_name,
        fail_on_evaluator_errors=fail_on_evaluator_errors,
+       tags=tags,
        **kwargs,
    )

@@ -956,7 +964,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
    name_map = _map_names_to_builtins(evaluators, graders)
    if is_onedp_project(azure_ai_project):
        studio_url = _log_metrics_and_instance_results_onedp(
-           metrics, results_df, azure_ai_project, evaluation_name, name_map, **kwargs
+           metrics, results_df, azure_ai_project, evaluation_name, name_map, tags=tags, **kwargs
        )
    else:
        # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
@@ -964,7 +972,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
        studio_url = None
        if trace_destination:
            studio_url = _log_metrics_and_instance_results(
-               metrics, results_df, trace_destination, None, evaluation_name, name_map, **kwargs
+               metrics, results_df, trace_destination, None, evaluation_name, name_map, tags=tags, **kwargs
            )

    result_df_dict = results_df.to_dict("records")
@@ -985,6 +993,7 @@ def _preprocess_data(
    azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
    evaluation_name: Optional[str] = None,
    fail_on_evaluator_errors: bool = False,
+   tags: Optional[Dict[str, str]] = None,
    **kwargs,
 ) -> __ValidatedData:
    # Process evaluator config to replace ${target.} with ${data.}
@@ -992,7 +1001,7 @@ def _preprocess_data(
        evaluator_config = {}

    input_data_df = _validate_and_load_data(
-       target, data, evaluators_and_graders, output_path, azure_ai_project, evaluation_name
+       target, data, evaluators_and_graders, output_path, azure_ai_project, evaluation_name, tags
    )
    if target is not None:
        _validate_columns_for_target(input_data_df, target)
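A usage sketch for the new tags keyword on evaluate(); the data file name, the trivial evaluator, and the tag values below are illustrative, not taken from the package.

    from azure.ai.evaluation import evaluate

    def answer_length(*, response, **kwargs):
        # Trivial custom evaluator, used only to keep the example self-contained.
        return {"answer_length": len(response)}

    result = evaluate(
        data="eval_data.jsonl",  # hypothetical JSONL file with a "response" column
        evaluators={"length": answer_length},
        tags={"team": "search-quality", "run_type": "nightly"},  # forwarded to the evaluation run
    )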
azure/ai/evaluation/_evaluate/_evaluate_aoai.py

@@ -29,6 +29,10 @@ class OAIEvalRunCreationInfo(TypedDict, total=True):
    eval_group_id: str
    eval_run_id: str
    grader_name_map: Dict[str, str]
+   # Total number of expected rows in the original dataset. Used to
+   # re-align AOAI grader results to guard against silent row drops
+   # causing horizontal concatenation misalignment.
+   expected_rows: int


 def _split_evaluators_and_grader_configs(
@@ -157,7 +161,11 @@ def _begin_single_aoai_evaluation(
    )

    return OAIEvalRunCreationInfo(
-       client=client, eval_group_id=eval_group_info.id, eval_run_id=eval_run_id, grader_name_map=grader_name_map
+       client=client,
+       eval_group_id=eval_group_info.id,
+       eval_run_id=eval_run_id,
+       grader_name_map=grader_name_map,
+       expected_rows=len(data),
    )


@@ -214,7 +222,7 @@ def _get_single_run_results(
    )

    # Convert run results into a dictionary of metrics
-   run_metrics = {}
+   run_metrics: Dict[str, Any] = {}
    if run_results.per_testing_criteria_results is None:
        msg = (
            "AOAI evaluation run returned no results, despite 'completed' status. This might"
@@ -231,28 +239,16 @@ def _get_single_run_results(
        grader_name = run_info["grader_name_map"][criteria_result.testing_criteria]
        passed = criteria_result.passed
        failed = criteria_result.failed
-       ratio = passed / (passed + failed)
+       ratio = passed / (passed + failed) if (passed + failed) else 0.0
        formatted_column_name = f"{grader_name}.pass_rate"
        run_metrics[formatted_column_name] = ratio

-   # Get full results and convert them into a dataframe.
-   # Notes on raw full data output from OAI eval runs:
-   # Each row in the full results list in itself a list.
-   # Each entry corresponds to one grader's results from the criteria list
-   # that was inputted to the eval group.
-   # Each entry is a dictionary, with a name, sample, passed boolean, and score number.
-   # The name is used to figure out which grader the entry refers to, the sample is ignored.
-   # The passed and score values are then added to the results dictionary, prepended with the grader's name
-   # as entered by the user in the inputted dictionary.
-   # Other values, if they exist, are also added to the results dictionary.
-
    # Collect all results with pagination
-   all_results = []
-   next_cursor = None
+   all_results: List[Any] = []
+   next_cursor: Optional[str] = None
    limit = 100  # Max allowed by API

    while True:
-       # Build kwargs for the API call
        list_kwargs = {"eval_id": run_info["eval_group_id"], "run_id": run_info["eval_run_id"], "limit": limit}
        if next_cursor is not None:
            list_kwargs["after"] = next_cursor
@@ -265,28 +261,25 @@ def _get_single_run_results(
        # Check for more pages
        if hasattr(raw_list_results, "has_more") and raw_list_results.has_more:
            if hasattr(raw_list_results, "data") and len(raw_list_results.data) > 0:
-               # Get the last item's ID for cursor-based pagination
                next_cursor = raw_list_results.data[-1].id
            else:
                break
        else:
            break

-   listed_results = {"index": []}
-   # raw data has no order guarantees, we need to sort them by their
-   # datasource_item_id
+   listed_results: Dict[str, List[Any]] = {"index": []}
+   # Raw data has no order guarantees; capture datasource_item_id per row for ordering.
    for row_result in all_results:
-       # Add the datasource_item_id for later sorting
        listed_results["index"].append(row_result.datasource_item_id)
        for single_grader_row_result in row_result.results:
            grader_name = run_info["grader_name_map"][single_grader_row_result["name"]]
            for name, value in single_grader_row_result.items():
-               if name in ["name"]:  # Todo decide if we also want to exclude "sample"
+               if name in ["name"]:
                    continue
                if name.lower() == "passed":
-                   # create a `_result` column for each grader
+                   # Create a `_result` column for each grader
                    result_column_name = f"outputs.{grader_name}.{grader_name}_result"
-                   if len(result_column_name) < 50:  # TODO: is this the limit? Should we keep "passed"?
+                   if len(result_column_name) < 50:
                        if result_column_name not in listed_results:
                            listed_results[result_column_name] = []
                        listed_results[result_column_name].append(EVALUATION_PASS_FAIL_MAPPING[value])
@@ -296,23 +289,67 @@ def _get_single_run_results(
                        listed_results[formatted_column_name] = []
                    listed_results[formatted_column_name].append(value)

-   # Ensure all columns have the same length as the index
+   # Ensure all columns are the same length as the 'index' list
    num_rows = len(listed_results["index"])
    for col_name in list(listed_results.keys()):
        if col_name != "index":
            col_length = len(listed_results[col_name])
            if col_length < num_rows:
-               # Pad with None values
                listed_results[col_name].extend([None] * (num_rows - col_length))
            elif col_length > num_rows:
-               # This shouldn't happen, but truncate if it does
                listed_results[col_name] = listed_results[col_name][:num_rows]

    output_df = pd.DataFrame(listed_results)
-   # sort by index
-   output_df = output_df.sort_values("index", ascending=[True])
-   # remove index column
-   output_df.drop(columns=["index"], inplace=True)
+
+   # If the 'index' column is missing for any reason, synthesize it from the current RangeIndex.
+   if "index" not in output_df.columns:
+       output_df["index"] = list(range(len(output_df)))
+
+   # Deterministic ordering by original datasource_item_id
+   output_df = output_df.sort_values("index", ascending=True)
+
+   # Keep a temporary row-id copy for debugging/inspection.
+   # Use underscores (not hyphens) to avoid pandas column handling quirks.
+   output_df["__azure_ai_evaluation_index"] = output_df["index"]
+
+   # Preserve original ids as index, then pad to expected length
+   output_df.set_index("index", inplace=True)
+
+   expected = run_info.get("expected_rows", None)
+   if expected is not None:
+       pre_len = len(output_df)
+       # Assumes original datasource_item_id space is 0..expected-1
+       output_df = output_df.reindex(range(expected))
+       if pre_len != expected:
+           missing_rows = expected - pre_len
+           LOGGER.warning(
+               "AOAI grader run %s returned %d/%d rows; %d missing row(s) padded with NaN for alignment.",
+               run_info["eval_run_id"],
+               pre_len,
+               expected,
+               missing_rows,
+           )
+           # Add a per-grader 'row_missing' boolean for padded rows
+           grader_user_names: Set[str] = set()
+           for col in output_df.columns:
+               if col.startswith("outputs."):
+                   parts = col.split(".")
+                   if len(parts) > 2:
+                       grader_user_names.add(parts[1])
+           if grader_user_names:
+               missing_index_mask = output_df.isna().all(axis=1)
+               for g in grader_user_names:
+                   col_name = f"outputs.{g}.row_missing"
+                   if col_name not in output_df:
+                       output_df[col_name] = False
+                   output_df.loc[missing_index_mask, col_name] = True
+
+   # Drop the temporary helper column before returning (no public surface change)
+   if "__azure_ai_evaluation_index" in output_df.columns:
+       output_df.drop(columns=["__azure_ai_evaluation_index"], inplace=True, errors="ignore")
+
+   # Reset to RangeIndex so downstream concatenation aligns on position
+   output_df.reset_index(drop=True, inplace=True)
    return output_df, run_metrics


@@ -406,8 +443,15 @@ def _get_graders_and_column_mappings(
    :rtype: List[Tuple[Dict[str, AoaiGrader], Optional[Dict[str, str]]]]
    """

+   if column_mappings is None:
+       return [({name: grader}, None) for name, grader in graders.items()]
    default_mapping = column_mappings.get("default", None)
-   return [({name: grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()]
+   if default_mapping is None:
+       default_mapping = {}
+   return [
+       ({name: grader}, None if column_mappings is None else column_mappings.get(name, default_mapping))
+       for name, grader in graders.items()
+   ]


 def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
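The row re-alignment strategy above can be illustrated with a small standalone pandas example (illustrative column names and values): results keyed by datasource_item_id are re-indexed to the full expected range, so missing rows become NaN padding instead of silently shifting later rows during horizontal concatenation.

    import pandas as pd

    expected_rows = 5
    partial = pd.DataFrame(
        {"index": [0, 1, 3], "outputs.grader.score": [0.9, 0.4, 0.7]}  # rows 2 and 4 missing
    ).set_index("index")

    # Pad to the expected row count, then return to a positional RangeIndex.
    aligned = partial.reindex(range(expected_rows)).reset_index(drop=True)
    print(aligned["outputs.grader.score"].tolist())  # [0.9, 0.4, nan, 0.7, nan]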