azure-ai-evaluation 1.10.0__py3-none-any.whl → 1.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of azure-ai-evaluation has been flagged as potentially problematic.

Files changed (49)
  1. azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
  2. azure/ai/evaluation/_converters/_ai_services.py +60 -10
  3. azure/ai/evaluation/_converters/_models.py +75 -26
  4. azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
  5. azure/ai/evaluation/_evaluate/_evaluate.py +13 -4
  6. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +104 -35
  7. azure/ai/evaluation/_evaluate/_utils.py +4 -0
  8. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +2 -1
  9. azure/ai/evaluation/_evaluators/_common/_base_eval.py +113 -19
  10. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
  11. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +1 -1
  12. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -1
  13. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +113 -3
  14. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +8 -2
  15. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +2 -1
  16. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +10 -2
  17. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +2 -1
  18. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +2 -1
  19. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +8 -2
  20. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +104 -60
  21. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +58 -41
  22. azure/ai/evaluation/_exceptions.py +1 -0
  23. azure/ai/evaluation/_version.py +1 -1
  24. azure/ai/evaluation/red_team/__init__.py +2 -1
  25. azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
  26. azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
  27. azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
  28. azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
  29. azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
  30. azure/ai/evaluation/red_team/_red_team.py +697 -3067
  31. azure/ai/evaluation/red_team/_result_processor.py +610 -0
  32. azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
  33. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +3 -1
  34. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
  35. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  36. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  37. azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
  38. azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
  39. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  40. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  41. azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
  42. azure/ai/evaluation/simulator/_adversarial_simulator.py +9 -0
  43. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +19 -5
  44. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +4 -3
  45. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info}/METADATA +39 -3
  46. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info}/RECORD +49 -41
  47. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info}/WHEEL +1 -1
  48. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info/licenses}/NOTICE.txt +0 -0
  49. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate_aoai.py

@@ -29,6 +29,10 @@ class OAIEvalRunCreationInfo(TypedDict, total=True):
     eval_group_id: str
     eval_run_id: str
     grader_name_map: Dict[str, str]
+    # Total number of expected rows in the original dataset. Used to
+    # re-align AOAI grader results to guard against silent row drops
+    # causing horizontal concatenation misalignment.
+    expected_rows: int
 
 
 def _split_evaluators_and_grader_configs(
@@ -157,7 +161,11 @@ def _begin_single_aoai_evaluation(
     )
 
     return OAIEvalRunCreationInfo(
-        client=client, eval_group_id=eval_group_info.id, eval_run_id=eval_run_id, grader_name_map=grader_name_map
+        client=client,
+        eval_group_id=eval_group_info.id,
+        eval_run_id=eval_run_id,
+        grader_name_map=grader_name_map,
+        expected_rows=len(data),
     )
 
 
@@ -214,7 +222,7 @@ def _get_single_run_results(
     )
 
     # Convert run results into a dictionary of metrics
-    run_metrics = {}
+    run_metrics: Dict[str, Any] = {}
     if run_results.per_testing_criteria_results is None:
         msg = (
             "AOAI evaluation run returned no results, despite 'completed' status. This might"
@@ -231,28 +239,16 @@ def _get_single_run_results(
         grader_name = run_info["grader_name_map"][criteria_result.testing_criteria]
         passed = criteria_result.passed
         failed = criteria_result.failed
-        ratio = passed / (passed + failed)
+        ratio = passed / (passed + failed) if (passed + failed) else 0.0
         formatted_column_name = f"{grader_name}.pass_rate"
         run_metrics[formatted_column_name] = ratio
 
-    # Get full results and convert them into a dataframe.
-    # Notes on raw full data output from OAI eval runs:
-    # Each row in the full results list in itself a list.
-    # Each entry corresponds to one grader's results from the criteria list
-    # that was inputted to the eval group.
-    # Each entry is a dictionary, with a name, sample, passed boolean, and score number.
-    # The name is used to figure out which grader the entry refers to, the sample is ignored.
-    # The passed and score values are then added to the results dictionary, prepended with the grader's name
-    # as entered by the user in the inputted dictionary.
-    # Other values, if they exist, are also added to the results dictionary.
-
     # Collect all results with pagination
-    all_results = []
-    next_cursor = None
+    all_results: List[Any] = []
+    next_cursor: Optional[str] = None
     limit = 100  # Max allowed by API
 
     while True:
-        # Build kwargs for the API call
         list_kwargs = {"eval_id": run_info["eval_group_id"], "run_id": run_info["eval_run_id"], "limit": limit}
         if next_cursor is not None:
             list_kwargs["after"] = next_cursor
@@ -265,28 +261,50 @@ def _get_single_run_results(
         # Check for more pages
         if hasattr(raw_list_results, "has_more") and raw_list_results.has_more:
             if hasattr(raw_list_results, "data") and len(raw_list_results.data) > 0:
-                # Get the last item's ID for cursor-based pagination
                 next_cursor = raw_list_results.data[-1].id
             else:
                 break
         else:
             break
 
-    listed_results = {"index": []}
-    # raw data has no order guarantees, we need to sort them by their
-    # datasource_item_id
+    listed_results: Dict[str, List[Any]] = {"index": []}
+    # Raw data has no order guarantees; capture datasource_item_id per row for ordering.
    for row_result in all_results:
-        # Add the datasource_item_id for later sorting
        listed_results["index"].append(row_result.datasource_item_id)
        for single_grader_row_result in row_result.results:
-            grader_name = run_info["grader_name_map"][single_grader_row_result["name"]]
-            for name, value in single_grader_row_result.items():
-                if name in ["name"]:  # Todo decide if we also want to exclude "sample"
+            if isinstance(single_grader_row_result, dict):
+                result_dict = single_grader_row_result
+            elif hasattr(single_grader_row_result, "model_dump"):
+                result_dict = single_grader_row_result.model_dump()
+            elif hasattr(single_grader_row_result, "dict"):
+                result_dict = single_grader_row_result.dict()
+            elif hasattr(single_grader_row_result, "__dict__"):
+                result_dict = vars(single_grader_row_result)
+            else:
+                raise EvaluationException(
+                    message=("Unsupported AOAI evaluation result type: " f"{type(single_grader_row_result)!r}."),
+                    blame=ErrorBlame.UNKNOWN,
+                    category=ErrorCategory.FAILED_EXECUTION,
+                    target=ErrorTarget.AOAI_GRADER,
+                )
+
+            grader_result_name = result_dict.get("name", None)
+            if grader_result_name is None:
+                raise EvaluationException(
+                    message="AOAI evaluation response missing grader result name; unable to map to original grader.",
+                    blame=ErrorBlame.UNKNOWN,
+                    category=ErrorCategory.FAILED_EXECUTION,
+                    target=ErrorTarget.AOAI_GRADER,
+                )
+
+            grader_name = run_info["grader_name_map"][grader_result_name]
+            for name, value in result_dict.items():
+                if name in ["name"]:
                     continue
                 if name.lower() == "passed":
-                    # create a `_result` column for each grader
+                    # Create a `_result` column for each grader
                     result_column_name = f"outputs.{grader_name}.{grader_name}_result"
-                    if len(result_column_name) < 50:  # TODO: is this the limit? Should we keep "passed"?
+                    if len(result_column_name) < 50:
                         if result_column_name not in listed_results:
                             listed_results[result_column_name] = []
                         listed_results[result_column_name].append(EVALUATION_PASS_FAIL_MAPPING[value])
@@ -296,23 +314,67 @@ def _get_single_run_results(
                         listed_results[formatted_column_name] = []
                     listed_results[formatted_column_name].append(value)
 
-    # Ensure all columns have the same length as the index
+    # Ensure all columns are the same length as the 'index' list
     num_rows = len(listed_results["index"])
     for col_name in list(listed_results.keys()):
         if col_name != "index":
             col_length = len(listed_results[col_name])
             if col_length < num_rows:
-                # Pad with None values
                 listed_results[col_name].extend([None] * (num_rows - col_length))
             elif col_length > num_rows:
-                # This shouldn't happen, but truncate if it does
                 listed_results[col_name] = listed_results[col_name][:num_rows]
 
     output_df = pd.DataFrame(listed_results)
-    # sort by index
-    output_df = output_df.sort_values("index", ascending=[True])
-    # remove index column
-    output_df.drop(columns=["index"], inplace=True)
+
+    # If the 'index' column is missing for any reason, synthesize it from the current RangeIndex.
+    if "index" not in output_df.columns:
+        output_df["index"] = list(range(len(output_df)))
+
+    # Deterministic ordering by original datasource_item_id
+    output_df = output_df.sort_values("index", ascending=True)
+
+    # Keep a temporary row-id copy for debugging/inspection.
+    # Use underscores (not hyphens) to avoid pandas column handling quirks.
+    output_df["__azure_ai_evaluation_index"] = output_df["index"]
+
+    # Preserve original ids as index, then pad to expected length
+    output_df.set_index("index", inplace=True)
+
+    expected = run_info.get("expected_rows", None)
+    if expected is not None:
+        pre_len = len(output_df)
+        # Assumes original datasource_item_id space is 0..expected-1
+        output_df = output_df.reindex(range(expected))
+        if pre_len != expected:
+            missing_rows = expected - pre_len
+            LOGGER.warning(
+                "AOAI grader run %s returned %d/%d rows; %d missing row(s) padded with NaN for alignment.",
+                run_info["eval_run_id"],
+                pre_len,
+                expected,
+                missing_rows,
+            )
+            # Add a per-grader 'row_missing' boolean for padded rows
+            grader_user_names: Set[str] = set()
+            for col in output_df.columns:
+                if col.startswith("outputs."):
+                    parts = col.split(".")
+                    if len(parts) > 2:
+                        grader_user_names.add(parts[1])
+            if grader_user_names:
+                missing_index_mask = output_df.isna().all(axis=1)
+                for g in grader_user_names:
+                    col_name = f"outputs.{g}.row_missing"
+                    if col_name not in output_df:
+                        output_df[col_name] = False
+                    output_df.loc[missing_index_mask, col_name] = True
+
+    # Drop the temporary helper column before returning (no public surface change)
+    if "__azure_ai_evaluation_index" in output_df.columns:
+        output_df.drop(columns=["__azure_ai_evaluation_index"], inplace=True, errors="ignore")
+
+    # Reset to RangeIndex so downstream concatenation aligns on position
+    output_df.reset_index(drop=True, inplace=True)
     return output_df, run_metrics
 
 
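Reviewer note: the re-alignment added above keys rows by `datasource_item_id`, reindexes against the full expected id range, and flags the all-NaN rows that padding introduces. A minimal standalone sketch of that pandas behavior, not the library code itself; the grader/column names are made up, and it assumes ids run 0..expected-1 as the in-diff comment states:

import pandas as pd

# Hypothetical grader output: the AOAI run returned ids 3, 0 and 1 out of an expected 5.
returned = pd.DataFrame(
    {
        "index": [3, 0, 1],  # datasource_item_id values, unordered
        "outputs.my_grader.score": [0.2, 0.9, 0.7],
    }
)

expected_rows = 5
df = returned.sort_values("index").set_index("index")
df = df.reindex(range(expected_rows))  # ids 2 and 4 were dropped -> all-NaN rows

# Mirror the per-grader 'row_missing' flag added for padded rows.
missing_mask = df.isna().all(axis=1)
df["outputs.my_grader.row_missing"] = False
df.loc[missing_mask, "outputs.my_grader.row_missing"] = True

# Back to a positional RangeIndex so horizontal concatenation lines up downstream.
df.reset_index(drop=True, inplace=True)
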
@@ -406,8 +468,15 @@ def _get_graders_and_column_mappings(
     :rtype: List[Tuple[Dict[str, AoaiGrader], Optional[Dict[str, str]]]]
     """
 
+    if column_mappings is None:
+        return [({name: grader}, None) for name, grader in graders.items()]
     default_mapping = column_mappings.get("default", None)
-    return [({name: grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()]
+    if default_mapping is None:
+        default_mapping = {}
+    return [
+        ({name: grader}, None if column_mappings is None else column_mappings.get(name, default_mapping))
+        for name, grader in graders.items()
+    ]
 
 
 def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
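
For reference, the `column_mappings is None` branch above pairs every grader with `None` instead of raising an attribute error, and a missing `"default"` entry now falls back to an empty mapping. A tiny standalone restatement of that pairing rule (not the library function; the grader objects and mapping values are placeholders):

from typing import Any, Dict, List, Optional, Tuple


def pair_graders_with_mappings(
    graders: Dict[str, Any],
    column_mappings: Optional[Dict[str, Dict[str, str]]],
) -> List[Tuple[Dict[str, Any], Optional[Dict[str, str]]]]:
    # No mappings at all: every grader is paired with None.
    if column_mappings is None:
        return [({name: grader}, None) for name, grader in graders.items()]
    # Otherwise fall back to the "default" mapping, or an empty dict if none was given.
    default_mapping = column_mappings.get("default") or {}
    return [({name: grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()]


graders = {"relevance": object(), "fluency": object()}
print(pair_graders_with_mappings(graders, None))
print(pair_graders_with_mappings(graders, {"default": {"query": "${data.query}"}}))
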
azure/ai/evaluation/_evaluate/_utils.py

@@ -138,6 +138,7 @@ def _log_metrics_and_instance_results_onedp(
     project_url: str,
     evaluation_name: Optional[str],
     name_map: Dict[str, str],
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> Optional[str]:
 
@@ -191,6 +192,7 @@ def _log_metrics_and_instance_results_onedp(
         evaluation=EvaluationUpload(
             display_name=evaluation_name,
             properties=properties,
+            tags=tags,
         )
     )
 
@@ -215,6 +217,7 @@ def _log_metrics_and_instance_results(
     run: Optional[Run],
     evaluation_name: Optional[str],
     name_map: Dict[str, str],
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> Optional[str]:
     from azure.ai.evaluation._evaluate._eval_run import EvalRun
@@ -244,6 +247,7 @@ def _log_metrics_and_instance_results(
         workspace_name=ws_triad.workspace_name,
         management_client=management_client,
         promptflow_run=run,
+        tags=tags,
     ) as ev_run:
         artifact_name = EvalRun.EVALUATION_ARTIFACT
 
azure/ai/evaluation/_evaluators/_coherence/_coherence.py

@@ -66,7 +66,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, threshold=3, credential=None):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -76,6 +76,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
 
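Reviewer note: the new `credential` keyword on `CoherenceEvaluator` (and, per the hunks further down, `FluencyEvaluator` and the shared prompty base) is forwarded to `AsyncPrompty.load` as `token_credential`, enabling keyless Entra ID auth for the judge model. A usage sketch, assuming the target Azure OpenAI endpoint accepts Entra ID tokens; the endpoint and deployment values are placeholders:

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import CoherenceEvaluator

# Placeholder model configuration; no api_key, relying on the credential instead.
model_config = {
    "azure_endpoint": "https://YOUR-RESOURCE.openai.azure.com",
    "azure_deployment": "YOUR-DEPLOYMENT",
}

evaluator = CoherenceEvaluator(model_config=model_config, credential=DefaultAzureCredential())
result = evaluator(query="What is the capital of France?", response="Paris is the capital of France.")
print(result["coherence"])
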
azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -170,15 +170,15 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
 
     # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~
 
-    def _derive_singleton_inputs(self) -> List[str]:
+    def _derive_singleton_inputs(self) -> List[List[str]]:
         """Inspect the evaluator's __call__ function to determine what singleton inputs are expected
         when the evaluator is being used in a non-conversation context.
         By default, it's assumed that any input that is NOT kwargs or a conversation are singleton inputs.
         Thankfully this works the way you'd hope, with the call_signature being based on the child
         function's signature, not the parent's.
 
-        :return: A list of strings representing the names of singleton inputs.
-        :rtype: List[str]
+        :return: A list of lists, where each inner list represents the singleton inputs for each overload.
+        :rtype: List[List[str]]
         """
 
         overloads = get_overloads(self.__call__)
@@ -186,15 +186,66 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             call_signatures = [inspect.signature(self.__call__)]
         else:
             call_signatures = [inspect.signature(overload) for overload in overloads]
-        call_signature = inspect.signature(self.__call__)
-        singletons = []
+
+        overload_inputs = []
         for call_signature in call_signatures:
             params = call_signature.parameters
             if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs):
                 continue
             # exclude self since it is not a singleton input
-            singletons.extend([p for p in params if p != "self"])
-        return singletons
+            overload_inputs.append([p for p in params if p != "self"])
+        return overload_inputs
+
+    def _get_matching_overload_inputs(self, **kwargs) -> List[str]:
+        """Find the overload that matches the provided kwargs and return its input parameters.
+
+        :keyword kwargs: The keyword arguments to match against overloads.
+        :type kwargs: Dict
+        :return: List of input parameter names for the matching overload.
+        :rtype: List[str]
+        """
+        overload_inputs = self._singleton_inputs
+        provided_keys = set(key for key, value in kwargs.items() if value is not None)
+
+        # Find the overload that best matches the provided parameters
+        best_match = None
+        best_score = -1
+
+        for inputs in overload_inputs:
+            input_set = set(inputs)
+
+            # Calculate match score: how many of the overload's params are provided
+            if input_set.issubset(provided_keys):
+                score = len(input_set)
+                if score > best_score:
+                    best_score = score
+                    best_match = inputs
+
+        # If exact match found, return it
+        if best_match is not None:
+            return best_match
+
+        # If no exact match, find the overload with the most overlap
+        for inputs in overload_inputs:
+            input_set = set(inputs)
+            overlap = len(input_set.intersection(provided_keys))
+            if overlap > best_score:
+                best_score = overlap
+                best_match = inputs
+
+        # Return the best match or the first overload as fallback
+        return best_match if best_match is not None else (overload_inputs[0] if overload_inputs else [])
+
+    def _get_all_singleton_inputs(self) -> List[str]:
+        """Get a flattened list of all possible singleton inputs across all overloads.
+
+        :return: Flattened list of all singleton input names.
+        :rtype: List[str]
+        """
+        all_inputs = set()
+        for inputs in self._singleton_inputs:
+            all_inputs.update(inputs)
+        return list(all_inputs)
 
     def _derive_conversation_converter(
         self,
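
In effect, `_derive_singleton_inputs` now returns one parameter list per `__call__` overload, and `_get_matching_overload_inputs` prefers the largest overload whose parameters are fully covered by the non-None kwargs, falling back to the best partial overlap. A simplified standalone sketch of that selection logic; the example evaluator and its overloads are hypothetical:

import inspect
from typing import List

from typing_extensions import get_overloads, overload


class ExampleEvaluator:
    @overload
    def __call__(self, *, query: str, response: str): ...
    @overload
    def __call__(self, *, query: str, response: str, context: str): ...

    def __call__(self, **kwargs):
        return kwargs


def derive_overload_inputs(evaluator) -> List[List[str]]:
    # One name list per registered overload, excluding self.
    overloads = get_overloads(evaluator.__call__)
    signatures = [inspect.signature(o) for o in overloads] or [inspect.signature(evaluator.__call__)]
    return [[p for p in sig.parameters if p != "self"] for sig in signatures]


def match_overload(overload_inputs: List[List[str]], **kwargs) -> List[str]:
    provided = {k for k, v in kwargs.items() if v is not None}
    # Prefer the largest overload fully covered by the provided kwargs.
    covered = [inputs for inputs in overload_inputs if set(inputs) <= provided]
    if covered:
        return max(covered, key=len)
    # Otherwise fall back to the overload with the most overlap.
    return max(overload_inputs, key=lambda inputs: len(set(inputs) & provided))


inputs = derive_overload_inputs(ExampleEvaluator())
print(inputs)                                           # [['query', 'response'], ['query', 'response', 'context']]
print(match_overload(inputs, query="q", response="r"))  # ['query', 'response']
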
@@ -206,10 +257,11 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         :return: The function that will be used to convert conversations to evaluable inputs.
         :rtype: Callable
         """
-        include_context = "context" in self._singleton_inputs
-        include_query = "query" in self._singleton_inputs
-        include_response = "response" in self._singleton_inputs
-        include_ground_truth = "ground_truth" in self._singleton_inputs
+        all_singleton_inputs = self._get_all_singleton_inputs()
+        include_context = "context" in all_singleton_inputs
+        include_query = "query" in all_singleton_inputs
+        include_response = "response" in all_singleton_inputs
+        include_ground_truth = "ground_truth" in all_singleton_inputs
 
         def converter(conversation: Dict) -> List[DerivedEvalInput]:
             messages = cast(List[Dict[str, Any]], conversation["messages"])
@@ -319,9 +371,9 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
        (like a query and response), or they receive conversation that iss a list of dictionary
        values.
 
-        The self._singleton_inputs list assigned during initialization is used to find and extract
-        singleton keywords, and self._allow_conversation_input is used to determine if a conversation
-        is a valid input.
+        The self._singleton_inputs list (containing overload signatures) assigned during initialization
+        is used to find and extract singleton keywords, and determine which overload matches the
+        provided arguments.
 
        If both conversations and singletons are allowed, the function will raise an exception if both
        are inputted.
@@ -339,7 +391,10 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         conversation = kwargs.get("conversation", None)
         singletons = {}
         if len(self._singleton_inputs) > 0:
-            singletons = {key: kwargs.get(key, None) for key in self._singleton_inputs}
+            # Get all possible singleton inputs and check what's provided
+            all_singleton_inputs = self._get_all_singleton_inputs()
+            singletons = {key: kwargs.get(key, None) for key in all_singleton_inputs}
+
         # Check that both conversation and other inputs aren't set
         if conversation is not None and any(singletons.values()):
             msg = f"{type(self).__name__}: Cannot provide both 'conversation' and individual inputs at the same time."
@@ -354,10 +409,16 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             if self._is_multi_modal_conversation(conversation):
                 return self._derive_multi_modal_conversation_converter()(conversation)
             return self._derive_conversation_converter()(conversation)
-        # Handle Singletons
-        required_singletons = remove_optional_singletons(self, singletons)
-        if all(value is not None for value in required_singletons.values()):
-            return [singletons]
+
+        # Handle Singletons - find matching overload
+        matching_inputs = self._get_matching_overload_inputs(**kwargs)
+        if matching_inputs:
+            # Check if all required inputs for this overload are provided
+            required_singletons = {key: kwargs.get(key, None) for key in matching_inputs}
+            required_singletons = remove_optional_singletons(self, required_singletons)
+            if all(value is not None for value in required_singletons.values()):
+                return [singletons]
+
         # Missing input
         msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided."
         raise EvaluationException(
@@ -416,6 +477,39 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             aggregated["evaluation_per_turn"] = evaluation_per_turn
         return aggregated
 
+    def _parse_tools_from_response(self, response):
+        """Parse the response to extract tool calls and results.
+        :param response: The response to parse.
+        :type response: Union[str, List[dict]]
+        :return: List of tool calls extracted from the response.
+        :rtype: List[dict]
+        """
+        tool_calls = []
+        tool_results_map = {}
+        if isinstance(response, list):
+            for message in response:
+                # Extract tool calls from assistant messages
+                if message.get("role") == "assistant" and isinstance(message.get("content"), list):
+                    for content_item in message.get("content"):
+                        if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
+                            tool_calls.append(content_item)
+
+                # Extract tool results from tool messages
+                elif message.get("role") == "tool" and message.get("tool_call_id"):
+                    tool_call_id = message.get("tool_call_id")
+                    if isinstance(message.get("content"), list) and len(message.get("content")) > 0:
+                        result_content = message.get("content")[0]
+                        if isinstance(result_content, dict) and result_content.get("type") == "tool_result":
+                            tool_results_map[tool_call_id] = result_content
+
+        # Attach results to their corresponding calls
+        for tool_call in tool_calls:
+            tool_call_id = tool_call.get("tool_call_id")
+            if tool_call_id in tool_results_map:
+                tool_call["tool_result"] = tool_results_map[tool_call_id]["tool_result"]
+
+        return tool_calls
+
     async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
         """The asynchronous call where real end-to-end evaluation logic is performed.
 
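The new `_parse_tools_from_response` helper walks an agent-style message list, collects `tool_call` content items from assistant messages, and attaches the matching `tool_result` from the corresponding tool message by `tool_call_id`. A sketch of the message shape it expects; the tool name, arguments, and values are invented for illustration:

# Agent-style response in the shape _parse_tools_from_response handles.
response = [
    {
        "role": "assistant",
        "content": [
            {
                "type": "tool_call",
                "tool_call_id": "call_1",
                "name": "fetch_weather",          # hypothetical tool
                "arguments": {"city": "Seattle"},
            }
        ],
    },
    {
        "role": "tool",
        "tool_call_id": "call_1",
        "content": [{"type": "tool_result", "tool_result": {"temperature_f": 57}}],
    },
]

# Expected extraction: one tool_call dict, enriched with its matched result, i.e.
# the first content item above plus "tool_result": {"temperature_f": 57}.
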
azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

@@ -5,7 +5,7 @@
 import math
 import re
 import os
-from typing import Dict, TypeVar, Union
+from typing import Dict, Optional, TypeVar, Union
 
 if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
     from promptflow.core._flow import AsyncPrompty
@@ -13,6 +13,7 @@ else:
     from azure.ai.evaluation._legacy.prompty import AsyncPrompty
 from typing_extensions import override
 
+from azure.core.credentials import TokenCredential
 from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
 from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
@@ -63,6 +64,7 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         model_config: dict,
         eval_last_turn: bool = False,
         threshold: int = 3,
+        credential: Optional[TokenCredential] = None,
         _higher_is_better: bool = False,
         **kwargs,
     ) -> None:
@@ -82,7 +84,10 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         )
 
         self._flow = AsyncPrompty.load(
-            source=self._prompty_file, model=prompty_model_config, is_reasoning_model=self._is_reasoning_model
+            source=self._prompty_file,
+            model=prompty_model_config,
+            token_credential=credential,
+            is_reasoning_model=self._is_reasoning_model,
         )
 
     # __call__ not overridden here because child classes have such varied signatures that there's no point
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

@@ -153,7 +153,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         if query is not None and self._evaluate_query:
             input_data["query"] = str(query)
 
-        if "context" in self._singleton_inputs:
+        if "context" in self._get_all_singleton_inputs():
             context = eval_input.get("context", None)
             if context is None:
                 raise EvaluationException(
azure/ai/evaluation/_evaluators/_fluency/_fluency.py

@@ -68,7 +68,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, credential=None, threshold=3):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -78,6 +78,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
 