azure-ai-evaluation 1.11.1__py3-none-any.whl → 1.12.0__py3-none-any.whl

This diff shows the differences between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Advisory: this version of azure-ai-evaluation has been flagged as potentially problematic.

Files changed (35)
  1. azure/ai/evaluation/_aoai/aoai_grader.py +63 -19
  2. azure/ai/evaluation/_aoai/label_grader.py +8 -3
  3. azure/ai/evaluation/_aoai/python_grader.py +8 -3
  4. azure/ai/evaluation/_aoai/score_model_grader.py +8 -3
  5. azure/ai/evaluation/_aoai/string_check_grader.py +9 -4
  6. azure/ai/evaluation/_aoai/text_similarity_grader.py +9 -4
  7. azure/ai/evaluation/_eval_mapping.py +2 -0
  8. azure/ai/evaluation/_evaluate/_evaluate.py +106 -4
  9. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +252 -48
  10. azure/ai/evaluation/_evaluate/_utils.py +7 -3
  11. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
  12. azure/ai/evaluation/_evaluators/_common/_base_eval.py +77 -3
  13. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
  14. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +6 -0
  15. azure/ai/evaluation/_evaluators/_path_efficiency/__init__.py +7 -0
  16. azure/ai/evaluation/_evaluators/_path_efficiency/_path_efficiency.py +342 -0
  17. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +7 -1
  18. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
  19. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
  20. azure/ai/evaluation/_evaluators/_task_success/__init__.py +7 -0
  21. azure/ai/evaluation/_evaluators/_task_success/_task_success.py +168 -0
  22. azure/ai/evaluation/_evaluators/_task_success/task_success.prompty +220 -0
  23. azure/ai/evaluation/_exceptions.py +1 -0
  24. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +7 -2
  25. azure/ai/evaluation/_version.py +1 -1
  26. azure/ai/evaluation/red_team/_mlflow_integration.py +454 -35
  27. azure/ai/evaluation/red_team/_red_team.py +9 -0
  28. azure/ai/evaluation/red_team/_red_team_result.py +230 -1
  29. azure/ai/evaluation/red_team/_result_processor.py +416 -23
  30. azure/ai/evaluation/red_team/_utils/formatting_utils.py +1 -1
  31. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/METADATA +13 -3
  32. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/RECORD +35 -30
  33. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/WHEEL +0 -0
  34. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/licenses/NOTICE.txt +0 -0
  35. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_aoai/aoai_grader.py

@@ -1,13 +1,19 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
- from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+ from typing import TYPE_CHECKING, Any, Dict, Optional, Union
+
+ from typing_extensions import TypeIs

- from azure.ai.evaluation._constants import DEFAULT_AOAI_API_VERSION
+ from azure.ai.evaluation._common._experimental import experimental
+ from azure.ai.evaluation._constants import DEFAULT_AOAI_API_VERSION, TokenScope
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+ from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
  from azure.ai.evaluation._user_agent import UserAgentSingleton
- from typing import Any, Dict, Union
- from azure.ai.evaluation._common._experimental import experimental
+ from azure.core.credentials import TokenCredential
+
+ if TYPE_CHECKING:
+ from openai.lib.azure import AzureADTokenProvider


  @experimental
@@ -30,6 +36,8 @@ class AzureOpenAIGrader:
  to be formatted as a dictionary that matches the specifications of the sub-types of
  the TestingCriterion alias specified in (OpenAI's SDK)[https://github.com/openai/openai-python/blob/ed53107e10e6c86754866b48f8bd862659134ca8/src/openai/types/eval_create_params.py#L151].
  :type grader_config: Dict[str, Any]
+ :param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
+ :type credential: ~azure.core.credentials.TokenCredential
  :param kwargs: Additional keyword arguments to pass to the grader.
  :type kwargs: Any

@@ -43,10 +51,12 @@ class AzureOpenAIGrader:
  *,
  model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
  grader_config: Dict[str, Any],
+ credential: Optional[TokenCredential] = None,
  **kwargs: Any,
  ):
  self._model_config = model_config
  self._grader_config = grader_config
+ self._credential = credential

  if kwargs.get("validate", True):
  self._validate_model_config()
@@ -54,20 +64,39 @@ class AzureOpenAIGrader:

  def _validate_model_config(self) -> None:
  """Validate the model configuration that this grader wrapper is using."""
- if "api_key" not in self._model_config or not self._model_config.get("api_key"):
- msg = f"{type(self).__name__}: Requires an api_key in the supplied model_config."
- raise EvaluationException(
- message=msg,
- blame=ErrorBlame.USER_ERROR,
- category=ErrorCategory.INVALID_VALUE,
- target=ErrorTarget.AOAI_GRADER,
- )
+ msg = None
+ if self._is_azure_model_config(self._model_config):
+ if not any(auth for auth in (self._model_config.get("api_key"), self._credential)):
+ msg = (
+ f"{type(self).__name__}: Requires an api_key in the supplied model_config, "
+ + "or providing a credential to the grader's __init__ method. "
+ )
+
+ else:
+ if "api_key" not in self._model_config or not self._model_config.get("api_key"):
+ msg = f"{type(self).__name__}: Requires an api_key in the supplied model_config."
+
+ if msg is None:
+ return
+
+ raise EvaluationException(
+ message=msg,
+ blame=ErrorBlame.USER_ERROR,
+ category=ErrorCategory.INVALID_VALUE,
+ target=ErrorTarget.AOAI_GRADER,
+ )

  def _validate_grader_config(self) -> None:
  """Validate the grader configuration that this grader wrapper is using."""

  return

+ @staticmethod
+ def _is_azure_model_config(
+ model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
+ ) -> TypeIs[AzureOpenAIModelConfiguration]:
+ return "azure_endpoint" in model_config
+
  def get_client(self) -> Any:
  """Construct an appropriate OpenAI client using this grader's model configuration.
  Returns a slightly different client depending on whether or not this grader's model
@@ -77,23 +106,38 @@ class AzureOpenAIGrader:
  :rtype: [~openai.OpenAI, ~openai.AzureOpenAI]
  """
  default_headers = {"User-Agent": UserAgentSingleton().value}
- if "azure_endpoint" in self._model_config:
+ model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration] = self._model_config
+ api_key: Optional[str] = model_config.get("api_key")
+
+ if self._is_azure_model_config(model_config):
  from openai import AzureOpenAI

  # TODO set default values?
  return AzureOpenAI(
- azure_endpoint=self._model_config["azure_endpoint"],
- api_key=self._model_config.get("api_key", None), # Default-style access to appease linters.
+ azure_endpoint=model_config["azure_endpoint"],
+ api_key=api_key, # Default-style access to appease linters.
  api_version=DEFAULT_AOAI_API_VERSION, # Force a known working version
- azure_deployment=self._model_config.get("azure_deployment", ""),
+ azure_deployment=model_config.get("azure_deployment", ""),
+ azure_ad_token_provider=self._get_token_provider(self._credential) if not api_key else None,
  default_headers=default_headers,
  )
  from openai import OpenAI

  # TODO add default values for base_url and organization?
  return OpenAI(
- api_key=self._model_config["api_key"],
- base_url=self._model_config.get("base_url", ""),
- organization=self._model_config.get("organization", ""),
+ api_key=api_key,
+ base_url=model_config.get("base_url", ""),
+ organization=model_config.get("organization", ""),
  default_headers=default_headers,
  )
+
+ @staticmethod
+ def _get_token_provider(cred: TokenCredential) -> "AzureADTokenProvider":
+ """Get the token provider the AzureOpenAI client.
+
+ :param TokenCredential cred: The Azure authentication credential.
+ :return: The token provider if a credential is provided, otherwise None.
+ :rtype: openai.lib.azure.AzureADTokenProvider
+ """
+
+ return lambda: cred.get_token(TokenScope.COGNITIVE_SERVICES_MANAGEMENT).token
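
The change above lets AzureOpenAIGrader authenticate to an Azure OpenAI resource with a Microsoft Entra ID credential instead of an api_key: when the Azure-style model configuration carries no key, the client is built with an azure_ad_token_provider derived from the supplied TokenCredential. A minimal usage sketch, assuming azure-identity is installed, the grader classes are exported from the package's top-level namespace, and the endpoint/deployment values are placeholders:

from azure.identity import DefaultAzureCredential

from azure.ai.evaluation import AzureOpenAIGrader, AzureOpenAIModelConfiguration

# Keyless model configuration: no api_key, so the grader falls back to the credential.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
    azure_deployment="<your-deployment>",                       # placeholder
)

# grader_config follows OpenAI's TestingCriterion shape; a string_check criterion is shown.
grader = AzureOpenAIGrader(
    model_config=model_config,
    grader_config={
        "type": "string_check",
        "name": "exact_match",
        "input": "{{item.response}}",
        "operation": "eq",
        "reference": "{{item.ground_truth}}",
    },
    credential=DefaultAzureCredential(),  # used only because api_key is absent
)

client = grader.get_client()  # AzureOpenAI client wired to the Entra ID token provider
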

azure/ai/evaluation/_aoai/label_grader.py

@@ -1,11 +1,13 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
- from typing import Any, Dict, Union, List
+ from typing import Any, Dict, List, Optional, Union

- from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
  from openai.types.graders import LabelModelGrader
+
  from azure.ai.evaluation._common._experimental import experimental
+ from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+ from azure.core.credentials import TokenCredential

  from .aoai_grader import AzureOpenAIGrader

@@ -37,6 +39,8 @@ class AzureOpenAILabelGrader(AzureOpenAIGrader):
  :type name: str
  :param passing_labels: The labels that indicate a passing result. Must be a subset of labels.
  :type passing_labels: List[str]
+ :param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
+ :type credential: ~azure.core.credentials.TokenCredential
  :param kwargs: Additional keyword arguments to pass to the grader.
  :type kwargs: Any

@@ -54,6 +58,7 @@ class AzureOpenAILabelGrader(AzureOpenAIGrader):
  model: str,
  name: str,
  passing_labels: List[str],
+ credential: Optional[TokenCredential] = None,
  **kwargs: Any
  ):
  grader = LabelModelGrader(
@@ -64,4 +69,4 @@ class AzureOpenAILabelGrader(AzureOpenAIGrader):
  passing_labels=passing_labels,
  type="label_model",
  )
- super().__init__(model_config=model_config, grader_config=grader, **kwargs)
+ super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
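
The label grader itself only forwards the new credential argument to the base class. A short sketch of the updated constructor surface (the input/labels parameters shown here reflect the class's documented parameters; prompt text, labels, and deployment names are illustrative):

from azure.identity import DefaultAzureCredential

from azure.ai.evaluation import AzureOpenAILabelGrader

label_grader = AzureOpenAILabelGrader(
    model_config={
        "azure_endpoint": "https://<your-resource>.openai.azure.com",  # placeholder
        "azure_deployment": "<your-deployment>",                       # placeholder
    },
    input=[{"role": "user", "content": "Is this response helpful? {{item.response}}"}],
    labels=["helpful", "unhelpful"],
    passing_labels=["helpful"],
    model="<grader-model>",   # model used by the label grader; placeholder
    name="helpfulness_label",
    credential=DefaultAzureCredential(),  # used when model_config has no api_key
)
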

azure/ai/evaluation/_aoai/python_grader.py

@@ -1,11 +1,13 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
- from typing import Any, Dict, Union, Optional
+ from typing import Any, Dict, Optional, Union

- from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
  from openai.types.graders import PythonGrader
+
  from azure.ai.evaluation._common._experimental import experimental
+ from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+ from azure.core.credentials import TokenCredential

  from .aoai_grader import AzureOpenAIGrader

@@ -39,6 +41,8 @@ class AzureOpenAIPythonGrader(AzureOpenAIGrader):
  :param source: Python source code containing the grade function.
  Must define: def grade(sample: dict, item: dict) -> float
  :type source: str
+ :param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
+ :type credential: ~azure.core.credentials.TokenCredential
  :param kwargs: Additional keyword arguments to pass to the grader.
  :type kwargs: Any

@@ -63,6 +67,7 @@ class AzureOpenAIPythonGrader(AzureOpenAIGrader):
  image_tag: str,
  pass_threshold: float,
  source: str,
+ credential: Optional[TokenCredential] = None,
  **kwargs: Any,
  ):
  # Validate pass_threshold
@@ -81,4 +86,4 @@ class AzureOpenAIPythonGrader(AzureOpenAIGrader):
  type="python",
  )

- super().__init__(model_config=model_config, grader_config=grader, **kwargs)
+ super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
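
For the Python grader, the docstring above notes that source must define def grade(sample: dict, item: dict) -> float. A hedged sketch (the item keys and the image_tag value are illustrative):

from azure.identity import DefaultAzureCredential

from azure.ai.evaluation import AzureOpenAIPythonGrader

# Grading code is shipped to the service as a string and must define grade(sample, item) -> float.
GRADE_SOURCE = """
def grade(sample: dict, item: dict) -> float:
    # 'response' and 'ground_truth' are illustrative keys on the row under evaluation.
    return 1.0 if item.get("response") == item.get("ground_truth") else 0.0
"""

python_grader = AzureOpenAIPythonGrader(
    model_config={
        "azure_endpoint": "https://<your-resource>.openai.azure.com",  # placeholder
        "azure_deployment": "<your-deployment>",                       # placeholder
    },
    name="exact_match_python",
    image_tag="2025-05-08",   # illustrative; use a tag supported by the service
    pass_threshold=0.5,
    source=GRADE_SOURCE,
    credential=DefaultAzureCredential(),
)
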

azure/ai/evaluation/_aoai/score_model_grader.py

@@ -1,11 +1,13 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
- from typing import Any, Dict, Union, List, Optional
+ from typing import Any, Dict, List, Optional, Union

- from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
  from openai.types.graders import ScoreModelGrader
+
  from azure.ai.evaluation._common._experimental import experimental
+ from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+ from azure.core.credentials import TokenCredential

  from .aoai_grader import AzureOpenAIGrader

@@ -43,6 +45,8 @@ class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):
  :type pass_threshold: Optional[float]
  :param sampling_params: The sampling parameters for the model.
  :type sampling_params: Optional[Dict[str, Any]]
+ :param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
+ :type credential: ~azure.core.credentials.TokenCredential
  :param kwargs: Additional keyword arguments to pass to the grader.
  :type kwargs: Any
  """
@@ -59,6 +63,7 @@ class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):
  range: Optional[List[float]] = None,
  pass_threshold: Optional[float] = None,
  sampling_params: Optional[Dict[str, Any]] = None,
+ credential: Optional[TokenCredential] = None,
  **kwargs: Any,
  ):
  # Validate range and pass_threshold
@@ -88,4 +93,4 @@ class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):

  grader = ScoreModelGrader(**grader_kwargs)

- super().__init__(model_config=model_config, grader_config=grader, **kwargs)
+ super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
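
The score-model grader follows the same pattern, scoring on a continuous range with an optional pass_threshold. A brief sketch (the prompt, range, and threshold values are illustrative, and the input/model parameters reflect the class's documented surface):

from azure.identity import DefaultAzureCredential

from azure.ai.evaluation import AzureOpenAIScoreModelGrader

score_grader = AzureOpenAIScoreModelGrader(
    model_config={
        "azure_endpoint": "https://<your-resource>.openai.azure.com",  # placeholder
        "azure_deployment": "<your-deployment>",                       # placeholder
    },
    input=[{"role": "user", "content": "Rate this answer from 0 to 5: {{item.response}}"}],
    model="<grader-model>",   # placeholder
    name="quality_score",
    range=[0.0, 5.0],
    pass_threshold=3.0,
    credential=DefaultAzureCredential(),
)
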

azure/ai/evaluation/_aoai/string_check_grader.py

@@ -1,12 +1,14 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
- from typing import Any, Dict, Union
- from typing_extensions import Literal
+ from typing import Any, Dict, Optional, Union

- from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
  from openai.types.graders import StringCheckGrader
+ from typing_extensions import Literal
+
  from azure.ai.evaluation._common._experimental import experimental
+ from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+ from azure.core.credentials import TokenCredential

  from .aoai_grader import AzureOpenAIGrader

@@ -33,6 +35,8 @@ class AzureOpenAIStringCheckGrader(AzureOpenAIGrader):
  :type operation: Literal["eq", "ne", "like", "ilike"]
  :param reference: The reference text. This may include template strings.
  :type reference: str
+ :param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
+ :type credential: ~azure.core.credentials.TokenCredential
  :param kwargs: Additional keyword arguments to pass to the grader.
  :type kwargs: Any

@@ -54,6 +58,7 @@ class AzureOpenAIStringCheckGrader(AzureOpenAIGrader):
  "ilike",
  ],
  reference: str,
+ credential: Optional[TokenCredential] = None,
  **kwargs: Any
  ):
  grader = StringCheckGrader(
@@ -63,4 +68,4 @@ class AzureOpenAIStringCheckGrader(AzureOpenAIGrader):
  reference=reference,
  type="string_check",
  )
- super().__init__(model_config=model_config, grader_config=grader, **kwargs)
+ super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
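
The string-check grader compares a templated input against a reference using one of "eq", "ne", "like", "ilike", as the docstring above describes. A compact sketch (template fields are illustrative):

from azure.identity import DefaultAzureCredential

from azure.ai.evaluation import AzureOpenAIStringCheckGrader

string_check = AzureOpenAIStringCheckGrader(
    model_config={
        "azure_endpoint": "https://<your-resource>.openai.azure.com",  # placeholder
        "azure_deployment": "<your-deployment>",                       # placeholder
    },
    input="{{item.response}}",           # resolved per row
    name="exact_match_check",
    operation="eq",                      # one of "eq", "ne", "like", "ilike"
    reference="{{item.ground_truth}}",   # reference may also be a template string
    credential=DefaultAzureCredential(),
)
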

azure/ai/evaluation/_aoai/text_similarity_grader.py

@@ -1,12 +1,14 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
- from typing import Any, Dict, Union
- from typing_extensions import Literal
+ from typing import Any, Dict, Optional, Union

- from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
  from openai.types.graders import TextSimilarityGrader
+ from typing_extensions import Literal
+
  from azure.ai.evaluation._common._experimental import experimental
+ from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+ from azure.core.credentials import TokenCredential

  from .aoai_grader import AzureOpenAIGrader

@@ -47,6 +49,8 @@ class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader):
  :type reference: str
  :param name: The name of the grader.
  :type name: str
+ :param credential: The credential to use to authenticate to the model. Only applicable to AzureOpenAI models.
+ :type credential: ~azure.core.credentials.TokenCredential
  :param kwargs: Additional keyword arguments to pass to the grader.
  :type kwargs: Any

@@ -76,6 +80,7 @@ class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader):
  pass_threshold: float,
  reference: str,
  name: str,
+ credential: Optional[TokenCredential] = None,
  **kwargs: Any
  ):
  grader = TextSimilarityGrader(
@@ -86,4 +91,4 @@ class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader):
  reference=reference,
  type="text_similarity",
  )
- super().__init__(model_config=model_config, grader_config=grader, **kwargs)
+ super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
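
And the text-similarity grader, which passes when the chosen similarity metric meets pass_threshold. A compact sketch (the metric name, threshold, and template fields are illustrative):

from azure.identity import DefaultAzureCredential

from azure.ai.evaluation import AzureOpenAITextSimilarityGrader

similarity_grader = AzureOpenAITextSimilarityGrader(
    model_config={
        "azure_endpoint": "https://<your-resource>.openai.azure.com",  # placeholder
        "azure_deployment": "<your-deployment>",                       # placeholder
    },
    evaluation_metric="fuzzy_match",     # illustrative metric name
    input="{{item.response}}",
    pass_threshold=0.6,
    reference="{{item.ground_truth}}",
    name="similarity_check",
    credential=DefaultAzureCredential(),
)
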

azure/ai/evaluation/_eval_mapping.py

@@ -11,6 +11,7 @@

  # Import all evals
  from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator
+ from azure.ai.evaluation._evaluators._task_success import TaskSuccessEvaluator
  from azure.ai.evaluation import (
  BleuScoreEvaluator,
  CodeVulnerabilityEvaluator,
@@ -67,6 +68,7 @@ EVAL_CLASS_MAP = {
  SexualEvaluator: "sexual",
  SimilarityEvaluator: "similarity",
  TaskAdherenceEvaluator: "task_adherence",
+ TaskSuccessEvaluator: "task_success",
  ToolCallAccuracyEvaluator: "tool_call_accuracy",
  UngroundedAttributesEvaluator: "ungrounded_attributes",
  ViolenceEvaluator: "violence",

azure/ai/evaluation/_evaluate/_evaluate.py

@@ -9,7 +9,7 @@ import os
  import re
  import tempfile
  import json
- from typing import Any, Callable, Dict, List, Literal, Optional, Set, Tuple, TypedDict, Union, cast
+ from typing import Any, Callable, Dict, Iterable, Iterator, List, Literal, Optional, Set, Tuple, TypedDict, Union, cast

  from openai import OpenAI, AzureOpenAI
  from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
@@ -18,6 +18,7 @@ import pandas as pd

  from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
  from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
+ from azure.ai.evaluation._evaluators._common._base_eval import EvaluatorBase
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

  from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader
@@ -317,6 +318,9 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
  # For rest of metrics, we will calculate mean
  df.drop(columns=handled_columns, inplace=True)

+ # Convert "not applicable" strings to None to allow proper numeric aggregation
+ df = df.replace(EvaluatorBase._NOT_APPLICABLE_RESULT, None)
+
  # NOTE: nan/None values don't count as as booleans, so boolean columns with
  # nan/None values won't have a mean produced from them.
  # This is different from label-based known evaluators, which have special handling.
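
The new replace call above keeps evaluator rows that returned the "not applicable" sentinel from blocking numeric aggregation. A self-contained pandas sketch of the effect (the sentinel literal is assumed here; in the SDK it comes from EvaluatorBase._NOT_APPLICABLE_RESULT, and the dict form of replace is used for clarity):

import pandas as pd

NOT_APPLICABLE = "not applicable"  # assumed value of EvaluatorBase._NOT_APPLICABLE_RESULT

# One evaluator emitted the sentinel instead of a number for the second row.
df = pd.DataFrame({"task_success.score": [4.0, NOT_APPLICABLE, 5.0]})

# Swap the sentinel for a missing value so the column can be treated as numeric again.
cleaned = df.replace({NOT_APPLICABLE: None})

scores = pd.to_numeric(cleaned["task_success.score"])
print(scores.mean())  # 4.5 -- the "not applicable" row is simply left out of the mean
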
@@ -1131,11 +1135,36 @@ def _preprocess_data(
  # via target mapping.
  # If both the data and the output dictionary of the target function
  # have the same column, then the target function value is used.
+ # NEW: flatten nested object columns (e.g., 'item') so we can map leaf values automatically.
+ # Ensure the data does not contain top-level 'conversation' or 'messages' columns (which indicate chat/conversation data)
+ if input_data_df is not None:
+ if "conversation" in input_data_df.columns or "messages" in input_data_df.columns:
+ # No action is taken when 'conversation' or 'messages' columns are present,
+ # as these indicate chat/conversation data which should not be flattened or mapped by default.
+ pass
+ else:
+ input_data_df = _flatten_object_columns_for_default_mapping(input_data_df)
+
+ # Build default mapping for leaves:
  if input_data_df is not None:
+ # First, map flattened nested columns (those containing a dot) to leaf names.
+ for col in input_data_df.columns:
+ # Skip target output columns
+ if col.startswith(Prefixes.TSG_OUTPUTS):
+ continue
+ # Skip root container columns (no dot) here; they'll be handled below if truly primitive.
+ if "." in col:
+ leaf_name = col.split(".")[-1]
+ if leaf_name not in column_mapping["default"]:
+ column_mapping["default"][leaf_name] = f"${{data.{col}}}"
+
+ # Then, handle remaining top-level primitive columns (original logic).
  for col in input_data_df.columns:
- # Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
- # Also ignore columns that are already in config, since they've been covered by target mapping.
- if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
+ if (
+ not col.startswith(Prefixes.TSG_OUTPUTS)
+ and col not in column_mapping["default"].keys()
+ and "." not in col # only pure top-level primitives
+ ):
  column_mapping["default"][col] = f"${{data.{col}}}"

  return __ValidatedData(
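
With the flattening step in place, leaf values inside a nested object column such as 'item' are exposed as dotted columns and mapped by their leaf name, while rows with 'conversation' or 'messages' columns are left alone. A small sketch that mirrors the leaf-name mapping loop above (column names and values are illustrative):

import pandas as pd

# One non-chat row whose 'item' column holds a nested object.
df = pd.DataFrame([{"item": {"query": "What is 2+2?", "context": {"source": "math.txt"}}}])

# Columns the flattening step would add for this row.
df["item.query"] = df["item"].map(lambda v: v.get("query"))
df["item.context.source"] = df["item"].map(lambda v: v["context"].get("source"))

# Leaf-name default mapping, mirroring the loop in _preprocess_data.
column_mapping_default = {}
for col in df.columns:
    if "." in col:
        leaf = col.split(".")[-1]
        column_mapping_default.setdefault(leaf, f"${{data.{col}}}")

print(column_mapping_default)
# {'query': '${data.item.query}', 'source': '${data.item.context.source}'}
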
@@ -1149,6 +1178,79 @@ def _preprocess_data(
  )


+ def _flatten_object_columns_for_default_mapping(
+ df: pd.DataFrame, root_prefixes: Optional[Iterable[str]] = None
+ ) -> pd.DataFrame:
+ """Flatten nested dictionary-valued columns into dotted leaf columns.
+
+ For any column whose cells (in at least one row) are ``dict`` objects, this utility discovers all
+ leaf paths (recursively descending only through ``dict`` nodes) and materializes new DataFrame
+ columns named ``"<original_col>.<nested.path.leaf>"`` for every unique leaf encountered across
+ all rows. A *leaf* is defined as any value that is **not** a ``dict`` (lists / primitives / ``None``
+ are all treated as leaves). Existing columns are never overwritten (idempotent behavior).
+
+ Example
+ If a column ``item`` contains objects like ``{"a": {"b": 1, "c": 2}}`` a pair of new
+ columns ``item.a.b`` and ``item.a.c`` will be added with the corresponding scalar values.
+
+ :param df: Input DataFrame to flatten in place.
+ :type df: ~pandas.DataFrame
+ :param root_prefixes: Optional iterable restricting which top-level columns are considered
+ for flattening. If ``None``, all columns containing at least one ``dict`` value are processed.
+ :type root_prefixes: Optional[Iterable[str]]
+ :return: The same DataFrame instance (returned for convenient chaining).
+ :rtype: ~pandas.DataFrame
+ """
+ candidate_cols = []
+ if root_prefixes is not None:
+ candidate_cols = [c for c in root_prefixes if c in df.columns]
+ else:
+ # pick columns where at least one non-null value is a dict
+ for c in df.columns:
+ series = df[c]
+ if series.map(lambda v: isinstance(v, dict)).any():
+ candidate_cols.append(c)
+
+ def _extract_leaves(obj: Any, prefix: str) -> Iterator[Tuple[str, Any]]:
+ if isinstance(obj, dict):
+ for k, v in obj.items():
+ new_prefix = f"{prefix}.{k}" if prefix else k
+ if isinstance(v, dict):
+ yield from _extract_leaves(v, new_prefix)
+ else:
+ # treat list / primitive / None as leaf
+ yield new_prefix, v
+
+ for root_col in candidate_cols:
+ # Build a union of leaf paths across rows to ensure consistent columns
+ leaf_paths: Set[str] = set()
+ for val in df[root_col]:
+ if isinstance(val, dict):
+ for path, _ in _extract_leaves(val, root_col):
+ leaf_paths.add(path)
+
+ if not leaf_paths:
+ continue
+
+ # Create each flattened column if absent
+ for path in leaf_paths:
+ if path in df.columns:
+ continue # already present
+ relative_keys = path[len(root_col) + 1 :].split(".") if len(path) > len(root_col) else []
+
+ def getter(root_val: Any) -> Any:
+ cur = root_val
+ for rk in relative_keys:
+ if not isinstance(cur, dict):
+ return None
+ cur = cur.get(rk, None)
+ return cur
+
+ df[path] = df[root_col].map(lambda rv: getter(rv) if isinstance(rv, dict) else None)
+
+ return df
+
+
  def _run_callable_evaluators(
  validated_data: __ValidatedData,
  fail_on_evaluator_errors: bool = False,
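
To make the new helper's docstring example concrete, here is a standalone sketch of the same leaf-flattening idea on a toy DataFrame (an illustration of the behavior, not the SDK function itself):

from typing import Any, Dict, Iterator, Tuple

import pandas as pd


def leaves(obj: Dict[str, Any], prefix: str) -> Iterator[Tuple[str, Any]]:
    # Recurse only through dicts; lists, primitives, and None count as leaves.
    for key, value in obj.items():
        path = f"{prefix}.{key}"
        if isinstance(value, dict):
            yield from leaves(value, path)
        else:
            yield path, value


def dig(cell: Any, keys: Tuple[str, ...]) -> Any:
    # Walk the nested dict; missing keys or non-dict nodes yield None.
    for key in keys:
        if not isinstance(cell, dict):
            return None
        cell = cell.get(key)
    return cell


df = pd.DataFrame([
    {"item": {"a": {"b": 1, "c": 2}}},
    {"item": {"a": {"b": 3}}},  # 'c' is missing here, so its flattened cell becomes a missing value
])

# Union of leaf paths across rows, then one new column per path.
paths = {path for cell in df["item"] for path, _ in leaves(cell, "item")}
for path in sorted(paths):
    keys = tuple(path.split(".")[1:])
    df[path] = df["item"].map(lambda cell, keys=keys: dig(cell, keys))

print(df[["item.a.b", "item.a.c"]])
# item.a.b -> 1, 3 and item.a.c -> 2, NaN (the missing leaf shows up as a missing value)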