azure-ai-evaluation 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as published to its public registry, and is provided for informational purposes only.


Files changed (31)
  1. azure/ai/evaluation/_azure/_clients.py +24 -8
  2. azure/ai/evaluation/_azure/_models.py +2 -2
  3. azure/ai/evaluation/_constants.py +18 -0
  4. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +2 -1
  5. azure/ai/evaluation/_evaluate/_eval_run.py +3 -1
  6. azure/ai/evaluation/_evaluate/_evaluate.py +69 -12
  7. azure/ai/evaluation/_evaluate/_utils.py +27 -0
  8. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +46 -25
  9. azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
  10. azure/ai/evaluation/_evaluators/_common/_base_eval.py +69 -4
  11. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +61 -0
  12. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +7 -1
  13. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  14. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +5 -42
  15. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +2 -0
  16. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +2 -0
  17. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +2 -0
  18. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +2 -0
  19. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +61 -68
  20. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +45 -23
  21. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +55 -34
  22. azure/ai/evaluation/_evaluators/_qa/_qa.py +32 -27
  23. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +44 -23
  24. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +41 -81
  25. azure/ai/evaluation/_version.py +1 -1
  26. azure/ai/evaluation/simulator/_simulator.py +21 -13
  27. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.2.0.dist-info}/METADATA +71 -7
  28. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.2.0.dist-info}/RECORD +31 -29
  29. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.2.0.dist-info}/NOTICE.txt +0 -0
  30. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.2.0.dist-info}/WHEEL +0 -0
  31. {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.2.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_azure/_clients.py
@@ -17,7 +17,7 @@ from azure.ai.evaluation.simulator._model_tools._identity_manager import TokenSc
 from ._models import BlobStoreInfo, Workspace


-API_VERSION: Final[str] = "2024-10-01"
+API_VERSION: Final[str] = "2024-07-01-preview"
 QUERY_KEY_API_VERSION: Final[str] = "api-version"
 PATH_ML_WORKSPACES = ("providers", "Microsoft.MachineLearningServices", "workspaces")

@@ -69,7 +69,9 @@ class LiteMLClient:
         self._get_token_manager()
         return cast(TokenCredential, self._credential)

-    def workspace_get_default_datastore(self, workspace_name: str, include_credentials: bool = False) -> BlobStoreInfo:
+    def workspace_get_default_datastore(
+        self, workspace_name: str, *, include_credentials: bool = False, **kwargs: Any
+    ) -> BlobStoreInfo:
         # 1. Get the default blob store
         # REST API documentation:
         # https://learn.microsoft.com/rest/api/azureml/datastores/list?view=rest-azureml-2024-10-01
@@ -92,18 +94,29 @@ class LiteMLClient:
         account_name = props_json["accountName"]
         endpoint = props_json["endpoint"]
         container_name = props_json["containerName"]
+        credential_type = props_json.get("credentials", {}).get("credentialsType")

         # 2. Get the SAS token to use for accessing the blob store
         # REST API documentation:
         # https://learn.microsoft.com/rest/api/azureml/datastores/list-secrets?view=rest-azureml-2024-10-01
-        blob_store_credential: Optional[Union[AzureSasCredential, str]] = None
-        if include_credentials:
+        blob_store_credential: Optional[Union[AzureSasCredential, TokenCredential, str]]
+        if not include_credentials:
+            blob_store_credential = None
+        elif credential_type and credential_type.lower() == "none":
+            # If storage account key access is disabled, and only Microsoft Entra ID authentication is available,
+            # the credentialsType will be "None" and we should not attempt to get the secrets.
+            blob_store_credential = self.get_credential()
+        else:
             url = self._generate_path(
                 *PATH_ML_WORKSPACES, workspace_name, "datastores", "workspaceblobstore", "listSecrets"
             )
             secrets_response = self._http_client.request(
                 method="POST",
                 url=url,
+                json={
+                    "expirableSecret": True,
+                    "expireAfterHours": int(kwargs.get("key_expiration_hours", 1)),
+                },
                 params={
                     QUERY_KEY_API_VERSION: self._api_version,
                 },
@@ -114,10 +127,13 @@ class LiteMLClient:
         secrets_json = secrets_response.json()
         secrets_type = secrets_json["secretsType"].lower()

+        # As per this website, only SAS tokens, access tokens, or Entra IDs are valid for accessing blob data
+        # stores:
+        # https://learn.microsoft.com/rest/api/storageservices/authorize-requests-to-azure-storage.
         if secrets_type == "sas":
             blob_store_credential = AzureSasCredential(secrets_json["sasToken"])
         elif secrets_type == "accountkey":
-            # To support olders versions of azure-storage-blob better, we return a string here instead of
+            # To support older versions of azure-storage-blob better, we return a string here instead of
             # an AzureNamedKeyCredential
             blob_store_credential = secrets_json["key"]
         else:
@@ -164,19 +180,19 @@ class LiteMLClient:
             # nothing to see here, move along
             return

-        additional_info: Optional[str] = None
+        message = f"The {description} request failed with HTTP {response.status_code}"
         try:
             error_json = response.json()["error"]
             additional_info = f"({error_json['code']}) {error_json['message']}"
+            message += f" - {additional_info}"
         except (JSONDecodeError, ValueError, KeyError):
             pass

         raise EvaluationException(
-            message=f"The {description} request failed with HTTP {response.status_code}",
+            message=message,
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.FAILED_EXECUTION,
             blame=ErrorBlame.SYSTEM_ERROR,
-            internal_message=additional_info,
         )

     def _generate_path(self, *paths: str) -> str:
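
The datastore helper now takes include_credentials as a keyword-only argument and forwards key_expiration_hours through **kwargs into the listSecrets request body. A minimal sketch of the new call shape, assuming an already-constructed LiteMLClient instance (LiteMLClient is an internal helper, so its construction is omitted here; the wrapper function name is illustrative):

from typing import Optional, Union

from azure.core.credentials import AzureSasCredential, TokenCredential


def fetch_default_store_credential(client, workspace_name: str):
    """Sketch only: `client` is assumed to be an existing LiteMLClient instance."""
    store = client.workspace_get_default_datastore(
        workspace_name,
        include_credentials=True,   # keyword-only as of this release
        key_expiration_hours=2,     # optional; forwarded via **kwargs into the listSecrets body
    )
    # The credential may now also be a TokenCredential when the storage account only permits Entra ID auth.
    credential: Optional[Union[AzureSasCredential, TokenCredential, str]] = store.credential
    return credential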

azure/ai/evaluation/_azure/_models.py
@@ -8,7 +8,7 @@

 from typing import Dict, List, NamedTuple, Optional, Union
 from msrest.serialization import Model
-from azure.core.credentials import AzureSasCredential
+from azure.core.credentials import AzureSasCredential, TokenCredential


 class BlobStoreInfo(NamedTuple):
@@ -16,7 +16,7 @@ class BlobStoreInfo(NamedTuple):
     account_name: str
     endpoint: str
     container_name: str
-    credential: Optional[Union[AzureSasCredential, str]]
+    credential: Optional[Union[AzureSasCredential, TokenCredential, str]]


 class WorkspaceHubConfig(Model):

azure/ai/evaluation/_constants.py
@@ -1,7 +1,9 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+import enum
 from typing import Literal
+from azure.ai.evaluation._common._experimental import experimental


 class EvaluationMetrics:
@@ -57,6 +59,22 @@ class EvaluationRunProperties:
     EVALUATION_SDK = "_azureml.evaluation_sdk_name"


+@experimental
+class _AggregationType(enum.Enum):
+    """Defines how numeric evaluation results should be aggregated
+    to produce a single value. Used by individual evaluators to combine per-turn results for
+    a conversation-based input. In general, wherever this enum is used, it is also possible
+    to directly assign the underlying aggregation function for more complex use cases.
+    The 'custom' value is generally not an acceptable input, and should only be used as an output
+    to indicate that a custom aggregation function has been injected."""
+
+    MEAN = "mean"
+    MAX = "max"
+    MIN = "min"
+    SUM = "sum"
+    CUSTOM = "custom"
+
+
 DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"

 CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
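
The new enum is consumed by the also-new _conversation_aggregators helper (listed in the files changed above but not shown in this section) through the GetAggregator/GetAggregatorType functions imported in _base_eval.py further down. A hedged sketch of the mapping it implies, assuming GetAggregator simply resolves each member to a built-in reducer; the real module may differ, and _ASSUMED_AGGREGATORS/get_aggregator_sketch are illustrative names:

from statistics import mean
from typing import Callable, Dict, List

from azure.ai.evaluation._constants import _AggregationType

# Assumed mapping; CUSTOM gets no stock function because it only labels an injected aggregator.
_ASSUMED_AGGREGATORS: Dict[_AggregationType, Callable[[List[float]], float]] = {
    _AggregationType.MEAN: mean,
    _AggregationType.MAX: max,
    _AggregationType.MIN: min,
    _AggregationType.SUM: sum,
}


def get_aggregator_sketch(kind: _AggregationType) -> Callable[[List[float]], float]:
    """Hypothetical stand-in for GetAggregator from the new _conversation_aggregators module."""
    return _ASSUMED_AGGREGATORS[kind]


print(get_aggregator_sketch(_AggregationType.MAX)([0.2, 0.9, 0.5]))  # prints 0.9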

azure/ai/evaluation/_evaluate/_batch_run/__init__.py
@@ -5,5 +5,6 @@ from .eval_run_context import EvalRunContext
 from .code_client import CodeClient
 from .proxy_client import ProxyClient
 from .target_run_context import TargetRunContext
+from .proxy_client import ProxyRun

-__all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext"]
+__all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext", "ProxyRun"]

azure/ai/evaluation/_evaluate/_eval_run.py
@@ -421,7 +421,9 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
             local_paths.append(local_file_path)

         # We will write the artifacts to the workspaceblobstore
-        datastore = self._management_client.workspace_get_default_datastore(self._workspace_name, True)
+        datastore = self._management_client.workspace_get_default_datastore(
+            self._workspace_name, include_credentials=True
+        )
         account_url = f"{datastore.account_name}.blob.{datastore.endpoint}"

         svc_client = BlobServiceClient(account_url=account_url, credential=datastore.credential)

azure/ai/evaluation/_evaluate/_evaluate.py
@@ -12,6 +12,7 @@ import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
 from promptflow.client import PFClient
 from promptflow.entities import Run
+from promptflow._sdk._configuration import Configuration

 from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
@@ -20,17 +21,19 @@ from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarg
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
+    DefaultOpenEncoding,
     Prefixes,
     _InternalEvaluationMetrics,
 )
 from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
 from .._user_agent import USER_AGENT
-from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext
+from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext, ProxyRun
 from ._utils import (
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
+    DataLoaderFactory,
 )

 TClient = TypeVar("TClient", ProxyClient, CodeClient)
@@ -429,10 +432,11 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
         )

     try:
-        initial_data_df = pd.read_json(data, lines=True)
+        data_loader = DataLoaderFactory.get_loader(data)
+        initial_data_df = data_loader.load()
     except Exception as e:
         raise EvaluationException(
-            message=f"Unable to load data from '{data}'. Please ensure the input is valid JSONL format. Detailed error: {e}.",
+            message=f"Unable to load data from '{data}'. Supported formats are JSONL and CSV. Detailed error: {e}.",
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
@@ -444,7 +448,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
 def _apply_target_to_data(
     target: Callable,
     data: Union[str, os.PathLike],
-    pf_client: PFClient,
+    batch_client: TClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
     **kwargs,
@@ -454,10 +458,10 @@ def _apply_target_to_data(

     :param target: The function to be applied to data.
     :type target: Callable
-    :param data: The path to input jsonl file.
+    :param data: The path to input jsonl or csv file.
     :type data: Union[str, os.PathLike]
-    :param pf_client: The promptflow client to be used.
-    :type pf_client: PFClient
+    :param batch_client: The promptflow client to be used.
+    :type batch_client: PFClient
     :param initial_data: The data frame with the loaded data.
     :type initial_data: pd.DataFrame
     :param evaluation_name: The name of the evaluation.
@@ -467,7 +471,7 @@ def _apply_target_to_data(
     """
     _run_name = kwargs.get("_run_name")
     with TargetRunContext():
-        run: Run = pf_client.run(
+        run: ProxyRun = batch_client.run(
             flow=target,
             display_name=evaluation_name,
             data=data,
@@ -475,7 +479,18 @@ def _apply_target_to_data(
             name=_run_name,
         )

-    target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
+    target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
+    run_summary = batch_client.get_run_summary(run)
+
+    if run_summary["completed_lines"] == 0:
+        msg = (f"Evaluation target failed to produce any results."
+               f" Please check the logs at {run_summary['log_path']} for more details about cause of failure.")
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.FAILED_EXECUTION,
+            blame=ErrorBlame.USER_ERROR,
+        )
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
@@ -494,7 +509,7 @@ def _apply_target_to_data(
     # Concatenate output to input
     target_output = pd.concat([target_output, initial_data], axis=1)

-    return target_output, generated_columns, run
+    return target_output, generated_columns, run.run.result()


 def _process_column_mappings(
@@ -569,13 +584,14 @@ def evaluate(
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
+    fail_on_evaluator_errors: bool = False,
     **kwargs,
 ) -> EvaluationResult:
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
     data will be run through target function and then results will be evaluated.

     :keyword data: Path to the data to be evaluated or passed to target if target is set.
-        Only .jsonl format files are supported. `target` and `data` both cannot be None. Required.
+        JSONL and CSV files are supported. `target` and `data` both cannot be None. Required.
     :paramtype data: str
     :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
         and value as the evaluator function. Required.
@@ -594,6 +610,11 @@ def evaluate(
     :paramtype output_path: Optional[str]
     :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
     :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
+    :keyword fail_on_evaluator_errors: Whether or not the evaluation should cancel early with an EvaluationException
+        if ANY evaluator fails during their evaluation.
+        Defaults to false, which means that evaluations will continue regardless of failures.
+        If such failures occur, metrics may be missing, and evidence of failures can be found in the evaluation's logs.
+    :paramtype fail_on_evaluator_errors: bool
     :return: Evaluation results.
     :rtype: ~azure.ai.evaluation.EvaluationResult

@@ -615,6 +636,7 @@ def evaluate(
             evaluator_config=evaluator_config,
             azure_ai_project=azure_ai_project,
             output_path=output_path,
+            fail_on_evaluator_errors=fail_on_evaluator_errors,
             **kwargs,
         )
     except Exception as e:
@@ -663,6 +685,16 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
     print("\n====================================================\n")


+def _print_fail_flag_warning() -> None:
+    print(
+        "Notice: fail_on_evaluator_errors is enabled. It is recommended that you disable "
+        + "this flag for evaluations on large datasets (loosely defined as more than 10 rows of inputs, "
+        + "or more than 4 evaluators). Using this flag on large datasets runs the risk of large runs failing "
+        + "without producing any outputs, since a single failure will cancel the entire run "
+        "when fail_on_evaluator_errors is enabled."
+    )
+
+
 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
     evaluators: Dict[str, Callable],
@@ -672,8 +704,11 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
+    fail_on_evaluator_errors: bool = False,
     **kwargs,
 ) -> EvaluationResult:
+    if fail_on_evaluator_errors:
+        _print_fail_flag_warning()
     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)

     # Process evaluator config to replace ${target.} with ${data.}
@@ -690,6 +725,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     if target is not None:
         _validate_columns_for_target(input_data_df, target)

+    Configuration.get_instance().set_config("trace.destination", "none")
     pf_client = PFClient(user_agent=USER_AGENT)
     target_run: Optional[Run] = None

@@ -702,7 +738,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     target_generated_columns: Set[str] = set()
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-            target, data, pf_client, input_data_df, evaluation_name, **kwargs
+            target, data, ProxyClient(pf_client), input_data_df, evaluation_name, **kwargs
         )

         for evaluator_name, mapping in column_mapping.items():
@@ -773,6 +809,10 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     evaluators_result_df = None
     evaluators_metric = {}
     for evaluator_name, evaluator_result in per_evaluator_results.items():
+        if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0:
+            _print_summary(per_evaluator_results)
+            _turn_error_logs_into_exception(evaluator_result["run_summary"]["log_path"] + "/error.json")
+
         evaluator_result_df = evaluator_result["result"]

         # drop input columns
@@ -825,3 +865,20 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         _write_output(output_path, result)

     return result
+
+
+def _turn_error_logs_into_exception(log_path: str) -> None:
+    """Produce an EvaluationException using the contents of the inputted
+    file as the error message.
+
+    :param log_path: The path to the error log file.
+    :type log_path: str
+    """
+    with open(log_path, "r", encoding=DefaultOpenEncoding.READ) as file:
+        error_message = file.read()
+    raise EvaluationException(
+        message=error_message,
+        target=ErrorTarget.EVALUATE,
+        category=ErrorCategory.FAILED_EXECUTION,
+        blame=ErrorBlame.UNKNOWN,
+    )
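
Taken together, the _evaluate.py changes add two user-visible behaviors: CSV input files are accepted alongside JSONL, and the new fail_on_evaluator_errors flag aborts the run if any evaluator reports failed lines. A minimal usage sketch against the public evaluate() API (file names and the evaluator alias are illustrative; the data file is assumed to contain the response and ground_truth columns the chosen evaluator needs):

from azure.ai.evaluation import BleuScoreEvaluator, evaluate

result = evaluate(
    data="eval_data.csv",  # CSV is now accepted in addition to JSONL
    evaluators={"bleu": BleuScoreEvaluator()},
    fail_on_evaluator_errors=True,  # new flag: raise EvaluationException if any evaluator reports failed lines
    output_path="evaluation_results.json",
)
print(result["metrics"])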

azure/ai/evaluation/_evaluate/_utils.py
@@ -328,3 +328,30 @@ def set_event_loop_policy() -> None:
     # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
     # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
     asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())  # type: ignore[attr-defined]
+
+
+class JSONLDataFileLoader:
+    def __init__(self, filename: Union[os.PathLike, str]):
+        self.filename = filename
+
+    def load(self) -> pd.DataFrame:
+        return pd.read_json(self.filename, lines=True)
+
+
+class CSVDataFileLoader:
+    def __init__(self, filename: Union[os.PathLike, str]):
+        self.filename = filename
+
+    def load(self) -> pd.DataFrame:
+        return pd.read_csv(self.filename)
+
+
+class DataLoaderFactory:
+    @staticmethod
+    def get_loader(filename: Union[os.PathLike, str]) -> Union[JSONLDataFileLoader, CSVDataFileLoader]:
+        filename_str = str(filename).lower()
+        if filename_str.endswith(".csv"):
+            return CSVDataFileLoader(filename)
+
+        # fallback to JSONL to maintain backward compatibility
+        return JSONLDataFileLoader(filename)
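
Loader selection is purely extension-based and falls back to JSONL for anything that is not a .csv file. For example (DataLoaderFactory is a private helper and the file names are illustrative):

from azure.ai.evaluation._evaluate._utils import DataLoaderFactory

csv_loader = DataLoaderFactory.get_loader("eval_data.csv")      # returns a CSVDataFileLoader
jsonl_loader = DataLoaderFactory.get_loader("eval_data.jsonl")  # returns a JSONLDataFileLoader (fallback)
df = csv_loader.load()  # loads the file into a pandas DataFrame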

azure/ai/evaluation/_evaluators/_bleu/_bleu.py
@@ -1,30 +1,16 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Dict
 from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
-from promptflow._utils.async_utils import async_run_allowing_running_loop
+from typing_extensions import overload, override

 from azure.ai.evaluation._common.utils import nltk_tokenize

+from azure.ai.evaluation._evaluators._common import EvaluatorBase

-class _AsyncBleuScoreEvaluator:
-    def __init__(self):
-        pass
-
-    async def __call__(self, *, response: str, ground_truth: str, **kwargs):
-        reference_tokens = nltk_tokenize(ground_truth)
-        hypothesis_tokens = nltk_tokenize(response)
-
-        # NIST Smoothing
-        smoothing_function = SmoothingFunction().method4
-        score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
-
-        return {
-            "bleu_score": score,
-        }

-
-class BleuScoreEvaluator:
+class BleuScoreEvaluator(EvaluatorBase):
     """
     Calculate the BLEU score for a given response and ground truth.

@@ -51,9 +37,32 @@ class BleuScoreEvaluator:
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     def __init__(self):
-        self._async_evaluator = _AsyncBleuScoreEvaluator()
+        super().__init__()
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a glue score evaluation result.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        reference_tokens = nltk_tokenize(ground_truth)
+        hypothesis_tokens = nltk_tokenize(response)

-    def __call__(self, *, response: str, ground_truth: str, **kwargs):
+        # NIST Smoothing
+        smoothing_function = SmoothingFunction().method4
+        score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
+
+        return {
+            "bleu_score": score,
+        }
+
+    @overload  # type: ignore
+    def __call__(self, *, response: str, ground_truth: str):
         """
         Evaluate the BLEU score between the response and the ground truth.

@@ -64,9 +73,21 @@ class BleuScoreEvaluator:
         :return: The BLEU score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
-        )

-    def _to_async(self):
-        return self._async_evaluator
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate the BLEU score between the response and the ground truth.
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The BLEU score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
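
The public calling convention is unchanged by the move onto EvaluatorBase: a keyword-only call with response and ground_truth still returns a dict with a bleu_score key. For example (the input strings and the printed value are illustrative):

from azure.ai.evaluation import BleuScoreEvaluator

bleu = BleuScoreEvaluator()
result = bleu(response="Tokyo is the capital of Japan.", ground_truth="The capital of Japan is Tokyo.")
print(result)  # e.g. {'bleu_score': 0.21...}; the exact value depends on tokenization and smoothing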

azure/ai/evaluation/_evaluators/_common/__init__.py
@@ -5,9 +5,11 @@
 from ._base_eval import EvaluatorBase
 from ._base_prompty_eval import PromptyEvaluatorBase
 from ._base_rai_svc_eval import RaiServiceEvaluatorBase
+from ._base_multi_eval import MultiEvaluatorBase

 __all__ = [
     "EvaluatorBase",
     "PromptyEvaluatorBase",
     "RaiServiceEvaluatorBase",
+    "MultiEvaluatorBase",
 ]

azure/ai/evaluation/_evaluators/_common/_base_eval.py
@@ -4,15 +4,18 @@

 import inspect
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final
+from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final, Optional

 from promptflow._utils.async_utils import async_run_allowing_running_loop
 from typing_extensions import ParamSpec, TypeAlias, get_overloads

-from azure.ai.evaluation._common.math import list_mean
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._common.utils import remove_optional_singletons
+from azure.ai.evaluation._constants import _AggregationType
 from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._common._experimental import experimental
+
+from ._conversation_aggregators import GetAggregator, GetAggregatorType

 P = ParamSpec("P")
 T = TypeVar("T")
@@ -25,6 +28,7 @@ class DerivedEvalInput(TypedDict, total=False):
     query: Dict[str, Any]
     response: Dict[str, Any]
     context: str
+    ground_truth: str


 AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]]
@@ -69,6 +73,13 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     :type not_singleton_inputs: List[str]
     :param eval_last_turn: If True, only the last turn of the conversation will be evaluated. Default is False.
     :type eval_last_turn: bool
+    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
+        to produce a single result.
+        Default is ~azure.ai.evaluation._AggregationType.MEAN.
+    :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
+    :param conversation_aggregator_override: A function that will be used to aggregate per-turn results. If provided,
+        overrides the standard aggregator implied by conversation_aggregation_type. None by default.
+    :type conversation_aggregator_override: Optional[Callable[[List[float]], float]]
     """

     # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~
@@ -80,11 +91,17 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         *,
         not_singleton_inputs: List[str] = ["conversation", "kwargs"],
         eval_last_turn: bool = False,
+        conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
+        conversation_aggregator_override: Optional[Callable[[List[float]], float]] = None,
     ):
         self._not_singleton_inputs = not_singleton_inputs
         self._eval_last_turn = eval_last_turn
         self._singleton_inputs = self._derive_singleton_inputs()
         self._async_evaluator = AsyncEvaluatorBase(self._real_call)
+        self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
+        if conversation_aggregator_override is not None:
+            # Type ignore since we already checked for None, but mypy doesn't know that.
+            self._conversation_aggregation_function = conversation_aggregator_override  # type: ignore[assignment]

     # This needs to be overridden just to change the function header into something more informative,
     # and to be able to add a more specific docstring. The actual function contents should just be
@@ -158,6 +175,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         include_context = "context" in self._singleton_inputs
         include_query = "query" in self._singleton_inputs
         include_response = "response" in self._singleton_inputs
+        include_ground_truth = "ground_truth" in self._singleton_inputs

         def converter(conversation: Dict) -> List[DerivedEvalInput]:
             messages = cast(List[Dict[str, Any]], conversation["messages"])
@@ -198,6 +216,8 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
                 eval_input["response"] = response.get("content", "")
             if include_context:
                 eval_input["context"] = str(context)
+            if include_ground_truth:
+                eval_input["ground_truth"] = response.get("ground_truth", "")
             eval_inputs.append(eval_input)
         return eval_inputs

@@ -355,7 +375,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         # Find and average all numeric values
         for metric, values in evaluation_per_turn.items():
             if all(isinstance(value, (int, float)) for value in values):
-                aggregated[metric] = list_mean(cast(List[Union[int, float]], values))
+                aggregated[metric] = self._conversation_aggregation_function(cast(List[Union[int, float]], values))
         # Slap the per-turn results back in.
         aggregated["evaluation_per_turn"] = evaluation_per_turn
         return aggregated
@@ -383,10 +403,51 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         # Otherwise, aggregate results.
         return self._aggregate_results(per_turn_results=per_turn_results)

+    # ~~~ METHODS THAT SHOULD NOT BE OVERRIDDEN BY CHILDREN~~~``
+
     @final
     def _to_async(self) -> "AsyncEvaluatorBase":
         return self._async_evaluator

+    @experimental
+    @final
+    def _set_conversation_aggregation_type(self, conversation_aggregation_type: _AggregationType) -> None:
+        """Input a conversation aggregation type to re-assign the aggregator function used by this evaluator for
+        multi-turn conversations. This aggregator is used to combine numeric outputs from each evaluation of a
+        multi-turn conversation into a single top-level result.
+
+        :param conversation_aggregation_type: The type of aggregation to perform on the per-turn
+            results of a conversation to produce a single result.
+        :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
+        """
+        self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
+
+    @experimental
+    @final
+    def _set_conversation_aggregator(self, aggregator: Callable[[List[float]], float]) -> None:
+        """Set the conversation aggregator function directly. This function will be applied to all numeric outputs
+        of an evaluator when it evaluates a conversation with multiple-turns thus ends up with multiple results per
+        evaluation that is needs to coalesce into a single result. Use when built-in aggregators do not
+        suit your needs, but use with caution.
+
+        :param aggregator: The function to use to aggregate per-turn results.
+        :type aggregator: Callable[[List[float]], float]
+        """
+        self._conversation_aggregation_function = aggregator
+
+    @experimental
+    @final
+    def _get_conversation_aggregator_type(self) -> _AggregationType:
+        """Get the current conversation aggregation type used by this evaluator. This refers to the
+        method used when a single input produces multiple evaluation results (ex: when a multi-turn conversation
+        is inputted into an evaluator that evaluates each turn individually). The individual inputs
+        are combined by the function implied here to produce a single overall result.
+
+        :return: The conversation aggregation type.
+        :rtype: ~azure.ai.evaluation._AggregationType
+        """
+        return GetAggregatorType(self._conversation_aggregation_function)
+

 class AsyncEvaluatorBase:
     """The asynchronous evaluator hidden underneath all evaluators. This makes generous use passing functions
@@ -402,7 +463,9 @@ class AsyncEvaluatorBase:
     # are just not passed into this function instead of ending up in kwargs.
     # Since we want this to be relatively call-agnostic, we just account for every input that any children
     # are known to throw at this, mash them into kwargs, and then pass them into the real call.
-    async def __call__(self, *, query=None, response=None, context=None, conversation=None, **kwargs):
+    async def __call__(
+        self, *, query=None, response=None, context=None, conversation=None, ground_truth=None, **kwargs
+    ):
         if conversation is not None:
             kwargs["conversation"] = conversation
         if query is not None:
@@ -411,4 +474,6 @@
             kwargs["response"] = response
         if context is not None:
             kwargs["context"] = context
+        if ground_truth is not None:
+            kwargs["ground_truth"] = ground_truth
         return await self._real_call(**kwargs)
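
For evaluator authors, the practical effect of these EvaluatorBase changes is that a subclass can choose how per-turn scores are rolled up for conversation inputs. A hedged sketch of a toy subclass, assuming only what this diff shows (the _do_eval override pattern from _bleu.py and the new conversation_aggregation_type keyword); WorstTurnLengthEvaluator and its metric name are invented for illustration:

from typing import Dict

from typing_extensions import overload, override

from azure.ai.evaluation._constants import _AggregationType
from azure.ai.evaluation._evaluators._common import EvaluatorBase


class WorstTurnLengthEvaluator(EvaluatorBase):
    """Toy sketch: score each turn by response length; a conversation gets the minimum across turns."""

    def __init__(self):
        # MIN replaces the previous hard-coded mean when aggregating per-turn results.
        super().__init__(conversation_aggregation_type=_AggregationType.MIN)

    @override
    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
        return {"length": float(len(eval_input["response"]))}

    @overload  # the base class derives the accepted singleton inputs from __call__ overloads
    def __call__(self, *, response: str): ...

    @override
    def __call__(self, *args, **kwargs):
        return super().__call__(*args, **kwargs)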