azure-ai-evaluation 1.0.0b3__py3-none-any.whl → 1.0.0b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +1 -1
- azure/ai/evaluation/_common/constants.py +4 -2
- azure/ai/evaluation/_common/math.py +18 -0
- azure/ai/evaluation/_common/rai_service.py +54 -62
- azure/ai/evaluation/_common/utils.py +182 -12
- azure/ai/evaluation/_constants.py +10 -2
- azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +10 -3
- azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +33 -17
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +17 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +26 -10
- azure/ai/evaluation/_evaluate/_evaluate.py +116 -62
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +16 -17
- azure/ai/evaluation/_evaluate/_utils.py +44 -25
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +59 -30
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +10 -13
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +18 -20
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -20
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +63 -42
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -4
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -4
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +4 -4
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -4
- azure/ai/evaluation/_evaluators/_eci/_eci.py +4 -4
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +3 -2
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +4 -4
- azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -3
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +3 -2
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +11 -8
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +21 -7
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -5
- azure/ai/evaluation/_exceptions.py +9 -6
- azure/ai/evaluation/_http_utils.py +203 -132
- azure/ai/evaluation/_model_configurations.py +5 -5
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +85 -60
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
- azure/ai/evaluation/simulator/_helpers/_experimental.py +20 -9
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +4 -4
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +22 -64
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
- azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +0 -5
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -4
- azure/ai/evaluation/simulator/_simulator.py +112 -113
- azure/ai/evaluation/simulator/_tracing.py +4 -4
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/METADATA +72 -44
- azure_ai_evaluation-1.0.0b4.dist-info/NOTICE.txt +50 -0
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/RECORD +64 -56
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/top_level.txt +0 -0
@@ -5,8 +5,9 @@ import inspect
 import json
 import logging
 import os
+from concurrent.futures import Future
 from pathlib import Path
-from typing import Callable, Dict, Optional, Union
+from typing import Any, Callable, Dict, Optional, Union, cast
 
 import pandas as pd
 from promptflow.contracts.types import AttrDict
@@ -22,25 +23,31 @@ LOGGER = logging.getLogger(__name__)
 
 class CodeRun:
     def __init__(
-        self,
-
+        self,
+        *,
+        run: Future,
+        input_data,
+        evaluator_name: Optional[str] = None,
+        aggregator: Callable[["CodeRun"], Future],
+        **kwargs,  # pylint: disable=unused-argument
+    ) -> None:
         self.run = run
         self.evaluator_name = evaluator_name if evaluator_name is not None else ""
         self.input_data = input_data
-        self.aggregated_metrics =
+        self.aggregated_metrics = aggregator(self)
 
-    def get_result_df(self, exclude_inputs=False):
+    def get_result_df(self, exclude_inputs: bool = False) -> pd.DataFrame:
         batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
-        result_df = self.run.result(timeout=batch_run_timeout)
+        result_df = cast(pd.DataFrame, self.run.result(timeout=batch_run_timeout))
         if exclude_inputs:
             result_df = result_df.drop(columns=[col for col in result_df.columns if col.startswith("inputs.")])
         return result_df
 
-    def get_aggregated_metrics(self):
+    def get_aggregated_metrics(self) -> Dict[str, Any]:
         try:
             batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
-            aggregated_metrics = (
-                self.aggregated_metrics.result(timeout=batch_run_timeout)
+            aggregated_metrics: Optional[Any] = (
+                cast(Dict, self.aggregated_metrics.result(timeout=batch_run_timeout))
                 if self.aggregated_metrics is not None
                 else None
             )
@@ -104,10 +111,10 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
             verify_integrity=True,
         )
 
-
+    @staticmethod
+    def _calculate_aggregations(evaluator: Callable, run: CodeRun) -> Any:
         try:
             if _has_aggregator(evaluator):
-                aggregate_input = None
                 evaluator_output = run.get_result_df(exclude_inputs=True)
                 if len(evaluator_output.columns) == 1 and evaluator_output.columns[0] == "output":
                     aggregate_input = evaluator_output["output"].tolist()
@@ -152,21 +159,30 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
             column_mapping=column_mapping,
             evaluator_name=evaluator_name,
         )
-
-
-
-
+
+        return CodeRun(
+            run=eval_future,
+            input_data=data,
+            evaluator_name=evaluator_name,
+            aggregator=lambda code_run: self._thread_pool.submit(
+                self._calculate_aggregations, evaluator=flow, run=code_run
+            ),
+        )
 
     def get_details(self, run: CodeRun, all_results: bool = False) -> pd.DataFrame:
         result_df = run.get_result_df(exclude_inputs=not all_results)
         return result_df
 
-    def get_metrics(self, run: CodeRun) ->
+    def get_metrics(self, run: CodeRun) -> Dict[str, Any]:
         try:
             aggregated_metrics = run.get_aggregated_metrics()
             print("Aggregated metrics")
             print(aggregated_metrics)
         except Exception as ex:  # pylint: disable=broad-exception-caught
             LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", run.evaluator_name, ex)
-            return
+            return {}
         return aggregated_metrics
+
+    def get_run_summary(self, run: CodeRun) -> Any:  # pylint: disable=unused-argument
+        # Not implemented
+        return None
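The two hunks above (apparently from code_client.py, given the CodeRun/CodeClient context) move metric aggregation behind an `aggregator` callable: `CodeRun` no longer receives a precomputed aggregated-metrics future, it asks the callable for one, and `CodeClient.run` supplies a lambda that submits `_calculate_aggregations` to its thread pool. A minimal, self-contained sketch of that pattern, using plain `concurrent.futures` and invented names such as `RunHandle` and `aggregate` rather than the library's real plumbing:

```python
from concurrent.futures import Future, ThreadPoolExecutor
from typing import Any, Callable, Dict

pool = ThreadPoolExecutor(max_workers=2)


class RunHandle:
    """Simplified stand-in for CodeRun: a row-level future plus an aggregation future."""

    def __init__(self, *, run: Future, aggregator: Callable[["RunHandle"], Future]) -> None:
        self.run = run
        # The aggregator receives the handle itself, so the aggregation work can
        # read the row-level results as soon as they resolve.
        self.aggregated_metrics: Future = aggregator(self)

    def get_aggregated_metrics(self) -> Dict[str, Any]:
        return self.aggregated_metrics.result(timeout=60)


def aggregate(handle: RunHandle) -> Dict[str, Any]:
    rows = handle.run.result(timeout=60)  # block until the per-row results are in
    scores = [row["score"] for row in rows]
    return {"mean_score": sum(scores) / len(scores)}


row_future = pool.submit(lambda: [{"score": 3}, {"score": 5}])
handle = RunHandle(run=row_future, aggregator=lambda h: pool.submit(aggregate, h))
print(handle.get_aggregated_metrics())  # {'mean_score': 4.0}
```

The indirection lets the aggregation step reference the run object itself (as `_calculate_aggregations` does via `run.get_result_df`) without a circular constructor argument.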
@@ -3,11 +3,12 @@
 # ---------------------------------------------------------
 import inspect
 import logging
+import math
 import os
 from concurrent.futures import Future
 from typing import Any, Callable, Dict, Optional, Union
+from collections import OrderedDict
 
-import numpy as np
 import pandas as pd
 from promptflow.client import PFClient
 from promptflow.entities import Run
@@ -53,13 +54,27 @@ class ProxyClient: # pylint: disable=client-accepts-api-version-keyword
     def get_details(self, proxy_run: ProxyRun, all_results: bool = False) -> pd.DataFrame:
         run: Run = proxy_run.run.result()
         result_df = self._pf_client.get_details(run, all_results=all_results)
-        result_df.replace("(Failed)",
+        result_df.replace("(Failed)", math.nan, inplace=True)
         return result_df
 
     def get_metrics(self, proxy_run: ProxyRun) -> Dict[str, Any]:
         run: Run = proxy_run.run.result()
         return self._pf_client.get_metrics(run)
 
+    def get_run_summary(self, proxy_run: ProxyRun) -> Dict[str, Any]:
+        run = proxy_run.run.result()
+
+        # pylint: disable=protected-access
+        return OrderedDict(
+            [
+                ("status", run.status),
+                ("duration", str(run._end_time - run._created_on)),
+                ("completed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")),
+                ("failed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.failed", "NA")),
+                ("log_path", str(run._output_path)),
+            ]
+        )
+
     @staticmethod
     def _should_batch_use_async(flow):
         if os.getenv("PF_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
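The new ProxyClient.get_run_summary condenses a promptflow run into a small ordered mapping of status, duration, completed/failed line counts, and the log path; evaluate() later pretty-prints these per-evaluator summaries (see the _print_summary hunk further down). A sketch of the shape, with made-up values standing in for the run's internal fields:

```python
import json
from collections import OrderedDict
from datetime import datetime, timedelta

# Hypothetical values standing in for a promptflow Run's internals.
created_on = datetime(2024, 10, 1, 12, 0, 0)
end_time = created_on + timedelta(seconds=42)

summary = OrderedDict(
    [
        ("status", "Completed"),
        ("duration", str(end_time - created_on)),
        ("completed_lines", 98),
        ("failed_lines", 2),
        ("log_path", "/tmp/.promptflow/runs/example_run"),
    ]
)

# evaluate() surfaces one such summary per evaluator as pretty-printed JSON.
print(json.dumps(summary, indent=4))
```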
@@ -10,10 +10,11 @@ import posixpath
 import time
 import types
 import uuid
-from typing import Any, Dict, Optional, Set, Type
+from typing import Any, Dict, List, Optional, Set, Type
 from urllib.parse import urlparse
 
 from promptflow._sdk.entities import Run
+from typing_extensions import Self
 
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import get_http_client
@@ -27,6 +28,7 @@ LOGGER = logging.getLogger(__name__)
 # Handle optional import. The azure libraries are only present if
 # promptflow-azure is installed.
 try:
+    from azure.ai.ml import MLClient
     from azure.ai.ml.entities._credentials import AccountKeyConfiguration  # pylint: disable=ungrouped-imports
     from azure.ai.ml.entities._datastore.datastore import Datastore
     from azure.storage.blob import BlobServiceClient
@@ -121,8 +123,8 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
         self._run_name = run_name
         self._promptflow_run = promptflow_run
         self._status = RunStatus.NOT_STARTED
-        self._url_base = None
-        self.
+        self._url_base: Optional[str] = None
+        self._info: Optional[RunInfo] = None
 
     @property
     def status(self) -> RunStatus:
@@ -134,6 +136,20 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
         """
         return self._status
 
+    @property
+    def info(self) -> RunInfo:
+        if self._info is None:
+            msg = "Run info is missing"
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                target=ErrorTarget.EVAL_RUN,
+                category=ErrorCategory.UNKNOWN,
+                blame=ErrorBlame.UNKNOWN,
+            )
+
+        return self._info
+
     def _get_scope(self) -> str:
         """
         Return the scope information for the workspace.
@@ -161,11 +177,11 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
             )
             self._url_base = None
             self._status = RunStatus.BROKEN
-            self.
+            self._info = RunInfo.generate(self._run_name)
         else:
             self._url_base = urlparse(self._tracking_uri).netloc
             if self._promptflow_run is not None:
-                self.
+                self._info = RunInfo(
                     self._promptflow_run.name,
                     self._promptflow_run._experiment_name,  # pylint: disable=protected-access
                     self._promptflow_run.name,
@@ -182,7 +198,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
                 body["run_name"] = self._run_name
                 response = self.request_with_retry(url=url, method="POST", json_dict=body)
                 if response.status_code != 200:
-                    self.
+                    self._info = RunInfo.generate(self._run_name)
                     LOGGER.warning(
                         "The run failed to start: %s: %s."
                         "The results will be saved locally, but will not be logged to Azure.",
@@ -192,7 +208,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
                     self._status = RunStatus.BROKEN
                 else:
                     parsed_response = response.json()
-                    self.
+                    self._info = RunInfo(
                         run_id=parsed_response["run"]["info"]["run_id"],
                         experiment_id=parsed_response["run"]["info"]["experiment_id"],
                         run_name=parsed_response["run"]["info"]["run_name"],
@@ -235,7 +251,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
             LOGGER.warning("Unable to terminate the run.")
         self._status = RunStatus.TERMINATED
 
-    def __enter__(self):
+    def __enter__(self) -> Self:
         """The Context Manager enter call.
 
         :return: The instance of the class.
@@ -249,7 +265,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
         exc_type: Optional[Type[BaseException]],
         exc_value: Optional[BaseException],
         exc_tb: Optional[types.TracebackType],
-    ) ->
+    ) -> None:
        """The context manager exit call.
 
         :param exc_type: The exception type
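`__enter__` now returns `Self` (imported from typing_extensions a few hunks up) instead of being unannotated, and `__exit__` is annotated `-> None`. A toy context manager showing the same pattern; this is an illustrative class, not EvalRun itself:

```python
import types
from typing import Optional, Type

from typing_extensions import Self  # backport of typing.Self for Python < 3.11


class ManagedRun:
    """Toy context manager using the Self-typed __enter__ seen above."""

    def __init__(self, name: str) -> None:
        self.name = name
        self.started = False

    def __enter__(self) -> Self:
        # Returning Self (rather than "ManagedRun") keeps the annotation
        # correct for subclasses without repeating the class name.
        self.started = True
        return self

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_value: Optional[BaseException],
        exc_tb: Optional[types.TracebackType],
    ) -> None:
        self.started = False


with ManagedRun("demo") as run:
    print(run.name, run.started)  # demo True
```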
@@ -408,7 +424,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
             return
         # First we will list the files and the appropriate remote paths for them.
         root_upload_path = posixpath.join("promptflow", "PromptFlowArtifacts", self.info.run_name)
-        remote_paths = {"paths": []}
+        remote_paths: Dict[str, List[Dict[str, str]]] = {"paths": []}
         local_paths = []
         # Go over the artifact folder and upload all artifacts.
         for root, _, filenames in os.walk(artifact_folder):
@@ -4,18 +4,22 @@
 import inspect
 import os
 import re
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple,
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
+import json
 
-import numpy as np
 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
 from promptflow.client import PFClient
+from promptflow.entities import Run
+from promptflow._sdk._errors import MissingAzurePackage
 
+from azure.ai.evaluation._common.math import list_sum
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
+    EvaluationRunProperties,
     Prefixes,
     _InternalEvaluationMetrics,
 )
@@ -23,16 +27,25 @@ from .._model_configurations import AzureAIProject, EvaluatorConfig
 from .._user_agent import USER_AGENT
 from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
 from ._utils import (
+    EvaluateResult,
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
 )
 
+TClient = TypeVar("TClient", ProxyClient, CodeClient)
+
+
+class __EvaluatorInfo(TypedDict):
+    result: pd.DataFrame
+    metrics: Dict[str, Any]
+    run_summary: Dict[str, Any]
+
 
 # pylint: disable=line-too-long
 def _aggregate_content_safety_metrics(
-    df: pd.DataFrame, evaluators: Dict[str,
+    df: pd.DataFrame, evaluators: Dict[str, Callable]
 ) -> Tuple[List[str], Dict[str, float]]:
     """Find and aggregate defect rates for content safety metrics. Returns both a list
     of columns that were used to calculate defect rates and the defect rates themselves.
|
|
|
73
86
|
defect_rate_name = col.replace("_score", "_defect_rate")
|
|
74
87
|
col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
|
|
75
88
|
defect_rates[defect_rate_name] = round(
|
|
76
|
-
|
|
89
|
+
list_sum(col_with_numeric_values >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT)
|
|
77
90
|
/ col_with_numeric_values.count(),
|
|
78
91
|
2,
|
|
79
92
|
)
|
|
@@ -107,13 +120,13 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
         defect_rate_name = col.replace("_label", "_defect_rate")
         col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
         defect_rates[defect_rate_name] = round(
-
+            list_sum(col_with_boolean_values) / col_with_boolean_values.count(),
             2,
         )
     return label_cols, defect_rates
 
 
-def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str,
+def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
     """Aggregate metrics from the evaluation results.
     On top of naively calculating the mean of most metrics, this function also identifies certain columns
     that represent defect rates and renames them accordingly. Other columns in the dataframe are dropped.
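The defect-rate arithmetic in the two hunks above is: count the rows whose coerced score clears the threshold (or whose label is truthy), divide by the number of scored non-NaN rows, and round to two decimals. A worked example with pandas; the threshold value 4 is only an assumption standing in for CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT, and the built-in sum() stands in for the new list_sum helper from _common/math.py:

```python
import pandas as pd

# Hypothetical severity scores for one content-safety metric.
scores = pd.Series([0, 1, 5, 7, None, 4])
numeric = pd.to_numeric(scores, errors="coerce")

threshold = 4  # assumed stand-in for CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
# sum() stands in for list_sum; NaN rows compare as False and are excluded from count().
defect_rate = round(sum(numeric >= threshold) / numeric.count(), 2)
print(defect_rate)  # 3 of the 5 scored rows are at or above the threshold -> 0.6
```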
@@ -122,7 +135,7 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[st
     :param df: The dataframe of evaluation results.
     :type df: ~pandas.DataFrame
     :param evaluators: A dictionary mapping of strings to evaluator classes.
-    :type evaluators: Dict[str,
+    :type evaluators: Dict[str, Callable]
     :return: The aggregated metrics.
     :rtype: Dict[str, float]
     """
@@ -277,7 +290,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
 
 def _validate_columns(
     df: pd.DataFrame,
-    evaluators: Dict[str,
+    evaluators: Dict[str, Callable],
     target: Optional[Callable],
     column_mapping: Dict[str, Dict[str, str]],
 ) -> None:
@@ -287,7 +300,7 @@ def _validate_columns(
     :param df: The data frame to be validated.
     :type df: pd.DataFrame
     :param evaluators: The dictionary of evaluators.
-    :type evaluators: Dict[str,
+    :type evaluators: Dict[str, Callable]
     :param target: The callable to be applied to data set.
     :type target: Optional[Callable]
     :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping
@@ -326,7 +339,7 @@ def _apply_target_to_data(
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
     _run_name: Optional[str] = None,
-) -> Tuple[pd.DataFrame, Set[str]]:
+) -> Tuple[pd.DataFrame, Set[str], Run]:
     """
     Apply the target function to the data set and return updated data and generated columns.
 
@@ -348,15 +361,15 @@
     # We are manually creating the temporary directory for the flow
     # because the way tempdir remove temporary directories will
    # hang the debugger, because promptflow will keep flow directory.
-    run = pf_client.run(
+    run: Run = pf_client.run(
        flow=target,
        display_name=evaluation_name,
        data=data,
-        properties={
+        properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
        stream=True,
        name=_run_name,
    )
-    target_output = pf_client.runs.get_details(run, all_results=True)
+    target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
@@ -378,16 +391,18 @@
     return target_output, generated_columns, run
 
 
-def _process_column_mappings(
+def _process_column_mappings(
+    column_mapping: Dict[str, Optional[Dict[str, str]]],
+) -> Dict[str, Dict[str, str]]:
     """Process column_mapping to replace ${target.} with ${data.}
 
     :param column_mapping: The configuration for evaluators.
-    :type column_mapping: Dict[str, Dict[str, str]]
+    :type column_mapping: Dict[str, Optional[Dict[str, str]]]
     :return: The processed configuration.
     :rtype: Dict[str, Dict[str, str]]
     """
 
-    processed_config = {}
+    processed_config: Dict[str, Dict[str, str]] = {}
 
     unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")
 
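The hunk above types _process_column_mappings and keeps its validation regex: any ${...} reference in a per-evaluator column mapping that is not ${target.<col>} or ${data.<col>} counts as unexpected. A small illustration of that check; the evaluator and column names are made up:

```python
import re

# The validation regex from the hunk above.
unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")

column_mapping = {
    "relevance": {
        "query": "${data.query}",
        "response": "${target.response}",
    },
    "broken": {
        "response": "${outputs.response}",  # neither ${target.} nor ${data.}: rejected
    },
}

for evaluator_name, mapping in column_mapping.items():
    bad = [v for v in mapping.values() if unexpected_references.search(v)]
    print(evaluator_name, "->", "ok" if not bad else f"unexpected references: {bad}")
```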
@@ -554,41 +569,69 @@ def evaluate(
         raise e
 
 
+def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
+    # Extract evaluators with a non-empty "run_summary"
+    output_dict = {
+        name: result["run_summary"] for name, result in per_evaluator_results.items() if result.get("run_summary")
+    }
+
+    if output_dict:
+        print("======= Combined Run Summary (Per Evaluator) =======\n")
+        print(json.dumps(output_dict, indent=4))
+        print("\n====================================================")
+
+
 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
+    evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    data:
-    evaluators: Optional[Dict[str, Callable]] = None,
+    data: str,
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
     **kwargs,
-):
+) -> EvaluateResult:
     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)
 
     # Process evaluator config to replace ${target.} with ${data.}
     if evaluator_config is None:
         evaluator_config = {}
     # extract column mapping dicts into dictionary mapping evaluator name to column mapping
-    column_mapping =
-
-
-
-
+    column_mapping = _process_column_mappings(
+        {
+            evaluator_name: evaluator_configuration.get("column_mapping", None)
+            for evaluator_name, evaluator_configuration in evaluator_config.items()
+        }
+    )
     _validate_columns(input_data_df, evaluators, target, column_mapping)
 
     # Target Run
-
-
-
-
-
-
+    try:
+        pf_client = PFClient(
+            config=(
+                {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)}
+                if azure_ai_project
+                else None
+            ),
+            user_agent=USER_AGENT,
+        )
+    # pylint: disable=raise-missing-from
+    except MissingAzurePackage:
+        msg = (
+            "The required packages for remote tracking are missing.\n"
+            'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
+        )
+
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.MISSING_PACKAGE,
+            blame=ErrorBlame.USER_ERROR,
+        )
 
-    trace_destination = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
-    target_run = None
-    target_generated_columns = set()
+    trace_destination: Optional[str] = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
+    target_run: Optional[Run] = None
 
     # Create default configuration for evaluators that directly maps
     # input data names to keyword inputs of the same name in the evaluators.
@@ -627,45 +670,54 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         # Also ignore columns that are already in config, since they've been covered by target mapping.
         if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
             column_mapping["default"][col] = f"${{data.{col}}}"
+
+    def eval_batch_run(
+        batch_run_client: TClient, *, data=Union[str, os.PathLike, pd.DataFrame]
+    ) -> Dict[str, __EvaluatorInfo]:
+        with BatchRunContext(batch_run_client):
+            runs = {
+                evaluator_name: batch_run_client.run(
+                    flow=evaluator,
+                    run=target_run,
+                    evaluator_name=evaluator_name,
+                    column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
+                    data=data,
+                    stream=True,
+                    name=kwargs.get("_run_name"),
+                )
+                for evaluator_name, evaluator in evaluators.items()
+            }
+
+            # get_details needs to be called within BatchRunContext scope in order to have user agent populated
+            return {
+                evaluator_name: {
+                    "result": batch_run_client.get_details(run, all_results=True),
+                    "metrics": batch_run_client.get_metrics(run),
+                    "run_summary": batch_run_client.get_run_summary(run),
+                }
+                for evaluator_name, run in runs.items()
+            }
+
     # Batch Run
-    evaluators_info = {}
     use_pf_client = kwargs.get("_use_pf_client", True)
     if use_pf_client:
-        # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
-        # The root cause is still unclear, but it seems related to a conflict between the async run uploader
-        # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
-        batch_run_client = ProxyClient(PFClient(user_agent=USER_AGENT))
-
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
         data = os.path.abspath(data)
+
+        # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
+        # The root cause is still unclear, but it seems related to a conflict between the async run uploader
+        # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
+        per_evaluator_results = eval_batch_run(ProxyClient(PFClient(user_agent=USER_AGENT)), data=data)
     else:
-        batch_run_client = CodeClient()
         data = input_data_df
-
-        with BatchRunContext(batch_run_client):
-            for evaluator_name, evaluator in evaluators.items():
-                evaluators_info[evaluator_name] = {}
-                evaluators_info[evaluator_name]["run"] = batch_run_client.run(
-                    flow=evaluator,
-                    run=target_run,
-                    evaluator_name=evaluator_name,
-                    column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
-                    data=data,
-                    stream=True,
-                    name=kwargs.get("_run_name"),
-                )
-
-        # get_details needs to be called within BatchRunContext scope in order to have user agent populated
-        for evaluator_name, evaluator_info in evaluators_info.items():
-            evaluator_info["result"] = batch_run_client.get_details(evaluator_info["run"], all_results=True)
-            evaluator_info["metrics"] = batch_run_client.get_metrics(evaluator_info["run"])
+        per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)
 
     # Concatenate all results
     evaluators_result_df = None
     evaluators_metric = {}
-    for evaluator_name,
-    evaluator_result_df =
+    for evaluator_name, evaluator_result in per_evaluator_results.items():
+        evaluator_result_df = evaluator_result["result"]
 
         # drop input columns
         evaluator_result_df = evaluator_result_df.drop(
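eval_batch_run (above) returns one __EvaluatorInfo-shaped entry per evaluator: the row-level result dataframe, the aggregated metrics, and the run summary. The loop that follows flattens the metrics into "<evaluator>.<metric>" keys. A sketch with fabricated values standing in for real evaluator output:

```python
from typing import Any, Dict

import pandas as pd

# Illustrative stand-in for what eval_batch_run returns.
per_evaluator_results: Dict[str, Dict[str, Any]] = {
    "relevance": {
        "result": pd.DataFrame({"outputs.relevance": [4, 5]}),
        "metrics": {"relevance": 4.5},
        "run_summary": {"status": "Completed", "completed_lines": 2, "failed_lines": 0},
    },
    "fluency": {
        "result": pd.DataFrame({"outputs.fluency": [3, 4]}),
        "metrics": {"fluency": 3.5},
        "run_summary": {"status": "Completed", "completed_lines": 2, "failed_lines": 0},
    },
}

# Metrics are flattened into "<evaluator>.<metric>" keys, as in the hunk that follows.
evaluators_metric: Dict[str, float] = {}
for evaluator_name, evaluator_result in per_evaluator_results.items():
    evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_result["metrics"].items()})

print(evaluators_metric)  # {'relevance.relevance': 4.5, 'fluency.fluency': 3.5}
```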
@@ -688,7 +740,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
             else evaluator_result_df
         )
 
-        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in
+        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_result["metrics"].items()})
 
     # Rename columns, generated by target function to outputs instead of inputs.
     # If target generates columns, already present in the input data, these columns
@@ -706,9 +758,11 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         evaluation_name,
     )
 
-    result = {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url}
+    result: EvaluateResult = {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url}
 
     if output_path:
         _write_output(output_path, result)
 
+    _print_summary(per_evaluator_results)
+
     return result
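Taken together, the _evaluate changes keep the public return shape: a mapping with "rows", "metrics", and "studio_url", now annotated as EvaluateResult, plus a combined per-evaluator summary printed at the end. A usage sketch of the public entry point; the import names, evaluator choice, and file paths are assumptions for illustration rather than part of this diff:

```python
# A usage sketch only: the evaluator import and the JSONL path are assumptions
# based on the package's public surface, not something this diff introduces.
from azure.ai.evaluation import F1ScoreEvaluator, evaluate

result = evaluate(
    data="data.jsonl",  # one JSON object per line with "response" and "ground_truth"
    evaluators={"f1": F1ScoreEvaluator()},
    evaluator_config={
        "f1": {
            "column_mapping": {
                "response": "${data.response}",
                "ground_truth": "${data.ground_truth}",
            }
        }
    },
    output_path="results.json",
)

# The return value matches the EvaluateResult shape annotated in the hunk above.
print(result["metrics"])     # aggregate metrics, e.g. a key like "f1.f1_score"
print(len(result["rows"]))   # per-line results
print(result["studio_url"])  # None unless an azure_ai_project was supplied
```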