azure-ai-evaluation 1.4.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (150)
  1. azure/ai/evaluation/__init__.py +9 -16
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +66 -0
  5. azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
  6. azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
  7. azure/ai/evaluation/_azure/_clients.py +4 -4
  8. azure/ai/evaluation/_azure/_envs.py +208 -0
  9. azure/ai/evaluation/_azure/_token_manager.py +12 -7
  10. azure/ai/evaluation/_common/__init__.py +5 -0
  11. azure/ai/evaluation/_common/evaluation_onedp_client.py +118 -0
  12. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  13. azure/ai/evaluation/_common/onedp/_client.py +139 -0
  14. azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
  15. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  16. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  17. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  18. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  19. azure/ai/evaluation/_common/onedp/_validation.py +50 -0
  20. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  21. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  22. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  23. azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
  24. azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
  25. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  26. azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
  27. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
  28. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
  29. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  30. azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
  31. azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
  32. azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
  33. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
  35. azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
  36. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  38. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  39. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  40. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  41. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  42. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  43. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  44. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  54. azure/ai/evaluation/_common/rai_service.py +159 -29
  55. azure/ai/evaluation/_common/raiclient/_version.py +1 -1
  56. azure/ai/evaluation/_common/utils.py +80 -2
  57. azure/ai/evaluation/_constants.py +16 -0
  58. azure/ai/evaluation/_converters/__init__.py +1 -1
  59. azure/ai/evaluation/_converters/_ai_services.py +4 -4
  60. azure/ai/evaluation/_eval_mapping.py +71 -0
  61. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
  62. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
  63. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +17 -4
  64. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
  65. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
  66. azure/ai/evaluation/_evaluate/_eval_run.py +2 -2
  67. azure/ai/evaluation/_evaluate/_evaluate.py +372 -105
  68. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +534 -0
  69. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -89
  70. azure/ai/evaluation/_evaluate/_utils.py +120 -7
  71. azure/ai/evaluation/_evaluators/_common/_base_eval.py +9 -4
  72. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +1 -1
  73. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
  74. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
  75. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
  76. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +467 -0
  77. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +1 -1
  78. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +2 -2
  79. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +6 -2
  80. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +1 -1
  81. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +8 -2
  82. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
  83. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +1 -1
  84. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +5 -2
  85. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +6 -2
  86. azure/ai/evaluation/_exceptions.py +2 -0
  87. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  88. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  89. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  90. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  91. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  92. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  93. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  94. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  95. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  96. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  97. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  98. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  99. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
  100. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
  101. azure/ai/evaluation/_legacy/_batch_engine/_result.py +7 -1
  102. azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
  103. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
  104. azure/ai/evaluation/_legacy/_batch_engine/_status.py +1 -1
  105. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
  106. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
  107. azure/ai/evaluation/{_red_team/_utils → _legacy/_common}/__init__.py +1 -1
  108. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
  109. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
  110. azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
  111. azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
  112. azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
  113. azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
  114. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +90 -17
  115. azure/ai/evaluation/_version.py +1 -1
  116. azure/ai/evaluation/red_team/__init__.py +19 -0
  117. azure/ai/evaluation/{_red_team → red_team}/_attack_objective_generator.py +3 -0
  118. azure/ai/evaluation/{_red_team → red_team}/_attack_strategy.py +4 -1
  119. azure/ai/evaluation/{_red_team → red_team}/_red_team.py +885 -481
  120. azure/ai/evaluation/red_team/_red_team_result.py +382 -0
  121. azure/ai/evaluation/{_red_team → red_team}/_utils/constants.py +2 -1
  122. azure/ai/evaluation/{_red_team → red_team}/_utils/formatting_utils.py +23 -22
  123. azure/ai/evaluation/{_red_team → red_team}/_utils/logging_utils.py +1 -1
  124. azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
  125. azure/ai/evaluation/{_red_team → red_team}/_utils/strategy_utils.py +9 -5
  126. azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
  127. azure/ai/evaluation/simulator/_constants.py +1 -0
  128. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
  129. azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
  130. azure/ai/evaluation/simulator/_direct_attack_simulator.py +35 -22
  131. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  132. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +40 -25
  133. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  134. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +24 -18
  135. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
  136. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
  137. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +9 -5
  138. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  139. azure/ai/evaluation/simulator/_simulator.py +1 -1
  140. {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/METADATA +36 -2
  141. {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/RECORD +148 -80
  142. azure/ai/evaluation/_red_team/_red_team_result.py +0 -246
  143. azure/ai/evaluation/simulator/_tracing.py +0 -89
  144. /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
  145. /azure/ai/evaluation/{_red_team → red_team}/_callback_chat_target.py +0 -0
  146. /azure/ai/evaluation/{_red_team → red_team}/_default_converter.py +0 -0
  147. /azure/ai/evaluation/{_red_team → red_team/_utils}/__init__.py +0 -0
  148. {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/NOTICE.txt +0 -0
  149. {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/WHEEL +0 -0
  150. {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/top_level.txt +0 -0
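
Of the files listed above, the diff reproduced below is for azure/ai/evaluation/_evaluate/_evaluate.py (entry 67, +372 -105). It reworks evaluate() so that the evaluators dictionary may mix ordinary callables with AzureOpenAIGrader instances, splits the two groups, submits the graders as remote Azure OpenAI eval runs, and merges both result sets. The sketch that follows is a minimal illustration of how the updated signature might be called; the data file name is hypothetical, and the commented-out grader entry assumes an AzureOpenAIGrader subclass instance constructed elsewhere, since the grader constructors themselves are not part of this diff.

# Minimal sketch of the 1.6.0 evaluate() call shape; "eval.jsonl" and the
# "response" column it is assumed to contain are illustrative only.
from azure.ai.evaluation import evaluate

def answer_length(*, response: str, **kwargs):
    # Any callable keyed by an alias can still be used as an evaluator.
    return {"length": len(response)}

result = evaluate(
    data="eval.jsonl",                  # JSONL/CSV path, as before
    evaluators={
        "answer_length": answer_length,  # local callable evaluator
        # "string_check": grader,        # AzureOpenAIGrader instance (new in this release)
    },
    evaluation_name="local-and-grader-run",
)
print(result["metrics"])  # EvaluationResult exposes "rows", "metrics", and "studio_url"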
@@ -6,43 +6,59 @@ import json
 import logging
 import os
 import re
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, Union, cast

+from openai import OpenAI, AzureOpenAI
+from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
+from azure.ai.evaluation._legacy._adapters.entities import Run
 import pandas as pd
-from promptflow._sdk._constants import LINE_NUMBER
-from promptflow.client import PFClient
-from promptflow.entities import Run
-from promptflow._sdk._configuration import Configuration

 from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
-from azure.ai.evaluation._common.utils import validate_azure_ai_project
+from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

+from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader
+
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
+    EVALUATION_PASS_FAIL_MAPPING,
     EvaluationMetrics,
     DefaultOpenEncoding,
     Prefixes,
     _InternalEvaluationMetrics,
+    BINARY_AGGREGATE_SUFFIX,
+    DEFAULT_OAI_EVAL_RUN_NAME
 )
 from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
 from .._user_agent import USER_AGENT
-from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext, ProxyRun
+from ._batch_run import (
+    EvalRunContext,
+    CodeClient,
+    ProxyClient,
+    TargetRunContext,
+    RunSubmitterClient,
+)
 from ._utils import (
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
-    DataLoaderFactory,
+    DataLoaderFactory, _log_metrics_and_instance_results_onedp,
 )
+from ._batch_run.batch_clients import BatchClient, BatchClientRun

-TClient = TypeVar("TClient", ProxyClient, CodeClient)
+from ._evaluate_aoai import (
+    _begin_aoai_evaluation,
+    _split_evaluators_and_grader_configs,
+    _get_evaluation_run_results,
+    OAIEvalRunCreationInfo
+)

 LOGGER = logging.getLogger(__name__)

 # For metrics (aggregates) whose metric names intentionally differ from their
 # originating column name, usually because the aggregation of the original value
 # means something sufficiently different.
-# Note that content safety metrics are handled seprately.
+# Note that content safety metrics are handled separately.
 METRIC_COLUMN_NAME_REPLACEMENTS = {
     "groundedness_pro_label": "groundedness_pro_passing_rate",
 }
@@ -53,6 +69,19 @@ class __EvaluatorInfo(TypedDict):
     metrics: Dict[str, Any]
     run_summary: Dict[str, Any]

+class __ValidatedData(TypedDict):
+    '''
+    Simple dictionary that contains ALL pre-processed data and
+    the resultant objects that are needed for downstream evaluation.
+    '''
+    evaluators: Dict[str, Callable]
+    graders: Dict[str, AzureOpenAIGrader]
+    input_data_df: pd.DataFrame
+    column_mapping: Dict[str, Dict[str, str]]
+    target_run: Optional[BatchClientRun]
+    batch_run_client: BatchClient
+    batch_run_data: Union[str, os.PathLike, pd.DataFrame]
+

 def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
     """Identify and average various metrics that need to have the metric name be replaced,
@@ -71,7 +100,7 @@ def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, flo
         if metric_name in METRIC_COLUMN_NAME_REPLACEMENTS:
             renamed_cols.append(col)
             new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
-            col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
+            col_with_numeric_values = cast(List[float], pd.to_numeric(df[col], errors="coerce"))
             try:
                 metric_columns[new_col_name] = round(list_mean_nan_safe(col_with_numeric_values), 2)
             except EvaluationException:  # only exception that can be cause is all NaN values
@@ -122,7 +151,7 @@ def _aggregate_content_safety_metrics(
     defect_rates = {}
     for col in content_safety_df.columns:
         defect_rate_name = col.replace("_score", "_defect_rate")
-        col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
+        col_with_numeric_values = cast(List[float], pd.to_numeric(content_safety_df[col], errors="coerce"))
         try:
             col_with_boolean_values = apply_transform_nan_safe(
                 col_with_numeric_values, lambda x: 1 if x >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT else 0
@@ -161,37 +190,40 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
         metric_name = col.split(".")[1]
         if metric_name.endswith("_label") and metric_name.replace("_label", "").lower() in handled_metrics:
             label_cols.append(col)
-        if metric_name.endswith("_details") and metric_name.replace("_details", "").lower() in handled_metrics:
+        if metric_name.endswith("_details") and metric_name.replace("_details", "").lower() in handled_metrics:
             details_cols = col

     label_df = df[label_cols]
     defect_rates = {}
     for col in label_df.columns:
         defect_rate_name = col.replace("_label", "_defect_rate")
-        col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
+        col_with_boolean_values = cast(List[float], pd.to_numeric(label_df[col], errors="coerce"))
         try:
             defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
         except EvaluationException:  # only exception that can be cause is all NaN values
             msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
             LOGGER.warning(msg)
-
+
     if details_cols:
         details_df = df[details_cols]
         detail_defect_rates = {}
-
+
         for key, value in details_df.items():
             _process_rows(value, detail_defect_rates)
-
+
         for key, value in detail_defect_rates.items():
             col_with_boolean_values = pd.to_numeric(value, errors="coerce")
             try:
-                defect_rates[f"{details_cols}.{key}_defect_rate"] = round(list_mean_nan_safe(col_with_boolean_values), 2)
+                defect_rates[f"{details_cols}.{key}_defect_rate"] = round(
+                    list_mean_nan_safe(col_with_boolean_values), 2
+                )
             except EvaluationException:  # only exception that can be cause is all NaN values
                 msg = f"All score evaluations are NaN/None for column {key}. No aggregation can be performed."
                 LOGGER.warning(msg)
-
+
     return label_cols, defect_rates

+
 def _process_rows(row, detail_defect_rates):
     for key, value in row.items():
         if key not in detail_defect_rates:
@@ -199,6 +231,49 @@ def _process_rows(row, detail_defect_rates):
         detail_defect_rates[key].append(value)
     return detail_defect_rates

+
+def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
+    """
+    Aggregate binary output results (pass/fail) from evaluation dataframe.
+
+    For each evaluator, calculates the proportion of "pass" results.
+
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :return: A dictionary mapping evaluator names to the proportion of pass results.
+    :rtype: Dict[str, float]
+    """
+    results = {}
+
+    # Find all columns that end with "_result"
+    result_columns = [col for col in df.columns if col.startswith("outputs.") and col.endswith("_result")]
+
+    for col in result_columns:
+        # Extract the evaluator name from the column name
+        # (outputs.<evaluator>.<metric>_result)
+        parts = col.split(".")
+        evaluator_name = None
+        if len(parts) >= 3:
+            evaluator_name = parts[1]
+        else:
+            LOGGER.warning("Skipping column '%s' due to unexpected format. Expected at least three parts separated by '.'", col)
+            continue
+        if evaluator_name:
+            # Count the occurrences of each unique value (pass/fail)
+            value_counts = df[col].value_counts().to_dict()
+
+            # Calculate the proportion of EVALUATION_PASS_FAIL_MAPPING[True] results
+            total_rows = len(df)
+            pass_count = value_counts.get(EVALUATION_PASS_FAIL_MAPPING[True], 0)
+            proportion = pass_count / total_rows if total_rows > 0 else 0.0
+
+            # Set the result with the evaluator name as the key
+            result_key = f"{evaluator_name}.{BINARY_AGGREGATE_SUFFIX}"
+            results[result_key] = round(proportion, 2)
+
+    return results
+
+
 def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
     """Aggregate metrics from the evaluation results.
     On top of naively calculating the mean of most metrics, this function also identifies certain columns
@@ -212,6 +287,8 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     :return: The aggregated metrics.
     :rtype: Dict[str, float]
     """
+    binary_metrics = _aggregation_binary_output(df)
+
     df.rename(columns={col: col.replace("outputs.", "") for col in df.columns}, inplace=True)

     handled_columns = []
@@ -239,6 +316,10 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     metrics = mean_value.to_dict()
     # Add defect rates back into metrics
     metrics.update(defect_rates)
+
+    # Add binary threshold metrics based on pass/fail results
+    metrics.update(binary_metrics)
+
     return metrics


@@ -330,7 +411,7 @@ def _validate_columns_for_evaluators(
             missing_inputs = []
         else:
             optional_params = (
-                evaluator._OPTIONAL_PARAMS  # pylint: disable=protected-access
+                cast(Any, evaluator)._OPTIONAL_PARAMS  # pylint: disable=protected-access
                 if hasattr(evaluator, "_OPTIONAL_PARAMS")
                 else []
             )
@@ -477,12 +558,12 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj

 def _apply_target_to_data(
     target: Callable,
-    data: Union[str, os.PathLike],
-    batch_client: TClient,
+    data: Union[str, os.PathLike, pd.DataFrame],
+    batch_client: BatchClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
     **kwargs,
-) -> Tuple[pd.DataFrame, Set[str], Run]:
+) -> Tuple[pd.DataFrame, Set[str], BatchClientRun]:
     """
     Apply the target function to the data set and return updated data and generated columns.

@@ -499,18 +580,19 @@ def _apply_target_to_data(
     :return: The tuple, containing data frame and the list of added columns.
     :rtype: Tuple[pandas.DataFrame, List[str]]
     """
+
     _run_name = kwargs.get("_run_name")
-    with TargetRunContext():
-        run: ProxyRun = batch_client.run(
+    with TargetRunContext(batch_client):
+        run: BatchClientRun = batch_client.run(
             flow=target,
             display_name=evaluation_name,
             data=data,
             stream=True,
             name=_run_name,
+            evaluator_name=getattr(target, "__qualname__", "TARGET"),
         )
-
-    target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
-    run_summary = batch_client.get_run_summary(run)
+        target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
+        run_summary = batch_client.get_run_summary(run)

     if run_summary["completed_lines"] == 0:
         msg = (
@@ -541,7 +623,7 @@ def _apply_target_to_data(
     # Concatenate output to input
     target_output = pd.concat([target_output, initial_data], axis=1)

-    return target_output, generated_columns, run.run.result()
+    return target_output, generated_columns, run


 def _process_column_mappings(
@@ -557,7 +639,7 @@ def _process_column_mappings(

     processed_config: Dict[str, Dict[str, str]] = {}

-    expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z_]+\}$")
+    expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z0-9_]+\}$")

     if column_mapping:
         for evaluator, mapping_config in column_mapping.items():
@@ -606,15 +688,14 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
     return df


-# @log_evaluate_activity
 def evaluate(
     *,
     data: Union[str, os.PathLike],
-    evaluators: Dict[str, Callable],
+    evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
-    azure_ai_project: Optional[AzureAIProject] = None,
+    azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
     fail_on_evaluator_errors: bool = False,
     **kwargs,
@@ -626,8 +707,9 @@ def evaluate(
         JSONL and CSV files are supported. `target` and `data` both cannot be None. Required.
     :paramtype data: str
     :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
-        and value as the evaluator function. Required.
-    :paramtype evaluators: Dict[str, Callable]
+        and value as the evaluator function. Also accepts AzureOpenAIGrader instances as values, which are processed separately.
+        Required.
+    :paramtype evaluators: Dict[str, Union[Callable, ~azure.ai.evaluation.AzureOpenAIGrader]]
     :keyword evaluation_name: Display name of the evaluation.
     :paramtype evaluation_name: Optional[str]
     :keyword target: Target to be evaluated. `target` and `data` both cannot be None
@@ -664,7 +746,7 @@ def evaluate(
         evaluation_name=evaluation_name,
         target=target,
         data=data,
-        evaluators=evaluators,
+        evaluators_and_graders=evaluators,
        evaluator_config=evaluator_config,
        azure_ai_project=azure_ai_project,
        output_path=output_path,
@@ -729,23 +811,157 @@ def _print_fail_flag_warning() -> None:

 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
-    evaluators: Dict[str, Callable],
+    evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
     data: Union[str, os.PathLike],
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
-    azure_ai_project: Optional[AzureAIProject] = None,
+    azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
     fail_on_evaluator_errors: bool = False,
     **kwargs,
 ) -> EvaluationResult:
     if fail_on_evaluator_errors:
         _print_fail_flag_warning()
-    input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)
+
+    # Turn inputted mess of data into a dataframe, apply targets if needed
+    # split graders and evaluators, and verify that column mappings are sensible.
+    validated_data = _preprocess_data(
+        data=data,
+        evaluators_and_graders=evaluators_and_graders,
+        evaluator_config=evaluator_config,
+        target=target,
+        output_path=output_path,
+        azure_ai_project=azure_ai_project,
+        evaluation_name=evaluation_name,
+        **kwargs,
+    )
+
+    # extract relevant info from validated data
+    column_mapping = validated_data["column_mapping"]
+    evaluators = validated_data["evaluators"]
+    graders = validated_data["graders"]
+    input_data_df = validated_data["input_data_df"]
+    results_df = pd.DataFrame()
+    metrics: Dict[str, float] = {}
+    eval_run_info_list: List[OAIEvalRunCreationInfo] = []
+
+    # Start OAI eval runs if any graders are present.
+    need_oai_run = len(graders) > 0
+    need_local_run = len(evaluators) > 0
+    need_get_oai_results = False
+    got_local_results = False
+    if need_oai_run:
+        try:
+            aoi_name = evaluation_name if evaluation_name else DEFAULT_OAI_EVAL_RUN_NAME
+            eval_run_info_list = _begin_aoai_evaluation(
+                graders,
+                column_mapping,
+                input_data_df,
+                aoi_name
+            )
+            need_get_oai_results = len(eval_run_info_list) > 0
+        except EvaluationException as e:
+            if need_local_run:
+                # If there are normal evaluators, don't stop execution and try to run
+                # those.
+                LOGGER.warning("Remote Azure Open AI grader evaluations failed during run creation." +
+                               " Continuing with local evaluators.")
+                LOGGER.warning(e)
+            else:
+                raise e
+
+    # Evaluate 'normal' evaluators. This includes built-in evaluators and any user-supplied callables.
+    if need_local_run:
+        try:
+            eval_result_df, eval_metrics, per_evaluator_results = _run_callable_evaluators(
+                validated_data=validated_data,
+                fail_on_evaluator_errors=fail_on_evaluator_errors
+            )
+            results_df = eval_result_df
+            metrics = eval_metrics
+            got_local_results = True
+            # TODO figure out how to update this printing to include OAI results?
+            _print_summary(per_evaluator_results)
+        except EvaluationException as e:
+            if need_get_oai_results:
+                # If there are OAI graders, we only print a warning on local failures.
+                LOGGER.warning("Local evaluations failed. Will still attempt to retrieve online grader results.")
+                LOGGER.warning(e)
+            else:
+                raise e
+
+    # Retrieve OAI eval run results if needed.
+    if need_get_oai_results:
+        try:
+            aoai_results, aoai_metrics = _get_evaluation_run_results(eval_run_info_list)  # type: ignore
+            # Post build TODO: add equivalent of _print_summary(per_evaluator_results) here
+
+            # Combine results if both evaluators and graders are present
+            if len(evaluators) > 0:
+                results_df = pd.concat([results_df, aoai_results], axis=1)
+                metrics.update(aoai_metrics)
+            else:
+                # Otherwise combine aoai results with input data df to include input columns in outputs.
+                results_df = pd.concat([input_data_df, aoai_results], axis=1)
+                metrics = aoai_metrics
+        except EvaluationException as e:
+            if got_local_results:
+                # If there are local eval results, we only print a warning on OAI failure.
+                LOGGER.warning("Remote Azure Open AI grader evaluations failed. Still returning local results.")
+                LOGGER.warning(e)
+            else:
+                raise e
+
+    # Done with all evaluations, message outputs into final forms, and log results if needed.
+    name_map = _map_names_to_builtins(evaluators, graders)
+    if is_onedp_project(azure_ai_project):
+        studio_url = _log_metrics_and_instance_results_onedp(
+            metrics, results_df, azure_ai_project, evaluation_name, name_map, **kwargs
+        )
+    else:
+        # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
+        trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
+        studio_url = None
+        if trace_destination:
+            studio_url = _log_metrics_and_instance_results(
+                metrics, results_df, trace_destination, None, evaluation_name, name_map, **kwargs
+            )
+
+    result_df_dict = results_df.to_dict("records")
+    result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore
+
+    if output_path:
+        _write_output(output_path, result)
+
+    return result

+
+def _preprocess_data(
+    data: Union[str, os.PathLike],
+    evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
+    target: Optional[Callable] = None,
+    output_path: Optional[Union[str, os.PathLike]] = None,
+    azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
+    evaluation_name: Optional[str] = None,
+    **kwargs,
+) -> __ValidatedData:
     # Process evaluator config to replace ${target.} with ${data.}
     if evaluator_config is None:
         evaluator_config = {}
+
+    input_data_df = _validate_and_load_data(
+        target,
+        data,
+        evaluators_and_graders,
+        output_path,
+        azure_ai_project,
+        evaluation_name
+    )
+    if target is not None:
+        _validate_columns_for_target(input_data_df, target)
+
     # extract column mapping dicts into dictionary mapping evaluator name to column mapping
     column_mapping = _process_column_mappings(
         {
@@ -754,23 +970,46 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
         }
     )

-    if target is not None:
-        _validate_columns_for_target(input_data_df, target)
-
-    Configuration.get_instance().set_config("trace.destination", "none")
-    pf_client = PFClient(user_agent=USER_AGENT)
-    target_run: Optional[Run] = None
-
     # Create default configuration for evaluators that directly maps
     # input data names to keyword inputs of the same name in the evaluators.
     column_mapping = column_mapping or {}
     column_mapping.setdefault("default", {})

-    # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
+    # Split normal evaluators and OAI graders
+    evaluators, graders = _split_evaluators_and_grader_configs(evaluators_and_graders)
+
+    input_data_df = _validate_and_load_data(
+        target,
+        data,
+        evaluators_and_graders,
+        output_path,
+        azure_ai_project,
+        evaluation_name
+    )
+    if target is not None:
+        _validate_columns_for_target(input_data_df, target)
+
+    target_run: Optional[BatchClientRun] = None
     target_generated_columns: Set[str] = set()
+    batch_run_client: BatchClient
+    batch_run_data: Union[str, os.PathLike, pd.DataFrame] = data
+
+    if kwargs.pop("_use_run_submitter_client", False):
+        batch_run_client = RunSubmitterClient()
+        batch_run_data = input_data_df
+    elif kwargs.pop("_use_pf_client", True):
+        batch_run_client = ProxyClient(user_agent=USER_AGENT)
+        # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
+        # multiple evaluators. If the path is already absolute, abspath will return the original path.
+        batch_run_data = os.path.abspath(data)
+    else:
+        batch_run_client = CodeClient()
+        batch_run_data = input_data_df
+
+    # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-            target, data, ProxyClient(pf_client), input_data_df, evaluation_name, **kwargs
+            target, batch_run_data, batch_run_client, input_data_df, evaluation_name, **kwargs
         )

     for evaluator_name, mapping in column_mapping.items():
@@ -799,46 +1038,55 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
         if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
             column_mapping["default"][col] = f"${{data.{col}}}"

-    def eval_batch_run(
-        batch_run_client: TClient, *, data=Union[str, os.PathLike, pd.DataFrame]
-    ) -> Dict[str, __EvaluatorInfo]:
-        with EvalRunContext(batch_run_client):
-            runs = {
-                evaluator_name: batch_run_client.run(
-                    flow=evaluator,
-                    run=target_run,
-                    evaluator_name=evaluator_name,
-                    column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
-                    data=data,
-                    stream=True,
-                    name=kwargs.get("_run_name"),
-                )
-                for evaluator_name, evaluator in evaluators.items()
-            }
+    return __ValidatedData(
+        evaluators=evaluators,
+        graders=graders,
+        input_data_df=input_data_df,
+        column_mapping=column_mapping,
+        target_run=target_run,
+        batch_run_client=batch_run_client,
+        batch_run_data=batch_run_data,
+    )

-            # get_details needs to be called within EvalRunContext scope in order to have user agent populated
-            return {
-                evaluator_name: {
-                    "result": batch_run_client.get_details(run, all_results=True),
-                    "metrics": batch_run_client.get_metrics(run),
-                    "run_summary": batch_run_client.get_run_summary(run),
-                }
-                for evaluator_name, run in runs.items()
-            }

-    # Batch Run
-    use_pf_client = kwargs.get("_use_pf_client", True)
-    if use_pf_client:
-        # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
-        # multiple evaluators. If the path is already absolute, abspath will return the original path.
-        data = os.path.abspath(data)
-        per_evaluator_results = eval_batch_run(ProxyClient(pf_client), data=data)
-    else:
-        data = input_data_df
-        per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)
+def _run_callable_evaluators(
+    validated_data: __ValidatedData,
+    fail_on_evaluator_errors: bool = False,
+    **kwargs,
+) -> Tuple[pd.DataFrame, Dict[str, Any], Dict[str, __EvaluatorInfo]]:
+
+    # Extract needed values
+    batch_run_client = validated_data["batch_run_client"]
+    target_run = validated_data["target_run"]
+    batch_run_data = validated_data["batch_run_data"]
+    column_mapping = validated_data["column_mapping"]
+    evaluators = validated_data["evaluators"]
+    with EvalRunContext(batch_run_client):
+        runs = {
+            evaluator_name: batch_run_client.run(
+                flow=evaluator,
+                data=batch_run_data,
+                run=target_run,
+                evaluator_name=evaluator_name,
+                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
+                stream=True,
+                name=kwargs.get("_run_name"),
+            )
+            for evaluator_name, evaluator in evaluators.items()
+        }
+
+        # get_details needs to be called within EvalRunContext scope in order to have user agent populated
+        per_evaluator_results: Dict[str, __EvaluatorInfo] = {
+            evaluator_name: {
+                "result": batch_run_client.get_details(run, all_results=True),
+                "metrics": batch_run_client.get_metrics(run),
+                "run_summary": batch_run_client.get_run_summary(run),
+            }
+            for evaluator_name, run in runs.items()
+        }

     # Concatenate all results
-    evaluators_result_df = None
+    evaluators_result_df = pd.DataFrame()
     evaluators_metric = {}
     for evaluator_name, evaluator_result in per_evaluator_results.items():
         if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0:
@@ -873,31 +1121,50 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     # Rename columns, generated by target function to outputs instead of inputs.
     # If target generates columns, already present in the input data, these columns
     # will be marked as outputs already so we do not need to rename them.
-    input_data_df = _rename_columns_conditionally(input_data_df)
-
-    result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
-    metrics = _aggregate_metrics(evaluators_result_df, evaluators)
-    metrics.update(evaluators_metric)
-
-    # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
-    target_run = None
-    trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
-    studio_url = None
-    if trace_destination:
-        studio_url = _log_metrics_and_instance_results(
-            metrics, result_df, trace_destination, target_run, evaluation_name, **kwargs
-        )

-    result_df_dict = result_df.to_dict("records")
-    result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore
+    input_data_df = _rename_columns_conditionally(validated_data["input_data_df"])
+    eval_result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
+    eval_metrics = _aggregate_metrics(evaluators_result_df, evaluators)
+    eval_metrics.update(evaluators_metric)

-    _print_summary(per_evaluator_results)
+    return eval_result_df, eval_metrics, per_evaluator_results

-    if output_path:
-        _write_output(output_path, result)
+def _map_names_to_builtins(
+    evaluators: Dict[str, Callable],
+    graders: Dict[str, AzureOpenAIGrader],
+) -> Dict[str, str]:
+    """
+    Construct a mapping from user-supplied evaluator names to which known, built-in
+    evaluator or grader they refer to. Custom or otherwise unknown evaluators are
+    mapped to the "unknown" value.

-    return result
+    :param evaluators: The dictionary of evaluators.
+    :type evaluators: Dict[str, Callable]
+    :param graders: The dictionary of graders.
+    :type graders: Dict[str, AzureOpenAIGrader]
+    :param evaluator_config: The configuration for evaluators.
+    :type evaluator_config: Optional[Dict[str, EvaluatorConfig]]
+
+    """
+    from .._eval_mapping import EVAL_CLASS_MAP
+    name_map = {}
+
+    for name, evaluator in evaluators.items():
+        # Check if the evaluator is a known built-in evaluator
+        found_eval = False
+        for eval_class, eval_id in EVAL_CLASS_MAP.items():
+            if isinstance(evaluator, eval_class):
+                name_map[name] = eval_id
+                found_eval = True
+                break
+        if not found_eval:
+            # If not found, map to "unknown"
+            name_map[name] = "unknown"
+
+    for name, grader in graders.items():
+        name_map[name] = grader.id

+    return name_map

 def _turn_error_logs_into_exception(log_path: str) -> None:
     """Produce an EvaluationException using the contents of the inputted
@@ -913,4 +1180,4 @@ def _turn_error_logs_into_exception(log_path: str) -> None:
         target=ErrorTarget.EVALUATE,
         category=ErrorCategory.FAILED_EXECUTION,
         blame=ErrorBlame.UNKNOWN,
-    )
+    )
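
Two smaller behavior changes in this diff are worth noting. First, _aggregation_binary_output adds a per-evaluator pass rate computed from outputs.<evaluator>.<metric>_result columns and keyed as <evaluator>.<BINARY_AGGREGATE_SUFFIX>. Below is a standalone sketch of that aggregation, assuming the pass label is the string "pass" and using "binary_aggregate" as a stand-in for the suffix; neither constant's actual value appears in this diff.

# Sketch only: mirrors the pass-rate logic, with assumed label and suffix values.
import pandas as pd

df = pd.DataFrame({
    "outputs.relevance.relevance_result": ["pass", "fail", "pass", "pass"],
})

metrics = {}
result_columns = [c for c in df.columns if c.startswith("outputs.") and c.endswith("_result")]
for col in result_columns:
    evaluator_name = col.split(".")[1]  # outputs.<evaluator>.<metric>_result
    # Proportion of rows whose result equals the assumed pass label.
    pass_rate = float((df[col] == "pass").mean()) if len(df) else 0.0
    metrics[f"{evaluator_name}.binary_aggregate"] = round(pass_rate, 2)

print(metrics)  # {'relevance.binary_aggregate': 0.75}

Second, the column-reference pattern in _process_column_mappings now accepts digits, so mappings such as ${data.context2} validate where they previously did not. A small check of the two patterns taken from the diff:

import re

old_pattern = re.compile(r"^\$\{(target|data)\.[a-zA-Z_]+\}$")
new_pattern = re.compile(r"^\$\{(target|data)\.[a-zA-Z0-9_]+\}$")

ref = "${data.context2}"
print(bool(old_pattern.match(ref)))  # False: digits were rejected in 1.4.0
print(bool(new_pattern.match(ref)))  # True: accepted after this change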