azure-ai-evaluation 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff covers the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.

Note: this version of azure-ai-evaluation has been flagged as potentially problematic.

Files changed (144)
  1. azure/ai/evaluation/__init__.py +10 -0
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +66 -0
  5. azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
  6. azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
  7. azure/ai/evaluation/_azure/_clients.py +4 -4
  8. azure/ai/evaluation/_azure/_envs.py +208 -0
  9. azure/ai/evaluation/_azure/_token_manager.py +12 -7
  10. azure/ai/evaluation/_common/__init__.py +7 -0
  11. azure/ai/evaluation/_common/evaluation_onedp_client.py +163 -0
  12. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  13. azure/ai/evaluation/_common/onedp/_client.py +139 -0
  14. azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
  15. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  16. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  17. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  18. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  19. azure/ai/evaluation/_common/onedp/_validation.py +50 -0
  20. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  21. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  22. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  23. azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
  24. azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
  25. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  26. azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
  27. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
  28. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
  29. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  30. azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
  31. azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
  32. azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
  33. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
  35. azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
  36. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  38. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  39. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  40. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  41. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  42. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  43. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  44. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  54. azure/ai/evaluation/_common/rai_service.py +165 -34
  55. azure/ai/evaluation/_common/raiclient/_version.py +1 -1
  56. azure/ai/evaluation/_common/utils.py +79 -1
  57. azure/ai/evaluation/_constants.py +16 -0
  58. azure/ai/evaluation/_converters/_ai_services.py +162 -118
  59. azure/ai/evaluation/_converters/_models.py +76 -6
  60. azure/ai/evaluation/_eval_mapping.py +73 -0
  61. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
  62. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +8 -0
  63. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +5 -0
  64. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +17 -1
  65. azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
  66. azure/ai/evaluation/_evaluate/_evaluate.py +325 -76
  67. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +553 -0
  68. azure/ai/evaluation/_evaluate/_utils.py +117 -4
  69. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +11 -1
  70. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +9 -1
  71. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +12 -2
  72. azure/ai/evaluation/_evaluators/_common/_base_eval.py +12 -3
  73. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
  74. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
  75. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +12 -2
  76. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +14 -4
  77. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +9 -8
  78. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +10 -0
  79. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -0
  80. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
  81. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +469 -0
  82. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +10 -0
  83. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +11 -1
  84. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +10 -0
  85. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -1
  86. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +16 -2
  87. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +10 -0
  88. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +11 -0
  89. azure/ai/evaluation/_evaluators/_qa/_qa.py +10 -0
  90. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +11 -1
  91. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -2
  92. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
  93. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +10 -0
  94. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +10 -0
  95. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +10 -0
  96. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +11 -1
  97. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +16 -2
  98. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +86 -12
  99. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +10 -0
  100. azure/ai/evaluation/_evaluators/_xpia/xpia.py +11 -0
  101. azure/ai/evaluation/_exceptions.py +2 -0
  102. azure/ai/evaluation/_legacy/_adapters/__init__.py +0 -14
  103. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  104. azure/ai/evaluation/_legacy/_adapters/_flows.py +1 -1
  105. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
  106. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
  107. azure/ai/evaluation/_legacy/_batch_engine/_result.py +6 -0
  108. azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
  109. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
  110. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
  111. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
  112. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  113. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
  114. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
  115. azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
  116. azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
  117. azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
  118. azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
  119. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +114 -22
  120. azure/ai/evaluation/_version.py +1 -1
  121. azure/ai/evaluation/red_team/_attack_strategy.py +1 -1
  122. azure/ai/evaluation/red_team/_red_team.py +976 -546
  123. azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
  124. azure/ai/evaluation/red_team/_utils/strategy_utils.py +1 -1
  125. azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
  126. azure/ai/evaluation/simulator/_constants.py +1 -0
  127. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
  128. azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
  129. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  130. azure/ai/evaluation/simulator/_direct_attack_simulator.py +38 -25
  131. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  132. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +43 -28
  133. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  134. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +26 -18
  135. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
  136. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
  137. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +15 -10
  138. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  139. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/METADATA +49 -3
  140. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/RECORD +144 -86
  141. /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
  142. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/NOTICE.txt +0 -0
  143. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/WHEEL +0 -0
  144. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate.py
@@ -8,20 +8,26 @@ import os
  import re
  from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, Union, cast

+ from openai import OpenAI, AzureOpenAI
  from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
  from azure.ai.evaluation._legacy._adapters.entities import Run
  import pandas as pd

  from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
- from azure.ai.evaluation._common.utils import validate_azure_ai_project
+ from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

+ from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader
+
  from .._constants import (
      CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
+     EVALUATION_PASS_FAIL_MAPPING,
      EvaluationMetrics,
      DefaultOpenEncoding,
      Prefixes,
      _InternalEvaluationMetrics,
+     BINARY_AGGREGATE_SUFFIX,
+     DEFAULT_OAI_EVAL_RUN_NAME
  )
  from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
  from .._user_agent import USER_AGENT
@@ -29,7 +35,6 @@ from ._batch_run import (
      EvalRunContext,
      CodeClient,
      ProxyClient,
-     ProxyRun,
      TargetRunContext,
      RunSubmitterClient,
  )
@@ -38,16 +43,22 @@ from ._utils import (
      _log_metrics_and_instance_results,
      _trace_destination_from_project_scope,
      _write_output,
-     DataLoaderFactory,
+     DataLoaderFactory, _log_metrics_and_instance_results_onedp,
  )
- from ._batch_run.batch_clients import BatchClient
+ from ._batch_run.batch_clients import BatchClient, BatchClientRun

+ from ._evaluate_aoai import (
+     _begin_aoai_evaluation,
+     _split_evaluators_and_grader_configs,
+     _get_evaluation_run_results,
+     OAIEvalRunCreationInfo
+ )
  LOGGER = logging.getLogger(__name__)

  # For metrics (aggregates) whose metric names intentionally differ from their
  # originating column name, usually because the aggregation of the original value
  # means something sufficiently different.
- # Note that content safety metrics are handled seprately.
+ # Note that content safety metrics are handled separately.
  METRIC_COLUMN_NAME_REPLACEMENTS = {
      "groundedness_pro_label": "groundedness_pro_passing_rate",
  }
@@ -58,6 +69,19 @@ class __EvaluatorInfo(TypedDict):
      metrics: Dict[str, Any]
      run_summary: Dict[str, Any]

+ class __ValidatedData(TypedDict):
+     '''
+     Simple dictionary that contains ALL pre-processed data and
+     the resultant objects that are needed for downstream evaluation.
+     '''
+     evaluators: Dict[str, Callable]
+     graders: Dict[str, AzureOpenAIGrader]
+     input_data_df: pd.DataFrame
+     column_mapping: Dict[str, Dict[str, str]]
+     target_run: Optional[BatchClientRun]
+     batch_run_client: BatchClient
+     batch_run_data: Union[str, os.PathLike, pd.DataFrame]
+

  def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
      """Identify and average various metrics that need to have the metric name be replaced,
@@ -117,7 +141,6 @@ def _aggregate_content_safety_metrics(
          module = inspect.getmodule(evaluators[evaluator_name])
          if (
              module
-             and module.__name__.startswith("azure.ai.evaluation.")
              and metric_name.endswith("_score")
              and metric_name.replace("_score", "") in content_safety_metrics
          ):
@@ -208,6 +231,48 @@ def _process_rows(row, detail_defect_rates):
      return detail_defect_rates


+ def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
+     """
+     Aggregate binary output results (pass/fail) from evaluation dataframe.
+
+     For each evaluator, calculates the proportion of "pass" results.
+
+     :param df: The dataframe of evaluation results.
+     :type df: ~pandas.DataFrame
+     :return: A dictionary mapping evaluator names to the proportion of pass results.
+     :rtype: Dict[str, float]
+     """
+     results = {}
+
+     # Find all columns that end with "_result"
+     result_columns = [col for col in df.columns if col.startswith("outputs.") and col.endswith("_result")]
+
+     for col in result_columns:
+         # Extract the evaluator name from the column name
+         # (outputs.<evaluator>.<metric>_result)
+         parts = col.split(".")
+         evaluator_name = None
+         if len(parts) >= 3:
+             evaluator_name = parts[1]
+         else:
+             LOGGER.warning("Skipping column '%s' due to unexpected format. Expected at least three parts separated by '.'", col)
+             continue
+         if evaluator_name:
+             # Count the occurrences of each unique value (pass/fail)
+             value_counts = df[col].value_counts().to_dict()
+
+             # Calculate the proportion of EVALUATION_PASS_FAIL_MAPPING[True] results
+             total_rows = len(df)
+             pass_count = value_counts.get(EVALUATION_PASS_FAIL_MAPPING[True], 0)
+             proportion = pass_count / total_rows if total_rows > 0 else 0.0
+
+             # Set the result with the evaluator name as the key
+             result_key = f"{evaluator_name}.{BINARY_AGGREGATE_SUFFIX}"
+             results[result_key] = round(proportion, 2)
+
+     return results
+
+
  def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
      """Aggregate metrics from the evaluation results.
      On top of naively calculating the mean of most metrics, this function also identifies certain columns
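
The _aggregation_binary_output helper added in the hunk above is what produces the new per-evaluator pass-rate metrics. A minimal standalone sketch of the same computation, assuming EVALUATION_PASS_FAIL_MAPPING[True] resolves to the string "pass" and BINARY_AGGREGATE_SUFFIX to "binary_aggregate" (both constants are introduced in _constants.py in this release and their values are not shown in this diff):

    import pandas as pd

    df = pd.DataFrame({
        "outputs.relevance.relevance_result": ["pass", "fail", "pass", "pass"],
        "outputs.fluency.fluency_result": ["fail", "fail", "pass", "pass"],
    })

    results = {}
    for col in [c for c in df.columns if c.startswith("outputs.") and c.endswith("_result")]:
        evaluator_name = col.split(".")[1]
        pass_count = df[col].value_counts().to_dict().get("pass", 0)  # assumed pass label
        results[f"{evaluator_name}.binary_aggregate"] = round(pass_count / len(df), 2)

    print(results)  # {'relevance.binary_aggregate': 0.75, 'fluency.binary_aggregate': 0.5}
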
@@ -221,6 +286,8 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
      :return: The aggregated metrics.
      :rtype: Dict[str, float]
      """
+     binary_metrics = _aggregation_binary_output(df)
+
      df.rename(columns={col: col.replace("outputs.", "") for col in df.columns}, inplace=True)

      handled_columns = []
@@ -248,6 +315,10 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
      metrics = mean_value.to_dict()
      # Add defect rates back into metrics
      metrics.update(defect_rates)
+
+     # Add binary threshold metrics based on pass/fail results
+     metrics.update(binary_metrics)
+
      return metrics

@@ -486,12 +557,12 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj


  def _apply_target_to_data(
      target: Callable,
-     data: Union[str, os.PathLike],
+     data: Union[str, os.PathLike, pd.DataFrame],
      batch_client: BatchClient,
      initial_data: pd.DataFrame,
      evaluation_name: Optional[str] = None,
      **kwargs,
- ) -> Tuple[pd.DataFrame, Set[str], Run]:
+ ) -> Tuple[pd.DataFrame, Set[str], BatchClientRun]:
      """
      Apply the target function to the data set and return updated data and generated columns.

@@ -509,24 +580,18 @@ def _apply_target_to_data(
      :rtype: Tuple[pandas.DataFrame, List[str]]
      """

-     if not isinstance(batch_client, ProxyClient):
-         raise ValueError("Only ProxyClient supports target runs for now.")
-
      _run_name = kwargs.get("_run_name")
-     with TargetRunContext():
-         run = cast(
-             ProxyRun,
-             batch_client.run(
-                 flow=target,
-                 display_name=evaluation_name,
-                 data=data,
-                 stream=True,
-                 name=_run_name,
-             ),
+     with TargetRunContext(batch_client):
+         run: BatchClientRun = batch_client.run(
+             flow=target,
+             display_name=evaluation_name,
+             data=data,
+             stream=True,
+             name=_run_name,
+             evaluator_name=getattr(target, "__qualname__", "TARGET"),
          )
-
-     target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
-     run_summary = batch_client.get_run_summary(run)
+         target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
+         run_summary = batch_client.get_run_summary(run)

      if run_summary["completed_lines"] == 0:
          msg = (
@@ -557,7 +622,7 @@ def _apply_target_to_data(
      # Concatenate output to input
      target_output = pd.concat([target_output, initial_data], axis=1)

-     return target_output, generated_columns, run.run.result()
+     return target_output, generated_columns, run


  def _process_column_mappings(
@@ -573,7 +638,7 @@ def _process_column_mappings(

      processed_config: Dict[str, Dict[str, str]] = {}

-     expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z_]+\}$")
+     expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z0-9_]+\}$")

      if column_mapping:
          for evaluator, mapping_config in column_mapping.items():
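
The widened expected_references pattern above now accepts digits in referenced column names, which the 1.5.0 pattern rejected. A quick check of the before/after behaviour:

    import re

    old_pattern = re.compile(r"^\$\{(target|data)\.[a-zA-Z_]+\}$")
    new_pattern = re.compile(r"^\$\{(target|data)\.[a-zA-Z0-9_]+\}$")

    reference = "${data.context_1}"
    print(bool(old_pattern.match(reference)))  # False: digits were rejected in 1.5.0
    print(bool(new_pattern.match(reference)))  # True: accepted in 1.7.0
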
@@ -625,11 +690,11 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
  def evaluate(
      *,
      data: Union[str, os.PathLike],
-     evaluators: Dict[str, Callable],
+     evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]],
      evaluation_name: Optional[str] = None,
      target: Optional[Callable] = None,
      evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
-     azure_ai_project: Optional[AzureAIProject] = None,
+     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
      output_path: Optional[Union[str, os.PathLike]] = None,
      fail_on_evaluator_errors: bool = False,
      **kwargs,
@@ -641,8 +706,9 @@ def evaluate(
          JSONL and CSV files are supported. `target` and `data` both cannot be None. Required.
      :paramtype data: str
      :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
-         and value as the evaluator function. Required.
-     :paramtype evaluators: Dict[str, Callable]
+         and value as the evaluator function. Also accepts AzureOpenAIGrader instances as values, which are processed separately.
+         Required.
+     :paramtype evaluators: Dict[str, Union[Callable, ~azure.ai.evaluation.AzureOpenAIGrader]]
      :keyword evaluation_name: Display name of the evaluation.
      :paramtype evaluation_name: Optional[str]
      :keyword target: Target to be evaluated. `target` and `data` both cannot be None
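
As the updated docstring notes, the evaluators mapping may now mix plain callables with AzureOpenAIGrader instances, and azure_ai_project may be a project URL string. A rough sketch of such a call; answer_length is a hypothetical custom evaluator, and my_grader stands in for an AzureOpenAIGrader built with the grader classes added under azure/ai/evaluation/_aoai in this release (their constructors are not shown in this hunk):

    from azure.ai.evaluation import evaluate

    def answer_length(*, response: str, **kwargs):
        # Hypothetical callable evaluator; callables are handled locally as before.
        return {"answer_length": len(response)}

    my_grader = ...  # assumed: an AzureOpenAIGrader instance constructed elsewhere

    result = evaluate(
        data="data.jsonl",
        evaluators={
            "length": answer_length,  # plain callable, runs through the local batch clients
            "checker": my_grader,     # AzureOpenAIGrader, routed to a remote Azure OpenAI eval run
        },
        # The URL form of azure_ai_project described in the docstring (placeholders to be substituted):
        azure_ai_project="https://{resource_name}.services.ai.azure.com/api/projects/{project_name}",
    )
    print(result["metrics"])
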
@@ -672,14 +738,24 @@ def evaluate(
              :end-before: [END evaluate_method]
              :language: python
              :dedent: 8
-             :caption: Run an evaluation on local data with Coherence and Relevance evaluators.
+             :caption: Run an evaluation on local data with one or more evaluators using azure.ai.evaluation.AzureAIProject
+
+     .. admonition:: Example using Azure AI Project URL:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+             :start-after: [START evaluate_method]
+             :end-before: [END evaluate_method]
+             :language: python
+             :dedent: 8
+             :caption: Run an evaluation on local data with one or more evaluators using Azure AI Project URL in following format
+                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
      """
      try:
          return _evaluate(
              evaluation_name=evaluation_name,
              target=target,
              data=data,
-             evaluators=evaluators,
+             evaluators_and_graders=evaluators,
              evaluator_config=evaluator_config,
              azure_ai_project=azure_ai_project,
              output_path=output_path,
@@ -744,23 +820,157 @@ def _print_fail_flag_warning() -> None:

  def _evaluate( # pylint: disable=too-many-locals,too-many-statements
      *,
-     evaluators: Dict[str, Callable],
+     evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
      evaluation_name: Optional[str] = None,
      target: Optional[Callable] = None,
      data: Union[str, os.PathLike],
      evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
-     azure_ai_project: Optional[AzureAIProject] = None,
+     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
      output_path: Optional[Union[str, os.PathLike]] = None,
      fail_on_evaluator_errors: bool = False,
      **kwargs,
  ) -> EvaluationResult:
      if fail_on_evaluator_errors:
          _print_fail_flag_warning()
-     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)
+
+     # Turn inputted mess of data into a dataframe, apply targets if needed
+     # split graders and evaluators, and verify that column mappings are sensible.
+     validated_data = _preprocess_data(
+         data=data,
+         evaluators_and_graders=evaluators_and_graders,
+         evaluator_config=evaluator_config,
+         target=target,
+         output_path=output_path,
+         azure_ai_project=azure_ai_project,
+         evaluation_name=evaluation_name,
+         **kwargs,
+     )
+
+     # extract relevant info from validated data
+     column_mapping = validated_data["column_mapping"]
+     evaluators = validated_data["evaluators"]
+     graders = validated_data["graders"]
+     input_data_df = validated_data["input_data_df"]
+     results_df = pd.DataFrame()
+     metrics: Dict[str, float] = {}
+     eval_run_info_list: List[OAIEvalRunCreationInfo] = []
+
+     # Start OAI eval runs if any graders are present.
+     need_oai_run = len(graders) > 0
+     need_local_run = len(evaluators) > 0
+     need_get_oai_results = False
+     got_local_results = False
+     if need_oai_run:
+         try:
+             aoi_name = evaluation_name if evaluation_name else DEFAULT_OAI_EVAL_RUN_NAME
+             eval_run_info_list = _begin_aoai_evaluation(
+                 graders,
+                 column_mapping,
+                 input_data_df,
+                 aoi_name
+             )
+             need_get_oai_results = len(eval_run_info_list) > 0
+         except EvaluationException as e:
+             if need_local_run:
+                 # If there are normal evaluators, don't stop execution and try to run
+                 # those.
+                 LOGGER.warning("Remote Azure Open AI grader evaluations failed during run creation." +
+                     " Continuing with local evaluators.")
+                 LOGGER.warning(e)
+             else:
+                 raise e
+
+     # Evaluate 'normal' evaluators. This includes built-in evaluators and any user-supplied callables.
+     if need_local_run:
+         try:
+             eval_result_df, eval_metrics, per_evaluator_results = _run_callable_evaluators(
+                 validated_data=validated_data,
+                 fail_on_evaluator_errors=fail_on_evaluator_errors
+             )
+             results_df = eval_result_df
+             metrics = eval_metrics
+             got_local_results = True
+             # TODO figure out how to update this printing to include OAI results?
+             _print_summary(per_evaluator_results)
+         except EvaluationException as e:
+             if need_get_oai_results:
+                 # If there are OAI graders, we only print a warning on local failures.
+                 LOGGER.warning("Local evaluations failed. Will still attempt to retrieve online grader results.")
+                 LOGGER.warning(e)
+             else:
+                 raise e
+
+     # Retrieve OAI eval run results if needed.
+     if need_get_oai_results:
+         try:
+             aoai_results, aoai_metrics = _get_evaluation_run_results(eval_run_info_list)  # type: ignore
+             # Post build TODO: add equivalent of _print_summary(per_evaluator_results) here
+
+             # Combine results if both evaluators and graders are present
+             if len(evaluators) > 0:
+                 results_df = pd.concat([results_df, aoai_results], axis=1)
+                 metrics.update(aoai_metrics)
+             else:
+                 # Otherwise combine aoai results with input data df to include input columns in outputs.
+                 results_df = pd.concat([input_data_df, aoai_results], axis=1)
+                 metrics = aoai_metrics
+         except EvaluationException as e:
+             if got_local_results:
+                 # If there are local eval results, we only print a warning on OAI failure.
+                 LOGGER.warning("Remote Azure Open AI grader evaluations failed. Still returning local results.")
+                 LOGGER.warning(e)
+             else:
+                 raise e
+
+     # Done with all evaluations, message outputs into final forms, and log results if needed.
+     name_map = _map_names_to_builtins(evaluators, graders)
+     if is_onedp_project(azure_ai_project):
+         studio_url = _log_metrics_and_instance_results_onedp(
+             metrics, results_df, azure_ai_project, evaluation_name, name_map, **kwargs
+         )
+     else:
+         # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
+         trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
+         studio_url = None
+         if trace_destination:
+             studio_url = _log_metrics_and_instance_results(
+                 metrics, results_df, trace_destination, None, evaluation_name, name_map, **kwargs
+             )
+
+     result_df_dict = results_df.to_dict("records")
+     result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore
+
+     if output_path:
+         _write_output(output_path, result)
+
+     return result
+

+ def _preprocess_data(
+     data: Union[str, os.PathLike],
+     evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
+     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
+     target: Optional[Callable] = None,
+     output_path: Optional[Union[str, os.PathLike]] = None,
+     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
+     evaluation_name: Optional[str] = None,
+     **kwargs,
+ ) -> __ValidatedData:
      # Process evaluator config to replace ${target.} with ${data.}
      if evaluator_config is None:
          evaluator_config = {}
+
+     input_data_df = _validate_and_load_data(
+         target,
+         data,
+         evaluators_and_graders,
+         output_path,
+         azure_ai_project,
+         evaluation_name
+     )
+     if target is not None:
+         _validate_columns_for_target(input_data_df, target)
+
      # extract column mapping dicts into dictionary mapping evaluator name to column mapping
      column_mapping = _process_column_mappings(
          {
@@ -769,27 +979,35 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
          }
      )

-     if target is not None:
-         _validate_columns_for_target(input_data_df, target)
-
      # Create default configuration for evaluators that directly maps
      # input data names to keyword inputs of the same name in the evaluators.
      column_mapping = column_mapping or {}
      column_mapping.setdefault("default", {})

-     target_run: Optional[Run] = None
+     # Split normal evaluators and OAI graders
+     evaluators, graders = _split_evaluators_and_grader_configs(evaluators_and_graders)
+
+     target_run: Optional[BatchClientRun] = None
      target_generated_columns: Set[str] = set()
      batch_run_client: BatchClient
      batch_run_data: Union[str, os.PathLike, pd.DataFrame] = data

-     # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
-     if data is not None and target is not None:
-         # Right now, only the ProxyClient that uses Promptflow supports a target function
+     if kwargs.pop("_use_run_submitter_client", False):
+         batch_run_client = RunSubmitterClient()
+         batch_run_data = input_data_df
+     elif kwargs.pop("_use_pf_client", True):
          batch_run_client = ProxyClient(user_agent=USER_AGENT)
+         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
+         # multiple evaluators. If the path is already absolute, abspath will return the original path.
          batch_run_data = os.path.abspath(data)
+     else:
+         batch_run_client = CodeClient()
+         batch_run_data = input_data_df

+     # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
+     if data is not None and target is not None:
          input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-             target, data, batch_run_client, input_data_df, evaluation_name, **kwargs
+             target, batch_run_data, batch_run_client, input_data_df, evaluation_name, **kwargs
          )

      for evaluator_name, mapping in column_mapping.items():
@@ -803,17 +1021,6 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
                  # customer did not mapped target output.
                  if col not in mapping and run_output not in mapped_to_values:
                      column_mapping[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup
-     elif kwargs.pop("_use_run_submitter_client", False):
-         batch_run_client = RunSubmitterClient()
-         batch_run_data = input_data_df
-     elif kwargs.pop("_use_pf_client", True):
-         batch_run_client = ProxyClient(user_agent=USER_AGENT)
-         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
-         # multiple evaluators. If the path is already absolute, abspath will return the original path.
-         batch_run_data = os.path.abspath(data)
-     else:
-         batch_run_client = CodeClient()
-         batch_run_data = input_data_df

      # After we have generated all columns, we can check if we have everything we need for evaluators.
      _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)
@@ -829,6 +1036,29 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
          if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
              column_mapping["default"][col] = f"${{data.{col}}}"

+     return __ValidatedData(
+         evaluators=evaluators,
+         graders=graders,
+         input_data_df=input_data_df,
+         column_mapping=column_mapping,
+         target_run=target_run,
+         batch_run_client=batch_run_client,
+         batch_run_data=batch_run_data,
+     )
+
+
+ def _run_callable_evaluators(
+     validated_data: __ValidatedData,
+     fail_on_evaluator_errors: bool = False,
+     **kwargs,
+ ) -> Tuple[pd.DataFrame, Dict[str, Any], Dict[str, __EvaluatorInfo]]:
+
+     # Extract needed values
+     batch_run_client = validated_data["batch_run_client"]
+     target_run = validated_data["target_run"]
+     batch_run_data = validated_data["batch_run_data"]
+     column_mapping = validated_data["column_mapping"]
+     evaluators = validated_data["evaluators"]
      with EvalRunContext(batch_run_client):
          runs = {
              evaluator_name: batch_run_client.run(
@@ -889,31 +1119,50 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
      # Rename columns, generated by target function to outputs instead of inputs.
      # If target generates columns, already present in the input data, these columns
      # will be marked as outputs already so we do not need to rename them.
-     input_data_df = _rename_columns_conditionally(input_data_df)
-
-     result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
-     metrics = _aggregate_metrics(evaluators_result_df, evaluators)
-     metrics.update(evaluators_metric)
-
-     # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
-     target_run: Optional[Run] = None
-     trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
-     studio_url = None
-     if trace_destination:
-         studio_url = _log_metrics_and_instance_results(
-             metrics, result_df, trace_destination, target_run, evaluation_name, **kwargs
-         )

-     result_df_dict = result_df.to_dict("records")
-     result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore
+     input_data_df = _rename_columns_conditionally(validated_data["input_data_df"])
+     eval_result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
+     eval_metrics = _aggregate_metrics(evaluators_result_df, evaluators)
+     eval_metrics.update(evaluators_metric)

-     _print_summary(per_evaluator_results)
-
-     if output_path:
-         _write_output(output_path, result)
+     return eval_result_df, eval_metrics, per_evaluator_results

-     return result
+
+ def _map_names_to_builtins(
+     evaluators: Dict[str, Callable],
+     graders: Dict[str, AzureOpenAIGrader],
+ ) -> Dict[str, str]:
+     """
+     Construct a mapping from user-supplied evaluator names to which known, built-in
+     evaluator or grader they refer to. Custom or otherwise unknown evaluators are
+     mapped to the "unknown" value.

+     :param evaluators: The dictionary of evaluators.
+     :type evaluators: Dict[str, Callable]
+     :param graders: The dictionary of graders.
+     :type graders: Dict[str, AzureOpenAIGrader]
+     :param evaluator_config: The configuration for evaluators.
+     :type evaluator_config: Optional[Dict[str, EvaluatorConfig]]
+
+     """
+     from .._eval_mapping import EVAL_CLASS_MAP
+     name_map = {}
+
+     for name, evaluator in evaluators.items():
+         # Check if the evaluator is a known built-in evaluator
+         found_eval = False
+         for eval_class, eval_id in EVAL_CLASS_MAP.items():
+             if isinstance(evaluator, eval_class):
+                 name_map[name] = eval_id
+                 found_eval = True
+                 break
+         if not found_eval:
+             # If not found, map to "unknown"
+             name_map[name] = "unknown"
+
+     for name, grader in graders.items():
+         name_map[name] = grader.id
+
+     return name_map
+

  def _turn_error_logs_into_exception(log_path: str) -> None:
      """Produce an EvaluationException using the contents of the inputted
@@ -929,4 +1178,4 @@ def _turn_error_logs_into_exception(log_path: str) -> None:
          target=ErrorTarget.EVALUATE,
          category=ErrorCategory.FAILED_EXECUTION,
          blame=ErrorBlame.UNKNOWN,
-     )
+     )
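
Stepping back from the final hunk: the _map_names_to_builtins helper added earlier in this file produces the name_map that the new logging calls receive. A toy illustration of the mapping rule with stand-in classes (EVAL_CLASS_MAP lives in the new _eval_mapping.py, whose contents are not shown inline here, so the entries below are hypothetical):

    class FakeRelevanceEvaluator:  # stands in for a built-in evaluator class
        pass

    class FakeGrader:              # stands in for an AzureOpenAIGrader
        id = "some_grader_id"      # the real id format is not shown in this diff

    EVAL_CLASS_MAP = {FakeRelevanceEvaluator: "builtin.relevance"}  # hypothetical entry

    def map_names(evaluators, graders):
        name_map = {}
        for name, ev in evaluators.items():
            name_map[name] = next(
                (eval_id for eval_class, eval_id in EVAL_CLASS_MAP.items() if isinstance(ev, eval_class)),
                "unknown",
            )
        for name, grader in graders.items():
            name_map[name] = grader.id
        return name_map

    print(map_names(
        {"rel": FakeRelevanceEvaluator(), "custom": lambda **kw: {}},
        {"check": FakeGrader()},
    ))
    # {'rel': 'builtin.relevance', 'custom': 'unknown', 'check': 'some_grader_id'}
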