azure-ai-evaluation 1.0.0b4__py3-none-any.whl → 1.0.0b5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +22 -0
- azure/ai/evaluation/_common/constants.py +5 -0
- azure/ai/evaluation/_common/math.py +11 -0
- azure/ai/evaluation/_common/rai_service.py +172 -35
- azure/ai/evaluation/_common/utils.py +162 -23
- azure/ai/evaluation/_constants.py +6 -6
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +4 -4
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +6 -3
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +35 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +21 -4
- azure/ai/evaluation/_evaluate/_evaluate.py +267 -139
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -5
- azure/ai/evaluation/_evaluate/_utils.py +40 -7
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +14 -9
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +20 -19
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +18 -8
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +48 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +56 -19
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +5 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +30 -1
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +30 -1
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +30 -1
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +30 -1
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +20 -20
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +49 -15
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -7
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +130 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +57 -0
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +120 -0
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +96 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +44 -11
- azure/ai/evaluation/_evaluators/_qa/_qa.py +7 -3
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -19
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +125 -82
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +2 -2
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +150 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +17 -14
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +32 -5
- azure/ai/evaluation/_exceptions.py +17 -0
- azure/ai/evaluation/_model_configurations.py +18 -1
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +5 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +4 -1
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +22 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +79 -34
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -1
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -4
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -1
- azure/ai/evaluation/simulator/_simulator.py +115 -61
- azure/ai/evaluation/simulator/_utils.py +6 -6
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/METADATA +166 -9
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/NOTICE.txt +20 -0
- azure_ai_evaluation-1.0.0b5.dist-info/RECORD +120 -0
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
- azure_ai_evaluation-1.0.0b4.dist-info/RECORD +0 -106
- /azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +0 -0
- /azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +0 -0
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/top_level.txt +0 -0
Diff of azure/ai/evaluation/_evaluate/_evaluate.py:

@@ -2,18 +2,19 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import inspect
+import json
 import os
 import re
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
-import json

 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
+from promptflow._sdk._errors import MissingAzurePackage, UserAuthenticationError, UploadInternalError
 from promptflow.client import PFClient
 from promptflow.entities import Run
-from promptflow._sdk._errors import MissingAzurePackage

 from azure.ai.evaluation._common.math import list_sum
+from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

 from .._constants import (

@@ -23,11 +24,10 @@ from .._constants import (
     Prefixes,
     _InternalEvaluationMetrics,
 )
-from .._model_configurations import AzureAIProject, EvaluatorConfig
+from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
 from .._user_agent import USER_AGENT
-from .
+from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext
 from ._utils import (
-    EvaluateResult,
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,

@@ -36,6 +36,14 @@ from ._utils import (

 TClient = TypeVar("TClient", ProxyClient, CodeClient)

+# For metrics (aggregates) whose metric names intentionally differ from their
+# originating column name, usually because the aggregation of the original value
+# means something sufficiently different.
+# Note that content safety metrics are handled seprately.
+METRIC_COLUMN_NAME_REPLACEMENTS = {
+    "groundedness_pro_label": "groundedness_pro_passing_rate",
+}
+

 class __EvaluatorInfo(TypedDict):
     result: pd.DataFrame

@@ -43,6 +51,32 @@ class __EvaluatorInfo(TypedDict):
     run_summary: Dict[str, Any]


+def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
+    """Identify and average various metrics that need to have the metric name be replaced,
+    instead of having the metric match the originating column name.
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :return: A tuple; the first element is a list of dataframe columns that were aggregated,
+    and the second element is a dictionary of resultant new metric column names and their values.
+    :rtype: Tuple[List[str], Dict[str, float]]
+    """
+    renamed_cols = []
+    metric_columns = {}
+    for col in df.columns:
+        metric_prefix = col.split(".")[0]
+        metric_name = col.split(".")[1]
+        if metric_name in METRIC_COLUMN_NAME_REPLACEMENTS:
+            renamed_cols.append(col)
+            new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
+            col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
+            metric_columns[new_col_name] = round(
+                list_sum(col_with_numeric_values) / col_with_numeric_values.count(),
+                2,
+            )
+
+    return renamed_cols, metric_columns
+
+
 # pylint: disable=line-too-long
 def _aggregate_content_safety_metrics(
     df: pd.DataFrame, evaluators: Dict[str, Callable]

@@ -146,8 +180,11 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     # Rename certain columns as defect rates if we know that's what their aggregates represent
     # Content safety metrics
     content_safety_cols, cs_defect_rates = _aggregate_content_safety_metrics(df, evaluators)
+    other_renamed_cols, renamed_cols = _aggregate_other_metrics(df)
     handled_columns.extend(content_safety_cols)
+    handled_columns.extend(other_renamed_cols)
     defect_rates.update(cs_defect_rates)
+    defect_rates.update(renamed_cols)
     # Label-based (true/false) metrics where 'true' means 'something is wrong'
     label_cols, label_defect_rates = _aggregate_label_defect_metrics(df)
     handled_columns.extend(label_cols)
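The hunks above introduce `METRIC_COLUMN_NAME_REPLACEMENTS` and `_aggregate_other_metrics`, so that aggregates whose meaning differs from the per-row column (currently only `groundedness_pro_label`) are reported under a replacement name. A minimal sketch of the averaging, using a toy dataframe in the "<evaluator>.<metric>" column shape the aggregation helpers expect (the data here is illustrative, not SDK output):

```python
import pandas as pd

# Toy per-row results: one boolean label per evaluated row.
df = pd.DataFrame({"groundedness_pro.groundedness_pro_label": [True, True, False, True]})

col = "groundedness_pro.groundedness_pro_label"
numeric = pd.to_numeric(df[col], errors="coerce")

# Same averaging as the new helper: mean of the numeric values, rounded to 2 places,
# surfaced under the replacement name rather than the originating column name.
passing_rate = round(numeric.sum() / numeric.count(), 2)
print({"groundedness_pro.groundedness_pro_passing_rate": passing_rate})  # 0.75
```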
@@ -163,34 +200,127 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     return metrics


-def
+def _validate_columns_for_target(
+    df: pd.DataFrame,
+    target: Callable,
+) -> None:
+    """
+    Check that all columns needed by target function are present.
+
+    :param df: The data frame to be validated.
+    :type df: pd.DataFrame
+    :param target: The callable to be applied to data set.
+    :type target: Optional[Callable]
+    :raises EvaluationException: If the column starts with "__outputs." or if the input data contains missing fields.
+    """
+    if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns):
+        msg = "The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.'
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    # If the target function is given, it may return
+    # several columns and hence we cannot check the availability of columns
+    # without knowing target function semantics.
+    # Instead, here we will validate the columns, taken by target.
     required_inputs = [
         param.name
-        for param in inspect.signature(
+        for param in inspect.signature(target).parameters.values()
         if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
     ]

-    missing_inputs = [col for col in required_inputs if col not in
-    if missing_inputs and "conversation" in required_inputs:
-        non_conversation_inputs = [val for val in required_inputs if val != "conversation"]
-        if len(missing_inputs) == len(non_conversation_inputs) and [
-            input in non_conversation_inputs for input in missing_inputs
-        ]:
-            missing_inputs = []
+    missing_inputs = [col for col in required_inputs if col not in df.columns]
     if missing_inputs:
-
-        msg = f"Missing required inputs for evaluator {evaluator_name} : {missing_inputs}."
-        raise EvaluationException(
-            message=msg,
-            internal_message=msg,
-            target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.MISSING_FIELD,
-            blame=ErrorBlame.USER_ERROR,
-        )
-        msg = f"Missing required inputs for target : {missing_inputs}."
+        msg = f"Missing required inputs for target: {missing_inputs}."
         raise EvaluationException(
             message=msg,
-
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.MISSING_FIELD,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+
+def _validate_columns_for_evaluators(
+    df: pd.DataFrame,
+    evaluators: Dict[str, Callable],
+    target: Optional[Callable],
+    target_generated_columns: Optional[Set[str]],
+    column_mapping: Dict[str, Dict[str, str]],
+) -> None:
+    """
+    Check that all columns needed by evaluators are present.
+
+    :param df: The data frame to be validated.
+    :type df: pd.DataFrame
+    :param evaluators: The dictionary of evaluators.
+    :type evaluators: Dict[str, Callable]
+    :param target: The callable to be applied to data set.
+    :type target: Optional[Callable]
+    :param target_generated_columns: The set of columns generated by the target callable.
+    :type target_generated_columns: Optional[Set[str]]
+    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping.
+    :type column_mapping: Dict[str, Dict[str, str]]
+    :raises EvaluationException: If data is missing required inputs or if the target callable did not generate the necessary columns.
+    """
+    missing_inputs_per_evaluator = {}
+
+    for evaluator_name, evaluator in evaluators.items():
+        # Apply column mapping
+        mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
+        new_df = _apply_column_mapping(df, mapping_config)
+
+        # Validate input data for evaluator
+        is_built_in = evaluator.__module__.startswith("azure.ai.evaluation")
+        if is_built_in:
+            # Note that for built-in evaluators supporting the "conversation" parameter,
+            # input parameters are now optional.
+            evaluator_params = [
+                param.name
+                for param in inspect.signature(evaluator).parameters.values()
+                if param.name not in ["kwargs", "args", "self"]
+            ]
+
+            if "conversation" in evaluator_params and "conversation" in new_df.columns:
+                # Ignore the missing fields if "conversation" presents in the input data
+                missing_inputs = []
+            else:
+                missing_inputs = [col for col in evaluator_params if col not in new_df.columns]
+
+                # If "conversation" is the only parameter and it is missing, keep it in the missing inputs
+                # Otherwise, remove it from the missing inputs
+                if "conversation" in missing_inputs:
+                    if not (evaluator_params == ["conversation"] and missing_inputs == ["conversation"]):
+                        missing_inputs.remove("conversation")
+        else:
+            evaluator_params = [
+                param.name
+                for param in inspect.signature(evaluator).parameters.values()
+                if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
+            ]
+
+            missing_inputs = [col for col in evaluator_params if col not in new_df.columns]
+
+        if missing_inputs:
+            missing_inputs_per_evaluator[evaluator_name] = missing_inputs
+
+    if missing_inputs_per_evaluator:
+        msg = "Some evaluators are missing required inputs:\n"
+        for evaluator_name, missing in missing_inputs_per_evaluator.items():
+            msg += f"- {evaluator_name}: {missing}\n"
+
+        # Add the additional notes
+        msg += "\nTo resolve this issue:\n"
+        msg += "- Ensure the data contains required inputs.\n"
+        if target is not None:
+            msg += "- Verify that the target is generating the necessary columns for the evaluators. "
+            msg += f"Currently generated columns: {target_generated_columns} \n"
+        msg += "- Check that the column mapping is correctly configured."
+
+        raise EvaluationException(
+            message=msg.strip(),
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.MISSING_FIELD,
             blame=ErrorBlame.USER_ERROR,
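The new `_validate_columns_for_evaluators` treats built-in evaluators differently: when an evaluator accepts a `conversation` parameter and the column-mapped data supplies one, the remaining inputs are no longer required. A minimal sketch of that rule in isolation, using a hypothetical evaluator signature rather than a real SDK evaluator:

```python
import inspect

# Hypothetical built-in-style evaluator: single-turn inputs or a conversation.
def sample_evaluator(*, query=None, response=None, conversation=None, **kwargs):
    ...

params = [
    p.name
    for p in inspect.signature(sample_evaluator).parameters.values()
    if p.name not in ["kwargs", "args", "self"]
]

data_columns = {"conversation"}  # columns present in the (mapped) input data

# Mirrors the rule above: if the evaluator accepts "conversation" and the data
# provides it, the other inputs are not treated as missing.
if "conversation" in params and "conversation" in data_columns:
    missing = []
else:
    missing = [p for p in params if p not in data_columns]

print(missing)  # [] -> no validation error for this evaluator
```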
@@ -199,76 +329,85 @@ def _validate_input_data_for_evaluator(evaluator, evaluator_name, df_data, is_ta

 def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name):
     if data is None:
-        msg = "data parameter
+        msg = "The 'data' parameter is required for evaluation."
         raise EvaluationException(
             message=msg,
-            internal_message=msg,
             target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not isinstance(data, (os.PathLike, str)):
+        msg = "The 'data' parameter must be a string or a path-like object."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not os.path.exists(data):
+        msg = f"The input data file path '{data}' does not exist."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
         )

     if target is not None:
         if not callable(target):
-            msg = "target parameter must be a callable function."
+            msg = "The 'target' parameter must be a callable function."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )

-    if
-
-
-
-
-
-
-
-
-
+    if not evaluators:
+        msg = "The 'evaluators' parameter is required and cannot be None or empty."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not isinstance(evaluators, dict):
+        msg = "The 'evaluators' parameter must be a dictionary."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )

-    if
-        if not isinstance(
-            msg = "
+    if output_path is not None:
+        if not isinstance(output_path, (os.PathLike, str)):
+            msg = "The 'output_path' parameter must be a string or a path-like object."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )

-
-        if not
-            msg = "
+        output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
+        if not os.path.exists(output_dir):
+            msg = f"The output directory '{output_dir}' does not exist. Please create the directory manually."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )

     if azure_ai_project is not None:
-
-        msg = "azure_ai_project parameter must be a dictionary."
-        raise EvaluationException(
-            message=msg,
-            internal_message=msg,
-            target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.INVALID_VALUE,
-            blame=ErrorBlame.USER_ERROR,
-        )
+        validate_azure_ai_project(azure_ai_project)

     if evaluation_name is not None:
-        if not isinstance(evaluation_name, str):
-            msg = "evaluation_name parameter must be a string."
+        if not isinstance(evaluation_name, str) or not evaluation_name.strip():
+            msg = "The 'evaluation_name' parameter must be a non-empty string."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,

@@ -278,8 +417,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
         initial_data_df = pd.read_json(data, lines=True)
     except Exception as e:
         raise EvaluationException(
-            message=f"
-            internal_message="Failed to load data. Confirm that it is valid jsonl data.",
+            message=f"Unable to load data from '{data}'. Please ensure the input is valid JSONL format. Detailed error: {e}.",
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
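With the stricter argument validation above, `data` must be a string or path-like pointing at an existing file, and it is loaded with `pd.read_json(..., lines=True)`, i.e. JSONL. A minimal sketch of producing such a file; the field names (`query`, `response`, `ground_truth`, `context`) are illustrative and should match whatever the chosen evaluators or column mapping expect:

```python
import json

rows = [
    {"query": "What is the capital of France?", "response": "Paris.",
     "ground_truth": "Paris", "context": "France's capital city is Paris."},
    {"query": "What is 2 + 2?", "response": "4",
     "ground_truth": "4", "context": "Basic arithmetic."},
]

# One JSON object per line, which is what pd.read_json(data, lines=True) expects.
with open("eval_data.jsonl", "w", encoding="utf-8") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")
```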
@@ -288,57 +426,13 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
     return initial_data_df


-def _validate_columns(
-    df: pd.DataFrame,
-    evaluators: Dict[str, Callable],
-    target: Optional[Callable],
-    column_mapping: Dict[str, Dict[str, str]],
-) -> None:
-    """
-    Check that all columns needed by evaluator or target function are present.
-
-    :param df: The data frame to be validated.
-    :type df: pd.DataFrame
-    :param evaluators: The dictionary of evaluators.
-    :type evaluators: Dict[str, Callable]
-    :param target: The callable to be applied to data set.
-    :type target: Optional[Callable]
-    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping
-    :type column_mapping: Dict[str, Dict[str, str]]
-    :raises EvaluationException: If column starts from "__outputs." while target is defined.
-    """
-    if target:
-        if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns):
-            msg = "The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.'
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
-        # If the target function is given, it may return
-        # several columns and hence we cannot check the availability of columns
-        # without knowing target function semantics.
-        # Instead, here we will validate the columns, taken by target.
-        _validate_input_data_for_evaluator(target, None, df, is_target_fn=True)
-    else:
-        for evaluator_name, evaluator in evaluators.items():
-            # Apply column mapping
-            mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
-            new_df = _apply_column_mapping(df, mapping_config)
-
-            # Validate input data for evaluator
-            _validate_input_data_for_evaluator(evaluator, evaluator_name, new_df)
-
-
 def _apply_target_to_data(
     target: Callable,
-    data: str,
+    data: Union[str, os.PathLike],
     pf_client: PFClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
-
+    **kwargs,
 ) -> Tuple[pd.DataFrame, Set[str], Run]:
     """
     Apply the target function to the data set and return updated data and generated columns.

@@ -346,29 +440,45 @@ def _apply_target_to_data(
     :param target: The function to be applied to data.
     :type target: Callable
     :param data: The path to input jsonl file.
-    :type data: str
+    :type data: Union[str, os.PathLike]
     :param pf_client: The promptflow client to be used.
     :type pf_client: PFClient
     :param initial_data: The data frame with the loaded data.
     :type initial_data: pd.DataFrame
     :param evaluation_name: The name of the evaluation.
     :type evaluation_name: Optional[str]
-    :param _run_name: The name of target run. Used for testing only.
-    :type _run_name: Optional[str]
     :return: The tuple, containing data frame and the list of added columns.
     :rtype: Tuple[pandas.DataFrame, List[str]]
     """
-
-
-
-
-
-
-
-
-
-
-
+    _run_name = kwargs.get("_run_name")
+    upload_target_snaphot = kwargs.get("_upload_target_snapshot", False)
+
+    try:
+        with TargetRunContext(upload_target_snaphot):
+            run: Run = pf_client.run(
+                flow=target,
+                display_name=evaluation_name,
+                data=data,
+                properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
+                stream=True,
+                name=_run_name,
+            )
+    except (UserAuthenticationError, UploadInternalError) as ex:
+        if "Failed to upload run" in ex.message:
+            msg = (
+                "Failed to upload the target run to the cloud. "
+                "This may be caused by insufficient permission to access storage or other errors."
+            )
+            raise EvaluationException(
+                message=msg,
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.FAILED_REMOTE_TRACKING,
+                blame=ErrorBlame.USER_ERROR,
+                tsg_link="https://aka.ms/azsdk/python/evaluation/remotetracking/troubleshoot",
+            ) from ex
+
+        raise ex
+
     target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
     # Remove input and output prefix
     generated_columns = {
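The rewritten `_apply_target_to_data` runs the target through `pf_client.run(flow=target, data=...)` inside the new `TargetRunContext`, and honors optional private `_run_name` / `_upload_target_snapshot` kwargs. For orientation, a hypothetical target callable of the kind `evaluate()` can apply: its parameters must exist as columns in the input data (enforced by `_validate_columns_for_target`), and the keys it returns become target-generated columns that evaluators can consume, e.g. via a `"${target.response}"` mapping:

```python
# Hypothetical target function; not part of the SDK.
def answer_query(query: str) -> dict:
    """Stand-in for calling the application under evaluation."""
    # In a real setup this would call a model or service; here we just echo.
    return {"response": f"You asked: {query}"}
```

Passing `target=answer_query` to `evaluate()` would then add a generated `response` column before the evaluators run.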
@@ -456,15 +566,15 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
 # @log_evaluate_activity
 def evaluate(
     *,
-    data: str,
+    data: Union[str, os.PathLike],
     evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
-    output_path: Optional[str] = None,
+    output_path: Optional[Union[str, os.PathLike]] = None,
     **kwargs,
-):
+) -> EvaluationResult:
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
     data will be run through target function and then results will be evaluated.

@@ -489,7 +599,7 @@ def evaluate(
     :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
     :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
     :return: Evaluation results.
-    :rtype:
+    :rtype: ~azure.ai.evaluation.EvaluationResult

     :Example:

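`evaluate()` (and the internal `_evaluate()`) is now annotated to return `EvaluationResult`, the dict assembled near the end of `_evaluate` with `rows`, `metrics`, and `studio_url`. A minimal usage sketch, assuming the JSONL file from earlier and the built-in `F1ScoreEvaluator` (which reads `response` and `ground_truth` columns):

```python
from azure.ai.evaluation import F1ScoreEvaluator, evaluate

result = evaluate(
    data="eval_data.jsonl",
    evaluators={"f1": F1ScoreEvaluator()},
)

print(result["metrics"])         # aggregated metrics, e.g. {"f1.f1_score": ...}
print(len(result["rows"]))       # per-row inputs plus evaluator outputs
print(result.get("studio_url"))  # populated only when azure_ai_project is provided
```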
@@ -563,7 +673,17 @@ def evaluate(
             internal_message=error_message,
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.FAILED_EXECUTION,
-            blame=ErrorBlame.
+            blame=ErrorBlame.USER_ERROR,
+        ) from e
+
+        # Ensure a consistent user experience when encountering errors by converting
+        # all other exceptions to EvaluationException.
+        if not isinstance(e, EvaluationException):
+            raise EvaluationException(
+                message=str(e),
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.FAILED_EXECUTION,
+                blame=ErrorBlame.SYSTEM_ERROR,
         ) from e

         raise e
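The added fallback above wraps any non-`EvaluationException` raised during evaluation into an `EvaluationException`, so callers can rely on a single exception type. A sketch of catching it; note the import comes from a private module in this version, so the path may change:

```python
from azure.ai.evaluation import evaluate
from azure.ai.evaluation._exceptions import EvaluationException  # private module in 1.0.0b5

try:
    result = evaluate(data="does_not_exist.jsonl", evaluators={})
except EvaluationException as exc:
    # Both input-validation errors and converted internal failures land here.
    print(f"Evaluation failed: {exc}")
```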
@@ -586,12 +706,12 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    data: str,
+    data: Union[str, os.PathLike],
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
-    output_path: Optional[str] = None,
+    output_path: Optional[Union[str, os.PathLike]] = None,
     **kwargs,
-) ->
+) -> EvaluationResult:
     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)

     # Process evaluator config to replace ${target.} with ${data.}

@@ -604,7 +724,9 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
             for evaluator_name, evaluator_configuration in evaluator_config.items()
         }
     )
-
+
+    if target is not None:
+        _validate_columns_for_target(input_data_df, target)

     # Target Run
     try:

@@ -623,7 +745,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
             'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
         )

-        raise EvaluationException(
+        raise EvaluationException(  # pylint: disable=raise-missing-from
            message=msg,
            target=ErrorTarget.EVALUATE,
            category=ErrorCategory.MISSING_PACKAGE,

@@ -631,6 +753,11 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
         )

     trace_destination: Optional[str] = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
+
+    # Handle the case where the customer manually run "pf config set trace.destination=none"
+    if trace_destination and trace_destination.lower() == "none":
+        trace_destination = None
+
     target_run: Optional[Run] = None

     # Create default configuration for evaluators that directly maps
@@ -639,9 +766,10 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     column_mapping.setdefault("default", {})

     # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
+    target_generated_columns: Set[str] = set()
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-            target, data, pf_client, input_data_df, evaluation_name,
+            target, data, pf_client, input_data_df, evaluation_name, **kwargs
         )

         for evaluator_name, mapping in column_mapping.items():

@@ -656,9 +784,8 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
             if col not in mapping and run_output not in mapped_to_values:
                 column_mapping[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup

-
-
-    _validate_columns(input_data_df, evaluators, target=None, column_mapping=column_mapping)
+    # After we have generated all columns, we can check if we have everything we need for evaluators.
+    _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)

     # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
     # via target mapping.

@@ -674,7 +801,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     def eval_batch_run(
         batch_run_client: TClient, *, data=Union[str, os.PathLike, pd.DataFrame]
     ) -> Dict[str, __EvaluatorInfo]:
-        with
+        with EvalRunContext(batch_run_client):
             runs = {
                 evaluator_name: batch_run_client.run(
                     flow=evaluator,

@@ -688,7 +815,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
                 for evaluator_name, evaluator in evaluators.items()
             }

-            # get_details needs to be called within
+            # get_details needs to be called within EvalRunContext scope in order to have user agent populated
             return {
                 evaluator_name: {
                     "result": batch_run_client.get_details(run, all_results=True),

@@ -758,7 +885,8 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
         evaluation_name,
     )

-
+    result_df_dict = result_df.to_dict("records")
+    result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore

     if output_path:
         _write_output(output_path, result)
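When `output_path` is given, the assembled `EvaluationResult` is also written to disk via `_write_output` (and the earlier validation requires the parent directory to exist). A sketch assuming the artifact is JSON-serialized, which is an implementation detail rather than a documented contract:

```python
import json
import os

from azure.ai.evaluation import F1ScoreEvaluator, evaluate

os.makedirs("results", exist_ok=True)  # output directory must already exist

result = evaluate(
    data="eval_data.jsonl",
    evaluators={"f1": F1ScoreEvaluator()},
    output_path="results/eval_result.json",
)

# Assuming JSON serialization of the result dict:
with open("results/eval_result.json", encoding="utf-8") as f:
    saved = json.load(f)
print(saved["metrics"] == result["metrics"])
```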
Diff of azure/ai/evaluation/_evaluate/_telemetry/__init__.py:

@@ -16,10 +16,10 @@ from promptflow.client import PFClient
 from promptflow.core import Prompty as prompty_core
 from typing_extensions import ParamSpec

-from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult

 from ..._user_agent import USER_AGENT
-from .._utils import
+from .._utils import _trace_destination_from_project_scope

 LOGGER = logging.getLogger(__name__)


@@ -97,17 +97,17 @@ def _get_evaluator_properties(evaluator, evaluator_name):


 # cspell:ignore isna
-def log_evaluate_activity(func: Callable[P,
+def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
     """Decorator to log evaluate activity

     :param func: The function to be decorated
     :type func: Callable
     :returns: The decorated function
-    :rtype: Callable[P,
+    :rtype: Callable[P, EvaluationResult]
     """

     @functools.wraps(func)
-    def wrapper(*args: P.args, **kwargs: P.kwargs) ->
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluationResult:
         from promptflow._sdk._telemetry import ActivityType, log_activity
         from promptflow._sdk._telemetry.telemetry import get_telemetry_logger

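The telemetry changes are typing-only: `log_evaluate_activity` now advertises `Callable[P, EvaluationResult] -> Callable[P, EvaluationResult]`. As a generic illustration (not the SDK's implementation), this is how a `ParamSpec`-typed decorator preserves the wrapped function's signature for type checkers:

```python
import functools
from typing import Callable, TypeVar

from typing_extensions import ParamSpec

P = ParamSpec("P")
R = TypeVar("R")

def log_activity(func: Callable[P, R]) -> Callable[P, R]:
    """Toy decorator: type checkers see the same parameters and return type as func."""
    @functools.wraps(func)
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
        print(f"calling {func.__name__}")
        return func(*args, **kwargs)
    return wrapper

@log_activity
def add(x: int, y: int) -> int:
    return x + y

print(add(2, 3))  # 5; the static type of add stays (int, int) -> int
```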