azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of azure-ai-evaluation might be problematic.

Files changed (108)
  1. azure/ai/evaluation/__init__.py +4 -26
  2. azure/ai/evaluation/_common/constants.py +2 -9
  3. azure/ai/evaluation/_common/rai_service.py +122 -302
  4. azure/ai/evaluation/_common/utils.py +35 -393
  5. azure/ai/evaluation/_constants.py +6 -28
  6. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
  7. azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +8 -25
  8. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +30 -68
  9. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
  10. azure/ai/evaluation/_evaluate/_eval_run.py +40 -117
  11. azure/ai/evaluation/_evaluate/_evaluate.py +255 -416
  12. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +19 -24
  13. azure/ai/evaluation/_evaluate/_utils.py +47 -108
  14. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +19 -18
  15. azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
  16. azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
  17. azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
  18. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
  19. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
  20. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +93 -78
  21. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
  22. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
  23. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -104
  24. azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +35 -24
  25. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
  26. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
  27. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
  28. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
  29. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
  30. azure/ai/evaluation/_evaluators/_eci/_eci.py +55 -45
  31. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -36
  32. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +94 -76
  33. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
  34. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +17 -15
  35. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +92 -113
  36. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
  37. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -21
  38. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
  39. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
  40. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
  41. azure/ai/evaluation/_evaluators/_qa/_qa.py +43 -25
  42. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +101 -84
  43. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
  44. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -27
  45. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +45 -55
  46. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +106 -91
  48. azure/ai/evaluation/_exceptions.py +7 -28
  49. azure/ai/evaluation/_http_utils.py +134 -205
  50. azure/ai/evaluation/_model_configurations.py +8 -104
  51. azure/ai/evaluation/_version.py +1 -1
  52. azure/ai/evaluation/simulator/__init__.py +2 -3
  53. azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
  54. azure/ai/evaluation/simulator/_adversarial_simulator.py +95 -116
  55. azure/ai/evaluation/simulator/_constants.py +1 -11
  56. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -14
  57. azure/ai/evaluation/simulator/_conversation/_conversation.py +20 -20
  58. azure/ai/evaluation/simulator/_direct_attack_simulator.py +68 -34
  59. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -1
  60. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +28 -31
  61. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +95 -108
  62. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
  63. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +14 -30
  64. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +14 -25
  65. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
  66. azure/ai/evaluation/simulator/_model_tools/models.py +21 -19
  67. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
  68. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
  69. azure/ai/evaluation/simulator/_tracing.py +28 -25
  70. azure/ai/evaluation/simulator/_utils.py +13 -34
  71. azure/ai/evaluation/simulator/simulator.py +579 -0
  72. azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
  73. azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
  74. {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
  75. azure/ai/evaluation/_common/_experimental.py +0 -172
  76. azure/ai/evaluation/_common/math.py +0 -89
  77. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -99
  78. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
  79. azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
  80. azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
  81. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
  82. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
  83. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
  84. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
  85. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  86. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  87. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  88. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  89. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  90. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  91. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  92. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
  93. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
  94. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
  95. azure/ai/evaluation/_vendor/__init__.py +0 -3
  96. azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
  97. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
  98. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
  99. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
  100. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
  101. azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
  102. azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
  103. azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
  104. azure/ai/evaluation/simulator/_simulator.py +0 -716
  105. azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
  106. azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
  107. azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
  108. {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/top_level.txt +0 -0
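The hunks below are from azure/ai/evaluation/_evaluate/_evaluate.py (item 11 in the list above), where most of the churn lives. The most visible API difference they capture is the shape of evaluator_config: the stable 1.0.0 release nests each evaluator's column mapping under a "column_mapping" key, while 1.0.0b1 treats each entry as the column mapping itself. A minimal sketch of the two call shapes, assuming a local JSONL file, evaluator alias, and column names chosen purely for illustration:

    from azure.ai.evaluation import F1ScoreEvaluator, evaluate

    # 1.0.0 (stable): mappings are nested under "column_mapping"
    result = evaluate(
        data="eval_data.jsonl",  # illustrative path, not taken from the package
        evaluators={"f1": F1ScoreEvaluator()},
        evaluator_config={
            "f1": {"column_mapping": {"response": "${data.response}", "ground_truth": "${data.ground_truth}"}},
        },
    )

    # 1.0.0b1 (this wheel): each entry is the mapping itself
    result = evaluate(
        data="eval_data.jsonl",
        evaluators={"f1": F1ScoreEvaluator()},
        evaluator_config={
            "f1": {"response": "${data.response}", "ground_truth": "${data.ground_truth}"},
        },
    )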
@@ -2,87 +2,38 @@
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
  import inspect
- import json
- import logging
  import os
  import re
- from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
+ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type

+ import numpy as np
  import pandas as pd
+
  from promptflow._sdk._constants import LINE_NUMBER
- from promptflow._sdk._errors import UserAuthenticationError, UploadInternalError
  from promptflow.client import PFClient
- from promptflow.entities import Run
-
- from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
- from azure.ai.evaluation._common.utils import validate_azure_ai_project
- from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

+ from .._model_configurations import AzureAIProject
  from .._constants import (
      CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
      EvaluationMetrics,
-     EvaluationRunProperties,
      Prefixes,
      _InternalEvaluationMetrics,
  )
- from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
  from .._user_agent import USER_AGENT
- from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext
+ from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
+ from ._telemetry import log_evaluate_activity
  from ._utils import (
      _apply_column_mapping,
      _log_metrics_and_instance_results,
      _trace_destination_from_project_scope,
      _write_output,
  )
-
- TClient = TypeVar("TClient", ProxyClient, CodeClient)
- LOGGER = logging.getLogger(__name__)
-
- # For metrics (aggregates) whose metric names intentionally differ from their
- # originating column name, usually because the aggregation of the original value
- # means something sufficiently different.
- # Note that content safety metrics are handled seprately.
- METRIC_COLUMN_NAME_REPLACEMENTS = {
-     "groundedness_pro_label": "groundedness_pro_passing_rate",
- }
-
-
- class __EvaluatorInfo(TypedDict):
-     result: pd.DataFrame
-     metrics: Dict[str, Any]
-     run_summary: Dict[str, Any]
-
-
- def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
-     """Identify and average various metrics that need to have the metric name be replaced,
-     instead of having the metric match the originating column name.
-     :param df: The dataframe of evaluation results.
-     :type df: ~pandas.DataFrame
-     :return: A tuple; the first element is a list of dataframe columns that were aggregated,
-         and the second element is a dictionary of resultant new metric column names and their values.
-     :rtype: Tuple[List[str], Dict[str, float]]
-     """
-     renamed_cols = []
-     metric_columns = {}
-     for col in df.columns:
-         metric_prefix = col.split(".")[0]
-         metric_name = col.split(".")[1]
-         if metric_name in METRIC_COLUMN_NAME_REPLACEMENTS:
-             renamed_cols.append(col)
-             new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
-             col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
-             try:
-                 metric_columns[new_col_name] = round(list_mean_nan_safe(col_with_numeric_values), 2)
-             except EvaluationException:  # only exception that can be cause is all NaN values
-                 msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
-                 LOGGER.warning(msg)
-
-     return renamed_cols, metric_columns
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget


  # pylint: disable=line-too-long
  def _aggregate_content_safety_metrics(
-     df: pd.DataFrame, evaluators: Dict[str, Callable]
+     df: pd.DataFrame, evaluators: Dict[str, Type]
  ) -> Tuple[List[str], Dict[str, float]]:
      """Find and aggregate defect rates for content safety metrics. Returns both a list
      of columns that were used to calculate defect rates and the defect rates themselves.
@@ -122,15 +73,11 @@ def _aggregate_content_safety_metrics(
      for col in content_safety_df.columns:
          defect_rate_name = col.replace("_score", "_defect_rate")
          col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
-         try:
-             col_with_boolean_values = apply_transform_nan_safe(
-                 col_with_numeric_values, lambda x: 1 if x >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT else 0
-             )
-             defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
-         except EvaluationException:  # only exception that can be cause is all NaN values
-             msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
-             LOGGER.warning(msg)
-
+         defect_rates[defect_rate_name] = round(
+             np.sum(col_with_numeric_values >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT)
+             / col_with_numeric_values.count(),
+             2,
+         )
      return content_safety_cols, defect_rates


@@ -160,15 +107,14 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
      for col in label_df.columns:
          defect_rate_name = col.replace("_label", "_defect_rate")
          col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
-         try:
-             defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
-         except EvaluationException:  # only exception that can be cause is all NaN values
-             msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
-             LOGGER.warning(msg)
+         defect_rates[defect_rate_name] = round(
+             np.sum(col_with_boolean_values) / col_with_boolean_values.count(),
+             2,
+         )
      return label_cols, defect_rates


- def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
+ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[str, float]:
      """Aggregate metrics from the evaluation results.
      On top of naively calculating the mean of most metrics, this function also identifies certain columns
      that represent defect rates and renames them accordingly. Other columns in the dataframe are dropped.
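A note on the two defect-rate hunks above: 1.0.0b1 computes the rate directly with numpy/pandas, counting rows at or above the severity threshold and dividing by the number of non-NaN scores, whereas 1.0.0 routes the same computation through its NaN-safe helpers and logs a warning when every score is NaN. A small self-contained illustration of the beta formula; the threshold value is assumed for the example (the real default lives in _constants.CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT):

    import numpy as np
    import pandas as pd

    # Toy severity scores with one missing value.
    scores = pd.to_numeric(pd.Series([1, 5, 7, None, 2]), errors="coerce")
    threshold = 4  # assumed for illustration

    # Rows >= threshold count as defects; count() excludes NaN from the denominator,
    # and a NaN score never satisfies the comparison, so it is effectively dropped.
    defect_rate = round(np.sum(scores >= threshold) / scores.count(), 2)
    print(defect_rate)  # 0.5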
@@ -177,7 +123,7 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
      :param df: The dataframe of evaluation results.
      :type df: ~pandas.DataFrame
      :param evaluators: A dictionary mapping of strings to evaluator classes.
-     :type evaluators: Dict[str, Callable]
+     :type evaluators: Dict[str, Type]
      :return: The aggregated metrics.
      :rtype: Dict[str, float]
      """
@@ -188,11 +134,8 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
      # Rename certain columns as defect rates if we know that's what their aggregates represent
      # Content safety metrics
      content_safety_cols, cs_defect_rates = _aggregate_content_safety_metrics(df, evaluators)
-     other_renamed_cols, renamed_cols = _aggregate_other_metrics(df)
      handled_columns.extend(content_safety_cols)
-     handled_columns.extend(other_renamed_cols)
      defect_rates.update(cs_defect_rates)
-     defect_rates.update(renamed_cols)
      # Label-based (true/false) metrics where 'true' means 'something is wrong'
      label_cols, label_defect_rates = _aggregate_label_defect_metrics(df)
      handled_columns.extend(label_cols)
@@ -201,9 +144,6 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
      # For rest of metrics, we will calculate mean
      df.drop(columns=handled_columns, inplace=True)

-     # NOTE: nan/None values don't count as as booleans, so boolean columns with
-     # nan/None values won't have a mean produced from them.
-     # This is different from label-based known evaluators, which have special handling.
      mean_value = df.mean(numeric_only=True)
      metrics = mean_value.to_dict()
      # Add defect rates back into metrics
@@ -211,133 +151,28 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
      return metrics


- def _validate_columns_for_target(
-     df: pd.DataFrame,
-     target: Callable,
- ) -> None:
-     """
-     Check that all columns needed by target function are present.
-
-     :param df: The data frame to be validated.
-     :type df: pd.DataFrame
-     :param target: The callable to be applied to data set.
-     :type target: Optional[Callable]
-     :raises EvaluationException: If the column starts with "__outputs." or if the input data contains missing fields.
-     """
-     if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns):
-         msg = "The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.'
-         raise EvaluationException(
-             message=msg,
-             internal_message=msg,
-             target=ErrorTarget.EVALUATE,
-             category=ErrorCategory.INVALID_VALUE,
-             blame=ErrorBlame.USER_ERROR,
-         )
-     # If the target function is given, it may return
-     # several columns and hence we cannot check the availability of columns
-     # without knowing target function semantics.
-     # Instead, here we will validate the columns, taken by target.
+ def _validate_input_data_for_evaluator(evaluator, evaluator_name, df_data, is_target_fn=False):
      required_inputs = [
          param.name
-         for param in inspect.signature(target).parameters.values()
+         for param in inspect.signature(evaluator).parameters.values()
          if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
      ]

-     missing_inputs = [col for col in required_inputs if col not in df.columns]
+     missing_inputs = [col for col in required_inputs if col not in df_data.columns]
      if missing_inputs:
-         msg = f"Missing required inputs for target: {missing_inputs}."
+         if not is_target_fn:
+             msg = f"Missing required inputs for evaluator {evaluator_name} : {missing_inputs}."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.EVALUATE,
+                 category=ErrorCategory.MISSING_FIELD,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+         msg = f"Missing required inputs for target : {missing_inputs}."
          raise EvaluationException(
              message=msg,
-             target=ErrorTarget.EVALUATE,
-             category=ErrorCategory.MISSING_FIELD,
-             blame=ErrorBlame.USER_ERROR,
-         )
-
-
- def _validate_columns_for_evaluators(
-     df: pd.DataFrame,
-     evaluators: Dict[str, Callable],
-     target: Optional[Callable],
-     target_generated_columns: Optional[Set[str]],
-     column_mapping: Dict[str, Dict[str, str]],
- ) -> None:
-     """
-     Check that all columns needed by evaluators are present.
-
-     :param df: The data frame to be validated.
-     :type df: pd.DataFrame
-     :param evaluators: The dictionary of evaluators.
-     :type evaluators: Dict[str, Callable]
-     :param target: The callable to be applied to data set.
-     :type target: Optional[Callable]
-     :param target_generated_columns: The set of columns generated by the target callable.
-     :type target_generated_columns: Optional[Set[str]]
-     :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping.
-     :type column_mapping: Dict[str, Dict[str, str]]
-     :raises EvaluationException: If data is missing required inputs or if the target callable did not generate the necessary columns.
-     """
-     missing_inputs_per_evaluator = {}
-
-     for evaluator_name, evaluator in evaluators.items():
-         # Apply column mapping
-         mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
-         new_df = _apply_column_mapping(df, mapping_config)
-
-         # Validate input data for evaluator
-         is_built_in = evaluator.__module__.startswith("azure.ai.evaluation")
-         if is_built_in:
-             # Note that for built-in evaluators supporting the "conversation" parameter,
-             # input parameters are now optional.
-             evaluator_params = [
-                 param.name
-                 for param in inspect.signature(evaluator).parameters.values()
-                 if param.name not in ["kwargs", "args", "self"]
-             ]
-
-             if "conversation" in evaluator_params and "conversation" in new_df.columns:
-                 # Ignore the missing fields if "conversation" presents in the input data
-                 missing_inputs = []
-             else:
-                 optional_params = (
-                     evaluator._OPTIONAL_PARAMS  # pylint: disable=protected-access
-                     if hasattr(evaluator, "_OPTIONAL_PARAMS")
-                     else []
-                 )
-                 excluded_params = set(new_df.columns).union(optional_params)
-                 missing_inputs = [col for col in evaluator_params if col not in excluded_params]
-
-             # If "conversation" is the only parameter and it is missing, keep it in the missing inputs
-             # Otherwise, remove it from the missing inputs
-             if "conversation" in missing_inputs:
-                 if not (evaluator_params == ["conversation"] and missing_inputs == ["conversation"]):
-                     missing_inputs.remove("conversation")
-         else:
-             evaluator_params = [
-                 param.name
-                 for param in inspect.signature(evaluator).parameters.values()
-                 if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
-             ]
-
-             missing_inputs = [col for col in evaluator_params if col not in new_df.columns]
-
-         if missing_inputs:
-             missing_inputs_per_evaluator[evaluator_name] = missing_inputs
-
-     if missing_inputs_per_evaluator:
-         msg = "Some evaluators are missing required inputs:\n"
-         for evaluator_name, missing in missing_inputs_per_evaluator.items():
-             msg += f"- {evaluator_name}: {missing}\n"
-
-         # Add the additional notes
-         msg += "\nTo resolve this issue:\n"
-         msg += "- Ensure the data contains required inputs.\n"
-         if target is not None:
-             msg += "- Verify that the target is generating the necessary columns for the evaluators. "
-             msg += f"Currently generated columns: {target_generated_columns} \n"
-         msg += "- Check that the column mapping is correctly configured."
-
-         raise EvaluationException(
-             message=msg.strip(),
+             internal_message=msg,
              target=ErrorTarget.EVALUATE,
              category=ErrorCategory.MISSING_FIELD,
              blame=ErrorBlame.USER_ERROR,
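Both the removed 1.0.0 validators and the 1.0.0b1 _validate_input_data_for_evaluator shown above derive an evaluator's required inputs the same way: any signature parameter without a default, excluding self/args/kwargs, must appear as a column in the input data. A short sketch of that shared inspect-based check, using a made-up evaluator function:

    import inspect

    def my_evaluator(response, ground_truth, *, threshold=0.5, **kwargs):  # hypothetical evaluator
        ...

    required_inputs = [
        param.name
        for param in inspect.signature(my_evaluator).parameters.values()
        if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
    ]
    print(required_inputs)  # ['response', 'ground_truth']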
@@ -346,85 +181,76 @@ def _validate_columns_for_evaluators(

  def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name):
      if data is None:
-         msg = "The 'data' parameter is required for evaluation."
-         raise EvaluationException(
-             message=msg,
-             target=ErrorTarget.EVALUATE,
-             category=ErrorCategory.INVALID_VALUE,
-             blame=ErrorBlame.USER_ERROR,
-         )
-     if not isinstance(data, (os.PathLike, str)):
-         msg = "The 'data' parameter must be a string or a path-like object."
-         raise EvaluationException(
-             message=msg,
-             target=ErrorTarget.EVALUATE,
-             category=ErrorCategory.INVALID_VALUE,
-             blame=ErrorBlame.USER_ERROR,
-         )
-     if not os.path.exists(data):
-         msg = f"The input data file path '{data}' does not exist."
+         msg = "data parameter must be provided for evaluation."
          raise EvaluationException(
              message=msg,
+             internal_message=msg,
              target=ErrorTarget.EVALUATE,
-             category=ErrorCategory.INVALID_VALUE,
+             category=ErrorCategory.MISSING_FIELD,
              blame=ErrorBlame.USER_ERROR,
          )

      if target is not None:
          if not callable(target):
-             msg = "The 'target' parameter must be a callable function."
+             msg = "target parameter must be a callable function."
              raise EvaluationException(
                  message=msg,
+                 internal_message=msg,
                  target=ErrorTarget.EVALUATE,
                  category=ErrorCategory.INVALID_VALUE,
                  blame=ErrorBlame.USER_ERROR,
              )

-     if not evaluators:
-         msg = "The 'evaluators' parameter is required and cannot be None or empty."
-         raise EvaluationException(
-             message=msg,
-             target=ErrorTarget.EVALUATE,
-             category=ErrorCategory.INVALID_VALUE,
-             blame=ErrorBlame.USER_ERROR,
-         )
-     if not isinstance(evaluators, dict):
-         msg = "The 'evaluators' parameter must be a dictionary."
-         raise EvaluationException(
-             message=msg,
-             target=ErrorTarget.EVALUATE,
-             category=ErrorCategory.INVALID_VALUE,
-             blame=ErrorBlame.USER_ERROR,
-         )
+     if data is not None:
+         if not isinstance(data, str):
+             msg = "data parameter must be a string."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.EVALUATE,
+                 category=ErrorCategory.INVALID_VALUE,
+                 blame=ErrorBlame.USER_ERROR,
+             )

-     if output_path is not None:
-         if not isinstance(output_path, (os.PathLike, str)):
-             msg = "The 'output_path' parameter must be a string or a path-like object."
+     if evaluators is not None:
+         if not isinstance(evaluators, dict):
+             msg = "evaluators parameter must be a dictionary."
              raise EvaluationException(
                  message=msg,
+                 internal_message=msg,
                  target=ErrorTarget.EVALUATE,
                  category=ErrorCategory.INVALID_VALUE,
                  blame=ErrorBlame.USER_ERROR,
              )

-         output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
-         if output_dir and not os.path.exists(output_dir):
-             msg = f"The output directory '{output_dir}' does not exist. Please create the directory manually."
+     if output_path is not None:
+         if not isinstance(output_path, str):
+             msg = "output_path parameter must be a string."
              raise EvaluationException(
                  message=msg,
+                 internal_message=msg,
                  target=ErrorTarget.EVALUATE,
                  category=ErrorCategory.INVALID_VALUE,
                  blame=ErrorBlame.USER_ERROR,
              )

      if azure_ai_project is not None:
-         validate_azure_ai_project(azure_ai_project)
+         if not isinstance(azure_ai_project, Dict):
+             msg = "azure_ai_project parameter must be a dictionary."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.EVALUATE,
+                 category=ErrorCategory.INVALID_VALUE,
+                 blame=ErrorBlame.USER_ERROR,
+             )

      if evaluation_name is not None:
-         if not isinstance(evaluation_name, str) or not evaluation_name.strip():
-             msg = "The 'evaluation_name' parameter must be a non-empty string."
+         if not isinstance(evaluation_name, str):
+             msg = "evaluation_name parameter must be a string."
              raise EvaluationException(
                  message=msg,
+                 internal_message=msg,
                  target=ErrorTarget.EVALUATE,
                  category=ErrorCategory.INVALID_VALUE,
                  blame=ErrorBlame.USER_ERROR,
@@ -434,69 +260,98 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
          initial_data_df = pd.read_json(data, lines=True)
      except Exception as e:
          raise EvaluationException(
-             message=f"Unable to load data from '{data}'. Please ensure the input is valid JSONL format. Detailed error: {e}.",
-             target=ErrorTarget.EVALUATE,
-             category=ErrorCategory.INVALID_VALUE,
-             blame=ErrorBlame.USER_ERROR,
-         ) from e
+             message=f"Failed to load data from {data}. Confirm that it is valid jsonl data. Error: {str(e)}.",
+             internal_message="Failed to load data. Confirm that it is valid jsonl data.",
+             target=ErrorTarget.EVALUATE,
+             category=ErrorCategory.INVALID_VALUE,
+             blame=ErrorBlame.USER_ERROR,
+         ) from e

      return initial_data_df


+ def _validate_columns(
+     df: pd.DataFrame,
+     evaluators: Dict[str, Any],
+     target: Optional[Callable],
+     evaluator_config: Dict[str, Dict[str, str]],
+ ) -> None:
+     """
+     Check that all columns needed by evaluator or target function are present.
+
+     :param df: The data frame to be validated.
+     :type df: pd.DataFrame
+     :param evaluators: The dictionary of evaluators.
+     :type evaluators: Dict[str, Any]
+     :param target: The callable to be applied to data set.
+     :type target: Optional[Callable]
+     :param evaluator_config: The configuration for evaluators.
+     :type evaluator_config: Dict[str, Dict[str, str]]
+     :raises EvaluationException: If column starts from "__outputs." while target is defined.
+     """
+     if target:
+         if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns):
+             msg = "The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.'
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.EVALUATE,
+                 category=ErrorCategory.INVALID_VALUE,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+         # If the target function is given, it may return
+         # several columns and hence we cannot check the availability of columns
+         # without knowing target function semantics.
+         # Instead, here we will validate the columns, taken by target.
+         _validate_input_data_for_evaluator(target, None, df, is_target_fn=True)
+     else:
+         for evaluator_name, evaluator in evaluators.items():
+             # Apply column mapping
+             mapping_config = evaluator_config.get(evaluator_name, evaluator_config.get("default", None))
+             new_df = _apply_column_mapping(df, mapping_config)
+
+             # Validate input data for evaluator
+             _validate_input_data_for_evaluator(evaluator, evaluator_name, new_df)
+
+
  def _apply_target_to_data(
      target: Callable,
-     data: Union[str, os.PathLike],
+     data: str,
      pf_client: PFClient,
      initial_data: pd.DataFrame,
      evaluation_name: Optional[str] = None,
-     **kwargs,
- ) -> Tuple[pd.DataFrame, Set[str], Run]:
+     _run_name: Optional[str] = None,
+ ) -> Tuple[pd.DataFrame, Set[str]]:
      """
      Apply the target function to the data set and return updated data and generated columns.

      :param target: The function to be applied to data.
      :type target: Callable
      :param data: The path to input jsonl file.
-     :type data: Union[str, os.PathLike]
+     :type data: str
      :param pf_client: The promptflow client to be used.
      :type pf_client: PFClient
      :param initial_data: The data frame with the loaded data.
      :type initial_data: pd.DataFrame
      :param evaluation_name: The name of the evaluation.
      :type evaluation_name: Optional[str]
+     :param _run_name: The name of target run. Used for testing only.
+     :type _run_name: Optional[str]
      :return: The tuple, containing data frame and the list of added columns.
      :rtype: Tuple[pandas.DataFrame, List[str]]
      """
-     _run_name = kwargs.get("_run_name")
-     upload_target_snaphot = kwargs.get("_upload_target_snapshot", False)
-
-     try:
-         with TargetRunContext(upload_target_snaphot):
-             run: Run = pf_client.run(
-                 flow=target,
-                 display_name=evaluation_name,
-                 data=data,
-                 properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
-                 stream=True,
-                 name=_run_name,
-             )
-     except (UserAuthenticationError, UploadInternalError) as ex:
-         if "Failed to upload run" in ex.message:
-             msg = (
-                 "Failed to upload the target run to the cloud. "
-                 "This may be caused by insufficient permission to access storage or other errors."
-             )
-             raise EvaluationException(
-                 message=msg,
-                 target=ErrorTarget.EVALUATE,
-                 category=ErrorCategory.FAILED_REMOTE_TRACKING,
-                 blame=ErrorBlame.USER_ERROR,
-                 tsg_link="https://aka.ms/azsdk/python/evaluation/remotetracking/troubleshoot",
-             ) from ex
-
-         raise ex
-
-     target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
+     # We are manually creating the temporary directory for the flow
+     # because the way tempdir remove temporary directories will
+     # hang the debugger, because promptflow will keep flow directory.
+     run = pf_client.run(
+         flow=target,
+         display_name=evaluation_name,
+         data=data,
+         properties={"runType": "eval_run", "isEvaluatorRun": "true"},
+         stream=True,
+         name=_run_name,
+     )
+     target_output = pf_client.runs.get_details(run, all_results=True)
      # Remove input and output prefix
      generated_columns = {
          col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
@@ -518,30 +373,28 @@ def _apply_target_to_data(
      return target_output, generated_columns, run


- def _process_column_mappings(
-     column_mapping: Dict[str, Optional[Dict[str, str]]],
- ) -> Dict[str, Dict[str, str]]:
-     """Process column_mapping to replace ${target.} with ${data.}
+ def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
+     """Process evaluator_config to replace ${target.} with ${data.}

-     :param column_mapping: The configuration for evaluators.
-     :type column_mapping: Dict[str, Optional[Dict[str, str]]]
+     :param evaluator_config: The configuration for evaluators.
+     :type evaluator_config: Dict[str, Dict[str, str]]
      :return: The processed configuration.
      :rtype: Dict[str, Dict[str, str]]
      """

-     processed_config: Dict[str, Dict[str, str]] = {}
+     processed_config = {}

      unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")

-     if column_mapping:
-         for evaluator, mapping_config in column_mapping.items():
+     if evaluator_config:
+         for evaluator, mapping_config in evaluator_config.items():
              if isinstance(mapping_config, dict):
                  processed_config[evaluator] = {}

                  for map_to_key, map_value in mapping_config.items():
                      # Check if there's any unexpected reference other than ${target.} or ${data.}
                      if unexpected_references.search(map_value):
-                         msg = "Unexpected references detected in 'column_mapping'. Ensure only ${target.} and ${data.} are used."
+                         msg = "Unexpected references detected in 'evaluator_config'. Ensure only ${target.} and ${data.} are used."
                          raise EvaluationException(
                              message=msg,
                              internal_message=msg,
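The reference check in the hunk above is untouched by the rename from _process_column_mappings to _process_evaluator_config: both versions compile the same negative-lookahead pattern and reject any ${...} reference that is not ${data.<col>} or ${target.<col>}. A quick sketch of how that guard behaves (the sample values are illustrative):

    import re

    unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")

    for value in ["${data.query}", "${target.response}", "${run.outputs.response}"]:
        status = "rejected" if unexpected_references.search(value) else "accepted"
        print(value, status)
    # ${data.query} accepted
    # ${target.response} accepted
    # ${run.outputs.response} rejected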
@@ -583,49 +436,79 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
  # @log_evaluate_activity
  def evaluate(
      *,
-     data: Union[str, os.PathLike],
-     evaluators: Dict[str, Callable],
      evaluation_name: Optional[str] = None,
      target: Optional[Callable] = None,
-     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
+     data: Optional[str] = None,
+     evaluators: Optional[Dict[str, Callable]] = None,
+     evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
      azure_ai_project: Optional[AzureAIProject] = None,
-     output_path: Optional[Union[str, os.PathLike]] = None,
+     output_path: Optional[str] = None,
      **kwargs,
- ) -> EvaluationResult:
+ ):
      """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
      data will be run through target function and then results will be evaluated.

-     :keyword data: Path to the data to be evaluated or passed to target if target is set.
-         Only .jsonl format files are supported. `target` and `data` both cannot be None. Required.
-     :paramtype data: str
-     :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
-         and value as the evaluator function. Required.
-     :paramtype evaluators: Dict[str, Callable]
      :keyword evaluation_name: Display name of the evaluation.
      :paramtype evaluation_name: Optional[str]
      :keyword target: Target to be evaluated. `target` and `data` both cannot be None
      :paramtype target: Optional[Callable]
+     :keyword data: Path to the data to be evaluated or passed to target if target is set.
+         Only .jsonl format files are supported. `target` and `data` both cannot be None
+     :paramtype data: Optional[str]
+     :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
+         and value as the evaluator function.
+     :paramtype evaluators: Optional[Dict[str, Callable]
      :keyword evaluator_config: Configuration for evaluators. The configuration should be a dictionary with evaluator
-         names as keys and a values that are dictionaries containing the column mappings. The column mappings should
-         be a dictionary with keys as the column names in the evaluator input and values as the column names in the
-         input data or data generated by target.
-     :paramtype evaluator_config: Optional[Dict[str, ~azure.ai.evaluation.EvaluatorConfig]]
+         names as keys and a dictionary of column mappings as values. The column mappings should be a dictionary with
+         keys as the column names in the evaluator input and values as the column names in the input data or data
+         generated by target.
+     :paramtype evaluator_config: Optional[Dict[str, Dict[str, str]]
      :keyword output_path: The local folder or file path to save evaluation results to if set. If folder path is provided
          the results will be saved to a file named `evaluation_results.json` in the folder.
      :paramtype output_path: Optional[str]
      :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
      :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
      :return: Evaluation results.
-     :rtype: ~azure.ai.evaluation.EvaluationResult
+     :rtype: dict
+
+     :Example:

-     .. admonition:: Example:
+     Evaluate API can be used as follows:
+
+     .. code-block:: python
+
+         from azure.ai.evaluation import evaluate, RelevanceEvaluator, CoherenceEvaluator
+
+
+         model_config = {
+             "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
+             "api_key": os.environ.get("AZURE_OPENAI_KEY"),
+             "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT")
+         }
+
+         coherence_eval = CoherenceEvaluator(model_config=model_config)
+         relevance_eval = RelevanceEvaluator(model_config=model_config)
+
+         path = "evaluate_test_data.jsonl"
+         result = evaluate(
+             data=path,
+             evaluators={
+                 "coherence": coherence_eval,
+                 "relevance": relevance_eval,
+             },
+             evaluator_config={
+                 "coherence": {
+                     "response": "${data.response}",
+                     "query": "${data.query}"
+                 },
+                 "relevance": {
+                     "response": "${data.response}",
+                     "context": "${data.context}",
+                     "query": "${data.query}"
+                 }
+             }
+         )

-     .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-         :start-after: [START evaluate_method]
-         :end-before: [END evaluate_method]
-         :language: python
-         :dedent: 8
-         :caption: Run an evaluation on local data with Coherence and Relevance evaluators.
      """
      try:
          return _evaluate(
@@ -656,145 +539,107 @@ def evaluate(
              internal_message=error_message,
              target=ErrorTarget.EVALUATE,
              category=ErrorCategory.FAILED_EXECUTION,
-             blame=ErrorBlame.USER_ERROR,
-         ) from e
-
-         # Ensure a consistent user experience when encountering errors by converting
-         # all other exceptions to EvaluationException.
-         if not isinstance(e, EvaluationException):
-             raise EvaluationException(
-                 message=str(e),
-                 target=ErrorTarget.EVALUATE,
-                 category=ErrorCategory.FAILED_EXECUTION,
-                 blame=ErrorBlame.SYSTEM_ERROR,
+             blame=ErrorBlame.UNKNOWN,
          ) from e

          raise e


- def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
-     # Extract evaluators with a non-empty "run_summary"
-     output_dict = {
-         name: result["run_summary"] for name, result in per_evaluator_results.items() if result.get("run_summary")
-     }
-
-     if output_dict:
-         print("======= Combined Run Summary (Per Evaluator) =======\n")
-         print(json.dumps(output_dict, indent=4))
-         print("\n====================================================\n")
-
-
- def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
+ def _evaluate(  # pylint: disable=too-many-locals
      *,
-     evaluators: Dict[str, Callable],
      evaluation_name: Optional[str] = None,
      target: Optional[Callable] = None,
-     data: Union[str, os.PathLike],
-     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
+     data: Optional[str] = None,
+     evaluators: Optional[Dict[str, Callable]] = None,
+     evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
      azure_ai_project: Optional[AzureAIProject] = None,
-     output_path: Optional[Union[str, os.PathLike]] = None,
+     output_path: Optional[str] = None,
      **kwargs,
- ) -> EvaluationResult:
+ ):
      input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)

      # Process evaluator config to replace ${target.} with ${data.}
      if evaluator_config is None:
          evaluator_config = {}
-     # extract column mapping dicts into dictionary mapping evaluator name to column mapping
-     column_mapping = _process_column_mappings(
-         {
-             evaluator_name: evaluator_configuration.get("column_mapping", None)
-             for evaluator_name, evaluator_configuration in evaluator_config.items()
-         }
+     evaluator_config = _process_evaluator_config(evaluator_config)
+     _validate_columns(input_data_df, evaluators, target, evaluator_config)
+
+     # Target Run
+     pf_client = PFClient(
+         config=(
+             {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)} if azure_ai_project else None
+         ),
+         user_agent=USER_AGENT,
      )

-     if target is not None:
-         _validate_columns_for_target(input_data_df, target)
-
-     pf_client = PFClient(user_agent=USER_AGENT)
-     target_run: Optional[Run] = None
+     trace_destination = pf_client._config.get_trace_destination()

-     # Create default configuration for evaluators that directly maps
-     # input data names to keyword inputs of the same name in the evaluators.
-     column_mapping = column_mapping or {}
-     column_mapping.setdefault("default", {})
+     target_run = None

-     # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
-     target_generated_columns: Set[str] = set()
+     target_generated_columns = set()
      if data is not None and target is not None:
          input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-             target, data, pf_client, input_data_df, evaluation_name, **kwargs
+             target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
          )

-         for evaluator_name, mapping in column_mapping.items():
+         # Make sure, the default is always in the configuration.
+         if not evaluator_config:
+             evaluator_config = {}
+         if "default" not in evaluator_config:
+             evaluator_config["default"] = {}
+
+         for evaluator_name, mapping in evaluator_config.items():
              mapped_to_values = set(mapping.values())
              for col in target_generated_columns:
                  # If user defined mapping differently, do not change it.
                  # If it was mapped to target, we have already changed it
-                 # in _process_column_mappings
+                 # in _process_evaluator_config
                  run_output = f"${{run.outputs.{col}}}"
                  # We will add our mapping only if
                  # customer did not mapped target output.
                  if col not in mapping and run_output not in mapped_to_values:
-                     column_mapping[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup
-
-     # After we have generated all columns, we can check if we have everything we need for evaluators.
-     _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)
-
-     # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
-     # via target mapping.
-     # If both the data and the output dictionary of the target function
-     # have the same column, then the target function value is used.
-     if input_data_df is not None:
-         for col in input_data_df.columns:
-             # Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
-             # Also ignore columns that are already in config, since they've been covered by target mapping.
-             if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
-                 column_mapping["default"][col] = f"${{data.{col}}}"
-
-     def eval_batch_run(
-         batch_run_client: TClient, *, data=Union[str, os.PathLike, pd.DataFrame]
-     ) -> Dict[str, __EvaluatorInfo]:
-         with EvalRunContext(batch_run_client):
-             runs = {
-                 evaluator_name: batch_run_client.run(
-                     flow=evaluator,
-                     run=target_run,
-                     evaluator_name=evaluator_name,
-                     column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
-                     data=data,
-                     stream=True,
-                     name=kwargs.get("_run_name"),
-                 )
-                 for evaluator_name, evaluator in evaluators.items()
-             }
+                     evaluator_config[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup

-             # get_details needs to be called within EvalRunContext scope in order to have user agent populated
-             return {
-                 evaluator_name: {
-                     "result": batch_run_client.get_details(run, all_results=True),
-                     "metrics": batch_run_client.get_metrics(run),
-                     "run_summary": batch_run_client.get_run_summary(run),
-                 }
-                 for evaluator_name, run in runs.items()
-             }
+     # After we have generated all columns we can check if we have
+     # everything we need for evaluators.
+     _validate_columns(input_data_df, evaluators, target=None, evaluator_config=evaluator_config)

      # Batch Run
+     evaluators_info = {}
      use_pf_client = kwargs.get("_use_pf_client", True)
      if use_pf_client:
+         batch_run_client = ProxyClient(pf_client)
+
          # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
          # multiple evaluators. If the path is already absolute, abspath will return the original path.
          data = os.path.abspath(data)
-         per_evaluator_results = eval_batch_run(ProxyClient(pf_client), data=data)
      else:
+         batch_run_client = CodeClient()
          data = input_data_df
-         per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)
+
+     with BatchRunContext(batch_run_client):
+         for evaluator_name, evaluator in evaluators.items():
+             evaluators_info[evaluator_name] = {}
+             evaluators_info[evaluator_name]["run"] = batch_run_client.run(
+                 flow=evaluator,
+                 run=target_run,
+                 evaluator_name=evaluator_name,
+                 column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
+                 data=data,
+                 stream=True,
+                 name=kwargs.get("_run_name"),
+             )
+
+         # get_details needs to be called within BatchRunContext scope in order to have user agent populated
+         for evaluator_name, evaluator_info in evaluators_info.items():
+             evaluator_info["result"] = batch_run_client.get_details(evaluator_info["run"], all_results=True)
+             evaluator_info["metrics"] = batch_run_client.get_metrics(evaluator_info["run"])

      # Concatenate all results
      evaluators_result_df = None
      evaluators_metric = {}
-     for evaluator_name, evaluator_result in per_evaluator_results.items():
-         evaluator_result_df = evaluator_result["result"]
+     for evaluator_name, evaluator_info in evaluators_info.items():
+         evaluator_result_df = evaluator_info["result"]

          # drop input columns
          evaluator_result_df = evaluator_result_df.drop(
@@ -817,7 +662,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
              else evaluator_result_df
          )

-         evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_result["metrics"].items()})
+         evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_info["metrics"].items()})

      # Rename columns, generated by target function to outputs instead of inputs.
      # If target generates columns, already present in the input data, these columns
@@ -828,9 +673,6 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
      metrics = _aggregate_metrics(evaluators_result_df, evaluators)
      metrics.update(evaluators_metric)

-     # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
-     target_run = None
-     trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
      studio_url = _log_metrics_and_instance_results(
          metrics,
          result_df,
@@ -839,10 +681,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
          evaluation_name,
      )

-     result_df_dict = result_df.to_dict("records")
-     result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore
-
-     _print_summary(per_evaluator_results)
+     result = {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url}

      if output_path:
          _write_output(output_path, result)
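Despite the return annotation changing from EvaluationResult (1.0.0) to a plain dict (1.0.0b1), both versions assemble the same top-level keys in the final hunk above: "rows", "metrics", and "studio_url". A hedged sketch of consuming that shape; the row and metric names below are illustrative stand-ins, not values taken from either wheel:

    import json

    # Stand-in for the value returned by evaluate(...).
    result = {
        "rows": [{"outputs.f1.f1_score": 0.67}],   # per-line evaluator outputs (illustrative key)
        "metrics": {"f1.f1_score": 0.67},          # aggregated means / defect rates (illustrative key)
        "studio_url": None,                        # populated only when azure_ai_project is provided
    }
    print(json.dumps(result["metrics"], indent=4))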