azure-ai-evaluation 1.0.0b3__py3-none-any.whl → 1.0.0b5__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, exactly as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.



Files changed (93)
  1. azure/ai/evaluation/__init__.py +23 -1
  2. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +20 -9
  3. azure/ai/evaluation/_common/constants.py +9 -2
  4. azure/ai/evaluation/_common/math.py +29 -0
  5. azure/ai/evaluation/_common/rai_service.py +222 -93
  6. azure/ai/evaluation/_common/utils.py +328 -19
  7. azure/ai/evaluation/_constants.py +16 -8
  8. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
  9. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +33 -17
  10. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +14 -7
  11. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +22 -4
  12. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +35 -0
  13. azure/ai/evaluation/_evaluate/_eval_run.py +47 -14
  14. azure/ai/evaluation/_evaluate/_evaluate.py +370 -188
  15. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +15 -16
  16. azure/ai/evaluation/_evaluate/_utils.py +77 -25
  17. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
  18. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +16 -10
  19. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
  20. azure/ai/evaluation/_evaluators/_common/_base_eval.py +76 -46
  21. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +26 -19
  22. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +62 -25
  23. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -36
  24. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +67 -46
  25. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +33 -4
  26. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +33 -4
  27. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +33 -4
  28. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +33 -4
  29. azure/ai/evaluation/_evaluators/_eci/_eci.py +7 -5
  30. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
  31. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +22 -21
  32. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
  33. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
  34. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +51 -16
  35. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
  36. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
  37. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -7
  38. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
  39. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +130 -0
  40. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +57 -0
  41. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +96 -0
  42. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +120 -0
  43. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +96 -0
  44. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +96 -0
  45. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +96 -0
  46. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +46 -13
  47. azure/ai/evaluation/_evaluators/_qa/_qa.py +11 -6
  48. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +23 -20
  49. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
  50. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +126 -80
  51. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
  52. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +2 -2
  53. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  54. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +150 -0
  55. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +32 -15
  56. azure/ai/evaluation/_evaluators/_xpia/xpia.py +36 -10
  57. azure/ai/evaluation/_exceptions.py +26 -6
  58. azure/ai/evaluation/_http_utils.py +203 -132
  59. azure/ai/evaluation/_model_configurations.py +23 -6
  60. azure/ai/evaluation/_vendor/__init__.py +3 -0
  61. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  62. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
  63. azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
  64. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
  65. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  66. azure/ai/evaluation/_version.py +1 -1
  67. azure/ai/evaluation/simulator/__init__.py +2 -1
  68. azure/ai/evaluation/simulator/_adversarial_scenario.py +5 -0
  69. azure/ai/evaluation/simulator/_adversarial_simulator.py +88 -60
  70. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
  71. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
  72. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  73. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  74. azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
  75. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  76. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
  77. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +98 -95
  78. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
  79. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
  80. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
  81. azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
  82. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -9
  83. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
  84. azure/ai/evaluation/simulator/_simulator.py +222 -169
  85. azure/ai/evaluation/simulator/_tracing.py +4 -4
  86. azure/ai/evaluation/simulator/_utils.py +6 -6
  87. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/METADATA +237 -52
  88. azure_ai_evaluation-1.0.0b5.dist-info/NOTICE.txt +70 -0
  89. azure_ai_evaluation-1.0.0b5.dist-info/RECORD +120 -0
  90. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/WHEEL +1 -1
  91. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
  92. azure_ai_evaluation-1.0.0b3.dist-info/RECORD +0 -98
  93. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate.py

@@ -2,26 +2,31 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import inspect
+import json
 import os
 import re
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union

-import numpy as np
 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
+from promptflow._sdk._errors import MissingAzurePackage, UserAuthenticationError, UploadInternalError
 from promptflow.client import PFClient
+from promptflow.entities import Run

+from azure.ai.evaluation._common.math import list_sum
+from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
+    EvaluationRunProperties,
     Prefixes,
     _InternalEvaluationMetrics,
 )
-from .._model_configurations import AzureAIProject, EvaluatorConfig
+from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
 from .._user_agent import USER_AGENT
-from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
+from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext
 from ._utils import (
     _apply_column_mapping,
     _log_metrics_and_instance_results,
@@ -29,10 +34,52 @@ from ._utils import (
     _write_output,
 )

+TClient = TypeVar("TClient", ProxyClient, CodeClient)
+
+# For metrics (aggregates) whose metric names intentionally differ from their
+# originating column name, usually because the aggregation of the original value
+# means something sufficiently different.
+# Note that content safety metrics are handled seprately.
+METRIC_COLUMN_NAME_REPLACEMENTS = {
+    "groundedness_pro_label": "groundedness_pro_passing_rate",
+}
+
+
+class __EvaluatorInfo(TypedDict):
+    result: pd.DataFrame
+    metrics: Dict[str, Any]
+    run_summary: Dict[str, Any]
+
+
+def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
+    """Identify and average various metrics that need to have the metric name be replaced,
+    instead of having the metric match the originating column name.
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :return: A tuple; the first element is a list of dataframe columns that were aggregated,
+        and the second element is a dictionary of resultant new metric column names and their values.
+    :rtype: Tuple[List[str], Dict[str, float]]
+    """
+    renamed_cols = []
+    metric_columns = {}
+    for col in df.columns:
+        metric_prefix = col.split(".")[0]
+        metric_name = col.split(".")[1]
+        if metric_name in METRIC_COLUMN_NAME_REPLACEMENTS:
+            renamed_cols.append(col)
+            new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
+            col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
+            metric_columns[new_col_name] = round(
+                list_sum(col_with_numeric_values) / col_with_numeric_values.count(),
+                2,
+            )
+
+    return renamed_cols, metric_columns
+

 # pylint: disable=line-too-long
 def _aggregate_content_safety_metrics(
-    df: pd.DataFrame, evaluators: Dict[str, Type]
+    df: pd.DataFrame, evaluators: Dict[str, Callable]
 ) -> Tuple[List[str], Dict[str, float]]:
     """Find and aggregate defect rates for content safety metrics. Returns both a list
     of columns that were used to calculate defect rates and the defect rates themselves.
@@ -73,7 +120,7 @@ def _aggregate_content_safety_metrics(
         defect_rate_name = col.replace("_score", "_defect_rate")
         col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
         defect_rates[defect_rate_name] = round(
-            np.sum(col_with_numeric_values >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT)
+            list_sum(col_with_numeric_values >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT)
             / col_with_numeric_values.count(),
             2,
         )
@@ -107,13 +154,13 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[s
         defect_rate_name = col.replace("_label", "_defect_rate")
         col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
         defect_rates[defect_rate_name] = round(
-            np.sum(col_with_boolean_values) / col_with_boolean_values.count(),
+            list_sum(col_with_boolean_values) / col_with_boolean_values.count(),
             2,
         )
     return label_cols, defect_rates


-def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[str, float]:
+def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
     """Aggregate metrics from the evaluation results.
     On top of naively calculating the mean of most metrics, this function also identifies certain columns
     that represent defect rates and renames them accordingly. Other columns in the dataframe are dropped.
@@ -122,7 +169,7 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[st
     :param df: The dataframe of evaluation results.
     :type df: ~pandas.DataFrame
     :param evaluators: A dictionary mapping of strings to evaluator classes.
-    :type evaluators: Dict[str, Type]
+    :type evaluators: Dict[str, Callable]
     :return: The aggregated metrics.
     :rtype: Dict[str, float]
     """
@@ -133,8 +180,11 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[st
     # Rename certain columns as defect rates if we know that's what their aggregates represent
     # Content safety metrics
     content_safety_cols, cs_defect_rates = _aggregate_content_safety_metrics(df, evaluators)
+    other_renamed_cols, renamed_cols = _aggregate_other_metrics(df)
     handled_columns.extend(content_safety_cols)
+    handled_columns.extend(other_renamed_cols)
     defect_rates.update(cs_defect_rates)
+    defect_rates.update(renamed_cols)
     # Label-based (true/false) metrics where 'true' means 'something is wrong'
     label_cols, label_defect_rates = _aggregate_label_defect_metrics(df)
     handled_columns.extend(label_cols)
@@ -150,34 +200,127 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[st
     return metrics


-def _validate_input_data_for_evaluator(evaluator, evaluator_name, df_data, is_target_fn=False):
+def _validate_columns_for_target(
+    df: pd.DataFrame,
+    target: Callable,
+) -> None:
+    """
+    Check that all columns needed by target function are present.
+
+    :param df: The data frame to be validated.
+    :type df: pd.DataFrame
+    :param target: The callable to be applied to data set.
+    :type target: Optional[Callable]
+    :raises EvaluationException: If the column starts with "__outputs." or if the input data contains missing fields.
+    """
+    if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns):
+        msg = "The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.'
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    # If the target function is given, it may return
+    # several columns and hence we cannot check the availability of columns
+    # without knowing target function semantics.
+    # Instead, here we will validate the columns, taken by target.
     required_inputs = [
         param.name
-        for param in inspect.signature(evaluator).parameters.values()
+        for param in inspect.signature(target).parameters.values()
         if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
     ]

-    missing_inputs = [col for col in required_inputs if col not in df_data.columns]
-    if missing_inputs and "conversation" in required_inputs:
-        non_conversation_inputs = [val for val in required_inputs if val != "conversation"]
-        if len(missing_inputs) == len(non_conversation_inputs) and [
-            input in non_conversation_inputs for input in missing_inputs
-        ]:
-            missing_inputs = []
+    missing_inputs = [col for col in required_inputs if col not in df.columns]
     if missing_inputs:
-        if not is_target_fn:
-            msg = f"Missing required inputs for evaluator {evaluator_name} : {missing_inputs}."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.MISSING_FIELD,
-                blame=ErrorBlame.USER_ERROR,
-            )
-        msg = f"Missing required inputs for target : {missing_inputs}."
+        msg = f"Missing required inputs for target: {missing_inputs}."
         raise EvaluationException(
             message=msg,
-            internal_message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.MISSING_FIELD,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+
+def _validate_columns_for_evaluators(
+    df: pd.DataFrame,
+    evaluators: Dict[str, Callable],
+    target: Optional[Callable],
+    target_generated_columns: Optional[Set[str]],
+    column_mapping: Dict[str, Dict[str, str]],
+) -> None:
+    """
+    Check that all columns needed by evaluators are present.
+
+    :param df: The data frame to be validated.
+    :type df: pd.DataFrame
+    :param evaluators: The dictionary of evaluators.
+    :type evaluators: Dict[str, Callable]
+    :param target: The callable to be applied to data set.
+    :type target: Optional[Callable]
+    :param target_generated_columns: The set of columns generated by the target callable.
+    :type target_generated_columns: Optional[Set[str]]
+    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping.
+    :type column_mapping: Dict[str, Dict[str, str]]
+    :raises EvaluationException: If data is missing required inputs or if the target callable did not generate the necessary columns.
+    """
+    missing_inputs_per_evaluator = {}
+
+    for evaluator_name, evaluator in evaluators.items():
+        # Apply column mapping
+        mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
+        new_df = _apply_column_mapping(df, mapping_config)
+
+        # Validate input data for evaluator
+        is_built_in = evaluator.__module__.startswith("azure.ai.evaluation")
+        if is_built_in:
+            # Note that for built-in evaluators supporting the "conversation" parameter,
+            # input parameters are now optional.
+            evaluator_params = [
+                param.name
+                for param in inspect.signature(evaluator).parameters.values()
+                if param.name not in ["kwargs", "args", "self"]
+            ]
+
+            if "conversation" in evaluator_params and "conversation" in new_df.columns:
+                # Ignore the missing fields if "conversation" presents in the input data
+                missing_inputs = []
+            else:
+                missing_inputs = [col for col in evaluator_params if col not in new_df.columns]
+
+                # If "conversation" is the only parameter and it is missing, keep it in the missing inputs
+                # Otherwise, remove it from the missing inputs
+                if "conversation" in missing_inputs:
+                    if not (evaluator_params == ["conversation"] and missing_inputs == ["conversation"]):
+                        missing_inputs.remove("conversation")
+        else:
+            evaluator_params = [
+                param.name
+                for param in inspect.signature(evaluator).parameters.values()
+                if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
+            ]

+            missing_inputs = [col for col in evaluator_params if col not in new_df.columns]
+
+        if missing_inputs:
+            missing_inputs_per_evaluator[evaluator_name] = missing_inputs
+
+    if missing_inputs_per_evaluator:
+        msg = "Some evaluators are missing required inputs:\n"
+        for evaluator_name, missing in missing_inputs_per_evaluator.items():
+            msg += f"- {evaluator_name}: {missing}\n"
+
+        # Add the additional notes
+        msg += "\nTo resolve this issue:\n"
+        msg += "- Ensure the data contains required inputs.\n"
+        if target is not None:
+            msg += "- Verify that the target is generating the necessary columns for the evaluators. "
+            msg += f"Currently generated columns: {target_generated_columns} \n"
+        msg += "- Check that the column mapping is correctly configured."
+
+        raise EvaluationException(
+            message=msg.strip(),
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.MISSING_FIELD,
             blame=ErrorBlame.USER_ERROR,
@@ -186,76 +329,85 @@ def _validate_input_data_for_evaluator(evaluator, evaluator_name, df_data, is_ta

 def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name):
     if data is None:
-        msg = "data parameter must be provided for evaluation."
+        msg = "The 'data' parameter is required for evaluation."
         raise EvaluationException(
             message=msg,
-            internal_message=msg,
             target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.MISSING_FIELD,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not isinstance(data, (os.PathLike, str)):
+        msg = "The 'data' parameter must be a string or a path-like object."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not os.path.exists(data):
+        msg = f"The input data file path '{data}' does not exist."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
         )

     if target is not None:
         if not callable(target):
-            msg = "target parameter must be a callable function."
+            msg = "The 'target' parameter must be a callable function."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )

-    if data is not None:
-        if not isinstance(data, str):
-            msg = "data parameter must be a string."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
+    if not evaluators:
+        msg = "The 'evaluators' parameter is required and cannot be None or empty."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not isinstance(evaluators, dict):
+        msg = "The 'evaluators' parameter must be a dictionary."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )

-    if evaluators is not None:
-        if not isinstance(evaluators, dict):
-            msg = "evaluators parameter must be a dictionary."
+    if output_path is not None:
+        if not isinstance(output_path, (os.PathLike, str)):
+            msg = "The 'output_path' parameter must be a string or a path-like object."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )

-    if output_path is not None:
-        if not isinstance(output_path, str):
-            msg = "output_path parameter must be a string."
+        output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
+        if not os.path.exists(output_dir):
+            msg = f"The output directory '{output_dir}' does not exist. Please create the directory manually."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )

     if azure_ai_project is not None:
-        if not isinstance(azure_ai_project, Dict):
-            msg = "azure_ai_project parameter must be a dictionary."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
+        validate_azure_ai_project(azure_ai_project)

     if evaluation_name is not None:
-        if not isinstance(evaluation_name, str):
-            msg = "evaluation_name parameter must be a string."
+        if not isinstance(evaluation_name, str) or not evaluation_name.strip():
+            msg = "The 'evaluation_name' parameter must be a non-empty string."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
@@ -265,8 +417,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
         initial_data_df = pd.read_json(data, lines=True)
     except Exception as e:
         raise EvaluationException(
-            message=f"Failed to load data from {data}. Confirm that it is valid jsonl data. Error: {str(e)}.",
-            internal_message="Failed to load data. Confirm that it is valid jsonl data.",
+            message=f"Unable to load data from '{data}'. Please ensure the input is valid JSONL format. Detailed error: {e}.",
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
@@ -275,88 +426,60 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
     return initial_data_df


-def _validate_columns(
-    df: pd.DataFrame,
-    evaluators: Dict[str, Any],
-    target: Optional[Callable],
-    column_mapping: Dict[str, Dict[str, str]],
-) -> None:
-    """
-    Check that all columns needed by evaluator or target function are present.
-
-    :param df: The data frame to be validated.
-    :type df: pd.DataFrame
-    :param evaluators: The dictionary of evaluators.
-    :type evaluators: Dict[str, Any]
-    :param target: The callable to be applied to data set.
-    :type target: Optional[Callable]
-    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping
-    :type column_mapping: Dict[str, Dict[str, str]]
-    :raises EvaluationException: If column starts from "__outputs." while target is defined.
-    """
-    if target:
-        if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns):
-            msg = "The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.'
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
-        # If the target function is given, it may return
-        # several columns and hence we cannot check the availability of columns
-        # without knowing target function semantics.
-        # Instead, here we will validate the columns, taken by target.
-        _validate_input_data_for_evaluator(target, None, df, is_target_fn=True)
-    else:
-        for evaluator_name, evaluator in evaluators.items():
-            # Apply column mapping
-            mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
-            new_df = _apply_column_mapping(df, mapping_config)
-
-            # Validate input data for evaluator
-            _validate_input_data_for_evaluator(evaluator, evaluator_name, new_df)
-
-
 def _apply_target_to_data(
     target: Callable,
-    data: str,
+    data: Union[str, os.PathLike],
     pf_client: PFClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
-    _run_name: Optional[str] = None,
-) -> Tuple[pd.DataFrame, Set[str]]:
+    **kwargs,
+) -> Tuple[pd.DataFrame, Set[str], Run]:
     """
     Apply the target function to the data set and return updated data and generated columns.

     :param target: The function to be applied to data.
     :type target: Callable
     :param data: The path to input jsonl file.
-    :type data: str
+    :type data: Union[str, os.PathLike]
     :param pf_client: The promptflow client to be used.
     :type pf_client: PFClient
     :param initial_data: The data frame with the loaded data.
     :type initial_data: pd.DataFrame
     :param evaluation_name: The name of the evaluation.
     :type evaluation_name: Optional[str]
-    :param _run_name: The name of target run. Used for testing only.
-    :type _run_name: Optional[str]
     :return: The tuple, containing data frame and the list of added columns.
     :rtype: Tuple[pandas.DataFrame, List[str]]
     """
-    # We are manually creating the temporary directory for the flow
-    # because the way tempdir remove temporary directories will
-    # hang the debugger, because promptflow will keep flow directory.
-    run = pf_client.run(
-        flow=target,
-        display_name=evaluation_name,
-        data=data,
-        properties={"runType": "eval_run", "isEvaluatorRun": "true"},
-        stream=True,
-        name=_run_name,
-    )
-    target_output = pf_client.runs.get_details(run, all_results=True)
+    _run_name = kwargs.get("_run_name")
+    upload_target_snaphot = kwargs.get("_upload_target_snapshot", False)
+
+    try:
+        with TargetRunContext(upload_target_snaphot):
+            run: Run = pf_client.run(
+                flow=target,
+                display_name=evaluation_name,
+                data=data,
+                properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
+                stream=True,
+                name=_run_name,
+            )
+    except (UserAuthenticationError, UploadInternalError) as ex:
+        if "Failed to upload run" in ex.message:
+            msg = (
+                "Failed to upload the target run to the cloud. "
+                "This may be caused by insufficient permission to access storage or other errors."
+            )
+            raise EvaluationException(
+                message=msg,
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.FAILED_REMOTE_TRACKING,
+                blame=ErrorBlame.USER_ERROR,
+                tsg_link="https://aka.ms/azsdk/python/evaluation/remotetracking/troubleshoot",
+            ) from ex
+
+        raise ex
+
+    target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
@@ -378,16 +501,18 @@ def _apply_target_to_data(
     return target_output, generated_columns, run


-def _process_column_mappings(column_mapping: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
+def _process_column_mappings(
+    column_mapping: Dict[str, Optional[Dict[str, str]]],
+) -> Dict[str, Dict[str, str]]:
     """Process column_mapping to replace ${target.} with ${data.}

     :param column_mapping: The configuration for evaluators.
-    :type column_mapping: Dict[str, Dict[str, str]]
+    :type column_mapping: Dict[str, Optional[Dict[str, str]]]
     :return: The processed configuration.
     :rtype: Dict[str, Dict[str, str]]
     """

-    processed_config = {}
+    processed_config: Dict[str, Dict[str, str]] = {}

     unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")

@@ -441,15 +566,15 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
 # @log_evaluate_activity
 def evaluate(
     *,
-    data: str,
+    data: Union[str, os.PathLike],
     evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
-    output_path: Optional[str] = None,
+    output_path: Optional[Union[str, os.PathLike]] = None,
     **kwargs,
-):
+) -> EvaluationResult:
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
     data will be run through target function and then results will be evaluated.

@@ -474,7 +599,7 @@ def evaluate(
     :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
     :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
     :return: Evaluation results.
-    :rtype: dict
+    :rtype: ~azure.ai.evaluation.EvaluationResult

     :Example:

@@ -548,47 +673,92 @@ def evaluate(
                 internal_message=error_message,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.FAILED_EXECUTION,
-                blame=ErrorBlame.UNKNOWN,
+                blame=ErrorBlame.USER_ERROR,
+            ) from e
+
+        # Ensure a consistent user experience when encountering errors by converting
+        # all other exceptions to EvaluationException.
+        if not isinstance(e, EvaluationException):
+            raise EvaluationException(
+                message=str(e),
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.FAILED_EXECUTION,
+                blame=ErrorBlame.SYSTEM_ERROR,
             ) from e

         raise e


+def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
+    # Extract evaluators with a non-empty "run_summary"
+    output_dict = {
+        name: result["run_summary"] for name, result in per_evaluator_results.items() if result.get("run_summary")
+    }
+
+    if output_dict:
+        print("======= Combined Run Summary (Per Evaluator) =======\n")
+        print(json.dumps(output_dict, indent=4))
+        print("\n====================================================")
+
+
 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
+    evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    data: Optional[str] = None,
-    evaluators: Optional[Dict[str, Callable]] = None,
+    data: Union[str, os.PathLike],
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
-    output_path: Optional[str] = None,
+    output_path: Optional[Union[str, os.PathLike]] = None,
     **kwargs,
-):
+) -> EvaluationResult:
     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)

     # Process evaluator config to replace ${target.} with ${data.}
     if evaluator_config is None:
         evaluator_config = {}
     # extract column mapping dicts into dictionary mapping evaluator name to column mapping
-    column_mapping = {
-        evaluator_name: evaluator_configuration.get("column_mapping", None)
-        for evaluator_name, evaluator_configuration in evaluator_config.items()
-    }
-    column_mapping = _process_column_mappings(column_mapping)
-    _validate_columns(input_data_df, evaluators, target, column_mapping)
+    column_mapping = _process_column_mappings(
+        {
+            evaluator_name: evaluator_configuration.get("column_mapping", None)
+            for evaluator_name, evaluator_configuration in evaluator_config.items()
+        }
+    )
+
+    if target is not None:
+        _validate_columns_for_target(input_data_df, target)

     # Target Run
-    pf_client = PFClient(
-        config=(
-            {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)} if azure_ai_project else None
-        ),
-        user_agent=USER_AGENT,
-    )
+    try:
+        pf_client = PFClient(
+            config=(
+                {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)}
+                if azure_ai_project
+                else None
+            ),
+            user_agent=USER_AGENT,
+        )
+    # pylint: disable=raise-missing-from
+    except MissingAzurePackage:
+        msg = (
+            "The required packages for remote tracking are missing.\n"
+            'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
+        )
+
+        raise EvaluationException(  # pylint: disable=raise-missing-from
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.MISSING_PACKAGE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    trace_destination: Optional[str] = pf_client._config.get_trace_destination()  # pylint: disable=protected-access

-    trace_destination = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
-    target_run = None
-    target_generated_columns = set()
+    # Handle the case where the customer manually run "pf config set trace.destination=none"
+    if trace_destination and trace_destination.lower() == "none":
+        trace_destination = None
+
+    target_run: Optional[Run] = None

     # Create default configuration for evaluators that directly maps
     # input data names to keyword inputs of the same name in the evaluators.
@@ -596,9 +766,10 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     column_mapping.setdefault("default", {})

     # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
+    target_generated_columns: Set[str] = set()
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-            target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
+            target, data, pf_client, input_data_df, evaluation_name, **kwargs
         )

         for evaluator_name, mapping in column_mapping.items():
@@ -613,9 +784,8 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
                 if col not in mapping and run_output not in mapped_to_values:
                     column_mapping[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup

-        # After we have generated all columns we can check if we have
-        # everything we need for evaluators.
-        _validate_columns(input_data_df, evaluators, target=None, column_mapping=column_mapping)
+    # After we have generated all columns, we can check if we have everything we need for evaluators.
+    _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)

     # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
     # via target mapping.
@@ -627,45 +797,54 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
         # Also ignore columns that are already in config, since they've been covered by target mapping.
         if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
             column_mapping["default"][col] = f"${{data.{col}}}"
+
+    def eval_batch_run(
+        batch_run_client: TClient, *, data=Union[str, os.PathLike, pd.DataFrame]
+    ) -> Dict[str, __EvaluatorInfo]:
+        with EvalRunContext(batch_run_client):
+            runs = {
+                evaluator_name: batch_run_client.run(
+                    flow=evaluator,
+                    run=target_run,
+                    evaluator_name=evaluator_name,
+                    column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
+                    data=data,
+                    stream=True,
+                    name=kwargs.get("_run_name"),
+                )
+                for evaluator_name, evaluator in evaluators.items()
+            }
+
+            # get_details needs to be called within EvalRunContext scope in order to have user agent populated
+            return {
+                evaluator_name: {
+                    "result": batch_run_client.get_details(run, all_results=True),
+                    "metrics": batch_run_client.get_metrics(run),
+                    "run_summary": batch_run_client.get_run_summary(run),
+                }
+                for evaluator_name, run in runs.items()
+            }
+
     # Batch Run
-    evaluators_info = {}
     use_pf_client = kwargs.get("_use_pf_client", True)
     if use_pf_client:
-        # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
-        # The root cause is still unclear, but it seems related to a conflict between the async run uploader
-        # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
-        batch_run_client = ProxyClient(PFClient(user_agent=USER_AGENT))
-
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
         data = os.path.abspath(data)
+
+        # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
+        # The root cause is still unclear, but it seems related to a conflict between the async run uploader
+        # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
+        per_evaluator_results = eval_batch_run(ProxyClient(PFClient(user_agent=USER_AGENT)), data=data)
     else:
-        batch_run_client = CodeClient()
         data = input_data_df
-
-    with BatchRunContext(batch_run_client):
-        for evaluator_name, evaluator in evaluators.items():
-            evaluators_info[evaluator_name] = {}
-            evaluators_info[evaluator_name]["run"] = batch_run_client.run(
-                flow=evaluator,
-                run=target_run,
-                evaluator_name=evaluator_name,
-                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
-                data=data,
-                stream=True,
-                name=kwargs.get("_run_name"),
-            )
-
-        # get_details needs to be called within BatchRunContext scope in order to have user agent populated
-        for evaluator_name, evaluator_info in evaluators_info.items():
-            evaluator_info["result"] = batch_run_client.get_details(evaluator_info["run"], all_results=True)
-            evaluator_info["metrics"] = batch_run_client.get_metrics(evaluator_info["run"])
+        per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)

     # Concatenate all results
     evaluators_result_df = None
     evaluators_metric = {}
-    for evaluator_name, evaluator_info in evaluators_info.items():
-        evaluator_result_df = evaluator_info["result"]
+    for evaluator_name, evaluator_result in per_evaluator_results.items():
+        evaluator_result_df = evaluator_result["result"]

         # drop input columns
         evaluator_result_df = evaluator_result_df.drop(
@@ -688,7 +867,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
             else evaluator_result_df
         )

-        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_info["metrics"].items()})
+        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_result["metrics"].items()})

     # Rename columns, generated by target function to outputs instead of inputs.
     # If target generates columns, already present in the input data, these columns
@@ -706,9 +885,12 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
             evaluation_name,
         )

-    result = {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url}
+    result_df_dict = result_df.to_dict("records")
+    result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore

     if output_path:
         _write_output(output_path, result)

+    _print_summary(per_evaluator_results)
+
     return result
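
For orientation, below is a minimal, illustrative sketch of calling the reworked evaluate() entry point whose signature and EvaluationResult return type appear in the diff above. The answer_length evaluator, the data file, and the output path are hypothetical placeholders, not part of the package; only the keyword arguments and the "rows", "metrics", and "studio_url" result keys are taken from the code shown here.

    # Illustrative sketch only. The evaluator, file names, and column names below are
    # hypothetical; the evaluate() keywords and result keys come from the diff above.
    from azure.ai.evaluation import evaluate


    def answer_length(*, response: str, **kwargs):
        # Custom evaluator: any callable whose required keyword inputs match data columns
        # (here, a "response" column in the JSONL file).
        return {"length": len(response)}


    if __name__ == "__main__":  # guard recommended; the batch run may spawn worker processes
        result = evaluate(
            data="eval_data.jsonl",  # str or os.PathLike; the file must exist (new validation)
            evaluators={"answer_length": answer_length},
            evaluation_name="answer-length-check",
            output_path="./results/eval_output.json",  # optional; ./results must already exist
        )
        print(result["metrics"])       # aggregated metrics across rows
        print(len(result["rows"]))     # per-row evaluation results
        print(result["studio_url"])    # None unless azure_ai_project / remote tracking is configured

If azure_ai_project is supplied for logging results to AI Studio, the diff shows that missing remote-tracking dependencies now surface as an EvaluationException suggesting "pip install azure-ai-evaluation[remote]".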