azure-ai-evaluation 1.0.0b4__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. azure/ai/evaluation/__init__.py +22 -0
  2. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +4 -0
  3. azure/ai/evaluation/_common/constants.py +5 -0
  4. azure/ai/evaluation/_common/math.py +73 -2
  5. azure/ai/evaluation/_common/rai_service.py +250 -62
  6. azure/ai/evaluation/_common/utils.py +196 -23
  7. azure/ai/evaluation/_constants.py +7 -6
  8. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
  9. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +13 -4
  10. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +19 -6
  11. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
  12. azure/ai/evaluation/_evaluate/_eval_run.py +55 -14
  13. azure/ai/evaluation/_evaluate/_evaluate.py +312 -228
  14. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +7 -6
  15. azure/ai/evaluation/_evaluate/_utils.py +46 -11
  16. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +17 -18
  17. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +67 -31
  18. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
  19. azure/ai/evaluation/_evaluators/_common/_base_eval.py +37 -24
  20. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +21 -9
  21. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +52 -16
  22. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +91 -48
  23. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +100 -26
  24. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +94 -26
  25. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +96 -26
  26. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +97 -26
  27. azure/ai/evaluation/_evaluators/_eci/_eci.py +31 -4
  28. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
  29. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +67 -36
  30. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
  31. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +14 -16
  32. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +106 -34
  33. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
  34. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
  35. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +20 -27
  36. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
  37. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
  38. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
  39. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
  40. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
  41. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
  42. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
  43. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
  44. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +87 -31
  45. azure/ai/evaluation/_evaluators/_qa/_qa.py +23 -31
  46. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +72 -36
  47. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
  48. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +83 -125
  49. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
  50. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +26 -27
  51. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  52. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
  53. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +37 -28
  54. azure/ai/evaluation/_evaluators/_xpia/xpia.py +94 -33
  55. azure/ai/evaluation/_exceptions.py +19 -0
  56. azure/ai/evaluation/_model_configurations.py +83 -15
  57. azure/ai/evaluation/_version.py +1 -1
  58. azure/ai/evaluation/simulator/__init__.py +2 -1
  59. azure/ai/evaluation/simulator/_adversarial_scenario.py +20 -1
  60. azure/ai/evaluation/simulator/_adversarial_simulator.py +29 -35
  61. azure/ai/evaluation/simulator/_constants.py +11 -1
  62. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  63. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  64. azure/ai/evaluation/simulator/_direct_attack_simulator.py +17 -9
  65. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  66. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +22 -1
  67. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +90 -35
  68. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +4 -2
  69. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +8 -4
  70. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -4
  71. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -1
  72. azure/ai/evaluation/simulator/_simulator.py +165 -105
  73. azure/ai/evaluation/simulator/_utils.py +31 -13
  74. azure_ai_evaluation-1.0.1.dist-info/METADATA +600 -0
  75. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/NOTICE.txt +20 -0
  76. azure_ai_evaluation-1.0.1.dist-info/RECORD +119 -0
  77. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/WHEEL +1 -1
  78. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
  79. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
  80. azure_ai_evaluation-1.0.0b4.dist-info/METADATA +0 -535
  81. azure_ai_evaluation-1.0.0b4.dist-info/RECORD +0 -106
  82. /azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +0 -0
  83. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/top_level.txt +0 -0
@@ -2,18 +2,20 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import inspect
+import json
+import logging
 import os
 import re
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
-import json

 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
+from promptflow._sdk._errors import UserAuthenticationError, UploadInternalError
 from promptflow.client import PFClient
 from promptflow.entities import Run
-from promptflow._sdk._errors import MissingAzurePackage

-from azure.ai.evaluation._common.math import list_sum
+from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
+from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

 from .._constants import (
@@ -23,11 +25,10 @@ from .._constants import (
     Prefixes,
     _InternalEvaluationMetrics,
 )
-from .._model_configurations import AzureAIProject, EvaluatorConfig
+from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
 from .._user_agent import USER_AGENT
-from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
+from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext
 from ._utils import (
-    EvaluateResult,
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
@@ -35,6 +36,15 @@ from ._utils import (
 )

 TClient = TypeVar("TClient", ProxyClient, CodeClient)
+LOGGER = logging.getLogger(__name__)
+
+# For metrics (aggregates) whose metric names intentionally differ from their
+# originating column name, usually because the aggregation of the original value
+# means something sufficiently different.
+# Note that content safety metrics are handled seprately.
+METRIC_COLUMN_NAME_REPLACEMENTS = {
+    "groundedness_pro_label": "groundedness_pro_passing_rate",
+}


 class __EvaluatorInfo(TypedDict):
@@ -43,6 +53,33 @@ class __EvaluatorInfo(TypedDict):
     run_summary: Dict[str, Any]


+def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
+    """Identify and average various metrics that need to have the metric name be replaced,
+    instead of having the metric match the originating column name.
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :return: A tuple; the first element is a list of dataframe columns that were aggregated,
+        and the second element is a dictionary of resultant new metric column names and their values.
+    :rtype: Tuple[List[str], Dict[str, float]]
+    """
+    renamed_cols = []
+    metric_columns = {}
+    for col in df.columns:
+        metric_prefix = col.split(".")[0]
+        metric_name = col.split(".")[1]
+        if metric_name in METRIC_COLUMN_NAME_REPLACEMENTS:
+            renamed_cols.append(col)
+            new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
+            col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
+            try:
+                metric_columns[new_col_name] = round(list_mean_nan_safe(col_with_numeric_values), 2)
+            except EvaluationException:  # only exception that can be cause is all NaN values
+                msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+                LOGGER.warning(msg)
+
+    return renamed_cols, metric_columns
+
+
 # pylint: disable=line-too-long
 def _aggregate_content_safety_metrics(
     df: pd.DataFrame, evaluators: Dict[str, Callable]
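Note (editorial): the new _aggregate_other_metrics helper above renames selected per-row columns whose aggregate means something different from the raw value — e.g. the mean of the boolean-like groundedness_pro_label column is reported as groundedness_pro_passing_rate. A minimal, self-contained sketch of that behavior using only pandas (the data below is illustrative, and the package's list_mean_nan_safe is approximated here by a plain NaN-skipping mean):

    import pandas as pd

    METRIC_COLUMN_NAME_REPLACEMENTS = {"groundedness_pro_label": "groundedness_pro_passing_rate"}

    # Per-row results as produced by an evaluator named "groundedness_pro" (illustrative data).
    df = pd.DataFrame({"groundedness_pro.groundedness_pro_label": [1, 0, 1, None]})

    metrics = {}
    for col in df.columns:
        prefix, name = col.split(".", 1)
        if name in METRIC_COLUMN_NAME_REPLACEMENTS:
            numeric = pd.to_numeric(df[col], errors="coerce")
            # NaN-safe mean: None/NaN rows are ignored rather than poisoning the aggregate.
            metrics[f"{prefix}.{METRIC_COLUMN_NAME_REPLACEMENTS[name]}"] = round(float(numeric.mean(skipna=True)), 2)

    print(metrics)  # {'groundedness_pro.groundedness_pro_passing_rate': 0.67}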
@@ -85,11 +122,15 @@ def _aggregate_content_safety_metrics(
     for col in content_safety_df.columns:
         defect_rate_name = col.replace("_score", "_defect_rate")
         col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
-        defect_rates[defect_rate_name] = round(
-            list_sum(col_with_numeric_values >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT)
-            / col_with_numeric_values.count(),
-            2,
-        )
+        try:
+            col_with_boolean_values = apply_transform_nan_safe(
+                col_with_numeric_values, lambda x: 1 if x >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT else 0
+            )
+            defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
+        except EvaluationException:  # only exception that can be cause is all NaN values
+            msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+            LOGGER.warning(msg)
+
     return content_safety_cols, defect_rates


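Note (editorial): the defect-rate change above replaces a raw sum/count with NaN-safe helpers, so rows whose score could not be parsed no longer affect the aggregate. A rough equivalent with plain pandas (the threshold value and sample scores are illustrative, not taken from the package):

    import pandas as pd

    THRESHOLD = 4  # illustrative stand-in for CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
    scores = pd.to_numeric(pd.Series([0, 5, 7, None, "bad"]), errors="coerce")

    # Keep only rows that parsed to a number, then flag rows at or above the threshold.
    valid = scores.dropna()
    defect_rate = round(float((valid >= THRESHOLD).mean()), 2) if not valid.empty else None

    print(defect_rate)  # 0.67 -> 2 of the 3 parseable rows are at or above the threshold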
@@ -119,10 +160,11 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
     for col in label_df.columns:
         defect_rate_name = col.replace("_label", "_defect_rate")
         col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
-        defect_rates[defect_rate_name] = round(
-            list_sum(col_with_boolean_values) / col_with_boolean_values.count(),
-            2,
-        )
+        try:
+            defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
+        except EvaluationException:  # only exception that can be cause is all NaN values
+            msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+            LOGGER.warning(msg)
     return label_cols, defect_rates


@@ -146,8 +188,11 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     # Rename certain columns as defect rates if we know that's what their aggregates represent
     # Content safety metrics
     content_safety_cols, cs_defect_rates = _aggregate_content_safety_metrics(df, evaluators)
+    other_renamed_cols, renamed_cols = _aggregate_other_metrics(df)
     handled_columns.extend(content_safety_cols)
+    handled_columns.extend(other_renamed_cols)
     defect_rates.update(cs_defect_rates)
+    defect_rates.update(renamed_cols)
     # Label-based (true/false) metrics where 'true' means 'something is wrong'
     label_cols, label_defect_rates = _aggregate_label_defect_metrics(df)
     handled_columns.extend(label_cols)
@@ -156,6 +201,9 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     # For rest of metrics, we will calculate mean
     df.drop(columns=handled_columns, inplace=True)

+    # NOTE: nan/None values don't count as as booleans, so boolean columns with
+    # nan/None values won't have a mean produced from them.
+    # This is different from label-based known evaluators, which have special handling.
     mean_value = df.mean(numeric_only=True)
     metrics = mean_value.to_dict()
     # Add defect rates back into metrics
@@ -163,34 +211,133 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     return metrics


-def _validate_input_data_for_evaluator(evaluator, evaluator_name, df_data, is_target_fn=False):
+def _validate_columns_for_target(
+    df: pd.DataFrame,
+    target: Callable,
+) -> None:
+    """
+    Check that all columns needed by target function are present.
+
+    :param df: The data frame to be validated.
+    :type df: pd.DataFrame
+    :param target: The callable to be applied to data set.
+    :type target: Optional[Callable]
+    :raises EvaluationException: If the column starts with "__outputs." or if the input data contains missing fields.
+    """
+    if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns):
+        msg = "The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.'
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    # If the target function is given, it may return
+    # several columns and hence we cannot check the availability of columns
+    # without knowing target function semantics.
+    # Instead, here we will validate the columns, taken by target.
     required_inputs = [
         param.name
-        for param in inspect.signature(evaluator).parameters.values()
+        for param in inspect.signature(target).parameters.values()
         if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
     ]

-    missing_inputs = [col for col in required_inputs if col not in df_data.columns]
-    if missing_inputs and "conversation" in required_inputs:
-        non_conversation_inputs = [val for val in required_inputs if val != "conversation"]
-        if len(missing_inputs) == len(non_conversation_inputs) and [
-            input in non_conversation_inputs for input in missing_inputs
-        ]:
-            missing_inputs = []
+    missing_inputs = [col for col in required_inputs if col not in df.columns]
     if missing_inputs:
-        if not is_target_fn:
-            msg = f"Missing required inputs for evaluator {evaluator_name} : {missing_inputs}."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.MISSING_FIELD,
-                blame=ErrorBlame.USER_ERROR,
-            )
-        msg = f"Missing required inputs for target : {missing_inputs}."
+        msg = f"Missing required inputs for target: {missing_inputs}."
         raise EvaluationException(
             message=msg,
-            internal_message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.MISSING_FIELD,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+
+def _validate_columns_for_evaluators(
+    df: pd.DataFrame,
+    evaluators: Dict[str, Callable],
+    target: Optional[Callable],
+    target_generated_columns: Optional[Set[str]],
+    column_mapping: Dict[str, Dict[str, str]],
+) -> None:
+    """
+    Check that all columns needed by evaluators are present.
+
+    :param df: The data frame to be validated.
+    :type df: pd.DataFrame
+    :param evaluators: The dictionary of evaluators.
+    :type evaluators: Dict[str, Callable]
+    :param target: The callable to be applied to data set.
+    :type target: Optional[Callable]
+    :param target_generated_columns: The set of columns generated by the target callable.
+    :type target_generated_columns: Optional[Set[str]]
+    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping.
+    :type column_mapping: Dict[str, Dict[str, str]]
+    :raises EvaluationException: If data is missing required inputs or if the target callable did not generate the necessary columns.
+    """
+    missing_inputs_per_evaluator = {}
+
+    for evaluator_name, evaluator in evaluators.items():
+        # Apply column mapping
+        mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
+        new_df = _apply_column_mapping(df, mapping_config)
+
+        # Validate input data for evaluator
+        is_built_in = evaluator.__module__.startswith("azure.ai.evaluation")
+        if is_built_in:
+            # Note that for built-in evaluators supporting the "conversation" parameter,
+            # input parameters are now optional.
+            evaluator_params = [
+                param.name
+                for param in inspect.signature(evaluator).parameters.values()
+                if param.name not in ["kwargs", "args", "self"]
+            ]
+
+            if "conversation" in evaluator_params and "conversation" in new_df.columns:
+                # Ignore the missing fields if "conversation" presents in the input data
+                missing_inputs = []
+            else:
+                optional_params = (
+                    evaluator._OPTIONAL_PARAMS  # pylint: disable=protected-access
+                    if hasattr(evaluator, "_OPTIONAL_PARAMS")
+                    else []
+                )
+                excluded_params = set(new_df.columns).union(optional_params)
+                missing_inputs = [col for col in evaluator_params if col not in excluded_params]
+
+                # If "conversation" is the only parameter and it is missing, keep it in the missing inputs
+                # Otherwise, remove it from the missing inputs
+                if "conversation" in missing_inputs:
+                    if not (evaluator_params == ["conversation"] and missing_inputs == ["conversation"]):
+                        missing_inputs.remove("conversation")
+        else:
+            evaluator_params = [
+                param.name
+                for param in inspect.signature(evaluator).parameters.values()
+                if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
+            ]
+
+            missing_inputs = [col for col in evaluator_params if col not in new_df.columns]
+
+        if missing_inputs:
+            missing_inputs_per_evaluator[evaluator_name] = missing_inputs
+
+    if missing_inputs_per_evaluator:
+        msg = "Some evaluators are missing required inputs:\n"
+        for evaluator_name, missing in missing_inputs_per_evaluator.items():
+            msg += f"- {evaluator_name}: {missing}\n"
+
+        # Add the additional notes
+        msg += "\nTo resolve this issue:\n"
+        msg += "- Ensure the data contains required inputs.\n"
+        if target is not None:
+            msg += "- Verify that the target is generating the necessary columns for the evaluators. "
+            msg += f"Currently generated columns: {target_generated_columns} \n"
+        msg += "- Check that the column mapping is correctly configured."
+
+        raise EvaluationException(
+            message=msg.strip(),
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.MISSING_FIELD,
             blame=ErrorBlame.USER_ERROR,
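Note (editorial): both validators above derive a callable's expected inputs from its call signature and compare them to the columns available in the data. The core technique is inspect.signature; a minimal sketch with a hypothetical target and data columns:

    import inspect

    def my_target(query, context=None, **kwargs):
        return {"response": f"echo: {query}"}

    data_columns = {"question"}  # columns present in the input data (hypothetical)

    # Required inputs = parameters without defaults, ignoring *args/**kwargs/self.
    required = [
        p.name
        for p in inspect.signature(my_target).parameters.values()
        if p.default is inspect.Parameter.empty and p.name not in ("kwargs", "args", "self")
    ]
    missing = [name for name in required if name not in data_columns]
    print(required, missing)  # ['query'] ['query'] -> evaluate() would raise a MISSING_FIELD error here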
@@ -199,76 +346,85 @@ def _validate_input_data_for_evaluator(evaluator, evaluator_name, df_data, is_ta

 def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name):
     if data is None:
-        msg = "data parameter must be provided for evaluation."
+        msg = "The 'data' parameter is required for evaluation."
         raise EvaluationException(
             message=msg,
-            internal_message=msg,
             target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.MISSING_FIELD,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not isinstance(data, (os.PathLike, str)):
+        msg = "The 'data' parameter must be a string or a path-like object."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not os.path.exists(data):
+        msg = f"The input data file path '{data}' does not exist."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
         )

     if target is not None:
         if not callable(target):
-            msg = "target parameter must be a callable function."
+            msg = "The 'target' parameter must be a callable function."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )

-    if data is not None:
-        if not isinstance(data, str):
-            msg = "data parameter must be a string."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
+    if not evaluators:
+        msg = "The 'evaluators' parameter is required and cannot be None or empty."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not isinstance(evaluators, dict):
+        msg = "The 'evaluators' parameter must be a dictionary."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )

-    if evaluators is not None:
-        if not isinstance(evaluators, dict):
-            msg = "evaluators parameter must be a dictionary."
+    if output_path is not None:
+        if not isinstance(output_path, (os.PathLike, str)):
+            msg = "The 'output_path' parameter must be a string or a path-like object."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )

-    if output_path is not None:
-        if not isinstance(output_path, str):
-            msg = "output_path parameter must be a string."
+        output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
+        if output_dir and not os.path.exists(output_dir):
+            msg = f"The output directory '{output_dir}' does not exist. Please create the directory manually."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )

     if azure_ai_project is not None:
-        if not isinstance(azure_ai_project, Dict):
-            msg = "azure_ai_project parameter must be a dictionary."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
+        validate_azure_ai_project(azure_ai_project)

     if evaluation_name is not None:
-        if not isinstance(evaluation_name, str):
-            msg = "evaluation_name parameter must be a string."
+        if not isinstance(evaluation_name, str) or not evaluation_name.strip():
+            msg = "The 'evaluation_name' parameter must be a non-empty string."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
@@ -278,8 +434,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
         initial_data_df = pd.read_json(data, lines=True)
     except Exception as e:
         raise EvaluationException(
-            message=f"Failed to load data from {data}. Confirm that it is valid jsonl data. Error: {str(e)}.",
-            internal_message="Failed to load data. Confirm that it is valid jsonl data.",
+            message=f"Unable to load data from '{data}'. Please ensure the input is valid JSONL format. Detailed error: {e}.",
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
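Note (editorial): the loader keeps using pd.read_json(data, lines=True), so the input must be JSON Lines — one JSON object per line, no enclosing array. A minimal, self-contained illustration (file name and fields are hypothetical):

    import pandas as pd

    jsonl = (
        '{"query": "What is the capital of France?", "response": "Paris"}\n'
        '{"query": "What is 2 + 2?", "response": "4"}\n'
    )
    with open("evaluate_test_data.jsonl", "w", encoding="utf-8") as f:
        f.write(jsonl)

    df = pd.read_json("evaluate_test_data.jsonl", lines=True)
    print(df.shape)  # (2, 2) -> two rows with columns ['query', 'response']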
@@ -288,57 +443,13 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
     return initial_data_df


-def _validate_columns(
-    df: pd.DataFrame,
-    evaluators: Dict[str, Callable],
-    target: Optional[Callable],
-    column_mapping: Dict[str, Dict[str, str]],
-) -> None:
-    """
-    Check that all columns needed by evaluator or target function are present.
-
-    :param df: The data frame to be validated.
-    :type df: pd.DataFrame
-    :param evaluators: The dictionary of evaluators.
-    :type evaluators: Dict[str, Callable]
-    :param target: The callable to be applied to data set.
-    :type target: Optional[Callable]
-    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping
-    :type column_mapping: Dict[str, Dict[str, str]]
-    :raises EvaluationException: If column starts from "__outputs." while target is defined.
-    """
-    if target:
-        if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns):
-            msg = "The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.'
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
-        # If the target function is given, it may return
-        # several columns and hence we cannot check the availability of columns
-        # without knowing target function semantics.
-        # Instead, here we will validate the columns, taken by target.
-        _validate_input_data_for_evaluator(target, None, df, is_target_fn=True)
-    else:
-        for evaluator_name, evaluator in evaluators.items():
-            # Apply column mapping
-            mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
-            new_df = _apply_column_mapping(df, mapping_config)
-
-            # Validate input data for evaluator
-            _validate_input_data_for_evaluator(evaluator, evaluator_name, new_df)
-
-
 def _apply_target_to_data(
     target: Callable,
-    data: str,
+    data: Union[str, os.PathLike],
     pf_client: PFClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
-    _run_name: Optional[str] = None,
+    **kwargs,
 ) -> Tuple[pd.DataFrame, Set[str], Run]:
     """
     Apply the target function to the data set and return updated data and generated columns.
@@ -346,29 +457,45 @@ def _apply_target_to_data(
     :param target: The function to be applied to data.
     :type target: Callable
     :param data: The path to input jsonl file.
-    :type data: str
+    :type data: Union[str, os.PathLike]
     :param pf_client: The promptflow client to be used.
     :type pf_client: PFClient
     :param initial_data: The data frame with the loaded data.
     :type initial_data: pd.DataFrame
     :param evaluation_name: The name of the evaluation.
     :type evaluation_name: Optional[str]
-    :param _run_name: The name of target run. Used for testing only.
-    :type _run_name: Optional[str]
     :return: The tuple, containing data frame and the list of added columns.
     :rtype: Tuple[pandas.DataFrame, List[str]]
     """
-    # We are manually creating the temporary directory for the flow
-    # because the way tempdir remove temporary directories will
-    # hang the debugger, because promptflow will keep flow directory.
-    run: Run = pf_client.run(
-        flow=target,
-        display_name=evaluation_name,
-        data=data,
-        properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
-        stream=True,
-        name=_run_name,
-    )
+    _run_name = kwargs.get("_run_name")
+    upload_target_snaphot = kwargs.get("_upload_target_snapshot", False)
+
+    try:
+        with TargetRunContext(upload_target_snaphot):
+            run: Run = pf_client.run(
+                flow=target,
+                display_name=evaluation_name,
+                data=data,
+                properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
+                stream=True,
+                name=_run_name,
+            )
+    except (UserAuthenticationError, UploadInternalError) as ex:
+        if "Failed to upload run" in ex.message:
+            msg = (
+                "Failed to upload the target run to the cloud. "
+                "This may be caused by insufficient permission to access storage or other errors."
+            )
+            raise EvaluationException(
+                message=msg,
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.FAILED_REMOTE_TRACKING,
+                blame=ErrorBlame.USER_ERROR,
+                tsg_link="https://aka.ms/azsdk/python/evaluation/remotetracking/troubleshoot",
+            ) from ex
+
+        raise ex
+
     target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
     # Remove input and output prefix
     generated_columns = {
@@ -456,15 +583,15 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
 # @log_evaluate_activity
 def evaluate(
     *,
-    data: str,
+    data: Union[str, os.PathLike],
     evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
-    output_path: Optional[str] = None,
+    output_path: Optional[Union[str, os.PathLike]] = None,
     **kwargs,
-):
+) -> EvaluationResult:
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
     data will be run through target function and then results will be evaluated.

@@ -489,50 +616,16 @@ def evaluate(
     :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
     :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
     :return: Evaluation results.
-    :rtype: dict
+    :rtype: ~azure.ai.evaluation.EvaluationResult

-    :Example:
-
-    Evaluate API can be used as follows:
-
-    .. code-block:: python
-
-        from azure.ai.evaluation import evaluate, RelevanceEvaluator, CoherenceEvaluator
-
-
-        model_config = {
-            "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
-            "api_key": os.environ.get("AZURE_OPENAI_KEY"),
-            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
-        }
-
-        coherence_eval = CoherenceEvaluator(model_config=model_config)
-        relevance_eval = RelevanceEvaluator(model_config=model_config)
-
-        path = "evaluate_test_data.jsonl"
-        result = evaluate(
-            data=path,
-            evaluators={
-                "coherence": coherence_eval,
-                "relevance": relevance_eval,
-            },
-            evaluator_config={
-                "coherence": {
-                    "column_mapping": {
-                        "response": "${data.response}",
-                        "query": "${data.query}",
-                    },
-                },
-                "relevance": {
-                    "column_mapping": {
-                        "response": "${data.response}",
-                        "context": "${data.context}",
-                        "query": "${data.query}",
-                    },
-                },
-            },
-        )
+    .. admonition:: Example:

+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START evaluate_method]
+            :end-before: [END evaluate_method]
+            :language: python
+            :dedent: 8
+            :caption: Run an evaluation on local data with Coherence and Relevance evaluators.
     """
     try:
         return _evaluate(
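Note (editorial): the inline usage example removed from the docstring above now lives in the sample file referenced by the literalinclude directive. For readers of this diff, a minimal call reconstructed from that removed snippet (environment variable names and the data path come from the removed example; treat this as a sketch rather than the official sample):

    import os
    from azure.ai.evaluation import evaluate, CoherenceEvaluator, RelevanceEvaluator

    model_config = {
        "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
        "api_key": os.environ.get("AZURE_OPENAI_KEY"),
        "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
    }

    result = evaluate(
        data="evaluate_test_data.jsonl",  # JSONL file with query/response/context fields
        evaluators={
            "coherence": CoherenceEvaluator(model_config=model_config),
            "relevance": RelevanceEvaluator(model_config=model_config),
        },
        evaluator_config={
            "relevance": {
                "column_mapping": {
                    "response": "${data.response}",
                    "context": "${data.context}",
                    "query": "${data.query}",
                },
            },
        },
    )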
@@ -563,7 +656,17 @@ def evaluate(
                 internal_message=error_message,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.FAILED_EXECUTION,
-                blame=ErrorBlame.UNKNOWN,
+                blame=ErrorBlame.USER_ERROR,
+            ) from e
+
+        # Ensure a consistent user experience when encountering errors by converting
+        # all other exceptions to EvaluationException.
+        if not isinstance(e, EvaluationException):
+            raise EvaluationException(
+                message=str(e),
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.FAILED_EXECUTION,
+                blame=ErrorBlame.SYSTEM_ERROR,
             ) from e

         raise e
@@ -578,7 +681,7 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
     if output_dict:
         print("======= Combined Run Summary (Per Evaluator) =======\n")
         print(json.dumps(output_dict, indent=4))
-        print("\n====================================================")
+        print("\n====================================================\n")


 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
@@ -586,12 +689,12 @@
     evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    data: str,
+    data: Union[str, os.PathLike],
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
-    output_path: Optional[str] = None,
+    output_path: Optional[Union[str, os.PathLike]] = None,
     **kwargs,
-) -> EvaluateResult:
+) -> EvaluationResult:
     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)

     # Process evaluator config to replace ${target.} with ${data.}
@@ -604,33 +707,11 @@
             for evaluator_name, evaluator_configuration in evaluator_config.items()
         }
     )
-    _validate_columns(input_data_df, evaluators, target, column_mapping)

-    # Target Run
-    try:
-        pf_client = PFClient(
-            config=(
-                {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)}
-                if azure_ai_project
-                else None
-            ),
-            user_agent=USER_AGENT,
-        )
-    # pylint: disable=raise-missing-from
-    except MissingAzurePackage:
-        msg = (
-            "The required packages for remote tracking are missing.\n"
-            'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
-        )
-
-        raise EvaluationException(
-            message=msg,
-            target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.MISSING_PACKAGE,
-            blame=ErrorBlame.USER_ERROR,
-        )
+    if target is not None:
+        _validate_columns_for_target(input_data_df, target)

-    trace_destination: Optional[str] = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
+    pf_client = PFClient(user_agent=USER_AGENT)
     target_run: Optional[Run] = None

     # Create default configuration for evaluators that directly maps
@@ -639,9 +720,10 @@
     column_mapping.setdefault("default", {})

     # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
+    target_generated_columns: Set[str] = set()
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-            target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
+            target, data, pf_client, input_data_df, evaluation_name, **kwargs
         )

         for evaluator_name, mapping in column_mapping.items():
@@ -656,9 +738,8 @@
                 if col not in mapping and run_output not in mapped_to_values:
                     column_mapping[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup

-    # After we have generated all columns we can check if we have
-    # everything we need for evaluators.
-    _validate_columns(input_data_df, evaluators, target=None, column_mapping=column_mapping)
+    # After we have generated all columns, we can check if we have everything we need for evaluators.
+    _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)

     # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
     # via target mapping.
@@ -674,7 +755,7 @@
     def eval_batch_run(
         batch_run_client: TClient, *, data=Union[str, os.PathLike, pd.DataFrame]
     ) -> Dict[str, __EvaluatorInfo]:
-        with BatchRunContext(batch_run_client):
+        with EvalRunContext(batch_run_client):
             runs = {
                 evaluator_name: batch_run_client.run(
                     flow=evaluator,
@@ -688,7 +769,7 @@
                 for evaluator_name, evaluator in evaluators.items()
             }

-            # get_details needs to be called within BatchRunContext scope in order to have user agent populated
+            # get_details needs to be called within EvalRunContext scope in order to have user agent populated
             return {
                 evaluator_name: {
                     "result": batch_run_client.get_details(run, all_results=True),
@@ -704,11 +785,7 @@
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
         data = os.path.abspath(data)
-
-        # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
-        # The root cause is still unclear, but it seems related to a conflict between the async run uploader
-        # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
-        per_evaluator_results = eval_batch_run(ProxyClient(PFClient(user_agent=USER_AGENT)), data=data)
+        per_evaluator_results = eval_batch_run(ProxyClient(pf_client), data=data)
     else:
         data = input_data_df
         per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)
@@ -750,19 +827,26 @@
     result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
     metrics = _aggregate_metrics(evaluators_result_df, evaluators)
     metrics.update(evaluators_metric)
-    studio_url = _log_metrics_and_instance_results(
-        metrics,
-        result_df,
-        trace_destination,
-        target_run,
-        evaluation_name,
-    )

-    result: EvaluateResult = {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url}
+    # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
+    target_run = None
+    trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
+    studio_url = None
+    if trace_destination:
+        studio_url = _log_metrics_and_instance_results(
+            metrics,
+            result_df,
+            trace_destination,
+            target_run,
+            evaluation_name,
+        )
+
+    result_df_dict = result_df.to_dict("records")
+    result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore
+
+    _print_summary(per_evaluator_results)

     if output_path:
         _write_output(output_path, result)

-    _print_summary(per_evaluator_results)
-
     return result
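Note (editorial): with the 1.0.1 signature, evaluate returns an EvaluationResult TypedDict rather than a plain dict. The shape assembled at the end of _evaluate above suggests consumption along these lines (the result value is assumed to come from an evaluate(...) call as in the earlier sketch):

    # result = evaluate(...)  # as in the earlier sketch
    rows = result["rows"]              # per-line inputs plus evaluator outputs
    metrics = result["metrics"]        # aggregated metrics, e.g. means and defect rates
    studio_url = result["studio_url"]  # appears to be populated only when a trace destination is configured

    for name, value in metrics.items():
        print(f"{name}: {value}")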