azure-ai-evaluation 1.0.0b4__py3-none-any.whl → 1.0.0b5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release (this version of azure-ai-evaluation might be problematic).

Files changed (79)
  1. azure/ai/evaluation/__init__.py +22 -0
  2. azure/ai/evaluation/_common/constants.py +5 -0
  3. azure/ai/evaluation/_common/math.py +11 -0
  4. azure/ai/evaluation/_common/rai_service.py +172 -35
  5. azure/ai/evaluation/_common/utils.py +162 -23
  6. azure/ai/evaluation/_constants.py +6 -6
  7. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
  8. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +4 -4
  9. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +6 -3
  10. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +35 -0
  11. azure/ai/evaluation/_evaluate/_eval_run.py +21 -4
  12. azure/ai/evaluation/_evaluate/_evaluate.py +267 -139
  13. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -5
  14. azure/ai/evaluation/_evaluate/_utils.py +40 -7
  15. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
  16. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +14 -9
  17. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
  18. azure/ai/evaluation/_evaluators/_common/_base_eval.py +20 -19
  19. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +18 -8
  20. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +48 -9
  21. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +56 -19
  22. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +5 -5
  23. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +30 -1
  24. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +30 -1
  25. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +30 -1
  26. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +30 -1
  27. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -1
  28. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +20 -20
  29. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
  30. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
  31. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +49 -15
  32. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
  33. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
  34. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -7
  35. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
  36. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +130 -0
  37. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +57 -0
  38. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +96 -0
  39. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +120 -0
  40. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +96 -0
  41. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +96 -0
  42. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +96 -0
  43. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +44 -11
  44. azure/ai/evaluation/_evaluators/_qa/_qa.py +7 -3
  45. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -19
  46. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
  47. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +125 -82
  48. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
  49. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +2 -2
  50. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  51. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +150 -0
  52. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +17 -14
  53. azure/ai/evaluation/_evaluators/_xpia/xpia.py +32 -5
  54. azure/ai/evaluation/_exceptions.py +17 -0
  55. azure/ai/evaluation/_model_configurations.py +18 -1
  56. azure/ai/evaluation/_version.py +1 -1
  57. azure/ai/evaluation/simulator/__init__.py +2 -1
  58. azure/ai/evaluation/simulator/_adversarial_scenario.py +5 -0
  59. azure/ai/evaluation/simulator/_adversarial_simulator.py +4 -1
  60. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  61. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  62. azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
  63. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  64. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +22 -1
  65. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +79 -34
  66. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -1
  67. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -4
  68. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -1
  69. azure/ai/evaluation/simulator/_simulator.py +115 -61
  70. azure/ai/evaluation/simulator/_utils.py +6 -6
  71. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/METADATA +166 -9
  72. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/NOTICE.txt +20 -0
  73. azure_ai_evaluation-1.0.0b5.dist-info/RECORD +120 -0
  74. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/WHEEL +1 -1
  75. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
  76. azure_ai_evaluation-1.0.0b4.dist-info/RECORD +0 -106
  77. /azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +0 -0
  78. /azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +0 -0
  79. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_evaluate/_evaluate.py

@@ -2,18 +2,19 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import inspect
+import json
 import os
 import re
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
-import json

 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
+from promptflow._sdk._errors import MissingAzurePackage, UserAuthenticationError, UploadInternalError
 from promptflow.client import PFClient
 from promptflow.entities import Run
-from promptflow._sdk._errors import MissingAzurePackage

 from azure.ai.evaluation._common.math import list_sum
+from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

 from .._constants import (
@@ -23,11 +24,10 @@ from .._constants import (
     Prefixes,
     _InternalEvaluationMetrics,
 )
-from .._model_configurations import AzureAIProject, EvaluatorConfig
+from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
 from .._user_agent import USER_AGENT
-from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
+from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext
 from ._utils import (
-    EvaluateResult,
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
@@ -36,6 +36,14 @@ from ._utils import (

 TClient = TypeVar("TClient", ProxyClient, CodeClient)

+# For metrics (aggregates) whose metric names intentionally differ from their
+# originating column name, usually because the aggregation of the original value
+# means something sufficiently different.
+# Note that content safety metrics are handled seprately.
+METRIC_COLUMN_NAME_REPLACEMENTS = {
+    "groundedness_pro_label": "groundedness_pro_passing_rate",
+}
+

 class __EvaluatorInfo(TypedDict):
     result: pd.DataFrame
@@ -43,6 +51,32 @@ class __EvaluatorInfo(TypedDict):
     run_summary: Dict[str, Any]


+def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
+    """Identify and average various metrics that need to have the metric name be replaced,
+    instead of having the metric match the originating column name.
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :return: A tuple; the first element is a list of dataframe columns that were aggregated,
+        and the second element is a dictionary of resultant new metric column names and their values.
+    :rtype: Tuple[List[str], Dict[str, float]]
+    """
+    renamed_cols = []
+    metric_columns = {}
+    for col in df.columns:
+        metric_prefix = col.split(".")[0]
+        metric_name = col.split(".")[1]
+        if metric_name in METRIC_COLUMN_NAME_REPLACEMENTS:
+            renamed_cols.append(col)
+            new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
+            col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
+            metric_columns[new_col_name] = round(
+                list_sum(col_with_numeric_values) / col_with_numeric_values.count(),
+                2,
+            )
+
+    return renamed_cols, metric_columns
+
+
 # pylint: disable=line-too-long
 def _aggregate_content_safety_metrics(
     df: pd.DataFrame, evaluators: Dict[str, Callable]
@@ -146,8 +180,11 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     # Rename certain columns as defect rates if we know that's what their aggregates represent
     # Content safety metrics
     content_safety_cols, cs_defect_rates = _aggregate_content_safety_metrics(df, evaluators)
+    other_renamed_cols, renamed_cols = _aggregate_other_metrics(df)
     handled_columns.extend(content_safety_cols)
+    handled_columns.extend(other_renamed_cols)
     defect_rates.update(cs_defect_rates)
+    defect_rates.update(renamed_cols)
     # Label-based (true/false) metrics where 'true' means 'something is wrong'
     label_cols, label_defect_rates = _aggregate_label_defect_metrics(df)
     handled_columns.extend(label_cols)
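
The aggregation change above averages selected per-row label columns and reports them under a replacement metric name (groundedness_pro_label surfaces as groundedness_pro_passing_rate). A minimal standalone sketch of that computation using plain pandas; the column name and label values below are illustrative, not taken from a real run:

import pandas as pd

# Toy results frame: one row per evaluated line, columns named "<evaluator>.<metric>".
df = pd.DataFrame({"groundedness_pro.groundedness_pro_label": [True, True, False, True]})

replacements = {"groundedness_pro_label": "groundedness_pro_passing_rate"}

metrics = {}
for col in df.columns:
    prefix, name = col.split(".")[0], col.split(".")[1]
    if name in replacements:
        numeric = pd.to_numeric(df[col], errors="coerce")
        # Sum of passing rows over the count of non-NaN rows, rounded to two places,
        # mirroring _aggregate_other_metrics above.
        metrics[prefix + "." + replacements[name]] = round(numeric.sum() / numeric.count(), 2)

print(metrics)  # {'groundedness_pro.groundedness_pro_passing_rate': 0.75}

So three passing labels out of four rows surface in the aggregate metrics as a passing rate of 0.75 rather than as a mean of the original label column name.
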
@@ -163,34 +200,127 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     return metrics


-def _validate_input_data_for_evaluator(evaluator, evaluator_name, df_data, is_target_fn=False):
+def _validate_columns_for_target(
+    df: pd.DataFrame,
+    target: Callable,
+) -> None:
+    """
+    Check that all columns needed by target function are present.
+
+    :param df: The data frame to be validated.
+    :type df: pd.DataFrame
+    :param target: The callable to be applied to data set.
+    :type target: Optional[Callable]
+    :raises EvaluationException: If the column starts with "__outputs." or if the input data contains missing fields.
+    """
+    if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns):
+        msg = "The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.'
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    # If the target function is given, it may return
+    # several columns and hence we cannot check the availability of columns
+    # without knowing target function semantics.
+    # Instead, here we will validate the columns, taken by target.
     required_inputs = [
         param.name
-        for param in inspect.signature(evaluator).parameters.values()
+        for param in inspect.signature(target).parameters.values()
         if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
     ]

-    missing_inputs = [col for col in required_inputs if col not in df_data.columns]
-    if missing_inputs and "conversation" in required_inputs:
-        non_conversation_inputs = [val for val in required_inputs if val != "conversation"]
-        if len(missing_inputs) == len(non_conversation_inputs) and [
-            input in non_conversation_inputs for input in missing_inputs
-        ]:
-            missing_inputs = []
+    missing_inputs = [col for col in required_inputs if col not in df.columns]
     if missing_inputs:
-        if not is_target_fn:
-            msg = f"Missing required inputs for evaluator {evaluator_name} : {missing_inputs}."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.MISSING_FIELD,
-                blame=ErrorBlame.USER_ERROR,
-            )
-        msg = f"Missing required inputs for target : {missing_inputs}."
+        msg = f"Missing required inputs for target: {missing_inputs}."
         raise EvaluationException(
             message=msg,
-            internal_message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.MISSING_FIELD,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+
+def _validate_columns_for_evaluators(
+    df: pd.DataFrame,
+    evaluators: Dict[str, Callable],
+    target: Optional[Callable],
+    target_generated_columns: Optional[Set[str]],
+    column_mapping: Dict[str, Dict[str, str]],
+) -> None:
+    """
+    Check that all columns needed by evaluators are present.
+
+    :param df: The data frame to be validated.
+    :type df: pd.DataFrame
+    :param evaluators: The dictionary of evaluators.
+    :type evaluators: Dict[str, Callable]
+    :param target: The callable to be applied to data set.
+    :type target: Optional[Callable]
+    :param target_generated_columns: The set of columns generated by the target callable.
+    :type target_generated_columns: Optional[Set[str]]
+    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping.
+    :type column_mapping: Dict[str, Dict[str, str]]
+    :raises EvaluationException: If data is missing required inputs or if the target callable did not generate the necessary columns.
+    """
+    missing_inputs_per_evaluator = {}
+
+    for evaluator_name, evaluator in evaluators.items():
+        # Apply column mapping
+        mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
+        new_df = _apply_column_mapping(df, mapping_config)
+
+        # Validate input data for evaluator
+        is_built_in = evaluator.__module__.startswith("azure.ai.evaluation")
+        if is_built_in:
+            # Note that for built-in evaluators supporting the "conversation" parameter,
+            # input parameters are now optional.
+            evaluator_params = [
+                param.name
+                for param in inspect.signature(evaluator).parameters.values()
+                if param.name not in ["kwargs", "args", "self"]
+            ]
+
+            if "conversation" in evaluator_params and "conversation" in new_df.columns:
+                # Ignore the missing fields if "conversation" presents in the input data
+                missing_inputs = []
+            else:
+                missing_inputs = [col for col in evaluator_params if col not in new_df.columns]
+
+                # If "conversation" is the only parameter and it is missing, keep it in the missing inputs
+                # Otherwise, remove it from the missing inputs
+                if "conversation" in missing_inputs:
+                    if not (evaluator_params == ["conversation"] and missing_inputs == ["conversation"]):
+                        missing_inputs.remove("conversation")
+        else:
+            evaluator_params = [
+                param.name
+                for param in inspect.signature(evaluator).parameters.values()
+                if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
+            ]
+
+            missing_inputs = [col for col in evaluator_params if col not in new_df.columns]
+
+        if missing_inputs:
+            missing_inputs_per_evaluator[evaluator_name] = missing_inputs
+
+    if missing_inputs_per_evaluator:
+        msg = "Some evaluators are missing required inputs:\n"
+        for evaluator_name, missing in missing_inputs_per_evaluator.items():
+            msg += f"- {evaluator_name}: {missing}\n"
+
+        # Add the additional notes
+        msg += "\nTo resolve this issue:\n"
+        msg += "- Ensure the data contains required inputs.\n"
+        if target is not None:
+            msg += "- Verify that the target is generating the necessary columns for the evaluators. "
+            msg += f"Currently generated columns: {target_generated_columns} \n"
+        msg += "- Check that the column mapping is correctly configured."
+
+        raise EvaluationException(
+            message=msg.strip(),
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.MISSING_FIELD,
             blame=ErrorBlame.USER_ERROR,
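
The new _validate_columns_for_evaluators treats a "conversation" column as an alternative to a built-in evaluator's individual inputs. A simplified, standalone restatement of that rule (illustrative only, not the SDK's code; the parameter and column names are assumptions):

from typing import List


def missing_inputs_for_builtin(evaluator_params: List[str], data_columns: List[str]) -> List[str]:
    """Simplified restatement of the built-in evaluator check above."""
    if "conversation" in evaluator_params and "conversation" in data_columns:
        # A conversation column satisfies the evaluator on its own.
        return []
    missing = [p for p in evaluator_params if p not in data_columns]
    # "conversation" only counts as missing when it is the evaluator's sole parameter.
    if "conversation" in missing and evaluator_params != ["conversation"]:
        missing.remove("conversation")
    return missing


# Data supplies only "conversation": nothing is reported missing.
print(missing_inputs_for_builtin(["query", "response", "conversation"], ["conversation"]))  # []

# Data supplies only "query": "response" is reported, "conversation" is not.
print(missing_inputs_for_builtin(["query", "response", "conversation"], ["query"]))  # ['response']

Custom (non built-in) evaluators keep the stricter behavior: every parameter without a default must be present in the data or be supplied via the column mapping.
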
@@ -199,76 +329,85 @@ def _validate_input_data_for_evaluator(evaluator, evaluator_name, df_data, is_ta

 def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name):
     if data is None:
-        msg = "data parameter must be provided for evaluation."
+        msg = "The 'data' parameter is required for evaluation."
         raise EvaluationException(
             message=msg,
-            internal_message=msg,
             target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.MISSING_FIELD,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not isinstance(data, (os.PathLike, str)):
+        msg = "The 'data' parameter must be a string or a path-like object."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not os.path.exists(data):
+        msg = f"The input data file path '{data}' does not exist."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
         )

     if target is not None:
         if not callable(target):
-            msg = "target parameter must be a callable function."
+            msg = "The 'target' parameter must be a callable function."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )

-    if data is not None:
-        if not isinstance(data, str):
-            msg = "data parameter must be a string."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
+    if not evaluators:
+        msg = "The 'evaluators' parameter is required and cannot be None or empty."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not isinstance(evaluators, dict):
+        msg = "The 'evaluators' parameter must be a dictionary."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )

-    if evaluators is not None:
-        if not isinstance(evaluators, dict):
-            msg = "evaluators parameter must be a dictionary."
+    if output_path is not None:
+        if not isinstance(output_path, (os.PathLike, str)):
+            msg = "The 'output_path' parameter must be a string or a path-like object."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )

-    if output_path is not None:
-        if not isinstance(output_path, str):
-            msg = "output_path parameter must be a string."
+        output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
+        if not os.path.exists(output_dir):
+            msg = f"The output directory '{output_dir}' does not exist. Please create the directory manually."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )

     if azure_ai_project is not None:
-        if not isinstance(azure_ai_project, Dict):
-            msg = "azure_ai_project parameter must be a dictionary."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
+        validate_azure_ai_project(azure_ai_project)

     if evaluation_name is not None:
-        if not isinstance(evaluation_name, str):
-            msg = "evaluation_name parameter must be a string."
+        if not isinstance(evaluation_name, str) or not evaluation_name.strip():
+            msg = "The 'evaluation_name' parameter must be a non-empty string."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
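
Among the tightened argument checks above, output_path now accepts either an existing directory or a file path whose parent directory already exists. A small sketch of the directory resolution it performs (illustrative, using a temporary directory):

import os
import tempfile


def output_dir_for(output_path: str) -> str:
    # Mirrors the check above: output_path may name an existing directory, or a file
    # whose parent directory must already exist before evaluation runs.
    return output_path if os.path.isdir(output_path) else os.path.dirname(output_path)


with tempfile.TemporaryDirectory() as tmp:
    print(output_dir_for(tmp))                                # the directory itself
    print(output_dir_for(os.path.join(tmp, "results.json")))  # its parent directory
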
@@ -278,8 +417,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
         initial_data_df = pd.read_json(data, lines=True)
     except Exception as e:
         raise EvaluationException(
-            message=f"Failed to load data from {data}. Confirm that it is valid jsonl data. Error: {str(e)}.",
-            internal_message="Failed to load data. Confirm that it is valid jsonl data.",
+            message=f"Unable to load data from '{data}'. Please ensure the input is valid JSONL format. Detailed error: {e}.",
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
@@ -288,57 +426,13 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
     return initial_data_df


-def _validate_columns(
-    df: pd.DataFrame,
-    evaluators: Dict[str, Callable],
-    target: Optional[Callable],
-    column_mapping: Dict[str, Dict[str, str]],
-) -> None:
-    """
-    Check that all columns needed by evaluator or target function are present.
-
-    :param df: The data frame to be validated.
-    :type df: pd.DataFrame
-    :param evaluators: The dictionary of evaluators.
-    :type evaluators: Dict[str, Callable]
-    :param target: The callable to be applied to data set.
-    :type target: Optional[Callable]
-    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping
-    :type column_mapping: Dict[str, Dict[str, str]]
-    :raises EvaluationException: If column starts from "__outputs." while target is defined.
-    """
-    if target:
-        if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns):
-            msg = "The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.'
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
-        # If the target function is given, it may return
-        # several columns and hence we cannot check the availability of columns
-        # without knowing target function semantics.
-        # Instead, here we will validate the columns, taken by target.
-        _validate_input_data_for_evaluator(target, None, df, is_target_fn=True)
-    else:
-        for evaluator_name, evaluator in evaluators.items():
-            # Apply column mapping
-            mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
-            new_df = _apply_column_mapping(df, mapping_config)
-
-            # Validate input data for evaluator
-            _validate_input_data_for_evaluator(evaluator, evaluator_name, new_df)
-
-
 def _apply_target_to_data(
     target: Callable,
-    data: str,
+    data: Union[str, os.PathLike],
     pf_client: PFClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
-    _run_name: Optional[str] = None,
+    **kwargs,
 ) -> Tuple[pd.DataFrame, Set[str], Run]:
     """
     Apply the target function to the data set and return updated data and generated columns.
@@ -346,29 +440,45 @@ def _apply_target_to_data(
     :param target: The function to be applied to data.
     :type target: Callable
     :param data: The path to input jsonl file.
-    :type data: str
+    :type data: Union[str, os.PathLike]
     :param pf_client: The promptflow client to be used.
     :type pf_client: PFClient
     :param initial_data: The data frame with the loaded data.
     :type initial_data: pd.DataFrame
     :param evaluation_name: The name of the evaluation.
     :type evaluation_name: Optional[str]
-    :param _run_name: The name of target run. Used for testing only.
-    :type _run_name: Optional[str]
     :return: The tuple, containing data frame and the list of added columns.
     :rtype: Tuple[pandas.DataFrame, List[str]]
     """
-    # We are manually creating the temporary directory for the flow
-    # because the way tempdir remove temporary directories will
-    # hang the debugger, because promptflow will keep flow directory.
-    run: Run = pf_client.run(
-        flow=target,
-        display_name=evaluation_name,
-        data=data,
-        properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
-        stream=True,
-        name=_run_name,
-    )
+    _run_name = kwargs.get("_run_name")
+    upload_target_snaphot = kwargs.get("_upload_target_snapshot", False)
+
+    try:
+        with TargetRunContext(upload_target_snaphot):
+            run: Run = pf_client.run(
+                flow=target,
+                display_name=evaluation_name,
+                data=data,
+                properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
+                stream=True,
+                name=_run_name,
+            )
+    except (UserAuthenticationError, UploadInternalError) as ex:
+        if "Failed to upload run" in ex.message:
+            msg = (
+                "Failed to upload the target run to the cloud. "
+                "This may be caused by insufficient permission to access storage or other errors."
+            )
+            raise EvaluationException(
+                message=msg,
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.FAILED_REMOTE_TRACKING,
+                blame=ErrorBlame.USER_ERROR,
+                tsg_link="https://aka.ms/azsdk/python/evaluation/remotetracking/troubleshoot",
+            ) from ex
+
+        raise ex
+
     target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
     # Remove input and output prefix
     generated_columns = {
@@ -456,15 +566,15 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
 # @log_evaluate_activity
 def evaluate(
     *,
-    data: str,
+    data: Union[str, os.PathLike],
     evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
-    output_path: Optional[str] = None,
+    output_path: Optional[Union[str, os.PathLike]] = None,
     **kwargs,
-):
+) -> EvaluationResult:
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
     data will be run through target function and then results will be evaluated.

@@ -489,7 +599,7 @@ def evaluate(
     :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
     :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
     :return: Evaluation results.
-    :rtype: dict
+    :rtype: ~azure.ai.evaluation.EvaluationResult

     :Example:

@@ -563,7 +673,17 @@ def evaluate(
                 internal_message=error_message,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.FAILED_EXECUTION,
-                blame=ErrorBlame.UNKNOWN,
+                blame=ErrorBlame.USER_ERROR,
+            ) from e
+
+        # Ensure a consistent user experience when encountering errors by converting
+        # all other exceptions to EvaluationException.
+        if not isinstance(e, EvaluationException):
+            raise EvaluationException(
+                message=str(e),
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.FAILED_EXECUTION,
+                blame=ErrorBlame.SYSTEM_ERROR,
             ) from e

         raise e
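
Following the signature and return-type changes above, evaluate is now annotated to return the EvaluationResult TypedDict (rows, metrics, studio_url, as assembled near the end of this file) rather than a bare dict. A hedged usage sketch; the data file name, the custom evaluator, and its "response" column are illustrative assumptions, not part of the package:

from azure.ai.evaluation import evaluate


# A custom evaluator is just a callable whose parameters map to data columns;
# "response" is an assumed column in the hypothetical JSONL file below.
def answer_length(*, response: str, **kwargs):
    return {"answer_length": len(response)}


result = evaluate(
    data="eval_data.jsonl",  # hypothetical file, one JSON record per line
    evaluators={"answer_length": answer_length},
)

# The returned EvaluationResult carries per-row results, aggregate metrics,
# and a studio URL (populated when results are logged to AI Studio via azure_ai_project).
print(result["metrics"])
print(len(result["rows"]))
print(result["studio_url"])
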
@@ -586,12 +706,12 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    data: str,
+    data: Union[str, os.PathLike],
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
-    output_path: Optional[str] = None,
+    output_path: Optional[Union[str, os.PathLike]] = None,
     **kwargs,
-) -> EvaluateResult:
+) -> EvaluationResult:
     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)

     # Process evaluator config to replace ${target.} with ${data.}
@@ -604,7 +724,9 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
             for evaluator_name, evaluator_configuration in evaluator_config.items()
         }
     )
-    _validate_columns(input_data_df, evaluators, target, column_mapping)
+
+    if target is not None:
+        _validate_columns_for_target(input_data_df, target)

     # Target Run
     try:
@@ -623,7 +745,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
             'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
         )

-        raise EvaluationException(
+        raise EvaluationException(  # pylint: disable=raise-missing-from
            message=msg,
            target=ErrorTarget.EVALUATE,
            category=ErrorCategory.MISSING_PACKAGE,
@@ -631,6 +753,11 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
         )

     trace_destination: Optional[str] = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
+
+    # Handle the case where the customer manually run "pf config set trace.destination=none"
+    if trace_destination and trace_destination.lower() == "none":
+        trace_destination = None
+
     target_run: Optional[Run] = None

     # Create default configuration for evaluators that directly maps
@@ -639,9 +766,10 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     column_mapping.setdefault("default", {})

     # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
+    target_generated_columns: Set[str] = set()
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-            target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
+            target, data, pf_client, input_data_df, evaluation_name, **kwargs
         )

         for evaluator_name, mapping in column_mapping.items():
@@ -656,9 +784,8 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
                 if col not in mapping and run_output not in mapped_to_values:
                     column_mapping[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup

-    # After we have generated all columns we can check if we have
-    # everything we need for evaluators.
-    _validate_columns(input_data_df, evaluators, target=None, column_mapping=column_mapping)
+    # After we have generated all columns, we can check if we have everything we need for evaluators.
+    _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)

     # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
     # via target mapping.
@@ -674,7 +801,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     def eval_batch_run(
         batch_run_client: TClient, *, data=Union[str, os.PathLike, pd.DataFrame]
     ) -> Dict[str, __EvaluatorInfo]:
-        with BatchRunContext(batch_run_client):
+        with EvalRunContext(batch_run_client):
             runs = {
                 evaluator_name: batch_run_client.run(
                     flow=evaluator,
@@ -688,7 +815,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
                 for evaluator_name, evaluator in evaluators.items()
             }

-            # get_details needs to be called within BatchRunContext scope in order to have user agent populated
+            # get_details needs to be called within EvalRunContext scope in order to have user agent populated
             return {
                 evaluator_name: {
                     "result": batch_run_client.get_details(run, all_results=True),
@@ -758,7 +885,8 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
         evaluation_name,
     )

-    result: EvaluateResult = {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url}
+    result_df_dict = result_df.to_dict("records")
+    result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore

     if output_path:
         _write_output(output_path, result)

azure/ai/evaluation/_evaluate/_telemetry/__init__.py

@@ -16,10 +16,10 @@ from promptflow.client import PFClient
 from promptflow.core import Prompty as prompty_core
 from typing_extensions import ParamSpec

-from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult

 from ..._user_agent import USER_AGENT
-from .._utils import EvaluateResult, _trace_destination_from_project_scope
+from .._utils import _trace_destination_from_project_scope


 LOGGER = logging.getLogger(__name__)
@@ -97,17 +97,17 @@ def _get_evaluator_properties(evaluator, evaluator_name):


 # cspell:ignore isna
-def log_evaluate_activity(func: Callable[P, EvaluateResult]) -> Callable[P, EvaluateResult]:
+def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
     """Decorator to log evaluate activity

     :param func: The function to be decorated
     :type func: Callable
     :returns: The decorated function
-    :rtype: Callable[P, EvaluateResult]
+    :rtype: Callable[P, EvaluationResult]
     """

     @functools.wraps(func)
-    def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluateResult:
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluationResult:
         from promptflow._sdk._telemetry import ActivityType, log_activity
         from promptflow._sdk._telemetry.telemetry import get_telemetry_logger