azure-ai-evaluation 0.0.0b0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (100)
  1. azure/ai/evaluation/__init__.py +60 -0
  2. azure/ai/evaluation/_common/__init__.py +16 -0
  3. azure/ai/evaluation/_common/constants.py +65 -0
  4. azure/ai/evaluation/_common/rai_service.py +452 -0
  5. azure/ai/evaluation/_common/utils.py +87 -0
  6. azure/ai/evaluation/_constants.py +50 -0
  7. azure/ai/evaluation/_evaluate/__init__.py +3 -0
  8. azure/ai/evaluation/_evaluate/_batch_run_client/__init__.py +8 -0
  9. azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +72 -0
  10. azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +150 -0
  11. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
  12. azure/ai/evaluation/_evaluate/_eval_run.py +494 -0
  13. azure/ai/evaluation/_evaluate/_evaluate.py +689 -0
  14. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +174 -0
  15. azure/ai/evaluation/_evaluate/_utils.py +237 -0
  16. azure/ai/evaluation/_evaluators/__init__.py +3 -0
  17. azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
  18. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +73 -0
  19. azure/ai/evaluation/_evaluators/_chat/__init__.py +9 -0
  20. azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
  21. azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +9 -0
  22. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
  23. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
  24. azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
  25. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +122 -0
  26. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +62 -0
  27. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +21 -0
  28. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +108 -0
  29. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +66 -0
  30. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
  31. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +78 -0
  32. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +76 -0
  33. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +76 -0
  34. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -0
  35. azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
  36. azure/ai/evaluation/_evaluators/_eci/_eci.py +99 -0
  37. azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
  38. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +141 -0
  39. azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
  40. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +122 -0
  41. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +61 -0
  42. azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
  43. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +71 -0
  44. azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
  45. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +123 -0
  46. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
  47. azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
  48. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +96 -0
  49. azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
  50. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -0
  51. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
  52. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
  53. azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
  54. azure/ai/evaluation/_evaluators/_qa/_qa.py +111 -0
  55. azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
  56. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +131 -0
  57. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +69 -0
  58. azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
  59. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
  60. azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
  61. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +130 -0
  62. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +71 -0
  63. azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
  64. azure/ai/evaluation/_evaluators/_xpia/xpia.py +140 -0
  65. azure/ai/evaluation/_exceptions.py +107 -0
  66. azure/ai/evaluation/_http_utils.py +395 -0
  67. azure/ai/evaluation/_model_configurations.py +27 -0
  68. azure/ai/evaluation/_user_agent.py +6 -0
  69. azure/ai/evaluation/_version.py +5 -0
  70. azure/ai/evaluation/py.typed +0 -0
  71. azure/ai/evaluation/simulator/__init__.py +15 -0
  72. azure/ai/evaluation/simulator/_adversarial_scenario.py +27 -0
  73. azure/ai/evaluation/simulator/_adversarial_simulator.py +450 -0
  74. azure/ai/evaluation/simulator/_constants.py +17 -0
  75. azure/ai/evaluation/simulator/_conversation/__init__.py +315 -0
  76. azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
  77. azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
  78. azure/ai/evaluation/simulator/_direct_attack_simulator.py +252 -0
  79. azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
  80. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
  81. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +93 -0
  82. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +207 -0
  83. azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
  84. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +147 -0
  85. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +228 -0
  86. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +157 -0
  87. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +157 -0
  88. azure/ai/evaluation/simulator/_model_tools/models.py +616 -0
  89. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +69 -0
  90. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +36 -0
  91. azure/ai/evaluation/simulator/_tracing.py +92 -0
  92. azure/ai/evaluation/simulator/_utils.py +111 -0
  93. azure/ai/evaluation/simulator/simulator.py +579 -0
  94. azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
  95. azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
  96. {azure_ai_evaluation-0.0.0b0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
  97. azure_ai_evaluation-1.0.0b1.dist-info/top_level.txt +1 -0
  98. azure_ai_evaluation-0.0.0b0.dist-info/METADATA +0 -7
  99. azure_ai_evaluation-0.0.0b0.dist-info/RECORD +0 -4
  100. azure_ai_evaluation-0.0.0b0.dist-info/top_level.txt +0 -1
azure/ai/evaluation/_evaluate/_evaluate.py
@@ -0,0 +1,689 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import inspect
+ import os
+ import re
+ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type
+
+ import numpy as np
+ import pandas as pd
+
+ from promptflow._sdk._constants import LINE_NUMBER
+ from promptflow.client import PFClient
+
+ from .._model_configurations import AzureAIProject
+ from .._constants import (
+     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
+     EvaluationMetrics,
+     Prefixes,
+     _InternalEvaluationMetrics,
+ )
+ from .._user_agent import USER_AGENT
+ from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
+ from ._telemetry import log_evaluate_activity
+ from ._utils import (
+     _apply_column_mapping,
+     _log_metrics_and_instance_results,
+     _trace_destination_from_project_scope,
+     _write_output,
+ )
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+
+
+ # pylint: disable=line-too-long
+ def _aggregate_content_safety_metrics(
+     df: pd.DataFrame, evaluators: Dict[str, Type]
+ ) -> Tuple[List[str], Dict[str, float]]:
+     """Find and aggregate defect rates for content safety metrics. Returns both a list
+     of columns that were used to calculate defect rates and the defect rates themselves.
+
+     :param df: The dataframe of evaluation results.
+     :type df: ~pandas.DataFrame
+     :param evaluators: A dictionary mapping of strings to evaluator classes. This is used to identify
+         content safety metrics, since they should start with a string that matches an evaluator name.
+     :type evaluators: Dict[str, type]
+     :return: A tuple; the first element is a list of dataframe columns that were used to calculate defect rates,
+         and the second element is a dictionary of defect column names and defect rates.
+     :rtype: Tuple[List[str], Dict[str, float]]
+     """
+     content_safety_metrics = [
+         EvaluationMetrics.SEXUAL,
+         EvaluationMetrics.SELF_HARM,
+         EvaluationMetrics.HATE_UNFAIRNESS,
+         EvaluationMetrics.VIOLENCE,
+     ]
+     content_safety_cols = []
+     for col in df.columns:
+         evaluator_name = col.split(".")[0]
+         metric_name = col.split(".")[1]
+         if evaluator_name in evaluators:
+             # Check the namespace of the evaluator
+             module = inspect.getmodule(evaluators[evaluator_name])
+             if (
+                 module
+                 and module.__name__.startswith("azure.ai.evaluation.")
+                 and metric_name.endswith("_score")
+                 and metric_name.replace("_score", "") in content_safety_metrics
+             ):
+                 content_safety_cols.append(col)
+
+     content_safety_df = df[content_safety_cols]
+     defect_rates = {}
+     for col in content_safety_df.columns:
+         defect_rate_name = col.replace("_score", "_defect_rate")
+         col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
+         defect_rates[defect_rate_name] = round(
+             np.sum(col_with_numeric_values >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT)
+             / col_with_numeric_values.count(),
+             2,
+         )
+     return content_safety_cols, defect_rates
+
+
+ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
+     """Find and aggregate defect rates for label-based metrics. Returns both a list
+     of columns that were used to calculate defect rates and the defect rates themselves.
+
+     :param df: The dataframe of evaluation results.
+     :type df: ~pandas.DataFrame
+     :return: A tuple; the first element is a list of dataframe columns that were used to calculate defect rates,
+         and the second element is a dictionary of defect column names and defect rates.
+     :rtype: Tuple[List[str], Dict[str, float]]
+     """
+     handled_metrics = [
+         EvaluationMetrics.PROTECTED_MATERIAL,
+         _InternalEvaluationMetrics.ECI,
+         EvaluationMetrics.XPIA,
+     ]
+     label_cols = []
+     for col in df.columns:
+         metric_name = col.split(".")[1]
+         if metric_name.endswith("_label") and metric_name.replace("_label", "").lower() in handled_metrics:
+             label_cols.append(col)
+
+     label_df = df[label_cols]
+     defect_rates = {}
+     for col in label_df.columns:
+         defect_rate_name = col.replace("_label", "_defect_rate")
+         col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
+         defect_rates[defect_rate_name] = round(
+             np.sum(col_with_boolean_values) / col_with_boolean_values.count(),
+             2,
+         )
+     return label_cols, defect_rates
+
+
+ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[str, float]:
+     """Aggregate metrics from the evaluation results.
+     On top of naively calculating the mean of most metrics, this function also identifies certain columns
+     that represent defect rates and renames them accordingly. Other columns in the dataframe are dropped.
+     EX: protected_material_label -> protected_material_defect_rate
+
+     :param df: The dataframe of evaluation results.
+     :type df: ~pandas.DataFrame
+     :param evaluators: A dictionary mapping of strings to evaluator classes.
+     :type evaluators: Dict[str, Type]
+     :return: The aggregated metrics.
+     :rtype: Dict[str, float]
+     """
+     df.rename(columns={col: col.replace("outputs.", "") for col in df.columns}, inplace=True)
+
+     handled_columns = []
+     defect_rates = {}
+     # Rename certain columns as defect rates if we know that's what their aggregates represent
+     # Content safety metrics
+     content_safety_cols, cs_defect_rates = _aggregate_content_safety_metrics(df, evaluators)
+     handled_columns.extend(content_safety_cols)
+     defect_rates.update(cs_defect_rates)
+     # Label-based (true/false) metrics where 'true' means 'something is wrong'
+     label_cols, label_defect_rates = _aggregate_label_defect_metrics(df)
+     handled_columns.extend(label_cols)
+     defect_rates.update(label_defect_rates)
+
+     # For rest of metrics, we will calculate mean
+     df.drop(columns=handled_columns, inplace=True)
+
+     mean_value = df.mean(numeric_only=True)
+     metrics = mean_value.to_dict()
+     # Add defect rates back into metrics
+     metrics.update(defect_rates)
+     return metrics
+
+
+ def _validate_input_data_for_evaluator(evaluator, evaluator_name, df_data, is_target_fn=False):
+     required_inputs = [
+         param.name
+         for param in inspect.signature(evaluator).parameters.values()
+         if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
+     ]
+
+     missing_inputs = [col for col in required_inputs if col not in df_data.columns]
+     if missing_inputs:
+         if not is_target_fn:
+             msg = f"Missing required inputs for evaluator {evaluator_name} : {missing_inputs}."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.EVALUATE,
+                 category=ErrorCategory.MISSING_FIELD,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+         msg = f"Missing required inputs for target : {missing_inputs}."
+         raise EvaluationException(
+             message=msg,
+             internal_message=msg,
+             target=ErrorTarget.EVALUATE,
+             category=ErrorCategory.MISSING_FIELD,
+             blame=ErrorBlame.USER_ERROR,
+         )
+
+
+ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name):
+     if data is None:
+         msg = "data parameter must be provided for evaluation."
+         raise EvaluationException(
+             message=msg,
+             internal_message=msg,
+             target=ErrorTarget.EVALUATE,
+             category=ErrorCategory.MISSING_FIELD,
+             blame=ErrorBlame.USER_ERROR,
+         )
+
+     if target is not None:
+         if not callable(target):
+             msg = "target parameter must be a callable function."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.EVALUATE,
+                 category=ErrorCategory.INVALID_VALUE,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+
+     if data is not None:
+         if not isinstance(data, str):
+             msg = "data parameter must be a string."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.EVALUATE,
+                 category=ErrorCategory.INVALID_VALUE,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+
+     if evaluators is not None:
+         if not isinstance(evaluators, dict):
+             msg = "evaluators parameter must be a dictionary."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.EVALUATE,
+                 category=ErrorCategory.INVALID_VALUE,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+
+     if output_path is not None:
+         if not isinstance(output_path, str):
+             msg = "output_path parameter must be a string."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.EVALUATE,
+                 category=ErrorCategory.INVALID_VALUE,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+
+     if azure_ai_project is not None:
+         if not isinstance(azure_ai_project, Dict):
+             msg = "azure_ai_project parameter must be a dictionary."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.EVALUATE,
+                 category=ErrorCategory.INVALID_VALUE,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+
+     if evaluation_name is not None:
+         if not isinstance(evaluation_name, str):
+             msg = "evaluation_name parameter must be a string."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.EVALUATE,
+                 category=ErrorCategory.INVALID_VALUE,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+
+     try:
+         initial_data_df = pd.read_json(data, lines=True)
+     except Exception as e:
+         raise EvaluationException(
+             message=f"Failed to load data from {data}. Confirm that it is valid jsonl data. Error: {str(e)}.",
+             internal_message="Failed to load data. Confirm that it is valid jsonl data.",
+             target=ErrorTarget.EVALUATE,
+             category=ErrorCategory.INVALID_VALUE,
+             blame=ErrorBlame.USER_ERROR,
+         ) from e
+
+     return initial_data_df
+
+
+ def _validate_columns(
+     df: pd.DataFrame,
+     evaluators: Dict[str, Any],
+     target: Optional[Callable],
+     evaluator_config: Dict[str, Dict[str, str]],
+ ) -> None:
+     """
+     Check that all columns needed by evaluator or target function are present.
+
+     :param df: The data frame to be validated.
+     :type df: pd.DataFrame
+     :param evaluators: The dictionary of evaluators.
+     :type evaluators: Dict[str, Any]
+     :param target: The callable to be applied to data set.
+     :type target: Optional[Callable]
+     :param evaluator_config: The configuration for evaluators.
+     :type evaluator_config: Dict[str, Dict[str, str]]
+     :raises EvaluationException: If column starts from "__outputs." while target is defined.
+     """
+     if target:
+         if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns):
+             msg = "The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.'
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.EVALUATE,
+                 category=ErrorCategory.INVALID_VALUE,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+         # If the target function is given, it may return
+         # several columns and hence we cannot check the availability of columns
+         # without knowing target function semantics.
+         # Instead, here we will validate the columns, taken by target.
+         _validate_input_data_for_evaluator(target, None, df, is_target_fn=True)
+     else:
+         for evaluator_name, evaluator in evaluators.items():
+             # Apply column mapping
+             mapping_config = evaluator_config.get(evaluator_name, evaluator_config.get("default", None))
+             new_df = _apply_column_mapping(df, mapping_config)
+
+             # Validate input data for evaluator
+             _validate_input_data_for_evaluator(evaluator, evaluator_name, new_df)
+
+
+ def _apply_target_to_data(
+     target: Callable,
+     data: str,
+     pf_client: PFClient,
+     initial_data: pd.DataFrame,
+     evaluation_name: Optional[str] = None,
+     _run_name: Optional[str] = None,
+ ) -> Tuple[pd.DataFrame, Set[str]]:
+     """
+     Apply the target function to the data set and return updated data and generated columns.
+
+     :param target: The function to be applied to data.
+     :type target: Callable
+     :param data: The path to input jsonl file.
+     :type data: str
+     :param pf_client: The promptflow client to be used.
+     :type pf_client: PFClient
+     :param initial_data: The data frame with the loaded data.
+     :type initial_data: pd.DataFrame
+     :param evaluation_name: The name of the evaluation.
+     :type evaluation_name: Optional[str]
+     :param _run_name: The name of target run. Used for testing only.
+     :type _run_name: Optional[str]
+     :return: The tuple, containing data frame and the list of added columns.
+     :rtype: Tuple[pandas.DataFrame, List[str]]
+     """
+     # We are manually creating the temporary directory for the flow
+     # because the way tempdir remove temporary directories will
+     # hang the debugger, because promptflow will keep flow directory.
+     run = pf_client.run(
+         flow=target,
+         display_name=evaluation_name,
+         data=data,
+         properties={"runType": "eval_run", "isEvaluatorRun": "true"},
+         stream=True,
+         name=_run_name,
+     )
+     target_output = pf_client.runs.get_details(run, all_results=True)
+     # Remove input and output prefix
+     generated_columns = {
+         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
+     }
+     # Sort output by line numbers
+     target_output.set_index(f"inputs.{LINE_NUMBER}", inplace=True)
+     target_output.sort_index(inplace=True)
+     target_output.reset_index(inplace=True, drop=False)
+     # target_output contains only input columns, taken by function,
+     # so we need to concatenate it to the input data frame.
+     drop_columns = list(filter(lambda x: x.startswith("inputs"), target_output.columns))
+     target_output.drop(drop_columns, inplace=True, axis=1)
+     # Rename outputs columns to __outputs
+     rename_dict = {col: col.replace(Prefixes.OUTPUTS, Prefixes.TSG_OUTPUTS) for col in target_output.columns}
+     target_output.rename(columns=rename_dict, inplace=True)
+     # Concatenate output to input
+     target_output = pd.concat([target_output, initial_data], axis=1)
+
+     return target_output, generated_columns, run
+
+
+ def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
+     """Process evaluator_config to replace ${target.} with ${data.}
+
+     :param evaluator_config: The configuration for evaluators.
+     :type evaluator_config: Dict[str, Dict[str, str]]
+     :return: The processed configuration.
+     :rtype: Dict[str, Dict[str, str]]
+     """
+
+     processed_config = {}
+
+     unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")
+
+     if evaluator_config:
+         for evaluator, mapping_config in evaluator_config.items():
+             if isinstance(mapping_config, dict):
+                 processed_config[evaluator] = {}
+
+                 for map_to_key, map_value in mapping_config.items():
+                     # Check if there's any unexpected reference other than ${target.} or ${data.}
+                     if unexpected_references.search(map_value):
+                         msg = "Unexpected references detected in 'evaluator_config'. Ensure only ${target.} and ${data.} are used."
+                         raise EvaluationException(
+                             message=msg,
+                             internal_message=msg,
+                             target=ErrorTarget.EVALUATE,
+                             category=ErrorCategory.INVALID_VALUE,
+                             blame=ErrorBlame.USER_ERROR,
+                         )
+
+                     # Replace ${target.} with ${run.outputs.}
+                     processed_config[evaluator][map_to_key] = map_value.replace("${target.", "${run.outputs.")
+
+     return processed_config
+
+
+ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
+     """
+     Change the column names for data frame. The change happens inplace.
+
+     The columns with _OUTPUTS prefix will not be changed. _OUTPUTS prefix will
+     will be added to columns in target_generated set. The rest columns will get
+     ".inputs" prefix.
+
+     :param df: The data frame to apply changes to.
+     :type df: pandas.DataFrame
+     :return: The changed data frame.
+     :rtype: pandas.DataFrame
+     """
+     rename_dict = {}
+     for col in df.columns:
+         # Rename columns generated by target.
+         if Prefixes.TSG_OUTPUTS in col:
+             rename_dict[col] = col.replace(Prefixes.TSG_OUTPUTS, Prefixes.OUTPUTS)
+         else:
+             rename_dict[col] = f"inputs.{col}"
+     df.rename(columns=rename_dict, inplace=True)
+     return df
+
+
+ # @log_evaluate_activity
+ def evaluate(
+     *,
+     evaluation_name: Optional[str] = None,
+     target: Optional[Callable] = None,
+     data: Optional[str] = None,
+     evaluators: Optional[Dict[str, Callable]] = None,
+     evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
+     azure_ai_project: Optional[AzureAIProject] = None,
+     output_path: Optional[str] = None,
+     **kwargs,
+ ):
+     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
+     data will be run through target function and then results will be evaluated.
+
+     :keyword evaluation_name: Display name of the evaluation.
+     :paramtype evaluation_name: Optional[str]
+     :keyword target: Target to be evaluated. `target` and `data` both cannot be None
+     :paramtype target: Optional[Callable]
+     :keyword data: Path to the data to be evaluated or passed to target if target is set.
+         Only .jsonl format files are supported. `target` and `data` both cannot be None
+     :paramtype data: Optional[str]
+     :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
+         and value as the evaluator function.
+     :paramtype evaluators: Optional[Dict[str, Callable]
+     :keyword evaluator_config: Configuration for evaluators. The configuration should be a dictionary with evaluator
+         names as keys and a dictionary of column mappings as values. The column mappings should be a dictionary with
+         keys as the column names in the evaluator input and values as the column names in the input data or data
+         generated by target.
+     :paramtype evaluator_config: Optional[Dict[str, Dict[str, str]]
+     :keyword output_path: The local folder or file path to save evaluation results to if set. If folder path is provided
+         the results will be saved to a file named `evaluation_results.json` in the folder.
+     :paramtype output_path: Optional[str]
+     :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
+     :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
+     :return: Evaluation results.
+     :rtype: dict
+
+     :Example:
+
+     Evaluate API can be used as follows:
+
+     .. code-block:: python
+
+         from azure.ai.evaluation import evaluate, RelevanceEvaluator, CoherenceEvaluator
+
+
+         model_config = {
+             "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
+             "api_key": os.environ.get("AZURE_OPENAI_KEY"),
+             "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT")
+         }
+
+         coherence_eval = CoherenceEvaluator(model_config=model_config)
+         relevance_eval = RelevanceEvaluator(model_config=model_config)
+
+         path = "evaluate_test_data.jsonl"
+         result = evaluate(
+             data=path,
+             evaluators={
+                 "coherence": coherence_eval,
+                 "relevance": relevance_eval,
+             },
+             evaluator_config={
+                 "coherence": {
+                     "response": "${data.response}",
+                     "query": "${data.query}"
+                 },
+                 "relevance": {
+                     "response": "${data.response}",
+                     "context": "${data.context}",
+                     "query": "${data.query}"
+                 }
+             }
+         )
+
+     """
+     try:
+         return _evaluate(
+             evaluation_name=evaluation_name,
+             target=target,
+             data=data,
+             evaluators=evaluators,
+             evaluator_config=evaluator_config,
+             azure_ai_project=azure_ai_project,
+             output_path=output_path,
+             **kwargs,
+         )
+     except Exception as e:
+         # Handle multiprocess bootstrap error
+         bootstrap_error = (
+             "An attempt has been made to start a new process before the\n        "
+             "current process has finished its bootstrapping phase."
+         )
+         if bootstrap_error in str(e):
+             error_message = (
+                 "The evaluation failed due to an error during multiprocess bootstrapping."
+                 "Please ensure the evaluate API is properly guarded with the '__main__' block:\n\n"
+                 "    if __name__ == '__main__':\n"
+                 "        evaluate(...)"
+             )
+             raise EvaluationException(
+                 message=error_message,
+                 internal_message=error_message,
+                 target=ErrorTarget.EVALUATE,
+                 category=ErrorCategory.FAILED_EXECUTION,
+                 blame=ErrorBlame.UNKNOWN,
+             ) from e
+
+         raise e
+
+
+ def _evaluate(  # pylint: disable=too-many-locals
+     *,
+     evaluation_name: Optional[str] = None,
+     target: Optional[Callable] = None,
+     data: Optional[str] = None,
+     evaluators: Optional[Dict[str, Callable]] = None,
+     evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
+     azure_ai_project: Optional[AzureAIProject] = None,
+     output_path: Optional[str] = None,
+     **kwargs,
+ ):
+     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)
+
+     # Process evaluator config to replace ${target.} with ${data.}
+     if evaluator_config is None:
+         evaluator_config = {}
+     evaluator_config = _process_evaluator_config(evaluator_config)
+     _validate_columns(input_data_df, evaluators, target, evaluator_config)
+
+     # Target Run
+     pf_client = PFClient(
+         config=(
+             {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)} if azure_ai_project else None
+         ),
+         user_agent=USER_AGENT,
+     )
+
+     trace_destination = pf_client._config.get_trace_destination()
+
+     target_run = None
+
+     target_generated_columns = set()
+     if data is not None and target is not None:
+         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
+             target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
+         )
+
+         # Make sure, the default is always in the configuration.
+         if not evaluator_config:
+             evaluator_config = {}
+         if "default" not in evaluator_config:
+             evaluator_config["default"] = {}
+
+         for evaluator_name, mapping in evaluator_config.items():
+             mapped_to_values = set(mapping.values())
+             for col in target_generated_columns:
+                 # If user defined mapping differently, do not change it.
+                 # If it was mapped to target, we have already changed it
+                 # in _process_evaluator_config
+                 run_output = f"${{run.outputs.{col}}}"
+                 # We will add our mapping only if
+                 # customer did not mapped target output.
+                 if col not in mapping and run_output not in mapped_to_values:
+                     evaluator_config[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup
+
+         # After we have generated all columns we can check if we have
+         # everything we need for evaluators.
+         _validate_columns(input_data_df, evaluators, target=None, evaluator_config=evaluator_config)
+
+     # Batch Run
+     evaluators_info = {}
+     use_pf_client = kwargs.get("_use_pf_client", True)
+     if use_pf_client:
+         batch_run_client = ProxyClient(pf_client)
+
+         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
+         # multiple evaluators. If the path is already absolute, abspath will return the original path.
+         data = os.path.abspath(data)
+     else:
+         batch_run_client = CodeClient()
+         data = input_data_df
+
+     with BatchRunContext(batch_run_client):
+         for evaluator_name, evaluator in evaluators.items():
+             evaluators_info[evaluator_name] = {}
+             evaluators_info[evaluator_name]["run"] = batch_run_client.run(
+                 flow=evaluator,
+                 run=target_run,
+                 evaluator_name=evaluator_name,
+                 column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
+                 data=data,
+                 stream=True,
+                 name=kwargs.get("_run_name"),
+             )
+
+         # get_details needs to be called within BatchRunContext scope in order to have user agent populated
+         for evaluator_name, evaluator_info in evaluators_info.items():
+             evaluator_info["result"] = batch_run_client.get_details(evaluator_info["run"], all_results=True)
+             evaluator_info["metrics"] = batch_run_client.get_metrics(evaluator_info["run"])
+
+     # Concatenate all results
+     evaluators_result_df = None
+     evaluators_metric = {}
+     for evaluator_name, evaluator_info in evaluators_info.items():
+         evaluator_result_df = evaluator_info["result"]
+
+         # drop input columns
+         evaluator_result_df = evaluator_result_df.drop(
+             columns=[col for col in evaluator_result_df.columns if str(col).startswith(Prefixes.INPUTS)]
+         )
+
+         # rename output columns
+         # Assuming after removing inputs columns, all columns are output columns
+         evaluator_result_df.rename(
+             columns={
+                 col: f"outputs.{evaluator_name}.{str(col).replace(Prefixes.OUTPUTS, '')}"
+                 for col in evaluator_result_df.columns
+             },
+             inplace=True,
+         )
+
+         evaluators_result_df = (
+             pd.concat([evaluators_result_df, evaluator_result_df], axis=1, verify_integrity=True)
+             if evaluators_result_df is not None
+             else evaluator_result_df
+         )
+
+         evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_info["metrics"].items()})
+
+     # Rename columns, generated by target function to outputs instead of inputs.
+     # If target generates columns, already present in the input data, these columns
+     # will be marked as outputs already so we do not need to rename them.
+     input_data_df = _rename_columns_conditionally(input_data_df)
+
+     result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
+     metrics = _aggregate_metrics(evaluators_result_df, evaluators)
+     metrics.update(evaluators_metric)
+
+     studio_url = _log_metrics_and_instance_results(
+         metrics,
+         result_df,
+         trace_destination,
+         target_run,
+         evaluation_name,
+     )
+
+     result = {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url}
+
+     if output_path:
+         _write_output(output_path, result)
+
+     return result
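
To illustrate how the new `evaluate` entry point shown above ties the pieces together, here is a minimal caller sketch. The data file name, the `my_target` and `answer_length` callables, and the column names are hypothetical; the `evaluate` keyword arguments, the `${data.}`/`${target.}` mapping syntax (rewritten to `${run.outputs.}` by `_process_evaluator_config`), the `if __name__ == "__main__"` guard recommended by the bootstrap-error handler, and the `CoherenceEvaluator` import all follow the code in the hunk.

    import os

    from azure.ai.evaluation import evaluate, CoherenceEvaluator


    def answer_length(*, response: str, **kwargs):
        # Custom evaluators are plain callables; their required parameters are
        # matched by name against data columns or target outputs.
        return {"length": len(response)}


    def my_target(*, query: str, **kwargs):
        # Hypothetical target: in practice this would call your app or model.
        return {"response": f"echo: {query}"}


    if __name__ == "__main__":  # guard against the multiprocess bootstrap error handled in evaluate()
        model_config = {
            "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
            "api_key": os.environ.get("AZURE_OPENAI_KEY"),
            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
        }

        result = evaluate(
            data="evaluate_test_data.jsonl",  # hypothetical .jsonl file with a "query" column
            target=my_target,                 # rows are first run through the target
            evaluators={
                "coherence": CoherenceEvaluator(model_config=model_config),
                "answer_length": answer_length,
            },
            evaluator_config={
                # ${data.} reads from the input file; ${target.} maps to the target's outputs
                "coherence": {"query": "${data.query}", "response": "${target.response}"},
                "answer_length": {"response": "${target.response}"},
            },
            output_path="./eval_results.json",  # optional: also write results to disk
        )
        print(result["metrics"])

The returned dictionary contains the per-row results under "rows", aggregated metrics under "metrics", and a "studio_url" that is populated when `azure_ai_project` is supplied.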