azure-ai-evaluation 0.0.0b0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. azure/ai/evaluation/__init__.py +82 -0
  2. azure/ai/evaluation/_common/__init__.py +16 -0
  3. azure/ai/evaluation/_common/_experimental.py +172 -0
  4. azure/ai/evaluation/_common/constants.py +72 -0
  5. azure/ai/evaluation/_common/math.py +89 -0
  6. azure/ai/evaluation/_common/rai_service.py +632 -0
  7. azure/ai/evaluation/_common/utils.py +445 -0
  8. azure/ai/evaluation/_constants.py +72 -0
  9. azure/ai/evaluation/_evaluate/__init__.py +3 -0
  10. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +9 -0
  11. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +188 -0
  12. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +89 -0
  13. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +99 -0
  14. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
  15. azure/ai/evaluation/_evaluate/_eval_run.py +571 -0
  16. azure/ai/evaluation/_evaluate/_evaluate.py +850 -0
  17. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +179 -0
  18. azure/ai/evaluation/_evaluate/_utils.py +298 -0
  19. azure/ai/evaluation/_evaluators/__init__.py +3 -0
  20. azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
  21. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +72 -0
  22. azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
  23. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +107 -0
  24. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +99 -0
  25. azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
  26. azure/ai/evaluation/_evaluators/_common/_base_eval.py +344 -0
  27. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +88 -0
  28. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +133 -0
  29. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +17 -0
  30. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -0
  31. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +129 -0
  32. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -0
  33. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +125 -0
  34. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +126 -0
  35. azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
  36. azure/ai/evaluation/_evaluators/_eci/_eci.py +89 -0
  37. azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
  38. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +157 -0
  39. azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
  40. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +104 -0
  41. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +86 -0
  42. azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
  43. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +69 -0
  44. azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
  45. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +144 -0
  46. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
  47. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
  48. azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
  49. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +90 -0
  50. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
  51. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
  52. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
  53. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
  54. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
  55. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
  56. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
  57. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
  58. azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
  59. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +113 -0
  60. azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
  61. azure/ai/evaluation/_evaluators/_qa/_qa.py +93 -0
  62. azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
  63. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +114 -0
  64. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +100 -0
  65. azure/ai/evaluation/_evaluators/_retrieval/__init__.py +9 -0
  66. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +112 -0
  67. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  68. azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
  69. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
  70. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  71. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
  72. azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
  73. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +140 -0
  74. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +66 -0
  75. azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
  76. azure/ai/evaluation/_evaluators/_xpia/xpia.py +125 -0
  77. azure/ai/evaluation/_exceptions.py +128 -0
  78. azure/ai/evaluation/_http_utils.py +466 -0
  79. azure/ai/evaluation/_model_configurations.py +123 -0
  80. azure/ai/evaluation/_user_agent.py +6 -0
  81. azure/ai/evaluation/_vendor/__init__.py +3 -0
  82. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  83. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
  84. azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
  85. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
  86. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  87. azure/ai/evaluation/_version.py +5 -0
  88. azure/ai/evaluation/py.typed +0 -0
  89. azure/ai/evaluation/simulator/__init__.py +16 -0
  90. azure/ai/evaluation/simulator/_adversarial_scenario.py +46 -0
  91. azure/ai/evaluation/simulator/_adversarial_simulator.py +471 -0
  92. azure/ai/evaluation/simulator/_constants.py +27 -0
  93. azure/ai/evaluation/simulator/_conversation/__init__.py +316 -0
  94. azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
  95. azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
  96. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  97. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  98. azure/ai/evaluation/simulator/_direct_attack_simulator.py +218 -0
  99. azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
  100. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
  101. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +96 -0
  102. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +220 -0
  103. azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
  104. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +195 -0
  105. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +244 -0
  106. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +168 -0
  107. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +201 -0
  108. azure/ai/evaluation/simulator/_model_tools/models.py +614 -0
  109. azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
  110. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +65 -0
  111. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +37 -0
  112. azure/ai/evaluation/simulator/_simulator.py +716 -0
  113. azure/ai/evaluation/simulator/_tracing.py +89 -0
  114. azure/ai/evaluation/simulator/_utils.py +132 -0
  115. azure_ai_evaluation-1.0.0.dist-info/METADATA +595 -0
  116. azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +70 -0
  117. azure_ai_evaluation-1.0.0.dist-info/RECORD +119 -0
  118. {azure_ai_evaluation-0.0.0b0.dist-info → azure_ai_evaluation-1.0.0.dist-info}/WHEEL +1 -1
  119. azure_ai_evaluation-1.0.0.dist-info/top_level.txt +1 -0
  120. azure_ai_evaluation-0.0.0b0.dist-info/METADATA +0 -7
  121. azure_ai_evaluation-0.0.0b0.dist-info/RECORD +0 -4
  122. azure_ai_evaluation-0.0.0b0.dist-info/top_level.txt +0 -1
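The largest single addition is `azure/ai/evaluation/_evaluate/_evaluate.py` (entry 16, +850 lines), reproduced in full in the hunk below. That module loads the `data` argument with `pd.read_json(data, lines=True)`, and its `evaluate` docstring states that only `.jsonl` files are supported, so evaluation input must be JSON Lines. A minimal sketch of producing such a file; the `query` and `ground_truth` field names are illustrative only, not required by the package:

```python
import json

# Illustrative rows only; evaluate() maps fields to evaluator inputs via
# column_mapping entries such as "${data.query}".
rows = [
    {"query": "What is the capital of France?", "ground_truth": "Paris"},
    {"query": "Who wrote 1984?", "ground_truth": "George Orwell"},
]

with open("data.jsonl", "w", encoding="utf-8") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")  # one JSON object per line (JSONL)
```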
azure/ai/evaluation/_evaluate/_evaluate.py
@@ -0,0 +1,850 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+import inspect
+import json
+import logging
+import os
+import re
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
+
+import pandas as pd
+from promptflow._sdk._constants import LINE_NUMBER
+from promptflow._sdk._errors import UserAuthenticationError, UploadInternalError
+from promptflow.client import PFClient
+from promptflow.entities import Run
+
+from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
+from azure.ai.evaluation._common.utils import validate_azure_ai_project
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+
+from .._constants import (
+    CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
+    EvaluationMetrics,
+    EvaluationRunProperties,
+    Prefixes,
+    _InternalEvaluationMetrics,
+)
+from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
+from .._user_agent import USER_AGENT
+from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext
+from ._utils import (
+    _apply_column_mapping,
+    _log_metrics_and_instance_results,
+    _trace_destination_from_project_scope,
+    _write_output,
+)
+
+TClient = TypeVar("TClient", ProxyClient, CodeClient)
+LOGGER = logging.getLogger(__name__)
+
+# For metrics (aggregates) whose metric names intentionally differ from their
+# originating column name, usually because the aggregation of the original value
+# means something sufficiently different.
+# Note that content safety metrics are handled separately.
+METRIC_COLUMN_NAME_REPLACEMENTS = {
+    "groundedness_pro_label": "groundedness_pro_passing_rate",
+}
+
+
+class __EvaluatorInfo(TypedDict):
+    result: pd.DataFrame
+    metrics: Dict[str, Any]
+    run_summary: Dict[str, Any]
+
+
+def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
+    """Identify and average various metrics that need to have the metric name be replaced,
+    instead of having the metric match the originating column name.
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :return: A tuple; the first element is a list of dataframe columns that were aggregated,
+        and the second element is a dictionary of resultant new metric column names and their values.
+    :rtype: Tuple[List[str], Dict[str, float]]
+    """
+    renamed_cols = []
+    metric_columns = {}
+    for col in df.columns:
+        metric_prefix = col.split(".")[0]
+        metric_name = col.split(".")[1]
+        if metric_name in METRIC_COLUMN_NAME_REPLACEMENTS:
+            renamed_cols.append(col)
+            new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
+            col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
+            try:
+                metric_columns[new_col_name] = round(list_mean_nan_safe(col_with_numeric_values), 2)
+            except EvaluationException:  # the only possible cause of this exception is all NaN values
+                msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+                LOGGER.warning(msg)
+
+    return renamed_cols, metric_columns
+
+
+# pylint: disable=line-too-long
+def _aggregate_content_safety_metrics(
+    df: pd.DataFrame, evaluators: Dict[str, Callable]
+) -> Tuple[List[str], Dict[str, float]]:
+    """Find and aggregate defect rates for content safety metrics. Returns both a list
+    of columns that were used to calculate defect rates and the defect rates themselves.
+
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :param evaluators: A dictionary mapping of strings to evaluator classes. This is used to identify
+        content safety metrics, since they should start with a string that matches an evaluator name.
+    :type evaluators: Dict[str, type]
+    :return: A tuple; the first element is a list of dataframe columns that were used to calculate defect rates,
+        and the second element is a dictionary of defect column names and defect rates.
+    :rtype: Tuple[List[str], Dict[str, float]]
+    """
+    content_safety_metrics = [
+        EvaluationMetrics.SEXUAL,
+        EvaluationMetrics.SELF_HARM,
+        EvaluationMetrics.HATE_UNFAIRNESS,
+        EvaluationMetrics.VIOLENCE,
+    ]
+    content_safety_cols = []
+    for col in df.columns:
+        evaluator_name = col.split(".")[0]
+        metric_name = col.split(".")[1]
+        if evaluator_name in evaluators:
+            # Check the namespace of the evaluator
+            module = inspect.getmodule(evaluators[evaluator_name])
+            if (
+                module
+                and module.__name__.startswith("azure.ai.evaluation.")
+                and metric_name.endswith("_score")
+                and metric_name.replace("_score", "") in content_safety_metrics
+            ):
+                content_safety_cols.append(col)
+
+    content_safety_df = df[content_safety_cols]
+    defect_rates = {}
+    for col in content_safety_df.columns:
+        defect_rate_name = col.replace("_score", "_defect_rate")
+        col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
+        try:
+            col_with_boolean_values = apply_transform_nan_safe(
+                col_with_numeric_values, lambda x: 1 if x >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT else 0
+            )
+            defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
+        except EvaluationException:  # the only possible cause of this exception is all NaN values
+            msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+            LOGGER.warning(msg)
+
+    return content_safety_cols, defect_rates
+
+
+def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
+    """Find and aggregate defect rates for label-based metrics. Returns both a list
+    of columns that were used to calculate defect rates and the defect rates themselves.
+
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :return: A tuple; the first element is a list of dataframe columns that were used to calculate defect rates,
+        and the second element is a dictionary of defect column names and defect rates.
+    :rtype: Tuple[List[str], Dict[str, float]]
+    """
+    handled_metrics = [
+        EvaluationMetrics.PROTECTED_MATERIAL,
+        _InternalEvaluationMetrics.ECI,
+        EvaluationMetrics.XPIA,
+    ]
+    label_cols = []
+    for col in df.columns:
+        metric_name = col.split(".")[1]
+        if metric_name.endswith("_label") and metric_name.replace("_label", "").lower() in handled_metrics:
+            label_cols.append(col)
+
+    label_df = df[label_cols]
+    defect_rates = {}
+    for col in label_df.columns:
+        defect_rate_name = col.replace("_label", "_defect_rate")
+        col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
+        try:
+            defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
+        except EvaluationException:  # the only possible cause of this exception is all NaN values
+            msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+            LOGGER.warning(msg)
+    return label_cols, defect_rates
+
+
+def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
+    """Aggregate metrics from the evaluation results.
+    On top of naively calculating the mean of most metrics, this function also identifies certain columns
+    that represent defect rates and renames them accordingly. Other columns in the dataframe are dropped.
+    EX: protected_material_label -> protected_material_defect_rate
+
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :param evaluators: A dictionary mapping of strings to evaluator classes.
+    :type evaluators: Dict[str, Callable]
+    :return: The aggregated metrics.
+    :rtype: Dict[str, float]
+    """
+    df.rename(columns={col: col.replace("outputs.", "") for col in df.columns}, inplace=True)
+
+    handled_columns = []
+    defect_rates = {}
+    # Rename certain columns as defect rates if we know that's what their aggregates represent
+    # Content safety metrics
+    content_safety_cols, cs_defect_rates = _aggregate_content_safety_metrics(df, evaluators)
+    other_renamed_cols, renamed_cols = _aggregate_other_metrics(df)
+    handled_columns.extend(content_safety_cols)
+    handled_columns.extend(other_renamed_cols)
+    defect_rates.update(cs_defect_rates)
+    defect_rates.update(renamed_cols)
+    # Label-based (true/false) metrics where 'true' means 'something is wrong'
+    label_cols, label_defect_rates = _aggregate_label_defect_metrics(df)
+    handled_columns.extend(label_cols)
+    defect_rates.update(label_defect_rates)
+
+    # For the rest of the metrics, we will calculate the mean
+    df.drop(columns=handled_columns, inplace=True)
+
+    # NOTE: nan/None values don't count as booleans, so boolean columns with
+    # nan/None values won't have a mean produced from them.
+    # This is different from label-based known evaluators, which have special handling.
+    mean_value = df.mean(numeric_only=True)
+    metrics = mean_value.to_dict()
+    # Add defect rates back into metrics
+    metrics.update(defect_rates)
+    return metrics
+
+
+def _validate_columns_for_target(
+    df: pd.DataFrame,
+    target: Callable,
+) -> None:
+    """
+    Check that all columns needed by target function are present.
+
+    :param df: The data frame to be validated.
+    :type df: pd.DataFrame
+    :param target: The callable to be applied to data set.
+    :type target: Optional[Callable]
+    :raises EvaluationException: If the column starts with "__outputs." or if the input data contains missing fields.
+    """
+    if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns):
+        msg = "The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.'
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    # If the target function is given, it may return
+    # several columns and hence we cannot check the availability of columns
+    # without knowing target function semantics.
+    # Instead, here we will validate the columns, taken by target.
+    required_inputs = [
+        param.name
+        for param in inspect.signature(target).parameters.values()
+        if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
+    ]
+
+    missing_inputs = [col for col in required_inputs if col not in df.columns]
+    if missing_inputs:
+        msg = f"Missing required inputs for target: {missing_inputs}."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.MISSING_FIELD,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+
+def _validate_columns_for_evaluators(
+    df: pd.DataFrame,
+    evaluators: Dict[str, Callable],
+    target: Optional[Callable],
+    target_generated_columns: Optional[Set[str]],
+    column_mapping: Dict[str, Dict[str, str]],
+) -> None:
+    """
+    Check that all columns needed by evaluators are present.
+
+    :param df: The data frame to be validated.
+    :type df: pd.DataFrame
+    :param evaluators: The dictionary of evaluators.
+    :type evaluators: Dict[str, Callable]
+    :param target: The callable to be applied to data set.
+    :type target: Optional[Callable]
+    :param target_generated_columns: The set of columns generated by the target callable.
+    :type target_generated_columns: Optional[Set[str]]
+    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping.
+    :type column_mapping: Dict[str, Dict[str, str]]
+    :raises EvaluationException: If data is missing required inputs or if the target callable did not generate the necessary columns.
+    """
+    missing_inputs_per_evaluator = {}
+
+    for evaluator_name, evaluator in evaluators.items():
+        # Apply column mapping
+        mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
+        new_df = _apply_column_mapping(df, mapping_config)
+
+        # Validate input data for evaluator
+        is_built_in = evaluator.__module__.startswith("azure.ai.evaluation")
+        if is_built_in:
+            # Note that for built-in evaluators supporting the "conversation" parameter,
+            # input parameters are now optional.
+            evaluator_params = [
+                param.name
+                for param in inspect.signature(evaluator).parameters.values()
+                if param.name not in ["kwargs", "args", "self"]
+            ]
+
+            if "conversation" in evaluator_params and "conversation" in new_df.columns:
+                # Ignore the missing fields if "conversation" is present in the input data
+                missing_inputs = []
+            else:
+                optional_params = (
+                    evaluator._OPTIONAL_PARAMS  # pylint: disable=protected-access
+                    if hasattr(evaluator, "_OPTIONAL_PARAMS")
+                    else []
+                )
+                excluded_params = set(new_df.columns).union(optional_params)
+                missing_inputs = [col for col in evaluator_params if col not in excluded_params]
+
+                # If "conversation" is the only parameter and it is missing, keep it in the missing inputs
+                # Otherwise, remove it from the missing inputs
+                if "conversation" in missing_inputs:
+                    if not (evaluator_params == ["conversation"] and missing_inputs == ["conversation"]):
+                        missing_inputs.remove("conversation")
+        else:
+            evaluator_params = [
+                param.name
+                for param in inspect.signature(evaluator).parameters.values()
+                if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
+            ]
+
+            missing_inputs = [col for col in evaluator_params if col not in new_df.columns]
+
+        if missing_inputs:
+            missing_inputs_per_evaluator[evaluator_name] = missing_inputs
+
+    if missing_inputs_per_evaluator:
+        msg = "Some evaluators are missing required inputs:\n"
+        for evaluator_name, missing in missing_inputs_per_evaluator.items():
+            msg += f"- {evaluator_name}: {missing}\n"
+
+        # Add the additional notes
+        msg += "\nTo resolve this issue:\n"
+        msg += "- Ensure the data contains required inputs.\n"
+        if target is not None:
+            msg += "- Verify that the target is generating the necessary columns for the evaluators. "
+            msg += f"Currently generated columns: {target_generated_columns} \n"
+        msg += "- Check that the column mapping is correctly configured."
+
+        raise EvaluationException(
+            message=msg.strip(),
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.MISSING_FIELD,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+
+def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name):
+    if data is None:
+        msg = "The 'data' parameter is required for evaluation."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not isinstance(data, (os.PathLike, str)):
+        msg = "The 'data' parameter must be a string or a path-like object."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not os.path.exists(data):
+        msg = f"The input data file path '{data}' does not exist."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    if target is not None:
+        if not callable(target):
+            msg = "The 'target' parameter must be a callable function."
+            raise EvaluationException(
+                message=msg,
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.INVALID_VALUE,
+                blame=ErrorBlame.USER_ERROR,
+            )
+
+    if not evaluators:
+        msg = "The 'evaluators' parameter is required and cannot be None or empty."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not isinstance(evaluators, dict):
+        msg = "The 'evaluators' parameter must be a dictionary."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    if output_path is not None:
+        if not isinstance(output_path, (os.PathLike, str)):
+            msg = "The 'output_path' parameter must be a string or a path-like object."
+            raise EvaluationException(
+                message=msg,
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.INVALID_VALUE,
+                blame=ErrorBlame.USER_ERROR,
+            )
+
+        output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
+        if output_dir and not os.path.exists(output_dir):
+            msg = f"The output directory '{output_dir}' does not exist. Please create the directory manually."
+            raise EvaluationException(
+                message=msg,
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.INVALID_VALUE,
+                blame=ErrorBlame.USER_ERROR,
+            )
+
+    if azure_ai_project is not None:
+        validate_azure_ai_project(azure_ai_project)
+
+    if evaluation_name is not None:
+        if not isinstance(evaluation_name, str) or not evaluation_name.strip():
+            msg = "The 'evaluation_name' parameter must be a non-empty string."
+            raise EvaluationException(
+                message=msg,
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.INVALID_VALUE,
+                blame=ErrorBlame.USER_ERROR,
+            )
+
+    try:
+        initial_data_df = pd.read_json(data, lines=True)
+    except Exception as e:
+        raise EvaluationException(
+            message=f"Unable to load data from '{data}'. Please ensure the input is valid JSONL format. Detailed error: {e}.",
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        ) from e
+
+    return initial_data_df
+
+
+def _apply_target_to_data(
+    target: Callable,
+    data: Union[str, os.PathLike],
+    pf_client: PFClient,
+    initial_data: pd.DataFrame,
+    evaluation_name: Optional[str] = None,
+    **kwargs,
+) -> Tuple[pd.DataFrame, Set[str], Run]:
+    """
+    Apply the target function to the data set and return updated data and generated columns.
+
+    :param target: The function to be applied to data.
+    :type target: Callable
+    :param data: The path to input jsonl file.
+    :type data: Union[str, os.PathLike]
+    :param pf_client: The promptflow client to be used.
+    :type pf_client: PFClient
+    :param initial_data: The data frame with the loaded data.
+    :type initial_data: pd.DataFrame
+    :param evaluation_name: The name of the evaluation.
+    :type evaluation_name: Optional[str]
+    :return: The tuple containing the updated data frame, the set of generated columns, and the target run.
+    :rtype: Tuple[pandas.DataFrame, Set[str], Run]
+    """
+    _run_name = kwargs.get("_run_name")
+    upload_target_snaphot = kwargs.get("_upload_target_snapshot", False)
+
+    try:
+        with TargetRunContext(upload_target_snaphot):
+            run: Run = pf_client.run(
+                flow=target,
+                display_name=evaluation_name,
+                data=data,
+                properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
+                stream=True,
+                name=_run_name,
+            )
+    except (UserAuthenticationError, UploadInternalError) as ex:
+        if "Failed to upload run" in ex.message:
+            msg = (
+                "Failed to upload the target run to the cloud. "
+                "This may be caused by insufficient permission to access storage or other errors."
+            )
+            raise EvaluationException(
+                message=msg,
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.FAILED_REMOTE_TRACKING,
+                blame=ErrorBlame.USER_ERROR,
+                tsg_link="https://aka.ms/azsdk/python/evaluation/remotetracking/troubleshoot",
+            ) from ex
+
+        raise ex
+
+    target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
+    # Remove input and output prefix
+    generated_columns = {
+        col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
+    }
+    # Sort output by line numbers
+    target_output.set_index(f"inputs.{LINE_NUMBER}", inplace=True)
+    target_output.sort_index(inplace=True)
+    target_output.reset_index(inplace=True, drop=False)
+    # target_output contains only input columns, taken by function,
+    # so we need to concatenate it to the input data frame.
+    drop_columns = list(filter(lambda x: x.startswith("inputs"), target_output.columns))
+    target_output.drop(drop_columns, inplace=True, axis=1)
+    # Rename outputs columns to __outputs
+    rename_dict = {col: col.replace(Prefixes.OUTPUTS, Prefixes.TSG_OUTPUTS) for col in target_output.columns}
+    target_output.rename(columns=rename_dict, inplace=True)
+    # Concatenate output to input
+    target_output = pd.concat([target_output, initial_data], axis=1)
+
+    return target_output, generated_columns, run
+
+
+def _process_column_mappings(
+    column_mapping: Dict[str, Optional[Dict[str, str]]],
+) -> Dict[str, Dict[str, str]]:
+    """Process column_mapping to replace ${target.} with ${data.}
+
+    :param column_mapping: The configuration for evaluators.
+    :type column_mapping: Dict[str, Optional[Dict[str, str]]]
+    :return: The processed configuration.
+    :rtype: Dict[str, Dict[str, str]]
+    """
+
+    processed_config: Dict[str, Dict[str, str]] = {}
+
+    unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")
+
+    if column_mapping:
+        for evaluator, mapping_config in column_mapping.items():
+            if isinstance(mapping_config, dict):
+                processed_config[evaluator] = {}
+
+                for map_to_key, map_value in mapping_config.items():
+                    # Check if there's any unexpected reference other than ${target.} or ${data.}
+                    if unexpected_references.search(map_value):
+                        msg = "Unexpected references detected in 'column_mapping'. Ensure only ${target.} and ${data.} are used."
+                        raise EvaluationException(
+                            message=msg,
+                            internal_message=msg,
+                            target=ErrorTarget.EVALUATE,
+                            category=ErrorCategory.INVALID_VALUE,
+                            blame=ErrorBlame.USER_ERROR,
+                        )
+
+                    # Replace ${target.} with ${run.outputs.}
+                    processed_config[evaluator][map_to_key] = map_value.replace("${target.", "${run.outputs.")
+
+    return processed_config
+
+
+def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Change the column names for the data frame. The change happens inplace.
+
+    The columns with the _OUTPUTS prefix will not be changed. The _OUTPUTS prefix
+    will be added to columns in the target_generated set. The remaining columns will get
+    the "inputs." prefix.
+
+    :param df: The data frame to apply changes to.
+    :type df: pandas.DataFrame
+    :return: The changed data frame.
+    :rtype: pandas.DataFrame
+    """
+    rename_dict = {}
+    for col in df.columns:
+        # Rename columns generated by target.
+        if Prefixes.TSG_OUTPUTS in col:
+            rename_dict[col] = col.replace(Prefixes.TSG_OUTPUTS, Prefixes.OUTPUTS)
+        else:
+            rename_dict[col] = f"inputs.{col}"
+    df.rename(columns=rename_dict, inplace=True)
+    return df
+
+
+# @log_evaluate_activity
+def evaluate(
+    *,
+    data: Union[str, os.PathLike],
+    evaluators: Dict[str, Callable],
+    evaluation_name: Optional[str] = None,
+    target: Optional[Callable] = None,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
+    azure_ai_project: Optional[AzureAIProject] = None,
+    output_path: Optional[Union[str, os.PathLike]] = None,
+    **kwargs,
+) -> EvaluationResult:
+    """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
+    data will be run through target function and then results will be evaluated.
+
+    :keyword data: Path to the data to be evaluated or passed to target if target is set.
+        Only .jsonl format files are supported. `target` and `data` both cannot be None. Required.
+    :paramtype data: str
+    :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
+        and value as the evaluator function. Required.
+    :paramtype evaluators: Dict[str, Callable]
+    :keyword evaluation_name: Display name of the evaluation.
+    :paramtype evaluation_name: Optional[str]
+    :keyword target: Target to be evaluated. `target` and `data` both cannot be None.
+    :paramtype target: Optional[Callable]
+    :keyword evaluator_config: Configuration for evaluators. The configuration should be a dictionary with evaluator
+        names as keys and values that are dictionaries containing the column mappings. The column mappings should
+        be a dictionary with keys as the column names in the evaluator input and values as the column names in the
+        input data or data generated by target.
+    :paramtype evaluator_config: Optional[Dict[str, ~azure.ai.evaluation.EvaluatorConfig]]
+    :keyword output_path: The local folder or file path to save evaluation results to, if set. If a folder path is
+        provided, the results will be saved to a file named `evaluation_results.json` in the folder.
+    :paramtype output_path: Optional[str]
+    :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
+    :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
+    :return: Evaluation results.
+    :rtype: ~azure.ai.evaluation.EvaluationResult
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START evaluate_method]
+            :end-before: [END evaluate_method]
+            :language: python
+            :dedent: 8
+            :caption: Run an evaluation on local data with Coherence and Relevance evaluators.
+    """
+    try:
+        return _evaluate(
+            evaluation_name=evaluation_name,
+            target=target,
+            data=data,
+            evaluators=evaluators,
+            evaluator_config=evaluator_config,
+            azure_ai_project=azure_ai_project,
+            output_path=output_path,
+            **kwargs,
+        )
+    except Exception as e:
+        # Handle multiprocess bootstrap error
+        bootstrap_error = (
+            "An attempt has been made to start a new process before the\n "
+            "current process has finished its bootstrapping phase."
+        )
+        if bootstrap_error in str(e):
+            error_message = (
+                "The evaluation failed due to an error during multiprocess bootstrapping."
+                "Please ensure the evaluate API is properly guarded with the '__main__' block:\n\n"
+                "    if __name__ == '__main__':\n"
+                "        evaluate(...)"
+            )
+            raise EvaluationException(
+                message=error_message,
+                internal_message=error_message,
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.FAILED_EXECUTION,
+                blame=ErrorBlame.USER_ERROR,
+            ) from e
+
+        # Ensure a consistent user experience when encountering errors by converting
+        # all other exceptions to EvaluationException.
+        if not isinstance(e, EvaluationException):
+            raise EvaluationException(
+                message=str(e),
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.FAILED_EXECUTION,
+                blame=ErrorBlame.SYSTEM_ERROR,
+            ) from e
+
+        raise e
+
+
+def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
+    # Extract evaluators with a non-empty "run_summary"
+    output_dict = {
+        name: result["run_summary"] for name, result in per_evaluator_results.items() if result.get("run_summary")
+    }
+
+    if output_dict:
+        print("======= Combined Run Summary (Per Evaluator) =======\n")
+        print(json.dumps(output_dict, indent=4))
+        print("\n====================================================\n")
+
+
+def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
+    *,
+    evaluators: Dict[str, Callable],
+    evaluation_name: Optional[str] = None,
+    target: Optional[Callable] = None,
+    data: Union[str, os.PathLike],
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
+    azure_ai_project: Optional[AzureAIProject] = None,
+    output_path: Optional[Union[str, os.PathLike]] = None,
+    **kwargs,
+) -> EvaluationResult:
+    input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)
+
+    # Process evaluator config to replace ${target.} with ${data.}
+    if evaluator_config is None:
+        evaluator_config = {}
+    # extract column mapping dicts into dictionary mapping evaluator name to column mapping
+    column_mapping = _process_column_mappings(
+        {
+            evaluator_name: evaluator_configuration.get("column_mapping", None)
+            for evaluator_name, evaluator_configuration in evaluator_config.items()
+        }
+    )
+
+    if target is not None:
+        _validate_columns_for_target(input_data_df, target)
+
+    pf_client = PFClient(user_agent=USER_AGENT)
+    target_run: Optional[Run] = None
+
+    # Create default configuration for evaluators that directly maps
+    # input data names to keyword inputs of the same name in the evaluators.
+    column_mapping = column_mapping or {}
+    column_mapping.setdefault("default", {})
+
+    # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
+    target_generated_columns: Set[str] = set()
+    if data is not None and target is not None:
+        input_data_df, target_generated_columns, target_run = _apply_target_to_data(
+            target, data, pf_client, input_data_df, evaluation_name, **kwargs
+        )
+
+        for evaluator_name, mapping in column_mapping.items():
+            mapped_to_values = set(mapping.values())
+            for col in target_generated_columns:
+                # If user defined mapping differently, do not change it.
+                # If it was mapped to target, we have already changed it
+                # in _process_column_mappings
+                run_output = f"${{run.outputs.{col}}}"
+                # We will add our mapping only if
+                # the customer did not map the target output.
+                if col not in mapping and run_output not in mapped_to_values:
+                    column_mapping[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup
+
+    # After we have generated all columns, we can check if we have everything we need for evaluators.
+    _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)
+
+    # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
+    # via target mapping.
+    # If both the data and the output dictionary of the target function
+    # have the same column, then the target function value is used.
+    if input_data_df is not None:
+        for col in input_data_df.columns:
+            # Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
+            # Also ignore columns that are already in config, since they've been covered by target mapping.
+            if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
+                column_mapping["default"][col] = f"${{data.{col}}}"
+
+    def eval_batch_run(
+        batch_run_client: TClient, *, data=Union[str, os.PathLike, pd.DataFrame]
+    ) -> Dict[str, __EvaluatorInfo]:
+        with EvalRunContext(batch_run_client):
+            runs = {
+                evaluator_name: batch_run_client.run(
+                    flow=evaluator,
+                    run=target_run,
+                    evaluator_name=evaluator_name,
+                    column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
+                    data=data,
+                    stream=True,
+                    name=kwargs.get("_run_name"),
+                )
+                for evaluator_name, evaluator in evaluators.items()
+            }
+
+            # get_details needs to be called within EvalRunContext scope in order to have user agent populated
+            return {
+                evaluator_name: {
+                    "result": batch_run_client.get_details(run, all_results=True),
+                    "metrics": batch_run_client.get_metrics(run),
+                    "run_summary": batch_run_client.get_run_summary(run),
+                }
+                for evaluator_name, run in runs.items()
+            }
+
+    # Batch Run
+    use_pf_client = kwargs.get("_use_pf_client", True)
+    if use_pf_client:
+        # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
+        # multiple evaluators. If the path is already absolute, abspath will return the original path.
+        data = os.path.abspath(data)
+        per_evaluator_results = eval_batch_run(ProxyClient(pf_client), data=data)
+    else:
+        data = input_data_df
+        per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)
+
+    # Concatenate all results
+    evaluators_result_df = None
+    evaluators_metric = {}
+    for evaluator_name, evaluator_result in per_evaluator_results.items():
+        evaluator_result_df = evaluator_result["result"]
+
+        # drop input columns
+        evaluator_result_df = evaluator_result_df.drop(
+            columns=[col for col in evaluator_result_df.columns if str(col).startswith(Prefixes.INPUTS)]
+        )
+
+        # rename output columns
+        # Assuming after removing inputs columns, all columns are output columns
+        evaluator_result_df.rename(
+            columns={
+                col: f"outputs.{evaluator_name}.{str(col).replace(Prefixes.OUTPUTS, '')}"
+                for col in evaluator_result_df.columns
+            },
+            inplace=True,
+        )
+
+        evaluators_result_df = (
+            pd.concat([evaluators_result_df, evaluator_result_df], axis=1, verify_integrity=True)
+            if evaluators_result_df is not None
+            else evaluator_result_df
+        )
+
+        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_result["metrics"].items()})
+
+    # Rename columns, generated by target function to outputs instead of inputs.
+    # If target generates columns, already present in the input data, these columns
+    # will be marked as outputs already so we do not need to rename them.
+    input_data_df = _rename_columns_conditionally(input_data_df)
+
+    result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
+    metrics = _aggregate_metrics(evaluators_result_df, evaluators)
+    metrics.update(evaluators_metric)
+
+    # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
+    target_run = None
+    trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
+    studio_url = _log_metrics_and_instance_results(
+        metrics,
+        result_df,
+        trace_destination,
+        target_run,
+        evaluation_name,
+    )
+
+    result_df_dict = result_df.to_dict("records")
+    result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore
+
+    _print_summary(per_evaluator_results)
+
+    if output_path:
+        _write_output(output_path, result)
+
+    return result
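Taken together, the `evaluate` docstring and the helpers above (`_validate_and_load_data`, `_apply_target_to_data`, `_process_column_mappings`) describe the expected call shape. Below is a minimal, hedged sketch of an invocation: the `if __name__ == '__main__':` guard follows the advice embedded in `evaluate`'s multiprocess-bootstrapping error message, and the `${data.*}`/`${target.*}` references use the only forms `_process_column_mappings` accepts. The `CoherenceEvaluator`/`RelevanceEvaluator` class names, their `model_config` constructor argument, and the `my_target` callable are assumptions based on the evaluator modules in the file list and the docstring's sample caption; they are not defined in this hunk.

```python
# Minimal sketch only -- not the package's official sample.
from azure.ai.evaluation import evaluate, CoherenceEvaluator, RelevanceEvaluator

# Assumed AzureOpenAIModelConfiguration-style dict; replace the placeholders.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}


def my_target(*, query: str, **kwargs):
    # Hypothetical target: evaluate() runs each row of data.jsonl through this
    # callable, and its outputs become available to evaluators as ${target.<key>}
    # (rewritten to ${run.outputs.<key>} by _process_column_mappings).
    return {"response": f"You asked: {query}"}


if __name__ == "__main__":
    # The __main__ guard avoids the multiprocess bootstrapping error that
    # evaluate() detects and reports explicitly.
    result = evaluate(
        data="data.jsonl",  # JSONL file with at least a "query" column
        target=my_target,
        evaluators={
            "coherence": CoherenceEvaluator(model_config=model_config),
            "relevance": RelevanceEvaluator(model_config=model_config),
        },
        evaluator_config={
            "relevance": {
                "column_mapping": {
                    "query": "${data.query}",
                    "response": "${target.response}",
                }
            }
        },
        output_path="./evaluation_results.json",
    )
    print(result["metrics"])
```

If the run is also logged to AI Studio via `azure_ai_project`, the returned dictionary additionally carries a `studio_url`, as seen in the `EvaluationResult` construction at the end of `_evaluate`.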