azure-ai-evaluation 1.10.0__py3-none-any.whl → 1.11.1__py3-none-any.whl
This diff compares the publicly released contents of the two package versions as they appear in their public registry, and is provided for informational purposes only.
Note: this release has been flagged as potentially problematic.
- azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
- azure/ai/evaluation/_converters/_ai_services.py +60 -10
- azure/ai/evaluation/_converters/_models.py +75 -26
- azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +13 -4
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +104 -35
- azure/ai/evaluation/_evaluate/_utils.py +4 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +2 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +113 -19
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +1 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +113 -3
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +8 -2
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +2 -1
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +10 -2
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +2 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +2 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +8 -2
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +104 -60
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +58 -41
- azure/ai/evaluation/_exceptions.py +1 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +2 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
- azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
- azure/ai/evaluation/red_team/_red_team.py +697 -3067
- azure/ai/evaluation/red_team/_result_processor.py +610 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +3 -1
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
- azure/ai/evaluation/simulator/_adversarial_simulator.py +9 -0
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +19 -5
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +4 -3
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info}/METADATA +39 -3
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info}/RECORD +49 -41
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info}/WHEEL +1 -1
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info/licenses}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate_aoai.py:

@@ -29,6 +29,10 @@ class OAIEvalRunCreationInfo(TypedDict, total=True):
     eval_group_id: str
     eval_run_id: str
     grader_name_map: Dict[str, str]
+    # Total number of expected rows in the original dataset. Used to
+    # re-align AOAI grader results to guard against silent row drops
+    # causing horizontal concatenation misalignment.
+    expected_rows: int


 def _split_evaluators_and_grader_configs(
@@ -157,7 +161,11 @@ def _begin_single_aoai_evaluation(
     )

     return OAIEvalRunCreationInfo(
-        client=client,
+        client=client,
+        eval_group_id=eval_group_info.id,
+        eval_run_id=eval_run_id,
+        grader_name_map=grader_name_map,
+        expected_rows=len(data),
     )

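
The two hunks above extend the AOAI eval-run bookkeeping record with an `expected_rows` field populated from the submitted dataset length, so the result-fetching code can later detect silently dropped rows. A minimal sketch of the pattern, with the AOAI client field omitted to keep it self-contained (`make_run_info` is a hypothetical helper, not part of the SDK):

```python
from typing import Dict, List, TypedDict


class OAIEvalRunCreationInfo(TypedDict, total=True):
    """Bookkeeping for a submitted AOAI eval run (mirrors the diff; the real record also carries the client)."""

    eval_group_id: str
    eval_run_id: str
    grader_name_map: Dict[str, str]
    # Number of rows in the original dataset, used later to detect dropped rows.
    expected_rows: int


def make_run_info(
    eval_group_id: str,
    eval_run_id: str,
    grader_name_map: Dict[str, str],
    data: List[dict],
) -> OAIEvalRunCreationInfo:
    # Record the dataset length at submission time so result alignment can be checked later.
    return OAIEvalRunCreationInfo(
        eval_group_id=eval_group_id,
        eval_run_id=eval_run_id,
        grader_name_map=grader_name_map,
        expected_rows=len(data),
    )


info = make_run_info("grp_1", "run_1", {"criteria_0": "my_grader"}, data=[{"query": "a"}, {"query": "b"}])
print(info["expected_rows"])  # 2
```
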
@@ -214,7 +222,7 @@ def _get_single_run_results(
     )

     # Convert run results into a dictionary of metrics
-    run_metrics = {}
+    run_metrics: Dict[str, Any] = {}
     if run_results.per_testing_criteria_results is None:
         msg = (
             "AOAI evaluation run returned no results, despite 'completed' status. This might"
@@ -231,28 +239,16 @@ def _get_single_run_results(
         grader_name = run_info["grader_name_map"][criteria_result.testing_criteria]
         passed = criteria_result.passed
         failed = criteria_result.failed
-        ratio = passed / (passed + failed)
+        ratio = passed / (passed + failed) if (passed + failed) else 0.0
         formatted_column_name = f"{grader_name}.pass_rate"
         run_metrics[formatted_column_name] = ratio

-    # Get full results and convert them into a dataframe.
-    # Notes on raw full data output from OAI eval runs:
-    # Each row in the full results list in itself a list.
-    # Each entry corresponds to one grader's results from the criteria list
-    # that was inputted to the eval group.
-    # Each entry is a dictionary, with a name, sample, passed boolean, and score number.
-    # The name is used to figure out which grader the entry refers to, the sample is ignored.
-    # The passed and score values are then added to the results dictionary, prepended with the grader's name
-    # as entered by the user in the inputted dictionary.
-    # Other values, if they exist, are also added to the results dictionary.
-
     # Collect all results with pagination
-    all_results = []
-    next_cursor = None
+    all_results: List[Any] = []
+    next_cursor: Optional[str] = None
     limit = 100  # Max allowed by API

     while True:
-        # Build kwargs for the API call
         list_kwargs = {"eval_id": run_info["eval_group_id"], "run_id": run_info["eval_run_id"], "limit": limit}
         if next_cursor is not None:
             list_kwargs["after"] = next_cursor
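
The first change in this hunk guards the pass-rate division so a grader that reports zero passed and zero failed rows yields a 0.0 pass rate instead of raising `ZeroDivisionError`. A quick illustration of the guarded expression:

```python
def pass_rate(passed: int, failed: int) -> float:
    # Mirrors the guarded expression in the diff: avoid ZeroDivisionError
    # when a grader returns no per-criteria results at all.
    return passed / (passed + failed) if (passed + failed) else 0.0


assert pass_rate(8, 2) == 0.8
assert pass_rate(0, 0) == 0.0  # previously raised ZeroDivisionError
```
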
@@ -265,28 +261,50 @@ def _get_single_run_results(
         # Check for more pages
         if hasattr(raw_list_results, "has_more") and raw_list_results.has_more:
             if hasattr(raw_list_results, "data") and len(raw_list_results.data) > 0:
-                # Get the last item's ID for cursor-based pagination
                 next_cursor = raw_list_results.data[-1].id
             else:
                 break
         else:
             break

-    listed_results = {"index": []}
-    #
-    # datasource_item_id
+    listed_results: Dict[str, List[Any]] = {"index": []}
+    # Raw data has no order guarantees; capture datasource_item_id per row for ordering.
     for row_result in all_results:
-        # Add the datasource_item_id for later sorting
         listed_results["index"].append(row_result.datasource_item_id)
         for single_grader_row_result in row_result.results:
-
-
-
+            if isinstance(single_grader_row_result, dict):
+                result_dict = single_grader_row_result
+            elif hasattr(single_grader_row_result, "model_dump"):
+                result_dict = single_grader_row_result.model_dump()
+            elif hasattr(single_grader_row_result, "dict"):
+                result_dict = single_grader_row_result.dict()
+            elif hasattr(single_grader_row_result, "__dict__"):
+                result_dict = vars(single_grader_row_result)
+            else:
+                raise EvaluationException(
+                    message=("Unsupported AOAI evaluation result type: " f"{type(single_grader_row_result)!r}."),
+                    blame=ErrorBlame.UNKNOWN,
+                    category=ErrorCategory.FAILED_EXECUTION,
+                    target=ErrorTarget.AOAI_GRADER,
+                )
+
+            grader_result_name = result_dict.get("name", None)
+            if grader_result_name is None:
+                raise EvaluationException(
+                    message="AOAI evaluation response missing grader result name; unable to map to original grader.",
+                    blame=ErrorBlame.UNKNOWN,
+                    category=ErrorCategory.FAILED_EXECUTION,
+                    target=ErrorTarget.AOAI_GRADER,
+                )
+
+            grader_name = run_info["grader_name_map"][grader_result_name]
+            for name, value in result_dict.items():
+                if name in ["name"]:
                     continue
                 if name.lower() == "passed":
-                    #
+                    # Create a `_result` column for each grader
                     result_column_name = f"outputs.{grader_name}.{grader_name}_result"
-                    if len(result_column_name) < 50:
+                    if len(result_column_name) < 50:
                         if result_column_name not in listed_results:
                             listed_results[result_column_name] = []
                         listed_results[result_column_name].append(EVALUATION_PASS_FAIL_MAPPING[value])
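
The loop above now normalizes each per-grader row result into a plain dict before reading fields from it, accepting dicts, Pydantic v2 models (`model_dump`), Pydantic v1 models (`dict`), or any object with `__dict__`, and raising otherwise. A dependency-free sketch of the same chain, with `ValueError` standing in for the SDK's `EvaluationException`:

```python
from typing import Any, Dict


def normalize_grader_result(result: Any) -> Dict[str, Any]:
    """Coerce a per-grader row result into a plain dict (mirrors the chain in the diff)."""
    if isinstance(result, dict):
        return result
    if hasattr(result, "model_dump"):   # Pydantic v2 models
        return result.model_dump()
    if hasattr(result, "dict"):         # Pydantic v1 models
        return result.dict()
    if hasattr(result, "__dict__"):     # plain objects
        return vars(result)
    # The SDK raises EvaluationException here; ValueError keeps the sketch dependency-free.
    raise ValueError(f"Unsupported AOAI evaluation result type: {type(result)!r}")


class _FakeRowResult:
    def __init__(self) -> None:
        self.name = "my_grader"
        self.passed = True
        self.score = 1.0


print(normalize_grader_result(_FakeRowResult()))  # {'name': 'my_grader', 'passed': True, 'score': 1.0}
```
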
@@ -296,23 +314,67 @@ def _get_single_run_results(
                         listed_results[formatted_column_name] = []
                     listed_results[formatted_column_name].append(value)

-    # Ensure all columns
+    # Ensure all columns are the same length as the 'index' list
     num_rows = len(listed_results["index"])
     for col_name in list(listed_results.keys()):
         if col_name != "index":
             col_length = len(listed_results[col_name])
             if col_length < num_rows:
-                # Pad with None values
                 listed_results[col_name].extend([None] * (num_rows - col_length))
             elif col_length > num_rows:
-                # This shouldn't happen, but truncate if it does
                 listed_results[col_name] = listed_results[col_name][:num_rows]

     output_df = pd.DataFrame(listed_results)
-
-
-
-
+
+    # If the 'index' column is missing for any reason, synthesize it from the current RangeIndex.
+    if "index" not in output_df.columns:
+        output_df["index"] = list(range(len(output_df)))
+
+    # Deterministic ordering by original datasource_item_id
+    output_df = output_df.sort_values("index", ascending=True)
+
+    # Keep a temporary row-id copy for debugging/inspection.
+    # Use underscores (not hyphens) to avoid pandas column handling quirks.
+    output_df["__azure_ai_evaluation_index"] = output_df["index"]
+
+    # Preserve original ids as index, then pad to expected length
+    output_df.set_index("index", inplace=True)
+
+    expected = run_info.get("expected_rows", None)
+    if expected is not None:
+        pre_len = len(output_df)
+        # Assumes original datasource_item_id space is 0..expected-1
+        output_df = output_df.reindex(range(expected))
+        if pre_len != expected:
+            missing_rows = expected - pre_len
+            LOGGER.warning(
+                "AOAI grader run %s returned %d/%d rows; %d missing row(s) padded with NaN for alignment.",
+                run_info["eval_run_id"],
+                pre_len,
+                expected,
+                missing_rows,
+            )
+            # Add a per-grader 'row_missing' boolean for padded rows
+            grader_user_names: Set[str] = set()
+            for col in output_df.columns:
+                if col.startswith("outputs."):
+                    parts = col.split(".")
+                    if len(parts) > 2:
+                        grader_user_names.add(parts[1])
+            if grader_user_names:
+                missing_index_mask = output_df.isna().all(axis=1)
+                for g in grader_user_names:
+                    col_name = f"outputs.{g}.row_missing"
+                    if col_name not in output_df:
+                        output_df[col_name] = False
+                    output_df.loc[missing_index_mask, col_name] = True
+
+    # Drop the temporary helper column before returning (no public surface change)
+    if "__azure_ai_evaluation_index" in output_df.columns:
+        output_df.drop(columns=["__azure_ai_evaluation_index"], inplace=True, errors="ignore")
+
+    # Reset to RangeIndex so downstream concatenation aligns on position
+    output_df.reset_index(drop=True, inplace=True)
     return output_df, run_metrics

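
This hunk is the core of the row-alignment fix: results are sorted by their original `datasource_item_id`, reindexed to the `expected_rows` count recorded at submission time so dropped rows become NaN-padded rows, flagged per grader with a `row_missing` column, and finally reset to a positional index for horizontal concatenation. A condensed pandas sketch of that flow, assuming (as the diff's comment does) that the original ids are `0..expected-1`:

```python
import pandas as pd


def align_results(listed_results: dict, expected_rows: int) -> pd.DataFrame:
    """Sort by original row id, pad missing rows with NaN, and flag them per grader."""
    df = pd.DataFrame(listed_results)
    df = df.sort_values("index").set_index("index")
    pre_len = len(df)
    df = df.reindex(range(expected_rows))  # missing datasource_item_ids become NaN rows

    if pre_len != expected_rows:
        graders = {c.split(".")[1] for c in df.columns if c.startswith("outputs.") and c.count(".") >= 2}
        missing_mask = df.isna().all(axis=1)
        for grader in graders:
            df[f"outputs.{grader}.row_missing"] = False
            df.loc[missing_mask, f"outputs.{grader}.row_missing"] = True

    # Positional index so horizontal concatenation with the input dataframe lines up.
    return df.reset_index(drop=True)


# Row 1 was dropped by the service; it comes back as a NaN row flagged row_missing=True.
partial = {"index": [0, 2], "outputs.my_grader.score": [1.0, 0.0]}
print(align_results(partial, expected_rows=3))
```
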
@@ -406,8 +468,15 @@ def _get_graders_and_column_mappings(
     :rtype: List[Tuple[Dict[str, AoaiGrader], Optional[Dict[str, str]]]]
     """

+    if column_mappings is None:
+        return [({name: grader}, None) for name, grader in graders.items()]
     default_mapping = column_mappings.get("default", None)
-
+    if default_mapping is None:
+        default_mapping = {}
+    return [
+        ({name: grader}, None if column_mappings is None else column_mappings.get(name, default_mapping))
+        for name, grader in graders.items()
+    ]


 def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
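
`_get_graders_and_column_mappings` now tolerates a missing column-mappings dict and otherwise pairs each grader with its own mapping, falling back to the "default" entry. A reduced sketch of the selection rule (it drops the redundant `None` re-check that the diff keeps inside the comprehension):

```python
from typing import Dict, List, Optional, Tuple


def pair_graders_with_mappings(
    graders: Dict[str, object],
    column_mappings: Optional[Dict[str, Dict[str, str]]],
) -> List[Tuple[Dict[str, object], Optional[Dict[str, str]]]]:
    """Each grader is paired with its own mapping, the 'default' mapping, or None."""
    if column_mappings is None:
        return [({name: grader}, None) for name, grader in graders.items()]
    default_mapping = column_mappings.get("default") or {}
    return [
        ({name: grader}, column_mappings.get(name, default_mapping))
        for name, grader in graders.items()
    ]


graders = {"relevance": object(), "my_grader": object()}
mappings = {"default": {"query": "${data.question}"}, "my_grader": {"response": "${data.answer}"}}
for pair in pair_graders_with_mappings(graders, mappings):
    print(pair)
```
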
azure/ai/evaluation/_evaluate/_utils.py:

@@ -138,6 +138,7 @@ def _log_metrics_and_instance_results_onedp(
     project_url: str,
     evaluation_name: Optional[str],
     name_map: Dict[str, str],
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> Optional[str]:

@@ -191,6 +192,7 @@ def _log_metrics_and_instance_results_onedp(
         evaluation=EvaluationUpload(
             display_name=evaluation_name,
             properties=properties,
+            tags=tags,
         )
     )

@@ -215,6 +217,7 @@ def _log_metrics_and_instance_results(
     run: Optional[Run],
     evaluation_name: Optional[str],
     name_map: Dict[str, str],
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> Optional[str]:
     from azure.ai.evaluation._evaluate._eval_run import EvalRun
@@ -244,6 +247,7 @@ def _log_metrics_and_instance_results(
         workspace_name=ws_triad.workspace_name,
         management_client=management_client,
         promptflow_run=run,
+        tags=tags,
     ) as ev_run:
         artifact_name = EvalRun.EVALUATION_ARTIFACT

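
The four hunks above thread an optional `tags` dict through both logging helpers and into the uploaded run, defaulting to `None` so existing callers are unchanged. A schematic sketch of the pattern; the upload type here is a stand-in, not the SDK's `EvaluationUpload`:

```python
from dataclasses import dataclass, field
from typing import Dict, Optional


@dataclass
class FakeEvaluationUpload:
    """Stand-in for the service upload payload; the real code uses EvaluationUpload."""
    display_name: Optional[str]
    properties: Dict[str, str] = field(default_factory=dict)
    tags: Optional[Dict[str, str]] = None


def log_results(
    evaluation_name: Optional[str],
    properties: Dict[str, str],
    tags: Optional[Dict[str, str]] = None,
) -> FakeEvaluationUpload:
    # New optional parameter defaults to None, so existing callers keep working unchanged.
    return FakeEvaluationUpload(display_name=evaluation_name, properties=properties, tags=tags)


print(log_results("baseline-eval", {"runType": "eval_run"}, tags={"team": "search", "stage": "dev"}))
```
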
azure/ai/evaluation/_evaluators/_coherence/_coherence.py:

@@ -66,7 +66,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, threshold=3, credential=None):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -76,6 +76,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )

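
CoherenceEvaluator (and, later in this diff, FluencyEvaluator via the shared PromptyEvaluatorBase) now accepts an optional `credential` that is forwarded to the prompty flow as a token credential, which allows Entra ID authentication against Azure OpenAI instead of an API key. A hedged usage sketch, assuming keyless auth is configured on the endpoint and the placeholders are filled in:

```python
from azure.ai.evaluation import AzureOpenAIModelConfiguration, CoherenceEvaluator
from azure.identity import DefaultAzureCredential

# Model config without an api_key; authentication comes from the credential instead.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    azure_deployment="<your-deployment>",
)

# New in 1.11.x per the diff: prompty-based evaluators accept an optional credential.
coherence = CoherenceEvaluator(model_config, credential=DefaultAzureCredential())
result = coherence(
    query="What is Azure AI evaluation?",
    response="It is an SDK for evaluating generative AI applications.",
)
print(result)
```
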
azure/ai/evaluation/_evaluators/_common/_base_eval.py:

@@ -170,15 +170,15 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):

     # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~

-    def _derive_singleton_inputs(self) -> List[str]:
+    def _derive_singleton_inputs(self) -> List[List[str]]:
         """Inspect the evaluator's __call__ function to determine what singleton inputs are expected
         when the evaluator is being used in a non-conversation context.
         By default, it's assumed that any input that is NOT kwargs or a conversation are singleton inputs.
         Thankfully this works the way you'd hope, with the call_signature being based on the child
         function's signature, not the parent's.

-        :return: A list of
-        :rtype: List[str]
+        :return: A list of lists, where each inner list represents the singleton inputs for each overload.
+        :rtype: List[List[str]]
         """

         overloads = get_overloads(self.__call__)
@@ -186,15 +186,66 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             call_signatures = [inspect.signature(self.__call__)]
         else:
             call_signatures = [inspect.signature(overload) for overload in overloads]
-
-
+
+        overload_inputs = []
         for call_signature in call_signatures:
             params = call_signature.parameters
             if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs):
                 continue
             # exclude self since it is not a singleton input
-
-            return
+            overload_inputs.append([p for p in params if p != "self"])
+        return overload_inputs
+
+    def _get_matching_overload_inputs(self, **kwargs) -> List[str]:
+        """Find the overload that matches the provided kwargs and return its input parameters.
+
+        :keyword kwargs: The keyword arguments to match against overloads.
+        :type kwargs: Dict
+        :return: List of input parameter names for the matching overload.
+        :rtype: List[str]
+        """
+        overload_inputs = self._singleton_inputs
+        provided_keys = set(key for key, value in kwargs.items() if value is not None)
+
+        # Find the overload that best matches the provided parameters
+        best_match = None
+        best_score = -1
+
+        for inputs in overload_inputs:
+            input_set = set(inputs)
+
+            # Calculate match score: how many of the overload's params are provided
+            if input_set.issubset(provided_keys):
+                score = len(input_set)
+                if score > best_score:
+                    best_score = score
+                    best_match = inputs
+
+        # If exact match found, return it
+        if best_match is not None:
+            return best_match
+
+        # If no exact match, find the overload with the most overlap
+        for inputs in overload_inputs:
+            input_set = set(inputs)
+            overlap = len(input_set.intersection(provided_keys))
+            if overlap > best_score:
+                best_score = overlap
+                best_match = inputs
+
+        # Return the best match or the first overload as fallback
+        return best_match if best_match is not None else (overload_inputs[0] if overload_inputs else [])
+
+    def _get_all_singleton_inputs(self) -> List[str]:
+        """Get a flattened list of all possible singleton inputs across all overloads.
+
+        :return: Flattened list of all singleton input names.
+        :rtype: List[str]
+        """
+        all_inputs = set()
+        for inputs in self._singleton_inputs:
+            all_inputs.update(inputs)
+        return list(all_inputs)

     def _derive_conversation_converter(
         self,
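
`_derive_singleton_inputs` now returns one parameter list per `__call__` overload, and the new `_get_matching_overload_inputs` scores each overload against the kwargs that were actually provided: an overload whose full parameter set is present wins (largest first), otherwise the overload with the most overlap is chosen. A standalone sketch of the same scoring rule on toy overload signatures:

```python
from typing import Dict, List, Optional


def match_overload(overload_inputs: List[List[str]], provided: Dict[str, object]) -> List[str]:
    """Pick the overload whose parameters best match the non-None kwargs (mirrors the diff)."""
    provided_keys = {k for k, v in provided.items() if v is not None}
    best_match: Optional[List[str]] = None
    best_score = -1

    # Prefer overloads whose full parameter set was provided; the largest such set wins.
    for inputs in overload_inputs:
        if set(inputs).issubset(provided_keys) and len(inputs) > best_score:
            best_score = len(inputs)
            best_match = inputs
    if best_match is not None:
        return best_match

    # Otherwise fall back to the overload with the most overlap.
    for inputs in overload_inputs:
        overlap = len(set(inputs) & provided_keys)
        if overlap > best_score:
            best_score = overlap
            best_match = inputs
    return best_match if best_match is not None else (overload_inputs[0] if overload_inputs else [])


# Two hypothetical overloads: (query, response) and (query, response, context).
overloads = [["query", "response"], ["query", "response", "context"]]
print(match_overload(overloads, {"query": "q", "response": "r"}))                   # ['query', 'response']
print(match_overload(overloads, {"query": "q", "response": "r", "context": "c"}))   # ['query', 'response', 'context']
```
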
@@ -206,10 +257,11 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         :return: The function that will be used to convert conversations to evaluable inputs.
         :rtype: Callable
         """
-
-
-
-
+        all_singleton_inputs = self._get_all_singleton_inputs()
+        include_context = "context" in all_singleton_inputs
+        include_query = "query" in all_singleton_inputs
+        include_response = "response" in all_singleton_inputs
+        include_ground_truth = "ground_truth" in all_singleton_inputs

         def converter(conversation: Dict) -> List[DerivedEvalInput]:
             messages = cast(List[Dict[str, Any]], conversation["messages"])
@@ -319,9 +371,9 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         (like a query and response), or they receive conversation that iss a list of dictionary
         values.

-        The self._singleton_inputs list assigned during initialization
-
-
+        The self._singleton_inputs list (containing overload signatures) assigned during initialization
+        is used to find and extract singleton keywords, and determine which overload matches the
+        provided arguments.

         If both conversations and singletons are allowed, the function will raise an exception if both
         are inputted.
@@ -339,7 +391,10 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         conversation = kwargs.get("conversation", None)
         singletons = {}
         if len(self._singleton_inputs) > 0:
-
+            # Get all possible singleton inputs and check what's provided
+            all_singleton_inputs = self._get_all_singleton_inputs()
+            singletons = {key: kwargs.get(key, None) for key in all_singleton_inputs}
+
         # Check that both conversation and other inputs aren't set
         if conversation is not None and any(singletons.values()):
             msg = f"{type(self).__name__}: Cannot provide both 'conversation' and individual inputs at the same time."
@@ -354,10 +409,16 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             if self._is_multi_modal_conversation(conversation):
                 return self._derive_multi_modal_conversation_converter()(conversation)
             return self._derive_conversation_converter()(conversation)
-
-
-
-
+
+        # Handle Singletons - find matching overload
+        matching_inputs = self._get_matching_overload_inputs(**kwargs)
+        if matching_inputs:
+            # Check if all required inputs for this overload are provided
+            required_singletons = {key: kwargs.get(key, None) for key in matching_inputs}
+            required_singletons = remove_optional_singletons(self, required_singletons)
+            if all(value is not None for value in required_singletons.values()):
+                return [singletons]
+
         # Missing input
         msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided."
         raise EvaluationException(
@@ -416,6 +477,39 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             aggregated["evaluation_per_turn"] = evaluation_per_turn
         return aggregated

+    def _parse_tools_from_response(self, response):
+        """Parse the response to extract tool calls and results.
+        :param response: The response to parse.
+        :type response: Union[str, List[dict]]
+        :return: List of tool calls extracted from the response.
+        :rtype: List[dict]
+        """
+        tool_calls = []
+        tool_results_map = {}
+        if isinstance(response, list):
+            for message in response:
+                # Extract tool calls from assistant messages
+                if message.get("role") == "assistant" and isinstance(message.get("content"), list):
+                    for content_item in message.get("content"):
+                        if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
+                            tool_calls.append(content_item)
+
+                # Extract tool results from tool messages
+                elif message.get("role") == "tool" and message.get("tool_call_id"):
+                    tool_call_id = message.get("tool_call_id")
+                    if isinstance(message.get("content"), list) and len(message.get("content")) > 0:
+                        result_content = message.get("content")[0]
+                        if isinstance(result_content, dict) and result_content.get("type") == "tool_result":
+                            tool_results_map[tool_call_id] = result_content
+
+        # Attach results to their corresponding calls
+        for tool_call in tool_calls:
+            tool_call_id = tool_call.get("tool_call_id")
+            if tool_call_id in tool_results_map:
+                tool_call["tool_result"] = tool_results_map[tool_call_id]["tool_result"]
+
+        return tool_calls
+
     async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
         """The asynchronous call where real end-to-end evaluation logic is performed.

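
The new `_parse_tools_from_response` helper walks an agent-style message list, collects `tool_call` content items from assistant messages, and attaches the matching `tool_result` from tool messages by `tool_call_id`. The sketch below shows the message shape the code reads (the `name`/`arguments` fields are illustrative) and a condensed standalone version of the same logic:

```python
# Agent-style response: an assistant message carrying a tool_call, followed by the tool's result.
response = [
    {
        "role": "assistant",
        "content": [{
            "type": "tool_call",
            "tool_call_id": "call_1",
            "name": "fetch_weather",          # illustrative field
            "arguments": {"location": "Seattle"},  # illustrative field
        }],
    },
    {
        "role": "tool",
        "tool_call_id": "call_1",
        "content": [{"type": "tool_result", "tool_result": {"temperature": "14C"}}],
    },
]


def parse_tools_from_response(messages):
    """Condensed version of the new helper: collect tool_calls and join results by tool_call_id."""
    tool_calls = [
        item
        for msg in messages
        if msg.get("role") == "assistant" and isinstance(msg.get("content"), list)
        for item in msg["content"]
        if isinstance(item, dict) and item.get("type") == "tool_call"
    ]
    results = {
        msg["tool_call_id"]: msg["content"][0]["tool_result"]
        for msg in messages
        if msg.get("role") == "tool"
        and msg.get("tool_call_id")
        and isinstance(msg.get("content"), list)
        and msg["content"]
        and isinstance(msg["content"][0], dict)
        and msg["content"][0].get("type") == "tool_result"
    }
    for call in tool_calls:
        if call.get("tool_call_id") in results:
            call["tool_result"] = results[call["tool_call_id"]]
    return tool_calls


print(parse_tools_from_response(response))  # one tool_call dict with its 'tool_result' attached
```
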
azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py:

@@ -5,7 +5,7 @@
 import math
 import re
 import os
-from typing import Dict, TypeVar, Union
+from typing import Dict, Optional, TypeVar, Union

 if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
     from promptflow.core._flow import AsyncPrompty
@@ -13,6 +13,7 @@ else:
     from azure.ai.evaluation._legacy.prompty import AsyncPrompty
 from typing_extensions import override

+from azure.core.credentials import TokenCredential
 from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
 from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
@@ -63,6 +64,7 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         model_config: dict,
         eval_last_turn: bool = False,
         threshold: int = 3,
+        credential: Optional[TokenCredential] = None,
         _higher_is_better: bool = False,
         **kwargs,
     ) -> None:
@@ -82,7 +84,10 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         )

         self._flow = AsyncPrompty.load(
-            source=self._prompty_file,
+            source=self._prompty_file,
+            model=prompty_model_config,
+            token_credential=credential,
+            is_reasoning_model=self._is_reasoning_model,
         )

         # __call__ not overridden here because child classes have such varied signatures that there's no point
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py:

@@ -153,7 +153,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         if query is not None and self._evaluate_query:
             input_data["query"] = str(query)

-        if "context" in self.
+        if "context" in self._get_all_singleton_inputs():
             context = eval_input.get("context", None)
             if context is None:
                 raise EvaluationException(
azure/ai/evaluation/_evaluators/_fluency/_fluency.py:

@@ -68,7 +68,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, credential=None, threshold=3):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -78,6 +78,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
