azure-ai-evaluation 1.10.0__py3-none-any.whl → 1.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of azure-ai-evaluation has been flagged as potentially problematic.

Files changed (49)
  1. azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
  2. azure/ai/evaluation/_converters/_ai_services.py +60 -10
  3. azure/ai/evaluation/_converters/_models.py +75 -26
  4. azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
  5. azure/ai/evaluation/_evaluate/_evaluate.py +13 -4
  6. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +104 -35
  7. azure/ai/evaluation/_evaluate/_utils.py +4 -0
  8. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +2 -1
  9. azure/ai/evaluation/_evaluators/_common/_base_eval.py +113 -19
  10. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
  11. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +1 -1
  12. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -1
  13. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +113 -3
  14. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +8 -2
  15. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +2 -1
  16. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +10 -2
  17. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +2 -1
  18. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +2 -1
  19. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +8 -2
  20. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +104 -60
  21. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +58 -41
  22. azure/ai/evaluation/_exceptions.py +1 -0
  23. azure/ai/evaluation/_version.py +1 -1
  24. azure/ai/evaluation/red_team/__init__.py +2 -1
  25. azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
  26. azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
  27. azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
  28. azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
  29. azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
  30. azure/ai/evaluation/red_team/_red_team.py +697 -3067
  31. azure/ai/evaluation/red_team/_result_processor.py +610 -0
  32. azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
  33. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +3 -1
  34. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
  35. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  36. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  37. azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
  38. azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
  39. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  40. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  41. azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
  42. azure/ai/evaluation/simulator/_adversarial_simulator.py +9 -0
  43. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +19 -5
  44. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +4 -3
  45. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info}/METADATA +39 -3
  46. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info}/RECORD +49 -41
  47. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info}/WHEEL +1 -1
  48. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info/licenses}/NOTICE.txt +0 -0
  49. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate_aoai.py

@@ -29,6 +29,10 @@ class OAIEvalRunCreationInfo(TypedDict, total=True):
     eval_group_id: str
     eval_run_id: str
     grader_name_map: Dict[str, str]
+    # Total number of expected rows in the original dataset. Used to
+    # re-align AOAI grader results to guard against silent row drops
+    # causing horizontal concatenation misalignment.
+    expected_rows: int
 
 
 def _split_evaluators_and_grader_configs(
@@ -157,7 +161,11 @@ def _begin_single_aoai_evaluation(
     )
 
     return OAIEvalRunCreationInfo(
-        client=client, eval_group_id=eval_group_info.id, eval_run_id=eval_run_id, grader_name_map=grader_name_map
+        client=client,
+        eval_group_id=eval_group_info.id,
+        eval_run_id=eval_run_id,
+        grader_name_map=grader_name_map,
+        expected_rows=len(data),
     )
 
 
@@ -214,7 +222,7 @@ def _get_single_run_results(
     )
 
     # Convert run results into a dictionary of metrics
-    run_metrics = {}
+    run_metrics: Dict[str, Any] = {}
     if run_results.per_testing_criteria_results is None:
         msg = (
             "AOAI evaluation run returned no results, despite 'completed' status. This might"
@@ -231,28 +239,16 @@ def _get_single_run_results(
         grader_name = run_info["grader_name_map"][criteria_result.testing_criteria]
         passed = criteria_result.passed
         failed = criteria_result.failed
-        ratio = passed / (passed + failed)
+        ratio = passed / (passed + failed) if (passed + failed) else 0.0
         formatted_column_name = f"{grader_name}.pass_rate"
         run_metrics[formatted_column_name] = ratio
 
-    # Get full results and convert them into a dataframe.
-    # Notes on raw full data output from OAI eval runs:
-    # Each row in the full results list in itself a list.
-    # Each entry corresponds to one grader's results from the criteria list
-    # that was inputted to the eval group.
-    # Each entry is a dictionary, with a name, sample, passed boolean, and score number.
-    # The name is used to figure out which grader the entry refers to, the sample is ignored.
-    # The passed and score values are then added to the results dictionary, prepended with the grader's name
-    # as entered by the user in the inputted dictionary.
-    # Other values, if they exist, are also added to the results dictionary.
-
     # Collect all results with pagination
-    all_results = []
-    next_cursor = None
+    all_results: List[Any] = []
+    next_cursor: Optional[str] = None
     limit = 100  # Max allowed by API
 
     while True:
-        # Build kwargs for the API call
         list_kwargs = {"eval_id": run_info["eval_group_id"], "run_id": run_info["eval_run_id"], "limit": limit}
         if next_cursor is not None:
             list_kwargs["after"] = next_cursor
@@ -265,28 +261,50 @@ def _get_single_run_results(
         # Check for more pages
         if hasattr(raw_list_results, "has_more") and raw_list_results.has_more:
             if hasattr(raw_list_results, "data") and len(raw_list_results.data) > 0:
-                # Get the last item's ID for cursor-based pagination
                 next_cursor = raw_list_results.data[-1].id
             else:
                 break
         else:
             break
 
-    listed_results = {"index": []}
-    # raw data has no order guarantees, we need to sort them by their
-    # datasource_item_id
+    listed_results: Dict[str, List[Any]] = {"index": []}
+    # Raw data has no order guarantees; capture datasource_item_id per row for ordering.
    for row_result in all_results:
-        # Add the datasource_item_id for later sorting
        listed_results["index"].append(row_result.datasource_item_id)
        for single_grader_row_result in row_result.results:
-            grader_name = run_info["grader_name_map"][single_grader_row_result["name"]]
-            for name, value in single_grader_row_result.items():
-                if name in ["name"]:  # Todo decide if we also want to exclude "sample"
+            if isinstance(single_grader_row_result, dict):
+                result_dict = single_grader_row_result
+            elif hasattr(single_grader_row_result, "model_dump"):
+                result_dict = single_grader_row_result.model_dump()
+            elif hasattr(single_grader_row_result, "dict"):
+                result_dict = single_grader_row_result.dict()
+            elif hasattr(single_grader_row_result, "__dict__"):
+                result_dict = vars(single_grader_row_result)
+            else:
+                raise EvaluationException(
+                    message=("Unsupported AOAI evaluation result type: " f"{type(single_grader_row_result)!r}."),
+                    blame=ErrorBlame.UNKNOWN,
+                    category=ErrorCategory.FAILED_EXECUTION,
+                    target=ErrorTarget.AOAI_GRADER,
+                )
+
+            grader_result_name = result_dict.get("name", None)
+            if grader_result_name is None:
+                raise EvaluationException(
+                    message="AOAI evaluation response missing grader result name; unable to map to original grader.",
+                    blame=ErrorBlame.UNKNOWN,
+                    category=ErrorCategory.FAILED_EXECUTION,
+                    target=ErrorTarget.AOAI_GRADER,
+                )
+
+            grader_name = run_info["grader_name_map"][grader_result_name]
+            for name, value in result_dict.items():
+                if name in ["name"]:
                     continue
                 if name.lower() == "passed":
-                    # create a `_result` column for each grader
+                    # Create a `_result` column for each grader
                     result_column_name = f"outputs.{grader_name}.{grader_name}_result"
-                    if len(result_column_name) < 50:  # TODO: is this the limit? Should we keep "passed"?
+                    if len(result_column_name) < 50:
                         if result_column_name not in listed_results:
                             listed_results[result_column_name] = []
                         listed_results[result_column_name].append(EVALUATION_PASS_FAIL_MAPPING[value])
@@ -296,23 +314,67 @@ def _get_single_run_results(
                         listed_results[formatted_column_name] = []
                     listed_results[formatted_column_name].append(value)
 
-    # Ensure all columns have the same length as the index
+    # Ensure all columns are the same length as the 'index' list
     num_rows = len(listed_results["index"])
     for col_name in list(listed_results.keys()):
         if col_name != "index":
             col_length = len(listed_results[col_name])
             if col_length < num_rows:
-                # Pad with None values
                 listed_results[col_name].extend([None] * (num_rows - col_length))
             elif col_length > num_rows:
-                # This shouldn't happen, but truncate if it does
                 listed_results[col_name] = listed_results[col_name][:num_rows]
 
     output_df = pd.DataFrame(listed_results)
-    # sort by index
-    output_df = output_df.sort_values("index", ascending=[True])
-    # remove index column
-    output_df.drop(columns=["index"], inplace=True)
+
+    # If the 'index' column is missing for any reason, synthesize it from the current RangeIndex.
+    if "index" not in output_df.columns:
+        output_df["index"] = list(range(len(output_df)))
+
+    # Deterministic ordering by original datasource_item_id
+    output_df = output_df.sort_values("index", ascending=True)
+
+    # Keep a temporary row-id copy for debugging/inspection.
+    # Use underscores (not hyphens) to avoid pandas column handling quirks.
+    output_df["__azure_ai_evaluation_index"] = output_df["index"]
+
+    # Preserve original ids as index, then pad to expected length
+    output_df.set_index("index", inplace=True)
+
+    expected = run_info.get("expected_rows", None)
+    if expected is not None:
+        pre_len = len(output_df)
+        # Assumes original datasource_item_id space is 0..expected-1
+        output_df = output_df.reindex(range(expected))
+        if pre_len != expected:
+            missing_rows = expected - pre_len
+            LOGGER.warning(
+                "AOAI grader run %s returned %d/%d rows; %d missing row(s) padded with NaN for alignment.",
+                run_info["eval_run_id"],
+                pre_len,
+                expected,
+                missing_rows,
+            )
+            # Add a per-grader 'row_missing' boolean for padded rows
+            grader_user_names: Set[str] = set()
+            for col in output_df.columns:
+                if col.startswith("outputs."):
+                    parts = col.split(".")
+                    if len(parts) > 2:
+                        grader_user_names.add(parts[1])
+            if grader_user_names:
+                missing_index_mask = output_df.isna().all(axis=1)
+                for g in grader_user_names:
+                    col_name = f"outputs.{g}.row_missing"
+                    if col_name not in output_df:
+                        output_df[col_name] = False
+                    output_df.loc[missing_index_mask, col_name] = True
+
+    # Drop the temporary helper column before returning (no public surface change)
+    if "__azure_ai_evaluation_index" in output_df.columns:
+        output_df.drop(columns=["__azure_ai_evaluation_index"], inplace=True, errors="ignore")
+
+    # Reset to RangeIndex so downstream concatenation aligns on position
+    output_df.reset_index(drop=True, inplace=True)
     return output_df, run_metrics
 
 
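Reviewer note: the re-alignment added above keys rows by `datasource_item_id`, reindexes against the full expected id range, and flags the all-NaN rows that padding introduces. A minimal standalone sketch of that pandas behavior, not the library code itself; the grader/column names are made up, and it assumes ids run 0..expected-1 as the in-diff comment states:

import pandas as pd

# Hypothetical grader output: the AOAI run returned ids 3, 0 and 1 out of an expected 5.
returned = pd.DataFrame(
    {
        "index": [3, 0, 1],  # datasource_item_id values, unordered
        "outputs.my_grader.score": [0.2, 0.9, 0.7],
    }
)

expected_rows = 5
df = returned.sort_values("index").set_index("index")
df = df.reindex(range(expected_rows))  # ids 2 and 4 were dropped -> all-NaN rows

# Mirror the per-grader 'row_missing' flag added for padded rows.
missing_mask = df.isna().all(axis=1)
df["outputs.my_grader.row_missing"] = False
df.loc[missing_mask, "outputs.my_grader.row_missing"] = True

# Back to a positional RangeIndex so horizontal concatenation lines up downstream.
df.reset_index(drop=True, inplace=True)
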
@@ -406,8 +468,15 @@ def _get_graders_and_column_mappings(
     :rtype: List[Tuple[Dict[str, AoaiGrader], Optional[Dict[str, str]]]]
     """
 
+    if column_mappings is None:
+        return [({name: grader}, None) for name, grader in graders.items()]
     default_mapping = column_mappings.get("default", None)
-    return [({name: grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()]
+    if default_mapping is None:
+        default_mapping = {}
+    return [
+        ({name: grader}, None if column_mappings is None else column_mappings.get(name, default_mapping))
+        for name, grader in graders.items()
+    ]
 
 
 def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
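
For reference, the `column_mappings is None` branch above pairs every grader with `None` instead of raising an attribute error, and a missing `"default"` entry now falls back to an empty mapping. A tiny standalone restatement of that pairing rule (not the library function; the grader objects and mapping values are placeholders):

from typing import Any, Dict, List, Optional, Tuple


def pair_graders_with_mappings(
    graders: Dict[str, Any],
    column_mappings: Optional[Dict[str, Dict[str, str]]],
) -> List[Tuple[Dict[str, Any], Optional[Dict[str, str]]]]:
    # No mappings at all: every grader is paired with None.
    if column_mappings is None:
        return [({name: grader}, None) for name, grader in graders.items()]
    # Otherwise fall back to the "default" mapping, or an empty dict if none was given.
    default_mapping = column_mappings.get("default") or {}
    return [({name: grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()]


graders = {"relevance": object(), "fluency": object()}
print(pair_graders_with_mappings(graders, None))
print(pair_graders_with_mappings(graders, {"default": {"query": "${data.query}"}}))
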
azure/ai/evaluation/_evaluate/_utils.py

@@ -138,6 +138,7 @@ def _log_metrics_and_instance_results_onedp(
     project_url: str,
     evaluation_name: Optional[str],
     name_map: Dict[str, str],
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> Optional[str]:
 
@@ -191,6 +192,7 @@ def _log_metrics_and_instance_results_onedp(
         evaluation=EvaluationUpload(
             display_name=evaluation_name,
             properties=properties,
+            tags=tags,
         )
     )
 
@@ -215,6 +217,7 @@ def _log_metrics_and_instance_results(
     run: Optional[Run],
     evaluation_name: Optional[str],
     name_map: Dict[str, str],
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> Optional[str]:
     from azure.ai.evaluation._evaluate._eval_run import EvalRun
@@ -244,6 +247,7 @@ def _log_metrics_and_instance_results(
         workspace_name=ws_triad.workspace_name,
         management_client=management_client,
         promptflow_run=run,
+        tags=tags,
     ) as ev_run:
         artifact_name = EvalRun.EVALUATION_ARTIFACT
 
azure/ai/evaluation/_evaluators/_coherence/_coherence.py

@@ -66,7 +66,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, threshold=3, credential=None):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -76,6 +76,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
 
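Reviewer note: the new `credential` keyword on `CoherenceEvaluator` (and, per the hunks further down, `FluencyEvaluator` and the shared prompty base) is forwarded to `AsyncPrompty.load` as `token_credential`, enabling keyless Entra ID auth for the judge model. A usage sketch, assuming the target Azure OpenAI endpoint accepts Entra ID tokens; the endpoint and deployment values are placeholders:

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import CoherenceEvaluator

# Placeholder model configuration; no api_key, relying on the credential instead.
model_config = {
    "azure_endpoint": "https://YOUR-RESOURCE.openai.azure.com",
    "azure_deployment": "YOUR-DEPLOYMENT",
}

evaluator = CoherenceEvaluator(model_config=model_config, credential=DefaultAzureCredential())
result = evaluator(query="What is the capital of France?", response="Paris is the capital of France.")
print(result["coherence"])
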
azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -170,15 +170,15 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
 
     # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~
 
-    def _derive_singleton_inputs(self) -> List[str]:
+    def _derive_singleton_inputs(self) -> List[List[str]]:
         """Inspect the evaluator's __call__ function to determine what singleton inputs are expected
         when the evaluator is being used in a non-conversation context.
         By default, it's assumed that any input that is NOT kwargs or a conversation are singleton inputs.
         Thankfully this works the way you'd hope, with the call_signature being based on the child
         function's signature, not the parent's.
 
-        :return: A list of strings representing the names of singleton inputs.
-        :rtype: List[str]
+        :return: A list of lists, where each inner list represents the singleton inputs for each overload.
+        :rtype: List[List[str]]
         """
 
         overloads = get_overloads(self.__call__)
@@ -186,15 +186,66 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             call_signatures = [inspect.signature(self.__call__)]
         else:
             call_signatures = [inspect.signature(overload) for overload in overloads]
-        call_signature = inspect.signature(self.__call__)
-        singletons = []
+
+        overload_inputs = []
         for call_signature in call_signatures:
             params = call_signature.parameters
             if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs):
                 continue
             # exclude self since it is not a singleton input
-            singletons.extend([p for p in params if p != "self"])
-        return singletons
+            overload_inputs.append([p for p in params if p != "self"])
+        return overload_inputs
+
+    def _get_matching_overload_inputs(self, **kwargs) -> List[str]:
+        """Find the overload that matches the provided kwargs and return its input parameters.
+
+        :keyword kwargs: The keyword arguments to match against overloads.
+        :type kwargs: Dict
+        :return: List of input parameter names for the matching overload.
+        :rtype: List[str]
+        """
+        overload_inputs = self._singleton_inputs
+        provided_keys = set(key for key, value in kwargs.items() if value is not None)
+
+        # Find the overload that best matches the provided parameters
+        best_match = None
+        best_score = -1
+
+        for inputs in overload_inputs:
+            input_set = set(inputs)
+
+            # Calculate match score: how many of the overload's params are provided
+            if input_set.issubset(provided_keys):
+                score = len(input_set)
+                if score > best_score:
+                    best_score = score
+                    best_match = inputs
+
+        # If exact match found, return it
+        if best_match is not None:
+            return best_match
+
+        # If no exact match, find the overload with the most overlap
+        for inputs in overload_inputs:
+            input_set = set(inputs)
+            overlap = len(input_set.intersection(provided_keys))
+            if overlap > best_score:
+                best_score = overlap
+                best_match = inputs
+
+        # Return the best match or the first overload as fallback
+        return best_match if best_match is not None else (overload_inputs[0] if overload_inputs else [])
+
+    def _get_all_singleton_inputs(self) -> List[str]:
+        """Get a flattened list of all possible singleton inputs across all overloads.
+
+        :return: Flattened list of all singleton input names.
+        :rtype: List[str]
+        """
+        all_inputs = set()
+        for inputs in self._singleton_inputs:
+            all_inputs.update(inputs)
+        return list(all_inputs)
 
     def _derive_conversation_converter(
         self,
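
In effect, `_derive_singleton_inputs` now returns one parameter list per `__call__` overload, and `_get_matching_overload_inputs` prefers the largest overload whose parameters are fully covered by the non-None kwargs, falling back to the best partial overlap. A simplified standalone sketch of that selection logic; the example evaluator and its overloads are hypothetical:

import inspect
from typing import List

from typing_extensions import get_overloads, overload


class ExampleEvaluator:
    @overload
    def __call__(self, *, query: str, response: str): ...
    @overload
    def __call__(self, *, query: str, response: str, context: str): ...

    def __call__(self, **kwargs):
        return kwargs


def derive_overload_inputs(evaluator) -> List[List[str]]:
    # One name list per registered overload, excluding self.
    overloads = get_overloads(evaluator.__call__)
    signatures = [inspect.signature(o) for o in overloads] or [inspect.signature(evaluator.__call__)]
    return [[p for p in sig.parameters if p != "self"] for sig in signatures]


def match_overload(overload_inputs: List[List[str]], **kwargs) -> List[str]:
    provided = {k for k, v in kwargs.items() if v is not None}
    # Prefer the largest overload fully covered by the provided kwargs.
    covered = [inputs for inputs in overload_inputs if set(inputs) <= provided]
    if covered:
        return max(covered, key=len)
    # Otherwise fall back to the overload with the most overlap.
    return max(overload_inputs, key=lambda inputs: len(set(inputs) & provided))


inputs = derive_overload_inputs(ExampleEvaluator())
print(inputs)                                           # [['query', 'response'], ['query', 'response', 'context']]
print(match_overload(inputs, query="q", response="r"))  # ['query', 'response']
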
@@ -206,10 +257,11 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         :return: The function that will be used to convert conversations to evaluable inputs.
         :rtype: Callable
         """
-        include_context = "context" in self._singleton_inputs
-        include_query = "query" in self._singleton_inputs
-        include_response = "response" in self._singleton_inputs
-        include_ground_truth = "ground_truth" in self._singleton_inputs
+        all_singleton_inputs = self._get_all_singleton_inputs()
+        include_context = "context" in all_singleton_inputs
+        include_query = "query" in all_singleton_inputs
+        include_response = "response" in all_singleton_inputs
+        include_ground_truth = "ground_truth" in all_singleton_inputs
 
         def converter(conversation: Dict) -> List[DerivedEvalInput]:
             messages = cast(List[Dict[str, Any]], conversation["messages"])
@@ -319,9 +371,9 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
        (like a query and response), or they receive conversation that iss a list of dictionary
        values.
 
-        The self._singleton_inputs list assigned during initialization is used to find and extract
-        singleton keywords, and self._allow_conversation_input is used to determine if a conversation
-        is a valid input.
+        The self._singleton_inputs list (containing overload signatures) assigned during initialization
+        is used to find and extract singleton keywords, and determine which overload matches the
+        provided arguments.
 
        If both conversations and singletons are allowed, the function will raise an exception if both
        are inputted.
@@ -339,7 +391,10 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         conversation = kwargs.get("conversation", None)
         singletons = {}
         if len(self._singleton_inputs) > 0:
-            singletons = {key: kwargs.get(key, None) for key in self._singleton_inputs}
+            # Get all possible singleton inputs and check what's provided
+            all_singleton_inputs = self._get_all_singleton_inputs()
+            singletons = {key: kwargs.get(key, None) for key in all_singleton_inputs}
+
         # Check that both conversation and other inputs aren't set
         if conversation is not None and any(singletons.values()):
             msg = f"{type(self).__name__}: Cannot provide both 'conversation' and individual inputs at the same time."
@@ -354,10 +409,16 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             if self._is_multi_modal_conversation(conversation):
                 return self._derive_multi_modal_conversation_converter()(conversation)
             return self._derive_conversation_converter()(conversation)
-        # Handle Singletons
-        required_singletons = remove_optional_singletons(self, singletons)
-        if all(value is not None for value in required_singletons.values()):
-            return [singletons]
+
+        # Handle Singletons - find matching overload
+        matching_inputs = self._get_matching_overload_inputs(**kwargs)
+        if matching_inputs:
+            # Check if all required inputs for this overload are provided
+            required_singletons = {key: kwargs.get(key, None) for key in matching_inputs}
+            required_singletons = remove_optional_singletons(self, required_singletons)
+            if all(value is not None for value in required_singletons.values()):
+                return [singletons]
+
         # Missing input
         msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided."
         raise EvaluationException(
@@ -416,6 +477,39 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             aggregated["evaluation_per_turn"] = evaluation_per_turn
         return aggregated
 
+    def _parse_tools_from_response(self, response):
+        """Parse the response to extract tool calls and results.
+        :param response: The response to parse.
+        :type response: Union[str, List[dict]]
+        :return: List of tool calls extracted from the response.
+        :rtype: List[dict]
+        """
+        tool_calls = []
+        tool_results_map = {}
+        if isinstance(response, list):
+            for message in response:
+                # Extract tool calls from assistant messages
+                if message.get("role") == "assistant" and isinstance(message.get("content"), list):
+                    for content_item in message.get("content"):
+                        if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
+                            tool_calls.append(content_item)
+
+                # Extract tool results from tool messages
+                elif message.get("role") == "tool" and message.get("tool_call_id"):
+                    tool_call_id = message.get("tool_call_id")
+                    if isinstance(message.get("content"), list) and len(message.get("content")) > 0:
+                        result_content = message.get("content")[0]
+                        if isinstance(result_content, dict) and result_content.get("type") == "tool_result":
+                            tool_results_map[tool_call_id] = result_content
+
+        # Attach results to their corresponding calls
+        for tool_call in tool_calls:
+            tool_call_id = tool_call.get("tool_call_id")
+            if tool_call_id in tool_results_map:
+                tool_call["tool_result"] = tool_results_map[tool_call_id]["tool_result"]
+
+        return tool_calls
+
     async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
         """The asynchronous call where real end-to-end evaluation logic is performed.
 
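The new `_parse_tools_from_response` helper walks an agent-style message list, collects `tool_call` content items from assistant messages, and attaches the matching `tool_result` from the corresponding tool message by `tool_call_id`. A sketch of the message shape it expects; the tool name, arguments, and values are invented for illustration:

# Agent-style response in the shape _parse_tools_from_response handles.
response = [
    {
        "role": "assistant",
        "content": [
            {
                "type": "tool_call",
                "tool_call_id": "call_1",
                "name": "fetch_weather",          # hypothetical tool
                "arguments": {"city": "Seattle"},
            }
        ],
    },
    {
        "role": "tool",
        "tool_call_id": "call_1",
        "content": [{"type": "tool_result", "tool_result": {"temperature_f": 57}}],
    },
]

# Expected extraction: one tool_call dict, enriched with its matched result, i.e.
# the first content item above plus "tool_result": {"temperature_f": 57}.
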
azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

@@ -5,7 +5,7 @@
 import math
 import re
 import os
-from typing import Dict, TypeVar, Union
+from typing import Dict, Optional, TypeVar, Union
 
 if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
     from promptflow.core._flow import AsyncPrompty
@@ -13,6 +13,7 @@ else:
     from azure.ai.evaluation._legacy.prompty import AsyncPrompty
 from typing_extensions import override
 
+from azure.core.credentials import TokenCredential
 from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
 from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
@@ -63,6 +64,7 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         model_config: dict,
         eval_last_turn: bool = False,
         threshold: int = 3,
+        credential: Optional[TokenCredential] = None,
         _higher_is_better: bool = False,
         **kwargs,
     ) -> None:
@@ -82,7 +84,10 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         )
 
         self._flow = AsyncPrompty.load(
-            source=self._prompty_file, model=prompty_model_config, is_reasoning_model=self._is_reasoning_model
+            source=self._prompty_file,
+            model=prompty_model_config,
+            token_credential=credential,
+            is_reasoning_model=self._is_reasoning_model,
         )
 
     # __call__ not overridden here because child classes have such varied signatures that there's no point
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

@@ -153,7 +153,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         if query is not None and self._evaluate_query:
             input_data["query"] = str(query)
 
-        if "context" in self._singleton_inputs:
+        if "context" in self._get_all_singleton_inputs():
             context = eval_input.get("context", None)
             if context is None:
                 raise EvaluationException(
azure/ai/evaluation/_evaluators/_fluency/_fluency.py

@@ -68,7 +68,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, credential=None, threshold=3):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -78,6 +78,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
 