azure-ai-evaluation 1.9.0__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of azure-ai-evaluation was flagged as possibly problematic.

Files changed (85)
  1. azure/ai/evaluation/__init__.py +46 -12
  2. azure/ai/evaluation/_aoai/python_grader.py +84 -0
  3. azure/ai/evaluation/_aoai/score_model_grader.py +1 -0
  4. azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
  5. azure/ai/evaluation/_common/rai_service.py +3 -3
  6. azure/ai/evaluation/_common/utils.py +74 -17
  7. azure/ai/evaluation/_converters/_ai_services.py +60 -10
  8. azure/ai/evaluation/_converters/_models.py +75 -26
  9. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +70 -22
  10. azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
  11. azure/ai/evaluation/_evaluate/_evaluate.py +163 -44
  12. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +79 -33
  13. azure/ai/evaluation/_evaluate/_utils.py +5 -2
  14. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
  15. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +8 -1
  16. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
  17. azure/ai/evaluation/_evaluators/_common/_base_eval.py +143 -25
  18. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
  19. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +19 -9
  20. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -5
  21. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -1
  22. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -1
  23. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +5 -2
  24. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -1
  25. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +3 -0
  26. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -0
  27. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +1 -1
  28. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
  29. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
  30. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +114 -4
  31. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +9 -3
  32. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -1
  33. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +8 -1
  34. azure/ai/evaluation/_evaluators/_qa/_qa.py +1 -1
  35. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +56 -3
  36. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
  37. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +11 -3
  38. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +3 -2
  39. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
  40. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +2 -1
  41. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -2
  42. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +24 -12
  43. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
  44. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +214 -187
  45. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +126 -31
  46. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +8 -1
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -1
  48. azure/ai/evaluation/_exceptions.py +1 -0
  49. azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
  50. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +115 -30
  51. azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
  52. azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
  53. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +28 -31
  54. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +2 -0
  55. azure/ai/evaluation/_version.py +1 -1
  56. azure/ai/evaluation/red_team/__init__.py +4 -3
  57. azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
  58. azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
  59. azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
  60. azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
  61. azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
  62. azure/ai/evaluation/red_team/_red_team.py +655 -2665
  63. azure/ai/evaluation/red_team/_red_team_result.py +6 -0
  64. azure/ai/evaluation/red_team/_result_processor.py +610 -0
  65. azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
  66. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +11 -4
  67. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
  68. azure/ai/evaluation/red_team/_utils/constants.py +0 -2
  69. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  70. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  71. azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
  72. azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
  73. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  74. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  75. azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
  76. azure/ai/evaluation/simulator/_adversarial_simulator.py +14 -2
  77. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +13 -1
  78. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +21 -7
  79. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +24 -5
  80. azure/ai/evaluation/simulator/_simulator.py +12 -0
  81. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/METADATA +63 -4
  82. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/RECORD +85 -76
  83. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/WHEEL +1 -1
  84. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info/licenses}/NOTICE.txt +0 -0
  85. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -4,14 +4,34 @@
 
 import inspect
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final, Optional
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generic,
+    List,
+    TypedDict,
+    TypeVar,
+    Union,
+    cast,
+    final,
+    Optional,
+)
 
 from azure.ai.evaluation._legacy._adapters.utils import async_run_allowing_running_loop
 from typing_extensions import ParamSpec, TypeAlias, get_overloads
 
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._exceptions import (
+    ErrorBlame,
+    ErrorCategory,
+    ErrorTarget,
+    EvaluationException,
+)
 from azure.ai.evaluation._common.utils import remove_optional_singletons
-from azure.ai.evaluation._constants import _AggregationType, EVALUATION_PASS_FAIL_MAPPING
+from azure.ai.evaluation._constants import (
+    _AggregationType,
+    EVALUATION_PASS_FAIL_MAPPING,
+)
 from azure.ai.evaluation._model_configurations import Conversation
 from azure.ai.evaluation._common._experimental import experimental
 
@@ -150,15 +170,15 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
 
     # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~
 
-    def _derive_singleton_inputs(self) -> List[str]:
+    def _derive_singleton_inputs(self) -> List[List[str]]:
         """Inspect the evaluator's __call__ function to determine what singleton inputs are expected
         when the evaluator is being used in a non-conversation context.
         By default, it's assumed that any input that is NOT kwargs or a conversation are singleton inputs.
         Thankfully this works the way you'd hope, with the call_signature being based on the child
         function's signature, not the parent's.
 
-        :return: A list of strings representing the names of singleton inputs.
-        :rtype: List[str]
+        :return: A list of lists, where each inner list represents the singleton inputs for each overload.
+        :rtype: List[List[str]]
         """
 
         overloads = get_overloads(self.__call__)
@@ -166,17 +186,70 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             call_signatures = [inspect.signature(self.__call__)]
         else:
             call_signatures = [inspect.signature(overload) for overload in overloads]
-        call_signature = inspect.signature(self.__call__)
-        singletons = []
+
+        overload_inputs = []
         for call_signature in call_signatures:
             params = call_signature.parameters
             if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs):
                 continue
             # exclude self since it is not a singleton input
-            singletons.extend([p for p in params if p != "self"])
-        return singletons
+            overload_inputs.append([p for p in params if p != "self"])
+        return overload_inputs
+
+    def _get_matching_overload_inputs(self, **kwargs) -> List[str]:
+        """Find the overload that matches the provided kwargs and return its input parameters.
+
+        :keyword kwargs: The keyword arguments to match against overloads.
+        :type kwargs: Dict
+        :return: List of input parameter names for the matching overload.
+        :rtype: List[str]
+        """
+        overload_inputs = self._singleton_inputs
+        provided_keys = set(key for key, value in kwargs.items() if value is not None)
+
+        # Find the overload that best matches the provided parameters
+        best_match = None
+        best_score = -1
+
+        for inputs in overload_inputs:
+            input_set = set(inputs)
+
+            # Calculate match score: how many of the overload's params are provided
+            if input_set.issubset(provided_keys):
+                score = len(input_set)
+                if score > best_score:
+                    best_score = score
+                    best_match = inputs
+
+        # If exact match found, return it
+        if best_match is not None:
+            return best_match
+
+        # If no exact match, find the overload with the most overlap
+        for inputs in overload_inputs:
+            input_set = set(inputs)
+            overlap = len(input_set.intersection(provided_keys))
+            if overlap > best_score:
+                best_score = overlap
+                best_match = inputs
+
+        # Return the best match or the first overload as fallback
+        return best_match if best_match is not None else (overload_inputs[0] if overload_inputs else [])
+
+    def _get_all_singleton_inputs(self) -> List[str]:
+        """Get a flattened list of all possible singleton inputs across all overloads.
+
+        :return: Flattened list of all singleton input names.
+        :rtype: List[str]
+        """
+        all_inputs = set()
+        for inputs in self._singleton_inputs:
+            all_inputs.update(inputs)
+        return list(all_inputs)
 
-    def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]:
+    def _derive_conversation_converter(
+        self,
+    ) -> Callable[[Dict], List[DerivedEvalInput]]:
         """Produce the function that will be used to convert conversations to a list of evaluable inputs.
         This uses the inputs derived from the _derive_singleton_inputs function to determine which
         aspects of a conversation ought to be extracted.
@@ -184,10 +257,11 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         :return: The function that will be used to convert conversations to evaluable inputs.
         :rtype: Callable
         """
-        include_context = "context" in self._singleton_inputs
-        include_query = "query" in self._singleton_inputs
-        include_response = "response" in self._singleton_inputs
-        include_ground_truth = "ground_truth" in self._singleton_inputs
+        all_singleton_inputs = self._get_all_singleton_inputs()
+        include_context = "context" in all_singleton_inputs
+        include_query = "query" in all_singleton_inputs
+        include_response = "response" in all_singleton_inputs
+        include_ground_truth = "ground_truth" in all_singleton_inputs
 
         def converter(conversation: Dict) -> List[DerivedEvalInput]:
             messages = cast(List[Dict[str, Any]], conversation["messages"])
@@ -235,7 +309,9 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
 
         return converter
 
-    def _derive_multi_modal_conversation_converter(self) -> Callable[[Dict], List[Dict[str, Any]]]:
+    def _derive_multi_modal_conversation_converter(
+        self,
+    ) -> Callable[[Dict], List[Dict[str, Any]]]:
         """Produce the function that will be used to convert multi-modal conversations to a list of evaluable inputs.
         This uses the inputs derived from the _derive_singleton_inputs function to determine which
         aspects of a conversation ought to be extracted.
@@ -288,16 +364,16 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
 
         return multi_modal_converter
 
-    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
+    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput], Dict[str, Any]]:
         """Convert an arbitrary input into a list of inputs for evaluators.
         It is assumed that evaluators generally make use of their inputs in one of two ways.
         Either they receive a collection of keyname inputs that are all single values
         (like a query and response), or they receive conversation that iss a list of dictionary
         values.
 
-        The self._singleton_inputs list assigned during initialization is used to find and extract
-        singleton keywords, and self._allow_conversation_input is used to determine if a conversation
-        is a valid input.
+        The self._singleton_inputs list (containing overload signatures) assigned during initialization
+        is used to find and extract singleton keywords, and determine which overload matches the
+        provided arguments.
 
         If both conversations and singletons are allowed, the function will raise an exception if both
         are inputted.
@@ -315,7 +391,10 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         conversation = kwargs.get("conversation", None)
         singletons = {}
         if len(self._singleton_inputs) > 0:
-            singletons = {key: kwargs.get(key, None) for key in self._singleton_inputs}
+            # Get all possible singleton inputs and check what's provided
+            all_singleton_inputs = self._get_all_singleton_inputs()
+            singletons = {key: kwargs.get(key, None) for key in all_singleton_inputs}
+
         # Check that both conversation and other inputs aren't set
         if conversation is not None and any(singletons.values()):
             msg = f"{type(self).__name__}: Cannot provide both 'conversation' and individual inputs at the same time."
@@ -330,10 +409,16 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             if self._is_multi_modal_conversation(conversation):
                 return self._derive_multi_modal_conversation_converter()(conversation)
             return self._derive_conversation_converter()(conversation)
-        # Handle Singletons
-        required_singletons = remove_optional_singletons(self, singletons)
-        if all(value is not None for value in required_singletons.values()):
-            return [singletons]
+
+        # Handle Singletons - find matching overload
+        matching_inputs = self._get_matching_overload_inputs(**kwargs)
+        if matching_inputs:
+            # Check if all required inputs for this overload are provided
+            required_singletons = {key: kwargs.get(key, None) for key in matching_inputs}
+            required_singletons = remove_optional_singletons(self, required_singletons)
+            if all(value is not None for value in required_singletons.values()):
+                return [singletons]
+
         # Missing input
         msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided."
         raise EvaluationException(
@@ -392,6 +477,39 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             aggregated["evaluation_per_turn"] = evaluation_per_turn
         return aggregated
 
+    def _parse_tools_from_response(self, response):
+        """Parse the response to extract tool calls and results.
+        :param response: The response to parse.
+        :type response: Union[str, List[dict]]
+        :return: List of tool calls extracted from the response.
+        :rtype: List[dict]
+        """
+        tool_calls = []
+        tool_results_map = {}
+        if isinstance(response, list):
+            for message in response:
+                # Extract tool calls from assistant messages
+                if message.get("role") == "assistant" and isinstance(message.get("content"), list):
+                    for content_item in message.get("content"):
+                        if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
+                            tool_calls.append(content_item)
+
+                # Extract tool results from tool messages
+                elif message.get("role") == "tool" and message.get("tool_call_id"):
+                    tool_call_id = message.get("tool_call_id")
+                    if isinstance(message.get("content"), list) and len(message.get("content")) > 0:
+                        result_content = message.get("content")[0]
+                        if isinstance(result_content, dict) and result_content.get("type") == "tool_result":
+                            tool_results_map[tool_call_id] = result_content
+
+        # Attach results to their corresponding calls
+        for tool_call in tool_calls:
+            tool_call_id = tool_call.get("tool_call_id")
+            if tool_call_id in tool_results_map:
+                tool_call["tool_result"] = tool_results_map[tool_call_id]["tool_result"]
+
+        return tool_calls
+
     async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
         """The asynchronous call where real end-to-end evaluation logic is performed.
 
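Note on the new _parse_tools_from_response helper above: it expects agent-style messages in which assistant turns carry "tool_call" content items and tool turns carry a matching "tool_result" keyed by tool_call_id. A minimal illustrative payload follows (the tool name and values are invented for illustration, not taken from the diff):

# Hypothetical message list in the shape the helper above consumes.
response = [
    {
        "role": "assistant",
        "content": [
            {
                "type": "tool_call",
                "tool_call_id": "call_1",
                "name": "fetch_weather",           # invented tool name
                "arguments": {"city": "Seattle"},   # invented arguments
            }
        ],
    },
    {
        "role": "tool",
        "tool_call_id": "call_1",
        "content": [{"type": "tool_result", "tool_result": "72F and sunny"}],
    },
]
# For this input the helper would return the single tool_call dict, now carrying
# "tool_result": "72F and sunny" copied from the matching tool message.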
azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

@@ -5,7 +5,7 @@
 import math
 import re
 import os
-from typing import Dict, TypeVar, Union
+from typing import Dict, Optional, TypeVar, Union
 
 if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
     from promptflow.core._flow import AsyncPrompty
@@ -13,6 +13,7 @@ else:
     from azure.ai.evaluation._legacy.prompty import AsyncPrompty
 from typing_extensions import override
 
+from azure.core.credentials import TokenCredential
 from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
 from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
@@ -63,6 +64,7 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         model_config: dict,
         eval_last_turn: bool = False,
         threshold: int = 3,
+        credential: Optional[TokenCredential] = None,
         _higher_is_better: bool = False,
         **kwargs,
     ) -> None:
@@ -82,7 +84,10 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         )
 
         self._flow = AsyncPrompty.load(
-            source=self._prompty_file, model=prompty_model_config, is_reasoning_model=self._is_reasoning_model
+            source=self._prompty_file,
+            model=prompty_model_config,
+            token_credential=credential,
+            is_reasoning_model=self._is_reasoning_model,
         )
 
     # __call__ not overridden here because child classes have such varied signatures that there's no point
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

@@ -36,14 +36,17 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
        aggregated. Per-turn results are still be available in the output via the "evaluation_per_turn" key
        when this occurs. Default is False, resulting full conversation evaluation and aggregation.
    :type eval_last_turn: bool
-    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
-        to produce a single result.
+    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation to produce a single result.
        Default is ~azure.ai.evaluation._AggregationType.MEAN.
    :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
    :param threshold: The threshold for the evaluation. Default is 3.
    :type threshold: Optional[int]
    :param _higher_is_better: If True, higher scores are better. Default is True.
    :type _higher_is_better: Optional[bool]
+    :param evaluate_query: If True, the query will be included in the evaluation data when evaluating
+        query-response pairs. If False, only the response will be evaluated. Default is False.
+        Can be passed as a keyword argument.
+    :type evaluate_query: bool
    """
 
    @override
@@ -56,6 +59,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
         threshold: int = 3,
         _higher_is_better: Optional[bool] = False,
+        **kwargs,
     ):
         super().__init__(
             eval_last_turn=eval_last_turn,
@@ -67,6 +71,9 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
         self._credential = credential
         self._threshold = threshold
+
+        # Handle evaluate_query parameter from kwargs
+        self._evaluate_query = kwargs.get("evaluate_query", False)
         self._higher_is_better = _higher_is_better
 
     @override
@@ -101,7 +108,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         :return: The evaluation result.
         :rtype: Dict
         """
-        if "query" in eval_input and "response" in eval_input:
+        if "response" in eval_input:
             return await self._evaluate_query_response(eval_input)
 
         conversation = eval_input.get("conversation", None)
@@ -131,19 +138,22 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
             return result
 
     async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
-        query = str(eval_input.get("query", None))
-        response = str(eval_input.get("response", None))
-        if query is None or response is None:
+        query = eval_input.get("query", None)
+        response = eval_input.get("response", None)
+        if response is None:
             raise EvaluationException(
                 message="Not implemented",
                 internal_message=(
-                    "Reached query/response evaluation without supplying query or response."
+                    "Reached query/response evaluation without supplying response."
                     + " This should have failed earlier."
                 ),
             )
-        input_data = {"query": query, "response": response}
+        input_data = {"response": str(response)}
+
+        if query is not None and self._evaluate_query:
+            input_data["query"] = str(query)
 
-        if "context" in self._singleton_inputs:
+        if "context" in self._get_all_singleton_inputs():
            context = eval_input.get("context", None)
            if context is None:
                raise EvaluationException(
azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

@@ -33,6 +33,8 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
    :type self_harm_threshold: int
    :param hate_unfairness_threshold: The threshold for hate/unfairness evaluation. Default is 3.
    :type hate_unfairness_threshold: int
+    :param evaluate_query: Whether to also evaluate the query in addition to the response. Default is False.
+    :type evaluate_query: bool
    :param kwargs: Additional arguments to pass to the evaluator.
    :type kwargs: Any
    :return: A function that evaluates content-safety metrics for "question-answering" scenario.
@@ -66,8 +68,9 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
            :caption: Initialize with threshold and call a ContentSafetyEvaluator with a query and response.
    """
 
-    id = "content_safety"
+    id = "azureai://built-in/evaluators/content_safety"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
    def __init__(
        self,
@@ -90,11 +93,18 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
            if not isinstance(value, int):
                raise TypeError(f"{name} must be an int, got {type(value)}")
 
+        # Extract evaluate_query from kwargs if present
+        evaluate_query_kwargs = {}
+        if "evaluate_query" in kwargs:
+            evaluate_query_kwargs["evaluate_query"] = kwargs["evaluate_query"]
+
        evaluators = [
-            ViolenceEvaluator(credential, azure_ai_project, threshold=violence_threshold),
-            SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold),
-            SelfHarmEvaluator(credential, azure_ai_project, threshold=self_harm_threshold),
-            HateUnfairnessEvaluator(credential, azure_ai_project, threshold=hate_unfairness_threshold),
+            ViolenceEvaluator(credential, azure_ai_project, threshold=violence_threshold, **evaluate_query_kwargs),
+            SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold, **evaluate_query_kwargs),
+            SelfHarmEvaluator(credential, azure_ai_project, threshold=self_harm_threshold, **evaluate_query_kwargs),
+            HateUnfairnessEvaluator(
+                credential, azure_ai_project, threshold=hate_unfairness_threshold, **evaluate_query_kwargs
+            ),
        ]
        super().__init__(evaluators=evaluators, **kwargs)
 
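The hunks above thread a new evaluate_query keyword from ContentSafetyEvaluator down into the four harm-specific evaluators. A minimal usage sketch, assuming a Foundry project endpoint and Entra ID auth (the endpoint string and sample texts are placeholders, not taken from the diff):

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ContentSafetyEvaluator

# Placeholder project endpoint, in the format the docstrings above describe.
azure_ai_project = "https://<resource_name>.services.ai.azure.com/api/projects/<project_name>"

safety_eval = ContentSafetyEvaluator(
    DefaultAzureCredential(),
    azure_ai_project,
    violence_threshold=3,
    evaluate_query=True,  # forwarded via **kwargs to the Violence/Sexual/SelfHarm/HateUnfairness evaluators
)
result = safety_eval(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
)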
azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py

@@ -80,8 +80,9 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
            :caption: Initialize with threshold and call a HateUnfairnessEvaluator with a query and response.
    """
 
-    id = "azureml://registries/azureml/models/Hate-and-Unfairness-Evaluator/versions/4"
+    id = "azureai://built-in/evaluators/hate_unfairness"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
    @override
    def __init__(
@@ -90,6 +91,7 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
        azure_ai_project,
        *,
        threshold: int = 3,
+        **kwargs,
    ):
        super().__init__(
            eval_metric=EvaluationMetrics.HATE_FAIRNESS,
@@ -98,6 +100,7 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
            conversation_aggregation_type=_AggregationType.MAX,
            threshold=threshold,
            _higher_is_better=False,
+            **kwargs,
        )
 
    @overload
azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py

@@ -65,8 +65,9 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
        https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
    """
 
-    id = "azureml://registries/azureml/models/Self-Harm-Related-Content-Evaluator/versions/3"
+    id = "azureai://built-in/evaluators/self_harm"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
    @override
    def __init__(
@@ -75,6 +76,7 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
        azure_ai_project,
        *,
        threshold: int = 3,
+        **kwargs,
    ):
        super().__init__(
            eval_metric=EvaluationMetrics.SELF_HARM,
@@ -83,6 +85,7 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
            conversation_aggregation_type=_AggregationType.MAX,
            threshold=threshold,
            _higher_is_better=False,
+            **kwargs,
        )
 
    @overload
azure/ai/evaluation/_evaluators/_content_safety/_sexual.py

@@ -76,8 +76,9 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
            :caption: Initialize with threshold and call a SexualEvaluator.
    """
 
-    id = "azureml://registries/azureml/models/Sexual-Content-Evaluator/versions/3"
+    id = "azureai://built-in/evaluators/sexual"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
    @override
    def __init__(
@@ -86,6 +87,7 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
        azure_ai_project,
        *,
        threshold: int = 3,
+        **kwargs,
    ):
        super().__init__(
            eval_metric=EvaluationMetrics.SEXUAL,
@@ -94,6 +96,7 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
            conversation_aggregation_type=_AggregationType.MAX,
            threshold=threshold,
            _higher_is_better=False,
+            **kwargs,
        )
 
    @overload
@@ -146,7 +149,7 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
            key "messages". Conversation turns are expected
            to be dictionaries with keys "content" and "role".
        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
-        :return: The fluency score.
+        :return: The sexual score.
        :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
        """
        return super().__call__(*args, **kwargs)
azure/ai/evaluation/_evaluators/_content_safety/_violence.py

@@ -76,8 +76,9 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
            :caption: Initialize with threshold and call a ViolenceEvaluator.
    """
 
-    id = "azureml://registries/azureml/models/Violent-Content-Evaluator/versions/3"
+    id = "azureai://built-in/evaluators/violence"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
    @override
    def __init__(
@@ -86,6 +87,7 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
        azure_ai_project,
        *,
        threshold: int = 3,
+        **kwargs,
    ):
        super().__init__(
            eval_metric=EvaluationMetrics.VIOLENCE,
@@ -94,6 +96,7 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
            conversation_aggregation_type=_AggregationType.MAX,
            threshold=threshold,
            _higher_is_better=False,
+            **kwargs,
        )
 
    @overload
azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py

@@ -49,6 +49,9 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
            :caption: Initialize with threshold and call a DocumentRetrievalEvaluator.
    """
 
+    id = "azureai://built-in/evaluators/document_retrieval"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
    def __init__(
        self,
        *,
azure/ai/evaluation/_evaluators/_eci/_eci.py

@@ -52,17 +52,20 @@ class ECIEvaluator(RaiServiceEvaluatorBase):
 
    id = "eci"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
    @override
    def __init__(
        self,
        credential,
        azure_ai_project,
+        **kwargs,
    ):
        super().__init__(
            eval_metric=_InternalEvaluationMetrics.ECI,
            azure_ai_project=azure_ai_project,
            credential=credential,
+            **kwargs,
        )
 
    @overload
azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py

@@ -58,7 +58,7 @@ class F1ScoreEvaluator(EvaluatorBase):
            :caption: Initialize with threshold and call an F1ScoreEvaluator.
    """
 
-    id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3"
+    id = "azureai://built-in/evaluators/f1_score"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
    def __init__(self, *, threshold=0.5):
azure/ai/evaluation/_evaluators/_fluency/_fluency.py

@@ -64,11 +64,11 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
    _PROMPTY_FILE = "fluency.prompty"
    _RESULT_KEY = "fluency"
 
-    id = "azureml://registries/azureml/models/Fluency-Evaluator/versions/4"
+    id = "azureai://built-in/evaluators/fluency"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
    @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, credential=None, threshold=3):
        current_dir = os.path.dirname(__file__)
        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
        self._threshold = threshold
@@ -78,6 +78,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
            prompty_file=prompty_path,
            result_key=self._RESULT_KEY,
            threshold=threshold,
+            credential=credential,
            _higher_is_better=self._higher_is_better,
        )
 
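The fluency hunks above add a credential pass-through so prompty-based evaluators can reach the judge model with a token credential instead of an API key. A minimal sketch, assuming an Entra-ID-enabled Azure OpenAI deployment (endpoint and deployment names are placeholders, not from the diff):

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import FluencyEvaluator

model_config = {
    "azure_endpoint": "https://<your-openai-resource>.openai.azure.com",  # placeholder
    "azure_deployment": "<your-judge-deployment>",                        # placeholder
}

# credential is the new keyword surfaced in the hunk above; threshold keeps its default of 3.
fluency = FluencyEvaluator(model_config, credential=DefaultAzureCredential(), threshold=3)
print(fluency(response="The capital of France is Paris."))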
azure/ai/evaluation/_evaluators/_gleu/_gleu.py

@@ -55,7 +55,7 @@ class GleuScoreEvaluator(EvaluatorBase):
        https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
    """
 
-    id = "azureml://registries/azureml/models/Gleu-Score-Evaluator/versions/3"
+    id = "azureai://built-in/evaluators/gleu_score"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
    @override