azure-ai-evaluation 1.9.0__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +46 -12
- azure/ai/evaluation/_aoai/python_grader.py +84 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +1 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
- azure/ai/evaluation/_common/rai_service.py +3 -3
- azure/ai/evaluation/_common/utils.py +74 -17
- azure/ai/evaluation/_converters/_ai_services.py +60 -10
- azure/ai/evaluation/_converters/_models.py +75 -26
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +70 -22
- azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +163 -44
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +79 -33
- azure/ai/evaluation/_evaluate/_utils.py +5 -2
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +8 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +143 -25
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +19 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -1
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -1
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +5 -2
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -1
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +3 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +1 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +114 -4
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +9 -3
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -1
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +8 -1
- azure/ai/evaluation/_evaluators/_qa/_qa.py +1 -1
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +56 -3
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +11 -3
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +3 -2
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +2 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -2
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +24 -12
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +214 -187
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +126 -31
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +8 -1
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -1
- azure/ai/evaluation/_exceptions.py +1 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +115 -30
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +28 -31
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +2 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +4 -3
- azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
- azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
- azure/ai/evaluation/red_team/_red_team.py +655 -2665
- azure/ai/evaluation/red_team/_red_team_result.py +6 -0
- azure/ai/evaluation/red_team/_result_processor.py +610 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +11 -4
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
- azure/ai/evaluation/red_team/_utils/constants.py +0 -2
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
- azure/ai/evaluation/simulator/_adversarial_simulator.py +14 -2
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +13 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +21 -7
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +24 -5
- azure/ai/evaluation/simulator/_simulator.py +12 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/METADATA +63 -4
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/RECORD +85 -76
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/WHEEL +1 -1
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info/licenses}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/top_level.txt +0 -0
```diff
@@ -4,14 +4,34 @@

 import inspect
 from abc import ABC, abstractmethod
-from typing import
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generic,
+    List,
+    TypedDict,
+    TypeVar,
+    Union,
+    cast,
+    final,
+    Optional,
+)

 from azure.ai.evaluation._legacy._adapters.utils import async_run_allowing_running_loop
 from typing_extensions import ParamSpec, TypeAlias, get_overloads

-from azure.ai.evaluation._exceptions import
+from azure.ai.evaluation._exceptions import (
+    ErrorBlame,
+    ErrorCategory,
+    ErrorTarget,
+    EvaluationException,
+)
 from azure.ai.evaluation._common.utils import remove_optional_singletons
-from azure.ai.evaluation._constants import
+from azure.ai.evaluation._constants import (
+    _AggregationType,
+    EVALUATION_PASS_FAIL_MAPPING,
+)
 from azure.ai.evaluation._model_configurations import Conversation
 from azure.ai.evaluation._common._experimental import experimental

@@ -150,15 +170,15 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):

     # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~

-    def _derive_singleton_inputs(self) -> List[str]:
+    def _derive_singleton_inputs(self) -> List[List[str]]:
         """Inspect the evaluator's __call__ function to determine what singleton inputs are expected
         when the evaluator is being used in a non-conversation context.
         By default, it's assumed that any input that is NOT kwargs or a conversation are singleton inputs.
         Thankfully this works the way you'd hope, with the call_signature being based on the child
         function's signature, not the parent's.

-        :return: A list of
-        :rtype: List[str]
+        :return: A list of lists, where each inner list represents the singleton inputs for each overload.
+        :rtype: List[List[str]]
         """

         overloads = get_overloads(self.__call__)
@@ -166,17 +186,70 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             call_signatures = [inspect.signature(self.__call__)]
         else:
             call_signatures = [inspect.signature(overload) for overload in overloads]
-
-
+
+        overload_inputs = []
         for call_signature in call_signatures:
             params = call_signature.parameters
             if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs):
                 continue
             # exclude self since it is not a singleton input
-
-            return
+            overload_inputs.append([p for p in params if p != "self"])
+        return overload_inputs
+
+    def _get_matching_overload_inputs(self, **kwargs) -> List[str]:
+        """Find the overload that matches the provided kwargs and return its input parameters.
+
+        :keyword kwargs: The keyword arguments to match against overloads.
+        :type kwargs: Dict
+        :return: List of input parameter names for the matching overload.
+        :rtype: List[str]
+        """
+        overload_inputs = self._singleton_inputs
+        provided_keys = set(key for key, value in kwargs.items() if value is not None)
+
+        # Find the overload that best matches the provided parameters
+        best_match = None
+        best_score = -1
+
+        for inputs in overload_inputs:
+            input_set = set(inputs)
+
+            # Calculate match score: how many of the overload's params are provided
+            if input_set.issubset(provided_keys):
+                score = len(input_set)
+                if score > best_score:
+                    best_score = score
+                    best_match = inputs
+
+        # If exact match found, return it
+        if best_match is not None:
+            return best_match
+
+        # If no exact match, find the overload with the most overlap
+        for inputs in overload_inputs:
+            input_set = set(inputs)
+            overlap = len(input_set.intersection(provided_keys))
+            if overlap > best_score:
+                best_score = overlap
+                best_match = inputs
+
+        # Return the best match or the first overload as fallback
+        return best_match if best_match is not None else (overload_inputs[0] if overload_inputs else [])
+
+    def _get_all_singleton_inputs(self) -> List[str]:
+        """Get a flattened list of all possible singleton inputs across all overloads.
+
+        :return: Flattened list of all singleton input names.
+        :rtype: List[str]
+        """
+        all_inputs = set()
+        for inputs in self._singleton_inputs:
+            all_inputs.update(inputs)
+        return list(all_inputs)

-    def _derive_conversation_converter(
+    def _derive_conversation_converter(
+        self,
+    ) -> Callable[[Dict], List[DerivedEvalInput]]:
         """Produce the function that will be used to convert conversations to a list of evaluable inputs.
         This uses the inputs derived from the _derive_singleton_inputs function to determine which
         aspects of a conversation ought to be extracted.
```
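To make the new overload-matching behaviour easier to follow, here is a small standalone sketch of the same heuristic used by `_get_matching_overload_inputs` above. The `overload_inputs` data is made up for illustration; only the scoring logic mirrors the diff.

```python
# Standalone sketch of the overload-matching heuristic (hypothetical data).
# Each inner list mimics the singleton inputs derived from one __call__ overload.
overload_inputs = [
    ["query", "response"],
    ["query", "response", "context"],
    ["response", "ground_truth"],
]

def match_overload(provided: dict) -> list:
    """Pick the overload whose parameters best match the non-None kwargs."""
    provided_keys = {k for k, v in provided.items() if v is not None}
    best_match, best_score = None, -1
    # Prefer overloads fully covered by the provided keys, largest first.
    for inputs in overload_inputs:
        if set(inputs).issubset(provided_keys) and len(inputs) > best_score:
            best_match, best_score = inputs, len(inputs)
    if best_match is not None:
        return best_match
    # Otherwise fall back to the overload with the most overlap.
    for inputs in overload_inputs:
        overlap = len(set(inputs) & provided_keys)
        if overlap > best_score:
            best_match, best_score = inputs, overlap
    return best_match or (overload_inputs[0] if overload_inputs else [])

print(match_overload({"query": "q", "response": "r", "context": "c"}))
# ['query', 'response', 'context']  (the largest fully satisfied overload wins)
print(match_overload({"response": "r"}))
# ['query', 'response']  (no full match; the overload with the most overlap wins)
```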
```diff
@@ -184,10 +257,11 @@
         :return: The function that will be used to convert conversations to evaluable inputs.
         :rtype: Callable
         """
-
-
-
-
+        all_singleton_inputs = self._get_all_singleton_inputs()
+        include_context = "context" in all_singleton_inputs
+        include_query = "query" in all_singleton_inputs
+        include_response = "response" in all_singleton_inputs
+        include_ground_truth = "ground_truth" in all_singleton_inputs

         def converter(conversation: Dict) -> List[DerivedEvalInput]:
             messages = cast(List[Dict[str, Any]], conversation["messages"])
@@ -235,7 +309,9 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):

         return converter

-    def _derive_multi_modal_conversation_converter(
+    def _derive_multi_modal_conversation_converter(
+        self,
+    ) -> Callable[[Dict], List[Dict[str, Any]]]:
         """Produce the function that will be used to convert multi-modal conversations to a list of evaluable inputs.
         This uses the inputs derived from the _derive_singleton_inputs function to determine which
         aspects of a conversation ought to be extracted.
@@ -288,16 +364,16 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):

         return multi_modal_converter

-    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
+    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput], Dict[str, Any]]:
         """Convert an arbitrary input into a list of inputs for evaluators.
         It is assumed that evaluators generally make use of their inputs in one of two ways.
         Either they receive a collection of keyname inputs that are all single values
         (like a query and response), or they receive conversation that iss a list of dictionary
         values.

-        The self._singleton_inputs list assigned during initialization
-
-
+        The self._singleton_inputs list (containing overload signatures) assigned during initialization
+        is used to find and extract singleton keywords, and determine which overload matches the
+        provided arguments.

         If both conversations and singletons are allowed, the function will raise an exception if both
         are inputted.
@@ -315,7 +391,10 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         conversation = kwargs.get("conversation", None)
         singletons = {}
         if len(self._singleton_inputs) > 0:
-
+            # Get all possible singleton inputs and check what's provided
+            all_singleton_inputs = self._get_all_singleton_inputs()
+            singletons = {key: kwargs.get(key, None) for key in all_singleton_inputs}
+
         # Check that both conversation and other inputs aren't set
         if conversation is not None and any(singletons.values()):
             msg = f"{type(self).__name__}: Cannot provide both 'conversation' and individual inputs at the same time."
@@ -330,10 +409,16 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             if self._is_multi_modal_conversation(conversation):
                 return self._derive_multi_modal_conversation_converter()(conversation)
             return self._derive_conversation_converter()(conversation)
-
-
-
-
+
+        # Handle Singletons - find matching overload
+        matching_inputs = self._get_matching_overload_inputs(**kwargs)
+        if matching_inputs:
+            # Check if all required inputs for this overload are provided
+            required_singletons = {key: kwargs.get(key, None) for key in matching_inputs}
+            required_singletons = remove_optional_singletons(self, required_singletons)
+            if all(value is not None for value in required_singletons.values()):
+                return [singletons]
+
         # Missing input
         msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided."
         raise EvaluationException(
@@ -392,6 +477,39 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             aggregated["evaluation_per_turn"] = evaluation_per_turn
         return aggregated

+    def _parse_tools_from_response(self, response):
+        """Parse the response to extract tool calls and results.
+        :param response: The response to parse.
+        :type response: Union[str, List[dict]]
+        :return: List of tool calls extracted from the response.
+        :rtype: List[dict]
+        """
+        tool_calls = []
+        tool_results_map = {}
+        if isinstance(response, list):
+            for message in response:
+                # Extract tool calls from assistant messages
+                if message.get("role") == "assistant" and isinstance(message.get("content"), list):
+                    for content_item in message.get("content"):
+                        if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
+                            tool_calls.append(content_item)
+
+                # Extract tool results from tool messages
+                elif message.get("role") == "tool" and message.get("tool_call_id"):
+                    tool_call_id = message.get("tool_call_id")
+                    if isinstance(message.get("content"), list) and len(message.get("content")) > 0:
+                        result_content = message.get("content")[0]
+                        if isinstance(result_content, dict) and result_content.get("type") == "tool_result":
+                            tool_results_map[tool_call_id] = result_content
+
+        # Attach results to their corresponding calls
+        for tool_call in tool_calls:
+            tool_call_id = tool_call.get("tool_call_id")
+            if tool_call_id in tool_results_map:
+                tool_call["tool_result"] = tool_results_map[tool_call_id]["tool_result"]
+
+        return tool_calls
+
     async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
         """The asynchronous call where real end-to-end evaluation logic is performed.

```
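The new `_parse_tools_from_response` helper walks an agent-style message list. Below is a minimal illustration of the shape it consumes and what it returns; the message content is invented for the example.

```python
# Hypothetical agent response in the message format the helper above iterates over.
response = [
    {
        "role": "assistant",
        "content": [
            {"type": "tool_call", "tool_call_id": "call_1",
             "name": "fetch_weather", "arguments": {"city": "Seattle"}},
        ],
    },
    {
        "role": "tool",
        "tool_call_id": "call_1",
        "content": [{"type": "tool_result", "tool_result": "72F and sunny"}],
    },
]

# An evaluator's _parse_tools_from_response(response) would return the tool_call items,
# with each call's matching tool_result attached under the "tool_result" key:
# [{"type": "tool_call", "tool_call_id": "call_1", "name": "fetch_weather",
#   "arguments": {"city": "Seattle"}, "tool_result": "72F and sunny"}]
```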
```diff
@@ -5,7 +5,7 @@
 import math
 import re
 import os
-from typing import Dict, TypeVar, Union
+from typing import Dict, Optional, TypeVar, Union

 if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
     from promptflow.core._flow import AsyncPrompty
@@ -13,6 +13,7 @@ else:
     from azure.ai.evaluation._legacy.prompty import AsyncPrompty
 from typing_extensions import override

+from azure.core.credentials import TokenCredential
 from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
 from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
@@ -63,6 +64,7 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         model_config: dict,
         eval_last_turn: bool = False,
         threshold: int = 3,
+        credential: Optional[TokenCredential] = None,
         _higher_is_better: bool = False,
         **kwargs,
     ) -> None:
@@ -82,7 +84,10 @@
         )

         self._flow = AsyncPrompty.load(
-            source=self._prompty_file,
+            source=self._prompty_file,
+            model=prompty_model_config,
+            token_credential=credential,
+            is_reasoning_model=self._is_reasoning_model,
         )

     # __call__ not overridden here because child classes have such varied signatures that there's no point
@@ -36,14 +36,17 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         aggregated. Per-turn results are still be available in the output via the "evaluation_per_turn" key
         when this occurs. Default is False, resulting full conversation evaluation and aggregation.
    :type eval_last_turn: bool
-    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
-        to produce a single result.
+    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation to produce a single result.
        Default is ~azure.ai.evaluation._AggregationType.MEAN.
    :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
    :param threshold: The threshold for the evaluation. Default is 3.
    :type threshold: Optional[int]
    :param _higher_is_better: If True, higher scores are better. Default is True.
    :type _higher_is_better: Optional[bool]
+    :param evaluate_query: If True, the query will be included in the evaluation data when evaluating
+        query-response pairs. If False, only the response will be evaluated. Default is False.
+        Can be passed as a keyword argument.
+    :type evaluate_query: bool
    """

    @override
@@ -56,6 +59,7 @@
         conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
         threshold: int = 3,
         _higher_is_better: Optional[bool] = False,
+        **kwargs,
     ):
         super().__init__(
             eval_last_turn=eval_last_turn,
@@ -67,6 +71,9 @@
         self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
         self._credential = credential
         self._threshold = threshold
+
+        # Handle evaluate_query parameter from kwargs
+        self._evaluate_query = kwargs.get("evaluate_query", False)
         self._higher_is_better = _higher_is_better

     @override
@@ -101,7 +108,7 @@
         :return: The evaluation result.
         :rtype: Dict
         """
-        if "
+        if "response" in eval_input:
             return await self._evaluate_query_response(eval_input)

         conversation = eval_input.get("conversation", None)
@@ -131,19 +138,22 @@
         return result

     async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
-        query =
-        response =
-        if
+        query = eval_input.get("query", None)
+        response = eval_input.get("response", None)
+        if response is None:
             raise EvaluationException(
                 message="Not implemented",
                 internal_message=(
-                    "Reached query/response evaluation without supplying
+                    "Reached query/response evaluation without supplying response."
                     + " This should have failed earlier."
                 ),
             )
-        input_data = {"
+        input_data = {"response": str(response)}
+
+        if query is not None and self._evaluate_query:
+            input_data["query"] = str(query)

-        if "context" in self.
+        if "context" in self._get_all_singleton_inputs():
             context = eval_input.get("context", None)
             if context is None:
                 raise EvaluationException(
```
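The effect of the new `evaluate_query` flag on the payload built by `_evaluate_query_response` can be shown with a reduced sketch. Only the relevant lines of the hunk above are mirrored; the function name is illustrative and omits the exception and context handling.

```python
# Minimal sketch of how the hunk above assembles input_data (illustrative only).
def build_input_data(eval_input: dict, evaluate_query: bool) -> dict:
    query = eval_input.get("query")
    response = eval_input.get("response")
    input_data = {"response": str(response)}
    # The query is only forwarded when the evaluator was built with evaluate_query=True.
    if query is not None and evaluate_query:
        input_data["query"] = str(query)
    return input_data

sample = {"query": "How do I reset my password?", "response": "Use the account portal."}
print(build_input_data(sample, evaluate_query=False))
# {'response': 'Use the account portal.'}
print(build_input_data(sample, evaluate_query=True))
# {'response': 'Use the account portal.', 'query': 'How do I reset my password?'}
```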
```diff
@@ -33,6 +33,8 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
    :type self_harm_threshold: int
    :param hate_unfairness_threshold: The threshold for hate/unfairness evaluation. Default is 3.
    :type hate_unfairness_threshold: int
+    :param evaluate_query: Whether to also evaluate the query in addition to the response. Default is False.
+    :type evaluate_query: bool
    :param kwargs: Additional arguments to pass to the evaluator.
    :type kwargs: Any
    :return: A function that evaluates content-safety metrics for "question-answering" scenario.
@@ -66,8 +68,9 @@
            :caption: Initialize with threshold and call a ContentSafetyEvaluator with a query and response.
    """

-    id = "content_safety"
+    id = "azureai://built-in/evaluators/content_safety"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

    def __init__(
        self,
@@ -90,11 +93,18 @@
             if not isinstance(value, int):
                 raise TypeError(f"{name} must be an int, got {type(value)}")

+        # Extract evaluate_query from kwargs if present
+        evaluate_query_kwargs = {}
+        if "evaluate_query" in kwargs:
+            evaluate_query_kwargs["evaluate_query"] = kwargs["evaluate_query"]
+
         evaluators = [
-            ViolenceEvaluator(credential, azure_ai_project, threshold=violence_threshold),
-            SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold),
-            SelfHarmEvaluator(credential, azure_ai_project, threshold=self_harm_threshold),
-            HateUnfairnessEvaluator(
+            ViolenceEvaluator(credential, azure_ai_project, threshold=violence_threshold, **evaluate_query_kwargs),
+            SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold, **evaluate_query_kwargs),
+            SelfHarmEvaluator(credential, azure_ai_project, threshold=self_harm_threshold, **evaluate_query_kwargs),
+            HateUnfairnessEvaluator(
+                credential, azure_ai_project, threshold=hate_unfairness_threshold, **evaluate_query_kwargs
+            ),
         ]
         super().__init__(evaluators=evaluators, **kwargs)

```
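Taken together, `ContentSafetyEvaluator` now forwards `evaluate_query` to each child RAI evaluator. A hedged usage sketch based on the signatures visible in this diff; the project endpoint is a placeholder and `DefaultAzureCredential` is just one possible credential.

```python
# Usage sketch only: endpoint and question are placeholders, not taken from the package.
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ContentSafetyEvaluator

content_safety = ContentSafetyEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project="https://<resource>.services.ai.azure.com/api/projects/<project>",
    evaluate_query=True,  # new in this release: child evaluators also receive the query
)

result = content_safety(
    query="What is the capital of France?",
    response="Paris.",
)
print(result)
```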
```diff
@@ -80,8 +80,9 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
            :caption: Initialize with threshold and call a HateUnfairnessEvaluator with a query and response.
    """

-    id = "
+    id = "azureai://built-in/evaluators/hate_unfairness"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

    @override
    def __init__(
@@ -90,6 +91,7 @@
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.HATE_FAIRNESS,
@@ -98,6 +100,7 @@
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )

     @overload
@@ -65,8 +65,9 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
        https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
    """

-    id = "
+    id = "azureai://built-in/evaluators/self_harm"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

    @override
    def __init__(
@@ -75,6 +76,7 @@
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.SELF_HARM,
@@ -83,6 +85,7 @@
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )

     @overload
@@ -76,8 +76,9 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
            :caption: Initialize with threshold and call a SexualEvaluator.
    """

-    id = "
+    id = "azureai://built-in/evaluators/sexual"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

    @override
    def __init__(
@@ -86,6 +87,7 @@
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.SEXUAL,
@@ -94,6 +96,7 @@
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )

     @overload
@@ -146,7 +149,7 @@
             key "messages". Conversation turns are expected
             to be dictionaries with keys "content" and "role".
         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
-        :return: The
+        :return: The sexual score.
         :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
         """
         return super().__call__(*args, **kwargs)
@@ -76,8 +76,9 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
            :caption: Initialize with threshold and call a ViolenceEvaluator.
    """

-    id = "
+    id = "azureai://built-in/evaluators/violence"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

    @override
    def __init__(
@@ -86,6 +87,7 @@
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.VIOLENCE,
@@ -94,6 +96,7 @@
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )

     @overload
@@ -49,6 +49,9 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
            :caption: Initialize with threshold and call a DocumentRetrievalEvaluator.
    """

+    id = "azureai://built-in/evaluators/document_retrieval"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
    def __init__(
        self,
        *,
@@ -52,17 +52,20 @@ class ECIEvaluator(RaiServiceEvaluatorBase):

    id = "eci"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

    @override
    def __init__(
        self,
        credential,
        azure_ai_project,
+        **kwargs,
    ):
        super().__init__(
            eval_metric=_InternalEvaluationMetrics.ECI,
            azure_ai_project=azure_ai_project,
            credential=credential,
+            **kwargs,
        )

    @overload
@@ -58,7 +58,7 @@ class F1ScoreEvaluator(EvaluatorBase):
            :caption: Initialize with threshold and call an F1ScoreEvaluator.
    """

-    id = "
+    id = "azureai://built-in/evaluators/f1_score"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

    def __init__(self, *, threshold=0.5):
@@ -64,11 +64,11 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
    _PROMPTY_FILE = "fluency.prompty"
    _RESULT_KEY = "fluency"

-    id = "
+    id = "azureai://built-in/evaluators/fluency"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

    @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, credential=None, threshold=3):
        current_dir = os.path.dirname(__file__)
        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
        self._threshold = threshold
@@ -78,6 +78,7 @@
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )

```
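The prompty-based base class now accepts a `credential` that is forwarded to `AsyncPrompty.load`, and `FluencyEvaluator` exposes it in its constructor (see the hunks above). A brief usage sketch with placeholder model configuration values; the call shape is assumed from the evaluator's existing response-only signature.

```python
# Sketch only: model_config values are placeholders for an Azure OpenAI deployment.
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import FluencyEvaluator

model_config = {
    "azure_endpoint": "https://<aoai-resource>.openai.azure.com",
    "azure_deployment": "<deployment-name>",
    # No api_key here: the credential below is assumed to be used for token-based auth.
}

fluency = FluencyEvaluator(model_config=model_config, credential=DefaultAzureCredential())
print(fluency(response="Paris is the capital of France."))
```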
```diff
@@ -55,7 +55,7 @@ class GleuScoreEvaluator(EvaluatorBase):
        https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
    """

-    id = "
+    id = "azureai://built-in/evaluators/gleu_score"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

    @override
```