azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +204 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +118 -0
- azure/ai/evaluation/_common/rai_service.py +30 -21
- azure/ai/evaluation/_constants.py +19 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +2 -1
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +1 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +16 -43
- azure/ai/evaluation/_evaluate/_evaluate.py +76 -44
- azure/ai/evaluation/_evaluate/_utils.py +93 -34
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +46 -25
- azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +140 -5
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +61 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -1
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +40 -2
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +6 -43
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +2 -0
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +2 -0
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +2 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +2 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +61 -68
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +45 -23
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +55 -34
- azure/ai/evaluation/_evaluators/_qa/_qa.py +32 -27
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +44 -23
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +42 -82
- azure/ai/evaluation/_http_utils.py +6 -4
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +35 -16
- azure/ai/evaluation/simulator/_conversation/__init__.py +128 -7
- azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +40 -0
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -0
- azure/ai/evaluation/simulator/_simulator.py +24 -13
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.2.0.dist-info}/METADATA +84 -15
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.2.0.dist-info}/RECORD +47 -41
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.2.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.2.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.2.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_bleu/_bleu.py

@@ -1,30 +1,16 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Dict
 from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
-from promptflow._utils.async_utils import async_run_allowing_running_loop
+from typing_extensions import overload, override
 
 from azure.ai.evaluation._common.utils import nltk_tokenize
 
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
 
-class _AsyncBleuScoreEvaluator:
-    def __init__(self):
-        pass
-
-    async def __call__(self, *, response: str, ground_truth: str, **kwargs):
-        reference_tokens = nltk_tokenize(ground_truth)
-        hypothesis_tokens = nltk_tokenize(response)
-
-        # NIST Smoothing
-        smoothing_function = SmoothingFunction().method4
-        score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
-
-        return {
-            "bleu_score": score,
-        }
 
-
-class BleuScoreEvaluator:
+class BleuScoreEvaluator(EvaluatorBase):
     """
     Calculate the BLEU score for a given response and ground truth.
 
@@ -51,9 +37,32 @@ class BleuScoreEvaluator:
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     def __init__(self):
-        self._async_evaluator = _AsyncBleuScoreEvaluator()
+        super().__init__()
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a glue score evaluation result.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        reference_tokens = nltk_tokenize(ground_truth)
+        hypothesis_tokens = nltk_tokenize(response)
 
-    def __call__(self, *, response: str, ground_truth: str, **kwargs):
+        # NIST Smoothing
+        smoothing_function = SmoothingFunction().method4
+        score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
+
+        return {
+            "bleu_score": score,
+        }
+
+    @overload  # type: ignore
+    def __call__(self, *, response: str, ground_truth: str):
         """
         Evaluate the BLEU score between the response and the ground truth.
 
@@ -64,9 +73,21 @@ class BleuScoreEvaluator:
         :return: The BLEU score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
-        )
 
-
-
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate the BLEU score between the response and the ground truth.
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The BLEU score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
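For orientation (not part of the diff): the refactor above moves BleuScoreEvaluator onto the shared EvaluatorBase machinery, while the public call shape documented in the hunks stays the same, keyword-only response and ground_truth in, a dict with a "bleu_score" float out. A minimal usage sketch, with invented sample strings:

# Usage sketch for the refactored evaluator; sample inputs are invented.
from azure.ai.evaluation import BleuScoreEvaluator

bleu = BleuScoreEvaluator()
result = bleu(
    response="Tokyo is the capital of Japan.",
    ground_truth="The capital of Japan is Tokyo.",
)
print(result["bleu_score"])  # float score from sentence_bleu with NIST method4 smoothing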
azure/ai/evaluation/_evaluators/_common/__init__.py

@@ -5,9 +5,11 @@
 from ._base_eval import EvaluatorBase
 from ._base_prompty_eval import PromptyEvaluatorBase
 from ._base_rai_svc_eval import RaiServiceEvaluatorBase
+from ._base_multi_eval import MultiEvaluatorBase
 
 __all__ = [
     "EvaluatorBase",
     "PromptyEvaluatorBase",
     "RaiServiceEvaluatorBase",
+    "MultiEvaluatorBase",
 ]
azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -4,14 +4,18 @@
 
 import inspect
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final
+from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final, Optional
 
 from promptflow._utils.async_utils import async_run_allowing_running_loop
 from typing_extensions import ParamSpec, TypeAlias, get_overloads
 
-from azure.ai.evaluation._common.math import list_mean
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._common.utils import remove_optional_singletons
+from azure.ai.evaluation._constants import _AggregationType
+from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._common._experimental import experimental
+
+from ._conversation_aggregators import GetAggregator, GetAggregatorType
 
 P = ParamSpec("P")
 T = TypeVar("T")
@@ -24,6 +28,7 @@ class DerivedEvalInput(TypedDict, total=False):
     query: Dict[str, Any]
     response: Dict[str, Any]
     context: str
+    ground_truth: str
 
 
 AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]]
@@ -68,6 +73,13 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     :type not_singleton_inputs: List[str]
     :param eval_last_turn: If True, only the last turn of the conversation will be evaluated. Default is False.
     :type eval_last_turn: bool
+    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
+        to produce a single result.
+        Default is ~azure.ai.evaluation._AggregationType.MEAN.
+    :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
+    :param conversation_aggregator_override: A function that will be used to aggregate per-turn results. If provided,
+        overrides the standard aggregator implied by conversation_aggregation_type. None by default.
+    :type conversation_aggregator_override: Optional[Callable[[List[float]], float]]
     """
 
     # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~
@@ -79,11 +91,17 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         *,
         not_singleton_inputs: List[str] = ["conversation", "kwargs"],
         eval_last_turn: bool = False,
+        conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
+        conversation_aggregator_override: Optional[Callable[[List[float]], float]] = None,
     ):
         self._not_singleton_inputs = not_singleton_inputs
         self._eval_last_turn = eval_last_turn
         self._singleton_inputs = self._derive_singleton_inputs()
         self._async_evaluator = AsyncEvaluatorBase(self._real_call)
+        self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
+        if conversation_aggregator_override is not None:
+            # Type ignore since we already checked for None, but mypy doesn't know that.
+            self._conversation_aggregation_function = conversation_aggregator_override  # type: ignore[assignment]
 
         # This needs to be overridden just to change the function header into something more informative,
         # and to be able to add a more specific docstring. The actual function contents should just be
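The constructor changes above add two opt-in knobs for turning per-turn scores into a single conversation-level score. A hypothetical subclass sketch follows (the class below is not in the package; it mirrors the overload pattern used by BleuScoreEvaluator earlier in this diff) showing a child evaluator that asks for MIN aggregation:

# Hypothetical child evaluator using the new conversation_aggregation_type keyword.
from typing import Dict

from typing_extensions import overload, override

from azure.ai.evaluation._constants import _AggregationType
from azure.ai.evaluation._evaluators._common import EvaluatorBase


class ResponseLengthEvaluator(EvaluatorBase):
    """Illustrative only: scores response length per turn, keeps the minimum across turns."""

    def __init__(self):
        # Per-turn scores get reduced with min() instead of the default mean.
        super().__init__(conversation_aggregation_type=_AggregationType.MIN)

    @override
    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
        return {"response_length": float(len(eval_input["response"]))}

    @overload  # type: ignore
    def __call__(self, *, response: str): ...

    @override
    def __call__(self, *args, **kwargs):
        return super().__call__(*args, **kwargs)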
@@ -157,6 +175,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         include_context = "context" in self._singleton_inputs
         include_query = "query" in self._singleton_inputs
         include_response = "response" in self._singleton_inputs
+        include_ground_truth = "ground_truth" in self._singleton_inputs
 
         def converter(conversation: Dict) -> List[DerivedEvalInput]:
             messages = cast(List[Dict[str, Any]], conversation["messages"])
@@ -197,11 +216,66 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
                 eval_input["response"] = response.get("content", "")
                 if include_context:
                     eval_input["context"] = str(context)
+                if include_ground_truth:
+                    eval_input["ground_truth"] = response.get("ground_truth", "")
                 eval_inputs.append(eval_input)
             return eval_inputs
 
         return converter
 
+    def _derive_multi_modal_conversation_converter(self) -> Callable[[Dict], List[Dict[str, Any]]]:
+        """Produce the function that will be used to convert multi-modal conversations to a list of evaluable inputs.
+        This uses the inputs derived from the _derive_singleton_inputs function to determine which
+        aspects of a conversation ought to be extracted.
+
+        :return: The function that will be used to convert conversations to evaluable inputs.
+        :rtype: Callable
+        """
+
+        def multi_modal_converter(conversation: Dict) -> List[Dict[str, Any]]:
+            messages = cast(List[Dict[str, Any]], conversation["messages"])
+            # Extract user messages, assistant messages from conversation
+            user_messages: List[Dict[str, Any]] = []
+            assistant_messages: List[Dict[str, Any]] = []
+            system_messages: List[Dict[str, Any]] = []
+
+            # Convert conversation slice into queries and responses.
+            # Assume that 'user' role is asking queries and 'assistant' role is responding.
+            if self._eval_last_turn and len(messages) > 1:
+                messages = messages[-2:]
+
+            for each_turn in messages:
+                role = each_turn["role"]
+                if role == "user":
+                    user_messages.append(each_turn)
+                elif role == "assistant":
+                    assistant_messages.append(each_turn)
+                elif role == "system":
+                    system_messages.append(each_turn)
+
+            # validation
+            if len(user_messages) != len(assistant_messages):
+                raise EvaluationException(
+                    message="Mismatched number of user and assistant messages.",
+                    internal_message=("Mismatched number of user and assistant messages."),
+                )
+            if len(assistant_messages) > 1:
+                raise EvaluationException(
+                    message="Conversation can have only one assistant message.",
+                    internal_message=("Conversation can have only one assistant message."),
+                )
+            eval_conv_inputs = []
+            for user_msg, assist_msg in zip(user_messages, assistant_messages):
+                conv_messages = []
+                if len(system_messages) == 1:
+                    conv_messages.append(system_messages[0])
+                conv_messages.append(user_msg)
+                conv_messages.append(assist_msg)
+                eval_conv_inputs.append({"conversation": Conversation(messages=conv_messages)})
+            return eval_conv_inputs
+
+        return multi_modal_converter
+
     def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
         """Convert an arbitrary input into a list of inputs for evaluators.
         It is assumed that evaluators generally make use of their inputs in one of two ways.
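The converter above now also copies an optional ground_truth field off each assistant message when the evaluator declares a ground_truth input. A sketch of the conversation shape that code reads (keys from the diff, values invented):

# Conversation turn shape read by the converter above; values are invented.
conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of Japan?"},
        {
            "role": "assistant",
            "content": "The capital of Japan is Tokyo.",
            # Optional; picked up via response.get("ground_truth", "") when the
            # evaluator lists ground_truth among its singleton inputs.
            "ground_truth": "Tokyo",
        },
    ]
}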
@@ -210,7 +284,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         values.
 
         The self._singleton_inputs list assigned during initialization is used to find and extract
-        singleton keywords, and self.
+        singleton keywords, and self._allow_conversation_input is used to determine if a conversation
         is a valid input.
 
         If both conversations and singletons are allowed, the function will raise an exception if both
@@ -241,6 +315,8 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         )
         # Handle Conversation
         if conversation is not None:
+            if self._is_multi_modal_conversation(conversation):
+                return self._derive_multi_modal_conversation_converter()(conversation)
             return self._derive_conversation_converter()(conversation)
         # Handle Singletons
         required_singletons = remove_optional_singletons(self, singletons)
@@ -255,6 +331,20 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             target=ErrorTarget.CONVERSATION,
         )
 
+    def _is_multi_modal_conversation(self, conversation: Dict) -> bool:
+        if "messages" not in conversation:
+            return False
+        messages = conversation["messages"]
+        if not isinstance(messages, list):
+            return False
+        for message in messages:
+            if "content" in message:
+                content = message.get("content", "")
+                if isinstance(content, list):
+                    if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content):
+                        return True
+        return False
+
     def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]:
         """Aggregate the evaluation results of each conversation turn into a single result.
 
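_is_multi_modal_conversation (above) flags a conversation as multi-modal when any message's content is a list containing an image_url item with a "url" key. An example payload that would match (the URL is a placeholder):

# Payload the detection helper above would classify as multi-modal.
multi_modal_conversation = {
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is shown in this picture?"},
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
            ],
        },
        {"role": "assistant", "content": [{"type": "text", "text": "A cat sitting on a sofa."}]},
    ]
}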
@@ -285,7 +375,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         # Find and average all numeric values
         for metric, values in evaluation_per_turn.items():
             if all(isinstance(value, (int, float)) for value in values):
-                aggregated[metric] = list_mean(cast(List[Union[int, float]], values))
+                aggregated[metric] = self._conversation_aggregation_function(cast(List[Union[int, float]], values))
         # Slap the per-turn results back in.
         aggregated["evaluation_per_turn"] = evaluation_per_turn
         return aggregated
@@ -313,10 +403,51 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         # Otherwise, aggregate results.
         return self._aggregate_results(per_turn_results=per_turn_results)
 
+    # ~~~ METHODS THAT SHOULD NOT BE OVERRIDDEN BY CHILDREN~~~``
+
     @final
     def _to_async(self) -> "AsyncEvaluatorBase":
         return self._async_evaluator
 
+    @experimental
+    @final
+    def _set_conversation_aggregation_type(self, conversation_aggregation_type: _AggregationType) -> None:
+        """Input a conversation aggregation type to re-assign the aggregator function used by this evaluator for
+        multi-turn conversations. This aggregator is used to combine numeric outputs from each evaluation of a
+        multi-turn conversation into a single top-level result.
+
+        :param conversation_aggregation_type: The type of aggregation to perform on the per-turn
+            results of a conversation to produce a single result.
+        :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
+        """
+        self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
+
+    @experimental
+    @final
+    def _set_conversation_aggregator(self, aggregator: Callable[[List[float]], float]) -> None:
+        """Set the conversation aggregator function directly. This function will be applied to all numeric outputs
+        of an evaluator when it evaluates a conversation with multiple-turns thus ends up with multiple results per
+        evaluation that is needs to coalesce into a single result. Use when built-in aggregators do not
+        suit your needs, but use with caution.
+
+        :param aggregator: The function to use to aggregate per-turn results.
+        :type aggregator: Callable[[List[float]], float]
+        """
+        self._conversation_aggregation_function = aggregator
+
+    @experimental
+    @final
+    def _get_conversation_aggregator_type(self) -> _AggregationType:
+        """Get the current conversation aggregation type used by this evaluator. This refers to the
+        method used when a single input produces multiple evaluation results (ex: when a multi-turn conversation
+        is inputted into an evaluator that evaluates each turn individually). The individual inputs
+        are combined by the function implied here to produce a single overall result.
+
+        :return: The conversation aggregation type.
+        :rtype: ~azure.ai.evaluation._AggregationType
+        """
+        return GetAggregatorType(self._conversation_aggregation_function)
+
 
 class AsyncEvaluatorBase:
     """The asynchronous evaluator hidden underneath all evaluators. This makes generous use passing functions
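The experimental helpers above allow swapping the turn aggregator after construction. A sketch (these are private, underscore-prefixed methods, so treat this as illustrative rather than supported API):

# Illustrative use of the private aggregation hooks added above.
from azure.ai.evaluation import BleuScoreEvaluator
from azure.ai.evaluation._constants import _AggregationType

evaluator = BleuScoreEvaluator()

# Replace the default MEAN reduction with MAX across conversation turns.
evaluator._set_conversation_aggregation_type(_AggregationType.MAX)

# Or inject an arbitrary callable; the getter then reports CUSTOM.
evaluator._set_conversation_aggregator(lambda scores: sum(scores) / max(len(scores), 1))
print(evaluator._get_conversation_aggregator_type())  # _AggregationType.CUSTOM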
@@ -332,7 +463,9 @@ class AsyncEvaluatorBase:
     # are just not passed into this function instead of ending up in kwargs.
     # Since we want this to be relatively call-agnostic, we just account for every input that any children
     # are known to throw at this, mash them into kwargs, and then pass them into the real call.
-    async def __call__(self, *, query=None, response=None, context=None, conversation=None, **kwargs):
+    async def __call__(
+        self, *, query=None, response=None, context=None, conversation=None, ground_truth=None, **kwargs
+    ):
         if conversation is not None:
             kwargs["conversation"] = conversation
         if query is not None:
@@ -341,4 +474,6 @@
             kwargs["response"] = response
         if context is not None:
             kwargs["context"] = context
+        if ground_truth is not None:
+            kwargs["ground_truth"] = ground_truth
         return await self._real_call(**kwargs)
azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py (new file)

@@ -0,0 +1,61 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from concurrent.futures import as_completed
+from typing import TypeVar, Dict, List
+
+from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+from typing_extensions import override
+
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
+
+T = TypeVar("T")
+
+
+class MultiEvaluatorBase(EvaluatorBase[T]):
+    """
+    Base class for evaluators that contain and run multiple other evaluators to produce a
+    suite of metrics.
+
+    Child classes still need to implement the __call__ methods, but they shouldn't need a _do_eval.
+
+    :param evaluators: The list of evaluators to run when this evaluator is called.
+    :type evaluators: List[~azure.ai.evaluation._evaluators._common.EvaluatorBase]
+    :param kwargs: Additional arguments to pass to the evaluator.
+    :type kwargs: Any
+    :return: An evaluator that runs multiple other evaluators and combines their results.
+    """
+
+    def __init__(self, evaluators: List[EvaluatorBase[T]], **kwargs):
+        super().__init__()
+        self._parallel = kwargs.pop("_parallel", True)
+        self._evaluators = evaluators
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
+        """Run each evaluator, possibly in parallel, and combine the results into
+        a single large dictionary containing each evaluation. Inputs are passed
+        directly to each evaluator without additional processing.
+
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        results: Dict[str, T] = {}
+        if self._parallel:
+            with ThreadPoolExecutor() as executor:
+                # pylint: disable=no-value-for-parameter
+                futures = {executor.submit(evaluator, **eval_input): evaluator for evaluator in self._evaluators}
+
+                for future in as_completed(futures):
+                    results.update(future.result())
+        else:
+            for evaluator in self._evaluators:
+                result = evaluator(**eval_input)
+                # Ignore is to avoid mypy getting upset over the amount of duck-typing
+                # that's going on to shove evaluators around like this.
+                results.update(result)  # type: ignore[arg-type]
+
+        return results
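A hedged sketch of how a composite evaluator can sit on the new base class; the class below is invented for illustration (ContentSafetyEvaluator, later in this diff, is the in-package example of the same pattern):

# Hypothetical composite evaluator built on MultiEvaluatorBase; it fans out to
# two built-in NLP evaluators and merges their result dictionaries.
from typing_extensions import overload, override

from azure.ai.evaluation import BleuScoreEvaluator, GleuScoreEvaluator
from azure.ai.evaluation._evaluators._common import MultiEvaluatorBase


class TextOverlapEvaluator(MultiEvaluatorBase):
    def __init__(self, **kwargs):
        evaluators = [BleuScoreEvaluator(), GleuScoreEvaluator()]
        super().__init__(evaluators=evaluators, **kwargs)

    @overload  # type: ignore
    def __call__(self, *, response: str, ground_truth: str): ...

    @override
    def __call__(self, *args, **kwargs):
        # Expected result keys, given these children: "bleu_score" and "gleu_score".
        return super().__call__(*args, **kwargs)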
azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

@@ -10,6 +10,7 @@ from promptflow.core import AsyncPrompty
 from typing_extensions import override
 
 from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score
 from . import EvaluatorBase
 
@@ -47,10 +48,12 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         self._prompty_file = prompty_file
         super().__init__(eval_last_turn=eval_last_turn)
 
+        subclass_name = self.__class__.__name__
+        user_agent = f"{USER_AGENT} (type=evaluator subtype={subclass_name})"
         prompty_model_config = construct_prompty_model_config(
             validate_model_config(model_config),
             self._DEFAULT_OPEN_API_VERSION,
-            USER_AGENT,
+            user_agent,
         )
 
         self._flow = AsyncPrompty.load(source=prompty_file, model=prompty_model_config)
@@ -69,6 +72,14 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         :return: The evaluation result.
         :rtype: Dict
         """
+        if "query" not in eval_input and "response" not in eval_input:
+            raise EvaluationException(
+                message="Only text conversation inputs are supported.",
+                internal_message="Only text conversation inputs are supported.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.CONVERSATION,
+            )
         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
 
         score = math.nan
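Two behavior notes from the prompty-base hunks above: the prompty model config now carries a per-subclass user agent suffix, and prompt-based evaluators reject inputs that resolve to neither "query" nor "response" (for example, an image-only conversation turn). A restated version of that guard, with an invented helper name:

# Invented helper restating the guard added above; the real code raises
# EvaluationException(USER_ERROR / INVALID_VALUE / CONVERSATION) instead of returning a bool.
def has_text_inputs(eval_input: dict) -> bool:
    return "query" in eval_input or "response" in eval_input

assert has_text_inputs({"query": "What is 2 + 2?", "response": "4"})
assert not has_text_inputs({"conversation": {"messages": []}})  # would raise in the evaluator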
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

@@ -11,9 +11,11 @@ from azure.ai.evaluation._common.constants import (
     Tasks,
     _InternalAnnotationTasks,
 )
-from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_multimodal
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import EvaluationException
+from azure.ai.evaluation._common.utils import validate_conversation
+from azure.ai.evaluation._constants import _AggregationType
 from azure.core.credentials import TokenCredential
 
 from . import EvaluatorBase
@@ -34,6 +36,10 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         aggregated. Per-turn results are still be available in the output via the "evaluation_per_turn" key
         when this occurs. Default is False, resulting full conversation evaluation and aggregation.
     :type eval_last_turn: bool
+    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
+        to produce a single result.
+        Default is ~azure.ai.evaluation._AggregationType.MEAN.
+    :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
     """
 
     @override
@@ -43,8 +49,9 @@
         azure_ai_project: dict,
         credential: TokenCredential,
         eval_last_turn: bool = False,
+        conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
     ):
-        super().__init__(eval_last_turn=eval_last_turn)
+        super().__init__(eval_last_turn=eval_last_turn, conversation_aggregation_type=conversation_aggregation_type)
         self._eval_metric = eval_metric
         self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
         self._credential = credential
@@ -81,6 +88,36 @@
         :return: The evaluation result.
         :rtype: Dict
         """
+        if "query" in eval_input and "response" in eval_input:
+            return await self._evaluate_query_response(eval_input)
+
+        conversation = eval_input.get("conversation", None)
+        return await self._evaluate_conversation(conversation)
+
+    async def _evaluate_conversation(self, conversation: Dict) -> Dict[str, T]:
+        """
+        Evaluates content according to this evaluator's metric.
+        :keyword conversation: The conversation contains list of messages to be evaluated.
+            Each message should have "role" and "content" keys.
+
+        :param conversation: The conversation to evaluate.
+        :type conversation: ~azure.ai.evaluation.Conversation
+        :return: The evaluation score computation based on the Content Safety metric (self.metric).
+        :rtype: Dict[str, Union[float, str]]
+        """
+        # validate inputs
+        validate_conversation(conversation)
+        messages = conversation["messages"]
+        # Run score computation based on supplied metric.
+        result = await evaluate_with_rai_service_multimodal(
+            messages=messages,
+            metric_name=self._eval_metric,
+            project_scope=self._azure_ai_project,
+            credential=self._credential,
+        )
+        return result
+
+    async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
         query = eval_input.get("query", None)
         response = eval_input.get("response", None)
         if query is None or response is None:
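With the routing above, a RAI-service evaluator given a multi-modal conversation goes through evaluate_with_rai_service_multimodal rather than the per-turn query/response path. A hedged sketch using ViolenceEvaluator (one of the subclasses listed in this diff); project values and the image URL are placeholders:

# Hedged sketch: evaluating an image-bearing conversation after this change.
from azure.ai.evaluation import ViolenceEvaluator
from azure.identity import DefaultAzureCredential

azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}
evaluator = ViolenceEvaluator(credential=DefaultAzureCredential(), azure_ai_project=azure_ai_project)

conversation = {
    "messages": [
        {"role": "user", "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url", "image_url": {"url": "https://example.com/photo.png"}},
        ]},
        {"role": "assistant", "content": [{"type": "text", "text": "A person walking a dog."}]},
    ]
}
result = evaluator(conversation=conversation)  # routed to _evaluate_conversation above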
@@ -111,6 +148,7 @@
             project_scope=self._azure_ai_project,
             credential=self._credential,
             annotation_task=self._get_task(),
+            evaluator_name=self.__class__.__name__,
         )
 
     def _get_task(self):
azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py (new file)

@@ -0,0 +1,49 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from typing import Callable, List
+from azure.ai.evaluation._common.math import list_mean
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._constants import _AggregationType
+
+
+def GetAggregator(aggregation_type: _AggregationType) -> Callable[[List[float]], float]:
+    if aggregation_type == _AggregationType.SUM:
+        return sum
+    if aggregation_type == _AggregationType.MEAN:
+        return list_mean
+    if aggregation_type == _AggregationType.MAX:
+        return max
+    if aggregation_type == _AggregationType.MIN:
+        return min
+    if aggregation_type == _AggregationType.CUSTOM:
+        msg = (
+            "Cannot 'get' aggregator function associated with custom aggregation enum."
+            + " This enum value should only be outputted as an indicator of an injected"
+            + " aggregation function, not inputted directly"
+        )
+        raise EvaluationException(
+            message=msg,
+            blame=ErrorBlame.UNKNOWN,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.EVALUATE,
+        )
+    raise EvaluationException(
+        message=f"Unaccounted for aggregation type: {aggregation_type}",
+        blame=ErrorBlame.UNKNOWN,
+        category=ErrorCategory.INVALID_VALUE,
+        target=ErrorTarget.EVALUATE,
+    )
+
+
+def GetAggregatorType(aggregation_function: Callable) -> _AggregationType:
+    if aggregation_function == sum:  # pylint: disable=comparison-with-callable
+        return _AggregationType.SUM
+    if aggregation_function == list_mean:  # pylint: disable=comparison-with-callable
+        return _AggregationType.MEAN
+    if aggregation_function == max:  # pylint: disable=comparison-with-callable
+        return _AggregationType.MAX
+    if aggregation_function == min:  # pylint: disable=comparison-with-callable
+        return _AggregationType.MIN
+    return _AggregationType.CUSTOM
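GetAggregator and GetAggregatorType (above) are inverse lookups between the _AggregationType enum and a plain callable; anything that is not one of the built-ins maps back to CUSTOM. A short sketch:

# Round-trip behavior of the helpers defined above.
from azure.ai.evaluation._constants import _AggregationType
from azure.ai.evaluation._evaluators._common._conversation_aggregators import (
    GetAggregator,
    GetAggregatorType,
)

agg = GetAggregator(_AggregationType.MAX)
print(agg([0.2, 0.9, 0.5]))                 # 0.9
print(GetAggregatorType(agg))               # _AggregationType.MAX
print(GetAggregatorType(lambda xs: xs[0]))  # _AggregationType.CUSTOM
# GetAggregator(_AggregationType.CUSTOM) raises EvaluationException, per the code above.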
azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

@@ -1,13 +1,11 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from concurrent.futures import as_completed
-from typing import Callable, Dict, List, Union
+from typing import Dict, List, Union
 
-from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
 from typing_extensions import overload, override
 
-from azure.ai.evaluation._evaluators._common import EvaluatorBase
+from azure.ai.evaluation._evaluators._common import MultiEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
 from azure.ai.evaluation._common._experimental import experimental
 
@@ -18,9 +16,9 @@ from ._violence import ViolenceEvaluator
 
 
 @experimental
-class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
+class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
     """
-    Initialize a content safety evaluator configured to evaluate content
+    Initialize a content safety evaluator configured to evaluate content safety metrics for QA scenario.
 
     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
@@ -44,16 +42,14 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
     id = "content_safety"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-    # TODO address 3579092 to re-enabled parallel evals.
     def __init__(self, credential, azure_ai_project, **kwargs):
-
-        self._parallel = kwargs.pop("_parallel", False)
-        self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
+        evaluators = [
            ViolenceEvaluator(credential, azure_ai_project),
            SexualEvaluator(credential, azure_ai_project),
            SelfHarmEvaluator(credential, azure_ai_project),
            HateUnfairnessEvaluator(credential, azure_ai_project),
        ]
+        super().__init__(evaluators=evaluators, **kwargs)
 
     @overload
     def __call__(
@@ -109,36 +105,3 @@
         :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]]
         """
         return super().__call__(*args, **kwargs)
-
-    @override
-    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
-        """Perform the evaluation using the Azure AI RAI service.
-        The exact evaluation performed is determined by the evaluation metric supplied
-        by the child class initializer.
-
-        :param eval_input: The input to the evaluation function.
-        :type eval_input: Dict
-        :return: The evaluation result.
-        :rtype: Dict
-        """
-        query = eval_input.get("query", None)
-        response = eval_input.get("response", None)
-        conversation = eval_input.get("conversation", None)
-        results: Dict[str, Union[str, float]] = {}
-        # TODO fix this to not explode on empty optional inputs (PF SKD error)
-        if self._parallel:
-            with ThreadPoolExecutor() as executor:
-                # pylint: disable=no-value-for-parameter
-                futures = {
-                    executor.submit(query=query, response=response, conversation=conversation): evaluator
-                    for evaluator in self._evaluators
-                }
-
-                for future in as_completed(futures):
-                    results.update(future.result())
-        else:
-            for evaluator in self._evaluators:
-                result = evaluator(query=query, response=response, conversation=conversation)
-                results.update(result)
-
-        return results