azure-ai-evaluation 1.0.0b5__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +188 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +118 -0
- azure/ai/evaluation/_common/_experimental.py +4 -0
- azure/ai/evaluation/_common/math.py +62 -2
- azure/ai/evaluation/_common/rai_service.py +110 -50
- azure/ai/evaluation/_common/utils.py +50 -16
- azure/ai/evaluation/_constants.py +2 -0
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -0
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +13 -3
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +12 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +38 -43
- azure/ai/evaluation/_evaluate/_evaluate.py +62 -131
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +2 -1
- azure/ai/evaluation/_evaluate/_utils.py +72 -38
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +16 -17
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +60 -29
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +88 -6
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +16 -3
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +39 -10
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +58 -52
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +79 -34
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +73 -34
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +74 -33
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -34
- azure/ai/evaluation/_evaluators/_eci/_eci.py +28 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +57 -26
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +13 -15
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +68 -30
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +17 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +10 -8
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -2
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +6 -2
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +10 -6
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +6 -2
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +6 -2
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +6 -2
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +57 -34
- azure/ai/evaluation/_evaluators/_qa/_qa.py +25 -37
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +63 -29
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +76 -161
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +24 -25
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +65 -67
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +26 -20
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +74 -40
- azure/ai/evaluation/_exceptions.py +2 -0
- azure/ai/evaluation/_http_utils.py +6 -4
- azure/ai/evaluation/_model_configurations.py +65 -14
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +17 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +57 -47
- azure/ai/evaluation/simulator/_constants.py +11 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +128 -7
- azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +16 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +12 -1
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +3 -1
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +48 -4
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -0
- azure/ai/evaluation/simulator/_simulator.py +54 -45
- azure/ai/evaluation/simulator/_utils.py +25 -7
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/METADATA +240 -327
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/RECORD +71 -68
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/top_level.txt +0 -0
--- a/azure/ai/evaluation/_evaluators/_coherence/_coherence.py
+++ b/azure/ai/evaluation/_evaluators/_coherence/_coherence.py
@@ -2,70 +2,101 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import os
-from typing import
+from typing import Dict, Union, List
 
-from typing_extensions import override
+from typing_extensions import overload, override
 
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._model_configurations import Conversation
 
 
-class CoherenceEvaluator(PromptyEvaluatorBase):
+class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """
-
+    Evaluates coherence score for a given query and response or a multi-turn conversation, including reasoning.
+
+    The coherence measure assesses the ability of the language model to generate text that reads naturally,
+    flows smoothly, and resembles human-like language in its responses. Use it when assessing the readability
+    and user-friendliness of a model's generated responses in real-world applications.
 
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
 
-
-
-    .. code-block:: python
-
-        eval_fn = CoherenceEvaluator(model_config)
-        result = eval_fn(
-            query="What is the capital of Japan?",
-            response="The capital of Japan is Tokyo.")
+    .. admonition:: Example:
 
-
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START coherence_evaluator]
+            :end-before: [END coherence_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a CoherenceEvaluator with a query and response.
 
-    ..
+    .. note::
 
-
-
-
-        }
-
-    Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
-    To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
-    however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+        To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+        To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
+        however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
     """
 
     _PROMPTY_FILE = "coherence.prompty"
     _RESULT_KEY = "coherence"
 
+    id = "azureml://registries/azureml/models/Coherence-Evaluator/versions/4"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     @override
     def __init__(self, model_config):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
 
-    @
+    @overload
+    def __call__(
+        self,
+        *,
+        query: str,
+        response: str,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate coherence for given input of query, response
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :return: The coherence score.
+        :rtype: Dict[str, float]
+        """
+
+    @overload
     def __call__(
         self,
         *,
-
-
-        conversation
+        conversation: Conversation,
+    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+        """Evaluate coherence for a conversation
+
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The coherence score.
+        :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
         **kwargs,
     ):
         """Evaluate coherence. Accepts either a query and response for a single evaluation,
         or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
         turns, the evaluator will aggregate the results of each turn.
 
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
         :keyword response: The response to be evaluated.
         :paramtype response: Optional[str]
-        :keyword context: The context to be evaluated.
-        :paramtype context: Optional[str]
         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
            key "messages". Conversation turns are expected
            to be dictionaries with keys "content" and "role".
@@ -73,4 +104,4 @@ class CoherenceEvaluator(PromptyEvaluatorBase):
         :return: The relevance score.
        :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
         """
-        return super().__call__(
+        return super().__call__(*args, **kwargs)
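Because the inline usage example was dropped from the docstring in favor of a literalinclude, here is a hedged sketch of how the two call shapes above are exercised. The endpoint, key, and deployment values are placeholders, and the exact AzureOpenAIModelConfiguration fields are assumed from the package's public surface rather than taken from this diff.

# Hypothetical usage sketch for the overloads above; configuration values are placeholders.
from azure.ai.evaluation import AzureOpenAIModelConfiguration, CoherenceEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    api_key="<api-key>",
    azure_deployment="<deployment-name>",
)

coherence = CoherenceEvaluator(model_config)

# Overload 1: single query/response pair. Expected keys include "coherence"
# and the legacy "gpt_coherence" key mentioned in the note above.
result = coherence(
    query="What is the capital of Japan?",
    response="The capital of Japan is Tokyo.",
)

# Overload 2: a multi-turn conversation; per-turn scores are aggregated.
conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of Japan?"},
        {"role": "assistant", "content": "The capital of Japan is Tokyo."},
    ]
}
conversation_result = coherence(conversation=conversation)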
--- a/azure/ai/evaluation/_evaluators/_common/_base_eval.py
+++ b/azure/ai/evaluation/_evaluators/_common/_base_eval.py
@@ -7,11 +7,12 @@ from abc import ABC, abstractmethod
 from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final
 
 from promptflow._utils.async_utils import async_run_allowing_running_loop
-from typing_extensions import ParamSpec, TypeAlias
+from typing_extensions import ParamSpec, TypeAlias, get_overloads
 
 from azure.ai.evaluation._common.math import list_mean
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._common.utils import remove_optional_singletons
+from azure.ai.evaluation._model_configurations import Conversation
 
 P = ParamSpec("P")
 T = TypeVar("T")
@@ -88,7 +89,11 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     # This needs to be overridden just to change the function header into something more informative,
     # and to be able to add a more specific docstring. The actual function contents should just be
     # super().__call__(<inputs>)
-    def __call__(
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
         """Evaluate a given input. This method serves as a wrapper and is meant to be overridden by child classes for
         one main reason - to overwrite the method headers and docstring to include additional inputs as needed.
         The actual behavior of this function shouldn't change beyond adding more inputs to the
@@ -127,11 +132,19 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         :rtype: List[str]
         """
 
+        overloads = get_overloads(self.__call__)
+        if not overloads:
+            call_signatures = [inspect.signature(self.__call__)]
+        else:
+            call_signatures = [inspect.signature(overload) for overload in overloads]
         call_signature = inspect.signature(self.__call__)
         singletons = []
-        for
-
-
+        for call_signature in call_signatures:
+            params = call_signature.parameters
+            if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs):
+                continue
+            # exclude self since it is not a singleton input
+            singletons.extend([p for p in params if p != "self"])
         return singletons
 
     def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]:
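To illustrate the overload-driven signature inspection added above, here is a minimal, self-contained sketch (not package code) of how typing_extensions.get_overloads and inspect.signature combine to collect singleton keyword names. ExampleEvaluator and singleton_inputs are made-up names used only for this illustration.

# Minimal sketch: derive keyword parameter names from the registered __call__ overloads.
import inspect
from typing import Dict

from typing_extensions import get_overloads, overload


class ExampleEvaluator:
    @overload
    def __call__(self, *, query: str, response: str) -> Dict[str, float]: ...

    @overload
    def __call__(self, *, conversation: dict) -> Dict[str, float]: ...

    def __call__(self, *args, **kwargs):
        return {}

    def singleton_inputs(self):
        # Inspect every registered overload; fall back to the implementation if none exist.
        overloads = get_overloads(self.__call__)
        signatures = [inspect.signature(o) for o in overloads] or [inspect.signature(self.__call__)]
        names = []
        for sig in signatures:
            params = sig.parameters
            if "conversation" in params:  # skip the conversation-style (non-singleton) overload
                continue
            names.extend(p for p in params if p != "self")
        return names


print(ExampleEvaluator().singleton_inputs())  # ['query', 'response']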
@@ -190,6 +203,59 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
 
         return converter
 
+    def _derive_multi_modal_conversation_converter(self) -> Callable[[Dict], List[Dict[str, Any]]]:
+        """Produce the function that will be used to convert multi-modal conversations to a list of evaluable inputs.
+        This uses the inputs derived from the _derive_singleton_inputs function to determine which
+        aspects of a conversation ought to be extracted.
+
+        :return: The function that will be used to convert conversations to evaluable inputs.
+        :rtype: Callable
+        """
+
+        def multi_modal_converter(conversation: Dict) -> List[Dict[str, Any]]:
+            messages = cast(List[Dict[str, Any]], conversation["messages"])
+            # Extract user messages, assistant messages from conversation
+            user_messages: List[Dict[str, Any]] = []
+            assistant_messages: List[Dict[str, Any]] = []
+            system_messages: List[Dict[str, Any]] = []
+
+            # Convert conversation slice into queries and responses.
+            # Assume that 'user' role is asking queries and 'assistant' role is responding.
+            if self._eval_last_turn and len(messages) > 1:
+                messages = messages[-2:]
+
+            for each_turn in messages:
+                role = each_turn["role"]
+                if role == "user":
+                    user_messages.append(each_turn)
+                elif role == "assistant":
+                    assistant_messages.append(each_turn)
+                elif role == "system":
+                    system_messages.append(each_turn)
+
+            # validation
+            if len(user_messages) != len(assistant_messages):
+                raise EvaluationException(
+                    message="Mismatched number of user and assistant messages.",
+                    internal_message=("Mismatched number of user and assistant messages."),
+                )
+            if len(assistant_messages) > 1:
+                raise EvaluationException(
+                    message="Conversation can have only one assistant message.",
+                    internal_message=("Conversation can have only one assistant message."),
+                )
+            eval_conv_inputs = []
+            for user_msg, assist_msg in zip(user_messages, assistant_messages):
+                conv_messages = []
+                if len(system_messages) == 1:
+                    conv_messages.append(system_messages[0])
+                conv_messages.append(user_msg)
+                conv_messages.append(assist_msg)
+                eval_conv_inputs.append({"conversation": Conversation(messages=conv_messages)})
+            return eval_conv_inputs
+
+        return multi_modal_converter
+
     def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
         """Convert an arbitrary input into a list of inputs for evaluators.
         It is assumed that evaluators generally make use of their inputs in one of two ways.
@@ -198,7 +264,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         values.
 
         The self._singleton_inputs list assigned during initialization is used to find and extract
-        singleton keywords, and self.
+        singleton keywords, and self._allow_conversation_input is used to determine if a conversation
         is a valid input.
 
         If both conversations and singletons are allowed, the function will raise an exception if both
@@ -229,6 +295,8 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             )
         # Handle Conversation
         if conversation is not None:
+            if self._is_multi_modal_conversation(conversation):
+                return self._derive_multi_modal_conversation_converter()(conversation)
             return self._derive_conversation_converter()(conversation)
         # Handle Singletons
         required_singletons = remove_optional_singletons(self, singletons)
@@ -243,6 +311,20 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
                 target=ErrorTarget.CONVERSATION,
             )
 
+    def _is_multi_modal_conversation(self, conversation: Dict) -> bool:
+        if "messages" not in conversation:
+            return False
+        messages = conversation["messages"]
+        if not isinstance(messages, list):
+            return False
+        for message in messages:
+            if "content" in message:
+                content = message.get("content", "")
+                if isinstance(content, list):
+                    if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content):
+                        return True
+        return False
+
     def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]:
         """Aggregate the evaluation results of each conversation turn into a single result.
 
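For reference, a conversation that the new _is_multi_modal_conversation check classifies as multi-modal, and that multi_modal_converter splits into one evaluable input per user/assistant pair, looks roughly like the sketch below. The URL and message text are placeholders.

# Illustrative input only; any message whose "content" is a list containing an
# {"type": "image_url", "image_url": {"url": ...}} item marks the conversation as multi-modal.
conversation = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is shown in this image?"},
                {"type": "image_url", "image_url": {"url": "https://example.com/photo.png"}},
            ],
        },
        {"role": "assistant", "content": "The image shows a red apple on a table."},
    ]
}
# multi_modal_converter pairs each user message with the matching assistant message
# (plus the single system message, if present) and yields [{"conversation": {...}}].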
--- a/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
+++ b/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
@@ -4,12 +4,13 @@
 
 import math
 import re
-from typing import Dict, Union
+from typing import Dict, TypeVar, Union
 
 from promptflow.core import AsyncPrompty
 from typing_extensions import override
 
 from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score
 from . import EvaluatorBase
 
@@ -18,8 +19,10 @@ try:
 except ImportError:
     USER_AGENT = "None"
 
+T = TypeVar("T")
 
-
+
+class PromptyEvaluatorBase(EvaluatorBase[T]):
     """Base class for all evaluators that make use of context as an input. It's also assumed that such evaluators
     make use of a prompty file, and return their results as a dictionary, with a single key-value pair
     linking the result name to a float value (unless multi-turn evaluation occurs, in which case the
@@ -45,10 +48,12 @@ class PromptyEvaluatorBase(EvaluatorBase[float]):
         self._prompty_file = prompty_file
         super().__init__(eval_last_turn=eval_last_turn)
 
+        subclass_name = self.__class__.__name__
+        user_agent = f"{USER_AGENT} (type=evaluator subtype={subclass_name})"
         prompty_model_config = construct_prompty_model_config(
             validate_model_config(model_config),
             self._DEFAULT_OPEN_API_VERSION,
-
+            user_agent,
         )
 
         self._flow = AsyncPrompty.load(source=prompty_file, model=prompty_model_config)
@@ -67,6 +72,14 @@ class PromptyEvaluatorBase(EvaluatorBase[float]):
         :return: The evaluation result.
         :rtype: Dict
         """
+        if "query" not in eval_input and "response" not in eval_input:
+            raise EvaluationException(
+                message="Only text conversation inputs are supported.",
+                internal_message="Only text conversation inputs are supported.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.CONVERSATION,
+            )
         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
 
         score = math.nan
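One consequence of the guard above: prompty-based evaluators only accept text inputs, so a multi-modal conversation routed to one of them arrives without "query" or "response" and is rejected. A rough standalone equivalent of the check (not package code, which raises EvaluationException rather than ValueError):

def ensure_text_only(eval_input: dict) -> None:
    # Mirrors the added check: text evaluators need at least a query or a response.
    if "query" not in eval_input and "response" not in eval_input:
        raise ValueError("Only text conversation inputs are supported.")

ensure_text_only({"query": "What is 2 + 2?", "response": "4"})  # passes
# ensure_text_only({"conversation": {...}})  # would raise for image-bearing conversations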
--- a/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
+++ b/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
@@ -1,7 +1,7 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Dict,
+from typing import Dict, TypeVar, Union
 
 from typing_extensions import override
 
@@ -11,14 +11,15 @@ from azure.ai.evaluation._common.constants import (
     Tasks,
     _InternalAnnotationTasks,
 )
-from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_multimodal
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import EvaluationException
+from azure.ai.evaluation._common.utils import validate_conversation
 from azure.core.credentials import TokenCredential
 
 from . import EvaluatorBase
 
-T =
+T = TypeVar("T")
 
 
 class RaiServiceEvaluatorBase(EvaluatorBase[T]):
@@ -50,12 +51,9 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         self._credential = credential
 
     @override
-    def __call__(
+    def __call__(  # pylint: disable=docstring-missing-param
         self,
-
-        query: Optional[str] = None,
-        response: Optional[str] = None,
-        conversation=None,
+        *args,
         **kwargs,
     ):
         """Evaluate either a query and response or a conversation. Must supply either a query AND response,
@@ -71,7 +69,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
         :rtype: Union[Dict[str, T], Dict[str, Union[float, Dict[str, List[T]]]]]
         """
-        return super().__call__(
+        return super().__call__(*args, **kwargs)
 
     @override
     async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
@@ -84,6 +82,36 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         :return: The evaluation result.
         :rtype: Dict
         """
+        if "query" in eval_input and "response" in eval_input:
+            return await self._evaluate_query_response(eval_input)
+
+        conversation = eval_input.get("conversation", None)
+        return await self._evaluate_conversation(conversation)
+
+    async def _evaluate_conversation(self, conversation: Dict) -> Dict[str, T]:
+        """
+        Evaluates content according to this evaluator's metric.
+        :keyword conversation: The conversation contains list of messages to be evaluated.
+            Each message should have "role" and "content" keys.
+
+        :param conversation: The conversation to evaluate.
+        :type conversation: ~azure.ai.evaluation.Conversation
+        :return: The evaluation score computation based on the Content Safety metric (self.metric).
+        :rtype: Dict[str, Union[float, str]]
+        """
+        # validate inputs
+        validate_conversation(conversation)
+        messages = conversation["messages"]
+        # Run score computation based on supplied metric.
+        result = await evaluate_with_rai_service_multimodal(
+            messages=messages,
+            metric_name=self._eval_metric,
+            project_scope=self._azure_ai_project,
+            credential=self._credential,
+        )
+        return result
+
+    async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
         query = eval_input.get("query", None)
         response = eval_input.get("response", None)
         if query is None or response is None:
@@ -108,12 +136,13 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
             )
             input_data["context"] = context
 
-        return await evaluate_with_rai_service(
+        return await evaluate_with_rai_service(  # type: ignore
            metric_name=self._eval_metric,
            data=input_data,
            project_scope=self._azure_ai_project,
            credential=self._credential,
            annotation_task=self._get_task(),
+           evaluator_name=self.__class__.__name__,
         )
 
     def _get_task(self):
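With _evaluate_conversation in place, service-backed safety evaluators can score image-bearing conversations as well as plain text pairs. A hedged usage sketch follows; the project values, credential, and image URL are placeholders, and only the constructor shape shown elsewhere in this diff is assumed.

# Hypothetical usage sketch; azure_ai_project values and the image URL are placeholders.
from azure.ai.evaluation import ViolenceEvaluator
from azure.identity import DefaultAzureCredential

azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

violence = ViolenceEvaluator(DefaultAzureCredential(), azure_ai_project)

# Text pair: routed to _evaluate_query_response -> evaluate_with_rai_service.
text_result = violence(query="Describe the painting.", response="It shows a calm seaside town.")

# Multi-modal conversation: routed to _evaluate_conversation -> evaluate_with_rai_service_multimodal.
conversation = {
    "messages": [
        {"role": "user", "content": [
            {"type": "text", "text": "What is in this image?"},
            {"type": "image_url", "image_url": {"url": "https://example.com/photo.png"}},
        ]},
        {"role": "assistant", "content": "The image shows a red apple on a table."},
    ]
}
image_result = violence(conversation=conversation)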
--- a/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py
+++ b/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py
@@ -2,13 +2,14 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from concurrent.futures import as_completed
-from typing import Callable, Dict, List,
+from typing import Callable, Dict, List, Union
 
 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
-from typing_extensions import override
+from typing_extensions import overload, override
 
-from azure.ai.evaluation._common._experimental import experimental
 from azure.ai.evaluation._evaluators._common import EvaluatorBase
+from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._common._experimental import experimental
 
 from ._hate_unfairness import HateUnfairnessEvaluator
 from ._self_harm import SelfHarmEvaluator
@@ -17,61 +18,36 @@ from ._violence import ViolenceEvaluator
 
 
 @experimental
-class ContentSafetyEvaluator(EvaluatorBase):
+class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
     """
-    Initialize a content safety evaluator configured to evaluate content
+    Initialize a content safety evaluator configured to evaluate content safety metrics for QA scenario.
 
     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param eval_last_turn: Whether to evaluate the last turn of a conversation. Default is False.
-    :type eval_last_turn: bool
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
     :return: A function that evaluates content-safety metrics for "question-answering" scenario.
-
-
-
-
-
-
-
-
-
-            "project_name": "<project_name>",
-        }
-        eval_fn = ContentSafetyEvaluator(azure_ai_project)
-        result = eval_fn(
-            query="What is the capital of France?",
-            response="Paris.",
-        )
-
-    **Output format**
-
-    .. code-block:: python
-
-        {
-            "violence": "Medium",
-            "violence_score": 5.0,
-            "violence_reason": "Some reason",
-            "sexual": "Medium",
-            "sexual_score": 5.0,
-            "sexual_reason": "Some reason",
-            "self_harm": "Medium",
-            "self_harm_score": 5.0,
-            "self_harm_reason": "Some reason",
-            "hate_unfairness": "Medium",
-            "hate_unfairness_score": 5.0,
-            "hate_unfairness_reason": "Some reason"
-        }
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START content_safety_evaluator]
+            :end-before: [END content_safety_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a ContentSafetyEvaluator.
     """
 
+    id = "content_safety"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     # TODO address 3579092 to re-enabled parallel evals.
-    def __init__(self, credential, azure_ai_project,
-        super().__init__(
-        self._parallel = kwargs.pop("
+    def __init__(self, credential, azure_ai_project, **kwargs):
+        super().__init__()
+        self._parallel = kwargs.pop("_parallel", True)
         self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
             ViolenceEvaluator(credential, azure_ai_project),
             SexualEvaluator(credential, azure_ai_project),
@@ -79,13 +55,43 @@ class ContentSafetyEvaluator(EvaluatorBase):
             HateUnfairnessEvaluator(credential, azure_ai_project),
         ]
 
-    @
+    @overload
+    def __call__(
+        self,
+        *,
+        query: str,
+        response: str,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate a collection of content safety metrics for the given query/response pair
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :return: The content safety scores.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
+    @overload
     def __call__(
         self,
         *,
-
-
-        conversation
+        conversation: Conversation,
+    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+        """Evaluate a collection of content safety metrics for a conversation
+
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The content safety scores.
+        :rtype: Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
         **kwargs,
     ):
         """Evaluate a collection of content safety metrics for the given query/response pair or conversation.
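Since the old inline example and output-format block were removed from this docstring, here is a hedged sketch of the composite call. The credential and project values are placeholders; the output keys listed in the comment are the ones that appeared in the removed block above.

# Hypothetical usage sketch; credential and project values are placeholders.
from azure.ai.evaluation import ContentSafetyEvaluator
from azure.identity import DefaultAzureCredential

azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

safety = ContentSafetyEvaluator(DefaultAzureCredential(), azure_ai_project)
result = safety(query="What is the capital of France?", response="Paris.")

# Per the removed output-format block, the result carries one triple per metric:
#   violence, violence_score, violence_reason,
#   sexual, sexual_score, sexual_reason,
#   self_harm, self_harm_score, self_harm_reason,
#   hate_unfairness, hate_unfairness_score, hate_unfairness_reason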
@@ -100,9 +106,9 @@ class ContentSafetyEvaluator(EvaluatorBase):
             to be dictionaries with keys "content", "role", and possibly "context".
         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
         :return: The evaluation result.
-        :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[
+        :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]]
         """
-        return super().__call__(
+        return super().__call__(*args, **kwargs)
 
     @override
     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
@@ -124,7 +130,7 @@ class ContentSafetyEvaluator(EvaluatorBase):
         with ThreadPoolExecutor() as executor:
             # pylint: disable=no-value-for-parameter
             futures = {
-                executor.submit(query=query, response=response, conversation=conversation): evaluator
+                executor.submit(evaluator, query=query, response=response, conversation=conversation): evaluator
                 for evaluator in self._evaluators
            }
 
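The last hunk fixes a genuine bug: ThreadPoolExecutor.submit takes the callable as its first positional argument, so the old call never passed a function at all. A minimal standalone illustration of the corrected pattern (not package code; it uses the stdlib executor and a made-up fake_evaluator rather than promptflow's ThreadPoolExecutorWithContext):

# concurrent.futures semantics the fix relies on: submit(fn, *args, **kwargs).
from concurrent.futures import ThreadPoolExecutor, as_completed

def fake_evaluator(*, query, response, conversation=None):
    # Stand-in for a per-metric evaluator; returns a dummy score.
    return {"metric": float(len(response))}

evaluators = [fake_evaluator]

with ThreadPoolExecutor() as executor:
    # Old code did executor.submit(query=..., response=...), which raises TypeError
    # because no callable is supplied; the evaluator must come first.
    futures = {
        executor.submit(evaluator, query="hi", response="hello", conversation=None): evaluator
        for evaluator in evaluators
    }
    for future in as_completed(futures):
        print(futures[future].__name__, future.result())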