azure-ai-evaluation 1.0.0b3__py3-none-any.whl → 1.0.0b5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +23 -1
- azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +20 -9
- azure/ai/evaluation/_common/constants.py +9 -2
- azure/ai/evaluation/_common/math.py +29 -0
- azure/ai/evaluation/_common/rai_service.py +222 -93
- azure/ai/evaluation/_common/utils.py +328 -19
- azure/ai/evaluation/_constants.py +16 -8
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +33 -17
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +14 -7
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +22 -4
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +35 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +47 -14
- azure/ai/evaluation/_evaluate/_evaluate.py +370 -188
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +15 -16
- azure/ai/evaluation/_evaluate/_utils.py +77 -25
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +16 -10
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +76 -46
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +26 -19
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +62 -25
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -36
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +67 -46
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +33 -4
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +33 -4
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +33 -4
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +33 -4
- azure/ai/evaluation/_evaluators/_eci/_eci.py +7 -5
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +22 -21
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +51 -16
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -7
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +130 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +57 -0
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +120 -0
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +96 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +46 -13
- azure/ai/evaluation/_evaluators/_qa/_qa.py +11 -6
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +23 -20
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +126 -80
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +2 -2
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +150 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +32 -15
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +36 -10
- azure/ai/evaluation/_exceptions.py +26 -6
- azure/ai/evaluation/_http_utils.py +203 -132
- azure/ai/evaluation/_model_configurations.py +23 -6
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +5 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +88 -60
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +98 -95
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
- azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -9
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
- azure/ai/evaluation/simulator/_simulator.py +222 -169
- azure/ai/evaluation/simulator/_tracing.py +4 -4
- azure/ai/evaluation/simulator/_utils.py +6 -6
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/METADATA +237 -52
- azure_ai_evaluation-1.0.0b5.dist-info/NOTICE.txt +70 -0
- azure_ai_evaluation-1.0.0b5.dist-info/RECORD +120 -0
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
- azure_ai_evaluation-1.0.0b3.dist-info/RECORD +0 -98
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_common/_base_eval.py

```diff
@@ -2,19 +2,56 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from typing import List, Dict, Callable, Any
 import inspect
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final
 
-from abc import ABC
-
-import numpy as np
 from promptflow._utils.async_utils import async_run_allowing_running_loop
+from typing_extensions import ParamSpec, TypeAlias
+
+from azure.ai.evaluation._common.math import list_mean
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._common.utils import remove_optional_singletons
+
+P = ParamSpec("P")
+T = TypeVar("T")
+T_EvalValue = TypeVar("T_EvalValue")
+
+
+class DerivedEvalInput(TypedDict, total=False):
+    """The eval input generated by EvaluatorBase._derive_conversation_starter."""
+
+    query: Dict[str, Any]
+    response: Dict[str, Any]
+    context: str
+
+
+AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]]
+"""TypeAlias that models the return value of EvaluatorBase._aggregate_results
+
+    .. code-block:: python
 
-
+        foo: AggregateResult[float] = {
+            "evaluation_per_turn": {
+                "coherence": [1.0, 2.0, 3.0]
+            },
+            "coherence": 2.0
+        }
+"""
+
+DoEvalResult: TypeAlias = Dict[str, T]
+"""TypeAlias that models the return value of EvaluatorBase._do_eval
+
+    .. code-block:: python
+
+        foo: DoEvalResult[float] = {
+            "coherence": 2.0
+        }
+"""
 
 
 # TODO exception target pass down?
-class EvaluatorBase(ABC):
+class EvaluatorBase(ABC, Generic[T_EvalValue]):
     """Base class for all evaluators that are capable of accepting either a group of single values,
     or conversation as input. All such evaluators need to implement two functions of their own:
     - _convert_conversation_to_eval_input
@@ -51,7 +88,7 @@ class EvaluatorBase(ABC):
     # This needs to be overridden just to change the function header into something more informative,
     # and to be able to add a more specific docstring. The actual function contents should just be
    # super().__call__(<inputs>)
-    def __call__(self, **kwargs) ->
+    def __call__(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
        """Evaluate a given input. This method serves as a wrapper and is meant to be overridden by child classes for
        one main reason - to overwrite the method headers and docstring to include additional inputs as needed.
        The actual behavior of this function shouldn't change beyond adding more inputs to the
@@ -60,13 +97,12 @@ class EvaluatorBase(ABC):
         :keyword kwargs: A dictionary that contains inputs needed to evaluate a conversation.
         :type kwargs: Dict
         :return: The evaluation result
-        :rtype:
+        :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
         """
         return async_run_allowing_running_loop(self._async_evaluator, **kwargs)
 
-
-
-    async def _do_eval(self, eval_input: Any) -> Dict:
+    @abstractmethod
+    async def _do_eval(self, eval_input: Any) -> DoEvalResult[T_EvalValue]:
         """Evaluate the input and produce a response. Must be overridden to produce a functional evaluator.
         In the default case, all required inputs are assumed to be within eval_input, as user-friendly
         typing is handled above this function in favor of polymorphic simplicity. This function must be
@@ -75,13 +111,8 @@ class EvaluatorBase(ABC):
         :param eval_input: Whatever inputs are needed for this evaluator to perform a single evaluation.
         :type eval_input: Any
         :return: A single evaluation result
-        :rtype:
-
+        :rtype: DoEvalResult[T_EvalValue]
         """
-        raise EvaluationException(
-            message="Not implemented",
-            internal_message="BaseConversationEval's _do_eval method called somehow. This should be overridden.",
-        )
 
     # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~
 
@@ -103,7 +134,7 @@ class EvaluatorBase(ABC):
                 singletons.append(param)
         return singletons
 
-    def _derive_conversation_converter(self) -> Callable:
+    def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]:
         """Produce the function that will be used to convert conversations to a list of evaluable inputs.
         This uses the inputs derived from the _derive_singleton_inputs function to determine which
         aspects of a conversation ought to be extracted.
@@ -115,12 +146,12 @@ class EvaluatorBase(ABC):
         include_query = "query" in self._singleton_inputs
         include_response = "response" in self._singleton_inputs
 
-        def converter(conversation: Dict) -> List:
-            messages = conversation["messages"]
+        def converter(conversation: Dict) -> List[DerivedEvalInput]:
+            messages = cast(List[Dict[str, Any]], conversation["messages"])
             global_context = conversation.get("context", None)
             # Extract queries, responses from conversation
-            queries = []
-            responses = []
+            queries: List[Dict[str, Any]] = []
+            responses: List[Dict[str, Any]] = []
 
             # Convert conversation slice into queries and responses.
             # Assume that 'user' role is asking queries and 'assistant' role is responding.
@@ -142,16 +173,16 @@ class EvaluatorBase(ABC):
                 response_context = response.get("context", None)
                 if global_context:
                     context["global_context"] = global_context
-                if query_context and
+                if query_context and include_query:
                     context["query_context"] = query_context
-                if response_context and
+                if response_context and include_response:
                     context["response_context"] = response_context
 
-                eval_input = {}
+                eval_input: DerivedEvalInput = {}
                 if include_query:
-                    eval_input["query"] = query
+                    eval_input["query"] = query.get("content", "")
                 if include_response:
-                    eval_input["response"] = response
+                    eval_input["response"] = response.get("content", "")
                 if include_context:
                     eval_input["context"] = str(context)
                 eval_inputs.append(eval_input)
@@ -159,7 +190,7 @@ class EvaluatorBase(ABC):
 
         return converter
 
-    def _convert_kwargs_to_eval_input(self, **kwargs) -> List:
+    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
         """Convert an arbitrary input into a list of inputs for evaluators.
         It is assumed that evaluators generally make use of their inputs in one of two ways.
         Either they receive a collection of keyname inputs that are all single values
@@ -189,9 +220,9 @@ class EvaluatorBase(ABC):
         singletons = {key: kwargs.get(key, None) for key in self._singleton_inputs}
         # Check that both conversation and other inputs aren't set
         if conversation is not None and any(singletons.values()):
+            msg = f"{type(self).__name__}: Cannot provide both 'conversation' and individual inputs at the same time."
             raise EvaluationException(
-                message=
-                internal_message=f"Both conversation and individual inputs were provided to {type(self).__name__}",
+                message=msg,
                 blame=ErrorBlame.USER_ERROR,
                 category=ErrorCategory.INVALID_VALUE,
                 target=ErrorTarget.CONVERSATION,
@@ -200,18 +231,19 @@ class EvaluatorBase(ABC):
         if conversation is not None:
             return self._derive_conversation_converter()(conversation)
         # Handle Singletons
-
-
+        required_singletons = remove_optional_singletons(self, singletons)
+        if all(value is not None for value in required_singletons.values()):
+            return [singletons]
         # Missing input
+        msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided."
         raise EvaluationException(
-            message=
-            internal_message=f"Neither conversation nor individual inputs provided to {type(self).__name__}.",
+            message=msg,
             blame=ErrorBlame.USER_ERROR,
             category=ErrorCategory.INVALID_VALUE,
             target=ErrorTarget.CONVERSATION,
         )
 
-    def _aggregate_results(self, per_turn_results: List[
+    def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]:
         """Aggregate the evaluation results of each conversation turn into a single result.
 
         Exact implementation might need to vary slightly depending on the results produced.
@@ -224,11 +256,11 @@ class EvaluatorBase(ABC):
             values (including non-numerics) located in under the "evaluation_per_turn" key,
             which each sub-key being a metric and each sub-value being a the list of that metric's
             per-turn values.
-        :rtype:
+        :rtype: AggregateResult[T_EvalValue]
         """
 
-        aggregated = {}
-        evaluation_per_turn = {}
+        aggregated: Dict[str, Union[float, Dict[str, List[T_EvalValue]]]] = {}
+        evaluation_per_turn: Dict[str, List[T_EvalValue]] = {}
 
         # Go over each turn, and rotate the results into a
         # metric: List[values] format for the evals_per_turn dictionary.
@@ -241,19 +273,18 @@ class EvaluatorBase(ABC):
         # Find and average all numeric values
         for metric, values in evaluation_per_turn.items():
             if all(isinstance(value, (int, float)) for value in values):
-                aggregated[metric] =
+                aggregated[metric] = list_mean(cast(List[Union[int, float]], values))
         # Slap the per-turn results back in.
         aggregated["evaluation_per_turn"] = evaluation_per_turn
-
         return aggregated
 
-    async def _real_call(self, **kwargs):
+    async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
         """The asynchronous call where real end-to-end evaluation logic is performed.
 
         :keyword kwargs: The inputs to evaluate.
         :type kwargs: Dict
         :return: The evaluation result.
-        :rtype:
+        :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
         """
         # Convert inputs into list of evaluable inputs.
         eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
@@ -270,9 +301,8 @@ class EvaluatorBase(ABC):
         # Otherwise, aggregate results.
         return self._aggregate_results(per_turn_results=per_turn_results)
 
-
-
-    def _to_async(self):
+    @final
+    def _to_async(self) -> "AsyncEvaluatorBase":
         return self._async_evaluator
 
 
@@ -286,7 +316,7 @@ class AsyncEvaluatorBase:
 
     # Don't look at my shame. Nothing to see here....
     # Oh, you're still here? Ok, the reason this has such a gross call signature and behavior is due
-    # to our broken async code not properly handling inputs; keyword arguments that aren't in the signature
+    # to our broken async code not properly handling inputs; keyword arguments that aren't in the signature
     # are just not passed into this function instead of ending up in kwargs.
    # Since we want this to be relatively call-agnostic, we just account for every input that any children
     # are known to throw at this, mash them into kwargs, and then pass them into the real call.
```
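The main change in `_base_eval.py` is that `EvaluatorBase` becomes generic over its result value type and drops numpy in favor of the new `list_mean` helper. The following standalone sketch is not the SDK's code; it only mirrors the names in the hunks above, and assumes Python 3.10+ so that `TypeAlias`/`TypedDict` come straight from `typing`:

```python
from typing import Dict, Generic, List, TypeAlias, TypeVar, Union

T = TypeVar("T")
T_EvalValue = TypeVar("T_EvalValue")

DoEvalResult: TypeAlias = Dict[str, T]
AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]]


def list_mean(nums: List[Union[int, float]]) -> float:
    # Plain-Python replacement for the numpy mean the old code relied on.
    return sum(nums) / len(nums)


class MiniEvaluatorBase(Generic[T_EvalValue]):
    """Toy stand-in for EvaluatorBase[T_EvalValue], showing the aggregation shape."""

    def _aggregate_results(
        self, per_turn_results: List[DoEvalResult[T_EvalValue]]
    ) -> AggregateResult[T_EvalValue]:
        # Rotate per-turn dicts into metric -> [values], then average numeric metrics.
        evaluation_per_turn: Dict[str, List[T_EvalValue]] = {}
        for turn in per_turn_results:
            for metric, value in turn.items():
                evaluation_per_turn.setdefault(metric, []).append(value)

        aggregated: AggregateResult[T_EvalValue] = {}
        for metric, values in evaluation_per_turn.items():
            if all(isinstance(v, (int, float)) for v in values):
                aggregated[metric] = list_mean(values)  # type: ignore[arg-type]
        aggregated["evaluation_per_turn"] = evaluation_per_turn
        return aggregated


turns: List[DoEvalResult[float]] = [{"coherence": 1.0}, {"coherence": 2.0}, {"coherence": 3.0}]
print(MiniEvaluatorBase[float]()._aggregate_results(turns))
```

Running it prints `{'coherence': 2.0, 'evaluation_per_turn': {'coherence': [1.0, 2.0, 3.0]}}`, the same shape the `AggregateResult` docstring example in the hunk describes.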
azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

```diff
@@ -2,26 +2,24 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
+import math
 import re
-from typing import Dict
-
-from typing_extensions import override
-
-
-import numpy as np
+from typing import Dict, Union
 
 from promptflow.core import AsyncPrompty
+from typing_extensions import override
 
-from
+from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
+from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score
+from . import EvaluatorBase
 
 try:
     from ..._user_agent import USER_AGENT
 except ImportError:
-    USER_AGENT = None
-from . import EvaluatorBase
+    USER_AGENT = "None"
 
 
-class PromptyEvaluatorBase(EvaluatorBase):
+class PromptyEvaluatorBase(EvaluatorBase[float]):
     """Base class for all evaluators that make use of context as an input. It's also assumed that such evaluators
     make use of a prompty file, and return their results as a dictionary, with a single key-value pair
     linking the result name to a float value (unless multi-turn evaluation occurs, in which case the
@@ -39,17 +37,17 @@ class PromptyEvaluatorBase(EvaluatorBase):
     :type ignore_queries: bool
     """
 
-
-
+    _LLM_CALL_TIMEOUT = 600
+    _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
 
-    def __init__(self, *, result_key: str, prompty_file: str, model_config:
+    def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False):
         self._result_key = result_key
         self._prompty_file = prompty_file
         super().__init__(eval_last_turn=eval_last_turn)
 
         prompty_model_config = construct_prompty_model_config(
-            model_config,
-            self.
+            validate_model_config(model_config),
+            self._DEFAULT_OPEN_API_VERSION,
             USER_AGENT,
         )
 
@@ -59,7 +57,7 @@ class PromptyEvaluatorBase(EvaluatorBase):
     # defining a default here.
 
     @override
-    async def _do_eval(self, eval_input: Dict) -> Dict:
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
         """Do a relevance evaluation.
 
         :param eval_input: The input to the evaluator. Expected to contain
@@ -69,11 +67,20 @@ class PromptyEvaluatorBase(EvaluatorBase):
         :return: The evaluation result.
         :rtype: Dict
         """
-        llm_output = await self._flow(timeout=self.
+        llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
 
-        score =
+        score = math.nan
         if llm_output:
+            # Parse out score and reason from evaluators known to possess them.
+            if self._result_key in PROMPT_BASED_REASON_EVALUATORS:
+                score, reason = parse_quality_evaluator_reason_score(llm_output)
+                return {
+                    self._result_key: float(score),
+                    f"gpt_{self._result_key}": float(score),
+                    f"{self._result_key}_reason": reason,
+                }
             match = re.search(r"\d", llm_output)
             if match:
                 score = float(match.group())
-
+            return {self._result_key: float(score), f"gpt_{self._result_key}": float(score)}
+        return {self._result_key: float(score), f"gpt_{self._result_key}": float(score)}
```
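The `_do_eval` hunk above establishes a three-way flow: reason-capable evaluators go through `parse_quality_evaluator_reason_score`, everything else falls back to grabbing the first digit from the LLM output, and a missing output yields `math.nan`. The sketch below imitates that flow with a hypothetical parser; the real helper's expected output format is not shown in this diff, so the `Reason:`/`Score:` layout and the membership of the stand-in constant are invented for illustration only:

```python
import math
import re
from typing import Dict, Tuple, Union

# Stand-in for PROMPT_BASED_REASON_EVALUATORS; the real constant lives in
# azure.ai.evaluation._common.constants and its membership may differ.
REASON_CAPABLE_RESULT_KEYS = {"coherence", "fluency", "relevance", "groundedness"}


def parse_reason_and_score(llm_output: str) -> Tuple[float, str]:
    # Hypothetical "Score: N" / "Reason: ..." layout; parse_quality_evaluator_reason_score
    # may expect a different format entirely.
    score_match = re.search(r"Score:\s*(\d+(?:\.\d+)?)", llm_output)
    reason_match = re.search(r"Reason:\s*(.+)", llm_output)
    score = float(score_match.group(1)) if score_match else math.nan
    reason = reason_match.group(1).strip() if reason_match else ""
    return score, reason


def interpret_llm_output(result_key: str, llm_output: str) -> Dict[str, Union[float, str]]:
    score: float = math.nan
    if llm_output:
        if result_key in REASON_CAPABLE_RESULT_KEYS:
            score, reason = parse_reason_and_score(llm_output)
            return {
                result_key: float(score),
                f"gpt_{result_key}": float(score),  # legacy gpt_-prefixed key kept alongside
                f"{result_key}_reason": reason,
            }
        match = re.search(r"\d", llm_output)  # fallback: first digit anywhere in the output
        if match:
            score = float(match.group())
    return {result_key: float(score), f"gpt_{result_key}": float(score)}


print(interpret_llm_output("coherence", "Score: 4\nReason: Clear and consistent."))
print(interpret_llm_output("similarity", "3 out of 5"))
```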
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

```diff
@@ -1,48 +1,53 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Dict, Optional, Union
 
-from typing import Dict, Optional
 from typing_extensions import override
 
-from azure.
-
+from azure.ai.evaluation._common.constants import (
+    EvaluationMetrics,
+    _InternalEvaluationMetrics,
+    Tasks,
+    _InternalAnnotationTasks,
+)
 from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import EvaluationException
+from azure.core.credentials import TokenCredential
+
 from . import EvaluatorBase
 
+T = Union[str, float]
+
 
-class RaiServiceEvaluatorBase(EvaluatorBase):
+class RaiServiceEvaluatorBase(EvaluatorBase[T]):
     """Base class for all evaluators that require the use of the Azure AI RAI service for evaluation.
     This includes content safety evaluators, protected material evaluators, and others. These evaluators
     are all assumed to be of the "query and response or conversation" input variety.
 
-    param eval_metric: The evaluation metric to be used for evaluation. This is used by the API call logic
-
-    type eval_metric: ~azure.ai.evaluation._common.constants.EvaluationMetrics
-    param eval_last_turn: If True, only the last turn of the conversation will be evaluated, and no
+    :param eval_metric: The evaluation metric to be used for evaluation. This is used by the API call logic
+        to specify which evaluation to perform.
+    :type eval_metric: ~azure.ai.evaluation._common.constants.EvaluationMetrics
+    :param eval_last_turn: If True, only the last turn of the conversation will be evaluated, and no
         aggregation will be performed. If False, all turns will be evaluated and the numeric results will be,
         aggregated. Per-turn results are still be available in the output via the "evaluation_per_turn" key
         when this occurs. Default is False, resulting full conversation evaluation and aggregation.
-    type eval_last_turn: bool
+    :type eval_last_turn: bool
     """
 
     @override
     def __init__(
         self,
-        eval_metric: EvaluationMetrics,
+        eval_metric: Union[EvaluationMetrics, _InternalEvaluationMetrics],
         azure_ai_project: dict,
-        credential:
+        credential: TokenCredential,
         eval_last_turn: bool = False,
     ):
         super().__init__(eval_last_turn=eval_last_turn)
         self._eval_metric = eval_metric
-        self._azure_ai_project = azure_ai_project
-
-            # Use DefaultCredential if no credential is provided
-            self._credential = DefaultAzureCredential()
-        else:
-            self._credential = credential
+        self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
+        self._credential = credential
 
     @override
     def __call__(
@@ -50,8 +55,8 @@ class RaiServiceEvaluatorBase(EvaluatorBase):
         *,
         query: Optional[str] = None,
         response: Optional[str] = None,
-        conversation
-        **kwargs
+        conversation=None,
+        **kwargs,
     ):
         """Evaluate either a query and response or a conversation. Must supply either a query AND response,
         or a conversation, but not both.
@@ -63,14 +68,13 @@ class RaiServiceEvaluatorBase(EvaluatorBase):
         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
             key "messages", and potentially a global context under the key "context". Conversation turns are expected
             to be dictionaries with keys "content", "role", and possibly "context".
-        :paramtype conversation: Optional[
-        :
-        :rtype: Dict
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :rtype: Union[Dict[str, T], Dict[str, Union[float, Dict[str, List[T]]]]]
         """
         return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
 
     @override
-    async def _do_eval(self, eval_input: Dict):
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
         """Perform the evaluation using the Azure AI RAI service.
         The exact evaluation performed is determined by the evaluation metric supplied
         by the child class initializer.
@@ -90,10 +94,43 @@ class RaiServiceEvaluatorBase(EvaluatorBase):
                     + " This should have failed earlier."
                 ),
             )
+        input_data = {"query": query, "response": response}
+
+        if "context" in self._singleton_inputs:
+            context = eval_input.get("context", None)
+            if context is None:
+                raise EvaluationException(
+                    message="Not implemented",
+                    internal_message=(
+                        "Attempted context-based evaluation without supplying context."
+                        + " This should have failed earlier."
+                    ),
+                )
+            input_data["context"] = context
+
         return await evaluate_with_rai_service(
             metric_name=self._eval_metric,
-
-            response=response,
+            data=input_data,
             project_scope=self._azure_ai_project,
             credential=self._credential,
+            annotation_task=self._get_task(),
         )
+
+    def _get_task(self):
+        """Get the annotation task for the current evaluation metric.
+        The annotation task is used by the RAI service script to determine a the message format
+        of the API call, and how the output is processed, among other things.
+
+        :return: The annotation task for the evaluator's self._eval_metric value.
+        :rtype: ~azure.ai.evaluation._common.constants.Tasks
+
+        """
+        if self._eval_metric == EvaluationMetrics.GROUNDEDNESS:
+            return Tasks.GROUNDEDNESS
+        if self._eval_metric == EvaluationMetrics.XPIA:
+            return Tasks.XPIA
+        if self._eval_metric == _InternalEvaluationMetrics.ECI:
+            return _InternalAnnotationTasks.ECI
+        if self._eval_metric == EvaluationMetrics.PROTECTED_MATERIAL:
+            return Tasks.PROTECTED_MATERIAL
+        return Tasks.CONTENT_HARM
```
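The new `_get_task` method is a plain metric-to-annotation-task dispatch: a few metrics get dedicated tasks and everything else is treated as a content-harm annotation. Below is a standalone sketch of the same shape using local placeholder enums rather than the SDK's `EvaluationMetrics`/`Tasks` constants (whose string values are not shown in this diff):

```python
from enum import Enum


class Metric(Enum):
    # Placeholder values; the SDK's EvaluationMetrics/_InternalEvaluationMetrics differ.
    VIOLENCE = "violence"
    GROUNDEDNESS = "groundedness"
    XPIA = "xpia"
    ECI = "eci"
    PROTECTED_MATERIAL = "protected_material"


class Task(Enum):
    # Placeholder values; the SDK's Tasks/_InternalAnnotationTasks differ.
    CONTENT_HARM = "content_harm"
    GROUNDEDNESS = "groundedness"
    XPIA = "xpia"
    ECI = "eci"
    PROTECTED_MATERIAL = "protected_material"


_DEDICATED_TASKS = {
    Metric.GROUNDEDNESS: Task.GROUNDEDNESS,
    Metric.XPIA: Task.XPIA,
    Metric.ECI: Task.ECI,
    Metric.PROTECTED_MATERIAL: Task.PROTECTED_MATERIAL,
}


def get_task(metric: Metric) -> Task:
    # Same shape as RaiServiceEvaluatorBase._get_task: dedicated annotation task
    # where one exists, generic content-harm annotation otherwise.
    return _DEDICATED_TASKS.get(metric, Task.CONTENT_HARM)


assert get_task(Metric.VIOLENCE) is Task.CONTENT_HARM
assert get_task(Metric.ECI) is Task.ECI
```

Expressing the chain of `if` checks as a table lookup keeps the default in one place; the diff's version spells the same mapping out as sequential comparisons.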
azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

```diff
@@ -2,32 +2,34 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from concurrent.futures import as_completed
+from typing import Callable, Dict, List, Optional, Union
 
 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+from typing_extensions import override
 
-
-
-    from ._self_harm import SelfHarmEvaluator
-    from ._sexual import SexualEvaluator
-    from ._violence import ViolenceEvaluator
-except ImportError:
-    from _hate_unfairness import HateUnfairnessEvaluator
-    from _self_harm import SelfHarmEvaluator
-    from _sexual import SexualEvaluator
-    from _violence import ViolenceEvaluator
+from azure.ai.evaluation._common._experimental import experimental
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
 
+from ._hate_unfairness import HateUnfairnessEvaluator
+from ._self_harm import SelfHarmEvaluator
+from ._sexual import SexualEvaluator
+from ._violence import ViolenceEvaluator
 
-
+
+@experimental
+class ContentSafetyEvaluator(EvaluatorBase):
     """
     Initialize a content safety evaluator configured to evaluate content safetry metrics for QA scenario.
 
+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param
-
-    :param
-    :type
+    :param eval_last_turn: Whether to evaluate the last turn of a conversation. Default is False.
+    :type eval_last_turn: bool
+    :param kwargs: Additional arguments to pass to the evaluator.
+    :type kwargs: Any
     :return: A function that evaluates content-safety metrics for "question-answering" scenario.
     :rtype: Callable
 
@@ -66,33 +68,63 @@ class ContentSafetyEvaluator:
         }
     """
 
-
-
-
-
-
-
-
+    # TODO address 3579092 to re-enabled parallel evals.
+    def __init__(self, credential, azure_ai_project, eval_last_turn: bool = False, **kwargs):
+        super().__init__(eval_last_turn=eval_last_turn)
+        self._parallel = kwargs.pop("parallel", False)
+        self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
+            ViolenceEvaluator(credential, azure_ai_project),
+            SexualEvaluator(credential, azure_ai_project),
+            SelfHarmEvaluator(credential, azure_ai_project),
+            HateUnfairnessEvaluator(credential, azure_ai_project),
         ]
 
-
+    @override
+    def __call__(
+        self,
+        *,
+        query: Optional[str] = None,
+        response: Optional[str] = None,
+        conversation=None,
+        **kwargs,
+    ):
+        """Evaluate a collection of content safety metrics for the given query/response pair or conversation.
+        This inputs must supply either a query AND response, or a conversation, but not both.
+
+        :keyword query: The query to evaluate.
+        :paramtype query: Optional[str]
+        :keyword response: The response to evaluate.
+        :paramtype response: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The evaluation result.
+        :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
         """
-
-
-
-
-
-
-
-
-        :
-        :
+        return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
+        """Perform the evaluation using the Azure AI RAI service.
+        The exact evaluation performed is determined by the evaluation metric supplied
+        by the child class initializer.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
         """
-
+        query = eval_input.get("query", None)
+        response = eval_input.get("response", None)
+        conversation = eval_input.get("conversation", None)
+        results: Dict[str, Union[str, float]] = {}
+        # TODO fix this to not explode on empty optional inputs (PF SKD error)
         if self._parallel:
             with ThreadPoolExecutor() as executor:
+                # pylint: disable=no-value-for-parameter
                 futures = {
-                    executor.submit(
+                    executor.submit(query=query, response=response, conversation=conversation): evaluator
                     for evaluator in self._evaluators
                 }
 
@@ -100,7 +132,7 @@ class ContentSafetyEvaluator:
                 results.update(future.result())
         else:
             for evaluator in self._evaluators:
-                result = evaluator(query=query, response=response,
+                result = evaluator(query=query, response=response, conversation=conversation)
                 results.update(result)
 
         return results
```