azure-ai-evaluation 1.0.0b3__py3-none-any.whl → 1.0.0b5__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry and is provided for informational purposes only.
Note: this version of azure-ai-evaluation has been flagged as a potentially problematic release.
- azure/ai/evaluation/__init__.py +23 -1
- azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +20 -9
- azure/ai/evaluation/_common/constants.py +9 -2
- azure/ai/evaluation/_common/math.py +29 -0
- azure/ai/evaluation/_common/rai_service.py +222 -93
- azure/ai/evaluation/_common/utils.py +328 -19
- azure/ai/evaluation/_constants.py +16 -8
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +33 -17
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +14 -7
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +22 -4
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +35 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +47 -14
- azure/ai/evaluation/_evaluate/_evaluate.py +370 -188
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +15 -16
- azure/ai/evaluation/_evaluate/_utils.py +77 -25
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +16 -10
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +76 -46
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +26 -19
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +62 -25
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -36
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +67 -46
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +33 -4
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +33 -4
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +33 -4
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +33 -4
- azure/ai/evaluation/_evaluators/_eci/_eci.py +7 -5
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +22 -21
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +51 -16
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -7
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +130 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +57 -0
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +120 -0
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +96 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +46 -13
- azure/ai/evaluation/_evaluators/_qa/_qa.py +11 -6
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +23 -20
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +126 -80
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +2 -2
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +150 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +32 -15
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +36 -10
- azure/ai/evaluation/_exceptions.py +26 -6
- azure/ai/evaluation/_http_utils.py +203 -132
- azure/ai/evaluation/_model_configurations.py +23 -6
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +5 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +88 -60
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +98 -95
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
- azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -9
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
- azure/ai/evaluation/simulator/_simulator.py +222 -169
- azure/ai/evaluation/simulator/_tracing.py +4 -4
- azure/ai/evaluation/simulator/_utils.py +6 -6
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/METADATA +237 -52
- azure_ai_evaluation-1.0.0b5.dist-info/NOTICE.txt +70 -0
- azure_ai_evaluation-1.0.0b5.dist-info/RECORD +120 -0
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
- azure_ai_evaluation-1.0.0b3.dist-info/RECORD +0 -98
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_common/utils.py

@@ -2,21 +2,30 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

+import re
+import math
 import threading
-from typing import List, Union
+from typing import Any, List, Literal, Mapping, Type, TypeVar, Tuple, Union, cast, get_args, get_origin

 import nltk
-import
-
-from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from typing_extensions import NotRequired, Required, TypeGuard
+from promptflow.core._errors import MissingRequiredPackage
 from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._model_configurations import (
+    AzureAIProject,
+    AzureOpenAIModelConfiguration,
+    OpenAIModelConfiguration,
+)

 from . import constants

 _nltk_data_download_lock = threading.Lock()

+T_TypedDict = TypeVar("T_TypedDict", bound=Mapping[Any, Any])
+

-def get_harm_severity_level(harm_score: int) -> str:
+def get_harm_severity_level(harm_score: Union[float, int]) -> Union[str, float]:
     """Generate harm severity level based on harm score.

     :param harm_score: The harm score to be evaluated.
@@ -30,23 +39,29 @@ def get_harm_severity_level(harm_score: int) -> str:
         constants.HarmSeverityLevel.Medium: [4, 5],
         constants.HarmSeverityLevel.High: [6, 7],
     }
-    if harm_score
-        return
+    if math.isnan(harm_score) or harm_score is None:
+        return math.nan
     for harm_level, harm_score_range in HARM_SEVERITY_LEVEL_MAPPING.items():
         if harm_score_range[0] <= harm_score <= harm_score_range[1]:
             return harm_level.value
-    return
+    return math.nan


 def ensure_nltk_data_downloaded():
     """Download NLTK data packages if not already downloaded."""
+    nltk_data = [
+        ("wordnet", "corpora/wordnet.zip"),
+        ("perluniprops", "misc/perluniprops.zip"),
+        ("punkt", "tokenizers/punkt.zip"),
+        ("punkt_tab", "tokenizers/punkt_tab.zip"),
+    ]
+
     with _nltk_data_download_lock:
-
-
-
-
-
-        nltk.download("punkt_tab")
+        for _id, resource_name in nltk_data:
+            try:
+                nltk.find(resource_name)
+            except LookupError:
+                nltk.download(_id)


 def nltk_tokenize(text: str) -> List[str]:
@@ -71,12 +86,20 @@ def nltk_tokenize(text: str) -> List[str]:
     return list(tokens)


+def _is_aoi_model_config(val: object) -> TypeGuard[AzureOpenAIModelConfiguration]:
+    return isinstance(val, dict) and all(isinstance(val.get(k), str) for k in ("azure_endpoint", "azure_deployment"))
+
+
+def _is_openai_model_config(val: object) -> TypeGuard[OpenAIModelConfiguration]:
+    return isinstance(val, dict) and all(isinstance(val.get(k), str) for k in ("model"))
+
+
 def parse_model_config_type(
     model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
 ) -> None:
-    if
+    if _is_aoi_model_config(model_config):
         model_config["type"] = AZURE_OPENAI_TYPE
-
+    elif _is_openai_model_config(model_config):
         model_config["type"] = OPENAI_TYPE


@@ -87,16 +110,302 @@ def construct_prompty_model_config(
 ) -> dict:
     parse_model_config_type(model_config)

-    if model_config
+    if _is_aoi_model_config(model_config):
         model_config["api_version"] = model_config.get("api_version", default_api_version)

-    prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
+    prompty_model_config: dict = {"configuration": model_config, "parameters": {"extra_headers": {}}}

     # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
     # https://github.com/encode/httpx/discussions/2959
     prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})

-    if model_config
+    if _is_aoi_model_config(model_config) and user_agent:
         prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": user_agent})

     return prompty_model_config
+
+
+def validate_azure_ai_project(o: object) -> AzureAIProject:
+    fields = {"subscription_id": str, "resource_group_name": str, "project_name": str}
+
+    if not isinstance(o, dict):
+        msg = "The 'azure_ai_project' parameter must be a dictionary."
+        raise EvaluationException(
+            message=msg,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    missing_fields = set(fields.keys()) - o.keys()
+
+    if missing_fields:
+        msg = (
+            "The 'azure_ai_project' dictionary is missing the following required "
+            f"field(s): {', '.join(f'{field}' for field in missing_fields)}."
+        )
+        raise EvaluationException(
+            message=msg,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    for field_name, expected_type in fields.items():
+        if isinstance(o[field_name], expected_type):
+            continue
+
+        msg = f"Invalid type for field '{field_name}'. Expected {expected_type}, but got {type(o[field_name])}."
+        raise EvaluationException(
+            message=msg,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    return cast(AzureAIProject, o)
+
+
+def validate_model_config(config: dict) -> Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]:
+    try:
+        return _validate_typed_dict(config, AzureOpenAIModelConfiguration)
+    except TypeError:
+        try:
+            return _validate_typed_dict(config, OpenAIModelConfiguration)
+        except TypeError as e:
+            msg = "Model config validation failed."
+            raise EvaluationException(
+                message=msg, internal_message=msg, category=ErrorCategory.MISSING_FIELD, blame=ErrorBlame.USER_ERROR
+            ) from e
+
+
+def _validate_typed_dict(o: object, t: Type[T_TypedDict]) -> T_TypedDict:
+    """Do very basic runtime validation that an object is a typed dict
+
+    .. warning::
+
+        This validation is very basic, robust enough to cover some very simple TypedDicts.
+        Ideally, validation of this kind should be delegated to something more robust.
+
+        You will very quickly run into limitations trying to apply this function more broadly:
+        * Doesn't support stringized annotations at all
+        * Very limited support for generics, and "special form" (NoReturn, NotRequired, Required, etc...) types.
+        * Error messages are poor, especially if there is any nesting.
+
+    :param object o: The object to check
+    :param Type[T_TypedDict] t: The TypedDict to validate against
+    :raises NotImplementedError: Several forms of validation are unsupported
+        * Checking against stringized annotations
+        * Checking a generic that is not one of a few basic forms
+    :raises TypeError: If a value does not match the specified annotation
+    :raises ValueError: If t's annotation is not a string, type of a special form (e.g. NotRequired, Required, etc...)
+    :returns: The object passed in
+    :rtype: T_TypedDict
+    """
+    if not isinstance(o, dict):
+        raise TypeError(f"Expected type 'dict', got type '{type(object)}'.")
+
+    annotations = t.__annotations__
+    is_total = getattr(t, "__total__", False)
+    unknown_keys = set(o.keys()) - annotations.keys()
+
+    if unknown_keys:
+        raise TypeError(f"dict contains unknown keys: {list(unknown_keys)!r}")
+
+    required_keys = {
+        k
+        for k in annotations
+        if (is_total and get_origin(annotations[k]) is not NotRequired)
+        or (not is_total and get_origin(annotations[k]) is Required)
+    }
+
+    missing_keys = required_keys - o.keys()
+
+    if missing_keys:
+        raise TypeError(f"Missing required keys: {list(missing_keys)!r}.")
+
+    def validate_annotation(v: object, annotation: Union[str, type, object]) -> bool:
+        if isinstance(annotation, str):
+            raise NotImplementedError("Missing support for validating against stringized annotations.")
+
+        if (origin := get_origin(annotation)) is not None:
+            if origin is tuple:
+                validate_annotation(v, tuple)
+                tuple_args = get_args(annotation)
+                if len(cast(tuple, v)) != len(tuple_args):
+                    raise TypeError(f"Expected a {len(tuple_args)}-tuple, got a {len(cast(tuple, v))}-tuple.")
+                for tuple_val, tuple_args in zip(cast(tuple, v), tuple_args):
+                    validate_annotation(tuple_val, tuple_args)
+            elif origin is dict:
+                validate_annotation(v, dict)
+                dict_key_ann, dict_val_ann = get_args(annotation)
+                for dict_key, dict_val in cast(dict, v).items():
+                    validate_annotation(dict_val, dict_val_ann)
+                    validate_annotation(dict_key, dict_key_ann)
+            elif origin is list:
+                validate_annotation(v, list)
+                list_val_ann = get_args(annotation)[0]
+                for list_val in cast(list, v):
+                    validate_annotation(list_val, list_val_ann)
+            elif origin is Union:
+                for generic_arg in get_args(annotation):
+                    try:
+                        validate_annotation(v, generic_arg)
+                        return True
+                    except TypeError:
+                        pass
+                raise TypeError(f"Expected value to have type {annotation}. Received type {type(v)}")
+            elif origin is Literal:
+                literal_args = get_args(annotation)
+                if not any(type(literal) is type(v) and literal == v for literal in literal_args):
+                    raise TypeError(f"Expected value to be one of {list(literal_args)!r}. Received type {type(v)}")
+            elif any(origin is g for g in (NotRequired, Required)):
+                validate_annotation(v, get_args(annotation)[0])
+            else:
+                raise NotImplementedError(f"Validation not implemented for generic {origin}.")
+            return True
+
+        if isinstance(annotation, type):
+            if not isinstance(v, annotation):
+                raise TypeError(f"Expected value to have type {annotation}. Received type {type(v)}.")
+            return True
+
+        raise ValueError("Annotation to validate against should be a str, type, or generic.")
+
+    for k, v in o.items():
+        validate_annotation(v, annotations[k])
+
+    return cast(T_TypedDict, o)
+
+
+def parse_quality_evaluator_reason_score(llm_output: str) -> Tuple[float, str]:
+    """Parse the output of prompt-based quality evaluators that return a score and reason.
+
+    Current supported evaluators:
+        - Fluency
+        - Relevance
+        - Retrieval
+        - Groundedness
+        - Coherence
+
+    :param llm_output: The output of the prompt-based quality evaluator.
+    :type llm_output: str
+    :return: The score and reason.
+    :rtype: Tuple[float, str]
+    """
+    score = math.nan
+    reason = ""
+    if llm_output:
+        score_pattern = r"<S2>(.*?)</S2>"
+        reason_pattern = r"<S1>(.*?)</S1>"
+        score_match = re.findall(score_pattern, llm_output, re.DOTALL)
+        reason_match = re.findall(reason_pattern, llm_output, re.DOTALL)
+        if score_match:
+            score = float(score_match[0].strip())
+        if reason_match:
+            reason = reason_match[0].strip()
+
+    return score, reason
+
+
+def remove_optional_singletons(eval_class, singletons):
+    required_singletons = singletons.copy()
+    if hasattr(eval_class, "_OPTIONAL_PARAMS"): # pylint: disable=protected-access
+        for param in eval_class._OPTIONAL_PARAMS: # pylint: disable=protected-access
+            if param in singletons:
+                del required_singletons[param]
+    return required_singletons
+
+
+def retrieve_content_type(assistant_messages: List, metric: str) -> str:
+    """Get the content type for service payload.
+
+    :param assistant_messages: The list of messages to be annotated by evaluation service
+    :type assistant_messages: list
+    :param metric: A string representing the metric type
+    :type metric: str
+    :return: A text representing the content type. Example: 'text', or 'image'
+    :rtype: str
+    """
+    # Check if metric is "protected_material"
+    if metric == "protected_material":
+        return "image"
+
+    # Iterate through each message
+    for item in assistant_messages:
+        # Ensure "content" exists in the message and is iterable
+        content = item.get("content", [])
+        for message in content:
+            if message.get("type", "") == "image_url":
+                return "image"
+    # Default return if no image was found
+    return "text"
+
+
+def validate_conversation(conversation):
+    def raise_exception(msg, target):
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=target,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    if not conversation or "messages" not in conversation:
+        raise_exception(
+            "Attribute 'messages' is missing in the request",
+            ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
+        )
+    messages = conversation["messages"]
+    if not isinstance(messages, list):
+        raise_exception(
+            "'messages' parameter must be a JSON-compatible list of chat messages",
+            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+        )
+    expected_roles = {"user", "assistant", "system"}
+    image_found = False
+    for num, message in enumerate(messages, 1):
+        if not isinstance(message, dict):
+            try:
+                from azure.ai.inference.models import (
+                    ChatRequestMessage,
+                    UserMessage,
+                    AssistantMessage,
+                    SystemMessage,
+                    ImageContentItem,
+                )
+            except ImportError as ex:
+                raise MissingRequiredPackage(
+                    message="Please install 'azure-ai-inference' package to use SystemMessage, AssistantMessage"
+                ) from ex
+
+            if isinstance(messages[0], ChatRequestMessage) and not isinstance(
+                message, (UserMessage, AssistantMessage, SystemMessage)
+            ):
+                raise_exception(
+                    f"Messages must be a strongly typed class of ChatRequestMessage. Message number: {num}",
+                    ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+                )
+
+            if isinstance(message.content, list) and any(
+                isinstance(item, ImageContentItem) for item in message.content
+            ):
+                image_found = True
+            continue
+        if message.get("role") not in expected_roles:
+            raise_exception(
+                f"Invalid role provided: {message.get('role')}. Message number: {num}",
+                ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+            )
+        content = message.get("content")
+        if not isinstance(content, (str, list)):
+            raise_exception(
+                f"Content in each turn must be a string or array. Message number: {num}",
+                ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+            )
+        if isinstance(content, list):
+            if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content):
+                image_found = True
+    if not image_found:
+        raise_exception(
+            "Message needs to have multi-modal input like images.",
+            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+        )
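The reworked get_harm_severity_level drops the numpy dependency and returns math.nan for missing or NaN scores. Below is a minimal standalone sketch of the mapping it implements; the enum values and the Very low/Low ranges are assumptions for illustration (the real levels live in azure.ai.evaluation._common.constants):

```python
import math
from enum import Enum
from typing import Union


class HarmSeverityLevel(Enum):
    # Hypothetical stand-in for the package's HarmSeverityLevel enum
    VeryLow = "Very low"
    Low = "Low"
    Medium = "Medium"
    High = "High"


def harm_severity(harm_score: Union[float, int]) -> Union[str, float]:
    """Map a 0-7 harm score onto a severity bucket, mirroring the diffed helper."""
    mapping = {
        HarmSeverityLevel.VeryLow: [0, 1],   # assumed range
        HarmSeverityLevel.Low: [2, 3],       # assumed range
        HarmSeverityLevel.Medium: [4, 5],
        HarmSeverityLevel.High: [6, 7],
    }
    if harm_score is None or math.isnan(harm_score):
        return math.nan
    for level, (low, high) in mapping.items():
        if low <= harm_score <= high:
            return level.value
    return math.nan


print(harm_severity(5))         # "Medium"
print(harm_severity(math.nan))  # nan
```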
azure/ai/evaluation/_constants.py

@@ -1,18 +1,19 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Literal


 class EvaluationMetrics:
     """Metrics for model evaluation."""

-
-
-
-
-
+    GROUNDEDNESS = "groundedness"
+    RELEVANCE = "relevance"
+    COHERENCE = "coherence"
+    FLUENCY = "fluency"
+    SIMILARITY = "similarity"
     F1_SCORE = "f1_score"
-
+    RETRIEVAL_SCORE = "retrieval_score"
     HATE_FAIRNESS = "hate_fairness"
     HATE_UNFAIRNESS = "hate_unfairness"
     VIOLENCE = "violence"
@@ -48,6 +49,13 @@ class DefaultOpenEncoding:
     """SDK Default Encoding when writing a file"""


+class EvaluationRunProperties:
+    """Defines properties used to identify an evaluation run by UI"""
+
+    RUN_TYPE = "runType"
+    EVALUATION_RUN = "_azureml.evaluation_run"
+
+
 DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"

 CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
@@ -58,6 +66,6 @@ PF_BATCH_TIMEOUT_SEC = "PF_BATCH_TIMEOUT_SEC"
 OTEL_EXPORTER_OTLP_TRACES_TIMEOUT = "OTEL_EXPORTER_OTLP_TRACES_TIMEOUT"
 OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT = 60

-AZURE_OPENAI_TYPE = "azure_openai"
+AZURE_OPENAI_TYPE: Literal["azure_openai"] = "azure_openai"

-OPENAI_TYPE = "openai"
+OPENAI_TYPE: Literal["openai"] = "openai"
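Annotating AZURE_OPENAI_TYPE and OPENAI_TYPE as Literal lets the constants type-check against Literal-typed "type" fields such as those on the model configuration TypedDicts. A small sketch of that interaction, using a simplified stand-in TypedDict rather than the package's actual definitions:

```python
from typing import Literal, TypedDict

# Mirrors the annotated constants added in _constants.py
AZURE_OPENAI_TYPE: Literal["azure_openai"] = "azure_openai"
OPENAI_TYPE: Literal["openai"] = "openai"


class ModelConfig(TypedDict):
    # Simplified, hypothetical stand-in for the package's model configuration TypedDicts
    type: Literal["azure_openai", "openai"]
    model: str


# With a plain `str` constant a static type checker would reject this assignment;
# the Literal annotation keeps it valid.
config: ModelConfig = {"type": AZURE_OPENAI_TYPE, "model": "gpt-4o"}
print(config)
```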
azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py

@@ -1,8 +1,9 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from .
+from .eval_run_context import EvalRunContext
 from .code_client import CodeClient
 from .proxy_client import ProxyClient
+from .target_run_context import TargetRunContext

-__all__ = ["CodeClient", "ProxyClient", "
+__all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext"]
azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py

@@ -5,8 +5,9 @@ import inspect
 import json
 import logging
 import os
+from concurrent.futures import Future
 from pathlib import Path
-from typing import Callable, Dict, Optional, Union
+from typing import Any, Callable, Dict, Optional, Union, cast

 import pandas as pd
 from promptflow.contracts.types import AttrDict
@@ -22,25 +23,31 @@ LOGGER = logging.getLogger(__name__)

 class CodeRun:
     def __init__(
-        self,
-
+        self,
+        *,
+        run: Future,
+        input_data,
+        evaluator_name: Optional[str] = None,
+        aggregator: Callable[["CodeRun"], Future],
+        **kwargs, # pylint: disable=unused-argument
+    ) -> None:
         self.run = run
         self.evaluator_name = evaluator_name if evaluator_name is not None else ""
         self.input_data = input_data
-        self.aggregated_metrics =
+        self.aggregated_metrics = aggregator(self)

-    def get_result_df(self, exclude_inputs=False):
+    def get_result_df(self, exclude_inputs: bool = False) -> pd.DataFrame:
         batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
-        result_df = self.run.result(timeout=batch_run_timeout)
+        result_df = cast(pd.DataFrame, self.run.result(timeout=batch_run_timeout))
         if exclude_inputs:
             result_df = result_df.drop(columns=[col for col in result_df.columns if col.startswith("inputs.")])
         return result_df

-    def get_aggregated_metrics(self):
+    def get_aggregated_metrics(self) -> Dict[str, Any]:
         try:
             batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
-            aggregated_metrics = (
-                self.aggregated_metrics.result(timeout=batch_run_timeout)
+            aggregated_metrics: Optional[Any] = (
+                cast(Dict, self.aggregated_metrics.result(timeout=batch_run_timeout))
                 if self.aggregated_metrics is not None
                 else None
             )
@@ -104,10 +111,10 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
             verify_integrity=True,
         )

-
+    @staticmethod
+    def _calculate_aggregations(evaluator: Callable, run: CodeRun) -> Any:
         try:
             if _has_aggregator(evaluator):
-                aggregate_input = None
                 evaluator_output = run.get_result_df(exclude_inputs=True)
                 if len(evaluator_output.columns) == 1 and evaluator_output.columns[0] == "output":
                     aggregate_input = evaluator_output["output"].tolist()
@@ -152,21 +159,30 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
                 column_mapping=column_mapping,
                 evaluator_name=evaluator_name,
             )
-
-
-
-
+
+        return CodeRun(
+            run=eval_future,
+            input_data=data,
+            evaluator_name=evaluator_name,
+            aggregator=lambda code_run: self._thread_pool.submit(
+                self._calculate_aggregations, evaluator=flow, run=code_run
+            ),
+        )

     def get_details(self, run: CodeRun, all_results: bool = False) -> pd.DataFrame:
         result_df = run.get_result_df(exclude_inputs=not all_results)
         return result_df

-    def get_metrics(self, run: CodeRun) ->
+    def get_metrics(self, run: CodeRun) -> Dict[str, Any]:
         try:
             aggregated_metrics = run.get_aggregated_metrics()
             print("Aggregated metrics")
             print(aggregated_metrics)
         except Exception as ex: # pylint: disable=broad-exception-caught
             LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", run.evaluator_name, ex)
-            return
+            return {}
         return aggregated_metrics
+
+    def get_run_summary(self, run: CodeRun) -> Any: # pylint: disable=unused-argument
+        # Not implemented
+        return None
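CodeRun now receives its aggregated metrics as an aggregator callback that takes the run and returns a Future, instead of a precomputed value. A generic sketch of that wiring with concurrent.futures; the Run class and function names here are illustrative, not the package's API:

```python
from concurrent.futures import Future, ThreadPoolExecutor

pool = ThreadPoolExecutor(max_workers=2)


class Run:
    """Illustrative stand-in: the run hands itself to an aggregator callback,
    which schedules aggregation on a thread pool and returns a Future."""

    def __init__(self, *, run: Future, aggregator) -> None:
        self.run = run
        self.aggregated_metrics: Future = aggregator(self)


def evaluate() -> list:
    # Pretend batch evaluation producing per-row scores
    return [1, 2, 3, 4]


def aggregate(run: "Run") -> dict:
    # Waits on the evaluation Future, then reduces the rows to metrics
    rows = run.run.result()
    return {"mean": sum(rows) / len(rows)}


run = Run(
    run=pool.submit(evaluate),
    aggregator=lambda r: pool.submit(aggregate, r),
)
print(run.aggregated_metrics.result())  # {'mean': 2.5}
pool.shutdown()
```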
azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py}

@@ -2,6 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import os
+import types
+from typing import Optional, Type, Union

 from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP, PF_FLOW_META_LOAD_IN_SUBPROCESS
 from promptflow._utils.user_agent_utils import ClientUserAgentUtil
@@ -20,22 +22,22 @@ from .code_client import CodeClient
 from .proxy_client import ProxyClient


-class
-    """Context manager for batch run
+class EvalRunContext:
+    """Context manager for eval batch run.

     :param client: The client to run in the context.
     :type client: Union[
-        ~azure.ai.evaluation._evaluate.
-        ~azure.ai.evaluation._evaluate.
+        ~azure.ai.evaluation._evaluate._batch_run.code_client.CodeClient,
+        ~azure.ai.evaluation._evaluate._batch_run.proxy_client.ProxyClient
     ]
     """

-    def __init__(self, client) -> None:
+    def __init__(self, client: Union[CodeClient, ProxyClient]) -> None:
         self.client = client
         self._is_batch_timeout_set_by_system = False
         self._is_otel_timeout_set_by_system = False

-    def __enter__(self):
+    def __enter__(self) -> None:
         if isinstance(self.client, CodeClient):
             ClientUserAgentUtil.append_user_agent(USER_AGENT)
             inject_openai_api()
@@ -56,7 +58,12 @@ class BatchRunContext:
         # For addressing the issue of asyncio event loop closed on Windows
         set_event_loop_policy()

-    def __exit__(
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_value: Optional[BaseException],
+        exc_tb: Optional[types.TracebackType],
+    ) -> None:
         if isinstance(self.client, CodeClient):
             recover_openai_api()

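EvalRunContext (formerly BatchRunContext) gains a fully typed __exit__ signature and continues to track which settings it set itself. A minimal sketch of the same pattern, assuming the context only manages the PF_BATCH_TIMEOUT_SEC environment variable and using a made-up default of 3600 seconds:

```python
import os
import types
from typing import Optional, Type


class BatchTimeoutContext:
    """Illustrative sketch of the EvalRunContext pattern: set a batch timeout
    env var on enter only if the user has not set one, and undo it on exit."""

    def __init__(self, timeout_sec: int = 3600) -> None:  # default is an assumption
        self._timeout_sec = timeout_sec
        self._set_by_system = False

    def __enter__(self) -> None:
        if "PF_BATCH_TIMEOUT_SEC" not in os.environ:
            os.environ["PF_BATCH_TIMEOUT_SEC"] = str(self._timeout_sec)
            self._set_by_system = True

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_value: Optional[BaseException],
        exc_tb: Optional[types.TracebackType],
    ) -> None:
        # Only remove the variable if this context introduced it
        if self._set_by_system:
            os.environ.pop("PF_BATCH_TIMEOUT_SEC", None)
            self._set_by_system = False


with BatchTimeoutContext():
    print(os.environ.get("PF_BATCH_TIMEOUT_SEC"))
```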
azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py

@@ -1,13 +1,17 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+
+# pylint: disable=protected-access
+
 import inspect
 import logging
+import math
 import os
+from collections import OrderedDict
 from concurrent.futures import Future
 from typing import Any, Callable, Dict, Optional, Union

-import numpy as np
 import pandas as pd
 from promptflow.client import PFClient
 from promptflow.entities import Run
@@ -36,7 +40,7 @@ class ProxyClient: # pylint: disable=client-accepts-api-version-keyword
         **kwargs
     ) -> ProxyRun:
         flow_to_run = flow
-        if hasattr(flow, "_to_async"):
+        if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true" and hasattr(flow, "_to_async"):
             flow_to_run = flow._to_async() # pylint: disable=protected-access

         batch_use_async = self._should_batch_use_async(flow_to_run)
@@ -53,16 +57,30 @@ class ProxyClient: # pylint: disable=client-accepts-api-version-keyword
     def get_details(self, proxy_run: ProxyRun, all_results: bool = False) -> pd.DataFrame:
         run: Run = proxy_run.run.result()
         result_df = self._pf_client.get_details(run, all_results=all_results)
-        result_df.replace("(Failed)",
+        result_df.replace("(Failed)", math.nan, inplace=True)
         return result_df

     def get_metrics(self, proxy_run: ProxyRun) -> Dict[str, Any]:
         run: Run = proxy_run.run.result()
         return self._pf_client.get_metrics(run)

+    def get_run_summary(self, proxy_run: ProxyRun) -> Dict[str, Any]:
+        run = proxy_run.run.result()
+
+        # pylint: disable=protected-access
+        return OrderedDict(
+            [
+                ("status", run.status),
+                ("duration", str(run._end_time - run._created_on)),
+                ("completed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")),
+                ("failed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.failed", "NA")),
+                ("log_path", str(run._output_path)),
+            ]
+        )
+
     @staticmethod
     def _should_batch_use_async(flow):
-        if os.getenv("
+        if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
             if hasattr(flow, "__call__") and inspect.iscoroutinefunction(flow.__call__):
                 return True
             if inspect.iscoroutinefunction(flow):
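ProxyClient now gates async batch execution on the AI_EVALS_BATCH_USE_ASYNC environment variable (default "true") in both run() and _should_batch_use_async. A standalone sketch that mirrors the gating check; should_use_async and the target functions are illustrative names, not the package's API:

```python
import asyncio
import inspect
import os


def should_use_async(flow) -> bool:
    """Mirror of the gating logic in the diff: async execution is used only when
    AI_EVALS_BATCH_USE_ASYNC is unset or "true" and the target is actually a
    coroutine function (or a callable whose __call__ is one)."""
    if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() != "true":
        return False
    if hasattr(flow, "__call__") and inspect.iscoroutinefunction(flow.__call__):
        return True
    return inspect.iscoroutinefunction(flow)


async def async_target(query: str) -> str:
    await asyncio.sleep(0)
    return query.upper()


def sync_target(query: str) -> str:
    return query.upper()


print(should_use_async(async_target))  # True (unless the env var opts out)
print(should_use_async(sync_target))   # False

os.environ["AI_EVALS_BATCH_USE_ASYNC"] = "false"
print(should_use_async(async_target))  # False
```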