azure-ai-evaluation 1.0.0b5__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. The information is provided for informational purposes only.
- azure/ai/evaluation/_common/_experimental.py +4 -0
- azure/ai/evaluation/_common/math.py +62 -2
- azure/ai/evaluation/_common/rai_service.py +80 -29
- azure/ai/evaluation/_common/utils.py +50 -16
- azure/ai/evaluation/_constants.py +1 -0
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -0
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +13 -3
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +11 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +34 -10
- azure/ai/evaluation/_evaluate/_evaluate.py +59 -103
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +2 -1
- azure/ai/evaluation/_evaluate/_utils.py +6 -4
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +16 -17
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +60 -29
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +17 -5
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +4 -2
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +56 -50
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +79 -34
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +73 -34
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +74 -33
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -34
- azure/ai/evaluation/_evaluators/_eci/_eci.py +28 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +57 -26
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +13 -15
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +68 -30
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +17 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +10 -8
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -2
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +6 -2
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +10 -6
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +6 -2
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +6 -2
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +6 -2
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +57 -34
- azure/ai/evaluation/_evaluators/_qa/_qa.py +25 -37
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +63 -29
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +76 -161
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +24 -25
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +65 -67
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +26 -20
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +74 -40
- azure/ai/evaluation/_exceptions.py +2 -0
- azure/ai/evaluation/_model_configurations.py +65 -14
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +15 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +25 -34
- azure/ai/evaluation/simulator/_constants.py +11 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +16 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +11 -1
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +3 -1
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +8 -4
- azure/ai/evaluation/simulator/_simulator.py +51 -45
- azure/ai/evaluation/simulator/_utils.py +25 -7
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/METADATA +232 -324
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/RECORD +60 -61
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.0.1.dist-info}/top_level.txt +0 -0

--- a/azure/ai/evaluation/_common/_experimental.py
+++ b/azure/ai/evaluation/_common/_experimental.py
@@ -2,6 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
+import os
 import functools
 import inspect
 import logging
@@ -149,6 +150,9 @@ def _get_indentation_size(doc_string: str) -> int:
 def _should_skip_warning():
     skip_warning_msg = False
 
+    if os.getenv("AI_EVALS_DISABLE_EXPERIMENTAL_WARNING", "false").lower() == "true":
+        skip_warning_msg = True
+
     # Cases where we want to suppress the warning:
     # 1. When converting from REST object to SDK object
     for frame in inspect.stack():
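
With the check above, the experimental-API warning emitted by the @experimental decorator can now be suppressed through an environment variable. A minimal sketch of opting out (assumption: this only silences the warning text; it does not change the behavior of any experimental API):

import os

# Any value other than "true" (case-insensitive) leaves the warning enabled.
os.environ["AI_EVALS_DISABLE_EXPERIMENTAL_WARNING"] = "true"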

--- a/azure/ai/evaluation/_common/math.py
+++ b/azure/ai/evaluation/_common/math.py
@@ -3,20 +3,44 @@
 # ---------------------------------------------------------
 
 import math
-from typing import List
+from typing import List, Callable, Any
 
 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 
 
 def list_sum(lst: List[float]) -> float:
+    """Given a list of floats, return the sum of the values.
+
+    :param lst: A list of floats.
+    :type lst: List[float]
+    :return: The sum of the values in the list.
+    :rtype: float
+    """
+
     return sum(lst)
 
 
 def list_mean(lst: List[float]) -> float:
+    """Given a list of floats, calculate the mean of the values.
+
+    :param lst: A list of floats.
+    :type lst: List[float]
+    :return: The mean of the values in the list.
+    :rtype: float
+    """
+
     return list_sum(lst) / len(lst)
 
 
 def list_mean_nan_safe(lst: List[float]) -> float:
+    """Given a list of floats, remove all nan or None values, then calculate the mean of the remaining values.
+
+    :param lst: A list of floats.
+    :type lst: List[float]
+    :return: The mean of the values in the list.
+    :rtype: float
+    """
+
     msg = "All score values are NaN. The mean cannot be calculated."
     if all(math.isnan(l) for l in lst):
         raise EvaluationException(
@@ -26,4 +50,40 @@ def list_mean_nan_safe(lst: List[float]) -> float:
             category=ErrorCategory.INVALID_VALUE,
             target=ErrorTarget.CONVERSATION,
         )
-    return list_mean([l for l in lst if not
+    return list_mean([l for l in lst if not is_none_or_nan(l)])
+
+
+def apply_transform_nan_safe(lst: List[float], transform_fn: Callable[[float], Any]) -> List[Any]:
+    """Given a list of floats, remove all nan values, then apply the inputted transform function
+    to the remaining values, and return the resulting list of outputted values.
+
+    :param lst: A list of floats.
+    :type lst: List[float]
+    :param transform_fn: A function that produces something when applied to a float.
+    :type transform_fn: Callable[[float], Any]
+    :return: A list of the transformed values.
+    :rtype: List[Any]
+    """
+
+    msg = "All score values are NaN. The mean cannot be calculated."
+    if all(math.isnan(l) for l in lst):
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.CONVERSATION,
+        )
+    return [transform_fn(l) for l in lst if not is_none_or_nan(l)]
+
+
+def is_none_or_nan(val: float) -> bool:
+    """math.isnan raises an error if None is inputted. This is a more robust wrapper.
+
+    :param val: The value to check.
+    :type val: float
+    :return: Whether the value is None or NaN.
+    :rtype: bool
+    """
+
+    return val is None or math.isnan(val)
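
A short usage sketch of the helpers added above. Note that azure.ai.evaluation._common.math is a private module, so the import path is internal and may change; the values are illustrative:

import math

from azure.ai.evaluation._common.math import (
    apply_transform_nan_safe,
    is_none_or_nan,
    list_mean_nan_safe,
)

scores = [4.0, math.nan, 3.0]

is_none_or_nan(None)                               # True; math.isnan(None) would raise a TypeError
list_mean_nan_safe(scores)                         # 3.5 -- NaN entries are dropped before averaging
apply_transform_nan_safe(scores, lambda s: s / 5)  # [0.8, 0.6] -- only non-NaN values are transformed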

--- a/azure/ai/evaluation/_common/rai_service.py
+++ b/azure/ai/evaluation/_common/rai_service.py
@@ -6,6 +6,8 @@ import importlib.metadata
 import math
 import re
 import time
+import json
+import html
 from ast import literal_eval
 from typing import Dict, List, Optional, Union, cast
 from urllib.parse import urlparse
@@ -38,10 +40,38 @@ USER_AGENT = "{}/{}".format("azure-ai-evaluation", version)
 
 USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
     "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
-    Tasks.GROUNDEDNESS: Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
 }
 
 
+def get_formatted_template(data: dict, annotation_task: str) -> str:
+    """Given the task and input data, produce a formatted string that will serve as the main
+    payload for the RAI service. Requires specific per-task logic.
+
+    :param data: The data to incorporate into the payload.
+    :type data: dict
+    :param annotation_task: The annotation task to use. This determines the template to use.
+    :type annotation_task: str
+    :return: The formatted based on the data and task template.
+    :rtype: str
+    """
+    # Template class doesn't play nice with json dumping/loading, just handle groundedness'
+    # JSON format manually.
+    # Template was: Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
+    if annotation_task == Tasks.GROUNDEDNESS:
+        as_dict = {
+            "question": data.get("query", ""),
+            "answer": data.get("response", ""),
+            "context": data.get("context", ""),
+        }
+        return json.dumps(as_dict)
+    as_dict = {
+        "query": html.escape(data.get("query", "")),
+        "response": html.escape(data.get("response", "")),
+    }
+    user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**as_dict)
+    return user_text.replace("'", '\\"')
+
+
 def get_common_headers(token: str) -> Dict:
     """Get common headers for the HTTP request
 
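
To illustrate the two formatting paths above without importing the private helper, the sketch below re-creates them with the standard library; the inputs are illustrative only:

import html
import json
from string import Template

data = {"query": "Is 1 < 2?", "response": "Yes", "context": "Basic arithmetic."}

# Groundedness payloads are now built with json.dumps instead of string.Template.
groundedness_payload = json.dumps(
    {"question": data["query"], "answer": data["response"], "context": data["context"]}
)

# All other tasks fall back to the DEFAULT template, with the inputs HTML-escaped first.
default_payload = Template("<Human>{$query}</><System>{$response}</>").substitute(
    query=html.escape(data["query"]), response=html.escape(data["response"])
)
# default_payload == "<Human>{Is 1 &lt; 2?}</><System>{Yes}</>"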
@@ -83,27 +113,31 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability:
     async with get_async_http_client() as client:
         response = await client.get(svc_liveness_url, headers=headers)
 
-
-
-
-
-
-
-
-
-
-
-
+        if response.status_code != 200:
+            msg = (
+                f"RAI service is unavailable in this region, or you lack the necessary permissions "
+                f"to access the AI project. Status Code: {response.status_code}"
+            )
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                target=ErrorTarget.RAI_CLIENT,
+                category=ErrorCategory.SERVICE_UNAVAILABLE,
+                blame=ErrorBlame.USER_ERROR,
+                tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+            )
 
-
-
-
-
-
-
-
-
-
+        capabilities = response.json()
+        if capability and capability not in capabilities:
+            msg = f"The needed capability '{capability}' is not supported by the RAI service in this region."
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                target=ErrorTarget.RAI_CLIENT,
+                category=ErrorCategory.SERVICE_UNAVAILABLE,
+                blame=ErrorBlame.USER_ERROR,
+                tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+            )
 
 
 def generate_payload(normalized_user_text: str, metric: str, annotation_task: str) -> Dict:
@@ -157,8 +191,7 @@ async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str,
     :return: The operation ID.
     :rtype: str
     """
-
-    normalized_user_text = user_text.replace("'", '\\"')
+    normalized_user_text = get_formatted_template(data, annotation_task)
     payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)
 
     url = rai_svc_url + "/submitannotation"
@@ -235,13 +268,27 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
         _InternalEvaluationMetrics.ECI,
         EvaluationMetrics.XPIA,
     }:
-
+        result = {}
+        if not batch_response or len(batch_response[0]) == 0:
+            return {}
+        if metric_name == EvaluationMetrics.PROTECTED_MATERIAL and metric_name not in batch_response[0]:
+            pm_metric_names = {"artwork", "fictional_characters", "logos_and_brands"}
+            for pm_metric_name in pm_metric_names:
+                response = batch_response[0][pm_metric_name]
+                response = response.replace("false", "False")
+                response = response.replace("true", "True")
+                parsed_response = literal_eval(response)
+                result[pm_metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
+                result[pm_metric_name + "_reason"] = (
+                    parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+                )
+            return result
+        if metric_name not in batch_response[0]:
             return {}
         response = batch_response[0][metric_name]
         response = response.replace("false", "False")
         response = response.replace("true", "True")
         parsed_response = literal_eval(response)
-        result = {}
         # Use label instead of score since these are assumed to be boolean results.
         # Use math.nan as null value since it's ignored by aggregations rather than treated as 0.
         result[metric_display_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
@@ -371,13 +418,17 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: st
     )
 
     if response.status_code != 200:
-        msg =
+        msg = (
+            f"Failed to connect to your Azure AI project. Please check if the project scope is configured correctly, "
+            f"and make sure you have the necessary access permissions. "
+            f"Status code: {response.status_code}."
+        )
         raise EvaluationException(
             message=msg,
-            internal_message=msg,
             target=ErrorTarget.RAI_CLIENT,
-
-
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.PROJECT_ACCESS_ERROR,
+            tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
         )
 
     base_url = urlparse(response.json()["properties"]["discoveryUrl"])

--- a/azure/ai/evaluation/_common/utils.py
+++ b/azure/ai/evaluation/_common/utils.py
@@ -293,14 +293,22 @@ def parse_quality_evaluator_reason_score(llm_output: str) -> Tuple[float, str]:
     score = math.nan
     reason = ""
     if llm_output:
-
-
-
-
-
-
-
-
+        try:
+            score_pattern = r"<S2>\D*?([1-5]).*?</S2>"
+            reason_pattern = r"<S1>(.*?)</S1>"
+            score_match = re.findall(score_pattern, llm_output, re.DOTALL)
+            reason_match = re.findall(reason_pattern, llm_output, re.DOTALL)
+            if score_match:
+                score = float(score_match[0].strip())
+            if reason_match:
+                reason = reason_match[0].strip()
+        except ValueError as exc:
+            raise EvaluationException(
+                message=f"Failed to parse model output: \n{llm_output}",
+                internal_message="Failed to parse model output.",
+                category=ErrorCategory.FAILED_EXECUTION,
+                blame=ErrorBlame.SYSTEM_ERROR,
+            ) from exc
 
     return score, reason
 
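
For reference, an illustrative model output and what the new parsing extracts from it. The <S1>/<S2> tags match what parse_quality_evaluator_reason_score expects; the sample text is made up:

import re

llm_output = "<S1>The response addresses the query and is grounded in the context.</S1> <S2>4</S2>"

score_match = re.findall(r"<S2>\D*?([1-5]).*?</S2>", llm_output, re.DOTALL)
reason_match = re.findall(r"<S1>(.*?)</S1>", llm_output, re.DOTALL)

score = float(score_match[0].strip()) if score_match else float("nan")  # 4.0
reason = reason_match[0].strip() if reason_match else ""  # "The response addresses the query and is grounded in the context."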
@@ -329,12 +337,12 @@ def retrieve_content_type(assistant_messages: List, metric: str) -> str:
         return "image"
 
     # Iterate through each message
-    for
+    for message in assistant_messages:
         # Ensure "content" exists in the message and is iterable
-
-
-
-
+        if isinstance(message.get("content", []), list):
+            for content in message.get("content", []):
+                if content.get("type") == "image_url":
+                    return "image"
     # Default return if no image was found
     return "text"
 
@@ -362,6 +370,8 @@ def validate_conversation(conversation):
         )
     expected_roles = {"user", "assistant", "system"}
     image_found = False
+    assistant_message_count = 0
+    user_message_count = 0
     for num, message in enumerate(messages, 1):
         if not isinstance(message, dict):
             try:
@@ -374,17 +384,21 @@ def validate_conversation(conversation):
                 )
             except ImportError as ex:
                 raise MissingRequiredPackage(
-                    message="Please install 'azure-ai-inference' package to use SystemMessage,
+                    message="Please install 'azure-ai-inference' package to use SystemMessage, "
+                    "UserMessage or AssistantMessage."
                 ) from ex
 
-            if isinstance(
+            if isinstance(message, ChatRequestMessage) and not isinstance(
                 message, (UserMessage, AssistantMessage, SystemMessage)
             ):
                 raise_exception(
                     f"Messages must be a strongly typed class of ChatRequestMessage. Message number: {num}",
                     ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
                 )
-
+            if isinstance(message, AssistantMessage):
+                assistant_message_count += 1
+            if isinstance(message, UserMessage):
+                user_message_count += 1
             if isinstance(message.content, list) and any(
                 isinstance(item, ImageContentItem) for item in message.content
             ):
@@ -395,6 +409,10 @@ def validate_conversation(conversation):
                     f"Invalid role provided: {message.get('role')}. Message number: {num}",
                     ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
                 )
+            if message.get("role") == "assistant":
+                assistant_message_count += 1
+            if message.get("role") == "user":
+                user_message_count += 1
             content = message.get("content")
             if not isinstance(content, (str, list)):
                 raise_exception(
@@ -409,3 +427,19 @@ def validate_conversation(conversation):
             "Message needs to have multi-modal input like images.",
             ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
         )
+    if assistant_message_count == 0:
+        raise_exception(
+            "Assistant role required in one of the messages.",
+            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+        )
+    if user_message_count == 0:
+        raise_exception(
+            "User role required in one of the messages.",
+            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+        )
+    if assistant_message_count > 1:
+        raise_exception(
+            "Evaluators for multimodal conversations only support single turn. "
+            "User and assistant role expected as the only role in each message.",
+            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+        )
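
Taken together, the new checks require at least one user message and exactly one assistant message per conversation. A sketch of a payload that would pass them, assuming the {"messages": [...]} conversation shape used elsewhere in this SDK; the content items and image URL are illustrative:

conversation = {
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is shown in this image?"},
                {"type": "image_url", "image_url": {"url": "https://example.com/photo.png"}},
            ],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": "A cat sitting on a windowsill."}],
        },
    ]
}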

--- a/azure/ai/evaluation/_constants.py
+++ b/azure/ai/evaluation/_constants.py
@@ -62,6 +62,7 @@ CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
 
 PF_BATCH_TIMEOUT_SEC_DEFAULT = 3600
 PF_BATCH_TIMEOUT_SEC = "PF_BATCH_TIMEOUT_SEC"
+PF_DISABLE_TRACING = "PF_DISABLE_TRACING"
 
 OTEL_EXPORTER_OTLP_TRACES_TIMEOUT = "OTEL_EXPORTER_OTLP_TRACES_TIMEOUT"
 OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT = 60

--- a/azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py
+++ b/azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py
@@ -14,6 +14,7 @@ from azure.ai.evaluation._constants import (
     OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT,
     PF_BATCH_TIMEOUT_SEC,
     PF_BATCH_TIMEOUT_SEC_DEFAULT,
+    PF_DISABLE_TRACING,
 )
 
 from ..._user_agent import USER_AGENT
@@ -36,8 +37,12 @@ class EvalRunContext:
         self.client = client
         self._is_batch_timeout_set_by_system = False
         self._is_otel_timeout_set_by_system = False
+        self._original_cwd = os.getcwd()
 
     def __enter__(self) -> None:
+        # Preserve current working directory, as PF may change it without restoring it afterward
+        self._original_cwd = os.getcwd()
+
         if isinstance(self.client, CodeClient):
             ClientUserAgentUtil.append_user_agent(USER_AGENT)
             inject_openai_api()
@@ -45,6 +50,7 @@ class EvalRunContext:
         if isinstance(self.client, ProxyClient):
             os.environ[PF_FLOW_ENTRY_IN_TMP] = "true"
             os.environ[PF_FLOW_META_LOAD_IN_SUBPROCESS] = "false"
+            os.environ[PF_DISABLE_TRACING] = "true"
 
             if os.environ.get(PF_BATCH_TIMEOUT_SEC) is None:
                 os.environ[PF_BATCH_TIMEOUT_SEC] = str(PF_BATCH_TIMEOUT_SEC_DEFAULT)
@@ -64,12 +70,15 @@ class EvalRunContext:
         exc_value: Optional[BaseException],
         exc_tb: Optional[types.TracebackType],
     ) -> None:
+        os.chdir(self._original_cwd)
+
         if isinstance(self.client, CodeClient):
             recover_openai_api()
 
         if isinstance(self.client, ProxyClient):
             os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
             os.environ.pop(PF_FLOW_META_LOAD_IN_SUBPROCESS, None)
+            os.environ.pop(PF_DISABLE_TRACING, None)
 
         if self._is_batch_timeout_set_by_system:
             os.environ.pop(PF_BATCH_TIMEOUT_SEC, None)

--- a/azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py
+++ b/azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py
@@ -68,12 +68,22 @@ class ProxyClient: # pylint: disable=client-accepts-api-version-keyword
         run = proxy_run.run.result()
 
         # pylint: disable=protected-access
+        completed_lines = run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")
+        failed_lines = run._properties.get("system_metrics", {}).get("__pf__.lines.failed", "NA")
+
+        # Update status to "Completed with Errors" if the original status is "Completed" and there are failed lines
+        if run.status == "Completed" and failed_lines != "NA" and int(failed_lines) > 0:
+            status = "Completed with Errors"
+        else:
+            status = run.status
+
+        # Return the ordered dictionary with the updated status
         return OrderedDict(
             [
-                ("status",
+                ("status", status),
                 ("duration", str(run._end_time - run._created_on)),
-                ("completed_lines",
-                ("failed_lines",
+                ("completed_lines", completed_lines),
+                ("failed_lines", failed_lines),
                 ("log_path", str(run._output_path)),
             ]
         )
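
A worked example of the status adjustment above, using the promptflow system metrics that ProxyClient reads from run._properties; the values are illustrative:

system_metrics = {"__pf__.lines.completed": "18", "__pf__.lines.failed": "2"}

completed_lines = system_metrics.get("__pf__.lines.completed", "NA")
failed_lines = system_metrics.get("__pf__.lines.failed", "NA")
run_status = "Completed"

# A run that finished but had failing lines is now reported as "Completed with Errors".
if run_status == "Completed" and failed_lines != "NA" and int(failed_lines) > 0:
    status = "Completed with Errors"
else:
    status = run_status
# status == "Completed with Errors"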

--- a/azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py
+++ b/azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py
@@ -6,6 +6,7 @@ import types
 from typing import Optional, Type
 
 from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP
+from azure.ai.evaluation._constants import PF_DISABLE_TRACING
 
 
 class TargetRunContext:
@@ -17,19 +18,29 @@ class TargetRunContext:
 
     def __init__(self, upload_snapshot: bool) -> None:
         self._upload_snapshot = upload_snapshot
+        self._original_cwd = os.getcwd()
 
     def __enter__(self) -> None:
+        # Preserve current working directory, as PF may change it without restoring it afterward
+        self._original_cwd = os.getcwd()
+
         # Address "[WinError 32] The process cannot access the file" error,
         # caused by conflicts when the venv and target function are in the same directory.
         # Setting PF_FLOW_ENTRY_IN_TMP to true uploads only the flex entry file (flow.flex.yaml).
         if not self._upload_snapshot:
             os.environ[PF_FLOW_ENTRY_IN_TMP] = "true"
 
+        os.environ[PF_DISABLE_TRACING] = "true"
+
     def __exit__(
         self,
         exc_type: Optional[Type[BaseException]],
         exc_value: Optional[BaseException],
         exc_tb: Optional[types.TracebackType],
     ) -> None:
+        os.chdir(self._original_cwd)
+
         if not self._upload_snapshot:
             os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
+
+        os.environ.pop(PF_DISABLE_TRACING, None)
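
EvalRunContext and TargetRunContext now share the same guard pattern: capture the working directory on entry and restore it on exit, because promptflow may change the directory during a batch run without restoring it. A stripped-down sketch of just that pattern (not the SDK classes themselves):

import os
import types
from typing import Optional, Type


class CwdGuard:
    """Minimal illustration of the working-directory preservation added above."""

    def __enter__(self) -> None:
        self._original_cwd = os.getcwd()

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_value: Optional[BaseException],
        exc_tb: Optional[types.TracebackType],
    ) -> None:
        os.chdir(self._original_cwd)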

--- a/azure/ai/evaluation/_evaluate/_eval_run.py
+++ b/azure/ai/evaluation/_evaluate/_eval_run.py
@@ -34,14 +34,15 @@ try:
     from azure.ai.ml.entities._datastore.datastore import Datastore
     from azure.storage.blob import BlobServiceClient
 except (ModuleNotFoundError, ImportError):
-    #
-
-
-
-
-
-
-
+    raise EvaluationException(  # pylint: disable=raise-missing-from
+        message=(
+            "The required packages for remote tracking are missing.\n"
+            'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
+        ),
+        target=ErrorTarget.EVALUATE,
+        category=ErrorCategory.MISSING_PACKAGE,
+        blame=ErrorBlame.USER_ERROR,
+    )
 
 
 @dataclasses.dataclass
@@ -103,7 +104,6 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
     _SCOPE = "https://management.azure.com/.default"
 
     EVALUATION_ARTIFACT = "instance_results.jsonl"
-    EVALUATION_ARTIFACT_DUMMY_RUN = "eval_results.jsonl"
 
     def __init__(
         self,
@@ -413,7 +413,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
         """
         if not self._check_state_and_log("log artifact", {RunStatus.BROKEN, RunStatus.NOT_STARTED}, False):
             return
-        # Check if artifact
+        # Check if artifact directory is empty or does not exist.
         if not os.path.isdir(artifact_folder):
             LOGGER.warning("The path to the artifact is either not a directory or does not exist.")
             return
@@ -469,6 +469,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
         # we are rewriting already registered artifact and need to skip this step.
         if self._is_promptflow_run:
             return
+
         url = (
             f"https://{self._url_base}/artifact/v2.0/subscriptions/{self._subscription_id}"
             f"/resourceGroups/{self._resource_group_name}/providers/"
@@ -491,6 +492,29 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
         if response.status_code != 200:
             self._log_warning("register artifact", response)
 
+        # register artifacts for images if exists in image folder
+        try:
+            for remote_path in remote_paths["paths"]:
+                remote_file_path = remote_path["path"]
+                if "images" in os.path.normpath(remote_file_path).split(os.sep):
+                    response = self.request_with_retry(
+                        url=url,
+                        method="POST",
+                        json_dict={
+                            "origin": "ExperimentRun",
+                            "container": f"dcid.{self.info.run_id}",
+                            "path": posixpath.join("images", os.path.basename(remote_file_path)),
+                            "dataPath": {
+                                "dataStoreName": datastore.name,
+                                "relativePath": remote_file_path,
+                            },
+                        },
+                    )
+                    if response.status_code != 200:
+                        self._log_warning("register image artifact", response)
+        except Exception as ex:  # pylint: disable=broad-exception-caught
+            LOGGER.debug("Exception occurred while registering image artifact. ex: %s", ex)
+
     def _get_datastore_credential(self, datastore: "Datastore"):
         # Reference the logic in azure.ai.ml._artifact._artifact_utilities
         # https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_artifact_utilities.py#L103