azure-ai-evaluation 1.0.0b5__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +188 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +118 -0
- azure/ai/evaluation/_common/_experimental.py +4 -0
- azure/ai/evaluation/_common/math.py +62 -2
- azure/ai/evaluation/_common/rai_service.py +110 -50
- azure/ai/evaluation/_common/utils.py +50 -16
- azure/ai/evaluation/_constants.py +2 -0
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -0
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +13 -3
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +12 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +38 -43
- azure/ai/evaluation/_evaluate/_evaluate.py +62 -131
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +2 -1
- azure/ai/evaluation/_evaluate/_utils.py +72 -38
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +16 -17
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +60 -29
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +88 -6
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +16 -3
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +39 -10
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +58 -52
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +79 -34
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +73 -34
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +74 -33
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -34
- azure/ai/evaluation/_evaluators/_eci/_eci.py +28 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +57 -26
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +13 -15
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +68 -30
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +17 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +10 -8
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -2
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +6 -2
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +10 -6
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +6 -2
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +6 -2
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +6 -2
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +57 -34
- azure/ai/evaluation/_evaluators/_qa/_qa.py +25 -37
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +63 -29
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +76 -161
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +24 -25
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +65 -67
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +26 -20
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +74 -40
- azure/ai/evaluation/_exceptions.py +2 -0
- azure/ai/evaluation/_http_utils.py +6 -4
- azure/ai/evaluation/_model_configurations.py +65 -14
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +17 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +57 -47
- azure/ai/evaluation/simulator/_constants.py +11 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +128 -7
- azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +16 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +12 -1
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +3 -1
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +48 -4
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -0
- azure/ai/evaluation/simulator/_simulator.py +54 -45
- azure/ai/evaluation/simulator/_utils.py +25 -7
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/METADATA +240 -327
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/RECORD +71 -68
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_common/rai_service.py

@@ -6,6 +6,8 @@ import importlib.metadata
 import math
 import re
 import time
+import json
+import html
 from ast import literal_eval
 from typing import Dict, List, Optional, Union, cast
 from urllib.parse import urlparse

@@ -38,22 +40,53 @@ USER_AGENT = "{}/{}".format("azure-ai-evaluation", version)

 USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
     "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
-    Tasks.GROUNDEDNESS: Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
 }


-def
+def get_formatted_template(data: dict, annotation_task: str) -> str:
+    """Given the task and input data, produce a formatted string that will serve as the main
+    payload for the RAI service. Requires specific per-task logic.
+
+    :param data: The data to incorporate into the payload.
+    :type data: dict
+    :param annotation_task: The annotation task to use. This determines the template to use.
+    :type annotation_task: str
+    :return: The formatted based on the data and task template.
+    :rtype: str
+    """
+    # Template class doesn't play nice with json dumping/loading, just handle groundedness'
+    # JSON format manually.
+    # Template was: Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
+    if annotation_task == Tasks.GROUNDEDNESS:
+        as_dict = {
+            "question": data.get("query", ""),
+            "answer": data.get("response", ""),
+            "context": data.get("context", ""),
+        }
+        return json.dumps(as_dict)
+    as_dict = {
+        "query": html.escape(data.get("query", "")),
+        "response": html.escape(data.get("response", "")),
+    }
+    user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**as_dict)
+    return user_text.replace("'", '\\"')
+
+
+def get_common_headers(token: str, evaluator_name: Optional[str] = None) -> Dict:
     """Get common headers for the HTTP request

     :param token: The Azure authentication token.
     :type token: str
+    :param evaluator_name: The evaluator name. Default is None.
+    :type evaluator_name: str
     :return: The common headers.
     :rtype: Dict
     """
+    user_agent = f"{USER_AGENT} (type=evaluator; subtype={evaluator_name})" if evaluator_name else USER_AGENT
     return {
         "Authorization": f"Bearer {token}",
         "Content-Type": "application/json",
-        "User-Agent":
+        "User-Agent": user_agent,
         # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
         # https://github.com/encode/httpx/discussions/2959
         "Connection": "close",
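To make the payload change concrete, here is a small standalone sketch that mirrors the new `get_formatted_template` logic; the `GROUNDEDNESS` constant and template dictionary below are stand-ins for the ones defined in `rai_service.py`, not the SDK's own objects:

```python
import html
import json
from string import Template

# Stand-ins for the Tasks constant and template dict defined in rai_service.py.
GROUNDEDNESS = "groundedness"
DEFAULT_TEMPLATE = Template("<Human>{$query}</><System>{$response}</>")


def format_payload(data: dict, annotation_task: str) -> str:
    """Mirror of get_formatted_template: groundedness payloads are JSON-encoded,
    everything else is HTML-escaped into the default template."""
    if annotation_task == GROUNDEDNESS:
        return json.dumps(
            {
                "question": data.get("query", ""),
                "answer": data.get("response", ""),
                "context": data.get("context", ""),
            }
        )
    escaped = {
        "query": html.escape(data.get("query", "")),
        "response": html.escape(data.get("response", "")),
    }
    return DEFAULT_TEMPLATE.substitute(**escaped).replace("'", '\\"')


print(format_payload({"query": "Hi", "response": "Hello"}, "violence"))
# <Human>{Hi}</><System>{Hello}</>
print(format_payload({"query": "Q", "response": "A", "context": "C"}, GROUNDEDNESS))
# {"question": "Q", "answer": "A", "context": "C"}
```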
@@ -83,27 +116,31 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability:
     async with get_async_http_client() as client:
         response = await client.get(svc_liveness_url, headers=headers)

-
-
-
-
-
-
-
-
-
-
-
+        if response.status_code != 200:
+            msg = (
+                f"RAI service is unavailable in this region, or you lack the necessary permissions "
+                f"to access the AI project. Status Code: {response.status_code}"
+            )
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                target=ErrorTarget.RAI_CLIENT,
+                category=ErrorCategory.SERVICE_UNAVAILABLE,
+                blame=ErrorBlame.USER_ERROR,
+                tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+            )

-
-
-
-
-
-
-
-
-
+        capabilities = response.json()
+        if capability and capability not in capabilities:
+            msg = f"The needed capability '{capability}' is not supported by the RAI service in this region."
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                target=ErrorTarget.RAI_CLIENT,
+                category=ErrorCategory.SERVICE_UNAVAILABLE,
+                blame=ErrorBlame.USER_ERROR,
+                tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+            )


 def generate_payload(normalized_user_text: str, metric: str, annotation_task: str) -> Dict:

@@ -141,7 +178,9 @@ def generate_payload(normalized_user_text: str, metric: str, annotation_task: str
     )


-async def submit_request(
+async def submit_request(
+    data: dict, metric: str, rai_svc_url: str, token: str, annotation_task: str, evaluator_name: str
+) -> str:
     """Submit request to Responsible AI service for evaluation and return operation ID

     :param data: The data to evaluate.

@@ -154,15 +193,16 @@ async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str,
     :type token: str
     :param annotation_task: The annotation task to use.
     :type annotation_task: str
+    :param evaluator_name: The evaluator name.
+    :type evaluator_name: str
     :return: The operation ID.
     :rtype: str
     """
-
-    normalized_user_text = user_text.replace("'", '\\"')
+    normalized_user_text = get_formatted_template(data, annotation_task)
     payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)

     url = rai_svc_url + "/submitannotation"
-    headers = get_common_headers(token)
+    headers = get_common_headers(token, evaluator_name)

     async with get_async_http_client_with_timeout() as client:
         http_response = await client.post(url, json=payload, headers=headers)
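The evaluator name threaded through `submit_request` ends up in the request headers; this is roughly the User-Agent string that `get_common_headers` now builds (the base `USER_AGENT` value below is a stand-in for the real module constant):

```python
from typing import Optional

USER_AGENT = "azure-ai-evaluation/1.1.0"  # stand-in for the module-level constant


def tagged_user_agent(evaluator_name: Optional[str] = None) -> str:
    # Same expression used in get_common_headers above.
    return f"{USER_AGENT} (type=evaluator; subtype={evaluator_name})" if evaluator_name else USER_AGENT


print(tagged_user_agent("ViolenceEvaluator"))
# azure-ai-evaluation/1.1.0 (type=evaluator; subtype=ViolenceEvaluator)
print(tagged_user_agent())
# azure-ai-evaluation/1.1.0
```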
@@ -235,13 +275,27 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
         _InternalEvaluationMetrics.ECI,
         EvaluationMetrics.XPIA,
     }:
-
+        result = {}
+        if not batch_response or len(batch_response[0]) == 0:
+            return {}
+        if metric_name == EvaluationMetrics.PROTECTED_MATERIAL and metric_name not in batch_response[0]:
+            pm_metric_names = {"artwork", "fictional_characters", "logos_and_brands"}
+            for pm_metric_name in pm_metric_names:
+                response = batch_response[0][pm_metric_name]
+                response = response.replace("false", "False")
+                response = response.replace("true", "True")
+                parsed_response = literal_eval(response)
+                result[pm_metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
+                result[pm_metric_name + "_reason"] = (
+                    parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+                )
+            return result
+        if metric_name not in batch_response[0]:
             return {}
         response = batch_response[0][metric_name]
         response = response.replace("false", "False")
         response = response.replace("true", "True")
         parsed_response = literal_eval(response)
-        result = {}
         # Use label instead of score since these are assumed to be boolean results.
         # Use math.nan as null value since it's ignored by aggregations rather than treated as 0.
         result[metric_display_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
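The new protected-material branch in `parse_response` fans one service reply out into per-category label/reason pairs. A minimal sketch of that parsing with a fabricated `batch_response` (the payload shape shown here is an assumption based on the code above, not a captured service response):

```python
import math
from ast import literal_eval

# Fabricated example of what batch_response[0] might contain for protected material.
batch_response = [
    {
        "artwork": "{'label': false, 'reasoning': 'No artwork detected.'}",
        "fictional_characters": "{'label': true, 'reasoning': 'Named character found.'}",
        "logos_and_brands": "{'label': false, 'reasoning': ''}",
    }
]

result = {}
for name in ("artwork", "fictional_characters", "logos_and_brands"):
    raw = batch_response[0][name].replace("false", "False").replace("true", "True")
    parsed = literal_eval(raw)
    result[name + "_label"] = parsed.get("label", math.nan)
    result[name + "_reason"] = parsed.get("reasoning", "")

print(result["fictional_characters_label"])  # True
print(result["artwork_reason"])              # No artwork detected.
```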
@@ -371,13 +425,17 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: str
     )

     if response.status_code != 200:
-        msg =
+        msg = (
+            f"Failed to connect to your Azure AI project. Please check if the project scope is configured correctly, "
+            f"and make sure you have the necessary access permissions. "
+            f"Status code: {response.status_code}."
+        )
         raise EvaluationException(
             message=msg,
-            internal_message=msg,
             target=ErrorTarget.RAI_CLIENT,
-
-
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.PROJECT_ACCESS_ERROR,
+            tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
         )

     base_url = urlparse(response.json()["properties"]["discoveryUrl"])

@@ -442,24 +500,26 @@ async def evaluate_with_rai_service(
     credential: TokenCredential,
     annotation_task: str = Tasks.CONTENT_HARM,
     metric_display_name=None,
+    evaluator_name=None,
 ) -> Dict[str, Union[str, float]]:
-    """
+    """Evaluate the content safety of the response using Responsible AI service

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    :param data: The data to evaluate.
+    :type data: dict
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param project_scope: The Azure AI project scope details.
+    :type project_scope: Dict
+    :param credential: The Azure authentication credential.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param metric_display_name: The display name of metric to use.
+    :type metric_display_name: str
+    :param evaluator_name: The evaluator name to use.
+    :type evaluator_name: str
+    :return: The parsed annotation result.
+    :rtype: Dict[str, Union[str, float]]
     """

     # Get RAI service URL from discovery service and check service availability

@@ -468,7 +528,7 @@ async def evaluate_with_rai_service(
     await ensure_service_availability(rai_svc_url, token, annotation_task)

     # Submit annotation request and fetch result
-    operation_id = await submit_request(data, metric_name, rai_svc_url, token, annotation_task)
+    operation_id = await submit_request(data, metric_name, rai_svc_url, token, annotation_task, evaluator_name)
     annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
     result = parse_response(annotation_response, metric_name, metric_display_name)

azure/ai/evaluation/_common/utils.py

@@ -293,14 +293,22 @@ def parse_quality_evaluator_reason_score(llm_output: str) -> Tuple[float, str]:
     score = math.nan
     reason = ""
     if llm_output:
-
-
-
-
-
-
-
-
+        try:
+            score_pattern = r"<S2>\D*?([1-5]).*?</S2>"
+            reason_pattern = r"<S1>(.*?)</S1>"
+            score_match = re.findall(score_pattern, llm_output, re.DOTALL)
+            reason_match = re.findall(reason_pattern, llm_output, re.DOTALL)
+            if score_match:
+                score = float(score_match[0].strip())
+            if reason_match:
+                reason = reason_match[0].strip()
+        except ValueError as exc:
+            raise EvaluationException(
+                message=f"Failed to parse model output: \n{llm_output}",
+                internal_message="Failed to parse model output.",
+                category=ErrorCategory.FAILED_EXECUTION,
+                blame=ErrorBlame.SYSTEM_ERROR,
+            ) from exc

     return score, reason

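The restored parsing in `parse_quality_evaluator_reason_score` pulls the reasoning and score out of `<S1>`/`<S2>` tags; a self-contained example of those same regexes on a made-up model output:

```python
import re

# Fabricated model output in the <S1>/<S2> format the quality evaluators expect.
llm_output = "<S0>...</S0><S1>The response answers the question directly.</S1><S2>The score is 4</S2>"

score_pattern = r"<S2>\D*?([1-5]).*?</S2>"
reason_pattern = r"<S1>(.*?)</S1>"

score_match = re.findall(score_pattern, llm_output, re.DOTALL)
reason_match = re.findall(reason_pattern, llm_output, re.DOTALL)

score = float(score_match[0].strip()) if score_match else float("nan")
reason = reason_match[0].strip() if reason_match else ""

print(score)   # 4.0
print(reason)  # The response answers the question directly.
```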
@@ -329,12 +337,12 @@ def retrieve_content_type(assistant_messages: List, metric: str) -> str:
         return "image"

     # Iterate through each message
-    for
+    for message in assistant_messages:
         # Ensure "content" exists in the message and is iterable
-
-
-
-
+        if isinstance(message.get("content", []), list):
+            for content in message.get("content", []):
+                if content.get("type") == "image_url":
+                    return "image"
     # Default return if no image was found
     return "text"

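The restored loop in `retrieve_content_type` boils down to checking assistant messages for an `image_url` content part; a tiny standalone check over a sample message list (the message shape is illustrative):

```python
assistant_messages = [
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "Here is the picture you asked for."},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
        ],
    }
]


def content_type(messages) -> str:
    # Same shape as the restored loop: any image_url part means "image".
    for message in messages:
        if isinstance(message.get("content", []), list):
            for part in message.get("content", []):
                if part.get("type") == "image_url":
                    return "image"
    return "text"


print(content_type(assistant_messages))  # image
```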
@@ -362,6 +370,8 @@ def validate_conversation(conversation):
     )
     expected_roles = {"user", "assistant", "system"}
     image_found = False
+    assistant_message_count = 0
+    user_message_count = 0
     for num, message in enumerate(messages, 1):
         if not isinstance(message, dict):
             try:

@@ -374,17 +384,21 @@ def validate_conversation(conversation):
                 )
             except ImportError as ex:
                 raise MissingRequiredPackage(
-                    message="Please install 'azure-ai-inference' package to use SystemMessage,
+                    message="Please install 'azure-ai-inference' package to use SystemMessage, "
+                    "UserMessage or AssistantMessage."
                 ) from ex

-            if isinstance(
+            if isinstance(message, ChatRequestMessage) and not isinstance(
                 message, (UserMessage, AssistantMessage, SystemMessage)
             ):
                 raise_exception(
                     f"Messages must be a strongly typed class of ChatRequestMessage. Message number: {num}",
                     ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
                 )
-
+            if isinstance(message, AssistantMessage):
+                assistant_message_count += 1
+            if isinstance(message, UserMessage):
+                user_message_count += 1
             if isinstance(message.content, list) and any(
                 isinstance(item, ImageContentItem) for item in message.content
             ):

@@ -395,6 +409,10 @@ def validate_conversation(conversation):
                 f"Invalid role provided: {message.get('role')}. Message number: {num}",
                 ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
             )
+        if message.get("role") == "assistant":
+            assistant_message_count += 1
+        if message.get("role") == "user":
+            user_message_count += 1
         content = message.get("content")
         if not isinstance(content, (str, list)):
             raise_exception(

@@ -409,3 +427,19 @@ def validate_conversation(conversation):
             "Message needs to have multi-modal input like images.",
             ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
         )
+    if assistant_message_count == 0:
+        raise_exception(
+            "Assistant role required in one of the messages.",
+            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+        )
+    if user_message_count == 0:
+        raise_exception(
+            "User role required in one of the messages.",
+            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+        )
+    if assistant_message_count > 1:
+        raise_exception(
+            "Evaluators for multimodal conversations only support single turn. "
+            "User and assistant role expected as the only role in each message.",
+            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+        )
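The new counters enforce that a multimodal conversation contains at least one user message, at least one assistant message, and no more than a single assistant turn. A compact sketch of just that rule on plain dict messages (using `ValueError` in place of the SDK's `raise_exception` helper):

```python
def check_single_turn(messages: list) -> None:
    # Mirrors the counting rules added to validate_conversation (dict-message path only).
    assistant_count = sum(1 for m in messages if m.get("role") == "assistant")
    user_count = sum(1 for m in messages if m.get("role") == "user")
    if assistant_count == 0:
        raise ValueError("Assistant role required in one of the messages.")
    if user_count == 0:
        raise ValueError("User role required in one of the messages.")
    if assistant_count > 1:
        raise ValueError("Evaluators for multimodal conversations only support single turn.")


check_single_turn(
    [
        {"role": "user", "content": "Describe this image."},
        {"role": "assistant", "content": "It shows a red bicycle."},
    ]
)  # passes silently; a second assistant turn would raise
```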
azure/ai/evaluation/_constants.py

@@ -54,6 +54,7 @@ class EvaluationRunProperties:

     RUN_TYPE = "runType"
     EVALUATION_RUN = "_azureml.evaluation_run"
+    EVALUATION_SDK = "_azureml.evaluation_sdk_name"


 DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"

@@ -62,6 +63,7 @@ CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4

 PF_BATCH_TIMEOUT_SEC_DEFAULT = 3600
 PF_BATCH_TIMEOUT_SEC = "PF_BATCH_TIMEOUT_SEC"
+PF_DISABLE_TRACING = "PF_DISABLE_TRACING"

 OTEL_EXPORTER_OTLP_TRACES_TIMEOUT = "OTEL_EXPORTER_OTLP_TRACES_TIMEOUT"
 OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT = 60
azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py

@@ -14,6 +14,7 @@ from azure.ai.evaluation._constants import (
     OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT,
     PF_BATCH_TIMEOUT_SEC,
     PF_BATCH_TIMEOUT_SEC_DEFAULT,
+    PF_DISABLE_TRACING,
 )

 from ..._user_agent import USER_AGENT

@@ -36,8 +37,12 @@ class EvalRunContext:
         self.client = client
         self._is_batch_timeout_set_by_system = False
         self._is_otel_timeout_set_by_system = False
+        self._original_cwd = os.getcwd()

     def __enter__(self) -> None:
+        # Preserve current working directory, as PF may change it without restoring it afterward
+        self._original_cwd = os.getcwd()
+
         if isinstance(self.client, CodeClient):
             ClientUserAgentUtil.append_user_agent(USER_AGENT)
             inject_openai_api()

@@ -45,6 +50,7 @@ class EvalRunContext:
         if isinstance(self.client, ProxyClient):
             os.environ[PF_FLOW_ENTRY_IN_TMP] = "true"
             os.environ[PF_FLOW_META_LOAD_IN_SUBPROCESS] = "false"
+            os.environ[PF_DISABLE_TRACING] = "true"

             if os.environ.get(PF_BATCH_TIMEOUT_SEC) is None:
                 os.environ[PF_BATCH_TIMEOUT_SEC] = str(PF_BATCH_TIMEOUT_SEC_DEFAULT)

@@ -64,12 +70,15 @@ class EvalRunContext:
         exc_value: Optional[BaseException],
         exc_tb: Optional[types.TracebackType],
     ) -> None:
+        os.chdir(self._original_cwd)
+
         if isinstance(self.client, CodeClient):
             recover_openai_api()

         if isinstance(self.client, ProxyClient):
             os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
             os.environ.pop(PF_FLOW_META_LOAD_IN_SUBPROCESS, None)
+            os.environ.pop(PF_DISABLE_TRACING, None)

         if self._is_batch_timeout_set_by_system:
             os.environ.pop(PF_BATCH_TIMEOUT_SEC, None)
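The context-manager changes above follow a common pattern: remember the working directory on entry, set the promptflow environment switches, then restore everything on exit. A generic sketch of that pattern (the class and flag handling here are illustrative, not the SDK's own types):

```python
import os
import types
from typing import Optional, Type


class PreserveCwdAndEnv:
    """Illustrative context manager: save cwd, set an env flag, undo both on exit."""

    def __init__(self, env_name: str = "PF_DISABLE_TRACING") -> None:
        self._env_name = env_name
        self._original_cwd = os.getcwd()

    def __enter__(self) -> None:
        # Snapshot cwd in case the batch engine changes it and never changes it back.
        self._original_cwd = os.getcwd()
        os.environ[self._env_name] = "true"

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_value: Optional[BaseException],
        exc_tb: Optional[types.TracebackType],
    ) -> None:
        os.chdir(self._original_cwd)
        os.environ.pop(self._env_name, None)


with PreserveCwdAndEnv():
    pass  # batch run happens here; cwd and the env flag are restored afterwards
```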
azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py

@@ -68,12 +68,22 @@ class ProxyClient:  # pylint: disable=client-accepts-api-version-keyword
         run = proxy_run.run.result()

         # pylint: disable=protected-access
+        completed_lines = run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")
+        failed_lines = run._properties.get("system_metrics", {}).get("__pf__.lines.failed", "NA")
+
+        # Update status to "Completed with Errors" if the original status is "Completed" and there are failed lines
+        if run.status == "Completed" and failed_lines != "NA" and int(failed_lines) > 0:
+            status = "Completed with Errors"
+        else:
+            status = run.status
+
+        # Return the ordered dictionary with the updated status
         return OrderedDict(
             [
-                ("status",
+                ("status", status),
                 ("duration", str(run._end_time - run._created_on)),
-                ("completed_lines",
-                ("failed_lines",
+                ("completed_lines", completed_lines),
+                ("failed_lines", failed_lines),
                 ("log_path", str(run._output_path)),
             ]
         )
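The status rewrite in `ProxyClient` reads promptflow's line counters and downgrades a "Completed" run to "Completed with Errors" when any line failed; a minimal sketch of that decision on a fabricated `system_metrics` dict:

```python
from collections import OrderedDict

# Fabricated example of the run properties promptflow exposes.
properties = {"system_metrics": {"__pf__.lines.completed": "9", "__pf__.lines.failed": "1"}}
run_status = "Completed"

completed_lines = properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")
failed_lines = properties.get("system_metrics", {}).get("__pf__.lines.failed", "NA")

if run_status == "Completed" and failed_lines != "NA" and int(failed_lines) > 0:
    status = "Completed with Errors"
else:
    status = run_status

summary = OrderedDict(
    [("status", status), ("completed_lines", completed_lines), ("failed_lines", failed_lines)]
)
print(summary["status"])  # Completed with Errors
```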
azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py

@@ -6,6 +6,7 @@ import types
 from typing import Optional, Type

 from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP
+from azure.ai.evaluation._constants import PF_DISABLE_TRACING


 class TargetRunContext:

@@ -15,21 +16,31 @@ class TargetRunContext:
     :type upload_snapshot: bool
     """

-    def __init__(self, upload_snapshot: bool) -> None:
+    def __init__(self, upload_snapshot: bool = False) -> None:
         self._upload_snapshot = upload_snapshot
+        self._original_cwd = os.getcwd()

     def __enter__(self) -> None:
+        # Preserve current working directory, as PF may change it without restoring it afterward
+        self._original_cwd = os.getcwd()
+
         # Address "[WinError 32] The process cannot access the file" error,
         # caused by conflicts when the venv and target function are in the same directory.
         # Setting PF_FLOW_ENTRY_IN_TMP to true uploads only the flex entry file (flow.flex.yaml).
         if not self._upload_snapshot:
             os.environ[PF_FLOW_ENTRY_IN_TMP] = "true"

+        os.environ[PF_DISABLE_TRACING] = "true"
+
     def __exit__(
         self,
         exc_type: Optional[Type[BaseException]],
         exc_value: Optional[BaseException],
         exc_tb: Optional[types.TracebackType],
     ) -> None:
+        os.chdir(self._original_cwd)
+
         if not self._upload_snapshot:
             os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
+
+        os.environ.pop(PF_DISABLE_TRACING, None)
azure/ai/evaluation/_evaluate/_eval_run.py

@@ -22,28 +22,12 @@ from azure.ai.evaluation._version import VERSION
 from azure.core.pipeline.policies import RetryPolicy
 from azure.core.rest import HttpResponse
 from azure.core.exceptions import HttpResponseError
+from azure.storage.blob import BlobServiceClient
+from azure.ai.evaluation._azure._clients import LiteMLClient

 LOGGER = logging.getLogger(__name__)


-# Handle optional import. The azure libraries are only present if
-# promptflow-azure is installed.
-try:
-    from azure.ai.ml import MLClient
-    from azure.ai.ml.entities._credentials import AccountKeyConfiguration  # pylint: disable=ungrouped-imports
-    from azure.ai.ml.entities._datastore.datastore import Datastore
-    from azure.storage.blob import BlobServiceClient
-except (ModuleNotFoundError, ImportError):
-    # If the above mentioned modules cannot be imported, we are running
-    # in local mode and MLClient in the constructor will be None, so
-    # we will not arrive to Azure-dependent code.
-
-    # We are logging the import failure only if debug logging level is set because:
-    # - If the project configuration was not provided this import is not needed.
-    # - If the project configuration was provided, the error will be raised by PFClient.
-    LOGGER.debug("promptflow.azure is not installed.")
-
-
 @dataclasses.dataclass
 class RunInfo:
     """

@@ -92,18 +76,18 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
     :type group_name: str
     :param workspace_name: The name of workspace/project used to track run.
     :type workspace_name: str
-    :param
-    :type
+    :param management_client: The trace destination string to parse the AI ML workspace blob store from.
+    :type management_client:
+        ~azure.ai.evaluation._promptflow.azure._lite_azure_management_client.LiteMLClient
     :param promptflow_run: The promptflow run used by the
+    :type promptflow_run: Optional[promptflow._sdk.entities.Run]
     """

     _MAX_RETRIES = 5
     _BACKOFF_FACTOR = 2
     _TIMEOUT = 5
-    _SCOPE = "https://management.azure.com/.default"

     EVALUATION_ARTIFACT = "instance_results.jsonl"
-    EVALUATION_ARTIFACT_DUMMY_RUN = "eval_results.jsonl"

     def __init__(
         self,

@@ -112,14 +96,14 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         subscription_id: str,
         group_name: str,
         workspace_name: str,
-
+        management_client: LiteMLClient,
         promptflow_run: Optional[Run] = None,
     ) -> None:
         self._tracking_uri: str = tracking_uri
         self._subscription_id: str = subscription_id
         self._resource_group_name: str = group_name
         self._workspace_name: str = workspace_name
-        self.
+        self._management_client: LiteMLClient = management_client
         self._is_promptflow_run: bool = promptflow_run is not None
         self._run_name = run_name
         self._promptflow_run = promptflow_run

@@ -184,7 +168,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         if self._promptflow_run is not None:
             self._info = RunInfo(
                 self._promptflow_run.name,
-                self._promptflow_run._experiment_name,  # pylint: disable=protected-access
+                self._promptflow_run._experiment_name or "",  # pylint: disable=protected-access
                 self._promptflow_run.name,
             )
         else:

@@ -310,12 +294,8 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         """
         return f"https://{self._url_base}" "/mlflow/v2.0" f"{self._get_scope()}" f"/api/2.0/mlflow/runs/log-metric"

-    def _get_token(self):
-
-        # is an optional dependency.
-        from promptflow.azure._utils._token_cache import ArmTokenCache  # pylint: disable=import-error,no-name-in-module
-
-        return ArmTokenCache().get_token(self._ml_client._credential)  # pylint: disable=protected-access
+    def _get_token(self) -> str:
+        return self._management_client.get_token()

     def request_with_retry(
         self, url: str, method: str, json_dict: Dict[str, Any], headers: Optional[Dict[str, str]] = None

@@ -413,7 +393,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         """
         if not self._check_state_and_log("log artifact", {RunStatus.BROKEN, RunStatus.NOT_STARTED}, False):
             return
-        # Check if artifact
+        # Check if artifact directory is empty or does not exist.
         if not os.path.isdir(artifact_folder):
             LOGGER.warning("The path to the artifact is either not a directory or does not exist.")
             return

@@ -441,9 +421,10 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
                 local_paths.append(local_file_path)

         # We will write the artifacts to the workspaceblobstore
-        datastore = self.
+        datastore = self._management_client.workspace_get_default_datastore(self._workspace_name, True)
         account_url = f"{datastore.account_name}.blob.{datastore.endpoint}"
-
+
+        svc_client = BlobServiceClient(account_url=account_url, credential=datastore.credential)
         try:
             for local, remote in zip(local_paths, remote_paths["paths"]):
                 blob_client = svc_client.get_blob_client(container=datastore.container_name, blob=remote["path"])
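Artifact upload now goes straight through azure-storage-blob using the datastore returned by `LiteMLClient`; a hedged sketch of that upload loop, with placeholder account, container, and credential values standing in for the datastore fields resolved at runtime:

```python
from azure.storage.blob import BlobServiceClient

# Placeholder values; in _eval_run.py these come from the default workspace datastore.
account_url = "https://<storage-account>.blob.core.windows.net"
credential = "<datastore-credential-or-token>"
container_name = "<workspace-blobstore-container>"

local_and_remote_paths = [("./artifacts/instance_results.jsonl", "runs/<run-id>/instance_results.jsonl")]

svc_client = BlobServiceClient(account_url=account_url, credential=credential)
for local_path, remote_path in local_and_remote_paths:
    blob_client = svc_client.get_blob_client(container=container_name, blob=remote_path)
    with open(local_path, "rb") as handle:
        blob_client.upload_blob(handle)  # mirrors the per-file loop in log_artifact
```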
@@ -469,6 +450,7 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         # we are rewriting already registered artifact and need to skip this step.
         if self._is_promptflow_run:
             return
+
         url = (
             f"https://{self._url_base}/artifact/v2.0/subscriptions/{self._subscription_id}"
             f"/resourceGroups/{self._resource_group_name}/providers/"

@@ -491,15 +473,28 @@ class EvalRun(contextlib.AbstractContextManager):  # pylint: disable=too-many-in
         if response.status_code != 200:
             self._log_warning("register artifact", response)

-
-
-
-
-
-
-
-
-
+        # register artifacts for images if exists in image folder
+        try:
+            for remote_path in remote_paths["paths"]:
+                remote_file_path = remote_path["path"]
+                if "images" in os.path.normpath(remote_file_path).split(os.sep):
+                    response = self.request_with_retry(
+                        url=url,
+                        method="POST",
+                        json_dict={
+                            "origin": "ExperimentRun",
+                            "container": f"dcid.{self.info.run_id}",
+                            "path": posixpath.join("images", os.path.basename(remote_file_path)),
+                            "dataPath": {
+                                "dataStoreName": datastore.name,
+                                "relativePath": remote_file_path,
+                            },
+                        },
+                    )
+                    if response.status_code != 200:
+                        self._log_warning("register image artifact", response)
+        except Exception as ex:  # pylint: disable=broad-exception-caught
+            LOGGER.debug("Exception occurred while registering image artifact. ex: %s", ex)

     def log_metric(self, key: str, value: float) -> None:
         """
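The new image-artifact registration only fires for files that sit under an images/ folder in the remote path; the check itself is just a path-segment test, shown standalone here with a made-up remote path:

```python
import os
import posixpath


def is_image_artifact(remote_file_path: str) -> bool:
    # Same test as above: treat the file as an image artifact if any path segment is "images".
    return "images" in os.path.normpath(remote_file_path).split(os.sep)


remote = "runs/abc/images/chart_0.png"  # fabricated remote path
print(is_image_artifact(remote))                           # True
print(posixpath.join("images", os.path.basename(remote)))  # images/chart_0.png
```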