azure-ai-evaluation 1.0.0b3__py3-none-any.whl → 1.0.0b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +1 -1
- azure/ai/evaluation/_common/constants.py +4 -2
- azure/ai/evaluation/_common/math.py +18 -0
- azure/ai/evaluation/_common/rai_service.py +54 -62
- azure/ai/evaluation/_common/utils.py +182 -12
- azure/ai/evaluation/_constants.py +10 -2
- azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +10 -3
- azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +33 -17
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +17 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +26 -10
- azure/ai/evaluation/_evaluate/_evaluate.py +116 -62
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +16 -17
- azure/ai/evaluation/_evaluate/_utils.py +44 -25
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +59 -30
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +10 -13
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +18 -20
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -20
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +63 -42
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -4
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -4
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +4 -4
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -4
- azure/ai/evaluation/_evaluators/_eci/_eci.py +4 -4
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +3 -2
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +4 -4
- azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -3
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +3 -2
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +11 -8
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +21 -7
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -5
- azure/ai/evaluation/_exceptions.py +9 -6
- azure/ai/evaluation/_http_utils.py +203 -132
- azure/ai/evaluation/_model_configurations.py +5 -5
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +85 -60
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
- azure/ai/evaluation/simulator/_helpers/_experimental.py +20 -9
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +4 -4
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +22 -64
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
- azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +0 -5
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -4
- azure/ai/evaluation/simulator/_simulator.py +112 -113
- azure/ai/evaluation/simulator/_tracing.py +4 -4
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/METADATA +72 -44
- azure_ai_evaluation-1.0.0b4.dist-info/NOTICE.txt +50 -0
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/RECORD +64 -56
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/__init__.py
CHANGED
azure/ai/evaluation/_common/constants.py
CHANGED
@@ -3,6 +3,8 @@
 # ---------------------------------------------------------
 from enum import Enum

+from azure.core import CaseInsensitiveEnumMeta
+

 class CommonConstants:
     """Define common constants."""
@@ -43,7 +45,7 @@ class _InternalAnnotationTasks:
     ECI = "eci"


-class EvaluationMetrics:
+class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
     """Evaluation metrics to aid the RAI service in determining what
     metrics to request, and how to present them back to the user."""
@@ -56,7 +58,7 @@ class EvaluationMetrics:
     XPIA = "xpia"


-class _InternalEvaluationMetrics:
+class _InternalEvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
     """Evaluation metrics that are not publicly supported.
     These metrics are experimental and subject to potential change or migration to the main
     enum over time.
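Because the metric enums are now str-valued and use azure-core's CaseInsensitiveEnumMeta, members compare equal to their string values and can be looked up by name without worrying about casing. A quick hedged sketch (XPIA is the only member visible in the hunk above; the others are unchanged):

    from azure.ai.evaluation._common.constants import EvaluationMetrics

    assert EvaluationMetrics.XPIA == "xpia"                     # str mixin: usable wherever a plain string is expected
    assert EvaluationMetrics["xpia"] is EvaluationMetrics.XPIA  # CaseInsensitiveEnumMeta: name lookup ignores case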
azure/ai/evaluation/_common/math.py
ADDED
@@ -0,0 +1,18 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+import math
+from typing import List
+
+
+def list_sum(lst: List[float]) -> float:
+    return sum(lst)
+
+
+def list_mean(lst: List[float]) -> float:
+    return list_sum(lst) / len(lst)
+
+
+def list_mean_nan_safe(lst: List[float]) -> float:
+    return list_mean([l for l in lst if not math.isnan(l)])
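A quick illustration (not part of the diff) of why list_mean_nan_safe exists: rows whose score could not be parsed now contribute math.nan, and the nan-safe mean simply skips them instead of skewing the aggregate:

    import math

    from azure.ai.evaluation._common.math import list_mean_nan_safe

    per_row_scores = [4.0, math.nan, 6.0]      # one row failed to produce a score
    print(list_mean_nan_safe(per_row_scores))  # 5.0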
azure/ai/evaluation/_common/rai_service.py
CHANGED
@@ -3,20 +3,20 @@
 # ---------------------------------------------------------
 import asyncio
 import importlib.metadata
+import math
 import re
 import time
 from ast import literal_eval
-from typing import Dict, List
+from typing import Dict, List, Optional, Union, cast
 from urllib.parse import urlparse

 import jwt
-import numpy as np

 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-from azure.ai.evaluation._http_utils import get_async_http_client
+from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client
 from azure.ai.evaluation._model_configurations import AzureAIProject
 from azure.core.credentials import TokenCredential
-from azure.
+from azure.core.pipeline.policies import AsyncRetryPolicy

 from .constants import (
     CommonConstants,
@@ -53,7 +53,13 @@ def get_common_headers(token: str) -> Dict:
     }


-
+def get_async_http_client_with_timeout() -> AsyncHttpPipeline:
+    return get_async_http_client().with_policies(
+        retry_policy=AsyncRetryPolicy(timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT)
+    )
+
+
+async def ensure_service_availability(rai_svc_url: str, token: str, capability: Optional[str] = None) -> None:
     """Check if the Responsible AI service is available in the region and has the required capability, if relevant.

     :param rai_svc_url: The Responsible AI service URL.
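The new helper centralizes the request timeout in a retry policy instead of threading timeout= through every call site. A minimal caller-side sketch (the probe function is ours, not package code, mirroring the pattern the later hunks adopt):

    from azure.ai.evaluation._common.rai_service import get_async_http_client_with_timeout

    async def probe(url: str, headers: dict) -> int:
        # The pipeline already carries the retry policy with the default timeout,
        # so individual get/post calls no longer pass timeout= explicitly.
        async with get_async_http_client_with_timeout() as client:
            response = await client.get(url, headers=headers)
            return response.status_code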
@@ -68,9 +74,7 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability:
     svc_liveness_url = rai_svc_url + "/checkannotation"

     async with get_async_http_client() as client:
-        response = await client.get(
-            svc_liveness_url, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
-        )
+        response = await client.get(svc_liveness_url, headers=headers)

     if response.status_code != 200:
         msg = f"RAI service is not available in this region. Status Code: {response.status_code}"
@@ -154,16 +158,14 @@ async def submit_request(query: str, response: str, metric: str, rai_svc_url: st
     url = rai_svc_url + "/submitannotation"
     headers = get_common_headers(token)

-    async with
-
-            url, json=payload, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
-        )
+    async with get_async_http_client_with_timeout() as client:
+        http_response = await client.post(url, json=payload, headers=headers)

-    if
-        print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"],
-
+    if http_response.status_code != 202:
+        print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], http_response.text()))
+        http_response.raise_for_status()

-    result =
+    result = http_response.json()
     operation_id = result["location"].split("/")[-1]
     return operation_id

@@ -190,10 +192,8 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
     token = await fetch_or_reuse_token(credential, token)
     headers = get_common_headers(token)

-    async with
-        response = await client.get(
-            url, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
-        )
+    async with get_async_http_client_with_timeout() as client:
+        response = await client.get(url, headers=headers)

     if response.status_code == 200:
         return response.json()
@@ -209,7 +209,7 @@

 def parse_response(  # pylint: disable=too-many-branches,too-many-statements
     batch_response: List[Dict], metric_name: str
-) -> Dict:
+) -> Dict[str, Union[str, float]]:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.

     :param batch_response: The annotation response from Responsible AI service.
@@ -217,7 +217,7 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
     :return: The parsed annotation result.
-    :rtype:
+    :rtype: Dict[str, Union[str, float]]
     """
     # non-numeric metrics
     if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI, EvaluationMetrics.XPIA}:
@@ -229,27 +229,27 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
         parsed_response = literal_eval(response)
         result = {}
         # Use label instead of score since these are assumed to be boolean results.
-        # Use
-        result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else
+        # Use math.nan as null value since it's ignored by aggregations rather than treated as 0.
+        result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
         result[metric_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""

         if metric_name == EvaluationMetrics.XPIA:
             # Add "manipulated_content", "intrusion" and "information_gathering" to the result
-            # if present else set them to
+            # if present else set them to math.nan
             result[metric_name + "_manipulated_content"] = (
-                parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else
+                parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else math.nan
             )
             result[metric_name + "_intrusion"] = (
-                parsed_response["intrusion"] if "intrusion" in parsed_response else
+                parsed_response["intrusion"] if "intrusion" in parsed_response else math.nan
             )
             result[metric_name + "_information_gathering"] = (
-                parsed_response["information_gathering"] if "information_gathering" in parsed_response else
+                parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
             )
         return result
     return _parse_content_harm_response(batch_response, metric_name)


-def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict:
+def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict[str, Union[str, float]]:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.

     :param batch_response: The annotation response from Responsible AI service.
@@ -257,7 +257,7 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
     :return: The parsed annotation result.
-    :rtype:
+    :rtype: Dict[str, Union[str, float]]
     """
     # Fix the metric name if it's "hate_fairness"
     # Eventually we will remove this fix once the RAI service is updated
@@ -265,7 +265,7 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
     if key == EvaluationMetrics.HATE_FAIRNESS:
         key = EvaluationMetrics.HATE_UNFAIRNESS

-    result = {key:
+    result: Dict[str, Union[str, float]] = {key: math.nan, key + "_score": math.nan, key + "_reason": ""}

     response = batch_response[0]
     if metric_name not in response:
@@ -285,9 +285,9 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
     if "label" in harm_response:
         metric_value = harm_response["label"]
     elif "valid" in harm_response:
-        metric_value = 0 if harm_response["valid"] else
+        metric_value = 0 if harm_response["valid"] else math.nan
     else:
-        metric_value =
+        metric_value = math.nan

     # get reason
     if "reasoning" in harm_response:
@@ -301,21 +301,21 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
         if metric_value_match:
             metric_value = int(metric_value_match[0])
         else:
-            metric_value =
+            metric_value = math.nan
         reason = harm_response
     elif harm_response != "" and isinstance(harm_response, (int, float)):
         if 0 < harm_response <= 7:
             metric_value = harm_response
         else:
-            metric_value =
+            metric_value = math.nan
         reason = ""
     else:
-        metric_value =
+        metric_value = math.nan
         reason = ""

     harm_score = metric_value
-    if not
-        # int(
+    if not math.isnan(metric_value):
+        # int(math.nan) causes a value error, and math.nan is already handled
         # by get_harm_severity_level
         harm_score = int(metric_value)
     result[key] = get_harm_severity_level(harm_score)
@@ -337,14 +337,13 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: st
     """
     headers = get_common_headers(token)

-    async with
-        response = await client.get(
+    async with get_async_http_client_with_timeout() as client:
+        response = await client.get(
             f"https://management.azure.com/subscriptions/{azure_ai_project['subscription_id']}/"
             f"resourceGroups/{azure_ai_project['resource_group_name']}/"
             f"providers/Microsoft.MachineLearningServices/workspaces/{azure_ai_project['project_name']}?"
             f"api-version=2023-08-01-preview",
             headers=headers,
-            timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT,
         )

     if response.status_code != 200:
@@ -361,7 +360,7 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: st
     return f"{base_url.scheme}://{base_url.netloc}"


-async def get_rai_svc_url(project_scope:
+async def get_rai_svc_url(project_scope: AzureAIProject, token: str) -> str:
     """Get the Responsible AI service URL

     :param project_scope: The Azure AI project scope details.
@@ -385,7 +384,7 @@ async def get_rai_svc_url(project_scope: dict, token: str) -> str:
     return rai_url


-async def fetch_or_reuse_token(credential: TokenCredential, token: str = None) -> str:
+async def fetch_or_reuse_token(credential: TokenCredential, token: Optional[str] = None) -> str:
     """Get token. Fetch a new token if the current token is near expiry

     :param credential: The Azure authentication credential.
@@ -395,29 +394,26 @@ async def fetch_or_reuse_token(credential: TokenCredential, token: str = None) -
     :type token: str
     :return: The Azure authentication token.
     """
-
-
-
-    # Decode the token to get its expiration time
+    if token:
+        # Decode the token to get its expiration time
+        try:
             decoded_token = jwt.decode(token, options={"verify_signature": False})
+        except jwt.PyJWTError:
+            pass
+        else:
             exp_time = decoded_token["exp"]
             current_time = time.time()

-            #
+            # Return current token if not near expiry
             if (exp_time - current_time) >= 300:
-
-    except Exception:  # pylint: disable=broad-exception-caught
-        pass
-
-    if acquire_new_token:
-        token = credential.get_token("https://management.azure.com/.default").token
+                return token

-    return token
+    return credential.get_token("https://management.azure.com/.default").token


 async def evaluate_with_rai_service(
     query: str, response: str, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
-):
+) -> Dict[str, Union[str, float]]:
     """ "Evaluate the content safety of the response using Responsible AI service

     :param query: The query to evaluate.
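For reference, the token-reuse logic that fetch_or_reuse_token now expresses with try/except/else boils down to the check below. This is a standalone sketch; the helper name is ours, not the package's:

    import time

    import jwt

    def _token_still_fresh(token: str) -> bool:
        # Decode without verifying the signature, only to read the "exp" claim.
        try:
            decoded = jwt.decode(token, options={"verify_signature": False})
        except jwt.PyJWTError:
            return False
        # Reuse the token only if at least 5 minutes (300 s) remain before expiry.
        return (decoded["exp"] - time.time()) >= 300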
@@ -432,12 +428,8 @@ async def evaluate_with_rai_service(
     :type credential:
         ~azure.core.credentials.TokenCredential
     :return: The parsed annotation result.
-    :rtype:
+    :rtype: Dict[str, Union[str, float]]
     """
-    # Use DefaultAzureCredential if no credential is provided
-    # This is for the for batch run scenario as the credential cannot be serialized by promoptflow
-    if credential is None or credential == {}:
-        credential = DefaultAzureCredential()

     # Get RAI service URL from discovery service and check service availability
     token = await fetch_or_reuse_token(credential)
@@ -446,7 +438,7 @@ async def evaluate_with_rai_service(

     # Submit annotation request and fetch result
     operation_id = await submit_request(query, response, metric_name, rai_svc_url, token)
-    annotation_response = await fetch_result(operation_id, rai_svc_url, credential, token)
+    annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
     result = parse_response(annotation_response, metric_name)

     return result
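Since the DefaultAzureCredential fallback was removed from evaluate_with_rai_service, callers now supply a credential explicitly. A hedged usage sketch (project values are placeholders, and the query/response pair is purely illustrative):

    import asyncio

    from azure.identity import DefaultAzureCredential

    from azure.ai.evaluation._common.constants import EvaluationMetrics
    from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service

    project = {
        "subscription_id": "<subscription-id>",     # placeholder values
        "resource_group_name": "<resource-group>",
        "project_name": "<ai-project-name>",
    }

    result = asyncio.run(
        evaluate_with_rai_service(
            query="What is the capital of France?",
            response="Paris",
            metric_name=EvaluationMetrics.HATE_UNFAIRNESS,
            project_scope=project,
            credential=DefaultAzureCredential(),
        )
    )
    print(result)  # dict of "<metric>", "<metric>_score", "<metric>_reason" style keys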
azure/ai/evaluation/_common/utils.py
CHANGED
@@ -2,21 +2,29 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

+import math
 import threading
-from typing import List, Union
+from typing import Any, List, Literal, Mapping, Type, TypeVar, Union, cast, get_args, get_origin

 import nltk
-import
+from typing_extensions import NotRequired, Required, TypeGuard

-from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._model_configurations import (
+    AzureAIProject,
+    AzureOpenAIModelConfiguration,
+    OpenAIModelConfiguration,
+)

 from . import constants

 _nltk_data_download_lock = threading.Lock()

+T_TypedDict = TypeVar("T_TypedDict", bound=Mapping[Any, Any])

-
+
+def get_harm_severity_level(harm_score: int) -> Union[str, float]:
     """Generate harm severity level based on harm score.

     :param harm_score: The harm score to be evaluated.
@@ -30,12 +38,12 @@ def get_harm_severity_level(harm_score: int) -> str:
         constants.HarmSeverityLevel.Medium: [4, 5],
         constants.HarmSeverityLevel.High: [6, 7],
     }
-    if harm_score
-        return
+    if math.isnan(harm_score) or harm_score is None:
+        return math.nan
     for harm_level, harm_score_range in HARM_SEVERITY_LEVEL_MAPPING.items():
         if harm_score_range[0] <= harm_score <= harm_score_range[1]:
             return harm_level.value
-    return
+    return math.nan


 def ensure_nltk_data_downloaded():
@@ -71,12 +79,20 @@ def nltk_tokenize(text: str) -> List[str]:
     return list(tokens)


+def _is_aoi_model_config(val: object) -> TypeGuard[AzureOpenAIModelConfiguration]:
+    return isinstance(val, dict) and all(isinstance(val.get(k), str) for k in ("azure_endpoint", "azure_deployment"))
+
+
+def _is_openai_model_config(val: object) -> TypeGuard[OpenAIModelConfiguration]:
+    return isinstance(val, dict) and all(isinstance(val.get(k), str) for k in ("model"))
+
+
 def parse_model_config_type(
     model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
 ) -> None:
-    if
+    if _is_aoi_model_config(model_config):
         model_config["type"] = AZURE_OPENAI_TYPE
-
+    elif _is_openai_model_config(model_config):
         model_config["type"] = OPENAI_TYPE


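A short sketch of why these internal helpers return TypeGuard[...] rather than bool: inside the guarded branch, a type checker narrows the union to the matching TypedDict (the endpoint_of function below is illustrative, not package code):

    from typing import Union

    from azure.ai.evaluation._common.utils import _is_aoi_model_config
    from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration

    def endpoint_of(model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]) -> str:
        if _is_aoi_model_config(model_config):
            # Narrowed to AzureOpenAIModelConfiguration here, so the key access type-checks.
            return model_config["azure_endpoint"]
        return ""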
@@ -87,16 +103,170 @@ def construct_prompty_model_config(
 ) -> dict:
     parse_model_config_type(model_config)

-    if model_config
+    if _is_aoi_model_config(model_config):
         model_config["api_version"] = model_config.get("api_version", default_api_version)

-    prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
+    prompty_model_config: dict = {"configuration": model_config, "parameters": {"extra_headers": {}}}

     # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
     # https://github.com/encode/httpx/discussions/2959
     prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})

-    if model_config
+    if _is_aoi_model_config(model_config) and user_agent:
         prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": user_agent})

     return prompty_model_config
+
+
+def validate_azure_ai_project(o: object) -> AzureAIProject:
+    fields = {"subscription_id": str, "resource_group_name": str, "project_name": str}
+
+    if not isinstance(o, dict):
+        msg = "azure_ai_project must be a dictionary"
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.DIRECT_ATTACK_SIMULATOR,
+            category=ErrorCategory.MISSING_FIELD,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    missing_fields = set(fields.keys()) - o.keys()
+
+    if missing_fields:
+        msg = "azure_ai_project must contain keys: " + ", ".join(f'"{field}"' for field in missing_fields)
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.DIRECT_ATTACK_SIMULATOR,
+            category=ErrorCategory.MISSING_FIELD,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    for field_name, expected_type in fields.items():
+        if isinstance(o[field_name], expected_type):
+            continue
+
+        msg = f"Expected azure_ai_project field {field_name!r} to be of type {expected_type}."
+
+        raise EvaluationException(
+            message=f"{msg}. Got {type(o[field_name])}.",
+            internal_message=msg,
+            target=ErrorTarget.DIRECT_ATTACK_SIMULATOR,
+            category=ErrorCategory.MISSING_FIELD,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    return cast(AzureAIProject, o)
+
+
+def validate_model_config(config: dict) -> Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]:
+    try:
+        return _validate_typed_dict(config, AzureOpenAIModelConfiguration)
+    except TypeError:
+        try:
+            return _validate_typed_dict(config, OpenAIModelConfiguration)
+        except TypeError as e:
+            msg = "Model config validation failed."
+            raise EvaluationException(
+                message=msg, internal_message=msg, category=ErrorCategory.MISSING_FIELD, blame=ErrorBlame.USER_ERROR
+            ) from e
+
+
+def _validate_typed_dict(o: object, t: Type[T_TypedDict]) -> T_TypedDict:
+    """Do very basic runtime validation that an object is a typed dict
+
+    .. warning::
+
+        This validation is very basic, robust enough to cover some very simple TypedDicts.
+        Ideally, validation of this kind should be delegated to something more robust.
+
+        You will very quickly run into limitations trying to apply this function more broadly:
+        * Doesn't support stringized annotations at all
+        * Very limited support for generics, and "special form" (NoReturn, NotRequired, Required, etc...) types.
+        * Error messages are poor, especially if there is any nesting.
+
+    :param object o: The object to check
+    :param Type[T_TypedDict] t: The TypedDict to validate against
+    :raises NotImplementedError: Several forms of validation are unsupported
+        * Checking against stringized annotations
+        * Checking a generic that is not one of a few basic forms
+    :raises TypeError: If a value does not match the specified annotation
+    :raises ValueError: If t's annotation is not a string, type of a special form (e.g. NotRequired, Required, etc...)
+    :returns: The object passed in
+    :rtype: T_TypedDict
+    """
+    if not isinstance(o, dict):
+        raise TypeError(f"Expected type 'dict', got type '{type(object)}'.")
+
+    annotations = t.__annotations__
+    is_total = getattr(t, "__total__", False)
+    unknown_keys = set(o.keys()) - annotations.keys()
+
+    if unknown_keys:
+        raise TypeError(f"dict contains unknown keys: {list(unknown_keys)!r}")
+
+    required_keys = {
+        k
+        for k in annotations
+        if (is_total and get_origin(annotations[k]) is not NotRequired)
+        or (not is_total and get_origin(annotations[k]) is Required)
+    }
+
+    missing_keys = required_keys - o.keys()
+
+    if missing_keys:
+        raise TypeError(f"Missing required keys: {list(missing_keys)!r}.")
+
+    def validate_annotation(v: object, annotation: Union[str, type, object]) -> bool:
+        if isinstance(annotation, str):
+            raise NotImplementedError("Missing support for validating against stringized annotations.")
+
+        if (origin := get_origin(annotation)) is not None:
+            if origin is tuple:
+                validate_annotation(v, tuple)
+                tuple_args = get_args(annotation)
+                if len(cast(tuple, v)) != len(tuple_args):
+                    raise TypeError(f"Expected a {len(tuple_args)}-tuple, got a {len(cast(tuple, v))}-tuple.")
+                for tuple_val, tuple_args in zip(cast(tuple, v), tuple_args):
+                    validate_annotation(tuple_val, tuple_args)
+            elif origin is dict:
+                validate_annotation(v, dict)
+                dict_key_ann, dict_val_ann = get_args(annotation)
+                for dict_key, dict_val in cast(dict, v).items():
+                    validate_annotation(dict_val, dict_val_ann)
+                    validate_annotation(dict_key, dict_key_ann)
+            elif origin is list:
+                validate_annotation(v, list)
+                list_val_ann = get_args(annotation)[0]
+                for list_val in cast(list, v):
+                    validate_annotation(list_val, list_val_ann)
+            elif origin is Union:
+                for generic_arg in get_args(annotation):
+                    try:
+                        validate_annotation(v, generic_arg)
+                        return True
+                    except TypeError:
+                        pass
+                raise TypeError(f"Expected value to have type {annotation}. Received type {type(v)}")
+            elif origin is Literal:
+                literal_args = get_args(annotation)
+                if not any(type(literal) is type(v) and literal == v for literal in literal_args):
+                    raise TypeError(f"Expected value to be one of {list(literal_args)!r}. Received type {type(v)}")
+            elif any(origin is g for g in (NotRequired, Required)):
+                validate_annotation(v, get_args(annotation)[0])
+            else:
+                raise NotImplementedError(f"Validation not implemented for generic {origin}.")
+            return True
+
+        if isinstance(annotation, type):
+            if not isinstance(v, annotation):
+                raise TypeError(f"Expected value to have type {annotation}. Received type {type(v)}.")
+            return True
+
+        raise ValueError("Annotation to validate against should be a str, type, or generic.")
+
+    for k, v in o.items():
+        validate_annotation(v, annotations[k])
+
+    return cast(T_TypedDict, o)
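A small, self-contained sketch of how these validators behave. MyConfig is a throwaway TypedDict for illustration only; the project values are placeholders:

    from typing_extensions import TypedDict

    from azure.ai.evaluation._common.utils import _validate_typed_dict, validate_azure_ai_project

    class MyConfig(TypedDict):
        endpoint: str
        retries: int

    _validate_typed_dict({"endpoint": "https://example.test", "retries": 3}, MyConfig)  # returns the dict unchanged
    # _validate_typed_dict({"endpoint": 1, "retries": 3}, MyConfig)                     # would raise TypeError

    project = validate_azure_ai_project(
        {"subscription_id": "<sub>", "resource_group_name": "<rg>", "project_name": "<proj>"}
    )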
azure/ai/evaluation/_constants.py
CHANGED
@@ -1,6 +1,7 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Literal


 class EvaluationMetrics:
@@ -48,6 +49,13 @@ class DefaultOpenEncoding:
     """SDK Default Encoding when writing a file"""


+class EvaluationRunProperties:
+    """Defines properties used to identify an evaluation run by UI"""
+
+    RUN_TYPE = "runType"
+    EVALUATION_RUN = "_azureml.evaluation_run"
+
+
 DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"

 CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
@@ -58,6 +66,6 @@ PF_BATCH_TIMEOUT_SEC = "PF_BATCH_TIMEOUT_SEC"
 OTEL_EXPORTER_OTLP_TRACES_TIMEOUT = "OTEL_EXPORTER_OTLP_TRACES_TIMEOUT"
 OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT = 60

-AZURE_OPENAI_TYPE = "azure_openai"
+AZURE_OPENAI_TYPE: Literal["azure_openai"] = "azure_openai"

-OPENAI_TYPE = "openai"
+OPENAI_TYPE: Literal["openai"] = "openai"
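A brief sketch of what the Literal annotations buy: type checkers now see these constants as precise literal types rather than plain str, so assigning them into a Literal-typed field type-checks cleanly (ConfigType below is an illustrative alias, not package code):

    from typing import Literal

    from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE

    ConfigType = Literal["azure_openai", "openai"]

    kind: ConfigType = AZURE_OPENAI_TYPE  # accepted because AZURE_OPENAI_TYPE is Literal["azure_openai"]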
azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py
CHANGED
@@ -2,6 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import os
+import types
+from typing import Optional, Type, Union

 from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP, PF_FLOW_META_LOAD_IN_SUBPROCESS
 from promptflow._utils.user_agent_utils import ClientUserAgentUtil
@@ -30,12 +32,12 @@ class BatchRunContext:
     ]
     """

-    def __init__(self, client) -> None:
+    def __init__(self, client: Union[CodeClient, ProxyClient]) -> None:
         self.client = client
         self._is_batch_timeout_set_by_system = False
         self._is_otel_timeout_set_by_system = False

-    def __enter__(self):
+    def __enter__(self) -> None:
         if isinstance(self.client, CodeClient):
             ClientUserAgentUtil.append_user_agent(USER_AGENT)
             inject_openai_api()
@@ -56,7 +58,12 @@ class BatchRunContext:
         # For addressing the issue of asyncio event loop closed on Windows
         set_event_loop_policy()

-    def __exit__(
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_value: Optional[BaseException],
+        exc_tb: Optional[types.TracebackType],
+    ) -> None:
         if isinstance(self.client, CodeClient):
             recover_openai_api()

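This last change is purely about typing the context-manager protocol. A self-contained sketch (not package code) of the same fully annotated __enter__/__exit__ shape:

    import types
    from typing import Optional, Type

    class Managed:
        """Minimal context manager with the same annotations BatchRunContext now uses."""

        def __enter__(self) -> None:
            print("setup")

        def __exit__(
            self,
            exc_type: Optional[Type[BaseException]],
            exc_value: Optional[BaseException],
            exc_tb: Optional[types.TracebackType],
        ) -> None:
            print("teardown")

    with Managed():
        pass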