azure-ai-evaluation 1.2.0__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +42 -14
- azure/ai/evaluation/_azure/_models.py +6 -6
- azure/ai/evaluation/_common/constants.py +6 -2
- azure/ai/evaluation/_common/rai_service.py +38 -4
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1225 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +30 -10
- azure/ai/evaluation/_constants.py +10 -0
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +804 -0
- azure/ai/evaluation/_converters/_models.py +302 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -3
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +104 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +36 -4
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +23 -3
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +120 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +21 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +43 -3
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +3 -1
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +43 -4
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +16 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +42 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +15 -0
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +15 -0
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +15 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +15 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +28 -4
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +21 -2
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +26 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +21 -3
- azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +152 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +161 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +26 -3
- azure/ai/evaluation/_evaluators/_qa/_qa.py +51 -7
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +26 -2
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +157 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +99 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +21 -2
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +113 -4
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +23 -3
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +24 -5
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +148 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +117 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +292 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +71 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +103 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +2 -0
- azure/ai/evaluation/_exceptions.py +5 -1
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +45 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +368 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +23 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +99 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +121 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +217 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +105 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +82 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +182 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +59 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +313 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +545 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_red_team/__init__.py +3 -0
- azure/ai/evaluation/_red_team/_attack_objective_generator.py +192 -0
- azure/ai/evaluation/_red_team/_attack_strategy.py +42 -0
- azure/ai/evaluation/_red_team/_callback_chat_target.py +74 -0
- azure/ai/evaluation/_red_team/_default_converter.py +21 -0
- azure/ai/evaluation/_red_team/_red_team.py +1858 -0
- azure/ai/evaluation/_red_team/_red_team_result.py +246 -0
- azure/ai/evaluation/_red_team/_utils/__init__.py +3 -0
- azure/ai/evaluation/_red_team/_utils/constants.py +64 -0
- azure/ai/evaluation/_red_team/_utils/formatting_utils.py +164 -0
- azure/ai/evaluation/_red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/_red_team/_utils/strategy_utils.py +188 -0
- azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +741 -0
- azure/ai/evaluation/_version.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +3 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +61 -27
- azure/ai/evaluation/simulator/_conversation/__init__.py +4 -5
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -0
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +145 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -0
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +71 -1
- {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/METADATA +75 -15
- azure_ai_evaluation-1.4.0.dist-info/RECORD +197 -0
- {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- azure_ai_evaluation-1.2.0.dist-info/RECORD +0 -125
- {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.2.0.dist-info → azure_ai_evaluation-1.4.0.dist-info}/top_level.txt +0 -0
```diff
@@ -0,0 +1,20 @@
+# ------------------------------------
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+# ------------------------------------
+"""Customize generated code here.
+
+Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize
+"""
+from typing import List
+
+__all__: List[str] = []  # Add all objects you want publicly available to users at this package level
+
+
+def patch_sdk():
+    """Do not remove from this file.
+
+    `patch_sdk` is a last resort escape hatch that allows you to do customizations
+    you can't accomplish using the techniques described in
+    https://aka.ms/azsdk/python/dpcodegen/python/customize
+    """
```
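The generated `raiclient` modules ship `_patch.py` hooks so that hand-written customizations survive regeneration: anything re-exported through `__all__` or adjusted inside `patch_sdk()` stays in place the next time the client code is generated. A minimal sketch of how such a hook is typically filled in; `GeneratedClient` and `PatchedClient` below are hypothetical stand-ins, not names from this package:

```python
# Illustrative _patch.py customization; the class names are hypothetical stand-ins.
from typing import List


class GeneratedClient:
    """Placeholder for a class emitted by the code generator."""


class PatchedClient(GeneratedClient):
    """Hand-written convenience layer kept outside the generated code."""

    def with_default_timeout(self, seconds: float) -> "PatchedClient":
        # Custom helper that would be lost if written directly into generated files.
        self._default_timeout = seconds
        return self


# Re-exporting here makes the patched class importable from the package level
# and keeps it available after regeneration.
__all__: List[str] = ["PatchedClient"]


def patch_sdk():
    """Last-resort hook for module-level fix-ups that cannot be expressed above."""
```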
```diff
@@ -0,0 +1 @@
+# Marker file for PEP 561.
```
```diff
@@ -274,8 +274,26 @@ def _validate_typed_dict(o: object, t: Type[T_TypedDict]) -> T_TypedDict:
 
     return cast(T_TypedDict, o)
 
+def check_score_is_valid(score: Union[str, float], min_score = 1, max_score = 5) -> bool:
+    """Check if the score is valid, i.e. is convertable to number and is in the range [min_score, max_score].
+
+    :param score: The score to check.
+    :type score: Union[str, float]
+    :param min_score: The minimum score. Default is 1.
+    :type min_score: int
+    :param max_score: The maximum score. Default is 5.
+    :type max_score: int
+    :return: True if the score is valid, False otherwise.
+    :rtype: bool
+    """
+    try:
+        numeric_score = float(score)
+    except (ValueError, TypeError):
+        return False
 
-def parse_quality_evaluator_reason_score(llm_output: str) -> Tuple[float, str]:
+    return min_score <= numeric_score <= max_score
+
+def parse_quality_evaluator_reason_score(llm_output: str, valid_score_range: str = "[1-5]") -> Tuple[float, str]:
     """Parse the output of prompt-based quality evaluators that return a score and reason.
 
     Current supported evaluators:
```
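The new `check_score_is_valid` helper gives evaluators a single place to reject scores that are not numeric or fall outside the expected range. A quick usage sketch, assuming the hunk above belongs to `azure/ai/evaluation/_common/utils.py` (which the surrounding functions suggest); the sample values are illustrative:

```python
# Illustrative use of the new helper; the import path assumes the hunk above is
# from azure/ai/evaluation/_common/utils.py (a private module).
from azure.ai.evaluation._common.utils import check_score_is_valid

print(check_score_is_valid("4"))      # True: "4" converts to 4.0, inside the default [1, 5]
print(check_score_is_valid(7))        # False: outside the default range
print(check_score_is_valid("N/A"))    # False: not convertible to a number
print(check_score_is_valid(0.25, min_score=0, max_score=1))  # True with a custom range
```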
```diff
@@ -284,6 +302,8 @@ def parse_quality_evaluator_reason_score(llm_output: str) -> Tuple[float, str]:
     - Retrieval
     - Groundedness
     - Coherence
+    - ResponseCompleteness
+    - TaskAdherence
 
     :param llm_output: The output of the prompt-based quality evaluator.
     :type llm_output: str
```
```diff
@@ -294,7 +314,7 @@ def parse_quality_evaluator_reason_score(llm_output: str) -> Tuple[float, str]:
     reason = ""
     if llm_output:
         try:
-            score_pattern =
+            score_pattern = rf"<S2>\D*?({valid_score_range}).*?</S2>"
             reason_pattern = r"<S1>(.*?)</S1>"
             score_match = re.findall(score_pattern, llm_output, re.DOTALL)
             reason_match = re.findall(reason_pattern, llm_output, re.DOTALL)
```
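With `valid_score_range` now a parameter, evaluators that score on a range other than 1-5 can reuse the same `<S1>` reason / `<S2>` score parsing. The sketch below re-implements just the regex step shown above to illustrate the expected output format; the sample LLM response is invented:

```python
# Simplified re-implementation of the parsing step above, for illustration only.
import math
import re
from typing import Tuple


def parse_reason_score(llm_output: str, valid_score_range: str = "[1-5]") -> Tuple[float, str]:
    score, reason = math.nan, ""
    if llm_output:
        score_pattern = rf"<S2>\D*?({valid_score_range}).*?</S2>"
        reason_pattern = r"<S1>(.*?)</S1>"
        score_match = re.findall(score_pattern, llm_output, re.DOTALL)
        reason_match = re.findall(reason_pattern, llm_output, re.DOTALL)
        if score_match:
            score = float(score_match[0].strip())
        if reason_match:
            reason = reason_match[0].strip()
    return score, reason


sample = "<S1>The response addresses every part of the task.</S1><S2>5</S2>"
print(parse_reason_score(sample))                             # (5.0, 'The response addresses every part of the task.')
print(parse_reason_score(sample, valid_score_range="[0-1]"))  # score is nan: 5 falls outside a 0-1 range
```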
```diff
@@ -366,7 +386,7 @@ def validate_conversation(conversation):
     if not isinstance(messages, list):
         raise_exception(
             "'messages' parameter must be a JSON-compatible list of chat messages",
-            ErrorTarget.
+            ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
         )
     expected_roles = {"user", "assistant", "system"}
     image_found = False
@@ -393,7 +413,7 @@ def validate_conversation(conversation):
         ):
             raise_exception(
                 f"Messages must be a strongly typed class of ChatRequestMessage. Message number: {num}",
-                ErrorTarget.
+                ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
             )
         if isinstance(message, AssistantMessage):
             assistant_message_count += 1
@@ -407,7 +427,7 @@ def validate_conversation(conversation):
         if message.get("role") not in expected_roles:
             raise_exception(
                 f"Invalid role provided: {message.get('role')}. Message number: {num}",
-                ErrorTarget.
+                ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
             )
         if message.get("role") == "assistant":
             assistant_message_count += 1
@@ -417,7 +437,7 @@ def validate_conversation(conversation):
         if not isinstance(content, (str, list)):
             raise_exception(
                 f"Content in each turn must be a string or array. Message number: {num}",
-                ErrorTarget.
+                ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
             )
         if isinstance(content, list):
             if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content):
@@ -425,21 +445,21 @@ def validate_conversation(conversation):
     if not image_found:
         raise_exception(
             "Message needs to have multi-modal input like images.",
-            ErrorTarget.
+            ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
         )
     if assistant_message_count == 0:
         raise_exception(
             "Assistant role required in one of the messages.",
-            ErrorTarget.
+            ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
         )
     if user_message_count == 0:
         raise_exception(
             "User role required in one of the messages.",
-            ErrorTarget.
+            ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
         )
     if assistant_message_count > 1:
         raise_exception(
             "Evaluators for multimodal conversations only support single turn. "
             "User and assistant role expected as the only role in each message.",
-            ErrorTarget.
+            ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
         )
```
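Every error path in `validate_conversation` now reports `ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR`. For reference, a conversation that satisfies the checks above would look roughly like the following; the shape is inferred from the validation logic itself (one user and one assistant message, recognized roles, and at least one `image_url` content item), not copied from package documentation:

```python
# Illustrative single-turn multimodal conversation that the checks above accept.
conversation = {
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is shown in this image?"},
                {"type": "image_url", "image_url": {"url": "https://example.com/photo.png"}},
            ],
        },
        {
            "role": "assistant",
            "content": "The image shows a wooden footbridge over a small stream.",
        },
    ]
}
```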
```diff
@@ -22,7 +22,12 @@ class EvaluationMetrics:
     SELF_HARM = "self_harm"
     SEXUAL = "sexual"
     PROTECTED_MATERIAL = "protected_material"
+    ARTWORK = "artwork"
+    FICTIONAL_CHARACTERS = "fictional_characters"
+    LOGOS_AND_BRANDS = "logos_and_brands"
     XPIA = "xpia"
+    CODE_VULNERABILITY = "code_vulnerability"
+    UNGROUNDED_ATTRIBUTES = "ungrounded_attributes"
 
 
 class _InternalEvaluationMetrics:
```
```diff
@@ -89,3 +94,8 @@ OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT = 60
 AZURE_OPENAI_TYPE: Literal["azure_openai"] = "azure_openai"
 
 OPENAI_TYPE: Literal["openai"] = "openai"
+
+EVALUATION_PASS_FAIL_MAPPING = {
+    True: "pass",
+    False: "fail",
+}
```
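`EVALUATION_PASS_FAIL_MAPPING` standardizes how boolean outcomes are surfaced as labels in evaluation results. A minimal sketch of the lookup it enables; the score, threshold, and result field name here are illustrative, not taken from a specific evaluator:

```python
# Illustrative: converting a thresholded score into the new "pass"/"fail" labels.
EVALUATION_PASS_FAIL_MAPPING = {
    True: "pass",
    False: "fail",
}

score = 4.0
threshold = 3  # arbitrary example threshold
result = {"relevance_result": EVALUATION_PASS_FAIL_MAPPING[score >= threshold]}
print(result)  # {'relevance_result': 'pass'}
```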