azure-ai-evaluation 1.0.0b4__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +22 -0
- azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +4 -0
- azure/ai/evaluation/_common/constants.py +5 -0
- azure/ai/evaluation/_common/math.py +73 -2
- azure/ai/evaluation/_common/rai_service.py +250 -62
- azure/ai/evaluation/_common/utils.py +196 -23
- azure/ai/evaluation/_constants.py +7 -6
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +13 -4
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +19 -6
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +55 -14
- azure/ai/evaluation/_evaluate/_evaluate.py +312 -228
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +7 -6
- azure/ai/evaluation/_evaluate/_utils.py +46 -11
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +17 -18
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +67 -31
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +37 -24
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +21 -9
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +52 -16
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +91 -48
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +100 -26
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +94 -26
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +96 -26
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +97 -26
- azure/ai/evaluation/_evaluators/_eci/_eci.py +31 -4
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +67 -36
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +14 -16
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +106 -34
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +20 -27
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +87 -31
- azure/ai/evaluation/_evaluators/_qa/_qa.py +23 -31
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +72 -36
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +83 -125
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +26 -27
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +37 -28
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +94 -33
- azure/ai/evaluation/_exceptions.py +19 -0
- azure/ai/evaluation/_model_configurations.py +83 -15
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +20 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +29 -35
- azure/ai/evaluation/simulator/_constants.py +11 -1
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +17 -9
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +22 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +90 -35
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +4 -2
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +8 -4
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -4
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -1
- azure/ai/evaluation/simulator/_simulator.py +165 -105
- azure/ai/evaluation/simulator/_utils.py +31 -13
- azure_ai_evaluation-1.0.1.dist-info/METADATA +600 -0
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/NOTICE.txt +20 -0
- azure_ai_evaluation-1.0.1.dist-info/RECORD +119 -0
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
- azure_ai_evaluation-1.0.0b4.dist-info/METADATA +0 -535
- azure_ai_evaluation-1.0.0b4.dist-info/RECORD +0 -106
- /azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +0 -0
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/__init__.py
CHANGED
|
@@ -12,10 +12,19 @@ from ._evaluators._content_safety import (
|
|
|
12
12
|
SexualEvaluator,
|
|
13
13
|
ViolenceEvaluator,
|
|
14
14
|
)
|
|
15
|
+
from ._evaluators._multimodal._content_safety_multimodal import (
|
|
16
|
+
ContentSafetyMultimodalEvaluator,
|
|
17
|
+
HateUnfairnessMultimodalEvaluator,
|
|
18
|
+
SelfHarmMultimodalEvaluator,
|
|
19
|
+
SexualMultimodalEvaluator,
|
|
20
|
+
ViolenceMultimodalEvaluator,
|
|
21
|
+
)
|
|
22
|
+
from ._evaluators._multimodal._protected_material import ProtectedMaterialMultimodalEvaluator
|
|
15
23
|
from ._evaluators._f1_score import F1ScoreEvaluator
|
|
16
24
|
from ._evaluators._fluency import FluencyEvaluator
|
|
17
25
|
from ._evaluators._gleu import GleuScoreEvaluator
|
|
18
26
|
from ._evaluators._groundedness import GroundednessEvaluator
|
|
27
|
+
from ._evaluators._service_groundedness import GroundednessProEvaluator
|
|
19
28
|
from ._evaluators._meteor import MeteorScoreEvaluator
|
|
20
29
|
from ._evaluators._protected_material import ProtectedMaterialEvaluator
|
|
21
30
|
from ._evaluators._qa import QAEvaluator
|
|
@@ -27,7 +36,10 @@ from ._evaluators._xpia import IndirectAttackEvaluator
|
|
|
27
36
|
from ._model_configurations import (
|
|
28
37
|
AzureAIProject,
|
|
29
38
|
AzureOpenAIModelConfiguration,
|
|
39
|
+
Conversation,
|
|
40
|
+
EvaluationResult,
|
|
30
41
|
EvaluatorConfig,
|
|
42
|
+
Message,
|
|
31
43
|
OpenAIModelConfiguration,
|
|
32
44
|
)
|
|
33
45
|
|
|
@@ -37,6 +49,7 @@ __all__ = [
|
|
|
37
49
|
"F1ScoreEvaluator",
|
|
38
50
|
"FluencyEvaluator",
|
|
39
51
|
"GroundednessEvaluator",
|
|
52
|
+
"GroundednessProEvaluator",
|
|
40
53
|
"RelevanceEvaluator",
|
|
41
54
|
"SimilarityEvaluator",
|
|
42
55
|
"QAEvaluator",
|
|
@@ -57,4 +70,13 @@ __all__ = [
|
|
|
57
70
|
"AzureOpenAIModelConfiguration",
|
|
58
71
|
"OpenAIModelConfiguration",
|
|
59
72
|
"EvaluatorConfig",
|
|
73
|
+
"Conversation",
|
|
74
|
+
"Message",
|
|
75
|
+
"EvaluationResult",
|
|
76
|
+
"ContentSafetyMultimodalEvaluator",
|
|
77
|
+
"HateUnfairnessMultimodalEvaluator",
|
|
78
|
+
"SelfHarmMultimodalEvaluator",
|
|
79
|
+
"SexualMultimodalEvaluator",
|
|
80
|
+
"ViolenceMultimodalEvaluator",
|
|
81
|
+
"ProtectedMaterialMultimodalEvaluator",
|
|
60
82
|
]
|
|
azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
4
|
|
|
5
|
+
import os
|
|
5
6
|
import functools
|
|
6
7
|
import inspect
|
|
7
8
|
import logging
|
|
@@ -149,6 +150,9 @@ def _get_indentation_size(doc_string: str) -> int:
|
|
|
149
150
|
def _should_skip_warning():
|
|
150
151
|
skip_warning_msg = False
|
|
151
152
|
|
|
153
|
+
if os.getenv("AI_EVALS_DISABLE_EXPERIMENTAL_WARNING", "false").lower() == "true":
|
|
154
|
+
skip_warning_msg = True
|
|
155
|
+
|
|
152
156
|
# Cases where we want to suppress the warning:
|
|
153
157
|
# 1. When converting from REST object to SDK object
|
|
154
158
|
for frame in inspect.stack():
|
|
azure/ai/evaluation/_common/constants.py
CHANGED
|
@@ -6,6 +6,9 @@ from enum import Enum
|
|
|
6
6
|
from azure.core import CaseInsensitiveEnumMeta
|
|
7
7
|
|
|
8
8
|
|
|
9
|
+
PROMPT_BASED_REASON_EVALUATORS = ["coherence", "relevance", "retrieval", "groundedness", "fluency"]
|
|
10
|
+
|
|
11
|
+
|
|
9
12
|
class CommonConstants:
|
|
10
13
|
"""Define common constants."""
|
|
11
14
|
|
|
@@ -35,6 +38,7 @@ class Tasks:
|
|
|
35
38
|
CONTENT_HARM = "content harm"
|
|
36
39
|
PROTECTED_MATERIAL = "protected material"
|
|
37
40
|
XPIA = "xpia"
|
|
41
|
+
GROUNDEDNESS = "groundedness"
|
|
38
42
|
|
|
39
43
|
|
|
40
44
|
class _InternalAnnotationTasks:
|
|
@@ -56,6 +60,7 @@ class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
|
|
|
56
60
|
SEXUAL = "sexual"
|
|
57
61
|
PROTECTED_MATERIAL = "protected_material"
|
|
58
62
|
XPIA = "xpia"
|
|
63
|
+
GROUNDEDNESS = "generic_groundedness"
|
|
59
64
|
|
|
60
65
|
|
|
61
66
|
class _InternalEvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
|
|
azure/ai/evaluation/_common/math.py
CHANGED
|
@@ -3,16 +3,87 @@
|
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
4
|
|
|
5
5
|
import math
|
|
6
|
-
from typing import List
|
|
6
|
+
from typing import List, Callable, Any
|
|
7
|
+
|
|
8
|
+
from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
|
|
7
9
|
|
|
8
10
|
|
|
9
11
|
def list_sum(lst: List[float]) -> float:
|
|
12
|
+
"""Given a list of floats, return the sum of the values.
|
|
13
|
+
|
|
14
|
+
:param lst: A list of floats.
|
|
15
|
+
:type lst: List[float]
|
|
16
|
+
:return: The sum of the values in the list.
|
|
17
|
+
:rtype: float
|
|
18
|
+
"""
|
|
19
|
+
|
|
10
20
|
return sum(lst)
|
|
11
21
|
|
|
12
22
|
|
|
13
23
|
def list_mean(lst: List[float]) -> float:
|
|
24
|
+
"""Given a list of floats, calculate the mean of the values.
|
|
25
|
+
|
|
26
|
+
:param lst: A list of floats.
|
|
27
|
+
:type lst: List[float]
|
|
28
|
+
:return: The mean of the values in the list.
|
|
29
|
+
:rtype: float
|
|
30
|
+
"""
|
|
31
|
+
|
|
14
32
|
return list_sum(lst) / len(lst)
|
|
15
33
|
|
|
16
34
|
|
|
17
35
|
def list_mean_nan_safe(lst: List[float]) -> float:
|
|
18
|
-
|
|
36
|
+
"""Given a list of floats, remove all nan or None values, then calculate the mean of the remaining values.
|
|
37
|
+
|
|
38
|
+
:param lst: A list of floats.
|
|
39
|
+
:type lst: List[float]
|
|
40
|
+
:return: The mean of the values in the list.
|
|
41
|
+
:rtype: float
|
|
42
|
+
"""
|
|
43
|
+
|
|
44
|
+
msg = "All score values are NaN. The mean cannot be calculated."
|
|
45
|
+
if all(math.isnan(l) for l in lst):
|
|
46
|
+
raise EvaluationException(
|
|
47
|
+
message=msg,
|
|
48
|
+
internal_message=msg,
|
|
49
|
+
blame=ErrorBlame.USER_ERROR,
|
|
50
|
+
category=ErrorCategory.INVALID_VALUE,
|
|
51
|
+
target=ErrorTarget.CONVERSATION,
|
|
52
|
+
)
|
|
53
|
+
return list_mean([l for l in lst if not is_none_or_nan(l)])
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def apply_transform_nan_safe(lst: List[float], transform_fn: Callable[[float], Any]) -> List[Any]:
|
|
57
|
+
"""Given a list of floats, remove all nan values, then apply the inputted transform function
|
|
58
|
+
to the remaining values, and return the resulting list of outputted values.
|
|
59
|
+
|
|
60
|
+
:param lst: A list of floats.
|
|
61
|
+
:type lst: List[float]
|
|
62
|
+
:param transform_fn: A function that produces something when applied to a float.
|
|
63
|
+
:type transform_fn: Callable[[float], Any]
|
|
64
|
+
:return: A list of the transformed values.
|
|
65
|
+
:rtype: List[Any]
|
|
66
|
+
"""
|
|
67
|
+
|
|
68
|
+
msg = "All score values are NaN. The mean cannot be calculated."
|
|
69
|
+
if all(math.isnan(l) for l in lst):
|
|
70
|
+
raise EvaluationException(
|
|
71
|
+
message=msg,
|
|
72
|
+
internal_message=msg,
|
|
73
|
+
blame=ErrorBlame.USER_ERROR,
|
|
74
|
+
category=ErrorCategory.INVALID_VALUE,
|
|
75
|
+
target=ErrorTarget.CONVERSATION,
|
|
76
|
+
)
|
|
77
|
+
return [transform_fn(l) for l in lst if not is_none_or_nan(l)]
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def is_none_or_nan(val: float) -> bool:
|
|
81
|
+
"""math.isnan raises an error if None is inputted. This is a more robust wrapper.
|
|
82
|
+
|
|
83
|
+
:param val: The value to check.
|
|
84
|
+
:type val: float
|
|
85
|
+
:return: Whether the value is None or NaN.
|
|
86
|
+
:rtype: bool
|
|
87
|
+
"""
|
|
88
|
+
|
|
89
|
+
return val is None or math.isnan(val)
|