azure-ai-evaluation 0.0.0b0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +82 -0
- azure/ai/evaluation/_common/__init__.py +16 -0
- azure/ai/evaluation/_common/_experimental.py +172 -0
- azure/ai/evaluation/_common/constants.py +72 -0
- azure/ai/evaluation/_common/math.py +89 -0
- azure/ai/evaluation/_common/rai_service.py +632 -0
- azure/ai/evaluation/_common/utils.py +445 -0
- azure/ai/evaluation/_constants.py +72 -0
- azure/ai/evaluation/_evaluate/__init__.py +3 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +9 -0
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +188 -0
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +89 -0
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +99 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +571 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +850 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +179 -0
- azure/ai/evaluation/_evaluate/_utils.py +298 -0
- azure/ai/evaluation/_evaluators/__init__.py +3 -0
- azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +72 -0
- azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +107 -0
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +99 -0
- azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +344 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +88 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +133 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +17 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -0
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +129 -0
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -0
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +125 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +126 -0
- azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +89 -0
- azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +157 -0
- azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +104 -0
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +86 -0
- azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +69 -0
- azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +144 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +90 -0
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
- azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +113 -0
- azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +93 -0
- azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +114 -0
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +100 -0
- azure/ai/evaluation/_evaluators/_retrieval/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +112 -0
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
- azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
- azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +140 -0
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +66 -0
- azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +125 -0
- azure/ai/evaluation/_exceptions.py +128 -0
- azure/ai/evaluation/_http_utils.py +466 -0
- azure/ai/evaluation/_model_configurations.py +123 -0
- azure/ai/evaluation/_user_agent.py +6 -0
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +5 -0
- azure/ai/evaluation/py.typed +0 -0
- azure/ai/evaluation/simulator/__init__.py +16 -0
- azure/ai/evaluation/simulator/_adversarial_scenario.py +46 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +471 -0
- azure/ai/evaluation/simulator/_constants.py +27 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +316 -0
- azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
- azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +218 -0
- azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +96 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +220 -0
- azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +195 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +244 -0
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +168 -0
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +201 -0
- azure/ai/evaluation/simulator/_model_tools/models.py +614 -0
- azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +65 -0
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +37 -0
- azure/ai/evaluation/simulator/_simulator.py +716 -0
- azure/ai/evaluation/simulator/_tracing.py +89 -0
- azure/ai/evaluation/simulator/_utils.py +132 -0
- azure_ai_evaluation-1.0.0.dist-info/METADATA +595 -0
- azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +70 -0
- azure_ai_evaluation-1.0.0.dist-info/RECORD +119 -0
- {azure_ai_evaluation-0.0.0b0.dist-info → azure_ai_evaluation-1.0.0.dist-info}/WHEEL +1 -1
- azure_ai_evaluation-1.0.0.dist-info/top_level.txt +1 -0
- azure_ai_evaluation-0.0.0b0.dist-info/METADATA +0 -7
- azure_ai_evaluation-0.0.0b0.dist-info/RECORD +0 -4
- azure_ai_evaluation-0.0.0b0.dist-info/top_level.txt +0 -1
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
from ._evaluate._evaluate import evaluate
|
|
6
|
+
from ._evaluators._bleu import BleuScoreEvaluator
|
|
7
|
+
from ._evaluators._coherence import CoherenceEvaluator
|
|
8
|
+
from ._evaluators._content_safety import (
|
|
9
|
+
ContentSafetyEvaluator,
|
|
10
|
+
HateUnfairnessEvaluator,
|
|
11
|
+
SelfHarmEvaluator,
|
|
12
|
+
SexualEvaluator,
|
|
13
|
+
ViolenceEvaluator,
|
|
14
|
+
)
|
|
15
|
+
from ._evaluators._multimodal._content_safety_multimodal import (
|
|
16
|
+
ContentSafetyMultimodalEvaluator,
|
|
17
|
+
HateUnfairnessMultimodalEvaluator,
|
|
18
|
+
SelfHarmMultimodalEvaluator,
|
|
19
|
+
SexualMultimodalEvaluator,
|
|
20
|
+
ViolenceMultimodalEvaluator,
|
|
21
|
+
)
|
|
22
|
+
from ._evaluators._multimodal._protected_material import ProtectedMaterialMultimodalEvaluator
|
|
23
|
+
from ._evaluators._f1_score import F1ScoreEvaluator
|
|
24
|
+
from ._evaluators._fluency import FluencyEvaluator
|
|
25
|
+
from ._evaluators._gleu import GleuScoreEvaluator
|
|
26
|
+
from ._evaluators._groundedness import GroundednessEvaluator
|
|
27
|
+
from ._evaluators._service_groundedness import GroundednessProEvaluator
|
|
28
|
+
from ._evaluators._meteor import MeteorScoreEvaluator
|
|
29
|
+
from ._evaluators._protected_material import ProtectedMaterialEvaluator
|
|
30
|
+
from ._evaluators._qa import QAEvaluator
|
|
31
|
+
from ._evaluators._relevance import RelevanceEvaluator
|
|
32
|
+
from ._evaluators._retrieval import RetrievalEvaluator
|
|
33
|
+
from ._evaluators._rouge import RougeScoreEvaluator, RougeType
|
|
34
|
+
from ._evaluators._similarity import SimilarityEvaluator
|
|
35
|
+
from ._evaluators._xpia import IndirectAttackEvaluator
|
|
36
|
+
from ._model_configurations import (
|
|
37
|
+
AzureAIProject,
|
|
38
|
+
AzureOpenAIModelConfiguration,
|
|
39
|
+
Conversation,
|
|
40
|
+
EvaluationResult,
|
|
41
|
+
EvaluatorConfig,
|
|
42
|
+
Message,
|
|
43
|
+
OpenAIModelConfiguration,
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
# Public surface of the package: the evaluate() entry point, the evaluator classes,
# and the model/project configuration types. Every name listed here is imported
# at the top of this module.
__all__ = [
    "evaluate",
    "CoherenceEvaluator",
    "F1ScoreEvaluator",
    "FluencyEvaluator",
    "GroundednessEvaluator",
    "GroundednessProEvaluator",
    "RelevanceEvaluator",
    "SimilarityEvaluator",
    "QAEvaluator",
    "ViolenceEvaluator",
    "SexualEvaluator",
    "SelfHarmEvaluator",
    "HateUnfairnessEvaluator",
    "ContentSafetyEvaluator",
    "IndirectAttackEvaluator",
    "BleuScoreEvaluator",
    "GleuScoreEvaluator",
    "MeteorScoreEvaluator",
    "RetrievalEvaluator",
    "RougeScoreEvaluator",
    "RougeType",
    "ProtectedMaterialEvaluator",
    "AzureAIProject",
    "AzureOpenAIModelConfiguration",
    "OpenAIModelConfiguration",
    "EvaluatorConfig",
    "Conversation",
    "Message",
    "EvaluationResult",
    "ContentSafetyMultimodalEvaluator",
    "HateUnfairnessMultimodalEvaluator",
    "SelfHarmMultimodalEvaluator",
    "SexualMultimodalEvaluator",
    "ViolenceMultimodalEvaluator",
    "ProtectedMaterialMultimodalEvaluator",
]
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
# To minimize relative imports in our evaluators, the scope of this package also includes anything
|
|
6
|
+
# that would have otherwise been a relative import scoped to single evaluator directories.
|
|
7
|
+
|
|
8
|
+
from . import constants
|
|
9
|
+
from .rai_service import evaluate_with_rai_service
|
|
10
|
+
from .utils import get_harm_severity_level
|
|
11
|
+
|
|
12
|
+
# Names re-exported from this shared package: two helper functions plus the
# `constants` submodule, all imported directly above.
__all__ = [
    "get_harm_severity_level",
    "evaluate_with_rai_service",
    "constants",
]
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import functools
|
|
7
|
+
import inspect
|
|
8
|
+
import logging
|
|
9
|
+
import sys
|
|
10
|
+
from typing import Callable, Type, TypeVar, Union, overload
|
|
11
|
+
|
|
12
|
+
from typing_extensions import ParamSpec, TypeGuard
|
|
13
|
+
|
|
14
|
+
# reST "note" directive that is prepended to docstrings; {0} receives the
# kind-specific message (class/method) and {1} the shared link message.
DOCSTRING_TEMPLATE = ".. note:: {0} {1}\n\n"
# Fallback returned by _get_indentation_size when a docstring has no non-blank
# line after its first line.
DOCSTRING_DEFAULT_INDENTATION = 8
EXPERIMENTAL_CLASS_MESSAGE = "This is an experimental class,"
EXPERIMENTAL_METHOD_MESSAGE = "This is an experimental method,"
# NOTE(review): not referenced by any function visible in this module — confirm it is used elsewhere.
EXPERIMENTAL_FIELD_MESSAGE = "This is an experimental field,"
# Second half of every experimental notice; completes the sentence started by the messages above.
EXPERIMENTAL_LINK_MESSAGE = (
    "and may change at any time. Please see https://aka.ms/azuremlexperimental for more information."
)

# Warning messages that have already been logged; consulted and filled by _is_warning_cached.
_warning_cache = set()
module_logger = logging.getLogger(__name__)

# Generic placeholders used to keep the decorator signatures type-preserving.
P = ParamSpec("P")
T = TypeVar("T")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@overload
def experimental(wrapped: Type[T]) -> Type[T]: ...


@overload
def experimental(wrapped: Callable[P, T]) -> Callable[P, T]: ...


def experimental(wrapped: Union[Type[T], Callable[P, T]]) -> Union[Type[T], Callable[P, T]]:
    """Decorator that marks a class or a function as experimental.

    Classes are routed to ``_add_class_docstring`` and plain functions to
    ``_add_method_docstring``. Anything else is handed back untouched.

    :param wrapped: Either a Class or Function to mark as experimental
    :type wrapped: Union[Type[T], Callable[P, T]]
    :return: The wrapped class or method
    :rtype: Union[Type[T], Callable[P, T]]
    """

    def _is_type(candidate: Union[Type[T], Callable[P, T]]) -> TypeGuard[Type[T]]:
        # TypeGuard lets static checkers narrow the union in the branch below.
        return isinstance(candidate, type)

    if _is_type(wrapped):
        return _add_class_docstring(wrapped)
    return _add_method_docstring(wrapped) if inspect.isfunction(wrapped) else wrapped
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _add_class_docstring(cls: Type[T]) -> Type[T]:
    """Tag a class as experimental.

    The experimental note is placed at the top of the class docstring and
    ``__init__`` is replaced by a wrapper that logs the experimental warning
    before delegating to the original initializer.

    :param cls: The class to update
    :type cls: Type[T]
    :return: The updated class
    :rtype: Type[T]
    """

    P2 = ParamSpec("P2")

    def _add_class_warning(func: Callable[P2, None]) -> Callable[P2, None]:
        """Wrap the class ``__init__`` so it emits the experimental warning.

        :param func: The original __init__ function
        :type func: Callable[P2, None]
        :return: Updated __init__
        :rtype: Callable[P2, None]
        """

        @functools.wraps(func)
        def _warn_then_init(*args, **kwargs):
            message = "Class {0}: {1} {2}".format(cls.__name__, EXPERIMENTAL_CLASS_MESSAGE, EXPERIMENTAL_LINK_MESSAGE)
            # The cache is only consulted (and populated) when the warning is not being skipped.
            if not _should_skip_warning():
                if not _is_warning_cached(message):
                    module_logger.warning(message)
            return func(*args, **kwargs)

        return _warn_then_init

    note = DOCSTRING_TEMPLATE.format(EXPERIMENTAL_CLASS_MESSAGE, EXPERIMENTAL_LINK_MESSAGE)
    existing_doc = cls.__doc__
    # Without an existing docstring the note is followed by a lone '>' placeholder.
    cls.__doc__ = _add_note_to_docstring(existing_doc, note) if existing_doc else note + ">"
    cls.__init__ = _add_class_warning(cls.__init__)  # type: ignore[method-assign]
    return cls
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _add_method_docstring(func: Callable[P, T]) -> Callable[P, T]:
    """Tag a function as experimental.

    The experimental note is placed at the top of the function docstring and
    the function is wrapped so the experimental warning is logged before each
    call is forwarded.

    :param func: The function to update
    :type func: Callable[P, T]
    :return: A wrapped method marked as experimental
    :rtype: Callable[P,T]
    """
    note = DOCSTRING_TEMPLATE.format(EXPERIMENTAL_METHOD_MESSAGE, EXPERIMENTAL_LINK_MESSAGE)
    existing_doc = func.__doc__
    # '>' is required. Otherwise the note section can't be generated
    func.__doc__ = _add_note_to_docstring(existing_doc, note) if existing_doc else note + ">"

    @functools.wraps(func)
    def _warn_then_call(*args: P.args, **kwargs: P.kwargs) -> T:
        message = "Method {0}: {1} {2}".format(func.__name__, EXPERIMENTAL_METHOD_MESSAGE, EXPERIMENTAL_LINK_MESSAGE)
        # The cache is only consulted (and populated) when the warning is not being skipped.
        if not _should_skip_warning():
            if not _is_warning_cached(message):
                module_logger.warning(message)
        return func(*args, **kwargs)

    return _warn_then_call
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _add_note_to_docstring(doc_string: str, note: str) -> str:
    """Put the experimental note in front of a docstring.

    The original docstring is left-padded with as many spaces as
    ``_get_indentation_size`` reports, so its first line lines up with the
    lines that follow it.

    :param doc_string: The docstring
    :type doc_string: str
    :param note: The note to add to the docstring
    :type note: str
    :return: Updated docstring
    :rtype: str
    """
    padding = " " * _get_indentation_size(doc_string)
    return note + padding + doc_string
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _get_indentation_size(doc_string: str) -> int:
|
|
134
|
+
"""Finds the minimum indentation of all non-blank lines after the first line.
|
|
135
|
+
|
|
136
|
+
:param doc_string: The docstring
|
|
137
|
+
:type doc_string: str
|
|
138
|
+
:return: Minimum number of indentation of the docstring
|
|
139
|
+
:rtype: int
|
|
140
|
+
"""
|
|
141
|
+
lines = doc_string.expandtabs().splitlines()
|
|
142
|
+
indent = sys.maxsize
|
|
143
|
+
for line in lines[1:]:
|
|
144
|
+
stripped = line.lstrip()
|
|
145
|
+
if stripped:
|
|
146
|
+
indent = min(indent, len(line) - len(stripped))
|
|
147
|
+
return indent if indent < sys.maxsize else DOCSTRING_DEFAULT_INDENTATION
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _should_skip_warning():
|
|
151
|
+
skip_warning_msg = False
|
|
152
|
+
|
|
153
|
+
if os.getenv("AI_EVALS_DISABLE_EXPERIMENTAL_WARNING", "false").lower() == "true":
|
|
154
|
+
skip_warning_msg = True
|
|
155
|
+
|
|
156
|
+
# Cases where we want to suppress the warning:
|
|
157
|
+
# 1. When converting from REST object to SDK object
|
|
158
|
+
for frame in inspect.stack():
|
|
159
|
+
if frame.function == "_from_rest_object":
|
|
160
|
+
skip_warning_msg = True
|
|
161
|
+
break
|
|
162
|
+
|
|
163
|
+
return skip_warning_msg
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _is_warning_cached(warning_msg):
    """Report whether a warning message was already seen, recording it if not.

    The cache ensures the same warning message is printed only once per
    session, so calling an experimental method or class in a loop does not
    produce duplicate warnings.

    :param warning_msg: The warning message to look up.
    :return: True if the message was already in the cache, False if it was just added.
    :rtype: bool
    """
    seen_before = warning_msg in _warning_cache
    if not seen_before:
        _warning_cache.add(warning_msg)
    return seen_before
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
from enum import Enum
|
|
5
|
+
|
|
6
|
+
from azure.core import CaseInsensitiveEnumMeta
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Lower-case evaluator names.
# NOTE(review): presumably the prompt-based evaluators whose output includes a reason — confirm against usages.
PROMPT_BASED_REASON_EVALUATORS = ["coherence", "relevance", "retrieval", "groundedness", "fluency"]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class CommonConstants:
    """Define common constants."""

    # NOTE(review): presumably expressed in seconds — confirm against the HTTP client that consumes it.
    DEFAULT_HTTP_TIMEOUT = 60
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class RAIService:
    """Define constants related to RAI service"""

    # NOTE(review): TIMEOUT and SLEEP_TIME are presumably in seconds (overall wait
    # budget and delay between polls) — confirm in rai_service.py.
    TIMEOUT = 1800
    SLEEP_TIME = 2
    # NOTE(review): presumably the severity score at which content counts as harmful;
    # whether the comparison is inclusive is not visible here — confirm with callers.
    HARM_SEVERITY_THRESHOLD = 4
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class HarmSeverityLevel(Enum):
    """Harm severity levels."""

    # Member values are the human-readable labels.
    # NOTE(review): the numeric score ranges that map to each level are not defined here.
    VeryLow = "Very low"
    Low = "Low"
    Medium = "Medium"
    High = "High"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class Tasks:
    """Defines types of annotation tasks supported by RAI Service."""

    # Plain string identifiers; note that the multi-word values use spaces,
    # unlike the underscore-separated values in EvaluationMetrics.
    CONTENT_HARM = "content harm"
    PROTECTED_MATERIAL = "protected material"
    XPIA = "xpia"
    GROUNDEDNESS = "groundedness"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class _InternalAnnotationTasks:
    """Defines types of annotation tasks that are supported for internal use. Such tasks are
    experimental and subject to potential change or migration to the main Evaluation Metrics
    enum over time."""

    # Same string value as _InternalEvaluationMetrics.ECI.
    ECI = "eci"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
    """Evaluation metrics to aid the RAI service in determining what
    metrics to request, and how to present them back to the user."""

    # Members are str-valued, so they compare equal to their plain string values.
    # NOTE(review): HATE_FAIRNESS and HATE_UNFAIRNESS are separate members with different
    # values; one looks like an older spelling — confirm which the service expects.
    HATE_FAIRNESS = "hate_fairness"
    HATE_UNFAIRNESS = "hate_unfairness"
    VIOLENCE = "violence"
    SELF_HARM = "self_harm"
    SEXUAL = "sexual"
    PROTECTED_MATERIAL = "protected_material"
    XPIA = "xpia"
    # The value differs from the member name ("generic_groundedness", not "groundedness").
    GROUNDEDNESS = "generic_groundedness"
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class _InternalEvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
    """Evaluation metrics that are not publicly supported.
    These metrics are experimental and subject to potential change or migration to the main
    enum over time.
    """

    # Same string value as _InternalAnnotationTasks.ECI.
    ECI = "eci"
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
import math
|
|
6
|
+
from typing import List, Callable, Any
|
|
7
|
+
|
|
8
|
+
from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def list_sum(lst: List[float]) -> float:
    """Add up every value in a list of floats.

    :param lst: A list of floats.
    :type lst: List[float]
    :return: The sum of the values in the list.
    :rtype: float
    """
    total = sum(lst)
    return total
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def list_mean(lst: List[float]) -> float:
    """Compute the arithmetic mean of a list of floats.

    An empty list results in a ``ZeroDivisionError``.

    :param lst: A list of floats.
    :type lst: List[float]
    :return: The mean of the values in the list.
    :rtype: float
    """
    total = list_sum(lst)
    return total / len(lst)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def list_mean_nan_safe(lst: List[float]) -> float:
    """Given a list of floats, remove all nan or None values, then calculate the mean of the remaining values.

    :param lst: A list of floats. Entries may be ``None`` or NaN; those are ignored.
    :type lst: List[float]
    :return: The mean of the values in the list.
    :rtype: float
    :raises EvaluationException: If the list is empty or contains only ``None``/NaN values.
    """
    # Filter with is_none_or_nan before checking for emptiness: calling math.isnan
    # directly on a None entry raises TypeError.
    valid_values = [value for value in lst if not is_none_or_nan(value)]
    if not valid_values:
        msg = "All score values are NaN. The mean cannot be calculated."
        raise EvaluationException(
            message=msg,
            internal_message=msg,
            blame=ErrorBlame.USER_ERROR,
            category=ErrorCategory.INVALID_VALUE,
            target=ErrorTarget.CONVERSATION,
        )
    return list_mean(valid_values)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def apply_transform_nan_safe(lst: List[float], transform_fn: Callable[[float], Any]) -> List[Any]:
    """Given a list of floats, remove all nan or None values, then apply the inputted transform
    function to the remaining values, and return the resulting list of outputted values.

    :param lst: A list of floats. Entries may be ``None`` or NaN; those are ignored.
    :type lst: List[float]
    :param transform_fn: A function that produces something when applied to a float.
    :type transform_fn: Callable[[float], Any]
    :return: A list of the transformed values.
    :rtype: List[Any]
    :raises EvaluationException: If the list is empty or contains only ``None``/NaN values.
    """
    # Filter with is_none_or_nan before checking for emptiness: calling math.isnan
    # directly on a None entry raises TypeError.
    valid_values = [value for value in lst if not is_none_or_nan(value)]
    if not valid_values:
        # NOTE(review): the message mentions the mean although this function applies a
        # transform; the text is kept unchanged in case callers or tests match on it.
        msg = "All score values are NaN. The mean cannot be calculated."
        raise EvaluationException(
            message=msg,
            internal_message=msg,
            blame=ErrorBlame.USER_ERROR,
            category=ErrorCategory.INVALID_VALUE,
            target=ErrorTarget.CONVERSATION,
        )
    return [transform_fn(value) for value in valid_values]
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def is_none_or_nan(val: float) -> bool:
    """Check for a missing or NaN value without tripping over ``None``.

    ``math.isnan`` raises an error when given ``None``, so ``None`` is handled
    first and only real values reach ``math.isnan``.

    :param val: The value to check.
    :type val: float
    :return: Whether the value is None or NaN.
    :rtype: bool
    """
    if val is None:
        return True
    return math.isnan(val)
|