azure-ai-evaluation 1.0.0b3__py3-none-any.whl → 1.0.0b5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic. Click here for more details.
- azure/ai/evaluation/__init__.py +23 -1
- azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +20 -9
- azure/ai/evaluation/_common/constants.py +9 -2
- azure/ai/evaluation/_common/math.py +29 -0
- azure/ai/evaluation/_common/rai_service.py +222 -93
- azure/ai/evaluation/_common/utils.py +328 -19
- azure/ai/evaluation/_constants.py +16 -8
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +33 -17
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +14 -7
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +22 -4
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +35 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +47 -14
- azure/ai/evaluation/_evaluate/_evaluate.py +370 -188
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +15 -16
- azure/ai/evaluation/_evaluate/_utils.py +77 -25
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +16 -10
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +76 -46
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +26 -19
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +62 -25
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -36
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +67 -46
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +33 -4
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +33 -4
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +33 -4
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +33 -4
- azure/ai/evaluation/_evaluators/_eci/_eci.py +7 -5
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +22 -21
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +51 -16
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -7
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +130 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +57 -0
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +120 -0
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +96 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +46 -13
- azure/ai/evaluation/_evaluators/_qa/_qa.py +11 -6
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +23 -20
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +126 -80
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +2 -2
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +150 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +32 -15
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +36 -10
- azure/ai/evaluation/_exceptions.py +26 -6
- azure/ai/evaluation/_http_utils.py +203 -132
- azure/ai/evaluation/_model_configurations.py +23 -6
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +5 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +88 -60
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +98 -95
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
- azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -9
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
- azure/ai/evaluation/simulator/_simulator.py +222 -169
- azure/ai/evaluation/simulator/_tracing.py +4 -4
- azure/ai/evaluation/simulator/_utils.py +6 -6
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/METADATA +237 -52
- azure_ai_evaluation-1.0.0b5.dist-info/NOTICE.txt +70 -0
- azure_ai_evaluation-1.0.0b5.dist-info/RECORD +120 -0
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
- azure_ai_evaluation-1.0.0b3.dist-info/RECORD +0 -98
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/__init__.py
CHANGED
|
@@ -12,10 +12,19 @@ from ._evaluators._content_safety import (
|
|
|
12
12
|
SexualEvaluator,
|
|
13
13
|
ViolenceEvaluator,
|
|
14
14
|
)
|
|
15
|
+
from ._evaluators._multimodal._content_safety_multimodal import (
|
|
16
|
+
ContentSafetyMultimodalEvaluator,
|
|
17
|
+
HateUnfairnessMultimodalEvaluator,
|
|
18
|
+
SelfHarmMultimodalEvaluator,
|
|
19
|
+
SexualMultimodalEvaluator,
|
|
20
|
+
ViolenceMultimodalEvaluator,
|
|
21
|
+
)
|
|
22
|
+
from ._evaluators._multimodal._protected_material import ProtectedMaterialMultimodalEvaluator
|
|
15
23
|
from ._evaluators._f1_score import F1ScoreEvaluator
|
|
16
24
|
from ._evaluators._fluency import FluencyEvaluator
|
|
17
25
|
from ._evaluators._gleu import GleuScoreEvaluator
|
|
18
26
|
from ._evaluators._groundedness import GroundednessEvaluator
|
|
27
|
+
from ._evaluators._service_groundedness import GroundednessProEvaluator
|
|
19
28
|
from ._evaluators._meteor import MeteorScoreEvaluator
|
|
20
29
|
from ._evaluators._protected_material import ProtectedMaterialEvaluator
|
|
21
30
|
from ._evaluators._qa import QAEvaluator
|
|
@@ -27,8 +36,11 @@ from ._evaluators._xpia import IndirectAttackEvaluator
|
|
|
27
36
|
from ._model_configurations import (
|
|
28
37
|
AzureAIProject,
|
|
29
38
|
AzureOpenAIModelConfiguration,
|
|
30
|
-
|
|
39
|
+
Conversation,
|
|
40
|
+
EvaluationResult,
|
|
31
41
|
EvaluatorConfig,
|
|
42
|
+
Message,
|
|
43
|
+
OpenAIModelConfiguration,
|
|
32
44
|
)
|
|
33
45
|
|
|
34
46
|
__all__ = [
|
|
@@ -37,6 +49,7 @@ __all__ = [
|
|
|
37
49
|
"F1ScoreEvaluator",
|
|
38
50
|
"FluencyEvaluator",
|
|
39
51
|
"GroundednessEvaluator",
|
|
52
|
+
"GroundednessProEvaluator",
|
|
40
53
|
"RelevanceEvaluator",
|
|
41
54
|
"SimilarityEvaluator",
|
|
42
55
|
"QAEvaluator",
|
|
@@ -57,4 +70,13 @@ __all__ = [
|
|
|
57
70
|
"AzureOpenAIModelConfiguration",
|
|
58
71
|
"OpenAIModelConfiguration",
|
|
59
72
|
"EvaluatorConfig",
|
|
73
|
+
"Conversation",
|
|
74
|
+
"Message",
|
|
75
|
+
"EvaluationResult",
|
|
76
|
+
"ContentSafetyMultimodalEvaluator",
|
|
77
|
+
"HateUnfairnessMultimodalEvaluator",
|
|
78
|
+
"SelfHarmMultimodalEvaluator",
|
|
79
|
+
"SexualMultimodalEvaluator",
|
|
80
|
+
"ViolenceMultimodalEvaluator",
|
|
81
|
+
"ProtectedMaterialMultimodalEvaluator",
|
|
60
82
|
]
|
|
azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py
@@ -6,9 +6,9 @@ import functools
|
|
|
6
6
|
import inspect
|
|
7
7
|
import logging
|
|
8
8
|
import sys
|
|
9
|
-
from typing import Callable, Type, TypeVar, Union
|
|
9
|
+
from typing import Callable, Type, TypeVar, Union, overload
|
|
10
10
|
|
|
11
|
-
from typing_extensions import ParamSpec
|
|
11
|
+
from typing_extensions import ParamSpec, TypeGuard
|
|
12
12
|
|
|
13
13
|
DOCSTRING_TEMPLATE = ".. note:: {0} {1}\n\n"
|
|
14
14
|
DOCSTRING_DEFAULT_INDENTATION = 8
|
|
@@ -22,20 +22,31 @@ EXPERIMENTAL_LINK_MESSAGE = (
|
|
|
22
22
|
_warning_cache = set()
|
|
23
23
|
module_logger = logging.getLogger(__name__)
|
|
24
24
|
|
|
25
|
-
TExperimental = TypeVar("TExperimental", bound=Union[Type, Callable])
|
|
26
25
|
P = ParamSpec("P")
|
|
27
26
|
T = TypeVar("T")
|
|
28
27
|
|
|
29
28
|
|
|
30
|
-
|
|
29
|
+
@overload
|
|
30
|
+
def experimental(wrapped: Type[T]) -> Type[T]: ...
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@overload
|
|
34
|
+
def experimental(wrapped: Callable[P, T]) -> Callable[P, T]: ...
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def experimental(wrapped: Union[Type[T], Callable[P, T]]) -> Union[Type[T], Callable[P, T]]:
|
|
31
38
|
"""Add experimental tag to a class or a method.
|
|
32
39
|
|
|
33
40
|
:param wrapped: Either a Class or Function to mark as experimental
|
|
34
|
-
:type wrapped:
|
|
41
|
+
:type wrapped: Union[Type[T], Callable[P, T]]
|
|
35
42
|
:return: The wrapped class or method
|
|
36
|
-
:rtype:
|
|
43
|
+
:rtype: Union[Type[T], Callable[P, T]]
|
|
37
44
|
"""
|
|
38
|
-
|
|
45
|
+
|
|
46
|
+
def is_class(t: Union[Type[T], Callable[P, T]]) -> TypeGuard[Type[T]]:
|
|
47
|
+
return isinstance(t, type)
|
|
48
|
+
|
|
49
|
+
if is_class(wrapped):
|
|
39
50
|
return _add_class_docstring(wrapped)
|
|
40
51
|
if inspect.isfunction(wrapped):
|
|
41
52
|
return _add_method_docstring(wrapped)
|
|
@@ -74,11 +85,11 @@ def _add_class_docstring(cls: Type[T]) -> Type[T]:
|
|
|
74
85
|
cls.__doc__ = _add_note_to_docstring(cls.__doc__, doc_string)
|
|
75
86
|
else:
|
|
76
87
|
cls.__doc__ = doc_string + ">"
|
|
77
|
-
cls.__init__ = _add_class_warning(cls.__init__)
|
|
88
|
+
cls.__init__ = _add_class_warning(cls.__init__) # type: ignore[method-assign]
|
|
78
89
|
return cls
|
|
79
90
|
|
|
80
91
|
|
|
81
|
-
def _add_method_docstring(func: Callable[P, T]
|
|
92
|
+
def _add_method_docstring(func: Callable[P, T]) -> Callable[P, T]:
|
|
82
93
|
"""Add experimental tag to the method doc string.
|
|
83
94
|
|
|
84
95
|
:param func: The function to update
|
|
azure/ai/evaluation/_common/constants.py
@@ -3,6 +3,11 @@
|
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
4
|
from enum import Enum
|
|
5
5
|
|
|
6
|
+
from azure.core import CaseInsensitiveEnumMeta
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
PROMPT_BASED_REASON_EVALUATORS = ["coherence", "relevance", "retrieval", "groundedness", "fluency"]
|
|
10
|
+
|
|
6
11
|
|
|
7
12
|
class CommonConstants:
|
|
8
13
|
"""Define common constants."""
|
|
@@ -33,6 +38,7 @@ class Tasks:
|
|
|
33
38
|
CONTENT_HARM = "content harm"
|
|
34
39
|
PROTECTED_MATERIAL = "protected material"
|
|
35
40
|
XPIA = "xpia"
|
|
41
|
+
GROUNDEDNESS = "groundedness"
|
|
36
42
|
|
|
37
43
|
|
|
38
44
|
class _InternalAnnotationTasks:
|
|
@@ -43,7 +49,7 @@ class _InternalAnnotationTasks:
|
|
|
43
49
|
ECI = "eci"
|
|
44
50
|
|
|
45
51
|
|
|
46
|
-
class EvaluationMetrics:
|
|
52
|
+
class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
|
|
47
53
|
"""Evaluation metrics to aid the RAI service in determining what
|
|
48
54
|
metrics to request, and how to present them back to the user."""
|
|
49
55
|
|
|
@@ -54,9 +60,10 @@ class EvaluationMetrics:
|
|
|
54
60
|
SEXUAL = "sexual"
|
|
55
61
|
PROTECTED_MATERIAL = "protected_material"
|
|
56
62
|
XPIA = "xpia"
|
|
63
|
+
GROUNDEDNESS = "generic_groundedness"
|
|
57
64
|
|
|
58
65
|
|
|
59
|
-
class _InternalEvaluationMetrics:
|
|
66
|
+
class _InternalEvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
|
|
60
67
|
"""Evaluation metrics that are not publicly supported.
|
|
61
68
|
These metrics are experimental and subject to potential change or migration to the main
|
|
62
69
|
enum over time.
|
|
azure/ai/evaluation/_common/math.py
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
import math
|
|
6
|
+
from typing import List
|
|
7
|
+
|
|
8
|
+
from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def list_sum(lst: List[float]) -> float:
|
|
12
|
+
return sum(lst)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def list_mean(lst: List[float]) -> float:
|
|
16
|
+
return list_sum(lst) / len(lst)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def list_mean_nan_safe(lst: List[float]) -> float:
|
|
20
|
+
msg = "All score values are NaN. The mean cannot be calculated."
|
|
21
|
+
if all(math.isnan(l) for l in lst):
|
|
22
|
+
raise EvaluationException(
|
|
23
|
+
message=msg,
|
|
24
|
+
internal_message=msg,
|
|
25
|
+
blame=ErrorBlame.USER_ERROR,
|
|
26
|
+
category=ErrorCategory.INVALID_VALUE,
|
|
27
|
+
target=ErrorTarget.CONVERSATION,
|
|
28
|
+
)
|
|
29
|
+
return list_mean([l for l in lst if not math.isnan(l)])
|