azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic. Click here for more details.
- azure/ai/evaluation/__init__.py +4 -26
- azure/ai/evaluation/_common/constants.py +2 -9
- azure/ai/evaluation/_common/rai_service.py +122 -302
- azure/ai/evaluation/_common/utils.py +35 -393
- azure/ai/evaluation/_constants.py +6 -28
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
- azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +8 -25
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +30 -68
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +40 -117
- azure/ai/evaluation/_evaluate/_evaluate.py +255 -416
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +19 -24
- azure/ai/evaluation/_evaluate/_utils.py +47 -108
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +19 -18
- azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
- azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +93 -78
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -104
- azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +35 -24
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
- azure/ai/evaluation/_evaluators/_eci/_eci.py +55 -45
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -36
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +94 -76
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +17 -15
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +92 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -21
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +43 -25
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +101 -84
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -27
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +45 -55
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +106 -91
- azure/ai/evaluation/_exceptions.py +7 -28
- azure/ai/evaluation/_http_utils.py +134 -205
- azure/ai/evaluation/_model_configurations.py +8 -104
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -3
- azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
- azure/ai/evaluation/simulator/_adversarial_simulator.py +95 -116
- azure/ai/evaluation/simulator/_constants.py +1 -11
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -14
- azure/ai/evaluation/simulator/_conversation/_conversation.py +20 -20
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +68 -34
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -1
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +28 -31
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +95 -108
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +14 -30
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +14 -25
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
- azure/ai/evaluation/simulator/_model_tools/models.py +21 -19
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
- azure/ai/evaluation/simulator/_tracing.py +28 -25
- azure/ai/evaluation/simulator/_utils.py +13 -34
- azure/ai/evaluation/simulator/simulator.py +579 -0
- azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
- azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_common/_experimental.py +0 -172
- azure/ai/evaluation/_common/math.py +0 -89
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -99
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
- azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
- azure/ai/evaluation/_vendor/__init__.py +0 -3
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
- azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
- azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
- azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- azure/ai/evaluation/simulator/_simulator.py +0 -716
- azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
- azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
- azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/__init__.py
CHANGED
|
@@ -4,42 +4,30 @@
|
|
|
4
4
|
|
|
5
5
|
from ._evaluate._evaluate import evaluate
|
|
6
6
|
from ._evaluators._bleu import BleuScoreEvaluator
|
|
7
|
+
from ._evaluators._chat import ChatEvaluator
|
|
7
8
|
from ._evaluators._coherence import CoherenceEvaluator
|
|
8
9
|
from ._evaluators._content_safety import (
|
|
10
|
+
ContentSafetyChatEvaluator,
|
|
9
11
|
ContentSafetyEvaluator,
|
|
10
12
|
HateUnfairnessEvaluator,
|
|
11
13
|
SelfHarmEvaluator,
|
|
12
14
|
SexualEvaluator,
|
|
13
15
|
ViolenceEvaluator,
|
|
14
16
|
)
|
|
15
|
-
from ._evaluators._multimodal._content_safety_multimodal import (
|
|
16
|
-
ContentSafetyMultimodalEvaluator,
|
|
17
|
-
HateUnfairnessMultimodalEvaluator,
|
|
18
|
-
SelfHarmMultimodalEvaluator,
|
|
19
|
-
SexualMultimodalEvaluator,
|
|
20
|
-
ViolenceMultimodalEvaluator,
|
|
21
|
-
)
|
|
22
|
-
from ._evaluators._multimodal._protected_material import ProtectedMaterialMultimodalEvaluator
|
|
23
17
|
from ._evaluators._f1_score import F1ScoreEvaluator
|
|
24
18
|
from ._evaluators._fluency import FluencyEvaluator
|
|
25
19
|
from ._evaluators._gleu import GleuScoreEvaluator
|
|
26
20
|
from ._evaluators._groundedness import GroundednessEvaluator
|
|
27
|
-
from ._evaluators._service_groundedness import GroundednessProEvaluator
|
|
28
21
|
from ._evaluators._meteor import MeteorScoreEvaluator
|
|
29
22
|
from ._evaluators._protected_material import ProtectedMaterialEvaluator
|
|
30
23
|
from ._evaluators._qa import QAEvaluator
|
|
31
24
|
from ._evaluators._relevance import RelevanceEvaluator
|
|
32
|
-
from ._evaluators._retrieval import RetrievalEvaluator
|
|
33
25
|
from ._evaluators._rouge import RougeScoreEvaluator, RougeType
|
|
34
26
|
from ._evaluators._similarity import SimilarityEvaluator
|
|
35
27
|
from ._evaluators._xpia import IndirectAttackEvaluator
|
|
36
28
|
from ._model_configurations import (
|
|
37
29
|
AzureAIProject,
|
|
38
30
|
AzureOpenAIModelConfiguration,
|
|
39
|
-
Conversation,
|
|
40
|
-
EvaluationResult,
|
|
41
|
-
EvaluatorConfig,
|
|
42
|
-
Message,
|
|
43
31
|
OpenAIModelConfiguration,
|
|
44
32
|
)
|
|
45
33
|
|
|
@@ -49,34 +37,24 @@ __all__ = [
|
|
|
49
37
|
"F1ScoreEvaluator",
|
|
50
38
|
"FluencyEvaluator",
|
|
51
39
|
"GroundednessEvaluator",
|
|
52
|
-
"GroundednessProEvaluator",
|
|
53
40
|
"RelevanceEvaluator",
|
|
54
41
|
"SimilarityEvaluator",
|
|
55
42
|
"QAEvaluator",
|
|
43
|
+
"ChatEvaluator",
|
|
56
44
|
"ViolenceEvaluator",
|
|
57
45
|
"SexualEvaluator",
|
|
58
46
|
"SelfHarmEvaluator",
|
|
59
47
|
"HateUnfairnessEvaluator",
|
|
60
48
|
"ContentSafetyEvaluator",
|
|
49
|
+
"ContentSafetyChatEvaluator",
|
|
61
50
|
"IndirectAttackEvaluator",
|
|
62
51
|
"BleuScoreEvaluator",
|
|
63
52
|
"GleuScoreEvaluator",
|
|
64
53
|
"MeteorScoreEvaluator",
|
|
65
|
-
"RetrievalEvaluator",
|
|
66
54
|
"RougeScoreEvaluator",
|
|
67
55
|
"RougeType",
|
|
68
56
|
"ProtectedMaterialEvaluator",
|
|
69
57
|
"AzureAIProject",
|
|
70
58
|
"AzureOpenAIModelConfiguration",
|
|
71
59
|
"OpenAIModelConfiguration",
|
|
72
|
-
"EvaluatorConfig",
|
|
73
|
-
"Conversation",
|
|
74
|
-
"Message",
|
|
75
|
-
"EvaluationResult",
|
|
76
|
-
"ContentSafetyMultimodalEvaluator",
|
|
77
|
-
"HateUnfairnessMultimodalEvaluator",
|
|
78
|
-
"SelfHarmMultimodalEvaluator",
|
|
79
|
-
"SexualMultimodalEvaluator",
|
|
80
|
-
"ViolenceMultimodalEvaluator",
|
|
81
|
-
"ProtectedMaterialMultimodalEvaluator",
|
|
82
60
|
]
|
|
azure/ai/evaluation/_common/constants.py
CHANGED
|
@@ -3,11 +3,6 @@
|
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
4
|
from enum import Enum
|
|
5
5
|
|
|
6
|
-
from azure.core import CaseInsensitiveEnumMeta
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
PROMPT_BASED_REASON_EVALUATORS = ["coherence", "relevance", "retrieval", "groundedness", "fluency"]
|
|
10
|
-
|
|
11
6
|
|
|
12
7
|
class CommonConstants:
|
|
13
8
|
"""Define common constants."""
|
|
@@ -38,7 +33,6 @@ class Tasks:
|
|
|
38
33
|
CONTENT_HARM = "content harm"
|
|
39
34
|
PROTECTED_MATERIAL = "protected material"
|
|
40
35
|
XPIA = "xpia"
|
|
41
|
-
GROUNDEDNESS = "groundedness"
|
|
42
36
|
|
|
43
37
|
|
|
44
38
|
class _InternalAnnotationTasks:
|
|
@@ -49,7 +43,7 @@ class _InternalAnnotationTasks:
|
|
|
49
43
|
ECI = "eci"
|
|
50
44
|
|
|
51
45
|
|
|
52
|
-
class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
|
|
46
|
+
class EvaluationMetrics:
|
|
53
47
|
"""Evaluation metrics to aid the RAI service in determining what
|
|
54
48
|
metrics to request, and how to present them back to the user."""
|
|
55
49
|
|
|
@@ -60,10 +54,9 @@ class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
|
|
|
60
54
|
SEXUAL = "sexual"
|
|
61
55
|
PROTECTED_MATERIAL = "protected_material"
|
|
62
56
|
XPIA = "xpia"
|
|
63
|
-
GROUNDEDNESS = "generic_groundedness"
|
|
64
57
|
|
|
65
58
|
|
|
66
|
-
class _InternalEvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
|
|
59
|
+
class _InternalEvaluationMetrics:
|
|
67
60
|
"""Evaluation metrics that are not publicly supported.
|
|
68
61
|
These metrics are experimental and subject to potential change or migration to the main
|
|
69
62
|
enum over time.
|