azure-ai-evaluation 1.11.2__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +2 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +69 -28
- azure/ai/evaluation/_aoai/label_grader.py +14 -13
- azure/ai/evaluation/_aoai/python_grader.py +15 -13
- azure/ai/evaluation/_aoai/score_model_grader.py +13 -10
- azure/ai/evaluation/_aoai/string_check_grader.py +13 -13
- azure/ai/evaluation/_aoai/text_similarity_grader.py +16 -25
- azure/ai/evaluation/_common/__init__.py +2 -1
- azure/ai/evaluation/_common/constants.py +109 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
- azure/ai/evaluation/_common/onedp/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
- azure/ai/evaluation/_common/onedp/_validation.py +18 -2
- azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
- azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
- azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
- azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
- azure/ai/evaluation/_common/rai_service.py +299 -2
- azure/ai/evaluation/_common/utils.py +173 -39
- azure/ai/evaluation/_constants.py +100 -0
- azure/ai/evaluation/_eval_mapping.py +10 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +1125 -9
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +330 -51
- azure/ai/evaluation/_evaluate/_utils.py +17 -6
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +80 -4
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +28 -13
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -7
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
- azure/ai/evaluation/_exceptions.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
- azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
- azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
- azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
- azure/ai/evaluation/red_team/_mlflow_integration.py +144 -36
- azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
- azure/ai/evaluation/red_team/_red_team.py +503 -37
- azure/ai/evaluation/red_team/_red_team_result.py +264 -15
- azure/ai/evaluation/red_team/_result_processor.py +953 -31
- azure/ai/evaluation/red_team/_utils/constants.py +1 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +126 -25
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
- azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +44 -10
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +102 -84
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
|
@@ -28,6 +28,9 @@ class EvaluationMetrics:
|
|
|
28
28
|
XPIA = "xpia"
|
|
29
29
|
CODE_VULNERABILITY = "code_vulnerability"
|
|
30
30
|
UNGROUNDED_ATTRIBUTES = "ungrounded_attributes"
|
|
31
|
+
SENSITIVE_DATA_LEAKAGE = "sensitive_data_leakage"
|
|
32
|
+
TASK_ADHERENCE = "task_adherence"
|
|
33
|
+
PROHIBITED_ACTIONS = "prohibited_actions"
|
|
31
34
|
|
|
32
35
|
|
|
33
36
|
class _InternalEvaluationMetrics:
|
|
@@ -90,6 +93,100 @@ class TokenScope(str, enum.Enum):
|
|
|
90
93
|
AZURE_ML = "https://ml.azure.com/.default"
|
|
91
94
|
|
|
92
95
|
|
|
96
|
+
class _EvaluatorMetricMapping:
|
|
97
|
+
"""
|
|
98
|
+
Static mapping of evaluator names to their metric names, based on assets.json.
|
|
99
|
+
The 'builtin.' prefix is removed from the evaluator name keys.
|
|
100
|
+
"""
|
|
101
|
+
|
|
102
|
+
EVALUATOR_NAME_METRICS_MAPPINGS = {
|
|
103
|
+
"bleu_score": ["bleu"],
|
|
104
|
+
"coherence": ["coherence"],
|
|
105
|
+
"document_retrieval": [
|
|
106
|
+
"ndcg@3",
|
|
107
|
+
"xdcg@3",
|
|
108
|
+
"fidelity",
|
|
109
|
+
"top1_relevance",
|
|
110
|
+
"top3_max_relevance",
|
|
111
|
+
"holes",
|
|
112
|
+
"holes_ratio",
|
|
113
|
+
"total_retrieved_documents",
|
|
114
|
+
"total_ground_truth_documents",
|
|
115
|
+
],
|
|
116
|
+
"f1_score": ["f1_score"],
|
|
117
|
+
"fluency": ["fluency"],
|
|
118
|
+
"gleu_score": ["gleu"],
|
|
119
|
+
"meteor_score": ["meteor"],
|
|
120
|
+
"relevance": ["relevance"],
|
|
121
|
+
"response_completeness": ["response_completeness"],
|
|
122
|
+
"rouge_score": ["rouge_precision", "rouge_recall", "rouge_f1_score"],
|
|
123
|
+
"groundedness_pro": ["groundedness_pro"],
|
|
124
|
+
"similarity": ["similarity"],
|
|
125
|
+
"intent_resolution": ["intent_resolution"],
|
|
126
|
+
"retrieval": ["retrieval"],
|
|
127
|
+
"task_adherence": ["task_adherence"],
|
|
128
|
+
"tool_call_accuracy": ["tool_call_accuracy"],
|
|
129
|
+
"groundedness": ["groundedness"],
|
|
130
|
+
"code_vulnerability": ["code_vulnerability"],
|
|
131
|
+
"eci": ["eci"],
|
|
132
|
+
"protected_material": ["protected_material"],
|
|
133
|
+
"ungrounded_attributes": ["ungrounded_attributes"],
|
|
134
|
+
"indirect_attack": ["xpia", "xpia_manipulated_content", "xpia_intrusion", "xpia_information_gathering"],
|
|
135
|
+
"label_grader": ["label_model"],
|
|
136
|
+
"string_check_grader": ["string_check"],
|
|
137
|
+
"text_similarity_grader": ["similarity"],
|
|
138
|
+
"score_model_grader": ["score_model"],
|
|
139
|
+
"sexual": ["sexual"],
|
|
140
|
+
"self_harm": ["self_harm"],
|
|
141
|
+
"violence": ["violence"],
|
|
142
|
+
"hate_unfairness": ["hate_unfairness"],
|
|
143
|
+
"tool_input_accuracy": ["tool_input_accuracy"],
|
|
144
|
+
"task_completion": ["task_completion"],
|
|
145
|
+
"tool_success": ["tool_success"],
|
|
146
|
+
"tool_selection": ["tool_selection"],
|
|
147
|
+
"tool_output_utilization": ["tool_output_utilization"],
|
|
148
|
+
"task_navigation_efficiency": ["task_navigation_efficiency"],
|
|
149
|
+
"text_similarity": ["similarity"],
|
|
150
|
+
"string_check": ["string_check"],
|
|
151
|
+
"sensitive_data_leakage": ["prohibited_actions"],
|
|
152
|
+
"score_model": ["score_model"],
|
|
153
|
+
"label_model": ["label_model"],
|
|
154
|
+
"prohibited_actions": ["prohibited_actions"],
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
EVAL_CLASS_NAME_MAP = {
|
|
158
|
+
"BleuScoreEvaluator": "bleu_score",
|
|
159
|
+
"CodeVulnerabilityEvaluator": "code_vulnerability",
|
|
160
|
+
"CoherenceEvaluator": "coherence",
|
|
161
|
+
"ContentSafetyEvaluator": "content_safety",
|
|
162
|
+
"DocumentRetrievalEvaluator": "document_retrieval",
|
|
163
|
+
"ECIEvaluator": "eci",
|
|
164
|
+
"F1ScoreEvaluator": "f1_score",
|
|
165
|
+
"FluencyEvaluator": "fluency",
|
|
166
|
+
"GleuScoreEvaluator": "gleu_score",
|
|
167
|
+
"GroundednessEvaluator": "groundedness",
|
|
168
|
+
"GroundednessProEvaluator": "groundedness_pro",
|
|
169
|
+
"HateUnfairnessEvaluator": "hate_unfairness",
|
|
170
|
+
"IndirectAttackEvaluator": "indirect_attack",
|
|
171
|
+
"IntentResolutionEvaluator": "intent_resolution",
|
|
172
|
+
"MeteorScoreEvaluator": "meteor_score",
|
|
173
|
+
"ProtectedMaterialEvaluator": "protected_material",
|
|
174
|
+
"QAEvaluator": "qa",
|
|
175
|
+
"RelevanceEvaluator": "relevance",
|
|
176
|
+
"ResponseCompletenessEvaluator": "response_completeness",
|
|
177
|
+
"RetrievalEvaluator": "retrieval",
|
|
178
|
+
"RougeScoreEvaluator": "rouge_score",
|
|
179
|
+
"SelfHarmEvaluator": "self_harm",
|
|
180
|
+
"SexualEvaluator": "sexual",
|
|
181
|
+
"SimilarityEvaluator": "similarity",
|
|
182
|
+
"TaskAdherenceEvaluator": "task_adherence",
|
|
183
|
+
"TaskCompletionEvaluator": "task_completion",
|
|
184
|
+
"ToolCallAccuracyEvaluator": "tool_call_accuracy",
|
|
185
|
+
"UngroundedAttributesEvaluator": "ungrounded_attributes",
|
|
186
|
+
"ViolenceEvaluator": "violence",
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
|
|
93
190
|
DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"
|
|
94
191
|
|
|
95
192
|
CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
|
|
@@ -116,3 +213,6 @@ BINARY_AGGREGATE_SUFFIX = "binary_aggregate"
|
|
|
116
213
|
AOAI_COLUMN_NAME = "aoai"
|
|
117
214
|
DEFAULT_OAI_EVAL_RUN_NAME = "AI_SDK_EVAL_RUN"
|
|
118
215
|
DEFAULT_AOAI_API_VERSION = "2025-04-01-preview" # Unfortunately relying on preview version for now.
|
|
216
|
+
|
|
217
|
+
# OpenTelemetry event names
|
|
218
|
+
EVALUATION_EVENT_NAME = "gen_ai.evaluation.result"
|
|
@@ -11,6 +11,11 @@
|
|
|
11
11
|
|
|
12
12
|
# Import all evals
|
|
13
13
|
from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator
|
|
14
|
+
from azure.ai.evaluation._evaluators._task_completion import _TaskCompletionEvaluator
|
|
15
|
+
from azure.ai.evaluation._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator
|
|
16
|
+
from azure.ai.evaluation._evaluators._tool_selection import _ToolSelectionEvaluator
|
|
17
|
+
from azure.ai.evaluation._evaluators._tool_success import _ToolSuccessEvaluator
|
|
18
|
+
from azure.ai.evaluation._evaluators._task_navigation_efficiency import _TaskNavigationEfficiencyEvaluator
|
|
14
19
|
from azure.ai.evaluation import (
|
|
15
20
|
BleuScoreEvaluator,
|
|
16
21
|
CodeVulnerabilityEvaluator,
|
|
@@ -67,7 +72,12 @@ EVAL_CLASS_MAP = {
|
|
|
67
72
|
SexualEvaluator: "sexual",
|
|
68
73
|
SimilarityEvaluator: "similarity",
|
|
69
74
|
TaskAdherenceEvaluator: "task_adherence",
|
|
75
|
+
_TaskCompletionEvaluator: "task_completion",
|
|
76
|
+
_TaskNavigationEfficiencyEvaluator: "task_navigation_efficiency",
|
|
70
77
|
ToolCallAccuracyEvaluator: "tool_call_accuracy",
|
|
78
|
+
_ToolInputAccuracyEvaluator: "tool_input_accuracy",
|
|
79
|
+
_ToolSelectionEvaluator: "tool_selection",
|
|
80
|
+
_ToolSuccessEvaluator: "tool_success",
|
|
71
81
|
UngroundedAttributesEvaluator: "ungrounded_attributes",
|
|
72
82
|
ViolenceEvaluator: "violence",
|
|
73
83
|
}
|
|
@@ -159,6 +159,16 @@ class RunSubmitterClient:
|
|
|
159
159
|
"completed_lines": total_lines - failed_lines,
|
|
160
160
|
"failed_lines": failed_lines,
|
|
161
161
|
"log_path": None,
|
|
162
|
+
"error_message": (
|
|
163
|
+
f"({run.result.error.blame.value}) {run.result.error.message}"
|
|
164
|
+
if run.result and run.result.error and run.result.error.blame
|
|
165
|
+
else None
|
|
166
|
+
),
|
|
167
|
+
"error_code": (
|
|
168
|
+
f"{run.result.error.category.value}"
|
|
169
|
+
if run.result and run.result.error and run.result.error.category
|
|
170
|
+
else None
|
|
171
|
+
),
|
|
162
172
|
}
|
|
163
173
|
|
|
164
174
|
@staticmethod
|