azure-ai-evaluation 1.11.2__py3-none-any.whl → 1.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. azure/ai/evaluation/__init__.py +2 -0
  2. azure/ai/evaluation/_aoai/aoai_grader.py +69 -28
  3. azure/ai/evaluation/_aoai/label_grader.py +14 -13
  4. azure/ai/evaluation/_aoai/python_grader.py +15 -13
  5. azure/ai/evaluation/_aoai/score_model_grader.py +13 -10
  6. azure/ai/evaluation/_aoai/string_check_grader.py +13 -13
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +16 -25
  8. azure/ai/evaluation/_common/__init__.py +2 -1
  9. azure/ai/evaluation/_common/constants.py +109 -0
  10. azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
  11. azure/ai/evaluation/_common/onedp/__init__.py +2 -2
  12. azure/ai/evaluation/_common/onedp/_client.py +44 -14
  13. azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
  14. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
  15. azure/ai/evaluation/_common/onedp/_validation.py +18 -2
  16. azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
  17. azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
  18. azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
  19. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
  20. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
  21. azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
  22. azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
  23. azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
  24. azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
  25. azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
  26. azure/ai/evaluation/_common/rai_service.py +299 -2
  27. azure/ai/evaluation/_common/utils.py +173 -39
  28. azure/ai/evaluation/_constants.py +100 -0
  29. azure/ai/evaluation/_eval_mapping.py +10 -0
  30. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
  31. azure/ai/evaluation/_evaluate/_evaluate.py +1125 -9
  32. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +330 -51
  33. azure/ai/evaluation/_evaluate/_utils.py +17 -6
  34. azure/ai/evaluation/_evaluator_definition.py +76 -0
  35. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
  36. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
  37. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
  38. azure/ai/evaluation/_evaluators/_common/_base_eval.py +80 -4
  39. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
  40. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
  41. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +28 -13
  42. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  43. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
  44. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -7
  45. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
  46. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
  47. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
  48. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
  49. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
  50. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  51. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  52. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  53. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  54. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  55. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
  56. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  57. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  58. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  59. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  60. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  61. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  62. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  63. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  64. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  65. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  66. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  67. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  68. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
  69. azure/ai/evaluation/_exceptions.py +6 -0
  70. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
  71. azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
  72. azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
  73. azure/ai/evaluation/_model_configurations.py +26 -0
  74. azure/ai/evaluation/_version.py +1 -1
  75. azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
  76. azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
  77. azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
  78. azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
  79. azure/ai/evaluation/red_team/_mlflow_integration.py +144 -36
  80. azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
  81. azure/ai/evaluation/red_team/_red_team.py +503 -37
  82. azure/ai/evaluation/red_team/_red_team_result.py +264 -15
  83. azure/ai/evaluation/red_team/_result_processor.py +953 -31
  84. azure/ai/evaluation/red_team/_utils/constants.py +1 -0
  85. azure/ai/evaluation/red_team/_utils/formatting_utils.py +126 -25
  86. azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
  87. azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
  88. azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
  89. azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
  90. azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
  91. azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
  92. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
  93. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
  94. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
  95. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
  96. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
  97. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  98. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +44 -10
  99. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +102 -84
  100. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
  101. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
  102. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
@@ -28,6 +28,9 @@ class EvaluationMetrics:
28
28
  XPIA = "xpia"
29
29
  CODE_VULNERABILITY = "code_vulnerability"
30
30
  UNGROUNDED_ATTRIBUTES = "ungrounded_attributes"
31
+ SENSITIVE_DATA_LEAKAGE = "sensitive_data_leakage"
32
+ TASK_ADHERENCE = "task_adherence"
33
+ PROHIBITED_ACTIONS = "prohibited_actions"
31
34
 
32
35
 
33
36
  class _InternalEvaluationMetrics:
@@ -90,6 +93,100 @@ class TokenScope(str, enum.Enum):
90
93
  AZURE_ML = "https://ml.azure.com/.default"
91
94
 
92
95
 
96
+ class _EvaluatorMetricMapping:
97
+ """
98
+ Static mapping of evaluator names to their metric names, based on assets.json.
99
+ The 'builtin.' prefix is removed from the evaluator name keys.
100
+ """
101
+
102
+ EVALUATOR_NAME_METRICS_MAPPINGS = {
103
+ "bleu_score": ["bleu"],
104
+ "coherence": ["coherence"],
105
+ "document_retrieval": [
106
+ "ndcg@3",
107
+ "xdcg@3",
108
+ "fidelity",
109
+ "top1_relevance",
110
+ "top3_max_relevance",
111
+ "holes",
112
+ "holes_ratio",
113
+ "total_retrieved_documents",
114
+ "total_ground_truth_documents",
115
+ ],
116
+ "f1_score": ["f1_score"],
117
+ "fluency": ["fluency"],
118
+ "gleu_score": ["gleu"],
119
+ "meteor_score": ["meteor"],
120
+ "relevance": ["relevance"],
121
+ "response_completeness": ["response_completeness"],
122
+ "rouge_score": ["rouge_precision", "rouge_recall", "rouge_f1_score"],
123
+ "groundedness_pro": ["groundedness_pro"],
124
+ "similarity": ["similarity"],
125
+ "intent_resolution": ["intent_resolution"],
126
+ "retrieval": ["retrieval"],
127
+ "task_adherence": ["task_adherence"],
128
+ "tool_call_accuracy": ["tool_call_accuracy"],
129
+ "groundedness": ["groundedness"],
130
+ "code_vulnerability": ["code_vulnerability"],
131
+ "eci": ["eci"],
132
+ "protected_material": ["protected_material"],
133
+ "ungrounded_attributes": ["ungrounded_attributes"],
134
+ "indirect_attack": ["xpia", "xpia_manipulated_content", "xpia_intrusion", "xpia_information_gathering"],
135
+ "label_grader": ["label_model"],
136
+ "string_check_grader": ["string_check"],
137
+ "text_similarity_grader": ["similarity"],
138
+ "score_model_grader": ["score_model"],
139
+ "sexual": ["sexual"],
140
+ "self_harm": ["self_harm"],
141
+ "violence": ["violence"],
142
+ "hate_unfairness": ["hate_unfairness"],
143
+ "tool_input_accuracy": ["tool_input_accuracy"],
144
+ "task_completion": ["task_completion"],
145
+ "tool_success": ["tool_success"],
146
+ "tool_selection": ["tool_selection"],
147
+ "tool_output_utilization": ["tool_output_utilization"],
148
+ "task_navigation_efficiency": ["task_navigation_efficiency"],
149
+ "text_similarity": ["similarity"],
150
+ "string_check": ["string_check"],
151
+ "sensitive_data_leakage": ["prohibited_actions"],
152
+ "score_model": ["score_model"],
153
+ "label_model": ["label_model"],
154
+ "prohibited_actions": ["prohibited_actions"],
155
+ }
156
+
157
+ EVAL_CLASS_NAME_MAP = {
158
+ "BleuScoreEvaluator": "bleu_score",
159
+ "CodeVulnerabilityEvaluator": "code_vulnerability",
160
+ "CoherenceEvaluator": "coherence",
161
+ "ContentSafetyEvaluator": "content_safety",
162
+ "DocumentRetrievalEvaluator": "document_retrieval",
163
+ "ECIEvaluator": "eci",
164
+ "F1ScoreEvaluator": "f1_score",
165
+ "FluencyEvaluator": "fluency",
166
+ "GleuScoreEvaluator": "gleu_score",
167
+ "GroundednessEvaluator": "groundedness",
168
+ "GroundednessProEvaluator": "groundedness_pro",
169
+ "HateUnfairnessEvaluator": "hate_unfairness",
170
+ "IndirectAttackEvaluator": "indirect_attack",
171
+ "IntentResolutionEvaluator": "intent_resolution",
172
+ "MeteorScoreEvaluator": "meteor_score",
173
+ "ProtectedMaterialEvaluator": "protected_material",
174
+ "QAEvaluator": "qa",
175
+ "RelevanceEvaluator": "relevance",
176
+ "ResponseCompletenessEvaluator": "response_completeness",
177
+ "RetrievalEvaluator": "retrieval",
178
+ "RougeScoreEvaluator": "rouge_score",
179
+ "SelfHarmEvaluator": "self_harm",
180
+ "SexualEvaluator": "sexual",
181
+ "SimilarityEvaluator": "similarity",
182
+ "TaskAdherenceEvaluator": "task_adherence",
183
+ "TaskCompletionEvaluator": "task_completion",
184
+ "ToolCallAccuracyEvaluator": "tool_call_accuracy",
185
+ "UngroundedAttributesEvaluator": "ungrounded_attributes",
186
+ "ViolenceEvaluator": "violence",
187
+ }
188
+
189
+
93
190
  DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"
94
191
 
95
192
  CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
@@ -116,3 +213,6 @@ BINARY_AGGREGATE_SUFFIX = "binary_aggregate"
116
213
  AOAI_COLUMN_NAME = "aoai"
117
214
  DEFAULT_OAI_EVAL_RUN_NAME = "AI_SDK_EVAL_RUN"
118
215
  DEFAULT_AOAI_API_VERSION = "2025-04-01-preview" # Unfortunately relying on preview version for now.
216
+
217
+ # OpenTelemetry event names
218
+ EVALUATION_EVENT_NAME = "gen_ai.evaluation.result"
@@ -11,6 +11,11 @@
11
11
 
12
12
  # Import all evals
13
13
  from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator
14
+ from azure.ai.evaluation._evaluators._task_completion import _TaskCompletionEvaluator
15
+ from azure.ai.evaluation._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator
16
+ from azure.ai.evaluation._evaluators._tool_selection import _ToolSelectionEvaluator
17
+ from azure.ai.evaluation._evaluators._tool_success import _ToolSuccessEvaluator
18
+ from azure.ai.evaluation._evaluators._task_navigation_efficiency import _TaskNavigationEfficiencyEvaluator
14
19
  from azure.ai.evaluation import (
15
20
  BleuScoreEvaluator,
16
21
  CodeVulnerabilityEvaluator,
@@ -67,7 +72,12 @@ EVAL_CLASS_MAP = {
67
72
  SexualEvaluator: "sexual",
68
73
  SimilarityEvaluator: "similarity",
69
74
  TaskAdherenceEvaluator: "task_adherence",
75
+ _TaskCompletionEvaluator: "task_completion",
76
+ _TaskNavigationEfficiencyEvaluator: "task_navigation_efficiency",
70
77
  ToolCallAccuracyEvaluator: "tool_call_accuracy",
78
+ _ToolInputAccuracyEvaluator: "tool_input_accuracy",
79
+ _ToolSelectionEvaluator: "tool_selection",
80
+ _ToolSuccessEvaluator: "tool_success",
71
81
  UngroundedAttributesEvaluator: "ungrounded_attributes",
72
82
  ViolenceEvaluator: "violence",
73
83
  }
@@ -159,6 +159,16 @@ class RunSubmitterClient:
159
159
  "completed_lines": total_lines - failed_lines,
160
160
  "failed_lines": failed_lines,
161
161
  "log_path": None,
162
+ "error_message": (
163
+ f"({run.result.error.blame.value}) {run.result.error.message}"
164
+ if run.result and run.result.error and run.result.error.blame
165
+ else None
166
+ ),
167
+ "error_code": (
168
+ f"{run.result.error.category.value}"
169
+ if run.result and run.result.error and run.result.error.category
170
+ else None
171
+ ),
162
172
  }
163
173
 
164
174
  @staticmethod