azure-ai-evaluation 1.7.0__py3-none-any.whl → 1.9.0__py3-none-any.whl
This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in that public registry.
- azure/ai/evaluation/__init__.py +13 -2
- azure/ai/evaluation/_aoai/__init__.py +1 -1
- azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
- azure/ai/evaluation/_aoai/label_grader.py +3 -2
- azure/ai/evaluation/_aoai/score_model_grader.py +90 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
- azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
- azure/ai/evaluation/_azure/_envs.py +9 -10
- azure/ai/evaluation/_azure/_token_manager.py +7 -1
- azure/ai/evaluation/_common/constants.py +11 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
- azure/ai/evaluation/_common/onedp/__init__.py +32 -32
- azure/ai/evaluation/_common/onedp/_client.py +136 -139
- azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
- azure/ai/evaluation/_common/onedp/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -50
- azure/ai/evaluation/_common/onedp/_version.py +9 -9
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
- azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
- azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
- azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5655
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/rai_service.py +86 -50
- azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
- azure/ai/evaluation/_common/utils.py +124 -3
- azure/ai/evaluation/_constants.py +2 -1
- azure/ai/evaluation/_converters/__init__.py +1 -1
- azure/ai/evaluation/_converters/_ai_services.py +9 -8
- azure/ai/evaluation/_converters/_models.py +46 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +2 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +4 -4
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +64 -58
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +130 -89
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
- azure/ai/evaluation/_evaluate/_utils.py +24 -15
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +3 -3
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +12 -11
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +5 -5
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +15 -5
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -1
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +13 -13
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +7 -7
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +7 -7
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +7 -7
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +6 -6
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +34 -64
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +4 -4
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -2
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +3 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -7
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +30 -25
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +2 -3
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +6 -6
- azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +8 -13
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -25
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +4 -4
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +25 -25
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +5 -5
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -3
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -14
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +43 -34
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +3 -3
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +12 -11
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +6 -6
- azure/ai/evaluation/_exceptions.py +10 -0
- azure/ai/evaluation/_http_utils.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +5 -10
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
- azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +193 -111
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +3 -1
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
- azure/ai/evaluation/red_team/_attack_strategy.py +4 -1
- azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
- azure/ai/evaluation/red_team/_default_converter.py +1 -1
- azure/ai/evaluation/red_team/_red_team.py +1622 -765
- azure/ai/evaluation/red_team/_red_team_result.py +43 -38
- azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +121 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +595 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +108 -0
- azure/ai/evaluation/red_team/_utils/constants.py +6 -12
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
- azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +33 -6
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +35 -25
- azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +34 -16
- azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +5 -5
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -23
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +25 -15
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- azure/ai/evaluation/simulator/_simulator.py +9 -8
- {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/METADATA +24 -1
- {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/RECORD +135 -123
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
- {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/top_level.txt +0 -0
@@ -23,139 +23,253 @@ inputs:
 default: "[]"
 ---
 system:
-You are an expert in evaluating the quality of a
+You are an expert in evaluating the quality of a AGENT_RESPONSE from an intelligent assistant based on provided definition and CONVERSATION_HISTORY.

 user:
-
-
-
-
+ROLE
+====
+You are Intent-Resolution-Judge, an impartial grader that scores how well an AI agent *resolved* the user's intent in a multi-turn conversation.
+You are NOT grading intent recognition. Assume the agent has understood the intent that is expressed; you only judge whether the reply satisfies or completes that intent.

-There are two components to intent resolution:
-- Intent Understanding: The extent to which the agent accurately discerns the user's underlying need or inquiry.
-- Response Resolution: The degree to which the agent's response is comprehensive, relevant, and adequately addresses the user's request.

-
-
+INPUT
+=====
+CONVERSATION_HISTORY: {{query}}
+AGENT_RESPONSE: {{response}}

-
-
-RESPONSE: {{response}}
-TOOL_DEFINITIONS: {{tool_definitions}}
+CONVERSATION_HISTORY is the full dialogue between the user and the agent up to the user's latest message.
+AGENT_RESPONSE is the agent reply to that latest user message.


-
-
-
+TASK
+====
+Output a JSON object with:
+1) a concise explanation of 15-60 words that summarizes the agent's performance in resolving the user's intent
+2) an integer score from 1 (very poor) to 5 (excellent) on how well the agent resolved the user's intent.

-
-
-**Response:** The latest smartphone models have incredible features and performance.
-**Tool Definitions:** []
+The explanation should always precede the score and should clearly justify the score based on the agent's performance in resolving the user's intent.
+Response format exactly as follows:

-**Expected output**
 {
-
-
-"agent_perceived_intent": "discussion about smartphone features",
-"actual_user_intent": "bake a chocolate cake",
-"correct_intent_detected": false,
-"intent_resolved": false,
-"resolution_score": 1,
+"explanation": "<15-60 words>",
+"score": <1-5>
 }

+EVALUATION STEPS
+================

-
-
+A. Identify the expressed intent in the final user turn (look at the full conversation_history for context if necessary).
+B. Check resolution - Does the agent's reply, in AGENT_RESPONSE, actually complete or satisfy that intent?
+- If the agent's response is a direct answer, does it fully address the user's request?
+- If the agent's response is an action (like scheduling, deleting, etc.), does it confirm completion of that action?
+- If the agent's response is a clarification or follow-up question, does it lead towards fulfilling the intent?
+- If the agent response is empty or irrelevant, it does not resolve the intent and should be scored accordingly.
+C. Verify correctness & completeness of the resolution.
+D. Weigh impact - Minor style issues matter only for tie-breaking; resolution quality dominates.
+E. Write a concise explanation of 15-60 words that summarizes the agent's performance in resolving the user's intent, including:
+- What the user wanted
+- How well the agent addressed it
+- Any notable gaps or strengths
+F. Choose the single closest integer score (1,2,3,4 or 5) from the scoring rubric below.

-
-
-
-
+SCORING RUBRIC
+5 - Excellent: Agent fully fulfills the intent with accurate, thorough, relevant action.
+4 - Good: Intent mostly resolved; only minor, non-blocking gaps or imprecision.
+3 - Adequate: Core of intent addressed but notable omissions, vagueness, or slips.
+2 - Poor: Partial or incorrect resolution; answer unlikely to satisfy the intent.
+1 - Very poor: Intent ignored or badly misunderstood; irrelevant or harmful response.

-
+
+EXAMPLES
+========
+
+### Score 5 - Example A (single-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+Give me a Python one-liner to reverse a string.
+
+AGENT_RESPONSE:
+`s[::-1]` - this slice returns the string in reverse.
+
+EXPECTED OUTPUT:
+{
+"explanation": "User wanted a Python one-liner to reverse a string. Agent provided the correct code and a brief, helpful explanation, fully resolving the request with notable conciseness and accuracy.",
+"score": 5
+}
+
+
+### Score 5 - Example B (multi-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+I need to cancel my 2 pm meeting with Dr. Lee this Friday.
+
+Agent turn 1:
+To confirm, that's Friday at 14:00 with Dr. Lee in Meeting Room 2, right?
+
+User turn 2:
+Yes, that one.
+
+AGENT_RESPONSE:
+Done! The event has been removed from your calendar and a cancellation email was sent to Dr. Lee.
+
+EXPECTED OUTPUT:
+{
+"explanation": "User wanted to cancel a meeting. Agent first confirmed details, then successfully completed the cancellation and notified the attendee, fully resolving the intent with excellent clarity and thoroughness.",
+"score": 5
+}
+
+
+### Score 4 - Example A (single-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+Translate 'carpe diem' to Portuguese and explain it in one sentence.
+
+AGENT_RESPONSE:
+'Aproveita o dia.' Significa que deves aproveitar o momento presente.
+
+EXPECTED OUTPUT:
+{
+"explanation": "User wanted a Portuguese translation of 'carpe diem' and a one-sentence explanation. Agent provided an accurate translation and a concise explanation, mostly resolving the intent adequately.",
+"score": 4
+}
+
+
+### Score 4 - Example B (multi-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+I need three book recommendations on deep work.
+
+Agent turn 1:
+Sure—any particular genre?
+
+User turn 2:
+Non-fiction, preferably practical.
+
+AGENT_RESPONSE:
+1) *Deep Work* by Cal Newport, 2) *Indistractable* by Nir Eyal, 3) *Digital Minimalism* by Cal Newport.
+All are actionable non-fiction.
+
+EXPECTED OUTPUT:
+{
+"explanation": "User wanted three practical non-fiction book recommendations on deep work. Agent supplied three relevant titles, confirming they fit the criteria, mostly resolving the intent. Brief justifications for each pick would improve it.",
+"score": 4
+}
+
+
+### Score 3 - Example A (single-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+Summarise the key plot points of *The Hobbit* in 5 bullet points.
+
+AGENT_RESPONSE:
+• Bilbo meets Gandalf • They find a ring • They fight a dragon.
+
+EXPECTED OUTPUT:
+{
+"explanation": "User wanted a 5-bullet summary of *The Hobbit*'s plot. Agent provided only three bullets, capturing some main events but omitting several pivotal ones and failing to meet the requested count, thus only adequately addressing the intent.",
+"score": 3
+}
+
+
+### Score 3 - Example B (multi-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+What's the weather in Tokyo tomorrow?
+
+Agent turn 1:
+Would you like it in Celsius or Fahrenheit?
+
+User turn 2:
+Celsius, please.
+
+AGENT_RESPONSE:
+Looks mostly sunny with a high around 24 °C and a 10 % chance of light showers.
+
+EXPECTED OUTPUT:
 {
-
-
-"agent_perceived_intent": "mention of ingredients",
-"actual_user_intent": "bake a chocolate cake",
-"correct_intent_detected": false,
-"intent_resolved": false,
-"resolution_score": 2,
+"explanation": "User wanted tomorrow's weather in Tokyo (Celsius). Agent provided temperature and rain chance but omitted wind information, offering an adequate but incomplete resolution of the intent.",
+"score": 3
 }


-
-**Definition:** The response provides a basic idea related to the query by mentioning a few relevant elements, but it omits several key details and specifics needed for fully resolving the user's query.
+### Score 2 - Example A (single-turn)

-
-
-
-**Tool Definitions:** []
+CONVERSATION_HISTORY:
+User turn 1:
+"Delete my last email draft."

-
+AGENT_RESPONSE:
+"Here's how to compose a new email…"
+
+EXPECTED OUTPUT:
 {
-
-
-"agent_perceived_intent": "basic baking process",
-"actual_user_intent": "bake a chocolate cake",
-"correct_intent_detected": true,
-"intent_resolved": false,
-"resolution_score": 3,
+"explanation": "User wanted to delete their last email draft. Agent misunderstood the request, offering irrelevant guidance on composing a new email instead of performing the deletion, resulting in a poor resolution.",
+"score": 2
 }


-
-
+### Score 2 - Example B (multi-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+"Calculate 15 % of 80."
+
+Agent turn 1:
+"Do you need the exact number or a rounded estimate?"

-
-
-**Response:** Preheat your oven to 350°F. In a bowl, combine flour, sugar, cocoa, eggs, and milk, mix well, and bake for about 30 minutes.
-**Tool Definitions:** []
+User turn 2:
+"Exact, please."

-
+AGENT_RESPONSE:
+"It's 10.5."
+
+EXPECTED OUTPUT:
 {
-
-
-"agent_perceived_intent": "provide a basic chocolate cake recipe",
-"actual_user_intent": "bake a chocolate cake",
-"correct_intent_detected": true,
-"intent_resolved": false,
-"resolution_score": 4,
+"explanation": "User wanted an exact calculation of 15% of 80. Agent provided an incorrect result (10.5 instead of 12), leading to a poor resolution of the user's intent.",
+"score": 2
 }


-
-
+### Score 1 - Example A (single-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+"Mute notifications for the next hour."

-
-
-**Response:** Preheat your oven to 350°F (175°C) and grease a 9-inch round cake pan. In a large bowl, sift together 1 ¾ cups all-purpose flour, 1 cup sugar, ¾ cup unsweetened cocoa powder, 1 ½ tsp baking powder, and 1 tsp salt. In another bowl, beat 2 large eggs with 1 cup milk, ½ cup vegetable oil, and 2 tsp vanilla extract. Combine the wet ingredients with the dry ingredients, then gradually mix in 1 cup boiling water until smooth. Pour the batter into the prepared pan and bake for 30-35 minutes or until a toothpick inserted into the center comes out clean. Allow the cake to cool before serving.
-**Tool Definitions:** []
+AGENT_RESPONSE:
+"Here's a cake recipe! 🍰"

-
+EXPECTED OUTPUT:
 {
-
-
-"agent_perceived_intent": "provide a comprehensive chocolate cake recipe",
-"actual_user_intent": "bake a chocolate cake",
-"correct_intent_detected": true,
-"intent_resolved": true,
-"resolution_score": 5,
+"explanation": "User wanted to mute notifications for an hour. Agent responded with a completely irrelevant cake recipe, making no attempt to address the user's intent, resulting in a very poor resolution.",
+"score": 1
 }


-
+### Score 1 - Example B (multi-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+"Set an alarm for 6 am tomorrow."
+
+Agent turn 1:
+"Why not sleep in? Alarms are overrated."

-
-
-- explanation: a string that explains why you think the input Data should get this resolution_score.
-- conversation_has_intent: true or false
-- agent_perceived_intent: a string that describes the intent the agent perceived from the user query
-- actual_user_intent: a string that describes the actual user intent
-- correct_intent_detected: true or false
-- intent_resolved: true or false
-- resolution_score: an integer between 1 and 5 that represents the resolution score
+User turn 2:
+"I really need the alarm—please set it."

+AGENT_RESPONSE:
+"Alarms are pointless; wake up whenever."
+
+EXPECTED OUTPUT:
+{
+"explanation": "User wanted to set an alarm for 6 am. Agent was dismissive and refused to perform the requested action, completely failing to resolve the user's intent, leading to a very poor resolution.",
+"score": 1
+}

-# Output
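The rewritten prompt replaces the old multi-field intent schema with a flat `{"explanation", "score"}` object. For orientation, here is a minimal sketch of driving the corresponding evaluator; it assumes the `IntentResolutionEvaluator` public class keeps its 1.x constructor (`model_config`) and `query`/`response` call signature, and the printed result keys are illustrative, not the package's documented sample.

```python
# Minimal sketch, assuming IntentResolutionEvaluator's model_config constructor
# and query/response call signature; endpoint values are placeholders.
from azure.ai.evaluation import IntentResolutionEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

intent_resolution = IntentResolutionEvaluator(model_config=model_config)
result = intent_resolution(
    query="What are the opening hours of the Eiffel Tower?",
    response="Opening hours of the Eiffel Tower are 9:00 AM to 11:00 PM.",
)
# The returned dict mirrors the prompt's output: a 1-5 score plus the judge's
# explanation (key names are illustrative).
print(result)
```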
@@ -46,13 +46,13 @@ class MeteorScoreEvaluator(EvaluatorBase):
 :caption: Initialize and call a MeteorScoreEvaluator with alpha of 0.8.

 .. admonition:: Example using Azure AI Project URL:
-
+
 .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
 :start-after: [START meteor_score_evaluator]
 :end-before: [END meteor_score_evaluator]
 :language: python
 :dedent: 8
-:caption: Initialize and call MeteorScoreEvaluator using Azure AI Project URL in the following format
+:caption: Initialize and call MeteorScoreEvaluator using Azure AI Project URL in the following format
 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

 .. admonition:: Example with Threshold:

@@ -77,7 +77,6 @@ class MeteorScoreEvaluator(EvaluatorBase):
 self._threshold = threshold
 self._higher_is_better = True
 super().__init__(threshold=threshold, _higher_is_better=self._higher_is_better)
-

 @override
 async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
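The METEOR change is whitespace-only, but the caption above ("alpha of 0.8") and the threshold admonition correspond to usage along these lines; a minimal sketch assuming the `ground_truth`/`response` call signature and the `threshold` keyword forwarded in the constructor shown above.

```python
# Minimal sketch assuming MeteorScoreEvaluator's ground_truth/response call signature.
from azure.ai.evaluation import MeteorScoreEvaluator

meteor = MeteorScoreEvaluator(alpha=0.8, threshold=0.3)  # threshold per the "Example with Threshold" admonition
score = meteor(
    ground_truth="The capital of Japan is Tokyo.",
    response="Tokyo is Japan's capital.",
)
# Returns the METEOR score plus a pass/fail verdict against the threshold (key names illustrative).
print(score)
```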
@@ -25,9 +25,9 @@ class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):

 :param credential: The credential required for connecting to the Azure AI project.
 :type credential: ~azure.core.credentials.TokenCredential
-:param azure_ai_project: The
-resource group, and project name.
-:type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+:param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+:type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]

 .. admonition:: Example:

@@ -37,15 +37,15 @@ class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
 :language: python
 :dedent: 8
 :caption: Initialize and call a ProtectedMaterialEvaluator.
-
+
 .. admonition:: Example using Azure AI Project URL:
-
+
 .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
 :start-after: [START protected_material_evaluator]
 :end-before: [END protected_material_evaluator]
 :language: python
 :dedent: 8
-:caption: Initialize and call ProtectedMaterialEvaluator using Azure AI Project URL in the following format
+:caption: Initialize and call ProtectedMaterialEvaluator using Azure AI Project URL in the following format
 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

 """
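The docstring change above is the user-visible part of the 1.9.0 behavior: `azure_ai_project` can now be the project endpoint URL string rather than an `AzureAIProject` dict. A minimal sketch of that form, assuming `DefaultAzureCredential` and the evaluator's `query`/`response` call; the endpoint placeholders are not real values.

```python
# Minimal sketch: azure_ai_project passed as a project endpoint string, as described
# in the updated :param azure_ai_project: docstring above.
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ProtectedMaterialEvaluator

azure_ai_project = "https://<resource_name>.services.ai.azure.com/api/projects/<project_name>"

protected_material = ProtectedMaterialEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project=azure_ai_project,
)
result = protected_material(
    query="Write out the full lyrics of a well-known pop song.",
    response="I can't reproduce copyrighted lyrics, but here is a short summary of the song.",
)
print(result)  # label plus reason, as with the other RAI service evaluators
```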
@@ -49,17 +49,17 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
 :caption: Initialize and call a QAEvaluator.

 .. admonition:: Example using Azure AI Project URL:
-
+
 .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
 :start-after: [START qa_evaluator]
 :end-before: [END qa_evaluator]
 :language: python
 :dedent: 8
-:caption: Initialize and call QAEvaluator using Azure AI Project URL in the following format
+:caption: Initialize and call QAEvaluator using Azure AI Project URL in the following format
 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

 .. admonition:: Example with Threshold:
-
+
 .. literalinclude:: ../samples/evaluation_samples_threshold.py
 :start-after: [START threshold_qa_evaluator]
 :end-before: [END threshold_qa_evaluator]

@@ -87,7 +87,7 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
 fluency_threshold: int = 3,
 similarity_threshold: int = 3,
 f1_score_threshold: float = 0.5,
-**kwargs
+**kwargs,
 ):
 # Type checking
 for name, value in [
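The `**kwargs,` change is purely cosmetic, but the surrounding signature shows the per-metric thresholds that QAEvaluator accepts. A hedged sketch using only the threshold parameters visible in this hunk; the model config and the `query`/`response`/`context`/`ground_truth` keywords follow the package's existing docstrings and are assumptions here.

```python
# Minimal sketch using the threshold parameters visible in the hunk above.
from azure.ai.evaluation import QAEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

qa = QAEvaluator(
    model_config=model_config,
    fluency_threshold=3,
    similarity_threshold=3,
    f1_score_threshold=0.5,
)
result = qa(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
    context="France's capital city is Paris.",
    ground_truth="Paris",
)
print(result)
```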
@@ -40,13 +40,13 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
 :caption: Initialize and call a RelevanceEvaluator with a query, response, and context.

 .. admonition:: Example using Azure AI Project URL:
-
+
 .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
 :start-after: [START relevance_evaluator]
 :end-before: [END relevance_evaluator]
 :language: python
 :dedent: 8
-:caption: Initialize and call RelevanceEvaluator using Azure AI Project URL in the following format
+:caption: Initialize and call RelevanceEvaluator using Azure AI Project URL in the following format
 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

 .. admonition:: Example with Threshold:

@@ -73,22 +73,17 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
 """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

 @override
-def __init__(
-self,
-model_config,
-*,
-threshold=3
-):
+def __init__(self, model_config, *, threshold=3):
 current_dir = os.path.dirname(__file__)
 prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
 self._threshold = threshold
 self._higher_is_better = True
 super().__init__(
-model_config=model_config,
-prompty_file=prompty_path,
-result_key=self._RESULT_KEY,
-threshold=threshold,
-_higher_is_better=self._higher_is_better
+model_config=model_config,
+prompty_file=prompty_path,
+result_key=self._RESULT_KEY,
+threshold=threshold,
+_higher_is_better=self._higher_is_better,
 )

 @overload
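The RelevanceEvaluator hunks only reflow `__init__`; behavior is unchanged. For orientation, a minimal usage sketch matching the caption above ("a query, response, and context") and the `threshold=3` default; the model config values are placeholders.

```python
# Minimal sketch; call keywords per the caption above (query, response, context).
from azure.ai.evaluation import RelevanceEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

relevance = RelevanceEvaluator(model_config, threshold=3)
result = relevance(
    query="What is the capital of Japan?",
    response="The capital of Japan is Tokyo.",
    context="Tokyo is the capital and most populous city of Japan.",
)
print(result)
```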
@@ -37,24 +37,24 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
 :param model_config: Configuration for the Azure OpenAI model.
 :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
 ~azure.ai.evaluation.OpenAIModelConfiguration]
-
+
 .. admonition:: Example:
-
+
 .. literalinclude:: ../samples/evaluation_samples_evaluate.py
 :start-after: [START completeness_evaluator]
 :end-before: [END completeness_evaluator]
 :language: python
 :dedent: 8
 :caption: Initialize and call a CompletenessEvaluator with a response and groundtruth.
-
+
 .. admonition:: Example using Azure AI Project URL:
-
+
 .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
 :start-after: [START completeness_evaluator]
 :end-before: [END completeness_evaluator]
 :language: python
 :dedent: 8
-:caption: Initialize and call CompletenessEvaluator using Azure AI Project URL in the following format
+:caption: Initialize and call CompletenessEvaluator using Azure AI Project URL in the following format
 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

 """

@@ -73,23 +73,18 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
 """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

 @override
-def __init__(self, model_config, *,
-threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD,
-**kwargs):
+def __init__(self, model_config, *, threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD, **kwargs):
 current_dir = os.path.dirname(__file__)
 prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
 self.threshold = threshold
-super().__init__(model_config=model_config,
-prompty_file=prompty_path,
-result_key=self._RESULT_KEY,
-**kwargs)
+super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)

 @overload
 def __call__(
-
-
-
-
+self,
+*,
+ground_truth: str,
+response: str,
 ) -> Dict[str, Union[str, float]]:
 """Evaluate completeness in given response. Accepts ground truth and response for evaluation.
 Example usage:

@@ -111,9 +106,9 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):

 @overload
 def __call__(
-
-
-
+self,
+*,
+conversation: Conversation,
 ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
 """Evaluate completeness for a conversation
 :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the

@@ -126,9 +121,9 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):

 @override
 def __call__( # pylint: disable=docstring-missing-param
-
-
-
+self,
+*args,
+**kwargs,
 ):
 """
 Invokes the instance using the overloaded __call__ signature.

@@ -151,7 +146,7 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
 raise EvaluationException(
 message=f"Both ground_truth and response must be provided as input to the completeness evaluator.",
 internal_message=f"Both ground_truth and response must be provided as input to the completeness"
-
+f" evaluator.",
 blame=ErrorBlame.USER_ERROR,
 category=ErrorCategory.MISSING_FIELD,
 target=ErrorTarget.COMPLETENESS_EVALUATOR,

@@ -163,7 +158,7 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
 if llm_output:
 score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[1-5]")

-score_result =
+score_result = "pass" if score >= self.threshold else "fail"

 # updating the result key and threshold to int based on the schema
 return {

@@ -172,5 +167,5 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
 f"{self._result_key}_threshold": int(self.threshold),
 f"{self._result_key}_reason": reason,
 }
-
+
 return {self._result_key: math.nan}
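The overloads above spell out the two supported call shapes (keyword `ground_truth`/`response`, or a `conversation`), and the reconstructed line shows the pass/fail rule: `score_result = "pass" if score >= self.threshold else "fail"`. A minimal sketch of the keyword form, assuming the evaluator is importable from the package root as in 1.7; the result key names are illustrative.

```python
# Minimal sketch of the ground_truth/response overload shown above.
from azure.ai.evaluation import ResponseCompletenessEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

completeness = ResponseCompletenessEvaluator(model_config=model_config, threshold=3)
result = completeness(
    ground_truth="The order ships on Monday and arrives within 3 business days.",
    response="Your order ships on Monday.",
)
# result carries the score, a "pass"/"fail" verdict derived from the threshold,
# the integer threshold, and the judge's reason (see the return dict in the hunk above).
print(result)
```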
@@ -46,17 +46,17 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
 :caption: Initialize and call a RetrievalEvaluator.

 .. admonition:: Example using Azure AI Project URL:
-
+
 .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
 :start-after: [START retrieval_evaluator]
 :end-before: [END retrieval_evaluator]
 :language: python
 :dedent: 8
-:caption: Initialize and call RetrievalEvaluator using Azure AI Project URL in the following format
+:caption: Initialize and call RetrievalEvaluator using Azure AI Project URL in the following format
 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

 .. admonition:: Example with Threshold:
-
+
 .. literalinclude:: ../samples/evaluation_samples_threshold.py
 :start-after: [START threshold_retrieval_evaluator]
 :end-before: [END threshold_retrieval_evaluator]

@@ -78,7 +78,7 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
 """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

 @override
-def __init__(self, model_config, *, threshold: float=3): # pylint: disable=super-init-not-called
+def __init__(self, model_config, *, threshold: float = 3): # pylint: disable=super-init-not-called
 current_dir = os.path.dirname(__file__)
 prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
 self._threshold = threshold