@microsoft/m365-copilot-eval 1.5.0-preview.1 → 1.7.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -1
- package/package.json +4 -3
- package/schema/CHANGELOG.md +7 -0
- package/schema/v1/eval-document.schema.json +144 -333
- package/schema/v1/examples/invalid/error-result-with-score.json +16 -0
- package/schema/v1/examples/invalid/missing-error-on-error.json +13 -0
- package/schema/v1/examples/valid/multi-turn-output.json +2 -0
- package/schema/v1/examples/valid/scenarios-with-mixed-errors.json +239 -0
- package/schema/version.json +1 -1
- package/src/clients/cli/api_clients/A2A/a2a_client.py +57 -10
- package/src/clients/cli/auth/auth_handler.py +21 -1
- package/src/clients/cli/common.py +8 -14
- package/src/clients/cli/error_messages.py +91 -0
- package/src/clients/cli/evaluation_runner.py +108 -97
- package/src/clients/cli/evaluator_resolver.py +8 -33
- package/src/clients/cli/generate_report.py +125 -96
- package/src/clients/cli/main.py +2 -1
- package/src/clients/cli/readme.md +1 -1
- package/src/clients/cli/result_writer.py +129 -110
- package/src/clients/cli/status_derivation.py +91 -0
- package/src/clients/node-js/bin/runevals.js +31 -9
- package/src/clients/node-js/config/default.js +1 -1
- package/src/clients/node-js/lib/env-loader.js +20 -13
- package/src/clients/node-js/lib/python-runtime.js +137 -65
- package/src/clients/node-js/lib/venv-manager.js +3 -2
- package/src/clients/node-js/lib/version-check.js +268 -0
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://raw.githubusercontent.com/microsoft/M365-Copilot-Agent-Evals/refs/heads/main/schema/v1/eval-document.schema.json",
|
|
3
|
+
"schemaVersion": "1.4.0",
|
|
4
|
+
"metadata": {
|
|
5
|
+
"name": "All scenarios — comprehensive comparison fixture",
|
|
6
|
+
"description": "Single fixture exercising every output shape under v1.4.0. Items are structured to isolate one variable at a time so a reader can see exactly which combination drives which top-level `status` and whether the top-level `error` field is populated. Each item carries an `extensions.scenario` label for jq/grep.",
|
|
7
|
+
"evaluatedAt": "2026-05-01T11:00:00Z",
|
|
8
|
+
"agentName": "Test Agent",
|
|
9
|
+
"cliVersion": "1.4.0"
|
|
10
|
+
},
|
|
11
|
+
"items": [
|
|
12
|
+
|
|
13
|
+
{
|
|
14
|
+
"extensions": {
|
|
15
|
+
"scenario": "1-single-turn-uniform-pass",
|
|
16
|
+
"notes": "Baseline. All evaluators uniformly pass. status=pass. No top-level error field.",
|
|
17
|
+
"evaluators_in_scores": ["pass", "pass"],
|
|
18
|
+
"expected_status": "pass",
|
|
19
|
+
"expected_error_field_populated": false
|
|
20
|
+
},
|
|
21
|
+
"prompt": "What is Microsoft Graph API?",
|
|
22
|
+
"expected_response": "Microsoft Graph is a unified endpoint for M365 data.",
|
|
23
|
+
"response": "Microsoft Graph API is a gateway to data and intelligence in Microsoft 365.",
|
|
24
|
+
"scores": {
|
|
25
|
+
"relevance": { "score": 5.0, "result": "pass", "threshold": 3 },
|
|
26
|
+
"coherence": { "score": 5.0, "result": "pass", "threshold": 3 }
|
|
27
|
+
},
|
|
28
|
+
"status": "pass"
|
|
29
|
+
},
|
|
30
|
+
|
|
31
|
+
{
|
|
32
|
+
"extensions": {
|
|
33
|
+
"scenario": "2-single-turn-uniform-fail",
|
|
34
|
+
"notes": "All evaluators uniformly fail. status=fail. No top-level error field. Under v1.4.0 this is the ONLY shape producing status=fail — uniform-fail is strict.",
|
|
35
|
+
"evaluators_in_scores": ["fail", "fail"],
|
|
36
|
+
"expected_status": "fail",
|
|
37
|
+
"expected_error_field_populated": false
|
|
38
|
+
},
|
|
39
|
+
"prompt": "What is the boiling point of water in Fahrenheit at sea level?",
|
|
40
|
+
"expected_response": "212°F.",
|
|
41
|
+
"response": "Water boils at 150°F at sea level.",
|
|
42
|
+
"scores": {
|
|
43
|
+
"relevance": { "score": 2.0, "result": "fail", "threshold": 3, "reason": "Response is factually incorrect." },
|
|
44
|
+
"coherence": { "score": 2.0, "result": "fail", "threshold": 3, "reason": "Response contains an internal contradiction." }
|
|
45
|
+
},
|
|
46
|
+
"status": "fail"
|
|
47
|
+
},
|
|
48
|
+
|
|
49
|
+
{
|
|
50
|
+
"extensions": {
|
|
51
|
+
"scenario": "3-single-turn-fail-pass-fail-mix",
|
|
52
|
+
"notes": "All evaluators ran successfully; one passed, one returned a fail verdict. status=fail (covers uniform-fail and pass+fail mixes). Top-level error field is ABSENT — no errored evaluator to summarize.",
|
|
53
|
+
"evaluators_in_scores": ["pass", "fail"],
|
|
54
|
+
"expected_status": "fail",
|
|
55
|
+
"expected_error_field_populated": false
|
|
56
|
+
},
|
|
57
|
+
"prompt": "Explain the difference between SharePoint and OneDrive.",
|
|
58
|
+
"expected_response": "Covers shared vs personal storage, permissions model.",
|
|
59
|
+
"response": "SharePoint is for team collaboration; OneDrive is for personal files. Both use the same permissions engine.",
|
|
60
|
+
"scores": {
|
|
61
|
+
"relevance": { "score": 4.0, "result": "pass", "threshold": 3, "reason": "Response addresses the key distinction." },
|
|
62
|
+
"coherence": { "score": 2.0, "result": "fail", "threshold": 3, "reason": "The claim about a 'same permissions engine' is inaccurate; the response is internally inconsistent with product reality." }
|
|
63
|
+
},
|
|
64
|
+
"status": "fail"
|
|
65
|
+
},
|
|
66
|
+
|
|
67
|
+
{
|
|
68
|
+
"extensions": {
|
|
69
|
+
"scenario": "4-single-turn-partial-fail-plus-evaluator-crash",
|
|
70
|
+
"notes": "One fail + one evaluator crash (judge raised an exception). status=partial — error takes priority over pass/fail when ≥1 evaluator errored. Top-level error field IS populated with the evaluatorsFailed summary. Per-evaluator error uses the 'Evaluator failed:' prefix and appends exception.message text.",
|
|
71
|
+
"evaluators_in_scores": ["fail", "error"],
|
|
72
|
+
"expected_status": "partial",
|
|
73
|
+
"expected_error_field_populated": true
|
|
74
|
+
},
|
|
75
|
+
"prompt": "How do I configure conditional access for guest accounts?",
|
|
76
|
+
"expected_response": "Covers Azure AD guest CA policies.",
|
|
77
|
+
"response": "Use the on-prem AD Users and Computers tool to block external logins.",
|
|
78
|
+
"scores": {
|
|
79
|
+
"relevance": { "score": 1.0, "result": "fail", "threshold": 3, "reason": "Response references on-prem AD instead of Azure AD." },
|
|
80
|
+
"coherence": { "result": "error", "error": "Evaluator failed: Connection timeout to Azure OpenAI endpoint after 30s" }
|
|
81
|
+
},
|
|
82
|
+
"status": "partial",
|
|
83
|
+
"error": {
|
|
84
|
+
"code": "evaluatorsFailed",
|
|
85
|
+
"message": "Agent response obtained. 1 of 2 evaluators failed to run."
|
|
86
|
+
}
|
|
87
|
+
},
|
|
88
|
+
|
|
89
|
+
{
|
|
90
|
+
"extensions": {
|
|
91
|
+
"scenario": "5-single-turn-partial-all-evaluators-errored",
|
|
92
|
+
"notes": "Response obtained, but every attempted evaluator errored (zero verdicts rendered). status=partial (not error) — the agent DID respond; we just couldn't fully evaluate it. Same unified message template as scenario 4, now reading '2 of 2' to indicate all evaluators errored. The 'Agent response obtained.' prefix distinguishes this case from scenario 6 (no response, status=error).",
|
|
93
|
+
"evaluators_in_scores": ["error", "error"],
|
|
94
|
+
"expected_status": "partial",
|
|
95
|
+
"expected_error_field_populated": true
|
|
96
|
+
},
|
|
97
|
+
"prompt": "List the top 5 security best practices for M365 tenants.",
|
|
98
|
+
"expected_response": "Covers MFA, conditional access, audit logging, DLP, least privilege.",
|
|
99
|
+
"response": "Here are five M365 security best practices: 1) MFA, 2) conditional access, 3) DLP, 4) audit logs, 5) least-privilege roles.",
|
|
100
|
+
"scores": {
|
|
101
|
+
"relevance": { "result": "error", "error": "Evaluator failed: Service rate limit exceeded after 3 retries" },
|
|
102
|
+
"coherence": { "result": "error", "error": "Evaluator failed: Connection timeout to Azure OpenAI endpoint after 30s" }
|
|
103
|
+
},
|
|
104
|
+
"status": "partial",
|
|
105
|
+
"error": {
|
|
106
|
+
"code": "evaluatorsFailed",
|
|
107
|
+
"message": "Agent response obtained. 2 of 2 evaluators failed to run."
|
|
108
|
+
}
|
|
109
|
+
},
|
|
110
|
+
|
|
111
|
+
{
|
|
112
|
+
"extensions": {
|
|
113
|
+
"scenario": "6-single-turn-error-no-response",
|
|
114
|
+
"notes": "Agent did not respond after retries. No evaluators attempted. status=error. Top-level error field IS populated with the request-failure cause template. This is the ONLY single-turn case producing status=error under v1.4.0.",
|
|
115
|
+
"evaluators_in_scores": [],
|
|
116
|
+
"expected_status": "error",
|
|
117
|
+
"expected_error_field_populated": true
|
|
118
|
+
},
|
|
119
|
+
"prompt": "What is Microsoft Graph API?",
|
|
120
|
+
"expected_response": "Microsoft Graph is a unified endpoint for M365 data.",
|
|
121
|
+
"response": "",
|
|
122
|
+
"scores": {},
|
|
123
|
+
"status": "error",
|
|
124
|
+
"error": {
|
|
125
|
+
"code": "agentRequestFailed",
|
|
126
|
+
"message": "Agent request failed: HTTP 503 Service Unavailable"
|
|
127
|
+
}
|
|
128
|
+
},
|
|
129
|
+
|
|
130
|
+
{
|
|
131
|
+
"extensions": {
|
|
132
|
+
"scenario": "7-multi-turn-partial-mixed-turn-outcomes",
|
|
133
|
+
"notes": "3-turn thread with no errored turns. Turn 1 uniformly passed; Turn 2 had a pass+error evaluator mix (partial); Turn 3 had a uniform-fail (status=fail). Per-turn statuses: [pass, partial, fail] — any partial turn drives thread to partial under FR-004's priority rules. Summary invariant: 1+1+1+0=3.",
|
|
134
|
+
"per_turn_statuses": ["pass", "partial", "fail"],
|
|
135
|
+
"expected_overall_status": "partial"
|
|
136
|
+
},
|
|
137
|
+
"name": "Seattle trip planning — mixed turn outcomes",
|
|
138
|
+
"conversation_id": "conv-abc-007",
|
|
139
|
+
"turns": [
|
|
140
|
+
{
|
|
141
|
+
"prompt": "I'm based in Seattle.",
|
|
142
|
+
"expected_response": "I can help with Seattle-related queries.",
|
|
143
|
+
"response": "Understood — I can help with Seattle questions.",
|
|
144
|
+
"scores": {
|
|
145
|
+
"relevance": { "score": 5.0, "result": "pass", "threshold": 3 },
|
|
146
|
+
"coherence": { "score": 5.0, "result": "pass", "threshold": 3 }
|
|
147
|
+
},
|
|
148
|
+
"status": "pass"
|
|
149
|
+
},
|
|
150
|
+
{
|
|
151
|
+
"prompt": "What's the weather like here?",
|
|
152
|
+
"expected_response": "Seattle has mild, rainy weather.",
|
|
153
|
+
"response": "Seattle tends to be rainy most of the year, especially in winter.",
|
|
154
|
+
"scores": {
|
|
155
|
+
"relevance": { "score": 5.0, "result": "pass", "threshold": 3, "reason": "Response addresses Seattle weather accurately." },
|
|
156
|
+
"coherence": { "result": "error", "error": "Evaluator failed: Connection timeout to Azure OpenAI endpoint after 30s" }
|
|
157
|
+
},
|
|
158
|
+
"status": "partial",
|
|
159
|
+
"error": {
|
|
160
|
+
"code": "evaluatorsFailed",
|
|
161
|
+
"message": "Agent response obtained. 1 of 2 evaluators failed to run."
|
|
162
|
+
}
|
|
163
|
+
},
|
|
164
|
+
{
|
|
165
|
+
"prompt": "What's the average temperature in Seattle in March?",
|
|
166
|
+
"expected_response": "Around 50°F (10°C).",
|
|
167
|
+
"response": "Seattle averages 80°F in March.",
|
|
168
|
+
"scores": {
|
|
169
|
+
"relevance": { "score": 1.0, "result": "fail", "threshold": 3, "reason": "Response is factually incorrect — Seattle's March averages are far below 80°F." },
|
|
170
|
+
"coherence": { "score": 2.0, "result": "fail", "threshold": 3, "reason": "Response contradicts well-known regional climate data." }
|
|
171
|
+
},
|
|
172
|
+
"status": "fail"
|
|
173
|
+
}
|
|
174
|
+
],
|
|
175
|
+
"summary": {
|
|
176
|
+
"turns_total": 3,
|
|
177
|
+
"turns_passed": 1,
|
|
178
|
+
"turns_failed": 1,
|
|
179
|
+
"turns_partial": 1,
|
|
180
|
+
"turns_errored": 0,
|
|
181
|
+
"overall_status": "partial"
|
|
182
|
+
}
|
|
183
|
+
},
|
|
184
|
+
|
|
185
|
+
{
|
|
186
|
+
"extensions": {
|
|
187
|
+
"scenario": "8-multi-turn-error-any-errored-turn",
|
|
188
|
+
"notes": "3-turn thread with a mid-conversation request failure. Turn 1 uniformly passed; Turn 2's request failed; Turn 3 was downstream-skipped. Per-turn statuses: [pass, error, error] — under FR-004's priority rules, any errored turn drives the thread to error (the run didn't complete). The two error turns carry distinct error codes (agentRequestFailed vs turnSkipped) demonstrating the cascade. Summary invariant: 1+0+0+2=3.",
|
|
189
|
+
"per_turn_statuses": ["pass", "error", "error"],
|
|
190
|
+
"expected_overall_status": "error"
|
|
191
|
+
},
|
|
192
|
+
"name": "Conversation that aborted mid-thread",
|
|
193
|
+
"conversation_id": "conv-abc-008",
|
|
194
|
+
"turns": [
|
|
195
|
+
{
|
|
196
|
+
"prompt": "I'd like to plan a trip from SFO.",
|
|
197
|
+
"expected_response": "I can help with travel planning from SFO.",
|
|
198
|
+
"response": "Sure — I can help plan a trip from SFO.",
|
|
199
|
+
"scores": {
|
|
200
|
+
"relevance": { "score": 5.0, "result": "pass", "threshold": 3 },
|
|
201
|
+
"coherence": { "score": 5.0, "result": "pass", "threshold": 3 }
|
|
202
|
+
},
|
|
203
|
+
"status": "pass"
|
|
204
|
+
},
|
|
205
|
+
{
|
|
206
|
+
"prompt": "Book me a flight from SFO to SEA next Tuesday.",
|
|
207
|
+
"expected_response": "I can help with flight queries.",
|
|
208
|
+
"response": "",
|
|
209
|
+
"scores": {},
|
|
210
|
+
"status": "error",
|
|
211
|
+
"error": {
|
|
212
|
+
"code": "agentRequestFailed",
|
|
213
|
+
"message": "Agent request failed: DNS resolution failed for agent endpoint"
|
|
214
|
+
}
|
|
215
|
+
},
|
|
216
|
+
{
|
|
217
|
+
"prompt": "Prefer morning departure, aisle seat.",
|
|
218
|
+
"expected_response": "Noted.",
|
|
219
|
+
"response": "",
|
|
220
|
+
"scores": {},
|
|
221
|
+
"status": "error",
|
|
222
|
+
"error": {
|
|
223
|
+
"code": "turnSkipped",
|
|
224
|
+
"message": "Turn not attempted: preceding turn failed"
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
],
|
|
228
|
+
"summary": {
|
|
229
|
+
"turns_total": 3,
|
|
230
|
+
"turns_passed": 1,
|
|
231
|
+
"turns_failed": 0,
|
|
232
|
+
"turns_partial": 0,
|
|
233
|
+
"turns_errored": 2,
|
|
234
|
+
"overall_status": "error"
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
]
|
|
239
|
+
}
|
package/schema/version.json
CHANGED
|
@@ -8,7 +8,7 @@ import re
|
|
|
8
8
|
import urllib.error
|
|
9
9
|
import urllib.request
|
|
10
10
|
import uuid
|
|
11
|
-
from typing import Any, Dict, List, Optional, Tuple
|
|
11
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple
|
|
12
12
|
|
|
13
13
|
from api_clients.base_agent_client import BaseAgentClient
|
|
14
14
|
from cli_logging.console_diagnostics import emit_structured_log
|
|
@@ -35,6 +35,7 @@ class A2AClient(BaseAgentClient):
|
|
|
35
35
|
access_token: str,
|
|
36
36
|
logger: Optional[logging.Logger] = None,
|
|
37
37
|
diagnostic_records: Optional[List[Dict[str, Any]]] = None,
|
|
38
|
+
token_refresh_fn: Optional[Callable[[], str]] = None,
|
|
38
39
|
) -> None:
|
|
39
40
|
"""
|
|
40
41
|
Args:
|
|
@@ -42,11 +43,15 @@ class A2AClient(BaseAgentClient):
|
|
|
42
43
|
access_token: Bearer token for A2A authentication.
|
|
43
44
|
logger: Logger to use. Defaults to a module-level logger if not provided.
|
|
44
45
|
diagnostic_records: List to accumulate structured log entries.
|
|
46
|
+
token_refresh_fn: Optional callable that returns a fresh access token string.
|
|
47
|
+
When provided, a single HTTP 401 response will trigger a token refresh
|
|
48
|
+
and one automatic retry, making the refresh invisible to the caller.
|
|
45
49
|
"""
|
|
46
50
|
self._endpoint = a2a_endpoint.rstrip("/")
|
|
47
51
|
self._access_token = access_token
|
|
48
52
|
self._logger = logger or logging.getLogger(__name__)
|
|
49
53
|
self._diagnostic_records = diagnostic_records
|
|
54
|
+
self._token_refresh_fn = token_refresh_fn
|
|
50
55
|
self._resolved_agent_url: Optional[str] = None
|
|
51
56
|
|
|
52
57
|
# ------------------------------------------------------------------ #
|
|
@@ -261,6 +266,12 @@ class A2AClient(BaseAgentClient):
|
|
|
261
266
|
) -> tuple[Dict[str, Any], Dict[str, Any]]:
|
|
262
267
|
"""Send a JSON-RPC message to the agent and parse the response.
|
|
263
268
|
|
|
269
|
+
When a ``token_refresh_fn`` was supplied at construction time and the
|
|
270
|
+
server responds with HTTP 401 (Unauthorized), the token is refreshed
|
|
271
|
+
automatically and the request is retried exactly once. This keeps
|
|
272
|
+
long-running eval sessions alive beyond the initial token lifetime
|
|
273
|
+
without requiring any user interaction.
|
|
274
|
+
|
|
264
275
|
Returns:
|
|
265
276
|
A tuple of (result_dict, raw_result) where result_dict is the
|
|
266
277
|
normalized response dict (raw_response_text, display_response_text,
|
|
@@ -275,15 +286,51 @@ class A2AClient(BaseAgentClient):
|
|
|
275
286
|
with urllib.request.urlopen(req, timeout=_REQUEST_TIMEOUT_SECS) as resp:
|
|
276
287
|
raw = resp.read().decode("utf-8", errors="replace")
|
|
277
288
|
except urllib.error.HTTPError as e:
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
289
|
+
if e.code == 401 and self._token_refresh_fn is not None:
|
|
290
|
+
emit_structured_log(
|
|
291
|
+
"info",
|
|
292
|
+
"[A2A] Access token expired (HTTP 401); refreshing token and retrying.",
|
|
293
|
+
Operation.AUTHENTICATE,
|
|
294
|
+
logger=self._logger,
|
|
295
|
+
diagnostic_records=self._diagnostic_records,
|
|
296
|
+
)
|
|
297
|
+
new_token = self._token_refresh_fn()
|
|
298
|
+
if not new_token:
|
|
299
|
+
raise RuntimeError(
|
|
300
|
+
"A2A request failed (HTTP 401 Unauthorized) and token refresh returned no token."
|
|
301
|
+
) from e
|
|
302
|
+
self._access_token = new_token
|
|
303
|
+
headers["Authorization"] = f"Bearer {self._access_token}"
|
|
304
|
+
retry_req = urllib.request.Request(
|
|
305
|
+
agent_url, data=payload, headers=headers, method="POST"
|
|
306
|
+
)
|
|
307
|
+
try:
|
|
308
|
+
with urllib.request.urlopen(retry_req, timeout=_REQUEST_TIMEOUT_SECS) as resp:
|
|
309
|
+
raw = resp.read().decode("utf-8", errors="replace")
|
|
310
|
+
except urllib.error.HTTPError as retry_e:
|
|
311
|
+
body = ""
|
|
312
|
+
try:
|
|
313
|
+
body = retry_e.read().decode("utf-8", errors="replace")
|
|
314
|
+
except Exception:
|
|
315
|
+
pass
|
|
316
|
+
raise RuntimeError(
|
|
317
|
+
f"A2A request failed (HTTP {retry_e.code} {retry_e.reason}) after token refresh."
|
|
318
|
+
+ (f" Body: {body[:500]}" if body else "")
|
|
319
|
+
) from retry_e
|
|
320
|
+
except urllib.error.URLError as retry_e:
|
|
321
|
+
raise RuntimeError(
|
|
322
|
+
f"A2A connection error after token refresh: {getattr(retry_e, 'reason', str(retry_e))}"
|
|
323
|
+
) from retry_e
|
|
324
|
+
else:
|
|
325
|
+
body = ""
|
|
326
|
+
try:
|
|
327
|
+
body = e.read().decode("utf-8", errors="replace")
|
|
328
|
+
except Exception:
|
|
329
|
+
pass
|
|
330
|
+
raise RuntimeError(
|
|
331
|
+
f"A2A request failed (HTTP {e.code} {e.reason})."
|
|
332
|
+
+ (f" Body: {body[:500]}" if body else "")
|
|
333
|
+
) from e
|
|
287
334
|
except urllib.error.URLError as e:
|
|
288
335
|
raise RuntimeError(
|
|
289
336
|
f"A2A connection error: {getattr(e, 'reason', str(e))}"
|
|
@@ -13,7 +13,7 @@ https://github.com/AzureAD/microsoft-authentication-extensions-for-python
|
|
|
13
13
|
import os
|
|
14
14
|
import platform
|
|
15
15
|
import logging
|
|
16
|
-
from typing import Optional
|
|
16
|
+
from typing import Callable, Optional
|
|
17
17
|
from pathlib import Path
|
|
18
18
|
import jwt
|
|
19
19
|
from msal import PublicClientApplication
|
|
@@ -260,3 +260,23 @@ class AuthHandler:
|
|
|
260
260
|
return oid
|
|
261
261
|
except jwt.DecodeError as e:
|
|
262
262
|
raise ValueError(f"Failed to decode token: {e}")
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def make_token_refresh_fn(auth_handler: "AuthHandler") -> Callable[[], str]:
    """Return a zero-argument callable that fetches a fresh A2A access token.

    The returned callable is intended to be invoked by the caller after an
    HTTP 401 response. Each invocation delegates to
    ``auth_handler.acquire_token_interactive()`` and extracts the
    ``"access_token"`` entry from the result.

    NOTE(review): this wrapper makes no silent (refresh-token) attempt of its
    own. Whether a silent refresh happens before any interactive prompt
    depends entirely on ``AuthHandler.acquire_token_interactive`` — confirm
    against that method.

    Args:
        auth_handler: An initialized AuthHandler instance to use for token acquisition.

    Returns:
        A zero-argument callable that returns the new access token string.
        It returns an empty string to signal failure: when the handler
        returns nothing, returns something that is not a mapping, or returns
        a mapping without a usable ``"access_token"`` value. Exceptions
        raised by the handler are not caught here and propagate to the caller.
    """
    # Local import keeps this block self-contained; no file-level import change needed.
    from collections.abc import Mapping

    def _refresh() -> str:
        result = auth_handler.acquire_token_interactive()
        # Anything that is not a mapping (None, a bare string, ...) cannot
        # carry a token; report failure via "" rather than AttributeError.
        if not isinstance(result, Mapping):
            return ""
        return result.get("access_token") or ""

    return _refresh
|
|
@@ -39,19 +39,14 @@ CITATIONS = "Citations"
|
|
|
39
39
|
EXACT_MATCH = "ExactMatch"
|
|
40
40
|
PARTIAL_MATCH = "PartialMatch"
|
|
41
41
|
|
|
42
|
-
#
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
# Error state (evaluation couldn't complete):
|
|
51
|
-
STATUS_ERROR = "error" # API call failed / response couldn't be obtained
|
|
52
|
-
# Thread-level aggregate status (multi-turn only):
|
|
53
|
-
STATUS_PARTIAL = "partial" # Some turns passed, some did not
|
|
54
|
-
# Fallback for missing status:
|
|
42
|
+
# Evaluation status constants — four-value enum used at the turn/item level
|
|
43
|
+
# AND the thread-level overall_status. See status_derivation.py for the
|
|
44
|
+
# canonical derivation and rollup rules.
|
|
45
|
+
STATUS_PASS = "pass"
|
|
46
|
+
STATUS_FAIL = "fail"
|
|
47
|
+
STATUS_PARTIAL = "partial"
|
|
48
|
+
STATUS_ERROR = "error"
|
|
49
|
+
# Internal-only sentinel — never appears in emitted output.
|
|
55
50
|
STATUS_UNKNOWN = "unknown"
|
|
56
51
|
|
|
57
52
|
# System defaults when no file-level or env-level defaults are configured
|
|
@@ -77,7 +72,6 @@ METRIC_IDS = {
|
|
|
77
72
|
@dataclass
|
|
78
73
|
class RegistryEntry:
|
|
79
74
|
type: str # "llm", "tool", or "non-llm"
|
|
80
|
-
requires: List[str]
|
|
81
75
|
default_threshold: Optional[float]
|
|
82
76
|
|
|
83
77
|
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""Canonical error-message templates for persisted evaluation output.
|
|
2
|
+
|
|
3
|
+
Every error string written into a JSON/CSV/HTML output file MUST be produced
|
|
4
|
+
by a builder in this module. The builders accept only string arguments — never
|
|
5
|
+
exception objects — which keeps ``repr(exc)``, ``traceback.format_exc()``, and
|
|
6
|
+
SDK class names out of persisted output by construction.
|
|
7
|
+
|
|
8
|
+
Two flavours:
|
|
9
|
+
|
|
10
|
+
* **Turn/item-level `ErrorObject` builders** return a structured ``{code, message}``
|
|
11
|
+
dict for the top-level ``error`` field on a turn or single-turn item. Used
|
|
12
|
+
when ``status == "error"`` (cause) or ``status == "partial"`` with at least
|
|
13
|
+
one errored evaluator (summary).
|
|
14
|
+
|
|
15
|
+
* **Per-evaluator string builders** return a flat string formatted as
|
|
16
|
+
``"<category prefix>: <detail>"`` for the ``error`` field inside an
|
|
17
|
+
``ErroredScore`` entry. Evaluator identity is encoded by the ``scores`` map's
|
|
18
|
+
parent property key, so no ``code`` is needed at this level.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from typing import TypedDict
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ErrorObject(TypedDict):
    """Top-level error payload for a turn or single-turn item.

    Shape is ``{code, message}``, matching the ``ErrorObject`` $def.
    """

    # Machine-readable error code.
    code: str
    # Human-readable error description.
    message: str
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# ── Error code constants ──────────────────────────────────────────────
|
|
33
|
+
# These are the canonical machine-readable codes emitted in the top-level
|
|
34
|
+
# `error.code` field. They appear in persisted output and consumer-facing
|
|
35
|
+
# documentation; treat the string values as part of the public contract
|
|
36
|
+
# (do not rename without a schema-version bump).
|
|
37
|
+
|
|
38
|
+
# The agent client raised; no response was obtained.
ERROR_CODE_AGENT_REQUEST_FAILED = "agentRequestFailed"

# The turn was not attempted because a preceding turn failed.
ERROR_CODE_TURN_SKIPPED = "turnSkipped"

# A response was obtained, but at least one evaluator returned `result: "error"`.
ERROR_CODE_EVALUATORS_FAILED = "evaluatorsFailed"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# ── Turn/item-level ErrorObject builders ──────────────────────────────
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def agent_request_failed(exc_message: str) -> ErrorObject:
    """Build the top-level error for an agent request that failed.

    This is the ``status == "error"`` cause used when the agent client
    raised and no response was obtained.

    Args:
        exc_message: Failure detail appended after the
            ``"Agent request failed: "`` prefix.

    Returns:
        An ``ErrorObject`` carrying ``ERROR_CODE_AGENT_REQUEST_FAILED`` and
        the formatted message.
    """
    message = f"Agent request failed: {exc_message}"
    return {"code": ERROR_CODE_AGENT_REQUEST_FAILED, "message": message}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def turn_skipped() -> ErrorObject:
    """Build the top-level error for a turn that was never attempted.

    This is the ``status == "error"`` cause for downstream turns after a
    preceding turn failed. The cause is synthesized, so the message is a
    fixed string with no exception text appended (FR-013).

    Returns:
        An ``ErrorObject`` carrying ``ERROR_CODE_TURN_SKIPPED`` and the
        fixed message.
    """
    skipped: ErrorObject = {
        "code": ERROR_CODE_TURN_SKIPPED,
        "message": "Turn not attempted: preceding turn failed",
    }
    return skipped
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def evaluators_failed_summary(error_count: int, total: int) -> ErrorObject:
    """Build the ``status == "partial"`` summary for errored evaluators.

    Used when at least one evaluator returned ``result: "error"``. The same
    template is used whatever the ratio of ``error_count`` to ``total``;
    per-evaluator detail (crash vs missing-prereq, with optional exception
    text) lives in ``scores``, not here.

    Args:
        error_count: Number of evaluators that failed to run.
        total: Number of evaluators reported in the "N of M" message.

    Returns:
        An ``ErrorObject`` carrying ``ERROR_CODE_EVALUATORS_FAILED`` and the
        formatted summary message.
    """
    summary = f"Agent response obtained. {error_count} of {total} evaluators failed to run."
    return {"code": ERROR_CODE_EVALUATORS_FAILED, "message": summary}
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# ── Per-evaluator string builders (inside `scores`) ──────────────────
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def evaluator_failed(exc_message: str) -> str:
    """Build the per-evaluator ``error`` string for an evaluator that raised.

    Args:
        exc_message: Failure detail appended after the
            ``"Evaluator failed: "`` prefix.

    Returns:
        The flat string ``"Evaluator failed: <exc_message>"``.
    """
    detail = f"{exc_message}"
    return "Evaluator failed: " + detail
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# Per-evaluator prerequisite-miss builders are not present today because no
|
|
86
|
+
# reachable prereq-fail exists: `validate_environment()` exits the process if
|
|
87
|
+
# Azure OpenAI config is missing, and no registered evaluator has a
|
|
88
|
+
# data-dependent prereq. When that changes, add builders following the
|
|
89
|
+
# convention `"Evaluator missing prerequisites: <description>"` and wire a
|
|
90
|
+
# prereq check in evaluation_runner. See specs/236-unified-error-output for
|
|
91
|
+
# the deferred sub-cases.
|