@microsoft/m365-copilot-eval 1.6.0-preview.1 → 1.7.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ {
2
+ "schemaVersion": "1.4.0",
3
+ "items": [
4
+ {
5
+ "prompt": "What is Microsoft Graph?",
6
+ "scores": {
7
+ "relevance": {
8
+ "result": "error"
9
+ }
10
+ }
11
+ }
12
+ ]
13
+ }
@@ -52,6 +52,8 @@
52
52
  "turns_total": 2,
53
53
  "turns_passed": 2,
54
54
  "turns_failed": 0,
55
+ "turns_partial": 0,
56
+ "turns_errored": 0,
55
57
  "overall_status": "pass"
56
58
  }
57
59
  }
@@ -0,0 +1,239 @@
1
+ {
2
+ "$schema": "https://raw.githubusercontent.com/microsoft/M365-Copilot-Agent-Evals/refs/heads/main/schema/v1/eval-document.schema.json",
3
+ "schemaVersion": "1.4.0",
4
+ "metadata": {
5
+ "name": "All scenarios — comprehensive comparison fixture",
6
+ "description": "Single fixture exercising every output shape under v1.4.0. Items are structured to isolate one variable at a time so a reader can see exactly which combination drives which top-level `status` and whether the top-level `error` field is populated. Each item carries an `extensions.scenario` label for jq/grep.",
7
+ "evaluatedAt": "2026-05-01T11:00:00Z",
8
+ "agentName": "Test Agent",
9
+ "cliVersion": "1.4.0"
10
+ },
11
+ "items": [
12
+
13
+ {
14
+ "extensions": {
15
+ "scenario": "1-single-turn-uniform-pass",
16
+ "notes": "Baseline. All evaluators uniformly pass. status=pass. No top-level error field.",
17
+ "evaluators_in_scores": ["pass", "pass"],
18
+ "expected_status": "pass",
19
+ "expected_error_field_populated": false
20
+ },
21
+ "prompt": "What is Microsoft Graph API?",
22
+ "expected_response": "Microsoft Graph is a unified endpoint for M365 data.",
23
+ "response": "Microsoft Graph API is a gateway to data and intelligence in Microsoft 365.",
24
+ "scores": {
25
+ "relevance": { "score": 5.0, "result": "pass", "threshold": 3 },
26
+ "coherence": { "score": 5.0, "result": "pass", "threshold": 3 }
27
+ },
28
+ "status": "pass"
29
+ },
30
+
31
+ {
32
+ "extensions": {
33
+ "scenario": "2-single-turn-uniform-fail",
34
+ "notes": "All evaluators uniformly fail. status=fail. No top-level error field. Under v1.4.0 this is the ONLY shape producing status=fail — uniform-fail is strict.",
35
+ "evaluators_in_scores": ["fail", "fail"],
36
+ "expected_status": "fail",
37
+ "expected_error_field_populated": false
38
+ },
39
+ "prompt": "What is the boiling point of water in Fahrenheit at sea level?",
40
+ "expected_response": "212°F.",
41
+ "response": "Water boils at 150°F at sea level.",
42
+ "scores": {
43
+ "relevance": { "score": 2.0, "result": "fail", "threshold": 3, "reason": "Response is factually incorrect." },
44
+ "coherence": { "score": 2.0, "result": "fail", "threshold": 3, "reason": "Response contains an internal contradiction." }
45
+ },
46
+ "status": "fail"
47
+ },
48
+
49
+ {
50
+ "extensions": {
51
+ "scenario": "3-single-turn-fail-pass-fail-mix",
52
+ "notes": "All evaluators ran successfully; one passed, one returned a fail verdict. status=fail (covers uniform-fail and pass+fail mixes). Top-level error field is ABSENT — no errored evaluator to summarize.",
53
+ "evaluators_in_scores": ["pass", "fail"],
54
+ "expected_status": "fail",
55
+ "expected_error_field_populated": false
56
+ },
57
+ "prompt": "Explain the difference between SharePoint and OneDrive.",
58
+ "expected_response": "Covers shared vs personal storage, permissions model.",
59
+ "response": "SharePoint is for team collaboration; OneDrive is for personal files. Both use the same permissions engine.",
60
+ "scores": {
61
+ "relevance": { "score": 4.0, "result": "pass", "threshold": 3, "reason": "Response addresses the key distinction." },
62
+ "coherence": { "score": 2.0, "result": "fail", "threshold": 3, "reason": "The claim about a 'same permissions engine' is inaccurate; the response is internally inconsistent with product reality." }
63
+ },
64
+ "status": "fail"
65
+ },
66
+
67
+ {
68
+ "extensions": {
69
+ "scenario": "4-single-turn-partial-fail-plus-evaluator-crash",
70
+ "notes": "One fail + one evaluator crash (judge raised an exception). status=partial — error takes priority over pass/fail when ≥1 evaluator errored. Top-level error field IS populated with the evaluatorsFailed summary. Per-evaluator error uses the 'Evaluator failed:' prefix and appends exception.message text.",
71
+ "evaluators_in_scores": ["fail", "error"],
72
+ "expected_status": "partial",
73
+ "expected_error_field_populated": true
74
+ },
75
+ "prompt": "How do I configure conditional access for guest accounts?",
76
+ "expected_response": "Covers Azure AD guest CA policies.",
77
+ "response": "Use the on-prem AD Users and Computers tool to block external logins.",
78
+ "scores": {
79
+ "relevance": { "score": 1.0, "result": "fail", "threshold": 3, "reason": "Response references on-prem AD instead of Azure AD." },
80
+ "coherence": { "result": "error", "error": "Evaluator failed: Connection timeout to Azure OpenAI endpoint after 30s" }
81
+ },
82
+ "status": "partial",
83
+ "error": {
84
+ "code": "evaluatorsFailed",
85
+ "message": "Agent response obtained. 1 of 2 evaluators failed to run."
86
+ }
87
+ },
88
+
89
+ {
90
+ "extensions": {
91
+ "scenario": "5-single-turn-partial-all-evaluators-errored",
92
+ "notes": "Response obtained, but every attempted evaluator errored (zero verdicts rendered). status=partial (not error) — the agent DID respond; we just couldn't fully evaluate it. Same unified message template as scenario 4, now reading '2 of 2' to indicate all evaluators errored. The 'Agent response obtained.' prefix distinguishes this case from scenario 6 (no response, status=error).",
93
+ "evaluators_in_scores": ["error", "error"],
94
+ "expected_status": "partial",
95
+ "expected_error_field_populated": true
96
+ },
97
+ "prompt": "List the top 5 security best practices for M365 tenants.",
98
+ "expected_response": "Covers MFA, conditional access, audit logging, DLP, least privilege.",
99
+ "response": "Here are five M365 security best practices: 1) MFA, 2) conditional access, 3) DLP, 4) audit logs, 5) least-privilege roles.",
100
+ "scores": {
101
+ "relevance": { "result": "error", "error": "Evaluator failed: Service rate limit exceeded after 3 retries" },
102
+ "coherence": { "result": "error", "error": "Evaluator failed: Connection timeout to Azure OpenAI endpoint after 30s" }
103
+ },
104
+ "status": "partial",
105
+ "error": {
106
+ "code": "evaluatorsFailed",
107
+ "message": "Agent response obtained. 2 of 2 evaluators failed to run."
108
+ }
109
+ },
110
+
111
+ {
112
+ "extensions": {
113
+ "scenario": "6-single-turn-error-no-response",
114
+ "notes": "Agent did not respond after retries. No evaluators attempted. status=error. Top-level error field IS populated with the request-failure cause template. This is the ONLY single-turn case producing status=error under v1.4.0.",
115
+ "evaluators_in_scores": [],
116
+ "expected_status": "error",
117
+ "expected_error_field_populated": true
118
+ },
119
+ "prompt": "What is Microsoft Graph API?",
120
+ "expected_response": "Microsoft Graph is a unified endpoint for M365 data.",
121
+ "response": "",
122
+ "scores": {},
123
+ "status": "error",
124
+ "error": {
125
+ "code": "agentRequestFailed",
126
+ "message": "Agent request failed: HTTP 503 Service Unavailable"
127
+ }
128
+ },
129
+
130
+ {
131
+ "extensions": {
132
+ "scenario": "7-multi-turn-partial-mixed-turn-outcomes",
133
+ "notes": "3-turn thread with no errored turns. Turn 1 uniformly passed; Turn 2 had a pass+error evaluator mix (partial); Turn 3 had a uniform-fail (status=fail). Per-turn statuses: [pass, partial, fail] — any partial turn drives thread to partial under FR-004's priority rules. Summary invariant: 1+1+1+0=3.",
134
+ "per_turn_statuses": ["pass", "partial", "fail"],
135
+ "expected_overall_status": "partial"
136
+ },
137
+ "name": "Seattle trip planning — mixed turn outcomes",
138
+ "conversation_id": "conv-abc-007",
139
+ "turns": [
140
+ {
141
+ "prompt": "I'm based in Seattle.",
142
+ "expected_response": "I can help with Seattle-related queries.",
143
+ "response": "Understood — I can help with Seattle questions.",
144
+ "scores": {
145
+ "relevance": { "score": 5.0, "result": "pass", "threshold": 3 },
146
+ "coherence": { "score": 5.0, "result": "pass", "threshold": 3 }
147
+ },
148
+ "status": "pass"
149
+ },
150
+ {
151
+ "prompt": "What's the weather like here?",
152
+ "expected_response": "Seattle has mild, rainy weather.",
153
+ "response": "Seattle tends to be rainy most of the year, especially in winter.",
154
+ "scores": {
155
+ "relevance": { "score": 5.0, "result": "pass", "threshold": 3, "reason": "Response addresses Seattle weather accurately." },
156
+ "coherence": { "result": "error", "error": "Evaluator failed: Connection timeout to Azure OpenAI endpoint after 30s" }
157
+ },
158
+ "status": "partial",
159
+ "error": {
160
+ "code": "evaluatorsFailed",
161
+ "message": "Agent response obtained. 1 of 2 evaluators failed to run."
162
+ }
163
+ },
164
+ {
165
+ "prompt": "What's the average temperature in Seattle in March?",
166
+ "expected_response": "Around 50°F (10°C).",
167
+ "response": "Seattle averages 80°F in March.",
168
+ "scores": {
169
+ "relevance": { "score": 1.0, "result": "fail", "threshold": 3, "reason": "Response is factually incorrect — Seattle's March averages are far below 80°F." },
170
+ "coherence": { "score": 2.0, "result": "fail", "threshold": 3, "reason": "Response contradicts well-known regional climate data." }
171
+ },
172
+ "status": "fail"
173
+ }
174
+ ],
175
+ "summary": {
176
+ "turns_total": 3,
177
+ "turns_passed": 1,
178
+ "turns_failed": 1,
179
+ "turns_partial": 1,
180
+ "turns_errored": 0,
181
+ "overall_status": "partial"
182
+ }
183
+ },
184
+
185
+ {
186
+ "extensions": {
187
+ "scenario": "8-multi-turn-error-any-errored-turn",
188
+ "notes": "3-turn thread with a mid-conversation request failure. Turn 1 uniformly passed; Turn 2's request failed; Turn 3 was downstream-skipped. Per-turn statuses: [pass, error, error] — under FR-004's priority rules, any errored turn drives the thread to error (the run didn't complete). The two error turns carry distinct error codes (agentRequestFailed vs turnSkipped) demonstrating the cascade. Summary invariant: 1+0+0+2=3.",
189
+ "per_turn_statuses": ["pass", "error", "error"],
190
+ "expected_overall_status": "error"
191
+ },
192
+ "name": "Conversation that aborted mid-thread",
193
+ "conversation_id": "conv-abc-008",
194
+ "turns": [
195
+ {
196
+ "prompt": "I'd like to plan a trip from SFO.",
197
+ "expected_response": "I can help with travel planning from SFO.",
198
+ "response": "Sure — I can help plan a trip from SFO.",
199
+ "scores": {
200
+ "relevance": { "score": 5.0, "result": "pass", "threshold": 3 },
201
+ "coherence": { "score": 5.0, "result": "pass", "threshold": 3 }
202
+ },
203
+ "status": "pass"
204
+ },
205
+ {
206
+ "prompt": "Book me a flight from SFO to SEA next Tuesday.",
207
+ "expected_response": "I can help with flight queries.",
208
+ "response": "",
209
+ "scores": {},
210
+ "status": "error",
211
+ "error": {
212
+ "code": "agentRequestFailed",
213
+ "message": "Agent request failed: DNS resolution failed for agent endpoint"
214
+ }
215
+ },
216
+ {
217
+ "prompt": "Prefer morning departure, aisle seat.",
218
+ "expected_response": "Noted.",
219
+ "response": "",
220
+ "scores": {},
221
+ "status": "error",
222
+ "error": {
223
+ "code": "turnSkipped",
224
+ "message": "Turn not attempted: preceding turn failed"
225
+ }
226
+ }
227
+ ],
228
+ "summary": {
229
+ "turns_total": 3,
230
+ "turns_passed": 1,
231
+ "turns_failed": 0,
232
+ "turns_partial": 0,
233
+ "turns_errored": 2,
234
+ "overall_status": "error"
235
+ }
236
+ }
237
+
238
+ ]
239
+ }
@@ -39,19 +39,14 @@ CITATIONS = "Citations"
39
39
  EXACT_MATCH = "ExactMatch"
40
40
  PARTIAL_MATCH = "PartialMatch"
41
41
 
42
- # Prerequisite constants
43
- REQUIRES_AZURE_OPENAI = "azure_openai"
44
- REQUIRES_TOOL_DEFINITIONS = "tool_definitions"
45
-
46
- # Evaluation status constants
47
- # Outcome statuses (agent responded, evaluators ran):
48
- STATUS_PASS = "pass" # All evaluators scored above threshold
49
- STATUS_FAIL = "fail" # At least one evaluator scored below threshold
50
- # Error state (evaluation couldn't complete):
51
- STATUS_ERROR = "error" # API call failed / response couldn't be obtained
52
- # Thread-level aggregate status (multi-turn only):
53
- STATUS_PARTIAL = "partial" # Some turns passed, some did not
54
- # Fallback for missing status:
42
+ # Evaluation status constants — four-value enum used at the turn/item level
43
+ # AND the thread-level overall_status. See status_derivation.py for the
44
+ # canonical derivation and rollup rules.
45
+ STATUS_PASS = "pass"
46
+ STATUS_FAIL = "fail"
47
+ STATUS_PARTIAL = "partial"
48
+ STATUS_ERROR = "error"
49
+ # Internal-only sentinel; never appears in emitted output.
55
50
  STATUS_UNKNOWN = "unknown"
56
51
 
57
52
  # System defaults when no file-level or env-level defaults are configured
@@ -77,7 +72,6 @@ METRIC_IDS = {
77
72
  @dataclass
78
73
  class RegistryEntry:
79
74
  type: str # "llm", "tool", or "non-llm"
80
- requires: List[str]
81
75
  default_threshold: Optional[float]
82
76
 
83
77
 
@@ -0,0 +1,91 @@
1
+ """Canonical error-message templates for persisted evaluation output.
2
+
3
+ Every error string written into a JSON/CSV/HTML output file MUST be produced
4
+ by a builder in this module. The builders accept only string arguments — never
5
+ exception objects — which keeps ``repr(exc)``, ``traceback.format_exc()``, and
6
+ SDK class names out of persisted output by construction.
7
+
8
+ Two flavours:
9
+
10
+ * **Turn/item-level `ErrorObject` builders** return a structured ``{code, message}``
11
+ dict for the top-level ``error`` field on a turn or single-turn item. Used
12
+ when ``status == "error"`` (cause) or ``status == "partial"`` with at least
13
+ one errored evaluator (summary).
14
+
15
+ * **Per-evaluator string builders** return a flat string formatted as
16
+ ``"<category prefix>: <detail>"`` for the ``error`` field inside an
17
+ ``ErroredScore`` entry. Evaluator identity is encoded by the ``scores`` map's
18
+ parent property key, so no ``code`` is needed at this level.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ from typing import TypedDict
24
+
25
+
26
+ class ErrorObject(TypedDict):
27
+ """Turn/item-level top-level error shape — `{code, message}` per ErrorObject $def."""
28
+ code: str
29
+ message: str
30
+
31
+
32
+ # ── Error code constants ──────────────────────────────────────────────
33
+ # These are the canonical machine-readable codes emitted in the top-level
34
+ # `error.code` field. They appear in persisted output and consumer-facing
35
+ # documentation; treat the string values as part of the public contract
36
+ # (do not rename without a schema-version bump).
37
+
38
+ ERROR_CODE_AGENT_REQUEST_FAILED = "agentRequestFailed"
39
+ ERROR_CODE_TURN_SKIPPED = "turnSkipped"
40
+ ERROR_CODE_EVALUATORS_FAILED = "evaluatorsFailed"
41
+
42
+
43
+ # ── Turn/item-level ErrorObject builders ──────────────────────────────
44
+
45
+
46
+ def agent_request_failed(exc_message: str) -> ErrorObject:
47
+ """`status == "error"` cause when the agent client raised — no response obtained."""
48
+ return {
49
+ "code": ERROR_CODE_AGENT_REQUEST_FAILED,
50
+ "message": f"Agent request failed: {exc_message}",
51
+ }
52
+
53
+
54
+ def turn_skipped() -> ErrorObject:
55
+ """`status == "error"` cause for downstream turns after a preceding turn failed.
56
+
57
+ Synthesized cause — no exception text appended (FR-013).
58
+ """
59
+ return {
60
+ "code": ERROR_CODE_TURN_SKIPPED,
61
+ "message": "Turn not attempted: preceding turn failed",
62
+ }
63
+
64
+
65
+ def evaluators_failed_summary(error_count: int, total: int) -> ErrorObject:
66
+ """`status == "partial"` summary when at least one evaluator returned `result: "error"`.
67
+
68
+ Unified template regardless of error_count vs total — per-evaluator detail
69
+ (crash vs missing-prereq, with optional exception text) lives in `scores`.
70
+ """
71
+ return {
72
+ "code": ERROR_CODE_EVALUATORS_FAILED,
73
+ "message": f"Agent response obtained. {error_count} of {total} evaluators failed to run.",
74
+ }
75
+
76
+
77
+ # ── Per-evaluator string builders (inside `scores`) ──────────────────
78
+
79
+
80
+ def evaluator_failed(exc_message: str) -> str:
81
+ """Per-evaluator `error` string when the evaluator raised during run."""
82
+ return f"Evaluator failed: {exc_message}"
83
+
84
+
85
+ # Per-evaluator prerequisite-miss builders are not present today because no
86
+ # reachable prereq-fail exists: `validate_environment()` exits the process if
87
+ # Azure OpenAI config is missing, and no registered evaluator has a
88
+ # data-dependent prereq. When that changes, add builders following the
89
+ # convention `"Evaluator missing prerequisites: <description>"` and wire a
90
+ # prereq check in evaluation_runner. See specs/236-unified-error-output for
91
+ # the deferred sub-cases.