@microsoft/m365-copilot-eval 1.5.0-preview.1 → 1.7.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,8 +15,8 @@
15
15
  "schemaVersion": {
16
16
  "type": "string",
17
17
  "pattern": "^1\\.\\d+\\.\\d+$",
18
- "description": "SemVer string identifying the schema version this document conforms to (e.g., '1.0.0')",
19
- "examples": ["1.0.0", "1.1.0", "1.2.0"]
18
+ "description": "SemVer string identifying the schema version this document conforms to (e.g., '1.4.0')",
19
+ "examples": ["1.0.0", "1.1.0", "1.2.0", "1.3.0", "1.4.0"]
20
20
  },
21
21
  "metadata": {
22
22
  "$ref": "#/$defs/DocumentMetadata"
@@ -43,52 +43,16 @@
43
43
  "description": "Optional metadata about the evaluation document",
44
44
  "additionalProperties": true,
45
45
  "properties": {
46
- "name": {
47
- "type": "string",
48
- "description": "Human-readable name for the evaluation set"
49
- },
50
- "description": {
51
- "type": "string",
52
- "description": "Description of what this evaluation set tests"
53
- },
54
- "createdAt": {
55
- "type": "string",
56
- "format": "date-time",
57
- "description": "ISO 8601 timestamp when the document was created"
58
- },
59
- "createdBy": {
60
- "type": "string",
61
- "description": "Author or system that created the document"
62
- },
63
- "evaluatedAt": {
64
- "type": "string",
65
- "format": "date-time",
66
- "description": "ISO 8601 timestamp when evaluation was performed"
67
- },
68
- "tags": {
69
- "type": "array",
70
- "items": {
71
- "type": "string"
72
- },
73
- "description": "Tags for categorization and filtering"
74
- },
75
- "agentId": {
76
- "type": "string",
77
- "description": "M365 Agent ID this evaluation targets"
78
- },
79
- "agentName": {
80
- "type": "string",
81
- "description": "Name of the M365 agent this evaluation targets"
82
- },
83
- "cliVersion": {
84
- "type": "string",
85
- "description": "Version of the M365 Copilot Agent Evals CLI that produced this document"
86
- },
87
- "extensions": {
88
- "type": "object",
89
- "additionalProperties": true,
90
- "description": "Extension point for custom metadata. Use reverse-domain notation for field names."
91
- }
46
+ "name": { "type": "string", "description": "Human-readable name for the evaluation set" },
47
+ "description": { "type": "string", "description": "Description of what this evaluation set tests" },
48
+ "createdAt": { "type": "string", "format": "date-time", "description": "ISO 8601 timestamp when the document was created" },
49
+ "createdBy": { "type": "string", "description": "Author or system that created the document" },
50
+ "evaluatedAt": { "type": "string", "format": "date-time", "description": "ISO 8601 timestamp when evaluation was performed" },
51
+ "tags": { "type": "array", "items": { "type": "string" }, "description": "Tags for categorization and filtering" },
52
+ "agentId": { "type": "string", "description": "M365 Agent ID this evaluation targets" },
53
+ "agentName": { "type": "string", "description": "Name of the M365 agent this evaluation targets" },
54
+ "cliVersion": { "type": "string", "description": "Version of the M365 Copilot Agent Evals CLI that produced this document" },
55
+ "extensions": { "type": "object", "additionalProperties": true, "description": "Extension point for custom metadata. Use reverse-domain notation for field names." }
92
56
  }
93
57
  },
94
58
  "SingleTurnEvaluation": {
@@ -97,52 +61,26 @@
97
61
  "required": ["prompt"],
98
62
  "additionalProperties": false,
99
63
  "properties": {
100
- "prompt": {
101
- "type": "string",
102
- "minLength": 1,
103
- "description": "The input prompt to evaluate"
104
- },
105
- "expected_response": {
106
- "type": "string",
107
- "description": "Expected or ideal response for comparison during evaluation"
108
- },
109
- "response": {
110
- "type": "string",
111
- "description": "Actual response from the agent"
112
- },
113
- "context": {
114
- "type": "string",
115
- "description": "Additional context for grounding evaluation"
116
- },
117
- "evaluators": {
118
- "$ref": "#/$defs/EvaluatorMap",
119
- "description": "Per-prompt evaluator overrides"
120
- },
121
- "evaluators_mode": {
64
+ "prompt": { "type": "string", "minLength": 1, "description": "The input prompt to evaluate" },
65
+ "expected_response": { "type": "string", "description": "Expected or ideal response for comparison during evaluation" },
66
+ "response": { "type": "string", "description": "Actual response from the agent" },
67
+ "context": { "type": "string", "description": "Additional context for grounding evaluation" },
68
+ "evaluators": { "$ref": "#/$defs/EvaluatorMap", "description": "Per-prompt evaluator overrides" },
69
+ "evaluators_mode": { "type": "string", "enum": ["extend", "replace"], "default": "extend", "description": "How per-prompt evaluators combine with defaults" },
70
+ "citations": { "type": "array", "items": { "$ref": "#/$defs/Citation" }, "description": "Citations included in the response" },
71
+ "scores": { "$ref": "#/$defs/ScoreCollection" },
72
+ "status": {
122
73
  "type": "string",
123
- "enum": ["extend", "replace"],
124
- "default": "extend",
125
- "description": "How per-prompt evaluators combine with defaults"
126
- },
127
- "citations": {
128
- "type": "array",
129
- "items": {
130
- "$ref": "#/$defs/Citation"
131
- },
132
- "description": "Citations included in the response"
74
+ "enum": ["pass", "fail", "partial", "error"],
75
+ "description": "Overall status of this item."
133
76
  },
134
- "scores": {
135
- "$ref": "#/$defs/ScoreCollection"
77
+ "error": {
78
+ "$ref": "#/$defs/ErrorObject",
79
+ "description": "Error details for this item, if any."
136
80
  },
137
- "extensions": {
138
- "type": "object",
139
- "additionalProperties": true,
140
- "description": "Extension point for custom item-level fields"
141
- }
81
+ "extensions": { "type": "object", "additionalProperties": true, "description": "Extension point for custom item-level fields" }
142
82
  },
143
- "not": {
144
- "required": ["turns"]
145
- }
83
+ "not": { "required": ["turns"] }
146
84
  },
147
85
  "MultiTurnThread": {
148
86
  "type": "object",
@@ -150,38 +88,14 @@
150
88
  "required": ["turns"],
151
89
  "additionalProperties": false,
152
90
  "properties": {
153
- "name": {
154
- "type": "string",
155
- "description": "Human-readable name for the thread"
156
- },
157
- "description": {
158
- "type": "string",
159
- "description": "Description of what this thread tests"
160
- },
161
- "turns": {
162
- "type": "array",
163
- "minItems": 1,
164
- "maxItems": 20,
165
- "items": { "$ref": "#/$defs/Turn" },
166
- "description": "Ordered array of conversation turns"
167
- },
168
- "conversation_id": {
169
- "type": "string",
170
- "description": "Unique identifier for this conversation thread"
171
- },
172
- "summary": {
173
- "$ref": "#/$defs/ThreadSummary",
174
- "description": "Aggregate statistics for the thread"
175
- },
176
- "extensions": {
177
- "type": "object",
178
- "additionalProperties": true,
179
- "description": "Extension point for custom thread-level fields"
180
- }
91
+ "name": { "type": "string", "description": "Human-readable name for the thread" },
92
+ "description": { "type": "string", "description": "Description of what this thread tests" },
93
+ "turns": { "type": "array", "minItems": 1, "maxItems": 20, "items": { "$ref": "#/$defs/Turn" }, "description": "Ordered array of conversation turns" },
94
+ "conversation_id": { "type": "string", "description": "Unique identifier for this conversation thread" },
95
+ "summary": { "$ref": "#/$defs/ThreadSummary", "description": "Aggregate statistics for the thread" },
96
+ "extensions": { "type": "object", "additionalProperties": true, "description": "Extension point for custom thread-level fields" }
181
97
  },
182
- "not": {
183
- "required": ["prompt"]
184
- }
98
+ "not": { "required": ["prompt"] }
185
99
  },
186
100
  "Turn": {
187
101
  "type": "object",
@@ -189,237 +103,159 @@
189
103
  "required": ["prompt"],
190
104
  "additionalProperties": false,
191
105
  "properties": {
192
- "prompt": {
193
- "type": "string",
194
- "minLength": 1,
195
- "description": "The user message for this turn"
196
- },
197
- "expected_response": {
198
- "type": "string",
199
- "description": "Expected agent response for this turn"
200
- },
201
- "response": {
202
- "type": "string",
203
- "description": "Actual agent response"
204
- },
205
- "context": {
206
- "type": "string",
207
- "description": "Additional context for grounding evaluation"
208
- },
209
- "evaluators": {
210
- "$ref": "#/$defs/EvaluatorMap",
211
- "description": "Per-turn evaluator overrides"
212
- },
213
- "evaluators_mode": {
214
- "type": "string",
215
- "enum": ["extend", "replace"],
216
- "default": "extend",
217
- "description": "How per-turn evaluators combine with defaults"
218
- },
219
- "citations": {
220
- "type": "array",
221
- "items": {
222
- "$ref": "#/$defs/Citation"
223
- },
224
- "description": "Citations included in the response"
225
- },
226
- "scores": {
227
- "$ref": "#/$defs/ScoreCollection"
228
- },
106
+ "prompt": { "type": "string", "minLength": 1, "description": "The user message for this turn" },
107
+ "expected_response": { "type": "string", "description": "Expected agent response for this turn" },
108
+ "response": { "type": "string", "description": "Actual agent response" },
109
+ "context": { "type": "string", "description": "Additional context for grounding evaluation" },
110
+ "evaluators": { "$ref": "#/$defs/EvaluatorMap", "description": "Per-turn evaluator overrides" },
111
+ "evaluators_mode": { "type": "string", "enum": ["extend", "replace"], "default": "extend", "description": "How per-turn evaluators combine with defaults" },
112
+ "citations": { "type": "array", "items": { "$ref": "#/$defs/Citation" }, "description": "Citations included in the response" },
113
+ "scores": { "$ref": "#/$defs/ScoreCollection" },
229
114
  "status": {
230
115
  "type": "string",
231
- "enum": ["pass", "fail", "error"],
232
- "description": "Overall status of this turn"
116
+ "enum": ["pass", "fail", "partial", "error"],
117
+ "description": "Overall status of this turn."
233
118
  },
234
119
  "error": {
235
- "type": "string",
236
- "description": "Error message if status is 'error'"
120
+ "$ref": "#/$defs/ErrorObject",
121
+ "description": "Error details for this turn, if any."
237
122
  },
238
- "extensions": {
239
- "type": "object",
240
- "additionalProperties": true,
241
- "description": "Extension point for custom turn-level fields"
242
- }
123
+ "extensions": { "type": "object", "additionalProperties": true, "description": "Extension point for custom turn-level fields" }
243
124
  }
244
125
  },
245
126
  "ThreadSummary": {
246
127
  "type": "object",
247
- "description": "Aggregate statistics for a thread",
248
- "required": ["turns_total", "turns_passed", "turns_failed", "overall_status"],
128
+ "description": "Aggregate statistics for a thread.",
129
+ "required": ["turns_total", "turns_passed", "turns_failed", "turns_partial", "turns_errored", "overall_status"],
249
130
  "additionalProperties": false,
250
131
  "properties": {
251
- "turns_total": {
252
- "type": "integer",
253
- "minimum": 1,
254
- "description": "Total number of turns executed"
255
- },
256
- "turns_passed": {
257
- "type": "integer",
258
- "minimum": 0,
259
- "description": "Number of turns where all evaluators passed"
260
- },
261
- "turns_failed": {
262
- "type": "integer",
263
- "minimum": 0,
264
- "description": "Number of turns where any evaluator failed"
265
- },
132
+ "turns_total": { "type": "integer", "minimum": 1, "description": "Total number of turns executed" },
133
+ "turns_passed": { "type": "integer", "minimum": 0, "description": "Count of turns with status='pass'" },
134
+ "turns_failed": { "type": "integer", "minimum": 0, "description": "Count of turns with status='fail'" },
135
+ "turns_partial": { "type": "integer", "minimum": 0, "description": "Count of turns with status='partial'" },
136
+ "turns_errored": { "type": "integer", "minimum": 0, "description": "Count of turns with status='error'" },
266
137
  "overall_status": {
267
138
  "type": "string",
268
- "enum": ["pass", "partial", "fail"],
269
- "description": "pass: all turns passed, partial: some failed, fail: all failed or error"
139
+ "enum": ["pass", "fail", "partial", "error"],
140
+ "description": "Overall status of the thread."
270
141
  }
271
142
  }
272
143
  },
273
144
  "ScoreCollection": {
274
145
  "type": "object",
275
- "description": "Collection of evaluation scores for an item",
146
+ "description": "Collection of evaluation scores for an item. Each entry is either a valid result (ValidScore variants) or an errored record (ErroredScore) under the discriminated oneOf.",
276
147
  "additionalProperties": true,
277
148
  "properties": {
278
- "relevance": {
279
- "$ref": "#/$defs/EvalScore",
280
- "description": "Relevance score (1-5)"
281
- },
282
- "coherence": {
283
- "$ref": "#/$defs/EvalScore",
284
- "description": "Coherence score (1-5)"
285
- },
286
- "groundedness": {
287
- "$ref": "#/$defs/EvalScore",
288
- "description": "Groundedness score (1-5)"
289
- },
290
- "similarity": {
291
- "$ref": "#/$defs/EvalScore",
292
- "description": "Similarity score (1-5)"
293
- },
294
- "citations": {
295
- "$ref": "#/$defs/CitationScore",
296
- "description": "Citation evaluation results"
297
- },
298
- "exactMatch": {
299
- "$ref": "#/$defs/ExactMatchScore",
300
- "description": "Exact match evaluation result"
301
- },
302
- "partialMatch": {
303
- "$ref": "#/$defs/PartialMatchScore",
304
- "description": "Partial match evaluation result"
305
- }
149
+ "relevance": { "$ref": "#/$defs/EvalScore", "description": "Relevance score (1-5) or errored entry" },
150
+ "coherence": { "$ref": "#/$defs/EvalScore", "description": "Coherence score (1-5) or errored entry" },
151
+ "groundedness": { "$ref": "#/$defs/EvalScore", "description": "Groundedness score (1-5) or errored entry" },
152
+ "similarity": { "$ref": "#/$defs/EvalScore", "description": "Similarity score (1-5) or errored entry" },
153
+ "citations": { "$ref": "#/$defs/CitationScore", "description": "Citation evaluation result or errored entry" },
154
+ "exactMatch": { "$ref": "#/$defs/ExactMatchScore", "description": "Exact match evaluation result or errored entry" },
155
+ "partialMatch": { "$ref": "#/$defs/PartialMatchScore", "description": "Partial match evaluation result or errored entry" }
306
156
  }
307
157
  },
308
158
  "EvalScore": {
159
+ "description": "Standard evaluation score (1-5 scale) — valid result OR errored-evaluator record.",
160
+ "oneOf": [
161
+ { "$ref": "#/$defs/EvalScoreValid" },
162
+ { "$ref": "#/$defs/ErroredScore" }
163
+ ]
164
+ },
165
+ "EvalScoreValid": {
309
166
  "type": "object",
310
- "description": "Standard evaluation score (1-5 scale)",
167
+ "description": "Valid 1-5 score result. Required when result is pass or fail.",
311
168
  "required": ["score", "result", "threshold"],
312
169
  "additionalProperties": true,
313
170
  "properties": {
314
- "score": {
315
- "type": "number",
316
- "minimum": 1,
317
- "maximum": 5,
318
- "description": "Numeric score from 1.0 (worst) to 5.0 (best)"
319
- },
320
- "result": {
321
- "type": "string",
322
- "enum": ["pass", "fail"],
323
- "description": "Pass/fail result based on threshold comparison"
324
- },
325
- "threshold": {
326
- "type": "number",
327
- "minimum": 1,
328
- "maximum": 5,
329
- "description": "Threshold used for pass/fail determination"
330
- },
331
- "reason": {
332
- "type": "string",
333
- "description": "Explanation of why this score was assigned"
334
- },
335
- "evaluator": {
336
- "type": "string",
337
- "description": "Name or identifier of the evaluator that produced this score"
338
- }
171
+ "score": { "type": "number", "minimum": 1, "maximum": 5, "description": "Numeric score from 1.0 (worst) to 5.0 (best)" },
172
+ "result": { "type": "string", "enum": ["pass", "fail"], "description": "Pass/fail result based on threshold comparison" },
173
+ "threshold": { "type": "number", "minimum": 1, "maximum": 5, "description": "Threshold used for pass/fail determination" },
174
+ "reason": { "type": "string", "description": "Explanation of why this score was assigned" }
339
175
  }
340
176
  },
341
177
  "CitationScore": {
178
+ "description": "Citation-specific evaluation result — valid result OR errored-evaluator record.",
179
+ "oneOf": [
180
+ { "$ref": "#/$defs/CitationScoreValid" },
181
+ { "$ref": "#/$defs/ErroredScore" }
182
+ ]
183
+ },
184
+ "CitationScoreValid": {
342
185
  "type": "object",
343
- "description": "Citation-specific evaluation score",
186
+ "description": "Valid citation-count result. Required when result is pass or fail.",
344
187
  "required": ["count", "result", "threshold"],
345
188
  "additionalProperties": true,
346
189
  "properties": {
347
- "count": {
348
- "type": "integer",
349
- "minimum": 0,
350
- "description": "Number of citations found in the response"
351
- },
352
- "result": {
353
- "type": "string",
354
- "enum": ["pass", "fail"],
355
- "description": "Pass/fail result based on citation count vs threshold"
356
- },
357
- "threshold": {
358
- "type": "integer",
359
- "minimum": 0,
360
- "description": "Minimum required number of citations for pass"
361
- },
362
- "format": {
363
- "type": "string",
364
- "description": "Citation format detected. Known values: 'oai_unicode', 'bracket', 'mixed'. Additional formats may be added.",
365
- "examples": ["oai_unicode", "bracket", "mixed"]
366
- },
367
- "citations": {
368
- "type": "array",
369
- "items": {
370
- "$ref": "#/$defs/Citation"
371
- },
372
- "description": "Parsed citation objects"
373
- }
190
+ "count": { "type": "integer", "minimum": 0, "description": "Number of citations found in the response" },
191
+ "result": { "type": "string", "enum": ["pass", "fail"], "description": "Pass/fail result based on citation count vs threshold" },
192
+ "threshold": { "type": "integer", "minimum": 0, "description": "Minimum required number of citations for pass" },
193
+ "format": { "type": "string", "description": "Citation format detected. Known values: 'oai_unicode', 'bracket', 'mixed'.", "examples": ["oai_unicode", "bracket", "mixed"] },
194
+ "citations": { "type": "array", "items": { "$ref": "#/$defs/Citation" }, "description": "Parsed citation objects" }
374
195
  }
375
196
  },
376
197
  "ExactMatchScore": {
198
+ "description": "Exact match evaluation result — valid result OR errored-evaluator record.",
199
+ "oneOf": [
200
+ { "$ref": "#/$defs/ExactMatchScoreValid" },
201
+ { "$ref": "#/$defs/ErroredScore" }
202
+ ]
203
+ },
204
+ "ExactMatchScoreValid": {
377
205
  "type": "object",
378
- "description": "Exact match evaluation result",
206
+ "description": "Valid exact-match result. Required when result is pass or fail.",
379
207
  "required": ["match", "result"],
380
208
  "additionalProperties": true,
381
209
  "properties": {
382
- "match": {
383
- "type": "boolean",
384
- "description": "Whether response exactly matches expected_response (trimmed; case-insensitive by default)"
385
- },
386
- "result": {
387
- "type": "string",
388
- "enum": ["pass", "fail"],
389
- "description": "Pass when match is true, fail otherwise"
390
- },
391
- "reason": {
392
- "type": "string",
393
- "description": "Explanation of the match result"
394
- }
210
+ "match": { "type": "boolean", "description": "Whether response exactly matches expected_response (trimmed; case-insensitive by default)" },
211
+ "result": { "type": "string", "enum": ["pass", "fail"], "description": "Pass when match is true, fail otherwise" },
212
+ "reason": { "type": "string", "description": "Explanation of the match result" }
395
213
  }
396
214
  },
397
215
  "PartialMatchScore": {
216
+ "description": "Partial match evaluation result — valid result OR errored-evaluator record.",
217
+ "oneOf": [
218
+ { "$ref": "#/$defs/PartialMatchScoreValid" },
219
+ { "$ref": "#/$defs/ErroredScore" }
220
+ ]
221
+ },
222
+ "PartialMatchScoreValid": {
398
223
  "type": "object",
399
- "description": "Partial match evaluation result",
224
+ "description": "Valid partial-match result (0.0-1.0 score). Required when result is pass or fail.",
400
225
  "required": ["score", "result", "threshold"],
401
226
  "additionalProperties": true,
402
227
  "properties": {
403
- "score": {
404
- "type": "number",
405
- "minimum": 0,
406
- "maximum": 1,
407
- "description": "Match score from 0.0 (no match) to 1.0 (full match)"
408
- },
409
- "result": {
228
+ "score": { "type": "number", "minimum": 0, "maximum": 1, "description": "Match score from 0.0 (no match) to 1.0 (full match)" },
229
+ "result": { "type": "string", "enum": ["pass", "fail"], "description": "Pass/fail based on score vs threshold" },
230
+ "threshold": { "type": "number", "minimum": 0, "maximum": 1, "description": "Minimum score required for pass (default: 0.5)" },
231
+ "reason": { "type": "string", "description": "Explanation of the match result" }
232
+ }
233
+ },
234
+ "ErroredScore": {
235
+ "type": "object",
236
+ "description": "Per-evaluator entry for an evaluator that did not produce a result.",
237
+ "required": ["result", "error"],
238
+ "additionalProperties": false,
239
+ "properties": {
240
+ "result": { "type": "string", "const": "error", "description": "Always 'error' for this variant." },
241
+ "error": { "type": "string", "minLength": 1, "description": "Error message describing why the evaluator did not produce a result." }
242
+ }
243
+ },
244
+ "ErrorObject": {
245
+ "type": "object",
246
+ "description": "Structured turn/item-level error with a machine-readable code and a human-readable message.",
247
+ "required": ["code", "message"],
248
+ "additionalProperties": false,
249
+ "properties": {
250
+ "code": {
410
251
  "type": "string",
411
- "enum": ["pass", "fail"],
412
- "description": "Pass/fail based on score vs threshold"
413
- },
414
- "threshold": {
415
- "type": "number",
416
- "minimum": 0,
417
- "maximum": 1,
418
- "description": "Minimum score required for pass (default: 0.5)"
252
+ "minLength": 1,
253
+ "description": "Machine-readable error category. One of: 'agentRequestFailed', 'turnSkipped', 'evaluatorsFailed'."
419
254
  },
420
- "reason": {
255
+ "message": {
421
256
  "type": "string",
422
- "description": "Explanation of the match result"
257
+ "minLength": 1,
258
+ "description": "Human-readable message paired with the code."
423
259
  }
424
260
  }
425
261
  },
@@ -438,25 +274,10 @@
438
274
  "description": "Evaluator configuration options. Use empty object {} for defaults.",
439
275
  "additionalProperties": false,
440
276
  "properties": {
441
- "threshold": {
442
- "type": "number",
443
- "description": "Pass/fail threshold. Range depends on evaluator type: 1-5 for LLM evaluators (default: 3), >= 1 integer for Citations (min citation count, default: 1), 0.0-1.0 for PartialMatch (min match ratio, default: 0.5). Validated per-evaluator at runtime."
444
- },
445
- "citation_format": {
446
- "type": "string",
447
- "examples": ["oai_unicode", "bracket", "mixed"],
448
- "description": "Citation format for detection. 'oai_unicode': new OAI unicode format, 'bracket': legacy [^i^] bracket format, 'mixed': auto-detect both formats. Default: oai_unicode."
449
- },
450
- "case_sensitive": {
451
- "type": "boolean",
452
- "default": false,
453
- "description": "Case-sensitive matching for ExactMatch/PartialMatch"
454
- },
455
- "options": {
456
- "type": "object",
457
- "additionalProperties": true,
458
- "description": "Evaluator-specific configuration"
459
- }
277
+ "threshold": { "type": "number", "description": "Pass/fail threshold. Range depends on evaluator type: 1-5 for LLM evaluators (default: 3), >= 1 integer for Citations (default: 1), 0.0-1.0 for PartialMatch (default: 0.5)." },
278
+ "citation_format": { "type": "string", "examples": ["oai_unicode", "bracket", "mixed"], "description": "Citation format for detection. Default: oai_unicode." },
279
+ "case_sensitive": { "type": "boolean", "default": false, "description": "Case-sensitive matching for ExactMatch/PartialMatch" },
280
+ "options": { "type": "object", "additionalProperties": true, "description": "Evaluator-specific configuration" }
460
281
  }
461
282
  },
462
283
  "Citation": {
@@ -465,19 +286,9 @@
465
286
  "required": ["index"],
466
287
  "additionalProperties": true,
467
288
  "properties": {
468
- "index": {
469
- "type": "integer",
470
- "minimum": 1,
471
- "description": "Citation index (1-based)"
472
- },
473
- "text": {
474
- "type": "string",
475
- "description": "The cited text"
476
- },
477
- "source": {
478
- "type": "string",
479
- "description": "Source reference (URL, document name, etc.)"
480
- }
289
+ "index": { "type": "integer", "minimum": 1, "description": "Citation index (1-based)" },
290
+ "text": { "type": "string", "description": "The cited text" },
291
+ "source": { "type": "string", "description": "Source reference (URL, document name, etc.)" }
481
292
  }
482
293
  }
483
294
  }
@@ -0,0 +1,16 @@
1
+ {
2
+ "schemaVersion": "1.4.0",
3
+ "items": [
4
+ {
5
+ "prompt": "What is Microsoft Graph?",
6
+ "scores": {
7
+ "relevance": {
8
+ "result": "error",
9
+ "error": "Evaluator failed: Connection timeout to Azure OpenAI endpoint",
10
+ "score": 0,
11
+ "threshold": 3
12
+ }
13
+ }
14
+ }
15
+ ]
16
+ }
@@ -0,0 +1,13 @@
1
+ {
2
+ "schemaVersion": "1.4.0",
3
+ "items": [
4
+ {
5
+ "prompt": "What is Microsoft Graph?",
6
+ "scores": {
7
+ "relevance": {
8
+ "result": "error"
9
+ }
10
+ }
11
+ }
12
+ ]
13
+ }
@@ -52,6 +52,8 @@
52
52
  "turns_total": 2,
53
53
  "turns_passed": 2,
54
54
  "turns_failed": 0,
55
+ "turns_partial": 0,
56
+ "turns_errored": 0,
55
57
  "overall_status": "pass"
56
58
  }
57
59
  }