@microsoft/m365-copilot-eval 1.6.0-preview.1 → 1.7.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -1
- package/package.json +2 -2
- package/schema/v1/eval-document.schema.json +144 -333
- package/schema/v1/examples/invalid/error-result-with-score.json +16 -0
- package/schema/v1/examples/invalid/missing-error-on-error.json +13 -0
- package/schema/v1/examples/valid/multi-turn-output.json +2 -0
- package/schema/v1/examples/valid/scenarios-with-mixed-errors.json +239 -0
- package/src/clients/cli/common.py +8 -14
- package/src/clients/cli/error_messages.py +91 -0
- package/src/clients/cli/evaluation_runner.py +108 -97
- package/src/clients/cli/evaluator_resolver.py +8 -33
- package/src/clients/cli/generate_report.py +125 -96
- package/src/clients/cli/readme.md +1 -1
- package/src/clients/cli/result_writer.py +129 -110
- package/src/clients/cli/status_derivation.py +91 -0
- package/src/clients/node-js/config/default.js +1 -1
- package/src/clients/node-js/lib/env-loader.js +20 -13
package/README.md
CHANGED
|
@@ -62,6 +62,7 @@ ATK projects already check in `.env.local` with agent configuration. **Do not pu
|
|
|
62
62
|
# .env.local (checked in — no secrets!)
|
|
63
63
|
# Already present from ATK:
|
|
64
64
|
M365_TITLE_ID="T_your-title-id-here" # Auto-generated by ATK
|
|
65
|
+
TEAMS_APP_TENANT_ID="your-tenant-id" # Auto-generated by ATK
|
|
65
66
|
```
|
|
66
67
|
|
|
67
68
|
```bash
|
|
@@ -70,7 +71,6 @@ AZURE_AI_OPENAI_ENDPOINT="<your-azure-openai-endpoint>"
|
|
|
70
71
|
AZURE_AI_API_KEY="<your-api-key-from-azure-portal>"
|
|
71
72
|
AZURE_AI_API_VERSION="2024-12-01-preview" # default
|
|
72
73
|
AZURE_AI_MODEL_NAME="gpt-4o-mini" # recommended
|
|
73
|
-
TENANT_ID="<your-tenant-id>"
|
|
74
74
|
```
|
|
75
75
|
|
|
76
76
|
Add `.env.local.user` to your `.gitignore`:
|
|
@@ -110,6 +110,9 @@ Now that you know what's needed, here's how to get the required values:
|
|
|
110
110
|
|
|
111
111
|
Your Azure Active Directory (AAD) tenant ID.
|
|
112
112
|
|
|
113
|
+
- If you have created your agent using Agents Toolkit, the tool automatically reads `TEAMS_APP_TENANT_ID` from `.env.local` and uses it as the tenant ID. No additional configuration is needed.
|
|
114
|
+
- For non-ATK projects, set `TENANT_ID` in your env file.
|
|
115
|
+
|
|
113
116
|
**How to obtain:**
|
|
114
117
|
|
|
115
118
|
1. Go to [Azure Portal](https://portal.azure.com)
|
package/package.json
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@microsoft/m365-copilot-eval",
|
|
3
|
-
"version": "1.6.0-preview.1",
|
|
3
|
+
"version": "1.7.0-preview.1",
|
|
4
4
|
"minCliVersion": "1.0.1-preview.1",
|
|
5
5
|
"description": "Zero-config Node.js wrapper for M365 Copilot Agent Evaluations CLI (Python-based Azure AI Evaluation SDK)",
|
|
6
|
-
"publishDate": "2026-05-
|
|
6
|
+
"publishDate": "2026-05-14",
|
|
7
7
|
"main": "src/clients/node-js/lib/index.js",
|
|
8
8
|
"type": "module",
|
|
9
9
|
"bin": {
|
|
package/schema/v1/eval-document.schema.json
CHANGED
|
@@ -15,8 +15,8 @@
|
|
|
15
15
|
"schemaVersion": {
|
|
16
16
|
"type": "string",
|
|
17
17
|
"pattern": "^1\\.\\d+\\.\\d+$",
|
|
18
|
-
"description": "SemVer string identifying the schema version this document conforms to (e.g., '1.
|
|
19
|
-
"examples": ["1.0.0", "1.1.0", "1.2.0"]
|
|
18
|
+
"description": "SemVer string identifying the schema version this document conforms to (e.g., '1.4.0')",
|
|
19
|
+
"examples": ["1.0.0", "1.1.0", "1.2.0", "1.3.0", "1.4.0"]
|
|
20
20
|
},
|
|
21
21
|
"metadata": {
|
|
22
22
|
"$ref": "#/$defs/DocumentMetadata"
|
|
@@ -43,52 +43,16 @@
|
|
|
43
43
|
"description": "Optional metadata about the evaluation document",
|
|
44
44
|
"additionalProperties": true,
|
|
45
45
|
"properties": {
|
|
46
|
-
"name":
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
},
|
|
50
|
-
"description":
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
},
|
|
54
|
-
"
|
|
55
|
-
|
|
56
|
-
"format": "date-time",
|
|
57
|
-
"description": "ISO 8601 timestamp when the document was created"
|
|
58
|
-
},
|
|
59
|
-
"createdBy": {
|
|
60
|
-
"type": "string",
|
|
61
|
-
"description": "Author or system that created the document"
|
|
62
|
-
},
|
|
63
|
-
"evaluatedAt": {
|
|
64
|
-
"type": "string",
|
|
65
|
-
"format": "date-time",
|
|
66
|
-
"description": "ISO 8601 timestamp when evaluation was performed"
|
|
67
|
-
},
|
|
68
|
-
"tags": {
|
|
69
|
-
"type": "array",
|
|
70
|
-
"items": {
|
|
71
|
-
"type": "string"
|
|
72
|
-
},
|
|
73
|
-
"description": "Tags for categorization and filtering"
|
|
74
|
-
},
|
|
75
|
-
"agentId": {
|
|
76
|
-
"type": "string",
|
|
77
|
-
"description": "M365 Agent ID this evaluation targets"
|
|
78
|
-
},
|
|
79
|
-
"agentName": {
|
|
80
|
-
"type": "string",
|
|
81
|
-
"description": "Name of the M365 agent this evaluation targets"
|
|
82
|
-
},
|
|
83
|
-
"cliVersion": {
|
|
84
|
-
"type": "string",
|
|
85
|
-
"description": "Version of the M365 Copilot Agent Evals CLI that produced this document"
|
|
86
|
-
},
|
|
87
|
-
"extensions": {
|
|
88
|
-
"type": "object",
|
|
89
|
-
"additionalProperties": true,
|
|
90
|
-
"description": "Extension point for custom metadata. Use reverse-domain notation for field names."
|
|
91
|
-
}
|
|
46
|
+
"name": { "type": "string", "description": "Human-readable name for the evaluation set" },
|
|
47
|
+
"description": { "type": "string", "description": "Description of what this evaluation set tests" },
|
|
48
|
+
"createdAt": { "type": "string", "format": "date-time", "description": "ISO 8601 timestamp when the document was created" },
|
|
49
|
+
"createdBy": { "type": "string", "description": "Author or system that created the document" },
|
|
50
|
+
"evaluatedAt": { "type": "string", "format": "date-time", "description": "ISO 8601 timestamp when evaluation was performed" },
|
|
51
|
+
"tags": { "type": "array", "items": { "type": "string" }, "description": "Tags for categorization and filtering" },
|
|
52
|
+
"agentId": { "type": "string", "description": "M365 Agent ID this evaluation targets" },
|
|
53
|
+
"agentName": { "type": "string", "description": "Name of the M365 agent this evaluation targets" },
|
|
54
|
+
"cliVersion": { "type": "string", "description": "Version of the M365 Copilot Agent Evals CLI that produced this document" },
|
|
55
|
+
"extensions": { "type": "object", "additionalProperties": true, "description": "Extension point for custom metadata. Use reverse-domain notation for field names." }
|
|
92
56
|
}
|
|
93
57
|
},
|
|
94
58
|
"SingleTurnEvaluation": {
|
|
@@ -97,52 +61,26 @@
|
|
|
97
61
|
"required": ["prompt"],
|
|
98
62
|
"additionalProperties": false,
|
|
99
63
|
"properties": {
|
|
100
|
-
"prompt":
|
|
101
|
-
|
|
102
|
-
"
|
|
103
|
-
|
|
104
|
-
},
|
|
105
|
-
"
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
"response": {
|
|
110
|
-
"type": "string",
|
|
111
|
-
"description": "Actual response from the agent"
|
|
112
|
-
},
|
|
113
|
-
"context": {
|
|
114
|
-
"type": "string",
|
|
115
|
-
"description": "Additional context for grounding evaluation"
|
|
116
|
-
},
|
|
117
|
-
"evaluators": {
|
|
118
|
-
"$ref": "#/$defs/EvaluatorMap",
|
|
119
|
-
"description": "Per-prompt evaluator overrides"
|
|
120
|
-
},
|
|
121
|
-
"evaluators_mode": {
|
|
64
|
+
"prompt": { "type": "string", "minLength": 1, "description": "The input prompt to evaluate" },
|
|
65
|
+
"expected_response": { "type": "string", "description": "Expected or ideal response for comparison during evaluation" },
|
|
66
|
+
"response": { "type": "string", "description": "Actual response from the agent" },
|
|
67
|
+
"context": { "type": "string", "description": "Additional context for grounding evaluation" },
|
|
68
|
+
"evaluators": { "$ref": "#/$defs/EvaluatorMap", "description": "Per-prompt evaluator overrides" },
|
|
69
|
+
"evaluators_mode": { "type": "string", "enum": ["extend", "replace"], "default": "extend", "description": "How per-prompt evaluators combine with defaults" },
|
|
70
|
+
"citations": { "type": "array", "items": { "$ref": "#/$defs/Citation" }, "description": "Citations included in the response" },
|
|
71
|
+
"scores": { "$ref": "#/$defs/ScoreCollection" },
|
|
72
|
+
"status": {
|
|
122
73
|
"type": "string",
|
|
123
|
-
"enum": ["
|
|
124
|
-
"
|
|
125
|
-
"description": "How per-prompt evaluators combine with defaults"
|
|
126
|
-
},
|
|
127
|
-
"citations": {
|
|
128
|
-
"type": "array",
|
|
129
|
-
"items": {
|
|
130
|
-
"$ref": "#/$defs/Citation"
|
|
131
|
-
},
|
|
132
|
-
"description": "Citations included in the response"
|
|
74
|
+
"enum": ["pass", "fail", "partial", "error"],
|
|
75
|
+
"description": "Overall status of this item."
|
|
133
76
|
},
|
|
134
|
-
"
|
|
135
|
-
"$ref": "#/$defs/
|
|
77
|
+
"error": {
|
|
78
|
+
"$ref": "#/$defs/ErrorObject",
|
|
79
|
+
"description": "Error details for this item, if any."
|
|
136
80
|
},
|
|
137
|
-
"extensions":
|
|
138
|
-
"type": "object",
|
|
139
|
-
"additionalProperties": true,
|
|
140
|
-
"description": "Extension point for custom item-level fields"
|
|
141
|
-
}
|
|
81
|
+
"extensions": { "type": "object", "additionalProperties": true, "description": "Extension point for custom item-level fields" }
|
|
142
82
|
},
|
|
143
|
-
"not": {
|
|
144
|
-
"required": ["turns"]
|
|
145
|
-
}
|
|
83
|
+
"not": { "required": ["turns"] }
|
|
146
84
|
},
|
|
147
85
|
"MultiTurnThread": {
|
|
148
86
|
"type": "object",
|
|
@@ -150,38 +88,14 @@
|
|
|
150
88
|
"required": ["turns"],
|
|
151
89
|
"additionalProperties": false,
|
|
152
90
|
"properties": {
|
|
153
|
-
"name":
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
},
|
|
157
|
-
"description":
|
|
158
|
-
|
|
159
|
-
"description": "Description of what this thread tests"
|
|
160
|
-
},
|
|
161
|
-
"turns": {
|
|
162
|
-
"type": "array",
|
|
163
|
-
"minItems": 1,
|
|
164
|
-
"maxItems": 20,
|
|
165
|
-
"items": { "$ref": "#/$defs/Turn" },
|
|
166
|
-
"description": "Ordered array of conversation turns"
|
|
167
|
-
},
|
|
168
|
-
"conversation_id": {
|
|
169
|
-
"type": "string",
|
|
170
|
-
"description": "Unique identifier for this conversation thread"
|
|
171
|
-
},
|
|
172
|
-
"summary": {
|
|
173
|
-
"$ref": "#/$defs/ThreadSummary",
|
|
174
|
-
"description": "Aggregate statistics for the thread"
|
|
175
|
-
},
|
|
176
|
-
"extensions": {
|
|
177
|
-
"type": "object",
|
|
178
|
-
"additionalProperties": true,
|
|
179
|
-
"description": "Extension point for custom thread-level fields"
|
|
180
|
-
}
|
|
91
|
+
"name": { "type": "string", "description": "Human-readable name for the thread" },
|
|
92
|
+
"description": { "type": "string", "description": "Description of what this thread tests" },
|
|
93
|
+
"turns": { "type": "array", "minItems": 1, "maxItems": 20, "items": { "$ref": "#/$defs/Turn" }, "description": "Ordered array of conversation turns" },
|
|
94
|
+
"conversation_id": { "type": "string", "description": "Unique identifier for this conversation thread" },
|
|
95
|
+
"summary": { "$ref": "#/$defs/ThreadSummary", "description": "Aggregate statistics for the thread" },
|
|
96
|
+
"extensions": { "type": "object", "additionalProperties": true, "description": "Extension point for custom thread-level fields" }
|
|
181
97
|
},
|
|
182
|
-
"not": {
|
|
183
|
-
"required": ["prompt"]
|
|
184
|
-
}
|
|
98
|
+
"not": { "required": ["prompt"] }
|
|
185
99
|
},
|
|
186
100
|
"Turn": {
|
|
187
101
|
"type": "object",
|
|
@@ -189,237 +103,159 @@
|
|
|
189
103
|
"required": ["prompt"],
|
|
190
104
|
"additionalProperties": false,
|
|
191
105
|
"properties": {
|
|
192
|
-
"prompt":
|
|
193
|
-
|
|
194
|
-
"
|
|
195
|
-
|
|
196
|
-
},
|
|
197
|
-
"
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
},
|
|
201
|
-
"response": {
|
|
202
|
-
"type": "string",
|
|
203
|
-
"description": "Actual agent response"
|
|
204
|
-
},
|
|
205
|
-
"context": {
|
|
206
|
-
"type": "string",
|
|
207
|
-
"description": "Additional context for grounding evaluation"
|
|
208
|
-
},
|
|
209
|
-
"evaluators": {
|
|
210
|
-
"$ref": "#/$defs/EvaluatorMap",
|
|
211
|
-
"description": "Per-turn evaluator overrides"
|
|
212
|
-
},
|
|
213
|
-
"evaluators_mode": {
|
|
214
|
-
"type": "string",
|
|
215
|
-
"enum": ["extend", "replace"],
|
|
216
|
-
"default": "extend",
|
|
217
|
-
"description": "How per-turn evaluators combine with defaults"
|
|
218
|
-
},
|
|
219
|
-
"citations": {
|
|
220
|
-
"type": "array",
|
|
221
|
-
"items": {
|
|
222
|
-
"$ref": "#/$defs/Citation"
|
|
223
|
-
},
|
|
224
|
-
"description": "Citations included in the response"
|
|
225
|
-
},
|
|
226
|
-
"scores": {
|
|
227
|
-
"$ref": "#/$defs/ScoreCollection"
|
|
228
|
-
},
|
|
106
|
+
"prompt": { "type": "string", "minLength": 1, "description": "The user message for this turn" },
|
|
107
|
+
"expected_response": { "type": "string", "description": "Expected agent response for this turn" },
|
|
108
|
+
"response": { "type": "string", "description": "Actual agent response" },
|
|
109
|
+
"context": { "type": "string", "description": "Additional context for grounding evaluation" },
|
|
110
|
+
"evaluators": { "$ref": "#/$defs/EvaluatorMap", "description": "Per-turn evaluator overrides" },
|
|
111
|
+
"evaluators_mode": { "type": "string", "enum": ["extend", "replace"], "default": "extend", "description": "How per-turn evaluators combine with defaults" },
|
|
112
|
+
"citations": { "type": "array", "items": { "$ref": "#/$defs/Citation" }, "description": "Citations included in the response" },
|
|
113
|
+
"scores": { "$ref": "#/$defs/ScoreCollection" },
|
|
229
114
|
"status": {
|
|
230
115
|
"type": "string",
|
|
231
|
-
"enum": ["pass", "fail", "error"],
|
|
232
|
-
"description": "Overall status of this turn"
|
|
116
|
+
"enum": ["pass", "fail", "partial", "error"],
|
|
117
|
+
"description": "Overall status of this turn."
|
|
233
118
|
},
|
|
234
119
|
"error": {
|
|
235
|
-
"
|
|
236
|
-
"description": "Error
|
|
120
|
+
"$ref": "#/$defs/ErrorObject",
|
|
121
|
+
"description": "Error details for this turn, if any."
|
|
237
122
|
},
|
|
238
|
-
"extensions":
|
|
239
|
-
"type": "object",
|
|
240
|
-
"additionalProperties": true,
|
|
241
|
-
"description": "Extension point for custom turn-level fields"
|
|
242
|
-
}
|
|
123
|
+
"extensions": { "type": "object", "additionalProperties": true, "description": "Extension point for custom turn-level fields" }
|
|
243
124
|
}
|
|
244
125
|
},
|
|
245
126
|
"ThreadSummary": {
|
|
246
127
|
"type": "object",
|
|
247
|
-
"description": "Aggregate statistics for a thread",
|
|
248
|
-
"required": ["turns_total", "turns_passed", "turns_failed", "overall_status"],
|
|
128
|
+
"description": "Aggregate statistics for a thread.",
|
|
129
|
+
"required": ["turns_total", "turns_passed", "turns_failed", "turns_partial", "turns_errored", "overall_status"],
|
|
249
130
|
"additionalProperties": false,
|
|
250
131
|
"properties": {
|
|
251
|
-
"turns_total":
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
},
|
|
256
|
-
"turns_passed": {
|
|
257
|
-
"type": "integer",
|
|
258
|
-
"minimum": 0,
|
|
259
|
-
"description": "Number of turns where all evaluators passed"
|
|
260
|
-
},
|
|
261
|
-
"turns_failed": {
|
|
262
|
-
"type": "integer",
|
|
263
|
-
"minimum": 0,
|
|
264
|
-
"description": "Number of turns where any evaluator failed"
|
|
265
|
-
},
|
|
132
|
+
"turns_total": { "type": "integer", "minimum": 1, "description": "Total number of turns executed" },
|
|
133
|
+
"turns_passed": { "type": "integer", "minimum": 0, "description": "Count of turns with status='pass'" },
|
|
134
|
+
"turns_failed": { "type": "integer", "minimum": 0, "description": "Count of turns with status='fail'" },
|
|
135
|
+
"turns_partial": { "type": "integer", "minimum": 0, "description": "Count of turns with status='partial'" },
|
|
136
|
+
"turns_errored": { "type": "integer", "minimum": 0, "description": "Count of turns with status='error'" },
|
|
266
137
|
"overall_status": {
|
|
267
138
|
"type": "string",
|
|
268
|
-
"enum": ["pass", "partial", "
|
|
269
|
-
"description": "
|
|
139
|
+
"enum": ["pass", "fail", "partial", "error"],
|
|
140
|
+
"description": "Overall status of the thread."
|
|
270
141
|
}
|
|
271
142
|
}
|
|
272
143
|
},
|
|
273
144
|
"ScoreCollection": {
|
|
274
145
|
"type": "object",
|
|
275
|
-
"description": "Collection of evaluation scores for an item",
|
|
146
|
+
"description": "Collection of evaluation scores for an item. Each entry is either a valid result (ValidScore variants) or an errored record (ErroredScore) under the discriminated oneOf.",
|
|
276
147
|
"additionalProperties": true,
|
|
277
148
|
"properties": {
|
|
278
|
-
"relevance":
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
},
|
|
282
|
-
"
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
},
|
|
286
|
-
"groundedness": {
|
|
287
|
-
"$ref": "#/$defs/EvalScore",
|
|
288
|
-
"description": "Groundedness score (1-5)"
|
|
289
|
-
},
|
|
290
|
-
"similarity": {
|
|
291
|
-
"$ref": "#/$defs/EvalScore",
|
|
292
|
-
"description": "Similarity score (1-5)"
|
|
293
|
-
},
|
|
294
|
-
"citations": {
|
|
295
|
-
"$ref": "#/$defs/CitationScore",
|
|
296
|
-
"description": "Citation evaluation results"
|
|
297
|
-
},
|
|
298
|
-
"exactMatch": {
|
|
299
|
-
"$ref": "#/$defs/ExactMatchScore",
|
|
300
|
-
"description": "Exact match evaluation result"
|
|
301
|
-
},
|
|
302
|
-
"partialMatch": {
|
|
303
|
-
"$ref": "#/$defs/PartialMatchScore",
|
|
304
|
-
"description": "Partial match evaluation result"
|
|
305
|
-
}
|
|
149
|
+
"relevance": { "$ref": "#/$defs/EvalScore", "description": "Relevance score (1-5) or errored entry" },
|
|
150
|
+
"coherence": { "$ref": "#/$defs/EvalScore", "description": "Coherence score (1-5) or errored entry" },
|
|
151
|
+
"groundedness": { "$ref": "#/$defs/EvalScore", "description": "Groundedness score (1-5) or errored entry" },
|
|
152
|
+
"similarity": { "$ref": "#/$defs/EvalScore", "description": "Similarity score (1-5) or errored entry" },
|
|
153
|
+
"citations": { "$ref": "#/$defs/CitationScore", "description": "Citation evaluation result or errored entry" },
|
|
154
|
+
"exactMatch": { "$ref": "#/$defs/ExactMatchScore", "description": "Exact match evaluation result or errored entry" },
|
|
155
|
+
"partialMatch": { "$ref": "#/$defs/PartialMatchScore", "description": "Partial match evaluation result or errored entry" }
|
|
306
156
|
}
|
|
307
157
|
},
|
|
308
158
|
"EvalScore": {
|
|
159
|
+
"description": "Standard evaluation score (1-5 scale) — valid result OR errored-evaluator record.",
|
|
160
|
+
"oneOf": [
|
|
161
|
+
{ "$ref": "#/$defs/EvalScoreValid" },
|
|
162
|
+
{ "$ref": "#/$defs/ErroredScore" }
|
|
163
|
+
]
|
|
164
|
+
},
|
|
165
|
+
"EvalScoreValid": {
|
|
309
166
|
"type": "object",
|
|
310
|
-
"description": "
|
|
167
|
+
"description": "Valid 1-5 score result. Required when result is pass or fail.",
|
|
311
168
|
"required": ["score", "result", "threshold"],
|
|
312
169
|
"additionalProperties": true,
|
|
313
170
|
"properties": {
|
|
314
|
-
"score":
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
"description": "Numeric score from 1.0 (worst) to 5.0 (best)"
|
|
319
|
-
},
|
|
320
|
-
"result": {
|
|
321
|
-
"type": "string",
|
|
322
|
-
"enum": ["pass", "fail"],
|
|
323
|
-
"description": "Pass/fail result based on threshold comparison"
|
|
324
|
-
},
|
|
325
|
-
"threshold": {
|
|
326
|
-
"type": "number",
|
|
327
|
-
"minimum": 1,
|
|
328
|
-
"maximum": 5,
|
|
329
|
-
"description": "Threshold used for pass/fail determination"
|
|
330
|
-
},
|
|
331
|
-
"reason": {
|
|
332
|
-
"type": "string",
|
|
333
|
-
"description": "Explanation of why this score was assigned"
|
|
334
|
-
},
|
|
335
|
-
"evaluator": {
|
|
336
|
-
"type": "string",
|
|
337
|
-
"description": "Name or identifier of the evaluator that produced this score"
|
|
338
|
-
}
|
|
171
|
+
"score": { "type": "number", "minimum": 1, "maximum": 5, "description": "Numeric score from 1.0 (worst) to 5.0 (best)" },
|
|
172
|
+
"result": { "type": "string", "enum": ["pass", "fail"], "description": "Pass/fail result based on threshold comparison" },
|
|
173
|
+
"threshold": { "type": "number", "minimum": 1, "maximum": 5, "description": "Threshold used for pass/fail determination" },
|
|
174
|
+
"reason": { "type": "string", "description": "Explanation of why this score was assigned" }
|
|
339
175
|
}
|
|
340
176
|
},
|
|
341
177
|
"CitationScore": {
|
|
178
|
+
"description": "Citation-specific evaluation result — valid result OR errored-evaluator record.",
|
|
179
|
+
"oneOf": [
|
|
180
|
+
{ "$ref": "#/$defs/CitationScoreValid" },
|
|
181
|
+
{ "$ref": "#/$defs/ErroredScore" }
|
|
182
|
+
]
|
|
183
|
+
},
|
|
184
|
+
"CitationScoreValid": {
|
|
342
185
|
"type": "object",
|
|
343
|
-
"description": "
|
|
186
|
+
"description": "Valid citation-count result. Required when result is pass or fail.",
|
|
344
187
|
"required": ["count", "result", "threshold"],
|
|
345
188
|
"additionalProperties": true,
|
|
346
189
|
"properties": {
|
|
347
|
-
"count":
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
},
|
|
352
|
-
"result": {
|
|
353
|
-
"type": "string",
|
|
354
|
-
"enum": ["pass", "fail"],
|
|
355
|
-
"description": "Pass/fail result based on citation count vs threshold"
|
|
356
|
-
},
|
|
357
|
-
"threshold": {
|
|
358
|
-
"type": "integer",
|
|
359
|
-
"minimum": 0,
|
|
360
|
-
"description": "Minimum required number of citations for pass"
|
|
361
|
-
},
|
|
362
|
-
"format": {
|
|
363
|
-
"type": "string",
|
|
364
|
-
"description": "Citation format detected. Known values: 'oai_unicode', 'bracket', 'mixed'. Additional formats may be added.",
|
|
365
|
-
"examples": ["oai_unicode", "bracket", "mixed"]
|
|
366
|
-
},
|
|
367
|
-
"citations": {
|
|
368
|
-
"type": "array",
|
|
369
|
-
"items": {
|
|
370
|
-
"$ref": "#/$defs/Citation"
|
|
371
|
-
},
|
|
372
|
-
"description": "Parsed citation objects"
|
|
373
|
-
}
|
|
190
|
+
"count": { "type": "integer", "minimum": 0, "description": "Number of citations found in the response" },
|
|
191
|
+
"result": { "type": "string", "enum": ["pass", "fail"], "description": "Pass/fail result based on citation count vs threshold" },
|
|
192
|
+
"threshold": { "type": "integer", "minimum": 0, "description": "Minimum required number of citations for pass" },
|
|
193
|
+
"format": { "type": "string", "description": "Citation format detected. Known values: 'oai_unicode', 'bracket', 'mixed'.", "examples": ["oai_unicode", "bracket", "mixed"] },
|
|
194
|
+
"citations": { "type": "array", "items": { "$ref": "#/$defs/Citation" }, "description": "Parsed citation objects" }
|
|
374
195
|
}
|
|
375
196
|
},
|
|
376
197
|
"ExactMatchScore": {
|
|
198
|
+
"description": "Exact match evaluation result — valid result OR errored-evaluator record.",
|
|
199
|
+
"oneOf": [
|
|
200
|
+
{ "$ref": "#/$defs/ExactMatchScoreValid" },
|
|
201
|
+
{ "$ref": "#/$defs/ErroredScore" }
|
|
202
|
+
]
|
|
203
|
+
},
|
|
204
|
+
"ExactMatchScoreValid": {
|
|
377
205
|
"type": "object",
|
|
378
|
-
"description": "
|
|
206
|
+
"description": "Valid exact-match result. Required when result is pass or fail.",
|
|
379
207
|
"required": ["match", "result"],
|
|
380
208
|
"additionalProperties": true,
|
|
381
209
|
"properties": {
|
|
382
|
-
"match":
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
},
|
|
386
|
-
"result": {
|
|
387
|
-
"type": "string",
|
|
388
|
-
"enum": ["pass", "fail"],
|
|
389
|
-
"description": "Pass when match is true, fail otherwise"
|
|
390
|
-
},
|
|
391
|
-
"reason": {
|
|
392
|
-
"type": "string",
|
|
393
|
-
"description": "Explanation of the match result"
|
|
394
|
-
}
|
|
210
|
+
"match": { "type": "boolean", "description": "Whether response exactly matches expected_response (trimmed; case-insensitive by default)" },
|
|
211
|
+
"result": { "type": "string", "enum": ["pass", "fail"], "description": "Pass when match is true, fail otherwise" },
|
|
212
|
+
"reason": { "type": "string", "description": "Explanation of the match result" }
|
|
395
213
|
}
|
|
396
214
|
},
|
|
397
215
|
"PartialMatchScore": {
|
|
216
|
+
"description": "Partial match evaluation result — valid result OR errored-evaluator record.",
|
|
217
|
+
"oneOf": [
|
|
218
|
+
{ "$ref": "#/$defs/PartialMatchScoreValid" },
|
|
219
|
+
{ "$ref": "#/$defs/ErroredScore" }
|
|
220
|
+
]
|
|
221
|
+
},
|
|
222
|
+
"PartialMatchScoreValid": {
|
|
398
223
|
"type": "object",
|
|
399
|
-
"description": "
|
|
224
|
+
"description": "Valid partial-match result (0.0-1.0 score). Required when result is pass or fail.",
|
|
400
225
|
"required": ["score", "result", "threshold"],
|
|
401
226
|
"additionalProperties": true,
|
|
402
227
|
"properties": {
|
|
403
|
-
"score":
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
228
|
+
"score": { "type": "number", "minimum": 0, "maximum": 1, "description": "Match score from 0.0 (no match) to 1.0 (full match)" },
|
|
229
|
+
"result": { "type": "string", "enum": ["pass", "fail"], "description": "Pass/fail based on score vs threshold" },
|
|
230
|
+
"threshold": { "type": "number", "minimum": 0, "maximum": 1, "description": "Minimum score required for pass (default: 0.5)" },
|
|
231
|
+
"reason": { "type": "string", "description": "Explanation of the match result" }
|
|
232
|
+
}
|
|
233
|
+
},
|
|
234
|
+
"ErroredScore": {
|
|
235
|
+
"type": "object",
|
|
236
|
+
"description": "Per-evaluator entry for an evaluator that did not produce a result.",
|
|
237
|
+
"required": ["result", "error"],
|
|
238
|
+
"additionalProperties": false,
|
|
239
|
+
"properties": {
|
|
240
|
+
"result": { "type": "string", "const": "error", "description": "Always 'error' for this variant." },
|
|
241
|
+
"error": { "type": "string", "minLength": 1, "description": "Error message describing why the evaluator did not produce a result." }
|
|
242
|
+
}
|
|
243
|
+
},
|
|
244
|
+
"ErrorObject": {
|
|
245
|
+
"type": "object",
|
|
246
|
+
"description": "Structured turn/item-level error with a machine-readable code and a human-readable message.",
|
|
247
|
+
"required": ["code", "message"],
|
|
248
|
+
"additionalProperties": false,
|
|
249
|
+
"properties": {
|
|
250
|
+
"code": {
|
|
410
251
|
"type": "string",
|
|
411
|
-
"
|
|
412
|
-
"description": "
|
|
413
|
-
},
|
|
414
|
-
"threshold": {
|
|
415
|
-
"type": "number",
|
|
416
|
-
"minimum": 0,
|
|
417
|
-
"maximum": 1,
|
|
418
|
-
"description": "Minimum score required for pass (default: 0.5)"
|
|
252
|
+
"minLength": 1,
|
|
253
|
+
"description": "Machine-readable error category. One of: 'agentRequestFailed', 'turnSkipped', 'evaluatorsFailed'."
|
|
419
254
|
},
|
|
420
|
-
"
|
|
255
|
+
"message": {
|
|
421
256
|
"type": "string",
|
|
422
|
-
"
|
|
257
|
+
"minLength": 1,
|
|
258
|
+
"description": "Human-readable message paired with the code."
|
|
423
259
|
}
|
|
424
260
|
}
|
|
425
261
|
},
|
|
@@ -438,25 +274,10 @@
|
|
|
438
274
|
"description": "Evaluator configuration options. Use empty object {} for defaults.",
|
|
439
275
|
"additionalProperties": false,
|
|
440
276
|
"properties": {
|
|
441
|
-
"threshold":
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
}
|
|
445
|
-
"citation_format": {
|
|
446
|
-
"type": "string",
|
|
447
|
-
"examples": ["oai_unicode", "bracket", "mixed"],
|
|
448
|
-
"description": "Citation format for detection. 'oai_unicode': new OAI unicode format, 'bracket': legacy [^i^] bracket format, 'mixed': auto-detect both formats. Default: oai_unicode."
|
|
449
|
-
},
|
|
450
|
-
"case_sensitive": {
|
|
451
|
-
"type": "boolean",
|
|
452
|
-
"default": false,
|
|
453
|
-
"description": "Case-sensitive matching for ExactMatch/PartialMatch"
|
|
454
|
-
},
|
|
455
|
-
"options": {
|
|
456
|
-
"type": "object",
|
|
457
|
-
"additionalProperties": true,
|
|
458
|
-
"description": "Evaluator-specific configuration"
|
|
459
|
-
}
|
|
277
|
+
"threshold": { "type": "number", "description": "Pass/fail threshold. Range depends on evaluator type: 1-5 for LLM evaluators (default: 3), >= 1 integer for Citations (default: 1), 0.0-1.0 for PartialMatch (default: 0.5)." },
|
|
278
|
+
"citation_format": { "type": "string", "examples": ["oai_unicode", "bracket", "mixed"], "description": "Citation format for detection. Default: oai_unicode." },
|
|
279
|
+
"case_sensitive": { "type": "boolean", "default": false, "description": "Case-sensitive matching for ExactMatch/PartialMatch" },
|
|
280
|
+
"options": { "type": "object", "additionalProperties": true, "description": "Evaluator-specific configuration" }
|
|
460
281
|
}
|
|
461
282
|
},
|
|
462
283
|
"Citation": {
|
|
@@ -465,19 +286,9 @@
|
|
|
465
286
|
"required": ["index"],
|
|
466
287
|
"additionalProperties": true,
|
|
467
288
|
"properties": {
|
|
468
|
-
"index":
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
"description": "Citation index (1-based)"
|
|
472
|
-
},
|
|
473
|
-
"text": {
|
|
474
|
-
"type": "string",
|
|
475
|
-
"description": "The cited text"
|
|
476
|
-
},
|
|
477
|
-
"source": {
|
|
478
|
-
"type": "string",
|
|
479
|
-
"description": "Source reference (URL, document name, etc.)"
|
|
480
|
-
}
|
|
289
|
+
"index": { "type": "integer", "minimum": 1, "description": "Citation index (1-based)" },
|
|
290
|
+
"text": { "type": "string", "description": "The cited text" },
|
|
291
|
+
"source": { "type": "string", "description": "Source reference (URL, document name, etc.)" }
|
|
481
292
|
}
|
|
482
293
|
}
|
|
483
294
|
}
|
|
package/schema/v1/examples/invalid/error-result-with-score.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"schemaVersion": "1.4.0",
|
|
3
|
+
"items": [
|
|
4
|
+
{
|
|
5
|
+
"prompt": "What is Microsoft Graph?",
|
|
6
|
+
"scores": {
|
|
7
|
+
"relevance": {
|
|
8
|
+
"result": "error",
|
|
9
|
+
"error": "Evaluator failed: Connection timeout to Azure OpenAI endpoint",
|
|
10
|
+
"score": 0,
|
|
11
|
+
"threshold": 3
|
|
12
|
+
}
|
|
13
|
+
}
|
|
14
|
+
}
|
|
15
|
+
]
|
|
16
|
+
}
|