@microsoft/m365-copilot-eval 1.3.0-preview.1 → 1.4.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +129 -97
- package/package.json +7 -4
- package/schema/v1/eval-document.schema.json +140 -8
- package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
- package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
- package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
- package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
- package/schema/v1/examples/valid/multi-turn-output.json +59 -0
- package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
- package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
- package/schema/version.json +2 -2
- package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
- package/src/clients/cli/api_clients/A2A/a2a_client.py +456 -0
- package/src/clients/cli/api_clients/REST/__init__.py +3 -0
- package/src/clients/cli/api_clients/REST/sydney_client.py +204 -0
- package/src/clients/cli/api_clients/__init__.py +3 -0
- package/src/clients/cli/api_clients/base_agent_client.py +78 -0
- package/src/clients/cli/cli_logging/console_diagnostics.py +54 -2
- package/src/clients/cli/cli_logging/logging_utils.py +0 -1
- package/src/clients/cli/common.py +11 -0
- package/src/clients/cli/generate_report.py +272 -129
- package/src/clients/cli/main.py +1006 -476
- package/src/clients/cli/parallel_executor.py +57 -0
- package/src/clients/cli/requirements.txt +1 -1
- package/src/clients/cli/response_extractor.py +12 -14
- package/src/clients/cli/retry_policy.py +52 -0
- package/src/clients/cli/samples/multiturn_example.json +35 -0
- package/src/clients/cli/throttle_gate.py +82 -0
- package/src/clients/node-js/bin/runevals.js +79 -16
- package/src/clients/node-js/config/default.js +5 -1
- package/src/clients/node-js/lib/agent-id.js +12 -0
- package/src/clients/node-js/lib/env-loader.js +11 -16
- package/src/clients/node-js/lib/eula-manager.js +78 -0
- package/src/clients/node-js/lib/progress.js +13 -11
package/README.md
CHANGED
|
@@ -5,21 +5,28 @@
|
|
|
5
5
|
A **zero-configuration** CLI for evaluating M365 Copilot agents. Send prompts to your agent, get responses, and automatically score them with Azure AI Evaluation metrics (relevance, coherence, groundedness).
|
|
6
6
|
- Send a batch (or interactive set) of prompts to a configured chat API endpoint.
|
|
7
7
|
- Collect agent responses and evaluate them locally using Azure AI Evaluation SDK.
|
|
8
|
-
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
- -
|
|
13
|
-
- -
|
|
8
|
+
- The CLI supports 7 evaluator types. Evaluators marked with ⭐ are **enabled by default**.
|
|
9
|
+
|
|
10
|
+
| Evaluator | Type | Scale | Default Threshold | Default |
|
|
11
|
+
|-----------|------|-------|-------------------|---------|
|
|
12
|
+
| **Relevance** ⭐ | LLM-based | 1-5 | 3 | Yes |
|
|
13
|
+
| **Coherence** ⭐ | LLM-based | 1-5 | 3 | Yes |
|
|
14
|
+
| **Groundedness** | LLM-based | 1-5 | 3 | No |
|
|
15
|
+
| **ToolCallAccuracy** | LLM-based | 1-5 | 3 | No |
|
|
16
|
+
| **Citations** | Count-based | >= 0 | 1 | No |
|
|
17
|
+
| **ExactMatch** | String match | boolean | N/A | No |
|
|
18
|
+
| **PartialMatch** | String match | 0.0-1.0 | 0.5 | No |
|
|
14
19
|
- Multiple input modes: command‑line list, JSON file, interactive.
|
|
15
20
|
- Multiple output formats: console (colorized), JSON, CSV, HTML (auto‑opens report).
|
|
16
21
|
|
|
17
22
|
## 📋 Prerequisites
|
|
18
23
|
|
|
24
|
+
- **M365 Copilot License** for your tenant
|
|
19
25
|
- **M365 Copilot Agent** deployed to your tenant (can be created with [M365 Agents Toolkit](https://learn.microsoft.com/en-us/microsoft-365/developer/overview-m365-agents-toolkit) or any other method)
|
|
20
26
|
- **Node.js 24.12.0+** (check: `node --version`)
|
|
21
27
|
- **Environment file** with your credentials and agent ID (see [Environment Setup](#-environment-setup) below)
|
|
22
|
-
- **Your Tenant ID
|
|
28
|
+
- **Your Tenant ID** - get your tenant ID using the instructions [here](https://learn.microsoft.com/en-us/azure/azure-portal/get-subscription-tenant-id)
|
|
29
|
+
- **Azure OpenAI endpoint and API key** (see [Getting Variables](#-getting-variables) below)
|
|
23
30
|
|
|
24
31
|
> Note: Authentication is currently supported on Windows only. Support for other operating systems is coming soon.
|
|
25
32
|
|
|
@@ -123,16 +130,29 @@ You need both the endpoint URL and API key from your Azure OpenAI resource for "
|
|
|
123
130
|
**How to obtain:**
|
|
124
131
|
|
|
125
132
|
1. Go to [Azure Portal](https://portal.azure.com)
|
|
126
|
-
2.
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
133
|
+
2. In the Azure Portal, search for "OpenAI" in the search bar and select Azure OpenAI.
|
|
134
|
+

|
|
135
|
+
3. Once you have selected Azure OpenAI, create an AI Foundry resource.
|
|
136
|
+

|
|
137
|
+
4. On the Create Foundry Resource page, fill in the details and click 'Review + Create'.
|
|
138
|
+

|
|
139
|
+
5. Once the resource is deployed, go to the Foundry portal.
|
|
140
|
+

|
|
141
|
+
6. At this point, you should be able to deploy an LLM model.
|
|
142
|
+
7. Select Models + Endpoints on the left rail
|
|
143
|
+

|
|
144
|
+
8. Select Deploy Model -> Deploy base model (we recommend the gpt-4o-mini model)
|
|
145
|
+

|
|
146
|
+
9. Select Confirm, then select Customize
|
|
147
|
+

|
|
148
|
+
10. Click on Customize and change the capacity to 50K tokens per minute
|
|
149
|
+

|
|
150
|
+

|
|
151
|
+
11. Hit deploy and wait for a few minutes for the model to deploy.
|
|
152
|
+
12. Once the deployment finishes, you are redirected to a page showing the API endpoint and API key.
|
|
153
|
+
13. Copy the following values from that page.
|
|
154
|
+

|
|
155
|
+
14. Add all of these values to your `.env.dev` file as shown in the [Setup Steps](#setup-steps) above
|
|
136
156
|
|
|
137
157
|
**Required model:** Ensure you have `gpt-4o-mini` (or similar) deployed in your Azure OpenAI resource.
|
|
138
158
|
|
|
@@ -165,64 +185,110 @@ runevals --env dev
|
|
|
165
185
|
|
|
166
186
|
---
|
|
167
187
|
|
|
168
|
-
## 📝
|
|
188
|
+
## 📝 Eval Document Format
|
|
169
189
|
|
|
170
|
-
The
|
|
190
|
+
The eval document schema is versioned independently from the CLI, following [Semantic Versioning](https://semver.org/).
|
|
171
191
|
|
|
172
|
-
|
|
192
|
+
- **Schema location**: [`schema/v1/eval-document.schema.json`](schema/v1/eval-document.schema.json)
|
|
193
|
+
- **Schema changelog**: [`schema/CHANGELOG.md`](schema/CHANGELOG.md)
|
|
194
|
+
|
|
195
|
+
> **New in Schema v1.2.0**: Multi-turn conversation threads — test context persistence across multiple turns within a shared conversation session. Each thread supports 1-20 turns.
|
|
196
|
+
|
|
197
|
+
> **New in Schema v1.1.0**: Per-prompt evaluator overrides with `evaluators_mode` (`extend`/`replace`), file-level `default_evaluators`, and `ExactMatch`/`PartialMatch` evaluators.
|
|
198
|
+
|
|
199
|
+
### Getting Started
|
|
173
200
|
|
|
174
|
-
When you run `runevals`, it searches:
|
|
201
|
+
The CLI auto-discovers prompts files in your project. When you run `runevals`, it searches:
|
|
175
202
|
1. Current directory: `prompts.json`, `evals.json`, `tests.json`
|
|
176
203
|
2. `./evals/` subdirectory: `prompts.json`, `evals.json`, `tests.json`
|
|
177
204
|
|
|
178
|
-
**
|
|
179
|
-
```
|
|
180
|
-
my-agent/
|
|
181
|
-
├── .env.local # Your credentials
|
|
182
|
-
├── evals/
|
|
183
|
-
│ └── evals.json # Your test prompts (auto-discovered!)
|
|
184
|
-
└── .evals/
|
|
185
|
-
└── 2025-12-03_14-30-45.html # Generated reports
|
|
186
|
-
```
|
|
205
|
+
**No prompts file?** The CLI will offer to create a starter file with example prompts for you.
|
|
187
206
|
|
|
188
|
-
|
|
207
|
+
A minimal eval document:
|
|
189
208
|
|
|
190
|
-
|
|
209
|
+
```json
|
|
210
|
+
{
|
|
211
|
+
"schemaVersion": "1.2.0",
|
|
212
|
+
"items": [
|
|
213
|
+
{
|
|
214
|
+
"prompt": "What is Microsoft 365?",
|
|
215
|
+
"expected_response": "Microsoft 365 is a cloud-based productivity suite..."
|
|
216
|
+
}
|
|
217
|
+
]
|
|
218
|
+
}
|
|
191
219
|
```
|
|
192
|
-
⚠️ No prompts file found in current directory or ./evals/
|
|
193
220
|
|
|
194
|
-
|
|
195
|
-
```
|
|
221
|
+
### Evaluator Configuration
|
|
196
222
|
|
|
197
|
-
|
|
223
|
+
Use `default_evaluators` to set file-level defaults, and per-item `evaluators` with `evaluators_mode` to customize:
|
|
198
224
|
|
|
199
225
|
```json
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
"
|
|
226
|
+
{
|
|
227
|
+
"schemaVersion": "1.2.0",
|
|
228
|
+
"default_evaluators": {
|
|
229
|
+
"Relevance": {},
|
|
230
|
+
"Coherence": {}
|
|
204
231
|
},
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
232
|
+
"items": [
|
|
233
|
+
{
|
|
234
|
+
"prompt": "What is Microsoft Graph?",
|
|
235
|
+
"expected_response": "A unified API endpoint for Microsoft services.",
|
|
236
|
+
"evaluators": {
|
|
237
|
+
"Citations": { "citation_format": "mixed" }
|
|
238
|
+
},
|
|
239
|
+
"evaluators_mode": "extend"
|
|
240
|
+
},
|
|
241
|
+
{
|
|
242
|
+
"name": "Expense policy flow",
|
|
243
|
+
"turns": [
|
|
244
|
+
{
|
|
245
|
+
"prompt": "I spent $250 on dinner. Is that okay?",
|
|
246
|
+
"expected_response": "The per-diem meal allowance is $200."
|
|
247
|
+
},
|
|
248
|
+
{
|
|
249
|
+
"prompt": "What should I do about the overage?",
|
|
250
|
+
"expected_response": "Request manager approval.",
|
|
251
|
+
"evaluators": {
|
|
252
|
+
"ExactMatch": { "case_sensitive": false }
|
|
253
|
+
},
|
|
254
|
+
"evaluators_mode": "replace"
|
|
255
|
+
}
|
|
256
|
+
]
|
|
257
|
+
}
|
|
258
|
+
]
|
|
259
|
+
}
|
|
210
260
|
```
|
|
211
261
|
|
|
212
|
-
|
|
262
|
+
**How evaluator modes work in this example:**
|
|
213
263
|
|
|
214
|
-
|
|
264
|
+
| Item | `evaluators_mode` | Active Evaluators | Why |
|
|
265
|
+
|------|-------------------|-------------------|-----|
|
|
266
|
+
| Single-turn (Graph) | `extend` | Relevance, Coherence, Citations | Per-prompt Citations **merged** with defaults |
|
|
267
|
+
| Multi-turn turn 1 (dinner) | _(none)_ | Relevance, Coherence | **Inherits** file-level defaults |
|
|
268
|
+
| Multi-turn turn 2 (overage) | `replace` | ExactMatch | Per-turn ExactMatch **replaces** defaults entirely |
|
|
215
269
|
|
|
216
|
-
|
|
270
|
+
### Evaluator Modes
|
|
217
271
|
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
272
|
+
| Mode | Behavior |
|
|
273
|
+
|------|----------|
|
|
274
|
+
| `"extend"` (default) | Per-item evaluators **merge** with defaults. Both run. |
|
|
275
|
+
| `"replace"` | Per-item evaluators **replace** defaults entirely. Only per-item evaluators run. |
|
|
276
|
+
| _(none)_ | Inherits file-level `default_evaluators`, or system defaults (Relevance, Coherence) if not set. |
|
|
277
|
+
|
|
278
|
+
See `schema/v1/examples/` in the package for more examples including per-turn evaluator overrides, mixed single/multi-turn files, and output format.
|
|
279
|
+
|
|
280
|
+
### Auto-Upgrade Behavior
|
|
281
|
+
|
|
282
|
+
When the CLI loads an eval document:
|
|
283
|
+
|
|
284
|
+
- **Legacy documents** (missing `schemaVersion`): Automatically upgraded with a timestamped backup (e.g., `file.json.bak.20260205143052`)
|
|
285
|
+
- **Older versions** (same major version): `schemaVersion` field updated without backup
|
|
286
|
+
- **Invalid documents**: CLI exits with an error message and guidance to review the schema changelog
|
|
287
|
+
- **Future versions**: CLI rejects with a message suggesting a CLI update
|
|
288
|
+
|
|
289
|
+
### Version Compatibility
|
|
290
|
+
|
|
291
|
+
Within a major version (e.g., 1.x.x), we aim to maintain backward compatibility for documents that conform to the published schema for their version. Compatibility does not extend to undeclared or ad-hoc fields outside the schema definition; review the [schema changelog](schema/CHANGELOG.md) when upgrading between minor versions.
|
|
226
292
|
|
|
227
293
|
## 🎯 Usage Examples
|
|
228
294
|
|
|
@@ -271,6 +337,10 @@ runevals --log-level info
|
|
|
271
337
|
runevals --log-level warning
|
|
272
338
|
runevals --log-level error
|
|
273
339
|
|
|
340
|
+
# Parallel prompt execution control
|
|
341
|
+
runevals --concurrency 5 --prompts-file ./evals/evals.json
|
|
342
|
+
runevals --concurrency 1000 --prompts-file ./evals/evals.json # Python CLI clamps to 5
|
|
343
|
+
|
|
274
344
|
# Custom output location in your project
|
|
275
345
|
runevals --output ./reports/results.html
|
|
276
346
|
```
|
|
@@ -309,10 +379,9 @@ npm run eval:dev
|
|
|
309
379
|
## 📊 Output Formats
|
|
310
380
|
|
|
311
381
|
Results are automatically saved to `./evals/YYYY-MM-DD_HH-MM-SS.html` with:
|
|
312
|
-
-
|
|
313
|
-
-
|
|
314
|
-
-
|
|
315
|
-
- Per-prompt details and aggregate metrics
|
|
382
|
+
- Per-prompt and per-turn evaluation scores from configured evaluators
|
|
383
|
+
- Aggregate statistics across all evaluated items
|
|
384
|
+
- Multi-turn thread summaries (turns passed/failed, overall status)
|
|
316
385
|
|
|
317
386
|
Other formats:
|
|
318
387
|
```bash
|
|
@@ -410,43 +479,6 @@ This project has adopted the [Microsoft Open Source Code of Conduct](https://ope
|
|
|
410
479
|
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
|
|
411
480
|
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
|
|
412
481
|
|
|
413
|
-
## Schema Versioning
|
|
414
|
-
|
|
415
|
-
The eval document schema is versioned independently from the CLI, following [Semantic Versioning](https://semver.org/). This allows external consumers to depend on a stable contract without coupling to CLI release cycles.
|
|
416
|
-
|
|
417
|
-
- **Schema location**: [`schema/v1/eval-document.schema.json`](schema/v1/eval-document.schema.json)
|
|
418
|
-
- **Schema changelog**: [`schema/CHANGELOG.md`](schema/CHANGELOG.md)
|
|
419
|
-
- **Consumer quickstart**: [`specs/wi-6081652-dataset-schema-versioning/quickstart.md`](specs/wi-6081652-dataset-schema-versioning/quickstart.md)
|
|
420
|
-
|
|
421
|
-
### Eval Document Format
|
|
422
|
-
|
|
423
|
-
Eval documents should include a `schemaVersion` field:
|
|
424
|
-
|
|
425
|
-
```json
|
|
426
|
-
{
|
|
427
|
-
"schemaVersion": "1.0.0",
|
|
428
|
-
"items": [
|
|
429
|
-
{
|
|
430
|
-
"prompt": "What is Microsoft 365?",
|
|
431
|
-
"expected_response": "Microsoft 365 is a cloud-based productivity suite."
|
|
432
|
-
}
|
|
433
|
-
]
|
|
434
|
-
}
|
|
435
|
-
```
|
|
436
|
-
|
|
437
|
-
### Auto-Upgrade Behavior
|
|
438
|
-
|
|
439
|
-
When the CLI loads an eval document:
|
|
440
|
-
|
|
441
|
-
- **Legacy documents** (missing `schemaVersion`): Automatically upgraded with a timestamped backup (e.g., `file.json.bak.20260205143052`)
|
|
442
|
-
- **Older versions** (same major version): `schemaVersion` field updated without backup
|
|
443
|
-
- **Invalid documents**: CLI exits with an error message and guidance to review the schema changelog
|
|
444
|
-
- **Future versions**: CLI rejects with a message suggesting a CLI update
|
|
445
|
-
|
|
446
|
-
### Version Compatibility
|
|
447
|
-
|
|
448
|
-
Within a major version (e.g., 1.x.x), backward compatibility is guaranteed. Documents valid against 1.0.0 will remain valid against 1.1.0, 1.2.0, etc.
|
|
449
|
-
|
|
450
482
|
## Trademarks
|
|
451
483
|
|
|
452
484
|
This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
|
package/package.json
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@microsoft/m365-copilot-eval",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.4.0-preview.1",
|
|
4
4
|
"minCliVersion": "1.0.1-preview.1",
|
|
5
5
|
"description": "Zero-config Node.js wrapper for M365 Copilot Agent Evaluations CLI (Python-based Azure AI Evaluation SDK)",
|
|
6
|
-
"publishDate": "2026-04-
|
|
6
|
+
"publishDate": "2026-04-22",
|
|
7
7
|
"main": "src/clients/node-js/lib/index.js",
|
|
8
8
|
"type": "module",
|
|
9
9
|
"bin": {
|
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
"build": "npm run prettier:check && npm run clean && npm run lint",
|
|
15
15
|
"clean": "rimraf node_modules/.cache dist coverage",
|
|
16
16
|
"test": "node --test tests/clients/node-js/**/*.test.js",
|
|
17
|
+
"test:coverage": "c8 --reporter=cobertura --reporter=text node --test tests/clients/node-js/**/*.test.js",
|
|
17
18
|
"install-credprovider": "artifacts-npm-credprovider && npm ci",
|
|
18
19
|
"set-publish-date": "node scripts/set-publish-date.js",
|
|
19
20
|
"prepublishOnly": "node scripts/set-publish-date.js",
|
|
@@ -41,9 +42,10 @@
|
|
|
41
42
|
},
|
|
42
43
|
"dependencies": {
|
|
43
44
|
"commander": "^12.1.0",
|
|
45
|
+
"dotenv": "^16.0.0",
|
|
46
|
+
"https-proxy-agent": "^7.0.5",
|
|
44
47
|
"node-fetch": "^3.3.2",
|
|
45
|
-
"tar": "^7.5.4"
|
|
46
|
-
"https-proxy-agent": "^7.0.5"
|
|
48
|
+
"tar": "^7.5.4"
|
|
47
49
|
},
|
|
48
50
|
"devDependencies": {
|
|
49
51
|
"@microsoft/eslint-config-msgraph": "^5.0.0",
|
|
@@ -52,6 +54,7 @@
|
|
|
52
54
|
"@vitest/coverage-istanbul": "^3.0.0",
|
|
53
55
|
"@vitest/coverage-v8": "^3.0.0",
|
|
54
56
|
"@vitest/ui": "^3.0.0",
|
|
57
|
+
"c8": "^11.0.0",
|
|
55
58
|
"eslint": "^9.7.0",
|
|
56
59
|
"eslint-config-prettier": "^10.0.0",
|
|
57
60
|
"eslint-plugin-jsdoc": "^50.1.0",
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
3
|
"$id": "https://raw.githubusercontent.com/microsoft/M365-Copilot-Agent-Evals/refs/heads/main/schema/v1/eval-document.schema.json",
|
|
4
4
|
"title": "M365 Copilot Eval Document",
|
|
5
|
-
"description": "Schema for evaluation documents used by M365 Copilot Agent Evals CLI.
|
|
5
|
+
"description": "Schema for evaluation documents used by M365 Copilot Agent Evals CLI. Supports single-turn and multi-turn evaluations.",
|
|
6
6
|
"type": "object",
|
|
7
7
|
"required": ["schemaVersion", "items"],
|
|
8
8
|
"additionalProperties": true,
|
|
@@ -28,9 +28,12 @@
|
|
|
28
28
|
"items": {
|
|
29
29
|
"type": "array",
|
|
30
30
|
"minItems": 1,
|
|
31
|
-
"description": "Array of evaluation items
|
|
31
|
+
"description": "Array of evaluation items: single-turn evaluations or multi-turn threads",
|
|
32
32
|
"items": {
|
|
33
|
-
"
|
|
33
|
+
"oneOf": [
|
|
34
|
+
{ "$ref": "#/$defs/SingleTurnEvaluation" },
|
|
35
|
+
{ "$ref": "#/$defs/MultiTurnThread" }
|
|
36
|
+
]
|
|
34
37
|
}
|
|
35
38
|
}
|
|
36
39
|
},
|
|
@@ -60,7 +63,7 @@
|
|
|
60
63
|
"evaluatedAt": {
|
|
61
64
|
"type": "string",
|
|
62
65
|
"format": "date-time",
|
|
63
|
-
"description": "ISO 8601 timestamp when evaluation was performed
|
|
66
|
+
"description": "ISO 8601 timestamp when evaluation was performed"
|
|
64
67
|
},
|
|
65
68
|
"tags": {
|
|
66
69
|
"type": "array",
|
|
@@ -88,11 +91,11 @@
|
|
|
88
91
|
}
|
|
89
92
|
}
|
|
90
93
|
},
|
|
91
|
-
"
|
|
94
|
+
"SingleTurnEvaluation": {
|
|
92
95
|
"type": "object",
|
|
93
|
-
"description": "A single
|
|
96
|
+
"description": "A standalone single-turn prompt-response evaluation",
|
|
94
97
|
"required": ["prompt"],
|
|
95
|
-
"additionalProperties":
|
|
98
|
+
"additionalProperties": false,
|
|
96
99
|
"properties": {
|
|
97
100
|
"prompt": {
|
|
98
101
|
"type": "string",
|
|
@@ -105,7 +108,7 @@
|
|
|
105
108
|
},
|
|
106
109
|
"response": {
|
|
107
110
|
"type": "string",
|
|
108
|
-
"description": "Actual response from the agent
|
|
111
|
+
"description": "Actual response from the agent"
|
|
109
112
|
},
|
|
110
113
|
"context": {
|
|
111
114
|
"type": "string",
|
|
@@ -136,6 +139,135 @@
|
|
|
136
139
|
"additionalProperties": true,
|
|
137
140
|
"description": "Extension point for custom item-level fields"
|
|
138
141
|
}
|
|
142
|
+
},
|
|
143
|
+
"not": {
|
|
144
|
+
"required": ["turns"]
|
|
145
|
+
}
|
|
146
|
+
},
|
|
147
|
+
"MultiTurnThread": {
|
|
148
|
+
"type": "object",
|
|
149
|
+
"description": "A multi-turn conversation thread with ordered turns sharing conversation context",
|
|
150
|
+
"required": ["turns"],
|
|
151
|
+
"additionalProperties": false,
|
|
152
|
+
"properties": {
|
|
153
|
+
"name": {
|
|
154
|
+
"type": "string",
|
|
155
|
+
"description": "Human-readable name for the thread"
|
|
156
|
+
},
|
|
157
|
+
"description": {
|
|
158
|
+
"type": "string",
|
|
159
|
+
"description": "Description of what this thread tests"
|
|
160
|
+
},
|
|
161
|
+
"turns": {
|
|
162
|
+
"type": "array",
|
|
163
|
+
"minItems": 1,
|
|
164
|
+
"maxItems": 20,
|
|
165
|
+
"items": { "$ref": "#/$defs/Turn" },
|
|
166
|
+
"description": "Ordered array of conversation turns"
|
|
167
|
+
},
|
|
168
|
+
"conversation_id": {
|
|
169
|
+
"type": "string",
|
|
170
|
+
"description": "Unique identifier for this conversation thread"
|
|
171
|
+
},
|
|
172
|
+
"summary": {
|
|
173
|
+
"$ref": "#/$defs/ThreadSummary",
|
|
174
|
+
"description": "Aggregate statistics for the thread"
|
|
175
|
+
},
|
|
176
|
+
"extensions": {
|
|
177
|
+
"type": "object",
|
|
178
|
+
"additionalProperties": true,
|
|
179
|
+
"description": "Extension point for custom thread-level fields"
|
|
180
|
+
}
|
|
181
|
+
},
|
|
182
|
+
"not": {
|
|
183
|
+
"required": ["prompt"]
|
|
184
|
+
}
|
|
185
|
+
},
|
|
186
|
+
"Turn": {
|
|
187
|
+
"type": "object",
|
|
188
|
+
"description": "A single turn within a multi-turn thread",
|
|
189
|
+
"required": ["prompt"],
|
|
190
|
+
"additionalProperties": false,
|
|
191
|
+
"properties": {
|
|
192
|
+
"prompt": {
|
|
193
|
+
"type": "string",
|
|
194
|
+
"minLength": 1,
|
|
195
|
+
"description": "The user message for this turn"
|
|
196
|
+
},
|
|
197
|
+
"expected_response": {
|
|
198
|
+
"type": "string",
|
|
199
|
+
"description": "Expected agent response for this turn"
|
|
200
|
+
},
|
|
201
|
+
"response": {
|
|
202
|
+
"type": "string",
|
|
203
|
+
"description": "Actual agent response"
|
|
204
|
+
},
|
|
205
|
+
"context": {
|
|
206
|
+
"type": "string",
|
|
207
|
+
"description": "Additional context for grounding evaluation"
|
|
208
|
+
},
|
|
209
|
+
"evaluators": {
|
|
210
|
+
"$ref": "#/$defs/EvaluatorMap",
|
|
211
|
+
"description": "Per-turn evaluator overrides"
|
|
212
|
+
},
|
|
213
|
+
"evaluators_mode": {
|
|
214
|
+
"type": "string",
|
|
215
|
+
"enum": ["extend", "replace"],
|
|
216
|
+
"default": "extend",
|
|
217
|
+
"description": "How per-turn evaluators combine with defaults"
|
|
218
|
+
},
|
|
219
|
+
"citations": {
|
|
220
|
+
"type": "array",
|
|
221
|
+
"items": {
|
|
222
|
+
"$ref": "#/$defs/Citation"
|
|
223
|
+
},
|
|
224
|
+
"description": "Citations included in the response"
|
|
225
|
+
},
|
|
226
|
+
"scores": {
|
|
227
|
+
"$ref": "#/$defs/ScoreCollection"
|
|
228
|
+
},
|
|
229
|
+
"status": {
|
|
230
|
+
"type": "string",
|
|
231
|
+
"enum": ["pass", "fail", "error"],
|
|
232
|
+
"description": "Overall status of this turn"
|
|
233
|
+
},
|
|
234
|
+
"error": {
|
|
235
|
+
"type": "string",
|
|
236
|
+
"description": "Error message if status is 'error'"
|
|
237
|
+
},
|
|
238
|
+
"extensions": {
|
|
239
|
+
"type": "object",
|
|
240
|
+
"additionalProperties": true,
|
|
241
|
+
"description": "Extension point for custom turn-level fields"
|
|
242
|
+
}
|
|
243
|
+
}
|
|
244
|
+
},
|
|
245
|
+
"ThreadSummary": {
|
|
246
|
+
"type": "object",
|
|
247
|
+
"description": "Aggregate statistics for a thread",
|
|
248
|
+
"required": ["turns_total", "turns_passed", "turns_failed", "overall_status"],
|
|
249
|
+
"additionalProperties": false,
|
|
250
|
+
"properties": {
|
|
251
|
+
"turns_total": {
|
|
252
|
+
"type": "integer",
|
|
253
|
+
"minimum": 1,
|
|
254
|
+
"description": "Total number of turns executed"
|
|
255
|
+
},
|
|
256
|
+
"turns_passed": {
|
|
257
|
+
"type": "integer",
|
|
258
|
+
"minimum": 0,
|
|
259
|
+
"description": "Number of turns where all evaluators passed"
|
|
260
|
+
},
|
|
261
|
+
"turns_failed": {
|
|
262
|
+
"type": "integer",
|
|
263
|
+
"minimum": 0,
|
|
264
|
+
"description": "Number of turns where any evaluator failed"
|
|
265
|
+
},
|
|
266
|
+
"overall_status": {
|
|
267
|
+
"type": "string",
|
|
268
|
+
"enum": ["pass", "partial", "fail"],
|
|
269
|
+
"description": "pass: all turns passed, partial: some failed, fail: all failed or error"
|
|
270
|
+
}
|
|
139
271
|
}
|
|
140
272
|
},
|
|
141
273
|
"ScoreCollection": {
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"schemaVersion": "1.2.0",
|
|
3
|
+
"default_evaluators": {
|
|
4
|
+
"Relevance": {},
|
|
5
|
+
"Coherence": {}
|
|
6
|
+
},
|
|
7
|
+
"items": [
|
|
8
|
+
{
|
|
9
|
+
"prompt": "What is Microsoft Graph API?",
|
|
10
|
+
"expected_response": "Microsoft Graph API is a unified endpoint for accessing Microsoft services."
|
|
11
|
+
},
|
|
12
|
+
{
|
|
13
|
+
"name": "Canadian Employee HR Inquiry",
|
|
14
|
+
"turns": [
|
|
15
|
+
{
|
|
16
|
+
"prompt": "I'm a Canadian employee based in Toronto.",
|
|
17
|
+
"expected_response": "Got it! I can help with Canada-specific HR questions."
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"prompt": "Is July 4th a holiday for me?",
|
|
21
|
+
"expected_response": "July 4th is not a statutory holiday in Canada. However, July 1st (Canada Day) is."
|
|
22
|
+
}
|
|
23
|
+
]
|
|
24
|
+
},
|
|
25
|
+
{
|
|
26
|
+
"prompt": "How do I authenticate with Microsoft Graph?",
|
|
27
|
+
"expected_response": "You can authenticate using OAuth 2.0 or client credentials flow."
|
|
28
|
+
}
|
|
29
|
+
]
|
|
30
|
+
}
|