@microsoft/m365-copilot-eval 1.3.0-preview.1 → 1.5.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/README.md +135 -100
  2. package/package.json +7 -4
  3. package/schema/CHANGELOG.md +7 -0
  4. package/schema/v1/eval-document.schema.json +143 -11
  5. package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
  6. package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
  7. package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
  8. package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
  9. package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
  10. package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
  11. package/schema/v1/examples/valid/multi-turn-output.json +59 -0
  12. package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
  13. package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
  14. package/schema/version.json +2 -2
  15. package/src/clients/cli/agent_selector.py +74 -0
  16. package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
  17. package/src/clients/cli/api_clients/A2A/a2a_client.py +475 -0
  18. package/src/clients/cli/api_clients/__init__.py +3 -0
  19. package/src/clients/cli/api_clients/base_agent_client.py +77 -0
  20. package/src/clients/cli/cli_args.py +136 -0
  21. package/src/clients/cli/cli_logging/cli_logger.py +33 -0
  22. package/src/clients/cli/cli_logging/console_diagnostics.py +56 -2
  23. package/src/clients/cli/cli_logging/logging_utils.py +0 -1
  24. package/src/clients/cli/common.py +64 -0
  25. package/src/clients/cli/env_validator.py +73 -0
  26. package/src/clients/cli/evaluation_runner.py +653 -0
  27. package/src/clients/cli/evaluator_resolver.py +9 -6
  28. package/src/clients/cli/generate_report.py +272 -129
  29. package/src/clients/cli/main.py +157 -1174
  30. package/src/clients/cli/parallel_executor.py +57 -0
  31. package/src/clients/cli/prompt_loader.py +148 -0
  32. package/src/clients/cli/readme.md +9 -53
  33. package/src/clients/cli/requirements.txt +1 -1
  34. package/src/clients/cli/response_extractor.py +4 -603
  35. package/src/clients/cli/result_writer.py +488 -0
  36. package/src/clients/cli/retry_policy.py +52 -0
  37. package/src/clients/cli/samples/multiturn_example.json +35 -0
  38. package/src/clients/cli/throttle_gate.py +82 -0
  39. package/src/clients/node-js/bin/runevals.js +82 -20
  40. package/src/clients/node-js/config/default.js +12 -11
  41. package/src/clients/node-js/lib/agent-id.js +12 -0
  42. package/src/clients/node-js/lib/env-loader.js +14 -20
  43. package/src/clients/node-js/lib/eula-manager.js +78 -0
  44. package/src/clients/node-js/lib/progress.js +13 -11
package/README.md CHANGED
@@ -1,25 +1,33 @@
1
1
  # M365 Copilot Agent Evaluations
2
2
 
3
- > **🔒 PRIVATE PREVIEW:** This tool is currently in private preview. And the instructions below are for Private Preview.
3
+ > **PUBLIC PREVIEW:** This tool is currently in public preview; refer to the instructions below to get started.
4
4
 
5
- A **zero-configuration** CLI for evaluating M365 Copilot agents. Send prompts to your agent, get responses, and automatically score them with Azure AI Evaluation metrics (relevance, coherence, groundedness).
5
+ A CLI for evaluating M365 Copilot agents. Send prompts to your agent, get responses, and automatically score them with Azure AI Evaluation metrics (relevance, coherence, groundedness).
6
6
  - Send a batch (or interactive set) of prompts to a configured chat API endpoint.
7
7
  - Collect agent responses and evaluate them locally using Azure AI Evaluation SDK.
8
- - Metrics produced per prompt:
9
- - - Relevance (1–5)
10
- - - Coherence (1–5)
11
- - - Groundedness (1–5)
12
- - - Tool Call Accuracy (1–5)
13
- - - Citations (0–1)
8
+ - The CLI supports 7 evaluator types. Evaluators marked with ⭐ are **enabled by default**.
9
+
10
+ | Evaluator | Type | Scale | Default Threshold | Default |
11
+ |-----------|------|-------|-------------------|---------|
12
+ | **Relevance** ⭐ | LLM-based | 1-5 | 3 | Yes |
13
+ | **Coherence** ⭐ | LLM-based | 1-5 | 3 | Yes |
14
+ | **Groundedness** | LLM-based | 1-5 | 3 | No |
15
+ | **Similarity** | LLM-based | 1-5 | 3 | No |
16
+ | **Citations** | Count-based | >= 0 | 1 | No |
17
+ | **ExactMatch** | String match | boolean | N/A | No |
18
+ | **PartialMatch** | String match | 0.0-1.0 | 0.5 | No |
14
19
  - Multiple input modes: command‑line list, JSON file, interactive.
15
20
  - Multiple output formats: console (colorized), JSON, CSV, HTML (auto‑opens report).
16
21
 
17
22
  ## 📋 Prerequisites
18
23
 
24
+ - **M365 Copilot License** for your tenant
19
25
  - **M365 Copilot Agent** deployed to your tenant (can be created with [M365 Agents Toolkit](https://learn.microsoft.com/en-us/microsoft-365/developer/overview-m365-agents-toolkit) or any other method)
20
26
  - **Node.js 24.12.0+** (check: `node --version`)
21
27
  - **Environment file** with your credentials and agent ID (see [Environment Setup](#-environment-setup) below)
22
- - **Your Tenant ID, Azure OpenAI endpoint, and API key** (see [Getting Variables](#-getting-variables) below)
28
+ - **Your Tenant ID** - get your tenant id using the instructions [here](https://learn.microsoft.com/en-us/azure/azure-portal/get-subscription-tenant-id)
29
+ - Admin approval for your tenant to run the WORKIQ Client App — see the instructions [here](https://github.com/microsoft/work-iq/blob/main/ADMIN-INSTRUCTIONS.md)
30
+ - **Azure OpenAI endpoint and API key** (see [Getting Variables](#-getting-variables) below)
23
31
 
24
32
  > Note: Authentication is currently supported on Windows only. Support for other operating systems is coming soon.
25
33
 
@@ -59,6 +67,8 @@ M365_TITLE_ID="T_your-title-id-here" # Auto-generated by ATK
59
67
  # .env.local.user (NOT checked in — secrets go here)
60
68
  AZURE_AI_OPENAI_ENDPOINT="<your-azure-openai-endpoint>"
61
69
  AZURE_AI_API_KEY="<your-api-key-from-azure-portal>"
70
+ AZURE_AI_API_VERSION="2024-12-01-preview" # default
71
+ AZURE_AI_MODEL_NAME="gpt-4o-mini" # recommended
62
72
  TENANT_ID="<your-tenant-id>"
63
73
  ```
64
74
 
@@ -83,7 +93,7 @@ M365_AGENT_ID="your-agent-id" # e.g., U_0dc4a8a2-b95f-edac-91c8-d802023ec2d4
83
93
  AZURE_AI_OPENAI_ENDPOINT="<your-azure-openai-endpoint>"
84
94
  AZURE_AI_API_KEY="<your-api-key-from-azure-portal>"
85
95
  AZURE_AI_API_VERSION="2024-12-01-preview" # default
86
- AZURE_AI_MODEL_NAME="gpt-4o-mini" # default
96
+ AZURE_AI_MODEL_NAME="gpt-4o-mini" # recommended
87
97
  TENANT_ID="<your-tenant-id>"
88
98
  ```
89
99
 
@@ -123,16 +133,29 @@ You need both the endpoint URL and API key from your Azure OpenAI resource for "
123
133
  **How to obtain:**
124
134
 
125
135
  1. Go to [Azure Portal](https://portal.azure.com)
126
- 2. Navigate to your Azure OpenAI service
127
- - **Path:** Portal All Services Search "OpenAI" → Select your resource
128
- - **Or create new:** Portal Create a resource Search "OpenAI"
129
- 3. In the **Overview** section, copy the **Endpoint** value
130
- - Format: `https://YOUR-RESOURCE-NAME.openai.azure.com/`
131
- - This is your `AZURE_AI_OPENAI_ENDPOINT`
132
- 4. In the left sidebar, click **Keys and Endpoint**
133
- 5. Copy **KEY 1** or **KEY 2**
134
- - This is your `AZURE_AI_API_KEY`
135
- 6. Add both values to your `.env.dev` file as shown in the [Setup Steps](#setup-steps) above
136
+ 2. Open the Azure Portal. Search for "OpenAI" in the search bar and select **Azure OpenAI**.
137
+ ![Azure Portal search bar showing Azure OpenAI service](docs/images/image.png)
138
+ 3. Once you have selected Azure OpenAI, create an AI Foundry Resource.
139
+ ![Azure OpenAI service page with Create AI Foundry Resource button](docs/images/image-1.png)
140
+ 4. On the Create AI Foundry Resource page, fill in the details and click 'Review + Create'.
141
+ ![Create AI Foundry Resource form with Review + Create button](docs/images/image-2.png)
142
+ 5. Once the resource is deployed, go to the AI Foundry portal.
143
+ ![Resource deployment complete with link to AI Foundry portal](docs/images/image-3.png)
144
+ 6. At this point, you should be able to deploy an LLM model.
145
+ 7. Select Models + Endpoints on the left rail
146
+ ![AI Foundry portal left navigation with Models + Endpoints selected](docs/images/image-4.png)
147
+ 8. Select Deploy Model -> Deploy base model (we recommend gpt-4o-mini model)
148
+ ![Deploy Model dropdown showing Deploy base model option](docs/images/image-5.png)
149
+ 9. Select Confirm, then select Customize
150
+ ![Model deployment confirmation dialog with Customize button](docs/images/image-6.png)
151
+ 10. Click on Customize and change the capacity to 50K tokens per minute
152
+ ![Model deployment customization showing token capacity setting](docs/images/image-7.png)
153
+ ![Token capacity set to 50K tokens per minute](docs/images/image-8.png)
154
+ 11. Select **Deploy** and wait a few minutes for the model deployment to complete.
155
+ 12. Once the deployment finishes, you are redirected to the page showing the API endpoint and API key.
156
+ 13. Copy the following values from that page.
157
+ ![API endpoint and API key values on the model deployment page](docs/images/image-10.png)
158
+ 14. Add all of these values to your `.env.dev` file as shown in the [Setup Steps](#setup-steps) above
136
159
 
137
160
  **Required model:** Ensure you have `gpt-4o-mini` (or similar) deployed in your Azure OpenAI resource.
138
161
 
@@ -165,64 +188,110 @@ runevals --env dev
165
188
 
166
189
  ---
167
190
 
168
- ## 📝 Creating Prompts Files
191
+ ## 📝 Eval Document Format
192
+
193
+ The eval document schema is versioned independently from the CLI, following [Semantic Versioning](https://semver.org/).
194
+
195
+ - **Schema location**: [`schema/v1/eval-document.schema.json`](schema/v1/eval-document.schema.json)
196
+ - **Schema changelog**: [`schema/CHANGELOG.md`](schema/CHANGELOG.md)
169
197
 
170
- The CLI auto-discovers prompts files in your project:
198
+ > **New in Schema v1.2.0**: Multi-turn conversation threads test context persistence across multiple turns within a shared conversation session. Each thread supports 1-20 turns.
171
199
 
172
- ### Auto-Discovery
200
+ > **New in Schema v1.1.0**: Per-prompt evaluator overrides with `evaluators_mode` (`extend`/`replace`), file-level `default_evaluators`, and `ExactMatch`/`PartialMatch` evaluators.
173
201
 
174
- When you run `runevals`, it searches:
202
+ ### Getting Started
203
+
204
+ The CLI auto-discovers prompts files in your project. When you run `runevals`, it searches:
175
205
  1. Current directory: `prompts.json`, `evals.json`, `tests.json`
176
206
  2. `./evals/` subdirectory: `prompts.json`, `evals.json`, `tests.json`
177
207
 
178
- **Example project structure:**
179
- ```
180
- my-agent/
181
- ├── .env.local # Your credentials
182
- ├── evals/
183
- │ └── evals.json # Your test prompts (auto-discovered!)
184
- └── .evals/
185
- └── 2025-12-03_14-30-45.html # Generated reports
186
- ```
208
+ **No prompts file?** The CLI will offer to create a starter file with example prompts for you.
187
209
 
188
- ### Starter File Creation
210
+ A minimal eval document:
189
211
 
190
- If no file is found:
212
+ ```json
213
+ {
214
+ "schemaVersion": "1.2.0",
215
+ "items": [
216
+ {
217
+ "prompt": "What is Microsoft 365?",
218
+ "expected_response": "Microsoft 365 is a cloud-based productivity suite..."
219
+ }
220
+ ]
221
+ }
191
222
  ```
192
- ⚠️ No prompts file found in current directory or ./evals/
193
223
 
194
- Create a starter evals file with sample prompts? (Y/n):
195
- ```
224
+ ### Evaluator Configuration
196
225
 
197
- Answering "Y" creates `./evals/evals.json` with 2 starter prompts:
226
+ Use `default_evaluators` to set file-level defaults, and per-item `evaluators` with `evaluators_mode` to customize:
198
227
 
199
228
  ```json
200
- [
201
- {
202
- "prompt": "What is Microsoft 365?",
203
- "expected_response": "Microsoft 365 is a cloud-based productivity suite..."
229
+ {
230
+ "schemaVersion": "1.2.0",
231
+ "default_evaluators": {
232
+ "Relevance": {},
233
+ "Coherence": {}
204
234
  },
205
- {
206
- "prompt": "How can I share a file in Teams?",
207
- "expected_response": "You can share a file in Teams by uploading it..."
208
- }
209
- ]
235
+ "items": [
236
+ {
237
+ "prompt": "What is Microsoft Graph?",
238
+ "expected_response": "A unified API endpoint for Microsoft services.",
239
+ "evaluators": {
240
+ "Citations": { "citation_format": "mixed" }
241
+ },
242
+ "evaluators_mode": "extend"
243
+ },
244
+ {
245
+ "name": "Expense policy flow",
246
+ "turns": [
247
+ {
248
+ "prompt": "I spent $250 on dinner. Is that okay?",
249
+ "expected_response": "The per-diem meal allowance is $200."
250
+ },
251
+ {
252
+ "prompt": "What should I do about the overage?",
253
+ "expected_response": "Request manager approval.",
254
+ "evaluators": {
255
+ "ExactMatch": { "case_sensitive": false }
256
+ },
257
+ "evaluators_mode": "replace"
258
+ }
259
+ ]
260
+ }
261
+ ]
262
+ }
210
263
  ```
211
264
 
212
- Edit this file with your own prompts and run again!
265
+ **How evaluator modes work in this example:**
213
266
 
214
- ### Manual Creation
267
+ | Item | `evaluators_mode` | Active Evaluators | Why |
268
+ |------|-------------------|-------------------|-----|
269
+ | Single-turn (Graph) | `extend` | Relevance, Coherence, Citations | Per-prompt Citations **merged** with defaults |
270
+ | Multi-turn turn 1 (dinner) | _(none)_ | Relevance, Coherence | **Inherits** file-level defaults |
271
+ | Multi-turn turn 2 (overage) | `replace` | ExactMatch | Per-turn ExactMatch **replaces** defaults entirely |
215
272
 
216
- Create `./evals/prompts.json`:
273
+ ### Evaluator Modes
217
274
 
218
- ```json
219
- [
220
- {
221
- "prompt": "Your test prompt here",
222
- "expected_response": "Expected agent response"
223
- }
224
- ]
225
- ```
275
+ | Mode | Behavior |
276
+ |------|----------|
277
+ | `"extend"` (default) | Per-item evaluators **merge** with defaults. Both run. |
278
+ | `"replace"` | Per-item evaluators **replace** defaults entirely. Only per-item evaluators run. |
279
+ | _(none)_ | Inherits file-level `default_evaluators`, or system defaults (Relevance, Coherence) if not set. |
280
+
281
+ See `schema/v1/examples/` in the package for more examples including per-turn evaluator overrides, mixed single/multi-turn files, and output format.
282
+
283
+ ### Auto-Upgrade Behavior
284
+
285
+ When the CLI loads an eval document:
286
+
287
+ - **Legacy documents** (missing `schemaVersion`): Automatically upgraded with a timestamped backup (e.g., `file.json.bak.20260205143052`)
288
+ - **Older versions** (same major version): `schemaVersion` field updated without backup
289
+ - **Invalid documents**: CLI exits with an error message and guidance to review the schema changelog
290
+ - **Future versions**: CLI rejects with a message suggesting a CLI update
291
+
292
+ ### Version Compatibility
293
+
294
+ Within a major version (e.g., 1.x.x), we aim to maintain backward compatibility for documents that conform to the published schema for their version. Compatibility does not extend to undeclared or ad-hoc fields outside the schema definition; review the [schema changelog](schema/CHANGELOG.md) when upgrading between minor versions.
226
295
 
227
296
  ## 🎯 Usage Examples
228
297
 
@@ -271,6 +340,10 @@ runevals --log-level info
271
340
  runevals --log-level warning
272
341
  runevals --log-level error
273
342
 
343
+ # Parallel prompt execution control
344
+ runevals --concurrency 5 --prompts-file ./evals/evals.json
345
+ runevals --concurrency 1000 --prompts-file ./evals/evals.json # Python CLI clamps to 5
346
+
274
347
  # Custom output location in your project
275
348
  runevals --output ./reports/results.html
276
349
  ```
@@ -309,10 +382,9 @@ npm run eval:dev
309
382
  ## 📊 Output Formats
310
383
 
311
384
  Results are automatically saved to `./evals/YYYY-MM-DD_HH-MM-SS.html` with:
312
- - **Relevance** score (1-5)
313
- - **Coherence** score (1-5)
314
- - **Groundedness** score (1-5)
315
- - Per-prompt details and aggregate metrics
385
+ - Per-prompt and per-turn evaluation scores from configured evaluators
386
+ - Aggregate statistics across all evaluated items
387
+ - Multi-turn thread summaries (turns passed/failed, overall status)
316
388
 
317
389
  Other formats:
318
390
  ```bash
@@ -410,43 +482,6 @@ This project has adopted the [Microsoft Open Source Code of Conduct](https://ope
410
482
  For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
411
483
  contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
412
484
 
413
- ## Schema Versioning
414
-
415
- The eval document schema is versioned independently from the CLI, following [Semantic Versioning](https://semver.org/). This allows external consumers to depend on a stable contract without coupling to CLI release cycles.
416
-
417
- - **Schema location**: [`schema/v1/eval-document.schema.json`](schema/v1/eval-document.schema.json)
418
- - **Schema changelog**: [`schema/CHANGELOG.md`](schema/CHANGELOG.md)
419
- - **Consumer quickstart**: [`specs/wi-6081652-dataset-schema-versioning/quickstart.md`](specs/wi-6081652-dataset-schema-versioning/quickstart.md)
420
-
421
- ### Eval Document Format
422
-
423
- Eval documents should include a `schemaVersion` field:
424
-
425
- ```json
426
- {
427
- "schemaVersion": "1.0.0",
428
- "items": [
429
- {
430
- "prompt": "What is Microsoft 365?",
431
- "expected_response": "Microsoft 365 is a cloud-based productivity suite."
432
- }
433
- ]
434
- }
435
- ```
436
-
437
- ### Auto-Upgrade Behavior
438
-
439
- When the CLI loads an eval document:
440
-
441
- - **Legacy documents** (missing `schemaVersion`): Automatically upgraded with a timestamped backup (e.g., `file.json.bak.20260205143052`)
442
- - **Older versions** (same major version): `schemaVersion` field updated without backup
443
- - **Invalid documents**: CLI exits with an error message and guidance to review the schema changelog
444
- - **Future versions**: CLI rejects with a message suggesting a CLI update
445
-
446
- ### Version Compatibility
447
-
448
- Within a major version (e.g., 1.x.x), backward compatibility is guaranteed. Documents valid against 1.0.0 will remain valid against 1.1.0, 1.2.0, etc.
449
-
450
485
  ## Trademarks
451
486
 
452
487
  This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
package/package.json CHANGED
@@ -1,9 +1,9 @@
1
1
  {
2
2
  "name": "@microsoft/m365-copilot-eval",
3
- "version": "1.3.0-preview.1",
3
+ "version": "1.5.0-preview.1",
4
4
  "minCliVersion": "1.0.1-preview.1",
5
5
  "description": "Zero-config Node.js wrapper for M365 Copilot Agent Evaluations CLI (Python-based Azure AI Evaluation SDK)",
6
- "publishDate": "2026-04-01",
6
+ "publishDate": "2026-04-30",
7
7
  "main": "src/clients/node-js/lib/index.js",
8
8
  "type": "module",
9
9
  "bin": {
@@ -14,6 +14,7 @@
14
14
  "build": "npm run prettier:check && npm run clean && npm run lint",
15
15
  "clean": "rimraf node_modules/.cache dist coverage",
16
16
  "test": "node --test tests/clients/node-js/**/*.test.js",
17
+ "test:coverage": "c8 --reporter=cobertura --reporter=text node --test tests/clients/node-js/**/*.test.js",
17
18
  "install-credprovider": "artifacts-npm-credprovider && npm ci",
18
19
  "set-publish-date": "node scripts/set-publish-date.js",
19
20
  "prepublishOnly": "node scripts/set-publish-date.js",
@@ -41,9 +42,10 @@
41
42
  },
42
43
  "dependencies": {
43
44
  "commander": "^12.1.0",
45
+ "dotenv": "^16.0.0",
46
+ "https-proxy-agent": "^7.0.5",
44
47
  "node-fetch": "^3.3.2",
45
- "tar": "^7.5.4",
46
- "https-proxy-agent": "^7.0.5"
48
+ "tar": "^7.5.4"
47
49
  },
48
50
  "devDependencies": {
49
51
  "@microsoft/eslint-config-msgraph": "^5.0.0",
@@ -52,6 +54,7 @@
52
54
  "@vitest/coverage-istanbul": "^3.0.0",
53
55
  "@vitest/coverage-v8": "^3.0.0",
54
56
  "@vitest/ui": "^3.0.0",
57
+ "c8": "^11.0.0",
55
58
  "eslint": "^9.7.0",
56
59
  "eslint-config-prettier": "^10.0.0",
57
60
  "eslint-plugin-jsdoc": "^50.1.0",
@@ -5,6 +5,13 @@ All notable changes to the eval document schema will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.2.0](https://github.com/microsoft/M365-Copilot-Agent-Evals/compare/schema-v1.1.0...schema-v1.2.0) (2026-04-22)
9
+
10
+
11
+ ### Features
12
+
13
+ * **schema:** add multi-turn evaluation support (v1.2.0) ([#208](https://github.com/microsoft/M365-Copilot-Agent-Evals/issues/208)) ([a5ad22b](https://github.com/microsoft/M365-Copilot-Agent-Evals/commit/a5ad22bb4f6ac8ba548dc7f431ace073fa5970ce))
14
+
8
15
  ## [1.1.0](https://github.com/microsoft/M365-Copilot-Agent-Evals/compare/schema-v1.0.0...schema-v1.1.0) (2026-03-30)
9
16
 
10
17
 
@@ -2,7 +2,7 @@
2
2
  "$schema": "https://json-schema.org/draft/2020-12/schema",
3
3
  "$id": "https://raw.githubusercontent.com/microsoft/M365-Copilot-Agent-Evals/refs/heads/main/schema/v1/eval-document.schema.json",
4
4
  "title": "M365 Copilot Eval Document",
5
- "description": "Schema for evaluation documents used by M365 Copilot Agent Evals CLI. Version 1.1.0.",
5
+ "description": "Schema for evaluation documents used by M365 Copilot Agent Evals CLI. Supports single-turn and multi-turn evaluations.",
6
6
  "type": "object",
7
7
  "required": ["schemaVersion", "items"],
8
8
  "additionalProperties": true,
@@ -28,9 +28,12 @@
28
28
  "items": {
29
29
  "type": "array",
30
30
  "minItems": 1,
31
- "description": "Array of evaluation items (prompts and optionally responses with scores)",
31
+ "description": "Array of evaluation items: single-turn evaluations or multi-turn threads",
32
32
  "items": {
33
- "$ref": "#/$defs/EvalItem"
33
+ "oneOf": [
34
+ { "$ref": "#/$defs/SingleTurnEvaluation" },
35
+ { "$ref": "#/$defs/MultiTurnThread" }
36
+ ]
34
37
  }
35
38
  }
36
39
  },
@@ -60,7 +63,7 @@
60
63
  "evaluatedAt": {
61
64
  "type": "string",
62
65
  "format": "date-time",
63
- "description": "ISO 8601 timestamp when evaluation was performed (output documents)"
66
+ "description": "ISO 8601 timestamp when evaluation was performed"
64
67
  },
65
68
  "tags": {
66
69
  "type": "array",
@@ -88,11 +91,11 @@
88
91
  }
89
92
  }
90
93
  },
91
- "EvalItem": {
94
+ "SingleTurnEvaluation": {
92
95
  "type": "object",
93
- "description": "A single evaluation item containing a prompt and optionally a response with scores",
96
+ "description": "A standalone single-turn prompt-response evaluation",
94
97
  "required": ["prompt"],
95
- "additionalProperties": true,
98
+ "additionalProperties": false,
96
99
  "properties": {
97
100
  "prompt": {
98
101
  "type": "string",
@@ -105,7 +108,7 @@
105
108
  },
106
109
  "response": {
107
110
  "type": "string",
108
- "description": "Actual response from the agent (present in output documents)"
111
+ "description": "Actual response from the agent"
109
112
  },
110
113
  "context": {
111
114
  "type": "string",
@@ -136,6 +139,135 @@
136
139
  "additionalProperties": true,
137
140
  "description": "Extension point for custom item-level fields"
138
141
  }
142
+ },
143
+ "not": {
144
+ "required": ["turns"]
145
+ }
146
+ },
147
+ "MultiTurnThread": {
148
+ "type": "object",
149
+ "description": "A multi-turn conversation thread with ordered turns sharing conversation context",
150
+ "required": ["turns"],
151
+ "additionalProperties": false,
152
+ "properties": {
153
+ "name": {
154
+ "type": "string",
155
+ "description": "Human-readable name for the thread"
156
+ },
157
+ "description": {
158
+ "type": "string",
159
+ "description": "Description of what this thread tests"
160
+ },
161
+ "turns": {
162
+ "type": "array",
163
+ "minItems": 1,
164
+ "maxItems": 20,
165
+ "items": { "$ref": "#/$defs/Turn" },
166
+ "description": "Ordered array of conversation turns"
167
+ },
168
+ "conversation_id": {
169
+ "type": "string",
170
+ "description": "Unique identifier for this conversation thread"
171
+ },
172
+ "summary": {
173
+ "$ref": "#/$defs/ThreadSummary",
174
+ "description": "Aggregate statistics for the thread"
175
+ },
176
+ "extensions": {
177
+ "type": "object",
178
+ "additionalProperties": true,
179
+ "description": "Extension point for custom thread-level fields"
180
+ }
181
+ },
182
+ "not": {
183
+ "required": ["prompt"]
184
+ }
185
+ },
186
+ "Turn": {
187
+ "type": "object",
188
+ "description": "A single turn within a multi-turn thread",
189
+ "required": ["prompt"],
190
+ "additionalProperties": false,
191
+ "properties": {
192
+ "prompt": {
193
+ "type": "string",
194
+ "minLength": 1,
195
+ "description": "The user message for this turn"
196
+ },
197
+ "expected_response": {
198
+ "type": "string",
199
+ "description": "Expected agent response for this turn"
200
+ },
201
+ "response": {
202
+ "type": "string",
203
+ "description": "Actual agent response"
204
+ },
205
+ "context": {
206
+ "type": "string",
207
+ "description": "Additional context for grounding evaluation"
208
+ },
209
+ "evaluators": {
210
+ "$ref": "#/$defs/EvaluatorMap",
211
+ "description": "Per-turn evaluator overrides"
212
+ },
213
+ "evaluators_mode": {
214
+ "type": "string",
215
+ "enum": ["extend", "replace"],
216
+ "default": "extend",
217
+ "description": "How per-turn evaluators combine with defaults"
218
+ },
219
+ "citations": {
220
+ "type": "array",
221
+ "items": {
222
+ "$ref": "#/$defs/Citation"
223
+ },
224
+ "description": "Citations included in the response"
225
+ },
226
+ "scores": {
227
+ "$ref": "#/$defs/ScoreCollection"
228
+ },
229
+ "status": {
230
+ "type": "string",
231
+ "enum": ["pass", "fail", "error"],
232
+ "description": "Overall status of this turn"
233
+ },
234
+ "error": {
235
+ "type": "string",
236
+ "description": "Error message if status is 'error'"
237
+ },
238
+ "extensions": {
239
+ "type": "object",
240
+ "additionalProperties": true,
241
+ "description": "Extension point for custom turn-level fields"
242
+ }
243
+ }
244
+ },
245
+ "ThreadSummary": {
246
+ "type": "object",
247
+ "description": "Aggregate statistics for a thread",
248
+ "required": ["turns_total", "turns_passed", "turns_failed", "overall_status"],
249
+ "additionalProperties": false,
250
+ "properties": {
251
+ "turns_total": {
252
+ "type": "integer",
253
+ "minimum": 1,
254
+ "description": "Total number of turns executed"
255
+ },
256
+ "turns_passed": {
257
+ "type": "integer",
258
+ "minimum": 0,
259
+ "description": "Number of turns where all evaluators passed"
260
+ },
261
+ "turns_failed": {
262
+ "type": "integer",
263
+ "minimum": 0,
264
+ "description": "Number of turns where any evaluator failed"
265
+ },
266
+ "overall_status": {
267
+ "type": "string",
268
+ "enum": ["pass", "partial", "fail"],
269
+ "description": "pass: all turns passed, partial: some failed, fail: all failed or error"
270
+ }
139
271
  }
140
272
  },
141
273
  "ScoreCollection": {
@@ -155,9 +287,9 @@
155
287
  "$ref": "#/$defs/EvalScore",
156
288
  "description": "Groundedness score (1-5)"
157
289
  },
158
- "toolCallAccuracy": {
290
+ "similarity": {
159
291
  "$ref": "#/$defs/EvalScore",
160
- "description": "Tool call accuracy score (1-5)"
292
+ "description": "Similarity score (1-5)"
161
293
  },
162
294
  "citations": {
163
295
  "$ref": "#/$defs/CitationScore",
@@ -295,7 +427,7 @@
295
427
  "type": "object",
296
428
  "description": "Map of evaluator names to their configuration options",
297
429
  "propertyNames": {
298
- "enum": ["Relevance", "Coherence", "Groundedness", "ToolCallAccuracy", "Citations", "ExactMatch", "PartialMatch"]
430
+ "enum": ["Relevance", "Coherence", "Groundedness", "Similarity", "Citations", "ExactMatch", "PartialMatch"]
299
431
  },
300
432
  "additionalProperties": {
301
433
  "$ref": "#/$defs/EvaluatorOptions"
@@ -0,0 +1,8 @@
1
+ {
2
+ "schemaVersion": "1.2.0",
3
+ "items": [
4
+ {
5
+ "turns": []
6
+ }
7
+ ]
8
+ }
@@ -0,0 +1,13 @@
1
+ {
2
+ "schemaVersion": "1.2.0",
3
+ "items": [
4
+ {
5
+ "prompt": "This item has both prompt and turns",
6
+ "turns": [
7
+ {
8
+ "prompt": "Turn 1"
9
+ }
10
+ ]
11
+ }
12
+ ]
13
+ }
@@ -0,0 +1,12 @@
1
+ {
2
+ "schemaVersion": "1.2.0",
3
+ "items": [
4
+ {
5
+ "turns": [
6
+ {
7
+ "expected_response": "This turn is missing a prompt."
8
+ }
9
+ ]
10
+ }
11
+ ]
12
+ }
@@ -0,0 +1,13 @@
1
+ {
2
+ "schemaVersion": "1.2.0",
3
+ "items": [
4
+ {
5
+ "turns": [
6
+ {
7
+ "prompt": "Hello",
8
+ "expeceted_response": "Typo in field name"
9
+ }
10
+ ]
11
+ }
12
+ ]
13
+ }
@@ -0,0 +1,15 @@
1
+ {
2
+ "schemaVersion": "1.2.0",
3
+ "items": [
4
+ {
5
+ "turns": [
6
+ {
7
+ "prompt": "Hello",
8
+ "evaluators": {
9
+ "TaskCompletionEvaluator": {}
10
+ }
11
+ }
12
+ ]
13
+ }
14
+ ]
15
+ }