snapeval 2.1.0 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,131 +1,178 @@
1
1
  # snapeval
2
2
 
3
- Semantic snapshot testing for AI skills. Zero assertions. AI-driven. Free inference.
3
+ Harness-agnostic eval runner for [agentskills.io](https://agentskills.io) skills.
4
4
 
5
5
  [![CI](https://github.com/matantsach/snapeval/actions/workflows/ci.yml/badge.svg)](https://github.com/matantsach/snapeval/actions/workflows/ci.yml)
6
6
  [![npm version](https://img.shields.io/npm/v/snapeval.svg)](https://www.npmjs.com/package/snapeval)
7
7
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
8
8
 
9
- snapeval evaluates [agentskills.io](https://agentskills.io) skills through semantic snapshot testing. It analyzes your skill's `SKILL.md`, collaborates with you to design a test strategy through an interactive browser-based viewer, then captures baselines and detects regressions — all with zero manual test authoring.
9
+ snapeval runs every eval case **with and without** your skill, grades assertions, and computes a benchmark delta so you can see exactly what value your skill adds.
10
10
 
11
- ## Why snapeval?
11
+ ```
12
+ snapeval — greeter
13
+ Baseline = without SKILL.md (raw AI response)
14
+ ────────────────────────────────────────────────────────────
15
+ #1 formal greeting for Eleanor
16
+ Skill: 100% | Baseline: 33% | 5.2s
17
+ #2 casual greeting for Marcus
18
+ Skill: 100% ↑ was 67% | Baseline: 67% | 2.7s
19
+ #3 pirate greeting for Zoe
20
+ Skill: 100% | Baseline: 67% | 2.5s
21
+ ────────────────────────────────────────────────────────────
22
+ Summary:
23
+ Skill pass rate: 100.0%
24
+ Baseline pass rate: 55.6%
25
+ Improvement: +44.4%
26
+ ```
12
27
 
13
- - **Interactive ideation** — AI decomposes your skill into behaviors, dimensions, and failure modes, then opens a visual viewer where you shape the test strategy together.
14
- - **Zero assertions** — No test logic to write. The AI generates realistic, messy prompts that mirror how real users actually type.
15
- - **Semantic comparison** — Tiered pipeline: schema check (free) → LLM judge with order-swap debiasing (when needed). Most checks cost $0.
16
- - **Free inference** — Uses gpt-5-mini via Copilot CLI and GitHub Models API.
17
- - **Platform-agnostic** — Adapter-based architecture. Copilot CLI first, others coming.
28
+ ## How it works
18
29
 
19
- ## Install
30
+ 1. You write a `SKILL.md` and an `evals.json` with test cases and assertions
31
+ 2. snapeval runs each eval **twice** — once with your skill loaded, once without (baseline)
32
+ 3. Assertions are graded by an LLM judge (semantic) and/or shell scripts (deterministic)
33
+ 4. A benchmark shows where your skill adds value vs. where the raw AI already handles it
20
34
 
21
- ### From the marketplace
35
+ ## Quick start
22
36
 
23
- The snapeval marketplace is bundled with the repo. Add it once, then install by name:
37
+ ### As a Copilot plugin
24
38
 
25
39
  ```bash
26
- copilot plugin marketplace add matantsach/snapeval
27
- copilot plugin install snapeval@snapeval-marketplace
40
+ copilot plugin install matantsach/snapeval
28
41
  ```
29
42
 
30
- ### From GitHub directly
43
+ Then in Copilot CLI, just say `evaluate my skill` — the snapeval skill handles the rest.
44
+
45
+ ### Standalone CLI
31
46
 
32
47
  ```bash
33
- copilot plugin install matantsach/snapeval
48
+ git clone https://github.com/matantsach/snapeval.git
49
+ cd snapeval && npm install
50
+ npx tsx bin/snapeval.ts eval <skill-dir>
34
51
  ```
35
52
 
36
- ### Verify installation
53
+ ## Eval format
37
54
 
38
- ```bash
39
- copilot plugin list
55
+ ```
56
+ my-skill/
57
+ ├── SKILL.md
58
+ └── evals/
59
+ ├── evals.json
60
+ └── scripts/ ← optional deterministic checks
61
+ └── validate.sh
62
+ ```
63
+
64
+ **evals.json:**
65
+
66
+ ```json
67
+ {
68
+ "skill_name": "greeter",
69
+ "evals": [
70
+ {
71
+ "id": 1,
72
+ "label": "formal greeting for Eleanor",
73
+ "prompt": "Can you give me a formal greeting for Eleanor?",
74
+ "expected_output": "Returns the formal greeting addressed to Eleanor.",
75
+ "assertions": [
76
+ "Output contains the name Eleanor",
77
+ "Output uses a formal tone",
78
+ "script:validate.sh"
79
+ ]
80
+ }
81
+ ]
82
+ }
40
83
  ```
41
84
 
42
- ## Usage
85
+ | Field | Required | Description |
86
+ |-------|----------|-------------|
87
+ | `id` | yes | Unique numeric identifier |
88
+ | `prompt` | yes | The user prompt sent to the harness |
89
+ | `expected_output` | yes | Human description of the expected behavior |
90
+ | `label` | no | Human-readable name shown in terminal output |
91
+ | `slug` | no | Filesystem-safe name for the eval directory |
92
+ | `assertions` | no | List of assertions to grade (LLM semantic or `script:` prefixed) |
93
+ | `files` | no | Input files to attach to the prompt |
43
94
 
44
- In Copilot CLI, just talk naturally:
95
+ ### Assertions
96
+
97
+ **Semantic** — graded by an LLM. Write specific, verifiable statements:
45
98
 
46
99
  ```
47
- > evaluate my greeter skill
48
- > test skills/code-reviewer for regressions
49
- > check if I broke anything in my-skill
50
- > approve scenario 3
100
+ "Output contains a YAML block with an 'id' field for each issue"
101
+ "Response declines because the pipeline already has unclaimed issues"
51
102
  ```
52
103
 
53
- snapeval activates automatically based on your prompt.
54
-
55
- ### What happens when you evaluate
104
+ **Script** — prefix with `script:`. Scripts live in `evals/scripts/`, receive the output directory as `$1`, and pass on exit code 0:
56
105
 
57
- 1. **Analyze** — snapeval reads your SKILL.md and reasons through behaviors, input dimensions, failure modes, and ambiguities
58
- 2. **View** — A browser-based viewer opens showing the analysis with proposed scenarios you can toggle, edit, and extend
59
- 3. **Confirm** — You review, make changes, and click "Confirm & Run" to export your plan
60
- 4. **Capture** — snapeval writes `evals.json` and runs the scenarios against your skill, saving baseline snapshots
106
+ ```
107
+ "script:validate-json-structure.sh"
108
+ ```
61
109
 
62
- After initial setup, use `check` to detect regressions and `approve` to accept intentional changes.
110
+ ## CLI reference
63
111
 
64
- ## CLI Reference
112
+ ### `eval`
65
113
 
66
- The CLI is the headless backend — useful for CI, scripting, and power users.
114
+ Run evals, grade assertions, compute benchmark.
67
115
 
68
- ```
69
- snapeval init [skill-dir] Generate test cases from SKILL.md
70
- snapeval capture [skill-dir] Run scenarios and save baseline snapshots
71
- snapeval check [skill-dir] Compare current output against baselines
72
- snapeval approve [skill-dir] Approve regressed scenarios as new baselines
73
- snapeval report [skill-dir] Write results with optional HTML viewer
74
- snapeval ideate [skill-dir] Open the interactive scenario ideation viewer
116
+ ```bash
117
+ npx snapeval eval [skill-dir] [options]
75
118
  ```
76
119
 
77
120
  | Flag | Description | Default |
78
121
  |------|-------------|---------|
79
- | `--adapter <name>` | Skill adapter | `copilot-cli` |
80
- | `--inference <name>` | Inference adapter | `auto` |
81
- | `--budget <amount>` | Spend cap in USD | `unlimited` |
82
- | `--runs <n>` | Baseline runs per scenario | `1` |
83
- | `--ci` | CI mode: exit 1 on regressions | off |
84
- | `--html` | Generate HTML report viewer | off |
85
- | `--scenario <ids>` | Comma-separated scenario IDs | all |
122
+ | `--harness <name>` | Harness adapter | `copilot-sdk` |
123
+ | `--inference <name>` | Inference adapter for grading | `auto` |
124
+ | `--workspace <path>` | Output directory | `../{skill_name}-workspace` |
125
+ | `--runs <n>` | Harness invocations per eval for statistical averaging | `1` |
126
+ | `--concurrency <n>` | Parallel eval cases (1-10) | `1` |
127
+ | `--only <ids>` | Run specific eval IDs (e.g. `--only 1,3,5`) | all |
128
+ | `--threshold <rate>` | Minimum pass rate 0-1 for exit code 0 | none |
129
+ | `--old-skill <path>` | Compare against old skill version | none |
86
130
  | `--verbose` | Verbose output | off |
87
131
 
88
- ## How It Works
132
+ ### `review`
89
133
 
90
- ```
91
- SKILL.md → AI analyzes skill → Interactive ideation viewer → Capture baselines
92
-
93
- Modify skill → Re-run scenarios → Compare via tiered pipeline
94
-
95
- Schema match? → PASS (free, instant)
96
- LLM Judge agrees? → PASS/REGRESSED
134
+ Run eval + generate HTML report + open in browser.
135
+
136
+ ```bash
137
+ npx snapeval review [skill-dir] [options]
97
138
  ```
98
139
 
99
- ### Comparison Pipeline
140
+ Same flags as `eval`, plus `--no-open` to skip opening the browser.
100
141
 
101
- | Tier | Method | Cost | When Used |
102
- |------|--------|------|-----------|
103
- | 1 | Schema check | Free | Structural skeleton matches |
104
- | 2 | LLM judge (order-swap) | Cheap | Schema differs, needs semantic comparison |
142
+ ### Exit codes
105
143
 
106
- Most stable skills are checked entirely at Tier 1 — $0.00 per run.
144
+ | Code | Meaning |
145
+ |------|---------|
146
+ | 0 | Success |
147
+ | 1 | Threshold not met (eval ran but pass rate below `--threshold`) |
148
+ | 2 | Config/input error (bad JSON, missing fields, invalid flags) |
149
+ | 3 | File not found (missing skill dir, evals.json, or script) |
150
+ | 4 | Runtime error (harness failure, grading failure, timeout) |
107
151
 
108
- ## Eval Format
152
+ ## Output artifacts
109
153
 
110
- snapeval follows the [agentskills.io evaluation standard](https://agentskills.io/skill-creation/evaluating-skills):
154
+ Each run creates an iteration directory:
111
155
 
112
156
  ```
113
- my-skill/
114
- ├── SKILL.md
115
- └── evals/
116
- ├── evals.json ← Test scenarios (AI-generated or from ideation)
117
- ├── analysis.json ← Skill analysis (behaviors, dimensions, gaps)
118
- ├── snapshots/ ← Captured baseline outputs
119
- └── results/
120
- └── iteration-N/
121
- ├── grading.json
157
+ workspace/
158
+ └── iteration-1/
159
+ ├── benchmark.json ← aggregate stats with delta
160
+ ├── SKILL.md.snapshot ← copy of skill used
161
+ └── eval-{slug}/
162
+ ├── with_skill/
163
+ │ ├── outputs/output.txt
164
+ │ ├── timing.json
165
 │ ├── grading.json
166
+ │ └── transcript.log
167
+ └── without_skill/
168
+ ├── outputs/output.txt
122
169
  ├── timing.json
123
- └── benchmark.json
170
+ └── grading.json
124
171
  ```
125
172
 
126
- ## In CI
173
+ **benchmark.json** includes metadata: `eval_count`, `eval_ids`, `skill_name`, `runs_per_eval`, `timestamp`.
127
174
 
128
- Commit your `evals.json` and `snapshots/` directory, then add a workflow:
175
+ ## CI integration
129
176
 
130
177
  ```yaml
131
178
  name: Skill Evaluation
@@ -140,22 +187,10 @@ jobs:
140
187
  with:
141
188
  node-version: 22
142
189
  - run: npm ci
143
- - run: npx snapeval check skills/my-skill --ci
190
+ - run: npx snapeval eval skills/my-skill --threshold 0.8 --runs 3
144
191
  ```
145
192
 
146
- ## Local Development
147
-
148
- ```bash
149
- git clone https://github.com/matantsach/snapeval.git
150
- cd snapeval && npm install
151
- npx tsx bin/snapeval.ts check <skill-path>
152
- ```
153
-
154
- Or load as a local plugin:
155
-
156
- ```bash
157
- copilot plugin install ./path/to/snapeval
158
- ```
193
+ Exit code 1 when pass rate falls below threshold — blocks the PR.
159
194
 
160
195
  ## Configuration
161
196
 
@@ -163,32 +198,37 @@ Create `snapeval.config.json` in your skill or project root:
163
198
 
164
199
  ```json
165
200
  {
166
- "adapter": "copilot-cli",
201
+ "harness": "copilot-sdk",
167
202
  "inference": "auto",
168
- "runs": 3,
169
- "budget": "unlimited"
203
+ "workspace": "../{skill_name}-workspace",
204
+ "runs": 1,
205
+ "concurrency": 1
170
206
  }
171
207
  ```
172
208
 
173
- CLI flags override config file values.
209
+ Resolution order: defaults → project config → skill-dir config → CLI flags.
174
210
 
175
- ## Architecture
211
+ ## Harness adapters
176
212
 
177
- Three surfaces over a shared core engine:
213
+ | Adapter | Description | Default |
214
+ |---------|-------------|---------|
215
+ | `copilot-sdk` | Programmatic via `@github/copilot-sdk` with native skill loading | yes |
216
+ | `copilot-cli` | Shells out to `copilot` CLI binary | no |
178
217
 
179
- - **Plugin** (SKILL.md) — Interactive product. AI handles everything.
180
- - **CLI** (`npx snapeval`) — Headless backend for CI and power users.
181
- - **GitHub Action** — CI wrapper (planned).
218
+ The SDK harness loads skills natively via `skillDirectories`, captures full transcripts, and extracts real token counts from `assistant.usage` events.
182
219
 
183
- Adapter layers for platform independence:
220
+ ## Inference adapters
184
221
 
185
- - **SkillAdapter** — How to invoke a skill (Copilot CLI, others planned)
186
- - **InferenceAdapter** — Where to get LLM capabilities (Copilot gpt-5-mini, GitHub Models API)
187
- - **ReportAdapter** — How to present results (terminal, JSON, HTML viewer)
222
+ | Adapter | Description |
223
+ |---------|-------------|
224
+ | `auto` | Copilot CLI if available, else GitHub Models API |
225
+ | `copilot` | Copilot CLI (`copilot` binary) |
226
+ | `copilot-sdk` | `@github/copilot-sdk` programmatic |
227
+ | `github-models` | GitHub Models API (requires `GITHUB_TOKEN`) |
188
228
 
189
229
  ## Contributing
190
230
 
191
- See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
231
+ See [CONTRIBUTING.md](CONTRIBUTING.md).
192
232
 
193
233
  ## License
194
234
 
package/bin/snapeval.ts CHANGED
@@ -1,4 +1,13 @@
1
1
  #!/usr/bin/env tsx
2
+
3
+ // Suppress Node.js ExperimentalWarning (e.g., SQLite) from polluting output
4
+ const _origEmit = process.emit;
5
+ // @ts-ignore — override to filter warnings
6
+ process.emit = function (event: string, ...args: any[]) {
7
+ if (event === 'warning' && args[0]?.name === 'ExperimentalWarning') return false;
8
+ return _origEmit.apply(process, [event, ...args] as any);
9
+ };
10
+
2
11
  import { Command } from 'commander';
3
12
  import { resolveConfig } from '../src/config.js';
4
13
  import { resolveInference } from '../src/adapters/inference/resolve.js';
@@ -1,4 +1,12 @@
1
1
  #!/usr/bin/env tsx
2
+ // Suppress Node.js ExperimentalWarning (e.g., SQLite) from polluting output
3
+ const _origEmit = process.emit;
4
+ // @ts-ignore — override to filter warnings
5
+ process.emit = function (event, ...args) {
6
+ if (event === 'warning' && args[0]?.name === 'ExperimentalWarning')
7
+ return false;
8
+ return _origEmit.apply(process, [event, ...args]);
9
+ };
2
10
  import { Command } from 'commander';
3
11
  import { resolveConfig } from '../src/config.js';
4
12
  import { resolveInference } from '../src/adapters/inference/resolve.js';
@@ -1 +1 @@
1
- {"version":3,"file":"snapeval.js","sourceRoot":"","sources":["../../bin/snapeval.ts"],"names":[],"mappings":";AACA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AACjD,OAAO,EAAE,gBAAgB,EAAE,MAAM,sCAAsC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,oCAAoC,CAAC;AACpE,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,MAAM,2BAA2B,CAAC;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,oCAAoC,CAAC;AACtE,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AACjD,OAAO,EAAE,UAAU,EAAE,MAAM,uCAAuC,CAAC;AACnE,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAElC,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,UAAU,CAAC;KAChB,WAAW,CAAC,wDAAwD,CAAC;KACrE,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB,eAAe;AACf,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,qEAAqE,CAAC;KAClF,MAAM,CAAC,qBAAqB,EAAE,gBAAgB,CAAC;KAC/C,MAAM,CAAC,yBAAyB,EAAE,0BAA0B,CAAC;KAC7D,MAAM,CAAC,oBAAoB,EAAE,qBAAqB,CAAC;KACnD,MAAM,CAAC,YAAY,EAAE,4CAA4C,EAAE,GAAG,CAAC;KACvE,MAAM,CAAC,mBAAmB,EAAE,gDAAgD,EAAE,GAAG,CAAC;KAClF,MAAM,CAAC,cAAc,EAAE,iEAAiE,CAAC;KACzF,MAAM,CAAC,oBAAoB,EAAE,6EAA6E,CAAC;KAC3G,MAAM,CAAC,oBAAoB,EAAE,uDAAuD,CAAC;KACrF,MAAM,CAAC,WAAW,EAAE,gBAAgB,CAAC;KACrC,QAAQ,CAAC,aAAa,EAAE,yBAAyB,EAAE,OAAO,CAAC,GAAG,EAAE,CAAC;KACjE,MAAM,CAAC,KAAK,EAAE,QAAgB,EAAE,IAAsC,EAAE,EAAE;IACzE,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;QACzC,MAAM,MAAM,GAAG,aAAa,CAC1B;YACE,OAAO,EAAE,IAAI,CAAC,OAAiB;YAC/B,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAc,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;YAC/D,WAAW,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAqB,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;SACrF,EACD,OAAO,CAAC,GAAG,EAAE,EAAE,SAAS,CACzB,CAAC;QACF,MAAM,OAAO,GAAG,cAAc,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/C,MAAM,SAAS,GAAG,gBAAgB,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAErD,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI;YACpB,CAAC,CAAE,IAAI,CAAC,IAAe,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC;YACrE,C
AAC,CAAC,SAAS,CAAC;QACd,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS;YAC9B,CAAC,CAAC,UAAU,CAAC,IAAI,CAAC,SAAmB,CAAC;YACtC,CAAC,CAAC,SAAS,CAAC;QAEd,MAAM,OAAO,GAAG,MAAM,WAAW,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,EAAE;YAC/D,SAAS,EAAE,MAAM,CAAC,SAAS;YAC3B,IAAI,EAAE,MAAM,CAAC,IAAI;YACjB,WAAW,EAAE,MAAM,CAAC,WAAW;YAC/B,IAAI;YACJ,SAAS;YACT,QAAQ,EAAE,IAAI,CAAC,QAA8B;SAC9C,CAAC,CAAC;QAEH,MAAM,QAAQ,GAAG,IAAI,gBAAgB,EAAE,CAAC;QACxC,MAAM,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/B,OAAO,CAAC,GAAG,CAAC,cAAc,OAAO,CAAC,YAAY,EAAE,CAAC,CAAC;QAClD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,iEAAiE;QACjE,IAAI,GAAG,CAAC,OAAO,EAAE,CAAC;YAChB,MAAM,QAAQ,GAAG,IAAI,gBAAgB,EAAE,CAAC;YACxC,MAAM,QAAQ,CAAC,MAAM,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;YACnC,OAAO,CAAC,GAAG,CAAC,cAAc,GAAG,CAAC,OAAO,CAAC,YAAY,EAAE,CAAC,CAAC;QACxD,CAAC;QACD,WAAW,CAAC,GAAG,CAAC,CAAC;IACnB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,iBAAiB;AACjB,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,mDAAmD,CAAC;KAChE,MAAM,CAAC,qBAAqB,EAAE,gBAAgB,CAAC;KAC/C,MAAM,CAAC,yBAAyB,EAAE,0BAA0B,CAAC;KAC7D,MAAM,CAAC,oBAAoB,EAAE,qBAAqB,CAAC;KACnD,MAAM,CAAC,YAAY,EAAE,4CAA4C,EAAE,GAAG,CAAC;KACvE,MAAM,CAAC,mBAAmB,EAAE,gDAAgD,EAAE,GAAG,CAAC;KAClF,MAAM,CAAC,oBAAoB,EAAE,uDAAuD,CAAC;KACrF,MAAM,CAAC,WAAW,EAAE,qBAAqB,CAAC;KAC1C,MAAM,CAAC,WAAW,EAAE,gBAAgB,CAAC;KACrC,QAAQ,CAAC,aAAa,EAAE,yBAAyB,EAAE,OAAO,CAAC,GAAG,EAAE,CAAC;KACjE,MAAM,CAAC,KAAK,EAAE,QAAgB,EAAE,IAAsC,EAAE,EAAE;IACzE,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;QACzC,MAAM,MAAM,GAAG,aAAa,CAC1B;YACE,OAAO,EAAE,IAAI,CAAC,OAAiB;YAC/B,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAc,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;YAC/D,WAAW,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAqB,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;SACrF,EACD,OAAO,CAAC,GAAG,EAAE,EAAE,SAAS,CACzB,CAAC;QACF,MAAM,OAAO,GAAG,cAAc,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/C,MAAM,SAAS,GAAG,gBAAgB,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAErD,MAAM,aAAa,CAAC,SAAS,EA
AE,OAAO,EAAE,SAAS,EAAE;YACjD,SAAS,EAAE,MAAM,CAAC,SAAS;YAC3B,IAAI,EAAE,MAAM,CAAC,IAAI;YACjB,WAAW,EAAE,MAAM,CAAC,WAAW;YAC/B,QAAQ,EAAE,IAAI,CAAC,QAA8B;YAC7C,MAAM,EAAE,IAAI,CAAC,IAAI,KAAK,KAAK;SAC5B,CAAC,CAAC;QACH,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QAAC,WAAW,CAAC,GAAG,CAAC,CAAC;IAAC,CAAC;AACrC,CAAC,CAAC,CAAC;AAEL,uDAAuD;AACvD,OAAO,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,EAAE,GAAG,UAAU,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;AAE5D,SAAS,WAAW,CAAC,GAAY;IAC/B,IAAI,GAAG,YAAY,aAAa,EAAE,CAAC;QACjC,OAAO,CAAC,KAAK,CAAC,UAAU,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;QACvC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,IAAI,CAAC,CAAC,CAAC;IAClC,CAAC;IACD,IAAI,GAAG,YAAY,KAAK,EAAE,CAAC;QACzB,OAAO,CAAC,KAAK,CAAC,UAAU,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;QACvC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IACD,OAAO,CAAC,KAAK,CAAC,4BAA4B,CAAC,CAAC;IAC5C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC;AAED,OAAO,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC"}
1
+ {"version":3,"file":"snapeval.js","sourceRoot":"","sources":["../../bin/snapeval.ts"],"names":[],"mappings":";AAEA,4EAA4E;AAC5E,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC;AAC/B,2CAA2C;AAC3C,OAAO,CAAC,IAAI,GAAG,UAAU,KAAa,EAAE,GAAG,IAAW;IACpD,IAAI,KAAK,KAAK,SAAS,IAAI,IAAI,CAAC,CAAC,CAAC,EAAE,IAAI,KAAK,qBAAqB;QAAE,OAAO,KAAK,CAAC;IACjF,OAAO,SAAS,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,GAAG,IAAI,CAAQ,CAAC,CAAC;AAC3D,CAAC,CAAC;AAEF,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AACjD,OAAO,EAAE,gBAAgB,EAAE,MAAM,sCAAsC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,oCAAoC,CAAC;AACpE,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,MAAM,2BAA2B,CAAC;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,oCAAoC,CAAC;AACtE,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AACjD,OAAO,EAAE,UAAU,EAAE,MAAM,uCAAuC,CAAC;AACnE,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAElC,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,UAAU,CAAC;KAChB,WAAW,CAAC,wDAAwD,CAAC;KACrE,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB,eAAe;AACf,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,qEAAqE,CAAC;KAClF,MAAM,CAAC,qBAAqB,EAAE,gBAAgB,CAAC;KAC/C,MAAM,CAAC,yBAAyB,EAAE,0BAA0B,CAAC;KAC7D,MAAM,CAAC,oBAAoB,EAAE,qBAAqB,CAAC;KACnD,MAAM,CAAC,YAAY,EAAE,4CAA4C,EAAE,GAAG,CAAC;KACvE,MAAM,CAAC,mBAAmB,EAAE,gDAAgD,EAAE,GAAG,CAAC;KAClF,MAAM,CAAC,cAAc,EAAE,iEAAiE,CAAC;KACzF,MAAM,CAAC,oBAAoB,EAAE,6EAA6E,CAAC;KAC3G,MAAM,CAAC,oBAAoB,EAAE,uDAAuD,CAAC;KACrF,MAAM,CAAC,WAAW,EAAE,gBAAgB,CAAC;KACrC,QAAQ,CAAC,aAAa,EAAE,yBAAyB,EAAE,OAAO,CAAC,GAAG,EAAE,CAAC;KACjE,MAAM,CAAC,KAAK,EAAE,QAAgB,EAAE,IAAsC,EAAE,EAAE;IACzE,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;QACzC,MAAM,MAAM,GAAG,aAAa,CAC1B;YACE,OAAO,EAAE,IAAI,CAAC,OAAiB;YAC/B,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAc,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;YAC/D,WAAW,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAqB,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;SACrF,EACD,OAAO,CAAC,GAAG,EAAE,EAAE,SAAS,CACzB,
CAAC;QACF,MAAM,OAAO,GAAG,cAAc,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/C,MAAM,SAAS,GAAG,gBAAgB,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAErD,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI;YACpB,CAAC,CAAE,IAAI,CAAC,IAAe,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC;YACrE,CAAC,CAAC,SAAS,CAAC;QACd,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS;YAC9B,CAAC,CAAC,UAAU,CAAC,IAAI,CAAC,SAAmB,CAAC;YACtC,CAAC,CAAC,SAAS,CAAC;QAEd,MAAM,OAAO,GAAG,MAAM,WAAW,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,EAAE;YAC/D,SAAS,EAAE,MAAM,CAAC,SAAS;YAC3B,IAAI,EAAE,MAAM,CAAC,IAAI;YACjB,WAAW,EAAE,MAAM,CAAC,WAAW;YAC/B,IAAI;YACJ,SAAS;YACT,QAAQ,EAAE,IAAI,CAAC,QAA8B;SAC9C,CAAC,CAAC;QAEH,MAAM,QAAQ,GAAG,IAAI,gBAAgB,EAAE,CAAC;QACxC,MAAM,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/B,OAAO,CAAC,GAAG,CAAC,cAAc,OAAO,CAAC,YAAY,EAAE,CAAC,CAAC;QAClD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,iEAAiE;QACjE,IAAI,GAAG,CAAC,OAAO,EAAE,CAAC;YAChB,MAAM,QAAQ,GAAG,IAAI,gBAAgB,EAAE,CAAC;YACxC,MAAM,QAAQ,CAAC,MAAM,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;YACnC,OAAO,CAAC,GAAG,CAAC,cAAc,GAAG,CAAC,OAAO,CAAC,YAAY,EAAE,CAAC,CAAC;QACxD,CAAC;QACD,WAAW,CAAC,GAAG,CAAC,CAAC;IACnB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,iBAAiB;AACjB,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,mDAAmD,CAAC;KAChE,MAAM,CAAC,qBAAqB,EAAE,gBAAgB,CAAC;KAC/C,MAAM,CAAC,yBAAyB,EAAE,0BAA0B,CAAC;KAC7D,MAAM,CAAC,oBAAoB,EAAE,qBAAqB,CAAC;KACnD,MAAM,CAAC,YAAY,EAAE,4CAA4C,EAAE,GAAG,CAAC;KACvE,MAAM,CAAC,mBAAmB,EAAE,gDAAgD,EAAE,GAAG,CAAC;KAClF,MAAM,CAAC,oBAAoB,EAAE,uDAAuD,CAAC;KACrF,MAAM,CAAC,WAAW,EAAE,qBAAqB,CAAC;KAC1C,MAAM,CAAC,WAAW,EAAE,gBAAgB,CAAC;KACrC,QAAQ,CAAC,aAAa,EAAE,yBAAyB,EAAE,OAAO,CAAC,GAAG,EAAE,CAAC;KACjE,MAAM,CAAC,KAAK,EAAE,QAAgB,EAAE,IAAsC,EAAE,EAAE;IACzE,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;QACzC,MAAM,MAAM,GAAG,aAAa,CAC1B;YACE,OAAO,EAAE,IAAI,CAAC,OAAiB;YAC/B,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAc,EAAE,EAAE,CAAC,CAAC,CAAC,CAA
C,SAAS;YAC/D,WAAW,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAqB,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;SACrF,EACD,OAAO,CAAC,GAAG,EAAE,EAAE,SAAS,CACzB,CAAC;QACF,MAAM,OAAO,GAAG,cAAc,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/C,MAAM,SAAS,GAAG,gBAAgB,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAErD,MAAM,aAAa,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,EAAE;YACjD,SAAS,EAAE,MAAM,CAAC,SAAS;YAC3B,IAAI,EAAE,MAAM,CAAC,IAAI;YACjB,WAAW,EAAE,MAAM,CAAC,WAAW;YAC/B,QAAQ,EAAE,IAAI,CAAC,QAA8B;YAC7C,MAAM,EAAE,IAAI,CAAC,IAAI,KAAK,KAAK;SAC5B,CAAC,CAAC;QACH,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QAAC,WAAW,CAAC,GAAG,CAAC,CAAC;IAAC,CAAC;AACrC,CAAC,CAAC,CAAC;AAEL,uDAAuD;AACvD,OAAO,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,EAAE,GAAG,UAAU,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;AAE5D,SAAS,WAAW,CAAC,GAAY;IAC/B,IAAI,GAAG,YAAY,aAAa,EAAE,CAAC;QACjC,OAAO,CAAC,KAAK,CAAC,UAAU,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;QACvC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,IAAI,CAAC,CAAC,CAAC;IAClC,CAAC;IACD,IAAI,GAAG,YAAY,KAAK,EAAE,CAAC;QACzB,OAAO,CAAC,KAAK,CAAC,UAAU,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;QACvC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IACD,OAAO,CAAC,KAAK,CAAC,4BAA4B,CAAC,CAAC;IAC5C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC;AAED,OAAO,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC"}
@@ -25,7 +25,9 @@ export async function getClient() {
25
25
  if (!CopilotClient) {
26
26
  throw new Error('Could not find CopilotClient export in @github/copilot-sdk. The package may have changed its API.');
27
27
  }
28
- clientInstance = new CopilotClient({ logLevel: 'none' });
28
+ // Suppress ExperimentalWarning (e.g., SQLite) in the spawned CLI subprocess
29
+ const env = { ...process.env, NODE_OPTIONS: [process.env.NODE_OPTIONS, '--no-warnings'].filter(Boolean).join(' ') };
30
+ clientInstance = new CopilotClient({ logLevel: 'none', env });
29
31
  await clientInstance.start();
30
32
  clientStarted = true;
31
33
  return clientInstance;
@@ -1 +1 @@
1
- {"version":3,"file":"copilot-sdk-client.js","sourceRoot":"","sources":["../../../src/adapters/copilot-sdk-client.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAElC,iEAAiE;AACjE,4DAA4D;AAC5D,IAAI,cAAc,GAAQ,IAAI,CAAC;AAC/B,IAAI,aAAa,GAAG,KAAK,CAAC;AAE1B,MAAM,CAAC,KAAK,UAAU,SAAS;IAC7B,IAAI,cAAc,IAAI,aAAa;QAAE,OAAO,cAAc,CAAC;IAE3D,IAAI,GAAQ,CAAC;IACb,IAAI,CAAC;QACH,+DAA+D;QAC/D,GAAG,GAAG,MAAM,MAAM,CAAC,qBAAqB,CAAC,CAAC;IAC5C,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CACb,mGAAmG,CACpG,CAAC;IACJ,CAAC;IAED,MAAM,aAAa,GAAG,GAAG,CAAC,aAAa,IAAI,GAAG,CAAC,OAAO,EAAE,aAAa,CAAC;IACtE,IAAI,CAAC,aAAa,EAAE,CAAC;QACnB,MAAM,IAAI,KAAK,CACb,mGAAmG,CACpG,CAAC;IACJ,CAAC;IAED,cAAc,GAAG,IAAI,aAAa,CAAC,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC,CAAC;IACzD,MAAM,cAAc,CAAC,KAAK,EAAE,CAAC;IAC7B,aAAa,GAAG,IAAI,CAAC;IACrB,OAAO,cAAc,CAAC;AACxB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,UAAU;IAC9B,IAAI,cAAc,IAAI,aAAa,EAAE,CAAC;QACpC,MAAM,cAAc,CAAC,IAAI,EAAE,CAAC;QAC5B,aAAa,GAAG,KAAK,CAAC;QACtB,cAAc,GAAG,IAAI,CAAC;IACxB,CAAC;AACH,CAAC;AAED,MAAM,UAAU,cAAc;IAC5B,iEAAiE;IACjE,mEAAmE;IACnE,IAAI,GAAG,GAAG,OAAO,CAAC,GAAG,EAAE,CAAC;IACxB,OAAO,IAAI,EAAE,CAAC;QACZ,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,cAAc,EAAE,SAAS,EAAE,aAAa,EAAE,cAAc,CAAC,CAAC;QAC3F,IAAI,EAAE,CAAC,UAAU,CAAC,SAAS,CAAC;YAAE,OAAO,IAAI,CAAC;QAC1C,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QACjC,IAAI,MAAM,KAAK,GAAG;YAAE,MAAM;QAC1B,GAAG,GAAG,MAAM,CAAC;IACf,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC"}
1
+ {"version":3,"file":"copilot-sdk-client.js","sourceRoot":"","sources":["../../../src/adapters/copilot-sdk-client.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAElC,iEAAiE;AACjE,4DAA4D;AAC5D,IAAI,cAAc,GAAQ,IAAI,CAAC;AAC/B,IAAI,aAAa,GAAG,KAAK,CAAC;AAE1B,MAAM,CAAC,KAAK,UAAU,SAAS;IAC7B,IAAI,cAAc,IAAI,aAAa;QAAE,OAAO,cAAc,CAAC;IAE3D,IAAI,GAAQ,CAAC;IACb,IAAI,CAAC;QACH,+DAA+D;QAC/D,GAAG,GAAG,MAAM,MAAM,CAAC,qBAAqB,CAAC,CAAC;IAC5C,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CACb,mGAAmG,CACpG,CAAC;IACJ,CAAC;IAED,MAAM,aAAa,GAAG,GAAG,CAAC,aAAa,IAAI,GAAG,CAAC,OAAO,EAAE,aAAa,CAAC;IACtE,IAAI,CAAC,aAAa,EAAE,CAAC;QACnB,MAAM,IAAI,KAAK,CACb,mGAAmG,CACpG,CAAC;IACJ,CAAC;IAED,4EAA4E;IAC5E,MAAM,GAAG,GAAG,EAAE,GAAG,OAAO,CAAC,GAAG,EAAE,YAAY,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,YAAY,EAAE,eAAe,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;IACpH,cAAc,GAAG,IAAI,aAAa,CAAC,EAAE,QAAQ,EAAE,MAAM,EAAE,GAAG,EAAE,CAAC,CAAC;IAC9D,MAAM,cAAc,CAAC,KAAK,EAAE,CAAC;IAC7B,aAAa,GAAG,IAAI,CAAC;IACrB,OAAO,cAAc,CAAC;AACxB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,UAAU;IAC9B,IAAI,cAAc,IAAI,aAAa,EAAE,CAAC;QACpC,MAAM,cAAc,CAAC,IAAI,EAAE,CAAC;QAC5B,aAAa,GAAG,KAAK,CAAC;QACtB,cAAc,GAAG,IAAI,CAAC;IACxB,CAAC;AACH,CAAC;AAED,MAAM,UAAU,cAAc;IAC5B,iEAAiE;IACjE,mEAAmE;IACnE,IAAI,GAAG,GAAG,OAAO,CAAC,GAAG,EAAE,CAAC;IACxB,OAAO,IAAI,EAAE,CAAC;QACZ,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,cAAc,EAAE,SAAS,EAAE,aAAa,EAAE,cAAc,CAAC,CAAC;QAC3F,IAAI,EAAE,CAAC,UAAU,CAAC,SAAS,CAAC;YAAE,OAAO,IAAI,CAAC;QAC1C,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QACjC,IAAI,MAAM,KAAK,GAAG;YAAE,MAAM;QAC1B,GAAG,GAAG,MAAM,CAAC;IACf,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC"}
@@ -29,10 +29,10 @@ function loadPreviousIteration(iterationDir) {
29
29
  }
30
30
  }
31
31
  function evalLabel(run) {
32
- // Use expected_output or slug as a readable label instead of truncated prompt
32
+ if (run.label)
33
+ return run.label;
33
34
  if (run.slug && run.slug !== `${run.evalId}`)
34
35
  return run.slug;
35
- // Truncate prompt but show first meaningful line
36
36
  const firstLine = run.prompt.split('\n')[0].slice(0, 60);
37
37
  return firstLine;
38
38
  }
@@ -1 +1 @@
1
- {"version":3,"file":"terminal.js","sourceRoot":"","sources":["../../../../src/adapters/report/terminal.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,KAAK,MAAM,OAAO,CAAC;AAQ1B,SAAS,qBAAqB,CAAC,YAAoB;IACjD,MAAM,YAAY,GAAG,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC;IAChD,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAC;IAChD,MAAM,UAAU,GAAG,QAAQ,CAAC,WAAW,CAAC,OAAO,CAAC,YAAY,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;IACvE,IAAI,KAAK,CAAC,UAAU,CAAC,IAAI,UAAU,IAAI,CAAC;QAAE,OAAO,IAAI,CAAC;IACtD,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,aAAa,UAAU,GAAG,CAAC,EAAE,CAAC,CAAC;IACvE,MAAM,iBAAiB,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,gBAAgB,CAAC,CAAC;IAC/D,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,iBAAiB,CAAC;QAAE,OAAO,IAAI,CAAC;IACnD,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,iBAAiB,EAAE,OAAO,CAAC,CAAC,CAAC;QAC1E,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAuE,CAAC;QAChG,MAAM,QAAQ,GAAG,EAAE,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC;QAC5E,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAC/B,MAAM,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,cAAc,CAAC,CAAC;YACzE,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,eAAe,EAAE,cAAc,CAAC,CAAC;YAC7E,MAAM,EAAE,GAAG,EAAE,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;YAC5F,MAAM,GAAG,GAAG,EAAE,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;YAC/F,QAAQ,CAAC,GAAG,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,YAAY,EAAE,GAAG,EAAE,CAAC,CAAC;QAC9D,CAAC;QACD,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,CAAC;IACjC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,SAAS,SAAS,CAAC,GAAqD;IACtE,8EAA8E;IAC9E,IAAI,GAAG,CAAC,IAAI,IAAI,GAAG,CAAC,IAAI,KAAK,GAAG,GAAG,CAAC,MAAM,EAAE;QAAE,OAAO,GAAG,CAAC,IAAI,CAAC;IAC9D,iDAAiD;IACjD,MAAM,SAAS,GAAG,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,
CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IACzD,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,MAAM,OAAO,gBAAgB;IAClB,IAAI,GAAG,UAAU,CAAC;IAE3B,KAAK,CAAC,MAAM,CAAC,OAAoB;QAC/B,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS,EAAE,GAAG,OAAO,CAAC;QAEnD,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,gBAAgB,SAAS,EAAE,CAAC,CAAC,CAAC;QACrD,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,+CAA+C,CAAC,CAAC,CAAC;QACxE,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAEvC,MAAM,IAAI,GAAG,qBAAqB,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC;QAEzD,KAAK,MAAM,GAAG,IAAI,QAAQ,EAAE,CAAC;YAC3B,MAAM,SAAS,GAAG,GAAG,CAAC,SAAS,CAAC,OAAO,CAAC;YACxC,MAAM,MAAM,GAAG,SAAS,EAAE,OAAO,CAAC,SAAS,CAAC;YAC5C,MAAM,OAAO,GAAG,GAAG,CAAC,YAAY,CAAC,OAAO,EAAE,OAAO,CAAC,SAAS,CAAC;YAC5D,MAAM,OAAO,GAAG,MAAM,KAAK,SAAS,CAAC,CAAC,CAAC,GAAG,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC;YAC/E,MAAM,QAAQ,GAAG,OAAO,KAAK,SAAS,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC;YAClF,MAAM,OAAO,GAAG,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC;YACrF,MAAM,SAAS,GAAG,CAAC,GAAG,CAAC,SAAS,CAAC,MAAM,CAAC,WAAW,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;YAEvE,8CAA8C;YAC9C,IAAI,YAAY,GAAG,EAAE,CAAC;YACtB,IAAI,IAAI,EAAE,CAAC;gBACT,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,QAAQ,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;gBAC1D,MAAM,QAAQ,GAAG,WAAW,EAAE,SAAS,EAAE,OAAO,CAAC,SAAS,CAAC;gBAC3D,IAAI,QAAQ,KAAK,SAAS,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;oBACnD,MAAM,MAAM,GAAG,MAAM,GAAG,QAAQ,CAAC;oBACjC,IAAI,MAAM,KAAK,CAAC,EAAE,CAAC;wBACjB,MAAM,KAAK,GAAG,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;wBAC7D,YAAY,GAAG,IAAI,KAAK,QAAQ,CAAC,QAAQ,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC;oBACjE,CAAC;gBACH,CAAC;YACH,CAAC;YAED,OAAO,CAAC,GAAG,CAAC,KAAK,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,MAAM,EAAE,CAAC,IAAI,SAAS,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAC
nE,OAAO,CAAC,GAAG,CAAC,cAAc,OAAO,CAAC,OAAO,CAAC,GAAG,YAAY,gBAAgB,QAAQ,MAAM,SAAS,GAAG,CAAC,CAAC;YAErG,gCAAgC;YAChC,IAAI,SAAS,EAAE,CAAC;gBACd,MAAM,MAAM,GAAG,SAAS,CAAC,iBAAiB,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;gBACpE,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;oBACvB,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;oBAC9C,IAAI,CAAC,CAAC,QAAQ,EAAE,CAAC;wBACf,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;oBAClE,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAEvC,MAAM,EAAE,GAAG,SAAS,CAAC,WAAW,CAAC,UAAU,CAAC;QAC5C,MAAM,GAAG,GAAG,SAAS,CAAC,WAAW,CAAC,aAAa,CAAC;QAChD,MAAM,KAAK,GAAG,SAAS,CAAC,WAAW,CAAC,KAAK,CAAC;QAC1C,MAAM,UAAU,GAAG,KAAK,CAAC,SAAS,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,SAAS,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC;QAEnG,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC;QACpC,OAAO,CAAC,GAAG,CAAC,yBAAyB,CAAC,EAAE,CAAC,SAAS,CAAC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;QAC9E,OAAO,CAAC,GAAG,CAAC,yBAAyB,CAAC,GAAG,CAAC,SAAS,CAAC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;QAC/E,OAAO,CAAC,GAAG,CAAC,yBAAyB,UAAU,CAAC,GAAG,KAAK,CAAC,SAAS,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,KAAK,CAAC,SAAS,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QAE9H,IAAI,IAAI,EAAE,CAAC;YACT,MAAM,QAAQ,GAAG,IAAI,CAAC,SAAS,CAAC,WAAW,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,CAAC;YACtE,MAAM,QAAQ,GAAG,EAAE,CAAC,SAAS,CAAC,IAAI,CAAC;YACnC,MAAM,MAAM,GAAG,QAAQ,GAAG,QAAQ,CAAC;YACnC,MAAM,WAAW,GAAG,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC;YAClF,OAAO,CAAC,GAAG,CAAC,yBAAyB,WAAW,CAAC,GAAG,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,OA
AO,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,QAAQ,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YAEnJ,gCAAgC;YAChC,MAAM,aAAa,GAAG,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC;YACzC,MAAM,aAAa,GAAG,QAAQ,CAAC,MAAM,CAAC;YACtC,IAAI,aAAa,KAAK,aAAa,EAAE,CAAC;gBACpC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,6BAA6B,aAAa,MAAM,aAAa,SAAS,CAAC,CAAC,CAAC;YACjG,CAAC;QACH,CAAC;IACH,CAAC;CACF"}
1
+ {"version":3,"file":"terminal.js","sourceRoot":"","sources":["../../../../src/adapters/report/terminal.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,KAAK,MAAM,OAAO,CAAC;AAQ1B,SAAS,qBAAqB,CAAC,YAAoB;IACjD,MAAM,YAAY,GAAG,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC;IAChD,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAC;IAChD,MAAM,UAAU,GAAG,QAAQ,CAAC,WAAW,CAAC,OAAO,CAAC,YAAY,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;IACvE,IAAI,KAAK,CAAC,UAAU,CAAC,IAAI,UAAU,IAAI,CAAC;QAAE,OAAO,IAAI,CAAC;IACtD,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,aAAa,UAAU,GAAG,CAAC,EAAE,CAAC,CAAC;IACvE,MAAM,iBAAiB,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,gBAAgB,CAAC,CAAC;IAC/D,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,iBAAiB,CAAC;QAAE,OAAO,IAAI,CAAC;IACnD,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,iBAAiB,EAAE,OAAO,CAAC,CAAC,CAAC;QAC1E,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAuE,CAAC;QAChG,MAAM,QAAQ,GAAG,EAAE,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC;QAC5E,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAC/B,MAAM,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,cAAc,CAAC,CAAC;YACzE,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,eAAe,EAAE,cAAc,CAAC,CAAC;YAC7E,MAAM,EAAE,GAAG,EAAE,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;YAC5F,MAAM,GAAG,GAAG,EAAE,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;YAC/F,QAAQ,CAAC,GAAG,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,YAAY,EAAE,GAAG,EAAE,CAAC,CAAC;QAC9D,CAAC;QACD,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,CAAC;IACjC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,SAAS,SAAS,CAAC,GAAqE;IACtF,IAAI,GAAG,CAAC,KAAK;QAAE,OAAO,GAAG,CAAC,KAAK,CAAC;IAChC,IAAI,GAAG,CAAC,IAAI,IAAI,GAAG,CAAC,IAAI,KAAK,GAAG,GAAG,CAAC,MAAM,EAAE;QAAE,OAAO,GAAG,CAAC,IAAI,CAAC;IAC9D,MAAM,SAAS,GAAG,GAAG,CAAC,MAAM,CAAC,
KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IACzD,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,MAAM,OAAO,gBAAgB;IAClB,IAAI,GAAG,UAAU,CAAC;IAE3B,KAAK,CAAC,MAAM,CAAC,OAAoB;QAC/B,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS,EAAE,GAAG,OAAO,CAAC;QAEnD,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,gBAAgB,SAAS,EAAE,CAAC,CAAC,CAAC;QACrD,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,+CAA+C,CAAC,CAAC,CAAC;QACxE,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAEvC,MAAM,IAAI,GAAG,qBAAqB,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC;QAEzD,KAAK,MAAM,GAAG,IAAI,QAAQ,EAAE,CAAC;YAC3B,MAAM,SAAS,GAAG,GAAG,CAAC,SAAS,CAAC,OAAO,CAAC;YACxC,MAAM,MAAM,GAAG,SAAS,EAAE,OAAO,CAAC,SAAS,CAAC;YAC5C,MAAM,OAAO,GAAG,GAAG,CAAC,YAAY,CAAC,OAAO,EAAE,OAAO,CAAC,SAAS,CAAC;YAC5D,MAAM,OAAO,GAAG,MAAM,KAAK,SAAS,CAAC,CAAC,CAAC,GAAG,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC;YAC/E,MAAM,QAAQ,GAAG,OAAO,KAAK,SAAS,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC;YAClF,MAAM,OAAO,GAAG,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC;YACrF,MAAM,SAAS,GAAG,CAAC,GAAG,CAAC,SAAS,CAAC,MAAM,CAAC,WAAW,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;YAEvE,8CAA8C;YAC9C,IAAI,YAAY,GAAG,EAAE,CAAC;YACtB,IAAI,IAAI,EAAE,CAAC;gBACT,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,QAAQ,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;gBAC1D,MAAM,QAAQ,GAAG,WAAW,EAAE,SAAS,EAAE,OAAO,CAAC,SAAS,CAAC;gBAC3D,IAAI,QAAQ,KAAK,SAAS,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;oBACnD,MAAM,MAAM,GAAG,MAAM,GAAG,QAAQ,CAAC;oBACjC,IAAI,MAAM,KAAK,CAAC,EAAE,CAAC;wBACjB,MAAM,KAAK,GAAG,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;wBAC7D,YAAY,GAAG,IAAI,KAAK,QAAQ,CAAC,QAAQ,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC;oBACjE,CAAC;gBACH,CAAC;YACH,CAAC;YAED,OAAO,CAAC,GAAG,CAAC,KAAK,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,MAAM,EAAE,CAAC,IAAI,SAAS,CAA
C,GAAG,CAAC,EAAE,CAAC,CAAC;YACnE,OAAO,CAAC,GAAG,CAAC,cAAc,OAAO,CAAC,OAAO,CAAC,GAAG,YAAY,gBAAgB,QAAQ,MAAM,SAAS,GAAG,CAAC,CAAC;YAErG,gCAAgC;YAChC,IAAI,SAAS,EAAE,CAAC;gBACd,MAAM,MAAM,GAAG,SAAS,CAAC,iBAAiB,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;gBACpE,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;oBACvB,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;oBAC9C,IAAI,CAAC,CAAC,QAAQ,EAAE,CAAC;wBACf,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;oBAClE,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAEvC,MAAM,EAAE,GAAG,SAAS,CAAC,WAAW,CAAC,UAAU,CAAC;QAC5C,MAAM,GAAG,GAAG,SAAS,CAAC,WAAW,CAAC,aAAa,CAAC;QAChD,MAAM,KAAK,GAAG,SAAS,CAAC,WAAW,CAAC,KAAK,CAAC;QAC1C,MAAM,UAAU,GAAG,KAAK,CAAC,SAAS,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,SAAS,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC;QAEnG,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC;QACpC,OAAO,CAAC,GAAG,CAAC,yBAAyB,CAAC,EAAE,CAAC,SAAS,CAAC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;QAC9E,OAAO,CAAC,GAAG,CAAC,yBAAyB,CAAC,GAAG,CAAC,SAAS,CAAC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;QAC/E,OAAO,CAAC,GAAG,CAAC,yBAAyB,UAAU,CAAC,GAAG,KAAK,CAAC,SAAS,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,KAAK,CAAC,SAAS,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QAE9H,IAAI,IAAI,EAAE,CAAC;YACT,MAAM,QAAQ,GAAG,IAAI,CAAC,SAAS,CAAC,WAAW,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,CAAC;YACtE,MAAM,QAAQ,GAAG,EAAE,CAAC,SAAS,CAAC,IAAI,CAAC;YACnC,MAAM,MAAM,GAAG,QAAQ,GAAG,QAAQ,CAAC;YACnC,MAAM,WAAW,GAAG,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC;YAClF,OAAO,CAAC,GAAG,CAAC,yBAAyB,WAAW,CAAC,GAAG,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,CA
AC,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,QAAQ,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YAEnJ,gCAAgC;YAChC,MAAM,aAAa,GAAG,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC;YACzC,MAAM,aAAa,GAAG,QAAQ,CAAC,MAAM,CAAC;YACtC,IAAI,aAAa,KAAK,aAAa,EAAE,CAAC;gBACpC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,6BAA6B,aAAa,MAAM,aAAa,SAAS,CAAC,CAAC,CAAC;YACjG,CAAC;QACH,CAAC;IACH,CAAC;CACF"}
@@ -18,6 +18,31 @@ async function runWithConcurrency(tasks, limit) {
18
18
  return results;
19
19
  }
20
20
  const MAX_CONCURRENCY = 10;
21
+ /**
22
+ * Average pass rates across multiple grading runs.
23
+ * Uses the last run's assertion_results for display, but averages the
24
+ * pass_rate across all runs so --runs N provides statistical significance.
25
+ */
26
+ function averageGradings(gradings) {
27
+ const valid = gradings.filter((g) => g !== null);
28
+ if (valid.length === 0)
29
+ return undefined;
30
+ if (valid.length === 1)
31
+ return valid[0];
32
+ const avgPassRate = valid.reduce((sum, g) => sum + g.summary.pass_rate, 0) / valid.length;
33
+ const avgPassed = valid.reduce((sum, g) => sum + g.summary.passed, 0) / valid.length;
34
+ const avgFailed = valid.reduce((sum, g) => sum + g.summary.failed, 0) / valid.length;
35
+ const last = valid[valid.length - 1];
36
+ return {
37
+ assertion_results: last.assertion_results,
38
+ summary: {
39
+ passed: Math.round(avgPassed),
40
+ failed: Math.round(avgFailed),
41
+ total: last.summary.total,
42
+ pass_rate: avgPassRate,
43
+ },
44
+ };
45
+ }
21
46
  function validateEvalsFile(evalsFile, evalsPath) {
22
47
  if (!evalsFile.skill_name || typeof evalsFile.skill_name !== 'string') {
23
48
  throw new SnapevalError(`Invalid evals.json at ${evalsPath}: missing or invalid "skill_name" field.`);
@@ -63,6 +88,9 @@ export async function evalCommand(skillPath, harness, inference, options) {
63
88
  }
64
89
  evalsFile = { ...evalsFile, evals: filtered };
65
90
  }
91
+ if (options.threshold !== undefined && (options.threshold < 0 || options.threshold > 1)) {
92
+ throw new SnapevalError(`Threshold must be between 0 and 1 (e.g., 0.8 for 80%). Got: ${options.threshold}`);
93
+ }
66
94
  const ws = new WorkspaceManager(skillPath, options.workspace);
67
95
  const iterationDir = ws.createIteration();
68
96
  // Track which SKILL.md was used for this iteration
@@ -95,20 +123,31 @@ export async function evalCommand(skillPath, harness, inference, options) {
95
123
  if (!lastRun) {
96
124
  throw new SnapevalError(`No runs completed for eval ${evalCase.id}`);
97
125
  }
98
- // Use the last run's grading as the primary result (written to grading.json)
99
- // but all gradings contribute to benchmark stats via pass rates
100
- const lastGrading = allGradings[allGradings.length - 1];
126
+ // Average pass rates across all runs for statistical significance
127
+ const withSkillGrading = averageGradings(allGradings.map(g => g.withSkill));
128
+ const withoutSkillGrading = averageGradings(allGradings.map(g => g.withoutSkill));
129
+ // When runs > 1, overwrite grading.json with averaged results so
130
+ // artifacts match the benchmark (not just the last run's raw data)
131
+ if (runs > 1) {
132
+ if (withSkillGrading) {
133
+ fs.writeFileSync(path.join(evalDir, 'with_skill', 'grading.json'), JSON.stringify(withSkillGrading, null, 2));
134
+ }
135
+ if (withoutSkillGrading) {
136
+ fs.writeFileSync(path.join(evalDir, baselineVariant, 'grading.json'), JSON.stringify(withoutSkillGrading, null, 2));
137
+ }
138
+ }
101
139
  return {
102
140
  evalId: evalCase.id,
103
141
  slug,
142
+ label: evalCase.label,
104
143
  prompt: evalCase.prompt,
105
144
  withSkill: {
106
145
  output: lastRun.withSkill.output,
107
- grading: lastGrading.withSkill ?? undefined,
146
+ grading: withSkillGrading,
108
147
  },
109
148
  withoutSkill: {
110
149
  output: lastRun.withoutSkill.output,
111
- grading: lastGrading.withoutSkill ?? undefined,
150
+ grading: withoutSkillGrading,
112
151
  },
113
152
  };
114
153
  });
@@ -121,10 +160,11 @@ export async function evalCommand(skillPath, harness, inference, options) {
121
160
  eval_count: evalRuns.length,
122
161
  eval_ids: evalRuns.map((r) => r.evalId),
123
162
  skill_name: evalsFile.skill_name,
163
+ runs_per_eval: runs,
124
164
  timestamp: new Date().toISOString(),
125
165
  },
126
166
  };
127
- fs.writeFileSync(path.join(iterationDir, 'benchmark.json'), JSON.stringify(benchmarkWithMeta, null, 2));
167
+ fs.writeFileSync(path.join(iterationDir, 'benchmark.json'), JSON.stringify(benchmarkWithMeta, (_key, value) => typeof value === 'number' ? Math.round(value * 10000) / 10000 : value, 2));
128
168
  // Check threshold if set (for CI gating)
129
169
  if (options.threshold !== undefined) {
130
170
  const passRate = benchmark.run_summary.with_skill.pass_rate.mean;
@@ -1 +1 @@
1
- {"version":3,"file":"eval.js","sourceRoot":"","sources":["../../../src/commands/eval.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AASlC,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,OAAO,EAAE,MAAM,qBAAqB,CAAC;AAC9C,OAAO,EAAE,eAAe,EAAE,MAAM,qBAAqB,CAAC;AACtD,OAAO,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAC;AAC3D,OAAO,EAAE,aAAa,EAAE,iBAAiB,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAEhF,KAAK,UAAU,kBAAkB,CAC/B,KAA2B,EAC3B,KAAa;IAEb,MAAM,OAAO,GAAQ,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;IAC7C,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,KAAK,UAAU,MAAM;QACnB,OAAO,KAAK,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YAC5B,MAAM,CAAC,GAAG,KAAK,EAAE,CAAC;YAClB,OAAO,CAAC,CAAC,CAAC,GAAG,MAAM,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC;QAChC,CAAC;IACH,CAAC;IACD,MAAM,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,CAAC,MAAM,CAAC,EAAE,EAAE,MAAM,CAAC,CAAC,CAAC;IACjF,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,eAAe,GAAG,EAAE,CAAC;AAE3B,SAAS,iBAAiB,CAAC,SAAoB,EAAE,SAAiB;IAChE,IAAI,CAAC,SAAS,CAAC,UAAU,IAAI,OAAO,SAAS,CAAC,UAAU,KAAK,QAAQ,EAAE,CAAC;QACtE,MAAM,IAAI,aAAa,CAAC,yBAAyB,SAAS,0CAA0C,CAAC,CAAC;IACxG,CAAC;IACD,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,KAAK,CAAC,EAAE,CAAC;QACpC,MAAM,IAAI,aAAa,CAAC,yBAAyB,SAAS,6BAA6B,CAAC,CAAC;IAC3F,CAAC;IACD,KAAK,MAAM,CAAC,CAAC,EAAE,QAAQ,CAAC,IAAI,SAAS,CAAC,KAAK,CAAC,OAAO,EAAE,EAAE,CAAC;QACtD,MAAM,MAAM,GAAG,yBAAyB,SAAS,WAAW,CAAC,GAAG,CAAC;QACjE,IAAI,OAAO,QAAQ,CAAC,EAAE,KAAK,QAAQ,EAAE,CAAC;YACpC,MAAM,IAAI,aAAa,CAAC,GAAG,MAAM,8CAA8C,CAAC,CAAC;QACnF,CAAC;QACD,IAAI,OAAO,QAAQ,CAAC,MAAM,KAAK,QAAQ,EAAE,CAAC;YACxC,MAAM,IAAI,aAAa,CAAC,GAAG,MAAM,QAAQ,QAAQ,CAAC,EAAE,2BAA2B,CAAC,CAAC;QACnF,CAAC;QACD,IAAI,OAAO,QAAQ,CAAC,eAAe,KAAK,QAAQ,EAAE,CAAC;YACjD,MAAM,IAAI,aAAa,CAAC,GAAG,MAAM,QAAQ,QAAQ,CAAC,EAAE,oCAAoC,CAAC,CAAC;QAC5F,CAAC;QACD,IAAI,QAAQ,CAAC,UAAU,KAAK,SAAS,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAC,EAAE,CAAC;YAC7E,MAAM,IAAI,aAAa,CAAC,GAAG,MAAM,QAAQ,QAAQ,CAAC,EAAE,6CAA6C,CAAC,CAAC;QACrG,CAAC;IACH,CAAC;AACH,CAAC;AAED,MAAM
,CAAC,KAAK,UAAU,WAAW,CAC/B,SAAiB,EACjB,OAAgB,EAChB,SAA2B,EAC3B,OAA4H;IAE5H,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,EAAE,YAAY,CAAC,CAAC;IAC9D,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE,CAAC;QAC9B,MAAM,IAAI,iBAAiB,CAAC,SAAS,EAAE,mDAAmD,CAAC,CAAC;IAC9F,CAAC;IAED,IAAI,SAAoB,CAAC;IACzB,IAAI,CAAC;QACH,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC,CAAC;IAC9D,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,aAAa,CAAC,mBAAmB,SAAS,mEAAmE,CAAC,CAAC;IAC3H,CAAC;IACD,iBAAiB,CAAC,SAAS,EAAE,SAAS,CAAC,CAAC;IAExC,oDAAoD;IACpD,IAAI,OAAO,CAAC,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5C,MAAM,GAAG,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QAClC,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;QAC9D,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC1B,MAAM,IAAI,aAAa,CAAC,8BAA8B,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,oBAAoB,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACjJ,CAAC;QACD,SAAS,GAAG,EAAE,GAAG,SAAS,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;IAChD,CAAC;IAED,MAAM,EAAE,GAAG,IAAI,gBAAgB,CAAC,SAAS,EAAE,OAAO,CAAC,SAAS,CAAC,CAAC;IAC9D,MAAM,YAAY,GAAG,EAAE,CAAC,eAAe,EAAE,CAAC;IAE1C,mDAAmD;IACnD,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,UAAU,CAAC,CAAC;IACrD,IAAI,EAAE,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;QAC/B,EAAE,CAAC,YAAY,CAAC,WAAW,EAAE,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,mBAAmB,CAAC,CAAC,CAAC;IAC7E,CAAC;IACD,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,IAAI,CAAC,CAAC;IAC/B,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,WAAW,IAAI,CAAC,EAAE,CAAC,CAAC,EAAE,eAAe,CAAC,CAAC;IACrF,MAAM,eAAe,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,eAAe,CAAC;IACzE,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IAE5D,8DAA8D;IAC9D,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,EAAE;QAChD,MAAM,IAAI,GAAG,gBAAgB,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;QACzE,OAAO,EAAE,QAAQ,EAAE,I
AAI,EAAE,OAAO,EAAE,EAAE,CAAC,aAAa,CAAC,YAAY,EAAE,IAAI,EAAE,eAAe,CAAC,EAAE,CAAC;IAC5F,CAAC,CAAC,CAAC;IAEH,MAAM,KAAK,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,CAAC,KAAK,IAA4B,EAAE;QAC7F,MAAM,UAAU,GAAG,QAAQ,CAAC,UAAU,IAAI,EAAE,CAAC;QAC7C,MAAM,WAAW,GAA8E,EAAE,CAAC;QAClG,IAAI,OAAO,GAA+C,IAAI,CAAC;QAE/D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC;YAC9B,OAAO,GAAG,MAAM,OAAO,CAAC,QAAQ,EAAE,SAAS,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;YAEjF,qCAAqC;YACrC,MAAM,CAAC,SAAS,EAAE,UAAU,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;gBAChD,eAAe,CACb,UAAU,EACV,OAAO,CAAC,SAAS,CAAC,MAAM,EACxB,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,YAAY,CAAC,EAChC,SAAS,EACT,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS,CACnD;gBACD,eAAe,CACb,UAAU,EACV,OAAO,CAAC,YAAY,CAAC,MAAM,EAC3B,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,eAAe,CAAC,EACnC,SAAS,EACT,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS,CACnD;aACF,CAAC,CAAC;YACH,WAAW,CAAC,IAAI,CAAC,EAAE,SAAS,EAAE,SAAS,EAAE,YAAY,EAAE,UAAU,EAAE,CAAC,CAAC;QACvE,CAAC;QAED,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,MAAM,IAAI,aAAa,CAAC,8BAA8B,QAAQ,CAAC,EAAE,EAAE,CAAC,CAAC;QACvE,CAAC;QAED,6EAA6E;QAC7E,gEAAgE;QAChE,MAAM,WAAW,GAAG,WAAW,CAAC,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QAExD,OAAO;YACL,MAAM,EAAE,QAAQ,CAAC,EAAE;YACnB,IAAI;YACJ,MAAM,EAAE,QAAQ,CAAC,MAAM;YACvB,SAAS,EAAE;gBACT,MAAM,EAAE,OAAO,CAAC,SAAS,CAAC,MAAM;gBAChC,OAAO,EAAE,WAAW,CAAC,SAAS,IAAI,SAAS;aAC5C;YACD,YAAY,EAAE;gBACZ,MAAM,EAAE,OAAO,CAAC,YAAY,CAAC,MAAM;gBACnC,OAAO,EAAE,WAAW,CAAC,YAAY,IAAI,SAAS;aAC/C;SACF,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,MAAM,QAAQ,GAAG,MAAM,kBAAkB,CAAC,KAAK,EAAE,WAAW,CAAC,CAAC;IAC9D,MAAM,SAAS,GAAG,gBAAgB,CAAC,QAAQ,CAAC,CAAC;IAE7C,wDAAwD;IACxD,MAAM,iBAAiB,GAAG;QACxB,GAAG,SAAS;QACZ,QAAQ,EAAE;YACR,UAAU,EAAE,QAAQ,CAAC,MAAM;YAC3B,QAAQ,EAAE,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;YACvC,UAAU,EAAE,SAAS,CAAC,UAAU;YAChC,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACpC;KACF,CAAC;IAEF,EAAE,CAAC,aAAa,CACd,IAAI,CAAC,IAAI,CA
AC,YAAY,EAAE,gBAAgB,CAAC,EACzC,IAAI,CAAC,SAAS,CAAC,iBAAiB,EAAE,IAAI,EAAE,CAAC,CAAC,CAC3C,CAAC;IAEF,yCAAyC;IACzC,IAAI,OAAO,CAAC,SAAS,KAAK,SAAS,EAAE,CAAC;QACpC,MAAM,QAAQ,GAAG,SAAS,CAAC,WAAW,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,CAAC;QACjE,IAAI,QAAQ,GAAG,OAAO,CAAC,SAAS,EAAE,CAAC;YACjC,yEAAyE;YACzE,MAAM,OAAO,GAAG,EAAE,SAAS,EAAE,SAAS,CAAC,UAAU,EAAE,QAAQ,EAAE,SAAS,EAAE,YAAY,EAAE,CAAC;YACvF,MAAM,MAAM,CAAC,MAAM,CAAC,IAAI,cAAc,CAAC,QAAQ,EAAE,OAAO,CAAC,SAAS,CAAC,EAAE,EAAE,OAAO,EAAE,CAAC,CAAC;QACpF,CAAC;IACH,CAAC;IAED,OAAO;QACL,SAAS,EAAE,SAAS,CAAC,UAAU;QAC/B,QAAQ;QACR,SAAS;QACT,YAAY;KACb,CAAC;AACJ,CAAC"}
1
+ {"version":3,"file":"eval.js","sourceRoot":"","sources":["../../../src/commands/eval.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AASlC,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,OAAO,EAAE,MAAM,qBAAqB,CAAC;AAC9C,OAAO,EAAE,eAAe,EAAE,MAAM,qBAAqB,CAAC;AACtD,OAAO,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAC;AAC3D,OAAO,EAAE,aAAa,EAAE,iBAAiB,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAEhF,KAAK,UAAU,kBAAkB,CAC/B,KAA2B,EAC3B,KAAa;IAEb,MAAM,OAAO,GAAQ,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;IAC7C,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,KAAK,UAAU,MAAM;QACnB,OAAO,KAAK,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YAC5B,MAAM,CAAC,GAAG,KAAK,EAAE,CAAC;YAClB,OAAO,CAAC,CAAC,CAAC,GAAG,MAAM,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC;QAChC,CAAC;IACH,CAAC;IACD,MAAM,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,CAAC,MAAM,CAAC,EAAE,EAAE,MAAM,CAAC,CAAC,CAAC;IACjF,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,eAAe,GAAG,EAAE,CAAC;AAE3B;;;;GAIG;AACH,SAAS,eAAe,CAAC,QAAkC;IACzD,MAAM,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAsB,EAAE,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC;IACrE,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,SAAS,CAAC;IACzC,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC,CAAC,CAAC,CAAC;IAExC,MAAM,WAAW,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,SAAS,EAAE,CAAC,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC;IAC1F,MAAM,SAAS,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC;IACrF,MAAM,SAAS,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC;IACrF,MAAM,IAAI,GAAG,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAErC,OAAO;QACL,iBAAiB,EAAE,IAAI,CAAC,iBAAiB;QACzC,OAAO,EAAE;YACP,MAAM,EAAE,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC;YAC7B,MAAM,EAAE,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC;YAC7B,KAAK,EAAE,IAAI,CAAC,OAAO,CAAC,KAAK;YACzB,SAAS,EAAE,WAAW;SACvB;KACF,CAAC;AACJ,CAAC;AAED,SAAS,iBAAiB,CAAC,SA
AoB,EAAE,SAAiB;IAChE,IAAI,CAAC,SAAS,CAAC,UAAU,IAAI,OAAO,SAAS,CAAC,UAAU,KAAK,QAAQ,EAAE,CAAC;QACtE,MAAM,IAAI,aAAa,CAAC,yBAAyB,SAAS,0CAA0C,CAAC,CAAC;IACxG,CAAC;IACD,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,KAAK,CAAC,EAAE,CAAC;QACpC,MAAM,IAAI,aAAa,CAAC,yBAAyB,SAAS,6BAA6B,CAAC,CAAC;IAC3F,CAAC;IACD,KAAK,MAAM,CAAC,CAAC,EAAE,QAAQ,CAAC,IAAI,SAAS,CAAC,KAAK,CAAC,OAAO,EAAE,EAAE,CAAC;QACtD,MAAM,MAAM,GAAG,yBAAyB,SAAS,WAAW,CAAC,GAAG,CAAC;QACjE,IAAI,OAAO,QAAQ,CAAC,EAAE,KAAK,QAAQ,EAAE,CAAC;YACpC,MAAM,IAAI,aAAa,CAAC,GAAG,MAAM,8CAA8C,CAAC,CAAC;QACnF,CAAC;QACD,IAAI,OAAO,QAAQ,CAAC,MAAM,KAAK,QAAQ,EAAE,CAAC;YACxC,MAAM,IAAI,aAAa,CAAC,GAAG,MAAM,QAAQ,QAAQ,CAAC,EAAE,2BAA2B,CAAC,CAAC;QACnF,CAAC;QACD,IAAI,OAAO,QAAQ,CAAC,eAAe,KAAK,QAAQ,EAAE,CAAC;YACjD,MAAM,IAAI,aAAa,CAAC,GAAG,MAAM,QAAQ,QAAQ,CAAC,EAAE,oCAAoC,CAAC,CAAC;QAC5F,CAAC;QACD,IAAI,QAAQ,CAAC,UAAU,KAAK,SAAS,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAC,EAAE,CAAC;YAC7E,MAAM,IAAI,aAAa,CAAC,GAAG,MAAM,QAAQ,QAAQ,CAAC,EAAE,6CAA6C,CAAC,CAAC;QACrG,CAAC;IACH,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,SAAiB,EACjB,OAAgB,EAChB,SAA2B,EAC3B,OAA4H;IAE5H,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,EAAE,YAAY,CAAC,CAAC;IAC9D,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE,CAAC;QAC9B,MAAM,IAAI,iBAAiB,CAAC,SAAS,EAAE,mDAAmD,CAAC,CAAC;IAC9F,CAAC;IAED,IAAI,SAAoB,CAAC;IACzB,IAAI,CAAC;QACH,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC,CAAC;IAC9D,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,aAAa,CAAC,mBAAmB,SAAS,mEAAmE,CAAC,CAAC;IAC3H,CAAC;IACD,iBAAiB,CAAC,SAAS,EAAE,SAAS,CAAC,CAAC;IAExC,oDAAoD;IACpD,IAAI,OAAO,CAAC,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5C,MAAM,GAAG,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QAClC,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;QAC9D,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC1B,MAAM,IAAI,aAAa,CAAC,8BAA8B,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,oBAAoB,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC
,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACjJ,CAAC;QACD,SAAS,GAAG,EAAE,GAAG,SAAS,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;IAChD,CAAC;IAED,IAAI,OAAO,CAAC,SAAS,KAAK,SAAS,IAAI,CAAC,OAAO,CAAC,SAAS,GAAG,CAAC,IAAI,OAAO,CAAC,SAAS,GAAG,CAAC,CAAC,EAAE,CAAC;QACxF,MAAM,IAAI,aAAa,CAAC,+DAA+D,OAAO,CAAC,SAAS,EAAE,CAAC,CAAC;IAC9G,CAAC;IAED,MAAM,EAAE,GAAG,IAAI,gBAAgB,CAAC,SAAS,EAAE,OAAO,CAAC,SAAS,CAAC,CAAC;IAC9D,MAAM,YAAY,GAAG,EAAE,CAAC,eAAe,EAAE,CAAC;IAE1C,mDAAmD;IACnD,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,UAAU,CAAC,CAAC;IACrD,IAAI,EAAE,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;QAC/B,EAAE,CAAC,YAAY,CAAC,WAAW,EAAE,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,mBAAmB,CAAC,CAAC,CAAC;IAC7E,CAAC;IACD,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,IAAI,CAAC,CAAC;IAC/B,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,WAAW,IAAI,CAAC,EAAE,CAAC,CAAC,EAAE,eAAe,CAAC,CAAC;IACrF,MAAM,eAAe,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,eAAe,CAAC;IACzE,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IAE5D,8DAA8D;IAC9D,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,EAAE;QAChD,MAAM,IAAI,GAAG,gBAAgB,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;QACzE,OAAO,EAAE,QAAQ,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE,CAAC,aAAa,CAAC,YAAY,EAAE,IAAI,EAAE,eAAe,CAAC,EAAE,CAAC;IAC5F,CAAC,CAAC,CAAC;IAEH,MAAM,KAAK,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,CAAC,KAAK,IAA4B,EAAE;QAC7F,MAAM,UAAU,GAAG,QAAQ,CAAC,UAAU,IAAI,EAAE,CAAC;QAC7C,MAAM,WAAW,GAA8E,EAAE,CAAC;QAClG,IAAI,OAAO,GAA+C,IAAI,CAAC;QAE/D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC;YAC9B,OAAO,GAAG,MAAM,OAAO,CAAC,QAAQ,EAAE,SAAS,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;YAEjF,qCAAqC;YACrC,MAAM,CAAC,SAAS,EAAE,UAAU,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;gBAChD,eAAe,CACb,UAAU,EACV,OAAO,CAAC,SAAS,CAAC,MAAM,EACxB,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,YAAY,CAAC,EAChC,SAAS,EACT,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS,CACnD;gBACD,eAAe,CACb,UAAU,EACV,O
AAO,CAAC,YAAY,CAAC,MAAM,EAC3B,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,eAAe,CAAC,EACnC,SAAS,EACT,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS,CACnD;aACF,CAAC,CAAC;YACH,WAAW,CAAC,IAAI,CAAC,EAAE,SAAS,EAAE,SAAS,EAAE,YAAY,EAAE,UAAU,EAAE,CAAC,CAAC;QACvE,CAAC;QAED,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,MAAM,IAAI,aAAa,CAAC,8BAA8B,QAAQ,CAAC,EAAE,EAAE,CAAC,CAAC;QACvE,CAAC;QAED,kEAAkE;QAClE,MAAM,gBAAgB,GAAG,eAAe,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC;QAC5E,MAAM,mBAAmB,GAAG,eAAe,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC;QAElF,iEAAiE;QACjE,mEAAmE;QACnE,IAAI,IAAI,GAAG,CAAC,EAAE,CAAC;YACb,IAAI,gBAAgB,EAAE,CAAC;gBACrB,EAAE,CAAC,aAAa,CACd,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,YAAY,EAAE,cAAc,CAAC,EAChD,IAAI,CAAC,SAAS,CAAC,gBAAgB,EAAE,IAAI,EAAE,CAAC,CAAC,CAC1C,CAAC;YACJ,CAAC;YACD,IAAI,mBAAmB,EAAE,CAAC;gBACxB,EAAE,CAAC,aAAa,CACd,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,eAAe,EAAE,cAAc,CAAC,EACnD,IAAI,CAAC,SAAS,CAAC,mBAAmB,EAAE,IAAI,EAAE,CAAC,CAAC,CAC7C,CAAC;YACJ,CAAC;QACH,CAAC;QAED,OAAO;YACL,MAAM,EAAE,QAAQ,CAAC,EAAE;YACnB,IAAI;YACJ,KAAK,EAAE,QAAQ,CAAC,KAAK;YACrB,MAAM,EAAE,QAAQ,CAAC,MAAM;YACvB,SAAS,EAAE;gBACT,MAAM,EAAE,OAAO,CAAC,SAAS,CAAC,MAAM;gBAChC,OAAO,EAAE,gBAAgB;aAC1B;YACD,YAAY,EAAE;gBACZ,MAAM,EAAE,OAAO,CAAC,YAAY,CAAC,MAAM;gBACnC,OAAO,EAAE,mBAAmB;aAC7B;SACF,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,MAAM,QAAQ,GAAG,MAAM,kBAAkB,CAAC,KAAK,EAAE,WAAW,CAAC,CAAC;IAC9D,MAAM,SAAS,GAAG,gBAAgB,CAAC,QAAQ,CAAC,CAAC;IAE7C,wDAAwD;IACxD,MAAM,iBAAiB,GAAG;QACxB,GAAG,SAAS;QACZ,QAAQ,EAAE;YACR,UAAU,EAAE,QAAQ,CAAC,MAAM;YAC3B,QAAQ,EAAE,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;YACvC,UAAU,EAAE,SAAS,CAAC,UAAU;YAChC,aAAa,EAAE,IAAI;YACnB,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACpC;KACF,CAAC;IAEF,EAAE,CAAC,aAAa,CACd,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,gBAAgB,CAAC,EACzC,IAAI,CAAC,SAAS,CAAC,iBAAiB,EAAE,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE,CAChD,OAAO,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,GAAG,KAAK,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,KAAK,EAAE,CAAC,
CAAC,CAC5E,CAAC;IAEF,yCAAyC;IACzC,IAAI,OAAO,CAAC,SAAS,KAAK,SAAS,EAAE,CAAC;QACpC,MAAM,QAAQ,GAAG,SAAS,CAAC,WAAW,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,CAAC;QACjE,IAAI,QAAQ,GAAG,OAAO,CAAC,SAAS,EAAE,CAAC;YACjC,yEAAyE;YACzE,MAAM,OAAO,GAAG,EAAE,SAAS,EAAE,SAAS,CAAC,UAAU,EAAE,QAAQ,EAAE,SAAS,EAAE,YAAY,EAAE,CAAC;YACvF,MAAM,MAAM,CAAC,MAAM,CAAC,IAAI,cAAc,CAAC,QAAQ,EAAE,OAAO,CAAC,SAAS,CAAC,EAAE,EAAE,OAAO,EAAE,CAAC,CAAC;QACpF,CAAC;IACH,CAAC;IAED,OAAO;QACL,SAAS,EAAE,SAAS,CAAC,UAAU;QAC/B,QAAQ;QACR,SAAS;QACT,YAAY;KACb,CAAC;AACJ,CAAC"}
@@ -50,7 +50,7 @@ function runScript(scriptName, outputDir, scriptsDir) {
50
50
  return { text: `script:${scriptName}`, passed: false, evidence: `Script not found: ${scriptPath}` };
51
51
  }
52
52
  try {
53
- const stdout = execFileSync(scriptPath, [outputDir], { encoding: 'utf-8', timeout: 30000 }).trim();
53
+ const stdout = execFileSync(scriptPath, [outputDir], { encoding: 'utf-8', timeout: 30000, stdio: ['pipe', 'pipe', 'pipe'] }).trim();
54
54
  const evidence = stdout || `Script passed: ${scriptName}`;
55
55
  return { text: `script:${scriptName}`, passed: true, evidence };
56
56
  }
@@ -1 +1 @@
1
- {"version":3,"file":"grader.js","sourceRoot":"","sources":["../../../src/engine/grader.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAQlD,MAAM,mBAAmB,GAAG,4CAA4C,CAAC;AAEzE,SAAS,eAAe,CAAC,SAAiB,EAAE,MAAc;IACxD,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,mBAAmB,CAAC,CAAC;IACnD,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IACxB,MAAM,QAAQ,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;IAC1B,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC;IAC7B,MAAM,MAAM,GAAG,MAAM,KAAK,QAAQ,CAAC;IACnC,OAAO;QACL,IAAI,EAAE,SAAS;QACf,MAAM;QACN,QAAQ,EAAE,MAAM;YACd,CAAC,CAAC,iBAAiB,QAAQ,GAAG;YAC9B,CAAC,CAAC,cAAc,QAAQ,YAAY,MAAM,GAAG;KAChD,CAAC;AACJ,CAAC;AAED,SAAS,kBAAkB,CAAC,UAAoB,EAAE,MAAc,EAAE,KAAe;IAC/E,MAAM,QAAQ,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,qBAAqB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IACjF,OAAO;;;;;;;;;;;;EAYP,MAAM;KACH,QAAQ;;;EAGX,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;;;;;;;EAOrD,CAAC;AACH,CAAC;AAED,SAAS,SAAS,CAChB,UAAkB,EAClB,SAAiB,EACjB,UAAkB;IAElB,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,UAAU,CAAC,CAAC;IACrD,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;QAC/B,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,qBAAqB,UAAU,EAAE,EAAE,CAAC;IACtG,CAAC;IACD,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,YAAY,CAAC,UAAU,EAAE,CAAC,SAAS,CAAC,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACnG,MAAM,QAAQ,GAAG,MAAM,IAAI,kBAAkB,UAAU,EAAE,CAAC;QAC1D,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;IAClE,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,8DAA8D;QAC9D,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC;QAClC,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC;QAClC,IAAI,QAAgB,CAAC;QACrB,IAAI,GAAG,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;YAC1B,QAAQ,GAAG,sBAAsB,UAAU,qCAAqC,UAAU,EAAE,CAAC;QAC/F,CAAC;aAAM,IAAI,MAAM,EAAE,CAAC;YAClB,gEAAgE;YAChE,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAA
C,CAAC,CAAC,CAAC,CAAC;QACnC,CAAC;aAAM,IAAI,MAAM,EAAE,CAAC;YAClB,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QACnC,CAAC;aAAM,CAAC;YACN,QAAQ,GAAG,2BAA2B,GAAG,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAClE,CAAC;QACD,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;IACnE,CAAC;AACH,CAAC;AAED,SAAS,WAAW,CAAC,IAAY;IAC/B,8DAA8D;IAC9D,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,yBAAyB,CAAC,CAAC;IACxD,IAAI,SAAS;QAAE,OAAO,SAAS,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IAC1C,gEAAgE;IAChE,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;IAC5B,IAAI,CAAC;QAAC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QAAC,OAAO,OAAO,CAAC;IAAC,CAAC;IAAC,MAAM,CAAC,CAAC,kBAAkB,CAAC,CAAC;IACzE,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;IACnD,IAAI,QAAQ;QAAE,OAAO,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACxC,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,UAAoB,EACpB,MAAwB,EACxB,MAAc,EACd,SAA2B,EAC3B,UAAmB;IAEnB,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAEzC,MAAM,gBAAgB,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC,CAAC;IACzE,MAAM,eAAe,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,mBAAmB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IACxG,MAAM,aAAa,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,CAAC,mBAAmB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IACvG,MAAM,OAAO,GAAsB,EAAE,CAAC;IAEtC,KAAK,MAAM,SAAS,IAAI,gBAAgB,EAAE,CAAC;QACzC,MAAM,UAAU,GAAG,SAAS,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QACrD,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;QAC/C,MAAM,GAAG,GAAG,UAAU,IAAI,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;QAClF,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,UAAU,EAAE,SAAS,EAAE,GAAG,CAAC,CAAC,CAAC;IACtD,CAAC;IAED,KAAK,MAAM,SAAS,IAAI,eAAe,EAAE,CAAC;QACxC,MAAM,MAAM,GAAG,eAAe,CAAC,SAAS,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC;QACtD,IAAI,MAAM;YAAE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACnC,CAAC;IAED,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC7B,MAAM,MAAM,GAAG
,kBAAkB,CAAC,aAAa,EAAE,MAAM,CAAC,GAAG,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC;QAC3E,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,IAAI,CACnC,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,EACnC,EAAE,WAAW,EAAE,CAAC,EAAE,cAAc,EAAE,MAAM,EAAE,CAC3C,CAAC;QACF,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC,CAAC;QACjD,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;YAC/B,OAAO,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,EAAE,QAAQ,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;QAClF,CAAC;IACH,CAAC;IAED,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IACpD,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IACrD,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC;IAE7B,MAAM,OAAO,GAAkB;QAC7B,iBAAiB,EAAE,OAAO;QAC1B,OAAO,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE;KAC9E,CAAC;IAEF,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,cAAc,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAEtF,OAAO,OAAO,CAAC;AACjB,CAAC"}
1
+ {"version":3,"file":"grader.js","sourceRoot":"","sources":["../../../src/engine/grader.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAQlD,MAAM,mBAAmB,GAAG,4CAA4C,CAAC;AAEzE,SAAS,eAAe,CAAC,SAAiB,EAAE,MAAc;IACxD,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,mBAAmB,CAAC,CAAC;IACnD,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IACxB,MAAM,QAAQ,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;IAC1B,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC;IAC7B,MAAM,MAAM,GAAG,MAAM,KAAK,QAAQ,CAAC;IACnC,OAAO;QACL,IAAI,EAAE,SAAS;QACf,MAAM;QACN,QAAQ,EAAE,MAAM;YACd,CAAC,CAAC,iBAAiB,QAAQ,GAAG;YAC9B,CAAC,CAAC,cAAc,QAAQ,YAAY,MAAM,GAAG;KAChD,CAAC;AACJ,CAAC;AAED,SAAS,kBAAkB,CAAC,UAAoB,EAAE,MAAc,EAAE,KAAe;IAC/E,MAAM,QAAQ,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,qBAAqB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IACjF,OAAO;;;;;;;;;;;;EAYP,MAAM;KACH,QAAQ;;;EAGX,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;;;;;;;EAOrD,CAAC;AACH,CAAC;AAED,SAAS,SAAS,CAChB,UAAkB,EAClB,SAAiB,EACjB,UAAkB;IAElB,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,UAAU,CAAC,CAAC;IACrD,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;QAC/B,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,qBAAqB,UAAU,EAAE,EAAE,CAAC;IACtG,CAAC;IACD,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,YAAY,CAAC,UAAU,EAAE,CAAC,SAAS,CAAC,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACpI,MAAM,QAAQ,GAAG,MAAM,IAAI,kBAAkB,UAAU,EAAE,CAAC;QAC1D,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;IAClE,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,8DAA8D;QAC9D,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC;QAClC,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC;QAClC,IAAI,QAAgB,CAAC;QACrB,IAAI,GAAG,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;YAC1B,QAAQ,GAAG,sBAAsB,UAAU,qCAAqC,UAAU,EAAE,CAAC;QAC/F,CAAC;aAAM,IAAI,MAAM,EAAE,CAAC;YAClB,g
EAAgE;YAChE,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QACnC,CAAC;aAAM,IAAI,MAAM,EAAE,CAAC;YAClB,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QACnC,CAAC;aAAM,CAAC;YACN,QAAQ,GAAG,2BAA2B,GAAG,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAClE,CAAC;QACD,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;IACnE,CAAC;AACH,CAAC;AAED,SAAS,WAAW,CAAC,IAAY;IAC/B,8DAA8D;IAC9D,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,yBAAyB,CAAC,CAAC;IACxD,IAAI,SAAS;QAAE,OAAO,SAAS,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IAC1C,gEAAgE;IAChE,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;IAC5B,IAAI,CAAC;QAAC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QAAC,OAAO,OAAO,CAAC;IAAC,CAAC;IAAC,MAAM,CAAC,CAAC,kBAAkB,CAAC,CAAC;IACzE,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;IACnD,IAAI,QAAQ;QAAE,OAAO,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACxC,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,UAAoB,EACpB,MAAwB,EACxB,MAAc,EACd,SAA2B,EAC3B,UAAmB;IAEnB,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAEzC,MAAM,gBAAgB,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC,CAAC;IACzE,MAAM,eAAe,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,mBAAmB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IACxG,MAAM,aAAa,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,CAAC,mBAAmB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IACvG,MAAM,OAAO,GAAsB,EAAE,CAAC;IAEtC,KAAK,MAAM,SAAS,IAAI,gBAAgB,EAAE,CAAC;QACzC,MAAM,UAAU,GAAG,SAAS,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QACrD,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;QAC/C,MAAM,GAAG,GAAG,UAAU,IAAI,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;QAClF,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,UAAU,EAAE,SAAS,EAAE,GAAG,CAAC,CAAC,CAAC;IACtD,CAAC;IAED,KAAK,MAAM,SAAS,IAAI,eAAe,EAAE,CAAC;QACxC,MAAM,MAAM,GAAG,eAAe,CAAC,SAAS,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC;QACtD,IAAI,MAAM;YAAE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACnC,CAAC;IAED,IAAI,aAAa,
CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC7B,MAAM,MAAM,GAAG,kBAAkB,CAAC,aAAa,EAAE,MAAM,CAAC,GAAG,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC;QAC3E,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,IAAI,CACnC,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,EACnC,EAAE,WAAW,EAAE,CAAC,EAAE,cAAc,EAAE,MAAM,EAAE,CAC3C,CAAC;QACF,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC,CAAC;QACjD,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;YAC/B,OAAO,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,EAAE,QAAQ,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;QAClF,CAAC;IACH,CAAC;IAED,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IACpD,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IACrD,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC;IAE7B,MAAM,OAAO,GAAkB;QAC7B,iBAAiB,EAAE,OAAO;QAC1B,OAAO,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE;KAC9E,CAAC;IAEF,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,cAAc,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAEtF,OAAO,OAAO,CAAC;AACjB,CAAC"}
@@ -2,6 +2,7 @@ import type { Harness, HarnessRunResult, EvalCase } from '../types.js';
2
2
  interface RunEvalResult {
3
3
  evalId: number;
4
4
  slug: string;
5
+ label?: string;
5
6
  prompt: string;
6
7
  withSkill: {
7
8
  output: HarnessRunResult;
@@ -35,6 +35,7 @@ export async function runEval(evalCase, skillPath, evalDir, harness, oldSkillPat
35
35
  return {
36
36
  evalId: evalCase.id,
37
37
  slug: evalCase.slug ?? `${evalCase.id}`,
38
+ label: evalCase.label,
38
39
  prompt: evalCase.prompt,
39
40
  withSkill: { output: withSkillResult },
40
41
  withoutSkill: { output: baselineResult },
@@ -1 +1 @@
1
- {"version":3,"file":"runner.js","sourceRoot":"","sources":["../../../src/engine/runner.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAWlC,SAAS,WAAW,CAAC,GAAW,EAAE,MAAwB;IACxD,MAAM,MAAM,GAAe,EAAE,YAAY,EAAE,MAAM,CAAC,YAAY,EAAE,WAAW,EAAE,MAAM,CAAC,WAAW,EAAE,CAAC;IAClG,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,aAAa,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;AACnF,CAAC;AAED,SAAS,WAAW,CAAC,GAAW,EAAE,MAAwB;IACxD,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,SAAS,EAAE,YAAY,CAAC,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC;IACtE,IAAI,MAAM,CAAC,UAAU,EAAE,CAAC;QACtB,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,gBAAgB,CAAC,EAAE,MAAM,CAAC,UAAU,CAAC,CAAC;IACxE,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,OAAO,CAC3B,QAAkB,EAClB,SAAiB,EACjB,OAAe,EACf,OAAgB,EAChB,YAAqB;IAErB,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,YAAY,CAAC,CAAC;IACtD,MAAM,eAAe,GAAG,YAAY,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,eAAe,CAAC;IACrE,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,eAAe,CAAC,CAAC;IAExD,MAAM,CAAC,eAAe,EAAE,cAAc,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;QAC1D,OAAO,CAAC,GAAG,CAAC;YACV,SAAS;YACT,MAAM,EAAE,QAAQ,CAAC,MAAM;YACvB,KAAK,EAAE,QAAQ,CAAC,KAAK;YACrB,SAAS,EAAE,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,SAAS,CAAC;SAC9C,CAAC;QACF,OAAO,CAAC,GAAG,CAAC;YACV,SAAS,EAAE,YAAY;YACvB,MAAM,EAAE,QAAQ,CAAC,MAAM;YACvB,KAAK,EAAE,QAAQ,CAAC,KAAK;YACrB,SAAS,EAAE,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,SAAS,CAAC;SAC7C,CAAC;KACH,CAAC,CAAC;IACH,WAAW,CAAC,YAAY,EAAE,eAAe,CAAC,CAAC;IAC3C,WAAW,CAAC,YAAY,EAAE,eAAe,CAAC,CAAC;IAC3C,WAAW,CAAC,WAAW,EAAE,cAAc,CAAC,CAAC;IACzC,WAAW,CAAC,WAAW,EAAE,cAAc,CAAC,CAAC;IAEzC,OAAO;QACL,MAAM,EAAE,QAAQ,CAAC,EAAE;QACnB,IAAI,EAAE,QAAQ,CAAC,IAAI,IAAI,GAAG,QAAQ,CAAC,EAAE,EAAE;QACvC,MAAM,EAAE,QAAQ,CAAC,MAAM;QACvB,SAAS,EAAE,EAAE,MAAM,EAAE,eAAe,EAAE;QACtC,YAAY,EAAE,EAAE,MAAM,EAAE,cAAc,EAAE;KACzC,CAAC;AACJ,CAAC"}
1
+ {"version":3,"file":"runner.js","sourceRoot":"","sources":["../../../src/engine/runner.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAYlC,SAAS,WAAW,CAAC,GAAW,EAAE,MAAwB;IACxD,MAAM,MAAM,GAAe,EAAE,YAAY,EAAE,MAAM,CAAC,YAAY,EAAE,WAAW,EAAE,MAAM,CAAC,WAAW,EAAE,CAAC;IAClG,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,aAAa,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;AACnF,CAAC;AAED,SAAS,WAAW,CAAC,GAAW,EAAE,MAAwB;IACxD,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,SAAS,EAAE,YAAY,CAAC,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC;IACtE,IAAI,MAAM,CAAC,UAAU,EAAE,CAAC;QACtB,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,gBAAgB,CAAC,EAAE,MAAM,CAAC,UAAU,CAAC,CAAC;IACxE,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,OAAO,CAC3B,QAAkB,EAClB,SAAiB,EACjB,OAAe,EACf,OAAgB,EAChB,YAAqB;IAErB,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,YAAY,CAAC,CAAC;IACtD,MAAM,eAAe,GAAG,YAAY,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,eAAe,CAAC;IACrE,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,eAAe,CAAC,CAAC;IAExD,MAAM,CAAC,eAAe,EAAE,cAAc,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;QAC1D,OAAO,CAAC,GAAG,CAAC;YACV,SAAS;YACT,MAAM,EAAE,QAAQ,CAAC,MAAM;YACvB,KAAK,EAAE,QAAQ,CAAC,KAAK;YACrB,SAAS,EAAE,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,SAAS,CAAC;SAC9C,CAAC;QACF,OAAO,CAAC,GAAG,CAAC;YACV,SAAS,EAAE,YAAY;YACvB,MAAM,EAAE,QAAQ,CAAC,MAAM;YACvB,KAAK,EAAE,QAAQ,CAAC,KAAK;YACrB,SAAS,EAAE,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,SAAS,CAAC;SAC7C,CAAC;KACH,CAAC,CAAC;IACH,WAAW,CAAC,YAAY,EAAE,eAAe,CAAC,CAAC;IAC3C,WAAW,CAAC,YAAY,EAAE,eAAe,CAAC,CAAC;IAC3C,WAAW,CAAC,WAAW,EAAE,cAAc,CAAC,CAAC;IACzC,WAAW,CAAC,WAAW,EAAE,cAAc,CAAC,CAAC;IAEzC,OAAO;QACL,MAAM,EAAE,QAAQ,CAAC,EAAE;QACnB,IAAI,EAAE,QAAQ,CAAC,IAAI,IAAI,GAAG,QAAQ,CAAC,EAAE,EAAE;QACvC,KAAK,EAAE,QAAQ,CAAC,KAAK;QACrB,MAAM,EAAE,QAAQ,CAAC,MAAM;QACvB,SAAS,EAAE,EAAE,MAAM,EAAE,eAAe,EAAE;QACtC,YAAY,EAAE,EAAE,MAAM,EAAE,cAAc,EAAE;KACzC,CAAC;AACJ,CAAC"}
@@ -32,6 +32,7 @@ export interface EvalCase {
32
32
  id: number;
33
33
  prompt: string;
34
34
  expected_output: string;
35
+ label?: string;
35
36
  slug?: string;
36
37
  files?: string[];
37
38
  assertions?: string[];
@@ -85,6 +86,7 @@ export interface FeedbackData {
85
86
  export interface EvalRunResult {
86
87
  evalId: number;
87
88
  slug: string;
89
+ label?: string;
88
90
  prompt: string;
89
91
  withSkill: {
90
92
  output: HarnessRunResult;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "snapeval",
3
- "version": "2.1.0",
3
+ "version": "2.1.1",
4
4
  "description": "Harness-agnostic eval runner for agentskills.io skills",
5
5
  "type": "module",
6
6
  "bin": {
package/plugin.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "snapeval",
3
- "version": "2.1.0",
3
+ "version": "2.1.1",
4
4
  "description": "Semantic snapshot testing for AI skills. Zero assertions. AI-driven. Free inference.",
5
5
  "author": "Matan Tsach",
6
6
  "license": "MIT",
@@ -3,7 +3,7 @@ name: snapeval
3
3
  description: Evaluate AI skills using the agentskills.io eval spec. Runs with/without skill comparisons, grades assertions, and computes benchmarks. Use when the user wants to evaluate, test, or review any skill — including phrases like "test my skill", "run evals", "evaluate this", "set up evals", or "how good is my skill."
4
4
  ---
5
5
 
6
- You are snapeval, a harness-agnostic eval runner for agentskills.io skills. You help developers evaluate AI skills by designing test scenarios, running with/without skill comparisons, grading assertions, and iterating on skill quality.
6
+ You are snapeval, a harness-agnostic eval runner for agentskills.io skills. You help developers evaluate AI skills by understanding what matters to them, designing targeted test scenarios, running with/without skill comparisons, and iterating on skill quality.
7
7
 
8
8
  ## Mode Detection
9
9
 
@@ -12,52 +12,88 @@ Before acting, determine the current state by checking files in the skill direct
12
12
  | State | Condition | Mode |
13
13
  |-------|-----------|------|
14
14
  | **Fresh** | No `evals/evals.json` | First Evaluation |
15
- | **Has evals, no workspace** | `evals/evals.json` exists but no workspace directory | Run First Eval |
16
- | **Has results** | Workspace with `iteration-N/` exists | Review or Re-eval |
15
+ | **Has evals, no workspace** | `evals/evals.json` exists but no workspace directory | Run Eval or Review (skip all interactive phases — go straight to running the command) |
16
+ | **Has results** | Workspace with `iteration-N/` exists | Re-eval or Review (skip all interactive phases) |
17
+
18
+ **Important:** The interactive phases (Discover, Analyze, Interview, Propose) only apply to the **First Evaluation** flow when no evals.json exists. When evals.json already exists, skip straight to running the `eval` or `review` command. If the user says "run", "just do it", or "without asking", always skip interactive phases.
17
19
 
18
20
  ## First Evaluation
19
21
 
20
- Triggered by: "evaluate", "test", "set up evals", "evaluate my skill"
22
+ Triggered by: "evaluate", "test", "set up evals", "evaluate my skill", "how good is my skill"
21
23
 
22
24
  ### Phase 1 — Discover
23
25
 
24
- 1. Ask the user which skill to evaluate (or accept the path they provide)
26
+ 1. Identify the skill to evaluate — accept the path the user provides, or infer it from context if they mention a skill name or directory
25
27
  2. Read the target skill's SKILL.md using the Read tool
26
28
  3. Summarize what the skill does in 1-2 sentences
27
29
  4. Confirm understanding: "This skill [summary]. Is that right?"
28
30
 
29
- ### Phase 2 — Analyze & Propose
31
+ **STOP. Do not proceed to Phase 2 until the user confirms your understanding is correct. Wait for the user to respond.**
32
+
33
+ ### Phase 2 — Deep Skill Analysis
34
+
35
+ Before asking the user anything, do your own homework. Study the skill thoroughly to map its surface area:
36
+
37
+ 1. **Re-read the SKILL.md carefully** — not just the summary, but every instruction, rule, format spec, and example
38
+ 2. **Map the behavior space** — identify every distinct thing the skill does (e.g., "generates commit messages", "handles empty diffs", "detects breaking changes")
39
+ 3. **Map the input space** — what kinds of inputs does it accept? What dimensions vary? (language, length, complexity, format, edge cases)
40
+ 4. **Identify implicit assumptions** — what does the skill assume about context, user intent, or environment that could break?
41
+ 5. **Spot gaps and ambiguities** — where are the instructions vague, contradictory, or silent? These are often where failures hide
42
+
43
+ Present this analysis to the user as a brief skill map:
44
+ > "I've analyzed your skill in depth. Here's what I see:
45
+ > - **N core behaviors**: [list them]
46
+ > - **N input dimensions**: [list them]
47
+ > - **N potential weak spots**: [list them — gaps, ambiguities, untested assumptions]"
48
+
49
+ ### Phase 3 — Interview
50
+
51
+ Now ask targeted questions to fill gaps your analysis couldn't answer. You've done the work — your questions should be specific and informed, not generic.
52
+
53
+ Ask 2-3 focused questions (one at a time) based on what you found in Phase 2. Examples:
54
+
55
+ - "Your skill says [X] but doesn't specify what happens when [Y]. What should it do?"
56
+ - "I see the skill handles [A] and [B] but doesn't mention [C]. Is that a case you care about?"
57
+ - "The output format section says [X]. In practice, do your users need exactly that, or is there flexibility?"
58
+ - "I noticed the skill doesn't address [edge case]. Has that come up, or is it not a concern?"
59
+
60
+ Ask ONE question at a time. Wait for the answer before asking the next one. Two to three questions is usually enough — don't turn this into an interrogation. If the user seems impatient or says "just test it", respect that and move to Phase 4 (Propose Scenarios) with reasonable defaults.
61
+
62
+ **STOP after each question. Wait for the user to respond before asking the next question or moving on.**
63
+
64
+ ### Phase 4 — Propose Scenarios
30
65
 
31
- 1. Decompose the skill into behaviors, input dimensions, and failure modes
32
- 2. Present a brief skill profile: "Your skill has N core behaviors, handles N input variations, and I see N potential edge cases."
33
- 3. Generate 5-8 test scenarios covering:
34
- - Happy path scenarios (normal use cases)
35
- - Edge cases (empty input, unusual input)
36
- - At least one negative test
37
- 4. Present scenarios as a numbered list. For each scenario show:
66
+ Using your analysis and the user's answers, generate 5-8 test scenarios tailored to what actually matters.
67
+
68
+ 1. Present a brief skill profile: "Based on what you told me, I'll focus on [key concerns]. Your skill has N core behaviors and I see N areas worth testing."
69
+ 2. Present scenarios as a numbered list. For each scenario show:
38
70
  - The prompt (realistic — messy, with typos, abbreviations, personal context)
39
- - What it tests
71
+ - What it tests and why (connected back to the user's answers)
40
72
  - Why it matters
41
- 5. Ask: "Want to adjust any of these, or should I run them?"
73
+ 3. Ask: "Want to adjust any of these, or should I run them?"
74
+
75
+ **STOP. Do not write evals.json or run any commands until the user approves the scenario list (or says "just run it", "looks good", "I trust you", etc). Wait for the user to respond.**
42
76
 
43
- ### Phase 3 — Handle Feedback
77
+ ### Phase 5 — Handle Feedback
44
78
 
45
79
  - If the user wants changes, adjust conversationally
46
80
  - "Drop 3, add one about empty input" → adjust the list and re-present
47
81
  - Loop until confirmed
48
- - If the user says "just run it" → skip to Phase 4 immediately
82
+ - If the user says "just run it", "looks good", "I trust you", or similar → skip to Phase 6 immediately
49
83
 
50
- ### Phase 4 — Write evals.json & First Eval
84
+ ### Phase 6 — Write evals.json & Run
51
85
 
52
- 1. Write the approved scenarios to `<skill-path>/evals/evals.json` with assertions derived from the "What it tests" analysis. Format:
86
+ 1. Write the approved scenarios to `<skill-path>/evals/evals.json`. Format:
53
87
  ```json
54
88
  {
55
89
  "skill_name": "<skill-name>",
56
90
  "evals": [
57
91
  {
58
92
  "id": 1,
93
+ "label": "short descriptive name",
59
94
  "slug": "kebab-case-slug",
60
95
  "prompt": "The realistic user prompt",
96
+ "expected_output": "Human description of expected behavior",
61
97
  "assertions": ["Assertion 1", "Assertion 2"],
62
98
  "files": []
63
99
  }
@@ -71,18 +107,47 @@ Triggered by: "evaluate", "test", "set up evals", "evaluate my skill"
71
107
  - Good: `"Response declines to scout because the pipeline already has unclaimed issues"`
72
108
  - Bad: `"Handles edge case properly"`
73
109
 
74
- Script assertions are also supported: prefix with `script:` (e.g. `"script:check-yaml.sh"`). Scripts live in `<skill-path>/evals/scripts/`, receive the output directory as their first argument, and pass on exit code 0.
110
+ **Prefer semantic assertions for first evaluations.** Script assertions (`script:check.sh`) are powerful but add setup complexity (permissions, paths). Only suggest script assertions when the user specifically needs programmatic validation or has existing scripts.
75
111
 
76
112
  2. Run: `npx snapeval eval <skill-path>` — runs each eval with and without the skill, grades assertions, produces grading.json + benchmark.json
77
- 3. Interpret the benchmark:
78
- > "With skill: X% pass rate. Without skill: Y% pass rate. Delta: +Z%. The skill adds value on [specific assertions]."
113
+
114
+ 3. Interpret the benchmark using these guidelines:
115
+
116
+ | Delta | Interpretation |
117
+ |-------|----------------|
118
+ | **+20% or more** | "Your skill adds significant value — it passes X% more assertions than raw AI." |
119
+ | **Above 0% but below +20%** | "Your skill helps, but the improvement is modest. Here's where it adds value: [specific assertions]." |
120
+ | **0%** | "Your skill isn't measurably helping on these tests. The raw AI handles them equally well. Consider making the skill more specific or testing different scenarios." |
121
+ | **Negative** | "Your skill is actually hurting performance on these tests. The raw AI does better without it. Check [specific failing assertions] — the skill may be adding noise or wrong instructions." |
122
+
123
+ ## Adding or Modifying Evals
124
+
125
+ When the user wants to add, edit, or remove specific eval cases (not regenerate from scratch):
126
+
127
+ 1. Read the existing `evals/evals.json`
128
+ 2. Make the requested change (add new eval, modify assertion, remove eval)
129
+ 3. Preserve all unchanged evals — never regenerate the full file. Never renumber existing eval IDs.
130
+ 4. For new evals, append with the next available ID (e.g., if max ID is 7, use 8)
131
+ 5. Run just the new/modified eval to verify it works: `npx snapeval eval <skill-path> --only <new-id>`
132
+
133
+ ## Re-eval After Skill Change
134
+
135
+ When the user has modified their SKILL.md and wants to see if results improved:
136
+
137
+ 1. Detect that `evals/evals.json` already exists — do NOT regenerate scenarios
138
+ 2. Run: `npx snapeval eval <skill-path>` — this creates the next iteration automatically
139
+ 3. Compare the new iteration with the previous one:
140
+ - Read both `benchmark.json` files
141
+ - Show per-eval pass rate changes
142
+ - Highlight which evals improved, which regressed, and which stayed the same
143
+ 4. Give a verdict: "Your changes improved X evals, regressed Y evals, net delta: +Z%"
79
144
 
80
145
  ## Review & Iterate
81
146
 
82
147
  Triggered by: "review", "show results", "how did it do"
83
148
 
84
149
  1. Run: `npx snapeval review <skill-path>` — runs eval + creates feedback.json template
85
- 2. Interpret results using the three signals from the spec:
150
+ 2. Interpret results using the three signals:
86
151
  - **Failed assertions** — specific gaps in the skill
87
152
  - **Human feedback** — broader quality issues (user fills in feedback.json)
88
153
  - **Benchmark delta** — where the skill adds value vs doesn't
@@ -92,7 +157,10 @@ Triggered by: "review", "show results", "how did it do"
92
157
  - **Always-fail assertions** — possibly broken, investigate
93
158
  - **Differentiating assertions** — pass with skill, fail without — this is where the skill shines
94
159
 
95
- 4. Suggest iteration: "Want to feed these signals to an LLM to propose SKILL.md improvements?"
160
+ 4. Suggest concrete improvement strategies:
161
+ - Add few-shot examples to SKILL.md for failing scenarios
162
+ - Strengthen format constraints if output structure is inconsistent
163
+ - Remove redundant or conflicting instructions
96
164
 
97
165
  ## Comparing Skill Versions
98
166
 
@@ -108,8 +176,14 @@ Never show raw stack traces. Translate errors into plain language with a suggest
108
176
  | Error | Response |
109
177
  |-------|----------|
110
178
  | No evals.json | "No test cases exist yet. Want me to design scenarios and create evals.json?" |
111
- | Inference unavailable | "I can't connect to the inference service. Check that Copilot CLI is authenticated (`copilot auth status`)." |
179
+ | Skill path doesn't exist | "I can't find a skill at that path. Check the directory exists and contains a SKILL.md." |
180
+ | Harness unavailable | "The eval harness isn't available. Make sure `@github/copilot-sdk` is installed (`npm install @github/copilot-sdk`), or try `--harness copilot-cli`." |
181
+ | Inference unavailable | "I can't connect to the inference service. Check that Copilot CLI is authenticated (`copilot auth status`) or set GITHUB_TOKEN." |
182
+ | Eval command crashes | "The eval run failed: `<error>`. This might be a config issue — check the error message and try again." |
112
183
  | Skill invocation failure | "The skill failed to respond to eval N: `<error>`. This might be a bug in the skill — want to skip this eval and continue?" |
184
+ | Invalid evals.json | "The evals.json file has a syntax error. Check for missing commas, trailing commas, or mismatched brackets." |
185
+
186
+ If the same command fails twice, do not retry blindly. Explain the issue and ask the user how to proceed.
113
187
 
114
188
  ## Rules
115
189
 
@@ -117,3 +191,7 @@ Never show raw stack traces. Translate errors into plain language with a suggest
117
191
  - Always read the target skill's SKILL.md before generating scenarios
118
192
  - Only reference CLI commands that exist: `eval`, `review`
119
193
  - Only reference CLI flags that exist: `--harness`, `--inference`, `--workspace`, `--runs`, `--concurrency`, `--only`, `--threshold`, `--old-skill`, `--no-open`, `--verbose`
194
+ - Use `--only <id>` to run specific eval IDs when the user wants to test a single eval (e.g., `--only 5` or `--only 1,3,7`)
195
+ - Use `--concurrency 5` for parallel execution when running multiple evals
196
+ - Use `--runs 3` when the user needs statistical confidence (averages pass rates across runs)
197
+ - Use `--threshold 0.8` for CI gating (exits with code 1 if the pass rate is below the threshold; the value must be between 0 and 1)
@@ -33,7 +33,9 @@ export async function getClient(): Promise<any> {
33
33
  );
34
34
  }
35
35
 
36
- clientInstance = new CopilotClient({ logLevel: 'none' });
36
+ // Suppress ExperimentalWarning (e.g., SQLite) in the spawned CLI subprocess
37
+ const env = { ...process.env, NODE_OPTIONS: [process.env.NODE_OPTIONS, '--no-warnings'].filter(Boolean).join(' ') };
38
+ clientInstance = new CopilotClient({ logLevel: 'none', env });
37
39
  await clientInstance.start();
38
40
  clientStarted = true;
39
41
  return clientInstance;
@@ -33,10 +33,9 @@ function loadPreviousIteration(iterationDir: string): PreviousIteration | null {
33
33
  }
34
34
  }
35
35
 
36
- function evalLabel(run: { evalId: number; slug: string; prompt: string }): string {
37
- // Use expected_output or slug as a readable label instead of truncated prompt
36
+ function evalLabel(run: { evalId: number; slug: string; label?: string; prompt: string }): string {
37
+ if (run.label) return run.label;
38
38
  if (run.slug && run.slug !== `${run.evalId}`) return run.slug;
39
- // Truncate prompt but show first meaningful line
40
39
  const firstLine = run.prompt.split('\n')[0].slice(0, 60);
41
40
  return firstLine;
42
41
  }
@@ -32,6 +32,32 @@ async function runWithConcurrency<T>(
32
32
 
33
33
  const MAX_CONCURRENCY = 10;
34
34
 
35
/**
 * Combine the gradings from repeated runs of one eval variant into a single
 * result.
 *
 * `null` entries (runs that produced no grading) are ignored. With no
 * gradings left the result is `undefined`; with exactly one, that grading is
 * returned unchanged. Otherwise `pass_rate` is the arithmetic mean over all
 * graded runs, so `--runs N` smooths out run-to-run variance, while
 * `assertion_results` and `total` are taken from the last graded run.
 *
 * @param gradings - One grading per run, or `null` where a run has none.
 * @returns The combined grading, or `undefined` when nothing was graded.
 */
function averageGradings(gradings: (GradingResult | null)[]): GradingResult | undefined {
  const valid = gradings.filter((g): g is GradingResult => g !== null);
  if (valid.length === 0) return undefined;
  if (valid.length === 1) return valid[0];

  const mean = (pick: (g: GradingResult) => number): number =>
    valid.reduce((sum, g) => sum + pick(g), 0) / valid.length;

  const last = valid[valid.length - 1];
  const total = last.summary.total;
  const passed = Math.round(mean((g) => g.summary.passed));
  // Math.round sends x.5 upward, so rounding both means independently can
  // report passed + failed === total + 1 (e.g. means of 1.5 and 1.5 with
  // total 3). Cap `failed` so the counts never exceed `total`.
  const failed = Math.min(
    Math.round(mean((g) => g.summary.failed)),
    Math.max(0, total - passed),
  );

  return {
    assertion_results: last.assertion_results,
    summary: {
      passed,
      failed,
      total,
      pass_rate: mean((g) => g.summary.pass_rate),
    },
  };
}
60
+
35
61
  function validateEvalsFile(evalsFile: EvalsFile, evalsPath: string): void {
36
62
  if (!evalsFile.skill_name || typeof evalsFile.skill_name !== 'string') {
37
63
  throw new SnapevalError(`Invalid evals.json at ${evalsPath}: missing or invalid "skill_name" field.`);
@@ -85,6 +111,10 @@ export async function evalCommand(
85
111
  evalsFile = { ...evalsFile, evals: filtered };
86
112
  }
87
113
 
114
+ if (options.threshold !== undefined && (options.threshold < 0 || options.threshold > 1)) {
115
+ throw new SnapevalError(`Threshold must be between 0 and 1 (e.g., 0.8 for 80%). Got: ${options.threshold}`);
116
+ }
117
+
88
118
  const ws = new WorkspaceManager(skillPath, options.workspace);
89
119
  const iterationDir = ws.createIteration();
90
120
 
@@ -136,21 +166,39 @@ export async function evalCommand(
136
166
  throw new SnapevalError(`No runs completed for eval ${evalCase.id}`);
137
167
  }
138
168
 
139
- // Use the last run's grading as the primary result (written to grading.json)
140
- // but all gradings contribute to benchmark stats via pass rates
141
- const lastGrading = allGradings[allGradings.length - 1];
169
+ // Average pass rates across all runs for statistical significance
170
+ const withSkillGrading = averageGradings(allGradings.map(g => g.withSkill));
171
+ const withoutSkillGrading = averageGradings(allGradings.map(g => g.withoutSkill));
172
+
173
+ // When runs > 1, overwrite grading.json with averaged results so
174
+ // artifacts match the benchmark (not just the last run's raw data)
175
+ if (runs > 1) {
176
+ if (withSkillGrading) {
177
+ fs.writeFileSync(
178
+ path.join(evalDir, 'with_skill', 'grading.json'),
179
+ JSON.stringify(withSkillGrading, null, 2),
180
+ );
181
+ }
182
+ if (withoutSkillGrading) {
183
+ fs.writeFileSync(
184
+ path.join(evalDir, baselineVariant, 'grading.json'),
185
+ JSON.stringify(withoutSkillGrading, null, 2),
186
+ );
187
+ }
188
+ }
142
189
 
143
190
  return {
144
191
  evalId: evalCase.id,
145
192
  slug,
193
+ label: evalCase.label,
146
194
  prompt: evalCase.prompt,
147
195
  withSkill: {
148
196
  output: lastRun.withSkill.output,
149
- grading: lastGrading.withSkill ?? undefined,
197
+ grading: withSkillGrading,
150
198
  },
151
199
  withoutSkill: {
152
200
  output: lastRun.withoutSkill.output,
153
- grading: lastGrading.withoutSkill ?? undefined,
201
+ grading: withoutSkillGrading,
154
202
  },
155
203
  };
156
204
  });
@@ -165,13 +213,15 @@ export async function evalCommand(
165
213
  eval_count: evalRuns.length,
166
214
  eval_ids: evalRuns.map((r) => r.evalId),
167
215
  skill_name: evalsFile.skill_name,
216
+ runs_per_eval: runs,
168
217
  timestamp: new Date().toISOString(),
169
218
  },
170
219
  };
171
220
 
172
221
  fs.writeFileSync(
173
222
  path.join(iterationDir, 'benchmark.json'),
174
- JSON.stringify(benchmarkWithMeta, null, 2)
223
+ JSON.stringify(benchmarkWithMeta, (_key, value) =>
224
+ typeof value === 'number' ? Math.round(value * 10000) / 10000 : value, 2)
175
225
  );
176
226
 
177
227
  // Check threshold if set (for CI gating)
@@ -63,7 +63,7 @@ function runScript(
63
63
  return { text: `script:${scriptName}`, passed: false, evidence: `Script not found: ${scriptPath}` };
64
64
  }
65
65
  try {
66
- const stdout = execFileSync(scriptPath, [outputDir], { encoding: 'utf-8', timeout: 30000 }).trim();
66
+ const stdout = execFileSync(scriptPath, [outputDir], { encoding: 'utf-8', timeout: 30000, stdio: ['pipe', 'pipe', 'pipe'] }).trim();
67
67
  const evidence = stdout || `Script passed: ${scriptName}`;
68
68
  return { text: `script:${scriptName}`, passed: true, evidence };
69
69
  } catch (err: any) {
@@ -5,6 +5,7 @@ import type { Harness, HarnessRunResult, EvalCase, TimingData } from '../types.j
5
5
  interface RunEvalResult {
6
6
  evalId: number;
7
7
  slug: string;
8
+ label?: string;
8
9
  prompt: string;
9
10
  withSkill: { output: HarnessRunResult };
10
11
  withoutSkill: { output: HarnessRunResult };
@@ -55,6 +56,7 @@ export async function runEval(
55
56
  return {
56
57
  evalId: evalCase.id,
57
58
  slug: evalCase.slug ?? `${evalCase.id}`,
59
+ label: evalCase.label,
58
60
  prompt: evalCase.prompt,
59
61
  withSkill: { output: withSkillResult },
60
62
  withoutSkill: { output: baselineResult },
package/src/types.ts CHANGED
@@ -43,6 +43,7 @@ export interface EvalCase {
43
43
  id: number;
44
44
  prompt: string;
45
45
  expected_output: string;
46
+ label?: string;
46
47
  slug?: string;
47
48
  files?: string[];
48
49
  assertions?: string[];
@@ -110,6 +111,7 @@ export interface FeedbackData {
110
111
  export interface EvalRunResult {
111
112
  evalId: number;
112
113
  slug: string;
114
+ label?: string;
113
115
  prompt: string;
114
116
  withSkill: {
115
117
  output: HarnessRunResult;