snapeval 2.1.0 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +144 -104
- package/bin/snapeval.ts +9 -0
- package/dist/bin/snapeval.js +8 -0
- package/dist/bin/snapeval.js.map +1 -1
- package/dist/src/adapters/copilot-sdk-client.js +3 -1
- package/dist/src/adapters/copilot-sdk-client.js.map +1 -1
- package/dist/src/adapters/report/terminal.js +2 -2
- package/dist/src/adapters/report/terminal.js.map +1 -1
- package/dist/src/commands/eval.js +46 -6
- package/dist/src/commands/eval.js.map +1 -1
- package/dist/src/engine/grader.js +1 -1
- package/dist/src/engine/grader.js.map +1 -1
- package/dist/src/engine/runner.d.ts +1 -0
- package/dist/src/engine/runner.js +1 -0
- package/dist/src/engine/runner.js.map +1 -1
- package/dist/src/types.d.ts +2 -0
- package/package.json +1 -1
- package/plugin.json +1 -1
- package/skills/snapeval/SKILL.md +103 -25
- package/src/adapters/copilot-sdk-client.ts +3 -1
- package/src/adapters/report/terminal.ts +2 -3
- package/src/commands/eval.ts +56 -6
- package/src/engine/grader.ts +1 -1
- package/src/engine/runner.ts +2 -0
- package/src/types.ts +2 -0
package/README.md
CHANGED
|
@@ -1,131 +1,178 @@
|
|
|
1
1
|
# snapeval
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Harness-agnostic eval runner for [agentskills.io](https://agentskills.io) skills.
|
|
4
4
|
|
|
5
5
|
[](https://github.com/matantsach/snapeval/actions/workflows/ci.yml)
|
|
6
6
|
[](https://www.npmjs.com/package/snapeval)
|
|
7
7
|
[](https://opensource.org/licenses/MIT)
|
|
8
8
|
|
|
9
|
-
snapeval
|
|
9
|
+
snapeval runs every eval case **with and without** your skill, grades assertions, and computes a benchmark delta — so you can see exactly what value your skill adds.
|
|
10
10
|
|
|
11
|
-
|
|
11
|
+
```
|
|
12
|
+
snapeval — greeter
|
|
13
|
+
Baseline = without SKILL.md (raw AI response)
|
|
14
|
+
────────────────────────────────────────────────────────────
|
|
15
|
+
#1 formal greeting for Eleanor
|
|
16
|
+
Skill: 100% | Baseline: 33% | 5.2s
|
|
17
|
+
#2 casual greeting for Marcus
|
|
18
|
+
Skill: 100% ↑ was 67% | Baseline: 67% | 2.7s
|
|
19
|
+
#3 pirate greeting for Zoe
|
|
20
|
+
Skill: 100% | Baseline: 67% | 2.5s
|
|
21
|
+
────────────────────────────────────────────────────────────
|
|
22
|
+
Summary:
|
|
23
|
+
Skill pass rate: 100.0%
|
|
24
|
+
Baseline pass rate: 55.6%
|
|
25
|
+
Improvement: +44.4%
|
|
26
|
+
```
|
|
12
27
|
|
|
13
|
-
|
|
14
|
-
- **Zero assertions** — No test logic to write. The AI generates realistic, messy prompts that mirror how real users actually type.
|
|
15
|
-
- **Semantic comparison** — Tiered pipeline: schema check (free) → LLM judge with order-swap debiasing (when needed). Most checks cost $0.
|
|
16
|
-
- **Free inference** — Uses gpt-5-mini via Copilot CLI and GitHub Models API.
|
|
17
|
-
- **Platform-agnostic** — Adapter-based architecture. Copilot CLI first, others coming.
|
|
28
|
+
## How it works
|
|
18
29
|
|
|
19
|
-
|
|
30
|
+
1. You write a `SKILL.md` and an `evals.json` with test cases and assertions
|
|
31
|
+
2. snapeval runs each eval **twice** — once with your skill loaded, once without (baseline)
|
|
32
|
+
3. Assertions are graded by an LLM judge (semantic) and/or shell scripts (deterministic)
|
|
33
|
+
4. A benchmark shows where your skill adds value vs. where the raw AI already handles it
|
|
20
34
|
|
|
21
|
-
|
|
35
|
+
## Quick start
|
|
22
36
|
|
|
23
|
-
|
|
37
|
+
### As a Copilot plugin
|
|
24
38
|
|
|
25
39
|
```bash
|
|
26
|
-
copilot plugin
|
|
27
|
-
copilot plugin install snapeval@snapeval-marketplace
|
|
40
|
+
copilot plugin install matantsach/snapeval
|
|
28
41
|
```
|
|
29
42
|
|
|
30
|
-
|
|
43
|
+
Then in Copilot CLI, just say `evaluate my skill` — the snapeval skill handles the rest.
|
|
44
|
+
|
|
45
|
+
### Standalone CLI
|
|
31
46
|
|
|
32
47
|
```bash
|
|
33
|
-
|
|
48
|
+
git clone https://github.com/matantsach/snapeval.git
|
|
49
|
+
cd snapeval && npm install
|
|
50
|
+
npx tsx bin/snapeval.ts eval <skill-dir>
|
|
34
51
|
```
|
|
35
52
|
|
|
36
|
-
|
|
53
|
+
## Eval format
|
|
37
54
|
|
|
38
|
-
```
|
|
39
|
-
|
|
55
|
+
```
|
|
56
|
+
my-skill/
|
|
57
|
+
├── SKILL.md
|
|
58
|
+
└── evals/
|
|
59
|
+
├── evals.json
|
|
60
|
+
└── scripts/ ← optional deterministic checks
|
|
61
|
+
└── validate.sh
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
**evals.json:**
|
|
65
|
+
|
|
66
|
+
```json
|
|
67
|
+
{
|
|
68
|
+
"skill_name": "greeter",
|
|
69
|
+
"evals": [
|
|
70
|
+
{
|
|
71
|
+
"id": 1,
|
|
72
|
+
"label": "formal greeting for Eleanor",
|
|
73
|
+
"prompt": "Can you give me a formal greeting for Eleanor?",
|
|
74
|
+
"expected_output": "Returns the formal greeting addressed to Eleanor.",
|
|
75
|
+
"assertions": [
|
|
76
|
+
"Output contains the name Eleanor",
|
|
77
|
+
"Output uses a formal tone",
|
|
78
|
+
"script:validate.sh"
|
|
79
|
+
]
|
|
80
|
+
}
|
|
81
|
+
]
|
|
82
|
+
}
|
|
40
83
|
```
|
|
41
84
|
|
|
42
|
-
|
|
85
|
+
| Field | Required | Description |
|
|
86
|
+
|-------|----------|-------------|
|
|
87
|
+
| `id` | yes | Unique numeric identifier |
|
|
88
|
+
| `prompt` | yes | The user prompt sent to the harness |
|
|
89
|
+
| `expected_output` | yes | Human description of the expected behavior |
|
|
90
|
+
| `label` | no | Human-readable name shown in terminal output |
|
|
91
|
+
| `slug` | no | Filesystem-safe name for the eval directory |
|
|
92
|
+
| `assertions` | no | List of assertions to grade (LLM semantic or `script:` prefixed) |
|
|
93
|
+
| `files` | no | Input files to attach to the prompt |
|
|
43
94
|
|
|
44
|
-
|
|
95
|
+
### Assertions
|
|
96
|
+
|
|
97
|
+
**Semantic** — graded by an LLM. Write specific, verifiable statements:
|
|
45
98
|
|
|
46
99
|
```
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
> check if I broke anything in my-skill
|
|
50
|
-
> approve scenario 3
|
|
100
|
+
"Output contains a YAML block with an 'id' field for each issue"
|
|
101
|
+
"Response declines because the pipeline already has unclaimed issues"
|
|
51
102
|
```
|
|
52
103
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
### What happens when you evaluate
|
|
104
|
+
**Script** — prefix with `script:`. Scripts live in `evals/scripts/`, receive the output directory as `$1`, and pass on exit code 0:
|
|
56
105
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
4. **Capture** — snapeval writes `evals.json` and runs the scenarios against your skill, saving baseline snapshots
|
|
106
|
+
```
|
|
107
|
+
"script:validate-json-structure.sh"
|
|
108
|
+
```
|
|
61
109
|
|
|
62
|
-
|
|
110
|
+
## CLI reference
|
|
63
111
|
|
|
64
|
-
|
|
112
|
+
### `eval`
|
|
65
113
|
|
|
66
|
-
|
|
114
|
+
Run evals, grade assertions, compute benchmark.
|
|
67
115
|
|
|
68
|
-
```
|
|
69
|
-
snapeval
|
|
70
|
-
snapeval capture [skill-dir] Run scenarios and save baseline snapshots
|
|
71
|
-
snapeval check [skill-dir] Compare current output against baselines
|
|
72
|
-
snapeval approve [skill-dir] Approve regressed scenarios as new baselines
|
|
73
|
-
snapeval report [skill-dir] Write results with optional HTML viewer
|
|
74
|
-
snapeval ideate [skill-dir] Open the interactive scenario ideation viewer
|
|
116
|
+
```bash
|
|
117
|
+
npx snapeval eval [skill-dir] [options]
|
|
75
118
|
```
|
|
76
119
|
|
|
77
120
|
| Flag | Description | Default |
|
|
78
121
|
|------|-------------|---------|
|
|
79
|
-
| `--
|
|
80
|
-
| `--inference <name>` | Inference adapter | `auto` |
|
|
81
|
-
| `--
|
|
82
|
-
| `--runs <n>` |
|
|
83
|
-
| `--
|
|
84
|
-
| `--
|
|
85
|
-
| `--
|
|
122
|
+
| `--harness <name>` | Harness adapter | `copilot-sdk` |
|
|
123
|
+
| `--inference <name>` | Inference adapter for grading | `auto` |
|
|
124
|
+
| `--workspace <path>` | Output directory | `../{skill_name}-workspace` |
|
|
125
|
+
| `--runs <n>` | Harness invocations per eval for statistical averaging | `1` |
|
|
126
|
+
| `--concurrency <n>` | Parallel eval cases (1-10) | `1` |
|
|
127
|
+
| `--only <ids>` | Run specific eval IDs (e.g. `--only 1,3,5`) | all |
|
|
128
|
+
| `--threshold <rate>` | Minimum pass rate 0-1 for exit code 0 | none |
|
|
129
|
+
| `--old-skill <path>` | Compare against old skill version | none |
|
|
86
130
|
| `--verbose` | Verbose output | off |
|
|
87
131
|
|
|
88
|
-
|
|
132
|
+
### `review`
|
|
89
133
|
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
↓
|
|
95
|
-
Schema match? → PASS (free, instant)
|
|
96
|
-
LLM Judge agrees? → PASS/REGRESSED
|
|
134
|
+
Run eval + generate HTML report + open in browser.
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
npx snapeval review [skill-dir] [options]
|
|
97
138
|
```
|
|
98
139
|
|
|
99
|
-
|
|
140
|
+
Same flags as `eval`, plus `--no-open` to skip opening the browser.
|
|
100
141
|
|
|
101
|
-
|
|
102
|
-
|------|--------|------|-----------|
|
|
103
|
-
| 1 | Schema check | Free | Structural skeleton matches |
|
|
104
|
-
| 2 | LLM judge (order-swap) | Cheap | Schema differs, needs semantic comparison |
|
|
142
|
+
### Exit codes
|
|
105
143
|
|
|
106
|
-
|
|
144
|
+
| Code | Meaning |
|
|
145
|
+
|------|---------|
|
|
146
|
+
| 0 | Success |
|
|
147
|
+
| 1 | Threshold not met (eval ran but pass rate below `--threshold`) |
|
|
148
|
+
| 2 | Config/input error (bad JSON, missing fields, invalid flags) |
|
|
149
|
+
| 3 | File not found (missing skill dir, evals.json, or script) |
|
|
150
|
+
| 4 | Runtime error (harness failure, grading failure, timeout) |
|
|
107
151
|
|
|
108
|
-
##
|
|
152
|
+
## Output artifacts
|
|
109
153
|
|
|
110
|
-
|
|
154
|
+
Each run creates an iteration directory:
|
|
111
155
|
|
|
112
156
|
```
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
├──
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
157
|
+
workspace/
|
|
158
|
+
└── iteration-1/
|
|
159
|
+
├── benchmark.json ← aggregate stats with delta
|
|
160
|
+
├── SKILL.md.snapshot ← copy of skill used
|
|
161
|
+
└── eval-{slug}/
|
|
162
|
+
├── with_skill/
|
|
163
|
+
│ ├── outputs/output.txt
|
|
164
|
+
│ ├── timing.json
|
|
165
|
+
│ ├── grading.json
|
|
166
|
+
│ └── transcript.log
|
|
167
|
+
└── without_skill/
|
|
168
|
+
├── outputs/output.txt
|
|
122
169
|
├── timing.json
|
|
123
|
-
└──
|
|
170
|
+
└── grading.json
|
|
124
171
|
```
|
|
125
172
|
|
|
126
|
-
|
|
173
|
+
**benchmark.json** includes metadata: `eval_count`, `eval_ids`, `skill_name`, `runs_per_eval`, `timestamp`.
|
|
127
174
|
|
|
128
|
-
|
|
175
|
+
## CI integration
|
|
129
176
|
|
|
130
177
|
```yaml
|
|
131
178
|
name: Skill Evaluation
|
|
@@ -140,22 +187,10 @@ jobs:
|
|
|
140
187
|
with:
|
|
141
188
|
node-version: 22
|
|
142
189
|
- run: npm ci
|
|
143
|
-
- run: npx snapeval
|
|
190
|
+
- run: npx snapeval eval skills/my-skill --threshold 0.8 --runs 3
|
|
144
191
|
```
|
|
145
192
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
```bash
|
|
149
|
-
git clone https://github.com/matantsach/snapeval.git
|
|
150
|
-
cd snapeval && npm install
|
|
151
|
-
npx tsx bin/snapeval.ts check <skill-path>
|
|
152
|
-
```
|
|
153
|
-
|
|
154
|
-
Or load as a local plugin:
|
|
155
|
-
|
|
156
|
-
```bash
|
|
157
|
-
copilot plugin install ./path/to/snapeval
|
|
158
|
-
```
|
|
193
|
+
Exit code 1 when pass rate falls below threshold — blocks the PR.
|
|
159
194
|
|
|
160
195
|
## Configuration
|
|
161
196
|
|
|
@@ -163,32 +198,37 @@ Create `snapeval.config.json` in your skill or project root:
|
|
|
163
198
|
|
|
164
199
|
```json
|
|
165
200
|
{
|
|
166
|
-
"
|
|
201
|
+
"harness": "copilot-sdk",
|
|
167
202
|
"inference": "auto",
|
|
168
|
-
"
|
|
169
|
-
"
|
|
203
|
+
"workspace": "../{skill_name}-workspace",
|
|
204
|
+
"runs": 1,
|
|
205
|
+
"concurrency": 1
|
|
170
206
|
}
|
|
171
207
|
```
|
|
172
208
|
|
|
173
|
-
|
|
209
|
+
Resolution order: defaults → project config → skill-dir config → CLI flags.
|
|
174
210
|
|
|
175
|
-
##
|
|
211
|
+
## Harness adapters
|
|
176
212
|
|
|
177
|
-
|
|
213
|
+
| Adapter | Description | Default |
|
|
214
|
+
|---------|-------------|---------|
|
|
215
|
+
| `copilot-sdk` | Programmatic via `@github/copilot-sdk` with native skill loading | yes |
|
|
216
|
+
| `copilot-cli` | Shells out to `copilot` CLI binary | no |
|
|
178
217
|
|
|
179
|
-
|
|
180
|
-
- **CLI** (`npx snapeval`) — Headless backend for CI and power users.
|
|
181
|
-
- **GitHub Action** — CI wrapper (planned).
|
|
218
|
+
The SDK harness loads skills natively via `skillDirectories`, captures full transcripts, and extracts real token counts from `assistant.usage` events.
|
|
182
219
|
|
|
183
|
-
|
|
220
|
+
## Inference adapters
|
|
184
221
|
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
222
|
+
| Adapter | Description |
|
|
223
|
+
|---------|-------------|
|
|
224
|
+
| `auto` | Copilot CLI if available, else GitHub Models API |
|
|
225
|
+
| `copilot` | Copilot CLI (`copilot` binary) |
|
|
226
|
+
| `copilot-sdk` | `@github/copilot-sdk` programmatic |
|
|
227
|
+
| `github-models` | GitHub Models API (requires `GITHUB_TOKEN`) |
|
|
188
228
|
|
|
189
229
|
## Contributing
|
|
190
230
|
|
|
191
|
-
See [CONTRIBUTING.md](CONTRIBUTING.md)
|
|
231
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
192
232
|
|
|
193
233
|
## License
|
|
194
234
|
|
package/bin/snapeval.ts
CHANGED
|
@@ -1,4 +1,13 @@
|
|
|
1
1
|
#!/usr/bin/env tsx
|
|
2
|
+
|
|
3
|
+
// Keep Node.js ExperimentalWarning notices (e.g., SQLite) out of the CLI output.
// Every other event is handed to the original emitter untouched.
const _origEmit = process.emit;
// @ts-ignore — the replacement intentionally has a looser signature than process.emit
process.emit = function (event: string, ...args: any[]) {
  const isExperimentalWarning =
    event === 'warning' && args[0]?.name === 'ExperimentalWarning';
  // Returning false reports "no listener handled this", so the warning is dropped.
  return isExperimentalWarning
    ? false
    : _origEmit.apply(process, [event, ...args] as any);
};
|
|
10
|
+
|
|
2
11
|
import { Command } from 'commander';
|
|
3
12
|
import { resolveConfig } from '../src/config.js';
|
|
4
13
|
import { resolveInference } from '../src/adapters/inference/resolve.js';
|
package/dist/bin/snapeval.js
CHANGED
|
@@ -1,4 +1,12 @@
|
|
|
1
1
|
#!/usr/bin/env tsx
|
|
2
|
+
// Suppress Node.js ExperimentalWarning (e.g., SQLite) from polluting output
|
|
3
|
+
const _origEmit = process.emit;
|
|
4
|
+
// @ts-ignore — override to filter warnings
|
|
5
|
+
process.emit = function (event, ...args) {
|
|
6
|
+
if (event === 'warning' && args[0]?.name === 'ExperimentalWarning')
|
|
7
|
+
return false;
|
|
8
|
+
return _origEmit.apply(process, [event, ...args]);
|
|
9
|
+
};
|
|
2
10
|
import { Command } from 'commander';
|
|
3
11
|
import { resolveConfig } from '../src/config.js';
|
|
4
12
|
import { resolveInference } from '../src/adapters/inference/resolve.js';
|
package/dist/bin/snapeval.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"snapeval.js","sourceRoot":"","sources":["../../bin/snapeval.ts"],"names":[],"mappings":";
|
|
1
|
+
{"version":3,"file":"snapeval.js","sourceRoot":"","sources":["../../bin/snapeval.ts"],"names":[],"mappings":";AAEA,4EAA4E;AAC5E,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC;AAC/B,2CAA2C;AAC3C,OAAO,CAAC,IAAI,GAAG,UAAU,KAAa,EAAE,GAAG,IAAW;IACpD,IAAI,KAAK,KAAK,SAAS,IAAI,IAAI,CAAC,CAAC,CAAC,EAAE,IAAI,KAAK,qBAAqB;QAAE,OAAO,KAAK,CAAC;IACjF,OAAO,SAAS,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,GAAG,IAAI,CAAQ,CAAC,CAAC;AAC3D,CAAC,CAAC;AAEF,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AACjD,OAAO,EAAE,gBAAgB,EAAE,MAAM,sCAAsC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,oCAAoC,CAAC;AACpE,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,MAAM,2BAA2B,CAAC;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,oCAAoC,CAAC;AACtE,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AACjD,OAAO,EAAE,UAAU,EAAE,MAAM,uCAAuC,CAAC;AACnE,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAElC,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,UAAU,CAAC;KAChB,WAAW,CAAC,wDAAwD,CAAC;KACrE,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB,eAAe;AACf,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,qEAAqE,CAAC;KAClF,MAAM,CAAC,qBAAqB,EAAE,gBAAgB,CAAC;KAC/C,MAAM,CAAC,yBAAyB,EAAE,0BAA0B,CAAC;KAC7D,MAAM,CAAC,oBAAoB,EAAE,qBAAqB,CAAC;KACnD,MAAM,CAAC,YAAY,EAAE,4CAA4C,EAAE,GAAG,CAAC;KACvE,MAAM,CAAC,mBAAmB,EAAE,gDAAgD,EAAE,GAAG,CAAC;KAClF,MAAM,CAAC,cAAc,EAAE,iEAAiE,CAAC;KACzF,MAAM,CAAC,oBAAoB,EAAE,6EAA6E,CAAC;KAC3G,MAAM,CAAC,oBAAoB,EAAE,uDAAuD,CAAC;KACrF,MAAM,CAAC,WAAW,EAAE,gBAAgB,CAAC;KACrC,QAAQ,CAAC,aAAa,EAAE,yBAAyB,EAAE,OAAO,CAAC,GAAG,EAAE,CAAC;KACjE,MAAM,CAAC,KAAK,EAAE,QAAgB,EAAE,IAAsC,EAAE,EAAE;IACzE,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;QACzC,MAAM,MAAM,GAAG,aAAa,CAC1B;YACE,OAAO,EAAE,IAAI,CAAC,OAAiB;YAC/B,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAc,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;YAC/D,WAAW,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAqB,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;SACrF,EACD,OAAO,CAAC,GAAG,EAAE,EAAE,SAAS,CACzB,CA
AC;QACF,MAAM,OAAO,GAAG,cAAc,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/C,MAAM,SAAS,GAAG,gBAAgB,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAErD,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI;YACpB,CAAC,CAAE,IAAI,CAAC,IAAe,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC;YACrE,CAAC,CAAC,SAAS,CAAC;QACd,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS;YAC9B,CAAC,CAAC,UAAU,CAAC,IAAI,CAAC,SAAmB,CAAC;YACtC,CAAC,CAAC,SAAS,CAAC;QAEd,MAAM,OAAO,GAAG,MAAM,WAAW,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,EAAE;YAC/D,SAAS,EAAE,MAAM,CAAC,SAAS;YAC3B,IAAI,EAAE,MAAM,CAAC,IAAI;YACjB,WAAW,EAAE,MAAM,CAAC,WAAW;YAC/B,IAAI;YACJ,SAAS;YACT,QAAQ,EAAE,IAAI,CAAC,QAA8B;SAC9C,CAAC,CAAC;QAEH,MAAM,QAAQ,GAAG,IAAI,gBAAgB,EAAE,CAAC;QACxC,MAAM,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/B,OAAO,CAAC,GAAG,CAAC,cAAc,OAAO,CAAC,YAAY,EAAE,CAAC,CAAC;QAClD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,iEAAiE;QACjE,IAAI,GAAG,CAAC,OAAO,EAAE,CAAC;YAChB,MAAM,QAAQ,GAAG,IAAI,gBAAgB,EAAE,CAAC;YACxC,MAAM,QAAQ,CAAC,MAAM,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;YACnC,OAAO,CAAC,GAAG,CAAC,cAAc,GAAG,CAAC,OAAO,CAAC,YAAY,EAAE,CAAC,CAAC;QACxD,CAAC;QACD,WAAW,CAAC,GAAG,CAAC,CAAC;IACnB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,iBAAiB;AACjB,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,mDAAmD,CAAC;KAChE,MAAM,CAAC,qBAAqB,EAAE,gBAAgB,CAAC;KAC/C,MAAM,CAAC,yBAAyB,EAAE,0BAA0B,CAAC;KAC7D,MAAM,CAAC,oBAAoB,EAAE,qBAAqB,CAAC;KACnD,MAAM,CAAC,YAAY,EAAE,4CAA4C,EAAE,GAAG,CAAC;KACvE,MAAM,CAAC,mBAAmB,EAAE,gDAAgD,EAAE,GAAG,CAAC;KAClF,MAAM,CAAC,oBAAoB,EAAE,uDAAuD,CAAC;KACrF,MAAM,CAAC,WAAW,EAAE,qBAAqB,CAAC;KAC1C,MAAM,CAAC,WAAW,EAAE,gBAAgB,CAAC;KACrC,QAAQ,CAAC,aAAa,EAAE,yBAAyB,EAAE,OAAO,CAAC,GAAG,EAAE,CAAC;KACjE,MAAM,CAAC,KAAK,EAAE,QAAgB,EAAE,IAAsC,EAAE,EAAE;IACzE,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;QACzC,MAAM,MAAM,GAAG,aAAa,CAC1B;YACE,OAAO,EAAE,IAAI,CAAC,OAAiB;YAC/B,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAc,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,
SAAS;YAC/D,WAAW,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAqB,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;SACrF,EACD,OAAO,CAAC,GAAG,EAAE,EAAE,SAAS,CACzB,CAAC;QACF,MAAM,OAAO,GAAG,cAAc,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/C,MAAM,SAAS,GAAG,gBAAgB,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAErD,MAAM,aAAa,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,EAAE;YACjD,SAAS,EAAE,MAAM,CAAC,SAAS;YAC3B,IAAI,EAAE,MAAM,CAAC,IAAI;YACjB,WAAW,EAAE,MAAM,CAAC,WAAW;YAC/B,QAAQ,EAAE,IAAI,CAAC,QAA8B;YAC7C,MAAM,EAAE,IAAI,CAAC,IAAI,KAAK,KAAK;SAC5B,CAAC,CAAC;QACH,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QAAC,WAAW,CAAC,GAAG,CAAC,CAAC;IAAC,CAAC;AACrC,CAAC,CAAC,CAAC;AAEL,uDAAuD;AACvD,OAAO,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,EAAE,GAAG,UAAU,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;AAE5D,SAAS,WAAW,CAAC,GAAY;IAC/B,IAAI,GAAG,YAAY,aAAa,EAAE,CAAC;QACjC,OAAO,CAAC,KAAK,CAAC,UAAU,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;QACvC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,IAAI,CAAC,CAAC,CAAC;IAClC,CAAC;IACD,IAAI,GAAG,YAAY,KAAK,EAAE,CAAC;QACzB,OAAO,CAAC,KAAK,CAAC,UAAU,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;QACvC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IACD,OAAO,CAAC,KAAK,CAAC,4BAA4B,CAAC,CAAC;IAC5C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC;AAED,OAAO,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC"}
|
|
@@ -25,7 +25,9 @@ export async function getClient() {
|
|
|
25
25
|
if (!CopilotClient) {
|
|
26
26
|
throw new Error('Could not find CopilotClient export in @github/copilot-sdk. The package may have changed its API.');
|
|
27
27
|
}
|
|
28
|
-
|
|
28
|
+
// Suppress ExperimentalWarning (e.g., SQLite) in the spawned CLI subprocess
|
|
29
|
+
const env = { ...process.env, NODE_OPTIONS: [process.env.NODE_OPTIONS, '--no-warnings'].filter(Boolean).join(' ') };
|
|
30
|
+
clientInstance = new CopilotClient({ logLevel: 'none', env });
|
|
29
31
|
await clientInstance.start();
|
|
30
32
|
clientStarted = true;
|
|
31
33
|
return clientInstance;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"copilot-sdk-client.js","sourceRoot":"","sources":["../../../src/adapters/copilot-sdk-client.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAElC,iEAAiE;AACjE,4DAA4D;AAC5D,IAAI,cAAc,GAAQ,IAAI,CAAC;AAC/B,IAAI,aAAa,GAAG,KAAK,CAAC;AAE1B,MAAM,CAAC,KAAK,UAAU,SAAS;IAC7B,IAAI,cAAc,IAAI,aAAa;QAAE,OAAO,cAAc,CAAC;IAE3D,IAAI,GAAQ,CAAC;IACb,IAAI,CAAC;QACH,+DAA+D;QAC/D,GAAG,GAAG,MAAM,MAAM,CAAC,qBAAqB,CAAC,CAAC;IAC5C,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CACb,mGAAmG,CACpG,CAAC;IACJ,CAAC;IAED,MAAM,aAAa,GAAG,GAAG,CAAC,aAAa,IAAI,GAAG,CAAC,OAAO,EAAE,aAAa,CAAC;IACtE,IAAI,CAAC,aAAa,EAAE,CAAC;QACnB,MAAM,IAAI,KAAK,CACb,mGAAmG,CACpG,CAAC;IACJ,CAAC;IAED,cAAc,GAAG,IAAI,aAAa,CAAC,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC,CAAC;
|
|
1
|
+
{"version":3,"file":"copilot-sdk-client.js","sourceRoot":"","sources":["../../../src/adapters/copilot-sdk-client.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAElC,iEAAiE;AACjE,4DAA4D;AAC5D,IAAI,cAAc,GAAQ,IAAI,CAAC;AAC/B,IAAI,aAAa,GAAG,KAAK,CAAC;AAE1B,MAAM,CAAC,KAAK,UAAU,SAAS;IAC7B,IAAI,cAAc,IAAI,aAAa;QAAE,OAAO,cAAc,CAAC;IAE3D,IAAI,GAAQ,CAAC;IACb,IAAI,CAAC;QACH,+DAA+D;QAC/D,GAAG,GAAG,MAAM,MAAM,CAAC,qBAAqB,CAAC,CAAC;IAC5C,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CACb,mGAAmG,CACpG,CAAC;IACJ,CAAC;IAED,MAAM,aAAa,GAAG,GAAG,CAAC,aAAa,IAAI,GAAG,CAAC,OAAO,EAAE,aAAa,CAAC;IACtE,IAAI,CAAC,aAAa,EAAE,CAAC;QACnB,MAAM,IAAI,KAAK,CACb,mGAAmG,CACpG,CAAC;IACJ,CAAC;IAED,4EAA4E;IAC5E,MAAM,GAAG,GAAG,EAAE,GAAG,OAAO,CAAC,GAAG,EAAE,YAAY,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,YAAY,EAAE,eAAe,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;IACpH,cAAc,GAAG,IAAI,aAAa,CAAC,EAAE,QAAQ,EAAE,MAAM,EAAE,GAAG,EAAE,CAAC,CAAC;IAC9D,MAAM,cAAc,CAAC,KAAK,EAAE,CAAC;IAC7B,aAAa,GAAG,IAAI,CAAC;IACrB,OAAO,cAAc,CAAC;AACxB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,UAAU;IAC9B,IAAI,cAAc,IAAI,aAAa,EAAE,CAAC;QACpC,MAAM,cAAc,CAAC,IAAI,EAAE,CAAC;QAC5B,aAAa,GAAG,KAAK,CAAC;QACtB,cAAc,GAAG,IAAI,CAAC;IACxB,CAAC;AACH,CAAC;AAED,MAAM,UAAU,cAAc;IAC5B,iEAAiE;IACjE,mEAAmE;IACnE,IAAI,GAAG,GAAG,OAAO,CAAC,GAAG,EAAE,CAAC;IACxB,OAAO,IAAI,EAAE,CAAC;QACZ,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,cAAc,EAAE,SAAS,EAAE,aAAa,EAAE,cAAc,CAAC,CAAC;QAC3F,IAAI,EAAE,CAAC,UAAU,CAAC,SAAS,CAAC;YAAE,OAAO,IAAI,CAAC;QAC1C,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QACjC,IAAI,MAAM,KAAK,GAAG;YAAE,MAAM;QAC1B,GAAG,GAAG,MAAM,CAAC;IACf,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC"}
|
|
@@ -29,10 +29,10 @@ function loadPreviousIteration(iterationDir) {
|
|
|
29
29
|
}
|
|
30
30
|
}
|
|
31
31
|
/**
 * Choose the text shown for an eval run.
 *
 * Preference order:
 *   1. `run.label` when it is set;
 *   2. `run.slug`, unless it is just the stringified `run.evalId`;
 *   3. the first non-blank line of `run.prompt`, cut to 60 characters.
 *
 * @param run - object carrying optional `label` / `slug`, plus `evalId` and `prompt`.
 * @returns the chosen label; an empty string when no label, slug, or prompt text exists.
 */
function evalLabel(run) {
    if (run.label)
        return run.label;
    if (run.slug && run.slug !== `${run.evalId}`)
        return run.slug;
    // Prompts can open with blank lines or use CRLF endings. Taking line 0 blindly
    // would yield an empty label or a stray '\r', and a missing prompt would throw.
    const lines = String(run.prompt ?? '').split(/\r?\n/);
    const firstLine = lines.find((line) => line.trim() !== '') ?? '';
    return firstLine.slice(0, 60);
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"terminal.js","sourceRoot":"","sources":["../../../../src/adapters/report/terminal.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,KAAK,MAAM,OAAO,CAAC;AAQ1B,SAAS,qBAAqB,CAAC,YAAoB;IACjD,MAAM,YAAY,GAAG,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC;IAChD,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAC;IAChD,MAAM,UAAU,GAAG,QAAQ,CAAC,WAAW,CAAC,OAAO,CAAC,YAAY,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;IACvE,IAAI,KAAK,CAAC,UAAU,CAAC,IAAI,UAAU,IAAI,CAAC;QAAE,OAAO,IAAI,CAAC;IACtD,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,aAAa,UAAU,GAAG,CAAC,EAAE,CAAC,CAAC;IACvE,MAAM,iBAAiB,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,gBAAgB,CAAC,CAAC;IAC/D,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,iBAAiB,CAAC;QAAE,OAAO,IAAI,CAAC;IACnD,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,iBAAiB,EAAE,OAAO,CAAC,CAAC,CAAC;QAC1E,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAuE,CAAC;QAChG,MAAM,QAAQ,GAAG,EAAE,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC;QAC5E,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAC/B,MAAM,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,cAAc,CAAC,CAAC;YACzE,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,eAAe,EAAE,cAAc,CAAC,CAAC;YAC7E,MAAM,EAAE,GAAG,EAAE,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;YAC5F,MAAM,GAAG,GAAG,EAAE,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;YAC/F,QAAQ,CAAC,GAAG,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,YAAY,EAAE,GAAG,EAAE,CAAC,CAAC;QAC9D,CAAC;QACD,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,CAAC;IACjC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,SAAS,SAAS,CAAC,
|
|
1
|
+
{"version":3,"file":"terminal.js","sourceRoot":"","sources":["../../../../src/adapters/report/terminal.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,KAAK,MAAM,OAAO,CAAC;AAQ1B,SAAS,qBAAqB,CAAC,YAAoB;IACjD,MAAM,YAAY,GAAG,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC;IAChD,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAC;IAChD,MAAM,UAAU,GAAG,QAAQ,CAAC,WAAW,CAAC,OAAO,CAAC,YAAY,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;IACvE,IAAI,KAAK,CAAC,UAAU,CAAC,IAAI,UAAU,IAAI,CAAC;QAAE,OAAO,IAAI,CAAC;IACtD,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,aAAa,UAAU,GAAG,CAAC,EAAE,CAAC,CAAC;IACvE,MAAM,iBAAiB,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,gBAAgB,CAAC,CAAC;IAC/D,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,iBAAiB,CAAC;QAAE,OAAO,IAAI,CAAC;IACnD,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,iBAAiB,EAAE,OAAO,CAAC,CAAC,CAAC;QAC1E,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAuE,CAAC;QAChG,MAAM,QAAQ,GAAG,EAAE,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC;QAC5E,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAC/B,MAAM,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,cAAc,CAAC,CAAC;YACzE,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,eAAe,EAAE,cAAc,CAAC,CAAC;YAC7E,MAAM,EAAE,GAAG,EAAE,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;YAC5F,MAAM,GAAG,GAAG,EAAE,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;YAC/F,QAAQ,CAAC,GAAG,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,YAAY,EAAE,GAAG,EAAE,CAAC,CAAC;QAC9D,CAAC;QACD,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,CAAC;IACjC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,SAAS,SAAS,CAAC,GAAqE;IACtF,IAAI,GAAG,CAAC,KAAK;QAAE,OAAO,GAAG,CAAC,KAAK,CAAC;IAChC,IAAI,GAAG,CAAC,IAAI,IAAI,GAAG,CAAC,IAAI,KAAK,GAAG,GAAG,CAAC,MAAM,EAAE;QAAE,OAAO,GAAG,CAAC,IAAI,CAAC;IAC9D,MAAM,SAAS,GAAG,GAAG,CAAC,MAAM,CAAC,KA
AK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IACzD,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,MAAM,OAAO,gBAAgB;IAClB,IAAI,GAAG,UAAU,CAAC;IAE3B,KAAK,CAAC,MAAM,CAAC,OAAoB;QAC/B,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS,EAAE,GAAG,OAAO,CAAC;QAEnD,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,gBAAgB,SAAS,EAAE,CAAC,CAAC,CAAC;QACrD,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,+CAA+C,CAAC,CAAC,CAAC;QACxE,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAEvC,MAAM,IAAI,GAAG,qBAAqB,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC;QAEzD,KAAK,MAAM,GAAG,IAAI,QAAQ,EAAE,CAAC;YAC3B,MAAM,SAAS,GAAG,GAAG,CAAC,SAAS,CAAC,OAAO,CAAC;YACxC,MAAM,MAAM,GAAG,SAAS,EAAE,OAAO,CAAC,SAAS,CAAC;YAC5C,MAAM,OAAO,GAAG,GAAG,CAAC,YAAY,CAAC,OAAO,EAAE,OAAO,CAAC,SAAS,CAAC;YAC5D,MAAM,OAAO,GAAG,MAAM,KAAK,SAAS,CAAC,CAAC,CAAC,GAAG,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC;YAC/E,MAAM,QAAQ,GAAG,OAAO,KAAK,SAAS,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC;YAClF,MAAM,OAAO,GAAG,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC;YACrF,MAAM,SAAS,GAAG,CAAC,GAAG,CAAC,SAAS,CAAC,MAAM,CAAC,WAAW,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;YAEvE,8CAA8C;YAC9C,IAAI,YAAY,GAAG,EAAE,CAAC;YACtB,IAAI,IAAI,EAAE,CAAC;gBACT,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,QAAQ,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;gBAC1D,MAAM,QAAQ,GAAG,WAAW,EAAE,SAAS,EAAE,OAAO,CAAC,SAAS,CAAC;gBAC3D,IAAI,QAAQ,KAAK,SAAS,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;oBACnD,MAAM,MAAM,GAAG,MAAM,GAAG,QAAQ,CAAC;oBACjC,IAAI,MAAM,KAAK,CAAC,EAAE,CAAC;wBACjB,MAAM,KAAK,GAAG,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;wBAC7D,YAAY,GAAG,IAAI,KAAK,QAAQ,CAAC,QAAQ,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC;oBACjE,CAAC;gBACH,CAAC;YACH,CAAC;YAED,OAAO,CAAC,GAAG,CAAC,KAAK,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,MAAM,EAAE,CAAC,IAAI,SAAS,CAAC,
GAAG,CAAC,EAAE,CAAC,CAAC;YACnE,OAAO,CAAC,GAAG,CAAC,cAAc,OAAO,CAAC,OAAO,CAAC,GAAG,YAAY,gBAAgB,QAAQ,MAAM,SAAS,GAAG,CAAC,CAAC;YAErG,gCAAgC;YAChC,IAAI,SAAS,EAAE,CAAC;gBACd,MAAM,MAAM,GAAG,SAAS,CAAC,iBAAiB,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;gBACpE,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;oBACvB,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;oBAC9C,IAAI,CAAC,CAAC,QAAQ,EAAE,CAAC;wBACf,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;oBAClE,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAEvC,MAAM,EAAE,GAAG,SAAS,CAAC,WAAW,CAAC,UAAU,CAAC;QAC5C,MAAM,GAAG,GAAG,SAAS,CAAC,WAAW,CAAC,aAAa,CAAC;QAChD,MAAM,KAAK,GAAG,SAAS,CAAC,WAAW,CAAC,KAAK,CAAC;QAC1C,MAAM,UAAU,GAAG,KAAK,CAAC,SAAS,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,SAAS,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC;QAEnG,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC;QACpC,OAAO,CAAC,GAAG,CAAC,yBAAyB,CAAC,EAAE,CAAC,SAAS,CAAC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;QAC9E,OAAO,CAAC,GAAG,CAAC,yBAAyB,CAAC,GAAG,CAAC,SAAS,CAAC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;QAC/E,OAAO,CAAC,GAAG,CAAC,yBAAyB,UAAU,CAAC,GAAG,KAAK,CAAC,SAAS,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,KAAK,CAAC,SAAS,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QAE9H,IAAI,IAAI,EAAE,CAAC;YACT,MAAM,QAAQ,GAAG,IAAI,CAAC,SAAS,CAAC,WAAW,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,CAAC;YACtE,MAAM,QAAQ,GAAG,EAAE,CAAC,SAAS,CAAC,IAAI,CAAC;YACnC,MAAM,MAAM,GAAG,QAAQ,GAAG,QAAQ,CAAC;YACnC,MAAM,WAAW,GAAG,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC;YAClF,OAAO,CAAC,GAAG,CAAC,yBAAyB,WAAW,CAAC,GAAG,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC
,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,QAAQ,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YAEnJ,gCAAgC;YAChC,MAAM,aAAa,GAAG,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC;YACzC,MAAM,aAAa,GAAG,QAAQ,CAAC,MAAM,CAAC;YACtC,IAAI,aAAa,KAAK,aAAa,EAAE,CAAC;gBACpC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,6BAA6B,aAAa,MAAM,aAAa,SAAS,CAAC,CAAC,CAAC;YACjG,CAAC;QACH,CAAC;IACH,CAAC;CACF"}
|
|
@@ -18,6 +18,31 @@ async function runWithConcurrency(tasks, limit) {
|
|
|
18
18
|
return results;
|
|
19
19
|
}
|
|
20
20
|
const MAX_CONCURRENCY = 10;
|
|
21
|
+
/**
 * Combine the gradings from repeated runs of one eval variant into one grading.
 *
 * Missing entries (null or undefined) are ignored. When no usable grading
 * remains the result is `undefined`; when exactly one remains it is returned
 * unchanged. Otherwise:
 *   - `pass_rate` is the arithmetic mean of the per-run pass rates,
 *   - `assertion_results` and `total` are taken from the last usable run
 *     (they are kept for display only),
 *   - `passed` is the rounded mean of the per-run passed counts, capped at
 *     `total`, and `failed` is derived as `total - passed` so the two counts
 *     always add up to `total`.
 *
 * @param {Array<object|null|undefined>} gradings One grading per run.
 * @returns {object|undefined} The combined grading, or `undefined` when every
 *   entry was missing.
 */
function averageGradings(gradings) {
    // `!= null` rejects both null and undefined, so a missing grading can
    // never reach the `.summary` reads below.
    const valid = gradings.filter((g) => g != null);
    if (valid.length === 0)
        return undefined;
    if (valid.length === 1)
        return valid[0];
    const mean = (pick) => valid.reduce((sum, g) => sum + pick(g.summary), 0) / valid.length;
    const last = valid[valid.length - 1];
    const total = last.summary.total;
    // Rounding passed and failed independently can yield counts that do not
    // sum to total (e.g. 1.5 and 1.5 both round up), so only passed is
    // rounded and failed is whatever remains.
    const passed = Math.min(total, Math.round(mean((s) => s.passed)));
    return {
        assertion_results: last.assertion_results,
        summary: {
            passed,
            failed: total - passed,
            total,
            pass_rate: mean((s) => s.pass_rate),
        },
    };
}
|
|
21
46
|
function validateEvalsFile(evalsFile, evalsPath) {
|
|
22
47
|
if (!evalsFile.skill_name || typeof evalsFile.skill_name !== 'string') {
|
|
23
48
|
throw new SnapevalError(`Invalid evals.json at ${evalsPath}: missing or invalid "skill_name" field.`);
|
|
@@ -63,6 +88,9 @@ export async function evalCommand(skillPath, harness, inference, options) {
|
|
|
63
88
|
}
|
|
64
89
|
evalsFile = { ...evalsFile, evals: filtered };
|
|
65
90
|
}
|
|
91
|
+
if (options.threshold !== undefined && (options.threshold < 0 || options.threshold > 1)) {
|
|
92
|
+
throw new SnapevalError(`Threshold must be between 0 and 1 (e.g., 0.8 for 80%). Got: ${options.threshold}`);
|
|
93
|
+
}
|
|
66
94
|
const ws = new WorkspaceManager(skillPath, options.workspace);
|
|
67
95
|
const iterationDir = ws.createIteration();
|
|
68
96
|
// Track which SKILL.md was used for this iteration
|
|
@@ -95,20 +123,31 @@ export async function evalCommand(skillPath, harness, inference, options) {
|
|
|
95
123
|
if (!lastRun) {
|
|
96
124
|
throw new SnapevalError(`No runs completed for eval ${evalCase.id}`);
|
|
97
125
|
}
|
|
98
|
-
//
|
|
99
|
-
|
|
100
|
-
const
|
|
126
|
+
// Average pass rates across all runs for statistical significance
|
|
127
|
+
const withSkillGrading = averageGradings(allGradings.map(g => g.withSkill));
|
|
128
|
+
const withoutSkillGrading = averageGradings(allGradings.map(g => g.withoutSkill));
|
|
129
|
+
// When runs > 1, overwrite grading.json with averaged results so
|
|
130
|
+
// artifacts match the benchmark (not just the last run's raw data)
|
|
131
|
+
if (runs > 1) {
|
|
132
|
+
if (withSkillGrading) {
|
|
133
|
+
fs.writeFileSync(path.join(evalDir, 'with_skill', 'grading.json'), JSON.stringify(withSkillGrading, null, 2));
|
|
134
|
+
}
|
|
135
|
+
if (withoutSkillGrading) {
|
|
136
|
+
fs.writeFileSync(path.join(evalDir, baselineVariant, 'grading.json'), JSON.stringify(withoutSkillGrading, null, 2));
|
|
137
|
+
}
|
|
138
|
+
}
|
|
101
139
|
return {
|
|
102
140
|
evalId: evalCase.id,
|
|
103
141
|
slug,
|
|
142
|
+
label: evalCase.label,
|
|
104
143
|
prompt: evalCase.prompt,
|
|
105
144
|
withSkill: {
|
|
106
145
|
output: lastRun.withSkill.output,
|
|
107
|
-
grading:
|
|
146
|
+
grading: withSkillGrading,
|
|
108
147
|
},
|
|
109
148
|
withoutSkill: {
|
|
110
149
|
output: lastRun.withoutSkill.output,
|
|
111
|
-
grading:
|
|
150
|
+
grading: withoutSkillGrading,
|
|
112
151
|
},
|
|
113
152
|
};
|
|
114
153
|
});
|
|
@@ -121,10 +160,11 @@ export async function evalCommand(skillPath, harness, inference, options) {
|
|
|
121
160
|
eval_count: evalRuns.length,
|
|
122
161
|
eval_ids: evalRuns.map((r) => r.evalId),
|
|
123
162
|
skill_name: evalsFile.skill_name,
|
|
163
|
+
runs_per_eval: runs,
|
|
124
164
|
timestamp: new Date().toISOString(),
|
|
125
165
|
},
|
|
126
166
|
};
|
|
127
|
-
fs.writeFileSync(path.join(iterationDir, 'benchmark.json'), JSON.stringify(benchmarkWithMeta,
|
|
167
|
+
fs.writeFileSync(path.join(iterationDir, 'benchmark.json'), JSON.stringify(benchmarkWithMeta, (_key, value) => typeof value === 'number' ? Math.round(value * 10000) / 10000 : value, 2));
|
|
128
168
|
// Check threshold if set (for CI gating)
|
|
129
169
|
if (options.threshold !== undefined) {
|
|
130
170
|
const passRate = benchmark.run_summary.with_skill.pass_rate.mean;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"eval.js","sourceRoot":"","sources":["../../../src/commands/eval.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AASlC,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,OAAO,EAAE,MAAM,qBAAqB,CAAC;AAC9C,OAAO,EAAE,eAAe,EAAE,MAAM,qBAAqB,CAAC;AACtD,OAAO,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAC;AAC3D,OAAO,EAAE,aAAa,EAAE,iBAAiB,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAEhF,KAAK,UAAU,kBAAkB,CAC/B,KAA2B,EAC3B,KAAa;IAEb,MAAM,OAAO,GAAQ,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;IAC7C,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,KAAK,UAAU,MAAM;QACnB,OAAO,KAAK,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YAC5B,MAAM,CAAC,GAAG,KAAK,EAAE,CAAC;YAClB,OAAO,CAAC,CAAC,CAAC,GAAG,MAAM,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC;QAChC,CAAC;IACH,CAAC;IACD,MAAM,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,CAAC,MAAM,CAAC,EAAE,EAAE,MAAM,CAAC,CAAC,CAAC;IACjF,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,eAAe,GAAG,EAAE,CAAC;AAE3B,SAAS,iBAAiB,CAAC,SAAoB,EAAE,SAAiB;IAChE,IAAI,CAAC,SAAS,CAAC,UAAU,IAAI,OAAO,SAAS,CAAC,UAAU,KAAK,QAAQ,EAAE,CAAC;QACtE,MAAM,IAAI,aAAa,CAAC,yBAAyB,SAAS,0CAA0C,CAAC,CAAC;IACxG,CAAC;IACD,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,KAAK,CAAC,EAAE,CAAC;QACpC,MAAM,IAAI,aAAa,CAAC,yBAAyB,SAAS,6BAA6B,CAAC,CAAC;IAC3F,CAAC;IACD,KAAK,MAAM,CAAC,CAAC,EAAE,QAAQ,CAAC,IAAI,SAAS,CAAC,KAAK,CAAC,OAAO,EAAE,EAAE,CAAC;QACtD,MAAM,MAAM,GAAG,yBAAyB,SAAS,WAAW,CAAC,GAAG,CAAC;QACjE,IAAI,OAAO,QAAQ,CAAC,EAAE,KAAK,QAAQ,EAAE,CAAC;YACpC,MAAM,IAAI,aAAa,CAAC,GAAG,MAAM,8CAA8C,CAAC,CAAC;QACnF,CAAC;QACD,IAAI,OAAO,QAAQ,CAAC,MAAM,KAAK,QAAQ,EAAE,CAAC;YACxC,MAAM,IAAI,aAAa,CAAC,GAAG,MAAM,QAAQ,QAAQ,CAAC,EAAE,2BAA2B,CAAC,CAAC;QACnF,CAAC;QACD,IAAI,OAAO,QAAQ,CAAC,eAAe,KAAK,QAAQ,EAAE,CAAC;YACjD,MAAM,IAAI,aAAa,CAAC,GAAG,MAAM,QAAQ,QAAQ,CAAC,EAAE,oCAAoC,CAAC,CAAC;QAC5F,CAAC;QACD,IAAI,QAAQ,CAAC,UAAU,KAAK,SAAS,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAC,EAAE,CAAC;YAC7E,MAAM,IAAI,aAAa,CAAC,GAAG,MAAM,QAAQ,QAAQ,CAAC,EAAE,6CAA6C,CAAC,CAAC;QACrG,CAAC;IACH,CAAC;AACH,CAAC;AAED,MAAM,C
AAC,KAAK,UAAU,WAAW,CAC/B,SAAiB,EACjB,OAAgB,EAChB,SAA2B,EAC3B,OAA4H;IAE5H,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,EAAE,YAAY,CAAC,CAAC;IAC9D,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE,CAAC;QAC9B,MAAM,IAAI,iBAAiB,CAAC,SAAS,EAAE,mDAAmD,CAAC,CAAC;IAC9F,CAAC;IAED,IAAI,SAAoB,CAAC;IACzB,IAAI,CAAC;QACH,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC,CAAC;IAC9D,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,aAAa,CAAC,mBAAmB,SAAS,mEAAmE,CAAC,CAAC;IAC3H,CAAC;IACD,iBAAiB,CAAC,SAAS,EAAE,SAAS,CAAC,CAAC;IAExC,oDAAoD;IACpD,IAAI,OAAO,CAAC,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5C,MAAM,GAAG,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QAClC,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;QAC9D,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC1B,MAAM,IAAI,aAAa,CAAC,8BAA8B,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,oBAAoB,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACjJ,CAAC;QACD,SAAS,GAAG,EAAE,GAAG,SAAS,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;IAChD,CAAC;IAED,MAAM,EAAE,GAAG,IAAI,gBAAgB,CAAC,SAAS,EAAE,OAAO,CAAC,SAAS,CAAC,CAAC;IAC9D,MAAM,YAAY,GAAG,EAAE,CAAC,eAAe,EAAE,CAAC;IAE1C,mDAAmD;IACnD,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,UAAU,CAAC,CAAC;IACrD,IAAI,EAAE,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;QAC/B,EAAE,CAAC,YAAY,CAAC,WAAW,EAAE,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,mBAAmB,CAAC,CAAC,CAAC;IAC7E,CAAC;IACD,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,IAAI,CAAC,CAAC;IAC/B,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,WAAW,IAAI,CAAC,EAAE,CAAC,CAAC,EAAE,eAAe,CAAC,CAAC;IACrF,MAAM,eAAe,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,eAAe,CAAC;IACzE,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IAE5D,8DAA8D;IAC9D,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,EAAE;QAChD,MAAM,IAAI,GAAG,gBAAgB,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;QACzE,OAAO,EAAE,QAAQ,EAAE,IAA
I,EAAE,OAAO,EAAE,EAAE,CAAC,aAAa,CAAC,YAAY,EAAE,IAAI,EAAE,eAAe,CAAC,EAAE,CAAC;IAC5F,CAAC,CAAC,CAAC;IAEH,MAAM,KAAK,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,CAAC,KAAK,IAA4B,EAAE;QAC7F,MAAM,UAAU,GAAG,QAAQ,CAAC,UAAU,IAAI,EAAE,CAAC;QAC7C,MAAM,WAAW,GAA8E,EAAE,CAAC;QAClG,IAAI,OAAO,GAA+C,IAAI,CAAC;QAE/D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC;YAC9B,OAAO,GAAG,MAAM,OAAO,CAAC,QAAQ,EAAE,SAAS,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;YAEjF,qCAAqC;YACrC,MAAM,CAAC,SAAS,EAAE,UAAU,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;gBAChD,eAAe,CACb,UAAU,EACV,OAAO,CAAC,SAAS,CAAC,MAAM,EACxB,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,YAAY,CAAC,EAChC,SAAS,EACT,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS,CACnD;gBACD,eAAe,CACb,UAAU,EACV,OAAO,CAAC,YAAY,CAAC,MAAM,EAC3B,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,eAAe,CAAC,EACnC,SAAS,EACT,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS,CACnD;aACF,CAAC,CAAC;YACH,WAAW,CAAC,IAAI,CAAC,EAAE,SAAS,EAAE,SAAS,EAAE,YAAY,EAAE,UAAU,EAAE,CAAC,CAAC;QACvE,CAAC;QAED,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,MAAM,IAAI,aAAa,CAAC,8BAA8B,QAAQ,CAAC,EAAE,EAAE,CAAC,CAAC;QACvE,CAAC;QAED,
|
|
1
|
+
{"version":3,"file":"eval.js","sourceRoot":"","sources":["../../../src/commands/eval.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AASlC,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,OAAO,EAAE,MAAM,qBAAqB,CAAC;AAC9C,OAAO,EAAE,eAAe,EAAE,MAAM,qBAAqB,CAAC;AACtD,OAAO,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAC;AAC3D,OAAO,EAAE,aAAa,EAAE,iBAAiB,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAEhF,KAAK,UAAU,kBAAkB,CAC/B,KAA2B,EAC3B,KAAa;IAEb,MAAM,OAAO,GAAQ,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;IAC7C,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,KAAK,UAAU,MAAM;QACnB,OAAO,KAAK,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YAC5B,MAAM,CAAC,GAAG,KAAK,EAAE,CAAC;YAClB,OAAO,CAAC,CAAC,CAAC,GAAG,MAAM,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC;QAChC,CAAC;IACH,CAAC;IACD,MAAM,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,CAAC,MAAM,CAAC,EAAE,EAAE,MAAM,CAAC,CAAC,CAAC;IACjF,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,eAAe,GAAG,EAAE,CAAC;AAE3B;;;;GAIG;AACH,SAAS,eAAe,CAAC,QAAkC;IACzD,MAAM,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAsB,EAAE,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC;IACrE,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,SAAS,CAAC;IACzC,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC,CAAC,CAAC,CAAC;IAExC,MAAM,WAAW,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,SAAS,EAAE,CAAC,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC;IAC1F,MAAM,SAAS,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC;IACrF,MAAM,SAAS,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC;IACrF,MAAM,IAAI,GAAG,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAErC,OAAO;QACL,iBAAiB,EAAE,IAAI,CAAC,iBAAiB;QACzC,OAAO,EAAE;YACP,MAAM,EAAE,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC;YAC7B,MAAM,EAAE,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC;YAC7B,KAAK,EAAE,IAAI,CAAC,OAAO,CAAC,KAAK;YACzB,SAAS,EAAE,WAAW;SACvB;KACF,CAAC;AACJ,CAAC;AAED,SAAS,iBAAiB,CAAC,SAAo
B,EAAE,SAAiB;IAChE,IAAI,CAAC,SAAS,CAAC,UAAU,IAAI,OAAO,SAAS,CAAC,UAAU,KAAK,QAAQ,EAAE,CAAC;QACtE,MAAM,IAAI,aAAa,CAAC,yBAAyB,SAAS,0CAA0C,CAAC,CAAC;IACxG,CAAC;IACD,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,KAAK,CAAC,EAAE,CAAC;QACpC,MAAM,IAAI,aAAa,CAAC,yBAAyB,SAAS,6BAA6B,CAAC,CAAC;IAC3F,CAAC;IACD,KAAK,MAAM,CAAC,CAAC,EAAE,QAAQ,CAAC,IAAI,SAAS,CAAC,KAAK,CAAC,OAAO,EAAE,EAAE,CAAC;QACtD,MAAM,MAAM,GAAG,yBAAyB,SAAS,WAAW,CAAC,GAAG,CAAC;QACjE,IAAI,OAAO,QAAQ,CAAC,EAAE,KAAK,QAAQ,EAAE,CAAC;YACpC,MAAM,IAAI,aAAa,CAAC,GAAG,MAAM,8CAA8C,CAAC,CAAC;QACnF,CAAC;QACD,IAAI,OAAO,QAAQ,CAAC,MAAM,KAAK,QAAQ,EAAE,CAAC;YACxC,MAAM,IAAI,aAAa,CAAC,GAAG,MAAM,QAAQ,QAAQ,CAAC,EAAE,2BAA2B,CAAC,CAAC;QACnF,CAAC;QACD,IAAI,OAAO,QAAQ,CAAC,eAAe,KAAK,QAAQ,EAAE,CAAC;YACjD,MAAM,IAAI,aAAa,CAAC,GAAG,MAAM,QAAQ,QAAQ,CAAC,EAAE,oCAAoC,CAAC,CAAC;QAC5F,CAAC;QACD,IAAI,QAAQ,CAAC,UAAU,KAAK,SAAS,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAC,EAAE,CAAC;YAC7E,MAAM,IAAI,aAAa,CAAC,GAAG,MAAM,QAAQ,QAAQ,CAAC,EAAE,6CAA6C,CAAC,CAAC;QACrG,CAAC;IACH,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,SAAiB,EACjB,OAAgB,EAChB,SAA2B,EAC3B,OAA4H;IAE5H,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,EAAE,YAAY,CAAC,CAAC;IAC9D,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE,CAAC;QAC9B,MAAM,IAAI,iBAAiB,CAAC,SAAS,EAAE,mDAAmD,CAAC,CAAC;IAC9F,CAAC;IAED,IAAI,SAAoB,CAAC;IACzB,IAAI,CAAC;QACH,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC,CAAC;IAC9D,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,aAAa,CAAC,mBAAmB,SAAS,mEAAmE,CAAC,CAAC;IAC3H,CAAC;IACD,iBAAiB,CAAC,SAAS,EAAE,SAAS,CAAC,CAAC;IAExC,oDAAoD;IACpD,IAAI,OAAO,CAAC,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5C,MAAM,GAAG,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QAClC,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;QAC9D,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC1B,MAAM,IAAI,aAAa,CAAC,8BAA8B,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,oBAAoB,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,C
AAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACjJ,CAAC;QACD,SAAS,GAAG,EAAE,GAAG,SAAS,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;IAChD,CAAC;IAED,IAAI,OAAO,CAAC,SAAS,KAAK,SAAS,IAAI,CAAC,OAAO,CAAC,SAAS,GAAG,CAAC,IAAI,OAAO,CAAC,SAAS,GAAG,CAAC,CAAC,EAAE,CAAC;QACxF,MAAM,IAAI,aAAa,CAAC,+DAA+D,OAAO,CAAC,SAAS,EAAE,CAAC,CAAC;IAC9G,CAAC;IAED,MAAM,EAAE,GAAG,IAAI,gBAAgB,CAAC,SAAS,EAAE,OAAO,CAAC,SAAS,CAAC,CAAC;IAC9D,MAAM,YAAY,GAAG,EAAE,CAAC,eAAe,EAAE,CAAC;IAE1C,mDAAmD;IACnD,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,UAAU,CAAC,CAAC;IACrD,IAAI,EAAE,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;QAC/B,EAAE,CAAC,YAAY,CAAC,WAAW,EAAE,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,mBAAmB,CAAC,CAAC,CAAC;IAC7E,CAAC;IACD,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,IAAI,CAAC,CAAC;IAC/B,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,WAAW,IAAI,CAAC,EAAE,CAAC,CAAC,EAAE,eAAe,CAAC,CAAC;IACrF,MAAM,eAAe,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,eAAe,CAAC;IACzE,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IAE5D,8DAA8D;IAC9D,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,EAAE;QAChD,MAAM,IAAI,GAAG,gBAAgB,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;QACzE,OAAO,EAAE,QAAQ,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE,CAAC,aAAa,CAAC,YAAY,EAAE,IAAI,EAAE,eAAe,CAAC,EAAE,CAAC;IAC5F,CAAC,CAAC,CAAC;IAEH,MAAM,KAAK,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,CAAC,KAAK,IAA4B,EAAE;QAC7F,MAAM,UAAU,GAAG,QAAQ,CAAC,UAAU,IAAI,EAAE,CAAC;QAC7C,MAAM,WAAW,GAA8E,EAAE,CAAC;QAClG,IAAI,OAAO,GAA+C,IAAI,CAAC;QAE/D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC;YAC9B,OAAO,GAAG,MAAM,OAAO,CAAC,QAAQ,EAAE,SAAS,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;YAEjF,qCAAqC;YACrC,MAAM,CAAC,SAAS,EAAE,UAAU,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;gBAChD,eAAe,CACb,UAAU,EACV,OAAO,CAAC,SAAS,CAAC,MAAM,EACxB,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,YAAY,CAAC,EAChC,SAAS,EACT,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS,CACnD;gBACD,eAAe,CACb,UAAU,EACV,OAA
O,CAAC,YAAY,CAAC,MAAM,EAC3B,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,eAAe,CAAC,EACnC,SAAS,EACT,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS,CACnD;aACF,CAAC,CAAC;YACH,WAAW,CAAC,IAAI,CAAC,EAAE,SAAS,EAAE,SAAS,EAAE,YAAY,EAAE,UAAU,EAAE,CAAC,CAAC;QACvE,CAAC;QAED,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,MAAM,IAAI,aAAa,CAAC,8BAA8B,QAAQ,CAAC,EAAE,EAAE,CAAC,CAAC;QACvE,CAAC;QAED,kEAAkE;QAClE,MAAM,gBAAgB,GAAG,eAAe,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC;QAC5E,MAAM,mBAAmB,GAAG,eAAe,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC;QAElF,iEAAiE;QACjE,mEAAmE;QACnE,IAAI,IAAI,GAAG,CAAC,EAAE,CAAC;YACb,IAAI,gBAAgB,EAAE,CAAC;gBACrB,EAAE,CAAC,aAAa,CACd,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,YAAY,EAAE,cAAc,CAAC,EAChD,IAAI,CAAC,SAAS,CAAC,gBAAgB,EAAE,IAAI,EAAE,CAAC,CAAC,CAC1C,CAAC;YACJ,CAAC;YACD,IAAI,mBAAmB,EAAE,CAAC;gBACxB,EAAE,CAAC,aAAa,CACd,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,eAAe,EAAE,cAAc,CAAC,EACnD,IAAI,CAAC,SAAS,CAAC,mBAAmB,EAAE,IAAI,EAAE,CAAC,CAAC,CAC7C,CAAC;YACJ,CAAC;QACH,CAAC;QAED,OAAO;YACL,MAAM,EAAE,QAAQ,CAAC,EAAE;YACnB,IAAI;YACJ,KAAK,EAAE,QAAQ,CAAC,KAAK;YACrB,MAAM,EAAE,QAAQ,CAAC,MAAM;YACvB,SAAS,EAAE;gBACT,MAAM,EAAE,OAAO,CAAC,SAAS,CAAC,MAAM;gBAChC,OAAO,EAAE,gBAAgB;aAC1B;YACD,YAAY,EAAE;gBACZ,MAAM,EAAE,OAAO,CAAC,YAAY,CAAC,MAAM;gBACnC,OAAO,EAAE,mBAAmB;aAC7B;SACF,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,MAAM,QAAQ,GAAG,MAAM,kBAAkB,CAAC,KAAK,EAAE,WAAW,CAAC,CAAC;IAC9D,MAAM,SAAS,GAAG,gBAAgB,CAAC,QAAQ,CAAC,CAAC;IAE7C,wDAAwD;IACxD,MAAM,iBAAiB,GAAG;QACxB,GAAG,SAAS;QACZ,QAAQ,EAAE;YACR,UAAU,EAAE,QAAQ,CAAC,MAAM;YAC3B,QAAQ,EAAE,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;YACvC,UAAU,EAAE,SAAS,CAAC,UAAU;YAChC,aAAa,EAAE,IAAI;YACnB,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACpC;KACF,CAAC;IAEF,EAAE,CAAC,aAAa,CACd,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,gBAAgB,CAAC,EACzC,IAAI,CAAC,SAAS,CAAC,iBAAiB,EAAE,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE,CAChD,OAAO,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,GAAG,KAAK,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,KAAK,EAAE,CAAC,CA
AC,CAC5E,CAAC;IAEF,yCAAyC;IACzC,IAAI,OAAO,CAAC,SAAS,KAAK,SAAS,EAAE,CAAC;QACpC,MAAM,QAAQ,GAAG,SAAS,CAAC,WAAW,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,CAAC;QACjE,IAAI,QAAQ,GAAG,OAAO,CAAC,SAAS,EAAE,CAAC;YACjC,yEAAyE;YACzE,MAAM,OAAO,GAAG,EAAE,SAAS,EAAE,SAAS,CAAC,UAAU,EAAE,QAAQ,EAAE,SAAS,EAAE,YAAY,EAAE,CAAC;YACvF,MAAM,MAAM,CAAC,MAAM,CAAC,IAAI,cAAc,CAAC,QAAQ,EAAE,OAAO,CAAC,SAAS,CAAC,EAAE,EAAE,OAAO,EAAE,CAAC,CAAC;QACpF,CAAC;IACH,CAAC;IAED,OAAO;QACL,SAAS,EAAE,SAAS,CAAC,UAAU;QAC/B,QAAQ;QACR,SAAS;QACT,YAAY;KACb,CAAC;AACJ,CAAC"}
|
|
@@ -50,7 +50,7 @@ function runScript(scriptName, outputDir, scriptsDir) {
|
|
|
50
50
|
return { text: `script:${scriptName}`, passed: false, evidence: `Script not found: ${scriptPath}` };
|
|
51
51
|
}
|
|
52
52
|
try {
|
|
53
|
-
const stdout = execFileSync(scriptPath, [outputDir], { encoding: 'utf-8', timeout: 30000 }).trim();
|
|
53
|
+
const stdout = execFileSync(scriptPath, [outputDir], { encoding: 'utf-8', timeout: 30000, stdio: ['pipe', 'pipe', 'pipe'] }).trim();
|
|
54
54
|
const evidence = stdout || `Script passed: ${scriptName}`;
|
|
55
55
|
return { text: `script:${scriptName}`, passed: true, evidence };
|
|
56
56
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"grader.js","sourceRoot":"","sources":["../../../src/engine/grader.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAQlD,MAAM,mBAAmB,GAAG,4CAA4C,CAAC;AAEzE,SAAS,eAAe,CAAC,SAAiB,EAAE,MAAc;IACxD,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,mBAAmB,CAAC,CAAC;IACnD,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IACxB,MAAM,QAAQ,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;IAC1B,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC;IAC7B,MAAM,MAAM,GAAG,MAAM,KAAK,QAAQ,CAAC;IACnC,OAAO;QACL,IAAI,EAAE,SAAS;QACf,MAAM;QACN,QAAQ,EAAE,MAAM;YACd,CAAC,CAAC,iBAAiB,QAAQ,GAAG;YAC9B,CAAC,CAAC,cAAc,QAAQ,YAAY,MAAM,GAAG;KAChD,CAAC;AACJ,CAAC;AAED,SAAS,kBAAkB,CAAC,UAAoB,EAAE,MAAc,EAAE,KAAe;IAC/E,MAAM,QAAQ,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,qBAAqB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IACjF,OAAO;;;;;;;;;;;;EAYP,MAAM;KACH,QAAQ;;;EAGX,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;;;;;;;EAOrD,CAAC;AACH,CAAC;AAED,SAAS,SAAS,CAChB,UAAkB,EAClB,SAAiB,EACjB,UAAkB;IAElB,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,UAAU,CAAC,CAAC;IACrD,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;QAC/B,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,qBAAqB,UAAU,EAAE,EAAE,CAAC;IACtG,CAAC;IACD,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,YAAY,CAAC,UAAU,EAAE,CAAC,SAAS,CAAC,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;
|
|
1
|
+
{"version":3,"file":"grader.js","sourceRoot":"","sources":["../../../src/engine/grader.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAQlD,MAAM,mBAAmB,GAAG,4CAA4C,CAAC;AAEzE,SAAS,eAAe,CAAC,SAAiB,EAAE,MAAc;IACxD,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,mBAAmB,CAAC,CAAC;IACnD,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IACxB,MAAM,QAAQ,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;IAC1B,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC;IAC7B,MAAM,MAAM,GAAG,MAAM,KAAK,QAAQ,CAAC;IACnC,OAAO;QACL,IAAI,EAAE,SAAS;QACf,MAAM;QACN,QAAQ,EAAE,MAAM;YACd,CAAC,CAAC,iBAAiB,QAAQ,GAAG;YAC9B,CAAC,CAAC,cAAc,QAAQ,YAAY,MAAM,GAAG;KAChD,CAAC;AACJ,CAAC;AAED,SAAS,kBAAkB,CAAC,UAAoB,EAAE,MAAc,EAAE,KAAe;IAC/E,MAAM,QAAQ,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,qBAAqB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IACjF,OAAO;;;;;;;;;;;;EAYP,MAAM;KACH,QAAQ;;;EAGX,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;;;;;;;EAOrD,CAAC;AACH,CAAC;AAED,SAAS,SAAS,CAChB,UAAkB,EAClB,SAAiB,EACjB,UAAkB;IAElB,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,UAAU,CAAC,CAAC;IACrD,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;QAC/B,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,qBAAqB,UAAU,EAAE,EAAE,CAAC;IACtG,CAAC;IACD,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,YAAY,CAAC,UAAU,EAAE,CAAC,SAAS,CAAC,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACpI,MAAM,QAAQ,GAAG,MAAM,IAAI,kBAAkB,UAAU,EAAE,CAAC;QAC1D,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;IAClE,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,8DAA8D;QAC9D,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC;QAClC,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC;QAClC,IAAI,QAAgB,CAAC;QACrB,IAAI,GAAG,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;YAC1B,QAAQ,GAAG,sBAAsB,UAAU,qCAAqC,UAAU,EAAE,CAAC;QAC/F,CAAC;aAAM,IAAI,MAAM,EAAE,CAAC;YAClB,gEA
AgE;YAChE,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QACnC,CAAC;aAAM,IAAI,MAAM,EAAE,CAAC;YAClB,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QACnC,CAAC;aAAM,CAAC;YACN,QAAQ,GAAG,2BAA2B,GAAG,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAClE,CAAC;QACD,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;IACnE,CAAC;AACH,CAAC;AAED,SAAS,WAAW,CAAC,IAAY;IAC/B,8DAA8D;IAC9D,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,yBAAyB,CAAC,CAAC;IACxD,IAAI,SAAS;QAAE,OAAO,SAAS,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IAC1C,gEAAgE;IAChE,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;IAC5B,IAAI,CAAC;QAAC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QAAC,OAAO,OAAO,CAAC;IAAC,CAAC;IAAC,MAAM,CAAC,CAAC,kBAAkB,CAAC,CAAC;IACzE,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;IACnD,IAAI,QAAQ;QAAE,OAAO,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACxC,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,UAAoB,EACpB,MAAwB,EACxB,MAAc,EACd,SAA2B,EAC3B,UAAmB;IAEnB,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAEzC,MAAM,gBAAgB,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC,CAAC;IACzE,MAAM,eAAe,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,mBAAmB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IACxG,MAAM,aAAa,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,CAAC,mBAAmB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IACvG,MAAM,OAAO,GAAsB,EAAE,CAAC;IAEtC,KAAK,MAAM,SAAS,IAAI,gBAAgB,EAAE,CAAC;QACzC,MAAM,UAAU,GAAG,SAAS,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QACrD,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;QAC/C,MAAM,GAAG,GAAG,UAAU,IAAI,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;QAClF,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,UAAU,EAAE,SAAS,EAAE,GAAG,CAAC,CAAC,CAAC;IACtD,CAAC;IAED,KAAK,MAAM,SAAS,IAAI,eAAe,EAAE,CAAC;QACxC,MAAM,MAAM,GAAG,eAAe,CAAC,SAAS,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC;QACtD,IAAI,MAAM;YAAE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACnC,CAAC;IAED,IAAI,aAAa,CA
AC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC7B,MAAM,MAAM,GAAG,kBAAkB,CAAC,aAAa,EAAE,MAAM,CAAC,GAAG,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC;QAC3E,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,IAAI,CACnC,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,EACnC,EAAE,WAAW,EAAE,CAAC,EAAE,cAAc,EAAE,MAAM,EAAE,CAC3C,CAAC;QACF,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC,CAAC;QACjD,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;YAC/B,OAAO,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,EAAE,QAAQ,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;QAClF,CAAC;IACH,CAAC;IAED,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IACpD,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IACrD,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC;IAE7B,MAAM,OAAO,GAAkB;QAC7B,iBAAiB,EAAE,OAAO;QAC1B,OAAO,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE;KAC9E,CAAC;IAEF,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,cAAc,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAEtF,OAAO,OAAO,CAAC;AACjB,CAAC"}
|
|
@@ -35,6 +35,7 @@ export async function runEval(evalCase, skillPath, evalDir, harness, oldSkillPat
|
|
|
35
35
|
return {
|
|
36
36
|
evalId: evalCase.id,
|
|
37
37
|
slug: evalCase.slug ?? `${evalCase.id}`,
|
|
38
|
+
label: evalCase.label,
|
|
38
39
|
prompt: evalCase.prompt,
|
|
39
40
|
withSkill: { output: withSkillResult },
|
|
40
41
|
withoutSkill: { output: baselineResult },
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"runner.js","sourceRoot":"","sources":["../../../src/engine/runner.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;
|
|
1
|
+
{"version":3,"file":"runner.js","sourceRoot":"","sources":["../../../src/engine/runner.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAYlC,SAAS,WAAW,CAAC,GAAW,EAAE,MAAwB;IACxD,MAAM,MAAM,GAAe,EAAE,YAAY,EAAE,MAAM,CAAC,YAAY,EAAE,WAAW,EAAE,MAAM,CAAC,WAAW,EAAE,CAAC;IAClG,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,aAAa,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;AACnF,CAAC;AAED,SAAS,WAAW,CAAC,GAAW,EAAE,MAAwB;IACxD,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,SAAS,EAAE,YAAY,CAAC,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC;IACtE,IAAI,MAAM,CAAC,UAAU,EAAE,CAAC;QACtB,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,gBAAgB,CAAC,EAAE,MAAM,CAAC,UAAU,CAAC,CAAC;IACxE,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,OAAO,CAC3B,QAAkB,EAClB,SAAiB,EACjB,OAAe,EACf,OAAgB,EAChB,YAAqB;IAErB,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,YAAY,CAAC,CAAC;IACtD,MAAM,eAAe,GAAG,YAAY,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,eAAe,CAAC;IACrE,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,eAAe,CAAC,CAAC;IAExD,MAAM,CAAC,eAAe,EAAE,cAAc,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;QAC1D,OAAO,CAAC,GAAG,CAAC;YACV,SAAS;YACT,MAAM,EAAE,QAAQ,CAAC,MAAM;YACvB,KAAK,EAAE,QAAQ,CAAC,KAAK;YACrB,SAAS,EAAE,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,SAAS,CAAC;SAC9C,CAAC;QACF,OAAO,CAAC,GAAG,CAAC;YACV,SAAS,EAAE,YAAY;YACvB,MAAM,EAAE,QAAQ,CAAC,MAAM;YACvB,KAAK,EAAE,QAAQ,CAAC,KAAK;YACrB,SAAS,EAAE,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,SAAS,CAAC;SAC7C,CAAC;KACH,CAAC,CAAC;IACH,WAAW,CAAC,YAAY,EAAE,eAAe,CAAC,CAAC;IAC3C,WAAW,CAAC,YAAY,EAAE,eAAe,CAAC,CAAC;IAC3C,WAAW,CAAC,WAAW,EAAE,cAAc,CAAC,CAAC;IACzC,WAAW,CAAC,WAAW,EAAE,cAAc,CAAC,CAAC;IAEzC,OAAO;QACL,MAAM,EAAE,QAAQ,CAAC,EAAE;QACnB,IAAI,EAAE,QAAQ,CAAC,IAAI,IAAI,GAAG,QAAQ,CAAC,EAAE,EAAE;QACvC,KAAK,EAAE,QAAQ,CAAC,KAAK;QACrB,MAAM,EAAE,QAAQ,CAAC,MAAM;QACvB,SAAS,EAAE,EAAE,MAAM,EAAE,eAAe,EAAE;QACtC,YAAY,EAAE,EAAE,MAAM,EAAE,cAAc,EAAE;KACzC,CAAC;AACJ,CAAC"}
|
package/dist/src/types.d.ts
CHANGED
|
@@ -32,6 +32,7 @@ export interface EvalCase {
|
|
|
32
32
|
id: number;
|
|
33
33
|
prompt: string;
|
|
34
34
|
expected_output: string;
|
|
35
|
+
label?: string;
|
|
35
36
|
slug?: string;
|
|
36
37
|
files?: string[];
|
|
37
38
|
assertions?: string[];
|
|
@@ -85,6 +86,7 @@ export interface FeedbackData {
|
|
|
85
86
|
export interface EvalRunResult {
|
|
86
87
|
evalId: number;
|
|
87
88
|
slug: string;
|
|
89
|
+
label?: string;
|
|
88
90
|
prompt: string;
|
|
89
91
|
withSkill: {
|
|
90
92
|
output: HarnessRunResult;
|
package/package.json
CHANGED
package/plugin.json
CHANGED
package/skills/snapeval/SKILL.md
CHANGED
|
@@ -3,7 +3,7 @@ name: snapeval
|
|
|
3
3
|
description: Evaluate AI skills using the agentskills.io eval spec. Runs with/without skill comparisons, grades assertions, and computes benchmarks. Use when the user wants to evaluate, test, or review any skill — including phrases like "test my skill", "run evals", "evaluate this", "set up evals", or "how good is my skill."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
|
-
You are snapeval, a harness-agnostic eval runner for agentskills.io skills. You help developers evaluate AI skills by designing test scenarios, running with/without skill comparisons,
|
|
6
|
+
You are snapeval, a harness-agnostic eval runner for agentskills.io skills. You help developers evaluate AI skills by understanding what matters to them, designing targeted test scenarios, running with/without skill comparisons, and iterating on skill quality.
|
|
7
7
|
|
|
8
8
|
## Mode Detection
|
|
9
9
|
|
|
@@ -12,52 +12,88 @@ Before acting, determine the current state by checking files in the skill direct
|
|
|
12
12
|
| State | Condition | Mode |
|
|
13
13
|
|-------|-----------|------|
|
|
14
14
|
| **Fresh** | No `evals/evals.json` | First Evaluation |
|
|
15
|
-
| **Has evals, no workspace** | `evals/evals.json` exists but no workspace directory | Run
|
|
16
|
-
| **Has results** | Workspace with `iteration-N/` exists |
|
|
15
|
+
| **Has evals, no workspace** | `evals/evals.json` exists but no workspace directory | Run Eval or Review (skip all interactive phases — go straight to running the command) |
|
|
16
|
+
| **Has results** | Workspace with `iteration-N/` exists | Re-eval or Review (skip all interactive phases) |
|
|
17
|
+
|
|
18
|
+
**Important:** The interactive phases (Discover, Analyze, Interview, Propose) only apply to the **First Evaluation** flow when no evals.json exists. When evals.json already exists, skip straight to running the `eval` or `review` command. If the user says "run", "just do it", or "without asking", always skip interactive phases.
|
|
17
19
|
|
|
18
20
|
## First Evaluation
|
|
19
21
|
|
|
20
|
-
Triggered by: "evaluate", "test", "set up evals", "evaluate my skill"
|
|
22
|
+
Triggered by: "evaluate", "test", "set up evals", "evaluate my skill", "how good is my skill"
|
|
21
23
|
|
|
22
24
|
### Phase 1 — Discover
|
|
23
25
|
|
|
24
|
-
1.
|
|
26
|
+
1. Identify the skill to evaluate — accept the path the user provides, or infer it from context if they mention a skill name or directory
|
|
25
27
|
2. Read the target skill's SKILL.md using the Read tool
|
|
26
28
|
3. Summarize what the skill does in 1-2 sentences
|
|
27
29
|
4. Confirm understanding: "This skill [summary]. Is that right?"
|
|
28
30
|
|
|
29
|
-
|
|
31
|
+
**STOP. Do not proceed to Phase 2 until the user confirms your understanding is correct. Wait for the user to respond.**
|
|
32
|
+
|
|
33
|
+
### Phase 2 — Deep Skill Analysis
|
|
34
|
+
|
|
35
|
+
Before asking the user anything, do your own homework. Study the skill thoroughly to map its surface area:
|
|
36
|
+
|
|
37
|
+
1. **Re-read the SKILL.md carefully** — not just the summary, but every instruction, rule, format spec, and example
|
|
38
|
+
2. **Map the behavior space** — identify every distinct thing the skill does (e.g., "generates commit messages", "handles empty diffs", "detects breaking changes")
|
|
39
|
+
3. **Map the input space** — what kinds of inputs does it accept? What dimensions vary? (language, length, complexity, format, edge cases)
|
|
40
|
+
4. **Identify implicit assumptions** — what does the skill assume about context, user intent, or environment that could break?
|
|
41
|
+
5. **Spot gaps and ambiguities** — where are the instructions vague, contradictory, or silent? These are often where failures hide
|
|
42
|
+
|
|
43
|
+
Present this analysis to the user as a brief skill map:
|
|
44
|
+
> "I've analyzed your skill in depth. Here's what I see:
|
|
45
|
+
> - **N core behaviors**: [list them]
|
|
46
|
+
> - **N input dimensions**: [list them]
|
|
47
|
+
> - **N potential weak spots**: [list them — gaps, ambiguities, untested assumptions]"
|
|
48
|
+
|
|
49
|
+
### Phase 3 — Interview
|
|
50
|
+
|
|
51
|
+
Now ask targeted questions to fill gaps your analysis couldn't answer. You've done the work — your questions should be specific and informed, not generic.
|
|
52
|
+
|
|
53
|
+
Ask 2-3 focused questions (one at a time) based on what you found in Phase 2. Examples:
|
|
54
|
+
|
|
55
|
+
- "Your skill says [X] but doesn't specify what happens when [Y]. What should it do?"
|
|
56
|
+
- "I see the skill handles [A] and [B] but doesn't mention [C]. Is that a case you care about?"
|
|
57
|
+
- "The output format section says [X]. In practice, do your users need exactly that, or is there flexibility?"
|
|
58
|
+
- "I noticed the skill doesn't address [edge case]. Has that come up, or is it not a concern?"
|
|
59
|
+
|
|
60
|
+
Ask ONE question at a time. Wait for the answer before asking the next one. Two to three questions is usually enough — don't turn this into an interrogation. If the user seems impatient or says "just test it", respect that and move to Phase 4 (Propose Scenarios) with reasonable defaults.
|
|
61
|
+
|
|
62
|
+
**STOP after each question. Wait for the user to respond before asking the next question or moving on.**
|
|
63
|
+
|
|
64
|
+
### Phase 4 — Propose Scenarios
|
|
30
65
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
- Edge cases (empty input, unusual input)
|
|
36
|
-
- At least one negative test
|
|
37
|
-
4. Present scenarios as a numbered list. For each scenario show:
|
|
66
|
+
Using your analysis and the user's answers, generate 5-8 test scenarios tailored to what actually matters.
|
|
67
|
+
|
|
68
|
+
1. Present a brief skill profile: "Based on what you told me, I'll focus on [key concerns]. Your skill has N core behaviors and I see N areas worth testing."
|
|
69
|
+
2. Present scenarios as a numbered list. For each scenario show:
|
|
38
70
|
- The prompt (realistic — messy, with typos, abbreviations, personal context)
|
|
39
|
-
- What it tests
|
|
71
|
+
- What it tests and why (connected back to the user's answers)
|
|
40
72
|
- Why it matters
|
|
41
|
-
|
|
73
|
+
3. Ask: "Want to adjust any of these, or should I run them?"
|
|
74
|
+
|
|
75
|
+
**STOP. Do not write evals.json or run any commands until the user approves the scenario list (or says "just run it", "looks good", "I trust you", etc.). Wait for the user to respond.**
|
|
42
76
|
|
|
43
|
-
### Phase
|
|
77
|
+
### Phase 5 — Handle Feedback
|
|
44
78
|
|
|
45
79
|
- If the user wants changes, adjust conversationally
|
|
46
80
|
- "Drop 3, add one about empty input" → adjust the list and re-present
|
|
47
81
|
- Loop until confirmed
|
|
48
|
-
- If the user says "just run it" → skip to Phase
|
|
82
|
+
- If the user says "just run it", "looks good", "I trust you", or similar → skip to Phase 6 immediately
|
|
49
83
|
|
|
50
|
-
### Phase
|
|
84
|
+
### Phase 6 — Write evals.json & Run
|
|
51
85
|
|
|
52
|
-
1. Write the approved scenarios to `<skill-path>/evals/evals.json
|
|
86
|
+
1. Write the approved scenarios to `<skill-path>/evals/evals.json`. Format:
|
|
53
87
|
```json
|
|
54
88
|
{
|
|
55
89
|
"skill_name": "<skill-name>",
|
|
56
90
|
"evals": [
|
|
57
91
|
{
|
|
58
92
|
"id": 1,
|
|
93
|
+
"label": "short descriptive name",
|
|
59
94
|
"slug": "kebab-case-slug",
|
|
60
95
|
"prompt": "The realistic user prompt",
|
|
96
|
+
"expected_output": "Human description of expected behavior",
|
|
61
97
|
"assertions": ["Assertion 1", "Assertion 2"],
|
|
62
98
|
"files": []
|
|
63
99
|
}
|
|
@@ -71,18 +107,47 @@ Triggered by: "evaluate", "test", "set up evals", "evaluate my skill"
|
|
|
71
107
|
- Good: `"Response declines to scout because the pipeline already has unclaimed issues"`
|
|
72
108
|
- Bad: `"Handles edge case properly"`
|
|
73
109
|
|
|
74
|
-
|
|
110
|
+
**Prefer semantic assertions for first evaluations.** Script assertions (`script:check.sh`) are powerful but add setup complexity (permissions, paths). Only suggest script assertions when the user specifically needs programmatic validation or has existing scripts.
|
|
75
111
|
|
|
76
112
|
2. Run: `npx snapeval eval <skill-path>` — runs each eval with and without the skill, grades assertions, produces grading.json + benchmark.json
|
|
77
|
-
|
|
78
|
-
|
|
113
|
+
|
|
114
|
+
3. Interpret the benchmark using these guidelines:
|
|
115
|
+
|
|
116
|
+
| Delta | Interpretation |
|
|
117
|
+
|-------|----------------|
|
|
118
|
+
| **+20% or more** | "Your skill adds significant value — it passes X% more assertions than raw AI." |
|
|
119
|
+
| **Above 0% but below +20%** | "Your skill helps, but the improvement is modest. Here's where it adds value: [specific assertions]." |
|
|
120
|
+
| **0%** | "Your skill isn't measurably helping on these tests. The raw AI handles them equally well. Consider making the skill more specific or testing different scenarios." |
|
|
121
|
+
| **Negative** | "Your skill is actually hurting performance on these tests. The raw AI does better without it. Check [specific failing assertions] — the skill may be adding noise or wrong instructions." |
|
|
122
|
+
|
|
123
|
+
## Adding or Modifying Evals
|
|
124
|
+
|
|
125
|
+
When the user wants to add, edit, or remove specific eval cases (not regenerate from scratch):
|
|
126
|
+
|
|
127
|
+
1. Read the existing `evals/evals.json`
|
|
128
|
+
2. Make the requested change (add new eval, modify assertion, remove eval)
|
|
129
|
+
3. Preserve all unchanged evals — never regenerate the full file. Never renumber existing eval IDs.
|
|
130
|
+
4. For new evals, append with the next available ID (e.g., if max ID is 7, use 8)
|
|
131
|
+
5. Run just the new/modified eval to verify it works: `npx snapeval eval <skill-path> --only <new-id>`
|
|
132
|
+
|
|
133
|
+
## Re-eval After Skill Change
|
|
134
|
+
|
|
135
|
+
When the user has modified their SKILL.md and wants to see if results improved:
|
|
136
|
+
|
|
137
|
+
1. Detect that `evals/evals.json` already exists — do NOT regenerate scenarios
|
|
138
|
+
2. Run: `npx snapeval eval <skill-path>` — this creates the next iteration automatically
|
|
139
|
+
3. Compare the new iteration with the previous one:
|
|
140
|
+
- Read both `benchmark.json` files
|
|
141
|
+
- Show per-eval pass rate changes
|
|
142
|
+
- Highlight which evals improved, which regressed, and which stayed the same
|
|
143
|
+
4. Give a verdict: "Your changes improved X evals, regressed Y evals, net delta: +Z%"
|
|
79
144
|
|
|
80
145
|
## Review & Iterate
|
|
81
146
|
|
|
82
147
|
Triggered by: "review", "show results", "how did it do"
|
|
83
148
|
|
|
84
149
|
1. Run: `npx snapeval review <skill-path>` — runs eval + creates feedback.json template
|
|
85
|
-
2. Interpret results using the three signals
|
|
150
|
+
2. Interpret results using the three signals:
|
|
86
151
|
- **Failed assertions** — specific gaps in the skill
|
|
87
152
|
- **Human feedback** — broader quality issues (user fills in feedback.json)
|
|
88
153
|
- **Benchmark delta** — where the skill adds value vs doesn't
|
|
@@ -92,7 +157,10 @@ Triggered by: "review", "show results", "how did it do"
|
|
|
92
157
|
- **Always-fail assertions** — possibly broken, investigate
|
|
93
158
|
- **Differentiating assertions** — pass with skill, fail without — this is where the skill shines
|
|
94
159
|
|
|
95
|
-
4. Suggest
|
|
160
|
+
4. Suggest concrete improvement strategies:
|
|
161
|
+
- Add few-shot examples to SKILL.md for failing scenarios
|
|
162
|
+
- Strengthen format constraints if output structure is inconsistent
|
|
163
|
+
- Remove redundant or conflicting instructions
|
|
96
164
|
|
|
97
165
|
## Comparing Skill Versions
|
|
98
166
|
|
|
@@ -108,8 +176,14 @@ Never show raw stack traces. Translate errors into plain language with a suggest
|
|
|
108
176
|
| Error | Response |
|
|
109
177
|
|-------|----------|
|
|
110
178
|
| No evals.json | "No test cases exist yet. Want me to design scenarios and create evals.json?" |
|
|
111
|
-
|
|
|
179
|
+
| Skill path doesn't exist | "I can't find a skill at that path. Check the directory exists and contains a SKILL.md." |
|
|
180
|
+
| Harness unavailable | "The eval harness isn't available. Make sure `@github/copilot-sdk` is installed (`npm install @github/copilot-sdk`), or try `--harness copilot-cli`." |
|
|
181
|
+
| Inference unavailable | "I can't connect to the inference service. Check that Copilot CLI is authenticated (`copilot auth status`) or set GITHUB_TOKEN." |
|
|
182
|
+
| Eval command crashes | "The eval run failed: `<error>`. This might be a config issue — check the error message and try again." |
|
|
112
183
|
| Skill invocation failure | "The skill failed to respond to eval N: `<error>`. This might be a bug in the skill — want to skip this eval and continue?" |
|
|
184
|
+
| Invalid evals.json | "The evals.json file has a syntax error. Check for missing commas, trailing commas, or mismatched brackets." |
|
|
185
|
+
|
|
186
|
+
If the same command fails twice, do not retry blindly. Explain the issue and ask the user how to proceed.
|
|
113
187
|
|
|
114
188
|
## Rules
|
|
115
189
|
|
|
@@ -117,3 +191,7 @@ Never show raw stack traces. Translate errors into plain language with a suggest
|
|
|
117
191
|
- Always read the target skill's SKILL.md before generating scenarios
|
|
118
192
|
- Only reference CLI commands that exist: `eval`, `review`
|
|
119
193
|
- Only reference CLI flags that exist: `--harness`, `--inference`, `--workspace`, `--runs`, `--concurrency`, `--only`, `--threshold`, `--old-skill`, `--no-open`, `--verbose`
|
|
194
|
+
- Use `--only <id>` to run specific eval IDs when the user wants to test a single eval (e.g., `--only 5` or `--only 1,3,7`)
|
|
195
|
+
- Use `--concurrency 5` for parallel execution when running multiple evals
|
|
196
|
+
- Use `--runs 3` when the user needs statistical confidence (averages pass rates across runs)
|
|
197
|
+
- Use `--threshold 0.8` for CI gating (exits with code 1 if the pass rate is below the threshold; the value must be between 0 and 1)
|
|
@@ -33,7 +33,9 @@ export async function getClient(): Promise<any> {
|
|
|
33
33
|
);
|
|
34
34
|
}
|
|
35
35
|
|
|
36
|
-
|
|
36
|
+
// Suppress ExperimentalWarning (e.g., SQLite) in the spawned CLI subprocess
|
|
37
|
+
const env = { ...process.env, NODE_OPTIONS: [process.env.NODE_OPTIONS, '--no-warnings'].filter(Boolean).join(' ') };
|
|
38
|
+
clientInstance = new CopilotClient({ logLevel: 'none', env });
|
|
37
39
|
await clientInstance.start();
|
|
38
40
|
clientStarted = true;
|
|
39
41
|
return clientInstance;
|
|
@@ -33,10 +33,9 @@ function loadPreviousIteration(iterationDir: string): PreviousIteration | null {
|
|
|
33
33
|
}
|
|
34
34
|
}
|
|
35
35
|
|
|
36
|
-
function evalLabel(run: { evalId: number; slug: string; prompt: string }): string {
|
|
37
|
-
|
|
36
|
+
/**
 * Pick a short human-readable label for an eval run.
 *
 * Preference order:
 *   1. the explicit `label`, when it is non-empty;
 *   2. the `slug`, unless it is empty or is just the stringified `evalId`;
 *   3. the first non-blank line of the prompt, trimmed and cut to 60 chars;
 *   4. the stringified `evalId`, when the prompt has no non-blank line.
 *
 * @param run The identifying fields of an eval run.
 * @returns A non-empty label.
 */
function evalLabel(run: { evalId: number; slug: string; label?: string; prompt: string }): string {
  if (run.label) return run.label;
  if (run.slug && run.slug !== `${run.evalId}`) return run.slug;

  // Split on LF or CRLF and trim, so a Windows-style prompt cannot leak a
  // trailing '\r' into the label, and skip leading blank lines so the label
  // is never an empty string.
  const firstMeaningfulLine = run.prompt
    .split(/\r?\n/)
    .map((line) => line.trim())
    .find((line) => line.length > 0);

  if (firstMeaningfulLine === undefined) return `${run.evalId}`;
  return firstMeaningfulLine.slice(0, 60);
}
|
package/src/commands/eval.ts
CHANGED
|
@@ -32,6 +32,32 @@ async function runWithConcurrency<T>(
|
|
|
32
32
|
|
|
33
33
|
const MAX_CONCURRENCY = 10;
|
|
34
34
|
|
|
35
|
+
/**
 * Combine the gradings from repeated runs of one eval into a single result.
 *
 * `null` entries are ignored. With no valid grading the result is
 * `undefined`; with exactly one, that grading object is returned as-is.
 * Otherwise:
 *   - `pass_rate` is the arithmetic mean of the per-run pass rates;
 *   - `assertion_results` and `total` are taken from the last valid run
 *     (used for display);
 *   - `passed` / `failed` are the rounded per-run means, adjusted so that
 *     `passed + failed` never exceeds `total`.
 *
 * @param gradings Per-run gradings in run order; `null` marks a missing one.
 * @returns The combined grading, or `undefined` when every entry is `null`.
 */
function averageGradings(gradings: (GradingResult | null)[]): GradingResult | undefined {
  const valid = gradings.filter((g): g is GradingResult => g !== null);
  if (valid.length === 0) return undefined;
  if (valid.length === 1) return valid[0];

  // Mean of one numeric summary field across all valid runs.
  const mean = (pick: (g: GradingResult) => number): number =>
    valid.reduce((sum, g) => sum + pick(g), 0) / valid.length;

  const last = valid[valid.length - 1];
  const total = last.summary.total;

  const passed = Math.min(total, Math.round(mean((g) => g.summary.passed)));
  // Rounding both means independently overshoots when each ends in .5
  // (e.g. 1.5 passed + 1.5 failed of 3 would become 2 + 2). Cap `failed`
  // by what is left after `passed` so the summary stays self-consistent.
  const failed = Math.max(0, Math.min(total - passed, Math.round(mean((g) => g.summary.failed))));

  return {
    assertion_results: last.assertion_results,
    summary: {
      passed,
      failed,
      total,
      pass_rate: mean((g) => g.summary.pass_rate),
    },
  };
}
|
|
60
|
+
|
|
35
61
|
function validateEvalsFile(evalsFile: EvalsFile, evalsPath: string): void {
|
|
36
62
|
if (!evalsFile.skill_name || typeof evalsFile.skill_name !== 'string') {
|
|
37
63
|
throw new SnapevalError(`Invalid evals.json at ${evalsPath}: missing or invalid "skill_name" field.`);
|
|
@@ -85,6 +111,10 @@ export async function evalCommand(
|
|
|
85
111
|
evalsFile = { ...evalsFile, evals: filtered };
|
|
86
112
|
}
|
|
87
113
|
|
|
114
|
+
if (options.threshold !== undefined && (options.threshold < 0 || options.threshold > 1)) {
|
|
115
|
+
throw new SnapevalError(`Threshold must be between 0 and 1 (e.g., 0.8 for 80%). Got: ${options.threshold}`);
|
|
116
|
+
}
|
|
117
|
+
|
|
88
118
|
const ws = new WorkspaceManager(skillPath, options.workspace);
|
|
89
119
|
const iterationDir = ws.createIteration();
|
|
90
120
|
|
|
@@ -136,21 +166,39 @@ export async function evalCommand(
|
|
|
136
166
|
throw new SnapevalError(`No runs completed for eval ${evalCase.id}`);
|
|
137
167
|
}
|
|
138
168
|
|
|
139
|
-
//
|
|
140
|
-
|
|
141
|
-
const
|
|
169
|
+
// Average pass rates across all runs for statistical significance
|
|
170
|
+
const withSkillGrading = averageGradings(allGradings.map(g => g.withSkill));
|
|
171
|
+
const withoutSkillGrading = averageGradings(allGradings.map(g => g.withoutSkill));
|
|
172
|
+
|
|
173
|
+
// When runs > 1, overwrite grading.json with averaged results so
|
|
174
|
+
// artifacts match the benchmark (not just the last run's raw data)
|
|
175
|
+
if (runs > 1) {
|
|
176
|
+
if (withSkillGrading) {
|
|
177
|
+
fs.writeFileSync(
|
|
178
|
+
path.join(evalDir, 'with_skill', 'grading.json'),
|
|
179
|
+
JSON.stringify(withSkillGrading, null, 2),
|
|
180
|
+
);
|
|
181
|
+
}
|
|
182
|
+
if (withoutSkillGrading) {
|
|
183
|
+
fs.writeFileSync(
|
|
184
|
+
path.join(evalDir, baselineVariant, 'grading.json'),
|
|
185
|
+
JSON.stringify(withoutSkillGrading, null, 2),
|
|
186
|
+
);
|
|
187
|
+
}
|
|
188
|
+
}
|
|
142
189
|
|
|
143
190
|
return {
|
|
144
191
|
evalId: evalCase.id,
|
|
145
192
|
slug,
|
|
193
|
+
label: evalCase.label,
|
|
146
194
|
prompt: evalCase.prompt,
|
|
147
195
|
withSkill: {
|
|
148
196
|
output: lastRun.withSkill.output,
|
|
149
|
-
grading:
|
|
197
|
+
grading: withSkillGrading,
|
|
150
198
|
},
|
|
151
199
|
withoutSkill: {
|
|
152
200
|
output: lastRun.withoutSkill.output,
|
|
153
|
-
grading:
|
|
201
|
+
grading: withoutSkillGrading,
|
|
154
202
|
},
|
|
155
203
|
};
|
|
156
204
|
});
|
|
@@ -165,13 +213,15 @@ export async function evalCommand(
|
|
|
165
213
|
eval_count: evalRuns.length,
|
|
166
214
|
eval_ids: evalRuns.map((r) => r.evalId),
|
|
167
215
|
skill_name: evalsFile.skill_name,
|
|
216
|
+
runs_per_eval: runs,
|
|
168
217
|
timestamp: new Date().toISOString(),
|
|
169
218
|
},
|
|
170
219
|
};
|
|
171
220
|
|
|
172
221
|
fs.writeFileSync(
|
|
173
222
|
path.join(iterationDir, 'benchmark.json'),
|
|
174
|
-
JSON.stringify(benchmarkWithMeta,
|
|
223
|
+
JSON.stringify(benchmarkWithMeta, (_key, value) =>
|
|
224
|
+
typeof value === 'number' ? Math.round(value * 10000) / 10000 : value, 2)
|
|
175
225
|
);
|
|
176
226
|
|
|
177
227
|
// Check threshold if set (for CI gating)
|
package/src/engine/grader.ts
CHANGED
|
@@ -63,7 +63,7 @@ function runScript(
|
|
|
63
63
|
return { text: `script:${scriptName}`, passed: false, evidence: `Script not found: ${scriptPath}` };
|
|
64
64
|
}
|
|
65
65
|
try {
|
|
66
|
-
const stdout = execFileSync(scriptPath, [outputDir], { encoding: 'utf-8', timeout: 30000 }).trim();
|
|
66
|
+
const stdout = execFileSync(scriptPath, [outputDir], { encoding: 'utf-8', timeout: 30000, stdio: ['pipe', 'pipe', 'pipe'] }).trim();
|
|
67
67
|
const evidence = stdout || `Script passed: ${scriptName}`;
|
|
68
68
|
return { text: `script:${scriptName}`, passed: true, evidence };
|
|
69
69
|
} catch (err: any) {
|
package/src/engine/runner.ts
CHANGED
|
@@ -5,6 +5,7 @@ import type { Harness, HarnessRunResult, EvalCase, TimingData } from '../types.j
|
|
|
5
5
|
interface RunEvalResult {
|
|
6
6
|
evalId: number;
|
|
7
7
|
slug: string;
|
|
8
|
+
label?: string;
|
|
8
9
|
prompt: string;
|
|
9
10
|
withSkill: { output: HarnessRunResult };
|
|
10
11
|
withoutSkill: { output: HarnessRunResult };
|
|
@@ -55,6 +56,7 @@ export async function runEval(
|
|
|
55
56
|
return {
|
|
56
57
|
evalId: evalCase.id,
|
|
57
58
|
slug: evalCase.slug ?? `${evalCase.id}`,
|
|
59
|
+
label: evalCase.label,
|
|
58
60
|
prompt: evalCase.prompt,
|
|
59
61
|
withSkill: { output: withSkillResult },
|
|
60
62
|
withoutSkill: { output: baselineResult },
|
package/src/types.ts
CHANGED
|
@@ -43,6 +43,7 @@ export interface EvalCase {
|
|
|
43
43
|
id: number;
|
|
44
44
|
prompt: string;
|
|
45
45
|
expected_output: string;
|
|
46
|
+
label?: string;
|
|
46
47
|
slug?: string;
|
|
47
48
|
files?: string[];
|
|
48
49
|
assertions?: string[];
|
|
@@ -110,6 +111,7 @@ export interface FeedbackData {
|
|
|
110
111
|
export interface EvalRunResult {
|
|
111
112
|
evalId: number;
|
|
112
113
|
slug: string;
|
|
114
|
+
label?: string;
|
|
113
115
|
prompt: string;
|
|
114
116
|
withSkill: {
|
|
115
117
|
output: HarnessRunResult;
|