snapeval 2.0.0 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. package/README.md +144 -104
  2. package/bin/snapeval.ts +39 -1
  3. package/dist/bin/snapeval.js +33 -0
  4. package/dist/bin/snapeval.js.map +1 -1
  5. package/dist/src/adapters/copilot-sdk-client.js +3 -1
  6. package/dist/src/adapters/copilot-sdk-client.js.map +1 -1
  7. package/dist/src/adapters/harness/copilot-sdk.d.ts +11 -0
  8. package/dist/src/adapters/harness/copilot-sdk.js +101 -0
  9. package/dist/src/adapters/harness/copilot-sdk.js.map +1 -0
  10. package/dist/src/adapters/harness/resolve.js +10 -2
  11. package/dist/src/adapters/harness/resolve.js.map +1 -1
  12. package/dist/src/adapters/inference/copilot-sdk.js +4 -1
  13. package/dist/src/adapters/inference/copilot-sdk.js.map +1 -1
  14. package/dist/src/adapters/report/terminal.js +89 -9
  15. package/dist/src/adapters/report/terminal.js.map +1 -1
  16. package/dist/src/commands/eval.d.ts +3 -0
  17. package/dist/src/commands/eval.js +146 -17
  18. package/dist/src/commands/eval.js.map +1 -1
  19. package/dist/src/commands/review.d.ts +1 -0
  20. package/dist/src/commands/review.js.map +1 -1
  21. package/dist/src/config.js +2 -1
  22. package/dist/src/config.js.map +1 -1
  23. package/dist/src/engine/grader.js +67 -9
  24. package/dist/src/engine/grader.js.map +1 -1
  25. package/dist/src/engine/runner.d.ts +1 -0
  26. package/dist/src/engine/runner.js +15 -12
  27. package/dist/src/engine/runner.js.map +1 -1
  28. package/dist/src/errors.d.ts +6 -0
  29. package/dist/src/errors.js +21 -3
  30. package/dist/src/errors.js.map +1 -1
  31. package/dist/src/types.d.ts +3 -0
  32. package/package.json +4 -1
  33. package/plugin.json +1 -1
  34. package/skills/snapeval/SKILL.md +132 -39
  35. package/src/adapters/copilot-sdk-client.ts +3 -1
  36. package/src/adapters/harness/copilot-sdk.ts +126 -0
  37. package/src/adapters/harness/resolve.ts +13 -2
  38. package/src/adapters/inference/copilot-sdk.ts +5 -1
  39. package/src/adapters/report/terminal.ts +99 -10
  40. package/src/commands/eval.ts +183 -31
  41. package/src/commands/review.ts +1 -1
  42. package/src/config.ts +2 -1
  43. package/src/engine/grader.ts +59 -8
  44. package/src/engine/runner.ts +16 -13
  45. package/src/errors.ts +24 -3
  46. package/src/types.ts +3 -0
package/README.md CHANGED
@@ -1,131 +1,178 @@
1
1
  # snapeval
2
2
 
3
- Semantic snapshot testing for AI skills. Zero assertions. AI-driven. Free inference.
3
+ Harness-agnostic eval runner for [agentskills.io](https://agentskills.io) skills.
4
4
 
5
5
  [![CI](https://github.com/matantsach/snapeval/actions/workflows/ci.yml/badge.svg)](https://github.com/matantsach/snapeval/actions/workflows/ci.yml)
6
6
  [![npm version](https://img.shields.io/npm/v/snapeval.svg)](https://www.npmjs.com/package/snapeval)
7
7
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
8
8
 
9
- snapeval evaluates [agentskills.io](https://agentskills.io) skills through semantic snapshot testing. It analyzes your skill's `SKILL.md`, collaborates with you to design a test strategy through an interactive browser-based viewer, then captures baselines and detects regressions — all with zero manual test authoring.
9
+ snapeval runs every eval case **with and without** your skill, grades assertions, and computes a benchmark delta so you can see exactly what value your skill adds.
10
10
 
11
- ## Why snapeval?
11
+ ```
12
+ snapeval — greeter
13
+ Baseline = without SKILL.md (raw AI response)
14
+ ────────────────────────────────────────────────────────────
15
+ #1 formal greeting for Eleanor
16
+ Skill: 100% | Baseline: 33% | 5.2s
17
+ #2 casual greeting for Marcus
18
+ Skill: 100% ↑ was 67% | Baseline: 67% | 2.7s
19
+ #3 pirate greeting for Zoe
20
+ Skill: 100% | Baseline: 67% | 2.5s
21
+ ────────────────────────────────────────────────────────────
22
+ Summary:
23
+ Skill pass rate: 100.0%
24
+ Baseline pass rate: 55.6%
25
+ Improvement: +44.4%
26
+ ```
12
27
 
13
- - **Interactive ideation** — AI decomposes your skill into behaviors, dimensions, and failure modes, then opens a visual viewer where you shape the test strategy together.
14
- - **Zero assertions** — No test logic to write. The AI generates realistic, messy prompts that mirror how real users actually type.
15
- - **Semantic comparison** — Tiered pipeline: schema check (free) → LLM judge with order-swap debiasing (when needed). Most checks cost $0.
16
- - **Free inference** — Uses gpt-5-mini via Copilot CLI and GitHub Models API.
17
- - **Platform-agnostic** — Adapter-based architecture. Copilot CLI first, others coming.
28
+ ## How it works
18
29
 
19
- ## Install
30
+ 1. You write a `SKILL.md` and an `evals.json` with test cases and assertions
31
+ 2. snapeval runs each eval **twice** — once with your skill loaded, once without (baseline)
32
+ 3. Assertions are graded by an LLM judge (semantic) and/or shell scripts (deterministic)
33
+ 4. A benchmark shows where your skill adds value vs. where the raw AI already handles it
20
34
 
21
- ### From the marketplace
35
+ ## Quick start
22
36
 
23
- The snapeval marketplace is bundled with the repo. Add it once, then install by name:
37
+ ### As a Copilot plugin
24
38
 
25
39
  ```bash
26
- copilot plugin marketplace add matantsach/snapeval
27
- copilot plugin install snapeval@snapeval-marketplace
40
+ copilot plugin install matantsach/snapeval
28
41
  ```
29
42
 
30
- ### From GitHub directly
43
+ Then in Copilot CLI, just say `evaluate my skill` — the snapeval skill handles the rest.
44
+
45
+ ### Standalone CLI
31
46
 
32
47
  ```bash
33
- copilot plugin install matantsach/snapeval
48
+ git clone https://github.com/matantsach/snapeval.git
49
+ cd snapeval && npm install
50
+ npx tsx bin/snapeval.ts eval <skill-dir>
34
51
  ```
35
52
 
36
- ### Verify installation
53
+ ## Eval format
37
54
 
38
- ```bash
39
- copilot plugin list
55
+ ```
56
+ my-skill/
57
+ ├── SKILL.md
58
+ └── evals/
59
+ ├── evals.json
60
+ └── scripts/ ← optional deterministic checks
61
+ └── validate.sh
62
+ ```
63
+
64
+ **evals.json:**
65
+
66
+ ```json
67
+ {
68
+ "skill_name": "greeter",
69
+ "evals": [
70
+ {
71
+ "id": 1,
72
+ "label": "formal greeting for Eleanor",
73
+ "prompt": "Can you give me a formal greeting for Eleanor?",
74
+ "expected_output": "Returns the formal greeting addressed to Eleanor.",
75
+ "assertions": [
76
+ "Output contains the name Eleanor",
77
+ "Output uses a formal tone",
78
+ "script:validate.sh"
79
+ ]
80
+ }
81
+ ]
82
+ }
40
83
  ```
41
84
 
42
- ## Usage
85
+ | Field | Required | Description |
86
+ |-------|----------|-------------|
87
+ | `id` | yes | Unique numeric identifier |
88
+ | `prompt` | yes | The user prompt sent to the harness |
89
+ | `expected_output` | yes | Human description of the expected behavior |
90
+ | `label` | no | Human-readable name shown in terminal output |
91
+ | `slug` | no | Filesystem-safe name for the eval directory |
92
+ | `assertions` | no | List of assertions to grade (LLM semantic or `script:` prefixed) |
93
+ | `files` | no | Input files to attach to the prompt |
43
94
 
44
- In Copilot CLI, just talk naturally:
95
+ ### Assertions
96
+
97
+ **Semantic** — graded by an LLM. Write specific, verifiable statements:
45
98
 
46
99
  ```
47
- > evaluate my greeter skill
48
- > test skills/code-reviewer for regressions
49
- > check if I broke anything in my-skill
50
- > approve scenario 3
100
+ "Output contains a YAML block with an 'id' field for each issue"
101
+ "Response declines because the pipeline already has unclaimed issues"
51
102
  ```
52
103
 
53
- snapeval activates automatically based on your prompt.
54
-
55
- ### What happens when you evaluate
104
+ **Script** — prefix with `script:`. Scripts live in `evals/scripts/`, receive the output directory as `$1`, and pass on exit code 0:
56
105
 
57
- 1. **Analyze** — snapeval reads your SKILL.md and reasons through behaviors, input dimensions, failure modes, and ambiguities
58
- 2. **View** — A browser-based viewer opens showing the analysis with proposed scenarios you can toggle, edit, and extend
59
- 3. **Confirm** — You review, make changes, and click "Confirm & Run" to export your plan
60
- 4. **Capture** — snapeval writes `evals.json` and runs the scenarios against your skill, saving baseline snapshots
106
+ ```
107
+ "script:validate-json-structure.sh"
108
+ ```
61
109
 
62
- After initial setup, use `check` to detect regressions and `approve` to accept intentional changes.
110
+ ## CLI reference
63
111
 
64
- ## CLI Reference
112
+ ### `eval`
65
113
 
66
- The CLI is the headless backend — useful for CI, scripting, and power users.
114
+ Run evals, grade assertions, compute benchmark.
67
115
 
68
- ```
69
- snapeval init [skill-dir] Generate test cases from SKILL.md
70
- snapeval capture [skill-dir] Run scenarios and save baseline snapshots
71
- snapeval check [skill-dir] Compare current output against baselines
72
- snapeval approve [skill-dir] Approve regressed scenarios as new baselines
73
- snapeval report [skill-dir] Write results with optional HTML viewer
74
- snapeval ideate [skill-dir] Open the interactive scenario ideation viewer
116
+ ```bash
117
+ npx snapeval eval [skill-dir] [options]
75
118
  ```
76
119
 
77
120
  | Flag | Description | Default |
78
121
  |------|-------------|---------|
79
- | `--adapter <name>` | Skill adapter | `copilot-cli` |
80
- | `--inference <name>` | Inference adapter | `auto` |
81
- | `--budget <amount>` | Spend cap in USD | `unlimited` |
82
- | `--runs <n>` | Baseline runs per scenario | `1` |
83
- | `--ci` | CI mode: exit 1 on regressions | off |
84
- | `--html` | Generate HTML report viewer | off |
85
- | `--scenario <ids>` | Comma-separated scenario IDs | all |
122
+ | `--harness <name>` | Harness adapter | `copilot-sdk` |
123
+ | `--inference <name>` | Inference adapter for grading | `auto` |
124
+ | `--workspace <path>` | Output directory | `../{skill_name}-workspace` |
125
+ | `--runs <n>` | Harness invocations per eval for statistical averaging | `1` |
126
+ | `--concurrency <n>` | Parallel eval cases (1-10) | `1` |
127
+ | `--only <ids>` | Run specific eval IDs (e.g. `--only 1,3,5`) | all |
128
+ | `--threshold <rate>` | Minimum pass rate 0-1 for exit code 0 | none |
129
+ | `--old-skill <path>` | Compare against old skill version | none |
86
130
  | `--verbose` | Verbose output | off |
87
131
 
88
- ## How It Works
132
+ ### `review`
89
133
 
90
- ```
91
- SKILL.md → AI analyzes skill → Interactive ideation viewer → Capture baselines
92
-
93
- Modify skill → Re-run scenarios → Compare via tiered pipeline
94
-
95
- Schema match? → PASS (free, instant)
96
- LLM Judge agrees? → PASS/REGRESSED
134
+ Run eval + generate HTML report + open in browser.
135
+
136
+ ```bash
137
+ npx snapeval review [skill-dir] [options]
97
138
  ```
98
139
 
99
- ### Comparison Pipeline
140
+ Same flags as `eval`, plus `--no-open` to skip opening the browser.
100
141
 
101
- | Tier | Method | Cost | When Used |
102
- |------|--------|------|-----------|
103
- | 1 | Schema check | Free | Structural skeleton matches |
104
- | 2 | LLM judge (order-swap) | Cheap | Schema differs, needs semantic comparison |
142
+ ### Exit codes
105
143
 
106
- Most stable skills are checked entirely at Tier 1 — $0.00 per run.
144
+ | Code | Meaning |
145
+ |------|---------|
146
+ | 0 | Success |
147
+ | 1 | Threshold not met (eval ran but pass rate below `--threshold`) |
148
+ | 2 | Config/input error (bad JSON, missing fields, invalid flags) |
149
+ | 3 | File not found (missing skill dir, evals.json, or script) |
150
+ | 4 | Runtime error (harness failure, grading failure, timeout) |
107
151
 
108
- ## Eval Format
152
+ ## Output artifacts
109
153
 
110
- snapeval follows the [agentskills.io evaluation standard](https://agentskills.io/skill-creation/evaluating-skills):
154
+ Each run creates an iteration directory:
111
155
 
112
156
  ```
113
- my-skill/
114
- ├── SKILL.md
115
- └── evals/
116
- ├── evals.json ← Test scenarios (AI-generated or from ideation)
117
- ├── analysis.json ← Skill analysis (behaviors, dimensions, gaps)
118
- ├── snapshots/ ← Captured baseline outputs
119
- └── results/
120
- └── iteration-N/
121
- ├── grading.json
157
+ workspace/
158
+ └── iteration-1/
159
+ ├── benchmark.json ← aggregate stats with delta
160
+ ├── SKILL.md.snapshot ← copy of skill used
161
+ └── eval-{slug}/
162
+ ├── with_skill/
163
+ │ ├── outputs/output.txt
164
+ │ ├── timing.json
165
+ │ ├── grading.json
166
+ │ └── transcript.log
167
+ └── without_skill/
168
+ ├── outputs/output.txt
122
169
  ├── timing.json
123
- └── benchmark.json
170
+ └── grading.json
124
171
  ```
125
172
 
126
- ## In CI
173
+ **benchmark.json** includes metadata: `eval_count`, `eval_ids`, `skill_name`, `runs_per_eval`, `timestamp`.
127
174
 
128
- Commit your `evals.json` and `snapshots/` directory, then add a workflow:
175
+ ## CI integration
129
176
 
130
177
  ```yaml
131
178
  name: Skill Evaluation
@@ -140,22 +187,10 @@ jobs:
140
187
  with:
141
188
  node-version: 22
142
189
  - run: npm ci
143
- - run: npx snapeval check skills/my-skill --ci
190
+ - run: npx snapeval eval skills/my-skill --threshold 0.8 --runs 3
144
191
  ```
145
192
 
146
- ## Local Development
147
-
148
- ```bash
149
- git clone https://github.com/matantsach/snapeval.git
150
- cd snapeval && npm install
151
- npx tsx bin/snapeval.ts check <skill-path>
152
- ```
153
-
154
- Or load as a local plugin:
155
-
156
- ```bash
157
- copilot plugin install ./path/to/snapeval
158
- ```
193
+ Exit code 1 when pass rate falls below threshold — blocks the PR.
159
194
 
160
195
  ## Configuration
161
196
 
@@ -163,32 +198,37 @@ Create `snapeval.config.json` in your skill or project root:
163
198
 
164
199
  ```json
165
200
  {
166
- "adapter": "copilot-cli",
201
+ "harness": "copilot-sdk",
167
202
  "inference": "auto",
168
- "runs": 3,
169
- "budget": "unlimited"
203
+ "workspace": "../{skill_name}-workspace",
204
+ "runs": 1,
205
+ "concurrency": 1
170
206
  }
171
207
  ```
172
208
 
173
- CLI flags override config file values.
209
+ Resolution order: defaults → project config → skill-dir config → CLI flags.
174
210
 
175
- ## Architecture
211
+ ## Harness adapters
176
212
 
177
- Three surfaces over a shared core engine:
213
+ | Adapter | Description | Default |
214
+ |---------|-------------|---------|
215
+ | `copilot-sdk` | Programmatic via `@github/copilot-sdk` with native skill loading | yes |
216
+ | `copilot-cli` | Shells out to `copilot` CLI binary | no |
178
217
 
179
- **Plugin** (SKILL.md) — Interactive product. AI handles everything.
180
- - **CLI** (`npx snapeval`) — Headless backend for CI and power users.
181
- - **GitHub Action** — CI wrapper (planned).
218
+ The SDK harness loads skills natively via `skillDirectories`, captures full transcripts, and extracts real token counts from `assistant.usage` events.
182
219
 
183
- Adapter layers for platform independence:
220
+ ## Inference adapters
184
221
 
185
- **SkillAdapter** — How to invoke a skill (Copilot CLI, others planned)
186
- - **InferenceAdapter** — Where to get LLM capabilities (Copilot gpt-5-mini, GitHub Models API)
187
- **ReportAdapter** — How to present results (terminal, JSON, HTML viewer)
222
+ | Adapter | Description |
223
+ |---------|-------------|
224
+ | `auto` | Copilot CLI if available, else GitHub Models API |
225
+ | `copilot` | Copilot CLI (`copilot` binary) |
226
+ | `copilot-sdk` | `@github/copilot-sdk` programmatic |
227
+ | `github-models` | GitHub Models API (requires `GITHUB_TOKEN`) |
188
228
 
189
229
  ## Contributing
190
230
 
191
- See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
231
+ See [CONTRIBUTING.md](CONTRIBUTING.md).
192
232
 
193
233
  ## License
194
234
 
package/bin/snapeval.ts CHANGED
@@ -1,4 +1,13 @@
1
1
  #!/usr/bin/env tsx
2
+
3
+ // Suppress Node.js ExperimentalWarning (e.g., SQLite) from polluting output
4
+ const _origEmit = process.emit;
5
+ // @ts-ignore — override to filter warnings
6
+ process.emit = function (event: string, ...args: any[]) {
7
+ if (event === 'warning' && args[0]?.name === 'ExperimentalWarning') return false;
8
+ return _origEmit.apply(process, [event, ...args] as any);
9
+ };
10
+
2
11
  import { Command } from 'commander';
3
12
  import { resolveConfig } from '../src/config.js';
4
13
  import { resolveInference } from '../src/adapters/inference/resolve.js';
@@ -7,6 +16,7 @@ import { evalCommand } from '../src/commands/eval.js';
7
16
  import { reviewCommand } from '../src/commands/review.js';
8
17
  import { TerminalReporter } from '../src/adapters/report/terminal.js';
9
18
  import { SnapevalError } from '../src/errors.js';
19
+ import { stopClient } from '../src/adapters/copilot-sdk-client.js';
10
20
  import * as path from 'node:path';
11
21
 
12
22
  const program = new Command();
@@ -24,6 +34,9 @@ program
24
34
  .option('--inference <inference>', 'Inference adapter to use')
25
35
  .option('--workspace <path>', 'Workspace directory')
26
36
  .option('--runs <n>', 'Runs per eval for statistical significance', '1')
37
+ .option('--concurrency <n>', 'Number of eval cases to run in parallel (1-10)', '1')
38
+ .option('--only <ids>', 'Run only specific eval IDs (comma-separated, e.g. --only 1,3,5)')
39
+ .option('--threshold <rate>', 'Minimum pass rate (0-1) for exit code 0. Below threshold exits with code 1.')
27
40
  .option('--old-skill <path>', 'Compare against old skill version instead of no-skill')
28
41
  .option('--verbose', 'Verbose output')
29
42
  .argument('[skill-dir]', 'Path to skill directory', process.cwd())
@@ -36,15 +49,26 @@ program
36
49
  inference: opts.inference as string,
37
50
  workspace: opts.workspace as string,
38
51
  runs: opts.runs ? parseInt(opts.runs as string, 10) : undefined,
52
+ concurrency: opts.concurrency ? parseInt(opts.concurrency as string, 10) : undefined,
39
53
  },
40
54
  process.cwd(), skillPath
41
55
  );
42
56
  const harness = resolveHarness(config.harness);
43
57
  const inference = resolveInference(config.inference);
44
58
 
59
+ const only = opts.only
60
+ ? (opts.only as string).split(',').map((s) => parseInt(s.trim(), 10))
61
+ : undefined;
62
+ const threshold = opts.threshold
63
+ ? parseFloat(opts.threshold as string)
64
+ : undefined;
65
+
45
66
  const results = await evalCommand(skillPath, harness, inference, {
46
67
  workspace: config.workspace,
47
68
  runs: config.runs,
69
+ concurrency: config.concurrency,
70
+ only,
71
+ threshold,
48
72
  oldSkill: opts.oldSkill as string | undefined,
49
73
  });
50
74
 
@@ -52,7 +76,15 @@ program
52
76
  await terminal.report(results);
53
77
  console.log(`Results at ${results.iterationDir}`);
54
78
  process.exit(0);
55
- } catch (err) { handleError(err); }
79
+ } catch (err: any) {
80
+ // ThresholdError has results attached — show them before failing
81
+ if (err.results) {
82
+ const terminal = new TerminalReporter();
83
+ await terminal.report(err.results);
84
+ console.log(`Results at ${err.results.iterationDir}`);
85
+ }
86
+ handleError(err);
87
+ }
56
88
  });
57
89
 
58
90
  // --- review ---
@@ -63,6 +95,7 @@ program
63
95
  .option('--inference <inference>', 'Inference adapter to use')
64
96
  .option('--workspace <path>', 'Workspace directory')
65
97
  .option('--runs <n>', 'Runs per eval for statistical significance', '1')
98
+ .option('--concurrency <n>', 'Number of eval cases to run in parallel (1-10)', '1')
66
99
  .option('--old-skill <path>', 'Compare against old skill version instead of no-skill')
67
100
  .option('--no-open', 'Do not open browser')
68
101
  .option('--verbose', 'Verbose output')
@@ -76,6 +109,7 @@ program
76
109
  inference: opts.inference as string,
77
110
  workspace: opts.workspace as string,
78
111
  runs: opts.runs ? parseInt(opts.runs as string, 10) : undefined,
112
+ concurrency: opts.concurrency ? parseInt(opts.concurrency as string, 10) : undefined,
79
113
  },
80
114
  process.cwd(), skillPath
81
115
  );
@@ -85,6 +119,7 @@ program
85
119
  await reviewCommand(skillPath, harness, inference, {
86
120
  workspace: config.workspace,
87
121
  runs: config.runs,
122
+ concurrency: config.concurrency,
88
123
  oldSkill: opts.oldSkill as string | undefined,
89
124
  noOpen: opts.open === false,
90
125
  });
@@ -92,6 +127,9 @@ program
92
127
  } catch (err) { handleError(err); }
93
128
  });
94
129
 
130
+ // Clean up SDK client on exit (no-op if never started)
131
+ process.on('exit', () => { stopClient().catch(() => {}); });
132
+
95
133
  function handleError(err: unknown): never {
96
134
  if (err instanceof SnapevalError) {
97
135
  console.error(`Error: ${err.message}`);
@@ -1,4 +1,12 @@
1
1
  #!/usr/bin/env tsx
2
+ // Suppress Node.js ExperimentalWarning (e.g., SQLite) from polluting output
3
+ const _origEmit = process.emit;
4
+ // @ts-ignore — override to filter warnings
5
+ process.emit = function (event, ...args) {
6
+ if (event === 'warning' && args[0]?.name === 'ExperimentalWarning')
7
+ return false;
8
+ return _origEmit.apply(process, [event, ...args]);
9
+ };
2
10
  import { Command } from 'commander';
3
11
  import { resolveConfig } from '../src/config.js';
4
12
  import { resolveInference } from '../src/adapters/inference/resolve.js';
@@ -7,6 +15,7 @@ import { evalCommand } from '../src/commands/eval.js';
7
15
  import { reviewCommand } from '../src/commands/review.js';
8
16
  import { TerminalReporter } from '../src/adapters/report/terminal.js';
9
17
  import { SnapevalError } from '../src/errors.js';
18
+ import { stopClient } from '../src/adapters/copilot-sdk-client.js';
10
19
  import * as path from 'node:path';
11
20
  const program = new Command();
12
21
  program
@@ -21,6 +30,9 @@ program
21
30
  .option('--inference <inference>', 'Inference adapter to use')
22
31
  .option('--workspace <path>', 'Workspace directory')
23
32
  .option('--runs <n>', 'Runs per eval for statistical significance', '1')
33
+ .option('--concurrency <n>', 'Number of eval cases to run in parallel (1-10)', '1')
34
+ .option('--only <ids>', 'Run only specific eval IDs (comma-separated, e.g. --only 1,3,5)')
35
+ .option('--threshold <rate>', 'Minimum pass rate (0-1) for exit code 0. Below threshold exits with code 1.')
24
36
  .option('--old-skill <path>', 'Compare against old skill version instead of no-skill')
25
37
  .option('--verbose', 'Verbose output')
26
38
  .argument('[skill-dir]', 'Path to skill directory', process.cwd())
@@ -32,12 +44,22 @@ program
32
44
  inference: opts.inference,
33
45
  workspace: opts.workspace,
34
46
  runs: opts.runs ? parseInt(opts.runs, 10) : undefined,
47
+ concurrency: opts.concurrency ? parseInt(opts.concurrency, 10) : undefined,
35
48
  }, process.cwd(), skillPath);
36
49
  const harness = resolveHarness(config.harness);
37
50
  const inference = resolveInference(config.inference);
51
+ const only = opts.only
52
+ ? opts.only.split(',').map((s) => parseInt(s.trim(), 10))
53
+ : undefined;
54
+ const threshold = opts.threshold
55
+ ? parseFloat(opts.threshold)
56
+ : undefined;
38
57
  const results = await evalCommand(skillPath, harness, inference, {
39
58
  workspace: config.workspace,
40
59
  runs: config.runs,
60
+ concurrency: config.concurrency,
61
+ only,
62
+ threshold,
41
63
  oldSkill: opts.oldSkill,
42
64
  });
43
65
  const terminal = new TerminalReporter();
@@ -46,6 +68,12 @@ program
46
68
  process.exit(0);
47
69
  }
48
70
  catch (err) {
71
+ // ThresholdError has results attached — show them before failing
72
+ if (err.results) {
73
+ const terminal = new TerminalReporter();
74
+ await terminal.report(err.results);
75
+ console.log(`Results at ${err.results.iterationDir}`);
76
+ }
49
77
  handleError(err);
50
78
  }
51
79
  });
@@ -57,6 +85,7 @@ program
57
85
  .option('--inference <inference>', 'Inference adapter to use')
58
86
  .option('--workspace <path>', 'Workspace directory')
59
87
  .option('--runs <n>', 'Runs per eval for statistical significance', '1')
88
+ .option('--concurrency <n>', 'Number of eval cases to run in parallel (1-10)', '1')
60
89
  .option('--old-skill <path>', 'Compare against old skill version instead of no-skill')
61
90
  .option('--no-open', 'Do not open browser')
62
91
  .option('--verbose', 'Verbose output')
@@ -69,12 +98,14 @@ program
69
98
  inference: opts.inference,
70
99
  workspace: opts.workspace,
71
100
  runs: opts.runs ? parseInt(opts.runs, 10) : undefined,
101
+ concurrency: opts.concurrency ? parseInt(opts.concurrency, 10) : undefined,
72
102
  }, process.cwd(), skillPath);
73
103
  const harness = resolveHarness(config.harness);
74
104
  const inference = resolveInference(config.inference);
75
105
  await reviewCommand(skillPath, harness, inference, {
76
106
  workspace: config.workspace,
77
107
  runs: config.runs,
108
+ concurrency: config.concurrency,
78
109
  oldSkill: opts.oldSkill,
79
110
  noOpen: opts.open === false,
80
111
  });
@@ -84,6 +115,8 @@ program
84
115
  handleError(err);
85
116
  }
86
117
  });
118
+ // Clean up SDK client on exit (no-op if never started)
119
+ process.on('exit', () => { stopClient().catch(() => { }); });
87
120
  function handleError(err) {
88
121
  if (err instanceof SnapevalError) {
89
122
  console.error(`Error: ${err.message}`);
@@ -1 +1 @@
1
- {"version":3,"file":"snapeval.js","sourceRoot":"","sources":["../../bin/snapeval.ts"],"names":[],"mappings":";AACA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AACjD,OAAO,EAAE,gBAAgB,EAAE,MAAM,sCAAsC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,oCAAoC,CAAC;AACpE,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,MAAM,2BAA2B,CAAC;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,oCAAoC,CAAC;AACtE,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AACjD,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAElC,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,UAAU,CAAC;KAChB,WAAW,CAAC,wDAAwD,CAAC;KACrE,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB,eAAe;AACf,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,qEAAqE,CAAC;KAClF,MAAM,CAAC,qBAAqB,EAAE,gBAAgB,CAAC;KAC/C,MAAM,CAAC,yBAAyB,EAAE,0BAA0B,CAAC;KAC7D,MAAM,CAAC,oBAAoB,EAAE,qBAAqB,CAAC;KACnD,MAAM,CAAC,YAAY,EAAE,4CAA4C,EAAE,GAAG,CAAC;KACvE,MAAM,CAAC,oBAAoB,EAAE,uDAAuD,CAAC;KACrF,MAAM,CAAC,WAAW,EAAE,gBAAgB,CAAC;KACrC,QAAQ,CAAC,aAAa,EAAE,yBAAyB,EAAE,OAAO,CAAC,GAAG,EAAE,CAAC;KACjE,MAAM,CAAC,KAAK,EAAE,QAAgB,EAAE,IAAsC,EAAE,EAAE;IACzE,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;QACzC,MAAM,MAAM,GAAG,aAAa,CAC1B;YACE,OAAO,EAAE,IAAI,CAAC,OAAiB;YAC/B,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAc,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;SAChE,EACD,OAAO,CAAC,GAAG,EAAE,EAAE,SAAS,CACzB,CAAC;QACF,MAAM,OAAO,GAAG,cAAc,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/C,MAAM,SAAS,GAAG,gBAAgB,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAErD,MAAM,OAAO,GAAG,MAAM,WAAW,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,EAAE;YAC/D,SAAS,EAAE,MAAM,CAAC,SAAS;YAC3B,IAAI,EAAE,MAAM,CAAC,IAAI;YACjB,QAAQ,EAAE,IAAI,CAAC,QAA8B;SAC9C,CAAC,CAAC;QAEH,MAAM,QAAQ,GAAG,IAAI,gBAAgB,EAAE,CAAC;QACxC,MAAM,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/B,OAAO,CAAC,GAAG,CAAC,cAAc,OAAO,CAAC,YAAY,EAAE,CAAC,CAAC;QAClD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QAAC,WAAW,CAAC,GAAG,CAAC,CAAC;IAAC,CAAC;AACrC,CAAC,CAAC,CAAC;AAEL
,iBAAiB;AACjB,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,mDAAmD,CAAC;KAChE,MAAM,CAAC,qBAAqB,EAAE,gBAAgB,CAAC;KAC/C,MAAM,CAAC,yBAAyB,EAAE,0BAA0B,CAAC;KAC7D,MAAM,CAAC,oBAAoB,EAAE,qBAAqB,CAAC;KACnD,MAAM,CAAC,YAAY,EAAE,4CAA4C,EAAE,GAAG,CAAC;KACvE,MAAM,CAAC,oBAAoB,EAAE,uDAAuD,CAAC;KACrF,MAAM,CAAC,WAAW,EAAE,qBAAqB,CAAC;KAC1C,MAAM,CAAC,WAAW,EAAE,gBAAgB,CAAC;KACrC,QAAQ,CAAC,aAAa,EAAE,yBAAyB,EAAE,OAAO,CAAC,GAAG,EAAE,CAAC;KACjE,MAAM,CAAC,KAAK,EAAE,QAAgB,EAAE,IAAsC,EAAE,EAAE;IACzE,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;QACzC,MAAM,MAAM,GAAG,aAAa,CAC1B;YACE,OAAO,EAAE,IAAI,CAAC,OAAiB;YAC/B,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAc,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;SAChE,EACD,OAAO,CAAC,GAAG,EAAE,EAAE,SAAS,CACzB,CAAC;QACF,MAAM,OAAO,GAAG,cAAc,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/C,MAAM,SAAS,GAAG,gBAAgB,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAErD,MAAM,aAAa,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,EAAE;YACjD,SAAS,EAAE,MAAM,CAAC,SAAS;YAC3B,IAAI,EAAE,MAAM,CAAC,IAAI;YACjB,QAAQ,EAAE,IAAI,CAAC,QAA8B;YAC7C,MAAM,EAAE,IAAI,CAAC,IAAI,KAAK,KAAK;SAC5B,CAAC,CAAC;QACH,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QAAC,WAAW,CAAC,GAAG,CAAC,CAAC;IAAC,CAAC;AACrC,CAAC,CAAC,CAAC;AAEL,SAAS,WAAW,CAAC,GAAY;IAC/B,IAAI,GAAG,YAAY,aAAa,EAAE,CAAC;QACjC,OAAO,CAAC,KAAK,CAAC,UAAU,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;QACvC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,IAAI,CAAC,CAAC,CAAC;IAClC,CAAC;IACD,IAAI,GAAG,YAAY,KAAK,EAAE,CAAC;QACzB,OAAO,CAAC,KAAK,CAAC,UAAU,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;QACvC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IACD,OAAO,CAAC,KAAK,CAAC,4BAA4B,CAAC,CAAC;IAC5C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC;AAED,OAAO,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC"}
1
+ {"version":3,"file":"snapeval.js","sourceRoot":"","sources":["../../bin/snapeval.ts"],"names":[],"mappings":";AAEA,4EAA4E;AAC5E,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC;AAC/B,2CAA2C;AAC3C,OAAO,CAAC,IAAI,GAAG,UAAU,KAAa,EAAE,GAAG,IAAW;IACpD,IAAI,KAAK,KAAK,SAAS,IAAI,IAAI,CAAC,CAAC,CAAC,EAAE,IAAI,KAAK,qBAAqB;QAAE,OAAO,KAAK,CAAC;IACjF,OAAO,SAAS,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,GAAG,IAAI,CAAQ,CAAC,CAAC;AAC3D,CAAC,CAAC;AAEF,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AACjD,OAAO,EAAE,gBAAgB,EAAE,MAAM,sCAAsC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,oCAAoC,CAAC;AACpE,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,MAAM,2BAA2B,CAAC;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,oCAAoC,CAAC;AACtE,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AACjD,OAAO,EAAE,UAAU,EAAE,MAAM,uCAAuC,CAAC;AACnE,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAElC,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,UAAU,CAAC;KAChB,WAAW,CAAC,wDAAwD,CAAC;KACrE,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB,eAAe;AACf,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,qEAAqE,CAAC;KAClF,MAAM,CAAC,qBAAqB,EAAE,gBAAgB,CAAC;KAC/C,MAAM,CAAC,yBAAyB,EAAE,0BAA0B,CAAC;KAC7D,MAAM,CAAC,oBAAoB,EAAE,qBAAqB,CAAC;KACnD,MAAM,CAAC,YAAY,EAAE,4CAA4C,EAAE,GAAG,CAAC;KACvE,MAAM,CAAC,mBAAmB,EAAE,gDAAgD,EAAE,GAAG,CAAC;KAClF,MAAM,CAAC,cAAc,EAAE,iEAAiE,CAAC;KACzF,MAAM,CAAC,oBAAoB,EAAE,6EAA6E,CAAC;KAC3G,MAAM,CAAC,oBAAoB,EAAE,uDAAuD,CAAC;KACrF,MAAM,CAAC,WAAW,EAAE,gBAAgB,CAAC;KACrC,QAAQ,CAAC,aAAa,EAAE,yBAAyB,EAAE,OAAO,CAAC,GAAG,EAAE,CAAC;KACjE,MAAM,CAAC,KAAK,EAAE,QAAgB,EAAE,IAAsC,EAAE,EAAE;IACzE,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;QACzC,MAAM,MAAM,GAAG,aAAa,CAC1B;YACE,OAAO,EAAE,IAAI,CAAC,OAAiB;YAC/B,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAc,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;YAC/D,WAAW,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAqB,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;SACrF,EACD,OAAO,CAAC,GAAG,EAAE,EAAE,SAAS,CACzB,
CAAC;QACF,MAAM,OAAO,GAAG,cAAc,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/C,MAAM,SAAS,GAAG,gBAAgB,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAErD,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI;YACpB,CAAC,CAAE,IAAI,CAAC,IAAe,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC;YACrE,CAAC,CAAC,SAAS,CAAC;QACd,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS;YAC9B,CAAC,CAAC,UAAU,CAAC,IAAI,CAAC,SAAmB,CAAC;YACtC,CAAC,CAAC,SAAS,CAAC;QAEd,MAAM,OAAO,GAAG,MAAM,WAAW,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,EAAE;YAC/D,SAAS,EAAE,MAAM,CAAC,SAAS;YAC3B,IAAI,EAAE,MAAM,CAAC,IAAI;YACjB,WAAW,EAAE,MAAM,CAAC,WAAW;YAC/B,IAAI;YACJ,SAAS;YACT,QAAQ,EAAE,IAAI,CAAC,QAA8B;SAC9C,CAAC,CAAC;QAEH,MAAM,QAAQ,GAAG,IAAI,gBAAgB,EAAE,CAAC;QACxC,MAAM,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/B,OAAO,CAAC,GAAG,CAAC,cAAc,OAAO,CAAC,YAAY,EAAE,CAAC,CAAC;QAClD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,iEAAiE;QACjE,IAAI,GAAG,CAAC,OAAO,EAAE,CAAC;YAChB,MAAM,QAAQ,GAAG,IAAI,gBAAgB,EAAE,CAAC;YACxC,MAAM,QAAQ,CAAC,MAAM,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;YACnC,OAAO,CAAC,GAAG,CAAC,cAAc,GAAG,CAAC,OAAO,CAAC,YAAY,EAAE,CAAC,CAAC;QACxD,CAAC;QACD,WAAW,CAAC,GAAG,CAAC,CAAC;IACnB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,iBAAiB;AACjB,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,mDAAmD,CAAC;KAChE,MAAM,CAAC,qBAAqB,EAAE,gBAAgB,CAAC;KAC/C,MAAM,CAAC,yBAAyB,EAAE,0BAA0B,CAAC;KAC7D,MAAM,CAAC,oBAAoB,EAAE,qBAAqB,CAAC;KACnD,MAAM,CAAC,YAAY,EAAE,4CAA4C,EAAE,GAAG,CAAC;KACvE,MAAM,CAAC,mBAAmB,EAAE,gDAAgD,EAAE,GAAG,CAAC;KAClF,MAAM,CAAC,oBAAoB,EAAE,uDAAuD,CAAC;KACrF,MAAM,CAAC,WAAW,EAAE,qBAAqB,CAAC;KAC1C,MAAM,CAAC,WAAW,EAAE,gBAAgB,CAAC;KACrC,QAAQ,CAAC,aAAa,EAAE,yBAAyB,EAAE,OAAO,CAAC,GAAG,EAAE,CAAC;KACjE,MAAM,CAAC,KAAK,EAAE,QAAgB,EAAE,IAAsC,EAAE,EAAE;IACzE,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;QACzC,MAAM,MAAM,GAAG,aAAa,CAC1B;YACE,OAAO,EAAE,IAAI,CAAC,OAAiB;YAC/B,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAc,EAAE,EAAE,CAAC,CAAC,CAAC,CAA
C,SAAS;YAC/D,WAAW,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAqB,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;SACrF,EACD,OAAO,CAAC,GAAG,EAAE,EAAE,SAAS,CACzB,CAAC;QACF,MAAM,OAAO,GAAG,cAAc,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/C,MAAM,SAAS,GAAG,gBAAgB,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAErD,MAAM,aAAa,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,EAAE;YACjD,SAAS,EAAE,MAAM,CAAC,SAAS;YAC3B,IAAI,EAAE,MAAM,CAAC,IAAI;YACjB,WAAW,EAAE,MAAM,CAAC,WAAW;YAC/B,QAAQ,EAAE,IAAI,CAAC,QAA8B;YAC7C,MAAM,EAAE,IAAI,CAAC,IAAI,KAAK,KAAK;SAC5B,CAAC,CAAC;QACH,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QAAC,WAAW,CAAC,GAAG,CAAC,CAAC;IAAC,CAAC;AACrC,CAAC,CAAC,CAAC;AAEL,uDAAuD;AACvD,OAAO,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,EAAE,GAAG,UAAU,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;AAE5D,SAAS,WAAW,CAAC,GAAY;IAC/B,IAAI,GAAG,YAAY,aAAa,EAAE,CAAC;QACjC,OAAO,CAAC,KAAK,CAAC,UAAU,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;QACvC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,IAAI,CAAC,CAAC,CAAC;IAClC,CAAC;IACD,IAAI,GAAG,YAAY,KAAK,EAAE,CAAC;QACzB,OAAO,CAAC,KAAK,CAAC,UAAU,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;QACvC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IACD,OAAO,CAAC,KAAK,CAAC,4BAA4B,CAAC,CAAC;IAC5C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC;AAED,OAAO,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC"}
@@ -25,7 +25,9 @@ export async function getClient() {
25
25
  if (!CopilotClient) {
26
26
  throw new Error('Could not find CopilotClient export in @github/copilot-sdk. The package may have changed its API.');
27
27
  }
28
- clientInstance = new CopilotClient();
28
+ // Suppress ExperimentalWarning (e.g., SQLite) in the spawned CLI subprocess
29
+ const env = { ...process.env, NODE_OPTIONS: [process.env.NODE_OPTIONS, '--no-warnings'].filter(Boolean).join(' ') };
30
+ clientInstance = new CopilotClient({ logLevel: 'none', env });
29
31
  await clientInstance.start();
30
32
  clientStarted = true;
31
33
  return clientInstance;
@@ -1 +1 @@
1
- {"version":3,"file":"copilot-sdk-client.js","sourceRoot":"","sources":["../../../src/adapters/copilot-sdk-client.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAElC,iEAAiE;AACjE,4DAA4D;AAC5D,IAAI,cAAc,GAAQ,IAAI,CAAC;AAC/B,IAAI,aAAa,GAAG,KAAK,CAAC;AAE1B,MAAM,CAAC,KAAK,UAAU,SAAS;IAC7B,IAAI,cAAc,IAAI,aAAa;QAAE,OAAO,cAAc,CAAC;IAE3D,IAAI,GAAQ,CAAC;IACb,IAAI,CAAC;QACH,+DAA+D;QAC/D,GAAG,GAAG,MAAM,MAAM,CAAC,qBAAqB,CAAC,CAAC;IAC5C,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CACb,mGAAmG,CACpG,CAAC;IACJ,CAAC;IAED,MAAM,aAAa,GAAG,GAAG,CAAC,aAAa,IAAI,GAAG,CAAC,OAAO,EAAE,aAAa,CAAC;IACtE,IAAI,CAAC,aAAa,EAAE,CAAC;QACnB,MAAM,IAAI,KAAK,CACb,mGAAmG,CACpG,CAAC;IACJ,CAAC;IAED,cAAc,GAAG,IAAI,aAAa,EAAE,CAAC;IACrC,MAAM,cAAc,CAAC,KAAK,EAAE,CAAC;IAC7B,aAAa,GAAG,IAAI,CAAC;IACrB,OAAO,cAAc,CAAC;AACxB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,UAAU;IAC9B,IAAI,cAAc,IAAI,aAAa,EAAE,CAAC;QACpC,MAAM,cAAc,CAAC,IAAI,EAAE,CAAC;QAC5B,aAAa,GAAG,KAAK,CAAC;QACtB,cAAc,GAAG,IAAI,CAAC;IACxB,CAAC;AACH,CAAC;AAED,MAAM,UAAU,cAAc;IAC5B,iEAAiE;IACjE,mEAAmE;IACnE,IAAI,GAAG,GAAG,OAAO,CAAC,GAAG,EAAE,CAAC;IACxB,OAAO,IAAI,EAAE,CAAC;QACZ,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,cAAc,EAAE,SAAS,EAAE,aAAa,EAAE,cAAc,CAAC,CAAC;QAC3F,IAAI,EAAE,CAAC,UAAU,CAAC,SAAS,CAAC;YAAE,OAAO,IAAI,CAAC;QAC1C,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QACjC,IAAI,MAAM,KAAK,GAAG;YAAE,MAAM;QAC1B,GAAG,GAAG,MAAM,CAAC;IACf,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC"}
1
+ {"version":3,"file":"copilot-sdk-client.js","sourceRoot":"","sources":["../../../src/adapters/copilot-sdk-client.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAElC,iEAAiE;AACjE,4DAA4D;AAC5D,IAAI,cAAc,GAAQ,IAAI,CAAC;AAC/B,IAAI,aAAa,GAAG,KAAK,CAAC;AAE1B,MAAM,CAAC,KAAK,UAAU,SAAS;IAC7B,IAAI,cAAc,IAAI,aAAa;QAAE,OAAO,cAAc,CAAC;IAE3D,IAAI,GAAQ,CAAC;IACb,IAAI,CAAC;QACH,+DAA+D;QAC/D,GAAG,GAAG,MAAM,MAAM,CAAC,qBAAqB,CAAC,CAAC;IAC5C,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CACb,mGAAmG,CACpG,CAAC;IACJ,CAAC;IAED,MAAM,aAAa,GAAG,GAAG,CAAC,aAAa,IAAI,GAAG,CAAC,OAAO,EAAE,aAAa,CAAC;IACtE,IAAI,CAAC,aAAa,EAAE,CAAC;QACnB,MAAM,IAAI,KAAK,CACb,mGAAmG,CACpG,CAAC;IACJ,CAAC;IAED,4EAA4E;IAC5E,MAAM,GAAG,GAAG,EAAE,GAAG,OAAO,CAAC,GAAG,EAAE,YAAY,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,YAAY,EAAE,eAAe,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;IACpH,cAAc,GAAG,IAAI,aAAa,CAAC,EAAE,QAAQ,EAAE,MAAM,EAAE,GAAG,EAAE,CAAC,CAAC;IAC9D,MAAM,cAAc,CAAC,KAAK,EAAE,CAAC;IAC7B,aAAa,GAAG,IAAI,CAAC;IACrB,OAAO,cAAc,CAAC;AACxB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,UAAU;IAC9B,IAAI,cAAc,IAAI,aAAa,EAAE,CAAC;QACpC,MAAM,cAAc,CAAC,IAAI,EAAE,CAAC;QAC5B,aAAa,GAAG,KAAK,CAAC;QACtB,cAAc,GAAG,IAAI,CAAC;IACxB,CAAC;AACH,CAAC;AAED,MAAM,UAAU,cAAc;IAC5B,iEAAiE;IACjE,mEAAmE;IACnE,IAAI,GAAG,GAAG,OAAO,CAAC,GAAG,EAAE,CAAC;IACxB,OAAO,IAAI,EAAE,CAAC;QACZ,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,cAAc,EAAE,SAAS,EAAE,aAAa,EAAE,cAAc,CAAC,CAAC;QAC3F,IAAI,EAAE,CAAC,UAAU,CAAC,SAAS,CAAC;YAAE,OAAO,IAAI,CAAC;QAC1C,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QACjC,IAAI,MAAM,KAAK,GAAG;YAAE,MAAM;QAC1B,GAAG,GAAG,MAAM,CAAC;IACf,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC"}
@@ -0,0 +1,11 @@
1
+ import type { Harness, HarnessRunResult } from '../../types.js';
2
+ export declare class CopilotSDKHarness implements Harness {
3
+ readonly name = "copilot-sdk";
4
+ run(options: {
5
+ skillPath?: string;
6
+ prompt: string;
7
+ files?: string[];
8
+ outputDir: string;
9
+ }): Promise<HarnessRunResult>;
10
+ isAvailable(): Promise<boolean>;
11
+ }