snapeval 1.8.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/bin/snapeval.ts +30 -24
  2. package/dist/bin/snapeval.js +25 -22
  3. package/dist/bin/snapeval.js.map +1 -1
  4. package/dist/src/adapters/copilot-sdk-client.js +1 -1
  5. package/dist/src/adapters/copilot-sdk-client.js.map +1 -1
  6. package/dist/src/adapters/harness/copilot-sdk.d.ts +11 -0
  7. package/dist/src/adapters/harness/copilot-sdk.js +101 -0
  8. package/dist/src/adapters/harness/copilot-sdk.js.map +1 -0
  9. package/dist/src/adapters/harness/resolve.js +10 -2
  10. package/dist/src/adapters/harness/resolve.js.map +1 -1
  11. package/dist/src/adapters/inference/copilot-sdk.js +4 -1
  12. package/dist/src/adapters/inference/copilot-sdk.js.map +1 -1
  13. package/dist/src/adapters/report/terminal.js +89 -9
  14. package/dist/src/adapters/report/terminal.js.map +1 -1
  15. package/dist/src/commands/eval.d.ts +3 -0
  16. package/dist/src/commands/eval.js +106 -17
  17. package/dist/src/commands/eval.js.map +1 -1
  18. package/dist/src/commands/review.d.ts +1 -0
  19. package/dist/src/commands/review.js.map +1 -1
  20. package/dist/src/config.js +2 -1
  21. package/dist/src/config.js.map +1 -1
  22. package/dist/src/engine/grader.js +67 -9
  23. package/dist/src/engine/grader.js.map +1 -1
  24. package/dist/src/engine/runner.js +14 -12
  25. package/dist/src/engine/runner.js.map +1 -1
  26. package/dist/src/errors.d.ts +6 -0
  27. package/dist/src/errors.js +21 -3
  28. package/dist/src/errors.js.map +1 -1
  29. package/dist/src/types.d.ts +1 -0
  30. package/package.json +4 -1
  31. package/plugin.json +1 -1
  32. package/skills/snapeval/SKILL.md +33 -18
  33. package/src/adapters/copilot-sdk-client.ts +1 -1
  34. package/src/adapters/harness/copilot-sdk.ts +126 -0
  35. package/src/adapters/harness/resolve.ts +13 -2
  36. package/src/adapters/inference/copilot-sdk.ts +5 -1
  37. package/src/adapters/report/terminal.ts +100 -10
  38. package/src/commands/eval.ts +133 -31
  39. package/src/commands/review.ts +1 -1
  40. package/src/config.ts +2 -1
  41. package/src/engine/grader.ts +59 -8
  42. package/src/engine/runner.ts +14 -13
  43. package/src/errors.ts +24 -3
  44. package/src/types.ts +1 -0
  45. package/dist/src/commands/init.d.ts +0 -2
  46. package/dist/src/commands/init.js +0 -27
  47. package/dist/src/commands/init.js.map +0 -1
  48. package/dist/src/engine/generator.d.ts +0 -3
  49. package/dist/src/engine/generator.js +0 -51
  50. package/dist/src/engine/generator.js.map +0 -1
  51. package/src/commands/init.ts +0 -38
  52. package/src/engine/generator.ts +0 -60
@@ -4,47 +4,136 @@ import { WorkspaceManager } from '../engine/workspace.js';
4
4
  import { runEval } from '../engine/runner.js';
5
5
  import { gradeAssertions } from '../engine/grader.js';
6
6
  import { computeBenchmark } from '../engine/aggregator.js';
7
- import { SnapevalError } from '../errors.js';
7
+ import { SnapevalError, FileNotFoundError, ThresholdError } from '../errors.js';
8
+ async function runWithConcurrency(tasks, limit) {
9
+ const results = new Array(tasks.length);
10
+ let index = 0;
11
+ async function worker() {
12
+ while (index < tasks.length) {
13
+ const i = index++;
14
+ results[i] = await tasks[i]();
15
+ }
16
+ }
17
+ await Promise.all(Array.from({ length: Math.min(limit, tasks.length) }, worker));
18
+ return results;
19
+ }
20
+ const MAX_CONCURRENCY = 10;
21
+ function validateEvalsFile(evalsFile, evalsPath) {
22
+ if (!evalsFile.skill_name || typeof evalsFile.skill_name !== 'string') {
23
+ throw new SnapevalError(`Invalid evals.json at ${evalsPath}: missing or invalid "skill_name" field.`);
24
+ }
25
+ if (!Array.isArray(evalsFile.evals)) {
26
+ throw new SnapevalError(`Invalid evals.json at ${evalsPath}: "evals" must be an array.`);
27
+ }
28
+ for (const [i, evalCase] of evalsFile.evals.entries()) {
29
+ const prefix = `Invalid evals.json at ${evalsPath}: evals[${i}]`;
30
+ if (typeof evalCase.id !== 'number') {
31
+ throw new SnapevalError(`${prefix} missing or invalid "id" (must be a number).`);
32
+ }
33
+ if (typeof evalCase.prompt !== 'string') {
34
+ throw new SnapevalError(`${prefix} (id:${evalCase.id}) missing "prompt" field.`);
35
+ }
36
+ if (typeof evalCase.expected_output !== 'string') {
37
+ throw new SnapevalError(`${prefix} (id:${evalCase.id}) missing "expected_output" field.`);
38
+ }
39
+ if (evalCase.assertions !== undefined && !Array.isArray(evalCase.assertions)) {
40
+ throw new SnapevalError(`${prefix} (id:${evalCase.id}) "assertions" must be an array of strings.`);
41
+ }
42
+ }
43
+ }
8
44
  export async function evalCommand(skillPath, harness, inference, options) {
9
45
  const evalsPath = path.join(skillPath, 'evals', 'evals.json');
10
46
  if (!fs.existsSync(evalsPath)) {
11
- throw new SnapevalError(`No evals.json found at ${evalsPath}. Run \`snapeval init\` first.`);
47
+ throw new FileNotFoundError(evalsPath, 'Create evals/evals.json with test scenarios first');
48
+ }
49
+ let evalsFile;
50
+ try {
51
+ evalsFile = JSON.parse(fs.readFileSync(evalsPath, 'utf-8'));
52
+ }
53
+ catch {
54
+ throw new SnapevalError(`Invalid JSON in ${evalsPath}. Check for syntax errors (missing commas, trailing commas, etc).`);
55
+ }
56
+ validateEvalsFile(evalsFile, evalsPath);
57
+ // Filter to specific eval IDs if --only is provided
58
+ if (options.only && options.only.length > 0) {
59
+ const ids = new Set(options.only);
60
+ const filtered = evalsFile.evals.filter((e) => ids.has(e.id));
61
+ if (filtered.length === 0) {
62
+ throw new SnapevalError(`No eval cases match --only ${options.only.join(',')}. Available IDs: ${evalsFile.evals.map((e) => e.id).join(', ')}`);
63
+ }
64
+ evalsFile = { ...evalsFile, evals: filtered };
12
65
  }
13
- const evalsFile = JSON.parse(fs.readFileSync(evalsPath, 'utf-8'));
14
66
  const ws = new WorkspaceManager(skillPath, options.workspace);
15
67
  const iterationDir = ws.createIteration();
68
+ // Track which SKILL.md was used for this iteration
69
+ const skillMdPath = path.join(skillPath, 'SKILL.md');
70
+ if (fs.existsSync(skillMdPath)) {
71
+ fs.copyFileSync(skillMdPath, path.join(iterationDir, 'SKILL.md.snapshot'));
72
+ }
16
73
  const runs = options.runs ?? 1;
74
+ const concurrency = Math.min(Math.max(options.concurrency ?? 1, 1), MAX_CONCURRENCY);
17
75
  const baselineVariant = options.oldSkill ? 'old_skill' : 'without_skill';
18
76
  const scriptsDir = path.join(skillPath, 'evals', 'scripts');
19
- const evalRuns = [];
20
- for (const evalCase of evalsFile.evals) {
77
+ // Pre-create eval directories sequentially (filesystem setup)
78
+ const evalDirs = evalsFile.evals.map((evalCase) => {
21
79
  const slug = WorkspaceManager.getEvalSlug(evalCase).replace('eval-', '');
22
- const evalDir = ws.createEvalDir(iterationDir, slug, baselineVariant);
80
+ return { evalCase, slug, evalDir: ws.createEvalDir(iterationDir, slug, baselineVariant) };
81
+ });
82
+ const tasks = evalDirs.map(({ evalCase, slug, evalDir }) => async () => {
83
+ const assertions = evalCase.assertions ?? [];
84
+ const allGradings = [];
23
85
  let lastRun = null;
24
86
  for (let i = 0; i < runs; i++) {
25
87
  lastRun = await runEval(evalCase, skillPath, evalDir, harness, options.oldSkill);
88
+ // Grade every run, not just the last
89
+ const [wsGrading, wosGrading] = await Promise.all([
90
+ gradeAssertions(assertions, lastRun.withSkill.output, path.join(evalDir, 'with_skill'), inference, fs.existsSync(scriptsDir) ? scriptsDir : undefined),
91
+ gradeAssertions(assertions, lastRun.withoutSkill.output, path.join(evalDir, baselineVariant), inference, fs.existsSync(scriptsDir) ? scriptsDir : undefined),
92
+ ]);
93
+ allGradings.push({ withSkill: wsGrading, withoutSkill: wosGrading });
26
94
  }
27
- if (!lastRun)
28
- continue;
29
- const assertions = evalCase.assertions ?? [];
30
- const withSkillGrading = await gradeAssertions(assertions, lastRun.withSkill.output, path.join(evalDir, 'with_skill'), inference, fs.existsSync(scriptsDir) ? scriptsDir : undefined);
31
- const withoutSkillGrading = await gradeAssertions(assertions, lastRun.withoutSkill.output, path.join(evalDir, baselineVariant), inference, fs.existsSync(scriptsDir) ? scriptsDir : undefined);
32
- evalRuns.push({
95
+ if (!lastRun) {
96
+ throw new SnapevalError(`No runs completed for eval ${evalCase.id}`);
97
+ }
98
+ // Use the last run's grading as the primary result (written to grading.json)
99
+ // but all gradings contribute to benchmark stats via pass rates
100
+ const lastGrading = allGradings[allGradings.length - 1];
101
+ return {
33
102
  evalId: evalCase.id,
34
103
  slug,
35
104
  prompt: evalCase.prompt,
36
105
  withSkill: {
37
106
  output: lastRun.withSkill.output,
38
- grading: withSkillGrading ?? undefined,
107
+ grading: lastGrading.withSkill ?? undefined,
39
108
  },
40
109
  withoutSkill: {
41
110
  output: lastRun.withoutSkill.output,
42
- grading: withoutSkillGrading ?? undefined,
111
+ grading: lastGrading.withoutSkill ?? undefined,
43
112
  },
44
- });
45
- }
113
+ };
114
+ });
115
+ const evalRuns = await runWithConcurrency(tasks, concurrency);
46
116
  const benchmark = computeBenchmark(evalRuns);
47
- fs.writeFileSync(path.join(iterationDir, 'benchmark.json'), JSON.stringify(benchmark, null, 2));
117
+ // Add iteration metadata for cross-iteration comparison
118
+ const benchmarkWithMeta = {
119
+ ...benchmark,
120
+ metadata: {
121
+ eval_count: evalRuns.length,
122
+ eval_ids: evalRuns.map((r) => r.evalId),
123
+ skill_name: evalsFile.skill_name,
124
+ timestamp: new Date().toISOString(),
125
+ },
126
+ };
127
+ fs.writeFileSync(path.join(iterationDir, 'benchmark.json'), JSON.stringify(benchmarkWithMeta, null, 2));
128
+ // Check threshold if set (for CI gating)
129
+ if (options.threshold !== undefined) {
130
+ const passRate = benchmark.run_summary.with_skill.pass_rate.mean;
131
+ if (passRate < options.threshold) {
132
+ // Still return results so the reporter can display them before the error
133
+ const results = { skillName: evalsFile.skill_name, evalRuns, benchmark, iterationDir };
134
+ throw Object.assign(new ThresholdError(passRate, options.threshold), { results });
135
+ }
136
+ }
48
137
  return {
49
138
  skillName: evalsFile.skill_name,
50
139
  evalRuns,
@@ -1 +1 @@
1
- {"version":3,"file":"eval.js","sourceRoot":"","sources":["../../../src/commands/eval.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAQlC,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,OAAO,EAAE,MAAM,qBAAqB,CAAC;AAC9C,OAAO,EAAE,eAAe,EAAE,MAAM,qBAAqB,CAAC;AACtD,OAAO,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAC;AAC3D,OAAO,EAAE,aAAa,EAAE,MAAM,cAAc,CAAC;AAE7C,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,SAAiB,EACjB,OAAgB,EAChB,SAA2B,EAC3B,OAAiE;IAEjE,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,EAAE,YAAY,CAAC,CAAC;IAC9D,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE,CAAC;QAC9B,MAAM,IAAI,aAAa,CAAC,0BAA0B,SAAS,gCAAgC,CAAC,CAAC;IAC/F,CAAC;IAED,MAAM,SAAS,GAAc,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC,CAAC;IAC7E,MAAM,EAAE,GAAG,IAAI,gBAAgB,CAAC,SAAS,EAAE,OAAO,CAAC,SAAS,CAAC,CAAC;IAC9D,MAAM,YAAY,GAAG,EAAE,CAAC,eAAe,EAAE,CAAC;IAC1C,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,IAAI,CAAC,CAAC;IAC/B,MAAM,eAAe,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,eAAe,CAAC;IACzE,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IAE5D,MAAM,QAAQ,GAAoB,EAAE,CAAC;IAErC,KAAK,MAAM,QAAQ,IAAI,SAAS,CAAC,KAAK,EAAE,CAAC;QACvC,MAAM,IAAI,GAAG,gBAAgB,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;QACzE,MAAM,OAAO,GAAG,EAAE,CAAC,aAAa,CAAC,YAAY,EAAE,IAAI,EAAE,eAAe,CAAC,CAAC;QAEtE,IAAI,OAAO,GAA+C,IAAI,CAAC;QAC/D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC;YAC9B,OAAO,GAAG,MAAM,OAAO,CAAC,QAAQ,EAAE,SAAS,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;QACnF,CAAC;QAED,IAAI,CAAC,OAAO;YAAE,SAAS;QAEvB,MAAM,UAAU,GAAG,QAAQ,CAAC,UAAU,IAAI,EAAE,CAAC;QAC7C,MAAM,gBAAgB,GAAG,MAAM,eAAe,CAC5C,UAAU,EACV,OAAO,CAAC,SAAS,CAAC,MAAM,EACxB,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,YAAY,CAAC,EAChC,SAAS,EACT,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS,CACnD,CAAC;QACF,MAAM,mBAAmB,GAAG,MAAM,eAAe,CAC/C,UAAU,EACV,OAAO,CAAC,YAAY,CAAC,MAAM,EAC3B,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,eAAe,CAAC,EACnC,SAAS,EACT,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS,CACnD,CAAC;QAEF,QAAQ,CAAC,IAAI,CAAC;YACZ,MAAM,EAAE,QAAQ,CAAC,EAAE;YACnB,IAAI;YACJ,MAAM,EAAE,QAAQ,CAAC,MAAM;YACvB,SAAS,EAAE;gBACT,MAAM,EAAE,OAAO,CAAC,SAAS,CAAC,MAAM;gBAChC,OAAO,EAAE,gBAAgB,IAAI,SAAS;aACvC;YACD,YAAY,EAAE;gBACZ,MAAM,EAAE,OAAO,CAAC,YAAY,CAAC,MAAM;gBACnC,OAAO,EAAE,mBAAmB,IAAI,SAAS;aAC1C;SACF,CAAC,CAAC;IACL,CAAC;IAED,MAAM,SAAS,GAAG,gBAAgB,CAAC,QAAQ,CAAC,CAAC;IAE7C,EAAE,CAAC,aAAa,CACd,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,gBAAgB,CAAC,EACzC,IAAI,CAAC,SAAS,CAAC,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC,CACnC,CAAC;IAEF,OAAO;QACL,SAAS,EAAE,SAAS,CAAC,UAAU;QAC/B,QAAQ;QACR,SAAS;QACT,YAAY;KACb,CAAC;AACJ,CAAC"}
1
+ {"version":3,"file":"eval.js","sourceRoot":"","sources":["../../../src/commands/eval.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AASlC,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,OAAO,EAAE,MAAM,qBAAqB,CAAC;AAC9C,OAAO,EAAE,eAAe,EAAE,MAAM,qBAAqB,CAAC;AACtD,OAAO,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAC;AAC3D,OAAO,EAAE,aAAa,EAAE,iBAAiB,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAEhF,KAAK,UAAU,kBAAkB,CAC/B,KAA2B,EAC3B,KAAa;IAEb,MAAM,OAAO,GAAQ,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;IAC7C,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,KAAK,UAAU,MAAM;QACnB,OAAO,KAAK,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YAC5B,MAAM,CAAC,GAAG,KAAK,EAAE,CAAC;YAClB,OAAO,CAAC,CAAC,CAAC,GAAG,MAAM,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC;QAChC,CAAC;IACH,CAAC;IACD,MAAM,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,CAAC,MAAM,CAAC,EAAE,EAAE,MAAM,CAAC,CAAC,CAAC;IACjF,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,eAAe,GAAG,EAAE,CAAC;AAE3B,SAAS,iBAAiB,CAAC,SAAoB,EAAE,SAAiB;IAChE,IAAI,CAAC,SAAS,CAAC,UAAU,IAAI,OAAO,SAAS,CAAC,UAAU,KAAK,QAAQ,EAAE,CAAC;QACtE,MAAM,IAAI,aAAa,CAAC,yBAAyB,SAAS,0CAA0C,CAAC,CAAC;IACxG,CAAC;IACD,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,KAAK,CAAC,EAAE,CAAC;QACpC,MAAM,IAAI,aAAa,CAAC,yBAAyB,SAAS,6BAA6B,CAAC,CAAC;IAC3F,CAAC;IACD,KAAK,MAAM,CAAC,CAAC,EAAE,QAAQ,CAAC,IAAI,SAAS,CAAC,KAAK,CAAC,OAAO,EAAE,EAAE,CAAC;QACtD,MAAM,MAAM,GAAG,yBAAyB,SAAS,WAAW,CAAC,GAAG,CAAC;QACjE,IAAI,OAAO,QAAQ,CAAC,EAAE,KAAK,QAAQ,EAAE,CAAC;YACpC,MAAM,IAAI,aAAa,CAAC,GAAG,MAAM,8CAA8C,CAAC,CAAC;QACnF,CAAC;QACD,IAAI,OAAO,QAAQ,CAAC,MAAM,KAAK,QAAQ,EAAE,CAAC;YACxC,MAAM,IAAI,aAAa,CAAC,GAAG,MAAM,QAAQ,QAAQ,CAAC,EAAE,2BAA2B,CAAC,CAAC;QACnF,CAAC;QACD,IAAI,OAAO,QAAQ,CAAC,eAAe,KAAK,QAAQ,EAAE,CAAC;YACjD,MAAM,IAAI,aAAa,CAAC,GAAG,MAAM,QAAQ,QAAQ,CAAC,EAAE,oCAAoC,CAAC,CAAC;QAC5F,CAAC;QACD,IAAI,QAAQ,CAAC,UAAU,KAAK,SAAS,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAC,EAAE,CAAC;YAC7E,MAAM,IAAI,aAAa,CAAC,GAAG,MAAM,QAAQ,QAAQ,CAAC,EAAE,6CAA6C,CAAC,CAAC;QACrG,CAAC;IACH,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,SAAiB,EACjB,OAAgB,EAChB,SAA2B,EAC3B,OAA4H;IAE5H,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,EAAE,YAAY,CAAC,CAAC;IAC9D,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE,CAAC;QAC9B,MAAM,IAAI,iBAAiB,CAAC,SAAS,EAAE,mDAAmD,CAAC,CAAC;IAC9F,CAAC;IAED,IAAI,SAAoB,CAAC;IACzB,IAAI,CAAC;QACH,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC,CAAC;IAC9D,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,aAAa,CAAC,mBAAmB,SAAS,mEAAmE,CAAC,CAAC;IAC3H,CAAC;IACD,iBAAiB,CAAC,SAAS,EAAE,SAAS,CAAC,CAAC;IAExC,oDAAoD;IACpD,IAAI,OAAO,CAAC,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5C,MAAM,GAAG,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QAClC,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;QAC9D,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC1B,MAAM,IAAI,aAAa,CAAC,8BAA8B,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,oBAAoB,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACjJ,CAAC;QACD,SAAS,GAAG,EAAE,GAAG,SAAS,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;IAChD,CAAC;IAED,MAAM,EAAE,GAAG,IAAI,gBAAgB,CAAC,SAAS,EAAE,OAAO,CAAC,SAAS,CAAC,CAAC;IAC9D,MAAM,YAAY,GAAG,EAAE,CAAC,eAAe,EAAE,CAAC;IAE1C,mDAAmD;IACnD,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,UAAU,CAAC,CAAC;IACrD,IAAI,EAAE,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;QAC/B,EAAE,CAAC,YAAY,CAAC,WAAW,EAAE,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,mBAAmB,CAAC,CAAC,CAAC;IAC7E,CAAC;IACD,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,IAAI,CAAC,CAAC;IAC/B,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,WAAW,IAAI,CAAC,EAAE,CAAC,CAAC,EAAE,eAAe,CAAC,CAAC;IACrF,MAAM,eAAe,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,eAAe,CAAC;IACzE,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IAE5D,8DAA8D;IAC9D,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,EAAE;QAChD,MAAM,IAAI,GAAG,gBAAgB,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;QACzE,OAAO,EAAE,QAAQ,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE,CAAC,aAAa,CAAC,YAAY,EAAE,IAAI,EAAE,eAAe,CAAC,EAAE,CAAC;IAC5F,CAAC,CAAC,CAAC;IAEH,MAAM,KAAK,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,CAAC,KAAK,IAA4B,EAAE;QAC7F,MAAM,UAAU,GAAG,QAAQ,CAAC,UAAU,IAAI,EAAE,CAAC;QAC7C,MAAM,WAAW,GAA8E,EAAE,CAAC;QAClG,IAAI,OAAO,GAA+C,IAAI,CAAC;QAE/D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC;YAC9B,OAAO,GAAG,MAAM,OAAO,CAAC,QAAQ,EAAE,SAAS,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;YAEjF,qCAAqC;YACrC,MAAM,CAAC,SAAS,EAAE,UAAU,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;gBAChD,eAAe,CACb,UAAU,EACV,OAAO,CAAC,SAAS,CAAC,MAAM,EACxB,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,YAAY,CAAC,EAChC,SAAS,EACT,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS,CACnD;gBACD,eAAe,CACb,UAAU,EACV,OAAO,CAAC,YAAY,CAAC,MAAM,EAC3B,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,eAAe,CAAC,EACnC,SAAS,EACT,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS,CACnD;aACF,CAAC,CAAC;YACH,WAAW,CAAC,IAAI,CAAC,EAAE,SAAS,EAAE,SAAS,EAAE,YAAY,EAAE,UAAU,EAAE,CAAC,CAAC;QACvE,CAAC;QAED,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,MAAM,IAAI,aAAa,CAAC,8BAA8B,QAAQ,CAAC,EAAE,EAAE,CAAC,CAAC;QACvE,CAAC;QAED,6EAA6E;QAC7E,gEAAgE;QAChE,MAAM,WAAW,GAAG,WAAW,CAAC,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QAExD,OAAO;YACL,MAAM,EAAE,QAAQ,CAAC,EAAE;YACnB,IAAI;YACJ,MAAM,EAAE,QAAQ,CAAC,MAAM;YACvB,SAAS,EAAE;gBACT,MAAM,EAAE,OAAO,CAAC,SAAS,CAAC,MAAM;gBAChC,OAAO,EAAE,WAAW,CAAC,SAAS,IAAI,SAAS;aAC5C;YACD,YAAY,EAAE;gBACZ,MAAM,EAAE,OAAO,CAAC,YAAY,CAAC,MAAM;gBACnC,OAAO,EAAE,WAAW,CAAC,YAAY,IAAI,SAAS;aAC/C;SACF,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,MAAM,QAAQ,GAAG,MAAM,kBAAkB,CAAC,KAAK,EAAE,WAAW,CAAC,CAAC;IAC9D,MAAM,SAAS,GAAG,gBAAgB,CAAC,QAAQ,CAAC,CAAC;IAE7C,wDAAwD;IACxD,MAAM,iBAAiB,GAAG;QACxB,GAAG,SAAS;QACZ,QAAQ,EAAE;YACR,UAAU,EAAE,QAAQ,CAAC,MAAM;YAC3B,QAAQ,EAAE,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;YACvC,UAAU,EAAE,SAAS,CAAC,UAAU;YAChC,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACpC;KACF,CAAC;IAEF,EAAE,CAAC,aAAa,CACd,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,gBAAgB,CAAC,EACzC,IAAI,CAAC,SAAS,CAAC,iBAAiB,EAAE,IAAI,EAAE,CAAC,CAAC,CAC3C,CAAC;IAEF,yCAAyC;IACzC,IAAI,OAAO,CAAC,SAAS,KAAK,SAAS,EAAE,CAAC;QACpC,MAAM,QAAQ,GAAG,SAAS,CAAC,WAAW,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,CAAC;QACjE,IAAI,QAAQ,GAAG,OAAO,CAAC,SAAS,EAAE,CAAC;YACjC,yEAAyE;YACzE,MAAM,OAAO,GAAG,EAAE,SAAS,EAAE,SAAS,CAAC,UAAU,EAAE,QAAQ,EAAE,SAAS,EAAE,YAAY,EAAE,CAAC;YACvF,MAAM,MAAM,CAAC,MAAM,CAAC,IAAI,cAAc,CAAC,QAAQ,EAAE,OAAO,CAAC,SAAS,CAAC,EAAE,EAAE,OAAO,EAAE,CAAC,CAAC;QACpF,CAAC;IACH,CAAC;IAED,OAAO;QACL,SAAS,EAAE,SAAS,CAAC,UAAU;QAC/B,QAAQ;QACR,SAAS;QACT,YAAY;KACb,CAAC;AACJ,CAAC"}
@@ -4,4 +4,5 @@ export declare function reviewCommand(skillPath: string, harness: Harness, infer
4
4
  runs?: number;
5
5
  oldSkill?: string;
6
6
  noOpen?: boolean;
7
+ concurrency?: number;
7
8
  }): Promise<void>;
@@ -1 +1 @@
1
- {"version":3,"file":"review.js","sourceRoot":"","sources":["../../../src/commands/review.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AAC9C,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,KAAK,OAAO,MAAM,cAAc,CAAC;AAExC,OAAO,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AACxC,OAAO,EAAE,gBAAgB,EAAE,MAAM,gCAAgC,CAAC;AAElE,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,SAAiB,EACjB,OAAgB,EAChB,SAA2B,EAC3B,OAAmF;IAEnF,MAAM,OAAO,GAAG,MAAM,WAAW,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC;IAE1E,MAAM,QAAQ,GAAG,IAAI,gBAAgB,EAAE,CAAC;IACxC,MAAM,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAE/B,yBAAyB;IACzB,MAAM,QAAQ,GAAiB,EAAE,CAAC;IAClC,KAAK,MAAM,GAAG,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;QACnC,QAAQ,CAAC,QAAQ,GAAG,CAAC,IAAI,EAAE,CAAC,GAAG,EAAE,CAAC;IACpC,CAAC;IACD,EAAE,CAAC,aAAa,CACd,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,YAAY,EAAE,eAAe,CAAC,EAChD,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC,CAClC,CAAC;IAEF,oEAAoE;IACpE,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC;QACpB,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,YAAY,EAAE,gBAAgB,CAAC,CAAC;QACrE,aAAa,CAAC,UAAU,CAAC,CAAC;IAC5B,CAAC;AACH,CAAC;AAED,SAAS,aAAa,CAAC,QAAgB;IACrC,MAAM,GAAG,GACP,OAAO,CAAC,QAAQ,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;QACxC,OAAO,CAAC,QAAQ,KAAK,OAAO,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,UAAU,CAAC;IACpD,MAAM,IAAI,GACR,OAAO,CAAC,QAAQ,KAAK,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC;IAC5E,QAAQ,CAAC,GAAG,EAAE,IAAI,EAAE,CAAC,GAAG,EAAE,EAAE;QAC1B,IAAI,GAAG;YAAE,OAAO,CAAC,IAAI,CAAC,2BAA2B,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;IAClE,CAAC,CAAC,CAAC;AACL,CAAC"}
1
+ {"version":3,"file":"review.js","sourceRoot":"","sources":["../../../src/commands/review.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AAC9C,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,KAAK,OAAO,MAAM,cAAc,CAAC;AAExC,OAAO,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AACxC,OAAO,EAAE,gBAAgB,EAAE,MAAM,gCAAgC,CAAC;AAElE,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,SAAiB,EACjB,OAAgB,EAChB,SAA2B,EAC3B,OAAyG;IAEzG,MAAM,OAAO,GAAG,MAAM,WAAW,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC;IAE1E,MAAM,QAAQ,GAAG,IAAI,gBAAgB,EAAE,CAAC;IACxC,MAAM,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAE/B,yBAAyB;IACzB,MAAM,QAAQ,GAAiB,EAAE,CAAC;IAClC,KAAK,MAAM,GAAG,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;QACnC,QAAQ,CAAC,QAAQ,GAAG,CAAC,IAAI,EAAE,CAAC,GAAG,EAAE,CAAC;IACpC,CAAC;IACD,EAAE,CAAC,aAAa,CACd,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,YAAY,EAAE,eAAe,CAAC,EAChD,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC,CAClC,CAAC;IAEF,oEAAoE;IACpE,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC;QACpB,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,YAAY,EAAE,gBAAgB,CAAC,CAAC;QACrE,aAAa,CAAC,UAAU,CAAC,CAAC;IAC5B,CAAC;AACH,CAAC;AAED,SAAS,aAAa,CAAC,QAAgB;IACrC,MAAM,GAAG,GACP,OAAO,CAAC,QAAQ,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;QACxC,OAAO,CAAC,QAAQ,KAAK,OAAO,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,UAAU,CAAC;IACpD,MAAM,IAAI,GACR,OAAO,CAAC,QAAQ,KAAK,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC;IAC5E,QAAQ,CAAC,GAAG,EAAE,IAAI,EAAE,CAAC,GAAG,EAAE,EAAE;QAC1B,IAAI,GAAG;YAAE,OAAO,CAAC,IAAI,CAAC,2BAA2B,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;IAClE,CAAC,CAAC,CAAC;AACL,CAAC"}
@@ -1,10 +1,11 @@
1
1
  import * as fs from 'node:fs';
2
2
  import * as path from 'node:path';
3
3
  export const DEFAULT_CONFIG = {
4
- harness: 'copilot-cli',
4
+ harness: 'copilot-sdk',
5
5
  inference: 'auto',
6
6
  workspace: '../{skill_name}-workspace',
7
7
  runs: 1,
8
+ concurrency: 1,
8
9
  };
9
10
  function loadConfigFile(dirPath) {
10
11
  const configPath = path.join(dirPath, 'snapeval.config.json');
@@ -1 +1 @@
1
- {"version":3,"file":"config.js","sourceRoot":"","sources":["../../src/config.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAGlC,MAAM,CAAC,MAAM,cAAc,GAAmB;IAC5C,OAAO,EAAE,aAAa;IACtB,SAAS,EAAE,MAAM;IACjB,SAAS,EAAE,2BAA2B;IACtC,IAAI,EAAE,CAAC;CACR,CAAC;AAEF,SAAS,cAAc,CAAC,OAAe;IACrC,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,sBAAsB,CAAC,CAAC;IAC9D,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC;QAAE,OAAO,IAAI,CAAC;IAC5C,MAAM,GAAG,GAAG,EAAE,CAAC,YAAY,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC;IACjD,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;AACzB,CAAC;AAED,MAAM,UAAU,aAAa,CAC3B,QAAiC,EACjC,WAAmB,EACnB,QAAiB;IAEjB,MAAM,cAAc,GAAG,QAAQ,CAAC,CAAC,CAAC,cAAc,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAClE,MAAM,aAAa,GAAG,cAAc,CAAC,WAAW,CAAC,CAAC;IAClD,OAAO;QACL,GAAG,cAAc;QACjB,GAAG,CAAC,aAAa,IAAI,EAAE,CAAC;QACxB,GAAG,CAAC,cAAc,IAAI,EAAE,CAAC;QACzB,GAAG,cAAc,CAAC,QAAQ,CAAC;KAC5B,CAAC;AACJ,CAAC;AAED,SAAS,cAAc,CAAC,GAA4B;IAClD,OAAO,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,KAAK,SAAS,CAAC,CAAC,CAAC;AACpF,CAAC"}
1
+ {"version":3,"file":"config.js","sourceRoot":"","sources":["../../src/config.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAGlC,MAAM,CAAC,MAAM,cAAc,GAAmB;IAC5C,OAAO,EAAE,aAAa;IACtB,SAAS,EAAE,MAAM;IACjB,SAAS,EAAE,2BAA2B;IACtC,IAAI,EAAE,CAAC;IACP,WAAW,EAAE,CAAC;CACf,CAAC;AAEF,SAAS,cAAc,CAAC,OAAe;IACrC,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,sBAAsB,CAAC,CAAC;IAC9D,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC;QAAE,OAAO,IAAI,CAAC;IAC5C,MAAM,GAAG,GAAG,EAAE,CAAC,YAAY,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC;IACjD,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;AACzB,CAAC;AAED,MAAM,UAAU,aAAa,CAC3B,QAAiC,EACjC,WAAmB,EACnB,QAAiB;IAEjB,MAAM,cAAc,GAAG,QAAQ,CAAC,CAAC,CAAC,cAAc,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAClE,MAAM,aAAa,GAAG,cAAc,CAAC,WAAW,CAAC,CAAC;IAClD,OAAO;QACL,GAAG,cAAc;QACjB,GAAG,CAAC,aAAa,IAAI,EAAE,CAAC;QACxB,GAAG,CAAC,cAAc,IAAI,EAAE,CAAC;QACzB,GAAG,cAAc,CAAC,QAAQ,CAAC;KAC5B,CAAC;AACJ,CAAC;AAED,SAAS,cAAc,CAAC,GAA4B;IAClD,OAAO,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,KAAK,SAAS,CAAC,CAAC,CAAC;AACpF,CAAC"}
@@ -1,9 +1,33 @@
1
1
  import * as fs from 'node:fs';
2
2
  import * as path from 'node:path';
3
3
  import { execFileSync } from 'node:child_process';
4
+ const EXACT_MATCH_PATTERN = /^Output (?:is |equals )exactly:\s*"(.+)"$/i;
5
+ function gradeExactMatch(assertion, output) {
6
+ const match = assertion.match(EXACT_MATCH_PATTERN);
7
+ if (!match)
8
+ return null;
9
+ const expected = match[1];
10
+ const actual = output.trim();
11
+ const passed = actual === expected;
12
+ return {
13
+ text: assertion,
14
+ passed,
15
+ evidence: passed
16
+ ? `Exact match: "${expected}"`
17
+ : `Expected: "${expected}"\nGot: "${actual}"`,
18
+ };
19
+ }
4
20
  function buildGradingPrompt(assertions, output, files) {
5
21
  const fileList = files.length > 0 ? `\nFiles produced: ${files.join(', ')}` : '';
6
- return `You are a strict eval grader. For each assertion, determine PASS or FAIL based on the output below. Require concrete evidence for a PASS — do not give the benefit of the doubt.
22
+ return `You are an eval grader. For each assertion, determine PASS or FAIL based solely on the output below.
23
+
24
+ GRADING RULES:
25
+ - PASS if the output satisfies the assertion's intent, even if wording differs slightly.
26
+ - FAIL only if the output clearly does not satisfy the assertion.
27
+ - Be consistent: if an assertion checks for X and the output contains X in different phrasing, that is a PASS.
28
+ - For "contains" assertions: look for semantic presence, not exact substring.
29
+ - For "identifies" assertions: the output must demonstrate awareness of the concept, not use identical words.
30
+ - Always cite specific text from the output as evidence.
7
31
 
8
32
  OUTPUT:
9
33
  ---
@@ -16,7 +40,7 @@ ${assertions.map((a, i) => `${i + 1}. ${a}`).join('\n')}
16
40
  Respond with JSON only:
17
41
  {
18
42
  "results": [
19
- {"text": "<assertion text>", "passed": true/false, "evidence": "<quote or reference from output>"}
43
+ {"text": "<assertion text>", "passed": true/false, "evidence": "<quote from output supporting your verdict>"}
20
44
  ]
21
45
  }`;
22
46
  }
@@ -26,25 +50,54 @@ function runScript(scriptName, outputDir, scriptsDir) {
26
50
  return { text: `script:${scriptName}`, passed: false, evidence: `Script not found: ${scriptPath}` };
27
51
  }
28
52
  try {
29
- const evidence = execFileSync(scriptPath, [outputDir], { encoding: 'utf-8', timeout: 30000 }).trim();
53
+ const stdout = execFileSync(scriptPath, [outputDir], { encoding: 'utf-8', timeout: 30000 }).trim();
54
+ const evidence = stdout || `Script passed: ${scriptName}`;
30
55
  return { text: `script:${scriptName}`, passed: true, evidence };
31
56
  }
32
57
  catch (err) {
33
- const evidence = err.stdout?.trim() || err.message || 'Script exited with non-zero code';
58
+ // Extract the most useful error info without raw stack traces
59
+ const stderr = err.stderr?.trim();
60
+ const stdout = err.stdout?.trim();
61
+ let evidence;
62
+ if (err.code === 'EACCES') {
63
+ evidence = `Permission denied: ${scriptPath} is not executable. Run: chmod +x ${scriptPath}`;
64
+ }
65
+ else if (stderr) {
66
+ // Take only the first line of stderr to avoid stack trace noise
67
+ evidence = stderr.split('\n')[0];
68
+ }
69
+ else if (stdout) {
70
+ evidence = stdout.split('\n')[0];
71
+ }
72
+ else {
73
+ evidence = `Script exited with code ${err.status ?? 'unknown'}`;
74
+ }
34
75
  return { text: `script:${scriptName}`, passed: false, evidence };
35
76
  }
36
77
  }
37
78
  function extractJSON(text) {
38
- const match = text.match(/```(?:json)?\s*([\s\S]*?)```/);
39
- if (match)
40
- return match[1].trim();
41
- return text.trim();
79
+ // Try JSON-tagged fence first, then bare fence, then raw text
80
+ const jsonFence = text.match(/```json\s*([\s\S]*?)```/);
81
+ if (jsonFence)
82
+ return jsonFence[1].trim();
83
+ // Try parsing raw text as JSON before falling back to any fence
84
+ const trimmed = text.trim();
85
+ try {
86
+ JSON.parse(trimmed);
87
+ return trimmed;
88
+ }
89
+ catch { /* not raw JSON */ }
90
+ const anyFence = text.match(/```\s*([\s\S]*?)```/);
91
+ if (anyFence)
92
+ return anyFence[1].trim();
93
+ return trimmed;
42
94
  }
43
95
  export async function gradeAssertions(assertions, output, runDir, inference, scriptsDir) {
44
96
  if (assertions.length === 0)
45
97
  return null;
46
98
  const scriptAssertions = assertions.filter(a => a.startsWith('script:'));
47
- const llmAssertions = assertions.filter(a => !a.startsWith('script:'));
99
+ const exactAssertions = assertions.filter(a => !a.startsWith('script:') && EXACT_MATCH_PATTERN.test(a));
100
+ const llmAssertions = assertions.filter(a => !a.startsWith('script:') && !EXACT_MATCH_PATTERN.test(a));
48
101
  const results = [];
49
102
  for (const assertion of scriptAssertions) {
50
103
  const scriptName = assertion.slice('script:'.length);
@@ -52,6 +105,11 @@ export async function gradeAssertions(assertions, output, runDir, inference, scr
52
105
  const dir = scriptsDir ?? path.join(runDir, '..', '..', '..', 'evals', 'scripts');
53
106
  results.push(runScript(scriptName, outputDir, dir));
54
107
  }
108
+ for (const assertion of exactAssertions) {
109
+ const result = gradeExactMatch(assertion, output.raw);
110
+ if (result)
111
+ results.push(result);
112
+ }
55
113
  if (llmAssertions.length > 0) {
56
114
  const prompt = buildGradingPrompt(llmAssertions, output.raw, output.files);
57
115
  const response = await inference.chat([{ role: 'user', content: prompt }], { temperature: 0, responseFormat: 'json' });
@@ -1 +1 @@
1
- {"version":3,"file":"grader.js","sourceRoot":"","sources":["../../../src/engine/grader.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAQlD,SAAS,kBAAkB,CAAC,UAAoB,EAAE,MAAc,EAAE,KAAe;IAC/E,MAAM,QAAQ,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,qBAAqB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IACjF,OAAO;;;;EAIP,MAAM;KACH,QAAQ;;;EAGX,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;;;;;;;EAOrD,CAAC;AACH,CAAC;AAED,SAAS,SAAS,CAChB,UAAkB,EAClB,SAAiB,EACjB,UAAkB;IAElB,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,UAAU,CAAC,CAAC;IACrD,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;QAC/B,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,qBAAqB,UAAU,EAAE,EAAE,CAAC;IACtG,CAAC;IACD,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,YAAY,CAAC,UAAU,EAAE,CAAC,SAAS,CAAC,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACrG,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;IAClE,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,MAAM,QAAQ,GAAG,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,GAAG,CAAC,OAAO,IAAI,kCAAkC,CAAC;QACzF,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;IACnE,CAAC;AACH,CAAC;AAED,SAAS,WAAW,CAAC,IAAY;IAC/B,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,8BAA8B,CAAC,CAAC;IACzD,IAAI,KAAK;QAAE,OAAO,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IAClC,OAAO,IAAI,CAAC,IAAI,EAAE,CAAC;AACrB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,UAAoB,EACpB,MAAwB,EACxB,MAAc,EACd,SAA2B,EAC3B,UAAmB;IAEnB,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAEzC,MAAM,gBAAgB,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC,CAAC;IACzE,MAAM,aAAa,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC,CAAC;IACvE,MAAM,OAAO,GAAsB,EAAE,CAAC;IAEtC,KAAK,MAAM,SAAS,IAAI,gBAAgB,EAAE,CAAC;QACzC,MAAM,UAAU,GAAG,SAAS,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QACrD,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;QAC/C,MAAM,GAAG,GAAG,UAAU,IAAI,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;QAClF,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,UAAU,EAAE,SAAS,EAAE,GAAG,CAAC,CAAC,CAAC;IACtD,CAAC;IAED,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC7B,MAAM,MAAM,GAAG,kBAAkB,CAAC,aAAa,EAAE,MAAM,CAAC,GAAG,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC;QAC3E,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,IAAI,CACnC,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,EACnC,EAAE,WAAW,EAAE,CAAC,EAAE,cAAc,EAAE,MAAM,EAAE,CAC3C,CAAC;QACF,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC,CAAC;QACjD,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;YAC/B,OAAO,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,EAAE,QAAQ,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;QAClF,CAAC;IACH,CAAC;IAED,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IACpD,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IACrD,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC;IAE7B,MAAM,OAAO,GAAkB;QAC7B,iBAAiB,EAAE,OAAO;QAC1B,OAAO,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE;KAC9E,CAAC;IAEF,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,cAAc,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAEtF,OAAO,OAAO,CAAC;AACjB,CAAC"}
1
+ {"version":3,"file":"grader.js","sourceRoot":"","sources":["../../../src/engine/grader.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAQlD,MAAM,mBAAmB,GAAG,4CAA4C,CAAC;AAEzE,SAAS,eAAe,CAAC,SAAiB,EAAE,MAAc;IACxD,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,mBAAmB,CAAC,CAAC;IACnD,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IACxB,MAAM,QAAQ,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;IAC1B,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC;IAC7B,MAAM,MAAM,GAAG,MAAM,KAAK,QAAQ,CAAC;IACnC,OAAO;QACL,IAAI,EAAE,SAAS;QACf,MAAM;QACN,QAAQ,EAAE,MAAM;YACd,CAAC,CAAC,iBAAiB,QAAQ,GAAG;YAC9B,CAAC,CAAC,cAAc,QAAQ,YAAY,MAAM,GAAG;KAChD,CAAC;AACJ,CAAC;AAED,SAAS,kBAAkB,CAAC,UAAoB,EAAE,MAAc,EAAE,KAAe;IAC/E,MAAM,QAAQ,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,qBAAqB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IACjF,OAAO;;;;;;;;;;;;EAYP,MAAM;KACH,QAAQ;;;EAGX,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;;;;;;;EAOrD,CAAC;AACH,CAAC;AAED,SAAS,SAAS,CAChB,UAAkB,EAClB,SAAiB,EACjB,UAAkB;IAElB,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,UAAU,CAAC,CAAC;IACrD,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;QAC/B,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,qBAAqB,UAAU,EAAE,EAAE,CAAC;IACtG,CAAC;IACD,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,YAAY,CAAC,UAAU,EAAE,CAAC,SAAS,CAAC,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACnG,MAAM,QAAQ,GAAG,MAAM,IAAI,kBAAkB,UAAU,EAAE,CAAC;QAC1D,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;IAClE,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,8DAA8D;QAC9D,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC;QAClC,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC;QAClC,IAAI,QAAgB,CAAC;QACrB,IAAI,GAAG,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;YAC1B,QAAQ,GAAG,sBAAsB,UAAU,qCAAqC,UAAU,EAAE,CAAC;QAC/F,CAAC;aAAM,IAAI,MAAM,EAAE,CAAC;YAClB,gEAAgE;YAChE,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QACnC,CAAC;aAAM,IAAI,MAAM,EAAE,CAAC;YAClB,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QACnC,CAAC;aAAM,CAAC;YACN,QAAQ,GAAG,2BAA2B,GAAG,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAClE,CAAC;QACD,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;IACnE,CAAC;AACH,CAAC;AAED,SAAS,WAAW,CAAC,IAAY;IAC/B,8DAA8D;IAC9D,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,yBAAyB,CAAC,CAAC;IACxD,IAAI,SAAS;QAAE,OAAO,SAAS,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IAC1C,gEAAgE;IAChE,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;IAC5B,IAAI,CAAC;QAAC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QAAC,OAAO,OAAO,CAAC;IAAC,CAAC;IAAC,MAAM,CAAC,CAAC,kBAAkB,CAAC,CAAC;IACzE,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;IACnD,IAAI,QAAQ;QAAE,OAAO,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACxC,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,UAAoB,EACpB,MAAwB,EACxB,MAAc,EACd,SAA2B,EAC3B,UAAmB;IAEnB,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAEzC,MAAM,gBAAgB,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC,CAAC;IACzE,MAAM,eAAe,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,mBAAmB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IACxG,MAAM,aAAa,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,CAAC,mBAAmB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IACvG,MAAM,OAAO,GAAsB,EAAE,CAAC;IAEtC,KAAK,MAAM,SAAS,IAAI,gBAAgB,EAAE,CAAC;QACzC,MAAM,UAAU,GAAG,SAAS,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QACrD,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;QAC/C,MAAM,GAAG,GAAG,UAAU,IAAI,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;QAClF,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,UAAU,EAAE,SAAS,EAAE,GAAG,CAAC,CAAC,CAAC;IACtD,CAAC;IAED,KAAK,MAAM,SAAS,IAAI,eAAe,EAAE,CAAC;QACxC,MAAM,MAAM,GAAG,eAAe,CAAC,SAAS,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC;QACtD,IAAI,MAAM;YAAE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACnC,CAAC;IAED,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC7B,MAAM,MAAM,GAAG,kBAAkB,CAAC,aAAa,EAAE,MAAM,CAAC,GAAG,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC;QAC3E,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,IAAI,CACnC,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,EACnC,EAAE,WAAW,EAAE,CAAC,EAAE,cAAc,EAAE,MAAM,EAAE,CAC3C,CAAC;QACF,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC,CAAC;QACjD,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;YAC/B,OAAO,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,EAAE,QAAQ,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;QAClF,CAAC;IACH,CAAC;IAED,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IACpD,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IACrD,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC;IAE7B,MAAM,OAAO,GAAkB;QAC7B,iBAAiB,EAAE,OAAO;QAC1B,OAAO,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE;KAC9E,CAAC;IAEF,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,cAAc,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAEtF,OAAO,OAAO,CAAC;AACjB,CAAC"}
@@ -14,20 +14,22 @@ export async function runEval(evalCase, skillPath, evalDir, harness, oldSkillPat
14
14
  const withSkillDir = path.join(evalDir, 'with_skill');
15
15
  const baselineVariant = oldSkillPath ? 'old_skill' : 'without_skill';
16
16
  const baselineDir = path.join(evalDir, baselineVariant);
17
- const withSkillResult = await harness.run({
18
- skillPath,
19
- prompt: evalCase.prompt,
20
- files: evalCase.files,
21
- outputDir: path.join(withSkillDir, 'outputs'),
22
- });
17
+ const [withSkillResult, baselineResult] = await Promise.all([
18
+ harness.run({
19
+ skillPath,
20
+ prompt: evalCase.prompt,
21
+ files: evalCase.files,
22
+ outputDir: path.join(withSkillDir, 'outputs'),
23
+ }),
24
+ harness.run({
25
+ skillPath: oldSkillPath,
26
+ prompt: evalCase.prompt,
27
+ files: evalCase.files,
28
+ outputDir: path.join(baselineDir, 'outputs'),
29
+ }),
30
+ ]);
23
31
  writeTiming(withSkillDir, withSkillResult);
24
32
  writeOutput(withSkillDir, withSkillResult);
25
- const baselineResult = await harness.run({
26
- skillPath: oldSkillPath,
27
- prompt: evalCase.prompt,
28
- files: evalCase.files,
29
- outputDir: path.join(baselineDir, 'outputs'),
30
- });
31
33
  writeTiming(baselineDir, baselineResult);
32
34
  writeOutput(baselineDir, baselineResult);
33
35
  return {
@@ -1 +1 @@
1
- {"version":3,"file":"runner.js","sourceRoot":"","sources":["../../../src/engine/runner.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAWlC,SAAS,WAAW,CAAC,GAAW,EAAE,MAAwB;IACxD,MAAM,MAAM,GAAe,EAAE,YAAY,EAAE,MAAM,CAAC,YAAY,EAAE,WAAW,EAAE,MAAM,CAAC,WAAW,EAAE,CAAC;IAClG,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,aAAa,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;AACnF,CAAC;AAED,SAAS,WAAW,CAAC,GAAW,EAAE,MAAwB;IACxD,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,SAAS,EAAE,YAAY,CAAC,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC;IACtE,IAAI,MAAM,CAAC,UAAU,EAAE,CAAC;QACtB,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,gBAAgB,CAAC,EAAE,MAAM,CAAC,UAAU,CAAC,CAAC;IACxE,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,OAAO,CAC3B,QAAkB,EAClB,SAAiB,EACjB,OAAe,EACf,OAAgB,EAChB,YAAqB;IAErB,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,YAAY,CAAC,CAAC;IACtD,MAAM,eAAe,GAAG,YAAY,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,eAAe,CAAC;IACrE,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,eAAe,CAAC,CAAC;IAExD,MAAM,eAAe,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;QACxC,SAAS;QACT,MAAM,EAAE,QAAQ,CAAC,MAAM;QACvB,KAAK,EAAE,QAAQ,CAAC,KAAK;QACrB,SAAS,EAAE,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,SAAS,CAAC;KAC9C,CAAC,CAAC;IACH,WAAW,CAAC,YAAY,EAAE,eAAe,CAAC,CAAC;IAC3C,WAAW,CAAC,YAAY,EAAE,eAAe,CAAC,CAAC;IAE3C,MAAM,cAAc,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;QACvC,SAAS,EAAE,YAAY;QACvB,MAAM,EAAE,QAAQ,CAAC,MAAM;QACvB,KAAK,EAAE,QAAQ,CAAC,KAAK;QACrB,SAAS,EAAE,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,SAAS,CAAC;KAC7C,CAAC,CAAC;IACH,WAAW,CAAC,WAAW,EAAE,cAAc,CAAC,CAAC;IACzC,WAAW,CAAC,WAAW,EAAE,cAAc,CAAC,CAAC;IAEzC,OAAO;QACL,MAAM,EAAE,QAAQ,CAAC,EAAE;QACnB,IAAI,EAAE,QAAQ,CAAC,IAAI,IAAI,GAAG,QAAQ,CAAC,EAAE,EAAE;QACvC,MAAM,EAAE,QAAQ,CAAC,MAAM;QACvB,SAAS,EAAE,EAAE,MAAM,EAAE,eAAe,EAAE;QACtC,YAAY,EAAE,EAAE,MAAM,EAAE,cAAc,EAAE;KACzC,CAAC;AACJ,CAAC"}
1
+ {"version":3,"file":"runner.js","sourceRoot":"","sources":["../../../src/engine/runner.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAWlC,SAAS,WAAW,CAAC,GAAW,EAAE,MAAwB;IACxD,MAAM,MAAM,GAAe,EAAE,YAAY,EAAE,MAAM,CAAC,YAAY,EAAE,WAAW,EAAE,MAAM,CAAC,WAAW,EAAE,CAAC;IAClG,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,aAAa,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;AACnF,CAAC;AAED,SAAS,WAAW,CAAC,GAAW,EAAE,MAAwB;IACxD,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,SAAS,EAAE,YAAY,CAAC,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC;IACtE,IAAI,MAAM,CAAC,UAAU,EAAE,CAAC;QACtB,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,gBAAgB,CAAC,EAAE,MAAM,CAAC,UAAU,CAAC,CAAC;IACxE,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,OAAO,CAC3B,QAAkB,EAClB,SAAiB,EACjB,OAAe,EACf,OAAgB,EAChB,YAAqB;IAErB,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,YAAY,CAAC,CAAC;IACtD,MAAM,eAAe,GAAG,YAAY,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,eAAe,CAAC;IACrE,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,eAAe,CAAC,CAAC;IAExD,MAAM,CAAC,eAAe,EAAE,cAAc,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;QAC1D,OAAO,CAAC,GAAG,CAAC;YACV,SAAS;YACT,MAAM,EAAE,QAAQ,CAAC,MAAM;YACvB,KAAK,EAAE,QAAQ,CAAC,KAAK;YACrB,SAAS,EAAE,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,SAAS,CAAC;SAC9C,CAAC;QACF,OAAO,CAAC,GAAG,CAAC;YACV,SAAS,EAAE,YAAY;YACvB,MAAM,EAAE,QAAQ,CAAC,MAAM;YACvB,KAAK,EAAE,QAAQ,CAAC,KAAK;YACrB,SAAS,EAAE,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,SAAS,CAAC;SAC7C,CAAC;KACH,CAAC,CAAC;IACH,WAAW,CAAC,YAAY,EAAE,eAAe,CAAC,CAAC;IAC3C,WAAW,CAAC,YAAY,EAAE,eAAe,CAAC,CAAC;IAC3C,WAAW,CAAC,WAAW,EAAE,cAAc,CAAC,CAAC;IACzC,WAAW,CAAC,WAAW,EAAE,cAAc,CAAC,CAAC;IAEzC,OAAO;QACL,MAAM,EAAE,QAAQ,CAAC,EAAE;QACnB,IAAI,EAAE,QAAQ,CAAC,IAAI,IAAI,GAAG,QAAQ,CAAC,EAAE,EAAE;QACvC,MAAM,EAAE,QAAQ,CAAC,MAAM;QACvB,SAAS,EAAE,EAAE,MAAM,EAAE,eAAe,EAAE;QACtC,YAAY,EAAE,EAAE,MAAM,EAAE,cAAc,EAAE;KACzC,CAAC;AACJ,CAAC"}
@@ -2,6 +2,12 @@ export declare class SnapevalError extends Error {
2
2
  exitCode: number;
3
3
  constructor(message: string, exitCode?: number);
4
4
  }
5
+ export declare class FileNotFoundError extends SnapevalError {
6
+ constructor(filePath: string, hint?: string);
7
+ }
8
+ export declare class ThresholdError extends SnapevalError {
9
+ constructor(actual: number, threshold: number);
10
+ }
5
11
  export declare class AdapterNotAvailableError extends SnapevalError {
6
12
  constructor(adapterName: string, installHint: string);
7
13
  }
@@ -1,3 +1,9 @@
1
+ // Exit codes:
2
+ // 0 = success
3
+ // 1 = threshold not met (eval ran successfully but pass rate below threshold)
4
+ // 2 = config/input error (bad JSON, missing fields, invalid flags)
5
+ // 3 = file not found (missing skill dir, missing evals.json, missing script)
6
+ // 4 = runtime error (harness failure, grading failure, timeout)
1
7
  export class SnapevalError extends Error {
2
8
  exitCode;
3
9
  constructor(message, exitCode = 2) {
@@ -6,9 +12,21 @@ export class SnapevalError extends Error {
6
12
  this.name = 'SnapevalError';
7
13
  }
8
14
  }
15
+ export class FileNotFoundError extends SnapevalError {
16
+ constructor(filePath, hint) {
17
+ super(`File not found: ${filePath}${hint ? `. ${hint}` : ''}`, 3);
18
+ this.name = 'FileNotFoundError';
19
+ }
20
+ }
21
+ export class ThresholdError extends SnapevalError {
22
+ constructor(actual, threshold) {
23
+ super(`Skill pass rate ${(actual * 100).toFixed(1)}% is below threshold ${(threshold * 100).toFixed(1)}%`, 1);
24
+ this.name = 'ThresholdError';
25
+ }
26
+ }
9
27
  export class AdapterNotAvailableError extends SnapevalError {
10
28
  constructor(adapterName, installHint) {
11
- super(`${adapterName} is not available. ${installHint}`);
29
+ super(`${adapterName} is not available. ${installHint}`, 4);
12
30
  this.name = 'AdapterNotAvailableError';
13
31
  }
14
32
  }
@@ -20,13 +38,13 @@ export class RateLimitError extends SnapevalError {
20
38
  }
21
39
  export class TimeoutError extends SnapevalError {
22
40
  constructor(evalId, timeoutMs) {
23
- super(`Eval ${evalId} timed out after ${timeoutMs}ms.`);
41
+ super(`Eval ${evalId} timed out after ${timeoutMs}ms.`, 4);
24
42
  this.name = 'TimeoutError';
25
43
  }
26
44
  }
27
45
  export class GradingError extends SnapevalError {
28
46
  constructor(evalId, detail) {
29
- super(`Grading failed for eval ${evalId}: ${detail}`);
47
+ super(`Grading failed for eval ${evalId}: ${detail}`, 4);
30
48
  this.name = 'GradingError';
31
49
  }
32
50
  }
@@ -1 +1 @@
1
- {"version":3,"file":"errors.js","sourceRoot":"","sources":["../../src/errors.ts"],"names":[],"mappings":"AAAA,MAAM,OAAO,aAAc,SAAQ,KAAK;IACF;IAApC,YAAY,OAAe,EAAS,WAAmB,CAAC;QACtD,KAAK,CAAC,OAAO,CAAC,CAAC;QADmB,aAAQ,GAAR,QAAQ,CAAY;QAEtD,IAAI,CAAC,IAAI,GAAG,eAAe,CAAC;IAC9B,CAAC;CACF;AAED,MAAM,OAAO,wBAAyB,SAAQ,aAAa;IACzD,YAAY,WAAmB,EAAE,WAAmB;QAClD,KAAK,CAAC,GAAG,WAAW,sBAAsB,WAAW,EAAE,CAAC,CAAC;QACzD,IAAI,CAAC,IAAI,GAAG,0BAA0B,CAAC;IACzC,CAAC;CACF;AAED,MAAM,OAAO,cAAe,SAAQ,aAAa;IAC/C,YAAY,WAAmB;QAC7B,KAAK,CAAC,GAAG,WAAW,mEAAmE,CAAC,CAAC;QACzF,IAAI,CAAC,IAAI,GAAG,gBAAgB,CAAC;IAC/B,CAAC;CACF;AAED,MAAM,OAAO,YAAa,SAAQ,aAAa;IAC7C,YAAY,MAAc,EAAE,SAAiB;QAC3C,KAAK,CAAC,QAAQ,MAAM,oBAAoB,SAAS,KAAK,CAAC,CAAC;QACxD,IAAI,CAAC,IAAI,GAAG,cAAc,CAAC;IAC7B,CAAC;CACF;AAED,MAAM,OAAO,YAAa,SAAQ,aAAa;IAC7C,YAAY,MAAc,EAAE,MAAc;QACxC,KAAK,CAAC,2BAA2B,MAAM,KAAK,MAAM,EAAE,CAAC,CAAC;QACtD,IAAI,CAAC,IAAI,GAAG,cAAc,CAAC;IAC7B,CAAC;CACF"}
1
+ {"version":3,"file":"errors.js","sourceRoot":"","sources":["../../src/errors.ts"],"names":[],"mappings":"AAAA,cAAc;AACd,cAAc;AACd,8EAA8E;AAC9E,mEAAmE;AACnE,6EAA6E;AAC7E,gEAAgE;AAEhE,MAAM,OAAO,aAAc,SAAQ,KAAK;IACF;IAApC,YAAY,OAAe,EAAS,WAAmB,CAAC;QACtD,KAAK,CAAC,OAAO,CAAC,CAAC;QADmB,aAAQ,GAAR,QAAQ,CAAY;QAEtD,IAAI,CAAC,IAAI,GAAG,eAAe,CAAC;IAC9B,CAAC;CACF;AAED,MAAM,OAAO,iBAAkB,SAAQ,aAAa;IAClD,YAAY,QAAgB,EAAE,IAAa;QACzC,KAAK,CAAC,mBAAmB,QAAQ,GAAG,IAAI,CAAC,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,CAAC;QAClE,IAAI,CAAC,IAAI,GAAG,mBAAmB,CAAC;IAClC,CAAC;CACF;AAED,MAAM,OAAO,cAAe,SAAQ,aAAa;IAC/C,YAAY,MAAc,EAAE,SAAiB;QAC3C,KAAK,CAAC,mBAAmB,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,wBAAwB,CAAC,SAAS,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;QAC9G,IAAI,CAAC,IAAI,GAAG,gBAAgB,CAAC;IAC/B,CAAC;CACF;AAED,MAAM,OAAO,wBAAyB,SAAQ,aAAa;IACzD,YAAY,WAAmB,EAAE,WAAmB;QAClD,KAAK,CAAC,GAAG,WAAW,sBAAsB,WAAW,EAAE,EAAE,CAAC,CAAC,CAAC;QAC5D,IAAI,CAAC,IAAI,GAAG,0BAA0B,CAAC;IACzC,CAAC;CACF;AAED,MAAM,OAAO,cAAe,SAAQ,aAAa;IAC/C,YAAY,WAAmB;QAC7B,KAAK,CAAC,GAAG,WAAW,mEAAmE,CAAC,CAAC;QACzF,IAAI,CAAC,IAAI,GAAG,gBAAgB,CAAC;IAC/B,CAAC;CACF;AAED,MAAM,OAAO,YAAa,SAAQ,aAAa;IAC7C,YAAY,MAAc,EAAE,SAAiB;QAC3C,KAAK,CAAC,QAAQ,MAAM,oBAAoB,SAAS,KAAK,EAAE,CAAC,CAAC,CAAC;QAC3D,IAAI,CAAC,IAAI,GAAG,cAAc,CAAC;IAC7B,CAAC;CACF;AAED,MAAM,OAAO,YAAa,SAAQ,aAAa;IAC7C,YAAY,MAAc,EAAE,MAAc;QACxC,KAAK,CAAC,2BAA2B,MAAM,KAAK,MAAM,EAAE,EAAE,CAAC,CAAC,CAAC;QACzD,IAAI,CAAC,IAAI,GAAG,cAAc,CAAC;IAC7B,CAAC;CACF"}
@@ -110,4 +110,5 @@ export interface SnapevalConfig {
110
110
  inference: string;
111
111
  workspace: string;
112
112
  runs: number;
113
+ concurrency: number;
113
114
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "snapeval",
3
- "version": "1.8.0",
3
+ "version": "2.1.0",
4
4
  "description": "Harness-agnostic eval runner for agentskills.io skills",
5
5
  "type": "module",
6
6
  "bin": {
@@ -50,5 +50,8 @@
50
50
  "tsx": "^4.19.3",
51
51
  "typescript": "^5.8.2",
52
52
  "vitest": "^4.1.0"
53
+ },
54
+ "optionalDependencies": {
55
+ "@github/copilot-sdk": "^0.2.0"
53
56
  }
54
57
  }
package/plugin.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "snapeval",
3
- "version": "1.8.0",
3
+ "version": "2.1.0",
4
4
  "description": "Semantic snapshot testing for AI skills. Zero assertions. AI-driven. Free inference.",
5
5
  "author": "Matan Tsach",
6
6
  "license": "MIT",
@@ -1,9 +1,9 @@
1
1
  ---
2
2
  name: snapeval
3
- description: Evaluate AI skills using the agentskills.io eval spec. Generates test cases, runs with/without skill comparisons, grades assertions, and computes benchmarks. Use when the user wants to evaluate, test, or review any skill — including phrases like "test my skill", "run evals", "evaluate this", "set up evals", or "how good is my skill."
3
+ description: Evaluate AI skills using the agentskills.io eval spec. Runs with/without skill comparisons, grades assertions, and computes benchmarks. Use when the user wants to evaluate, test, or review any skill — including phrases like "test my skill", "run evals", "evaluate this", "set up evals", or "how good is my skill."
4
4
  ---
5
5
 
6
- You are snapeval, a harness-agnostic eval runner for agentskills.io skills. You help developers evaluate AI skills by generating test scenarios, running with/without skill comparisons, grading assertions, and iterating on skill quality.
6
+ You are snapeval, a harness-agnostic eval runner for agentskills.io skills. You help developers evaluate AI skills by designing test scenarios, running with/without skill comparisons, grading assertions, and iterating on skill quality.
7
7
 
8
8
  ## Mode Detection
9
9
 
@@ -47,18 +47,34 @@ Triggered by: "evaluate", "test", "set up evals", "evaluate my skill"
47
47
  - Loop until confirmed
48
48
  - If the user says "just run it" → skip to Phase 4 immediately
49
49
 
50
- ### Phase 4 — Init & First Eval
51
-
52
- 1. Run: `npx snapeval init <skill-path>` — generates evals.json (prompts + expected outputs, no assertions)
53
- 2. Run: `npx snapeval eval <skill-path>` — runs each eval with and without the skill
54
- 3. Report: "Ran N evals. With-skill vs without-skill outputs are in the workspace. Review the outputs and add assertions to evals.json for what 'good' looks like."
55
-
56
- ### Phase 5 — Add Assertions & Re-eval
57
-
58
- After the user reviews outputs and adds assertions to evals.json:
59
-
60
- 1. Run: `npx snapeval eval <skill-path>` — now grades assertions, produces grading.json + benchmark.json
61
- 2. Interpret the benchmark:
50
+ ### Phase 4 — Write evals.json & First Eval
51
+
52
+ 1. Write the approved scenarios to `<skill-path>/evals/evals.json` with assertions derived from the "What it tests" analysis. Format:
53
+ ```json
54
+ {
55
+ "skill_name": "<skill-name>",
56
+ "evals": [
57
+ {
58
+ "id": 1,
59
+ "slug": "kebab-case-slug",
60
+ "prompt": "The realistic user prompt",
61
+ "assertions": ["Assertion 1", "Assertion 2"],
62
+ "files": []
63
+ }
64
+ ]
65
+ }
66
+ ```
67
+
68
+ **Writing good assertions:** Assertions are graded by an LLM that requires concrete evidence from the output to pass. Write specific, verifiable assertions — not vague ones.
69
+ - Good: `"Output contains a YAML block with an 'id' field for each issue"`
70
+ - Bad: `"Output is correct"`
71
+ - Good: `"Response declines to scout because the pipeline already has unclaimed issues"`
72
+ - Bad: `"Handles edge case properly"`
73
+
74
+ Script assertions are also supported: prefix with `script:` (e.g. `"script:check-yaml.sh"`). Scripts live in `<skill-path>/evals/scripts/`, receive the output directory as their first argument, and pass on exit code 0.
75
+
76
+ 2. Run: `npx snapeval eval <skill-path>` — runs each eval with and without the skill, grades assertions, produces grading.json + benchmark.json
77
+ 3. Interpret the benchmark:
62
78
  > "With skill: X% pass rate. Without skill: Y% pass rate. Delta: +Z%. The skill adds value on [specific assertions]."
63
79
 
64
80
  ## Review & Iterate
@@ -91,8 +107,7 @@ Never show raw stack traces. Translate errors into plain language with a suggest
91
107
 
92
108
  | Error | Response |
93
109
  |-------|----------|
94
- | No SKILL.md found | "I can't find a SKILL.md in `<path>`. Is this the right directory?" |
95
- | No evals.json | "No test cases exist yet. Want me to generate them with `snapeval init`?" |
110
+ | No evals.json | "No test cases exist yet. Want me to design scenarios and create evals.json?" |
96
111
  | Inference unavailable | "I can't connect to the inference service. Check that Copilot CLI is authenticated (`copilot auth status`)." |
97
112
  | Skill invocation failure | "The skill failed to respond to eval N: `<error>`. This might be a bug in the skill — want to skip this eval and continue?" |
98
113
 
@@ -100,5 +115,5 @@ Never show raw stack traces. Translate errors into plain language with a suggest
100
115
 
101
116
  - Never ask the user to write evals.json or any config files manually
102
117
  - Always read the target skill's SKILL.md before generating scenarios
103
- - Only reference CLI commands that exist: `init`, `eval`, `review`
104
- - Only reference CLI flags that exist: `--harness`, `--inference`, `--workspace`, `--runs`, `--old-skill`, `--no-open`, `--verbose`
118
+ - Only reference CLI commands that exist: `eval`, `review`
119
+ - Only reference CLI flags that exist: `--harness`, `--inference`, `--workspace`, `--runs`, `--concurrency`, `--only`, `--threshold`, `--old-skill`, `--no-open`, `--verbose`
@@ -33,7 +33,7 @@ export async function getClient(): Promise<any> {
33
33
  );
34
34
  }
35
35
 
36
- clientInstance = new CopilotClient();
36
+ clientInstance = new CopilotClient({ logLevel: 'none' });
37
37
  await clientInstance.start();
38
38
  clientStarted = true;
39
39
  return clientInstance;