agentic-qe 3.6.9 → 3.6.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/.claude/skills/.validation/schemas/skill-eval.schema.json +11 -1
  2. package/.claude/skills/pr-review/SKILL.md +2 -2
  3. package/.claude/skills/qcsd-production-swarm/SKILL.md +2781 -0
  4. package/.claude/skills/qcsd-production-swarm/evals/qcsd-production-swarm.yaml +246 -0
  5. package/.claude/skills/qcsd-production-swarm/schemas/output.json +505 -0
  6. package/.claude/skills/qcsd-production-swarm/scripts/validate-config.json +25 -0
  7. package/.claude/skills/skills-manifest.json +5 -5
  8. package/package.json +1 -1
  9. package/scripts/benchmark-hnsw-loading.ts +480 -0
  10. package/scripts/benchmark-kg-assisted.ts +725 -0
  11. package/scripts/collect-production-telemetry.sh +291 -0
  12. package/scripts/detect-skill-conflicts.ts +347 -0
  13. package/scripts/eval-driven-workflow.ts +704 -0
  14. package/scripts/run-skill-eval.ts +210 -10
  15. package/scripts/score-skill-quality.ts +511 -0
  16. package/v3/CHANGELOG.md +19 -0
  17. package/v3/assets/skills/pr-review/SKILL.md +2 -2
  18. package/v3/dist/cli/bundle.js +1064 -363
  19. package/v3/dist/cli/commands/hooks.d.ts.map +1 -1
  20. package/v3/dist/cli/commands/hooks.js +143 -2
  21. package/v3/dist/cli/commands/hooks.js.map +1 -1
  22. package/v3/dist/cli/commands/test.d.ts.map +1 -1
  23. package/v3/dist/cli/commands/test.js +6 -0
  24. package/v3/dist/cli/commands/test.js.map +1 -1
  25. package/v3/dist/domains/test-generation/generators/jest-vitest-generator.d.ts.map +1 -1
  26. package/v3/dist/domains/test-generation/generators/jest-vitest-generator.js +58 -6
  27. package/v3/dist/domains/test-generation/generators/jest-vitest-generator.js.map +1 -1
  28. package/v3/dist/domains/test-generation/generators/mocha-generator.d.ts.map +1 -1
  29. package/v3/dist/domains/test-generation/generators/mocha-generator.js +79 -7
  30. package/v3/dist/domains/test-generation/generators/mocha-generator.js.map +1 -1
  31. package/v3/dist/domains/test-generation/generators/pytest-generator.d.ts +4 -0
  32. package/v3/dist/domains/test-generation/generators/pytest-generator.d.ts.map +1 -1
  33. package/v3/dist/domains/test-generation/generators/pytest-generator.js +77 -10
  34. package/v3/dist/domains/test-generation/generators/pytest-generator.js.map +1 -1
  35. package/v3/dist/domains/test-generation/interfaces/test-generator.interface.d.ts +21 -0
  36. package/v3/dist/domains/test-generation/interfaces/test-generator.interface.d.ts.map +1 -1
  37. package/v3/dist/domains/test-generation/interfaces.d.ts +21 -0
  38. package/v3/dist/domains/test-generation/interfaces.d.ts.map +1 -1
  39. package/v3/dist/domains/test-generation/services/test-generator.d.ts +22 -0
  40. package/v3/dist/domains/test-generation/services/test-generator.d.ts.map +1 -1
  41. package/v3/dist/domains/test-generation/services/test-generator.js +163 -3
  42. package/v3/dist/domains/test-generation/services/test-generator.js.map +1 -1
  43. package/v3/dist/kernel/unified-memory-hnsw.d.ts +29 -0
  44. package/v3/dist/kernel/unified-memory-hnsw.d.ts.map +1 -1
  45. package/v3/dist/kernel/unified-memory-hnsw.js +136 -0
  46. package/v3/dist/kernel/unified-memory-hnsw.js.map +1 -1
  47. package/v3/dist/kernel/unified-memory.d.ts +2 -2
  48. package/v3/dist/kernel/unified-memory.d.ts.map +1 -1
  49. package/v3/dist/kernel/unified-memory.js +7 -9
  50. package/v3/dist/kernel/unified-memory.js.map +1 -1
  51. package/v3/dist/learning/qe-hooks.d.ts.map +1 -1
  52. package/v3/dist/learning/qe-hooks.js +34 -3
  53. package/v3/dist/learning/qe-hooks.js.map +1 -1
  54. package/v3/dist/mcp/bundle.js +857 -329
  55. package/v3/package.json +1 -1
@@ -0,0 +1,704 @@
1
+ #!/usr/bin/env npx tsx
2
+ /**
3
+ * Eval-Driven Development Workflow
4
+ *
5
+ * Enables the eval-driven development loop: baseline → write skill → compare.
6
+ * Two subcommands:
7
+ * init — Bootstrap eval scaffolding for a skill from its SKILL.md
8
+ * compare — Compare two eval run JSON files (before/after)
9
+ *
10
+ * Usage:
11
+ * npx tsx scripts/eval-driven-workflow.ts init <skill-name>
12
+ * npx tsx scripts/eval-driven-workflow.ts compare <before.json> <after.json>
13
+ * npx tsx scripts/eval-driven-workflow.ts --help
14
+ */
15
+
16
+ import { readFileSync, writeFileSync, existsSync, mkdirSync, readdirSync, statSync } from 'fs';
17
+ import { join, dirname } from 'path';
18
+
19
+ // ============================================================================
20
+ // CONSTANTS
21
+ // ============================================================================
22
+
23
+ const SKILLS_DIR = '.claude/skills';
24
+ const PLATFORM_PREFIXES = ['v3-', 'flow-nexus-', 'agentdb-', 'reasoningbank-', 'swarm-'];
25
+
26
+ // ============================================================================
27
+ // FRONTMATTER PARSER (shared pattern from score-skill-quality.ts)
28
+ // ============================================================================
29
+
30
/**
 * Parse the `---` delimited frontmatter at the top of a markdown document.
 *
 * This is a deliberately small YAML subset, not a general YAML parser:
 * - `key: value` pairs at column 0 become top-level entries (values are
 *   converted by `parseYamlValue`);
 * - a `key:` with no value opens a nested object, and the indented
 *   `child: value` lines that follow are collected into it;
 * - blank lines and `#` comment lines are skipped;
 * - anything else (for example `- item` block-sequence lines) is ignored.
 *
 * Both LF and CRLF line endings are accepted, as is a leading UTF-8 BOM, so
 * files saved by Windows editors are not silently treated as having no
 * frontmatter.
 *
 * @param content Full text of the markdown file.
 * @returns The parsed key/value map, or `{}` when no frontmatter block is found.
 */
function parseYamlFrontmatter(content: string): Record<string, unknown> {
  const match = content.match(/^\uFEFF?---\r?\n([\s\S]*?)\r?\n---/);
  if (!match) return {};

  const result: Record<string, unknown> = {};
  // Key of the nested object currently being collected, or null when at top level.
  let nestedKey: string | null = null;
  let nestedObj: Record<string, unknown> = {};

  const flushNested = (): void => {
    if (nestedKey !== null) {
      result[nestedKey] = nestedObj;
      nestedKey = null;
    }
  };

  for (const line of match[1].split(/\r?\n/)) {
    const trimmed = line.trim();
    if (!trimmed || trimmed.startsWith('#')) continue;

    const indent = line.search(/\S/);
    if (nestedKey !== null) {
      if (indent > 0) {
        // Indented line inside a nested object: only `child: value` is understood.
        const child = trimmed.match(/^([\w_]+):\s*(.+)$/);
        if (child) nestedObj[child[1]] = parseYamlValue(child[2]);
        continue;
      }
      // Back at column 0: the nested object is complete.
      flushNested();
    }

    const kv = trimmed.match(/^([\w_]+):\s*(.*)$/);
    if (!kv) continue;

    const [, key, value] = kv;
    if (value.trim() === '') {
      nestedKey = key;
      nestedObj = {};
    } else {
      result[key] = parseYamlValue(value.trim());
    }
  }

  flushNested();
  return result;
}
69
+
70
/**
 * Convert a single-line YAML scalar or inline list into a JavaScript value.
 *
 * Recognised forms, checked in this order:
 * 1. a string wrapped in matching single or double quotes -> the inner text;
 * 2. an inline list `[a, "b", 'c']` -> array of strings (items are split on
 *    commas; matching quotes around an item are removed); `[]` -> empty array;
 * 3. the literals `true` / `false` -> booleans;
 * 4. anything `Number()` accepts -> number;
 * 5. otherwise the trimmed text itself.
 *
 * @param value Raw text to the right of `key:`.
 * @returns The converted value.
 */
function parseYamlValue(value: string): unknown {
  const text = value.trim();

  // Returns the inner text only when the SAME quote character opens and closes
  // the string; a lone or unbalanced quote is left untouched.
  const stripMatchingQuotes = (s: string): string | null => {
    if (s.length < 2) return null;
    const first = s[0];
    const isQuote = first === '"' || first === "'";
    return isQuote && s[s.length - 1] === first ? s.slice(1, -1) : null;
  };

  const unquoted = stripMatchingQuotes(text);
  if (unquoted !== null) return unquoted;

  if (text.startsWith('[') && text.endsWith(']')) {
    const inner = text.slice(1, -1).trim();
    // ''.split(',') yields [''], so an empty list needs its own case.
    if (inner === '') return [];
    return inner.split(',').map(item => {
      const s = item.trim();
      return stripMatchingQuotes(s) ?? s;
    });
  }

  if (text === 'true') return true;
  if (text === 'false') return false;

  // Number('') is 0, so the empty string must be excluded explicitly.
  const n = Number(text);
  if (!isNaN(n) && text !== '') return n;

  return text;
}
85
+
86
+ // ============================================================================
87
+ // PROJECT ROOT
88
+ // ============================================================================
89
+
90
/**
 * Locate the project root by walking up from the current working directory
 * until a directory containing `package.json` is found.
 *
 * The filesystem root itself is never probed. The walk stops when
 * `dirname(dir)` returns `dir` unchanged, which is how `path.dirname` signals
 * the root on every platform (`/` on POSIX, `C:\` on Windows). Comparing
 * against the literal `'/'` would loop forever on Windows.
 *
 * @returns The nearest ancestor (or the cwd itself) that holds a
 *          `package.json`; falls back to the current working directory.
 */
function getProjectRoot(): string {
  const start = process.cwd();
  let dir = start;

  for (;;) {
    const parent = dirname(dir);
    if (parent === dir) return start; // reached the filesystem root without a hit
    if (existsSync(join(dir, 'package.json'))) return dir;
    dir = parent;
  }
}
98
+
99
+ // ============================================================================
100
+ // KEYWORD EXTRACTION
101
+ // ============================================================================
102
+
103
/**
 * Collect up to 12 lowercase keywords from a skill document, for use as
 * `must_contain` assertions in a generated eval.
 *
 * Keywords are gathered in this order (the first 12 distinct ones win):
 * 1. known tool / standard names found in the body, one vocabulary group at a time;
 * 2. the text of the first 8 level 1-3 markdown headings whose length is
 *    strictly between 3 and 40 characters;
 * 3. string entries of `frontmatter.tags`, when that field is an array.
 *
 * @param body Markdown body of SKILL.md (frontmatter already removed).
 * @param frontmatter Parsed frontmatter map.
 * @returns Distinct lowercase keywords in discovery order, at most 12.
 */
function extractKeywords(body: string, frontmatter: Record<string, unknown>): string[] {
  const vocabulary: RegExp[] = [
    /\b(owasp|pact|k6|artillery|jmeter|wcag|jest|vitest|playwright|cypress)\b/gi,
    /\b(supertest|graphql|rest|grpc|openapi|swagger|postman|cucumber|gherkin)\b/gi,
    /\b(bdd|tdd|mutation|stryker|sonarqube|eslint|docker|kubernetes|terraform)\b/gi,
    /\b(kafka|rabbitmq|redis|postgresql|mongodb|oauth|jwt|saml)\b/gi,
    /\b(xss|sqli|csrf|ssrf|sast|dast|sca|sbom|cve)\b/gi,
    /\b(MCP|hooks|pre-edit|post-edit|session|memory|neural|swarm|agent)\b/gi,
  ];

  // A Set keeps insertion order, which decides which keywords survive the cap.
  const collected = new Set<string>();

  vocabulary
    .flatMap(group => body.match(group) ?? [])
    .forEach(hit => collected.add(hit.toLowerCase()));

  const headingLines = body.match(/^#{1,3}\s+(.+)$/gm) ?? [];
  for (const line of headingLines.slice(0, 8)) {
    const title = line.replace(/^#+\s+/, '').trim().toLowerCase();
    if (title.length > 3 && title.length < 40) collected.add(title);
  }

  if (Array.isArray(frontmatter.tags)) {
    for (const tag of frontmatter.tags) {
      if (typeof tag === 'string') collected.add(tag.toLowerCase());
    }
  }

  return Array.from(collected).slice(0, 12);
}
141
+
142
/**
 * Extract a one-line description of the skill's primary capability from the
 * SKILL.md body.
 *
 * Lookup order:
 * 1. the first non-empty line of the "## What This Skill Does" section. The
 *    section ends at the next `##` heading, at a line starting with `**Key`,
 *    or at the end of the document;
 * 2. the first line of the first paragraph that follows a level-1 title and
 *    starts with a capital letter (the paragraph may be the last thing in the
 *    document);
 * 3. a generic placeholder sentence.
 *
 * @param body Markdown body of SKILL.md (frontmatter already removed).
 * @returns A single line of text; never empty.
 */
function extractPrimaryCapability(body: string): string {
  const firstLine = (text: string): string => text.trim().split('\n')[0].trim();

  // The lookahead requires a newline somewhere in the whitespace after the
  // heading text, so "## What This Skill Does: inline" is not treated as a heading.
  const heading = /##\s*What This Skill Does(?=\s*\n)/.exec(body);
  if (heading) {
    const rest = body.slice(heading.index + heading[0].length);
    const end = rest.search(/\n##|\n\*\*Key/);
    const summary = firstLine(end === -1 ? rest : rest.slice(0, end));
    // An empty section falls through to the paragraph heuristic below.
    if (summary) return summary;
  }

  // Only the first line is used, so matching to end-of-line is sufficient and
  // works even when nothing follows the paragraph.
  const paragraph = body.match(/^#[^#].*\n+([A-Z].*)$/m);
  if (paragraph) return firstLine(paragraph[1]);

  return 'the primary functionality described in SKILL.md';
}
154
+
155
+ // ============================================================================
156
+ // INIT SUBCOMMAND
157
+ // ============================================================================
158
+
159
/**
 * `init` subcommand: generate a starter eval YAML for one skill.
 *
 * Steps:
 * 1. verify `<project root>/.claude/skills/<skillName>` is a directory and the
 *    name does not carry a platform prefix;
 * 2. stop (exit code 0) when `evals/<skillName>.yaml` already exists;
 * 3. read `SKILL.md` (or `skill.md`), parse its frontmatter and body;
 * 4. derive keywords and a primary-capability line from the document;
 * 5. render a YAML scaffold with five seed test cases and write it to disk;
 * 6. print the follow-up commands.
 *
 * Every value interpolated into a quoted YAML scalar goes through `yamlQuote`
 * so that quotes or backslashes in headings, tags or descriptions cannot
 * produce an unparseable file.
 *
 * NOTE(review): the indentation of the YAML template below was reconstructed
 * from the document structure (2-space nesting); confirm it against
 * `.claude/skills/.validation/schemas/skill-eval.schema.json` and an existing
 * eval file before relying on the exact layout.
 *
 * Exits the process with code 1 on validation errors.
 *
 * @param skillName Directory name of the skill under `.claude/skills`.
 */
function runInit(skillName: string): void {
  // JSON string syntax is a valid YAML double-quoted scalar, so JSON.stringify
  // gives correct escaping for `"`, `\` and control characters.
  const yamlQuote = (value: unknown): string => JSON.stringify(String(value));

  // Renders a block sequence at `indent`, or the fallback line when empty.
  const yamlList = (items: readonly string[], indent: string, fallback: string): string =>
    items.map(item => `${indent}- ${yamlQuote(item)}`).join('\n') || `${indent}${fallback}`;

  const projectRoot = getProjectRoot();
  const skillDir = join(projectRoot, SKILLS_DIR, skillName);

  // 1. Validate skill exists
  if (!existsSync(skillDir) || !statSync(skillDir).isDirectory()) {
    console.error(`Error: Skill directory not found: ${SKILLS_DIR}/${skillName}`);
    console.error(` Available skills are in ${SKILLS_DIR}/`);
    process.exit(1);
  }

  // Platform skills are outside the scope of this workflow
  if (PLATFORM_PREFIXES.some(prefix => skillName.startsWith(prefix))) {
    console.error(`Error: '${skillName}' is a platform skill, not an AQE skill.`);
    console.error(' Only AQE skills are supported by this workflow.');
    process.exit(1);
  }

  // 2. Never overwrite an existing eval
  const evalsDir = join(skillDir, 'evals');
  const evalPath = join(evalsDir, `${skillName}.yaml`);
  if (existsSync(evalPath)) {
    console.log(`Eval already exists: ${SKILLS_DIR}/${skillName}/evals/${skillName}.yaml`);
    console.log(' To regenerate, delete the existing file first.');
    process.exit(0);
  }

  // 3. Read SKILL.md (upper-case name preferred, lower-case accepted)
  const mdPath = ['SKILL.md', 'skill.md']
    .map(name => join(skillDir, name))
    .find(candidate => existsSync(candidate));

  if (!mdPath) {
    console.error(`Error: No SKILL.md found in ${SKILLS_DIR}/${skillName}/`);
    process.exit(1);
  }

  const content = readFileSync(mdPath, 'utf-8');
  const frontmatter = parseYamlFrontmatter(content);
  const bodyMatch = content.match(/^---[\s\S]*?---\s*\n([\s\S]*)$/);
  const body = bodyMatch ? bodyMatch[1] : content;

  // 4. Extract info from SKILL.md
  const description = String(frontmatter.description || frontmatter.name || skillName);
  const category = String(frontmatter.category || 'general');
  const priority = String(frontmatter.priority || 'p1');
  const tags: unknown[] = Array.isArray(frontmatter.tags) ? frontmatter.tags : [];
  const agents: unknown[] = Array.isArray(frontmatter.agents) ? frontmatter.agents : [];

  const keywords = extractKeywords(body, frontmatter);
  const primaryCapability = extractPrimaryCapability(body);

  // 5. Build must_contain keywords for test cases (pick top relevant ones)
  const mustContainBasic = keywords.slice(0, 3);
  const mustContainCore = keywords.slice(0, 5);
  const skillPhrase = skillName.replace(/-/g, ' ');

  // 6. Generate YAML
  const today = new Date().toISOString().split('T')[0];
  const yaml = `# =============================================================================
# AQE ${skillName} Skill Evaluation Test Suite v1.0.0
# Generated by eval-driven-workflow.ts on ${today}
# =============================================================================
#
# Eval-driven development workflow:
# 1. Review and customize test cases below (look for TODO comments)
# 2. Run baseline: npx tsx scripts/run-skill-eval.ts --skill ${skillName} --output before.json
# 3. Improve SKILL.md based on eval failures
# 4. Compare: npx tsx scripts/eval-driven-workflow.ts compare before.json after.json
#
# Schema: .claude/skills/.validation/schemas/skill-eval.schema.json
# Runner: scripts/run-skill-eval.ts
# =============================================================================

skill: ${skillName}
version: 1.0.0
description: >
  Evaluation test suite for ${skillName} skill.
  ${description.slice(0, 120)}

# =============================================================================
# Multi-Model Configuration
# =============================================================================

models_to_test:
  - claude-sonnet-4 # Primary model (high accuracy expected)
  - claude-3-haiku # Fast model (minimum quality bar)

# =============================================================================
# MCP Integration Configuration
# =============================================================================

mcp_integration:
  enabled: true
  namespace: skill-validation
  query_patterns: true
  track_outcomes: true
  store_patterns: true
  share_learning: true
  update_quality_gate: true
  target_agents:
    - qe-learning-coordinator
    - qe-queen-coordinator

# =============================================================================
# ReasoningBank Learning Configuration
# =============================================================================

learning:
  store_success_patterns: true
  store_failure_patterns: true
  pattern_ttl_days: 90
  min_confidence_to_store: 0.7
  cross_model_comparison: true

# =============================================================================
# Result Format Configuration
# =============================================================================

result_format:
  json_output: true
  markdown_report: false
  include_raw_output: false
  include_timing: true
  include_token_usage: true

# =============================================================================
# Test Cases — 5 seed cases generated from SKILL.md
# =============================================================================
# TODO: Review each test case and customize prompts, must_contain keywords,
# and validation thresholds for your specific skill behavior.
# =============================================================================

test_cases:
  # ---------------------------------------------------------------------------
  # tc001: Basic Invocation
  # ---------------------------------------------------------------------------
  - id: tc001_basic_invocation
    description: "Skill responds to basic invocation with relevant output"
    category: basic
    priority: critical

    input:
      prompt: |
        # TODO: Replace with a minimal, realistic prompt for this skill
        I need help with ${skillPhrase}.
      context:
        language: typescript

    expected_output:
      must_contain:
${yamlList(mustContainBasic, '        ', '- "TODO_KEYWORD" # TODO: Add expected keywords')}
      must_not_contain:
        - "unable to"
        - "I cannot"

    validation:
      schema_check: true
      keyword_match_threshold: 0.6
      reasoning_quality_min: 0.5

  # ---------------------------------------------------------------------------
  # tc002: Handles Empty/Missing Input
  # ---------------------------------------------------------------------------
  - id: tc002_handles_empty_input
    description: "Skill handles empty or missing input gracefully"
    category: edge_cases
    priority: high

    input:
      prompt: ""
      context:
        language: unknown

    expected_output:
      must_contain:
        - "provide" # TODO: Adjust — what should the skill say for empty input?
      must_not_contain:
        - "exception"
        - "crash"
        - "undefined"

    validation:
      schema_check: true
      allow_partial: true

  # ---------------------------------------------------------------------------
  # tc003: Core Capability
  # ---------------------------------------------------------------------------
  - id: tc003_core_capability
    description: ${yamlQuote(`Tests the primary capability: ${primaryCapability.slice(0, 80)}`)}
    category: core
    priority: critical

    input:
      prompt: |
        # TODO: Write a prompt that exercises the core capability of this skill
        # Core capability: ${primaryCapability.slice(0, 100)}
        Help me apply ${skillPhrase} to a sample project.
      context:
        language: typescript
        framework: nodejs

    expected_output:
      must_contain:
${yamlList(mustContainCore, '        ', '- "TODO_KEYWORD" # TODO: Add expected keywords')}
      must_not_contain:
        - "error"
        - "not supported"

    validation:
      schema_check: true
      keyword_match_threshold: 0.8
      reasoning_quality_min: 0.6

  # ---------------------------------------------------------------------------
  # tc004: Output Structure
  # ---------------------------------------------------------------------------
  - id: tc004_output_structure
    description: "Validates output contains expected sections and structure"
    category: structure
    priority: high

    input:
      prompt: |
        # TODO: Write a prompt that should produce well-structured output
        Give me a comprehensive guide for ${skillPhrase}.
      context:
        language: typescript

    expected_output:
      must_contain:
        - "##" # TODO: Expect markdown headings?
${yamlList(mustContainBasic.slice(0, 2), '        ', '- "TODO_KEYWORD"')}
      must_not_contain:
        - "TODO"
        - "placeholder"

    validation:
      schema_check: true
      keyword_match_threshold: 0.7

  # ---------------------------------------------------------------------------
  # tc005: Negative Control
  # ---------------------------------------------------------------------------
  - id: tc005_negative_control
    description: "Input where skill should decline or redirect to another skill"
    category: negative
    priority: high

    input:
      prompt: |
        # TODO: Write an out-of-scope prompt that this skill should NOT handle
        How do I make a soufflé?
      context:
        language: unknown

    expected_output:
      must_not_contain:
        - "recipe"
        - "ingredients"
        - "bake"
      # TODO: What should the skill say when declining? Add must_contain keywords.

    validation:
      schema_check: true
      allow_partial: true

# =============================================================================
# Success Criteria
# =============================================================================

success_criteria:
  pass_rate: 0.8 # 80% starter threshold — increase as skill matures
  critical_pass_rate: 1.0 # Critical tests must always pass
  avg_reasoning_quality: 0.6
  max_execution_time_ms: 300000
  cross_model_variance: 0.2

# =============================================================================
# Metadata
# =============================================================================

metadata:
  author: "eval-driven-workflow"
  created: "${today}"
  last_updated: "${today}"
  coverage_target: "Core functionality and basic edge cases"
  source_skill_category: ${yamlQuote(category)}
  source_skill_priority: ${yamlQuote(priority)}
  source_skill_tags: [${tags.map(tag => yamlQuote(tag)).join(', ')}]
  source_skill_agents: [${agents.map(agent => yamlQuote(agent)).join(', ')}]
`;

  // 7. Write file (recursive mkdir is a no-op when the directory already exists)
  mkdirSync(evalsDir, { recursive: true });
  writeFileSync(evalPath, yaml);

  // 8. Print next steps
  const keywordList = keywords.length > 0 ? keywords.join(', ') : '(none extracted — add manually)';
  console.log(`Created eval scaffold: ${SKILLS_DIR}/${skillName}/evals/${skillName}.yaml`);
  console.log(` 5 seed test cases generated from SKILL.md`);
  console.log(` Keywords extracted: ${keywordList}`);
  console.log('');
  console.log('Next steps:');
  console.log(' 1. Review and customize test cases (look for TODO comments)');
  console.log(` 2. Run baseline: npx tsx scripts/run-skill-eval.ts --skill ${skillName} --output before.json`);
  console.log(' 3. Improve SKILL.md based on eval failures');
  console.log(` 4. Compare: npx tsx scripts/eval-driven-workflow.ts compare before.json after.json`);
}
472
+
473
+ // ============================================================================
474
+ // COMPARE SUBCOMMAND
475
+ // ============================================================================
476
+
477
/**
 * Outcome of one test case inside an eval run JSON file.
 *
 * `runCompare` derives a display status from it: `skipped` takes precedence
 * (SKIP), otherwise `passed` selects PASS or FAIL.
 */
interface TestCaseResult {
  /** Test case identifier; used to pair results between two runs. */
  id: string;
  /** Whether the test case passed (only consulted when not skipped). */
  passed: boolean;
  /** Whether the test case was skipped. */
  skipped: boolean;
  /** Execution time in milliseconds, as the field name states. */
  execution_time_ms: number;
}
483
+
484
/**
 * Results of evaluating one skill against one model, as stored in an eval
 * run JSON file.
 */
interface ModelEvalResult {
  /** Name of the model these results belong to. */
  model: string;
  /** Name of the evaluated skill. */
  skill: string;
  /** Pass rate as a fraction; `runCompare` multiplies it by 100 for display. */
  pass_rate: number;
  /** Pass rate of critical tests as a fraction; also shown as a percentage. */
  critical_pass_rate: number;
  /** Total execution time in milliseconds; `runCompare` shows it in seconds. */
  total_execution_time_ms: number;
  /** Per-test-case outcomes. */
  test_results: TestCaseResult[];
}
492
+
493
/**
 * Top-level shape of an eval run JSON file consumed by the `compare`
 * subcommand.
 */
interface EvalRunResult {
  /** Name of the evaluated skill. */
  skill: string;
  /** One entry per model; `runCompare` only looks at the first entry. */
  model_results: ModelEvalResult[];
  /** Aggregate figures for the whole run. */
  summary: {
    /** Average pass rate (not read by the code in this file). */
    avg_pass_rate: number;
  };
}
500
+
501
/**
 * `compare` subcommand: print a before/after comparison of two eval run JSON
 * files and exit with a status that reflects the verdict.
 *
 * Only the first `model_results` entry of each file is compared.
 *
 * Verdict:
 * - IMPROVEMENT: the pass rate went up;
 * - NO CHANGE: the pass rate is identical and no test went from PASS to FAIL;
 * - REGRESSION: everything else (lower pass rate, or equal pass rate with at
 *   least one PASS -> FAIL test).
 *
 * Exit code: 1 for REGRESSION and for input errors, 0 otherwise, so the exit
 * status always agrees with the printed result line.
 *
 * @param beforePath Path to the baseline eval run JSON.
 * @param afterPath Path to the eval run JSON produced after the change.
 */
function runCompare(beforePath: string, afterPath: string): void {
  // 1. Read files
  if (!existsSync(beforePath)) {
    console.error(`Error: Before file not found: ${beforePath}`);
    process.exit(1);
  }
  if (!existsSync(afterPath)) {
    console.error(`Error: After file not found: ${afterPath}`);
    process.exit(1);
  }

  let before: EvalRunResult;
  let after: EvalRunResult;
  try {
    before = JSON.parse(readFileSync(beforePath, 'utf-8'));
    after = JSON.parse(readFileSync(afterPath, 'utf-8'));
  } catch (e) {
    console.error(`Error: Failed to parse JSON: ${e instanceof Error ? e.message : String(e)}`);
    process.exit(1);
  }

  const skillName = after.skill || before.skill || 'unknown';

  // Use first model result for comparison (most common use case)
  const beforeModel = before.model_results?.[0];
  const afterModel = after.model_results?.[0];

  if (!beforeModel || !afterModel) {
    console.error('Error: Both files must contain at least one model_results entry.');
    process.exit(1);
  }

  // The verdict is derived from pass_rate, so a missing or non-numeric value
  // must be reported rather than silently compared as NaN.
  if (!Number.isFinite(beforeModel.pass_rate) || !Number.isFinite(afterModel.pass_rate)) {
    console.error('Error: pass_rate must be a finite number in both files.');
    process.exit(1);
  }

  // 2. Compare aggregate metrics
  const beforePassRate = beforeModel.pass_rate * 100;
  const afterPassRate = afterModel.pass_rate * 100;
  const deltaPassRate = afterPassRate - beforePassRate;

  const beforeCritical = beforeModel.critical_pass_rate * 100;
  const afterCritical = afterModel.critical_pass_rate * 100;
  const deltaCritical = afterCritical - beforeCritical;

  const beforeTime = beforeModel.total_execution_time_ms / 1000;
  const afterTime = afterModel.total_execution_time_ms / 1000;
  const deltaTime = afterTime - beforeTime;

  // 3. Compare per-test-case results
  const statusOf = (r: TestCaseResult): string => (r.skipped ? 'SKIP' : r.passed ? 'PASS' : 'FAIL');

  const beforeResults = new Map<string, TestCaseResult>();
  for (const r of beforeModel.test_results || []) {
    beforeResults.set(r.id, r);
  }

  interface TestChange {
    id: string;
    before: string;
    after: string;
    label: string;
  }
  const changes: TestChange[] = [];
  const afterIds = new Set<string>();

  for (const r of afterModel.test_results || []) {
    afterIds.add(r.id);
    const b = beforeResults.get(r.id);
    const bStatus = b ? statusOf(b) : 'NEW';
    const aStatus = statusOf(r);

    let label: string;
    if (bStatus === 'NEW') label = '(new test)';
    else if (bStatus === 'FAIL' && aStatus === 'PASS') label = '(improved)';
    else if (bStatus === 'PASS' && aStatus === 'FAIL') label = '(REGRESSION)';
    else if (bStatus === aStatus) label = '(stable)';
    else label = '(changed)';

    changes.push({ id: r.id, before: bStatus, after: aStatus, label });
  }

  // Tests present before but missing after; show the status they actually had
  for (const [id, b] of beforeResults) {
    if (!afterIds.has(id)) {
      changes.push({ id, before: statusOf(b), after: 'REMOVED', label: '(removed)' });
    }
  }

  // 4. Output diff table
  const sep = '='.repeat(72);
  const divider = '-'.repeat(54);

  console.log(sep);
  console.log(`EVAL COMPARISON: ${skillName}`);
  console.log(sep);

  console.log(`${'Metric'.padEnd(26)}${'Before'.padStart(10)}${'After'.padStart(10)}${'Delta'.padStart(10)}`);
  console.log(divider);

  const signed = (v: number): string => `${v >= 0 ? '+' : ''}${v.toFixed(1)}`;
  const fmtPct = (v: number): string => `${v.toFixed(1)}%`;
  const fmtDelta = (v: number): string => `${signed(v)}%`;
  const fmtTime = (v: number): string => `${v.toFixed(1)}s`;
  const fmtTimeDelta = (v: number): string => `${signed(v)}s`;
  const arrow = (v: number): string => (v > 0 ? ' \u2191' : v < 0 ? ' \u2193' : '');

  const row = (label: string, b: string, a: string, d: string): string =>
    `${label.padEnd(26)}${b.padStart(10)}${a.padStart(10)}${d.padStart(10)}`;

  console.log(
    row('Pass rate', fmtPct(beforePassRate), fmtPct(afterPassRate), fmtDelta(deltaPassRate) + arrow(deltaPassRate))
  );
  console.log(
    row('Critical pass rate', fmtPct(beforeCritical), fmtPct(afterCritical), fmtDelta(deltaCritical) + arrow(deltaCritical))
  );
  // The value comes from total_execution_time_ms, so it is labelled as a total.
  console.log(
    row('Total execution time', fmtTime(beforeTime), fmtTime(afterTime), fmtTimeDelta(deltaTime))
  );

  console.log('');
  console.log('Test Case Changes:');
  for (const c of changes) {
    const idPad = c.id.length > 30 ? c.id.slice(0, 27) + '...' : c.id.padEnd(30);
    console.log(` ${idPad} ${c.before} \u2192 ${c.after} ${c.label}`);
  }

  console.log(sep);

  // 5. Determine result
  const regressions = changes.filter(c => c.label === '(REGRESSION)');
  let regressed = false;
  if (deltaPassRate > 0) {
    console.log(`Result: IMPROVEMENT (+${deltaPassRate.toFixed(1)}% pass rate)`);
  } else if (deltaPassRate === 0 && regressions.length === 0) {
    console.log('Result: NO CHANGE');
  } else {
    regressed = true;
    console.log(`Result: REGRESSION (${deltaPassRate.toFixed(1)}% pass rate, ${regressions.length} test(s) regressed)`);
  }
  console.log(sep);

  // Exit code follows the printed verdict: 1 for REGRESSION, 0 otherwise
  process.exit(regressed ? 1 : 0);
}
633
+
634
+ // ============================================================================
635
+ // HELP
636
+ // ============================================================================
637
+
638
/**
 * Print the command-line usage text (subcommands, options and the
 * recommended six-step workflow) to stdout.
 */
function printHelp(): void {
  // The leading and trailing empty entries reproduce the blank line that
  // frames the help text.
  const helpLines = [
    '',
    'Eval-Driven Development Workflow',
    '',
    'Usage:',
    'npx tsx scripts/eval-driven-workflow.ts init <skill-name>',
    'npx tsx scripts/eval-driven-workflow.ts compare <before.json> <after.json>',
    '',
    'Subcommands:',
    'init <skill> Bootstrap eval YAML with 5 seed test cases from SKILL.md',
    'compare <before> <after> Compare two eval run JSON files and show diff table',
    '',
    'Options:',
    '--help, -h Show this help',
    '',
    'Workflow:',
    '1. npx tsx scripts/eval-driven-workflow.ts init my-skill',
    '2. Review generated YAML (look for TODO comments)',
    '3. npx tsx scripts/run-skill-eval.ts --skill my-skill --output before.json',
    '4. Edit SKILL.md to improve eval results',
    '5. npx tsx scripts/run-skill-eval.ts --skill my-skill --output after.json',
    '6. npx tsx scripts/eval-driven-workflow.ts compare before.json after.json',
    '',
  ];

  console.log(helpLines.join('\n'));
}
662
+
663
+ // ============================================================================
664
+ // MAIN
665
+ // ============================================================================
666
+
667
/**
 * CLI entry point: read the subcommand from `process.argv` and dispatch.
 *
 * - no arguments, `--help` or `-h`: print usage and exit 0;
 * - `init <skill-name>`: run `runInit`;
 * - `compare <before.json> <after.json>`: run `runCompare`;
 * - anything else, or a missing required argument: print an error and exit 1.
 */
function main(): void {
  const [subcommand, firstArg, secondArg] = process.argv.slice(2);

  if (subcommand === undefined || subcommand === '--help' || subcommand === '-h') {
    printHelp();
    process.exit(0);
  }

  if (subcommand === 'init') {
    if (!firstArg) {
      console.error('Error: Skill name required. Usage: eval-driven-workflow.ts init <skill-name>');
      process.exit(1);
    }
    runInit(firstArg);
    return;
  }

  if (subcommand === 'compare') {
    if (!firstArg || !secondArg) {
      console.error('Error: Two JSON file paths required. Usage: eval-driven-workflow.ts compare <before.json> <after.json>');
      process.exit(1);
    }
    runCompare(firstArg, secondArg);
    return;
  }

  console.error(`Unknown subcommand: ${subcommand}`);
  console.error('Use --help for usage information.');
  process.exit(1);
}

main();