agentic-qe 3.6.9 → 3.6.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/.validation/schemas/skill-eval.schema.json +11 -1
- package/.claude/skills/pr-review/SKILL.md +2 -2
- package/.claude/skills/qcsd-production-swarm/SKILL.md +2781 -0
- package/.claude/skills/qcsd-production-swarm/evals/qcsd-production-swarm.yaml +246 -0
- package/.claude/skills/qcsd-production-swarm/schemas/output.json +505 -0
- package/.claude/skills/qcsd-production-swarm/scripts/validate-config.json +25 -0
- package/.claude/skills/skills-manifest.json +5 -5
- package/package.json +1 -1
- package/scripts/benchmark-hnsw-loading.ts +480 -0
- package/scripts/benchmark-kg-assisted.ts +725 -0
- package/scripts/collect-production-telemetry.sh +291 -0
- package/scripts/detect-skill-conflicts.ts +347 -0
- package/scripts/eval-driven-workflow.ts +704 -0
- package/scripts/run-skill-eval.ts +210 -10
- package/scripts/score-skill-quality.ts +511 -0
- package/v3/CHANGELOG.md +19 -0
- package/v3/assets/skills/pr-review/SKILL.md +2 -2
- package/v3/dist/cli/bundle.js +1064 -363
- package/v3/dist/cli/commands/hooks.d.ts.map +1 -1
- package/v3/dist/cli/commands/hooks.js +143 -2
- package/v3/dist/cli/commands/hooks.js.map +1 -1
- package/v3/dist/cli/commands/test.d.ts.map +1 -1
- package/v3/dist/cli/commands/test.js +6 -0
- package/v3/dist/cli/commands/test.js.map +1 -1
- package/v3/dist/domains/test-generation/generators/jest-vitest-generator.d.ts.map +1 -1
- package/v3/dist/domains/test-generation/generators/jest-vitest-generator.js +58 -6
- package/v3/dist/domains/test-generation/generators/jest-vitest-generator.js.map +1 -1
- package/v3/dist/domains/test-generation/generators/mocha-generator.d.ts.map +1 -1
- package/v3/dist/domains/test-generation/generators/mocha-generator.js +79 -7
- package/v3/dist/domains/test-generation/generators/mocha-generator.js.map +1 -1
- package/v3/dist/domains/test-generation/generators/pytest-generator.d.ts +4 -0
- package/v3/dist/domains/test-generation/generators/pytest-generator.d.ts.map +1 -1
- package/v3/dist/domains/test-generation/generators/pytest-generator.js +77 -10
- package/v3/dist/domains/test-generation/generators/pytest-generator.js.map +1 -1
- package/v3/dist/domains/test-generation/interfaces/test-generator.interface.d.ts +21 -0
- package/v3/dist/domains/test-generation/interfaces/test-generator.interface.d.ts.map +1 -1
- package/v3/dist/domains/test-generation/interfaces.d.ts +21 -0
- package/v3/dist/domains/test-generation/interfaces.d.ts.map +1 -1
- package/v3/dist/domains/test-generation/services/test-generator.d.ts +22 -0
- package/v3/dist/domains/test-generation/services/test-generator.d.ts.map +1 -1
- package/v3/dist/domains/test-generation/services/test-generator.js +163 -3
- package/v3/dist/domains/test-generation/services/test-generator.js.map +1 -1
- package/v3/dist/kernel/unified-memory-hnsw.d.ts +29 -0
- package/v3/dist/kernel/unified-memory-hnsw.d.ts.map +1 -1
- package/v3/dist/kernel/unified-memory-hnsw.js +136 -0
- package/v3/dist/kernel/unified-memory-hnsw.js.map +1 -1
- package/v3/dist/kernel/unified-memory.d.ts +2 -2
- package/v3/dist/kernel/unified-memory.d.ts.map +1 -1
- package/v3/dist/kernel/unified-memory.js +7 -9
- package/v3/dist/kernel/unified-memory.js.map +1 -1
- package/v3/dist/learning/qe-hooks.d.ts.map +1 -1
- package/v3/dist/learning/qe-hooks.js +34 -3
- package/v3/dist/learning/qe-hooks.js.map +1 -1
- package/v3/dist/mcp/bundle.js +857 -329
- package/v3/package.json +1 -1
|
@@ -0,0 +1,704 @@
|
|
|
1
|
+
#!/usr/bin/env npx tsx
|
|
2
|
+
/**
|
|
3
|
+
* Eval-Driven Development Workflow
|
|
4
|
+
*
|
|
5
|
+
* Enables the eval-driven development loop: baseline → write skill → compare.
|
|
6
|
+
* Two subcommands:
|
|
7
|
+
* init — Bootstrap eval scaffolding for a skill from its SKILL.md
|
|
8
|
+
* compare — Compare two eval run JSON files (before/after)
|
|
9
|
+
*
|
|
10
|
+
* Usage:
|
|
11
|
+
* npx tsx scripts/eval-driven-workflow.ts init <skill-name>
|
|
12
|
+
* npx tsx scripts/eval-driven-workflow.ts compare <before.json> <after.json>
|
|
13
|
+
* npx tsx scripts/eval-driven-workflow.ts --help
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { readFileSync, writeFileSync, existsSync, mkdirSync, readdirSync, statSync } from 'fs';
|
|
17
|
+
import { join, dirname } from 'path';
|
|
18
|
+
|
|
19
|
+
// ============================================================================
|
|
20
|
+
// CONSTANTS
|
|
21
|
+
// ============================================================================
|
|
22
|
+
|
|
23
|
+
/** Skills directory, relative to the project root; `init` looks for `<SKILLS_DIR>/<skill-name>`. */
const SKILLS_DIR = '.claude/skills';

/** Skill-name prefixes that identify platform skills, which `init` refuses to scaffold. */
const PLATFORM_PREFIXES = [
  'v3-',
  'flow-nexus-',
  'agentdb-',
  'reasoningbank-',
  'swarm-',
];
|
|
25
|
+
|
|
26
|
+
// ============================================================================
|
|
27
|
+
// FRONTMATTER PARSER (shared pattern from score-skill-quality.ts)
|
|
28
|
+
// ============================================================================
|
|
29
|
+
|
|
30
|
+
/**
 * Parse the `---` delimited frontmatter at the very start of a markdown document.
 *
 * This is a deliberately small YAML subset, not a general YAML parser. It understands:
 *   - `key: value` pairs (keys may contain letters, digits, `_` and `-`),
 *   - one level of nested mappings (`key:` followed by indented `sub: value` lines),
 *   - block sequences (`key:` followed by `- item` lines),
 *   - the scalar and inline-array forms handled by `parseYamlValue`.
 * Block scalars (`>` / `|`) and deeper nesting are not interpreted.
 *
 * @param content Full text of the markdown file (LF or CRLF line endings).
 * @returns The parsed key/value map, or `{}` when the document has no frontmatter.
 */
function parseYamlFrontmatter(content: string): Record<string, unknown> {
  // Accept both LF and CRLF so files saved on Windows are not silently ignored.
  const match = content.match(/^---\r?\n([\s\S]*?)\r?\n---/);
  if (!match) return {};

  const result: Record<string, unknown> = {};
  // State for the `key:` (no inline value) entry currently being collected, if any.
  let nestedKey: string | null = null;
  let nestedObj: Record<string, unknown> = {};
  let nestedList: unknown[] = [];

  // A pending entry is a list only when it collected `- item` lines and no
  // `sub: value` lines; otherwise it is a (possibly empty) mapping.
  const pendingValue = (): unknown =>
    nestedList.length > 0 && Object.keys(nestedObj).length === 0 ? nestedList : nestedObj;

  for (const rawLine of match[1].split(/\r?\n/)) {
    const trimmed = rawLine.trim();
    if (trimmed === '' || trimmed.startsWith('#')) continue;
    const indent = rawLine.search(/\S/);

    if (nestedKey !== null) {
      // YAML allows sequence items at the parent's indent, so accept any indent here.
      const item = trimmed.match(/^-\s+(.+)$/);
      if (item) {
        nestedList.push(parseYamlValue(item[1]));
        continue;
      }
      if (indent > 0) {
        const nestedKv = trimmed.match(/^([\w-]+):\s*(.+)$/);
        if (nestedKv) nestedObj[nestedKv[1]] = parseYamlValue(nestedKv[2]);
        continue; // indented lines that are not `sub: value` are ignored
      }
      // Back at column 0: the pending entry is complete.
      result[nestedKey] = pendingValue();
      nestedKey = null;
    }

    const kv = trimmed.match(/^([\w-]+):\s*(.*)$/);
    if (!kv) continue;
    const [, key, value] = kv;
    if (value.trim() === '') {
      // `key:` with nothing after it opens a nested mapping or block sequence.
      nestedKey = key;
      nestedObj = {};
      nestedList = [];
    } else {
      result[key] = parseYamlValue(value);
    }
  }

  if (nestedKey !== null) result[nestedKey] = pendingValue();
  return result;
}

/**
 * Strip one pair of matching surrounding quotes from a scalar.
 *
 * @returns The unquoted text, or `null` when `text` is not wrapped in matching quotes.
 */
function unquoteYamlScalar(text: string): string | null {
  if (text.length < 2) return null;
  const first = text[0];
  if ((first === '"' || first === "'") && text.endsWith(first)) return text.slice(1, -1);
  return null;
}

/**
 * Convert a single-line YAML scalar into a JavaScript value.
 *
 * Recognised forms, in order: quoted string, inline array (`[a, "b"]`, items are
 * always strings), `true` / `false`, anything `Number()` accepts, and finally the
 * trimmed text itself.
 *
 * @param value Raw text following `key:`.
 * @returns string, string[], boolean or number as described above.
 */
function parseYamlValue(value: string): unknown {
  const t = value.trim();

  const unquoted = unquoteYamlScalar(t);
  if (unquoted !== null) return unquoted;

  if (t.startsWith('[') && t.endsWith(']')) {
    return t
      .slice(1, -1)
      .split(',')
      .map(item => item.trim())
      // Drops the phantom '' produced by `[]`, `[ ]` and trailing commas.
      .filter(item => item !== '')
      // Only strip quotes that actually match, so `"a` is not truncated.
      .map(item => unquoteYamlScalar(item) ?? item);
  }

  if (t === 'true') return true;
  if (t === 'false') return false;

  const n = Number(t);
  if (t !== '' && !Number.isNaN(n)) return n;
  return t;
}
|
|
85
|
+
|
|
86
|
+
// ============================================================================
|
|
87
|
+
// PROJECT ROOT
|
|
88
|
+
// ============================================================================
|
|
89
|
+
|
|
90
|
+
/**
 * Find the project root: the nearest directory at or above the current working
 * directory that contains a `package.json`.
 *
 * The filesystem root itself is not inspected. If no ancestor qualifies, the
 * current working directory is returned.
 *
 * @returns Absolute path of the detected root, or `process.cwd()` as a fallback.
 */
function getProjectRoot(): string {
  const start = process.cwd();
  let dir = start;
  let parent = dirname(dir);

  // `dirname()` of a filesystem root returns that same root ('/' on POSIX,
  // 'C:\\' on Windows). Comparing against the parent therefore terminates on
  // every platform, whereas comparing against the literal '/' never would on
  // Windows.
  while (parent !== dir) {
    if (existsSync(join(dir, 'package.json'))) return dir;
    dir = parent;
    parent = dirname(dir);
  }
  return start;
}
|
|
98
|
+
|
|
99
|
+
// ============================================================================
|
|
100
|
+
// KEYWORD EXTRACTION
|
|
101
|
+
// ============================================================================
|
|
102
|
+
|
|
103
|
+
/**
 * Collect candidate `must_contain` keywords for a skill.
 *
 * Sources, in priority order (insertion order decides which survive the cap):
 *   1. known tool / standard names found in the body, one vocabulary at a time,
 *   2. the text of the first eight level 1-3 markdown headings (4-39 characters),
 *   3. string entries of `frontmatter.tags`.
 * Everything is lower-cased and de-duplicated.
 *
 * @param body SKILL.md content without its frontmatter.
 * @param frontmatter Parsed frontmatter; only `tags` is read.
 * @returns At most 12 keywords.
 */
function extractKeywords(body: string, frontmatter: Record<string, unknown>): string[] {
  const found = new Set<string>();

  // Vocabularies of tool/standard names commonly referenced in skills.
  const vocabularies: RegExp[] = [
    /\b(owasp|pact|k6|artillery|jmeter|wcag|jest|vitest|playwright|cypress)\b/gi,
    /\b(supertest|graphql|rest|grpc|openapi|swagger|postman|cucumber|gherkin)\b/gi,
    /\b(bdd|tdd|mutation|stryker|sonarqube|eslint|docker|kubernetes|terraform)\b/gi,
    /\b(kafka|rabbitmq|redis|postgresql|mongodb|oauth|jwt|saml)\b/gi,
    /\b(xss|sqli|csrf|ssrf|sast|dast|sca|sbom|cve)\b/gi,
    /\b(MCP|hooks|pre-edit|post-edit|session|memory|neural|swarm|agent)\b/gi,
  ];
  vocabularies
    .flatMap(vocabulary => body.match(vocabulary) ?? [])
    .forEach(hit => found.add(hit.toLowerCase()));

  // Markdown headings double as domain keywords.
  const headingLines = (body.match(/^#{1,3}\s+(.+)$/gm) ?? []).slice(0, 8);
  for (const headingLine of headingLines) {
    const title = headingLine.replace(/^#+\s+/, '').trim().toLowerCase();
    if (title.length <= 3 || title.length >= 40) continue;
    found.add(title);
  }

  // Frontmatter tags come last, so they only fill whatever room is left.
  if (Array.isArray(frontmatter.tags)) {
    for (const tag of frontmatter.tags) {
      if (typeof tag === 'string') found.add(tag.toLowerCase());
    }
  }

  return Array.from(found).slice(0, 12);
}
|
|
141
|
+
|
|
142
|
+
/**
 * Derive a one-line description of what a skill does from its SKILL.md body.
 *
 * Lookup order:
 *   1. the first non-empty line of the "## What This Skill Does" section,
 *   2. the first line of the paragraph that follows a level-1 title and starts
 *      with an uppercase ASCII letter,
 *   3. a generic placeholder sentence.
 *
 * Sections and paragraphs located at the very end of the document are handled,
 * as are CRLF line endings.
 *
 * @param body SKILL.md content without its frontmatter.
 * @returns A single line of text; never empty.
 */
function extractPrimaryCapability(body: string): string {
  const firstLine = (text: string): string => (text.trim().split(/\r?\n/)[0] ?? '').trim();

  // The section ends at the next heading, at a "**Key..." line, or at end of
  // input. Without the `$` alternative a trailing section would never match.
  const section = body.match(/##\s*What This Skill Does\s*\n+([\s\S]*?)(?=\n##|\n\*\*Key|$)/);
  if (section) {
    const summary = firstLine(section[1]);
    // An empty section falls through to the next strategy instead of returning ''.
    if (summary !== '') return summary;
  }

  // Only the paragraph's first line is used, so capture exactly that line; this
  // removes the need for a blank line or heading after the paragraph.
  const intro = body.match(/^#[^#].*(?:\r?\n)+([A-Z][^\r\n]*)/m);
  if (intro) {
    const summary = intro[1].trim();
    if (summary !== '') return summary;
  }

  return 'the primary functionality described in SKILL.md';
}
|
|
154
|
+
|
|
155
|
+
// ============================================================================
|
|
156
|
+
// INIT SUBCOMMAND
|
|
157
|
+
// ============================================================================
|
|
158
|
+
|
|
159
|
+
/**
 * Render a value as a YAML double-quoted scalar.
 *
 * JSON string syntax is valid YAML, so `JSON.stringify` provides the escaping
 * for `"`, `\` and control characters that plain interpolation would miss.
 */
function yamlQuote(value: unknown): string {
  return JSON.stringify(String(value));
}

/**
 * `init` subcommand: scaffold `<skill>/evals/<skill>.yaml` with five seed test
 * cases derived from the skill's SKILL.md.
 *
 * Side effects: reads SKILL.md, creates the `evals` directory if needed, writes
 * the YAML file and prints next steps. Terminates the process with exit code 1
 * when the skill directory or SKILL.md is missing or the skill is a platform
 * skill, and with exit code 0 (leaving the file untouched) when the eval file
 * already exists.
 *
 * @param skillName Directory name of the skill under `SKILLS_DIR`.
 */
function runInit(skillName: string): void {
  const projectRoot = getProjectRoot();
  const skillDir = join(projectRoot, SKILLS_DIR, skillName);

  // 1. Validate skill exists
  if (!existsSync(skillDir) || !statSync(skillDir).isDirectory()) {
    console.error(`Error: Skill directory not found: ${SKILLS_DIR}/${skillName}`);
    console.error(`  Available skills are in ${SKILLS_DIR}/`);
    process.exit(1);
  }

  // Check it's not a platform skill
  if (PLATFORM_PREFIXES.some(prefix => skillName.startsWith(prefix))) {
    console.error(`Error: '${skillName}' is a platform skill, not an AQE skill.`);
    console.error('  Only AQE skills are supported by this workflow.');
    process.exit(1);
  }

  // 2. Never overwrite an existing eval
  const evalsDir = join(skillDir, 'evals');
  const evalPath = join(evalsDir, `${skillName}.yaml`);
  if (existsSync(evalPath)) {
    console.log(`Eval already exists: ${SKILLS_DIR}/${skillName}/evals/${skillName}.yaml`);
    console.log('  To regenerate, delete the existing file first.');
    process.exit(0);
  }

  // 3. Read SKILL.md (upper-case name preferred, lower-case accepted)
  const mdPath = ['SKILL.md', 'skill.md']
    .map(fileName => join(skillDir, fileName))
    .find(candidate => existsSync(candidate));

  if (!mdPath) {
    console.error(`Error: No SKILL.md found in ${SKILLS_DIR}/${skillName}/`);
    process.exit(1);
  }

  const content = readFileSync(mdPath, 'utf-8');
  const frontmatter = parseYamlFrontmatter(content);
  const bodyMatch = content.match(/^---[\s\S]*?---\s*\n([\s\S]*)$/);
  const body = bodyMatch ? bodyMatch[1] : content;

  // 4. Extract info from SKILL.md
  const description = String(frontmatter.description || frontmatter.name || skillName);
  const category = String(frontmatter.category || 'general');
  const priority = String(frontmatter.priority || 'p1');
  const tags: unknown[] = Array.isArray(frontmatter.tags) ? frontmatter.tags : [];
  const agents: unknown[] = Array.isArray(frontmatter.agents) ? frontmatter.agents : [];

  const keywords = extractKeywords(body, frontmatter);
  const primaryCapability = extractPrimaryCapability(body);

  // 5. Build must_contain keywords for test cases (pick top relevant ones)
  const mustContainBasic = keywords.slice(0, 3);
  const mustContainCore = keywords.slice(0, 5);

  // Keywords can come from arbitrary markdown headings, so every item is
  // escaped before being placed inside a double-quoted YAML scalar.
  const keywordItems = (words: string[], fallback: string): string =>
    words.map(word => `        - ${yamlQuote(word)}`).join('\n') || fallback;
  const inlineList = (items: unknown[]): string => items.map(item => yamlQuote(item)).join(', ');

  const readableName = skillName.replace(/-/g, ' ');

  // 6. Generate YAML
  const today = new Date().toISOString().split('T')[0];
  const yaml = `# =============================================================================
# AQE ${skillName} Skill Evaluation Test Suite v1.0.0
# Generated by eval-driven-workflow.ts on ${today}
# =============================================================================
#
# Eval-driven development workflow:
#   1. Review and customize test cases below (look for TODO comments)
#   2. Run baseline: npx tsx scripts/run-skill-eval.ts --skill ${skillName} --output before.json
#   3. Improve SKILL.md based on eval failures
#   4. Compare: npx tsx scripts/eval-driven-workflow.ts compare before.json after.json
#
# Schema: .claude/skills/.validation/schemas/skill-eval.schema.json
# Runner: scripts/run-skill-eval.ts
# =============================================================================

skill: ${skillName}
version: 1.0.0
description: >
  Evaluation test suite for ${skillName} skill.
  ${description.slice(0, 120)}

# =============================================================================
# Multi-Model Configuration
# =============================================================================

models_to_test:
  - claude-sonnet-4  # Primary model (high accuracy expected)
  - claude-3-haiku  # Fast model (minimum quality bar)

# =============================================================================
# MCP Integration Configuration
# =============================================================================

mcp_integration:
  enabled: true
  namespace: skill-validation
  query_patterns: true
  track_outcomes: true
  store_patterns: true
  share_learning: true
  update_quality_gate: true
  target_agents:
    - qe-learning-coordinator
    - qe-queen-coordinator

# =============================================================================
# ReasoningBank Learning Configuration
# =============================================================================

learning:
  store_success_patterns: true
  store_failure_patterns: true
  pattern_ttl_days: 90
  min_confidence_to_store: 0.7
  cross_model_comparison: true

# =============================================================================
# Result Format Configuration
# =============================================================================

result_format:
  json_output: true
  markdown_report: false
  include_raw_output: false
  include_timing: true
  include_token_usage: true

# =============================================================================
# Test Cases — 5 seed cases generated from SKILL.md
# =============================================================================
# TODO: Review each test case and customize prompts, must_contain keywords,
#       and validation thresholds for your specific skill behavior.
# =============================================================================

test_cases:
  # ---------------------------------------------------------------------------
  # tc001: Basic Invocation
  # ---------------------------------------------------------------------------
  - id: tc001_basic_invocation
    description: "Skill responds to basic invocation with relevant output"
    category: basic
    priority: critical

    input:
      prompt: |
        # TODO: Replace with a minimal, realistic prompt for this skill
        I need help with ${readableName}.
      context:
        language: typescript

    expected_output:
      must_contain:
${keywordItems(mustContainBasic, '        - "TODO_KEYWORD"  # TODO: Add expected keywords')}
      must_not_contain:
        - "unable to"
        - "I cannot"

    validation:
      schema_check: true
      keyword_match_threshold: 0.6
      reasoning_quality_min: 0.5

  # ---------------------------------------------------------------------------
  # tc002: Handles Empty/Missing Input
  # ---------------------------------------------------------------------------
  - id: tc002_handles_empty_input
    description: "Skill handles empty or missing input gracefully"
    category: edge_cases
    priority: high

    input:
      prompt: ""
      context:
        language: unknown

    expected_output:
      must_contain:
        - "provide"  # TODO: Adjust — what should the skill say for empty input?
      must_not_contain:
        - "exception"
        - "crash"
        - "undefined"

    validation:
      schema_check: true
      allow_partial: true

  # ---------------------------------------------------------------------------
  # tc003: Core Capability
  # ---------------------------------------------------------------------------
  - id: tc003_core_capability
    description: ${yamlQuote(`Tests the primary capability: ${primaryCapability.slice(0, 80)}`)}
    category: core
    priority: critical

    input:
      prompt: |
        # TODO: Write a prompt that exercises the core capability of this skill
        # Core capability: ${primaryCapability.slice(0, 100)}
        Help me apply ${readableName} to a sample project.
      context:
        language: typescript
        framework: nodejs

    expected_output:
      must_contain:
${keywordItems(mustContainCore, '        - "TODO_KEYWORD"  # TODO: Add expected keywords')}
      must_not_contain:
        - "error"
        - "not supported"

    validation:
      schema_check: true
      keyword_match_threshold: 0.8
      reasoning_quality_min: 0.6

  # ---------------------------------------------------------------------------
  # tc004: Output Structure
  # ---------------------------------------------------------------------------
  - id: tc004_output_structure
    description: "Validates output contains expected sections and structure"
    category: structure
    priority: high

    input:
      prompt: |
        # TODO: Write a prompt that should produce well-structured output
        Give me a comprehensive guide for ${readableName}.
      context:
        language: typescript

    expected_output:
      must_contain:
        - "##"  # TODO: Expect markdown headings?
${keywordItems(mustContainBasic.slice(0, 2), '        - "TODO_KEYWORD"')}
      must_not_contain:
        - "TODO"
        - "placeholder"

    validation:
      schema_check: true
      keyword_match_threshold: 0.7

  # ---------------------------------------------------------------------------
  # tc005: Negative Control
  # ---------------------------------------------------------------------------
  - id: tc005_negative_control
    description: "Input where skill should decline or redirect to another skill"
    category: negative
    priority: high

    input:
      prompt: |
        # TODO: Write an out-of-scope prompt that this skill should NOT handle
        How do I make a soufflé?
      context:
        language: unknown

    expected_output:
      must_not_contain:
        - "recipe"
        - "ingredients"
        - "bake"
      # TODO: What should the skill say when declining? Add must_contain keywords.

    validation:
      schema_check: true
      allow_partial: true

# =============================================================================
# Success Criteria
# =============================================================================

success_criteria:
  pass_rate: 0.8  # 80% starter threshold — increase as skill matures
  critical_pass_rate: 1.0  # Critical tests must always pass
  avg_reasoning_quality: 0.6
  max_execution_time_ms: 300000
  cross_model_variance: 0.2

# =============================================================================
# Metadata
# =============================================================================

metadata:
  author: "eval-driven-workflow"
  created: ${yamlQuote(today)}
  last_updated: ${yamlQuote(today)}
  coverage_target: "Core functionality and basic edge cases"
  source_skill_category: ${yamlQuote(category)}
  source_skill_priority: ${yamlQuote(priority)}
  source_skill_tags: [${inlineList(tags)}]
  source_skill_agents: [${inlineList(agents)}]
`;

  // 7. Write file (`recursive: true` makes mkdirSync a no-op when the directory exists)
  mkdirSync(evalsDir, { recursive: true });
  writeFileSync(evalPath, yaml);

  // 8. Print next steps
  const keywordList = keywords.length > 0 ? keywords.join(', ') : '(none extracted — add manually)';
  console.log(`Created eval scaffold: ${SKILLS_DIR}/${skillName}/evals/${skillName}.yaml`);
  console.log(`  5 seed test cases generated from SKILL.md`);
  console.log(`  Keywords extracted: ${keywordList}`);
  console.log('');
  console.log('Next steps:');
  console.log('  1. Review and customize test cases (look for TODO comments)');
  console.log(`  2. Run baseline: npx tsx scripts/run-skill-eval.ts --skill ${skillName} --output before.json`);
  console.log('  3. Improve SKILL.md based on eval failures');
  console.log(`  4. Compare: npx tsx scripts/eval-driven-workflow.ts compare before.json after.json`);
}
|
|
472
|
+
|
|
473
|
+
// ============================================================================
|
|
474
|
+
// COMPARE SUBCOMMAND
|
|
475
|
+
// ============================================================================
|
|
476
|
+
|
|
477
|
+
/** Outcome of one test case, as read from an eval run JSON file. */
interface TestCaseResult {
  id: string;
  passed: boolean;
  skipped: boolean;
  execution_time_ms: number;
}

/** Aggregated results of one model within an eval run. */
interface ModelEvalResult {
  model: string;
  skill: string;
  /** Fraction in [0, 1]; multiplied by 100 for display. */
  pass_rate: number;
  /** Fraction in [0, 1]; multiplied by 100 for display. */
  critical_pass_rate: number;
  total_execution_time_ms: number;
  test_results: TestCaseResult[];
}

/** Top-level shape of an eval run JSON file (only the fields read by `compare`). */
interface EvalRunResult {
  skill: string;
  model_results: ModelEvalResult[];
  summary: {
    avg_pass_rate: number;
  };
}

/**
 * `compare` subcommand: print a before/after report for two eval run JSON files.
 *
 * Only the first `model_results` entry of each file is compared. The report
 * contains aggregate metrics, a per-test status transition list and a verdict.
 *
 * This function always terminates the process:
 *   - exit 1 when a file is missing, is not valid JSON, lacks a `model_results`
 *     entry or lacks a numeric `pass_rate`;
 *   - otherwise exit 0 when the "after" pass rate is greater than or equal to the
 *     "before" pass rate, else exit 1. The exit code depends on the aggregate
 *     pass rate only, not on individual test regressions.
 *
 * @param beforePath Path to the baseline eval run JSON.
 * @param afterPath Path to the eval run JSON produced after the change.
 */
function runCompare(beforePath: string, afterPath: string): void {
  // 1. Read files — both must exist before either is parsed.
  for (const [label, filePath] of [['Before', beforePath], ['After', afterPath]] as const) {
    if (!existsSync(filePath)) {
      console.error(`Error: ${label} file not found: ${filePath}`);
      process.exit(1);
    }
  }

  const loadRun = (filePath: string): EvalRunResult => {
    try {
      const parsed: EvalRunResult = JSON.parse(readFileSync(filePath, 'utf-8'));
      return parsed;
    } catch (e) {
      // Name the offending file; with two inputs the bare parser message is ambiguous.
      console.error(`Error: Failed to parse JSON in ${filePath}: ${e instanceof Error ? e.message : String(e)}`);
      return process.exit(1);
    }
  };
  const before = loadRun(beforePath);
  const after = loadRun(afterPath);

  // The JSON is not schema-validated, so access defensively.
  const skillName = after?.skill || before?.skill || 'unknown';

  // Use first model result for comparison (most common use case)
  const beforeModel = before?.model_results?.[0];
  const afterModel = after?.model_results?.[0];

  if (!beforeModel || !afterModel) {
    console.error('Error: Both files must contain at least one model_results entry.');
    process.exit(1);
  }

  // The verdict and the exit code are derived from pass_rate; without a real
  // number the report would show NaN and a bogus "REGRESSION".
  const isFiniteNumber = (v: unknown): boolean => typeof v === 'number' && Number.isFinite(v);
  if (!isFiniteNumber(beforeModel.pass_rate) || !isFiniteNumber(afterModel.pass_rate)) {
    console.error('Error: Both model_results entries must contain a numeric pass_rate.');
    process.exit(1);
  }

  // 2. Compare aggregate metrics
  const beforePassRate = beforeModel.pass_rate * 100;
  const afterPassRate = afterModel.pass_rate * 100;
  const deltaPassRate = afterPassRate - beforePassRate;

  const beforeCritical = beforeModel.critical_pass_rate * 100;
  const afterCritical = afterModel.critical_pass_rate * 100;
  const deltaCritical = afterCritical - beforeCritical;

  const beforeTime = beforeModel.total_execution_time_ms / 1000;
  const afterTime = afterModel.total_execution_time_ms / 1000;
  const deltaTime = afterTime - beforeTime;

  // 3. Compare per-test-case results
  const statusOf = (r: TestCaseResult): string => (r.skipped ? 'SKIP' : r.passed ? 'PASS' : 'FAIL');

  const beforeResults = new Map<string, TestCaseResult>();
  for (const r of beforeModel.test_results || []) {
    beforeResults.set(r.id, r);
  }

  interface TestChange {
    id: string;
    before: string;
    after: string;
    label: string;
  }
  const changes: TestChange[] = [];
  const afterIds = new Set<string>();

  for (const r of afterModel.test_results || []) {
    afterIds.add(r.id);
    const previous = beforeResults.get(r.id);
    const bStatus = previous ? statusOf(previous) : 'NEW';
    const aStatus = statusOf(r);

    let label: string;
    if (bStatus === 'NEW') label = '(new test)';
    else if (bStatus === 'FAIL' && aStatus === 'PASS') label = '(improved)';
    else if (bStatus === 'PASS' && aStatus === 'FAIL') label = '(REGRESSION)';
    else if (bStatus === aStatus) label = '(stable)';
    else label = '(changed)';

    changes.push({ id: r.id, before: bStatus, after: aStatus, label });
  }

  // Tests present before but absent after; show the status they actually had.
  for (const [id, previous] of beforeResults) {
    if (!afterIds.has(id)) {
      changes.push({ id, before: statusOf(previous), after: 'REMOVED', label: '(removed)' });
    }
  }

  // 4. Output diff table
  const sep = '='.repeat(72);
  const divider = '-'.repeat(54);

  console.log(sep);
  console.log(`EVAL COMPARISON: ${skillName}`);
  console.log(sep);

  console.log(`${'Metric'.padEnd(26)}${'Before'.padStart(10)}${'After'.padStart(10)}${'Delta'.padStart(10)}`);
  console.log(divider);

  const fmtPct = (v: number) => `${v.toFixed(1)}%`;
  const fmtDelta = (v: number) => `${v >= 0 ? '+' : ''}${v.toFixed(1)}%`;
  const fmtTime = (v: number) => `${v.toFixed(1)}s`;
  const fmtTimeDelta = (v: number) => `${v >= 0 ? '+' : ''}${v.toFixed(1)}s`;

  const arrow = (v: number) => (v > 0 ? ' \u2191' : v < 0 ? ' \u2193' : '');

  const row = (name: string, b: string, a: string, d: string): string =>
    `${name.padEnd(26)}${b.padStart(10)}${a.padStart(10)}${d.padStart(10)}`;

  console.log(row('Pass rate', fmtPct(beforePassRate), fmtPct(afterPassRate), fmtDelta(deltaPassRate) + arrow(deltaPassRate)));
  console.log(row('Critical pass rate', fmtPct(beforeCritical), fmtPct(afterCritical), fmtDelta(deltaCritical) + arrow(deltaCritical)));
  // The value comes from total_execution_time_ms, so it is labelled as a total.
  console.log(row('Total execution time', fmtTime(beforeTime), fmtTime(afterTime), fmtTimeDelta(deltaTime)));

  console.log('');
  console.log('Test Case Changes:');
  for (const c of changes) {
    const idPad = c.id.length > 30 ? c.id.slice(0, 27) + '...' : c.id.padEnd(30);
    console.log(`  ${idPad}  ${c.before} \u2192 ${c.after}  ${c.label}`);
  }

  console.log(sep);

  // 5. Determine result
  const regressions = changes.filter(c => c.label === '(REGRESSION)');
  if (deltaPassRate > 0) {
    console.log(`Result: IMPROVEMENT (+${deltaPassRate.toFixed(1)}% pass rate)`);
  } else if (deltaPassRate === 0 && regressions.length === 0) {
    console.log('Result: NO CHANGE');
  } else {
    console.log(`Result: REGRESSION (${deltaPassRate.toFixed(1)}% pass rate, ${regressions.length} test(s) regressed)`);
  }
  console.log(sep);

  // Exit code: 0 if after >= before, 1 if the aggregate pass rate dropped
  process.exit(afterPassRate >= beforePassRate ? 0 : 1);
}
|
|
633
|
+
|
|
634
|
+
// ============================================================================
|
|
635
|
+
// HELP
|
|
636
|
+
// ============================================================================
|
|
637
|
+
|
|
638
|
+
/**
 * Print the command-line usage text (synopsis, subcommands, options and the
 * recommended six-step workflow) to stdout in a single `console.log` call.
 */
function printHelp(): void {
  // The leading and trailing empty entries reproduce the blank line before and
  // after the help text.
  const helpLines = [
    '',
    'Eval-Driven Development Workflow',
    '',
    'Usage:',
    '  npx tsx scripts/eval-driven-workflow.ts init <skill-name>',
    '  npx tsx scripts/eval-driven-workflow.ts compare <before.json> <after.json>',
    '',
    'Subcommands:',
    '  init <skill>               Bootstrap eval YAML with 5 seed test cases from SKILL.md',
    '  compare <before> <after>   Compare two eval run JSON files and show diff table',
    '',
    'Options:',
    '  --help, -h                 Show this help',
    '',
    'Workflow:',
    '  1. npx tsx scripts/eval-driven-workflow.ts init my-skill',
    '  2. Review generated YAML (look for TODO comments)',
    '  3. npx tsx scripts/run-skill-eval.ts --skill my-skill --output before.json',
    '  4. Edit SKILL.md to improve eval results',
    '  5. npx tsx scripts/run-skill-eval.ts --skill my-skill --output after.json',
    '  6. npx tsx scripts/eval-driven-workflow.ts compare before.json after.json',
    '',
  ];
  console.log(helpLines.join('\n'));
}
|
|
662
|
+
|
|
663
|
+
// ============================================================================
|
|
664
|
+
// MAIN
|
|
665
|
+
// ============================================================================
|
|
666
|
+
|
|
667
|
+
/**
 * CLI entry point: dispatch on the first command-line argument.
 *
 *   - no arguments, `--help` or `-h` → print help, exit 0
 *   - `init <skill-name>`            → runInit
 *   - `compare <before> <after>`     → runCompare
 *   - anything else, or a subcommand missing its arguments → error message, exit 1
 */
function main(): void {
  const [subcommand, ...operands] = process.argv.slice(2);

  if (subcommand === undefined || subcommand === '--help' || subcommand === '-h') {
    printHelp();
    process.exit(0);
  }

  if (subcommand === 'init') {
    const [skillName] = operands;
    if (!skillName) {
      console.error('Error: Skill name required. Usage: eval-driven-workflow.ts init <skill-name>');
      process.exit(1);
    }
    runInit(skillName);
    return;
  }

  if (subcommand === 'compare') {
    const [beforePath, afterPath] = operands;
    if (!beforePath || !afterPath) {
      console.error('Error: Two JSON file paths required. Usage: eval-driven-workflow.ts compare <before.json> <after.json>');
      process.exit(1);
    }
    runCompare(beforePath, afterPath);
    return;
  }

  console.error(`Unknown subcommand: ${subcommand}`);
  console.error('Use --help for usage information.');
  process.exit(1);
}

main();
|