agentic-qe 3.6.9 → 3.6.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/.validation/schemas/skill-eval.schema.json +11 -1
- package/.claude/skills/pr-review/SKILL.md +2 -2
- package/.claude/skills/qcsd-production-swarm/SKILL.md +2781 -0
- package/.claude/skills/qcsd-production-swarm/evals/qcsd-production-swarm.yaml +246 -0
- package/.claude/skills/qcsd-production-swarm/schemas/output.json +505 -0
- package/.claude/skills/qcsd-production-swarm/scripts/validate-config.json +25 -0
- package/.claude/skills/skills-manifest.json +5 -5
- package/package.json +1 -1
- package/scripts/benchmark-hnsw-loading.ts +480 -0
- package/scripts/benchmark-kg-assisted.ts +725 -0
- package/scripts/collect-production-telemetry.sh +291 -0
- package/scripts/detect-skill-conflicts.ts +347 -0
- package/scripts/eval-driven-workflow.ts +704 -0
- package/scripts/run-skill-eval.ts +210 -10
- package/scripts/score-skill-quality.ts +511 -0
- package/v3/CHANGELOG.md +19 -0
- package/v3/assets/skills/pr-review/SKILL.md +2 -2
- package/v3/dist/cli/bundle.js +1064 -363
- package/v3/dist/cli/commands/hooks.d.ts.map +1 -1
- package/v3/dist/cli/commands/hooks.js +143 -2
- package/v3/dist/cli/commands/hooks.js.map +1 -1
- package/v3/dist/cli/commands/test.d.ts.map +1 -1
- package/v3/dist/cli/commands/test.js +6 -0
- package/v3/dist/cli/commands/test.js.map +1 -1
- package/v3/dist/domains/test-generation/generators/jest-vitest-generator.d.ts.map +1 -1
- package/v3/dist/domains/test-generation/generators/jest-vitest-generator.js +58 -6
- package/v3/dist/domains/test-generation/generators/jest-vitest-generator.js.map +1 -1
- package/v3/dist/domains/test-generation/generators/mocha-generator.d.ts.map +1 -1
- package/v3/dist/domains/test-generation/generators/mocha-generator.js +79 -7
- package/v3/dist/domains/test-generation/generators/mocha-generator.js.map +1 -1
- package/v3/dist/domains/test-generation/generators/pytest-generator.d.ts +4 -0
- package/v3/dist/domains/test-generation/generators/pytest-generator.d.ts.map +1 -1
- package/v3/dist/domains/test-generation/generators/pytest-generator.js +77 -10
- package/v3/dist/domains/test-generation/generators/pytest-generator.js.map +1 -1
- package/v3/dist/domains/test-generation/interfaces/test-generator.interface.d.ts +21 -0
- package/v3/dist/domains/test-generation/interfaces/test-generator.interface.d.ts.map +1 -1
- package/v3/dist/domains/test-generation/interfaces.d.ts +21 -0
- package/v3/dist/domains/test-generation/interfaces.d.ts.map +1 -1
- package/v3/dist/domains/test-generation/services/test-generator.d.ts +22 -0
- package/v3/dist/domains/test-generation/services/test-generator.d.ts.map +1 -1
- package/v3/dist/domains/test-generation/services/test-generator.js +163 -3
- package/v3/dist/domains/test-generation/services/test-generator.js.map +1 -1
- package/v3/dist/kernel/unified-memory-hnsw.d.ts +29 -0
- package/v3/dist/kernel/unified-memory-hnsw.d.ts.map +1 -1
- package/v3/dist/kernel/unified-memory-hnsw.js +136 -0
- package/v3/dist/kernel/unified-memory-hnsw.js.map +1 -1
- package/v3/dist/kernel/unified-memory.d.ts +2 -2
- package/v3/dist/kernel/unified-memory.d.ts.map +1 -1
- package/v3/dist/kernel/unified-memory.js +7 -9
- package/v3/dist/kernel/unified-memory.js.map +1 -1
- package/v3/dist/learning/qe-hooks.d.ts.map +1 -1
- package/v3/dist/learning/qe-hooks.js +34 -3
- package/v3/dist/learning/qe-hooks.js.map +1 -1
- package/v3/dist/mcp/bundle.js +857 -329
- package/v3/package.json +1 -1
|
@@ -0,0 +1,725 @@
|
|
|
1
|
+
#!/usr/bin/env tsx
|
|
2
|
+
/**
|
|
3
|
+
* KG-Assisted QE Benchmark (Issue #266)
|
|
4
|
+
*
|
|
5
|
+
* A/B comparison: does code intelligence KG improve QE agent test generation?
|
|
6
|
+
*
|
|
7
|
+
* Protocol:
|
|
8
|
+
* Run A (Control) — aqe init --minimal --skip-patterns (no code intelligence) → test generate × N files
|
|
9
|
+
* Run B (Treatment) — aqe init --auto, then aqe code index . (KG populated) → same test generate calls
|
|
10
|
+
*
|
|
11
|
+
* Metrics captured per file:
|
|
12
|
+
* - Token usage (summed from the `token-metrics` namespace of memory.db via sqlite3)
|
|
13
|
+
* - Generated test count, assertions, coverage estimate
|
|
14
|
+
* - Wall-clock latency
|
|
15
|
+
* - Output size (chars — proxy for context consumed)
|
|
16
|
+
* - KG index stats (vectors in memory.db)
|
|
17
|
+
*
|
|
18
|
+
* Test subject: https://github.com/maxritter/claude-pilot (cloned to /tmp)
|
|
19
|
+
*
|
|
20
|
+
* Run:
|
|
21
|
+
* npx tsx scripts/benchmark-kg-assisted.ts
|
|
22
|
+
* npx tsx scripts/benchmark-kg-assisted.ts --skip-clone # reuse existing clone
|
|
23
|
+
* npx tsx scripts/benchmark-kg-assisted.ts --files 3 # fewer files for quick test
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
import { execSync, ExecSyncOptions } from 'child_process';
|
|
27
|
+
import * as fs from 'fs';
|
|
28
|
+
import * as path from 'path';
|
|
29
|
+
|
|
30
|
+
// ── Configuration ────────────────────────────────────────────────────
|
|
31
|
+
|
|
32
|
+
/** Git repository cloned as the benchmark subject. */
const REPO_URL = 'https://github.com/maxritter/claude-pilot';
/** Scratch directory that holds the clone and the results directory. */
const BENCH_DIR = '/tmp/kg-benchmark';
/** Local checkout of REPO_URL; AQE commands run with this as their working directory. */
const PROJECT_DIR = path.join(BENCH_DIR, 'claude-pilot');
/** `results` directory under BENCH_DIR (its consumer is outside this section of the file). */
const RESULTS_DIR = path.join(BENCH_DIR, 'results');
/** AQE state directory inside the checkout; memory.db is looked up here. */
const AQE_DIR = path.join(PROJECT_DIR, '.agentic-qe');

// Parse CLI args

/** Number of files benchmarked when `--files` is absent or its value is unusable. */
const DEFAULT_MAX_FILES = 10;

/**
 * Reads the value that follows `--files` in an argument list.
 *
 * A missing, non-numeric, zero or negative value falls back to `fallback`
 * instead of producing NaN/0, which would silently select no files at all.
 *
 * @param argv - Command-line arguments without the node/script prefix.
 * @param fallback - Value used when `--files` is absent or invalid.
 * @returns A positive integer file limit.
 */
function parseMaxFiles(argv: readonly string[], fallback: number = DEFAULT_MAX_FILES): number {
  const flagIndex = argv.indexOf('--files');
  if (flagIndex < 0) return fallback;
  const requested = Number.parseInt(argv[flagIndex + 1] ?? '', 10);
  return Number.isInteger(requested) && requested > 0 ? requested : fallback;
}

const args = process.argv.slice(2);
/** True when `--skip-clone` was passed (reuse an existing checkout). */
const skipClone = args.includes('--skip-clone');
/** Index of `--files` in `args`, or -1 when the flag is absent. */
const fileCountArg = args.indexOf('--files');
/** Upper bound on the number of files to benchmark; always a positive integer. */
const maxFiles = parseMaxFiles(args);
|
|
43
|
+
|
|
44
|
+
// ── File Selection ───────────────────────────────────────────────────
|
|
45
|
+
|
|
46
|
+
/** A source file selected as a benchmark subject. */
interface BenchmarkFile {
  /** Path of the file relative to the cloned project directory. */
  relativePath: string;
  /** Size bucket derived from the line count (<150 simple, <400 medium, otherwise complex). */
  complexity: 'simple' | 'medium' | 'complex';
  /** Line count as reported by `wc -l`. */
  lineCount: number;
}
|
|
51
|
+
|
|
52
|
+
/**
 * Lists the Python sources of the cloned project (excluding test paths,
 * conftest.py and __init__.py), buckets them by line count and returns a
 * complexity-balanced subset of at most `maxFiles` entries.
 *
 * Files are considered largest-first within each bucket because the listing
 * is sorted by descending line count.
 *
 * @returns The selected files; an empty array when the project has no
 *          matching sources.
 */
function discoverBenchmarkFiles(): BenchmarkFile[] {
  // `wc -l` appends a "<n> total" summary line. It is removed below by exact
  // match instead of `grep -v total`: grep would also discard real files whose
  // path contains "total", and it exits non-zero (making execSync throw) when
  // it filters out every line, e.g. when no source file matches.
  const listing = execSync(
    `find ${PROJECT_DIR} -name '*.py' ! -path '*/test*' ! -name 'conftest.py' ! -name '__init__.py' -exec wc -l {} + 2>/dev/null | sort -rn`,
    { encoding: 'utf-8' }
  );

  const files: BenchmarkFile[] = [];
  for (const rawLine of listing.split('\n')) {
    const match = rawLine.trim().match(/^(\d+)\s+(.+)$/);
    if (!match) continue;

    const absPath = match[2];
    // find prints paths rooted at PROJECT_DIR, so a bare "total" can only be
    // the wc summary line.
    if (absPath === 'total') continue;

    const lineCount = parseInt(match[1], 10);
    const complexity: BenchmarkFile['complexity'] =
      lineCount < 150 ? 'simple' : lineCount < 400 ? 'medium' : 'complex';

    files.push({ relativePath: path.relative(PROJECT_DIR, absPath), complexity, lineCount });
  }

  const inBucket = (wanted: BenchmarkFile['complexity']): BenchmarkFile[] =>
    files.filter((f) => f.complexity === wanted);

  // Select balanced set: 30% complex, 40% medium, 30% simple.
  // NOTE(review): the per-bucket caps (3/4/3) limit the selection to 10 files
  // even when --files asks for more — confirm this is intended.
  const targets = [
    { bucket: inBucket('complex'), target: Math.min(3, Math.ceil(maxFiles * 0.3)) },
    { bucket: inBucket('medium'), target: Math.min(4, Math.ceil(maxFiles * 0.4)) },
    { bucket: inBucket('simple'), target: Math.min(3, Math.ceil(maxFiles * 0.3)) },
  ];

  const selected = targets.flatMap(({ bucket, target }) => bucket.slice(0, target));

  // Rounding up per bucket can overshoot the requested total; trim the tail.
  return selected.slice(0, maxFiles);
}
|
|
92
|
+
|
|
93
|
+
// ── AQE CLI Helpers ──────────────────────────────────────────────────
|
|
94
|
+
|
|
95
|
+
/**
 * Shared `execSync` options for AQE CLI invocations: run inside the cloned
 * project, capture stdin/stdout/stderr through pipes, decode output as UTF-8
 * and give up after 300 000 ms.
 */
const EXEC_OPTS: ExecSyncOptions = {
  encoding: 'utf-8' as BufferEncoding,
  cwd: PROJECT_DIR,
  stdio: ['pipe', 'pipe', 'pipe'],
  timeout: 300_000,
};
|
|
101
|
+
|
|
102
|
+
/**
 * Runs `npx agentic-qe <subcommand>` with the shared exec options and echoes
 * the command line first.
 *
 * Best-effort: a failing command does not throw. The first line of the error
 * message is logged as a warning and whatever stdout/stderr the error carries
 * is returned, joined by a newline.
 *
 * @param subcommand - Arguments appended after `npx agentic-qe`.
 * @returns The command's captured output.
 */
function runAqe(subcommand: string): string {
  const cmd = `npx agentic-qe ${subcommand}`;
  console.log(` $ ${cmd}`);

  let captured: string;
  try {
    captured = execSync(cmd, EXEC_OPTS) as string;
  } catch (e: unknown) {
    const { stdout, stderr, message } = e as { stdout?: string; stderr?: string; message?: string };
    console.error(` [WARN] ${message?.split('\n')[0]}`);
    captured = (stdout || '') + '\n' + (stderr || '');
  }
  return captured;
}
|
|
114
|
+
|
|
115
|
+
/**
 * Deletes the project's `.agentic-qe` directory, if present, so the next
 * `init` starts from a clean state. Logs only when something was removed.
 */
function wipeAqeData(): void {
  if (!fs.existsSync(AQE_DIR)) return;

  fs.rmSync(AQE_DIR, { recursive: true, force: true });
  console.log(' Wiped .agentic-qe/');
}
|
|
121
|
+
|
|
122
|
+
// ── Token Usage ──────────────────────────────────────────────────────
|
|
123
|
+
|
|
124
|
+
/** Aggregate token counters read from the AQE memory database. */
interface TokenSnapshot {
  /** Sum of `usage.inputTokens` over the `token-metrics` entries. */
  inputTokens: number;
  /** Sum of `usage.outputTokens` over the `token-metrics` entries. */
  outputTokens: number;
  /** Sum of `usage.totalTokens` over the `token-metrics` entries. */
  totalTokens: number;
  /** Number of `qe_patterns` rows with `access_count > 0`. */
  patternsReused: number;
  /** Tokens saved; no visible code path sets this to anything but 0. */
  tokensSaved: number;
}
|
|
131
|
+
|
|
132
|
+
/**
 * Reads aggregate token usage and the pattern-reuse count from the project's
 * memory.db using the `sqlite3` command-line tool.
 *
 * The two queries are independent: a failure of one (missing table, missing
 * sqlite3 binary, timeout) leaves its fields at 0 without discarding the
 * other. Previously the pattern count was only queried when the token query
 * failed, so `patternsReused` stayed 0 whenever token metrics were readable.
 *
 * @returns A snapshot with every field 0 when memory.db does not exist.
 *          `tokensSaved` is always 0; nothing in this function populates it.
 */
function captureTokenUsage(): TokenSnapshot {
  const snapshot: TokenSnapshot = { inputTokens: 0, outputTokens: 0, totalTokens: 0, patternsReused: 0, tokensSaved: 0 };

  // Read directly from memory.db (faster than spawning the AQE CLI).
  const memoryDb = path.join(AQE_DIR, 'memory.db');
  if (!fs.existsSync(memoryDb)) return snapshot;

  try {
    // Token metrics are JSON values in kv_store under the 'token-metrics'
    // namespace; sqlite3 prints the three sums as one '|'-separated row.
    const row = execSync(
      `sqlite3 '${memoryDb}' "SELECT COALESCE(SUM(json_extract(value, '$.usage.inputTokens')),0), COALESCE(SUM(json_extract(value, '$.usage.outputTokens')),0), COALESCE(SUM(json_extract(value, '$.usage.totalTokens')),0) FROM kv_store WHERE namespace='token-metrics';" 2>/dev/null`,
      { encoding: 'utf-8', timeout: 5_000, cwd: PROJECT_DIR }
    ).trim();

    if (row) {
      const [input, output, total] = row.split('|');
      snapshot.inputTokens = parseInt(input, 10) || 0;
      snapshot.outputTokens = parseInt(output, 10) || 0;
      snapshot.totalTokens = parseInt(total, 10) || 0;
    }
  } catch { /* kv_store may not exist, or sqlite3 is unavailable */ }

  try {
    const reused = execSync(
      `sqlite3 '${memoryDb}' "SELECT COUNT(*) FROM qe_patterns WHERE access_count > 0;" 2>/dev/null`,
      { encoding: 'utf-8', timeout: 5_000, cwd: PROJECT_DIR }
    ).trim();
    snapshot.patternsReused = parseInt(reused, 10) || 0;
  } catch { /* qe_patterns may not exist */ }

  return snapshot;
}
|
|
169
|
+
|
|
170
|
+
// ── Output Parsing ───────────────────────────────────────────────────
|
|
171
|
+
|
|
172
|
+
/** Metrics scraped from the text output of one `test generate` invocation. */
interface ParsedOutput {
  /** N from the "Generated N test…" line, or 0 when absent. */
  testsGenerated: number;
  /** Assertion count reported by the CLI, or counted heuristically from the output. */
  assertions: number;
  /** Percentage from the "Coverage Estimate: N%" line, or 0 when absent. */
  coverageEstimate: number;
  /** Occurrences of "pattern", "SONA" or "DecisionTransformer" in the output. */
  patternsUsed: number;
  /** Count of mock declarations (vi.mock / jest.mock / @patch / unittest.mock import). */
  kgMocksGenerated: number;
  /** Count of Knowledge-Graph / dependency-analysis mentions in the output. */
  kgDependenciesFound: number;
  /** Count of similarity references in the output. */
  kgSimilarityRefs: number;
  /** Estimated file reads avoided: one per mocked or patched module. */
  fileReadsAvoided: number;
  /** Estimated prompt tokens, using a chars / 4 approximation. */
  estimatedPromptTokens: number;
  /** Estimated output tokens: output length / 4. */
  estimatedOutputTokens: number;
}
|
|
190
|
+
|
|
191
|
+
/**
 * Heuristically counts assertion-like constructs in generated test code by
 * summing the matches of several framework-specific regular expressions.
 *
 * The patterns are applied independently, so one statement can contribute
 * more than once (for example `expect(x).toBe(1)` matches both the `expect(`
 * and the `.toBe` pattern).
 *
 * @param code - Generated test code (or raw CLI output containing it).
 * @returns Total number of pattern matches.
 */
function countAssertions(code: string): number {
  const assertionPatterns: RegExp[] = [
    /assert\s+/g, // Python assert
    /self\.assert\w+/g, // unittest assertions
    /pytest\.raises/g, // pytest raises
    /expect\s*\(/g, // Jest/Vitest expect()
    /\.to(?:Be|Equal|Have|Throw|Match|Contain)/g, // Jest matchers
    /\.should\./g, // Chai should
    /test_\w+/g, // Python test functions (each is an assertion unit)
  ];

  return assertionPatterns.reduce(
    (total, pattern) => total + (code.match(pattern)?.length ?? 0),
    0
  );
}
|
|
209
|
+
|
|
210
|
+
/**
 * Extracts benchmark metrics from the text output of `test generate`.
 *
 * @param output - Combined CLI output for one file.
 * @param sourceChars - Length of the source file, used for the prompt-token estimate.
 * @returns Parsed counters; anything not found in the output is 0.
 */
function parseTestGenerationOutput(output: string, sourceChars: number): ParsedOutput {
  const countMatches = (pattern: RegExp): number => output.match(pattern)?.length ?? 0;

  // "Generated N tests"
  const genMatch = output.match(/Generated\s+(\d+)\s+test/i);
  const testsGenerated = genMatch ? parseInt(genMatch[1], 10) : 0;

  // "Assertions: N" from CLI output. Fall back to counting in the generated
  // code only when the CLI did not report a value at all; an explicit
  // "Assertions: 0" is trusted rather than replaced by the heuristic count.
  const assertMatch = output.match(/Assertions:\s*(\d+)/i);
  const assertions = assertMatch ? parseInt(assertMatch[1], 10) : countAssertions(output);

  // "Coverage Estimate: N%". The character class also accepts strings such as
  // "." that parseFloat cannot read, so guard against NaN.
  const covMatch = output.match(/Coverage\s+Estimate:\s*([\d.]+)%/i);
  const parsedCoverage = covMatch ? parseFloat(covMatch[1]) : 0;
  const coverageEstimate = Number.isFinite(parsedCoverage) ? parsedCoverage : 0;

  // Count pattern-related mentions
  const patternsUsed = countMatches(/pattern|SONA|DecisionTransformer/gi);

  // KG-specific metrics
  // Mock declarations: "vi.mock('...')" or "jest.mock('...')" or "@patch('...')"
  const kgMocksGenerated = countMatches(/(?:vi|jest)\.mock\(['"]|@patch\(['"]|from unittest\.mock import/g);

  // Mentions of "Knowledge Graph", "dependency analysis" or auto-generated mocks
  const kgDependenciesFound = countMatches(/Knowledge Graph|dependency analysis|Auto-generated mocks from/gi);

  // KG similarity references
  const kgSimilarityRefs = countMatches(/Similar.*module|similarity:|similar code/gi);

  // File reads avoided: each mocked/patched module = 1 file read the agent didn't need
  const fileReadsAvoided =
    countMatches(/(?:vi|jest)\.mock\(['"]([^'"]+)['"]/g) +
    countMatches(/@patch\(['"]([^'"]+)['"]/g);

  // Token estimation: ~4 chars per token for code.
  // Prompt = source code + template (~500 chars) + KG context (rough estimate).
  const kgContextChars = kgDependenciesFound * 80 + kgSimilarityRefs * 200;
  const estimatedPromptTokens = Math.round((sourceChars + 500 + kgContextChars) / 4);
  const estimatedOutputTokens = Math.round(output.length / 4);

  return {
    testsGenerated, assertions, coverageEstimate, patternsUsed,
    kgMocksGenerated, kgDependenciesFound, kgSimilarityRefs,
    fileReadsAvoided, estimatedPromptTokens, estimatedOutputTokens,
  };
}
|
|
262
|
+
|
|
263
|
+
// ── Test Generation ──────────────────────────────────────────────────
|
|
264
|
+
|
|
265
|
+
/** Outcome of running test generation for a single benchmark file. */
interface GenerationResult {
  /** Path of the file relative to the cloned project. */
  file: string;
  /** Complexity bucket of the file. */
  complexity: string;
  /** Line count of the file. */
  lineCount: number;
  /** Length of the file's source text in characters. */
  sourceChars: number;
  /** Elapsed time of the generate command in milliseconds. */
  wallTimeMs: number;
  /** Metrics parsed from the command output. */
  parsed: ParsedOutput;
  /** Length of the raw output in characters. */
  outputLength: number;
  /** Unmodified command output. */
  rawOutput: string;
}
|
|
275
|
+
|
|
276
|
+
/**
 * Runs `npx agentic-qe test generate` (unit tests, pytest framework) for one
 * file and measures wall-clock time and parsed output metrics.
 *
 * Best-effort: a failing or timed-out command does not throw. A warning is
 * logged (matching runAqe) and whatever output the error carries is parsed,
 * so a failed generation is visible in the log instead of appearing only as
 * a row of zeros.
 *
 * @param file - The benchmark file to generate tests for.
 * @returns Timing, raw output and parsed metrics for the file.
 */
function generateTestsForFile(file: BenchmarkFile): GenerationResult {
  const absPath = path.join(PROJECT_DIR, file.relativePath);
  const sourceChars = fs.readFileSync(absPath, 'utf-8').length;

  const t0 = performance.now();

  // CLI: aqe test generate <file> --type unit --framework pytest
  // POSIX single-quote escaping: close the quote, emit an escaped quote, reopen.
  const escapedPath = file.relativePath.replace(/'/g, "'\\''");
  let output: string;
  try {
    output = execSync(
      `npx agentic-qe test generate '${escapedPath}' --type unit --framework pytest 2>&1`,
      { ...EXEC_OPTS, timeout: 120_000 }
    ) as string;
  } catch (e: unknown) {
    const err = e as { stdout?: string; stderr?: string; message?: string };
    console.error(` [WARN] test generate failed for ${file.relativePath}: ${err.message?.split('\n')[0] ?? 'unknown error'}`);
    output = (err.stdout || '') + '\n' + (err.stderr || '');
  }

  const wallTimeMs = performance.now() - t0;

  return {
    file: file.relativePath,
    complexity: file.complexity,
    lineCount: file.lineCount,
    sourceChars,
    wallTimeMs,
    parsed: parseTestGenerationOutput(output, sourceChars),
    outputLength: output.length,
    rawOutput: output,
  };
}
|
|
310
|
+
|
|
311
|
+
// ── KG Stats ─────────────────────────────────────────────────────────
|
|
312
|
+
|
|
313
|
+
/** Size indicators for the knowledge-graph data stored in memory.db. */
interface KGStats {
  /** Row count of the `vectors` table. */
  vectorCount: number;
  /** Row count of `qe_patterns` plus row count of `kv_store`. */
  patternCount: number;
  /** True when at least one vector row exists. */
  hasKG: boolean;
  /** Optional duration filled in by the caller — presumably milliseconds; confirm at the call site. */
  indexDuration?: number;
  /** Size of memory.db on disk in bytes. */
  dbSizeBytes: number;
}
|
|
320
|
+
|
|
321
|
+
/**
 * Collects knowledge-graph size statistics from the project's memory.db using
 * the `sqlite3` command-line tool.
 *
 * Each table is counted independently, so a table that does not exist (for
 * example `qe_patterns` after an init that skipped patterns) contributes 0
 * instead of wiping out the other counts and the database size.
 *
 * @returns All-zero stats with `hasKG: false` when memory.db does not exist.
 */
function getKGStats(): KGStats {
  const memoryDb = path.join(AQE_DIR, 'memory.db');
  if (!fs.existsSync(memoryDb)) {
    return { vectorCount: 0, patternCount: 0, hasKG: false, dbSizeBytes: 0 };
  }

  // Row count of one table; 0 when the table is missing or sqlite3 fails.
  // The timeout mirrors the one used for the token-usage queries.
  const countRows = (table: string): number => {
    try {
      const out = execSync(
        `sqlite3 '${memoryDb}' "SELECT COUNT(*) FROM ${table};" 2>/dev/null`,
        { encoding: 'utf-8', timeout: 5_000 }
      ).trim();
      return parseInt(out, 10) || 0;
    } catch {
      return 0;
    }
  };

  let dbSizeBytes = 0;
  try {
    dbSizeBytes = fs.statSync(memoryDb).size;
  } catch { /* file vanished between existsSync and statSync */ }

  // KG data lives in `vectors` table (code index) and `kv_store` (KG nodes/edges);
  // qe_patterns holds learned patterns. kv_store rows are folded into patternCount.
  const vectorCount = countRows('vectors');
  const kvCount = countRows('kv_store');
  const patternCount = countRows('qe_patterns') + kvCount;

  return { vectorCount, patternCount, hasKG: vectorCount > 0, dbSizeBytes };
}
|
|
356
|
+
|
|
357
|
+
// ── Benchmark Run ────────────────────────────────────────────────────
|
|
358
|
+
|
|
359
|
+
/** Everything recorded for one benchmark run (control or treatment). */
interface RunResult {
  /** Human-readable name of the run, printed in the banner. */
  label: string;
  /** Knowledge-graph statistics captured after initialization. */
  kgStats: KGStats;
  /** Output of the `init` command. */
  initOutput: string;
  /** Initialization time in milliseconds, including code indexing when it ran. */
  initDurationMs: number;
  /** Per-file generation results, in the order the files were processed. */
  files: GenerationResult[];
  /** Sum of the per-file wall times in milliseconds. */
  totalWallTimeMs: number;
  /** Token counters captured once at the end of the run. */
  aggregateTokens: TokenSnapshot;
}
|
|
368
|
+
|
|
369
|
+
/**
 * Executes one benchmark run: wipes AQE state, initializes the project,
 * optionally builds the code index, generates tests for every file and
 * captures token usage at the end.
 *
 * @param label - Name printed in the run banner and stored in the result.
 * @param files - Files to generate tests for, processed in order.
 * @param useAutoInit - true runs `init --auto` followed by `code index .`
 *                      (treatment); false runs `init --minimal --skip-patterns`
 *                      (control).
 * @returns Timings, KG statistics, per-file results and token counters.
 */
function runBenchmark(label: string, files: BenchmarkFile[], useAutoInit: boolean): RunResult {
  const banner = '═'.repeat(60);
  console.log(`\n${banner}`);
  console.log(` ${label}`);
  console.log(`${banner}`);

  // Step 1: start from a clean slate, then initialize
  wipeAqeData();

  const initStart = performance.now();
  const initOutput = runAqe(useAutoInit ? 'init --auto' : 'init --minimal --skip-patterns');
  let initDurationMs = performance.now() - initStart;
  console.log(` Init completed in ${(initDurationMs / 1000).toFixed(1)}s`);

  // Treatment only: explicitly index the code so the KG is populated
  if (useAutoInit) {
    console.log(' Running code intelligence indexing...');
    const indexStart = performance.now();
    const indexOutput = runAqe('code index .');
    const indexMs = performance.now() - indexStart;
    initDurationMs += indexMs;
    console.log(` Code index completed in ${(indexMs / 1000).toFixed(1)}s`);

    // Echo the indexer's own summary when it printed one
    const indexedFiles = indexOutput.match(/Files indexed:\s*(\d+)/);
    if (indexedFiles) {
      const nodes = indexOutput.match(/Nodes created:\s*(\d+)/)?.[1] || '?';
      const edges = indexOutput.match(/Edges created:\s*(\d+)/)?.[1] || '?';
      console.log(` Indexed: ${indexedFiles[1]} files, ${nodes} nodes, ${edges} edges`);
    }
  }

  // Step 2: KG stats
  const kgStats = getKGStats();
  if (useAutoInit) {
    // NOTE(review): this stores init + index time, not the index time alone — confirm intent.
    kgStats.indexDuration = initDurationMs;
  }
  console.log(` KG vectors: ${kgStats.vectorCount} | patterns: ${kgStats.patternCount} | DB: ${(kgStats.dbSizeBytes / 1024).toFixed(0)}KB`);

  // Step 3: generate tests file by file
  const results: GenerationResult[] = [];
  let totalWallTimeMs = 0;

  for (const [index, file] of files.entries()) {
    console.log(`\n [${index + 1}/${files.length}] ${file.relativePath} (${file.complexity}, ${file.lineCount} lines)`);

    const result = generateTestsForFile(file);
    results.push(result);
    totalWallTimeMs += result.wallTimeMs;

    const { parsed } = result;
    console.log(` Tests: ${parsed.testsGenerated} | Assertions: ${parsed.assertions} | Coverage: ${parsed.coverageEstimate}% | Time: ${(result.wallTimeMs / 1000).toFixed(1)}s`);
    console.log(` Output: ${result.outputLength} chars | Patterns: ${parsed.patternsUsed}`);
    console.log(` Tokens (est): prompt=${parsed.estimatedPromptTokens} output=${parsed.estimatedOutputTokens} | KG mocks: ${parsed.kgMocksGenerated} | File reads avoided: ${parsed.fileReadsAvoided}`);
  }

  // Token usage is captured once per run (avoids per-file process overhead)
  console.log('\n Capturing token usage...');
  const aggregateTokens = captureTokenUsage();
  console.log(` Tokens: in=${aggregateTokens.inputTokens} out=${aggregateTokens.outputTokens} total=${aggregateTokens.totalTokens} saved=${aggregateTokens.tokensSaved}`);

  return {
    label,
    kgStats,
    initOutput,
    initDurationMs,
    files: results,
    totalWallTimeMs,
    aggregateTokens,
  };
}
|
|
434
|
+
|
|
435
|
+
// ── Comparison ───────────────────────────────────────────────────────
|
|
436
|
+
|
|
437
|
+
/** Flat, all-numeric view of one file's results, used for comparison and deltas. */
interface FileMetrics {
  /** Number of tests generated. */
  tests: number;
  /** Number of assertions. */
  assertions: number;
  /** Coverage estimate in percent. */
  coverage: number;
  /** Wall-clock time in milliseconds. */
  wallMs: number;
  /** Output length in characters. */
  outputChars: number;
  /** Pattern mentions found in the output. */
  patterns: number;
  /** Mock declarations found in the output. */
  kgMocks: number;
  /** Estimated file reads avoided. */
  fileReadsAvoided: number;
  /** Estimated prompt tokens. */
  estPromptTokens: number;
  /** Estimated output tokens. */
  estOutputTokens: number;
}
|
|
443
|
+
|
|
444
|
+
/** Control-versus-treatment comparison for one benchmark file. */
interface FileComparison {
  /** Path of the file relative to the cloned project. */
  file: string;
  /** Complexity bucket of the file. */
  complexity: string;
  /** Line count of the file. */
  lineCount: number;
  /** Metrics from the control run. */
  control: FileMetrics;
  /** Metrics from the treatment run. */
  treatment: FileMetrics;
  /** Per-metric difference between treatment and control. */
  delta: FileMetrics;
}
|
|
452
|
+
|
|
453
|
+
/**
 * Flattens a generation result into the numeric metric record used for
 * comparisons.
 *
 * @param r - Result of generating tests for one file.
 * @returns The result's timing, size and parsed counters as plain numbers.
 */
function buildMetrics(r: GenerationResult): FileMetrics {
  const { parsed, wallTimeMs, outputLength } = r;
  return {
    tests: parsed.testsGenerated,
    assertions: parsed.assertions,
    coverage: parsed.coverageEstimate,
    wallMs: wallTimeMs,
    outputChars: outputLength,
    patterns: parsed.patternsUsed,
    kgMocks: parsed.kgMocksGenerated,
    fileReadsAvoided: parsed.fileReadsAvoided,
    estPromptTokens: parsed.estimatedPromptTokens,
    estOutputTokens: parsed.estimatedOutputTokens,
  };
}
|
|
467
|
+
|
|
468
|
+
/**
 * Computes `b - a` for every metric present on `a`.
 *
 * @param a - Baseline metrics (subtracted).
 * @param b - Metrics to compare against the baseline.
 * @returns A record holding the per-metric differences.
 */
function deltaMetrics(a: FileMetrics, b: FileMetrics): FileMetrics {
  const metricNames = Object.keys(a) as (keyof FileMetrics)[];
  return metricNames.reduce((diff, name) => {
    diff[name] = b[name] - a[name];
    return diff;
  }, {} as FileMetrics);
}
|
|
475
|
+
|
|
476
|
+
/**
 * Pairs the control and treatment results by position and computes the
 * per-file metrics and their deltas (treatment minus control).
 *
 * Both runs are expected to contain results for the same files in the same
 * order; entries are matched by index, not by file name.
 *
 * @param control - Run without the knowledge graph.
 * @param treatment - Run with the knowledge graph.
 * @returns One comparison per control file.
 */
function compare(control: RunResult, treatment: RunResult): FileComparison[] {
  const comparisons: FileComparison[] = [];

  control.files.forEach((controlResult, index) => {
    const treatmentResult = treatment.files[index];
    const controlMetrics = buildMetrics(controlResult);
    const treatmentMetrics = buildMetrics(treatmentResult);

    comparisons.push({
      file: controlResult.file,
      complexity: controlResult.complexity,
      lineCount: controlResult.lineCount,
      control: controlMetrics,
      treatment: treatmentMetrics,
      delta: deltaMetrics(controlMetrics, treatmentMetrics),
    });
  });

  return comparisons;
}
|
|
491
|
+
|
|
492
|
+
// ── Summary Printing ─────────────────────────────────────────────────
|
|
493
|
+
|
|
494
|
+
/**
 * Prints the boxed benchmark report: initialization cost, a per-file table,
 * aggregate totals, estimated token efficiency and the key findings.
 *
 * Compared with the previous version, two formatting defects are fixed: the
 * Q2 line no longer prints "+-N assertions" when the assertion delta is
 * negative, and the Q3 line no longer divides by a zero control wall time.
 *
 * @param control - Run without the knowledge graph.
 * @param treatment - Run with the knowledge graph.
 * @param comparisons - Per-file comparisons of the two runs.
 */
function printSummary(control: RunResult, treatment: RunResult, comparisons: FileComparison[]): void {
  const W = 76;
  const hr = '═'.repeat(W);
  const thinRule = `╟${'─'.repeat(W)}╢`;
  const title = 'KG-ASSISTED QE BENCHMARK RESULTS';
  const line = (s: string) => console.log(`║ ${s.padEnd(W - 4)} ║`);
  const blank = () => line('');
  // Prefixes non-negative values with '+'; negative values already carry '-'.
  const signed = (value: number, text: string = String(value)): string => (value >= 0 ? '+' : '') + text;
  const sumOf = (pick: (c: FileComparison) => number): number => comparisons.reduce((s, c) => s + pick(c), 0);

  console.log(`\n╔${hr}╗`);
  // Center the title: left-pad to the midpoint, then right-pad to full width.
  console.log(`║${title.padStart((W + title.length) / 2).padEnd(W)}║`);
  console.log(`╠${hr}╣`);

  blank();
  line('INITIALIZATION');
  line(` Control (no KG): ${(control.initDurationMs / 1000).toFixed(1).padStart(6)}s | Vectors: ${control.kgStats.vectorCount} | DB: ${(control.kgStats.dbSizeBytes / 1024).toFixed(0)}KB`);
  line(` Treatment (KG): ${(treatment.initDurationMs / 1000).toFixed(1).padStart(6)}s | Vectors: ${treatment.kgStats.vectorCount} | DB: ${(treatment.kgStats.dbSizeBytes / 1024).toFixed(0)}KB`);

  blank();
  line('PER-FILE COMPARISON');
  console.log(thinRule);
  line('File Ctrl KG ΔTests ΔAssert ΔCov% ΔTime');
  console.log(thinRule);

  for (const c of comparisons) {
    // File name is padded and then cut to a fixed 26-character column.
    const name = path.basename(c.file).padEnd(26).slice(0, 26);
    const ctrl = String(c.control.tests).padStart(4);
    const kg = String(c.treatment.tests).padStart(5);
    const dt = signed(c.delta.tests).padStart(7);
    const da = signed(c.delta.assertions).padStart(8);
    const dc = signed(c.delta.coverage, c.delta.coverage.toFixed(1)).padStart(6);
    const dw = signed(c.delta.wallMs, (c.delta.wallMs / 1000).toFixed(1) + 's').padStart(7);
    line(`${name} ${ctrl} ${kg} ${dt} ${da} ${dc} ${dw}`);
  }

  console.log(thinRule);

  // Aggregates
  const sumCtrl = {
    tests: sumOf((c) => c.control.tests),
    assertions: sumOf((c) => c.control.assertions),
    coverage: sumOf((c) => c.control.coverage),
  };
  const sumKg = {
    tests: sumOf((c) => c.treatment.tests),
    assertions: sumOf((c) => c.treatment.assertions),
    coverage: sumOf((c) => c.treatment.coverage),
  };

  // Avoid dividing by zero when there is nothing to average.
  const n = comparisons.length || 1;
  const ctrlTok = control.aggregateTokens;
  const kgTok = treatment.aggregateTokens;

  // Estimated token totals derived from per-file data
  const estCtrlPrompt = sumOf((c) => c.control.estPromptTokens);
  const estCtrlOutput = sumOf((c) => c.control.estOutputTokens);
  const estKgPrompt = sumOf((c) => c.treatment.estPromptTokens);
  const estKgOutput = sumOf((c) => c.treatment.estOutputTokens);
  const totalKgMocks = sumOf((c) => c.treatment.kgMocks);
  const totalFileReadsAvoided = sumOf((c) => c.treatment.fileReadsAvoided);

  blank();
  line('AGGREGATES');
  line(` Tests generated: Control=${sumCtrl.tests} KG=${sumKg.tests} Δ=${sumKg.tests - sumCtrl.tests}`);
  line(` Total assertions: Control=${sumCtrl.assertions} KG=${sumKg.assertions} Δ=${sumKg.assertions - sumCtrl.assertions}`);
  line(` Avg coverage: Control=${(sumCtrl.coverage / n).toFixed(1)}% KG=${(sumKg.coverage / n).toFixed(1)}%`);
  line(` Token usage (DB): Control=${ctrlTok.totalTokens} KG=${kgTok.totalTokens} Δ=${kgTok.totalTokens - ctrlTok.totalTokens}`);
  line(` Tokens saved: Control=${ctrlTok.tokensSaved} KG=${kgTok.tokensSaved}`);
  line(` Patterns reused: Control=${ctrlTok.patternsReused} KG=${kgTok.patternsReused}`);
  line(` Wall time: Control=${(control.totalWallTimeMs / 1000).toFixed(1)}s KG=${(treatment.totalWallTimeMs / 1000).toFixed(1)}s`);

  blank();
  line('TOKEN EFFICIENCY (estimated ~4 chars/token)');
  line(` Prompt tokens: Control=${estCtrlPrompt} KG=${estKgPrompt} Δ=${estKgPrompt - estCtrlPrompt}`);
  line(` Output tokens: Control=${estCtrlOutput} KG=${estKgOutput} Δ=${estKgOutput - estCtrlOutput}`);
  line(` Total estimated: Control=${estCtrlPrompt + estCtrlOutput} KG=${estKgPrompt + estKgOutput} Δ=${(estKgPrompt + estKgOutput) - (estCtrlPrompt + estCtrlOutput)}`);
  line(` KG mocks generated: ${totalKgMocks} (each = 1 fewer file read for agent)`);
  line(` File reads avoided: ${totalFileReadsAvoided} (dependency info from KG instead of fs)`);

  blank();
  line('KEY FINDINGS (Issue #266 Questions)');
  const tokenDelta = kgTok.totalTokens - ctrlTok.totalTokens;
  const estTokenDelta = (estKgPrompt + estKgOutput) - (estCtrlPrompt + estCtrlOutput);

  let q1Db: string;
  if (tokenDelta < 0) q1Db = 'YES (DB: saved ' + Math.abs(tokenDelta) + ')';
  else if (tokenDelta === 0) q1Db = 'NO CHANGE (DB)';
  else q1Db = 'NO (DB: +' + tokenDelta + ')';
  const q1Est = estTokenDelta < 0 ? 'saved ' + Math.abs(estTokenDelta) : '+' + estTokenDelta;
  line(` Q1: KG reduces tokens? ${q1Db} | Est: ${q1Est}`);

  // The test delta is positive on the YES branch; the assertion delta may not
  // be, so it is formatted with its real sign.
  const q2 = sumKg.tests > sumCtrl.tests
    ? 'YES (+' + (sumKg.tests - sumCtrl.tests) + ' tests, ' + signed(sumKg.assertions - sumCtrl.assertions) + ' assertions)'
    : 'NO CHANGE or WORSE';
  line(` Q2: KG improves quality? ${q2}`);

  // A relative slowdown is undefined without a positive control time.
  let q3: string;
  if (control.totalWallTimeMs <= 0) q3 = 'N/A (no control timing)';
  else if (treatment.totalWallTimeMs < control.totalWallTimeMs * 1.5) q3 = 'YES (overhead < 50%)';
  else q3 = 'NEEDS REVIEW (>' + ((treatment.totalWallTimeMs / control.totalWallTimeMs - 1) * 100).toFixed(0) + '% slower)';
  line(` Q3: Latency justified? ${q3}`);

  line(` Q4: File reads avoided? ${totalFileReadsAvoided > 0 ? 'YES (' + totalFileReadsAvoided + ' reads avoided via KG deps)' : 'NO (KG deps not yet used for mock generation)'}`);

  // Q5: By complexity
  for (const cx of ['simple', 'medium', 'complex'] as const) {
    const subset = comparisons.filter((c) => c.complexity === cx);
    if (subset.length === 0) continue;
    const ct = subset.reduce((s, c) => s + c.control.tests, 0);
    const kt = subset.reduce((s, c) => s + c.treatment.tests, 0);
    const ca = subset.reduce((s, c) => s + c.control.assertions, 0);
    const ka = subset.reduce((s, c) => s + c.treatment.assertions, 0);
    const fr = subset.reduce((s, c) => s + c.treatment.fileReadsAvoided, 0);
    line(` Q5 [${cx.padEnd(7)}]: tests ${ct}→${kt} assertions ${ca}→${ka} file reads avoided: ${fr}`);
  }

  blank();
  console.log(`╚${hr}╝`);
}
|
|
594
|
+
|
|
595
|
+
// ── Main ─────────────────────────────────────────────────────────────
|
|
596
|
+
|
|
597
|
+
/**
 * Runs the benchmark end to end.
 *
 * Steps, in order: print the banner, clone the test project (or verify an
 * existing clone when `skipClone` is set), select benchmark files, execute the
 * control run and the KG-assisted treatment run, print the comparison summary,
 * write a JSON report into `RESULTS_DIR`, and dump every run's raw output into
 * `RESULTS_DIR/raw` for manual diffing.
 *
 * Calls `process.exit(1)` when the existing clone is missing or when no source
 * files are selected.
 */
async function main(): Promise<void> {
  console.log('╔══════════════════════════════════════════════════════════════╗');
  console.log('║ KG-Assisted QE Benchmark (Issue #266) ║');
  console.log('║ Test subject: github.com/maxritter/claude-pilot ║');
  console.log('╚══════════════════════════════════════════════════════════════╝');
  console.log(` Timestamp: ${new Date().toISOString()}`);
  console.log(` Max files: ${maxFiles}`);
  console.log(` Skip clone: ${skipClone}`);

  // Step 1: Clone
  if (!skipClone) {
    console.log('\n── 1. Cloning test project ──────────────────────────');
    // Always start from a fresh checkout so stale files cannot skew the run.
    if (fs.existsSync(PROJECT_DIR)) {
      fs.rmSync(PROJECT_DIR, { recursive: true, force: true });
    }
    fs.mkdirSync(BENCH_DIR, { recursive: true });
    // Quote both arguments: the command goes through a shell, so an unquoted
    // path containing whitespace would be split into several arguments.
    execSync(`git clone "${REPO_URL}" "${PROJECT_DIR}"`, { stdio: 'inherit' });
  } else {
    console.log('\n── 1. Using existing clone ──────────────────────────');
    if (!fs.existsSync(PROJECT_DIR)) {
      console.error(`ERROR: ${PROJECT_DIR} does not exist. Run without --skip-clone.`);
      process.exit(1);
    }
  }

  // Step 2: Select files
  console.log('\n── 2. Selecting benchmark files ─────────────────────');
  const files = discoverBenchmarkFiles();
  console.log(` Selected ${files.length} files:`);
  for (const f of files) {
    console.log(` [${f.complexity.padEnd(7)}] ${f.relativePath} (${f.lineCount} lines)`);
  }
  if (files.length === 0) {
    console.error('ERROR: No source files found');
    process.exit(1);
  }

  // Step 3: Run A — Control
  console.log('\n── 3. Run A: Control (no KG) ────────────────────────');
  const control = runBenchmark('CONTROL (no KG)', files, false);

  // Step 4: Run B — Treatment
  console.log('\n── 4. Run B: Treatment (with KG) ────────────────────');
  const treatment = runBenchmark('TREATMENT (KG-assisted)', files, true);

  // Step 5: Compare & report
  console.log('\n── 5. Results ───────────────────────────────────────');
  const comparisons = compare(control, treatment);
  printSummary(control, treatment, comparisons);

  // Step 6: Save JSON
  fs.mkdirSync(RESULTS_DIR, { recursive: true });
  const resultsPath = path.join(RESULTS_DIR, `kg-benchmark-${Date.now()}.json`);

  // Sums a numeric projection over a list; used for every aggregate below.
  const sumBy = <T>(items: readonly T[], pick: (item: T) => number): number =>
    items.reduce((acc, item) => acc + pick(item), 0);

  // Serializable view of one run. Per-file raw output is deliberately left out
  // of the JSON report; it is written to separate files further down.
  const summarizeRun = (run: typeof control) => ({
    label: run.label,
    kgStats: run.kgStats,
    initDurationMs: run.initDurationMs,
    totalWallTimeMs: run.totalWallTimeMs,
    aggregateTokens: run.aggregateTokens,
    files: run.files.map((f) => ({
      file: f.file, complexity: f.complexity, lineCount: f.lineCount, sourceChars: f.sourceChars,
      wallTimeMs: f.wallTimeMs, outputLength: f.outputLength, parsed: f.parsed,
    })),
  });

  const totalFileReadsAvoided = sumBy(comparisons, (c) => c.treatment.fileReadsAvoided);

  const fullResults = {
    timestamp: new Date().toISOString(),
    testSubject: REPO_URL,
    filesCount: files.length,
    control: summarizeRun(control),
    treatment: summarizeRun(treatment),
    comparisons,
    keyFindings: {
      q1_kg_reduces_tokens: treatment.aggregateTokens.totalTokens < control.aggregateTokens.totalTokens,
      q2_kg_improves_quality: sumBy(comparisons, (c) => c.delta.tests) > 0,
      // "Justified" means the treatment run took less than 1.5x the control wall time.
      q3_latency_justified: treatment.totalWallTimeMs < control.totalWallTimeMs * 1.5,
      q4_file_reads_avoided: totalFileReadsAvoided,
      q5_complexity_breakdown: Object.fromEntries(
        (['simple', 'medium', 'complex'] as const).map((cx) => {
          const subset = comparisons.filter((c) => c.complexity === cx);
          return [cx, {
            files: subset.length,
            controlTests: sumBy(subset, (c) => c.control.tests),
            kgTests: sumBy(subset, (c) => c.treatment.tests),
            controlAssertions: sumBy(subset, (c) => c.control.assertions),
            kgAssertions: sumBy(subset, (c) => c.treatment.assertions),
            fileReadsAvoided: sumBy(subset, (c) => c.treatment.fileReadsAvoided),
          }];
        })
      ),
      tokenEfficiency: {
        controlEstPromptTokens: sumBy(comparisons, (c) => c.control.estPromptTokens),
        controlEstOutputTokens: sumBy(comparisons, (c) => c.control.estOutputTokens),
        kgEstPromptTokens: sumBy(comparisons, (c) => c.treatment.estPromptTokens),
        kgEstOutputTokens: sumBy(comparisons, (c) => c.treatment.estOutputTokens),
        totalKgMocksGenerated: sumBy(comparisons, (c) => c.treatment.kgMocks),
        totalFileReadsAvoided,
      },
    },
  };

  fs.writeFileSync(resultsPath, JSON.stringify(fullResults, null, 2));
  console.log(`\n Results JSON: ${resultsPath}`);

  // Save raw outputs for manual diff
  const rawDir = path.join(RESULTS_DIR, 'raw');
  fs.mkdirSync(rawDir, { recursive: true });

  // Writes one text file per benchmarked file. The first file with a given
  // basename keeps the plain `<prefix>-<basename>.txt` name; later files that
  // share that basename (e.g. several `__init__.py`) would otherwise overwrite
  // it, so they get a name derived from their whole path instead.
  const writeRawOutputs = (prefix: string, run: typeof control): void => {
    const usedStems = new Set<string>();
    for (const r of run.files) {
      let stem = path.basename(r.file, '.py');
      if (usedStems.has(stem)) {
        // Flatten the path into a single file-name-safe token.
        stem = r.file.replace(/\.py$/, '').split(/[\\/:]+/).filter(Boolean).join('__');
      }
      // Last-resort numeric suffix in case the flattened name is also taken.
      let candidate = stem;
      for (let n = 2; usedStems.has(candidate); n++) {
        candidate = `${stem}-${n}`;
      }
      usedStems.add(candidate);
      fs.writeFileSync(path.join(rawDir, `${prefix}-${candidate}.txt`), r.rawOutput);
    }
  };

  writeRawOutputs('control', control);
  writeRawOutputs('treatment', treatment);
  console.log(` Raw outputs: ${rawDir}/`);
}
|
|
721
|
+
|
|
722
|
+
// Script entry point: start the benchmark and, should the returned promise
// reject, log the failure and terminate with a non-zero exit status.
main().catch((failure) => {
  console.error('Benchmark failed:', failure);
  process.exit(1);
});
|