agentic-qe 3.6.9 → 3.6.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. package/.claude/skills/.validation/schemas/skill-eval.schema.json +11 -1
  2. package/.claude/skills/pr-review/SKILL.md +2 -2
  3. package/.claude/skills/qcsd-production-swarm/SKILL.md +2781 -0
  4. package/.claude/skills/qcsd-production-swarm/evals/qcsd-production-swarm.yaml +246 -0
  5. package/.claude/skills/qcsd-production-swarm/schemas/output.json +505 -0
  6. package/.claude/skills/qcsd-production-swarm/scripts/validate-config.json +25 -0
  7. package/.claude/skills/skills-manifest.json +5 -5
  8. package/package.json +1 -1
  9. package/scripts/benchmark-hnsw-loading.ts +480 -0
  10. package/scripts/benchmark-kg-assisted.ts +725 -0
  11. package/scripts/collect-production-telemetry.sh +291 -0
  12. package/scripts/detect-skill-conflicts.ts +347 -0
  13. package/scripts/eval-driven-workflow.ts +704 -0
  14. package/scripts/run-skill-eval.ts +210 -10
  15. package/scripts/score-skill-quality.ts +511 -0
  16. package/v3/CHANGELOG.md +44 -0
  17. package/v3/assets/skills/pr-review/SKILL.md +2 -2
  18. package/v3/dist/cli/bundle.js +1526 -700
  19. package/v3/dist/cli/commands/code.d.ts.map +1 -1
  20. package/v3/dist/cli/commands/code.js +9 -85
  21. package/v3/dist/cli/commands/code.js.map +1 -1
  22. package/v3/dist/cli/commands/coverage.d.ts.map +1 -1
  23. package/v3/dist/cli/commands/coverage.js +3 -28
  24. package/v3/dist/cli/commands/coverage.js.map +1 -1
  25. package/v3/dist/cli/commands/hooks.d.ts.map +1 -1
  26. package/v3/dist/cli/commands/hooks.js +143 -2
  27. package/v3/dist/cli/commands/hooks.js.map +1 -1
  28. package/v3/dist/cli/commands/security.d.ts.map +1 -1
  29. package/v3/dist/cli/commands/security.js +3 -29
  30. package/v3/dist/cli/commands/security.js.map +1 -1
  31. package/v3/dist/cli/commands/test.d.ts.map +1 -1
  32. package/v3/dist/cli/commands/test.js +11 -58
  33. package/v3/dist/cli/commands/test.js.map +1 -1
  34. package/v3/dist/cli/utils/file-discovery.d.ts +27 -0
  35. package/v3/dist/cli/utils/file-discovery.d.ts.map +1 -0
  36. package/v3/dist/cli/utils/file-discovery.js +105 -0
  37. package/v3/dist/cli/utils/file-discovery.js.map +1 -0
  38. package/v3/dist/coordination/task-executor.d.ts.map +1 -1
  39. package/v3/dist/coordination/task-executor.js +304 -44
  40. package/v3/dist/coordination/task-executor.js.map +1 -1
  41. package/v3/dist/domains/code-intelligence/coordinator.d.ts.map +1 -1
  42. package/v3/dist/domains/code-intelligence/coordinator.js +8 -1
  43. package/v3/dist/domains/code-intelligence/coordinator.js.map +1 -1
  44. package/v3/dist/domains/code-intelligence/services/metric-collector/index.d.ts.map +1 -1
  45. package/v3/dist/domains/code-intelligence/services/metric-collector/index.js +10 -0
  46. package/v3/dist/domains/code-intelligence/services/metric-collector/index.js.map +1 -1
  47. package/v3/dist/domains/code-intelligence/services/metric-collector/interfaces.d.ts +7 -1
  48. package/v3/dist/domains/code-intelligence/services/metric-collector/interfaces.d.ts.map +1 -1
  49. package/v3/dist/domains/code-intelligence/services/metric-collector/interfaces.js +10 -1
  50. package/v3/dist/domains/code-intelligence/services/metric-collector/interfaces.js.map +1 -1
  51. package/v3/dist/domains/code-intelligence/services/metric-collector/loc-counter.js +34 -10
  52. package/v3/dist/domains/code-intelligence/services/metric-collector/loc-counter.js.map +1 -1
  53. package/v3/dist/domains/coverage-analysis/services/hnsw-index.d.ts +9 -0
  54. package/v3/dist/domains/coverage-analysis/services/hnsw-index.d.ts.map +1 -1
  55. package/v3/dist/domains/coverage-analysis/services/hnsw-index.js +38 -3
  56. package/v3/dist/domains/coverage-analysis/services/hnsw-index.js.map +1 -1
  57. package/v3/dist/domains/test-generation/generators/jest-vitest-generator.d.ts.map +1 -1
  58. package/v3/dist/domains/test-generation/generators/jest-vitest-generator.js +58 -6
  59. package/v3/dist/domains/test-generation/generators/jest-vitest-generator.js.map +1 -1
  60. package/v3/dist/domains/test-generation/generators/mocha-generator.d.ts.map +1 -1
  61. package/v3/dist/domains/test-generation/generators/mocha-generator.js +79 -7
  62. package/v3/dist/domains/test-generation/generators/mocha-generator.js.map +1 -1
  63. package/v3/dist/domains/test-generation/generators/pytest-generator.d.ts +4 -0
  64. package/v3/dist/domains/test-generation/generators/pytest-generator.d.ts.map +1 -1
  65. package/v3/dist/domains/test-generation/generators/pytest-generator.js +77 -10
  66. package/v3/dist/domains/test-generation/generators/pytest-generator.js.map +1 -1
  67. package/v3/dist/domains/test-generation/interfaces/test-generator.interface.d.ts +21 -0
  68. package/v3/dist/domains/test-generation/interfaces/test-generator.interface.d.ts.map +1 -1
  69. package/v3/dist/domains/test-generation/interfaces.d.ts +21 -0
  70. package/v3/dist/domains/test-generation/interfaces.d.ts.map +1 -1
  71. package/v3/dist/domains/test-generation/services/test-generator.d.ts +22 -0
  72. package/v3/dist/domains/test-generation/services/test-generator.d.ts.map +1 -1
  73. package/v3/dist/domains/test-generation/services/test-generator.js +163 -3
  74. package/v3/dist/domains/test-generation/services/test-generator.js.map +1 -1
  75. package/v3/dist/init/init-wizard-hooks.d.ts +8 -1
  76. package/v3/dist/init/init-wizard-hooks.d.ts.map +1 -1
  77. package/v3/dist/init/init-wizard-hooks.js +47 -39
  78. package/v3/dist/init/init-wizard-hooks.js.map +1 -1
  79. package/v3/dist/init/phases/07-hooks.d.ts +11 -1
  80. package/v3/dist/init/phases/07-hooks.d.ts.map +1 -1
  81. package/v3/dist/init/phases/07-hooks.js +46 -50
  82. package/v3/dist/init/phases/07-hooks.js.map +1 -1
  83. package/v3/dist/init/settings-merge.d.ts +35 -0
  84. package/v3/dist/init/settings-merge.d.ts.map +1 -0
  85. package/v3/dist/init/settings-merge.js +140 -0
  86. package/v3/dist/init/settings-merge.js.map +1 -0
  87. package/v3/dist/integrations/agentic-flow/model-router/router.js +1 -1
  88. package/v3/dist/integrations/agentic-flow/model-router/router.js.map +1 -1
  89. package/v3/dist/integrations/agentic-flow/model-router/score-calculator.d.ts.map +1 -1
  90. package/v3/dist/integrations/agentic-flow/model-router/score-calculator.js +18 -3
  91. package/v3/dist/integrations/agentic-flow/model-router/score-calculator.js.map +1 -1
  92. package/v3/dist/integrations/agentic-flow/model-router/signal-collector.d.ts +3 -3
  93. package/v3/dist/integrations/agentic-flow/model-router/signal-collector.d.ts.map +1 -1
  94. package/v3/dist/integrations/agentic-flow/model-router/signal-collector.js +18 -0
  95. package/v3/dist/integrations/agentic-flow/model-router/signal-collector.js.map +1 -1
  96. package/v3/dist/kernel/unified-memory-hnsw.d.ts +29 -0
  97. package/v3/dist/kernel/unified-memory-hnsw.d.ts.map +1 -1
  98. package/v3/dist/kernel/unified-memory-hnsw.js +136 -0
  99. package/v3/dist/kernel/unified-memory-hnsw.js.map +1 -1
  100. package/v3/dist/kernel/unified-memory.d.ts +2 -2
  101. package/v3/dist/kernel/unified-memory.d.ts.map +1 -1
  102. package/v3/dist/kernel/unified-memory.js +7 -9
  103. package/v3/dist/kernel/unified-memory.js.map +1 -1
  104. package/v3/dist/learning/qe-hooks.d.ts.map +1 -1
  105. package/v3/dist/learning/qe-hooks.js +34 -3
  106. package/v3/dist/learning/qe-hooks.js.map +1 -1
  107. package/v3/dist/mcp/bundle.js +1403 -425
  108. package/v3/dist/mcp/handlers/domain-handler-configs.d.ts.map +1 -1
  109. package/v3/dist/mcp/handlers/domain-handler-configs.js +40 -31
  110. package/v3/dist/mcp/handlers/domain-handler-configs.js.map +1 -1
  111. package/v3/dist/mcp/handlers/task-handlers.d.ts.map +1 -1
  112. package/v3/dist/mcp/handlers/task-handlers.js +68 -5
  113. package/v3/dist/mcp/handlers/task-handlers.js.map +1 -1
  114. package/v3/dist/mcp/protocol-server.d.ts.map +1 -1
  115. package/v3/dist/mcp/protocol-server.js +16 -2
  116. package/v3/dist/mcp/protocol-server.js.map +1 -1
  117. package/v3/package.json +1 -1
@@ -0,0 +1,725 @@
1
+ #!/usr/bin/env tsx
2
+ /**
3
+ * KG-Assisted QE Benchmark (Issue #266)
4
+ *
5
+ * A/B comparison: does code intelligence KG improve QE agent test generation?
6
+ *
7
+ * Protocol:
8
+ * Run A (Control) — aqe init --minimal --skip-patterns (no code intelligence) → test generate × N files
9
+ * Run B (Treatment) — aqe init --auto, then aqe code index . (KG populated) → same test generate calls
10
+ *
11
+ * Metrics captured per file:
12
+ * - Token usage (summed from the `token-metrics` namespace of memory.db via sqlite3)
13
+ * - Generated test count, assertions, coverage estimate
14
+ * - Wall-clock latency
15
+ * - Output size (chars — proxy for context consumed)
16
+ * - KG index stats (vectors in memory.db)
17
+ *
18
+ * Test subject: https://github.com/maxritter/claude-pilot (cloned to /tmp)
19
+ *
20
+ * Run:
21
+ * npx tsx scripts/benchmark-kg-assisted.ts
22
+ * npx tsx scripts/benchmark-kg-assisted.ts --skip-clone # reuse existing clone
23
+ * npx tsx scripts/benchmark-kg-assisted.ts --files 3 # fewer files for quick test
24
+ */
25
+
26
+ import { execSync, ExecSyncOptions } from 'child_process';
27
+ import * as fs from 'fs';
28
+ import * as path from 'path';
29
+
30
+ // ── Configuration ────────────────────────────────────────────────────
31
+
32
const REPO_URL = 'https://github.com/maxritter/claude-pilot';
const BENCH_DIR = '/tmp/kg-benchmark';
const PROJECT_DIR = path.join(BENCH_DIR, 'claude-pilot');
const RESULTS_DIR = path.join(BENCH_DIR, 'results');
const AQE_DIR = path.join(PROJECT_DIR, '.agentic-qe');

/** Number of files benchmarked when `--files` is absent or unusable. */
const DEFAULT_MAX_FILES = 10;

/**
 * Resolves the `--files <n>` option from the CLI arguments.
 *
 * A missing flag, a missing value, a non-numeric value or a value below 1
 * all fall back to DEFAULT_MAX_FILES. Without this guard a typo such as
 * `--files abc` produced NaN, which made every later `slice(0, …)` return
 * an empty selection without any diagnostic.
 *
 * @param argv CLI arguments (already stripped of node/script entries).
 * @returns A positive integer file budget.
 */
function parseMaxFiles(argv: string[]): number {
  const flagIndex = argv.indexOf('--files');
  if (flagIndex < 0) return DEFAULT_MAX_FILES;

  const raw = argv[flagIndex + 1];
  if (!raw) return DEFAULT_MAX_FILES;

  const parsed = Number.parseInt(raw, 10);
  if (!Number.isInteger(parsed) || parsed < 1) {
    console.warn(`[WARN] Ignoring invalid --files value "${raw}"; using ${DEFAULT_MAX_FILES}`);
    return DEFAULT_MAX_FILES;
  }
  return parsed;
}

// Parse CLI args
const args = process.argv.slice(2);
const skipClone = args.includes('--skip-clone');
// Kept for backward compatibility with any code that inspects the flag position.
const fileCountArg = args.indexOf('--files');
const maxFiles = parseMaxFiles(args);
43
+
44
+ // ── File Selection ───────────────────────────────────────────────────
45
+
46
/** One source file chosen as a benchmark subject. */
interface BenchmarkFile {
  /** Path of the file relative to PROJECT_DIR. */
  relativePath: string;
  /** Size bucket derived from lineCount (<150 simple, <400 medium, otherwise complex). */
  complexity: 'simple' | 'medium' | 'complex';
  /** Line count as reported by `wc -l`. */
  lineCount: number;
}
51
+
52
/**
 * Lists the Python sources under PROJECT_DIR (excluding anything whose path
 * contains "/test", plus conftest.py and __init__.py), buckets them by line
 * count and returns a complexity-balanced sample of at most `maxFiles` files.
 *
 * Sorting and removal of the `wc` summary rows are done here in JS instead
 * of with `sort -rn | grep -v total`: the grep also discarded real files
 * whose path merely contained "total", and its non-zero exit status on empty
 * output made execSync throw when no file matched.
 *
 * @returns Selected files, complex bucket first, then medium, then simple;
 *          an empty array when nothing could be listed.
 */
function discoverBenchmarkFiles(): BenchmarkFile[] {
  let listing: string;
  try {
    listing = execSync(
      `find '${PROJECT_DIR}' -name '*.py' ! -path '*/test*' ! -name 'conftest.py' ! -name '__init__.py' -exec wc -l {} + 2>/dev/null`,
      { encoding: 'utf-8' }
    );
  } catch (e: unknown) {
    // find may exit non-zero (e.g. an unreadable directory) after printing
    // usable rows; keep whatever was captured rather than aborting.
    const err = e as { stdout?: unknown };
    listing = typeof err.stdout === 'string' ? err.stdout : '';
  }

  const files: BenchmarkFile[] = [];
  for (const line of listing.split('\n')) {
    const match = line.trim().match(/^(\d+)\s+(.+)$/);
    if (!match) continue;

    const absPath = match[2];
    // `wc` emits "<n> total" per batch; real entries are absolute paths,
    // so an exact comparison cannot drop a genuine file.
    if (absPath === 'total') continue;

    const lineCount = parseInt(match[1], 10);
    const relativePath = path.relative(PROJECT_DIR, absPath);

    let complexity: BenchmarkFile['complexity'];
    if (lineCount < 150) complexity = 'simple';
    else if (lineCount < 400) complexity = 'medium';
    else complexity = 'complex';

    files.push({ relativePath, complexity, lineCount });
  }

  // Largest files first (replaces the shell-side `sort -rn`).
  files.sort((a, b) => b.lineCount - a.lineCount);

  // Select balanced set: 30% complex, 40% medium, 30% simple
  const simple = files.filter((f) => f.complexity === 'simple');
  const medium = files.filter((f) => f.complexity === 'medium');
  const complex = files.filter((f) => f.complexity === 'complex');

  const selected: BenchmarkFile[] = [];
  const targets = [
    { bucket: complex, target: Math.min(3, Math.ceil(maxFiles * 0.3)) },
    { bucket: medium, target: Math.min(4, Math.ceil(maxFiles * 0.4)) },
    { bucket: simple, target: Math.min(3, Math.ceil(maxFiles * 0.3)) },
  ];

  for (const { bucket, target } of targets) {
    selected.push(...bucket.slice(0, target));
  }

  // Rounding up per bucket can overshoot the budget; trim to it.
  return selected.slice(0, maxFiles);
}
92
+
93
+ // ── AQE CLI Helpers ──────────────────────────────────────────────────
94
+
95
/**
 * Default execSync options for agentic-qe invocations: run inside the
 * cloned project, capture all three stdio streams as UTF-8 text and
 * give up after five minutes.
 */
const EXEC_OPTS: ExecSyncOptions = {
  stdio: ['pipe', 'pipe', 'pipe'],
  encoding: 'utf-8' as BufferEncoding,
  timeout: 300_000,
  cwd: PROJECT_DIR,
};
101
+
102
/**
 * Runs `npx agentic-qe <subcommand>` with EXEC_OPTS and returns its stdout.
 *
 * A failing command does not throw: the first line of the error message is
 * logged as a warning and whatever stdout/stderr the error object carries
 * is returned instead (joined by a newline).
 *
 * @param subcommand Arguments appended verbatim after `npx agentic-qe`.
 * @returns Captured output of the command.
 */
function runAqe(subcommand: string): string {
  const commandLine = `npx agentic-qe ${subcommand}`;
  console.log(` $ ${commandLine}`);

  let captured: string;
  try {
    captured = execSync(commandLine, EXEC_OPTS) as string;
  } catch (e: unknown) {
    const failure = e as { stdout?: string; stderr?: string; message?: string };
    captured = (failure.stdout || '') + '\n' + (failure.stderr || '');
    const headline = failure.message?.split('\n')[0];
    console.error(` [WARN] ${headline}`);
  }
  return captured;
}
114
+
115
/** Deletes the project's .agentic-qe directory (AQE_DIR) if it exists, logging when it did. */
function wipeAqeData(): void {
  if (!fs.existsSync(AQE_DIR)) {
    return;
  }
  fs.rmSync(AQE_DIR, { recursive: true, force: true });
  console.log(' Wiped .agentic-qe/');
}
121
+
122
+ // ── Token Usage ──────────────────────────────────────────────────────
123
+
124
/** Token counters read from memory.db at the end of a benchmark run. */
interface TokenSnapshot {
  /** Sum of `$.usage.inputTokens` over the token-metrics entries. */
  inputTokens: number;
  /** Sum of `$.usage.outputTokens` over the token-metrics entries. */
  outputTokens: number;
  /** Sum of `$.usage.totalTokens` over the token-metrics entries. */
  totalTokens: number;
  /** Number of qe_patterns rows with access_count > 0. */
  patternsReused: number;
  /** Reserved; captureTokenUsage never sets this to anything but 0. */
  tokensSaved: number;
}
131
+
132
/**
 * Reads aggregate token counters and the reused-pattern count straight from
 * memory.db using the `sqlite3` command-line tool.
 *
 * The two queries are independent: a failure of one (for example a table
 * that does not exist) leaves its fields at 0 without affecting the other.
 * Previously the token query returned early whenever it produced a row —
 * which a COALESCE'd SUM always does — so the pattern query never ran and
 * `patternsReused` was stuck at 0.
 *
 * @returns A snapshot; all zeros when memory.db is missing.
 */
function captureTokenUsage(): TokenSnapshot {
  const snapshot: TokenSnapshot = { inputTokens: 0, outputTokens: 0, totalTokens: 0, patternsReused: 0, tokensSaved: 0 };

  // Read metrics directly from memory.db (faster than spawning the CLI)
  const memoryDb = path.join(AQE_DIR, 'memory.db');
  if (!fs.existsSync(memoryDb)) return snapshot;

  try {
    // Token metrics live in kv_store under the 'token-metrics' namespace
    const result = execSync(
      `sqlite3 '${memoryDb}' "SELECT COALESCE(SUM(json_extract(value, '$.usage.inputTokens')),0), COALESCE(SUM(json_extract(value, '$.usage.outputTokens')),0), COALESCE(SUM(json_extract(value, '$.usage.totalTokens')),0) FROM kv_store WHERE namespace='token-metrics';" 2>/dev/null`,
      { encoding: 'utf-8', timeout: 5_000, cwd: PROJECT_DIR }
    ).trim();

    if (result) {
      // sqlite3 separates columns with '|' in its default output mode
      const parts = result.split('|');
      snapshot.inputTokens = parseInt(parts[0], 10) || 0;
      snapshot.outputTokens = parseInt(parts[1], 10) || 0;
      snapshot.totalTokens = parseInt(parts[2], 10) || 0;
    }
  } catch { /* kv_store may not exist */ }

  try {
    const patternResult = execSync(
      `sqlite3 '${memoryDb}' "SELECT COUNT(*) FROM qe_patterns WHERE access_count > 0;" 2>/dev/null`,
      { encoding: 'utf-8', timeout: 5_000, cwd: PROJECT_DIR }
    ).trim();
    snapshot.patternsReused = parseInt(patternResult, 10) || 0;
  } catch { /* qe_patterns may not exist */ }

  return snapshot;
}
169
+
170
+ // ── Output Parsing ───────────────────────────────────────────────────
171
+
172
/** Metrics scraped from the text printed by one `test generate` call. */
interface ParsedOutput {
  /** N from a "Generated N test…" line, 0 if absent. */
  testsGenerated: number;
  /** N from "Assertions: N", or a pattern-based count when that is 0/absent. */
  assertions: number;
  /** Percentage from "Coverage Estimate: N%", 0 if absent. */
  coverageEstimate: number;
  /** Occurrences of "pattern", "SONA" or "DecisionTransformer" in the output. */
  patternsUsed: number;
  /** Number of mock declarations generated from KG dependencies */
  kgMocksGenerated: number;
  /** Number of KG dependency imports detected */
  kgDependenciesFound: number;
  /** Number of KG similarity references */
  kgSimilarityRefs: number;
  /** Estimated file reads avoided by using KG (= dependency count) */
  fileReadsAvoided: number;
  /** Estimated prompt tokens (chars / 4 approximation) */
  estimatedPromptTokens: number;
  /** Estimated output tokens (output chars / 4) */
  estimatedOutputTokens: number;
}
190
+
191
/**
 * Heuristically counts assertions in generated test code across the
 * supported frameworks by summing regex matches.
 *
 * Each Jest/Vitest assertion is counted once, via its `expect(` call. The
 * earlier matcher pattern (`.toBe`, `.toEqual`, …) was dropped because it
 * counted every `expect(x).toBe(y)` a second time, while Chai-style
 * `expect(x).to.equal(y)` was only counted once.
 *
 * @param code Generated test code (or raw CLI output containing it).
 * @returns Total number of matches over all patterns; 0 for empty input.
 */
function countAssertions(code: string): number {
  const patterns = [
    /\bassert\s+/g, // Python assert statement (word boundary avoids e.g. "my_assert x")
    /self\.assert\w+/g, // unittest assertions
    /pytest\.raises/g, // pytest raises
    /expect\s*\(/g, // Jest/Vitest/Chai expect() — one per assertion chain
    /\.should\./g, // Chai should
    /test_\w+/g, // Python test functions (each is an assertion unit)
  ];

  let count = 0;
  for (const pattern of patterns) {
    count += code.match(pattern)?.length ?? 0;
  }
  return count;
}
209
+
210
/**
 * Extracts benchmark metrics from the text emitted by `test generate`.
 *
 * Explicit CLI figures ("Generated N tests", "Assertions: N",
 * "Coverage Estimate: N%") are used when present; the assertion count falls
 * back to countAssertions() when the CLI reports none. The remaining fields
 * are keyword/regex tallies over the output and rough token estimates at
 * ~4 characters per token.
 *
 * @param output Combined stdout/stderr of the generation command.
 * @param sourceChars Character length of the source file under test.
 * @returns The parsed metrics.
 */
function parseTestGenerationOutput(output: string, sourceChars: number): ParsedOutput {
  // Number of matches of a (global) regex in the output
  const tally = (re: RegExp): number => output.match(re)?.length ?? 0;
  // First capture group of a regex, or undefined when it does not match
  const firstGroup = (re: RegExp): string | undefined => output.match(re)?.[1];

  // "Generated N tests"
  const generatedText = firstGroup(/Generated\s+(\d+)\s+test/i);
  const testsGenerated = generatedText ? parseInt(generatedText, 10) : 0;

  // "Assertions: N" from CLI output; otherwise count them in the generated code
  const assertionText = firstGroup(/Assertions:\s*(\d+)/i);
  const reportedAssertions = assertionText ? parseInt(assertionText, 10) : 0;
  const assertions = reportedAssertions === 0 ? countAssertions(output) : reportedAssertions;

  // "Coverage Estimate: N%"
  const coverageText = firstGroup(/Coverage\s+Estimate:\s*([\d.]+)%/i);
  const coverageEstimate = coverageText ? parseFloat(coverageText) : 0;

  // Pattern-related mentions
  const patternsUsed = tally(/pattern|SONA|DecisionTransformer/gi);

  // KG-specific metrics
  // Mock declarations: "vi.mock('...')" or "jest.mock('...')" or "@patch('...')"
  const kgMocksGenerated = tally(/(?:vi|jest)\.mock\(['"]|@patch\(['"]|from unittest\.mock import/g);
  // Lines mentioning "Knowledge Graph" or "dependency analysis"
  const kgDependenciesFound = tally(/Knowledge Graph|dependency analysis|Auto-generated mocks from/gi);
  // Similarity references
  const kgSimilarityRefs = tally(/Similar.*module|similarity:|similar code/gi);

  // File reads avoided: each mocked/patched target = 1 file read the agent didn't need
  const mockedModules = tally(/(?:vi|jest)\.mock\(['"]([^'"]+)['"]/g);
  const patchedTargets = tally(/@patch\(['"]([^'"]+)['"]/g);
  const fileReadsAvoided = mockedModules + patchedTargets;

  // Token estimation: ~4 chars per token for code
  // Prompt = source code + template (~500 chars) + KG context (rough estimate)
  const kgContextChars = kgDependenciesFound * 80 + kgSimilarityRefs * 200;
  const promptChars = sourceChars + 500 + kgContextChars;

  return {
    testsGenerated,
    assertions,
    coverageEstimate,
    patternsUsed,
    kgMocksGenerated,
    kgDependenciesFound,
    kgSimilarityRefs,
    fileReadsAvoided,
    estimatedPromptTokens: Math.round(promptChars / 4),
    estimatedOutputTokens: Math.round(output.length / 4),
  };
}
262
+
263
+ // ── Test Generation ──────────────────────────────────────────────────
264
+
265
/** Outcome of generating tests for a single benchmark file. */
interface GenerationResult {
  /** Path of the source file relative to PROJECT_DIR. */
  file: string;
  /** Complexity bucket copied from the BenchmarkFile. */
  complexity: string;
  /** Line count copied from the BenchmarkFile. */
  lineCount: number;
  /** Character length of the source file. */
  sourceChars: number;
  /** Wall-clock duration of the generation command in milliseconds. */
  wallTimeMs: number;
  /** Metrics parsed from the command output. */
  parsed: ParsedOutput;
  /** Character length of the command output. */
  outputLength: number;
  /** The command output itself. */
  rawOutput: string;
}
275
+
276
/**
 * Runs `agentic-qe test generate` (unit tests, pytest framework) for one
 * file and measures it.
 *
 * The command gets a 2-minute timeout and has stderr folded into stdout.
 * If it fails, the stdout/stderr attached to the error are parsed instead,
 * so a failing generation still yields a result.
 *
 * @param file Benchmark subject to generate tests for.
 * @returns Timing, raw output and parsed metrics for the file.
 */
function generateTestsForFile(file: BenchmarkFile): GenerationResult {
  const { relativePath, complexity, lineCount } = file;
  const sourceChars = fs.readFileSync(path.join(PROJECT_DIR, relativePath), 'utf-8').length;

  const startedAt = performance.now();

  // CLI: aqe test generate <file> --type unit --framework pytest
  // The path is single-quoted for the shell; embedded quotes become '\''.
  const shellSafePath = relativePath.replace(/'/g, "'\\''");
  const command = `npx agentic-qe test generate '${shellSafePath}' --type unit --framework pytest 2>&1`;

  let output: string;
  try {
    output = execSync(command, { ...EXEC_OPTS, timeout: 120_000 }) as string;
  } catch (e: unknown) {
    const failure = e as { stdout?: string; stderr?: string };
    output = (failure.stdout || '') + '\n' + (failure.stderr || '');
  }

  const wallTimeMs = performance.now() - startedAt;

  return {
    file: relativePath,
    complexity,
    lineCount,
    sourceChars,
    wallTimeMs,
    parsed: parseTestGenerationOutput(output, sourceChars),
    outputLength: output.length,
    rawOutput: output,
  };
}
310
+
311
+ // ── KG Stats ─────────────────────────────────────────────────────────
312
+
313
/** Size indicators for the knowledge-graph data stored in memory.db. */
interface KGStats {
  /** Row count of the `vectors` table. */
  vectorCount: number;
  /** Row count of `qe_patterns` plus row count of `kv_store`. */
  patternCount: number;
  /** True when at least one vector row exists. */
  hasKG: boolean;
  /** Set by runBenchmark for auto-init runs: init plus indexing time in milliseconds. */
  indexDuration?: number;
  /** Size of the memory.db file in bytes. */
  dbSizeBytes: number;
}
320
+
321
/**
 * Collects knowledge-graph size statistics from memory.db via the
 * `sqlite3` command-line tool.
 *
 * Each table is counted independently, so a table that is absent only
 * zeroes its own count. Previously all three queries shared one try block
 * and a single failure discarded every figure, including the database size
 * that had already been read.
 *
 * @returns The statistics; all zeros / hasKG=false when memory.db is missing
 *          or cannot be stat'ed.
 */
function getKGStats(): KGStats {
  const memoryDb = path.join(AQE_DIR, 'memory.db');
  const empty: KGStats = { vectorCount: 0, patternCount: 0, hasKG: false, dbSizeBytes: 0 };

  if (!fs.existsSync(memoryDb)) return empty;

  let dbSizeBytes: number;
  try {
    dbSizeBytes = fs.statSync(memoryDb).size;
  } catch {
    return empty;
  }

  // Row count of one table; 0 when the table is missing or sqlite3 fails.
  const countRows = (table: string): number => {
    try {
      const out = execSync(
        `sqlite3 '${memoryDb}' "SELECT COUNT(*) FROM ${table};" 2>/dev/null`,
        { encoding: 'utf-8' }
      ).trim();
      return parseInt(out, 10) || 0;
    } catch {
      return 0;
    }
  };

  // KG data lives in `vectors` table (code index) and `kv_store` (KG nodes/edges)
  const vectorCount = countRows('vectors');
  const kvCount = countRows('kv_store');
  // Also check qe_patterns for learned patterns
  const patternCount = countRows('qe_patterns');

  return { vectorCount, patternCount: patternCount + kvCount, hasKG: vectorCount > 0, dbSizeBytes };
}
356
+
357
+ // ── Benchmark Run ────────────────────────────────────────────────────
358
+
359
/** Everything recorded for one benchmark arm (control or treatment). */
interface RunResult {
  /** Human-readable name of the run. */
  label: string;
  /** KG statistics captured after initialisation. */
  kgStats: KGStats;
  /** Output of the `init` command. */
  initOutput: string;
  /** Init time in milliseconds; includes code indexing for auto-init runs. */
  initDurationMs: number;
  /** Per-file generation results, in the order the files were processed. */
  files: GenerationResult[];
  /** Sum of the per-file wall times in milliseconds. */
  totalWallTimeMs: number;
  /** Token counters captured once after all files were processed. */
  aggregateTokens: TokenSnapshot;
}
368
+
369
/**
 * Executes one benchmark arm end to end: wipes previous AQE data, runs
 * `init` (plus `code index .` when `useAutoInit` is set), records KG
 * statistics, generates tests for every file and finally captures token
 * usage.
 *
 * @param label Heading printed for the run and stored in the result.
 * @param files Files to generate tests for, processed in order.
 * @param useAutoInit true → `init --auto` followed by code indexing;
 *                    false → `init --minimal --skip-patterns`.
 * @returns The collected measurements for the run.
 */
function runBenchmark(label: string, files: BenchmarkFile[], useAutoInit: boolean): RunResult {
  const asSeconds = (ms: number): string => (ms / 1000).toFixed(1);
  const rule = '═'.repeat(60);

  console.log(`\n${rule}`);
  console.log(` ${label}`);
  console.log(rule);

  // Step 1: Wipe and reinitialize
  wipeAqeData();

  const initCmd = useAutoInit ? 'init --auto' : 'init --minimal --skip-patterns';
  const initStarted = performance.now();
  const initOutput = runAqe(initCmd);
  let initDurationMs = performance.now() - initStarted;
  console.log(` Init completed in ${asSeconds(initDurationMs)}s`);

  // For treatment: explicitly run code indexing to populate KG
  if (useAutoInit) {
    console.log(' Running code intelligence indexing...');
    const indexStarted = performance.now();
    const indexOutput = runAqe('code index .');
    const indexMs = performance.now() - indexStarted;
    initDurationMs += indexMs;
    console.log(` Code index completed in ${asSeconds(indexMs)}s`);

    // Log index results (only when the file count is reported)
    const indexedFiles = indexOutput.match(/Files indexed:\s*(\d+)/);
    if (indexedFiles) {
      const nodes = indexOutput.match(/Nodes created:\s*(\d+)/)?.[1] || '?';
      const edges = indexOutput.match(/Edges created:\s*(\d+)/)?.[1] || '?';
      console.log(` Indexed: ${indexedFiles[1]} files, ${nodes} nodes, ${edges} edges`);
    }
  }

  // Step 2: KG stats
  const kgStats = getKGStats();
  if (useAutoInit) {
    kgStats.indexDuration = initDurationMs;
  }
  console.log(` KG vectors: ${kgStats.vectorCount} | patterns: ${kgStats.patternCount} | DB: ${(kgStats.dbSizeBytes / 1024).toFixed(0)}KB`);

  // Step 3: Generate tests per file
  const results: GenerationResult[] = [];
  let totalWallTimeMs = 0;

  files.forEach((file, index) => {
    console.log(`\n [${index + 1}/${files.length}] ${file.relativePath} (${file.complexity}, ${file.lineCount} lines)`);

    const result = generateTestsForFile(file);
    results.push(result);
    totalWallTimeMs += result.wallTimeMs;

    const { parsed } = result;
    console.log(` Tests: ${parsed.testsGenerated} | Assertions: ${parsed.assertions} | Coverage: ${parsed.coverageEstimate}% | Time: ${asSeconds(result.wallTimeMs)}s`);
    console.log(` Output: ${result.outputLength} chars | Patterns: ${parsed.patternsUsed}`);
    console.log(` Tokens (est): prompt=${parsed.estimatedPromptTokens} output=${parsed.estimatedOutputTokens} | KG mocks: ${parsed.kgMocksGenerated} | File reads avoided: ${parsed.fileReadsAvoided}`);
  });

  // Capture token usage once at end of run (avoids per-file process overhead)
  console.log('\n Capturing token usage...');
  const aggregateTokens = captureTokenUsage();
  console.log(` Tokens: in=${aggregateTokens.inputTokens} out=${aggregateTokens.outputTokens} total=${aggregateTokens.totalTokens} saved=${aggregateTokens.tokensSaved}`);

  return {
    label,
    kgStats,
    initOutput,
    initDurationMs,
    files: results,
    totalWallTimeMs,
    aggregateTokens,
  };
}
434
+
435
+ // ── Comparison ───────────────────────────────────────────────────────
436
+
437
/** Flat numeric metrics for one file in one run; every field is a number so runs can be subtracted. */
interface FileMetrics {
  /** Tests generated. */
  tests: number;
  /** Assertions counted. */
  assertions: number;
  /** Coverage estimate in percent. */
  coverage: number;
  /** Wall-clock time in milliseconds. */
  wallMs: number;
  /** Length of the command output in characters. */
  outputChars: number;
  /** Pattern-related mentions in the output. */
  patterns: number;
  /** Mock declarations found in the output. */
  kgMocks: number;
  /** Mocked/patched targets counted as avoided file reads. */
  fileReadsAvoided: number;
  /** Estimated prompt tokens. */
  estPromptTokens: number;
  /** Estimated output tokens. */
  estOutputTokens: number;
}
443
+
444
/** Control-versus-treatment metrics for a single file. */
interface FileComparison {
  /** File path as recorded in the control run. */
  file: string;
  /** Complexity bucket as recorded in the control run. */
  complexity: string;
  /** Line count as recorded in the control run. */
  lineCount: number;
  /** Metrics from the control run. */
  control: FileMetrics;
  /** Metrics from the treatment run. */
  treatment: FileMetrics;
  /** treatment − control, field by field. */
  delta: FileMetrics;
}
452
+
453
/**
 * Flattens a GenerationResult into the numeric FileMetrics record used for
 * comparisons.
 *
 * @param r Result of generating tests for one file.
 * @returns The metrics taken from `r` and `r.parsed`.
 */
function buildMetrics(r: GenerationResult): FileMetrics {
  const { parsed, wallTimeMs, outputLength } = r;
  const {
    testsGenerated,
    assertions,
    coverageEstimate,
    patternsUsed,
    kgMocksGenerated,
    fileReadsAvoided,
    estimatedPromptTokens,
    estimatedOutputTokens,
  } = parsed;

  return {
    tests: testsGenerated,
    assertions,
    coverage: coverageEstimate,
    wallMs: wallTimeMs,
    outputChars: outputLength,
    patterns: patternsUsed,
    kgMocks: kgMocksGenerated,
    fileReadsAvoided,
    estPromptTokens: estimatedPromptTokens,
    estOutputTokens: estimatedOutputTokens,
  };
}
467
+
468
/**
 * Computes the field-wise difference `b − a` for every key present on `a`.
 *
 * @param a Baseline metrics.
 * @param b Metrics to compare against the baseline.
 * @returns A record holding `b[k] − a[k]` for each key `k` of `a`.
 */
function deltaMetrics(a: FileMetrics, b: FileMetrics): FileMetrics {
  const fields = Object.keys(a) as (keyof FileMetrics)[];
  return fields.reduce((diff, field) => {
    diff[field] = (b[field] as number) - (a[field] as number);
    return diff;
  }, {} as FileMetrics);
}
475
+
476
/**
 * Pairs the control and treatment results by position and builds one
 * comparison per control file. File identity (path, complexity, line count)
 * is taken from the control result.
 *
 * @param control Result of the control run.
 * @param treatment Result of the treatment run; `treatment.files[i]` is
 *                  compared with `control.files[i]`.
 * @returns One FileComparison per entry of `control.files`.
 */
function compare(control: RunResult, treatment: RunResult): FileComparison[] {
  const comparisons: FileComparison[] = [];

  control.files.forEach((controlResult, index) => {
    const treatmentResult = treatment.files[index];
    const controlMetrics = buildMetrics(controlResult);
    const treatmentMetrics = buildMetrics(treatmentResult);

    comparisons.push({
      file: controlResult.file,
      complexity: controlResult.complexity,
      lineCount: controlResult.lineCount,
      control: controlMetrics,
      treatment: treatmentMetrics,
      delta: deltaMetrics(controlMetrics, treatmentMetrics),
    });
  });

  return comparisons;
}
491
+
492
+ // ── Summary Printing ─────────────────────────────────────────────────
493
+
494
/**
 * Prints the boxed benchmark report to stdout: initialisation figures, a
 * per-file comparison table, aggregate totals, estimated token efficiency
 * and the answers to the Issue #266 questions (Q1–Q5).
 *
 * @param control Result of the control run.
 * @param treatment Result of the treatment run.
 * @param comparisons Per-file comparisons between the two runs.
 */
function printSummary(control: RunResult, treatment: RunResult, comparisons: FileComparison[]): void {
  const W = 76;
  const title = 'KG-ASSISTED QE BENCHMARK RESULTS';
  const doubleRule = '═'.repeat(W);
  const thinRule = `╟${'─'.repeat(W)}╢`;

  // Output helpers
  const line = (s: string): void => {
    console.log(`║ ${s.padEnd(W - 4)} ║`);
  };
  const blank = (): void => line('');
  const secs = (ms: number): string => (ms / 1000).toFixed(1);
  const kb = (bytes: number): string => (bytes / 1024).toFixed(0);
  // Prefixes '+' for non-negative values; negatives already carry their sign
  const withSign = (value: number, text: string): string => (value >= 0 ? '+' : '') + text;
  const sumOf = (pick: (c: FileComparison) => number): number =>
    comparisons.reduce((acc, c) => acc + pick(c), 0);

  console.log(`\n╔${doubleRule}╗`);
  console.log(`║${title.padStart((W + title.length) / 2).padEnd(W)}║`);
  console.log(`╠${doubleRule}╣`);

  blank();
  line('INITIALIZATION');
  line(` Control (no KG): ${secs(control.initDurationMs).padStart(6)}s | Vectors: ${control.kgStats.vectorCount} | DB: ${kb(control.kgStats.dbSizeBytes)}KB`);
  line(` Treatment (KG): ${secs(treatment.initDurationMs).padStart(6)}s | Vectors: ${treatment.kgStats.vectorCount} | DB: ${kb(treatment.kgStats.dbSizeBytes)}KB`);

  blank();
  line('PER-FILE COMPARISON');
  console.log(thinRule);
  line('File Ctrl KG ΔTests ΔAssert ΔCov% ΔTime');
  console.log(thinRule);

  for (const c of comparisons) {
    const { delta } = c;
    const cells = [
      path.basename(c.file).padEnd(26).slice(0, 26),
      String(c.control.tests).padStart(4),
      String(c.treatment.tests).padStart(5),
      withSign(delta.tests, String(delta.tests)).padStart(7),
      withSign(delta.assertions, String(delta.assertions)).padStart(8),
      withSign(delta.coverage, delta.coverage.toFixed(1)).padStart(6),
      withSign(delta.wallMs, secs(delta.wallMs) + 's').padStart(7),
    ];
    line(cells.join(' '));
  }

  console.log(thinRule);

  // Aggregates
  const sumCtrl = {
    tests: sumOf((c) => c.control.tests),
    assertions: sumOf((c) => c.control.assertions),
    coverage: sumOf((c) => c.control.coverage),
  };
  const sumKg = {
    tests: sumOf((c) => c.treatment.tests),
    assertions: sumOf((c) => c.treatment.assertions),
    coverage: sumOf((c) => c.treatment.coverage),
  };

  // Divisor for averages; 1 avoids dividing by zero on an empty comparison list
  const n = comparisons.length || 1;
  const ctrlTok = control.aggregateTokens;
  const kgTok = treatment.aggregateTokens;

  // Estimated token totals from per-file data
  const estCtrlPrompt = sumOf((c) => c.control.estPromptTokens);
  const estCtrlOutput = sumOf((c) => c.control.estOutputTokens);
  const estKgPrompt = sumOf((c) => c.treatment.estPromptTokens);
  const estKgOutput = sumOf((c) => c.treatment.estOutputTokens);
  const totalKgMocks = sumOf((c) => c.treatment.kgMocks);
  const totalFileReadsAvoided = sumOf((c) => c.treatment.fileReadsAvoided);
  const estCtrlTotal = estCtrlPrompt + estCtrlOutput;
  const estKgTotal = estKgPrompt + estKgOutput;

  blank();
  line('AGGREGATES');
  line(` Tests generated: Control=${sumCtrl.tests} KG=${sumKg.tests} Δ=${sumKg.tests - sumCtrl.tests}`);
  line(` Total assertions: Control=${sumCtrl.assertions} KG=${sumKg.assertions} Δ=${sumKg.assertions - sumCtrl.assertions}`);
  line(` Avg coverage: Control=${(sumCtrl.coverage / n).toFixed(1)}% KG=${(sumKg.coverage / n).toFixed(1)}%`);
  line(` Token usage (DB): Control=${ctrlTok.totalTokens} KG=${kgTok.totalTokens} Δ=${kgTok.totalTokens - ctrlTok.totalTokens}`);
  line(` Tokens saved: Control=${ctrlTok.tokensSaved} KG=${kgTok.tokensSaved}`);
  line(` Patterns reused: Control=${ctrlTok.patternsReused} KG=${kgTok.patternsReused}`);
  line(` Wall time: Control=${secs(control.totalWallTimeMs)}s KG=${secs(treatment.totalWallTimeMs)}s`);

  blank();
  line('TOKEN EFFICIENCY (estimated ~4 chars/token)');
  line(` Prompt tokens: Control=${estCtrlPrompt} KG=${estKgPrompt} Δ=${estKgPrompt - estCtrlPrompt}`);
  line(` Output tokens: Control=${estCtrlOutput} KG=${estKgOutput} Δ=${estKgOutput - estCtrlOutput}`);
  line(` Total estimated: Control=${estCtrlTotal} KG=${estKgTotal} Δ=${estKgTotal - estCtrlTotal}`);
  line(` KG mocks generated: ${totalKgMocks} (each = 1 fewer file read for agent)`);
  line(` File reads avoided: ${totalFileReadsAvoided} (dependency info from KG instead of fs)`);

  blank();
  line('KEY FINDINGS (Issue #266 Questions)');
  const tokenDelta = kgTok.totalTokens - ctrlTok.totalTokens;
  const estTokenDelta = estKgTotal - estCtrlTotal;

  // Q1: token reduction, from DB counters and from the estimate
  let dbVerdict: string;
  if (tokenDelta < 0) {
    dbVerdict = 'YES (DB: saved ' + Math.abs(tokenDelta) + ')';
  } else if (tokenDelta === 0) {
    dbVerdict = 'NO CHANGE (DB)';
  } else {
    dbVerdict = 'NO (DB: +' + tokenDelta + ')';
  }
  const estVerdict = estTokenDelta < 0 ? 'saved ' + Math.abs(estTokenDelta) : '+' + estTokenDelta;
  line(` Q1: KG reduces tokens? ${dbVerdict} | Est: ${estVerdict}`);

  // Q2: quality, judged by the number of generated tests
  let qualityVerdict = 'NO CHANGE or WORSE';
  if (sumKg.tests > sumCtrl.tests) {
    qualityVerdict = 'YES (+' + (sumKg.tests - sumCtrl.tests) + ' tests, +' + (sumKg.assertions - sumCtrl.assertions) + ' assertions)';
  }
  line(` Q2: KG improves quality? ${qualityVerdict}`);

  // Q3: latency is acceptable while treatment stays below 1.5× control
  let latencyVerdict: string;
  if (treatment.totalWallTimeMs < control.totalWallTimeMs * 1.5) {
    latencyVerdict = 'YES (overhead < 50%)';
  } else {
    latencyVerdict = 'NEEDS REVIEW (>' + ((treatment.totalWallTimeMs / control.totalWallTimeMs - 1) * 100).toFixed(0) + '% slower)';
  }
  line(` Q3: Latency justified? ${latencyVerdict}`);

  // Q4: file reads avoided
  const readsVerdict = totalFileReadsAvoided > 0
    ? 'YES (' + totalFileReadsAvoided + ' reads avoided via KG deps)'
    : 'NO (KG deps not yet used for mock generation)';
  line(` Q4: File reads avoided? ${readsVerdict}`);

  // Q5: By complexity
  for (const cx of ['simple', 'medium', 'complex'] as const) {
    const subset = comparisons.filter((c) => c.complexity === cx);
    if (subset.length === 0) continue;

    let ct = 0;
    let kt = 0;
    let ca = 0;
    let ka = 0;
    let fr = 0;
    for (const c of subset) {
      ct += c.control.tests;
      kt += c.treatment.tests;
      ca += c.control.assertions;
      ka += c.treatment.assertions;
      fr += c.treatment.fileReadsAvoided;
    }
    line(` Q5 [${cx.padEnd(7)}]: tests ${ct}→${kt} assertions ${ca}→${ka} file reads avoided: ${fr}`);
  }

  blank();
  console.log(`╚${doubleRule}╝`);
}
594
+
595
+ // ── Main ─────────────────────────────────────────────────────────────
596
+
597
/**
 * Build the file name used to store one raw benchmark output.
 *
 * The name is derived from the file's path relative to `baseDir`, with the
 * directory separators flattened to `__`, so that same-named files living in
 * different directories (for example several `__init__.py`) do not overwrite
 * each other. Top-level files keep the plain `<prefix>-<stem>.txt` form.
 *
 * @param {string} prefix  Run label placed in front of the name ("control" / "treatment").
 * @param {string} file    Path of the benchmarked source file (relative or absolute).
 * @param {string} baseDir Directory that absolute paths are made relative to; '' disables this.
 * @returns {string} A flat file name ending in `.txt`.
 */
function rawOutputFileName(prefix = '', file = '', baseDir = '') {
  const relative = baseDir !== '' && path.isAbsolute(file) ? path.relative(baseDir, file) : file;
  // Only a trailing ".py" is stripped, matching the extension the benchmark targets.
  const stem = relative.endsWith('.py') ? relative.slice(0, -'.py'.length) : relative;
  // Split on both separator styles and drop empty / dot segments so the result
  // is always a single, flat file name.
  const segments = stem
    .split(/[\\/]+/)
    .filter((segment) => segment !== '' && segment !== '.' && segment !== '..');
  return `${prefix}-${segments.join('__')}.txt`;
}

/**
 * Benchmark entry point.
 *
 * Steps, in order:
 *   1. Clone the test project (or verify the existing clone with --skip-clone).
 *   2. Select the benchmark files.
 *   3. Run the control pass (KG flag `false`).
 *   4. Run the treatment pass (KG flag `true`).
 *   5. Compare both passes and print the summary.
 *   6. Write the results JSON plus one raw-output text file per run and file.
 *
 * Exits the process with code 1 when the clone is missing under --skip-clone
 * or when no source files are selected.
 */
async function main() {
  console.log('╔══════════════════════════════════════════════════════════════╗');
  console.log('║ KG-Assisted QE Benchmark (Issue #266) ║');
  console.log('║ Test subject: github.com/maxritter/claude-pilot ║');
  console.log('╚══════════════════════════════════════════════════════════════╝');
  console.log(` Timestamp: ${new Date().toISOString()}`);
  console.log(` Max files: ${maxFiles}`);
  console.log(` Skip clone: ${skipClone}`);

  // Step 1: Clone
  if (!skipClone) {
    console.log('\n── 1. Cloning test project ──────────────────────────');
    if (fs.existsSync(PROJECT_DIR)) {
      fs.rmSync(PROJECT_DIR, { recursive: true, force: true });
    }
    fs.mkdirSync(BENCH_DIR, { recursive: true });
    // Quote both arguments so a checkout path containing spaces survives the shell.
    execSync(`git clone "${REPO_URL}" "${PROJECT_DIR}"`, { stdio: 'inherit' });
  } else {
    console.log('\n── 1. Using existing clone ──────────────────────────');
    if (!fs.existsSync(PROJECT_DIR)) {
      console.error(`ERROR: ${PROJECT_DIR} does not exist. Run without --skip-clone.`);
      process.exit(1);
    }
  }

  // Step 2: Select files
  console.log('\n── 2. Selecting benchmark files ─────────────────────');
  const files = discoverBenchmarkFiles();
  // Fail before printing an empty "Selected 0 files" listing.
  if (files.length === 0) {
    console.error('ERROR: No source files found');
    process.exit(1);
  }
  console.log(` Selected ${files.length} files:`);
  for (const f of files) {
    console.log(` [${f.complexity.padEnd(7)}] ${f.relativePath} (${f.lineCount} lines)`);
  }

  // Step 3: Run A — Control
  console.log('\n── 3. Run A: Control (no KG) ────────────────────────');
  const control = runBenchmark('CONTROL (no KG)', files, false);

  // Step 4: Run B — Treatment
  console.log('\n── 4. Run B: Treatment (with KG) ────────────────────');
  const treatment = runBenchmark('TREATMENT (KG-assisted)', files, true);

  // Step 5: Compare & report
  console.log('\n── 5. Results ───────────────────────────────────────');
  const comparisons = compare(control, treatment);
  printSummary(control, treatment, comparisons);

  // Step 6: Save JSON
  fs.mkdirSync(RESULTS_DIR, { recursive: true });
  const resultsPath = path.join(RESULTS_DIR, `kg-benchmark-${Date.now()}.json`);

  // Both runs are serialised with the same shape; rawOutput is deliberately
  // left out of the JSON and written to separate text files below.
  const [controlSummary, treatmentSummary] = [control, treatment].map((run) => ({
    label: run.label,
    kgStats: run.kgStats,
    initDurationMs: run.initDurationMs,
    totalWallTimeMs: run.totalWallTimeMs,
    aggregateTokens: run.aggregateTokens,
    files: run.files.map((f) => ({
      file: f.file,
      complexity: f.complexity,
      lineCount: f.lineCount,
      sourceChars: f.sourceChars,
      wallTimeMs: f.wallTimeMs,
      outputLength: f.outputLength,
      parsed: f.parsed,
    })),
  }));

  const totalFileReadsAvoided = comparisons.reduce((s, c) => s + c.treatment.fileReadsAvoided, 0);

  const complexityBreakdown = Object.fromEntries(
    ['simple', 'medium', 'complex'].map((cx) => {
      const subset = comparisons.filter((c) => c.complexity === cx);
      return [cx, {
        files: subset.length,
        controlTests: subset.reduce((s, c) => s + c.control.tests, 0),
        kgTests: subset.reduce((s, c) => s + c.treatment.tests, 0),
        controlAssertions: subset.reduce((s, c) => s + c.control.assertions, 0),
        kgAssertions: subset.reduce((s, c) => s + c.treatment.assertions, 0),
        fileReadsAvoided: subset.reduce((s, c) => s + c.treatment.fileReadsAvoided, 0),
      }];
    })
  );

  const fullResults = {
    timestamp: new Date().toISOString(),
    testSubject: REPO_URL,
    filesCount: files.length,
    control: controlSummary,
    treatment: treatmentSummary,
    comparisons,
    keyFindings: {
      q1_kg_reduces_tokens: treatment.aggregateTokens.totalTokens < control.aggregateTokens.totalTokens,
      q2_kg_improves_quality: comparisons.reduce((s, c) => s + c.delta.tests, 0) > 0,
      q3_latency_justified: treatment.totalWallTimeMs < control.totalWallTimeMs * 1.5,
      q4_file_reads_avoided: totalFileReadsAvoided,
      q5_complexity_breakdown: complexityBreakdown,
      tokenEfficiency: {
        controlEstPromptTokens: comparisons.reduce((s, c) => s + c.control.estPromptTokens, 0),
        controlEstOutputTokens: comparisons.reduce((s, c) => s + c.control.estOutputTokens, 0),
        kgEstPromptTokens: comparisons.reduce((s, c) => s + c.treatment.estPromptTokens, 0),
        kgEstOutputTokens: comparisons.reduce((s, c) => s + c.treatment.estOutputTokens, 0),
        totalKgMocksGenerated: comparisons.reduce((s, c) => s + c.treatment.kgMocks, 0),
        totalFileReadsAvoided,
      },
    },
  };

  fs.writeFileSync(resultsPath, JSON.stringify(fullResults, null, 2));
  console.log(`\n Results JSON: ${resultsPath}`);

  // Save raw outputs for manual diff
  const rawDir = path.join(RESULTS_DIR, 'raw');
  fs.mkdirSync(rawDir, { recursive: true });
  const rawRuns = [
    { prefix: 'control', run: control },
    { prefix: 'treatment', run: treatment },
  ];
  for (const { prefix, run } of rawRuns) {
    for (const r of run.files) {
      // Path-derived names keep same-named files from different directories apart.
      fs.writeFileSync(path.join(rawDir, rawOutputFileName(prefix, r.file, PROJECT_DIR)), r.rawOutput);
    }
  }
  console.log(` Raw outputs: ${rawDir}/`);
}
721
+
722
// Script entry: start the benchmark; any rejection from main() is logged and
// turned into a non-zero exit code.
main().catch((failure) => {
  console.error('Benchmark failed:', failure);
  process.exit(1);
});