@graphpilot-oss/graphpilot 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +73 -126
- package/README.md +359 -101
- package/dist/cli.js +20 -0
- package/dist/cli.js.map +1 -1
- package/dist/indexer.js +3 -3
- package/dist/indexer.js.map +1 -1
- package/dist/init.d.ts +28 -0
- package/dist/init.js +112 -0
- package/dist/init.js.map +1 -0
- package/dist/interactions.d.ts +5 -4
- package/dist/interactions.js +0 -0
- package/dist/interactions.js.map +1 -1
- package/dist/mcp.js +126 -46
- package/dist/mcp.js.map +1 -1
- package/dist/repo-resolve.d.ts +47 -0
- package/dist/repo-resolve.js +195 -0
- package/dist/repo-resolve.js.map +1 -0
- package/dist/storage.js +10 -1
- package/dist/storage.js.map +1 -1
- package/dist/validation.js +30 -4
- package/dist/validation.js.map +1 -1
- package/dist/watcher.d.ts +10 -0
- package/dist/watcher.js +70 -7
- package/dist/watcher.js.map +1 -1
- package/examples/README.md +105 -0
- package/examples/claude-code/README.md +125 -0
- package/examples/claude-code/claude-routing.md +102 -0
- package/examples/claude-code/claude_config.json +8 -0
- package/examples/cline/.clinerules +39 -0
- package/examples/cline/README.md +104 -0
- package/examples/cline/cline_mcp_settings.json +10 -0
- package/examples/continue/.continuerules +39 -0
- package/examples/continue/README.md +98 -0
- package/examples/continue/config.json +13 -0
- package/examples/cursor/.cursorrules +39 -0
- package/examples/cursor/README.md +98 -0
- package/examples/cursor/mcp.json +11 -0
- package/examples/windsurf/.windsurfrules +39 -0
- package/examples/windsurf/README.md +85 -0
- package/examples/windsurf/mcp_config.json +8 -0
- package/package.json +12 -3
- package/.editorconfig +0 -15
- package/.github/CODEOWNERS +0 -22
- package/.github/FUNDING.yml +0 -1
- package/.github/ISSUE_TEMPLATE/bug_report.md +0 -33
- package/.github/ISSUE_TEMPLATE/config.yml +0 -5
- package/.github/ISSUE_TEMPLATE/feature_request.md +0 -23
- package/.github/PULL_REQUEST_TEMPLATE.md +0 -19
- package/.github/dependabot.yml +0 -15
- package/.github/workflows/ci.yml +0 -62
- package/.github/workflows/release.yml +0 -50
- package/.prettierignore +0 -19
- package/.prettierrc.json +0 -20
- package/CODE_OF_CONDUCT.md +0 -83
- package/CONTRIBUTING.md +0 -111
- package/bench/README.md +0 -544
- package/bench/results/agent-tier-2026-05-22.md +0 -28
- package/bench/results/agent-tier-summary.md +0 -44
- package/bench/results/baseline-tier-2026-05-22.md +0 -23
- package/bench/results/baseline.json +0 -810
- package/bench/results/baseline.md +0 -28
- package/bench/run-agent-tier-automated.ts +0 -234
- package/bench/run-agent-tier.md +0 -125
- package/bench/run-baseline-tier.ts +0 -200
- package/bench/run.ts +0 -210
- package/bench/runner-baseline.ts +0 -177
- package/bench/runner-graphpilot.ts +0 -131
- package/bench/score-agent-tier.ts +0 -191
- package/bench/score.ts +0 -59
- package/bench/tasks.ts +0 -236
- package/dist/provenance.d.ts +0 -74
- package/dist/provenance.js +0 -95
- package/dist/provenance.js.map +0 -1
- package/docs/architecture.md +0 -311
- package/docs/limitations.md +0 -156
- package/docs/mcp-setup.md +0 -231
- package/docs/quickstart.md +0 -202
- package/eslint.config.js +0 -148
- package/lefthook.yml +0 -81
- package/pnpm-workspace.yaml +0 -6
- package/scripts/smoke-stdio.mjs +0 -97
- package/src/cli.ts +0 -171
- package/src/edges.ts +0 -202
- package/src/git.ts +0 -255
- package/src/graph-schema.ts +0 -229
- package/src/impact.ts +0 -218
- package/src/indexer.ts +0 -152
- package/src/interactions.ts +0 -0
- package/src/mcp.ts +0 -652
- package/src/parser.ts +0 -138
- package/src/provenance.ts +0 -115
- package/src/query.ts +0 -148
- package/src/redact.ts +0 -122
- package/src/storage.ts +0 -115
- package/src/symbols.ts +0 -173
- package/src/validation.ts +0 -69
- package/src/validators.ts +0 -253
- package/src/watcher.ts +0 -383
- package/tests/edges.test.ts +0 -175
- package/tests/fixtures/sample.ts +0 -32
- package/tests/git.test.ts +0 -303
- package/tests/graph-schema.test.ts +0 -321
- package/tests/impact.test.ts +0 -454
- package/tests/interactions.test.ts +0 -180
- package/tests/lint-policy.test.ts +0 -106
- package/tests/mcp-stdio.test.ts +0 -171
- package/tests/mcp.test.ts +0 -335
- package/tests/parser.test.ts +0 -31
- package/tests/provenance.test.ts +0 -132
- package/tests/query.test.ts +0 -160
- package/tests/redact.test.ts +0 -167
- package/tests/security.test.ts +0 -144
- package/tests/symbols.test.ts +0 -78
- package/tests/validators.test.ts +0 -193
- package/tests/watcher.test.ts +0 -250
- package/tsconfig.json +0 -18
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
# GraphPilot Benchmark — 2026-05-20T06:13:00.314Z
|
|
2
|
-
|
|
3
|
-
Corpus: `<graphpilot-repo>`
|
|
4
|
-
graphpilot v0.0.1
|
|
5
|
-
Node v23.11.0 on darwin
|
|
6
|
-
|
|
7
|
-
## Aggregate
|
|
8
|
-
|
|
9
|
-
- Tasks run: **10**
|
|
10
|
-
- F1 (avg): graphpilot **0.89** vs grep **0.42**
|
|
11
|
-
- Bytes processed (total): graphpilot **721B** vs grep **528.1KB** (99.9% reduction)
|
|
12
|
-
- Winner counts: graphpilot **7** · grep **1** · tie **2**
|
|
13
|
-
- Expected-winner accuracy: **9/10** (90%)
|
|
14
|
-
|
|
15
|
-
## Per-task
|
|
16
|
-
|
|
17
|
-
| # | Task | GP F1 | Grep F1 | GP bytes | Grep bytes | Winner | Expected |
|
|
18
|
-
|---|---|---|---|---|---|---|---|
|
|
19
|
-
| t01-callers-analyzeImpact | Find every function that calls analyzeImpact | 1.00 | 0.00 | 18B | 48.8KB | graphpilot | graphpilot ✓ |
|
|
20
|
-
| t02-callers-extractSymbols | Find every direct caller of extractSymbols | 1.00 | 0.00 | 44B | 43.6KB | graphpilot | graphpilot ✓ |
|
|
21
|
-
| t03-callers-validateRootPath | Find every direct caller of validateRootPath | 1.00 | 0.00 | 49B | 48.5KB | graphpilot | graphpilot ✓ |
|
|
22
|
-
| t04-recall-substring-parse | Find every symbol whose name contains "parse" | 1.00 | 0.50 | 65B | 148.1KB | graphpilot | graphpilot ✓ |
|
|
23
|
-
| t05-kind-filter-interfaces | Enumerate all TypeScript interfaces under src/ | 1.00 | 1.00 | 342B | 88.9KB | tie | graphpilot ✗ |
|
|
24
|
-
| t06-impact-extractSymbols-depth2 | Compute blast radius of changing extractSymbols (depth 2) | 0.92 | 0.00 | 99B | 43.6KB | graphpilot | graphpilot ✓ |
|
|
25
|
-
| t07-tests-affected-parseFile | Identify test files that exercise parseFile (directly) | 1.00 | 0.33 | 25B | 48.8KB | graphpilot | graphpilot ✓ |
|
|
26
|
-
| t08-recall-substring-args | Find every MCP-tool input-args interface | 1.00 | 0.48 | 75B | 33.3KB | graphpilot | graphpilot ✓ |
|
|
27
|
-
| t09-recall-miss | Look up a symbol that does not exist (negative test) | 1.00 | 1.00 | 2B | 6.9KB | tie | tie ✓ |
|
|
28
|
-
| t10-string-literal-MAX_FILE_BYTES | Find every literal occurrence of the constant name "MAX_FILE_BYTES" | 0.00 | 0.86 | 2B | 17.5KB | grep | grep ✓ |
|
|
@@ -1,234 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Automated Tier-B Agent Benchmark Runner
|
|
3
|
-
*
|
|
4
|
-
* Instead of running Claude Code GUI sessions manually, this script:
|
|
5
|
-
* 1. Programmatically calls the same gp_* tools that an agent would
|
|
6
|
-
* 2. Measures structural correctness (blast radius, callers, etc.)
|
|
7
|
-
* 3. Simulates agent reasoning by checking if key data was present
|
|
8
|
-
* 4. Produces the per-task metrics table
|
|
9
|
-
*
|
|
10
|
-
* This is a proxy for real LLM agent behavior; it measures tool quality
|
|
11
|
-
* rather than agent reasoning quality. But it's reproducible and fast.
|
|
12
|
-
*/
|
|
13
|
-
|
|
14
|
-
import * as fs from 'node:fs';
|
|
15
|
-
import { GraphIndex } from '../src/query.js';
|
|
16
|
-
import { loadGraph } from '../src/storage.js';
|
|
17
|
-
import { analyzeImpact } from '../src/impact.js';
|
|
18
|
-
import { TASKS } from './tasks.js';
|
|
19
|
-
import { getChangedFiles, readGitInfo } from '../src/git.js';
|
|
20
|
-
import type { SymbolRecord, CallEdge } from '../src/symbols.js';
|
|
21
|
-
|
|
22
|
-
/** Scoring record produced for a single benchmark task. */
interface TaskMetrics {
  /** Id of the task this record belongs to. */
  taskId: string;
  /** Human-readable description copied from the task definition. */
  description: string;
  /** Task category copied from the task definition. */
  kind: string;
  /** Did GP find all ground-truth results? */
  success: boolean;
  /** |found ∩ truth| / |truth| */
  recall: number;
  /** |found ∩ truth| / |found| */
  precision: number;
  /** Harmonic mean of precision and recall. */
  f1: number;
  /** Number of results that are not in the ground truth. */
  hallucinations: number;
  /** Whether results carry a `file:line @ sha` anchor. */
  evidenceAnchorsPresent: boolean;
  /** Rough proxy: response size in chars / 4. */
  tokenEstimate: number;
  /** Free-form diagnostic text. */
  notes: string;
}
|
|
35
|
-
|
|
36
|
-
/**
 * Renders a symbol's location as `file:line`, followed by ` @ <first 7 chars of sha>`
 * when a non-empty SHA is supplied.
 *
 * @param s - Symbol whose `file` and `line` are printed.
 * @param sha - Commit SHA to append, or `null` to omit the suffix.
 * @returns The formatted location string.
 */
function formatProvenance(s: SymbolRecord, sha: string | null): string {
  const location = `${s.file}:${s.line}`;
  if (!sha) {
    return location;
  }
  return `${location} @ ${sha.slice(0, 7)}`;
}
|
|
40
|
-
|
|
41
|
-
/**
 * Runs one benchmark task against the graph index and scores what was found
 * against the task's ground truth.
 *
 * The symbol names found are compared to `task.groundTruth` as sets. Recall and
 * precision default to 1 when their denominator is empty, and `success` requires
 * both to be exactly 1. Any exception thrown while querying is captured into the
 * response text instead of aborting the run.
 *
 * @param idx - Query index built from the loaded graph.
 * @param taskId - Id of an entry in `TASKS`.
 * @param graph - Loaded graph; only `indexedSha` is read here.
 * @returns Metrics for the task, with recall/precision/F1 rounded to two decimals.
 * @throws Error when `taskId` does not match any task.
 */
async function runTask(idx: GraphIndex, taskId: string, graph: any): Promise<TaskMetrics> {
  const task = TASKS.find((t) => t.id === taskId);
  if (!task) throw new Error(`Task ${taskId} not found`);

  const shortSha = graph.indexedSha ? graph.indexedSha.slice(0, 7) : null;

  // Symbols on the calling side of each edge into `target` (at most 100 edges).
  // Ids that do not resolve (null or undefined) are dropped by a real type guard
  // rather than being hidden behind a cast.
  const callersOf = (target: SymbolRecord): SymbolRecord[] =>
    idx
      .callers(target.id, { limit: 100 })
      .map((e) => idx.findById(e.fromId))
      .filter((s): s is SymbolRecord => s != null);

  let found: SymbolRecord[] = [];
  let responseText = '';

  try {
    switch (task.kind) {
      case 'callers': {
        const target = idx.resolveSymbol(task.query);
        if (target) {
          found = callersOf(target);
          responseText = found
            .map((s) => `${s.name} @ ${formatProvenance(s, shortSha)}`)
            .join('\n');
        }
        break;
      }

      case 'impact': {
        const target = idx.resolveSymbol(task.query);
        if (target) {
          const impact = analyzeImpact(idx, task.query);
          if (impact) {
            found = [
              ...impact.directCallers.map((c) => c.symbol),
              ...impact.transitiveCallers.map((c) => c.symbol),
            ];
            responseText = [
              `Direct: ${impact.directCallers.map((c) => c.symbol.name).join(', ')}`,
              `Transitive: ${impact.transitiveCallers.map((c) => c.symbol.name).join(', ')}`,
              impact.directCallers
                .map((c) => `  ${c.symbol.name} @ ${formatProvenance(c.symbol, shortSha)}`)
                .join('\n'),
            ].join('\n');
          }
        }
        break;
      }

      case 'impact-since': {
        // Differential impact — simulated with empty changed files (clean repo)
        const target = idx.resolveSymbol(task.query);
        if (target) {
          const impact = analyzeImpact(idx, task.query, { changedFiles: new Set() });
          if (impact) {
            found = [];
            responseText = `(filtered to 0 files changed since HEAD~1)`;
          }
        }
        break;
      }

      case 'recall':
      case 'recall-substring': {
        found = idx.findByName(task.query, { substring: task.kind === 'recall-substring' });
        responseText = found
          .map((s) => `${s.name} (${s.kind}) @ ${formatProvenance(s, shortSha)}`)
          .join('\n');
        break;
      }

      case 'kind-filter': {
        found = idx.findByKind(task.query as any);
        responseText = found.map((s) => `${s.name} @ ${formatProvenance(s, shortSha)}`).join('\n');
        break;
      }

      case 'tests-affected': {
        const target = idx.resolveSymbol(task.query);
        if (target) {
          found = callersOf(target).filter((s) => s.file.includes('test'));
          // Emit the same `file:line @ sha` anchor as the other branches so the
          // anchor check below does not report this task as unanchored.
          responseText = found.map((s) => formatProvenance(s, shortSha)).join('\n');
        }
        break;
      }

      case 'recall-miss': {
        found = idx.findByName(task.query);
        responseText =
          found.length === 0 ? '[not found in index]' : found.map((s) => s.name).join(', ');
        break;
      }

      case 'string-literal': {
        // We can't search text; skip this (would be grep-only)
        found = [];
        responseText = '[string search not implemented in GraphPilot]';
        break;
      }
    }
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    responseText = `[ERROR: ${msg}]`;
  }

  // Score against ground truth
  const truth = new Set(task.groundTruth);
  const foundNames = new Set(found.map((s) => s.name));

  const intersection = new Set([...foundNames].filter((n) => truth.has(n)));
  const recall = truth.size > 0 ? intersection.size / truth.size : 1;
  const precision = foundNames.size > 0 ? intersection.size / foundNames.size : 1;
  const f1 = precision + recall > 0 ? (2 * (precision * recall)) / (precision + recall) : 0;

  const success = recall === 1 && precision === 1;
  const hallucinations = foundNames.size - intersection.size;

  // Anchors count as present when at least one `:<line> @ <7 hex chars>` appears
  // in the response, or when there was nothing to anchor.
  const evidenceAnchorsPresent = /:\d+\s*@\s*[0-9a-f]{7}/.test(responseText) || found.length === 0;

  const tokenEstimate = Math.ceil(responseText.length / 4);

  return {
    taskId,
    description: task.description,
    kind: task.kind,
    success,
    recall: Math.round(recall * 100) / 100,
    precision: Math.round(precision * 100) / 100,
    f1: Math.round(f1 * 100) / 100,
    hallucinations,
    evidenceAnchorsPresent,
    tokenEstimate,
    notes: `truth=${Array.from(truth).join(',')} found=${Array.from(foundNames).join(',')}`,
  };
}
|
|
176
|
-
|
|
177
|
-
/**
 * Entry point: loads the graph for the current working directory, runs every
 * task in `TASKS`, prints a per-task line, and writes a markdown report to
 * `bench/results/agent-tier-<YYYY-MM-DD>.md`.
 *
 * Exits with status 1 when no graph can be loaded for the current directory.
 */
async function main(): Promise<void> {
  const repoPath = process.cwd();
  const graph = loadGraph(repoPath);
  if (!graph) {
    console.error(`No graph.json found for ${repoPath}. Run: node dist/cli.js index .`);
    process.exit(1);
  }

  const idx = new GraphIndex(graph);
  const results: TaskMetrics[] = [];

  console.log(`Running ${TASKS.length} tasks against indexed GraphPilot...\n`);

  for (const task of TASKS) {
    const metrics = await runTask(idx, task.id, graph);
    results.push(metrics);
    const icon = metrics.success ? '✓' : '✗';
    console.log(
      `${icon} ${metrics.taskId}: F1=${metrics.f1} recall=${metrics.recall} prec=${metrics.precision}`,
    );
  }

  // Write results
  const timestamp = new Date().toISOString().split('T')[0];
  const resultsPath = `bench/results/agent-tier-${timestamp}.md`;

  let md = `# Tier-B Benchmark Results (Automated)\n\n`;
  md += `Timestamp: ${new Date().toISOString()}\n\n`;
  md += `## Per-Task Metrics\n\n`;
  md += `| Task | Description | Success | Recall | Precision | F1 | Halluc | Anchors |\n`;
  md += `|---|---|---|---|---|---|---|---|\n`;

  let totalSuccess = 0;
  let totalHalluc = 0;

  for (const m of results) {
    const success = m.success ? '✓' : '✗';
    const anchors = m.evidenceAnchorsPresent ? '✓' : '✗';
    md += `| ${m.taskId} | ${m.description} | ${success} | ${m.recall} | ${m.precision} | ${m.f1} | ${m.hallucinations} | ${anchors} |\n`;
    totalSuccess += m.success ? 1 : 0;
    totalHalluc += m.hallucinations;
  }

  // Numerator and denominator must be drawn from the same population: string-literal
  // tasks are excluded from both, otherwise the ratio can exceed 1.
  const anchorEligible = results.filter((r) => r.kind !== 'string-literal');
  const anchoredCount = anchorEligible.filter((r) => r.evidenceAnchorsPresent).length;

  // Avoid printing NaN when there are no tasks at all.
  const f1Sum = results.reduce((n, r) => n + r.f1, 0);
  const meanF1 = results.length > 0 ? f1Sum / results.length : 0;

  md += `\n## Summary\n\n`;
  md += `- **Tasks passed:** ${totalSuccess}/${results.length}\n`;
  md += `- **Total hallucinations:** ${totalHalluc}\n`;
  md += `- **Evidence anchors:** ${anchoredCount}/${anchorEligible.length} (excluding string-search)\n`;
  md += `- **Mean F1 across tasks:** ${meanF1.toFixed(2)}\n`;

  fs.mkdirSync('bench/results', { recursive: true });
  fs.writeFileSync(resultsPath, md);

  console.log(`\nResults written to ${resultsPath}`);
  console.log(`\n${md}`);
}

// Report the failure and make the process exit non-zero so callers can detect it.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|
package/bench/run-agent-tier.md
DELETED
|
@@ -1,125 +0,0 @@
|
|
|
1
|
-
# Tier-B Agent Benchmark — Spec
|
|
2
|
-
|
|
3
|
-
> The launch headline ("Claude Code with GraphPilot succeeded on X/10
|
|
4
|
-
> refactor tasks vs Y/10 without") lives here. This is the **manual
|
|
5
|
-
> turn-the-crank session** that produces those numbers.
|
|
6
|
-
>
|
|
7
|
-
> **Status:** spec only. Numbers not yet produced. Tier A (in
|
|
8
|
-
> [README.md](README.md)) covers the deterministic, tool-only
|
|
9
|
-
> comparison. Tier B adds an LLM in the loop.
|
|
10
|
-
|
|
11
|
-
## Why Tier B is separate
|
|
12
|
-
|
|
13
|
-
Tier A measures _whether the tools return the right info_. Tier B
|
|
14
|
-
measures _whether the agent reaches the right conclusion using those
|
|
15
|
-
tools_. Both matter; they answer different questions.
|
|
16
|
-
|
|
17
|
-
Tier A is automatable. Tier B is not — it requires:
|
|
18
|
-
|
|
19
|
-
1. Running real Claude Code sessions
|
|
20
|
-
2. Scoring "did the agent reach the right answer?" by hand
|
|
21
|
-
3. Recording token usage from the agent's logs
|
|
22
|
-
|
|
23
|
-
That's ~4–6 hours of focused human work. Out of scope for a single
|
|
24
|
-
benchmark commit; in scope for a separate launch-prep session.
|
|
25
|
-
|
|
26
|
-
## Method
|
|
27
|
-
|
|
28
|
-
### Setup
|
|
29
|
-
|
|
30
|
-
- A test repo (preferably `microsoft/TypeScript` — large enough to
|
|
31
|
-
matter, recognizable to readers)
|
|
32
|
-
- Three Claude Code configurations:
|
|
33
|
-
- **Baseline:** vanilla Claude Code, no MCP servers
|
|
34
|
-
- **With GraphPilot:** Claude Code with the graphpilot MCP server
|
|
35
|
-
configured + a CLAUDE.md routing rule pointing structural questions
|
|
36
|
-
at the gp\_\* tools
|
|
37
|
-
- **With CodeGraphContext** (optional but punchy): the closest OSS
|
|
38
|
-
competitor, same setup
|
|
39
|
-
|
|
40
|
-
### The 10 tasks
|
|
41
|
-
|
|
42
|
-
These mirror the Tier-A corpus but are phrased as natural-language
|
|
43
|
-
refactor prompts:
|
|
44
|
-
|
|
45
|
-
1. Rename `createSourceFile` everywhere it's called
|
|
46
|
-
2. Find every function that catches but ignores errors
|
|
47
|
-
3. List the public API exported from `src/compiler/` (or pick one module)
|
|
48
|
-
4. Find the shortest call path from `parser.ts` to a syscall (`fs.write*`)
|
|
49
|
-
5. Find functions never called by any test
|
|
50
|
-
6. Which functions take `Diagnostic` as a parameter?
|
|
51
|
-
7. Find all callers of a function flagged `@deprecated`
|
|
52
|
-
8. Locate the function that emits a specific error message text
|
|
53
|
-
9. Trace a value from CLI input to where it's logged (expect agents to
|
|
54
|
-
fail this — taint analysis isn't our beat)
|
|
55
|
-
10. Find HTTP routes without auth middleware (expect failure — no
|
|
56
|
-
framework-aware tooling in v1)
|
|
57
|
-
|
|
58
|
-
Tasks 9 and 10 are **deliberate "graphpilot loses" tasks**. Including
|
|
59
|
-
them is what keeps the result believable.
|
|
60
|
-
|
|
61
|
-
### Metrics per task
|
|
62
|
-
|
|
63
|
-
For each `(task, condition)` cell:
|
|
64
|
-
|
|
65
|
-
| Metric | How |
|
|
66
|
-
| ----------------------- | -------------------------------------------------- |
|
|
67
|
-
| **Task success** (0/1) | Human eval against a hand-written rubric |
|
|
68
|
-
| **Hallucination count** | Manual count of fabricated names / paths / imports |
|
|
69
|
-
| **Token cost** | Sum of input+output tokens from Claude Code's log |
|
|
70
|
-
| **Wall-clock** | Stopwatch from prompt-submit to final answer |
|
|
71
|
-
| **Clean patch apply** | Did the proposed diff apply without conflict? |
|
|
72
|
-
|
|
73
|
-
### Scoring
|
|
74
|
-
|
|
75
|
-
Aggregate the per-task numbers into the headline:
|
|
76
|
-
|
|
77
|
-
```
|
|
78
|
-
Claude Code alone: N/10 tasks succeeded
|
|
79
|
-
Claude Code + GraphPilot: M/10 tasks succeeded
|
|
80
|
-
Token cost: −X%
|
|
81
|
-
Hallucinations: −Y%
|
|
82
|
-
```
|
|
83
|
-
|
|
84
|
-
If reality comes back at 5/10 vs 4/10, publish that — don't fake it.
|
|
85
|
-
|
|
86
|
-
## Runbook (when the session happens)
|
|
87
|
-
|
|
88
|
-
1. Clone the corpus repo (e.g. `microsoft/TypeScript`) to a clean dir
|
|
89
|
-
2. Configure Claude Code three ways (vanilla / + graphpilot / + CGC)
|
|
90
|
-
3. For each task: open a fresh session in each config, paste the prompt,
|
|
91
|
-
run until Claude produces an answer or gives up, score the result
|
|
92
|
-
4. Tally totals; write the per-task table into
|
|
93
|
-
`bench/results/agent-tier-<date>.md`
|
|
94
|
-
5. Drop the headline into the project README
|
|
95
|
-
|
|
96
|
-
## Why we haven't done this yet
|
|
97
|
-
|
|
98
|
-
- Tier A produces real, publishable numbers in <1 minute and locks in
|
|
99
|
-
the methodology. Better to have that floor than to launch with no
|
|
100
|
-
numbers because Tier B is half-done.
|
|
101
|
-
- Running Tier B costs real money (~$10–20 per pass in Claude tokens)
|
|
102
|
-
and ~4–6 hours of attention. Worth doing right, in a focused session,
|
|
103
|
-
not interleaved with development.
|
|
104
|
-
- The Tier-A bytes-reduction number (99.9 %) is _already_ sufficient
|
|
105
|
-
for a Show HN headline: _"99% fewer tokens needed to answer
|
|
106
|
-
structural questions in your TypeScript codebase."_
|
|
107
|
-
|
|
108
|
-
## Estimated effort
|
|
109
|
-
|
|
110
|
-
- Setup: 30 min
|
|
111
|
-
- Run + score: 3–4 hours (10 tasks × 3 conditions × ~6 min)
|
|
112
|
-
- Writeup + numbers into README: 30 min
|
|
113
|
-
|
|
114
|
-
Total: half a working day.
|
|
115
|
-
|
|
116
|
-
## What to do if Tier-B numbers are mediocre
|
|
117
|
-
|
|
118
|
-
If "Claude Code + GraphPilot" comes back at 6/10 vs 5/10 baseline, the
|
|
119
|
-
honest move is:
|
|
120
|
-
|
|
121
|
-
1. Publish the real number
|
|
122
|
-
2. Reframe the launch around Tier A (where the win is huge)
|
|
123
|
-
3. Investigate WHY the agent didn't translate tool quality into answer
|
|
124
|
-
quality (probably: tool descriptions not aggressive enough, or
|
|
125
|
-
CLAUDE.md routing not strong enough). Fix and re-run before launch.
|
|
@@ -1,200 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Baseline Tier-B: vanilla grep + CLI tools
|
|
3
|
-
*
|
|
4
|
-
* Simulates what an agent would do without GraphPilot:
|
|
5
|
-
* - Use `grep -r` for queries
|
|
6
|
-
* - No structured index, no blast-radius analysis
|
|
7
|
-
* - High noise (false positives in comments, strings)
|
|
8
|
-
*
|
|
9
|
-
* This is a strawman baseline; real agents might use LSP or IDEs.
|
|
10
|
-
* But grep represents the cost of *no* structured indexing.
|
|
11
|
-
*/
|
|
12
|
-
|
|
13
|
-
import { execFileSync, execSync } from 'node:child_process';
|
|
14
|
-
import * as fs from 'node:fs';
|
|
15
|
-
import { TASKS } from './tasks.js';
|
|
16
|
-
|
|
17
|
-
/** Scoring record produced for a single task of the grep baseline. */
interface TaskMetrics {
  /** Id of the task this record belongs to. */
  taskId: string;
  /** Human-readable description copied from the task definition. */
  description: string;
  /** Task category copied from the task definition. */
  kind: string;
  /** Whether the task counted as passed. */
  success: boolean;
  /** Share of ground-truth entries that were found. */
  recall: number;
  /** Share of found entries that are in the ground truth. */
  precision: number;
  /** Harmonic mean of precision and recall. */
  f1: number;
  /** Number of found entries that are not in the ground truth. */
  hallucinations: number;
  /** Estimated token count of the response text. */
  tokenEstimate: number;
  /** Free-form diagnostic text. */
  notes: string;
}
|
|
29
|
-
|
|
30
|
-
/**
 * Runs a recursive `grep` over `.ts`/`.tsx` files under `src` and `tests`
 * (relative to the current working directory) and returns the non-empty
 * output lines.
 *
 * grep is invoked directly with an argument vector instead of through a shell
 * command string. A shell would strip the backslashes from patterns such as
 * `\bfoo\s*\(` and could glob-expand `*`, so grep would never see the regex
 * the caller wrote. `-E` is passed because callers write extended-regex
 * patterns (`\(` as a literal parenthesis, `+` as a quantifier), and `-e`
 * keeps a pattern that starts with `-` from being parsed as an option.
 *
 * @param pattern - Extended regular expression to search for.
 * @param options - Extra grep arguments, inserted before the pattern.
 * @returns Matching output lines; an empty array when nothing matches or grep fails.
 */
function runGrep(pattern: string, options: string[] = []): string[] {
  const args = [
    '-r',
    '-E',
    '--include=*.ts',
    '--include=*.tsx',
    ...options,
    '-e',
    pattern,
    'src',
    'tests',
  ];
  try {
    const output = execFileSync('grep', args, {
      encoding: 'utf8',
      cwd: '.',
      stdio: ['pipe', 'pipe', 'ignore'],
    }).trim();
    if (!output) return [];
    return output.split('\n').filter((line) => line.length > 0);
  } catch {
    // grep exits non-zero both for "no match" and for real errors; this
    // baseline deliberately treats either as "nothing found".
    return [];
  }
}
|
|
53
|
-
|
|
54
|
-
/**
 * Collects every identifier-shaped token (a letter or underscore followed by
 * letters, digits or underscores) that appears anywhere in the given lines.
 *
 * @param lines - Raw text lines to scan.
 * @returns The distinct tokens, in order of first appearance.
 */
function extractSymbolsFromGrep(lines: string[]): Set<string> {
  const identifier = /\b([a-zA-Z_][a-zA-Z0-9_]*)\b/g;
  return new Set(lines.flatMap((text) => text.match(identifier) ?? []));
}
|
|
62
|
-
|
|
63
|
-
/**
 * Runs one benchmark task using grep only and scores the tokens it turned up
 * against the task's ground truth.
 *
 * Recall and precision default to 1 when their denominator is empty, and
 * `success` requires both to be exactly 1. Any exception thrown while
 * searching is captured into the response text instead of aborting the run.
 *
 * @param taskId - Id of an entry in `TASKS`.
 * @returns Metrics for the task, with recall/precision/F1 rounded to two decimals.
 * @throws Error when `taskId` does not match any task.
 */
async function runTask(taskId: string): Promise<TaskMetrics> {
  const task = TASKS.find((t) => t.id === taskId);
  if (!task) throw new Error(`Task ${taskId} not found`);

  // Only the first 500 characters of grep output are kept as the response text.
  const preview = (lines: string[]): string => lines.join('\n').slice(0, 500);

  let found = new Set<string>();
  let responseText = '';

  try {
    switch (task.kind) {
      case 'callers': {
        // grep for function call pattern (naive heuristic)
        const lines = runGrep(`\\b${task.query}\\s*\\(`);
        found = extractSymbolsFromGrep(lines);
        responseText = preview(lines);
        break;
      }

      case 'impact':
      case 'impact-since': {
        // Can't compute blast radius with grep — too many false positives
        // Simulate by grepping for the function name everywhere
        const lines = runGrep(`\\b${task.query}\\b`);
        found = extractSymbolsFromGrep(lines);
        responseText = `(grep can't compute blast radius; found ${lines.length} occurrences)`;
        break;
      }

      case 'recall':
      case 'recall-substring': {
        const pattern = task.kind === 'recall-substring' ? task.query : `\\b${task.query}\\b`;
        const lines = runGrep(pattern);
        found = extractSymbolsFromGrep(lines);
        responseText = preview(lines);
        break;
      }

      case 'kind-filter': {
        // grep for 'interface Foo', 'function bar', etc.
        const lines = runGrep(`${task.query}\\s+[a-zA-Z_]`);
        found = extractSymbolsFromGrep(lines);
        responseText = preview(lines);
        break;
      }

      case 'tests-affected': {
        // Passing 'tests' through runGrep's `options` would place it ahead of the
        // pattern, where grep reads it as the regex. Search normally instead and
        // keep only the hits whose path is under tests/.
        const lines = runGrep(`\\b${task.query}\\b`).filter((l) => l.startsWith('tests/'));
        found = extractSymbolsFromGrep(lines);
        responseText = preview(lines);
        break;
      }

      case 'recall-miss': {
        const lines = runGrep(`\\b${task.query}\\b`);
        found = extractSymbolsFromGrep(lines);
        responseText = found.size === 0 ? '[not found in grep]' : found.size.toString();
        break;
      }

      case 'string-literal': {
        const lines = runGrep(task.query);
        found = new Set(lines.map((l) => l.split(':')[0])); // file paths
        responseText = preview(lines);
        break;
      }
    }
  } catch (err) {
    responseText = `[ERROR: ${err}]`;
  }

  // Score
  const truth = new Set(task.groundTruth);
  const intersection = new Set([...found].filter((n) => truth.has(n)));
  const recall = truth.size > 0 ? intersection.size / truth.size : 1;
  const precision = found.size > 0 ? intersection.size / found.size : 1;
  const f1 = precision + recall > 0 ? (2 * (precision * recall)) / (precision + recall) : 0;

  const success = recall === 1 && precision === 1;
  const hallucinations = found.size - intersection.size;
  const tokenEstimate = Math.ceil(responseText.length / 4);

  return {
    taskId,
    description: task.description,
    kind: task.kind,
    success,
    recall: Math.round(recall * 100) / 100,
    precision: Math.round(precision * 100) / 100,
    f1: Math.round(f1 * 100) / 100,
    hallucinations,
    tokenEstimate,
    notes: `truth=${Array.from(truth).join(',')} found=${Array.from(found).join(',')}`,
  };
}
|
|
156
|
-
|
|
157
|
-
/**
 * Entry point: runs every task in `TASKS` through the grep baseline, prints a
 * per-task line, and writes a markdown report to
 * `bench/results/baseline-tier-<YYYY-MM-DD>.md`.
 */
async function main(): Promise<void> {
  const results: TaskMetrics[] = [];

  console.log(`Running ${TASKS.length} tasks with grep baseline...\n`);

  for (const task of TASKS) {
    const metrics = await runTask(task.id);
    results.push(metrics);
    const icon = metrics.success ? '✓' : '✗';
    console.log(
      `${icon} ${metrics.taskId}: F1=${metrics.f1} recall=${metrics.recall} prec=${metrics.precision}`,
    );
  }

  const timestamp = new Date().toISOString().split('T')[0];
  const resultsPath = `bench/results/baseline-tier-${timestamp}.md`;

  let md = `# Baseline Tier-B (grep)\n\n`;
  md += `| Task | Description | Success | Recall | Precision | F1 | Halluc |\n`;
  md += `|---|---|---|---|---|---|---|\n`;

  let totalSuccess = 0;
  let totalHalluc = 0;

  for (const m of results) {
    const success = m.success ? '✓' : '✗';
    md += `| ${m.taskId} | ${m.description} | ${success} | ${m.recall} | ${m.precision} | ${m.f1} | ${m.hallucinations} |\n`;
    totalSuccess += m.success ? 1 : 0;
    totalHalluc += m.hallucinations;
  }

  // Avoid printing NaN when there are no tasks at all.
  const f1Sum = results.reduce((n, r) => n + r.f1, 0);
  const meanF1 = results.length > 0 ? f1Sum / results.length : 0;

  md += `\n## Summary\n\n`;
  md += `- **Tasks passed:** ${totalSuccess}/${results.length}\n`;
  md += `- **Total hallucinations:** ${totalHalluc}\n`;
  md += `- **Mean F1:** ${meanF1.toFixed(2)}\n`;

  fs.mkdirSync('bench/results', { recursive: true });
  fs.writeFileSync(resultsPath, md);

  console.log(`\nResults written to ${resultsPath}`);
  console.log(`\n${md}`);
}

// Report the failure and make the process exit non-zero so callers can detect it.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|