@graphpilot-oss/graphpilot 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +73 -126
- package/README.md +359 -101
- package/dist/cli.js +20 -0
- package/dist/cli.js.map +1 -1
- package/dist/indexer.js +3 -3
- package/dist/indexer.js.map +1 -1
- package/dist/init.d.ts +28 -0
- package/dist/init.js +112 -0
- package/dist/init.js.map +1 -0
- package/dist/interactions.d.ts +5 -4
- package/dist/interactions.js +0 -0
- package/dist/interactions.js.map +1 -1
- package/dist/mcp.js +126 -46
- package/dist/mcp.js.map +1 -1
- package/dist/repo-resolve.d.ts +47 -0
- package/dist/repo-resolve.js +195 -0
- package/dist/repo-resolve.js.map +1 -0
- package/dist/storage.js +10 -1
- package/dist/storage.js.map +1 -1
- package/dist/validation.js +30 -4
- package/dist/validation.js.map +1 -1
- package/dist/watcher.d.ts +10 -0
- package/dist/watcher.js +70 -7
- package/dist/watcher.js.map +1 -1
- package/examples/README.md +105 -0
- package/examples/claude-code/README.md +125 -0
- package/examples/claude-code/claude-routing.md +102 -0
- package/examples/claude-code/claude_config.json +8 -0
- package/examples/cline/.clinerules +39 -0
- package/examples/cline/README.md +104 -0
- package/examples/cline/cline_mcp_settings.json +10 -0
- package/examples/continue/.continuerules +39 -0
- package/examples/continue/README.md +98 -0
- package/examples/continue/config.json +13 -0
- package/examples/cursor/.cursorrules +39 -0
- package/examples/cursor/README.md +98 -0
- package/examples/cursor/mcp.json +11 -0
- package/examples/windsurf/.windsurfrules +39 -0
- package/examples/windsurf/README.md +85 -0
- package/examples/windsurf/mcp_config.json +8 -0
- package/package.json +12 -3
- package/.editorconfig +0 -15
- package/.github/CODEOWNERS +0 -22
- package/.github/FUNDING.yml +0 -1
- package/.github/ISSUE_TEMPLATE/bug_report.md +0 -33
- package/.github/ISSUE_TEMPLATE/config.yml +0 -5
- package/.github/ISSUE_TEMPLATE/feature_request.md +0 -23
- package/.github/PULL_REQUEST_TEMPLATE.md +0 -19
- package/.github/dependabot.yml +0 -15
- package/.github/workflows/ci.yml +0 -62
- package/.github/workflows/release.yml +0 -50
- package/.prettierignore +0 -19
- package/.prettierrc.json +0 -20
- package/CODE_OF_CONDUCT.md +0 -83
- package/CONTRIBUTING.md +0 -111
- package/bench/README.md +0 -544
- package/bench/results/agent-tier-2026-05-22.md +0 -28
- package/bench/results/agent-tier-summary.md +0 -44
- package/bench/results/baseline-tier-2026-05-22.md +0 -23
- package/bench/results/baseline.json +0 -810
- package/bench/results/baseline.md +0 -28
- package/bench/run-agent-tier-automated.ts +0 -234
- package/bench/run-agent-tier.md +0 -125
- package/bench/run-baseline-tier.ts +0 -200
- package/bench/run.ts +0 -210
- package/bench/runner-baseline.ts +0 -177
- package/bench/runner-graphpilot.ts +0 -131
- package/bench/score-agent-tier.ts +0 -191
- package/bench/score.ts +0 -59
- package/bench/tasks.ts +0 -236
- package/dist/provenance.d.ts +0 -74
- package/dist/provenance.js +0 -95
- package/dist/provenance.js.map +0 -1
- package/docs/architecture.md +0 -311
- package/docs/limitations.md +0 -156
- package/docs/mcp-setup.md +0 -231
- package/docs/quickstart.md +0 -202
- package/eslint.config.js +0 -148
- package/lefthook.yml +0 -81
- package/pnpm-workspace.yaml +0 -6
- package/scripts/smoke-stdio.mjs +0 -97
- package/src/cli.ts +0 -171
- package/src/edges.ts +0 -202
- package/src/git.ts +0 -255
- package/src/graph-schema.ts +0 -229
- package/src/impact.ts +0 -218
- package/src/indexer.ts +0 -152
- package/src/interactions.ts +0 -0
- package/src/mcp.ts +0 -652
- package/src/parser.ts +0 -138
- package/src/provenance.ts +0 -115
- package/src/query.ts +0 -148
- package/src/redact.ts +0 -122
- package/src/storage.ts +0 -115
- package/src/symbols.ts +0 -173
- package/src/validation.ts +0 -69
- package/src/validators.ts +0 -253
- package/src/watcher.ts +0 -383
- package/tests/edges.test.ts +0 -175
- package/tests/fixtures/sample.ts +0 -32
- package/tests/git.test.ts +0 -303
- package/tests/graph-schema.test.ts +0 -321
- package/tests/impact.test.ts +0 -454
- package/tests/interactions.test.ts +0 -180
- package/tests/lint-policy.test.ts +0 -106
- package/tests/mcp-stdio.test.ts +0 -171
- package/tests/mcp.test.ts +0 -335
- package/tests/parser.test.ts +0 -31
- package/tests/provenance.test.ts +0 -132
- package/tests/query.test.ts +0 -160
- package/tests/redact.test.ts +0 -167
- package/tests/security.test.ts +0 -144
- package/tests/symbols.test.ts +0 -78
- package/tests/validators.test.ts +0 -193
- package/tests/watcher.test.ts +0 -250
- package/tsconfig.json +0 -18
package/bench/run.ts
DELETED
|
@@ -1,210 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Tier-A benchmark runner. Runs each task in TASKS in two conditions
|
|
3
|
-
* (graphpilot, baseline grep), scores both, writes a JSON result file
|
|
4
|
-
* plus a markdown summary to bench/results/.
|
|
5
|
-
*
|
|
6
|
-
* Usage:
|
|
7
|
-
* pnpm bench [--repo=<path>] [--out=<file>]
|
|
8
|
-
*
|
|
9
|
-
* Defaults to running against the graphpilot repo itself
|
|
10
|
-
* (process.cwd()), which is the self-test corpus.
|
|
11
|
-
*/
|
|
12
|
-
|
|
13
|
-
import { mkdirSync, writeFileSync, readFileSync, existsSync } from 'node:fs';
|
|
14
|
-
import { join, resolve } from 'node:path';
|
|
15
|
-
import { TASKS, type Task } from './tasks.js';
|
|
16
|
-
import { GraphpilotRunner, type RunResult } from './runner-graphpilot.js';
|
|
17
|
-
import { BaselineRunner } from './runner-baseline.js';
|
|
18
|
-
import { score, type Scored } from './score.js';
|
|
19
|
-
|
|
20
|
-
/** Outcome of a single benchmark task, executed under both conditions. */
interface PerTaskResult {
  /** The task definition that was executed. */
  task: Task;
  /** GraphPilot condition: raw run output plus its score. */
  graphpilot: { run: RunResult; score: Scored };
  /** Grep-baseline condition: raw run output plus its score. */
  baseline: { run: RunResult; score: Scored };
  /** Which condition won this task, or 'tie'. */
  winner: 'graphpilot' | 'grep' | 'tie';
  /** Diagnostic: whether `winner` equals the task's `expectedWinner`. */
  expectedMatch: boolean;
}

/** Running totals accumulated over every task in one benchmark run. */
interface AggregateMetrics {
  /** Number of tasks in the run. */
  totalTasks: number;
  /** Sum (not mean) of GraphPilot F1 scores; divide by `totalTasks` for the average. */
  graphpilotF1Sum: number;
  /** Sum (not mean) of baseline F1 scores; divide by `totalTasks` for the average. */
  baselineF1Sum: number;
  /** Sum of `outputBytes` over all GraphPilot runs. */
  graphpilotBytesTotal: number;
  /** Sum of `outputBytes` over all baseline runs. */
  baselineBytesTotal: number;
  /** Tasks won by GraphPilot. */
  graphpilotWins: number;
  /** Tasks won by the grep baseline. */
  baselineWins: number;
  /** Tasks that ended in a tie. */
  ties: number;
  /** Tasks whose winner matched the task's expected winner. */
  expectedWinnerHits: number;
}

/** Complete benchmark report: written out as JSON and summarised as Markdown. */
interface BenchmarkReport {
  /** Environment and provenance of the run. */
  meta: {
    /** Absolute path of the repository the benchmark ran against. */
    corpus: string;
    /** ISO-8601 time at which the report was assembled. */
    timestamp: string;
    /** `version` field read from the corpus's package.json. */
    graphpilotVersion: string;
    /** Value of `process.version`. */
    nodeVersion: string;
    /** Value of `process.platform`. */
    platform: NodeJS.Platform;
  };
  /** Totals across all tasks. */
  aggregate: AggregateMetrics;
  /** One entry per executed task, in execution order. */
  perTask: PerTaskResult[];
}
|
|
52
|
-
|
|
53
|
-
/**
 * Decide which condition won a task by comparing F1 scores.
 *
 * Scores closer than 0.001 count as a tie. A non-finite F1 (for example NaN
 * produced by a 0/0 in the scorer) is treated as 0: NaN fails every
 * comparison, so without this guard an invalid *baseline* score would still
 * hand the win to grep.
 *
 * @param gp - Score of the GraphPilot run.
 * @param bl - Score of the grep-baseline run.
 * @returns 'graphpilot', 'grep', or 'tie'.
 */
function pickWinner(gp: Scored, bl: Scored): 'graphpilot' | 'grep' | 'tie' {
  const epsilon = 0.001;
  const gpF1 = Number.isFinite(gp.f1) ? gp.f1 : 0;
  const blF1 = Number.isFinite(bl.f1) ? bl.f1 : 0;
  if (Math.abs(gpF1 - blF1) < epsilon) return 'tie';
  return gpF1 > blF1 ? 'graphpilot' : 'grep';
}
|
|
58
|
-
|
|
59
|
-
/**
 * Render a number with a fixed count of decimal places.
 *
 * @param n - Value to format.
 * @param dp - Decimal places (defaults to 2).
 * @returns The fixed-point string, or '?' when `n` is NaN or infinite.
 */
function fmt(n: number, dp = 2): string {
  if (!Number.isFinite(n)) {
    return '?';
  }
  return n.toFixed(dp);
}
|
|
62
|
-
|
|
63
|
-
/**
 * Render a byte count with a binary-scaled unit (B, KB, MB).
 *
 * Values below 1024 are printed verbatim with a `B` suffix; larger values are
 * scaled by 1024 and shown with one decimal place.
 *
 * @param n - Number of bytes.
 * @returns The formatted size, or '?' when `n` is NaN or infinite (the same
 *   placeholder `fmt` uses, instead of leaking "NaNMB" into a report).
 */
function fmtBytes(n: number): string {
  if (!Number.isFinite(n)) return '?';
  const KIB = 1024;
  const MIB = KIB * KIB;
  if (n < KIB) return `${n}B`;
  if (n < MIB) return `${(n / KIB).toFixed(1)}KB`;
  return `${(n / MIB).toFixed(1)}MB`;
}
|
|
68
|
-
|
|
69
|
-
/**
 * Render a benchmark report as a Markdown document: a header with run
 * metadata, an aggregate bullet list, and one table row per task.
 *
 * Averages and percentages go through `fmt`, which prints '?' for non-finite
 * values, so a run with zero tasks or zero baseline bytes still renders.
 *
 * @param report - The report to summarise.
 * @returns The Markdown text, lines joined with '\n' and no trailing newline.
 */
function summaryMarkdown(report: BenchmarkReport): string {
  const { aggregate: a, meta, perTask } = report;

  // Free text must stay inside a single table cell: escape the column
  // separator and flatten line breaks, otherwise one description containing
  // '|' or a newline shifts or splits every column after it.
  const cell = (text: unknown): string =>
    String(text).replace(/\|/g, '\\|').replace(/\r?\n/g, ' ');

  const bytesReductionPct = (1 - a.graphpilotBytesTotal / a.baselineBytesTotal) * 100;

  const lines: string[] = [
    `# GraphPilot Benchmark — ${meta.timestamp}`,
    '',
    `Corpus: \`${meta.corpus}\``,
    `graphpilot v${meta.graphpilotVersion}`,
    `Node ${meta.nodeVersion} on ${meta.platform}`,
    '',
    '## Aggregate',
    '',
    `- Tasks run: **${a.totalTasks}**`,
    `- F1 (avg): graphpilot **${fmt(a.graphpilotF1Sum / a.totalTasks)}** ` +
      `vs grep **${fmt(a.baselineF1Sum / a.totalTasks)}**`,
    `- Bytes processed (total): graphpilot **${fmtBytes(a.graphpilotBytesTotal)}** ` +
      `vs grep **${fmtBytes(a.baselineBytesTotal)}**` +
      ` (${fmt(bytesReductionPct, 1)}% reduction)`,
    `- Winner counts: graphpilot **${a.graphpilotWins}** · grep **${a.baselineWins}** · tie **${a.ties}**`,
    `- Expected-winner accuracy: **${a.expectedWinnerHits}/${a.totalTasks}** ` +
      `(${fmt((a.expectedWinnerHits / a.totalTasks) * 100, 0)}%)`,
    '',
    '## Per-task',
    '',
    '| # | Task | GP F1 | Grep F1 | GP bytes | Grep bytes | Winner | Expected |',
    '|---|---|---|---|---|---|---|---|',
  ];

  for (const t of perTask) {
    const match = t.expectedMatch ? '✓' : '✗';
    lines.push(
      `| ${t.task.id} | ${cell(t.task.description)} ` +
        `| ${fmt(t.graphpilot.score.f1)} ` +
        `| ${fmt(t.baseline.score.f1)} ` +
        `| ${fmtBytes(t.graphpilot.run.outputBytes)} ` +
        `| ${fmtBytes(t.baseline.run.outputBytes)} ` +
        `| ${t.winner} ` +
        `| ${t.task.expectedWinner} ${match} |`,
    );
  }

  return lines.join('\n');
}
|
|
116
|
-
|
|
117
|
-
/**
 * CLI entry point for the Tier-A benchmark.
 *
 * Runs every task in `TASKS` through the GraphPilot runner and the grep
 * baseline, scores both against the task's ground truth, accumulates
 * aggregate metrics, then writes a JSON report and a Markdown summary and
 * echoes the summary to stdout.
 *
 * Flags (both optional, `--name=value` form only):
 *   --repo=<path>  repository to benchmark; defaults to `process.cwd()`
 *   --out=<file>   JSON report path; defaults to
 *                  `<repo>/bench/results/bench-<timestamp>.json`
 *
 * The Markdown summary is written next to the JSON file: `.json` is swapped
 * for `.md`, and when the JSON path has no `.json` suffix `.md` is appended
 * instead, so the summary can never overwrite the JSON report.
 *
 * @returns The process exit code (0 on success). Failures reject.
 */
async function main(): Promise<number> {
  const REPO_FLAG = '--repo=';
  const OUT_FLAG = '--out=';
  let repoArg: string | undefined;
  let outArg: string | undefined;
  for (const arg of process.argv.slice(2)) {
    if (arg.startsWith(REPO_FLAG)) repoArg = arg.slice(REPO_FLAG.length);
    else if (arg.startsWith(OUT_FLAG)) outArg = arg.slice(OUT_FLAG.length);
  }

  const repo = resolve(repoArg ?? process.cwd());
  const gp = new GraphpilotRunner(repo);
  const baseline = new BaselineRunner(repo);

  const perTask: PerTaskResult[] = [];
  const agg: AggregateMetrics = {
    totalTasks: TASKS.length,
    graphpilotF1Sum: 0,
    baselineF1Sum: 0,
    graphpilotBytesTotal: 0,
    baselineBytesTotal: 0,
    graphpilotWins: 0,
    baselineWins: 0,
    ties: 0,
    expectedWinnerHits: 0,
  };

  for (const task of TASKS) {
    const gpRun = gp.run(task);
    const blRun = baseline.run(task);

    const gpScore = score(gpRun.returned, task.groundTruth);
    const blScore = score(blRun.returned, task.groundTruth);
    const winner = pickWinner(gpScore, blScore);
    const expectedMatch = winner === task.expectedWinner;

    perTask.push({
      task,
      graphpilot: { run: gpRun, score: gpScore },
      baseline: { run: blRun, score: blScore },
      winner,
      expectedMatch,
    });

    agg.graphpilotF1Sum += gpScore.f1;
    agg.baselineF1Sum += blScore.f1;
    agg.graphpilotBytesTotal += gpRun.outputBytes;
    agg.baselineBytesTotal += blRun.outputBytes;
    if (winner === 'graphpilot') agg.graphpilotWins++;
    else if (winner === 'grep') agg.baselineWins++;
    else agg.ties++;
    if (expectedMatch) agg.expectedWinnerHits++;
  }

  // The version in the meta block comes from the corpus's own package.json.
  const pkgVersion = JSON.parse(readFileSync(join(repo, 'package.json'), 'utf8')).version as string;

  const report: BenchmarkReport = {
    meta: {
      corpus: repo,
      timestamp: new Date().toISOString(),
      graphpilotVersion: pkgVersion,
      nodeVersion: process.version,
      platform: process.platform,
    },
    aggregate: agg,
    perTask,
  };

  const resultsDir = join(repo, 'bench', 'results');
  if (!existsSync(resultsDir)) mkdirSync(resultsDir, { recursive: true });

  // ':' and '.' are replaced so the timestamp is usable in a file name.
  const ts = report.meta.timestamp.replace(/[:.]/g, '-');
  // An empty `--out=` falls back to the default path rather than yielding ''.
  const jsonPath = outArg ? outArg : join(resultsDir, `bench-${ts}.json`);
  // Without this check an --out value lacking ".json" makes both paths equal
  // and the Markdown write below silently replaces the JSON report.
  const mdPath = /\.json$/.test(jsonPath) ? jsonPath.replace(/\.json$/, '.md') : `${jsonPath}.md`;

  const summary = summaryMarkdown(report);
  writeFileSync(jsonPath, JSON.stringify(report, null, 2), 'utf8');
  writeFileSync(mdPath, summary, 'utf8');

  // Console summary
  console.log(summary);
  console.log('');
  console.log(`Wrote ${jsonPath}`);
  console.log(`Wrote ${mdPath}`);

  return 0;
}

// Run the benchmark and turn the outcome into the process exit status.
main().then(
  (code) => process.exit(code),
  (err) => {
    console.error('Benchmark failed:', err);
    process.exit(1);
  },
);
|
package/bench/runner-baseline.ts
DELETED
|
@@ -1,177 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Grep-style baseline. Simulates what an agent without structural memory
|
|
3
|
-
* would do: scan every source file for the query as a literal substring,
|
|
4
|
-
* return the matching files / function names.
|
|
5
|
-
*
|
|
6
|
-
* This UNDERSTATES the real baseline cost because:
|
|
7
|
-
* - A real agent reads context around each grep hit (we count just
|
|
8
|
-
* the matching files' raw bytes)
|
|
9
|
-
* - A real agent often grep+read multiple times before answering
|
|
10
|
-
*
|
|
11
|
-
* Even with that bias toward grep, GraphPilot should still win the
|
|
12
|
-
* structural tasks by a large margin on `outputBytes` (proxy for tokens
|
|
13
|
-
* the agent would have to read).
|
|
14
|
-
*/
|
|
15
|
-
|
|
16
|
-
import fg from 'fast-glob';
|
|
17
|
-
import { readFileSync, statSync } from 'node:fs';
|
|
18
|
-
import { relative, resolve } from 'node:path';
|
|
19
|
-
import type { Task } from './tasks.js';
|
|
20
|
-
|
|
21
|
-
/** Result of answering one benchmark task with the grep-style baseline. */
export interface RunResult {
  /** Names or file paths the baseline produced as its answer. */
  returned: string[];
  /** Total bytes the agent would have to read to answer this question via grep. */
  outputBytes: number;
  /** Wall-clock duration of the run, in milliseconds. */
  durationMs: number;
}

/** Glob patterns selecting the source files the baseline scans. */
const SOURCE_GLOB = ['ts', 'tsx', 'js', 'jsx'].map((ext) => `**/*.${ext}`);

/** Glob patterns excluded from the scan: build/dependency output and type declarations. */
const IGNORE = [
  ...['node_modules', 'dist', 'build', 'coverage', '.next', '.nuxt'].map((dir) => `**/${dir}/**`),
  '**/*.d.ts',
];

/** One cached source file, so tasks do not re-read the disk on every run. */
interface FileCache {
  /** Absolute path of the file. */
  path: string;
  /** Path relative to the repository root. */
  rel: string;
  /** File size in bytes. */
  bytes: number;
  /** File content split on '\n'. */
  lines: string[];
}
|
|
46
|
-
|
|
47
|
-
/**
 * Grep-style baseline: answers benchmark tasks by scanning cached source
 * files for literal substrings and pulling identifier names off matching
 * lines. The reported cost is the summed size of every file with a hit.
 */
export class BaselineRunner {
  /** Absolute path of the repository root being scanned. */
  readonly absRoot: string;
  /** Every source file under the root, read once in the constructor. */
  private readonly files: FileCache[];

  /**
   * Read every file matched by `SOURCE_GLOB` (minus `IGNORE`) under
   * `repoRoot` into memory.
   *
   * @param repoRoot - Repository root; resolved to an absolute path.
   */
  constructor(repoRoot: string) {
    this.absRoot = resolve(repoRoot);
    const filePaths = fg.sync(SOURCE_GLOB, {
      cwd: this.absRoot,
      absolute: true,
      ignore: IGNORE,
      onlyFiles: true,
    });
    // path.relative() produces backslash separators on Windows; normalise to
    // '/' there so the `src/` prefix check and the returned file lists use
    // the same form on every platform.
    const onWindows = process.platform === 'win32';
    this.files = filePaths.map((p) => {
      const text = readFileSync(p, 'utf8');
      const rel = relative(this.absRoot, p);
      return {
        path: p,
        rel: onWindows ? rel.replace(/\\/g, '/') : rel,
        bytes: statSync(p).size,
        lines: text.split('\n'),
      };
    });
  }

  /**
   * Scan all cached files for `query` as a literal, case-sensitive substring.
   *
   * @param query - Text to search for.
   * @returns `matchedFiles`: relative paths of files with at least one hit;
   *   `bytesRead`: summed size of those files; `suspectedNames`: identifiers
   *   found on the matching lines after `function`/`class`/`interface`, or
   *   before `:`/`=` followed by `function` or `(` — a rough proxy for what a
   *   grep-and-eyeball pass would conclude.
   */
  private grepScan(query: string): {
    matchedFiles: Set<string>;
    bytesRead: number;
    suspectedNames: Set<string>;
  } {
    // Built once per scan rather than once per matching line.
    const declarationRe = /(?:function|class|interface)\s+([A-Za-z_$][\w$]*)/;
    const assignmentRe = /\b([A-Za-z_$][\w$]*)\s*[:=]\s*(?:async\s+)?(?:function|\()/;

    const matchedFiles = new Set<string>();
    const suspectedNames = new Set<string>();
    let bytesRead = 0;

    for (const f of this.files) {
      let hitInFile = false;
      for (const line of f.lines) {
        if (!line.includes(query)) continue;
        hitInFile = true;
        const fnMatch = line.match(declarationRe) || line.match(assignmentRe);
        if (fnMatch) suspectedNames.add(fnMatch[1]);
      }
      if (hitInFile) {
        matchedFiles.add(f.rel);
        bytesRead += f.bytes;
      }
    }

    return { matchedFiles, bytesRead, suspectedNames };
  }

  /**
   * Answer one benchmark task the way a grep-only agent would.
   *
   * @param task - Task to run; `task.kind` selects the strategy and
   *   `task.query` is the search text.
   * @returns The sorted answer, the bytes of every file that had a hit (the
   *   baseline's cost), and the elapsed wall-clock time in milliseconds.
   */
  run(task: Task): RunResult {
    const start = Date.now();
    let returned: string[] = [];
    let bytesRead = 0;

    switch (task.kind) {
      case 'callers':
      case 'impact':
      case 'recall':
      case 'recall-substring':
      case 'recall-miss': {
        // Grep for the query, then collect identifier names near each hit.
        const { bytesRead: br, suspectedNames } = this.grepScan(task.query);
        bytesRead = br;
        // The declaration line of the queried symbol yields the query itself
        // as a suspected name; that is not a caller, so drop it.
        suspectedNames.delete(task.query);
        returned = [...suspectedNames].sort();
        break;
      }

      case 'kind-filter': {
        // The query is a declaration keyword such as "interface": match the
        // keyword followed by whitespace and capture the identifier after it.
        // The query is escaped so it is always matched literally, even if it
        // contains regex metacharacters.
        const keyword = task.query.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        const re = new RegExp(`\\b${keyword}\\s+([A-Za-z_$][\\w$]*)`, 'g');
        const names = new Set<string>();
        for (const f of this.files) {
          if (!f.rel.startsWith('src/')) continue;
          let matched = false;
          for (const m of f.lines.join('\n').matchAll(re)) {
            names.add(m[1]);
            matched = true;
          }
          if (matched) bytesRead += f.bytes;
        }
        returned = [...names].sort();
        break;
      }

      case 'tests-affected': {
        // Grep for the symbol and keep only the test/spec files containing it.
        const { bytesRead: br, matchedFiles } = this.grepScan(task.query);
        bytesRead = br;
        returned = [...matchedFiles].filter((f) => /\.(test|spec)\.[jt]sx?$/.test(f)).sort();
        break;
      }

      case 'string-literal': {
        // Grep-native task: the matching files are the answer.
        const { bytesRead: br, matchedFiles } = this.grepScan(task.query);
        bytesRead = br;
        returned = [...matchedFiles].sort();
        break;
      }
    }

    return {
      returned,
      outputBytes: bytesRead, // baseline cost = bytes the agent would read
      durationMs: Date.now() - start,
    };
  }
}
|
package/bench/runner-graphpilot.ts
DELETED
|
@@ -1,131 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Run each benchmark task using GraphPilot's primitives directly.
|
|
3
|
-
* Measures correctness against task.groundTruth + bytes of output the
|
|
4
|
-
* tool returned (proxy for the token cost an agent would pay).
|
|
5
|
-
*/
|
|
6
|
-
|
|
7
|
-
import { loadGraph } from '../src/storage.js';
|
|
8
|
-
import { GraphIndex } from '../src/query.js';
|
|
9
|
-
import { analyzeImpact } from '../src/impact.js';
|
|
10
|
-
import type { Task } from './tasks.js';
|
|
11
|
-
|
|
12
|
-
/** Result of answering one benchmark task with GraphPilot's primitives. */
export interface RunResult {
  /** Strings the tool returned. For caller/impact tasks, caller names. */
  returned: string[];
  /** Size in bytes of the tool's structured output when serialised as JSON. */
  outputBytes: number;
  /** Wall-clock duration of the tool call, in milliseconds. */
  durationMs: number;
}
|
|
20
|
-
|
|
21
|
-
/**
 * Answers benchmark tasks by querying a previously built GraphPilot graph
 * directly, without going through the MCP surface.
 */
export class GraphpilotRunner {
  /** Query index over the graph loaded for the repository. */
  private readonly idx: GraphIndex;

  /**
   * @param repoRoot - Repository whose stored graph should be loaded.
   * @throws Error when `loadGraph` finds no graph for `repoRoot`.
   */
  constructor(repoRoot: string) {
    const g = loadGraph(repoRoot);
    if (!g) {
      throw new Error(`No graph found at ${repoRoot}. Run \`graphpilot index\` first.`);
    }
    this.idx = new GraphIndex(g);
  }

  /**
   * Answer one benchmark task from the graph.
   *
   * @param task - Task to run; `task.kind` selects the query and
   *   `task.query` is the symbol name, kind, or text to look up.
   * @returns The sorted answer, the byte length of that answer serialised as
   *   JSON, and the elapsed wall-clock time in milliseconds. A symbol that
   *   cannot be resolved, or a task kind this runner does not recognise,
   *   yields an empty answer.
   */
  run(task: Task): RunResult {
    const start = Date.now();
    // Initialised (as BaselineRunner does) so that a kind not handled below
    // produces an empty answer instead of passing `undefined` to
    // Buffer.byteLength, which throws.
    let returned: string[] = [];

    switch (task.kind) {
      case 'callers': {
        const target = this.idx.resolveSymbol(task.query);
        if (!target) break;
        const names = new Set<string>();
        for (const e of this.idx.callers(target.id)) {
          const from = this.idx.findById(e.fromId);
          if (from) names.add(from.name);
        }
        returned = [...names].sort();
        break;
      }

      case 'recall': {
        const matches = this.idx.findByName(task.query, { limit: 50 });
        returned = matches.map((s) => s.name).sort();
        break;
      }

      case 'recall-substring': {
        const matches = this.idx.findByName(task.query, {
          substring: true,
          limit: 100,
        });
        returned = matches.map((s) => s.name).sort();
        break;
      }

      case 'kind-filter': {
        // Filter the full symbol table by kind. The MCP surface doesn't
        // expose this as a tool today (would be a v0.2 gp_list_by_kind)
        // but the data is in GraphIndex.graph.symbols.
        returned = this.idx.graph.symbols
          .filter((s) => s.kind === task.query && s.file.startsWith('src/'))
          .map((s) => s.name)
          .sort();
        break;
      }

      case 'impact': {
        const report = analyzeImpact(this.idx, task.query, { depth: 2 });
        if (!report) break;
        // Direct and transitive callers are merged into one de-duplicated set.
        const names = new Set<string>();
        for (const c of report.directCallers) names.add(c.symbol.name);
        for (const c of report.transitiveCallers) names.add(c.symbol.name);
        returned = [...names].sort();
        break;
      }

      case 'tests-affected': {
        const report = analyzeImpact(this.idx, task.query, { depth: 3 });
        if (!report) break;
        // The answer is the set of files holding the affected test symbols.
        const files = new Set<string>();
        for (const c of report.testsAffected) files.add(c.symbol.file);
        returned = [...files].sort();
        break;
      }

      case 'recall-miss': {
        const matches = this.idx.findByName(task.query, { limit: 10 });
        returned = matches.map((s) => s.name).sort();
        break;
      }

      case 'string-literal': {
        // GraphPilot intentionally doesn't index string literals or
        // identifier usages outside structural contexts. Best effort:
        // return any file where a symbol whose NAME matches the query
        // is defined. This will miss most usages (the honest "grep wins"
        // baseline).
        const decl = this.idx.findByName(task.query, { limit: 5 });
        const files = new Set<string>();
        for (const s of decl) files.add(s.file);
        returned = [...files].sort();
        break;
      }

      default:
        // Unrecognised kind: keep the empty answer.
        break;
    }

    const outputBytes = Buffer.byteLength(JSON.stringify(returned), 'utf8');
    return {
      returned,
      outputBytes,
      durationMs: Date.now() - start,
    };
  }
}
|
package/bench/score-agent-tier.ts
DELETED
|
@@ -1,191 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Tier-B Agent Benchmark Scorer
|
|
3
|
-
*
|
|
4
|
-
* Parses Claude Code session transcripts (.jsonl) and produces the per-task
|
|
5
|
-
* metrics table for the agent-eval benchmark.
|
|
6
|
-
*
|
|
7
|
-
* Usage:
|
|
8
|
-
* npx tsx bench/score-agent-tier.ts \
|
|
9
|
-
* --baseline <path-to-baseline.jsonl> \
|
|
10
|
-
* --graphpilot <path-to-graphpilot.jsonl> \
|
|
11
|
-
* --output bench/results/agent-tier-2026-05-22.md
|
|
12
|
-
*
|
|
13
|
-
* Input: Claude Code session jsonl (one JSON message per line)
|
|
14
|
-
* Output: Markdown table with per-task metrics + aggregate stats
|
|
15
|
-
*/
|
|
16
|
-
|
|
17
|
-
import * as fs from 'node:fs';
|
|
18
|
-
import * as path from 'node:path';
|
|
19
|
-
import { createReadStream } from 'node:fs';
|
|
20
|
-
import { createInterface } from 'node:readline';
|
|
21
|
-
|
|
22
|
-
/** One parsed line of a session transcript (.jsonl). */
interface Message {
  /** Entry type tag. */
  type: string;
  /** Speaker role, when the entry carries one. */
  role?: string;
  /** Text content, when the entry carries one. */
  content?: string;
  /** Any other field found on the transcript line (e.g. `usage`). */
  [key: string]: any;
}

/** Per-task metrics row of the agent-tier benchmark table. */
interface TaskResult {
  /** Task identifier. */
  id: string;
  /** Prompt given to the agent. */
  prompt: string;
  // Scorer fills in:
  /** 1 when the baseline agent solved the task, otherwise 0. */
  taskSuccessBaseline: 0 | 1;
  /** 1 when the GraphPilot-assisted agent solved the task, otherwise 0. */
  taskSuccessGraphPilot: 0 | 1;
  /** Hallucinations counted in the baseline transcript. */
  hallucCountBaseline: number;
  /** Hallucinations counted in the GraphPilot transcript. */
  hallucCountGraphPilot: number;
  /** Tokens consumed by the baseline run. */
  tokenCostBaseline: number;
  /** Tokens consumed by the GraphPilot run. */
  tokenCostGraphPilot: number;
  /** 0..1, only for GP. */
  anchorResolutionRate: number;
  /** 0..1, only for GP (impact-since tasks). */
  diffNoiseRatio: number;
}
|
|
42
|
-
|
|
43
|
-
/**
 * Read a JSON-Lines file into memory, one parsed object per non-blank line.
 *
 * @param filePath - Path of the .jsonl file.
 * @returns The parsed entries in file order; blank lines are skipped.
 * @throws SyntaxError naming the file and 1-based line number when a line is
 *   not valid JSON, so a corrupt transcript can be located without bisecting.
 */
async function readJsonl(filePath: string): Promise<Message[]> {
  const messages: Message[] = [];
  const rl = createInterface({
    input: createReadStream(filePath),
    crlfDelay: Infinity, // treat \r\n as a single line break
  });

  let lineNo = 0;
  try {
    for await (const line of rl) {
      lineNo += 1;
      if (!line.trim()) continue;
      try {
        messages.push(JSON.parse(line));
      } catch (err: unknown) {
        const reason = err instanceof Error ? err.message : String(err);
        throw new SyntaxError(`${filePath}:${lineNo}: invalid JSON (${reason})`);
      }
    }
  } finally {
    // Release the reader (and its stream) even when parsing aborts early.
    rl.close();
  }
  return messages;
}
|
|
57
|
-
|
|
58
|
-
/**
 * Sum input and output tokens over every message that has a `usage` object.
 *
 * Messages are loosely typed transcript entries, so each counter is included
 * only when it is a finite number. A string such as "12" would otherwise turn
 * the running total into a concatenated string, and NaN would poison it.
 *
 * NOTE(review): only a top-level `usage` field is read; confirm against the
 * transcript format whether usage can also be nested under another key.
 *
 * @param msgs - Parsed transcript entries.
 * @returns Total of `usage.input_tokens + usage.output_tokens`; 0 when no
 *   message reports usage.
 */
function extractTokenUsage(msgs: Message[]): number {
  const asCount = (value: unknown): number =>
    typeof value === 'number' && Number.isFinite(value) ? value : 0;

  let total = 0;
  for (const msg of msgs) {
    const usage = msg.usage;
    if (!usage) continue;
    total += asCount(usage.input_tokens) + asCount(usage.output_tokens);
  }
  return total;
}
|
|
67
|
-
|
|
68
|
-
/**
 * Collect `file:line` anchors, optionally followed by `@ <sha>`, from
 * transcript text.
 *
 * Example of a full anchor: `src/foo.ts:42 @ ab12cd3`. Only paths ending in
 * .ts, .tsx, .js or .jsx are recognised, and the sha must be seven lowercase
 * hex characters.
 *
 * @param text - Transcript text to scan.
 * @returns Anchors in order of appearance; `sha` is '' when the anchor has
 *   no `@ <sha>` suffix.
 */
function extractAnchors(text: string): Array<{ file: string; line: number; sha: string }> {
  const anchorPattern = /(\S+\.(?:ts|tsx|js|jsx)):(\d+)(?:\s+@\s+([0-9a-f]{7}))?/g;
  return Array.from(text.matchAll(anchorPattern), (hit) => {
    const [, file, lineText, sha] = hit;
    return {
      file,
      line: parseInt(lineText, 10),
      sha: sha ?? '',
    };
  });
}
|
|
85
|
-
|
|
86
|
-
/**
 * Stub scorer (human-in-the-loop placeholder).
 *
 * Real scoring requires a person to:
 *   1. open the transcript,
 *   2. read the agent's final answer,
 *   3. compare it to the ground truth (from tasks.ts),
 *   4. record success and the hallucination count.
 *
 * Neither transcript is inspected yet: the function returns a zero-filled
 * template that shows which metrics have to be collected.
 *
 * @param taskId - Identifier copied into the result's `id`.
 * @param baselineTranscript - Baseline transcript (currently unused).
 * @param graphpilotTranscript - GraphPilot transcript (currently unused).
 * @returns A placeholder result with every metric set to 0.
 */
async function scoreTask(
  taskId: string,
  baselineTranscript: string,
  graphpilotTranscript: string,
): Promise<TaskResult> {
  // TODO: Implement human-in-the-loop or CSV reader
  // For MVP, return a stub result
  const placeholder: TaskResult = {
    id: taskId,
    prompt: '(placeholder)',
    taskSuccessBaseline: 0,
    taskSuccessGraphPilot: 0,
    hallucCountBaseline: 0,
    hallucCountGraphPilot: 0,
    tokenCostBaseline: 0,
    tokenCostGraphPilot: 0,
    anchorResolutionRate: 0,
    diffNoiseRatio: 0,
  };
  return placeholder;
}
|
|
118
|
-
|
|
119
|
-
/**
 * Format results as a Markdown table suitable for a README, followed by a
 * summary of totals.
 *
 * Ratios with a zero denominator are printed as "n/a" instead of "NaN%":
 * that covers an empty result list (average anchor rate) and a baseline
 * token total of 0 (token-cost change). The token-cost change is shown as
 * "−N%" when GraphPilot used no more tokens than the baseline and as "+N%"
 * when it used more.
 *
 * @param results - One entry per scored task; may be empty.
 * @returns The Markdown document, ending with a newline.
 */
function formatResultsTable(results: TaskResult[]): string {
  const pct = (ratio: number): string => `${(ratio * 100).toFixed(0)}%`;
  const sum = (pick: (r: TaskResult) => number): number =>
    results.reduce((acc, r) => acc + pick(r), 0);

  const rows = results.map(
    (r) =>
      `| ${r.id} | ${r.taskSuccessBaseline} | ${r.taskSuccessGraphPilot} | ${r.hallucCountBaseline} | ${r.hallucCountGraphPilot} | ${r.tokenCostBaseline} | ${r.tokenCostGraphPilot} | ${pct(r.anchorResolutionRate)} | ${pct(r.diffNoiseRatio)} |`,
  );

  // Aggregate stats
  const totalSuccessB = sum((r) => r.taskSuccessBaseline);
  const totalSuccessGP = sum((r) => r.taskSuccessGraphPilot);
  const totalHallucB = sum((r) => r.hallucCountBaseline);
  const totalHallucGP = sum((r) => r.hallucCountGraphPilot);
  const totalTokensB = sum((r) => r.tokenCostBaseline);
  const totalTokensGP = sum((r) => r.tokenCostGraphPilot);

  const anchorText =
    results.length > 0 ? pct(sum((r) => r.anchorResolutionRate) / results.length) : 'n/a';

  let tokenDelta = 'n/a';
  if (totalTokensB > 0) {
    const reduction = (1 - totalTokensGP / totalTokensB) * 100;
    tokenDelta =
      reduction >= 0 ? `−${reduction.toFixed(0)}%` : `+${(-reduction).toFixed(0)}%`;
  }

  const lines = [
    '# Tier-B Agent Benchmark Results',
    '',
    '| Task | Baseline Success | GP Success | Halluc (B) | Halluc (GP) | Tokens (B) | Tokens (GP) | Anchor % | Diff Noise |',
    '|---|---|---|---|---|---|---|---|---|',
    ...rows,
    '',
    '## Summary',
    '',
    `- **Baseline success rate:** ${totalSuccessB}/${results.length}`,
    `- **GraphPilot success rate:** ${totalSuccessGP}/${results.length}`,
    `- **Hallucinations (Baseline):** ${totalHallucB}`,
    `- **Hallucinations (GraphPilot):** ${totalHallucGP}`,
    `- **Token cost (Baseline):** ${totalTokensB}`,
    `- **Token cost (GraphPilot):** ${totalTokensGP} (${tokenDelta})`,
    `- **Anchor resolution rate:** ${anchorText}`,
  ];

  return lines.join('\n') + '\n';
}
|
|
151
|
-
|
|
152
|
-
/**
 * Look up a command-line flag given either as `--name=value` or as
 * `--name value`.
 *
 * The inline form keeps everything after the first '=' (so values that
 * contain '=' survive), and the separated form is used only when the flag is
 * actually present and the next argument is not itself a flag.
 *
 * @param args - Arguments after the script name.
 * @param name - Flag name without the leading dashes.
 * @returns The flag's value, or undefined when absent or empty.
 */
function readFlag(args: readonly string[], name: string): string | undefined {
  const prefix = `--${name}=`;
  const inline = args.find((a) => a.startsWith(prefix));
  const inlineValue = inline?.slice(prefix.length);
  if (inlineValue) return inlineValue;

  const at = args.indexOf(`--${name}`);
  if (at < 0) return undefined;
  const next = args[at + 1];
  return next !== undefined && !next.startsWith('--') ? next : undefined;
}

/**
 * CLI entry point for the Tier-B scorer.
 *
 * Loads the baseline and GraphPilot transcripts, reports how many messages
 * each contains, prints instructions for preparing the human-scored CSV, and
 * writes the (currently empty) results table to the output path.
 *
 * Requires `--baseline`, `--graphpilot` and `--output`; prints usage and
 * exits with status 1 when any of them is missing.
 */
async function main() {
  const args = process.argv.slice(2);
  const baselineArg = readFlag(args, 'baseline');
  const graphpilotArg = readFlag(args, 'graphpilot');
  const outputArg = readFlag(args, 'output');

  if (!baselineArg || !graphpilotArg || !outputArg) {
    console.error(
      'Usage: npx tsx bench/score-agent-tier.ts --baseline <path> --graphpilot <path> --output <path>',
    );
    process.exit(1);
  }

  console.log('Loading transcripts...');
  const baselineData = await readJsonl(baselineArg);
  const graphpilotData = await readJsonl(graphpilotArg);

  console.log(`Baseline: ${baselineData.length} messages`);
  console.log(`GraphPilot: ${graphpilotData.length} messages`);

  // TODO: Parse transcripts by task boundary, invoke scorer for each pair
  console.log('\n[Stub] Scoring requires human review. Prepare scoring input CSV with columns:');
  console.log('  task_id, baseline_success (0/1), gp_success (0/1), baseline_halluc, gp_halluc');
  console.log('Then run: npx tsx bench/score-agent-tier.ts --scores <csv> --output <path>');

  const results: TaskResult[] = [];
  // Placeholder: results would be populated from CSV or human input

  const output = formatResultsTable(results);
  fs.writeFileSync(outputArg, output);
  console.log(`Results written to ${outputArg}`);
}

// Report the failure and make the process exit non-zero so callers and CI
// can detect it; logging alone would leave the exit status at 0.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|