@graphpilot-oss/graphpilot 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. package/CHANGELOG.md +73 -126
  2. package/README.md +359 -101
  3. package/dist/cli.js +20 -0
  4. package/dist/cli.js.map +1 -1
  5. package/dist/indexer.js +3 -3
  6. package/dist/indexer.js.map +1 -1
  7. package/dist/init.d.ts +28 -0
  8. package/dist/init.js +112 -0
  9. package/dist/init.js.map +1 -0
  10. package/dist/interactions.d.ts +5 -4
  11. package/dist/interactions.js +0 -0
  12. package/dist/interactions.js.map +1 -1
  13. package/dist/mcp.js +126 -46
  14. package/dist/mcp.js.map +1 -1
  15. package/dist/repo-resolve.d.ts +47 -0
  16. package/dist/repo-resolve.js +195 -0
  17. package/dist/repo-resolve.js.map +1 -0
  18. package/dist/storage.js +10 -1
  19. package/dist/storage.js.map +1 -1
  20. package/dist/validation.js +30 -4
  21. package/dist/validation.js.map +1 -1
  22. package/dist/watcher.d.ts +10 -0
  23. package/dist/watcher.js +70 -7
  24. package/dist/watcher.js.map +1 -1
  25. package/examples/README.md +105 -0
  26. package/examples/claude-code/README.md +125 -0
  27. package/examples/claude-code/claude-routing.md +102 -0
  28. package/examples/claude-code/claude_config.json +8 -0
  29. package/examples/cline/.clinerules +39 -0
  30. package/examples/cline/README.md +104 -0
  31. package/examples/cline/cline_mcp_settings.json +10 -0
  32. package/examples/continue/.continuerules +39 -0
  33. package/examples/continue/README.md +98 -0
  34. package/examples/continue/config.json +13 -0
  35. package/examples/cursor/.cursorrules +39 -0
  36. package/examples/cursor/README.md +98 -0
  37. package/examples/cursor/mcp.json +11 -0
  38. package/examples/windsurf/.windsurfrules +39 -0
  39. package/examples/windsurf/README.md +85 -0
  40. package/examples/windsurf/mcp_config.json +8 -0
  41. package/package.json +12 -3
  42. package/.editorconfig +0 -15
  43. package/.github/CODEOWNERS +0 -22
  44. package/.github/FUNDING.yml +0 -1
  45. package/.github/ISSUE_TEMPLATE/bug_report.md +0 -33
  46. package/.github/ISSUE_TEMPLATE/config.yml +0 -5
  47. package/.github/ISSUE_TEMPLATE/feature_request.md +0 -23
  48. package/.github/PULL_REQUEST_TEMPLATE.md +0 -19
  49. package/.github/dependabot.yml +0 -15
  50. package/.github/workflows/ci.yml +0 -62
  51. package/.github/workflows/release.yml +0 -50
  52. package/.prettierignore +0 -19
  53. package/.prettierrc.json +0 -20
  54. package/CODE_OF_CONDUCT.md +0 -83
  55. package/CONTRIBUTING.md +0 -111
  56. package/bench/README.md +0 -544
  57. package/bench/results/agent-tier-2026-05-22.md +0 -28
  58. package/bench/results/agent-tier-summary.md +0 -44
  59. package/bench/results/baseline-tier-2026-05-22.md +0 -23
  60. package/bench/results/baseline.json +0 -810
  61. package/bench/results/baseline.md +0 -28
  62. package/bench/run-agent-tier-automated.ts +0 -234
  63. package/bench/run-agent-tier.md +0 -125
  64. package/bench/run-baseline-tier.ts +0 -200
  65. package/bench/run.ts +0 -210
  66. package/bench/runner-baseline.ts +0 -177
  67. package/bench/runner-graphpilot.ts +0 -131
  68. package/bench/score-agent-tier.ts +0 -191
  69. package/bench/score.ts +0 -59
  70. package/bench/tasks.ts +0 -236
  71. package/dist/provenance.d.ts +0 -74
  72. package/dist/provenance.js +0 -95
  73. package/dist/provenance.js.map +0 -1
  74. package/docs/architecture.md +0 -311
  75. package/docs/limitations.md +0 -156
  76. package/docs/mcp-setup.md +0 -231
  77. package/docs/quickstart.md +0 -202
  78. package/eslint.config.js +0 -148
  79. package/lefthook.yml +0 -81
  80. package/pnpm-workspace.yaml +0 -6
  81. package/scripts/smoke-stdio.mjs +0 -97
  82. package/src/cli.ts +0 -171
  83. package/src/edges.ts +0 -202
  84. package/src/git.ts +0 -255
  85. package/src/graph-schema.ts +0 -229
  86. package/src/impact.ts +0 -218
  87. package/src/indexer.ts +0 -152
  88. package/src/interactions.ts +0 -0
  89. package/src/mcp.ts +0 -652
  90. package/src/parser.ts +0 -138
  91. package/src/provenance.ts +0 -115
  92. package/src/query.ts +0 -148
  93. package/src/redact.ts +0 -122
  94. package/src/storage.ts +0 -115
  95. package/src/symbols.ts +0 -173
  96. package/src/validation.ts +0 -69
  97. package/src/validators.ts +0 -253
  98. package/src/watcher.ts +0 -383
  99. package/tests/edges.test.ts +0 -175
  100. package/tests/fixtures/sample.ts +0 -32
  101. package/tests/git.test.ts +0 -303
  102. package/tests/graph-schema.test.ts +0 -321
  103. package/tests/impact.test.ts +0 -454
  104. package/tests/interactions.test.ts +0 -180
  105. package/tests/lint-policy.test.ts +0 -106
  106. package/tests/mcp-stdio.test.ts +0 -171
  107. package/tests/mcp.test.ts +0 -335
  108. package/tests/parser.test.ts +0 -31
  109. package/tests/provenance.test.ts +0 -132
  110. package/tests/query.test.ts +0 -160
  111. package/tests/redact.test.ts +0 -167
  112. package/tests/security.test.ts +0 -144
  113. package/tests/symbols.test.ts +0 -78
  114. package/tests/validators.test.ts +0 -193
  115. package/tests/watcher.test.ts +0 -250
  116. package/tsconfig.json +0 -18
package/bench/run.ts DELETED
@@ -1,210 +0,0 @@
1
- /**
2
- * Tier-A benchmark runner. Runs each task in TASKS in two conditions
3
- * (graphpilot, baseline grep), scores both, writes a JSON result file
4
- * plus a markdown summary to bench/results/.
5
- *
6
- * Usage:
7
- * pnpm bench [--repo=<path>] [--out=<file>]
8
- *
9
- * Defaults to running against the graphpilot repo itself
10
- * (process.cwd()), which is the self-test corpus.
11
- */
12
-
13
- import { mkdirSync, writeFileSync, readFileSync, existsSync } from 'node:fs';
14
- import { join, resolve } from 'node:path';
15
- import { TASKS, type Task } from './tasks.js';
16
- import { GraphpilotRunner, type RunResult } from './runner-graphpilot.js';
17
- import { BaselineRunner } from './runner-baseline.js';
18
- import { score, type Scored } from './score.js';
19
-
20
- interface PerTaskResult {
21
- task: Task;
22
- graphpilot: { run: RunResult; score: Scored };
23
- baseline: { run: RunResult; score: Scored };
24
- winner: 'graphpilot' | 'grep' | 'tie';
25
- /** Did the winner match expectedWinner? Diagnostic. */
26
- expectedMatch: boolean;
27
- }
28
-
29
- interface AggregateMetrics {
30
- totalTasks: number;
31
- graphpilotF1Sum: number;
32
- baselineF1Sum: number;
33
- graphpilotBytesTotal: number;
34
- baselineBytesTotal: number;
35
- graphpilotWins: number;
36
- baselineWins: number;
37
- ties: number;
38
- expectedWinnerHits: number;
39
- }
40
-
41
- interface BenchmarkReport {
42
- meta: {
43
- corpus: string;
44
- timestamp: string;
45
- graphpilotVersion: string;
46
- nodeVersion: string;
47
- platform: NodeJS.Platform;
48
- };
49
- aggregate: AggregateMetrics;
50
- perTask: PerTaskResult[];
51
- }
52
-
53
- function pickWinner(gp: Scored, bl: Scored): 'graphpilot' | 'grep' | 'tie' {
54
- const epsilon = 0.001;
55
- if (Math.abs(gp.f1 - bl.f1) < epsilon) return 'tie';
56
- return gp.f1 > bl.f1 ? 'graphpilot' : 'grep';
57
- }
58
-
59
- function fmt(n: number, dp = 2): string {
60
- return Number.isFinite(n) ? n.toFixed(dp) : '?';
61
- }
62
-
63
- function fmtBytes(n: number): string {
64
- if (n < 1024) return `${n}B`;
65
- if (n < 1024 * 1024) return `${(n / 1024).toFixed(1)}KB`;
66
- return `${(n / 1024 / 1024).toFixed(1)}MB`;
67
- }
68
-
69
- function summaryMarkdown(report: BenchmarkReport): string {
70
- const a = report.aggregate;
71
- const lines: string[] = [];
72
- lines.push(`# GraphPilot Benchmark — ${report.meta.timestamp}`);
73
- lines.push('');
74
- lines.push(`Corpus: \`${report.meta.corpus}\``);
75
- lines.push(`graphpilot v${report.meta.graphpilotVersion}`);
76
- lines.push(`Node ${report.meta.nodeVersion} on ${report.meta.platform}`);
77
- lines.push('');
78
- lines.push('## Aggregate');
79
- lines.push('');
80
- lines.push(`- Tasks run: **${a.totalTasks}**`);
81
- lines.push(
82
- `- F1 (avg): graphpilot **${fmt(a.graphpilotF1Sum / a.totalTasks)}** ` +
83
- `vs grep **${fmt(a.baselineF1Sum / a.totalTasks)}**`,
84
- );
85
- lines.push(
86
- `- Bytes processed (total): graphpilot **${fmtBytes(a.graphpilotBytesTotal)}** ` +
87
- `vs grep **${fmtBytes(a.baselineBytesTotal)}**` +
88
- ` (${fmt((1 - a.graphpilotBytesTotal / a.baselineBytesTotal) * 100, 1)}% reduction)`,
89
- );
90
- lines.push(
91
- `- Winner counts: graphpilot **${a.graphpilotWins}** · grep **${a.baselineWins}** · tie **${a.ties}**`,
92
- );
93
- lines.push(
94
- `- Expected-winner accuracy: **${a.expectedWinnerHits}/${a.totalTasks}** ` +
95
- `(${fmt((a.expectedWinnerHits / a.totalTasks) * 100, 0)}%)`,
96
- );
97
- lines.push('');
98
- lines.push('## Per-task');
99
- lines.push('');
100
- lines.push('| # | Task | GP F1 | Grep F1 | GP bytes | Grep bytes | Winner | Expected |');
101
- lines.push('|---|---|---|---|---|---|---|---|');
102
- for (const t of report.perTask) {
103
- const match = t.expectedMatch ? '✓' : '✗';
104
- lines.push(
105
- `| ${t.task.id} | ${t.task.description} ` +
106
- `| ${fmt(t.graphpilot.score.f1)} ` +
107
- `| ${fmt(t.baseline.score.f1)} ` +
108
- `| ${fmtBytes(t.graphpilot.run.outputBytes)} ` +
109
- `| ${fmtBytes(t.baseline.run.outputBytes)} ` +
110
- `| ${t.winner} ` +
111
- `| ${t.task.expectedWinner} ${match} |`,
112
- );
113
- }
114
- return lines.join('\n');
115
- }
116
-
117
- async function main(): Promise<number> {
118
- const args = process.argv.slice(2);
119
- let repoArg: string | undefined;
120
- let outArg: string | undefined;
121
- for (const a of args) {
122
- if (a.startsWith('--repo=')) repoArg = a.slice('--repo='.length);
123
- else if (a.startsWith('--out=')) outArg = a.slice('--out='.length);
124
- }
125
-
126
- const repo = resolve(repoArg ?? process.cwd());
127
- const gp = new GraphpilotRunner(repo);
128
- const baseline = new BaselineRunner(repo);
129
-
130
- const perTask: PerTaskResult[] = [];
131
- const agg: AggregateMetrics = {
132
- totalTasks: TASKS.length,
133
- graphpilotF1Sum: 0,
134
- baselineF1Sum: 0,
135
- graphpilotBytesTotal: 0,
136
- baselineBytesTotal: 0,
137
- graphpilotWins: 0,
138
- baselineWins: 0,
139
- ties: 0,
140
- expectedWinnerHits: 0,
141
- };
142
-
143
- for (const task of TASKS) {
144
- const gpRun = gp.run(task);
145
- const blRun = baseline.run(task);
146
-
147
- const gpScore = score(gpRun.returned, task.groundTruth);
148
- const blScore = score(blRun.returned, task.groundTruth);
149
- const winner = pickWinner(gpScore, blScore);
150
- const expectedMatch = winner === task.expectedWinner;
151
-
152
- perTask.push({
153
- task,
154
- graphpilot: { run: gpRun, score: gpScore },
155
- baseline: { run: blRun, score: blScore },
156
- winner,
157
- expectedMatch,
158
- });
159
-
160
- agg.graphpilotF1Sum += gpScore.f1;
161
- agg.baselineF1Sum += blScore.f1;
162
- agg.graphpilotBytesTotal += gpRun.outputBytes;
163
- agg.baselineBytesTotal += blRun.outputBytes;
164
- if (winner === 'graphpilot') agg.graphpilotWins++;
165
- else if (winner === 'grep') agg.baselineWins++;
166
- else agg.ties++;
167
- if (expectedMatch) agg.expectedWinnerHits++;
168
- }
169
-
170
- // Pull package.json version for the meta block.
171
- const pkgVersion = JSON.parse(readFileSync(join(repo, 'package.json'), 'utf8')).version as string;
172
-
173
- const report: BenchmarkReport = {
174
- meta: {
175
- corpus: repo,
176
- timestamp: new Date().toISOString(),
177
- graphpilotVersion: pkgVersion,
178
- nodeVersion: process.version,
179
- platform: process.platform,
180
- },
181
- aggregate: agg,
182
- perTask,
183
- };
184
-
185
- const resultsDir = join(repo, 'bench', 'results');
186
- if (!existsSync(resultsDir)) mkdirSync(resultsDir, { recursive: true });
187
-
188
- const ts = report.meta.timestamp.replace(/[:.]/g, '-');
189
- const jsonPath = outArg ?? join(resultsDir, `bench-${ts}.json`);
190
- const mdPath = jsonPath.replace(/\.json$/, '.md');
191
-
192
- writeFileSync(jsonPath, JSON.stringify(report, null, 2), 'utf8');
193
- writeFileSync(mdPath, summaryMarkdown(report), 'utf8');
194
-
195
- // Console summary
196
- console.log(summaryMarkdown(report));
197
- console.log('');
198
- console.log(`Wrote ${jsonPath}`);
199
- console.log(`Wrote ${mdPath}`);
200
-
201
- return 0;
202
- }
203
-
204
- main().then(
205
- (code) => process.exit(code),
206
- (err) => {
207
- console.error('Benchmark failed:', err);
208
- process.exit(1);
209
- },
210
- );
package/bench/runner-baseline.ts DELETED
@@ -1,177 +0,0 @@
1
- /**
2
- * Grep-style baseline. Simulates what an agent without structural memory
3
- * would do: scan every source file for the query as a literal substring,
4
- * return the matching files / function names.
5
- *
6
- * This UNDERSTATES the real baseline cost because:
7
- * - A real agent reads context around each grep hit (we count just
8
- * the matching files' raw bytes)
9
- * - A real agent often grep+read multiple times before answering
10
- *
11
- * Even with that bias toward grep, GraphPilot should still win the
12
- * structural tasks by a large margin on `outputBytes` (proxy for tokens
13
- * the agent would have to read).
14
- */
15
-
16
- import fg from 'fast-glob';
17
- import { readFileSync, statSync } from 'node:fs';
18
- import { relative, resolve } from 'node:path';
19
- import type { Task } from './tasks.js';
20
-
21
- export interface RunResult {
22
- returned: string[];
23
- /** Total bytes the agent would have to read to answer this question via grep. */
24
- outputBytes: number;
25
- durationMs: number;
26
- }
27
-
28
- const SOURCE_GLOB = ['**/*.ts', '**/*.tsx', '**/*.js', '**/*.jsx'];
29
- const IGNORE = [
30
- '**/node_modules/**',
31
- '**/dist/**',
32
- '**/build/**',
33
- '**/coverage/**',
34
- '**/.next/**',
35
- '**/.nuxt/**',
36
- '**/*.d.ts',
37
- ];
38
-
39
- /** Cache: filePath -> bytes + lines. Avoids re-reading on every task. */
40
- interface FileCache {
41
- path: string;
42
- rel: string;
43
- bytes: number;
44
- lines: string[];
45
- }
46
-
47
- export class BaselineRunner {
48
- readonly absRoot: string;
49
- private readonly files: FileCache[];
50
-
51
- constructor(repoRoot: string) {
52
- this.absRoot = resolve(repoRoot);
53
- const filePaths = fg.sync(SOURCE_GLOB, {
54
- cwd: this.absRoot,
55
- absolute: true,
56
- ignore: IGNORE,
57
- onlyFiles: true,
58
- });
59
- this.files = filePaths.map((p) => {
60
- const text = readFileSync(p, 'utf8');
61
- return {
62
- path: p,
63
- rel: relative(this.absRoot, p),
64
- bytes: statSync(p).size,
65
- lines: text.split('\n'),
66
- };
67
- });
68
- }
69
-
70
- /**
71
- * Scan all source files for `query` as a literal substring. Returns
72
- * (a) the set of matching files (for string-literal/tests-affected
73
- * tasks) and (b) function-like identifier names that appear adjacent
74
- * to `function`/`class`/`interface`/`const` keywords (a rough proxy
75
- * for "what an agent would conclude").
76
- */
77
- private grepScan(query: string): {
78
- matchedFiles: Set<string>;
79
- bytesRead: number;
80
- suspectedNames: Set<string>;
81
- } {
82
- const matchedFiles = new Set<string>();
83
- const suspectedNames = new Set<string>();
84
- let bytesRead = 0;
85
-
86
- for (const f of this.files) {
87
- let hitInFile = false;
88
- for (const line of f.lines) {
89
- if (!line.includes(query)) continue;
90
- hitInFile = true;
91
- // Heuristic: pull a likely caller name out of "function X(", "method X(",
92
- // "const X =", "X(" near the beginning, etc. Rough but it's what a
93
- // human-style grep+eyeball would produce.
94
- const fnMatch =
95
- line.match(/(?:function|class|interface)\s+([A-Za-z_$][\w$]*)/) ||
96
- line.match(/\b([A-Za-z_$][\w$]*)\s*[:=]\s*(?:async\s+)?(?:function|\()/);
97
- if (fnMatch) suspectedNames.add(fnMatch[1]);
98
- }
99
- if (hitInFile) {
100
- matchedFiles.add(f.rel);
101
- bytesRead += f.bytes;
102
- }
103
- }
104
-
105
- return { matchedFiles, bytesRead, suspectedNames };
106
- }
107
-
108
- run(task: Task): RunResult {
109
- const start = Date.now();
110
- let returned: string[] = [];
111
- let bytesRead = 0;
112
-
113
- switch (task.kind) {
114
- case 'callers':
115
- case 'impact':
116
- case 'recall':
117
- case 'recall-substring':
118
- case 'recall-miss': {
119
- // Grep for the query string, then collect identifier names near
120
- // each match. Best-effort approximation of what an agent would
121
- // do without structural memory.
122
- const { bytesRead: br, suspectedNames } = this.grepScan(task.query);
123
- bytesRead = br;
124
- // Exclude the query itself if it appears as a suspected name
125
- // (the function declaration line will have the query as its own
126
- // name; that's not a caller).
127
- suspectedNames.delete(task.query);
128
- returned = [...suspectedNames].sort();
129
- break;
130
- }
131
-
132
- case 'kind-filter': {
133
- // The query is a TypeScript keyword like "interface". Grep for
134
- // "interface " (with trailing space) to filter declarations from
135
- // string literals, then extract the next identifier.
136
- const re = new RegExp(`\\b${task.query}\\s+([A-Za-z_$][\\w$]*)`, 'g');
137
- const names = new Set<string>();
138
- for (const f of this.files) {
139
- if (!f.rel.startsWith('src/')) continue;
140
- const text = f.lines.join('\n');
141
- let m: RegExpExecArray | null;
142
- let matched = false;
143
- while ((m = re.exec(text)) !== null) {
144
- names.add(m[1]);
145
- matched = true;
146
- }
147
- if (matched) bytesRead += f.bytes;
148
- }
149
- returned = [...names].sort();
150
- break;
151
- }
152
-
153
- case 'tests-affected': {
154
- // Without structural memory, you'd grep for the symbol and look
155
- // at which *.test.ts files contain it.
156
- const { bytesRead: br, matchedFiles } = this.grepScan(task.query);
157
- bytesRead = br;
158
- returned = [...matchedFiles].filter((f) => /\.(test|spec)\.[jt]sx?$/.test(f)).sort();
159
- break;
160
- }
161
-
162
- case 'string-literal': {
163
- // GREP-NATIVE: just return matching files
164
- const { bytesRead: br, matchedFiles } = this.grepScan(task.query);
165
- bytesRead = br;
166
- returned = [...matchedFiles].sort();
167
- break;
168
- }
169
- }
170
-
171
- return {
172
- returned,
173
- outputBytes: bytesRead, // baseline cost = bytes the agent would read
174
- durationMs: Date.now() - start,
175
- };
176
- }
177
- }
package/bench/runner-graphpilot.ts DELETED
@@ -1,131 +0,0 @@
1
- /**
2
- * Run each benchmark task using GraphPilot's primitives directly.
3
- * Measures correctness against task.groundTruth + bytes of output the
4
- * tool returned (proxy for the token cost an agent would pay).
5
- */
6
-
7
- import { loadGraph } from '../src/storage.js';
8
- import { GraphIndex } from '../src/query.js';
9
- import { analyzeImpact } from '../src/impact.js';
10
- import type { Task } from './tasks.js';
11
-
12
- export interface RunResult {
13
- /** Strings the tool returned. For caller/impact tasks, caller names. */
14
- returned: string[];
15
- /** Bytes the tool's structured output occupies as JSON. */
16
- outputBytes: number;
17
- /** Wall-clock for the tool call. */
18
- durationMs: number;
19
- }
20
-
21
- export class GraphpilotRunner {
22
- private readonly idx: GraphIndex;
23
-
24
- constructor(repoRoot: string) {
25
- const g = loadGraph(repoRoot);
26
- if (!g) {
27
- throw new Error(`No graph found at ${repoRoot}. Run \`graphpilot index\` first.`);
28
- }
29
- this.idx = new GraphIndex(g);
30
- }
31
-
32
- run(task: Task): RunResult {
33
- const start = Date.now();
34
- let returned: string[];
35
-
36
- switch (task.kind) {
37
- case 'callers': {
38
- const target = this.idx.resolveSymbol(task.query);
39
- if (!target) {
40
- returned = [];
41
- break;
42
- }
43
- const edges = this.idx.callers(target.id);
44
- const names = new Set<string>();
45
- for (const e of edges) {
46
- const from = this.idx.findById(e.fromId);
47
- if (from) names.add(from.name);
48
- }
49
- returned = [...names].sort();
50
- break;
51
- }
52
-
53
- case 'recall': {
54
- const matches = this.idx.findByName(task.query, { limit: 50 });
55
- returned = matches.map((s) => s.name).sort();
56
- break;
57
- }
58
-
59
- case 'recall-substring': {
60
- const matches = this.idx.findByName(task.query, {
61
- substring: true,
62
- limit: 100,
63
- });
64
- returned = matches.map((s) => s.name).sort();
65
- break;
66
- }
67
-
68
- case 'kind-filter': {
69
- // Filter the full symbol table by kind. The MCP surface doesn't
70
- // expose this as a tool today (would be a v0.2 gp_list_by_kind)
71
- // but the data is in GraphIndex.graph.symbols.
72
- returned = this.idx.graph.symbols
73
- .filter((s) => s.kind === task.query && s.file.startsWith('src/'))
74
- .map((s) => s.name)
75
- .sort();
76
- break;
77
- }
78
-
79
- case 'impact': {
80
- const report = analyzeImpact(this.idx, task.query, { depth: 2 });
81
- if (!report) {
82
- returned = [];
83
- break;
84
- }
85
- const names = new Set<string>();
86
- for (const c of report.directCallers) names.add(c.symbol.name);
87
- for (const c of report.transitiveCallers) names.add(c.symbol.name);
88
- returned = [...names].sort();
89
- break;
90
- }
91
-
92
- case 'tests-affected': {
93
- const report = analyzeImpact(this.idx, task.query, { depth: 3 });
94
- if (!report) {
95
- returned = [];
96
- break;
97
- }
98
- const files = new Set<string>();
99
- for (const c of report.testsAffected) files.add(c.symbol.file);
100
- returned = [...files].sort();
101
- break;
102
- }
103
-
104
- case 'recall-miss': {
105
- const matches = this.idx.findByName(task.query, { limit: 10 });
106
- returned = matches.map((s) => s.name).sort();
107
- break;
108
- }
109
-
110
- case 'string-literal': {
111
- // GraphPilot intentionally doesn't index string literals or
112
- // identifier usages outside structural contexts. Best effort:
113
- // return any file where a symbol whose NAME matches the query
114
- // is defined. This will miss most usages (the honest "grep wins"
115
- // baseline).
116
- const decl = this.idx.findByName(task.query, { limit: 5 });
117
- const files = new Set<string>();
118
- for (const s of decl) files.add(s.file);
119
- returned = [...files].sort();
120
- break;
121
- }
122
- }
123
-
124
- const outputBytes = Buffer.byteLength(JSON.stringify(returned), 'utf8');
125
- return {
126
- returned,
127
- outputBytes,
128
- durationMs: Date.now() - start,
129
- };
130
- }
131
- }
package/bench/score-agent-tier.ts DELETED
@@ -1,191 +0,0 @@
1
- /**
2
- * Tier-B Agent Benchmark Scorer
3
- *
4
- * Parses Claude Code session transcripts (.jsonl) and produces the per-task
5
- * metrics table for the agent-eval benchmark.
6
- *
7
- * Usage:
8
- * npx tsx bench/score-agent-tier.ts \
9
- * --baseline <path-to-baseline.jsonl> \
10
- * --graphpilot <path-to-graphpilot.jsonl> \
11
- * --output bench/results/agent-tier-2026-05-22.md
12
- *
13
- * Input: Claude Code session jsonl (one JSON message per line)
14
- * Output: Markdown table with per-task metrics + aggregate stats
15
- */
16
-
17
- import * as fs from 'node:fs';
18
- import * as path from 'node:path';
19
- import { createReadStream } from 'node:fs';
20
- import { createInterface } from 'node:readline';
21
-
22
- interface Message {
23
- type: string;
24
- role?: string;
25
- content?: string;
26
- [key: string]: any;
27
- }
28
-
29
- interface TaskResult {
30
- id: string;
31
- prompt: string;
32
- // Scorer fills in:
33
- taskSuccessBaseline: 0 | 1;
34
- taskSuccessGraphPilot: 0 | 1;
35
- hallucCountBaseline: number;
36
- hallucCountGraphPilot: number;
37
- tokenCostBaseline: number;
38
- tokenCostGraphPilot: number;
39
- anchorResolutionRate: number; // 0..1, only for GP
40
- diffNoiseRatio: number; // 0..1, only for GP (impact-since tasks)
41
- }
42
-
43
- async function readJsonl(filePath: string): Promise<Message[]> {
44
- const messages: Message[] = [];
45
- const rl = createInterface({
46
- input: createReadStream(filePath),
47
- crlfDelay: Infinity,
48
- });
49
-
50
- for await (const line of rl) {
51
- if (line.trim()) {
52
- messages.push(JSON.parse(line));
53
- }
54
- }
55
- return messages;
56
- }
57
-
58
- function extractTokenUsage(msgs: Message[]): number {
59
- let total = 0;
60
- for (const msg of msgs) {
61
- if (msg.usage) {
62
- total += (msg.usage.input_tokens ?? 0) + (msg.usage.output_tokens ?? 0);
63
- }
64
- }
65
- return total;
66
- }
67
-
68
- /**
69
- * Extract file:line @ sha anchors from transcript text.
70
- * Pattern: src/foo.ts:42 @ ab12cd3
71
- */
72
- function extractAnchors(text: string): Array<{ file: string; line: number; sha: string }> {
73
- const anchors: Array<{ file: string; line: number; sha: string }> = [];
74
- const re = /(\S+\.(?:ts|tsx|js|jsx)):(\d+)(?:\s+@\s+([0-9a-f]{7}))?/g;
75
- let m;
76
- while ((m = re.exec(text))) {
77
- anchors.push({
78
- file: m[1],
79
- line: parseInt(m[2], 10),
80
- sha: m[3] ?? '',
81
- });
82
- }
83
- return anchors;
84
- }
85
-
86
- /**
87
- * Stub scorer: human-in-the-loop.
88
- *
89
- * Real scoring requires:
90
- * 1. Human opens the transcript
91
- * 2. Manually reads the agent's final answer
92
- * 3. Compares to ground truth (from tasks.ts)
93
- * 4. Marks success/hallucination count
94
- *
95
- * This function prompts for input or reads from a pre-filled CSV.
96
- * For now, it's a template showing the metrics to collect.
97
- */
98
- async function scoreTask(
99
- taskId: string,
100
- baselineTranscript: string,
101
- graphpilotTranscript: string,
102
- ): Promise<TaskResult> {
103
- // TODO: Implement human-in-the-loop or CSV reader
104
- // For MVP, return a stub result
105
- return {
106
- id: taskId,
107
- prompt: '(placeholder)',
108
- taskSuccessBaseline: 0,
109
- taskSuccessGraphPilot: 0,
110
- hallucCountBaseline: 0,
111
- hallucCountGraphPilot: 0,
112
- tokenCostBaseline: 0,
113
- tokenCostGraphPilot: 0,
114
- anchorResolutionRate: 0,
115
- diffNoiseRatio: 0,
116
- };
117
- }
118
-
119
- /**
120
- * Format results as Markdown table suitable for README
121
- */
122
- function formatResultsTable(results: TaskResult[]): string {
123
- let md = `# Tier-B Agent Benchmark Results\n\n`;
124
- md += `| Task | Baseline Success | GP Success | Halluc (B) | Halluc (GP) | Tokens (B) | Tokens (GP) | Anchor % | Diff Noise |\n`;
125
- md += `|---|---|---|---|---|---|---|---|---|\n`;
126
-
127
- for (const r of results) {
128
- md += `| ${r.id} | ${r.taskSuccessBaseline} | ${r.taskSuccessGraphPilot} | ${r.hallucCountBaseline} | ${r.hallucCountGraphPilot} | ${r.tokenCostBaseline} | ${r.tokenCostGraphPilot} | ${(r.anchorResolutionRate * 100).toFixed(0)}% | ${(r.diffNoiseRatio * 100).toFixed(0)}% |\n`;
129
- }
130
-
131
- // Aggregate stats
132
- const totalSuccessB = results.reduce((n, r) => n + r.taskSuccessBaseline, 0);
133
- const totalSuccessGP = results.reduce((n, r) => n + r.taskSuccessGraphPilot, 0);
134
- const totalHallucB = results.reduce((n, r) => n + r.hallucCountBaseline, 0);
135
- const totalHallucGP = results.reduce((n, r) => n + r.hallucCountGraphPilot, 0);
136
- const totalTokensB = results.reduce((n, r) => n + r.tokenCostBaseline, 0);
137
- const totalTokensGP = results.reduce((n, r) => n + r.tokenCostGraphPilot, 0);
138
- const avgAnchorRes = results.reduce((n, r) => n + r.anchorResolutionRate, 0) / results.length;
139
-
140
- md += `\n## Summary\n\n`;
141
- md += `- **Baseline success rate:** ${totalSuccessB}/${results.length}\n`;
142
- md += `- **GraphPilot success rate:** ${totalSuccessGP}/${results.length}\n`;
143
- md += `- **Hallucinations (Baseline):** ${totalHallucB}\n`;
144
- md += `- **Hallucinations (GraphPilot):** ${totalHallucGP}\n`;
145
- md += `- **Token cost (Baseline):** ${totalTokensB}\n`;
146
- md += `- **Token cost (GraphPilot):** ${totalTokensGP} (−${((1 - totalTokensGP / totalTokensB) * 100).toFixed(0)}%)\n`;
147
- md += `- **Anchor resolution rate:** ${(avgAnchorRes * 100).toFixed(0)}%\n`;
148
-
149
- return md;
150
- }
151
-
152
- async function main() {
153
- const args = process.argv.slice(2);
154
- const baselineArg =
155
- args.find((a) => a.startsWith('--baseline='))?.split('=')[1] ||
156
- args[args.indexOf('--baseline') + 1];
157
- const graphpilotArg =
158
- args.find((a) => a.startsWith('--graphpilot='))?.split('=')[1] ||
159
- args[args.indexOf('--graphpilot') + 1];
160
- const outputArg =
161
- args.find((a) => a.startsWith('--output='))?.split('=')[1] ||
162
- args[args.indexOf('--output') + 1];
163
-
164
- if (!baselineArg || !graphpilotArg || !outputArg) {
165
- console.error(
166
- 'Usage: npx tsx bench/score-agent-tier.ts --baseline <path> --graphpilot <path> --output <path>',
167
- );
168
- process.exit(1);
169
- }
170
-
171
- console.log('Loading transcripts...');
172
- const baselineData = await readJsonl(baselineArg);
173
- const graphpilotData = await readJsonl(graphpilotArg);
174
-
175
- console.log(`Baseline: ${baselineData.length} messages`);
176
- console.log(`GraphPilot: ${graphpilotData.length} messages`);
177
-
178
- // TODO: Parse transcripts by task boundary, invoke scorer for each pair
179
- console.log('\n[Stub] Scoring requires human review. Prepare scoring input CSV with columns:');
180
- console.log(' task_id, baseline_success (0/1), gp_success (0/1), baseline_halluc, gp_halluc');
181
- console.log('Then run: npx tsx bench/score-agent-tier.ts --scores <csv> --output <path>');
182
-
183
- const results: TaskResult[] = [];
184
- // Placeholder: results would be populated from CSV or human input
185
-
186
- const output = formatResultsTable(results);
187
- fs.writeFileSync(outputArg, output);
188
- console.log(`Results written to ${outputArg}`);
189
- }
190
-
191
- main().catch(console.error);