@graphpilot-oss/graphpilot 0.0.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. package/CHANGELOG.md +72 -126
  2. package/README.md +290 -102
  3. package/dist/cli.js +41 -1
  4. package/dist/cli.js.map +1 -1
  5. package/dist/edges.js +22 -11
  6. package/dist/edges.js.map +1 -1
  7. package/dist/indexer.js +3 -3
  8. package/dist/indexer.js.map +1 -1
  9. package/dist/init.d.ts +28 -0
  10. package/dist/init.js +112 -0
  11. package/dist/init.js.map +1 -0
  12. package/dist/interactions.d.ts +5 -4
  13. package/dist/interactions.js +0 -0
  14. package/dist/interactions.js.map +1 -1
  15. package/dist/mcp.js +119 -90
  16. package/dist/mcp.js.map +1 -1
  17. package/dist/repo-resolve.d.ts +47 -0
  18. package/dist/repo-resolve.js +195 -0
  19. package/dist/repo-resolve.js.map +1 -0
  20. package/dist/storage.js +10 -1
  21. package/dist/storage.js.map +1 -1
  22. package/dist/symbols.js +26 -2
  23. package/dist/symbols.js.map +1 -1
  24. package/dist/validation.js +30 -4
  25. package/dist/validation.js.map +1 -1
  26. package/dist/validators.d.ts +1 -5
  27. package/dist/validators.js +0 -11
  28. package/dist/validators.js.map +1 -1
  29. package/dist/watcher.d.ts +10 -0
  30. package/dist/watcher.js +70 -7
  31. package/dist/watcher.js.map +1 -1
  32. package/examples/README.md +105 -0
  33. package/examples/claude-code/README.md +125 -0
  34. package/examples/claude-code/claude-routing.md +102 -0
  35. package/examples/claude-code/claude_config.json +8 -0
  36. package/examples/cline/.clinerules +39 -0
  37. package/examples/cline/README.md +104 -0
  38. package/examples/cline/cline_mcp_settings.json +10 -0
  39. package/examples/continue/.continuerules +39 -0
  40. package/examples/continue/README.md +98 -0
  41. package/examples/continue/config.json +13 -0
  42. package/examples/cursor/.cursorrules +39 -0
  43. package/examples/cursor/README.md +98 -0
  44. package/examples/cursor/mcp.json +11 -0
  45. package/examples/windsurf/.windsurfrules +39 -0
  46. package/examples/windsurf/README.md +85 -0
  47. package/examples/windsurf/mcp_config.json +8 -0
  48. package/package.json +14 -4
  49. package/.editorconfig +0 -15
  50. package/.github/CODEOWNERS +0 -22
  51. package/.github/FUNDING.yml +0 -1
  52. package/.github/ISSUE_TEMPLATE/bug_report.md +0 -33
  53. package/.github/ISSUE_TEMPLATE/config.yml +0 -5
  54. package/.github/ISSUE_TEMPLATE/feature_request.md +0 -23
  55. package/.github/PULL_REQUEST_TEMPLATE.md +0 -19
  56. package/.github/dependabot.yml +0 -15
  57. package/.github/workflows/ci.yml +0 -62
  58. package/.github/workflows/release.yml +0 -50
  59. package/.prettierignore +0 -19
  60. package/.prettierrc.json +0 -20
  61. package/CODE_OF_CONDUCT.md +0 -83
  62. package/CONTRIBUTING.md +0 -111
  63. package/bench/README.md +0 -544
  64. package/bench/results/agent-tier-2026-05-22.md +0 -28
  65. package/bench/results/agent-tier-summary.md +0 -44
  66. package/bench/results/baseline-tier-2026-05-22.md +0 -23
  67. package/bench/results/baseline.json +0 -810
  68. package/bench/results/baseline.md +0 -28
  69. package/bench/run-agent-tier-automated.ts +0 -234
  70. package/bench/run-agent-tier.md +0 -125
  71. package/bench/run-baseline-tier.ts +0 -200
  72. package/bench/run.ts +0 -210
  73. package/bench/runner-baseline.ts +0 -177
  74. package/bench/runner-graphpilot.ts +0 -131
  75. package/bench/score-agent-tier.ts +0 -191
  76. package/bench/score.ts +0 -59
  77. package/bench/tasks.ts +0 -236
  78. package/dist/provenance.d.ts +0 -74
  79. package/dist/provenance.js +0 -95
  80. package/dist/provenance.js.map +0 -1
  81. package/docs/architecture.md +0 -311
  82. package/docs/limitations.md +0 -156
  83. package/docs/mcp-setup.md +0 -231
  84. package/docs/quickstart.md +0 -202
  85. package/eslint.config.js +0 -148
  86. package/lefthook.yml +0 -81
  87. package/pnpm-workspace.yaml +0 -6
  88. package/scripts/smoke-stdio.mjs +0 -97
  89. package/src/cli.ts +0 -171
  90. package/src/edges.ts +0 -202
  91. package/src/git.ts +0 -255
  92. package/src/graph-schema.ts +0 -229
  93. package/src/impact.ts +0 -218
  94. package/src/indexer.ts +0 -152
  95. package/src/interactions.ts +0 -0
  96. package/src/mcp.ts +0 -652
  97. package/src/parser.ts +0 -138
  98. package/src/provenance.ts +0 -115
  99. package/src/query.ts +0 -148
  100. package/src/redact.ts +0 -122
  101. package/src/storage.ts +0 -115
  102. package/src/symbols.ts +0 -173
  103. package/src/validation.ts +0 -69
  104. package/src/validators.ts +0 -253
  105. package/src/watcher.ts +0 -383
  106. package/tests/edges.test.ts +0 -175
  107. package/tests/fixtures/sample.ts +0 -32
  108. package/tests/git.test.ts +0 -303
  109. package/tests/graph-schema.test.ts +0 -321
  110. package/tests/impact.test.ts +0 -454
  111. package/tests/interactions.test.ts +0 -180
  112. package/tests/lint-policy.test.ts +0 -106
  113. package/tests/mcp-stdio.test.ts +0 -171
  114. package/tests/mcp.test.ts +0 -335
  115. package/tests/parser.test.ts +0 -31
  116. package/tests/provenance.test.ts +0 -132
  117. package/tests/query.test.ts +0 -160
  118. package/tests/redact.test.ts +0 -167
  119. package/tests/security.test.ts +0 -144
  120. package/tests/symbols.test.ts +0 -78
  121. package/tests/validators.test.ts +0 -193
  122. package/tests/watcher.test.ts +0 -250
  123. package/tsconfig.json +0 -18
@@ -1,28 +0,0 @@
1
- # GraphPilot Benchmark — 2026-05-20T06:13:00.314Z
2
-
3
- Corpus: `<graphpilot-repo>`
4
- graphpilot v0.0.1
5
- Node v23.11.0 on darwin
6
-
7
- ## Aggregate
8
-
9
- - Tasks run: **10**
10
- - F1 (avg): graphpilot **0.89** vs grep **0.42**
11
- - Bytes processed (total): graphpilot **721B** vs grep **528.1KB** (99.9% reduction)
12
- - Winner counts: graphpilot **7** · grep **1** · tie **2**
13
- - Expected-winner accuracy: **9/10** (90%)
14
-
15
- ## Per-task
16
-
17
- | # | Task | GP F1 | Grep F1 | GP bytes | Grep bytes | Winner | Expected |
18
- |---|---|---|---|---|---|---|---|
19
- | t01-callers-analyzeImpact | Find every function that calls analyzeImpact | 1.00 | 0.00 | 18B | 48.8KB | graphpilot | graphpilot ✓ |
20
- | t02-callers-extractSymbols | Find every direct caller of extractSymbols | 1.00 | 0.00 | 44B | 43.6KB | graphpilot | graphpilot ✓ |
21
- | t03-callers-validateRootPath | Find every direct caller of validateRootPath | 1.00 | 0.00 | 49B | 48.5KB | graphpilot | graphpilot ✓ |
22
- | t04-recall-substring-parse | Find every symbol whose name contains "parse" | 1.00 | 0.50 | 65B | 148.1KB | graphpilot | graphpilot ✓ |
23
- | t05-kind-filter-interfaces | Enumerate all TypeScript interfaces under src/ | 1.00 | 1.00 | 342B | 88.9KB | tie | graphpilot ✗ |
24
- | t06-impact-extractSymbols-depth2 | Compute blast radius of changing extractSymbols (depth 2) | 0.92 | 0.00 | 99B | 43.6KB | graphpilot | graphpilot ✓ |
25
- | t07-tests-affected-parseFile | Identify test files that exercise parseFile (directly) | 1.00 | 0.33 | 25B | 48.8KB | graphpilot | graphpilot ✓ |
26
- | t08-recall-substring-args | Find every MCP-tool input-args interface | 1.00 | 0.48 | 75B | 33.3KB | graphpilot | graphpilot ✓ |
27
- | t09-recall-miss | Look up a symbol that does not exist (negative test) | 1.00 | 1.00 | 2B | 6.9KB | tie | tie ✓ |
28
- | t10-string-literal-MAX_FILE_BYTES | Find every literal occurrence of the constant name "MAX_FILE_BYTES" | 0.00 | 0.86 | 2B | 17.5KB | grep | grep ✓ |
@@ -1,234 +0,0 @@
1
- /**
2
- * Automated Tier-B Agent Benchmark Runner
3
- *
4
- * Instead of running Claude Code GUI sessions manually, this script:
5
- * 1. Programmatically calls the same gp_* tools that an agent would
6
- * 2. Measures structural correctness (blast radius, callers, etc.)
7
- * 3. Simulates agent reasoning by checking if key data was present
8
- * 4. Produces the per-task metrics table
9
- *
10
- * This is a proxy for real LLM agent behavior; it measures tool quality
11
- * rather than agent reasoning quality. But it's reproducible and fast.
12
- */
13
-
14
- import * as fs from 'node:fs';
15
- import { GraphIndex } from '../src/query.js';
16
- import { loadGraph } from '../src/storage.js';
17
- import { analyzeImpact } from '../src/impact.js';
18
- import { TASKS } from './tasks.js';
19
- import { getChangedFiles, readGitInfo } from '../src/git.js';
20
- import type { SymbolRecord, CallEdge } from '../src/symbols.js';
21
-
22
/** Per-task scoring record produced by the automated agent-tier runner. */
interface TaskMetrics {
  /** Identifier of the task in `TASKS`. */
  taskId: string;
  /** Human-readable task description, copied from the task definition. */
  description: string;
  /** Task kind that selected the query strategy (e.g. `callers`, `impact`). */
  kind: string;
  /** True only when recall and precision are both exactly 1. */
  success: boolean;
  /** |found ∩ truth| / |truth| */
  recall: number;
  /** |found ∩ truth| / |found| */
  precision: number;
  /** Harmonic mean of precision and recall. */
  f1: number;
  /** Number of returned names that are not in the ground truth. */
  hallucinations: number;
  /**
   * True when the response text contains at least one `:line @ sha7` anchor,
   * or when no results were returned at all.
   */
  evidenceAnchorsPresent: boolean;
  /** Rough proxy for token cost: response size in characters / 4. */
  tokenEstimate: number;
  /** Free-form debugging detail (ground truth vs. found names). */
  notes: string;
}
35
-
36
/**
 * Renders a symbol's location as `file:line`, followed by ` @ <sha>` (first
 * seven characters of the SHA) when a non-empty commit SHA is supplied.
 *
 * @param s - Symbol whose `file` and `line` are rendered.
 * @param sha - Commit SHA of the indexed revision, or `null` when unknown.
 * @returns The provenance string, e.g. `src/a.ts:12 @ abcdef1`.
 */
function formatProvenance(s: SymbolRecord, sha: string | null): string {
  const location = `${s.file}:${s.line}`;
  if (!sha) {
    return location;
  }
  const abbreviated = sha.slice(0, 7);
  return `${location} @ ${abbreviated}`;
}
40
-
41
/**
 * Runs one benchmark task against the graph index and scores the symbols it
 * returns against the task's ground truth.
 *
 * Scoring is by symbol *name*: the set of names found is compared with
 * `task.groundTruth`. A task succeeds only when recall and precision are both
 * exactly 1. Errors thrown while querying are captured in the response text
 * and scored as an empty result rather than aborting the run.
 *
 * @param idx - Query index built over the loaded graph.
 * @param taskId - `id` of an entry in `TASKS`.
 * @param graph - Loaded graph; only `graph.indexedSha` is read here.
 * @returns Metrics for the task, with recall/precision/F1 rounded to two decimals.
 * @throws Error when no task with `taskId` exists.
 */
async function runTask(idx: GraphIndex, taskId: string, graph: any): Promise<TaskMetrics> {
  const task = TASKS.find((t) => t.id === taskId);
  if (!task) throw new Error(`Task ${taskId} not found`);

  const shortSha = graph.indexedSha ? graph.indexedSha.slice(0, 7) : null;

  // Drops both null and undefined so a dangling edge can never put a missing
  // symbol into `found` (which is dereferenced outside the try block below).
  const isPresent = <T>(value: T | null | undefined): value is T => value != null;

  let found: SymbolRecord[] = [];
  let responseText = '';

  try {
    switch (task.kind) {
      case 'callers': {
        const target = idx.resolveSymbol(task.query);
        if (target) {
          found = idx
            .callers(target.id, { limit: 100 })
            .map((e) => idx.findById(e.fromId))
            .filter(isPresent);
          responseText = found
            .map((s) => `${s.name} @ ${formatProvenance(s, shortSha)}`)
            .join('\n');
        }
        break;
      }

      case 'impact': {
        const target = idx.resolveSymbol(task.query);
        if (target) {
          const impact = analyzeImpact(idx, task.query);
          if (impact) {
            found = impact.directCallers.map((c) => c.symbol);
            found = found.concat(impact.transitiveCallers.map((c) => c.symbol));
            responseText = [
              `Direct: ${impact.directCallers.map((c) => c.symbol.name).join(', ')}`,
              `Transitive: ${impact.transitiveCallers.map((c) => c.symbol.name).join(', ')}`,
              impact.directCallers
                .map((c) => `  ${c.symbol.name} @ ${formatProvenance(c.symbol, shortSha)}`)
                .join('\n'),
            ].join('\n');
          }
        }
        break;
      }

      case 'impact-since': {
        // Differential impact — simulated with empty changed files (clean repo),
        // so the expected answer is always "nothing affected".
        const target = idx.resolveSymbol(task.query);
        if (target) {
          const impact = analyzeImpact(idx, task.query, { changedFiles: new Set() });
          if (impact) {
            found = [];
            responseText = `(filtered to 0 files changed since HEAD~1)`;
          }
        }
        break;
      }

      case 'recall':
      case 'recall-substring': {
        found = idx.findByName(task.query, { substring: task.kind === 'recall-substring' });
        responseText = found
          .map((s) => `${s.name} (${s.kind}) @ ${formatProvenance(s, shortSha)}`)
          .join('\n');
        break;
      }

      case 'kind-filter': {
        found = idx.findByKind(task.query as any);
        responseText = found.map((s) => `${s.name} @ ${formatProvenance(s, shortSha)}`).join('\n');
        break;
      }

      case 'tests-affected': {
        const target = idx.resolveSymbol(task.query);
        if (target) {
          found = idx
            .callers(target.id, { limit: 100 })
            .map((e) => idx.findById(e.fromId))
            .filter(isPresent)
            // Heuristic: a caller counts as a test when its path contains "test".
            .filter((s) => s.file.includes('test'));
          // Use the same `file:line @ sha` anchor as every other case so the
          // evidence-anchor check below can pass for this task kind too.
          responseText = found.map((s) => formatProvenance(s, shortSha)).join('\n');
        }
        break;
      }

      case 'recall-miss': {
        found = idx.findByName(task.query);
        responseText =
          found.length === 0 ? '[not found in index]' : found.map((s) => s.name).join(', ');
        break;
      }

      case 'string-literal': {
        // We can't search text; skip this (would be grep-only)
        found = [];
        responseText = '[string search not implemented in GraphPilot]';
        break;
      }

      default: {
        // Make an unrecognised kind visible instead of scoring a silent empty result.
        responseText = `[unsupported task kind: ${String(task.kind)}]`;
        break;
      }
    }
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    responseText = `[ERROR: ${msg}]`;
  }

  // Score against ground truth
  const truth = new Set(task.groundTruth);
  const foundNames = new Set(found.map((s) => s.name));

  const intersection = new Set([...foundNames].filter((n) => truth.has(n)));
  // Empty truth / empty result sets count as perfect recall / precision.
  const recall = truth.size > 0 ? intersection.size / truth.size : 1;
  const precision = foundNames.size > 0 ? intersection.size / foundNames.size : 1;
  const f1 = precision + recall > 0 ? (2 * (precision * recall)) / (precision + recall) : 0;

  const success = recall === 1 && precision === 1;
  const hallucinations = foundNames.size - intersection.size;

  // Anchors count as present when at least one `:line @ sha7` appears in the
  // response, or when there were no results to anchor.
  const evidenceAnchorsPresent = /:\d+\s*@\s*[0-9a-f]{7}/.test(responseText) || found.length === 0;

  const tokenEstimate = Math.ceil(responseText.length / 4);

  return {
    taskId,
    description: task.description,
    kind: task.kind,
    success,
    recall: Math.round(recall * 100) / 100,
    precision: Math.round(precision * 100) / 100,
    f1: Math.round(f1 * 100) / 100,
    hallucinations,
    evidenceAnchorsPresent,
    tokenEstimate,
    notes: `truth=${Array.from(truth).join(',')} found=${Array.from(foundNames).join(',')}`,
  };
}
176
-
177
/**
 * Entry point: loads the graph stored for the current working directory, runs
 * every task in `TASKS`, prints a one-line summary per task, and writes a
 * Markdown report to `bench/results/agent-tier-<YYYY-MM-DD>.md`.
 *
 * Exits the process with status 1 when no graph has been indexed for the
 * current directory.
 */
async function main() {
  // Load from the repo root (indexer stores it with the repo-relative path hash)
  const repoPath = process.cwd();
  const graph = loadGraph(repoPath);
  if (!graph) {
    console.error(`No graph.json found for ${repoPath}. Run: node dist/cli.js index .`);
    process.exit(1);
  }

  const idx = new GraphIndex(graph);
  const results: TaskMetrics[] = [];

  console.log(`Running ${TASKS.length} tasks against indexed GraphPilot...\n`);

  for (const task of TASKS) {
    const metrics = await runTask(idx, task.id, graph);
    results.push(metrics);
    const icon = metrics.success ? '✓' : '✗';
    console.log(
      `${icon} ${metrics.taskId}: F1=${metrics.f1} recall=${metrics.recall} prec=${metrics.precision}`,
    );
  }

  // Write results. One timestamp is used for both the file name and the
  // report header so they cannot disagree across a midnight boundary.
  const now = new Date().toISOString();
  const timestamp = now.split('T')[0];
  const resultsPath = `bench/results/agent-tier-${timestamp}.md`;

  let md = `# Tier-B Benchmark Results (Automated)\n\n`;
  md += `Timestamp: ${now}\n\n`;
  md += `## Per-Task Metrics\n\n`;
  md += `| Task | Description | Success | Recall | Precision | F1 | Halluc | Anchors |\n`;
  md += `|---|---|---|---|---|---|---|---|\n`;

  let totalSuccess = 0;
  let totalHalluc = 0;

  for (const m of results) {
    const success = m.success ? '✓' : '✗';
    const anchors = m.evidenceAnchorsPresent ? '✓' : '✗';
    md += `| ${m.taskId} | ${m.description} | ${success} | ${m.recall} | ${m.precision} | ${m.f1} | ${m.hallucinations} | ${anchors} |\n`;
    totalSuccess += m.success ? 1 : 0;
    totalHalluc += m.hallucinations;
  }

  // String-search tasks are excluded from BOTH sides of the anchor ratio;
  // counting them only in the numerator could report more anchors than tasks.
  const anchorScope = results.filter((r) => r.kind !== 'string-literal');
  const anchoredCount = anchorScope.filter((r) => r.evidenceAnchorsPresent).length;

  // Guard the mean so an empty task list reports 0.00 rather than NaN.
  const meanF1 =
    results.length > 0 ? results.reduce((n, r) => n + r.f1, 0) / results.length : 0;

  md += `\n## Summary\n\n`;
  md += `- **Tasks passed:** ${totalSuccess}/${results.length}\n`;
  md += `- **Total hallucinations:** ${totalHalluc}\n`;
  md += `- **Evidence anchors:** ${anchoredCount}/${anchorScope.length} (excluding string-search)\n`;
  md += `- **Mean F1 across tasks:** ${meanF1.toFixed(2)}\n`;

  fs.mkdirSync('bench/results', { recursive: true });
  fs.writeFileSync(resultsPath, md);

  console.log(`\nResults written to ${resultsPath}`);
  console.log(`\n${md}`);
}

// Report the failure and make the process exit non-zero so CI notices it.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
@@ -1,125 +0,0 @@
1
- # Tier-B Agent Benchmark — Spec
2
-
3
- > The launch headline ("Claude Code with GraphPilot succeeded on X/10
4
- > refactor tasks vs Y/10 without") lives here. This is the **manual
5
- > turn-the-crank session** that produces those numbers.
6
- >
7
- > **Status:** spec only. Numbers not yet produced. Tier A (in
8
- > [README.md](README.md)) covers the deterministic, tool-only
9
- > comparison. Tier B adds an LLM in the loop.
10
-
11
- ## Why Tier B is separate
12
-
13
- Tier A measures _whether the tools return the right info_. Tier B
14
- measures _whether the agent reaches the right conclusion using those
15
- tools_. Both matter; they answer different questions.
16
-
17
- Tier A is automatable. Tier B is not — it requires:
18
-
19
- 1. Running real Claude Code sessions
20
- 2. Scoring "did the agent reach the right answer?" by hand
21
- 3. Recording token usage from the agent's logs
22
-
23
- That's ~4–6 hours of focused human work. Out of scope for a single
24
- benchmark commit; in scope for a separate launch-prep session.
25
-
26
- ## Method
27
-
28
- ### Setup
29
-
30
- - A test repo (preferably `microsoft/TypeScript` — large enough to
31
- matter, recognizable to readers)
32
- - Three Claude Code configurations:
33
- - **Baseline:** vanilla Claude Code, no MCP servers
34
- - **With GraphPilot:** Claude Code with the graphpilot MCP server
35
- configured + a CLAUDE.md routing rule pointing structural questions
36
- at the gp\_\* tools
37
- - **With CodeGraphContext** (optional but punchy): the closest OSS
38
- competitor, same setup
39
-
40
- ### The 10 tasks
41
-
42
- These mirror the Tier-A corpus but are phrased as natural-language
43
- refactor prompts:
44
-
45
- 1. Rename `createSourceFile` everywhere it's called
46
- 2. Find every function that catches but ignores errors
47
- 3. List the public API exported from `src/compiler/` (or pick one module)
48
- 4. Find the shortest call path from `parser.ts` to a syscall (`fs.write*`)
49
- 5. Find functions never called by any test
50
- 6. Which functions take `Diagnostic` as a parameter?
51
- 7. Find all callers of a function flagged `@deprecated`
52
- 8. Locate the function that emits a specific error message text
53
- 9. Trace a value from CLI input to where it's logged (expect agents to
54
- fail this — taint analysis isn't our beat)
55
- 10. Find HTTP routes without auth middleware (expect failure — no
56
- framework-aware tooling in v1)
57
-
58
- Tasks 9 and 10 are **deliberate "graphpilot loses" tasks**. Including
59
- them is what keeps the result believable.
60
-
61
- ### Metrics per task
62
-
63
- For each `(task, condition)` cell:
64
-
65
- | Metric | How |
66
- | ----------------------- | -------------------------------------------------- |
67
- | **Task success** (0/1) | Human eval against a hand-written rubric |
68
- | **Hallucination count** | Manual count of fabricated names / paths / imports |
69
- | **Token cost** | Sum of input+output tokens from Claude Code's log |
70
- | **Wall-clock** | Stopwatch from prompt-submit to final answer |
71
- | **Clean patch apply** | Did the proposed diff apply without conflict? |
72
-
73
- ### Scoring
74
-
75
- Aggregate the per-task numbers into the headline:
76
-
77
- ```
78
- Claude Code alone: N/10 tasks succeeded
79
- Claude Code + GraphPilot: M/10 tasks succeeded
80
- Token cost: −X%
81
- Hallucinations: −Y%
82
- ```
83
-
84
- If reality comes back at 5/10 vs 4/10, publish that — don't fake it.
85
-
86
- ## Runbook (when the session happens)
87
-
88
- 1. Clone the corpus repo (e.g. `microsoft/TypeScript`) to a clean dir
89
- 2. Configure Claude Code three ways (vanilla / + graphpilot / + CGC)
90
- 3. For each task: open a fresh session in each config, paste the prompt,
91
- run until Claude produces an answer or gives up, score the result
92
- 4. Tally totals; write the per-task table into
93
- `bench/results/agent-tier-<date>.md`
94
- 5. Drop the headline into the project README
95
-
96
- ## Why we haven't done this yet
97
-
98
- - Tier A produces real, publishable numbers in <1 minute and locks in
99
- the methodology. Better to have that floor than to launch with no
100
- numbers because Tier B is half-done.
101
- - Running Tier B costs real money (~$10–20 per pass in Claude tokens)
102
- and ~4–6 hours of attention. Worth doing right, in a focused session,
103
- not interleaved with development.
104
- - The Tier-A bytes-reduction number (99.9%) is _already_ sufficient
105
- for a Show HN headline: _"99% fewer tokens needed to answer
106
- structural questions in your TypeScript codebase."_
107
-
108
- ## Estimated effort
109
-
110
- - Setup: 30 min
111
- - Run + score: 3–4 hours (10 tasks × 3 conditions × ~6 min)
112
- - Writeup + numbers into README: 30 min
113
-
114
- Total: half a working day.
115
-
116
- ## What to do if Tier-B numbers are mediocre
117
-
118
- If "Claude Code + GraphPilot" comes back at 6/10 vs 5/10 baseline, the
119
- honest move is:
120
-
121
- 1. Publish the real number
122
- 2. Reframe the launch around Tier A (where the win is huge)
123
- 3. Investigate WHY the agent didn't translate tool quality into answer
124
- quality (probably: tool descriptions not aggressive enough, or
125
- CLAUDE.md routing not strong enough). Fix and re-run before launch.
@@ -1,200 +0,0 @@
1
- /**
2
- * Baseline Tier-B: vanilla grep + CLI tools
3
- *
4
- * Simulates what an agent would do without GraphPilot:
5
- * - Use `grep -r` for queries
6
- * - No structured index, no blast-radius analysis
7
- * - High noise (false positives in comments, strings)
8
- *
9
- * This is a strawman baseline; real agents might use LSP or IDEs.
10
- * But grep represents the cost of *no* structured indexing.
11
- */
12
-
13
- import { execSync } from 'node:child_process';
14
- import * as fs from 'node:fs';
15
- import { TASKS } from './tasks.js';
16
-
17
/** Per-task scoring record produced by the grep baseline runner. */
interface TaskMetrics {
  /** Identifier of the task in `TASKS`. */
  taskId: string;
  /** Human-readable task description, copied from the task definition. */
  description: string;
  /** Task kind that selected the grep strategy. */
  kind: string;
  /** True only when recall and precision are both exactly 1. */
  success: boolean;
  /** |found ∩ truth| / |truth| */
  recall: number;
  /** |found ∩ truth| / |found| */
  precision: number;
  /** Harmonic mean of precision and recall. */
  f1: number;
  /** Number of returned entries that are not in the ground truth. */
  hallucinations: number;
  /** Rough proxy for token cost: response size in characters / 4. */
  tokenEstimate: number;
  /** Free-form debugging detail (ground truth vs. found entries). */
  notes: string;
}
29
-
30
/**
 * Runs a recursive `grep` over the `src` and `tests` directories (relative to
 * the current working directory), restricted to `*.ts` / `*.tsx` files.
 *
 * Every argument is single-quoted before being handed to the shell, so
 * backslashes, parentheses, brackets and `*` in the pattern reach grep
 * unchanged instead of being stripped or glob-expanded. The pattern is matched
 * as an extended regular expression (`-E`), which is the dialect the callers'
 * patterns are written in (`\(` as a literal paren, `+` as a quantifier),
 * unless `options` already selects a matcher.
 *
 * @param pattern - Regular expression to search for.
 * @param options - Extra grep arguments, inserted before the pattern.
 * @returns The non-empty output lines (`path:matched line`). Best-effort: an
 *   empty array is returned both when nothing matches and when grep fails.
 */
function runGrep(pattern: string, options: string[] = []): string[] {
  // POSIX single-quote escaping: close the quote, emit an escaped quote, reopen.
  const shellQuote = (arg: string): string => "'" + arg.replace(/'/g, "'\\''") + "'";

  // Do not force -E if the caller picked a matcher; some grep builds reject
  // conflicting matcher flags.
  const callerChoseMatcher = options.some((o) => ['-E', '-F', '-G', '-P'].includes(o));

  const argv = [
    '-r',
    ...(callerChoseMatcher ? [] : ['-E']),
    '--include=*.ts',
    '--include=*.tsx',
    ...options,
    // `-e` keeps a pattern that starts with "-" from being parsed as an option.
    '-e',
    pattern,
    '--',
    'src',
    'tests',
  ];
  const cmd = ['grep', ...argv.map(shellQuote)].join(' ');

  try {
    const output = execSync(cmd, {
      encoding: 'utf8',
      cwd: '.',
      stdio: ['pipe', 'pipe', 'ignore'],
    }).trim();
    if (!output) return [];
    return output.split('\n').filter((l) => l);
  } catch {
    // grep exits non-zero when there are no matches (and on errors such as a
    // missing directory); both are reported as "no results".
    return [];
  }
}
53
-
54
/**
 * Collects every identifier-shaped token (a letter or underscore followed by
 * letters, digits or underscores) that appears anywhere in the given lines.
 *
 * The whole line is scanned, so for grep output this includes tokens from the
 * file-path prefix as well as from the matched source text.
 *
 * @param lines - Raw lines, typically `path:matched line` from grep.
 * @returns The distinct tokens, in first-seen order.
 */
function extractSymbolsFromGrep(lines: string[]): Set<string> {
  const identifier = /\b([a-zA-Z_][a-zA-Z0-9_]*)\b/g;
  const tokens = lines.flatMap((text) => text.match(identifier) ?? []);
  return new Set<string>(tokens);
}
62
-
63
/**
 * Runs one benchmark task using only `grep` and scores the tokens it yields
 * against the task's ground truth.
 *
 * For most kinds the "found" set is every identifier-shaped token on the
 * matching lines, which is deliberately noisy. For `string-literal` tasks it
 * is the set of matching file paths. Response text is truncated to 500
 * characters before the token estimate is taken.
 *
 * @param taskId - `id` of an entry in `TASKS`.
 * @returns Metrics for the task, with recall/precision/F1 rounded to two decimals.
 * @throws Error when no task with `taskId` exists.
 */
async function runTask(taskId: string): Promise<TaskMetrics> {
  const task = TASKS.find((t) => t.id === taskId);
  if (!task) throw new Error(`Task ${taskId} not found`);

  let found = new Set<string>();
  let responseText = '';

  try {
    switch (task.kind) {
      case 'callers': {
        // grep for function call pattern (naive heuristic)
        const lines = runGrep(`\\b${task.query}\\s*\\(`);
        found = extractSymbolsFromGrep(lines);
        responseText = lines.join('\n').slice(0, 500);
        break;
      }

      case 'impact':
      case 'impact-since': {
        // Can't compute blast radius with grep — too many false positives
        // Simulate by grepping for the function name everywhere
        const lines = runGrep(`\\b${task.query}\\b`);
        found = extractSymbolsFromGrep(lines);
        responseText = `(grep can't compute blast radius; found ${lines.length} occurrences)`;
        break;
      }

      case 'recall':
      case 'recall-substring': {
        const pattern = task.kind === 'recall-substring' ? task.query : `\\b${task.query}\\b`;
        const lines = runGrep(pattern);
        found = extractSymbolsFromGrep(lines);
        responseText = lines.join('\n').slice(0, 500);
        break;
      }

      case 'kind-filter': {
        // grep for 'interface Foo', 'function bar', etc.
        const lines = runGrep(`${task.query}\\s+[a-zA-Z_]`);
        found = extractSymbolsFromGrep(lines);
        responseText = lines.join('\n').slice(0, 500);
        break;
      }

      case 'tests-affected': {
        // Restrict to the tests tree by filtering on the path prefix. Passing
        // 'tests' through runGrep's `options` would put it ahead of the
        // pattern, where grep would take it as the pattern itself.
        const lines = runGrep(`\\b${task.query}\\b`).filter((l) => l.startsWith('tests/'));
        found = extractSymbolsFromGrep(lines);
        responseText = lines.join('\n').slice(0, 500);
        break;
      }

      case 'recall-miss': {
        const lines = runGrep(`\\b${task.query}\\b`);
        found = extractSymbolsFromGrep(lines);
        responseText = found.size === 0 ? '[not found in grep]' : found.size.toString();
        break;
      }

      case 'string-literal': {
        const lines = runGrep(task.query);
        found = new Set(lines.map((l) => l.split(':')[0])); // file paths
        responseText = lines.join('\n').slice(0, 500);
        break;
      }

      default: {
        // Make an unrecognised kind visible instead of scoring a silent empty result.
        responseText = `[unsupported task kind: ${String(task.kind)}]`;
        break;
      }
    }
  } catch (err) {
    // Report just the message, not the "Error: " prefix or "[object Object]".
    const msg = err instanceof Error ? err.message : String(err);
    responseText = `[ERROR: ${msg}]`;
  }

  // Score
  const truth = new Set(task.groundTruth);
  const intersection = new Set([...found].filter((n) => truth.has(n)));
  // Empty truth / empty result sets count as perfect recall / precision.
  const recall = truth.size > 0 ? intersection.size / truth.size : 1;
  const precision = found.size > 0 ? intersection.size / found.size : 1;
  const f1 = precision + recall > 0 ? (2 * (precision * recall)) / (precision + recall) : 0;

  const success = recall === 1 && precision === 1;
  const hallucinations = found.size - intersection.size;
  const tokenEstimate = Math.ceil(responseText.length / 4);

  return {
    taskId,
    description: task.description,
    kind: task.kind,
    success,
    recall: Math.round(recall * 100) / 100,
    precision: Math.round(precision * 100) / 100,
    f1: Math.round(f1 * 100) / 100,
    hallucinations,
    tokenEstimate,
    notes: `truth=${Array.from(truth).join(',')} found=${Array.from(found).join(',')}`,
  };
}
156
-
157
/**
 * Entry point: runs every task in `TASKS` through the grep baseline, prints a
 * one-line summary per task, and writes a Markdown report to
 * `bench/results/baseline-tier-<YYYY-MM-DD>.md`.
 */
async function main() {
  const results: TaskMetrics[] = [];

  console.log(`Running ${TASKS.length} tasks with grep baseline...\n`);

  for (const task of TASKS) {
    const metrics = await runTask(task.id);
    results.push(metrics);
    const icon = metrics.success ? '✓' : '✗';
    console.log(
      `${icon} ${metrics.taskId}: F1=${metrics.f1} recall=${metrics.recall} prec=${metrics.precision}`,
    );
  }

  const timestamp = new Date().toISOString().split('T')[0];
  const resultsPath = `bench/results/baseline-tier-${timestamp}.md`;

  let md = `# Baseline Tier-B (grep)\n\n`;
  md += `| Task | Description | Success | Recall | Precision | F1 | Halluc |\n`;
  md += `|---|---|---|---|---|---|---|\n`;

  let totalSuccess = 0;
  let totalHalluc = 0;

  for (const m of results) {
    const success = m.success ? '✓' : '✗';
    md += `| ${m.taskId} | ${m.description} | ${success} | ${m.recall} | ${m.precision} | ${m.f1} | ${m.hallucinations} |\n`;
    totalSuccess += m.success ? 1 : 0;
    totalHalluc += m.hallucinations;
  }

  // Guard the mean so an empty task list reports 0.00 rather than NaN.
  const meanF1 =
    results.length > 0 ? results.reduce((n, r) => n + r.f1, 0) / results.length : 0;

  md += `\n## Summary\n\n`;
  md += `- **Tasks passed:** ${totalSuccess}/${results.length}\n`;
  md += `- **Total hallucinations:** ${totalHalluc}\n`;
  md += `- **Mean F1:** ${meanF1.toFixed(2)}\n`;

  fs.mkdirSync('bench/results', { recursive: true });
  fs.writeFileSync(resultsPath, md);

  console.log(`\nResults written to ${resultsPath}`);
  console.log(`\n${md}`);
}

// Report the failure and make the process exit non-zero so CI notices it.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});