@mishasinitcyn/betterrank 0.2.2 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -31,9 +31,12 @@ betterrank callers authenticateUser --root /path/to/project --context
31
31
  # Everything about a function: source, types, deps, callers
32
32
  betterrank context calculate_bid --root /path/to/project
33
33
 
34
- # Trace the full call chain from entry point to function
34
+ # Trace the full call chain from entry point to function (upward)
35
35
  betterrank trace calculate_bid --root /path/to/project
36
36
 
37
+ # What does a function call, recursively? (downward)
38
+ betterrank callees calculate_bid --root /path/to/project
39
+
37
40
  # What symbols changed and what might break?
38
41
  betterrank diff --root /path/to/project
39
42
 
@@ -186,7 +189,32 @@ Callers (1 file):
186
189
  src/engine/bidding.py
187
190
  ```
188
191
 
189
- ### `trace` — Recursive caller chain
192
+ ### `history` — Git history of a specific function
193
+
194
+ Shows only commits that touched a function's lines. Uses tree-sitter line ranges for accuracy (better than git's heuristic `:funcname:` detection). Add `--patch` to include function-scoped diffs.
195
+
196
+ ```bash
197
+ # Commit list (compact)
198
+ betterrank history calculate_bid --root /path/to/project
199
+
200
+ # With function-scoped diffs
201
+ betterrank history calculate_bid --root /path/to/project --patch --limit 3
202
+
203
+ # Paginate through older commits
204
+ betterrank history calculate_bid --root /path/to/project --offset 5 --limit 5
205
+ ```
206
+
207
+ **Example output:**
208
+ ```
209
+ calculate_bid (src/engine/bidding.py:489-718)
210
+
211
+ 082b9d5 2026-02-24 fix: restore GSP auction pricing
212
+ c75f5ff 2026-02-14 fix: resolve lint errors from main merge
213
+ 623429c 2026-02-13 hot fix
214
+ 5d236d3 2026-02-06 feat: wire ad_position to ValuePredictor
215
+ ```
216
+
217
+ ### `trace` — Recursive caller chain (upward)
190
218
 
191
219
  Walk UP the call graph from a symbol to see the full path from entry points to your function. At each hop, resolves which function in the caller file contains the call site.
192
220
 
@@ -203,6 +231,26 @@ calculate_bid (src/engine/bidding.py:489)
203
231
  ← app (src/main.py:45)
204
232
  ```
205
233
 
234
+ ### `callees` — Recursive callee chain (downward)
235
+
236
+ Walk DOWN the call graph from a symbol to see everything it calls, transitively. The mirror of `trace`. Use before refactoring to understand downstream dependencies.
237
+
238
+ ```bash
239
+ betterrank callees calculate_bid --root /path/to/project
240
+ betterrank callees calculate_bid --root /path/to/project --depth 5
241
+ ```
242
+
243
+ **Example output:**
244
+ ```
245
+ calculate_bid (src/engine/bidding.py:489)
246
+ → from_microdollars (src/core/currency.py:108)
247
+ → get_config (src/engine/predictor/config.py:316)
248
+ → load_yaml (src/core/config.py:22)
249
+ → get_value_predictor (src/engine/predictor/persistence.py:123)
250
+ ```
251
+
252
+ Use both together for a full "sandwich view" of a function — who calls it (upstream) and what it touches (downstream).
253
+
206
254
  ### `diff` — Git-aware blast radius
207
255
 
208
256
  Shows which symbols changed in the working tree and how many external files call each changed symbol. Compares current disk state against a git ref.
@@ -290,6 +338,7 @@ const idx = new CodeIndex('/path/to/project');
290
338
  const map = await idx.map({ limit: 100, focusFiles: ['src/main.ts'] });
291
339
  const results = await idx.search({ query: 'auth', kind: 'function', limit: 10 });
292
340
  const callers = await idx.callers({ symbol: 'authenticate', context: 2 });
341
+ const tree = await idx.callees({ symbol: 'authenticate', depth: 3 });
293
342
  const counts = await idx.getCallerCounts('src/auth.ts');
294
343
  const deps = await idx.dependencies({ file: 'src/auth.ts' });
295
344
  const dependents = await idx.dependents({ file: 'src/auth.ts' });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@mishasinitcyn/betterrank",
3
- "version": "0.2.2",
3
+ "version": "0.2.5",
4
4
  "description": "Structural code index with PageRank-ranked repo maps, symbol search, call-graph queries, and dependency analysis. Built on tree-sitter and graphology.",
5
5
  "type": "module",
6
6
  "main": "src/index.js",
package/src/cli.js CHANGED
@@ -19,12 +19,15 @@ Commands:
19
19
  symbols [--file path] [--kind type] List definitions (ranked by PageRank)
20
20
  callers <symbol> [--file path] [--context] All call sites (ranked, with context lines)
21
21
  context <symbol> [--file path] Full context: source, deps, types, callers
22
- trace <symbol> [--depth N] Recursive caller chain (call tree)
22
+ history <symbol> [--file path] Git history of a specific function
23
+ trace <symbol> [--depth N] Recursive caller chain (upward)
24
+ callees <symbol> [--depth N] Recursive callee chain (downward)
23
25
  diff [--ref <commit>] Git-aware blast radius (changed symbols + callers)
24
26
  deps <file> What this file imports (ranked)
25
27
  dependents <file> What imports this file (ranked)
26
28
  neighborhood <file> [--hops N] [--max-files N] Local subgraph (ranked by PageRank)
27
29
  orphans [--level file|symbol] [--kind type] Find disconnected files/symbols
30
+ compare <pathA> <pathB> Structural diff between two files/dirs
28
31
  reindex Force full rebuild
29
32
  stats Index statistics
30
33
 
@@ -154,6 +157,24 @@ Examples:
154
157
  betterrank context calculate_bid --root .
155
158
  betterrank context Router --file src/llm.py --root .`,
156
159
 
160
+ history: `betterrank history <symbol> [--file path] [--patch] [--limit N] [--root <path>]
161
+
162
+ Git history of a specific function. Uses the tree-sitter line range to
163
+ show only commits that touched that function's lines.
164
+
165
+ More accurate than git log -L :funcname: because betterrank knows the
166
+ exact line range from tree-sitter, not git's heuristic function detection.
167
+
168
+ Options:
169
+ --file <path> Disambiguate when multiple symbols share a name
170
+ --patch, -p Include function-scoped diffs (not just commit list)
171
+ --limit N Max commits to show (default: 20)
172
+
173
+ Examples:
174
+ betterrank history calculate_bid --root .
175
+ betterrank history calculate_bid --root . --patch --limit 3
176
+ betterrank history Router --file src/llm.py --root .`,
177
+
157
178
  trace: `betterrank trace <symbol> [--depth N] [--file path] [--root <path>]
158
179
 
159
180
  Recursive caller chain — walk UP the call graph from a symbol to see
@@ -250,6 +271,27 @@ Examples:
250
271
  betterrank orphans --level symbol --kind function --root .
251
272
  betterrank orphans --count --root .`,
252
273
 
274
+ compare: `betterrank compare <pathA> <pathB> [--kind type] [--include-tests]
275
+
276
+ Structural diff between two files or directories. Shows which symbols
277
+ exist in both, which are unique to each side, and how their signatures
278
+ and dependencies differ. No scores — just deterministic structural facts.
279
+
280
+ For directories: also shows file-level overlap (shared basenames).
281
+ By default filters out test files and generic names (get, set, __init__, etc.)
282
+ to focus on meaningful structural overlap.
283
+
284
+ Options:
285
+ --kind <type> Filter to: function, class, type, variable
286
+ --include-tests Include test files and test_ functions
287
+ --limit N Max items per section (default: 30)
288
+
289
+ Examples:
290
+ betterrank compare src/auth.py lib/auth.py
291
+ betterrank compare ./repo-a ./repo-b
292
+ betterrank compare ./repo-a ./repo-b --kind function
293
+ betterrank compare flask/app.py bottle/bottle.py --kind class`,
294
+
253
295
  reindex: `betterrank reindex [--root <path>]
254
296
 
255
297
  Force a full rebuild of the index. Use after branch switches, large merges,
@@ -309,6 +351,126 @@ async function main() {
309
351
  return; // Keep process alive (server is listening)
310
352
  }
311
353
 
354
+ // Compare command — standalone, doesn't need CodeIndex
355
+ if (command === 'compare') {
356
+ const pathA = flags._positional[0];
357
+ const pathB = flags._positional[1];
358
+ if (!pathA || !pathB) {
359
+ console.error('Usage: betterrank compare <pathA> <pathB> [--kind type]');
360
+ process.exit(1);
361
+ }
362
+ const absA = resolve(pathA);
363
+ const absB = resolve(pathB);
364
+ const { compare } = await import('./compare.js');
365
+ const includeTests = flags['include-tests'] === true;
366
+ const countMode = flags.count === true;
367
+
368
+ let result;
369
+ try {
370
+ result = await compare(absA, absB, { kind: flags.kind, includeTests });
371
+ } catch (err) {
372
+ console.error(err.message);
373
+ process.exit(1);
374
+ }
375
+
376
+ // --count mode: just print totals
377
+ if (countMode) {
378
+ const sm = result.summary;
379
+ console.log(`shared: ${sm.sharedNames}`);
380
+ console.log(`only_a: ${sm.onlyACount}`);
381
+ console.log(`only_b: ${sm.onlyBCount}`);
382
+ console.log(`total_a: ${sm.totalA}`);
383
+ console.log(`total_b: ${sm.totalB}`);
384
+ return;
385
+ }
386
+
387
+ const limit = flags.limit !== undefined ? parseInt(flags.limit, 10) : DEFAULT_LIMIT;
388
+ const off = flags.offset !== undefined ? parseInt(flags.offset, 10) : 0;
389
+
390
+ // Helper: paginate a list and print a range header
391
+ const paginateSection = (items, label) => {
392
+ const total = items.length;
393
+ if (total === 0) return [];
394
+ const shown = items.slice(off, off + limit);
395
+ if (shown.length === 0) {
396
+ console.log(`\n── ${label} (${total}) ── (offset ${off} exceeds ${total} results)`);
397
+ return [];
398
+ }
399
+ const rangeStr = total > limit || off > 0 ? `, showing ${off + 1}-${off + shown.length}` : '';
400
+ console.log(`\n── ${label} (${total}${rangeStr}) ──`);
401
+ return shown;
402
+ };
403
+
404
+ // Warn if either side had zero symbols
405
+ if (result.summary.totalA === 0) {
406
+ process.stderr.write(`⚠ No parseable symbols found in A: ${result.labelA}\n`);
407
+ }
408
+ if (result.summary.totalB === 0) {
409
+ process.stderr.write(`⚠ No parseable symbols found in B: ${result.labelB}\n`);
410
+ }
411
+
412
+ // Summary first (most useful at a glance)
413
+ const sm = result.summary;
414
+ console.log(`── Summary ──`);
415
+ console.log(` A: ${result.labelA} (${sm.totalA} symbols)`);
416
+ console.log(` B: ${result.labelB} (${sm.totalB} symbols)`);
417
+ console.log(` Shared names: ${sm.sharedNames} | Only A: ${sm.onlyACount} | Only B: ${sm.onlyBCount}`);
418
+
419
+ // File-level overlap (directory mode)
420
+ if (result.isDirectoryMode) {
421
+ console.log(`\n── Files ──`);
422
+ console.log(` A: ${result.files.totalA} files | B: ${result.files.totalB} files`);
423
+ if (result.files.shared.length > 0) {
424
+ const fileList = result.files.shared.length > 15
425
+ ? result.files.shared.slice(0, 15).join(', ') + ` (+${result.files.shared.length - 15} more)`
426
+ : result.files.shared.join(', ');
427
+ console.log(` Shared basenames (${result.files.shared.length}): ${fileList}`);
428
+ }
429
+ }
430
+
431
+ // Shared symbols — compact grouped format, sorted by sharedRefs
432
+ const sharedShown = paginateSection(result.shared, 'Shared symbols');
433
+ for (const s of sharedShown) {
434
+ const kinds = new Set([...s.inA.map(d => d.kind), ...s.inB.map(d => d.kind)]);
435
+ const kindStr = [...kinds].join('/');
436
+ const refTag = s.sharedRefs.length > 0 ? ` ${s.sharedRefs.length} shared refs` : '';
437
+ console.log(` ${s.name} [${kindStr}] A:${s.inA.length} def${s.inA.length > 1 ? 's' : ''} B:${s.inB.length} def${s.inB.length > 1 ? 's' : ''}${refTag}`);
438
+ for (const d of s.inA.slice(0, 2)) {
439
+ console.log(` A: ${d.file}:${d.line} ${d.signature}`);
440
+ }
441
+ if (s.inA.length > 2) console.log(` A: ... and ${s.inA.length - 2} more`);
442
+ for (const d of s.inB.slice(0, 2)) {
443
+ console.log(` B: ${d.file}:${d.line} ${d.signature}`);
444
+ }
445
+ if (s.inB.length > 2) console.log(` B: ... and ${s.inB.length - 2} more`);
446
+ if (s.sharedRefs.length > 0) {
447
+ console.log(` Shared refs: ${s.sharedRefs.slice(0, 10).join(', ')}${s.sharedRefs.length > 10 ? ` (+${s.sharedRefs.length - 10} more)` : ''}`);
448
+ }
449
+ }
450
+ if (result.shared.length > off + limit) {
451
+ console.log(` (use --offset ${off + limit} to see more)`);
452
+ }
453
+
454
+ // Only in A
455
+ const onlyAShown = paginateSection(result.onlyA, 'Only in A');
456
+ for (const s of onlyAShown) {
457
+ console.log(` [${s.kind}] ${s.file}:${s.line} ${s.signature}`);
458
+ }
459
+ if (result.onlyA.length > off + limit) {
460
+ console.log(` (use --offset ${off + limit} to see more)`);
461
+ }
462
+
463
+ // Only in B
464
+ const onlyBShown = paginateSection(result.onlyB, 'Only in B');
465
+ for (const s of onlyBShown) {
466
+ console.log(` [${s.kind}] ${s.file}:${s.line} ${s.signature}`);
467
+ }
468
+ if (result.onlyB.length > off + limit) {
469
+ console.log(` (use --offset ${off + limit} to see more)`);
470
+ }
471
+ return;
472
+ }
473
+
312
474
  // Outline command — standalone by default, needs CodeIndex for --annotate
313
475
  if (command === 'outline') {
314
476
  const filePath = flags._positional[0];
@@ -591,6 +753,36 @@ async function main() {
591
753
  break;
592
754
  }
593
755
 
756
+ case 'history': {
757
+ const symbol = flags._positional[0];
758
+ if (!symbol) { console.error('Usage: betterrank history <symbol> [--file path] [--patch]'); process.exit(1); }
759
+ const histLimit = flags.limit ? parseInt(flags.limit, 10) : 20;
760
+ const histOffset = flags.offset ? parseInt(flags.offset, 10) : 0;
761
+ const showPatch = flags.patch === true || flags.p === true;
762
+ const result = await idx.history({ symbol, file: normalizeFilePath(flags.file), offset: histOffset, limit: histLimit, patch: showPatch });
763
+ if (!result) {
764
+ console.log(`(symbol "${symbol}" not found)`);
765
+ } else if (result.error) {
766
+ console.error(result.error);
767
+ } else if (result.raw) {
768
+ // --patch mode: print git's full output
769
+ const def = result.definition;
770
+ console.log(`${def.name} (${def.file}:${def.lineStart}-${def.lineEnd})\n`);
771
+ console.log(result.raw);
772
+ } else {
773
+ const def = result.definition;
774
+ console.log(`${def.name} (${def.file}:${def.lineStart}-${def.lineEnd})\n`);
775
+ if (result.commits.length === 0) {
776
+ console.log('(no commits found)');
777
+ } else {
778
+ for (const line of result.commits) {
779
+ console.log(` ${line}`);
780
+ }
781
+ }
782
+ }
783
+ break;
784
+ }
785
+
594
786
  case 'trace': {
595
787
  const symbol = flags._positional[0];
596
788
  if (!symbol) { console.error('Usage: betterrank trace <symbol> [--depth N]'); process.exit(1); }
@@ -612,6 +804,27 @@ async function main() {
612
804
  break;
613
805
  }
614
806
 
807
+ case 'callees': {
808
+ const symbol = flags._positional[0];
809
+ if (!symbol) { console.error('Usage: betterrank callees <symbol> [--depth N]'); process.exit(1); }
810
+ const calleesDepth = flags.depth ? parseInt(flags.depth, 10) : 3;
811
+ const tree = await idx.callees({ symbol, file: normalizeFilePath(flags.file), depth: calleesDepth });
812
+ if (!tree) {
813
+ console.log(`(symbol "${symbol}" not found)`);
814
+ } else {
815
+ const printNode = (node, depth) => {
816
+ const indent = depth === 0 ? '' : ' '.repeat(depth) + '→ ';
817
+ const loc = `(${node.file}:${node.line || '?'})`;
818
+ console.log(`${indent}${node.name} ${loc}`);
819
+ for (const callee of node.callees) {
820
+ printNode(callee, depth + 1);
821
+ }
822
+ };
823
+ printNode(tree, 0);
824
+ }
825
+ break;
826
+ }
827
+
615
828
  case 'diff': {
616
829
  const result = await idx.diff({ ref: flags.ref || 'HEAD' });
617
830
  if (result.error) {
@@ -789,6 +1002,12 @@ async function main() {
789
1002
  break;
790
1003
  }
791
1004
 
1005
+ case 'similar': {
1006
+ console.error('The "similar" command has been replaced by "compare".');
1007
+ console.error('Usage: betterrank compare <pathA> <pathB>');
1008
+ process.exit(1);
1009
+ }
1010
+
792
1011
  case 'reindex': {
793
1012
  const t0 = Date.now();
794
1013
  const result = await idx.reindex();
package/src/compare.js ADDED
@@ -0,0 +1,288 @@
1
+ import { readFile, stat as fsStat } from 'fs/promises';
2
+ import { glob } from 'glob';
3
+ import { join, relative, basename } from 'path';
4
+ import { parseFile, SUPPORTED_EXTENSIONS } from './parser.js';
5
+
6
// Glob patterns excluded from directory scans: dependency folders, build
// output, VCS metadata, editor state and other non-source artifacts.
const IGNORE_PATTERNS = [
  // JavaScript package managers and generated bundles
  '**/node_modules/**',
  '**/.npm/**',
  '**/.yarn/**',
  '**/.pnp.*',
  '**/bower_components/**',
  '**/*.min.js',
  '**/*.bundle.js',
  '**/*.map',

  // Python caches, virtual environments and packaging output
  '**/__pycache__/**',
  '**/.venv/**',
  '**/venv/**',
  '**/env/**',
  '**/.env/**',
  '**/.virtualenvs/**',
  '**/site-packages/**',
  '**/*.egg-info/**',
  '**/.eggs/**',
  '**/dist/**',
  '**/build/**',

  // Version-control metadata
  '**/.git/**',
  '**/.svn/**',
  '**/.hg/**',

  // Vendored and scratch directories
  '**/vendor/**',
  '**/tmp/**',
  '**/temp/**',

  // Editor / OS files
  '**/.idea/**',
  '**/.vscode/**',
  '**/.DS_Store',

  // Xcode / CocoaPods
  '**/Pods/**',
  '**/DerivedData/**',
];
17
+
18
// Symbol names so generic that a cross-codebase match carries no signal.
// Membership is tested with Set#has, i.e. on the EXACT name only:
// `processEvent` is kept, a bare `process` is dropped.
const NOISE_NAMES = new Set([
  // Ultra-common function names
  'get', 'set', 'run', 'main',
  'init', 'setup', 'start', 'stop',
  'open', 'close', 'read', 'write',
  'delete', 'update', 'create',
  'add', 'remove', 'clear', 'reset',
  'test', 'check', 'load',
  'toString', 'toJSON', 'valueOf',
  'hash', 'eq', 'repr', 'str',
  'copy', 'keys', 'values', 'items',
  'pop', 'push', 'append',
  'default', 'setdefault',
  'apply', 'call', 'bind',
  'map', 'filter', 'reduce',
  'format', 'parse', 'validate',
  'serialize', 'deserialize',
  'configure', 'connect',

  // Python dunders
  '__init__', '__repr__', '__str__',
  '__eq__', '__hash__',
  '__enter__', '__exit__',
  '__iter__', '__next__', '__len__',
  '__getitem__', '__setitem__', '__delitem__',
  '__contains__',
  '__call__', '__bool__',
  '__getattr__', '__setattr__', '__delattr__',
  '__get__', '__set__', '__delete__',

  // JS common
  'constructor', 'render', 'process', 'handle', 'execute',

  // Single-char and trivially short names
  'a', 'b', 'c', 'd', 'e', 'f',
  'x', 'y', 'n', 'i', 'j', 'k',

  // Common test fixture names
  'foo', 'bar', 'baz',
  'wrapper', 'decorator', 'callback',
  'index', 'app', 'client', 'response', 'request',
]);
45
+
46
/**
 * Find every file under `dirPath` whose extension is in SUPPORTED_EXTENSIONS
 * (minus IGNORE_PATTERNS) and run each one through `parseFile`.
 *
 * @param {string} dirPath - Directory to scan; file paths handed to
 *   `parseFile` are relative to it.
 * @returns {Promise<object[]>} The truthy `parseFile` results, in sorted
 *   path order.
 */
async function scanAndParse(dirPath) {
  const pattern = `**/*{${SUPPORTED_EXTENSIONS.join(',')}}`;
  const files = await glob(pattern, {
    cwd: dirPath,
    ignore: IGNORE_PATTERNS,
    absolute: true,
    nodir: true,
  });

  // glob yields matches in directory-walk order, which is not guaranteed to
  // be stable across platforms/filesystems. Everything downstream (dedup
  // winner, order of definitions per name) depends on this order, so pin it.
  files.sort();

  const results = [];
  for (const absPath of files) {
    const relPath = relative(dirPath, absPath);
    try {
      const source = await readFile(absPath, 'utf-8');
      const result = parseFile(relPath, source);
      if (result) results.push(result);
    } catch {
      // Best-effort scan: a file that cannot be read or parsed is skipped
      // rather than aborting the whole comparison.
    }
  }
  return results;
}
68
+
69
/**
 * Parse one file and wrap the outcome in an array.
 *
 * The file is labelled by its basename only (not the full path) when passed
 * to `parseFile`.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {Promise<object[]>} `[parseResult]`, or `[]` when `parseFile`
 *   returns a falsy value. Read errors propagate to the caller.
 */
async function parseSingleFile(filePath) {
  const contents = await readFile(filePath, 'utf-8');
  const parsed = parseFile(basename(filePath), contents);
  if (!parsed) {
    return [];
  }
  return [parsed];
}
74
+
75
/**
 * Flatten per-file parse results into one list of symbol records.
 *
 * @param {object[]} parseResults - Objects with a `file` label and a
 *   `definitions` array. A result without `definitions` contributes nothing
 *   instead of throwing.
 * @returns {object[]} One record per definition with `name`, `kind`, `file`,
 *   `lineStart`, `lineEnd`, `signature`, `paramCount`, `paramNames`,
 *   `localRefs` and `bodyLines`.
 */
function extractSymbols(parseResults) {
  const symbols = [];
  for (const fileResult of parseResults) {
    for (const def of fileResult.definitions || []) {
      // Resolve the fallback once so paramCount and paramNames always agree.
      const paramNames = def.paramNames || [];
      symbols.push({
        name: def.name,
        kind: def.kind,
        file: fileResult.file,
        lineStart: def.lineStart,
        lineEnd: def.lineEnd,
        signature: def.signature,
        paramCount: paramNames.length,
        paramNames,
        localRefs: def.localRefs || [],
        // lineEnd - lineStart (a one-line definition yields 0); missing
        // bounds are treated as 0.
        bodyLines: (def.lineEnd || 0) - (def.lineStart || 0),
      });
    }
  }
  return symbols;
}
95
+
96
// Test file detection.
// Directory names that mark everything beneath them as test code. Matched
// against whole path segments, so `latest/` or `contest/` do not count.
const TEST_SEGMENTS = new Set(['test', 'tests', '__tests__', 'spec', 'specs']);

// Basename conventions: test_foo.py, conftest.py, foo_test.go, foo.test.js,
// foo.spec.ts, foo-test.js.
const TEST_BASENAME_RE = /^(?:test_|conftest)|[._-](?:test|spec)\.[^.]+$/;

/**
 * Decide whether a path looks like test code.
 *
 * @param {string} file - File path using `/` or `\` separators.
 * @returns {boolean} true when any directory segment is a known test
 *   directory name, or the basename follows a test naming convention.
 *   Matching is case-insensitive.
 */
function isTestFile(file) {
  // Split on both separators: path.relative() yields `\` on Windows.
  const segments = file.toLowerCase().split(/[\\/]+/).filter(Boolean);
  const base = segments.pop() || '';
  if (segments.some((dir) => TEST_SEGMENTS.has(dir))) return true;
  return TEST_BASENAME_RE.test(base);
}
102
+
103
// Basenames too generic to count as meaningful file-level overlap.
// Hoisted to module scope so the set is built once, not on every call.
const NOISE_BASENAMES = new Set([
  '__init__.py', 'conftest.py', 'conf.py', 'setup.py', 'setup.cfg',
  'index.js', 'index.ts', 'index.tsx', 'main.py', 'main.go', 'main.rs',
  'app.py', 'app.js', 'app.ts', 'mod.rs', 'lib.rs',
  'utils.py', 'utils.js', 'utils.ts', 'helpers.py', 'helpers.js',
  'types.ts', 'types.py', 'config.py', 'config.js', 'config.ts',
  'constants.py', 'constants.js', 'constants.ts',
]);

// Stat a path; on failure throw the user-facing error, keeping the
// underlying fs error reachable via `cause`.
async function statComparePath(path, label) {
  try {
    return await fsStat(path);
  } catch (err) {
    throw new Error(`Path ${label} does not exist: ${path}`, { cause: err });
  }
}

// Drop repeated name+file+line entries (overlapping tree-sitter captures), keeping the first.
function dedupeSymbols(symbols) {
  const seen = new Set();
  return symbols.filter((s) => {
    const key = `${s.name}::${s.file}::${s.lineStart}`;
    if (seen.has(key)) return false;
    seen.add(key);
    return true;
  });
}

// Bucket symbols by name, preserving first-seen order of names and definitions.
function groupSymbolsByName(symbols) {
  const groups = new Map();
  for (const s of symbols) {
    const bucket = groups.get(s.name);
    if (bucket) bucket.push(s);
    else groups.set(s.name, [s]);
  }
  return groups;
}

// Sort order for one-sided lists: public names first, `_private` last, alphabetical within each group.
function comparePrivateLast(a, b) {
  const aPrivate = a.name.startsWith('_');
  const bPrivate = b.name.startsWith('_');
  if (aPrivate !== bPrivate) return aPrivate ? 1 : -1;
  return a.name.localeCompare(b.name);
}

// List every definition whose name appears in `own` but not in `other`, sorted with comparePrivateLast.
function collectOnlyIn(own, other) {
  const only = [];
  for (const [name, syms] of own) {
    if (other.has(name)) continue;
    for (const s of syms) {
      only.push({ name, kind: s.kind, file: s.file, line: s.lineStart, signature: s.signature });
    }
  }
  return only.sort(comparePrivateLast);
}

// Reduce a symbol record to the fields reported for a shared definition.
function toSharedDefinition(s) {
  return {
    kind: s.kind,
    file: s.file,
    line: s.lineStart,
    signature: s.signature,
    paramCount: s.paramCount,
    bodyLines: s.bodyLines,
  };
}

// Union of localRefs across all definitions of one name.
function collectLocalRefs(symbols) {
  const refs = new Set();
  for (const s of symbols) for (const r of s.localRefs) refs.add(r);
  return refs;
}

/**
 * Compare two codebases (files or directories).
 *
 * Returns deterministic structural facts grouped by symbol name.
 * Symbols whose exact name is in NOISE_NAMES are always dropped; unless
 * `includeTests` is set, symbols named `test_*` / `Test*` and symbols
 * defined in test files are dropped as well.
 *
 * Shared symbols are ranked by sharedRefs count (how many local refs the
 * two sides have in common), then same-kind matches, then same-param-count
 * matches, then alphabetically.
 *
 * @param {string} pathA - File or directory for side A.
 * @param {string} pathB - File or directory for side B.
 * @param {object} [opts]
 * @param {string} [opts.kind] - Keep only symbols of this kind.
 * @param {boolean} [opts.includeTests=false] - Keep test symbols/files.
 * @returns {Promise<object>} `{ labelA, labelB, isDirectoryMode, shared,
 *   onlyA, onlyB, files, summary }`.
 * @throws {Error} `Path A does not exist: …` / `Path B does not exist: …`
 *   when a path cannot be stat'ed.
 */
async function compare(pathA, pathB, { kind, includeTests = false } = {}) {
  // Validate paths exist (A first, so A's error wins when both are missing)
  const statA = await statComparePath(pathA, 'A');
  const statB = await statComparePath(pathB, 'B');
  const isDirA = statA.isDirectory();
  const isDirB = statB.isDirectory();

  const parseResultsA = isDirA ? await scanAndParse(pathA) : await parseSingleFile(pathA);
  const parseResultsB = isDirB ? await scanAndParse(pathB) : await parseSingleFile(pathB);

  // Noise names are always filtered; test symbols/files only when not explicitly included
  const isSignificant = (s) => {
    if (NOISE_NAMES.has(s.name)) return false;
    if (includeTests) return true;
    return !(s.name.startsWith('test_') || s.name.startsWith('Test') || isTestFile(s.file));
  };

  // Same pipeline for both sides: extract → kind filter → noise filter → dedup
  const prepareSymbols = (parseResults) => {
    let symbols = extractSymbols(parseResults);
    if (kind) symbols = symbols.filter((s) => s.kind === kind);
    return dedupeSymbols(symbols.filter(isSignificant));
  };
  const symbolsA = prepareSymbols(parseResultsA);
  const symbolsB = prepareSymbols(parseResultsB);

  const byNameA = groupSymbolsByName(symbolsA);
  const byNameB = groupSymbolsByName(symbolsB);

  // Shared: names that exist in both. Group all definitions under one entry.
  const shared = [];
  for (const [name, symsA] of byNameA) {
    const symsB = byNameB.get(name);
    if (!symsB) continue;

    const refsB = collectLocalRefs(symsB);
    const sharedRefs = [...collectLocalRefs(symsA)].filter((r) => refsB.has(r));

    shared.push({
      name,
      inA: symsA.map(toSharedDefinition),
      inB: symsB.map(toSharedDefinition),
      sharedRefs,
      // True when ANY A/B pair of definitions agrees on kind / param count
      sameKind: symsA.some((a) => symsB.some((b) => a.kind === b.kind)),
      sameParamCount: symsA.some((a) => symsB.some((b) => a.paramCount === b.paramCount)),
    });
  }

  // Sort shared by sharedRefs count (strongest consolidation signal),
  // then by whether kind/params match, then alphabetically
  shared.sort((a, b) => {
    const refDiff = b.sharedRefs.length - a.sharedRefs.length;
    if (refDiff !== 0) return refDiff;
    if (a.sameKind !== b.sameKind) return a.sameKind ? -1 : 1;
    if (a.sameParamCount !== b.sameParamCount) return a.sameParamCount ? -1 : 1;
    return a.name.localeCompare(b.name);
  });

  const onlyA = collectOnlyIn(byNameA, byNameB);
  const onlyB = collectOnlyIn(byNameB, byNameA);

  // File-level comparison (basename matching); generic basenames are
  // excluded from the shared list only.
  const filesA = parseResultsA.map((r) => r.file);
  const filesB = parseResultsB.map((r) => r.file);
  const basenamesA = new Set(filesA.map((f) => basename(f)));
  const basenamesB = new Set(filesB.map((f) => basename(f)));

  return {
    labelA: isDirA ? pathA : basename(pathA),
    labelB: isDirB ? pathB : basename(pathB),
    isDirectoryMode: isDirA || isDirB,
    shared,
    onlyA,
    onlyB,
    files: {
      shared: [...basenamesA].filter((f) => basenamesB.has(f) && !NOISE_BASENAMES.has(f)),
      onlyA: [...basenamesA].filter((f) => !basenamesB.has(f)),
      onlyB: [...basenamesB].filter((f) => !basenamesA.has(f)),
      totalA: filesA.length,
      totalB: filesB.length,
    },
    summary: {
      totalA: symbolsA.length,
      totalB: symbolsB.length,
      sharedNames: shared.length,
      onlyACount: onlyA.length,
      onlyBCount: onlyB.length,
    },
  };
}
287
+
288
+ export { compare };
package/src/graph.js CHANGED
@@ -61,6 +61,9 @@ function buildGraph(allSymbols) {
61
61
  lineStart: def.lineStart,
62
62
  lineEnd: def.lineEnd,
63
63
  signature: def.signature,
64
+ astProfile: def.astProfile || null,
65
+ paramNames: def.paramNames || null,
66
+ localRefs: def.localRefs || null,
64
67
  });
65
68
  graph.addEdge(file, symbolKey, { type: 'DEFINES' });
66
69
  }
@@ -148,6 +151,9 @@ function updateGraphFiles(graph, removedFiles, newSymbols) {
148
151
  lineStart: def.lineStart,
149
152
  lineEnd: def.lineEnd,
150
153
  signature: def.signature,
154
+ astProfile: def.astProfile || null,
155
+ paramNames: def.paramNames || null,
156
+ localRefs: def.localRefs || null,
151
157
  });
152
158
  graph.addEdge(file, symbolKey, { type: 'DEFINES' });
153
159
 
package/src/index.js CHANGED
@@ -4,6 +4,73 @@ import { CodeIndexCache } from './cache.js';
4
4
  import { rankedSymbols } from './graph.js';
5
5
  import { parseFile } from './parser.js';
6
6
 
7
/**
 * Collapse unchanged context lines in git log -L diff output.
 *
 * For each "commit " section, the commit header and the diff headers up to
 * the first "@@" line are kept verbatim. After that, only +/- lines and
 * `ctx` lines of context around them are kept; every run of hidden lines is
 * replaced by a single " ..." marker.
 *
 * Structural lines are never hidden: later "@@" hunk headers and
 * "\ No newline at end of file" markers always survive. Blank lines that
 * trail a section (the separator before the next commit) are not diff
 * content, so they are passed through untouched instead of being turned
 * into a spurious " ..." marker.
 *
 * @param {string} raw - Raw `git log -L` output.
 * @param {number} [ctx=2] - Context lines to keep on each side of a change.
 * @returns {string} The output with long unchanged runs collapsed.
 */
function _collapseDiffContext(raw, ctx = 2) {
  const output = [];
  // Split into per-commit sections (each starts with "commit ")
  const sections = raw.split(/^(?=commit )/m);

  for (const section of sections) {
    if (!section.trim()) continue;
    const lines = section.split('\n');

    const diffStart = lines.findIndex((l) => l.startsWith('diff --git'));
    if (diffStart === -1) {
      // No diff in this section — pass it through unchanged
      output.push(section);
      continue;
    }

    // Keep the commit header (everything before "diff --git")
    output.push(lines.slice(0, diffStart).join('\n'));

    const diffLines = lines.slice(diffStart);
    const hunkStart = diffLines.findIndex((l) => l.startsWith('@@'));
    if (hunkStart === -1) {
      output.push(diffLines.join('\n'));
      continue;
    }

    // Keep diff headers (diff --git, ---, +++, first @@)
    output.push(diffLines.slice(0, hunkStart + 1).join('\n'));

    const content = diffLines.slice(hunkStart + 1);

    // Peel off trailing empty strings: they come from the newline(s) that
    // separate this commit from the next, not from the diff itself.
    let bodyEnd = content.length;
    while (bodyEnd > 0 && content[bodyEnd - 1] === '') bodyEnd--;
    const body = content.slice(0, bodyEnd);
    const trailingBlanks = content.slice(bodyEnd);

    // Mark which lines must be shown
    const show = new Array(body.length).fill(false);
    for (let i = 0; i < body.length; i++) {
      const line = body[i];
      if (line.startsWith('@@') || line.startsWith('\\')) {
        // Hunk header / "\ No newline" marker: always keep, no context added
        show[i] = true;
        continue;
      }
      if (line.startsWith('+') || line.startsWith('-')) {
        const from = Math.max(0, i - ctx);
        const to = Math.min(body.length - 1, i + ctx);
        for (let j = from; j <= to; j++) show[j] = true;
      }
    }

    // Build collapsed output: one marker per run of hidden lines
    const collapsed = [];
    let inEllipsis = false;
    for (let i = 0; i < body.length; i++) {
      if (show[i]) {
        inEllipsis = false;
        collapsed.push(body[i]);
      } else if (!inEllipsis) {
        inEllipsis = true;
        collapsed.push(' ...');
      }
    }
    output.push([...collapsed, ...trailingBlanks].join('\n'));
  }

  return output.join('\n');
}
73
+
7
74
  // ── Orphan false-positive filters ──────────────────────────────────────────
8
75
  //
9
76
  // Orphan detection finds files/symbols with no cross-file connections.
@@ -1157,6 +1224,63 @@ class CodeIndex {
1157
1224
  };
1158
1225
  }
1159
1226
 
1227
+ /**
1228
+ * Git history of a specific function, using its tree-sitter line range.
1229
+ *
1230
+ * @param {object} opts
1231
+ * @param {string} opts.symbol - Symbol name
1232
+ * @param {string} [opts.file] - Disambiguate by file
1233
+ * @param {number} [opts.offset=0] - Skip first N commits
1234
+ * @param {number} [opts.limit=20] - Max commits to show
1235
+ * @param {boolean} [opts.patch=false] - Include function-scoped diffs
1236
+ * @returns {object|null} { definition, commits, raw? }
1237
+ */
1238
+ async history({ symbol, file, offset = 0, limit = 20, patch = false }) {
1239
+ await this._ensureReady();
1240
+ const graph = this.cache.getGraph();
1241
+ if (!graph) return null;
1242
+
1243
+ const candidates = [];
1244
+ graph.forEachNode((node, attrs) => {
1245
+ if (attrs.type !== 'symbol') return;
1246
+ if (attrs.name !== symbol) return;
1247
+ if (file && attrs.file !== file) return;
1248
+ candidates.push(attrs);
1249
+ });
1250
+ if (candidates.length === 0) return null;
1251
+
1252
+ const ranked = this._getRanked();
1253
+ const scoreMap = new Map(ranked);
1254
+ candidates.sort((a, b) => {
1255
+ const aKey = `${a.file}::${a.name}`;
1256
+ const bKey = `${b.file}::${b.name}`;
1257
+ return (scoreMap.get(bKey) || 0) - (scoreMap.get(aKey) || 0);
1258
+ });
1259
+ const target = candidates[0];
1260
+
1261
+ const { execSync } = await import('child_process');
1262
+ try {
1263
+ if (patch) {
1264
+ const raw = execSync(
1265
+ `git log -L ${target.lineStart},${target.lineEnd}:${target.file} --skip=${offset} -n ${limit}`,
1266
+ { cwd: this.projectRoot, encoding: 'utf-8', stdio: ['pipe', 'pipe', 'pipe'], timeout: 30000 }
1267
+ ).trim();
1268
+ // Collapse unchanged context lines in diffs — keep 2 lines around changes
1269
+ const collapsed = _collapseDiffContext(raw, 2);
1270
+ return { definition: target, commits: [], raw: collapsed };
1271
+ }
1272
+ // Summary only
1273
+ const output = execSync(
1274
+ `git log -L ${target.lineStart},${target.lineEnd}:${target.file} --no-patch --format="%h %ad %s" --date=short --skip=${offset} -n ${limit}`,
1275
+ { cwd: this.projectRoot, encoding: 'utf-8', stdio: ['pipe', 'pipe', 'pipe'], timeout: 30000 }
1276
+ ).trim();
1277
+ const commits = output ? output.split('\n').filter(Boolean) : [];
1278
+ return { definition: target, commits };
1279
+ } catch {
1280
+ return { definition: target, commits: [], error: 'git log failed — is this a git repo?' };
1281
+ }
1282
+ }
1283
+
1160
1284
  /**
1161
1285
  * Recursive caller chain — walk UP the call graph from a symbol.
1162
1286
  * At each hop, resolves which function in the caller file contains
@@ -1291,6 +1415,133 @@ class CodeIndex {
1291
1415
  return buildNode(rootAttrs.name, rootAttrs.file, rootAttrs.lineStart, 0);
1292
1416
  }
1293
1417
 
1418
+ /**
1419
+ * Recursive callee chain — walk DOWN the call graph.
1420
+ * Mirror of trace(): shows what a function calls, transitively.
1421
+ *
1422
+ * @param {string} opts.symbol - Symbol name
1423
+ * @param {string} [opts.file] - Disambiguate by file
1424
+ * @param {number} [opts.depth=3] - Max hops downward
1425
+ * @returns {object} Tree root node with .callees[]
1426
+ */
1427
+ async callees({ symbol, file, depth = 3 }) {
1428
+ await this._ensureReady();
1429
+ const graph = this.cache.getGraph();
1430
+ if (!graph) return null;
1431
+
1432
+ // Find the target symbol node(s)
1433
+ const targetKeys = [];
1434
+ graph.forEachNode((node, attrs) => {
1435
+ if (attrs.type !== 'symbol') return;
1436
+ if (attrs.name !== symbol) return;
1437
+ if (file && attrs.file !== file) return;
1438
+ targetKeys.push(node);
1439
+ });
1440
+
1441
+ if (targetKeys.length === 0) return null;
1442
+
1443
+ // Use the first match (highest PageRank if multiple)
1444
+ const ranked = this._getRanked();
1445
+ const scoreMap = new Map(ranked);
1446
+ targetKeys.sort((a, b) => (scoreMap.get(b) || 0) - (scoreMap.get(a) || 0));
1447
+ const rootKey = targetKeys[0];
1448
+ const rootAttrs = graph.getNodeAttributes(rootKey);
1449
+
1450
+ // Build a map: symbolKey -> set of symbol keys it references (outgoing REFERENCES)
1451
+ // For each file node that has an outgoing REFERENCES edge to a symbol,
1452
+ // we need to resolve which function in that file makes the call.
1453
+ // Approach: for each symbol, find what other symbols it references
1454
+ // by looking at REFERENCES edges from the symbol's file to other symbols,
1455
+ // then filtering to references that occur within the symbol's line range.
1456
+
1457
+ // Cache of file -> definitions
1458
+ const defCache = new Map();
1459
+ const getFileDefs = async (filePath) => {
1460
+ if (defCache.has(filePath)) return defCache.get(filePath);
1461
+ try {
1462
+ const absPath = join(this.projectRoot, filePath);
1463
+ const source = await readFile(absPath, 'utf-8');
1464
+ const parsed = parseFile(filePath, source);
1465
+ const defs = parsed ? parsed.definitions.sort((a, b) => a.lineStart - b.lineStart) : [];
1466
+ defCache.set(filePath, defs);
1467
+ return defs;
1468
+ } catch {
1469
+ defCache.set(filePath, []);
1470
+ return [];
1471
+ }
1472
+ };
1473
+
1474
+ const visited = new Set();
1475
+
1476
+ const buildNode = async (symbolName, symbolFile, symbolLine, currentDepth) => {
1477
+ const nodeKey = `${symbolFile}::${symbolName}`;
1478
+ const node = { name: symbolName, file: symbolFile, line: symbolLine, callees: [] };
1479
+
1480
+ if (currentDepth >= depth) return node;
1481
+ if (visited.has(nodeKey)) return node;
1482
+ visited.add(nodeKey);
1483
+
1484
+ // Find the definition's line range so we know which references belong to it
1485
+ const defs = await getFileDefs(symbolFile);
1486
+ const thisDef = defs.find(d => d.name === symbolName && d.lineStart === symbolLine)
1487
+ || defs.find(d => d.name === symbolName);
1488
+ if (!thisDef) return node;
1489
+
1490
+ // Get the source to find call sites within this function's body
1491
+ let sourceLines;
1492
+ try {
1493
+ const absPath = join(this.projectRoot, symbolFile);
1494
+ const source = await readFile(absPath, 'utf-8');
1495
+ sourceLines = source.split('\n');
1496
+ } catch {
1497
+ return node;
1498
+ }
1499
+
1500
+ // Find the file node in the graph for this symbol's file
1501
+ const fileNodeKey = symbolFile;
1502
+
1503
+ // Collect all symbols that this file references (outgoing REFERENCES from file node)
1504
+ const referencedSymbols = new Map(); // name -> {file, line, name}
1505
+ if (graph.hasNode(fileNodeKey)) {
1506
+ graph.forEachOutEdge(fileNodeKey, (_edge, attrs, _source, target) => {
1507
+ if (attrs.type !== 'REFERENCES') return;
1508
+ const targetAttrs = graph.getNodeAttributes(target);
1509
+ if (targetAttrs.type !== 'symbol') return;
1510
+ // Skip self-references
1511
+ if (targetAttrs.name === symbolName && targetAttrs.file === symbolFile) return;
1512
+ referencedSymbols.set(`${targetAttrs.file}::${targetAttrs.name}`, {
1513
+ name: targetAttrs.name,
1514
+ file: targetAttrs.file,
1515
+ line: targetAttrs.lineStart
1516
+ });
1517
+ });
1518
+ }
1519
+
1520
+ // Filter to references that appear within this function's line range
1521
+ for (const [key, ref] of referencedSymbols) {
1522
+ const callPattern = new RegExp(
1523
+ `(?<![a-zA-Z0-9_])${ref.name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}\\s*\\(`
1524
+ );
1525
+
1526
+ let found = false;
1527
+ for (let i = thisDef.lineStart - 1; i < Math.min(thisDef.lineEnd, sourceLines.length); i++) {
1528
+ if (callPattern.test(sourceLines[i])) { found = true; break; }
1529
+ }
1530
+
1531
+ if (found) {
1532
+ const calleeNode = await buildNode(ref.name, ref.file, ref.line, currentDepth + 1);
1533
+ node.callees.push(calleeNode);
1534
+ }
1535
+ }
1536
+
1537
+ // Sort callees by file then name for deterministic output
1538
+ node.callees.sort((a, b) => a.file.localeCompare(b.file) || a.name.localeCompare(b.name));
1539
+ return node;
1540
+ };
1541
+
1542
+ return buildNode(rootAttrs.name, rootAttrs.file, rootAttrs.lineStart, 0);
1543
+ }
1544
+
1294
1545
  /**
1295
1546
  * Git-aware blast radius — what symbols changed and who calls them.
1296
1547
  *
@@ -1435,6 +1686,273 @@ class CodeIndex {
1435
1686
  return files.size;
1436
1687
  }
1437
1688
 
1689
+ /**
1690
+ * Find structurally similar functions/classes across the codebase.
1691
+ *
1692
+ * Similarity is computed from multiple signals:
1693
+ * - AST shape profile (node-type frequency vector) — captures structural patterns
1694
+ * - Reference overlap — functions that call the same things do similar work
1695
+ * - Parameter name overlap — shared param names suggest shared purpose
1696
+ * - Name similarity — tokenized name overlap (camelCase/snake_case aware)
1697
+ *
1698
+ * @param {object} opts
1699
+ * @param {string} [opts.symbol] - Find symbols similar to this one
1700
+ * @param {string} [opts.file] - Disambiguate symbol by file, or find similar symbols across this file
1701
+ * @param {string} [opts.kind] - Filter candidates to this kind (function, class, type)
1702
+ * @param {number} [opts.threshold=0.4] - Minimum similarity score (0-1)
1703
+ * @param {number} [opts.offset] - Skip first N results
1704
+ * @param {number} [opts.limit=20] - Max results to return
1705
+ * @param {boolean} [opts.count=false] - If true, return only { total }
1706
+ * @returns {Array<{symbol, file, line, signature, score, breakdown}>|{total: number}}
1707
+ */
1708
+ async similar({ symbol, file, kind, threshold = 0.4, offset, limit = 20, count = false } = {}) {
1709
+ await this._ensureReady();
1710
+ const graph = this.cache.getGraph();
1711
+ if (!graph) return count ? { total: 0 } : [];
1712
+
1713
+ // Collect all symbol nodes with their attributes
1714
+ // Filter out trivial symbols that match everything due to lack of structure
1715
+ const allSymbols = [];
1716
+ graph.forEachNode((key, attrs) => {
1717
+ if (attrs.type !== 'symbol') return;
1718
+ if (kind && attrs.kind !== kind) return;
1719
+ allSymbols.push({ key, ...attrs });
1720
+ });
1721
+
1722
+ // Identify which symbols are "non-trivial" (enough structure to be meaningful)
1723
+ const isNonTrivial = (attrs) => {
1724
+ const profile = attrs.astProfile;
1725
+ if (!profile) return false;
1726
+ const bodyLines = (attrs.lineEnd || 0) - (attrs.lineStart || 0);
1727
+ const structuralNodes = Object.entries(profile)
1728
+ .filter(([k]) => k !== '_totalNodes')
1729
+ .reduce((sum, [, v]) => sum + v, 0);
1730
+ // At least 3 structural nodes (ifs, calls, returns, etc.) or 5+ lines
1731
+ return structuralNodes >= 3 || bodyLines >= 5;
1732
+ };
1733
+
1734
+ // Build reference sets per symbol from localRefs (per-function scoped refs from parser)
1735
+ const refSets = new Map();
1736
+ for (const sym of allSymbols) {
1737
+ refSets.set(sym.key, new Set(sym.localRefs || []));
1738
+ }
1739
+
1740
+ // Compute IDF weights for AST node types (rare types are more discriminative)
1741
+ const nodeTypeDocFreq = new Map(); // nodeType -> count of symbols that have it
1742
+ for (const sym of allSymbols) {
1743
+ if (!sym.astProfile) continue;
1744
+ for (const [k, v] of Object.entries(sym.astProfile)) {
1745
+ if (k === '_totalNodes' || v === 0) continue;
1746
+ nodeTypeDocFreq.set(k, (nodeTypeDocFreq.get(k) || 0) + 1);
1747
+ }
1748
+ }
1749
+ const totalDocs = allSymbols.length;
1750
+ const idfWeights = new Map();
1751
+ for (const [nodeType, docFreq] of nodeTypeDocFreq) {
1752
+ // IDF: log(N / df) — rare types get higher weight
1753
+ idfWeights.set(nodeType, Math.log(totalDocs / docFreq));
1754
+ }
1755
+
1756
+ // Find the target symbol(s) to compare against
1757
+ let targets;
1758
+ if (symbol) {
1759
+ targets = allSymbols.filter(s => {
1760
+ if (s.name !== symbol) return false;
1761
+ if (file && s.file !== file) return false;
1762
+ return true;
1763
+ });
1764
+ if (targets.length === 0) return count ? { total: 0 } : [];
1765
+ } else if (file) {
1766
+ // Compare all symbols in this file against the rest
1767
+ targets = allSymbols.filter(s => s.file === file);
1768
+ if (targets.length === 0) return count ? { total: 0 } : [];
1769
+ } else {
1770
+ return count ? { total: 0 } : [];
1771
+ }
1772
+
1773
+ // Compute similarity for each candidate against each target
1774
+ const results = [];
1775
+ for (const candidate of allSymbols) {
1776
+ // Skip self-matches
1777
+ if (targets.some(t => t.key === candidate.key)) continue;
1778
+ // Skip trivial candidates — they match everything and are noise
1779
+ if (!isNonTrivial(candidate)) continue;
1780
+
1781
+ let bestScore = 0;
1782
+ let bestBreakdown = null;
1783
+ let bestTarget = null;
1784
+
1785
+ for (const target of targets) {
1786
+ const breakdown = this._computeSimilarity(target, candidate, refSets, idfWeights);
1787
+ if (breakdown.total > bestScore) {
1788
+ bestScore = breakdown.total;
1789
+ bestBreakdown = breakdown;
1790
+ bestTarget = target;
1791
+ }
1792
+ }
1793
+
1794
+ if (bestScore >= threshold) {
1795
+ results.push({
1796
+ symbol: candidate.name,
1797
+ file: candidate.file,
1798
+ line: candidate.lineStart,
1799
+ signature: candidate.signature,
1800
+ score: Math.round(bestScore * 100) / 100,
1801
+ matchedWith: bestTarget ? `${bestTarget.file}::${bestTarget.name}` : null,
1802
+ breakdown: bestBreakdown,
1803
+ });
1804
+ }
1805
+ }
1806
+
1807
+ // Sort by score descending
1808
+ results.sort((a, b) => b.score - a.score);
1809
+
1810
+ if (count) return { total: results.length };
1811
+ return paginate(results, { offset, limit }).items;
1812
+ }
1813
+
1814
+ /**
1815
+ * Compute multi-signal similarity between two symbols.
1816
+ * Returns { total, astShape, refOverlap, paramOverlap, nameScore }
1817
+ */
1818
+ _computeSimilarity(a, b, refSets, idfWeights = null) {
1819
+ const astShape = this._astProfileSimilarity(a.astProfile, b.astProfile, idfWeights);
1820
+ const refOverlap = this._setOverlap(refSets.get(a.key), refSets.get(b.key), 2);
1821
+ const paramOverlap = this._paramSimilarity(a.paramNames, b.paramNames);
1822
+ const nameScore = this._nameSimilarity(a.name, b.name);
1823
+
1824
+ // Weighted combination — AST shape is most important, refs second
1825
+ const total = (
1826
+ astShape * 0.40 +
1827
+ refOverlap * 0.30 +
1828
+ paramOverlap * 0.15 +
1829
+ nameScore * 0.15
1830
+ );
1831
+
1832
+ return {
1833
+ total: Math.round(total * 100) / 100,
1834
+ astShape: Math.round(astShape * 100) / 100,
1835
+ refOverlap: Math.round(refOverlap * 100) / 100,
1836
+ paramOverlap: Math.round(paramOverlap * 100) / 100,
1837
+ nameScore: Math.round(nameScore * 100) / 100,
1838
+ };
1839
+ }
1840
+
1841
+ /**
1842
+ * Cosine similarity between two AST profile vectors.
1843
+ * Ignores _totalNodes (used separately for size gating).
1844
+ */
1845
+ _astProfileSimilarity(a, b, idfWeights = null) {
1846
+ if (!a || !b) return 0;
1847
+
1848
+ // Collect all keys (excluding _totalNodes)
1849
+ const keys = new Set([
1850
+ ...Object.keys(a).filter(k => k !== '_totalNodes'),
1851
+ ...Object.keys(b).filter(k => k !== '_totalNodes'),
1852
+ ]);
1853
+ if (keys.size === 0) return 0;
1854
+
1855
+ // Size ratio penalty: very different sized functions get dampened
1856
+ const sizeA = a._totalNodes || 1;
1857
+ const sizeB = b._totalNodes || 1;
1858
+ const sizeRatio = Math.min(sizeA, sizeB) / Math.max(sizeA, sizeB);
1859
+ // Only penalize extreme size differences (>10x)
1860
+ const sizePenalty = sizeRatio < 0.1 ? sizeRatio * 2 : 1;
1861
+
1862
+ // If both profiles have very few distinct node types, similarity is unreliable.
1863
+ // Two functions both having {call_expression: 1, return_statement: 1} is not meaningful.
1864
+ const distinctA = Object.keys(a).filter(k => k !== '_totalNodes').length;
1865
+ const distinctB = Object.keys(b).filter(k => k !== '_totalNodes').length;
1866
+ const minDistinct = Math.min(distinctA, distinctB);
1867
+ // Penalize low-diversity profiles
1868
+ const diversityPenalty = minDistinct <= 2 ? 0.4 : minDistinct <= 3 ? 0.7 : 1.0;
1869
+
1870
+ // Normalize counts to proportions, then apply IDF weighting.
1871
+ // IDF makes rare node types (try_statement, list_comprehension) more
1872
+ // discriminative than ubiquitous ones (call, return_statement).
1873
+ const normalize = (profile) => {
1874
+ const total = Object.entries(profile)
1875
+ .filter(([k]) => k !== '_totalNodes')
1876
+ .reduce((sum, [, v]) => sum + v, 0) || 1;
1877
+ const result = {};
1878
+ for (const k of keys) {
1879
+ const proportion = (profile[k] || 0) / total;
1880
+ const idf = (idfWeights && idfWeights.has(k)) ? idfWeights.get(k) : 1;
1881
+ result[k] = proportion * idf;
1882
+ }
1883
+ return result;
1884
+ };
1885
+
1886
+ const na = normalize(a);
1887
+ const nb = normalize(b);
1888
+
1889
+ // Cosine similarity
1890
+ let dot = 0, magA = 0, magB = 0;
1891
+ for (const k of keys) {
1892
+ dot += na[k] * nb[k];
1893
+ magA += na[k] * na[k];
1894
+ magB += nb[k] * nb[k];
1895
+ }
1896
+ const denom = Math.sqrt(magA) * Math.sqrt(magB);
1897
+ const cosine = denom > 0 ? dot / denom : 0;
1898
+
1899
+ return cosine * sizePenalty * diversityPenalty;
1900
+ }
1901
+
1902
+ /**
1903
+ * Jaccard similarity between two sets.
1904
+ * @param {number} [minSize=0] - Minimum set size for overlap to count
1905
+ */
1906
+ _setOverlap(a, b, minSize = 0) {
1907
+ if (!a || !b || a.size === 0 || b.size === 0) return 0;
1908
+ if (a.size < minSize && b.size < minSize) return 0;
1909
+ let intersection = 0;
1910
+ for (const item of a) {
1911
+ if (b.has(item)) intersection++;
1912
+ }
1913
+ const union = a.size + b.size - intersection;
1914
+ return union > 0 ? intersection / union : 0;
1915
+ }
1916
+
1917
+ /**
1918
+ * Parameter name similarity: Jaccard overlap on lowercased param names.
1919
+ */
1920
+ _paramSimilarity(a, b) {
1921
+ if (!a || !b || a.length === 0 || b.length === 0) return 0;
1922
+ const setA = new Set(a.map(p => p.toLowerCase()));
1923
+ const setB = new Set(b.map(p => p.toLowerCase()));
1924
+ // Remove 'self', 'cls', 'this' — they're noise
1925
+ for (const noise of ['self', 'cls', 'this']) {
1926
+ setA.delete(noise);
1927
+ setB.delete(noise);
1928
+ }
1929
+ if (setA.size === 0 || setB.size === 0) return 0;
1930
+ return this._setOverlap(setA, setB);
1931
+ }
1932
+
1933
+ /**
1934
+ * Name similarity: tokenize camelCase/snake_case names, compute Jaccard overlap.
1935
+ */
1936
+ _nameSimilarity(a, b) {
1937
+ if (!a || !b) return 0;
1938
+ if (a === b) return 1;
1939
+
1940
+ const tokenize = (name) => {
1941
+ // Split on _ and camelCase boundaries, lowercase
1942
+ return name
1943
+ .replace(/([a-z])([A-Z])/g, '$1_$2')
1944
+ .toLowerCase()
1945
+ .split(/[_\s]+/)
1946
+ .filter(t => t.length > 1); // drop single-char tokens
1947
+ };
1948
+
1949
+ const tokA = new Set(tokenize(a));
1950
+ const tokB = new Set(tokenize(b));
1951
+ if (tokA.size === 0 || tokB.size === 0) return 0;
1952
+
1953
+ return this._setOverlap(tokA, tokB);
1954
+ }
1955
+
1438
1956
  /**
1439
1957
  * Force a full rebuild.
1440
1958
  */
package/src/parser.js CHANGED
@@ -246,6 +246,99 @@ const KIND_MAP = {
246
246
  decorated_definition: 'function',
247
247
  };
248
248
 
249
/**
 * Tree-sitter node types that are counted when building an AST shape profile.
 * The list mixes node names from several grammars and is intentionally
 * coarse — the goal is to capture "shape", not identity.
 */
const STRUCTURAL_NODE_TYPE_GROUPS = {
  controlFlow: [
    'if_statement', 'if_expression', 'elif_clause', 'else_clause',
    'for_statement', 'for_in_statement', 'for_expression',
    'while_statement', 'loop_expression',
    'match_statement', 'match_expression', 'switch_statement', 'case_clause',
    'try_statement', 'try_expression', 'except_clause', 'catch_clause', 'finally_clause',
    'with_statement',
  ],
  returnsAndYields: ['return_statement', 'yield', 'yield_expression', 'await_expression'],
  callsAndAccess: [
    'call_expression', 'call', 'method_call_expression',
    'member_expression', 'attribute', 'subscript_expression', 'subscript',
  ],
  assignments: ['assignment', 'assignment_expression', 'augmented_assignment'],
  dataStructures: [
    'list', 'list_comprehension', 'dictionary', 'dictionary_comprehension',
    'array', 'object', 'tuple',
  ],
  assertionsAndRaises: ['assert_statement', 'raise_statement', 'throw_statement'],
  booleanLogic: ['boolean_operator', 'binary_expression', 'comparison_operator', 'not_operator'],
  conditionals: ['conditional_expression', 'ternary_expression'],
  strings: ['string', 'f_string', 'template_string'],
  decorators: ['decorator'],
};

const STRUCTURAL_NODE_TYPES = new Set(Object.values(STRUCTURAL_NODE_TYPE_GROUPS).flat());
283
+
284
/**
 * Count structural node types in an AST subtree.
 *
 * Visits `node` and all of its named descendants in pre-order. Each visited
 * node whose type is in STRUCTURAL_NODE_TYPES increments that type's counter.
 *
 * @param {object} node - Root of the subtree (needs type, namedChildCount, namedChild(i))
 * @returns {object} Flat map like { if_statement: 3, call_expression: 7, ... }
 *   plus `_totalNodes`, the number of named nodes visited (root included)
 */
function buildAstProfile(node) {
  const counts = {};
  let visitedCount = 0;

  // Explicit stack instead of recursion; children are pushed last-to-first
  // so they are popped, and therefore counted, in source order.
  const pending = [node];
  while (pending.length > 0) {
    const current = pending.pop();
    visitedCount += 1;

    if (STRUCTURAL_NODE_TYPES.has(current.type)) {
      counts[current.type] = (counts[current.type] || 0) + 1;
    }

    for (let i = current.namedChildCount - 1; i >= 0; i--) {
      pending.push(current.namedChild(i));
    }
  }

  counts._totalNodes = visitedCount;
  return counts;
}
302
+
303
/**
 * Extract parameter names from a function's tree-sitter node.
 * Works across languages by looking for common parameter node patterns.
 *
 * Parameter lists (`parameters`, `formal_parameters`, `parameter_list`) are
 * looked up among the node's named children and grandchildren; the
 * grandchild level covers wrappers such as variable_declarator -> arrow_function.
 *
 * For each entry of a parameter list the name is resolved as:
 *   1. the entry itself, when it is a bare `identifier`;
 *   2. its `name`, `pattern` or `left` field, when that field is an
 *      `identifier` (`left` covers default-valued params such as `a = 1`);
 *      a non-identifier field (e.g. a destructuring pattern) yields no name;
 *   3. otherwise its first direct `identifier` child (covers entries that
 *      wrap a bare identifier without a field, e.g. typed or rest params).
 *
 * @param {object} node - Definition node (needs namedChildCount, namedChild(i))
 * @returns {string[]} Parameter names in source order; empty when none are found
 */
function extractParamNames(node) {
  const isParamList = (n) =>
    n.type === 'parameters' || n.type === 'formal_parameters' || n.type === 'parameter_list';

  // Find the parameter list node(s)
  const paramLists = [];
  for (let i = 0; i < node.namedChildCount; i++) {
    const child = node.namedChild(i);
    if (isParamList(child)) paramLists.push(child);
    // Drill into wrappers (e.g. variable_declarator -> arrow_function)
    for (let j = 0; j < child.namedChildCount; j++) {
      const grandchild = child.namedChild(j);
      if (isParamList(grandchild)) paramLists.push(grandchild);
    }
  }

  const resolveName = (param) => {
    if (param.type === 'identifier') return param.text;

    const fieldNode = param.childForFieldName('name')
      || param.childForFieldName('pattern')
      || param.childForFieldName('left');
    if (fieldNode) {
      // A field that is not a plain identifier (destructuring, tuple) has no single name
      return fieldNode.type === 'identifier' ? fieldNode.text : null;
    }

    // No recognised field: fall back to the first bare identifier child
    for (let k = 0; k < param.namedChildCount; k++) {
      const inner = param.namedChild(k);
      if (inner.type === 'identifier') return inner.text;
    }
    return null;
  };

  const params = [];
  for (const paramList of paramLists) {
    for (let i = 0; i < paramList.namedChildCount; i++) {
      const name = resolveName(paramList.namedChild(i));
      if (name !== null) params.push(name);
    }
  }
  return params;
}
341
+
249
342
  /**
250
343
  * Find the body/block node of a definition, drilling into wrappers like
251
344
  * lexical_declaration → variable_declarator → arrow_function → body.
@@ -340,6 +433,11 @@ function parseFile(filePath, source) {
340
433
  bodyStartLine = bodyRow === defRow ? bodyRow + 2 : bodyRow + 1; // 1-indexed
341
434
  }
342
435
 
436
+ // Build AST profile from function body (or whole node if no body)
437
+ const profileNode = bodyNode || defNode.node;
438
+ const astProfile = buildAstProfile(profileNode);
439
+ const paramNames = extractParamNames(defNode.node);
440
+
343
441
  definitions.push({
344
442
  name: nameCapture.node.text,
345
443
  kind: nodeKind(defNode.node.type),
@@ -348,6 +446,8 @@ function parseFile(filePath, source) {
348
446
  lineEnd: defNode.node.endPosition.row + 1,
349
447
  signature: extractSignature(defNode.node, langName),
350
448
  bodyStartLine,
449
+ astProfile,
450
+ paramNames,
351
451
  });
352
452
  }
353
453
  } catch (e) {
@@ -373,9 +473,28 @@ function parseFile(filePath, source) {
373
473
  }
374
474
  }
375
475
 
476
+ // Associate each reference with its enclosing definition (by line range).
477
+ // This gives us per-function reference sets for similarity analysis.
478
+ // Sort definitions by lineStart for binary search.
479
+ const sortedDefs = [...definitions].sort((a, b) => a.lineStart - b.lineStart);
480
+ for (const ref of references) {
481
+ // Find the innermost enclosing definition
482
+ let enclosing = null;
483
+ for (const def of sortedDefs) {
484
+ if (ref.line >= def.lineStart && ref.line <= def.lineEnd) {
485
+ // Pick innermost (last matching, since sorted by start and nested defs start later)
486
+ enclosing = def;
487
+ }
488
+ }
489
+ if (enclosing) {
490
+ if (!enclosing.localRefs) enclosing.localRefs = [];
491
+ enclosing.localRefs.push(ref.name);
492
+ }
493
+ }
494
+
376
495
  // No tree.delete()/parser.delete() needed — native GC handles cleanup
377
496
 
378
497
  return { file: filePath, definitions, references };
379
498
  }
380
499
 
381
- export { parseFile, SUPPORTED_EXTENSIONS, LANG_MAP };
500
+ export { parseFile, buildAstProfile, extractParamNames, SUPPORTED_EXTENSIONS, LANG_MAP };