codedeep-mcp 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +23 -8
- package/dist/config.js +1 -1
- package/dist/fs-util.js +48 -0
- package/dist/git/git-service.js +27 -0
- package/dist/index.js +15 -1
- package/dist/indexer/code-index.js +91 -22
- package/dist/indexer/parser.js +100 -25
- package/dist/indexer/pipeline.js +64 -4
- package/dist/indexer/scanner.js +6 -4
- package/dist/indexer/watcher.js +9 -0
- package/dist/notes/note-store.js +513 -0
- package/dist/notes/staleness.js +168 -0
- package/dist/notes/types.js +19 -0
- package/dist/server.js +105 -16
- package/dist/tools/common.js +51 -41
- package/dist/tools/find-references.js +9 -11
- package/dist/tools/forget.js +26 -0
- package/dist/tools/get-context.js +149 -18
- package/dist/tools/impact.js +18 -5
- package/dist/tools/note-render.js +57 -0
- package/dist/tools/overview.js +76 -3
- package/dist/tools/recall.js +165 -0
- package/dist/tools/remember.js +207 -0
- package/dist/tools/search-structure.js +3 -2
- package/package.json +4 -2
package/README.md
CHANGED
|
@@ -10,7 +10,7 @@ An MCP server that gives AI coding agents structural understanding of codebases.
|
|
|
10
10
|
|
|
11
11
|
**One tool call replaces 5-10 Grep-Read cycles.**
|
|
12
12
|
|
|
13
|
-
codedeep-mcp parses your code with [tree-sitter](https://tree-sitter.github.io/tree-sitter/), builds a symbol index, and exposes
|
|
13
|
+
codedeep-mcp parses your code with [tree-sitter](https://tree-sitter.github.io/tree-sitter/), builds a symbol index, and exposes 9 tools over the [Model Context Protocol](https://modelcontextprotocol.io/): 6 read-only structural tools that answer questions directly (find symbols, trace callers, assess blast radius, search by structure) plus a 3-tool agent-curated knowledge layer (`remember` / `recall` / `forget`) whose notes are **staleness-tracked** against your source — when anchored code changes, the note is flagged instead of rotting silently.
|
|
14
14
|
|
|
15
15
|
## Why
|
|
16
16
|
|
|
@@ -32,6 +32,9 @@ codedeep-mcp solves this by parsing code into symbols and relationships, then an
|
|
|
32
32
|
| `find_references` | Cross-file usage search | Who calls this function, and from where? |
|
|
33
33
|
| `impact` | Depth-N blast radius | Transitive upstream callers, grouped by hop |
|
|
34
34
|
| `search_structure` | Keyword and structural search | Find by name/signature (all languages), or AST pattern (TS/JS) |
|
|
35
|
+
| `remember` | Store a durable, anchored note | Cross-file invariants, footguns, decisions — anchored to files/symbols |
|
|
36
|
+
| `recall` | Retrieve notes with freshness | Each note tagged ✓ fresh / ⚠ stale / ✗ missing by re-checking its anchors |
|
|
37
|
+
| `forget` | Delete a note | Remove superseded or wrong knowledge |
|
|
35
38
|
|
|
36
39
|
## Quick Start
|
|
37
40
|
|
|
@@ -63,7 +66,6 @@ Any MCP client that supports stdio transport works. Configure it to run `npx cod
|
|
|
63
66
|
Your Code ──> tree-sitter (parse) ──> In-Memory Index ──> MCP Tools
|
|
64
67
|
│
|
|
65
68
|
Git (optional)
|
|
66
|
-
LSP (planned)
|
|
67
69
|
```
|
|
68
70
|
|
|
69
71
|
**Structural index (always, instant):**
|
|
@@ -83,10 +85,22 @@ Commit frequency identifies hotspot files; co-change analysis reveals
|
|
|
83
85
|
behavioral coupling (files that change together); and a risk score
|
|
84
86
|
(churn × coupling × complexity) ranks the most change-prone, tangled hubs.
|
|
85
87
|
|
|
86
|
-
**
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
88
|
+
**Agent-curated knowledge layer (staleness-tracked):**
|
|
89
|
+
`remember` anchors durable notes to files/symbols and snapshots a content
|
|
90
|
+
baseline; `recall` re-checks each anchor against the current source and tags
|
|
91
|
+
every note ✓ fresh / ⚠ stale / ✗ missing — so an agent's accumulated knowledge
|
|
92
|
+
is verified at read time instead of rotting silently. Anchored notes also
|
|
93
|
+
surface **inline**: `get_context` renders the notes anchored to the symbol or
|
|
94
|
+
file being read (staleness-checked, budget-capped), and `overview` reports how
|
|
95
|
+
much knowledge is stored — the agent meets its own notes where it's already
|
|
96
|
+
looking, without having to ask. Notes are stored in the local `.codedeep`
|
|
97
|
+
cache, never written into your source.
|
|
98
|
+
|
|
99
|
+
**Honest confidence, by design:**
|
|
100
|
+
Cross-file edges are AST-derived name-matches with confidence tiers, not
|
|
101
|
+
compiler-verified references — every approximate row is tagged (e.g.
|
|
102
|
+
`[name match, unverified]`, `[behavioral]`) so an agent knows what to
|
|
103
|
+
trust and what to verify before asserting.
|
|
90
104
|
|
|
91
105
|
## Example
|
|
92
106
|
|
|
@@ -139,8 +153,9 @@ cyclomatic + cognitive complexity:
|
|
|
139
153
|
TypeScript / JS · Python · Java · Go · Rust · Swift · Kotlin · Dart · C# ·
|
|
140
154
|
PHP · Ruby · C++ · C · Objective-C
|
|
141
155
|
|
|
142
|
-
|
|
143
|
-
|
|
156
|
+
Cross-file references are AST name-matches with per-row confidence tags (see
|
|
157
|
+
*How It Works*) — precision-tuned per language against real-repo corpora with
|
|
158
|
+
an explicit goal of zero wrong-kind edges.
|
|
144
159
|
|
|
145
160
|
## Configuration
|
|
146
161
|
|
package/dist/config.js
CHANGED
|
@@ -3,7 +3,7 @@ import { constants as fsConstants, readFileSync } from 'node:fs';
|
|
|
3
3
|
import { access, mkdir } from 'node:fs/promises';
|
|
4
4
|
import { homedir } from 'node:os';
|
|
5
5
|
import { isAbsolute, join, relative, resolve, sep } from 'node:path';
|
|
6
|
-
import { toPosix } from './
|
|
6
|
+
import { toPosix } from './fs-util.js';
|
|
7
7
|
import { errMsg, log } from './logger.js';
|
|
8
8
|
const DEFAULT_EXCLUDES = [
|
|
9
9
|
'node_modules',
|
package/dist/fs-util.js
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
// Layer-neutral filesystem helpers that import no project module: used by the indexer
|
|
2
|
+
// (scanner/pipeline/watcher), config resolution, the tools layer, AND the
|
|
3
|
+
// notes layer. They live here — not in tools/common.ts or indexer/scanner.ts —
|
|
4
|
+
// so lower layers (notes/, config.ts) never import from a higher one just to
|
|
5
|
+
// read a file or normalize a separator.
|
|
6
|
+
import { promises as fs } from 'node:fs';
|
|
7
|
+
import { join, sep } from 'node:path';
|
|
8
|
+
export function toPosix(p) {
|
|
9
|
+
return sep === '/' ? p : p.split(sep).join('/');
|
|
10
|
+
}
|
|
11
|
+
// projectRoot is fixed for the process lifetime, so its realpath is too —
|
|
12
|
+
// caching it spares one syscall per safeReadIndexedFile call (pattern
|
|
13
|
+
// scans call this once per candidate file).
|
|
14
|
+
const realRootCache = new Map();
|
|
15
|
+
async function realProjectRoot(projectRoot) {
|
|
16
|
+
let cached = realRootCache.get(projectRoot);
|
|
17
|
+
if (cached === undefined) {
|
|
18
|
+
cached = await fs.realpath(projectRoot);
|
|
19
|
+
realRootCache.set(projectRoot, cached);
|
|
20
|
+
}
|
|
21
|
+
return cached;
|
|
22
|
+
}
|
|
23
|
+
// Re-check scanner admission rules at read time so stale on-disk
|
|
24
|
+
// state (symlink-swap, growth past cap, became-directory) can't
|
|
25
|
+
// bypass the indexer's contract.
|
|
26
|
+
export async function safeReadIndexedFile(relPath, config) {
|
|
27
|
+
const abs = join(config.projectRoot, relPath);
|
|
28
|
+
const stats = await fs.lstat(abs);
|
|
29
|
+
if (stats.isSymbolicLink()) {
|
|
30
|
+
throw new Error('refusing to follow symlink');
|
|
31
|
+
}
|
|
32
|
+
if (!stats.isFile()) {
|
|
33
|
+
throw new Error('not a regular file');
|
|
34
|
+
}
|
|
35
|
+
if (stats.size > config.maxFileSize) {
|
|
36
|
+
throw new Error(`exceeds maxFileSize (${stats.size} > ${config.maxFileSize})`);
|
|
37
|
+
}
|
|
38
|
+
// lstat only checks the final component. Resolve parent-directory
|
|
39
|
+
// symlinks so a swap higher up in the path can't escape projectRoot.
|
|
40
|
+
const [real, realRoot] = await Promise.all([
|
|
41
|
+
fs.realpath(abs),
|
|
42
|
+
realProjectRoot(config.projectRoot),
|
|
43
|
+
]);
|
|
44
|
+
if (real !== realRoot && !real.startsWith(realRoot + sep)) {
|
|
45
|
+
throw new Error('path escapes project root');
|
|
46
|
+
}
|
|
47
|
+
return fs.readFile(abs, 'utf8');
|
|
48
|
+
}
|
package/dist/git/git-service.js
CHANGED
|
@@ -53,6 +53,7 @@ export class GitService {
|
|
|
53
53
|
generationValue = 0;
|
|
54
54
|
branchMemo = null;
|
|
55
55
|
recentMemo = new Map();
|
|
56
|
+
headMemo = null;
|
|
56
57
|
// Single-flight: a refresh requested while one is running coalesces
|
|
57
58
|
// into exactly one trailing rerun.
|
|
58
59
|
inFlight = null;
|
|
@@ -333,6 +334,7 @@ export class GitService {
|
|
|
333
334
|
this.generationValue++;
|
|
334
335
|
this.branchMemo = null;
|
|
335
336
|
this.recentMemo.clear();
|
|
337
|
+
this.headMemo = null; // per-generation like its siblings; clear on HEAD move
|
|
336
338
|
}
|
|
337
339
|
async branchSummary() {
|
|
338
340
|
if (this.closed)
|
|
@@ -557,6 +559,31 @@ export class GitService {
|
|
|
557
559
|
this.recentMemo.set(memoKey, { gen, value });
|
|
558
560
|
return value;
|
|
559
561
|
}
|
|
562
|
+
// Short HEAD sha at the current generation, or null off-git / on transient
|
|
563
|
+
// failure. Used by `remember` to stamp a note's provenance ("noted at commit
|
|
564
|
+
// X"). Generation-memoized like recentCommits/branchSummary, so a HEAD move
|
|
565
|
+
// (the logs/HEAD watcher bumps the generation) invalidates it. Never throws.
|
|
566
|
+
async currentHead() {
|
|
567
|
+
if (this.closed)
|
|
568
|
+
return null;
|
|
569
|
+
this.maybeRetryStartup();
|
|
570
|
+
if (this.stateValue !== 'ready')
|
|
571
|
+
return null;
|
|
572
|
+
if (this.headMemo && this.headMemo.gen === this.generationValue) {
|
|
573
|
+
return this.headMemo.value;
|
|
574
|
+
}
|
|
575
|
+
const gen = this.generationValue;
|
|
576
|
+
const { out } = await this.probe(['rev-parse', '--short', 'HEAD'], {
|
|
577
|
+
timeoutMs: QUICK_TIMEOUT_MS,
|
|
578
|
+
});
|
|
579
|
+
// null = unborn HEAD or a transient failure: return null WITHOUT memoizing
|
|
580
|
+
// so the next call retries (mirrors recentCommits).
|
|
581
|
+
if (out === null)
|
|
582
|
+
return null;
|
|
583
|
+
const value = out.trim().length > 0 ? out.trim() : null;
|
|
584
|
+
this.headMemo = { gen, value };
|
|
585
|
+
return value;
|
|
586
|
+
}
|
|
560
587
|
// Shutdown: kill in-flight children, never await the analysis — the
|
|
561
588
|
// 10s shutdown watchdog must not ride on a git subprocess.
|
|
562
589
|
close() {
|
package/dist/index.js
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
+
import { join } from "node:path";
|
|
2
3
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
3
4
|
import { loadConfig, resolveCacheDir } from "./config.js";
|
|
4
5
|
import { GitService } from "./git/git-service.js";
|
|
@@ -6,6 +7,7 @@ import { CodeIndex } from "./indexer/code-index.js";
|
|
|
6
7
|
import { Indexer } from "./indexer/pipeline.js";
|
|
7
8
|
import { Watcher } from "./indexer/watcher.js";
|
|
8
9
|
import { errMsg, log } from "./logger.js";
|
|
10
|
+
import { NoteStore } from "./notes/note-store.js";
|
|
9
11
|
import { createServer } from "./server.js";
|
|
10
12
|
let initial;
|
|
11
13
|
try {
|
|
@@ -64,6 +66,18 @@ if (config.watch) {
|
|
|
64
66
|
// .catch makes the chain unrejectable — one line, one error path.
|
|
65
67
|
const git = new GitService(config, index, indexer.cachePath);
|
|
66
68
|
void indexingPromise.catch(() => { }).then(() => git.start());
|
|
69
|
+
// The note store is independent of the code index (notes are primary,
|
|
70
|
+
// non-rebuildable user data, NOT derived from source), so it survives every
|
|
71
|
+
// index invalidation. It sits next to index.json under the resolved cacheDir,
|
|
72
|
+
// inheriting the read-only-root ~/.cache fallback and the `.codedeep` scanner
|
|
73
|
+
// exclude. Write-through saves on each remember/forget mean no shutdown flush is
|
|
74
|
+
// needed. load() is FIRE-AND-FORGET (like git.start above): it never throws
|
|
75
|
+
// (quarantines on corruption) and is memoized, and every note tool re-awaits it
|
|
76
|
+
// before acting — so we must NOT block server.connect (and the 6 structural
|
|
77
|
+
// tools that never touch notes) on note-store disk I/O, which can be slow on a
|
|
78
|
+
// networked cacheDir or one cluttered with stale .tmp/.bak entries.
|
|
79
|
+
const notes = new NoteStore(join(config.cacheDir, "notes.json"), config.projectRoot);
|
|
80
|
+
void notes.load();
|
|
67
81
|
// Flush-on-shutdown — the watcher's per-flush saves bound the loss window,
|
|
68
82
|
// but exiting between flushes shouldn't discard the last debounce batch.
|
|
69
83
|
// watcher.close() drains that batch and persists through its normal save
|
|
@@ -134,5 +148,5 @@ process.on("SIGTERM", () => shutdown("SIGTERM received", false, true));
|
|
|
134
148
|
// exits with the last debounce batch unflushed and unsaved.
|
|
135
149
|
process.stdin.once("end", () => shutdown("stdin closed", true));
|
|
136
150
|
process.stdin.once("close", () => shutdown("stdin closed", true));
|
|
137
|
-
const server = createServer({ index, indexer, config, git });
|
|
151
|
+
const server = createServer({ index, indexer, config, git, notes });
|
|
138
152
|
await server.connect(new StdioServerTransport());
|
|
@@ -273,8 +273,8 @@ const DEFAULT_RISK_HOTSPOTS = 10; // rows returned by getRiskHotspots
|
|
|
273
273
|
// and inheritance-blind by construction, so an empty/shallow tree is a blind
|
|
274
274
|
// spot, not an "all clear".
|
|
275
275
|
const CALLER_TREE_LIMITATIONS = Object.freeze([
|
|
276
|
-
'Upstream callers only — cross-file callees (downstream) are not traversed
|
|
277
|
-
'Inheritance/override edges are not modeled, so virtual-dispatch callers may be missing
|
|
276
|
+
'Upstream callers only — cross-file callees (downstream) are not traversed.',
|
|
277
|
+
'Inheritance/override edges are not modeled, so virtual-dispatch callers may be missing.',
|
|
278
278
|
'Edges are heuristic AST name-matches, not compiler-verified; confidence is ordinal, not probabilistic.',
|
|
279
279
|
]);
|
|
280
280
|
export const zeroSymbolsByKind = () => ({
|
|
@@ -335,21 +335,58 @@ export class CodeIndex {
|
|
|
335
335
|
this.fileByPath.set(file.path, file);
|
|
336
336
|
this.importsByFile.set(file.path, [...imports]);
|
|
337
337
|
this.symbolsByFile.set(file.path, [...symbols]);
|
|
338
|
+
const ownIds = new Set();
|
|
338
339
|
for (const sym of symbols) {
|
|
340
|
+
ownIds.add(sym.id);
|
|
339
341
|
this.symbolById.set(sym.id, sym);
|
|
340
342
|
pushOrInit(this.symbolsByName, sym.name, sym);
|
|
341
343
|
}
|
|
342
|
-
|
|
344
|
+
// The id-keyed adjacency admits SAME-FILE edges only — enforced here, at
|
|
345
|
+
// the site where LIVE-EXTRACTED edges enter the maps, not just by
|
|
346
|
+
// extractor convention. (load() is the OTHER entry site: it restores
|
|
347
|
+
// persisted adjacency verbatim, trusting that the cache was written from
|
|
348
|
+
// maps this gate already filtered — a hand-edited cache bypasses it, and
|
|
349
|
+
// corrupt caches are deleted, not repaired.) removeFileInternal's prune
|
|
350
|
+
// relies on this invariant (it deletes only the removed file's own keys;
|
|
351
|
+
// a cross-file edge — on EITHER endpoint — would leave dangling ids in
|
|
352
|
+
// another file's sets and inflate fan-in/fan-out). A violating ref is
|
|
353
|
+
// DEMOTED WHOLESALE, not just skipped: its foreign endpoint ids are
|
|
354
|
+
// nulled before it enters the name-keyed stores, so every query surface
|
|
355
|
+
// agrees it is an unresolved name match (a kept ids-intact copy would
|
|
356
|
+
// read as top-tier 'resolved' in find_references while being absent from
|
|
357
|
+
// fan-in/fan-out — two surfaces disagreeing about the same edge). If a
|
|
358
|
+
// future resolver produces cross-file edges, they surface as these
|
|
359
|
+
// demoted name-keyed refs plus this warn — the deliberate signal to
|
|
360
|
+
// design the cross-file story properly.
|
|
361
|
+
let crossFileEdges = 0;
|
|
362
|
+
const admitted = [];
|
|
343
363
|
for (const ref of references) {
|
|
344
|
-
|
|
364
|
+
let stored = ref;
|
|
365
|
+
if ((ref.sourceId !== null && !ownIds.has(ref.sourceId)) ||
|
|
366
|
+
(ref.targetId !== null && !ownIds.has(ref.targetId))) {
|
|
367
|
+
crossFileEdges++;
|
|
368
|
+
stored = {
|
|
369
|
+
...ref,
|
|
370
|
+
sourceId: ref.sourceId !== null && ownIds.has(ref.sourceId) ? ref.sourceId : null,
|
|
371
|
+
targetId: ref.targetId !== null && ownIds.has(ref.targetId) ? ref.targetId : null,
|
|
372
|
+
};
|
|
373
|
+
}
|
|
374
|
+
admitted.push(stored);
|
|
375
|
+
pushOrInit(this.referencesByTargetName, stored.targetName, stored);
|
|
345
376
|
// Module-level calls (sourceId=null) and cross-file unresolved refs
|
|
346
377
|
// (targetId=null) skip the id-keyed adjacency; they're queried by name
|
|
347
378
|
// via referencesByTargetName.
|
|
348
|
-
if (
|
|
349
|
-
addAdjacency(this.callees,
|
|
350
|
-
addAdjacency(this.callers,
|
|
379
|
+
if (stored.sourceId && stored.targetId) {
|
|
380
|
+
addAdjacency(this.callees, stored.sourceId, stored.targetId);
|
|
381
|
+
addAdjacency(this.callers, stored.targetId, stored.sourceId);
|
|
351
382
|
}
|
|
352
383
|
}
|
|
384
|
+
this.referencesBySourceFile.set(file.path, admitted);
|
|
385
|
+
if (crossFileEdges > 0) {
|
|
386
|
+
log.warn(`CodeIndex.addFile: ${file.path} carried ${crossFileEdges} reference(s) ` +
|
|
387
|
+
`with a non-same-file endpoint id; demoted to unresolved name matches ` +
|
|
388
|
+
`(id-keyed adjacency is same-file by invariant — see removeFileInternal)`);
|
|
389
|
+
}
|
|
353
390
|
this.namesDirty = true;
|
|
354
391
|
this.callerCountsDirty = true;
|
|
355
392
|
this.aliasIndexDirty = true;
|
|
@@ -382,12 +419,20 @@ export class CodeIndex {
|
|
|
382
419
|
else
|
|
383
420
|
this.symbolsByName.set(sym.name, filtered);
|
|
384
421
|
}
|
|
422
|
+
// Deleting each id's own key fully prunes the adjacency maps: id-keyed
|
|
423
|
+
// edges are SAME-FILE by construction (resolveCalls builds its nameToId/
|
|
424
|
+
// typeNameToId/methodsByClass maps from the one file's symbols — a non-null
|
|
425
|
+
// targetId can only point within the file; cross-file refs are stored
|
|
426
|
+
// name-keyed with targetId=null), so no OTHER file's set can contain this
|
|
427
|
+
// file's ids. A whole-map sweep here would be a provable no-op that scales
|
|
428
|
+
// the watcher's re-index hot path with total repo size. The invariant is
|
|
429
|
+
// ENFORCED at the edge-entry site (addFile's ownIds gate above) and pinned
|
|
430
|
+
// by extractor.test.ts's "resolved references always target the SAME
|
|
431
|
+
// file's symbols" property test.
|
|
385
432
|
for (const id of deletedIds) {
|
|
386
433
|
this.callees.delete(id);
|
|
387
434
|
this.callers.delete(id);
|
|
388
435
|
}
|
|
389
|
-
pruneAdjacency(this.callers, deletedIds);
|
|
390
|
-
pruneAdjacency(this.callees, deletedIds);
|
|
391
436
|
const refsFromFile = this.referencesBySourceFile.get(path);
|
|
392
437
|
if (refsFromFile) {
|
|
393
438
|
// Group by targetName so each by-name list is filtered once even when
|
|
@@ -1608,7 +1653,8 @@ export function isCallerOf(ref, target) {
|
|
|
1608
1653
|
posix.dirname(ref.file) === posix.dirname(target.file);
|
|
1609
1654
|
// Self-receiver refs (extractor-determined: TS `this` node, Python
|
|
1610
1655
|
// self/cls) that extract-time resolution did NOT bind to a sibling
|
|
1611
|
-
// method can only target an inherited method —
|
|
1656
|
+
// method can only target an inherited method — inheritance is not
|
|
1657
|
+
// modeled, so claiming the edge would be confidently wrong. An
|
|
1612
1658
|
// ordinary receiver merely NAMED `self` is not affected.
|
|
1613
1659
|
if (isMember && ref.selfReceiver)
|
|
1614
1660
|
return false;
|
|
@@ -1654,14 +1700,27 @@ export function isClassMember(s) {
|
|
|
1654
1700
|
// counting impact.ts renders ("N callers across D depths (F files)"), shared so
|
|
1655
1701
|
// impact and the risk surface never diverge — and deduped, unlike
|
|
1656
1702
|
// CallerTreeResult.totalNodes which double-counts DAG diamonds.
|
|
1657
|
-
export function countDistinctCallers(root) {
|
|
1658
|
-
const
|
|
1703
|
+
export function countDistinctCallers(root, withTiers = false) {
|
|
1704
|
+
const callerKeys = new Set();
|
|
1705
|
+
// `strongest` (best edge per caller) is only built when the caller wants the
|
|
1706
|
+
// tier breakdown; the scalar blast-radius path skips the per-node strength
|
|
1707
|
+
// tracking + final bucketing loop entirely. Keyed identically to callerKeys,
|
|
1708
|
+
// so the per-tier counts sum to `callers`. A caller reachable via a resolved
|
|
1709
|
+
// path is at least that trustworthy, hence "strongest".
|
|
1710
|
+
const strongest = withTiers ? new Map() : null;
|
|
1659
1711
|
const files = new Set();
|
|
1660
1712
|
const depths = new Set();
|
|
1661
1713
|
let depthCapped = false;
|
|
1662
1714
|
const walk = (node) => {
|
|
1663
1715
|
for (const child of node.children) {
|
|
1664
|
-
|
|
1716
|
+
const key = child.symbolId ?? `m:${child.file}:${child.line}`;
|
|
1717
|
+
callerKeys.add(key);
|
|
1718
|
+
if (strongest) {
|
|
1719
|
+
const prev = strongest.get(key);
|
|
1720
|
+
if (prev === undefined || STRENGTH_RANK[child.strength] > STRENGTH_RANK[prev]) {
|
|
1721
|
+
strongest.set(key, child.strength);
|
|
1722
|
+
}
|
|
1723
|
+
}
|
|
1665
1724
|
files.add(child.file);
|
|
1666
1725
|
depths.add(child.depth);
|
|
1667
1726
|
if (child.depthCapped)
|
|
@@ -1670,7 +1729,25 @@ export function countDistinctCallers(root) {
|
|
|
1670
1729
|
}
|
|
1671
1730
|
};
|
|
1672
1731
|
walk(root);
|
|
1673
|
-
|
|
1732
|
+
const counts = {
|
|
1733
|
+
callers: callerKeys.size,
|
|
1734
|
+
files: files.size,
|
|
1735
|
+
depths: depths.size,
|
|
1736
|
+
depthCapped,
|
|
1737
|
+
};
|
|
1738
|
+
if (strongest) {
|
|
1739
|
+
const tiers = { structural: 0, nameMatch: 0, weakMember: 0 };
|
|
1740
|
+
for (const strength of strongest.values()) {
|
|
1741
|
+
if (strength === 'resolved')
|
|
1742
|
+
tiers.structural += 1;
|
|
1743
|
+
else if (strength === 'weak-member')
|
|
1744
|
+
tiers.weakMember += 1;
|
|
1745
|
+
else
|
|
1746
|
+
tiers.nameMatch += 1;
|
|
1747
|
+
}
|
|
1748
|
+
counts.tiers = tiers;
|
|
1749
|
+
}
|
|
1750
|
+
return counts;
|
|
1674
1751
|
}
|
|
1675
1752
|
// True when `imports` brings `name` into scope as a value binding the bare
|
|
1676
1753
|
// call site could resolve to (named import, alias, or wildcard). Shared by
|
|
@@ -1724,14 +1801,6 @@ function addAdjacency(map, key, value) {
|
|
|
1724
1801
|
}
|
|
1725
1802
|
set.add(value);
|
|
1726
1803
|
}
|
|
1727
|
-
function pruneAdjacency(map, deleted) {
|
|
1728
|
-
for (const [key, set] of map) {
|
|
1729
|
-
for (const id of deleted)
|
|
1730
|
-
set.delete(id);
|
|
1731
|
-
if (set.size === 0)
|
|
1732
|
-
map.delete(key);
|
|
1733
|
-
}
|
|
1734
|
-
}
|
|
1735
1804
|
function adjacencyToEntries(map) {
|
|
1736
1805
|
const out = [];
|
|
1737
1806
|
for (const [k, set] of map)
|
package/dist/indexer/parser.js
CHANGED
|
@@ -27,7 +27,82 @@ const LANG_TO_WASM = {
|
|
|
27
27
|
objc: 'tree-sitter-objc.wasm',
|
|
28
28
|
};
|
|
29
29
|
const parsers = new Map();
|
|
30
|
-
|
|
30
|
+
// Split memoization: the tree-sitter WASM runtime loads once (coreInit);
|
|
31
|
+
// each language's grammar loads on demand (langLoads) — a repo that is pure
|
|
32
|
+
// Python must not pay the ~95MB RSS floor of all 16 grammars. Rejections
|
|
33
|
+
// self-reset (a transient EMFILE must not disable a language for the
|
|
34
|
+
// process lifetime).
|
|
35
|
+
let coreInit = null;
|
|
36
|
+
const langLoads = new Map();
|
|
37
|
+
// A grammar load is retried IN PLACE with a short backoff before the promise
|
|
38
|
+
// rejects: transient failures (an EMFILE storm during the WASM read) clear in
|
|
39
|
+
// milliseconds, and retrying HERE — at the altitude where the failure lives —
|
|
40
|
+
// protects every caller identically (startup bulk scan, watcher single-file,
|
|
41
|
+
// pattern-mode validation) with no per-path retry bookkeeping anywhere else.
|
|
42
|
+
// (A per-path retry queue in the watcher was tried and removed: it could not
|
|
43
|
+
// cover the startup path, swallowed edits landing mid-budget, and its counters
|
|
44
|
+
// leaked across interleaved outcomes.) A failure that survives the attempts is
|
|
45
|
+
// treated as durable (corrupt/missing .wasm — needs user action); langLoads
|
|
46
|
+
// still self-resets, so a LATER call (next fs event / rescan / restart) probes
|
|
47
|
+
// again rather than latching the language off forever.
|
|
48
|
+
const GRAMMAR_LOAD_ATTEMPTS = 3;
|
|
49
|
+
const GRAMMAR_RETRY_BASE_MS = 50;
|
|
50
|
+
// After a full attempt budget fails, further ensures for that language FAIL
|
|
51
|
+
// FAST for this window instead of re-running the backoff sequence. Without
|
|
52
|
+
// it, the memo's self-reset composes badly with the serial batch path: a
|
|
53
|
+
// permanently corrupt .wasm would cost ~150ms of backoff PER FILE of that
|
|
54
|
+
// language (5,000 files ≈ 12+ minutes of stall inside one indexAll). With
|
|
55
|
+
// it, a batch sweeps at full speed (~one backoff sequence per TTL window)
|
|
56
|
+
// while a later probe — the next fs event / rescan after the window — still
|
|
57
|
+
// retries fresh, so the language is never latched off.
|
|
58
|
+
const GRAMMAR_FAILURE_TTL_MS = 5_000;
|
|
59
|
+
function delay(ms) {
|
|
60
|
+
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
61
|
+
}
|
|
62
|
+
const langFailures = new Map();
|
|
63
|
+
function ensureLanguage(lang) {
|
|
64
|
+
const wasm = LANG_TO_WASM[lang];
|
|
65
|
+
// Unknown/unsupported names are a no-op here (the scanner emits 'unknown'
|
|
66
|
+
// for unrecognized extensions); parseFile still warns when asked to parse one.
|
|
67
|
+
if (!wasm)
|
|
68
|
+
return Promise.resolve();
|
|
69
|
+
let p = langLoads.get(lang);
|
|
70
|
+
if (!p) {
|
|
71
|
+
const recent = langFailures.get(lang);
|
|
72
|
+
if (recent !== undefined) {
|
|
73
|
+
if (Date.now() - recent.at < GRAMMAR_FAILURE_TTL_MS) {
|
|
74
|
+
return Promise.reject(recent.err); // fail fast inside the TTL window
|
|
75
|
+
}
|
|
76
|
+
langFailures.delete(lang); // window over → probe again for real
|
|
77
|
+
}
|
|
78
|
+
p = (async () => {
|
|
79
|
+
let lastErr;
|
|
80
|
+
for (let attempt = 1; attempt <= GRAMMAR_LOAD_ATTEMPTS; attempt++) {
|
|
81
|
+
try {
|
|
82
|
+
const language = await Language.load(path.join(grammarsDir, wasm));
|
|
83
|
+
const parser = new Parser();
|
|
84
|
+
parser.setLanguage(language);
|
|
85
|
+
parsers.set(lang, parser);
|
|
86
|
+
langFailures.delete(lang);
|
|
87
|
+
return;
|
|
88
|
+
}
|
|
89
|
+
catch (err) {
|
|
90
|
+
lastErr = err;
|
|
91
|
+
if (attempt < GRAMMAR_LOAD_ATTEMPTS) {
|
|
92
|
+
await delay(GRAMMAR_RETRY_BASE_MS * attempt);
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
langFailures.set(lang, { at: Date.now(), err: lastErr });
|
|
97
|
+
throw lastErr;
|
|
98
|
+
})();
|
|
99
|
+
p.catch(() => {
|
|
100
|
+
langLoads.delete(lang); // durable failure → a later call probes again
|
|
101
|
+
});
|
|
102
|
+
langLoads.set(lang, p);
|
|
103
|
+
}
|
|
104
|
+
return p;
|
|
105
|
+
}
|
|
31
106
|
// Conditional-compilation directive lines (#if / #elseif / #else / #endif).
|
|
32
107
|
// `m` matches ^/$ per line; without `s`, `.*` stays within one line.
|
|
33
108
|
const SWIFT_DIRECTIVE_LINE = /^[ \t]*#(?:if|elseif|else|endif)\b.*$/gm;
|
|
@@ -88,39 +163,39 @@ function countParseErrors(root) {
|
|
|
88
163
|
}
|
|
89
164
|
return count;
|
|
90
165
|
}
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
}));
|
|
101
|
-
for (const [lang, parser] of loaded) {
|
|
102
|
-
parsers.set(lang, parser);
|
|
103
|
-
}
|
|
104
|
-
})();
|
|
166
|
+
// Initialize the runtime and load grammars. With `languages`, loads ONLY those
|
|
167
|
+
// grammars (the pipeline passes the scan-found set, so a repo pays memory for
|
|
168
|
+
// exactly the languages it contains); with no argument, loads ALL grammars
|
|
169
|
+
// (test harnesses and callers that can't know the set up front). Idempotent
|
|
170
|
+
// and incremental: each grammar loads at most once, and later calls with new
|
|
171
|
+
// languages top up what's already loaded.
|
|
172
|
+
export async function initParser(languages) {
|
|
173
|
+
if (!coreInit) {
|
|
174
|
+
coreInit = Parser.init();
|
|
105
175
|
// A cached rejection would otherwise disable parsing (and pattern
|
|
106
|
-
// validation) for the process lifetime after one transient failure
|
|
107
|
-
//
|
|
108
|
-
|
|
109
|
-
|
|
176
|
+
// validation) for the process lifetime after one transient failure —
|
|
177
|
+
// reset so the next call retries.
|
|
178
|
+
coreInit.catch(() => {
|
|
179
|
+
coreInit = null;
|
|
110
180
|
});
|
|
111
181
|
}
|
|
112
|
-
|
|
182
|
+
await coreInit;
|
|
183
|
+
const langs = languages ? [...new Set(languages)] : Object.keys(LANG_TO_WASM);
|
|
184
|
+
await Promise.all(langs.map(ensureLanguage));
|
|
113
185
|
}
|
|
114
186
|
// The returned Tree holds WASM memory; callers must call `tree.delete()` when
|
|
115
187
|
// finished — JS GC won't free it.
|
|
116
188
|
export function parseFile(content, language) {
|
|
117
|
-
if (parsers.size === 0) {
|
|
118
|
-
throw new Error('parser not initialized; call initParser() first');
|
|
119
|
-
}
|
|
120
189
|
const parser = parsers.get(language);
|
|
121
190
|
if (!parser) {
|
|
122
|
-
|
|
123
|
-
|
|
191
|
+
if (!(language in LANG_TO_WASM)) {
|
|
192
|
+
log.warn(`parseFile: unsupported language "${language}"`);
|
|
193
|
+
return null;
|
|
194
|
+
}
|
|
195
|
+
// Supported but not loaded — a caller-ordering bug (every parse path must
|
|
196
|
+
// initParser([language]) first). Throw loudly (the pipeline catches and
|
|
197
|
+
// warns per file) rather than silently skipping the file.
|
|
198
|
+
throw new Error(`parser not initialized for "${language}"; call initParser(["${language}"]) first`);
|
|
124
199
|
}
|
|
125
200
|
let tree = parser.parse(content);
|
|
126
201
|
if (!tree) {
|