codedeep-mcp 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +177 -0
- package/dist/config.js +223 -0
- package/dist/git/analyzer.js +177 -0
- package/dist/git/git-service.js +568 -0
- package/dist/git/head-watcher.js +113 -0
- package/dist/git/runner.js +204 -0
- package/dist/index.js +138 -0
- package/dist/indexer/code-index.js +1801 -0
- package/dist/indexer/complexity.js +633 -0
- package/dist/indexer/extractor.js +354 -0
- package/dist/indexer/languages/cpp.js +934 -0
- package/dist/indexer/languages/csharp.js +854 -0
- package/dist/indexer/languages/dart.js +777 -0
- package/dist/indexer/languages/go.js +665 -0
- package/dist/indexer/languages/java.js +507 -0
- package/dist/indexer/languages/kotlin.js +709 -0
- package/dist/indexer/languages/objc.js +397 -0
- package/dist/indexer/languages/php.js +771 -0
- package/dist/indexer/languages/python.js +455 -0
- package/dist/indexer/languages/ruby.js +697 -0
- package/dist/indexer/languages/rust.js +754 -0
- package/dist/indexer/languages/swift.js +691 -0
- package/dist/indexer/languages/typescript.js +485 -0
- package/dist/indexer/parser.js +175 -0
- package/dist/indexer/pipeline.js +342 -0
- package/dist/indexer/scanner.js +279 -0
- package/dist/indexer/watcher.js +353 -0
- package/dist/logger.js +16 -0
- package/dist/server.js +170 -0
- package/dist/tools/common.js +207 -0
- package/dist/tools/find-references.js +224 -0
- package/dist/tools/find-symbol.js +94 -0
- package/dist/tools/get-context.js +370 -0
- package/dist/tools/impact.js +218 -0
- package/dist/tools/overview.js +482 -0
- package/dist/tools/search-structure.js +303 -0
- package/dist/types.js +61 -0
- package/grammars/tree-sitter-c.wasm +0 -0
- package/grammars/tree-sitter-c_sharp.wasm +0 -0
- package/grammars/tree-sitter-cpp.wasm +0 -0
- package/grammars/tree-sitter-dart.wasm +0 -0
- package/grammars/tree-sitter-go.wasm +0 -0
- package/grammars/tree-sitter-java.wasm +0 -0
- package/grammars/tree-sitter-javascript.wasm +0 -0
- package/grammars/tree-sitter-kotlin.wasm +0 -0
- package/grammars/tree-sitter-objc.wasm +0 -0
- package/grammars/tree-sitter-php.wasm +0 -0
- package/grammars/tree-sitter-python.wasm +0 -0
- package/grammars/tree-sitter-ruby.wasm +0 -0
- package/grammars/tree-sitter-rust.wasm +0 -0
- package/grammars/tree-sitter-swift.wasm +0 -0
- package/grammars/tree-sitter-tsx.wasm +0 -0
- package/grammars/tree-sitter-typescript.wasm +0 -0
- package/package.json +67 -0
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
import { createHash } from 'node:crypto';
|
|
2
|
+
import { promises as fs } from 'node:fs';
|
|
3
|
+
import { isAbsolute, join, relative } from 'node:path';
|
|
4
|
+
import { errMsg, log } from '../logger.js';
|
|
5
|
+
import { LANGUAGE_UNKNOWN } from '../types.js';
|
|
6
|
+
import { extractSymbols } from './extractor.js';
|
|
7
|
+
import { initParser, parseFile } from './parser.js';
|
|
8
|
+
import { compileExcludeMatcher, detectLanguage, isBinaryByContent, isBinaryByExtension, refineHeaderLanguage, scanProject, toPosix, } from './scanner.js';
|
|
9
|
+
// Number of files Indexer.processBatched handles between event-loop yields.
const BATCH_SIZE = 50;
|
|
10
|
+
// The ONE no-change policy, applied by both indexChanged's scan diff and
|
|
11
|
+
// indexFile's single-file path. mtime alone misses content swaps under
|
|
12
|
+
// coarse-resolution filesystems or `cp -p` / archive extraction that
|
|
13
|
+
// preserves timestamps; comparing size catches the common case cheaply.
|
|
14
|
+
// indexFile additionally hash-verifies (see indexFileInner) because it
|
|
15
|
+
// runs in response to an explicit fs event.
|
|
16
|
+
// `language` is the freshly DETECTED language for the path: an upgrade
|
|
17
|
+
// that teaches the scanner a new extension (e.g. `.java`) reclassifies
|
|
18
|
+
// files a cached index recorded as 'unknown' — those must re-index even
|
|
19
|
+
// though their bytes never changed, or the new language stays inert on
|
|
20
|
+
// every warm cache.
|
|
21
|
+
/**
 * Decides whether a cached file record still describes the file on disk.
 *
 * @param {object|undefined} prev - Cached record, or `undefined` when the path was never indexed.
 * @param {number} mtimeMs - Current modification time of the file.
 * @param {number} size - Current size of the file in bytes.
 * @param {string} language - Language freshly detected for the path.
 * @returns {boolean} `true` only when a record exists and its mtime, size and language all match.
 */
function isUnchanged(prev, mtimeMs, size, language) {
    if (prev === undefined) {
        return false;
    }
    if (prev.lastModified !== mtimeMs) {
        return false;
    }
    if (prev.size !== size) {
        return false;
    }
    // A language mismatch forces a re-index even when the bytes are the same.
    return prev.language === language;
}
|
|
27
|
+
/**
 * Fingerprints file content for change detection.
 *
 * @param {string|Buffer} content - Content to hash.
 * @returns {string} The first 16 hex characters of the SHA-1 digest.
 */
function hashContent(content) {
    const hasher = createHash('sha1');
    hasher.update(content);
    const hex = hasher.digest('hex');
    return hex.slice(0, 16);
}
|
|
30
|
+
/**
 * Drives indexing for one project: full scans (`indexAll`), incremental
 * rescans (`indexChanged`) and single-file updates (`indexFile`). Every entry
 * point runs through `runGuarded`, which drops (does not queue) a request
 * that arrives while another run is still in flight.
 */
export class Indexer {
    /** Project configuration; this class reads projectRoot, cacheDir, exclude, languages, maxFiles and maxFileSize. */
    config;
    /** Index store that file records, symbols, references and imports are written to. */
    index;
    /** Path of the persisted index: `<config.cacheDir>/index.json`. */
    cachePath;
    /** Predicate compiled from `config.exclude`; called with project-relative POSIX paths. */
    matchExclude;
    /** True while a guarded run is executing. */
    indexing = false;
    /** Files processed so far in the current run. */
    done = 0;
    /** Files scheduled for the current run. */
    total = 0;
    /** Set to true once an indexAll/indexChanged run has finished. */
    ready = false;
    // Whether the most recent indexAll/indexChanged saw a COMPLETE scan.
    // A partial scan (transient readdir failure) resolves successfully but
    // preserves unseen cached entries, so the watcher must be able to tell
    // that the rescan it requested may not have covered everything.
    lastScanCompleteFlag = true;

    /** @returns {boolean} The `complete` flag reported by the most recent scan. */
    get lastScanComplete() {
        return this.lastScanCompleteFlag;
    }

    /**
     * @param {object} config - Project configuration (see the `config` field).
     * @param {object} index - Index store to populate and persist.
     */
    constructor(config, index) {
        this.config = config;
        this.index = index;
        this.cachePath = join(config.cacheDir, 'index.json');
        this.matchExclude = compileExcludeMatcher(config.exclude);
    }

    /** @returns {boolean} True while a guarded run is executing. */
    get isIndexing() {
        return this.indexing;
    }

    /** @returns {{done: number, total: number}} Snapshot of the current run's counters. */
    get progress() {
        return { done: this.done, total: this.total };
    }

    // indexAll/indexChanged resolve `true` when the work ran and `false`
    // when the concurrency guard dropped the request; indexFile returns a
    // richer outcome string so the caller can tell mutation from idle work.

    /**
     * Scans the project and (re)processes every file the scan returned, then
     * prunes cached entries the scan no longer contains and persists the index.
     *
     * @returns {Promise<boolean>} `true` when the run executed, `false` when dropped by the guard.
     */
    async indexAll() {
        return this.runGuarded(async () => {
            await initParser();
            const { files: current, complete } = await scanProject(this.config);
            this.lastScanCompleteFlag = complete;
            this.total = current.length;
            await this.processBatched(current);
            // Prune only after a complete scan: a partial scan would otherwise
            // drop valid entries until the next clean run.
            if (complete) {
                const currentPaths = new Set(current.map((f) => f.path));
                for (const existing of this.index.getAllFiles()) {
                    if (!currentPaths.has(existing.path)) {
                        this.index.removeFile(existing.path);
                    }
                }
            } else {
                log.warn('Indexer.indexAll: scan incomplete; preserving cached entries not seen in this scan');
            }
            await this.persist();
            this.ready = true;
        });
    }

    /**
     * Scans the project and processes only files whose mtime, size or detected
     * language differ from the cached record; removes cached entries missing
     * from a complete scan. Persists only when something changed.
     *
     * @returns {Promise<boolean>} `true` when the run executed, `false` when dropped by the guard.
     */
    async indexChanged() {
        return this.runGuarded(async () => {
            await initParser();
            const { files: current, complete } = await scanProject(this.config);
            this.lastScanCompleteFlag = complete;
            // Starts as "everything cached"; entries seen by the scan are
            // deleted below, so what remains is the set of stale paths.
            const previous = new Map(this.index.getAllFiles().map((f) => [f.path, f]));
            const toIndex = [];
            for (const f of current) {
                const prev = previous.get(f.path);
                if (!isUnchanged(prev, f.lastModified, f.size, f.language)) {
                    toIndex.push(f);
                }
                previous.delete(f.path);
            }
            let deletedCount = 0;
            if (complete) {
                deletedCount = previous.size;
                for (const stalePath of previous.keys()) {
                    this.index.removeFile(stalePath);
                }
            } else if (previous.size > 0) {
                log.warn(`Indexer.indexChanged: scan incomplete; preserving ${previous.size} cached entries not seen`);
            }
            if (toIndex.length === 0 && deletedCount === 0) {
                log.debug('Indexer: indexChanged found no changes');
                this.ready = true;
                return;
            }
            this.total = toIndex.length;
            await this.processBatched(toIndex);
            await this.persist();
            this.ready = true;
        });
    }

    /**
     * Re-evaluates a single path. Does NOT persist the index; the caller is
     * expected to batch persistence itself.
     *
     * @param {string} rawPath - Absolute path, or a path relative to `config.projectRoot`.
     * @returns {Promise<string>} `'dropped'` when the guard refused the run, otherwise the
     *   outcome of `indexFileInner`: `'noop'`, `'removed'`, `'cap-skipped'` or `'indexed'`.
     */
    async indexFile(rawPath) {
        let outcome = 'noop';
        const ran = await this.runGuarded(async () => {
            outcome = await this.indexFileInner(rawPath);
        });
        return ran ? outcome : 'dropped';
    }

    /**
     * Unguarded body of `indexFile`: canonicalizes the path, applies the same
     * filters as the scanner, skips files verified unchanged and otherwise
     * hands the file to `processFile`.
     *
     * @param {string} rawPath - Absolute or project-relative path.
     * @returns {Promise<string>} `'noop'`, `'removed'`, `'cap-skipped'` or `'indexed'`.
     */
    async indexFileInner(rawPath) {
        await initParser();
        // Canonicalize to a project-relative POSIX path so the cache key
        // matches the scanner's form whether the input is absolute,
        // relative, or uses Windows backslashes.
        const projectRoot = this.config.projectRoot;
        const absInput = isAbsolute(rawPath) ? rawPath : join(projectRoot, rawPath);
        const relPath = toPosix(relative(projectRoot, absInput));
        // `relative` yields '' for the root itself and a '..'-led path for
        // anything above it. On Windows it returns the absolute target
        // unchanged when the two paths are on different drives, so an
        // absolute result is also outside the project.
        if (relPath === '' ||
            relPath === '..' ||
            relPath.startsWith('../') ||
            isAbsolute(relPath)) {
            log.debug(`Indexer.indexFile: skip ${rawPath} (outside project root)`);
            return 'noop';
        }
        // Drops any cached entry for this path and reports whether one existed.
        const removed = () => this.index.removeFile(relPath) ? 'removed' : 'noop';
        if (this.matchExclude(relPath) || isBinaryByExtension(relPath)) {
            return removed();
        }
        // Stat before language detection so deletions and size-cap rejections
        // remove cached entries even for unknown-language files.
        const absPath = join(this.config.projectRoot, relPath);
        let stats;
        try {
            stats = await fs.lstat(absPath);
        } catch (err) {
            log.debug(`Indexer.indexFile: stat failed for ${relPath} (${errMsg(err)}); treated as deletion`);
            return removed();
        }
        if (stats.isSymbolicLink()) {
            log.debug(`Indexer.indexFile: skip ${relPath} (symlink)`);
            return removed();
        }
        // FIFOs, sockets, devices and directories must never reach the
        // content readers below: opening a writer-less named pipe blocks.
        if (!stats.isFile()) {
            log.debug(`Indexer.indexFile: skip ${relPath} (not a regular file)`);
            return removed();
        }
        // Mirror the scanner's maxFiles cap for files not already indexed so
        // single-file events cannot grow the index past the configured bound.
        if (this.config.maxFiles > 0 &&
            !this.index.hasFile(relPath) &&
            this.index.fileCount >= this.config.maxFiles) {
            log.debug(`Indexer.indexFile: skip ${relPath} (index at maxFiles=${this.config.maxFiles})`);
            return 'cap-skipped';
        }
        let language = detectLanguage(relPath) ?? LANGUAGE_UNKNOWN;
        // A `.h` mapped to 'cpp' may be an Objective-C header. This MUST run
        // before isUnchanged, which compares against the stored language.
        if (language === 'cpp') {
            language = await refineHeaderLanguage(absPath, language, new Set(this.config.languages));
        }
        const existing = this.index.getFile(relPath);
        if (isUnchanged(existing, stats.mtimeMs, stats.size, language)) {
            if (existing?.contentHash !== undefined) {
                // mtime+size cannot tell an echo event from a real same-size
                // edit inside one coarse mtime tick, so confirm by content
                // hash (a read without a parse) before skipping.
                try {
                    const content = await fs.readFile(absPath, 'utf8');
                    if (hashContent(content) === existing.contentHash) {
                        log.debug(`Indexer.indexFile: ${relPath} unchanged; skipping`);
                        return 'noop';
                    }
                    // Same stat fingerprint, different bytes: fall through
                    // and re-index for real.
                } catch (err) {
                    log.debug(`Indexer.indexFile: hash check read failed for ${relPath} (${errMsg(err)}); treated as deletion`);
                    return removed();
                }
            } else {
                // No stored hash (processFile stores none for unknown-language
                // entries): the stat match is accepted as-is.
                log.debug(`Indexer.indexFile: ${relPath} unchanged; skipping`);
                return 'noop';
            }
        }
        if (stats.size > this.config.maxFileSize) {
            log.debug(`Indexer.indexFile: skip ${relPath} (size ${stats.size} > maxFileSize ${this.config.maxFileSize})`);
            return removed();
        }
        // Recognized-but-unconfigured languages are not indexed.
        if (language !== LANGUAGE_UNKNOWN &&
            !this.config.languages.includes(language)) {
            return removed();
        }
        if (language === LANGUAGE_UNKNOWN) {
            try {
                if (await isBinaryByContent(absPath)) {
                    return removed();
                }
            } catch (err) {
                log.warn(`Indexer.indexFile: byte check failed for ${relPath}: ${errMsg(err)}`);
                return removed();
            }
        }
        const file = {
            path: relPath,
            language,
            size: stats.size,
            lastModified: stats.mtimeMs,
            lastIndexed: 0,
            symbolCount: 0,
        };
        this.total = 1;
        const result = await this.processFile(file);
        this.done = 1;
        return result;
    }

    /**
     * Runs `work` unless another guarded run is active, resetting the
     * progress counters first and always clearing the busy flag afterwards.
     *
     * @param {() => Promise<void>} work - The job to execute.
     * @returns {Promise<boolean>} `false` when a run was already in flight (the request is
     *   dropped, not queued); `true` when the work ran to completion. Errors thrown by
     *   `work` propagate to the caller.
     */
    async runGuarded(work) {
        if (this.indexing) {
            log.warn('Indexer: indexing already in progress; refusing concurrent run');
            return false;
        }
        this.indexing = true;
        this.done = 0;
        this.total = 0;
        try {
            await work();
        } finally {
            this.indexing = false;
        }
        return true;
    }

    /**
     * Processes `files` sequentially in chunks of BATCH_SIZE, bumping `done`
     * per file and yielding to the event loop after each chunk.
     *
     * @param {object[]} files - File records to process.
     * @returns {Promise<void>}
     */
    async processBatched(files) {
        for (let i = 0; i < files.length; i += BATCH_SIZE) {
            const end = Math.min(i + BATCH_SIZE, files.length);
            for (let j = i; j < end; j++) {
                await this.processFile(files[j]);
                this.done++;
            }
            log.debug(`Indexed ${this.done}/${this.total} files`);
            // Yield to the event loop so other pending work can be served.
            await new Promise((resolve) => setImmediate(resolve));
        }
    }

    /**
     * Reads, parses and extracts one file into the index. Unknown-language
     * files are recorded without being read or parsed. Any read, parse or
     * extraction failure removes the file's cached entry instead.
     *
     * @param {object} file - Record with at least `path` and `language`.
     * @returns {Promise<string>} `'indexed'`, or `'removed'`/`'noop'` after a failure
     *   (depending on whether a cached entry existed).
     */
    async processFile(file) {
        // Recorded but never parsed: empty symbol, reference and import lists.
        if (file.language === LANGUAGE_UNKNOWN) {
            this.index.updateFile({ ...file, lastIndexed: Date.now() }, [], [], []);
            return 'indexed';
        }
        const absPath = join(this.config.projectRoot, file.path);
        const removed = () => this.index.removeFile(file.path) ? 'removed' : 'noop';
        let content;
        try {
            content = await fs.readFile(absPath, 'utf8');
        } catch (err) {
            log.warn(`Indexer: failed to read ${file.path}: ${errMsg(err)}`);
            return removed();
        }
        let tree;
        try {
            tree = parseFile(content, file.language);
        } catch (err) {
            log.warn(`Indexer: parseFile threw for ${file.path}: ${errMsg(err)}`);
            return removed();
        }
        if (!tree) {
            log.warn(`Indexer: parser returned null for ${file.path} (language=${file.language})`);
            return removed();
        }
        try {
            let result;
            try {
                result = extractSymbols(tree, content, file);
            } catch (err) {
                log.warn(`Indexer: extractSymbols threw for ${file.path}: ${errMsg(err)}`);
                return removed();
            }
            const annotated = {
                ...file,
                lastIndexed: Date.now(),
                symbolCount: result.symbols.length,
                contentHash: hashContent(content),
            };
            this.index.updateFile(annotated, result.symbols, result.references, result.imports);
            return 'indexed';
        } finally {
            // tree-sitter trees hold WASM memory that JS GC won't reclaim.
            tree.delete();
        }
    }

    /**
     * Saves the index to `cachePath`. A failure is logged and swallowed on
     * purpose: the in-memory index remains usable.
     *
     * @returns {Promise<void>}
     */
    async persist() {
        try {
            await this.index.save(this.cachePath);
            log.debug(`Indexer: saved cache to ${this.cachePath}`);
        } catch (err) {
            log.error(`Indexer: failed to save cache: ${errMsg(err)}`);
            // Do not rethrow.
        }
    }
}
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
import { open, readdir, stat } from 'node:fs/promises';
|
|
2
|
+
import { join, relative, sep, posix } from 'node:path';
|
|
3
|
+
import picomatch from 'picomatch';
|
|
4
|
+
import { LANGUAGE_UNKNOWN } from '../types.js';
|
|
5
|
+
import { log } from '../logger.js';
|
|
6
|
+
// Size in bytes of the file prefix read by the content sniffers in this
// module (refineHeaderLanguage and isBinaryByContent).
const BYTE_CHECK_BUF_SIZE = 8192;
|
|
7
|
+
// Lower-cased file extension -> language id, grouped by language. Notes:
//  - `.c` gets its own 'c' language id rather than being folded into 'cpp'.
//  - `.h` is ambiguous (C, C++ or Objective-C). It maps to 'cpp' here and is
//    refined by content in refineHeaderLanguage.
//  - `.mm` (Objective-C++) is intentionally absent, so detectLanguage returns
//    null for it.
const LANGUAGE_BY_EXT = Object.fromEntries([
    ['typescript', ['.ts']],
    ['tsx', ['.tsx']],
    ['javascript', ['.js', '.jsx', '.mjs', '.cjs']],
    ['python', ['.py']],
    ['java', ['.java']],
    ['go', ['.go']],
    ['rust', ['.rs']],
    ['swift', ['.swift']],
    ['kotlin', ['.kt', '.kts']],
    ['dart', ['.dart']],
    ['csharp', ['.cs']],
    ['php', ['.php']],
    ['ruby', ['.rb', '.rake', '.gemspec']],
    ['c', ['.c']],
    ['cpp', ['.cpp', '.cc', '.cxx', '.hpp', '.hh', '.hxx', '.ipp', '.tpp', '.h']],
    ['objc', ['.m']],
].flatMap(([language, extensions]) => extensions.map((ext) => [ext, language])));
|
|
53
|
+
// Patterns that reclassify an ambiguous `.h` from 'cpp' to 'objc'. Every
// pattern is anchored to the start of a line (multiline flag), so a
// `//@interface` comment, a `"#import"` string literal, or C++'s bare
// `import std;` (no leading `#` or `@`) never matches.
const OBJC_HEADER_MARKERS = [
    // `#import`: the dominant ObjC header signal. It is also an MSVC C++
    // directive, so it is not strictly ObjC-exclusive; the original authors
    // judged that case rare.
    /^\s*#\s*import\b/m,
    // ObjC `@` directives.
    /^\s*@(?:interface|protocol|implementation|class|import)\b/m,
    // Nullability audit region macros.
    /^\s*NS_ASSUME_NONNULL_(?:BEGIN|END)\b/m,
    // `typedef NS_ENUM(NSInteger, Foo)` is the usual shape, so an optional
    // leading `typedef` is allowed; a `// NS_ENUM(...)` comment stays excluded.
    /^\s*(?:typedef\s+)?NS_(?:ENUM|OPTIONS)\s*\(/m,
];
|
|
70
|
+
// Extensions treated as binary by name alone (see isBinaryByExtension).
const BINARY_EXT = new Set([
    ['.png', '.jpg', '.jpeg', '.gif', '.ico'], // images
    ['.woff', '.woff2', '.ttf', '.eot'], // fonts
    ['.mp3', '.mp4'], // media
    ['.zip', '.tar', '.gz'], // archives
    ['.wasm', '.pdf'],
    ['.exe', '.dll', '.so', '.dylib', '.o', '.a'], // native binaries and objects
    ['.class', '.pyc', '.pyo', '.jar', '.war'], // compiled bytecode and packages
].flat());
|
|
79
|
+
// Matches any one of the glob metacharacters * ? [ ] { } !
// compileExcludeMatcher uses it to tell plain names from glob patterns.
const GLOB_CHARS = /[*?[\]{}!]/;
|
|
80
|
+
/**
 * Converts a path that uses the platform separator into a `/`-separated one.
 *
 * @param {string} p - Path using the platform's `sep`.
 * @returns {string} The same path with every `sep` replaced by `/`.
 */
export function toPosix(p) {
    if (sep === '/') {
        // Already POSIX-style on this platform; return the input untouched.
        return p;
    }
    const segments = p.split(sep);
    return segments.join('/');
}
|
|
83
|
+
/**
 * Looks up a language by file extension (case-insensitive).
 *
 * @param {string} filename - File name or path.
 * @returns {string|null} The language id from LANGUAGE_BY_EXT, or `null` when the extension is unmapped.
 */
export function detectLanguage(filename) {
    const normalized = toPosix(filename);
    const extension = posix.extname(normalized).toLowerCase();
    const language = LANGUAGE_BY_EXT[extension];
    if (language === undefined || language === null) {
        return null;
    }
    return language;
}
|
|
87
|
+
// A `.h` file is ambiguous: a C/C++ header maps to 'cpp' (above), but an Objective-C
|
|
88
|
+
// header is also `.h` and holds the public API (@protocol + @property are
|
|
89
|
+
// header-exclusive). tree-sitter-cpp errors on every ObjC header, and tree-sitter-objc
|
|
90
|
+
// wrecks C++ headers, so a blanket route is wrong either way. Refine the detected
|
|
91
|
+
// language by content: ONLY a `.h`-that-resolved-to-'cpp', and ONLY when 'objc' is a
|
|
92
|
+
// configured language (so disabling objc keeps `.h`→cpp), read the first 8KB and route
|
|
93
|
+
// to 'objc' iff a line-anchored ObjC marker matches. Self-heals: `isUnchanged` compares
|
|
94
|
+
// the stored language, so a later heuristic tweak re-indexes a re-classified `.h` with
|
|
95
|
+
// no schema bump. Everything else returns `language` unchanged (one I/O-free fast path).
|
|
96
|
+
/**
 * Re-routes an ambiguous `.h` header from 'cpp' to 'objc' by sniffing its
 * first BYTE_CHECK_BUF_SIZE bytes for an Objective-C marker.
 *
 * @param {string} absPath - Absolute path of the file.
 * @param {string} language - Language already detected from the path.
 * @param {Set<string>} langSet - Configured languages.
 * @returns {Promise<string>} `'objc'` when the file is a `.h` detected as 'cpp',
 *   'objc' is configured and a marker matches; otherwise `language` unchanged
 *   (also when the file cannot be read).
 */
export async function refineHeaderLanguage(absPath, language, langSet) {
    // Only `.h` is sniffed: other extensions that map to 'cpp' return on
    // this I/O-free path, as does everything when objc is not configured.
    const isAmbiguousHeader = language === 'cpp' &&
        langSet.has('objc') &&
        posix.extname(toPosix(absPath)).toLowerCase() === '.h';
    if (!isAmbiguousHeader) {
        return language;
    }
    let head;
    try {
        const handle = await open(absPath, 'r');
        try {
            const chunk = Buffer.alloc(BYTE_CHECK_BUF_SIZE);
            const { bytesRead } = await handle.read(chunk, 0, BYTE_CHECK_BUF_SIZE, 0);
            head = chunk.toString('utf8', 0, bytesRead);
        } finally {
            await handle.close();
        }
    } catch {
        // A read failure is non-fatal for routing: keep the path-based answer.
        return language;
    }
    for (const marker of OBJC_HEADER_MARKERS) {
        if (marker.test(head)) {
            return 'objc';
        }
    }
    return language;
}
|
|
122
|
+
/**
 * Tells whether a file name carries one of the known binary extensions.
 *
 * @param {string} filename - File name or path.
 * @returns {boolean} `true` when the lower-cased extension is in BINARY_EXT.
 */
export function isBinaryByExtension(filename) {
    const extension = posix.extname(toPosix(filename)).toLowerCase();
    return BINARY_EXT.has(extension);
}
|
|
125
|
+
// Reads up to 8KB from absPath and returns true on the first null byte.
|
|
126
|
+
// Mirrors git's "any NUL in the prefix means binary" heuristic. Used only
|
|
127
|
+
// for unknown-extension files; trusted source extensions skip this I/O.
|
|
128
|
+
/**
 * Sniffs the first BYTE_CHECK_BUF_SIZE bytes of a file for a NUL byte.
 *
 * @param {string} absPath - Absolute path of the file to inspect.
 * @returns {Promise<boolean>} `true` when the inspected prefix contains a zero byte.
 * @throws Propagates any error from opening or reading the file.
 */
export async function isBinaryByContent(absPath) {
    const handle = await open(absPath, 'r');
    try {
        const chunk = Buffer.alloc(BYTE_CHECK_BUF_SIZE);
        const { bytesRead } = await handle.read(chunk, 0, BYTE_CHECK_BUF_SIZE, 0);
        // Inspect only what was actually read: the rest of the freshly
        // allocated buffer is zero-filled and would always look binary.
        return chunk.subarray(0, bytesRead).includes(0);
    } finally {
        await handle.close();
    }
}
|
|
143
|
+
/**
 * Compiles exclude patterns into a single path predicate.
 *
 * Each pattern is expanded before compilation:
 *  - a plain name (no `/`, no glob characters) also matches at any depth,
 *    as a file or as a directory prefix;
 *  - a slash-less glob also matches at any depth;
 *  - a pattern containing `/` is used verbatim.
 *
 * @param {Iterable<string>} patterns - Exclude patterns.
 * @returns {(relPath: string) => boolean} Predicate over paths.
 */
export function compileExcludeMatcher(patterns) {
    const globs = [];
    for (const pattern of patterns) {
        // The pattern itself is always kept, ahead of its expansions.
        globs.push(pattern);
        if (pattern.includes('/')) {
            continue;
        }
        if (GLOB_CHARS.test(pattern)) {
            globs.push(`**/${pattern}`);
        } else {
            globs.push(`${pattern}/**`, `**/${pattern}`, `**/${pattern}/**`);
        }
    }
    const matches = picomatch(globs, { dot: true });
    return (relPath) => matches(relPath);
}
|
|
161
|
+
/**
 * Counts the `/` separators in a path.
 *
 * @param {string} relPath - `/`-separated path.
 * @returns {number} Number of `/` characters (0 for a path with none).
 */
export function depthOf(relPath) {
    // Splitting on '/' yields one more segment than there are separators.
    return relPath.split('/').length - 1;
}
|
|
169
|
+
/**
 * Sort comparator for records with a `path`: fewer `/` separators first,
 * ties broken by plain string comparison of the paths.
 *
 * @param {{path: string}} a
 * @param {{path: string}} b
 * @returns {number} Negative when `a` sorts first, positive when `b` does, 0 when equal.
 */
export function compareShallowFirst(a, b) {
    const depthGap = depthOf(a.path) - depthOf(b.path);
    if (depthGap !== 0) {
        return depthGap;
    }
    if (a.path < b.path) {
        return -1;
    }
    if (a.path > b.path) {
        return 1;
    }
    return 0;
}
|
|
176
|
+
/**
 * Recursively yields every regular file under `dir`, skipping symbolic links
 * and any directory for which `matchExclude` returns true.
 *
 * @param {string} root - Scan root; yielded `relPath`s are relative to it.
 * @param {string} dir - Directory to list on this call.
 * @param {(relPath: string) => boolean} matchExclude - Consulted for directories only.
 * @param {{complete: boolean}} state - `complete` is set to false when a
 *   non-root directory cannot be listed.
 * @yields {{absPath: string, relPath: string}}
 * @throws Re-throws the readdir error when `dir` is the root itself.
 */
async function* walk(root, dir, matchExclude, state) {
    let listing;
    try {
        listing = await readdir(dir, { withFileTypes: true });
    } catch (err) {
        if (dir === root) {
            throw err;
        }
        // A failure below the root hides files from this scan. Flag the
        // scan as incomplete so callers do not mistake the omission for a
        // deletion.
        state.complete = false;
        log.warn(`scanner: readdir failed for ${dir}: ${err.message}`);
        return;
    }
    for (const dirent of listing) {
        if (dirent.isSymbolicLink()) {
            continue;
        }
        const childAbs = join(dir, dirent.name);
        const childRel = toPosix(relative(root, childAbs));
        if (dirent.isFile()) {
            yield { absPath: childAbs, relPath: childRel };
            continue;
        }
        // Anything that is neither a file nor a directory is ignored.
        if (dirent.isDirectory() && !matchExclude(childRel)) {
            yield* walk(root, childAbs, matchExclude, state);
        }
    }
}
|
|
206
|
+
/**
 * Walks the project tree and builds the list of files to index.
 *
 * Files are dropped when excluded, binary by extension, in a recognized but
 * unconfigured language, binary by content (unknown-language files only), or
 * larger than `config.maxFileSize`. Parseable files claim the `maxFiles`
 * budget first; unknown-language files only fill what is left.
 *
 * @param {object} config - Reads exclude, languages, projectRoot, maxFiles and maxFileSize.
 * @returns {Promise<{files: object[], complete: boolean}>} Records sorted with
 *   compareShallowFirst, and `complete` = false when any directory listing,
 *   byte check or stat failed along the way.
 */
export async function scanProject(config) {
    const isExcluded = compileExcludeMatcher(config.exclude);
    const enabled = new Set(config.languages);
    const projectRoot = config.projectRoot;
    // A non-positive maxFiles means "no cap".
    const limit = config.maxFiles > 0 ? config.maxFiles : Number.MAX_SAFE_INTEGER;
    // Two buckets so that unknown-language files encountered early cannot
    // use up a tight maxFiles before parseable files are reached.
    const parseable = [];
    const unknown = [];
    const state = { complete: true };
    for await (const { absPath, relPath } of walk(projectRoot, projectRoot, isExcluded, state)) {
        if (parseable.length >= limit) {
            log.warn(`scanner: reached maxFiles=${limit}; remaining files skipped`);
            break;
        }
        if (isExcluded(relPath) || isBinaryByExtension(relPath)) {
            continue;
        }
        let language = detectLanguage(relPath) ?? LANGUAGE_UNKNOWN;
        // A `.h` mapped to 'cpp' may be an Objective-C header; refine before
        // the configured-language gate below.
        if (language === 'cpp') {
            language = await refineHeaderLanguage(absPath, language, enabled);
        }
        const isUnknown = language === LANGUAGE_UNKNOWN;
        // Recognized-but-unconfigured languages are dropped.
        if (!isUnknown && !enabled.has(language)) {
            continue;
        }
        if (isUnknown) {
            // No residual budget: skip before paying for any I/O.
            if (unknown.length >= limit - parseable.length) {
                continue;
            }
            try {
                if (await isBinaryByContent(absPath)) {
                    continue;
                }
            } catch (err) {
                state.complete = false;
                log.warn(`scanner: byte check failed for ${relPath}: ${err.message}`);
                continue;
            }
        }
        let stats;
        try {
            stats = await stat(absPath);
        } catch (err) {
            state.complete = false;
            log.warn(`scanner: stat failed for ${relPath}: ${err.message}`);
            continue;
        }
        if (stats.size > config.maxFileSize) {
            log.debug(`scanner: skip ${relPath} (size ${stats.size} > maxFileSize ${config.maxFileSize})`);
            continue;
        }
        const bucket = isUnknown ? unknown : parseable;
        bucket.push({
            path: relPath,
            language,
            size: stats.size,
            lastModified: stats.mtimeMs,
            lastIndexed: 0,
            symbolCount: 0,
        });
    }
    // Parseable files may have grown after unknowns were admitted, so the
    // unknown bucket is trimmed again to the final residual budget.
    const residual = Math.max(0, limit - parseable.length);
    const files = [...parseable, ...unknown.slice(0, residual)];
    files.sort(compareShallowFirst);
    return { files, complete: state.complete };
}
|