codedeep-mcp 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +177 -0
- package/dist/config.js +223 -0
- package/dist/git/analyzer.js +177 -0
- package/dist/git/git-service.js +568 -0
- package/dist/git/head-watcher.js +113 -0
- package/dist/git/runner.js +204 -0
- package/dist/index.js +138 -0
- package/dist/indexer/code-index.js +1801 -0
- package/dist/indexer/complexity.js +633 -0
- package/dist/indexer/extractor.js +354 -0
- package/dist/indexer/languages/cpp.js +934 -0
- package/dist/indexer/languages/csharp.js +854 -0
- package/dist/indexer/languages/dart.js +777 -0
- package/dist/indexer/languages/go.js +665 -0
- package/dist/indexer/languages/java.js +507 -0
- package/dist/indexer/languages/kotlin.js +709 -0
- package/dist/indexer/languages/objc.js +397 -0
- package/dist/indexer/languages/php.js +771 -0
- package/dist/indexer/languages/python.js +455 -0
- package/dist/indexer/languages/ruby.js +697 -0
- package/dist/indexer/languages/rust.js +754 -0
- package/dist/indexer/languages/swift.js +691 -0
- package/dist/indexer/languages/typescript.js +485 -0
- package/dist/indexer/parser.js +175 -0
- package/dist/indexer/pipeline.js +342 -0
- package/dist/indexer/scanner.js +279 -0
- package/dist/indexer/watcher.js +353 -0
- package/dist/logger.js +16 -0
- package/dist/server.js +170 -0
- package/dist/tools/common.js +207 -0
- package/dist/tools/find-references.js +224 -0
- package/dist/tools/find-symbol.js +94 -0
- package/dist/tools/get-context.js +370 -0
- package/dist/tools/impact.js +218 -0
- package/dist/tools/overview.js +482 -0
- package/dist/tools/search-structure.js +303 -0
- package/dist/types.js +61 -0
- package/grammars/tree-sitter-c.wasm +0 -0
- package/grammars/tree-sitter-c_sharp.wasm +0 -0
- package/grammars/tree-sitter-cpp.wasm +0 -0
- package/grammars/tree-sitter-dart.wasm +0 -0
- package/grammars/tree-sitter-go.wasm +0 -0
- package/grammars/tree-sitter-java.wasm +0 -0
- package/grammars/tree-sitter-javascript.wasm +0 -0
- package/grammars/tree-sitter-kotlin.wasm +0 -0
- package/grammars/tree-sitter-objc.wasm +0 -0
- package/grammars/tree-sitter-php.wasm +0 -0
- package/grammars/tree-sitter-python.wasm +0 -0
- package/grammars/tree-sitter-ruby.wasm +0 -0
- package/grammars/tree-sitter-rust.wasm +0 -0
- package/grammars/tree-sitter-swift.wasm +0 -0
- package/grammars/tree-sitter-tsx.wasm +0 -0
- package/grammars/tree-sitter-typescript.wasm +0 -0
- package/package.json +67 -0
|
@@ -0,0 +1,485 @@
|
|
|
1
|
+
import { IMPORT_DEFAULT, IMPORT_NAMESPACE, RECEIVER_OPAQUE } from '../../types.js';
|
|
2
|
+
import { SIGNATURE_DISPLAY_CAP, bareDecoratorIdentifier, commentDocLine, declSignature, normalizeSignature, resolveCalls, symbolId, } from '../extractor.js';
|
|
3
|
+
import { cFamilyBooleanOperatorKind, computeComplexity, isCFamilyBooleanOperator, } from '../complexity.js';
|
|
4
|
+
// Node types that open a function-like scope. Calls found inside such a body
// must not be attributed to whatever encloses it. Per the original design note,
// walkDecorators deliberately uses THIS subset rather than the full
// TS_SKIP_TYPES: it must keep descending through class bodies (top-level
// decorators on inner classes attribute to the enclosing function) yet stop at
// nested function bodies, where decorator firing is gated on the nested
// function actually being called.
const TS_FUNCTION_BODY_SKIP_TYPES = new Set([
    'function_declaration',
    'function_expression',
    'arrow_function',
    'method_definition',
]);

// Full pruning boundary: every function-like node above plus the class forms.
const TS_SKIP_TYPES = new Set([
    ...TS_FUNCTION_BODY_SKIP_TYPES,
    'class_declaration',
    'class_expression',
    'abstract_class_declaration',
]);
|
|
22
|
+
// Call-site selectors handed to resolveCalls by extractTypeScript. Each entry
// pairs a syntax node type with a getter that returns the node naming the
// callee (or null when the site should not produce a reference).
const TS_SELECTORS = [
    {
        nodeType: 'call_expression',
        getCallee: (callNode) => callNode.childForFieldName('function'),
    },
    {
        nodeType: 'new_expression',
        getCallee: (newNode) => newNode.childForFieldName('constructor'),
    },
    // Both JSX element forms share the PascalCase component filter.
    { nodeType: 'jsx_opening_element', getCallee: jsxComponentName },
    { nodeType: 'jsx_self_closing_element', getCallee: jsxComponentName },
    { nodeType: 'decorator', getCallee: bareDecoratorIdentifier },
];
|
|
29
|
+
/**
 * Returns the name node of a JSX element when it refers to a component.
 *
 * Components are PascalCase by convention; a tag whose first character is an
 * ASCII lowercase letter is an HTML element (`div`, `span`) and is not tracked
 * as a symbol reference. Only plain `identifier` names qualify — any other
 * name node shape yields null.
 *
 * @param node JSX opening / self-closing element node.
 * @returns the `name` identifier node, or null when there is nothing to track.
 */
function jsxComponentName(node) {
    const tag = node.childForFieldName('name');
    if (tag?.type !== 'identifier')
        return null;
    const first = tag.text.charAt(0);
    const startsLowercase = first >= 'a' && first <= 'z';
    return startsLowercase ? null : tag;
}
|
|
40
|
+
// Common JS/TS fluent and stdlib method names (4+ characters) that are
// suppressed when a member call to them stays unresolved. Without this filter,
// capturing chained calls floods the name-keyed store with `.then()` /
// `.filter()`-style noise. Domain method names (zod's `.optional`, `.nullable`,
// `.refine`, ...) are intentionally NOT listed — finding those is the recall
// win. Names of 3 characters or fewer (`.map`, `.get`, `.set`) are gated
// downstream by SHORT_NAME_THRESHOLD and therefore omitted here.
const TS_IGNORED_MEMBER_CALLEES = new Set([
    // promises and array iteration
    'then', 'catch', 'finally', 'filter', 'forEach', 'reduce', 'flatMap',
    // array slicing and searching
    'concat', 'slice', 'splice', 'indexOf', 'lastIndexOf', 'includes', 'join',
    'find', 'findIndex', 'some', 'every', 'sort', 'reverse', 'push',
    // strings
    'replace', 'replaceAll', 'trim', 'split', 'startsWith', 'endsWith',
    'substring', 'toLowerCase', 'toUpperCase', 'toString', 'valueOf',
    // objects / collections
    'keys', 'values', 'entries', 'hasOwnProperty', 'charAt', 'padStart',
    'padEnd', 'repeat', 'delete',
]);

// The four TS loop nodes. `for_in_statement` covers both for-of and for-in.
// Shared by the cyclomatic decision set and the cognitive loop set.
const TS_LOOP_NODE_TYPES = new Set([
    'for_statement',
    'for_in_statement',
    'while_statement',
    'do_statement',
]);

// Cyclomatic decision nodes; each contributes +1. Per the original note this
// set was verified against SonarJS (S1541):
//  - `switch_case` counts once per non-default case label; `switch_default`
//    and the `switch_statement` container do not count.
//  - `&&` / `||` / `??` are counted separately through isCFamilyBooleanOperator.
//  - Deliberate omissions that match SonarJS but differ from the textbook set:
//    `throw` and `catch` do not count; `else` / `finally` / `default` never
//    count; logical assignments `&&=` / `||=` / `??=` do not count.
const TS_DECISION_NODE_TYPES = new Set([
    ...TS_LOOP_NODE_TYPES,
    'if_statement',
    'switch_case',
    'ternary_expression',
]);
|
|
79
|
+
// Cognitive-complexity configuration passed to computeComplexity. Per the
// original note it was verified against SonarJS S3776 (eslint-plugin-sonarjs),
// which differs materially from sonar-java — do not assume the Java config
// transfers (see the boolean and JSX notes below). Node names were checked
// against the bundled grammars.
const TS_COGNITIVE_OPTIONS = {
    ifType: 'if_statement',
    conditionField: 'condition',
    consequenceField: 'consequence',
    alternativeField: 'alternative',
    // tree-sitter-typescript wraps else / else-if in an `else_clause` node
    // (unlike Java's direct `alternative`); the engine unwraps it.
    // else-if is a flat +1.
    elseClauseType: 'else_clause',
    loopTypes: TS_LOOP_NODE_TYPES,
    // TS uses `switch_statement` (not Java's `switch_expression`). The whole
    // switch is +1 regardless of case count — the cognitive/cyclomatic split.
    switchTypes: new Set(['switch_statement']),
    ternaryType: 'ternary_expression',
    catchType: 'catch_clause',
    // Empty on purpose: nested functions/arrows are already in TS_SKIP_TYPES,
    // so the cognitive walk prunes them. Each top-level function / method /
    // arrow-const gets its own standalone number and nested-function control
    // flow counts toward nobody. Listing arrows here would double-count them
    // into the enclosing symbol.
    nestOnlyTypes: new Set(),
    labeledJumpTypes: new Set(['break_statement', 'continue_statement']),
    // Inspect the `label` FIELD rather than namedChildCount: an unlabeled
    // `break /*c*/;` carries the comment as a named child, so counting
    // children would misread it as labeled and add a spurious +1.
    hasLabel: (jump) => jump.childForFieldName('label') != null,
    // SonarJS counts only maximal `&&` runs. The kind function reports
    // `&&` / `||` / `??` so that `||` / `??` stay in the source-order run as
    // breakers, while booleanRunStarts keeps only the starts of `&&` runs.
    // Cyclomatic DOES count `||` / `??` — that divergence is expected.
    booleanOperatorKind: cFamilyBooleanOperatorKind,
    booleanRunStarts: (current, previous) => current === '&&' && previous !== '&&',
    excludeBooleanRun: tsBooleanRunExcluded,
    parenthesizedType: 'parenthesized_expression',
};
|
|
119
|
+
/**
 * Decides whether a logical expression is excluded from the cognitive count
 * because it is a JSX short-circuit.
 *
 * Per the original note, SonarJS S3776 excludes a UNIFORM-operator logical
 * expression whose immediate parent is a JSX `{...}` container
 * (`jsx_expression`, which covers JSX children and attribute values):
 * `{cond && <X/>}`, `{a && b}`, `<div x={a && a}/>` all score 0. A
 * MIXED-operator tree is not excluded (`{(a || b) && <X/>}` scores 1). This
 * mirrors the plugin's `flattenJsxShortCircuitNodes`: bail on a ternary or a
 * different-operator logical node, recurse into same-operator operands, and
 * accept any other leaf.
 *
 * @param root the top logical-expression node of a boolean run.
 * @returns true when the run must be excluded from the cognitive score.
 */
function tsBooleanRunExcluded(root) {
    // SonarJS runs on ESTree, which has no paren nodes, so a wholly
    // parenthesized short-circuit (`{(cond && <X/>)}`) sits directly under the
    // JSX container there. tree-sitter keeps the paren nodes, so climb through
    // them before testing the container type.
    let container = root.parent;
    while (container?.type === 'parenthesized_expression')
        container = container.parent;
    if (container?.type !== 'jsx_expression')
        return false;
    const rootOp = cFamilyBooleanOperatorKind(root);
    if (rootOp === null)
        return false;
    const uniform = (node) => {
        // Unwrap parentheses around an operand. The wrapped expression is the
        // first NON-COMMENT named child: a comment inside the parens
        // (`(/*c*/ a || b)`) is itself a named node, and taking it as the
        // operand would make a mixed-operator tree look like a plain leaf and
        // be excluded by mistake. Same comment skip as unwrapReceiver.
        let n = node;
        while (n && n.type === 'parenthesized_expression') {
            let inner = n.firstNamedChild;
            while (inner && inner.type === 'comment')
                inner = inner.nextNamedSibling;
            n = inner;
        }
        if (!n)
            return true;
        if (n.type === 'ternary_expression')
            return false;
        const k = cFamilyBooleanOperatorKind(n);
        if (k === null)
            return true; // non-logical leaf
        if (k !== rootOp)
            return false; // different operator: not a JSX short-circuit
        return uniform(n.childForFieldName('left')) && uniform(n.childForFieldName('right'));
    };
    return uniform(root);
}
|
|
159
|
+
/**
 * Strips receiver wrappers that do not change WHICH object is the receiver:
 * `non_null_expression` (`a!`) and `parenthesized_expression` (`(a)`).
 *
 * The wrapped expression is the first named child that is not a comment — the
 * `!` and the parentheses are anonymous tokens, whereas a leading inline
 * comment such as `(/*c*\/ a)` is a named node and has to be stepped over.
 * As a result `a!.x()` and `(a).x()` recover the inner `a` / `this`. A truly
 * chained receiver (`a.b().c()`, a call_expression) is not a wrapper and is
 * returned untouched.
 *
 * @param node receiver expression node.
 * @returns the innermost non-wrapper node; the wrapper itself when it has no
 *          non-comment named child.
 */
function unwrapReceiver(node) {
    const isTransparent = (type) => type === 'non_null_expression' || type === 'parenthesized_expression';
    let current = node;
    for (;;) {
        if (!isTransparent(current.type))
            return current;
        let wrapped = current.firstNamedChild;
        while (wrapped && wrapped.type === 'comment')
            wrapped = wrapped.nextNamedSibling;
        if (!wrapped)
            return current;
        current = wrapped;
    }
}
|
|
179
|
+
/**
 * Describes the receiver and property of a member-call callee.
 *
 *  - `this.x()` and `obj.x()` carry their literal receiver token; a non-null
 *    `a!.x()` or parenthesized `(a).x()` receiver is unwrapped to that token
 *    first, so it resolves like `a.x()`.
 *  - Chained or indexed receivers (`a.b().c()`, `arr[0].run()`) carry
 *    RECEIVER_OPAQUE, so the called method stays findable by name but never
 *    resolves.
 *  - `super.x()` and calls without a clean property name (e.g. `foo()[k]()`)
 *    produce nothing.
 *
 * @param callee callee node of a call site.
 * @returns `{ receiver, property, isSelf }`, or null when no reference should
 *          be emitted.
 */
function tsMemberCallInfo(callee) {
    if (callee.type !== 'member_expression')
        return null;
    const objectNode = callee.childForFieldName('object');
    const propertyNode = callee.childForFieldName('property');
    if (!objectNode || !propertyNode)
        return null;
    const hasCleanName = propertyNode.type === 'property_identifier'
        || propertyNode.type === 'private_property_identifier';
    if (!hasCleanName)
        return null;
    const receiverNode = unwrapReceiver(objectNode);
    switch (receiverNode.type) {
        case 'this':
            return { receiver: 'this', property: propertyNode.text, isSelf: true };
        case 'identifier':
            return { receiver: receiverNode.text, property: propertyNode.text, isSelf: false };
        case 'super':
            return null;
        default:
            return { receiver: RECEIVER_OPAQUE, property: propertyNode.text, isSelf: false };
    }
}
|
|
205
|
+
/**
 * Entry point of this extractor: walks the top-level statements of a parsed
 * tree and collects symbols, imports and call references.
 *
 * @param tree parsed syntax tree (its `rootNode.namedChildren` are visited).
 * @param content source text the tree was parsed from.
 * @param fileInfo file descriptor; `path` and `language` are read downstream.
 * @returns `{ symbols, references, imports }`.
 */
export function extractTypeScript(tree, content, fileInfo) {
    const symbols = [];
    const imports = [];
    const bodies = [];
    for (const statement of tree.rootNode.namedChildren) {
        const isExport = statement.type === 'export_statement';
        // Re-exports (`export { x }`, `export * from '...'`) have no
        // `declaration` field and contribute no new symbols here.
        const declaration = isExport ? statement.childForFieldName('declaration') : statement;
        if (!declaration)
            continue;
        // The outer statement is passed along as well; makeSymbol reads the
        // doc comment from it.
        extractTopLevel(declaration, statement, content, fileInfo, isExport, symbols, imports, bodies);
    }
    const references = resolveCalls(
        bodies,
        tree.rootNode,
        symbols,
        fileInfo,
        TS_SELECTORS,
        TS_SKIP_TYPES,
        TS_FUNCTION_BODY_SKIP_TYPES,
        tsMemberCallInfo,
        { ignoredMemberCallees: TS_IGNORED_MEMBER_CALLEES },
    );
    computeComplexity(bodies, symbols, {
        decisionNodeTypes: TS_DECISION_NODE_TYPES,
        extraDecisionPredicate: isCFamilyBooleanOperator,
        skipTypes: TS_SKIP_TYPES,
        cognitive: TS_COGNITIVE_OPTIONS,
    });
    return { symbols, references, imports };
}
|
|
232
|
+
/**
 * Extracts the symbol(s), bodies and imports contributed by one top-level
 * declaration.
 *
 * @param target declaration node to inspect (already unwrapped from `export`).
 * @param outer original top-level statement; used as the doc-comment anchor.
 * @param content source text.
 * @param fileInfo file descriptor (`path` forms the FQN prefix).
 * @param exported whether the declaration sits under an export statement.
 * @param outSymbols array receiving extracted symbols.
 * @param outImports array receiving import records.
 * @param outBodies array receiving `{ symbolId, body[, className] }` entries.
 */
function extractTopLevel(target, outer, content, fileInfo, exported, outSymbols, outImports, outBodies) {
    const fqnFor = (name) => `${fileInfo.path}:${name}`;
    const descend = (node) => extractTopLevel(node, outer, content, fileInfo, exported, outSymbols, outImports, outBodies);
    // Builds a symbol for `target` itself and records it.
    const declare = (kind, name) => {
        const symbol = makeSymbol(target, outer, declSignature(target, content), fileInfo, kind, name, fqnFor(name), exported);
        outSymbols.push(symbol);
        return symbol;
    };
    const nodeType = target.type;

    // `declare ...` wrapper: look at what it wraps.
    if (nodeType === 'ambient_declaration') {
        const wrapped = target.firstNamedChild;
        if (wrapped)
            descend(wrapped);
        return;
    }

    if (nodeType === 'function_declaration' || nodeType === 'function_signature') {
        const fnName = target.childForFieldName('name')?.text;
        if (!fnName)
            return;
        const fnSymbol = declare('function', fnName);
        const fnBody = target.childForFieldName('body');
        if (fnBody)
            outBodies.push({ symbolId: fnSymbol.id, body: fnBody });
        return;
    }

    if (nodeType === 'class_declaration' || nodeType === 'abstract_class_declaration') {
        const className = target.childForFieldName('name')?.text;
        if (!className)
            return;
        const classSymbol = declare('class', className);
        const classBody = target.childForFieldName('body');
        if (!classBody)
            return;
        for (const member of classBody.namedChildren) {
            extractClassMember(member, content, fileInfo, className, exported, outSymbols, outBodies);
        }
        // The class body itself is also walked so that calls in static blocks
        // and non-callable field initializers (`static x = helper()`,
        // `field = helper()`) attribute to the class. TS_SKIP_TYPES contains
        // method_definition plus the function/arrow forms, so calls inside
        // member function bodies stay attributed to the member.
        outBodies.push({ symbolId: classSymbol.id, body: classBody, className });
        return;
    }

    // Declaration-only kinds. For enums this covers `enum`, `const enum` and,
    // through the ambient_declaration recursion, `declare enum`. Enum MEMBERS
    // are not extracted (scope: top-level and class-level declarations only).
    if (nodeType === 'interface_declaration'
        || nodeType === 'type_alias_declaration'
        || nodeType === 'enum_declaration') {
        const declName = target.childForFieldName('name')?.text;
        if (!declName)
            return;
        const kind = nodeType === 'interface_declaration' ? 'interface'
            : nodeType === 'type_alias_declaration' ? 'type'
                : 'enum';
        declare(kind, declName);
        return;
    }

    // A bare `namespace X {}` parses as expression_statement > internal_module
    // (grammar quirk); `export namespace` and `declare namespace` surface
    // internal_module directly via the declaration field / ambient recursion.
    if (nodeType === 'expression_statement') {
        const inner = target.firstNamedChild;
        if (inner && (inner.type === 'internal_module' || inner.type === 'module'))
            descend(inner);
        return;
    }

    // `namespace X { ... }` and the legacy `module X { ... }`.
    if (nodeType === 'internal_module' || nodeType === 'module') {
        const nameNode = target.childForFieldName('name');
        // Simple identifiers only. A dotted `namespace A.B` (nested_identifier)
        // would put a '.' into the FQN and trip classNameFromFqn's member
        // parsing, and a string name (`declare module "pkg"`) names a package,
        // not a symbol. Namespace MEMBERS are not extracted: a member FQN
        // `file:Ns.fn` would collide with class-member semantics.
        if (nameNode?.type !== 'identifier')
            return;
        declare('module', nameNode.text);
        return;
    }

    if (nodeType === 'lexical_declaration' || nodeType === 'variable_declaration') {
        for (const declarator of target.namedChildren) {
            if (declarator.type !== 'variable_declarator')
                continue;
            const nameNode = declarator.childForFieldName('name');
            // Destructuring patterns (array_pattern, object_pattern) are skipped.
            if (!nameNode || nameNode.type !== 'identifier')
                continue;
            const varName = nameNode.text;
            const value = declarator.childForFieldName('value');
            const isFunction = !!value && (value.type === 'arrow_function' || value.type === 'function_expression');
            const varSymbol = makeSymbol(declarator, outer, variableSignature(declarator, value, content), fileInfo, isFunction ? 'function' : 'variable', varName, fqnFor(varName), exported);
            outSymbols.push(varSymbol);
            if (!isFunction)
                continue;
            const fnBody = value.childForFieldName('body');
            if (fnBody)
                outBodies.push({ symbolId: varSymbol.id, body: fnBody });
        }
        return;
    }

    if (nodeType === 'import_statement')
        extractImport(target, fileInfo, outImports);
    // Every other node type contributes nothing.
}
|
|
353
|
+
/**
 * Extracts a symbol (and, when callable, a body) for one class member.
 *
 * Methods, method signatures and abstract method signatures become `method`
 * symbols. A field becomes a `method` when its initializer is an arrow
 * function or function expression, otherwise a `variable`. Any other member
 * type is ignored.
 *
 * @param member class-body child node.
 * @param content source text.
 * @param fileInfo file descriptor (`path` forms the FQN prefix).
 * @param className name of the enclosing class; used in the FQN and as qualifier.
 * @param exported whether the enclosing class is exported.
 * @param outSymbols array receiving extracted symbols.
 * @param outBodies array receiving `{ symbolId, body, className }` entries.
 */
function extractClassMember(member, content, fileInfo, className, exported, outSymbols, outBodies) {
    const memberFqn = (name) => `${fileInfo.path}:${className}.${name}`;
    const memberType = member.type;

    const isMethodLike = memberType === 'method_definition'
        || memberType === 'method_signature'
        || memberType === 'abstract_method_signature';
    if (isMethodLike) {
        const methodName = member.childForFieldName('name')?.text;
        if (!methodName)
            return;
        const methodSymbol = makeSymbol(member, member, declSignature(member, content), fileInfo, 'method', methodName, memberFqn(methodName), exported, className);
        outSymbols.push(methodSymbol);
        // Signatures have no body; only real definitions are registered.
        const methodBody = member.childForFieldName('body');
        if (methodBody)
            outBodies.push({ symbolId: methodSymbol.id, body: methodBody, className });
        return;
    }

    if (memberType !== 'public_field_definition')
        return;
    const fieldName = member.childForFieldName('name')?.text;
    if (!fieldName)
        return;
    const initializer = member.childForFieldName('value');
    const isCallable = !!initializer
        && (initializer.type === 'arrow_function' || initializer.type === 'function_expression');
    const fieldSymbol = makeSymbol(member, member, variableSignature(member, initializer, content), fileInfo, isCallable ? 'method' : 'variable', fieldName, memberFqn(fieldName), exported, className);
    outSymbols.push(fieldSymbol);
    if (!isCallable)
        return;
    const callableBody = initializer.childForFieldName('body');
    if (callableBody)
        outBodies.push({ symbolId: fieldSymbol.id, body: callableBody, className });
}
|
|
390
|
+
/**
 * Records one import statement as `{ file, sourceModule, importedNames, line }`.
 *
 * Each imported name is `{ name[, alias][, kind] }`: default imports use
 * IMPORT_DEFAULT as the name, namespace imports use IMPORT_NAMESPACE, and
 * `kind` is 'type' for type-only imports. Statements without a `source` field
 * are ignored.
 *
 * @param stmt import_statement node.
 * @param fileInfo file descriptor (`path` is recorded).
 * @param out array receiving the import record.
 */
function extractImport(stmt, fileInfo, out) {
    const sourceNode = stmt.childForFieldName('source');
    if (!sourceNode)
        return;
    // Drop one surrounding quote character from each end of the specifier.
    const sourceModule = sourceNode.text.replace(/^['"`]|['"`]$/g, '');
    // `import type { ... }`: the `type` token is an unnamed child of the
    // statement, so it never shows up in namedChildren.
    const statementIsTypeOnly = hasTypeKeyword(stmt);
    const importedNames = [];
    const record = (entry, typeOnly) => {
        if (typeOnly)
            entry.kind = 'type';
        importedNames.push(entry);
    };

    const clauses = stmt.namedChildren.filter((child) => child.type === 'import_clause');
    for (const clause of clauses) {
        for (const item of clause.namedChildren) {
            switch (item.type) {
                case 'identifier':
                    // Default import: `import X from '...'`.
                    record({ name: IMPORT_DEFAULT, alias: item.text }, statementIsTypeOnly);
                    break;
                case 'namespace_import':
                    // `import * as ns from '...'`.
                    for (const nsChild of item.namedChildren) {
                        if (nsChild.type !== 'identifier')
                            continue;
                        importedNames.push({
                            name: IMPORT_NAMESPACE,
                            alias: nsChild.text,
                            kind: statementIsTypeOnly ? 'type' : 'namespace',
                        });
                    }
                    break;
                case 'named_imports':
                    for (const spec of item.namedChildren) {
                        if (spec.type !== 'import_specifier')
                            continue;
                        const specName = spec.childForFieldName('name')?.text;
                        if (!specName)
                            continue;
                        const specAlias = spec.childForFieldName('alias')?.text;
                        const entry = specAlias ? { name: specName, alias: specAlias } : { name: specName };
                        // `import { type X, Y }`: the per-specifier `type`
                        // keyword is an unnamed child before the name.
                        record(entry, statementIsTypeOnly || hasTypeKeyword(spec));
                    }
                    break;
                default:
                    break;
            }
        }
    }

    out.push({
        file: fileInfo.path,
        sourceModule,
        importedNames,
        line: stmt.startPosition.row + 1,
    });
}
|
|
445
|
+
/**
 * Reports whether any direct child of `node` (named or anonymous) has the
 * node type `type`, i.e. the `type` keyword token of a type-only import.
 *
 * @param node node exposing `childCount` and `child(i)`.
 * @returns true when such a child exists.
 */
function hasTypeKeyword(node) {
    let index = 0;
    while (index < node.childCount) {
        if (node.child(index)?.type === 'type')
            return true;
        index += 1;
    }
    return false;
}
|
|
452
|
+
/**
 * Assembles the symbol record for a declaration.
 *
 * @param decl node whose start/end rows give the 1-based line range.
 * @param docNode node whose preceding comment (if any) supplies the doc line.
 * @param signature full, uncapped signature text.
 * @param fileInfo file descriptor (`path`, `language`).
 * @param kind symbol kind, e.g. 'function', 'class', 'method'.
 * @param name simple symbol name.
 * @param fqn fully qualified name.
 * @param exported whether the symbol is exported.
 * @param qualifier optional id qualifier (callers pass the class name for members).
 * @returns the symbol record.
 */
function makeSymbol(decl, docNode, signature, fileInfo, kind, name, fqn, exported, qualifier = '') {
    const { path: file, language } = fileInfo;
    // The id hashes the FULL signature; only the stored copy is capped —
    // otherwise overloads that differ past the cap would share an id.
    const id = symbolId(file, name, kind, signature, qualifier);
    const startLine = decl.startPosition.row + 1;
    const endLine = decl.endPosition.row + 1;
    const displaySignature = signature.slice(0, SIGNATURE_DISPLAY_CAP);
    const doc = extractDoc(docNode);
    return {
        id,
        name,
        fqn,
        kind,
        file,
        startLine,
        endLine,
        signature: displaySignature,
        doc,
        exported,
        language,
    };
}
|
|
469
|
+
/**
 * Builds the signature text for a variable declarator or class field.
 *
 * When the initializer is an arrow function or function expression, the
 * signature runs from the declarator start up to the start of the function
 * body (or the declarator end when there is no body), and a trailing `=>` is
 * removed for arrows. Otherwise the whole declarator text is used.
 *
 * @param declarator declarator / field node (`startIndex`, `endIndex`).
 * @param value initializer node, possibly null/undefined.
 * @param content source text the indices refer to.
 * @returns the normalized signature string.
 */
function variableSignature(declarator, value, content) {
    const valueType = value?.type;
    const isArrow = valueType === 'arrow_function';
    if (!isArrow && valueType !== 'function_expression') {
        return normalizeSignature(content.slice(declarator.startIndex, declarator.endIndex));
    }
    const body = value.childForFieldName('body');
    const headEnd = body ? body.startIndex : declarator.endIndex;
    const head = normalizeSignature(content.slice(declarator.startIndex, headEnd));
    return isArrow ? head.replace(/=>\s*$/, '').trimEnd() : head;
}
|
|
480
|
+
/**
 * Returns the doc line for a node, taken from the comment that is its
 * immediately preceding named sibling.
 *
 * @param node declaration (or enclosing statement) node.
 * @returns the result of commentDocLine on that comment's text, or null when
 *          the preceding named sibling is missing or not a comment.
 */
function extractDoc(node) {
    const preceding = node.previousNamedSibling;
    return preceding?.type === 'comment' ? commentDocLine(preceding.text) : null;
}
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
import { fileURLToPath } from 'node:url';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import { Parser, Language } from 'web-tree-sitter';
|
|
4
|
+
import { log } from '../logger.js';
|
|
5
|
+
// Directory containing this module.
const here = path.dirname(fileURLToPath(import.meta.url));

// From dist/indexer/parser.js or src/indexer/parser.ts, two levels up is the
// repo root, where the `grammars` directory lives.
const grammarsDir = path.resolve(here, '..', '..', 'grammars');

// Language id -> grammar file name inside grammarsDir.
const LANG_TO_WASM = {
    typescript: 'tree-sitter-typescript.wasm',
    tsx: 'tree-sitter-tsx.wasm',
    javascript: 'tree-sitter-javascript.wasm',
    python: 'tree-sitter-python.wasm',
    java: 'tree-sitter-java.wasm',
    go: 'tree-sitter-go.wasm',
    rust: 'tree-sitter-rust.wasm',
    swift: 'tree-sitter-swift.wasm',
    kotlin: 'tree-sitter-kotlin.wasm',
    dart: 'tree-sitter-dart.wasm',
    // @repomix ships this one as tree-sitter-c_sharp.wasm (underscore), not -csharp.
    csharp: 'tree-sitter-c_sharp.wasm',
    php: 'tree-sitter-php.wasm',
    ruby: 'tree-sitter-ruby.wasm',
    cpp: 'tree-sitter-cpp.wasm',
    // `.c` needs the dedicated C grammar: tree-sitter-cpp errors on K&R code
    // and on C code that uses C++ keywords as identifiers. The extractor is
    // shared with cpp.
    c: 'tree-sitter-c.wasm',
    objc: 'tree-sitter-objc.wasm',
};

// Module-level state: a map that starts empty and an init promise that starts
// unset (their use lies outside this excerpt).
const parsers = new Map();
let initPromise = null;
|
|
31
|
+
// Matches a whole conditional-compilation directive line
// (#if / #elseif / #else / #endif). With `m`, ^ and $ anchor per line; without
// `s`, `.*` never crosses a line break.
const SWIFT_DIRECTIVE_LINE = /^[ \t]*#(?:if|elseif|else|endif)\b.*$/gm;

/**
 * Blanks every Swift conditional-compilation directive line to spaces of the
 * same length, leaving line breaks untouched.
 *
 * Background (from the original note): tree-sitter-swift cannot parse a
 * #if/#endif block INSIDE a type or function body — it emits ERROR nodes that
 * drop the enclosing type and its first guarded member and hoist the rest to
 * the top level with the wrong kind/FQN. Equal-length blanking keeps offsets
 * and line numbers aligned with the on-disk file, while the declarations of
 * every #if branch parse as ordinary consecutive members. A member defined in
 * more than one branch is therefore extracted from each (over-extraction
 * rather than silent loss). parseFile applies this only when the raw parse
 * has errors, and adopts the result only when it parses clean.
 *
 * @param content Swift source text.
 * @returns the text with directive lines blanked; the input itself when it
 *          contains no `#if` at all (fast path).
 */
function neutralizeSwiftDirectives(content) {
    const blankOut = (line) => ' '.repeat(line.length);
    return content.includes('#if')
        ? content.replace(SWIFT_DIRECTIVE_LINE, blankOut)
        : content;
}
|
|
51
|
+
// Matches a STANDALONE `NS_ASSUME_NONNULL_BEGIN` / `NS_ASSUME_NONNULL_END`
// line, optionally followed by whitespace or a `//` line comment. A line that
// carries real code after the macro never matches, which keeps equal-length
// blanking safe: only the stray macro token disappears, no brace shifts and no
// declaration sharing the line is dropped.
const OBJC_NULLABILITY_LINE = /^[ \t]*NS_ASSUME_NONNULL_(?:BEGIN|END)\b[ \t]*(?:\/\/.*)?$/gm;

/**
 * Blanks standalone NS_ASSUME_NONNULL_BEGIN/END lines to spaces of the same
 * length, leaving line breaks untouched.
 *
 * Background (from the original note): tree-sitter-objc cannot parse a bare
 * `NS_ASSUME_NONNULL_BEGIN` (no trailing semicolon) before an `@interface`; it
 * mis-parses the macro as a type and drops the whole interface, yielding zero
 * symbols. Equal-length blanking preserves offsets and line numbers so slices
 * still match the on-disk file. As with the Swift `#if` handling, parseFile
 * applies this only when the raw parse has errors. Other macro opacity
 * (NS_ENUM / NS_OPTIONS / FOUNDATION_EXPORT) is a documented recall-only gap
 * and is not covered here.
 *
 * @param content Objective-C source text.
 * @returns the text with matching macro lines blanked; the input itself when
 *          it contains no `NS_ASSUME_NONNULL_` at all (fast path).
 */
function neutralizeObjcDirectives(content) {
    const blankOut = (line) => ' '.repeat(line.length);
    return content.includes('NS_ASSUME_NONNULL_')
        ? content.replace(OBJC_NULLABILITY_LINE, blankOut)
        : content;
}
|
|
71
|
+
/**
 * Counts the ERROR and MISSING nodes reachable from `root`.
 *
 * parseFile uses the count to decide whether the ObjC nullability-neutralized
 * parse is at least as good as the raw one. An ObjC header commonly retains
 * OTHER macro-opacity (NS_ENUM / FOUNDATION_EXPORT) after the NS_ASSUME_NONNULL
 * fix, so the two parses are compared by error count rather than requiring
 * zero errors (unlike Swift's brace-sensitive #if, where only a fully clean
 * reparse is safe to adopt).
 *
 * The walk only descends into children flagged `hasError` or `isMissing`, so
 * clean subtrees are pruned and the traversal stays cheap.
 *
 * @param {object} root - Node to start from; it and its descendants are read
 *   through `type`, `isMissing`, `hasError` and `children`.
 * @returns {number} Number of visited nodes whose type is 'ERROR' or that are
 *   missing.
 */
function countParseErrors(root) {
    // Worklist of nodes still to inspect; `cursor` marks the next unvisited one.
    const worklist = [root];
    let errors = 0;
    for (let cursor = 0; cursor < worklist.length; cursor += 1) {
        const current = worklist[cursor];
        if (current.type === 'ERROR' || current.isMissing) {
            errors += 1;
        }
        for (const child of current.children) {
            if (!child.hasError && !child.isMissing) {
                continue; // clean subtree: nothing to count below it
            }
            worklist.push(child);
        }
    }
    return errors;
}
|
|
91
|
+
/**
 * Initializes the parser runtime and registers one parser per entry of
 * LANG_TO_WASM in the module-level `parsers` map.
 *
 * The work is started at most once at a time: while an attempt is pending or
 * has succeeded, every call returns that same promise. Parsers are registered
 * only after ALL grammars have loaded.
 *
 * @returns {Promise<void>} Resolves once every grammar is loaded and
 *   registered; rejects if runtime initialization or any grammar load fails.
 */
export function initParser() {
    if (initPromise) {
        return initPromise;
    }
    // Loads one grammar file and returns its [language id, configured parser] pair.
    const loadGrammar = async ([lang, wasm]) => {
        const language = await Language.load(path.join(grammarsDir, wasm));
        const parser = new Parser();
        parser.setLanguage(language);
        return [lang, parser];
    };
    const attempt = (async () => {
        await Parser.init();
        const loaded = await Promise.all(Object.entries(LANG_TO_WASM).map(loadGrammar));
        loaded.forEach(([lang, parser]) => {
            parsers.set(lang, parser);
        });
    })();
    initPromise = attempt;
    // A cached rejection would otherwise disable parsing (and pattern
    // validation) for the process lifetime after one transient failure
    // (EMFILE during the WASM reads) — reset so the next call retries.
    attempt.catch(() => {
        initPromise = null;
    });
    return attempt;
}
|
|
114
|
+
/**
 * Parses `content` with the parser registered for `language`.
 *
 * The returned Tree holds WASM memory; callers must call `tree.delete()` when
 * finished — JS GC won't free it.
 *
 * Two grammar workarounds are applied CONDITIONALLY, and only when the raw
 * parse reports errors. Both reparse an equal-length, directive-blanked copy of
 * the text, so the adopted tree's offsets stay aligned with the ORIGINAL
 * content the caller passes to extractSymbols.
 *
 * - Swift (in-body #if mis-parse): the neutralized parse is adopted only when
 *   it is completely CLEAN. A file that already parses — including one with a
 *   `#if`-looking line inside a multi-line string literal — is never rewritten,
 *   and a neutralization that unbalances braces (a guard straddling an opening
 *   brace) is discarded rather than producing wrong nesting.
 *
 * - ObjC (bare NS_ASSUME_NONNULL_BEGIN mis-parse): the neutralized parse is
 *   adopted when it does NOT increase the error count. A header may carry other
 *   unfixable macro-opacity (NS_ENUM etc.), so the bar is no-more-errors, not
 *   zero. Crucially `<=`, not `<`: when the raw mis-parse buries a whole
 *   @interface inside ONE giant ERROR node and the neutralized parse hoists
 *   that interface cleanly but still errors on an intervening NS_ENUM, the
 *   error-NODE counts tie — yet the neutralized parse is strictly better.
 *   Blanking a balanced STANDALONE macro line only removes a stray token (never
 *   adds one or shifts braces), so the neutralized parse is never structurally
 *   worse; adopting an equal-count tie is safe.
 *
 * @param {string} content - Source text to parse.
 * @param {string} language - Language id used to look up the parser.
 * @returns The parsed tree, or `null` when the language has no registered
 *   parser or the parser itself returns null (both cases are logged).
 * @throws {Error} When no parser has been registered yet (initParser() has not
 *   completed). Errors thrown while reparsing for a workaround propagate too,
 *   after every tree created by this call has been freed.
 */
export function parseFile(content, language) {
    if (parsers.size === 0) {
        throw new Error('parser not initialized; call initParser() first');
    }
    const parser = parsers.get(language);
    if (!parser) {
        log.warn(`parseFile: unsupported language "${language}"`);
        return null;
    }
    const tree = parser.parse(content);
    if (!tree) {
        log.warn(`parseFile: parser returned null for language "${language}"`);
        return null;
    }
    if (language === 'swift' && tree.rootNode.hasError && content.includes('#if')) {
        // Swift's #if handling is brace-sensitive: only a fully clean reparse is safe.
        return adoptNeutralizedParse(parser, tree, content, neutralizeSwiftDirectives, (candidate) => !candidate.rootNode.hasError);
    }
    if (language === 'objc' && tree.rootNode.hasError && content.includes('NS_ASSUME_NONNULL_')) {
        // ObjC: accept a tie as well as an improvement (see the `<=` note above).
        return adoptNeutralizedParse(parser, tree, content, neutralizeObjcDirectives, (candidate, raw) => countParseErrors(candidate.rootNode) <= countParseErrors(raw.rootNode));
    }
    return tree;
}
// Reparses a neutralized copy of `content` and returns whichever tree should be
// kept: the candidate when `isAcceptable(candidate, rawTree)` holds, otherwise
// `rawTree`. The tree that is not returned is deleted, and if anything throws
// along the way both trees are deleted before the error propagates, so no WASM
// memory is leaked on any path.
function adoptNeutralizedParse(parser, rawTree, content, neutralize, isAcceptable) {
    let candidate = null;
    let accepted = false;
    try {
        const neutralized = neutralize(content);
        if (neutralized === content) {
            // Nothing was blanked, so a reparse would only reproduce the raw tree.
            return rawTree;
        }
        candidate = parser.parse(neutralized);
        accepted = Boolean(candidate) && isAcceptable(candidate, rawTree);
    }
    catch (err) {
        // Neither tree reaches the caller on this path; free both.
        candidate?.delete();
        rawTree.delete();
        throw err;
    }
    if (accepted) {
        rawTree.delete();
        return candidate;
    }
    candidate?.delete();
    return rawTree;
}
|