gitnexus 1.6.4-rc.101 → 1.6.4-rc.103

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -612,9 +612,10 @@ importedRawReturnTypesMap, heritageMap, bindingAccumulator) => {
612
612
  await loadLanguage(language, file.path);
613
613
  let tree = astCache.get(file.path);
614
614
  if (!tree) {
615
+ const parseContent = provider.preprocessSource?.(file.content, file.path) ?? file.content;
615
616
  try {
616
- tree = parser.parse(file.content, undefined, {
617
- bufferSize: getTreeSitterBufferSize(file.content),
617
+ tree = parser.parse(parseContent, undefined, {
618
+ bufferSize: getTreeSitterBufferSize(parseContent),
618
619
  });
619
620
  }
620
621
  catch (parseError) {
@@ -2583,9 +2584,10 @@ export const extractFetchCallsFromFiles = async (files, astCache) => {
2583
2584
  await loadLanguage(language, file.path);
2584
2585
  let tree = astCache.get(file.path);
2585
2586
  if (!tree) {
2587
+ const parseContent = provider.preprocessSource?.(file.content, file.path) ?? file.content;
2586
2588
  try {
2587
- tree = parser.parse(file.content, undefined, {
2588
- bufferSize: getTreeSitterBufferSize(file.content),
2589
+ tree = parser.parse(parseContent, undefined, {
2590
+ bufferSize: getTreeSitterBufferSize(parseContent),
2589
2591
  });
2590
2592
  }
2591
2593
  catch {
@@ -0,0 +1,12 @@
1
+ /**
2
+ * Strip Unreal Engine reflection macros from C++ source, length-preserving.
3
+ *
4
+ * @param source C++ source text to preprocess.
5
+ * @param _filePath Unused; accepted only so the signature matches the
6
+ * `LanguageProvider.preprocessSource` hook (UE detection is content-based).
7
+ * @returns `source` itself when no strong UE marker is detected, so non-UE
8
+ * C++ files (including ones with `*_API`-suffixed identifiers such as
9
+ * `REST_API` or `HTTP_API`) incur only a single regex test; otherwise a
10
+ * string of the same length with the UE macros replaced by spaces.
11
+ */
12
+ export declare function stripUeMacros(source: string, _filePath?: string): string;
@@ -0,0 +1,260 @@
1
+ /**
2
+ * Unreal Engine reflection-macro preprocessor for C++ source.
3
+ *
4
+ * Tree-sitter does not expand C preprocessor macros, so Unreal's reflection
5
+ * markers (`UCLASS(...)`, `UFUNCTION(...)`, `MODULENAME_API`, ...) are parsed
6
+ * verbatim. The result is mis-parsed declarations: in `class BRAWLUI_API
7
+ * UMyClass : public UObject`, tree-sitter-cpp captures `BRAWLUI_API` as the
8
+ * class name and the rest of the declaration becomes structurally wrong.
9
+ *
10
+ * This module elides those macros from the source text BEFORE tree-sitter
11
+ * parses it. Replacement is **length-preserving** (each elided byte becomes
12
+ * a space, newlines preserved) so byte offsets and line/column positions
13
+ * tree-sitter reports remain identical to the original file. Symbol
14
+ * locations in the graph stay accurate.
15
+ *
16
+ * A cheap detection guard short-circuits files that don't look like UE
17
+ * sources, so non-UE C++ codebases pay only for a single regex test.
18
+ *
19
+ * Pure function — no tree-sitter dependency, safe for worker threads.
20
+ */
/**
 * Strong UE markers — reflection macros that only Unreal Engine projects use.
 * Presence of one of these is sufficient evidence that the file is a UE source
 * and that `MODULENAME_API` tokens in it are intended as export macros.
 *
 * Importantly, `_API` tokens are NOT in this guard — `REST_API`, `HTTP_API`,
 * `MY_LIB_API` and similar identifiers appear in plenty of non-UE C++ codebases
 * as constants/enums/parameter names. We must not erase them just because the
 * file mentions an `_API` token.
 *
 * The pattern is anchored with `\b` on BOTH sides. Without the trailing
 * boundary, longer unrelated identifiers that merely share a prefix
 * (`UENUMERATOR`, `USTRUCTURED`, `GENERATED_BODY_SIZE`) would count as markers
 * and switch on `_API` erasure for non-UE files. Suffixed spellings that the
 * earlier prefix-only pattern accepted are kept explicitly:
 * `GENERATED_BODY_LEGACY`, `UE_DEPRECATED_FORGAME`, and the `_OneParam` /
 * `_RetVal_...` style suffixes of the delegate declaration macros.
 *
 * Not global: `.test()` is therefore stateless across calls.
 */
const HAS_UE_HINT = /\b(?:UCLASS|UFUNCTION|UPROPERTY|USTRUCT|UENUM|UINTERFACE|GENERATED_BODY(?:_LEGACY)?|GENERATED_[A-Z_]+_BODY|UE_DEPRECATED(?:_FORGAME)?|DECLARE_(?:DYNAMIC_)?(?:MULTICAST_)?DELEGATE(?:_[A-Za-z]+)*)\b/;
/**
 * Macros that are erased whether or not an argument list follows: the name is
 * blanked, together with the balanced `(...)` list when one comes next.
 */
const SIMPLE_MACROS_NO_ARGS = [
    'GENERATED_BODY',
    'GENERATED_UCLASS_BODY',
    'GENERATED_USTRUCT_BODY',
    'GENERATED_UINTERFACE_BODY',
    'GENERATED_IINTERFACE_BODY',
    'DECLARE_CLASS',
    'GENERATED_BODY_LEGACY',
];
/**
 * Macros that are erased only together with their balanced `(...)` argument
 * list; an occurrence that is not followed by `(` is left untouched.
 */
const PARENTHESIZED_MACROS = [
    'UCLASS',
    'UFUNCTION',
    'UPROPERTY',
    'USTRUCT',
    'UENUM',
    'UINTERFACE',
    'UMETA',
    'UE_DEPRECATED',
];
/**
 * Delegate declaration macro names (`DECLARE_[DYNAMIC_][MULTICAST_]DELEGATE`
 * plus an optional arity / `RetVal` suffix). The lookahead requires an opening
 * parenthesis after optional whitespace, so only invocations match.
 *
 * Global (`g`) and therefore stateful: reset `lastIndex` before an `exec` loop.
 */
const DELEGATE_MACRO_RE = /\bDECLARE_(?:DYNAMIC_)?(?:MULTICAST_)?DELEGATE(?:_(?:RetVal_OneParam|RetVal_TwoParams|RetVal_ThreeParams|RetVal_FourParams|RetVal_FiveParams|RetVal_SixParams|RetVal_SevenParams|RetVal_EightParams|RetVal_NineParams|RetVal|OneParam|TwoParams|ThreeParams|FourParams|FiveParams|SixParams|SevenParams|EightParams|NineParams|TenParams))?(?=\s*\()/g;
/**
 * Module export tokens like `BRAWLUI_API`, `ENGINE_API`, `COREUOBJECT_API`.
 * Pattern: ALL_CAPS identifier ending in `_API`. The leading word boundary
 * (`\b`) prevents matching mid-identifier.
 *
 * Global (`g`) and therefore stateful: reset `lastIndex` before an `exec` loop.
 */
const API_MACRO_RE = /\b[A-Z][A-Z0-9_]*_API\b/g;
/**
 * Blank out `chars[start, end)` in place, preserving line breaks.
 *
 * Every element in the range except `'\n'` and `'\r'` is overwritten with a
 * single space, so the array keeps its length and its newline positions.
 *
 * The range is clamped to the bounds of `chars`: writing past the end (or at a
 * negative index) would add elements/properties to the array and silently
 * break the length-preserving guarantee the preprocessor relies on.
 *
 * @param {string[]} chars One-character strings; mutated in place.
 * @param {number} start Inclusive start index.
 * @param {number} end Exclusive end index.
 */
function eraseRange(chars, start, end) {
    const from = Math.max(0, start);
    const to = Math.min(chars.length, end);
    for (let i = from; i < to; i++) {
        const ch = chars[i];
        if (ch !== '\n' && ch !== '\r') {
            chars[i] = ' ';
        }
    }
}
/**
 * Skip a quoted literal whose opening quote is at `openIdx`.
 *
 * A backslash escapes the following code unit. Returns the index just past the
 * closing quote, or a value `>= source.length` when the literal is
 * unterminated.
 *
 * @param {string} source Text being scanned.
 * @param {number} openIdx Index of the opening quote.
 * @param {number} quoteCode Char code of the quote (`"` 0x22 or `'` 0x27).
 */
function skipQuotedLiteral(source, openIdx, quoteCode) {
    const len = source.length;
    let i = openIdx + 1;
    while (i < len) {
        const c = source.charCodeAt(i);
        if (c === 0x5c) {
            // Backslash: skip it together with the escaped code unit.
            i += 2;
            continue;
        }
        i++;
        if (c === quoteCode)
            break;
    }
    return i;
}
/**
 * Find the matching close paren for an opening paren at index `openIdx`.
 * Returns the index of `)` (inclusive end), or -1 if `openIdx` is not a `(`
 * or the parentheses are unbalanced.
 *
 * Handles nested parens, string/char literals, and `//` and block comments so
 * that parens inside them don't throw off the match. An unterminated literal
 * or block comment consumes the rest of the input and yields -1. Does not
 * attempt to handle raw string literals (`R"(...)"`); UE reflection-macro
 * arguments do not use them in practice.
 *
 * @param {string} source Text being scanned.
 * @param {number} openIdx Index expected to hold `(`.
 * @returns {number} Index of the matching `)`, or -1.
 */
function findMatchingParen(source, openIdx) {
    if (source.charCodeAt(openIdx) !== 0x28)
        return -1;
    const len = source.length;
    let depth = 1;
    let i = openIdx + 1;
    while (i < len) {
        const ch = source.charCodeAt(i);
        // String (") or char (') literal.
        if (ch === 0x22 || ch === 0x27) {
            i = skipQuotedLiteral(source, i, ch);
            continue;
        }
        if (ch === 0x2f) {
            const next = source.charCodeAt(i + 1);
            if (next === 0x2f) {
                // Line comment: jump to the newline (consumed by the generic advance below).
                const newline = source.indexOf('\n', i);
                i = newline < 0 ? len : newline;
                continue;
            }
            if (next === 0x2a) {
                // Block comment: jump past the terminator, searching after the opener.
                const terminator = source.indexOf('*/', i + 2);
                i = terminator < 0 ? len : terminator + 2;
                continue;
            }
        }
        if (ch === 0x28) {
            depth++;
        }
        else if (ch === 0x29 && --depth === 0) {
            return i;
        }
        i++;
    }
    return -1;
}
/** True when `code` is an ASCII identifier character: `0-9`, `A-Z`, `a-z` or `_`. */
function isIdentifierCharCode(code) {
    if (code === 0x5f)
        return true;
    const isDigit = code >= 0x30 && code <= 0x39;
    const isUpper = code >= 0x41 && code <= 0x5a;
    const isLower = code >= 0x61 && code <= 0x7a;
    return isDigit || isUpper || isLower;
}
/**
 * Check that `name` occurs at `idx` as a whole word: the characters directly
 * before and after the occurrence (when they exist) must not be ASCII
 * identifier characters.
 *
 * Returns the index just past the identifier, or -1 on a miss.
 */
function matchIdentifierAt(source, idx, name) {
    // Left boundary: reject when we would start in the middle of an identifier.
    if (idx > 0 && isIdentifierCharCode(source.charCodeAt(idx - 1))) {
        return -1;
    }
    // The name itself, compared code unit by code unit.
    let k = 0;
    while (k < name.length) {
        if (source.charCodeAt(idx + k) !== name.charCodeAt(k)) {
            return -1;
        }
        k++;
    }
    // Right boundary: reject when the identifier continues past the name.
    const after = idx + name.length;
    if (after < source.length && isIdentifierCharCode(source.charCodeAt(after))) {
        return -1;
    }
    return after;
}
/**
 * Advance past ASCII whitespace (space, tab, LF, CR) starting at `idx`.
 * Returns the index of the first non-whitespace character, or the end of
 * `source` when only whitespace remains; an `idx` at or past the end is
 * returned unchanged.
 */
function skipWhitespace(source, idx) {
    for (const len = source.length; idx < len; idx++) {
        switch (source.charCodeAt(idx)) {
            case 0x20: // space
            case 0x09: // tab
            case 0x0a: // line feed
            case 0x0d: // carriage return
                continue;
            default:
                return idx;
        }
    }
    return idx;
}
/**
 * Blank out every whole-word occurrence of the given macro names.
 *
 * For each occurrence the macro name and, when one follows (after optional
 * whitespace), its balanced `(...)` argument list are replaced by spaces in
 * `chars`. Occurrences whose argument list is unbalanced are left untouched.
 *
 * Scanning resumes AFTER an erased span. Resuming one character past the hit
 * would re-match the same macro name inside the argument list that was just
 * erased (for example inside a string argument) and pair its parenthesis with
 * unrelated text further down the file.
 *
 * @param {string} source Original text; all searching is done on it.
 * @param {string[]} chars Per-character copy of `source`; mutated in place.
 * @param {string[]} names Macro names to look for.
 * @param {boolean} requireArgs When true, an occurrence without an argument
 *   list is skipped; when false, the bare name is erased.
 */
function eraseNamedMacros(source, chars, names, requireArgs) {
    for (const name of names) {
        let searchFrom = 0;
        for (;;) {
            const hit = source.indexOf(name, searchFrom);
            if (hit < 0)
                break;
            searchFrom = hit + 1;
            const after = matchIdentifierAt(source, hit, name);
            if (after < 0)
                continue;
            const parenIdx = skipWhitespace(source, after);
            let end;
            if (source.charCodeAt(parenIdx) === 0x28) {
                const close = findMatchingParen(source, parenIdx);
                if (close < 0)
                    continue;
                end = close + 1;
            }
            else if (requireArgs) {
                continue;
            }
            else {
                end = after;
            }
            eraseRange(chars, hit, end);
            searchFrom = end;
        }
    }
}
/**
 * Strip Unreal Engine reflection macros from C++ source, length-preserving.
 *
 * Returns the original string unchanged if no strong UE marker is detected,
 * so non-UE C++ files (including ones that contain `*_API`-suffixed
 * identifiers like `REST_API` or `HTTP_API`) incur only a single regex test.
 *
 * Otherwise four kinds of tokens are replaced by spaces (line breaks are
 * kept): parenthesized reflection macros with their argument list, body
 * macros with or without an argument list, delegate declaration macros with
 * their argument list, and `*_API` export tokens. Matching is purely textual
 * on the original string, so occurrences inside comments or string literals
 * are blanked as well.
 *
 * The `_filePath` parameter is part of the `LanguageProvider.preprocessSource`
 * contract but is unused — UE detection is purely content-based. Accepted and
 * ignored here so the function matches the hook signature exactly.
 *
 * @param {string} source C++ source text.
 * @param {string} [_filePath] Ignored.
 * @returns {string} `source` itself, or a same-length copy with macros blanked.
 */
export function stripUeMacros(source, _filePath) {
    if (!HAS_UE_HINT.test(source))
        return source;
    const chars = source.split('');
    eraseNamedMacros(source, chars, PARENTHESIZED_MACROS, true);
    eraseNamedMacros(source, chars, SIMPLE_MACROS_NO_ARGS, false);
    // Delegate declarations: erase the macro name plus its argument list when
    // the list is balanced, otherwise only the name.
    DELEGATE_MACRO_RE.lastIndex = 0;
    for (let match = DELEGATE_MACRO_RE.exec(source); match !== null; match = DELEGATE_MACRO_RE.exec(source)) {
        const nameEnd = match.index + match[0].length;
        const parenIdx = skipWhitespace(source, nameEnd);
        const close = source.charCodeAt(parenIdx) === 0x28 ? findMatchingParen(source, parenIdx) : -1;
        const end = close >= 0 ? close + 1 : nameEnd;
        eraseRange(chars, match.index, end);
        // Continue after the erased span rather than inside the argument list.
        DELEGATE_MACRO_RE.lastIndex = end;
    }
    // Module export tokens (`FOO_API`): erase just the token.
    API_MACRO_RE.lastIndex = 0;
    for (let match = API_MACRO_RE.exec(source); match !== null; match = API_MACRO_RE.exec(source)) {
        eraseRange(chars, match.index, match.index + match[0].length);
    }
    return chars.join('');
}
@@ -147,9 +147,13 @@ export const processHeritage = async (graph, files, astCache, ctx, onProgress) =
147
147
  let tree = astCache.get(file.path);
148
148
  if (!tree) {
149
149
  // Use larger bufferSize for files > 32KB
150
+ // Per-language source preprocessor (length-preserving, e.g. UE macro
151
+ // stripping for C++). MUST mirror parsing-processor on cache miss so
152
+ // re-parses see the same input as the cached AST.
153
+ const parseContent = provider.preprocessSource?.(file.content, file.path) ?? file.content;
150
154
  try {
151
- tree = parser.parse(file.content, undefined, {
152
- bufferSize: getTreeSitterBufferSize(file.content),
155
+ tree = parser.parse(parseContent, undefined, {
156
+ bufferSize: getTreeSitterBufferSize(parseContent),
153
157
  });
154
158
  }
155
159
  catch (parseError) {
@@ -294,9 +298,10 @@ export async function extractExtractedHeritageFromFiles(files, astCache) {
294
298
  await loadLanguage(language, file.path);
295
299
  let tree = astCache.get(file.path);
296
300
  if (!tree) {
301
+ const parseContent = provider.preprocessSource?.(file.content, file.path) ?? file.content;
297
302
  try {
298
- tree = parser.parse(file.content, undefined, {
299
- bufferSize: getTreeSitterBufferSize(file.content),
303
+ tree = parser.parse(parseContent, undefined, {
304
+ bufferSize: getTreeSitterBufferSize(parseContent),
300
305
  });
301
306
  }
302
307
  catch {
@@ -241,9 +241,10 @@ export const processImports = async (graph, files, astCache, ctx, onProgress, re
241
241
  let tree = astCache.get(file.path);
242
242
  let wasReparsed = false;
243
243
  if (!tree) {
244
+ const parseContent = provider.preprocessSource?.(file.content, file.path) ?? file.content;
244
245
  try {
245
- tree = parser.parse(file.content, undefined, {
246
- bufferSize: getTreeSitterBufferSize(file.content),
246
+ tree = parser.parse(parseContent, undefined, {
247
+ bufferSize: getTreeSitterBufferSize(parseContent),
247
248
  });
248
249
  }
249
250
  catch (parseError) {
@@ -74,6 +74,38 @@ interface LanguageProviderConfig {
74
74
  /** Tree-sitter query strings for definitions, imports, calls, heritage.
75
75
  * Required for tree-sitter languages; empty string for standalone processors. */
76
76
  readonly treeSitterQueries: string;
77
+ /**
78
+ * Optional source-text transform that runs **before** tree-sitter parses the file.
79
+ *
80
+ * Used to elide language constructs that confuse the grammar without affecting
81
+ * source-position fidelity — e.g., Unreal Engine reflection macros (`UCLASS`,
82
+ * `UFUNCTION`, `MODULENAME_API`) in C++ headers that prevent the parser from
83
+ * recognising class/function names correctly.
84
+ *
85
+ * **Length / position preservation:** the returned string MUST have the same
86
+ * JavaScript `.length` as the input AND preserve every newline (`\n`/`\r`)
87
+ * position byte-for-byte. Implementations replace elided characters with
88
+ * ASCII spaces while leaving newlines untouched. With this contract:
89
+ *
90
+ * - tree-sitter's reported `startPosition.row`/`startPosition.column`
91
+ * match the original file exactly (line/column come from newline counts)
92
+ * - `startIndex`/`endIndex` byte offsets match the original file exactly
93
+ * **when the elided range is pure ASCII** (UTF-16 `.length` equals UTF-8
94
+ * byte length only for ASCII).
95
+ *
96
+ * Implementations targeting languages where elided ranges may contain
97
+ * non-ASCII content must therefore preserve byte length, not just `.length`,
98
+ * if downstream code uses `startIndex` to slice the original UTF-8 bytes.
99
+ * The current C++ UE-macro preprocessor relies on the practical fact that
100
+ * UE reflection macros and module-export tokens are ASCII-only.
101
+ *
102
+ * Must be a pure function — same input always yields the same output. Called
103
+ * once per file, on every code path that re-parses (parsing-processor, import
104
+ * processor, heritage processor, call processor, parse worker).
105
+ *
106
+ * Default: undefined (no preprocessing — `file.content` is parsed verbatim).
107
+ */
108
+ readonly preprocessSource?: (sourceText: string, filePath: string) => string;
77
109
  /** Type extraction: declarations, initializers, for-loop bindings */
78
110
  readonly typeConfig: LanguageTypeConfig;
79
111
  /** Export detection: is this AST node a public/exported symbol? */
@@ -36,6 +36,7 @@ import { cVariableConfig, cppVariableConfig } from '../variable-extractors/confi
36
36
  import { createCallExtractor } from '../call-extractors/generic.js';
37
37
  import { cCallConfig, cppCallConfig } from '../call-extractors/configs/c-cpp.js';
38
38
  import { createHeritageExtractor } from '../heritage-extractors/generic.js';
39
+ import { stripUeMacros } from '../cpp-ue-preprocessor.js';
39
40
  const C_BUILT_INS = new Set([
40
41
  'printf',
41
42
  'fprintf',
@@ -380,6 +381,7 @@ export const cppProvider = defineLanguage({
380
381
  },
381
382
  ],
382
383
  treeSitterQueries: CPP_QUERIES,
384
+ preprocessSource: stripUeMacros,
383
385
  typeConfig: cCppConfig,
384
386
  exportChecker: cCppExportChecker,
385
387
  importResolver: createImportResolver(cppImportConfig),
@@ -262,6 +262,10 @@ const processParsingSequential = async (graph, files, symbolTable, astCache, sco
262
262
  lineOffset = extracted.lineOffset;
263
263
  isVueSetup = extracted.isSetup;
264
264
  }
265
+ // Per-language source-text transform (e.g., UE macro stripping for C++).
266
+ // Length-preserving — see LanguageProvider.preprocessSource contract.
267
+ parseContent =
268
+ getProvider(language).preprocessSource?.(parseContent, file.path) ?? parseContent;
265
269
  try {
266
270
  await loadLanguage(language, file.path);
267
271
  }
@@ -1019,6 +1019,10 @@ const processFileGroup = (files, language, queryString, result, onFileProcessed)
1019
1019
  lineOffset = extracted.lineOffset;
1020
1020
  isVueSetup = extracted.isSetup;
1021
1021
  }
1022
+ // Per-language source-text transform (e.g., UE macro stripping for C++).
1023
+ // Length-preserving — see LanguageProvider.preprocessSource contract.
1024
+ parseContent =
1025
+ getProvider(language).preprocessSource?.(parseContent, file.path) ?? parseContent;
1022
1026
  clearCaches(); // Reset memoization before each new file
1023
1027
  let tree;
1024
1028
  try {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "gitnexus",
3
- "version": "1.6.4-rc.101",
3
+ "version": "1.6.4-rc.103",
4
4
  "description": "Graph-powered code intelligence for AI agents. Index any codebase, query via MCP or CLI.",
5
5
  "author": "Abhigyan Patwari",
6
6
  "license": "PolyForm-Noncommercial-1.0.0",