@rarusoft/dendrite-wiki 0.1.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/README.md +79 -0
  2. package/dist/api-extractor/extract.js +269 -0
  3. package/dist/api-extractor/language-extractor.js +15 -0
  4. package/dist/api-extractor/python-extractor.js +358 -0
  5. package/dist/api-extractor/render.js +195 -0
  6. package/dist/api-extractor/tree-sitter-extractor.js +1079 -0
  7. package/dist/api-extractor/types.js +11 -0
  8. package/dist/api-extractor/typescript-extractor.js +50 -0
  9. package/dist/api-extractor/walk.js +178 -0
  10. package/dist/api-reference.js +438 -0
  11. package/dist/benchmark-events.js +129 -0
  12. package/dist/benchmark.js +270 -0
  13. package/dist/binder-export.js +381 -0
  14. package/dist/canonical-target.js +168 -0
  15. package/dist/chart-insert.js +377 -0
  16. package/dist/chart-prompts.js +414 -0
  17. package/dist/context-cache.js +98 -0
  18. package/dist/contradicts-shipped-memory.js +232 -0
  19. package/dist/diff-context.js +142 -0
  20. package/dist/doctor.js +220 -0
  21. package/dist/generated-docs.js +219 -0
  22. package/dist/i18n.js +71 -0
  23. package/dist/index.js +49 -0
  24. package/dist/librarian.js +255 -0
  25. package/dist/maintenance-actions.js +244 -0
  26. package/dist/maintenance-inbox.js +842 -0
  27. package/dist/maintenance-runner.js +62 -0
  28. package/dist/page-drift.js +225 -0
  29. package/dist/page-inbox.js +168 -0
  30. package/dist/report-export.js +339 -0
  31. package/dist/review-bridge.js +1386 -0
  32. package/dist/search-index.js +199 -0
  33. package/dist/store.js +1617 -0
  34. package/dist/telemetry-defaults.js +44 -0
  35. package/dist/telemetry-report.js +263 -0
  36. package/dist/telemetry.js +544 -0
  37. package/dist/wiki-synthesis.js +901 -0
  38. package/package.json +35 -0
  39. package/src/api-extractor/extract.ts +333 -0
  40. package/src/api-extractor/language-extractor.ts +37 -0
  41. package/src/api-extractor/python-extractor.ts +380 -0
  42. package/src/api-extractor/render.ts +267 -0
  43. package/src/api-extractor/tree-sitter-extractor.ts +1210 -0
  44. package/src/api-extractor/types.ts +41 -0
  45. package/src/api-extractor/typescript-extractor.ts +56 -0
  46. package/src/api-extractor/walk.ts +209 -0
  47. package/src/api-reference.ts +552 -0
  48. package/src/benchmark-events.ts +216 -0
  49. package/src/benchmark.ts +376 -0
  50. package/src/binder-export.ts +437 -0
  51. package/src/canonical-target.ts +192 -0
  52. package/src/chart-insert.ts +478 -0
  53. package/src/chart-prompts.ts +417 -0
  54. package/src/context-cache.ts +129 -0
  55. package/src/contradicts-shipped-memory.ts +311 -0
  56. package/src/diff-context.ts +187 -0
  57. package/src/doctor.ts +260 -0
  58. package/src/generated-docs.ts +316 -0
  59. package/src/i18n.ts +106 -0
  60. package/src/index.ts +59 -0
  61. package/src/librarian.ts +331 -0
  62. package/src/maintenance-actions.ts +314 -0
  63. package/src/maintenance-inbox.ts +1132 -0
  64. package/src/maintenance-runner.ts +85 -0
  65. package/src/page-drift.ts +292 -0
  66. package/src/page-inbox.ts +254 -0
  67. package/src/report-export.ts +392 -0
  68. package/src/review-bridge.ts +1729 -0
  69. package/src/search-index.ts +266 -0
  70. package/src/store.ts +2171 -0
  71. package/src/telemetry-defaults.ts +50 -0
  72. package/src/telemetry-report.ts +365 -0
  73. package/src/telemetry.ts +757 -0
  74. package/src/wiki-synthesis.ts +1307 -0
@@ -0,0 +1,1079 @@
1
+ /**
2
+ * Generic `LanguageExtractor` powered by tree-sitter — the long-tail language layer.
3
+ *
4
+ * Where `typescript-extractor.ts` and `python-extractor.ts` are handcrafted for top-traffic
5
+ * languages with first-class compiler/AST surfaces, this module covers the long tail
6
+ * (Rust, Go, Java, Ruby, C, C++, PHP, C#, Swift, Lua, Scala, Elixir, OCaml, Kotlin, and Bash) via tree-sitter's portable WASM grammars
7
+ * and each grammar's upstream `queries/tags.scm` file. Every supported language lives as a
8
+ * single config-table entry — extension, vendored WASM path, vendored tags.scm path, a
9
+ * public-symbol predicate, a doc-comment association rule. Adding another language is a
10
+ * config addition, not a new module.
11
+ *
12
+ * Rationale (Phase B1 of the API reference roadmap): the per-language handcrafted path
13
+ * doesn't scale. GitHub's stack-graphs project — their multi-year attempt at bespoke
14
+ * per-language indexers — was archived in September 2025; even GitHub couldn't sustain it.
15
+ * Tree-sitter `tags.scm` is the durable middle tier the industry settled on. Output
16
+ * quality matches roughly what our handcrafted Python extractor produces (signatures with
17
+ * types-as-written, doc comments as prose), which is the bar for "binder-on-shelf"
18
+ * presentability.
19
+ *
20
+ * Determinism: parse trees change between grammar versions, so each vendored grammar is
21
+ * pinned by upstream tag and sha256 (recorded in `NOTICE` at the repo root).
22
+ * Same `(web-tree-sitter version, grammar tag, tags.scm sha256)` triple = same parse tree
23
+ * across machines. WASM grammars lazy-load on first use so projects that never touch a
24
+ * given language never pay its load cost.
25
+ */
26
+ import { existsSync } from 'node:fs';
27
+ import { promises as fs } from 'node:fs';
28
+ import path from 'node:path';
29
+ import { fileURLToPath } from 'node:url';
30
+ import { Language, Parser, Query } from 'web-tree-sitter';
31
+ import { walkProjectSources } from './walk.js';
32
/**
 * Visibility predicate for Rust definitions.
 *
 * The first `visibility_modifier` named child decides the outcome: any modifier whose text
 * begins with `pub` (`pub`, `pub(crate)`, `pub(super)`, `pub(in path)`) counts as public.
 * A definition without a visibility modifier is not public.
 *
 * @param {object} definitionNode - Captured definition node (uses `namedChildCount` / `namedChild`).
 * @param {string} _source - Unused.
 * @param {string} _name - Unused.
 * @returns {boolean} Whether the definition carries a `pub`-prefixed visibility modifier.
 */
function rustIsPublic(definitionNode, _source, _name) {
  const total = definitionNode.namedChildCount;
  let index = 0;
  while (index < total) {
    const candidate = definitionNode.namedChild(index);
    index += 1;
    if (!candidate || candidate.type !== 'visibility_modifier') {
      continue;
    }
    // Only the first visibility modifier is consulted.
    return candidate.text.startsWith('pub');
  }
  return false;
}
43
// Rust language entry: extensions, project marker, walk globs, tag-capture mapping,
// doc-comment prefixes, body node types, and the visibility predicate.
const RUST_CONFIG = {
  id: 'rust',
  extensions: ['.rs'],
  projectSignals: ['Cargo.toml'],
  vendorSubdir: 'rust',
  walkOptions: {
    include: [
      'src/**/*.rs',
      'examples/**/*.rs',
      'lib.rs',
      'main.rs'
    ],
    exclude: [
      '**/target/**',
      '**/tests/**',
      '**/*_test.rs',
      '**/build.rs',
      '**/node_modules/**'
    ],
    respectInternalConvention: false
  },
  // Captures produced by Rust's tags.scm and how they are treated here:
  //   @definition.class     (struct / enum / union / type)  -> class
  //   @definition.interface (trait)                         -> interface
  //   @definition.function  (function)                      -> function
  //   @definition.method    (method inside an impl block)   -> function
  //   @definition.module / @definition.macro                -> not mapped, so dropped
  captureKindMap: {
    'definition.class': 'class',
    'definition.interface': 'interface',
    'definition.function': 'function',
    'definition.method': 'function'
  },
  docComment: {
    // Only outer doc lines (`///`) attach to a definition. Inner doc (`//!`) is
    // module-level and is surfaced through fileDocComment instead.
    linePrefixes: ['///']
  },
  bodyNodeTypes: new Set([
    'block',
    'field_declaration_list',
    'declaration_list',
    'enum_variant_list',
    'trait_block'
  ]),
  isPublic: rustIsPublic
};
74
+ // --- Go --------------------------------------------------------------------
75
/**
 * Visibility predicate for Go definitions.
 *
 * Go exports an identifier when its first character is a Unicode uppercase letter
 * (category Lu). The test is anchored on the first code point rather than the first
 * UTF-16 code unit, so uppercase letters outside the Basic Multilingual Plane are
 * recognised as well.
 *
 * @param {object} _definitionNode - Unused.
 * @param {string} _source - Unused.
 * @param {string} name - Identifier name of the captured definition.
 * @returns {boolean} True when `name` starts with an uppercase (Lu) letter; false for
 *   an empty name or any other leading character (including `_` and digits).
 */
function goIsPublic(_definitionNode, _source, name) {
  // The `u` flag makes `^` + `\p{Lu}` consume a whole code point, surrogate pairs included.
  return /^\p{Lu}/u.test(name);
}
80
// Go language entry: extensions, project marker, walk globs, tag-capture mapping,
// doc-comment prefix, body node types, and the exported-identifier predicate.
const GO_CONFIG = {
  id: 'go',
  extensions: ['.go'],
  projectSignals: ['go.mod'],
  vendorSubdir: 'go',
  walkOptions: {
    include: ['**/*.go'],
    exclude: [
      '**/*_test.go',
      '**/vendor/**',
      '**/node_modules/**'
    ],
    respectInternalConvention: false
  },
  captureKindMap: {
    'definition.function': 'function',
    'definition.method': 'function',
    // The Go grammar reports every type_spec (struct, interface, alias, named type)
    // as `definition.type`; all of them are presented as `class`, i.e. "a type
    // defined in this package".
    'definition.type': 'class'
  },
  docComment: {
    // Go doc comments are ordinary `//` comments placed directly above the
    // declaration; there is no dedicated doc prefix.
    linePrefixes: ['//']
  },
  bodyNodeTypes: new Set([
    'block',
    'field_declaration_list',
    'method_spec_list',
    'interface_type',
    'struct_type'
  ]),
  isPublic: goIsPublic
};
107
+ // --- Java ------------------------------------------------------------------
108
/**
 * Visibility predicate for Java definitions.
 *
 * Requires the word `public` inside the declaration's first `modifiers` child.
 * Declarations with no `modifiers` child (package-private) and those marked
 * `protected` / `private` are left out of the generated reference.
 *
 * @param {object} definitionNode - Captured declaration node.
 * @param {string} _source - Unused.
 * @param {string} _name - Unused.
 * @returns {boolean} True only when the first `modifiers` child contains `public`.
 */
function javaIsPublic(definitionNode, _source, _name) {
  const total = definitionNode.namedChildCount;
  let index = 0;
  while (index < total) {
    const candidate = definitionNode.namedChild(index);
    index += 1;
    if (!candidate || candidate.type !== 'modifiers') {
      continue;
    }
    // The first modifiers list settles the answer.
    return /\bpublic\b/.test(candidate.text);
  }
  return false;
}
121
// Java language entry: extensions, build-tool markers, walk globs, tag-capture mapping,
// Javadoc delimiters, body node types, and the visibility predicate.
const JAVA_CONFIG = {
  id: 'java',
  extensions: ['.java'],
  projectSignals: [
    'pom.xml',
    'build.gradle',
    'build.gradle.kts',
    'settings.gradle',
    'settings.gradle.kts'
  ],
  vendorSubdir: 'java',
  walkOptions: {
    include: ['src/**/*.java', '**/*.java'],
    exclude: [
      '**/test/**',
      '**/tests/**',
      '**/build/**',
      '**/target/**',
      '**/.gradle/**',
      '**/node_modules/**'
    ],
    respectInternalConvention: false
  },
  captureKindMap: {
    'definition.class': 'class',
    'definition.interface': 'interface',
    'definition.method': 'function'
  },
  docComment: {
    // Javadoc is block-only, so there are no line prefixes. The renderer's block
    // path strips the leading `* ` from interior lines.
    linePrefixes: [],
    blockOpen: '/**',
    blockClose: '*/'
  },
  bodyNodeTypes: new Set([
    'class_body',
    'interface_body',
    'block',
    'enum_body',
    'annotation_type_body'
  ]),
  isPublic: javaIsPublic
};
146
+ // --- Ruby ------------------------------------------------------------------
147
/**
 * Visibility predicate for Ruby definitions.
 *
 * Ruby visibility is section-based (`private` / `protected` calls inside a class body),
 * and tracking that state would require walking the enclosing class body. This first cut
 * does not do so: every captured definition is reported as public, knowingly
 * over-including methods that follow a `private` / `protected` section marker.
 *
 * @param {object} _definitionNode - Unused.
 * @param {string} _source - Unused.
 * @param {string} _name - Unused.
 * @returns {boolean} Always `true`.
 */
function rubyIsPublic(_definitionNode, _source, _name) {
  return true;
}
157
// Ruby language entry: extensions, project markers, walk globs, tag-capture mapping,
// comment prefix, body node types, and the visibility predicate.
const RUBY_CONFIG = {
  id: 'ruby',
  extensions: ['.rb'],
  projectSignals: ['Gemfile', 'Rakefile'],
  vendorSubdir: 'ruby',
  walkOptions: {
    include: [
      'lib/**/*.rb',
      'app/**/*.rb',
      '**/*.rb'
    ],
    exclude: [
      '**/spec/**',
      '**/test/**',
      '**/vendor/**',
      '**/node_modules/**',
      '**/tmp/**'
    ],
    respectInternalConvention: false
  },
  captureKindMap: {
    'definition.class': 'class',
    'definition.module': 'class',
    'definition.method': 'function'
  },
  docComment: {
    // RDoc and YARD attach a contiguous run of `#` comment lines to the declaration
    // that follows it.
    linePrefixes: ['#']
  },
  bodyNodeTypes: new Set(['body_statement', 'do_block']),
  isPublic: rubyIsPublic
};
180
+ // --- C ---------------------------------------------------------------------
181
/**
 * Reports whether a node has a direct `storage_class_specifier` named child whose text
 * contains `static`.
 *
 * @param {object} node - Syntax node exposing `namedChildCount` and `namedChild(i)`.
 * @returns {boolean} True when such a child exists.
 */
function hasStaticStorageClass(node) {
  const total = node.namedChildCount;
  let position = 0;
  while (position < total) {
    const entry = node.namedChild(position);
    position += 1;
    if (!entry || entry.type !== 'storage_class_specifier') {
      continue;
    }
    if (entry.text.includes('static')) {
      return true;
    }
  }
  return false;
}
190
/**
 * Visibility predicate for C definitions.
 *
 * C has no access modifiers, so "public" here means external linkage: the declaration is
 * not marked `static`. The tags query captures a `function_declarator`, while the
 * `storage_class_specifier` sits on the wrapping declaration, so both the captured node
 * and its parent are inspected.
 *
 * @param {object} definitionNode - Captured definition node (its `parent` may be null).
 * @param {string} _source - Unused.
 * @param {string} _name - Unused.
 * @returns {boolean} False when the node or its parent carries a `static` storage class.
 */
function cIsPublic(definitionNode, _source, _name) {
  const staticOnNode = hasStaticStorageClass(definitionNode);
  if (staticOnNode) {
    return false;
  }
  const wrapper = definitionNode.parent;
  const staticOnWrapper = Boolean(wrapper) && hasStaticStorageClass(wrapper);
  return !staticOnWrapper;
}
202
// C language entry: extensions, build-system markers, walk globs, tag-capture mapping,
// Doxygen-style doc markers, body node types, and the linkage-based visibility predicate.
const C_CONFIG = {
  id: 'c',
  extensions: ['.c', '.h'],
  projectSignals: [
    'Makefile',
    'CMakeLists.txt',
    'meson.build',
    'configure.ac'
  ],
  vendorSubdir: 'c',
  walkOptions: {
    include: ['**/*.h', '**/*.c'],
    exclude: [
      '**/build/**',
      '**/cmake-build-*/**',
      '**/.deps/**',
      '**/node_modules/**'
    ],
    respectInternalConvention: false
  },
  captureKindMap: {
    // struct / union
    'definition.class': 'class',
    'definition.function': 'function',
    // typedef / enum
    'definition.type': 'type-alias'
  },
  docComment: {
    // Doxygen style: both `///` lines and Javadoc-shaped blocks are treated as
    // documentation.
    linePrefixes: ['///'],
    blockOpen: '/**',
    blockClose: '*/'
  },
  bodyNodeTypes: new Set([
    'compound_statement',
    'field_declaration_list',
    'enumerator_list'
  ]),
  isPublic: cIsPublic
};
227
+ // --- C++ -------------------------------------------------------------------
228
/**
 * Visibility predicate for C++ definitions.
 *
 * Applies the same rule as C: a definition is hidden only when it, or its parent node,
 * carries a `static` storage class. Class-member access sections (`public:` /
 * `private:` / `protected:`) are not tracked in this first cut, so class members are
 * over-included.
 *
 * @param {object} definitionNode - Captured definition node (its `parent` may be null).
 * @param {string} _source - Unused.
 * @param {string} _name - Unused.
 * @returns {boolean} False when the node or its parent carries a `static` storage class.
 */
function cppIsPublic(definitionNode, _source, _name) {
  const staticOnNode = hasStaticStorageClass(definitionNode);
  if (staticOnNode) {
    return false;
  }
  const wrapper = definitionNode.parent;
  const staticOnWrapper = Boolean(wrapper) && hasStaticStorageClass(wrapper);
  return !staticOnWrapper;
}
242
// C++ language entry: extensions, build-system markers, walk globs, tag-capture mapping,
// doc markers, body node types, and the linkage-based visibility predicate.
const CPP_CONFIG = {
  id: 'cpp',
  extensions: [
    '.cpp',
    '.cc',
    '.cxx',
    '.hpp',
    '.hh',
    '.hxx',
    '.h'
  ],
  projectSignals: [
    'CMakeLists.txt',
    'Makefile',
    'meson.build',
    'conanfile.txt',
    'conanfile.py'
  ],
  vendorSubdir: 'cpp',
  walkOptions: {
    // Header globs are listed ahead of implementation-file globs.
    include: [
      '**/*.hpp',
      '**/*.hh',
      '**/*.hxx',
      '**/*.h',
      '**/*.cpp',
      '**/*.cc',
      '**/*.cxx'
    ],
    exclude: [
      '**/build/**',
      '**/cmake-build-*/**',
      '**/.deps/**',
      '**/node_modules/**'
    ],
    respectInternalConvention: false
  },
  captureKindMap: {
    'definition.class': 'class',
    'definition.function': 'function',
    'definition.method': 'function',
    'definition.type': 'type-alias'
  },
  docComment: {
    // `///` lines and Javadoc-shaped blocks both count as documentation.
    linePrefixes: ['///'],
    blockOpen: '/**',
    blockClose: '*/'
  },
  bodyNodeTypes: new Set([
    'compound_statement',
    'field_declaration_list',
    'enumerator_list',
    'namespace_body'
  ]),
  isPublic: cppIsPublic
};
266
+ // --- PHP -------------------------------------------------------------------
267
/**
 * Visibility predicate for PHP definitions.
 *
 * PHP members are public unless stated otherwise, so a definition is excluded only when
 * one of its `visibility_modifier` / `modifiers` children mentions `private` or
 * `protected`. No modifier, or an explicit `public`, keeps it included.
 *
 * @param {object} definitionNode - Captured declaration node.
 * @param {string} _source - Unused.
 * @param {string} _name - Unused.
 * @returns {boolean} False when a restricting modifier is present, true otherwise.
 */
function phpIsPublic(definitionNode, _source, _name) {
  const modifierTypes = ['visibility_modifier', 'modifiers'];
  const total = definitionNode.namedChildCount;
  for (let position = 0; position < total; position += 1) {
    const entry = definitionNode.namedChild(position);
    if (!entry || !modifierTypes.includes(entry.type)) {
      continue;
    }
    const restricted = /\b(private|protected)\b/.test(entry.text);
    if (restricted) {
      return false;
    }
  }
  return true;
}
282
// PHP language entry: extension, Composer marker, walk globs, tag-capture mapping,
// PHPDoc delimiters, body node types, and the visibility predicate.
const PHP_CONFIG = {
  id: 'php',
  extensions: ['.php'],
  projectSignals: ['composer.json'],
  vendorSubdir: 'php',
  walkOptions: {
    include: [
      'src/**/*.php',
      'lib/**/*.php',
      '**/*.php'
    ],
    exclude: [
      '**/vendor/**',
      '**/tests/**',
      '**/Tests/**',
      '**/node_modules/**',
      '**/.phpunit.cache/**'
    ],
    respectInternalConvention: false
  },
  captureKindMap: {
    'definition.class': 'class',
    // PHP's tags.scm reports both `interface` and `trait` as definition.interface;
    // rendering either as an interface-like surface is reasonable.
    'definition.interface': 'interface',
    'definition.function': 'function'
  },
  docComment: {
    // PHPDoc uses the same block shape as Javadoc, so no line prefixes are needed.
    linePrefixes: [],
    blockOpen: '/**',
    blockClose: '*/'
  },
  bodyNodeTypes: new Set([
    'compound_statement',
    'declaration_list',
    'enum_declaration_list'
  ]),
  isPublic: phpIsPublic
};
308
+ // --- C# --------------------------------------------------------------------
309
/**
 * Visibility predicate for C# definitions.
 *
 * C# defaults to `private` for members and `internal` for top-level types, so only an
 * explicit `public` counts. The check looks for a direct `modifier` child whose text is
 * exactly `public`.
 *
 * @param {object} definitionNode - Captured declaration node.
 * @param {string} _source - Unused.
 * @param {string} _name - Unused.
 * @returns {boolean} True when a `public` modifier child is present.
 */
function csharpIsPublic(definitionNode, _source, _name) {
  const total = definitionNode.namedChildCount;
  let position = 0;
  while (position < total) {
    const entry = definitionNode.namedChild(position);
    position += 1;
    if (!entry) {
      continue;
    }
    const isPublicModifier = entry.type === 'modifier' && entry.text === 'public';
    if (isPublicModifier) {
      return true;
    }
  }
  return false;
}
322
// C# language entry: extension, project markers, WASM filename override, walk globs,
// tag-capture mapping, doc markers, body node types, and the visibility predicate.
const CSHARP_CONFIG = {
  id: 'csharp',
  extensions: ['.cs'],
  projectSignals: [
    'global.json',
    'Directory.Build.props',
    'Directory.Build.targets'
  ],
  vendorSubdir: 'csharp',
  // The published C# grammar WASM is named with an underscore (`c_sharp`) rather than
  // a hyphen, so the filename is spelled out explicitly here.
  wasmFilename: 'tree-sitter-c_sharp.wasm',
  walkOptions: {
    include: ['**/*.cs'],
    exclude: [
      '**/bin/**',
      '**/obj/**',
      '**/Tests/**',
      '**/*.Tests/**',
      '**/node_modules/**'
    ],
    respectInternalConvention: false
  },
  captureKindMap: {
    'definition.class': 'class',
    'definition.interface': 'interface',
    'definition.method': 'function'
  },
  docComment: {
    // XML-doc comments use `///` lines; block-style doc comments also appear in some
    // codebases, so both forms are accepted.
    linePrefixes: ['///'],
    blockOpen: '/**',
    blockClose: '*/'
  },
  bodyNodeTypes: new Set([
    'declaration_list',
    'block',
    'enum_member_declaration_list'
  ]),
  isPublic: csharpIsPublic
};
350
+ // --- Swift -----------------------------------------------------------------
351
/**
 * Visibility predicate for Swift definitions.
 *
 * Swift's default access level is `internal`, so only `public` and `open` are shown in
 * the API reference. Children are scanned in order. A `modifiers` list is matched by
 * keyword (with `public` / `open` tested before the restricting keywords), and a bare
 * `visibility_modifier` child is matched by its exact text. The first child that yields
 * a verdict wins; children with no access keyword are skipped.
 *
 * @param {object} definitionNode - Captured declaration node.
 * @param {string} _source - Unused.
 * @param {string} _name - Unused.
 * @returns {boolean} True for `public` / `open`; false for restricted or unmarked definitions.
 */
function swiftIsPublic(definitionNode, _source, _name) {
  const exportedWords = ['public', 'open'];
  const restrictedWords = ['private', 'fileprivate', 'internal'];
  const total = definitionNode.namedChildCount;
  for (let position = 0; position < total; position += 1) {
    const entry = definitionNode.namedChild(position);
    if (!entry) {
      continue;
    }
    switch (entry.type) {
      case 'modifiers': {
        const listText = entry.text;
        if (/\b(public|open)\b/.test(listText)) {
          return true;
        }
        if (/\b(private|fileprivate|internal)\b/.test(listText)) {
          return false;
        }
        break;
      }
      case 'visibility_modifier': {
        const word = entry.text;
        if (exportedWords.includes(word)) {
          return true;
        }
        if (restrictedWords.includes(word)) {
          return false;
        }
        break;
      }
      default:
        break;
    }
  }
  // Nothing explicit was found, so the `internal` default applies.
  return false;
}
382
// Swift language entry: extension, project markers, walk globs, tag-capture mapping,
// doc markers, body node types, and the visibility predicate.
const SWIFT_CONFIG = {
  id: 'swift',
  extensions: ['.swift'],
  projectSignals: [
    'Package.swift',
    'Podfile',
    'project.yml'
  ],
  vendorSubdir: 'swift',
  walkOptions: {
    include: ['Sources/**/*.swift', '**/*.swift'],
    exclude: [
      '**/Tests/**',
      '**/.build/**',
      '**/Pods/**',
      '**/DerivedData/**',
      '**/node_modules/**'
    ],
    respectInternalConvention: false
  },
  captureKindMap: {
    'definition.class': 'class',
    // A Swift `protocol` is the nearest match for an interface.
    'definition.interface': 'interface',
    'definition.method': 'function',
    'definition.function': 'function',
    'definition.property': 'variable'
  },
  docComment: {
    // Swift documentation is written as `///` lines or as doc blocks.
    linePrefixes: ['///'],
    blockOpen: '/**',
    blockClose: '*/'
  },
  bodyNodeTypes: new Set([
    'class_body',
    'protocol_body',
    'function_body'
  ]),
  isPublic: swiftIsPublic
};
409
+ // --- Lua -------------------------------------------------------------------
410
/**
 * Visibility predicate for Lua definitions.
 *
 * Lua has no access modifiers; the `local` keyword is the convention for private
 * definitions. Two checks are made: whether the captured node's own text begins with
 * `local`, and, as a fallback, whether the up-to-32 characters of source just before the
 * node end in `local` followed by whitespace.
 *
 * @param {object} definitionNode - Captured node (uses `text` and `startIndex`).
 * @param {string} source - Full source text of the file.
 * @param {string} _name - Unused.
 * @returns {boolean} False for `local` definitions, true otherwise.
 */
function luaIsPublic(definitionNode, source, _name) {
  const startsWithLocal = /^\s*local\b/.test(definitionNode.text);
  if (startsWithLocal) {
    return false;
  }
  // Fallback for a parse tree that leaves the `local` keyword outside the captured node.
  const nodeStart = definitionNode.startIndex;
  const windowStart = Math.max(0, nodeStart - 32);
  const preceding = source.slice(windowStart, nodeStart);
  return !/\blocal\s+$/.test(preceding);
}
428
// Lua language entry: extension, project markers, walk globs, tag-capture mapping,
// comment prefixes, body node types, and the `local`-based visibility predicate.
const LUA_CONFIG = {
  id: 'lua',
  extensions: ['.lua'],
  // Lua lacks a canonical project file. The markers used are `init.lua` (the Neovim
  // plugin convention) and the LuaRocks `.luarocks` directory.
  projectSignals: ['init.lua', '.luarocks'],
  vendorSubdir: 'lua',
  walkOptions: {
    include: [
      'lua/**/*.lua',
      'src/**/*.lua',
      '**/*.lua'
    ],
    exclude: [
      '**/spec/**',
      '**/test/**',
      '**/.luarocks/**',
      '**/node_modules/**'
    ],
    respectInternalConvention: false
  },
  captureKindMap: {
    'definition.function': 'function',
    'definition.method': 'function'
  },
  docComment: {
    // `--` is the plain line comment and `---` is the LDoc doc-comment marker. The
    // longer prefix is listed first so it takes precedence when both would match.
    linePrefixes: ['---', '--']
  },
  bodyNodeTypes: new Set(['block']),
  isPublic: luaIsPublic
};
454
+ // --- Scala -----------------------------------------------------------------
455
/**
 * Visibility predicate for Scala definitions.
 *
 * Scala members are public by default, so a definition is excluded only when one of its
 * `modifiers`, `access_modifier`, or `modifier` children mentions `private` or
 * `protected`.
 *
 * @param {object} definitionNode - Captured declaration node.
 * @param {string} _source - Unused.
 * @param {string} _name - Unused.
 * @returns {boolean} False when a restricting modifier is present, true otherwise.
 */
function scalaIsPublic(definitionNode, _source, _name) {
  const modifierTypes = ['modifiers', 'access_modifier', 'modifier'];
  const total = definitionNode.namedChildCount;
  let position = 0;
  while (position < total) {
    const entry = definitionNode.namedChild(position);
    position += 1;
    if (!entry || !modifierTypes.includes(entry.type)) {
      continue;
    }
    if (/\b(private|protected)\b/.test(entry.text)) {
      return false;
    }
  }
  return true;
}
471
// Scala language entry: extensions, build-tool markers, walk globs, tag-capture mapping,
// Scaladoc delimiters, body node types, and the visibility predicate.
const SCALA_CONFIG = {
  id: 'scala',
  extensions: ['.scala', '.sc'],
  projectSignals: [
    'build.sbt',
    'build.sc',
    'pom.xml'
  ],
  vendorSubdir: 'scala',
  walkOptions: {
    include: ['src/**/*.scala', '**/*.scala'],
    exclude: [
      '**/test/**',
      '**/target/**',
      '**/.bloop/**',
      '**/.metals/**',
      '**/node_modules/**'
    ],
    respectInternalConvention: false
  },
  captureKindMap: {
    'definition.class': 'class',
    // trait
    'definition.interface': 'interface',
    'definition.enum': 'enum',
    'definition.function': 'function',
    // singleton object: `class` is the closest existing kind
    'definition.object': 'class'
  },
  docComment: {
    // Scaladoc shares the Javadoc block shape, so no line prefixes are configured.
    linePrefixes: [],
    blockOpen: '/**',
    blockClose: '*/'
  },
  bodyNodeTypes: new Set([
    'template_body',
    'block',
    'class_parameters'
  ]),
  isPublic: scalaIsPublic
};
497
+ // --- Elixir ----------------------------------------------------------------
498
/**
 * Visibility predicate for Elixir definitions.
 *
 * Elixir marks private definitions with a `p`-suffixed defining keyword: `defp`,
 * `defmacrop`, `defguardp`, `defnp`. A definition is treated as private when that
 * keyword is either
 *   - the first token of the captured node, or
 *   - the first token on the node's own source line (covers a captured node that
 *     starts after the keyword).
 *
 * The lookup never crosses a line boundary, so a private one-liner on the previous line
 * cannot leak onto the public definition that follows it. The forward slice is long
 * enough to hold the longest keyword (9 characters) plus its terminating boundary.
 *
 * @param {object} definitionNode - Captured definition node (uses `startIndex`).
 * @param {string} source - Full source text of the file.
 * @param {string} _name - Unused.
 * @returns {boolean} False for `defp` / `defmacrop` / `defguardp` / `defnp` definitions,
 *   true otherwise.
 */
function elixirIsPublic(definitionNode, source, _name) {
  const privateKeyword = /^(?:defp|defmacrop|defguardp|defnp)\b/;
  const nodeStart = definitionNode.startIndex;

  // Keyword at the very start of the captured node.
  const nodeHead = source.slice(nodeStart, nodeStart + 16).trimStart();
  if (privateKeyword.test(nodeHead)) {
    return false;
  }

  // Keyword opening the line on which the captured node begins.
  const lineStart = source.lastIndexOf('\n', nodeStart - 1) + 1;
  const linePrefix = source.slice(lineStart, nodeStart).trimStart();
  return !privateKeyword.test(linePrefix);
}
514
// Elixir language entry: extensions, Mix marker, walk globs, tag-capture mapping,
// comment prefix, body node types, and the keyword-based visibility predicate.
const ELIXIR_CONFIG = {
  id: 'elixir',
  extensions: ['.ex', '.exs'],
  projectSignals: ['mix.exs'],
  vendorSubdir: 'elixir',
  walkOptions: {
    include: ['lib/**/*.ex', '**/*.ex'],
    exclude: [
      '**/test/**',
      '**/_build/**',
      '**/deps/**',
      '**/.elixir_ls/**',
      '**/node_modules/**'
    ],
    respectInternalConvention: false
  },
  captureKindMap: {
    'definition.module': 'class',
    'definition.function': 'function'
  },
  docComment: {
    // Only `#`-prefixed comment lines are picked up. Prose held in an `@doc` heredoc
    // would need extractor-level support that this first cut does not have.
    linePrefixes: ['#']
  },
  bodyNodeTypes: new Set(['do_block', 'block']),
  isPublic: elixirIsPublic
};
538
+ // --- OCaml -----------------------------------------------------------------
539
/**
 * Visibility predicate for OCaml definitions.
 *
 * OCaml visibility is governed by module signatures (`.mli` files); within a `.ml` file
 * everything is reachable unless a signature hides it. Signature files are not consulted
 * in this first cut, so every captured definition is reported as public.
 *
 * @param {object} _definitionNode - Unused.
 * @param {string} _source - Unused.
 * @param {string} _name - Unused.
 * @returns {boolean} Always `true`.
 */
function ocamlIsPublic(_definitionNode, _source, _name) {
  return true;
}
547
// OCaml language entry: extensions, Dune / OASIS markers, walk globs, tag-capture
// mapping, doc-block delimiters, body node types, and the visibility predicate.
const OCAML_CONFIG = {
  id: 'ocaml',
  extensions: ['.ml', '.mli'],
  projectSignals: [
    'dune-project',
    'dune',
    '_oasis'
  ],
  vendorSubdir: 'ocaml',
  walkOptions: {
    include: ['**/*.ml', '**/*.mli'],
    exclude: [
      '**/_build/**',
      '**/.merlin',
      '**/node_modules/**'
    ],
    respectInternalConvention: false
  },
  captureKindMap: {
    // An OCaml module is the closest analogue to a class.
    'definition.module': 'class',
    'definition.interface': 'interface',
    'definition.class': 'class',
    'definition.function': 'function',
    'definition.method': 'function'
  },
  docComment: {
    // Doc comments are `(** ... *)` blocks; the doubled asterisk sets them apart from
    // ordinary `(* ... *)` comments.
    linePrefixes: [],
    blockOpen: '(**',
    blockClose: '*)'
  },
  bodyNodeTypes: new Set([
    'structure',
    'signature',
    'module_binding'
  ]),
  isPublic: ocamlIsPublic
};
574
+ // --- Kotlin ----------------------------------------------------------------
575
/**
 * Visibility predicate for Kotlin definitions.
 *
 * Kotlin declarations are public by default, so a definition is excluded only when one
 * of its `modifiers`, `modifier`, or `visibility_modifier` children mentions `private`,
 * `protected`, or `internal`.
 *
 * @param {object} definitionNode - Captured declaration node.
 * @param {string} _source - Unused.
 * @param {string} _name - Unused.
 * @returns {boolean} False when a restricting modifier is present, true otherwise.
 */
function kotlinIsPublic(definitionNode, _source, _name) {
  const modifierTypes = ['modifiers', 'modifier', 'visibility_modifier'];
  const total = definitionNode.namedChildCount;
  let position = 0;
  while (position < total) {
    const entry = definitionNode.namedChild(position);
    position += 1;
    if (!entry || !modifierTypes.includes(entry.type)) {
      continue;
    }
    if (/\b(private|protected|internal)\b/.test(entry.text)) {
      return false;
    }
  }
  return true;
}
591
// Kotlin language entry: extensions, build-tool markers, walk globs, tag-capture mapping,
// KDoc delimiters, body node types, and the visibility predicate.
const KOTLIN_CONFIG = {
  id: 'kotlin',
  extensions: ['.kt', '.kts'],
  projectSignals: [
    'build.gradle.kts',
    'settings.gradle.kts',
    'build.gradle',
    'pom.xml'
  ],
  vendorSubdir: 'kotlin',
  walkOptions: {
    include: [
      'src/**/*.kt',
      'src/**/*.kts',
      '**/*.kt',
      '**/*.kts'
    ],
    exclude: [
      '**/test/**',
      '**/build/**',
      '**/.gradle/**',
      '**/node_modules/**'
    ],
    respectInternalConvention: false
  },
  captureKindMap: {
    'definition.class': 'class',
    'definition.function': 'function'
  },
  docComment: {
    // KDoc shares the Javadoc block shape, so no line prefixes are configured.
    linePrefixes: [],
    blockOpen: '/**',
    blockClose: '*/'
  },
  bodyNodeTypes: new Set([
    'class_body',
    'function_body',
    'enum_class_body',
    'block'
  ]),
  isPublic: kotlinIsPublic
};
614
+ // --- Bash ------------------------------------------------------------------
615
/**
 * Visibility predicate for Bash definitions.
 *
 * Bash has no visibility concept: any function defined in a script can be called from
 * the same shell, so every captured function is reported as public.
 *
 * @param {object} _definitionNode - Unused.
 * @param {string} _source - Unused.
 * @param {string} _name - Unused.
 * @returns {boolean} Always `true`.
 */
function bashIsPublic(_definitionNode, _source, _name) {
  return true;
}
620
// Bash language entry: extensions, extension-presence detection, walk globs,
// tag-capture mapping, comment prefix, body node types, and the visibility predicate.
const BASH_CONFIG = {
  id: 'bash',
  extensions: ['.sh', '.bash'],
  // Shell projects have no canonical marker file, so no signals are listed. Detection
  // instead relies on finding at least one .sh / .bash file under the root
  // (see `requireExtensionPresent`).
  projectSignals: [],
  requireExtensionPresent: true,
  vendorSubdir: 'bash',
  walkOptions: {
    include: ['**/*.sh', '**/*.bash'],
    exclude: ['**/node_modules/**', '**/.git/**'],
    respectInternalConvention: false
  },
  captureKindMap: {
    'definition.function': 'function'
  },
  docComment: {
    // `#` line comments are the only comment form in Bash.
    linePrefixes: ['#']
  },
  bodyNodeTypes: new Set(['compound_statement']),
  isPublic: bashIsPublic
};
643
/**
 * Every language configuration handled by this extractor.
 *
 * Order is significant: `languageForExtension` returns the first entry whose extension
 * list matches, and `detect` / `walk` iterate the configs in this order.
 */
const LANGUAGES = [
    RUST_CONFIG,
    GO_CONFIG,
    JAVA_CONFIG,
    RUBY_CONFIG,
    C_CONFIG,
    CPP_CONFIG,
    PHP_CONFIG,
    CSHARP_CONFIG,
    SWIFT_CONFIG,
    LUA_CONFIG,
    SCALA_CONFIG,
    ELIXIR_CONFIG,
    OCAML_CONFIG,
    KOTLIN_CONFIG,
    BASH_CONFIG
];
660
// Directory that contains this module file.
const moduleDir = path.dirname(fileURLToPath(import.meta.url));
/**
 * Locate the `vendor/tree-sitter` directory by walking upward from this module's
 * directory. Searching upward (instead of using a fixed relative path) lets the same
 * code work when the module sits at different depths below the project root.
 *
 * The walk checks at most 8 directories and stops early at the filesystem root.
 *
 * @returns {string | null} Path to the first `vendor/tree-sitter` directory found, or
 *   `null` when none exists within the search bound.
 */
function resolveVendorRoot() {
    const MAX_LEVELS = 8;
    let current = moduleDir;
    let remaining = MAX_LEVELS;
    while (remaining > 0) {
        remaining -= 1;
        const vendorDir = path.join(current, 'vendor', 'tree-sitter');
        if (existsSync(vendorDir)) {
            return vendorDir;
        }
        const up = path.dirname(current);
        if (up === current) {
            // `dirname` is a fixed point at the filesystem root — nothing left to try.
            return null;
        }
        current = up;
    }
    return null;
}
680
// Memoized result of `Parser.init()`; stays `null` until the first caller asks for it.
let parserInitPromise = null;
/**
 * Run `Parser.init()` at most once and share the resulting promise with every caller.
 *
 * @returns {Promise<unknown>} Settles when the shared `Parser.init()` call settles.
 */
async function ensureParserInit() {
    parserInitPromise = parserInitPromise ?? Parser.init();
    return parserInitPromise;
}
687
// Cache of in-flight / completed grammar loads, keyed by language id. Values are promises
// so concurrent callers share a single load.
const loadedGrammars = new Map();
/**
 * Load (and cache) the vendored tree-sitter grammar and `tags.scm` query for a language.
 *
 * @param {object} config - Language configuration; reads `id`, `vendorSubdir` and the
 *   optional `wasmFilename` override.
 * @returns {Promise<{config: object, language: object, query: object} | null>} The loaded
 *   grammar bundle, or `null` when the vendor root, the `.wasm` file, or `tags.scm` is
 *   missing. A `null` result is cached until `resetTreeSitterGrammarCache()` is called.
 * @throws Propagates errors from parser init, `Language.load`, reading `tags.scm`, or
 *   compiling the query. A failed load is NOT kept in the cache, so the next call retries
 *   instead of replaying the same rejection forever.
 */
async function loadGrammar(config) {
    const cached = loadedGrammars.get(config.id);
    if (cached !== undefined) {
        return cached;
    }
    const promise = (async () => {
        const vendorRoot = resolveVendorRoot();
        if (!vendorRoot) {
            return null;
        }
        const wasmFilename = config.wasmFilename ?? `tree-sitter-${config.id}.wasm`;
        const wasmPath = path.join(vendorRoot, config.vendorSubdir, wasmFilename);
        const tagsScmPath = path.join(vendorRoot, config.vendorSubdir, 'tags.scm');
        if (!existsSync(wasmPath) || !existsSync(tagsScmPath)) {
            return null;
        }
        await ensureParserInit();
        const language = await Language.load(wasmPath);
        const queryText = await fs.readFile(tagsScmPath, 'utf8');
        const query = new Query(language, queryText);
        return { config, language, query };
    })();
    loadedGrammars.set(config.id, promise);
    // Evict on failure so a transient error does not poison the cache. The identity check
    // avoids deleting a newer entry if the cache was reset while this load was in flight.
    // The caller still observes the rejection through the returned `promise`.
    promise.catch(() => {
        if (loadedGrammars.get(config.id) === promise) {
            loadedGrammars.delete(config.id);
        }
    });
    return promise;
}
713
/**
 * Test-only escape hatch: forget every cached grammar load so the next `loadGrammar`
 * call starts from scratch. Lets tests simulate cold loads or swap vendored bundles
 * between runs.
 *
 * @returns {void}
 */
export function resetTreeSitterGrammarCache() {
    loadedGrammars.clear();
}
718
/**
 * Find the language configuration responsible for a file, based on its extension.
 *
 * @param {string} filePath - Path whose (case-insensitive) extension is looked up.
 * @returns {object | null} The first entry of `LANGUAGES` listing that extension, or
 *   `null` when no configured language matches.
 */
function languageForExtension(filePath) {
    const extension = path.extname(filePath).toLowerCase();
    const match = LANGUAGES.find((candidate) => candidate.extensions.includes(extension));
    return match ?? null;
}
727
/**
 * Derive a generic include-glob list from a language's extensions, for configs that do
 * not supply their own `walkOptions.include`. Example: `.rs` becomes `**` + `/*.rs`.
 *
 * @param {{extensions: string[]}} config - Language configuration.
 * @returns {string[]} One recursive glob per extension, in the same order.
 */
function defaultIncludeFor(config) {
    const globs = [];
    for (const extension of config.extensions) {
        globs.push(`**/*${extension}`);
    }
    return globs;
}
732
/**
 * Return the node of the first capture with the given name.
 *
 * @param {Array<{name: string, node: object}>} captures - Captures of one query match.
 * @param {string} name - Capture name to look for (e.g. `name`).
 * @returns {object | undefined} The matching capture's node, or `undefined` if absent.
 */
function findCaptureNode(captures, name) {
    for (const entry of captures) {
        if (entry.name === name) {
            return entry.node;
        }
    }
    return undefined;
}
735
// Several `@definition.*` captures can apply to the same node, so kind selection must be
// deterministic rather than depending on pattern iteration order. Earlier entries win;
// capture names missing from this list rank last.
const DEFINITION_CAPTURE_PRIORITY = [
    'definition.class',
    'definition.interface',
    'definition.enum',
    'definition.method',
    'definition.function',
    'definition.macro',
    'definition.module',
    'definition.type',
    'definition.field',
    'definition.property',
    'definition.object'
];
/**
 * Rank a `definition.*` capture name; a smaller number means higher priority.
 *
 * @param {string} name - Capture name, e.g. `definition.method`.
 * @returns {number} Index in `DEFINITION_CAPTURE_PRIORITY`, or
 *   `Number.MAX_SAFE_INTEGER` for names not in the list.
 */
function definitionCapturePriority(name) {
    const rank = DEFINITION_CAPTURE_PRIORITY.indexOf(name);
    if (rank >= 0) {
        return rank;
    }
    return Number.MAX_SAFE_INTEGER;
}
757
/**
 * Pick the definition capture of a query match.
 *
 * tags.scm conventionally captures the whole definition node under `@definition.<kind>`
 * and the symbol name under `@name`. If a match carries several definition captures, the
 * one ranked best by `definitionCapturePriority` wins; on a tie the earliest one is kept.
 *
 * @param {Array<{name: string, node: object}>} captures - Captures of one query match.
 * @returns {{capture: object, kindCaptureName: string} | null} The winning capture and
 *   its capture name, or `null` when the match has no `definition.*` capture.
 */
function findCaptureNodeForDefinition(captures) {
    let winner = null;
    let winnerRank = Number.POSITIVE_INFINITY;
    for (const candidate of captures) {
        if (!candidate.name.startsWith('definition.')) {
            continue;
        }
        const rank = definitionCapturePriority(candidate.name);
        // Strict `<` keeps the first capture when two share the same rank.
        if (winner === null || rank < winnerRank) {
            winner = candidate;
            winnerRank = rank;
        }
    }
    if (winner === null) {
        return null;
    }
    return { capture: winner, kindCaptureName: winner.name };
}
773
// Node-type names that count as comments. Grammars disagree on naming (`comment`,
// `line_comment` / `block_comment`, `multiline_comment`), so the set is kept wide.
const COMMENT_NODE_TYPES = new Set(['line_comment', 'block_comment', 'comment', 'multiline_comment']);
/**
 * Find the comment node from which doc-comment collection should start for a definition.
 *
 * Doc comments are not always direct previous siblings of the captured node, so the
 * search climbs from the definition through its ancestors (at most 4 nodes in total). At
 * each level it inspects the previous named sibling:
 *   - if that sibling is a comment, it is returned;
 *   - otherwise its chain of last named children is followed, and the first comment
 *     found there is returned. This covers grammars that absorb a trailing comment into
 *     the preceding sibling node.
 *
 * @param {object} definitionNode - Syntax node of the captured definition.
 * @returns {object | null} The comment node to start from, or `null` if none was found.
 */
function findStartingDocCursor(definitionNode) {
    const MAX_LEVELS = 4;
    let current = definitionNode;
    let level = 0;
    while (current && level < MAX_LEVELS) {
        const sibling = current.previousNamedSibling;
        if (sibling) {
            if (COMMENT_NODE_TYPES.has(sibling.type)) {
                return sibling;
            }
            // Follow the "last named child" chain of the sibling looking for a trailing comment.
            for (let probe = sibling; probe && probe.namedChildCount > 0;) {
                const tail = probe.namedChild(probe.namedChildCount - 1);
                if (!tail) {
                    break;
                }
                if (COMMENT_NODE_TYPES.has(tail.type)) {
                    return tail;
                }
                probe = tail;
            }
        }
        current = current.parent;
        level += 1;
    }
    return null;
}
812
/**
 * Collect the doc comment that precedes a definition.
 *
 * Starting from the comment node located by `findStartingDocCursor`, the function walks
 * backward over previous *named* siblings (so unnamed punctuation/newline tokens do not
 * interrupt the chain) and keeps every comment that matches the language's convention:
 * either one of `rule.linePrefixes`, or a block delimited by `rule.blockOpen` /
 * `rule.blockClose`. The walk stops at the first non-comment node or the first comment
 * that matches neither form.
 *
 * @param {object} definitionNode - Syntax node of the captured definition.
 * @param {string} source - Full source text the node indices refer to.
 * @param {{linePrefixes: string[], blockOpen?: string, blockClose?: string}} rule -
 *   Doc-comment convention of the language.
 * @returns {string | null} Comment text in source order joined by newlines and trimmed,
 *   or `null` when nothing (or only whitespace) was collected.
 */
function collectAdjacentDocComment(definitionNode, source, rule) {
    // Line form: drop the first matching prefix and any whitespace right after it.
    const stripLinePrefix = (text) => {
        const prefix = rule.linePrefixes.find((candidate) => text.startsWith(candidate));
        return prefix === undefined ? null : text.slice(prefix.length).trimStart();
    };
    // Block form: drop the delimiters and the leading ` * ` gutter on each line.
    const stripBlock = (text) => {
        const { blockOpen, blockClose } = rule;
        if (!blockOpen || !blockClose) {
            return null;
        }
        if (!text.startsWith(blockOpen) || !text.endsWith(blockClose)) {
            return null;
        }
        const interior = text.slice(blockOpen.length, text.length - blockClose.length);
        const cleaned = interior.split(/\r?\n/).map((line) => line.replace(/^\s*\*\s?/, ''));
        return cleaned.join('\n').trim();
    };
    const pieces = [];
    for (let node = findStartingDocCursor(definitionNode); node && COMMENT_NODE_TYPES.has(node.type); node = node.previousNamedSibling) {
        const raw = source.slice(node.startIndex, node.endIndex);
        const body = stripLinePrefix(raw) ?? stripBlock(raw);
        if (body === null) {
            break;
        }
        // Walking backward, so prepend to keep source order.
        pieces.unshift(body);
    }
    const text = pieces.join('\n').trim();
    return text.length > 0 ? text : null;
}
853
/**
 * Produce a compact signature for a definition by cutting its body off.
 *
 * The signature is the source text from the node's start up to (but excluding) the first
 * child whose type is in `bodyNodeTypes`. Definitions without such a child keep their
 * full text. The result is whitespace-trimmed on both ends.
 *
 * @param {object} node - Syntax node of the definition.
 * @param {string} source - Full source text the node indices refer to.
 * @param {Set<string>} bodyNodeTypes - Node types that count as a body for this language.
 * @returns {string} The trimmed signature text.
 */
function buildSignature(node, source, bodyNodeTypes) {
    const body = findBodyChild(node, bodyNodeTypes);
    const stop = body ? body.startIndex : node.endIndex;
    // `trim()` already removes trailing whitespace, so no extra trailing-space pass is needed.
    return source.slice(node.startIndex, stop).trim();
}
867
/**
 * Return the first direct child (named or unnamed) whose type marks a definition body.
 *
 * @param {object} node - Syntax node whose children are scanned in order.
 * @param {Set<string>} bodyNodeTypes - Node types that count as a body.
 * @returns {object | null} The first matching child, or `null` if there is none.
 */
function findBodyChild(node, bodyNodeTypes) {
    let index = 0;
    while (index < node.childCount) {
        const candidate = node.child(index);
        if (candidate && bodyNodeTypes.has(candidate.type)) {
            return candidate;
        }
        index += 1;
    }
    return null;
}
876
/**
 * Turn a project-relative source path into the slug of its API page.
 *
 * Steps: normalize backslashes to `/`, drop a leading `./`, drop the final alphanumeric
 * file extension, drop a leading `src/`, then prefix with `api/`.
 *
 * @param {string} relativeSourcePath - Source path relative to the project root.
 * @returns {string} Slug of the form `api/<path-without-extension>`.
 */
function deriveModuleSlug(relativeSourcePath) {
    let slug = relativeSourcePath.split('\\').join('/');
    if (slug.startsWith('./')) {
        slug = slug.slice(2);
    }
    slug = slug.replace(/\.[a-z0-9]+$/i, '');
    if (slug.startsWith('src/')) {
        slug = slug.slice('src/'.length);
    }
    return `api/${slug}`;
}
882
/**
 * Extract the module-level (`//!`) doc comment from the head of a Rust source file.
 *
 * Works in two phases:
 *   1. Skip the file prelude: blank lines and lines starting with `#!` (a shebang or a
 *      crate-level `#![...]` inner attribute). Any other line before the first `//!`
 *      means the file has no module doc. Without this skip, a `main.rs` that opens with
 *      `#![deny(warnings)]` would lose its module documentation.
 *   2. Collect consecutive `//!` lines. Blank lines inside the run are kept as paragraph
 *      breaks; the first other line ends the run.
 *
 * @param {string} source - Full text of the Rust file.
 * @returns {string | null} The doc text with `//!` markers removed, trimmed; `null` when
 *   the file has no module-level doc comment.
 */
function extractFileDocCommentRust(source) {
    const lines = source.split(/\r?\n/);
    let index = 0;
    // Phase 1: advance to the first `//!` line, tolerating only prelude lines on the way.
    for (; index < lines.length; index += 1) {
        const head = lines[index].trimStart();
        if (head.startsWith('//!')) {
            break;
        }
        const isPrelude = head.length === 0 || head.startsWith('#!');
        if (!isPrelude) {
            return null;
        }
    }
    // Phase 2: gather the doc run.
    const collected = [];
    for (; index < lines.length; index += 1) {
        const text = lines[index].trimStart();
        if (text.startsWith('//!')) {
            collected.push(text.slice(3).trimStart());
        }
        else if (text.length === 0) {
            collected.push('');
        }
        else {
            break;
        }
    }
    const body = collected.join('\n').trim();
    return body.length > 0 ? body : null;
}
922
/**
 * Parse one source file with an already-loaded grammar and extract its public symbols.
 *
 * @param {{config: object, language: object, query: object}} loaded - Bundle returned by
 *   `loadGrammar`.
 * @param {string} sourcePath - Absolute path, or a path relative to `rootDir`.
 * @param {string} rootDir - Project root; used to resolve and relativize `sourcePath`.
 * @returns {Promise<{sourcePath: string, moduleSlug: string, symbols: object[],
 *   fileDocComment: string | null}>} Extraction result. `sourcePath` is root-relative
 *   with forward slashes; `symbols` are ordered by source line.
 * @throws {Error} When the file cannot be read or tree-sitter returns no tree.
 */
async function extractWithGrammar(loaded, sourcePath, rootDir) {
    const absolute = path.isAbsolute(sourcePath) ? sourcePath : path.resolve(rootDir, sourcePath);
    const relative = path.relative(rootDir, absolute).replace(/\\/g, '/');
    const source = await fs.readFile(absolute, 'utf8');
    const parser = new Parser();
    let tree = null;
    try {
        parser.setLanguage(loaded.language);
        tree = parser.parse(source);
        if (!tree) {
            throw new Error(`tree-sitter failed to parse ${relative}`);
        }
        // Pass 1: keep, per definition node (keyed by its index span), the best-priority
        // match that has a mapped kind and a usable name.
        const candidatesByNode = new Map();
        for (const match of loaded.query.matches(tree.rootNode)) {
            const definition = findCaptureNodeForDefinition(match.captures);
            if (!definition)
                continue;
            if (!loaded.config.captureKindMap[definition.kindCaptureName])
                continue;
            const definitionNode = definition.capture.node;
            const nameNode = findCaptureNode(match.captures, 'name');
            // Skip only *this match* when the name is missing or empty: the node is not
            // recorded, so another match for the same node can still supply a valid name.
            if (!nameNode || !nameNode.text)
                continue;
            const priority = definitionCapturePriority(definition.kindCaptureName);
            const dedupeKey = `${definitionNode.startIndex}:${definitionNode.endIndex}`;
            const existing = candidatesByNode.get(dedupeKey);
            if (!existing || priority < existing.priority) {
                candidatesByNode.set(dedupeKey, {
                    node: definitionNode,
                    nameNode,
                    kindCaptureName: definition.kindCaptureName,
                    priority
                });
            }
        }
        // Pass 2: turn the surviving candidates into plain symbol records. Everything read
        // from syntax nodes is copied into strings/numbers here, before the tree is freed.
        const symbols = [];
        for (const candidate of candidatesByNode.values()) {
            const kind = loaded.config.captureKindMap[candidate.kindCaptureName];
            if (!kind)
                continue;
            const name = candidate.nameNode.text;
            if (!loaded.config.isPublic(candidate.node, source, name))
                continue;
            const signature = buildSignature(candidate.node, source, loaded.config.bodyNodeTypes);
            const docComment = collectAdjacentDocComment(candidate.node, source, loaded.config.docComment);
            const sourceLine = candidate.node.startPosition.row + 1;
            symbols.push({
                name,
                kind,
                signature,
                docComment,
                tags: [],
                sourceLine,
                isDeprecated: false
            });
        }
        symbols.sort((a, b) => a.sourceLine - b.sourceLine);
        // File-level doc comment is language-specific; only Rust's `//!` header is handled.
        const fileDocComment = loaded.config.id === 'rust' ? extractFileDocCommentRust(source) : null;
        return {
            sourcePath: relative,
            moduleSlug: deriveModuleSlug(relative),
            symbols,
            fileDocComment
        };
    }
    finally {
        // web-tree-sitter parsers and trees are backed by WASM-heap allocations that are
        // released only by an explicit `delete()`. Without this, every extracted file
        // would leave one parser and one tree allocated.
        tree?.delete();
        parser.delete();
    }
}
992
/**
 * Check whether a path is accessible to the current process.
 *
 * @param {string} filePath - Path to probe with `fs.access`.
 * @returns {Promise<boolean>} `true` when `fs.access` succeeds, `false` on any failure.
 */
async function exists(filePath) {
    let reachable = true;
    try {
        await fs.access(filePath);
    }
    catch {
        // Any access error (missing file, permissions, ...) is reported as "not there".
        reachable = false;
    }
    return reachable;
}
1001
/**
 * Extractor that serves every language in `LANGUAGES` through vendored tree-sitter
 * grammars. Exposes `detect`, `walk`, and `extract`.
 */
export const treeSitterExtractor = {
    id: 'tree-sitter',
    /**
     * Decide whether this extractor should claim the project at `rootDir`.
     *
     * A language claims the project when (a) one of its `projectSignals` exists in the
     * root, or — for signal-less configs with `requireExtensionPresent` — at least one
     * matching source file exists, AND (b) its vendored grammar can be loaded. Grammar
     * loads are cached, so repeated calls are cheap.
     *
     * @param {string} rootDir - Project root directory.
     * @returns {Promise<boolean>} `true` as soon as one language qualifies.
     */
    async detect(rootDir) {
        for (const config of LANGUAGES) {
            let signalMatched = false;
            if (config.projectSignals.length > 0) {
                for (const signal of config.projectSignals) {
                    if (await exists(path.join(rootDir, signal))) {
                        signalMatched = true;
                        break;
                    }
                }
            }
            else if (config.requireExtensionPresent) {
                // Content-based detect. `limit: 1` short-circuits on the first hit so a
                // project without such files does not pay for a full walk on every call.
                const include = config.walkOptions?.include ?? defaultIncludeFor(config);
                const exclude = config.walkOptions?.exclude;
                const found = await walkProjectSources(rootDir, { include, exclude, respectInternalConvention: false, limit: 1 });
                signalMatched = found.length > 0;
            }
            else {
                // Neither signals nor `requireExtensionPresent`: never claim. Guards against
                // a future config accidentally hijacking signal-less projects.
                continue;
            }
            if (!signalMatched)
                continue;
            const loaded = await loadGrammar(config);
            if (loaded) {
                return true;
            }
        }
        return false;
    },
    /**
     * List the source files to extract from.
     *
     * Explicit `options` are passed through untouched. Otherwise the per-language
     * defaults of every loadable language are unioned, so mixed-language projects are
     * covered in one pass.
     *
     * @param {string} rootDir - Project root directory.
     * @param {object} [options] - Walk options forwarded verbatim when provided.
     * @returns {Promise<string[]>} Source paths; sorted and de-duplicated when the
     *   per-language defaults are used.
     */
    async walk(rootDir, options) {
        if (options) {
            return walkProjectSources(rootDir, options);
        }
        const collected = [];
        for (const config of LANGUAGES) {
            // No point walking files of a language whose grammar cannot be loaded.
            const loaded = await loadGrammar(config);
            if (!loaded)
                continue;
            const include = config.walkOptions?.include ?? defaultIncludeFor(config);
            const exclude = config.walkOptions?.exclude;
            const respectInternalConvention = config.walkOptions?.respectInternalConvention ?? false;
            const found = await walkProjectSources(rootDir, { include, exclude, respectInternalConvention });
            collected.push(...found);
        }
        // Several language patterns may capture the same path.
        return Array.from(new Set(collected)).sort();
    },
    /**
     * Extract the API symbols of a single source file.
     *
     * @param {string} sourcePath - File to extract; its extension selects the language.
     * @param {{rootDir?: string}} [options] - `rootDir` defaults to `process.cwd()`.
     * @returns {Promise<object>} The result of `extractWithGrammar`.
     * @throws {Error} When no configured language matches the extension, or when the
     *   vendored grammar bundle for that language cannot be loaded.
     */
    async extract(sourcePath, options) {
        const rootDir = options?.rootDir ?? process.cwd();
        const config = languageForExtension(sourcePath);
        if (!config) {
            throw new Error(`treeSitterExtractor.extract: no configured language matches extension of ${sourcePath}`);
        }
        const loaded = await loadGrammar(config);
        if (!loaded) {
            // Report the same file names `loadGrammar` actually looks for: the wasm name
            // honors the optional `wasmFilename` override, and `tags.scm` is required too.
            const wasmFilename = config.wasmFilename ?? `tree-sitter-${config.id}.wasm`;
            const bundleDir = `vendor/tree-sitter/${config.vendorSubdir}`;
            throw new Error(`treeSitterExtractor.extract: vendored grammar for ${config.id} is missing — expected ${bundleDir}/${wasmFilename} and ${bundleDir}/tags.scm`);
        }
        return extractWithGrammar(loaded, sourcePath, rootDir);
    }
};