@rarusoft/dendrite-wiki 0.1.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/README.md +79 -0
  2. package/dist/api-extractor/extract.js +269 -0
  3. package/dist/api-extractor/language-extractor.js +15 -0
  4. package/dist/api-extractor/python-extractor.js +358 -0
  5. package/dist/api-extractor/render.js +195 -0
  6. package/dist/api-extractor/tree-sitter-extractor.js +1079 -0
  7. package/dist/api-extractor/types.js +11 -0
  8. package/dist/api-extractor/typescript-extractor.js +50 -0
  9. package/dist/api-extractor/walk.js +178 -0
  10. package/dist/api-reference.js +438 -0
  11. package/dist/benchmark-events.js +129 -0
  12. package/dist/benchmark.js +270 -0
  13. package/dist/binder-export.js +381 -0
  14. package/dist/canonical-target.js +168 -0
  15. package/dist/chart-insert.js +377 -0
  16. package/dist/chart-prompts.js +414 -0
  17. package/dist/context-cache.js +98 -0
  18. package/dist/contradicts-shipped-memory.js +232 -0
  19. package/dist/diff-context.js +142 -0
  20. package/dist/doctor.js +220 -0
  21. package/dist/generated-docs.js +219 -0
  22. package/dist/i18n.js +71 -0
  23. package/dist/index.js +49 -0
  24. package/dist/librarian.js +255 -0
  25. package/dist/maintenance-actions.js +244 -0
  26. package/dist/maintenance-inbox.js +842 -0
  27. package/dist/maintenance-runner.js +62 -0
  28. package/dist/page-drift.js +225 -0
  29. package/dist/page-inbox.js +168 -0
  30. package/dist/report-export.js +339 -0
  31. package/dist/review-bridge.js +1386 -0
  32. package/dist/search-index.js +199 -0
  33. package/dist/store.js +1617 -0
  34. package/dist/telemetry-defaults.js +44 -0
  35. package/dist/telemetry-report.js +263 -0
  36. package/dist/telemetry.js +544 -0
  37. package/dist/wiki-synthesis.js +901 -0
  38. package/package.json +35 -0
  39. package/src/api-extractor/extract.ts +333 -0
  40. package/src/api-extractor/language-extractor.ts +37 -0
  41. package/src/api-extractor/python-extractor.ts +380 -0
  42. package/src/api-extractor/render.ts +267 -0
  43. package/src/api-extractor/tree-sitter-extractor.ts +1210 -0
  44. package/src/api-extractor/types.ts +41 -0
  45. package/src/api-extractor/typescript-extractor.ts +56 -0
  46. package/src/api-extractor/walk.ts +209 -0
  47. package/src/api-reference.ts +552 -0
  48. package/src/benchmark-events.ts +216 -0
  49. package/src/benchmark.ts +376 -0
  50. package/src/binder-export.ts +437 -0
  51. package/src/canonical-target.ts +192 -0
  52. package/src/chart-insert.ts +478 -0
  53. package/src/chart-prompts.ts +417 -0
  54. package/src/context-cache.ts +129 -0
  55. package/src/contradicts-shipped-memory.ts +311 -0
  56. package/src/diff-context.ts +187 -0
  57. package/src/doctor.ts +260 -0
  58. package/src/generated-docs.ts +316 -0
  59. package/src/i18n.ts +106 -0
  60. package/src/index.ts +59 -0
  61. package/src/librarian.ts +331 -0
  62. package/src/maintenance-actions.ts +314 -0
  63. package/src/maintenance-inbox.ts +1132 -0
  64. package/src/maintenance-runner.ts +85 -0
  65. package/src/page-drift.ts +292 -0
  66. package/src/page-inbox.ts +254 -0
  67. package/src/report-export.ts +392 -0
  68. package/src/review-bridge.ts +1729 -0
  69. package/src/search-index.ts +266 -0
  70. package/src/store.ts +2171 -0
  71. package/src/telemetry-defaults.ts +50 -0
  72. package/src/telemetry-report.ts +365 -0
  73. package/src/telemetry.ts +757 -0
  74. package/src/wiki-synthesis.ts +1307 -0
@@ -0,0 +1,1210 @@
1
+ /**
2
+ * Generic `LanguageExtractor` powered by tree-sitter — the long-tail language layer.
3
+ *
4
+ * Where `typescript-extractor.ts` and `python-extractor.ts` are handcrafted for top-traffic
5
+ * languages with first-class compiler/AST surfaces, this module covers the long tail
6
+ * (Rust today; Go, Java, Ruby, C, C++, PHP next) via tree-sitter's portable WASM grammars
7
+ * and each grammar's upstream `queries/tags.scm` file. Every supported language lives as a
8
+ * single config-table entry — extension, vendored WASM path, vendored tags.scm path, a
9
+ * public-symbol predicate, a doc-comment association rule. Adding another language is a
10
+ * config addition, not a new module.
11
+ *
12
+ * Rationale (Phase B1 of the API reference roadmap): the per-language handcrafted path
13
+ * doesn't scale. GitHub's stack-graphs project — their multi-year attempt at bespoke
14
+ * per-language indexers — was archived in September 2025; even GitHub couldn't sustain it.
15
+ * Tree-sitter `tags.scm` is the durable middle tier the industry settled on. Output
16
+ * quality matches roughly what our handcrafted Python extractor produces (signatures with
17
+ * types-as-written, doc comments as prose), which is the bar for "binder-on-shelf"
18
+ * presentability.
19
+ *
20
+ * Determinism: parse trees change between grammar versions, so each vendored grammar is
21
+ * pinned by upstream tag and sha256 (recorded in `NOTICE` at the repo root).
22
+ * Same `(web-tree-sitter version, grammar tag, tags.scm sha256)` triple = same parse tree
23
+ * across machines. WASM grammars lazy-load on first use so projects that never touch a
24
+ * given language never pay its load cost.
25
+ */
26
+
27
+ import { existsSync } from 'node:fs';
28
+ import { promises as fs } from 'node:fs';
29
+ import path from 'node:path';
30
+ import { fileURLToPath } from 'node:url';
31
+ import { Language, Parser, Query } from 'web-tree-sitter';
32
+ import type { Node, QueryCapture } from 'web-tree-sitter';
33
+ import type { ApiFileReference, ApiSymbol, ApiSymbolKind } from './types.js';
34
+ import type { LanguageExtractor } from './language-extractor.js';
35
+ import { walkProjectSources, type WalkOptions } from './walk.js';
36
+
37
// Describes how a language marks documentation comments in source text.
interface DocCommentRule {
  // Prefixes that identify a line-style doc comment, e.g. `///` (Rust) or `//` (Go).
  // Matching stops at the first prefix that fits, so list longer prefixes before
  // shorter ones.
  linePrefixes: string[];
  // Opening delimiter of a block-style doc comment (Java/JS-style `/**`). Leave both
  // block fields unset to recognize line-style doc comments only.
  blockOpen?: string;
  // Closing delimiter paired with `blockOpen` (Java/JS-style `*/`).
  blockClose?: string;
}
46
+
47
// One entry of the per-language configuration table: everything the generic extractor
// needs to know about a single tree-sitter grammar.
interface TreeSitterLanguageConfig {
  // Stable identifier; it is what appears in diagnostics and mirrors the naming used by
  // the Python / TypeScript extractors.
  id: string;

  // Extensions owned by this language (`.rs`, `.go`, `.java`, ...). Compared after
  // lowercasing; the first match wins.
  extensions: string[];

  // Files at the project root that detect() looks for. At least one has to exist for
  // the extractor to claim the project. An empty array means a file with a matching
  // extension is sufficient on its own.
  projectSignals: string[];

  // Directory name under `vendor/tree-sitter/` holding the vendored grammar. By
  // convention it contains `tree-sitter-<id>.wasm`, `tags.scm`, and `LICENSE`.
  vendorSubdir: string;

  // Override for grammars whose WASM file is not named `tree-sitter-<id>.wasm`. PHP is
  // one example: it ships a single combined `tree-sitter-php.wasm` although its repo
  // is structurally a multi-grammar bundle. When left out, the canonical
  // `tree-sitter-<id>.wasm` name is used.
  wasmFilename?: string;

  // Include / exclude globs handed to `walkProjectSources`. When left out, defaults
  // are derived from `extensions`.
  walkOptions?: WalkOptions;

  // Translation from a tags-query capture name (such as `definition.class`) to the
  // language-agnostic ApiSymbolKind. A capture with no entry here is discarded.
  captureKindMap: Record<string, ApiSymbolKind>;

  // How doc comments are attached: starting at a definition node, the extractor walks
  // backward over preceding siblings and gathers the contiguous comments matching
  // this rule.
  docComment: DocCommentRule;

  // Node types that form the BODY of a definition (function bodies, class members,
  // struct fields). Signature rendering copies source text up to, but excluding, the
  // first body child, which keeps signatures tidy and bodies off the API page. The
  // names differ per grammar: Rust and Go use `block`; Java uses `block` for methods
  // and `class_body` for classes; Ruby uses `body_statement`; C/C++ use
  // `compound_statement`/`field_declaration_list`; PHP uses
  // `compound_statement`/`declaration_list`.
  bodyNodeTypes: ReadonlySet<string>;

  // Decides whether a captured definition is a public/exported symbol. It is given
  // the definition node, the original source text, and the symbol name so it can look
  // at modifiers, naming conventions, and similar signals.
  isPublic(definitionNode: Node, source: string, name: string): boolean;

  // For languages lacking a canonical project-marker file (Bash, Lua, ...): require
  // at least one file with a matching extension under rootDir before claiming the
  // project, so an empty `projectSignals` list does not claim arbitrary directories.
  // Defaults to false, and has no effect when `projectSignals` is non-empty because
  // the signal check takes precedence.
  requireExtensionPresent?: boolean;
}
92
+
93
// Reports whether a Rust definition carries a `pub`-prefixed visibility modifier.
// Only the first `visibility_modifier` named child is consulted; a definition without
// one is treated as private.
function rustIsPublic(definitionNode: Node, _source: string, _name: string): boolean {
  let index = 0;
  while (index < definitionNode.namedChildCount) {
    const candidate = definitionNode.namedChild(index);
    index += 1;
    if (!candidate || candidate.type !== 'visibility_modifier') {
      continue;
    }
    // `pub`, `pub(crate)`, `pub(super)` and `pub(in path)` are all accepted: every
    // pub-prefixed visibility belongs to the crate's public-or-internal API contract.
    return candidate.text.startsWith('pub');
  }
  return false;
}
104
+
105
// Configuration-table entry for Rust.
const RUST_CONFIG: TreeSitterLanguageConfig = {
  id: 'rust',
  extensions: ['.rs'],
  projectSignals: ['Cargo.toml'],
  vendorSubdir: 'rust',
  walkOptions: {
    include: [
      'src/**/*.rs',
      'examples/**/*.rs',
      'lib.rs',
      'main.rs'
    ],
    exclude: [
      '**/target/**',
      '**/tests/**',
      '**/*_test.rs',
      '**/build.rs',
      '**/node_modules/**'
    ],
    respectInternalConvention: false
  },
  // Captures produced by Rust's tags.scm:
  //   struct / enum / union / type   -> @definition.class
  //   trait                          -> @definition.interface
  //   function                       -> @definition.function
  //   method (inside an impl block)  -> @definition.method
  //   module                         -> @definition.module (dropped: not mapped below)
  //   macro                          -> @definition.macro  (dropped for now: no fitting kind)
  captureKindMap: {
    'definition.class': 'class',
    'definition.interface': 'interface',
    'definition.function': 'function',
    'definition.method': 'function'
  },
  docComment: {
    // `///` is Rust's outer doc comment and the only form attached to a definition.
    // `//!` (inner doc) is module-level and is surfaced through fileDocComment instead.
    linePrefixes: ['///']
  },
  bodyNodeTypes: new Set([
    'block',
    'field_declaration_list',
    'declaration_list',
    'enum_variant_list',
    'trait_block'
  ]),
  isPublic: rustIsPublic
};
136
+
137
+ // --- Go --------------------------------------------------------------------
138
+
139
// Reports whether a Go identifier is exported.
//
// Go's rule is purely lexical: an identifier is exported when its first character is a
// Unicode uppercase letter (general category Lu). The check is done on the first code
// POINT rather than the first UTF-16 code unit, so identifiers that begin with an
// uppercase letter outside the Basic Multilingual Plane are classified correctly, and
// non-letter characters that merely have a case mapping are not mistaken for uppercase.
//
// `name` is the captured symbol name; an empty name is never exported. The definition
// node and source text are not needed for this language.
function goIsPublic(_definitionNode: Node, _source: string, name: string): boolean {
  return /^\p{Lu}/u.test(name);
}
144
+
145
// Configuration-table entry for Go.
const GO_CONFIG: TreeSitterLanguageConfig = {
  id: 'go',
  extensions: ['.go'],
  projectSignals: ['go.mod'],
  vendorSubdir: 'go',
  walkOptions: {
    include: ['**/*.go'],
    exclude: [
      '**/*_test.go',
      '**/vendor/**',
      '**/node_modules/**'
    ],
    respectInternalConvention: false
  },
  captureKindMap: {
    'definition.function': 'function',
    'definition.method': 'function',
    // The Go grammar reports every type_spec as `definition.type`, so structs,
    // interfaces, type aliases and named types share one capture. All of them are
    // rendered as `class`: to a reader each is "a type defined in this package."
    'definition.type': 'class'
  },
  docComment: {
    // Go documents declarations with ordinary `//` comments placed directly above
    // them; there is no dedicated doc-comment prefix.
    linePrefixes: ['//']
  },
  bodyNodeTypes: new Set([
    'block',
    'field_declaration_list',
    'method_spec_list',
    'interface_type',
    'struct_type'
  ]),
  isPublic: goIsPublic
};
172
+
173
+ // --- Java ------------------------------------------------------------------
174
+
175
// Reports whether a Java declaration carries an explicit `public` modifier.
//
// Package-private (no modifier), `protected` and `private` declarations are left out
// of the generated API reference; only the formal `public` surface is kept. Only the
// first `modifiers` named child is consulted, and a declaration without one is treated
// as non-public.
//
// The modifier list is inspected token by token instead of pattern-matching its full
// text. The full text also contains annotations, so a regex over it would report a
// package-private declaration such as `@Tag("public") static void f()` as public.
function javaIsPublic(definitionNode: Node, _source: string, _name: string): boolean {
  for (let i = 0; i < definitionNode.namedChildCount; i += 1) {
    const child = definitionNode.namedChild(i);
    if (!child || child.type !== 'modifiers') {
      continue;
    }
    if (child.childCount === 0) {
      // No child tokens to inspect (unexpected tree shape): fall back to the textual
      // check so such a node is still classified.
      return /\bpublic\b/.test(child.text);
    }
    for (let j = 0; j < child.childCount; j += 1) {
      const token = child.child(j);
      // A keyword token's text is exactly the keyword; an annotation's text never is.
      if (token && token.text === 'public') {
        return true;
      }
    }
    return false;
  }
  return false;
}
188
+
189
// Configuration-table entry for Java.
const JAVA_CONFIG: TreeSitterLanguageConfig = {
  id: 'java',
  extensions: ['.java'],
  projectSignals: [
    'pom.xml',
    'build.gradle',
    'build.gradle.kts',
    'settings.gradle',
    'settings.gradle.kts'
  ],
  vendorSubdir: 'java',
  walkOptions: {
    include: ['src/**/*.java', '**/*.java'],
    exclude: [
      '**/test/**',
      '**/tests/**',
      '**/build/**',
      '**/target/**',
      '**/.gradle/**',
      '**/node_modules/**'
    ],
    respectInternalConvention: false
  },
  captureKindMap: {
    'definition.class': 'class',
    'definition.interface': 'interface',
    'definition.method': 'function'
  },
  docComment: {
    // Javadoc: block comments opened by `/**` and closed by `*/`. No line-style doc
    // comments are recognized for Java.
    linePrefixes: [],
    blockOpen: '/**',
    blockClose: '*/'
  },
  bodyNodeTypes: new Set([
    'class_body',
    'interface_body',
    'block',
    'enum_body',
    'annotation_type_body'
  ]),
  isPublic: javaIsPublic
};
214
+
215
+ // --- Ruby ------------------------------------------------------------------
216
+
217
// Visibility predicate for Ruby: every captured definition is reported as public.
//
// Ruby visibility is section-based (`private` / `protected` calls inside a class body)
// rather than expressed through per-declaration modifiers, and tracking that section
// state would mean walking the enclosing class body. This first cut does not do so.
// Because Ruby defaults to public, the resulting over-inclusion is accepted; detecting
// intervening `private` / `protected` calls is left as a future enhancement.
function rubyIsPublic(_definitionNode: Node, _source: string, _name: string): boolean {
  return true;
}
227
+
228
// Configuration-table entry for Ruby.
const RUBY_CONFIG: TreeSitterLanguageConfig = {
  id: 'ruby',
  extensions: ['.rb'],
  projectSignals: ['Gemfile', 'Rakefile'],
  vendorSubdir: 'ruby',
  walkOptions: {
    include: [
      'lib/**/*.rb',
      'app/**/*.rb',
      '**/*.rb'
    ],
    exclude: [
      '**/spec/**',
      '**/test/**',
      '**/vendor/**',
      '**/node_modules/**',
      '**/tmp/**'
    ],
    respectInternalConvention: false
  },
  captureKindMap: {
    'definition.class': 'class',
    'definition.module': 'class',
    'definition.method': 'function'
  },
  docComment: {
    // Ruby line comments start with `#`; RDoc and YARD attach a contiguous run of
    // them to the declaration that follows.
    linePrefixes: ['#']
  },
  bodyNodeTypes: new Set(['body_statement', 'do_block']),
  isPublic: rubyIsPublic
};
251
+
252
+ // --- C ---------------------------------------------------------------------
253
+
254
// Returns true when any direct named child of `node` is a `storage_class_specifier`
// whose text contains `static`.
function hasStaticStorageClass(node: Node): boolean {
  let position = 0;
  while (position < node.namedChildCount) {
    const specifier = node.namedChild(position);
    position += 1;
    if (!specifier) {
      continue;
    }
    const isStorageClass = specifier.type === 'storage_class_specifier';
    if (isStorageClass && specifier.text.includes('static')) {
      return true;
    }
  }
  return false;
}
263
+
264
// Visibility predicate for C: a declaration is public unless it is marked `static`.
//
// C has no public/private keywords, so the cut is whether the symbol is meant for the
// linker's external symbol table. The C tags.scm captures `function_declarator`, a
// child of the wrapping `declaration`, while `storage_class_specifier` sits on the
// declaration itself. Both the captured node and its parent are therefore checked.
function cIsPublic(definitionNode: Node, _source: string, _name: string): boolean {
  const staticOnNode = hasStaticStorageClass(definitionNode);
  if (staticOnNode) {
    return false;
  }
  const enclosing = definitionNode.parent;
  const staticOnParent = enclosing ? hasStaticStorageClass(enclosing) : false;
  return !staticOnParent;
}
274
+
275
// Configuration-table entry for C.
const C_CONFIG: TreeSitterLanguageConfig = {
  id: 'c',
  extensions: ['.c', '.h'],
  projectSignals: [
    'Makefile',
    'CMakeLists.txt',
    'meson.build',
    'configure.ac'
  ],
  vendorSubdir: 'c',
  walkOptions: {
    include: ['**/*.h', '**/*.c'],
    exclude: [
      '**/build/**',
      '**/cmake-build-*/**',
      '**/.deps/**',
      '**/node_modules/**'
    ],
    respectInternalConvention: false
  },
  captureKindMap: {
    // struct / union
    'definition.class': 'class',
    'definition.function': 'function',
    // typedef / enum
    'definition.type': 'type-alias'
  },
  docComment: {
    // Doxygen style: both `///` lines and Javadoc-like blocks (`/**` ... `*/`) are
    // accepted as doc comments.
    linePrefixes: ['///'],
    blockOpen: '/**',
    blockClose: '*/'
  },
  bodyNodeTypes: new Set([
    'compound_statement',
    'field_declaration_list',
    'enumerator_list'
  ]),
  isPublic: cIsPublic
};
300
+
301
+ // --- C++ -------------------------------------------------------------------
302
+
303
// Visibility predicate for C++: public unless the captured node or its parent is
// marked `static`.
//
// C++ adds class-member access specifiers on top of C's static-linkage rule. Honoring
// `public:` / `private:` / `protected:` would require walking back to the nearest
// access specifier in the enclosing class, which this first cut does not do. Only the
// static filter is applied, so class members are over-included; a follow-up can
// tighten this.
function cppIsPublic(definitionNode: Node, _source: string, _name: string): boolean {
  const candidates = [definitionNode, definitionNode.parent];
  for (const candidate of candidates) {
    if (candidate && hasStaticStorageClass(candidate)) {
      return false;
    }
  }
  return true;
}
315
+
316
// Configuration-table entry for C++.
const CPP_CONFIG: TreeSitterLanguageConfig = {
  id: 'cpp',
  extensions: [
    '.cpp',
    '.cc',
    '.cxx',
    '.hpp',
    '.hh',
    '.hxx',
    '.h'
  ],
  projectSignals: [
    'CMakeLists.txt',
    'Makefile',
    'meson.build',
    'conanfile.txt',
    'conanfile.py'
  ],
  vendorSubdir: 'cpp',
  walkOptions: {
    include: [
      '**/*.hpp',
      '**/*.hh',
      '**/*.hxx',
      '**/*.h',
      '**/*.cpp',
      '**/*.cc',
      '**/*.cxx'
    ],
    exclude: [
      '**/build/**',
      '**/cmake-build-*/**',
      '**/.deps/**',
      '**/node_modules/**'
    ],
    respectInternalConvention: false
  },
  captureKindMap: {
    'definition.class': 'class',
    'definition.function': 'function',
    'definition.method': 'function',
    'definition.type': 'type-alias'
  },
  docComment: {
    // `///` lines and `/**` ... `*/` blocks are both recognized as doc comments.
    linePrefixes: ['///'],
    blockOpen: '/**',
    blockClose: '*/'
  },
  bodyNodeTypes: new Set([
    'compound_statement',
    'field_declaration_list',
    'enumerator_list',
    'namespace_body'
  ]),
  isPublic: cppIsPublic
};
340
+
341
+ // --- PHP -------------------------------------------------------------------
342
+
343
// Visibility predicate for PHP: public by default, excluded only when a
// `visibility_modifier` or `modifiers` named child mentions `private` or `protected`.
// A declaration with no modifiers, or with an explicit `public`, is included.
function phpIsPublic(definitionNode: Node, _source: string, _name: string): boolean {
  const restricted = /\b(private|protected)\b/;
  const modifierTypes = ['visibility_modifier', 'modifiers'];
  for (let idx = 0; idx < definitionNode.namedChildCount; idx += 1) {
    const entry = definitionNode.namedChild(idx);
    if (!entry || !modifierTypes.includes(entry.type)) {
      continue;
    }
    if (restricted.test(entry.text)) {
      return false;
    }
  }
  return true;
}
358
+
359
// Configuration-table entry for PHP.
const PHP_CONFIG: TreeSitterLanguageConfig = {
  id: 'php',
  extensions: ['.php'],
  projectSignals: ['composer.json'],
  vendorSubdir: 'php',
  walkOptions: {
    include: [
      'src/**/*.php',
      'lib/**/*.php',
      '**/*.php'
    ],
    exclude: [
      '**/vendor/**',
      '**/tests/**',
      '**/Tests/**',
      '**/node_modules/**',
      '**/.phpunit.cache/**'
    ],
    respectInternalConvention: false
  },
  captureKindMap: {
    'definition.class': 'class',
    // PHP's tags.scm reports both `interface` and `trait` as definition.interface;
    // rendering either as an interface-like surface is reasonable.
    'definition.interface': 'interface',
    'definition.function': 'function'
  },
  docComment: {
    // PHPDoc uses the same block shape as Javadoc: `/**` ... `*/`.
    linePrefixes: [],
    blockOpen: '/**',
    blockClose: '*/'
  },
  bodyNodeTypes: new Set([
    'compound_statement',
    'declaration_list',
    'enum_declaration_list'
  ]),
  isPublic: phpIsPublic
};
385
+
386
+ // --- C# --------------------------------------------------------------------
387
+
388
// Visibility predicate for C#: true only when a direct `modifier` named child has the
// exact text `public`.
//
// C# defaults class members to `private` and top-level types to `internal`. The API
// reference is meant to show what a caller in another assembly would see, so an
// explicit `public` is required.
function csharpIsPublic(definitionNode: Node, _source: string, _name: string): boolean {
  let cursor = 0;
  while (cursor < definitionNode.namedChildCount) {
    const entry = definitionNode.namedChild(cursor);
    cursor += 1;
    if (!entry || entry.type !== 'modifier') {
      continue;
    }
    if (entry.text === 'public') {
      return true;
    }
  }
  return false;
}
401
+
402
// Configuration-table entry for C#.
const CSHARP_CONFIG: TreeSitterLanguageConfig = {
  id: 'csharp',
  extensions: ['.cs'],
  projectSignals: [
    'global.json',
    'Directory.Build.props',
    'Directory.Build.targets'
  ],
  vendorSubdir: 'csharp',
  // The C# grammar's WASM is published with an underscore in its name
  // (`tree-sitter-c_sharp.wasm`), so the default `tree-sitter-<id>.wasm` fallback
  // would not find it and the filename is given explicitly.
  wasmFilename: 'tree-sitter-c_sharp.wasm',
  walkOptions: {
    include: ['**/*.cs'],
    exclude: [
      '**/bin/**',
      '**/obj/**',
      '**/Tests/**',
      '**/*.Tests/**',
      '**/node_modules/**'
    ],
    respectInternalConvention: false
  },
  captureKindMap: {
    'definition.class': 'class',
    'definition.interface': 'interface',
    'definition.method': 'function'
  },
  docComment: {
    // XML-doc comments are `///` lines; some codebases also use `/**` ... `*/`
    // blocks, so both are recognized.
    linePrefixes: ['///'],
    blockOpen: '/**',
    blockClose: '*/'
  },
  bodyNodeTypes: new Set([
    'declaration_list',
    'block',
    'enum_member_declaration_list'
  ]),
  isPublic: csharpIsPublic
};
430
+
431
+ // --- Swift -----------------------------------------------------------------
432
+
433
// Visibility predicate for Swift: true for `public` / `open`, false otherwise.
//
// Swift's default access level is `internal`. Only `public` (usable from other
// modules) and `open` (also subclassable / overridable from other modules) belong in
// the API reference; `private`, `fileprivate`, `internal` and unmarked declarations
// are filtered out.
//
// Direct named children are scanned in order. A `modifiers` list is matched by
// keyword anywhere in its text, while a bare `visibility_modifier` is matched by
// exact text. The first child that settles the question decides the result.
function swiftIsPublic(definitionNode: Node, _source: string, _name: string): boolean {
  const exported = /\b(public|open)\b/;
  const hidden = /\b(private|fileprivate|internal)\b/;
  for (let idx = 0; idx < definitionNode.namedChildCount; idx += 1) {
    const entry = definitionNode.namedChild(idx);
    if (!entry) {
      continue;
    }
    switch (entry.type) {
      case 'modifiers': {
        const listText = entry.text;
        // An exporting keyword takes precedence over a restricting one in the same list.
        if (exported.test(listText)) return true;
        if (hidden.test(listText)) return false;
        break;
      }
      case 'visibility_modifier': {
        const level = entry.text;
        if (level === 'public' || level === 'open') return true;
        if (level === 'private' || level === 'fileprivate' || level === 'internal') return false;
        break;
      }
      default:
        break;
    }
  }
  // Nothing explicit was found: the implicit `internal` level is not public.
  return false;
}
459
+
460
// Configuration-table entry for Swift.
const SWIFT_CONFIG: TreeSitterLanguageConfig = {
  id: 'swift',
  extensions: ['.swift'],
  projectSignals: ['Package.swift', 'Podfile', 'project.yml'],
  vendorSubdir: 'swift',
  walkOptions: {
    include: ['Sources/**/*.swift', '**/*.swift'],
    exclude: [
      '**/Tests/**',
      '**/.build/**',
      '**/Pods/**',
      '**/DerivedData/**',
      '**/node_modules/**'
    ],
    respectInternalConvention: false
  },
  captureKindMap: {
    'definition.class': 'class',
    // A Swift `protocol` is the nearest counterpart to an interface.
    'definition.interface': 'interface',
    'definition.method': 'function',
    'definition.function': 'function',
    'definition.property': 'variable'
  },
  docComment: {
    // Swift documentation uses `///` lines as well as `/**` ... `*/` blocks.
    linePrefixes: ['///'],
    blockOpen: '/**',
    blockClose: '*/'
  },
  bodyNodeTypes: new Set([
    'class_body',
    'protocol_body',
    'function_body'
  ]),
  isPublic: swiftIsPublic
};
487
+
488
+ // --- Lua -------------------------------------------------------------------
489
+
490
// Visibility predicate for Lua: a definition is public unless it is declared `local`.
//
// Lua has no visibility keywords; `local function foo()` / `local foo = function() end`
// is the convention for private symbols. Two checks are made:
//   1. the captured node's own text begins with `local` (after optional whitespace);
//   2. the up-to-32 characters of source directly before the node end in `local`
//      followed by whitespace. This covers the case where the keyword sits outside
//      the captured node.
function luaIsPublic(definitionNode: Node, source: string, _name: string): boolean {
  if (/^\s*local\b/.test(definitionNode.text)) {
    return false;
  }
  const nodeStart = definitionNode.startIndex;
  const windowStart = Math.max(0, nodeStart - 32);
  const preceding = source.slice(windowStart, nodeStart);
  return !/\blocal\s+$/.test(preceding);
}
508
+
509
// Configuration-table entry for Lua.
const LUA_CONFIG: TreeSitterLanguageConfig = {
  id: 'lua',
  extensions: ['.lua'],
  // Lua has no canonical project file (a LuaRocks `.rockspec` comes closest). The
  // signals used here are `init.lua` (the Neovim plugin convention) and the
  // `.luarocks` rocks directory.
  projectSignals: ['init.lua', '.luarocks'],
  vendorSubdir: 'lua',
  walkOptions: {
    include: [
      'lua/**/*.lua',
      'src/**/*.lua',
      '**/*.lua'
    ],
    exclude: [
      '**/spec/**',
      '**/test/**',
      '**/.luarocks/**',
      '**/node_modules/**'
    ],
    respectInternalConvention: false
  },
  captureKindMap: {
    'definition.function': 'function',
    'definition.method': 'function'
  },
  docComment: {
    // `--` starts a Lua line comment and LDoc marks doc comments with `---`. Both
    // are accepted; `---` is listed first so the longer prefix is matched before
    // the shorter one.
    linePrefixes: ['---', '--']
  },
  bodyNodeTypes: new Set(['block']),
  isPublic: luaIsPublic
};
535
+
536
+ // --- Scala -----------------------------------------------------------------
537
+
538
// Visibility predicate for Scala: public by default, excluded when a `modifiers`,
// `access_modifier` or `modifier` named child mentions `private` or `protected`.
// All three node types are checked because modifiers may appear either wrapped in a
// list or directly on the definition.
function scalaIsPublic(definitionNode: Node, _source: string, _name: string): boolean {
  const restricted = /\b(private|protected)\b/;
  const modifierTypes = ['modifiers', 'access_modifier', 'modifier'];
  let cursor = 0;
  while (cursor < definitionNode.namedChildCount) {
    const entry = definitionNode.namedChild(cursor);
    cursor += 1;
    if (!entry || !modifierTypes.includes(entry.type)) {
      continue;
    }
    if (restricted.test(entry.text)) {
      return false;
    }
  }
  return true;
}
552
+
553
// Configuration-table entry for Scala.
const SCALA_CONFIG: TreeSitterLanguageConfig = {
  id: 'scala',
  extensions: ['.scala', '.sc'],
  projectSignals: ['build.sbt', 'build.sc', 'pom.xml'],
  vendorSubdir: 'scala',
  walkOptions: {
    include: ['src/**/*.scala', '**/*.scala'],
    exclude: [
      '**/test/**',
      '**/target/**',
      '**/.bloop/**',
      '**/.metals/**',
      '**/node_modules/**'
    ],
    respectInternalConvention: false
  },
  captureKindMap: {
    'definition.class': 'class',
    // trait
    'definition.interface': 'interface',
    'definition.enum': 'enum',
    'definition.function': 'function',
    // singleton object: `class` is the closest existing kind
    'definition.object': 'class'
  },
  docComment: {
    // Scaladoc uses the same block shape as Javadoc: `/**` ... `*/`.
    linePrefixes: [],
    blockOpen: '/**',
    blockClose: '*/'
  },
  bodyNodeTypes: new Set([
    'template_body',
    'block',
    'class_parameters'
  ]),
  isPublic: scalaIsPublic
};
579
+
580
+ // --- Elixir ----------------------------------------------------------------
581
+
582
// Visibility predicate for Elixir: a definition is public unless it is introduced by
// one of the private defining keywords `defp`, `defmacrop`, `defguardp` or `defnp`.
//
// The keyword is looked for in two places, using only the source text:
//   1. at the very start of the captured node. Ten characters are read so that the
//      nine-character keywords (`defmacrop`, `defguardp`) plus the character that
//      terminates them are fully visible; a shorter window can never match them.
//   2. in the text before the node on the SAME line (at most 20 characters), for the
//      case where the captured node begins after the keyword, e.g. at the function
//      name. The lookback stops at the line start so a private one-liner on the
//      previous line cannot hide the public definition that follows it.
//
// `source` is the full file text and `_definitionNode.startIndex` is used as an index
// into it. The symbol name is not needed.
function elixirIsPublic(_definitionNode: Node, source: string, _name: string): boolean {
  const startIdx = _definitionNode.startIndex;

  const ahead = source.slice(startIdx, startIdx + 10);
  if (/^(?:defp|defmacrop|defguardp|defnp)\b/.test(ahead)) {
    return false;
  }

  // Start of the line containing the node; 0 when the node is on the first line.
  const lineStart = startIdx > 0 ? source.lastIndexOf('\n', startIdx - 1) + 1 : 0;
  const behind = source.slice(Math.max(lineStart, startIdx - 20), startIdx);
  if (/\b(?:defp|defmacrop|defguardp|defnp)\b/.test(behind)) {
    return false;
  }

  return true;
}
594
+
595
// Configuration-table entry for Elixir.
const ELIXIR_CONFIG: TreeSitterLanguageConfig = {
  id: 'elixir',
  extensions: ['.ex', '.exs'],
  projectSignals: ['mix.exs'],
  vendorSubdir: 'elixir',
  walkOptions: {
    include: ['lib/**/*.ex', '**/*.ex'],
    exclude: [
      '**/test/**',
      '**/_build/**',
      '**/deps/**',
      '**/.elixir_ls/**',
      '**/node_modules/**'
    ],
    respectInternalConvention: false
  },
  captureKindMap: {
    'definition.module': 'class',
    'definition.function': 'function'
  },
  docComment: {
    // Elixir prose normally lives in a `@doc """ ... """` heredoc placed before the
    // def; supporting that needs extractor-level work beyond this first cut. Only
    // the simpler `#`-prefixed line comments, also seen in libraries, are
    // recognized here.
    linePrefixes: ['#']
  },
  bodyNodeTypes: new Set(['do_block', 'block']),
  isPublic: elixirIsPublic
};
619
+
620
+ // --- OCaml -----------------------------------------------------------------
621
+
622
/**
 * Visibility predicate for OCaml: every captured definition is reported as public.
 *
 * OCaml expresses visibility through module signatures (`.mli` files): whatever a
 * signature exposes is public, and anything in a `.ml` file is reachable from outside
 * unless a shipped signature hides it. Signature files are not consulted in this first
 * cut; honoring them is left as a future enhancement.
 *
 * @returns Always `true`; all three parameters are ignored.
 */
function ocamlIsPublic(_definitionNode: Node, _source: string, _name: string): boolean {
  return true;
}
630
+
631
// OCaml language registration: implementation (`.ml`) and signature (`.mli`) files, the
// mapping from tags.scm captures to API symbol kinds, and the doc-comment delimiters.
const OCAML_CONFIG: TreeSitterLanguageConfig = {
  id: 'ocaml',
  extensions: ['.ml', '.mli'],
  projectSignals: ['dune-project', 'dune', '_oasis'],
  vendorSubdir: 'ocaml',
  walkOptions: {
    include: ['**/*.ml', '**/*.mli'],
    exclude: ['**/_build/**', '**/.merlin', '**/node_modules/**'],
    respectInternalConvention: false
  },
  captureKindMap: {
    // Modules have no dedicated kind; `class` is the closest analogue.
    'definition.module': 'class',
    'definition.interface': 'interface',
    'definition.class': 'class',
    'definition.function': 'function',
    'definition.method': 'function'
  },
  // Documentation comments are `(** ... *)` blocks; the doubled asterisk is what sets
  // them apart from ordinary `(* ... *)` comments.
  docComment: {
    linePrefixes: [],
    blockOpen: '(**',
    blockClose: '*)'
  },
  bodyNodeTypes: new Set(['structure', 'signature', 'module_binding']),
  isPublic: ocamlIsPublic
};
658
+
659
+ // --- Kotlin ----------------------------------------------------------------
660
+
661
/**
 * Visibility predicate for Kotlin.
 *
 * Kotlin declarations are public unless they carry an explicit `private`, `protected`,
 * or `internal` modifier. The named children of the definition are scanned for a
 * modifier node (`modifiers`, `modifier`, or `visibility_modifier`) whose text contains
 * one of those keywords as a whole word.
 *
 * @param definitionNode Captured definition whose named children are inspected.
 * @returns `false` as soon as a restricting modifier is found, otherwise `true`.
 */
function kotlinIsPublic(definitionNode: Node, _source: string, _name: string): boolean {
  const modifierNodeTypes = ['modifiers', 'modifier', 'visibility_modifier'];
  const restrictedVisibility = /\b(private|protected|internal)\b/;

  let index = 0;
  while (index < definitionNode.namedChildCount) {
    const candidate = definitionNode.namedChild(index);
    index += 1;
    // Null slots and non-modifier children are skipped without reading their text.
    if (candidate && modifierNodeTypes.includes(candidate.type) && restrictedVisibility.test(candidate.text)) {
      return false;
    }
  }
  return true;
}
675
+
676
// Kotlin language registration: source and script files, the mapping from tags.scm
// captures to API symbol kinds, and the doc-comment delimiters.
const KOTLIN_CONFIG: TreeSitterLanguageConfig = {
  id: 'kotlin',
  extensions: ['.kt', '.kts'],
  projectSignals: [
    'build.gradle.kts',
    'settings.gradle.kts',
    'build.gradle',
    'pom.xml'
  ],
  vendorSubdir: 'kotlin',
  walkOptions: {
    include: ['src/**/*.kt', 'src/**/*.kts', '**/*.kt', '**/*.kts'],
    exclude: ['**/test/**', '**/build/**', '**/.gradle/**', '**/node_modules/**'],
    respectInternalConvention: false
  },
  captureKindMap: {
    'definition.class': 'class',
    'definition.function': 'function'
  },
  // KDoc uses the same block-comment delimiters as Javadoc; there is no line form.
  docComment: {
    linePrefixes: [],
    blockOpen: '/**',
    blockClose: '*/'
  },
  bodyNodeTypes: new Set(['class_body', 'function_body', 'enum_class_body', 'block']),
  isPublic: kotlinIsPublic
};
699
+
700
+ // --- Bash ------------------------------------------------------------------
701
+
702
/**
 * Visibility predicate for Bash: every function is reported as public.
 *
 * Bash has no language-level visibility — any function defined in a script can be called
 * by any caller in the same shell — so nothing is filtered out.
 *
 * @returns Always `true`; all three parameters are ignored.
 */
function bashIsPublic(_definitionNode: Node, _source: string, _name: string): boolean {
  return true;
}
707
+
708
// Bash language registration. Shell scripts have no canonical project-marker file, so
// `projectSignals` is empty and `requireExtensionPresent` asks detection to claim the
// project only when at least one `.sh` / `.bash` file is found under the root.
const BASH_CONFIG: TreeSitterLanguageConfig = {
  id: 'bash',
  extensions: ['.sh', '.bash'],
  projectSignals: [],
  requireExtensionPresent: true,
  vendorSubdir: 'bash',
  walkOptions: {
    include: ['**/*.sh', '**/*.bash'],
    exclude: ['**/node_modules/**', '**/.git/**'],
    respectInternalConvention: false
  },
  captureKindMap: {
    'definition.function': 'function'
  },
  // `#` line comments are the only comment form Bash has.
  docComment: {
    linePrefixes: ['#']
  },
  bodyNodeTypes: new Set(['compound_statement']),
  isPublic: bashIsPublic
};
731
+
732
// Every language the tree-sitter extractor knows about. The order is significant:
// extension lookup returns the first config that lists the extension, and project
// detection probes the configs in this same sequence.
const LANGUAGES: readonly TreeSitterLanguageConfig[] = [
  RUST_CONFIG, GO_CONFIG, JAVA_CONFIG, RUBY_CONFIG, C_CONFIG,
  CPP_CONFIG, PHP_CONFIG, CSHARP_CONFIG, SWIFT_CONFIG, LUA_CONFIG,
  SCALA_CONFIG, ELIXIR_CONFIG, OCAML_CONFIG, KOTLIN_CONFIG, BASH_CONFIG
];
749
+
750
// Directory that holds this module file, whichever layout it is running from.
const moduleDir = path.dirname(fileURLToPath(import.meta.url));

/**
 * Locate the `vendor/tree-sitter` directory by climbing from this module's directory.
 *
 * Running the TypeScript sources directly and running the built JavaScript place this
 * module at different depths below the project root, so the directory is searched for
 * rather than addressed with a fixed relative path. The climb gives up after eight
 * levels, or earlier when the filesystem root is reached.
 *
 * @returns Absolute path of the first `vendor/tree-sitter` directory found, or `null`.
 */
function resolveVendorRoot(): string | null {
  const maxLevels = 8;
  let current = moduleDir;
  let level = 0;
  while (level < maxLevels) {
    const vendorDir = path.join(current, 'vendor', 'tree-sitter');
    if (existsSync(vendorDir)) {
      return vendorDir;
    }
    const oneUp = path.dirname(current);
    if (oneUp === current) {
      // `dirname` is a fixed point at the filesystem root: nothing left to climb.
      return null;
    }
    current = oneUp;
    level += 1;
  }
  return null;
}
770
+
771
// Memoized result of `Parser.init()`; stays `null` until the first caller needs it.
let parserInitPromise: Promise<void> | null = null;

/**
 * Initialize the tree-sitter runtime at most once.
 *
 * The first call starts `Parser.init()` and remembers its promise; every later call
 * returns that same promise.
 */
async function ensureParserInit(): Promise<void> {
  const pending = parserInitPromise ?? Parser.init();
  parserInitPromise = pending;
  return pending;
}
778
+
779
/** A grammar that is ready to use: its config, the loaded language, and its tags query. */
interface LoadedGrammar {
  /** Language configuration the grammar was loaded for. */
  config: TreeSitterLanguageConfig;
  /** Language object loaded from the vendored `.wasm` file. */
  language: Language;
  /** Query compiled from the vendored `tags.scm`. */
  query: Query;
}

// In-flight and completed grammar loads keyed by language id. Promises (not results) are
// stored so concurrent callers share a single load.
const loadedGrammars = new Map<string, Promise<LoadedGrammar | null>>();

/**
 * Load (or fetch from cache) the vendored grammar and tags query for a language.
 *
 * @param config Language whose grammar should be loaded.
 * @returns The loaded grammar, or `null` when the vendor directory, the `.wasm` file, or
 *   `tags.scm` is not present on disk. A `null` outcome is cached.
 * @throws Whatever parser initialization, `Language.load`, reading `tags.scm`, or `Query`
 *   construction throws. A failed load is removed from the cache, so a later call
 *   retries instead of replaying the same rejection for the rest of the process.
 */
async function loadGrammar(config: TreeSitterLanguageConfig): Promise<LoadedGrammar | null> {
  const cached = loadedGrammars.get(config.id);
  if (cached !== undefined) {
    return cached;
  }
  const promise = (async (): Promise<LoadedGrammar | null> => {
    const vendorRoot = resolveVendorRoot();
    if (!vendorRoot) {
      return null;
    }
    const wasmFilename = config.wasmFilename ?? `tree-sitter-${config.id}.wasm`;
    const wasmPath = path.join(vendorRoot, config.vendorSubdir, wasmFilename);
    const tagsScmPath = path.join(vendorRoot, config.vendorSubdir, 'tags.scm');
    if (!existsSync(wasmPath) || !existsSync(tagsScmPath)) {
      return null;
    }
    await ensureParserInit();
    const language = await Language.load(wasmPath);
    const queryText = await fs.readFile(tagsScmPath, 'utf8');
    const query = new Query(language, queryText);
    return { config, language, query };
  })();
  loadedGrammars.set(config.id, promise);
  // Evict on failure. The identity check keeps a newer entry (stored after a cache reset)
  // from being deleted by an older, slower failure. Callers awaiting `promise` still
  // observe the rejection; this handler only observes it for cache maintenance.
  void promise.catch(() => {
    if (loadedGrammars.get(config.id) === promise) {
      loadedGrammars.delete(config.id);
    }
  });
  return promise;
}
812
+
813
/**
 * Forget every cached grammar load.
 *
 * Intended for tests only: it lets a test simulate a cold load or swap the vendored
 * bundles between runs.
 */
export function resetTreeSitterGrammarCache(): void {
  loadedGrammars.clear();
}
818
+
819
/**
 * Find the language configuration responsible for a file, judged by its extension.
 *
 * The extension is compared case-insensitively (it is lower-cased first). When several
 * configs list the same extension, the one registered first wins.
 *
 * @param filePath Path whose extension selects the language.
 * @returns The matching config, or `null` when no registered language claims it.
 */
function languageForExtension(filePath: string): TreeSitterLanguageConfig | null {
  const extension = path.extname(filePath).toLowerCase();
  return LANGUAGES.find((config) => config.extensions.includes(extension)) ?? null;
}
828
+
829
// Fallback include list for configs that do not set `walkOptions.include`: one
// match-anywhere glob per registered extension, e.g. `.rs` becomes `**/*.rs`.
function defaultIncludeFor(config: TreeSitterLanguageConfig): string[] {
  const patterns: string[] = [];
  for (const extension of config.extensions) {
    patterns.push('**/*' + extension);
  }
  return patterns;
}
834
+
835
/**
 * Return the node of the first capture carrying the given name.
 *
 * @param captures Captures of a single query match, in match order.
 * @param name Capture name to look for (without the leading `@`).
 * @returns The captured node, or `undefined` when no capture has that name.
 */
function findCaptureNode(captures: QueryCapture[], name: string): Node | undefined {
  for (const capture of captures) {
    if (capture.name === name) {
      return capture.node;
    }
  }
  return undefined;
}
838
+
839
// Ranking of `@definition.*` capture names, most preferred first. Several grammars can
// tag one node more than once — Swift's matches a class method as both
// `definition.method` and `definition.function`, and PHP captures `interface` and `trait`
// both as `definition.interface` — so the kind is chosen by this ranking rather than by
// whichever pattern tree-sitter happened to iterate first.
const DEFINITION_CAPTURE_PRIORITY: readonly string[] = [
  'definition.class',
  'definition.interface',
  'definition.enum',
  'definition.method',
  'definition.function',
  'definition.macro',
  'definition.module',
  'definition.type',
  'definition.field',
  'definition.property',
  'definition.object'
];

/**
 * Rank a capture name: smaller numbers are preferred.
 *
 * @param name Capture name such as `definition.class`.
 * @returns The name's position in `DEFINITION_CAPTURE_PRIORITY`, or
 *   `Number.MAX_SAFE_INTEGER` for names that are not listed (lowest priority).
 */
function definitionCapturePriority(name: string): number {
  const position = DEFINITION_CAPTURE_PRIORITY.findIndex((entry) => entry === name);
  if (position >= 0) {
    return position;
  }
  return Number.MAX_SAFE_INTEGER;
}
862
+
863
/**
 * Pick the definition capture of a match.
 *
 * By tags.scm convention the whole definition node is captured as `@definition.<kind>`
 * and the symbol's name as `@name`. When one match carries several definition captures,
 * the one with the best (lowest) priority is chosen so the reported kind stays
 * deterministic; among equal priorities the earliest capture wins.
 *
 * @param captures Captures of a single query match.
 * @returns The winning capture together with its capture name, or `null` when the match
 *   has no `definition.*` capture.
 */
function findCaptureNodeForDefinition(captures: QueryCapture[]): { capture: QueryCapture; kindCaptureName: string } | null {
  let winner: QueryCapture | null = null;
  let winnerPriority = Number.POSITIVE_INFINITY;
  for (const candidate of captures) {
    if (!candidate.name.startsWith('definition.')) {
      continue;
    }
    const candidatePriority = definitionCapturePriority(candidate.name);
    // Strict comparison: a later capture must be strictly better to replace the winner.
    if (winner === null || candidatePriority < winnerPriority) {
      winner = candidate;
      winnerPriority = candidatePriority;
    }
  }
  if (winner === null) {
    return null;
  }
  return { capture: winner, kindCaptureName: winner.name };
}
878
+
879
// Node types that grammars use for comments. `comment` is the most common; Rust, Java, C
// and C++ use `line_comment` / `block_comment`; Kotlin uses `multiline_comment` for its
// block form; Scala uses `block_comment`. The set is deliberately wide.
const COMMENT_NODE_TYPES = new Set(['line_comment', 'block_comment', 'comment', 'multiline_comment']);

/**
 * Find the comment node from which doc-comment collection should start.
 *
 * The comment is not always a direct previous sibling of the captured node:
 *   - C captures `function_declarator`, whose previous sibling is the type specifier;
 *     the comment sits before the wrapping `declaration`, one level up.
 *   - fwcd's Kotlin grammar absorbs a trailing doc block into the preceding
 *     `package_header`, so the comment is the last named descendant of that sibling.
 *
 * For the definition and up to three of its ancestors, the previous named sibling is
 * inspected: if it is a comment it is returned; otherwise its chain of last named
 * children is followed looking for a trailing comment.
 *
 * @param definitionNode The captured definition.
 * @returns The comment node to start from, or `null` when none is found in range.
 */
function findStartingDocCursor(definitionNode: Node): Node | null {
  const maxLevels = 4;
  let level = 0;
  for (let anchor: Node | null = definitionNode; anchor && level < maxLevels; anchor = anchor.parent, level += 1) {
    const before = anchor.previousNamedSibling;
    if (!before) {
      continue;
    }
    if (COMMENT_NODE_TYPES.has(before.type)) {
      return before;
    }
    // Follow the right-most named descendants of the sibling: a comment absorbed into it
    // shows up as the last named child somewhere along this chain.
    let tail: Node | null = before;
    while (tail && tail.namedChildCount > 0) {
      const last = tail.namedChild(tail.namedChildCount - 1);
      if (!last) {
        break;
      }
      if (COMMENT_NODE_TYPES.has(last.type)) {
        return last;
      }
      tail = last;
    }
  }
  return null;
}
918
+
919
/**
 * Gather the doc comment that precedes a definition.
 *
 * Starting from the comment located by `findStartingDocCursor`, the walk moves backward
 * over previous *named* siblings — unnamed punctuation and newline tokens therefore never
 * break the chain — and keeps going while each node is a comment that matches the
 * language's doc-comment rule:
 *   - a line comment matches when it starts with one of `rule.linePrefixes`; the prefix
 *     and the whitespace after it are removed;
 *   - otherwise a block comment matches when it starts with `rule.blockOpen` and ends
 *     with `rule.blockClose`; the delimiters and each line's leading `*` gutter are
 *     removed.
 * The first node that is not a comment, or is a comment matching neither form, ends the
 * walk.
 *
 * @param definitionNode The captured definition.
 * @param source Full text of the file the node was parsed from.
 * @param rule Doc-comment convention of the language.
 * @returns The collected text in source order, or `null` when nothing non-empty was found.
 */
function collectAdjacentDocComment(definitionNode: Node, source: string, rule: DocCommentRule): string | null {
  const collected: string[] = [];
  for (
    let current: Node | null = findStartingDocCursor(definitionNode);
    current !== null && COMMENT_NODE_TYPES.has(current.type);
    current = current.previousNamedSibling
  ) {
    const text = source.slice(current.startIndex, current.endIndex);
    let docText: string | null = null;

    const linePrefix = rule.linePrefixes.find((prefix) => text.startsWith(prefix));
    if (linePrefix !== undefined) {
      docText = text.slice(linePrefix.length).trimStart();
    } else if (
      rule.blockOpen &&
      rule.blockClose &&
      text.startsWith(rule.blockOpen) &&
      text.endsWith(rule.blockClose)
    ) {
      const interior = text.slice(rule.blockOpen.length, text.length - rule.blockClose.length);
      const unguttered = interior.split(/\r?\n/).map((line) => line.replace(/^\s*\*\s?/, ''));
      docText = unguttered.join('\n').trim();
    }

    if (docText === null) {
      // A comment that is not a doc comment ends the contiguous run.
      break;
    }
    // The walk runs bottom-up, so prepend to restore source order.
    collected.unshift(docText);
  }

  const result = collected.join('\n').trim();
  return result.length > 0 ? result : null;
}
960
+
961
/**
 * Produce a compact signature for a definition by cutting off its body.
 *
 * The signature is the source text from the start of the node up to, but excluding, its
 * first body child (block, class body, field list, ... — whichever node types the
 * language registers). Definitions without a body, such as type aliases, keep their full
 * text. The result is trimmed at both ends.
 *
 * @param node The definition node.
 * @param source Full text of the file the node was parsed from.
 * @param bodyNodeTypes Node types that count as a body for this language.
 * @returns The trimmed signature text.
 */
function buildSignature(node: Node, source: string, bodyNodeTypes: ReadonlySet<string>): string {
  const body = findBodyChild(node, bodyNodeTypes);
  const stopIndex = body ? body.startIndex : node.endIndex;
  // `trim()` already strips all trailing whitespace, so no further cleanup is required.
  return source.slice(node.startIndex, stopIndex).trim();
}
975
+
976
/**
 * Return the first direct child (named or anonymous) whose type is a body node type.
 *
 * @param node Node whose children are scanned in order.
 * @param bodyNodeTypes Node types that count as a body.
 * @returns The first matching child, or `null` when there is none.
 */
function findBodyChild(node: Node, bodyNodeTypes: ReadonlySet<string>): Node | null {
  let index = 0;
  while (index < node.childCount) {
    const candidate = node.child(index);
    if (candidate && bodyNodeTypes.has(candidate.type)) {
      return candidate;
    }
    index += 1;
  }
  return null;
}
985
+
986
/**
 * Turn a project-relative source path into the slug of its API page.
 *
 * Backslashes become forward slashes, one leading `./` is removed, the final extension
 * is dropped, one leading `src/` is removed, and the result is prefixed with `api/`.
 *
 * @param relativeSourcePath Path of the source file relative to the project root.
 * @returns The slug, e.g. `src/lib/foo.rs` yields `api/lib/foo`.
 */
function deriveModuleSlug(relativeSourcePath: string): string {
  let slug = relativeSourcePath.split('\\').join('/');
  if (slug.startsWith('./')) {
    slug = slug.slice(2);
  }
  slug = slug.replace(/\.[a-z0-9]+$/i, '');
  if (slug.startsWith('src/')) {
    slug = slug.slice(4);
  }
  return 'api/' + slug;
}
992
+
993
/**
 * Extract the module-level documentation (`//!` lines) from the head of a Rust file.
 *
 * The leading lines of the file are scanned. Before the first `//!` line, the following
 * prelude lines are skipped because they commonly precede module docs in real files:
 *   - blank lines;
 *   - a shebang (`#!/usr/bin/env ...`) or inner attributes (`#![deny(warnings)]`,
 *     `#![cfg_attr(...)]`, ...) — both start with `#!`;
 *   - ordinary line comments (`// ...`, or four-or-more-slash `////...`), e.g. a licence
 *     or copyright header.
 * Any other line before the first `//!` — including a `///` outer doc comment, which
 * documents the next item rather than the module — means the file has no module docs.
 *
 * Once collection has started, consecutive `//!` lines are gathered, blank lines are
 * kept as paragraph breaks, and the first other line ends the block.
 *
 * @param source Full text of the Rust file.
 * @returns The doc text with the `//!` markers removed, or `null` when there is none.
 */
function extractFileDocCommentRust(source: string): string | null {
  const collected: string[] = [];
  for (const line of source.split(/\r?\n/)) {
    const trimmed = line.trimStart();

    if (trimmed.startsWith('//!')) {
      collected.push(trimmed.slice(3).trimStart());
      continue;
    }

    if (collected.length > 0) {
      // Inside the doc block: a blank line is a paragraph break, anything else ends it.
      if (trimmed.length > 0) {
        break;
      }
      collected.push('');
      continue;
    }

    // Still in the prelude, before any `//!` line has been seen.
    const isBlank = trimmed.length === 0;
    const isShebangOrInnerAttribute = trimmed.startsWith('#!');
    // `//!` was handled above; `///` (exactly three slashes) is an outer doc comment and
    // must not be skipped, while `//` and `////...` are ordinary comments.
    const isPlainLineComment =
      trimmed.startsWith('//') && (!trimmed.startsWith('///') || trimmed.startsWith('////'));
    if (isBlank || isShebangOrInnerAttribute || isPlainLineComment) {
      continue;
    }
    break;
  }
  const body = collected.join('\n').trim();
  return body.length > 0 ? body : null;
}
1028
+
1029
/**
 * Parse one source file with a loaded grammar and return its public API symbols.
 *
 * Extraction runs in two passes so the reported kind is deterministic when several
 * tags.scm patterns fire for the same node (tree-sitter iterates matches per pattern,
 * not by source priority):
 *   1. every match contributes its best `definition.*` capture; candidates are keyed by
 *      the definition node's byte range and only the best-priority candidate per node is
 *      kept;
 *   2. each surviving candidate that passes the language's visibility predicate is
 *      turned into a symbol (name, kind, signature, doc comment, 1-based source line).
 *
 * The parser and the syntax tree are released with `delete()` before returning, on both
 * the success and the failure path; the returned symbols hold only strings and numbers,
 * never nodes.
 *
 * @param loaded Grammar, query, and config to extract with.
 * @param sourcePath File to read; resolved against `rootDir` when relative.
 * @param rootDir Project root used to compute the file's relative path.
 * @returns The file's relative path, module slug, symbols sorted by source line, and the
 *   file-level doc comment (Rust only, otherwise `null`).
 * @throws When the file cannot be read or tree-sitter returns no tree.
 */
async function extractWithGrammar(
  loaded: LoadedGrammar,
  sourcePath: string,
  rootDir: string
): Promise<ApiFileReference> {
  const absolute = path.isAbsolute(sourcePath) ? sourcePath : path.resolve(rootDir, sourcePath);
  const relative = path.relative(rootDir, absolute).replace(/\\/g, '/');
  const source = await fs.readFile(absolute, 'utf8');

  interface Candidate {
    node: Node;
    nameNode: Node;
    kindCaptureName: string;
    priority: number;
  }

  const symbols: ApiSymbol[] = [];
  const parser = new Parser();
  try {
    parser.setLanguage(loaded.language);
    const tree = parser.parse(source);
    if (!tree) {
      throw new Error(`tree-sitter failed to parse ${relative}`);
    }
    try {
      // Pass 1 — best candidate per definition node.
      const candidatesByNode = new Map<string, Candidate>();
      for (const match of loaded.query.matches(tree.rootNode)) {
        const definition = findCaptureNodeForDefinition(match.captures);
        if (!definition) continue;
        if (!loaded.config.captureKindMap[definition.kindCaptureName]) continue;
        const definitionNode = definition.capture.node;
        const nameNode = findCaptureNode(match.captures, 'name');
        // A match without a usable name is skipped, but the node is not ruled out:
        // nothing is recorded for it here, so another match targeting the same node can
        // still register a candidate.
        if (!nameNode || !nameNode.text) continue;
        const priority = definitionCapturePriority(definition.kindCaptureName);
        const dedupeKey = `${definitionNode.startIndex}:${definitionNode.endIndex}`;
        const existing = candidatesByNode.get(dedupeKey);
        if (!existing || priority < existing.priority) {
          candidatesByNode.set(dedupeKey, {
            node: definitionNode,
            nameNode,
            kindCaptureName: definition.kindCaptureName,
            priority
          });
        }
      }

      // Pass 2 — build symbols from the winners. Everything copied out of the tree here
      // is a plain string or number, so it stays valid after the tree is deleted.
      for (const candidate of candidatesByNode.values()) {
        const kind = loaded.config.captureKindMap[candidate.kindCaptureName];
        if (!kind) continue;
        const name = candidate.nameNode.text;
        if (!loaded.config.isPublic(candidate.node, source, name)) continue;

        const signature = buildSignature(candidate.node, source, loaded.config.bodyNodeTypes);
        const docComment = collectAdjacentDocComment(candidate.node, source, loaded.config.docComment);
        const sourceLine = candidate.node.startPosition.row + 1;

        symbols.push({
          name,
          kind,
          signature,
          docComment,
          tags: [],
          sourceLine,
          isDeprecated: false
        });
      }
    } finally {
      tree.delete();
    }
  } finally {
    parser.delete();
  }

  symbols.sort((a, b) => a.sourceLine - b.sourceLine);

  // File-level doc comment: language-specific. Rust's `//!` lives at file head.
  const fileDocComment = loaded.config.id === 'rust' ? extractFileDocCommentRust(source) : null;

  return {
    sourcePath: relative,
    moduleSlug: deriveModuleSlug(relative),
    symbols,
    fileDocComment
  };
}
1122
+
1123
/**
 * Report whether a path can be accessed.
 *
 * @param filePath Path to probe.
 * @returns `true` when `fs.access` succeeds, `false` when it fails for any reason.
 */
async function exists(filePath: string): Promise<boolean> {
  let reachable = true;
  try {
    await fs.access(filePath);
  } catch {
    reachable = false;
  }
  return reachable;
}
1131
+
1132
/**
 * Language extractor backed by vendored tree-sitter grammars.
 *
 * - `detect` claims a project when some registered language both signals its presence
 *   and has a loadable grammar.
 * - `walk` lists the source files to extract.
 * - `extract` parses one file and returns its API symbols.
 */
export const treeSitterExtractor: LanguageExtractor = {
  id: 'tree-sitter',

  /**
   * Claim the project iff (a) some registered language has one of its project-signal
   * files in the root — or, for languages without signal files that set
   * `requireExtensionPresent`, at least one matching source file — AND (b) that
   * language's vendored grammar can be loaded. Grammar loads are cached, so repeated
   * calls are cheap.
   */
  async detect(rootDir: string): Promise<boolean> {
    for (const config of LANGUAGES) {
      let signalMatched = false;
      if (config.projectSignals.length > 0) {
        for (const signal of config.projectSignals) {
          if (await exists(path.join(rootDir, signal))) {
            signalMatched = true;
            break;
          }
        }
      } else if (config.requireExtensionPresent) {
        // Content-based detect: `limit: 1` stops the walk at the first hit, so projects
        // without such files do not pay for collecting a full file list.
        const include = config.walkOptions?.include ?? defaultIncludeFor(config);
        const exclude = config.walkOptions?.exclude;
        const found = await walkProjectSources(rootDir, { include, exclude, respectInternalConvention: false, limit: 1 });
        if (found.length > 0) signalMatched = true;
      } else {
        // Neither signals nor `requireExtensionPresent`: never claim. This guards against
        // a future config accidentally hijacking signal-less projects.
        continue;
      }
      if (!signalMatched) continue;
      const loaded = await loadGrammar(config);
      if (loaded) {
        return true;
      }
    }
    return false;
  },

  /**
   * List source files under `rootDir`. Explicit `options` are honored as-is; without
   * them, the per-language defaults of every loadable language are unioned so a
   * mixed-language project is covered in one call.
   *
   * @returns Paths as produced by `walkProjectSources`; in the default (union) mode they
   *   are de-duplicated and sorted.
   */
  async walk(rootDir: string, options?: WalkOptions): Promise<string[]> {
    if (options) {
      return walkProjectSources(rootDir, options);
    }
    // A Set both removes paths matched by more than one language and avoids spreading a
    // potentially huge array into `push(...)`, which can exceed the engine's argument
    // limit on very large trees.
    const collected = new Set<string>();
    for (const config of LANGUAGES) {
      // Skip languages that can't be loaded — no point walking files we can't parse.
      const loaded = await loadGrammar(config);
      if (!loaded) continue;
      const include = config.walkOptions?.include ?? defaultIncludeFor(config);
      const exclude = config.walkOptions?.exclude;
      const respectInternalConvention = config.walkOptions?.respectInternalConvention ?? false;
      const found = await walkProjectSources(rootDir, { include, exclude, respectInternalConvention });
      for (const file of found) {
        collected.add(file);
      }
    }
    return Array.from(collected).sort();
  },

  /**
   * Extract the API symbols of one file.
   *
   * @param sourcePath File to extract; its extension selects the language.
   * @param options `rootDir` defaults to the current working directory.
   * @throws When no registered language matches the extension, or when the language's
   *   vendored grammar cannot be loaded.
   */
  async extract(sourcePath: string, options?: { rootDir?: string }): Promise<ApiFileReference> {
    const rootDir = options?.rootDir ?? process.cwd();
    const config = languageForExtension(sourcePath);
    if (!config) {
      throw new Error(`treeSitterExtractor.extract: no configured language matches extension of ${sourcePath}`);
    }
    const loaded = await loadGrammar(config);
    if (!loaded) {
      // Report the filename the loader actually looks for, including any per-language
      // `wasmFilename` override.
      const expectedWasm = config.wasmFilename ?? `tree-sitter-${config.id}.wasm`;
      throw new Error(
        `treeSitterExtractor.extract: vendored grammar for ${config.id} is missing — expected vendor/tree-sitter/${config.vendorSubdir}/${expectedWasm}`
      );
    }
    return extractWithGrammar(loaded, sourcePath, rootDir);
  }
};