sweet-search 2.5.2 → 2.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. package/core/cli.js +24 -3
  2. package/core/graph/graph-expansion.js +215 -36
  3. package/core/graph/graph-extractor.js +196 -11
  4. package/core/graph/graph-search.js +395 -92
  5. package/core/graph/hcgs-generator.js +2 -1
  6. package/core/graph/index.js +2 -0
  7. package/core/graph/repo-map.js +28 -6
  8. package/core/graph/structural-answer-cues.js +168 -0
  9. package/core/graph/structural-callsite-hints.js +40 -0
  10. package/core/graph/structural-context-format.js +40 -0
  11. package/core/graph/structural-context.js +450 -0
  12. package/core/graph/structural-forward-push.js +156 -0
  13. package/core/graph/structural-header-context.js +19 -0
  14. package/core/graph/structural-importance.js +148 -0
  15. package/core/graph/structural-pagerank.js +197 -0
  16. package/core/graph/summary-manager.js +13 -9
  17. package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
  18. package/core/incremental-indexing/application/file-watcher.mjs +197 -0
  19. package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
  20. package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
  21. package/core/incremental-indexing/application/operator-cli.mjs +554 -0
  22. package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
  23. package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
  24. package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
  25. package/core/incremental-indexing/application/reconciler.mjs +477 -0
  26. package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
  27. package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
  28. package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
  29. package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
  30. package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
  31. package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
  32. package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
  33. package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
  34. package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
  35. package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
  36. package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
  37. package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
  38. package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
  39. package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
  40. package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
  41. package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
  42. package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
  43. package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
  44. package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
  45. package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
  46. package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
  47. package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
  48. package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
  49. package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
  50. package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
  51. package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
  52. package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
  53. package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
  54. package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
  55. package/core/indexing/admission-policy.js +139 -0
  56. package/core/indexing/artifact-builder.js +29 -12
  57. package/core/indexing/ast-chunker.js +107 -30
  58. package/core/indexing/dedup/exemplar-selector.js +19 -1
  59. package/core/indexing/gitignore-filter.js +223 -0
  60. package/core/indexing/incremental-tracker.js +99 -30
  61. package/core/indexing/index-codebase-v21.js +6 -5
  62. package/core/indexing/index-maintainer.mjs +698 -6
  63. package/core/indexing/indexer-ann.js +99 -15
  64. package/core/indexing/indexer-build.js +158 -45
  65. package/core/indexing/indexer-empty-baseline.js +80 -0
  66. package/core/indexing/indexer-manifest.js +66 -0
  67. package/core/indexing/indexer-phases.js +56 -23
  68. package/core/indexing/indexer-sparse-gram.js +54 -13
  69. package/core/indexing/indexer-utils.js +26 -208
  70. package/core/indexing/indexing-file-policy.js +32 -7
  71. package/core/indexing/maintainer-launcher.mjs +137 -0
  72. package/core/indexing/merkle-tracker.js +251 -244
  73. package/core/indexing/model-pool.js +46 -5
  74. package/core/infrastructure/code-graph-repository.js +758 -6
  75. package/core/infrastructure/code-graph-visibility.js +157 -0
  76. package/core/infrastructure/codebase-repository.js +100 -13
  77. package/core/infrastructure/config/search.js +1 -1
  78. package/core/infrastructure/db-utils.js +118 -0
  79. package/core/infrastructure/dedup-hashing.js +10 -13
  80. package/core/infrastructure/hardware-capability.js +17 -7
  81. package/core/infrastructure/index.js +8 -2
  82. package/core/infrastructure/language-patterns/maps.js +4 -1
  83. package/core/infrastructure/language-patterns/registry-core.js +56 -17
  84. package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
  85. package/core/infrastructure/language-patterns.js +69 -0
  86. package/core/infrastructure/model-registry.js +20 -0
  87. package/core/infrastructure/native-inference.js +7 -12
  88. package/core/infrastructure/native-resolver.js +52 -37
  89. package/core/infrastructure/native-sparse-gram.js +261 -20
  90. package/core/infrastructure/native-tokenizer.js +6 -15
  91. package/core/infrastructure/simd-distance.js +10 -16
  92. package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
  93. package/core/infrastructure/structural-alias-resolver.js +122 -0
  94. package/core/infrastructure/structural-candidate-ranker.js +34 -0
  95. package/core/infrastructure/structural-context-repository.js +472 -0
  96. package/core/infrastructure/structural-context-utils.js +51 -0
  97. package/core/infrastructure/structural-graph-signals.js +121 -0
  98. package/core/infrastructure/structural-qualified-resolution.js +15 -0
  99. package/core/infrastructure/structural-source-definitions.js +100 -0
  100. package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
  101. package/core/infrastructure/tree-sitter-provider.js +811 -37
  102. package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
  103. package/core/query/query-router.js +55 -5
  104. package/core/ranking/file-kind-ranking.js +2192 -15
  105. package/core/ranking/late-interaction-index.js +87 -12
  106. package/core/search/cli-decoration.js +290 -0
  107. package/core/search/context-expander.js +988 -78
  108. package/core/search/index.js +1 -0
  109. package/core/search/output-policy.js +275 -0
  110. package/core/search/search-anchor.js +499 -0
  111. package/core/search/search-boost.js +93 -1
  112. package/core/search/search-cli.js +61 -204
  113. package/core/search/search-hybrid.js +250 -10
  114. package/core/search/search-pattern-chunks.js +57 -8
  115. package/core/search/search-pattern-planner.js +68 -9
  116. package/core/search/search-pattern-prefilter.js +30 -10
  117. package/core/search/search-pattern-ripgrep.js +40 -4
  118. package/core/search/search-pattern-sparse-overlay.js +256 -0
  119. package/core/search/search-pattern.js +117 -29
  120. package/core/search/search-postprocess.js +479 -5
  121. package/core/search/search-read-semantic.js +260 -23
  122. package/core/search/search-read.js +82 -64
  123. package/core/search/search-reader-pin.js +71 -0
  124. package/core/search/search-rrf.js +279 -0
  125. package/core/search/search-semantic.js +110 -5
  126. package/core/search/search-server.js +130 -57
  127. package/core/search/search-trace.js +107 -0
  128. package/core/search/server-identity.js +93 -0
  129. package/core/search/session-daemon-prewarm.mjs +33 -10
  130. package/core/search/sweet-search.js +399 -7
  131. package/core/skills/sweet-index/SKILL.md +8 -6
  132. package/core/vector-store/binary-hnsw-index.js +194 -30
  133. package/core/vector-store/float-vector-store.js +96 -6
  134. package/core/vector-store/hnsw-index.js +220 -49
  135. package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
  136. package/eval/agent-read-workflows/bin/ss-find +15 -0
  137. package/eval/agent-read-workflows/bin/ss-grep +12 -0
  138. package/eval/agent-read-workflows/bin/ss-read +14 -0
  139. package/eval/agent-read-workflows/bin/ss-search +18 -0
  140. package/eval/agent-read-workflows/bin/ss-semantic +12 -0
  141. package/eval/agent-read-workflows/bin/ss-trace +11 -0
  142. package/mcp/read-tool.js +109 -0
  143. package/mcp/server.js +55 -15
  144. package/mcp/tool-handlers.js +14 -124
  145. package/mcp/trace-tool.js +81 -0
  146. package/package.json +25 -10
  147. package/scripts/hooks/intercept-read.mjs +55 -0
  148. package/scripts/hooks/remind-tools.mjs +40 -0
  149. package/scripts/init.js +698 -54
  150. package/scripts/inject-agent-instructions.js +431 -0
  151. package/scripts/install-prompt-reminders.js +188 -0
  152. package/scripts/install-tool-enforcement.js +220 -0
  153. package/scripts/smoke-test.js +12 -9
  154. package/scripts/uninstall.js +276 -18
  155. package/scripts/write-claude-rules.js +110 -0
@@ -12,9 +12,20 @@
12
12
  */
13
13
 
14
14
  // Grammar mapping: language ID -> grammar WASM file stem
15
+ //
16
+ // `tsx` uses tree-sitter-tsx (not tree-sitter-typescript) so that JSX inside
17
+ // .tsx bodies parses without producing ERROR nodes. Empirically (May 2026),
18
+ // routing .tsx to tree-sitter-typescript caused `export function Component(...)
19
+ // { return <Foo/> }` to silently miss the function-name capture, even though
20
+ // the tag query rule matched the AST shape — the JSX body created sibling
21
+ // ERROR nodes that broke capture resolution.
22
+ //
23
+ // tree-sitter-javascript already supports JSX natively, so .jsx files don't
24
+ // need a separate grammar.
15
25
  const GRAMMAR_MAP = {
16
26
  javascript: 'tree-sitter-javascript',
17
27
  typescript: 'tree-sitter-typescript',
28
+ tsx: 'tree-sitter-tsx',
18
29
  python: 'tree-sitter-python',
19
30
  go: 'tree-sitter-go',
20
31
  rust: 'tree-sitter-rust',
@@ -25,6 +36,16 @@ const GRAMMAR_MAP = {
25
36
  php: 'tree-sitter-php',
26
37
  kotlin: 'tree-sitter-kotlin',
27
38
  swift: 'tree-sitter-swift',
39
+ // tree-sitter-c-sharp ships in node_modules/tree-sitter-wasms/out/ but was
40
+ // previously unwired — C# fell through to the regex chunker in
41
+ // parseBraceBasedFile. That path missed every modern-C# idiom whose
42
+ // declaration line doesn't fit the rigid regex shape: `unsafe` modifier
43
+ // ordering, positional `record`, tuple-typed generic returns (e.g.
44
+ // `IAsyncEnumerable<(byte[] e, int len, …)>`), expression-bodied methods,
45
+ // file-scoped namespaces, indexers, operators, local functions, nested
46
+ // classes. Wiring tree-sitter-c-sharp puts C# on the same code path as
47
+ // the other 13 languages (cAST sibling-merge over a proper AST).
48
+ csharp: 'tree-sitter-c_sharp',
28
49
  };
29
50
 
30
51
  // Identifier node types — used to detect leaf-ident captures in extractSymbols()
@@ -49,13 +70,36 @@ const BOUNDARY_TYPES = new Set([
49
70
  'interface_declaration', 'type_alias_declaration', 'enum_declaration',
50
71
  // Structs/Traits (Rust/Go)
51
72
  'struct_item', 'impl_item', 'trait_item', 'type_declaration',
73
+ // Rust macros (macro_rules!)
74
+ 'macro_definition',
52
75
  // Modules
53
76
  'module', 'namespace_declaration',
54
77
  // Python
55
78
  'decorated_definition',
56
79
  // Java
57
80
  'record_declaration', 'constructor_declaration',
58
- // Ruby
81
+ // Java annotation types (`@interface Foo { ... }`). Without this, files
82
+ // that contain only an annotation declaration (gson SerializedName.java,
83
+ // Since.java, Until.java) produce no chunk anchor — the chunker emits
84
+ // a generic 'code' chunk and downstream search-time enrichment via
85
+ // findFirstEntityInRange then attaches whatever entity happens to start
86
+ // in the chunk's line range (which, when extractJava also ran with no
87
+ // block-comment skip, was a phantom `class MyClass` from inside the
88
+ // Javadoc <pre> example). Anchoring on the annotation declaration
89
+ // gives the @interface a proper name/type at index time.
90
+ 'annotation_type_declaration',
91
+ // Ruby — tree-sitter-ruby uses bare node names `class`, `method`,
92
+ // `singleton_method`, `singleton_class` (no `_declaration`/`_definition`
93
+ // suffix). Without these in the boundary set the cAST chunker:
94
+ // 1. never anchors a chunk on a Ruby class declaration (so `class Base`,
95
+ // `class IndifferentHash`, etc. produce only anonymous `code` chunks);
96
+ // 2. merges 8+ adjacent methods into one chunk and labels it after
97
+ // whichever singleton_method happened to be present in the merge;
98
+ // 3. drops `class << self` (the Sinatra DSL idiom) entirely.
99
+ // tree-sitter-ruby grammar reference: github.com/tree-sitter/tree-sitter-ruby
100
+ // (node types `class`, `method`, `singleton_class`). Aider's published
101
+ // tags.scm for Ruby uses the same node names.
102
+ 'class', 'method', 'singleton_class',
59
103
  'singleton_method',
60
104
  // PHP
61
105
  'trait_declaration',
@@ -67,8 +111,80 @@ const BOUNDARY_TYPES = new Set([
67
111
  'struct_specifier', 'enum_specifier', 'type_definition',
68
112
  // C++
69
113
  'class_specifier', 'namespace_definition',
114
+ // C++ `using X = ...` type aliases + `template<...> class|struct|fn|using` wrappers.
115
+ // Without these the chunker emitted templated decls as anonymous `code` chunks
116
+ // since the cAST sibling-merge path treated them as non-boundary. _resolveBoundary
117
+ // (below) drills into template_declaration to surface the inner class/struct/fn/alias
118
+ // name so the chunk metadata names and types the correct thing.
119
+ 'alias_declaration', 'template_declaration',
70
120
  ]);
71
121
 
122
+ // Per-language EXTRA boundary types. These are unioned with BOUNDARY_TYPES
123
+ // only when chunking a file in the matching language — so other languages'
124
+ // chunking behaviour stays byte-identical to before the addition. Used to
125
+ // keep grammar-specific node names out of the global set when those names
126
+ // could overlap with another grammar's nodes that have different chunking
127
+ // semantics. The threading happens in parseFileToChunks() which computes
128
+ // `effectiveBoundaryTypes = BOUNDARY_TYPES ∪ LANG_EXTRA_BOUNDARY_TYPES[lang]`
129
+ // once per parse and passes it through recursiveChunk + _extractSignature.
130
+ //
131
+ // C# additions (tree-sitter-c-sharp emits these for first-class declarations,
132
+ // verified empirically with scripts/_csharp_grammar_probe.mjs against Garnet):
133
+ // - struct_declaration, record_struct_declaration: C# struct / record struct
134
+ // - property_declaration: anchors per-property chunks so `RespCommandDocs.Command`
135
+ // style queries (CS-004) have a property-scoped chunk to land on; cAST
136
+ // sibling-merge still bundles small auto-properties into 2000-char buffers
137
+ // named after the first property + additional_symbols listing the rest.
138
+ // - delegate_declaration: `public delegate T Foo(...);` becomes a chunk anchor.
139
+ // - destructor_declaration, indexer_declaration, operator_declaration,
140
+ // conversion_operator_declaration: first-class declarations per C# spec.
141
+ // - file_scoped_namespace_declaration: C# 10+ `namespace Foo;` shape.
142
+ // - local_function_statement: nested function declarations inside methods.
143
+ // - event_declaration, event_field_declaration: events behave as
144
+ // property/field-shaped entities at search time.
145
+ // All these node names are C#-specific in our 14-language matrix EXCEPT
146
+ // `struct_declaration` (also Swift) and `property_declaration` (also Swift),
147
+ // which is exactly why they live here instead of in the global set.
148
+ const LANG_EXTRA_BOUNDARY_TYPES = {
149
+ csharp: new Set([
150
+ 'struct_declaration', 'record_struct_declaration',
151
+ 'delegate_declaration', 'destructor_declaration',
152
+ 'property_declaration',
153
+ 'indexer_declaration', 'operator_declaration',
154
+ 'conversion_operator_declaration',
155
+ 'file_scoped_namespace_declaration',
156
+ 'local_function_statement',
157
+ 'event_declaration', 'event_field_declaration',
158
+ ]),
159
+ };
160
+
161
+ // Per-language EXCLUSIONS from BOUNDARY_TYPES. Removes node-type names that
162
+ // the global set legitimately includes for one grammar but that collide
163
+ // with anonymous-keyword leaves in another grammar — producing phantom
164
+ // chunks during the cAST oversized-recursion path.
165
+ //
166
+ // Concrete trigger: tree-sitter-ruby uses bare `class` / `method` /
167
+ // `singleton_class` / `singleton_method` as the *node type names* of
168
+ // declarations (no `_declaration`/`_definition` suffix — see Ruby comment
169
+ // in BOUNDARY_TYPES). Those four strings are correctly in BOUNDARY_TYPES.
170
+ //
171
+ // But tree-sitter-c-sharp (and tree-sitter-java, tree-sitter-kotlin, etc.)
172
+ // emits an *anonymous keyword leaf* with type-string `"class"` as a child
173
+ // of `class_declaration`. When the chunker recurses into an oversized
174
+ // C# class and flushes the pre-body buffer (modifiers + class keyword +
175
+ // identifier + base_list), that `class` keyword leaf is misidentified as
176
+ // a boundary, producing a phantom `[class/null]` chunk with content
177
+ // `internal\nsealed\nclass\nRespServerSession\n: ServerSessionBase`.
178
+ //
179
+ // Java/Kotlin have the same latent bug (verified empirically on gson's
180
+ // TypeAdapters.java — emits a tiny `[class/null]` size=31 chunk at the
181
+ // class declaration line). The fix is intentionally scoped to csharp
182
+ // only so this PR doesn't change Java/Kotlin chunk output at all
183
+ // (their existing phantom chunks are tiny and don't affect retrieval).
184
+ const LANG_BOUNDARY_TYPE_EXCLUDES = {
185
+ csharp: new Set(['class']),
186
+ };
187
+
72
188
  // AST node types that represent function/class bodies. Used by
73
189
  // extractSignature() to find where the declaration's body starts so
74
190
  // the signature span is everything before it (decorators + name +
@@ -109,13 +225,24 @@ const NODE_TYPE_MAP = {
109
225
  'impl_item': 'impl',
110
226
  'trait_item': 'trait',
111
227
  'type_declaration': 'struct',
228
+ 'macro_definition': 'macro',
112
229
  'module': 'module',
113
230
  'namespace_declaration': 'namespace',
114
231
  'decorated_definition': 'decorator',
115
232
  // Java
116
233
  'record_declaration': 'record',
117
234
  'constructor_declaration': 'method',
118
- // Ruby
235
+ // @interface Foo { ... } — chunk labelled as 'interface' to match the
236
+ // existing extractJava regex behaviour (and the gold-probe convention
237
+ // that annotation types are interfaces). Note: a Java annotation is
238
+ // formally an *interface* per JLS §9.6, just a specialised form.
239
+ 'annotation_type_declaration': 'interface',
240
+ // Ruby — `class` is the bare tree-sitter-ruby node name for class
241
+ // declarations (Java/JS use `class_declaration`, Python `class_definition`,
242
+ // C++ `class_specifier`). `singleton_class` is `class << self` (or
243
+ // `class << SomeConst`) which opens the receiver's singleton scope.
244
+ 'class': 'class',
245
+ 'singleton_class': 'class',
119
246
  'method': 'method',
120
247
  'singleton_method': 'method',
121
248
  // PHP
@@ -133,10 +260,43 @@ const NODE_TYPE_MAP = {
133
260
  // C++
134
261
  'class_specifier': 'class',
135
262
  'namespace_definition': 'namespace',
263
+ 'alias_declaration': 'typeAlias',
264
+ // template_declaration intentionally absent — resolved to the inner
265
+ // type (class/struct/function/typeAlias) by _resolveBoundary at lookup time.
266
+ // C# — fires only on nodes that the chunker treats as boundaries; for
267
+ // non-C# languages those nodes are NOT in the effective boundary set
268
+ // (see LANG_EXTRA_BOUNDARY_TYPES), so _resolveBoundary is not invoked
269
+ // on them during normal sibling-merge. The leaf-too-big pathological
270
+ // branch is the only place these could be consulted for another
271
+ // grammar (e.g. an oversized Swift `property_declaration` with no
272
+ // children) — the resulting type label is a strict improvement over
273
+ // the previous 'code' fallback in that case.
274
+ 'struct_declaration': 'struct',
275
+ 'record_struct_declaration': 'record',
276
+ 'delegate_declaration': 'function',
277
+ 'destructor_declaration': 'method',
278
+ 'property_declaration': 'property',
279
+ 'indexer_declaration': 'method',
280
+ 'operator_declaration': 'method',
281
+ 'conversion_operator_declaration': 'method',
282
+ 'file_scoped_namespace_declaration': 'namespace',
283
+ 'local_function_statement': 'function',
284
+ 'event_declaration': 'property',
285
+ 'event_field_declaration': 'field',
136
286
  };
137
287
 
138
288
  // Standard tags.scm query patterns for symbol extraction
139
289
  // These are s-expression patterns matching tree-sitter node types
290
+ //
291
+ // Naming conventions for new captures (May 2026):
292
+ // @component.definition — `export const X = call(...)` (HOC-wrapped values
293
+ // like memo/forwardRef/createSlice). Higher priority than @variable, so
294
+ // when both fire on the same declarator, component wins via dedup-by-name+line
295
+ // in graph-extractor._normalizeTreeSitterEntities.
296
+ // @variable.definition — any other `export const X = expr` (literals, objects,
297
+ // typed configs). Scoped to export_statement on purpose: we don't want to
298
+ // extract every internal `const x = 1` inside a function body. Tree-sitter
299
+ // emits @arrowFunction in priority over @variable when value is an arrow.
140
300
  const TAGS_QUERIES = {
141
301
  javascript: `
142
302
  (function_declaration name: (identifier) @function.definition)
@@ -149,6 +309,27 @@ const TAGS_QUERIES = {
149
309
  (export_statement (function_declaration name: (identifier) @function.definition))
150
310
  (export_statement
151
311
  declaration: (class_declaration name: (identifier) @class.definition))
312
+ (export_statement
313
+ declaration: (lexical_declaration
314
+ (variable_declarator
315
+ name: (identifier) @component.definition
316
+ value: (call_expression))))
317
+ (export_statement
318
+ declaration: (lexical_declaration
319
+ (variable_declarator
320
+ name: (identifier) @variable.definition)))
321
+ ; Top-level (non-exported) file-scope const declarations.
322
+ ; HISTORY (2026-05-10): a prior version captured ALL top-level lexical
323
+ ; declarations as @variable.definition / @component.definition. That
324
+ ; over-extracted trivial consts (\`const VERSION = '5.8.4'\`,
325
+ ; \`const X = require('...')\`) which then dominated NL retrieval rankings
326
+ ; over real function/method definitions (regressed 5 fastify probes vs
327
+ ; post-perf-60 baseline). Restored to scoping @variable.definition to
328
+ ; export_statement only, matching the original intent in
329
+ ; graph-extractor.js:_normalizeTreeSitterEntities (line 1320 comment).
330
+ ; If structural-mode resolution of CJS top-level consts is needed,
331
+ ; add narrowly-scoped captures (e.g. value: [(array) (object) (new_expression)])
332
+ ; rather than re-introducing unrestricted (program ...) captures.
152
333
  (pair
153
334
  key: (property_identifier) @method.definition
154
335
  value: (function_expression))
@@ -172,6 +353,66 @@ const TAGS_QUERIES = {
172
353
  declaration: (class_declaration name: (type_identifier) @class.definition))
173
354
  (export_statement
174
355
  declaration: (abstract_class_declaration name: (type_identifier) @class.definition))
356
+ (export_statement
357
+ declaration: (lexical_declaration
358
+ (variable_declarator
359
+ name: (identifier) @component.definition
360
+ value: (call_expression))))
361
+ (export_statement
362
+ declaration: (lexical_declaration
363
+ (variable_declarator
364
+ name: (identifier) @variable.definition)))
365
+ ; Top-level non-exported consts intentionally NOT captured — see javascript
366
+ ; query above for rationale (regressed fastify probes via const VERSION etc.).
367
+ (pair
368
+ key: (property_identifier) @method.definition
369
+ value: (function_expression))
370
+ (pair
371
+ key: (property_identifier) @arrow.definition
372
+ value: (arrow_function))
373
+ (module name: (identifier) @namespace.definition)
374
+ (internal_module name: (identifier) @namespace.definition)
375
+ `,
376
+ // tsx grammar is a superset of typescript that also parses JSX. Tag query
377
+ // matches typescript verbatim — JSX expressions inside function bodies don't
378
+ // need their own captures (the surrounding function/component declaration is
379
+ // what we care about). We MUST keep these in sync if typescript adds new rules.
380
+ tsx: `
381
+ (function_declaration name: (identifier) @function.definition)
382
+ (generator_function_declaration name: (identifier) @function.definition)
383
+ (class_declaration name: (type_identifier) @class.definition)
384
+ (abstract_class_declaration name: (type_identifier) @class.definition)
385
+ (method_definition name: (property_identifier) @method.definition)
386
+ (interface_declaration name: (type_identifier) @interface.definition)
387
+ (type_alias_declaration name: (type_identifier) @type.definition)
388
+ (enum_declaration name: (identifier) @enum.definition)
389
+ (variable_declarator
390
+ name: (identifier) @arrow.definition
391
+ value: (arrow_function))
392
+ (export_statement
393
+ declaration: (class_declaration name: (type_identifier) @class.definition))
394
+ (export_statement
395
+ declaration: (abstract_class_declaration name: (type_identifier) @class.definition))
396
+ (export_statement
397
+ declaration: (lexical_declaration
398
+ (variable_declarator
399
+ name: (identifier) @component.definition
400
+ value: (call_expression))))
401
+ (export_statement
402
+ declaration: (lexical_declaration
403
+ (variable_declarator
404
+ name: (identifier) @variable.definition)))
405
+ ; NOTE(review): tsx still captures top-level (non-exported) consts, which the
406
+ ; javascript/typescript queries deliberately dropped — confirm this is intended.
407
+ (program
408
+ (lexical_declaration
409
+ (variable_declarator
410
+ name: (identifier) @component.definition
411
+ value: (call_expression))))
412
+ (program
413
+ (lexical_declaration
414
+ (variable_declarator
415
+ name: (identifier) @variable.definition)))
175
416
  (pair
176
417
  key: (property_identifier) @method.definition
177
418
  value: (function_expression))
@@ -197,20 +438,27 @@ const TAGS_QUERIES = {
197
438
  (impl_item type: (type_identifier) @impl.definition)
198
439
  (trait_item name: (type_identifier) @trait.definition)
199
440
  (enum_item name: (type_identifier) @enum.definition)
441
+ (macro_definition name: (identifier) @macro.definition)
200
442
  `,
201
443
  java: `
202
444
  (class_declaration name: (identifier) @class.definition)
203
445
  (interface_declaration name: (identifier) @interface.definition)
446
+ (annotation_type_declaration name: (identifier) @interface.definition)
204
447
  (enum_declaration name: (identifier) @enum.definition)
205
448
  (record_declaration name: (identifier) @record.definition)
206
449
  (method_declaration name: (identifier) @method.definition)
207
450
  (constructor_declaration name: (identifier) @method.definition)
451
+ (annotation_type_element_declaration name: (identifier) @method.definition)
452
+ (enum_constant name: (identifier) @enum_constant.definition)
453
+ (field_declaration declarator: (variable_declarator name: (identifier) @field.definition))
208
454
  `,
209
455
  ruby: `
210
456
  (class name: (constant) @class.definition)
457
+ (singleton_class value: (constant) @class.definition)
211
458
  (module name: (constant) @module.definition)
212
459
  (method name: (identifier) @method.definition)
213
460
  (singleton_method name: (identifier) @method.definition)
461
+ (alias name: (identifier) @method.definition)
214
462
  `,
215
463
  php: `
216
464
  (class_declaration name: (name) @class.definition)
@@ -251,9 +499,102 @@ const TAGS_QUERIES = {
251
499
  (struct_specifier name: (type_identifier) @struct.definition)
252
500
  (enum_specifier name: (type_identifier) @enum.definition)
253
501
  (namespace_definition name: (namespace_identifier) @namespace.definition)
502
+ (alias_declaration name: (type_identifier) @type.definition)
503
+ `,
504
+ // C# — tree-sitter-c-sharp uses bare `identifier` (not type_identifier)
505
+ // for all names, and exposes `name:` fields on every first-class
506
+ // declaration. Probed against Garnet's 30+ partial-class shards in
507
+ // scripts/_csharp_grammar_probe.mjs: every boundary type below emits
508
+ // a parseable name field. Indexer/operator declarations have no
509
+ // user-facing name (the `this[…]` / `operator+` is the identity),
510
+ // so they're captured at node level via @method.definition.
511
+ // Namespaces use `qualified_name` for dotted forms (`Garnet.server`)
512
+ // and `identifier` for single-segment forms; we capture both shapes.
513
+ csharp: `
514
+ (class_declaration name: (identifier) @class.definition)
515
+ (interface_declaration name: (identifier) @interface.definition)
516
+ (struct_declaration name: (identifier) @struct.definition)
517
+ (record_declaration name: (identifier) @record.definition)
518
+ (record_struct_declaration name: (identifier) @record.definition)
519
+ (enum_declaration name: (identifier) @enum.definition)
520
+ (delegate_declaration name: (identifier) @function.definition)
521
+ (namespace_declaration name: (identifier) @namespace.definition)
522
+ (namespace_declaration name: (qualified_name) @namespace.definition)
523
+ (file_scoped_namespace_declaration name: (identifier) @namespace.definition)
524
+ (file_scoped_namespace_declaration name: (qualified_name) @namespace.definition)
525
+ (method_declaration name: (identifier) @method.definition)
526
+ (constructor_declaration name: (identifier) @method.definition)
527
+ (destructor_declaration name: (identifier) @method.definition)
528
+ (property_declaration name: (identifier) @property.definition)
529
+ (indexer_declaration) @method.definition
530
+ (operator_declaration) @method.definition
531
+ (conversion_operator_declaration) @method.definition
532
+ (event_declaration name: (identifier) @property.definition)
533
+ (event_field_declaration (variable_declaration (variable_declarator name: (identifier) @field.definition)))
534
+ (field_declaration (variable_declaration (variable_declarator name: (identifier) @field.definition)))
535
+ (local_function_statement name: (identifier) @function.definition)
254
536
  `,
255
537
  };
256
538
 
539
+ // Names that tree-sitter-c / tree-sitter-cpp sometimes emit as the `name:`
540
+ // field of a struct_specifier / class_specifier / function_declarator when
541
+ // the parser misidentifies a C/C++ keyword as a user identifier. Common
542
+ // cause: a header-only C++ library has its .h file routed to C (because
543
+ // .h → c in EXTENSION_MAP), and tree-sitter-c then encounters C++ keywords
544
+ // (`alignas`, `namespace`, `decltype`, `enum class`) it does not recognize.
545
+ //
546
+ // Examples:
547
+ // `struct alignas(16) uint128_t { ... }` tree-sitter-c → name=alignas
548
+ // `enum class Color { RED };` tree-sitter-c → name=class (under struct_specifier)
549
+ // `using Vec = decltype(Zero(D()));` tree-sitter-c → name=decltype (under function)
550
+ // `namespace hwy::x86 { ... }` tree-sitter-c → name=namespace (under function)
551
+ // `if (cond) { body }` tree-sitter-c → name=if (under function, on misparse)
552
+ //
553
+ // Most entries are reserved keywords or compiler extensions that cannot name
554
+ // a user-defined entity. Caveat: `final`/`override` are contextual in C++, and
555
+ // C++-only words (`class`, `template`) are legal C identifiers, so filtering can rarely drop a real name.
556
+ //
557
+ // Closed list. Scoped to languageId ∈ {c, cpp} via C_FAMILY_LANGUAGES so a
558
+ // Go/Python/JS file with a class literally named `final` is not affected.
559
+ // Evidence gathered from highway @ 3c72230 cpp probe corpus:
560
+ // decltype: 667 captures, alignas: 1, namespace: phantom on CPP-008,
561
+ // class (under enum): 10, if: 11. Audit query in commit message.
562
+ const C_FAMILY_ATTRIBUTE_PHANTOM_NAMES = new Set([
563
+ // Attribute / specifier keywords
564
+ 'alignas', // C++11 keyword
565
+ '_Alignas', // C11 keyword
566
+ '__attribute__', // GCC extension
567
+ '__declspec', // MSVC extension
568
+ '__inline__', // GCC extension
569
+ '__forceinline', // MSVC extension
570
+ 'final', // C++11 contextual keyword
571
+ 'override', // C++11 contextual keyword
572
+ // Type-deduction operators
573
+ 'decltype', // C++11
574
+ 'typeof', // C2x / GCC extension
575
+ '__typeof', // GCC extension
576
+ '__typeof__', // GCC extension
577
+ // Structural keywords
578
+ 'class', // C++ — miscaptured from `enum class` in .h→c misparse
579
+ 'struct', // C/C++
580
+ 'union', // C/C++
581
+ 'enum', // C/C++
582
+ 'namespace', // C++
583
+ 'typedef', // C/C++
584
+ 'template', // C++
585
+ // Control-flow keywords (miscaptured as functions on parse errors)
586
+ 'if', // C/C++
587
+ 'for', // C/C++
588
+ 'while', // C/C++
589
+ 'switch', // C/C++
590
+ 'do', // C/C++
591
+ ]);
592
+
593
+ // Languages where C_FAMILY_ATTRIBUTE_PHANTOM_NAMES should be filtered.
594
+ // Scoped narrowly to C/C++ so a Go/Python/JS file containing a type
595
+ // literally named `final` is not affected.
596
+ const C_FAMILY_LANGUAGES = new Set(['c', 'cpp']);
597
+
257
598
  // Map capture names from tags.scm queries to entity types
258
599
  const CAPTURE_TO_ENTITY_TYPE = {
259
600
  'function.definition': 'function',
@@ -272,6 +613,30 @@ const CAPTURE_TO_ENTITY_TYPE = {
272
613
  'record.definition': 'record',
273
614
  'module.definition': 'module',
274
615
  'object.definition': 'class',
616
+ // JS/TS exported const declarations (May 2026):
617
+ // @component fires only when value is a call_expression (memo/forwardRef/createSlice etc.);
618
+ // @variable fires for any export const, including string/object/typed literals.
619
+ // Both can match the same node; component wins via priority dedup downstream.
620
+ 'component.definition': 'component',
621
+ 'variable.definition': 'variable',
622
+ 'macro.definition': 'macro',
623
+ // C# property declarations — `public RespCommand Command { get; init; }`.
624
+ // Used by the csharp TAGS_QUERIES entry to give init-only properties /
625
+ // computed properties / event-as-property declarations their own graph
626
+ // entity. Currently no other tree-sitter grammar in this codebase emits
627
+ // a @property.definition capture, so 'property' is a C#-private entity
628
+ // type at the moment (Swift's property_declaration node is not captured
629
+ // by tags.scm and won't reach this map).
630
+ 'property.definition': 'property',
631
+ // Java enum constants (FieldNamingPolicy.UPPER_CAMEL_CASE) and field
632
+ // declarations (TypeAdapters.BIT_SET — a static final field whose
633
+ // initializer is an anonymous `new TypeAdapter<BitSet>() { ... }`).
634
+ // Both are first-class declarations per JLS but tree-sitter-java
635
+ // exposes them under distinct node types (enum_constant,
636
+ // field_declaration > variable_declarator) that our query previously
637
+ // ignored, so neither got a proper symbol anchor in the graph.
638
+ 'enum_constant.definition': 'enum_constant',
639
+ 'field.definition': 'field',
275
640
  };
276
641
 
277
642
  export class TreeSitterProvider {
@@ -381,7 +746,7 @@ export class TreeSitterProvider {
381
746
  const seen = new Set(); // deduplicate by startIndex
382
747
  for (const capture of captures) {
383
748
  const { name: captureName, node } = capture;
384
- const entityType = CAPTURE_TO_ENTITY_TYPE[captureName];
749
+ let entityType = CAPTURE_TO_ENTITY_TYPE[captureName];
385
750
  if (!entityType) continue;
386
751
 
387
752
  // When queries capture an identifier (e.g. `name: (identifier) @x`),
@@ -390,6 +755,34 @@ export class TreeSitterProvider {
390
755
  const isLeafIdent = IDENT_TYPES.has(node.type);
391
756
  const extentNode = isLeafIdent && node.parent ? node.parent : node;
392
757
 
758
+ // Go's grammar collapses every `type X …` declaration into
759
+ // `type_declaration → type_spec` with a single @type.definition
760
+ // capture, which the table above maps to the catch-all 'typeAlias'.
761
+ // The downstream `type` field of a type_spec encodes whether X is a
762
+ // struct, an interface, or a true type alias / slice / func type.
763
+ // Drill in and emit the more specific entity type so symbol-type
764
+ // filtering, file-kind boosts and probe gold checks (GO-005 / GO-007
765
+ // / GO-008 expect 'interface'/'struct', not 'typeAlias') work as
766
+ // intended. Pure precision refinement: same node extent, same
767
+ // symbol name, only the label changes. Other languages have no
768
+ // `type_spec` node, so this branch is structurally Go-only.
769
+ if (
770
+ languageId === 'go' &&
771
+ entityType === 'typeAlias' &&
772
+ extentNode.type === 'type_spec'
773
+ ) {
774
+ const typeField = extentNode.childForFieldName?.('type');
775
+ if (typeField) {
776
+ if (typeField.type === 'struct_type') entityType = 'struct';
777
+ else if (typeField.type === 'interface_type') entityType = 'interface';
778
+ // All other type-spec rhs shapes (slice/array/map/channel/
779
+ // function/pointer/qualified/identifier/generic/parenthesized)
780
+ // remain 'typeAlias', which is the correct semantic label for
781
+ // `type Middlewares []func(...)`, `type Handler = http.Handler`,
782
+ // etc.
783
+ }
784
+ }
785
+
393
786
  // Deduplicate: multiple captures can match the same declaration
394
787
  const key = `${extentNode.startIndex}:${entityType}`;
395
788
  if (seen.has(key)) continue;
@@ -411,6 +804,16 @@ export class TreeSitterProvider {
411
804
  || this._extractNodeName(node)
412
805
  || `<anonymous:${entityType}>`);
413
806
 
807
+ // Filter C/C++ phantom captures where the parser bound the `name:`
808
+ // field to a C/C++ keyword (`alignas`, `__attribute__`, etc.) instead
809
+ // of the actual type name. See C_FAMILY_ATTRIBUTE_PHANTOM_NAMES above.
810
+ if (
811
+ C_FAMILY_LANGUAGES.has(languageId) &&
812
+ C_FAMILY_ATTRIBUTE_PHANTOM_NAMES.has(symbolName)
813
+ ) {
814
+ continue;
815
+ }
816
+
414
817
  symbols.push({
415
818
  name: symbolName,
416
819
  type: entityType,
@@ -451,10 +854,40 @@ export class TreeSitterProvider {
451
854
  const maxChunkSize = (options.maxChunkSize || 2000) - headerOverhead;
452
855
  this._chunkCounter = 0;
453
856
 
857
+ // Per-parse effective boundary set: BOUNDARY_TYPES ∪ language extras
858
+ // \ language excludes. For every language without an extras or excludes
859
+ // entry (= all 13 pre-2026-05-12 languages), this is byte-identical to
860
+ // BOUNDARY_TYPES (same Set reference), so non-C# parsing semantics are
861
+ // unchanged.
862
+ const langExtra = LANG_EXTRA_BOUNDARY_TYPES[languageId];
863
+ const langExcludes = LANG_BOUNDARY_TYPE_EXCLUDES[languageId];
864
+ let boundaryTypes;
865
+ if (!langExtra && !langExcludes) {
866
+ boundaryTypes = BOUNDARY_TYPES;
867
+ } else {
868
+ boundaryTypes = new Set(BOUNDARY_TYPES);
869
+ if (langExtra) for (const t of langExtra) boundaryTypes.add(t);
870
+ if (langExcludes) for (const t of langExcludes) boundaryTypes.delete(t);
871
+ }
872
+
454
873
  const children = this._getChildren(tree.rootNode);
455
- const chunks = this.recursiveChunk(children, content, maxChunkSize, null);
874
+ const chunks = this.recursiveChunk(children, content, maxChunkSize, null, boundaryTypes);
456
875
 
457
876
  tree.delete(); // free WASM memory
877
+
878
+ // Filter phantom C/C++ attribute names — null out chunk.name when the
879
+ // parser bound it to a C/C++ keyword. The chunk itself stays (the code
880
+ // is real); only the symbol label is corrected. Downstream anomalous-
881
+ // chunk demotion will treat any small-span resulting anonymous chunk
882
+ // appropriately.
883
+ if (C_FAMILY_LANGUAGES.has(languageId) && chunks) {
884
+ for (const chunk of chunks) {
885
+ if (chunk?.name && C_FAMILY_ATTRIBUTE_PHANTOM_NAMES.has(chunk.name)) {
886
+ chunk.name = null;
887
+ }
888
+ }
889
+ }
890
+
458
891
  return chunks.length > 0 ? chunks : null;
459
892
  }
460
893
 
@@ -483,13 +916,33 @@ export class TreeSitterProvider {
483
916
  * @param {string} content - Full file content
484
917
  * @param {number} maxSize - Maximum chunk size in characters
485
918
  * @param {object|null} parentInfo - Parent chunk info for hierarchical linking
919
+ * @param {Set<string>} [boundaryTypes] - Effective boundary set (per-parse,
920
+ * BOUNDARY_TYPES ∪ LANG_EXTRA_BOUNDARY_TYPES[lang] \ LANG_BOUNDARY_TYPE_EXCLUDES[lang]). When omitted, falls
921
+ * back to BOUNDARY_TYPES — preserves the pre-2026-05-12 call signature
922
+ * for any internal caller that constructs this provider directly.
486
923
  * @returns {Array} List of chunk objects
487
924
  */
488
- recursiveChunk(nodes, content, maxSize, parentInfo) {
925
+ recursiveChunk(nodes, content, maxSize, parentInfo, boundaryTypes = BOUNDARY_TYPES) {
489
926
  const chunks = [];
490
927
  let buffer = [];
491
928
  let bufferSize = 0;
492
929
 
930
+ // SMALL_TAIL_THRESHOLD: chunks below this character count are
931
+ // considered "orphan tails" — they tend to be `module.exports`,
932
+ // closing braces, trailing const declarations, etc. that cAST's
933
+ // sibling-merge couldn't fit into the previous buffer when it
934
+ // overflowed maxSize. Merging them into the preceding emitted
935
+ // chunk (when it shares the same parent context and won't push
936
+ // past 1.25× maxSize) gives the agent a coherent unit instead
937
+ // of a 2-line dangling chunk that wins retrieval on its own.
938
+ //
939
+ // Verified canary: lib/schema-controller.js was emitting
940
+ // [148-161 setupSerializer] followed by [163-164 module.exports]
941
+ // as two separate chunks — the orphan tail won S2-Q3 retrieval.
942
+ // After merge, the tail joins setupSerializer.
943
+ const SMALL_TAIL_THRESHOLD = 100;
944
+ const TAIL_MERGE_HEADROOM = 1.25;
945
+
493
946
  const flushBuffer = () => {
494
947
  if (buffer.length === 0) return;
495
948
  const text = buffer
@@ -497,23 +950,190 @@ export class TreeSitterProvider {
497
950
  .join('\n');
498
951
 
499
952
  if (text.trim().length > 30) {
500
- const firstBoundary = buffer.find(n => BOUNDARY_TYPES.has(n.type));
501
- const name = firstBoundary ? this._extractNodeName(firstBoundary) : null;
502
- const type = firstBoundary ? (NODE_TYPE_MAP[firstBoundary.type] || 'code') : 'code';
503
- const signature = firstBoundary ? this._extractSignature(firstBoundary, content) : null;
504
-
505
- chunks.push({
506
- chunkId: this._nextChunkId(),
507
- parentChunkId: parentInfo?.chunkId || null,
508
- parentSymbol: parentInfo?.name || null,
509
- parentType: parentInfo?.type || null,
510
- text: text.trim(),
511
- startLine: buffer[0].startPosition.row,
512
- endLine: buffer[buffer.length - 1].endPosition.row,
513
- type,
514
- name: name || (buffer.length === 1 ? null : null),
515
- signature,
516
- });
953
+ const boundariesInBuffer = buffer.filter(n => boundaryTypes.has(n.type));
954
+
955
+ // SIBLING_DOC_SPLIT (RS-008 motivation, May 2026): at top level, when
956
+ // 2+ boundary-typed siblings each carry an immediately-preceding outer
957
+ // doc-comment, emit one chunk per boundary instead of merging them.
958
+ // cAST sibling-merge would otherwise collapse them into one chunk
959
+ // anchored solely on the first boundary's name (e.g. packaging.rs's
960
+ // `is_package` + `detect_package_root` collapse into one
961
+ // `# function: is_package` chunk). The bi-encoder then sees only the
962
+ // first symbol as primary, and the sibling's doc-comment gets
963
+ // averaged into the pooled embedding — a `# Additional:` header is
964
+ // too weak to recover the secondary symbol at production k=5.
965
+ //
966
+ // Section i = buffer[afterPrevBoundary .. boundary_i]. The first
967
+ // section absorbs all leading file-level material (module-level
968
+ // comments, use stmts) so it stays attached to the first boundary;
969
+ // the last section absorbs any trailing non-boundary nodes.
970
+ //
971
+ // Validation (May 2026, full §3 pipeline):
972
+ // - All 17 non-rust language packs: byte-identical to baseline
973
+ // - retrieval-probes 60: 46/4/10 identical
974
+ // - GCSN dev MRR@10: 86.92% exact
975
+ // - Rust AST-tester: 5/0/3 identical, zero PASS→FAIL flips
976
+ // - doc-positive / doc-negative rust: identical
977
+ // RS-008 did NOT flip — bottleneck is encoder-bound (resolver.rs's
978
+ // doc-string literally names `detect_package_root`, beating
979
+ // packaging.rs on TF/IR). Shipped anyway as a structurally-correct
980
+ // cAST refinement: more focused chunks for documented multi-fn
981
+ // top-level files (e.g. fs.rs now has 5 per-fn chunks instead of
982
+ // one merged chunk), with zero regression cost across all gates.
983
+ //
984
+ // Gating (conservative — undocumented helpers stay merged):
985
+ // 1. parentInfo == null: top-level only. Nested contexts (mod,
986
+ // impl, class bodies) keep cAST merge behaviour because their
987
+ // `# Parent:` header line already anchors siblings to the
988
+ // enclosing scope.
989
+ // 2. boundariesInBuffer.length >= 2: nothing to split if one.
990
+ // 3. every boundary has a leading outer-doc comment (`///` or
991
+ // `/**`). Mixed documented/undocumented buffers fall through
992
+ // to cAST merge to avoid inflating chunk counts unnecessarily.
993
+ // RUBY_CLASS_SIBLING_SPLIT (separate trigger, OR-ed with the gates above; applies at any nesting level): split when buffer has 2+ Ruby class/
994
+ // module siblings, each with an extractable name. cAST sibling-
995
+ // merge otherwise collapses adjacent tiny classes — e.g. sinatra
996
+ // base.rb's `class ExtendedRack` + `class CommonLogger` + `class
997
+ // Error` + ... — into one chunk labeled after the first boundary,
998
+ // and later entity adoption (file-kind-ranking.applyResultDemotions)
999
+ // walks UP via findEnclosingEntity over the merged range and
1000
+ // silently relabels the chunk to the outer module/namespace,
1001
+ // losing the IAR anchor. Splitting per-class restores 1:1 chunk-
1002
+ // to-entity alignment.
1003
+ //
1004
+ // Ruby-only gate via the tree-sitter-ruby-specific node type names
1005
+ // (`class`/`module`/`singleton_class`). Other grammars use
1006
+ // `class_declaration` (Java/JS/TS/Kotlin/C#), `class_definition`
1007
+ // (Python/Dart), `class_specifier` (C++), `struct_item` (Rust),
1008
+ // etc. — none of those node names exist in tree-sitter-ruby, and
1009
+ // `class`/`module`/`singleton_class` don't exist in any other
1010
+ // grammar. So this split is byte-identical-null-op for every non-
1011
+ // Ruby language pack and the 60-probe retrieval bench, while
1012
+ // fixing the chunker-bound regressions on Ruby AST probes RB-001
1013
+ // through RB-008.
1014
+ const RUBY_CLASS_LIKE_TYPES = new Set([
1015
+ 'class',
1016
+ 'module',
1017
+ 'singleton_class',
1018
+ ]);
1019
+ const isClassLikeSiblingSet = boundariesInBuffer.length >= 2
1020
+ && boundariesInBuffer.every(b => {
1021
+ if (!RUBY_CLASS_LIKE_TYPES.has(b.type)) return false;
1022
+ const resolved = this._resolveBoundary(b);
1023
+ return !!this._extractNodeName(resolved.nameNode);
1024
+ });
1025
+
1026
+ if (
1027
+ parentInfo == null
1028
+ && boundariesInBuffer.length >= 2
1029
+ && boundariesInBuffer.every(b => {
1030
+ const idx = buffer.indexOf(b);
1031
+ return idx > 0 && this._isLeadingDocComment(buffer[idx - 1], content);
1032
+ })
1033
+ || isClassLikeSiblingSet
1034
+ ) {
1035
+ let sectionStart = 0;
1036
+ for (let i = 0; i < boundariesInBuffer.length; i++) {
1037
+ const b = boundariesInBuffer[i];
1038
+ const bIdx = buffer.indexOf(b);
1039
+ // Last section absorbs trailing non-boundary nodes after `b`.
1040
+ const isLast = i === boundariesInBuffer.length - 1;
1041
+ const sectionEnd = isLast ? buffer.length - 1 : bIdx;
1042
+ const section = buffer.slice(sectionStart, sectionEnd + 1);
1043
+ const sectionText = section
1044
+ .map(n => content.substring(n.startIndex, n.endIndex))
1045
+ .join('\n');
1046
+ if (sectionText.trim().length > 30) {
1047
+ const resolved = this._resolveBoundary(b);
1048
+ chunks.push({
1049
+ chunkId: this._nextChunkId(),
1050
+ parentChunkId: parentInfo?.chunkId || null,
1051
+ parentSymbol: parentInfo?.name || null,
1052
+ parentType: parentInfo?.type || null,
1053
+ text: sectionText.trim(),
1054
+ startLine: section[0].startPosition.row,
1055
+ endLine: section[section.length - 1].endPosition.row,
1056
+ type: resolved.type,
1057
+ name: this._extractNodeName(resolved.nameNode),
1058
+ signature: this._extractSignature(b, content, boundaryTypes),
1059
+ additionalSymbols: null,
1060
+ });
1061
+ }
1062
+ sectionStart = bIdx + 1;
1063
+ }
1064
+ buffer = [];
1065
+ bufferSize = 0;
1066
+ return;
1067
+ }
1068
+
1069
+ const firstBoundary = boundariesInBuffer[0];
1070
+ let name = null;
1071
+ let type = 'code';
1072
+ if (firstBoundary) {
1073
+ const resolved = this._resolveBoundary(firstBoundary);
1074
+ name = this._extractNodeName(resolved.nameNode);
1075
+ type = resolved.type;
1076
+ }
1077
+ const signature = firstBoundary ? this._extractSignature(firstBoundary, content, boundaryTypes) : null;
1078
+ // When the cAST sibling-merge collapses multiple top-level
1079
+ // boundaries into one chunk (e.g. small rust file with two
1080
+ // adjacent free-standing fns), only the first boundary's name
1081
+ // would otherwise reach embedding/LI headers — the bi-encoder
1082
+ // never sees the sibling symbol names. Collect them here and
1083
+ // pass through so buildEmbeddingText() / buildLiText() can
1084
+ // surface them via an `# Additional:` header line.
1085
+ let additionalSymbols = null;
1086
+ if (boundariesInBuffer.length > 1) {
1087
+ const sibNames = boundariesInBuffer.slice(1)
1088
+ .map(n => this._extractNodeName(n))
1089
+ .filter(n => n && n !== name);
1090
+ if (sibNames.length > 0) additionalSymbols = sibNames;
1091
+ }
1092
+
1093
+ // Tail-orphan merge: when the buffer about to be flushed is
1094
+ // small AND has no boundary symbol of its own, append it into
1095
+ // the previous chunk PROVIDED:
1096
+ // (a) the previous chunk's endLine is within 5 lines of this
1097
+ // buffer's startLine (spatial locality — avoids merging
1098
+ // a `module.exports` at line 163 with a class method at
1099
+ // line 30)
1100
+ // (b) merging keeps total under 1.25× maxSize (avoid overflow
1101
+ // cliffs)
1102
+ //
1103
+ // We deliberately don't require same parentChunkId because the
1104
+ // canonical orphan-tail case (lib/schema-controller.js) has the
1105
+ // tail at FILE-level (parent=null) but the previous emitted
1106
+ // chunk is the last METHOD of a class (parent=class_id) emitted
1107
+ // via the recursive call. Spatial proximity is the more
1108
+ // structural test — a 2-line trailing assignment immediately
1109
+ // after a class block belongs with that block.
1110
+ const prev = chunks[chunks.length - 1];
1111
+ const isOrphanTail = !firstBoundary
1112
+ && text.trim().length < SMALL_TAIL_THRESHOLD;
1113
+ const bufferStart = buffer[0].startPosition.row;
1114
+ const linesGap = prev ? bufferStart - prev.endLine : Infinity;
1115
+ const isSpatiallyClose = linesGap >= 0 && linesGap <= 5;
1116
+ const mergedSize = prev ? (prev.text.length + 1 + text.trim().length) : Infinity;
1117
+ const fitsHeadroom = mergedSize <= maxSize * TAIL_MERGE_HEADROOM;
1118
+
1119
+ if (isOrphanTail && prev && isSpatiallyClose && fitsHeadroom) {
1120
+ prev.text = prev.text + '\n' + text.trim();
1121
+ prev.endLine = buffer[buffer.length - 1].endPosition.row;
1122
+ } else {
1123
+ chunks.push({
1124
+ chunkId: this._nextChunkId(),
1125
+ parentChunkId: parentInfo?.chunkId || null,
1126
+ parentSymbol: parentInfo?.name || null,
1127
+ parentType: parentInfo?.type || null,
1128
+ text: text.trim(),
1129
+ startLine: buffer[0].startPosition.row,
1130
+ endLine: buffer[buffer.length - 1].endPosition.row,
1131
+ type,
1132
+ name: name || (buffer.length === 1 ? null : null),
1133
+ signature,
1134
+ additionalSymbols,
1135
+ });
1136
+ }
517
1137
  }
518
1138
  buffer = [];
519
1139
  bufferSize = 0;
@@ -537,14 +1157,66 @@ export class TreeSitterProvider {
537
1157
  } else {
538
1158
  // Node is oversized even alone — recurse into children
539
1159
  if (node.childCount > 0) {
540
- const name = this._extractNodeName(node);
541
- const type = NODE_TYPE_MAP[node.type] || 'code';
1160
+ const resolved = this._resolveBoundary(node);
1161
+ const name = this._extractNodeName(resolved.nameNode);
1162
+ const type = resolved.type;
1163
+
1164
+ // Header chunk for oversized BOUNDARY nodes (large classes,
1165
+ // structs, traits, etc.): emit a small "header" chunk before
1166
+ // recursing into the body. Without this, queries that match
1167
+ // the boundary's name itself (rather than any inner member)
1168
+ // have NO chunk anchored on the boundary — only sub-chunks
1169
+ // with parent_symbol context. Empirically (kotlin JobSupport,
1170
+ // 1582-line `open class JobSupport`), this left class-targeted
1171
+ // queries to lose to inner method chunks. The header chunk
1172
+ // captures the declaration + leading doc-comment / opening
1173
+ // body (up to ~600 chars) so the boundary name is searchable.
1174
+ //
1175
+ // Gating: only when the node is a BOUNDARY_TYPES AND has a name.
1176
+ // Top-level Ruby method nodes are excluded because those
1177
+ // unscoped `def` snippets are normalized to anonymous code chunks
1178
+ // by ASTChunker. Parent-scoped Ruby methods still get header
1179
+ // chunks when oversized.
1180
+ // Header text is bounded to maxSize so we never exceed embed cap.
1181
+ const isRubyMethodHeader = parentInfo == null
1182
+ && (node.type === 'method' || node.type === 'singleton_method');
1183
+ if (boundaryTypes.has(node.type) && name && !isRubyMethodHeader) {
1184
+ const HEADER_MAX_CHARS = Math.min(600, maxSize);
1185
+ const headerEndIdx = Math.min(node.endIndex, node.startIndex + HEADER_MAX_CHARS);
1186
+ const headerText = content.substring(node.startIndex, headerEndIdx);
1187
+ if (headerText.trim().length > 30) {
1188
+ const lineCount = headerText.split('\n').length;
1189
+ chunks.push({
1190
+ chunkId: this._nextChunkId(),
1191
+ parentChunkId: parentInfo?.chunkId || null,
1192
+ parentSymbol: parentInfo?.name || null,
1193
+ parentType: parentInfo?.type || null,
1194
+ text: headerText.trim(),
1195
+ startLine: node.startPosition.row,
1196
+ endLine: node.startPosition.row + Math.max(0, lineCount - 1),
1197
+ type,
1198
+ name,
1199
+ signature: this._extractSignature(node, content, boundaryTypes),
1200
+ });
1201
+ }
1202
+ }
542
1203
 
543
- // Transparent nodes (e.g., statement_block, block) that have no
544
- // name and aren't boundary types should pass through the caller's
545
- // parent context instead of creating an anonymous level.
1204
+ // Transparent nodes (no name resolved) pass through the caller's
1205
+ // parent context instead of creating an anonymous "unknown" level.
1206
+ // Covers two cases:
1207
+ // 1. Non-boundary containers (statement_block, body_statement,
1208
+ // block) — pre-existing behaviour.
1209
+ // 2. Ruby `class << self` (singleton_class with value=self,
1210
+ // which has no extractable name). Without this carve-out
1211
+ // the chunk's sub-chunks get `parentSymbol='unknown'`,
1212
+ // losing the enclosing class context (e.g. Sinatra::Base);
1213
+ // with it they inherit `parentSymbol='Base'`. Narrowed to
1214
+ // singleton_class so other languages' nameless boundaries
1215
+ // (JS arrow_function, anonymous classes) keep their
1216
+ // pre-existing 'unknown' attribution unchanged.
546
1217
  let subParent;
547
- if (!name && !BOUNDARY_TYPES.has(node.type) && parentInfo) {
1218
+ const isNamelessRubySingleton = node.type === 'singleton_class';
1219
+ if (!name && (!boundaryTypes.has(node.type) || isNamelessRubySingleton) && parentInfo) {
548
1220
  subParent = parentInfo;
549
1221
  } else {
550
1222
  const parentId = this._nextChunkId();
@@ -555,12 +1227,14 @@ export class TreeSitterProvider {
555
1227
  this._getChildren(node),
556
1228
  content,
557
1229
  maxSize,
558
- subParent
1230
+ subParent,
1231
+ boundaryTypes
559
1232
  );
560
1233
  chunks.push(...subChunks);
561
1234
  } else {
562
1235
  // Leaf node too big — emit as-is (never split mid-expression)
563
1236
  const nodeText = content.substring(node.startIndex, node.endIndex);
1237
+ const resolved = this._resolveBoundary(node);
564
1238
  chunks.push({
565
1239
  chunkId: this._nextChunkId(),
566
1240
  parentChunkId: parentInfo?.chunkId || null,
@@ -569,9 +1243,9 @@ export class TreeSitterProvider {
569
1243
  text: nodeText.trim(),
570
1244
  startLine: node.startPosition.row,
571
1245
  endLine: node.endPosition.row,
572
- type: NODE_TYPE_MAP[node.type] || 'code',
573
- name: this._extractNodeName(node),
574
- signature: this._extractSignature(node, content),
1246
+ type: resolved.type,
1247
+ name: this._extractNodeName(resolved.nameNode),
1248
+ signature: this._extractSignature(node, content, boundaryTypes),
575
1249
  });
576
1250
  }
577
1251
  }
@@ -598,9 +1272,9 @@ export class TreeSitterProvider {
598
1272
  * does NOT alter `text`, `li_text`, or `li_greedy_text` — signature
599
1273
  * surface is research-only on `embedding_text`.
600
1274
  */
601
- _extractSignature(node, content) {
1275
+ _extractSignature(node, content, boundaryTypes = BOUNDARY_TYPES) {
602
1276
  if (!node || !content) return null;
603
- if (!BOUNDARY_TYPES.has(node.type)) return null;
1277
+ if (!boundaryTypes.has(node.type)) return null;
604
1278
 
605
1279
  let bodyStart = null;
606
1280
  // Try field-name lookup first (works for most modern grammars).
@@ -636,23 +1310,101 @@ export class TreeSitterProvider {
636
1310
  return normalized.slice(0, MAX_SIGNATURE_LENGTH - 1) + '…';
637
1311
  }
638
1312
 
1313
+ /**
1314
+ * Returns true if `node` is a comment-typed AST node whose source text
1315
+ * is an outer doc-comment immediately preceding a code item.
1316
+ *
1317
+ * Recognized outer-doc prefixes (cross-language):
1318
+ * /// — Rust outer doc, C/C++/C# triple-slash documentation
1319
+ * /** — Javadoc, JSDoc, PHPDoc, Doxygen, KDoc, Scaladoc
1320
+ *
1321
+ * Deliberately excludes:
1322
+ * //! — Rust inner doc (applies to enclosing module, not next item)
1323
+ * // — plain line comments (Go uses these as docs but the same
1324
+ * syntax is used for arbitrary inline notes; ambiguous, skip)
1325
+ * # — shell/Ruby/Python pound comments (ambiguous, and Python
1326
+ * docstrings live INSIDE the function, not preceding it)
1327
+ *
1328
+ * Used by the SIBLING_DOC_SPLIT branch in recursiveChunk.flushBuffer to
1329
+ * decide whether each of N top-level sibling boundaries has its own
1330
+ * docstring (in which case they each deserve their own chunk).
1331
+ */
1332
+ _isLeadingDocComment(node, content) {
1333
+ if (!node || !node.type) return false;
1334
+ // Tree-sitter comment node names vary by grammar (line_comment,
1335
+ // block_comment, comment, doc_comment); gate on a stable suffix.
1336
+ if (!/comment$/.test(node.type)) return false;
1337
+ const text = content.substring(node.startIndex, node.endIndex).trimStart();
1338
+ return text.startsWith('///') || text.startsWith('/**');
1339
+ }
1340
+
639
1341
  /** Extract symbol name from an AST node */
640
1342
  _extractNodeName(node) {
641
1343
  // Try field name first (most reliable)
642
1344
  const nameNode = node.childForFieldName('name');
643
1345
  if (nameNode) return nameNode.text;
644
1346
 
645
- // Fallback: look for identifier-type children (uses IDENT_TYPES set)
1347
+ // Rust `impl<'a> Type<'a> { ... }` — the type field is a
1348
+ // `generic_type` wrapper, not a leaf `type_identifier`, so the
1349
+ // IDENT_TYPES fallback below picks up the lifetime keyword instead
1350
+ // (or finds nothing). Drill into the wrapper to recover the type
1351
+ // name. Plain `impl Foo` (no generics) hits the IDENT_TYPES branch
1352
+ // unchanged; `impl Foo for Bar` also unchanged since `Foo` is the
1353
+ // first IDENT_TYPES child today.
1354
+ if (node.type === 'impl_item') {
1355
+ const typeNode = node.childForFieldName('type');
1356
+ if (typeNode && typeNode.type === 'generic_type') {
1357
+ const inner = typeNode.namedChild(0);
1358
+ if (inner && IDENT_TYPES.has(inner.type)) {
1359
+ return inner.text;
1360
+ }
1361
+ }
1362
+ }
1363
+
1364
+ // Fallback: look for identifier-type children (uses IDENT_TYPES set).
1365
+ // Visibility-keyword stoplist: tree-sitter-ruby parses bare `private`,
1366
+ // `protected`, `public` (with no args) as standalone `identifier`
1367
+ // statements inside a class/module body — they're method calls on
1368
+ // `self` that toggle subsequent definitions' visibility, not entity
1369
+ // names. When the chunker recurses into an oversized body_statement
1370
+ // and falls back to scanning IDENT_TYPES children, the first such
1371
+ // identifier between method defs would otherwise become the parent
1372
+ // breadcrumb "name=private" and poison every nested chunk's
1373
+ // parentSymbol. Java/Kotlin/C++/C#/Swift parse the same words as
1374
+ // keywords, not identifiers, so this filter is null-op for those
1375
+ // grammars — a Ruby-targeted fix that's safe across the corpus.
646
1376
  for (let i = 0; i < node.childCount; i++) {
647
1377
  const child = node.child(i);
648
1378
  if (IDENT_TYPES.has(child.type)) {
649
- return child.text;
1379
+ const text = child.text;
1380
+ if (text === 'private' || text === 'protected' || text === 'public') continue;
1381
+ return text;
650
1382
  }
651
1383
  }
652
1384
 
653
1385
  return null;
654
1386
  }
655
1387
 
1388
+ /**
1389
+ * Resolve the effective chunk type + name node for a boundary node.
1390
+ * Handles C++ template_declaration wrappers by drilling into the first
1391
+ * child with a known NODE_TYPE_MAP entry (class_specifier, struct_specifier,
1392
+ * function_definition, alias_declaration, etc.). Without this, templated
1393
+ * structs/classes/aliases were emitted as type=code with name=null because
1394
+ * template_declaration itself has no name field.
1395
+ */
1396
+ _resolveBoundary(node) {
1397
+ if (node.type === 'template_declaration') {
1398
+ for (let i = 0; i < node.childCount; i++) {
1399
+ const c = node.child(i);
1400
+ if (NODE_TYPE_MAP[c.type]) {
1401
+ return { type: NODE_TYPE_MAP[c.type], nameNode: c };
1402
+ }
1403
+ }
1404
+ }
1405
+ return { type: NODE_TYPE_MAP[node.type] || 'code', nameNode: node };
1406
+ }
1407
+
656
1408
  /** Create a tree-sitter query (mockable seam for tests) */
657
1409
  async _createQuery(language, queryString) {
658
1410
  const { Query } = await import('web-tree-sitter');
@@ -670,11 +1422,33 @@ export class TreeSitterProvider {
670
1422
  if (fs.existsSync(localPath)) return localPath;
671
1423
  }
672
1424
 
673
- // Strategy 2: .sweet-search/grammars/
1425
+ // Strategy 2: .sweet-search/grammars/ relative to process.cwd().
1426
+ // Used when sweet-search is run from inside a target repo and that repo
1427
+ // ships project-specific grammar overrides under its own .sweet-search/.
674
1428
  const dataDir = process.env.SWEET_SEARCH_DATA_DIR || '.sweet-search';
675
1429
  const dataPath = pathMod.join(process.cwd(), dataDir, 'grammars', `${grammarName}.wasm`);
676
1430
  if (fs.existsSync(dataPath)) return dataPath;
677
1431
 
1432
+ // Strategy 2b: .sweet-search/grammars/ relative to the sweet-search PACKAGE
1433
+ // root (the directory containing this provider file's parent's parent).
1434
+ // This is the home for grammar overrides that need to survive `npm install`
1435
+ // wiping the tree-sitter-wasms bundle (Strategy 3) and also be visible when
1436
+ // the indexer is run from an arbitrary target repo (so process.cwd() is not
1437
+ // the sweet-search root). Required for the Swift grammar override —
1438
+ // tree-sitter-wasms@0.1.13 ships swift v0.4.0 which crashes Node 25.x V8
1439
+ // turboshaft Wasm tier-up (Zone OOM in WasmLoweringPhase); the working
1440
+ // v0.7.2 wasm from alex-pinkus/tree-sitter-swift `0.7.2-pypi` lives here.
1441
+ // Resolve via import.meta.url so it works whether sweet-search is the cwd
1442
+ // or a node_modules dependency.
1443
+ try {
1444
+ const providerDir = pathMod.dirname(new URL(import.meta.url).pathname);
1445
+ const pkgRoot = pathMod.resolve(providerDir, '..', '..');
1446
+ const pkgOverridePath = pathMod.join(pkgRoot, '.sweet-search', 'grammars', `${grammarName}.wasm`);
1447
+ if (fs.existsSync(pkgOverridePath)) return pkgOverridePath;
1448
+ } catch {
1449
+ // import.meta.url unavailable (e.g. some bundlers); fall through.
1450
+ }
1451
+
678
1452
  // Strategy 3: tree-sitter-wasms bundle (all grammars in one package)
679
1453
  try {
680
1454
  const bundlePkg = await import.meta.resolve?.('tree-sitter-wasms/package.json');