sweet-search 2.5.2 → 2.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/cli.js +24 -3
- package/core/graph/graph-expansion.js +215 -36
- package/core/graph/graph-extractor.js +196 -11
- package/core/graph/graph-search.js +395 -92
- package/core/graph/hcgs-generator.js +2 -1
- package/core/graph/index.js +2 -0
- package/core/graph/repo-map.js +28 -6
- package/core/graph/structural-answer-cues.js +168 -0
- package/core/graph/structural-callsite-hints.js +40 -0
- package/core/graph/structural-context-format.js +40 -0
- package/core/graph/structural-context.js +450 -0
- package/core/graph/structural-forward-push.js +156 -0
- package/core/graph/structural-header-context.js +19 -0
- package/core/graph/structural-importance.js +148 -0
- package/core/graph/structural-pagerank.js +197 -0
- package/core/graph/summary-manager.js +13 -9
- package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
- package/core/incremental-indexing/application/file-watcher.mjs +197 -0
- package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
- package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
- package/core/incremental-indexing/application/operator-cli.mjs +554 -0
- package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
- package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
- package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
- package/core/incremental-indexing/application/reconciler.mjs +477 -0
- package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
- package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
- package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
- package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
- package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
- package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
- package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
- package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
- package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
- package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
- package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
- package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
- package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
- package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
- package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
- package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
- package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
- package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
- package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
- package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
- package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
- package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
- package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
- package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
- package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
- package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
- package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
- package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
- package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
- package/core/indexing/admission-policy.js +139 -0
- package/core/indexing/artifact-builder.js +29 -12
- package/core/indexing/ast-chunker.js +107 -30
- package/core/indexing/dedup/exemplar-selector.js +19 -1
- package/core/indexing/gitignore-filter.js +223 -0
- package/core/indexing/incremental-tracker.js +99 -30
- package/core/indexing/index-codebase-v21.js +6 -5
- package/core/indexing/index-maintainer.mjs +698 -6
- package/core/indexing/indexer-ann.js +99 -15
- package/core/indexing/indexer-build.js +158 -45
- package/core/indexing/indexer-empty-baseline.js +80 -0
- package/core/indexing/indexer-manifest.js +66 -0
- package/core/indexing/indexer-phases.js +56 -23
- package/core/indexing/indexer-sparse-gram.js +54 -13
- package/core/indexing/indexer-utils.js +26 -208
- package/core/indexing/indexing-file-policy.js +32 -7
- package/core/indexing/maintainer-launcher.mjs +137 -0
- package/core/indexing/merkle-tracker.js +251 -244
- package/core/indexing/model-pool.js +46 -5
- package/core/infrastructure/code-graph-repository.js +758 -6
- package/core/infrastructure/code-graph-visibility.js +157 -0
- package/core/infrastructure/codebase-repository.js +100 -13
- package/core/infrastructure/config/search.js +1 -1
- package/core/infrastructure/db-utils.js +118 -0
- package/core/infrastructure/dedup-hashing.js +10 -13
- package/core/infrastructure/hardware-capability.js +17 -7
- package/core/infrastructure/index.js +8 -2
- package/core/infrastructure/language-patterns/maps.js +4 -1
- package/core/infrastructure/language-patterns/registry-core.js +56 -17
- package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
- package/core/infrastructure/language-patterns.js +69 -0
- package/core/infrastructure/model-registry.js +20 -0
- package/core/infrastructure/native-inference.js +7 -12
- package/core/infrastructure/native-resolver.js +52 -37
- package/core/infrastructure/native-sparse-gram.js +261 -20
- package/core/infrastructure/native-tokenizer.js +6 -15
- package/core/infrastructure/simd-distance.js +10 -16
- package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
- package/core/infrastructure/structural-alias-resolver.js +122 -0
- package/core/infrastructure/structural-candidate-ranker.js +34 -0
- package/core/infrastructure/structural-context-repository.js +472 -0
- package/core/infrastructure/structural-context-utils.js +51 -0
- package/core/infrastructure/structural-graph-signals.js +121 -0
- package/core/infrastructure/structural-qualified-resolution.js +15 -0
- package/core/infrastructure/structural-source-definitions.js +100 -0
- package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
- package/core/infrastructure/tree-sitter-provider.js +811 -37
- package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
- package/core/query/query-router.js +55 -5
- package/core/ranking/file-kind-ranking.js +2192 -15
- package/core/ranking/late-interaction-index.js +87 -12
- package/core/search/cli-decoration.js +290 -0
- package/core/search/context-expander.js +988 -78
- package/core/search/index.js +1 -0
- package/core/search/output-policy.js +275 -0
- package/core/search/search-anchor.js +499 -0
- package/core/search/search-boost.js +93 -1
- package/core/search/search-cli.js +61 -204
- package/core/search/search-hybrid.js +250 -10
- package/core/search/search-pattern-chunks.js +57 -8
- package/core/search/search-pattern-planner.js +68 -9
- package/core/search/search-pattern-prefilter.js +30 -10
- package/core/search/search-pattern-ripgrep.js +40 -4
- package/core/search/search-pattern-sparse-overlay.js +256 -0
- package/core/search/search-pattern.js +117 -29
- package/core/search/search-postprocess.js +479 -5
- package/core/search/search-read-semantic.js +260 -23
- package/core/search/search-read.js +82 -64
- package/core/search/search-reader-pin.js +71 -0
- package/core/search/search-rrf.js +279 -0
- package/core/search/search-semantic.js +110 -5
- package/core/search/search-server.js +130 -57
- package/core/search/search-trace.js +107 -0
- package/core/search/server-identity.js +93 -0
- package/core/search/session-daemon-prewarm.mjs +33 -10
- package/core/search/sweet-search.js +399 -7
- package/core/skills/sweet-index/SKILL.md +8 -6
- package/core/vector-store/binary-hnsw-index.js +194 -30
- package/core/vector-store/float-vector-store.js +96 -6
- package/core/vector-store/hnsw-index.js +220 -49
- package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
- package/eval/agent-read-workflows/bin/ss-find +15 -0
- package/eval/agent-read-workflows/bin/ss-grep +12 -0
- package/eval/agent-read-workflows/bin/ss-read +14 -0
- package/eval/agent-read-workflows/bin/ss-search +18 -0
- package/eval/agent-read-workflows/bin/ss-semantic +12 -0
- package/eval/agent-read-workflows/bin/ss-trace +11 -0
- package/mcp/read-tool.js +109 -0
- package/mcp/server.js +55 -15
- package/mcp/tool-handlers.js +14 -124
- package/mcp/trace-tool.js +81 -0
- package/package.json +25 -10
- package/scripts/hooks/intercept-read.mjs +55 -0
- package/scripts/hooks/remind-tools.mjs +40 -0
- package/scripts/init.js +698 -54
- package/scripts/inject-agent-instructions.js +431 -0
- package/scripts/install-prompt-reminders.js +188 -0
- package/scripts/install-tool-enforcement.js +220 -0
- package/scripts/smoke-test.js +12 -9
- package/scripts/uninstall.js +276 -18
- package/scripts/write-claude-rules.js +110 -0
|
@@ -12,9 +12,20 @@
|
|
|
12
12
|
*/
|
|
13
13
|
|
|
14
14
|
// Grammar mapping: language ID -> grammar WASM file stem
|
|
15
|
+
//
|
|
16
|
+
// `tsx` uses tree-sitter-tsx (not tree-sitter-typescript) so that JSX inside
|
|
17
|
+
// .tsx bodies parses without producing ERROR nodes. Empirically (May 2026),
|
|
18
|
+
// routing .tsx to tree-sitter-typescript caused `export function Component(...)
|
|
19
|
+
// { return <Foo/> }` to silently miss the function-name capture, even though
|
|
20
|
+
// the tag query rule matched the AST shape — the JSX body created sibling
|
|
21
|
+
// ERROR nodes that broke capture resolution.
|
|
22
|
+
//
|
|
23
|
+
// tree-sitter-javascript already supports JSX natively, so .jsx files don't
|
|
24
|
+
// need a separate grammar.
|
|
15
25
|
const GRAMMAR_MAP = {
|
|
16
26
|
javascript: 'tree-sitter-javascript',
|
|
17
27
|
typescript: 'tree-sitter-typescript',
|
|
28
|
+
tsx: 'tree-sitter-tsx',
|
|
18
29
|
python: 'tree-sitter-python',
|
|
19
30
|
go: 'tree-sitter-go',
|
|
20
31
|
rust: 'tree-sitter-rust',
|
|
@@ -25,6 +36,16 @@ const GRAMMAR_MAP = {
|
|
|
25
36
|
php: 'tree-sitter-php',
|
|
26
37
|
kotlin: 'tree-sitter-kotlin',
|
|
27
38
|
swift: 'tree-sitter-swift',
|
|
39
|
+
// tree-sitter-c-sharp ships in node_modules/tree-sitter-wasms/out/ but was
|
|
40
|
+
// previously unwired — C# fell through to the regex chunker in
|
|
41
|
+
// parseBraceBasedFile. That path missed every modern-C# idiom whose
|
|
42
|
+
// declaration line doesn't fit the rigid regex shape: `unsafe` modifier
|
|
43
|
+
// ordering, positional `record`, tuple-typed generic returns (e.g.
|
|
44
|
+
// `IAsyncEnumerable<(byte[] e, int len, …)>`), expression-bodied methods,
|
|
45
|
+
// file-scoped namespaces, indexers, operators, local functions, nested
|
|
46
|
+
// classes. Wiring tree-sitter-c-sharp puts C# on the same code path as
|
|
47
|
+
// the other 13 languages (cAST sibling-merge over a proper AST).
|
|
48
|
+
csharp: 'tree-sitter-c_sharp',
|
|
28
49
|
};
|
|
29
50
|
|
|
30
51
|
// Identifier node types — used to detect leaf-ident captures in extractSymbols()
|
|
@@ -49,13 +70,36 @@ const BOUNDARY_TYPES = new Set([
|
|
|
49
70
|
'interface_declaration', 'type_alias_declaration', 'enum_declaration',
|
|
50
71
|
// Structs/Traits (Rust/Go)
|
|
51
72
|
'struct_item', 'impl_item', 'trait_item', 'type_declaration',
|
|
73
|
+
// Rust macros (macro_rules!)
|
|
74
|
+
'macro_definition',
|
|
52
75
|
// Modules
|
|
53
76
|
'module', 'namespace_declaration',
|
|
54
77
|
// Python
|
|
55
78
|
'decorated_definition',
|
|
56
79
|
// Java
|
|
57
80
|
'record_declaration', 'constructor_declaration',
|
|
58
|
-
//
|
|
81
|
+
// Java annotation types (`@interface Foo { ... }`). Without this, files
|
|
82
|
+
// that contain only an annotation declaration (gson SerializedName.java,
|
|
83
|
+
// Since.java, Until.java) produce no chunk anchor — the chunker emits
|
|
84
|
+
// a generic 'code' chunk and downstream search-time enrichment via
|
|
85
|
+
// findFirstEntityInRange then attaches whatever entity happens to start
|
|
86
|
+
// in the chunk's line range (which, when extractJava also ran with no
|
|
87
|
+
// block-comment skip, was a phantom `class MyClass` from inside the
|
|
88
|
+
// Javadoc <pre> example). Anchoring on the annotation declaration
|
|
89
|
+
// gives the @interface a proper name/type at index time.
|
|
90
|
+
'annotation_type_declaration',
|
|
91
|
+
// Ruby — tree-sitter-ruby uses bare node names `class`, `method`,
|
|
92
|
+
// `singleton_method`, `singleton_class` (no `_declaration`/`_definition`
|
|
93
|
+
// suffix). Without these in the boundary set the cAST chunker:
|
|
94
|
+
// 1. never anchors a chunk on a Ruby class declaration (so `class Base`,
|
|
95
|
+
// `class IndifferentHash`, etc. produce only anonymous `code` chunks);
|
|
96
|
+
// 2. merges 8+ adjacent methods into one chunk and labels it after
|
|
97
|
+
// whichever singleton_method happened to be present in the merge;
|
|
98
|
+
// 3. drops `class << self` (the Sinatra DSL idiom) entirely.
|
|
99
|
+
// tree-sitter-ruby grammar reference: github.com/tree-sitter/tree-sitter-ruby
|
|
100
|
+
// (node types `class`, `method`, `singleton_class`). Aider's published
|
|
101
|
+
// tags.scm for Ruby uses the same node names.
|
|
102
|
+
'class', 'method', 'singleton_class',
|
|
59
103
|
'singleton_method',
|
|
60
104
|
// PHP
|
|
61
105
|
'trait_declaration',
|
|
@@ -67,8 +111,80 @@ const BOUNDARY_TYPES = new Set([
|
|
|
67
111
|
'struct_specifier', 'enum_specifier', 'type_definition',
|
|
68
112
|
// C++
|
|
69
113
|
'class_specifier', 'namespace_definition',
|
|
114
|
+
// C++ `using X = ...` type aliases + `template<...> class|struct|fn|using` wrappers.
|
|
115
|
+
// Without these the chunker emitted templated decls as anonymous `code` chunks
|
|
116
|
+
// since the cAST sibling-merge path treated them as non-boundary. _resolveBoundary
|
|
117
|
+
// (below) drills into template_declaration to surface the inner class/struct/fn/alias
|
|
118
|
+
// name so the chunk metadata names + type the correct thing.
|
|
119
|
+
'alias_declaration', 'template_declaration',
|
|
70
120
|
]);
|
|
71
121
|
|
|
122
|
+
// Per-language EXTRA boundary types. These are unioned with BOUNDARY_TYPES
|
|
123
|
+
// only when chunking a file in the matching language — so other languages'
|
|
124
|
+
// chunking behaviour stays byte-identical to before the addition. Used to
|
|
125
|
+
// keep grammar-specific node names out of the global set when those names
|
|
126
|
+
// could overlap with another grammar's nodes that have different chunking
|
|
127
|
+
// semantics. The threading happens in parseFileToChunks() which computes
|
|
128
|
+
// `effectiveBoundaryTypes = BOUNDARY_TYPES ∪ LANG_EXTRA_BOUNDARY_TYPES[lang]`
|
|
129
|
+
// once per parse and passes it through recursiveChunk + _extractSignature.
|
|
130
|
+
//
|
|
131
|
+
// C# additions (tree-sitter-c-sharp emits these for first-class declarations,
|
|
132
|
+
// verified empirically with scripts/_csharp_grammar_probe.mjs against Garnet):
|
|
133
|
+
// - struct_declaration, record_struct_declaration: C# struct / record struct
|
|
134
|
+
// - property_declaration: anchors per-property chunks so `RespCommandDocs.Command`
|
|
135
|
+
// style queries (CS-004) have a property-scoped chunk to land on; cAST
|
|
136
|
+
// sibling-merge still bundles small auto-properties into 2000-char buffers
|
|
137
|
+
// named after the first property + additional_symbols listing the rest.
|
|
138
|
+
// - delegate_declaration: `public delegate T Foo(...);` becomes a chunk anchor.
|
|
139
|
+
// - destructor_declaration, indexer_declaration, operator_declaration,
|
|
140
|
+
// conversion_operator_declaration: first-class declarations per C# spec.
|
|
141
|
+
// - file_scoped_namespace_declaration: C# 10+ `namespace Foo;` shape.
|
|
142
|
+
// - local_function_statement: nested function declarations inside methods.
|
|
143
|
+
// - event_declaration, event_field_declaration: events behave as
|
|
144
|
+
// property/field-shaped entities at search time.
|
|
145
|
+
// All these node names are C#-specific in our 14-language matrix EXCEPT
|
|
146
|
+
// `struct_declaration` (also Swift) and `property_declaration` (also Swift),
|
|
147
|
+
// which is exactly why they live here instead of in the global set.
|
|
148
|
+
const LANG_EXTRA_BOUNDARY_TYPES = {
|
|
149
|
+
csharp: new Set([
|
|
150
|
+
'struct_declaration', 'record_struct_declaration',
|
|
151
|
+
'delegate_declaration', 'destructor_declaration',
|
|
152
|
+
'property_declaration',
|
|
153
|
+
'indexer_declaration', 'operator_declaration',
|
|
154
|
+
'conversion_operator_declaration',
|
|
155
|
+
'file_scoped_namespace_declaration',
|
|
156
|
+
'local_function_statement',
|
|
157
|
+
'event_declaration', 'event_field_declaration',
|
|
158
|
+
]),
|
|
159
|
+
};
|
|
160
|
+
|
|
161
|
+
// Per-language EXCLUSIONS from BOUNDARY_TYPES. Removes node-type names that
|
|
162
|
+
// the global set legitimately includes for one grammar but that collide
|
|
163
|
+
// with anonymous-keyword leaves in another grammar — producing phantom
|
|
164
|
+
// chunks during the cAST oversized-recursion path.
|
|
165
|
+
//
|
|
166
|
+
// Concrete trigger: tree-sitter-ruby uses bare `class` / `method` /
|
|
167
|
+
// `singleton_class` / `singleton_method` as the *node type names* of
|
|
168
|
+
// declarations (no `_declaration`/`_definition` suffix — see Ruby comment
|
|
169
|
+
// in BOUNDARY_TYPES). Those four strings are correctly in BOUNDARY_TYPES.
|
|
170
|
+
//
|
|
171
|
+
// But tree-sitter-c-sharp (and tree-sitter-java, tree-sitter-kotlin, etc.)
|
|
172
|
+
// emits an *anonymous keyword leaf* with type-string `"class"` as a child
|
|
173
|
+
// of `class_declaration`. When the chunker recurses into an oversized
|
|
174
|
+
// C# class and flushes the pre-body buffer (modifiers + class keyword +
|
|
175
|
+
// identifier + base_list), that `class` keyword leaf is misidentified as
|
|
176
|
+
// a boundary, producing a phantom `[class/null]` chunk with content
|
|
177
|
+
// `internal\nsealed\nclass\nRespServerSession\n: ServerSessionBase`.
|
|
178
|
+
//
|
|
179
|
+
// Java/Kotlin have the same latent bug (verified empirically on gson's
|
|
180
|
+
// TypeAdapters.java — emits a tiny `[class/null]` size=31 chunk at the
|
|
181
|
+
// class declaration line). The fix is intentionally scoped to csharp
|
|
182
|
+
// only so this PR doesn't change Java/Kotlin chunk output at all
|
|
183
|
+
// (their existing phantom chunks are tiny and don't affect retrieval).
|
|
184
|
+
const LANG_BOUNDARY_TYPE_EXCLUDES = {
|
|
185
|
+
csharp: new Set(['class']),
|
|
186
|
+
};
|
|
187
|
+
|
|
72
188
|
// AST node types that represent function/class bodies. Used by
|
|
73
189
|
// extractSignature() to find where the declaration's body starts so
|
|
74
190
|
// the signature span is everything before it (decorators + name +
|
|
@@ -109,13 +225,24 @@ const NODE_TYPE_MAP = {
|
|
|
109
225
|
'impl_item': 'impl',
|
|
110
226
|
'trait_item': 'trait',
|
|
111
227
|
'type_declaration': 'struct',
|
|
228
|
+
'macro_definition': 'macro',
|
|
112
229
|
'module': 'module',
|
|
113
230
|
'namespace_declaration': 'namespace',
|
|
114
231
|
'decorated_definition': 'decorator',
|
|
115
232
|
// Java
|
|
116
233
|
'record_declaration': 'record',
|
|
117
234
|
'constructor_declaration': 'method',
|
|
118
|
-
//
|
|
235
|
+
// @interface Foo { ... } — chunk labelled as 'interface' to match the
|
|
236
|
+
// existing extractJava regex behaviour (and the gold-probe convention
|
|
237
|
+
// that annotation types are interfaces). Note: a Java annotation is
|
|
238
|
+
// formally an *interface* per JLS §9.6, just a specialised form.
|
|
239
|
+
'annotation_type_declaration': 'interface',
|
|
240
|
+
// Ruby — `class` is the bare tree-sitter-ruby node name for class
|
|
241
|
+
// declarations (Java/JS use `class_declaration`, Python `class_definition`,
|
|
242
|
+
// C++ `class_specifier`). `singleton_class` is `class << self` (or
|
|
243
|
+
// `class << SomeConst`) which opens the receiver's singleton scope.
|
|
244
|
+
'class': 'class',
|
|
245
|
+
'singleton_class': 'class',
|
|
119
246
|
'method': 'method',
|
|
120
247
|
'singleton_method': 'method',
|
|
121
248
|
// PHP
|
|
@@ -133,10 +260,43 @@ const NODE_TYPE_MAP = {
|
|
|
133
260
|
// C++
|
|
134
261
|
'class_specifier': 'class',
|
|
135
262
|
'namespace_definition': 'namespace',
|
|
263
|
+
'alias_declaration': 'typeAlias',
|
|
264
|
+
// template_declaration intentionally absent — resolved to the inner
|
|
265
|
+
// type (class/struct/function/typeAlias) by _resolveBoundary at lookup time.
|
|
266
|
+
// C# — fires only on nodes that the chunker treats as boundaries; for
|
|
267
|
+
// non-C# languages those nodes are NOT in the effective boundary set
|
|
268
|
+
// (see LANG_EXTRA_BOUNDARY_TYPES), so _resolveBoundary is not invoked
|
|
269
|
+
// on them during normal sibling-merge. The leaf-too-big pathological
|
|
270
|
+
// branch is the only place these could be consulted for another
|
|
271
|
+
// grammar (e.g. an oversized Swift `property_declaration` with no
|
|
272
|
+
// children) — the resulting type label is a strict improvement over
|
|
273
|
+
// the previous 'code' fallback in that case.
|
|
274
|
+
'struct_declaration': 'struct',
|
|
275
|
+
'record_struct_declaration': 'record',
|
|
276
|
+
'delegate_declaration': 'function',
|
|
277
|
+
'destructor_declaration': 'method',
|
|
278
|
+
'property_declaration': 'property',
|
|
279
|
+
'indexer_declaration': 'method',
|
|
280
|
+
'operator_declaration': 'method',
|
|
281
|
+
'conversion_operator_declaration': 'method',
|
|
282
|
+
'file_scoped_namespace_declaration': 'namespace',
|
|
283
|
+
'local_function_statement': 'function',
|
|
284
|
+
'event_declaration': 'property',
|
|
285
|
+
'event_field_declaration': 'field',
|
|
136
286
|
};
|
|
137
287
|
|
|
138
288
|
// Standard tags.scm query patterns for symbol extraction
|
|
139
289
|
// These are s-expression patterns matching tree-sitter node types
|
|
290
|
+
//
|
|
291
|
+
// Naming conventions for new captures (May 2026):
|
|
292
|
+
// @component.definition — `export const X = call(...)` (HOC-wrapped values
|
|
293
|
+
// like memo/forwardRef/createSlice). Higher priority than @variable, so
|
|
294
|
+
// when both fire on the same declarator, component wins via dedup-by-name+line
|
|
295
|
+
// in graph-extractor._normalizeTreeSitterEntities.
|
|
296
|
+
// @variable.definition — any other `export const X = expr` (literals, objects,
|
|
297
|
+
// typed configs). Scoped to export_statement on purpose: we don't want to
|
|
298
|
+
// extract every internal `const x = 1` inside a function body. Tree-sitter
|
|
299
|
+
// emits @arrow.definition in priority over @variable.definition when value is an arrow.
|
|
140
300
|
const TAGS_QUERIES = {
|
|
141
301
|
javascript: `
|
|
142
302
|
(function_declaration name: (identifier) @function.definition)
|
|
@@ -149,6 +309,27 @@ const TAGS_QUERIES = {
|
|
|
149
309
|
(export_statement (function_declaration name: (identifier) @function.definition))
|
|
150
310
|
(export_statement
|
|
151
311
|
declaration: (class_declaration name: (identifier) @class.definition))
|
|
312
|
+
(export_statement
|
|
313
|
+
declaration: (lexical_declaration
|
|
314
|
+
(variable_declarator
|
|
315
|
+
name: (identifier) @component.definition
|
|
316
|
+
value: (call_expression))))
|
|
317
|
+
(export_statement
|
|
318
|
+
declaration: (lexical_declaration
|
|
319
|
+
(variable_declarator
|
|
320
|
+
name: (identifier) @variable.definition)))
|
|
321
|
+
; Top-level (non-exported) file-scope const declarations.
|
|
322
|
+
; HISTORY (2026-05-10): a prior version captured ALL top-level lexical
|
|
323
|
+
; declarations as @variable.definition / @component.definition. That
|
|
324
|
+
; over-extracted trivial consts (\`const VERSION = '5.8.4'\`,
|
|
325
|
+
; \`const X = require('...')\`) which then dominated NL retrieval rankings
|
|
326
|
+
; over real function/method definitions (regressed 5 fastify probes vs
|
|
327
|
+
; post-perf-60 baseline). Restored to scoping @variable.definition to
|
|
328
|
+
; export_statement only, matching the original intent in
|
|
329
|
+
; graph-extractor.js:_normalizeTreeSitterEntities (line 1320 comment).
|
|
330
|
+
; If structural-mode resolution of CJS top-level consts is needed,
|
|
331
|
+
; add narrowly-scoped captures (e.g. value: [(array) (object) (new_expression)])
|
|
332
|
+
; rather than re-introducing unrestricted (program ...) captures.
|
|
152
333
|
(pair
|
|
153
334
|
key: (property_identifier) @method.definition
|
|
154
335
|
value: (function_expression))
|
|
@@ -172,6 +353,66 @@ const TAGS_QUERIES = {
|
|
|
172
353
|
declaration: (class_declaration name: (type_identifier) @class.definition))
|
|
173
354
|
(export_statement
|
|
174
355
|
declaration: (abstract_class_declaration name: (type_identifier) @class.definition))
|
|
356
|
+
(export_statement
|
|
357
|
+
declaration: (lexical_declaration
|
|
358
|
+
(variable_declarator
|
|
359
|
+
name: (identifier) @component.definition
|
|
360
|
+
value: (call_expression))))
|
|
361
|
+
(export_statement
|
|
362
|
+
declaration: (lexical_declaration
|
|
363
|
+
(variable_declarator
|
|
364
|
+
name: (identifier) @variable.definition)))
|
|
365
|
+
; Top-level non-exported consts intentionally NOT captured — see javascript
|
|
366
|
+
; query above for rationale (regressed fastify probes via const VERSION etc.).
|
|
367
|
+
(pair
|
|
368
|
+
key: (property_identifier) @method.definition
|
|
369
|
+
value: (function_expression))
|
|
370
|
+
(pair
|
|
371
|
+
key: (property_identifier) @arrow.definition
|
|
372
|
+
value: (arrow_function))
|
|
373
|
+
(module name: (identifier) @namespace.definition)
|
|
374
|
+
(internal_module name: (identifier) @namespace.definition)
|
|
375
|
+
`,
|
|
376
|
+
// tsx grammar is a superset of typescript that also parses JSX. Tag query
|
|
377
|
+
// mirrors typescript, except that tsx still carries the top-level `(program ...)` const captures that typescript omits — JSX expressions inside function bodies don't
|
|
378
|
+
// need their own captures (the surrounding function/component declaration is
|
|
379
|
+
// what we care about). We MUST keep these in sync if typescript adds new rules.
|
|
380
|
+
tsx: `
|
|
381
|
+
(function_declaration name: (identifier) @function.definition)
|
|
382
|
+
(generator_function_declaration name: (identifier) @function.definition)
|
|
383
|
+
(class_declaration name: (type_identifier) @class.definition)
|
|
384
|
+
(abstract_class_declaration name: (type_identifier) @class.definition)
|
|
385
|
+
(method_definition name: (property_identifier) @method.definition)
|
|
386
|
+
(interface_declaration name: (type_identifier) @interface.definition)
|
|
387
|
+
(type_alias_declaration name: (type_identifier) @type.definition)
|
|
388
|
+
(enum_declaration name: (identifier) @enum.definition)
|
|
389
|
+
(variable_declarator
|
|
390
|
+
name: (identifier) @arrow.definition
|
|
391
|
+
value: (arrow_function))
|
|
392
|
+
(export_statement
|
|
393
|
+
declaration: (class_declaration name: (type_identifier) @class.definition))
|
|
394
|
+
(export_statement
|
|
395
|
+
declaration: (abstract_class_declaration name: (type_identifier) @class.definition))
|
|
396
|
+
(export_statement
|
|
397
|
+
declaration: (lexical_declaration
|
|
398
|
+
(variable_declarator
|
|
399
|
+
name: (identifier) @component.definition
|
|
400
|
+
value: (call_expression))))
|
|
401
|
+
(export_statement
|
|
402
|
+
declaration: (lexical_declaration
|
|
403
|
+
(variable_declarator
|
|
404
|
+
name: (identifier) @variable.definition)))
|
|
405
|
+
; Top-level (non-exported) file-scope const declarations — see javascript
|
|
406
|
+
; query for rationale.
|
|
407
|
+
(program
|
|
408
|
+
(lexical_declaration
|
|
409
|
+
(variable_declarator
|
|
410
|
+
name: (identifier) @component.definition
|
|
411
|
+
value: (call_expression))))
|
|
412
|
+
(program
|
|
413
|
+
(lexical_declaration
|
|
414
|
+
(variable_declarator
|
|
415
|
+
name: (identifier) @variable.definition)))
|
|
175
416
|
(pair
|
|
176
417
|
key: (property_identifier) @method.definition
|
|
177
418
|
value: (function_expression))
|
|
@@ -197,20 +438,27 @@ const TAGS_QUERIES = {
|
|
|
197
438
|
(impl_item type: (type_identifier) @impl.definition)
|
|
198
439
|
(trait_item name: (type_identifier) @trait.definition)
|
|
199
440
|
(enum_item name: (type_identifier) @enum.definition)
|
|
441
|
+
(macro_definition name: (identifier) @macro.definition)
|
|
200
442
|
`,
|
|
201
443
|
java: `
|
|
202
444
|
(class_declaration name: (identifier) @class.definition)
|
|
203
445
|
(interface_declaration name: (identifier) @interface.definition)
|
|
446
|
+
(annotation_type_declaration name: (identifier) @interface.definition)
|
|
204
447
|
(enum_declaration name: (identifier) @enum.definition)
|
|
205
448
|
(record_declaration name: (identifier) @record.definition)
|
|
206
449
|
(method_declaration name: (identifier) @method.definition)
|
|
207
450
|
(constructor_declaration name: (identifier) @method.definition)
|
|
451
|
+
(annotation_type_element_declaration name: (identifier) @method.definition)
|
|
452
|
+
(enum_constant name: (identifier) @enum_constant.definition)
|
|
453
|
+
(field_declaration declarator: (variable_declarator name: (identifier) @field.definition))
|
|
208
454
|
`,
|
|
209
455
|
ruby: `
|
|
210
456
|
(class name: (constant) @class.definition)
|
|
457
|
+
(singleton_class value: (constant) @class.definition)
|
|
211
458
|
(module name: (constant) @module.definition)
|
|
212
459
|
(method name: (identifier) @method.definition)
|
|
213
460
|
(singleton_method name: (identifier) @method.definition)
|
|
461
|
+
(alias name: (identifier) @method.definition)
|
|
214
462
|
`,
|
|
215
463
|
php: `
|
|
216
464
|
(class_declaration name: (name) @class.definition)
|
|
@@ -251,9 +499,102 @@ const TAGS_QUERIES = {
|
|
|
251
499
|
(struct_specifier name: (type_identifier) @struct.definition)
|
|
252
500
|
(enum_specifier name: (type_identifier) @enum.definition)
|
|
253
501
|
(namespace_definition name: (namespace_identifier) @namespace.definition)
|
|
502
|
+
(alias_declaration name: (type_identifier) @type.definition)
|
|
503
|
+
`,
|
|
504
|
+
// C# — tree-sitter-c-sharp uses bare `identifier` (not type_identifier)
|
|
505
|
+
// for all names, and exposes `name:` fields on every first-class
|
|
506
|
+
// declaration. Probed against Garnet's 30+ partial-class shards in
|
|
507
|
+
// scripts/_csharp_grammar_probe.mjs: every boundary type below emits
|
|
508
|
+
// a parseable name field. Indexer/operator declarations have no
|
|
509
|
+
// user-facing name (the `this[…]` / `operator+` is the identity),
|
|
510
|
+
// so they're captured at node level via @method.definition.
|
|
511
|
+
// Namespaces use `qualified_name` for dotted forms (`Garnet.server`)
|
|
512
|
+
// and `identifier` for single-segment forms; we capture both shapes.
|
|
513
|
+
csharp: `
|
|
514
|
+
(class_declaration name: (identifier) @class.definition)
|
|
515
|
+
(interface_declaration name: (identifier) @interface.definition)
|
|
516
|
+
(struct_declaration name: (identifier) @struct.definition)
|
|
517
|
+
(record_declaration name: (identifier) @record.definition)
|
|
518
|
+
(record_struct_declaration name: (identifier) @record.definition)
|
|
519
|
+
(enum_declaration name: (identifier) @enum.definition)
|
|
520
|
+
(delegate_declaration name: (identifier) @function.definition)
|
|
521
|
+
(namespace_declaration name: (identifier) @namespace.definition)
|
|
522
|
+
(namespace_declaration name: (qualified_name) @namespace.definition)
|
|
523
|
+
(file_scoped_namespace_declaration name: (identifier) @namespace.definition)
|
|
524
|
+
(file_scoped_namespace_declaration name: (qualified_name) @namespace.definition)
|
|
525
|
+
(method_declaration name: (identifier) @method.definition)
|
|
526
|
+
(constructor_declaration name: (identifier) @method.definition)
|
|
527
|
+
(destructor_declaration name: (identifier) @method.definition)
|
|
528
|
+
(property_declaration name: (identifier) @property.definition)
|
|
529
|
+
(indexer_declaration) @method.definition
|
|
530
|
+
(operator_declaration) @method.definition
|
|
531
|
+
(conversion_operator_declaration) @method.definition
|
|
532
|
+
(event_declaration name: (identifier) @property.definition)
|
|
533
|
+
(event_field_declaration (variable_declaration (variable_declarator name: (identifier) @field.definition)))
|
|
534
|
+
(field_declaration (variable_declaration (variable_declarator name: (identifier) @field.definition)))
|
|
535
|
+
(local_function_statement name: (identifier) @function.definition)
|
|
254
536
|
`,
|
|
255
537
|
};
|
|
256
538
|
|
|
539
|
+
// Names that tree-sitter-c / tree-sitter-cpp sometimes emit as the `name:`
|
|
540
|
+
// field of a struct_specifier / class_specifier / function_declarator when
|
|
541
|
+
// the parser misidentifies a C/C++ keyword as a user identifier. Common
|
|
542
|
+
// cause: a header-only C++ library has its .h file routed to C (because
|
|
543
|
+
// .h → c in EXTENSION_MAP), and tree-sitter-c then encounters C++ keywords
|
|
544
|
+
// (`alignas`, `namespace`, `decltype`, `enum class`) it does not recognize.
|
|
545
|
+
//
|
|
546
|
+
// Examples:
|
|
547
|
+
// `struct alignas(16) uint128_t { ... }` tree-sitter-c → name=alignas
|
|
548
|
+
// `enum class Color { RED };` tree-sitter-c → name=class (under struct_specifier)
|
|
549
|
+
// `using Vec = decltype(Zero(D()));` tree-sitter-c → name=decltype (under function)
|
|
550
|
+
// `namespace hwy::x86 { ... }` tree-sitter-c → name=namespace (under function)
|
|
551
|
+
// `if (cond) { body }` tree-sitter-c → name=if (under function, on misparse)
|
|
552
|
+
//
|
|
553
|
+
// Most entries are reserved keywords in C and/or C++. A few are not (`final`
|
|
554
|
+
// and `override` are contextual; `class`, `namespace`, `template` are plain
|
|
555
|
+
// identifiers in C), so the filter can, in rare cases, drop a legitimate name.
|
|
556
|
+
//
|
|
557
|
+
// Closed list. Scoped to languageId ∈ {c, cpp} via C_FAMILY_LANGUAGES so a
|
|
558
|
+
// Go/Python/JS file with a class literally named `final` is not affected.
|
|
559
|
+
// Evidence gathered from highway @ 3c72230 cpp probe corpus:
|
|
560
|
+
// decltype: 667 captures, alignas: 1, namespace: phantom on CPP-008,
|
|
561
|
+
// class (under enum): 10, if: 11. Audit query in commit message.
|
|
562
|
+
const C_FAMILY_ATTRIBUTE_PHANTOM_NAMES = new Set([
|
|
563
|
+
// Attribute / specifier keywords
|
|
564
|
+
'alignas', // C++11 keyword
|
|
565
|
+
'_Alignas', // C11 keyword
|
|
566
|
+
'__attribute__', // GCC extension
|
|
567
|
+
'__declspec', // MSVC extension
|
|
568
|
+
'__inline__', // GCC extension
|
|
569
|
+
'__forceinline', // MSVC extension
|
|
570
|
+
'final', // C++11 contextual keyword
|
|
571
|
+
'override', // C++11 contextual keyword
|
|
572
|
+
// Type-deduction operators
|
|
573
|
+
'decltype', // C++11
|
|
574
|
+
'typeof', // C2x / GCC extension
|
|
575
|
+
'__typeof', // GCC extension
|
|
576
|
+
'__typeof__', // GCC extension
|
|
577
|
+
// Structural keywords
|
|
578
|
+
'class', // C++ — miscaptured from `enum class` in .h→c misparse
|
|
579
|
+
'struct', // C/C++
|
|
580
|
+
'union', // C/C++
|
|
581
|
+
'enum', // C/C++
|
|
582
|
+
'namespace', // C++
|
|
583
|
+
'typedef', // C/C++
|
|
584
|
+
'template', // C++
|
|
585
|
+
// Control-flow keywords (miscaptured as functions on parse errors)
|
|
586
|
+
'if', // C/C++
|
|
587
|
+
'for', // C/C++
|
|
588
|
+
'while', // C/C++
|
|
589
|
+
'switch', // C/C++
|
|
590
|
+
'do', // C/C++
|
|
591
|
+
]);
|
|
592
|
+
|
|
593
|
+
// Languages where C_FAMILY_ATTRIBUTE_PHANTOM_NAMES should be filtered.
|
|
594
|
+
// Scoped narrowly to C/C++ so a Go/Python/JS file containing a type
|
|
595
|
+
// literally named `final` is not affected.
|
|
596
|
+
const C_FAMILY_LANGUAGES = new Set(['c', 'cpp']);
|
|
597
|
+
|
|
257
598
|
// Map capture names from tags.scm queries to entity types
|
|
258
599
|
const CAPTURE_TO_ENTITY_TYPE = {
|
|
259
600
|
'function.definition': 'function',
|
|
@@ -272,6 +613,30 @@ const CAPTURE_TO_ENTITY_TYPE = {
|
|
|
272
613
|
'record.definition': 'record',
|
|
273
614
|
'module.definition': 'module',
|
|
274
615
|
'object.definition': 'class',
|
|
616
|
+
// JS/TS exported const declarations (May 2026):
|
|
617
|
+
// @component fires only when value is a call_expression (memo/forwardRef/createSlice etc.);
|
|
618
|
+
// @variable fires for any export const, including string/object/typed literals.
|
|
619
|
+
// Both can match the same node; component wins via priority dedup downstream.
|
|
620
|
+
'component.definition': 'component',
|
|
621
|
+
'variable.definition': 'variable',
|
|
622
|
+
'macro.definition': 'macro',
|
|
623
|
+
// C# property declarations — `public RespCommand Command { get; init; }`.
|
|
624
|
+
// Used by the csharp TAGS_QUERIES entry to give init-only properties /
|
|
625
|
+
// computed properties / event-as-property declarations their own graph
|
|
626
|
+
// entity. Currently no other tree-sitter grammar in this codebase emits
|
|
627
|
+
// a @property.definition capture, so 'property' is a C#-private entity
|
|
628
|
+
// type at the moment (Swift's property_declaration node is not captured
|
|
629
|
+
// by tags.scm and won't reach this map).
|
|
630
|
+
'property.definition': 'property',
|
|
631
|
+
// Java enum constants (FieldNamingPolicy.UPPER_CAMEL_CASE) and field
|
|
632
|
+
// declarations (TypeAdapters.BIT_SET — a static final field whose
|
|
633
|
+
// initializer is an anonymous `new TypeAdapter<BitSet>() { ... }`).
|
|
634
|
+
// Both are first-class declarations per JLS but tree-sitter-java
|
|
635
|
+
// exposes them under distinct node types (enum_constant,
|
|
636
|
+
// field_declaration > variable_declarator) that our query previously
|
|
637
|
+
// ignored, so neither got a proper symbol anchor in the graph.
|
|
638
|
+
'enum_constant.definition': 'enum_constant',
|
|
639
|
+
'field.definition': 'field',
|
|
275
640
|
};
|
|
276
641
|
|
|
277
642
|
export class TreeSitterProvider {
|
|
@@ -381,7 +746,7 @@ export class TreeSitterProvider {
|
|
|
381
746
|
const seen = new Set(); // deduplicate by startIndex
|
|
382
747
|
for (const capture of captures) {
|
|
383
748
|
const { name: captureName, node } = capture;
|
|
384
|
-
|
|
749
|
+
let entityType = CAPTURE_TO_ENTITY_TYPE[captureName];
|
|
385
750
|
if (!entityType) continue;
|
|
386
751
|
|
|
387
752
|
// When queries capture an identifier (e.g. `name: (identifier) @x`),
|
|
@@ -390,6 +755,34 @@ export class TreeSitterProvider {
|
|
|
390
755
|
const isLeafIdent = IDENT_TYPES.has(node.type);
|
|
391
756
|
const extentNode = isLeafIdent && node.parent ? node.parent : node;
|
|
392
757
|
|
|
758
|
+
// Go's grammar collapses every `type X …` declaration into
|
|
759
|
+
// `type_declaration → type_spec` with a single @type.definition
|
|
760
|
+
// capture, which the table above maps to the catch-all 'typeAlias'.
|
|
761
|
+
// The downstream `type` field of a type_spec encodes whether X is a
|
|
762
|
+
// struct, an interface, or a true type alias / slice / func type.
|
|
763
|
+
// Drill in and emit the more specific entity type so symbol-type
|
|
764
|
+
// filtering, file-kind boosts and probe gold checks (GO-005 / GO-007
|
|
765
|
+
// / GO-008 expect 'interface'/'struct', not 'typeAlias') work as
|
|
766
|
+
// intended. Pure precision refinement: same node extent, same
|
|
767
|
+
// symbol name, only the label changes. Other languages have no
|
|
768
|
+
// `type_spec` node, so this branch is structurally Go-only.
|
|
769
|
+
if (
|
|
770
|
+
languageId === 'go' &&
|
|
771
|
+
entityType === 'typeAlias' &&
|
|
772
|
+
extentNode.type === 'type_spec'
|
|
773
|
+
) {
|
|
774
|
+
const typeField = extentNode.childForFieldName?.('type');
|
|
775
|
+
if (typeField) {
|
|
776
|
+
if (typeField.type === 'struct_type') entityType = 'struct';
|
|
777
|
+
else if (typeField.type === 'interface_type') entityType = 'interface';
|
|
778
|
+
// All other type-spec rhs shapes (slice/array/map/channel/
|
|
779
|
+
// function/pointer/qualified/identifier/generic/parenthesized)
|
|
780
|
+
// remain 'typeAlias', which is the correct semantic label for
|
|
781
|
+
// `type Middlewares []func(...)`, `type Handler = http.Handler`,
|
|
782
|
+
// etc.
|
|
783
|
+
}
|
|
784
|
+
}
|
|
785
|
+
|
|
393
786
|
// Deduplicate: multiple captures can match the same declaration
|
|
394
787
|
const key = `${extentNode.startIndex}:${entityType}`;
|
|
395
788
|
if (seen.has(key)) continue;
|
|
@@ -411,6 +804,16 @@ export class TreeSitterProvider {
|
|
|
411
804
|
|| this._extractNodeName(node)
|
|
412
805
|
|| `<anonymous:${entityType}>`);
|
|
413
806
|
|
|
807
|
+
// Filter C/C++ phantom captures where the parser bound the `name:`
|
|
808
|
+
// field to a C/C++ keyword (`alignas`, `__attribute__`, etc.) instead
|
|
809
|
+
// of the actual type name. See C_FAMILY_ATTRIBUTE_PHANTOM_NAMES above.
|
|
810
|
+
if (
|
|
811
|
+
C_FAMILY_LANGUAGES.has(languageId) &&
|
|
812
|
+
C_FAMILY_ATTRIBUTE_PHANTOM_NAMES.has(symbolName)
|
|
813
|
+
) {
|
|
814
|
+
continue;
|
|
815
|
+
}
|
|
816
|
+
|
|
414
817
|
symbols.push({
|
|
415
818
|
name: symbolName,
|
|
416
819
|
type: entityType,
|
|
@@ -451,10 +854,40 @@ export class TreeSitterProvider {
|
|
|
451
854
|
const maxChunkSize = (options.maxChunkSize || 2000) - headerOverhead;
|
|
452
855
|
this._chunkCounter = 0;
|
|
453
856
|
|
|
857
|
+
// Per-parse effective boundary set: BOUNDARY_TYPES ∪ language extras
|
|
858
|
+
// \ language excludes. For every language without an extras or excludes
|
|
859
|
+
// entry (= all 13 pre-2026-05-12 languages), this is byte-identical to
|
|
860
|
+
// BOUNDARY_TYPES (same Set reference), so non-C# parsing semantics are
|
|
861
|
+
// unchanged.
|
|
862
|
+
const langExtra = LANG_EXTRA_BOUNDARY_TYPES[languageId];
|
|
863
|
+
const langExcludes = LANG_BOUNDARY_TYPE_EXCLUDES[languageId];
|
|
864
|
+
let boundaryTypes;
|
|
865
|
+
if (!langExtra && !langExcludes) {
|
|
866
|
+
boundaryTypes = BOUNDARY_TYPES;
|
|
867
|
+
} else {
|
|
868
|
+
boundaryTypes = new Set(BOUNDARY_TYPES);
|
|
869
|
+
if (langExtra) for (const t of langExtra) boundaryTypes.add(t);
|
|
870
|
+
if (langExcludes) for (const t of langExcludes) boundaryTypes.delete(t);
|
|
871
|
+
}
|
|
872
|
+
|
|
454
873
|
const children = this._getChildren(tree.rootNode);
|
|
455
|
-
const chunks = this.recursiveChunk(children, content, maxChunkSize, null);
|
|
874
|
+
const chunks = this.recursiveChunk(children, content, maxChunkSize, null, boundaryTypes);
|
|
456
875
|
|
|
457
876
|
tree.delete(); // free WASM memory
|
|
877
|
+
|
|
878
|
+
// Filter phantom C/C++ attribute names — null out chunk.name when the
|
|
879
|
+
// parser bound it to a C/C++ keyword. The chunk itself stays (the code
|
|
880
|
+
// is real); only the symbol label is corrected. Downstream anomalous-
|
|
881
|
+
// chunk demotion will treat any small-span resulting anonymous chunk
|
|
882
|
+
// appropriately.
|
|
883
|
+
if (C_FAMILY_LANGUAGES.has(languageId) && chunks) {
|
|
884
|
+
for (const chunk of chunks) {
|
|
885
|
+
if (chunk?.name && C_FAMILY_ATTRIBUTE_PHANTOM_NAMES.has(chunk.name)) {
|
|
886
|
+
chunk.name = null;
|
|
887
|
+
}
|
|
888
|
+
}
|
|
889
|
+
}
|
|
890
|
+
|
|
458
891
|
return chunks.length > 0 ? chunks : null;
|
|
459
892
|
}
|
|
460
893
|
|
|
@@ -483,13 +916,33 @@ export class TreeSitterProvider {
|
|
|
483
916
|
* @param {string} content - Full file content
|
|
484
917
|
* @param {number} maxSize - Maximum chunk size in characters
|
|
485
918
|
* @param {object|null} parentInfo - Parent chunk info for hierarchical linking
|
|
919
|
+
* @param {Set<string>} [boundaryTypes] - Effective boundary set (per-parse,
|
|
920
|
+
* (BOUNDARY_TYPES ∪ LANG_EXTRA_BOUNDARY_TYPES[lang]) \ LANG_BOUNDARY_TYPE_EXCLUDES[lang]). When omitted, falls
|
|
921
|
+
* back to BOUNDARY_TYPES — preserves the pre-2026-05-12 call signature
|
|
922
|
+
* for any internal caller that constructs this provider directly.
|
|
486
923
|
* @returns {Array} List of chunk objects
|
|
487
924
|
*/
|
|
488
|
-
recursiveChunk(nodes, content, maxSize, parentInfo) {
|
|
925
|
+
recursiveChunk(nodes, content, maxSize, parentInfo, boundaryTypes = BOUNDARY_TYPES) {
|
|
489
926
|
const chunks = [];
|
|
490
927
|
let buffer = [];
|
|
491
928
|
let bufferSize = 0;
|
|
492
929
|
|
|
930
|
+
// SMALL_TAIL_THRESHOLD: chunks below this character count are
|
|
931
|
+
// considered "orphan tails" — they tend to be `module.exports`,
|
|
932
|
+
// closing braces, trailing const declarations, etc. that cAST's
|
|
933
|
+
// sibling-merge couldn't fit into the previous buffer when it
|
|
934
|
+
// overflowed maxSize. Merging them into the preceding emitted
|
|
935
|
+
// chunk (when it ends within 5 lines of the tail and the merge won't push
|
|
936
|
+
// past 1.25× maxSize) gives the agent a coherent unit instead
|
|
937
|
+
// of a 2-line dangling chunk that wins retrieval on its own.
|
|
938
|
+
//
|
|
939
|
+
// Verified canary: lib/schema-controller.js was emitting
|
|
940
|
+
// [148-161 setupSerializer] followed by [163-164 module.exports]
|
|
941
|
+
// as two separate chunks — the orphan tail won S2-Q3 retrieval.
|
|
942
|
+
// After merge, the tail joins setupSerializer.
|
|
943
|
+
const SMALL_TAIL_THRESHOLD = 100;
|
|
944
|
+
const TAIL_MERGE_HEADROOM = 1.25;
|
|
945
|
+
|
|
493
946
|
const flushBuffer = () => {
|
|
494
947
|
if (buffer.length === 0) return;
|
|
495
948
|
const text = buffer
|
|
@@ -497,23 +950,190 @@ export class TreeSitterProvider {
|
|
|
497
950
|
.join('\n');
|
|
498
951
|
|
|
499
952
|
if (text.trim().length > 30) {
|
|
500
|
-
const
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
953
|
+
const boundariesInBuffer = buffer.filter(n => boundaryTypes.has(n.type));
|
|
954
|
+
|
|
955
|
+
// SIBLING_DOC_SPLIT (RS-008 motivation, May 2026): at top level, when
|
|
956
|
+
// 2+ boundary-typed siblings each carry an immediately-preceding outer
|
|
957
|
+
// doc-comment, emit one chunk per boundary instead of merging them.
|
|
958
|
+
// cAST sibling-merge would otherwise collapse them into one chunk
|
|
959
|
+
// anchored solely on the first boundary's name (e.g. packaging.rs's
|
|
960
|
+
// `is_package` + `detect_package_root` collapse into one
|
|
961
|
+
// `# function: is_package` chunk). The bi-encoder then sees only the
|
|
962
|
+
// first symbol as primary, and the sibling's doc-comment gets
|
|
963
|
+
// averaged into the pooled embedding — a `# Additional:` header is
|
|
964
|
+
// too weak to recover the secondary symbol at production k=5.
|
|
965
|
+
//
|
|
966
|
+
// Section i = buffer[afterPrevBoundary .. boundary_i]. The first
|
|
967
|
+
// section absorbs all leading file-level material (module-level
|
|
968
|
+
// comments, use stmts) so it stays attached to the first boundary;
|
|
969
|
+
// the last section absorbs any trailing non-boundary nodes.
|
|
970
|
+
//
|
|
971
|
+
// Validation (May 2026, full §3 pipeline):
|
|
972
|
+
// - All 17 non-rust language packs: byte-identical to baseline
|
|
973
|
+
// - retrieval-probes 60: 46/4/10 identical
|
|
974
|
+
// - GCSN dev MRR@10: 86.92% exact
|
|
975
|
+
// - Rust AST-tester: 5/0/3 identical, zero PASS→FAIL flips
|
|
976
|
+
// - doc-positive / doc-negative rust: identical
|
|
977
|
+
// RS-008 did NOT flip — bottleneck is encoder-bound (resolver.rs's
|
|
978
|
+
// doc-string literally names `detect_package_root`, beating
|
|
979
|
+
// packaging.rs on TF/IR). Shipped anyway as a structurally-correct
|
|
980
|
+
// cAST refinement: more focused chunks for documented multi-fn
|
|
981
|
+
// top-level files (e.g. fs.rs now has 5 per-fn chunks instead of
|
|
982
|
+
// one merged chunk), with zero regression cost across all gates.
|
|
983
|
+
//
|
|
984
|
+
// Gating (conservative — undocumented helpers stay merged):
|
|
985
|
+
// 1. parentInfo == null: top-level only. Nested contexts (mod,
|
|
986
|
+
// impl, class bodies) keep cAST merge behaviour because their
|
|
987
|
+
// `# Parent:` header line already anchors siblings to the
|
|
988
|
+
// enclosing scope.
|
|
989
|
+
// 2. boundariesInBuffer.length >= 2: nothing to split if one.
|
|
990
|
+
// 3. every boundary has a leading outer-doc comment (`///` or
|
|
991
|
+
// `/**`). Mixed documented/undocumented buffers fall through
|
|
992
|
+
// to cAST merge to avoid inflating chunk counts unnecessarily.
|
|
993
|
+
// RUBY_CLASS_SIBLING_SPLIT: split when buffer has 2+ Ruby class/
|
|
994
|
+
// module siblings, each with an extractable name. cAST sibling-
|
|
995
|
+
// merge otherwise collapses adjacent tiny classes — e.g. sinatra
|
|
996
|
+
// base.rb's `class ExtendedRack` + `class CommonLogger` + `class
|
|
997
|
+
// Error` + ... — into one chunk labeled after the first boundary,
|
|
998
|
+
// and later entity adoption (file-kind-ranking.applyResultDemotions)
|
|
999
|
+
// walks UP via findEnclosingEntity over the merged range and
|
|
1000
|
+
// silently relabels the chunk to the outer module/namespace,
|
|
1001
|
+
// losing the IAR anchor. Splitting per-class restores 1:1 chunk-
|
|
1002
|
+
// to-entity alignment.
|
|
1003
|
+
//
|
|
1004
|
+
// Ruby-only gate via the tree-sitter-ruby-specific node type names
|
|
1005
|
+
// (`class`/`module`/`singleton_class`). Other grammars use
|
|
1006
|
+
// `class_declaration` (Java/JS/TS/Kotlin/C#), `class_definition`
|
|
1007
|
+
// (Python/Dart), `class_specifier` (C++), `struct_item` (Rust),
|
|
1008
|
+
// etc. — none of those node names exist in tree-sitter-ruby, and
|
|
1009
|
+
// `class`/`module`/`singleton_class` don't exist in any other
|
|
1010
|
+
// grammar. So this split is byte-identical-null-op for every non-
|
|
1011
|
+
// Ruby language pack and the 60-probe retrieval bench, while
|
|
1012
|
+
// fixing the chunker-bound regressions on Ruby AST probes RB-001
|
|
1013
|
+
// through RB-008.
|
|
1014
|
+
const RUBY_CLASS_LIKE_TYPES = new Set([
|
|
1015
|
+
'class',
|
|
1016
|
+
'module',
|
|
1017
|
+
'singleton_class',
|
|
1018
|
+
]);
|
|
1019
|
+
const isClassLikeSiblingSet = boundariesInBuffer.length >= 2
|
|
1020
|
+
&& boundariesInBuffer.every(b => {
|
|
1021
|
+
if (!RUBY_CLASS_LIKE_TYPES.has(b.type)) return false;
|
|
1022
|
+
const resolved = this._resolveBoundary(b);
|
|
1023
|
+
return !!this._extractNodeName(resolved.nameNode);
|
|
1024
|
+
});
|
|
1025
|
+
|
|
1026
|
+
if (
|
|
1027
|
+
parentInfo == null
|
|
1028
|
+
&& boundariesInBuffer.length >= 2
|
|
1029
|
+
&& boundariesInBuffer.every(b => {
|
|
1030
|
+
const idx = buffer.indexOf(b);
|
|
1031
|
+
return idx > 0 && this._isLeadingDocComment(buffer[idx - 1], content);
|
|
1032
|
+
})
|
|
1033
|
+
|| isClassLikeSiblingSet
|
|
1034
|
+
) {
|
|
1035
|
+
let sectionStart = 0;
|
|
1036
|
+
for (let i = 0; i < boundariesInBuffer.length; i++) {
|
|
1037
|
+
const b = boundariesInBuffer[i];
|
|
1038
|
+
const bIdx = buffer.indexOf(b);
|
|
1039
|
+
// Last section absorbs trailing non-boundary nodes after `b`.
|
|
1040
|
+
const isLast = i === boundariesInBuffer.length - 1;
|
|
1041
|
+
const sectionEnd = isLast ? buffer.length - 1 : bIdx;
|
|
1042
|
+
const section = buffer.slice(sectionStart, sectionEnd + 1);
|
|
1043
|
+
const sectionText = section
|
|
1044
|
+
.map(n => content.substring(n.startIndex, n.endIndex))
|
|
1045
|
+
.join('\n');
|
|
1046
|
+
if (sectionText.trim().length > 30) {
|
|
1047
|
+
const resolved = this._resolveBoundary(b);
|
|
1048
|
+
chunks.push({
|
|
1049
|
+
chunkId: this._nextChunkId(),
|
|
1050
|
+
parentChunkId: parentInfo?.chunkId || null,
|
|
1051
|
+
parentSymbol: parentInfo?.name || null,
|
|
1052
|
+
parentType: parentInfo?.type || null,
|
|
1053
|
+
text: sectionText.trim(),
|
|
1054
|
+
startLine: section[0].startPosition.row,
|
|
1055
|
+
endLine: section[section.length - 1].endPosition.row,
|
|
1056
|
+
type: resolved.type,
|
|
1057
|
+
name: this._extractNodeName(resolved.nameNode),
|
|
1058
|
+
signature: this._extractSignature(b, content, boundaryTypes),
|
|
1059
|
+
additionalSymbols: null,
|
|
1060
|
+
});
|
|
1061
|
+
}
|
|
1062
|
+
sectionStart = bIdx + 1;
|
|
1063
|
+
}
|
|
1064
|
+
buffer = [];
|
|
1065
|
+
bufferSize = 0;
|
|
1066
|
+
return;
|
|
1067
|
+
}
|
|
1068
|
+
|
|
1069
|
+
const firstBoundary = boundariesInBuffer[0];
|
|
1070
|
+
let name = null;
|
|
1071
|
+
let type = 'code';
|
|
1072
|
+
if (firstBoundary) {
|
|
1073
|
+
const resolved = this._resolveBoundary(firstBoundary);
|
|
1074
|
+
name = this._extractNodeName(resolved.nameNode);
|
|
1075
|
+
type = resolved.type;
|
|
1076
|
+
}
|
|
1077
|
+
const signature = firstBoundary ? this._extractSignature(firstBoundary, content, boundaryTypes) : null;
|
|
1078
|
+
// When the cAST sibling-merge collapses multiple top-level
|
|
1079
|
+
// boundaries into one chunk (e.g. small rust file with two
|
|
1080
|
+
// adjacent free-standing fns), only the first boundary's name
|
|
1081
|
+
// would otherwise reach embedding/LI headers — the bi-encoder
|
|
1082
|
+
// never sees the sibling symbol names. Collect them here and
|
|
1083
|
+
// pass through so buildEmbeddingText() / buildLiText() can
|
|
1084
|
+
// surface them via an `# Additional:` header line.
|
|
1085
|
+
let additionalSymbols = null;
|
|
1086
|
+
if (boundariesInBuffer.length > 1) {
|
|
1087
|
+
const sibNames = boundariesInBuffer.slice(1)
|
|
1088
|
+
.map(n => this._extractNodeName(n))
|
|
1089
|
+
.filter(n => n && n !== name);
|
|
1090
|
+
if (sibNames.length > 0) additionalSymbols = sibNames;
|
|
1091
|
+
}
|
|
1092
|
+
|
|
1093
|
+
// Tail-orphan merge: when the buffer about to be flushed is
|
|
1094
|
+
// small AND has no boundary symbol of its own, append it into
|
|
1095
|
+
// the previous chunk PROVIDED:
|
|
1096
|
+
// (a) the previous chunk's endLine is within 5 lines of this
|
|
1097
|
+
// buffer's startLine (spatial locality — avoids merging
|
|
1098
|
+
// a `module.exports` at line 163 with a class method at
|
|
1099
|
+
// line 30)
|
|
1100
|
+
// (b) merging keeps total under 1.25× maxSize (avoid overflow
|
|
1101
|
+
// cliffs)
|
|
1102
|
+
//
|
|
1103
|
+
// We deliberately don't require same parentChunkId because the
|
|
1104
|
+
// canonical orphan-tail case (lib/schema-controller.js) has the
|
|
1105
|
+
// tail at FILE-level (parent=null) but the previous emitted
|
|
1106
|
+
// chunk is the last METHOD of a class (parent=class_id) emitted
|
|
1107
|
+
// via the recursive call. Spatial proximity is the more
|
|
1108
|
+
// structural test — a 2-line trailing assignment immediately
|
|
1109
|
+
// after a class block belongs with that block.
|
|
1110
|
+
const prev = chunks[chunks.length - 1];
|
|
1111
|
+
const isOrphanTail = !firstBoundary
|
|
1112
|
+
&& text.trim().length < SMALL_TAIL_THRESHOLD;
|
|
1113
|
+
const bufferStart = buffer[0].startPosition.row;
|
|
1114
|
+
const linesGap = prev ? bufferStart - prev.endLine : Infinity;
|
|
1115
|
+
const isSpatiallyClose = linesGap >= 0 && linesGap <= 5;
|
|
1116
|
+
const mergedSize = prev ? (prev.text.length + 1 + text.trim().length) : Infinity;
|
|
1117
|
+
const fitsHeadroom = mergedSize <= maxSize * TAIL_MERGE_HEADROOM;
|
|
1118
|
+
|
|
1119
|
+
if (isOrphanTail && prev && isSpatiallyClose && fitsHeadroom) {
|
|
1120
|
+
prev.text = prev.text + '\n' + text.trim();
|
|
1121
|
+
prev.endLine = buffer[buffer.length - 1].endPosition.row;
|
|
1122
|
+
} else {
|
|
1123
|
+
chunks.push({
|
|
1124
|
+
chunkId: this._nextChunkId(),
|
|
1125
|
+
parentChunkId: parentInfo?.chunkId || null,
|
|
1126
|
+
parentSymbol: parentInfo?.name || null,
|
|
1127
|
+
parentType: parentInfo?.type || null,
|
|
1128
|
+
text: text.trim(),
|
|
1129
|
+
startLine: buffer[0].startPosition.row,
|
|
1130
|
+
endLine: buffer[buffer.length - 1].endPosition.row,
|
|
1131
|
+
type,
|
|
1132
|
+
name: name || (buffer.length === 1 ? null : null),
|
|
1133
|
+
signature,
|
|
1134
|
+
additionalSymbols,
|
|
1135
|
+
});
|
|
1136
|
+
}
|
|
517
1137
|
}
|
|
518
1138
|
buffer = [];
|
|
519
1139
|
bufferSize = 0;
|
|
@@ -537,14 +1157,66 @@ export class TreeSitterProvider {
|
|
|
537
1157
|
} else {
|
|
538
1158
|
// Node is oversized even alone — recurse into children
|
|
539
1159
|
if (node.childCount > 0) {
|
|
540
|
-
const
|
|
541
|
-
const
|
|
1160
|
+
const resolved = this._resolveBoundary(node);
|
|
1161
|
+
const name = this._extractNodeName(resolved.nameNode);
|
|
1162
|
+
const type = resolved.type;
|
|
1163
|
+
|
|
1164
|
+
// Header chunk for oversized BOUNDARY nodes (large classes,
|
|
1165
|
+
// structs, traits, etc.): emit a small "header" chunk before
|
|
1166
|
+
// recursing into the body. Without this, queries that match
|
|
1167
|
+
// the boundary's name itself (rather than any inner member)
|
|
1168
|
+
// have NO chunk anchored on the boundary — only sub-chunks
|
|
1169
|
+
// with parent_symbol context. Empirically (kotlin JobSupport,
|
|
1170
|
+
// 1582-line `open class JobSupport`), this left class-targeted
|
|
1171
|
+
// queries to lose to inner method chunks. The header chunk
|
|
1172
|
+
// captures the declaration + leading doc-comment / opening
|
|
1173
|
+
// body (up to ~600 chars) so the boundary name is searchable.
|
|
1174
|
+
//
|
|
1175
|
+
// Gating: only when the node's type is in the effective boundary set AND it has a name.
|
|
1176
|
+
// Top-level Ruby method nodes are excluded because those
|
|
1177
|
+
// unscoped `def` snippets are normalized to anonymous code chunks
|
|
1178
|
+
// by ASTChunker. Parent-scoped Ruby methods still get header
|
|
1179
|
+
// chunks when oversized.
|
|
1180
|
+
// Header text is bounded to maxSize so we never exceed embed cap.
|
|
1181
|
+
const isRubyMethodHeader = parentInfo == null
|
|
1182
|
+
&& (node.type === 'method' || node.type === 'singleton_method');
|
|
1183
|
+
if (boundaryTypes.has(node.type) && name && !isRubyMethodHeader) {
|
|
1184
|
+
const HEADER_MAX_CHARS = Math.min(600, maxSize);
|
|
1185
|
+
const headerEndIdx = Math.min(node.endIndex, node.startIndex + HEADER_MAX_CHARS);
|
|
1186
|
+
const headerText = content.substring(node.startIndex, headerEndIdx);
|
|
1187
|
+
if (headerText.trim().length > 30) {
|
|
1188
|
+
const lineCount = headerText.split('\n').length;
|
|
1189
|
+
chunks.push({
|
|
1190
|
+
chunkId: this._nextChunkId(),
|
|
1191
|
+
parentChunkId: parentInfo?.chunkId || null,
|
|
1192
|
+
parentSymbol: parentInfo?.name || null,
|
|
1193
|
+
parentType: parentInfo?.type || null,
|
|
1194
|
+
text: headerText.trim(),
|
|
1195
|
+
startLine: node.startPosition.row,
|
|
1196
|
+
endLine: node.startPosition.row + Math.max(0, lineCount - 1),
|
|
1197
|
+
type,
|
|
1198
|
+
name,
|
|
1199
|
+
signature: this._extractSignature(node, content, boundaryTypes),
|
|
1200
|
+
});
|
|
1201
|
+
}
|
|
1202
|
+
}
|
|
542
1203
|
|
|
543
|
-
// Transparent nodes (
|
|
544
|
-
//
|
|
545
|
-
//
|
|
1204
|
+
// Transparent nodes (no name resolved) pass through the caller's
|
|
1205
|
+
// parent context instead of creating an anonymous "unknown" level.
|
|
1206
|
+
// Covers two cases:
|
|
1207
|
+
// 1. Non-boundary containers (statement_block, body_statement,
|
|
1208
|
+
// block) — pre-existing behaviour.
|
|
1209
|
+
// 2. Ruby `class << self` (singleton_class with value=self,
|
|
1210
|
+
// which has no extractable name). Without this carve-out
|
|
1211
|
+
// the chunk's sub-chunks get `parentSymbol='unknown'`,
|
|
1212
|
+
// losing the enclosing class context (e.g. Sinatra::Base);
|
|
1213
|
+
// with it they inherit `parentSymbol='Base'`. Narrowed to
|
|
1214
|
+
// singleton_class so other languages' nameless boundaries
|
|
1215
|
+
// (JS arrow_function, anonymous classes) keep their
|
|
1216
|
+
// pre-existing 'unknown' attribution unchanged.
|
|
546
1217
|
let subParent;
|
|
547
|
-
|
|
1218
|
+
const isNamelessRubySingleton = node.type === 'singleton_class';
|
|
1219
|
+
if (!name && (!boundaryTypes.has(node.type) || isNamelessRubySingleton) && parentInfo) {
|
|
548
1220
|
subParent = parentInfo;
|
|
549
1221
|
} else {
|
|
550
1222
|
const parentId = this._nextChunkId();
|
|
@@ -555,12 +1227,14 @@ export class TreeSitterProvider {
|
|
|
555
1227
|
this._getChildren(node),
|
|
556
1228
|
content,
|
|
557
1229
|
maxSize,
|
|
558
|
-
subParent
|
|
1230
|
+
subParent,
|
|
1231
|
+
boundaryTypes
|
|
559
1232
|
);
|
|
560
1233
|
chunks.push(...subChunks);
|
|
561
1234
|
} else {
|
|
562
1235
|
// Leaf node too big — emit as-is (never split mid-expression)
|
|
563
1236
|
const nodeText = content.substring(node.startIndex, node.endIndex);
|
|
1237
|
+
const resolved = this._resolveBoundary(node);
|
|
564
1238
|
chunks.push({
|
|
565
1239
|
chunkId: this._nextChunkId(),
|
|
566
1240
|
parentChunkId: parentInfo?.chunkId || null,
|
|
@@ -569,9 +1243,9 @@ export class TreeSitterProvider {
|
|
|
569
1243
|
text: nodeText.trim(),
|
|
570
1244
|
startLine: node.startPosition.row,
|
|
571
1245
|
endLine: node.endPosition.row,
|
|
572
|
-
type:
|
|
573
|
-
name: this._extractNodeName(
|
|
574
|
-
signature: this._extractSignature(node, content),
|
|
1246
|
+
type: resolved.type,
|
|
1247
|
+
name: this._extractNodeName(resolved.nameNode),
|
|
1248
|
+
signature: this._extractSignature(node, content, boundaryTypes),
|
|
575
1249
|
});
|
|
576
1250
|
}
|
|
577
1251
|
}
|
|
@@ -598,9 +1272,9 @@ export class TreeSitterProvider {
|
|
|
598
1272
|
* does NOT alter `text`, `li_text`, or `li_greedy_text` — signature
|
|
599
1273
|
* surface is research-only on `embedding_text`.
|
|
600
1274
|
*/
|
|
601
|
-
_extractSignature(node, content) {
|
|
1275
|
+
_extractSignature(node, content, boundaryTypes = BOUNDARY_TYPES) {
|
|
602
1276
|
if (!node || !content) return null;
|
|
603
|
-
if (!
|
|
1277
|
+
if (!boundaryTypes.has(node.type)) return null;
|
|
604
1278
|
|
|
605
1279
|
let bodyStart = null;
|
|
606
1280
|
// Try field-name lookup first (works for most modern grammars).
|
|
@@ -636,23 +1310,101 @@ export class TreeSitterProvider {
|
|
|
636
1310
|
return normalized.slice(0, MAX_SIGNATURE_LENGTH - 1) + '…';
|
|
637
1311
|
}
|
|
638
1312
|
|
|
1313
|
+
/**
|
|
1314
|
+
* Returns true if `node` is a comment-typed AST node whose source text
|
|
1315
|
+
* is an outer doc-comment immediately preceding a code item.
|
|
1316
|
+
*
|
|
1317
|
+
* Recognized outer-doc prefixes (cross-language):
|
|
1318
|
+
* /// — Rust outer doc, C/C++/C# triple-slash documentation
|
|
1319
|
+
* /** — Javadoc, JSDoc, PHPDoc, Doxygen, KDoc, Scaladoc
|
|
1320
|
+
*
|
|
1321
|
+
* Deliberately excludes:
|
|
1322
|
+
* //! — Rust inner doc (applies to enclosing module, not next item)
|
|
1323
|
+
* // — plain line comments (Go uses these as docs but the same
|
|
1324
|
+
* syntax is used for arbitrary inline notes; ambiguous, skip)
|
|
1325
|
+
* # — shell/Ruby/Python pound comments (ambiguous, and Python
|
|
1326
|
+
* docstrings live INSIDE the function, not preceding it)
|
|
1327
|
+
*
|
|
1328
|
+
* Used by the SIBLING_DOC_SPLIT branch in recursiveChunk.flushBuffer to
|
|
1329
|
+
* decide whether each of N top-level sibling boundaries has its own
|
|
1330
|
+
* docstring (in which case they each deserve their own chunk).
|
|
1331
|
+
*/
|
|
1332
|
+
_isLeadingDocComment(node, content) {
|
|
1333
|
+
if (!node || !node.type) return false;
|
|
1334
|
+
// Tree-sitter comment node names vary by grammar (line_comment,
|
|
1335
|
+
// block_comment, comment, doc_comment); gate on a stable suffix.
|
|
1336
|
+
if (!/comment$/.test(node.type)) return false;
|
|
1337
|
+
const text = content.substring(node.startIndex, node.endIndex).trimStart();
|
|
1338
|
+
return text.startsWith('///') || text.startsWith('/**');
|
|
1339
|
+
}
|
|
1340
|
+
|
|
639
1341
|
/** Extract symbol name from an AST node */
|
|
640
1342
|
_extractNodeName(node) {
|
|
641
1343
|
// Try field name first (most reliable)
|
|
642
1344
|
const nameNode = node.childForFieldName('name');
|
|
643
1345
|
if (nameNode) return nameNode.text;
|
|
644
1346
|
|
|
645
|
-
//
|
|
1347
|
+
// Rust `impl<'a> Type<'a> { ... }` — the type field is a
|
|
1348
|
+
// `generic_type` wrapper, not a leaf `type_identifier`, so the
|
|
1349
|
+
// IDENT_TYPES fallback below picks up the lifetime keyword instead
|
|
1350
|
+
// (or finds nothing). Drill into the wrapper to recover the type
|
|
1351
|
+
// name. Plain `impl Foo` (no generics) hits the IDENT_TYPES branch
|
|
1352
|
+
// unchanged; `impl Foo for Bar` also unchanged since `Foo` is the
|
|
1353
|
+
// first IDENT_TYPES child today.
|
|
1354
|
+
if (node.type === 'impl_item') {
|
|
1355
|
+
const typeNode = node.childForFieldName('type');
|
|
1356
|
+
if (typeNode && typeNode.type === 'generic_type') {
|
|
1357
|
+
const inner = typeNode.namedChild(0);
|
|
1358
|
+
if (inner && IDENT_TYPES.has(inner.type)) {
|
|
1359
|
+
return inner.text;
|
|
1360
|
+
}
|
|
1361
|
+
}
|
|
1362
|
+
}
|
|
1363
|
+
|
|
1364
|
+
// Fallback: look for identifier-type children (uses IDENT_TYPES set).
|
|
1365
|
+
// Visibility-keyword stoplist: tree-sitter-ruby parses bare `private`,
|
|
1366
|
+
// `protected`, `public` (with no args) as standalone `identifier`
|
|
1367
|
+
// statements inside a class/module body — they're method calls on
|
|
1368
|
+
// `self` that toggle subsequent definitions' visibility, not entity
|
|
1369
|
+
// names. When the chunker recurses into an oversized body_statement
|
|
1370
|
+
// and falls back to scanning IDENT_TYPES children, the first such
|
|
1371
|
+
// identifier between method defs would otherwise become the parent
|
|
1372
|
+
// breadcrumb "name=private" and poison every nested chunk's
|
|
1373
|
+
// parentSymbol. Java/Kotlin/C++/C#/Swift parse the same words as
|
|
1374
|
+
// keywords, not identifiers, so this filter is null-op for those
|
|
1375
|
+
// grammars — a Ruby-targeted fix that's safe across the corpus.
|
|
646
1376
|
for (let i = 0; i < node.childCount; i++) {
|
|
647
1377
|
const child = node.child(i);
|
|
648
1378
|
if (IDENT_TYPES.has(child.type)) {
|
|
649
|
-
|
|
1379
|
+
const text = child.text;
|
|
1380
|
+
if (text === 'private' || text === 'protected' || text === 'public') continue;
|
|
1381
|
+
return text;
|
|
650
1382
|
}
|
|
651
1383
|
}
|
|
652
1384
|
|
|
653
1385
|
return null;
|
|
654
1386
|
}
|
|
655
1387
|
|
|
1388
|
+
/**
|
|
1389
|
+
* Resolve the effective chunk type + name node for a boundary node.
|
|
1390
|
+
* Handles C++ template_declaration wrappers by drilling into the first
|
|
1391
|
+
* child with a known NODE_TYPE_MAP entry (class_specifier, struct_specifier,
|
|
1392
|
+
* function_definition, alias_declaration, etc.). Without this, templated
|
|
1393
|
+
* structs/classes/aliases were emitted as type=code with name=null because
|
|
1394
|
+
* template_declaration itself has no name field.
|
|
1395
|
+
*/
|
|
1396
|
+
_resolveBoundary(node) {
|
|
1397
|
+
if (node.type === 'template_declaration') {
|
|
1398
|
+
for (let i = 0; i < node.childCount; i++) {
|
|
1399
|
+
const c = node.child(i);
|
|
1400
|
+
if (NODE_TYPE_MAP[c.type]) {
|
|
1401
|
+
return { type: NODE_TYPE_MAP[c.type], nameNode: c };
|
|
1402
|
+
}
|
|
1403
|
+
}
|
|
1404
|
+
}
|
|
1405
|
+
return { type: NODE_TYPE_MAP[node.type] || 'code', nameNode: node };
|
|
1406
|
+
}
|
|
1407
|
+
|
|
656
1408
|
/** Create a tree-sitter query (mockable seam for tests) */
|
|
657
1409
|
async _createQuery(language, queryString) {
|
|
658
1410
|
const { Query } = await import('web-tree-sitter');
|
|
@@ -670,11 +1422,33 @@ export class TreeSitterProvider {
|
|
|
670
1422
|
if (fs.existsSync(localPath)) return localPath;
|
|
671
1423
|
}
|
|
672
1424
|
|
|
673
|
-
// Strategy 2: .sweet-search/grammars/
|
|
1425
|
+
// Strategy 2: .sweet-search/grammars/ relative to process.cwd().
|
|
1426
|
+
// Used when sweet-search is run from inside a target repo and that repo
|
|
1427
|
+
// ships project-specific grammar overrides under its own .sweet-search/.
|
|
674
1428
|
const dataDir = process.env.SWEET_SEARCH_DATA_DIR || '.sweet-search';
|
|
675
1429
|
const dataPath = pathMod.join(process.cwd(), dataDir, 'grammars', `${grammarName}.wasm`);
|
|
676
1430
|
if (fs.existsSync(dataPath)) return dataPath;
|
|
677
1431
|
|
|
1432
|
+
// Strategy 2b: .sweet-search/grammars/ relative to the sweet-search PACKAGE
|
|
1433
|
+
// root (two directory levels above the directory holding this provider file).
|
|
1434
|
+
// This is the home for grammar overrides that need to survive `npm install`
|
|
1435
|
+
// wiping the tree-sitter-wasms bundle (Strategy 3) and also be visible when
|
|
1436
|
+
// the indexer is run from an arbitrary target repo (so process.cwd() is not
|
|
1437
|
+
// the sweet-search root). Required for the Swift grammar override —
|
|
1438
|
+
// tree-sitter-wasms@0.1.13 ships swift v0.4.0 which crashes Node 25.x V8
|
|
1439
|
+
// turboshaft Wasm tier-up (Zone OOM in WasmLoweringPhase); the working
|
|
1440
|
+
// v0.7.2 wasm from alex-pinkus/tree-sitter-swift `0.7.2-pypi` lives here.
|
|
1441
|
+
// Resolve via import.meta.url so it works whether sweet-search is the cwd
|
|
1442
|
+
// or a node_modules dependency.
|
|
1443
|
+
try {
|
|
1444
|
+
const providerDir = pathMod.dirname(new URL(import.meta.url).pathname);
|
|
1445
|
+
const pkgRoot = pathMod.resolve(providerDir, '..', '..');
|
|
1446
|
+
const pkgOverridePath = pathMod.join(pkgRoot, '.sweet-search', 'grammars', `${grammarName}.wasm`);
|
|
1447
|
+
if (fs.existsSync(pkgOverridePath)) return pkgOverridePath;
|
|
1448
|
+
} catch {
|
|
1449
|
+
// import.meta.url unavailable (e.g. some bundlers); fall through.
|
|
1450
|
+
}
|
|
1451
|
+
|
|
678
1452
|
// Strategy 3: tree-sitter-wasms bundle (all grammars in one package)
|
|
679
1453
|
try {
|
|
680
1454
|
const bundlePkg = await import.meta.resolve?.('tree-sitter-wasms/package.json');
|