sweet-search 2.5.2 → 2.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. package/core/cli.js +24 -3
  2. package/core/graph/graph-expansion.js +215 -36
  3. package/core/graph/graph-extractor.js +196 -11
  4. package/core/graph/graph-search.js +395 -92
  5. package/core/graph/hcgs-generator.js +2 -1
  6. package/core/graph/index.js +2 -0
  7. package/core/graph/repo-map.js +28 -6
  8. package/core/graph/structural-answer-cues.js +168 -0
  9. package/core/graph/structural-callsite-hints.js +40 -0
  10. package/core/graph/structural-context-format.js +40 -0
  11. package/core/graph/structural-context.js +450 -0
  12. package/core/graph/structural-forward-push.js +156 -0
  13. package/core/graph/structural-header-context.js +19 -0
  14. package/core/graph/structural-importance.js +148 -0
  15. package/core/graph/structural-pagerank.js +197 -0
  16. package/core/graph/summary-manager.js +13 -9
  17. package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
  18. package/core/incremental-indexing/application/file-watcher.mjs +197 -0
  19. package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
  20. package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
  21. package/core/incremental-indexing/application/operator-cli.mjs +554 -0
  22. package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
  23. package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
  24. package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
  25. package/core/incremental-indexing/application/reconciler.mjs +477 -0
  26. package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
  27. package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
  28. package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
  29. package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
  30. package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
  31. package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
  32. package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
  33. package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
  34. package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
  35. package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
  36. package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
  37. package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
  38. package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
  39. package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
  40. package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
  41. package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
  42. package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
  43. package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
  44. package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
  45. package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
  46. package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
  47. package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
  48. package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
  49. package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
  50. package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
  51. package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
  52. package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
  53. package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
  54. package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
  55. package/core/indexing/admission-policy.js +139 -0
  56. package/core/indexing/artifact-builder.js +29 -12
  57. package/core/indexing/ast-chunker.js +107 -30
  58. package/core/indexing/dedup/exemplar-selector.js +19 -1
  59. package/core/indexing/gitignore-filter.js +223 -0
  60. package/core/indexing/incremental-tracker.js +99 -30
  61. package/core/indexing/index-codebase-v21.js +6 -5
  62. package/core/indexing/index-maintainer.mjs +698 -6
  63. package/core/indexing/indexer-ann.js +99 -15
  64. package/core/indexing/indexer-build.js +158 -45
  65. package/core/indexing/indexer-empty-baseline.js +80 -0
  66. package/core/indexing/indexer-manifest.js +66 -0
  67. package/core/indexing/indexer-phases.js +56 -23
  68. package/core/indexing/indexer-sparse-gram.js +54 -13
  69. package/core/indexing/indexer-utils.js +26 -208
  70. package/core/indexing/indexing-file-policy.js +32 -7
  71. package/core/indexing/maintainer-launcher.mjs +137 -0
  72. package/core/indexing/merkle-tracker.js +251 -244
  73. package/core/indexing/model-pool.js +46 -5
  74. package/core/infrastructure/code-graph-repository.js +758 -6
  75. package/core/infrastructure/code-graph-visibility.js +157 -0
  76. package/core/infrastructure/codebase-repository.js +100 -13
  77. package/core/infrastructure/config/search.js +1 -1
  78. package/core/infrastructure/db-utils.js +118 -0
  79. package/core/infrastructure/dedup-hashing.js +10 -13
  80. package/core/infrastructure/hardware-capability.js +17 -7
  81. package/core/infrastructure/index.js +8 -2
  82. package/core/infrastructure/language-patterns/maps.js +4 -1
  83. package/core/infrastructure/language-patterns/registry-core.js +56 -17
  84. package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
  85. package/core/infrastructure/language-patterns.js +69 -0
  86. package/core/infrastructure/model-registry.js +20 -0
  87. package/core/infrastructure/native-inference.js +7 -12
  88. package/core/infrastructure/native-resolver.js +52 -37
  89. package/core/infrastructure/native-sparse-gram.js +261 -20
  90. package/core/infrastructure/native-tokenizer.js +6 -15
  91. package/core/infrastructure/simd-distance.js +10 -16
  92. package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
  93. package/core/infrastructure/structural-alias-resolver.js +122 -0
  94. package/core/infrastructure/structural-candidate-ranker.js +34 -0
  95. package/core/infrastructure/structural-context-repository.js +472 -0
  96. package/core/infrastructure/structural-context-utils.js +51 -0
  97. package/core/infrastructure/structural-graph-signals.js +121 -0
  98. package/core/infrastructure/structural-qualified-resolution.js +15 -0
  99. package/core/infrastructure/structural-source-definitions.js +100 -0
  100. package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
  101. package/core/infrastructure/tree-sitter-provider.js +811 -37
  102. package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
  103. package/core/query/query-router.js +55 -5
  104. package/core/ranking/file-kind-ranking.js +2192 -15
  105. package/core/ranking/late-interaction-index.js +87 -12
  106. package/core/search/cli-decoration.js +290 -0
  107. package/core/search/context-expander.js +988 -78
  108. package/core/search/index.js +1 -0
  109. package/core/search/output-policy.js +275 -0
  110. package/core/search/search-anchor.js +499 -0
  111. package/core/search/search-boost.js +93 -1
  112. package/core/search/search-cli.js +61 -204
  113. package/core/search/search-hybrid.js +250 -10
  114. package/core/search/search-pattern-chunks.js +57 -8
  115. package/core/search/search-pattern-planner.js +68 -9
  116. package/core/search/search-pattern-prefilter.js +30 -10
  117. package/core/search/search-pattern-ripgrep.js +40 -4
  118. package/core/search/search-pattern-sparse-overlay.js +256 -0
  119. package/core/search/search-pattern.js +117 -29
  120. package/core/search/search-postprocess.js +479 -5
  121. package/core/search/search-read-semantic.js +260 -23
  122. package/core/search/search-read.js +82 -64
  123. package/core/search/search-reader-pin.js +71 -0
  124. package/core/search/search-rrf.js +279 -0
  125. package/core/search/search-semantic.js +110 -5
  126. package/core/search/search-server.js +130 -57
  127. package/core/search/search-trace.js +107 -0
  128. package/core/search/server-identity.js +93 -0
  129. package/core/search/session-daemon-prewarm.mjs +33 -10
  130. package/core/search/sweet-search.js +399 -7
  131. package/core/skills/sweet-index/SKILL.md +8 -6
  132. package/core/vector-store/binary-hnsw-index.js +194 -30
  133. package/core/vector-store/float-vector-store.js +96 -6
  134. package/core/vector-store/hnsw-index.js +220 -49
  135. package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
  136. package/eval/agent-read-workflows/bin/ss-find +15 -0
  137. package/eval/agent-read-workflows/bin/ss-grep +12 -0
  138. package/eval/agent-read-workflows/bin/ss-read +14 -0
  139. package/eval/agent-read-workflows/bin/ss-search +18 -0
  140. package/eval/agent-read-workflows/bin/ss-semantic +12 -0
  141. package/eval/agent-read-workflows/bin/ss-trace +11 -0
  142. package/mcp/read-tool.js +109 -0
  143. package/mcp/server.js +55 -15
  144. package/mcp/tool-handlers.js +14 -124
  145. package/mcp/trace-tool.js +81 -0
  146. package/package.json +25 -10
  147. package/scripts/hooks/intercept-read.mjs +55 -0
  148. package/scripts/hooks/remind-tools.mjs +40 -0
  149. package/scripts/init.js +698 -54
  150. package/scripts/inject-agent-instructions.js +431 -0
  151. package/scripts/install-prompt-reminders.js +188 -0
  152. package/scripts/install-tool-enforcement.js +220 -0
  153. package/scripts/smoke-test.js +12 -9
  154. package/scripts/uninstall.js +276 -18
  155. package/scripts/write-claude-rules.js +110 -0
@@ -221,6 +221,7 @@ export const CORE_LANGUAGES = {
221
221
  enum: /^(?:pub\s+)?enum\s+(\w+)/,
222
222
  trait: /^(?:pub\s+)?trait\s+(\w+)/,
223
223
  impl: /^impl(?:<[^>]+>)?\s+(?:\w+\s+for\s+)?(\w+)/,
224
+ macro: /^macro_rules!\s+(\w+)/,
224
225
  },
225
226
  graph: {
226
227
  entities: {
@@ -232,6 +233,7 @@ export const CORE_LANGUAGES = {
232
233
  type: /^(?:pub\s+)?type\s+(\w+)/,
233
234
  const: /^(?:pub\s+)?const\s+(\w+)\s*:/,
234
235
  static: /^(?:pub\s+)?static\s+(\w+)\s*:/,
236
+ macro: /^macro_rules!\s+(\w+)/,
235
237
  },
236
238
  relationships: {
237
239
  use: /^use\s+([\w:]+)(?:::\{([^}]+)\})?/,
@@ -252,13 +254,21 @@ export const CORE_LANGUAGES = {
252
254
  },
253
255
  chunker: {
254
256
  function: /^(?:[\w*\s]+)\s+(\w+)\s*\([^)]*\)\s*\{/,
255
- struct: /^(?:typedef\s+)?struct\s+(\w+)/,
257
+ // Skip attribute-like prefixes between `struct` and the type name:
258
+ // - function-like attrs: __attribute__((...)), __declspec(...), _Alignas(N), __forceinline(...), __inline__(...)
259
+ // Paren matcher allows up to 3 nesting levels — __attribute__ standardly takes
260
+ // a doubly-parenthesized arg, optionally containing call-form attrs like aligned(8).
261
+ // - C++11 attribute syntax: [[ ... ]]
262
+ // - ALL_CAPS user macros (≥3 chars): PACKED_API, HWY_DLLEXPORT, EIGEN_API, etc.
263
+ // Without this, `struct __attribute__((packed)) Foo` captured `__attribute__` as the name.
264
+ // Backtracking handles ALL_CAPS-named forward decls (`struct WHEEL;`).
265
+ struct: /^(?:typedef\s+)?struct(?:\s+(?:__attribute__|__declspec|_Alignas|__forceinline|__inline__)\s*\((?:[^()]|\((?:[^()]|\([^()]*\))*\))*\)|\s+\[\[[^\]]*\]\]|\s+[A-Z][A-Z0-9_]{2,})*\s+(\w+)/,
256
266
  enum: /^(?:typedef\s+)?enum\s+(\w+)/,
257
267
  },
258
268
  graph: {
259
269
  entities: {
260
270
  function: /^(?:static\s+)?(?:inline\s+)?(?:[\w*]+\s+)+(\w+)\s*\([^)]*\)\s*\{/,
261
- struct: /^(?:typedef\s+)?struct\s+(\w+)/,
271
+ struct: /^(?:typedef\s+)?struct(?:\s+(?:__attribute__|__declspec|_Alignas|__forceinline|__inline__)\s*\((?:[^()]|\((?:[^()]|\([^()]*\))*\))*\)|\s+\[\[[^\]]*\]\]|\s+[A-Z][A-Z0-9_]{2,})*\s+(\w+)/,
262
272
  enum: /^(?:typedef\s+)?enum\s+(\w+)/,
263
273
  typedef: /^typedef\s+.+\s+(\w+)\s*;/,
264
274
  macro: /^#define\s+(\w+)/,
@@ -279,14 +289,26 @@ export const CORE_LANGUAGES = {
279
289
  block: ["/*", "*/"],
280
290
  },
281
291
  chunker: {
282
- class: /^(?:class|struct)\s+(\w+)(?:\s*:\s*(?:public|protected|private)\s+(\w+))?/,
292
+ // Skip attribute-like prefixes between `class`/`struct` and the type name.
293
+ // Closed list of standard attributes:
294
+ // - function-like: alignas(N), __attribute__((...)), __declspec(...),
295
+ // __forceinline(...). Paren matcher allows up to 3 nesting levels —
296
+ // __attribute__ standardly takes a doubly-parenthesized arg, optionally
297
+ // containing call-form attrs like aligned(8).
298
+ // - C++11 attribute syntax: [[ ... ]]
299
+ // - ALL_CAPS user macros (≥3 chars): HWY_DLLEXPORT, EIGEN_API, FMT_API, etc.
300
+ // Common in dllexport/visibility shims across header-only C++ libraries.
301
+ // Without this, `struct alignas(16) uint128_t` captured `alignas` as the
302
+ // struct name (CPP-005 failure root cause). Backtracking preserves capture of
303
+ // ALL_CAPS forward decls like `class WHEEL;`.
304
+ class: /^(?:class|struct)(?:\s+(?:alignas|__attribute__|__declspec|__forceinline)\s*\((?:[^()]|\((?:[^()]|\([^()]*\))*\))*\)|\s+\[\[[^\]]*\]\]|\s+[A-Z][A-Z0-9_]{2,})*\s+(\w+)(?:\s*:\s*(?:public|protected|private)\s+(\w+))?/,
283
305
  function: /^(?:[\w:*&<>\s]+)\s+(\w+)\s*\([^)]*\)\s*(?:const)?\s*(?:override)?\s*\{/,
284
306
  namespace: /^namespace\s+(\w+)/,
285
307
  template: /^template\s*(<[^>]+>)/,
286
308
  },
287
309
  graph: {
288
310
  entities: {
289
- class: /^(?:class|struct)\s+(\w+)/,
311
+ class: /^(?:class|struct)(?:\s+(?:alignas|__attribute__|__declspec|__forceinline)\s*\((?:[^()]|\((?:[^()]|\([^()]*\))*\))*\)|\s+\[\[[^\]]*\]\]|\s+[A-Z][A-Z0-9_]{2,})*\s+(\w+)/,
290
312
  namespace: /^namespace\s+(\w+)/,
291
313
  function: /^(?:[\w:*&<>\s]+)\s+(\w+)\s*\([^)]*\)\s*(?:const)?\s*(?:override)?\s*\{/,
292
314
  typedef: /^(?:typedef|using)\s+.+\s+(\w+)/,
@@ -310,22 +332,33 @@ export const CORE_LANGUAGES = {
310
332
  block: ["/*", "*/"],
311
333
  },
312
334
  chunker: {
313
- class: /(?:public|private|internal|protected)?\s*(?:static|sealed|abstract)?\s*(?:partial\s+)?class\s+(\w+)/,
314
- interface: /(?:public|internal)?\s*interface\s+(\w+)/,
315
- method: /(?:public|private|protected|internal)\s+(?:static\s+)?(?:async\s+)?(?:[\w<>\[\]?]+)\s+(\w+)\s*\(/,
316
- property: /(?:public|private|protected|internal)\s+(?:[\w<>\[\]?]+)\s+(\w+)\s*\{/,
317
- enum: /(?:public|internal)?\s*enum\s+(\w+)/,
318
- struct: /(?:public|internal)?\s*(?:readonly\s+)?struct\s+(\w+)/,
335
+ class: /^\s*(?:public|private|internal|protected)?\s*(?:static|sealed|abstract)?\s*(?:partial\s+)?class\s+(\w+)/,
336
+ interface: /^\s*(?:public|internal)?\s*interface\s+(\w+)/,
337
+ // Two alternation branches:
338
+ // (a) visibility-prefixed: loose return type — covers `public Task<int> Foo(...)`
339
+ // and `internal override void Bar(...)`.
340
+ // (b) visibility-less: STRICT return type (C# primitives + Task/ValueTask/
341
+ // IEnumerable/IAsyncEnumerable + capitalized identifiers) — covers
342
+ // default-private `void Foo<T>(...)` / `async Task Bar<T>(...)` patterns
343
+ // shipped widely in real codebases (e.g. Garnet's AsyncProcessor.cs
344
+ // partial-class shard). Strict return-type list prevents false positives
345
+ // like `return Foo()` and `if (cond)` (those starting tokens not in list).
346
+ // Also adds `(?:<...>)?` between method name and `(` so generic-method declarations
347
+ // `void Foo<T>(...)` get matched (previously rejected the `<T>` between name and `(`).
348
+ method: /^\s*(?:(?:public|private|protected|internal)\s+(?:static\s+)?(?:async\s+)?(?:override\s+)?(?:virtual\s+)?(?:[\w<>\[\]?]+)|(?:(?:static|async|override|virtual|sealed|new)\s+)*(?:void|bool|byte|sbyte|short|ushort|int|uint|long|ulong|float|double|decimal|char|string|object|Task(?:<[^>]+>)?|ValueTask(?:<[^>]+>)?|IEnumerable(?:<[^>]+>)?|IAsyncEnumerable(?:<[^>]+>)?|[A-Z][\w<>?\[\],]*))\s+(\w+)(?:<[\w\s,?<>]+>)?\s*\(/,
349
+ property: /^\s*(?:public|private|protected|internal)\s+(?:[\w<>\[\]?]+)\s+(\w+)\s*\{/,
350
+ enum: /^\s*(?:public|internal)?\s*enum\s+(\w+)/,
351
+ struct: /^\s*(?:public|internal)?\s*(?:readonly\s+)?struct\s+(\w+)/,
319
352
  },
320
353
  graph: {
321
354
  entities: {
322
- class: /(?:public|private|internal|protected)?\s*(?:static|sealed|abstract)?\s*(?:partial\s+)?class\s+(\w+)(?:\s*:\s*([\w,\s<>]+))?/,
323
- interface: /(?:public|internal)?\s*interface\s+(\w+)(?:\s*:\s*([\w,\s<>]+))?/,
324
- enum: /(?:public|internal)?\s*enum\s+(\w+)/,
325
- struct: /(?:public|internal)?\s*(?:readonly\s+)?struct\s+(\w+)/,
326
- method: /(?:public|private|protected|internal)\s+(?:static\s+)?(?:async\s+)?(?:override\s+)?(?:virtual\s+)?(?:[\w<>\[\]?]+)\s+(\w+)\s*\(([^)]*)\)/,
327
- property: /(?:public|private|protected|internal)\s+(?:static\s+)?(?:[\w<>\[\]?]+)\s+(\w+)\s*\{/,
328
- field: /(?:public|private|protected|internal)\s+(?:static\s+)?(?:readonly\s+)?(?:[\w<>\[\]?]+)\s+(\w+)\s*[;=]/,
355
+ class: /^\s*(?:public|private|internal|protected)?\s*(?:static|sealed|abstract)?\s*(?:partial\s+)?class\s+(\w+)(?:\s*:\s*([\w,\s<>]+))?/,
356
+ interface: /^\s*(?:public|internal)?\s*interface\s+(\w+)(?:\s*:\s*([\w,\s<>]+))?/,
357
+ enum: /^\s*(?:public|internal)?\s*enum\s+(\w+)/,
358
+ struct: /^\s*(?:public|internal)?\s*(?:readonly\s+)?struct\s+(\w+)/,
359
+ method: /^\s*(?:public|private|protected|internal)\s+(?:static\s+)?(?:async\s+)?(?:override\s+)?(?:virtual\s+)?(?:[\w<>\[\]?]+)\s+(\w+)\s*\(([^)]*)\)/,
360
+ property: /^\s*(?:public|private|protected|internal)\s+(?:static\s+)?(?:[\w<>\[\]?]+)\s+(\w+)\s*\{/,
361
+ field: /^\s*(?:public|private|protected|internal)\s+(?:static\s+)?(?:readonly\s+)?(?:[\w<>\[\]?]+)\s+(\w+)\s*[;=]/,
329
362
  },
330
363
  relationships: {
331
364
  using: /^using\s+([\w.]+)\s*;/,
@@ -338,4 +371,10 @@ export const CORE_LANGUAGES = {
338
371
  },
339
372
  };
340
373
 
374
+ // .tsx files share TypeScript chunker/graph regex; only the
375
+ // tree-sitter WASM grammar differs (see GRAMMAR_MAP in tree-sitter-provider.js).
376
+ // Aliasing the same object means chunker patterns, graph patterns, and
377
+ // skipCallObjects stay byte-identical to typescript without duplication.
378
+ CORE_LANGUAGES.tsx = CORE_LANGUAGES.typescript;
379
+
341
380
  export default CORE_LANGUAGES;
@@ -72,14 +72,21 @@ export const OBJECT_ORIENTED_LANGUAGES = {
72
72
  block: ["=begin", "=end"],
73
73
  },
74
74
  chunker: {
75
- class: /^class\s+(\w+)(?:\s*<\s*(\w+))?/,
76
- module: /^module\s+(\w+)/,
75
+ // Allow leading whitespace so indented `class Foo` declarations
76
+ // nested inside a module (the dominant Ruby idiom) actually match.
77
+ // The previous `^class` anchor missed every class inside a module
78
+ // wrapper (e.g. `class IndifferentHash < Hash` inside `module Sinatra`).
79
+ // Superclass can be any expression (`Rack::Request`, `Struct.new(:app)`)
80
+ // — `_matchBoundary` only consumes the first capture, so allowing any
81
+ // tail after the class name keeps the inheritance form parsable.
82
+ class: /^\s*class\s+(\w+)/,
83
+ module: /^\s*module\s+(\w+)/,
77
84
  method: /^\s*def\s+(\w+[?!=]?)/,
78
85
  },
79
86
  graph: {
80
87
  entities: {
81
- class: /^class\s+(\w+)(?:\s*<\s*(\w+))?/,
82
- module: /^module\s+(\w+)/,
88
+ class: /^\s*class\s+(\w+)/,
89
+ module: /^\s*module\s+(\w+)/,
83
90
  method: /^\s*def\s+(\w+[?!=]?)\s*(?:\(([^)]*)\))?/,
84
91
  },
85
92
  relationships: {
@@ -87,7 +94,7 @@ export const OBJECT_ORIENTED_LANGUAGES = {
87
94
  include: /^\s*include\s+(\w+)/,
88
95
  extend: /^\s*extend\s+(\w+)/,
89
96
  prepend: /^\s*prepend\s+(\w+)/,
90
- inherit: /^class\s+\w+\s*<\s*(\w+)/,
97
+ inherit: /^\s*class\s+\w+\s*<\s*([\w:]+)/,
91
98
  methodCall: /(\w+)\s*\.\s*(\w+)\s*[(!]/,
92
99
  },
93
100
  skipCallObjects: ["puts", "print", "p", "raise", "require", "attr_accessor", "attr_reader", "attr_writer"],
@@ -37,6 +37,74 @@ export function getLanguageByExtension(ext) {
37
37
  return { id, ...lang };
38
38
  }
39
39
 
40
+ /**
41
+ * Tokens that can appear in C++ source but NOT in valid C source outside
42
+ * of string literals or comments. Used by resolveLanguage() to disambiguate
43
+ * `.h` files between C and C++.
44
+ *
45
+ * template< — C has no templates (C11+ has `_Generic`, syntactically distinct).
46
+ * namespace IDENT — C has no namespaces (token reserved in C++).
47
+ * class IDENT[:{] — `class IDENT { ... }` and `class IDENT : base` are C++ syntax;
48
+ * `int class;` (using `class` as a C field name) does NOT match.
49
+ * decltype( — C++11 type-deduction; not in any C standard.
50
+ * {private|public|protected}: — access specifiers, C++ only (valid inside class/struct bodies). They
51
+ * could appear as goto labels in C, but doing so is exceptionally
52
+ * rare and the cost of a false positive is "parse a C header with
53
+ * tree-sitter-cpp" which is mostly a superset of tree-sitter-c.
54
+ * IDENT::IDENT — scope-resolution operator, C++ only.
55
+ *
56
+ * Strings/comments containing these tokens can produce false positives. The cost is
57
+ * one mis-routed header parsed by tree-sitter-cpp, which still produces reasonable
58
+ * results since tree-sitter-cpp is a near-superset of tree-sitter-c. This is the
59
+ * same disambiguation strategy used by GitHub Linguist for `.h`.
60
+ */
61
+ const HEADER_CPP_DISAMBIGUATOR = /\btemplate\s*<|\bnamespace\s+[A-Za-z_]|\bclass\s+[A-Za-z_]\w*\s*[:{]|\bdecltype\s*\(|\b(?:private|public|protected)\s*:|[A-Za-z_]\w*\s*::\s*[A-Za-z_]/;
62
+
63
+ // Number of leading characters scanned for C++ disambiguator tokens. Real-world
64
+ // C++ headers have at least one telltale token within the first ~1KB (include
65
+ // guards + namespace/template/class). 2KB gives generous margin without making
66
+ // the scan a hot-path cost.
67
+ const HEADER_DISAMBIGUATOR_SCAN_BYTES = 2048;
68
+
69
+ /**
70
+ * Resolve the language for a file, using file content to disambiguate
71
+ * ambiguous extensions (today: `.h` for C vs C++).
72
+ *
73
+ * The default `.h → c` mapping in EXTENSION_MAP is incorrect for header-
74
+ * only C++ libraries (highway, Eigen, fmt, abseil-cpp, range-v3, …)
75
+ * where the implementation lives in `.h` files. When `.h` is parsed by
76
+ * tree-sitter-c, C++ keywords (alignas, namespace, template, decltype)
77
+ * are misidentified, producing phantom symbols and oversized macro-cluster
78
+ * chunks that pollute retrieval.
79
+ *
80
+ * Strategy: per-file content scan. If a `.h` file contains any token that
81
+ * cannot appear in valid C outside strings/comments (template<, namespace,
82
+ * class IDENT[:{], decltype(, access specifiers, IDENT::IDENT), route to
83
+ * cpp; otherwise keep the default routing. No project-level state, no
84
+ * cross-file leakage; per-file decision is locally explainable.
85
+ *
86
+ * @param {string} filePath - File path
87
+ * @param {string} [content] - File content (optional; required for `.h` disambiguation)
88
+ * @returns {{ id: string, ...config } | null}
89
+ */
90
+ export function resolveLanguage(filePath, content) {
91
+ const langInfo = getLanguageByPath(filePath);
92
+ if (!langInfo) return null;
93
+ if (langInfo.id !== 'c') return langInfo;
94
+ // Only attempt content disambiguation for `.h` (the ambiguous extension).
95
+ // `.c` files are unambiguous C; we won't override them based on content.
96
+ const ext = path.extname(filePath).toLowerCase();
97
+ if (ext !== '.h') return langInfo;
98
+ if (typeof content !== 'string' || content.length === 0) return langInfo;
99
+ const probe = content.length > HEADER_DISAMBIGUATOR_SCAN_BYTES
100
+ ? content.slice(0, HEADER_DISAMBIGUATOR_SCAN_BYTES)
101
+ : content;
102
+ if (HEADER_CPP_DISAMBIGUATOR.test(probe)) {
103
+ return getLanguageByExtension('.cpp');
104
+ }
105
+ return langInfo;
106
+ }
107
+
40
108
  /**
41
109
  * Get language config by file path (handles both extension and filename).
42
110
  * @param {string} filePath - File path
@@ -132,6 +200,7 @@ export default {
132
200
  LANGUAGES,
133
201
  getLanguageByExtension,
134
202
  getLanguageByPath,
203
+ resolveLanguage,
135
204
  getChunkerPatterns,
136
205
  getGraphPatterns,
137
206
  getLanguageMeta,
@@ -86,6 +86,10 @@ export const MODEL_REGISTRY = {
86
86
  hfId: 'nomic-ai/CodeRankEmbed',
87
87
  profile: 'full',
88
88
  description: 'Local embedding model (FP32 safetensors, 768d) for native inference',
89
+ // Loaded only by the candle/native inference path, which is armed
90
+ // exclusively for accelerated indexing (Metal / CoreML cascade / CUDA).
91
+ // CPU-only hosts index with ORT INT8 and never load this — init skips it.
92
+ nativeAccelerated: true,
89
93
  files: [
90
94
  { path: 'model.safetensors', sizeBytes: 546938168, sha256: '827529bcd58aef0d9082e66eeff7e7d53a02f62bd005f841a26b3d3e2fb17ebe' },
91
95
  { path: 'config.json', sizeBytes: 1525, sha256: null },
@@ -96,6 +100,8 @@ export const MODEL_REGISTRY = {
96
100
  hfId: 'lightonai/LateOn-Code',
97
101
  profile: 'full',
98
102
  description: 'Late interaction model (FP32 safetensors, backbone 768d) for native inference',
103
+ // Native-accelerated only — see coderankembed-fp32 above.
104
+ nativeAccelerated: true,
99
105
  files: [
100
106
  { path: 'model.safetensors', sizeBytes: 596076280, sha256: '45c40bb4ba6b45f0c66b2deb3d27dd06efc3af23c78c8093b8cad2af61c683b2' },
101
107
  { path: '1_Dense/model.safetensors', sizeBytes: 393304, sha256: '22ea6a53cad3ed034934b5db7a214a0bcc28ff4cc440babea44029989e4bbcca' },
@@ -107,6 +113,8 @@ export const MODEL_REGISTRY = {
107
113
  hfId: 'lightonai/LateOn-Code-edge',
108
114
  profile: 'full',
109
115
  description: 'Late interaction edge model (FP32 safetensors, backbone 256d, 2-stage projection) for native inference',
116
+ // Native-accelerated only — see coderankembed-fp32 above.
117
+ nativeAccelerated: true,
110
118
  files: [
111
119
  { path: 'model.safetensors', sizeBytes: 67195976, sha256: '7ffc36b8ff71367249cd5220dbdd4bdbe177bc0e305b2e978a8b598bd8296f04' },
112
120
  { path: '1_Dense/model.safetensors', sizeBytes: 524376, sha256: '9efb17fcb2106cd8fcb01d57a9cd9c997a487ad20630ec8e44ce3f9d89efe0a7' },
@@ -160,6 +168,18 @@ export function getModelEntry(key) {
160
168
  return MODEL_REGISTRY[key] || null;
161
169
  }
162
170
 
171
+ /**
172
+ * Whether a model is a native-accelerated FP32 artifact (safetensors loaded
173
+ * by the candle/native inference path). These are only used for accelerated
174
+ * indexing on Metal / CoreML cascade / CUDA hosts; a CPU-only host indexes
175
+ * with ORT INT8 and never loads them, so init skips them by default (~1.2 GB
176
+ * of downloads avoided). Marked with `nativeAccelerated: true` in the
177
+ * registry entry. Returns false for unknown keys.
178
+ */
179
+ export function isNativeAcceleratedModel(key) {
180
+ return Boolean(MODEL_REGISTRY[key]?.nativeAccelerated);
181
+ }
182
+
163
183
  /**
164
184
  * Truthy env-flag parser. Matches the conventions used across the codebase:
165
185
  * "1" / "true" / "on" / "yes" (case-insensitive) → true
@@ -48,8 +48,7 @@
48
48
 
49
49
  import { existsSync } from 'fs';
50
50
  import { join } from 'path';
51
- import { createRequire } from 'module';
52
- import { resolveNativeAddon } from './native-resolver.js';
51
+ import { loadNativeAddon } from './native-resolver.js';
53
52
  import { createTokenizer } from './native-tokenizer.js';
54
53
  import { getModelCacheDir, fetchModel } from './model-fetcher.js';
55
54
  import { getModelEntry } from './model-registry.js';
@@ -57,8 +56,6 @@ import { getCoremlCascadeResolvedDirs } from './coreml-cascade.js';
57
56
  import { detectHardwareCapability } from './hardware-capability.js';
58
57
  import { LATE_INTERACTION_CONFIG } from './config/ranking.js';
59
58
 
60
- const require = createRequire(import.meta.url);
61
-
62
59
  // ─── State ───
63
60
 
64
61
  let _addon = null;
@@ -173,14 +170,12 @@ function resolveCoremlCascadeForAddon() {
173
170
 
174
171
  function loadAddon() {
175
172
  if (_addon) return _addon;
176
- const addonPath = resolveNativeAddon();
177
- if (!addonPath) return null;
178
- try {
179
- _addon = require(addonPath);
180
- return _addon;
181
- } catch {
182
- return null;
183
- }
173
+ // CUDA-preferred with CPU fallback (see loadNativeAddon): a CUDA addon that
174
+ // can't load on a no-GPU box falls back to the plain CPU addon, so native
175
+ // inference degrades to the CPU (ORT-INT8) path instead of failing.
176
+ const res = loadNativeAddon();
177
+ _addon = res ? res.mod : null;
178
+ return _addon;
184
179
  }
185
180
 
186
181
  // ─── Detection ───
@@ -80,61 +80,76 @@ function defaultPackageDirResolver(packageName) {
80
80
  }
81
81
 
82
82
  /**
83
- * Resolve the path to the native MaxSim .node addon, or null.
83
+ * All native .node addon candidate paths that EXIST on disk, in preference
84
+ * order: local dev build → local package template (CUDA, then CPU) →
85
+ * installed npm package (CUDA, then CPU).
84
86
  *
85
- * On Linux, the `-cuda` variant of the platform package is preferred when
86
- * installed it contains a binary built with the `cuda` Cargo feature
87
- * that will dispatch embedding + LI work to candle-cuda when libcuda.so
88
- * is present at runtime. Absence of the `-cuda` package or libcuda.so
89
- * cleanly falls back to the standard CPU variant.
87
+ * On Linux the `-cuda` variant is PREFERRED but a CUDA-built addon hard-links
88
+ * libcuda/libcudart/libcublas, so `require()`-ing it on a host without those
89
+ * libraries (any CPU-only box) THROWS. Returning an ordered candidate list
90
+ * rather than a single path lets the loader (`loadNativeAddon`) try CUDA
91
+ * first and transparently FALL BACK to the plain CPU addon (→ ORT-INT8 path)
92
+ * when CUDA can't load. This is what keeps GPU acceleration for CUDA hosts
93
+ * while a no-GPU `npm i` (which still auto-installs the optional -cuda package,
94
+ * matching os/cpu/libc) does not break indexing.
90
95
  */
91
- export function resolveNativeAddon(options = {}) {
96
+ export function resolveNativeAddonCandidates(options = {}) {
92
97
  const info = getPlatformInfo();
93
- if (!info) return null;
98
+ if (!info) return [];
94
99
  const { platform, arch, libc, cudaPackageName } = info;
95
100
  // napi-rs --platform output includes the libc suffix on Linux
96
- // (e.g. `sweet-search-native.linux-x64-gnu.node`). macOS has no
97
- // libc suffix so `${libc}` is '' there, yielding `.darwin-arm64.node`.
98
- // Historic bug: this constructed `${platform}-${arch}` only, which
99
- // worked on darwin but silently missed the Linux build output.
101
+ // (e.g. `sweet-search-native.linux-x64-gnu.node`). macOS has no libc suffix.
100
102
  const binaryName = `sweet-search-native.${platform}-${arch}${libc}.node`;
101
103
  const exists = options.existsSync ?? existsSync;
102
104
  const rootDir = options.rootDir ?? root;
103
105
  const resolvePackageDir = options.resolvePackageDir ?? defaultPackageDirResolver;
104
106
 
105
- // 1. Local dev: crates/sweet-search-native/ directory (or legacy native-maxsim/)
106
- const localDev = join(rootDir, 'crates', 'sweet-search-native', binaryName);
107
- if (exists(localDev)) return localDev;
108
- const legacyDev = join(rootDir, 'native-maxsim', binaryName);
109
- if (exists(legacyDev)) return legacyDev;
107
+ const out = [];
108
+ const add = (p) => { if (p && exists(p) && !out.includes(p)) out.push(p); };
110
109
 
111
- // 2. Local package template: packages/native-*-cuda/ preferred, then packages/native-*/
110
+ // 1. Local dev build (crates/sweet-search-native/ or legacy native-maxsim/).
111
+ add(join(rootDir, 'crates', 'sweet-search-native', binaryName));
112
+ add(join(rootDir, 'native-maxsim', binaryName));
113
+ // 2. Local package template — CUDA preferred, then CPU.
114
+ if (cudaPackageName) add(join(rootDir, 'packages', `native-${platform}-${arch}${libc}-cuda`, 'sweet-search-native.node'));
115
+ add(join(rootDir, 'packages', `native-${platform}-${arch}${libc}`, 'sweet-search-native.node'));
116
+ // 3. Installed npm package — CUDA preferred, then CPU.
112
117
  if (cudaPackageName) {
113
- const cudaLocalPkg = join(rootDir, 'packages', `native-${platform}-${arch}${libc}-cuda`, 'sweet-search-native.node');
114
- if (exists(cudaLocalPkg)) return cudaLocalPkg;
118
+ try { add(join(resolvePackageDir(cudaPackageName), 'sweet-search-native.node')); }
119
+ catch { /* -cuda package not installed */ }
115
120
  }
116
- const pkgDir = `native-${platform}-${arch}${libc}`;
117
- const localPkg = join(rootDir, 'packages', pkgDir, 'sweet-search-native.node');
118
- if (exists(localPkg)) return localPkg;
121
+ try { add(join(resolvePackageDir(getPlatformPackageName()), 'sweet-search-native.node')); }
122
+ catch { /* package not installed */ }
119
123
 
120
- // 3. Installed npm package — CUDA variant preferred on Linux.
121
- if (cudaPackageName) {
124
+ return out;
125
+ }
126
+
127
+ /**
128
+ * Resolve the single highest-preference native .node addon path, or null.
129
+ * Back-compat shim over `resolveNativeAddonCandidates` (returns the first,
130
+ * i.e. CUDA-preferred). Callers that need the CUDA→CPU load fallback should
131
+ * use `loadNativeAddon` instead of require()-ing this path directly.
132
+ */
133
+ export function resolveNativeAddon(options = {}) {
134
+ return resolveNativeAddonCandidates(options)[0] ?? null;
135
+ }
136
+
137
+ /**
138
+ * require() the first native-addon candidate that loads successfully and
139
+ * satisfies `validate(mod)` (default: any). Candidates are tried CUDA-first,
140
+ * CPU-second, so a host whose CUDA addon throws on load (libcuda absent)
141
+ * transparently falls back to the CPU addon. Returns `{ mod, path }` or null.
142
+ */
143
+ export function loadNativeAddon({ validate, requireFn, ...options } = {}) {
144
+ const load = requireFn ?? require; // requireFn is a test seam
145
+ for (const candidatePath of resolveNativeAddonCandidates(options)) {
122
146
  try {
123
- const cudaNpmPkgDir = resolvePackageDir(cudaPackageName);
124
- const cudaNpmAddon = join(cudaNpmPkgDir, 'sweet-search-native.node');
125
- if (exists(cudaNpmAddon)) return cudaNpmAddon;
147
+ const mod = load(candidatePath);
148
+ if (!validate || validate(mod)) return { mod, path: candidatePath };
126
149
  } catch {
127
- // -cuda package not installed fall through to standard variant.
150
+ // Candidate failed to load (e.g. CUDA addon without libcuda) — try next.
128
151
  }
129
152
  }
130
- try {
131
- const npmPkgDir = resolvePackageDir(getPlatformPackageName());
132
- const npmAddon = join(npmPkgDir, 'sweet-search-native.node');
133
- if (exists(npmAddon)) return npmAddon;
134
- } catch {
135
- // Package not installed
136
- }
137
-
138
153
  return null;
139
154
  }
140
155