sweet-search 2.5.2 → 2.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/cli.js +24 -3
- package/core/graph/graph-expansion.js +215 -36
- package/core/graph/graph-extractor.js +196 -11
- package/core/graph/graph-search.js +395 -92
- package/core/graph/hcgs-generator.js +2 -1
- package/core/graph/index.js +2 -0
- package/core/graph/repo-map.js +28 -6
- package/core/graph/structural-answer-cues.js +168 -0
- package/core/graph/structural-callsite-hints.js +40 -0
- package/core/graph/structural-context-format.js +40 -0
- package/core/graph/structural-context.js +450 -0
- package/core/graph/structural-forward-push.js +156 -0
- package/core/graph/structural-header-context.js +19 -0
- package/core/graph/structural-importance.js +148 -0
- package/core/graph/structural-pagerank.js +197 -0
- package/core/graph/summary-manager.js +13 -9
- package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
- package/core/incremental-indexing/application/file-watcher.mjs +197 -0
- package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
- package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
- package/core/incremental-indexing/application/operator-cli.mjs +554 -0
- package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
- package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
- package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
- package/core/incremental-indexing/application/reconciler.mjs +477 -0
- package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
- package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
- package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
- package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
- package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
- package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
- package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
- package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
- package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
- package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
- package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
- package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
- package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
- package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
- package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
- package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
- package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
- package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
- package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
- package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
- package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
- package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
- package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
- package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
- package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
- package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
- package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
- package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
- package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
- package/core/indexing/admission-policy.js +139 -0
- package/core/indexing/artifact-builder.js +29 -12
- package/core/indexing/ast-chunker.js +107 -30
- package/core/indexing/dedup/exemplar-selector.js +19 -1
- package/core/indexing/gitignore-filter.js +223 -0
- package/core/indexing/incremental-tracker.js +99 -30
- package/core/indexing/index-codebase-v21.js +6 -5
- package/core/indexing/index-maintainer.mjs +698 -6
- package/core/indexing/indexer-ann.js +99 -15
- package/core/indexing/indexer-build.js +158 -45
- package/core/indexing/indexer-empty-baseline.js +80 -0
- package/core/indexing/indexer-manifest.js +66 -0
- package/core/indexing/indexer-phases.js +56 -23
- package/core/indexing/indexer-sparse-gram.js +54 -13
- package/core/indexing/indexer-utils.js +26 -208
- package/core/indexing/indexing-file-policy.js +32 -7
- package/core/indexing/maintainer-launcher.mjs +137 -0
- package/core/indexing/merkle-tracker.js +251 -244
- package/core/indexing/model-pool.js +46 -5
- package/core/infrastructure/code-graph-repository.js +758 -6
- package/core/infrastructure/code-graph-visibility.js +157 -0
- package/core/infrastructure/codebase-repository.js +100 -13
- package/core/infrastructure/config/search.js +1 -1
- package/core/infrastructure/db-utils.js +118 -0
- package/core/infrastructure/dedup-hashing.js +10 -13
- package/core/infrastructure/hardware-capability.js +17 -7
- package/core/infrastructure/index.js +8 -2
- package/core/infrastructure/language-patterns/maps.js +4 -1
- package/core/infrastructure/language-patterns/registry-core.js +56 -17
- package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
- package/core/infrastructure/language-patterns.js +69 -0
- package/core/infrastructure/model-registry.js +20 -0
- package/core/infrastructure/native-inference.js +7 -12
- package/core/infrastructure/native-resolver.js +52 -37
- package/core/infrastructure/native-sparse-gram.js +261 -20
- package/core/infrastructure/native-tokenizer.js +6 -15
- package/core/infrastructure/simd-distance.js +10 -16
- package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
- package/core/infrastructure/structural-alias-resolver.js +122 -0
- package/core/infrastructure/structural-candidate-ranker.js +34 -0
- package/core/infrastructure/structural-context-repository.js +472 -0
- package/core/infrastructure/structural-context-utils.js +51 -0
- package/core/infrastructure/structural-graph-signals.js +121 -0
- package/core/infrastructure/structural-qualified-resolution.js +15 -0
- package/core/infrastructure/structural-source-definitions.js +100 -0
- package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
- package/core/infrastructure/tree-sitter-provider.js +811 -37
- package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
- package/core/query/query-router.js +55 -5
- package/core/ranking/file-kind-ranking.js +2192 -15
- package/core/ranking/late-interaction-index.js +87 -12
- package/core/search/cli-decoration.js +290 -0
- package/core/search/context-expander.js +988 -78
- package/core/search/index.js +1 -0
- package/core/search/output-policy.js +275 -0
- package/core/search/search-anchor.js +499 -0
- package/core/search/search-boost.js +93 -1
- package/core/search/search-cli.js +61 -204
- package/core/search/search-hybrid.js +250 -10
- package/core/search/search-pattern-chunks.js +57 -8
- package/core/search/search-pattern-planner.js +68 -9
- package/core/search/search-pattern-prefilter.js +30 -10
- package/core/search/search-pattern-ripgrep.js +40 -4
- package/core/search/search-pattern-sparse-overlay.js +256 -0
- package/core/search/search-pattern.js +117 -29
- package/core/search/search-postprocess.js +479 -5
- package/core/search/search-read-semantic.js +260 -23
- package/core/search/search-read.js +82 -64
- package/core/search/search-reader-pin.js +71 -0
- package/core/search/search-rrf.js +279 -0
- package/core/search/search-semantic.js +110 -5
- package/core/search/search-server.js +130 -57
- package/core/search/search-trace.js +107 -0
- package/core/search/server-identity.js +93 -0
- package/core/search/session-daemon-prewarm.mjs +33 -10
- package/core/search/sweet-search.js +399 -7
- package/core/skills/sweet-index/SKILL.md +8 -6
- package/core/vector-store/binary-hnsw-index.js +194 -30
- package/core/vector-store/float-vector-store.js +96 -6
- package/core/vector-store/hnsw-index.js +220 -49
- package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
- package/eval/agent-read-workflows/bin/ss-find +15 -0
- package/eval/agent-read-workflows/bin/ss-grep +12 -0
- package/eval/agent-read-workflows/bin/ss-read +14 -0
- package/eval/agent-read-workflows/bin/ss-search +18 -0
- package/eval/agent-read-workflows/bin/ss-semantic +12 -0
- package/eval/agent-read-workflows/bin/ss-trace +11 -0
- package/mcp/read-tool.js +109 -0
- package/mcp/server.js +55 -15
- package/mcp/tool-handlers.js +14 -124
- package/mcp/trace-tool.js +81 -0
- package/package.json +25 -10
- package/scripts/hooks/intercept-read.mjs +55 -0
- package/scripts/hooks/remind-tools.mjs +40 -0
- package/scripts/init.js +698 -54
- package/scripts/inject-agent-instructions.js +431 -0
- package/scripts/install-prompt-reminders.js +188 -0
- package/scripts/install-tool-enforcement.js +220 -0
- package/scripts/smoke-test.js +12 -9
- package/scripts/uninstall.js +276 -18
- package/scripts/write-claude-rules.js +110 -0
|
@@ -221,6 +221,7 @@ export const CORE_LANGUAGES = {
|
|
|
221
221
|
enum: /^(?:pub\s+)?enum\s+(\w+)/,
|
|
222
222
|
trait: /^(?:pub\s+)?trait\s+(\w+)/,
|
|
223
223
|
impl: /^impl(?:<[^>]+>)?\s+(?:\w+\s+for\s+)?(\w+)/,
|
|
224
|
+
macro: /^macro_rules!\s+(\w+)/,
|
|
224
225
|
},
|
|
225
226
|
graph: {
|
|
226
227
|
entities: {
|
|
@@ -232,6 +233,7 @@ export const CORE_LANGUAGES = {
|
|
|
232
233
|
type: /^(?:pub\s+)?type\s+(\w+)/,
|
|
233
234
|
const: /^(?:pub\s+)?const\s+(\w+)\s*:/,
|
|
234
235
|
static: /^(?:pub\s+)?static\s+(\w+)\s*:/,
|
|
236
|
+
macro: /^macro_rules!\s+(\w+)/,
|
|
235
237
|
},
|
|
236
238
|
relationships: {
|
|
237
239
|
use: /^use\s+([\w:]+)(?:::\{([^}]+)\})?/,
|
|
@@ -252,13 +254,21 @@ export const CORE_LANGUAGES = {
|
|
|
252
254
|
},
|
|
253
255
|
chunker: {
|
|
254
256
|
function: /^(?:[\w*\s]+)\s+(\w+)\s*\([^)]*\)\s*\{/,
|
|
255
|
-
struct:
|
|
257
|
+
// Skip attribute-like prefixes between `struct` and the type name:
|
|
258
|
+
// - function-like attrs: __attribute__((...)), __declspec(...), _Alignas(N), __forceinline(...), __inline__(...)
|
|
259
|
+
// Paren matcher allows up to 3 nesting levels — __attribute__ standardly takes
|
|
260
|
+
// a doubly-parenthesized arg, optionally containing call-form attrs like aligned(8).
|
|
261
|
+
// - C++11 attribute syntax: [[ ... ]]
|
|
262
|
+
// - ALL_CAPS user macros (≥3 chars): PACKED_API, HWY_DLLEXPORT, EIGEN_API, etc.
|
|
263
|
+
// Without this, `struct __attribute__((packed)) Foo` captured `__attribute__` as the name.
|
|
264
|
+
// Backtracking handles ALL_CAPS-named forward decls (`struct WHEEL;`).
|
|
265
|
+
struct: /^(?:typedef\s+)?struct(?:\s+(?:__attribute__|__declspec|_Alignas|__forceinline|__inline__)\s*\((?:[^()]|\((?:[^()]|\([^()]*\))*\))*\)|\s+\[\[[^\]]*\]\]|\s+[A-Z][A-Z0-9_]{2,})*\s+(\w+)/,
|
|
256
266
|
enum: /^(?:typedef\s+)?enum\s+(\w+)/,
|
|
257
267
|
},
|
|
258
268
|
graph: {
|
|
259
269
|
entities: {
|
|
260
270
|
function: /^(?:static\s+)?(?:inline\s+)?(?:[\w*]+\s+)+(\w+)\s*\([^)]*\)\s*\{/,
|
|
261
|
-
struct: /^(?:typedef\s+)?struct\s+(\w+)/,
|
|
271
|
+
struct: /^(?:typedef\s+)?struct(?:\s+(?:__attribute__|__declspec|_Alignas|__forceinline|__inline__)\s*\((?:[^()]|\((?:[^()]|\([^()]*\))*\))*\)|\s+\[\[[^\]]*\]\]|\s+[A-Z][A-Z0-9_]{2,})*\s+(\w+)/,
|
|
262
272
|
enum: /^(?:typedef\s+)?enum\s+(\w+)/,
|
|
263
273
|
typedef: /^typedef\s+.+\s+(\w+)\s*;/,
|
|
264
274
|
macro: /^#define\s+(\w+)/,
|
|
@@ -279,14 +289,26 @@ export const CORE_LANGUAGES = {
|
|
|
279
289
|
block: ["/*", "*/"],
|
|
280
290
|
},
|
|
281
291
|
chunker: {
|
|
282
|
-
|
|
292
|
+
// Skip attribute-like prefixes between `class`/`struct` and the type name.
|
|
293
|
+
// Closed list of standard attributes:
|
|
294
|
+
// - function-like: alignas(N), __attribute__((...)), __declspec(...),
|
|
295
|
+
// __forceinline(...). Paren matcher allows up to 3 nesting levels —
|
|
296
|
+
// __attribute__ standardly takes a doubly-parenthesized arg, optionally
|
|
297
|
+
// containing call-form attrs like aligned(8).
|
|
298
|
+
// - C++11 attribute syntax: [[ ... ]]
|
|
299
|
+
// - ALL_CAPS user macros (≥3 chars): HWY_DLLEXPORT, EIGEN_API, FMT_API, etc.
|
|
300
|
+
// Common in dllexport/visibility shims across header-only C++ libraries.
|
|
301
|
+
// Without this, `struct alignas(16) uint128_t` captured `alignas` as the
|
|
302
|
+
// struct name (CPP-005 failure root cause). Backtracking preserves capture of
|
|
303
|
+
// ALL_CAPS forward decls like `class WHEEL;`.
|
|
304
|
+
class: /^(?:class|struct)(?:\s+(?:alignas|__attribute__|__declspec|__forceinline)\s*\((?:[^()]|\((?:[^()]|\([^()]*\))*\))*\)|\s+\[\[[^\]]*\]\]|\s+[A-Z][A-Z0-9_]{2,})*\s+(\w+)(?:\s*:\s*(?:public|protected|private)\s+(\w+))?/,
|
|
283
305
|
function: /^(?:[\w:*&<>\s]+)\s+(\w+)\s*\([^)]*\)\s*(?:const)?\s*(?:override)?\s*\{/,
|
|
284
306
|
namespace: /^namespace\s+(\w+)/,
|
|
285
307
|
template: /^template\s*(<[^>]+>)/,
|
|
286
308
|
},
|
|
287
309
|
graph: {
|
|
288
310
|
entities: {
|
|
289
|
-
class: /^(?:class|struct)\s+(\w+)/,
|
|
311
|
+
class: /^(?:class|struct)(?:\s+(?:alignas|__attribute__|__declspec|__forceinline)\s*\((?:[^()]|\((?:[^()]|\([^()]*\))*\))*\)|\s+\[\[[^\]]*\]\]|\s+[A-Z][A-Z0-9_]{2,})*\s+(\w+)/,
|
|
290
312
|
namespace: /^namespace\s+(\w+)/,
|
|
291
313
|
function: /^(?:[\w:*&<>\s]+)\s+(\w+)\s*\([^)]*\)\s*(?:const)?\s*(?:override)?\s*\{/,
|
|
292
314
|
typedef: /^(?:typedef|using)\s+.+\s+(\w+)/,
|
|
@@ -310,22 +332,33 @@ export const CORE_LANGUAGES = {
|
|
|
310
332
|
block: ["/*", "*/"],
|
|
311
333
|
},
|
|
312
334
|
chunker: {
|
|
313
|
-
class:
|
|
314
|
-
interface:
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
335
|
+
class: /^\s*(?:public|private|internal|protected)?\s*(?:static|sealed|abstract)?\s*(?:partial\s+)?class\s+(\w+)/,
|
|
336
|
+
interface: /^\s*(?:public|internal)?\s*interface\s+(\w+)/,
|
|
337
|
+
// Two alternation branches:
|
|
338
|
+
// (a) visibility-prefixed: loose return type — covers `public Task<int> Foo(...)`
|
|
339
|
+
// and `internal override void Bar(...)`.
|
|
340
|
+
// (b) visibility-less: STRICT return type (C# primitives + Task/ValueTask/
|
|
341
|
+
// IEnumerable/IAsyncEnumerable + capitalized identifiers) — covers
|
|
342
|
+
// default-private `void Foo<T>(...)` / `async Task Bar<T>(...)` patterns
|
|
343
|
+
// shipped widely in real codebases (e.g. Garnet's AsyncProcessor.cs
|
|
344
|
+
// partial-class shard). Strict return-type list prevents false positives
|
|
345
|
+
// like `return Foo()` and `if (cond)` (those starting tokens not in list).
|
|
346
|
+
// Also adds `(?:<...>)?` between method name and `(` so generic-method declarations
|
|
347
|
+
// `void Foo<T>(...)` get matched (previously rejected the `<T>` between name and `(`).
|
|
348
|
+
method: /^\s*(?:(?:public|private|protected|internal)\s+(?:static\s+)?(?:async\s+)?(?:override\s+)?(?:virtual\s+)?(?:[\w<>\[\]?]+)|(?:(?:static|async|override|virtual|sealed|new)\s+)*(?:void|bool|byte|sbyte|short|ushort|int|uint|long|ulong|float|double|decimal|char|string|object|Task(?:<[^>]+>)?|ValueTask(?:<[^>]+>)?|IEnumerable(?:<[^>]+>)?|IAsyncEnumerable(?:<[^>]+>)?|[A-Z][\w<>?\[\],]*))\s+(\w+)(?:<[\w\s,?<>]+>)?\s*\(/,
|
|
349
|
+
property: /^\s*(?:public|private|protected|internal)\s+(?:[\w<>\[\]?]+)\s+(\w+)\s*\{/,
|
|
350
|
+
enum: /^\s*(?:public|internal)?\s*enum\s+(\w+)/,
|
|
351
|
+
struct: /^\s*(?:public|internal)?\s*(?:readonly\s+)?struct\s+(\w+)/,
|
|
319
352
|
},
|
|
320
353
|
graph: {
|
|
321
354
|
entities: {
|
|
322
|
-
class:
|
|
323
|
-
interface:
|
|
324
|
-
enum:
|
|
325
|
-
struct:
|
|
326
|
-
method:
|
|
327
|
-
property:
|
|
328
|
-
field:
|
|
355
|
+
class: /^\s*(?:public|private|internal|protected)?\s*(?:static|sealed|abstract)?\s*(?:partial\s+)?class\s+(\w+)(?:\s*:\s*([\w,\s<>]+))?/,
|
|
356
|
+
interface: /^\s*(?:public|internal)?\s*interface\s+(\w+)(?:\s*:\s*([\w,\s<>]+))?/,
|
|
357
|
+
enum: /^\s*(?:public|internal)?\s*enum\s+(\w+)/,
|
|
358
|
+
struct: /^\s*(?:public|internal)?\s*(?:readonly\s+)?struct\s+(\w+)/,
|
|
359
|
+
method: /^\s*(?:public|private|protected|internal)\s+(?:static\s+)?(?:async\s+)?(?:override\s+)?(?:virtual\s+)?(?:[\w<>\[\]?]+)\s+(\w+)\s*\(([^)]*)\)/,
|
|
360
|
+
property: /^\s*(?:public|private|protected|internal)\s+(?:static\s+)?(?:[\w<>\[\]?]+)\s+(\w+)\s*\{/,
|
|
361
|
+
field: /^\s*(?:public|private|protected|internal)\s+(?:static\s+)?(?:readonly\s+)?(?:[\w<>\[\]?]+)\s+(\w+)\s*[;=]/,
|
|
329
362
|
},
|
|
330
363
|
relationships: {
|
|
331
364
|
using: /^using\s+([\w.]+)\s*;/,
|
|
@@ -338,4 +371,10 @@ export const CORE_LANGUAGES = {
|
|
|
338
371
|
},
|
|
339
372
|
};
|
|
340
373
|
|
|
374
|
+
// .tsx files share TypeScript chunker/graph regex; only the
|
|
375
|
+
// tree-sitter WASM grammar differs (see GRAMMAR_MAP in tree-sitter-provider.js).
|
|
376
|
+
// Aliasing the same object means chunker patterns, graph patterns, and
|
|
377
|
+
// skipCallObjects stay byte-identical to typescript without duplication.
|
|
378
|
+
CORE_LANGUAGES.tsx = CORE_LANGUAGES.typescript;
|
|
379
|
+
|
|
341
380
|
export default CORE_LANGUAGES;
|
|
@@ -72,14 +72,21 @@ export const OBJECT_ORIENTED_LANGUAGES = {
|
|
|
72
72
|
block: ["=begin", "=end"],
|
|
73
73
|
},
|
|
74
74
|
chunker: {
|
|
75
|
-
class
|
|
76
|
-
module
|
|
75
|
+
// Allow leading whitespace so indented `class Foo` declarations
|
|
76
|
+
// nested inside a module (the dominant Ruby idiom) actually match.
|
|
77
|
+
// The previous `^class` anchor missed every class inside a module
|
|
78
|
+
// wrapper (e.g. `class IndifferentHash < Hash` inside `module Sinatra`).
|
|
79
|
+
// Superclass can be any expression (`Rack::Request`, `Struct.new(:app)`)
|
|
80
|
+
// — `_matchBoundary` only consumes the first capture, so allowing any
|
|
81
|
+
// tail after the class name keeps the inheritance form parsable.
|
|
82
|
+
class: /^\s*class\s+(\w+)/,
|
|
83
|
+
module: /^\s*module\s+(\w+)/,
|
|
77
84
|
method: /^\s*def\s+(\w+[?!=]?)/,
|
|
78
85
|
},
|
|
79
86
|
graph: {
|
|
80
87
|
entities: {
|
|
81
|
-
class:
|
|
82
|
-
module:
|
|
88
|
+
class: /^\s*class\s+(\w+)/,
|
|
89
|
+
module: /^\s*module\s+(\w+)/,
|
|
83
90
|
method: /^\s*def\s+(\w+[?!=]?)\s*(?:\(([^)]*)\))?/,
|
|
84
91
|
},
|
|
85
92
|
relationships: {
|
|
@@ -87,7 +94,7 @@ export const OBJECT_ORIENTED_LANGUAGES = {
|
|
|
87
94
|
include: /^\s*include\s+(\w+)/,
|
|
88
95
|
extend: /^\s*extend\s+(\w+)/,
|
|
89
96
|
prepend: /^\s*prepend\s+(\w+)/,
|
|
90
|
-
inherit:
|
|
97
|
+
inherit: /^\s*class\s+\w+\s*<\s*([\w:]+)/,
|
|
91
98
|
methodCall: /(\w+)\s*\.\s*(\w+)\s*[(!]/,
|
|
92
99
|
},
|
|
93
100
|
skipCallObjects: ["puts", "print", "p", "raise", "require", "attr_accessor", "attr_reader", "attr_writer"],
|
|
@@ -37,6 +37,74 @@ export function getLanguageByExtension(ext) {
|
|
|
37
37
|
return { id, ...lang };
|
|
38
38
|
}
|
|
39
39
|
|
|
40
|
+
/**
 * Tokens that can appear in C++ source but NOT in valid C source outside of
 * string literals or comments. resolveLanguage() tests this pattern against
 * the head of a `.h` file to decide between C and C++.
 *
 *   template<         — C has no templates (C11 `_Generic` is syntactically distinct).
 *   namespace IDENT   — C has no namespaces (the token is reserved in C++ only).
 *   class IDENT[:{]   — `class IDENT { ... }` and `class IDENT : base` are C++ syntax;
 *                       `int class;` (using `class` as a C identifier) does NOT match.
 *   decltype(         — C++11 type deduction; not part of any C standard.
 *   {private|public|protected}:
 *                     — C++ access specifiers. In C these could only be goto
 *                       labels, which is exceptionally rare.
 *   IDENT::IDENT      — scope-resolution operator. Note that C23 vendor-prefixed
 *                       attributes such as `[[gnu::packed]]` also match this branch.
 *
 * Strings or comments containing these tokens can produce false positives. The
 * cost is one mis-routed C header being handled with the C++ configuration,
 * which is considered acceptable because the C++ grammar is close to a superset
 * of the C grammar. The approach is comparable to the content heuristics GitHub
 * Linguist applies to `.h` files.
 */
const HEADER_CPP_DISAMBIGUATOR = /\btemplate\s*<|\bnamespace\s+[A-Za-z_]|\bclass\s+[A-Za-z_]\w*\s*[:{]|\bdecltype\s*\(|\b(?:private|public|protected)\s*:|[A-Za-z_]\w*\s*::\s*[A-Za-z_]/;

// Length of the leading slice of the file that is scanned for C++ tokens.
// Despite the name, the limit is applied with String#slice and is therefore
// measured in UTF-16 code units, not bytes. Real-world C++ headers are expected
// to show a telltale token within the first ~1KB (include guards followed by
// namespace/template/class); 2048 leaves margin while keeping the scan cheap.
const HEADER_DISAMBIGUATOR_SCAN_BYTES = 2048;

/**
 * Resolve the language for a file, using file content to disambiguate
 * ambiguous extensions (today: `.h` for C vs C++).
 *
 * The default `.h → c` mapping is wrong for header-only C++ libraries whose
 * implementation lives in `.h` files. This function therefore scans the head
 * of a `.h` file for C++-only tokens (see HEADER_CPP_DISAMBIGUATOR) and routes
 * the file to the `.cpp` language config when one is found. The decision is
 * made per file; no project-level state is consulted.
 *
 * Files that do not resolve to C, C files with an extension other than `.h`,
 * and calls without usable content all keep the path-based result unchanged.
 *
 * @param {string} filePath - File path
 * @param {string} [content] - File content (optional; required for `.h` disambiguation)
 * @returns {{ id: string } | null} Language config (with `id`), or null when
 *   the path maps to no known language.
 */
export function resolveLanguage(filePath, content) {
  const langInfo = getLanguageByPath(filePath);
  if (!langInfo) return null;

  // Only C is ambiguous, and only through the `.h` extension: `.c` files are
  // never overridden based on content.
  if (langInfo.id !== 'c') return langInfo;
  if (path.extname(filePath).toLowerCase() !== '.h') return langInfo;

  // Without content there is nothing to inspect; keep the default routing.
  if (typeof content !== 'string' || content.length === 0) return langInfo;

  // slice() clamps to the string length, so short files are scanned whole.
  const probe = content.slice(0, HEADER_DISAMBIGUATOR_SCAN_BYTES);
  if (!HEADER_CPP_DISAMBIGUATOR.test(probe)) return langInfo;

  // If the C++ config cannot be resolved, keep the C config rather than
  // reporting a known header as having no language at all.
  return getLanguageByExtension('.cpp') ?? langInfo;
}
|
|
107
|
+
|
|
40
108
|
/**
|
|
41
109
|
* Get language config by file path (handles both extension and filename).
|
|
42
110
|
* @param {string} filePath - File path
|
|
@@ -132,6 +200,7 @@ export default {
|
|
|
132
200
|
LANGUAGES,
|
|
133
201
|
getLanguageByExtension,
|
|
134
202
|
getLanguageByPath,
|
|
203
|
+
resolveLanguage,
|
|
135
204
|
getChunkerPatterns,
|
|
136
205
|
getGraphPatterns,
|
|
137
206
|
getLanguageMeta,
|
|
@@ -86,6 +86,10 @@ export const MODEL_REGISTRY = {
|
|
|
86
86
|
hfId: 'nomic-ai/CodeRankEmbed',
|
|
87
87
|
profile: 'full',
|
|
88
88
|
description: 'Local embedding model (FP32 safetensors, 768d) for native inference',
|
|
89
|
+
// Loaded only by the candle/native inference path, which is armed
|
|
90
|
+
// exclusively for accelerated indexing (Metal / CoreML cascade / CUDA).
|
|
91
|
+
// CPU-only hosts index with ORT INT8 and never load this — init skips it.
|
|
92
|
+
nativeAccelerated: true,
|
|
89
93
|
files: [
|
|
90
94
|
{ path: 'model.safetensors', sizeBytes: 546938168, sha256: '827529bcd58aef0d9082e66eeff7e7d53a02f62bd005f841a26b3d3e2fb17ebe' },
|
|
91
95
|
{ path: 'config.json', sizeBytes: 1525, sha256: null },
|
|
@@ -96,6 +100,8 @@ export const MODEL_REGISTRY = {
|
|
|
96
100
|
hfId: 'lightonai/LateOn-Code',
|
|
97
101
|
profile: 'full',
|
|
98
102
|
description: 'Late interaction model (FP32 safetensors, backbone 768d) for native inference',
|
|
103
|
+
// Native-accelerated only — see coderankembed-fp32 above.
|
|
104
|
+
nativeAccelerated: true,
|
|
99
105
|
files: [
|
|
100
106
|
{ path: 'model.safetensors', sizeBytes: 596076280, sha256: '45c40bb4ba6b45f0c66b2deb3d27dd06efc3af23c78c8093b8cad2af61c683b2' },
|
|
101
107
|
{ path: '1_Dense/model.safetensors', sizeBytes: 393304, sha256: '22ea6a53cad3ed034934b5db7a214a0bcc28ff4cc440babea44029989e4bbcca' },
|
|
@@ -107,6 +113,8 @@ export const MODEL_REGISTRY = {
|
|
|
107
113
|
hfId: 'lightonai/LateOn-Code-edge',
|
|
108
114
|
profile: 'full',
|
|
109
115
|
description: 'Late interaction edge model (FP32 safetensors, backbone 256d, 2-stage projection) for native inference',
|
|
116
|
+
// Native-accelerated only — see coderankembed-fp32 above.
|
|
117
|
+
nativeAccelerated: true,
|
|
110
118
|
files: [
|
|
111
119
|
{ path: 'model.safetensors', sizeBytes: 67195976, sha256: '7ffc36b8ff71367249cd5220dbdd4bdbe177bc0e305b2e978a8b598bd8296f04' },
|
|
112
120
|
{ path: '1_Dense/model.safetensors', sizeBytes: 524376, sha256: '9efb17fcb2106cd8fcb01d57a9cd9c997a487ad20630ec8e44ce3f9d89efe0a7' },
|
|
@@ -160,6 +168,18 @@ export function getModelEntry(key) {
|
|
|
160
168
|
return MODEL_REGISTRY[key] || null;
|
|
161
169
|
}
|
|
162
170
|
|
|
171
|
+
/**
 * Report whether a registry key names a native-accelerated FP32 artifact,
 * i.e. an entry flagged with `nativeAccelerated: true` in MODEL_REGISTRY.
 *
 * Such models are safetensors loaded by the candle/native inference path and
 * are only used for accelerated indexing (Metal / CoreML cascade / CUDA).
 * A CPU-only host indexes with ORT INT8 and never loads them, so init skips
 * them by default (~1.2 GB of downloads avoided).
 *
 * @param {string} key - Model registry key
 * @returns {boolean} true when the entry exists and is flagged; false
 *   otherwise, including for unknown keys.
 */
export function isNativeAcceleratedModel(key) {
  const entry = MODEL_REGISTRY[key];
  if (entry === undefined || entry === null) return false;
  return Boolean(entry.nativeAccelerated);
}
|
|
182
|
+
|
|
163
183
|
/**
|
|
164
184
|
* Truthy env-flag parser. Matches the conventions used across the codebase:
|
|
165
185
|
* "1" / "true" / "on" / "yes" (case-insensitive) → true
|
|
@@ -48,8 +48,7 @@
|
|
|
48
48
|
|
|
49
49
|
import { existsSync } from 'fs';
|
|
50
50
|
import { join } from 'path';
|
|
51
|
-
import {
|
|
52
|
-
import { resolveNativeAddon } from './native-resolver.js';
|
|
51
|
+
import { loadNativeAddon } from './native-resolver.js';
|
|
53
52
|
import { createTokenizer } from './native-tokenizer.js';
|
|
54
53
|
import { getModelCacheDir, fetchModel } from './model-fetcher.js';
|
|
55
54
|
import { getModelEntry } from './model-registry.js';
|
|
@@ -57,8 +56,6 @@ import { getCoremlCascadeResolvedDirs } from './coreml-cascade.js';
|
|
|
57
56
|
import { detectHardwareCapability } from './hardware-capability.js';
|
|
58
57
|
import { LATE_INTERACTION_CONFIG } from './config/ranking.js';
|
|
59
58
|
|
|
60
|
-
const require = createRequire(import.meta.url);
|
|
61
|
-
|
|
62
59
|
// ─── State ───
|
|
63
60
|
|
|
64
61
|
let _addon = null;
|
|
@@ -173,14 +170,12 @@ function resolveCoremlCascadeForAddon() {
|
|
|
173
170
|
|
|
174
171
|
/**
 * Return the native addon module, loading it on first use and caching it in
 * the module-level `_addon`.
 *
 * Loading is delegated to loadNativeAddon(), which is CUDA-preferred with a
 * CPU fallback: a CUDA addon that cannot load on a no-GPU box falls back to
 * the plain CPU addon, so native inference degrades to the CPU (ORT-INT8)
 * path instead of failing.
 *
 * @returns {object|null} The loaded addon module, or null when no candidate
 *   could be loaded. A null result is not cached as "done": the next call
 *   attempts the load again.
 */
function loadAddon() {
  if (!_addon) {
    const loaded = loadNativeAddon();
    _addon = loaded ? loaded.mod : null;
  }
  return _addon;
}
|
|
185
180
|
|
|
186
181
|
// ─── Detection ───
|
|
@@ -80,61 +80,76 @@ function defaultPackageDirResolver(packageName) {
|
|
|
80
80
|
}
|
|
81
81
|
|
|
82
82
|
/**
 * List every native .node addon candidate path that EXISTS on disk, in
 * preference order:
 *   1. local dev build (crates/sweet-search-native/, then legacy native-maxsim/)
 *   2. local package template (CUDA, then CPU)
 *   3. installed npm package (CUDA, then CPU)
 *
 * On Linux the `-cuda` variant is preferred, but a CUDA-built addon hard-links
 * libcuda/libcudart/libcublas, so `require()`-ing it on a host without those
 * libraries (any CPU-only box) throws. Returning an ordered list rather than
 * a single path lets `loadNativeAddon` try CUDA first and transparently fall
 * back to the plain CPU addon (→ ORT-INT8 path). That keeps GPU acceleration
 * on CUDA hosts while a no-GPU `npm i` (which still auto-installs the optional
 * -cuda package, matching os/cpu/libc) does not break indexing.
 *
 * @param {object} [options] - Overrides, primarily for tests
 * @param {(p: string) => boolean} [options.existsSync] - Existence check
 * @param {string} [options.rootDir] - Repository/package root to search under
 * @param {(name: string) => string} [options.resolvePackageDir] - Maps an npm
 *   package name to its directory; expected to throw when not installed
 * @returns {string[]} Existing candidate paths, de-duplicated, best first;
 *   empty when the platform is unsupported or nothing is found.
 */
export function resolveNativeAddonCandidates(options = {}) {
  const info = getPlatformInfo();
  if (!info) return [];
  const { platform, arch, libc, cudaPackageName } = info;

  const exists = options.existsSync ?? existsSync;
  const rootDir = options.rootDir ?? root;
  const resolvePackageDir = options.resolvePackageDir ?? defaultPackageDirResolver;

  // napi-rs --platform output includes the libc suffix on Linux
  // (e.g. `sweet-search-native.linux-x64-gnu.node`). macOS has no libc suffix.
  const platformTag = `${platform}-${arch}${libc}`;
  const binaryName = `sweet-search-native.${platformTag}.node`;
  const packagedAddon = 'sweet-search-native.node';

  const candidates = [];

  // Keep a path only when it is non-empty, present on disk and not yet listed.
  const add = (candidate) => {
    if (!candidate) return;
    if (!exists(candidate)) return;
    if (candidates.includes(candidate)) return;
    candidates.push(candidate);
  };

  // Installed-package lookups are best-effort: a package that cannot be
  // resolved is simply not a candidate.
  const addInstalled = (packageNameOf) => {
    try {
      add(join(resolvePackageDir(packageNameOf()), packagedAddon));
    } catch {
      /* package not installed */
    }
  };

  // 1. Local dev build (crates/sweet-search-native/ or legacy native-maxsim/).
  add(join(rootDir, 'crates', 'sweet-search-native', binaryName));
  add(join(rootDir, 'native-maxsim', binaryName));

  // 2. Local package template — CUDA preferred, then CPU.
  if (cudaPackageName) {
    add(join(rootDir, 'packages', `native-${platformTag}-cuda`, packagedAddon));
  }
  add(join(rootDir, 'packages', `native-${platformTag}`, packagedAddon));

  // 3. Installed npm package — CUDA preferred, then CPU.
  if (cudaPackageName) {
    addInstalled(() => cudaPackageName);
  }
  addInstalled(() => getPlatformPackageName());

  return candidates;
}
|
|
126
|
+
|
|
127
|
+
/**
 * Resolve the single highest-preference native .node addon path.
 *
 * Back-compat shim over `resolveNativeAddonCandidates`: it returns the first
 * candidate (i.e. CUDA-preferred). Callers that need the CUDA→CPU load
 * fallback should use `loadNativeAddon` instead of require()-ing this path
 * directly.
 *
 * @param {object} [options] - Forwarded unchanged to `resolveNativeAddonCandidates`
 * @returns {string|null} The preferred addon path, or null when there is none.
 */
export function resolveNativeAddon(options = {}) {
  const [preferred] = resolveNativeAddonCandidates(options);
  return preferred ?? null;
}
|
|
136
|
+
|
|
137
|
+
/**
 * require() the first native-addon candidate that both loads successfully and
 * satisfies `validate(mod)` (default: accept any module).
 *
 * Candidates come from `resolveNativeAddonCandidates` and are tried in its
 * order (CUDA first, CPU second), so a host whose CUDA addon throws on load
 * (libcuda absent) transparently falls back to the CPU addon.
 *
 * @param {object} [params]
 * @param {(mod: any) => boolean} [params.validate] - Optional acceptance check;
 *   a candidate is skipped when it returns a falsy value or throws
 * @param {(path: string) => any} [params.requireFn] - Loader override (test
 *   seam); defaults to the module's `require`
 * @param {...*} [params.options] - Remaining keys are forwarded to
 *   `resolveNativeAddonCandidates`
 * @returns {{ mod: any, path: string } | null} The loaded module and the path
 *   it came from, or null when no candidate qualified.
 */
export function loadNativeAddon({ validate, requireFn, ...options } = {}) {
  const load = requireFn ?? require; // requireFn is a test seam
  const isAcceptable = (mod) => !validate || validate(mod);

  for (const candidatePath of resolveNativeAddonCandidates(options)) {
    let mod;
    try {
      mod = load(candidatePath);
      if (!isAcceptable(mod)) continue;
    } catch {
      // Candidate failed to load (e.g. CUDA addon without libcuda) — try next.
      continue;
    }
    return { mod, path: candidatePath };
  }

  return null;
}
|
|
140
155
|
|