sweet-search 2.5.14 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/README.md +36 -9
  2. package/core/cli.js +41 -3
  3. package/core/embedding/embedding-local-model.js +106 -10
  4. package/core/embedding/embedding-service.js +59 -1
  5. package/core/embedding/model-client.mjs +257 -0
  6. package/core/embedding/model-server.mjs +217 -0
  7. package/core/incremental-indexing/application/maintenance-handlers.mjs +19 -98
  8. package/core/incremental-indexing/application/maintenance-worker.mjs +46 -9
  9. package/core/incremental-indexing/application/operator-cli.mjs +14 -5
  10. package/core/incremental-indexing/application/production-reconciler-helpers.mjs +40 -0
  11. package/core/incremental-indexing/application/production-reconciler.mjs +718 -54
  12. package/core/incremental-indexing/application/reconciler.mjs +87 -15
  13. package/core/incremental-indexing/domain/cutoff-cache.mjs +191 -0
  14. package/core/incremental-indexing/domain/interval-autotune.mjs +84 -1
  15. package/core/incremental-indexing/domain/reconcile-counters.mjs +0 -4
  16. package/core/incremental-indexing/domain/watermark-scheduler.mjs +0 -24
  17. package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +2 -26
  18. package/core/incremental-indexing/infrastructure/manifest.mjs +1 -9
  19. package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +72 -0
  20. package/core/indexing/artifact-builder.js +1 -1
  21. package/core/indexing/dedup/dedup-phase.js +36 -17
  22. package/core/indexing/dedup/exemplar-selector.js +5 -0
  23. package/core/indexing/index-codebase-v21.js +37 -14
  24. package/core/indexing/index-maintainer.mjs +337 -6
  25. package/core/indexing/indexer-ann.js +27 -434
  26. package/core/indexing/indexer-build.js +30 -14
  27. package/core/indexing/indexer-manifest.js +0 -3
  28. package/core/indexing/indexer-phases.js +101 -25
  29. package/core/indexing/maintainer-launcher.mjs +22 -0
  30. package/core/indexing/maintainer-watcher.mjs +397 -0
  31. package/core/indexing/os-priority.mjs +160 -0
  32. package/core/indexing/rss-budget.mjs +425 -0
  33. package/core/indexing/streaming-vectors.js +450 -0
  34. package/core/infrastructure/config/platform.js +14 -10
  35. package/core/infrastructure/onnx-session-utils.js +37 -0
  36. package/core/infrastructure/sparse-gram-delta-reader.js +11 -1
  37. package/core/ranking/late-interaction-index.js +58 -7
  38. package/core/search/daemon-registry.js +199 -0
  39. package/core/search/search-read-semantic.js +9 -3
  40. package/core/search/search-semantic.js +6 -29
  41. package/core/search/search-server.js +527 -27
  42. package/core/search/session-daemon-prewarm.mjs +110 -1
  43. package/core/search/sweet-search.js +0 -38
  44. package/core/vector-store/binary-hnsw-index.js +692 -78
  45. package/core/vector-store/index.js +1 -4
  46. package/mcp/tool-handlers.js +1 -2
  47. package/package.json +11 -8
  48. package/scripts/uninstall.js +2 -0
  49. package/core/vector-store/hnsw-index.js +0 -751
package/README.md CHANGED
@@ -1,15 +1,16 @@
1
1
  <div align="center">
2
2
 
3
- <img src="assets/sweet-search-banner-pixelated.svg" alt="sweet-search" width="100%" />
3
+ <img src="assets/sweet-search-banner-pixelated.svg" alt="sweet-search — local code search for AI coding agents" width="100%" />
4
4
 
5
- ### *Maybe grep isn't all you need…* 🍬
5
+ <br/>
6
6
 
7
+ **Local code search for AI coding agents.** Six fast, purpose-built tools that hand *Claude Code*, *Codex* & friends ranked answers, not raw grep. Zero API keys, 100% on-device.
7
8
 
8
- Every AI coding agent of today is stuck believing grep+Read is the way... ***sweet-search*** challenges the narrative 😎
9
-
10
- A 100% local code-search engine for *Claude Code*, *Codex*, *Cursor* & friends with six blazing and purpose-built tools that hand your agent ranked, ready-to-use answers instead of walls of grep output. Up to 34% cheaper, 56% fewer tool calls, more useful answers, SOTA retrieval quality, zero API keys.
9
+ *Maybe grep isn't all you need…* 🍬<br/>
10
+ Every coding agent today reaches for grep + Read by reflex. *sweet-search* challenges the narrative. 😎
11
11
 
12
12
  [![npm](https://img.shields.io/npm/v/sweet-search?color=cb3837&label=npm)](https://www.npmjs.com/package/sweet-search)
13
+ [![GitHub stars](https://img.shields.io/github/stars/mrsladoje/sweet-search?style=social)](https://github.com/mrsladoje/sweet-search/stargazers)
13
14
  [![license](https://img.shields.io/badge/license-Apache--2.0-blue)](LICENSE)
14
15
  [![node](https://img.shields.io/badge/node-%E2%89%A518-brightgreen)](package.json)
15
16
  [![platforms](https://img.shields.io/badge/platform-macOS%20%7C%20Linux-lightgrey)](#platform-support)
@@ -80,6 +81,9 @@ A 100% local code-search engine for *Claude Code*, *Codex*, *Cursor* & friends w
80
81
  [📊 Benchmarks](#-benchmarks)<br>
81
82
  <sub>agent cost savings · engine speed · full-corpus MRR</sub>
82
83
 
84
+ [🧭 Where sweet-search Fits](#-where-sweet-search-fits)<br>
85
+ <sub>honest wins & trade-offs vs peers</sub>
86
+
83
87
  [🙏 Prior Art & Acknowledgements](#-prior-art--acknowledgements)<br>
84
88
  <sub>the shoulders we stand on</sub>
85
89
 
@@ -105,10 +109,6 @@ sweet-search "where do we validate JWT tokens?"
105
109
  That's it. `init` is idempotent and SHA256-verifies every model binary; re-running it is always safe.
106
110
  From then on the index maintains itself — edit, save, search.
107
111
 
108
- > **Latest release: v2.5.5** — the agent-mode preview tier now defaults to a 3k token budget (was 4k):
109
- > same accuracy and usefulness in a 4-model paired sweep, ~11–15% cheaper per query. Already on an
110
- > older install? `npm install -g sweet-search` again to pick it up.
111
-
112
112
  <details>
113
113
  <summary><b>Setup options & details</b></summary>
114
114
 
@@ -290,6 +290,33 @@ We're SOTA in June 2026 on 3/4 attempted benchmarks at HARDER settings (running
290
290
  | 💾 Indexing memory | peak JS heap **785 MB → 213 MB** | [`docs/DISK_FLUSHING_STRATEGY.md`](docs/DISK_FLUSHING_STRATEGY.md) |
291
291
  | 🍏 CoreML cascade (M3 Max) | **18% faster** full indexing vs the Metal baseline | [`docs/INIT_STRATEGY.md`](docs/INIT_STRATEGY.md) |
292
292
 
293
+ ## 🧭 Where sweet-search Fits
294
+
295
+ Code search is a crowded space. Here's an honest read on where sweet-search wins and where it gives ground, against the trending leaders and our closest local peers.
296
+
297
+ | Capability | sweet-search | claude-context | Cursor index | codebase-memory | SocratiCode |
298
+ |---|:---:|:---:|:---:|:---:|:---:|
299
+ | 100% local — code never leaves your machine | ✅ | ✅¹ | ❌ | ✅ | ✅ |
300
+ | Works with zero API keys | ✅ | ✅¹ | ❌ | ✅ | ✅ |
301
+ | No external service to run (vector DB · Ollama · Docker) | ✅ | ❌ Milvus | ❌ cloud | ✅ | ⚠️⁵ |
302
+ | ColBERT late-interaction rerank | ✅ | ❌ | ❌ | ❌ | ❌ |
303
+ | Faster-than-ripgrep exact grep | ✅ | ❌ | ✅⁷ | ❌ | ❌ |
304
+ | Call-graph trace (callers · callees · impact) | ✅ | ❌ | ❌ | ✅ | ✅ |
305
+ | Drives any terminal agent (Claude Code · Codex · Gemini CLI) | ✅ | ✅ | ❌² | ✅ | ✅ |
306
+ | Published NL→code retrieval benchmarks | ✅ | ⚠️³ | ❌ | ⚠️³ | ⚠️³ |
307
+ | *…and where sweet-search gives ground* | | | | | |
308
+ | Native Windows | ❌⁴ | ✅ | ✅ | ✅ | ⚠️⁸ |
309
+ | Deep-AST language coverage | ⚠️ 14 (+70 via regex) | ⚠️ | ⚠️ | ✅ 158 | ⚠️ |
310
+ | In-editor GUI · writes & edits code | ❌ | ❌ | ✅ | ❌ | ❌⁶ |
311
+ | Org-wide, multi-repo scale | ❌ | ⚠️ | ⚠️ | ⚠️ | ✅ |
312
+
313
+ <sub>✅ yes · ⚠️ partial / with caveats · ❌ no. Verified June 2026; capabilities drift.<br/>
314
+ ¹ claude-context's local path (Milvus Lite + Ollama embeddings) needs no API key, but it defaults to OpenAI/Voyage embeddings + Zilliz Cloud — and still runs Milvus + Ollama either way. ² Cursor's index is editor-locked — external terminal agents can't query it. ³ Reports token-reduction / efficiency, not a public NL→code retrieval-quality leaderboard. ⁴ Runs on Windows via WSL2. ⁵ SocratiCode manages a bundled Qdrant for you, but uses an auto-detected Ollama for local embeddings. ⁶ Ships an interactive HTML graph viewer, but doesn't edit code. ⁷ Cursor's local Instant Grep — a literal + regex index it benchmarks at ripgrep 16.8 s → 13 ms (the post that inspired our own n-gram prefilter). ⁸ SocratiCode runs on Windows via Docker only — no native binary, and no GPU there.</sub>
315
+
316
+ **Where we lose, plainly:** no native Windows yet, no editor GUI, and we index one repo at a time. If you need org-wide search across many repos and branches, that's where [SocratiCode](https://github.com/giancarloerra/socraticode) and [Sourcegraph](https://sourcegraph.com) are built to win. If you live inside one editor, Cursor's index is already there. sweet-search is for the terminal agent that wants the best *local* retrieval on the repo in front of it. No one else combines all of it: ColBERT late-interaction reranking **and** faster-than-grep search, fully on-device, with nothing to sign up for.
317
+
318
+ <sub>Also in the space: <a href="https://sourcegraph.com">Sourcegraph/Cody</a> (org-scale, server-based), <a href="https://github.com/continuedev/continue">Continue.dev</a> (local-default RAG), <a href="https://github.com/oraios/serena">Serena</a> (LSP symbol search, no embeddings), <a href="https://github.com/yoanbernabeu/grepai">grepai</a> (local CLI + trace), and <a href="https://github.com/cocoindex-io/cocoindex-code">cocoindex-code</a> (embedded AST search).</sub>
319
+
293
320
  ## 🧰 The Six Tools
294
321
 
295
322
  Six small tools, one shared index. Each returns ranked, deduplicated, token-budgeted output designed
package/core/cli.js CHANGED
@@ -10,6 +10,11 @@ import { spawnSync } from 'node:child_process';
10
10
 
11
11
  const args = process.argv.slice(2);
12
12
 
13
+ function envFalsey(name) {
14
+ const v = String(process.env[name] || '').trim().toLowerCase();
15
+ return v === '0' || v === 'false' || v === 'off' || v === 'no';
16
+ }
17
+
13
18
  // Package-management commands always run in JS (never native dispatch)
14
19
  if (args[0] === 'init') {
15
20
  const { runInit } = await import('../scripts/init.js');
@@ -24,15 +29,48 @@ if (args[0] === 'init') {
24
29
  const { handleIncrementalCli } = await import('./incremental-indexing/application/operator-cli.mjs');
25
30
  await handleIncrementalCli(args[0], args.slice(1));
26
31
  } else if (args[0] === 'read') {
27
- // Filesystem-grounded reader; runs in JS (no native equivalent yet).
32
+ // Filesystem-grounded reader. Default dispatches to the native Unix-socket
33
+ // client so the warm daemon serves the read without per-call node startup.
34
+ // readFiles calls statSync on every call, so read-your-writes freshness is preserved.
35
+ // Set SWEET_SEARCH_READ_VIA_DAEMON=0 to force the in-process path.
36
+ if (!envFalsey('SWEET_SEARCH_READ_VIA_DAEMON')) {
37
+ const { resolveNativeBinary } = await import('./infrastructure/index.js');
38
+ const nativeBin = resolveNativeBinary();
39
+ if (nativeBin) {
40
+ const result = spawnSync(nativeBin, args, { stdio: 'inherit' });
41
+ process.exit(result.status ?? 1);
42
+ }
43
+ }
28
44
  const { handleReadCli } = await import('./search/search-read.js');
29
45
  await handleReadCli(args.slice(1));
30
46
  } else if (args[0] === 'read-semantic') {
31
- // Hybrid span-selection reader; runs in JS (depends on LI index + ranking).
47
+ // Hybrid span-selection reader. Default dispatches to the native Unix-socket
48
+ // client so the warm daemon can serve LI scoring without per-call
49
+ // model/session startup. Set SWEET_SEARCH_SEMANTIC_VIA_DAEMON=0 to force the
50
+ // legacy in-process path for debugging.
51
+ if (!envFalsey('SWEET_SEARCH_SEMANTIC_VIA_DAEMON')) {
52
+ const { resolveNativeBinary } = await import('./infrastructure/index.js');
53
+ const nativeBin = resolveNativeBinary();
54
+ if (nativeBin) {
55
+ const result = spawnSync(nativeBin, args, { stdio: 'inherit' });
56
+ process.exit(result.status ?? 1);
57
+ }
58
+ }
32
59
  const { handleReadSemanticCli } = await import('./search/search-read-semantic.js');
33
60
  await handleReadSemanticCli(args.slice(1));
34
61
  } else if (args[0] === 'trace') {
35
- // Unified structural code context: callers, callees, and impact.
62
+ // Unified structural code context: callers, callees, and impact. Default
63
+ // dispatches to the native Unix-socket client so the warm daemon serves the
64
+ // code-graph traversal without per-call node startup + cold code-graph.db
65
+ // open. Set SWEET_SEARCH_TRACE_VIA_DAEMON=0 to force the in-process path.
66
+ if (!envFalsey('SWEET_SEARCH_TRACE_VIA_DAEMON')) {
67
+ const { resolveNativeBinary } = await import('./infrastructure/index.js');
68
+ const nativeBin = resolveNativeBinary();
69
+ if (nativeBin) {
70
+ const result = spawnSync(nativeBin, args, { stdio: 'inherit' });
71
+ process.exit(result.status ?? 1);
72
+ }
73
+ }
36
74
  const { handleTraceCli } = await import('./search/search-trace.js');
37
75
  await handleTraceCli(args.slice(1));
38
76
  } else if (args[0] === 'index') {
@@ -30,6 +30,7 @@ export const QUERY_MAX_LENGTH = parseInt(process.env.SWEET_SEARCH_QUERY_MAX_LENG
30
30
  // Import + re-export from infrastructure (canonical location)
31
31
  import {
32
32
  bestIntraOpThreads,
33
+ backgroundIntraOpThreads,
33
34
  defaultOrtExecutionMode,
34
35
  detectLastLevelCacheBytes,
35
36
  computeWeightsAwareBatchCap,
@@ -52,6 +53,13 @@ let localModelRuntimeConfig = {
52
53
  intraOpThreads: null,
53
54
  interOpThreads: null,
54
55
  executionMode: null,
56
+ // G3: background/maintainer ORT profile. When truthy, buildLocalSessionOptions
57
+ // emits force_spinning_stop:'1' + arena-off + 2–4 intra-op threads instead of
58
+ // the foreground allow_spinning:'1' + arena-on default. Set by the maintainer
59
+ // daemon (G4) via configureLocalModelRuntime({ background: true }) before the
60
+ // first encode (the session singleton is built once on first encode — setting
61
+ // it afterwards is a silent no-op). Default null/off everywhere else.
62
+ background: null,
55
63
  };
56
64
 
57
65
  export function configureLocalModelRuntime(overrides = {}) {
@@ -66,9 +74,26 @@ export function resetLocalModelRuntime() {
66
74
  intraOpThreads: null,
67
75
  interOpThreads: null,
68
76
  executionMode: null,
77
+ background: null,
69
78
  };
70
79
  }
71
80
 
81
+ /**
82
+ * Resolve whether the BACKGROUND/maintainer ORT profile is active.
83
+ *
84
+ * True when the daemon set `{ background: true }` via configureLocalModelRuntime
85
+ * OR the SWEET_SEARCH_ORT_BACKGROUND=1 env gate is set. Default OFF: the
86
+ * foreground/full-index path is unchanged. An explicit `background: false` in
87
+ * the runtime config wins over the env gate (lets a query daemon force the
88
+ * latency-critical foreground profile even under a global env flag).
89
+ */
90
+ export function isBackgroundOrtProfile(runtimeConfig = {}) {
91
+ const cfg = runtimeConfig.background ?? localModelRuntimeConfig.background;
92
+ if (cfg === true) return true;
93
+ if (cfg === false) return false;
94
+ return process.env.SWEET_SEARCH_ORT_BACKGROUND === '1';
95
+ }
96
+
72
97
  export function isOpenVinoProviderAvailable() {
73
98
  if (openVinoProviderAvailable !== null) return openVinoProviderAvailable;
74
99
 
@@ -159,6 +184,8 @@ export function getCalibrationFactor() {
159
184
  }
160
185
 
161
186
  export function buildLocalSessionOptions(quantLabel = 'q8', coremlAvailable = false, runtimeConfig = {}) {
187
+ const background = isBackgroundOrtProfile(runtimeConfig);
188
+
162
189
  const executionMode = runtimeConfig.executionMode
163
190
  ?? localModelRuntimeConfig.executionMode
164
191
  ?? process.env.SWEET_SEARCH_ORT_EXEC_MODE
@@ -166,9 +193,14 @@ export function buildLocalSessionOptions(quantLabel = 'q8', coremlAvailable = fa
166
193
  const interOpThreads = runtimeConfig.interOpThreads
167
194
  ?? localModelRuntimeConfig.interOpThreads
168
195
  ?? parseInt(process.env.SWEET_SEARCH_ORT_INTER_OP_THREADS || '1', 10);
196
+ // Foreground scales intra-op threads with the hardware (bestIntraOpThreads);
197
+ // the background/maintainer profile clamps to 2–4 so an idle-time reconcile
198
+ // tick never spikes every P-core. An explicit intraOpThreads override (from
199
+ // runtimeConfig or the daemon's configureLocalModelRuntime) still wins on
200
+ // both paths so callers can pin a specific count.
169
201
  const intraOpThreads = runtimeConfig.intraOpThreads
170
202
  ?? localModelRuntimeConfig.intraOpThreads
171
- ?? bestIntraOpThreads(runtimeConfig);
203
+ ?? (background ? backgroundIntraOpThreads(runtimeConfig) : bestIntraOpThreads(runtimeConfig));
172
204
 
173
205
  const sessionOptions = {
174
206
  graphOptimizationLevel: 'all',
@@ -176,18 +208,41 @@ export function buildLocalSessionOptions(quantLabel = 'q8', coremlAvailable = fa
176
208
  intraOpNumThreads: intraOpThreads,
177
209
  interOpNumThreads: interOpThreads,
178
210
  executionMode,
179
- enableCpuMemArena: true,
211
+ // Background profile disables the CPU mem arena: ORT never returns arena
212
+ // memory to the OS once grown (#25325), so a resident maintainer daemon
213
+ // would accrue monotonic RSS. Foreground keeps the arena on for throughput.
214
+ enableCpuMemArena: !background,
180
215
  enableMemPattern: true,
181
216
  optimizedModelFilePath: getOptimizedModelPath(quantLabel),
182
217
  };
183
218
 
184
- // Thread spinning keeps ORT worker threads hot-looping for work instead of
185
- // sleeping on OS primitives. Trades idle CPU for lower per-batch latency.
186
- sessionOptions.extra = {
187
- session: {
188
- intra_op: { allow_spinning: '1' },
189
- },
190
- };
219
+ if (background) {
220
+ // Background/maintainer profile: park worker threads immediately after the
221
+ // last Run() instead of hot-looping (allow_spinning would peg ~a full core
222
+ // while the daemon sits idle 20–60s between bursts). force_spinning_stop
223
+ // re-spins on the next Run() at ~14% latency cost — a good trade for a
224
+ // background daemon. Honoured by onnxruntime-node via SessionOptions.extra
225
+ // (verified by native-binding inspection of 1.24.3; self-checked at startup
226
+ // in getLocalPipeline, which falls back to thread-count-only if rejected).
227
+ // NB: do NOT set intra_op_thread_affinities — no-op on macOS; E-core
228
+ // routing comes from process-level taskpolicy -b (G5), and RunOptions.extra
229
+ // per-Run arena shrinkage is not wired in the Node binding (arena-off is the
230
+ // only resident-memory lever here).
231
+ sessionOptions.extra = {
232
+ session: {
233
+ force_spinning_stop: '1',
234
+ },
235
+ };
236
+ } else {
237
+ // Foreground/full-index profile: thread spinning keeps ORT worker threads
238
+ // hot-looping for work instead of sleeping on OS primitives. Trades idle
239
+ // CPU for lower per-batch latency. (Unchanged from the historical default.)
240
+ sessionOptions.extra = {
241
+ session: {
242
+ intra_op: { allow_spinning: '1' },
243
+ },
244
+ };
245
+ }
191
246
 
192
247
  if (shouldUseOpenVino()) {
193
248
  // Note: OpenVINO EP is not bundled in onnxruntime-node 1.24 for macOS.
@@ -399,6 +454,38 @@ async function embedBatchesWithPool(pool, batches, maxLength, onProgress, totalT
399
454
  // PIPELINE SINGLETON
400
455
  // =============================================================================
401
456
 
457
+ /**
458
+ * Self-check that the background ORT profile's SessionOptions.extra is accepted
459
+ * by the onnxruntime-node binding. Builds a throwaway session with the bg
460
+ * `extra` (force_spinning_stop); if it constructs cleanly, the real session
461
+ * keeps the extra. If construction throws (key rejected by a future ORT), log
462
+ * and return a copy of the options with `extra` removed (thread-count-only
463
+ * fallback — the clamped intra-op count + arena-off still apply). Best-effort:
464
+ * any failure to even run the probe leaves the options untouched.
465
+ *
466
+ * Throwaway sessions are disposed when supported so the probe leaves no
467
+ * resident native memory behind.
468
+ */
469
+ async function verifyBackgroundExtraOrFallback(ort, onnxPath, sessionOptions) {
470
+ let probe = null;
471
+ try {
472
+ probe = await ort.InferenceSession.create(onnxPath, sessionOptions);
473
+ return sessionOptions; // extra accepted — use it
474
+ } catch (err) {
475
+ const fallback = { ...sessionOptions };
476
+ delete fallback.extra;
477
+ console.warn(
478
+ `[L3b] ORT background profile extra rejected (${err?.message || err}); ` +
479
+ 'falling back to thread-count-only background profile (arena-off retained).',
480
+ );
481
+ return fallback;
482
+ } finally {
483
+ if (probe && typeof probe.release === 'function') {
484
+ try { await probe.release(); } catch { /* best effort */ }
485
+ }
486
+ }
487
+ }
488
+
402
489
  let localPipeline = null;
403
490
  let isLoadingLocal = false;
404
491
  let loadPromise = null;
@@ -429,7 +516,16 @@ export async function getLocalPipeline() {
429
516
  if (isAppleSilicon() && !existsSync(coremlFlagPath)) {
430
517
  coremlAvailable = await isCoreMLProviderAvailable();
431
518
  }
432
- const sessionOptions = buildLocalSessionOptions(quantLabel, coremlAvailable);
519
+ let sessionOptions = buildLocalSessionOptions(quantLabel, coremlAvailable);
520
+ // G3 startup self-check: the background profile relies on
521
+ // SessionOptions.extra.session.force_spinning_stop being honoured by the
522
+ // onnxruntime-node binding (confirmed via native-binding inspection of
523
+ // 1.24.3, but verify at runtime). If a future ORT version rejects the
524
+ // config key, fall back to a thread-count-only background profile (keep the
525
+ // clamped intra-op count + arena-off; drop only the unsupported `extra`).
526
+ if (isBackgroundOrtProfile() && sessionOptions.extra) {
527
+ sessionOptions = await verifyBackgroundExtraOrFallback(ort, onnxPath, sessionOptions);
528
+ }
433
529
  let backend = 'cpu';
434
530
  if (sessionOptions.executionProviders) {
435
531
  const names = sessionOptions.executionProviders.map(ep => typeof ep === 'string' ? ep : ep.name);
@@ -38,6 +38,21 @@ import {
38
38
  resetLocalModelRuntime,
39
39
  } from './embedding-local-model.js';
40
40
 
41
+ // G8 shared model server — the RPC client is imported LAZILY (only when the
42
+ // SWEET_SEARCH_SHARED_MODEL_SERVER gate is on) so the default in-process path
43
+ // never pays the import cost and stays byte-and-behavior identical to today.
44
+ let _modelClientModule;
45
+ async function _getModelClient() {
46
+ if (_modelClientModule === undefined) {
47
+ try {
48
+ _modelClientModule = await import('./model-client.mjs');
49
+ } catch {
50
+ _modelClientModule = null; // import failed → permanently fall back
51
+ }
52
+ }
53
+ return _modelClientModule;
54
+ }
55
+
41
56
  import {
42
57
  queryCache,
43
58
  vocabulary,
@@ -315,6 +330,49 @@ export async function embed(text, options = {}) {
315
330
  return result.embedding;
316
331
  }
317
332
 
333
+ /**
334
+ * G8 dispatch shim. Generate embeddings for the uncached texts.
335
+ *
336
+ * When `SWEET_SEARCH_SHARED_MODEL_SERVER==='1'` AND the embedding provider is
337
+ * the local ONNX model (the only model the shared server hosts), route the
338
+ * generation through the model-server RPC client over a Unix socket. The RPC
339
+ * result is BYTE-IDENTICAL to in-process (same model, same preprocessing — the
340
+ * floats travel as raw Float32 bytes). When the flag is off, or on ANY failure
341
+ * (client import failed, socket unavailable, server error, timeout, short reply) we fall through to the
342
+ * existing in-process `generateEmbeddings` path UNCHANGED — the shared server
343
+ * is a pure performance/memory optimization, never a correctness dependency.
344
+ */
345
+ async function _generateUncachedEmbeddings(uncachedTexts, provider, providerOptions, onProgress) {
346
+ const sharedServerOn = process.env.SWEET_SEARCH_SHARED_MODEL_SERVER === '1';
347
+ // The shared model server only hosts the local ONNX model. Remote providers
348
+ // (voyage/mistral/jina) must keep their existing in-process API path.
349
+ const isLocalModel = !EMBEDDING_PROVIDERS[provider]
350
+ || !EMBEDDING_PROVIDERS[provider].enabled
351
+ || provider === 'local';
352
+
353
+ if (sharedServerOn && isLocalModel) {
354
+ const client = await _getModelClient();
355
+ if (client && typeof client.requestEmbeddings === 'function') {
356
+ try {
357
+ const rpc = await client.requestEmbeddings(uncachedTexts, { providerOptions });
358
+ // Guard against a partial/short reply — only trust a complete result.
359
+ if (Array.isArray(rpc) && rpc.length === uncachedTexts.length) {
360
+ if (onProgress) onProgress(uncachedTexts.length, uncachedTexts.length);
361
+ return rpc;
362
+ }
363
+ } catch (err) {
364
+ if (process.env.DEBUG_CATCHES) {
365
+ process.stderr.write(`[embedding-service] shared model server RPC failed, falling back: ${err?.message || err}\n`);
366
+ }
367
+ // fall through to in-process
368
+ }
369
+ }
370
+ }
371
+
372
+ // Default / fallback path — byte-and-behavior identical to today.
373
+ return generateEmbeddings(uncachedTexts, provider, { ...providerOptions, onProgress });
374
+ }
375
+
318
376
  export async function getEmbeddings(texts, options = {}) {
319
377
  const {
320
378
  useCache = true,
@@ -355,7 +413,7 @@ export async function getEmbeddings(texts, options = {}) {
355
413
  }
356
414
 
357
415
  if (uncachedTexts.length > 0) {
358
- const newEmbeddings = await generateEmbeddings(uncachedTexts, provider, { ...providerOptions, onProgress });
416
+ const newEmbeddings = await _generateUncachedEmbeddings(uncachedTexts, provider, providerOptions, onProgress);
359
417
  for (let i = 0; i < uncachedIndices.length; i++) {
360
418
  const idx = uncachedIndices[i];
361
419
  results[idx] = { embedding: newEmbeddings[i], cached: false };
@@ -0,0 +1,257 @@
1
+ /**
2
+ * G8 — Shared model server: RPC CLIENT + wire protocol codec.
3
+ *
4
+ * One ONNX model is loaded ONCE in a separate process (`model-server.mjs`);
5
+ * per-repo daemons RPC to it for embeddings over a Unix domain socket. This
6
+ * module is the CLIENT used by `embedding-service.js` when
7
+ * `SWEET_SEARCH_SHARED_MODEL_SERVER==='1'`. It connects, sends `getEmbeddings`
8
+ * requests, and falls back to in-process embedding when the socket is
9
+ * unavailable (the caller catches and reverts — see the dispatch shim).
10
+ *
11
+ * Wire protocol (length-prefixed binary frames). Each frame is:
12
+ *
13
+ * [4 bytes BE] header JSON byte length (H)
14
+ * [4 bytes BE] payload byte length (P)
15
+ * [H bytes] UTF-8 JSON header (type, metadata, dims, lengths…)
16
+ * [P bytes] raw payload (concatenated Float32 little-endian)
17
+ *
18
+ * CRITICAL byte-identity guarantee: embedding floats travel as RAW Float32
19
+ * little-endian bytes in the payload, never JSON-stringified. The bytes the
20
+ * server reads out of the model are the bytes the client reconstructs — a
21
+ * pure transport hop, no lossy float→string→float round-trip. The codec here
22
+ * is the single source of truth for that framing; the server imports it.
23
+ *
24
+ * This module owns NO model state and performs NO inference; it is pure
25
+ * transport + (de)serialization, safe to import from any process.
26
+ */
27
+
28
+ import net from 'node:net';
29
+ import os from 'node:os';
30
+ import path from 'node:path';
31
+
32
// Header lengths are 32-bit BE; payloads are bounded by the same width.
// Every frame begins with this fixed-size prefix: two unsigned 32-bit
// big-endian integers, the header JSON byte length followed by the payload
// byte length.
export const FRAME_HEADER_BYTES = 8; // 4 (header len) + 4 (payload len)
// Sent as the `v` field of every request header this module builds.
export const PROTOCOL_VERSION = 1;
35
+
36
/**
 * Compute the socket path of the shared model server.
 *
 * The model server is global (one per machine/user, shared by every repo),
 * unlike the per-project search server, so the default is one fixed location.
 * It lives directly under the OS temp dir to stay short: a deep path would
 * overflow `sockaddr_un.sun_path` (~104 bytes on macOS).
 *
 * @param {object} [env=process.env] environment to read the override from.
 * @returns {string} `SWEET_SEARCH_MODEL_SOCKET_PATH` when set and non-empty,
 *   otherwise `<tmpdir>/sweet-search-model[-<uid>].sock`.
 */
export function modelServerSocketPath(env = process.env) {
  const override = env.SWEET_SEARCH_MODEL_SOCKET_PATH;
  if (override) return override;

  // Suffix the file name with the uid where one exists so that different
  // users never share a path; platforms without getuid get no suffix.
  const uidSuffix = () => {
    try {
      return typeof process.getuid === 'function' ? `-${process.getuid()}` : '';
    } catch {
      return '';
    }
  };

  const fileName = `sweet-search-model${uidSuffix()}.sock`;
  return path.join(os.tmpdir(), fileName);
}
52
+
53
+ // ── Wire codec ────────────────────────────────────────────────────────────
54
+
55
/**
 * Serialize one wire frame.
 *
 * Layout: a prefix holding the header byte length and the payload byte length
 * (each an unsigned 32-bit big-endian integer), then the UTF-8 JSON header,
 * then the raw payload bytes.
 *
 * @param {object} header JSON-serializable frame header.
 * @param {Buffer|null} [payload=null] raw payload bytes (Float32 little-endian
 *   for embeddings); a missing payload is encoded as zero bytes.
 * @returns {Buffer} a single Buffer ready to write to the socket.
 */
export function encodeFrame(header, payload = null) {
  const headerBytes = Buffer.from(JSON.stringify(header), 'utf8');
  const body = payload ? payload : Buffer.alloc(0);

  // Write both lengths back to back; writeUInt32BE returns the next offset.
  const lengths = Buffer.allocUnsafe(FRAME_HEADER_BYTES);
  let cursor = 0;
  for (const part of [headerBytes, body]) {
    cursor = lengths.writeUInt32BE(part.length, cursor);
  }

  return Buffer.concat([lengths, headerBytes, body]);
}
68
+
69
/**
 * Incremental frame decoder. Feed it chunks with `push`; it invokes
 * `onFrame(header, payloadBuffer, err)` once per complete frame. Handles
 * stream fragmentation (a frame split across many chunks, or many frames in
 * one chunk).
 *
 * Error contract: when a frame's header is not valid JSON the callback is
 * invoked once as `onFrame(null, null, err)` and the decoder stops for good —
 * buffered bytes are dropped and every later `push` is ignored. Callers are
 * expected to tear the connection down.
 */
export class FrameDecoder {
  /**
   * @param {(header: any, payload: Buffer|null, err: Error|null) => void} onFrame
   */
  constructor(onFrame) {
    this._onFrame = onFrame;
    this._buf = Buffer.alloc(0);
    // Set once a header fails to parse; the stream is then considered dead.
    this._failed = false;
  }

  /**
   * Append a chunk and emit every frame that is now complete.
   * @param {Buffer} chunk bytes read from the stream.
   */
  push(chunk) {
    if (this._failed) return;
    this._buf = this._buf.length === 0 ? chunk : Buffer.concat([this._buf, chunk]);

    // Drain as many complete frames as are buffered.
    while (this._buf.length >= FRAME_HEADER_BYTES) {
      const headerLen = this._buf.readUInt32BE(0);
      const payloadLen = this._buf.readUInt32BE(4);
      const headerEnd = FRAME_HEADER_BYTES + headerLen;
      const frameEnd = headerEnd + payloadLen;
      if (this._buf.length < frameEnd) return; // wait for more bytes

      const headerJson = this._buf.toString('utf8', FRAME_HEADER_BYTES, headerEnd);
      // Copy the payload out so the caller owns a stable Buffer independent of
      // our internal buffer, which can then be released.
      const payloadCopy = Buffer.from(this._buf.subarray(headerEnd, frameEnd));
      this._buf = this._buf.subarray(frameEnd);

      let header;
      try {
        header = JSON.parse(headerJson);
      } catch (err) {
        // A corrupt header is unrecoverable on a stream — latch the failure so
        // leftover or future bytes are never interpreted as frames, then
        // surface the error exactly once.
        this._failed = true;
        this._buf = Buffer.alloc(0);
        this._onFrame(null, null, err);
        return;
      }
      this._onFrame(header, payloadCopy, null);
    }
  }
}
107
+
108
/**
 * Pack an array of embeddings (Float32Array | number[]) into one contiguous
 * Float32 little-endian payload plus a per-vector length list, so ragged
 * dimensions are preserved exactly.
 *
 * No fixed dimension is assumed: each vector's length is recorded in `dims`.
 * `null`/`undefined` entries are recorded as zero-length vectors. `dims` is
 * derived from the vectors actually serialized, so it always matches the
 * payload byte-for-byte.
 *
 * The payload is little-endian regardless of host byte order: on
 * little-endian hosts the Float32Array bytes are copied verbatim, on
 * big-endian hosts each float is written with an explicit LE store.
 *
 * @param {ArrayLike<Float32Array|number[]|null|undefined>} embeddings
 * @returns {{ payload: Buffer, dims: number[] }}
 */
export function packEmbeddings(embeddings) {
  // Normalize every entry to a Float32Array first so lengths and bytes are
  // computed from the same objects.
  const vectors = Array.from(embeddings, (v) => {
    if (v == null) return new Float32Array(0);
    return v instanceof Float32Array ? v : Float32Array.from(v);
  });
  const dims = vectors.map((vec) => vec.length);
  const totalFloats = dims.reduce((sum, len) => sum + len, 0);

  // Typed arrays expose host byte order; a raw copy is only little-endian on
  // little-endian hosts.
  const hostIsLittleEndian = new Uint8Array(new Uint16Array([1]).buffer)[0] === 1;

  // One backing buffer; every byte is overwritten below.
  const out = Buffer.allocUnsafe(totalFloats * 4);
  let offset = 0;
  for (const vec of vectors) {
    if (vec.length === 0) continue;
    if (hostIsLittleEndian) {
      // View over the SAME bytes (honoring byteOffset for subarrays).
      Buffer.from(vec.buffer, vec.byteOffset, vec.byteLength).copy(out, offset);
      offset += vec.byteLength;
    } else {
      for (let j = 0; j < vec.length; j++) {
        offset = out.writeFloatLE(vec[j], offset);
      }
    }
  }
  return { payload: out, dims };
}
139
+
140
/**
 * Inverse of `packEmbeddings`: rebuild one Float32Array per entry of `dims`
 * from a raw Float32 little-endian payload.
 *
 * `dims` arrives off the wire, so it is validated before anything is
 * allocated: every entry must be a non-negative integer and the payload must
 * hold at least `sum(dims) * 4` bytes. Bytes beyond that are ignored.
 *
 * @param {Buffer} payload concatenated Float32 little-endian values.
 * @param {number[]} dims length of each vector, in payload order.
 * @returns {Float32Array[]} one vector per `dims` entry.
 * @throws {TypeError} if `dims` is not an array.
 * @throws {RangeError} if a length is invalid or the payload is too short.
 */
export function unpackEmbeddings(payload, dims) {
  if (!Array.isArray(dims)) {
    throw new TypeError('unpackEmbeddings: dims must be an array');
  }
  let totalFloats = 0;
  for (const len of dims) {
    if (!Number.isInteger(len) || len < 0) {
      throw new RangeError(`unpackEmbeddings: invalid vector length ${String(len)}`);
    }
    totalFloats += len;
  }
  const neededBytes = totalFloats * 4;
  const availableBytes = payload ? payload.length : 0;
  if (availableBytes < neededBytes) {
    throw new RangeError(
      `unpackEmbeddings: payload has ${availableBytes} bytes, dims require ${neededBytes}`,
    );
  }

  const out = new Array(dims.length);
  let floatOffset = 0;
  for (let i = 0; i < dims.length; i++) {
    const len = dims[i];
    const vec = new Float32Array(len);
    for (let j = 0; j < len; j++) {
      // Read each float by absolute byte offset to stay correct regardless of
      // payload alignment (Buffer is not guaranteed 4-byte aligned).
      vec[j] = payload.readFloatLE((floatOffset + j) * 4);
    }
    out[i] = vec;
    floatOffset += len;
  }
  return out;
}
161
+
162
+ // ── RPC client ──────────────────────────────────────────────────────────────
163
+
164
// Request-id counter for this process; echoed back by the server so a reply
// can be matched to the request that caused it.
let _nextRequestId = 1;

/**
 * Send a single `getEmbeddings` RPC and resolve with Float32Array[] (one per
 * input text). Opens a fresh connection per call (simple, robust; the model
 * server multiplexes concurrent connections).
 *
 * REJECTS on any transport, protocol, serialization or server error so the
 * dispatch shim can fall back to in-process embedding. It must never resolve
 * with a value that merely looks like a successful result, and it must never
 * let a failure escape as an uncaught exception from a socket event handler.
 *
 * @param {string[]} texts
 * @param {object} [opts]
 * @param {object} [opts.providerOptions] forwarded to the server-side embed.
 * @param {string} [opts.socketPath] defaults to `modelServerSocketPath()`.
 * @param {number} [opts.timeoutMs] overall deadline, default 60 000 ms.
 * @returns {Promise<Float32Array[]>}
 */
export function requestEmbeddings(texts, opts = {}) {
  const socketPath = opts.socketPath || modelServerSocketPath();
  const timeoutMs = opts.timeoutMs ?? 60_000;
  const providerOptions = opts.providerOptions || {};
  const requestId = _nextRequestId++;

  return new Promise((resolve, reject) => {
    // Serialize BEFORE touching the network. If the header cannot be
    // JSON-encoded (e.g. circular providerOptions) the throw happens here, in
    // the executor, and becomes a rejection — not an uncaught exception
    // inside the 'connect' listener, and with no socket or timer to clean up.
    const frame = encodeFrame({
      type: 'getEmbeddings',
      v: PROTOCOL_VERSION,
      requestId,
      texts,
      providerOptions,
    });

    let settled = false;
    let timer;
    // Settle exactly once: stop the deadline, drop the connection, then
    // resolve or reject.
    const finish = (fn, arg) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      try { socket.destroy(); } catch { /* ignore */ }
      fn(arg);
    };

    const socket = net.connect(socketPath);
    const decoder = new FrameDecoder((header, payload, err) => {
      if (err) return finish(reject, err);
      // A syntactically valid but non-object header (e.g. JSON `null`) must be
      // rejected here; dereferencing it would throw inside the 'data' handler.
      if (header === null || typeof header !== 'object') {
        return finish(reject, new Error('malformed model-server frame header'));
      }
      if (header.type === 'error') {
        return finish(reject, new Error(header.message || 'model-server error'));
      }
      if (header.type === 'embeddings' && header.requestId === requestId) {
        try {
          const embeddings = unpackEmbeddings(payload, header.dims || []);
          return finish(resolve, embeddings);
        } catch (e) {
          return finish(reject, e);
        }
      }
      // Unknown / mismatched frame — treat as protocol error, fall back.
      finish(reject, new Error(`unexpected model-server frame: ${header.type}`));
    });

    timer = setTimeout(() => finish(reject, new Error('model-server RPC timeout')), timeoutMs);

    socket.on('connect', () => {
      socket.write(frame);
    });
    socket.on('data', (chunk) => decoder.push(chunk));
    socket.on('error', (e) => finish(reject, e));
    socket.on('close', () => finish(reject, new Error('model-server connection closed before reply')));
  });
}
230
+
231
/**
 * Best-effort liveness check for the model server.
 *
 * Connects to the socket, sends a `ping` frame and resolves `true` only if the
 * first decoded frame is a `pong`. A connection error, an early close, any
 * other frame, or no answer within `timeoutMs` resolves `false`. The returned
 * promise never rejects.
 *
 * @param {object} [opts]
 * @param {string} [opts.socketPath] defaults to `modelServerSocketPath()`.
 * @param {number} [opts.timeoutMs] how long to wait, default 1 000 ms.
 * @returns {Promise<boolean>}
 */
export function pingModelServer(opts = {}) {
  const target = opts.socketPath || modelServerSocketPath();
  const deadlineMs = opts.timeoutMs ?? 1_000;

  return new Promise((resolve) => {
    let answered = false;
    const conn = net.connect(target);

    // First outcome wins; later events on the same connection are ignored.
    function settle(alive) {
      if (answered) return;
      answered = true;
      clearTimeout(watchdog);
      try {
        conn.destroy();
      } catch { /* ignore */ }
      resolve(alive);
    }

    const replies = new FrameDecoder((header) => {
      const isPong = Boolean(header) && header.type === 'pong';
      settle(isPong);
    });
    const watchdog = setTimeout(() => {
      settle(false);
    }, deadlineMs);

    conn.on('connect', () => {
      conn.write(encodeFrame({ type: 'ping', v: PROTOCOL_VERSION }));
    });
    conn.on('data', (bytes) => {
      replies.push(bytes);
    });
    conn.on('error', () => {
      settle(false);
    });
    conn.on('close', () => {
      settle(false);
    });
  });
}