npm - sweet-search - Versions diffs - 2.5.14 → 2.6.0 - Mend

sweet-search 2.5.14 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

package/README.md +36 -9
package/core/cli.js +41 -3
package/core/embedding/embedding-local-model.js +106 -10
package/core/embedding/embedding-service.js +59 -1
package/core/embedding/model-client.mjs +257 -0
package/core/embedding/model-server.mjs +217 -0
package/core/incremental-indexing/application/maintenance-handlers.mjs +19 -98
package/core/incremental-indexing/application/maintenance-worker.mjs +46 -9
package/core/incremental-indexing/application/operator-cli.mjs +14 -5
package/core/incremental-indexing/application/production-reconciler-helpers.mjs +40 -0
package/core/incremental-indexing/application/production-reconciler.mjs +718 -54
package/core/incremental-indexing/application/reconciler.mjs +87 -15
package/core/incremental-indexing/domain/cutoff-cache.mjs +191 -0
package/core/incremental-indexing/domain/interval-autotune.mjs +84 -1
package/core/incremental-indexing/domain/reconcile-counters.mjs +0 -4
package/core/incremental-indexing/domain/watermark-scheduler.mjs +0 -24
package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +2 -26
package/core/incremental-indexing/infrastructure/manifest.mjs +1 -9
package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +72 -0
package/core/indexing/artifact-builder.js +1 -1
package/core/indexing/dedup/dedup-phase.js +36 -17
package/core/indexing/dedup/exemplar-selector.js +5 -0
package/core/indexing/index-codebase-v21.js +37 -14
package/core/indexing/index-maintainer.mjs +337 -6
package/core/indexing/indexer-ann.js +27 -434
package/core/indexing/indexer-build.js +30 -14
package/core/indexing/indexer-manifest.js +0 -3
package/core/indexing/indexer-phases.js +101 -25
package/core/indexing/maintainer-launcher.mjs +22 -0
package/core/indexing/maintainer-watcher.mjs +397 -0
package/core/indexing/os-priority.mjs +160 -0
package/core/indexing/rss-budget.mjs +425 -0
package/core/indexing/streaming-vectors.js +450 -0
package/core/infrastructure/config/platform.js +14 -10
package/core/infrastructure/onnx-session-utils.js +37 -0
package/core/infrastructure/sparse-gram-delta-reader.js +11 -1
package/core/ranking/late-interaction-index.js +58 -7
package/core/search/daemon-registry.js +199 -0
package/core/search/search-read-semantic.js +9 -3
package/core/search/search-semantic.js +6 -29
package/core/search/search-server.js +527 -27
package/core/search/session-daemon-prewarm.mjs +110 -1
package/core/search/sweet-search.js +0 -38
package/core/vector-store/binary-hnsw-index.js +692 -78
package/core/vector-store/index.js +1 -4
package/mcp/tool-handlers.js +1 -2
package/package.json +11 -8
package/scripts/uninstall.js +2 -0
package/core/vector-store/hnsw-index.js +0 -751

package/README.md CHANGED Viewed

@@ -1,15 +1,16 @@
 <div align="center">
-<img src="assets/sweet-search-banner-pixelated.svg" alt="sweet-search" width="100%" />
+<img src="assets/sweet-search-banner-pixelated.svg" alt="sweet-search — local code search for AI coding agents" width="100%" />
-### *Maybe grep isn't all you need…* 🍬
+<br/>
+**Local code search for AI coding agents.** Six fast, purpose-built tools that hand *Claude Code*, *Codex* & friends ranked answers, not raw grep. Zero API keys, 100% on-device.
-Every AI coding agent of today is stuck believing grep+Read is the way... ***sweet-search*** challenges the narrative 😎
-A 100% local code-search engine for *Claude Code*, *Codex*, *Cursor* & friends with six blazing and purpose-built tools that hand your agent ranked, ready-to-use answers instead of walls of grep output. Up to 34% cheaper, 56% fewer tool calls, more useful answers, SOTA retrieval quality, zero API keys.
+*Maybe grep isn't all you need…* 🍬<br/>
+Every coding agent today reaches for grep + Read by reflex. *sweet-search* challenges the narrative. 😎
 [![npm](https://img.shields.io/npm/v/sweet-search?color=cb3837&label=npm)](https://www.npmjs.com/package/sweet-search)
+[![GitHub stars](https://img.shields.io/github/stars/mrsladoje/sweet-search?style=social)](https://github.com/mrsladoje/sweet-search/stargazers)
 [![license](https://img.shields.io/badge/license-Apache--2.0-blue)](LICENSE)
 [![node](https://img.shields.io/badge/node-%E2%89%A518-brightgreen)](package.json)
 [![platforms](https://img.shields.io/badge/platform-macOS%20%7C%20Linux-lightgrey)](#platform-support)
@@ -80,6 +81,9 @@ A 100% local code-search engine for *Claude Code*, *Codex*, *Cursor* & friends w
 [📊 Benchmarks](#-benchmarks)<br>
 <sub>agent cost savings · engine speed · full-corpus MRR</sub>
+[🧭 Where sweet-search Fits](#-where-sweet-search-fits)<br>
+<sub>honest wins & trade-offs vs peers</sub>
 [🙏 Prior Art & Acknowledgements](#-prior-art--acknowledgements)<br>
 <sub>the shoulders we stand on</sub>
@@ -105,10 +109,6 @@ sweet-search "where do we validate JWT tokens?"
 That's it. `init` is idempotent and SHA256-verifies every model binary; re-running it is always safe.
 From then on the index maintains itself — edit, save, search.
-> **Latest release: v2.5.5** — the agent-mode preview tier now defaults to a 3k token budget (was 4k):
-> same accuracy and usefulness in a 4-model paired sweep, ~11–15% cheaper per query. Already on an
-> older install? `npm install -g sweet-search` again to pick it up.
 <details>
 <summary><b>Setup options & details</b></summary>
@@ -290,6 +290,33 @@ We're SOTA in June 2026 on 3/4 attempted benchmarks at HARDER settings (running
 | 💾 Indexing memory | peak JS heap **785 MB → 213 MB** | [`docs/DISK_FLUSHING_STRATEGY.md`](docs/DISK_FLUSHING_STRATEGY.md) |
 | 🍏 CoreML cascade (M3 Max) | **18% faster** full indexing vs the Metal baseline | [`docs/INIT_STRATEGY.md`](docs/INIT_STRATEGY.md) |
+## 🧭 Where sweet-search Fits
+Code search is a crowded space. Here's an honest read on where sweet-search wins and where it gives ground, against the trending leaders and our closest local peers.
+| Capability | sweet-search | claude-context | Cursor index | codebase-memory | SocratiCode |
+|---|:---:|:---:|:---:|:---:|:---:|
+| 100% local — code never leaves your machine | ✅ | ✅¹ | ❌ | ✅ | ✅ |
+| Works with zero API keys | ✅ | ✅¹ | ❌ | ✅ | ✅ |
+| No external service to run (vector DB · Ollama · Docker) | ✅ | ❌ Milvus | ❌ cloud | ✅ | ⚠️⁵ |
+| ColBERT late-interaction rerank | ✅ | ❌ | ❌ | ❌ | ❌ |
+| Faster-than-ripgrep exact grep | ✅ | ❌ | ✅⁷ | ❌ | ❌ |
+| Call-graph trace (callers · callees · impact) | ✅ | ❌ | ❌ | ✅ | ✅ |
+| Drives any terminal agent (Claude Code · Codex · Gemini CLI) | ✅ | ✅ | ❌² | ✅ | ✅ |
+| Published NL→code retrieval benchmarks | ✅ | ⚠️³ | ❌ | ⚠️³ | ⚠️³ |
+| *…and where sweet-search gives ground* | | | | | |
+| Native Windows | ❌⁴ | ✅ | ✅ | ✅ | ⚠️⁸ |
+| Deep-AST language coverage | ⚠️ 14 (+70 via regex) | ⚠️ | ⚠️ | ✅ 158 | ⚠️ |
+| In-editor GUI · writes & edits code | ❌ | ❌ | ✅ | ❌ | ❌⁶ |
+| Org-wide, multi-repo scale | ❌ | ⚠️ | ⚠️ | ⚠️ | ✅ |
+<sub>✅ yes · ⚠️ partial / with caveats · ❌ no. Verified June 2026; capabilities drift.<br/>
+¹ claude-context's local path (Milvus Lite + Ollama embeddings) needs no API key, but it defaults to OpenAI/Voyage embeddings + Zilliz Cloud — and still runs Milvus + Ollama either way. ² Cursor's index is editor-locked — external terminal agents can't query it. ³ Reports token-reduction / efficiency, not a public NL→code retrieval-quality leaderboard. ⁴ Runs on Windows via WSL2. ⁵ SocratiCode manages a bundled Qdrant for you, but uses an auto-detected Ollama for local embeddings. ⁶ Ships an interactive HTML graph viewer, but doesn't edit code. ⁷ Cursor's local Instant Grep — a literal + regex index it benchmarks at ripgrep 16.8 s → 13 ms (the post that inspired our own n-gram prefilter). ⁸ SocratiCode runs on Windows via Docker only — no native binary, and no GPU there.</sub>
+**Where we lose, plainly:** no native Windows yet, no editor GUI, and we index one repo at a time. If you need org-wide search across many repos and branches, that's where [SocratiCode](https://github.com/giancarloerra/socraticode) and [Sourcegraph](https://sourcegraph.com) are built to win. If you live inside one editor, Cursor's index is already there. sweet-search is for the terminal agent that wants the best *local* retrieval on the repo in front of it. No one else combines all of it: ColBERT late-interaction reranking **and** faster-than-grep search, fully on-device, with nothing to sign up for.
+<sub>Also in the space: <a href="https://sourcegraph.com">Sourcegraph/Cody</a> (org-scale, server-based), <a href="https://github.com/continuedev/continue">Continue.dev</a> (local-default RAG), <a href="https://github.com/oraios/serena">Serena</a> (LSP symbol search, no embeddings), <a href="https://github.com/yoanbernabeu/grepai">grepai</a> (local CLI + trace), and <a href="https://github.com/cocoindex-io/cocoindex-code">cocoindex-code</a> (embedded AST search).</sub>
 ## 🧰 The Six Tools
 Six small tools, one shared index. Each returns ranked, deduplicated, token-budgeted output designed

package/core/cli.js CHANGED Viewed

@@ -10,6 +10,11 @@ import { spawnSync } from 'node:child_process';
 const args = process.argv.slice(2);
+function envFalsey(name) { // true only when env var `name` holds an explicit opt-out value
+  const v = String(process.env[name] || '').trim().toLowerCase(); // unset or empty normalizes to the empty string, which is NOT treated as an opt-out
+  return v === '0' || v === 'false' || v === 'off' || v === 'no'; // comparison is case-insensitive and ignores surrounding whitespace
+}
 // Package-management commands always run in JS (never native dispatch)
 if (args[0] === 'init') {
   const { runInit } = await import('../scripts/init.js');
@@ -24,15 +29,48 @@ if (args[0] === 'init') {
   const { handleIncrementalCli } = await import('./incremental-indexing/application/operator-cli.mjs');
   await handleIncrementalCli(args[0], args.slice(1));
 } else if (args[0] === 'read') {
-  // Filesystem-grounded reader; runs in JS (no native equivalent yet).
+  // Filesystem-grounded reader. Default dispatches to the native Unix-socket
+  // client so the warm daemon serves the read without per-call node startup.
+  // readFiles statSync's every call, so read-your-writes freshness is preserved.
+  // Set SWEET_SEARCH_READ_VIA_DAEMON=0 to force the in-process path.
+  if (!envFalsey('SWEET_SEARCH_READ_VIA_DAEMON')) {
+    const { resolveNativeBinary } = await import('./infrastructure/index.js');
+    const nativeBin = resolveNativeBinary();
+    if (nativeBin) {
+      const result = spawnSync(nativeBin, args, { stdio: 'inherit' });
+      process.exit(result.status ?? 1);
+    }
+  }
   const { handleReadCli } = await import('./search/search-read.js');
   await handleReadCli(args.slice(1));
 } else if (args[0] === 'read-semantic') {
-  // Hybrid span-selection reader; runs in JS (depends on LI index + ranking).
+  // Hybrid span-selection reader. Default dispatches to the native Unix-socket
+  // client so the warm daemon can serve LI scoring without per-call
+  // model/session startup. Set SWEET_SEARCH_SEMANTIC_VIA_DAEMON=0 to force the
+  // legacy in-process path for debugging.
+  if (!envFalsey('SWEET_SEARCH_SEMANTIC_VIA_DAEMON')) {
+    const { resolveNativeBinary } = await import('./infrastructure/index.js');
+    const nativeBin = resolveNativeBinary();
+    if (nativeBin) {
+      const result = spawnSync(nativeBin, args, { stdio: 'inherit' });
+      process.exit(result.status ?? 1);
+    }
+  }
   const { handleReadSemanticCli } = await import('./search/search-read-semantic.js');
   await handleReadSemanticCli(args.slice(1));
 } else if (args[0] === 'trace') {
-  // Unified structural code context: callers, callees, and impact.
+  // Unified structural code context: callers, callees, and impact. Default
+  // dispatches to the native Unix-socket client so the warm daemon serves the
+  // code-graph traversal without per-call node startup + cold code-graph.db
+  // open. Set SWEET_SEARCH_TRACE_VIA_DAEMON=0 to force the in-process path.
+  if (!envFalsey('SWEET_SEARCH_TRACE_VIA_DAEMON')) {
+    const { resolveNativeBinary } = await import('./infrastructure/index.js');
+    const nativeBin = resolveNativeBinary();
+    if (nativeBin) {
+      const result = spawnSync(nativeBin, args, { stdio: 'inherit' });
+      process.exit(result.status ?? 1);
+    }
+  }
   const { handleTraceCli } = await import('./search/search-trace.js');
   await handleTraceCli(args.slice(1));
 } else if (args[0] === 'index') {

package/core/embedding/embedding-local-model.js CHANGED Viewed

@@ -30,6 +30,7 @@ export const QUERY_MAX_LENGTH = parseInt(process.env.SWEET_SEARCH_QUERY_MAX_LENG
 // Import + re-export from infrastructure (canonical location)
 import {
   bestIntraOpThreads,
+  backgroundIntraOpThreads,
   defaultOrtExecutionMode,
   detectLastLevelCacheBytes,
   computeWeightsAwareBatchCap,
@@ -52,6 +53,13 @@ let localModelRuntimeConfig = {
   intraOpThreads: null,
   interOpThreads: null,
   executionMode: null,
+  // G3: background/maintainer ORT profile. When truthy, buildLocalSessionOptions
+  // emits force_spinning_stop:'1' + arena-off + 2–4 intra-op threads instead of
+  // the foreground allow_spinning:'1' + arena-on default. Set by the maintainer
+  // daemon (G4) via configureLocalModelRuntime({ background: true }) before the
+  // first encode (the session singleton is built once on first encode — setting
+  // it afterwards is a silent no-op). Default null/off everywhere else.
+  background: null,
 };
 export function configureLocalModelRuntime(overrides = {}) {
@@ -66,9 +74,26 @@ export function resetLocalModelRuntime() {
     intraOpThreads: null,
     interOpThreads: null,
     executionMode: null,
+    background: null,
   };
 }
+/**
+ * Resolve whether the BACKGROUND/maintainer ORT profile is active.
+ *
+ * True when the daemon set `{ background: true }` via configureLocalModelRuntime
+ * OR the SWEET_SEARCH_ORT_BACKGROUND=1 env gate is set. Default OFF: the
+ * foreground/full-index path is unchanged. An explicit `background: false` in
+ * the runtime config wins over the env gate (lets a query daemon force the
+ * latency-critical foreground profile even under a global env flag).
+ *
+ * Resolution order: `runtimeConfig.background` first; if that is null or
+ * undefined, the module-level `localModelRuntimeConfig.background`. Only the
+ * strict booleans `true` and `false` decide the result directly; any other
+ * value (null, undefined, or a non-boolean) defers to the env var, which must
+ * equal the string 1 exactly.
+ *
+ * @param {object} [runtimeConfig] per-call overrides; only `background` is read.
+ * @returns {boolean} whether the background profile applies.
+ */
+export function isBackgroundOrtProfile(runtimeConfig = {}) {
+  const cfg = runtimeConfig.background ?? localModelRuntimeConfig.background;
+  if (cfg === true) return true;
+  if (cfg === false) return false;
+  return process.env.SWEET_SEARCH_ORT_BACKGROUND === '1';
+}
 export function isOpenVinoProviderAvailable() {
   if (openVinoProviderAvailable !== null) return openVinoProviderAvailable;
@@ -159,6 +184,8 @@ export function getCalibrationFactor() {
 }
 export function buildLocalSessionOptions(quantLabel = 'q8', coremlAvailable = false, runtimeConfig = {}) {
+  const background = isBackgroundOrtProfile(runtimeConfig);
   const executionMode = runtimeConfig.executionMode
     ?? localModelRuntimeConfig.executionMode
     ?? process.env.SWEET_SEARCH_ORT_EXEC_MODE
@@ -166,9 +193,14 @@ export function buildLocalSessionOptions(quantLabel = 'q8', coremlAvailable = fa
   const interOpThreads = runtimeConfig.interOpThreads
     ?? localModelRuntimeConfig.interOpThreads
     ?? parseInt(process.env.SWEET_SEARCH_ORT_INTER_OP_THREADS || '1', 10);
+  // Foreground scales intra-op threads with the hardware (bestIntraOpThreads);
+  // the background/maintainer profile clamps to 2–4 so an idle-time reconcile
+  // tick never spikes every P-core. An explicit intraOpThreads override (from
+  // runtimeConfig or the daemon's configureLocalModelRuntime) still wins on
+  // both paths so callers can pin a specific count.
   const intraOpThreads = runtimeConfig.intraOpThreads
     ?? localModelRuntimeConfig.intraOpThreads
-    ?? bestIntraOpThreads(runtimeConfig);
+    ?? (background ? backgroundIntraOpThreads(runtimeConfig) : bestIntraOpThreads(runtimeConfig));
   const sessionOptions = {
     graphOptimizationLevel: 'all',
@@ -176,18 +208,41 @@ export function buildLocalSessionOptions(quantLabel = 'q8', coremlAvailable = fa
     intraOpNumThreads: intraOpThreads,
     interOpNumThreads: interOpThreads,
     executionMode,
-    enableCpuMemArena: true,
+    // Background profile disables the CPU mem arena: ORT never returns arena
+    // memory to the OS once grown (#25325), so a resident maintainer daemon
+    // would accrue monotonic RSS. Foreground keeps the arena on for throughput.
+    enableCpuMemArena: !background,
     enableMemPattern: true,
     optimizedModelFilePath: getOptimizedModelPath(quantLabel),
   };
-  // Thread spinning keeps ORT worker threads hot-looping for work instead of
-  // sleeping on OS primitives. Trades idle CPU for lower per-batch latency.
-  sessionOptions.extra = {
-    session: {
-      intra_op: { allow_spinning: '1' },
-    },
-  };
+  if (background) {
+    // Background/maintainer profile: park worker threads immediately after the
+    // last Run() instead of hot-looping (allow_spinning would peg ~a full core
+    // while the daemon sits idle 20–60s between bursts). force_spinning_stop
+    // re-spins on the next Run() at ~14% latency cost — a good trade for a
+    // background daemon. Honoured by onnxruntime-node via SessionOptions.extra
+    // (verified by native-binding inspection of 1.24.3; self-checked at startup
+    // in getLocalPipeline, which falls back to thread-count-only if rejected).
+    // NB: do NOT set intra_op_thread_affinities — no-op on macOS; E-core
+    // routing comes from process-level taskpolicy -b (G5), and RunOptions.extra
+    // per-Run arena shrinkage is not wired in the Node binding (arena-off is the
+    // only resident-memory lever here).
+    sessionOptions.extra = {
+      session: {
+        force_spinning_stop: '1',
+      },
+    };
+  } else {
+    // Foreground/full-index profile: thread spinning keeps ORT worker threads
+    // hot-looping for work instead of sleeping on OS primitives. Trades idle
+    // CPU for lower per-batch latency. (Unchanged from the historical default.)
+    sessionOptions.extra = {
+      session: {
+        intra_op: { allow_spinning: '1' },
+      },
+    };
+  }
   if (shouldUseOpenVino()) {
     // Note: OpenVINO EP is not bundled in onnxruntime-node 1.24 for macOS.
@@ -399,6 +454,38 @@ async function embedBatchesWithPool(pool, batches, maxLength, onProgress, totalT
 // PIPELINE SINGLETON
 // =============================================================================
+/**
+ * Self-check that the background ORT profile's SessionOptions.extra is accepted
+ * by the onnxruntime-node binding. Builds a throwaway session with the bg
+ * `extra` (force_spinning_stop); if it constructs cleanly, the real session
+ * keeps the extra. If construction throws (key rejected by a future ORT), log
+ * and return a copy of the options with `extra` removed (thread-count-only
+ * fallback — the clamped intra-op count + arena-off still apply).
+ *
+ * NOTE: the catch does not inspect the error, so ANY failure while creating
+ * the probe session (not only a rejected config key) takes the fallback path
+ * and is logged as a rejected extra. The `sessionOptions` object passed in is
+ * never mutated; the fallback is a shallow copy with `extra` deleted.
+ *
+ * Throwaway sessions are disposed when supported so the probe leaves no
+ * resident native memory behind. Errors thrown by `release()` are ignored.
+ *
+ * @param {object} ort            runtime module; only `InferenceSession.create` is used here.
+ * @param {string} onnxPath       model location handed to `InferenceSession.create`
+ *   (presumably a filesystem path; confirm against the caller).
+ * @param {object} sessionOptions options to probe; expected to carry `extra`.
+ * @returns {Promise<object>} the same `sessionOptions` object when the probe
+ *   session is created, otherwise a shallow copy without `extra`.
+ */
+async function verifyBackgroundExtraOrFallback(ort, onnxPath, sessionOptions) {
+  let probe = null;
+  try {
+    probe = await ort.InferenceSession.create(onnxPath, sessionOptions);
+    return sessionOptions; // extra accepted — use it
+  } catch (err) {
+    const fallback = { ...sessionOptions };
+    delete fallback.extra;
+    console.warn(
+      `[L3b] ORT background profile extra rejected (${err?.message || err}); ` +
+      'falling back to thread-count-only background profile (arena-off retained).',
+    );
+    return fallback;
+  } finally {
+    if (probe && typeof probe.release === 'function') {
+      try { await probe.release(); } catch { /* best effort */ }
+    }
+  }
+}
 let localPipeline = null;
 let isLoadingLocal = false;
 let loadPromise = null;
@@ -429,7 +516,16 @@ export async function getLocalPipeline() {
     if (isAppleSilicon() && !existsSync(coremlFlagPath)) {
       coremlAvailable = await isCoreMLProviderAvailable();
     }
-    const sessionOptions = buildLocalSessionOptions(quantLabel, coremlAvailable);
+    let sessionOptions = buildLocalSessionOptions(quantLabel, coremlAvailable);
+    // G3 startup self-check: the background profile relies on
+    // SessionOptions.extra.session.force_spinning_stop being honoured by the
+    // onnxruntime-node binding (confirmed via native-binding inspection of
+    // 1.24.3, but verify at runtime). If a future ORT version rejects the
+    // config key, fall back to a thread-count-only background profile (keep the
+    // clamped intra-op count + arena-off; drop only the unsupported `extra`).
+    if (isBackgroundOrtProfile() && sessionOptions.extra) {
+      sessionOptions = await verifyBackgroundExtraOrFallback(ort, onnxPath, sessionOptions);
+    }
     let backend = 'cpu';
     if (sessionOptions.executionProviders) {
       const names = sessionOptions.executionProviders.map(ep => typeof ep === 'string' ? ep : ep.name);

package/core/embedding/embedding-service.js CHANGED Viewed

@@ -38,6 +38,21 @@ import {
   resetLocalModelRuntime,
 } from './embedding-local-model.js';
+// G8 shared model server — the RPC client is imported LAZILY (only when the
+// SWEET_SEARCH_SHARED_MODEL_SERVER gate is on) so the default in-process path
+// never pays the import cost and stays byte-and-behavior identical to today.
+let _modelClientModule; // undefined = import not attempted yet; null = import failed; otherwise the module namespace
+async function _getModelClient() { // resolves to the cached module namespace, or null when the import failed
+  if (_modelClientModule === undefined) {
+    try {
+      _modelClientModule = await import('./model-client.mjs');
+    } catch {
+      _modelClientModule = null; // import failed → permanently fall back (never retried in this process; the error itself is discarded)
+    }
+  }
+  return _modelClientModule;
+}
 import {
   queryCache,
   vocabulary,
@@ -315,6 +330,49 @@ export async function embed(text, options = {}) {
   return result.embedding;
 }
+/**
+ * G8 dispatch shim. Generate embeddings for the uncached texts.
+ *
+ * When `SWEET_SEARCH_SHARED_MODEL_SERVER==='1'` AND the embedding provider is
+ * the local ONNX model (the only model the shared server hosts), route the
+ * generation through the model-server RPC client over a Unix socket. The RPC
+ * result is BYTE-IDENTICAL to in-process (same model, same preprocessing — the
+ * floats travel as raw Float32 bytes). On ANY failure (flag off, client import
+ * failed, socket unavailable, server error, timeout) we fall through to the
+ * existing in-process `generateEmbeddings` path UNCHANGED — the shared server
+ * is a pure performance/memory optimization, never a correctness dependency.
+ *
+ * A provider counts as local when it is missing from `EMBEDDING_PROVIDERS`,
+ * is present but not `enabled`, or is the literal key `local`. An RPC reply
+ * that is not an array of exactly `uncachedTexts.length` entries is discarded
+ * WITHOUT logging and the in-process path runs instead. RPC errors are only
+ * written to stderr when `DEBUG_CATCHES` is set. On RPC success `onProgress`
+ * is invoked once, as (total, total).
+ *
+ * @param {string[]} uncachedTexts texts that still need an embedding.
+ * @param {string} provider        embedding provider key.
+ * @param {object} providerOptions forwarded to the RPC client, or (merged with
+ *   `onProgress`) to `generateEmbeddings`.
+ * @param {Function} [onProgress]  optional progress callback.
+ * @returns {Promise<Array>} the generated embeddings; the RPC path guarantees
+ *   one entry per input text, the in-process path returns whatever
+ *   `generateEmbeddings` yields.
+ */
+async function _generateUncachedEmbeddings(uncachedTexts, provider, providerOptions, onProgress) {
+  const sharedServerOn = process.env.SWEET_SEARCH_SHARED_MODEL_SERVER === '1';
+  // The shared model server only hosts the local ONNX model. Remote providers
+  // (voyage/mistral/jina) must keep their existing in-process API path.
+  const isLocalModel = !EMBEDDING_PROVIDERS[provider]
+    || !EMBEDDING_PROVIDERS[provider].enabled
+    || provider === 'local';
+  if (sharedServerOn && isLocalModel) {
+    const client = await _getModelClient();
+    if (client && typeof client.requestEmbeddings === 'function') {
+      try {
+        const rpc = await client.requestEmbeddings(uncachedTexts, { providerOptions });
+        // Guard against a partial/short reply — only trust a complete result.
+        if (Array.isArray(rpc) && rpc.length === uncachedTexts.length) {
+          if (onProgress) onProgress(uncachedTexts.length, uncachedTexts.length);
+          return rpc;
+        }
+      } catch (err) {
+        if (process.env.DEBUG_CATCHES) {
+          process.stderr.write(`[embedding-service] shared model server RPC failed, falling back: ${err?.message || err}\n`);
+        }
+        // fall through to in-process
+      }
+    }
+  }
+  // Default / fallback path — byte-and-behavior identical to today.
+  return generateEmbeddings(uncachedTexts, provider, { ...providerOptions, onProgress });
+}
 export async function getEmbeddings(texts, options = {}) {
   const {
     useCache = true,
@@ -355,7 +413,7 @@ export async function getEmbeddings(texts, options = {}) {
   }
   if (uncachedTexts.length > 0) {
-    const newEmbeddings = await generateEmbeddings(uncachedTexts, provider, { ...providerOptions, onProgress });
+    const newEmbeddings = await _generateUncachedEmbeddings(uncachedTexts, provider, providerOptions, onProgress);
     for (let i = 0; i < uncachedIndices.length; i++) {
       const idx = uncachedIndices[i];
       results[idx] = { embedding: newEmbeddings[i], cached: false };

package/core/embedding/model-client.mjs ADDED Viewed

@@ -0,0 +1,257 @@
+/**
+ * G8 — Shared model server: RPC CLIENT + wire protocol codec.
+ *
+ * One ONNX model is loaded ONCE in a separate process (`model-server.mjs`);
+ * per-repo daemons RPC to it for embeddings over a Unix domain socket. This
+ * module is the CLIENT used by `embedding-service.js` when
+ * `SWEET_SEARCH_SHARED_MODEL_SERVER==='1'`. It connects, sends `getEmbeddings`
+ * requests, and falls back to in-process embedding when the socket is
+ * unavailable (the caller catches and reverts — see the dispatch shim).
+ *
+ * Wire protocol (length-prefixed binary frames). Each frame is:
+ *
+ *   [4 bytes BE]  header JSON byte length  (H)
+ *   [4 bytes BE]  payload byte length      (P)
+ *   [H bytes]     UTF-8 JSON header        (type, metadata, dims, lengths…)
+ *   [P bytes]     raw payload              (concatenated Float32 little-endian)
+ *
+ * CRITICAL byte-identity guarantee: embedding floats travel as RAW Float32
+ * little-endian bytes in the payload, never JSON-stringified. The bytes the
+ * server reads out of the model are the bytes the client reconstructs — a
+ * pure transport hop, no lossy float→string→float round-trip. The codec here
+ * is the single source of truth for that framing; the server imports it.
+ *
+ * This module owns NO model state and performs NO inference; it is pure
+ * transport + (de)serialization, safe to import from any process.
+ */
+import net from 'node:net';
+import os from 'node:os';
+import path from 'node:path';
+// Header lengths are 32-bit BE; payloads are bounded by the same width.
+export const FRAME_HEADER_BYTES = 8; // 4 (header len) + 4 (payload len)
+export const PROTOCOL_VERSION = 1;
+/**
+ * Resolve the shared model server's socket path. The model server is GLOBAL
+ * (one per machine/user, shared across all repos) — unlike the per-project
+ * search server — so the default socket is a single fixed path. A deep path
+ * would overflow `sockaddr_un.sun_path` (~104 bytes on macOS), so we keep it
+ * short under the OS temp dir. Override with `SWEET_SEARCH_MODEL_SOCKET_PATH`.
+ *
+ * The override is returned verbatim (no validation or length check). When
+ * `process.getuid` is missing or throws, the uid suffix is simply omitted.
+ *
+ * @param {object} [env=process.env] environment map to read the override from.
+ * @returns {string} the override if set, else
+ *   `<os.tmpdir()>/sweet-search-model[-<uid>].sock`.
+ */
+export function modelServerSocketPath(env = process.env) {
+  if (env.SWEET_SEARCH_MODEL_SOCKET_PATH) return env.SWEET_SEARCH_MODEL_SOCKET_PATH;
+  // Scope by uid where available so multiple users don't collide on one path.
+  let uidPart = '';
+  try {
+    if (typeof process.getuid === 'function') uidPart = `-${process.getuid()}`;
+  } catch { /* getuid unavailable (e.g. Windows) — fall through */ }
+  return path.join(os.tmpdir(), `sweet-search-model${uidPart}.sock`);
+}
+// ── Wire codec ────────────────────────────────────────────────────────────
+/**
+ * Encode a single frame. `header` is a JSON-serializable object; `payload` is
+ * an optional Buffer of raw bytes (Float32 little-endian for embeddings).
+ * Returns one Buffer ready to write to the socket.
+ *
+ * Layout: 4-byte BE header length, 4-byte BE payload length, header JSON
+ * (UTF-8), then the payload bytes. The header length is the BYTE length of
+ * the encoded JSON, not its character count. No explicit size check is made;
+ * a length that does not fit in 32 bits makes `writeUInt32BE` throw.
+ *
+ * @param {object} header          JSON-serializable frame header.
+ * @param {Buffer|null} [payload]  raw payload bytes; a falsy value means empty.
+ * @returns {Buffer} the complete frame (prefix + header + payload).
+ */
+export function encodeFrame(header, payload = null) {
+  const headerJson = Buffer.from(JSON.stringify(header), 'utf8');
+  const payloadBuf = payload || Buffer.alloc(0);
+  const prefix = Buffer.allocUnsafe(FRAME_HEADER_BYTES); // fully overwritten by the two writes below
+  prefix.writeUInt32BE(headerJson.length, 0);
+  prefix.writeUInt32BE(payloadBuf.length, 4);
+  return Buffer.concat([prefix, headerJson, payloadBuf]);
+}
+/**
+ * Incremental frame decoder. Feed it chunks; it emits whole frames via the
+ * `onFrame(header, payloadBuffer, err)` callback. Handles TCP/stream fragmentation
+ * (a frame split across many chunks, or many frames in one chunk).
+ *
+ * Callback contract: a decoded frame is delivered as `(header, payload, null)`
+ * where `payload` is a private copy; a header whose JSON cannot be parsed is
+ * delivered as `(null, null, err)`.
+ *
+ * Caveats visible in this implementation:
+ *  - The declared lengths are trusted as-is; there is no maximum frame size,
+ *    so the decoder keeps buffering until the advertised byte count arrives.
+ *  - After a corrupt header only the current `push` call stops. The bad frame
+ *    has already been consumed from the internal buffer, so a later `push`
+ *    resumes parsing; callers that want to abort must stop feeding data.
+ *  - When the internal buffer is empty the incoming chunk is adopted without
+ *    copying.
+ */
+export class FrameDecoder {
+  constructor(onFrame) {
+    this._onFrame = onFrame;
+    this._buf = Buffer.alloc(0);
+  }
+  push(chunk) {
+    this._buf = this._buf.length === 0 ? chunk : Buffer.concat([this._buf, chunk]);
+    // Drain as many complete frames as are buffered.
+    for (;;) {
+      if (this._buf.length < FRAME_HEADER_BYTES) return;
+      const headerLen = this._buf.readUInt32BE(0);
+      const payloadLen = this._buf.readUInt32BE(4);
+      const total = FRAME_HEADER_BYTES + headerLen + payloadLen;
+      if (this._buf.length < total) return; // wait for more bytes
+      const headerJson = this._buf.toString('utf8', FRAME_HEADER_BYTES, FRAME_HEADER_BYTES + headerLen);
+      const payload = this._buf.subarray(FRAME_HEADER_BYTES + headerLen, total);
+      // Copy the payload out so the retained buffer slice can be GC'd and the
+      // caller owns a stable Buffer independent of our internal buffer.
+      const payloadCopy = Buffer.from(payload);
+      this._buf = this._buf.subarray(total); // frame is consumed BEFORE the header is parsed
+      let header;
+      try {
+        header = JSON.parse(headerJson);
+      } catch (err) {
+        // A corrupt header is unrecoverable on a stream — surface and stop.
+        this._onFrame(null, null, err);
+        return;
+      }
+      this._onFrame(header, payloadCopy, null);
+    }
+  }
+}
+/**
+ * Pack an array of embeddings (Float32Array | number[]) into one contiguous
+ * Float32 little-endian payload + a per-vector length list (so ragged dims are
+ * preserved exactly). Returns { payload: Buffer, dims: number[] }.
+ *
+ * We do NOT assume a fixed dimension: each vector's length is recorded so the
+ * decode is exact even if a caller ever returns mixed-width vectors.
+ *
+ * Null/undefined or empty entries are recorded with dim 0 and contribute no
+ * bytes. Inputs that are not a Float32Array go through `Float32Array.from`,
+ * so their values are rounded to float32 precision.
+ *
+ * NOTE(review): the bytes are copied straight from the Float32Array backing
+ * store, i.e. in the host's native byte order. That equals little-endian on
+ * the usual x86/ARM hosts; a big-endian host would not match the LE decode.
+ *
+ * @param {Array<Float32Array|number[]|null|undefined>} embeddings vectors to pack.
+ * @returns {{ payload: Buffer, dims: number[] }} packed bytes plus per-vector lengths.
+ */
+export function packEmbeddings(embeddings) {
+  const dims = new Array(embeddings.length);
+  let totalFloats = 0;
+  for (let i = 0; i < embeddings.length; i++) {
+    const v = embeddings[i];
+    const len = v == null ? 0 : v.length;
+    dims[i] = len;
+    totalFloats += len;
+  }
+  // One backing buffer; copy each vector's raw little-endian bytes in order.
+  const out = Buffer.allocUnsafe(totalFloats * 4);
+  let offset = 0;
+  for (let i = 0; i < embeddings.length; i++) {
+    const v = embeddings[i];
+    if (!v || v.length === 0) continue;
+    // Float32Array view over the SAME bytes — copy losslessly into `out`.
+    const src = v instanceof Float32Array ? v : Float32Array.from(v);
+    const srcBytes = Buffer.from(src.buffer, src.byteOffset, src.length * 4);
+    srcBytes.copy(out, offset);
+    offset += src.length * 4;
+  }
+  return { payload: out, dims };
+}
+/**
+ * Inverse of `packEmbeddings`. Reconstructs an array of Float32Array from a
+ * raw little-endian payload + per-vector dims. The reconstructed arrays are
+ * byte-identical to the originals (same IEEE-754 bit patterns).
+ *
+ * The payload length is not validated against `dims`: a payload that is too
+ * short makes `readFloatLE` throw a range error, and any trailing bytes
+ * beyond what `dims` describes are ignored.
+ *
+ * @param {Buffer} payload  concatenated Float32 little-endian values.
+ * @param {number[]} dims   length of each vector, in order.
+ * @returns {Float32Array[]} one freshly allocated vector per entry of `dims`.
+ */
+export function unpackEmbeddings(payload, dims) {
+  const out = new Array(dims.length);
+  let floatOffset = 0; // running offset in floats, not bytes
+  for (let i = 0; i < dims.length; i++) {
+    const len = dims[i];
+    const vec = new Float32Array(len);
+    for (let j = 0; j < len; j++) {
+      // Read each float by absolute byte offset to stay correct regardless of
+      // payload alignment (Buffer is not guaranteed 4-byte aligned).
+      vec[j] = payload.readFloatLE((floatOffset + j) * 4);
+    }
+    out[i] = vec;
+    floatOffset += len;
+  }
+  return out;
+}
+// ── RPC client ──────────────────────────────────────────────────────────────
+let _nextRequestId = 1; // module-level counter; each call takes the next id
+/**
+ * Send a single `getEmbeddings` RPC and resolve with Float32Array[] (one per
+ * input text). Opens a fresh connection per call (simple, robust; the model
+ * server multiplexes concurrent connections). REJECTS on any transport/server
+ * error so the dispatch shim can fall back to in-process — it must NEVER
+ * resolve with a value that looks like a successful (but wrong) result.
+ *
+ * Rejection cases in this implementation: socket error, connection closed
+ * before a reply, timeout, an undecodable frame header, an `error` frame
+ * (regardless of its requestId), a payload that cannot be unpacked, and any
+ * frame whose type or requestId does not match this request. The first
+ * outcome wins; settling clears the timer and destroys the socket.
+ *
+ * This function does not check that the number of returned vectors equals
+ * `texts.length`; the result length is whatever `header.dims` describes.
+ *
+ * @param {string[]} texts
+ * @param {object}   [opts]
+ * @param {object}   [opts.providerOptions] forwarded to the server-side embed.
+ * @param {string}   [opts.socketPath] defaults to `modelServerSocketPath()`.
+ * @param {number}   [opts.timeoutMs] defaults to 60000 ms.
+ * @returns {Promise<Float32Array[]>} the decoded embeddings.
+ */
+export function requestEmbeddings(texts, opts = {}) {
+  const socketPath = opts.socketPath || modelServerSocketPath();
+  const timeoutMs = opts.timeoutMs ?? 60_000;
+  const providerOptions = opts.providerOptions || {};
+  const requestId = _nextRequestId++;
+  return new Promise((resolve, reject) => {
+    let settled = false;
+    const finish = (fn, arg) => {
+      if (settled) return;
+      settled = true;
+      clearTimeout(timer);
+      try { socket.destroy(); } catch { /* ignore */ }
+      fn(arg);
+    };
+    const socket = net.connect(socketPath);
+    const decoder = new FrameDecoder((header, payload, err) => {
+      if (err) return finish(reject, err);
+      if (header.type === 'error') {
+        return finish(reject, new Error(header.message || 'model-server error'));
+      }
+      if (header.type === 'embeddings' && header.requestId === requestId) {
+        try {
+          const embeddings = unpackEmbeddings(payload, header.dims || []);
+          return finish(resolve, embeddings);
+        } catch (e) {
+          return finish(reject, e);
+        }
+      }
+      // Unknown / mismatched frame — treat as protocol error, fall back.
+      finish(reject, new Error(`unexpected model-server frame: ${header.type}`));
+    });
+    const timer = setTimeout(() => finish(reject, new Error('model-server RPC timeout')), timeoutMs);
+    socket.on('connect', () => {
+      const frame = encodeFrame({
+        type: 'getEmbeddings',
+        v: PROTOCOL_VERSION,
+        requestId,
+        texts,
+        providerOptions,
+      });
+      socket.write(frame);
+    });
+    socket.on('data', (chunk) => decoder.push(chunk));
+    socket.on('error', (e) => finish(reject, e));
+    socket.on('close', () => finish(reject, new Error('model-server connection closed before reply')));
+  });
+}
+/**
+ * Best-effort liveness probe: resolves true if the model server answers a
+ * `ping` over the socket within `timeoutMs`, false otherwise. Never throws.
+ *
+ * The first decoded frame decides the outcome: true only for a header of
+ * type `pong`; any other frame, an undecodable header, a socket error, a
+ * close, or the timeout resolves false. The socket is destroyed on settle.
+ *
+ * @param {object} [opts]
+ * @param {string} [opts.socketPath] defaults to `modelServerSocketPath()`.
+ * @param {number} [opts.timeoutMs]  defaults to 1000 ms.
+ * @returns {Promise<boolean>} whether a `pong` was received in time.
+ */
+export function pingModelServer(opts = {}) {
+  const socketPath = opts.socketPath || modelServerSocketPath();
+  const timeoutMs = opts.timeoutMs ?? 1_000;
+  return new Promise((resolve) => {
+    let settled = false;
+    const done = (ok) => {
+      if (settled) return;
+      settled = true;
+      clearTimeout(timer);
+      try { socket.destroy(); } catch { /* ignore */ }
+      resolve(ok);
+    };
+    const socket = net.connect(socketPath);
+    const decoder = new FrameDecoder((header) => {
+      done(!!header && header.type === 'pong'); // header is null when the decoder reports a parse error
+    });
+    const timer = setTimeout(() => done(false), timeoutMs);
+    socket.on('connect', () => socket.write(encodeFrame({ type: 'ping', v: PROTOCOL_VERSION })));
+    socket.on('data', (chunk) => decoder.push(chunk));
+    socket.on('error', () => done(false));
+    socket.on('close', () => done(false));
+  });
+}