@loreai/core 0.16.0 → 0.17.0

Files changed (155)
  1. package/README.md +11 -0
  2. package/dist/bun/agents-file.d.ts +13 -1
  3. package/dist/bun/agents-file.d.ts.map +1 -1
  4. package/dist/bun/config.d.ts +20 -1
  5. package/dist/bun/config.d.ts.map +1 -1
  6. package/dist/bun/data.d.ts +174 -0
  7. package/dist/bun/data.d.ts.map +1 -0
  8. package/dist/bun/db.d.ts +65 -0
  9. package/dist/bun/db.d.ts.map +1 -1
  10. package/dist/bun/distillation.d.ts +49 -6
  11. package/dist/bun/distillation.d.ts.map +1 -1
  12. package/dist/bun/embedding-vendor.d.ts +66 -0
  13. package/dist/bun/embedding-vendor.d.ts.map +1 -0
  14. package/dist/bun/embedding-worker-types.d.ts +66 -0
  15. package/dist/bun/embedding-worker-types.d.ts.map +1 -0
  16. package/dist/bun/embedding-worker.d.ts +16 -0
  17. package/dist/bun/embedding-worker.d.ts.map +1 -0
  18. package/dist/bun/embedding-worker.js +100 -0
  19. package/dist/bun/embedding-worker.js.map +7 -0
  20. package/dist/bun/embedding.d.ts +91 -8
  21. package/dist/bun/embedding.d.ts.map +1 -1
  22. package/dist/bun/git.d.ts +47 -0
  23. package/dist/bun/git.d.ts.map +1 -0
  24. package/dist/bun/gradient.d.ts +19 -1
  25. package/dist/bun/gradient.d.ts.map +1 -1
  26. package/dist/bun/index.d.ts +9 -6
  27. package/dist/bun/index.d.ts.map +1 -1
  28. package/dist/bun/index.js +13029 -10885
  29. package/dist/bun/index.js.map +4 -4
  30. package/dist/bun/lat-reader.d.ts +1 -1
  31. package/dist/bun/lat-reader.d.ts.map +1 -1
  32. package/dist/bun/ltm.d.ts.map +1 -1
  33. package/dist/bun/markdown.d.ts +11 -0
  34. package/dist/bun/markdown.d.ts.map +1 -1
  35. package/dist/bun/prompt.d.ts +1 -1
  36. package/dist/bun/prompt.d.ts.map +1 -1
  37. package/dist/bun/recall.d.ts +53 -0
  38. package/dist/bun/recall.d.ts.map +1 -1
  39. package/dist/bun/search.d.ts +29 -0
  40. package/dist/bun/search.d.ts.map +1 -1
  41. package/dist/bun/temporal.d.ts +2 -0
  42. package/dist/bun/temporal.d.ts.map +1 -1
  43. package/dist/bun/types.d.ts +15 -0
  44. package/dist/bun/types.d.ts.map +1 -1
  45. package/dist/bun/worker-model.d.ts +12 -9
  46. package/dist/bun/worker-model.d.ts.map +1 -1
  47. package/dist/node/agents-file.d.ts +13 -1
  48. package/dist/node/agents-file.d.ts.map +1 -1
  49. package/dist/node/config.d.ts +20 -1
  50. package/dist/node/config.d.ts.map +1 -1
  51. package/dist/node/data.d.ts +174 -0
  52. package/dist/node/data.d.ts.map +1 -0
  53. package/dist/node/db.d.ts +65 -0
  54. package/dist/node/db.d.ts.map +1 -1
  55. package/dist/node/distillation.d.ts +49 -6
  56. package/dist/node/distillation.d.ts.map +1 -1
  57. package/dist/node/embedding-vendor.d.ts +66 -0
  58. package/dist/node/embedding-vendor.d.ts.map +1 -0
  59. package/dist/node/embedding-worker-types.d.ts +66 -0
  60. package/dist/node/embedding-worker-types.d.ts.map +1 -0
  61. package/dist/node/embedding-worker.d.ts +16 -0
  62. package/dist/node/embedding-worker.d.ts.map +1 -0
  63. package/dist/node/embedding-worker.js +100 -0
  64. package/dist/node/embedding-worker.js.map +7 -0
  65. package/dist/node/embedding.d.ts +91 -8
  66. package/dist/node/embedding.d.ts.map +1 -1
  67. package/dist/node/git.d.ts +47 -0
  68. package/dist/node/git.d.ts.map +1 -0
  69. package/dist/node/gradient.d.ts +19 -1
  70. package/dist/node/gradient.d.ts.map +1 -1
  71. package/dist/node/index.d.ts +9 -6
  72. package/dist/node/index.d.ts.map +1 -1
  73. package/dist/node/index.js +13029 -10885
  74. package/dist/node/index.js.map +4 -4
  75. package/dist/node/lat-reader.d.ts +1 -1
  76. package/dist/node/lat-reader.d.ts.map +1 -1
  77. package/dist/node/ltm.d.ts.map +1 -1
  78. package/dist/node/markdown.d.ts +11 -0
  79. package/dist/node/markdown.d.ts.map +1 -1
  80. package/dist/node/prompt.d.ts +1 -1
  81. package/dist/node/prompt.d.ts.map +1 -1
  82. package/dist/node/recall.d.ts +53 -0
  83. package/dist/node/recall.d.ts.map +1 -1
  84. package/dist/node/search.d.ts +29 -0
  85. package/dist/node/search.d.ts.map +1 -1
  86. package/dist/node/temporal.d.ts +2 -0
  87. package/dist/node/temporal.d.ts.map +1 -1
  88. package/dist/node/types.d.ts +15 -0
  89. package/dist/node/types.d.ts.map +1 -1
  90. package/dist/node/worker-model.d.ts +12 -9
  91. package/dist/node/worker-model.d.ts.map +1 -1
  92. package/dist/types/agents-file.d.ts +13 -1
  93. package/dist/types/agents-file.d.ts.map +1 -1
  94. package/dist/types/config.d.ts +20 -1
  95. package/dist/types/config.d.ts.map +1 -1
  96. package/dist/types/data.d.ts +174 -0
  97. package/dist/types/data.d.ts.map +1 -0
  98. package/dist/types/db.d.ts +65 -0
  99. package/dist/types/db.d.ts.map +1 -1
  100. package/dist/types/distillation.d.ts +49 -6
  101. package/dist/types/distillation.d.ts.map +1 -1
  102. package/dist/types/embedding-vendor.d.ts +66 -0
  103. package/dist/types/embedding-vendor.d.ts.map +1 -0
  104. package/dist/types/embedding-worker-types.d.ts +66 -0
  105. package/dist/types/embedding-worker-types.d.ts.map +1 -0
  106. package/dist/types/embedding-worker.d.ts +16 -0
  107. package/dist/types/embedding-worker.d.ts.map +1 -0
  108. package/dist/types/embedding.d.ts +91 -8
  109. package/dist/types/embedding.d.ts.map +1 -1
  110. package/dist/types/git.d.ts +47 -0
  111. package/dist/types/git.d.ts.map +1 -0
  112. package/dist/types/gradient.d.ts +19 -1
  113. package/dist/types/gradient.d.ts.map +1 -1
  114. package/dist/types/index.d.ts +9 -6
  115. package/dist/types/index.d.ts.map +1 -1
  116. package/dist/types/lat-reader.d.ts +1 -1
  117. package/dist/types/lat-reader.d.ts.map +1 -1
  118. package/dist/types/ltm.d.ts.map +1 -1
  119. package/dist/types/markdown.d.ts +11 -0
  120. package/dist/types/markdown.d.ts.map +1 -1
  121. package/dist/types/prompt.d.ts +1 -1
  122. package/dist/types/prompt.d.ts.map +1 -1
  123. package/dist/types/recall.d.ts +53 -0
  124. package/dist/types/recall.d.ts.map +1 -1
  125. package/dist/types/search.d.ts +29 -0
  126. package/dist/types/search.d.ts.map +1 -1
  127. package/dist/types/temporal.d.ts +2 -0
  128. package/dist/types/temporal.d.ts.map +1 -1
  129. package/dist/types/types.d.ts +15 -0
  130. package/dist/types/types.d.ts.map +1 -1
  131. package/dist/types/worker-model.d.ts +12 -9
  132. package/dist/types/worker-model.d.ts.map +1 -1
  133. package/package.json +5 -2
  134. package/src/agents-file.ts +87 -4
  135. package/src/config.ts +68 -5
  136. package/src/curator.ts +2 -2
  137. package/src/data.ts +768 -0
  138. package/src/db.ts +386 -7
  139. package/src/distillation.ts +178 -35
  140. package/src/embedding-vendor.ts +102 -0
  141. package/src/embedding-worker-types.ts +82 -0
  142. package/src/embedding-worker.ts +185 -0
  143. package/src/embedding.ts +607 -61
  144. package/src/git.ts +144 -0
  145. package/src/gradient.ts +174 -17
  146. package/src/index.ts +20 -0
  147. package/src/lat-reader.ts +5 -11
  148. package/src/ltm.ts +17 -44
  149. package/src/markdown.ts +15 -0
  150. package/src/prompt.ts +1 -2
  151. package/src/recall.ts +401 -70
  152. package/src/search.ts +71 -1
  153. package/src/temporal.ts +42 -35
  154. package/src/types.ts +15 -0
  155. package/src/worker-model.ts +14 -9
package/src/embedding.ts CHANGED
@@ -11,6 +11,12 @@
  import { db } from "./db";
  import { config } from "./config";
  import * as log from "./log";
+ import { isVendoredBinary, vendorModelInfo } from "./embedding-vendor";
+ import type {
+   WorkerInbound,
+   WorkerOutbound,
+   WorkerInitData,
+ } from "./embedding-worker-types";

  /** Timeout for embedding API fetch calls (ms). Prevents a hanging API from
  * blocking the recall tool indefinitely. 10s is generous for typical 100-500ms
@@ -136,6 +142,125 @@ class OpenAIProvider implements EmbeddingProvider {
  // Local provider (fastembed + ONNX Runtime)
  // ---------------------------------------------------------------------------

+ /**
+  * Thrown when `LocalProvider` is requested but `fastembed` cannot be loaded.
+  * `fastembed` is an optionalDependency of `@loreai/core`: if its postinstall
+  * fails (e.g. CUDA 13 hits the upstream `onnxruntime-node` bug — see #185),
+  * the package install still succeeds but local embeddings are disabled.
+  * Callers in `recall.ts` / `ltm.ts` / `distillation.ts` already gate on
+  * `isAvailable()`, which flips to `false` after this error fires once.
+  */
+ export class LocalProviderUnavailableError extends Error {
+   constructor(cause?: unknown) {
+     super(
+       "Local embedding provider unavailable: 'fastembed' is not installed. " +
+         "Configure search.embeddings.provider to 'voyage' or 'openai', or " +
+         "reinstall with ONNXRUNTIME_NODE_INSTALL_CUDA=skip to retry the optional fastembed install.",
+     );
+     this.name = "LocalProviderUnavailableError";
+     if (cause !== undefined) (this as Error & { cause?: unknown }).cause = cause;
+   }
+ }
+
+ /** Cache for the one-shot fastembed module-load probe. `fastembedProbed`
+  * flips true after the first import attempt; `fastembedModule` /
+  * `fastembedAvailable` record the outcome; `fastembedLogged` ensures the
+  * failure note is emitted only once. */
+ let fastembedModule: typeof import("fastembed") | null = null;
+ let fastembedProbed: boolean = false;
+ let fastembedAvailable: boolean = false;
+ let fastembedLogged: boolean = false;
+
+ /** For tests: reset the fastembed probe cache. */
+ export function _resetFastembedProbe(): void {
+   fastembedModule = null;
+   fastembedProbed = false;
+   fastembedAvailable = false;
+   fastembedLogged = false;
+ }
+
+ /** For tests: simulate fastembed being unresolvable, without mocking the
+  * dynamic import. After this call, `tryLoadFastembed()` short-circuits to
+  * `null` and `isAvailable()` returns false for the local provider. */
+ export function _markFastembedUnavailable(): void {
+   fastembedModule = null;
+   fastembedProbed = true;
+   fastembedAvailable = false;
+   fastembedLogged = true; // suppress the info log in tests
+ }
+
+ /**
+  * Probe `fastembed` once. Returns the module on success, `null` on failure.
+  * Logs an info-level note exactly once on the first failure so users know
+  * how to recover (switch provider, fix the install, or rely on the
+  * VOYAGE/OPENAI auto-fallback in `embed()`).
+  *
+  * In binary mode `import("fastembed")` resolves to the bundle Bun packed
+  * at compile time (the binary's wrapper has already preloaded the
+  * side-load `libonnxruntime` lib so the addon's dlopen succeeds). In
+  * npm mode it goes through standard module resolution and may fail if
+  * the optional postinstall didn't run.
+  */
+ async function tryLoadFastembed(): Promise<typeof import("fastembed") | null> {
+   if (fastembedProbed) return fastembedAvailable ? fastembedModule : null;
+   try {
+     const mod = await loadFastembedModule();
+     // Re-check after the async boundary: another caller (e.g. a test helper
+     // like _markFastembedUnavailable) may have set the probe while we were
+     // awaiting. Their decision takes priority — don't overwrite it.
+     if (fastembedProbed) return fastembedAvailable ? fastembedModule : null;
+     fastembedModule = mod;
+     fastembedAvailable = true;
+   } catch (err) {
+     if (fastembedProbed) return fastembedAvailable ? fastembedModule : null;
+     fastembedAvailable = false;
+     if (!fastembedLogged) {
+       fastembedLogged = true;
+       const msg = err instanceof Error ? err.message : String(err);
+       // Binary mode: a load failure here is a real bug (everything was
+       // bundled at build time). npm mode: the optional dep didn't
+       // install — point the user at the standard recovery options.
+       const remediation = isVendoredBinary()
+         ? "this is a bug in the lore binary; please file an issue. " +
+           "Set VOYAGE_API_KEY/OPENAI_API_KEY for automatic remote fallback in the meantime"
+         : "set search.embeddings.provider to 'voyage' or 'openai', " +
+           "set VOYAGE_API_KEY/OPENAI_API_KEY for automatic remote fallback, " +
+           "or reinstall fastembed with ONNXRUNTIME_NODE_INSTALL_CUDA=skip";
+       log.info(
+         `local embedding provider unavailable (fastembed not installed: ${msg}) — ${remediation}`,
+       );
+     }
+   } finally {
+     fastembedProbed = true;
+   }
+   return fastembedAvailable ? fastembedModule : null;
+ }
+
+ /**
+  * Resolve and import the fastembed module.
+  *
+  * One bare import covers both modes:
+  *
+  * - Binary mode: `bun build --compile` resolves "fastembed" against the
+  *   per-target staging `node_modules/` at build time and bundles it
+  *   (plus its transitive deps and `.node` addons) into the binary. The
+  *   side-load `libonnxruntime.so.1` / `.dylib` / `.dll` is preloaded
+  *   by the binary's wrapper before this import evaluates, so the
+  *   bundled `onnxruntime_binding.node`'s dlopen finds the cached
+  *   handle instead of failing with "shared object not found".
+  *
+  * - npm mode: standard Node/Bun resolution — works for `@loreai/core`
+  *   consumers whose `npm install` cleanly installed the optional dep.
+  *   If the postinstall failed (CUDA-13 hosts), the import throws here
+  *   and the caller logs + falls back to a remote provider.
+  */
+ async function loadFastembedModule(): Promise<typeof import("fastembed")> {
+   return (await import("fastembed")) as typeof import("fastembed");
+ }
+
+ /** True iff the fastembed probe has run and reported the module missing. */
+ function fastembedKnownUnavailable(): boolean {
+   return fastembedProbed && !fastembedAvailable;
+ }
+
  /**
  * Local embedding provider using fastembed (bge-small-en-v1.5 by default).
  *
@@ -143,61 +268,236 @@ class OpenAIProvider implements EmbeddingProvider {
  * Model files are downloaded on first use (~33MB) and cached in
  * `~/.cache/fastembed`. Subsequent inits load from disk in ~350ms.
  *
+ * ONNX inference runs in a dedicated `node:worker_threads` Worker so the
+ * main thread's event loop stays free. This class is a thin RPC client —
+ * it posts `{ texts, inputType }` to the worker and awaits a reply.
+ * The worker owns the `FlagEmbedding` model and processes requests
+ * sequentially from a priority queue (recall queries jump ahead of
+ * backfill batches).
+ *
  * Uses dynamic import so the module is only loaded when the "local"
  * provider is actually selected — avoids startup cost and allows
- * graceful fallback if fastembed is not installed.
+ * graceful fallback when the optional `fastembed` peer isn't installed
+ * (its native onnxruntime-node may fail to build, e.g. on CUDA 13).
  */
  class LocalProvider implements EmbeddingProvider {
+   // With inference off the main thread, large batches no longer block
+   // the event loop. 256 maximises throughput per round-trip to the
+   // worker. Backfill callers use a smaller BACKFILL_CHUNK_SIZE to give
+   // the worker's priority queue breathing room for recall queries.
    readonly maxBatchSize = 256;
-   private model: unknown | null = null;
-   private initPromise: Promise<unknown> | null = null;
+
+   private worker: import("node:worker_threads").Worker | null = null;
+   private workerReady = false;
+   private workerInitError: string | null = null;
+   private pendingRequests = new Map<
+     number,
+     { resolve: (vectors: Float32Array[]) => void; reject: (error: Error) => void }
+   >();
+   private nextRequestId = 0;
+   private initPromise: Promise<void> | null = null;
    private modelName: string;

    constructor(modelName: string) {
      this.modelName = modelName;
    }

-   private async getModel(): Promise<unknown> {
-     if (this.model) return this.model;
-     if (!this.initPromise) {
-       this.initPromise = (async () => {
-         const { EmbeddingModel, FlagEmbedding } = await import("fastembed");
-         // Map config model string to EmbeddingModel enum value.
-         // If the configured model matches an enum key, use it; otherwise try
-         // the raw string as a model name (CUSTOM model support in fastembed).
-         const enumValue = (EmbeddingModel as Record<string, string>)[this.modelName];
-         // fastembed's init() has overloaded signatures expecting specific enum
-         // members, but we resolve the model dynamically from config. The enum
-         // lookup guarantees a valid value at runtime; cast to satisfy the type.
-         const m = await FlagEmbedding.init({
-           model: enumValue ?? this.modelName,
-         } as { model: typeof EmbeddingModel.BGESmallENV15 });
-         this.model = m;
-         return m;
-       })();
-     }
+   /**
+    * Ensure the worker thread is running. Probes fastembed on the main
+    * thread first (fast, cached) as a fast-fail gate — the worker is only
+    * spawned if the module is known-loadable. Worker startup failure is
+    * surfaced as `LocalProviderUnavailableError` to trigger the existing
+    * auto-fallback to remote providers.
+    */
+   private async ensureWorker(): Promise<void> {
+     if (this.workerReady) return;
+     if (this.workerInitError) throw new LocalProviderUnavailableError(this.workerInitError);
+     if (this.initPromise) return this.initPromise;
+
+     this.initPromise = (async () => {
+       // Fast-fail: probe fastembed on the main thread. This is cached
+       // after the first call and preserves the existing error flow.
+       const fastembed = await tryLoadFastembed();
+       if (!fastembed) throw new LocalProviderUnavailableError();
+
+       const { Worker } = await import("node:worker_threads");
+
+       // Resolve the worker script path.
+       //
+       // In vendored binary mode: the compiled binary's wrapper.ts detects
+       // `!isMainThread` and runs the embedding worker code path. We spawn
+       // the Worker with the wrapper's own `import.meta.url` (registered as
+       // __LORE_VENDOR_WORKER_URL__). This avoids needing a separate worker
+       // entrypoint — Bun's --compile silently drops additional entrypoints
+       // on macOS and Windows.
+       //
+       // In dev (Bun running .ts directly): embedding-worker.ts
+       // In dist (esbuild bundle): embedding-worker.js
+       const vendorWorkerUrl = (globalThis as Record<string, unknown>).__LORE_VENDOR_WORKER_URL__ as string | undefined;
+       let workerUrl: string | URL;
+       if (vendorWorkerUrl) {
+         if (process.platform === "win32") {
+           // On Windows, new Worker() with a file:// URL pointing to $bunfs
+           // fails with ENOENT (Bun bug). Extract the raw path instead
+           // (B:\~BUN\root\...); on macOS/Linux the file:// URL works fine.
+           // URL.pathname keeps %7E encoded; decodeURIComponent restores ~.
+           workerUrl = decodeURIComponent(new URL(vendorWorkerUrl).pathname);
+           // URL.pathname on Windows: /B:/~BUN/root/wrapper.js → strip leading /
+           if (/^\/[A-Za-z]:/.test(workerUrl)) {
+             workerUrl = workerUrl.slice(1);
+           }
+         } else {
+           workerUrl = vendorWorkerUrl;
+         }
+       } else {
+         workerUrl = new URL(`./embedding-worker${import.meta.url.endsWith(".ts") ? ".ts" : ".js"}`, import.meta.url);
+       }
+
+       const vendor = vendorModelInfo();
+       const workerInitData: WorkerInitData = {
+         modelName: this.modelName,
+         vendorModel: vendor
+           ? { modelAbsoluteDirPath: vendor.modelAbsoluteDirPath, modelName: vendor.modelName }
+           : null,
+       };
+
+       this.worker = new Worker(workerUrl, { workerData: workerInitData });
+
+       // Don't let the worker prevent process exit.
+       this.worker.unref();
+
+       // Wire up response handler.
+       this.worker.on("message", (msg: WorkerOutbound) => {
+         switch (msg.type) {
+           case "result": {
+             const pending = this.pendingRequests.get(msg.id);
+             if (pending) {
+               this.pendingRequests.delete(msg.id);
+               this.updateWorkerRef();
+               pending.resolve(msg.vectors);
+             }
+             break;
+           }
+           case "error": {
+             const pending = this.pendingRequests.get(msg.id);
+             if (pending) {
+               this.pendingRequests.delete(msg.id);
+               this.updateWorkerRef();
+               pending.reject(new Error(`Worker embedding failed: ${msg.error}`));
+             }
+             break;
+           }
+           case "init-error": {
+             // Model init failed inside the worker — surface as
+             // LocalProviderUnavailableError on all pending + future requests.
+             this.workerInitError = msg.error;
+             this.workerReady = false;
+             for (const [, p] of this.pendingRequests) {
+               p.reject(new LocalProviderUnavailableError(msg.error));
+             }
+             this.pendingRequests.clear();
+             this.updateWorkerRef();
+             break;
+           }
+         }
+       });
+
+       // Worker crash / exit — reject all in-flight requests.
+       this.worker.on("error", (err: Error) => {
+         this.workerInitError = err.message;
+         this.workerReady = false;
+         for (const [, p] of this.pendingRequests) {
+           p.reject(new LocalProviderUnavailableError(err));
+         }
+         this.pendingRequests.clear();
+         this.updateWorkerRef();
+       });
+
+       this.worker.on("exit", (code) => {
+         if (code !== 0 && !this.workerInitError) {
+           this.workerInitError = `embedding worker exited with code ${code}`;
+         }
+         this.workerReady = false;
+         for (const [, p] of this.pendingRequests) {
+           p.reject(
+             new LocalProviderUnavailableError(this.workerInitError ?? "embedding worker exited"),
+           );
+         }
+         this.pendingRequests.clear();
+         this.updateWorkerRef();
+       });
+
+       this.workerReady = true;
+     })().catch((err) => {
+       this.initPromise = null; // allow retry
+       throw err;
+     });
+
      return this.initPromise;
    }

+   /** Keep the worker ref'd while requests are in flight so the event loop
+    * doesn't exit before responses arrive. When the pending map drains,
+    * unref again so the worker doesn't prevent graceful process exit. */
+   private updateWorkerRef(): void {
+     if (!this.worker) return;
+     if (this.pendingRequests.size > 0) {
+       this.worker.ref();
+     } else {
+       this.worker.unref();
+     }
+   }
+
    async embed(texts: string[], inputType: "document" | "query"): Promise<Float32Array[]> {
-     const model = (await this.getModel()) as {
-       queryEmbed(text: string): Promise<number[]>;
-       passageEmbed(texts: string[], batchSize?: number): AsyncGenerator<number[][]>;
-     };
+     await this.ensureWorker();
+
+     const id = this.nextRequestId++;
+     // Recall queries (single query-type texts) get high priority so they
+     // jump ahead of any queued backfill batches in the worker.
+     const priority = inputType === "query" && texts.length === 1 ? "high" : "normal";
+
+     return new Promise<Float32Array[]>((resolve, reject) => {
+       this.pendingRequests.set(id, { resolve, reject });
+       this.updateWorkerRef();
+       this.worker!.postMessage({
+         type: "embed",
+         id,
+         texts,
+         inputType,
+         priority,
+       } satisfies WorkerInbound);
+     });
+   }

-     if (inputType === "query" && texts.length === 1) {
-       const vec = await model.queryEmbed(texts[0]);
-       return [new Float32Array(vec)];
+   /** Shut down the worker thread. Called by `resetProvider()` on config change.
+    * Sends a shutdown message so the worker calls `process.exit(0)` internally.
+    * We avoid `worker.terminate()` because Bun's forced termination triggers a
+    * NAPI fatal error when tearing down onnxruntime's native bindings.
+    *
+    * Returns a promise that resolves once the worker has fully exited. Callers
+    * that need a clean teardown (tests, config change) should await the result.
+    * Fire-and-forget callers (process exit) can ignore it. */
+   shutdown(): Promise<void> {
+     if (!this.worker) return Promise.resolve();
+
+     const worker = this.worker;
+     this.worker = null;
+     this.workerReady = false;
+     this.workerInitError = null;
+     this.initPromise = null;
+
+     // Reject any in-flight requests.
+     for (const [, p] of this.pendingRequests) {
+       p.reject(new Error("embedding worker shut down"));
      }
+     this.pendingRequests.clear();

-     // passageEmbed returns an async generator of batches
-     const results: Float32Array[] = [];
-     for await (const batch of model.passageEmbed(texts)) {
-       for (const vec of batch) {
-         results.push(new Float32Array(vec));
-       }
-     }
-     return results;
+     return new Promise<void>((resolve) => {
+       worker.on("exit", () => resolve());
+       worker.postMessage({ type: "shutdown" } satisfies WorkerInbound);
+     });
    }
  }

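The worker half of this RPC lives in the new `embedding-worker.ts` (listed in the file table above but not shown in this hunk). A rough sketch of the message loop the client protocol implies, assuming `WorkerInbound` is the embed/shutdown union used above; the queue and model internals are illustrative, not the package's actual implementation:

    import { parentPort } from "node:worker_threads";
    import type { WorkerInbound, WorkerOutbound } from "./embedding-worker-types";

    type EmbedRequest = Extract<WorkerInbound, { type: "embed" }>;

    const queue: EmbedRequest[] = [];
    let draining = false;

    parentPort!.on("message", (msg: WorkerInbound) => {
      if (msg.type === "shutdown") process.exit(0);
      // "high" priority (single recall queries) jumps the backfill backlog.
      if (msg.priority === "high") queue.unshift(msg);
      else queue.push(msg);
      void drain();
    });

    // Drain sequentially: one in-flight model call at a time.
    async function drain(): Promise<void> {
      if (draining) return;
      draining = true;
      while (queue.length > 0) {
        const req = queue.shift()!;
        try {
          const vectors = await runModel(req.texts, req.inputType);
          parentPort!.postMessage({ type: "result", id: req.id, vectors } satisfies WorkerOutbound);
        } catch (err) {
          parentPort!.postMessage({ type: "error", id: req.id, error: String(err) } satisfies WorkerOutbound);
        }
      }
      draining = false;
    }

    // Placeholder: the real worker wraps FlagEmbedding's queryEmbed/passageEmbed.
    async function runModel(texts: string[], inputType: string): Promise<Float32Array[]> {
      return texts.map(() => new Float32Array(384)); // bge-small-en-v1.5 is 384-dim
    }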
@@ -239,12 +539,12 @@ function getProvider(): EmbeddingProvider | null {

  switch (providerName) {
    case "local": {
-     try {
-       cachedProvider = new LocalProvider(model);
-     } catch {
-       log.info("local embedding provider unavailable (fastembed not installed)");
-       cachedProvider = null;
-     }
+     // `fastembed` is an optionalDependency. We construct the provider
+     // optimistically here; the import + ONNX init happens lazily in
+     // `LocalProvider.ensureWorker()`, which throws `LocalProviderUnavailableError`
+     // if the optional dep isn't installed. After that first failure
+     // `isAvailable()` short-circuits to false and callers fall back to FTS.
+     cachedProvider = new LocalProvider(model);
      break;
    }
    case "voyage": {
@@ -273,9 +573,92 @@ function getProvider(): EmbeddingProvider | null {
    return cachedProvider;
  }

- /** Reset cached provider — called when config changes. */
- export function resetProvider(): void {
+ /** Reset cached provider — called when config changes.
+  * Shuts down the worker thread if the current provider is a LocalProvider.
+  * Returns a promise that resolves once any worker has fully exited.
+  * Callers that need clean teardown (tests) should await the result. */
+ export function resetProvider(): Promise<void> {
+   let shutdownPromise: Promise<void> = Promise.resolve();
+   if (cachedProvider instanceof LocalProvider) {
+     shutdownPromise = cachedProvider.shutdown();
+   }
    cachedProvider = undefined;
+   remoteFallbackLogged = false;
+   return shutdownPromise;
+ }
+
+ /** Shut down the current provider and prevent any new provider from being
+  * created. After this call, `embed()` throws and `isAvailable()` returns
+  * false. Test-only: prevents fire-and-forget embeds (queued by other test
+  * files) from spawning a new worker after cleanup. */
+ export function _shutdownAndDisable(): Promise<void> {
+   let shutdownPromise: Promise<void> = Promise.resolve();
+   if (cachedProvider instanceof LocalProvider) {
+     shutdownPromise = cachedProvider.shutdown();
+   }
+   cachedProvider = null; // null (not undefined) → getProvider() returns null, won't create new
+   remoteFallbackLogged = false;
+   return shutdownPromise;
+ }
+
+ /** Save the current cached provider reference (including the live worker)
+  * and clear the cache so the next `getProvider()` call creates a fresh one.
+  * Returns an opaque token that must be passed to `_restoreProvider()` to
+  * put the original provider back — without this, the worker is orphaned and
+  * a second ONNX load in the same Bun process will crash.
+  *
+  * Test-only helper: lets suites temporarily swap in a mock/unavailable
+  * provider without killing the real worker. */
+ export function _saveAndClearProvider(): unknown {
+   const saved = { provider: cachedProvider, remoteFallbackLogged };
+   cachedProvider = undefined;
+   remoteFallbackLogged = false;
+   return saved;
+ }
+
+ /** Restore a provider previously saved by `_saveAndClearProvider()`. Any
+  * provider created between save and restore is discarded (callers must
+  * ensure it's not a LocalProvider with a live worker — those suites only
+  * use `_markFastembedUnavailable()` so no worker is spawned). */
+ export function _restoreProvider(token: unknown): void {
+   const saved = token as { provider: EmbeddingProvider | null | undefined; remoteFallbackLogged: boolean };
+   cachedProvider = saved.provider;
+   remoteFallbackLogged = saved.remoteFallbackLogged;
+ }
+
+ /** True once we've logged an auto-fallback notice this process — keeps the
+  * one-line warning from spamming on every fire-and-forget embed call. */
+ let remoteFallbackLogged = false;
+
+ /**
+  * Build a remote `EmbeddingProvider` from whichever API key is in env.
+  * Returns `null` when neither `VOYAGE_API_KEY` nor `OPENAI_API_KEY` is set,
+  * which is the signal for callers to fall through to FTS-only behaviour.
+  *
+  * Voyage wins ties because it's the higher-quality option for code search;
+  * users who want OpenAI specifically can pin `search.embeddings.provider`
+  * in `.lore.json` and skip the fallback path entirely.
+  */
+ export function pickRemoteFallback(): {
+   name: "voyage" | "openai";
+   provider: EmbeddingProvider;
+ } | null {
+   if (process.env.VOYAGE_API_KEY) {
+     const d = PROVIDER_DEFAULTS.voyage;
+     return {
+       name: "voyage",
+       provider: new VoyageProvider(process.env.VOYAGE_API_KEY, d.model, d.dimensions),
+     };
+   }
+   if (process.env.OPENAI_API_KEY) {
+     const d = PROVIDER_DEFAULTS.openai;
+     return {
+       name: "openai",
+       provider: new OpenAIProvider(process.env.OPENAI_API_KEY, d.model, d.dimensions),
+     };
+   }
+   return null;
  }

  // ---------------------------------------------------------------------------
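Because `resetProvider()` now returns the worker's shutdown promise, test suites that cycle providers should await it; a sketch of the intended teardown (the `bun:test` hook placement is illustrative):

    import { afterAll } from "bun:test";
    import { resetProvider } from "./embedding";

    afterAll(async () => {
      // Wait for the embedding worker to fully exit; per the notes above,
      // a second ONNX load in the same Bun process can crash if the old
      // worker is still alive.
      await resetProvider();
    });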
@@ -284,9 +667,16 @@ export function resetProvider(): void {

  /** Returns true if embedding is available.
  * Active when the configured provider's API key is set, unless explicitly
- * disabled via `search.embeddings.enabled: false` in .lore.json. */
+ * disabled via `search.embeddings.enabled: false` in .lore.json.
+ *
+ * For the `local` provider, also returns false once we've discovered the
+ * optional `fastembed` peer is missing — callers (recall, ltm, distillation)
+ * use this gate to skip embedding work and fall back to FTS-only search. */
  export function isAvailable(): boolean {
-   return getProvider() !== null;
+   const provider = getProvider();
+   if (!provider) return false;
+   if (provider instanceof LocalProvider && fastembedKnownUnavailable()) return false;
+   return true;
  }

  // ---------------------------------------------------------------------------
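The availability gate above pairs with the error class from earlier in this diff. A minimal sketch of how code inside the package can consume these exports, treating a missing local provider as a soft failure; `tryEmbed` is a hypothetical caller, not part of the package:

    import { embed, isAvailable, LocalProviderUnavailableError } from "./embedding";

    // Hypothetical caller, mirroring the recall.ts gating described above:
    // fall back to FTS-only search (return null) instead of throwing when
    // local embeddings are unavailable.
    async function tryEmbed(texts: string[]): Promise<Float32Array[] | null> {
      if (!isAvailable()) return null;
      try {
        return await embed(texts, "document");
      } catch (err) {
        if (err instanceof LocalProviderUnavailableError) return null;
        throw err; // a real API error, surface it to the caller
      }
    }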
@@ -296,10 +686,18 @@ export function isAvailable(): boolean {
  /**
  * Generate embeddings for the given texts using the configured provider.
  *
+ * If the configured provider is `local` and `fastembed` turns out to be
+ * unavailable at runtime (failed install, vendor extraction blocked, etc.),
+ * automatically swap to a remote provider when `VOYAGE_API_KEY` or
+ * `OPENAI_API_KEY` is set in env. The swap is permanent for the rest of
+ * the process — `cachedProvider` is replaced so subsequent calls skip the
+ * local-then-fail path.
+ *
  * @param texts Array of texts to embed
  * @param inputType "document" for storage, "query" for search
  * @returns Float32Array per input text
- * @throws On API errors or missing provider
+ * @throws On API errors or when no provider (local or remote) is
+ *         available
  */
  export async function embed(
    texts: string[],
@@ -307,7 +705,26 @@
  ): Promise<Float32Array[]> {
    const provider = getProvider();
    if (!provider) throw new Error("No embedding provider available");
-   return provider.embed(texts, inputType);
+
+   try {
+     return await provider.embed(texts, inputType);
+   } catch (err) {
+     if (!(err instanceof LocalProviderUnavailableError)) throw err;
+
+     const fallback = pickRemoteFallback();
+     if (!fallback) throw err;
+
+     if (!remoteFallbackLogged) {
+       remoteFallbackLogged = true;
+       log.info(
+         `fastembed unavailable; auto-switching to ${fallback.name} ` +
+           `(set search.embeddings.provider in .lore.json to silence this)`,
+       );
+     }
+
+     cachedProvider = fallback.provider;
+     return fallback.provider.embed(texts, inputType);
+   }
  }

  // ---------------------------------------------------------------------------
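The fallback log above tells users to pin a provider in `.lore.json`. From the dotted keys used throughout this file (`search.embeddings.provider`, `search.embeddings.enabled`), the stanza would plausibly look like this; the authoritative schema is in `config.ts`, which also changed in this release:

    {
      "search": {
        "embeddings": {
          "provider": "voyage",
          "enabled": true
        }
      }
    }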
@@ -455,6 +872,71 @@ export function embedDistillation(
    });
  }

+ /**
+  * Embed a temporal message and store the result in the DB.
+  * Fire-and-forget — errors are logged, never thrown.
+  * Only called for undistilled messages; once distilled, the embedding
+  * is NULLed (semantic content captured by the distillation embedding).
+  */
+ export function embedTemporalMessage(
+   id: string,
+   content: string,
+ ): void {
+   // Skip very short messages — they don't carry enough semantic signal
+   // to be useful in vector search and would waste embedding capacity.
+   if (content.length < 50) return;
+
+   embed([content], "document")
+     .then(([vec]) => {
+       db()
+         .query("UPDATE temporal_messages SET embedding = ? WHERE id = ?")
+         .run(toBlob(vec), id);
+     })
+     .catch((err) => {
+       log.info("embedding failed for temporal message", id, ":", err);
+     });
+ }
+
+ // ---------------------------------------------------------------------------
+ // Vector search — temporal messages (undistilled only)
+ // ---------------------------------------------------------------------------
+
+ /**
+  * Search undistilled temporal messages with embeddings by cosine similarity.
+  * Returns top-k entries sorted by similarity descending.
+  *
+  * Only scans undistilled messages (distilled=0) — once a message is
+  * distilled, its semantic content is captured by the distillation
+  * embedding and the temporal embedding is cleared.
+  *
+  * Scoped to a single project. Optionally scoped to a single session.
+  */
+ export function vectorSearchTemporal(
+   queryEmbedding: Float32Array,
+   projectId: string,
+   limit = 10,
+   sessionId?: string,
+ ): VectorHit[] {
+   const sql = sessionId
+     ? "SELECT id, embedding FROM temporal_messages WHERE embedding IS NOT NULL AND distilled = 0 AND project_id = ? AND session_id = ?"
+     : "SELECT id, embedding FROM temporal_messages WHERE embedding IS NOT NULL AND distilled = 0 AND project_id = ?";
+   const params = sessionId ? [projectId, sessionId] : [projectId];
+
+   const rows = db()
+     .query(sql)
+     .all(...params) as Array<{ id: string; embedding: Buffer }>;
+
+   const scored: VectorHit[] = [];
+   for (const row of rows) {
+     const vec = fromBlob(row.embedding);
+     const sim = cosineSimilarity(queryEmbedding, vec);
+     scored.push({ id: row.id, similarity: sim });
+   }
+
+   scored.sort((a, b) => b.similarity - a.similarity);
+   return scored.slice(0, limit);
+ }
+
  // ---------------------------------------------------------------------------
  // Config change detection
  // ---------------------------------------------------------------------------
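`vectorSearchTemporal` leans on `toBlob`, `fromBlob`, and `cosineSimilarity`, helpers defined earlier in `embedding.ts` outside the hunks shown here. For readers of the diff, a conventional sketch of such helpers; the package's actual implementations may differ:

    // Float32Array ↔ SQLite BLOB round-trip. A robust version may need to
    // copy when the incoming Buffer is not 4-byte aligned.
    function toBlob(vec: Float32Array): Buffer {
      return Buffer.from(vec.buffer, vec.byteOffset, vec.byteLength);
    }

    function fromBlob(blob: Buffer): Float32Array {
      return new Float32Array(blob.buffer, blob.byteOffset, blob.byteLength / 4);
    }

    // Standard cosine similarity: dot(a, b) / (|a|·|b|).
    function cosineSimilarity(a: Float32Array, b: Float32Array): number {
      let dot = 0;
      let normA = 0;
      let normB = 0;
      for (let i = 0; i < a.length; i++) {
        dot += a[i] * b[i];
        normA += a[i] * a[i];
        normB += b[i] * b[i];
      }
      const denom = Math.sqrt(normA) * Math.sqrt(normB);
      return denom === 0 ? 0 : dot / denom;
    }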
@@ -488,7 +970,7 @@ export function checkConfigChange(): boolean {

  if (stored && stored.value === current) return false;

- // Config changed (or first run) — clear all embeddings in both tables
+ // Config changed (or first run) — clear all embeddings in all tables
  if (stored) {
    const knowledgeCount = db()
      .query("SELECT COUNT(*) as n FROM knowledge WHERE embedding IS NOT NULL")
@@ -496,10 +978,14 @@
    const distillCount = db()
      .query("SELECT COUNT(*) as n FROM distillations WHERE embedding IS NOT NULL")
      .get() as { n: number };
-   const total = knowledgeCount.n + distillCount.n;
+   const temporalCount = db()
+     .query("SELECT COUNT(*) as n FROM temporal_messages WHERE embedding IS NOT NULL")
+     .get() as { n: number };
+   const total = knowledgeCount.n + distillCount.n + temporalCount.n;
    if (total > 0) {
      db().query("UPDATE knowledge SET embedding = NULL").run();
      db().query("UPDATE distillations SET embedding = NULL").run();
+     db().query("UPDATE temporal_messages SET embedding = NULL").run();
      log.info(
        `embedding config changed (${stored.value} → ${current}), cleared ${total} stale embeddings`,
      );
@@ -520,15 +1006,26 @@
  // Startup backfill — single entry point for all hosts
  // ---------------------------------------------------------------------------

+ /**
+  * Delay before the startup backfill begins, so the host's HTTP server has
+  * a clear window to answer the first wave of requests (web UI shell load,
+  * terminal session-connect handshake) before the embedding worker starts
+  * competing for CPU. With inference off the main thread the event loop
+  * isn't blocked, but the worker still consumes a CPU core — a short delay
+  * avoids contention during the first-connect burst.
+  */
+ const STARTUP_BACKFILL_DELAY_MS = 2_000;
+
  /**
  * Run all embedding backfills and log coverage stats.
  *
  * This is the canonical entry point that every host adapter (OpenCode, Pi,
  * future ACP) should call once during init. It:
- * 1. Detects config changes (provider swap) and clears stale embeddings
- * 2. Backfills knowledge entries missing embeddings
- * 3. Backfills non-archived distillations missing embeddings
- * 4. Logs a one-line coverage summary to stderr (always visible, not gated)
+ * 1. Waits a short grace period so first-connect HTTP requests can finish
+ * 2. Detects config changes (provider swap) and clears stale embeddings
+ * 3. Backfills knowledge entries missing embeddings
+ * 4. Backfills non-archived distillations missing embeddings
+ * 5. Logs a one-line coverage summary to stderr (always visible, not gated)
  *
  * Fire-and-forget: callers should `.catch()` — embedding failures must not
  * block plugin initialization.
@@ -536,6 +1033,34 @@ export function checkConfigChange(): boolean {
  export async function runStartupBackfill(): Promise<void> {
    if (!isAvailable()) return;

+   // Surface backlog up-front so a slow startup is self-explanatory in logs.
+   // Counts use the same predicates the backfill loops use, so the two
+   // numbers always match what we're about to do.
+   const pendingKnowledge = (
+     db()
+       .query(
+         "SELECT COUNT(*) as n FROM knowledge WHERE embedding IS NULL AND confidence > 0.2",
+       )
+       .get() as { n: number }
+   ).n;
+   const pendingDistillations = (
+     db()
+       .query(
+         "SELECT COUNT(*) as n FROM distillations WHERE embedding IS NULL AND archived = 0 AND observations != ''",
+       )
+       .get() as { n: number }
+   ).n;
+
+   if (pendingKnowledge + pendingDistillations > 0) {
+     log.info(
+       `embedding backfill scheduled: ${pendingKnowledge} knowledge + ` +
+         `${pendingDistillations} distillations pending — starting in ` +
+         `${STARTUP_BACKFILL_DELAY_MS / 1000}s, batches yield between calls ` +
+         `(host stays responsive)`,
+     );
+     await new Promise<void>((r) => setTimeout(r, STARTUP_BACKFILL_DELAY_MS));
+   }
+
    const knowledgeEmbedded = await backfillEmbeddings();
    const distillationEmbedded = await backfillDistillationEmbeddings();

@@ -581,6 +1106,16 @@ export async function runStartupBackfill(): Promise<void> {
  // Backfill — knowledge
  // ---------------------------------------------------------------------------

+ /**
+  * Chunk size for backfill embed requests. Each chunk becomes a separate
+  * message to the embedding worker. Keeping chunks small (32) gives the
+  * worker's priority queue natural gaps to interleave high-priority recall
+  * queries between backfill batches. The provider's `maxBatchSize` (256)
+  * is the upper limit for any single embed call; this is intentionally
+  * smaller for backfill-vs-live interleaving.
+  */
+ const BACKFILL_CHUNK_SIZE = 32;
+
  /**
  * Embed all knowledge entries that are missing embeddings.
  * Called by `runStartupBackfill()`.
@@ -601,11 +1136,10 @@ export async function backfillEmbeddings(): Promise<number> {

    if (!rows.length) return 0;

-   const batchSize = provider.maxBatchSize;
    let embedded = 0;

-   for (let i = 0; i < rows.length; i += batchSize) {
-     const batch = rows.slice(i, i + batchSize);
+   for (let i = 0; i < rows.length; i += BACKFILL_CHUNK_SIZE) {
+     const batch = rows.slice(i, i + BACKFILL_CHUNK_SIZE);
      const texts = batch.map((r) => `${r.title}\n${r.content}`);

      try {
@@ -621,6 +1155,7 @@ export async function backfillEmbeddings(): Promise<number> {
      } catch (err) {
        log.info(`embedding backfill batch ${i}-${i + batch.length} failed:`, err);
      }
+     // No yieldToEventLoop() needed — embed() is truly async (worker thread).
    }

    if (embedded > 0) {
@@ -650,11 +1185,16 @@ export async function backfillDistillationEmbeddings(): Promise<number> {

    if (!rows.length) return 0;

-   const batchSize = provider.maxBatchSize;
    let embedded = 0;

-   for (let i = 0; i < rows.length; i += batchSize) {
-     const batch = rows.slice(i, i + batchSize);
+   // Progress logging: heartbeat every PROGRESS_INTERVAL embedded so a long
+   // backfill (e.g. 1000+ pending after a fastembed reinstall) doesn't look
+   // like a silent hang. Without this, only the final tally was logged.
+   const PROGRESS_INTERVAL = 256;
+   let nextProgressAt = PROGRESS_INTERVAL;
+
+   for (let i = 0; i < rows.length; i += BACKFILL_CHUNK_SIZE) {
+     const batch = rows.slice(i, i + BACKFILL_CHUNK_SIZE);
      const texts = batch.map((r) => r.observations);

      try {
@@ -670,6 +1210,12 @@ export async function backfillDistillationEmbeddings(): Promise<number> {
      } catch (err) {
        log.info(`distillation embedding backfill batch ${i}-${i + batch.length} failed:`, err);
      }
+
+     if (embedded >= nextProgressAt) {
+       log.info(`embedding distillations: ${embedded}/${rows.length}…`);
+       nextProgressAt = embedded + PROGRESS_INTERVAL;
+     }
+     // No yieldToEventLoop() needed — embed() is truly async (worker thread).
    }

    if (embedded > 0) {