membot 0.7.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,391 @@
1
+ import type { Subprocess } from "bun";
2
+ import { EMBED_WORKER_SENTINEL, EMBEDDING_BATCH_SIZE, EMBEDDING_MODEL } from "../constants.ts";
3
+ import { asHelpful, HelpfulError } from "../errors.ts";
4
+ import { logger } from "../output/logger.ts";
5
+ import { type EmbedOptions, setEmbedderPool } from "./embedder.ts";
6
+
7
/**
 * Bookkeeping for the one request a worker currently has in flight: the id
 * that was written to the worker plus the two callbacks that settle the
 * caller's promise.
 */
interface PendingRequest {
  /** Request id sent to the worker in the `embed` message. */
  id: number;
  /** Settle the caller's promise with the vectors the worker produced. */
  resolve: (vectors: number[][]) => void;
  /** Fail the caller's promise (worker error, worker exit, or pool disposal). */
  reject: (err: unknown) => void;
}
12
+
13
/** Pool-side state tracked for one embed-worker subprocess. */
interface Worker {
  /** The subprocess handle: stdin and stdout are piped, stderr is inherited. */
  proc: Subprocess<"pipe", "pipe", "inherit">;
  /** True while the worker is checked out by a caller. */
  busy: boolean;
  /** The request currently awaiting a response, or `null` when there is none. */
  pending: PendingRequest | null;
  /** Set once the subprocess has exited; exited workers are never handed out again. */
  exited: boolean;
}
19
+
20
/**
 * Shape of one newline-delimited JSON message a worker writes to stdout in
 * reply to an `embed` request. A reply carries either `vectors` or `error`.
 */
interface EmbedResponseLine {
  /** Message discriminator; lines with any other `type` are ignored by the pool. */
  type: "embed-response";
  /** Id of the request this reply belongs to. */
  id: number;
  /** The embedding vectors, present on success. */
  vectors?: number[][];
  /** Failure details reported by the worker, present on error. */
  error?: { kind: string; message: string; hint: string };
}
26
+
27
/** A caller parked in `acquire()` until a worker frees up or the pool can no longer serve it. */
interface AcquireWaiter {
  resolve: (w: Worker) => void;
  reject: (err: unknown) => void;
}

/**
 * A short-lived pool of embed-worker subprocesses. Created at the start of
 * a bulk-embedding command (`add` / `refresh` / `write`), kept alive only
 * for the duration of that command, and disposed before the command
 * returns. Spawning is cheap: workers don't pre-load the WASM pipeline; the
 * model is loaded on demand inside the worker the first time a batch
 * arrives. Each worker holds its own ~50MB WASM heap, so the parallelism
 * comes for free in CPU but costs proportional RAM while the command runs.
 *
 * The pool is plugged in via `setEmbedderPool()` so the existing `embed()`
 * call sites in the ingest pipeline transparently fan out without code
 * changes.
 *
 * Each worker handles at most one request at a time. Callers that find no
 * idle worker wait in a FIFO queue; they are rejected (never left hanging)
 * when the pool is disposed or when every worker has exited.
 */
export class EmbedderPool {
  private readonly workerCount: number;
  private readonly model: string;
  private workers: Worker[] = [];
  private acquireQueue: AcquireWaiter[] = [];
  private nextRequestId = 1;
  private spawned = false;
  private disposed = false;

  /**
   * @param workerCount Number of subprocesses to manage; must be a positive integer.
   * @param model Default model name sent with each batch when `embed()` is not given one.
   * @throws HelpfulError (`input_error`) when `workerCount` is not a positive integer.
   */
  constructor(workerCount: number, model: string = EMBEDDING_MODEL) {
    if (workerCount < 1 || !Number.isInteger(workerCount)) {
      throw new HelpfulError({
        kind: "input_error",
        message: `EmbedderPool worker count must be a positive integer, got ${workerCount}`,
        hint: "Set config.embedding.workers to a positive integer (or null for auto = cpus-1).",
      });
    }
    this.workerCount = workerCount;
    this.model = model;
  }

  /** Number of worker subprocesses this pool manages. */
  get size(): number {
    return this.workerCount;
  }

  /**
   * Spawn the worker subprocesses. Idempotent. Returns immediately — workers
   * load the WASM model lazily when the first batch arrives, so this is a
   * cheap operation. The first batch a worker receives pays the
   * ~hundreds-of-ms load cost; subsequent batches in the same worker are fast.
   */
  spawn(): void {
    if (this.spawned) return;
    this.spawned = true;
    logger.info(`embedder-pool: spawning ${this.workerCount} workers (model=${this.model})`);
    for (let i = 0; i < this.workerCount; i++) {
      this.workers.push(this.spawnWorker(i));
    }
  }

  /**
   * Embed `texts` using the worker pool. Splits into batches of
   * `EMBEDDING_BATCH_SIZE`, dispatches each batch to whichever worker is
   * free, and reassembles vectors in original order.
   *
   * @param texts Texts to embed; an empty array resolves to `[]`.
   * @param model Model override; defaults to the pool's model.
   * @param opts `opts.onProgress` is called once per completed batch with
   *   `(done, total)` chunk counts.
   * @returns One vector per input text, in input order.
   * @throws HelpfulError (`internal_error`) when called after `dispose()`,
   *   when a worker fails or exits, or when a worker returns a missing
   *   vector or the wrong number of vectors for a batch.
   */
  async embed(texts: string[], model?: string, opts: EmbedOptions = {}): Promise<number[][]> {
    if (this.disposed) {
      throw new HelpfulError({
        kind: "internal_error",
        message: "EmbedderPool: embed() called after dispose()",
        hint: "The pool is per-command — wrap your work in `withEmbedderPool()` so a fresh pool is created.",
      });
    }
    if (!this.spawned) this.spawn();
    if (texts.length === 0) return [];

    const targetModel = model ?? this.model;
    const out = new Array<number[]>(texts.length);
    let done = 0;

    const batches: Array<{ start: number; texts: string[] }> = [];
    for (let i = 0; i < texts.length; i += EMBEDDING_BATCH_SIZE) {
      batches.push({ start: i, texts: texts.slice(i, i + EMBEDDING_BATCH_SIZE) });
    }

    await Promise.all(
      batches.map(async (batch) => {
        const vectors = await this.dispatchBatch(batch.texts, targetModel);
        // A short (or long) reply would leave holes in `out` or overwrite a
        // neighbouring batch's slots, so reject it outright.
        if (vectors.length !== batch.texts.length) {
          throw new HelpfulError({
            kind: "internal_error",
            message: `embedder-pool: worker returned ${vectors.length} vectors for a batch of ${batch.texts.length} texts`,
            hint: "Re-run with --verbose; check the worker stderr for a transformers/WASM error.",
          });
        }
        for (let i = 0; i < vectors.length; i++) {
          const vec = vectors[i];
          if (!vec) {
            throw new HelpfulError({
              kind: "internal_error",
              message: `embedder-pool: worker returned undefined vector at batch index ${i}`,
              hint: "Re-run with --verbose; check the worker stderr for a transformers/WASM error.",
            });
          }
          out[batch.start + i] = vec;
        }
        done += vectors.length;
        opts.onProgress?.(done, texts.length);
      }),
    );
    return out;
  }

  /**
   * Tear down every worker subprocess. Idempotent. In-flight requests and
   * callers still waiting for a worker are rejected so any `embed()` caller
   * sees a HelpfulError instead of hanging forever. Resolves once every
   * subprocess has exited.
   */
  async dispose(): Promise<void> {
    if (this.disposed) return;
    this.disposed = true;
    if (this.spawned) {
      logger.info(`embedder-pool: tearing down ${this.workers.length} workers`);
    }
    for (const w of this.workers) {
      if (w.pending) {
        w.pending.reject(
          new HelpfulError({
            kind: "internal_error",
            message: "EmbedderPool disposed while a request was in flight",
            hint: "This is usually fine on shutdown; if it appears mid-run, file an issue with the preceding stderr.",
          }),
        );
        w.pending = null;
      }
      try {
        w.proc.stdin.end();
      } catch {
        // stdin may already be closed; ignore.
      }
      try {
        w.proc.kill();
      } catch {
        // process may already be dead; ignore.
      }
    }
    // Anyone waiting on acquire() will never get a worker — release them.
    this.rejectWaiters(() => this.disposedBeforeDispatchError());
    await Promise.all(
      this.workers.map(async (w) => {
        try {
          await w.proc.exited;
        } catch {
          // best effort
        }
      }),
    );
    this.workers = [];
  }

  /**
   * Send one batch to a free worker and resolve with its vectors. The
   * worker is always released afterwards, whether the request resolved or
   * rejected.
   */
  private async dispatchBatch(texts: string[], model: string): Promise<number[][]> {
    // Rejects (without checking out a worker) when the pool cannot serve the batch.
    const worker = await this.acquire();
    try {
      if (this.disposed) {
        throw this.disposedBeforeDispatchError();
      }
      if (worker.exited) {
        // The worker died between being handed over and this continuation running.
        throw new HelpfulError({
          kind: "internal_error",
          message: "embed worker exited before batch could be dispatched",
          hint: "Run with --verbose; the worker's stderr was inherited and should explain the crash.",
        });
      }
      const id = this.nextRequestId++;
      return await new Promise<number[][]>((resolve, reject) => {
        // Register before writing so a fast reply always finds its request.
        worker.pending = { id, resolve, reject };
        try {
          worker.proc.stdin.write(`${JSON.stringify({ type: "embed", id, model, texts })}\n`);
          worker.proc.stdin.flush();
        } catch (err) {
          worker.pending = null;
          reject(
            asHelpful(
              err,
              "while writing to embed worker stdin",
              "The worker subprocess likely crashed. Set config.embedding.workers=1 to bypass the pool while debugging.",
            ),
          );
        }
      });
    } finally {
      this.release(worker);
    }
  }

  /**
   * Wait for a free worker; first-come, first-served via the acquireQueue.
   * Rejects immediately when the pool is disposed or no worker is alive,
   * because in those states nothing would ever wake a queued caller.
   */
  private acquire(): Promise<Worker> {
    if (this.disposed) {
      return Promise.reject(this.disposedBeforeDispatchError());
    }
    const free = this.workers.find((w) => !w.exited && !w.busy);
    if (free) {
      free.busy = true;
      return Promise.resolve(free);
    }
    if (!this.hasLiveWorker()) {
      return Promise.reject(this.noLiveWorkersError());
    }
    return new Promise<Worker>((resolve, reject) => {
      this.acquireQueue.push({ resolve, reject });
    });
  }

  /**
   * Hand a finished worker to the next waiter, or mark it idle. Called from
   * `dispatchBatch`'s finally block so it runs whether the request resolved
   * or rejected. An exited worker is never handed on.
   */
  private release(w: Worker): void {
    w.pending = null;
    w.busy = false;
    if (w.exited) return;
    const next = this.acquireQueue.shift();
    if (next) {
      w.busy = true;
      next.resolve(w);
    }
  }

  /** True while at least one spawned worker has not exited. */
  private hasLiveWorker(): boolean {
    return this.workers.some((w) => !w.exited);
  }

  /** Reject every queued `acquire()` caller with a fresh error from `makeError`. */
  private rejectWaiters(makeError: () => HelpfulError): void {
    const queue = this.acquireQueue;
    this.acquireQueue = [];
    for (const waiter of queue) {
      waiter.reject(makeError());
    }
  }

  /** Error for a batch that could not be dispatched because the pool was disposed. */
  private disposedBeforeDispatchError(): HelpfulError {
    return new HelpfulError({
      kind: "internal_error",
      message: "EmbedderPool disposed before batch could be dispatched",
      hint: "The pool was torn down mid-call — wrap your work in `withEmbedderPool()` for a fresh per-command pool.",
    });
  }

  /** Error for a batch that cannot be served because every worker has exited. */
  private noLiveWorkersError(): HelpfulError {
    return new HelpfulError({
      kind: "internal_error",
      message: "EmbedderPool: every embed worker has exited; no worker is left to take the batch",
      hint: "Run with --verbose; the workers' stderr was inherited and should explain the crash. Set config.embedding.workers=1 to bypass the pool while debugging.",
    });
  }

  /**
   * Build the spawn command for one worker. Two regimes:
   * - Compiled binary (`./dist/membot`): `process.execPath` is the membot
   *   binary itself, so we just hand it the sentinel arg and the early
   *   branch in `cli.ts` takes over before commander sees it.
   * - Bun dev / `bun add -g`: `process.execPath` is the `bun` binary; we
   *   must point it at `cli.ts` explicitly. Resolve the path relative to
   *   this module so it survives whatever working directory the user
   *   invoked membot from.
   */
  private resolveSpawnCommand(): string[] {
    const exec = process.execPath;
    const isBun = /[\\/]bunx?(\.exe)?$/.test(exec);
    if (!isBun) {
      return [exec, EMBED_WORKER_SENTINEL];
    }
    // `URL.pathname` is percent-encoded (and `/C:/...` on Windows), so it is
    // not a usable filesystem path when the install directory contains
    // spaces or non-ASCII characters. Convert the file URL properly.
    const cliPath = Bun.fileURLToPath(new URL("../cli.ts", import.meta.url));
    return [exec, cliPath, EMBED_WORKER_SENTINEL];
  }

  /**
   * Spawn one worker subprocess and start its stdout reader. The worker
   * lazy-loads the WASM pipeline on its first `embed` request, so spawn is
   * cheap (no init handshake, no preload).
   */
  private spawnWorker(index: number): Worker {
    const proc = Bun.spawn(this.resolveSpawnCommand(), {
      stdio: ["pipe", "pipe", "inherit"],
    }) as Subprocess<"pipe", "pipe", "inherit">;

    const worker: Worker = {
      proc,
      busy: false,
      pending: null,
      exited: false,
    };

    // Watch for premature exit and surface it to any in-flight request.
    void proc.exited
      .then((code) => {
        worker.exited = true;
        if (worker.pending) {
          worker.pending.reject(
            new HelpfulError({
              kind: "internal_error",
              message: `embed worker ${index} exited (code=${code}) with a request in flight`,
              hint: "Run with --verbose; the worker's stderr was inherited and should explain the crash.",
            }),
          );
          worker.pending = null;
        }
        // If that was the last live worker, nothing will ever call
        // release() for the queued callers — fail them instead of hanging.
        // (During dispose() the queue has already been rejected.)
        if (!this.disposed && !this.hasLiveWorker()) {
          this.rejectWaiters(() => this.noLiveWorkersError());
        }
      })
      .catch(() => {
        // Bun's exited promise shouldn't reject, but guard anyway.
      });

    void this.readWorker(worker, index);
    return worker;
  }

  /**
   * Newline-delimited JSON reader for one worker's stdout. Every complete,
   * non-blank line is passed to `handleWorkerLine`; a final line without a
   * trailing newline is processed when the stream ends. Read failures are
   * logged at debug level and end the reader.
   */
  private async readWorker(worker: Worker, index: number): Promise<void> {
    const reader = worker.proc.stdout.getReader();
    const decoder = new TextDecoder();
    let buffer = "";
    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        buffer += decoder.decode(value, { stream: true });
        let nl = buffer.indexOf("\n");
        while (nl !== -1) {
          const line = buffer.slice(0, nl);
          buffer = buffer.slice(nl + 1);
          if (line.trim()) this.handleWorkerLine(worker, index, line);
          nl = buffer.indexOf("\n");
        }
      }
      // Flush the decoder and process a last line that lacked a newline.
      buffer += decoder.decode();
      if (buffer.trim()) this.handleWorkerLine(worker, index, buffer);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      logger.debug(`embedder-pool: worker ${index} stdout read failed: ${reason}`);
    }
  }

  /**
   * Parse + dispatch one JSON line emitted by a worker. Because a worker
   * has at most one request in flight, an `embed-response` is routed to
   * `worker.pending`; lines that are not JSON objects or carry another
   * `type` are ignored.
   */
  private handleWorkerLine(worker: Worker, index: number, line: string): void {
    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      logger.debug(`embedder-pool: worker ${index} emitted unparseable line: ${line.slice(0, 200)}`);
      return;
    }
    // Valid JSON that is not an object (`null`, a number, a string) must not
    // be dereferenced: a throw here would kill the stdout reader loop.
    if (typeof parsed !== "object" || parsed === null) {
      logger.debug(`embedder-pool: worker ${index} emitted non-object JSON line: ${line.slice(0, 200)}`);
      return;
    }
    const response = parsed as Partial<EmbedResponseLine>;
    if (response.type !== "embed-response") return;
    const pending = worker.pending;
    if (!pending) {
      logger.debug(`embedder-pool: worker ${index} returned response with no pending request`);
      return;
    }
    if (response.id !== pending.id) {
      // Diagnostic only: the reply is still delivered, since dropping it
      // would leave the pending request without any answer.
      logger.debug(
        `embedder-pool: worker ${index} returned response id ${response.id} while request ${pending.id} was pending`,
      );
    }
    if (response.error) {
      pending.reject(
        new HelpfulError({
          kind: "internal_error",
          message: `embed worker ${index} failed: ${response.error.message}`,
          hint: response.error.hint || "Inspect parent stderr for the full error.",
        }),
      );
    } else if (Array.isArray(response.vectors)) {
      pending.resolve(response.vectors);
    } else {
      pending.reject(
        new HelpfulError({
          kind: "internal_error",
          message: `embed worker ${index} returned response with neither vectors nor error`,
          hint: "This is a worker protocol bug — file an issue with the preceding stderr.",
        }),
      );
    }
  }
}
370
+
371
/**
 * Run `fn` with a fresh `EmbedderPool` registered as the global embedder. The
 * pool is created, plugged in via `setEmbedderPool()`, and disposed
 * (subprocesses killed) before the returned promise settles — so the workers
 * only exist for the duration of one bulk-embedding command (`add` /
 * `refresh` / `write` / a daemon tick). When `workerCount <= 1` the helper
 * short-circuits and runs `fn` inline against the single-process embedder,
 * with no spawn overhead.
 *
 * @param workerCount Number of worker subprocesses to use.
 * @param model Default embedding model for the pool.
 * @param fn The work to run while the pool is registered.
 * @returns Whatever `fn` resolves to.
 * @throws Whatever `fn` throws, or the `EmbedderPool` constructor's
 *   HelpfulError when `workerCount` is greater than 1 but not an integer.
 */
export async function withEmbedderPool<T>(workerCount: number, model: string, fn: () => Promise<T>): Promise<T> {
  if (workerCount <= 1) return fn();
  const pool = new EmbedderPool(workerCount, model);
  // Only unregister what this call registered: if spawning fails, the global
  // embedder was never touched and must be left as it was.
  let registered = false;
  try {
    // Spawn inside the try so that a failure partway through still disposes
    // the workers that were already started.
    pool.spawn();
    setEmbedderPool(pool);
    registered = true;
    return await fn();
  } finally {
    if (registered) setEmbedderPool(null);
    await pool.dispose();
  }
}
@@ -68,9 +68,40 @@ async function getPipeline(model: string): Promise<FeatureExtractionPipeline> {
68
68
  * with `(done, total)` chunk counts so callers can drive a spinner / progress
69
69
  * bar — ONNX WASM holds the JS thread for hundreds of ms per batch and would
70
70
  * otherwise leave nanospinner's setInterval starved between updates.
71
+ *
72
+ * `directOnly` bypasses any registered EmbedderPool and runs the embed call
73
+ * inline in the current process. Use it for query-time single-text embedding
74
+ * where IPC overhead would dominate.
71
75
  */
72
76
export interface EmbedOptions {
  /** Progress callback, invoked with `(done, total)` chunk counts. */
  onProgress?: (done: number, total: number) => void;
  /**
   * When true, bypass any registered EmbedderPool and run the embed call
   * inline in the current process.
   */
  directOnly?: boolean;
}
80
+
81
/**
 * The minimal surface the embedder needs from a worker pool. Declared here
 * as a structural interface (rather than importing the pool's type) so this
 * hot path takes no hard dependency on `embedder-pool.ts`; a pool is plugged
 * in from outside via `setEmbedderPool()`.
 */
export interface PooledEmbedder {
  /** Embed `texts`, resolving to one vector per text. */
  embed(texts: string[], model?: string, opts?: EmbedOptions): Promise<number[][]>;
}
90
+
91
/** The currently registered worker pool; `null` means single-process embedding. */
let pool: PooledEmbedder | null = null;

/**
 * Register a worker pool to handle bulk embed calls, or pass `null` to
 * unregister it. While a pool is set, every `embed()` call (without
 * `directOnly`) is dispatched through the pool.
 */
export function setEmbedderPool(p: PooledEmbedder | null): void {
  pool = p;
}

/** Read the currently registered pool, or `null` when running single-process. */
export function getEmbedderPool(): PooledEmbedder | null {
  return pool;
}
75
106
 
76
107
  /**
@@ -92,6 +123,9 @@ export async function embed(
92
123
  opts: EmbedOptions = {},
93
124
  ): Promise<number[][]> {
94
125
  if (texts.length === 0) return [];
126
+ if (pool && !opts.directOnly) {
127
+ return pool.embed(texts, model, opts);
128
+ }
95
129
  const extractor = await getPipeline(model);
96
130
  const out: number[][] = [];
97
131
  for (let i = 0; i < texts.length; i += EMBEDDING_BATCH_SIZE) {
@@ -114,9 +148,13 @@ export async function embed(
114
148
  return out;
115
149
  }
116
150
 
117
- /** Embed a single text — convenience wrapper for query-time embedding. */
151
+ /**
152
+ * Embed a single text — convenience wrapper for query-time embedding. Always
153
+ * runs in-process (`directOnly: true`) so search latency isn't paying the IPC
154
+ * round-trip through the worker pool for one vector.
155
+ */
118
156
  export async function embedSingle(text: string, model: string = EMBEDDING_MODEL): Promise<number[]> {
119
- const all = await embed([text], model);
157
+ const all = await embed([text], model, { directOnly: true });
120
158
  const vec = all[0];
121
159
  if (!vec) {
122
160
  throw new HelpfulError({