membot 0.7.0 → 0.10.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.
@@ -1,10 +1,12 @@
1
+ import { DEFAULTS } from "../constants.ts";
1
2
  import type { AppContext } from "../context.ts";
2
3
  import { upsertBlob } from "../db/blobs.ts";
3
4
  import { insertChunksForVersion, rebuildFts } from "../db/chunks.ts";
4
5
  import { type FetcherKind, getCurrent, insertVersion, millisIso, type SourceType } from "../db/files.ts";
5
6
  import { asHelpful, HelpfulError } from "../errors.ts";
6
- import { logger } from "../output/logger.ts";
7
+ import { pieFor } from "../output/progress.ts";
7
8
  import { chunkDeterministic } from "./chunker.ts";
9
+ import { AsyncMutex, pMap } from "./concurrency.ts";
8
10
  import { convert } from "./converter/index.ts";
9
11
  import { describe } from "./describer.ts";
10
12
  import { embed } from "./embedder.ts";
@@ -33,6 +35,7 @@ export interface IngestEntryResult {
33
35
  error?: string;
34
36
  mime_type: string | null;
35
37
  size_bytes: number;
38
+ chunk_count: number | null;
36
39
  fetcher: FetcherKind;
37
40
  source_sha256: string;
38
41
  }
@@ -51,17 +54,32 @@ export interface IngestResult {
51
54
  * without re-resolving anything. `onEntryStart` fires before the pipeline
52
55
  * touches an entry; `onEntryComplete` fires after the result (ok / unchanged
53
56
  * / failed) is known. Both are optional.
57
+ *
58
+ * The optional `workerId` arg threads the slot index through so the UI can
59
+ * show one status line per in-flight worker; callers that don't want that
60
+ * detail simply ignore it.
54
61
  */
55
62
  export interface IngestCallbacks {
56
- onEntryStart?: (label: string) => void;
57
- onEntryComplete?: (entry: IngestEntryResult) => void;
63
+ onEntryStart?: (label: string, workerId?: number) => void;
64
+ onEntryComplete?: (entry: IngestEntryResult, workerId?: number) => void;
58
65
  /**
59
66
  * Fires for sub-step progress within a single entry (e.g. "embedding
60
67
  * 32/168"). The callback runs many times per entry and is intended for
61
68
  * driving an interactive spinner — non-interactive callers should ignore
62
69
  * it to avoid log spam.
63
70
  */
64
- onEntryProgress?: (label: string, sublabel: string) => void;
71
+ onEntryProgress?: (label: string, sublabel: string, workerId?: number) => void;
72
+ /**
73
+ * Fires once after the worker pool size has been determined, before the
74
+ * first entry begins. Lets the progress reporter size its per-worker
75
+ * status section.
76
+ */
77
+ onWorkerCount?: (n: number) => void;
78
+ /**
79
+ * Fires after each successful persist with the number of new chunks
80
+ * written, so the progress reporter can track a running total.
81
+ */
82
+ onChunks?: (n: number) => void;
65
83
  }
66
84
 
67
85
  /**
@@ -92,7 +110,26 @@ export async function ingest(input: IngestInput, ctx: AppContext): Promise<Inges
92
110
  const total = countResolvedEntries(resolved);
93
111
  ctx.progress.start(total, "ingest");
94
112
  const callbacks: IngestCallbacks = {
95
- onEntryStart: (label) => ctx.progress.tick(label),
113
+ // Tick on completion so the bar reflects done-and-persisted entries,
114
+ // not concurrently-in-flight ones. setLabel shows the in-flight file
115
+ // without advancing the count; sub-step suffix flows via update; per-
116
+ // worker status lines + chunk total light up if the reporter supports
117
+ // them (multi-line UI in TTY, no-op otherwise). The pie glyph fills
118
+ // in as the per-file pipeline marches read → … → persist.
119
+ onWorkerCount: (n) => ctx.progress.setWorkers(n),
120
+ onEntryStart: (label, workerId) => {
121
+ if (workerId !== undefined) ctx.progress.workerSet(workerId, `${pieFor(undefined)} ${label}`);
122
+ ctx.progress.setLabel(label);
123
+ },
124
+ onEntryComplete: (entry, workerId) => {
125
+ if (workerId !== undefined) ctx.progress.workerSet(workerId, "");
126
+ ctx.progress.tick(entry.logical_path);
127
+ },
128
+ onEntryProgress: (label, sublabel, workerId) => {
129
+ if (workerId !== undefined) ctx.progress.workerSet(workerId, `${pieFor(sublabel)} ${label} — ${sublabel}`);
130
+ ctx.progress.update(sublabel);
131
+ },
132
+ onChunks: (n) => ctx.progress.addChunks(n),
96
133
  };
97
134
  const result = await ingestResolved(resolved, input, ctx, callbacks);
98
135
  const okCount = result.ok;
@@ -144,11 +181,12 @@ async function ingestInline(
144
181
  status: "ok",
145
182
  mime_type: "text/markdown",
146
183
  size_bytes: bytes.byteLength,
184
+ chunk_count: null,
147
185
  fetcher: "inline",
148
186
  source_sha256: sha,
149
187
  };
150
188
  try {
151
- const versionId = await persistVersion(
189
+ const persisted = await persistVersion(
152
190
  ctx,
153
191
  {
154
192
  logicalPath,
@@ -168,7 +206,8 @@ async function ingestInline(
168
206
  },
169
207
  (sublabel) => callbacks?.onEntryProgress?.(logicalPath, sublabel),
170
208
  );
171
- result.version_id = versionId;
209
+ result.version_id = persisted.versionId;
210
+ result.chunk_count = persisted.chunkCount;
172
211
  } catch (err) {
173
212
  result.status = "failed";
174
213
  result.error = errorMessage(err);
@@ -195,6 +234,7 @@ async function ingestUrl(
195
234
  status: "ok",
196
235
  mime_type: null,
197
236
  size_bytes: 0,
237
+ chunk_count: null,
198
238
  fetcher: "downloader",
199
239
  source_sha256: "",
200
240
  };
@@ -225,7 +265,7 @@ async function ingestUrl(
225
265
  }
226
266
  }
227
267
 
228
- const versionId = await pipelineForBytes(
268
+ const persisted = await pipelineForBytes(
229
269
  ctx,
230
270
  {
231
271
  logicalPath,
@@ -244,7 +284,8 @@ async function ingestUrl(
244
284
  },
245
285
  (sublabel) => callbacks?.onEntryProgress?.(url, sublabel),
246
286
  );
247
- result.version_id = versionId;
287
+ result.version_id = persisted.versionId;
288
+ result.chunk_count = persisted.chunkCount;
248
289
  } catch (err) {
249
290
  result.status = "failed";
250
291
  result.error = errorMessage(err);
@@ -277,11 +318,28 @@ async function ingestLocalFiles(
277
318
  });
278
319
  }
279
320
 
280
- const results: IngestEntryResult[] = [];
281
321
  const isMulti = resolved.entries.length > 1;
282
-
283
- for (const entry of resolved.entries) {
284
- callbacks?.onEntryStart?.(entry.relPathFromBase);
322
+ // Cap worker count by the actual file count so tiny batches don't pay
323
+ // the cost of spawning N threads (each loads ~130MB of model weights);
324
+ // also clamp by config and the global MAX_WORKERS ceiling.
325
+ const configured = Math.min(DEFAULTS.MAX_WORKERS, Math.max(1, ctx.config.ingest.worker_concurrency));
326
+ const workerCount = Math.max(1, Math.min(configured, resolved.entries.length));
327
+ callbacks?.onWorkerCount?.(workerCount);
328
+ const persistMutex = new AsyncMutex();
329
+ let anyOk = false;
330
+
331
+ // Each pMap worker pulls a file from the shared queue and runs the
332
+ // entire pipeline end-to-end (read → unchanged check → convert →
333
+ // describe → chunk → embed → persist). The persist phase is gated by a
334
+ // single mutex because all workers share one DuckDB connection and
335
+ // DuckDB rejects nested BEGINs. The embed step itself fans out across
336
+ // the per-command embedder subprocess pool that `add` / `refresh`
337
+ // register via `withEmbedderPool()` — so the WASM call truly
338
+ // parallelizes across cores instead of serializing on the main JS
339
+ // event loop. When that pool isn't registered (single-shot SDK call,
340
+ // `embedding.workers = 1`), embed() runs inline against the in-process
341
+ // extractor with no IPC overhead.
342
+ const outcomes = await pMap(resolved.entries, workerCount, async (entry, _index, workerId) => {
285
343
  const logicalPath = pickLogicalPath(input.logical_path, entry, isMulti);
286
344
  const result: IngestEntryResult = {
287
345
  source_path: entry.absPath,
@@ -290,10 +348,14 @@ async function ingestLocalFiles(
290
348
  status: "ok",
291
349
  mime_type: null,
292
350
  size_bytes: 0,
351
+ chunk_count: null,
293
352
  fetcher: "local",
294
353
  source_sha256: "",
295
354
  };
355
+ callbacks?.onEntryStart?.(entry.relPathFromBase, workerId);
356
+ const onPhase = (sublabel: string) => callbacks?.onEntryProgress?.(entry.relPathFromBase, sublabel, workerId);
296
357
  try {
358
+ onPhase("reading");
297
359
  const local = await readLocalFile(entry.absPath);
298
360
  result.mime_type = local.mimeType;
299
361
  result.size_bytes = local.sizeBytes;
@@ -304,48 +366,189 @@ async function ingestLocalFiles(
304
366
  if (cur && cur.source_sha256 === local.sha256) {
305
367
  result.status = "unchanged";
306
368
  result.version_id = cur.version_id;
307
- results.push(result);
308
- callbacks?.onEntryComplete?.(result);
309
- continue;
369
+ callbacks?.onEntryComplete?.(result, workerId);
370
+ return result;
310
371
  }
311
372
  }
312
373
 
313
- const versionId = await pipelineForBytes(
314
- ctx,
315
- {
374
+ onPhase("converting");
375
+ const conversion = await convert(
376
+ local.bytes,
377
+ local.mimeType,
378
+ entry.absPath,
379
+ ctx.config.llm,
380
+ ctx.config.converters,
381
+ );
382
+ const markdown = conversion.markdown;
383
+
384
+ onPhase("describing");
385
+ const description = await describe(logicalPath, local.mimeType, markdown, ctx.config.llm);
386
+
387
+ onPhase("chunking");
388
+ const chunks = chunkDeterministic(markdown, ctx.config.chunker);
389
+ const searchTexts = chunks.map((c) => buildSearchText(logicalPath, description, c.content));
390
+
391
+ let embeddings: number[][];
392
+ try {
393
+ embeddings = await embed(searchTexts, ctx.config.embedding_model, {
394
+ onProgress: (done, total) => onPhase(`embedding ${done}/${total}`),
395
+ });
396
+ } catch (err) {
397
+ throw asHelpful(
398
+ err,
399
+ `while embedding chunks for ${logicalPath}`,
400
+ "Run `bun run prebuild` to apply the transformers WASM patch, or set a different config.embedding_model.",
401
+ );
402
+ }
403
+
404
+ const versionId = await persistMutex.lock(async () => {
405
+ onPhase("persisting");
406
+ return persistOne(ctx, {
316
407
  logicalPath,
317
- bytes: local.bytes,
318
- mime: local.mimeType,
319
- source: entry.absPath,
320
408
  sourceType: "local",
321
409
  sourcePath: entry.absPath,
322
410
  sourceMtimeMs: local.mtimeMs,
323
411
  sourceSha: local.sha256,
412
+ blobSha: local.sha256,
413
+ mime: local.mimeType,
414
+ bytes: local.bytes,
415
+ markdown,
416
+ description,
417
+ chunks,
418
+ searchTexts,
419
+ embeddings,
324
420
  fetcher: "local",
325
421
  downloader: null,
326
422
  downloaderArgs: null,
327
423
  refreshSec,
328
424
  changeNote: input.change_note ?? null,
329
- },
330
- (sublabel) => callbacks?.onEntryProgress?.(entry.relPathFromBase, sublabel),
331
- );
425
+ });
426
+ });
332
427
  result.version_id = versionId;
428
+ result.chunk_count = chunks.length;
429
+ anyOk = true;
430
+ callbacks?.onChunks?.(chunks.length);
333
431
  } catch (err) {
334
432
  result.status = "failed";
335
433
  result.error = errorMessage(err);
336
- } finally {
337
- // Release the DB lock between files in a directory/glob walk so
338
- // concurrent processes can wedge in mid-batch. The next entry's
339
- // first DB call reopens (cheap — same-process reopen).
340
- await ctx.db.release();
341
434
  }
342
- results.push(result);
343
- callbacks?.onEntryComplete?.(result);
435
+ callbacks?.onEntryComplete?.(result, workerId);
436
+ return result;
437
+ });
438
+
439
+ const results: IngestEntryResult[] = outcomes.map((o) => {
440
+ if (o.ok) return o.value;
441
+ // pMap caught a worker rejection — shouldn't happen since the worker
442
+ // catches its own errors, but surface defensively.
443
+ return {
444
+ source_path: "",
445
+ logical_path: "",
446
+ version_id: null,
447
+ status: "failed",
448
+ error: errorMessage(o.error),
449
+ mime_type: null,
450
+ size_bytes: 0,
451
+ chunk_count: null,
452
+ fetcher: "local",
453
+ source_sha256: "",
454
+ };
455
+ });
456
+
457
+ // Single FTS rebuild for the whole batch — replaces N per-entry rebuilds
458
+ // in the prior implementation. Skip when nothing was newly persisted.
459
+ if (anyOk) {
460
+ await rebuildFts(ctx.db);
344
461
  }
345
462
 
346
463
  return summarize(results);
347
464
  }
348
465
 
466
+ /**
467
+ * Per-file persist payload. All inputs are precomputed by the worker; this
468
+ * helper just executes the transactional DB writes.
469
+ */
470
+ interface PersistOneParams {
471
+ logicalPath: string;
472
+ sourceType: SourceType;
473
+ sourcePath: string | null;
474
+ sourceMtimeMs: number | null;
475
+ sourceSha: string;
476
+ blobSha: string | null;
477
+ mime: string;
478
+ bytes: Uint8Array | null;
479
+ markdown: string;
480
+ description: string;
481
+ chunks: { index: number; content: string }[];
482
+ searchTexts: string[];
483
+ embeddings: number[][];
484
+ fetcher: FetcherKind;
485
+ downloader: string | null;
486
+ downloaderArgs: Record<string, unknown> | null;
487
+ refreshSec: number | null;
488
+ changeNote: string | null;
489
+ }
490
+
491
+ /**
492
+ * Write blob + new (logical_path, version_id) row + its chunks under a
493
+ * single DuckDB transaction. ROLLBACK on failure keeps the row+chunks pair
494
+ * atomic; one COMMIT replaces ~N+2 autocommitted round-trips.
495
+ */
496
+ async function persistOne(ctx: AppContext, p: PersistOneParams): Promise<string> {
497
+ const versionId = millisIso(Date.now());
498
+ const contentSha = sha256Hex(new TextEncoder().encode(p.markdown));
499
+ await ctx.db.exec("BEGIN TRANSACTION");
500
+ try {
501
+ if (p.bytes) {
502
+ await upsertBlob(ctx.db, {
503
+ sha256: p.sourceSha,
504
+ mime_type: p.mime,
505
+ size_bytes: p.bytes.byteLength,
506
+ bytes: p.bytes,
507
+ });
508
+ }
509
+ await insertVersion(ctx.db, {
510
+ logical_path: p.logicalPath,
511
+ version_id: versionId,
512
+ source_type: p.sourceType,
513
+ source_path: p.sourcePath,
514
+ source_mtime_ms: p.sourceMtimeMs,
515
+ source_sha256: p.sourceSha,
516
+ blob_sha256: p.blobSha,
517
+ content_sha256: contentSha,
518
+ content: p.markdown,
519
+ description: p.description,
520
+ mime_type: p.mime,
521
+ size_bytes: p.bytes?.byteLength ?? new TextEncoder().encode(p.markdown).byteLength,
522
+ fetcher: p.fetcher,
523
+ downloader: p.downloader,
524
+ downloader_args: p.downloaderArgs,
525
+ refresh_frequency_sec: p.refreshSec,
526
+ refreshed_at: new Date().toISOString(),
527
+ last_refresh_status: "ok",
528
+ change_note: p.changeNote,
529
+ });
530
+ await insertChunksForVersion(
531
+ ctx.db,
532
+ p.logicalPath,
533
+ versionId,
534
+ p.chunks.map((c, i) => ({
535
+ chunk_index: c.index,
536
+ chunk_content: c.content,
537
+ search_text: p.searchTexts[i] ?? buildSearchText(p.logicalPath, p.description, c.content),
538
+ embedding: p.embeddings[i] ?? new Array(p.embeddings[0]?.length ?? 0).fill(0),
539
+ })),
540
+ );
541
+ await ctx.db.exec("COMMIT");
542
+ } catch (err) {
543
+ await ctx.db.exec("ROLLBACK").catch(() => {
544
+ // Best effort — if ROLLBACK itself fails (already aborted, lock
545
+ // dropped, etc.) we still want the original error to surface.
546
+ });
547
+ throw err;
548
+ }
549
+ return versionId;
550
+ }
551
+
349
552
  interface PipelineParams {
350
553
  logicalPath: string;
351
554
  bytes: Uint8Array;
@@ -373,7 +576,7 @@ async function pipelineForBytes(
373
576
  ctx: AppContext,
374
577
  p: PipelineParams,
375
578
  onPhase?: (sublabel: string) => void,
376
- ): Promise<string> {
579
+ ): Promise<{ versionId: string; chunkCount: number }> {
377
580
  onPhase?.("storing blob");
378
581
  await upsertBlob(ctx.db, {
379
582
  sha256: p.sourceSha,
@@ -438,7 +641,7 @@ async function persistVersion(
438
641
  ctx: AppContext,
439
642
  p: PersistParams,
440
643
  onPhase?: (sublabel: string) => void,
441
- ): Promise<string> {
644
+ ): Promise<{ versionId: string; chunkCount: number }> {
442
645
  onPhase?.("describing");
443
646
  const description = await describe(p.logicalPath, p.mime, p.markdown, ctx.config.llm);
444
647
  onPhase?.("chunking");
@@ -460,42 +663,49 @@ async function persistVersion(
460
663
  onPhase?.("persisting");
461
664
  const versionId = millisIso(Date.now());
462
665
  const contentSha = p.contentSha ?? sha256Hex(new TextEncoder().encode(p.markdown));
463
- await insertVersion(ctx.db, {
464
- logical_path: p.logicalPath,
465
- version_id: versionId,
466
- source_type: p.sourceType,
467
- source_path: p.sourcePath,
468
- source_mtime_ms: p.sourceMtimeMs,
469
- source_sha256: p.sourceSha,
470
- blob_sha256: p.blobSha,
471
- content_sha256: contentSha,
472
- content: p.markdown,
473
- description,
474
- mime_type: p.mime,
475
- size_bytes: p.bytes?.byteLength ?? new TextEncoder().encode(p.markdown).byteLength,
476
- fetcher: p.fetcher,
477
- downloader: p.downloader,
478
- downloader_args: p.downloaderArgs,
479
- refresh_frequency_sec: p.refreshSec,
480
- refreshed_at: new Date().toISOString(),
481
- last_refresh_status: "ok",
482
- change_note: p.changeNote,
483
- });
666
+ await ctx.db.exec("BEGIN TRANSACTION");
667
+ try {
668
+ await insertVersion(ctx.db, {
669
+ logical_path: p.logicalPath,
670
+ version_id: versionId,
671
+ source_type: p.sourceType,
672
+ source_path: p.sourcePath,
673
+ source_mtime_ms: p.sourceMtimeMs,
674
+ source_sha256: p.sourceSha,
675
+ blob_sha256: p.blobSha,
676
+ content_sha256: contentSha,
677
+ content: p.markdown,
678
+ description,
679
+ mime_type: p.mime,
680
+ size_bytes: p.bytes?.byteLength ?? new TextEncoder().encode(p.markdown).byteLength,
681
+ fetcher: p.fetcher,
682
+ downloader: p.downloader,
683
+ downloader_args: p.downloaderArgs,
684
+ refresh_frequency_sec: p.refreshSec,
685
+ refreshed_at: new Date().toISOString(),
686
+ last_refresh_status: "ok",
687
+ change_note: p.changeNote,
688
+ });
484
689
 
485
- await insertChunksForVersion(
486
- ctx.db,
487
- p.logicalPath,
488
- versionId,
489
- chunks.map((c, i) => ({
490
- chunk_index: c.index,
491
- chunk_content: c.content,
492
- search_text: searchTexts[i] ?? buildSearchText(p.logicalPath, description, c.content),
493
- embedding: embeddings[i] ?? new Array(embeddings[0]?.length ?? 0).fill(0),
494
- })),
495
- );
690
+ await insertChunksForVersion(
691
+ ctx.db,
692
+ p.logicalPath,
693
+ versionId,
694
+ chunks.map((c, i) => ({
695
+ chunk_index: c.index,
696
+ chunk_content: c.content,
697
+ search_text: searchTexts[i] ?? buildSearchText(p.logicalPath, description, c.content),
698
+ embedding: embeddings[i] ?? new Array(embeddings[0]?.length ?? 0).fill(0),
699
+ })),
700
+ );
701
+ await ctx.db.exec("COMMIT");
702
+ } catch (err) {
703
+ await ctx.db.exec("ROLLBACK").catch(() => {});
704
+ throw err;
705
+ }
496
706
  onPhase?.("indexing");
497
707
  await rebuildFts(ctx.db);
498
- return versionId;
708
+ return { versionId, chunkCount: chunks.length };
499
709
  }
500
710
 
501
711
  /**