membot 0.8.0 → 0.10.0

@@ -1,10 +1,12 @@
+import { DEFAULTS } from "../constants.ts";
 import type { AppContext } from "../context.ts";
 import { upsertBlob } from "../db/blobs.ts";
 import { insertChunksForVersion, rebuildFts } from "../db/chunks.ts";
 import { type FetcherKind, getCurrent, insertVersion, millisIso, type SourceType } from "../db/files.ts";
 import { asHelpful, HelpfulError } from "../errors.ts";
-import { logger } from "../output/logger.ts";
+import { pieFor } from "../output/progress.ts";
 import { chunkDeterministic } from "./chunker.ts";
+import { AsyncMutex, pMap } from "./concurrency.ts";
 import { convert } from "./converter/index.ts";
 import { describe } from "./describer.ts";
 import { embed } from "./embedder.ts";
@@ -33,6 +35,7 @@ export interface IngestEntryResult {
   error?: string;
   mime_type: string | null;
   size_bytes: number;
+  chunk_count: number | null;
   fetcher: FetcherKind;
   source_sha256: string;
 }
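
For reference, the per-entry result shape after this change, reassembled from this hunk and the object literals further down; field order and the exact `status` union are inferred rather than copied verbatim:

```ts
// Reassembled sketch of the 0.10.0 shape; FetcherKind comes from ../db/files.ts.
interface IngestEntryResult {
  source_path: string;
  logical_path: string;
  version_id: string | null;
  status: "ok" | "unchanged" | "failed";
  error?: string;
  mime_type: string | null;
  size_bytes: number;
  chunk_count: number | null; // new in 0.10.0: null for unchanged / failed entries
  fetcher: FetcherKind;
  source_sha256: string;
}
```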
@@ -51,17 +54,32 @@ export interface IngestResult {
  * without re-resolving anything. `onEntryStart` fires before the pipeline
  * touches an entry; `onEntryComplete` fires after the result (ok / unchanged
  * / failed) is known. Both are optional.
+ *
+ * The optional `workerId` arg threads the slot index through so the UI can
+ * show one status line per in-flight worker; callers that don't want that
+ * detail simply ignore it.
  */
 export interface IngestCallbacks {
-  onEntryStart?: (label: string) => void;
-  onEntryComplete?: (entry: IngestEntryResult) => void;
+  onEntryStart?: (label: string, workerId?: number) => void;
+  onEntryComplete?: (entry: IngestEntryResult, workerId?: number) => void;
   /**
    * Fires for sub-step progress within a single entry (e.g. "embedding
    * 32/168"). The callback runs many times per entry and is intended for
    * driving an interactive spinner — non-interactive callers should ignore
    * it to avoid log spam.
    */
-  onEntryProgress?: (label: string, sublabel: string) => void;
+  onEntryProgress?: (label: string, sublabel: string, workerId?: number) => void;
+  /**
+   * Fires once after the worker pool size has been determined, before the
+   * first entry begins. Lets the progress reporter size its per-worker
+   * status section.
+   */
+  onWorkerCount?: (n: number) => void;
+  /**
+   * Fires after each successful persist with the number of new chunks
+   * written, so the progress reporter can track a running total.
+   */
+  onChunks?: (n: number) => void;
 }
 
 /**
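
Because every hook is optional, a non-interactive consumer can wire up only the coarse events and drop the rest. A minimal sketch (the relative import path is assumed):

```ts
import type { IngestCallbacks, IngestEntryResult } from "./ingest.ts";

// Log-only callbacks for CI: one line per entry start/finish, no spinner churn.
// onEntryProgress and the worker hooks are deliberately omitted; they fire
// many times per entry and exist only to drive an interactive UI.
const ciCallbacks: IngestCallbacks = {
  onEntryStart: (label) => console.error(`ingesting ${label}`),
  onEntryComplete: (entry: IngestEntryResult) =>
    console.error(`${entry.status}: ${entry.logical_path}`),
};
```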
@@ -92,7 +110,26 @@ export async function ingest(input: IngestInput, ctx: AppContext): Promise<IngestResult> {
   const total = countResolvedEntries(resolved);
   ctx.progress.start(total, "ingest");
   const callbacks: IngestCallbacks = {
-    onEntryStart: (label) => ctx.progress.tick(label),
+    // Tick on completion so the bar reflects done-and-persisted entries,
+    // not concurrently-in-flight ones. setLabel shows the in-flight file
+    // without advancing the count; sub-step suffix flows via update; per-
+    // worker status lines + chunk total light up if the reporter supports
+    // them (multi-line UI in TTY, no-op otherwise). The pie glyph fills
+    // in as the per-file pipeline marches read → … → persist.
+    onWorkerCount: (n) => ctx.progress.setWorkers(n),
+    onEntryStart: (label, workerId) => {
+      if (workerId !== undefined) ctx.progress.workerSet(workerId, `${pieFor(undefined)} ${label}`);
+      ctx.progress.setLabel(label);
+    },
+    onEntryComplete: (entry, workerId) => {
+      if (workerId !== undefined) ctx.progress.workerSet(workerId, "");
+      ctx.progress.tick(entry.logical_path);
+    },
+    onEntryProgress: (label, sublabel, workerId) => {
+      if (workerId !== undefined) ctx.progress.workerSet(workerId, `${pieFor(sublabel)} ${label} — ${sublabel}`);
+      ctx.progress.update(sublabel);
+    },
+    onChunks: (n) => ctx.progress.addChunks(n),
   };
   const result = await ingestResolved(resolved, input, ctx, callbacks);
   const okCount = result.ok;
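
The wiring above implies a progress-reporter surface roughly like the following. Method names are taken from the call sites in this diff; the no-op behavior outside a TTY is what the comment claims, and everything else is an assumption:

```ts
// Inferred from the ctx.progress.* call sites; the real reporter may differ.
interface ProgressReporter {
  start(total: number, label: string): void;
  setLabel(label: string): void;       // show the in-flight file without advancing
  update(sublabel: string): void;      // sub-step suffix ("embedding 32/168")
  tick(label: string): void;           // advance the done-count by one
  setWorkers(n: number): void;         // size the per-worker status section
  workerSet(workerId: number, line: string): void; // one status line per slot
  addChunks(n: number): void;          // running chunk total
  entry(line: string): void;           // persistent per-entry stderr line
}
```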
@@ -144,11 +181,12 @@ async function ingestInline(
     status: "ok",
     mime_type: "text/markdown",
     size_bytes: bytes.byteLength,
+    chunk_count: null,
     fetcher: "inline",
     source_sha256: sha,
   };
   try {
-    const versionId = await persistVersion(
+    const persisted = await persistVersion(
       ctx,
       {
         logicalPath,
@@ -168,7 +206,8 @@
       },
       (sublabel) => callbacks?.onEntryProgress?.(logicalPath, sublabel),
     );
-    result.version_id = versionId;
+    result.version_id = persisted.versionId;
+    result.chunk_count = persisted.chunkCount;
   } catch (err) {
     result.status = "failed";
     result.error = errorMessage(err);
@@ -195,6 +234,7 @@ async function ingestUrl(
     status: "ok",
     mime_type: null,
     size_bytes: 0,
+    chunk_count: null,
     fetcher: "downloader",
     source_sha256: "",
   };
@@ -225,7 +265,7 @@
      }
    }
 
-    const versionId = await pipelineForBytes(
+    const persisted = await pipelineForBytes(
       ctx,
       {
         logicalPath,
@@ -244,7 +284,8 @@
       },
       (sublabel) => callbacks?.onEntryProgress?.(url, sublabel),
     );
-    result.version_id = versionId;
+    result.version_id = persisted.versionId;
+    result.chunk_count = persisted.chunkCount;
   } catch (err) {
     result.status = "failed";
     result.error = errorMessage(err);
@@ -277,11 +318,28 @@ async function ingestLocalFiles(
     });
   }
 
-  const results: IngestEntryResult[] = [];
   const isMulti = resolved.entries.length > 1;
-
-  for (const entry of resolved.entries) {
-    callbacks?.onEntryStart?.(entry.relPathFromBase);
+  // Cap worker count by the actual file count so tiny batches don't pay
+  // the cost of spawning N threads (each loads ~130MB of model weights);
+  // also clamp by config and the global MAX_WORKERS ceiling.
+  const configured = Math.min(DEFAULTS.MAX_WORKERS, Math.max(1, ctx.config.ingest.worker_concurrency));
+  const workerCount = Math.max(1, Math.min(configured, resolved.entries.length));
+  callbacks?.onWorkerCount?.(workerCount);
+  const persistMutex = new AsyncMutex();
+  let anyOk = false;
+
+  // Each pMap worker pulls a file from the shared queue and runs the
+  // entire pipeline end-to-end (read → unchanged check → convert →
+  // describe → chunk → embed → persist). The persist phase is gated by a
+  // single mutex because all workers share one DuckDB connection and
+  // DuckDB rejects nested BEGINs. The embed step itself fans out across
+  // the per-command embedder subprocess pool that `add` / `refresh`
+  // register via `withEmbedderPool()` — so the WASM call truly
+  // parallelizes across cores instead of serializing on the main JS
+  // event loop. When that pool isn't registered (single-shot SDK call,
+  // `embedding.workers = 1`), embed() runs inline against the in-process
+  // extractor with no IPC overhead.
+  const outcomes = await pMap(resolved.entries, workerCount, async (entry, _index, workerId) => {
     const logicalPath = pickLogicalPath(input.logical_path, entry, isMulti);
     const result: IngestEntryResult = {
       source_path: entry.absPath,
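
The `./concurrency.ts` module itself isn't part of this diff. A sketch of the two primitives, inferred purely from the call sites above (queue semantics, `workerId` threading, per-item outcome capture, FIFO mutex); the real implementation may differ in detail:

```ts
type Outcome<T> = { ok: true; value: T } | { ok: false; error: unknown };

// Run `fn` over `items` with at most `limit` concurrent workers. Each worker
// passes its slot index through so callers can drive one status line per
// worker. Rejections are captured per-item instead of failing the batch.
export async function pMap<T, R>(
  items: readonly T[],
  limit: number,
  fn: (item: T, index: number, workerId: number) => Promise<R>,
): Promise<Outcome<R>[]> {
  const outcomes: Outcome<R>[] = new Array(items.length);
  let next = 0;
  const worker = async (workerId: number) => {
    // Single-threaded event loop: no await between check and increment,
    // so two workers can never claim the same index.
    while (next < items.length) {
      const i = next++;
      try {
        outcomes[i] = { ok: true, value: await fn(items[i]!, i, workerId) };
      } catch (error) {
        outcomes[i] = { ok: false, error };
      }
    }
  };
  await Promise.all(Array.from({ length: Math.min(limit, items.length) }, (_, w) => worker(w)));
  return outcomes;
}

// FIFO mutex: `lock` queues the callback behind whatever is already running
// and resolves with the callback's own result.
export class AsyncMutex {
  private tail: Promise<unknown> = Promise.resolve();
  lock<R>(fn: () => Promise<R>): Promise<R> {
    const run = this.tail.then(fn, fn);
    this.tail = run.catch(() => {}); // a rejected lock must not poison the queue
    return run;
  }
}
```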
@@ -290,10 +348,14 @@
       status: "ok",
       mime_type: null,
       size_bytes: 0,
+      chunk_count: null,
       fetcher: "local",
       source_sha256: "",
     };
+    callbacks?.onEntryStart?.(entry.relPathFromBase, workerId);
+    const onPhase = (sublabel: string) => callbacks?.onEntryProgress?.(entry.relPathFromBase, sublabel, workerId);
     try {
+      onPhase("reading");
       const local = await readLocalFile(entry.absPath);
       result.mime_type = local.mimeType;
       result.size_bytes = local.sizeBytes;
@@ -304,48 +366,189 @@
         if (cur && cur.source_sha256 === local.sha256) {
           result.status = "unchanged";
           result.version_id = cur.version_id;
-          results.push(result);
-          callbacks?.onEntryComplete?.(result);
-          continue;
+          callbacks?.onEntryComplete?.(result, workerId);
+          return result;
         }
       }
 
-      const versionId = await pipelineForBytes(
-        ctx,
-        {
+      onPhase("converting");
+      const conversion = await convert(
+        local.bytes,
+        local.mimeType,
+        entry.absPath,
+        ctx.config.llm,
+        ctx.config.converters,
+      );
+      const markdown = conversion.markdown;
+
+      onPhase("describing");
+      const description = await describe(logicalPath, local.mimeType, markdown, ctx.config.llm);
+
+      onPhase("chunking");
+      const chunks = chunkDeterministic(markdown, ctx.config.chunker);
+      const searchTexts = chunks.map((c) => buildSearchText(logicalPath, description, c.content));
+
+      let embeddings: number[][];
+      try {
+        embeddings = await embed(searchTexts, ctx.config.embedding_model, {
+          onProgress: (done, total) => onPhase(`embedding ${done}/${total}`),
+        });
+      } catch (err) {
+        throw asHelpful(
+          err,
+          `while embedding chunks for ${logicalPath}`,
+          "Run `bun run prebuild` to apply the transformers WASM patch, or set a different config.embedding_model.",
+        );
+      }
+
+      const versionId = await persistMutex.lock(async () => {
+        onPhase("persisting");
+        return persistOne(ctx, {
           logicalPath,
-          bytes: local.bytes,
-          mime: local.mimeType,
-          source: entry.absPath,
           sourceType: "local",
           sourcePath: entry.absPath,
           sourceMtimeMs: local.mtimeMs,
           sourceSha: local.sha256,
+          blobSha: local.sha256,
+          mime: local.mimeType,
+          bytes: local.bytes,
+          markdown,
+          description,
+          chunks,
+          searchTexts,
+          embeddings,
           fetcher: "local",
           downloader: null,
           downloaderArgs: null,
           refreshSec,
           changeNote: input.change_note ?? null,
-        },
-        (sublabel) => callbacks?.onEntryProgress?.(entry.relPathFromBase, sublabel),
-      );
+        });
+      });
       result.version_id = versionId;
+      result.chunk_count = chunks.length;
+      anyOk = true;
+      callbacks?.onChunks?.(chunks.length);
     } catch (err) {
       result.status = "failed";
       result.error = errorMessage(err);
-    } finally {
-      // Release the DB lock between files in a directory/glob walk so
-      // concurrent processes can wedge in mid-batch. The next entry's
-      // first DB call reopens (cheap — same-process reopen).
-      await ctx.db.release();
     }
-    results.push(result);
-    callbacks?.onEntryComplete?.(result);
+    callbacks?.onEntryComplete?.(result, workerId);
+    return result;
+  });
+
+  const results: IngestEntryResult[] = outcomes.map((o) => {
+    if (o.ok) return o.value;
+    // pMap caught a worker rejection — shouldn't happen since the worker
+    // catches its own errors, but surface defensively.
+    return {
+      source_path: "",
+      logical_path: "",
+      version_id: null,
+      status: "failed",
+      error: errorMessage(o.error),
+      mime_type: null,
+      size_bytes: 0,
+      chunk_count: null,
+      fetcher: "local",
+      source_sha256: "",
+    };
+  });
+
+  // Single FTS rebuild for the whole batch — replaces N per-entry rebuilds
+  // in the prior implementation. Skip when nothing was newly persisted.
+  if (anyOk) {
+    await rebuildFts(ctx.db);
   }
 
   return summarize(results);
 }
 
+/**
+ * Per-file persist payload. All inputs are precomputed by the worker; this
+ * helper just executes the transactional DB writes.
+ */
+interface PersistOneParams {
+  logicalPath: string;
+  sourceType: SourceType;
+  sourcePath: string | null;
+  sourceMtimeMs: number | null;
+  sourceSha: string;
+  blobSha: string | null;
+  mime: string;
+  bytes: Uint8Array | null;
+  markdown: string;
+  description: string;
+  chunks: { index: number; content: string }[];
+  searchTexts: string[];
+  embeddings: number[][];
+  fetcher: FetcherKind;
+  downloader: string | null;
+  downloaderArgs: Record<string, unknown> | null;
+  refreshSec: number | null;
+  changeNote: string | null;
+}
+
+/**
+ * Write blob + new (logical_path, version_id) row + its chunks under a
+ * single DuckDB transaction. ROLLBACK on failure keeps the row+chunks pair
+ * atomic; one COMMIT replaces ~N+2 autocommitted round-trips.
+ */
+async function persistOne(ctx: AppContext, p: PersistOneParams): Promise<string> {
+  const versionId = millisIso(Date.now());
+  const contentSha = sha256Hex(new TextEncoder().encode(p.markdown));
+  await ctx.db.exec("BEGIN TRANSACTION");
+  try {
+    if (p.bytes) {
+      await upsertBlob(ctx.db, {
+        sha256: p.sourceSha,
+        mime_type: p.mime,
+        size_bytes: p.bytes.byteLength,
+        bytes: p.bytes,
+      });
+    }
+    await insertVersion(ctx.db, {
+      logical_path: p.logicalPath,
+      version_id: versionId,
+      source_type: p.sourceType,
+      source_path: p.sourcePath,
+      source_mtime_ms: p.sourceMtimeMs,
+      source_sha256: p.sourceSha,
+      blob_sha256: p.blobSha,
+      content_sha256: contentSha,
+      content: p.markdown,
+      description: p.description,
+      mime_type: p.mime,
+      size_bytes: p.bytes?.byteLength ?? new TextEncoder().encode(p.markdown).byteLength,
+      fetcher: p.fetcher,
+      downloader: p.downloader,
+      downloader_args: p.downloaderArgs,
+      refresh_frequency_sec: p.refreshSec,
+      refreshed_at: new Date().toISOString(),
+      last_refresh_status: "ok",
+      change_note: p.changeNote,
+    });
+    await insertChunksForVersion(
+      ctx.db,
+      p.logicalPath,
+      versionId,
+      p.chunks.map((c, i) => ({
+        chunk_index: c.index,
+        chunk_content: c.content,
+        search_text: p.searchTexts[i] ?? buildSearchText(p.logicalPath, p.description, c.content),
+        embedding: p.embeddings[i] ?? new Array(p.embeddings[0]?.length ?? 0).fill(0),
+      })),
+    );
+    await ctx.db.exec("COMMIT");
+  } catch (err) {
+    await ctx.db.exec("ROLLBACK").catch(() => {
+      // Best effort — if ROLLBACK itself fails (already aborted, lock
+      // dropped, etc.) we still want the original error to surface.
+    });
+    throw err;
+  }
+  return versionId;
+}
+
 interface PipelineParams {
   logicalPath: string;
   bytes: Uint8Array;
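
The BEGIN/COMMIT/ROLLBACK framing that `persistOne` hand-rolls (and that `persistVersion` repeats below) could equally be factored into a small helper. A sketch of the pattern, assuming only the `ctx.db.exec` surface shown in this diff:

```ts
// Hypothetical helper capturing the transaction pattern used by persistOne
// and persistVersion; not part of the package.
async function withTransaction<R>(
  db: { exec(sql: string): Promise<void> },
  body: () => Promise<R>,
): Promise<R> {
  await db.exec("BEGIN TRANSACTION");
  try {
    const out = await body();
    await db.exec("COMMIT");
    return out;
  } catch (err) {
    // Best-effort rollback; surface the original error either way.
    await db.exec("ROLLBACK").catch(() => {});
    throw err;
  }
}
```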
@@ -373,7 +576,7 @@
 async function pipelineForBytes(
   ctx: AppContext,
   p: PipelineParams,
   onPhase?: (sublabel: string) => void,
-): Promise<string> {
+): Promise<{ versionId: string; chunkCount: number }> {
   onPhase?.("storing blob");
   await upsertBlob(ctx.db, {
     sha256: p.sourceSha,
@@ -438,7 +641,7 @@ async function persistVersion(
   ctx: AppContext,
   p: PersistParams,
   onPhase?: (sublabel: string) => void,
-): Promise<string> {
+): Promise<{ versionId: string; chunkCount: number }> {
   onPhase?.("describing");
   const description = await describe(p.logicalPath, p.mime, p.markdown, ctx.config.llm);
   onPhase?.("chunking");
@@ -460,42 +663,49 @@ async function persistVersion(
   onPhase?.("persisting");
   const versionId = millisIso(Date.now());
   const contentSha = p.contentSha ?? sha256Hex(new TextEncoder().encode(p.markdown));
-  await insertVersion(ctx.db, {
-    logical_path: p.logicalPath,
-    version_id: versionId,
-    source_type: p.sourceType,
-    source_path: p.sourcePath,
-    source_mtime_ms: p.sourceMtimeMs,
-    source_sha256: p.sourceSha,
-    blob_sha256: p.blobSha,
-    content_sha256: contentSha,
-    content: p.markdown,
-    description,
-    mime_type: p.mime,
-    size_bytes: p.bytes?.byteLength ?? new TextEncoder().encode(p.markdown).byteLength,
-    fetcher: p.fetcher,
-    downloader: p.downloader,
-    downloader_args: p.downloaderArgs,
-    refresh_frequency_sec: p.refreshSec,
-    refreshed_at: new Date().toISOString(),
-    last_refresh_status: "ok",
-    change_note: p.changeNote,
-  });
+  await ctx.db.exec("BEGIN TRANSACTION");
+  try {
+    await insertVersion(ctx.db, {
+      logical_path: p.logicalPath,
+      version_id: versionId,
+      source_type: p.sourceType,
+      source_path: p.sourcePath,
+      source_mtime_ms: p.sourceMtimeMs,
+      source_sha256: p.sourceSha,
+      blob_sha256: p.blobSha,
+      content_sha256: contentSha,
+      content: p.markdown,
+      description,
+      mime_type: p.mime,
+      size_bytes: p.bytes?.byteLength ?? new TextEncoder().encode(p.markdown).byteLength,
+      fetcher: p.fetcher,
+      downloader: p.downloader,
+      downloader_args: p.downloaderArgs,
+      refresh_frequency_sec: p.refreshSec,
+      refreshed_at: new Date().toISOString(),
+      last_refresh_status: "ok",
+      change_note: p.changeNote,
+    });
 
-  await insertChunksForVersion(
-    ctx.db,
-    p.logicalPath,
-    versionId,
-    chunks.map((c, i) => ({
-      chunk_index: c.index,
-      chunk_content: c.content,
-      search_text: searchTexts[i] ?? buildSearchText(p.logicalPath, description, c.content),
-      embedding: embeddings[i] ?? new Array(embeddings[0]?.length ?? 0).fill(0),
-    })),
-  );
+    await insertChunksForVersion(
+      ctx.db,
+      p.logicalPath,
+      versionId,
+      chunks.map((c, i) => ({
+        chunk_index: c.index,
+        chunk_content: c.content,
+        search_text: searchTexts[i] ?? buildSearchText(p.logicalPath, description, c.content),
+        embedding: embeddings[i] ?? new Array(embeddings[0]?.length ?? 0).fill(0),
+      })),
+    );
+    await ctx.db.exec("COMMIT");
+  } catch (err) {
+    await ctx.db.exec("ROLLBACK").catch(() => {});
+    throw err;
+  }
   onPhase?.("indexing");
   await rebuildFts(ctx.db);
-  return versionId;
+  return { versionId, chunkCount: chunks.length };
 }
 
 /**
@@ -9,7 +9,9 @@ import {
   ingestResolved,
 } from "../ingest/ingest.ts";
 import { type ResolvedSource, resolveSource } from "../ingest/source-resolver.ts";
-import { colors } from "../output/formatter.ts";
+import { colors, formatBytes } from "../output/formatter.ts";
+import { pieFor } from "../output/progress.ts";
+import { isInteractive } from "../output/tty.ts";
 import { defineOperation } from "./types.ts";
 
 const FetcherKindEnum = z.enum(["downloader", "local", "inline"]);
@@ -78,6 +80,7 @@ Pass \`logical_path\` to override. For a multi-source / directory / glob walk it
     error: z.string().optional(),
     mime_type: z.string().nullable(),
     size_bytes: z.number(),
+    chunk_count: z.number().nullable(),
     fetcher: FetcherKindEnum,
     source_sha256: z.string(),
   }),
@@ -92,24 +95,27 @@ Pass \`logical_path\` to override. For a multi-source / directory / glob walk it
     aliases: { logical_path: "-p", refresh_frequency: "-r", change_note: "-m", force: "-f" },
   },
   console_formatter: (result) => {
-    const lines = result.ingested.map((e) => {
-      if (e.status === "ok") {
-        return `${colors.green("✓")} ${colors.cyan(e.logical_path)} ${colors.dim(`(${e.fetcher}, ${e.size_bytes}B)`)}`;
-      }
-      if (e.status === "unchanged") {
-        return `${colors.dim("≡")} ${colors.cyan(e.logical_path)} ${colors.dim("(unchanged)")}`;
-      }
-      return `${colors.red("✗")} ${e.source_path} ${colors.dim(e.error ?? "")}`;
-    });
     const parts: string[] = [colors.green(`added ${result.ok}`)];
     if (result.unchanged > 0) parts.push(colors.dim(`unchanged ${result.unchanged}`));
     if (result.failed > 0) parts.push(colors.red(`failed ${result.failed}`));
-    return `${lines.join("\n")}\n${parts.join(", ")}`;
+    const summary = parts.join(", ");
+
+    // In interactive mode, every entry was already streamed to stderr via
+    // progress.entry() during ingest; printing the same list to stdout
+    // here would just duplicate the scrollback. Non-interactive callers
+    // (JSON, piped stdout, CI) don't see the live stream, so they still
+    // get the full per-entry list as the operation's stdout payload.
+    if (isInteractive()) return summary;
+
+    const lines = result.ingested.map(formatEntryLine);
+    return `${lines.join("\n")}\n${summary}`;
   },
   handler: async (input, ctx) => {
     // Spin up an ephemeral embedder pool for the whole `add` command —
     // `withEmbedderPool` handles the workers=1 short-circuit and disposes
-    // the children when the closure returns (see embedder-pool.ts).
+    // the children when the closure returns (see embedder-pool.ts). Inside
+    // the closure, every embed() call from the ingest pipeline transparently
+    // fans out to the subprocess pool.
    const workers = resolveEmbeddingWorkers(ctx.config.embedding.workers);
    return withEmbedderPool(workers, ctx.config.embedding_model, async () => {
      const { sources, ...rest } = input;
@@ -145,9 +151,27 @@ Pass \`logical_path\` to override. For a multi-source / directory / glob walk it
 
      ctx.progress.start(total, "ingest");
      const callbacks: IngestCallbacks = {
-        onEntryStart: (label) => ctx.progress.tick(label),
-        onEntryComplete: (entry) => ctx.progress.entry(formatEntryLine(entry)),
-        onEntryProgress: (_label, sublabel) => ctx.progress.update(sublabel),
+        // Counter advances on COMPLETION so concurrent prep doesn't race the
+        // bar to 100% before any file is fully persisted. The per-worker
+        // status section (one line per active worker) shows file + step in
+        // real time, prefixed with a pie glyph that fills as the per-file
+        // pipeline progresses. `setWorkers(n)` resizes the section whenever
+        // a new ingest source kicks off with its own pool size.
+        onWorkerCount: (n) => ctx.progress.setWorkers(n),
+        onEntryStart: (label, workerId) => {
+          if (workerId !== undefined) ctx.progress.workerSet(workerId, `${pieFor(undefined)} ${label}`);
+          ctx.progress.setLabel(label);
+        },
+        onEntryComplete: (entry, workerId) => {
+          if (workerId !== undefined) ctx.progress.workerSet(workerId, "");
+          ctx.progress.tick(entry.logical_path);
+          ctx.progress.entry(formatEntryLine(entry));
+        },
+        onEntryProgress: (label, sublabel, workerId) => {
+          if (workerId !== undefined) ctx.progress.workerSet(workerId, `${pieFor(sublabel)} ${label} — ${sublabel}`);
+          ctx.progress.update(sublabel);
+        },
+        onChunks: (n) => ctx.progress.addChunks(n),
      };
 
      for (const outcome of outcomes) {
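
`pieFor` itself isn't in this diff. From its call sites (undefined at entry start, a phase sublabel thereafter), something like the following hypothetical mapping would produce the fill-as-you-go glyph; the real `output/progress.ts` may differ:

```ts
// Hypothetical pieFor: map pipeline phase to a filling pie glyph.
const PHASES = ["reading", "converting", "describing", "chunking", "embedding", "persisting"];
const GLYPHS = ["○", "◔", "◑", "◕", "●"];

function pieFor(sublabel: string | undefined): string {
  if (sublabel === undefined) return GLYPHS[0]!; // entry just started
  const i = PHASES.findIndex((p) => sublabel.startsWith(p));
  if (i < 0) return GLYPHS[0]!;
  // Scale phase index onto the glyph ramp so the pie fills monotonically.
  return GLYPHS[Math.min(GLYPHS.length - 1, Math.round((i / (PHASES.length - 1)) * (GLYPHS.length - 1)))]!;
}
```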
@@ -160,6 +184,7 @@ Pass \`logical_path\` to override. For a multi-source / directory / glob walk it
          error: outcome.error.message,
          mime_type: null,
          size_bytes: 0,
+          chunk_count: null,
          fetcher: "local",
          source_sha256: "",
        };
@@ -188,6 +213,7 @@ Pass \`logical_path\` to override. For a multi-source / directory / glob walk it
          error: message,
          mime_type: null,
          size_bytes: 0,
+          chunk_count: null,
          fetcher: "local",
          source_sha256: "",
        };
@@ -215,11 +241,17 @@ Pass \`logical_path\` to override. For a multi-source / directory / glob walk it
 * Render the persistent stderr line shown for one completed entry. Mirrors
 * the glyphs used by the final `console_formatter` so users see the same
 * status indicators twice (once during ingest on stderr, once in the final
- * stdout summary).
+ * stdout summary). Successful entries show source kind, humanized byte
+ * size, and chunk count so the user can spot oddly small / oddly large
+ * files at a glance.
 */
function formatEntryLine(entry: IngestEntryResult): string {
  if (entry.status === "ok") {
-    return `${colors.green("✓")} ${colors.cyan(entry.logical_path)} ${colors.dim(`(${entry.fetcher}, ${entry.size_bytes}B)`)}`;
+    const parts: string[] = [entry.fetcher, formatBytes(entry.size_bytes)];
+    if (entry.chunk_count !== null) {
+      parts.push(`${entry.chunk_count} chunk${entry.chunk_count === 1 ? "" : "s"}`);
+    }
+    return `${colors.green("✓")} ${colors.cyan(entry.logical_path)} ${colors.dim(`(${parts.join(", ")})`)}`;
  }
  if (entry.status === "unchanged") {
    return `${colors.dim("≡")} ${colors.cyan(entry.logical_path)} ${colors.dim("(unchanged)")}`;
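
With the new pieces in place, entry lines render roughly as below. Paths are made up and the byte string depends on `formatBytes`, so the exact output is illustrative:

```
✓ docs/guide.md (local, 12.4 KB, 18 chunks)
≡ docs/intro.md (unchanged)
```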