membot 0.8.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -0
- package/package.json +1 -1
- package/src/config/schemas.ts +19 -0
- package/src/constants.ts +15 -0
- package/src/ingest/concurrency.ts +60 -0
- package/src/ingest/describer.ts +49 -3
- package/src/ingest/ingest.ts +277 -67
- package/src/operations/add.ts +49 -17
- package/src/operations/refresh.ts +43 -24
- package/src/output/formatter.ts +21 -0
- package/src/output/logger.ts +36 -0
- package/src/output/progress.ts +408 -46
package/src/ingest/ingest.ts
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
|
+
import { DEFAULTS } from "../constants.ts";
|
|
1
2
|
import type { AppContext } from "../context.ts";
|
|
2
3
|
import { upsertBlob } from "../db/blobs.ts";
|
|
3
4
|
import { insertChunksForVersion, rebuildFts } from "../db/chunks.ts";
|
|
4
5
|
import { type FetcherKind, getCurrent, insertVersion, millisIso, type SourceType } from "../db/files.ts";
|
|
5
6
|
import { asHelpful, HelpfulError } from "../errors.ts";
|
|
6
|
-
import {
|
|
7
|
+
import { pieFor } from "../output/progress.ts";
|
|
7
8
|
import { chunkDeterministic } from "./chunker.ts";
|
|
9
|
+
import { AsyncMutex, pMap } from "./concurrency.ts";
|
|
8
10
|
import { convert } from "./converter/index.ts";
|
|
9
11
|
import { describe } from "./describer.ts";
|
|
10
12
|
import { embed } from "./embedder.ts";
|
|
@@ -33,6 +35,7 @@ export interface IngestEntryResult {
|
|
|
33
35
|
error?: string;
|
|
34
36
|
mime_type: string | null;
|
|
35
37
|
size_bytes: number;
|
|
38
|
+
chunk_count: number | null;
|
|
36
39
|
fetcher: FetcherKind;
|
|
37
40
|
source_sha256: string;
|
|
38
41
|
}
|
|
@@ -51,17 +54,32 @@ export interface IngestResult {
|
|
|
51
54
|
* without re-resolving anything. `onEntryStart` fires before the pipeline
|
|
52
55
|
* touches an entry; `onEntryComplete` fires after the result (ok / unchanged
|
|
53
56
|
* / failed) is known. Both are optional.
|
|
57
|
+
*
|
|
58
|
+
* The optional `workerId` arg threads the slot index through so the UI can
|
|
59
|
+
* show one status line per in-flight worker; callers that don't want that
|
|
60
|
+
* detail simply ignore it.
|
|
54
61
|
*/
|
|
55
62
|
export interface IngestCallbacks {
|
|
56
|
-
onEntryStart?: (label: string) => void;
|
|
57
|
-
onEntryComplete?: (entry: IngestEntryResult) => void;
|
|
63
|
+
onEntryStart?: (label: string, workerId?: number) => void;
|
|
64
|
+
onEntryComplete?: (entry: IngestEntryResult, workerId?: number) => void;
|
|
58
65
|
/**
|
|
59
66
|
* Fires for sub-step progress within a single entry (e.g. "embedding
|
|
60
67
|
* 32/168"). The callback runs many times per entry and is intended for
|
|
61
68
|
* driving an interactive spinner — non-interactive callers should ignore
|
|
62
69
|
* it to avoid log spam.
|
|
63
70
|
*/
|
|
64
|
-
onEntryProgress?: (label: string, sublabel: string) => void;
|
|
71
|
+
onEntryProgress?: (label: string, sublabel: string, workerId?: number) => void;
|
|
72
|
+
/**
|
|
73
|
+
* Fires once after the worker pool size has been determined, before the
|
|
74
|
+
* first entry begins. Lets the progress reporter size its per-worker
|
|
75
|
+
* status section.
|
|
76
|
+
*/
|
|
77
|
+
onWorkerCount?: (n: number) => void;
|
|
78
|
+
/**
|
|
79
|
+
* Fires after each successful persist with the number of new chunks
|
|
80
|
+
* written, so the progress reporter can track a running total.
|
|
81
|
+
*/
|
|
82
|
+
onChunks?: (n: number) => void;
|
|
65
83
|
}
|
|
66
84
|
|
|
67
85
|
/**
|
|
@@ -92,7 +110,26 @@ export async function ingest(input: IngestInput, ctx: AppContext): Promise<Inges
|
|
|
92
110
|
const total = countResolvedEntries(resolved);
|
|
93
111
|
ctx.progress.start(total, "ingest");
|
|
94
112
|
const callbacks: IngestCallbacks = {
|
|
95
|
-
|
|
113
|
+
// Tick on completion so the bar reflects done-and-persisted entries,
|
|
114
|
+
// not concurrently-in-flight ones. setLabel shows the in-flight file
|
|
115
|
+
// without advancing the count; sub-step suffix flows via update; per-
|
|
116
|
+
// worker status lines + chunk total light up if the reporter supports
|
|
117
|
+
// them (multi-line UI in TTY, no-op otherwise). The pie glyph fills
|
|
118
|
+
// in as the per-file pipeline marches read → … → persist.
|
|
119
|
+
onWorkerCount: (n) => ctx.progress.setWorkers(n),
|
|
120
|
+
onEntryStart: (label, workerId) => {
|
|
121
|
+
if (workerId !== undefined) ctx.progress.workerSet(workerId, `${pieFor(undefined)} ${label}`);
|
|
122
|
+
ctx.progress.setLabel(label);
|
|
123
|
+
},
|
|
124
|
+
onEntryComplete: (entry, workerId) => {
|
|
125
|
+
if (workerId !== undefined) ctx.progress.workerSet(workerId, "");
|
|
126
|
+
ctx.progress.tick(entry.logical_path);
|
|
127
|
+
},
|
|
128
|
+
onEntryProgress: (label, sublabel, workerId) => {
|
|
129
|
+
if (workerId !== undefined) ctx.progress.workerSet(workerId, `${pieFor(sublabel)} ${label} — ${sublabel}`);
|
|
130
|
+
ctx.progress.update(sublabel);
|
|
131
|
+
},
|
|
132
|
+
onChunks: (n) => ctx.progress.addChunks(n),
|
|
96
133
|
};
|
|
97
134
|
const result = await ingestResolved(resolved, input, ctx, callbacks);
|
|
98
135
|
const okCount = result.ok;
|
|
@@ -144,11 +181,12 @@ async function ingestInline(
|
|
|
144
181
|
status: "ok",
|
|
145
182
|
mime_type: "text/markdown",
|
|
146
183
|
size_bytes: bytes.byteLength,
|
|
184
|
+
chunk_count: null,
|
|
147
185
|
fetcher: "inline",
|
|
148
186
|
source_sha256: sha,
|
|
149
187
|
};
|
|
150
188
|
try {
|
|
151
|
-
const
|
|
189
|
+
const persisted = await persistVersion(
|
|
152
190
|
ctx,
|
|
153
191
|
{
|
|
154
192
|
logicalPath,
|
|
@@ -168,7 +206,8 @@ async function ingestInline(
|
|
|
168
206
|
},
|
|
169
207
|
(sublabel) => callbacks?.onEntryProgress?.(logicalPath, sublabel),
|
|
170
208
|
);
|
|
171
|
-
result.version_id = versionId;
|
|
209
|
+
result.version_id = persisted.versionId;
|
|
210
|
+
result.chunk_count = persisted.chunkCount;
|
|
172
211
|
} catch (err) {
|
|
173
212
|
result.status = "failed";
|
|
174
213
|
result.error = errorMessage(err);
|
|
@@ -195,6 +234,7 @@ async function ingestUrl(
|
|
|
195
234
|
status: "ok",
|
|
196
235
|
mime_type: null,
|
|
197
236
|
size_bytes: 0,
|
|
237
|
+
chunk_count: null,
|
|
198
238
|
fetcher: "downloader",
|
|
199
239
|
source_sha256: "",
|
|
200
240
|
};
|
|
@@ -225,7 +265,7 @@ async function ingestUrl(
|
|
|
225
265
|
}
|
|
226
266
|
}
|
|
227
267
|
|
|
228
|
-
const
|
|
268
|
+
const persisted = await pipelineForBytes(
|
|
229
269
|
ctx,
|
|
230
270
|
{
|
|
231
271
|
logicalPath,
|
|
@@ -244,7 +284,8 @@ async function ingestUrl(
|
|
|
244
284
|
},
|
|
245
285
|
(sublabel) => callbacks?.onEntryProgress?.(url, sublabel),
|
|
246
286
|
);
|
|
247
|
-
result.version_id = versionId;
|
|
287
|
+
result.version_id = persisted.versionId;
|
|
288
|
+
result.chunk_count = persisted.chunkCount;
|
|
248
289
|
} catch (err) {
|
|
249
290
|
result.status = "failed";
|
|
250
291
|
result.error = errorMessage(err);
|
|
@@ -277,11 +318,28 @@ async function ingestLocalFiles(
|
|
|
277
318
|
});
|
|
278
319
|
}
|
|
279
320
|
|
|
280
|
-
const results: IngestEntryResult[] = [];
|
|
281
321
|
const isMulti = resolved.entries.length > 1;
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
322
|
+
// Cap worker count by the actual file count so tiny batches don't pay
|
|
323
|
+
// the cost of spawning N threads (each loads ~130MB of model weights);
|
|
324
|
+
// also clamp by config and the global MAX_WORKERS ceiling.
|
|
325
|
+
const configured = Math.min(DEFAULTS.MAX_WORKERS, Math.max(1, ctx.config.ingest.worker_concurrency));
|
|
326
|
+
const workerCount = Math.max(1, Math.min(configured, resolved.entries.length));
|
|
327
|
+
callbacks?.onWorkerCount?.(workerCount);
|
|
328
|
+
const persistMutex = new AsyncMutex();
|
|
329
|
+
let anyOk = false;
|
|
330
|
+
|
|
331
|
+
// Each pMap worker pulls a file from the shared queue and runs the
|
|
332
|
+
// entire pipeline end-to-end (read → unchanged check → convert →
|
|
333
|
+
// describe → chunk → embed → persist). The persist phase is gated by a
|
|
334
|
+
// single mutex because all workers share one DuckDB connection and
|
|
335
|
+
// DuckDB rejects nested BEGINs. The embed step itself fans out across
|
|
336
|
+
// the per-command embedder subprocess pool that `add` / `refresh`
|
|
337
|
+
// register via `withEmbedderPool()` — so the WASM call truly
|
|
338
|
+
// parallelizes across cores instead of serializing on the main JS
|
|
339
|
+
// event loop. When that pool isn't registered (single-shot SDK call,
|
|
340
|
+
// `embedding.workers = 1`), embed() runs inline against the in-process
|
|
341
|
+
// extractor with no IPC overhead.
|
|
342
|
+
const outcomes = await pMap(resolved.entries, workerCount, async (entry, _index, workerId) => {
|
|
285
343
|
const logicalPath = pickLogicalPath(input.logical_path, entry, isMulti);
|
|
286
344
|
const result: IngestEntryResult = {
|
|
287
345
|
source_path: entry.absPath,
|
|
@@ -290,10 +348,14 @@ async function ingestLocalFiles(
|
|
|
290
348
|
status: "ok",
|
|
291
349
|
mime_type: null,
|
|
292
350
|
size_bytes: 0,
|
|
351
|
+
chunk_count: null,
|
|
293
352
|
fetcher: "local",
|
|
294
353
|
source_sha256: "",
|
|
295
354
|
};
|
|
355
|
+
callbacks?.onEntryStart?.(entry.relPathFromBase, workerId);
|
|
356
|
+
const onPhase = (sublabel: string) => callbacks?.onEntryProgress?.(entry.relPathFromBase, sublabel, workerId);
|
|
296
357
|
try {
|
|
358
|
+
onPhase("reading");
|
|
297
359
|
const local = await readLocalFile(entry.absPath);
|
|
298
360
|
result.mime_type = local.mimeType;
|
|
299
361
|
result.size_bytes = local.sizeBytes;
|
|
@@ -304,48 +366,189 @@ async function ingestLocalFiles(
|
|
|
304
366
|
if (cur && cur.source_sha256 === local.sha256) {
|
|
305
367
|
result.status = "unchanged";
|
|
306
368
|
result.version_id = cur.version_id;
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
continue;
|
|
369
|
+
callbacks?.onEntryComplete?.(result, workerId);
|
|
370
|
+
return result;
|
|
310
371
|
}
|
|
311
372
|
}
|
|
312
373
|
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
374
|
+
onPhase("converting");
|
|
375
|
+
const conversion = await convert(
|
|
376
|
+
local.bytes,
|
|
377
|
+
local.mimeType,
|
|
378
|
+
entry.absPath,
|
|
379
|
+
ctx.config.llm,
|
|
380
|
+
ctx.config.converters,
|
|
381
|
+
);
|
|
382
|
+
const markdown = conversion.markdown;
|
|
383
|
+
|
|
384
|
+
onPhase("describing");
|
|
385
|
+
const description = await describe(logicalPath, local.mimeType, markdown, ctx.config.llm);
|
|
386
|
+
|
|
387
|
+
onPhase("chunking");
|
|
388
|
+
const chunks = chunkDeterministic(markdown, ctx.config.chunker);
|
|
389
|
+
const searchTexts = chunks.map((c) => buildSearchText(logicalPath, description, c.content));
|
|
390
|
+
|
|
391
|
+
let embeddings: number[][];
|
|
392
|
+
try {
|
|
393
|
+
embeddings = await embed(searchTexts, ctx.config.embedding_model, {
|
|
394
|
+
onProgress: (done, total) => onPhase(`embedding ${done}/${total}`),
|
|
395
|
+
});
|
|
396
|
+
} catch (err) {
|
|
397
|
+
throw asHelpful(
|
|
398
|
+
err,
|
|
399
|
+
`while embedding chunks for ${logicalPath}`,
|
|
400
|
+
"Run `bun run prebuild` to apply the transformers WASM patch, or set a different config.embedding_model.",
|
|
401
|
+
);
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
const versionId = await persistMutex.lock(async () => {
|
|
405
|
+
onPhase("persisting");
|
|
406
|
+
return persistOne(ctx, {
|
|
316
407
|
logicalPath,
|
|
317
|
-
bytes: local.bytes,
|
|
318
|
-
mime: local.mimeType,
|
|
319
|
-
source: entry.absPath,
|
|
320
408
|
sourceType: "local",
|
|
321
409
|
sourcePath: entry.absPath,
|
|
322
410
|
sourceMtimeMs: local.mtimeMs,
|
|
323
411
|
sourceSha: local.sha256,
|
|
412
|
+
blobSha: local.sha256,
|
|
413
|
+
mime: local.mimeType,
|
|
414
|
+
bytes: local.bytes,
|
|
415
|
+
markdown,
|
|
416
|
+
description,
|
|
417
|
+
chunks,
|
|
418
|
+
searchTexts,
|
|
419
|
+
embeddings,
|
|
324
420
|
fetcher: "local",
|
|
325
421
|
downloader: null,
|
|
326
422
|
downloaderArgs: null,
|
|
327
423
|
refreshSec,
|
|
328
424
|
changeNote: input.change_note ?? null,
|
|
329
|
-
}
|
|
330
|
-
|
|
331
|
-
);
|
|
425
|
+
});
|
|
426
|
+
});
|
|
332
427
|
result.version_id = versionId;
|
|
428
|
+
result.chunk_count = chunks.length;
|
|
429
|
+
anyOk = true;
|
|
430
|
+
callbacks?.onChunks?.(chunks.length);
|
|
333
431
|
} catch (err) {
|
|
334
432
|
result.status = "failed";
|
|
335
433
|
result.error = errorMessage(err);
|
|
336
|
-
} finally {
|
|
337
|
-
// Release the DB lock between files in a directory/glob walk so
|
|
338
|
-
// concurrent processes can wedge in mid-batch. The next entry's
|
|
339
|
-
// first DB call reopens (cheap — same-process reopen).
|
|
340
|
-
await ctx.db.release();
|
|
341
434
|
}
|
|
342
|
-
|
|
343
|
-
|
|
435
|
+
callbacks?.onEntryComplete?.(result, workerId);
|
|
436
|
+
return result;
|
|
437
|
+
});
|
|
438
|
+
|
|
439
|
+
const results: IngestEntryResult[] = outcomes.map((o) => {
|
|
440
|
+
if (o.ok) return o.value;
|
|
441
|
+
// pMap caught a worker rejection — shouldn't happen since the worker
|
|
442
|
+
// catches its own errors, but surface defensively.
|
|
443
|
+
return {
|
|
444
|
+
source_path: "",
|
|
445
|
+
logical_path: "",
|
|
446
|
+
version_id: null,
|
|
447
|
+
status: "failed",
|
|
448
|
+
error: errorMessage(o.error),
|
|
449
|
+
mime_type: null,
|
|
450
|
+
size_bytes: 0,
|
|
451
|
+
chunk_count: null,
|
|
452
|
+
fetcher: "local",
|
|
453
|
+
source_sha256: "",
|
|
454
|
+
};
|
|
455
|
+
});
|
|
456
|
+
|
|
457
|
+
// Single FTS rebuild for the whole batch — replaces N per-entry rebuilds
|
|
458
|
+
// in the prior implementation. Skip when nothing was newly persisted.
|
|
459
|
+
if (anyOk) {
|
|
460
|
+
await rebuildFts(ctx.db);
|
|
344
461
|
}
|
|
345
462
|
|
|
346
463
|
return summarize(results);
|
|
347
464
|
}
|
|
348
465
|
|
|
466
|
+
/**
 * Everything `persistOne` needs to write a single file's version. The worker
 * computes all of these values up front; `persistOne` only performs the DB
 * writes with them.
 */
interface PersistOneParams {
  // --- identity / provenance (copied onto the version row) ---

  /** Logical path the version row and its chunks are stored under. */
  logicalPath: string;
  /** Stored as `source_type` on the version row. */
  sourceType: SourceType;
  /** Stored as `source_path` on the version row. */
  sourcePath: string | null;
  /** Stored as `source_mtime_ms` on the version row. */
  sourceMtimeMs: number | null;
  /** Stored as `source_sha256` on the row; also the key the blob is upserted under. */
  sourceSha: string;
  /** Stored as `blob_sha256` on the version row. */
  blobSha: string | null;
  /** MIME type recorded on both the blob and the version row. */
  mime: string;

  // --- content ---

  /**
   * Raw source bytes. When non-null they are upserted as a blob and their
   * length becomes the row's `size_bytes`; when null no blob is written and
   * the size falls back to the UTF-8 length of `markdown`.
   */
  bytes: Uint8Array | null;
  /** Converted markdown; stored as the row's `content` and hashed for `content_sha256`. */
  markdown: string;
  /** Stored as the row's `description`. */
  description: string;

  // --- chunk data (the three arrays are matched up by position) ---

  /** Chunks to insert; `index` becomes `chunk_index`, `content` becomes `chunk_content`. */
  chunks: Array<{ index: number; content: string }>;
  /** Search text per chunk; a missing entry is rebuilt with `buildSearchText`. */
  searchTexts: string[];
  /** Embedding per chunk; a missing entry is replaced by a zero vector as wide as the first embedding. */
  embeddings: number[][];

  // --- fetch / refresh metadata (copied onto the version row) ---

  /** Stored as `fetcher` on the version row. */
  fetcher: FetcherKind;
  /** Stored as `downloader` on the version row. */
  downloader: string | null;
  /** Stored as `downloader_args` on the version row. */
  downloaderArgs: Record<string, unknown> | null;
  /** Stored as `refresh_frequency_sec` on the version row. */
  refreshSec: number | null;
  /** Stored as `change_note` on the version row. */
  changeNote: string | null;
}
|
|
490
|
+
|
|
491
|
+
/**
 * Write the blob (when raw bytes are present), a new version row, and that
 * version's chunks inside one explicit transaction. On any failure a
 * best-effort ROLLBACK is issued and the original error is rethrown, so the
 * row and its chunks are committed together or not at all.
 *
 * @param ctx - Application context; only `ctx.db` is used here.
 * @param p - Precomputed payload (markdown, description, chunks, search
 *   texts, embeddings and source metadata).
 * @returns The version id generated for the new row.
 * @throws Rethrows whatever a DB write throws, after attempting ROLLBACK.
 */
async function persistOne(ctx: AppContext, p: PersistOneParams): Promise<string> {
  const versionId = millisIso(Date.now());

  // Encode the markdown once: the same bytes feed the content hash and the
  // size fallback used when no raw source bytes were supplied.
  const markdownBytes = new TextEncoder().encode(p.markdown);
  const contentSha = sha256Hex(markdownBytes);
  const sizeBytes = p.bytes?.byteLength ?? markdownBytes.byteLength;

  // Width of the zero vector substituted for any chunk lacking an embedding.
  const embeddingWidth = p.embeddings[0]?.length ?? 0;

  await ctx.db.exec("BEGIN TRANSACTION");
  try {
    // The blob is only stored when the caller handed over raw bytes.
    if (p.bytes) {
      await upsertBlob(ctx.db, {
        sha256: p.sourceSha,
        mime_type: p.mime,
        size_bytes: p.bytes.byteLength,
        bytes: p.bytes,
      });
    }

    await insertVersion(ctx.db, {
      logical_path: p.logicalPath,
      version_id: versionId,
      source_type: p.sourceType,
      source_path: p.sourcePath,
      source_mtime_ms: p.sourceMtimeMs,
      source_sha256: p.sourceSha,
      blob_sha256: p.blobSha,
      content_sha256: contentSha,
      content: p.markdown,
      description: p.description,
      mime_type: p.mime,
      size_bytes: sizeBytes,
      fetcher: p.fetcher,
      downloader: p.downloader,
      downloader_args: p.downloaderArgs,
      refresh_frequency_sec: p.refreshSec,
      refreshed_at: new Date().toISOString(),
      last_refresh_status: "ok",
      change_note: p.changeNote,
    });

    // chunks / searchTexts / embeddings are matched by position; a missing
    // search text is rebuilt and a missing embedding becomes a zero vector.
    await insertChunksForVersion(
      ctx.db,
      p.logicalPath,
      versionId,
      p.chunks.map((c, i) => ({
        chunk_index: c.index,
        chunk_content: c.content,
        search_text: p.searchTexts[i] ?? buildSearchText(p.logicalPath, p.description, c.content),
        embedding: p.embeddings[i] ?? new Array(embeddingWidth).fill(0),
      })),
    );

    await ctx.db.exec("COMMIT");
  } catch (err) {
    await ctx.db.exec("ROLLBACK").catch(() => {
      // Best effort — if ROLLBACK itself fails (already aborted, lock
      // dropped, etc.) we still want the original error to surface.
    });
    throw err;
  }
  return versionId;
}
|
|
551
|
+
|
|
349
552
|
interface PipelineParams {
|
|
350
553
|
logicalPath: string;
|
|
351
554
|
bytes: Uint8Array;
|
|
@@ -373,7 +576,7 @@ async function pipelineForBytes(
|
|
|
373
576
|
ctx: AppContext,
|
|
374
577
|
p: PipelineParams,
|
|
375
578
|
onPhase?: (sublabel: string) => void,
|
|
376
|
-
): Promise<string> {
|
|
579
|
+
): Promise<{ versionId: string; chunkCount: number }> {
|
|
377
580
|
onPhase?.("storing blob");
|
|
378
581
|
await upsertBlob(ctx.db, {
|
|
379
582
|
sha256: p.sourceSha,
|
|
@@ -438,7 +641,7 @@ async function persistVersion(
|
|
|
438
641
|
ctx: AppContext,
|
|
439
642
|
p: PersistParams,
|
|
440
643
|
onPhase?: (sublabel: string) => void,
|
|
441
|
-
): Promise<string> {
|
|
644
|
+
): Promise<{ versionId: string; chunkCount: number }> {
|
|
442
645
|
onPhase?.("describing");
|
|
443
646
|
const description = await describe(p.logicalPath, p.mime, p.markdown, ctx.config.llm);
|
|
444
647
|
onPhase?.("chunking");
|
|
@@ -460,42 +663,49 @@ async function persistVersion(
|
|
|
460
663
|
onPhase?.("persisting");
|
|
461
664
|
const versionId = millisIso(Date.now());
|
|
462
665
|
const contentSha = p.contentSha ?? sha256Hex(new TextEncoder().encode(p.markdown));
|
|
463
|
-
await
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
666
|
+
await ctx.db.exec("BEGIN TRANSACTION");
|
|
667
|
+
try {
|
|
668
|
+
await insertVersion(ctx.db, {
|
|
669
|
+
logical_path: p.logicalPath,
|
|
670
|
+
version_id: versionId,
|
|
671
|
+
source_type: p.sourceType,
|
|
672
|
+
source_path: p.sourcePath,
|
|
673
|
+
source_mtime_ms: p.sourceMtimeMs,
|
|
674
|
+
source_sha256: p.sourceSha,
|
|
675
|
+
blob_sha256: p.blobSha,
|
|
676
|
+
content_sha256: contentSha,
|
|
677
|
+
content: p.markdown,
|
|
678
|
+
description,
|
|
679
|
+
mime_type: p.mime,
|
|
680
|
+
size_bytes: p.bytes?.byteLength ?? new TextEncoder().encode(p.markdown).byteLength,
|
|
681
|
+
fetcher: p.fetcher,
|
|
682
|
+
downloader: p.downloader,
|
|
683
|
+
downloader_args: p.downloaderArgs,
|
|
684
|
+
refresh_frequency_sec: p.refreshSec,
|
|
685
|
+
refreshed_at: new Date().toISOString(),
|
|
686
|
+
last_refresh_status: "ok",
|
|
687
|
+
change_note: p.changeNote,
|
|
688
|
+
});
|
|
484
689
|
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
690
|
+
await insertChunksForVersion(
|
|
691
|
+
ctx.db,
|
|
692
|
+
p.logicalPath,
|
|
693
|
+
versionId,
|
|
694
|
+
chunks.map((c, i) => ({
|
|
695
|
+
chunk_index: c.index,
|
|
696
|
+
chunk_content: c.content,
|
|
697
|
+
search_text: searchTexts[i] ?? buildSearchText(p.logicalPath, description, c.content),
|
|
698
|
+
embedding: embeddings[i] ?? new Array(embeddings[0]?.length ?? 0).fill(0),
|
|
699
|
+
})),
|
|
700
|
+
);
|
|
701
|
+
await ctx.db.exec("COMMIT");
|
|
702
|
+
} catch (err) {
|
|
703
|
+
await ctx.db.exec("ROLLBACK").catch(() => {});
|
|
704
|
+
throw err;
|
|
705
|
+
}
|
|
496
706
|
onPhase?.("indexing");
|
|
497
707
|
await rebuildFts(ctx.db);
|
|
498
|
-
return versionId;
|
|
708
|
+
return { versionId, chunkCount: chunks.length };
|
|
499
709
|
}
|
|
500
710
|
|
|
501
711
|
/**
|
package/src/operations/add.ts
CHANGED
|
@@ -9,7 +9,9 @@ import {
|
|
|
9
9
|
ingestResolved,
|
|
10
10
|
} from "../ingest/ingest.ts";
|
|
11
11
|
import { type ResolvedSource, resolveSource } from "../ingest/source-resolver.ts";
|
|
12
|
-
import { colors } from "../output/formatter.ts";
|
|
12
|
+
import { colors, formatBytes } from "../output/formatter.ts";
|
|
13
|
+
import { pieFor } from "../output/progress.ts";
|
|
14
|
+
import { isInteractive } from "../output/tty.ts";
|
|
13
15
|
import { defineOperation } from "./types.ts";
|
|
14
16
|
|
|
15
17
|
const FetcherKindEnum = z.enum(["downloader", "local", "inline"]);
|
|
@@ -78,6 +80,7 @@ Pass \`logical_path\` to override. For a multi-source / directory / glob walk it
|
|
|
78
80
|
error: z.string().optional(),
|
|
79
81
|
mime_type: z.string().nullable(),
|
|
80
82
|
size_bytes: z.number(),
|
|
83
|
+
chunk_count: z.number().nullable(),
|
|
81
84
|
fetcher: FetcherKindEnum,
|
|
82
85
|
source_sha256: z.string(),
|
|
83
86
|
}),
|
|
@@ -92,24 +95,27 @@ Pass \`logical_path\` to override. For a multi-source / directory / glob walk it
|
|
|
92
95
|
aliases: { logical_path: "-p", refresh_frequency: "-r", change_note: "-m", force: "-f" },
|
|
93
96
|
},
|
|
94
97
|
console_formatter: (result) => {
|
|
95
|
-
const lines = result.ingested.map((e) => {
|
|
96
|
-
if (e.status === "ok") {
|
|
97
|
-
return `${colors.green("✓")} ${colors.cyan(e.logical_path)} ${colors.dim(`(${e.fetcher}, ${e.size_bytes}B)`)}`;
|
|
98
|
-
}
|
|
99
|
-
if (e.status === "unchanged") {
|
|
100
|
-
return `${colors.dim("≡")} ${colors.cyan(e.logical_path)} ${colors.dim("(unchanged)")}`;
|
|
101
|
-
}
|
|
102
|
-
return `${colors.red("✗")} ${e.source_path} ${colors.dim(e.error ?? "")}`;
|
|
103
|
-
});
|
|
104
98
|
const parts: string[] = [colors.green(`added ${result.ok}`)];
|
|
105
99
|
if (result.unchanged > 0) parts.push(colors.dim(`unchanged ${result.unchanged}`));
|
|
106
100
|
if (result.failed > 0) parts.push(colors.red(`failed ${result.failed}`));
|
|
107
|
-
|
|
101
|
+
const summary = parts.join(", ");
|
|
102
|
+
|
|
103
|
+
// In interactive mode, every entry was already streamed to stderr via
|
|
104
|
+
// progress.entry() during ingest; printing the same list to stdout
|
|
105
|
+
// here would just duplicate the scrollback. Non-interactive callers
|
|
106
|
+
// (JSON, piped stdout, CI) don't see the live stream, so they still
|
|
107
|
+
// get the full per-entry list as the operation's stdout payload.
|
|
108
|
+
if (isInteractive()) return summary;
|
|
109
|
+
|
|
110
|
+
const lines = result.ingested.map(formatEntryLine);
|
|
111
|
+
return `${lines.join("\n")}\n${summary}`;
|
|
108
112
|
},
|
|
109
113
|
handler: async (input, ctx) => {
|
|
110
114
|
// Spin up an ephemeral embedder pool for the whole `add` command —
|
|
111
115
|
// `withEmbedderPool` handles the workers=1 short-circuit and disposes
|
|
112
|
-
// the children when the closure returns (see embedder-pool.ts).
|
|
116
|
+
// the children when the closure returns (see embedder-pool.ts). Inside
|
|
117
|
+
// the closure, every embed() call from the ingest pipeline transparently
|
|
118
|
+
// fans out to the subprocess pool.
|
|
113
119
|
const workers = resolveEmbeddingWorkers(ctx.config.embedding.workers);
|
|
114
120
|
return withEmbedderPool(workers, ctx.config.embedding_model, async () => {
|
|
115
121
|
const { sources, ...rest } = input;
|
|
@@ -145,9 +151,27 @@ Pass \`logical_path\` to override. For a multi-source / directory / glob walk it
|
|
|
145
151
|
|
|
146
152
|
ctx.progress.start(total, "ingest");
|
|
147
153
|
const callbacks: IngestCallbacks = {
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
154
|
+
// Counter advances on COMPLETION so concurrent prep doesn't race the
|
|
155
|
+
// bar to 100% before any file is fully persisted. The per-worker
|
|
156
|
+
// status section (one line per active worker) shows file + step in
|
|
157
|
+
// real time, prefixed with a pie glyph that fills as the per-file
|
|
158
|
+
// pipeline progresses. `setWorkers(n)` resizes the section whenever
|
|
159
|
+
// a new ingest source kicks off with its own pool size.
|
|
160
|
+
onWorkerCount: (n) => ctx.progress.setWorkers(n),
|
|
161
|
+
onEntryStart: (label, workerId) => {
|
|
162
|
+
if (workerId !== undefined) ctx.progress.workerSet(workerId, `${pieFor(undefined)} ${label}`);
|
|
163
|
+
ctx.progress.setLabel(label);
|
|
164
|
+
},
|
|
165
|
+
onEntryComplete: (entry, workerId) => {
|
|
166
|
+
if (workerId !== undefined) ctx.progress.workerSet(workerId, "");
|
|
167
|
+
ctx.progress.tick(entry.logical_path);
|
|
168
|
+
ctx.progress.entry(formatEntryLine(entry));
|
|
169
|
+
},
|
|
170
|
+
onEntryProgress: (label, sublabel, workerId) => {
|
|
171
|
+
if (workerId !== undefined) ctx.progress.workerSet(workerId, `${pieFor(sublabel)} ${label} — ${sublabel}`);
|
|
172
|
+
ctx.progress.update(sublabel);
|
|
173
|
+
},
|
|
174
|
+
onChunks: (n) => ctx.progress.addChunks(n),
|
|
151
175
|
};
|
|
152
176
|
|
|
153
177
|
for (const outcome of outcomes) {
|
|
@@ -160,6 +184,7 @@ Pass \`logical_path\` to override. For a multi-source / directory / glob walk it
|
|
|
160
184
|
error: outcome.error.message,
|
|
161
185
|
mime_type: null,
|
|
162
186
|
size_bytes: 0,
|
|
187
|
+
chunk_count: null,
|
|
163
188
|
fetcher: "local",
|
|
164
189
|
source_sha256: "",
|
|
165
190
|
};
|
|
@@ -188,6 +213,7 @@ Pass \`logical_path\` to override. For a multi-source / directory / glob walk it
|
|
|
188
213
|
error: message,
|
|
189
214
|
mime_type: null,
|
|
190
215
|
size_bytes: 0,
|
|
216
|
+
chunk_count: null,
|
|
191
217
|
fetcher: "local",
|
|
192
218
|
source_sha256: "",
|
|
193
219
|
};
|
|
@@ -215,11 +241,17 @@ Pass \`logical_path\` to override. For a multi-source / directory / glob walk it
|
|
|
215
241
|
* Render the persistent stderr line shown for one completed entry. Mirrors
|
|
216
242
|
* the glyphs used by the final `console_formatter` so users see the same
|
|
217
243
|
* status indicators twice (once during ingest on stderr, once in the final
|
|
218
|
-
* stdout summary).
|
|
244
|
+
* stdout summary). Successful entries show source kind, humanized byte
|
|
245
|
+
* size, and chunk count so the user can spot oddly small / oddly large
|
|
246
|
+
* files at a glance.
|
|
219
247
|
*/
|
|
220
248
|
function formatEntryLine(entry: IngestEntryResult): string {
|
|
221
249
|
if (entry.status === "ok") {
|
|
222
|
-
|
|
250
|
+
const parts: string[] = [entry.fetcher, formatBytes(entry.size_bytes)];
|
|
251
|
+
if (entry.chunk_count !== null) {
|
|
252
|
+
parts.push(`${entry.chunk_count} chunk${entry.chunk_count === 1 ? "" : "s"}`);
|
|
253
|
+
}
|
|
254
|
+
return `${colors.green("✓")} ${colors.cyan(entry.logical_path)} ${colors.dim(`(${parts.join(", ")})`)}`;
|
|
223
255
|
}
|
|
224
256
|
if (entry.status === "unchanged") {
|
|
225
257
|
return `${colors.dim("≡")} ${colors.cyan(entry.logical_path)} ${colors.dim("(unchanged)")}`;
|