membot 0.7.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/membot.md +3 -0
- package/.cursor/rules/membot.mdc +3 -0
- package/README.md +7 -0
- package/package.json +1 -1
- package/src/cli.ts +11 -0
- package/src/config/schemas.ts +33 -0
- package/src/constants.ts +23 -0
- package/src/context.ts +24 -0
- package/src/ingest/concurrency.ts +60 -0
- package/src/ingest/describer.ts +49 -3
- package/src/ingest/embed-worker.ts +74 -0
- package/src/ingest/embedder-pool.ts +391 -0
- package/src/ingest/embedder.ts +40 -2
- package/src/ingest/ingest.ts +277 -67
- package/src/operations/add.ts +139 -99
- package/src/operations/index.ts +2 -0
- package/src/operations/refresh.ts +61 -34
- package/src/operations/stats.ts +342 -0
- package/src/operations/write.ts +48 -40
- package/src/output/formatter.ts +21 -0
- package/src/output/logger.ts +36 -0
- package/src/output/progress.ts +408 -46
- package/src/refresh/scheduler.ts +22 -13
package/src/ingest/ingest.ts
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
|
+
import { DEFAULTS } from "../constants.ts";
|
|
1
2
|
import type { AppContext } from "../context.ts";
|
|
2
3
|
import { upsertBlob } from "../db/blobs.ts";
|
|
3
4
|
import { insertChunksForVersion, rebuildFts } from "../db/chunks.ts";
|
|
4
5
|
import { type FetcherKind, getCurrent, insertVersion, millisIso, type SourceType } from "../db/files.ts";
|
|
5
6
|
import { asHelpful, HelpfulError } from "../errors.ts";
|
|
6
|
-
import {
|
|
7
|
+
import { pieFor } from "../output/progress.ts";
|
|
7
8
|
import { chunkDeterministic } from "./chunker.ts";
|
|
9
|
+
import { AsyncMutex, pMap } from "./concurrency.ts";
|
|
8
10
|
import { convert } from "./converter/index.ts";
|
|
9
11
|
import { describe } from "./describer.ts";
|
|
10
12
|
import { embed } from "./embedder.ts";
|
|
@@ -33,6 +35,7 @@ export interface IngestEntryResult {
|
|
|
33
35
|
error?: string;
|
|
34
36
|
mime_type: string | null;
|
|
35
37
|
size_bytes: number;
|
|
38
|
+
chunk_count: number | null;
|
|
36
39
|
fetcher: FetcherKind;
|
|
37
40
|
source_sha256: string;
|
|
38
41
|
}
|
|
@@ -51,17 +54,32 @@ export interface IngestResult {
|
|
|
51
54
|
* without re-resolving anything. `onEntryStart` fires before the pipeline
|
|
52
55
|
* touches an entry; `onEntryComplete` fires after the result (ok / unchanged
|
|
53
56
|
* / failed) is known. Both are optional.
|
|
57
|
+
*
|
|
58
|
+
* The optional `workerId` arg threads the slot index through so the UI can
|
|
59
|
+
* show one status line per in-flight worker; callers that don't want that
|
|
60
|
+
* detail simply ignore it.
|
|
54
61
|
*/
|
|
55
62
|
export interface IngestCallbacks {
|
|
56
|
-
onEntryStart?: (label: string) => void;
|
|
57
|
-
onEntryComplete?: (entry: IngestEntryResult) => void;
|
|
63
|
+
onEntryStart?: (label: string, workerId?: number) => void;
|
|
64
|
+
onEntryComplete?: (entry: IngestEntryResult, workerId?: number) => void;
|
|
58
65
|
/**
|
|
59
66
|
* Fires for sub-step progress within a single entry (e.g. "embedding
|
|
60
67
|
* 32/168"). The callback runs many times per entry and is intended for
|
|
61
68
|
* driving an interactive spinner — non-interactive callers should ignore
|
|
62
69
|
* it to avoid log spam.
|
|
63
70
|
*/
|
|
64
|
-
onEntryProgress?: (label: string, sublabel: string) => void;
|
|
71
|
+
onEntryProgress?: (label: string, sublabel: string, workerId?: number) => void;
|
|
72
|
+
/**
|
|
73
|
+
* Fires once after the worker pool size has been determined, before the
|
|
74
|
+
* first entry begins. Lets the progress reporter size its per-worker
|
|
75
|
+
* status section.
|
|
76
|
+
*/
|
|
77
|
+
onWorkerCount?: (n: number) => void;
|
|
78
|
+
/**
|
|
79
|
+
* Fires after each successful persist with the number of new chunks
|
|
80
|
+
* written, so the progress reporter can track a running total.
|
|
81
|
+
*/
|
|
82
|
+
onChunks?: (n: number) => void;
|
|
65
83
|
}
|
|
66
84
|
|
|
67
85
|
/**
|
|
@@ -92,7 +110,26 @@ export async function ingest(input: IngestInput, ctx: AppContext): Promise<Inges
|
|
|
92
110
|
const total = countResolvedEntries(resolved);
|
|
93
111
|
ctx.progress.start(total, "ingest");
|
|
94
112
|
const callbacks: IngestCallbacks = {
|
|
95
|
-
|
|
113
|
+
// Tick on completion so the bar reflects done-and-persisted entries,
|
|
114
|
+
// not concurrently-in-flight ones. setLabel shows the in-flight file
|
|
115
|
+
// without advancing the count; sub-step suffix flows via update; per-
|
|
116
|
+
// worker status lines + chunk total light up if the reporter supports
|
|
117
|
+
// them (multi-line UI in TTY, no-op otherwise). The pie glyph fills
|
|
118
|
+
// in as the per-file pipeline marches read → … → persist.
|
|
119
|
+
onWorkerCount: (n) => ctx.progress.setWorkers(n),
|
|
120
|
+
onEntryStart: (label, workerId) => {
|
|
121
|
+
if (workerId !== undefined) ctx.progress.workerSet(workerId, `${pieFor(undefined)} ${label}`);
|
|
122
|
+
ctx.progress.setLabel(label);
|
|
123
|
+
},
|
|
124
|
+
onEntryComplete: (entry, workerId) => {
|
|
125
|
+
if (workerId !== undefined) ctx.progress.workerSet(workerId, "");
|
|
126
|
+
ctx.progress.tick(entry.logical_path);
|
|
127
|
+
},
|
|
128
|
+
onEntryProgress: (label, sublabel, workerId) => {
|
|
129
|
+
if (workerId !== undefined) ctx.progress.workerSet(workerId, `${pieFor(sublabel)} ${label} — ${sublabel}`);
|
|
130
|
+
ctx.progress.update(sublabel);
|
|
131
|
+
},
|
|
132
|
+
onChunks: (n) => ctx.progress.addChunks(n),
|
|
96
133
|
};
|
|
97
134
|
const result = await ingestResolved(resolved, input, ctx, callbacks);
|
|
98
135
|
const okCount = result.ok;
|
|
@@ -144,11 +181,12 @@ async function ingestInline(
|
|
|
144
181
|
status: "ok",
|
|
145
182
|
mime_type: "text/markdown",
|
|
146
183
|
size_bytes: bytes.byteLength,
|
|
184
|
+
chunk_count: null,
|
|
147
185
|
fetcher: "inline",
|
|
148
186
|
source_sha256: sha,
|
|
149
187
|
};
|
|
150
188
|
try {
|
|
151
|
-
const
|
|
189
|
+
const persisted = await persistVersion(
|
|
152
190
|
ctx,
|
|
153
191
|
{
|
|
154
192
|
logicalPath,
|
|
@@ -168,7 +206,8 @@ async function ingestInline(
|
|
|
168
206
|
},
|
|
169
207
|
(sublabel) => callbacks?.onEntryProgress?.(logicalPath, sublabel),
|
|
170
208
|
);
|
|
171
|
-
result.version_id = versionId;
|
|
209
|
+
result.version_id = persisted.versionId;
|
|
210
|
+
result.chunk_count = persisted.chunkCount;
|
|
172
211
|
} catch (err) {
|
|
173
212
|
result.status = "failed";
|
|
174
213
|
result.error = errorMessage(err);
|
|
@@ -195,6 +234,7 @@ async function ingestUrl(
|
|
|
195
234
|
status: "ok",
|
|
196
235
|
mime_type: null,
|
|
197
236
|
size_bytes: 0,
|
|
237
|
+
chunk_count: null,
|
|
198
238
|
fetcher: "downloader",
|
|
199
239
|
source_sha256: "",
|
|
200
240
|
};
|
|
@@ -225,7 +265,7 @@ async function ingestUrl(
|
|
|
225
265
|
}
|
|
226
266
|
}
|
|
227
267
|
|
|
228
|
-
const
|
|
268
|
+
const persisted = await pipelineForBytes(
|
|
229
269
|
ctx,
|
|
230
270
|
{
|
|
231
271
|
logicalPath,
|
|
@@ -244,7 +284,8 @@ async function ingestUrl(
|
|
|
244
284
|
},
|
|
245
285
|
(sublabel) => callbacks?.onEntryProgress?.(url, sublabel),
|
|
246
286
|
);
|
|
247
|
-
result.version_id = versionId;
|
|
287
|
+
result.version_id = persisted.versionId;
|
|
288
|
+
result.chunk_count = persisted.chunkCount;
|
|
248
289
|
} catch (err) {
|
|
249
290
|
result.status = "failed";
|
|
250
291
|
result.error = errorMessage(err);
|
|
@@ -277,11 +318,28 @@ async function ingestLocalFiles(
|
|
|
277
318
|
});
|
|
278
319
|
}
|
|
279
320
|
|
|
280
|
-
const results: IngestEntryResult[] = [];
|
|
281
321
|
const isMulti = resolved.entries.length > 1;
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
322
|
+
// Cap worker count by the actual file count so tiny batches don't pay
|
|
323
|
+
// the cost of spawning N threads (each loads ~130MB of model weights);
|
|
324
|
+
// also clamp by config and the global MAX_WORKERS ceiling.
|
|
325
|
+
const configured = Math.min(DEFAULTS.MAX_WORKERS, Math.max(1, ctx.config.ingest.worker_concurrency));
|
|
326
|
+
const workerCount = Math.max(1, Math.min(configured, resolved.entries.length));
|
|
327
|
+
callbacks?.onWorkerCount?.(workerCount);
|
|
328
|
+
const persistMutex = new AsyncMutex();
|
|
329
|
+
let anyOk = false;
|
|
330
|
+
|
|
331
|
+
// Each pMap worker pulls a file from the shared queue and runs the
|
|
332
|
+
// entire pipeline end-to-end (read → unchanged check → convert →
|
|
333
|
+
// describe → chunk → embed → persist). The persist phase is gated by a
|
|
334
|
+
// single mutex because all workers share one DuckDB connection and
|
|
335
|
+
// DuckDB rejects nested BEGINs. The embed step itself fans out across
|
|
336
|
+
// the per-command embedder subprocess pool that `add` / `refresh`
|
|
337
|
+
// register via `withEmbedderPool()` — so the WASM call truly
|
|
338
|
+
// parallelizes across cores instead of serializing on the main JS
|
|
339
|
+
// event loop. When that pool isn't registered (single-shot SDK call,
|
|
340
|
+
// `embedding.workers = 1`), embed() runs inline against the in-process
|
|
341
|
+
// extractor with no IPC overhead.
|
|
342
|
+
const outcomes = await pMap(resolved.entries, workerCount, async (entry, _index, workerId) => {
|
|
285
343
|
const logicalPath = pickLogicalPath(input.logical_path, entry, isMulti);
|
|
286
344
|
const result: IngestEntryResult = {
|
|
287
345
|
source_path: entry.absPath,
|
|
@@ -290,10 +348,14 @@ async function ingestLocalFiles(
|
|
|
290
348
|
status: "ok",
|
|
291
349
|
mime_type: null,
|
|
292
350
|
size_bytes: 0,
|
|
351
|
+
chunk_count: null,
|
|
293
352
|
fetcher: "local",
|
|
294
353
|
source_sha256: "",
|
|
295
354
|
};
|
|
355
|
+
callbacks?.onEntryStart?.(entry.relPathFromBase, workerId);
|
|
356
|
+
const onPhase = (sublabel: string) => callbacks?.onEntryProgress?.(entry.relPathFromBase, sublabel, workerId);
|
|
296
357
|
try {
|
|
358
|
+
onPhase("reading");
|
|
297
359
|
const local = await readLocalFile(entry.absPath);
|
|
298
360
|
result.mime_type = local.mimeType;
|
|
299
361
|
result.size_bytes = local.sizeBytes;
|
|
@@ -304,48 +366,189 @@ async function ingestLocalFiles(
|
|
|
304
366
|
if (cur && cur.source_sha256 === local.sha256) {
|
|
305
367
|
result.status = "unchanged";
|
|
306
368
|
result.version_id = cur.version_id;
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
continue;
|
|
369
|
+
callbacks?.onEntryComplete?.(result, workerId);
|
|
370
|
+
return result;
|
|
310
371
|
}
|
|
311
372
|
}
|
|
312
373
|
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
374
|
+
onPhase("converting");
|
|
375
|
+
const conversion = await convert(
|
|
376
|
+
local.bytes,
|
|
377
|
+
local.mimeType,
|
|
378
|
+
entry.absPath,
|
|
379
|
+
ctx.config.llm,
|
|
380
|
+
ctx.config.converters,
|
|
381
|
+
);
|
|
382
|
+
const markdown = conversion.markdown;
|
|
383
|
+
|
|
384
|
+
onPhase("describing");
|
|
385
|
+
const description = await describe(logicalPath, local.mimeType, markdown, ctx.config.llm);
|
|
386
|
+
|
|
387
|
+
onPhase("chunking");
|
|
388
|
+
const chunks = chunkDeterministic(markdown, ctx.config.chunker);
|
|
389
|
+
const searchTexts = chunks.map((c) => buildSearchText(logicalPath, description, c.content));
|
|
390
|
+
|
|
391
|
+
let embeddings: number[][];
|
|
392
|
+
try {
|
|
393
|
+
embeddings = await embed(searchTexts, ctx.config.embedding_model, {
|
|
394
|
+
onProgress: (done, total) => onPhase(`embedding ${done}/${total}`),
|
|
395
|
+
});
|
|
396
|
+
} catch (err) {
|
|
397
|
+
throw asHelpful(
|
|
398
|
+
err,
|
|
399
|
+
`while embedding chunks for ${logicalPath}`,
|
|
400
|
+
"Run `bun run prebuild` to apply the transformers WASM patch, or set a different config.embedding_model.",
|
|
401
|
+
);
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
const versionId = await persistMutex.lock(async () => {
|
|
405
|
+
onPhase("persisting");
|
|
406
|
+
return persistOne(ctx, {
|
|
316
407
|
logicalPath,
|
|
317
|
-
bytes: local.bytes,
|
|
318
|
-
mime: local.mimeType,
|
|
319
|
-
source: entry.absPath,
|
|
320
408
|
sourceType: "local",
|
|
321
409
|
sourcePath: entry.absPath,
|
|
322
410
|
sourceMtimeMs: local.mtimeMs,
|
|
323
411
|
sourceSha: local.sha256,
|
|
412
|
+
blobSha: local.sha256,
|
|
413
|
+
mime: local.mimeType,
|
|
414
|
+
bytes: local.bytes,
|
|
415
|
+
markdown,
|
|
416
|
+
description,
|
|
417
|
+
chunks,
|
|
418
|
+
searchTexts,
|
|
419
|
+
embeddings,
|
|
324
420
|
fetcher: "local",
|
|
325
421
|
downloader: null,
|
|
326
422
|
downloaderArgs: null,
|
|
327
423
|
refreshSec,
|
|
328
424
|
changeNote: input.change_note ?? null,
|
|
329
|
-
}
|
|
330
|
-
|
|
331
|
-
);
|
|
425
|
+
});
|
|
426
|
+
});
|
|
332
427
|
result.version_id = versionId;
|
|
428
|
+
result.chunk_count = chunks.length;
|
|
429
|
+
anyOk = true;
|
|
430
|
+
callbacks?.onChunks?.(chunks.length);
|
|
333
431
|
} catch (err) {
|
|
334
432
|
result.status = "failed";
|
|
335
433
|
result.error = errorMessage(err);
|
|
336
|
-
} finally {
|
|
337
|
-
// Release the DB lock between files in a directory/glob walk so
|
|
338
|
-
// concurrent processes can wedge in mid-batch. The next entry's
|
|
339
|
-
// first DB call reopens (cheap — same-process reopen).
|
|
340
|
-
await ctx.db.release();
|
|
341
434
|
}
|
|
342
|
-
|
|
343
|
-
|
|
435
|
+
callbacks?.onEntryComplete?.(result, workerId);
|
|
436
|
+
return result;
|
|
437
|
+
});
|
|
438
|
+
|
|
439
|
+
const results: IngestEntryResult[] = outcomes.map((o) => {
|
|
440
|
+
if (o.ok) return o.value;
|
|
441
|
+
// pMap caught a worker rejection — shouldn't happen since the worker
|
|
442
|
+
// catches its own errors, but surface defensively.
|
|
443
|
+
return {
|
|
444
|
+
source_path: "",
|
|
445
|
+
logical_path: "",
|
|
446
|
+
version_id: null,
|
|
447
|
+
status: "failed",
|
|
448
|
+
error: errorMessage(o.error),
|
|
449
|
+
mime_type: null,
|
|
450
|
+
size_bytes: 0,
|
|
451
|
+
chunk_count: null,
|
|
452
|
+
fetcher: "local",
|
|
453
|
+
source_sha256: "",
|
|
454
|
+
};
|
|
455
|
+
});
|
|
456
|
+
|
|
457
|
+
// Single FTS rebuild for the whole batch — replaces N per-entry rebuilds
|
|
458
|
+
// in the prior implementation. Skip when nothing was newly persisted.
|
|
459
|
+
if (anyOk) {
|
|
460
|
+
await rebuildFts(ctx.db);
|
|
344
461
|
}
|
|
345
462
|
|
|
346
463
|
return summarize(results);
|
|
347
464
|
}
|
|
348
465
|
|
|
466
|
+
/**
|
|
467
|
+
* Per-file persist payload. All inputs are precomputed by the worker; this
|
|
468
|
+
* helper just executes the transactional DB writes.
|
|
469
|
+
*/
|
|
470
|
+
interface PersistOneParams {
|
|
471
|
+
logicalPath: string;
|
|
472
|
+
sourceType: SourceType;
|
|
473
|
+
sourcePath: string | null;
|
|
474
|
+
sourceMtimeMs: number | null;
|
|
475
|
+
sourceSha: string;
|
|
476
|
+
blobSha: string | null;
|
|
477
|
+
mime: string;
|
|
478
|
+
bytes: Uint8Array | null;
|
|
479
|
+
markdown: string;
|
|
480
|
+
description: string;
|
|
481
|
+
chunks: { index: number; content: string }[];
|
|
482
|
+
searchTexts: string[];
|
|
483
|
+
embeddings: number[][];
|
|
484
|
+
fetcher: FetcherKind;
|
|
485
|
+
downloader: string | null;
|
|
486
|
+
downloaderArgs: Record<string, unknown> | null;
|
|
487
|
+
refreshSec: number | null;
|
|
488
|
+
changeNote: string | null;
|
|
489
|
+
}
|
|
490
|
+
|
|
491
|
+
/**
|
|
492
|
+
* Write blob + new (logical_path, version_id) row + its chunks under a
|
|
493
|
+
* single DuckDB transaction. ROLLBACK on failure keeps the row+chunks pair
|
|
494
|
+
* atomic; one COMMIT replaces ~N+2 autocommitted round-trips.
|
|
495
|
+
*/
|
|
496
|
+
async function persistOne(ctx: AppContext, p: PersistOneParams): Promise<string> {
|
|
497
|
+
const versionId = millisIso(Date.now());
|
|
498
|
+
const contentSha = sha256Hex(new TextEncoder().encode(p.markdown));
|
|
499
|
+
await ctx.db.exec("BEGIN TRANSACTION");
|
|
500
|
+
try {
|
|
501
|
+
if (p.bytes) {
|
|
502
|
+
await upsertBlob(ctx.db, {
|
|
503
|
+
sha256: p.sourceSha,
|
|
504
|
+
mime_type: p.mime,
|
|
505
|
+
size_bytes: p.bytes.byteLength,
|
|
506
|
+
bytes: p.bytes,
|
|
507
|
+
});
|
|
508
|
+
}
|
|
509
|
+
await insertVersion(ctx.db, {
|
|
510
|
+
logical_path: p.logicalPath,
|
|
511
|
+
version_id: versionId,
|
|
512
|
+
source_type: p.sourceType,
|
|
513
|
+
source_path: p.sourcePath,
|
|
514
|
+
source_mtime_ms: p.sourceMtimeMs,
|
|
515
|
+
source_sha256: p.sourceSha,
|
|
516
|
+
blob_sha256: p.blobSha,
|
|
517
|
+
content_sha256: contentSha,
|
|
518
|
+
content: p.markdown,
|
|
519
|
+
description: p.description,
|
|
520
|
+
mime_type: p.mime,
|
|
521
|
+
size_bytes: p.bytes?.byteLength ?? new TextEncoder().encode(p.markdown).byteLength,
|
|
522
|
+
fetcher: p.fetcher,
|
|
523
|
+
downloader: p.downloader,
|
|
524
|
+
downloader_args: p.downloaderArgs,
|
|
525
|
+
refresh_frequency_sec: p.refreshSec,
|
|
526
|
+
refreshed_at: new Date().toISOString(),
|
|
527
|
+
last_refresh_status: "ok",
|
|
528
|
+
change_note: p.changeNote,
|
|
529
|
+
});
|
|
530
|
+
await insertChunksForVersion(
|
|
531
|
+
ctx.db,
|
|
532
|
+
p.logicalPath,
|
|
533
|
+
versionId,
|
|
534
|
+
p.chunks.map((c, i) => ({
|
|
535
|
+
chunk_index: c.index,
|
|
536
|
+
chunk_content: c.content,
|
|
537
|
+
search_text: p.searchTexts[i] ?? buildSearchText(p.logicalPath, p.description, c.content),
|
|
538
|
+
embedding: p.embeddings[i] ?? new Array(p.embeddings[0]?.length ?? 0).fill(0),
|
|
539
|
+
})),
|
|
540
|
+
);
|
|
541
|
+
await ctx.db.exec("COMMIT");
|
|
542
|
+
} catch (err) {
|
|
543
|
+
await ctx.db.exec("ROLLBACK").catch(() => {
|
|
544
|
+
// Best effort — if ROLLBACK itself fails (already aborted, lock
|
|
545
|
+
// dropped, etc.) we still want the original error to surface.
|
|
546
|
+
});
|
|
547
|
+
throw err;
|
|
548
|
+
}
|
|
549
|
+
return versionId;
|
|
550
|
+
}
|
|
551
|
+
|
|
349
552
|
interface PipelineParams {
|
|
350
553
|
logicalPath: string;
|
|
351
554
|
bytes: Uint8Array;
|
|
@@ -373,7 +576,7 @@ async function pipelineForBytes(
|
|
|
373
576
|
ctx: AppContext,
|
|
374
577
|
p: PipelineParams,
|
|
375
578
|
onPhase?: (sublabel: string) => void,
|
|
376
|
-
): Promise<string> {
|
|
579
|
+
): Promise<{ versionId: string; chunkCount: number }> {
|
|
377
580
|
onPhase?.("storing blob");
|
|
378
581
|
await upsertBlob(ctx.db, {
|
|
379
582
|
sha256: p.sourceSha,
|
|
@@ -438,7 +641,7 @@ async function persistVersion(
|
|
|
438
641
|
ctx: AppContext,
|
|
439
642
|
p: PersistParams,
|
|
440
643
|
onPhase?: (sublabel: string) => void,
|
|
441
|
-
): Promise<string> {
|
|
644
|
+
): Promise<{ versionId: string; chunkCount: number }> {
|
|
442
645
|
onPhase?.("describing");
|
|
443
646
|
const description = await describe(p.logicalPath, p.mime, p.markdown, ctx.config.llm);
|
|
444
647
|
onPhase?.("chunking");
|
|
@@ -460,42 +663,49 @@ async function persistVersion(
|
|
|
460
663
|
onPhase?.("persisting");
|
|
461
664
|
const versionId = millisIso(Date.now());
|
|
462
665
|
const contentSha = p.contentSha ?? sha256Hex(new TextEncoder().encode(p.markdown));
|
|
463
|
-
await
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
666
|
+
await ctx.db.exec("BEGIN TRANSACTION");
|
|
667
|
+
try {
|
|
668
|
+
await insertVersion(ctx.db, {
|
|
669
|
+
logical_path: p.logicalPath,
|
|
670
|
+
version_id: versionId,
|
|
671
|
+
source_type: p.sourceType,
|
|
672
|
+
source_path: p.sourcePath,
|
|
673
|
+
source_mtime_ms: p.sourceMtimeMs,
|
|
674
|
+
source_sha256: p.sourceSha,
|
|
675
|
+
blob_sha256: p.blobSha,
|
|
676
|
+
content_sha256: contentSha,
|
|
677
|
+
content: p.markdown,
|
|
678
|
+
description,
|
|
679
|
+
mime_type: p.mime,
|
|
680
|
+
size_bytes: p.bytes?.byteLength ?? new TextEncoder().encode(p.markdown).byteLength,
|
|
681
|
+
fetcher: p.fetcher,
|
|
682
|
+
downloader: p.downloader,
|
|
683
|
+
downloader_args: p.downloaderArgs,
|
|
684
|
+
refresh_frequency_sec: p.refreshSec,
|
|
685
|
+
refreshed_at: new Date().toISOString(),
|
|
686
|
+
last_refresh_status: "ok",
|
|
687
|
+
change_note: p.changeNote,
|
|
688
|
+
});
|
|
484
689
|
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
690
|
+
await insertChunksForVersion(
|
|
691
|
+
ctx.db,
|
|
692
|
+
p.logicalPath,
|
|
693
|
+
versionId,
|
|
694
|
+
chunks.map((c, i) => ({
|
|
695
|
+
chunk_index: c.index,
|
|
696
|
+
chunk_content: c.content,
|
|
697
|
+
search_text: searchTexts[i] ?? buildSearchText(p.logicalPath, description, c.content),
|
|
698
|
+
embedding: embeddings[i] ?? new Array(embeddings[0]?.length ?? 0).fill(0),
|
|
699
|
+
})),
|
|
700
|
+
);
|
|
701
|
+
await ctx.db.exec("COMMIT");
|
|
702
|
+
} catch (err) {
|
|
703
|
+
await ctx.db.exec("ROLLBACK").catch(() => {});
|
|
704
|
+
throw err;
|
|
705
|
+
}
|
|
496
706
|
onPhase?.("indexing");
|
|
497
707
|
await rebuildFts(ctx.db);
|
|
498
|
-
return versionId;
|
|
708
|
+
return { versionId, chunkCount: chunks.length };
|
|
499
709
|
}
|
|
500
710
|
|
|
501
711
|
/**
|