membot 0.4.1 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/ingest/embedder.ts +23 -1
- package/src/ingest/ingest.ts +103 -71
- package/src/operations/add.ts +1 -0
- package/src/operations/refresh.ts +3 -1
- package/src/output/progress.ts +20 -4
- package/src/refresh/runner.ts +67 -38
package/package.json
CHANGED
package/src/ingest/embedder.ts
CHANGED
|
@@ -63,6 +63,16 @@ async function getPipeline(model: string): Promise<FeatureExtractionPipeline> {
|
|
|
63
63
|
return p;
|
|
64
64
|
}
|
|
65
65
|
|
|
66
|
+
/**
|
|
67
|
+
* Options for `embed()`. `onProgress` fires once after each batch finishes
|
|
68
|
+
* with `(done, total)` chunk counts so callers can drive a spinner / progress
|
|
69
|
+
* bar — ONNX WASM holds the JS thread for hundreds of ms per batch and would
|
|
70
|
+
* otherwise leave nanospinner's setInterval starved between updates.
|
|
71
|
+
*/
|
|
72
|
+
export interface EmbedOptions {
|
|
73
|
+
onProgress?: (done: number, total: number) => void;
|
|
74
|
+
}
|
|
75
|
+
|
|
66
76
|
/**
|
|
67
77
|
* Embed an array of texts to L2-normalized vectors with the configured
|
|
68
78
|
* model. Throws a HelpfulError when the model's dimension doesn't match
|
|
@@ -71,8 +81,16 @@ async function getPipeline(model: string): Promise<FeatureExtractionPipeline> {
|
|
|
71
81
|
* Inputs are sliced into windows of EMBEDDING_BATCH_SIZE so a single
|
|
72
82
|
* forward pass never has to allocate activations for arbitrarily many
|
|
73
83
|
* chunks — large files (hundreds of chunks) otherwise OOM the WASM heap.
|
|
84
|
+
*
|
|
85
|
+
* Between batches we yield a macrotask (`setTimeout(0)`) so the event loop
|
|
86
|
+
* can flush nanospinner renders and stderr writes — without that, the spinner
|
|
87
|
+
* visibly freezes for the entire embed phase on large files.
|
|
74
88
|
*/
|
|
75
|
-
export async function embed(
|
|
89
|
+
export async function embed(
|
|
90
|
+
texts: string[],
|
|
91
|
+
model: string = EMBEDDING_MODEL,
|
|
92
|
+
opts: EmbedOptions = {},
|
|
93
|
+
): Promise<number[][]> {
|
|
76
94
|
if (texts.length === 0) return [];
|
|
77
95
|
const extractor = await getPipeline(model);
|
|
78
96
|
const out: number[][] = [];
|
|
@@ -88,6 +106,10 @@ export async function embed(texts: string[], model: string = EMBEDDING_MODEL): P
|
|
|
88
106
|
});
|
|
89
107
|
}
|
|
90
108
|
for (const vec of data) out.push(vec);
|
|
109
|
+
opts.onProgress?.(out.length, texts.length);
|
|
110
|
+
// Yield a macrotask so nanospinner's setInterval and any queued
|
|
111
|
+
// stderr writes get a chance to run between batches.
|
|
112
|
+
await new Promise<void>((resolve) => setTimeout(resolve, 0));
|
|
91
113
|
}
|
|
92
114
|
return out;
|
|
93
115
|
}
|
package/src/ingest/ingest.ts
CHANGED
|
@@ -54,6 +54,13 @@ export interface IngestResult {
|
|
|
54
54
|
export interface IngestCallbacks {
|
|
55
55
|
onEntryStart?: (label: string) => void;
|
|
56
56
|
onEntryComplete?: (entry: IngestEntryResult) => void;
|
|
57
|
+
/**
|
|
58
|
+
* Fires for sub-step progress within a single entry (e.g. "embedding
|
|
59
|
+
* 32/168"). The callback runs many times per entry and is intended for
|
|
60
|
+
* driving an interactive spinner — non-interactive callers should ignore
|
|
61
|
+
* it to avoid log spam.
|
|
62
|
+
*/
|
|
63
|
+
onEntryProgress?: (label: string, sublabel: string) => void;
|
|
57
64
|
}
|
|
58
65
|
|
|
59
66
|
/**
|
|
@@ -140,23 +147,27 @@ async function ingestInline(
|
|
|
140
147
|
source_sha256: sha,
|
|
141
148
|
};
|
|
142
149
|
try {
|
|
143
|
-
const versionId = await persistVersion(
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
150
|
+
const versionId = await persistVersion(
|
|
151
|
+
ctx,
|
|
152
|
+
{
|
|
153
|
+
logicalPath,
|
|
154
|
+
sourceType: "inline",
|
|
155
|
+
sourcePath: null,
|
|
156
|
+
sourceMtimeMs: null,
|
|
157
|
+
sourceSha: sha,
|
|
158
|
+
blobSha: null,
|
|
159
|
+
mime: "text/markdown",
|
|
160
|
+
bytes: null,
|
|
161
|
+
markdown: text,
|
|
162
|
+
fetcher: "inline",
|
|
163
|
+
fetcherServer: null,
|
|
164
|
+
fetcherTool: null,
|
|
165
|
+
fetcherArgs: null,
|
|
166
|
+
refreshSec,
|
|
167
|
+
changeNote: input.change_note ?? null,
|
|
168
|
+
},
|
|
169
|
+
(done, total) => callbacks?.onEntryProgress?.(logicalPath, `embedding ${done}/${total}`),
|
|
170
|
+
);
|
|
160
171
|
result.version_id = versionId;
|
|
161
172
|
} catch (err) {
|
|
162
173
|
result.status = "failed";
|
|
@@ -217,22 +228,26 @@ async function ingestUrl(
|
|
|
217
228
|
}
|
|
218
229
|
}
|
|
219
230
|
|
|
220
|
-
const versionId = await pipelineForBytes(
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
231
|
+
const versionId = await pipelineForBytes(
|
|
232
|
+
ctx,
|
|
233
|
+
{
|
|
234
|
+
logicalPath,
|
|
235
|
+
bytes: fetched.bytes,
|
|
236
|
+
mime: fetched.mimeType,
|
|
237
|
+
source: url,
|
|
238
|
+
sourceType: "remote",
|
|
239
|
+
sourcePath: url,
|
|
240
|
+
sourceMtimeMs: null,
|
|
241
|
+
sourceSha: fetched.sha256,
|
|
242
|
+
fetcher: fetched.fetcher,
|
|
243
|
+
fetcherServer: fetched.fetcherServer,
|
|
244
|
+
fetcherTool: fetched.fetcherTool,
|
|
245
|
+
fetcherArgs: fetched.fetcherArgs,
|
|
246
|
+
refreshSec,
|
|
247
|
+
changeNote: input.change_note ?? null,
|
|
248
|
+
},
|
|
249
|
+
(done, total) => callbacks?.onEntryProgress?.(url, `embedding ${done}/${total}`),
|
|
250
|
+
);
|
|
236
251
|
result.version_id = versionId;
|
|
237
252
|
} catch (err) {
|
|
238
253
|
result.status = "failed";
|
|
@@ -299,22 +314,26 @@ async function ingestLocalFiles(
|
|
|
299
314
|
}
|
|
300
315
|
}
|
|
301
316
|
|
|
302
|
-
const versionId = await pipelineForBytes(
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
317
|
+
const versionId = await pipelineForBytes(
|
|
318
|
+
ctx,
|
|
319
|
+
{
|
|
320
|
+
logicalPath,
|
|
321
|
+
bytes: local.bytes,
|
|
322
|
+
mime: local.mimeType,
|
|
323
|
+
source: entry.absPath,
|
|
324
|
+
sourceType: "local",
|
|
325
|
+
sourcePath: entry.absPath,
|
|
326
|
+
sourceMtimeMs: local.mtimeMs,
|
|
327
|
+
sourceSha: local.sha256,
|
|
328
|
+
fetcher: "local",
|
|
329
|
+
fetcherServer: null,
|
|
330
|
+
fetcherTool: null,
|
|
331
|
+
fetcherArgs: null,
|
|
332
|
+
refreshSec,
|
|
333
|
+
changeNote: input.change_note ?? null,
|
|
334
|
+
},
|
|
335
|
+
(done, total) => callbacks?.onEntryProgress?.(entry.relPathFromBase, `embedding ${done}/${total}`),
|
|
336
|
+
);
|
|
318
337
|
result.version_id = versionId;
|
|
319
338
|
} catch (err) {
|
|
320
339
|
result.status = "failed";
|
|
@@ -353,9 +372,14 @@ interface PipelineParams {
|
|
|
353
372
|
* Run the bytes-in / version-out pipeline: store the blob, convert to
|
|
354
373
|
* markdown, describe, chunk, embed, and write a new files row + chunks
|
|
355
374
|
* rows under a fresh version_id. Returns the version_id so callers can
|
|
356
|
-
* report it back.
|
|
375
|
+
* report it back. The optional `onEmbedProgress` is forwarded to the
|
|
376
|
+
* embedder so callers can drive a spinner during the slow phase.
|
|
357
377
|
*/
|
|
358
|
-
async function pipelineForBytes(
|
|
378
|
+
async function pipelineForBytes(
|
|
379
|
+
ctx: AppContext,
|
|
380
|
+
p: PipelineParams,
|
|
381
|
+
onEmbedProgress?: (done: number, total: number) => void,
|
|
382
|
+
): Promise<string> {
|
|
359
383
|
await upsertBlob(ctx.db, {
|
|
360
384
|
sha256: p.sourceSha,
|
|
361
385
|
mime_type: p.mime,
|
|
@@ -367,24 +391,28 @@ async function pipelineForBytes(ctx: AppContext, p: PipelineParams): Promise<str
|
|
|
367
391
|
const markdown = conversion.markdown;
|
|
368
392
|
const contentSha = sha256Hex(new TextEncoder().encode(markdown));
|
|
369
393
|
|
|
370
|
-
return persistVersion(
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
394
|
+
return persistVersion(
|
|
395
|
+
ctx,
|
|
396
|
+
{
|
|
397
|
+
logicalPath: p.logicalPath,
|
|
398
|
+
sourceType: p.sourceType,
|
|
399
|
+
sourcePath: p.sourcePath,
|
|
400
|
+
sourceMtimeMs: p.sourceMtimeMs,
|
|
401
|
+
sourceSha: p.sourceSha,
|
|
402
|
+
blobSha: p.sourceSha,
|
|
403
|
+
mime: p.mime,
|
|
404
|
+
bytes: p.bytes,
|
|
405
|
+
markdown,
|
|
406
|
+
contentSha,
|
|
407
|
+
fetcher: p.fetcher,
|
|
408
|
+
fetcherServer: p.fetcherServer,
|
|
409
|
+
fetcherTool: p.fetcherTool,
|
|
410
|
+
fetcherArgs: p.fetcherArgs,
|
|
411
|
+
refreshSec: p.refreshSec,
|
|
412
|
+
changeNote: p.changeNote,
|
|
413
|
+
},
|
|
414
|
+
onEmbedProgress,
|
|
415
|
+
);
|
|
388
416
|
}
|
|
389
417
|
|
|
390
418
|
interface PersistParams {
|
|
@@ -412,13 +440,17 @@ interface PersistParams {
|
|
|
412
440
|
* embedded text per chunk is `<path>\n<description>\n\n<body>`, stored
|
|
413
441
|
* verbatim as `chunks.search_text` and later FTS-indexed.
|
|
414
442
|
*/
|
|
415
|
-
async function persistVersion(
|
|
443
|
+
async function persistVersion(
|
|
444
|
+
ctx: AppContext,
|
|
445
|
+
p: PersistParams,
|
|
446
|
+
onEmbedProgress?: (done: number, total: number) => void,
|
|
447
|
+
): Promise<string> {
|
|
416
448
|
const description = await describe(p.logicalPath, p.mime, p.markdown, ctx.config.llm);
|
|
417
449
|
const chunks = chunkDeterministic(p.markdown, ctx.config.chunker);
|
|
418
450
|
const searchTexts = chunks.map((c) => buildSearchText(p.logicalPath, description, c.content));
|
|
419
451
|
let embeddings: number[][];
|
|
420
452
|
try {
|
|
421
|
-
embeddings = await embed(searchTexts, ctx.config.embedding_model);
|
|
453
|
+
embeddings = await embed(searchTexts, ctx.config.embedding_model, { onProgress: onEmbedProgress });
|
|
422
454
|
} catch (err) {
|
|
423
455
|
throw asHelpful(
|
|
424
456
|
err,
|
package/src/operations/add.ts
CHANGED
|
@@ -138,6 +138,7 @@ Pass \`logical_path\` to override. For a multi-source / directory / glob walk it
|
|
|
138
138
|
const callbacks: IngestCallbacks = {
|
|
139
139
|
onEntryStart: (label) => ctx.progress.tick(label),
|
|
140
140
|
onEntryComplete: (entry) => ctx.progress.entry(formatEntryLine(entry)),
|
|
141
|
+
onEntryProgress: (_label, sublabel) => ctx.progress.update(sublabel),
|
|
141
142
|
};
|
|
142
143
|
|
|
143
144
|
for (const outcome of outcomes) {
|
|
@@ -60,7 +60,9 @@ export const refreshOperation = defineOperation({
|
|
|
60
60
|
for (const path of targets) {
|
|
61
61
|
ctx.progress.tick(path);
|
|
62
62
|
try {
|
|
63
|
-
const r = await refreshOne(ctx, path, input.force)
|
|
63
|
+
const r = await refreshOne(ctx, path, input.force, (done, total) =>
|
|
64
|
+
ctx.progress.update(`embedding ${done}/${total}`),
|
|
65
|
+
);
|
|
64
66
|
out.push(r);
|
|
65
67
|
} catch (err) {
|
|
66
68
|
out.push({ logical_path: path, status: "failed", error: err instanceof Error ? err.message : String(err) });
|
package/src/output/progress.ts
CHANGED
|
@@ -15,6 +15,13 @@ import { isSilent, useSpinner } from "./tty.ts";
|
|
|
15
15
|
export interface Progress {
|
|
16
16
|
start(total: number, label?: string): void;
|
|
17
17
|
tick(label: string): void;
|
|
18
|
+
/**
|
|
19
|
+
* Re-render the active spinner with the most recent `tick` label plus an
|
|
20
|
+
* extra suffix (e.g. "embedding 32/168") without advancing the counter.
|
|
21
|
+
* No-op in non-interactive / silent / JSON modes — sub-step progress is
|
|
22
|
+
* deliberately TTY-only so CI logs don't get one line per inner batch.
|
|
23
|
+
*/
|
|
24
|
+
update(suffix: string): void;
|
|
18
25
|
entry(line: string): void;
|
|
19
26
|
done(summary?: string): void;
|
|
20
27
|
fail(summary?: string): void;
|
|
@@ -51,25 +58,28 @@ function truncateLabel(label: string, max = LABEL_MAX): string {
|
|
|
51
58
|
export function createProgress(): Progress {
|
|
52
59
|
let total = 0;
|
|
53
60
|
let count = 0;
|
|
61
|
+
let lastLabel = "";
|
|
54
62
|
let spinner: ReturnType<typeof logger.startSpinner> | null = null;
|
|
55
63
|
|
|
56
64
|
const interactive = useSpinner();
|
|
57
65
|
const silent = isSilent();
|
|
58
66
|
|
|
59
|
-
const renderSpinnerText = (label: string): string => {
|
|
67
|
+
const renderSpinnerText = (label: string, suffix?: string): string => {
|
|
60
68
|
const bar = renderBar(count, total);
|
|
61
69
|
const pct = total > 0 ? Math.floor((count / total) * 100) : 0;
|
|
62
|
-
const
|
|
63
|
-
|
|
70
|
+
const labelTail = label ? ` — ${truncateLabel(label)}` : "";
|
|
71
|
+
const suffixTail = suffix ? ` — ${suffix}` : "";
|
|
72
|
+
return `${bar} ${count}/${total} (${pct}%)${labelTail}${suffixTail}`;
|
|
64
73
|
};
|
|
65
74
|
|
|
66
75
|
return {
|
|
67
76
|
start(t: number, label?: string) {
|
|
68
77
|
total = t;
|
|
69
78
|
count = 0;
|
|
79
|
+
lastLabel = label ?? "";
|
|
70
80
|
if (silent) return;
|
|
71
81
|
if (interactive) {
|
|
72
|
-
const initial = renderSpinnerText(
|
|
82
|
+
const initial = renderSpinnerText(lastLabel);
|
|
73
83
|
spinner = logger.startSpinner(initial);
|
|
74
84
|
} else if (label) {
|
|
75
85
|
logger.info(`${label}: 0/${total}`);
|
|
@@ -77,6 +87,7 @@ export function createProgress(): Progress {
|
|
|
77
87
|
},
|
|
78
88
|
tick(label: string) {
|
|
79
89
|
count += 1;
|
|
90
|
+
lastLabel = label;
|
|
80
91
|
if (silent) return;
|
|
81
92
|
if (interactive && spinner) {
|
|
82
93
|
spinner.update(renderSpinnerText(label));
|
|
@@ -84,6 +95,11 @@ export function createProgress(): Progress {
|
|
|
84
95
|
logger.info(`[${count}/${total}] ${label}`);
|
|
85
96
|
}
|
|
86
97
|
},
|
|
98
|
+
update(suffix: string) {
|
|
99
|
+
if (silent) return;
|
|
100
|
+
if (!interactive || !spinner) return;
|
|
101
|
+
spinner.update(renderSpinnerText(lastLabel, suffix));
|
|
102
|
+
},
|
|
87
103
|
entry(line: string) {
|
|
88
104
|
if (silent) return;
|
|
89
105
|
logger.info(line);
|
package/src/refresh/runner.ts
CHANGED
|
@@ -24,9 +24,16 @@ export interface RefreshOutcome {
|
|
|
24
24
|
* via the persisted mcpx invocation), and creates a new version only if
|
|
25
25
|
* the source bytes changed. Always updates `refreshed_at` and
|
|
26
26
|
* `last_refresh_status` on the row. Returns a per-path outcome — never
|
|
27
|
-
* throws unless the path doesn't exist.
|
|
27
|
+
* throws unless the path doesn't exist. The optional `onEmbedProgress`
|
|
28
|
+
* callback is forwarded to the embedder so interactive callers (e.g. the
|
|
29
|
+
* `refresh` operation) can drive a spinner during the slow phase.
|
|
28
30
|
*/
|
|
29
|
-
export async function refreshOne(
|
|
31
|
+
export async function refreshOne(
|
|
32
|
+
ctx: AppContext,
|
|
33
|
+
logicalPath: string,
|
|
34
|
+
force = false,
|
|
35
|
+
onEmbedProgress?: (done: number, total: number) => void,
|
|
36
|
+
): Promise<RefreshOutcome> {
|
|
30
37
|
const cur = await getCurrent(ctx.db, logicalPath);
|
|
31
38
|
if (!cur) {
|
|
32
39
|
throw new HelpfulError({
|
|
@@ -42,10 +49,10 @@ export async function refreshOne(ctx: AppContext, logicalPath: string, force = f
|
|
|
42
49
|
|
|
43
50
|
try {
|
|
44
51
|
if (cur.source_type === "local") {
|
|
45
|
-
return await refreshLocal(ctx, cur, force);
|
|
52
|
+
return await refreshLocal(ctx, cur, force, onEmbedProgress);
|
|
46
53
|
}
|
|
47
54
|
if (cur.source_type === "remote") {
|
|
48
|
-
return await refreshRemote(ctx, cur, force);
|
|
55
|
+
return await refreshRemote(ctx, cur, force, onEmbedProgress);
|
|
49
56
|
}
|
|
50
57
|
} catch (err) {
|
|
51
58
|
const message = err instanceof Error ? err.message : String(err);
|
|
@@ -74,7 +81,12 @@ interface CurrentRow {
|
|
|
74
81
|
}
|
|
75
82
|
|
|
76
83
|
/** Local-file refresh: stat-then-sha gate before re-running the pipeline. */
|
|
77
|
-
async function refreshLocal(
|
|
84
|
+
async function refreshLocal(
|
|
85
|
+
ctx: AppContext,
|
|
86
|
+
cur: CurrentRow,
|
|
87
|
+
force: boolean,
|
|
88
|
+
onEmbedProgress?: (done: number, total: number) => void,
|
|
89
|
+
): Promise<RefreshOutcome> {
|
|
78
90
|
if (!cur.source_path) {
|
|
79
91
|
throw new HelpfulError({
|
|
80
92
|
kind: "input_error",
|
|
@@ -92,26 +104,35 @@ async function refreshLocal(ctx: AppContext, cur: CurrentRow, force: boolean): P
|
|
|
92
104
|
return { logical_path: cur.logical_path, status: "unchanged" };
|
|
93
105
|
}
|
|
94
106
|
|
|
95
|
-
const versionId = await runPipelineForRefresh(
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
107
|
+
const versionId = await runPipelineForRefresh(
|
|
108
|
+
ctx,
|
|
109
|
+
{
|
|
110
|
+
logicalPath: cur.logical_path,
|
|
111
|
+
bytes: local.bytes,
|
|
112
|
+
mime: local.mimeType,
|
|
113
|
+
source: cur.source_path,
|
|
114
|
+
sourceType: "local",
|
|
115
|
+
sourcePath: cur.source_path,
|
|
116
|
+
sourceMtimeMs: local.mtimeMs,
|
|
117
|
+
sourceSha: local.sha256,
|
|
118
|
+
fetcher: "local",
|
|
119
|
+
fetcherServer: null,
|
|
120
|
+
fetcherTool: null,
|
|
121
|
+
fetcherArgs: null,
|
|
122
|
+
refreshSec: cur.refresh_frequency_sec,
|
|
123
|
+
},
|
|
124
|
+
onEmbedProgress,
|
|
125
|
+
);
|
|
110
126
|
return { logical_path: cur.logical_path, status: "ok", new_version_id: versionId };
|
|
111
127
|
}
|
|
112
128
|
|
|
113
129
|
/** Remote refresh: replay the persisted mcpx invocation, or plain HTTP. */
|
|
114
|
-
async function refreshRemote(
|
|
130
|
+
async function refreshRemote(
|
|
131
|
+
ctx: AppContext,
|
|
132
|
+
cur: CurrentRow,
|
|
133
|
+
force: boolean,
|
|
134
|
+
onEmbedProgress?: (done: number, total: number) => void,
|
|
135
|
+
): Promise<RefreshOutcome> {
|
|
115
136
|
if (!cur.source_path) {
|
|
116
137
|
throw new HelpfulError({
|
|
117
138
|
kind: "input_error",
|
|
@@ -129,21 +150,25 @@ async function refreshRemote(ctx: AppContext, cur: CurrentRow, force: boolean):
|
|
|
129
150
|
return { logical_path: cur.logical_path, status: "unchanged" };
|
|
130
151
|
}
|
|
131
152
|
|
|
132
|
-
const versionId = await runPipelineForRefresh(
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
153
|
+
const versionId = await runPipelineForRefresh(
|
|
154
|
+
ctx,
|
|
155
|
+
{
|
|
156
|
+
logicalPath: cur.logical_path,
|
|
157
|
+
bytes: fetched.bytes,
|
|
158
|
+
mime: fetched.mimeType,
|
|
159
|
+
source: cur.source_path,
|
|
160
|
+
sourceType: "remote",
|
|
161
|
+
sourcePath: cur.source_path,
|
|
162
|
+
sourceMtimeMs: null,
|
|
163
|
+
sourceSha: fetched.sha256,
|
|
164
|
+
fetcher: cur.fetcher === "mcpx" ? "mcpx" : "http",
|
|
165
|
+
fetcherServer: fetched.fetcherServer,
|
|
166
|
+
fetcherTool: fetched.fetcherTool,
|
|
167
|
+
fetcherArgs: fetched.fetcherArgs,
|
|
168
|
+
refreshSec: cur.refresh_frequency_sec,
|
|
169
|
+
},
|
|
170
|
+
onEmbedProgress,
|
|
171
|
+
);
|
|
147
172
|
return { logical_path: cur.logical_path, status: "ok", new_version_id: versionId };
|
|
148
173
|
}
|
|
149
174
|
|
|
@@ -237,7 +262,11 @@ interface PipelineParams {
|
|
|
237
262
|
* fields (`change_note='refresh: source updated'`) aren't accidentally
|
|
238
263
|
* applied to first-time ingests.
|
|
239
264
|
*/
|
|
240
|
-
async function runPipelineForRefresh(
|
|
265
|
+
async function runPipelineForRefresh(
|
|
266
|
+
ctx: AppContext,
|
|
267
|
+
p: PipelineParams,
|
|
268
|
+
onEmbedProgress?: (done: number, total: number) => void,
|
|
269
|
+
): Promise<string> {
|
|
241
270
|
await upsertBlob(ctx.db, {
|
|
242
271
|
sha256: p.sourceSha,
|
|
243
272
|
mime_type: p.mime,
|
|
@@ -250,7 +279,7 @@ async function runPipelineForRefresh(ctx: AppContext, p: PipelineParams): Promis
|
|
|
250
279
|
const description = await describe(p.logicalPath, p.mime, markdown, ctx.config.llm);
|
|
251
280
|
const chunks = chunkDeterministic(markdown, ctx.config.chunker);
|
|
252
281
|
const searchTexts = chunks.map((c) => buildSearchText(p.logicalPath, description, c.content));
|
|
253
|
-
const embeddings = await embed(searchTexts, ctx.config.embedding_model);
|
|
282
|
+
const embeddings = await embed(searchTexts, ctx.config.embedding_model, { onProgress: onEmbedProgress });
|
|
254
283
|
|
|
255
284
|
const versionId = millisIso(Date.now());
|
|
256
285
|
const contentSha = sha256Hex(new TextEncoder().encode(markdown));
|