membot 0.6.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/membot.md +3 -0
- package/.cursor/rules/membot.mdc +3 -0
- package/README.md +5 -0
- package/package.json +1 -1
- package/scripts/build-test-docx.ts +84 -0
- package/src/cli.ts +11 -0
- package/src/config/schemas.ts +20 -0
- package/src/constants.ts +15 -0
- package/src/context.ts +24 -0
- package/src/ingest/converter/docx.ts +47 -5
- package/src/ingest/converter/html.ts +10 -3
- package/src/ingest/converter/image.ts +40 -3
- package/src/ingest/converter/images-inline.ts +132 -0
- package/src/ingest/converter/index.ts +4 -3
- package/src/ingest/embed-worker.ts +74 -0
- package/src/ingest/embedder-pool.ts +391 -0
- package/src/ingest/embedder.ts +40 -2
- package/src/ingest/ingest.ts +1 -1
- package/src/operations/add.ts +94 -86
- package/src/operations/index.ts +2 -0
- package/src/operations/refresh.ts +28 -20
- package/src/operations/stats.ts +342 -0
- package/src/operations/write.ts +48 -40
- package/src/refresh/runner.ts +1 -1
- package/src/refresh/scheduler.ts +22 -13
package/src/operations/write.ts
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
import { z } from "zod";
|
|
2
|
+
import { resolveEmbeddingWorkers } from "../context.ts";
|
|
2
3
|
import { insertChunksForVersion, rebuildFts } from "../db/chunks.ts";
|
|
3
4
|
import { insertVersion, millisIso } from "../db/files.ts";
|
|
4
5
|
import { chunkDeterministic } from "../ingest/chunker.ts";
|
|
5
6
|
import { describe } from "../ingest/describer.ts";
|
|
6
7
|
import { embed } from "../ingest/embedder.ts";
|
|
8
|
+
import { withEmbedderPool } from "../ingest/embedder-pool.ts";
|
|
7
9
|
import { parseDuration } from "../ingest/ingest.ts";
|
|
8
10
|
import { sha256Hex } from "../ingest/local-reader.ts";
|
|
9
11
|
import { buildSearchText } from "../ingest/search-text.ts";
|
|
@@ -30,48 +32,54 @@ export const writeOperation = defineOperation({
|
|
|
30
32
|
console_formatter: (result) =>
|
|
31
33
|
`${colors.green("✓")} ${colors.cyan(result.logical_path)} ${colors.dim(`@ ${result.version_id}`)} ${colors.dim(`(${result.size_bytes}B)`)}`,
|
|
32
34
|
handler: async (input, ctx) => {
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
const
|
|
37
|
-
|
|
38
|
-
|
|
35
|
+
// Per-command embedder pool: spawn workers, embed this version's
|
|
36
|
+
// chunks in parallel, kill workers before returning. Short-circuits
|
|
37
|
+
// to single-process when `embedding.workers` is 1.
|
|
38
|
+
const workers = resolveEmbeddingWorkers(ctx.config.embedding.workers);
|
|
39
|
+
return withEmbedderPool(workers, ctx.config.embedding_model, async () => {
|
|
40
|
+
const refreshSec = parseDuration(input.refresh_frequency);
|
|
41
|
+
const bytes = new TextEncoder().encode(input.content);
|
|
42
|
+
const description = await describe(input.logical_path, "text/markdown", input.content, ctx.config.llm);
|
|
43
|
+
const chunks = chunkDeterministic(input.content, ctx.config.chunker);
|
|
44
|
+
const searchTexts = chunks.map((c) => buildSearchText(input.logical_path, description, c.content));
|
|
45
|
+
const embeddings = await embed(searchTexts, ctx.config.embedding_model);
|
|
39
46
|
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
47
|
+
const versionId = millisIso(Date.now());
|
|
48
|
+
const contentSha = sha256Hex(bytes);
|
|
49
|
+
await insertVersion(ctx.db, {
|
|
50
|
+
logical_path: input.logical_path,
|
|
51
|
+
version_id: versionId,
|
|
52
|
+
source_type: "inline",
|
|
53
|
+
source_path: null,
|
|
54
|
+
source_mtime_ms: null,
|
|
55
|
+
source_sha256: contentSha,
|
|
56
|
+
blob_sha256: null,
|
|
57
|
+
content_sha256: contentSha,
|
|
58
|
+
content: input.content,
|
|
59
|
+
description,
|
|
60
|
+
mime_type: "text/markdown",
|
|
61
|
+
size_bytes: bytes.byteLength,
|
|
62
|
+
fetcher: "inline",
|
|
63
|
+
refresh_frequency_sec: refreshSec,
|
|
64
|
+
refreshed_at: new Date().toISOString(),
|
|
65
|
+
last_refresh_status: "ok",
|
|
66
|
+
change_note: input.change_note ?? null,
|
|
67
|
+
});
|
|
61
68
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
69
|
+
await insertChunksForVersion(
|
|
70
|
+
ctx.db,
|
|
71
|
+
input.logical_path,
|
|
72
|
+
versionId,
|
|
73
|
+
chunks.map((c, i) => ({
|
|
74
|
+
chunk_index: c.index,
|
|
75
|
+
chunk_content: c.content,
|
|
76
|
+
search_text: searchTexts[i] ?? buildSearchText(input.logical_path, description, c.content),
|
|
77
|
+
embedding: embeddings[i] ?? new Array(embeddings[0]?.length ?? 0).fill(0),
|
|
78
|
+
})),
|
|
79
|
+
);
|
|
80
|
+
await rebuildFts(ctx.db);
|
|
74
81
|
|
|
75
|
-
|
|
82
|
+
return { logical_path: input.logical_path, version_id: versionId, size_bytes: bytes.byteLength };
|
|
83
|
+
});
|
|
76
84
|
},
|
|
77
85
|
});
|
package/src/refresh/runner.ts
CHANGED
|
@@ -221,7 +221,7 @@ async function runPipelineForRefresh(
|
|
|
221
221
|
});
|
|
222
222
|
|
|
223
223
|
onPhase?.("converting");
|
|
224
|
-
const conversion = await convert(p.bytes, p.mime, p.source, ctx.config.llm);
|
|
224
|
+
const conversion = await convert(p.bytes, p.mime, p.source, ctx.config.llm, ctx.config.converters);
|
|
225
225
|
const markdown = conversion.markdown;
|
|
226
226
|
onPhase?.("describing");
|
|
227
227
|
const description = await describe(p.logicalPath, p.mime, markdown, ctx.config.llm);
|
package/src/refresh/scheduler.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
import type
|
|
1
|
+
import { type AppContext, resolveEmbeddingWorkers } from "../context.ts";
|
|
2
2
|
import { listDueRefreshes } from "../db/files.ts";
|
|
3
|
+
import { withEmbedderPool } from "../ingest/embedder-pool.ts";
|
|
3
4
|
import { logger } from "../output/logger.ts";
|
|
4
5
|
import { type RefreshOutcome, refreshOne } from "./runner.ts";
|
|
5
6
|
|
|
@@ -7,22 +8,30 @@ import { type RefreshOutcome, refreshOne } from "./runner.ts";
|
|
|
7
8
|
* One scheduler tick: refresh every row whose `refresh_frequency_sec` has
|
|
8
9
|
* elapsed since `refreshed_at`. Errors on individual rows are logged and
|
|
9
10
|
* the loop continues so one bad source doesn't halt the daemon.
|
|
11
|
+
*
|
|
12
|
+
* The embedder worker pool is per-tick: spun up only if there are due rows,
|
|
13
|
+
* torn down before the tick returns. The daemon never holds idle workers
|
|
14
|
+
* between ticks (which can be minutes apart).
|
|
10
15
|
*/
|
|
11
16
|
export async function runDueRefreshes(ctx: AppContext): Promise<RefreshOutcome[]> {
|
|
12
17
|
const due = await listDueRefreshes(ctx.db);
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
18
|
+
if (due.length === 0) return [];
|
|
19
|
+
const workers = resolveEmbeddingWorkers(ctx.config.embedding.workers);
|
|
20
|
+
return withEmbedderPool(workers, ctx.config.embedding_model, async () => {
|
|
21
|
+
const out: RefreshOutcome[] = [];
|
|
22
|
+
for (const row of due) {
|
|
23
|
+
try {
|
|
24
|
+
const r = await refreshOne(ctx, row.logical_path);
|
|
25
|
+
out.push(r);
|
|
26
|
+
if (r.status === "ok") logger.info(`refresh: ${row.logical_path} → new version ${r.new_version_id}`);
|
|
27
|
+
} catch (err) {
|
|
28
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
29
|
+
logger.warn(`refresh: ${row.logical_path} failed (${msg})`);
|
|
30
|
+
out.push({ logical_path: row.logical_path, status: "failed", error: msg });
|
|
31
|
+
}
|
|
23
32
|
}
|
|
24
|
-
|
|
25
|
-
|
|
33
|
+
return out;
|
|
34
|
+
});
|
|
26
35
|
}
|
|
27
36
|
|
|
28
37
|
/**
|