membot 0.6.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/membot.md +3 -0
- package/.cursor/rules/membot.mdc +3 -0
- package/README.md +5 -0
- package/package.json +1 -1
- package/scripts/build-test-docx.ts +84 -0
- package/src/cli.ts +11 -0
- package/src/config/schemas.ts +20 -0
- package/src/constants.ts +15 -0
- package/src/context.ts +24 -0
- package/src/ingest/converter/docx.ts +47 -5
- package/src/ingest/converter/html.ts +10 -3
- package/src/ingest/converter/image.ts +40 -3
- package/src/ingest/converter/images-inline.ts +132 -0
- package/src/ingest/converter/index.ts +4 -3
- package/src/ingest/embed-worker.ts +74 -0
- package/src/ingest/embedder-pool.ts +391 -0
- package/src/ingest/embedder.ts +40 -2
- package/src/ingest/ingest.ts +1 -1
- package/src/operations/add.ts +94 -86
- package/src/operations/index.ts +2 -0
- package/src/operations/refresh.ts +28 -20
- package/src/operations/stats.ts +342 -0
- package/src/operations/write.ts +48 -40
- package/src/refresh/runner.ts +1 -1
- package/src/refresh/scheduler.ts +22 -13
package/.claude/skills/membot.md
CHANGED
|
@@ -64,6 +64,7 @@ membot read <logical_path> # current markdown surrogate
|
|
|
64
64
|
membot read <logical_path> --bytes # original bytes (base64) — PDF/DOCX/image as ingested
|
|
65
65
|
membot read <logical_path> --version <ts> # historical snapshot
|
|
66
66
|
membot info <logical_path> # metadata only (no content)
|
|
67
|
+
membot stats [prefix] # whole-index summary; optional prefix scopes the aggregates
|
|
67
68
|
membot versions <logical_path> # every version, newest first
|
|
68
69
|
membot diff <logical_path> --a <ts> [--b <ts>] # unified diff between versions
|
|
69
70
|
```
|
|
@@ -129,6 +130,7 @@ Tombstones hide a path from `ls` / `tree` / `search` but `versions` and `read --
|
|
|
129
130
|
| `membot write <path> --content <txt>` | Write inline agent-authored markdown as a new version |
|
|
130
131
|
| `membot search <query>` | Hybrid search (semantic + BM25); add `--include-history` to search older versions |
|
|
131
132
|
| `membot info <path>` | Inspect metadata (source, downloader, refresh schedule, digests) without content |
|
|
133
|
+
| `membot stats [prefix]` | Summarize the index (file/version/chunk/blob counts, on-disk size, refresh health, mime/source/downloader breakdowns); optional prefix scopes |
|
|
132
134
|
| `membot versions <path>` | List every version newest-first with version_id and change notes |
|
|
133
135
|
| `membot diff <path> --a <ts>` | Unified diff between two versions |
|
|
134
136
|
| `membot mv <old> <new>` | Rename a logical_path (history preserved) |
|
|
@@ -161,4 +163,5 @@ Tombstones hide a path from `ls` / `tree` / `search` but `versions` and `read --
|
|
|
161
163
|
|
|
162
164
|
- Data lives in `~/.membot/index.duckdb` (override via `MEMBOT_HOME`).
|
|
163
165
|
- Optional `ANTHROPIC_API_KEY` enables LLM fallback for messy/binary input. Without it, conversion degrades to deterministic native output.
|
|
166
|
+
- `embedding.workers` (config key) caps the per-command embed-worker subprocess pool spawned at the top of `add` / `refresh` / `write`. Default `null` resolves to `cpus()-1`; set `1` to disable the pool.
|
|
164
167
|
- Config file: `~/.membot/config.json` (see `membot --help` for the global flags).
|
package/.cursor/rules/membot.mdc
CHANGED
|
@@ -64,6 +64,7 @@ membot read <logical_path> # current markdown surrogate
|
|
|
64
64
|
membot read <logical_path> --bytes # original bytes (base64) — PDF/DOCX/image as ingested
|
|
65
65
|
membot read <logical_path> --version <ts> # historical snapshot
|
|
66
66
|
membot info <logical_path> # metadata only (no content)
|
|
67
|
+
membot stats [prefix] # whole-index summary; optional prefix scopes the aggregates
|
|
67
68
|
membot versions <logical_path> # every version, newest first
|
|
68
69
|
membot diff <logical_path> --a <ts> [--b <ts>] # unified diff between versions
|
|
69
70
|
```
|
|
@@ -129,6 +130,7 @@ Tombstones hide a path from `ls` / `tree` / `search` but `versions` and `read --
|
|
|
129
130
|
| `membot write <path> --content <txt>` | Write inline agent-authored markdown as a new version |
|
|
130
131
|
| `membot search <query>` | Hybrid search (semantic + BM25); add `--include-history` to search older versions |
|
|
131
132
|
| `membot info <path>` | Inspect metadata (source, downloader, refresh schedule, digests) without content |
|
|
133
|
+
| `membot stats [prefix]` | Summarize the index (file/version/chunk/blob counts, on-disk size, refresh health, mime/source/downloader breakdowns); optional prefix scopes |
|
|
132
134
|
| `membot versions <path>` | List every version newest-first with version_id and change notes |
|
|
133
135
|
| `membot diff <path> --a <ts>` | Unified diff between two versions |
|
|
134
136
|
| `membot mv <old> <new>` | Rename a logical_path (history preserved) |
|
|
@@ -161,4 +163,5 @@ Tombstones hide a path from `ls` / `tree` / `search` but `versions` and `read --
|
|
|
161
163
|
|
|
162
164
|
- Data lives in `~/.membot/index.duckdb` (override via `MEMBOT_HOME`).
|
|
163
165
|
- Optional `ANTHROPIC_API_KEY` enables LLM fallback for messy/binary input. Without it, conversion degrades to deterministic native output.
|
|
166
|
+
- `embedding.workers` (config key) caps the per-command embed-worker subprocess pool spawned at the top of `add` / `refresh` / `write`. Default `null` resolves to `cpus()-1`; set `1` to disable the pool.
|
|
164
167
|
- Config file: `~/.membot/config.json` (see `membot --help` for the global flags).
|
package/README.md
CHANGED
|
@@ -83,6 +83,7 @@ The skill files describe the discover → ingest → search → read → write w
|
|
|
83
83
|
| `membot read <path>` | Read the markdown surrogate (or `--bytes` for original bytes, base64) |
|
|
84
84
|
| `membot search <query>` | Hybrid search (semantic + BM25); `--include-history` searches older versions |
|
|
85
85
|
| `membot info <path>` | Inspect metadata (source, fetcher, schedule, digests) without content |
|
|
86
|
+
| `membot stats [prefix]` | Summarize the index (file/version/chunk/blob counts, on-disk size, refresh health, mime/source/downloader breakdowns); optional prefix scopes the aggregates |
|
|
86
87
|
| `membot versions <path>` | List every version newest-first |
|
|
87
88
|
| `membot diff <path> <a> [b]` | Unified diff between two versions |
|
|
88
89
|
| `membot write <path>` | Write inline agent-authored markdown as a new version |
|
|
@@ -136,11 +137,15 @@ Add `--watch` (and optional `--tick <sec>`) to also run the refresh daemon, whic
|
|
|
136
137
|
membot config list # show every value (secrets masked)
|
|
137
138
|
membot config set llm.anthropic_api_key sk-ant-... # enable LLM-fallback paths
|
|
138
139
|
membot config set chunker.target_chars 800 # tweak any nested value
|
|
140
|
+
membot config set embedding.workers 4 # cap parallel embed workers
|
|
141
|
+
membot config set converters.max_inline_image_captions 50 # raise per-doc cap on vision captions for embedded images
|
|
139
142
|
membot config get llm.anthropic_api_key --show-secrets # reveal the masked key
|
|
140
143
|
membot config unset chunker.target_chars # back to schema default
|
|
141
144
|
membot config path # print the absolute config path
|
|
142
145
|
```
|
|
143
146
|
|
|
147
|
+
**Parallel embedding:** `embedding.workers` (default `null` → `cpus()-1`) controls how many subprocess workers fan out the WASM embedding work. The pool is **per-command** — spawned at the start of `add` / `refresh` / `write` and killed before the command returns, so membot doesn't keep idle workers around between invocations. Each worker loads its own ~50MB copy of the model, so on RAM-constrained machines drop it to a small fixed number (e.g. `4`); set `1` to disable the pool entirely and embed inline.
|
|
148
|
+
|
|
144
149
|
Values are written with file mode `0600`. `ANTHROPIC_API_KEY` set in the environment still wins on read, so existing env-var setups keep working.
|
|
145
150
|
- **Browser session:** `~/.membot/auth/browser-profile/` (Playwright persistent profile — cookies, localStorage, IndexedDB). Captured by `membot login`; cookie-based downloaders (Google) reuse it on every fetch. Delete the directory to force a fresh login.
|
|
146
151
|
- **API keys:** stored under `downloaders.<service>.api_key` in `~/.membot/config.json`. Read by API-based downloaders (GitHub, Linear).
|
package/package.json
CHANGED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* One-shot generator for `test/fixtures/sample-with-image.docx`. Run this
|
|
4
|
+
* (`bun scripts/build-test-docx.ts`) when the fixture is missing or when
|
|
5
|
+
* the embedded test image needs to change. The DOCX itself is committed
|
|
6
|
+
* to the repo so test runs don't depend on jszip-as-transitive-dep.
|
|
7
|
+
*/
|
|
8
|
+
import { mkdirSync, writeFileSync } from "node:fs";
|
|
9
|
+
import { dirname } from "node:path";
|
|
10
|
+
// jszip ships transitively via mammoth; this script is run by hand, not in tests.
|
|
11
|
+
import JSZip from "../node_modules/jszip/lib/index.js";
|
|
12
|
+
|
|
13
|
+
const TINY_PNG_BASE64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=";
|
|
14
|
+
|
|
15
|
+
const documentXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
|
16
|
+
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
|
17
|
+
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
|
|
18
|
+
xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"
|
|
19
|
+
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
|
|
20
|
+
xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture">
|
|
21
|
+
<w:body>
|
|
22
|
+
<w:p><w:r><w:t>Lead paragraph before the diagram.</w:t></w:r></w:p>
|
|
23
|
+
<w:p><w:r><w:drawing>
|
|
24
|
+
<wp:inline>
|
|
25
|
+
<wp:extent cx="635" cy="635"/>
|
|
26
|
+
<wp:docPr id="1" name="Picture 1" descr="architecture diagram"/>
|
|
27
|
+
<a:graphic>
|
|
28
|
+
<a:graphicData uri="http://schemas.openxmlformats.org/drawingml/2006/picture">
|
|
29
|
+
<pic:pic>
|
|
30
|
+
<pic:nvPicPr>
|
|
31
|
+
<pic:cNvPr id="1" name="img.png" descr="architecture diagram"/>
|
|
32
|
+
<pic:cNvPicPr/>
|
|
33
|
+
</pic:nvPicPr>
|
|
34
|
+
<pic:blipFill>
|
|
35
|
+
<a:blip r:embed="rId1"/>
|
|
36
|
+
<a:stretch><a:fillRect/></a:stretch>
|
|
37
|
+
</pic:blipFill>
|
|
38
|
+
<pic:spPr>
|
|
39
|
+
<a:xfrm><a:off x="0" y="0"/><a:ext cx="635" cy="635"/></a:xfrm>
|
|
40
|
+
<a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
|
|
41
|
+
</pic:spPr>
|
|
42
|
+
</pic:pic>
|
|
43
|
+
</a:graphicData>
|
|
44
|
+
</a:graphic>
|
|
45
|
+
</wp:inline>
|
|
46
|
+
</w:drawing></w:r></w:p>
|
|
47
|
+
<w:p><w:r><w:t>Trailing paragraph after the diagram.</w:t></w:r></w:p>
|
|
48
|
+
</w:body>
|
|
49
|
+
</w:document>`;
|
|
50
|
+
|
|
51
|
+
const documentRels = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
|
52
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
53
|
+
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="media/image1.png"/>
|
|
54
|
+
</Relationships>`;
|
|
55
|
+
|
|
56
|
+
const rootRels = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
|
57
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
58
|
+
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
|
|
59
|
+
</Relationships>`;
|
|
60
|
+
|
|
61
|
+
const contentTypes = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
|
62
|
+
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
|
|
63
|
+
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
|
|
64
|
+
<Default Extension="xml" ContentType="application/xml"/>
|
|
65
|
+
<Default Extension="png" ContentType="image/png"/>
|
|
66
|
+
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
|
|
67
|
+
</Types>`;
|
|
68
|
+
|
|
69
|
+
async function main(): Promise<void> {
|
|
70
|
+
const zip = new JSZip();
|
|
71
|
+
zip.file("[Content_Types].xml", contentTypes);
|
|
72
|
+
zip.file("_rels/.rels", rootRels);
|
|
73
|
+
zip.file("word/document.xml", documentXml);
|
|
74
|
+
zip.file("word/_rels/document.xml.rels", documentRels);
|
|
75
|
+
zip.file("word/media/image1.png", Buffer.from(TINY_PNG_BASE64, "base64"));
|
|
76
|
+
|
|
77
|
+
const buffer = await zip.generateAsync({ type: "nodebuffer" });
|
|
78
|
+
const out = "test/fixtures/sample-with-image.docx";
|
|
79
|
+
mkdirSync(dirname(out), { recursive: true });
|
|
80
|
+
writeFileSync(out, buffer);
|
|
81
|
+
console.log(`wrote ${out} (${buffer.byteLength} bytes)`);
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
await main();
|
package/src/cli.ts
CHANGED
|
@@ -10,12 +10,23 @@ import { registerReindexCommand } from "./commands/reindex.ts";
|
|
|
10
10
|
import { registerServeCommand } from "./commands/serve.ts";
|
|
11
11
|
import { registerSkillCommand } from "./commands/skill.ts";
|
|
12
12
|
import { registerUpgradeCommand } from "./commands/upgrade.ts";
|
|
13
|
+
import { EMBED_WORKER_SENTINEL } from "./constants.ts";
|
|
13
14
|
import type { BuildContextOptions } from "./context.ts";
|
|
15
|
+
import { runEmbedWorker } from "./ingest/embed-worker.ts";
|
|
14
16
|
import { mountAsCommanderCommand } from "./mount/commander.ts";
|
|
15
17
|
import { OPERATIONS } from "./operations/index.ts";
|
|
16
18
|
import { logger } from "./output/logger.ts";
|
|
17
19
|
import { maybeCheckForUpdate } from "./update/background.ts";
|
|
18
20
|
|
|
21
|
+
// Hidden worker mode: the EmbedderPool re-execs this binary with the sentinel
|
|
22
|
+
// as argv[2] (or argv[1] when `bun run src/cli.ts <sentinel>` is invoked
|
|
23
|
+
// directly during tests). We bypass commander entirely and run the worker
|
|
24
|
+
// stdin/stdout protocol loop instead.
|
|
25
|
+
if (process.argv.includes(EMBED_WORKER_SENTINEL)) {
|
|
26
|
+
await runEmbedWorker();
|
|
27
|
+
process.exit(0);
|
|
28
|
+
}
|
|
29
|
+
|
|
19
30
|
program
|
|
20
31
|
.name("membot")
|
|
21
32
|
.description("Versioned context store with hybrid search for AI agents. Stdio + HTTP MCP server and CLI.")
|
package/src/config/schemas.ts
CHANGED
|
@@ -7,6 +7,10 @@ export const ChunkerConfigSchema = z.object({
|
|
|
7
7
|
max_chars: z.number().int().positive().default(DEFAULTS.CHUNKER_MAX_CHARS),
|
|
8
8
|
});
|
|
9
9
|
|
|
10
|
+
export const ConvertersConfigSchema = z.object({
|
|
11
|
+
max_inline_image_captions: z.number().int().nonnegative().default(DEFAULTS.MAX_INLINE_IMAGE_CAPTIONS),
|
|
12
|
+
});
|
|
13
|
+
|
|
10
14
|
export const LlmConfigSchema = z.object({
|
|
11
15
|
anthropic_api_key: z.string().meta({ secret: true }).default(""),
|
|
12
16
|
converter_model: z.string().default(DEFAULTS.CONVERTER_MODEL),
|
|
@@ -19,6 +23,18 @@ export const DaemonConfigSchema = z.object({
|
|
|
19
23
|
tick_interval_sec: z.number().int().positive().default(DEFAULTS.DAEMON_TICK_SEC),
|
|
20
24
|
});
|
|
21
25
|
|
|
26
|
+
/**
|
|
27
|
+
* Embedding parallelism. `workers = null` (the default) resolves to
|
|
28
|
+
* `max(1, cpus()-1)` at context-build time so the pool grows with the host
|
|
29
|
+
* machine. Setting `workers = 1` disables the subprocess pool entirely
|
|
30
|
+
* and runs embedding inline in the parent (the original single-thread
|
|
31
|
+
* behaviour). Each worker loads its own copy of the WASM model
|
|
32
|
+
* (~50MB resident), so cap this on RAM-constrained machines.
|
|
33
|
+
*/
|
|
34
|
+
export const EmbeddingConfigSchema = z.object({
|
|
35
|
+
workers: z.number().int().min(1).nullable().default(null),
|
|
36
|
+
});
|
|
37
|
+
|
|
22
38
|
export const LinearDownloaderConfigSchema = z.object({
|
|
23
39
|
api_key: z.string().meta({ secret: true }).default(""),
|
|
24
40
|
});
|
|
@@ -43,6 +59,8 @@ export const MembotConfigSchema = z.object({
|
|
|
43
59
|
embedding_model: z.string().default(EMBEDDING_MODEL),
|
|
44
60
|
embedding_dimension: z.number().int().positive().default(EMBEDDING_DIMENSION),
|
|
45
61
|
chunker: ChunkerConfigSchema.default(() => ChunkerConfigSchema.parse({})),
|
|
62
|
+
embedding: EmbeddingConfigSchema.default(() => EmbeddingConfigSchema.parse({})),
|
|
63
|
+
converters: ConvertersConfigSchema.default(() => ConvertersConfigSchema.parse({})),
|
|
46
64
|
llm: LlmConfigSchema.default(() => LlmConfigSchema.parse({})),
|
|
47
65
|
downloaders: DownloadersConfigSchema.default(() => DownloadersConfigSchema.parse({})),
|
|
48
66
|
daemon: DaemonConfigSchema.default(() => DaemonConfigSchema.parse({})),
|
|
@@ -52,6 +70,8 @@ export const MembotConfigSchema = z.object({
|
|
|
52
70
|
|
|
53
71
|
export type MembotConfig = z.infer<typeof MembotConfigSchema>;
|
|
54
72
|
export type ChunkerConfig = z.infer<typeof ChunkerConfigSchema>;
|
|
73
|
+
export type EmbeddingConfig = z.infer<typeof EmbeddingConfigSchema>;
|
|
74
|
+
export type ConvertersConfig = z.infer<typeof ConvertersConfigSchema>;
|
|
55
75
|
export type LlmConfig = z.infer<typeof LlmConfigSchema>;
|
|
56
76
|
export type DownloadersConfig = z.infer<typeof DownloadersConfigSchema>;
|
|
57
77
|
export type LinearDownloaderConfig = z.infer<typeof LinearDownloaderConfigSchema>;
|
package/src/constants.ts
CHANGED
|
@@ -28,6 +28,14 @@ export const EMBEDDING_DIMENSION = 384;
|
|
|
28
28
|
*/
|
|
29
29
|
export const EMBEDDING_BATCH_SIZE = 16;
|
|
30
30
|
|
|
31
|
+
/**
|
|
32
|
+
* Hidden first-arg sentinel that re-execs the membot binary as an embed
|
|
33
|
+
* worker. The pool spawns `process.execPath <sentinel>` so the same compiled
|
|
34
|
+
* binary serves both the user-facing CLI and the worker subprocess; cli.ts
|
|
35
|
+
* checks this argv slot before commander sees it.
|
|
36
|
+
*/
|
|
37
|
+
export const EMBED_WORKER_SENTINEL = "__embed_worker";
|
|
38
|
+
|
|
31
39
|
export const DEFAULTS = {
|
|
32
40
|
CHUNKER_MODE: "deterministic" as const,
|
|
33
41
|
CHUNKER_TARGET_CHARS: 4_000,
|
|
@@ -40,6 +48,13 @@ export const DEFAULTS = {
|
|
|
40
48
|
VISION_MODEL: "claude-haiku-4-5-20251001",
|
|
41
49
|
UPDATE_CHECK_INTERVAL_MS: 24 * 60 * 60 * 1000,
|
|
42
50
|
UPDATE_CHECK_TIMEOUT_MS: 5_000,
|
|
51
|
+
/**
|
|
52
|
+
* Per-document cap on Claude vision caption calls when expanding inline
|
|
53
|
+
* images during DOCX/HTML conversion. Beyond this, images get a small
|
|
54
|
+
* deterministic placeholder so a slide-deck-shaped doc with hundreds of
|
|
55
|
+
* embedded images doesn't fan out into hundreds of vision requests.
|
|
56
|
+
*/
|
|
57
|
+
MAX_INLINE_IMAGE_CAPTIONS: 20,
|
|
43
58
|
} as const;
|
|
44
59
|
|
|
45
60
|
export const FILES = {
|
package/src/context.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { cpus } from "node:os";
|
|
1
2
|
import { join } from "node:path";
|
|
2
3
|
import { loadConfig } from "./config/loader.ts";
|
|
3
4
|
import type { MembotConfig } from "./config/schemas.ts";
|
|
@@ -25,11 +26,34 @@ export interface BuildContextOptions {
|
|
|
25
26
|
noInteractive?: boolean;
|
|
26
27
|
}
|
|
27
28
|
|
|
29
|
+
/**
|
|
30
|
+
* Resolve `config.embedding.workers` to a concrete worker count. Precedence:
|
|
31
|
+
* 1. An explicit numeric value in the config wins (user opt-in).
|
|
32
|
+
* 2. `MEMBOT_EMBEDDING_WORKERS` env var, if set to a positive integer.
|
|
33
|
+
* The test harness sets this to `1` so unit tests doing tiny writes
|
|
34
|
+
* don't pay the per-pool subprocess-spawn cost on slow CI runners.
|
|
35
|
+
* 3. Otherwise `null`/missing → `max(1, cpus()-1)`. The minus-one leaves
|
|
36
|
+
* a core for the parent process (DB writes, IO, the spinner).
|
|
37
|
+
*/
|
|
38
|
+
export function resolveEmbeddingWorkers(configured: number | null | undefined): number {
|
|
39
|
+
if (typeof configured === "number" && configured >= 1) return configured;
|
|
40
|
+
const envOverride = process.env.MEMBOT_EMBEDDING_WORKERS;
|
|
41
|
+
if (envOverride) {
|
|
42
|
+
const n = Number(envOverride);
|
|
43
|
+
if (Number.isFinite(n) && n >= 1) return Math.floor(n);
|
|
44
|
+
}
|
|
45
|
+
return Math.max(1, cpus().length - 1);
|
|
46
|
+
}
|
|
47
|
+
|
|
28
48
|
/**
|
|
29
49
|
* Build the AppContext used by every operation handler. Initializes:
|
|
30
50
|
* - output mode (TTY/JSON/color detection — frozen for the rest of the run)
|
|
31
51
|
* - config (~/.membot/config.json with env overrides)
|
|
32
52
|
* - DuckDB connection (~/.membot/index.duckdb), running migrations on first open
|
|
53
|
+
*
|
|
54
|
+
* The embedder worker pool is NOT created here — it's per-command,
|
|
55
|
+
* spawned by `withEmbedderPool()` at the top of bulk-embedding handlers
|
|
56
|
+
* (`add`, `refresh`, `write`) and disposed before they return.
|
|
33
57
|
*/
|
|
34
58
|
export async function buildContext(options: BuildContextOptions = {}): Promise<AppContext> {
|
|
35
59
|
setMode(detectMode({ json: options.json, verbose: options.verbose, noColor: options.noColor }));
|
|
package/src/ingest/converter/docx.ts
CHANGED
|
@@ -1,15 +1,57 @@
|
|
|
1
1
|
import mammoth from "mammoth";
|
|
2
2
|
import TurndownService from "turndown";
|
|
3
|
+
import type { ConvertersConfig, LlmConfig } from "../../config/schemas.ts";
|
|
4
|
+
import { type CapturedImage, inlineImageCaptions, MEMBOT_IMG_PREFIX } from "./images-inline.ts";
|
|
3
5
|
|
|
4
6
|
const turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced", bulletListMarker: "-" });
|
|
5
7
|
|
|
8
|
+
/**
|
|
9
|
+
* Mammoth's image element wears an `altText` field that isn't reflected in
|
|
10
|
+
* the published `.d.ts`. We declare the bits we actually touch so the rest
|
|
11
|
+
* of the module can stay strict-typed.
|
|
12
|
+
*/
|
|
13
|
+
interface MammothImage {
|
|
14
|
+
contentType: string;
|
|
15
|
+
altText?: string;
|
|
16
|
+
readAsBuffer: () => Promise<Buffer>;
|
|
17
|
+
}
|
|
18
|
+
|
|
6
19
|
/**
|
|
7
20
|
* Convert a DOCX file to markdown. Mammoth gives us HTML; we then run that
|
|
8
|
-
* through turndown to get clean markdown.
|
|
9
|
-
*
|
|
21
|
+
* through turndown to get clean markdown. Embedded images (which mammoth
|
|
22
|
+
* would otherwise inline as 5MB base64 `data:` URIs) are intercepted and
|
|
23
|
+
* replaced with `membot-img://<id>` placeholders, then expanded into Claude
|
|
24
|
+
* vision captions by `inlineImageCaptions`. Conversion warnings from
|
|
25
|
+
* mammoth are silently dropped — they're typically about styles we don't
|
|
26
|
+
* preserve.
|
|
10
27
|
*/
|
|
11
|
-
export async function convertDocx(bytes: Uint8Array): Promise<string> {
|
|
28
|
+
export async function convertDocx(bytes: Uint8Array, llm: LlmConfig, converters: ConvertersConfig): Promise<string> {
|
|
12
29
|
const buf = Buffer.from(bytes);
|
|
13
|
-
const
|
|
14
|
-
|
|
30
|
+
const images = new Map<string, CapturedImage>();
|
|
31
|
+
let counter = 0;
|
|
32
|
+
|
|
33
|
+
const result = await mammoth.convertToHtml(
|
|
34
|
+
{ buffer: buf },
|
|
35
|
+
{
|
|
36
|
+
convertImage: mammoth.images.imgElement(async (image) => {
|
|
37
|
+
const img = image as unknown as MammothImage;
|
|
38
|
+
const id = `img-${counter++}`;
|
|
39
|
+
try {
|
|
40
|
+
const buffer = await img.readAsBuffer();
|
|
41
|
+
images.set(id, {
|
|
42
|
+
bytes: new Uint8Array(buffer),
|
|
43
|
+
mimeType: img.contentType,
|
|
44
|
+
altText: img.altText,
|
|
45
|
+
});
|
|
46
|
+
} catch {
|
|
47
|
+
// If we can't read the image bytes, still emit the placeholder so
|
|
48
|
+
// turndown doesn't fall back to a giant inline data URI.
|
|
49
|
+
}
|
|
50
|
+
return { src: `${MEMBOT_IMG_PREFIX}${id}` };
|
|
51
|
+
}),
|
|
52
|
+
},
|
|
53
|
+
);
|
|
54
|
+
|
|
55
|
+
const md = turndown.turndown(result.value).trim();
|
|
56
|
+
return inlineImageCaptions(md, images, llm, converters);
|
|
15
57
|
}
|
|
package/src/ingest/converter/html.ts
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
import TurndownService from "turndown";
|
|
2
|
+
import type { ConvertersConfig, LlmConfig } from "../../config/schemas.ts";
|
|
3
|
+
import { extractDataUriImages, inlineImageCaptions } from "./images-inline.ts";
|
|
2
4
|
|
|
3
5
|
const turndown = new TurndownService({
|
|
4
6
|
headingStyle: "atx",
|
|
@@ -8,13 +10,18 @@ const turndown = new TurndownService({
|
|
|
8
10
|
|
|
9
11
|
/**
|
|
10
12
|
* Convert HTML bytes to markdown using turndown. Strips script/style blocks
|
|
11
|
-
* before conversion so they don't leak into the chunker.
|
|
13
|
+
* before conversion so they don't leak into the chunker. Inline data-URI
|
|
14
|
+
* images are extracted into their bytes and replaced with vision captions
|
|
15
|
+
* via `inlineImageCaptions`; external `<img src="https://…">` references
|
|
16
|
+
* are left for turndown to render normally.
|
|
12
17
|
*/
|
|
13
|
-
export function convertHtml(bytes: Uint8Array): string {
|
|
18
|
+
export async function convertHtml(bytes: Uint8Array, llm: LlmConfig, converters: ConvertersConfig): Promise<string> {
|
|
14
19
|
const html = new TextDecoder("utf-8").decode(bytes);
|
|
15
20
|
const cleaned = html
|
|
16
21
|
.replace(/<script[\s\S]*?<\/script>/gi, "")
|
|
17
22
|
.replace(/<style[\s\S]*?<\/style>/gi, "")
|
|
18
23
|
.replace(/<noscript[\s\S]*?<\/noscript>/gi, "");
|
|
19
|
-
|
|
24
|
+
const { html: rewritten, images } = extractDataUriImages(cleaned);
|
|
25
|
+
const md = turndown.turndown(rewritten).trim();
|
|
26
|
+
return inlineImageCaptions(md, images, llm, converters);
|
|
20
27
|
}
|
|
package/src/ingest/converter/image.ts
CHANGED
|
@@ -12,6 +12,13 @@ Output the caption only, no preamble.`;
|
|
|
12
12
|
|
|
13
13
|
const VISION_MIMES = new Set(["image/png", "image/jpeg", "image/gif", "image/webp"]);
|
|
14
14
|
|
|
15
|
+
/** Anthropic vision rejects images > 5MB; stay under that with margin. */
|
|
16
|
+
const VISION_MAX_BYTES = 4 * 1024 * 1024;
|
|
17
|
+
/** Tesseract is roughly linear in pixel count; bail past this byte size to avoid pathological hangs. */
|
|
18
|
+
const OCR_MAX_BYTES = 8 * 1024 * 1024;
|
|
19
|
+
/** Hard wall-clock for either subtask so a stuck network call never freezes ingest. */
|
|
20
|
+
const SUBTASK_TIMEOUT_MS = 60_000;
|
|
21
|
+
|
|
15
22
|
/**
|
|
16
23
|
* Build the markdown surrogate for an image: an LLM-generated caption
|
|
17
24
|
* (when an API key is available) folded together with any text recovered
|
|
@@ -19,17 +26,47 @@ const VISION_MIMES = new Set(["image/png", "image/jpeg", "image/gif", "image/web
|
|
|
19
26
|
* when no API key is set.
|
|
20
27
|
*/
|
|
21
28
|
export async function convertImage(bytes: Uint8Array, mimeType: string, llm: LlmConfig): Promise<string> {
|
|
22
|
-
const captionPromise =
|
|
23
|
-
|
|
29
|
+
const captionPromise =
|
|
30
|
+
bytes.byteLength <= VISION_MAX_BYTES
|
|
31
|
+
? withTimeout(describeImage(bytes, mimeType, llm), SUBTASK_TIMEOUT_MS, "vision")
|
|
32
|
+
: Promise.resolve("");
|
|
33
|
+
const ocrPromise =
|
|
34
|
+
bytes.byteLength <= OCR_MAX_BYTES ? withTimeout(ocrImage(bytes), SUBTASK_TIMEOUT_MS, "ocr") : Promise.resolve("");
|
|
24
35
|
const [caption, ocrText] = await Promise.all([captionPromise, ocrPromise]);
|
|
25
36
|
|
|
26
37
|
const sections: string[] = [];
|
|
27
38
|
if (caption) sections.push(caption);
|
|
28
39
|
if (ocrText) sections.push(`## Text detected via OCR\n\n${ocrText}`);
|
|
29
|
-
if (sections.length === 0)
|
|
40
|
+
if (sections.length === 0) {
|
|
41
|
+
const note =
|
|
42
|
+
bytes.byteLength > VISION_MAX_BYTES
|
|
43
|
+
? `(image, ${mimeType}, ${bytes.byteLength} bytes — exceeds vision size limit, no caption available)`
|
|
44
|
+
: `(image, ${mimeType}, no caption available)`;
|
|
45
|
+
sections.push(note);
|
|
46
|
+
}
|
|
30
47
|
return sections.join("\n\n");
|
|
31
48
|
}
|
|
32
49
|
|
|
50
|
+
/**
|
|
51
|
+
* Race a promise against a timer so a stuck network call (vision) or a
|
|
52
|
+
* pathological CPU-bound job (OCR on a multi-megapixel image) never freezes
|
|
53
|
+
* the whole conversion pipeline. Logs a warning when the timer wins.
|
|
54
|
+
*/
|
|
55
|
+
async function withTimeout<T extends string>(p: Promise<T>, ms: number, label: string): Promise<T | ""> {
|
|
56
|
+
let timer: ReturnType<typeof setTimeout> | undefined;
|
|
57
|
+
const timeout = new Promise<"">((resolve) => {
|
|
58
|
+
timer = setTimeout(() => {
|
|
59
|
+
logger.warn(`image: ${label} timed out after ${ms}ms`);
|
|
60
|
+
resolve("");
|
|
61
|
+
}, ms);
|
|
62
|
+
});
|
|
63
|
+
try {
|
|
64
|
+
return await Promise.race([p, timeout]);
|
|
65
|
+
} finally {
|
|
66
|
+
if (timer) clearTimeout(timer);
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
|
|
33
70
|
/**
|
|
34
71
|
* Single-shot vision call asking Claude to caption an image. Returns the
|
|
35
72
|
* caption text or an empty string when the API key is missing or the
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
import type { ConvertersConfig, LlmConfig } from "../../config/schemas.ts";
|
|
2
|
+
import { logger } from "../../output/logger.ts";
|
|
3
|
+
import { convertImage } from "./image.ts";
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Bytes captured from an embedded image during DOCX/HTML conversion. The
|
|
7
|
+
* image-inlining helpers run `convertImage` over each one to produce a
|
|
8
|
+
* markdown caption that gets spliced back into the document body in place
|
|
9
|
+
* of the original `<img>` reference.
|
|
10
|
+
*/
|
|
11
|
+
export interface CapturedImage {
|
|
12
|
+
bytes: Uint8Array;
|
|
13
|
+
mimeType: string;
|
|
14
|
+
altText?: string;
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
/** URI scheme used to mark images that the inliner should expand. */
|
|
18
|
+
export const MEMBOT_IMG_PREFIX = "membot-img://";
|
|
19
|
+
|
|
20
|
+
/**
|
|
21
|
+
* Match `![alt](membot-img://<id>)` markdown image references. The id may
|
|
22
|
+
* contain any non-whitespace, non-`)` character so we don't accidentally
|
|
23
|
+
* stop at characters mammoth/turndown might emit inside an id.
|
|
24
|
+
*/
|
|
25
|
+
const TOKEN_RE = /!\[([^\]]*)\]\(membot-img:\/\/([^)\s]+)\)/g;
|
|
26
|
+
|
|
27
|
+
/**
 * Extract base64 data-URI images from raw HTML and rewrite each
 * `<img src="data:…;base64,…">` to `<img src="membot-img://<id>">`.
 *
 * The decoded bytes are stored in the returned map under the generated id
 * (`img-0`, `img-1`, …) so the shared `inlineImageCaptions` step can later
 * replace the reference with a caption. `<img>` tags whose `src` is not a
 * base64 data URI are left untouched, and attributes that merely end in
 * `src` (e.g. `data-src`) are never treated as the image source.
 *
 * A payload that fails to decode, or decodes to zero bytes, is logged and the
 * tag is rewritten with an empty `src`; no image is registered for it. The id
 * counter still advances, matching the ids handed out for valid images.
 *
 * @param html - Raw HTML to scan.
 * @returns The rewritten HTML and the captured images keyed by generated id.
 */
export function extractDataUriImages(html: string): { html: string; images: Map<string, CapturedImage> } {
  const images = new Map<string, CapturedImage>();
  let counter = 0;
  const rewritten = html.replace(
    // The lookbehind rejects `data-src`, `lazy-src`, … where a plain `\bsrc`
    // would match because `-` counts as a word boundary.
    /<img\b([^>]*?)(?<![\w-])src\s*=\s*(?:"data:([^";]+);base64,([^"]*)"|'data:([^';]+);base64,([^']*)')([^>]*)>/gi,
    (
      _match,
      beforeSrc: string,
      mimeDouble: string | undefined,
      b64Double: string | undefined,
      mimeSingle: string | undefined,
      b64Single: string | undefined,
      afterSrc: string,
    ) => {
      const mimeType = (mimeDouble ?? mimeSingle ?? "image/png").trim();
      // Base64 in HTML is frequently line-wrapped; strip whitespace before decoding.
      const b64 = (b64Double ?? b64Single ?? "").replace(/\s+/g, "");
      const id = `img-${counter++}`;

      let failure: string | undefined;
      try {
        const bytes = new Uint8Array(Buffer.from(b64, "base64"));
        if (bytes.byteLength > 0) {
          images.set(id, { bytes, mimeType });
        } else {
          // Buffer.from does not throw on malformed base64; an empty result is
          // the observable sign that there was nothing usable to decode.
          failure = "payload decoded to zero bytes";
        }
      } catch (err) {
        failure = err instanceof Error ? err.message : String(err);
      }

      if (failure !== undefined) {
        logger.warn(`images-inline: failed to decode embedded image (${failure})`);
        return `<img${beforeSrc} src=""${afterSrc}>`;
      }
      return `<img${beforeSrc} src="${MEMBOT_IMG_PREFIX}${id}"${afterSrc}>`;
    },
  );
  return { html: rewritten, images };
}
|
|
64
|
+
|
|
65
|
+
/**
 * Swap every `![alt](membot-img://<id>)` token in `markdown` for a caption
 * block built from the matching entry in `images`.
 *
 * Captions are requested from `convertImage` one at a time, in document
 * order, and each distinct id is captioned at most once. After
 * `converters.max_inline_image_captions` caption attempts (failed attempts
 * count too), remaining images get a deterministic placeholder instead of
 * another call, so a document full of embedded images cannot fan out into
 * hundreds of vision requests. A failed caption is logged and replaced with
 * a "no caption available" placeholder.
 *
 * Returns `markdown` unchanged when `images` is empty, so converters can call
 * this unconditionally.
 *
 * @param markdown - Converted markdown that may contain `membot-img://` tokens.
 * @param images - Captured images keyed by the id used in the tokens.
 * @param llm - LLM configuration forwarded to `convertImage`.
 * @param converters - Converter configuration; supplies the caption cap.
 * @returns The markdown with every token replaced by a caption block.
 */
export async function inlineImageCaptions(
  markdown: string,
  images: Map<string, CapturedImage>,
  llm: LlmConfig,
  converters: ConvertersConfig,
): Promise<string> {
  if (images.size === 0) {
    return markdown;
  }

  const rendered = new Map<string, string>();
  const skipped = new Set<string>();
  let attempts = 0;

  // Pass 1: walk the tokens in document order and caption each distinct image.
  for (const [, tokenAlt = "", imageId] of markdown.matchAll(TOKEN_RE)) {
    if (!imageId) continue;
    if (rendered.has(imageId) || skipped.has(imageId)) continue;

    const image = images.get(imageId);
    if (!image) continue;

    if (attempts >= converters.max_inline_image_captions) {
      skipped.add(imageId);
      continue;
    }
    attempts += 1;

    const label = tokenAlt || image.altText || "";
    try {
      const caption = await convertImage(image.bytes, image.mimeType, llm);
      rendered.set(imageId, formatCaptionBlock(label, caption));
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      logger.warn(`images-inline: caption failed for ${imageId} (${reason})`);
      rendered.set(imageId, formatCaptionBlock(label, `(image, ${image.mimeType}, no caption available)`));
    }
  }

  // Pass 2: substitute every token, falling back to placeholders for unknown
  // ids and for images that were skipped once the cap was reached.
  return markdown.replace(TOKEN_RE, (_token, tokenAlt: string, imageId: string) => {
    const ready = rendered.get(imageId);
    if (ready) {
      return ready;
    }

    const image = images.get(imageId);
    if (!image) {
      return formatCaptionBlock(tokenAlt, "(image, no caption available)");
    }

    const note = `(image, ${image.mimeType}, ${image.bytes.byteLength} bytes — caption skipped, exceeded max_inline_image_captions)`;
    return formatCaptionBlock(tokenAlt || image.altText || "", note);
  });
}
|
|
120
|
+
|
|
121
|
+
/**
 * Render a captioned image as its own markdown paragraph block.
 *
 * The block is wrapped in blank lines so it always forms a separate
 * paragraph, and the alt text is carried in a leading HTML comment so the
 * positional cue survives without appearing in the visible caption text.
 *
 * Alt text originates from the ingested document, so it is sanitised before
 * being placed inside the comment: whitespace runs (including newlines) are
 * collapsed to single spaces to keep the header on one line, and comment
 * terminators (`-->`, `--!>`) are broken up so the alt text cannot close the
 * comment early and leak into the document body.
 *
 * @param alt - Alt text for the image; blank means "no label".
 * @param caption - Caption text; blank falls back to a fixed placeholder.
 * @returns `\n\n<!-- image[: alt] -->\n\n<caption>\n\n`.
 */
function formatCaptionBlock(alt: string, caption: string): string {
  const label = alt
    .replace(/\s+/g, " ")
    // `-->` and `--!>` both end an HTML comment; a space defuses them.
    .replace(/--(!?)>/g, "--$1 >")
    .trim();
  const header = label ? `<!-- image: ${label} -->` : "<!-- image -->";
  const body = caption.trim() || "(image, no caption available)";
  return `\n\n${header}\n\n${body}\n\n`;
}
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { LlmConfig } from "../../config/schemas.ts";
|
|
1
|
+
import type { ConvertersConfig, LlmConfig } from "../../config/schemas.ts";
|
|
2
2
|
import { convertDocx } from "./docx.ts";
|
|
3
3
|
import { convertHtml } from "./html.ts";
|
|
4
4
|
import { convertImage } from "./image.ts";
|
|
@@ -44,6 +44,7 @@ export async function convert(
|
|
|
44
44
|
mimeType: string,
|
|
45
45
|
source: string,
|
|
46
46
|
llm: LlmConfig,
|
|
47
|
+
converters: ConvertersConfig,
|
|
47
48
|
): Promise<ConvertResult> {
|
|
48
49
|
const mt = mimeType.toLowerCase();
|
|
49
50
|
|
|
@@ -52,11 +53,11 @@ export async function convert(
|
|
|
52
53
|
}
|
|
53
54
|
|
|
54
55
|
if (HTML_MIMES.has(mt)) {
|
|
55
|
-
return { markdown: convertHtml(bytes), contentMimeType: "text/markdown" };
|
|
56
|
+
return { markdown: await convertHtml(bytes, llm, converters), contentMimeType: "text/markdown" };
|
|
56
57
|
}
|
|
57
58
|
|
|
58
59
|
if (DOCX_MIMES.has(mt)) {
|
|
59
|
-
return { markdown: await convertDocx(bytes), contentMimeType: "text/markdown" };
|
|
60
|
+
return { markdown: await convertDocx(bytes, llm, converters), contentMimeType: "text/markdown" };
|
|
60
61
|
}
|
|
61
62
|
|
|
62
63
|
if (XLSX_MIMES.has(mt)) {
|