membot 0.5.2 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/membot.md +25 -10
- package/.cursor/rules/membot.mdc +25 -10
- package/README.md +36 -4
- package/package.json +8 -5
- package/scripts/apply-patches.sh +0 -11
- package/scripts/build-test-docx.ts +84 -0
- package/src/cli.ts +2 -2
- package/src/commands/login-page.mustache +50 -0
- package/src/commands/login.ts +83 -0
- package/src/config/schemas.ts +23 -5
- package/src/constants.ts +20 -1
- package/src/context.ts +1 -24
- package/src/db/files.ts +21 -25
- package/src/db/migrations/003-downloader-columns.ts +58 -0
- package/src/db/migrations.ts +2 -1
- package/src/ingest/converter/docx.ts +47 -5
- package/src/ingest/converter/html.ts +10 -3
- package/src/ingest/converter/image.ts +40 -3
- package/src/ingest/converter/images-inline.ts +132 -0
- package/src/ingest/converter/index.ts +13 -3
- package/src/ingest/converter/xlsx.ts +111 -0
- package/src/ingest/downloaders/browser.ts +180 -0
- package/src/ingest/downloaders/generic-web.ts +81 -0
- package/src/ingest/downloaders/github.ts +178 -0
- package/src/ingest/downloaders/google-docs.ts +56 -0
- package/src/ingest/downloaders/google-shared.ts +86 -0
- package/src/ingest/downloaders/google-sheets.ts +58 -0
- package/src/ingest/downloaders/google-slides.ts +53 -0
- package/src/ingest/downloaders/index.ts +182 -0
- package/src/ingest/downloaders/linear.ts +291 -0
- package/src/ingest/fetcher.ts +104 -129
- package/src/ingest/ingest.ts +44 -71
- package/src/mcp/instructions.ts +4 -2
- package/src/operations/add.ts +6 -4
- package/src/operations/info.ts +4 -6
- package/src/operations/move.ts +2 -3
- package/src/operations/refresh.ts +2 -4
- package/src/operations/remove.ts +23 -2
- package/src/operations/tree.ts +1 -1
- package/src/operations/types.ts +1 -1
- package/src/refresh/runner.ts +60 -115
- package/src/types/text-modules.d.ts +5 -0
- package/patches/@evantahler%2Fmcpx@0.21.4.patch +0 -51
- package/src/commands/mcpx.ts +0 -112
- package/src/ingest/agent-fetcher.ts +0 -639
package/src/context.ts
CHANGED
|
@@ -1,8 +1,7 @@
|
|
|
1
1
|
import { join } from "node:path";
|
|
2
|
-
import { McpxClient } from "@evantahler/mcpx";
|
|
3
2
|
import { loadConfig } from "./config/loader.ts";
|
|
4
3
|
import type { MembotConfig } from "./config/schemas.ts";
|
|
5
|
-
import {
|
|
4
|
+
import { FILES } from "./constants.ts";
|
|
6
5
|
import { type DbConnection, openDb } from "./db/connection.ts";
|
|
7
6
|
import { logger } from "./output/logger.ts";
|
|
8
7
|
import type { Progress } from "./output/progress.ts";
|
|
@@ -16,7 +15,6 @@ export interface AppContext {
|
|
|
16
15
|
db: DbConnection;
|
|
17
16
|
logger: typeof logger;
|
|
18
17
|
progress: Progress;
|
|
19
|
-
mcpx: McpxClient | null;
|
|
20
18
|
}
|
|
21
19
|
|
|
22
20
|
export interface BuildContextOptions {
|
|
@@ -32,7 +30,6 @@ export interface BuildContextOptions {
|
|
|
32
30
|
* - output mode (TTY/JSON/color detection — frozen for the rest of the run)
|
|
33
31
|
* - config (~/.membot/config.json with env overrides)
|
|
34
32
|
* - DuckDB connection (~/.membot/index.duckdb), running migrations on first open
|
|
35
|
-
* - mcpx client (lazy — opened on first remote fetch; null when no servers)
|
|
36
33
|
*/
|
|
37
34
|
export async function buildContext(options: BuildContextOptions = {}): Promise<AppContext> {
|
|
38
35
|
setMode(detectMode({ json: options.json, verbose: options.verbose, noColor: options.noColor }));
|
|
@@ -45,8 +42,6 @@ export async function buildContext(options: BuildContextOptions = {}): Promise<A
|
|
|
45
42
|
maxDelayMs: config.db_lock_retry.max_delay_ms,
|
|
46
43
|
});
|
|
47
44
|
|
|
48
|
-
const mcpx = await maybeMcpx(config);
|
|
49
|
-
|
|
50
45
|
return {
|
|
51
46
|
config,
|
|
52
47
|
dataDir,
|
|
@@ -54,31 +49,13 @@ export async function buildContext(options: BuildContextOptions = {}): Promise<A
|
|
|
54
49
|
db,
|
|
55
50
|
logger,
|
|
56
51
|
progress: createProgress(),
|
|
57
|
-
mcpx,
|
|
58
52
|
};
|
|
59
53
|
}
|
|
60
54
|
|
|
61
|
-
async function maybeMcpx(config: MembotConfig): Promise<McpxClient | null> {
|
|
62
|
-
const configDir = config.mcpx.config_path || process.env[ENV.MCPX_CONFIG_PATH];
|
|
63
|
-
try {
|
|
64
|
-
const client = new McpxClient(configDir ? { configDir } : {});
|
|
65
|
-
return client;
|
|
66
|
-
} catch {
|
|
67
|
-
return null;
|
|
68
|
-
}
|
|
69
|
-
}
|
|
70
|
-
|
|
71
55
|
export async function closeContext(ctx: AppContext): Promise<void> {
|
|
72
56
|
try {
|
|
73
57
|
await ctx.db.close();
|
|
74
58
|
} catch {
|
|
75
59
|
// best effort
|
|
76
60
|
}
|
|
77
|
-
if (ctx.mcpx) {
|
|
78
|
-
try {
|
|
79
|
-
await ctx.mcpx.close();
|
|
80
|
-
} catch {
|
|
81
|
-
// best effort
|
|
82
|
-
}
|
|
83
|
-
}
|
|
84
61
|
}
|
package/src/db/files.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import type { DbConnection, SqlParam } from "./connection.ts";
|
|
2
2
|
|
|
3
3
|
export type SourceType = "local" | "remote" | "inline";
|
|
4
|
-
export type FetcherKind = "
|
|
4
|
+
export type FetcherKind = "downloader" | "local" | "inline";
|
|
5
5
|
|
|
6
6
|
export interface FileRow {
|
|
7
7
|
logical_path: string;
|
|
@@ -18,9 +18,8 @@ export interface FileRow {
|
|
|
18
18
|
mime_type: string | null;
|
|
19
19
|
size_bytes: number | null;
|
|
20
20
|
fetcher: FetcherKind | null;
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
fetcher_args: Record<string, unknown> | null;
|
|
21
|
+
downloader: string | null;
|
|
22
|
+
downloader_args: Record<string, unknown> | null;
|
|
24
23
|
refresh_frequency_sec: number | null;
|
|
25
24
|
refreshed_at: string | null;
|
|
26
25
|
last_refresh_status: string | null;
|
|
@@ -43,9 +42,8 @@ export interface NewFileVersion {
|
|
|
43
42
|
mime_type?: string | null;
|
|
44
43
|
size_bytes?: number | null;
|
|
45
44
|
fetcher?: FetcherKind | null;
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
fetcher_args?: Record<string, unknown> | null;
|
|
45
|
+
downloader?: string | null;
|
|
46
|
+
downloader_args?: Record<string, unknown> | null;
|
|
49
47
|
refresh_frequency_sec?: number | null;
|
|
50
48
|
refreshed_at?: string | null;
|
|
51
49
|
last_refresh_status?: string | null;
|
|
@@ -67,9 +65,8 @@ const ROW_COLUMNS = [
|
|
|
67
65
|
"mime_type",
|
|
68
66
|
"size_bytes",
|
|
69
67
|
"fetcher",
|
|
70
|
-
"
|
|
71
|
-
"
|
|
72
|
-
"fetcher_args",
|
|
68
|
+
"downloader",
|
|
69
|
+
"downloader_args",
|
|
73
70
|
"refresh_frequency_sec",
|
|
74
71
|
"refreshed_at",
|
|
75
72
|
"last_refresh_status",
|
|
@@ -86,21 +83,21 @@ const COLUMN_LIST = ROW_COLUMNS.join(", ");
|
|
|
86
83
|
*/
|
|
87
84
|
export async function insertVersion(db: DbConnection, file: NewFileVersion): Promise<string> {
|
|
88
85
|
const versionId = file.version_id ?? millisIso(Date.now());
|
|
89
|
-
const
|
|
86
|
+
const downloaderArgsJson = file.downloader_args ? JSON.stringify(file.downloader_args) : null;
|
|
90
87
|
|
|
91
88
|
await db.queryRun(
|
|
92
89
|
`INSERT INTO files (
|
|
93
90
|
logical_path, version_id, tombstone, source_type,
|
|
94
91
|
source_path, source_mtime_ms, source_sha256, blob_sha256,
|
|
95
92
|
content_sha256, content, description, mime_type, size_bytes,
|
|
96
|
-
fetcher,
|
|
93
|
+
fetcher, downloader, downloader_args,
|
|
97
94
|
refresh_frequency_sec, refreshed_at, last_refresh_status, change_note
|
|
98
95
|
) VALUES (
|
|
99
96
|
?1, CAST(?2 AS TIMESTAMP), ?3, ?4,
|
|
100
97
|
?5, ?6, ?7, ?8,
|
|
101
98
|
?9, ?10, ?11, ?12, ?13,
|
|
102
|
-
?14, ?15, ?16,
|
|
103
|
-
?18, ?19, ?20
|
|
99
|
+
?14, ?15, ?16,
|
|
100
|
+
?17, ?18, ?19, ?20
|
|
104
101
|
)`,
|
|
105
102
|
file.logical_path,
|
|
106
103
|
versionId,
|
|
@@ -116,9 +113,8 @@ export async function insertVersion(db: DbConnection, file: NewFileVersion): Pro
|
|
|
116
113
|
file.mime_type ?? null,
|
|
117
114
|
file.size_bytes ?? null,
|
|
118
115
|
file.fetcher ?? null,
|
|
119
|
-
file.
|
|
120
|
-
|
|
121
|
-
fetcherArgsJson,
|
|
116
|
+
file.downloader ?? null,
|
|
117
|
+
downloaderArgsJson,
|
|
122
118
|
file.refresh_frequency_sec ?? null,
|
|
123
119
|
file.refreshed_at ?? null,
|
|
124
120
|
file.last_refresh_status ?? null,
|
|
@@ -132,33 +128,33 @@ export function millisIso(ms: number): string {
|
|
|
132
128
|
return new Date(ms).toISOString();
|
|
133
129
|
}
|
|
134
130
|
|
|
135
|
-
interface RawFileRow extends Omit<FileRow, "
|
|
136
|
-
|
|
131
|
+
interface RawFileRow extends Omit<FileRow, "downloader_args" | "tombstone"> {
|
|
132
|
+
downloader_args: string | null | Record<string, unknown>;
|
|
137
133
|
tombstone: boolean | number;
|
|
138
134
|
[key: string]: unknown;
|
|
139
135
|
}
|
|
140
136
|
|
|
141
137
|
/**
|
|
142
138
|
* Coerce a raw DuckDB row into a typed `FileRow`. JSON-parses the
|
|
143
|
-
* `
|
|
139
|
+
* `downloader_args` column (DuckDB returns it as text or a parsed object
|
|
144
140
|
* depending on driver version) and normalizes `tombstone` to a boolean
|
|
145
141
|
* (some drivers return 0/1).
|
|
146
142
|
*/
|
|
147
143
|
function toFileRow(row: RawFileRow | null): FileRow | null {
|
|
148
144
|
if (!row) return null;
|
|
149
145
|
let parsed: Record<string, unknown> | null = null;
|
|
150
|
-
if (row.
|
|
146
|
+
if (row.downloader_args && typeof row.downloader_args === "string") {
|
|
151
147
|
try {
|
|
152
|
-
parsed = JSON.parse(row.
|
|
148
|
+
parsed = JSON.parse(row.downloader_args);
|
|
153
149
|
} catch {
|
|
154
150
|
parsed = null;
|
|
155
151
|
}
|
|
156
|
-
} else if (row.
|
|
157
|
-
parsed = row.
|
|
152
|
+
} else if (row.downloader_args && typeof row.downloader_args === "object") {
|
|
153
|
+
parsed = row.downloader_args;
|
|
158
154
|
}
|
|
159
155
|
return {
|
|
160
156
|
...row,
|
|
161
|
-
|
|
157
|
+
downloader_args: parsed,
|
|
162
158
|
tombstone: !!row.tombstone,
|
|
163
159
|
};
|
|
164
160
|
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import type { Migration } from "../migrations.ts";
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Replace the old mcpx-era fetcher metadata triple
|
|
5
|
+
* (`fetcher_server` / `fetcher_tool` / `fetcher_args`) with a flat
|
|
6
|
+
* `(downloader, downloader_args)` shape. The mcpx-driven agent fetcher
|
|
7
|
+
* is gone; per-service downloaders match a URL → run a known fetch
|
|
8
|
+
* (Playwright export endpoints for Google, rendered HTML for GitHub /
|
|
9
|
+
* Linear, headless print-to-PDF for everything else) → return bytes
|
|
10
|
+
* for the existing native converter pipeline.
|
|
11
|
+
*
|
|
12
|
+
* Existing rows whose `fetcher` was `'http'` or `'mcpx'` are migrated
|
|
13
|
+
* to `'downloader'` with `downloader=NULL`. The mcpx-driven ones
|
|
14
|
+
* become refresh-broken (the `fetcher_*` arguments that drove them no
|
|
15
|
+
* longer exist) but their stored `content` is still readable; the
|
|
16
|
+
* plain-HTTP ones will be re-routed through the generic-web downloader
|
|
17
|
+
* the next time refresh runs. The `fetcher` enum loses both `'http'`
|
|
18
|
+
* and `'mcpx'` — every remote row is `'downloader'` now, since even
|
|
19
|
+
* the plain-HTTP fallback is wrapped by the generic-web downloader.
|
|
20
|
+
*
|
|
21
|
+
* The `current_files` view is `SELECT f.* FROM files f`, so it pins the
|
|
22
|
+
* old column shape; we drop and recreate it (and the dependent
|
|
23
|
+
* `current_chunks` view) around the schema change.
|
|
24
|
+
*/
|
|
25
|
+
export const MIGRATION_003: Migration = {
|
|
26
|
+
id: 3,
|
|
27
|
+
name: "downloader-columns",
|
|
28
|
+
statements: [
|
|
29
|
+
// DuckDB refuses DROP COLUMN when an index covers any column that
|
|
30
|
+
// comes AFTER the dropped one in the schema, so the indexes have
|
|
31
|
+
// to come down first. The view drops are needed for the same
|
|
32
|
+
// reason — `current_files` is `SELECT f.*`, which pins every
|
|
33
|
+
// column at view-creation time.
|
|
34
|
+
`DROP VIEW IF EXISTS current_chunks`,
|
|
35
|
+
`DROP VIEW IF EXISTS current_files`,
|
|
36
|
+
`DROP INDEX IF EXISTS files_refresh_due_idx`,
|
|
37
|
+
`DROP INDEX IF EXISTS files_blob_sha256_idx`,
|
|
38
|
+
`DROP INDEX IF EXISTS files_logical_path_idx`,
|
|
39
|
+
`UPDATE files SET fetcher = 'downloader' WHERE fetcher IN ('http', 'mcpx')`,
|
|
40
|
+
`ALTER TABLE files DROP COLUMN fetcher_server`,
|
|
41
|
+
`ALTER TABLE files DROP COLUMN fetcher_tool`,
|
|
42
|
+
`ALTER TABLE files DROP COLUMN fetcher_args`,
|
|
43
|
+
`ALTER TABLE files ADD COLUMN downloader TEXT`,
|
|
44
|
+
`ALTER TABLE files ADD COLUMN downloader_args JSON`,
|
|
45
|
+
`CREATE INDEX files_logical_path_idx ON files (logical_path)`,
|
|
46
|
+
`CREATE INDEX files_blob_sha256_idx ON files (blob_sha256)`,
|
|
47
|
+
`CREATE INDEX files_refresh_due_idx ON files (refresh_frequency_sec, refreshed_at)`,
|
|
48
|
+
`CREATE VIEW current_files AS
|
|
49
|
+
SELECT f.* FROM files f
|
|
50
|
+
WHERE (f.logical_path, f.version_id) IN (
|
|
51
|
+
SELECT logical_path, MAX(version_id) FROM files GROUP BY logical_path
|
|
52
|
+
)
|
|
53
|
+
AND f.tombstone = FALSE`,
|
|
54
|
+
`CREATE VIEW current_chunks AS
|
|
55
|
+
SELECT c.* FROM chunks c
|
|
56
|
+
JOIN current_files cf USING (logical_path, version_id)`,
|
|
57
|
+
],
|
|
58
|
+
};
|
package/src/db/migrations.ts
CHANGED
|
@@ -2,6 +2,7 @@ import { logger } from "../output/logger.ts";
|
|
|
2
2
|
import type { DbConnection } from "./connection.ts";
|
|
3
3
|
import { MIGRATION_001 } from "./migrations/001-init.ts";
|
|
4
4
|
import { MIGRATION_002 } from "./migrations/002-fts.ts";
|
|
5
|
+
import { MIGRATION_003 } from "./migrations/003-downloader-columns.ts";
|
|
5
6
|
|
|
6
7
|
/**
|
|
7
8
|
* One DDL/DML migration step. The id is monotonically increasing; the name
|
|
@@ -14,7 +15,7 @@ export interface Migration {
|
|
|
14
15
|
statements: string[];
|
|
15
16
|
}
|
|
16
17
|
|
|
17
|
-
const MIGRATIONS: Migration[] = [MIGRATION_001, MIGRATION_002];
|
|
18
|
+
const MIGRATIONS: Migration[] = [MIGRATION_001, MIGRATION_002, MIGRATION_003];
|
|
18
19
|
|
|
19
20
|
/**
|
|
20
21
|
* Process-level cache of paths whose migrations have been applied (or
|
|
@@ -1,15 +1,57 @@
|
|
|
1
1
|
import mammoth from "mammoth";
|
|
2
2
|
import TurndownService from "turndown";
|
|
3
|
+
import type { ConvertersConfig, LlmConfig } from "../../config/schemas.ts";
|
|
4
|
+
import { type CapturedImage, inlineImageCaptions, MEMBOT_IMG_PREFIX } from "./images-inline.ts";
|
|
3
5
|
|
|
4
6
|
const turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced", bulletListMarker: "-" });
|
|
5
7
|
|
|
8
|
+
/**
|
|
9
|
+
* Mammoth's image element wears an `altText` field that isn't reflected in
|
|
10
|
+
* the published `.d.ts`. We declare the bits we actually touch so the rest
|
|
11
|
+
* of the module can stay strict-typed.
|
|
12
|
+
*/
|
|
13
|
+
interface MammothImage {
|
|
14
|
+
contentType: string;
|
|
15
|
+
altText?: string;
|
|
16
|
+
readAsBuffer: () => Promise<Buffer>;
|
|
17
|
+
}
|
|
18
|
+
|
|
6
19
|
/**
|
|
7
20
|
* Convert a DOCX file to markdown. Mammoth gives us HTML; we then run that
|
|
8
|
-
* through turndown to get clean markdown.
|
|
9
|
-
*
|
|
21
|
+
* through turndown to get clean markdown. Embedded images (which mammoth
|
|
22
|
+
* would otherwise inline as 5MB base64 `data:` URIs) are intercepted and
|
|
23
|
+
* replaced with `membot-img://<id>` placeholders, then expanded into Claude
|
|
24
|
+
* vision captions by `inlineImageCaptions`. Conversion warnings from
|
|
25
|
+
* mammoth are silently dropped — they're typically about styles we don't
|
|
26
|
+
* preserve.
|
|
10
27
|
*/
|
|
11
|
-
export async function convertDocx(bytes: Uint8Array): Promise<string> {
|
|
28
|
+
export async function convertDocx(bytes: Uint8Array, llm: LlmConfig, converters: ConvertersConfig): Promise<string> {
|
|
12
29
|
const buf = Buffer.from(bytes);
|
|
13
|
-
const
|
|
14
|
-
|
|
30
|
+
const images = new Map<string, CapturedImage>();
|
|
31
|
+
let counter = 0;
|
|
32
|
+
|
|
33
|
+
const result = await mammoth.convertToHtml(
|
|
34
|
+
{ buffer: buf },
|
|
35
|
+
{
|
|
36
|
+
convertImage: mammoth.images.imgElement(async (image) => {
|
|
37
|
+
const img = image as unknown as MammothImage;
|
|
38
|
+
const id = `img-${counter++}`;
|
|
39
|
+
try {
|
|
40
|
+
const buffer = await img.readAsBuffer();
|
|
41
|
+
images.set(id, {
|
|
42
|
+
bytes: new Uint8Array(buffer),
|
|
43
|
+
mimeType: img.contentType,
|
|
44
|
+
altText: img.altText,
|
|
45
|
+
});
|
|
46
|
+
} catch {
|
|
47
|
+
// If we can't read the image bytes, still emit the placeholder so
|
|
48
|
+
// turndown doesn't fall back to a giant inline data URI.
|
|
49
|
+
}
|
|
50
|
+
return { src: `${MEMBOT_IMG_PREFIX}${id}` };
|
|
51
|
+
}),
|
|
52
|
+
},
|
|
53
|
+
);
|
|
54
|
+
|
|
55
|
+
const md = turndown.turndown(result.value).trim();
|
|
56
|
+
return inlineImageCaptions(md, images, llm, converters);
|
|
15
57
|
}
|
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
import TurndownService from "turndown";
|
|
2
|
+
import type { ConvertersConfig, LlmConfig } from "../../config/schemas.ts";
|
|
3
|
+
import { extractDataUriImages, inlineImageCaptions } from "./images-inline.ts";
|
|
2
4
|
|
|
3
5
|
const turndown = new TurndownService({
|
|
4
6
|
headingStyle: "atx",
|
|
@@ -8,13 +10,18 @@ const turndown = new TurndownService({
|
|
|
8
10
|
|
|
9
11
|
/**
|
|
10
12
|
* Convert HTML bytes to markdown using turndown. Strips script/style blocks
|
|
11
|
-
* before conversion so they don't leak into the chunker.
|
|
13
|
+
* before conversion so they don't leak into the chunker. Inline data-URI
|
|
14
|
+
* images are extracted into their bytes and replaced with vision captions
|
|
15
|
+
* via `inlineImageCaptions`; external `<img src="https://…">` references
|
|
16
|
+
* are left for turndown to render normally.
|
|
12
17
|
*/
|
|
13
|
-
export function convertHtml(bytes: Uint8Array): string {
|
|
18
|
+
export async function convertHtml(bytes: Uint8Array, llm: LlmConfig, converters: ConvertersConfig): Promise<string> {
|
|
14
19
|
const html = new TextDecoder("utf-8").decode(bytes);
|
|
15
20
|
const cleaned = html
|
|
16
21
|
.replace(/<script[\s\S]*?<\/script>/gi, "")
|
|
17
22
|
.replace(/<style[\s\S]*?<\/style>/gi, "")
|
|
18
23
|
.replace(/<noscript[\s\S]*?<\/noscript>/gi, "");
|
|
19
|
-
|
|
24
|
+
const { html: rewritten, images } = extractDataUriImages(cleaned);
|
|
25
|
+
const md = turndown.turndown(rewritten).trim();
|
|
26
|
+
return inlineImageCaptions(md, images, llm, converters);
|
|
20
27
|
}
|
|
@@ -12,6 +12,13 @@ Output the caption only, no preamble.`;
|
|
|
12
12
|
|
|
13
13
|
const VISION_MIMES = new Set(["image/png", "image/jpeg", "image/gif", "image/webp"]);
|
|
14
14
|
|
|
15
|
+
/** Anthropic vision rejects images > 5MB; stay under that with margin. */
|
|
16
|
+
const VISION_MAX_BYTES = 4 * 1024 * 1024;
|
|
17
|
+
/** Tesseract is roughly linear in pixel count; bail past this byte size to avoid pathological hangs. */
|
|
18
|
+
const OCR_MAX_BYTES = 8 * 1024 * 1024;
|
|
19
|
+
/** Hard wall-clock for either subtask so a stuck network call never freezes ingest. */
|
|
20
|
+
const SUBTASK_TIMEOUT_MS = 60_000;
|
|
21
|
+
|
|
15
22
|
/**
|
|
16
23
|
* Build the markdown surrogate for an image: an LLM-generated caption
|
|
17
24
|
* (when an API key is available) folded together with any text recovered
|
|
@@ -19,17 +26,47 @@ const VISION_MIMES = new Set(["image/png", "image/jpeg", "image/gif", "image/web
|
|
|
19
26
|
* when no API key is set.
|
|
20
27
|
*/
|
|
21
28
|
export async function convertImage(bytes: Uint8Array, mimeType: string, llm: LlmConfig): Promise<string> {
|
|
22
|
-
const captionPromise =
|
|
23
|
-
|
|
29
|
+
const captionPromise =
|
|
30
|
+
bytes.byteLength <= VISION_MAX_BYTES
|
|
31
|
+
? withTimeout(describeImage(bytes, mimeType, llm), SUBTASK_TIMEOUT_MS, "vision")
|
|
32
|
+
: Promise.resolve("");
|
|
33
|
+
const ocrPromise =
|
|
34
|
+
bytes.byteLength <= OCR_MAX_BYTES ? withTimeout(ocrImage(bytes), SUBTASK_TIMEOUT_MS, "ocr") : Promise.resolve("");
|
|
24
35
|
const [caption, ocrText] = await Promise.all([captionPromise, ocrPromise]);
|
|
25
36
|
|
|
26
37
|
const sections: string[] = [];
|
|
27
38
|
if (caption) sections.push(caption);
|
|
28
39
|
if (ocrText) sections.push(`## Text detected via OCR\n\n${ocrText}`);
|
|
29
|
-
if (sections.length === 0)
|
|
40
|
+
if (sections.length === 0) {
|
|
41
|
+
const note =
|
|
42
|
+
bytes.byteLength > VISION_MAX_BYTES
|
|
43
|
+
? `(image, ${mimeType}, ${bytes.byteLength} bytes — exceeds vision size limit, no caption available)`
|
|
44
|
+
: `(image, ${mimeType}, no caption available)`;
|
|
45
|
+
sections.push(note);
|
|
46
|
+
}
|
|
30
47
|
return sections.join("\n\n");
|
|
31
48
|
}
|
|
32
49
|
|
|
50
|
+
/**
|
|
51
|
+
* Race a promise against a timer so a stuck network call (vision) or a
|
|
52
|
+
* pathological CPU-bound job (OCR on a multi-megapixel image) never freezes
|
|
53
|
+
* the whole conversion pipeline. Logs a warning when the timer wins.
|
|
54
|
+
*/
|
|
55
|
+
async function withTimeout<T extends string>(p: Promise<T>, ms: number, label: string): Promise<T | ""> {
|
|
56
|
+
let timer: ReturnType<typeof setTimeout> | undefined;
|
|
57
|
+
const timeout = new Promise<"">((resolve) => {
|
|
58
|
+
timer = setTimeout(() => {
|
|
59
|
+
logger.warn(`image: ${label} timed out after ${ms}ms`);
|
|
60
|
+
resolve("");
|
|
61
|
+
}, ms);
|
|
62
|
+
});
|
|
63
|
+
try {
|
|
64
|
+
return await Promise.race([p, timeout]);
|
|
65
|
+
} finally {
|
|
66
|
+
if (timer) clearTimeout(timer);
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
|
|
33
70
|
/**
|
|
34
71
|
* Single-shot vision call asking Claude to caption an image. Returns the
|
|
35
72
|
* caption text or an empty string when the API key is missing or the
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
import type { ConvertersConfig, LlmConfig } from "../../config/schemas.ts";
|
|
2
|
+
import { logger } from "../../output/logger.ts";
|
|
3
|
+
import { convertImage } from "./image.ts";
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Bytes captured from an embedded image during DOCX/HTML conversion. The
|
|
7
|
+
* image-inlining helpers run `convertImage` over each one to produce a
|
|
8
|
+
* markdown caption that gets spliced back into the document body in place
|
|
9
|
+
* of the original `<img>` reference.
|
|
10
|
+
*/
|
|
11
|
+
export interface CapturedImage {
|
|
12
|
+
bytes: Uint8Array;
|
|
13
|
+
mimeType: string;
|
|
14
|
+
altText?: string;
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
/** URI scheme used to mark images that the inliner should expand. */
|
|
18
|
+
export const MEMBOT_IMG_PREFIX = "membot-img://";
|
|
19
|
+
|
|
20
|
+
/**
|
|
21
|
+
* Match `![alt](membot-img://<id>)` markdown image references. The id may
|
|
22
|
+
* contain any non-whitespace, non-`)` character so we don't accidentally
|
|
23
|
+
* stop at characters mammoth/turndown might emit inside an id.
|
|
24
|
+
*/
|
|
25
|
+
const TOKEN_RE = /!\[([^\]]*)\]\(membot-img:\/\/([^)\s]+)\)/g;
|
|
26
|
+
|
|
27
|
+
/**
|
|
28
|
+
* Extract data-URI images from raw HTML and rewrite each `<img src="data:…">`
|
|
29
|
+
* to `<img src="membot-img://<id>">`. The captured bytes flow through the
|
|
30
|
+
* shared `inlineImageCaptions` step so HTML and DOCX share one captioning
|
|
31
|
+
* code path. Non-data `<img>` references are left untouched.
|
|
32
|
+
*/
|
|
33
|
+
export function extractDataUriImages(html: string): { html: string; images: Map<string, CapturedImage> } {
|
|
34
|
+
const images = new Map<string, CapturedImage>();
|
|
35
|
+
let counter = 0;
|
|
36
|
+
const rewritten = html.replace(
|
|
37
|
+
/<img\b([^>]*?)\bsrc\s*=\s*(?:"data:([^";]+);base64,([^"]*)"|'data:([^';]+);base64,([^']*)')([^>]*)>/gi,
|
|
38
|
+
(
|
|
39
|
+
_match,
|
|
40
|
+
beforeSrc: string,
|
|
41
|
+
mimeDouble: string | undefined,
|
|
42
|
+
b64Double: string | undefined,
|
|
43
|
+
mimeSingle: string | undefined,
|
|
44
|
+
b64Single: string | undefined,
|
|
45
|
+
afterSrc: string,
|
|
46
|
+
) => {
|
|
47
|
+
const mimeType = (mimeDouble ?? mimeSingle ?? "image/png").trim();
|
|
48
|
+
const b64 = (b64Double ?? b64Single ?? "").replace(/\s+/g, "");
|
|
49
|
+
const id = `img-${counter++}`;
|
|
50
|
+
try {
|
|
51
|
+
const bytes = new Uint8Array(Buffer.from(b64, "base64"));
|
|
52
|
+
images.set(id, { bytes, mimeType });
|
|
53
|
+
} catch (err) {
|
|
54
|
+
logger.warn(
|
|
55
|
+
`images-inline: failed to decode embedded image (${err instanceof Error ? err.message : String(err)})`,
|
|
56
|
+
);
|
|
57
|
+
return `<img${beforeSrc} src=""${afterSrc}>`;
|
|
58
|
+
}
|
|
59
|
+
return `<img${beforeSrc} src="${MEMBOT_IMG_PREFIX}${id}"${afterSrc}>`;
|
|
60
|
+
},
|
|
61
|
+
);
|
|
62
|
+
return { html: rewritten, images };
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
/**
|
|
66
|
+
* Replace each `![alt](membot-img://<id>)` token in `markdown` with the
|
|
67
|
+
* caption produced by `convertImage`. Captures are processed in document
|
|
68
|
+
* order; once `max_inline_image_captions` (from `ConvertersConfig`) has been
|
|
69
|
+
* reached, the remaining tokens get a tiny deterministic placeholder rather
|
|
70
|
+
* than an LLM call so a doc full of embedded images doesn't fan out into
|
|
71
|
+
* hundreds of vision requests.
|
|
72
|
+
*
|
|
73
|
+
* No-ops on a markdown string with no `membot-img://` references; safe to
|
|
74
|
+
* call unconditionally from the converters.
|
|
75
|
+
*/
|
|
76
|
+
export async function inlineImageCaptions(
|
|
77
|
+
markdown: string,
|
|
78
|
+
images: Map<string, CapturedImage>,
|
|
79
|
+
llm: LlmConfig,
|
|
80
|
+
converters: ConvertersConfig,
|
|
81
|
+
): Promise<string> {
|
|
82
|
+
if (images.size === 0) return markdown;
|
|
83
|
+
|
|
84
|
+
const captions = new Map<string, string>();
|
|
85
|
+
const overflow = new Set<string>();
|
|
86
|
+
let captioned = 0;
|
|
87
|
+
|
|
88
|
+
for (const match of markdown.matchAll(TOKEN_RE)) {
|
|
89
|
+
const alt = match[1] ?? "";
|
|
90
|
+
const id = match[2];
|
|
91
|
+
if (!id || captions.has(id) || overflow.has(id)) continue;
|
|
92
|
+
const img = images.get(id);
|
|
93
|
+
if (!img) continue;
|
|
94
|
+
|
|
95
|
+
if (captioned >= converters.max_inline_image_captions) {
|
|
96
|
+
overflow.add(id);
|
|
97
|
+
continue;
|
|
98
|
+
}
|
|
99
|
+
captioned++;
|
|
100
|
+
try {
|
|
101
|
+
const caption = await convertImage(img.bytes, img.mimeType, llm);
|
|
102
|
+
captions.set(id, formatCaptionBlock(alt || img.altText || "", caption));
|
|
103
|
+
} catch (err) {
|
|
104
|
+
logger.warn(`images-inline: caption failed for ${id} (${err instanceof Error ? err.message : String(err)})`);
|
|
105
|
+
captions.set(id, formatCaptionBlock(alt || img.altText || "", `(image, ${img.mimeType}, no caption available)`));
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
return markdown.replace(TOKEN_RE, (_match, alt: string, id: string) => {
|
|
110
|
+
const cached = captions.get(id);
|
|
111
|
+
if (cached) return cached;
|
|
112
|
+
const img = images.get(id);
|
|
113
|
+
if (!img) return formatCaptionBlock(alt, "(image, no caption available)");
|
|
114
|
+
return formatCaptionBlock(
|
|
115
|
+
alt || img.altText || "",
|
|
116
|
+
`(image, ${img.mimeType}, ${img.bytes.byteLength} bytes — caption skipped, exceeded max_inline_image_captions)`,
|
|
117
|
+
);
|
|
118
|
+
});
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
/**
|
|
122
|
+
* Render a captioned image as its own markdown paragraph block. Wrapping the
|
|
123
|
+
* caption in blank lines guarantees the deterministic chunker sees it as a
|
|
124
|
+
* paragraph boundary; an HTML comment with the alt text keeps the original
|
|
125
|
+
* positional cue without polluting search snippets.
|
|
126
|
+
*/
|
|
127
|
+
function formatCaptionBlock(alt: string, caption: string): string {
|
|
128
|
+
const trimmed = caption.trim();
|
|
129
|
+
const header = alt.trim() ? `<!-- image: ${alt.trim()} -->` : `<!-- image -->`;
|
|
130
|
+
const body = trimmed.length > 0 ? trimmed : "(image, no caption available)";
|
|
131
|
+
return `\n\n${header}\n\n${body}\n\n`;
|
|
132
|
+
}
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { LlmConfig } from "../../config/schemas.ts";
|
|
1
|
+
import type { ConvertersConfig, LlmConfig } from "../../config/schemas.ts";
|
|
2
2
|
import { convertDocx } from "./docx.ts";
|
|
3
3
|
import { convertHtml } from "./html.ts";
|
|
4
4
|
import { convertImage } from "./image.ts";
|
|
@@ -6,6 +6,7 @@ import { convertWithLlm } from "./llm.ts";
|
|
|
6
6
|
import { ocrImage } from "./ocr.ts";
|
|
7
7
|
import { convertPdf, shouldOcrPdf } from "./pdf.ts";
|
|
8
8
|
import { convertText } from "./text.ts";
|
|
9
|
+
import { convertXlsx } from "./xlsx.ts";
|
|
9
10
|
|
|
10
11
|
export interface ConvertResult {
|
|
11
12
|
markdown: string;
|
|
@@ -25,6 +26,10 @@ const STRUCTURED_TEXT_MIMES = new Set([
|
|
|
25
26
|
"application/typescript",
|
|
26
27
|
]);
|
|
27
28
|
const DOCX_MIMES = new Set(["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]);
|
|
29
|
+
const XLSX_MIMES = new Set([
|
|
30
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
31
|
+
"application/vnd.ms-excel",
|
|
32
|
+
]);
|
|
28
33
|
const PDF_MIMES = new Set(["application/pdf"]);
|
|
29
34
|
|
|
30
35
|
/**
|
|
@@ -39,6 +44,7 @@ export async function convert(
|
|
|
39
44
|
mimeType: string,
|
|
40
45
|
source: string,
|
|
41
46
|
llm: LlmConfig,
|
|
47
|
+
converters: ConvertersConfig,
|
|
42
48
|
): Promise<ConvertResult> {
|
|
43
49
|
const mt = mimeType.toLowerCase();
|
|
44
50
|
|
|
@@ -47,11 +53,15 @@ export async function convert(
|
|
|
47
53
|
}
|
|
48
54
|
|
|
49
55
|
if (HTML_MIMES.has(mt)) {
|
|
50
|
-
return { markdown: convertHtml(bytes), contentMimeType: "text/markdown" };
|
|
56
|
+
return { markdown: await convertHtml(bytes, llm, converters), contentMimeType: "text/markdown" };
|
|
51
57
|
}
|
|
52
58
|
|
|
53
59
|
if (DOCX_MIMES.has(mt)) {
|
|
54
|
-
return { markdown: await convertDocx(bytes), contentMimeType: "text/markdown" };
|
|
60
|
+
return { markdown: await convertDocx(bytes, llm, converters), contentMimeType: "text/markdown" };
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
if (XLSX_MIMES.has(mt)) {
|
|
64
|
+
return { markdown: await convertXlsx(bytes), contentMimeType: "text/markdown" };
|
|
55
65
|
}
|
|
56
66
|
|
|
57
67
|
if (PDF_MIMES.has(mt)) {
|