membot 0.0.1 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. package/.claude/skills/membot.md +137 -0
  2. package/.cursor/rules/membot.mdc +137 -0
  3. package/README.md +131 -0
  4. package/package.json +83 -24
  5. package/patches/@huggingface%2Ftransformers@4.2.0.patch +137 -0
  6. package/scripts/apply-transformers-patch.sh +35 -0
  7. package/src/cli.ts +72 -0
  8. package/src/commands/check-update.ts +69 -0
  9. package/src/commands/mcpx.ts +112 -0
  10. package/src/commands/reindex.ts +53 -0
  11. package/src/commands/serve.ts +58 -0
  12. package/src/commands/skill.ts +131 -0
  13. package/src/commands/upgrade.ts +220 -0
  14. package/src/config/loader.ts +100 -0
  15. package/src/config/schemas.ts +39 -0
  16. package/src/constants.ts +42 -0
  17. package/src/context.ts +80 -0
  18. package/src/db/blobs.ts +53 -0
  19. package/src/db/chunks.ts +176 -0
  20. package/src/db/connection.ts +173 -0
  21. package/src/db/files.ts +325 -0
  22. package/src/db/migrations/001-init.ts +63 -0
  23. package/src/db/migrations/002-fts.ts +12 -0
  24. package/src/db/migrations.ts +45 -0
  25. package/src/errors.ts +87 -0
  26. package/src/ingest/chunker.ts +117 -0
  27. package/src/ingest/converter/docx.ts +15 -0
  28. package/src/ingest/converter/html.ts +20 -0
  29. package/src/ingest/converter/image.ts +71 -0
  30. package/src/ingest/converter/index.ts +119 -0
  31. package/src/ingest/converter/llm.ts +66 -0
  32. package/src/ingest/converter/ocr.ts +51 -0
  33. package/src/ingest/converter/pdf.ts +38 -0
  34. package/src/ingest/converter/text.ts +8 -0
  35. package/src/ingest/describer.ts +72 -0
  36. package/src/ingest/embedder.ts +98 -0
  37. package/src/ingest/fetcher.ts +280 -0
  38. package/src/ingest/ingest.ts +444 -0
  39. package/src/ingest/local-reader.ts +64 -0
  40. package/src/ingest/search-text.ts +18 -0
  41. package/src/ingest/source-resolver.ts +186 -0
  42. package/src/mcp/instructions.ts +34 -0
  43. package/src/mcp/server.ts +101 -0
  44. package/src/mount/commander.ts +174 -0
  45. package/src/mount/mcp.ts +111 -0
  46. package/src/mount/zod-to-cli.ts +158 -0
  47. package/src/operations/add.ts +69 -0
  48. package/src/operations/diff.ts +105 -0
  49. package/src/operations/index.ts +38 -0
  50. package/src/operations/info.ts +95 -0
  51. package/src/operations/list.ts +87 -0
  52. package/src/operations/move.ts +83 -0
  53. package/src/operations/prune.ts +80 -0
  54. package/src/operations/read.ts +102 -0
  55. package/src/operations/refresh.ts +72 -0
  56. package/src/operations/remove.ts +35 -0
  57. package/src/operations/search.ts +72 -0
  58. package/src/operations/tree.ts +103 -0
  59. package/src/operations/types.ts +81 -0
  60. package/src/operations/versions.ts +78 -0
  61. package/src/operations/write.ts +77 -0
  62. package/src/output/formatter.ts +68 -0
  63. package/src/output/logger.ts +114 -0
  64. package/src/output/progress.ts +78 -0
  65. package/src/output/tty.ts +91 -0
  66. package/src/refresh/runner.ts +296 -0
  67. package/src/refresh/scheduler.ts +54 -0
  68. package/src/sdk.ts +27 -0
  69. package/src/search/hybrid.ts +100 -0
  70. package/src/search/keyword.ts +62 -0
  71. package/src/search/semantic.ts +56 -0
  72. package/src/types/text-modules.d.ts +9 -0
  73. package/src/update/background.ts +73 -0
  74. package/src/update/cache.ts +40 -0
  75. package/src/update/checker.ts +117 -0
  76. package/.claude/settings.local.json +0 -7
  77. package/CLAUDE.md +0 -139
  78. package/docs/plan.md +0 -905
import { EMBEDDING_DIMENSION } from "../../constants.ts";
import type { Migration } from "../migrations.ts";

// Initial schema: content-addressed blobs, an append-only versioned `files`
// table, per-version `chunks` with embeddings, and `current_*` views that
// expose only the latest non-tombstoned version of each logical path.
export const MIGRATION_001: Migration = {
  id: 1,
  name: "init",
  statements: [
    // Raw bytes, content-addressed by sha256; files reference blobs via blob_sha256.
    `CREATE TABLE blobs (
      sha256 TEXT PRIMARY KEY,
      mime_type TEXT NOT NULL,
      size_bytes BIGINT NOT NULL,
      bytes BLOB NOT NULL,
      created_at TIMESTAMP NOT NULL DEFAULT now()
    )`,
    // Append-only versions: every write adds a new (logical_path, version_id)
    // row; deletions are recorded as tombstone = TRUE rows rather than DELETEs.
    `CREATE TABLE files (
      logical_path TEXT NOT NULL,
      version_id TIMESTAMP NOT NULL DEFAULT now(),
      tombstone BOOLEAN NOT NULL DEFAULT FALSE,
      source_type TEXT NOT NULL,
      source_path TEXT,
      source_mtime_ms BIGINT,
      source_sha256 TEXT,
      blob_sha256 TEXT,
      content_sha256 TEXT,
      content TEXT,
      description TEXT,
      mime_type TEXT,
      size_bytes BIGINT,
      fetcher TEXT,
      fetcher_server TEXT,
      fetcher_tool TEXT,
      fetcher_args JSON,
      refresh_frequency_sec INTEGER,
      refreshed_at TIMESTAMP,
      last_refresh_status TEXT,
      change_note TEXT,
      created_at TIMESTAMP NOT NULL DEFAULT now(),
      PRIMARY KEY (logical_path, version_id)
    )`,
    `CREATE INDEX files_logical_path_idx ON files (logical_path)`,
    `CREATE INDEX files_blob_sha256_idx ON files (blob_sha256)`,
    // Supports the refresh scheduler's "which files are due" scan.
    `CREATE INDEX files_refresh_due_idx ON files (refresh_frequency_sec, refreshed_at)`,
    // One row per embedded chunk of a specific file version. The embedding
    // column is a fixed-size float array sized by EMBEDDING_DIMENSION, so
    // changing that constant requires a new migration.
    `CREATE TABLE chunks (
      logical_path TEXT NOT NULL,
      version_id TIMESTAMP NOT NULL,
      chunk_index INTEGER NOT NULL,
      chunk_content TEXT NOT NULL,
      search_text TEXT NOT NULL,
      embedding FLOAT[${EMBEDDING_DIMENSION}] NOT NULL,
      PRIMARY KEY (logical_path, version_id, chunk_index)
    )`,
    `CREATE INDEX chunks_path_idx ON chunks (logical_path)`,
    // Latest version of each logical path, excluding tombstones.
    `CREATE VIEW current_files AS
      SELECT f.* FROM files f
      WHERE (f.logical_path, f.version_id) IN (
        SELECT logical_path, MAX(version_id) FROM files GROUP BY logical_path
      )
      AND f.tombstone = FALSE`,
    // Chunks belonging only to current_files rows.
    `CREATE VIEW current_chunks AS
      SELECT c.* FROM chunks c
      JOIN current_files cf USING (logical_path, version_id)`,
  ],
};
import type { Migration } from "../migrations.ts";

// Enables DuckDB's full-text-search extension for keyword search.
export const MIGRATION_002: Migration = {
  id: 2,
  name: "fts",
  statements: [
    `INSTALL fts`,
    `LOAD fts`,
    // FTS index built lazily by ingest.ts / refresh.ts after the first chunk insert,
    // because PRAGMA create_fts_index errors when the table is empty in some builds.
  ],
};
@@ -0,0 +1,45 @@
1
+ import { logger } from "../output/logger.ts";
2
+ import type { DbConnection } from "./connection.ts";
3
+ import { MIGRATION_001 } from "./migrations/001-init.ts";
4
+ import { MIGRATION_002 } from "./migrations/002-fts.ts";
5
+
6
+ /**
7
+ * One DDL/DML migration step. The id is monotonically increasing; the name
8
+ * is for logging only. Each statement runs independently so PRAGMA / INSTALL
9
+ * / LOAD calls (which DuckDB doesn't allow in multi-statement strings) work.
10
+ */
11
+ export interface Migration {
12
+ id: number;
13
+ name: string;
14
+ statements: string[];
15
+ }
16
+
17
+ const MIGRATIONS: Migration[] = [MIGRATION_001, MIGRATION_002];
18
+
19
+ /**
20
+ * Apply every unapplied migration in id order. Tracks applied ids in
21
+ * `_migrations`. Each successful run is logged via the shared logger so a
22
+ * user upgrading membot can see exactly what changed in their store.
23
+ */
24
+ export async function applyMigrations(db: DbConnection): Promise<void> {
25
+ await db.exec(`CREATE TABLE IF NOT EXISTS _migrations (
26
+ id INTEGER PRIMARY KEY,
27
+ name TEXT NOT NULL,
28
+ applied_at TIMESTAMP NOT NULL DEFAULT now()
29
+ )`);
30
+
31
+ const applied = await db.queryAll<{ id: number }>(`SELECT id FROM _migrations ORDER BY id`);
32
+ const appliedIds = new Set(applied.map((r) => Number(r.id)));
33
+
34
+ for (const migration of MIGRATIONS) {
35
+ if (appliedIds.has(migration.id)) continue;
36
+ logger.info(`migration: applying ${String(migration.id).padStart(3, "0")}-${migration.name}`);
37
+ for (const stmt of migration.statements) {
38
+ const trimmed = stmt.trim();
39
+ if (!trimmed) continue;
40
+ await db.exec(trimmed);
41
+ }
42
+ await db.queryRun(`INSERT INTO _migrations(id, name) VALUES (?1, ?2)`, migration.id, migration.name);
43
+ logger.info(`migration: applied ${String(migration.id).padStart(3, "0")}-${migration.name}`);
44
+ }
45
+ }
package/src/errors.ts ADDED
@@ -0,0 +1,87 @@
1
+ export type ErrorKind =
2
+ | "input_error"
3
+ | "not_found"
4
+ | "conflict"
5
+ | "auth_error"
6
+ | "network_error"
7
+ | "unsupported_mime"
8
+ | "partial_failure"
9
+ | "internal_error";
10
+
11
+ export interface HelpfulErrorArgs {
12
+ kind: ErrorKind;
13
+ message: string;
14
+ hint: string;
15
+ details?: unknown;
16
+ cause?: unknown;
17
+ }
18
+
19
+ /**
20
+ * The only error type allowed inside membot handlers. The mount adapters
21
+ * (commander + MCP) catch this and render `kind` + `message` + `hint`
22
+ * for both surfaces.
23
+ */
24
+ export class HelpfulError extends Error {
25
+ readonly kind: ErrorKind;
26
+ readonly hint: string;
27
+ readonly details?: unknown;
28
+ override readonly cause?: unknown;
29
+
30
+ constructor(args: HelpfulErrorArgs) {
31
+ super(args.message);
32
+ if (!args.hint?.trim()) {
33
+ throw new Error("HelpfulError requires a non-empty hint");
34
+ }
35
+ this.name = "HelpfulError";
36
+ this.kind = args.kind;
37
+ this.hint = args.hint;
38
+ this.details = args.details;
39
+ this.cause = args.cause;
40
+ }
41
+ }
42
+
43
+ export function isHelpfulError(e: unknown): e is HelpfulError {
44
+ return e instanceof HelpfulError;
45
+ }
46
+
47
+ /**
48
+ * Wrap an unknown error so callers can:
49
+ * try { ... } catch (e) { throw asHelpful(e, "while reading PDF", "Try ...", "internal_error") }
50
+ */
51
+ export function asHelpful(
52
+ cause: unknown,
53
+ context: string,
54
+ hint: string,
55
+ kind: ErrorKind = "internal_error",
56
+ ): HelpfulError {
57
+ if (cause instanceof HelpfulError) return cause;
58
+ const msg = cause instanceof Error ? cause.message : String(cause);
59
+ return new HelpfulError({
60
+ kind,
61
+ message: `${context}: ${msg}`,
62
+ hint,
63
+ cause,
64
+ });
65
+ }
66
+
67
+ /** Map an ErrorKind to a stable process exit code. */
68
+ export function mapKindToExit(kind: ErrorKind): number {
69
+ switch (kind) {
70
+ case "input_error":
71
+ return 2;
72
+ case "not_found":
73
+ return 3;
74
+ case "conflict":
75
+ return 4;
76
+ case "auth_error":
77
+ return 5;
78
+ case "network_error":
79
+ return 6;
80
+ case "unsupported_mime":
81
+ return 7;
82
+ case "partial_failure":
83
+ return 8;
84
+ default:
85
+ return 1;
86
+ }
87
+ }
import type { ChunkerConfig } from "../config/schemas.ts";
import { DEFAULTS } from "../constants.ts";

// One chunk of a document; `index` is its position in the final chunk list.
export interface Chunk {
  index: number;
  content: string;
}

// Documents shorter than this stay a single chunk — splitting tiny content
// adds no retrieval value.
const SHORT_CONTENT_THRESHOLD = 200;
// How many trailing lines of each chunk are prepended to the next one.
const DEFAULT_OVERLAP_LINES = 2;
12
+ /**
13
+ * Split text into pieces no larger than `maxChars`, preferring paragraph,
14
+ * then line, then hard-character boundaries. Used to bound chunk size for
15
+ * the embedding model's input window.
16
+ */
17
+ export function splitText(text: string, maxChars: number): string[] {
18
+ if (text.length <= maxChars) return [text];
19
+
20
+ const paragraphs = text.split(/\n\n+/);
21
+ if (paragraphs.length > 1) {
22
+ const out: string[] = [];
23
+ let buf = "";
24
+ for (const p of paragraphs) {
25
+ const candidate = buf ? `${buf}\n\n${p}` : p;
26
+ if (candidate.length <= maxChars) {
27
+ buf = candidate;
28
+ } else {
29
+ if (buf) out.push(buf);
30
+ if (p.length <= maxChars) {
31
+ buf = p;
32
+ } else {
33
+ out.push(...splitText(p, maxChars));
34
+ buf = "";
35
+ }
36
+ }
37
+ }
38
+ if (buf) out.push(buf);
39
+ return out;
40
+ }
41
+
42
+ const lines = text.split("\n");
43
+ if (lines.length > 1) {
44
+ const out: string[] = [];
45
+ let buf = "";
46
+ for (const line of lines) {
47
+ const candidate = buf ? `${buf}\n${line}` : line;
48
+ if (candidate.length <= maxChars) {
49
+ buf = candidate;
50
+ } else {
51
+ if (buf) out.push(buf);
52
+ if (line.length <= maxChars) {
53
+ buf = line;
54
+ } else {
55
+ for (let i = 0; i < line.length; i += maxChars) {
56
+ out.push(line.slice(i, i + maxChars));
57
+ }
58
+ buf = "";
59
+ }
60
+ }
61
+ }
62
+ if (buf) out.push(buf);
63
+ return out;
64
+ }
65
+
66
+ const out: string[] = [];
67
+ for (let i = 0; i < text.length; i += maxChars) {
68
+ out.push(text.slice(i, i + maxChars));
69
+ }
70
+ return out;
71
+ }
72
+
73
+ /** Re-chunk any chunks larger than `maxChars`, preserving order and reindexing. */
74
+ export function enforceMaxChunkSize(chunks: Chunk[], maxChars: number = DEFAULTS.CHUNKER_MAX_CHARS): Chunk[] {
75
+ const out: Chunk[] = [];
76
+ for (const c of chunks) {
77
+ if (c.content.length <= maxChars) {
78
+ out.push({ index: out.length, content: c.content });
79
+ continue;
80
+ }
81
+ for (const piece of splitText(c.content, maxChars)) {
82
+ out.push({ index: out.length, content: piece });
83
+ }
84
+ }
85
+ return out;
86
+ }
87
+
88
+ /**
89
+ * Add overlapping lines from the end of each chunk to the start of the
90
+ * next so retrieval still works when concepts span chunk boundaries.
91
+ */
92
+ export function addOverlapToChunks(chunks: Chunk[], overlapLines = DEFAULT_OVERLAP_LINES): Chunk[] {
93
+ if (chunks.length <= 1 || overlapLines <= 0) return chunks;
94
+ return chunks.map((c, i) => {
95
+ if (i === 0) return { ...c };
96
+ const prev = chunks[i - 1];
97
+ if (!prev) return { ...c };
98
+ const overlap = prev.content.split("\n").slice(-overlapLines).join("\n");
99
+ return { ...c, content: `${overlap}\n${c.content}` };
100
+ });
101
+ }
102
+
103
+ /**
104
+ * Deterministic chunker. Splits on paragraph/line/hard boundaries to a
105
+ * target size, then enforces a hard max-size after overlap is added. The
106
+ * LLM chunker is a separate code path opted into via config; this is the
107
+ * default and what tests rely on for stability.
108
+ */
109
+ export function chunkDeterministic(content: string, config: ChunkerConfig): Chunk[] {
110
+ if (content.length < SHORT_CONTENT_THRESHOLD) {
111
+ return [{ index: 0, content }];
112
+ }
113
+ const initial = splitText(content, config.target_chars).map((c, i) => ({ index: i, content: c }));
114
+ const sized = enforceMaxChunkSize(initial, config.max_chars);
115
+ const withOverlap = addOverlapToChunks(sized);
116
+ return enforceMaxChunkSize(withOverlap, config.max_chars);
117
+ }
import mammoth from "mammoth";
import TurndownService from "turndown";

// Shared converter instance: ATX headings, fenced code blocks, and "-" bullets
// to match the markdown style of the other converters.
const turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced", bulletListMarker: "-" });

/**
 * Convert a DOCX file to markdown. Mammoth gives us HTML; we then run that
 * through turndown to get clean markdown. Any conversion warnings are
 * silently dropped — they're typically about styles we don't preserve.
 *
 * @param bytes - Raw .docx file bytes.
 * @returns Trimmed markdown surrogate for the document.
 */
export async function convertDocx(bytes: Uint8Array): Promise<string> {
  // mammoth's buffer input expects a Node Buffer, not a bare Uint8Array.
  const buf = Buffer.from(bytes);
  const result = await mammoth.convertToHtml({ buffer: buf });
  return turndown.turndown(result.value).trim();
}
@@ -0,0 +1,20 @@
1
+ import TurndownService from "turndown";
2
+
3
+ const turndown = new TurndownService({
4
+ headingStyle: "atx",
5
+ codeBlockStyle: "fenced",
6
+ bulletListMarker: "-",
7
+ });
8
+
9
+ /**
10
+ * Convert HTML bytes to markdown using turndown. Strips script/style blocks
11
+ * before conversion so they don't leak into the chunker.
12
+ */
13
+ export function convertHtml(bytes: Uint8Array): string {
14
+ const html = new TextDecoder("utf-8").decode(bytes);
15
+ const cleaned = html
16
+ .replace(/<script[\s\S]*?<\/script>/gi, "")
17
+ .replace(/<style[\s\S]*?<\/style>/gi, "")
18
+ .replace(/<noscript[\s\S]*?<\/noscript>/gi, "");
19
+ return turndown.turndown(cleaned).trim();
20
+ }
import Anthropic from "@anthropic-ai/sdk";
import type { LlmConfig } from "../../config/schemas.ts";
import { logger } from "../../output/logger.ts";
import { ocrImage } from "./ocr.ts";

// Prompt sent alongside the image; asks for a single retrieval-friendly caption.
const VISION_PROMPT = `Describe this image as a one-paragraph caption suitable for retrieval. Focus on:
- The subject and any people / objects / diagrams visible
- Visible text content if present
- The visual style (screenshot, photograph, diagram, chart, etc.)

Output the caption only, no preamble.`;

// MIME types accepted by the vision endpoint as base64 image sources.
const VISION_MIMES = new Set(["image/png", "image/jpeg", "image/gif", "image/webp"]);

/**
 * Build the markdown surrogate for an image: an LLM-generated caption
 * (when an API key is available) folded together with any text recovered
 * by Tesseract OCR. Falls back to OCR-only or a deterministic placeholder
 * when no API key is set.
 *
 * @param bytes - Raw image bytes.
 * @param mimeType - Image MIME type (the dispatcher lowercases it first).
 * @param llm - LLM config carrying the API key and vision model name.
 * @returns Markdown text; never empty (placeholder as a last resort).
 */
export async function convertImage(bytes: Uint8Array, mimeType: string, llm: LlmConfig): Promise<string> {
  // Caption and OCR are independent, so run them concurrently.
  const captionPromise = describeImage(bytes, mimeType, llm);
  const ocrPromise = ocrImage(bytes);
  const [caption, ocrText] = await Promise.all([captionPromise, ocrPromise]);

  const sections: string[] = [];
  if (caption) sections.push(caption);
  if (ocrText) sections.push(`## Text detected via OCR\n\n${ocrText}`);
  // Deterministic placeholder so downstream chunking always has content.
  if (sections.length === 0) sections.push(`(image, ${mimeType}, no caption available)`);
  return sections.join("\n\n");
}

/**
 * Single-shot vision call asking Claude to caption an image. Returns the
 * caption text or an empty string when the API key is missing or the
 * MIME type isn't accepted by the vision endpoint. Request failures are
 * logged and degraded to "" rather than failing the ingest.
 */
async function describeImage(bytes: Uint8Array, mimeType: string, llm: LlmConfig): Promise<string> {
  if (!llm.anthropic_api_key || llm.anthropic_api_key.trim() === "") return "";
  if (!VISION_MIMES.has(mimeType)) return "";

  const client = new Anthropic({ apiKey: llm.anthropic_api_key });
  const base64 = Buffer.from(bytes).toString("base64");
  try {
    const resp = await client.messages.create({
      model: llm.vision_model,
      max_tokens: 500,
      messages: [
        {
          role: "user",
          content: [
            {
              type: "image",
              source: {
                type: "base64",
                // Safe narrowing: VISION_MIMES membership was checked above.
                media_type: mimeType as "image/png" | "image/jpeg" | "image/gif" | "image/webp",
                data: base64,
              },
            },
            { type: "text", text: VISION_PROMPT },
          ],
        },
      ],
    });
    // Keep only text blocks; any other content block types are ignored.
    const text = resp.content.flatMap((b) => (b.type === "text" ? [b.text] : [])).join("");
    return text.trim();
  } catch (err) {
    logger.warn(`vision: caption failed (${err instanceof Error ? err.message : String(err)})`);
    return "";
  }
}
import type { LlmConfig } from "../../config/schemas.ts";
import { convertDocx } from "./docx.ts";
import { convertHtml } from "./html.ts";
import { convertImage } from "./image.ts";
import { convertWithLlm } from "./llm.ts";
import { ocrImage } from "./ocr.ts";
import { convertPdf, shouldOcrPdf } from "./pdf.ts";
import { convertText } from "./text.ts";

export interface ConvertResult {
  markdown: string;
  // Always markdown: the chunker/embedder pipeline never sees another type.
  contentMimeType: "text/markdown";
}

// MIME sets checked in priority order by convert() below.
const TEXT_MIMES = new Set(["text/markdown", "text/plain", "text/x-markdown", "text/md"]);
const HTML_MIMES = new Set(["text/html", "application/xhtml+xml"]);
// Text-like but not markdown: routed through the LLM converter with a raw fallback.
const STRUCTURED_TEXT_MIMES = new Set([
  "application/json",
  "application/xml",
  "text/xml",
  "application/yaml",
  "text/yaml",
  "text/csv",
  "application/javascript",
  "application/typescript",
]);
const DOCX_MIMES = new Set(["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]);
const PDF_MIMES = new Set(["application/pdf"]);

/**
 * Convert raw bytes to a markdown surrogate via mime-dispatched native
 * libraries first, with an LLM fallback when no native converter applies
 * and an Anthropic API key is configured. Always returns markdown — even
 * for binary types — so the chunker / embedder pipeline never has to
 * branch on the source mime.
 *
 * @param bytes - Raw source bytes.
 * @param mimeType - Detected MIME type; lowercased here before dispatch.
 * @param source - Human-readable origin (used in LLM prompts only).
 * @param llm - LLM config; an empty API key disables all LLM fallbacks.
 */
export async function convert(
  bytes: Uint8Array,
  mimeType: string,
  source: string,
  llm: LlmConfig,
): Promise<ConvertResult> {
  const mt = mimeType.toLowerCase();

  // Already markdown/plain text: decode only.
  if (TEXT_MIMES.has(mt)) {
    return { markdown: convertText(bytes), contentMimeType: "text/markdown" };
  }

  if (HTML_MIMES.has(mt)) {
    return { markdown: convertHtml(bytes), contentMimeType: "text/markdown" };
  }

  if (DOCX_MIMES.has(mt)) {
    return { markdown: await convertDocx(bytes), contentMimeType: "text/markdown" };
  }

  if (PDF_MIMES.has(mt)) {
    const conversion = await convertPdf(bytes);
    if (!shouldOcrPdf(conversion)) {
      return { markdown: conversion.markdown, contentMimeType: "text/markdown" };
    }
    // Likely a scanned PDF: merge in whatever OCR recovers (currently
    // nothing — see ocrPdfBytes below).
    const ocrText = await ocrPdfBytes(bytes);
    const merged = [conversion.markdown, ocrText ? `## Text detected via OCR\n\n${ocrText}` : ""]
      .filter(Boolean)
      .join("\n\n");
    return {
      markdown: merged || `(scanned PDF, ${bytes.byteLength} bytes — no recognizable text)`,
      contentMimeType: "text/markdown",
    };
  }

  if (mt.startsWith("image/")) {
    return { markdown: await convertImage(bytes, mt, llm), contentMimeType: "text/markdown" };
  }

  if (STRUCTURED_TEXT_MIMES.has(mt)) {
    const raw = convertText(bytes);
    const md = await convertWithLlm(raw, mt, source, llm);
    // convertWithLlm returns raw/empty on failure; fall back to the raw text.
    return { markdown: md || raw, contentMimeType: "text/markdown" };
  }

  // Last resort: try LLM conversion with a base64 sample (truncated) so we
  // at least produce something for unknown binary types. Without an API
  // key we fall straight through to a deterministic placeholder.
  if (!llm.anthropic_api_key || llm.anthropic_api_key.trim() === "") {
    return {
      markdown: `(unknown binary, ${mt}, ${bytes.byteLength} bytes)`,
      contentMimeType: "text/markdown",
    };
  }
  const sample = sampleAsText(bytes, mt);
  const md = await convertWithLlm(sample, mt, source, llm);
  // `md !== sample` filters the no-API-key/failure path where the input
  // comes back unchanged.
  if (md && md.trim().length > 0 && md !== sample) {
    return { markdown: md, contentMimeType: "text/markdown" };
  }
  return { markdown: `(unknown binary, ${mt}, ${bytes.byteLength} bytes)`, contentMimeType: "text/markdown" };
}

/**
 * Render a small slice of unknown-binary bytes as a base64 sample so the
 * LLM converter has something to look at without us shipping a 50MB blob.
 */
function sampleAsText(bytes: Uint8Array, mimeType: string): string {
  const slice = bytes.slice(0, 4096);
  const b64 = Buffer.from(slice).toString("base64");
  return `Binary content of type ${mimeType}, ${bytes.byteLength} bytes total. First 4096 bytes (base64):\n\n${b64}`;
}

/**
 * Tesseract over a PDF's bytes is unhelpful (it's not an image). For a real
 * scanned-PDF OCR pipeline we'd rasterize each page first; for now this
 * function exists as a hook and returns an empty string so the dispatcher
 * still produces a usable surrogate.
 */
async function ocrPdfBytes(_bytes: Uint8Array): Promise<string> {
  return "";
}

export { ocrImage };
import Anthropic from "@anthropic-ai/sdk";
import type { LlmConfig } from "../../config/schemas.ts";
import { logger } from "../../output/logger.ts";

// Generous output budget: converted markdown can be longer than the input.
const CONVERTER_MAX_TOKENS = 16_384;

const CONVERTER_SYSTEM_PROMPT = `You normalize documents to clean, well-structured Markdown.

If the input is already clean, valid Markdown, return it verbatim with no edits.

Otherwise, convert it. The input mime_type is a hint, not a guarantee — verify the actual content. Common non-markdown formats:
- HTML — strip tags, scripts, styles, navigation/footer chrome.
- JSON / XML / YAML — render structure as readable Markdown.
- DocMD-like annotation formats — strip bracket annotations, map H1→#, H2→##, P→paragraph.

Rules for the output:
- Preserve all semantic content: headings, paragraphs, lists, tables, links, inline code, code blocks, blockquotes.
- Use ATX headings (#, ##, ###), fenced code blocks, GFM-style tables.
- Strip metadata headers/IDs (e.g. @document_id: ...).
- Output ONLY the Markdown. No preamble, no trailing commentary, no wrapping the entire output in a code fence.`;

/**
 * Last-resort converter: ship the raw text/binary preview to Claude and ask
 * for clean markdown. Returns the raw input unchanged when there's no API
 * key configured (the pipeline degrades to a less-clean surrogate rather
 * than failing the ingest). Does NOT run when the input is already known
 * to be markdown — caller should short-circuit that path.
 *
 * @param content - Raw text (or base64 binary sample) to normalize.
 * @param mimeType - Mime hint included in the prompt.
 * @param source - Human-readable origin, included in the prompt for context.
 * @param llm - Config carrying the API key and converter model name.
 * @returns Normalized markdown, or `content` unchanged on any failure.
 */
export async function convertWithLlm(
  content: string,
  mimeType: string,
  source: string,
  llm: LlmConfig,
): Promise<string> {
  if (!llm.anthropic_api_key || llm.anthropic_api_key.trim() === "") {
    return content;
  }
  const client = new Anthropic({ apiKey: llm.anthropic_api_key });
  try {
    // NOTE(review): the streaming API is used but only the final message is
    // consumed — presumably to avoid long-request timeouts; confirm intent.
    const stream = client.messages.stream({
      model: llm.converter_model,
      max_tokens: CONVERTER_MAX_TOKENS,
      system: CONVERTER_SYSTEM_PROMPT,
      messages: [
        {
          role: "user",
          content: `Convert this ${mimeType} content to Markdown. Source: ${source}\n\n${content}`,
        },
      ],
    });
    const final = await stream.finalMessage();
    // Keep only text blocks from the response.
    const text = final.content.flatMap((b) => (b.type === "text" ? [b.text] : [])).join("");
    if (!text.trim()) return content;
    return stripLeadingMarkdownFence(text);
  } catch (err) {
    logger.warn(`llm-converter: failed (${err instanceof Error ? err.message : String(err)}) — using raw input`);
    return content;
  }
}
61
+ function stripLeadingMarkdownFence(text: string): string {
62
+ const trimmed = text.trim();
63
+ const fenceMatch = trimmed.match(/^```(?:markdown|md)?\s*\n([\s\S]*?)\n```\s*$/);
64
+ if (fenceMatch?.[1]) return fenceMatch[1];
65
+ return text;
66
+ }
@@ -0,0 +1,51 @@
1
+ import { logger } from "../../output/logger.ts";
2
+
3
+ interface TesseractWorker {
4
+ recognize(input: Uint8Array | Buffer | string): Promise<{ data: { text: string } }>;
5
+ terminate(): Promise<void>;
6
+ }
7
+
8
+ let workerPromise: Promise<TesseractWorker> | null = null;
9
+
10
+ /**
11
+ * Lazily initialize a Tesseract worker for English OCR. Held as a process-
12
+ * wide singleton because spinning a worker up costs hundreds of ms.
13
+ */
14
+ async function getWorker(): Promise<TesseractWorker> {
15
+ if (!workerPromise) {
16
+ workerPromise = (async () => {
17
+ const tesseract = await import("tesseract.js");
18
+ const w = await tesseract.createWorker("eng");
19
+ return w as unknown as TesseractWorker;
20
+ })();
21
+ }
22
+ return workerPromise;
23
+ }
24
+
25
+ /**
26
+ * Run Tesseract OCR over the provided bytes (image bytes). Returns the
27
+ * recognized text. Errors are logged and turned into an empty string so
28
+ * the calling pipeline can degrade gracefully.
29
+ */
30
+ export async function ocrImage(bytes: Uint8Array): Promise<string> {
31
+ try {
32
+ const worker = await getWorker();
33
+ const result = await worker.recognize(Buffer.from(bytes));
34
+ return (result.data.text ?? "").trim();
35
+ } catch (err) {
36
+ logger.warn(`ocr: recognition failed (${err instanceof Error ? err.message : String(err)})`);
37
+ return "";
38
+ }
39
+ }
40
+
41
+ /** Tear down the singleton worker — call once at process exit if needed. */
42
+ export async function shutdownOcr(): Promise<void> {
43
+ if (!workerPromise) return;
44
+ const w = await workerPromise;
45
+ workerPromise = null;
46
+ try {
47
+ await w.terminate();
48
+ } catch {
49
+ // best effort
50
+ }
51
+ }