membot 0.0.1 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. package/.claude/skills/membot.md +137 -0
  2. package/.cursor/rules/membot.mdc +137 -0
  3. package/README.md +131 -0
  4. package/package.json +83 -24
  5. package/patches/@huggingface%2Ftransformers@4.2.0.patch +137 -0
  6. package/scripts/apply-transformers-patch.sh +35 -0
  7. package/src/cli.ts +72 -0
  8. package/src/commands/check-update.ts +69 -0
  9. package/src/commands/mcpx.ts +112 -0
  10. package/src/commands/reindex.ts +53 -0
  11. package/src/commands/serve.ts +58 -0
  12. package/src/commands/skill.ts +131 -0
  13. package/src/commands/upgrade.ts +220 -0
  14. package/src/config/loader.ts +100 -0
  15. package/src/config/schemas.ts +39 -0
  16. package/src/constants.ts +42 -0
  17. package/src/context.ts +80 -0
  18. package/src/db/blobs.ts +53 -0
  19. package/src/db/chunks.ts +176 -0
  20. package/src/db/connection.ts +173 -0
  21. package/src/db/files.ts +325 -0
  22. package/src/db/migrations/001-init.ts +63 -0
  23. package/src/db/migrations/002-fts.ts +12 -0
  24. package/src/db/migrations.ts +45 -0
  25. package/src/errors.ts +87 -0
  26. package/src/ingest/chunker.ts +117 -0
  27. package/src/ingest/converter/docx.ts +15 -0
  28. package/src/ingest/converter/html.ts +20 -0
  29. package/src/ingest/converter/image.ts +71 -0
  30. package/src/ingest/converter/index.ts +119 -0
  31. package/src/ingest/converter/llm.ts +66 -0
  32. package/src/ingest/converter/ocr.ts +51 -0
  33. package/src/ingest/converter/pdf.ts +38 -0
  34. package/src/ingest/converter/text.ts +8 -0
  35. package/src/ingest/describer.ts +72 -0
  36. package/src/ingest/embedder.ts +98 -0
  37. package/src/ingest/fetcher.ts +280 -0
  38. package/src/ingest/ingest.ts +444 -0
  39. package/src/ingest/local-reader.ts +64 -0
  40. package/src/ingest/search-text.ts +18 -0
  41. package/src/ingest/source-resolver.ts +186 -0
  42. package/src/mcp/instructions.ts +34 -0
  43. package/src/mcp/server.ts +101 -0
  44. package/src/mount/commander.ts +174 -0
  45. package/src/mount/mcp.ts +111 -0
  46. package/src/mount/zod-to-cli.ts +158 -0
  47. package/src/operations/add.ts +69 -0
  48. package/src/operations/diff.ts +105 -0
  49. package/src/operations/index.ts +38 -0
  50. package/src/operations/info.ts +95 -0
  51. package/src/operations/list.ts +87 -0
  52. package/src/operations/move.ts +83 -0
  53. package/src/operations/prune.ts +80 -0
  54. package/src/operations/read.ts +102 -0
  55. package/src/operations/refresh.ts +72 -0
  56. package/src/operations/remove.ts +35 -0
  57. package/src/operations/search.ts +72 -0
  58. package/src/operations/tree.ts +103 -0
  59. package/src/operations/types.ts +81 -0
  60. package/src/operations/versions.ts +78 -0
  61. package/src/operations/write.ts +77 -0
  62. package/src/output/formatter.ts +68 -0
  63. package/src/output/logger.ts +114 -0
  64. package/src/output/progress.ts +78 -0
  65. package/src/output/tty.ts +91 -0
  66. package/src/refresh/runner.ts +296 -0
  67. package/src/refresh/scheduler.ts +54 -0
  68. package/src/sdk.ts +27 -0
  69. package/src/search/hybrid.ts +100 -0
  70. package/src/search/keyword.ts +62 -0
  71. package/src/search/semantic.ts +56 -0
  72. package/src/types/text-modules.d.ts +9 -0
  73. package/src/update/background.ts +73 -0
  74. package/src/update/cache.ts +40 -0
  75. package/src/update/checker.ts +117 -0
  76. package/.claude/settings.local.json +0 -7
  77. package/CLAUDE.md +0 -139
  78. package/docs/plan.md +0 -905
import { EMBEDDING_DIMENSION } from "../../constants.ts";
import type { Migration } from "../migrations.ts";

// Initial schema: content-addressed blobs, an append-only versioned `files`
// table, per-version `chunks` with embeddings, and `current_*` views that
// expose only the latest non-tombstoned version of each logical path.
export const MIGRATION_001: Migration = {
  id: 1,
  name: "init",
  statements: [
    // Raw bytes, content-addressed by sha256; files reference blobs via blob_sha256.
    `CREATE TABLE blobs (
      sha256 TEXT PRIMARY KEY,
      mime_type TEXT NOT NULL,
      size_bytes BIGINT NOT NULL,
      bytes BLOB NOT NULL,
      created_at TIMESTAMP NOT NULL DEFAULT now()
    )`,
    // Append-only versions: every write adds a new (logical_path, version_id)
    // row; deletions are recorded as tombstone = TRUE rows rather than DELETEs.
    `CREATE TABLE files (
      logical_path TEXT NOT NULL,
      version_id TIMESTAMP NOT NULL DEFAULT now(),
      tombstone BOOLEAN NOT NULL DEFAULT FALSE,
      source_type TEXT NOT NULL,
      source_path TEXT,
      source_mtime_ms BIGINT,
      source_sha256 TEXT,
      blob_sha256 TEXT,
      content_sha256 TEXT,
      content TEXT,
      description TEXT,
      mime_type TEXT,
      size_bytes BIGINT,
      fetcher TEXT,
      fetcher_server TEXT,
      fetcher_tool TEXT,
      fetcher_args JSON,
      refresh_frequency_sec INTEGER,
      refreshed_at TIMESTAMP,
      last_refresh_status TEXT,
      change_note TEXT,
      created_at TIMESTAMP NOT NULL DEFAULT now(),
      PRIMARY KEY (logical_path, version_id)
    )`,
    `CREATE INDEX files_logical_path_idx ON files (logical_path)`,
    `CREATE INDEX files_blob_sha256_idx ON files (blob_sha256)`,
    // Supports the refresh scheduler's "which files are due" scan.
    `CREATE INDEX files_refresh_due_idx ON files (refresh_frequency_sec, refreshed_at)`,
    // One row per embedded chunk of a specific file version. The embedding
    // column is a fixed-size float array sized by EMBEDDING_DIMENSION, so
    // changing that constant requires a new migration.
    `CREATE TABLE chunks (
      logical_path TEXT NOT NULL,
      version_id TIMESTAMP NOT NULL,
      chunk_index INTEGER NOT NULL,
      chunk_content TEXT NOT NULL,
      search_text TEXT NOT NULL,
      embedding FLOAT[${EMBEDDING_DIMENSION}] NOT NULL,
      PRIMARY KEY (logical_path, version_id, chunk_index)
    )`,
    `CREATE INDEX chunks_path_idx ON chunks (logical_path)`,
    // Latest version of each logical path, excluding tombstones.
    `CREATE VIEW current_files AS
      SELECT f.* FROM files f
      WHERE (f.logical_path, f.version_id) IN (
        SELECT logical_path, MAX(version_id) FROM files GROUP BY logical_path
      )
      AND f.tombstone = FALSE`,
    // Chunks belonging only to current_files rows.
    `CREATE VIEW current_chunks AS
      SELECT c.* FROM chunks c
      JOIN current_files cf USING (logical_path, version_id)`,
  ],
};
import type { Migration } from "../migrations.ts";

// Enables DuckDB's full-text-search extension for keyword search.
export const MIGRATION_002: Migration = {
  id: 2,
  name: "fts",
  statements: [
    `INSTALL fts`,
    `LOAD fts`,
    // FTS index built lazily by ingest.ts / refresh.ts after the first chunk insert,
    // because PRAGMA create_fts_index errors when the table is empty in some builds.
  ],
};
@@ -0,0 +1,45 @@
1
+ import { logger } from "../output/logger.ts";
2
+ import type { DbConnection } from "./connection.ts";
3
+ import { MIGRATION_001 } from "./migrations/001-init.ts";
4
+ import { MIGRATION_002 } from "./migrations/002-fts.ts";
5
+
6
+ /**
7
+ * One DDL/DML migration step. The id is monotonically increasing; the name
8
+ * is for logging only. Each statement runs independently so PRAGMA / INSTALL
9
+ * / LOAD calls (which DuckDB doesn't allow in multi-statement strings) work.
10
+ */
11
+ export interface Migration {
12
+ id: number;
13
+ name: string;
14
+ statements: string[];
15
+ }
16
+
17
+ const MIGRATIONS: Migration[] = [MIGRATION_001, MIGRATION_002];
18
+
19
+ /**
20
+ * Apply every unapplied migration in id order. Tracks applied ids in
21
+ * `_migrations`. Each successful run is logged via the shared logger so a
22
+ * user upgrading membot can see exactly what changed in their store.
23
+ */
24
+ export async function applyMigrations(db: DbConnection): Promise<void> {
25
+ await db.exec(`CREATE TABLE IF NOT EXISTS _migrations (
26
+ id INTEGER PRIMARY KEY,
27
+ name TEXT NOT NULL,
28
+ applied_at TIMESTAMP NOT NULL DEFAULT now()
29
+ )`);
30
+
31
+ const applied = await db.queryAll<{ id: number }>(`SELECT id FROM _migrations ORDER BY id`);
32
+ const appliedIds = new Set(applied.map((r) => Number(r.id)));
33
+
34
+ for (const migration of MIGRATIONS) {
35
+ if (appliedIds.has(migration.id)) continue;
36
+ logger.info(`migration: applying ${String(migration.id).padStart(3, "0")}-${migration.name}`);
37
+ for (const stmt of migration.statements) {
38
+ const trimmed = stmt.trim();
39
+ if (!trimmed) continue;
40
+ await db.exec(trimmed);
41
+ }
42
+ await db.queryRun(`INSERT INTO _migrations(id, name) VALUES (?1, ?2)`, migration.id, migration.name);
43
+ logger.info(`migration: applied ${String(migration.id).padStart(3, "0")}-${migration.name}`);
44
+ }
45
+ }
package/src/errors.ts ADDED
@@ -0,0 +1,87 @@
1
+ export type ErrorKind =
2
+ | "input_error"
3
+ | "not_found"
4
+ | "conflict"
5
+ | "auth_error"
6
+ | "network_error"
7
+ | "unsupported_mime"
8
+ | "partial_failure"
9
+ | "internal_error";
10
+
11
+ export interface HelpfulErrorArgs {
12
+ kind: ErrorKind;
13
+ message: string;
14
+ hint: string;
15
+ details?: unknown;
16
+ cause?: unknown;
17
+ }
18
+
19
+ /**
20
+ * The only error type allowed inside membot handlers. The mount adapters
21
+ * (commander + MCP) catch this and render `kind` + `message` + `hint`
22
+ * for both surfaces.
23
+ */
24
+ export class HelpfulError extends Error {
25
+ readonly kind: ErrorKind;
26
+ readonly hint: string;
27
+ readonly details?: unknown;
28
+ override readonly cause?: unknown;
29
+
30
+ constructor(args: HelpfulErrorArgs) {
31
+ super(args.message);
32
+ if (!args.hint?.trim()) {
33
+ throw new Error("HelpfulError requires a non-empty hint");
34
+ }
35
+ this.name = "HelpfulError";
36
+ this.kind = args.kind;
37
+ this.hint = args.hint;
38
+ this.details = args.details;
39
+ this.cause = args.cause;
40
+ }
41
+ }
42
+
43
+ export function isHelpfulError(e: unknown): e is HelpfulError {
44
+ return e instanceof HelpfulError;
45
+ }
46
+
47
+ /**
48
+ * Wrap an unknown error so callers can:
49
+ * try { ... } catch (e) { throw asHelpful(e, "while reading PDF", "Try ...", "internal_error") }
50
+ */
51
+ export function asHelpful(
52
+ cause: unknown,
53
+ context: string,
54
+ hint: string,
55
+ kind: ErrorKind = "internal_error",
56
+ ): HelpfulError {
57
+ if (cause instanceof HelpfulError) return cause;
58
+ const msg = cause instanceof Error ? cause.message : String(cause);
59
+ return new HelpfulError({
60
+ kind,
61
+ message: `${context}: ${msg}`,
62
+ hint,
63
+ cause,
64
+ });
65
+ }
66
+
67
+ /** Map an ErrorKind to a stable process exit code. */
68
+ export function mapKindToExit(kind: ErrorKind): number {
69
+ switch (kind) {
70
+ case "input_error":
71
+ return 2;
72
+ case "not_found":
73
+ return 3;
74
+ case "conflict":
75
+ return 4;
76
+ case "auth_error":
77
+ return 5;
78
+ case "network_error":
79
+ return 6;
80
+ case "unsupported_mime":
81
+ return 7;
82
+ case "partial_failure":
83
+ return 8;
84
+ default:
85
+ return 1;
86
+ }
87
+ }
import type { ChunkerConfig } from "../config/schemas.ts";
import { DEFAULTS } from "../constants.ts";

// One chunk of a document; `index` is its position in the final chunk list.
export interface Chunk {
  index: number;
  content: string;
}

// Documents shorter than this stay a single chunk — splitting tiny content
// adds no retrieval value.
const SHORT_CONTENT_THRESHOLD = 200;
// How many trailing lines of each chunk are prepended to the next one.
const DEFAULT_OVERLAP_LINES = 2;
12
+ /**
13
+ * Split text into pieces no larger than `maxChars`, preferring paragraph,
14
+ * then line, then hard-character boundaries. Used to bound chunk size for
15
+ * the embedding model's input window.
16
+ */
17
+ export function splitText(text: string, maxChars: number): string[] {
18
+ if (text.length <= maxChars) return [text];
19
+
20
+ const paragraphs = text.split(/\n\n+/);
21
+ if (paragraphs.length > 1) {
22
+ const out: string[] = [];
23
+ let buf = "";
24
+ for (const p of paragraphs) {
25
+ const candidate = buf ? `${buf}\n\n${p}` : p;
26
+ if (candidate.length <= maxChars) {
27
+ buf = candidate;
28
+ } else {
29
+ if (buf) out.push(buf);
30
+ if (p.length <= maxChars) {
31
+ buf = p;
32
+ } else {
33
+ out.push(...splitText(p, maxChars));
34
+ buf = "";
35
+ }
36
+ }
37
+ }
38
+ if (buf) out.push(buf);
39
+ return out;
40
+ }
41
+
42
+ const lines = text.split("\n");
43
+ if (lines.length > 1) {
44
+ const out: string[] = [];
45
+ let buf = "";
46
+ for (const line of lines) {
47
+ const candidate = buf ? `${buf}\n${line}` : line;
48
+ if (candidate.length <= maxChars) {
49
+ buf = candidate;
50
+ } else {
51
+ if (buf) out.push(buf);
52
+ if (line.length <= maxChars) {
53
+ buf = line;
54
+ } else {
55
+ for (let i = 0; i < line.length; i += maxChars) {
56
+ out.push(line.slice(i, i + maxChars));
57
+ }
58
+ buf = "";
59
+ }
60
+ }
61
+ }
62
+ if (buf) out.push(buf);
63
+ return out;
64
+ }
65
+
66
+ const out: string[] = [];
67
+ for (let i = 0; i < text.length; i += maxChars) {
68
+ out.push(text.slice(i, i + maxChars));
69
+ }
70
+ return out;
71
+ }
72
+
73
+ /** Re-chunk any chunks larger than `maxChars`, preserving order and reindexing. */
74
+ export function enforceMaxChunkSize(chunks: Chunk[], maxChars: number = DEFAULTS.CHUNKER_MAX_CHARS): Chunk[] {
75
+ const out: Chunk[] = [];
76
+ for (const c of chunks) {
77
+ if (c.content.length <= maxChars) {
78
+ out.push({ index: out.length, content: c.content });
79
+ continue;
80
+ }
81
+ for (const piece of splitText(c.content, maxChars)) {
82
+ out.push({ index: out.length, content: piece });
83
+ }
84
+ }
85
+ return out;
86
+ }
87
+
88
+ /**
89
+ * Add overlapping lines from the end of each chunk to the start of the
90
+ * next so retrieval still works when concepts span chunk boundaries.
91
+ */
92
+ export function addOverlapToChunks(chunks: Chunk[], overlapLines = DEFAULT_OVERLAP_LINES): Chunk[] {
93
+ if (chunks.length <= 1 || overlapLines <= 0) return chunks;
94
+ return chunks.map((c, i) => {
95
+ if (i === 0) return { ...c };
96
+ const prev = chunks[i - 1];
97
+ if (!prev) return { ...c };
98
+ const overlap = prev.content.split("\n").slice(-overlapLines).join("\n");
99
+ return { ...c, content: `${overlap}\n${c.content}` };
100
+ });
101
+ }
102
+
103
+ /**
104
+ * Deterministic chunker. Splits on paragraph/line/hard boundaries to a
105
+ * target size, then enforces a hard max-size after overlap is added. The
106
+ * LLM chunker is a separate code path opted into via config; this is the
107
+ * default and what tests rely on for stability.
108
+ */
109
+ export function chunkDeterministic(content: string, config: ChunkerConfig): Chunk[] {
110
+ if (content.length < SHORT_CONTENT_THRESHOLD) {
111
+ return [{ index: 0, content }];
112
+ }
113
+ const initial = splitText(content, config.target_chars).map((c, i) => ({ index: i, content: c }));
114
+ const sized = enforceMaxChunkSize(initial, config.max_chars);
115
+ const withOverlap = addOverlapToChunks(sized);
116
+ return enforceMaxChunkSize(withOverlap, config.max_chars);
117
+ }
import mammoth from "mammoth";
import TurndownService from "turndown";

// Shared converter instance: ATX headings, fenced code blocks, and "-" bullets
// to match the markdown style of the other converters.
const turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced", bulletListMarker: "-" });

/**
 * Convert a DOCX file to markdown. Mammoth gives us HTML; we then run that
 * through turndown to get clean markdown. Any conversion warnings are
 * silently dropped — they're typically about styles we don't preserve.
 *
 * @param bytes - Raw .docx file bytes.
 * @returns Trimmed markdown surrogate for the document.
 */
export async function convertDocx(bytes: Uint8Array): Promise<string> {
  // mammoth's buffer input expects a Node Buffer, not a bare Uint8Array.
  const buf = Buffer.from(bytes);
  const result = await mammoth.convertToHtml({ buffer: buf });
  return turndown.turndown(result.value).trim();
}
@@ -0,0 +1,20 @@
1
+ import TurndownService from "turndown";
2
+
3
+ const turndown = new TurndownService({
4
+ headingStyle: "atx",
5
+ codeBlockStyle: "fenced",
6
+ bulletListMarker: "-",
7
+ });
8
+
9
+ /**
10
+ * Convert HTML bytes to markdown using turndown. Strips script/style blocks
11
+ * before conversion so they don't leak into the chunker.
12
+ */
13
+ export function convertHtml(bytes: Uint8Array): string {
14
+ const html = new TextDecoder("utf-8").decode(bytes);
15
+ const cleaned = html
16
+ .replace(/<script[\s\S]*?<\/script>/gi, "")
17
+ .replace(/<style[\s\S]*?<\/style>/gi, "")
18
+ .replace(/<noscript[\s\S]*?<\/noscript>/gi, "");
19
+ return turndown.turndown(cleaned).trim();
20
+ }
import Anthropic from "@anthropic-ai/sdk";
import type { LlmConfig } from "../../config/schemas.ts";
import { logger } from "../../output/logger.ts";
import { ocrImage } from "./ocr.ts";

// Prompt sent alongside the image; asks for a single retrieval-friendly caption.
const VISION_PROMPT = `Describe this image as a one-paragraph caption suitable for retrieval. Focus on:
- The subject and any people / objects / diagrams visible
- Visible text content if present
- The visual style (screenshot, photograph, diagram, chart, etc.)

Output the caption only, no preamble.`;

// MIME types accepted by the vision endpoint as base64 image sources.
const VISION_MIMES = new Set(["image/png", "image/jpeg", "image/gif", "image/webp"]);

/**
 * Build the markdown surrogate for an image: an LLM-generated caption
 * (when an API key is available) folded together with any text recovered
 * by Tesseract OCR. Falls back to OCR-only or a deterministic placeholder
 * when no API key is set.
 *
 * @param bytes - Raw image bytes.
 * @param mimeType - Image MIME type (the dispatcher lowercases it first).
 * @param llm - LLM config carrying the API key and vision model name.
 * @returns Markdown text; never empty (placeholder as a last resort).
 */
export async function convertImage(bytes: Uint8Array, mimeType: string, llm: LlmConfig): Promise<string> {
  // Caption and OCR are independent, so run them concurrently.
  const captionPromise = describeImage(bytes, mimeType, llm);
  const ocrPromise = ocrImage(bytes);
  const [caption, ocrText] = await Promise.all([captionPromise, ocrPromise]);

  const sections: string[] = [];
  if (caption) sections.push(caption);
  if (ocrText) sections.push(`## Text detected via OCR\n\n${ocrText}`);
  // Deterministic placeholder so downstream chunking always has content.
  if (sections.length === 0) sections.push(`(image, ${mimeType}, no caption available)`);
  return sections.join("\n\n");
}

/**
 * Single-shot vision call asking Claude to caption an image. Returns the
 * caption text or an empty string when the API key is missing or the
 * MIME type isn't accepted by the vision endpoint. Request failures are
 * logged and degraded to "" rather than failing the ingest.
 */
async function describeImage(bytes: Uint8Array, mimeType: string, llm: LlmConfig): Promise<string> {
  if (!llm.anthropic_api_key || llm.anthropic_api_key.trim() === "") return "";
  if (!VISION_MIMES.has(mimeType)) return "";

  const client = new Anthropic({ apiKey: llm.anthropic_api_key });
  const base64 = Buffer.from(bytes).toString("base64");
  try {
    const resp = await client.messages.create({
      model: llm.vision_model,
      max_tokens: 500,
      messages: [
        {
          role: "user",
          content: [
            {
              type: "image",
              source: {
                type: "base64",
                // Safe narrowing: VISION_MIMES membership was checked above.
                media_type: mimeType as "image/png" | "image/jpeg" | "image/gif" | "image/webp",
                data: base64,
              },
            },
            { type: "text", text: VISION_PROMPT },
          ],
        },
      ],
    });
    // Keep only text blocks; any other content block types are ignored.
    const text = resp.content.flatMap((b) => (b.type === "text" ? [b.text] : [])).join("");
    return text.trim();
  } catch (err) {
    logger.warn(`vision: caption failed (${err instanceof Error ? err.message : String(err)})`);
    return "";
  }
}
import type { LlmConfig } from "../../config/schemas.ts";
import { convertDocx } from "./docx.ts";
import { convertHtml } from "./html.ts";
import { convertImage } from "./image.ts";
import { convertWithLlm } from "./llm.ts";
import { ocrImage } from "./ocr.ts";
import { convertPdf, shouldOcrPdf } from "./pdf.ts";
import { convertText } from "./text.ts";

export interface ConvertResult {
  markdown: string;
  // Always markdown: the chunker/embedder pipeline never sees another type.
  contentMimeType: "text/markdown";
}

// MIME sets checked in priority order by convert() below.
const TEXT_MIMES = new Set(["text/markdown", "text/plain", "text/x-markdown", "text/md"]);
const HTML_MIMES = new Set(["text/html", "application/xhtml+xml"]);
// Text-like but not markdown: routed through the LLM converter with a raw fallback.
const STRUCTURED_TEXT_MIMES = new Set([
  "application/json",
  "application/xml",
  "text/xml",
  "application/yaml",
  "text/yaml",
  "text/csv",
  "application/javascript",
  "application/typescript",
]);
const DOCX_MIMES = new Set(["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]);
const PDF_MIMES = new Set(["application/pdf"]);

/**
 * Convert raw bytes to a markdown surrogate via mime-dispatched native
 * libraries first, with an LLM fallback when no native converter applies
 * and an Anthropic API key is configured. Always returns markdown — even
 * for binary types — so the chunker / embedder pipeline never has to
 * branch on the source mime.
 *
 * @param bytes - Raw source bytes.
 * @param mimeType - Detected MIME type; lowercased here before dispatch.
 * @param source - Human-readable origin (used in LLM prompts only).
 * @param llm - LLM config; an empty API key disables all LLM fallbacks.
 */
export async function convert(
  bytes: Uint8Array,
  mimeType: string,
  source: string,
  llm: LlmConfig,
): Promise<ConvertResult> {
  const mt = mimeType.toLowerCase();

  // Already markdown/plain text: decode only.
  if (TEXT_MIMES.has(mt)) {
    return { markdown: convertText(bytes), contentMimeType: "text/markdown" };
  }

  if (HTML_MIMES.has(mt)) {
    return { markdown: convertHtml(bytes), contentMimeType: "text/markdown" };
  }

  if (DOCX_MIMES.has(mt)) {
    return { markdown: await convertDocx(bytes), contentMimeType: "text/markdown" };
  }

  if (PDF_MIMES.has(mt)) {
    const conversion = await convertPdf(bytes);
    if (!shouldOcrPdf(conversion)) {
      return { markdown: conversion.markdown, contentMimeType: "text/markdown" };
    }
    // Likely a scanned PDF: merge in whatever OCR recovers (currently
    // nothing — see ocrPdfBytes below).
    const ocrText = await ocrPdfBytes(bytes);
    const merged = [conversion.markdown, ocrText ? `## Text detected via OCR\n\n${ocrText}` : ""]
      .filter(Boolean)
      .join("\n\n");
    return {
      markdown: merged || `(scanned PDF, ${bytes.byteLength} bytes — no recognizable text)`,
      contentMimeType: "text/markdown",
    };
  }

  if (mt.startsWith("image/")) {
    return { markdown: await convertImage(bytes, mt, llm), contentMimeType: "text/markdown" };
  }

  if (STRUCTURED_TEXT_MIMES.has(mt)) {
    const raw = convertText(bytes);
    const md = await convertWithLlm(raw, mt, source, llm);
    // convertWithLlm returns raw/empty on failure; fall back to the raw text.
    return { markdown: md || raw, contentMimeType: "text/markdown" };
  }

  // Last resort: try LLM conversion with a base64 sample (truncated) so we
  // at least produce something for unknown binary types. Without an API
  // key we fall straight through to a deterministic placeholder.
  if (!llm.anthropic_api_key || llm.anthropic_api_key.trim() === "") {
    return {
      markdown: `(unknown binary, ${mt}, ${bytes.byteLength} bytes)`,
      contentMimeType: "text/markdown",
    };
  }
  const sample = sampleAsText(bytes, mt);
  const md = await convertWithLlm(sample, mt, source, llm);
  // `md !== sample` filters the no-API-key/failure path where the input
  // comes back unchanged.
  if (md && md.trim().length > 0 && md !== sample) {
    return { markdown: md, contentMimeType: "text/markdown" };
  }
  return { markdown: `(unknown binary, ${mt}, ${bytes.byteLength} bytes)`, contentMimeType: "text/markdown" };
}

/**
 * Render a small slice of unknown-binary bytes as a base64 sample so the
 * LLM converter has something to look at without us shipping a 50MB blob.
 */
function sampleAsText(bytes: Uint8Array, mimeType: string): string {
  const slice = bytes.slice(0, 4096);
  const b64 = Buffer.from(slice).toString("base64");
  return `Binary content of type ${mimeType}, ${bytes.byteLength} bytes total. First 4096 bytes (base64):\n\n${b64}`;
}

/**
 * Tesseract over a PDF's bytes is unhelpful (it's not an image). For a real
 * scanned-PDF OCR pipeline we'd rasterize each page first; for now this
 * function exists as a hook and returns an empty string so the dispatcher
 * still produces a usable surrogate.
 */
async function ocrPdfBytes(_bytes: Uint8Array): Promise<string> {
  return "";
}

export { ocrImage };
import Anthropic from "@anthropic-ai/sdk";
import type { LlmConfig } from "../../config/schemas.ts";
import { logger } from "../../output/logger.ts";

// Generous output budget: converted markdown can be longer than the input.
const CONVERTER_MAX_TOKENS = 16_384;

const CONVERTER_SYSTEM_PROMPT = `You normalize documents to clean, well-structured Markdown.

If the input is already clean, valid Markdown, return it verbatim with no edits.

Otherwise, convert it. The input mime_type is a hint, not a guarantee — verify the actual content. Common non-markdown formats:
- HTML — strip tags, scripts, styles, navigation/footer chrome.
- JSON / XML / YAML — render structure as readable Markdown.
- DocMD-like annotation formats — strip bracket annotations, map H1→#, H2→##, P→paragraph.

Rules for the output:
- Preserve all semantic content: headings, paragraphs, lists, tables, links, inline code, code blocks, blockquotes.
- Use ATX headings (#, ##, ###), fenced code blocks, GFM-style tables.
- Strip metadata headers/IDs (e.g. @document_id: ...).
- Output ONLY the Markdown. No preamble, no trailing commentary, no wrapping the entire output in a code fence.`;

/**
 * Last-resort converter: ship the raw text/binary preview to Claude and ask
 * for clean markdown. Returns the raw input unchanged when there's no API
 * key configured (the pipeline degrades to a less-clean surrogate rather
 * than failing the ingest). Does NOT run when the input is already known
 * to be markdown — caller should short-circuit that path.
 *
 * @param content - Raw text (or base64 binary sample) to normalize.
 * @param mimeType - Mime hint included in the prompt.
 * @param source - Human-readable origin, included in the prompt for context.
 * @param llm - Config carrying the API key and converter model name.
 * @returns Normalized markdown, or `content` unchanged on any failure.
 */
export async function convertWithLlm(
  content: string,
  mimeType: string,
  source: string,
  llm: LlmConfig,
): Promise<string> {
  if (!llm.anthropic_api_key || llm.anthropic_api_key.trim() === "") {
    return content;
  }
  const client = new Anthropic({ apiKey: llm.anthropic_api_key });
  try {
    // NOTE(review): the streaming API is used but only the final message is
    // consumed — presumably to avoid long-request timeouts; confirm intent.
    const stream = client.messages.stream({
      model: llm.converter_model,
      max_tokens: CONVERTER_MAX_TOKENS,
      system: CONVERTER_SYSTEM_PROMPT,
      messages: [
        {
          role: "user",
          content: `Convert this ${mimeType} content to Markdown. Source: ${source}\n\n${content}`,
        },
      ],
    });
    const final = await stream.finalMessage();
    // Keep only text blocks from the response.
    const text = final.content.flatMap((b) => (b.type === "text" ? [b.text] : [])).join("");
    if (!text.trim()) return content;
    return stripLeadingMarkdownFence(text);
  } catch (err) {
    logger.warn(`llm-converter: failed (${err instanceof Error ? err.message : String(err)}) — using raw input`);
    return content;
  }
}
61
+ function stripLeadingMarkdownFence(text: string): string {
62
+ const trimmed = text.trim();
63
+ const fenceMatch = trimmed.match(/^```(?:markdown|md)?\s*\n([\s\S]*?)\n```\s*$/);
64
+ if (fenceMatch?.[1]) return fenceMatch[1];
65
+ return text;
66
+ }
@@ -0,0 +1,51 @@
1
+ import { logger } from "../../output/logger.ts";
2
+
3
+ interface TesseractWorker {
4
+ recognize(input: Uint8Array | Buffer | string): Promise<{ data: { text: string } }>;
5
+ terminate(): Promise<void>;
6
+ }
7
+
8
+ let workerPromise: Promise<TesseractWorker> | null = null;
9
+
10
+ /**
11
+ * Lazily initialize a Tesseract worker for English OCR. Held as a process-
12
+ * wide singleton because spinning a worker up costs hundreds of ms.
13
+ */
14
+ async function getWorker(): Promise<TesseractWorker> {
15
+ if (!workerPromise) {
16
+ workerPromise = (async () => {
17
+ const tesseract = await import("tesseract.js");
18
+ const w = await tesseract.createWorker("eng");
19
+ return w as unknown as TesseractWorker;
20
+ })();
21
+ }
22
+ return workerPromise;
23
+ }
24
+
25
+ /**
26
+ * Run Tesseract OCR over the provided bytes (image bytes). Returns the
27
+ * recognized text. Errors are logged and turned into an empty string so
28
+ * the calling pipeline can degrade gracefully.
29
+ */
30
+ export async function ocrImage(bytes: Uint8Array): Promise<string> {
31
+ try {
32
+ const worker = await getWorker();
33
+ const result = await worker.recognize(Buffer.from(bytes));
34
+ return (result.data.text ?? "").trim();
35
+ } catch (err) {
36
+ logger.warn(`ocr: recognition failed (${err instanceof Error ? err.message : String(err)})`);
37
+ return "";
38
+ }
39
+ }
40
+
41
+ /** Tear down the singleton worker — call once at process exit if needed. */
42
+ export async function shutdownOcr(): Promise<void> {
43
+ if (!workerPromise) return;
44
+ const w = await workerPromise;
45
+ workerPromise = null;
46
+ try {
47
+ await w.terminate();
48
+ } catch {
49
+ // best effort
50
+ }
51
+ }