pi-ocr 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,434 @@
1
+ /**
2
+ * pi-minimodel-ocr — Multi-backend OCR for Pi Coding Agent
3
+ *
4
+ * Registers a `minimodel_ocr` tool that the LLM can call to read images and PDFs
5
+ * using one of three backends:
6
+ * - Ollama (local vision models like glm-ocr)
7
+ * - MinerU API (free Agent API, ≤10MB, ≤20 pages)
8
+ * - Pix2Text (local Python library)
9
+ *
10
+ * Single command:
11
+ * /ocr → open settings UI (backend, model, split toggle)
12
+ * /ocr <file> [task] [model] → OCR file with current settings (optional Ollama model override)
13
+ *
14
+ * Settings persisted to ~/.pi/agent/settings.json.
15
+ *
16
+ * Prerequisites:
17
+ * Ollama: brew install ollama && ollama pull glm-ocr
18
+ * MinerU: no setup (free API, IP rate-limited)
19
+ * Pix2Text: pip install pix2text
20
+ * PDF tools: brew install poppler (macOS multi-page PDF for Ollama)
21
+ *
22
+ * Install: pi install npm:pi-minimodel-ocr
23
+ */
24
+
25
+ import { Type } from "@earendil-works/pi-ai";
26
+ import {
27
+ defineTool,
28
+ getSettingsListTheme,
29
+ type ExtensionAPI,
30
+ type ExtensionContext,
31
+ } from "@earendil-works/pi-coding-agent";
32
+ import {
33
+ Container,
34
+ Text,
35
+ type SettingItem,
36
+ SettingsList,
37
+ type SelectItem,
38
+ SelectList,
39
+ } from "@earendil-works/pi-tui";
40
+ import { existsSync, readFileSync, writeFileSync, mkdirSync } from "node:fs";
41
+ import { basename, extname, dirname, join } from "node:path";
42
+ import { homedir } from "node:os";
43
+
44
+ import type { Backend, Task, OcrConfig } from "./types";
45
+ import { TASKS, BACKENDS } from "./types";
46
+ import { isImage, isPdf, getPdfPageCount, ollamaOcr, ollamaCheckModel, ollamaPullModel } from "./ollama";
47
+ import { mineruOcr } from "./mineru";
48
+ import { pix2textOcr } from "./pix2text";
49
+
50
+ // ── Config persistence ───────────────────────────────────────────────────────
51
+
52
+ const SETTINGS_PATH = join(homedir(), ".pi", "agent", "settings.json");
53
+
54
/**
 * Reads the `minimodelOcr` section from the settings file at SETTINGS_PATH.
 *
 * @returns The stored (possibly partial) OCR settings, or an empty object when
 *          the file is missing, cannot be read or parsed, or has no such section.
 */
function loadOcrConfig(): Partial<OcrConfig> {
  try {
    if (existsSync(SETTINGS_PATH)) {
      const raw = readFileSync(SETTINGS_PATH, "utf8");
      const parsed = JSON.parse(raw) as any;
      return parsed.minimodelOcr || {};
    }
    return {};
  } catch {
    // Any read/parse failure is treated as "no stored settings".
    return {};
  }
}
60
+
61
/**
 * Merges `updates` into the `minimodelOcr` section of the settings file and
 * writes the whole file back as pretty-printed JSON.
 *
 * Other top-level keys already present in the file are kept. The write is
 * best effort: any failure (unreadable/invalid existing file, write error)
 * is swallowed and nothing is saved.
 *
 * @param updates OCR settings to overwrite; keys not listed keep their stored value.
 */
function saveOcrConfig(updates: Partial<OcrConfig>) {
  try {
    const parentDir = dirname(SETTINGS_PATH);
    if (!existsSync(parentDir)) {
      mkdirSync(parentDir, { recursive: true });
    }

    let settings: any = {};
    if (existsSync(SETTINGS_PATH)) {
      settings = JSON.parse(readFileSync(SETTINGS_PATH, "utf8"));
    }

    const previous = settings.minimodelOcr || {};
    settings.minimodelOcr = { ...previous, ...updates };

    const serialized = JSON.stringify(settings, null, 2) + "\n";
    writeFileSync(SETTINGS_PATH, serialized, "utf8");
  } catch {
    /* best effort */
  }
}
72
+
73
/**
 * Builds the effective OCR configuration from stored settings, environment
 * variables and built-in defaults.
 *
 * - `backend`: stored value if it is one of BACKENDS, otherwise "mineru".
 * - `ollamaHost`: OLLAMA_HOST env var, then stored value, then "http://localhost:11434".
 * - `model`: OCR_MODEL env var, then stored value, then "glm-ocr".
 * - `mineruSplitPdf`: true unless the stored value is exactly `false`.
 */
function getConfig(): OcrConfig {
  const stored = loadOcrConfig();

  const backend = (BACKENDS.includes(stored.backend as Backend) ? stored.backend : "mineru") as Backend;
  const ollamaHost = process.env.OLLAMA_HOST || stored.ollamaHost || "http://localhost:11434";
  const model = process.env.OCR_MODEL || stored.model || "glm-ocr";
  const mineruSplitPdf = stored.mineruSplitPdf !== false;

  return { backend, ollamaHost, model, mineruSplitPdf };
}
82
+
83
+ // ── Recommended models ───────────────────────────────────────────────────────
84
+
85
/**
 * Ollama models offered in the model picker, in display order.
 * Each entry is a model name plus the short description shown next to it.
 */
const RECOMMENDED_MODELS = (
  [
    ["glm-ocr:q8_0", "balanced — smallest (1.6GB), fast"],
    ["glm-ocr", "best formula OCR (2.2GB, 94.6 OmniDocBench)"],
    ["minicpm-v", "strong all-around vision + OCR (8B, 5.5GB)"],
    ["llama3.2-vision", "Meta's vision model (11B)"],
  ] as [string, string][]
).map(([name, desc]) => ({ name, desc }));
91
+
92
+ // ── Tool Definition ──────────────────────────────────────────────────────────
93
+
94
/**
 * Parameter schema for the `minimodel_ocr` tool:
 * a required file `path`, plus optional `task` and `model` strings.
 */
const ocrSchema = Type.Object({
  // File to read; the tool itself checks existence and extension at call time.
  path: Type.String({
    description:
      "Absolute or relative path to the image or PDF file to OCR. Supported formats: PNG, JPG, GIF, WEBP, BMP, TIFF, PDF.",
  }),
  // Kind of extraction to perform.
  task: Type.Optional(Type.String({
    description:
      'OCR task type. "text" for Markdown text, "formula" for LaTeX math, "table" for Markdown tables, "figure" for description, "auto" for full document OCR (default).',
  })),
  // Per-call override of the configured Ollama model.
  model: Type.Optional(Type.String({
    description:
      "Ollama model to use for OCR. Defaults to 'glm-ocr'. You can use any Ollama vision model, e.g. 'glm-ocr:q8_0' for the 8-bit quantized version, 'llama3.2-vision', 'minicpm-v', etc.",
  })),
});
112
+
113
/**
 * The `minimodel_ocr` tool definition.
 *
 * `execute` validates the input file, dispatches to the backend selected in
 * the current configuration (ollama / mineru / pix2text), and returns a
 * Markdown summary whose body is cut to 5000 characters; the untruncated text
 * is returned in `details.fullText`. Any backend failure is rethrown as
 * `OCR error (<backend>): <message>` with a backend-specific hint appended
 * when the message matches a known pattern.
 */
const ocrTool = defineTool({
  name: "minimodel_ocr",
  label: "Minimodel OCR",
  description: [
    "Extract text, math formulas (LaTeX), and tables from images or PDFs using local Ollama vision models. ",
    "Use this when you need to read text from an image or PDF, especially mathematical formulas that need LaTeX output. ",
    "This is the tool to use when working with non-vision LLMs like DeepSeek that cannot process images directly.",
  ].join(""),
  promptSnippet:
    "Extract text/formulas/tables from images and PDFs using local Ollama OCR",
  promptGuidelines: [
    "When the user asks about the content of an image or PDF, use minimodel_ocr to extract the text first.",
    "For mathematical documents, use minimodel_ocr with task='formula' or task='auto' to get LaTeX output.",
    "Use minimodel_ocr with task='auto' for general document OCR to extract all text, formulas, tables, and figures.",
  ],
  parameters: ocrSchema,
  async execute(_toolCallId, params, signal, onUpdate, _ctx) {
    const args = params as { path: string; task?: string; model?: string };
    const filePath = args.path;
    const requestedTask = args.task === undefined ? "auto" : args.task;

    // Unknown task names silently fall back to "auto".
    const resolvedTask = (TASKS.includes(requestedTask as Task) ? requestedTask : "auto") as Task;
    const config = getConfig();
    const resolvedModel = args.model || config.model;

    if (!existsSync(filePath)) {
      throw new Error(`File not found: ${filePath}`);
    }
    const supported = isImage(filePath) || isPdf(filePath);
    if (!supported) {
      throw new Error(`Unsupported file type "${extname(filePath)}". Supported: PNG, JPG, GIF, WEBP, BMP, TIFF, PDF.`);
    }

    // Progress reporter; a no-op when the caller supplied no update callback.
    const onProgress = (msg: string) => onUpdate?.({ content: [{ type: "text", text: msg }], details: {} });

    const backendLabels = { ollama: "🦙 Ollama", mineru: "☁️ MinerU", pix2text: "📐 Pix2Text" };
    const backendLabel = backendLabels[config.backend];
    onUpdate?.({ content: [{ type: "text", text: `🔍 OCR ${basename(filePath)} via ${backendLabel} (${resolvedTask})…` }], details: {} });

    try {
      let result: { text: string; details: Record<string, unknown> };

      if (config.backend === "ollama") {
        result = await ollamaOcr(filePath, resolvedTask, config.ollamaHost, resolvedModel, signal, onProgress);
      } else if (config.backend === "mineru") {
        const { stat } = await import("node:fs/promises");
        const stats = await stat(filePath);
        // Oversized files only produce a warning; the request is still attempted.
        if (stats.size > 10 * 1024 * 1024) {
          onProgress(`⚠️ File is ${(stats.size / 1024 / 1024).toFixed(1)}MB. MinerU free tier limit is 10MB.\n💡 Compress at https://ilovepdf.com/compress_pdf or switch backend with /ocr.`);
        }
        result = await mineruOcr(filePath, resolvedTask, config.mineruSplitPdf, signal, onProgress);
      } else if (config.backend === "pix2text") {
        result = await pix2textOcr(filePath, resolvedTask, signal, onProgress);
      } else {
        throw new Error(`Unknown backend "${config.backend}"`);
      }

      const fullText = result.text;
      const truncated = fullText.length > 5000;
      const preview = truncated ? fullText.slice(0, 5000) + "\n\n… (truncated)" : fullText;

      return {
        content: [{ type: "text", text: `## OCR Result (${resolvedTask})\n\n**File:** \`${basename(filePath)}\`\n**Backend:** ${config.backend}\n\n${preview}` }],
        details: { task: resolvedTask, path: filePath, fullText, truncated, backend: config.backend, ...result.details },
      };
    } catch (e: any) {
      const msg = e.message || String(e);
      const backend = config.backend;

      // First matching pattern wins; unmatched errors get no hint.
      let hint = "";
      if (backend === "ollama" && (msg.includes("fetch failed") || msg.includes("ECONNREFUSED"))) {
        hint = "\n\n💡 Is Ollama running? Start: `ollama serve`";
      } else if (backend === "pix2text" && msg.includes("python3")) {
        hint = "\n\n💡 Install: `pip install pix2text`";
      } else if (backend === "mineru" && msg.includes("429")) {
        hint = "\n\n💡 MinerU rate limit. Wait a minute or switch backend with /ocr.";
      } else if (backend === "mineru" && msg.includes("too large")) {
        hint = "\n\n💡 Compress at https://ilovepdf.com/compress_pdf or switch backend.";
      }

      throw new Error(`OCR error (${config.backend}): ${msg}${hint}`);
    }
  },
});
185
+
186
+ // ── Extension Entry ─────────────────────────────────────────────────────────
187
+
188
+ export default function ocrExtension(pi: ExtensionAPI) {
189
+ pi.registerTool(ocrTool);
190
+
191
+ // ── /ocr command ─────────────────────────────────────────────────────────
192
+
193
/**
 * Splits a raw `/ocr` argument string into tokens.
 *
 * Tokens are separated by whitespace. A token wrapped in matching single or
 * double quotes may contain whitespace (the quotes are stripped), so file
 * paths with spaces can be passed as `/ocr "my scan.pdf" formula`.
 * Unquoted input is tokenized exactly as a plain whitespace split.
 */
function splitCommandArgs(raw: string): string[] {
  const tokens: string[] = [];
  const pattern = /"([^"]*)"|'([^']*)'|(\S+)/g;
  let match: RegExpExecArray | null;
  while ((match = pattern.exec(raw)) !== null) {
    tokens.push(match[1] ?? match[2] ?? match[3] ?? "");
  }
  return tokens;
}

/**
 * `/ocr` command.
 *
 *   /ocr                          → open the settings UI
 *   /ocr <file> [task] [model]    → OCR the file through the minimodel_ocr tool
 *
 * On success only a summary (character count and backend) is shown as a
 * notification; failures are shown as an error notification capped at 200 chars.
 */
pi.registerCommand("ocr", {
  description: "OCR an image or PDF, or configure OCR settings",
  handler: async (args, ctx) => {
    const trimmed = (args || "").trim();

    // No args → open settings UI
    if (!trimmed) {
      await showOcrSettings(ctx);
      return;
    }

    // Args → OCR a file (quoted tokens keep embedded spaces)
    const parts = splitCommandArgs(trimmed);
    const filePath = parts[0] ?? "";
    const task = parts[1] || "auto";
    const model = parts[2] || undefined;

    if (!existsSync(filePath)) {
      ctx.ui.notify(`File not found: ${filePath}`, "error");
      return;
    }

    try {
      // No abort signal or progress callback is available from a slash command.
      const result = await ocrTool.execute("", { path: filePath, task, model }, undefined as any, undefined, ctx);
      const textLen = (result.details as any)?.fullText?.length || 0;
      ctx.ui.notify(`OCR complete — ${textLen} chars via ${(result.details as any)?.backend || "?"}`, "info");
    } catch (e: any) {
      ctx.ui.notify(e.message?.slice(0, 200) || "OCR failed", "error");
    }
  },
});
224
+
225
+ // ── Settings UI ────────────────────────────────────────────────────────────
226
+ //
227
+ // Shows a SettingsList with:
228
+ // 1. Backend selector (toggle: ollama / mineru / pix2text)
229
+ // 2. MinerU: Split PDF >20 pages (toggle: ON / OFF)
230
+ // 3. Ollama model (current value shown; Enter opens model picker submenu)
231
+ //
232
+ // Changes are saved immediately to ~/.pi/agent/settings.json.
233
+
234
/**
 * Opens the interactive OCR settings UI and returns once it is dismissed.
 *
 * The list has three rows: backend (cycled through BACKENDS), the MinerU
 * split-PDF toggle, and the Ollama model (opens the model picker submenu).
 * Every change is persisted immediately via saveOcrConfig.
 *
 * @param ctx Extension context used for the custom UI, notifications and status bar.
 */
async function showOcrSettings(ctx: ExtensionContext) {
  const config = getConfig();

  // Set once the SettingsList exists; the model submenu uses it to refresh the shown value.
  let settingsListRef: SettingsList | null = null;

  const items: SettingItem[] = [
    {
      id: "backend",
      label: "OCR Backend",
      description: "Ollama=local GPU, MinerU=free cloud API, Pix2Text=local Python",
      currentValue: config.backend,
      values: [...BACKENDS],
    },
    {
      id: "mineruSplitPdf",
      label: "MinerU: Split PDF >20 pages",
      description: "Auto-split large PDFs into ≤20-page free-tier chunks",
      currentValue: config.mineruSplitPdf ? "ON" : "OFF",
      values: ["ON", "OFF"],
    },
    {
      id: "model",
      label: "Ollama Model",
      description: "Vision model used for OCR (only applies to Ollama backend)",
      currentValue: config.model,
      submenu: (currentValue, done) => {
        // Prefer the value the list currently shows over the snapshot taken when
        // the UI opened, so the ✓ marker follows earlier picks in this session.
        return createModelSelector(currentValue || config.model, ctx, (selected) => {
          if (selected) {
            saveOcrConfig({ model: selected });
            // getConfig() gives OCR_MODEL precedence over the settings file, so
            // mirror the choice into the environment for it to take effect now.
            process.env.OCR_MODEL = selected;
            updateStatus(ctx);
            // Update the SettingsList item value in-place
            settingsListRef?.updateValue("model", selected);
          }
          done(selected);
        });
      },
    },
  ];

  await new Promise<void>((resolve) => {
    ctx.ui.custom((tui, theme, _kb, done) => {
      const settingsList = new SettingsList(
        items,
        8, // max visible items
        getSettingsListTheme(),
        (id, newValue) => {
          // onChange — save immediately
          switch (id) {
            case "backend": {
              const backend = BACKENDS.includes(newValue as Backend) ? newValue as Backend : "ollama";
              saveOcrConfig({ backend });
              updateStatus(ctx);
              // Show hints when switching
              if (backend === "mineru") {
                // Re-read the config: the split toggle may have changed since the UI opened.
                const splitEnabled = getConfig().mineruSplitPdf;
                ctx.ui.notify(
                  "☁️ MinerU: free for ≤10MB & ≤20 pages. Auto-split " +
                    (splitEnabled ? "ON" : "OFF — enable in settings") +
                    ".\nLarge files? Compress at https://ilovepdf.com/compress_pdf",
                  "info",
                );
              } else if (backend === "pix2text") {
                ctx.ui.notify("🐍 Pix2Text: needs `pip install pix2text`", "warning");
              }
              break;
            }
            case "mineruSplitPdf":
              saveOcrConfig({ mineruSplitPdf: newValue === "ON" });
              break;
          }
        },
        () => {
          // onCancel — close the UI and settle the outer promise so the caller's
          // `await showOcrSettings(...)` returns instead of pending forever.
          done(undefined);
          resolve();
        },
      );

      settingsListRef = settingsList;

      const container = new Container();
      container.addChild(new Text(theme.fg("accent", theme.bold("OCR Settings")), 1, 0));
      container.addChild(settingsList);
      container.addChild(
        new Text(theme.fg("dim", "↑↓ navigate • ← → toggle • enter select • esc close"), 1, 0),
      );

      return {
        render(width: number) {
          return container.render(width);
        },
        invalidate() {
          container.invalidate();
        },
        handleInput(data: string) {
          settingsList.handleInput(data);
          tui.requestRender();
        },
      };
    });
  });
}
332
+
333
+ // ── Model selector submenu ─────────────────────────────────────────────────
334
+
335
/**
 * Builds the model-picker component shown as the "Ollama Model" submenu.
 *
 * Lists RECOMMENDED_MODELS (the current one marked with ✓) plus a
 * "custom name" entry that prompts for free text. After a model is chosen
 * it is checked/pulled via ensureModelPulled and reported through `onDone`.
 *
 * `onDone` is always called exactly once per selection or cancel:
 * with the chosen model name, or `undefined` when nothing was chosen.
 *
 * @param currentModel Model name to mark as current and to prefill the custom prompt.
 * @param ctx          Extension context used for theming, prompts and notifications.
 * @param onDone       Callback receiving the selected model name, or `undefined`.
 * @returns A component object exposing `render`, `invalidate` and `handleInput`.
 */
function createModelSelector(
  currentModel: string,
  ctx: ExtensionContext,
  onDone: (selected: string | undefined) => void,
) {
  const items: SelectItem[] = RECOMMENDED_MODELS.map((m) => ({
    value: m.name,
    label: m.name === currentModel ? `${m.name} ✓` : m.name,
    description: m.desc,
  }));
  items.push({
    value: "__custom__",
    label: "Type a custom name…",
    description: "Enter any Ollama model name",
  });

  const container = new Container();
  container.addChild(new Text("Choose Ollama Model", 1, 0));

  const selectList = new SelectList(items, Math.min(items.length, 8), {
    selectedPrefix: (text) => ctx.ui.theme.fg("accent", text),
    selectedText: (text) => ctx.ui.theme.fg("accent", text),
    description: (text) => ctx.ui.theme.fg("muted", text),
    scrollInfo: (text) => ctx.ui.theme.fg("dim", text),
    noMatch: (text) => ctx.ui.theme.fg("warning", text),
  });

  selectList.onSelect = async (item) => {
    let chosen: string | undefined = item.value;

    if (item.value === "__custom__") {
      try {
        const custom = await ctx.ui.input("Enter Ollama model name:", currentModel);
        chosen = custom?.trim() || undefined;
      } catch {
        // A failed prompt is treated like a cancelled one.
        chosen = undefined;
      }
    }

    if (!chosen) {
      onDone(undefined);
      return;
    }

    try {
      await ensureModelPulled(chosen, ctx);
    } catch (e: unknown) {
      // The availability check can fail (e.g. Ollama unreachable). Do not leave
      // the submenu hanging or the rejection unhandled: warn and keep the choice.
      const reason = e instanceof Error ? e.message : String(e);
      ctx.ui.notify(`Could not verify model "${chosen}": ${reason}`.slice(0, 200), "warning");
    }
    onDone(chosen);
  };

  selectList.onCancel = () => onDone(undefined);
  container.addChild(selectList);

  return {
    render(width: number) { return container.render(width); },
    invalidate() { container.invalidate(); },
    handleInput(data: string) { selectList.handleInput(data); },
  };
}
386
+
387
/**
 * Checks whether `model` is available on the configured Ollama host and,
 * if it is not, asks the user whether to pull it.
 *
 * The pull itself is started in the background and not awaited; its outcome
 * is reported through notifications ("<model> ready" or "Pull failed: …").
 *
 * @param model Ollama model name to check.
 * @param ctx   Extension context used for the confirm dialog and notifications.
 */
async function ensureModelPulled(model: string, ctx: ExtensionContext) {
  const { ollamaHost } = getConfig();

  const alreadyPresent = await ollamaCheckModel(ollamaHost, model);
  if (alreadyPresent) return;

  const wantsPull = await ctx.ui.confirm(
    "Model not found",
    `"${model}" is not pulled locally.\n\nPull it now? (ollama pull ${model})`,
  );
  if (!wantsPull) return;

  ctx.ui.notify(`Pulling ${model}…`, "info");

  // Deliberately not awaited: the pull runs in the background.
  void (async () => {
    try {
      await ollamaPullModel(model);
      ctx.ui.notify(`${model} ready`, "info");
    } catch (e: any) {
      ctx.ui.notify(`Pull failed: ${e.message}`.slice(0, 200), "error");
    }
  })();
}
403
+
404
+ // ── Status bar ─────────────────────────────────────────────────────────────
405
+
406
/**
 * Refreshes the "minimodel-ocr" status entry with the active backend,
 * including the model name when the backend is Ollama.
 *
 * @param ctx Extension context whose UI status is updated.
 */
function updateStatus(ctx: ExtensionContext) {
  const { backend, model } = getConfig();

  let label: string;
  if (backend === "ollama") {
    label = `OCR: ollama ${model}`;
  } else {
    label = `OCR: ${backend}`;
  }

  ctx.ui.setStatus("minimodel-ocr", label);
}
413
+
414
+ // ── Startup ────────────────────────────────────────────────────────────────
415
+
416
/**
 * On session start: show the OCR status entry and, on macOS with the Ollama
 * backend selected, warn if the `pdftoppm` executable cannot be run.
 */
pi.on("session_start", async (_event, ctx) => {
  updateStatus(ctx);

  // Proactive check: macOS multi-page PDF support
  const shouldProbe = process.platform === "darwin" && getConfig().backend === "ollama";
  if (!shouldProbe) return;

  const { spawn } = await import("node:child_process");

  // Available only when `pdftoppm -v` exits with code 0; a spawn error counts as missing.
  const hasPdftoppm = await new Promise<boolean>((settle) => {
    const probe = spawn("pdftoppm", ["-v"], { stdio: "ignore" });
    probe.on("close", (exitCode) => settle(exitCode === 0));
    probe.on("error", () => settle(false));
  });
  if (hasPdftoppm) return;

  ctx.ui.notify("💡 Multi-page PDF via Ollama needs pdftoppm: brew install poppler", "warning");
});
432
+
433
+ console.log("[pi-ocr] Loaded — /ocr (file or settings), tool: minimodel_ocr, default: mineru");
434
+ }