pi-ocr 1.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -29,7 +29,8 @@ Switch anytime with `/ocr` (no args).
29
29
  |---|---|---|---|
30
30
  | ☁️ | **MinerU** (default) | PDFs, tables, general docs | None — works instantly |
31
31
  | 🦙 | Ollama | Math formulas → LaTeX, offline | `brew install ollama && ollama pull glm-ocr` |
32
- | 📐 | Pix2Text | Math + text, CPU-only | `pip install pix2text` |
32
+ | 🔤 | Tesseract | Plain text, ultra-light (~30MB) | `brew install tesseract` |
33
+ | 📐 | Pix2Text | Math + text, CPU-only (Python) | `pip install pix2text` |
33
34
 
34
35
  ---
35
36
 
@@ -63,7 +64,20 @@ Switch with `/ocr` → "OCR Backend" → ollama.
63
64
 
64
65
  ---
65
66
 
66
- ## Pix2Text (optional, for offline CPU)
67
+ ## Tesseract (optional, for plain text)
68
+
69
+ Classic OCR engine: ultra-lightweight (~30MB), CPU-only, and fast. Installed as a system package — no Python required.
70
+
71
+ ```bash
72
+ brew install tesseract # macOS
73
+ sudo apt install tesseract-ocr # Linux
74
+ ```
75
+
76
+ Switch with `/ocr` → "OCR Backend" → tesseract.
77
+
78
+ ---
79
+
80
+ ## Pix2Text (optional, for math + text on CPU)
67
81
 
68
82
  Local Python OCR. Mathpix alternative — handles text + formulas on CPU.
69
83
 
@@ -103,6 +117,8 @@ First run downloads ONNX models (~50MB). Switch with `/ocr` → "OCR Backend"
103
117
 
104
118
  **"python3 not found" (Pix2Text)** → `python3 -m pip install pix2text`
105
119
 
120
+ **"tesseract not found"** → `brew install tesseract` (macOS) / `sudo apt install tesseract-ocr` (Linux)
121
+
106
122
  **"pdftoppm not found" (Ollama multi-page)** → `brew install poppler` (macOS) / `sudo apt install poppler-utils` (Linux)
107
123
 
108
124
  ---
@@ -45,6 +45,7 @@ import type { Backend, Task, OcrConfig } from "./types";
45
45
  import { TASKS, BACKENDS } from "./types";
46
46
  import { isImage, isPdf, getPdfPageCount, ollamaOcr, ollamaCheckModel, ollamaPullModel } from "./ollama";
47
47
  import { mineruOcr } from "./mineru";
48
+ import { tesseractOcr } from "./tesseract";
48
49
  import { pix2textOcr } from "./pix2text";
49
50
 
50
51
  // ── Config persistence ───────────────────────────────────────────────────────
@@ -138,7 +139,7 @@ const ocrTool = defineTool({
138
139
  throw new Error(`Unsupported file type "${extname(filePath)}". Supported: PNG, JPG, GIF, WEBP, BMP, TIFF, PDF.`);
139
140
  }
140
141
 
141
- const backendLabel = { ollama: "🦙 Ollama", mineru: "☁️ MinerU", pix2text: "📐 Pix2Text" }[config.backend];
142
+ const backendLabel = { mineru: "☁️ MinerU", ollama: "🦙 Ollama", tesseract: "🔤 Tesseract", pix2text: "📐 Pix2Text" }[config.backend];
142
143
  onUpdate?.({ content: [{ type: "text", text: `🔍 OCR ${basename(filePath)} via ${backendLabel} (${resolvedTask})…` }], details: {} });
143
144
 
144
145
  const onProgress = (msg: string) => onUpdate?.({ content: [{ type: "text", text: msg }], details: {} });
@@ -159,6 +160,9 @@ const ocrTool = defineTool({
159
160
  result = await mineruOcr(filePath, resolvedTask, config.mineruSplitPdf, signal, onProgress);
160
161
  break;
161
162
  }
163
+ case "tesseract":
164
+ result = await tesseractOcr(filePath, resolvedTask, signal, onProgress);
165
+ break;
162
166
  case "pix2text":
163
167
  result = await pix2textOcr(filePath, resolvedTask, signal, onProgress);
164
168
  break;
@@ -175,6 +179,7 @@ const ocrTool = defineTool({
175
179
  const msg = e.message || String(e);
176
180
  let hint = "";
177
181
  if (config.backend === "ollama" && (msg.includes("fetch failed") || msg.includes("ECONNREFUSED"))) hint = "\n\n💡 Is Ollama running? Start: `ollama serve`";
182
+ else if (config.backend === "tesseract" && msg.includes("not found")) hint = "\n\n💡 Install: `brew install tesseract` (macOS) or `sudo apt install tesseract-ocr` (Linux)";
178
183
  else if (config.backend === "pix2text" && msg.includes("python3")) hint = "\n\n💡 Install: `pip install pix2text`";
179
184
  else if (config.backend === "mineru" && msg.includes("429")) hint = "\n\n💡 MinerU rate limit. Wait a minute or switch backend with /ocr.";
180
185
  else if (config.backend === "mineru" && msg.includes("too large")) hint = "\n\n💡 Compress at https://ilovepdf.com/compress_pdf or switch backend.";
@@ -292,7 +297,9 @@ export default function ocrExtension(pi: ExtensionAPI) {
292
297
  ".\nLarge files? Compress at https://ilovepdf.com/compress_pdf",
293
298
  "info",
294
299
  );
295
- } else if (backend === "pix2text") {
300
+ } else if (backend === "tesseract") {
301
+ ctx.ui.notify("🔤 Tesseract: `brew install tesseract` (macOS) or `sudo apt install tesseract-ocr` (Linux). ~30MB, CPU-only.", "warning");
302
+ } else if (backend === "pix2text") {
296
303
  ctx.ui.notify("🐍 Pix2Text: needs `pip install pix2text`", "warning");
297
304
  }
298
305
  break;
@@ -0,0 +1,132 @@
1
+ /**
2
+ * pi-ocr — Tesseract backend
3
+ *
4
+ * Uses Tesseract OCR (https://github.com/tesseract-ocr/tesseract) —
5
+ * the classic open-source OCR engine. Ultra-lightweight (~30MB),
6
+ * zero Python deps, CPU-only, fast on plain text.
7
+ *
8
+ * Prerequisites:
9
+ * brew install tesseract # macOS
10
+ * sudo apt install tesseract-ocr # Linux
11
+ *
12
+ * For non-English languages, install the corresponding lang pack:
13
+ * brew install tesseract-lang # macOS (all languages)
14
+ * sudo apt install tesseract-ocr-chi-sim # Chinese simplified
15
+ */
16
+
17
+ import { mkdtempSync, readdirSync, unlinkSync, rmdirSync } from "node:fs";
18
+ import { basename, join } from "node:path";
19
+ import { tmpdir } from "node:os";
20
+ import { spawn } from "node:child_process";
21
+ import type { Task, OcrResult, OcrProgressCallback } from "./types";
22
+ import { isImage, isPdf, getPdfPageCount } from "./ollama";
23
+
24
+ // ── Helpers ──────────────────────────────────────────────────────────────────
25
+
26
/**
 * Runs `cmd` with `args` (no shell) and captures its output.
 *
 * This never rejects. A process that cannot be started is reported as
 * `code: 1` with an explanatory `stderr`, so callers only have to inspect
 * the returned exit code.
 *
 * @param cmd - Executable to launch.
 * @param args - Arguments passed verbatim to the executable.
 * @returns Trimmed UTF-8 stdout/stderr plus the exit code. The code is 1 when
 *          the process could not be spawned or exited without a numeric code.
 */
async function execCapture(cmd: string, args: string[]): Promise<{ stdout: string; stderr: string; code: number }> {
  return new Promise((resolve) => {
    const child = spawn(cmd, args, { stdio: ["ignore", "pipe", "pipe"] });
    const out: Buffer[] = [];
    const err: Buffer[] = [];
    child.stdout.on("data", (chunk: Buffer) => out.push(chunk));
    child.stderr.on("data", (chunk: Buffer) => err.push(chunk));

    child.on("error", (spawnError: Error) => {
      // Name the command that actually failed: this helper also launches
      // `sips` and `pdftoppm`, not just `tesseract`.
      let stderr: string;
      if (!spawnError.message.includes("ENOENT")) {
        stderr = `${cmd} failed to start: ${spawnError.message}`;
      } else if (cmd === "tesseract") {
        // Kept verbatim: callers match on this exact message.
        stderr = "tesseract not found. Install: brew install tesseract";
      } else {
        stderr = `${cmd} not found`;
      }
      resolve({ stdout: "", stderr, code: 1 });
    });

    // If "error" already resolved the promise, this second resolve is a no-op.
    child.on("close", (code) => resolve({
      stdout: Buffer.concat(out).toString("utf8").trim(),
      stderr: Buffer.concat(err).toString("utf8").trim(),
      code: code ?? 1,
    }));
  });
}
43
+
44
/**
 * Best-effort removal of a temp directory: unlinks every entry directly
 * inside `dir`, then removes `dir` itself. Any failure is swallowed, because
 * a leftover temp directory must not fail the OCR call.
 */
function cleanupDir(dir: string) {
  try {
    readdirSync(dir)
      .map((entry) => join(dir, entry))
      .forEach((entryPath) => unlinkSync(entryPath));
    rmdirSync(dir);
  } catch {
    /* best effort */
  }
}
50
+
51
+ // ── PDF → image conversion (reuses ollama helpers approach) ──────────────────
52
+
53
/**
 * Renders one page of a PDF to a PNG file.
 *
 * On macOS the first page is converted with `sips`; every other case uses
 * `pdftoppm` at 200 DPI.
 *
 * @param pdfPath - Source PDF.
 * @param pageIndex - Zero-based page index.
 * @param outPath - Destination path, expected to end in `.png`.
 * @throws Error when the converter cannot be started or exits non-zero, so
 *         the caller can skip this page instead of running OCR on a file
 *         that was never written.
 */
async function convertPdfPage(pdfPath: string, pageIndex: number, outPath: string): Promise<void> {
  const useSips = process.platform === "darwin" && pageIndex === 0;
  const tool = useSips ? "sips" : "pdftoppm";
  const pageNumber = String(pageIndex + 1);
  const args = useSips
    ? ["-s", "format", "png", pdfPath, "--out", outPath]
    : [
        "-png", "-r", "200", "-f", pageNumber, "-l", pageNumber,
        // The ".png" suffix is stripped from the output argument; presumably
        // pdftoppm appends the extension itself — confirm against its manual.
        "-singlefile", pdfPath, outPath.replace(/\.png$/, ""),
      ];

  const { stderr, code } = await execCapture(tool, args);
  if (code !== 0) {
    throw new Error((stderr || `${tool} exited with code ${code}`).slice(0, 500));
  }
}
65
+
66
+ // ── Tesseract OCR ────────────────────────────────────────────────────────────
67
+
68
/**
 * Runs the `tesseract` CLI on one image and returns the recognised text.
 *
 * @param imagePath - Image file to OCR.
 * @param _task - Accepted for signature parity with the other backends; not used here.
 * @returns Text written by tesseract to stdout (already trimmed by `execCapture`).
 * @throws Error with an install hint when the `tesseract` binary is missing,
 *         otherwise an Error carrying up to 500 characters of tesseract's stderr.
 */
async function tesseractImage(imagePath: string, _task: Task): Promise<string> {
  const { stdout, stderr, code } = await execCapture("tesseract", [
    imagePath, "stdout",
    "-l", "eng+chi_sim", // English + Chinese simplified
    "--psm", "3",        // Auto page segmentation
  ]);

  if (code === 0) return stdout;

  const msg = stderr || "tesseract failed";
  // Match only the launcher's own missing-binary message. A bare "not found"
  // substring also matches tesseract's stderr for an unreadable input image
  // (e.g. "image file not found"), which would wrongly suggest reinstalling.
  const binaryMissing = msg.startsWith("tesseract not found") || msg.includes("spawn tesseract ENOENT");
  if (binaryMissing) {
    throw new Error("tesseract not found. Install: brew install tesseract");
  }
  throw new Error(msg.slice(0, 500));
}
85
+
86
+ // ── Public API ───────────────────────────────────────────────────────────────
87
+
88
/**
 * OCRs an image or a PDF with the Tesseract backend.
 *
 * PDFs are rendered page by page into a temp directory, and each page's text
 * is emitted under a `## Page N` heading. A page whose conversion fails is
 * recorded as skipped rather than aborting the document. The temp directory
 * is removed on both success and failure.
 *
 * @param filePath - Image or PDF to process.
 * @param task - Requested task; forwarded to the per-image call and echoed in `details`.
 * @param signal - Optional abort signal, checked before each unit of work
 *                 (a tesseract process that is already running is not interrupted).
 * @param onProgress - Receives human-readable progress messages.
 * @returns The extracted text plus `{ backend: "tesseract", task }` details.
 * @throws Error "Aborted" when `signal` is aborted, an "Unsupported file type"
 *         Error for other inputs, and any error raised while running tesseract.
 */
export async function tesseractOcr(
  filePath: string, task: Task,
  signal: AbortSignal | undefined, onProgress: OcrProgressCallback,
): Promise<OcrResult> {
  let resultText = "";
  let tmpDir: string | null = null;

  try {
    if (isPdf(filePath)) {
      onProgress("📄 Converting PDF pages to images…");
      tmpDir = mkdtempSync(join(tmpdir(), "pi-tesseract-"));
      const pageCount = await getPdfPageCount(filePath);

      const pageResults: string[] = [];
      for (let i = 0; i < pageCount; i++) {
        if (signal?.aborted) throw new Error("Aborted");
        const pageNumber = i + 1;
        const pageOut = join(tmpDir, `page_${pageNumber}.png`);

        try {
          await convertPdfPage(filePath, i, pageOut);
        } catch (e: unknown) {
          // Narrow before reading `.message`: a non-Error throw would otherwise print "undefined".
          const reason = e instanceof Error ? e.message : String(e);
          pageResults.push(`## Page ${pageNumber}\n\n> ⚠️ Skipped: ${reason}`);
          continue;
        }

        onProgress(`📄 Page ${pageNumber}/${pageCount}`);
        const pageText = await tesseractImage(pageOut, task);
        pageResults.push(
          pageText.trim()
            ? `## Page ${pageNumber}\n\n${pageText}`
            : `## Page ${pageNumber}\n\n> ⚠️ No text detected`,
        );
      }
      resultText = pageResults.join("\n\n");
    } else if (isImage(filePath)) {
      // Honour cancellation for single images too, not only inside the PDF loop.
      if (signal?.aborted) throw new Error("Aborted");
      resultText = await tesseractImage(filePath, task);
    } else {
      throw new Error(`Unsupported file type: ${basename(filePath)}`);
    }

    return { text: resultText, details: { backend: "tesseract", task } };
  } finally {
    if (tmpDir) cleanupDir(tmpDir);
  }
}
@@ -6,7 +6,7 @@ export const TASKS = ["text", "formula", "table", "figure", "auto"] as const;
6
6
  export type Task = (typeof TASKS)[number];
7
7
 
8
8
  /** All supported OCR backends */
9
- export const BACKENDS = ["ollama", "mineru", "pix2text"] as const;
9
+ export const BACKENDS = ["mineru", "ollama", "tesseract", "pix2text"] as const;
10
10
  export type Backend = (typeof BACKENDS)[number];
11
11
 
12
12
  export interface OcrConfig {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pi-ocr",
3
- "version": "1.0.2",
3
+ "version": "1.1.1",
4
4
  "description": "Pi extension: Zero-setup multi-backend OCR — MinerU (free cloud), Ollama (local GPU, LaTeX formulas), Pix2Text (local Python). Extract text, formulas, and tables from images and PDFs. Default: zero config, works out of the box.",
5
5
  "keywords": [
6
6
  "pi-package",