pi-ocr 1.0.2 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -2
- package/extensions/index.ts +9 -2
- package/extensions/tesseract.ts +132 -0
- package/extensions/types.ts +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -29,7 +29,8 @@ Switch anytime with `/ocr` (no args).
|
|
|
29
29
|
|---|---|---|---|
|
|
30
30
|
| ☁️ | **MinerU** (default) | PDFs, tables, general docs | None — works instantly |
|
|
31
31
|
| 🦙 | Ollama | Math formulas → LaTeX, offline | `brew install ollama && ollama pull glm-ocr` |
|
|
32
|
-
|
|
|
32
|
+
| 🔤 | Tesseract | Plain text, ultra-light (~30MB) | `brew install tesseract` |
|
|
33
|
+
| 📐 | Pix2Text | Math + text, CPU Python | `pip install pix2text` |
|
|
33
34
|
|
|
34
35
|
---
|
|
35
36
|
|
|
@@ -63,7 +64,20 @@ Switch with `/ocr` → "OCR Backend" → ollama.
|
|
|
63
64
|
|
|
64
65
|
---
|
|
65
66
|
|
|
66
|
-
## Pix2Text (optional, for math + text on CPU)
|
|
67
|
+
## Tesseract (optional, for plain text)
|
|
68
|
+
|
|
69
|
+
Classic OCR engine. Ultra-lightweight (~30MB), CPU-only, fast. System package, zero Python.
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
brew install tesseract # macOS
|
|
73
|
+
sudo apt install tesseract-ocr # Linux
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Switch with `/ocr` → "OCR Backend" → tesseract.
|
|
77
|
+
|
|
78
|
+
---
|
|
79
|
+
|
|
80
|
+
## Pix2Text (optional, for math + text on CPU)
|
|
67
81
|
|
|
68
82
|
Local Python OCR. Mathpix alternative — handles text + formulas on CPU.
|
|
69
83
|
|
|
@@ -103,6 +117,8 @@ First run downloads ONNX models (~50MB). Switch with `/ocr` → "OCR Backend"
|
|
|
103
117
|
|
|
104
118
|
**"python3 not found" (Pix2Text)** → `python3 -m pip install pix2text`
|
|
105
119
|
|
|
120
|
+
**"tesseract not found"** → `brew install tesseract` (macOS) / `sudo apt install tesseract-ocr` (Linux)
|
|
121
|
+
|
|
106
122
|
**"pdftoppm not found" (Ollama multi-page)** → `brew install poppler` (macOS) / `sudo apt install poppler-utils` (Linux)
|
|
107
123
|
|
|
108
124
|
---
|
package/extensions/index.ts
CHANGED
|
@@ -45,6 +45,7 @@ import type { Backend, Task, OcrConfig } from "./types";
|
|
|
45
45
|
import { TASKS, BACKENDS } from "./types";
|
|
46
46
|
import { isImage, isPdf, getPdfPageCount, ollamaOcr, ollamaCheckModel, ollamaPullModel } from "./ollama";
|
|
47
47
|
import { mineruOcr } from "./mineru";
|
|
48
|
+
import { tesseractOcr } from "./tesseract";
|
|
48
49
|
import { pix2textOcr } from "./pix2text";
|
|
49
50
|
|
|
50
51
|
// ── Config persistence ───────────────────────────────────────────────────────
|
|
@@ -138,7 +139,7 @@ const ocrTool = defineTool({
|
|
|
138
139
|
throw new Error(`Unsupported file type "${extname(filePath)}". Supported: PNG, JPG, GIF, WEBP, BMP, TIFF, PDF.`);
|
|
139
140
|
}
|
|
140
141
|
|
|
141
|
-
const backendLabel = { ollama: "🦙 Ollama",
|
|
142
|
+
const backendLabel = { mineru: "☁️ MinerU", ollama: "🦙 Ollama", tesseract: "🔤 Tesseract", pix2text: "📐 Pix2Text" }[config.backend];
|
|
142
143
|
onUpdate?.({ content: [{ type: "text", text: `🔍 OCR ${basename(filePath)} via ${backendLabel} (${resolvedTask})…` }], details: {} });
|
|
143
144
|
|
|
144
145
|
const onProgress = (msg: string) => onUpdate?.({ content: [{ type: "text", text: msg }], details: {} });
|
|
@@ -159,6 +160,9 @@ const ocrTool = defineTool({
|
|
|
159
160
|
result = await mineruOcr(filePath, resolvedTask, config.mineruSplitPdf, signal, onProgress);
|
|
160
161
|
break;
|
|
161
162
|
}
|
|
163
|
+
case "tesseract":
|
|
164
|
+
result = await tesseractOcr(filePath, resolvedTask, signal, onProgress);
|
|
165
|
+
break;
|
|
162
166
|
case "pix2text":
|
|
163
167
|
result = await pix2textOcr(filePath, resolvedTask, signal, onProgress);
|
|
164
168
|
break;
|
|
@@ -175,6 +179,7 @@ const ocrTool = defineTool({
|
|
|
175
179
|
const msg = e.message || String(e);
|
|
176
180
|
let hint = "";
|
|
177
181
|
if (config.backend === "ollama" && (msg.includes("fetch failed") || msg.includes("ECONNREFUSED"))) hint = "\n\n💡 Is Ollama running? Start: `ollama serve`";
|
|
182
|
+
else if (config.backend === "tesseract" && msg.includes("not found")) hint = "\n\n💡 Install: `brew install tesseract` (macOS) or `sudo apt install tesseract-ocr` (Linux)";
|
|
178
183
|
else if (config.backend === "pix2text" && msg.includes("python3")) hint = "\n\n💡 Install: `pip install pix2text`";
|
|
179
184
|
else if (config.backend === "mineru" && msg.includes("429")) hint = "\n\n💡 MinerU rate limit. Wait a minute or switch backend with /ocr.";
|
|
180
185
|
else if (config.backend === "mineru" && msg.includes("too large")) hint = "\n\n💡 Compress at https://ilovepdf.com/compress_pdf or switch backend.";
|
|
@@ -292,7 +297,9 @@ export default function ocrExtension(pi: ExtensionAPI) {
|
|
|
292
297
|
".\nLarge files? Compress at https://ilovepdf.com/compress_pdf",
|
|
293
298
|
"info",
|
|
294
299
|
);
|
|
295
|
-
} else if (backend === "
|
|
300
|
+
} else if (backend === "tesseract") {
|
|
301
|
+
ctx.ui.notify("🔤 Tesseract: `brew install tesseract` (macOS) or `sudo apt install tesseract-ocr` (Linux). ~30MB, CPU-only.", "warning");
|
|
302
|
+
} else if (backend === "pix2text") {
|
|
296
303
|
ctx.ui.notify("🐍 Pix2Text: needs `pip install pix2text`", "warning");
|
|
297
304
|
}
|
|
298
305
|
break;
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pi-ocr — Tesseract backend
|
|
3
|
+
*
|
|
4
|
+
* Uses Tesseract OCR (https://github.com/tesseract-ocr/tesseract) —
|
|
5
|
+
* the classic open-source OCR engine. Ultra-lightweight (~30MB),
|
|
6
|
+
* zero Python deps, CPU-only, fast on plain text.
|
|
7
|
+
*
|
|
8
|
+
* Prerequisites:
|
|
9
|
+
* brew install tesseract # macOS
|
|
10
|
+
* sudo apt install tesseract-ocr # Linux
|
|
11
|
+
*
|
|
12
|
+
* For non-English languages, install the corresponding lang pack:
|
|
13
|
+
* brew install tesseract-lang # macOS (all languages)
|
|
14
|
+
* sudo apt install tesseract-ocr-chi-sim # Chinese simplified
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import { mkdtempSync, readdirSync, unlinkSync, rmdirSync } from "node:fs";
|
|
18
|
+
import { basename, join } from "node:path";
|
|
19
|
+
import { tmpdir } from "node:os";
|
|
20
|
+
import { spawn } from "node:child_process";
|
|
21
|
+
import type { Task, OcrResult, OcrProgressCallback } from "./types";
|
|
22
|
+
import { isImage, isPdf, getPdfPageCount } from "./ollama";
|
|
23
|
+
|
|
24
|
+
// ── Helpers ──────────────────────────────────────────────────────────────────
|
|
25
|
+
|
|
26
|
+
/**
 * Run an external command and capture its output.
 *
 * The returned promise never rejects: a process that could not be started is
 * reported as `code: 1` with an explanatory `stderr`, so callers only have to
 * inspect the result object.
 *
 * @param cmd  Executable name or path passed to `spawn`.
 * @param args Argument vector (no shell is involved).
 * @returns Trimmed UTF-8 `stdout` / `stderr` and the exit code
 *          (`1` when the process reports no exit code).
 */
async function execCapture(cmd: string, args: string[]): Promise<{ stdout: string; stderr: string; code: number }> {
  return new Promise((resolve) => {
    const child = spawn(cmd, args, { stdio: ["ignore", "pipe", "pipe"] });
    const out: Buffer[] = [];
    const err: Buffer[] = [];
    child.stdout.on("data", (d: Buffer) => out.push(d));
    child.stderr.on("data", (d: Buffer) => err.push(d));

    child.on("error", (e: Error & { code?: string }) => {
      // Name the command that actually failed. This helper is also used for
      // `sips` and `pdftoppm`, so blaming tesseract unconditionally is wrong.
      let stderr: string;
      if (e.code === "ENOENT") {
        // Keep the historical wording for tesseract; callers match on "not found".
        const installHint = cmd === "tesseract" ? ". Install: brew install tesseract" : "";
        stderr = `${cmd} not found${installHint}`;
      } else {
        stderr = `failed to start ${cmd}: ${e.message}`;
      }
      resolve({ stdout: "", stderr, code: 1 });
    });

    // A later "close" after "error" is harmless: the promise is already settled.
    child.on("close", (code) => resolve({
      stdout: Buffer.concat(out).toString("utf8").trim(),
      stderr: Buffer.concat(err).toString("utf8").trim(),
      code: code ?? 1,
    }));
  });
}
|
|
43
|
+
|
|
44
|
+
/**
 * Best-effort removal of a flat scratch directory.
 *
 * Unlinks every entry directly inside `dir`, then removes `dir` itself.
 * Any failure (missing directory, entry that cannot be unlinked, …) is
 * swallowed on purpose: cleanup must never mask the real OCR outcome.
 *
 * @param dir Directory to remove.
 */
function cleanupDir(dir: string) {
  try {
    readdirSync(dir)
      .map((entry) => join(dir, entry))
      .forEach((entryPath) => unlinkSync(entryPath));
    rmdirSync(dir);
  } catch {
    // Intentionally ignored — best effort only.
  }
}
|
|
50
|
+
|
|
51
|
+
// ── PDF → image conversion (reuses ollama helpers approach) ──────────────────
|
|
52
|
+
|
|
53
|
+
/**
 * Render a single PDF page to a PNG file.
 *
 * On macOS the first page is attempted with `sips`. If that fails, and for
 * every other page or platform, `pdftoppm` is used.
 *
 * @param pdfPath   Path of the source PDF.
 * @param pageIndex Zero-based page index.
 * @param outPath   Destination PNG path.
 * @throws Error when `pdftoppm` cannot be started or exits non-zero. Callers
 *         rely on this to skip a page instead of handing a missing image to
 *         the OCR step.
 */
async function convertPdfPage(pdfPath: string, pageIndex: number, outPath: string): Promise<void> {
  if (process.platform === "darwin" && pageIndex === 0) {
    const sips = await execCapture("sips", ["-s", "format", "png", pdfPath, "--out", outPath]);
    if (sips.code === 0) return;
    // sips failed — fall through to pdftoppm rather than silently producing nothing.
  }

  const pageNumber = String(pageIndex + 1);
  const { stderr, code } = await execCapture("pdftoppm", [
    "-png", "-r", "200", "-f", pageNumber, "-l", pageNumber,
    // The output argument is passed without its ".png" suffix; presumably
    // pdftoppm treats it as a prefix and appends the extension — see poppler docs.
    "-singlefile", pdfPath, outPath.replace(/\.png$/, ""),
  ]);

  if (code !== 0) {
    const detail = stderr ? `: ${stderr.slice(0, 300)}` : "";
    throw new Error(`PDF page ${pageNumber} could not be converted (pdftoppm exit ${code})${detail}`);
  }
}
|
|
65
|
+
|
|
66
|
+
// ── Tesseract OCR ────────────────────────────────────────────────────────────
|
|
67
|
+
|
|
68
|
+
/**
 * OCR a single image with the `tesseract` CLI and return the recognized text.
 *
 * @param imagePath Path of the image to recognize.
 * @param _task     Currently unused; every task runs the same invocation.
 * @returns Text written by tesseract to stdout.
 * @throws Error with an install hint when the binary appears to be missing,
 *         otherwise an Error carrying up to 500 characters of stderr.
 */
async function tesseractImage(imagePath: string, _task: Task): Promise<string> {
  const cliArgs = [
    imagePath, "stdout",
    "-l", "eng+chi_sim", // English + Chinese simplified
    "--psm", "3",        // Auto page segmentation
  ];
  const run = await execCapture("tesseract", cliArgs);

  if (run.code === 0) return run.stdout;

  const reason = run.stderr || "tesseract failed";
  const binaryMissing = ["not found", "ENOENT"].some((marker) => reason.includes(marker));
  if (binaryMissing) {
    throw new Error("tesseract not found. Install: brew install tesseract");
  }
  throw new Error(reason.slice(0, 500));
}
|
|
85
|
+
|
|
86
|
+
// ── Public API ───────────────────────────────────────────────────────────────
|
|
87
|
+
|
|
88
|
+
/**
 * Tesseract backend entry point: OCR an image or every page of a PDF.
 *
 * PDFs are rendered page by page into a temporary directory, each page is
 * recognized separately, and the results are joined under `## Page N`
 * headings. The temporary directory is always removed afterwards.
 *
 * @param filePath   Image or PDF to process.
 * @param task       Requested OCR task; forwarded to the per-image step and echoed in `details`.
 * @param signal     Optional abort signal, checked before each PDF page.
 * @param onProgress Receives human-readable progress messages.
 * @returns Recognized text plus `{ backend: "tesseract", task }` details.
 * @throws Error("Aborted") when the signal fires between pages, and an
 *         "Unsupported file type" error for anything that is neither PDF nor image.
 */
export async function tesseractOcr(
  filePath: string, task: Task,
  signal: AbortSignal | undefined, onProgress: OcrProgressCallback,
): Promise<OcrResult> {
  let scratchDir: string | null = null;

  try {
    let text: string;

    if (isPdf(filePath)) {
      onProgress("📄 Converting PDF pages to images…");
      scratchDir = mkdtempSync(join(tmpdir(), "pi-tesseract-"));
      const total = await getPdfPageCount(filePath);

      const sections: string[] = [];
      for (let idx = 0; idx < total; idx += 1) {
        if (signal?.aborted) throw new Error("Aborted");

        const pageNo = idx + 1;
        const pngPath = join(scratchDir, `page_${pageNo}.png`);

        try {
          await convertPdfPage(filePath, idx, pngPath);
        } catch (e: any) {
          // A page that cannot be rendered is reported inline, not fatal.
          sections.push(`## Page ${pageNo}\n\n> ⚠️ Skipped: ${e.message}`);
          continue;
        }

        onProgress(`📄 Page ${pageNo}/${total}`);
        const recognized = await tesseractImage(pngPath, task);
        const hasText = recognized.trim().length > 0;
        sections.push(
          hasText
            ? `## Page ${pageNo}\n\n${recognized}`
            : `## Page ${pageNo}\n\n> ⚠️ No text detected`,
        );
      }

      text = sections.join("\n\n");
    } else if (isImage(filePath)) {
      text = await tesseractImage(filePath, task);
    } else {
      throw new Error(`Unsupported file type: ${basename(filePath)}`);
    }

    return { text, details: { backend: "tesseract", task } };
  } finally {
    // Remove rendered page images whether OCR succeeded or not.
    if (scratchDir) cleanupDir(scratchDir);
  }
}
|
package/extensions/types.ts
CHANGED
|
@@ -6,7 +6,7 @@ export const TASKS = ["text", "formula", "table", "figure", "auto"] as const;
|
|
|
6
6
|
export type Task = (typeof TASKS)[number];
|
|
7
7
|
|
|
8
8
|
/** All supported OCR backends */
|
|
9
|
-
export const BACKENDS = ["ollama", "
|
|
9
|
+
export const BACKENDS = ["mineru", "ollama", "tesseract", "pix2text"] as const;
|
|
10
10
|
export type Backend = (typeof BACKENDS)[number];
|
|
11
11
|
|
|
12
12
|
export interface OcrConfig {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pi-ocr",
|
|
3
|
-
"version": "1.0.2",
|
|
3
|
+
"version": "1.1.1",
|
|
4
4
|
"description": "Pi extension: Zero-setup multi-backend OCR — MinerU (free cloud), Ollama (local GPU, LaTeX formulas), Pix2Text (local Python). Extract text, formulas, and tables from images and PDFs. Default: zero config, works out of the box.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"pi-package",
|