npm - pi-ocr - Versions diffs - 1.0.2 → 1.1.1 - Mend

pi-ocr 1.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/README.md CHANGED Viewed

@@ -29,7 +29,8 @@ Switch anytime with `/ocr` (no args).
 |---|---|---|---|
 | ☁️ | **MinerU** (default) | PDFs, tables, general docs | None — works instantly |
 | 🦙 | Ollama | Math formulas → LaTeX, offline | `brew install ollama && ollama pull glm-ocr` |
-| 📐 | Pix2Text | Math + text, CPU-only | `pip install pix2text` |
+| 🔤 | Tesseract | Plain text, ultra-light (~30MB) | `brew install tesseract` |
+| 📐 | Pix2Text | Math + text, CPU-only (Python) | `pip install pix2text` |
 ---
@@ -63,7 +64,20 @@ Switch with `/ocr` → "OCR Backend" → ollama.
 ---
-## Pix2Text (optional, for offline CPU)
+## Tesseract (optional, for plain text)
+Classic OCR engine. Ultra-lightweight (~30MB), CPU-only, fast. System package, zero Python.
+```bash
+brew install tesseract              # macOS
+sudo apt install tesseract-ocr      # Linux
+```
+Switch with `/ocr` → "OCR Backend" → tesseract.
+---
+## Pix2Text (optional, for math + text on CPU)
 Local Python OCR. Mathpix alternative — handles text + formulas on CPU.
@@ -103,6 +117,8 @@ First run downloads ONNX models (~50MB). Switch with `/ocr` → "OCR Backend"
 **"python3 not found" (Pix2Text)** → `python3 -m pip install pix2text`
+**"tesseract not found"** → `brew install tesseract` (macOS) / `sudo apt install tesseract-ocr` (Linux)
 **"pdftoppm not found" (Ollama multi-page)** → `brew install poppler` (macOS) / `sudo apt install poppler-utils` (Linux)
 ---

package/extensions/index.ts CHANGED Viewed

@@ -45,6 +45,7 @@ import type { Backend, Task, OcrConfig } from "./types";
 import { TASKS, BACKENDS } from "./types";
 import { isImage, isPdf, getPdfPageCount, ollamaOcr, ollamaCheckModel, ollamaPullModel } from "./ollama";
 import { mineruOcr } from "./mineru";
+import { tesseractOcr } from "./tesseract";
 import { pix2textOcr } from "./pix2text";
 // ── Config persistence ───────────────────────────────────────────────────────
@@ -138,7 +139,7 @@ const ocrTool = defineTool({
       throw new Error(`Unsupported file type "${extname(filePath)}". Supported: PNG, JPG, GIF, WEBP, BMP, TIFF, PDF.`);
     }
-    const backendLabel = { ollama: "🦙 Ollama", mineru: "☁️ MinerU", pix2text: "📐 Pix2Text" }[config.backend];
+    const backendLabel = { mineru: "☁️ MinerU", ollama: "🦙 Ollama", tesseract: "🔤 Tesseract", pix2text: "📐 Pix2Text" }[config.backend];
     onUpdate?.({ content: [{ type: "text", text: `🔍 OCR ${basename(filePath)} via ${backendLabel} (${resolvedTask})…` }], details: {} });
     const onProgress = (msg: string) => onUpdate?.({ content: [{ type: "text", text: msg }], details: {} });
@@ -159,6 +160,9 @@ const ocrTool = defineTool({
           result = await mineruOcr(filePath, resolvedTask, config.mineruSplitPdf, signal, onProgress);
           break;
         }
+        case "tesseract":
+          result = await tesseractOcr(filePath, resolvedTask, signal, onProgress);
+          break;
         case "pix2text":
           result = await pix2textOcr(filePath, resolvedTask, signal, onProgress);
           break;
@@ -175,6 +179,7 @@ const ocrTool = defineTool({
       const msg = e.message || String(e);
       let hint = "";
       if (config.backend === "ollama" && (msg.includes("fetch failed") || msg.includes("ECONNREFUSED"))) hint = "\n\n💡 Is Ollama running? Start: `ollama serve`";
+      else if (config.backend === "tesseract" && msg.includes("not found")) hint = "\n\n💡 Install: `brew install tesseract` (macOS) or `sudo apt install tesseract-ocr` (Linux)";
       else if (config.backend === "pix2text" && msg.includes("python3")) hint = "\n\n💡 Install: `pip install pix2text`";
       else if (config.backend === "mineru" && msg.includes("429")) hint = "\n\n💡 MinerU rate limit. Wait a minute or switch backend with /ocr.";
       else if (config.backend === "mineru" && msg.includes("too large")) hint = "\n\n💡 Compress at https://ilovepdf.com/compress_pdf or switch backend.";
@@ -292,7 +297,9 @@ export default function ocrExtension(pi: ExtensionAPI) {
                     ".\nLarge files? Compress at https://ilovepdf.com/compress_pdf",
                     "info",
                   );
-                } else if (backend === "pix2text") {
+                } else if (backend === "tesseract") {
+                ctx.ui.notify("🔤 Tesseract: `brew install tesseract` (macOS) or `sudo apt install tesseract-ocr` (Linux). ~30MB, CPU-only.", "warning");
+              } else if (backend === "pix2text") {
                   ctx.ui.notify("🐍 Pix2Text: needs `pip install pix2text`", "warning");
                 }
                 break;

package/extensions/tesseract.ts ADDED Viewed

@@ -0,0 +1,132 @@
+/**
+ * pi-ocr — Tesseract backend
+ *
+ * Uses Tesseract OCR (https://github.com/tesseract-ocr/tesseract) —
+ * the classic open-source OCR engine. Ultra-lightweight (~30MB),
+ * zero Python deps, CPU-only, fast on plain text.
+ *
+ * Prerequisites:
+ *   brew install tesseract        # macOS
+ *   sudo apt install tesseract-ocr # Linux
+ *
+ * For non-English languages, install the corresponding lang pack:
+ *   brew install tesseract-lang   # macOS (all languages)
+ *   sudo apt install tesseract-ocr-chi-sim  # Chinese simplified
+ */
+import { mkdtempSync, readdirSync, unlinkSync, rmdirSync } from "node:fs";
+import { basename, join } from "node:path";
+import { tmpdir } from "node:os";
+import { spawn } from "node:child_process";
+import type { Task, OcrResult, OcrProgressCallback } from "./types";
+import { isImage, isPdf, getPdfPageCount } from "./ollama";
+// ── Helpers ──────────────────────────────────────────────────────────────────
+/**
+ * Run an external command and capture its output. The returned promise never
+ * rejects: every failure is reported through the resolved `code` / `stderr`.
+ *
+ * @param cmd  Executable to launch.
+ * @param args Arguments handed to the executable as-is (no `shell` option is set).
+ * @returns Trimmed UTF-8 `stdout` / `stderr` plus the exit `code`. A process that
+ *          could not be started, or that closed without an exit code, yields
+ *          `code: 1`; on a start failure `stderr` names the command that failed.
+ */
+async function execCapture(cmd: string, args: string[]): Promise<{ stdout: string; stderr: string; code: number }> {
+  return new Promise((resolve) => {
+    const child = spawn(cmd, args, { stdio: ["ignore", "pipe", "pipe"] });
+    const out: Buffer[] = [];
+    const err: Buffer[] = [];
+    child.stdout.on("data", (d: Buffer) => out.push(d));
+    child.stderr.on("data", (d: Buffer) => err.push(d));
+    child.on("error", (e: NodeJS.ErrnoException) => {
+      // This helper also launches sips and pdftoppm, so report the command that
+      // actually failed instead of always blaming tesseract.
+      let stderr: string;
+      if (e.code !== "ENOENT") {
+        stderr = `${cmd} failed to start: ${e.message}`;
+      } else if (cmd === "tesseract") {
+        // Kept byte-for-byte: callers match on this text to show an install hint.
+        stderr = "tesseract not found. Install: brew install tesseract";
+      } else {
+        stderr = `${cmd} not found`;
+      }
+      // If "close" fires afterwards its resolve() is a no-op: the first settle wins.
+      resolve({ stdout: "", stderr, code: 1 });
+    });
+    child.on("close", (code) => resolve({
+      stdout: Buffer.concat(out).toString("utf8").trim(),
+      stderr: Buffer.concat(err).toString("utf8").trim(),
+      code: code ?? 1,
+    }));
+  });
+}
+/**
+ * Best-effort removal of a flat temporary directory: unlink every entry, then
+ * remove the directory itself. Any filesystem error stops the cleanup and is
+ * deliberately ignored.
+ *
+ * @param dir Directory to delete.
+ */
+function cleanupDir(dir: string) {
+  try {
+    readdirSync(dir).forEach((entry) => {
+      unlinkSync(join(dir, entry));
+    });
+    rmdirSync(dir);
+  } catch {
+    /* best effort */
+  }
+}
+// ── PDF → image conversion (reuses ollama helpers approach) ──────────────────
+/**
+ * Render a single PDF page to a PNG file.
+ *
+ * On macOS the first page is attempted with `sips`; every other page, every
+ * other platform, and a failed `sips` run go through `pdftoppm` (`-r 200`).
+ *
+ * @param pdfPath   Path of the source PDF.
+ * @param pageIndex Zero-based page index (converted to a 1-based page number for pdftoppm).
+ * @param outPath   Destination file; a trailing `.png` is stripped before it is
+ *                  handed to pdftoppm as the output root.
+ * @throws Error when `pdftoppm` exits with a non-zero code, so the caller can
+ *         mark the page as skipped instead of running OCR on a missing image.
+ */
+async function convertPdfPage(pdfPath: string, pageIndex: number, outPath: string): Promise<void> {
+  if (process.platform === "darwin" && pageIndex === 0) {
+    const sips = await execCapture("sips", ["-s", "format", "png", pdfPath, "--out", outPath]);
+    if (sips.code === 0) return;
+    // sips failed: fall through and let pdftoppm try the same page.
+  }
+  const pageNumber = String(pageIndex + 1);
+  const { stderr, code } = await execCapture("pdftoppm", [
+    "-png", "-r", "200", "-f", pageNumber, "-l", pageNumber,
+    "-singlefile", pdfPath, outPath.replace(/\.png$/, ""),
+  ]);
+  if (code !== 0) {
+    const detail = (stderr || `exit code ${code}`).slice(0, 500);
+    throw new Error(`pdftoppm could not render page ${pageNumber}: ${detail}`);
+  }
+}
+// ── Tesseract OCR ────────────────────────────────────────────────────────────
+/**
+ * OCR one image with the `tesseract` CLI and return the recognised text.
+ *
+ * @param imagePath Image file passed to tesseract.
+ * @param _task     Accepted for signature parity; not used by this backend.
+ * @returns The text tesseract wrote to stdout.
+ * @throws Error with an install hint when the failure output mentions
+ *         "not found" or "ENOENT"; otherwise an Error carrying the first
+ *         500 characters of the failure output.
+ */
+async function tesseractImage(imagePath: string, _task: Task): Promise<string> {
+  const cliArgs = [
+    imagePath, "stdout",
+    "-l", "eng+chi_sim",   // English + Chinese simplified
+    "--psm", "3",  // Auto page segmentation
+  ];
+  const run = await execCapture("tesseract", cliArgs);
+  if (run.code === 0) return run.stdout;
+
+  const reason = run.stderr || "tesseract failed";
+  const binaryMissing = ["not found", "ENOENT"].some((needle) => reason.includes(needle));
+  throw new Error(
+    binaryMissing ? "tesseract not found. Install: brew install tesseract" : reason.slice(0, 500),
+  );
+}
+// ── Public API ───────────────────────────────────────────────────────────────
+/**
+ * Run Tesseract OCR on an image or on every page of a PDF.
+ *
+ * PDFs are rendered page by page into a temporary directory; each page becomes
+ * a `## Page N` section in the result. A page whose conversion throws is
+ * reported as skipped, and a page with no recognised text is flagged as such.
+ * The temporary directory is removed on every exit path.
+ *
+ * @param filePath   Image or PDF to process.
+ * @param task       Forwarded to the per-image OCR call and echoed in `details`.
+ * @param signal     Optional abort signal, checked before each OCR run.
+ * @param onProgress Receives human-readable progress messages.
+ * @returns `{ text, details: { backend: "tesseract", task } }`.
+ * @throws Error("Aborted") when `signal` is aborted, an "Unsupported file type"
+ *         error for anything that is neither PDF nor image, and any error
+ *         raised by the per-image OCR call.
+ */
+export async function tesseractOcr(
+  filePath: string, task: Task,
+  signal: AbortSignal | undefined, onProgress: OcrProgressCallback,
+): Promise<OcrResult> {
+  let tmpDir: string | null = null;
+  try {
+    let resultText: string;
+    if (isPdf(filePath)) {
+      onProgress("📄 Converting PDF pages to images…");
+      tmpDir = mkdtempSync(join(tmpdir(), "pi-tesseract-"));
+      const pageCount = await getPdfPageCount(filePath);
+      const pageResults: string[] = [];
+      for (let i = 0; i < pageCount; i++) {
+        if (signal?.aborted) throw new Error("Aborted");
+        const heading = `## Page ${i + 1}`;
+        const pageOut = join(tmpDir, `page_${i + 1}.png`);
+        try {
+          await convertPdfPage(filePath, i, pageOut);
+        } catch (e: unknown) {
+          // A page that cannot be rendered is noted and skipped, not fatal.
+          const reason = e instanceof Error ? e.message : String(e);
+          pageResults.push(`${heading}\n\n> ⚠️ Skipped: ${reason}`);
+          continue;
+        }
+        onProgress(`📄 Page ${i + 1}/${pageCount}`);
+        const pageText = await tesseractImage(pageOut, task);
+        pageResults.push(
+          pageText.trim() ? `${heading}\n\n${pageText}` : `${heading}\n\n> ⚠️ No text detected`,
+        );
+      }
+      resultText = pageResults.join("\n\n");
+    } else if (isImage(filePath)) {
+      // Honour an already-aborted signal here too, as the PDF path does.
+      if (signal?.aborted) throw new Error("Aborted");
+      resultText = await tesseractImage(filePath, task);
+    } else {
+      throw new Error(`Unsupported file type: ${basename(filePath)}`);
+    }
+    return { text: resultText, details: { backend: "tesseract", task } };
+  } finally {
+    if (tmpDir) cleanupDir(tmpDir);
+  }
+}

package/extensions/types.ts CHANGED Viewed

@@ -6,7 +6,7 @@ export const TASKS = ["text", "formula", "table", "figure", "auto"] as const;
 export type Task = (typeof TASKS)[number];
 /** All supported OCR backends */
-export const BACKENDS = ["ollama", "mineru", "pix2text"] as const;
+export const BACKENDS = ["mineru", "ollama", "tesseract", "pix2text"] as const;
 export type Backend = (typeof BACKENDS)[number];
 export interface OcrConfig {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "pi-ocr",
-  "version": "1.0.2",
+  "version": "1.1.1",
   "description": "Pi extension: Zero-setup multi-backend OCR — MinerU (free cloud), Ollama (local GPU, LaTeX formulas), Pix2Text (local Python). Extract text, formulas, and tables from images and PDFs. Default: zero config, works out of the box.",
   "keywords": [
     "pi-package",