pi-ocr 1.2.1 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -29,7 +29,7 @@ Switch anytime with `/ocr` (no args).
29
29
  |---|---|---|---|
30
30
  | ☁️ | **MinerU** (default) | PDFs, tables, general docs | None — works instantly |
31
31
  | 🦙 | Ollama | Math formulas → LaTeX, offline | `brew install ollama && ollama pull glm-ocr` |
32
- | 🔤 | Tesseract | Plain text, ultra-light (~30MB) | `brew install tesseract` |
32
+ | 🔤 | Tesseract | Plain text, no formulas (~30MB) | `brew install tesseract` |
33
33
  | 📐 | Pix2Text | Math + text, CPU Python | `pip install pix2text` |
34
34
 
35
35
  ---
@@ -64,9 +64,9 @@ Switch with `/ocr` → "OCR Backend" → ollama.
64
64
 
65
65
  ---
66
66
 
67
- ## Tesseract (optional, for plain text)
67
+ ## Tesseract (optional, plain text only)
68
68
 
69
- Classic OCR engine. Ultra-lightweight (~30MB), CPU-only, fast. System package, zero Python.
69
+ Classic OCR engine. Ultra-lightweight (~30MB), CPU-only, fast. **No formula support** — use Ollama or Pix2Text for math.
70
70
 
71
71
  ```bash
72
72
  brew install tesseract # macOS
@@ -47,6 +47,7 @@ import { isImage, isPdf, ollamaOcr, ollamaCheckModel, ollamaPullModel } from "./
47
47
  import { mineruOcr } from "./mineru";
48
48
  import { tesseractOcr } from "./tesseract";
49
49
  import { pix2textOcr } from "./pix2text";
50
+ import { mineruProOcr } from "./mineru-pro";
50
51
 
51
52
  // ── Config persistence ───────────────────────────────────────────────────────
52
53
 
@@ -87,6 +88,7 @@ function getConfig(): OcrConfig {
87
88
  ollamaHost: process.env.OLLAMA_HOST || s.ollamaHost || "http://localhost:11434",
88
89
  model: process.env.OCR_MODEL || s.model || "glm-ocr",
89
90
  mineruSplitPdf: s.mineruSplitPdf !== false,
91
+ mineruToken: s.mineruToken,
90
92
  };
91
93
  }
92
94
 
@@ -148,7 +150,7 @@ const ocrTool = defineTool({
148
150
  throw new Error(`Unsupported file type "${extname(filePath)}". Supported: PNG, JPG, GIF, WEBP, BMP, TIFF, PDF.`);
149
151
  }
150
152
 
151
- const backendLabel = { mineru: "☁️ MinerU", ollama: "🦙 Ollama", tesseract: "🔤 Tesseract", pix2text: "📐 Pix2Text" }[config.backend];
153
+ const backendLabel = { mineru: "☁️ MinerU", "mineru-pro": "☁️ MinerU Pro", ollama: "🦙 Ollama", tesseract: "🔤 Tesseract", pix2text: "📐 Pix2Text" }[config.backend];
152
154
  onUpdate?.({ content: [{ type: "text", text: `🔍 OCR ${basename(filePath)} via ${backendLabel} (${resolvedTask})…` }], details: {} });
153
155
 
154
156
  const onProgress = (msg: string) => onUpdate?.({ content: [{ type: "text", text: msg }], details: {} });
@@ -169,6 +171,12 @@ const ocrTool = defineTool({
169
171
  result = await mineruOcr(filePath, resolvedTask, config.mineruSplitPdf, signal, onProgress);
170
172
  break;
171
173
  }
174
+ case "mineru-pro": {
175
+ const token = config.mineruToken || process.env.MINERU_TOKEN;
176
+ if (!token) throw new Error("MinerU Pro requires a token. Get one at https://mineru.net/apiManage, then set it with /ocr settings.");
177
+ result = await mineruProOcr(filePath, resolvedTask, token, signal, onProgress);
178
+ break;
179
+ }
172
180
  case "tesseract":
173
181
  result = await tesseractOcr(filePath, resolvedTask, signal, onProgress);
174
182
  break;
@@ -299,7 +307,9 @@ export default function ocrExtension(pi: ExtensionAPI) {
299
307
  saveOcrConfig({ backend });
300
308
  updateStatus(ctx);
301
309
  // Show hints when switching
302
- if (backend === "mineru") {
310
+ if (backend === "mineru-pro") {
311
+ ctx.ui.notify("☁️ MinerU Pro: vlm model, ≤200MB, ≤200 pages. Requires API token from https://mineru.net/apiManage", "info");
312
+ } else if (backend === "mineru") {
303
313
  ctx.ui.notify(
304
314
  "☁️ MinerU: free for ≤10MB & ≤20 pages. Auto-split " +
305
315
  (config.mineruSplitPdf ? "ON" : "OFF — enable in settings") +
@@ -421,9 +431,7 @@ export default function ocrExtension(pi: ExtensionAPI) {
421
431
 
422
432
  function updateStatus(ctx: ExtensionContext) {
423
433
  const config = getConfig();
424
- const text = config.backend === "ollama"
425
- ? `OCR: ollama ${config.model}`
426
- : `OCR: ${config.backend}`;
434
+ const text = config.backend === "ollama" ? `OCR: ollama ${config.model}` : config.backend === "mineru-pro" ? "OCR: mineru-pro (vlm)" : `OCR: ${config.backend}`;
427
435
  ctx.ui.setStatus("pi-ocr", text);
428
436
  }
429
437
 
@@ -0,0 +1,249 @@
1
+ /**
2
+ * pi-ocr — MinerU Pro backend (Precision API, token required)
3
+ *
4
+ * API reference: https://mineru.net/apiManage/docs
5
+ *
6
+ * Single file flow (URL mode):
7
+ * 1. POST /api/v4/extract/task → {task_id}
8
+ * 2. Poll GET /api/v4/extract/task/{task_id} → {state, full_zip_url}
9
+ * 3. Download full_zip_url → extract .md
10
+ *
11
+ * Local file flow (batch upload):
12
+ * 1. POST /api/v4/file-urls/batch → {batch_id, file_urls[]}
13
+ * 2. PUT file to file_urls[0] → auto-submits
14
+ * 3. Poll GET /api/v4/extract-results/batch/{batch_id} → {extract_result[].full_zip_url}
15
+ * 4. Download zip → extract .md
16
+ *
17
+ * Limits: ≤200MB, ≤200 pages, 1000 pages/day high-priority
18
+ */
19
+
20
+ import { readFileSync, mkdtempSync, readdirSync } from "node:fs";
21
+ import { basename, extname, join } from "node:path";
22
+ import { tmpdir } from "node:os";
23
+ import { stat } from "node:fs/promises";
24
+ import { spawn } from "node:child_process";
25
+ import type { Task, OcrResult, OcrProgressCallback } from "./types";
26
+
27
+ const BASE_URL = "https://mineru.net/api/v4";
28
+
29
+ // ── Auth helper ─────────────────────────────────────────────────────────────
30
+
31
/**
 * Build the headers used for authenticated JSON requests to the MinerU Pro API.
 *
 * @param token API token, sent as a Bearer credential.
 * @returns Headers declaring a JSON body and carrying the Authorization value.
 */
function authHeaders(token: string) {
  const bearer = `Bearer ${token}`;
  return {
    "Content-Type": "application/json",
    "Authorization": bearer,
  };
}
34
+
35
+ // ── API calls ───────────────────────────────────────────────────────────────
36
+
37
/**
 * POST a JSON payload to a MinerU Pro endpoint and unwrap the response envelope.
 *
 * The request is abandoned after 30 seconds.
 *
 * @param token API token used for the Authorization header.
 * @param url Absolute endpoint URL.
 * @param body Payload, serialized with `JSON.stringify`.
 * @returns The `data` member of the `{ code, msg, data }` envelope.
 * @throws Error when the HTTP status is not OK (message carries the status and
 *         the first 200 characters of the body) or when `code` is not 0.
 */
async function apiPost(token: string, url: string, body: Record<string, unknown>) {
  const request = {
    method: "POST",
    headers: authHeaders(token),
    body: JSON.stringify(body),
    signal: AbortSignal.timeout(30_000),
  };

  const resp = await fetch(url, request);
  if (!resp.ok) {
    const detail = (await resp.text()).slice(0, 200);
    throw new Error(`MinerU Pro ${resp.status}: ${detail}`);
  }

  const envelope = (await resp.json()) as { code: number; msg: string; data: any };
  if (envelope.code !== 0) {
    throw new Error(`MinerU Pro: ${envelope.msg}`);
  }
  return envelope.data;
}
47
+
48
/**
 * GET a MinerU Pro endpoint (used for polling) and unwrap the response envelope.
 *
 * The request is abandoned after 15 seconds.
 *
 * @param token API token, sent as a Bearer credential.
 * @param url Absolute endpoint URL.
 * @returns The `data` member of the `{ code, msg, data }` envelope.
 * @throws Error when the HTTP status is not OK (message carries the status and
 *         the first 200 characters of the body) or when `code` is not 0.
 */
async function apiGet(token: string, url: string): Promise<any> {
  const resp = await fetch(url, {
    headers: { "Authorization": `Bearer ${token}` },
    signal: AbortSignal.timeout(15_000),
  });

  // Check the status before parsing: an error page (gateway error, expired
  // token, rate limit) is often not JSON and would otherwise surface as an
  // opaque JSON parse failure instead of the HTTP status.
  if (!resp.ok) {
    throw new Error(`MinerU Pro poll ${resp.status}: ${(await resp.text()).slice(0, 200)}`);
  }

  const data = (await resp.json()) as { code: number; msg: string; data: any };
  if (data.code !== 0) throw new Error(`MinerU Pro poll: ${data.msg}`);
  return data.data;
}
57
+
58
+ // ── Download zip and extract .md ───────────────────────────────────────────
59
+
60
/**
 * Download a result archive, unpack it into a fresh temp directory and return
 * the markdown it contains.
 *
 * All `.md` files whose path does not contain `_content_list`, `_model`,
 * `middle` or `layout` are read and joined with a blank line. If that yields an
 * empty string, the first `.md` file not containing `content_list` or `model`
 * is returned instead.
 *
 * The temp directory is removed on every exit path, success or failure.
 *
 * @param zipUrl URL of the zip archive to download.
 * @returns Markdown text extracted from the archive.
 * @throws Error if the download fails, the archive cannot be extracted, or no
 *         markdown file is found.
 */
async function downloadAndExtractMd(zipUrl: string): Promise<string> {
  const tmpDir = mkdtempSync(join(tmpdir(), "pi-mineru-pro-"));
  try {
    const zipPath = join(tmpDir, "result.zip");

    // Download zip (up to 2 minutes)
    const resp = await fetch(zipUrl, { signal: AbortSignal.timeout(120_000) });
    if (!resp.ok) throw new Error(`Failed to download zip: ${resp.status}`);
    const buf = Buffer.from(await resp.arrayBuffer());
    require("node:fs").writeFileSync(zipPath, buf);

    // Extract next to the archive
    try {
      await extractZip(zipPath, tmpDir);
    } catch {
      throw new Error("Failed to extract zip — install unzip or python3");
    }

    // Find and read .md files
    const files = readdirSync(tmpDir, { recursive: true }) as string[];
    const mdFile = files.find(f => f.endsWith(".md") && !f.includes("content_list") && !f.includes("model"));
    if (!mdFile) throw new Error("No markdown in extracted zip");

    // Read every .md that is actual content rather than an auxiliary dump
    const content = files
      .filter(f => f.endsWith(".md") && !f.includes("_content_list") && !f.includes("_model") && !f.includes("middle") && !f.includes("layout"))
      .map(f => readFileSync(join(tmpDir, f), "utf8"))
      .join("\n\n");

    // The fallback read must run while the directory still exists; cleanup is
    // left entirely to the `finally` below.
    return content || readFileSync(join(tmpDir, mdFile), "utf8");
  } finally {
    cleanupDir(tmpDir);
  }
}
96
+
97
/**
 * Unpack a zip archive with an external tool.
 *
 * Tries the `unzip` command first; if that reports an error, falls back to
 * Python's `zipfile` module via `python3 -c`.
 *
 * @param zipPath Path of the archive to unpack.
 * @param outDir Directory the entries are extracted into.
 * @returns Resolves once one of the tools has succeeded.
 * @throws Error "extract failed" if python3 exits non-zero, or "no extract tool"
 *         if python3 cannot be spawned.
 */
async function extractZip(zipPath: string, outDir: string): Promise<void> {
  // The snippet has to start at column 0 — Python rejects an indented first statement.
  const pythonSnippet = `
import zipfile, sys
with zipfile.ZipFile(sys.argv[1]) as z: z.extractall(sys.argv[2])
`;

  return new Promise((resolve, reject) => {
    const { execFile } = require("node:child_process");

    // Fallback: python3
    const extractWithPython = () => {
      const proc = spawn("python3", ["-c", pythonSnippet, zipPath, outDir]);
      proc.on("close", (exitCode) => {
        if (exitCode === 0) {
          resolve();
        } else {
          reject(new Error("extract failed"));
        }
      });
      proc.on("error", () => reject(new Error("no extract tool")));
    };

    // Try unzip first
    execFile("unzip", ["-qo", zipPath, "-d", outDir], (err: Error | null) => {
      if (err) {
        extractWithPython();
        return;
      }
      resolve();
    });
  });
}
113
+
114
/**
 * Best-effort recursive removal of a directory.
 *
 * @param dir Directory to delete together with its contents.
 */
function cleanupDir(dir: string) {
  const removeOptions = { recursive: true, force: true };
  try {
    require("node:fs").rmSync(dir, removeOptions);
  } catch {
    // Deliberately ignored: a leftover temp dir must not fail the caller.
  }
}
117
+
118
+ // ── Single file (URL mode) ─────────────────────────────────────────────────
119
+
120
/**
 * Submit a remotely hosted file for extraction and wait for its markdown.
 *
 * Creates an extraction task (model "vlm", language "ch", tables and formulas
 * enabled) and polls it for up to 10 minutes.
 *
 * @param token API token.
 * @param fileUrl URL of the document to process.
 * @param fileName Accepted for signature symmetry; not used by this function.
 * @param progressPrefix Text prepended to every progress message.
 * @param onProgress Receives human-readable progress updates.
 * @returns Extracted markdown.
 */
async function processUrl(
  token: string, fileUrl: string, fileName: string,
  progressPrefix: string, onProgress: OcrProgressCallback,
): Promise<string> {
  onProgress(`${progressPrefix} submitting…`);

  const submitUrl = `${BASE_URL}/extract/task`;
  const extractRequest = {
    url: fileUrl,
    model_version: "vlm",
    language: "ch",
    enable_table: true,
    enable_formula: true,
  };
  const { task_id: taskId } = await apiPost(token, submitUrl, extractRequest);

  const markdown = await pollSingleTask(token, taskId, 600_000, progressPrefix, onProgress);
  return markdown;
}
135
+
136
/**
 * Poll one extraction task every 5 seconds until it finishes or times out.
 *
 * @param token API token.
 * @param taskId Identifier of the task to watch.
 * @param timeoutMs Maximum time to keep polling, in milliseconds.
 * @param progressPrefix Text prepended to every progress message.
 * @param onProgress Called whenever the reported state changes.
 * @returns Markdown extracted from the task's result archive.
 * @throws Error if the task reports "failed" or the timeout elapses.
 */
async function pollSingleTask(
  token: string, taskId: string, timeoutMs: number,
  progressPrefix: string, onProgress: OcrProgressCallback,
): Promise<string> {
  const statusUrl = `${BASE_URL}/extract/task/${taskId}`;
  const startedAt = Date.now();
  let reportedState = "";

  for (;;) {
    if (Date.now() - startedAt >= timeoutMs) break;

    const info = await apiGet(token, statusUrl);
    const state: string = info.state || "unknown";

    switch (state) {
      case "done":
        return await downloadAndExtractMd(info.full_zip_url);
      case "failed":
        throw new Error(`MinerU Pro failed: ${info.err_msg || "unknown"}`);
    }

    // Only report transitions, so an unchanged state is not repeated on every poll.
    if (reportedState !== state) {
      reportedState = state;
      onProgress(`${progressPrefix} ${state}…`);
    }

    await new Promise<void>(resolve => setTimeout(resolve, 5000));
  }

  throw new Error(`MinerU Pro task ${taskId} timed out`);
}
160
+
161
+ // ── Local file upload (batch mode) ─────────────────────────────────────────
162
+
163
/**
 * Upload a local file through the batch endpoint and wait for its markdown.
 *
 * Requests an upload URL, PUTs the file bytes to it (2-minute timeout), then
 * polls the batch for up to 10 minutes.
 *
 * @param token API token.
 * @param filePath Path of the file to read and upload.
 * @param fileName Name registered for the file in the batch request.
 * @param progressPrefix Text prepended to every progress message.
 * @param onProgress Receives human-readable progress updates.
 * @returns Extracted markdown.
 * @throws Error if no upload URL is returned or the upload is rejected.
 */
async function processLocalFile(
  token: string, filePath: string, fileName: string,
  progressPrefix: string, onProgress: OcrProgressCallback,
): Promise<string> {
  // Step 1: ask for an upload URL
  onProgress(`${progressPrefix} requesting upload…`);
  const batchRequest = {
    files: [{ name: fileName }],
    model_version: "vlm",
  };
  const { batch_id: batchId, file_urls: uploadUrls } =
    await apiPost(token, `${BASE_URL}/file-urls/batch`, batchRequest);

  const uploadUrl = uploadUrls?.[0];
  if (!uploadUrl) {
    throw new Error("No upload URL returned");
  }

  // Step 2: upload the raw bytes (no Content-Type header per docs)
  onProgress(`${progressPrefix} uploading…`);
  const payload = readFileSync(filePath);
  const uploadResp = await fetch(uploadUrl, {
    method: "PUT",
    body: payload,
    signal: AbortSignal.timeout(120_000),
  });
  if (!uploadResp.ok) {
    throw new Error(`Upload failed: ${uploadResp.status}`);
  }

  // Step 3: the upload auto-submits the job, so just poll the batch.
  return await pollBatch(token, batchId, 600_000, progressPrefix, onProgress);
}
189
+
190
/**
 * Poll a batch every 5 seconds until every file in it has settled, then
 * download and concatenate the markdown of the files that succeeded.
 *
 * @param token API token.
 * @param batchId Identifier of the batch to watch.
 * @param timeoutMs Maximum time to keep polling, in milliseconds.
 * @param progressPrefix Text prepended to every progress message.
 * @param onProgress Receives a progress update on every poll.
 * @returns Markdown of all successful files, joined by a blank line.
 * @throws Error if every file settled without producing a result archive, or if
 *         the timeout elapses.
 */
async function pollBatch(
  token: string, batchId: string, timeoutMs: number,
  progressPrefix: string, onProgress: OcrProgressCallback,
): Promise<string> {
  const start = Date.now();
  while (Date.now() - start < timeoutMs) {
    const data = await apiGet(token, `${BASE_URL}/extract-results/batch/${batchId}`);
    const results: any[] = data.extract_result || [];

    // `[].every(...)` is vacuously true, so an empty result list must be
    // treated as "not ready yet" rather than "finished with nothing to return".
    const allSettled = results.length > 0
      && results.every((r: any) => r.state === "done" || r.state === "failed");

    if (allSettled) {
      const markdowns: string[] = [];
      for (const r of results) {
        if (r.state === "done" && r.full_zip_url) {
          onProgress(`${progressPrefix} downloading ${r.file_name}…`);
          markdowns.push(await downloadAndExtractMd(r.full_zip_url));
        }
      }

      // Nothing usable came back: report it instead of returning empty text,
      // using the same message shape as the single-task poller.
      if (markdowns.length === 0) {
        const failure = results.find((r: any) => r.state === "failed");
        throw new Error(
          failure
            ? `MinerU Pro failed: ${failure.err_msg || "unknown"}`
            : "MinerU Pro: no result archive returned",
        );
      }
      return markdowns.join("\n\n");
    }

    // Show progress
    const running = results.find((r: any) => r.state === "running");
    if (running) {
      // `??` keeps a legitimate 0 (no pages extracted yet) instead of showing "?".
      const pct = running.extract_progress
        ? `${running.extract_progress.extracted_pages ?? "?"}/${running.extract_progress.total_pages ?? "?"}p`
        : "";
      onProgress(`${progressPrefix} running ${pct}…`);
    } else {
      onProgress(`${progressPrefix} ${results[0]?.state || "pending"}…`);
    }
    await new Promise<void>(resolve => setTimeout(resolve, 5000));
  }
  throw new Error(`MinerU Pro batch ${batchId} timed out`);
}
227
+
228
+ // ── Public API ───────────────────────────────────────────────────────────────
229
+
230
/**
 * Run OCR on a local file through the MinerU Pro backend.
 *
 * Validates the extension and the 200MB size limit, then uploads the file and
 * waits for the extracted markdown.
 *
 * @param filePath Path of the document or image to process.
 * @param task Requested OCR task; not consulted by this backend.
 * @param token MinerU Pro API token.
 * @param signal Optional abort signal. It is honored before any work starts and
 *        again just before the upload; the upload/poll helper called here takes
 *        no signal, so a request already in flight is not interrupted.
 * @param onProgress Receives human-readable progress updates.
 * @returns The extracted markdown plus backend/file details.
 * @throws Error for an unsupported extension or a file over 200MB; the signal's
 *         abort reason if the signal is already aborted.
 */
export async function mineruProOcr(
  filePath: string, task: Task, token: string,
  signal: AbortSignal | undefined, onProgress: OcrProgressCallback,
): Promise<OcrResult> {
  // Do not start an upload for a request that was cancelled before it began.
  signal?.throwIfAborted();

  const ext = extname(filePath).toLowerCase();
  const fileName = basename(filePath);

  const supported = [".pdf", ".png", ".jpg", ".jpeg", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"];
  if (!supported.includes(ext)) {
    throw new Error(`MinerU Pro unsupported: ${ext}`);
  }

  const stats = await stat(filePath);
  if (stats.size > 200 * 1024 * 1024) throw new Error("File exceeds 200MB limit");

  // Last chance to bail out before the file leaves the machine.
  signal?.throwIfAborted();

  onProgress("[1/1] MinerU Pro (vlm)…");
  const markdown = await processLocalFile(token, filePath, fileName, "[1/1]", onProgress);
  onProgress("[1/1] done");

  return { text: markdown, details: { backend: "mineru-pro", fileName } };
}
@@ -6,7 +6,7 @@ export const TASKS = ["text", "formula", "table", "figure", "auto"] as const;
6
6
  export type Task = (typeof TASKS)[number];
7
7
 
8
8
  /** All supported OCR backends */
9
- export const BACKENDS = ["mineru", "ollama", "tesseract", "pix2text"] as const;
9
+ export const BACKENDS = ["mineru", "mineru-pro", "ollama", "tesseract", "pix2text"] as const;
10
10
  export type Backend = (typeof BACKENDS)[number];
11
11
 
12
12
  export interface OcrConfig {
@@ -15,6 +15,8 @@ export interface OcrConfig {
15
15
  model: string;
16
16
  /** MinerU: auto-split PDFs with >20 pages into free-tier chunks */
17
17
  mineruSplitPdf: boolean;
18
+ /** MinerU Pro: API token for precision API */
19
+ mineruToken?: string;
18
20
  }
19
21
 
20
22
  export interface OcrResult {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pi-ocr",
3
- "version": "1.2.1",
3
+ "version": "1.3.0",
4
4
  "description": "Pi extension: Zero-setup multi-backend OCR — MinerU (free cloud), Ollama (local GPU, LaTeX formulas), Pix2Text (local Python). Extract text, formulas, and tables from images and PDFs. Default: zero config, works out of the box.",
5
5
  "keywords": [
6
6
  "pi-package",