pi-ocr 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +394 -0
- package/extensions/index.ts +434 -0
- package/extensions/mineru.ts +276 -0
- package/extensions/ollama.ts +226 -0
- package/extensions/pix2text.ts +189 -0
- package/extensions/types.ts +27 -0
- package/package.json +54 -0
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pi-minimodel-ocr — MinerU API backend
|
|
3
|
+
*
|
|
4
|
+
* Uses the free Agent Lightweight API (no token required):
|
|
5
|
+
* - File ≤10MB, ≤20 pages → one free request
|
|
6
|
+
* - File ≤10MB, >20 pages → auto-splits into ≤20-page chunks (if mineruSplitPdf enabled)
|
|
7
|
+
* - File >10MB → warns and suggests compression at ilovepdf.com
|
|
8
|
+
*
|
|
9
|
+
* API flow (file mode) — each chunk is a SEPARATE request (not batch):
|
|
10
|
+
* 1. POST /api/v1/agent/parse/file → task_id + signed OSS upload URL
|
|
11
|
+
* 2. PUT file bytes to signed URL
|
|
12
|
+
* 3. Poll GET /api/v1/agent/parse/{task_id} until state=done
|
|
13
|
+
* 4. GET markdown_url → download final Markdown
|
|
14
|
+
*
|
|
15
|
+
* PDF splitting uses pypdfium2 (same dep as Pix2Text backend).
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import { readFileSync, mkdtempSync, unlinkSync, rmdirSync, readdirSync } from "node:fs";
|
|
19
|
+
import { basename, extname, join } from "node:path";
|
|
20
|
+
import { tmpdir } from "node:os";
|
|
21
|
+
import { stat } from "node:fs/promises";
|
|
22
|
+
import { spawn } from "node:child_process";
|
|
23
|
+
import type { Task, OcrResult, OcrProgressCallback } from "./types";
|
|
24
|
+
|
|
25
|
+
const BASE_URL = "https://mineru.net/api/v1/agent";
|
|
26
|
+
|
|
27
|
+
// ── Python PDF splitter (uses pypdfium2) ─────────────────────────────────────
|
|
28
|
+
|
|
29
|
+
/**
 * Inline Python program that splits a PDF into consecutive chunks using pypdfium2.
 *
 * Invocation: python3 -c SCRIPT <input_path> <chunk_size> <output_dir>
 * stdout:     one JSON object {"total": N, "chunks": [{"path", "firstPage", "lastPage"}]}
 *             where firstPage/lastPage are 1-based and inclusive.
 *
 * Both the source and each destination document are closed in `finally`
 * blocks so a failure on one chunk does not leak open PDF handles.
 */
const PDF_SPLIT_SCRIPT = `
import sys, json, os
import pypdfium2 as pdfium

input_path = sys.argv[1]
chunk_size = int(sys.argv[2])
output_dir = sys.argv[3]

if chunk_size < 1:
    sys.exit("chunk_size must be a positive integer")

src = pdfium.PdfDocument(input_path)
try:
    total = len(src)

    results = []
    for start in range(0, total, chunk_size):
        end = min(start + chunk_size, total)

        # Create a new PDF with pages [start, end)
        dst = pdfium.PdfDocument.new()
        try:
            dst.import_pages(src, list(range(start, end)))
            out_path = os.path.join(output_dir, f"chunk_{start // chunk_size + 1}.pdf")
            dst.save(out_path)
        finally:
            dst.close()

        results.append({"path": out_path, "firstPage": start + 1, "lastPage": end})
finally:
    src.close()

print(json.dumps({"total": total, "chunks": results}))
`;
|
|
56
|
+
|
|
57
|
+
/**
 * Runs an inline Python program with `python3 -c` and resolves with its trimmed stdout.
 *
 * @param code Python source passed to `-c`.
 * @param args Extra command-line arguments (visible to the script as sys.argv[1:]).
 * @returns Trimmed UTF-8 stdout when the process exits with code 0.
 * @throws Error with an install hint when `python3` is missing (ENOENT), with the
 *         underlying spawn message for any other launch failure, or with the
 *         script's stderr (or the exit code) when it exits non-zero.
 */
async function execPy(code: string, args: string[]): Promise<string> {
  return new Promise((resolve, reject) => {
    const child = spawn("python3", ["-c", code, ...args], { stdio: ["ignore", "pipe", "pipe"] });
    const out: Buffer[] = [];
    const err: Buffer[] = [];
    child.stdout.on("data", (d: Buffer) => out.push(d));
    child.stderr.on("data", (d: Buffer) => err.push(d));
    child.on("error", (e: NodeJS.ErrnoException) => {
      // Only ENOENT means the interpreter is missing; other failures
      // (permissions, resource limits, …) must not be misreported as such.
      reject(new Error(
        e.code === "ENOENT"
          ? "python3 not found. Install Python 3 and pypdfium2:\n pip install pypdfium2"
          : `Failed to start python3: ${e.message}`,
      ));
    });
    child.on("close", (exitCode) => {
      if (exitCode === 0) {
        resolve(Buffer.concat(out).toString("utf8").trim());
        return;
      }
      const stderrText = Buffer.concat(err).toString("utf8").trim();
      reject(new Error(stderrText || `python3 exited with code ${exitCode}`));
    });
  });
}
|
|
73
|
+
|
|
74
|
+
/**
 * Best-effort removal of a flat temporary directory: unlinks every entry
 * directly inside `dir`, then removes `dir` itself. Any failure (missing
 * directory, nested sub-directory, permissions) is swallowed.
 */
function rmdirSafe(dir: string) {
  try {
    readdirSync(dir).forEach((entry) => {
      unlinkSync(join(dir, entry));
    });
    rmdirSync(dir);
  } catch {
    // Cleanup is opportunistic; leaving a stray temp dir is not an error.
  }
}
|
|
80
|
+
|
|
81
|
+
// ── MinerU API helpers ───────────────────────────────────────────────────────
|
|
82
|
+
|
|
83
|
+
/**
 * POSTs a JSON body to a MinerU endpoint and extracts the task handle.
 *
 * @param url  Full endpoint URL.
 * @param body JSON-serialisable request payload.
 * @returns The task id and, when the API supplies one, the signed upload URL.
 * @throws Error on HTTP 429, on any other non-2xx status (first 200 chars of
 *         the body are included), or when the envelope has a non-zero `code`
 *         or no `task_id`.
 */
async function apiPost(url: string, body: Record<string, unknown>): Promise<{ task_id: string; file_url?: string }> {
  type Envelope = { code: number; msg: string; data: { task_id: string; file_url?: string } };

  const response = await fetch(url, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(body),
    signal: AbortSignal.timeout(30_000),
  });

  if (response.status === 429) {
    throw new Error("MinerU rate limit (429). Wait a minute and retry, or switch backend with /ocr.");
  }
  if (!response.ok) {
    const detail = await response.text().catch(() => "");
    throw new Error(`MinerU API error ${response.status}: ${detail.slice(0, 200)}`);
  }

  const envelope = (await response.json()) as Envelope;
  const taskId = envelope.data?.task_id;
  if (envelope.code !== 0 || !taskId) {
    throw new Error(`MinerU API error: ${envelope.msg || "no task_id returned"}`);
  }

  return { task_id: taskId, file_url: envelope.data.file_url };
}
|
|
104
|
+
|
|
105
|
+
/**
 * Uploads the bytes of a local file to a signed URL with HTTP PUT.
 *
 * @param uploadUrl Signed destination URL.
 * @param filePath  Local file to read (fully, synchronously) and send.
 * @throws Error when the server answers with a non-2xx status; the message
 *         carries the status and the first 200 characters of the response.
 */
async function putFile(uploadUrl: string, filePath: string): Promise<void> {
  const response = await fetch(uploadUrl, {
    method: "PUT",
    body: readFileSync(filePath),
    signal: AbortSignal.timeout(60_000),
  });
  if (response.ok) return;

  const detail = await response.text().catch(() => "");
  throw new Error(`File upload failed (${response.status}): ${detail.slice(0, 200)}`);
}
|
|
117
|
+
|
|
118
|
+
/**
 * Polls a MinerU task every 3 seconds until it finishes, then downloads the Markdown.
 *
 * @param taskId         Task id returned when the task was created.
 * @param timeoutMs      Overall polling budget in milliseconds.
 * @param progressPrefix Prefix for every progress message (e.g. "[2/5]").
 * @param onProgress     Receives "<prefix> <state> (<elapsed>s)" after each unfinished poll.
 * @returns The Markdown document referenced by the task's `markdown_url`.
 * @throws Error when the status endpoint answers with a non-retryable HTTP
 *         error (4xx other than 429), when the task reports `failed`, when a
 *         `done` task has no `markdown_url`, when the Markdown download fails,
 *         or when the task is still unfinished after `timeoutMs`.
 */
async function pollTask(taskId: string, timeoutMs: number, progressPrefix: string, onProgress: OcrProgressCallback): Promise<string> {
  const start = Date.now();
  while (Date.now() - start < timeoutMs) {
    const resp = await fetch(`${BASE_URL}/parse/${taskId}`, {
      signal: AbortSignal.timeout(10_000),
    });

    let state: string;
    if (resp.ok) {
      const data = (await resp.json()) as {
        code: number;
        data: { state: string; markdown_url?: string; err_msg?: string; err_code?: number };
      };

      state = data.data?.state || "unknown";

      if (state === "done") {
        const markdownUrl = data.data.markdown_url;
        if (!markdownUrl) throw new Error("MinerU returned done state but no markdown_url");
        const mdResp = await fetch(markdownUrl, { signal: AbortSignal.timeout(60_000) });
        if (!mdResp.ok) throw new Error(`Failed to download markdown: ${mdResp.status}`);
        return mdResp.text();
      }

      if (state === "failed") {
        throw new Error(`MinerU parsing failed: ${data.data.err_msg || "unknown error"} (code: ${data.data.err_code})`);
      }
    } else {
      const text = await resp.text().catch(() => "");
      // A 4xx (bad task id, auth, …) will not fix itself: fail now instead of
      // polling until the timeout. 429 and 5xx are treated as transient.
      if (resp.status !== 429 && resp.status < 500) {
        throw new Error(`MinerU status check failed (${resp.status}): ${text.slice(0, 200)}`);
      }
      state = `HTTP ${resp.status}, retrying`;
    }

    const elapsed = Math.floor((Date.now() - start) / 1000);
    onProgress(`${progressPrefix} ${state} (${elapsed}s)`);
    await new Promise((r) => setTimeout(r, 3000));
  }
  throw new Error(`MinerU task ${taskId} timed out after ${timeoutMs / 1000}s`);
}
|
|
149
|
+
|
|
150
|
+
// ── Single-file processing (one individual request, NOT batch) ───────────────
|
|
151
|
+
|
|
152
|
+
/**
 * Sends one file through the MinerU agent API as a single request and
 * returns the resulting Markdown.
 *
 * Flow: size check (10 MB) → create task and obtain signed upload URL →
 * upload the bytes → poll (up to 300 s) and download the Markdown.
 *
 * @param filePath       Local file whose bytes are uploaded.
 * @param fileName       Name reported to the API as `file_name`.
 * @param progressPrefix Prefix for progress messages (e.g. "[1/1]").
 * @param onProgress     Progress sink.
 * @throws Error when the file exceeds 10 MB, when no upload URL is returned,
 *         or when any of the API helpers fails.
 */
async function mineruProcessFile(
  filePath: string, fileName: string, progressPrefix: string,
  onProgress: OcrProgressCallback,
): Promise<string> {
  const { size } = await stat(filePath);
  const megabytes = size / (1024 * 1024);
  if (megabytes > 10) {
    throw new Error([
      `File too large for free MinerU API: ${megabytes.toFixed(1)}MB (limit: 10MB).`,
      `Compress at https://ilovepdf.com/compress_pdf or switch to Ollama/Pix2Text backend with /ocr.`,
    ].join("\n"));
  }

  const report = (phase: string): void => {
    onProgress(`${progressPrefix} ${phase}`);
  };

  // Step 1: create the task and get the signed upload URL
  report("uploading…");
  const ticket = await apiPost(`${BASE_URL}/parse/file`, {
    file_name: fileName,
    language: "en",
    enable_table: true,
    enable_formula: true,
    is_ocr: false,
  });
  if (!ticket.file_url) {
    throw new Error("MinerU did not return a file upload URL — this endpoint may have changed.");
  }

  // Step 2: upload the file bytes
  await putFile(ticket.file_url, filePath);

  // Steps 3 + 4: wait for completion and fetch the Markdown
  report("pending…");
  const markdown = await pollTask(ticket.task_id, 300_000, progressPrefix, onProgress);
  report("done");
  return markdown;
}
|
|
188
|
+
|
|
189
|
+
// ── Public API ───────────────────────────────────────────────────────────────
|
|
190
|
+
|
|
191
|
+
/**
 * OCRs an image or PDF through the MinerU agent API.
 *
 * - Images are sent as one request.
 * - PDFs with ≤20 pages are sent as one request.
 * - PDFs with >20 pages are split into ≤20-page chunks (when `splitPdf` is
 *   true) and each chunk is sent as a separate request, 3 s apart; otherwise
 *   the whole PDF is attempted as one request after a warning.
 *
 * @param filePath   Local file to process.
 * @param task       Requested OCR task (not used by this backend).
 * @param splitPdf   Whether PDFs above 20 pages may be split into chunks.
 * @param signal     Optional abort signal, checked before each request.
 * @param onProgress Progress sink.
 * @returns Markdown text plus backend details (file name, page and chunk counts).
 * @throws Error("Aborted") when `signal` is aborted; Error for unsupported
 *         file types, splitter failures, or any API failure.
 */
export async function mineruOcr(
  filePath: string, task: Task, splitPdf: boolean,
  signal: AbortSignal | undefined, onProgress: OcrProgressCallback,
): Promise<OcrResult> {
  const throwIfAborted = (): void => {
    if (signal?.aborted) throw new Error("Aborted");
  };

  const ext = extname(filePath).toLowerCase();
  const fileName = basename(filePath);

  throwIfAborted();

  // For images (non-PDF): process as a single individual request
  if (ext !== ".pdf") {
    const imageExts = [".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".tif"];
    if (!imageExts.includes(ext)) {
      // List only the formats this function actually accepts.
      throw new Error(`MinerU does not support this file type: ${ext}. Use PDF, PNG, JPG, GIF, WebP, BMP, or TIFF.`);
    }
    const markdown = await mineruProcessFile(filePath, fileName, "[1/1]", onProgress);
    return { text: markdown, details: { backend: "mineru", fileName, pages: 1 } };
  }

  // ── PDF handling ──
  const { getPdfPageCount } = await import("./ollama");
  const pageCount = await getPdfPageCount(filePath);

  const totalStats = await stat(filePath);
  const totalMB = totalStats.size / (1024 * 1024);

  if (totalMB > 10) {
    // Warning only: an unsplit upload is rejected later by the per-file size
    // check, while split chunks may each fit under the limit.
    onProgress(
      `⚠️ PDF is ${totalMB.toFixed(1)}MB — MinerU free tier limit is 10MB.\n` +
      `💡 Compress at https://ilovepdf.com/compress_pdf first, or switch backend with /ocr.`
    );
  }

  throwIfAborted();

  // Single chunk case: one individual request
  if (pageCount <= 20) {
    const markdown = await mineruProcessFile(filePath, fileName, "[1/1]", onProgress);
    return { text: markdown, details: { backend: "mineru", fileName, pages: pageCount } };
  }

  // ── Multi-chunk: split PDF and process each chunk as SEPARATE requests ──
  if (!splitPdf) {
    onProgress(
      `⚠️ PDF has ${pageCount} pages but splitting is disabled.\n` +
      `Enable in /ocr settings → "MinerU: Split PDF >20 pages: ON"`
    );
    try {
      const markdown = await mineruProcessFile(filePath, fileName, "[1/1]", onProgress);
      return { text: markdown, details: { backend: "mineru", fileName, pages: pageCount } };
    } catch (e: unknown) {
      const reason = e instanceof Error ? e.message : String(e);
      throw new Error(
        `${reason}\n\n💡 Enable PDF splitting in /ocr settings.`
      );
    }
  }

  onProgress(`Splitting ${pageCount}-page PDF into ≤20-page chunks…`);

  const splitDir = mkdtempSync(join(tmpdir(), "pi-mineru-split-"));
  try {
    const raw = await execPy(PDF_SPLIT_SCRIPT, [filePath, "20", splitDir]);
    const parsed = JSON.parse(raw) as {
      total: number;
      chunks: Array<{ path: string; firstPage: number; lastPage: number }>;
    } | null;

    // Reaching this point means >20 pages were counted, so an empty chunk
    // list indicates a splitter problem rather than a legitimately empty result.
    const chunks = Array.isArray(parsed?.chunks) ? parsed.chunks : [];
    if (chunks.length === 0) {
      throw new Error("PDF splitter produced no chunks — the PDF may be empty or unreadable.");
    }

    const results: string[] = [];
    for (const [i, chunk] of chunks.entries()) {
      throwIfAborted();
      const prefix = `[${i + 1}/${chunks.length}]`;

      if (i > 0) {
        onProgress(`${prefix} waiting rate limit…`);
        await new Promise((r) => setTimeout(r, 3_000));
        throwIfAborted();
      }

      const chunkName = `${fileName.replace(/\.pdf$/i, "")}_p${chunk.firstPage}-${chunk.lastPage}.pdf`;
      const markdown = await mineruProcessFile(chunk.path, chunkName, prefix, onProgress);
      results.push(`## Pages ${chunk.firstPage}-${chunk.lastPage}\n\n${markdown}`);
    }

    return {
      text: results.join("\n\n"),
      details: { backend: "mineru", fileName, pages: pageCount, chunks: chunks.length },
    };
  } finally {
    rmdirSafe(splitDir);
  }
}
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pi-minimodel-ocr — Ollama backend
|
|
3
|
+
*
|
|
4
|
+
* Uses any locally-running Ollama vision model (default: glm-ocr) to OCR
|
|
5
|
+
* images and PDFs. Converts PDF pages to PNG before sending to Ollama.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { readFileSync, existsSync, mkdtempSync, readdirSync, unlinkSync, rmdirSync } from "node:fs";
|
|
9
|
+
import { basename, extname, join } from "node:path";
|
|
10
|
+
import { tmpdir } from "node:os";
|
|
11
|
+
import { spawn } from "node:child_process";
|
|
12
|
+
import type { Task, OcrResult, OcrProgressCallback } from "./types";
|
|
13
|
+
|
|
14
|
+
// ── Helpers ──────────────────────────────────────────────────────────────────
|
|
15
|
+
|
|
16
|
+
/**
 * Reports whether a path has one of the recognised raster-image extensions
 * (case-insensitive). Only the extension is inspected; the file is not read.
 */
export function isImage(filePath: string): boolean {
  const imageExtensions = new Set([".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".tif"]);
  return imageExtensions.has(extname(filePath).toLowerCase());
}
|
|
20
|
+
|
|
21
|
+
/**
 * Reports whether a path refers to a PDF: either the extension is `.pdf`
 * (case-insensitive, file not read) or the file's first four bytes are
 * `%PDF`. Unreadable or missing files yield `false`.
 */
export function isPdf(filePath: string): boolean {
  if (extname(filePath).toLowerCase() === ".pdf") return true;

  let magic: Buffer;
  try {
    magic = readFileSync(filePath).subarray(0, 4);
  } catch {
    return false;
  }
  return magic.toString() === "%PDF";
}
|
|
31
|
+
|
|
32
|
+
/**
 * Spawns a command (no shell) and captures its output.
 *
 * @param cmd  Executable name or path.
 * @param args Argument vector.
 * @returns Untrimmed UTF-8 stdout when the process exits with code 0.
 * @throws Error "<cmd>: command not found" when the executable is missing,
 *         "<cmd>: <message>" for other spawn failures, or
 *         "<cmd> exited with code N[: stderr]" on a non-zero exit.
 */
function execCmdCapture(cmd: string, args: string[]): Promise<string> {
  const decode = (parts: Buffer[]): string => Buffer.concat(parts).toString("utf8");

  return new Promise<string>((resolve, reject) => {
    const proc = spawn(cmd, args, { stdio: ["ignore", "pipe", "pipe"] });
    const stdoutParts: Buffer[] = [];
    const stderrParts: Buffer[] = [];

    proc.stdout.on("data", (part: Buffer) => stdoutParts.push(part));
    proc.stderr.on("data", (part: Buffer) => stderrParts.push(part));

    proc.on("error", (cause) => {
      const missing = (cause as NodeJS.ErrnoException).code === "ENOENT";
      const reason = missing ? "command not found" : cause.message;
      reject(new Error(`${cmd}: ${reason}`));
    });

    proc.on("close", (exitCode) => {
      if (exitCode === 0) {
        resolve(decode(stdoutParts));
        return;
      }
      const stderrText = decode(stderrParts).trim();
      const suffix = stderrText ? ": " + stderrText : "";
      reject(new Error(`${cmd} exited with code ${exitCode}${suffix}`));
    });
  });
}
|
|
52
|
+
|
|
53
|
+
/**
 * Best-effort removal of a flat temporary directory: deletes each entry
 * directly inside `dir` and then the directory itself. Errors are ignored.
 */
function cleanupDir(dir: string) {
  try {
    const entryPaths = readdirSync(dir).map((name) => join(dir, name));
    for (const entryPath of entryPaths) {
      unlinkSync(entryPath);
    }
    rmdirSync(dir);
  } catch {
    // Opportunistic cleanup: a leftover temp directory is tolerated.
  }
}
|
|
59
|
+
|
|
60
|
+
/**
 * Maps an OCR task to the prompt sent to the model: the single-purpose
 * tasks become "<Task> Recognition"; "auto" gets a combined instruction
 * asking for LaTeX formulas and Markdown tables.
 */
function buildPrompt(task: Task): string {
  switch (task) {
    case "auto":
      return "Recognize all text, formulas, tables, and figures in this document. Output formulas in LaTeX format, tables in Markdown format.";
    case "text":
    case "formula":
    case "table":
    case "figure":
      // Capitalise the task name: "text" → "Text Recognition", etc.
      return `${task.charAt(0).toUpperCase()}${task.slice(1)} Recognition`;
  }
}
|
|
70
|
+
|
|
71
|
+
// ── PDF helpers ──────────────────────────────────────────────────────────────
|
|
72
|
+
|
|
73
|
+
/**
 * Determines the number of pages in a PDF using whichever external tool is available.
 *
 * On macOS `mdls` is tried first. If it fails or yields no positive number,
 * and on every other platform, `pdfinfo` is tried. Falling back to
 * `pdfinfo` everywhere avoids silently reporting multi-page PDFs as one page
 * when the platform-specific tool is unavailable.
 *
 * @param pdfPath Path to the PDF.
 * @returns The page count, or 1 when no tool could determine it.
 */
export async function getPdfPageCount(pdfPath: string): Promise<number> {
  if (process.platform === "darwin") {
    try {
      const out = await execCmdCapture("mdls", ["-name", "kMDItemNumberOfPages", "-raw", pdfPath]);
      const n = parseInt(out.trim(), 10);
      if (!isNaN(n) && n > 0) return n;
    } catch { /* fall through to pdfinfo */ }
  }

  try {
    const out = await execCmdCapture("pdfinfo", [pdfPath]);
    const m = out.match(/Pages:\s+(\d+)/);
    if (m) return parseInt(m[1] ?? "", 10) || 1;
  } catch { /* pdfinfo missing or failed — fall through */ }

  return 1;
}
|
|
90
|
+
|
|
91
|
+
/**
 * Renders one PDF page to a PNG on macOS.
 *
 * Page index 0 is converted with `sips`; every later page requires
 * `pdftoppm` (150 DPI, single file).
 *
 * @param pdfPath   Source PDF.
 * @param pageIndex Zero-based page index.
 * @param outPath   Destination `.png` path.
 * @throws Error when `sips` fails, when `pdftoppm` is missing (with an
 *         install hint), when it produces no/empty output, or when it fails.
 */
async function convertPdfPageMac(pdfPath: string, pageIndex: number, outPath: string): Promise<void> {
  if (pageIndex === 0) {
    try {
      await execCmdCapture("sips", ["-s", "format", "png", pdfPath, "--out", outPath]);
    } catch (e: any) {
      throw new Error(`sips PDF conversion failed: ${e.message}`);
    }
    return;
  }

  const pageNumber = String(pageIndex + 1);
  try {
    // pdftoppm appends ".png" itself, so pass the path without the extension.
    const outPrefix = outPath.replace(/\.png$/, "");
    await execCmdCapture("pdftoppm", [
      "-png", "-r", "150", "-f", pageNumber, "-l", pageNumber,
      "-singlefile", pdfPath, outPrefix,
    ]);
    const produced = existsSync(outPath) && readFileSync(outPath).length > 0;
    if (!produced) {
      throw new Error(`pdftoppm produced no output for page ${pageIndex + 1}`);
    }
  } catch (e: any) {
    const msg = e.message || String(e);
    const toolMissing = msg.includes("command not found") || msg.includes("ENOENT");
    throw new Error(
      toolMissing
        ? `pdftoppm not found. Install with: brew install poppler. Only page 1 was processed with sips.`
        : `PDF page ${pageIndex + 1} conversion failed: ${msg}`,
    );
  }
}
|
|
117
|
+
|
|
118
|
+
/**
 * Renders one PDF page (zero-based `pageIndex`) to the PNG at `outPath`.
 * macOS delegates to `convertPdfPageMac`; all other platforms call
 * `pdftoppm` directly at 150 DPI.
 */
async function convertPdfPage(pdfPath: string, pageIndex: number, outPath: string): Promise<void> {
  if (process.platform === "darwin") {
    return convertPdfPageMac(pdfPath, pageIndex, outPath);
  }

  const pageNumber = String(pageIndex + 1);
  const outPrefix = outPath.replace(/\.png$/, "");
  await execCmdCapture("pdftoppm", [
    "-png", "-r", "150", "-f", pageNumber, "-l", pageNumber,
    "-singlefile", pdfPath, outPrefix,
  ]);
}
|
|
128
|
+
|
|
129
|
+
// ── Ollama API call ──────────────────────────────────────────────────────────
|
|
130
|
+
|
|
131
|
+
/**
 * Sends one image to Ollama's `/api/generate` endpoint and returns the model's text.
 *
 * @param host      Base URL of the Ollama server.
 * @param imagePath Image file; read fully and sent base64-encoded.
 * @param task      OCR task, translated to a prompt by `buildPrompt`.
 * @param signal    Optional abort signal forwarded to `fetch`.
 * @param model     Ollama model name.
 * @returns The trimmed `response` field, or "" when absent or empty.
 * @throws Error on a non-2xx status or when the reply carries an `error` field.
 */
async function callOllama(
  host: string, imagePath: string, task: Task, signal: AbortSignal | undefined, model: string,
): Promise<string> {
  const encodedImage = readFileSync(imagePath).toString("base64");
  const requestBody = JSON.stringify({
    model,
    prompt: buildPrompt(task),
    images: [encodedImage],
    stream: false,
  });

  const reply = await fetch(`${host}/api/generate`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: requestBody,
    signal,
  });

  if (!reply.ok) {
    const detail = await reply.text().catch(() => "");
    throw new Error(`Ollama API error ${reply.status}: ${detail.slice(0, 200)}. Is Ollama running and is the ${model} model pulled?`);
  }

  const { response: generated, error } = (await reply.json()) as { response?: string; error?: string };
  if (error) {
    throw new Error(`OCR error: ${error}`);
  }
  return generated?.trim() || "";
}
|
|
155
|
+
|
|
156
|
+
// ── Public API ───────────────────────────────────────────────────────────────
|
|
157
|
+
|
|
158
|
+
/**
 * OCRs an image or PDF with a local Ollama vision model.
 *
 * Non-PDF inputs are sent to the model as-is. PDFs are rendered page by page
 * into a temporary directory (removed afterwards) and each page is OCRed in
 * turn; pages that cannot be rendered or return no text are reported inline
 * under their "## Page N" heading instead of failing the whole document.
 *
 * @param filePath   Input file.
 * @param task       OCR task forwarded to the model prompt.
 * @param ollamaHost Base URL of the Ollama server.
 * @param model      Model name.
 * @param signal     Optional abort signal; checked before each page.
 * @param onProgress Progress sink.
 * @throws Error("Aborted") when aborted between pages; API errors propagate.
 */
export async function ollamaOcr(
  filePath: string, task: Task, ollamaHost: string, model: string,
  signal: AbortSignal | undefined, onProgress: OcrProgressCallback,
): Promise<OcrResult> {
  if (!isPdf(filePath)) {
    const imageText = await callOllama(ollamaHost, filePath, task, signal, model);
    return { text: imageText, details: { backend: "ollama", model, task } };
  }

  onProgress("📄 Converting PDF pages to images…");
  const workDir = mkdtempSync(join(tmpdir(), "pi-ocr-"));
  try {
    const pageCount = await getPdfPageCount(filePath);

    // Proactive check for multi-page PDF on macOS without pdftoppm
    if (process.platform === "darwin" && pageCount > 1) {
      try {
        await execCmdCapture("pdftoppm", ["-v"]);
      } catch {
        onProgress(`⚠️ Multi-page PDF (${pageCount} pages) but pdftoppm is not installed. Only page 1 will be processed.\nInstall: brew install poppler`);
      }
    }

    const sections: string[] = [];
    for (let pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
      if (signal?.aborted) throw new Error("Aborted");
      const pageImage = join(workDir, `page_${pageNumber}.png`);

      try {
        await convertPdfPage(filePath, pageNumber - 1, pageImage);
      } catch (e: any) {
        sections.push(`## Page ${pageNumber}\n\n> ⚠️ Skipped: ${e.message}`);
        continue;
      }

      onProgress(`🔍 OCR page ${pageNumber}/${pageCount}…`);
      const pageText = await callOllama(ollamaHost, pageImage, task, signal, model);
      const sectionBody = pageText.trim()
        ? pageText
        : "> ⚠️ OCR returned empty result for this page.";
      sections.push(`## Page ${pageNumber}\n\n${sectionBody}`);
    }

    return { text: sections.join("\n\n"), details: { backend: "ollama", model, task } };
  } finally {
    cleanupDir(workDir);
  }
}
|
|
208
|
+
|
|
209
|
+
/**
 * Checks whether a model exists locally by POSTing to Ollama's `/api/show`.
 *
 * The request carries the model under both `model` (the key the current
 * Ollama API documents) and `name` (the key this code originally sent), so it
 * is understood by servers that recognise either one.
 *
 * @param host  Base URL of the Ollama server.
 * @param model Model name to look up.
 * @returns `true` when the server answers 2xx; `false` on any other status
 *          or when the request itself fails (e.g. server not running).
 */
export async function ollamaCheckModel(host: string, model: string): Promise<boolean> {
  try {
    const resp = await fetch(`${host}/api/show`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ model, name: model }),
    });
    return resp.ok;
  } catch {
    return false;
  }
}
|
|
222
|
+
|
|
223
|
+
/**
 * Pulls a model by running `ollama pull <model>`.
 * Resolves once the command exits with code 0; rejects with the command's
 * error otherwise. The command's output is discarded.
 */
export async function ollamaPullModel(model: string): Promise<void> {
  await execCmdCapture("ollama", ["pull", model]);
}
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pi-minimodel-ocr — Pix2Text backend
|
|
3
|
+
*
|
|
4
|
+
* Uses Pix2Text (https://github.com/breezedeus/Pix2Text) — an open-source
|
|
5
|
+
* Python alternative to Mathpix. Recognizes layouts, text, math formulas (LaTeX),
|
|
6
|
+
* and tables, outputting Markdown directly. SMALL ONNX models, CPU-friendly.
|
|
7
|
+
*
|
|
8
|
+
* Prerequisites:
|
|
9
|
+
* pip install pix2text
|
|
10
|
+
*
|
|
11
|
+
* First run downloads ONNX models (~50MB) to ~/.pix2text/1.1/.
|
|
12
|
+
*
|
|
13
|
+
* Progress reporting: for multi-page PDFs, processes pages individually
|
|
14
|
+
* and sends per-page progress via stderr (JSON lines), final result via stdout.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import { spawn } from "node:child_process";
|
|
18
|
+
import { basename } from "node:path";
|
|
19
|
+
import type { Task, OcrResult, OcrProgressCallback } from "./types";
|
|
20
|
+
import { isImage, isPdf } from "./ollama";
|
|
21
|
+
|
|
22
|
+
// ── Embedded Python OCR engine ───────────────────────────────────────────────
|
|
23
|
+
//
|
|
24
|
+
// For PDFs: extracts pages via PyMuPDF, runs Pix2Text on each page individually,
|
|
25
|
+
// printing progress JSON to stderr so TypeScript can relay it.
|
|
26
|
+
// For images: single call to p2t.recognize().
|
|
27
|
+
//
|
|
28
|
+
// Usage: python3 -c SCRIPT <file_path> <task>
|
|
29
|
+
// stdout → final Markdown result
|
|
30
|
+
// stderr → progress lines: {"page": 1, "total": 10} / {"status": "loading"} / errors
|
|
31
|
+
|
|
32
|
+
/**
 * Inline Python program that runs Pix2Text on a PDF or image.
 *
 * Invocation: python3 -c SCRIPT <file_path> <task>
 * stdout: final Markdown (or "ERROR: ..." with exit code 1 for unsupported types)
 * stderr: JSON progress lines, one object per line.
 *
 * Per-page temporary PNGs are written to the platform temp directory
 * (tempfile.gettempdir()) rather than a hard-coded /tmp, and removed after
 * each page.
 */
const PIX2TEXT_ENGINE = `
import sys, os, json, io, tempfile
from pathlib import Path

file_path = sys.argv[1]
task = sys.argv[2]
ext = Path(file_path).suffix.lower()

# Suppress noisy library output
os.environ.setdefault("DISABLE_TQDM", "1")

# Progress helper — JSON lines to stderr
def progress(payload):
    print(json.dumps(payload), file=sys.stderr, flush=True)

progress({"status": "loading", "message": "Initializing Pix2Text models..."})

from pix2text import Pix2Text
p2t = Pix2Text.from_config(enable_formula=True, enable_table=False)

if ext == ".pdf":
    import fitz  # PyMuPDF — already a pix2text dependency

    progress({"status": "loading", "message": "Opening PDF..."})
    doc = fitz.open(file_path)
    total = len(doc)
    progress({"status": "started", "pages": total})

    results = []
    for i in range(total):
        progress({"status": "page", "page": i + 1, "total": total})

        # Render page to PNG at 200 DPI
        page = doc[i]
        pix = page.get_pixmap(dpi=200)
        img_bytes = pix.tobytes("png")

        # Write to temp file (Pix2Text needs a file path)
        tmp_path = os.path.join(tempfile.gettempdir(), f"pi-p2t-page-{os.getpid()}-{i}.png")
        with open(tmp_path, "wb") as f:
            f.write(img_bytes)

        try:
            text = p2t.recognize(tmp_path)
            if text:
                results.append(f"## Page {i + 1}\\n\\n{text}")
            else:
                results.append(f"## Page {i + 1}\\n\\n> ⚠️ No text detected")
        except Exception as e:
            results.append(f"## Page {i + 1}\\n\\n> ⚠️ Error: {e}")
        finally:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass

    doc.close()
    progress({"status": "done", "pages": total})
    print("\\n\\n".join(results))

elif ext in (".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".tif"):
    progress({"status": "recognizing"})
    result = p2t.recognize(file_path)
    progress({"status": "done", "pages": 1})
    print(result if result else "")

else:
    progress({"status": "error", "message": f"Unsupported file type: {ext}"})
    print(f"ERROR: unsupported file type {ext}")
    sys.exit(1)
`;
|
|
103
|
+
|
|
104
|
+
// ── Subprocess runner with progress streaming ────────────────────────────────
|
|
105
|
+
|
|
106
|
+
/**
 * Runs an inline Python program with `python3 -c`, relaying JSON progress
 * lines from stderr to `onProgress` and collecting everything else.
 *
 * stderr is split into lines across chunk boundaries, so a JSON line that
 * arrives in two pieces is still parsed. Lines that are not JSON (tracebacks,
 * library noise) are kept and returned as `stderr` for diagnostics.
 *
 * @param code       Python source passed to `-c`.
 * @param args       Extra command-line arguments for the script.
 * @param onProgress Receives a human-readable message per progress event.
 * @param signal     Optional abort signal; aborting terminates the process.
 * @returns Trimmed stdout, the exit code (1 when the process could not be
 *          started or had no exit code), and the non-JSON stderr text. Never rejects.
 */
async function execPythonWithProgress(
  code: string, args: string[], onProgress: OcrProgressCallback,
  signal?: AbortSignal,
): Promise<{ stdout: string; exitCode: number; stderr: string }> {
  return new Promise((resolve) => {
    const child = spawn("python3", ["-c", code, ...args], { stdio: ["ignore", "pipe", "pipe"], signal });
    const outChunks: Buffer[] = [];
    const diagnostics: string[] = []; // non-JSON stderr lines (tracebacks, warnings)
    const maxDiagnosticLines = 200;   // bound memory if a library is very chatty
    let pending = "";                 // incomplete trailing stderr line
    let spawnError = "";
    let lastPageReported = 0;

    const finish = (exitCode: number): void => {
      if (pending) {
        handleLine(pending);
        pending = "";
      }
      resolve({
        stdout: Buffer.concat(outChunks).toString("utf8").trim(),
        exitCode,
        stderr: [spawnError, ...diagnostics].filter(Boolean).join("\n"),
      });
    };

    const handleLine = (line: string): void => {
      const trimmed = line.trim();
      if (!trimmed) return;

      let p: { status?: string; page?: number; total?: number; pages?: number; message?: string } | undefined;
      if (trimmed.startsWith("{")) {
        try {
          p = JSON.parse(trimmed);
        } catch {
          p = undefined; // looked like JSON but is not — treat as diagnostics
        }
      }
      if (!p) {
        diagnostics.push(trimmed);
        if (diagnostics.length > maxDiagnosticLines) diagnostics.shift();
        return;
      }

      try {
        if (p.status === "page" && p.page && p.total) {
          // Report each page once, even if the same event is seen again.
          if (p.page !== lastPageReported) {
            lastPageReported = p.page;
            onProgress(`📄 Page ${p.page}/${p.total}`);
          }
        } else if (p.status === "loading" && p.message) {
          onProgress(`⏳ ${p.message}`);
        } else if (p.status === "started" && p.pages) {
          onProgress(`📄 Processing ${p.pages} page(s) with Pix2Text…`);
        } else if (p.status === "done") {
          onProgress(`✅ Pix2Text complete (${p.pages} page(s))`);
        }
        // status "error" is surfaced through stdout and the exit code.
      } catch {
        // A throwing progress callback must not take down the stream handler.
      }
    };

    child.stdout.on("data", (d: Buffer) => outChunks.push(d));

    // Decode as UTF-8 at the stream level so multi-byte characters split
    // across chunks are reassembled before line splitting.
    child.stderr.setEncoding("utf8");
    child.stderr.on("data", (text: string) => {
      pending += text;
      const lines = pending.split("\n");
      pending = lines.pop() ?? "";
      lines.forEach(handleLine);
    });

    child.on("error", (e: NodeJS.ErrnoException) => {
      spawnError = e.code === "ENOENT"
        ? "python3 not found. Install Python 3, then run:\n pip install pix2text"
        : e.message;
      finish(1);
    });

    child.on("close", (exitCode) => finish(exitCode ?? 1));
  });
}

// ── Public API ───────────────────────────────────────────────────────────────

/**
 * OCRs an image or PDF with the Pix2Text Python package.
 *
 * @param filePath   Input file; must be recognised by `isImage` or `isPdf`.
 * @param task       OCR task, forwarded to the Python engine.
 * @param signal     Optional abort signal; aborting stops the Python process.
 * @param onProgress Progress sink.
 * @returns Markdown text, or empty text with a `warning` detail when the
 *          engine produced nothing.
 * @throws Error("Aborted") when aborted; Error with an install hint when
 *         Python or Pix2Text is missing; otherwise Error carrying the tail of
 *         the engine's diagnostics.
 */
export async function pix2textOcr(
  filePath: string, task: Task,
  signal: AbortSignal | undefined, onProgress: OcrProgressCallback,
): Promise<OcrResult> {
  if (!isImage(filePath) && !isPdf(filePath)) {
    throw new Error(`Unsupported file type: ${basename(filePath)}`);
  }
  if (signal?.aborted) throw new Error("Aborted");

  const { stdout, stderr, exitCode } = await execPythonWithProgress(
    PIX2TEXT_ENGINE, [filePath, task], onProgress, signal,
  );

  if (exitCode !== 0) {
    if (signal?.aborted) throw new Error("Aborted");

    // Python tracebacks arrive on stderr, so both streams must be inspected
    // for the install/download hints below to be reachable.
    const msg = [stderr, stdout].filter(Boolean).join("\n") || "Pix2Text failed";
    if (msg.includes("ModuleNotFoundError") || msg.includes("No module named")) {
      throw new Error("Pix2Text not installed. Run:\n pip install pix2text");
    }
    if (msg.includes("table-rec") || msg.includes("pytorch_model")) {
      throw new Error("Pix2Text model download incomplete. Try:\n pip install pix2text --upgrade");
    }
    // The end of a traceback names the actual error, so keep the tail.
    throw new Error(msg.slice(-1000));
  }

  if (!stdout || stdout.startsWith("ERROR:")) {
    return { text: "", details: { backend: "pix2text", task, warning: stdout || "no text detected" } };
  }

  return { text: stdout, details: { backend: "pix2text", task } };
}
|