pi-ocr 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +58 -327
- package/extensions/index.ts +9 -2
- package/extensions/tesseract.ts +132 -0
- package/extensions/types.ts +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -2,396 +2,127 @@
|
|
|
2
2
|
|
|
3
3
|
> ### ⚡ Zero setup. Works out of the box.
|
|
4
4
|
>
|
|
5
|
-
> Default backend
|
|
6
|
-
> Just `pi install
|
|
5
|
+
> Default backend is **MinerU** — a free cloud API.
|
|
6
|
+
> No GPU, no API key, no pip install. Just `pi install` and `/ocr`.
|
|
7
7
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
> Bridges the multimodal gap for non-vision LLMs like **DeepSeek**. When your model can't see images, `pi_ocr` acts as its eyes.
|
|
11
|
-
|
|
12
|
-
## Three Backends — One Tool
|
|
13
|
-
|
|
14
|
-
| Backend | Type | Best For |
|
|
15
|
-
|---|---|---|
|
|
16
|
-
| 🦙 **Ollama** | Local GPU | Math formulas (LaTeX), privacy, offline |
|
|
17
|
-
| ☁️ **MinerU** | Free cloud API | Complex PDFs, no GPU, zero setup |
|
|
18
|
-
| 📐 **Pix2Text** | Local Python | Math formulas + text, free Mathpix alternative |
|
|
19
|
-
|
|
20
|
-
Switch anytime with `/ocr` (no args) — a visual `SettingsList` menu lets you pick and configure everything without editing JSON:
|
|
21
|
-
|
|
22
|
-
```
|
|
23
|
-
/ocr → opens settings: backend, model, PDF split toggle
|
|
24
|
-
/ocr <file> [task] → OCR a file
|
|
25
|
-
```
|
|
26
|
-
|
|
27
|
-
## Features
|
|
28
|
-
|
|
29
|
-
| | |
|
|
30
|
-
|---|---|
|
|
31
|
-
| 🔤 **Text** | General text recognition → Markdown |
|
|
32
|
-
| 🧮 **Formulas** | Math formulas → LaTeX (Ollama glm-ocr: 94.6 OmniDocBench) |
|
|
33
|
-
| 📊 **Tables** | Table structure → Markdown tables |
|
|
34
|
-
| 🖼️ **Figures** | Diagrams and illustrations → descriptions |
|
|
35
|
-
| 📄 **PDF** | Full PDF support across all backends |
|
|
36
|
-
| 🎛️ **Any model** | Ollama works with glm-ocr, llama3.2-vision, minicpm-v, etc. |
|
|
37
|
-
| ☁️ **Free cloud** | MinerU Agent API: no token, ≤10MB, ≤20 pages free |
|
|
38
|
-
| 📦 **Auto-split** | MinerU splits PDFs >20 pages into free-tier chunks |
|
|
8
|
+
OCR for [Pi Coding Agent](https://pi.dev). Bridges the multimodal gap for non-vision LLMs like DeepSeek: when your model can't see images, `pi_ocr` reads them for you.
|
|
39
9
|
|
|
40
10
|
---
|
|
41
11
|
|
|
42
12
|
## Quickstart
|
|
43
13
|
|
|
44
|
-
### One command
|
|
45
|
-
|
|
46
14
|
```bash
|
|
47
15
|
pi install npm:pi-ocr
|
|
16
|
+
/ocr ./screenshot.png
|
|
17
|
+
/ocr ./paper.pdf
|
|
48
18
|
```
|
|
49
19
|
|
|
50
|
-
**That's
|
|
51
|
-
Start OCR'ing immediately:
|
|
52
|
-
|
|
53
|
-
```
|
|
54
|
-
/ocr ./scan.png
|
|
55
|
-
/ocr ./document.pdf
|
|
56
|
-
```
|
|
57
|
-
|
|
58
|
-
> 💡 Want offline OCR or math formulas? Switch backends anytime with `/ocr` (no args).
|
|
20
|
+
**That's all.** MinerU (free cloud API) is the default and requires nothing.
|
|
59
21
|
|
|
60
22
|
---
|
|
61
23
|
|
|
62
|
-
|
|
24
|
+
## Backends
|
|
63
25
|
|
|
64
|
-
|
|
26
|
+
Switch anytime with `/ocr` (no args).
|
|
27
|
+
|
|
28
|
+
| | Backend | Best for | Setup |
|
|
29
|
+
|---|---|---|---|
|
|
30
|
+
| ☁️ | **MinerU** (default) | PDFs, tables, general docs | None — works instantly |
|
|
31
|
+
| 🦙 | Ollama | Math formulas → LaTeX, offline | `brew install ollama && ollama pull glm-ocr` |
|
|
32
|
+
| 🔤 | Tesseract | Plain text, ultra-light (~30MB) | `brew install tesseract` |
|
|
33
|
+
| 📐 | Pix2Text | Math + text, CPU Python | `pip install pix2text` |
|
|
65
34
|
|
|
66
35
|
---
|
|
67
36
|
|
|
68
|
-
|
|
37
|
+
## MinerU (default)
|
|
69
38
|
|
|
70
|
-
|
|
39
|
+
Free cloud API. Handles PDF, PNG, JPG, Docx, PPTx, Xlsx.
|
|
71
40
|
|
|
72
|
-
|
|
73
|
-
# 1. Install Ollama
|
|
74
|
-
brew install ollama
|
|
41
|
+
**Limits:** ≤10MB per file, ≤20 pages per request.
|
|
75
42
|
|
|
76
|
-
|
|
77
|
-
ollama pull glm-ocr
|
|
43
|
+
PDFs >20 pages can be auto-split (enabled by default in `/ocr` settings). Splitting needs `pip install pypdfium2`.
|
|
78
44
|
|
|
79
|
-
|
|
80
|
-
brew install poppler
|
|
81
|
-
```
|
|
45
|
+
---
|
|
82
46
|
|
|
83
|
-
|
|
84
|
-
> Multi-page PDFs need `poppler` for the `pdftoppm` tool.
|
|
47
|
+
## Ollama (optional, for math formulas)
|
|
85
48
|
|
|
86
|
-
|
|
49
|
+
Local GPU OCR via [glm-ocr](https://ollama.com) — state-of-the-art formula recognition (94.6 OmniDocBench). Outputs LaTeX.
|
|
87
50
|
|
|
88
51
|
```bash
|
|
89
|
-
#
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
# 2. Pull the default OCR model (~2.2 GB)
|
|
52
|
+
# macOS
|
|
53
|
+
brew install ollama
|
|
93
54
|
ollama pull glm-ocr
|
|
55
|
+
brew install poppler # multi-page PDFs only
|
|
94
56
|
|
|
95
|
-
#
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
sudo
|
|
57
|
+
# Linux
|
|
58
|
+
curl -fsSL https://ollama.com/install.sh | sh
|
|
59
|
+
ollama pull glm-ocr
|
|
60
|
+
sudo apt install poppler-utils
|
|
99
61
|
```
|
|
100
62
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
```bash
|
|
104
|
-
# Check Ollama is running and model is pulled
|
|
105
|
-
ollama list | grep glm-ocr
|
|
106
|
-
```
|
|
63
|
+
Switch with `/ocr` → "OCR Backend" → ollama.
|
|
107
64
|
|
|
108
65
|
---
|
|
109
66
|
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
Pix2Text runs entirely in Python. Handles images and PDFs via a single API call — no manual conversion needed.
|
|
113
|
-
|
|
114
|
-
#### Step 1: Make sure you have Python 3.9+ (macOS/Linux)
|
|
67
|
+
## Tesseract (optional, for plain text)
|
|
115
68
|
|
|
116
|
-
|
|
117
|
-
|---|---|
|
|
118
|
-
| macOS/Linux | `python3 --version` |
|
|
119
|
-
|
|
120
|
-
> ⚠️ **Important:** Know which Python you're using. Run `which python3` — if it shows `conda`, `brew`, or `/usr/bin/python3`, your `pip install` must target the same Python:
|
|
121
|
-
> ```bash
|
|
122
|
-
> # Conda Python
|
|
123
|
-
> pip install pix2text
|
|
124
|
-
>
|
|
125
|
-
> # System Python (may need --user or sudo)
|
|
126
|
-
> pip install --user pix2text
|
|
127
|
-
>
|
|
128
|
-
> # Brew Python (macOS)
|
|
129
|
-
> /opt/homebrew/bin/pip3 install pix2text
|
|
130
|
-
> ```
|
|
131
|
-
> If unsure, use `python3 -m pip install ...` — this always installs for the active `python3`.
|
|
132
|
-
|
|
133
|
-
#### Step 2: Install packages
|
|
134
|
-
|
|
135
|
-
```bash
|
|
136
|
-
python3 -m pip install pix2text
|
|
137
|
-
```
|
|
138
|
-
|
|
139
|
-
#### Step 3: Verify
|
|
69
|
+
Classic OCR engine. Ultra-lightweight (~30MB), CPU-only, fast. System package, zero Python.
|
|
140
70
|
|
|
141
71
|
```bash
|
|
142
|
-
|
|
72
|
+
brew install tesseract # macOS
|
|
73
|
+
sudo apt install tesseract-ocr # Linux
|
|
143
74
|
```
|
|
144
75
|
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
> First run downloads ONNX models (~50MB) to `~/.pix2text/`.
|
|
148
|
-
|
|
76
|
+
Switch with `/ocr` → "OCR Backend" → tesseract.
|
|
149
77
|
|
|
150
78
|
---
|
|
151
79
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
**No setup required.** The free Agent API works immediately. No token, no account, no install.
|
|
155
|
-
|
|
156
|
-
Free tier limits:
|
|
157
|
-
- ≤ 10 MB per file
|
|
158
|
-
- ≤ 20 pages per request
|
|
159
|
-
- IP-based rate limiting
|
|
80
|
+
## Pix2Text (optional, for math + text on CPU)
|
|
160
81
|
|
|
161
|
-
|
|
162
|
-
> Most PDFs are under 20 pages — you'll likely never need this.
|
|
163
|
-
|
|
164
|
-
For files >10MB, compress first at [ilovepdf.com/compress_pdf](https://ilovepdf.com/compress_pdf).
|
|
165
|
-
|
|
166
|
-
---
|
|
167
|
-
|
|
168
|
-
## Usage
|
|
169
|
-
|
|
170
|
-
### Settings UI
|
|
171
|
-
|
|
172
|
-
```
|
|
173
|
-
/ocr
|
|
174
|
-
```
|
|
82
|
+
Local Python OCR. Mathpix alternative — handles text + formulas on CPU.
|
|
175
83
|
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
```
|
|
179
|
-
┌─ OCR Settings ─────────────────────────────────┐
|
|
180
|
-
│ OCR Backend [ollama / mineru / pix2text] │
|
|
181
|
-
│ MinerU: Split PDF [ON / OFF] │
|
|
182
|
-
│ Ollama Model [glm-ocr] │
|
|
183
|
-
│ ↑↓ navigate • ← → toggle • enter select • esc close │
|
|
184
|
-
└──────────────────────────────────────────────────┘
|
|
84
|
+
```bash
|
|
85
|
+
pip install pix2text
|
|
185
86
|
```
|
|
186
87
|
|
|
187
|
-
|
|
188
|
-
- **MinerU Split**: ← → to toggle ON/OFF — when ON, PDFs >20 pages are auto-split
|
|
189
|
-
- **Model**: Enter opens a sub-menu with recommended models + custom input
|
|
88
|
+
First run downloads ONNX models (~50MB). Switch with `/ocr` → "OCR Backend" → pix2text.
|
|
190
89
|
|
|
191
|
-
|
|
90
|
+
---
|
|
192
91
|
|
|
193
|
-
|
|
194
|
-
/ocr <file> [task] [model]
|
|
195
|
-
```
|
|
92
|
+
## Commands
|
|
196
93
|
|
|
197
|
-
|
|
|
94
|
+
| Command | |
|
|
198
95
|
|---|---|
|
|
199
|
-
| `/ocr
|
|
200
|
-
| `/ocr
|
|
201
|
-
| `/ocr
|
|
202
|
-
| `/ocr ./paper.pdf auto glm-ocr:q8_0` | Use specific model |
|
|
96
|
+
| `/ocr` | Open settings (backend, model, split toggle) |
|
|
97
|
+
| `/ocr <file> [task]` | OCR a file |
|
|
98
|
+
| `/ocr <file> formula` | Math LaTeX output |
|
|
203
99
|
|
|
204
100
|
### Tasks
|
|
205
101
|
|
|
206
|
-
| Task |
|
|
207
|
-
|
|
208
|
-
| `auto`
|
|
209
|
-
| `text` | Plain
|
|
210
|
-
| `formula` |
|
|
211
|
-
| `table` |
|
|
212
|
-
| `figure` |
|
|
213
|
-
|
|
214
|
-
### LLM-invoked (automatic)
|
|
215
|
-
|
|
216
|
-
The extension registers a `pi_ocr` tool. The agent invokes it automatically:
|
|
217
|
-
|
|
218
|
-
```
|
|
219
|
-
> What formula is written in this screenshot?
|
|
220
|
-
> OCR this 50-page PDF into markdown.
|
|
221
|
-
```
|
|
222
|
-
|
|
223
|
-
---
|
|
224
|
-
|
|
225
|
-
## MinerU PDF Splitting
|
|
226
|
-
|
|
227
|
-
When `MinerU: Split PDF >20 pages` is ON (default), large PDFs are automatically split into ≤20-page chunks. Each chunk is a separate request with 3s spacing:
|
|
228
|
-
|
|
229
|
-
```
|
|
230
|
-
Splitting 85-page PDF into ≤20-page chunks…
|
|
231
|
-
[1/5] uploading…
|
|
232
|
-
[1/5] running (12s)
|
|
233
|
-
[1/5] done
|
|
234
|
-
[2/5] waiting rate limit…
|
|
235
|
-
[2/5] uploading…
|
|
236
|
-
[2/5] running (18s)
|
|
237
|
-
[2/5] done
|
|
238
|
-
...
|
|
239
|
-
```
|
|
240
|
-
|
|
241
|
-
---
|
|
242
|
-
|
|
243
|
-
## Backend Comparison
|
|
244
|
-
|
|
245
|
-
| | 🦙 Ollama | ☁️ MinerU | 📐 **Pix2Text** |
|
|
246
|
-
|---|---|---|---|
|
|
247
|
-
| **Setup** | Install Ollama + pull model | None | `pip install` 3 packages |
|
|
248
|
-
| **GPU needed** | Recommended | No | No |
|
|
249
|
-
| **Internet** | No | Yes | No (first run: yes) |
|
|
250
|
-
| **Math formulas** | ⭐⭐⭐⭐⭐ | ⭐⭐⭐ | ⭐ |
|
|
251
|
-
| **Complex PDFs** | ⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐ |
|
|
252
|
-
| **Chinese text** | ⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ |
|
|
253
|
-
| **File size limit** | None | 10MB (free) | None |
|
|
254
|
-
| **Page limit** | None | 20/request (free) | None |
|
|
255
|
-
| **Cost** | Free (local) | Free (rate-limited) | Free (local) |
|
|
256
|
-
|
|
257
|
-
---
|
|
258
|
-
|
|
259
|
-
## Supported File Types
|
|
260
|
-
|
|
261
|
-
| Format | Ollama | MinerU | Pix2Text |
|
|
262
|
-
|---|---|---|---|
|
|
263
|
-
| PNG, JPG, GIF, WEBP, BMP, TIFF | ✅ | ✅ | ✅ |
|
|
264
|
-
| PDF | ✅ | ✅ | ✅ |
|
|
265
|
-
| Docx, PPTx, Xlsx | ❌ | ✅ | ❌ |
|
|
266
|
-
|
|
267
|
-
---
|
|
268
|
-
|
|
269
|
-
## PDF Support Details
|
|
270
|
-
|
|
271
|
-
| Backend | Conversion method | System deps |
|
|
272
|
-
|---|---|---|
|
|
273
|
-
| **Ollama** | `sips` (macOS page 1) / `pdftoppm` (multi-page, Linux) | `poppler` (multi-page only) |
|
|
274
|
-
| **MinerU** | Direct PDF upload — no conversion | None |
|
|
275
|
-
| **Pix2Text** | Built-in `recognize_pdf()` — no system deps | None |
|
|
276
|
-
|
|
277
|
-
---
|
|
278
|
-
|
|
279
|
-
## Configuration
|
|
280
|
-
|
|
281
|
-
All settings are persisted to `~/.pi/agent/settings.json`:
|
|
282
|
-
|
|
283
|
-
```json
|
|
284
|
-
{
|
|
285
|
-
"minimodelOcr": {
|
|
286
|
-
"backend": "ollama",
|
|
287
|
-
"model": "glm-ocr",
|
|
288
|
-
"ollamaHost": "http://localhost:11434",
|
|
289
|
-
"mineruSplitPdf": true
|
|
290
|
-
}
|
|
291
|
-
}
|
|
292
|
-
```
|
|
293
|
-
|
|
294
|
-
Change settings via `/ocr` (interactive) or edit directly. Environment variables override file settings:
|
|
295
|
-
|
|
296
|
-
```bash
|
|
297
|
-
export OLLAMA_HOST="http://localhost:11434"
|
|
298
|
-
export OCR_MODEL="glm-ocr"
|
|
299
|
-
```
|
|
102
|
+
| Task | Output |
|
|
103
|
+
|---|---|
|
|
104
|
+
| `auto` (default) | Markdown + LaTeX |
|
|
105
|
+
| `text` | Plain Markdown |
|
|
106
|
+
| `formula` | LaTeX only |
|
|
107
|
+
| `table` | Markdown tables |
|
|
108
|
+
| `figure` | Description |
|
|
300
109
|
|
|
301
110
|
---
|
|
302
111
|
|
|
303
112
|
## Troubleshooting
|
|
304
113
|
|
|
305
|
-
|
|
114
|
+
**"Is Ollama running?"** → `ollama serve`
|
|
306
115
|
|
|
307
|
-
|
|
308
|
-
# Start Ollama in the background
|
|
309
|
-
ollama serve
|
|
310
|
-
```
|
|
116
|
+
**MinerU 429** → Rate limited. Wait a minute or switch backend.
|
|
311
117
|
|
|
312
|
-
|
|
118
|
+
**"python3 not found" (Pix2Text)** → Install Python 3.9+ and make sure `python3` is on your PATH, then run `python3 -m pip install pix2text`
|
|
313
119
|
|
|
314
|
-
|
|
315
|
-
ollama pull glm-ocr
|
|
316
|
-
```
|
|
120
|
+
**"tesseract not found"** → `brew install tesseract` (macOS) / `sudo apt install tesseract-ocr` (Linux)
|
|
317
121
|
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
```bash
|
|
321
|
-
# Check your python:
|
|
322
|
-
which python3 && python3 --version
|
|
323
|
-
|
|
324
|
-
# If using conda:
|
|
325
|
-
conda activate base && pip install pix2text
|
|
326
|
-
|
|
327
|
-
# If using system python:
|
|
328
|
-
python3 -m pip install --user pix2text
|
|
329
|
-
```
|
|
330
|
-
|
|
331
|
-
### Pix2Text: "No module named 'pix2text'"
|
|
332
|
-
|
|
333
|
-
You likely installed with a different `pip` than your active `python3`:
|
|
334
|
-
|
|
335
|
-
```bash
|
|
336
|
-
python3 -m pip install pix2text
|
|
337
|
-
```
|
|
338
|
-
```
|
|
339
|
-
|
|
340
|
-
### MinerU: "429 Too Many Requests"
|
|
341
|
-
|
|
342
|
-
IP rate limit hit. Wait 1-2 minutes, or switch to Ollama/Pix2Text with `/ocr`.
|
|
343
|
-
|
|
344
|
-
### MinerU: "file page count exceeds lightweight API limit"
|
|
345
|
-
|
|
346
|
-
Enable PDF splitting: `/ocr` → toggle "MinerU: Split PDF" to ON.
|
|
347
|
-
|
|
348
|
-
### MinerU: "File too large for free MinerU API"
|
|
349
|
-
|
|
350
|
-
Compress the PDF at [ilovepdf.com/compress_pdf](https://ilovepdf.com/compress_pdf) or switch to a local backend with `/ocr`.
|
|
351
|
-
|
|
352
|
-
### macOS multi-page PDF: "pdftoppm not found"
|
|
353
|
-
|
|
354
|
-
```bash
|
|
355
|
-
brew install poppler
|
|
356
|
-
```
|
|
357
|
-
|
|
358
|
-
### Linux multi-page PDF: "pdftoppm not found"
|
|
359
|
-
|
|
360
|
-
```bash
|
|
361
|
-
# Debian/Ubuntu
|
|
362
|
-
sudo apt install poppler-utils
|
|
363
|
-
|
|
364
|
-
# Fedora
|
|
365
|
-
sudo dnf install poppler-utils
|
|
366
|
-
|
|
367
|
-
# Arch
|
|
368
|
-
sudo pacman -S poppler
|
|
369
|
-
```
|
|
122
|
+
**"pdftoppm not found" (Ollama multi-page)** → `brew install poppler` (macOS) / `sudo apt install poppler-utils` (Linux)
|
|
370
123
|
|
|
371
124
|
---
|
|
372
125
|
|
|
373
|
-
## How It Works
|
|
374
|
-
|
|
375
|
-
```
|
|
376
|
-
┌──────────────────┐ ┌──────────────────┐ ┌──────────────────────┐
|
|
377
|
-
│ pi (DeepSeek) │────▶│ pi_ocr │────▶│ Ollama / MinerU │
|
|
378
|
-
│ (no vision) │ │ pi extension │ │ / Pix2Text │
|
|
379
|
-
└──────────────────┘ └──────────────────┘ └──────────────────────┘
|
|
380
|
-
│ │ │
|
|
381
|
-
│ "read this image" │ POST /api/generate │
|
|
382
|
-
│────────────────────────▶│ (Ollama) │
|
|
383
|
-
│ │ or POST /api/v1/agent │
|
|
384
|
-
│ │ (MinerU) │
|
|
385
|
-
│ │ or python3 subprocess │
|
|
386
|
-
│ │ (Pix2Text) │
|
|
387
|
-
│ │──────────────────────────▶│
|
|
388
|
-
│ │ OCR text response │
|
|
389
|
-
│ LaTeX / Markdown │◀──────────────────────────│
|
|
390
|
-
│◀────────────────────────│ │
|
|
391
|
-
```
|
|
392
|
-
|
|
393
|
-
The tool dispatches to your selected backend. Switch anytime with `/ocr` — no restart needed.
|
|
394
|
-
|
|
395
126
|
## License
|
|
396
127
|
|
|
397
128
|
MIT
|
package/extensions/index.ts
CHANGED
|
@@ -45,6 +45,7 @@ import type { Backend, Task, OcrConfig } from "./types";
|
|
|
45
45
|
import { TASKS, BACKENDS } from "./types";
|
|
46
46
|
import { isImage, isPdf, getPdfPageCount, ollamaOcr, ollamaCheckModel, ollamaPullModel } from "./ollama";
|
|
47
47
|
import { mineruOcr } from "./mineru";
|
|
48
|
+
import { tesseractOcr } from "./tesseract";
|
|
48
49
|
import { pix2textOcr } from "./pix2text";
|
|
49
50
|
|
|
50
51
|
// ── Config persistence ───────────────────────────────────────────────────────
|
|
@@ -138,7 +139,7 @@ const ocrTool = defineTool({
|
|
|
138
139
|
throw new Error(`Unsupported file type "${extname(filePath)}". Supported: PNG, JPG, GIF, WEBP, BMP, TIFF, PDF.`);
|
|
139
140
|
}
|
|
140
141
|
|
|
141
|
-
const backendLabel = { ollama: "🦙 Ollama",
|
|
142
|
+
const backendLabel = { mineru: "☁️ MinerU", ollama: "🦙 Ollama", tesseract: "🔤 Tesseract", pix2text: "📐 Pix2Text" }[config.backend];
|
|
142
143
|
onUpdate?.({ content: [{ type: "text", text: `🔍 OCR ${basename(filePath)} via ${backendLabel} (${resolvedTask})…` }], details: {} });
|
|
143
144
|
|
|
144
145
|
const onProgress = (msg: string) => onUpdate?.({ content: [{ type: "text", text: msg }], details: {} });
|
|
@@ -159,6 +160,9 @@ const ocrTool = defineTool({
|
|
|
159
160
|
result = await mineruOcr(filePath, resolvedTask, config.mineruSplitPdf, signal, onProgress);
|
|
160
161
|
break;
|
|
161
162
|
}
|
|
163
|
+
case "tesseract":
|
|
164
|
+
result = await tesseractOcr(filePath, resolvedTask, signal, onProgress);
|
|
165
|
+
break;
|
|
162
166
|
case "pix2text":
|
|
163
167
|
result = await pix2textOcr(filePath, resolvedTask, signal, onProgress);
|
|
164
168
|
break;
|
|
@@ -175,6 +179,7 @@ const ocrTool = defineTool({
|
|
|
175
179
|
const msg = e.message || String(e);
|
|
176
180
|
let hint = "";
|
|
177
181
|
if (config.backend === "ollama" && (msg.includes("fetch failed") || msg.includes("ECONNREFUSED"))) hint = "\n\n💡 Is Ollama running? Start: `ollama serve`";
|
|
182
|
+
else if (config.backend === "tesseract" && msg.includes("not found")) hint = "\n\n💡 Install: `brew install tesseract` (macOS) or `sudo apt install tesseract-ocr` (Linux)";
|
|
178
183
|
else if (config.backend === "pix2text" && msg.includes("python3")) hint = "\n\n💡 Install: `pip install pix2text`";
|
|
179
184
|
else if (config.backend === "mineru" && msg.includes("429")) hint = "\n\n💡 MinerU rate limit. Wait a minute or switch backend with /ocr.";
|
|
180
185
|
else if (config.backend === "mineru" && msg.includes("too large")) hint = "\n\n💡 Compress at https://ilovepdf.com/compress_pdf or switch backend.";
|
|
@@ -292,7 +297,9 @@ export default function ocrExtension(pi: ExtensionAPI) {
|
|
|
292
297
|
".\nLarge files? Compress at https://ilovepdf.com/compress_pdf",
|
|
293
298
|
"info",
|
|
294
299
|
);
|
|
295
|
-
} else if (backend === "
|
|
300
|
+
} else if (backend === "tesseract") {
|
|
301
|
+
ctx.ui.notify("🔤 Tesseract: `brew install tesseract` (macOS) or `sudo apt install tesseract-ocr` (Linux). ~30MB, CPU-only.", "warning");
|
|
302
|
+
} else if (backend === "pix2text") {
|
|
296
303
|
ctx.ui.notify("🐍 Pix2Text: needs `pip install pix2text`", "warning");
|
|
297
304
|
}
|
|
298
305
|
break;
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pi-ocr — Tesseract backend
|
|
3
|
+
*
|
|
4
|
+
* Uses Tesseract OCR (https://github.com/tesseract-ocr/tesseract) —
|
|
5
|
+
* the classic open-source OCR engine. Ultra-lightweight (~30MB),
|
|
6
|
+
* zero Python deps, CPU-only, fast on plain text.
|
|
7
|
+
*
|
|
8
|
+
* Prerequisites:
|
|
9
|
+
* brew install tesseract # macOS
|
|
10
|
+
* sudo apt install tesseract-ocr # Linux
|
|
11
|
+
*
|
|
12
|
+
* For non-English languages, install the corresponding lang pack:
|
|
13
|
+
* brew install tesseract-lang # macOS (all languages)
|
|
14
|
+
* sudo apt install tesseract-ocr-chi-sim # Chinese simplified
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import { mkdtempSync, readdirSync, unlinkSync, rmdirSync } from "node:fs";
|
|
18
|
+
import { basename, join } from "node:path";
|
|
19
|
+
import { tmpdir } from "node:os";
|
|
20
|
+
import { spawn } from "node:child_process";
|
|
21
|
+
import type { Task, OcrResult, OcrProgressCallback } from "./types";
|
|
22
|
+
import { isImage, isPdf, getPdfPageCount } from "./ollama";
|
|
23
|
+
|
|
24
|
+
// ── Helpers ──────────────────────────────────────────────────────────────────
|
|
25
|
+
|
|
26
|
+
/**
 * Run an external command and capture its output. The returned promise never
 * rejects; every failure is reported through the resolved value instead.
 *
 * @param cmd  Executable to launch via `spawn` (no shell is involved).
 * @param args Arguments passed to the executable verbatim.
 * @returns Trimmed UTF-8 `stdout` and `stderr` plus the exit `code`. When the
 *          process cannot be started, or it closes without an exit code,
 *          `code` is 1. A missing executable yields a `stderr` of the form
 *          "<cmd> not found. …" so callers can match on "not found".
 */
async function execCapture(cmd: string, args: string[]): Promise<{ stdout: string; stderr: string; code: number }> {
  return new Promise((resolve) => {
    const child = spawn(cmd, args, { stdio: ["ignore", "pipe", "pipe"] });
    const out: Buffer[] = [];
    const err: Buffer[] = [];
    child.stdout.on("data", (d: Buffer) => out.push(d));
    child.stderr.on("data", (d: Buffer) => err.push(d));
    // Name the command that actually failed: this helper also launches
    // `sips` and `pdftoppm`, so blaming tesseract unconditionally is wrong.
    child.on("error", (e: Error & { code?: string }) => resolve({
      stdout: "",
      stderr: e.code === "ENOENT"
        ? `${cmd} not found. Is it installed and on your PATH?`
        : `${cmd} failed to start: ${e.message}`,
      code: 1,
    }));
    // If "error" already resolved the promise, this second resolve is a no-op.
    child.on("close", (code) => resolve({
      stdout: Buffer.concat(out).toString("utf8").trim(),
      stderr: Buffer.concat(err).toString("utf8").trim(),
      code: code ?? 1,
    }));
  });
}
|
|
43
|
+
|
|
44
|
+
/**
 * Best-effort removal of a flat scratch directory: unlink every entry
 * directly inside `dir`, then remove `dir` itself.
 *
 * Any filesystem error stops the cleanup and is deliberately swallowed.
 *
 * @param dir Path of the directory to delete.
 */
function cleanupDir(dir: string) {
  try {
    readdirSync(dir).forEach((entry) => {
      unlinkSync(join(dir, entry));
    });
    rmdirSync(dir);
  } catch {
    /* best effort */
  }
}
|
|
50
|
+
|
|
51
|
+
// ── PDF → image conversion (reuses ollama helpers approach) ──────────────────
|
|
52
|
+
|
|
53
|
+
/**
 * Render one page of a PDF to a PNG file.
 *
 * On macOS the first page (`pageIndex === 0`) is converted with `sips`;
 * every other case uses `pdftoppm` at 200 DPI.
 *
 * @param pdfPath   Path of the source PDF.
 * @param pageIndex Zero-based page index (converted to 1-based for `pdftoppm`).
 * @param outPath   Destination path of the PNG. `pdftoppm` is given this path
 *                  with the `.png` suffix stripped, together with `-singlefile`.
 * @throws Error when the converter cannot be started or exits with a non-zero
 *         code. `execCapture` never rejects, so without this check a failed
 *         conversion would go unnoticed here and only surface later as an
 *         OCR error on a missing image.
 */
async function convertPdfPage(pdfPath: string, pageIndex: number, outPath: string): Promise<void> {
  const useSips = process.platform === "darwin" && pageIndex === 0;
  const tool = useSips ? "sips" : "pdftoppm";
  const pageNo = String(pageIndex + 1);
  const args = useSips
    ? ["-s", "format", "png", pdfPath, "--out", outPath]
    : ["-png", "-r", "200", "-f", pageNo, "-l", pageNo, "-singlefile", pdfPath, outPath.replace(/\.png$/, "")];

  const { stderr, code } = await execCapture(tool, args);
  if (code !== 0) {
    const detail = stderr || `exit code ${code}`;
    throw new Error(`${tool} failed on page ${pageNo}: ${detail}`.slice(0, 500));
  }
}
|
|
65
|
+
|
|
66
|
+
// ── Tesseract OCR ────────────────────────────────────────────────────────────
|
|
67
|
+
|
|
68
|
+
/**
 * OCR a single image with the `tesseract` CLI and return the recognized text.
 *
 * Tesseract is asked to write to stdout, using the English language model
 * (`-l eng`) and page segmentation mode 3 (auto page segmentation).
 *
 * @param imagePath Path of the image to recognize.
 * @param _task     Requested OCR task; currently unused by this backend.
 * @returns The stdout text as returned by `execCapture`.
 * @throws Error with an install hint when the failure text mentions
 *         "not found" or "ENOENT"; otherwise an Error carrying the first
 *         500 characters of stderr (or "tesseract failed" if stderr is empty).
 */
async function tesseractImage(imagePath: string, _task: Task): Promise<string> {
  const cliArgs = [imagePath, "stdout", "-l", "eng", "--psm", "3"];
  const run = await execCapture("tesseract", cliArgs);

  if (run.code === 0) return run.stdout;

  const reason = run.stderr || "tesseract failed";
  const binaryMissing = reason.includes("not found") || reason.includes("ENOENT");
  throw new Error(
    binaryMissing ? "tesseract not found. Install: brew install tesseract" : reason.slice(0, 500),
  );
}
|
|
85
|
+
|
|
86
|
+
// ── Public API ───────────────────────────────────────────────────────────────
|
|
87
|
+
|
|
88
|
+
/**
 * OCR a PDF or image file with the Tesseract backend.
 *
 * PDFs are processed page by page: each page is rendered to a PNG inside a
 * temporary directory, recognized, and emitted under a `## Page N` heading.
 * A page whose conversion throws is replaced by a "Skipped" note; a page with
 * no recognized text gets a "No text detected" note. Images are recognized
 * directly. The temporary directory, if created, is removed on every exit path.
 *
 * @param filePath   File to OCR; must satisfy `isPdf` or `isImage`.
 * @param task       Requested task, forwarded to the recognizer and echoed in
 *                   the result details.
 * @param signal     Optional abort signal, checked before each PDF page.
 * @param onProgress Callback receiving human-readable progress messages.
 * @returns The recognized text plus `{ backend: "tesseract", task }` details.
 * @throws Error "Aborted" when the signal is aborted between pages, an
 *         "Unsupported file type" error for other files, or whatever the
 *         recognizer throws.
 */
export async function tesseractOcr(
  filePath: string, task: Task,
  signal: AbortSignal | undefined, onProgress: OcrProgressCallback,
): Promise<OcrResult> {
  let scratchDir: string | null = null;
  let resultText = "";

  try {
    if (isPdf(filePath)) {
      onProgress("📄 Converting PDF pages to images…");
      scratchDir = mkdtempSync(join(tmpdir(), "pi-tesseract-"));
      const pageCount = await getPdfPageCount(filePath);

      const sections: string[] = [];
      for (let idx = 0; idx < pageCount; idx += 1) {
        if (signal?.aborted) throw new Error("Aborted");

        const pageNo = idx + 1;
        const pngPath = join(scratchDir, `page_${pageNo}.png`);

        try {
          await convertPdfPage(filePath, idx, pngPath);
        } catch (err: any) {
          // Conversion problems are reported inline; remaining pages still run.
          sections.push(`## Page ${pageNo}\n\n> ⚠️ Skipped: ${err.message}`);
          continue;
        }

        onProgress(`📄 Page ${pageNo}/${pageCount}`);
        const recognized = await tesseractImage(pngPath, task);
        const body = recognized.trim() ? recognized : "> ⚠️ No text detected";
        sections.push(`## Page ${pageNo}\n\n${body}`);
      }
      resultText = sections.join("\n\n");
    } else if (isImage(filePath)) {
      resultText = await tesseractImage(filePath, task);
    } else {
      throw new Error(`Unsupported file type: ${basename(filePath)}`);
    }

    return { text: resultText, details: { backend: "tesseract", task } };
  } finally {
    if (scratchDir) cleanupDir(scratchDir);
  }
}
|
package/extensions/types.ts
CHANGED
|
@@ -6,7 +6,7 @@ export const TASKS = ["text", "formula", "table", "figure", "auto"] as const;
|
|
|
6
6
|
export type Task = (typeof TASKS)[number];
|
|
7
7
|
|
|
8
8
|
/** All supported OCR backends */
|
|
9
|
-
export const BACKENDS = ["ollama", "
|
|
9
|
+
export const BACKENDS = ["mineru", "ollama", "tesseract", "pix2text"] as const;
|
|
10
10
|
export type Backend = (typeof BACKENDS)[number];
|
|
11
11
|
|
|
12
12
|
export interface OcrConfig {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pi-ocr",
|
|
3
|
-
"version": "1.0
|
|
3
|
+
"version": "1.1.0",
|
|
4
4
|
"description": "Pi extension: Zero-setup multi-backend OCR — MinerU (free cloud), Ollama (local GPU, LaTeX formulas), Pix2Text (local Python). Extract text, formulas, and tables from images and PDFs. Default: zero config, works out of the box.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"pi-package",
|