pi-ocr 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -3
- package/extensions/index.ts +9 -9
- package/extensions/mineru.ts +1 -1
- package/extensions/ollama.ts +1 -1
- package/extensions/pix2text.ts +1 -1
- package/extensions/types.ts +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
|
|
8
8
|
Multi-backend OCR for [Pi Coding Agent](https://pi.dev) — extract text, LaTeX math formulas, and tables from images and PDFs. Choose the backend that fits your needs: free cloud API, local GPU, or pure Python.
|
|
9
9
|
|
|
10
|
-
> Bridges the multimodal gap for non-vision LLMs like **DeepSeek**. When your model can't see images, `
|
|
10
|
+
> Bridges the multimodal gap for non-vision LLMs like **DeepSeek**. When your model can't see images, `pi_ocr` acts as its eyes.
|
|
11
11
|
|
|
12
12
|
## Three Backends — One Tool
|
|
13
13
|
|
|
@@ -158,6 +158,9 @@ Free tier limits:
|
|
|
158
158
|
- ≤ 20 pages per request
|
|
159
159
|
- IP-based rate limiting
|
|
160
160
|
|
|
161
|
+
> 💡 PDFs >20 pages: auto-splitting needs `python3` + `pypdfium2` (`pip install pypdfium2`).
|
|
162
|
+
> Most PDFs are under 20 pages — you'll likely never need this.
|
|
163
|
+
|
|
161
164
|
For files >10MB, compress first at [ilovepdf.com/compress_pdf](https://ilovepdf.com/compress_pdf).
|
|
162
165
|
|
|
163
166
|
---
|
|
@@ -210,7 +213,7 @@ Opens an interactive `SettingsList` with keyboard navigation:
|
|
|
210
213
|
|
|
211
214
|
### LLM-invoked (automatic)
|
|
212
215
|
|
|
213
|
-
The extension registers a `
|
|
216
|
+
The extension registers a `pi_ocr` tool. The agent invokes it automatically:
|
|
214
217
|
|
|
215
218
|
```
|
|
216
219
|
> What formula is written in this screenshot?
|
|
@@ -371,7 +374,7 @@ sudo pacman -S poppler
|
|
|
371
374
|
|
|
372
375
|
```
|
|
373
376
|
┌──────────────────┐ ┌──────────────────┐ ┌──────────────────────┐
|
|
374
|
-
│ pi (DeepSeek) │────▶│
|
|
377
|
+
│ pi (DeepSeek) │────▶│ pi_ocr │────▶│ Ollama / MinerU │
|
|
375
378
|
│ (no vision) │ │ pi extension │ │ / Pix2Text │
|
|
376
379
|
└──────────────────┘ └──────────────────┘ └──────────────────────┘
|
|
377
380
|
│ │ │
|
package/extensions/index.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* pi-
|
|
2
|
+
* pi-ocr — Multi-backend OCR for Pi Coding Agent
|
|
3
3
|
*
|
|
4
|
-
* Registers a `
|
|
4
|
+
* Registers a `pi_ocr` tool that the LLM can call to read images and PDFs
|
|
5
5
|
* using one of three backends:
|
|
6
6
|
* - Ollama (local vision models like glm-ocr)
|
|
7
7
|
* - MinerU API (free Agent API, ≤10MB, ≤20 pages)
|
|
@@ -19,7 +19,7 @@
|
|
|
19
19
|
* Pix2Text: pip install pix2text
|
|
20
20
|
* PDF tools: brew install poppler (macOS multi-page PDF for Ollama)
|
|
21
21
|
*
|
|
22
|
-
* Install: pi install npm:pi-
|
|
22
|
+
* Install: pi install npm:pi-ocr
|
|
23
23
|
*/
|
|
24
24
|
|
|
25
25
|
import { Type } from "@earendil-works/pi-ai";
|
|
@@ -111,7 +111,7 @@ const ocrSchema = Type.Object({
|
|
|
111
111
|
});
|
|
112
112
|
|
|
113
113
|
const ocrTool = defineTool({
|
|
114
|
-
name: "
|
|
114
|
+
name: "pi_ocr",
|
|
115
115
|
label: "Minimodel OCR",
|
|
116
116
|
description:
|
|
117
117
|
"Extract text, math formulas (LaTeX), and tables from images or PDFs using local Ollama vision models. " +
|
|
@@ -120,9 +120,9 @@ const ocrTool = defineTool({
|
|
|
120
120
|
promptSnippet:
|
|
121
121
|
"Extract text/formulas/tables from images and PDFs using local Ollama OCR",
|
|
122
122
|
promptGuidelines: [
|
|
123
|
-
"When the user asks about the content of an image or PDF, use
|
|
124
|
-
"For mathematical documents, use
|
|
125
|
-
"Use
|
|
123
|
+
"When the user asks about the content of an image or PDF, use pi_ocr to extract the text first.",
|
|
124
|
+
"For mathematical documents, use pi_ocr with task='formula' or task='auto' to get LaTeX output.",
|
|
125
|
+
"Use pi_ocr with task='auto' for general document OCR to extract all text, formulas, tables, and figures.",
|
|
126
126
|
],
|
|
127
127
|
parameters: ocrSchema,
|
|
128
128
|
async execute(_toolCallId, params, signal, onUpdate, _ctx) {
|
|
@@ -408,7 +408,7 @@ export default function ocrExtension(pi: ExtensionAPI) {
|
|
|
408
408
|
const text = config.backend === "ollama"
|
|
409
409
|
? `OCR: ollama ${config.model}`
|
|
410
410
|
: `OCR: ${config.backend}`;
|
|
411
|
-
ctx.ui.setStatus("
|
|
411
|
+
ctx.ui.setStatus("pi-ocr", text);
|
|
412
412
|
}
|
|
413
413
|
|
|
414
414
|
// ── Startup ────────────────────────────────────────────────────────────────
|
|
@@ -430,5 +430,5 @@ export default function ocrExtension(pi: ExtensionAPI) {
|
|
|
430
430
|
}
|
|
431
431
|
});
|
|
432
432
|
|
|
433
|
-
console.log("[pi-ocr] Loaded — /ocr (file or settings), tool:
|
|
433
|
+
console.log("[pi-ocr] Loaded — /ocr (file or settings), tool: pi_ocr, default: mineru");
|
|
434
434
|
}
|
package/extensions/mineru.ts
CHANGED
package/extensions/ollama.ts
CHANGED
package/extensions/pix2text.ts
CHANGED
package/extensions/types.ts
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pi-ocr",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.1",
|
|
4
4
|
"description": "Pi extension: Zero-setup multi-backend OCR — MinerU (free cloud), Ollama (local GPU, LaTeX formulas), Pix2Text (local Python). Extract text, formulas, and tables from images and PDFs. Default: zero config, works out of the box.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"pi-package",
|