pi-ocr 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -7,7 +7,7 @@
7
7
 
8
8
  Multi-backend OCR for [Pi Coding Agent](https://pi.dev) — extract text, LaTeX math formulas, and tables from images and PDFs. Choose the backend that fits your needs: free cloud API, local GPU, or pure Python.
9
9
 
10
- > Bridges the multimodal gap for non-vision LLMs like **DeepSeek**. When your model can't see images, `minimodel_ocr` acts as its eyes.
10
+ > Bridges the multimodal gap for non-vision LLMs like **DeepSeek**. When your model can't see images, `pi_ocr` acts as its eyes.
11
11
 
12
12
  ## Three Backends — One Tool
13
13
 
@@ -158,6 +158,9 @@ Free tier limits:
158
158
  - ≤ 20 pages per request
159
159
  - IP-based rate limiting
160
160
 
161
+ > 💡 PDFs >20 pages: auto-splitting needs `python3` + `pypdfium2` (`pip install pypdfium2`).
162
+ > Most PDFs are under 20 pages — you'll likely never need this.
163
+
161
164
  For files >10MB, compress first at [ilovepdf.com/compress_pdf](https://ilovepdf.com/compress_pdf).
162
165
 
163
166
  ---
@@ -210,7 +213,7 @@ Opens an interactive `SettingsList` with keyboard navigation:
210
213
 
211
214
  ### LLM-invoked (automatic)
212
215
 
213
- The extension registers a `minimodel_ocr` tool. The agent invokes it automatically:
216
+ The extension registers a `pi_ocr` tool. The agent invokes it automatically:
214
217
 
215
218
  ```
216
219
  > What formula is written in this screenshot?
@@ -371,7 +374,7 @@ sudo pacman -S poppler
371
374
 
372
375
  ```
373
376
  ┌──────────────────┐ ┌──────────────────┐ ┌──────────────────────┐
374
- │ pi (DeepSeek) │────▶│ minimodel_ocr │────▶│ Ollama / MinerU │
377
+ │ pi (DeepSeek) │────▶│ pi_ocr │────▶│ Ollama / MinerU │
375
378
  │ (no vision) │ │ pi extension │ │ / Pix2Text │
376
379
  └──────────────────┘ └──────────────────┘ └──────────────────────┘
377
380
  │ │ │
@@ -1,7 +1,7 @@
1
1
  /**
2
- * pi-minimodel-ocr — Multi-backend OCR for Pi Coding Agent
2
+ * pi-ocr — Multi-backend OCR for Pi Coding Agent
3
3
  *
4
- * Registers a `minimodel_ocr` tool that the LLM can call to read images and PDFs
4
+ * Registers a `pi_ocr` tool that the LLM can call to read images and PDFs
5
5
  * using one of three backends:
6
6
  * - Ollama (local vision models like glm-ocr)
7
7
  * - MinerU API (free Agent API, ≤10MB, ≤20 pages)
@@ -19,7 +19,7 @@
19
19
  * Pix2Text: pip install pix2text
20
20
  * PDF tools: brew install poppler (macOS multi-page PDF for Ollama)
21
21
  *
22
- * Install: pi install npm:pi-minimodel-ocr
22
+ * Install: pi install npm:pi-ocr
23
23
  */
24
24
 
25
25
  import { Type } from "@earendil-works/pi-ai";
@@ -111,7 +111,7 @@ const ocrSchema = Type.Object({
111
111
  });
112
112
 
113
113
  const ocrTool = defineTool({
114
- name: "minimodel_ocr",
114
+ name: "pi_ocr",
115
115
  label: "Minimodel OCR",
116
116
  description:
117
117
  "Extract text, math formulas (LaTeX), and tables from images or PDFs using local Ollama vision models. " +
@@ -120,9 +120,9 @@ const ocrTool = defineTool({
120
120
  promptSnippet:
121
121
  "Extract text/formulas/tables from images and PDFs using local Ollama OCR",
122
122
  promptGuidelines: [
123
- "When the user asks about the content of an image or PDF, use minimodel_ocr to extract the text first.",
124
- "For mathematical documents, use minimodel_ocr with task='formula' or task='auto' to get LaTeX output.",
125
- "Use minimodel_ocr with task='auto' for general document OCR to extract all text, formulas, tables, and figures.",
123
+ "When the user asks about the content of an image or PDF, use pi_ocr to extract the text first.",
124
+ "For mathematical documents, use pi_ocr with task='formula' or task='auto' to get LaTeX output.",
125
+ "Use pi_ocr with task='auto' for general document OCR to extract all text, formulas, tables, and figures.",
126
126
  ],
127
127
  parameters: ocrSchema,
128
128
  async execute(_toolCallId, params, signal, onUpdate, _ctx) {
@@ -408,7 +408,7 @@ export default function ocrExtension(pi: ExtensionAPI) {
408
408
  const text = config.backend === "ollama"
409
409
  ? `OCR: ollama ${config.model}`
410
410
  : `OCR: ${config.backend}`;
411
- ctx.ui.setStatus("minimodel-ocr", text);
411
+ ctx.ui.setStatus("pi-ocr", text);
412
412
  }
413
413
 
414
414
  // ── Startup ────────────────────────────────────────────────────────────────
@@ -430,5 +430,5 @@ export default function ocrExtension(pi: ExtensionAPI) {
430
430
  }
431
431
  });
432
432
 
433
- console.log("[pi-ocr] Loaded — /ocr (file or settings), tool: minimodel_ocr, default: mineru");
433
+ console.log("[pi-ocr] Loaded — /ocr (file or settings), tool: pi_ocr, default: mineru");
434
434
  }
@@ -1,5 +1,5 @@
1
1
  /**
2
- * pi-minimodel-ocr — MinerU API backend
2
+ * pi-ocr — MinerU API backend
3
3
  *
4
4
  * Uses the free Agent Lightweight API (no token required):
5
5
  * - File ≤10MB, ≤20 pages → one free request
@@ -1,5 +1,5 @@
1
1
  /**
2
- * pi-minimodel-ocr — Ollama backend
2
+ * pi-ocr — Ollama backend
3
3
  *
4
4
  * Uses any locally-running Ollama vision model (default: glm-ocr) to OCR
5
5
  * images and PDFs. Converts PDF pages to PNG before sending to Ollama.
@@ -1,5 +1,5 @@
1
1
  /**
2
- * pi-minimodel-ocr — Pix2Text backend
2
+ * pi-ocr — Pix2Text backend
3
3
  *
4
4
  * Uses Pix2Text (https://github.com/breezedeus/Pix2Text) — an open-source
5
5
  * Python alternative to Mathpix. Recognizes layouts, text, math formulas (LaTeX),
@@ -1,5 +1,5 @@
1
1
  /**
2
- * pi-minimodel-ocr — shared types for OCR backends
2
+ * pi-ocr — shared types for OCR backends
3
3
  */
4
4
 
5
5
  export const TASKS = ["text", "formula", "table", "figure", "auto"] as const;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pi-ocr",
3
- "version": "1.0.0",
3
+ "version": "1.0.1",
4
4
  "description": "Pi extension: Zero-setup multi-backend OCR — MinerU (free cloud), Ollama (local GPU, LaTeX formulas), Pix2Text (local Python). Extract text, formulas, and tables from images and PDFs. Default: zero config, works out of the box.",
5
5
  "keywords": [
6
6
  "pi-package",