@xiaotianxt/skills 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/EXCLUDED.md +42 -0
- package/LICENSE +21 -0
- package/README.md +165 -0
- package/SECURITY.md +23 -0
- package/SOURCES.md +45 -0
- package/bin/skills.mjs +241 -0
- package/package.json +38 -0
- package/skills/1password/SKILL.md +94 -0
- package/skills/1password/agents/openai.yaml +4 -0
- package/skills/1password/references/item-management.md +80 -0
- package/skills/1password/references/op-cli.md +107 -0
- package/skills/apple-calendar-event/SKILL.md +81 -0
- package/skills/apple-calendar-event/agents/openai.yaml +4 -0
- package/skills/apple-calendar-event/scripts/calendar_audit.py +201 -0
- package/skills/apple-calendar-event/scripts/calendar_event.py +164 -0
- package/skills/bro-browser/SKILL.md +118 -0
- package/skills/bro-browser/agents/openai.yaml +4 -0
- package/skills/bro-browser/references/tool-map.md +102 -0
- package/skills/bro-browser/references/workflows.md +146 -0
- package/skills/bro-browser/scripts/bro-call.mjs +189 -0
- package/skills/calendar/SKILL.md +182 -0
- package/skills/calendar/agents/openai.yaml +4 -0
- package/skills/calendar/references/operations.md +255 -0
- package/skills/calendar/scripts/calendar_list_review.py +157 -0
- package/skills/calendar/scripts/event_dedupe_preview.py +155 -0
- package/skills/canvas/SKILL.md +70 -0
- package/skills/canvas/agents/openai.yaml +4 -0
- package/skills/canvas/references/canvas-api.md +76 -0
- package/skills/course-exam-review-planner/SKILL.md +127 -0
- package/skills/cx/SKILL.md +25 -0
- package/skills/gh-fix-ci/LICENSE.txt +201 -0
- package/skills/gh-fix-ci/SKILL.md +81 -0
- package/skills/gh-fix-ci/agents/openai.yaml +6 -0
- package/skills/gh-fix-ci/assets/github-small.svg +3 -0
- package/skills/gh-fix-ci/assets/github.png +0 -0
- package/skills/gh-fix-ci/scripts/inspect_pr_checks.py +509 -0
- package/skills/gh-review-workflow/SKILL.md +61 -0
- package/skills/gh-review-workflow/agents/openai.yaml +4 -0
- package/skills/gh-review-workflow/references/workflow.md +48 -0
- package/skills/gh-review-workflow/scripts/fetch_review_state.py +222 -0
- package/skills/gh-review-workflow/scripts/resolve_review_threads.py +83 -0
- package/skills/github/SKILL.md +74 -0
- package/skills/github/agents/openai.yaml +6 -0
- package/skills/github/assets/github-small.svg +3 -0
- package/skills/github/assets/github.png +0 -0
- package/skills/gws-calendar/SKILL.md +126 -0
- package/skills/gws-calendar-agenda/SKILL.md +52 -0
- package/skills/gws-calendar-insert/SKILL.md +66 -0
- package/skills/gws-docs/SKILL.md +48 -0
- package/skills/gws-docs-write/SKILL.md +49 -0
- package/skills/gws-drive/SKILL.md +137 -0
- package/skills/gws-drive-upload/SKILL.md +52 -0
- package/skills/gws-gmail/SKILL.md +62 -0
- package/skills/gws-gmail-forward/SKILL.md +55 -0
- package/skills/gws-gmail-reply/SKILL.md +58 -0
- package/skills/gws-gmail-reply-all/SKILL.md +62 -0
- package/skills/gws-gmail-send/SKILL.md +57 -0
- package/skills/gws-gmail-triage/SKILL.md +50 -0
- package/skills/gws-gmail-watch/SKILL.md +58 -0
- package/skills/gws-shared/SKILL.md +27 -0
- package/skills/helium-browser-mcp/SKILL.md +137 -0
- package/skills/helium-browser-mcp/agents/openai.yaml +4 -0
- package/skills/helium-browser-mcp/scripts/obmcp.mjs +92 -0
- package/skills/helium-browser-mcp/scripts/openbrowsermcp-stdio-proxy.mjs +170 -0
- package/skills/learn/SKILL.md +122 -0
- package/skills/learn/agents/openai.yaml +7 -0
- package/skills/learn/assets/AGENTS.template.md +33 -0
- package/skills/learn/assets/errorlog.template.typ +61 -0
- package/skills/learn/assets/reading-sequence.template.md +23 -0
- package/skills/learn/assets/source-index.template.md +17 -0
- package/skills/learn/assets/tasklog.template.typ +57 -0
- package/skills/learn/assets/workbook.template.typ +60 -0
- package/skills/learn/references/learning-science.md +103 -0
- package/skills/learn/scripts/init_learning_workspace.py +70 -0
- package/skills/macos-messages/SKILL.md +258 -0
- package/skills/memory/SKILL.md +33 -0
- package/skills/memory/codex.md +186 -0
- package/skills/memory/opencode.md +164 -0
- package/skills/mimestreamctl/SKILL.md +170 -0
- package/skills/mimestreamctl/agents/openai.yaml +4 -0
- package/skills/mimestreamctl/scripts/mimestreamctl +33 -0
- package/skills/mon/SKILL.md +51 -0
- package/skills/mon/scripts/mon_spend_review.py +458 -0
- package/skills/ocr/SKILL.md +136 -0
- package/skills/ocr/agents/openai.yaml +4 -0
- package/skills/ocr/references/local-ocr-best-practices.md +297 -0
- package/skills/ocr/references/mineru-api.md +159 -0
- package/skills/ocr/scripts/ocr-router +22 -0
- package/skills/ocr/scripts/ocr_router.py +741 -0
- package/skills/panopto-mp4-bulk-download/SKILL.md +57 -0
- package/skills/panopto-mp4-bulk-download/agents/openai.yaml +4 -0
- package/skills/panopto-mp4-bulk-download/references/url-patterns.md +26 -0
- package/skills/panopto-mp4-bulk-download/scripts/panopto_bulk_mp4.sh +213 -0
- package/skills/rust-systems-style/SKILL.md +109 -0
- package/skills/rust-systems-style/agents/openai.yaml +4 -0
- package/skills/rust-systems-style/references/rust-review-checklist.md +77 -0
- package/skills/rust-systems-style/references/style-sources.md +68 -0
- package/skills/ship-ai-native-cli/SKILL.md +76 -0
- package/skills/ship-ai-native-cli/agents/openai.yaml +4 -0
- package/skills/ship-ai-native-cli/references/case-notes.md +83 -0
- package/skills/ship-ai-native-cli/references/product-method.md +82 -0
- package/skills/ship-ai-native-cli/references/release-checklist.md +147 -0
- package/skills/ship-ai-native-cli/references/rust-cli-shape.md +111 -0
- package/skills/telegram-mtproto-session/SKILL.md +125 -0
- package/skills/telegram-mtproto-session/agents/openai.yaml +4 -0
- package/skills/telegram-mtproto-session/scripts/telegram_session.py +687 -0
- package/skills/tg/SKILL.md +173 -0
- package/skills/things3-manager/SKILL.md +116 -0
- package/skills/things3-manager/scripts/things +42 -0
- package/skills/things3-manager/scripts/things_cli.py +514 -0
- package/skills/web-artifacts-builder/LICENSE.txt +202 -0
- package/skills/web-artifacts-builder/SKILL.md +74 -0
- package/skills/web-artifacts-builder/scripts/bundle-artifact.sh +54 -0
- package/skills/web-artifacts-builder/scripts/init-artifact.sh +379 -0
- package/skills/web-artifacts-builder/scripts/shadcn-components.tar.gz +0 -0
- package/skills/yeet/LICENSE.txt +201 -0
- package/skills/yeet/SKILL.md +71 -0
- package/skills/yeet/agents/openai.yaml +6 -0
- package/skills/yeet/assets/yeet-small.svg +3 -0
- package/skills/yeet/assets/yeet.png +0 -0
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
# Local OCR Best Practices
|
|
2
|
+
|
|
3
|
+
## Local Tools
|
|
4
|
+
|
|
5
|
+
Known local tools:
|
|
6
|
+
|
|
7
|
+
- `/Users/yupeit/bin/ocr`: Swift CLI using macOS Apple Vision; supports image path, stdin image bytes, `capture`, and `fullscreen`.
|
|
8
|
+
- `/Users/yupeit/bin/ocr.py`: Gemini VLM OCR script; renders PDFs to images and returns Markdown page output.
|
|
9
|
+
- `/Users/yupeit/bin/ocr-doc`: document extraction router from this skill.
|
|
10
|
+
- `PyMuPDF/fitz`: preferred by `ocr-doc` for PDF profiling when available.
|
|
11
|
+
- `pdftotext`, `pdfinfo`, `pdfimages`: Poppler fallback tools for PDF text layer and metadata inspection.
|
|
12
|
+
- `uvx 'mineru[all]'`: local MinerU CLI.
|
|
13
|
+
- `tesseract`: traditional OCR fallback, usually lower priority than Apple Vision for screenshots on this Mac.
|
|
14
|
+
|
|
15
|
+
## `ocr-doc` Router Parameters
|
|
16
|
+
|
|
17
|
+
Common engine selection:
|
|
18
|
+
|
|
19
|
+
| Flag | Values | Default | Guidance |
|
|
20
|
+
|---|---|---:|---|
|
|
21
|
+
| `--engine` | `auto`, `native-text`, `apple-vision`, `gemini-vlm`, `mineru-local`, `mineru-api`, `mineru-agent` | `auto` | Use `auto` first. Pin an engine when testing quality or reproducing a workflow. |
|
|
22
|
+
| `--profile-only` | flag | off | Inspect file shape and recommended engine without extracting. Use this before touching unknown PDFs. |
|
|
23
|
+
| `--show-profile` | flag | off | Print profile and continue extraction. |
|
|
24
|
+
| `--no-cloud` | flag | off | Forbid all cloud upload paths. Use when confidentiality is unknown. |
|
|
25
|
+
| `--allow-cloud` | flag | off | Skip interactive upload confirmation. Use only after explicit user approval for that document. |
|
|
26
|
+
| `--out-dir` | path | derived from input and engine | Put outputs in a known directory for review/repro. |
|
|
27
|
+
|
|
28
|
+
Quality intent flags:
|
|
29
|
+
|
|
30
|
+
| Flag | Effect | Guidance |
|
|
31
|
+
|---|---|---|
|
|
32
|
+
| `--require-structure` | In `auto`, prefer MinerU over native text. | Use for Markdown with figures, layout artifacts, page markers, tables, or RAG ingestion. |
|
|
33
|
+
| `--need-formulas` | In `auto`, prefer MinerU. | Use for math, finance, papers, textbooks, and STEM slides. |
|
|
34
|
+
| `--need-tables` | In `auto`, prefer MinerU. | Use for reports, schedules, invoices, handbooks, papers, or any table-heavy PDF. |
|
|
35
|
+
|
|
36
|
+
MinerU shared flags:
|
|
37
|
+
|
|
38
|
+
| Flag | Maps to | Default | Guidance |
|
|
39
|
+
|---|---|---:|---|
|
|
40
|
+
| `--language` | MinerU `language` / local `-l` | `en` | Set explicitly. Use `en` for English docs and `ch` for Chinese docs. |
|
|
41
|
+
| `--enable-formula` / `--no-enable-formula` | `enable_formula` / local `-f` | true | Keep true for STEM/finance/math. Disable for prose if formula recognition creates noise. |
|
|
42
|
+
| `--enable-table` / `--no-enable-table` | `enable_table` / local `-t` | true | Keep true for table-bearing docs. Disable for prose if tables are irrelevant. |
|
|
43
|
+
| `--image-analysis` / `--no-image-analysis` | local `--image-analysis` | false | Turn on only when you want image/figure analysis from MinerU itself. Keep false if image assets are extracted by another pipeline. |
|
|
44
|
+
| `--timeout-seconds` | MinerU wait timeout env / router polling | 86400 | Keep long for local MinerU and large cloud jobs. |
|
|
45
|
+
|
|
46
|
+
Cloud MinerU flags:
|
|
47
|
+
|
|
48
|
+
| Flag | Maps to | Default | Guidance |
|
|
49
|
+
|---|---|---:|---|
|
|
50
|
+
| `--model-version` | `model_version` | `vlm` | Use `vlm` for official API quality, `pipeline` for speed/reproducibility, `MinerU-HTML` for HTML. |
|
|
51
|
+
| `--is-ocr` / `--no-is-ocr` | `is_ocr` | true | Use true for scans/mixed PDFs. Try false for born-digital PDFs with good text layers. |
|
|
52
|
+
| `--page-ranges` | `page_ranges` | omitted | Precision API only. Use 1-based comma-separated ranges, e.g. `1-5,9`. |
|
|
53
|
+
| `--poll-interval` | polling sleep | 5s | Increase for large jobs to reduce polling noise. |
|
|
54
|
+
|
|
55
|
+
Local MinerU CLI flags:
|
|
56
|
+
|
|
57
|
+
| Flag | Maps to MinerU CLI | Default | Guidance |
|
|
58
|
+
|---|---|---:|---|
|
|
59
|
+
| `--mineru-backend` | `-b/--backend` | `pipeline` | Use `pipeline` for full documents. Try `hybrid-auto-engine` or `vlm-auto-engine` only for selected pages where quality justifies time. |
|
|
60
|
+
| `--mineru-method` | `-m/--method` | `txt` | Use `txt` for PDFs with a good text layer. Use `ocr` for scans. Use `auto` when mixed and unsure. Applies to pipeline/hybrid backends. |
|
|
61
|
+
| `--start-page` | `-s` | omitted | 0-based local MinerU start page. Use to chunk or spot-check. |
|
|
62
|
+
| `--end-page` | `-e` | omitted | 0-based inclusive local MinerU end page. |
|
|
63
|
+
|
|
64
|
+
Important mismatch:
|
|
65
|
+
|
|
66
|
+
- Local MinerU page indexes are 0-based for `-s/-e`.
|
|
67
|
+
- Cloud Precision API `page_ranges` and Agent API `page_range` are 1-based.
|
|
68
|
+
- Precision API supports comma-separated ranges; Agent API supports a single page or `from-to`, not complex comma-separated ranges.
|
|
69
|
+
|
|
70
|
+
## PDF Shape Heuristics
|
|
71
|
+
|
|
72
|
+
Prefer native text extraction when:
|
|
73
|
+
|
|
74
|
+
- The PDF has a strong embedded text layer.
|
|
75
|
+
- The output can be plain Markdown/text.
|
|
76
|
+
- Tables do not need HTML structure.
|
|
77
|
+
- Formulas do not need semantic LaTeX.
|
|
78
|
+
|
|
79
|
+
Prefer MinerU when:
|
|
80
|
+
|
|
81
|
+
- You need table structure, formulas, images, page markers, layout JSON, or chunk-level artifacts.
|
|
82
|
+
- The PDF is scanned or mixed text/image.
|
|
83
|
+
- Text-layer extraction produces math/font garbage.
|
|
84
|
+
- You need a durable ingestion artifact for an agent/RAG pipeline.
|
|
85
|
+
|
|
86
|
+
Prefer VLM OCR when:
|
|
87
|
+
|
|
88
|
+
- Visual semantics matter more than deterministic layout.
|
|
89
|
+
- The page is a diagram, screenshot, form, slide, or mixed visual explanation.
|
|
90
|
+
- Local OCR sees text but misses intent.
|
|
91
|
+
|
|
92
|
+
Use cloud only when:
|
|
93
|
+
|
|
94
|
+
- The user explicitly says the document is non-confidential or otherwise permits upload.
|
|
95
|
+
- `--allow-cloud` is passed, and only after that permission has been given.
|
|
96
|
+
|
|
97
|
+
Apple Vision note:
|
|
98
|
+
|
|
99
|
+
- Apple Vision may fail inside the Codex sandbox even for local images.
|
|
100
|
+
- If the error mentions `Foundation._GenericObjCError` or Vision request failure, rerun through an approved local command outside the sandbox.
|
|
101
|
+
|
|
102
|
+
Map/diagram note:
|
|
103
|
+
|
|
104
|
+
- MinerU can classify map-like pages as images and emit only an image reference.
|
|
105
|
+
- For visible map labels, compare `native-text` and Apple Vision OCR.
|
|
106
|
+
|
|
107
|
+
## MinerU Findings From Prior Local Work
|
|
108
|
+
|
|
109
|
+
For a technical 880-page book with a real text layer:
|
|
110
|
+
|
|
111
|
+
- `pdftotext -layout` produced good fast text-only output.
|
|
112
|
+
- `pipeline + txt` was the best full-book MinerU base because it preserved text-layer quality while adding formula/table recognition.
|
|
113
|
+
- `--formula true` was essential; plain text extraction could not recover custom math fonts into semantic LaTeX.
|
|
114
|
+
- `--table true` was worthwhile; table pages became HTML-like structured output instead of whitespace-only alignment.
|
|
115
|
+
- `--image-analysis false` was better when images were extracted separately by a dedicated PyMuPDF pipeline.
|
|
116
|
+
- `hybrid-auto-engine + txt` improved some formula formatting but was materially slower.
|
|
117
|
+
- `vlm-auto-engine` gave a useful second opinion on formula-heavy pages but was too expensive for full-book local runs.
|
|
118
|
+
- The best final artifact used `pipeline + txt` for the full book, then overlaid VLM pages for selected formula-heavy pages.
|
|
119
|
+
|
|
120
|
+
## Long Textbook Playbook
|
|
121
|
+
|
|
122
|
+
This playbook captures the John Hull textbook extraction lessons. Use it for long born-digital textbooks, lecture-note books, technical manuals, and similar PDFs.
|
|
123
|
+
|
|
124
|
+
### 1. First classify the PDF
|
|
125
|
+
|
|
126
|
+
Run:
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
ocr-doc file.pdf --profile-only --no-cloud
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
If the PDF has a strong text layer:
|
|
133
|
+
|
|
134
|
+
- Do not full-OCR it.
|
|
135
|
+
- Do not use `native-text` as the final artifact if formulas/tables matter.
|
|
136
|
+
- Use text-layer aware MinerU extraction so body text stays clean while MinerU adds formulas/tables/layout artifacts.
|
|
137
|
+
|
|
138
|
+
If the PDF is scanned or the text layer is bad:
|
|
139
|
+
|
|
140
|
+
- Use `--mineru-method ocr` locally, or cloud `--is-ocr`.
|
|
141
|
+
- Expect slower runtime and more spot-checking.
|
|
142
|
+
|
|
143
|
+
### 2. Full-book base extraction
|
|
144
|
+
|
|
145
|
+
For a long born-digital technical book, the best tested local base settings were:
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
python3 /Users/yupeit/dev/learn/quant/scripts/run_mineru_chunks.py \
|
|
149
|
+
--pdf file.pdf \
|
|
150
|
+
--output-dir out_chunks \
|
|
151
|
+
--page-count PAGE_COUNT \
|
|
152
|
+
--chunk-size 64 \
|
|
153
|
+
--backend pipeline \
|
|
154
|
+
--method txt \
|
|
155
|
+
--lang en \
|
|
156
|
+
--formula \
|
|
157
|
+
--table \
|
|
158
|
+
--no-image-analysis \
|
|
159
|
+
--timeout-seconds 86400
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
Parameter rationale:
|
|
163
|
+
|
|
164
|
+
| Parameter | Setting | Why |
|
|
165
|
+
|---|---|---|
|
|
166
|
+
| `--backend` | `pipeline` | Fast enough for a full book and produced readable formulas. |
|
|
167
|
+
| `--method` | `txt` | Preserves real PDF text layer and avoids unnecessary OCR artifacts. |
|
|
168
|
+
| `--lang` | `en` | The tested textbook was English; set explicitly for OCR components. |
|
|
169
|
+
| `--formula` | true | Essential for semantic LaTeX; plain text extraction failed on custom math fonts. |
|
|
170
|
+
| `--table` | true | Produces structured table output instead of whitespace-only alignment. |
|
|
171
|
+
| `--image-analysis` | false | Better when figures/images are extracted separately and referenced by manifest. |
|
|
172
|
+
| `--chunk-size` | `64` | Completed reliably for an 880-page book in 14 chunks. |
|
|
173
|
+
| `--timeout-seconds` | `86400` | Avoids client timeout killing the temporary local MinerU service. |
|
|
174
|
+
|
|
175
|
+
Do not rely on one giant all-or-nothing MinerU run for a long textbook; chunk it so that a failure or timeout costs one chunk rather than the whole book.
|
|
176
|
+
|
|
177
|
+
### 3. Merge and preserve auditability
|
|
178
|
+
|
|
179
|
+
After chunks finish, merge `*_content_list.json` instead of only concatenating Markdown:
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
python3 /Users/yupeit/dev/learn/quant/scripts/merge_mineru_chunks.py \
|
|
183
|
+
--chunks-dir out_chunks \
|
|
184
|
+
--out merged.md \
|
|
185
|
+
--equations-out equations.json \
|
|
186
|
+
--title "Book title - MinerU pipeline extraction"
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
Keep:
|
|
190
|
+
|
|
191
|
+
- Page markers.
|
|
192
|
+
- `mineru.log` per chunk.
|
|
193
|
+
- `chunk_status.json` per chunk.
|
|
194
|
+
- Equation JSON.
|
|
195
|
+
- Image/formula references when possible.
|
|
196
|
+
|
|
197
|
+
These artifacts make later quality review and selective reprocessing possible.
|
|
198
|
+
|
|
199
|
+
### 4. Formula-heavy page enhancement
|
|
200
|
+
|
|
201
|
+
For formula-heavy pages, do not rerun the whole book with VLM. Select pages using the equation manifest or manual review:
|
|
202
|
+
|
|
203
|
+
```bash
|
|
204
|
+
python3 /Users/yupeit/dev/learn/quant/scripts/run_mineru_selected_pages.py \
|
|
205
|
+
--pdf file.pdf \
|
|
206
|
+
--output-dir vlm_pages \
|
|
207
|
+
--equations-json equations.json \
|
|
208
|
+
--min-equations 8 \
|
|
209
|
+
--backend vlm-auto-engine \
|
|
210
|
+
--lang en \
|
|
211
|
+
--formula \
|
|
212
|
+
--table \
|
|
213
|
+
--no-image-analysis \
|
|
214
|
+
--timeout-seconds 86400
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
Then overlay selected page blocks:
|
|
218
|
+
|
|
219
|
+
```bash
|
|
220
|
+
python3 /Users/yupeit/dev/learn/quant/scripts/overlay_markdown_pages.py \
|
|
221
|
+
--base merged.md \
|
|
222
|
+
--overlay vlm_pages_merged.md \
|
|
223
|
+
--out merged_vlm_formula_enhanced.md \
|
|
224
|
+
--preserve-formula-image-markers
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
Prior measured result: the expanded VLM enhancement covered 139 pages containing about 55.9% of the original equation blocks; it took about 119.5 minutes across 90 chunks. This is useful as a targeted quality pass, not as the default full-book mode.
|
|
228
|
+
|
|
229
|
+
### 5. Backend tradeoffs for textbooks
|
|
230
|
+
|
|
231
|
+
| Backend/method | Use for | Avoid when |
|
|
232
|
+
|---|---|---|
|
|
233
|
+
| `pipeline + txt` | Full born-digital textbook base extraction. | Scanned books with no usable text layer. |
|
|
234
|
+
| `pipeline + ocr` | Scanned books or bad text layer. | Born-digital PDFs where OCR would degrade body text. |
|
|
235
|
+
| `hybrid-auto-engine + txt` | Selected pages where cleaner formulas justify slower runtime. | Whole-book extraction unless you have time budget. |
|
|
236
|
+
| `vlm-auto-engine` | Selected formula-heavy or semantically tricky pages. | Full long books by default; slower and still not perfect on complex variables/subscripts. |
|
|
237
|
+
| `native-text` | Fast preview, search, plain text, and baseline comparison. | Final artifact when formulas/tables/figures matter. |
|
|
238
|
+
|
|
239
|
+
### 6. Cloud equivalent for non-confidential textbooks
|
|
240
|
+
|
|
241
|
+
For official MinerU API on a non-confidential born-digital textbook, align cloud flags with the local finding:
|
|
242
|
+
|
|
243
|
+
```bash
|
|
244
|
+
ocr-doc file.pdf \
|
|
245
|
+
--engine mineru-api \
|
|
246
|
+
--allow-cloud \
|
|
247
|
+
--model-version pipeline \
|
|
248
|
+
--no-is-ocr \
|
|
249
|
+
--enable-formula \
|
|
250
|
+
--enable-table \
|
|
251
|
+
--language en
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
Use `--page-ranges` to test representative slices before sending a whole document:
|
|
255
|
+
|
|
256
|
+
```bash
|
|
257
|
+
ocr-doc file.pdf --engine mineru-api --allow-cloud --page-ranges 31,203,337 --model-version pipeline --no-is-ocr
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
Then compare with `--model-version vlm` on the same pages if formulas or layout are weak. For a full textbook, do not assume `vlm + OCR` is superior just because `vlm` is the recommended generic cloud model.
|
|
261
|
+
|
|
262
|
+
### 7. Known limitations
|
|
263
|
+
|
|
264
|
+
- MinerU can misread complex formula subscripts, superscripts, or variables.
|
|
265
|
+
- Map-like/diagram-heavy pages may become image references without useful visible-label text.
|
|
266
|
+
- Textbook indexes, tables of contents, and dense tables need manual spot checks.
|
|
267
|
+
- Keep the original PDF page numbers visible in the output so questionable content can be traced back.
|
|
268
|
+
|
|
269
|
+
Reliability lessons:
|
|
270
|
+
|
|
271
|
+
- Local MinerU may start a temporary FastAPI service and write final artifacts only after a task finishes.
|
|
272
|
+
- Lack of output in the target directory does not always mean no progress; check temp output, logs, CPU, and local service health.
|
|
273
|
+
- Long runs should be chunked. A 64-page chunk size worked well for the 880-page book.
|
|
274
|
+
- Set long timeouts, especially `MINERU_TASK_RESULT_TIMEOUT_SECONDS=86400`.
|
|
275
|
+
- Keep each chunk directory, `mineru.log`, and `chunk_status.json`.
|
|
276
|
+
|
|
277
|
+
Relevant prior scripts in `/Users/yupeit/dev/learn/quant/scripts/`:
|
|
278
|
+
|
|
279
|
+
- `run_mineru_chunks.py`: resumable local MinerU chunk runner.
|
|
280
|
+
- `run_mineru_selected_pages.py`: rerun selected pages, useful for formula-heavy VLM overlays.
|
|
281
|
+
- `merge_mineru_chunks.py`: merge MinerU `*_content_list.json` chunks into page-marked Markdown.
|
|
282
|
+
- `overlay_markdown_pages.py`: replace selected page blocks with VLM-enhanced output.
|
|
283
|
+
- `export_pdf_text_images.py`: extract PDF text/images/captions for separate image-reference workflows.
|
|
284
|
+
|
|
285
|
+
## Quality Checks
|
|
286
|
+
|
|
287
|
+
Always spot-check against the source PDF:
|
|
288
|
+
|
|
289
|
+
- Front matter and table of contents.
|
|
290
|
+
- A text-heavy page.
|
|
291
|
+
- A table page.
|
|
292
|
+
- A formula-heavy page.
|
|
293
|
+
- A figure/caption page.
|
|
294
|
+
- A scanned or low-contrast page if present.
|
|
295
|
+
- Final pages or index.
|
|
296
|
+
|
|
297
|
+
For formulas, keep page markers and image references when possible so questionable variables/subscripts can be checked against the original PDF.
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
# MinerU API Reference
|
|
2
|
+
|
|
3
|
+
Source docs:
|
|
4
|
+
|
|
5
|
+
- English docs: https://mineru.net/doc/docs/index_en/
|
|
6
|
+
- Chinese docs: https://mineru.org.cn/doc/docs/
|
|
7
|
+
- Limits: https://mineru.org.cn/doc/docs/limit/
|
|
8
|
+
|
|
9
|
+
As of 2026-05-17, MinerU exposes two cloud API families.
|
|
10
|
+
|
|
11
|
+
## Precision Extract API
|
|
12
|
+
|
|
13
|
+
Use for non-confidential production parsing where structured outputs, formulas, tables, and large files matter.
|
|
14
|
+
|
|
15
|
+
Limits from the official docs:
|
|
16
|
+
|
|
17
|
+
- Token required.
|
|
18
|
+
- File size: up to 200MB.
|
|
19
|
+
- Page count: up to 600 pages.
|
|
20
|
+
- Batch: up to 200 files.
|
|
21
|
+
- Models: `pipeline`, `vlm`, `MinerU-HTML`.
|
|
22
|
+
- Output: ZIP with Markdown and JSON artifacts; export can include docx/html/latex.
|
|
23
|
+
|
|
24
|
+
Key endpoints:
|
|
25
|
+
|
|
26
|
+
```text
|
|
27
|
+
POST https://mineru.net/api/v4/extract/task
|
|
28
|
+
GET https://mineru.net/api/v4/extract/task/{task_id}
|
|
29
|
+
POST https://mineru.net/api/v4/file-urls/batch
|
|
30
|
+
POST https://mineru.net/api/v4/extract/task/batch
|
|
31
|
+
GET https://mineru.net/api/v4/extract-results/batch/{batch_id}
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
For local files, prefer signed upload:
|
|
35
|
+
|
|
36
|
+
1. `POST /api/v4/file-urls/batch` with Bearer token.
|
|
37
|
+
2. `PUT` file bytes to the returned `upload_url`.
|
|
38
|
+
3. Poll `GET /api/v4/extract-results/batch/{batch_id}`.
|
|
39
|
+
4. Download and unzip the returned result ZIP.
|
|
40
|
+
|
|
41
|
+
Common request fields:
|
|
42
|
+
|
|
43
|
+
```json
|
|
44
|
+
{
|
|
45
|
+
"enable_formula": true,
|
|
46
|
+
"enable_table": true,
|
|
47
|
+
"language": "en",
|
|
48
|
+
"model_version": "vlm",
|
|
49
|
+
"files": [
|
|
50
|
+
{
|
|
51
|
+
"name": "document.pdf",
|
|
52
|
+
"data_id": "stable-id",
|
|
53
|
+
"is_ocr": true,
|
|
54
|
+
"page_ranges": "1-10,15"
|
|
55
|
+
}
|
|
56
|
+
]
|
|
57
|
+
}
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Notes:
|
|
61
|
+
|
|
62
|
+
- `enable_formula` and `enable_table` apply to `pipeline` and `vlm`; for `vlm`, formula behavior especially affects inline formula extraction.
|
|
63
|
+
- `language` defaults to `ch` in docs; set `en` for English documents.
|
|
64
|
+
- `page_ranges` is a comma-separated 1-based range string.
|
|
65
|
+
- Use `MinerU-HTML` for HTML files.
|
|
66
|
+
|
|
67
|
+
## Precision API Parameter Guide
|
|
68
|
+
|
|
69
|
+
Use this table when calling `ocr-doc --engine mineru-api` or editing `ocr_router.py`.
|
|
70
|
+
|
|
71
|
+
| Parameter | Router flag | Default | Scope | Guidance |
|
|
72
|
+
|---|---|---:|---|---|
|
|
73
|
+
| `model_version` | `--model-version` | `vlm` in router, `pipeline` in official docs | Precision API | Use `vlm` for best cloud quality on complex PDFs. Use `pipeline` when speed/cost is more important or when reproducing local pipeline behavior. Use `MinerU-HTML` only for HTML files. |
|
|
74
|
+
| `is_ocr` | `--is-ocr` / `--no-is-ocr` | `true` in router, `false` in official docs | PDF with `pipeline`/`vlm` | `true` forces OCR behavior and is safer for scans/mixed PDFs. For born-digital PDFs with good text layer, `false` may preserve text-layer fidelity and reduce OCR artifacts. |
|
|
75
|
+
| `enable_formula` | `--enable-formula` / `--no-enable-formula` | `true` | `pipeline`/`vlm` | Keep `true` for textbooks, papers, STEM slides, finance/math docs. Disable only for speed or if formulas are irrelevant. For `vlm`, official docs note it mainly affects inline formula extraction. |
|
|
76
|
+
| `enable_table` | `--enable-table` / `--no-enable-table` | `true` | `pipeline`/`vlm` | Keep `true` for papers, invoices, reports, syllabi, tables of contents, and spreadsheets exported as PDF. Disable for pure prose if table recognition creates noise. |
|
|
77
|
+
| `language` | `--language` | `en` in router, `ch` in official docs | OCR recognition | Set explicitly. Use `en` for English, `ch` for Chinese, and supported language codes from MinerU docs for other languages. Wrong language mostly hurts OCR pages. |
|
|
78
|
+
| `page_ranges` | `--page-ranges` | omitted | Precision API local upload | Use to reduce cost/time or parse selected pages. Precision API supports comma-separated 1-based ranges such as `1-10,15`. |
|
|
79
|
+
| `files[].name` | derived from path | input filename | local upload | Must include a useful extension because MinerU uses it to infer file type. |
|
|
80
|
+
| `files[].data_id` | generated UUID | UUID | local upload | Stable caller-side ID for matching results in batch polling. Use deterministic IDs if integrating with a larger pipeline. |
|
|
81
|
+
| `url` / `files[].url` | not exposed by current router for local files | n/a | URL parse | Use URL parse only for already-public/non-confidential remote files. Local files should use signed upload. |
|
|
82
|
+
|
|
83
|
+
Parameter selection:
|
|
84
|
+
|
|
85
|
+
- Born-digital PDF, plain text needed: do not use cloud MinerU; use `native-text`.
|
|
86
|
+
- Born-digital textbook or long technical PDF with formulas/tables: start with `model_version=pipeline`, `is_ocr=false`, `enable_formula=true`, `enable_table=true`; test `vlm` only on representative difficult pages before scaling up.
|
|
87
|
+
- Born-digital short/complex PDF where generic cloud quality matters more than reproducing text-layer behavior: `model_version=vlm`, `is_ocr=false` is a reasonable first test.
|
|
88
|
+
- Scanned/mixed PDF: `model_version=vlm`, `is_ocr=true`, `enable_formula/table` as needed.
|
|
89
|
+
- HTML: `model_version=MinerU-HTML`; formula/table/OCR flags are not the primary controls.
|
|
90
|
+
- Selected pages: pass `--page-ranges` for cloud, or `--start-page/--end-page` for local MinerU.
|
|
91
|
+
|
|
92
|
+
## Agent Lightweight API
|
|
93
|
+
|
|
94
|
+
Use for quick, non-confidential, agent-style parsing.
|
|
95
|
+
|
|
96
|
+
Limits from the official docs:
|
|
97
|
+
|
|
98
|
+
- No token required.
|
|
99
|
+
- IP rate-limited.
|
|
100
|
+
- File size: up to 10MB.
|
|
101
|
+
- Page count: up to 20 pages.
|
|
102
|
+
- Single file only.
|
|
103
|
+
- Output: Markdown CDN link only.
|
|
104
|
+
|
|
105
|
+
Endpoints:
|
|
106
|
+
|
|
107
|
+
```text
|
|
108
|
+
POST https://mineru.net/api/v1/agent/parse/url
|
|
109
|
+
POST https://mineru.net/api/v1/agent/parse/file
|
|
110
|
+
GET https://mineru.net/api/v1/agent/parse/{task_id}
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
Local file flow:
|
|
114
|
+
|
|
115
|
+
1. `POST /api/v1/agent/parse/file` with `file_name`, `language`, `page_range`, `enable_table`, `is_ocr`, `enable_formula`.
|
|
116
|
+
2. Upload the file bytes with `PUT` to the returned `data.file_url`.
|
|
117
|
+
3. Poll `GET /api/v1/agent/parse/{task_id}`.
|
|
118
|
+
4. Download the returned Markdown URL once `data.state` is `done`.
|
|
119
|
+
|
|
120
|
+
## Agent API Parameter Guide
|
|
121
|
+
|
|
122
|
+
| Parameter | Router support | Default | Scope | Guidance |
|
|
123
|
+
|---|---|---:|---|---|
|
|
124
|
+
| `language` | current router uses API default unless extended | `ch` in docs | PDF only | Same language guidance as Precision API. |
|
|
125
|
+
| `page_range` | not yet exposed by router | omitted | PDF only | Agent API supports either `from-to` such as `1-10` or a single page such as `5`; comma-separated complex ranges are not supported. |
|
|
126
|
+
| `enable_table` | current router uses API default unless extended | `true` | PDF only | Leave true unless table detection creates noise. |
|
|
127
|
+
| `is_ocr` | current router uses API default unless extended | `false` | PDF only | Set true for scans if extending the router. |
|
|
128
|
+
| `enable_formula` | current router uses API default unless extended | `true` | PDF only | Leave true for STEM papers; disable for prose-only documents if speed/noise matters. |
|
|
129
|
+
| `file_name` | derived from path | required for file upload | file upload | Include extension. |
|
|
130
|
+
|
|
131
|
+
The Agent API is intentionally narrow: single file, small size/page limits, no token, and Markdown-only output. Use it for quick agent ingestion, not durable production extraction.
|
|
132
|
+
|
|
133
|
+
Observed task states include:
|
|
134
|
+
|
|
135
|
+
```text
|
|
136
|
+
waiting-file
|
|
137
|
+
uploading
|
|
138
|
+
pending
|
|
139
|
+
running
|
|
140
|
+
done
|
|
141
|
+
failed
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## Credential Rules
|
|
145
|
+
|
|
146
|
+
Prefer Keychain for local-only credentials:
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
keychain-secret get codex.mineru credential
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
The router also accepts:
|
|
153
|
+
|
|
154
|
+
```bash
|
|
155
|
+
MINERU_API_TOKEN=...
|
|
156
|
+
MINERU_TOKEN=...
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
Do not write MinerU tokens to source files, `.env` files, issue text, logs, or final responses.
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
#!/usr/bin/env bash
#
# Launcher for ocr_router.py.
#
# Locates the directory that really contains this script (following any
# chain of symlinks it was invoked through), then replaces the current
# process with a Python interpreter running the ocr_router.py that sits in
# that same directory. All command-line arguments are forwarded unchanged.
#
# Interpreter choice:
#   1. <script dir>/../.venv/bin/python, when that file is executable
#   2. otherwise, whatever `python3` resolves to on PATH
set -euo pipefail

# Walk the symlink chain one hop at a time until we reach a non-link path.
resolved="${BASH_SOURCE[0]}"
while [[ -L "${resolved}" ]]; do
  # Physical directory holding the current link; a relative link target
  # is interpreted relative to this directory.
  link_dir="$(cd -P "$(dirname "${resolved}")" && pwd)"
  link_target="$(readlink "${resolved}")"
  case "${link_target}" in
    /*)
      # Absolute target: use it as-is.
      resolved="${link_target}"
      ;;
    *)
      # Relative target: anchor it at the link's own directory.
      resolved="${link_dir}/${link_target}"
      ;;
  esac
done

script_dir="$(cd -P "$(dirname "${resolved}")" && pwd)"
venv_python="${script_dir}/../.venv/bin/python"

# Prefer the sibling virtualenv interpreter when it exists and is executable.
interpreter="python3"
if [[ -x "${venv_python}" ]]; then
  interpreter="${venv_python}"
fi

exec "${interpreter}" "${script_dir}/ocr_router.py" "$@"
|