mimo2codex 0.1.15 → 0.1.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +24 -5
- package/README.md +46 -5
- package/README.zh.md +46 -5
- package/dist/admin/router.js +117 -2
- package/dist/admin/router.js.map +1 -1
- package/dist/cli.js +67 -147
- package/dist/cli.js.map +1 -1
- package/dist/config.js +16 -10
- package/dist/config.js.map +1 -1
- package/dist/db/logs.js +80 -0
- package/dist/db/logs.js.map +1 -1
- package/dist/providers/generic.js +96 -0
- package/dist/providers/generic.js.map +1 -0
- package/dist/providers/genericLoader.js +229 -0
- package/dist/providers/genericLoader.js.map +1 -0
- package/dist/providers/registry.js +48 -10
- package/dist/providers/registry.js.map +1 -1
- package/dist/server.js +201 -1
- package/dist/server.js.map +1 -1
- package/dist/setup/snippets.js +187 -0
- package/dist/setup/snippets.js.map +1 -0
- package/dist/translate/reqToChat.js +1 -1
- package/dist/translate/reqToChat.js.map +1 -1
- package/dist/upstream/openaiCompatClient.js +32 -11
- package/dist/upstream/openaiCompatClient.js.map +1 -1
- package/dist/web/assets/index-D19ffnSJ.css +1 -0
- package/dist/web/assets/index-DPLJprJ4.js +67 -0
- package/dist/web/index.html +2 -2
- package/doc/generic-providers.md +399 -0
- package/doc/generic-providers.zh.md +399 -0
- package/mimoskill/SKILL.md +69 -8
- package/mimoskill/references/ocr_workflow.md +216 -0
- package/mimoskill/scripts/generate_image.py +163 -0
- package/mimoskill/scripts/ocr.py +396 -0
- package/package.json +5 -4
- package/dist/web/assets/index-BoykBCnY.js +0 -67
- package/dist/web/assets/index-DAJbSznk.css +0 -1
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
# OCR / image recognition workflow
|
|
2
|
+
|
|
3
|
+
`mimoskill/scripts/ocr.py` is the fallback path for reading or describing
|
|
4
|
+
images when the surrounding chat model can't see them. It always calls
|
|
5
|
+
`mimo-v2.5` (MiMo's vision-capable model) internally, regardless of which
|
|
6
|
+
model the rest of the conversation is using.
|
|
7
|
+
|
|
8
|
+
## TL;DR
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
export MIMO_API_KEY=sk-xxxxxxxxxxxxxxxx
|
|
12
|
+
|
|
13
|
+
# default mode (text) — verbatim OCR
|
|
14
|
+
python3 mimoskill/scripts/ocr.py path/to/image.png
|
|
15
|
+
|
|
16
|
+
# describe the image in 2-4 sentences
|
|
17
|
+
python3 mimoskill/scripts/ocr.py --mode describe path/to/image.png
|
|
18
|
+
|
|
19
|
+
# structured JSON (text + regions + language + summary)
|
|
20
|
+
python3 mimoskill/scripts/ocr.py --mode structured a.png b.jpg
|
|
21
|
+
|
|
22
|
+
# re-render as GitHub-flavored Markdown
|
|
23
|
+
python3 mimoskill/scripts/ocr.py --mode markdown form.png
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Why this skill exists
|
|
27
|
+
|
|
28
|
+
The proxy strips image attachments when the active chat model can't accept
|
|
29
|
+
them (`src/translate/reqToChat.ts:48-72`). Non-vision MiMo variants —
|
|
30
|
+
`mimo-v2.5-pro`, `mimo-v2.5-pro[1m]`, `mimo-v2-flash` — return 404
|
|
31
|
+
"No endpoints found that support image input" if images are forwarded.
|
|
32
|
+
The proxy drops the images and leaves an `[N image attachment(s) omitted: …]`
|
|
33
|
+
placeholder so the conversation doesn't crash.
|
|
34
|
+
|
|
35
|
+
`ocr.py` is the recommended way to recover that content **without changing
|
|
36
|
+
the chat model**: it independently calls `mimo-v2.5`, returns text, and the
|
|
37
|
+
caller pipes that text back into the conversation as a normal user message.
|
|
38
|
+
|
|
39
|
+
## Input modes
|
|
40
|
+
|
|
41
|
+
The positional `IMAGE` args (0 or more) accept:
|
|
42
|
+
|
|
43
|
+
| Form | Example | What ocr.py does |
|
|
44
|
+
|---|---|---|
|
|
45
|
+
| Local path | `./scan.png`, `C:\foo.jpg` | reads bytes, magic-byte sniffs MIME, base64-encodes to a `data:` URL |
|
|
46
|
+
| `http(s)://` URL | `https://example.com/x.png` | forwarded as-is; MiMo fetches server-side |
|
|
47
|
+
| `data:` URL | `data:image/png;base64,…` | forwarded as-is |
|
|
48
|
+
| `-` (single dash) | piped from stdin | reads one image's bytes from stdin |
|
|
49
|
+
| nothing + non-TTY stdin | `cat x.png \| ocr.py` | same as `-` |
|
|
50
|
+
|
|
51
|
+
Magic-byte table (file extension is **not** trusted):
|
|
52
|
+
|
|
53
|
+
| Bytes | MIME |
|
|
54
|
+
|---|---|
|
|
55
|
+
| `89 50 4E 47 0D 0A 1A 0A` | `image/png` |
|
|
56
|
+
| `FF D8 FF` | `image/jpeg` |
|
|
57
|
+
| `47 49 46 38 37 61` / `…39 61` | `image/gif` |
|
|
58
|
+
| `52 49 46 46 …. 57 45 42 50` | `image/webp` |
|
|
59
|
+
| `42 4D` | `image/bmp` |
|
|
60
|
+
| (anything else) | falls back to `image/png` |
|
|
61
|
+
|
|
62
|
+
## Output modes (`--mode`)
|
|
63
|
+
|
|
64
|
+
### `text` (default) — verbatim OCR
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
python3 mimoskill/scripts/ocr.py invoice.png
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Stdout is the raw extracted text. Line breaks, reading order, and rough
|
|
71
|
+
column/table layout (whitespace + pipes) are preserved. No commentary, no
|
|
72
|
+
translation, no summary. Unreadable spans become `[unreadable]`. An image with
no text returns the single line `[no text detected]`.
|
|
74
|
+
|
|
75
|
+
### `describe` — short prose description
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
python3 mimoskill/scripts/ocr.py --mode describe screenshot.png
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
2-4 sentences covering layout, key elements, visible text (quoted), and
|
|
82
|
+
notable colors. No invented details.
|
|
83
|
+
|
|
84
|
+
### `structured` — JSON
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
python3 mimoskill/scripts/ocr.py --mode structured form.png
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
Stdout is a single JSON object:
|
|
91
|
+
|
|
92
|
+
```json
|
|
93
|
+
{
|
|
94
|
+
"text": "...",
|
|
95
|
+
"language": "zh-Hans",
|
|
96
|
+
"regions": [
|
|
97
|
+
{"label": "title", "text": "增值税电子发票", "role": "title"},
|
|
98
|
+
{"label": "buyer", "text": "...", "role": "paragraph"},
|
|
99
|
+
{"label": "items", "text": "...", "role": "table"}
|
|
100
|
+
],
|
|
101
|
+
"summary": "A Chinese VAT e-invoice with buyer/seller and four line items."
|
|
102
|
+
}
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
`regions[].role` is one of `title`, `paragraph`, `list`, `table`, `caption`,
|
|
106
|
+
`ui`, `handwriting`, `other`.
|
|
107
|
+
|
|
108
|
+
**Note**: `structured` returns **logical regions** (role classification),
|
|
109
|
+
not pixel bounding boxes. MiMo does not currently expose grounded pixel
|
|
110
|
+
coordinates the way some other vision models do; this skill won't pretend
|
|
111
|
+
to. If you need pixel boxes, use a model that does (e.g. Gemini grounding,
|
|
112
|
+
Tesseract with `--psm 6` + position data).
|
|
113
|
+
|
|
114
|
+
### `markdown` — re-render as GFM
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
python3 mimoskill/scripts/ocr.py --mode markdown spec.png
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Headings become `#`/`##`, tables become pipe tables, code-like text becomes
|
|
121
|
+
fenced blocks, lists become `-`. Reading order preserved. Output is the
|
|
122
|
+
Markdown body only — no preamble, no outer fence.
|
|
123
|
+
|
|
124
|
+
## Batch (multi-image) calls
|
|
125
|
+
|
|
126
|
+
Pass multiple positional args:
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
python3 mimoskill/scripts/ocr.py page1.png page2.png page3.png
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
All images go to MiMo in a **single** chat completion (one billable call).
|
|
133
|
+
The model can cross-reference (e.g. ID front + back). Output is a single
|
|
134
|
+
text body in reading order across the images.
|
|
135
|
+
|
|
136
|
+
When you need a different prompt per image, run `ocr.py` N times instead.
|
|
137
|
+
|
|
138
|
+
## `--lang` and `--prompt`
|
|
139
|
+
|
|
140
|
+
- `--lang LANG` appends `Primary language: <LANG>.` to the prompt. Useful
|
|
141
|
+
for CJK to prevent the model from outputting Pinyin transliteration:
|
|
142
|
+
`ocr.py --lang Chinese scan.png` or `--lang zh` or `--lang 日本語`.
|
|
143
|
+
|
|
144
|
+
- `--prompt EXTRA` appends a free-text instruction:
|
|
145
|
+
`ocr.py --mode text --prompt "Only handwriting, ignore printed text." form.png`
|
|
146
|
+
|
|
147
|
+
## Model selection
|
|
148
|
+
|
|
149
|
+
| You pass | ocr.py uses |
|
|
150
|
+
|---|---|
|
|
151
|
+
| nothing | `$MIMO_OCR_MODEL` → `$MIMO_MODEL` (if vision-capable) → `mimo-v2.5` |
|
|
152
|
+
| `--model mimo-v2.5` | `mimo-v2.5` |
|
|
153
|
+
| `--model mimo-v2.5[1m]` | `mimo-v2.5[1m]` |
|
|
154
|
+
| `--model mimo-v2-omni` | `mimo-v2-omni` |
|
|
155
|
+
| `--model mimo-v2.5-pro` | **switches to `mimo-v2.5`** (stderr note) |
|
|
156
|
+
| `--model mimo-v2.5-pro[1m]` | **switches to `mimo-v2.5`** |
|
|
157
|
+
| `--model mimo-v2-flash` | **switches to `mimo-v2.5`** |
|
|
158
|
+
|
|
159
|
+
Non-vision models would return 404 from MiMo, so the script coerces them
|
|
160
|
+
silently (one stderr line) rather than failing.
|
|
161
|
+
|
|
162
|
+
## When `MIMO_API_KEY` isn't set
|
|
163
|
+
|
|
164
|
+
`ocr.py` exits with code `3` and this stderr message:
|
|
165
|
+
|
|
166
|
+
```
|
|
167
|
+
error: MIMO_API_KEY is not set; ocr.py needs MiMo V2.5 vision to read images.
|
|
168
|
+
set one at https://platform.xiaomimimo.com/#/console/api-keys
|
|
169
|
+
OR if you want fully-local OCR with no API key, install tesseract:
|
|
170
|
+
macOS: brew install tesseract tesseract-lang
|
|
171
|
+
Ubuntu: sudo apt install tesseract-ocr tesseract-ocr-chi-sim
|
|
172
|
+
Windows: https://github.com/UB-Mannheim/tesseract/wiki
|
|
173
|
+
then run: tesseract <image> - -l eng+chi_sim
|
|
174
|
+
(tesseract is NOT installed or invoked by this skill; this is just a pointer.)
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
The tesseract pointer is **just a pointer** — this skill never shells out to
tesseract automatically. This keeps the dependency surface predictable.
|
|
179
|
+
|
|
180
|
+
## Common pitfalls
|
|
181
|
+
|
|
182
|
+
- **PDFs are not supported directly.** Rasterize first with one of:
|
|
183
|
+
- `pdftoppm -png input.pdf out` (Poppler)
|
|
184
|
+
- `mutool draw -o out-%d.png input.pdf` (MuPDF)
|
|
185
|
+
- macOS: `sips -s format png input.pdf --out out.png`
|
|
186
|
+
- **Multi-image batches share one prompt.** If you need different modes /
|
|
187
|
+
languages per image, invoke `ocr.py` once per image.
|
|
188
|
+
- **`structured` mode is logical regions, not pixel boxes.** See above.
|
|
189
|
+
- **`--stream` + `structured`**: the streamed body is still a single JSON
|
|
190
|
+
object; buffer it before parsing.
|
|
191
|
+
|
|
192
|
+
## Exit codes
|
|
193
|
+
|
|
194
|
+
| Code | Meaning |
|
|
195
|
+
|---|---|
|
|
196
|
+
| 0 | Success |
|
|
197
|
+
| 1 | MiMo HTTP error (error body printed to stderr) |
|
|
198
|
+
| 2 | argv / usage error (no image, mutually exclusive flags, etc.) |
|
|
199
|
+
| 3 | `MIMO_API_KEY` not set |
|
|
200
|
+
| 4 | Local image file not found / unreadable |
|
|
201
|
+
|
|
202
|
+
## Composing with `mimo_chat.py`
|
|
203
|
+
|
|
204
|
+
OCR + downstream LLM call is a common pattern:
|
|
205
|
+
|
|
206
|
+
```bash
|
|
207
|
+
TEXT=$(python3 mimoskill/scripts/ocr.py invoice.png)
python3 mimoskill/scripts/mimo_chat.py "Summarize this invoice:
$TEXT"
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
Or structured + parse:
|
|
212
|
+
|
|
213
|
+
```bash
|
|
214
|
+
JSON=$(python3 mimoskill/scripts/ocr.py --mode structured invoice.png)
|
|
215
|
+
echo "$JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d['summary'])"
|
|
216
|
+
```
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
generate_image.py — general (non-pet) image generation.
|
|
4
|
+
|
|
5
|
+
Thin wrapper over generate_pet.py: same providers (auto / pollinations /
|
|
6
|
+
gpt-image-1 / replicate / local-sd), no chibi-pet prompt boilerplate,
|
|
7
|
+
plus an optional --style for common looks.
|
|
8
|
+
|
|
9
|
+
For Codex /hatch pets, keep using generate_pet.py — it has pet-tuned prompt
|
|
10
|
+
prefixes and the --bundle (idle/working/done) state machine.
|
|
11
|
+
|
|
12
|
+
Usage:
|
|
13
|
+
# free, no key
|
|
14
|
+
python3 generate_image.py --prompt "isometric cyberpunk city at dusk" --out out.png
|
|
15
|
+
|
|
16
|
+
# styled
|
|
17
|
+
python3 generate_image.py --style pixel-art --prompt "a brave knight" --out k.png
|
|
18
|
+
|
|
19
|
+
# best quality (needs PET_OPENAI_API_KEY — same env var as the pet flow)
|
|
20
|
+
python3 generate_image.py --provider gpt-image-1 --prompt "..." --out out.png
|
|
21
|
+
|
|
22
|
+
# multiple variants
|
|
23
|
+
python3 generate_image.py --n 4 --prompt "..." --out img.png
|
|
24
|
+
# produces img-1.png, img-2.png, img-3.png, img-4.png
|
|
25
|
+
|
|
26
|
+
Only depends on the standard library.
|
|
27
|
+
"""
|
|
28
|
+
from __future__ import annotations
|
|
29
|
+
|
|
30
|
+
import argparse
|
|
31
|
+
import importlib.util
|
|
32
|
+
import sys
|
|
33
|
+
from pathlib import Path
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# Load generate_pet.py as a module by absolute path (not `import generate_pet`)
# — the skill is invoked from arbitrary cwd, and we don't ship an __init__.py.
_HERE = Path(__file__).resolve().parent
_GP_PATH = _HERE / "generate_pet.py"
# Build an import spec for the sibling file; the private module name
# "_generate_pet" avoids shadowing any real top-level `generate_pet` module.
_spec = importlib.util.spec_from_file_location("_generate_pet", _GP_PATH)
if _spec is None or _spec.loader is None:
    # Exit code 2 matches this script's argv/usage-error convention.
    sys.stderr.write(f"error: cannot load {_GP_PATH}\n")
    sys.exit(2)
_gp = importlib.util.module_from_spec(_spec)
# NOTE(review): exec_module runs generate_pet.py's top level at import time —
# presumably side-effect free beyond definitions; confirm against that file.
_spec.loader.exec_module(_gp)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# --- style presets ----------------------------------------------------------

# Each preset maps to a (prefix, suffix) pair wrapped around the user prompt.
# Key order matters: `list(STYLES)` becomes the --style choices list shown in
# --help, so entries are kept in their documented order.
STYLES: dict[str, tuple[str, str]] = {
    "plain": ("", ""),
    "pixel-art": (
        "Retro 16-bit pixel art sprite of ",
        ", transparent background, single sprite, nearest-neighbor",
    ),
    "photo": (
        "Photorealistic photograph of ",
        ", natural lighting, sharp focus, shallow depth of field",
    ),
    "3d-render": (
        "Cute 3D render of ",
        ", soft global illumination, octane render",
    ),
    "line-art": (
        "Black ink line art of ",
        ", clean linework, white background, no shading",
    ),
    "watercolor": (
        "Hand-drawn ink and watercolor of ",
        ", loose linework, watercolor wash",
    ),
    "sticker": (
        "Chibi sticker mascot of ",
        ", transparent background, soft cel-shading, single character",
    ),
}


def apply_style(prompt: str, style: str) -> str:
    """Wrap *prompt* in the prefix/suffix pair registered for *style*.

    The "plain" preset (empty prefix and suffix) returns the prompt
    untouched. For every other preset, surrounding whitespace and trailing
    `.`, `,`, `;` are trimmed so the appended suffix reads naturally.
    """
    prefix, suffix = STYLES[style]
    if prefix or suffix:
        core = prompt.strip().rstrip(".,;")
        return prefix + core + suffix
    return prompt
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
# --- main -------------------------------------------------------------------

def main() -> None:
    """CLI entry point: parse args, resolve the provider, generate N images.

    Exits with status 2 on usage errors (mirroring argparse's own exit code).
    All progress/status output goes to stderr; nothing is written to stdout.
    """
    parser = argparse.ArgumentParser(
        description=__doc__.split("\n", 1)[0],
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--prompt", required=True, help="what to draw (used verbatim by default)")
    parser.add_argument(
        "--style",
        choices=list(STYLES),
        default="plain",
        help="optional prompt preset (default: plain — no prefix/suffix)",
    )
    parser.add_argument(
        "--provider",
        choices=["auto"] + list(_gp.PROVIDERS),
        default="auto",
        help="image gen backend (same as generate_pet.py)",
    )
    parser.add_argument("--reference", type=Path, help="reference image (gpt-image-1 only)")
    parser.add_argument("--quality", default="medium", choices=["low", "medium", "high", "hd"])
    parser.add_argument("--out", type=Path, required=True, help="output path (PNG)")
    parser.add_argument("--n", type=int, default=1, help="number of variants to generate")
    parser.add_argument("--size", default=None, help="forwarded where supported (e.g. 1024x1024)")
    parser.add_argument("--seed", type=int, default=None, help="forwarded where supported")
    args = parser.parse_args()

    # Guard clause: reject nonsensical variant counts before doing any work.
    if args.n < 1:
        sys.stderr.write("error: --n must be >= 1\n")
        sys.exit(2)

    # Resolve auto provider with the same status line generate_pet.py emits.
    if args.provider == "auto":
        chosen = _gp.resolve_auto_provider()
        if chosen == "pollinations":
            banner = (
                "[provider] auto -> pollinations (free, no key required).\n"
                " For higher quality, set PET_OPENAI_API_KEY (real OpenAI key)\n"
                " and rerun, or pass --provider replicate / local-sd.\n\n"
            )
        else:
            banner = f"[provider] auto -> {chosen}\n\n"
        sys.stderr.write(banner)
        args.provider = chosen

    styled_prompt = apply_style(args.prompt, args.style)
    sys.stderr.write(f"prompt: {styled_prompt}\n")

    # --size / --seed: emit a note where the underlying provider doesn't
    # plumb them through. v1 forwards nothing (generate_pet.py hard-codes
    # 1024x1024 / no seed); future versions can extend per-provider.
    if args.size and args.size != "1024x1024":
        sys.stderr.write(
            f"note: --size {args.size} ignored in v1 (providers run at 1024x1024).\n"
        )
    if args.seed is not None:
        sys.stderr.write(
            "note: --seed ignored in v1 (not plumbed through to providers yet).\n"
        )

    # A single variant keeps the exact --out path; multiple variants become
    # numbered siblings: img.png -> img-1.png, img-2.png, ...
    if args.n == 1:
        targets = [args.out]
    else:
        ext = args.out.suffix or ".png"
        targets = [
            args.out.parent / f"{args.out.stem}-{k}{ext}"
            for k in range(1, args.n + 1)
        ]

    for idx, out in enumerate(targets, start=1):
        sys.stderr.write(f"generating ({idx}/{args.n}) -> {out}\n")
        _gp.generate_one(args.provider, styled_prompt, args.reference, args.quality, out)

    sys.stderr.write(f"\n[ok] wrote {args.n} image(s)\n")


if __name__ == "__main__":
    main()
|