mimo2codex 0.1.15 → 0.1.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +24 -5
- package/README.md +70 -6
- package/README.zh.md +69 -6
- package/dist/admin/router.js +117 -2
- package/dist/admin/router.js.map +1 -1
- package/dist/cli.js +67 -147
- package/dist/cli.js.map +1 -1
- package/dist/config.js +16 -10
- package/dist/config.js.map +1 -1
- package/dist/db/logs.js +80 -0
- package/dist/db/logs.js.map +1 -1
- package/dist/providers/generic.js +96 -0
- package/dist/providers/generic.js.map +1 -0
- package/dist/providers/genericLoader.js +229 -0
- package/dist/providers/genericLoader.js.map +1 -0
- package/dist/providers/registry.js +48 -10
- package/dist/providers/registry.js.map +1 -1
- package/dist/server.js +201 -1
- package/dist/server.js.map +1 -1
- package/dist/setup/snippets.js +187 -0
- package/dist/setup/snippets.js.map +1 -0
- package/dist/translate/reqToChat.js +42 -2
- package/dist/translate/reqToChat.js.map +1 -1
- package/dist/upstream/openaiCompatClient.js +32 -11
- package/dist/upstream/openaiCompatClient.js.map +1 -1
- package/dist/web/assets/index-D19ffnSJ.css +1 -0
- package/dist/web/assets/index-DPLJprJ4.js +67 -0
- package/dist/web/index.html +2 -2
- package/doc/generic-providers.md +399 -0
- package/doc/generic-providers.zh.md +399 -0
- package/doc/mimoskill.md +295 -0
- package/doc/mimoskill.zh.md +295 -0
- package/mimoskill/SKILL.md +80 -13
- package/mimoskill/references/ocr_workflow.md +240 -0
- package/mimoskill/scripts/generate_image.py +163 -0
- package/mimoskill/scripts/mimo_chat.py +111 -42
- package/mimoskill/scripts/ocr.py +445 -0
- package/package.json +5 -4
- package/dist/web/assets/index-BoykBCnY.js +0 -67
- package/dist/web/assets/index-DAJbSznk.css +0 -1
package/mimoskill/SKILL.md
CHANGED
@@ -1,6 +1,6 @@
 ---
 name: mimoskill
-description: Use Xiaomi MiMo V2.5 (the LLM behind mimo2codex) for chat, vision, web search, TTS and ASR — and route around capabilities MiMo doesn't natively support, especially image generation
+description: Use Xiaomi MiMo V2.5 (the LLM behind mimo2codex) for chat, vision, web search, TTS and ASR — and route around capabilities MiMo doesn't natively support, especially OCR / image recognition / 识图 / 提取图片文字 / extract text from image when the current model can't see images, and image generation / 图像生成 / 生成图片 / draw a picture / 画一张 including Codex Pets `/hatch`. Trigger when the user mentions MiMo, calls into mimo2codex, asks to read text from an image, asks to describe or 识别 an image while using a non-vision model (mimo-v2.5-pro, mimo-v2-flash, …), asks to generate / hatch a Codex pet, asks for image generation while using MiMo as the chat backend, or hits a "no image generation available" / "image_gen tool unavailable" / "this model does not support image input" message inside Codex.
 ---

 # mimoskill — Xiaomi MiMo V2.5 + gap fillers
@@ -18,6 +18,8 @@ Trigger this skill when:
 - User asks "how do I generate a Codex pet" / "/hatch isn't working" / "image_gen tool not available"
 - User wants image generation as part of a MiMo-backed workflow
 - User pastes the Codex error: `the image generation tool (image_gen) is not available in this environment` or `the CLI fallback requires the openai Python package`
+- User wants to **OCR / read text from / describe / 识别 / 提取文字 from an image** while the active chat model is non-vision (e.g. mimo-v2.5-pro, mimo-v2-flash, deepseek-*, or any third-party text-only model) — use `scripts/ocr.py`. Works with or without a MiMo key (free pollinations fallback when `MIMO_API_KEY` is unset).
+- User sees the proxy's `[N image attachment(s) omitted: this model does not support image input …]` placeholder in their transcript
 - Anything in the `mimo2codex` repo that touches a feature MiMo doesn't support

 ## What MiMo V2.5 does and doesn't do
@@ -35,7 +37,8 @@ Quick answer:
 | ASR (speech recog) | ✅ | `mimo-v2.5-asr` | separate endpoint |
 | Audio chat | ✅ | `mimo-v2-omni` | input only |
 | Video understanding | ✅ | `mimo-v2-omni` | input only |
-| **Image generation** | ❌ | — |
+| **Image generation** | ❌ | — | `scripts/generate_image.py` (general) or `scripts/generate_pet.py` (Codex pets) — see below |
+| OCR / 识图 (when chat model is non-vision) | ⚠️ via `mimo-v2.5` or free pollinations | `scripts/ocr.py` | `--engine auto`: mimo if `MIMO_API_KEY` set, else pollinations (no key) |
 | Code interpreter / sandbox | ❌ | — | not provided |

 For the full capability matrix and examples, read [references/models.md](references/models.md).
@@ -43,31 +46,95 @@ For the full capability matrix and examples, read [references/models.md](referen
 ## Decision tree: what does the user actually want?

 ```
-Is it
-
-
+Is it OCR / read text from image / describe / 识别 an image
+when the active chat model is non-vision?
+├── Yes → use scripts/ocr.py (mimo-v2.5 if MIMO_API_KEY set, else free pollinations)
+└── No
 │
-Is it
-├── Yes → see "
-└── No
+Is it chat / vision / search / TTS / ASR with a vision-capable model?
+├── Yes → use MiMo directly (see "Calling MiMo directly" below) or via mimo2codex if Codex is the client
+└── No, they want image generation
+│
+Is it for a Codex pet (`/hatch`)?
+├── Yes → see "Generating a Codex pet" below (scripts/generate_pet.py + install_pet.sh)
+└── No → see "General (non-pet) image generation" below (scripts/generate_image.py)
 ```

-## Calling
+## Calling chat directly (works without any key)

-Use `scripts/mimo_chat.py`
+Use `scripts/mimo_chat.py` for one-shot or streaming chat. Two engines, `--engine auto` (default) picks `mimo` if `MIMO_API_KEY` is set, else `pollinations` (free, no key) — so **the script works without any key** for text and vision.

 ```bash
+# Zero-setup — uses pollinations fallback when MIMO_API_KEY is unset
+python3 mimoskill/scripts/mimo_chat.py "your prompt here"
+python3 mimoskill/scripts/mimo_chat.py --image https://example.com/x.png "describe this"
+
+# Best quality + MiMo-specific features (web search, TTS, ASR)
 export MIMO_API_KEY=sk-xxxxxxxxxxxxxxxx
 python3 mimoskill/scripts/mimo_chat.py "your prompt here"
-python3 mimoskill/scripts/mimo_chat.py
-python3 mimoskill/scripts/mimo_chat.py --search "今天上海天气?"
+python3 mimoskill/scripts/mimo_chat.py "今天上海天气?"   # web search auto-enabled on sk-* keys
 python3 mimoskill/scripts/mimo_chat.py --stream "tell me a story"
 ```

-
+When the mimo engine is active the script handles all MiMo-specific quirks — `max_completion_tokens` instead of `max_tokens`, the required `text` part next to `image_url`, `reasoning_content` round-tripping, etc. **Web search is auto-enabled on pay-as-you-go (`sk-*`) keys** — the `web_search` builtin is always included in the tools array and the model decides when to invoke it (`tool_choice: "auto"`). Token-plan (`tp-*`) keys skip web search (the endpoint doesn't support it). The pollinations engine doesn't support web search, TTS, or ASR (those are MiMo native features); it auto-switches to OpenAI-compat field names (`max_tokens`).

 For non-trivial integrations, [references/models.md](references/models.md) and [the official MiMo OpenAI-compat doc](https://platform.xiaomimimo.com/docs/api/chat/openai-api) are the authoritative references.

+## OCR / image recognition (when the chat model can't see images)
+
+If the user wants to **read text from an image** or **describe / 识别 an image** but the current chat model is non-vision (`mimo-v2.5-pro`, `mimo-v2.5-pro[1m]`, `mimo-v2-flash`, `deepseek-*`, or any third-party text-only model), invoke `scripts/ocr.py`. Two engines, `--engine auto` (default) picks the right one:
+
+- **`mimo`** — needs `MIMO_API_KEY`, uses `mimo-v2.5` regardless of the chat model. Best quality.
+- **`pollinations`** — free public vision endpoint at `text.pollinations.ai`, **no key required**. Mirrors the same no-key fallback `generate_pet.py` uses. Rate-limited but always available — covers users who only have a DeepSeek key (or no key at all).
+
+The proxy silently drops image attachments on non-vision models (`src/translate/reqToChat.ts:48-72`) and leaves a `[N image attachment(s) omitted: …]` placeholder. **When you see that placeholder in the transcript, the right move is to run ocr.py and feed the text back into the conversation.** Don't ask the user to switch models.
+
+```bash
+# Zero-setup — uses pollinations fallback when MIMO_API_KEY is unset
+python3 mimoskill/scripts/ocr.py path/to/image.png
+python3 mimoskill/scripts/ocr.py --mode describe https://example.com/x.png
+python3 mimoskill/scripts/ocr.py --mode structured a.png b.jpg
+cat scan.png | python3 mimoskill/scripts/ocr.py --mode markdown
+
+# Best quality — set MiMo key, auto picks mimo
+export MIMO_API_KEY=sk-xxxxxxxxxxxxxxxx
+python3 mimoskill/scripts/ocr.py path/to/image.png
+
+# Force the free engine even when you have a MiMo key (e.g. to save quota)
+python3 mimoskill/scripts/ocr.py --engine pollinations form.png
+```
+
+`ocr.py` accepts local paths, http(s) URLs, `data:` URLs, or stdin bytes. Magic-byte sniffs the MIME (PNG / JPEG / GIF / WebP / BMP). Multiple positional args are batched into one upstream call. Non-vision `--model` values are auto-coerced to `mimo-v2.5` with one stderr note (mimo engine only; on pollinations use `--pollinations-model`).
+
+See [references/ocr_workflow.md](references/ocr_workflow.md) for full mode reference, exit codes, JSON shape for `--mode structured`, and the `--lang` / `--prompt` knobs.
+
+## General (non-pet) image generation
+
+For arbitrary image generation, use `scripts/generate_image.py` — a thin wrapper over `generate_pet.py` with the chibi-pet prompt boilerplate removed and an optional `--style` for common looks. Same providers (`auto` / `pollinations` / `gpt-image-1` / `replicate` / `local-sd`), same env vars, same `auto` fallback to free Pollinations when you only have a MiMo key.
+
+```bash
+# free, no key
+python3 mimoskill/scripts/generate_image.py \
+  --prompt "isometric cyberpunk city at dusk" --out /tmp/out.png
+
+# with a style preset
+python3 mimoskill/scripts/generate_image.py --style pixel-art \
+  --prompt "a brave knight" --out /tmp/knight.png
+
+# multiple variants -> /tmp/img-1.png /tmp/img-2.png /tmp/img-3.png /tmp/img-4.png
+python3 mimoskill/scripts/generate_image.py --n 4 \
+  --prompt "watercolor desert sunrise" --out /tmp/img.png
+
+# best quality (needs PET_OPENAI_API_KEY — same env var as the pet flow)
+export PET_OPENAI_API_KEY=sk-real-openai-key
+python3 mimoskill/scripts/generate_image.py --provider gpt-image-1 \
+  --prompt "..." --out /tmp/out.png
+```
+
+`--style` choices: `plain` (default, no prefix), `pixel-art`, `photo`, `3d-render`, `line-art`, `watercolor`, `sticker`. `plain` sends your prompt verbatim — pick that when the user gave a fully-specified prompt.
+
+For **Codex `/hatch` pets** keep using `generate_pet.py` + `install_pet.sh` — that flow is unchanged and tuned for the chibi sprite + 3-state bundle Codex wants.
+
 ## Generating a Codex pet (the `/hatch` alternative)

 **Why this needs special handling**: Codex's built-in `/hatch` pet generation requires OpenAI's image generation API (`gpt-image-1`). MiMo doesn't have an image generation endpoint, and mimo2codex can't fake one. So `/hatch` from inside Codex won't work when Codex is pointed at MiMo.
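The quirks paragraph added above (`max_completion_tokens`, the `text` part required next to `image_url`, the always-included `web_search` builtin with `tool_choice: "auto"`) implies two request shapes. A minimal sketch under those statements — the helper name, the 4096 token limit, and the exact JSON shape of the `web_search` tool entry are assumptions, not `mimo_chat.py`'s actual code:

```python
# Sketch only — not mimo_chat.py. Field names follow the SKILL.md paragraph above;
# the helper name, the 4096 limit, and the web_search tool shape are assumptions.
def build_chat_payload(engine: str, model: str, prompt: str,
                       image_url: str = "", api_key: str = "") -> dict:
    if image_url:
        # MiMo requires a text part alongside every image_url part.
        content = [
            {"type": "image_url", "image_url": {"url": image_url}},
            {"type": "text", "text": prompt},
        ]
    else:
        content = prompt
    payload = {"model": model, "messages": [{"role": "user", "content": content}]}
    if engine == "mimo":
        payload["max_completion_tokens"] = 4096           # MiMo-specific field name
        if api_key.startswith("sk-"):                     # pay-as-you-go keys only
            payload["tools"] = [{"type": "web_search"}]   # assumed shape of the builtin
            payload["tool_choice"] = "auto"
    else:  # pollinations: plain OpenAI-compat field names, no MiMo builtins
        payload["max_tokens"] = 4096
    return payload
```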

package/mimoskill/references/ocr_workflow.md
ADDED
@@ -0,0 +1,240 @@
# OCR / image recognition workflow

`mimoskill/scripts/ocr.py` is the fallback path for reading or describing
images when the surrounding chat model can't see them. Two engines:

| Engine | Needs API key? | Quality | Notes |
|---|---|---|---|
| `mimo` | yes (`MIMO_API_KEY`) | best | Calls `mimo-v2.5` regardless of the chat model used elsewhere. |
| `pollinations` | **no** | decent | Free public endpoint at `text.pollinations.ai`. Rate-limited but no signup. |

`--engine auto` (default) picks `mimo` if `MIMO_API_KEY` is set, else falls
back to `pollinations` so users with only a DeepSeek key (or no key at all)
still get OCR.

## TL;DR

```bash
# Zero-setup — uses free pollinations fallback when MIMO_API_KEY is unset
python3 mimoskill/scripts/ocr.py path/to/image.png
python3 mimoskill/scripts/ocr.py --mode describe path/to/image.png
python3 mimoskill/scripts/ocr.py --mode structured a.png b.jpg
python3 mimoskill/scripts/ocr.py --mode markdown form.png

# Force the free engine even when you have a MiMo key (e.g. to save quota)
python3 mimoskill/scripts/ocr.py --engine pollinations form.png

# Best quality — set MiMo key
export MIMO_API_KEY=sk-xxxxxxxxxxxxxxxx
python3 mimoskill/scripts/ocr.py path/to/image.png   # auto -> mimo
```

## Why this skill exists

The proxy strips image attachments when the active chat model can't accept
them (`src/translate/reqToChat.ts:48-72`). Non-vision MiMo variants —
`mimo-v2.5-pro`, `mimo-v2.5-pro[1m]`, `mimo-v2-flash` — return 404
"No endpoints found that support image input" if images are forwarded.
The proxy drops the images and leaves an `[N image attachment(s) omitted: …]`
placeholder so the conversation doesn't crash.

`ocr.py` is the recommended way to recover that content **without changing
the chat model**: it independently calls `mimo-v2.5`, returns text, and the
caller pipes that text back into the conversation as a normal user message.

## Input modes

The positional `IMAGE` args (0 or more) accept:

| Form | Example | What ocr.py does |
|---|---|---|
| Local path | `./scan.png`, `C:\foo.jpg` | reads bytes, magic-byte sniffs MIME, base64-encodes to a `data:` URL |
| `http(s)://` URL | `https://example.com/x.png` | forwarded as-is; MiMo fetches server-side |
| `data:` URL | `data:image/png;base64,…` | forwarded as-is |
| `-` (single dash) | piped from stdin | reads one image's bytes from stdin |
| nothing + non-TTY stdin | `cat x.png \| ocr.py` | same as `-` |

Magic-byte table (file extension is **not** trusted):

| Bytes | MIME |
|---|---|
| `89 50 4E 47 0D 0A 1A 0A` | `image/png` |
| `FF D8 FF` | `image/jpeg` |
| `47 49 46 38 37 61` / `…39 61` | `image/gif` |
| `52 49 46 46 …. 57 45 42 50` | `image/webp` |
| `42 4D` | `image/bmp` |
| (anything else) | falls back to `image/png` |
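The byte prefixes in the table map directly onto `startswith` checks. A sketch of that sniffing plus the local-path → `data:` URL step from the input-modes table — the function names are illustrative, not `ocr.py`'s internals:

```python
# Sketch of the detection the table above describes; names are illustrative.
import base64

def sniff_mime(data: bytes) -> str:
    if data.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if data.startswith(b"\xff\xd8\xff"):
        return "image/jpeg"
    if data.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
        return "image/webp"
    if data.startswith(b"BM"):
        return "image/bmp"
    return "image/png"  # fallback row in the table

def to_data_url(path: str) -> str:
    # Local paths are read and base64-encoded into a data: URL before upload.
    with open(path, "rb") as f:
        data = f.read()
    return f"data:{sniff_mime(data)};base64,{base64.b64encode(data).decode()}"
```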

## Output modes (`--mode`)

### `text` (default) — verbatim OCR

```bash
python3 mimoskill/scripts/ocr.py invoice.png
```

Stdout is the raw extracted text. Line breaks, reading order, and rough
column/table layout (whitespace + pipes) are preserved. No commentary, no
translation, no summary. Unreadable spans become `[unreadable]`. Image with
no text returns the single line `[no text detected]`.

### `describe` — short prose description

```bash
python3 mimoskill/scripts/ocr.py --mode describe screenshot.png
```

2-4 sentences covering layout, key elements, visible text (quoted), and
notable colors. No invented details.

### `structured` — JSON

```bash
python3 mimoskill/scripts/ocr.py --mode structured form.png
```

Stdout is a single JSON object:

```json
{
  "text": "...",
  "language": "zh-Hans",
  "regions": [
    {"label": "title", "text": "增值税电子发票", "role": "title"},
    {"label": "buyer", "text": "...", "role": "paragraph"},
    {"label": "items", "text": "...", "role": "table"}
  ],
  "summary": "A Chinese VAT e-invoice with buyer/seller and four line items."
}
```

`regions[].role` is one of `title`, `paragraph`, `list`, `table`, `caption`,
`ui`, `handwriting`, `other`.

**Note**: `structured` returns **logical regions** (role classification),
not pixel bounding boxes. MiMo does not currently expose grounded pixel
coordinates the way some other vision models do; this skill won't pretend
to. If you need pixel boxes, use a model that does (e.g. Gemini grounding,
Tesseract with `--psm 6` + position data).
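Because regions carry role labels rather than coordinates, downstream code filters on `role`. A small sketch that assumes only the JSON shape documented above:

```python
# Consume --mode structured output; assumes only the documented JSON shape.
import json, subprocess

out = subprocess.run(
    ["python3", "mimoskill/scripts/ocr.py", "--mode", "structured", "form.png"],
    capture_output=True, text=True, check=True,
).stdout
doc = json.loads(out)

tables = [r["text"] for r in doc["regions"] if r["role"] == "table"]
print(doc["summary"])
print(f"{len(tables)} table region(s) found")
```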

### `markdown` — re-render as GFM

```bash
python3 mimoskill/scripts/ocr.py --mode markdown spec.png
```

Headings become `#`/`##`, tables become pipe tables, code-like text becomes
fenced blocks, lists become `-`. Reading order preserved. Output is the
Markdown body only — no preamble, no outer fence.

## Batch (multi-image) calls

Pass multiple positional args:

```bash
python3 mimoskill/scripts/ocr.py page1.png page2.png page3.png
```

All images go to MiMo in a **single** chat completion (one billable call).
The model can cross-reference (e.g. ID front + back). Output is a single
text body in reading order across the images.

When you need a different prompt per image, run `ocr.py` N times instead.

## `--lang` and `--prompt`

- `--lang LANG` appends `Primary language: <LANG>.` to the prompt. Useful
  for CJK to prevent the model from outputting Pinyin transliteration:
  `ocr.py --lang Chinese scan.png` or `--lang zh` or `--lang 日本語`.

- `--prompt EXTRA` appends a free-text instruction:
  `ocr.py --mode text --prompt "Only handwriting, ignore printed text." form.png`

## Model selection

| You pass | ocr.py uses |
|---|---|
| nothing | `$MIMO_OCR_MODEL` → `$MIMO_MODEL` (if vision-capable) → `mimo-v2.5` |
| `--model mimo-v2.5` | `mimo-v2.5` |
| `--model mimo-v2.5[1m]` | `mimo-v2.5[1m]` |
| `--model mimo-v2-omni` | `mimo-v2-omni` |
| `--model mimo-v2.5-pro` | **switches to `mimo-v2.5`** (stderr note) |
| `--model mimo-v2.5-pro[1m]` | **switches to `mimo-v2.5`** |
| `--model mimo-v2-flash` | **switches to `mimo-v2.5`** |

Non-vision models would return 404 from MiMo, so the script coerces them
silently (one stderr line) rather than failing.
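The coercion amounts to a lookup against the known text-only models. Roughly, under the fallback chain in the table (constant and function names are illustrative, not `ocr.py`'s internals):

```python
# Illustrative sketch of the model-selection rule described above.
import os
import sys

NON_VISION = {"mimo-v2.5-pro", "mimo-v2.5-pro[1m]", "mimo-v2-flash"}

def pick_ocr_model(requested: str = "") -> str:
    # --model, then $MIMO_OCR_MODEL, then $MIMO_MODEL, then mimo-v2.5.
    model = (requested or os.environ.get("MIMO_OCR_MODEL")
             or os.environ.get("MIMO_MODEL") or "mimo-v2.5")
    if model in NON_VISION:
        sys.stderr.write(f"[model] {model} can't see images; using mimo-v2.5\n")
        return "mimo-v2.5"
    return model
```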

## When `MIMO_API_KEY` isn't set

`--engine auto` (the default) silently falls back to `pollinations`:

```
[engine] auto -> pollinations (free, no key). Set MIMO_API_KEY for higher quality (mimo-v2.5).
[ocr] engine=pollinations mode=text model=openai images=1
<extracted text>
```

Exit code `3` is only raised when the user explicitly passes `--engine mimo`
without a key (passing the flag is treated as an assertion that MiMo should
be used; auto-falling-back would mask the misconfiguration).

If you'd rather use **fully-local OCR** with no network at all, install
tesseract and shell to it directly — this skill won't auto-invoke it:

```bash
macOS:   brew install tesseract tesseract-lang
Ubuntu:  sudo apt install tesseract-ocr tesseract-ocr-chi-sim
Windows: https://github.com/UB-Mannheim/tesseract/wiki
tesseract <image> - -l eng+chi_sim
```

## Pollinations specifics

- Endpoint: `https://text.pollinations.ai/openai` (OpenAI Chat Completions
  compatible).
- Default model: `openai` (vision-capable). Override with
  `--pollinations-model <name>` or `POLLINATIONS_MODEL=<name>`. Other
  vision-capable picks include `openai-large`, `openai-fast`.
- No `Authorization` header is sent; the service is open. Rate limits apply
  per-IP; if you hit them you'll see HTTP 429 in stderr — wait or retry.
- `reasoning_content` is normally empty for pollinations responses (the
  underlying models don't expose chain-of-thought).
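A minimal no-key call against that endpoint, stdlib only. The URL, the `openai` model name, and the absent `Authorization` header come from this section; the message and response layout is the standard OpenAI chat-completions shape the endpoint mirrors, so treat the field access as an assumption rather than a guarantee:

```python
# Sketch: free pollinations vision request with no API key (stdlib only).
import json, urllib.request

payload = {
    "model": "openai",
    "messages": [{
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": "https://example.com/x.png"}},
            {"type": "text", "text": "Extract all text from this image verbatim."},
        ],
    }],
}
req = urllib.request.Request(
    "https://text.pollinations.ai/openai",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},  # no Authorization header needed
)
with urllib.request.urlopen(req) as resp:
    body = json.load(resp)
print(body["choices"][0]["message"]["content"])
```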

## Common pitfalls

- **PDFs are not supported directly.** Rasterize first with one of:
  - `pdftoppm -png input.pdf out` (Poppler)
  - `mutool draw -o out-%d.png input.pdf` (MuPDF)
  - macOS: `sips -s format png input.pdf --out out.png`
- **Multi-image batches share one prompt.** If you need different modes /
  languages per image, invoke `ocr.py` once per image.
- **`structured` mode is logical regions, not pixel boxes.** See above.
- **`--stream` + `structured`**: the streamed body is still a single JSON
  object; buffer it before parsing.

## Exit codes

| Code | Meaning |
|---|---|
| 0 | Success |
| 1 | Upstream HTTP error (MiMo or Pollinations; error body printed to stderr) |
| 2 | argv / usage error (no image, mutually exclusive flags, etc.) |
| 3 | `--engine mimo` explicitly requested but `MIMO_API_KEY` not set |
| 4 | Local image file not found / unreadable |
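One way to act on these codes from a caller: request the mimo engine explicitly and fall back to the free engine only on code 3 (missing `MIMO_API_KEY`). An illustrative wrapper, not part of the skill:

```python
# Illustrative wrapper around ocr.py's exit codes (table above).
import subprocess, sys

def ocr(image: str, engine: str) -> subprocess.CompletedProcess:
    return subprocess.run(
        ["python3", "mimoskill/scripts/ocr.py", "--engine", engine, image],
        capture_output=True, text=True,
    )

result = ocr("scan.png", "mimo")
if result.returncode == 3:          # --engine mimo without MIMO_API_KEY
    result = ocr("scan.png", "pollinations")
if result.returncode != 0:
    sys.stderr.write(result.stderr)
    sys.exit(result.returncode)
print(result.stdout)
```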

## Composing with `mimo_chat.py`

OCR + downstream LLM call is a common pattern:

```bash
TEXT=$(python3 mimoskill/scripts/ocr.py invoice.png)
python3 mimoskill/scripts/mimo_chat.py "Summarize this invoice:\n$TEXT"
```

Or structured + parse:

```bash
JSON=$(python3 mimoskill/scripts/ocr.py --mode structured invoice.png)
echo "$JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d['summary'])"
```

package/mimoskill/scripts/generate_image.py
ADDED
@@ -0,0 +1,163 @@
#!/usr/bin/env python3
"""
generate_image.py — general (non-pet) image generation.

Thin wrapper over generate_pet.py: same providers (auto / pollinations /
gpt-image-1 / replicate / local-sd), no chibi-pet prompt boilerplate,
plus an optional --style for common looks.

For Codex /hatch pets, keep using generate_pet.py — it has pet-tuned prompt
prefixes and the --bundle (idle/working/done) state machine.

Usage:
  # free, no key
  python3 generate_image.py --prompt "isometric cyberpunk city at dusk" --out out.png

  # styled
  python3 generate_image.py --style pixel-art --prompt "a brave knight" --out k.png

  # best quality (needs PET_OPENAI_API_KEY — same env var as the pet flow)
  python3 generate_image.py --provider gpt-image-1 --prompt "..." --out out.png

  # multiple variants
  python3 generate_image.py --n 4 --prompt "..." --out img.png
  # produces img-1.png, img-2.png, img-3.png, img-4.png

Only depends on the standard library.
"""
from __future__ import annotations

import argparse
import importlib.util
import sys
from pathlib import Path


# Load generate_pet.py as a module by absolute path (not `import generate_pet`)
# — the skill is invoked from arbitrary cwd, and we don't ship an __init__.py.
_HERE = Path(__file__).resolve().parent
_GP_PATH = _HERE / "generate_pet.py"
_spec = importlib.util.spec_from_file_location("_generate_pet", _GP_PATH)
if _spec is None or _spec.loader is None:
    sys.stderr.write(f"error: cannot load {_GP_PATH}\n")
    sys.exit(2)
_gp = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(_gp)


# --- style presets ----------------------------------------------------------

STYLES: dict[str, tuple[str, str]] = {
    "plain": ("", ""),
    "pixel-art": (
        "Retro 16-bit pixel art sprite of ",
        ", transparent background, single sprite, nearest-neighbor",
    ),
    "photo": (
        "Photorealistic photograph of ",
        ", natural lighting, sharp focus, shallow depth of field",
    ),
    "3d-render": (
        "Cute 3D render of ",
        ", soft global illumination, octane render",
    ),
    "line-art": (
        "Black ink line art of ",
        ", clean linework, white background, no shading",
    ),
    "watercolor": (
        "Hand-drawn ink and watercolor of ",
        ", loose linework, watercolor wash",
    ),
    "sticker": (
        "Chibi sticker mascot of ",
        ", transparent background, soft cel-shading, single character",
    ),
}


def apply_style(prompt: str, style: str) -> str:
    prefix, suffix = STYLES[style]
    if not prefix and not suffix:
        return prompt
    body = prompt.strip().rstrip(".,;")
    return f"{prefix}{body}{suffix}"


# --- main -------------------------------------------------------------------

def main() -> None:
    p = argparse.ArgumentParser(
        description=__doc__.split("\n", 1)[0],
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument("--prompt", required=True, help="what to draw (used verbatim by default)")
    p.add_argument(
        "--style",
        choices=list(STYLES),
        default="plain",
        help="optional prompt preset (default: plain — no prefix/suffix)",
    )
    p.add_argument(
        "--provider",
        choices=["auto"] + list(_gp.PROVIDERS),
        default="auto",
        help="image gen backend (same as generate_pet.py)",
    )
    p.add_argument("--reference", type=Path, help="reference image (gpt-image-1 only)")
    p.add_argument("--quality", default="medium", choices=["low", "medium", "high", "hd"])
    p.add_argument("--out", type=Path, required=True, help="output path (PNG)")
    p.add_argument("--n", type=int, default=1, help="number of variants to generate")
    p.add_argument("--size", default=None, help="forwarded where supported (e.g. 1024x1024)")
    p.add_argument("--seed", type=int, default=None, help="forwarded where supported")
    args = p.parse_args()

    if args.n < 1:
        sys.stderr.write("error: --n must be >= 1\n")
        sys.exit(2)

    # Resolve auto provider with the same status line generate_pet.py emits.
    if args.provider == "auto":
        chosen = _gp.resolve_auto_provider()
        if chosen == "pollinations":
            sys.stderr.write(
                "[provider] auto -> pollinations (free, no key required).\n"
                "  For higher quality, set PET_OPENAI_API_KEY (real OpenAI key)\n"
                "  and rerun, or pass --provider replicate / local-sd.\n\n"
            )
        else:
            sys.stderr.write(f"[provider] auto -> {chosen}\n\n")
        args.provider = chosen

    final_prompt = apply_style(args.prompt, args.style)
    sys.stderr.write(f"prompt: {final_prompt}\n")

    # --size / --seed: emit a note where the underlying provider doesn't
    # plumb them through. v1 forwards nothing (generate_pet.py hard-codes
    # 1024x1024 / no seed); future versions can extend per-provider.
    if args.size and args.size != "1024x1024":
        sys.stderr.write(
            f"note: --size {args.size} ignored in v1 (providers run at 1024x1024).\n"
        )
    if args.seed is not None:
        sys.stderr.write(
            "note: --seed ignored in v1 (not plumbed through to providers yet).\n"
        )

    def out_path_for(i: int) -> Path:
        if args.n == 1:
            return args.out
        stem = args.out.stem
        suffix = args.out.suffix or ".png"
        return args.out.parent / f"{stem}-{i + 1}{suffix}"

    for i in range(args.n):
        out = out_path_for(i)
        sys.stderr.write(f"generating ({i + 1}/{args.n}) -> {out}\n")
        _gp.generate_one(args.provider, final_prompt, args.reference, args.quality, out)

    sys.stderr.write(f"\n[ok] wrote {args.n} image(s)\n")


if __name__ == "__main__":
    main()