mimo2codex 0.1.15 → 0.1.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +24 -5
- package/README.md +70 -6
- package/README.zh.md +69 -6
- package/dist/admin/router.js +117 -2
- package/dist/admin/router.js.map +1 -1
- package/dist/cli.js +67 -147
- package/dist/cli.js.map +1 -1
- package/dist/config.js +16 -10
- package/dist/config.js.map +1 -1
- package/dist/db/logs.js +80 -0
- package/dist/db/logs.js.map +1 -1
- package/dist/providers/generic.js +96 -0
- package/dist/providers/generic.js.map +1 -0
- package/dist/providers/genericLoader.js +229 -0
- package/dist/providers/genericLoader.js.map +1 -0
- package/dist/providers/registry.js +48 -10
- package/dist/providers/registry.js.map +1 -1
- package/dist/server.js +201 -1
- package/dist/server.js.map +1 -1
- package/dist/setup/snippets.js +187 -0
- package/dist/setup/snippets.js.map +1 -0
- package/dist/translate/reqToChat.js +42 -2
- package/dist/translate/reqToChat.js.map +1 -1
- package/dist/upstream/openaiCompatClient.js +32 -11
- package/dist/upstream/openaiCompatClient.js.map +1 -1
- package/dist/web/assets/index-D19ffnSJ.css +1 -0
- package/dist/web/assets/index-DPLJprJ4.js +67 -0
- package/dist/web/index.html +2 -2
- package/doc/generic-providers.md +399 -0
- package/doc/generic-providers.zh.md +399 -0
- package/doc/mimoskill.md +295 -0
- package/doc/mimoskill.zh.md +295 -0
- package/mimoskill/SKILL.md +80 -13
- package/mimoskill/references/ocr_workflow.md +240 -0
- package/mimoskill/scripts/generate_image.py +163 -0
- package/mimoskill/scripts/mimo_chat.py +111 -42
- package/mimoskill/scripts/ocr.py +445 -0
- package/package.json +5 -4
- package/dist/web/assets/index-BoykBCnY.js +0 -67
- package/dist/web/assets/index-DAJbSznk.css +0 -1
|
@@ -1,21 +1,29 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
"""
|
|
3
|
-
mimo_chat.py — single-shot or streaming chat
|
|
3
|
+
mimo_chat.py — single-shot or streaming chat. Works WITHOUT any API key.
|
|
4
4
|
|
|
5
|
-
|
|
6
|
-
|
|
5
|
+
Engines (--engine):
|
|
6
|
+
auto (default) — mimo if MIMO_API_KEY set, else pollinations
|
|
7
|
+
mimo — Xiaomi MiMo V2.5 (best quality, needs MIMO_API_KEY)
|
|
8
|
+
pollinations — pollinations.ai free public chat endpoint. NO KEY REQUIRED
|
|
7
9
|
|
|
10
|
+
When the mimo engine is used, handles the MiMo-specific quirks:
|
|
8
11
|
- max_completion_tokens (not max_tokens)
|
|
9
12
|
- vision via mimo-v2.5 / mimo-v2-omni (and the required text part next to
|
|
10
13
|
image_url, otherwise MiMo 400s with "text is not set")
|
|
11
|
-
- web_search builtin
|
|
14
|
+
- web_search builtin: auto-enabled on pay-as-you-go (sk-*) keys, skipped on
|
|
15
|
+
token-plan (tp-*) keys. Model decides when to invoke (tool_choice: auto).
|
|
16
|
+
Requires the Web Search Plugin to be activated in the MiMo console.
|
|
12
17
|
- reasoning_content extraction
|
|
13
18
|
|
|
14
19
|
Usage:
|
|
15
|
-
|
|
20
|
+
# Zero-setup
|
|
16
21
|
python3 mimo_chat.py "your prompt"
|
|
17
|
-
python3 mimo_chat.py --
|
|
18
|
-
|
|
22
|
+
python3 mimo_chat.py --image https://x/y.png "describe"
|
|
23
|
+
|
|
24
|
+
# MiMo key — gets best quality + native web search (when sk-*)
|
|
25
|
+
export MIMO_API_KEY=sk-xxxx
|
|
26
|
+
python3 mimo_chat.py "今天上海天气?"
|
|
19
27
|
python3 mimo_chat.py --stream "tell me a story"
|
|
20
28
|
|
|
21
29
|
Only depends on the standard library — no `openai` SDK install needed.
|
|
@@ -48,51 +56,64 @@ def build_messages(prompt: str, image: str | None) -> list[dict[str, Any]]:
|
|
|
48
56
|
]
|
|
49
57
|
|
|
50
58
|
|
|
59
|
+
POLLINATIONS_URL = "https://text.pollinations.ai/openai"
POLLINATIONS_DEFAULT_MODEL = "openai"  # vision-capable, free, no key


def build_body(
    *,
    prompt: str,
    image: str | None,
    model: str,
    stream: bool,
    enable_web_search: bool,
    max_tokens: int,
    temperature: float,
    engine: str,
) -> dict[str, Any]:
    """Assemble the JSON payload for one chat-completions request.

    Args:
        prompt: user message text, handed to ``build_messages``.
        image: optional image URL, handed to ``build_messages``.
        model: model id placed in the ``model`` field.
        stream: value of the ``stream`` field.
        enable_web_search: when true, attach the ``web_search`` tool with
            ``tool_choice`` set to ``"auto"``.
        max_tokens: output-token cap; the field name depends on ``engine``.
        temperature: value of the ``temperature`` field.
        engine: ``"mimo"`` selects ``max_completion_tokens``; any other value
            selects ``max_tokens``.

    Returns:
        The request body as a plain dict.
    """
    # MiMo names the output cap max_completion_tokens; every other engine
    # gets the conventional max_tokens key.
    token_field = "max_completion_tokens" if engine == "mimo" else "max_tokens"
    payload: dict[str, Any] = {
        "model": model,
        "messages": build_messages(prompt, image),
        "temperature": temperature,
        "stream": stream,
        token_field: max_tokens,
    }
    if not enable_web_search:
        return payload

    # MiMo's built-in web search; tool_choice=auto leaves the decision to the
    # model. Per the upstream note, the Web Search Plugin must be activated at
    # https://platform.xiaomimimo.com/#/console/plugin or MiMo answers 400.
    payload["tools"] = [{"type": "web_search"}]
    payload["tool_choice"] = "auto"
    return payload
|
|
74
93
|
|
|
75
94
|
|
|
76
|
-
def post(url: str, body: dict[str, Any], api_key: str | None, stream: bool, *, engine: str) -> Any:
    """POST ``body`` as JSON to ``url`` and return the open HTTP response.

    Args:
        url: full endpoint URL.
        body: request payload; serialized with ``json.dumps``.
        api_key: bearer token; the Authorization header is omitted when falsy.
        stream: when true, ask for ``text/event-stream`` instead of JSON.
        engine: backend label used only in error messages.

    Returns:
        The response object returned by ``urllib.request.urlopen``.

    Exits:
        Writes a diagnostic to stderr and exits with status 1 on an HTTP
        error status or on any connection-level failure.
    """
    headers = {
        "Content-Type": "application/json",
        "Accept": "text/event-stream" if stream else "application/json",
        "User-Agent": "mimoskill/0.1",
    }
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"
    req = urllib.request.Request(
        url,
        method="POST",
        data=json.dumps(body).encode("utf-8"),
        headers=headers,
    )
    try:
        return urllib.request.urlopen(req, timeout=300)
    except urllib.error.HTTPError as e:
        snippet = e.read().decode("utf-8", "replace")
        sys.stderr.write(f"{engine} returned HTTP {e.code}: {snippet}\n")
        sys.exit(1)
    except OSError as e:
        # URLError is an OSError subclass, so it is still handled here. The
        # wider catch also covers read timeouts and dropped connections that
        # urllib raises without wrapping them in URLError.
        sys.stderr.write(f"connection failed ({engine}): {e}\n")
        sys.exit(1)
|
|
97
118
|
|
|
98
119
|
|
|
@@ -144,51 +165,99 @@ def main() -> None:
|
|
|
144
165
|
p.add_argument("prompt", nargs="?", default="", help="user message text")
|
|
145
166
|
p.add_argument("--model", default=os.environ.get("MIMO_MODEL", "mimo-v2.5-pro"))
|
|
146
167
|
p.add_argument("--image", help="image URL to attach (forces vision-capable model)")
|
|
147
|
-
p.add_argument("--search", action="store_true", help="enable MiMo web_search builtin")
|
|
148
168
|
p.add_argument("--stream", action="store_true", help="stream the response")
|
|
149
169
|
p.add_argument("--max-tokens", type=int, default=2048)
|
|
150
170
|
p.add_argument("--temperature", type=float, default=0.7)
|
|
171
|
+
p.add_argument(
|
|
172
|
+
"--engine",
|
|
173
|
+
choices=["auto", "mimo", "pollinations"],
|
|
174
|
+
default=os.environ.get("MIMO_CHAT_ENGINE", "auto"),
|
|
175
|
+
help="chat backend. auto = mimo if MIMO_API_KEY set, else pollinations "
|
|
176
|
+
"(free, no key required). default: %(default)s",
|
|
177
|
+
)
|
|
151
178
|
p.add_argument(
|
|
152
179
|
"--base-url",
|
|
153
180
|
default=os.environ.get("MIMO_BASE_URL", "https://api.xiaomimimo.com/v1"),
|
|
154
|
-
help="
|
|
181
|
+
help="MiMo endpoint, ignored when --engine=pollinations "
|
|
182
|
+
"(tp-* keys use https://token-plan-cn.xiaomimimo.com/v1)",
|
|
183
|
+
)
|
|
184
|
+
p.add_argument(
|
|
185
|
+
"--pollinations-model",
|
|
186
|
+
default=os.environ.get("POLLINATIONS_MODEL", POLLINATIONS_DEFAULT_MODEL),
|
|
187
|
+
help="model id when --engine=pollinations (default: %(default)s)",
|
|
155
188
|
)
|
|
156
189
|
args = p.parse_args()
|
|
157
190
|
|
|
158
191
|
api_key = os.environ.get("MIMO_API_KEY")
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
192
|
+
|
|
193
|
+
# Resolve engine.
|
|
194
|
+
if args.engine == "mimo":
|
|
195
|
+
engine = "mimo"
|
|
196
|
+
if not api_key:
|
|
197
|
+
sys.stderr.write(
|
|
198
|
+
"error: --engine mimo requires MIMO_API_KEY.\n"
|
|
199
|
+
" get one at https://platform.xiaomimimo.com/#/console/api-keys\n"
|
|
200
|
+
" OR drop the flag to fall back to pollinations (free, no key required):\n"
|
|
201
|
+
" python3 mimo_chat.py <prompt>\n"
|
|
202
|
+
)
|
|
203
|
+
sys.exit(3)
|
|
204
|
+
elif args.engine == "pollinations":
|
|
205
|
+
engine = "pollinations"
|
|
206
|
+
else: # auto
|
|
207
|
+
engine = "mimo" if api_key else "pollinations"
|
|
208
|
+
if engine == "pollinations":
|
|
209
|
+
sys.stderr.write(
|
|
210
|
+
"[engine] auto -> pollinations (free, no key). "
|
|
211
|
+
"Set MIMO_API_KEY for higher quality (mimo-v2.5).\n"
|
|
212
|
+
)
|
|
165
213
|
|
|
166
214
|
if not args.prompt and not args.image:
|
|
167
215
|
sys.stderr.write("error: pass a prompt and/or --image\n")
|
|
168
216
|
sys.exit(2)
|
|
169
217
|
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
218
|
+
enable_web_search = False
|
|
219
|
+
if engine == "mimo":
|
|
220
|
+
# Auto-bump to a vision model if user passed --image with a non-vision model.
|
|
221
|
+
model = args.model
|
|
222
|
+
if args.image and "omni" not in model.lower() and not model.startswith("mimo-v2.5["):
|
|
223
|
+
if model != "mimo-v2.5":
|
|
224
|
+
sys.stderr.write(
|
|
225
|
+
f"note: --image given but model is '{model}' which doesn't see images.\n"
|
|
226
|
+
f" switching to mimo-v2.5 for this call.\n"
|
|
227
|
+
)
|
|
228
|
+
model = "mimo-v2.5"
|
|
229
|
+
url = args.base_url.rstrip("/") + "/chat/completions"
|
|
230
|
+
auth: str | None = api_key
|
|
231
|
+
# MiMo native web_search: pay-as-you-go (sk-*) supports it, token-plan
|
|
232
|
+
# (tp-*) does not. Always include the tool on sk-* and let the model
|
|
233
|
+
# decide via tool_choice=auto — no extra flag needed.
|
|
234
|
+
enable_web_search = bool(api_key and api_key.startswith("sk-"))
|
|
235
|
+
else:
|
|
236
|
+
# Pollinations: pick the configured vision-capable model. The user's
|
|
237
|
+
# --model (mimo-*) is mimo-specific so we don't honor it here unless
|
|
238
|
+
# they explicitly passed --pollinations-model.
|
|
239
|
+
model = args.pollinations_model
|
|
240
|
+
url = POLLINATIONS_URL
|
|
241
|
+
auth = None
|
|
242
|
+
|
|
243
|
+
sys.stderr.write(
|
|
244
|
+
f"[chat] engine={engine} model={model}"
|
|
245
|
+
+ (" web_search=on" if enable_web_search else "")
|
|
246
|
+
+ "\n"
|
|
247
|
+
)
|
|
179
248
|
|
|
180
249
|
body = build_body(
|
|
181
250
|
prompt=args.prompt,
|
|
182
251
|
image=args.image,
|
|
183
252
|
model=model,
|
|
184
253
|
stream=args.stream,
|
|
185
|
-
|
|
254
|
+
enable_web_search=enable_web_search,
|
|
186
255
|
max_tokens=args.max_tokens,
|
|
187
256
|
temperature=args.temperature,
|
|
257
|
+
engine=engine,
|
|
188
258
|
)
|
|
189
259
|
|
|
190
|
-
|
|
191
|
-
resp = post(url, body, api_key, args.stream)
|
|
260
|
+
resp = post(url, body, auth, args.stream, engine=engine)
|
|
192
261
|
if args.stream:
|
|
193
262
|
stream_chat(resp)
|
|
194
263
|
else:
|
|
@@ -0,0 +1,445 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
ocr.py — OCR / image recognition that works without any API key.
|
|
4
|
+
|
|
5
|
+
Use this when the surrounding chat model can't see images (mimo-v2.5-pro,
|
|
6
|
+
mimo-v2.5-pro[1m], mimo-v2-flash, deepseek-*, or any text-only model).
|
|
7
|
+
|
|
8
|
+
Engines (--engine):
|
|
9
|
+
auto (default) — mimo if MIMO_API_KEY set, else pollinations
|
|
10
|
+
mimo — Xiaomi MiMo V2.5 vision. Highest quality. Needs MIMO_API_KEY
|
|
11
|
+
pollinations — pollinations.ai free public vision endpoint. NO KEY REQUIRED
|
|
12
|
+
|
|
13
|
+
Modes (--mode):
|
|
14
|
+
text (default) verbatim OCR — raw text, preserves line breaks
|
|
15
|
+
describe 2-4 sentence description of the image
|
|
16
|
+
structured single JSON object with text / language / regions / summary
|
|
17
|
+
markdown re-render the image as GitHub-flavored Markdown
|
|
18
|
+
|
|
19
|
+
Image inputs (positional, 0+):
|
|
20
|
+
/path/to/file.png local file → base64 data URL
|
|
21
|
+
https://example.com/x.png http(s) URL → forwarded as-is
|
|
22
|
+
data:image/...;base64,... data URL → forwarded as-is
|
|
23
|
+
- read one image from stdin (bytes)
|
|
24
|
+
(none, stdin not a TTY) same as `-`
|
|
25
|
+
|
|
26
|
+
Usage:
|
|
27
|
+
# Zero-setup: free fallback, works for DeepSeek-only / no-key users
|
|
28
|
+
python3 ocr.py path/to/image.png
|
|
29
|
+
python3 ocr.py --mode describe https://example.com/x.png
|
|
30
|
+
|
|
31
|
+
# Best quality (needs MiMo key)
|
|
32
|
+
export MIMO_API_KEY=sk-xxxx
|
|
33
|
+
python3 ocr.py --mode structured a.png b.jpg
|
|
34
|
+
cat scan.png | python3 ocr.py --mode markdown
|
|
35
|
+
|
|
36
|
+
Only depends on the standard library — no `openai` SDK install needed.
|
|
37
|
+
"""
|
|
38
|
+
from __future__ import annotations
|
|
39
|
+
|
|
40
|
+
import argparse
|
|
41
|
+
import base64
|
|
42
|
+
import json
|
|
43
|
+
import mimetypes
|
|
44
|
+
import os
|
|
45
|
+
import sys
|
|
46
|
+
import urllib.error
|
|
47
|
+
import urllib.request
|
|
48
|
+
from pathlib import Path
|
|
49
|
+
from typing import Any
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# --- modes ------------------------------------------------------------------
|
|
53
|
+
|
|
54
|
+
# Instruction text used as the user-message prompt for each output mode.
# The keys are also the accepted values of the --mode CLI option.
MODE_PROMPTS: dict[str, str] = {
    # Verbatim transcription, with fixed sentinel strings for unreadable
    # regions and for images that contain no text.
    "text": (
        "Extract ALL legible text from the attached image(s) verbatim, "
        "preserving line breaks, reading order, and any obvious column/table "
        "layout using whitespace and pipes. Do not paraphrase, translate, "
        "summarize, or add commentary. If you cannot read part of it, output "
        "`[unreadable]` in place. If the image contains no text, output "
        "exactly the single line `[no text detected]`."
    ),
    # Short natural-language description of the image.
    "describe": (
        "Describe the contents of the attached image(s) in 2-4 sentences. "
        "Mention layout, key visual elements, any visible text (quoted), and "
        "notable colors. Do not invent details that aren't visible."
    ),
    # One JSON object with text / language / regions / summary.
    "structured": (
        "Return ONE JSON object with keys `text` (string, full OCR — same "
        "rules as verbatim text extraction, preserve line breaks and reading "
        "order), `language` (BCP-47 best-guess like \"zh-Hans\" or \"en\"), "
        "`regions` (array of `{label, text, role}` where role is one of "
        "`title`, `paragraph`, `list`, `table`, `caption`, `ui`, "
        "`handwriting`, `other`), and `summary` (1-sentence description). "
        "Output ONLY the JSON, no markdown fences, no preamble."
    ),
    # Re-rendering of the image as GitHub-flavored Markdown.
    "markdown": (
        "Re-render the attached image(s) as GitHub-flavored Markdown. "
        "Headings become `#`/`##`, tables become pipe tables, code-like text "
        "becomes fenced code blocks, lists become `-`. Preserve reading "
        "order. Output ONLY the markdown body — no preamble, no fences "
        "wrapping the whole thing."
    ),
}

# System message that accompanies the "structured" mode prompt.
STRUCTURED_SYSTEM = (
    "You are an OCR engine. Output strictly machine-parseable JSON, "
    "no markdown fences, no commentary."
)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# --- MIME sniffing ----------------------------------------------------------
|
|
93
|
+
|
|
94
|
+
# Leading-byte signatures, checked in order; the first match wins.
_MAGIC = [
    (b"\x89PNG\r\n\x1a\n", "image/png"),
    (b"\xff\xd8\xff", "image/jpeg"),
    (b"GIF87a", "image/gif"),
    (b"GIF89a", "image/gif"),
    (b"BM", "image/bmp"),
    (b"II*\x00", "image/tiff"),  # little-endian TIFF
    (b"MM\x00*", "image/tiff"),  # big-endian TIFF
]


def sniff_mime(data: bytes, hint_name: str | None = None) -> str:
    """Guess the image MIME type of ``data``.

    The leading bytes are matched against known signatures first, so that
    input without a filename (such as stdin) is still labelled correctly.
    If nothing matches, the extension of ``hint_name`` is consulted.

    Args:
        data: raw image bytes (may be empty).
        hint_name: optional filename used only as an extension-based fallback.

    Returns:
        A MIME type string; ``"image/png"`` when nothing else can be
        determined.
    """
    for sig, mime in _MAGIC:
        if data.startswith(sig):
            return mime
    # WebP is a RIFF container: "RIFF" + 4 size bytes + "WEBP".
    if len(data) >= 12 and data[:4] == b"RIFF" and data[8:12] == b"WEBP":
        return "image/webp"
    if hint_name:
        guessed, _ = mimetypes.guess_type(hint_name)
        # Ignore non-image guesses (e.g. a .txt name) and fall through.
        if guessed and guessed.startswith("image/"):
            return guessed
    return "image/png"
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def bytes_to_data_url(data: bytes, hint_name: str | None = None) -> str:
    """Encode raw image bytes as a ``data:<mime>;base64,<payload>`` URL.

    Args:
        data: raw image bytes.
        hint_name: optional filename forwarded to ``sniff_mime``.

    Returns:
        The data URL string.
    """
    payload = base64.b64encode(data).decode("ascii")
    content_type = sniff_mime(data, hint_name)
    return "data:" + content_type + ";base64," + payload
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def resolve_image_arg(arg: str) -> str:
    """Turn a positional IMAGE arg into a URL suitable for image_url.

    Args:
        arg: ``-`` to read image bytes from stdin, an ``http(s)://`` or
            ``data:`` URL (returned unchanged), or a local file path.

    Returns:
        The URL itself, or a base64 data URL built from the bytes read.

    Exits:
        Status 2 when ``-`` is given but stdin is a TTY or yields no bytes;
        status 4 when a local file is missing, unreadable, or empty.
    """
    if arg == "-":
        if sys.stdin.isatty():
            sys.stderr.write("error: `-` requested but stdin is a TTY\n")
            sys.exit(2)
        data = sys.stdin.buffer.read()
        if not data:
            sys.stderr.write("error: stdin was empty\n")
            sys.exit(2)
        return bytes_to_data_url(data)
    if arg.startswith(("http://", "https://", "data:")):
        return arg
    path = Path(arg)
    if not path.exists():
        sys.stderr.write(f"error: image not found: {arg}\n")
        sys.exit(4)
    try:
        data = path.read_bytes()
    except OSError as e:
        sys.stderr.write(f"error: cannot read {arg}: {e}\n")
        sys.exit(4)
    if not data:
        # Mirror the stdin check: a zero-byte file would otherwise be sent
        # upstream as an empty data URL.
        sys.stderr.write(f"error: image file is empty: {arg}\n")
        sys.exit(4)
    return bytes_to_data_url(data, hint_name=path.name)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# --- model auto-select ------------------------------------------------------
|
|
149
|
+
|
|
150
|
+
def model_supports_images(model: str) -> bool:
    """Report whether ``model`` names a vision-capable MiMo model.

    Any ``[...]`` suffix is dropped and case is ignored; a model qualifies
    when its base name contains ``omni`` or is exactly ``mimo-v2.5``.
    Mirrors src/translate/reqToChat.ts:modelSupportsImages.
    """
    family = model.lower().partition("[")[0]
    return "omni" in family or family == "mimo-v2.5"
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def pick_model(cli_model: str | None) -> tuple[str, str | None]:
    """Choose the MiMo model for an OCR call.

    Precedence: an explicit ``cli_model``, then ``$MIMO_OCR_MODEL``, then
    ``$MIMO_MODEL``, then ``mimo-v2.5``. Environment values are used only
    when they are vision-capable; a non-vision ``cli_model`` is replaced by
    ``mimo-v2.5`` and a note is returned.

    Returns:
        ``(chosen_model, note)`` where ``note`` is text for stderr, or
        ``None`` when there is nothing to report.
    """
    if cli_model:
        if not model_supports_images(cli_model):
            warning = (
                f"note: model '{cli_model}' does not see images; "
                f"switching to mimo-v2.5 for this call.\n"
            )
            return "mimo-v2.5", warning
        return cli_model, None

    for env_name in ("MIMO_OCR_MODEL", "MIMO_MODEL"):
        candidate = os.environ.get(env_name)
        if candidate and model_supports_images(candidate):
            return candidate, None
    return "mimo-v2.5", None
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# --- message building -------------------------------------------------------
|
|
179
|
+
|
|
180
|
+
def build_messages(
    *, mode: str, image_urls: list[str], lang: str | None, extra_prompt: str | None
) -> list[dict[str, Any]]:
    """Build the chat ``messages`` list for an OCR request.

    The user message holds one ``image_url`` part per image, followed by a
    single text part: the mode prompt plus the optional language hint and
    extra instruction. In ``structured`` mode a system message comes first.

    Args:
        mode: key into ``MODE_PROMPTS``.
        image_urls: URLs (http(s) or data URLs) to attach, in order.
        lang: optional primary-language hint appended to the prompt.
        extra_prompt: optional free-form instruction appended last.

    Returns:
        The list of message dicts.
    """
    pieces = [MODE_PROMPTS[mode]]
    if lang:
        pieces.append(f" Primary language: {lang}.")
    if extra_prompt:
        pieces.append(f" {extra_prompt}")
    instruction = "".join(pieces)

    parts: list[dict[str, Any]] = []
    for url in image_urls:
        parts.append({"type": "image_url", "image_url": {"url": url}})
    # The text part goes after the images.
    parts.append({"type": "text", "text": instruction})

    user_message: dict[str, Any] = {"role": "user", "content": parts}
    if mode == "structured":
        return [{"role": "system", "content": STRUCTURED_SYSTEM}, user_message]
    return [user_message]
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
# --- HTTP -------------------------------------------------------------------
|
|
202
|
+
|
|
203
|
+
POLLINATIONS_URL = "https://text.pollinations.ai/openai"
POLLINATIONS_DEFAULT_MODEL = "openai"  # vision-capable, free, no key


def post(url: str, body: dict[str, Any], api_key: str | None, stream: bool, *, engine: str) -> Any:
    """POST ``body`` as JSON to ``url`` and return the open HTTP response.

    Args:
        url: full endpoint URL.
        body: request payload; serialized with ``json.dumps``.
        api_key: bearer token; the Authorization header is omitted when falsy.
        stream: when true, ask for ``text/event-stream`` instead of JSON.
        engine: backend label used only in error messages.

    Returns:
        The response object returned by ``urllib.request.urlopen``.

    Exits:
        Writes a diagnostic to stderr and exits with status 1 on an HTTP
        error status or on any connection-level failure.
    """
    headers = {
        "Content-Type": "application/json",
        "Accept": "text/event-stream" if stream else "application/json",
        "User-Agent": "mimoskill-ocr/0.1",
    }
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"
    req = urllib.request.Request(
        url,
        method="POST",
        data=json.dumps(body).encode("utf-8"),
        headers=headers,
    )
    try:
        return urllib.request.urlopen(req, timeout=300)
    except urllib.error.HTTPError as e:
        snippet = e.read().decode("utf-8", "replace")
        sys.stderr.write(f"{engine} returned HTTP {e.code}: {snippet}\n")
        sys.exit(1)
    except OSError as e:
        # URLError is an OSError subclass, so it is still handled here. The
        # wider catch also covers read timeouts and dropped connections that
        # urllib raises without wrapping them in URLError.
        sys.stderr.write(f"connection failed ({engine}): {e}\n")
        sys.exit(1)
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def stream_chat(resp: Any) -> tuple[str, str]:
    """Stream SSE chunks; returns (full_content, full_reasoning).

    Each ``data:`` line is parsed as a JSON chunk. ``content`` deltas are
    echoed to stdout and ``reasoning_content`` deltas to stderr as they
    arrive. Reading stops at the ``[DONE]`` sentinel or when ``resp`` is
    exhausted. Lines that are not ``data:`` events, are not valid JSON, or
    carry no usable delta are skipped.

    Args:
        resp: iterable of raw ``bytes`` lines (the HTTP response).

    Returns:
        ``(content, reasoning)`` — the concatenated deltas of each kind.
    """
    buf_content: list[str] = []
    buf_reasoning: list[str] = []
    for raw in resp:
        line = raw.decode("utf-8", "replace").strip()
        if not line.startswith("data:"):
            continue
        data = line[5:].strip()
        if data == "[DONE]":
            break
        try:
            chunk = json.loads(data)
        except json.JSONDecodeError:
            continue
        if not isinstance(chunk, dict):
            continue
        # Some chunks carry "choices": [] (e.g. usage-only chunks) or
        # "delta": null; treat both as "nothing to emit" instead of crashing.
        choices = chunk.get("choices") or [{}]
        delta = (choices[0] or {}).get("delta") or {}
        if r := delta.get("reasoning_content"):
            buf_reasoning.append(r)
            sys.stderr.write(r)
            sys.stderr.flush()
        if c := delta.get("content"):
            buf_content.append(c)
            sys.stdout.write(c)
            sys.stdout.flush()
    sys.stdout.write("\n")
    return "".join(buf_content), "".join(buf_reasoning)
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def non_stream_chat(resp: Any) -> tuple[str, str, dict[str, Any]]:
    """Returns (content, reasoning_content, usage).

    Reads the whole response body, parses it as JSON and extracts the first
    choice's message.

    Args:
        resp: object with a ``read()`` method returning the body bytes.

    Returns:
        ``(content, reasoning_content, usage)``; missing or null fields
        become ``""`` / ``{}``.

    Exits:
        Writes a diagnostic to stderr and exits with status 1 when the body
        is not JSON or lacks ``choices[0].message``.
    """
    raw = resp.read().decode("utf-8", "replace")
    try:
        payload = json.loads(raw)
        msg = payload["choices"][0]["message"]
        if not isinstance(msg, dict):
            raise TypeError("message is not an object")
    except (json.JSONDecodeError, KeyError, IndexError, TypeError) as e:
        # Report the body instead of a bare traceback; cap the echo so a
        # large HTML error page does not flood the terminal.
        sys.stderr.write(
            f"unexpected response body ({type(e).__name__}): {raw[:500]}\n"
        )
        sys.exit(1)
    return (
        msg.get("content") or "",
        msg.get("reasoning_content") or "",
        payload.get("usage") or {},
    )
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
# --- CLI --------------------------------------------------------------------
|
|
273
|
+
|
|
274
|
+
def main() -> None:
    """CLI entry point: parse arguments, run one OCR request, print the result.

    Steps: resolve the engine (mimo or pollinations), resolve image inputs
    (positional args, or stdin when it is not a TTY), pick the model, POST the
    request, then print the content — optionally wrapped in a JSON envelope.
    Status lines go to stderr.

    Exits:
        Status 2 for usage errors (no image, invalid engine), status 3 when
        ``--engine mimo`` is requested without ``MIMO_API_KEY``. The helpers
        called from here may exit with their own status codes.
    """
    # The module docstring opens with a newline, so its first line is empty;
    # use the first non-blank line as the --help description instead.
    doc_lines = (__doc__ or "").strip().splitlines()
    engine_choices = ["auto", "mimo", "pollinations"]

    p = argparse.ArgumentParser(
        description=doc_lines[0] if doc_lines else None,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument(
        "images",
        nargs="*",
        metavar="IMAGE",
        help="image: local path, http(s) URL, data: URL, or `-` for stdin",
    )
    p.add_argument(
        "--mode",
        choices=list(MODE_PROMPTS),
        default="text",
        help="output mode (default: text)",
    )
    p.add_argument(
        "--model",
        default=None,
        help="MiMo vision model (default: $MIMO_OCR_MODEL / $MIMO_MODEL if "
        "vision-capable / mimo-v2.5). Non-vision models are auto-switched.",
    )
    p.add_argument(
        "--lang",
        default=None,
        help="primary language hint, e.g. 'Chinese', 'zh', '日本語'",
    )
    p.add_argument("--max-tokens", type=int, default=4096)
    p.add_argument("--temperature", type=float, default=0.2)
    p.add_argument(
        "--engine",
        choices=engine_choices,
        # An empty MIMO_OCR_ENGINE is treated the same as an unset one.
        default=os.environ.get("MIMO_OCR_ENGINE") or "auto",
        help="OCR backend. auto = mimo if MIMO_API_KEY set, else pollinations "
        "(free, no key required). default: %(default)s",
    )
    p.add_argument(
        "--base-url",
        default=os.environ.get("MIMO_BASE_URL", "https://api.xiaomimimo.com/v1"),
        help="MiMo OpenAI-compat endpoint, ignored when --engine=pollinations "
        "(default: %(default)s)",
    )
    p.add_argument(
        "--pollinations-model",
        default=os.environ.get("POLLINATIONS_MODEL", POLLINATIONS_DEFAULT_MODEL),
        help="model id when --engine=pollinations (default: %(default)s)",
    )
    p.add_argument(
        "--prompt",
        default=None,
        help="extra instruction appended to the mode prompt",
    )
    p.add_argument("--json", action="store_true", help="wrap stdout as JSON envelope")
    p.add_argument("--stream", action="store_true", help="stream the response")
    args = p.parse_args()

    # argparse checks `choices` only for values given on the command line, not
    # for defaults, so an invalid MIMO_OCR_ENGINE must be rejected here.
    if args.engine not in engine_choices:
        p.error(
            f"invalid MIMO_OCR_ENGINE value {args.engine!r} "
            f"(choose from {', '.join(engine_choices)})"
        )

    api_key = os.environ.get("MIMO_API_KEY")

    # Resolve engine.
    if args.engine == "mimo":
        engine = "mimo"
        if not api_key:
            sys.stderr.write(
                "error: --engine mimo requires MIMO_API_KEY.\n"
                "  set one at https://platform.xiaomimimo.com/#/console/api-keys\n"
                "  OR drop the flag to fall back to pollinations (free, no key required):\n"
                "    python3 ocr.py <image>\n"
            )
            sys.exit(3)
    elif args.engine == "pollinations":
        engine = "pollinations"
    else:  # auto
        engine = "mimo" if api_key else "pollinations"
        if engine == "pollinations":
            sys.stderr.write(
                "[engine] auto -> pollinations (free, no key). "
                "Set MIMO_API_KEY for higher quality (mimo-v2.5).\n"
            )

    # Resolve images: explicit args, else stdin if not a TTY.
    raw_args = args.images
    if not raw_args and not sys.stdin.isatty():
        raw_args = ["-"]
    if not raw_args:
        sys.stderr.write(
            "error: no image given. Pass one or more IMAGE args or pipe bytes "
            "on stdin. See `ocr.py --help`.\n"
        )
        sys.exit(2)

    image_urls = [resolve_image_arg(a) for a in raw_args]

    if engine == "mimo":
        model, note = pick_model(args.model)
        if note:
            sys.stderr.write(note)
    else:
        if args.model:
            sys.stderr.write(
                "note: --model is mimo-specific; ignoring on pollinations "
                "(use --pollinations-model instead).\n"
            )
        model = args.pollinations_model

    sys.stderr.write(
        f"[ocr] engine={engine} mode={args.mode} model={model} images={len(image_urls)}\n"
    )

    messages = build_messages(
        mode=args.mode,
        image_urls=image_urls,
        lang=args.lang,
        extra_prompt=args.prompt,
    )

    body: dict[str, Any] = {
        "model": model,
        "messages": messages,
        "temperature": args.temperature,
        "stream": args.stream,
    }
    if engine == "mimo":
        # MiMo's quirk: max_completion_tokens, not max_tokens.
        body["max_completion_tokens"] = args.max_tokens
        url = args.base_url.rstrip("/") + "/chat/completions"
        auth = api_key
    else:
        body["max_tokens"] = args.max_tokens
        url = POLLINATIONS_URL
        auth = None

    resp = post(url, body, auth, args.stream, engine=engine)

    usage: dict[str, Any] = {}
    if args.stream:
        # stream_chat has already echoed the content to stdout.
        content, reasoning = stream_chat(resp)
    else:
        content, reasoning, usage = non_stream_chat(resp)
        if reasoning:
            sys.stderr.write(f"[reasoning]\n{reasoning}\n[/reasoning]\n\n")

    if not args.json:
        if not args.stream:
            print(content)
        return

    envelope = {
        "mode": args.mode,
        "model": model,
        "images": len(image_urls),
        "content": content,
        "reasoning_content": reasoning,
        "usage": usage,
    }
    if args.stream:
        # Streaming + --json: emit the envelope after the streamed body,
        # separated by a `---` line.
        sys.stdout.write("\n---\n")
    sys.stdout.write(json.dumps(envelope, ensure_ascii=False, indent=2))
    sys.stdout.write("\n")
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
if __name__ == "__main__":
|
|
445
|
+
main()
|