mimo2codex 0.1.15 → 0.1.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/AGENTS.md +24 -5
  2. package/README.md +70 -6
  3. package/README.zh.md +69 -6
  4. package/dist/admin/router.js +117 -2
  5. package/dist/admin/router.js.map +1 -1
  6. package/dist/cli.js +67 -147
  7. package/dist/cli.js.map +1 -1
  8. package/dist/config.js +16 -10
  9. package/dist/config.js.map +1 -1
  10. package/dist/db/logs.js +80 -0
  11. package/dist/db/logs.js.map +1 -1
  12. package/dist/providers/generic.js +96 -0
  13. package/dist/providers/generic.js.map +1 -0
  14. package/dist/providers/genericLoader.js +229 -0
  15. package/dist/providers/genericLoader.js.map +1 -0
  16. package/dist/providers/registry.js +48 -10
  17. package/dist/providers/registry.js.map +1 -1
  18. package/dist/server.js +201 -1
  19. package/dist/server.js.map +1 -1
  20. package/dist/setup/snippets.js +187 -0
  21. package/dist/setup/snippets.js.map +1 -0
  22. package/dist/translate/reqToChat.js +42 -2
  23. package/dist/translate/reqToChat.js.map +1 -1
  24. package/dist/upstream/openaiCompatClient.js +32 -11
  25. package/dist/upstream/openaiCompatClient.js.map +1 -1
  26. package/dist/web/assets/index-D19ffnSJ.css +1 -0
  27. package/dist/web/assets/index-DPLJprJ4.js +67 -0
  28. package/dist/web/index.html +2 -2
  29. package/doc/generic-providers.md +399 -0
  30. package/doc/generic-providers.zh.md +399 -0
  31. package/doc/mimoskill.md +295 -0
  32. package/doc/mimoskill.zh.md +295 -0
  33. package/mimoskill/SKILL.md +80 -13
  34. package/mimoskill/references/ocr_workflow.md +240 -0
  35. package/mimoskill/scripts/generate_image.py +163 -0
  36. package/mimoskill/scripts/mimo_chat.py +111 -42
  37. package/mimoskill/scripts/ocr.py +445 -0
  38. package/package.json +5 -4
  39. package/dist/web/assets/index-BoykBCnY.js +0 -67
  40. package/dist/web/assets/index-DAJbSznk.css +0 -1
@@ -1,21 +1,29 @@
1
1
  #!/usr/bin/env python3
2
2
  """
3
- mimo_chat.py — single-shot or streaming chat with Xiaomi MiMo V2.5.
3
+ mimo_chat.py — single-shot or streaming chat. Works WITHOUT any API key.
4
4
 
5
- Hits MiMo's OpenAI-compatible /v1/chat/completions endpoint directly. Handles
6
- the MiMo-specific quirks:
5
+ Engines (--engine):
6
+ auto (default) mimo if MIMO_API_KEY set, else pollinations
7
+ mimo — Xiaomi MiMo V2.5 (best quality, needs MIMO_API_KEY)
8
+ pollinations — pollinations.ai free public chat endpoint. NO KEY REQUIRED
7
9
 
10
+ When the mimo engine is used, handles the MiMo-specific quirks:
8
11
  - max_completion_tokens (not max_tokens)
9
12
  - vision via mimo-v2.5 / mimo-v2-omni (and the required text part next to
10
13
  image_url, otherwise MiMo 400s with "text is not set")
11
- - web_search builtin tool (requires Web Search Plugin activated in console)
14
+ - web_search builtin: auto-enabled on pay-as-you-go (sk-*) keys, skipped on
15
+ token-plan (tp-*) keys. Model decides when to invoke (tool_choice: auto).
16
+ Requires the Web Search Plugin to be activated in the MiMo console.
12
17
  - reasoning_content extraction
13
18
 
14
19
  Usage:
15
- export MIMO_API_KEY=sk-xxxx
20
+ # Zero-setup
16
21
  python3 mimo_chat.py "your prompt"
17
- python3 mimo_chat.py --model mimo-v2.5 --image https://x/y.png "describe"
18
- python3 mimo_chat.py --search "今天上海天气?"
22
+ python3 mimo_chat.py --image https://x/y.png "describe"
23
+
24
+ # MiMo key — gets best quality + native web search (when sk-*)
25
+ export MIMO_API_KEY=sk-xxxx
26
+ python3 mimo_chat.py "今天上海天气?"
19
27
  python3 mimo_chat.py --stream "tell me a story"
20
28
 
21
29
  Only depends on the standard library — no `openai` SDK install needed.
@@ -48,51 +56,64 @@ def build_messages(prompt: str, image: str | None) -> list[dict[str, Any]]:
48
56
  ]
49
57
 
50
58
 
59
+ POLLINATIONS_URL = "https://text.pollinations.ai/openai"
60
+ POLLINATIONS_DEFAULT_MODEL = "openai" # vision-capable, free, no key
61
+
62
+
51
63
def build_body(
    *,
    prompt: str,
    image: str | None,
    model: str,
    stream: bool,
    enable_web_search: bool,
    max_tokens: int,
    temperature: float,
    engine: str,
) -> dict[str, Any]:
    """Assemble the /chat/completions request payload for the chosen engine."""
    payload: dict[str, Any] = {
        "model": model,
        "messages": build_messages(prompt, image),
        "temperature": temperature,
        "stream": stream,
    }
    # MiMo's quirk: it wants max_completion_tokens instead of max_tokens.
    token_key = "max_completion_tokens" if engine == "mimo" else "max_tokens"
    payload[token_key] = max_tokens
    if enable_web_search:
        # MiMo native web_search builtin. The model decides whether to invoke
        # it (tool_choice=auto). Requires the Web Search Plugin to be
        # activated at https://platform.xiaomimimo.com/#/console/plugin —
        # without that, MiMo returns 400 and the error body is printed.
        payload["tools"] = [{"type": "web_search"}]
        payload["tool_choice"] = "auto"
    return payload
74
93
 
75
94
 
76
def post(url: str, body: dict[str, Any], api_key: str | None, stream: bool, *, engine: str) -> Any:
    """POST *body* as JSON to *url* and return the open HTTP response.

    Sends a Bearer token only when *api_key* is given (pollinations needs
    none). On an HTTP error the upstream error body is echoed to stderr and
    the process exits with status 1; connection failures exit likewise.
    """
    accept = "text/event-stream" if stream else "application/json"
    headers = {
        "Content-Type": "application/json",
        "Accept": accept,
        "User-Agent": "mimoskill/0.1",
    }
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"
    request = urllib.request.Request(
        url,
        method="POST",
        data=json.dumps(body).encode("utf-8"),
        headers=headers,
    )
    try:
        return urllib.request.urlopen(request, timeout=300)
    except urllib.error.HTTPError as e:
        snippet = e.read().decode("utf-8", "replace")
        sys.stderr.write(f"{engine} returned HTTP {e.code}: {snippet}\n")
        sys.exit(1)
    except urllib.error.URLError as e:
        sys.stderr.write(f"connection failed ({engine}): {e}\n")
        sys.exit(1)
97
118
 
98
119
 
@@ -144,51 +165,99 @@ def main() -> None:
144
165
  p.add_argument("prompt", nargs="?", default="", help="user message text")
145
166
  p.add_argument("--model", default=os.environ.get("MIMO_MODEL", "mimo-v2.5-pro"))
146
167
  p.add_argument("--image", help="image URL to attach (forces vision-capable model)")
147
- p.add_argument("--search", action="store_true", help="enable MiMo web_search builtin")
148
168
  p.add_argument("--stream", action="store_true", help="stream the response")
149
169
  p.add_argument("--max-tokens", type=int, default=2048)
150
170
  p.add_argument("--temperature", type=float, default=0.7)
171
+ p.add_argument(
172
+ "--engine",
173
+ choices=["auto", "mimo", "pollinations"],
174
+ default=os.environ.get("MIMO_CHAT_ENGINE", "auto"),
175
+ help="chat backend. auto = mimo if MIMO_API_KEY set, else pollinations "
176
+ "(free, no key required). default: %(default)s",
177
+ )
151
178
  p.add_argument(
152
179
  "--base-url",
153
180
  default=os.environ.get("MIMO_BASE_URL", "https://api.xiaomimimo.com/v1"),
154
- help="set to https://token-plan-cn.xiaomimimo.com/v1 for tp-* keys",
181
+ help="MiMo endpoint, ignored when --engine=pollinations "
182
+ "(tp-* keys use https://token-plan-cn.xiaomimimo.com/v1)",
183
+ )
184
+ p.add_argument(
185
+ "--pollinations-model",
186
+ default=os.environ.get("POLLINATIONS_MODEL", POLLINATIONS_DEFAULT_MODEL),
187
+ help="model id when --engine=pollinations (default: %(default)s)",
155
188
  )
156
189
  args = p.parse_args()
157
190
 
158
191
  api_key = os.environ.get("MIMO_API_KEY")
159
- if not api_key:
160
- sys.stderr.write("error: MIMO_API_KEY not set in environment\n")
161
- sys.stderr.write(
162
- " get one at https://platform.xiaomimimo.com/#/console/api-keys\n"
163
- )
164
- sys.exit(2)
192
+
193
+ # Resolve engine.
194
+ if args.engine == "mimo":
195
+ engine = "mimo"
196
+ if not api_key:
197
+ sys.stderr.write(
198
+ "error: --engine mimo requires MIMO_API_KEY.\n"
199
+ " get one at https://platform.xiaomimimo.com/#/console/api-keys\n"
200
+ " OR drop the flag to fall back to pollinations (free, no key required):\n"
201
+ " python3 mimo_chat.py <prompt>\n"
202
+ )
203
+ sys.exit(3)
204
+ elif args.engine == "pollinations":
205
+ engine = "pollinations"
206
+ else: # auto
207
+ engine = "mimo" if api_key else "pollinations"
208
+ if engine == "pollinations":
209
+ sys.stderr.write(
210
+ "[engine] auto -> pollinations (free, no key). "
211
+ "Set MIMO_API_KEY for higher quality (mimo-v2.5).\n"
212
+ )
165
213
 
166
214
  if not args.prompt and not args.image:
167
215
  sys.stderr.write("error: pass a prompt and/or --image\n")
168
216
  sys.exit(2)
169
217
 
170
- # Auto-bump to a vision model if user passed --image with a non-vision model
171
- model = args.model
172
- if args.image and "omni" not in model.lower() and not model.startswith("mimo-v2.5["):
173
- if model != "mimo-v2.5":
174
- sys.stderr.write(
175
- f"note: --image given but model is '{model}' which doesn't see images.\n"
176
- f" switching to mimo-v2.5 for this call.\n"
177
- )
178
- model = "mimo-v2.5"
218
+ enable_web_search = False
219
+ if engine == "mimo":
220
+ # Auto-bump to a vision model if user passed --image with a non-vision model.
221
+ model = args.model
222
+ if args.image and "omni" not in model.lower() and not model.startswith("mimo-v2.5["):
223
+ if model != "mimo-v2.5":
224
+ sys.stderr.write(
225
+ f"note: --image given but model is '{model}' which doesn't see images.\n"
226
+ f" switching to mimo-v2.5 for this call.\n"
227
+ )
228
+ model = "mimo-v2.5"
229
+ url = args.base_url.rstrip("/") + "/chat/completions"
230
+ auth: str | None = api_key
231
+ # MiMo native web_search: pay-as-you-go (sk-*) supports it, token-plan
232
+ # (tp-*) does not. Always include the tool on sk-* and let the model
233
+ # decide via tool_choice=auto — no extra flag needed.
234
+ enable_web_search = bool(api_key and api_key.startswith("sk-"))
235
+ else:
236
+ # Pollinations: pick the configured vision-capable model. The user's
237
+ # --model (mimo-*) is mimo-specific so we don't honor it here unless
238
+ # they explicitly passed --pollinations-model.
239
+ model = args.pollinations_model
240
+ url = POLLINATIONS_URL
241
+ auth = None
242
+
243
+ sys.stderr.write(
244
+ f"[chat] engine={engine} model={model}"
245
+ + (" web_search=on" if enable_web_search else "")
246
+ + "\n"
247
+ )
179
248
 
180
249
  body = build_body(
181
250
  prompt=args.prompt,
182
251
  image=args.image,
183
252
  model=model,
184
253
  stream=args.stream,
185
- search=args.search,
254
+ enable_web_search=enable_web_search,
186
255
  max_tokens=args.max_tokens,
187
256
  temperature=args.temperature,
257
+ engine=engine,
188
258
  )
189
259
 
190
- url = args.base_url.rstrip("/") + "/chat/completions"
191
- resp = post(url, body, api_key, args.stream)
260
+ resp = post(url, body, auth, args.stream, engine=engine)
192
261
  if args.stream:
193
262
  stream_chat(resp)
194
263
  else:
@@ -0,0 +1,445 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ ocr.py — OCR / image recognition that works without any API key.
4
+
5
+ Use this when the surrounding chat model can't see images (mimo-v2.5-pro,
6
+ mimo-v2.5-pro[1m], mimo-v2-flash, deepseek-*, or any text-only model).
7
+
8
+ Engines (--engine):
9
+ auto (default) — mimo if MIMO_API_KEY set, else pollinations
10
+ mimo — Xiaomi MiMo V2.5 vision. Highest quality. Needs MIMO_API_KEY
11
+ pollinations — pollinations.ai free public vision endpoint. NO KEY REQUIRED
12
+
13
+ Modes (--mode):
14
+ text (default) verbatim OCR — raw text, preserves line breaks
15
+ describe 2-4 sentence description of the image
16
+ structured single JSON object with text / language / regions / summary
17
+ markdown re-render the image as GitHub-flavored Markdown
18
+
19
+ Image inputs (positional, 0+):
20
+ /path/to/file.png local file → base64 data URL
21
+ https://example.com/x.png http(s) URL → forwarded as-is
22
+ data:image/...;base64,... data URL → forwarded as-is
23
+ - read one image from stdin (bytes)
24
+ (none, stdin not a TTY) same as `-`
25
+
26
+ Usage:
27
+ # Zero-setup: free fallback, works for DeepSeek-only / no-key users
28
+ python3 ocr.py path/to/image.png
29
+ python3 ocr.py --mode describe https://example.com/x.png
30
+
31
+ # Best quality (needs MiMo key)
32
+ export MIMO_API_KEY=sk-xxxx
33
+ python3 ocr.py --mode structured a.png b.jpg
34
+ cat scan.png | python3 ocr.py --mode markdown
35
+
36
+ Only depends on the standard library — no `openai` SDK install needed.
37
+ """
38
+ from __future__ import annotations
39
+
40
+ import argparse
41
+ import base64
42
+ import json
43
+ import mimetypes
44
+ import os
45
+ import sys
46
+ import urllib.error
47
+ import urllib.request
48
+ from pathlib import Path
49
+ from typing import Any
50
+
51
+
52
+ # --- modes ------------------------------------------------------------------
53
+
54
# Instruction text sent as the user message for each --mode. The dict keys
# double as the CLI choices for --mode (see main()).
MODE_PROMPTS: dict[str, str] = {
    "text": (
        "Extract ALL legible text from the attached image(s) verbatim, "
        "preserving line breaks, reading order, and any obvious column/table "
        "layout using whitespace and pipes. Do not paraphrase, translate, "
        "summarize, or add commentary. If you cannot read part of it, output "
        "`[unreadable]` in place. If the image contains no text, output "
        "exactly the single line `[no text detected]`."
    ),
    "describe": (
        "Describe the contents of the attached image(s) in 2-4 sentences. "
        "Mention layout, key visual elements, any visible text (quoted), and "
        "notable colors. Do not invent details that aren't visible."
    ),
    "structured": (
        "Return ONE JSON object with keys `text` (string, full OCR — same "
        "rules as verbatim text extraction, preserve line breaks and reading "
        "order), `language` (BCP-47 best-guess like \"zh-Hans\" or \"en\"), "
        "`regions` (array of `{label, text, role}` where role is one of "
        "`title`, `paragraph`, `list`, `table`, `caption`, `ui`, "
        "`handwriting`, `other`), and `summary` (1-sentence description). "
        "Output ONLY the JSON, no markdown fences, no preamble."
    ),
    "markdown": (
        "Re-render the attached image(s) as GitHub-flavored Markdown. "
        "Headings become `#`/`##`, tables become pipe tables, code-like text "
        "becomes fenced code blocks, lists become `-`. Preserve reading "
        "order. Output ONLY the markdown body — no preamble, no fences "
        "wrapping the whole thing."
    ),
}

# Extra system message used only for --mode structured (see build_messages),
# to keep the model from wrapping the JSON in prose or code fences.
STRUCTURED_SYSTEM = (
    "You are an OCR engine. Output strictly machine-parseable JSON, "
    "no markdown fences, no commentary."
)
90
+
91
+
92
+ # --- MIME sniffing ----------------------------------------------------------
93
+
94
# Magic-byte signatures checked before falling back to filename extension.
_MAGIC = [
    (b"\x89PNG\r\n\x1a\n", "image/png"),
    (b"\xff\xd8\xff", "image/jpeg"),
    (b"GIF87a", "image/gif"),
    (b"GIF89a", "image/gif"),
    (b"BM", "image/bmp"),
]


def sniff_mime(data: bytes, hint_name: str | None = None) -> str:
    """Best-effort MIME detection: magic bytes, then filename hint, then png."""
    hit = next((mime for sig, mime in _MAGIC if data.startswith(sig)), None)
    if hit:
        return hit
    # WebP has no simple prefix: "RIFF" + 4 length bytes + "WEBP".
    if len(data) >= 12 and data[:4] == b"RIFF" and data[8:12] == b"WEBP":
        return "image/webp"
    if hint_name:
        guessed, _ = mimetypes.guess_type(hint_name)
        if guessed and guessed.startswith("image/"):
            return guessed
    return "image/png"
115
+
116
+
117
def bytes_to_data_url(data: bytes, hint_name: str | None = None) -> str:
    """Encode raw image bytes as a data: URL with a sniffed MIME type."""
    encoded = base64.b64encode(data).decode("ascii")
    return f"data:{sniff_mime(data, hint_name)};base64,{encoded}"
121
+
122
+
123
def resolve_image_arg(arg: str) -> str:
    """Turn a positional IMAGE arg into a URL suitable for image_url.

    `-` reads raw bytes from stdin; http(s) and data: URLs pass through
    untouched; anything else is treated as a local file and inlined as a
    base64 data URL. Exits with status 2 on unusable stdin, 4 on a missing
    or unreadable file.
    """
    if arg == "-":
        if sys.stdin.isatty():
            sys.stderr.write("error: `-` requested but stdin is a TTY\n")
            sys.exit(2)
        blob = sys.stdin.buffer.read()
        if not blob:
            sys.stderr.write("error: stdin was empty\n")
            sys.exit(2)
        return bytes_to_data_url(blob)

    if arg.startswith(("http://", "https://", "data:")):
        return arg

    src = Path(arg)
    if not src.exists():
        sys.stderr.write(f"error: image not found: {arg}\n")
        sys.exit(4)
    try:
        blob = src.read_bytes()
    except OSError as e:
        sys.stderr.write(f"error: cannot read {arg}: {e}\n")
        sys.exit(4)
    return bytes_to_data_url(blob, hint_name=src.name)
146
+
147
+
148
+ # --- model auto-select ------------------------------------------------------
149
+
150
def model_supports_images(model: str) -> bool:
    """Mirror src/translate/reqToChat.ts:modelSupportsImages."""
    # Strip a trailing "[...]" variant suffix before checking the base name.
    base = model.split("[", 1)[0].lower()
    return "omni" in base or base == "mimo-v2.5"


def pick_model(cli_model: str | None) -> tuple[str, str | None]:
    """Returns (chosen_model, note_for_stderr_or_None)."""
    if cli_model:
        if not model_supports_images(cli_model):
            return "mimo-v2.5", (
                f"note: model '{cli_model}' does not see images; "
                f"switching to mimo-v2.5 for this call.\n"
            )
        return cli_model, None
    # No explicit --model: honor env overrides, but only vision-capable ones.
    for env_var in ("MIMO_OCR_MODEL", "MIMO_MODEL"):
        candidate = os.environ.get(env_var)
        if candidate and model_supports_images(candidate):
            return candidate, None
    return "mimo-v2.5", None
176
+
177
+
178
+ # --- message building -------------------------------------------------------
179
+
180
def build_messages(
    *, mode: str, image_urls: list[str], lang: str | None, extra_prompt: str | None
) -> list[dict[str, Any]]:
    """Build the chat `messages` array: image parts first, then the mode prompt."""
    prompt_text = MODE_PROMPTS[mode]
    if lang:
        prompt_text += f" Primary language: {lang}."
    if extra_prompt:
        prompt_text += f" {extra_prompt}"

    # MiMo requires a text part alongside image_url parts, so the prompt text
    # always goes in, after every image.
    parts: list[dict[str, Any]] = [
        {"type": "image_url", "image_url": {"url": u}} for u in image_urls
    ]
    parts.append({"type": "text", "text": prompt_text})

    out: list[dict[str, Any]] = []
    if mode == "structured":
        out.append({"role": "system", "content": STRUCTURED_SYSTEM})
    out.append({"role": "user", "content": parts})
    return out
199
+
200
+
201
+ # --- HTTP -------------------------------------------------------------------
202
+
203
POLLINATIONS_URL = "https://text.pollinations.ai/openai"
POLLINATIONS_DEFAULT_MODEL = "openai"  # vision-capable, free, no key


def post(url: str, body: dict[str, Any], api_key: str | None, stream: bool, *, engine: str) -> Any:
    """POST *body* as JSON to *url* and return the open HTTP response.

    Sends a Bearer token only when *api_key* is given (pollinations needs
    none). On HTTP errors the upstream error body is echoed to stderr and
    the process exits with status 1; connection failures exit likewise.
    """
    accept = "text/event-stream" if stream else "application/json"
    headers = {
        "Content-Type": "application/json",
        "Accept": accept,
        "User-Agent": "mimoskill-ocr/0.1",
    }
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"
    request = urllib.request.Request(
        url,
        method="POST",
        data=json.dumps(body).encode("utf-8"),
        headers=headers,
    )
    try:
        return urllib.request.urlopen(request, timeout=300)
    except urllib.error.HTTPError as e:
        snippet = e.read().decode("utf-8", "replace")
        sys.stderr.write(f"{engine} returned HTTP {e.code}: {snippet}\n")
        sys.exit(1)
    except urllib.error.URLError as e:
        sys.stderr.write(f"connection failed ({engine}): {e}\n")
        sys.exit(1)
230
+
231
+
232
def stream_chat(resp: Any) -> tuple[str, str]:
    """Stream SSE chunks; returns (full_content, full_reasoning).

    Content deltas are echoed to stdout and reasoning deltas to stderr as
    they arrive, both flushed per chunk. Iteration stops at the `[DONE]`
    sentinel; non-`data:` lines and malformed JSON chunks are skipped.
    """
    buf_content: list[str] = []
    buf_reasoning: list[str] = []
    for raw in resp:
        line = raw.decode("utf-8", "replace").strip()
        if not line.startswith("data:"):
            continue
        data = line[5:].strip()
        if data == "[DONE]":
            break
        try:
            chunk = json.loads(data)
        except json.JSONDecodeError:
            continue
        # Some OpenAI-compatible backends emit usage-only / keep-alive chunks
        # with "choices": [] — `.get("choices", [{}])[0]` would IndexError on
        # those, so fall back to a dummy choice whenever the list is empty.
        choices = chunk.get("choices") or [{}]
        delta = choices[0].get("delta", {})
        if r := delta.get("reasoning_content"):
            buf_reasoning.append(r)
            sys.stderr.write(r)
            sys.stderr.flush()
        if c := delta.get("content"):
            buf_content.append(c)
            sys.stdout.write(c)
            sys.stdout.flush()
    sys.stdout.write("\n")
    return "".join(buf_content), "".join(buf_reasoning)
259
+
260
+
261
def non_stream_chat(resp: Any) -> tuple[str, str, dict[str, Any]]:
    """Returns (content, reasoning_content, usage) from a non-streaming reply."""
    payload = json.loads(resp.read().decode("utf-8"))
    message = payload["choices"][0]["message"]
    # Any of these can be null in the JSON; normalize to "" / {}.
    content = message.get("content") or ""
    reasoning = message.get("reasoning_content") or ""
    usage = payload.get("usage") or {}
    return content, reasoning, usage
270
+
271
+
272
+ # --- CLI --------------------------------------------------------------------
273
+
274
def main() -> None:
    """CLI entry point: parse args, resolve engine/images/model, call the API."""
    p = argparse.ArgumentParser(
        # The module docstring starts with a newline, so split("\n", 1)[0]
        # alone would always yield "" — strip first to get the real summary.
        description=(__doc__ or "").strip().split("\n", 1)[0],
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument(
        "images",
        nargs="*",
        metavar="IMAGE",
        help="image: local path, http(s) URL, data: URL, or `-` for stdin",
    )
    p.add_argument(
        "--mode",
        choices=list(MODE_PROMPTS),
        default="text",
        help="output mode (default: text)",
    )
    p.add_argument(
        "--model",
        default=None,
        help="MiMo vision model (default: $MIMO_OCR_MODEL / $MIMO_MODEL if "
        "vision-capable / mimo-v2.5). Non-vision models are auto-switched.",
    )
    p.add_argument(
        "--lang",
        default=None,
        help="primary language hint, e.g. 'Chinese', 'zh', '日本語'",
    )
    p.add_argument("--max-tokens", type=int, default=4096)
    p.add_argument("--temperature", type=float, default=0.2)
    p.add_argument(
        "--engine",
        choices=["auto", "mimo", "pollinations"],
        default=os.environ.get("MIMO_OCR_ENGINE", "auto"),
        help="OCR backend. auto = mimo if MIMO_API_KEY set, else pollinations "
        "(free, no key required). default: %(default)s",
    )
    p.add_argument(
        "--base-url",
        default=os.environ.get("MIMO_BASE_URL", "https://api.xiaomimimo.com/v1"),
        help="MiMo OpenAI-compat endpoint, ignored when --engine=pollinations "
        "(default: %(default)s)",
    )
    p.add_argument(
        "--pollinations-model",
        default=os.environ.get("POLLINATIONS_MODEL", POLLINATIONS_DEFAULT_MODEL),
        help="model id when --engine=pollinations (default: %(default)s)",
    )
    p.add_argument(
        "--prompt",
        default=None,
        help="extra instruction appended to the mode prompt",
    )
    p.add_argument("--json", action="store_true", help="wrap stdout as JSON envelope")
    p.add_argument("--stream", action="store_true", help="stream the response")
    args = p.parse_args()

    api_key = os.environ.get("MIMO_API_KEY")

    # Resolve engine: explicit flag wins; auto prefers mimo when a key exists.
    if args.engine == "mimo":
        engine = "mimo"
        if not api_key:
            sys.stderr.write(
                "error: --engine mimo requires MIMO_API_KEY.\n"
                " set one at https://platform.xiaomimimo.com/#/console/api-keys\n"
                " OR drop the flag to fall back to pollinations (free, no key required):\n"
                " python3 ocr.py <image>\n"
            )
            sys.exit(3)
    elif args.engine == "pollinations":
        engine = "pollinations"
    else:  # auto
        engine = "mimo" if api_key else "pollinations"
        if engine == "pollinations":
            sys.stderr.write(
                "[engine] auto -> pollinations (free, no key). "
                "Set MIMO_API_KEY for higher quality (mimo-v2.5).\n"
            )

    # Resolve images: explicit args, else stdin if not a TTY.
    raw_args = args.images
    if not raw_args and not sys.stdin.isatty():
        raw_args = ["-"]
    if not raw_args:
        sys.stderr.write(
            "error: no image given. Pass one or more IMAGE args or pipe bytes "
            "on stdin. See `ocr.py --help`.\n"
        )
        sys.exit(2)

    image_urls = [resolve_image_arg(a) for a in raw_args]

    if engine == "mimo":
        model, note = pick_model(args.model)
        if note:
            sys.stderr.write(note)
    else:
        if args.model:
            sys.stderr.write(
                "note: --model is mimo-specific; ignoring on pollinations "
                "(use --pollinations-model instead).\n"
            )
        model = args.pollinations_model

    sys.stderr.write(
        f"[ocr] engine={engine} mode={args.mode} model={model} images={len(image_urls)}\n"
    )

    messages = build_messages(
        mode=args.mode,
        image_urls=image_urls,
        lang=args.lang,
        extra_prompt=args.prompt,
    )

    body: dict[str, Any] = {
        "model": model,
        "messages": messages,
        "temperature": args.temperature,
        "stream": args.stream,
    }
    if engine == "mimo":
        # MiMo's quirk: max_completion_tokens, not max_tokens.
        body["max_completion_tokens"] = args.max_tokens
        url = args.base_url.rstrip("/") + "/chat/completions"
        auth = api_key
    else:
        body["max_tokens"] = args.max_tokens
        url = POLLINATIONS_URL
        auth = None

    resp = post(url, body, auth, args.stream, engine=engine)

    if args.stream:
        content, reasoning = stream_chat(resp)
        # Streaming responses carry no usage block we collect here.
        usage: dict[str, Any] = {}
    else:
        content, reasoning, usage = non_stream_chat(resp)
        if reasoning:
            sys.stderr.write(f"[reasoning]\n{reasoning}\n[/reasoning]\n\n")
        if args.json:
            envelope = {
                "mode": args.mode,
                "model": model,
                "images": len(image_urls),
                "content": content,
                "reasoning_content": reasoning,
                "usage": usage,
            }
            print(json.dumps(envelope, ensure_ascii=False, indent=2))
        else:
            print(content)
        return

    # Streaming + --json: emit envelope after the streamed body.
    if args.json:
        envelope = {
            "mode": args.mode,
            "model": model,
            "images": len(image_urls),
            "content": content,
            "reasoning_content": reasoning,
            "usage": usage,
        }
        sys.stdout.write("\n---\n")
        sys.stdout.write(json.dumps(envelope, ensure_ascii=False, indent=2))
        sys.stdout.write("\n")


if __name__ == "__main__":
    main()