mimo2codex 0.1.15 → 0.1.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
  Files changed (37)
  1. package/AGENTS.md +24 -5
  2. package/README.md +46 -5
  3. package/README.zh.md +46 -5
  4. package/dist/admin/router.js +117 -2
  5. package/dist/admin/router.js.map +1 -1
  6. package/dist/cli.js +67 -147
  7. package/dist/cli.js.map +1 -1
  8. package/dist/config.js +16 -10
  9. package/dist/config.js.map +1 -1
  10. package/dist/db/logs.js +80 -0
  11. package/dist/db/logs.js.map +1 -1
  12. package/dist/providers/generic.js +96 -0
  13. package/dist/providers/generic.js.map +1 -0
  14. package/dist/providers/genericLoader.js +229 -0
  15. package/dist/providers/genericLoader.js.map +1 -0
  16. package/dist/providers/registry.js +48 -10
  17. package/dist/providers/registry.js.map +1 -1
  18. package/dist/server.js +201 -1
  19. package/dist/server.js.map +1 -1
  20. package/dist/setup/snippets.js +187 -0
  21. package/dist/setup/snippets.js.map +1 -0
  22. package/dist/translate/reqToChat.js +1 -1
  23. package/dist/translate/reqToChat.js.map +1 -1
  24. package/dist/upstream/openaiCompatClient.js +32 -11
  25. package/dist/upstream/openaiCompatClient.js.map +1 -1
  26. package/dist/web/assets/index-D19ffnSJ.css +1 -0
  27. package/dist/web/assets/index-DPLJprJ4.js +67 -0
  28. package/dist/web/index.html +2 -2
  29. package/doc/generic-providers.md +399 -0
  30. package/doc/generic-providers.zh.md +399 -0
  31. package/mimoskill/SKILL.md +69 -8
  32. package/mimoskill/references/ocr_workflow.md +216 -0
  33. package/mimoskill/scripts/generate_image.py +163 -0
  34. package/mimoskill/scripts/ocr.py +396 -0
  35. package/package.json +5 -4
  36. package/dist/web/assets/index-BoykBCnY.js +0 -67
  37. package/dist/web/assets/index-DAJbSznk.css +0 -1
@@ -0,0 +1,396 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ ocr.py — OCR / image recognition via Xiaomi MiMo V2.5 vision.
4
+
5
+ Use this when the surrounding chat model can't see images (mimo-v2.5-pro,
6
+ mimo-v2.5-pro[1m], mimo-v2-flash, or any third-party model without vision).
7
+ ocr.py always calls mimo-v2.5 internally regardless of what the rest of the
8
+ conversation is using.
9
+
10
+ Modes (--mode):
11
+ text (default) verbatim OCR — raw text, preserves line breaks
12
+ describe 2-4 sentence description of the image
13
+ structured single JSON object with text / language / regions / summary
14
+ markdown re-render the image as GitHub-flavored Markdown
15
+
16
+ Image inputs (positional, 0+):
17
+ /path/to/file.png local file → base64 data URL
18
+ https://example.com/x.png http(s) URL → forwarded as-is
19
+ data:image/...;base64,... data URL → forwarded as-is
20
+ - read one image from stdin (bytes)
21
+ (none, stdin not a TTY) same as `-`
22
+
23
+ Usage:
24
+ export MIMO_API_KEY=sk-xxxx
25
+ python3 ocr.py path/to/image.png
26
+ python3 ocr.py --mode describe https://example.com/x.png
27
+ python3 ocr.py --mode structured a.png b.jpg
28
+ cat scan.png | python3 ocr.py --mode markdown
29
+
30
+ Only depends on the standard library — no `openai` SDK install needed.
31
+ """
32
+ from __future__ import annotations
33
+
34
+ import argparse
35
+ import base64
36
+ import json
37
+ import mimetypes
38
+ import os
39
+ import sys
40
+ import urllib.error
41
+ import urllib.request
42
+ from pathlib import Path
43
+ from typing import Any
44
+
45
+
46
+ # --- modes ------------------------------------------------------------------
47
+
48
+ MODE_PROMPTS: dict[str, str] = {
49
+ "text": (
50
+ "Extract ALL legible text from the attached image(s) verbatim, "
51
+ "preserving line breaks, reading order, and any obvious column/table "
52
+ "layout using whitespace and pipes. Do not paraphrase, translate, "
53
+ "summarize, or add commentary. If you cannot read part of it, output "
54
+ "`[unreadable]` in place. If the image contains no text, output "
55
+ "exactly the single line `[no text detected]`."
56
+ ),
57
+ "describe": (
58
+ "Describe the contents of the attached image(s) in 2-4 sentences. "
59
+ "Mention layout, key visual elements, any visible text (quoted), and "
60
+ "notable colors. Do not invent details that aren't visible."
61
+ ),
62
+ "structured": (
63
+ "Return ONE JSON object with keys `text` (string, full OCR — same "
64
+ "rules as verbatim text extraction, preserve line breaks and reading "
65
+ "order), `language` (BCP-47 best-guess like \"zh-Hans\" or \"en\"), "
66
+ "`regions` (array of `{label, text, role}` where role is one of "
67
+ "`title`, `paragraph`, `list`, `table`, `caption`, `ui`, "
68
+ "`handwriting`, `other`), and `summary` (1-sentence description). "
69
+ "Output ONLY the JSON, no markdown fences, no preamble."
70
+ ),
71
+ "markdown": (
72
+ "Re-render the attached image(s) as GitHub-flavored Markdown. "
73
+ "Headings become `#`/`##`, tables become pipe tables, code-like text "
74
+ "becomes fenced code blocks, lists become `-`. Preserve reading "
75
+ "order. Output ONLY the markdown body — no preamble, no fences "
76
+ "wrapping the whole thing."
77
+ ),
78
+ }
79
+
80
+ STRUCTURED_SYSTEM = (
81
+ "You are an OCR engine. Output strictly machine-parseable JSON, "
82
+ "no markdown fences, no commentary."
83
+ )
84
+
85
+
86
+ # --- MIME sniffing ----------------------------------------------------------
87
+
88
+ _MAGIC = [
89
+ (b"\x89PNG\r\n\x1a\n", "image/png"),
90
+ (b"\xff\xd8\xff", "image/jpeg"),
91
+ (b"GIF87a", "image/gif"),
92
+ (b"GIF89a", "image/gif"),
93
+ (b"BM", "image/bmp"),
94
+ ]
95
+
96
+
97
+ def sniff_mime(data: bytes, hint_name: str | None = None) -> str:
98
+ for sig, mime in _MAGIC:
99
+ if data.startswith(sig):
100
+ return mime
101
+ # WebP: "RIFF....WEBP"
102
+ if len(data) >= 12 and data[:4] == b"RIFF" and data[8:12] == b"WEBP":
103
+ return "image/webp"
104
+ if hint_name:
105
+ guessed, _ = mimetypes.guess_type(hint_name)
106
+ if guessed and guessed.startswith("image/"):
107
+ return guessed
108
+ return "image/png"
109
+
110
+
111
+ def bytes_to_data_url(data: bytes, hint_name: str | None = None) -> str:
112
+ mime = sniff_mime(data, hint_name)
113
+ b64 = base64.b64encode(data).decode("ascii")
114
+ return f"data:{mime};base64,{b64}"
115
+
116
+
117
+ def resolve_image_arg(arg: str) -> str:
118
+ """Turn a positional IMAGE arg into a URL suitable for image_url."""
119
+ if arg == "-":
120
+ if sys.stdin.isatty():
121
+ sys.stderr.write("error: `-` requested but stdin is a TTY\n")
122
+ sys.exit(2)
123
+ data = sys.stdin.buffer.read()
124
+ if not data:
125
+ sys.stderr.write("error: stdin was empty\n")
126
+ sys.exit(2)
127
+ return bytes_to_data_url(data)
128
+ if arg.startswith(("http://", "https://", "data:")):
129
+ return arg
130
+ path = Path(arg)
131
+ if not path.exists():
132
+ sys.stderr.write(f"error: image not found: {arg}\n")
133
+ sys.exit(4)
134
+ try:
135
+ data = path.read_bytes()
136
+ except OSError as e:
137
+ sys.stderr.write(f"error: cannot read {arg}: {e}\n")
138
+ sys.exit(4)
139
+ return bytes_to_data_url(data, hint_name=path.name)
140
+
141
+
142
+ # --- model auto-select ------------------------------------------------------
143
+
144
+ def model_supports_images(model: str) -> bool:
145
+ """Mirror src/translate/reqToChat.ts:modelSupportsImages."""
146
+ base = model.split("[", 1)[0].lower()
147
+ if "omni" in base:
148
+ return True
149
+ if base == "mimo-v2.5":
150
+ return True
151
+ return False
152
+
153
+
154
+ def pick_model(cli_model: str | None) -> tuple[str, str | None]:
155
+ """Returns (chosen_model, note_for_stderr_or_None)."""
156
+ if cli_model:
157
+ if model_supports_images(cli_model):
158
+ return cli_model, None
159
+ return "mimo-v2.5", (
160
+ f"note: model '{cli_model}' does not see images; "
161
+ f"switching to mimo-v2.5 for this call.\n"
162
+ )
163
+ env_ocr = os.environ.get("MIMO_OCR_MODEL")
164
+ if env_ocr and model_supports_images(env_ocr):
165
+ return env_ocr, None
166
+ env_chat = os.environ.get("MIMO_MODEL")
167
+ if env_chat and model_supports_images(env_chat):
168
+ return env_chat, None
169
+ return "mimo-v2.5", None
170
+
171
+
172
+ # --- message building -------------------------------------------------------
173
+
174
+ def build_messages(
175
+ *, mode: str, image_urls: list[str], lang: str | None, extra_prompt: str | None
176
+ ) -> list[dict[str, Any]]:
177
+ user_text = MODE_PROMPTS[mode]
178
+ if lang:
179
+ user_text += f" Primary language: {lang}."
180
+ if extra_prompt:
181
+ user_text += f" {extra_prompt}"
182
+
183
+ content: list[dict[str, Any]] = [
184
+ {"type": "image_url", "image_url": {"url": u}} for u in image_urls
185
+ ]
186
+ content.append({"type": "text", "text": user_text})
187
+
188
+ messages: list[dict[str, Any]] = []
189
+ if mode == "structured":
190
+ messages.append({"role": "system", "content": STRUCTURED_SYSTEM})
191
+ messages.append({"role": "user", "content": content})
192
+ return messages
193
+
194
+
195
+ # --- HTTP -------------------------------------------------------------------
196
+
197
+ def post(url: str, body: dict[str, Any], api_key: str, stream: bool) -> Any:
198
+ req = urllib.request.Request(
199
+ url,
200
+ method="POST",
201
+ data=json.dumps(body).encode("utf-8"),
202
+ headers={
203
+ "Content-Type": "application/json",
204
+ "Accept": "text/event-stream" if stream else "application/json",
205
+ "Authorization": f"Bearer {api_key}",
206
+ "User-Agent": "mimoskill-ocr/0.1",
207
+ },
208
+ )
209
+ try:
210
+ return urllib.request.urlopen(req, timeout=300)
211
+ except urllib.error.HTTPError as e:
212
+ snippet = e.read().decode("utf-8", "replace")
213
+ sys.stderr.write(f"MiMo returned HTTP {e.code}: {snippet}\n")
214
+ sys.exit(1)
215
+ except urllib.error.URLError as e:
216
+ sys.stderr.write(f"connection failed: {e}\n")
217
+ sys.exit(1)
218
+
219
+
220
+ def stream_chat(resp: Any) -> tuple[str, str]:
221
+ """Stream SSE chunks; returns (full_content, full_reasoning)."""
222
+ buf_content: list[str] = []
223
+ buf_reasoning: list[str] = []
224
+ for raw in resp:
225
+ line = raw.decode("utf-8", "replace").strip()
226
+ if not line.startswith("data:"):
227
+ continue
228
+ data = line[5:].strip()
229
+ if data == "[DONE]":
230
+ break
231
+ try:
232
+ chunk = json.loads(data)
233
+ except json.JSONDecodeError:
234
+ continue
235
+ choice = chunk.get("choices", [{}])[0]
236
+ delta = choice.get("delta", {})
237
+ if r := delta.get("reasoning_content"):
238
+ buf_reasoning.append(r)
239
+ sys.stderr.write(r)
240
+ sys.stderr.flush()
241
+ if c := delta.get("content"):
242
+ buf_content.append(c)
243
+ sys.stdout.write(c)
244
+ sys.stdout.flush()
245
+ sys.stdout.write("\n")
246
+ return "".join(buf_content), "".join(buf_reasoning)
247
+
248
+
249
+ def non_stream_chat(resp: Any) -> tuple[str, str, dict[str, Any]]:
250
+ """Returns (content, reasoning_content, usage)."""
251
+ payload = json.loads(resp.read().decode("utf-8"))
252
+ msg = payload["choices"][0]["message"]
253
+ return (
254
+ msg.get("content") or "",
255
+ msg.get("reasoning_content") or "",
256
+ payload.get("usage") or {},
257
+ )
258
+
259
+
260
+ # --- CLI --------------------------------------------------------------------
261
+
262
+ def main() -> None:
263
+ p = argparse.ArgumentParser(
264
+ description=__doc__.split("\n", 1)[0],
265
+ formatter_class=argparse.RawDescriptionHelpFormatter,
266
+ )
267
+ p.add_argument(
268
+ "images",
269
+ nargs="*",
270
+ metavar="IMAGE",
271
+ help="image: local path, http(s) URL, data: URL, or `-` for stdin",
272
+ )
273
+ p.add_argument(
274
+ "--mode",
275
+ choices=list(MODE_PROMPTS),
276
+ default="text",
277
+ help="output mode (default: text)",
278
+ )
279
+ p.add_argument(
280
+ "--model",
281
+ default=None,
282
+ help="MiMo vision model (default: $MIMO_OCR_MODEL / $MIMO_MODEL if "
283
+ "vision-capable / mimo-v2.5). Non-vision models are auto-switched.",
284
+ )
285
+ p.add_argument(
286
+ "--lang",
287
+ default=None,
288
+ help="primary language hint, e.g. 'Chinese', 'zh', '日本語'",
289
+ )
290
+ p.add_argument("--max-tokens", type=int, default=4096)
291
+ p.add_argument("--temperature", type=float, default=0.2)
292
+ p.add_argument(
293
+ "--base-url",
294
+ default=os.environ.get("MIMO_BASE_URL", "https://api.xiaomimimo.com/v1"),
295
+ help="MiMo OpenAI-compat endpoint (default: %(default)s)",
296
+ )
297
+ p.add_argument(
298
+ "--prompt",
299
+ default=None,
300
+ help="extra instruction appended to the mode prompt",
301
+ )
302
+ p.add_argument("--json", action="store_true", help="wrap stdout as JSON envelope")
303
+ p.add_argument("--stream", action="store_true", help="stream the response")
304
+ args = p.parse_args()
305
+
306
+ api_key = os.environ.get("MIMO_API_KEY")
307
+ if not api_key:
308
+ sys.stderr.write(
309
+ "error: MIMO_API_KEY is not set; ocr.py needs MiMo V2.5 vision to read images.\n"
310
+ " set one at https://platform.xiaomimimo.com/#/console/api-keys\n"
311
+ " OR if you want fully-local OCR with no API key, install tesseract:\n"
312
+ " macOS: brew install tesseract tesseract-lang\n"
313
+ " Ubuntu: sudo apt install tesseract-ocr tesseract-ocr-chi-sim\n"
314
+ " Windows: https://github.com/UB-Mannheim/tesseract/wiki\n"
315
+ " then run: tesseract <image> - -l eng+chi_sim\n"
316
+ " (tesseract is NOT installed or invoked by this skill; this is just a pointer.)\n"
317
+ )
318
+ sys.exit(3)
319
+
320
+ # Resolve images: explicit args, else stdin if not a TTY.
321
+ raw_args = args.images
322
+ if not raw_args and not sys.stdin.isatty():
323
+ raw_args = ["-"]
324
+ if not raw_args:
325
+ sys.stderr.write(
326
+ "error: no image given. Pass one or more IMAGE args or pipe bytes "
327
+ "on stdin. See `ocr.py --help`.\n"
328
+ )
329
+ sys.exit(2)
330
+
331
+ image_urls = [resolve_image_arg(a) for a in raw_args]
332
+
333
+ model, note = pick_model(args.model)
334
+ if note:
335
+ sys.stderr.write(note)
336
+
337
+ sys.stderr.write(
338
+ f"[ocr] mode={args.mode} model={model} images={len(image_urls)}\n"
339
+ )
340
+
341
+ messages = build_messages(
342
+ mode=args.mode,
343
+ image_urls=image_urls,
344
+ lang=args.lang,
345
+ extra_prompt=args.prompt,
346
+ )
347
+
348
+ body: dict[str, Any] = {
349
+ "model": model,
350
+ "messages": messages,
351
+ "max_completion_tokens": args.max_tokens,
352
+ "temperature": args.temperature,
353
+ "stream": args.stream,
354
+ }
355
+
356
+ url = args.base_url.rstrip("/") + "/chat/completions"
357
+ resp = post(url, body, api_key, args.stream)
358
+
359
+ if args.stream:
360
+ content, reasoning = stream_chat(resp)
361
+ usage: dict[str, Any] = {}
362
+ else:
363
+ content, reasoning, usage = non_stream_chat(resp)
364
+ if reasoning:
365
+ sys.stderr.write(f"[reasoning]\n{reasoning}\n[/reasoning]\n\n")
366
+ if args.json:
367
+ envelope = {
368
+ "mode": args.mode,
369
+ "model": model,
370
+ "images": len(image_urls),
371
+ "content": content,
372
+ "reasoning_content": reasoning,
373
+ "usage": usage,
374
+ }
375
+ print(json.dumps(envelope, ensure_ascii=False, indent=2))
376
+ else:
377
+ print(content)
378
+ return
379
+
380
+ # Streaming + --json: emit envelope after the streamed body.
381
+ if args.json:
382
+ envelope = {
383
+ "mode": args.mode,
384
+ "model": model,
385
+ "images": len(image_urls),
386
+ "content": content,
387
+ "reasoning_content": reasoning,
388
+ "usage": {},
389
+ }
390
+ sys.stdout.write("\n---\n")
391
+ sys.stdout.write(json.dumps(envelope, ensure_ascii=False, indent=2))
392
+ sys.stdout.write("\n")
393
+
394
+
395
+ if __name__ == "__main__":
396
+ main()
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mimo2codex",
3
- "version": "0.1.15",
3
+ "version": "0.1.16",
4
4
  "description": "Local proxy that lets the latest OpenAI Codex CLI / desktop talk to Xiaomi MiMo (V2.5 Pro) via the Responses API by translating to Chat Completions on the fly.",
5
5
  "keywords": [
6
6
  "codex",
@@ -26,6 +26,7 @@
26
26
  "files": [
27
27
  "dist",
28
28
  "mimoskill",
29
+ "doc",
29
30
  "AGENTS.md",
30
31
  "README.md",
31
32
  "README.zh.md",
@@ -46,9 +47,9 @@
46
47
  "test:watch": "vitest",
47
48
  "prepack": "npm run build:all",
48
49
  "prepublishOnly": "npm run build:all && npm test",
49
- "release:patch": "npm version patch && git push --follow-tags",
50
- "release:minor": "npm version minor && git push --follow-tags",
51
- "release:major": "npm version major && git push --follow-tags"
50
+ "release:patch": "node scripts/release.mjs patch",
51
+ "release:minor": "node scripts/release.mjs minor",
52
+ "release:major": "node scripts/release.mjs major"
52
53
  },
53
54
  "dependencies": {
54
55
  "better-sqlite3": "^12.9.0",