@xiaotianxt/skills 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/EXCLUDED.md +42 -0
- package/LICENSE +21 -0
- package/README.md +165 -0
- package/SECURITY.md +23 -0
- package/SOURCES.md +45 -0
- package/bin/skills.mjs +241 -0
- package/package.json +38 -0
- package/skills/1password/SKILL.md +94 -0
- package/skills/1password/agents/openai.yaml +4 -0
- package/skills/1password/references/item-management.md +80 -0
- package/skills/1password/references/op-cli.md +107 -0
- package/skills/apple-calendar-event/SKILL.md +81 -0
- package/skills/apple-calendar-event/agents/openai.yaml +4 -0
- package/skills/apple-calendar-event/scripts/calendar_audit.py +201 -0
- package/skills/apple-calendar-event/scripts/calendar_event.py +164 -0
- package/skills/bro-browser/SKILL.md +118 -0
- package/skills/bro-browser/agents/openai.yaml +4 -0
- package/skills/bro-browser/references/tool-map.md +102 -0
- package/skills/bro-browser/references/workflows.md +146 -0
- package/skills/bro-browser/scripts/bro-call.mjs +189 -0
- package/skills/calendar/SKILL.md +182 -0
- package/skills/calendar/agents/openai.yaml +4 -0
- package/skills/calendar/references/operations.md +255 -0
- package/skills/calendar/scripts/calendar_list_review.py +157 -0
- package/skills/calendar/scripts/event_dedupe_preview.py +155 -0
- package/skills/canvas/SKILL.md +70 -0
- package/skills/canvas/agents/openai.yaml +4 -0
- package/skills/canvas/references/canvas-api.md +76 -0
- package/skills/course-exam-review-planner/SKILL.md +127 -0
- package/skills/cx/SKILL.md +25 -0
- package/skills/gh-fix-ci/LICENSE.txt +201 -0
- package/skills/gh-fix-ci/SKILL.md +81 -0
- package/skills/gh-fix-ci/agents/openai.yaml +6 -0
- package/skills/gh-fix-ci/assets/github-small.svg +3 -0
- package/skills/gh-fix-ci/assets/github.png +0 -0
- package/skills/gh-fix-ci/scripts/inspect_pr_checks.py +509 -0
- package/skills/gh-review-workflow/SKILL.md +61 -0
- package/skills/gh-review-workflow/agents/openai.yaml +4 -0
- package/skills/gh-review-workflow/references/workflow.md +48 -0
- package/skills/gh-review-workflow/scripts/fetch_review_state.py +222 -0
- package/skills/gh-review-workflow/scripts/resolve_review_threads.py +83 -0
- package/skills/github/SKILL.md +74 -0
- package/skills/github/agents/openai.yaml +6 -0
- package/skills/github/assets/github-small.svg +3 -0
- package/skills/github/assets/github.png +0 -0
- package/skills/gws-calendar/SKILL.md +126 -0
- package/skills/gws-calendar-agenda/SKILL.md +52 -0
- package/skills/gws-calendar-insert/SKILL.md +66 -0
- package/skills/gws-docs/SKILL.md +48 -0
- package/skills/gws-docs-write/SKILL.md +49 -0
- package/skills/gws-drive/SKILL.md +137 -0
- package/skills/gws-drive-upload/SKILL.md +52 -0
- package/skills/gws-gmail/SKILL.md +62 -0
- package/skills/gws-gmail-forward/SKILL.md +55 -0
- package/skills/gws-gmail-reply/SKILL.md +58 -0
- package/skills/gws-gmail-reply-all/SKILL.md +62 -0
- package/skills/gws-gmail-send/SKILL.md +57 -0
- package/skills/gws-gmail-triage/SKILL.md +50 -0
- package/skills/gws-gmail-watch/SKILL.md +58 -0
- package/skills/gws-shared/SKILL.md +27 -0
- package/skills/helium-browser-mcp/SKILL.md +137 -0
- package/skills/helium-browser-mcp/agents/openai.yaml +4 -0
- package/skills/helium-browser-mcp/scripts/obmcp.mjs +92 -0
- package/skills/helium-browser-mcp/scripts/openbrowsermcp-stdio-proxy.mjs +170 -0
- package/skills/learn/SKILL.md +122 -0
- package/skills/learn/agents/openai.yaml +7 -0
- package/skills/learn/assets/AGENTS.template.md +33 -0
- package/skills/learn/assets/errorlog.template.typ +61 -0
- package/skills/learn/assets/reading-sequence.template.md +23 -0
- package/skills/learn/assets/source-index.template.md +17 -0
- package/skills/learn/assets/tasklog.template.typ +57 -0
- package/skills/learn/assets/workbook.template.typ +60 -0
- package/skills/learn/references/learning-science.md +103 -0
- package/skills/learn/scripts/init_learning_workspace.py +70 -0
- package/skills/macos-messages/SKILL.md +258 -0
- package/skills/memory/SKILL.md +33 -0
- package/skills/memory/codex.md +186 -0
- package/skills/memory/opencode.md +164 -0
- package/skills/mimestreamctl/SKILL.md +170 -0
- package/skills/mimestreamctl/agents/openai.yaml +4 -0
- package/skills/mimestreamctl/scripts/mimestreamctl +33 -0
- package/skills/mon/SKILL.md +51 -0
- package/skills/mon/scripts/mon_spend_review.py +458 -0
- package/skills/ocr/SKILL.md +136 -0
- package/skills/ocr/agents/openai.yaml +4 -0
- package/skills/ocr/references/local-ocr-best-practices.md +297 -0
- package/skills/ocr/references/mineru-api.md +159 -0
- package/skills/ocr/scripts/ocr-router +22 -0
- package/skills/ocr/scripts/ocr_router.py +741 -0
- package/skills/panopto-mp4-bulk-download/SKILL.md +57 -0
- package/skills/panopto-mp4-bulk-download/agents/openai.yaml +4 -0
- package/skills/panopto-mp4-bulk-download/references/url-patterns.md +26 -0
- package/skills/panopto-mp4-bulk-download/scripts/panopto_bulk_mp4.sh +213 -0
- package/skills/rust-systems-style/SKILL.md +109 -0
- package/skills/rust-systems-style/agents/openai.yaml +4 -0
- package/skills/rust-systems-style/references/rust-review-checklist.md +77 -0
- package/skills/rust-systems-style/references/style-sources.md +68 -0
- package/skills/ship-ai-native-cli/SKILL.md +76 -0
- package/skills/ship-ai-native-cli/agents/openai.yaml +4 -0
- package/skills/ship-ai-native-cli/references/case-notes.md +83 -0
- package/skills/ship-ai-native-cli/references/product-method.md +82 -0
- package/skills/ship-ai-native-cli/references/release-checklist.md +147 -0
- package/skills/ship-ai-native-cli/references/rust-cli-shape.md +111 -0
- package/skills/telegram-mtproto-session/SKILL.md +125 -0
- package/skills/telegram-mtproto-session/agents/openai.yaml +4 -0
- package/skills/telegram-mtproto-session/scripts/telegram_session.py +687 -0
- package/skills/tg/SKILL.md +173 -0
- package/skills/things3-manager/SKILL.md +116 -0
- package/skills/things3-manager/scripts/things +42 -0
- package/skills/things3-manager/scripts/things_cli.py +514 -0
- package/skills/web-artifacts-builder/LICENSE.txt +202 -0
- package/skills/web-artifacts-builder/SKILL.md +74 -0
- package/skills/web-artifacts-builder/scripts/bundle-artifact.sh +54 -0
- package/skills/web-artifacts-builder/scripts/init-artifact.sh +379 -0
- package/skills/web-artifacts-builder/scripts/shadcn-components.tar.gz +0 -0
- package/skills/yeet/LICENSE.txt +201 -0
- package/skills/yeet/SKILL.md +71 -0
- package/skills/yeet/agents/openai.yaml +6 -0
- package/skills/yeet/assets/yeet-small.svg +3 -0
- package/skills/yeet/assets/yeet.png +0 -0
|
@@ -0,0 +1,741 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
import re
|
|
8
|
+
import shutil
|
|
9
|
+
import subprocess
|
|
10
|
+
import sys
|
|
11
|
+
import tempfile
|
|
12
|
+
import time
|
|
13
|
+
import urllib.error
|
|
14
|
+
import urllib.request
|
|
15
|
+
import uuid
|
|
16
|
+
import zipfile
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
try:
|
|
22
|
+
import fitz # PyMuPDF
|
|
23
|
+
except ImportError: # pragma: no cover - fallback path for minimal hosts
|
|
24
|
+
fitz = None
|
|
25
|
+
|
|
26
|
+
try:
|
|
27
|
+
import requests
|
|
28
|
+
except ImportError: # pragma: no cover - fallback path for minimal hosts
|
|
29
|
+
requests = None
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Root of the hosted MinerU service; API paths are appended to this.
MINERU_BASE_URL = "https://mineru.net"
# (service, account) pair handed to the keychain lookup for the MinerU token.
MINERU_TOKEN_KEYCHAIN = ("codex.mineru", "credential")
# Lower-case suffixes routed as raster images.
IMAGE_SUFFIXES = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".webp", ".bmp"}
# Lower-case suffixes routed as PDF documents.
PDF_SUFFIXES = {".pdf"}
# NOTE(review): presumably the document types the cloud engines accept —
# confirm against the code that consumes this set.
CLOUD_DOC_SUFFIXES = {
    ".pdf",
    ".png", ".jpg", ".jpeg",
    ".doc", ".docx",
    ".ppt", ".pptx",
    ".xls", ".xlsx",
    ".html", ".htm",
}
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass
class PdfProfile:
    """Statistics collected from a small sample of a PDF's pages."""

    # File that was inspected.
    path: Path
    # Size of the file on disk, in bytes.
    size_bytes: int
    # Total number of pages, or None when it could not be determined.
    page_count: int | None
    # 1-based page numbers that were sampled.
    sample_pages: list[int]
    # Counted text characters summed over the sampled pages.
    sample_text_chars: int
    # Number of sampled pages that had enough extractable text.
    pages_with_text: int
    # Math-like pattern matches summed over the sampled pages.
    sample_math_hits: int
    # Number of embedded images, or None when it could not be determined.
    image_count: int | None

    @property
    def text_page_ratio(self) -> float:
        """Fraction of sampled pages with text; 0.0 when nothing was sampled."""
        sampled = len(self.sample_pages)
        return self.pages_with_text / sampled if sampled else 0.0

    @property
    def avg_text_chars(self) -> float:
        """Mean counted characters per sampled page; 0.0 when nothing was sampled."""
        sampled = len(self.sample_pages)
        return self.sample_text_chars / sampled if sampled else 0.0

    @property
    def has_good_text_layer(self) -> bool:
        """True when at least 75% of samples have text and they average 250+ chars."""
        if self.text_page_ratio < 0.75:
            return False
        return self.avg_text_chars >= 250
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def eprint(*parts: object) -> None:
    """Print ``parts`` to standard error, space-separated, with a trailing newline."""
    message = " ".join(str(part) for part in parts)
    print(message, file=sys.stderr)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def human_size(size: int) -> str:
    """Format a byte count using B, KB, MB or GB (1024-based).

    Values below 1024 are shown as whole bytes; larger values get one decimal
    place. GB is the largest unit, so very large sizes stay in GB.
    """
    scaled = float(size)
    if scaled < 1024:
        return f"{int(scaled)}B"
    for unit in ("KB", "MB"):
        scaled /= 1024
        if scaled < 1024:
            return f"{scaled:.1f}{unit}"
    # Anything that did not fit in MB is reported in GB, however large.
    return f"{scaled / 1024:.1f}GB"
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def require_tool(name: str) -> str:
    """Return the resolved path of ``name`` on PATH.

    Raises:
        SystemExit: if ``shutil.which`` cannot find the tool.
    """
    resolved = shutil.which(name)
    if resolved:
        return resolved
    raise SystemExit(f"Required tool not found on PATH: {name}")
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def run_text(cmd: list[str], *, timeout: int = 120) -> str:
    """Run ``cmd`` and return its standard output as text.

    Args:
        cmd: Argument vector; ``cmd[0]`` is used in the error message.
        timeout: Seconds passed to ``subprocess.run``.

    Raises:
        RuntimeError: if the command exits non-zero. The message carries the
            stripped stderr, or the stripped stdout when stderr is empty.
    """
    completed = subprocess.run(
        cmd,
        check=False,
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    if completed.returncode == 0:
        return completed.stdout
    detail = completed.stderr.strip() or completed.stdout.strip()
    raise RuntimeError(f"{cmd[0]} failed: {detail}")
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def text_char_count(value: str) -> int:
    """Count characters in ``value`` that are alphanumeric or in U+4E00..U+9FFF."""
    counted = 0
    for ch in value:
        if ch.isalnum() or "\u4e00" <= ch <= "\u9fff":
            counted += 1
    return counted
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def math_hit_count(value: str) -> int:
    """Count case-insensitive matches of several math-like patterns in ``value``.

    The patterns cover math vocabulary, an operator followed by an operand,
    a set of math/Greek symbols, and single-letter subscripts such as ``a_i``.
    The result is the total number of matches across all patterns.
    """
    hits = 0
    for pattern in (
        r"\b(equation|formula|theorem|lemma|corollary)\b",
        r"[=+\-*/]\s*[A-Za-z0-9(]",
        r"[∑∫√≤≥≈≠∞σΣΔθλμ]",
        r"\b[A-Za-z]_[A-Za-z0-9]\b",
    ):
        hits += sum(1 for _ in re.finditer(pattern, value, flags=re.IGNORECASE))
    return hits
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def pdf_page_count(path: Path) -> int | None:
    """Return the number of pages in ``path``, or None when it cannot be found.

    PyMuPDF is tried first when it is importable; any failure there falls
    through to parsing the ``Pages:`` line of ``pdfinfo`` output.
    """
    if fitz is not None:
        try:
            with fitz.open(path) as doc:
                return len(doc)
        except Exception:
            pass  # best effort: fall back to pdfinfo below
    try:
        info = run_text(["pdfinfo", str(path)])
    except Exception:
        return None
    pages_line = next((row for row in info.splitlines() if row.startswith("Pages:")), None)
    if pages_line is None:
        return None
    try:
        return int(pages_line.split(":", 1)[1].strip())
    except ValueError:
        return None
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def sample_pages(page_count: int | None) -> list[int]:
    """Pick up to four 1-based pages to sample: first, second, middle and last.

    Returns ``[1]`` when the page count is unknown or not positive. Duplicates
    are dropped and the result is sorted ascending.
    """
    if page_count is None or page_count <= 0:
        return [1]
    middle = max(1, page_count // 2)
    wanted = {1, 2, middle, page_count}
    return sorted(number for number in wanted if 1 <= number <= page_count)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def pdftotext_page(path: Path, page: int) -> str:
    """Extract one page of ``path`` as layout-preserving UTF-8 text via pdftotext."""
    page_arg = str(page)
    argv = [
        "pdftotext",
        "-f", page_arg,
        "-l", page_arg,
        "-layout",
        "-enc", "UTF-8",
        str(path),
        "-",  # write the text to stdout
    ]
    return run_text(argv, timeout=120)
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def pdf_image_count(path: Path) -> int | None:
    """Return the number of images in ``path``, or None when it cannot be found.

    PyMuPDF is tried first (sum of per-page image lists); on any failure the
    function falls back to counting lines of ``pdfimages -list`` output.
    """
    if fitz is not None:
        try:
            with fitz.open(path) as doc:
                return sum(len(page.get_images(full=True)) for page in doc)
        except Exception:
            pass  # best effort: fall back to pdfimages below
    if not shutil.which("pdfimages"):
        return None
    try:
        listing = run_text(["pdfimages", "-list", str(path)], timeout=120)
    except Exception:
        return None
    rows = [row for row in listing.splitlines() if row.strip()]
    # Two lines are discounted — presumably the header rows of the listing;
    # confirm against pdfimages output.
    return max(0, len(rows) - 2)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def inspect_pdf(path: Path) -> PdfProfile:
    """Sample a few pages of ``path`` and summarise their text content.

    Text is read with PyMuPDF when it is importable, otherwise with the
    ``pdfinfo``/``pdftotext`` command-line tools (both are then required).
    A page whose text cannot be read counts as empty. If PyMuPDF fails on the
    document as a whole, all text statistics are reported as zero.
    """
    page_count = pdf_page_count(path)
    pages = sample_pages(page_count)
    # One (char_count, math_hits) pair per successfully processed sample page.
    tallies = []
    if fitz is not None:
        try:
            with fitz.open(path) as doc:
                for number in pages:
                    try:
                        text = doc[number - 1].get_text("text")
                    except Exception:
                        text = ""
                    tallies.append((text_char_count(text), math_hit_count(text)))
        except Exception:
            # Discard partial results so the profile reports no text at all.
            tallies = []
    else:
        require_tool("pdfinfo")
        require_tool("pdftotext")
        for number in pages:
            try:
                text = pdftotext_page(path, number)
            except Exception:
                text = ""
            tallies.append((text_char_count(text), math_hit_count(text)))
    return PdfProfile(
        path=path,
        size_bytes=path.stat().st_size,
        page_count=page_count,
        sample_pages=pages,
        sample_text_chars=sum(chars for chars, _ in tallies),
        # A sampled page counts as "with text" from 80 counted characters up.
        pages_with_text=sum(1 for chars, _ in tallies if chars >= 80),
        sample_math_hits=sum(hits for _, hits in tallies),
        image_count=pdf_image_count(path),
    )
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def print_profile(profile: PdfProfile) -> None:
    """Print ``profile`` to stdout as indented JSON (non-ASCII kept as-is)."""
    summary = dict(
        path=str(profile.path),
        size=human_size(profile.size_bytes),
        page_count=profile.page_count,
        sample_pages=profile.sample_pages,
        text_page_ratio=round(profile.text_page_ratio, 3),
        avg_text_chars_per_sample_page=round(profile.avg_text_chars, 1),
        sample_math_hits=profile.sample_math_hits,
        image_count=profile.image_count,
        has_good_text_layer=profile.has_good_text_layer,
    )
    print(json.dumps(summary, ensure_ascii=False, indent=2))
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def default_out_dir(input_path: Path, engine: str) -> Path:
    """Return a sibling path of ``input_path`` named ``<stem>_ocr_<engine>``."""
    folder_name = f"{input_path.stem}_ocr_{engine}"
    return input_path.with_name(folder_name)
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def confirm_cloud_upload(path_or_url: str, service: str, args: argparse.Namespace) -> None:
    """Gate a cloud upload behind flags or an interactive confirmation.

    Returns immediately when ``args.allow_cloud`` is set. Otherwise the user
    must type ``UPLOAD`` at an interactive prompt.

    Raises:
        SystemExit: if ``args.no_cloud`` is set, stdin is not a TTY, or the
            typed answer is not exactly ``UPLOAD``.
    """
    if args.allow_cloud:
        return
    if args.no_cloud:
        raise SystemExit(f"{service} requires uploading input to a cloud service, but --no-cloud was set.")

    notice = (
        "",
        "Cloud upload confirmation required.",
        f"Service: {service}",
        f"Input: {path_or_url}",
        "Only continue for non-confidential documents that you are allowed to upload.",
    )
    for line in notice:
        eprint(line)
    if not sys.stdin.isatty():
        raise SystemExit("Refusing cloud upload in non-interactive mode. Re-run with --allow-cloud if this is intended.")
    if input("Type UPLOAD to continue: ").strip() != "UPLOAD":
        raise SystemExit("Cloud upload cancelled.")
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def keychain_secret(service: str, account: str) -> str | None:
    """Look up a secret through the ``keychain-secret`` helper on PATH.

    Returns the first line of the helper's output (carriage returns removed),
    or None when the helper is missing, exits non-zero, or prints an empty
    first line.
    """
    helper = shutil.which("keychain-secret")
    if not helper:
        return None
    completed = subprocess.run(
        [helper, "get", service, account],
        check=False,
        capture_output=True,
        text=True,
        timeout=10,
    )
    if completed.returncode != 0:
        return None
    first_line = completed.stdout.replace("\r", "").partition("\n")[0]
    return first_line if first_line else None
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def mineru_token() -> str:
    """Return the MinerU API token.

    Lookup order: ``MINERU_API_TOKEN``, then ``MINERU_TOKEN`` (empty values
    are skipped), then the keychain entry named by ``MINERU_TOKEN_KEYCHAIN``.

    Raises:
        SystemExit: if no source yields a non-empty token.
    """
    for variable in ("MINERU_API_TOKEN", "MINERU_TOKEN"):
        from_env = os.environ.get(variable)
        if from_env:
            return from_env
    stored = keychain_secret(*MINERU_TOKEN_KEYCHAIN)
    if stored:
        return stored
    raise SystemExit(
        "MinerU API token not found. Set MINERU_API_TOKEN or store it in Keychain as "
        "service codex.mineru account credential."
    )
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def http_json(method: str, url: str, payload: dict[str, Any] | None, headers: dict[str, str], timeout: int = 60) -> dict[str, Any]:
    """Send a JSON request and return the decoded JSON response.

    Uses ``requests`` when it is importable and falls back to ``urllib``
    otherwise. Both paths report failures the same way.

    Args:
        method: HTTP method name.
        url: Absolute request URL.
        payload: JSON-serialisable body, or None for no body.
        headers: Extra request headers.
        timeout: Seconds to wait for the server.

    Raises:
        RuntimeError: on transport errors, on HTTP status >= 400 (with up to
            800 characters of the response body), or when the response body
            is not valid JSON.
    """
    if requests is not None:
        try:
            response = requests.request(method, url, json=payload, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            raise RuntimeError(f"HTTP request failed for {url}: {exc}") from exc
        if response.status_code >= 400:
            raise RuntimeError(f"HTTP {response.status_code} from {url}: {response.text[:800]}")
        try:
            return response.json()
        except ValueError as exc:
            raise RuntimeError(f"Non-JSON response from {url}: {response.text[:800]}") from exc

    data = None if payload is None else json.dumps(payload).encode("utf-8")
    req_headers = {"Content-Type": "application/json", **headers}
    req = urllib.request.Request(url, data=data, headers=req_headers, method=method)
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            body = resp.read().decode("utf-8")
    except urllib.error.HTTPError as exc:
        # HTTPError must be handled before its parent classes below.
        body = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"HTTP {exc.code} from {url}: {body[:800]}") from exc
    except OSError as exc:
        # URLError and socket timeouts are OSError subclasses; wrap them so
        # this path matches the RuntimeError raised by the requests path.
        raise RuntimeError(f"HTTP request failed for {url}: {exc}") from exc
    try:
        return json.loads(body)
    except ValueError as exc:
        raise RuntimeError(f"Non-JSON response from {url}: {body[:800]}") from exc
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def curl_upload_file(path: Path, url: str) -> None:
    """Upload ``path`` to a signed URL with an HTTP PUT.

    Uses ``requests`` when it is importable, otherwise the ``curl`` binary.

    Raises:
        RuntimeError: if the server answers with status >= 400 or curl exits
            non-zero.
    """
    if requests is None:
        require_tool("curl")
        completed = subprocess.run(["curl", "-sS", "-f", "-X", "PUT", "-T", str(path), url], check=False)
        if completed.returncode != 0:
            raise RuntimeError("Signed URL upload failed.")
        return

    # Pass the open handle so the file is sent from disk rather than read
    # into a bytes object first.
    with path.open("rb") as handle:
        response = requests.put(url, data=handle, timeout=300)
    if response.status_code >= 400:
        raise RuntimeError(f"Signed URL upload failed: HTTP {response.status_code} {response.text[:500]}")
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def curl_download(url: str, out: Path) -> None:
    """Download ``url`` to ``out``, creating parent directories as needed.

    Uses a streamed ``requests`` download when it is importable, otherwise
    the ``curl`` binary (following redirects).

    Raises:
        RuntimeError: if the server answers with status >= 400 or curl exits
            non-zero.
    """
    if requests is None:
        require_tool("curl")
        out.parent.mkdir(parents=True, exist_ok=True)
        completed = subprocess.run(["curl", "-L", "-sS", "-f", "-o", str(out), url], check=False)
        if completed.returncode != 0:
            raise RuntimeError(f"Download failed: {url}")
        return

    out.parent.mkdir(parents=True, exist_ok=True)
    one_mib = 1024 * 1024
    with requests.get(url, stream=True, timeout=300) as response:
        if response.status_code >= 400:
            raise RuntimeError(f"Download failed: HTTP {response.status_code} {response.text[:500]}")
        with out.open("wb") as sink:
            for piece in response.iter_content(chunk_size=one_mib):
                if not piece:
                    continue  # skip empty chunks
                sink.write(piece)
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
def extract_data(response: dict[str, Any]) -> dict[str, Any]:
    """Return the ``data`` object of a MinerU response.

    Raises:
        RuntimeError: if ``data`` is missing or is not a dict.
    """
    inner = response.get("data")
    if not isinstance(inner, dict):
        raise RuntimeError(f"Unexpected MinerU response: {response}")
    return inner
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def first_url_value(value: Any, preferred_keys: tuple[str, ...], require_zip: bool = False) -> str | None:
    """Search nested dicts/lists depth-first for the first usable string.

    A string is usable when ``require_zip`` is false, or when it contains
    ``.zip`` (case-insensitive). Within a dict, ``preferred_keys`` are checked
    in order before the remaining values are searched recursively.

    Note that no URL syntax check is made: any usable string qualifies.

    Returns:
        The first usable string found, or None.
    """

    def usable(candidate: Any) -> bool:
        if not isinstance(candidate, str):
            return False
        return not require_zip or ".zip" in candidate.lower()

    if usable(value):
        return value

    if isinstance(value, dict):
        for key in preferred_keys:
            direct = value.get(key)
            if usable(direct):
                return direct
        children = value.values()
    elif isinstance(value, list):
        children = value
    else:
        return None

    for child in children:
        nested = first_url_value(child, preferred_keys, require_zip=require_zip)
        if nested:
            return nested
    return None
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def run_native_text(input_path: Path, out_dir: Path) -> Path:
    """Dump the PDF's existing text layer to ``out_dir/full.md``.

    ``pdftotext`` is preferred when it is on PATH; PyMuPDF is the fallback.
    Each page gets a ``PDF_PAGE`` marker comment and a heading. An empty
    final page is dropped.

    Returns:
        Path of the written Markdown file.

    Raises:
        SystemExit: if neither pdftotext nor PyMuPDF is available.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    if shutil.which("pdftotext"):
        argv = ["pdftotext", "-layout", "-enc", "UTF-8", str(input_path), "-"]
        # The output is split on form feeds to recover per-page text.
        page_texts = run_text(argv, timeout=900).split("\f")
        note = "Extracted from the PDF text layer with pdftotext -layout."
    elif fitz is not None:
        with fitz.open(input_path) as doc:
            page_texts = [page.get_text("text") for page in doc]
        note = "Extracted from the PDF text layer with PyMuPDF."
    else:
        raise SystemExit("native-text requires either pdftotext or PyMuPDF.")

    last_number = len(page_texts)
    chunks = [f"# {input_path.stem}", "", f"<!-- {note} -->", ""]
    for number, raw in enumerate(page_texts, start=1):
        body = raw.strip()
        if number == last_number and not body:
            continue  # drop an empty final page
        chunks += [
            f"<!-- PDF_PAGE {number:04d} -->",
            f"## PDF Page {number}",
            "",
            body,
            "",
        ]
    target = out_dir / "full.md"
    target.write_text("\n".join(chunks).rstrip() + "\n", encoding="utf-8")
    return target
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
def run_apple_vision(input_path: Path, out_dir: Path, language: str | None) -> Path:
    """Run the Apple Vision OCR command-line tool and save its text output.

    The tool location defaults to ``/Users/yupeit/bin/ocr`` and can be
    overridden with the ``OCR_ROUTER_APPLE_VISION_TOOL`` environment variable,
    so the script is usable on machines other than the author's.

    Args:
        input_path: File to OCR.
        out_dir: Directory for the result; created if missing.
        language: Passed as ``--language`` unless it is empty or ``"auto"``.

    Returns:
        Path of the written ``<stem>.txt`` file.

    Raises:
        SystemExit: if the OCR tool does not exist.
        RuntimeError: if the tool exits non-zero (raised by ``run_text``).
    """
    tool = Path(os.environ.get("OCR_ROUTER_APPLE_VISION_TOOL") or "/Users/yupeit/bin/ocr").expanduser()
    if not tool.exists():
        raise SystemExit(f"Apple Vision OCR tool not found at {tool}")
    out_dir.mkdir(parents=True, exist_ok=True)
    cmd = [str(tool)]
    if language and language != "auto":
        cmd.extend(["--language", language])
    cmd.append(str(input_path))
    text = run_text(cmd, timeout=300)
    out_path = out_dir / f"{input_path.stem}.txt"
    out_path.write_text(text, encoding="utf-8")
    return out_path
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
def run_gemini_vlm(input_path: Path, out_dir: Path, args: argparse.Namespace) -> Path:
    """Run the Gemini VLM OCR script and return the Markdown it writes.

    The script location defaults to ``/Users/yupeit/bin/ocr.py`` and can be
    overridden with the ``OCR_ROUTER_GEMINI_TOOL`` environment variable. The
    script is checked for existence before the user is asked to confirm the
    cloud upload, so nobody confirms an upload that cannot happen.

    Args:
        input_path: File to OCR.
        out_dir: Directory for the result; created if missing.
        args: Parsed CLI options; ``concurrent`` and ``vlm_model`` are read
            here, and the cloud flags are read by ``confirm_cloud_upload``.

    Returns:
        ``out_dir/full.md``, the output path passed to the script.

    Raises:
        SystemExit: if the script is missing, the upload is not confirmed, or
            the script exits non-zero.
    """
    tool = Path(os.environ.get("OCR_ROUTER_GEMINI_TOOL") or "/Users/yupeit/bin/ocr.py").expanduser()
    if not tool.exists():
        raise SystemExit(f"Gemini OCR script not found at {tool}")
    confirm_cloud_upload(str(input_path), f"Gemini VLM via {tool}", args)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / "full.md"
    cmd = [
        str(tool),
        str(input_path),
        "--output",
        str(out_path),
        "--concurrent",
        str(args.concurrent),
        "--model",
        args.vlm_model,
    ]
    # Output is not captured, so the script's progress reaches the terminal.
    proc = subprocess.run(cmd, check=False)
    if proc.returncode != 0:
        raise SystemExit(f"Gemini VLM OCR failed with exit code {proc.returncode}")
    return out_path
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
def run_local_mineru(input_path: Path, out_dir: Path, args: argparse.Namespace) -> Path:
    """Run MinerU locally through ``uvx`` and return the first Markdown result.

    The full command output goes to ``out_dir/mineru-local.log``; only lines
    mentioning batch completion, errors or timeouts are echoed to stdout.

    Returns:
        The first ``*.md`` under ``out_dir`` (sorted), or ``out_dir`` itself
        when no Markdown file exists.

    Raises:
        SystemExit: if ``uvx`` is missing or MinerU exits non-zero.
    """
    require_tool("uvx")
    out_dir.mkdir(parents=True, exist_ok=True)

    def flag(enabled: Any) -> str:
        # The command line takes literal "true"/"false" strings.
        return "true" if enabled else "false"

    # Timeouts are defaults only: values already in the environment win.
    child_env = os.environ.copy()
    for variable, fallback in (
        ("MINERU_TASK_RESULT_TIMEOUT_SECONDS", str(args.timeout_seconds)),
        ("MINERU_TASK_RESULT_DOWNLOAD_TIMEOUT_SECONDS", "1800"),
        ("MINERU_LOCAL_API_STARTUP_TIMEOUT_SECONDS", "600"),
    ):
        child_env.setdefault(variable, fallback)

    argv = [
        "uvx",
        "mineru[all]",
        "-p", str(input_path),
        "-o", str(out_dir),
        "-b", args.mineru_backend,
        "-l", args.language,
        "-f", flag(args.enable_formula),
        "-t", flag(args.enable_table),
        "--image-analysis", flag(args.image_analysis),
    ]
    if args.mineru_method:
        argv += ["-m", args.mineru_method]
    if args.start_page is not None:
        argv += ["-s", str(args.start_page)]
    if args.end_page is not None:
        argv += ["-e", str(args.end_page)]

    log_path = out_dir / "mineru-local.log"
    echo_markers = ("Completed batch", "Error:", "Timed out")
    with log_path.open("w", encoding="utf-8") as log:
        log.write("$ " + " ".join(argv) + "\n\n")
        log.flush()
        proc = subprocess.Popen(argv, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, env=child_env)
        assert proc.stdout is not None
        for line in proc.stdout:
            log.write(line)
            log.flush()  # keep the log current while the job is running
            if any(marker in line for marker in echo_markers):
                print(line.rstrip(), flush=True)
        exit_code = proc.wait()
    if exit_code != 0:
        raise SystemExit(f"Local MinerU failed with exit code {exit_code}; see {log_path}")
    produced = sorted(out_dir.rglob("*.md"))
    return produced[0] if produced else out_dir
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
def run_mineru_agent(input_path: Path, out_dir: Path, args: argparse.Namespace) -> Path:
    """Parse a small document through the MinerU agent API.

    Files over 10 MB, or PDFs known to have more than 20 pages, are rejected
    before any upload. The request is sent without authentication headers.

    Returns:
        ``out_dir/full.md`` containing the downloaded Markdown.

    Raises:
        SystemExit: if the input exceeds the limits or the upload is not
            confirmed.
        RuntimeError: if a response lacks the expected task id, upload URL or
            Markdown URL.
    """
    is_pdf = input_path.suffix.lower() == ".pdf"
    profile = inspect_pdf(input_path) if is_pdf else None
    size_limit = 10 * 1024 * 1024
    if input_path.stat().st_size > size_limit or (profile and profile.page_count and profile.page_count > 20):
        raise SystemExit("MinerU agent API is limited to 10MB/20 pages. Use --engine mineru-api instead.")
    confirm_cloud_upload(str(input_path), "MinerU agent parse API", args)
    out_dir.mkdir(parents=True, exist_ok=True)

    created = http_json(
        "POST",
        f"{MINERU_BASE_URL}/api/v1/agent/parse/file",
        {"file_name": input_path.name},
        {},
    )
    task_info = extract_data(created)
    # Alternate key names are accepted for both fields.
    task_id = task_info.get("task_id") or task_info.get("id")
    upload_url = task_info.get("file_url") or task_info.get("upload_url")
    if not (task_id and upload_url):
        raise RuntimeError(f"Unexpected MinerU agent create response: {created}")

    curl_upload_file(input_path, upload_url)
    result = poll_agent_task(str(task_id), args.poll_interval, args.timeout_seconds)
    md_url = result.get("full_md_url") or result.get("md_url") or result.get("markdown_url")
    if not md_url:
        raise RuntimeError(f"Agent task completed but no Markdown URL was found: {result}")
    destination = out_dir / "full.md"
    curl_download(md_url, destination)
    return destination
|
|
518
|
+
|
|
519
|
+
|
|
520
|
+
def poll_agent_task(task_id: str, interval: float, timeout_seconds: int) -> dict[str, Any]:
    """Poll a MinerU agent task until it finishes, fails or times out.

    Args:
        task_id: Identifier returned when the task was created.
        interval: Seconds to sleep between polls.
        timeout_seconds: Overall time budget in seconds.

    Returns:
        The task's ``data`` object once its state reports completion.

    Raises:
        RuntimeError: if the task reports a failed/error state.
        TimeoutError: if the budget is used up before the task completes.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout_seconds
    while True:
        response = http_json("GET", f"{MINERU_BASE_URL}/api/v1/agent/parse/{task_id}", None, {})
        data = extract_data(response)
        state = str(data.get("state") or data.get("status") or "").lower()
        if state in {"done", "completed", "success", "succeeded"}:
            return data
        if state in {"failed", "error"}:
            raise RuntimeError(f"MinerU agent task failed: {data}")
        if time.monotonic() > deadline:
            raise TimeoutError(f"Timed out waiting for MinerU agent task {task_id}")
        eprint(f"MinerU agent task {task_id}: {state or 'processing'}")
        time.sleep(interval)
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
def run_mineru_api(input_path: Path, out_dir: Path, args: argparse.Namespace) -> Path:
    """Process a document through the token-authenticated MinerU batch API.

    Steps: confirm the upload, request a signed upload URL, upload the file,
    poll the batch until done, then download and unpack the result archive.

    Returns:
        The first ``*.md`` inside the unpacked archive (sorted), or the
        extraction directory when no Markdown file exists.

    Raises:
        SystemExit: if the upload is not confirmed or no API token is found.
        RuntimeError: if a response lacks the batch id, upload URL or zip URL.
    """
    confirm_cloud_upload(str(input_path), "MinerU official API", args)
    out_dir.mkdir(parents=True, exist_ok=True)
    auth_headers = {"Authorization": f"Bearer {mineru_token()}"}

    # A fresh id lets the poller find this file among the batch results.
    data_id = str(uuid.uuid4())
    file_entry: dict[str, Any] = {
        "name": input_path.name,
        "data_id": data_id,
        "is_ocr": args.is_ocr,
    }
    if args.page_ranges:
        file_entry["page_ranges"] = args.page_ranges

    request_body = {
        "enable_formula": args.enable_formula,
        "enable_table": args.enable_table,
        "language": args.language,
        "model_version": args.model_version,
        "files": [file_entry],
    }
    created = http_json("POST", f"{MINERU_BASE_URL}/api/v4/file-urls/batch", request_body, auth_headers)
    created_data = extract_data(created)
    batch_id = created_data.get("batch_id")
    url_entries = created_data.get("file_urls") or created_data.get("files") or []
    if not (batch_id and url_entries):
        raise RuntimeError(f"Unexpected MinerU signed URL response: {created}")
    upload_url = first_url_value(url_entries, ("upload_url", "file_url", "url"))
    if not upload_url:
        raise RuntimeError(f"Signed URL response did not include upload_url: {created}")

    eprint(f"Uploading to MinerU batch {batch_id} ...")
    curl_upload_file(input_path, upload_url)
    result = poll_mineru_batch(str(batch_id), data_id, auth_headers, args.poll_interval, args.timeout_seconds)
    zip_url = first_url_value(result, ("full_zip_url", "zip_url"), require_zip=True)
    if not zip_url:
        raise RuntimeError(f"MinerU task completed but no zip URL was found: {result}")

    archive_path = out_dir / "mineru_result.zip"
    curl_download(str(zip_url), archive_path)
    unpack_dir = out_dir / "mineru_result"
    unpack_dir.mkdir(parents=True, exist_ok=True)
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall(unpack_dir)
    produced = sorted(unpack_dir.rglob("*.md"))
    return produced[0] if produced else unpack_dir
|
|
583
|
+
|
|
584
|
+
|
|
585
|
+
def poll_mineru_batch(
    batch_id: str,
    data_id: str,
    headers: dict[str, str],
    interval: float,
    timeout_seconds: int,
) -> dict[str, Any]:
    """Poll a MinerU batch until the tracked file finishes, fails or times out.

    The result entry whose ``data_id`` matches is tracked. If none matches,
    the first dict entry is used, and if there are no entries at all the
    batch-level ``data`` object is used.

    Args:
        batch_id: Batch identifier returned by the signed-URL request.
        data_id: Identifier attached to the uploaded file.
        headers: Request headers, including authorization.
        interval: Seconds to sleep between polls.
        timeout_seconds: Overall time budget in seconds.

    Returns:
        The tracked entry (or batch data) once its state reports completion.

    Raises:
        RuntimeError: if the tracked entry reports a failed/error state.
        TimeoutError: if the budget is used up before completion.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout_seconds
    while True:
        response = http_json("GET", f"{MINERU_BASE_URL}/api/v4/extract-results/batch/{batch_id}", None, headers)
        data = extract_data(response)
        results = data.get("extract_result") or data.get("results") or data.get("files") or []
        if isinstance(results, dict):
            results = [results]
        if not isinstance(results, list):
            results = []
        target = None
        for item in results:
            if not isinstance(item, dict):
                continue
            # A data_id match always wins; otherwise keep the first entry.
            if item.get("data_id") == data_id or not target:
                target = item
        record = target or data
        state = str(record.get("state") or record.get("status") or "").lower()
        if state in {"done", "completed", "success", "succeeded"}:
            return record
        if state in {"failed", "error"}:
            raise RuntimeError(f"MinerU batch task failed: {record}")
        if time.monotonic() > deadline:
            raise TimeoutError(f"Timed out waiting for MinerU batch {batch_id}")
        eprint(f"MinerU batch {batch_id}: {state or 'processing'}")
        time.sleep(interval)
|
|
616
|
+
|
|
617
|
+
|
|
618
|
+
def choose_engine(input_path: Path, args: argparse.Namespace) -> tuple[str, PdfProfile | None]:
    """Pick the extraction engine for *input_path* and return it with any PDF profile.

    An explicit ``args.engine`` (anything other than ``"auto"``) is returned
    unchanged; the PDF is profiled in that case only when the file suffix is
    ``.pdf`` and ``args.show_profile`` is set. In auto mode images go to
    ``apple-vision``, non-PDF files go to ``mineru-api``, and PDFs are profiled
    and routed according to the structure flags, the text-layer quality, the
    availability of ``uvx`` on PATH, and the file size / page count.
    """
    ext = input_path.suffix.lower()

    # Explicit engine: honour it, profiling only on request.
    if args.engine != "auto":
        wants_profile = ext == ".pdf" and args.show_profile
        return args.engine, (inspect_pdf(input_path) if wants_profile else None)

    if ext in IMAGE_SUFFIXES:
        return "apple-vision", None
    if ext not in PDF_SUFFIXES:
        return "mineru-api", None

    pdf_profile = inspect_pdf(input_path)
    needs_structure = args.require_structure or args.need_formulas or args.need_tables
    if needs_structure:
        # Structured output needs MinerU; prefer the local runner when uvx exists.
        chosen = "mineru-local" if shutil.which("uvx") else "mineru-api"
    elif pdf_profile.has_good_text_layer:
        chosen = "native-text"
    elif shutil.which("uvx"):
        chosen = "mineru-local"
    elif pdf_profile.size_bytes <= 10 * 1024 * 1024 and (pdf_profile.page_count or 999999) <= 20:
        # Small documents (<= 10 MiB and <= 20 pages) go to the agent engine;
        # an unknown page count is treated as too large.
        chosen = "mineru-agent"
    else:
        chosen = "mineru-api"
    return chosen, pdf_profile
|
|
640
|
+
|
|
641
|
+
|
|
642
|
+
def print_file_profile(input_path: Path, engine: str, profile: PdfProfile | None) -> None:
    """Print a JSON summary of the file and the recommended engine to stdout.

    When *profile* is available its PDF statistics are reported; otherwise only
    the path, human-readable size and lower-cased suffix of *input_path* are.
    """
    if profile is None:
        details = {
            "path": str(input_path),
            "size": human_size(input_path.stat().st_size),
            "suffix": input_path.suffix.lower(),
        }
    else:
        details = {
            "path": str(profile.path),
            "size": human_size(profile.size_bytes),
            "page_count": profile.page_count,
            "sample_pages": profile.sample_pages,
            "text_page_ratio": round(profile.text_page_ratio, 3),
            "avg_text_chars_per_sample_page": round(profile.avg_text_chars, 1),
            "sample_math_hits": profile.sample_math_hits,
            "image_count": profile.image_count,
            "has_good_text_layer": profile.has_good_text_layer,
        }
    # The recommended engine always leads the payload, followed by the details.
    summary = {"recommended_engine": engine, **details}
    print(json.dumps(summary, ensure_ascii=False, indent=2))
|
|
664
|
+
|
|
665
|
+
|
|
666
|
+
def parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse ``sys.argv``.

    Arguments are declared in a table and registered in order, so the generated
    help output lists them in the same sequence as the table.
    """
    toggle = argparse.BooleanOptionalAction
    engines = ["auto", "native-text", "apple-vision", "gemini-vlm", "mineru-local", "mineru-api", "mineru-agent"]
    argument_table = (
        ("input", {"type": Path, "help": "Local PDF/image/document path"}),
        ("--engine", {"choices": engines, "default": "auto"}),
        ("--out-dir", {"type": Path}),
        ("--show-profile", {"action": "store_true"}),
        ("--profile-only", {"action": "store_true", "help": "Inspect the file and recommended engine without extracting"}),
        ("--allow-cloud", {"action": "store_true", "help": "Allow upload without an interactive confirmation prompt"}),
        ("--no-cloud", {"action": "store_true", "help": "Forbid cloud OCR/API upload"}),
        ("--language", {"default": "en"}),
        ("--need-formulas", {"action": "store_true"}),
        ("--need-tables", {"action": "store_true"}),
        ("--require-structure", {"action": "store_true", "help": "Prefer MinerU structured output over plain text layer extraction"}),
        ("--enable-formula", {"action": toggle, "default": True}),
        ("--enable-table", {"action": toggle, "default": True}),
        ("--image-analysis", {"action": toggle, "default": False}),
        ("--model-version", {"default": "vlm", "help": "MinerU official API model_version, e.g. vlm, pipeline, MinerU-HTML"}),
        ("--is-ocr", {"action": toggle, "default": True, "help": "MinerU official API OCR flag"}),
        ("--page-ranges", {"help": "MinerU API page ranges string, e.g. 1-5,9"}),
        ("--mineru-backend", {"default": "pipeline"}),
        ("--mineru-method", {"default": "txt"}),
        ("--start-page", {"type": int, "help": "0-based local MinerU start page"}),
        ("--end-page", {"type": int, "help": "0-based local MinerU inclusive end page"}),
        ("--vlm-model", {"default": "gemini-2.5-flash-lite"}),
        ("--concurrent", {"type": int, "default": 2}),
        ("--poll-interval", {"type": float, "default": 5.0}),
        ("--timeout-seconds", {"type": int, "default": 86400}),
    )

    parser = argparse.ArgumentParser(description="Route OCR/document extraction to the best local or MinerU workflow.")
    for flag, options in argument_table:
        parser.add_argument(flag, **options)
    return parser.parse_args()
|
|
694
|
+
|
|
695
|
+
|
|
696
|
+
def main() -> int:
    """Run the CLI: validate the input, select an engine, extract, print the result path.

    Returns:
        ``0`` after a profile-only run or a completed extraction.

    Raises:
        SystemExit: The input path does not exist, an explicitly requested
            MinerU cloud engine does not support the file suffix, or the
            selected engine name is not recognised.
    """
    args = parse_args()
    input_path = args.input.expanduser().resolve()
    if not input_path.exists():
        raise SystemExit(f"Input not found: {input_path}")
    unsupported_suffix = input_path.suffix.lower() not in CLOUD_DOC_SUFFIXES
    if unsupported_suffix and args.engine in {"mineru-api", "mineru-agent"}:
        raise SystemExit(f"MinerU cloud engine does not support this suffix: {input_path.suffix}")

    engine, profile = choose_engine(input_path, args)
    if args.profile_only:
        print_file_profile(input_path, engine, profile)
        return 0

    # Report the PDF profile: full dump on request, otherwise a one-line summary.
    if profile is not None:
        if args.show_profile:
            print_profile(profile)
        else:
            eprint(
                "PDF profile:",
                f"pages={profile.page_count}",
                f"text_layer={profile.has_good_text_layer}",
                f"avg_chars={profile.avg_text_chars:.1f}",
                f"images={profile.image_count}",
            )
    eprint(f"Selected engine: {engine}")

    if args.out_dir:
        out_dir = args.out_dir.expanduser().resolve()
    else:
        out_dir = default_out_dir(input_path, engine)

    # Lazy dispatch table: only the selected engine's runner is ever invoked.
    runners = {
        "native-text": lambda: run_native_text(input_path, out_dir),
        "apple-vision": lambda: run_apple_vision(input_path, out_dir, args.language),
        "gemini-vlm": lambda: run_gemini_vlm(input_path, out_dir, args),
        "mineru-local": lambda: run_local_mineru(input_path, out_dir, args),
        "mineru-api": lambda: run_mineru_api(input_path, out_dir, args),
        "mineru-agent": lambda: run_mineru_agent(input_path, out_dir, args),
    }
    runner = runners.get(engine)
    if runner is None:
        raise SystemExit(f"Unknown engine: {engine}")
    out_path = runner()

    print(str(out_path))
    return 0
|
|
738
|
+
|
|
739
|
+
|
|
740
|
+
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|