contract-archive-cli 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contract_archive/__init__.py +2 -0
- contract_archive/archive/__init__.py +64 -0
- contract_archive/archive/db.py +126 -0
- contract_archive/archive/ingest.py +667 -0
- contract_archive/archive/migrations/001_init.sql +62 -0
- contract_archive/archive/migrations/002_obligations.sql +25 -0
- contract_archive/archive/migrations/003_document_types.sql +31 -0
- contract_archive/archive/migrations/004_seals_subjects.sql +36 -0
- contract_archive/archive/migrations/005_completeness.sql +18 -0
- contract_archive/archive/party_registry.py +276 -0
- contract_archive/archive/paths.py +113 -0
- contract_archive/archive/repository.py +918 -0
- contract_archive/cli.py +455 -0
- contract_archive/cli_common.py +293 -0
- contract_archive/cli_config.py +96 -0
- contract_archive/cli_introspect.py +204 -0
- contract_archive/cli_party.py +166 -0
- contract_archive/cli_query.py +492 -0
- contract_archive/cli_render.py +575 -0
- contract_archive/config.py +257 -0
- contract_archive/errors.py +163 -0
- contract_archive/extraction/__init__.py +14 -0
- contract_archive/extraction/amount_check.py +87 -0
- contract_archive/extraction/contract_extractor.py +103 -0
- contract_archive/extraction/document_extractor.py +546 -0
- contract_archive/extraction/evidence_page_fix.py +99 -0
- contract_archive/extraction/llm_extractor.py +207 -0
- contract_archive/extraction/normalize.py +210 -0
- contract_archive/extraction/property_fee.py +79 -0
- contract_archive/extraction/vision_seal.py +390 -0
- contract_archive/pipelines/__init__.py +9 -0
- contract_archive/pipelines/mineru_pipeline.py +955 -0
- contract_archive/pipelines/vl_ocr.py +160 -0
- contract_archive/schemas/__init__.py +67 -0
- contract_archive/schemas/document.py +408 -0
- contract_archive/utils/__init__.py +27 -0
- contract_archive/utils/device.py +51 -0
- contract_archive/utils/http_env.py +54 -0
- contract_archive/utils/pdf.py +207 -0
- contract_archive_cli-0.2.7.dist-info/METADATA +386 -0
- contract_archive_cli-0.2.7.dist-info/RECORD +44 -0
- contract_archive_cli-0.2.7.dist-info/WHEEL +4 -0
- contract_archive_cli-0.2.7.dist-info/entry_points.txt +2 -0
- contract_archive_cli-0.2.7.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,955 @@
|
|
|
1
|
+
"""
|
|
2
|
+
MinerU 3.x pipeline。
|
|
3
|
+
|
|
4
|
+
调用方式:subprocess 调 `mineru` CLI(Python API 在 3.x 不稳)。
|
|
5
|
+
mineru 3.x CLI 的输出目录约定(注意:和 2.x 不同!):
|
|
6
|
+
<out_dir>/<pdf_stem>/<auto|vlm>/
|
|
7
|
+
├── <stem>.md # 主 markdown(不是 full.md)
|
|
8
|
+
├── <stem>_content_list.json # 结构化元素列表
|
|
9
|
+
├── <stem>_layout.pdf
|
|
10
|
+
├── <stem>_model.json
|
|
11
|
+
├── <stem>_middle.json
|
|
12
|
+
└── images/
|
|
13
|
+
|
|
14
|
+
content_list.json 元素的 bbox 是 **归一化到 0-1000 整数**,不是 PDF point。
|
|
15
|
+
我们把它换算回 PDF point(× page_width_pt / 1000)以与其他 pipeline 对齐。
|
|
16
|
+
|
|
17
|
+
历史:原本与 DashScope/PaddleOCR pipeline 共享一个 BasePipeline 抽象基类。
|
|
18
|
+
重构后唯一具体实现,抽象基类已 inline 到本文件,避免一抽象一具体的反模式。
|
|
19
|
+
"""
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import json
|
|
23
|
+
import logging
|
|
24
|
+
import os
|
|
25
|
+
import signal
|
|
26
|
+
import shutil
|
|
27
|
+
import subprocess
|
|
28
|
+
import sys
|
|
29
|
+
import tempfile
|
|
30
|
+
import time
|
|
31
|
+
from collections.abc import Mapping
|
|
32
|
+
from datetime import datetime
|
|
33
|
+
from pathlib import Path
|
|
34
|
+
|
|
35
|
+
from ..config import get_timeout_s
|
|
36
|
+
from ..schemas import (
|
|
37
|
+
BBox,
|
|
38
|
+
FILE_LAYOUT,
|
|
39
|
+
FILE_MARKDOWN,
|
|
40
|
+
FILE_PIPELINE_META,
|
|
41
|
+
FILE_RAW_TEXT,
|
|
42
|
+
FILE_STRUCTURED,
|
|
43
|
+
LayoutBlock,
|
|
44
|
+
PipelineMeta,
|
|
45
|
+
PipelineOutput,
|
|
46
|
+
PREVIEW_DIR,
|
|
47
|
+
Section,
|
|
48
|
+
StructuredDocument,
|
|
49
|
+
Table,
|
|
50
|
+
)
|
|
51
|
+
from ..utils import (
|
|
52
|
+
PdfPageInfo,
|
|
53
|
+
TextLayerStats,
|
|
54
|
+
analyze_text_layer,
|
|
55
|
+
describe_device,
|
|
56
|
+
extract_text_layer,
|
|
57
|
+
inspect_pdf_pages,
|
|
58
|
+
is_text_layer_usable,
|
|
59
|
+
render_pdf_to_images,
|
|
60
|
+
select_device,
|
|
61
|
+
)
|
|
62
|
+
from ..utils.http_env import sanitize_no_proxy_for_httpx
|
|
63
|
+
from .vl_ocr import ocr_pdf_images_with_vl
|
|
64
|
+
|
|
65
|
+
logger = logging.getLogger(__name__)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# Maps the "type" field of MinerU 3.x content_list.json entries onto the
# unified schema's block types.
# NOTE: MinerU 3.x has no dedicated "title" type; a heading is emitted as
# type "text" with text_level >= 1.
_MINERU_TYPE_MAP = dict(
    text="paragraph",
    image="figure",
    table="table",
    equation="formula",
    list="list",
    code="paragraph",
    seal="stamp",
    chart="figure",
    header="header",
    footer="footer",
    page_number="footer",
    aside_text="paragraph",
    page_footnote="footer",
)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
# Artifacts this pipeline is known to write. Cleaning an output directory
# removes only these files and these sub-directories; unknown content is
# never passed to rmtree.
_OWNED_DIRS = {PREVIEW_DIR, "_mineru_raw"}
_OWNED_FILES = {
    "extraction_result.json",
    "extraction_confidence.json",
    FILE_PIPELINE_META,
    FILE_LAYOUT,
    FILE_STRUCTURED,
    FILE_MARKDOWN,
    FILE_RAW_TEXT,
}
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class MinerUPipeline:
|
|
101
|
+
"""MinerU 3.x PDF 解析。单一职责:run(pdf, out_dir) → PipelineOutput。"""
|
|
102
|
+
|
|
103
|
+
# Historical class name stays for API compatibility. User-facing logs use
|
|
104
|
+
# "ocr" because this pipeline may run native text, DashScope VL, or MinerU.
|
|
105
|
+
name = "ocr"
|
|
106
|
+
|
|
107
|
+
def __init__(
    self,
    device: str | None = None,
    backend: str | None = None,
    dpi: int = 200,
    prefer_text_layer: bool = True,
    allow_vl_fallback: bool = True,
    prefer_vl_ocr: bool | None = None,
    lite_retry: bool | None = None,
    vl_ocr_max_pages: int | None = None,
    vl_ocr_dpi: int | None = None,
) -> None:
    """Resolve the device, the MinerU backend and the OCR options.

    Args:
        device: Requested device, resolved through ``select_device``.
        backend: MinerU backend name; when falsy it is derived from the device.
        dpi: DPI used for preview rendering after a successful MinerU run.
        prefer_text_layer: Use the PDF's native text layer when it is usable.
        allow_vl_fallback: Whether the DashScope VL OCR path may be used.
        prefer_vl_ocr: Try VL OCR before MinerU. ``None`` reads
            ``CONTRACT_ARCHIVE_VL_OCR_FIRST`` (default: enabled).
        lite_retry: Enable the lite MinerU retry. ``None`` reads
            ``CONTRACT_ARCHIVE_MINERU_LITE_RETRY`` (default: enabled).
        vl_ocr_max_pages: Page cap for VL OCR. A falsy value reads
            ``CONTRACT_ARCHIVE_VL_OCR_MAX_PAGES`` (default 500).
        vl_ocr_dpi: Render DPI for VL OCR. A falsy value reads
            ``CONTRACT_ARCHIVE_VL_OCR_DPI`` (default 160).
    """
    self.device = select_device(device)
    logger.info("[%s] device = %s", self.name, describe_device(self.device))

    # Valid MinerU 3.x backend values (verified empirically):
    #   pipeline            CPU fallback, best compatibility
    #   hybrid-auto-engine  3.x default, hybrid approach
    #   hybrid-http-client  goes through an http server
    #   vlm-auto-engine     GPU VLM inference
    #   vlm-http-client     goes through an http server
    # Default policy: CUDA -> vlm-auto-engine, anything else (CPU/MPS) -> pipeline.
    if backend:
        self.backend = backend
    elif self.device == "cuda":
        self.backend = "vlm-auto-engine"
    else:
        self.backend = "pipeline"

    self.dpi = dpi
    self.prefer_text_layer = prefer_text_layer
    self.allow_vl_fallback = allow_vl_fallback

    if prefer_vl_ocr is None:
        prefer_vl_ocr = _env_bool("CONTRACT_ARCHIVE_VL_OCR_FIRST", True)
    self.prefer_vl_ocr = prefer_vl_ocr

    if lite_retry is None:
        lite_retry = _env_bool("CONTRACT_ARCHIVE_MINERU_LITE_RETRY", True)
    self.lite_retry = lite_retry

    # VL OCR is now invoked page by page (see vl_ocr.py), so it is no longer
    # bound by a per-request page limit. The cap is only a safety valve
    # against very large PDFs burning too many calls; the default is relaxed
    # to 500 because full policy wordings commonly exceed 90 pages.
    if not vl_ocr_max_pages:
        vl_ocr_max_pages = _env_int("CONTRACT_ARCHIVE_VL_OCR_MAX_PAGES", 500)
    self.vl_ocr_max_pages = vl_ocr_max_pages

    if not vl_ocr_dpi:
        vl_ocr_dpi = _env_int("CONTRACT_ARCHIVE_VL_OCR_DPI", 160)
    self.vl_ocr_dpi = vl_ocr_dpi
|
|
148
|
+
|
|
149
|
+
# ---------- 入口 ----------
|
|
150
|
+
def run(self, pdf_path: str | Path, out_dir: str | Path) -> PipelineOutput:
    """Run the pipeline on one PDF and persist its artifacts.

    Steps:
      - remove artifacts a previous run of this pipeline wrote (whitelist
        only; unknown content is never deleted recursively);
      - time the call to ``_process``;
      - write the results under the unified file names.

    Args:
        pdf_path: Source PDF; resolved to an absolute path.
        out_dir: Output directory; created when missing.

    Returns:
        The ``PipelineOutput`` from ``_process`` with timing, source and
        device metadata filled in.

    Raises:
        Exception: Whatever ``_process`` raised, after a failure meta file
            has been written on a best-effort basis.
    """
    pdf_path = Path(pdf_path).resolve()
    out_dir = Path(out_dir).resolve()
    if out_dir.exists():
        for item in out_dir.iterdir():
            if item.is_file() and item.name in _OWNED_FILES:
                item.unlink()
            elif item.is_dir() and item.name in _OWNED_DIRS:
                shutil.rmtree(item)
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / PREVIEW_DIR).mkdir(exist_ok=True)

    started = datetime.now()
    t0 = time.perf_counter()
    try:
        result = self._process(pdf_path, out_dir)
    except Exception as e:
        logger.exception("[%s] pipeline failed", self.name)
        # Writing the failure meta is best effort: if it fails too (disk
        # full, permissions), the caller must still see the original
        # pipeline error, not the secondary one.
        try:
            _dump_failure(out_dir, pdf_path, started, str(e))
        except Exception:
            logger.exception("[%s] could not write failure meta", self.name)
        raise
    duration = time.perf_counter() - t0
    finished = datetime.now()

    meta = result.meta
    meta.started_at = started
    meta.finished_at = finished
    meta.duration_seconds = duration
    if not meta.source_pdf:
        meta.source_pdf = str(pdf_path)
    if not meta.device:
        meta.device = self.device

    _dump(out_dir, result)
    logger.info("[%s] done in %.2fs, output=%s", self.name, duration, out_dir)
    return result
|
|
190
|
+
|
|
191
|
+
# ---------- 实现 ----------
|
|
192
|
+
def _process(self, pdf_path: Path, work_dir: Path) -> PipelineOutput:
    """Extract text from ``pdf_path``, trying the cheapest usable source first.

    Order: native text layer (when preferred and usable), DashScope VL OCR
    (when ``prefer_vl_ocr``), the MinerU CLI, the lite MinerU retry, and
    finally VL OCR as a fallback.

    Args:
        pdf_path: Absolute path of the source PDF.
        work_dir: Output directory; MinerU writes below ``_mineru_raw``.

    Returns:
        The assembled ``PipelineOutput``.

    Raises:
        RuntimeError: MinerU timed out or failed and no fallback produced
            text, or MinerU reported success but its output was not found.
    """
    page_infos = inspect_pdf_pages(pdf_path)
    text_stats = analyze_text_layer(pdf_path)

    if self.prefer_text_layer and is_text_layer_usable(text_stats):
        logger.info(
            "[pdf] using native text layer: chars=%s printable=%.2f cjk=%.2f",
            text_stats.non_ws_chars,
            text_stats.printable_ratio,
            text_stats.cjk_ratio,
        )
        text = extract_text_layer(pdf_path)
        return _output_from_text(
            pdf_path=pdf_path,
            page_infos=page_infos,
            text=text,
            device=self.device,
            source="native-text-layer",
            notes=_text_stats_note("backend=native-text-layer", text_stats),
        )

    logger.info(
        "[pdf] native text layer unusable: chars=%s printable=%.2f cjk=%.2f control=%.2f",
        text_stats.non_ws_chars,
        text_stats.printable_ratio,
        text_stats.cjk_ratio,
        text_stats.control_ratio,
    )

    # 1) Preview images (independent of MinerU's own artifacts, for review).
    # The old implementation rendered the whole PDF before OCR, producing
    # many PNGs and memory pressure even when OCR later failed. OCR now runs
    # first and previews are rendered afterwards on a best-effort basis.
    preview_dir = work_dir / PREVIEW_DIR

    if self.prefer_vl_ocr:
        reason = (
            "native text layer unusable"
            if self.prefer_text_layer
            else "native text layer skipped"
        )
        vl_first = self._try_vl_ocr(
            pdf_path,
            work_dir,
            page_infos,
            text_stats,
            f"{reason}; CONTRACT_ARCHIVE_VL_OCR_FIRST enabled",
            source="vl-ocr-first",
        )
        if vl_first is not None:
            return vl_first

    # 2) Invoke the mineru CLI.
    mineru_out = work_dir / "_mineru_raw"
    mineru_out.mkdir(exist_ok=True)
    env = _mineru_subprocess_env(os.environ)
    env.setdefault("MINERU_MODEL_SOURCE", "modelscope")  # faster inside mainland China

    cmd = [
        _resolve_mineru(),
        "-p",
        str(pdf_path),
        "-o",
        str(mineru_out),
        "-b",
        self.backend,
    ]
    logger.info("[mineru-cli] running: %s", " ".join(cmd))
    # Explicit timeout (default 1800s, tunable through
    # CONTRACT_ARCHIVE_MINERU_TIMEOUT_S): MinerU runs deep models and a
    # malformed or huge PDF can hang forever, which would freeze a serial
    # batch ingest. A timeout becomes a RuntimeError so run() records the
    # failure and ingest can classify it.
    timeout_s = get_timeout_s("CONTRACT_ARCHIVE_MINERU_TIMEOUT_S", 1800.0)
    used_lite_retry = False
    try:
        proc = _run_mineru_cli(cmd, env, timeout_s)
    except subprocess.TimeoutExpired as e:
        reason = _mineru_timeout_reason(pdf_path, timeout_s, text_stats)
        retry = self._try_mineru_lite_retry(
            pdf_path, mineru_out, env, timeout_s, reason
        )
        if retry is None:
            fallback = self._try_vl_ocr(
                pdf_path, work_dir, page_infos, text_stats, reason
            )
            if fallback is not None:
                return fallback
            raise RuntimeError(reason) from e
        proc = retry
        used_lite_retry = True

    if proc.returncode != 0:
        retry = self._try_mineru_lite_retry(
            pdf_path,
            mineru_out,
            env,
            timeout_s,
            _mineru_failure_reason(proc),
        )
        if retry is not None:
            proc = retry
            used_lite_retry = True

    if proc.returncode != 0:
        logger.error("[mineru-cli] stdout=%s", proc.stdout[-2000:])
        logger.error("[mineru-cli] stderr=%s", proc.stderr[-2000:])
        failure = _mineru_failure_reason(proc)
        fallback = self._try_vl_ocr(
            pdf_path, work_dir, page_infos, text_stats, failure
        )
        if fallback is not None:
            return fallback
        raise RuntimeError(failure)

    # 3) Locate the directory MinerU actually wrote to.
    result_dir = _locate_mineru_result(mineru_out, pdf_path.stem)
    if result_dir is None:
        raise RuntimeError(
            f"MinerU output not found under {mineru_out}; stdout={proc.stdout[-500:]}"
        )

    # 4) Read markdown / content_list.json.
    # MinerU 3.x names them {stem}.md and {stem}_content_list.json.
    stem = pdf_path.stem
    candidates_md = [result_dir / f"{stem}.md", result_dir / "full.md"]
    candidates_cl = [
        result_dir / f"{stem}_content_list.json",
        result_dir / "content_list.json",
    ]
    md_path = next((p for p in candidates_md if p.exists()), candidates_md[0])
    cl_path = next((p for p in candidates_cl if p.exists()), candidates_cl[0])
    markdown = md_path.read_text(encoding="utf-8") if md_path.exists() else ""
    content_list = (
        json.loads(cl_path.read_text(encoding="utf-8")) if cl_path.exists() else []
    )

    # Per-page PDF point sizes are needed to undo bbox normalisation; no
    # full-page rendering is required for that.
    page_dims = {p.page_index: (p.width_pt, p.height_pt) for p in page_infos}

    layout_blocks, tables, raw_text = _normalize_mineru(content_list, page_dims)
    # MinerU's markdown backslash-escapes _ * [] () and the like (e.g.
    # "\_2027年\_6月"); strip the escapes before rule/LLM extraction so the
    # extractors can match the text.
    raw_text = _unescape_markdown(raw_text)
    markdown_for_extract = _unescape_markdown(markdown)
    sections = _split_sections(markdown_for_extract)

    structured = StructuredDocument(
        title=sections[0].title if sections else None,
        document_type=None,
        language="zh",
        pages=len(page_infos),
        sections=sections,
        tables=tables,
    )

    # Report the model source actually handed to the subprocess (the
    # environment may override the default) and flag output that came from
    # the lite retry rather than from the configured backend.
    notes = f"backend={self.backend}, model_source={env['MINERU_MODEL_SOURCE']}"
    if used_lite_retry:
        notes += ", lite_retry=true"

    meta = PipelineMeta(
        pipeline_name="mineru",
        pipeline_version=_mineru_version(),
        model="MinerU",
        device=self.device,
        source_pdf=str(pdf_path),
        started_at=datetime.now(),
        finished_at=datetime.now(),
        duration_seconds=0.0,
        notes=notes,
    )

    # Render previews only after MinerU succeeded; a rendering failure must
    # not overturn the finished OCR result.
    pages = _render_previews_safe(pdf_path, preview_dir, self.dpi)

    # 5) Copy MinerU's own images into the preview directory. This is also
    # best effort, for the same reason as preview rendering.
    mineru_images = result_dir / "images"
    if mineru_images.is_dir():
        dst = preview_dir / "mineru_images"
        try:
            dst.mkdir(parents=True, exist_ok=True)
            for f in mineru_images.iterdir():
                if f.is_file():
                    shutil.copy(f, dst / f.name)
        except OSError as exc:
            logger.warning("[mineru] copying images failed; continue without them: %s", exc)

    return PipelineOutput(
        meta=meta,
        raw_text=raw_text,
        markdown=markdown,
        layout=layout_blocks,
        structured=structured,
        preview_image_paths=[str(p.image_path) for p in pages],
    )
|
|
374
|
+
|
|
375
|
+
def _try_mineru_lite_retry(
|
|
376
|
+
self,
|
|
377
|
+
pdf_path: Path,
|
|
378
|
+
mineru_out: Path,
|
|
379
|
+
env: dict[str, str],
|
|
380
|
+
timeout_s: float,
|
|
381
|
+
first_failure: str,
|
|
382
|
+
) -> subprocess.CompletedProcess | None:
|
|
383
|
+
"""Retry slow/fragile PDFs with a cheaper MinerU OCR profile before VL fallback."""
# Returns None immediately when self.lite_retry is falsy.
# When the retry runs, mineru_out is removed and recreated first, then the CLI is invoked as
#   <mineru> -p <pdf_path> -o <mineru_out> -b pipeline -m ocr -l ch_lite -f false -t false --image-analysis false
# with the same env and timeout_s as the first attempt.
# The CompletedProcess is returned only for exit code 0; a timeout or a
# non-zero exit is logged and yields None.
# NOTE(review): this rendering of the file has lost its indentation, so it is
# not clear how much of the body the `if self.backend == "pipeline":` test
# below guards (only the warning, or the whole retry) -- confirm against the
# packaged source before restructuring this method.
|
|
384
|
+
if not self.lite_retry:
|
|
385
|
+
return None
|
|
386
|
+
if self.backend == "pipeline":
|
|
387
|
+
logger.warning("[mineru-lite] retrying after failure: %s", first_failure)
|
|
388
|
+
if mineru_out.exists():
|
|
389
|
+
shutil.rmtree(mineru_out)
|
|
390
|
+
mineru_out.mkdir(exist_ok=True)
|
|
391
|
+
cmd = [
|
|
392
|
+
_resolve_mineru(),
|
|
393
|
+
"-p",
|
|
394
|
+
str(pdf_path),
|
|
395
|
+
"-o",
|
|
396
|
+
str(mineru_out),
|
|
397
|
+
"-b",
|
|
398
|
+
"pipeline",
|
|
399
|
+
"-m",
|
|
400
|
+
"ocr",
|
|
401
|
+
"-l",
|
|
402
|
+
"ch_lite",
|
|
403
|
+
"-f",
|
|
404
|
+
"false",
|
|
405
|
+
"-t",
|
|
406
|
+
"false",
|
|
407
|
+
"--image-analysis",
|
|
408
|
+
"false",
|
|
409
|
+
]
|
|
410
|
+
try:
|
|
411
|
+
proc = _run_mineru_cli(cmd, env, timeout_s)
|
|
412
|
+
except subprocess.TimeoutExpired:
|
|
413
|
+
logger.warning("[mineru-lite] retry timed out after %.0fs", timeout_s)
|
|
414
|
+
return None
|
|
415
|
+
if proc.returncode == 0:
|
|
416
|
+
logger.info("[mineru-lite] retry succeeded")
|
|
417
|
+
return proc
|
|
418
|
+
logger.warning("[mineru-lite] retry failed: %s", _mineru_failure_reason(proc))
|
|
419
|
+
return None
|
|
420
|
+
|
|
421
|
+
def _try_vl_ocr(
    self,
    pdf_path: Path,
    work_dir: Path,
    page_infos: list[PdfPageInfo],
    text_stats: TextLayerStats,
    reason: str,
    source: str = "vl-ocr-fallback",
) -> PipelineOutput | None:
    """Run DashScope VL OCR on the rendered pages, when permitted.

    Args:
        pdf_path: Source PDF.
        work_dir: Output directory; pages are rendered into its preview folder.
        page_infos: Per-page info; its length is checked against the page cap.
        text_stats: Text-layer statistics, recorded in the output notes.
        reason: Why VL OCR is being attempted (logged and recorded).
        source: Label stored as the output's model/backend name.

    Returns:
        A ``PipelineOutput`` built from the OCR text, or ``None`` when VL OCR
        is disabled, the PDF exceeds ``vl_ocr_max_pages``, no page could be
        rendered, or OCR returned no text.
    """
    if not self.allow_vl_fallback:
        return None

    page_count = len(page_infos)
    if page_count > self.vl_ocr_max_pages:
        logger.warning(
            "[vl-ocr] skip fallback: pages=%s exceeds max=%s",
            page_count,
            self.vl_ocr_max_pages,
        )
        return None

    logger.info("[%s] trying DashScope VL OCR: %s", source, reason)
    rendered = _render_previews_safe(
        pdf_path, work_dir / PREVIEW_DIR, self.vl_ocr_dpi
    )
    if not rendered:
        return None

    text = ocr_pdf_images_with_vl([page.image_path for page in rendered])
    if not text:
        return None

    note = _text_stats_note(f"backend={source}, reason={reason}", text_stats)
    return _output_from_text(
        pdf_path=pdf_path,
        page_infos=page_infos,
        text=text,
        device=self.device,
        source=source,
        notes=note,
        preview_image_paths=[str(page.image_path) for page in rendered],
    )
|
|
461
|
+
|
|
462
|
+
|
|
463
|
+
# ---------- 落盘工具 ----------
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
def _dump(out_dir: Path, result: PipelineOutput) -> None:
    """Write every artifact of ``result`` into ``out_dir`` as UTF-8 text.

    Files are written in a fixed order (raw text, markdown, structured
    document, layout, pipeline meta). Each payload is serialised right before
    its own write.
    """
    writers = (
        (FILE_RAW_TEXT, lambda: result.raw_text),
        (FILE_MARKDOWN, lambda: result.markdown),
        (
            FILE_STRUCTURED,
            lambda: result.structured.model_dump_json(indent=2, exclude_none=False),
        ),
        (
            FILE_LAYOUT,
            lambda: json.dumps(
                [blk.model_dump() for blk in result.layout],
                ensure_ascii=False,
                indent=2,
            ),
        ),
        (FILE_PIPELINE_META, lambda: result.meta.model_dump_json(indent=2)),
    )
    for filename, render in writers:
        (out_dir / filename).write_text(render(), encoding="utf-8")
|
|
485
|
+
|
|
486
|
+
|
|
487
|
+
def _dump_failure(
    out_dir: Path, pdf_path: Path, started: datetime, err: str
) -> None:
    """Write a pipeline meta file describing a failed run.

    Args:
        out_dir: Directory that receives the meta file.
        pdf_path: Source PDF of the failed run.
        started: When the run started.
        err: Error text, stored in the notes as ``FAILED: <err>``.
    """
    # Read the clock once so finished_at and duration_seconds agree.
    finished = datetime.now()
    meta = PipelineMeta(
        pipeline_name="mineru",
        source_pdf=str(pdf_path),
        started_at=started,
        finished_at=finished,
        duration_seconds=(finished - started).total_seconds(),
        notes=f"FAILED: {err}",
    )
    (out_dir / FILE_PIPELINE_META).write_text(
        meta.model_dump_json(indent=2), encoding="utf-8"
    )
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
# ---------- 解析 / 归一化 ----------
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
def _locate_mineru_result(out_root: Path, stem: str) -> Path | None:
    """Find the directory below ``out_root`` that holds MinerU's markdown.

    MinerU 3.x writes to ``<out>/<stem>/<auto|vlm>/`` and names the main
    markdown ``{stem}.md`` (not ``full.md``; that name is still accepted for
    the older 2.x layout).

    Args:
        out_root: Directory passed to the CLI as ``-o``.
        stem: File stem of the source PDF.

    Returns:
        The directory containing the markdown, or ``None`` when none exists.
    """
    expected_md = (f"{stem}.md", "full.md")  # full.md: legacy 2.x layout
    conventional = (
        out_root / stem / "auto",
        out_root / stem / "vlm",
        out_root / stem,
    )
    for candidate in conventional:
        if candidate.exists() and any(
            (candidate / name).exists() for name in expected_md
        ):
            return candidate

    # Fallback: search recursively for {stem}.md. The file name is compared
    # literally rather than used as a glob pattern, because PDF stems often
    # contain glob metacharacters such as "[" "]" "*" "?".
    target = f"{stem}.md"
    matches = [p for p in out_root.rglob("*.md") if p.name == target]
    if not matches:
        return None
    # Prefer the shallowest hit and break ties by path so the result does not
    # depend on directory iteration order.
    best = min(matches, key=lambda p: (len(p.parts), str(p)))
    return best.parent
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
def _resolve_mineru() -> str:
    """Return the path of the ``mineru`` executable.

    Why a bare ``"mineru"`` cannot simply be handed to subprocess:
    contract-archive is installed by ``uv tool install`` into an isolated
    venv, with mineru installed into the same venv as an extra. That venv's
    ``bin/`` directory is not on the user's shell PATH (the tool is launched
    through a symlink in ``~/.local/bin``), and child processes inherit the
    shell PATH, so resolving ``mineru`` through PATH fails.

    Strategy (deterministic first):
      1. Look for a sibling of the running interpreter; under uv tool or an
         activated venv, mineru sits in the same ``bin/`` as python.
      2. Fall back to ``shutil.which`` for development setups where mineru
         was put on PATH by hand.

    Returns:
        Path of the mineru executable.

    Raises:
        FileNotFoundError: mineru was not found; the message carries
            installation instructions.
    """
    beside_interpreter = Path(sys.executable).parent / "mineru"
    location = (
        str(beside_interpreter)
        if beside_interpreter.exists()
        else shutil.which("mineru")
    )
    if location:
        return location
    raise FileNotFoundError(
        "找不到 mineru 可执行文件。它随 contract-archive-cli 的 mineru extra 一起安装:\n"
        " uv tool install 'contract-archive-cli[mineru]' # 首次安装\n"
        " uv tool install 'contract-archive-cli[mineru]' --reinstall # 已装过 contract-archive-cli 但缺 mineru\n"
        "开发环境:uv sync --extra mineru"
    )
|
|
557
|
+
|
|
558
|
+
|
|
559
|
+
# Successful `mineru --version` probes, keyed by executable path, so a batch
# ingest does not spawn one extra subprocess per document.
_MINERU_VERSION_CACHE: dict[str, str] = {}


def _mineru_version() -> str:
    """Return the output of ``mineru --version``, or ``"unknown"``.

    The probe runs with a short 30s timeout: it is only a probe, so waiting
    on a hung process is pointless. Any failure (executable not found,
    timeout, OS error) is reported as ``"unknown"``. Only probes that exit
    with code 0 are cached; failures are retried on the next call.
    """
    try:
        exe = _resolve_mineru()
        cached = _MINERU_VERSION_CACHE.get(exe)
        if cached is not None:
            return cached
        proc = subprocess.run(
            [exe, "--version"],
            capture_output=True, text=True, check=False, timeout=30,
        )
        version = (proc.stdout or proc.stderr).strip()
    except Exception:
        # TimeoutExpired is an Exception subclass and lands here as well.
        return "unknown"
    if proc.returncode == 0:
        _MINERU_VERSION_CACHE[exe] = version
    return version
|
|
570
|
+
|
|
571
|
+
|
|
572
|
+
def _env_int(name: str, default: int) -> int:
    """Read a positive integer from environment variable ``name``.

    Returns ``default`` when the variable is unset, is not an integer, or is
    zero or negative.
    """
    raw = os.environ.get(name, "")
    try:
        parsed = int(raw)
    except ValueError:
        return default
    if parsed <= 0:
        return default
    return parsed
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
def _env_bool(name: str, default: bool) -> bool:
    """Read a boolean flag from environment variable ``name``.

    Returns ``default`` when the variable is unset, empty, or only
    whitespace. Otherwise the value is false for ``0``, ``false``, ``no`` or
    ``off`` (case-insensitive, surrounding whitespace ignored) and true for
    anything else.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    # Strip before the emptiness check so a whitespace-only value counts as
    # "not set" instead of silently enabling the flag.
    value = raw.strip().lower()
    if not value:
        return default
    return value not in {"0", "false", "no", "off"}
|
|
585
|
+
|
|
586
|
+
|
|
587
|
+
def _mineru_subprocess_env(source: Mapping[str, str]) -> dict[str, str]:
    """Build the environment passed to the MinerU CLI subprocess.

    Copies ``source`` without any ``DASHSCOPE_``-prefixed variable, then lets
    ``sanitize_no_proxy_for_httpx`` adjust the copy in place.
    """
    # The MinerU subprocess has no use for DashScope credentials; dropping
    # them avoids handing a secret to a child process for no reason.
    env: dict[str, str] = {}
    for key, value in source.items():
        if key.startswith("DASHSCOPE_"):
            continue
        env[key] = value

    # MinerU 3.x starts a local FastAPI server and the CLI talks to 127.0.0.1
    # through httpx. Broad NO_PROXY values with CIDR or IPv6 entries can make
    # httpx.URLPattern reject the environment before the local server is used.
    sanitize_no_proxy_for_httpx(env)
    return env
|
|
597
|
+
|
|
598
|
+
|
|
599
|
+
def _run_mineru_cli(
    cmd: list[str],
    env: dict[str, str],
    timeout_s: float,
) -> subprocess.CompletedProcess:
    """Run the MinerU CLI, capturing its output through temporary files.

    MinerU CLI starts a local mineru-api child process.
    subprocess.run(timeout=...) kills only the direct child, which can leave
    that server alive, so the run is delegated to
    ``_run_mineru_cli_with_files``, which handles the timeout itself. This
    wrapper only owns the two temporary files that receive stdout and stderr.

    Raises:
        subprocess.TimeoutExpired: Propagated from the delegate.
    """
    with tempfile.TemporaryFile(
        "w+", encoding="utf-8", errors="replace"
    ) as stdout_f, tempfile.TemporaryFile(
        "w+", encoding="utf-8", errors="replace"
    ) as stderr_f:
        return _run_mineru_cli_with_files(cmd, env, timeout_s, stdout_f, stderr_f)
|
|
618
|
+
|
|
619
|
+
|
|
620
|
+
def _run_mineru_cli_with_files(
    cmd: list[str],
    env: dict[str, str],
    timeout_s: float,
    stdout_f,
    stderr_f,
) -> subprocess.CompletedProcess:
    """Run ``cmd`` in its own session and wait up to ``timeout_s`` seconds.

    Args:
        cmd: Command line to execute.
        env: Environment for the child process.
        timeout_s: Maximum time to wait for the child.
        stdout_f: Open text file that receives the child's stdout.
        stderr_f: Open text file that receives the child's stderr.

    Returns:
        A ``CompletedProcess`` carrying the exit code and the captured output.

    Raises:
        subprocess.TimeoutExpired: The child did not finish in time. Its
            process tree is terminated (SIGTERM, then SIGKILL after 5s) and
            the captured output is attached to the exception.
    """
    proc = subprocess.Popen(
        cmd,
        env=env,
        stdout=stdout_f,
        stderr=stderr_f,
        text=True,
        start_new_session=True,
    )
    try:
        proc.wait(timeout=timeout_s)
    except subprocess.TimeoutExpired as e:
        _terminate_process_tree(proc.pid)
        try:
            proc.wait(timeout=5)
        except subprocess.TimeoutExpired:
            _kill_process_tree(proc.pid)
            proc.wait()
        e.stdout = _read_tempfile(stdout_f)
        e.stderr = _read_tempfile(stderr_f)
        raise
    except BaseException:
        # The child runs in its own session, so a Ctrl-C delivered to our
        # terminal never reaches it. Without this cleanup an interrupted
        # ingest would leave MinerU and its local server running.
        _kill_process_tree(proc.pid)
        proc.wait()
        raise
    stdout = _read_tempfile(stdout_f)
    stderr = _read_tempfile(stderr_f)
    return subprocess.CompletedProcess(cmd, proc.returncode, stdout, stderr)
|
|
652
|
+
|
|
653
|
+
|
|
654
|
+
def _terminate_process_group(pid: int) -> None:
    """Send SIGTERM to process group ``pid``; a missing group is ignored."""
    try:
        os.killpg(pid, signal.SIGTERM)
    except ProcessLookupError:
        return
|
|
659
|
+
|
|
660
|
+
|
|
661
|
+
def _kill_process_group(pid: int) -> None:
    """Send SIGKILL to process group ``pid``; a missing group is ignored."""
    try:
        os.killpg(pid, signal.SIGKILL)
    except ProcessLookupError:
        return
|
|
666
|
+
|
|
667
|
+
|
|
668
|
+
def _terminate_process_tree(pid: int) -> None:
    """Send SIGTERM to ``pid`` and every descendant that can be discovered."""
    _signal_process_tree(pid, sig=signal.SIGTERM)
|
|
670
|
+
|
|
671
|
+
|
|
672
|
+
def _kill_process_tree(pid: int) -> None:
    """Send SIGKILL to ``pid`` and every descendant that can be discovered."""
    _signal_process_tree(pid, sig=signal.SIGKILL)
|
|
674
|
+
|
|
675
|
+
|
|
676
|
+
def _signal_process_tree(pid: int, sig: signal.Signals) -> None:
    """Deliver ``sig`` to the process tree rooted at ``pid``.

    MinerU's local fast_api child may start a new process group, so
    signalling the root's group alone is not enough. The tree is walked
    first; then each member's process group is signalled, and afterwards each
    member pid, both in reverse discovery order.
    """
    targets = list(reversed(_descendant_pids(pid)))
    for member in targets:
        _signal_process_group(member, sig)
    for member in targets:
        _signal_pid(member, sig)
|
|
684
|
+
|
|
685
|
+
|
|
686
|
+
def _descendant_pids(pid: int) -> list[int]:
    """Return ``pid`` followed by every descendant found through ``/proc``.

    Children are read from ``/proc/<pid>/task/<tid>/children`` for every
    thread of each process, because a child is listed under the thread that
    created it, not necessarily the main thread. Where those files are
    unavailable (no procfs, process already gone) the result is ``[pid]``.

    Args:
        pid: Root process id.

    Returns:
        Process ids in discovery order, starting with ``pid``, no duplicates.
    """

    def child_pids(parent: int) -> list[int]:
        # Collect the children recorded for every thread of `parent`.
        try:
            task_dirs = sorted(Path(f"/proc/{parent}/task").iterdir())
        except OSError:
            return []
        found: list[int] = []
        for task_dir in task_dirs:
            try:
                listing = (task_dir / "children").read_text(encoding="ascii")
                found.extend(int(part) for part in listing.split())
            except (OSError, ValueError):
                # The thread exited meanwhile, or the content is unreadable:
                # skip this thread rather than abandon the whole walk.
                continue
        return found

    out: list[int] = []
    stack = [pid]
    seen: set[int] = set()
    while stack:
        current = stack.pop()
        if current in seen:
            continue
        seen.add(current)
        out.append(current)
        stack.extend(child_pids(current))
    return out
|
|
707
|
+
|
|
708
|
+
|
|
709
|
+
def _signal_process_group(pid: int, sig: signal.Signals) -> None:
    """Send ``sig`` to the process group that ``pid`` belongs to.

    A process that no longer exists, or one we are not allowed to signal, is
    ignored.
    """
    try:
        group = os.getpgid(pid)
        os.killpg(group, sig)
    except (ProcessLookupError, PermissionError):
        return
|
|
716
|
+
|
|
717
|
+
|
|
718
|
+
def _signal_pid(pid: int, sig: signal.Signals) -> None:
    """Send ``sig`` to the single process ``pid``.

    A process that no longer exists, or one we are not allowed to signal, is
    ignored.
    """
    try:
        os.kill(pid, sig)
    except (ProcessLookupError, PermissionError):
        return
|
|
725
|
+
|
|
726
|
+
|
|
727
|
+
def _read_tempfile(handle) -> str:
    """Return the full content of an open read/write text file.

    The handle is flushed and rewound to the start before reading, so the
    result covers everything written so far.
    """
    handle.flush()
    handle.seek(0)
    content = handle.read()
    return content
|
|
731
|
+
|
|
732
|
+
|
|
733
|
+
def _mineru_failure_reason(proc: subprocess.CompletedProcess) -> str:
    """Summarise a failed MinerU run as a one-line message.

    The last non-blank line of stderr is used, falling back to stdout when
    stderr is empty or only whitespace. Putting that line into the message
    lets failure logs and the stored error show the real cause without
    anyone having to go back to the console output.

    Args:
        proc: Finished process with captured ``stdout`` / ``stderr``.

    Returns:
        ``"mineru CLI failed (rc=<code>): <last line>"``; the last line is
        ``"no stderr captured"`` when neither stream has content.
    """
    # Strip each stream before choosing, so a stderr consisting only of
    # newlines does not hide a real message on stdout.
    text = (proc.stderr or "").strip() or (proc.stdout or "").strip()
    tail = text.splitlines()
    reason = tail[-1] if tail else "no stderr captured"
    return f"mineru CLI failed (rc={proc.returncode}): {reason}"
|
|
739
|
+
|
|
740
|
+
|
|
741
|
+
def _render_previews_safe(pdf_path: Path, preview_dir: Path, dpi: int) -> list:
    """Render preview images, returning an empty list when rendering fails.

    Any exception from ``render_pdf_to_images`` is logged as a warning and
    swallowed, so a preview problem never aborts the caller.
    """
    try:
        rendered = render_pdf_to_images(pdf_path, preview_dir, dpi=dpi)
    except Exception as e:  # noqa: BLE001 - preview images are useful but not primary OCR output
        logger.warning("[pdf] preview render failed; continue without previews: %s", e)
        return []
    return rendered
|
|
747
|
+
|
|
748
|
+
|
|
749
|
+
def _mineru_timeout_reason(
    pdf_path: Path, timeout_s: float, text_stats: TextLayerStats
) -> str:
    """Build the message reported when MinerU exceeds its timeout.

    The message names the PDF and the timeout and points at
    ``CONTRACT_ARCHIVE_MINERU_TIMEOUT_S``. When the PDF has any text-layer
    characters, the text-layer statistics are appended.
    """
    pieces = [
        f"mineru 超时(>{timeout_s:.0f}s): {pdf_path.name};",
        "确需处理可调大 CONTRACT_ARCHIVE_MINERU_TIMEOUT_S",
    ]
    if text_stats.non_ws_chars:
        pieces.append("。PDF 有文字层但质量不可用")
        pieces.append(
            f"(chars={text_stats.non_ws_chars}, printable={text_stats.printable_ratio:.2f}, "
        )
        pieces.append(
            f"cjk={text_stats.cjk_ratio:.2f}, control={text_stats.control_ratio:.2f})"
        )
    return "".join(pieces)
|
|
763
|
+
|
|
764
|
+
|
|
765
|
+
def _text_stats_note(prefix: str, stats: TextLayerStats) -> str:
    """Format ``prefix`` followed by the text-layer statistics as one note."""
    counts = f"pages={stats.pages}, chars={stats.non_ws_chars}"
    ratios = ", ".join(
        (
            f"printable_ratio={stats.printable_ratio:.3f}",
            f"cjk_ratio={stats.cjk_ratio:.3f}",
            f"control_ratio={stats.control_ratio:.3f}",
        )
    )
    return f"{prefix}; {counts}, {ratios}"
|
|
771
|
+
|
|
772
|
+
|
|
773
|
+
def _output_from_text(
    *,
    pdf_path: Path,
    page_infos: list[PdfPageInfo],
    text: str,
    device: str,
    source: str,
    notes: str,
    preview_image_paths: list[str] | None = None,
) -> PipelineOutput:
    """Wrap already-available text in a ``PipelineOutput``.

    The text is passed through ``_unescape_markdown`` and split into sections
    by markdown headings. When no heading is found, the whole text becomes a
    single level-1 section spanning every page, titled with a guessed title
    or, failing that, the PDF file stem.

    The resulting meta uses pipeline name "mineru", records ``source`` as the
    model, and sets identical start/finish timestamps with a duration of 0.0.
    Layout and tables are left empty and the language is fixed to "zh".
    """
    page_count = len(page_infos)
    cleaned = _unescape_markdown(text)

    parsed = _split_sections(cleaned)
    if not parsed:
        # No headings at all: fall back to one section covering the document.
        fallback_title = _guess_title(cleaned) or pdf_path.stem
        whole_document = Section(
            level=1,
            title=fallback_title,
            text=cleaned.strip(),
            page_start=0,
            page_end=max(0, page_count - 1),
        )
        parsed = [whole_document]

    document = StructuredDocument(
        title=parsed[0].title if parsed else None,
        document_type=None,
        language="zh",
        pages=page_count,
        sections=parsed,
        tables=[],
    )

    stamp = datetime.now()
    meta = PipelineMeta(
        pipeline_name="mineru",
        pipeline_version=_mineru_version(),
        model=source,
        device=device,
        source_pdf=str(pdf_path),
        started_at=stamp,
        finished_at=stamp,
        duration_seconds=0.0,
        notes=notes,
    )
    return PipelineOutput(
        meta=meta,
        raw_text=cleaned,
        markdown=cleaned,
        layout=[],
        structured=document,
        preview_image_paths=preview_image_paths or [],
    )
|
|
823
|
+
|
|
824
|
+
|
|
825
|
+
def _guess_title(text: str) -> str | None:
    """Return the first line that looks like a title, or ``None``.

    Each line is stripped of surrounding spaces, tabs and ``#`` characters;
    the first one whose remaining length is between 4 and 80 characters
    (inclusive) is returned.
    """
    candidates = (raw.strip(" #\t") for raw in text.splitlines())
    return next((c for c in candidates if 4 <= len(c) <= 80), None)
|
|
831
|
+
|
|
832
|
+
|
|
833
|
+
def _normalize_mineru(
    content_list: list[dict],
    page_dims: dict[int, tuple[float, float]],
) -> tuple[list[LayoutBlock], list[Table], str]:
    """
    Normalize a MinerU 3.x content_list.json into the unified schema.

    Key points (differences from 2.x):
    - **There is no "title" type**; a heading is type:"text" with "text_level" >= 1
    - bbox is **normalized to 0-1000 integers** and must be multiplied by the real
      page width/height to convert back to PDF points
    - `table_caption` / `image_caption` are **list[str]** and need to be joined

    Args:
        content_list: Items loaded from content_list.json. The keys read here are
            "page_idx", "bbox", "type", "text_level", "text", "table_caption",
            "image_caption" and "table_body".
        page_dims: Maps a page index to its (width, height), used to scale bboxes.

    Returns:
        A ``(blocks, tables, raw_text)`` tuple: one layout block per item that has
        a 4-value bbox, one table per item of type "table", and the collected
        text/caption lines joined with newlines.
    """
    blocks: list[LayoutBlock] = []
    tables: list[Table] = []
    raw_lines: list[str] = []

    for i, item in enumerate(content_list):
        page = item.get("page_idx", 0)
        bbox_raw = item.get("bbox") or []
        bbox = None
        if len(bbox_raw) == 4:
            page_w, page_h = page_dims.get(page, (595.0, 841.0))  # fall back to A4 when the page size is unknown
            # 0-1000 normalized coordinates -> PDF points
            bbox = BBox(
                page=page,
                x0=float(bbox_raw[0]) * page_w / 1000.0,
                y0=float(bbox_raw[1]) * page_h / 1000.0,
                x1=float(bbox_raw[2]) * page_w / 1000.0,
                y1=float(bbox_raw[3]) * page_h / 1000.0,
            )

        item_type = item.get("type", "text")
        text_level = item.get("text_level") or 0
        text = item.get("text", "") or ""

        # Heading detection: type=text + text_level>=1
        if item_type == "text" and text_level >= 1:
            block_type = "title"
        else:
            # Unknown MinerU types are mapped to "other".
            block_type = _MINERU_TYPE_MAP.get(item_type, "other")

        # caption list[str] -> str
        caption = item.get("table_caption") or item.get("image_caption") or []
        if isinstance(caption, list):
            caption = " ".join(str(c) for c in caption if c)

        # Only items with a usable bbox become layout blocks; reading order is
        # the item's index in content_list.
        if bbox:
            blocks.append(
                LayoutBlock(
                    bbox=bbox,
                    text=text or caption or "",
                    block_type=block_type,  # type: ignore[arg-type]
                    reading_order=i,
                )
            )
        # The item's own text wins over its caption for the raw text.
        if text:
            raw_lines.append(text)
        elif caption:
            raw_lines.append(caption)

        if item_type == "table":
            tables.append(
                Table(
                    page=page,
                    bbox=bbox,
                    html=item.get("table_body", ""),
                    caption=caption or None,
                )
            )

    return blocks, tables, "\n".join(raw_lines)
|
|
904
|
+
|
|
905
|
+
|
|
906
|
+
def _unescape_markdown(text: str) -> str:
    """
    Strip the backslash escapes MinerU adds in markdown, then blank out underscores.

    A backslash is dropped when it directly precedes one of
    ``_ * [ ] ( ) # + - . !`` or a backtick. Afterwards every run of
    underscores is replaced by a single space, which removes the
    fill-in-the-blank marks that OCR renders as underscores:
        '甲方于_2027年_6月_30日' -> '甲方于 2027年 6月 30日'
    """
    import re as _re

    without_escapes = _re.sub(r"\\([_*\[\]()#+\-.!`])", r"\1", text)
    # Every underscore run is treated as whitespace, whatever surrounds it.
    return _re.sub(r"_+", " ", without_escapes)
|
|
918
|
+
|
|
919
|
+
|
|
920
|
+
def _split_sections(md: str) -> list[Section]:
    """Split markdown into sections, one per ATX heading (``#`` to ``######``).

    Each section takes its level from the number of ``#`` characters, its
    title from the heading text, and its body from the lines up to the next
    heading (joined with newlines and stripped). Lines before the first
    heading are not part of any section. Page ranges are always set to 0.
    Returns an empty list when the text contains no heading.
    """
    import re

    heading = re.compile(r"^(#{1,6})\s+(.+?)\s*$")

    # (level, title, body lines) for every heading, in document order.
    found: list[tuple[int, str, list[str]]] = []
    body: list[str] = []  # lines seen before the first heading end up unused
    for line in md.splitlines():
        hit = heading.match(line)
        if hit is None:
            body.append(line)
            continue
        body = []
        found.append((len(hit.group(1)), hit.group(2), body))

    return [
        Section(
            level=level,
            title=title,
            text="\n".join(lines).strip(),
            page_start=0,
            page_end=0,
        )
        for level, title, lines in found
    ]
|