deepdoc-lib 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepdoc/README.md +122 -0
- deepdoc/README_zh.md +116 -0
- deepdoc/__init__.py +43 -0
- deepdoc/_version.py +34 -0
- deepdoc/common/__init__.py +52 -0
- deepdoc/common/config_utils.py +63 -0
- deepdoc/common/connection_utils.py +73 -0
- deepdoc/common/file_utils.py +19 -0
- deepdoc/common/misc_utils.py +44 -0
- deepdoc/common/model_store.py +369 -0
- deepdoc/common/settings.py +42 -0
- deepdoc/common/tiktoken_cache.py +84 -0
- deepdoc/common/token_utils.py +96 -0
- deepdoc/config.py +149 -0
- deepdoc/depend/find_codec.py +42 -0
- deepdoc/depend/nltk_manager.py +114 -0
- deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
- deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
- deepdoc/depend/prompts.py +35 -0
- deepdoc/depend/rag_tokenizer.py +578 -0
- deepdoc/depend/simple_cv_model.py +469 -0
- deepdoc/depend/surname.py +91 -0
- deepdoc/depend/timeout.py +73 -0
- deepdoc/depend/vision_llm_chunk.py +35 -0
- deepdoc/dict/README.md +19 -0
- deepdoc/dict/huqie.txt +555629 -0
- deepdoc/download_models.py +169 -0
- deepdoc/llm_adapter/__init__.py +15 -0
- deepdoc/llm_adapter/adapter.py +223 -0
- deepdoc/llm_adapter/utils.py +104 -0
- deepdoc/llm_adapter/vision.py +163 -0
- deepdoc/parser/__init__.py +42 -0
- deepdoc/parser/docling_parser.py +889 -0
- deepdoc/parser/docx_parser.py +150 -0
- deepdoc/parser/excel_parser.py +270 -0
- deepdoc/parser/figure_parser.py +182 -0
- deepdoc/parser/html_parser.py +221 -0
- deepdoc/parser/json_parser.py +179 -0
- deepdoc/parser/markdown_parser.py +321 -0
- deepdoc/parser/mineru_parser.py +646 -0
- deepdoc/parser/pdf_parser.py +1591 -0
- deepdoc/parser/ppt_parser.py +96 -0
- deepdoc/parser/resume/__init__.py +109 -0
- deepdoc/parser/resume/entities/__init__.py +15 -0
- deepdoc/parser/resume/entities/corporations.py +128 -0
- deepdoc/parser/resume/entities/degrees.py +44 -0
- deepdoc/parser/resume/entities/industries.py +712 -0
- deepdoc/parser/resume/entities/regions.py +789 -0
- deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
- deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
- deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
- deepdoc/parser/resume/entities/res/good_corp.json +911 -0
- deepdoc/parser/resume/entities/res/good_sch.json +595 -0
- deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
- deepdoc/parser/resume/entities/res/schools.csv +5713 -0
- deepdoc/parser/resume/entities/schools.py +91 -0
- deepdoc/parser/resume/step_one.py +189 -0
- deepdoc/parser/resume/step_two.py +692 -0
- deepdoc/parser/tcadp_parser.py +538 -0
- deepdoc/parser/txt_parser.py +64 -0
- deepdoc/parser/utils.py +33 -0
- deepdoc/vision/__init__.py +90 -0
- deepdoc/vision/layout_recognizer.py +481 -0
- deepdoc/vision/ocr.py +757 -0
- deepdoc/vision/operators.py +733 -0
- deepdoc/vision/postprocess.py +370 -0
- deepdoc/vision/recognizer.py +451 -0
- deepdoc/vision/seeit.py +87 -0
- deepdoc/vision/t_ocr.py +101 -0
- deepdoc/vision/t_recognizer.py +186 -0
- deepdoc/vision/table_structure_recognizer.py +617 -0
- deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
- deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
- deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
- deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
- deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
- deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
- scripts/download_models.py +10 -0
|
@@ -0,0 +1,646 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
import json
|
|
17
|
+
import logging
|
|
18
|
+
import os
|
|
19
|
+
import re
|
|
20
|
+
import sys
|
|
21
|
+
import tempfile
|
|
22
|
+
import threading
|
|
23
|
+
import zipfile
|
|
24
|
+
from dataclasses import dataclass
|
|
25
|
+
from io import BytesIO
|
|
26
|
+
from os import PathLike
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
from typing import Any, Callable, Optional
|
|
29
|
+
|
|
30
|
+
import numpy as np
|
|
31
|
+
import pdfplumber
|
|
32
|
+
import requests
|
|
33
|
+
from PIL import Image
|
|
34
|
+
from strenum import StrEnum
|
|
35
|
+
|
|
36
|
+
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
|
37
|
+
|
|
38
|
+
# Name under which a process-wide lock for pdfplumber is published.
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
# Register the lock once in ``sys.modules``; an already-registered lock
# (e.g. created by another module using the same key) is left untouched.
sys.modules.setdefault(LOCK_KEY_pdfplumber, threading.Lock())
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class MinerUContentType(StrEnum):
    """Values of the ``type`` field of a MinerU content-list block.

    ``MinerUParser._transfer_to_sections`` dispatches on these values to
    decide which fields of a block are turned into section text.
    """

    IMAGE = "image"
    TABLE = "table"
    TEXT = "text"
    EQUATION = "equation"
    CODE = "code"
    LIST = "list"
    DISCARDED = "discarded"  # dropped entirely by _transfer_to_sections
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# Mapping from human-readable language names to MinerU language codes.
# Several languages share one script-level code ('latin', 'east_slavic').
# ``MinerUParser.parse_pdf`` looks names up here and falls back to 'ch'
# when the name is not listed.
LANGUAGE_TO_MINERU_MAP = {
    'English': 'en',
    'Chinese': 'ch',
    'Traditional Chinese': 'chinese_cht',
    'Russian': 'east_slavic',
    'Ukrainian': 'east_slavic',
    'Indonesian': 'latin',
    'Spanish': 'latin',
    'Vietnamese': 'latin',
    'Japanese': 'japan',
    'Korean': 'korean',
    'Portuguese BR': 'latin',
    'German': 'latin',
    'French': 'latin',
    'Italian': 'latin',
    'Tamil': 'ta',
    'Telugu': 'te',
    'Kannada': 'ka',
    'Thai': 'th',
    'Greek': 'el',
    'Hindi': 'devanagari',
}
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class MinerUBackend(StrEnum):
    """MinerU processing backend options.

    The member value is the string sent as the ``backend`` form field of the
    MinerU ``/file_parse`` request.
    """

    PIPELINE = "pipeline"  # Traditional multimodel pipeline (default)
    VLM_TRANSFORMERS = "vlm-transformers"  # Vision-language model using HuggingFace Transformers
    VLM_MLX_ENGINE = "vlm-mlx-engine"  # Faster, requires Apple Silicon and macOS 13.5+
    VLM_VLLM_ENGINE = "vlm-vllm-engine"  # Local vLLM engine, requires local GPU
    VLM_VLLM_ASYNC_ENGINE = "vlm-vllm-async-engine"  # Asynchronous vLLM engine, new in MinerU API
    VLM_LMDEPLOY_ENGINE = "vlm-lmdeploy-engine"  # LMDeploy engine
    VLM_HTTP_CLIENT = "vlm-http-client"  # HTTP client for remote vLLM server (CPU only)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class MinerULanguage(StrEnum):
    """MinerU supported languages for OCR (pipeline backend only).

    The member value is the language code sent as the ``lang_list`` form
    field of the MinerU ``/file_parse`` request.
    """

    CH = "ch"  # Chinese
    CH_SERVER = "ch_server"  # Chinese (server)
    CH_LITE = "ch_lite"  # Chinese (lite)
    EN = "en"  # English
    KOREAN = "korean"  # Korean
    JAPAN = "japan"  # Japanese
    CHINESE_CHT = "chinese_cht"  # Chinese Traditional
    TA = "ta"  # Tamil
    TE = "te"  # Telugu
    KA = "ka"  # Kannada
    TH = "th"  # Thai
    EL = "el"  # Greek
    LATIN = "latin"  # Latin
    ARABIC = "arabic"  # Arabic
    EAST_SLAVIC = "east_slavic"  # East Slavic
    CYRILLIC = "cyrillic"  # Cyrillic
    DEVANAGARI = "devanagari"  # Devanagari
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class MinerUParseMethod(StrEnum):
    """MinerU PDF parsing methods (pipeline backend only).

    The member value is sent as the ``parse_method`` form field of the
    MinerU ``/file_parse`` request.
    """

    AUTO = "auto"  # Automatically determine the method based on the file type
    TXT = "txt"  # Use text extraction method
    OCR = "ocr"  # Use OCR method for image-based PDFs
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
@dataclass
class MinerUParseOptions:
    """Options for MinerU PDF parsing.

    Built by ``MinerUParser.parse_pdf`` and consumed by
    ``MinerUParser._run_mineru_api`` when assembling the request payload.
    """

    # Backend name forwarded as the "backend" form field.
    backend: MinerUBackend = MinerUBackend.PIPELINE
    lang: Optional[MinerULanguage] = None  # language for OCR (pipeline backend only)
    # Forwarded as the "parse_method" form field; also part of the output dir prefix.
    method: MinerUParseMethod = MinerUParseMethod.AUTO
    # When set, takes precedence over the parser's own mineru_server_url.
    server_url: Optional[str] = None
    # Stored for reference; not read by _run_mineru_api in this file.
    delete_output: bool = True
    # Section layout requested by the caller ("raw", "manual", "paper");
    # stored for reference; not read by _run_mineru_api in this file.
    parse_method: str = "raw"
    # Forwarded as the "formula_enable" / "table_enable" form fields.
    formula_enable: bool = True
    table_enable: bool = True
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class MinerUParser(RAGFlowPdfParser):
|
|
135
|
+
def __init__(self, mineru_path: str = "mineru", mineru_api: str = "", mineru_server_url: str = ""):
|
|
136
|
+
self.mineru_api = mineru_api.rstrip("/")
|
|
137
|
+
self.mineru_server_url = mineru_server_url.rstrip("/")
|
|
138
|
+
self.outlines = []
|
|
139
|
+
self.logger = logging.getLogger(self.__class__.__name__)
|
|
140
|
+
|
|
141
|
+
def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
|
|
142
|
+
self.logger.info(f"[MinerU] Extract zip: zip_path={zip_path}, extract_to={extract_to}, root_hint={root_dir}")
|
|
143
|
+
with zipfile.ZipFile(zip_path, "r") as zip_ref:
|
|
144
|
+
if not root_dir:
|
|
145
|
+
files = zip_ref.namelist()
|
|
146
|
+
if files and files[0].endswith("/"):
|
|
147
|
+
root_dir = files[0]
|
|
148
|
+
else:
|
|
149
|
+
root_dir = None
|
|
150
|
+
|
|
151
|
+
if not root_dir or not root_dir.endswith("/"):
|
|
152
|
+
self.logger.info(f"[MinerU] No root directory found, extracting all (root_hint={root_dir})")
|
|
153
|
+
zip_ref.extractall(extract_to)
|
|
154
|
+
return
|
|
155
|
+
|
|
156
|
+
root_len = len(root_dir)
|
|
157
|
+
for member in zip_ref.infolist():
|
|
158
|
+
filename = member.filename
|
|
159
|
+
if filename == root_dir:
|
|
160
|
+
self.logger.info("[MinerU] Ignore root folder...")
|
|
161
|
+
continue
|
|
162
|
+
|
|
163
|
+
path = filename
|
|
164
|
+
if path.startswith(root_dir):
|
|
165
|
+
path = path[root_len:]
|
|
166
|
+
|
|
167
|
+
full_path = os.path.join(extract_to, path)
|
|
168
|
+
if member.is_dir():
|
|
169
|
+
os.makedirs(full_path, exist_ok=True)
|
|
170
|
+
else:
|
|
171
|
+
os.makedirs(os.path.dirname(full_path), exist_ok=True)
|
|
172
|
+
with open(full_path, "wb") as f:
|
|
173
|
+
f.write(zip_ref.read(filename))
|
|
174
|
+
|
|
175
|
+
@staticmethod
|
|
176
|
+
def _is_http_endpoint_valid(url, timeout=5):
|
|
177
|
+
try:
|
|
178
|
+
response = requests.head(url, timeout=timeout, allow_redirects=True)
|
|
179
|
+
return response.status_code in [200, 301, 302, 307, 308]
|
|
180
|
+
except Exception:
|
|
181
|
+
return False
|
|
182
|
+
|
|
183
|
+
def check_installation(self, backend: str = "pipeline", server_url: Optional[str] = None) -> tuple[bool, str]:
|
|
184
|
+
reason = ""
|
|
185
|
+
|
|
186
|
+
valid_backends = ["pipeline", "vlm-http-client", "vlm-transformers", "vlm-vllm-engine", "vlm-mlx-engine", "vlm-vllm-async-engine", "vlm-lmdeploy-engine"]
|
|
187
|
+
if backend not in valid_backends:
|
|
188
|
+
reason = f"[MinerU] Invalid backend '{backend}'. Valid backends are: {valid_backends}"
|
|
189
|
+
self.logger.warning(reason)
|
|
190
|
+
return False, reason
|
|
191
|
+
|
|
192
|
+
if not self.mineru_api:
|
|
193
|
+
reason = "[MinerU] MINERU_APISERVER not configured."
|
|
194
|
+
self.logger.warning(reason)
|
|
195
|
+
return False, reason
|
|
196
|
+
|
|
197
|
+
api_openapi = f"{self.mineru_api}/openapi.json"
|
|
198
|
+
try:
|
|
199
|
+
api_ok = self._is_http_endpoint_valid(api_openapi)
|
|
200
|
+
self.logger.info(f"[MinerU] API openapi.json reachable={api_ok} url={api_openapi}")
|
|
201
|
+
if not api_ok:
|
|
202
|
+
reason = f"[MinerU] MinerU API not accessible: {api_openapi}"
|
|
203
|
+
return False, reason
|
|
204
|
+
except Exception as exc:
|
|
205
|
+
reason = f"[MinerU] MinerU API check failed: {exc}"
|
|
206
|
+
self.logger.warning(reason)
|
|
207
|
+
return False, reason
|
|
208
|
+
|
|
209
|
+
if backend == "vlm-http-client":
|
|
210
|
+
resolved_server = server_url or self.mineru_server_url
|
|
211
|
+
if not resolved_server:
|
|
212
|
+
reason = "[MinerU] MINERU_SERVER_URL required for vlm-http-client backend."
|
|
213
|
+
self.logger.warning(reason)
|
|
214
|
+
return False, reason
|
|
215
|
+
try:
|
|
216
|
+
server_ok = self._is_http_endpoint_valid(resolved_server)
|
|
217
|
+
self.logger.info(f"[MinerU] vlm-http-client server check reachable={server_ok} url={resolved_server}")
|
|
218
|
+
except Exception as exc:
|
|
219
|
+
self.logger.warning(f"[MinerU] vlm-http-client server probe failed: {resolved_server}: {exc}")
|
|
220
|
+
|
|
221
|
+
return True, reason
|
|
222
|
+
|
|
223
|
+
def _run_mineru(
    self, input_path: Path, output_dir: Path, options: MinerUParseOptions, callback: Optional[Callable] = None
) -> Path:
    """Run MinerU on ``input_path`` and return the directory holding its output.

    Thin dispatcher: all work is delegated to ``_run_mineru_api``, the
    only execution path implemented in this class.
    """
    return self._run_mineru_api(input_path, output_dir, options, callback)
|
|
227
|
+
|
|
228
|
+
def _run_mineru_api(
    self, input_path: Path, output_dir: Path, options: MinerUParseOptions, callback: Optional[Callable] = None
) -> Path:
    """Upload a PDF to the MinerU ``/file_parse`` endpoint and unpack the result.

    Args:
        input_path: PDF file to upload.
        output_dir: Directory in which a fresh result folder (and the
            downloaded zip next to it) is created.
        options: Backend, language, method and feature switches that are
            forwarded as form fields.
        callback: Optional progress hook called as ``callback(progress, msg)``.

    Returns:
        Path of the newly created result folder. When the API does not
        answer with ``application/zip`` the folder is returned empty and
        a warning is logged.

    Raises:
        RuntimeError: If the input file does not exist, or if the request,
            the download or the extraction fails (the original exception
            is attached as ``__cause__``).
    """
    pdf_file_path = str(input_path)

    if not os.path.exists(pdf_file_path):
        raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}")

    pdf_file_name = Path(pdf_file_path).stem.strip()
    output_path = tempfile.mkdtemp(prefix=f"{pdf_file_name}_{options.method}_", dir=str(output_dir))
    output_zip_path = os.path.join(str(output_dir), f"{Path(output_path).name}.zip")

    data = {
        "output_dir": "./output",
        "lang_list": options.lang,
        "backend": options.backend,
        "parse_method": options.method,
        "formula_enable": options.formula_enable,
        "table_enable": options.table_enable,
        "server_url": None,
        "return_md": True,
        "return_middle_json": True,
        "return_model_output": True,
        "return_content_list": True,
        "return_images": True,
        "response_format_zip": True,
        "start_page_id": 0,
        "end_page_id": 99999,
    }

    # Per-call server URL wins over the parser-level default.
    if options.server_url:
        data["server_url"] = options.server_url
    elif self.mineru_server_url:
        data["server_url"] = self.mineru_server_url

    self.logger.info(f"[MinerU] request {data=}")
    self.logger.info(f"[MinerU] request {options=}")

    headers = {"Accept": "application/json"}
    # Keep the upload handle in a context manager so it is closed even
    # when the request fails; it was previously left open.
    with open(pdf_file_path, "rb") as pdf_stream:
        files = {"files": (pdf_file_name + ".pdf", pdf_stream, "application/pdf")}
        try:
            self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse backend={options.backend} server_url={data.get('server_url')}")
            if callback:
                callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse")
            response = requests.post(url=f"{self.mineru_api}/file_parse", files=files, data=data, headers=headers,
                                     timeout=1800)

            response.raise_for_status()
            if response.headers.get("Content-Type") == "application/zip":
                self.logger.info(f"[MinerU] zip file returned, saving to {output_zip_path}...")

                if callback:
                    callback(0.30, f"[MinerU] zip file returned, saving to {output_zip_path}...")

                with open(output_zip_path, "wb") as f:
                    f.write(response.content)

                self.logger.info(f"[MinerU] Unzip to {output_path}...")
                self._extract_zip_no_root(output_zip_path, output_path, pdf_file_name + "/")

                if callback:
                    callback(0.40, f"[MinerU] Unzip to {output_path}...")
            else:
                self.logger.warning(f"[MinerU] not zip returned from api: {response.headers.get('Content-Type')}")
        except Exception as e:
            # Chain the cause so the underlying HTTP/IO error stays visible.
            raise RuntimeError(f"[MinerU] api failed with exception {e}") from e
    self.logger.info("[MinerU] Api completed successfully.")
    return Path(output_path)
|
|
297
|
+
|
|
298
|
+
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
|
|
299
|
+
self.page_from = page_from
|
|
300
|
+
self.page_to = page_to
|
|
301
|
+
try:
|
|
302
|
+
with pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) as pdf:
|
|
303
|
+
self.pdf = pdf
|
|
304
|
+
self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).original for _, p in
|
|
305
|
+
enumerate(self.pdf.pages[page_from:page_to])]
|
|
306
|
+
except Exception as e:
|
|
307
|
+
self.page_images = None
|
|
308
|
+
self.total_page = 0
|
|
309
|
+
self.logger.exception(e)
|
|
310
|
+
|
|
311
|
+
def _line_tag(self, bx):
|
|
312
|
+
pn = [bx["page_idx"] + 1]
|
|
313
|
+
positions = bx.get("bbox", (0, 0, 0, 0))
|
|
314
|
+
x0, top, x1, bott = positions
|
|
315
|
+
|
|
316
|
+
if hasattr(self, "page_images") and self.page_images and len(self.page_images) > bx["page_idx"]:
|
|
317
|
+
page_width, page_height = self.page_images[bx["page_idx"]].size
|
|
318
|
+
x0 = (x0 / 1000.0) * page_width
|
|
319
|
+
x1 = (x1 / 1000.0) * page_width
|
|
320
|
+
top = (top / 1000.0) * page_height
|
|
321
|
+
bott = (bott / 1000.0) * page_height
|
|
322
|
+
|
|
323
|
+
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format("-".join([str(p) for p in pn]), x0, x1, top, bott)
|
|
324
|
+
|
|
325
|
+
def crop(self, text, ZM=1, need_position=False):
    """Cut the regions referenced by position tags in ``text`` out of the page images.

    The tags are parsed with ``extract_positions``. The referenced regions
    are cropped from ``self.page_images`` and stacked vertically into one
    picture, framed by a dimmed context strip above the first region and
    below the last one.

    Args:
        text: Text containing ``@@...##`` position tags.
        ZM: Not used in this method.
        need_position: When true, also return the list of cropped
            positions as ``(page, x0, x1, y0, y1)`` tuples, where ``page``
            is offset by ``self.page_from``.

    Returns:
        The stitched image, or ``(image, positions)`` when
        ``need_position`` is true. If nothing can be cropped the result is
        ``None`` (or ``(None, None)``).
    """
    imgs = []
    poss = self.extract_positions(text)
    if not poss:
        if need_position:
            return None, None
        return

    if not getattr(self, "page_images", None):
        self.logger.warning("[MinerU] crop called without page images; skipping image generation.")
        if need_position:
            return None, None
        return

    page_count = len(self.page_images)

    # Drop page indices that do not exist in the rendered page list.
    filtered_poss = []
    for pns, left, right, top, bottom in poss:
        if not pns:
            self.logger.warning("[MinerU] Empty page index list in crop; skipping this position.")
            continue
        valid_pns = [p for p in pns if 0 <= p < page_count]
        if not valid_pns:
            self.logger.warning(f"[MinerU] All page indices {pns} out of range for {page_count} pages; skipping.")
            continue
        filtered_poss.append((valid_pns, left, right, top, bottom))

    poss = filtered_poss
    if not poss:
        self.logger.warning("[MinerU] No valid positions after filtering; skip cropping.")
        if need_position:
            return None, None
        return

    # All crops share the width of the widest region (at least 6 px).
    max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
    GAP = 6
    # Context strip: up to 120 px above the first region, ending GAP px before it.
    pos = poss[0]
    first_page_idx = pos[0][0]
    poss.insert(0, ([first_page_idx], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
    pos = poss[-1]
    last_page_idx = pos[0][-1]
    if not (0 <= last_page_idx < page_count):
        self.logger.warning(
            f"[MinerU] Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
        if need_position:
            return None, None
        return
    # Context strip: up to 120 px below the last region, clamped to the page height.
    last_page_height = self.page_images[last_page_idx].size[1]
    poss.append(
        (
            [last_page_idx],
            pos[1],
            pos[2],
            min(last_page_height, pos[4] + GAP),
            min(last_page_height, pos[4] + 120),
        )
    )

    positions = []
    for ii, (pns, left, right, top, bottom) in enumerate(poss):
        right = left + max_width

        # Guard against empty or inverted boxes.
        if bottom <= top:
            bottom = top + 2

        # For regions spanning several pages, extend ``bottom`` by page heights.
        # NOTE(review): the height added is that of page ``pn - 1``, not ``pn`` —
        # presumably the page preceding each continuation page; confirm.
        for pn in pns[1:]:
            if 0 <= pn - 1 < page_count:
                bottom += self.page_images[pn - 1].size[1]
            else:
                self.logger.warning(
                    f"[MinerU] Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")

        if not (0 <= pns[0] < page_count):
            self.logger.warning(
                f"[MinerU] Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.")
            continue

        img0 = self.page_images[pns[0]]
        x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
        crop0 = img0.crop((x0, y0, x1, y1))
        imgs.append(crop0)
        # Only real regions are reported; the two context strips (first/last) are not.
        if 0 < ii < len(poss) - 1:
            positions.append((pns[0] + self.page_from, x0, x1, y0, y1))

        # Remaining height carries over to the following pages, cropped from their top.
        bottom -= img0.size[1]
        for pn in pns[1:]:
            if not (0 <= pn < page_count):
                self.logger.warning(
                    f"[MinerU] Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
                continue
            page = self.page_images[pn]
            x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
            cimgp = page.crop((x0, y0, x1, y1))
            imgs.append(cimgp)
            if 0 < ii < len(poss) - 1:
                positions.append((pn + self.page_from, x0, x1, y0, y1))
            bottom -= page.size[1]

    if not imgs:
        if need_position:
            return None, None
        return

    # Canvas: all crops stacked vertically with GAP px after each one.
    height = 0
    for img in imgs:
        height += img.size[1] + GAP
    height = int(height)
    width = int(np.max([i.size[0] for i in imgs]))
    pic = Image.new("RGB", (width, height), (245, 245, 245))
    height = 0
    for ii, img in enumerate(imgs):
        # First and last crops are composited with a half-transparent black overlay.
        if ii == 0 or ii + 1 == len(imgs):
            img = img.convert("RGBA")
            overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
            overlay.putalpha(128)
            img = Image.alpha_composite(img, overlay).convert("RGB")
        pic.paste(img, (0, int(height)))
        height += img.size[1] + GAP

    if need_position:
        return pic, positions
    return pic
|
|
447
|
+
|
|
448
|
+
@staticmethod
|
|
449
|
+
def extract_positions(txt: str):
|
|
450
|
+
poss = []
|
|
451
|
+
for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
|
|
452
|
+
pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
|
|
453
|
+
left, right, top, bottom = float(left), float(right), float(top), float(bottom)
|
|
454
|
+
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
|
|
455
|
+
return poss
|
|
456
|
+
|
|
457
|
+
def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[
|
|
458
|
+
dict[str, Any]]:
|
|
459
|
+
json_file = None
|
|
460
|
+
subdir = None
|
|
461
|
+
attempted = []
|
|
462
|
+
|
|
463
|
+
# mirror MinerU's sanitize_filename to align ZIP naming
|
|
464
|
+
def _sanitize_filename(name: str) -> str:
|
|
465
|
+
sanitized = re.sub(r"[/\\\.]{2,}|[/\\]", "", name)
|
|
466
|
+
sanitized = re.sub(r"[^\w.-]", "_", sanitized, flags=re.UNICODE)
|
|
467
|
+
if sanitized.startswith("."):
|
|
468
|
+
sanitized = "_" + sanitized[1:]
|
|
469
|
+
return sanitized or "unnamed"
|
|
470
|
+
|
|
471
|
+
safe_stem = _sanitize_filename(file_stem)
|
|
472
|
+
allowed_names = {f"{file_stem}_content_list.json", f"{safe_stem}_content_list.json"}
|
|
473
|
+
self.logger.info(f"[MinerU] Expected output files: {', '.join(sorted(allowed_names))}")
|
|
474
|
+
self.logger.info(f"[MinerU] Searching output in: {output_dir}")
|
|
475
|
+
|
|
476
|
+
jf = output_dir / f"{file_stem}_content_list.json"
|
|
477
|
+
self.logger.info(f"[MinerU] Trying original path: {jf}")
|
|
478
|
+
attempted.append(jf)
|
|
479
|
+
if jf.exists():
|
|
480
|
+
subdir = output_dir
|
|
481
|
+
json_file = jf
|
|
482
|
+
else:
|
|
483
|
+
alt = output_dir / f"{safe_stem}_content_list.json"
|
|
484
|
+
self.logger.info(f"[MinerU] Trying sanitized filename: {alt}")
|
|
485
|
+
attempted.append(alt)
|
|
486
|
+
if alt.exists():
|
|
487
|
+
subdir = output_dir
|
|
488
|
+
json_file = alt
|
|
489
|
+
else:
|
|
490
|
+
nested_alt = output_dir / safe_stem / f"{safe_stem}_content_list.json"
|
|
491
|
+
self.logger.info(f"[MinerU] Trying sanitized nested path: {nested_alt}")
|
|
492
|
+
attempted.append(nested_alt)
|
|
493
|
+
if nested_alt.exists():
|
|
494
|
+
subdir = nested_alt.parent
|
|
495
|
+
json_file = nested_alt
|
|
496
|
+
|
|
497
|
+
if not json_file:
|
|
498
|
+
raise FileNotFoundError(f"[MinerU] Missing output file, tried: {', '.join(str(p) for p in attempted)}")
|
|
499
|
+
|
|
500
|
+
with open(json_file, "r", encoding="utf-8") as f:
|
|
501
|
+
data = json.load(f)
|
|
502
|
+
|
|
503
|
+
for item in data:
|
|
504
|
+
for key in ("img_path", "table_img_path", "equation_img_path"):
|
|
505
|
+
if key in item and item[key]:
|
|
506
|
+
item[key] = str((subdir / item[key]).resolve())
|
|
507
|
+
return data
|
|
508
|
+
|
|
509
|
+
def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None):
|
|
510
|
+
sections = []
|
|
511
|
+
for output in outputs:
|
|
512
|
+
match output["type"]:
|
|
513
|
+
case MinerUContentType.TEXT:
|
|
514
|
+
section = output.get("text", "")
|
|
515
|
+
case MinerUContentType.TABLE:
|
|
516
|
+
section = output.get("table_body", "") + "\n".join(output.get("table_caption", [])) + "\n".join(
|
|
517
|
+
output.get("table_footnote", []))
|
|
518
|
+
if not section.strip():
|
|
519
|
+
section = "FAILED TO PARSE TABLE"
|
|
520
|
+
case MinerUContentType.IMAGE:
|
|
521
|
+
section = "".join(output.get("image_caption", [])) + "\n" + "".join(
|
|
522
|
+
output.get("image_footnote", []))
|
|
523
|
+
case MinerUContentType.EQUATION:
|
|
524
|
+
section = output.get("text", "")
|
|
525
|
+
case MinerUContentType.CODE:
|
|
526
|
+
section = output.get("code_body", "") + "\n".join(output.get("code_caption", []))
|
|
527
|
+
case MinerUContentType.LIST:
|
|
528
|
+
section = "\n".join(output.get("list_items", []))
|
|
529
|
+
case MinerUContentType.DISCARDED:
|
|
530
|
+
continue # Skip discarded blocks entirely
|
|
531
|
+
|
|
532
|
+
if section and parse_method == "manual":
|
|
533
|
+
sections.append((section, output["type"], self._line_tag(output)))
|
|
534
|
+
elif section and parse_method == "paper":
|
|
535
|
+
sections.append((section + self._line_tag(output), output["type"]))
|
|
536
|
+
else:
|
|
537
|
+
sections.append((section, self._line_tag(output)))
|
|
538
|
+
return sections
|
|
539
|
+
|
|
540
|
+
def _transfer_to_tables(self, outputs: list[dict[str, Any]]):
    """Return the table list for ``parse_pdf``; always empty.

    ``outputs`` is ignored: table blocks are already emitted as regular
    sections by ``_transfer_to_sections``.
    """
    return []
|
|
542
|
+
|
|
543
|
+
def parse_pdf(
    self,
    filepath: str | PathLike[str],
    binary: BytesIO | bytes,
    callback: Optional[Callable] = None,
    *,
    output_dir: Optional[str] = None,
    backend: str = "pipeline",
    server_url: Optional[str] = None,
    delete_output: bool = True,
    parse_method: str = "raw",
    **kwargs,
) -> tuple:
    """Parse a PDF through the MinerU API and return ``(sections, tables)``.

    Args:
        filepath: Path of the PDF. When ``binary`` is given only its file
            name is used; otherwise the file itself is parsed. If the file
            name contains spaces the file is moved to a space-free name
            next to it (a side effect on the caller's file).
        binary: PDF content as ``bytes`` or ``BytesIO``; falsy to read
            from ``filepath``.
        callback: Optional progress hook ``callback(progress, message)``.
        output_dir: Directory for MinerU output. When omitted a temporary
            directory is created.
        backend: MinerU backend name (see ``MinerUBackend``).
        server_url: Remote server URL forwarded to the API.
        delete_output: Remove the output directory afterwards; only
            applies when this method created it.
        parse_method: Section layout, see ``_transfer_to_sections``.
        **kwargs: ``parser_config`` (dict with optional ``mineru_lang``,
            ``mineru_parse_method``, ``mineru_formula_enable``,
            ``mineru_table_enable``) and ``lang``.

    Returns:
        ``(sections, tables)`` as produced by ``_transfer_to_sections``
        and ``_transfer_to_tables``.

    Raises:
        FileNotFoundError: If the PDF or the MinerU output is missing.
        ValueError: If backend, language or parse method is not a valid
            enum value.
        RuntimeError: If the MinerU API call fails.

    NOTE(review): image paths inside the returned blocks point into the
    output directory, which no longer exists when it was temporary and
    ``delete_output`` is true.
    """
    import shutil

    temp_dir = None
    out_dir = None
    created_tmp_dir = False

    # Tolerate an explicit ``parser_config=None``.
    parser_cfg = kwargs.get('parser_config') or {}
    lang = parser_cfg.get('mineru_lang') or kwargs.get('lang', 'English')
    mineru_lang_code = LANGUAGE_TO_MINERU_MAP.get(lang, 'ch')  # Defaults to Chinese if not matched
    mineru_method_raw_str = parser_cfg.get('mineru_parse_method', 'auto')
    enable_formula = parser_cfg.get('mineru_formula_enable', True)
    enable_table = parser_cfg.get('mineru_table_enable', True)

    # remove spaces, or mineru crash, and _read_output fail too
    file_path = Path(filepath)
    pdf_file_name = file_path.stem.replace(" ", "") + ".pdf"
    pdf_file_path_valid = os.path.join(file_path.parent, pdf_file_name)

    try:
        if binary:
            # The signature allows BytesIO as well as bytes.
            payload = binary.getvalue() if isinstance(binary, BytesIO) else binary
            temp_dir = Path(tempfile.mkdtemp(prefix="mineru_bin_pdf_"))
            temp_pdf = temp_dir / pdf_file_name
            with open(temp_pdf, "wb") as f:
                f.write(payload)
            pdf = temp_pdf
            self.logger.info(f"[MinerU] Received binary PDF -> {temp_pdf}")
            if callback:
                callback(0.15, f"[MinerU] Received binary PDF -> {temp_pdf}")
        else:
            # Compare as paths so a PathLike argument is not always
            # considered different from the normalised string.
            if Path(pdf_file_path_valid) != file_path:
                self.logger.info(f"[MinerU] Remove all space in file name: {pdf_file_path_valid}")
                shutil.move(filepath, pdf_file_path_valid)
            pdf = Path(pdf_file_path_valid)
            if not pdf.exists():
                if callback:
                    callback(-1, f"[MinerU] PDF not found: {pdf}")
                raise FileNotFoundError(f"[MinerU] PDF not found: {pdf}")

        if output_dir:
            out_dir = Path(output_dir)
            out_dir.mkdir(parents=True, exist_ok=True)
        else:
            out_dir = Path(tempfile.mkdtemp(prefix="mineru_pdf_"))
            created_tmp_dir = True

        self.logger.info(f"[MinerU] Output directory: {out_dir} backend={backend} api={self.mineru_api} server_url={server_url or self.mineru_server_url}")
        if callback:
            callback(0.15, f"[MinerU] Output directory: {out_dir}")

        # Page images are needed by _line_tag to scale bounding boxes.
        self.__images__(pdf, zoomin=1)

        options = MinerUParseOptions(
            backend=MinerUBackend(backend),
            lang=MinerULanguage(mineru_lang_code),
            method=MinerUParseMethod(mineru_method_raw_str),
            server_url=server_url,
            delete_output=delete_output,
            parse_method=parse_method,
            formula_enable=enable_formula,
            table_enable=enable_table,
        )
        final_out_dir = self._run_mineru(pdf, out_dir, options, callback=callback)
        outputs = self._read_output(final_out_dir, pdf.stem, method=mineru_method_raw_str, backend=backend)
        self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
        if callback:
            callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")

        return self._transfer_to_sections(outputs, parse_method), self._transfer_to_tables(outputs)
    finally:
        # Best-effort cleanup of everything this call created; failures
        # here must not mask the real result or exception.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
        if delete_output and created_tmp_dir and out_dir is not None:
            shutil.rmtree(out_dir, ignore_errors=True)
|
|
635
|
+
|
|
636
|
+
|
|
637
|
+
if __name__ == "__main__":
    # Manual smoke test: ``python mineru_parser.py <file.pdf>``.
    # The API endpoints are read from the environment variables named in
    # check_installation's messages, so the demo can actually reach a server.
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <path-to-pdf>")
        sys.exit(1)

    parser = MinerUParser(
        "mineru",
        mineru_api=os.environ.get("MINERU_APISERVER", ""),
        mineru_server_url=os.environ.get("MINERU_SERVER_URL", ""),
    )
    ok, reason = parser.check_installation()
    print("MinerU available:", ok)
    if not ok:
        print(reason)

    filepath = sys.argv[1]
    with open(filepath, "rb") as file:
        outputs = parser.parse_pdf(filepath=filepath, binary=file.read())
        for output in outputs:
            print(output)
|