deepdoc-lib 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. deepdoc/README.md +122 -0
  2. deepdoc/README_zh.md +116 -0
  3. deepdoc/__init__.py +43 -0
  4. deepdoc/_version.py +34 -0
  5. deepdoc/common/__init__.py +52 -0
  6. deepdoc/common/config_utils.py +63 -0
  7. deepdoc/common/connection_utils.py +73 -0
  8. deepdoc/common/file_utils.py +19 -0
  9. deepdoc/common/misc_utils.py +44 -0
  10. deepdoc/common/model_store.py +369 -0
  11. deepdoc/common/settings.py +42 -0
  12. deepdoc/common/tiktoken_cache.py +84 -0
  13. deepdoc/common/token_utils.py +96 -0
  14. deepdoc/config.py +149 -0
  15. deepdoc/depend/find_codec.py +42 -0
  16. deepdoc/depend/nltk_manager.py +114 -0
  17. deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
  18. deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
  19. deepdoc/depend/prompts.py +35 -0
  20. deepdoc/depend/rag_tokenizer.py +578 -0
  21. deepdoc/depend/simple_cv_model.py +469 -0
  22. deepdoc/depend/surname.py +91 -0
  23. deepdoc/depend/timeout.py +73 -0
  24. deepdoc/depend/vision_llm_chunk.py +35 -0
  25. deepdoc/dict/README.md +19 -0
  26. deepdoc/dict/huqie.txt +555629 -0
  27. deepdoc/download_models.py +169 -0
  28. deepdoc/llm_adapter/__init__.py +15 -0
  29. deepdoc/llm_adapter/adapter.py +223 -0
  30. deepdoc/llm_adapter/utils.py +104 -0
  31. deepdoc/llm_adapter/vision.py +163 -0
  32. deepdoc/parser/__init__.py +42 -0
  33. deepdoc/parser/docling_parser.py +889 -0
  34. deepdoc/parser/docx_parser.py +150 -0
  35. deepdoc/parser/excel_parser.py +270 -0
  36. deepdoc/parser/figure_parser.py +182 -0
  37. deepdoc/parser/html_parser.py +221 -0
  38. deepdoc/parser/json_parser.py +179 -0
  39. deepdoc/parser/markdown_parser.py +321 -0
  40. deepdoc/parser/mineru_parser.py +646 -0
  41. deepdoc/parser/pdf_parser.py +1591 -0
  42. deepdoc/parser/ppt_parser.py +96 -0
  43. deepdoc/parser/resume/__init__.py +109 -0
  44. deepdoc/parser/resume/entities/__init__.py +15 -0
  45. deepdoc/parser/resume/entities/corporations.py +128 -0
  46. deepdoc/parser/resume/entities/degrees.py +44 -0
  47. deepdoc/parser/resume/entities/industries.py +712 -0
  48. deepdoc/parser/resume/entities/regions.py +789 -0
  49. deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
  50. deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
  51. deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
  52. deepdoc/parser/resume/entities/res/good_corp.json +911 -0
  53. deepdoc/parser/resume/entities/res/good_sch.json +595 -0
  54. deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
  55. deepdoc/parser/resume/entities/res/schools.csv +5713 -0
  56. deepdoc/parser/resume/entities/schools.py +91 -0
  57. deepdoc/parser/resume/step_one.py +189 -0
  58. deepdoc/parser/resume/step_two.py +692 -0
  59. deepdoc/parser/tcadp_parser.py +538 -0
  60. deepdoc/parser/txt_parser.py +64 -0
  61. deepdoc/parser/utils.py +33 -0
  62. deepdoc/vision/__init__.py +90 -0
  63. deepdoc/vision/layout_recognizer.py +481 -0
  64. deepdoc/vision/ocr.py +757 -0
  65. deepdoc/vision/operators.py +733 -0
  66. deepdoc/vision/postprocess.py +370 -0
  67. deepdoc/vision/recognizer.py +451 -0
  68. deepdoc/vision/seeit.py +87 -0
  69. deepdoc/vision/t_ocr.py +101 -0
  70. deepdoc/vision/t_recognizer.py +186 -0
  71. deepdoc/vision/table_structure_recognizer.py +617 -0
  72. deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
  73. deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
  74. deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
  75. deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
  76. deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
  77. deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
  78. scripts/download_models.py +10 -0
@@ -0,0 +1,646 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ import json
17
+ import logging
18
+ import os
19
+ import re
20
+ import sys
21
+ import tempfile
22
+ import threading
23
+ import zipfile
24
+ from dataclasses import dataclass
25
+ from io import BytesIO
26
+ from os import PathLike
27
+ from pathlib import Path
28
+ from typing import Any, Callable, Optional
29
+
30
+ import numpy as np
31
+ import pdfplumber
32
+ import requests
33
+ from PIL import Image
34
+ from strenum import StrEnum
35
+
36
+ from deepdoc.parser.pdf_parser import RAGFlowPdfParser
37
+
38
# Key under which a process-wide lock object is stored in ``sys.modules``.
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
# Register the lock only when the key is absent, so an entry that is already
# present under this key is left untouched.
sys.modules.setdefault(LOCK_KEY_pdfplumber, threading.Lock())
41
+
42
+
43
class MinerUContentType(StrEnum):
    """Values of the ``type`` field of a MinerU output block that this module recognises."""

    IMAGE = "image"
    TABLE = "table"
    TEXT = "text"
    EQUATION = "equation"
    CODE = "code"
    LIST = "list"
    DISCARDED = "discarded"
51
+
52
+
53
# Lookup table: human-readable language name -> MinerU OCR language code.
# Several languages intentionally share one code (e.g. the "latin" group).
LANGUAGE_TO_MINERU_MAP = {
    "English": "en",
    "Chinese": "ch",
    "Traditional Chinese": "chinese_cht",
    # East Slavic
    "Russian": "east_slavic",
    "Ukrainian": "east_slavic",
    # Latin group
    "Indonesian": "latin",
    "Spanish": "latin",
    "Vietnamese": "latin",
    "Japanese": "japan",
    "Korean": "korean",
    # Latin group (continued)
    "Portuguese BR": "latin",
    "German": "latin",
    "French": "latin",
    "Italian": "latin",
    "Tamil": "ta",
    "Telugu": "te",
    "Kannada": "ka",
    "Thai": "th",
    "Greek": "el",
    "Hindi": "devanagari",
}
76
+
77
+
78
class MinerUBackend(StrEnum):
    """MinerU processing backend options.

    The member values are the backend identifier strings used when talking to
    the MinerU API.
    """

    PIPELINE = "pipeline"  # Traditional multimodel pipeline (default)
    VLM_TRANSFORMERS = "vlm-transformers"  # Vision-language model using HuggingFace Transformers
    VLM_MLX_ENGINE = "vlm-mlx-engine"  # Faster, requires Apple Silicon and macOS 13.5+
    VLM_VLLM_ENGINE = "vlm-vllm-engine"  # Local vLLM engine, requires local GPU
    VLM_VLLM_ASYNC_ENGINE = "vlm-vllm-async-engine"  # Asynchronous vLLM engine, new in MinerU API
    VLM_LMDEPLOY_ENGINE = "vlm-lmdeploy-engine"  # LMDeploy engine
    VLM_HTTP_CLIENT = "vlm-http-client"  # HTTP client for remote vLLM server (CPU only)
88
+
89
+
90
class MinerULanguage(StrEnum):
    """MinerU supported languages for OCR (pipeline backend only).

    The member values are the language codes passed to MinerU.
    """

    CH = "ch"  # Chinese
    CH_SERVER = "ch_server"  # Chinese (server)
    CH_LITE = "ch_lite"  # Chinese (lite)
    EN = "en"  # English
    KOREAN = "korean"  # Korean
    JAPAN = "japan"  # Japanese
    CHINESE_CHT = "chinese_cht"  # Chinese Traditional
    TA = "ta"  # Tamil
    TE = "te"  # Telugu
    KA = "ka"  # Kannada
    TH = "th"  # Thai
    EL = "el"  # Greek
    LATIN = "latin"  # Latin
    ARABIC = "arabic"  # Arabic
    EAST_SLAVIC = "east_slavic"  # East Slavic
    CYRILLIC = "cyrillic"  # Cyrillic
    DEVANAGARI = "devanagari"  # Devanagari
110
+
111
+
112
class MinerUParseMethod(StrEnum):
    """MinerU PDF parsing methods (pipeline backend only).

    The member values are the method strings passed to MinerU.
    """

    AUTO = "auto"  # Automatically determine the method based on the file type
    TXT = "txt"  # Use text extraction method
    OCR = "ocr"  # Use OCR method for image-based PDFs
118
+
119
+
120
@dataclass
class MinerUParseOptions:
    """Options for MinerU PDF parsing.

    A plain value object: it bundles the per-request settings so they can be
    handed to the code that performs the MinerU call.
    """

    # Processing backend to request from MinerU.
    backend: MinerUBackend = MinerUBackend.PIPELINE
    lang: Optional[MinerULanguage] = None  # language for OCR (pipeline backend only)
    # MinerU parsing method (auto / txt / ocr).
    method: MinerUParseMethod = MinerUParseMethod.AUTO
    # Optional model-server URL for this request; None means "not specified here".
    server_url: Optional[str] = None
    # Whether generated output should be removed after parsing.
    delete_output: bool = True
    # Caller-side parse mode label; NOTE(review): distinct from ``method`` above —
    # confirm against callers which values besides "raw" are expected.
    parse_method: str = "raw"
    # Toggles for formula and table recognition.
    formula_enable: bool = True
    table_enable: bool = True
132
+
133
+
134
+ class MinerUParser(RAGFlowPdfParser):
135
+ def __init__(self, mineru_path: str = "mineru", mineru_api: str = "", mineru_server_url: str = ""):
136
+ self.mineru_api = mineru_api.rstrip("/")
137
+ self.mineru_server_url = mineru_server_url.rstrip("/")
138
+ self.outlines = []
139
+ self.logger = logging.getLogger(self.__class__.__name__)
140
+
141
+ def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
142
+ self.logger.info(f"[MinerU] Extract zip: zip_path={zip_path}, extract_to={extract_to}, root_hint={root_dir}")
143
+ with zipfile.ZipFile(zip_path, "r") as zip_ref:
144
+ if not root_dir:
145
+ files = zip_ref.namelist()
146
+ if files and files[0].endswith("/"):
147
+ root_dir = files[0]
148
+ else:
149
+ root_dir = None
150
+
151
+ if not root_dir or not root_dir.endswith("/"):
152
+ self.logger.info(f"[MinerU] No root directory found, extracting all (root_hint={root_dir})")
153
+ zip_ref.extractall(extract_to)
154
+ return
155
+
156
+ root_len = len(root_dir)
157
+ for member in zip_ref.infolist():
158
+ filename = member.filename
159
+ if filename == root_dir:
160
+ self.logger.info("[MinerU] Ignore root folder...")
161
+ continue
162
+
163
+ path = filename
164
+ if path.startswith(root_dir):
165
+ path = path[root_len:]
166
+
167
+ full_path = os.path.join(extract_to, path)
168
+ if member.is_dir():
169
+ os.makedirs(full_path, exist_ok=True)
170
+ else:
171
+ os.makedirs(os.path.dirname(full_path), exist_ok=True)
172
+ with open(full_path, "wb") as f:
173
+ f.write(zip_ref.read(filename))
174
+
175
+ @staticmethod
176
+ def _is_http_endpoint_valid(url, timeout=5):
177
+ try:
178
+ response = requests.head(url, timeout=timeout, allow_redirects=True)
179
+ return response.status_code in [200, 301, 302, 307, 308]
180
+ except Exception:
181
+ return False
182
+
183
def check_installation(self, backend: str = "pipeline", server_url: Optional[str] = None) -> tuple[bool, str]:
    """Check that the configured MinerU API can be used with *backend*.

    Args:
        backend: Backend identifier to validate.
        server_url: Model-server URL for the ``vlm-http-client`` backend;
            falls back to ``self.mineru_server_url`` when not given.

    Returns:
        ``(ok, reason)``. ``reason`` is an empty string on success and a
        human-readable explanation otherwise. Every failure is also logged
        at warning level.
    """
    valid_backends = ["pipeline", "vlm-http-client", "vlm-transformers", "vlm-vllm-engine", "vlm-mlx-engine", "vlm-vllm-async-engine", "vlm-lmdeploy-engine"]
    if backend not in valid_backends:
        reason = f"[MinerU] Invalid backend '{backend}'. Valid backends are: {valid_backends}"
        self.logger.warning(reason)
        return False, reason

    if not self.mineru_api:
        reason = "[MinerU] MINERU_APISERVER not configured."
        self.logger.warning(reason)
        return False, reason

    api_openapi = f"{self.mineru_api}/openapi.json"
    try:
        api_ok = self._is_http_endpoint_valid(api_openapi)
        self.logger.info(f"[MinerU] API openapi.json reachable={api_ok} url={api_openapi}")
    except Exception as exc:
        reason = f"[MinerU] MinerU API check failed: {exc}"
        self.logger.warning(reason)
        return False, reason
    if not api_ok:
        reason = f"[MinerU] MinerU API not accessible: {api_openapi}"
        # Log like every other failure branch so the cause shows up in logs.
        self.logger.warning(reason)
        return False, reason

    if backend == "vlm-http-client":
        resolved_server = server_url or self.mineru_server_url
        if not resolved_server:
            reason = "[MinerU] MINERU_SERVER_URL required for vlm-http-client backend."
            self.logger.warning(reason)
            return False, reason
        # The model-server probe is advisory only: an unreachable server is
        # reported but does not fail the check.
        try:
            server_ok = self._is_http_endpoint_valid(resolved_server)
            self.logger.info(f"[MinerU] vlm-http-client server check reachable={server_ok} url={resolved_server}")
            if not server_ok:
                self.logger.warning(f"[MinerU] vlm-http-client server not reachable: {resolved_server}; continuing anyway.")
        except Exception as exc:
            self.logger.warning(f"[MinerU] vlm-http-client server probe failed: {resolved_server}: {exc}")

    return True, ""
222
+
223
def _run_mineru(
    self, input_path: Path, output_dir: Path, options: MinerUParseOptions, callback: Optional[Callable] = None
) -> Path:
    """Process *input_path* with MinerU and return the directory holding the results.

    This is a thin dispatcher: all work is delegated to ``_run_mineru_api``
    with the arguments passed through unchanged.
    """
    result_dir = self._run_mineru_api(input_path, output_dir, options, callback)
    return result_dir
227
+
228
def _run_mineru_api(
    self, input_path: Path, output_dir: Path, options: MinerUParseOptions, callback: Optional[Callable] = None
) -> Path:
    """Upload a PDF to the MinerU ``/file_parse`` endpoint and unpack the reply.

    Args:
        input_path: PDF file to upload.
        output_dir: Existing directory in which a fresh result directory and
            the downloaded zip are created.
        options: Request settings (backend, language, method, ...).
        callback: Optional progress hook called as ``callback(progress, message)``.

    Returns:
        Path of the newly created result directory. It stays empty when the
        API answers with something other than a zip archive (a warning is
        logged in that case).

    Raises:
        RuntimeError: If the PDF does not exist, or if the request, the
            download or the extraction fails (the original error is chained).
    """
    pdf_file_path = str(input_path)

    if not os.path.exists(pdf_file_path):
        raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}")

    pdf_file_name = Path(pdf_file_path).stem.strip()
    output_path = tempfile.mkdtemp(prefix=f"{pdf_file_name}_{options.method}_", dir=str(output_dir))
    output_zip_path = os.path.join(str(output_dir), f"{Path(output_path).name}.zip")

    data = {
        "output_dir": "./output",
        "lang_list": options.lang,
        "backend": options.backend,
        "parse_method": options.method,
        "formula_enable": options.formula_enable,
        "table_enable": options.table_enable,
        "server_url": None,
        "return_md": True,
        "return_middle_json": True,
        "return_model_output": True,
        "return_content_list": True,
        "return_images": True,
        "response_format_zip": True,
        "start_page_id": 0,
        "end_page_id": 99999,
    }

    # A per-request server URL wins over the one configured on the parser.
    if options.server_url:
        data["server_url"] = options.server_url
    elif self.mineru_server_url:
        data["server_url"] = self.mineru_server_url

    self.logger.info(f"[MinerU] request {data=}")
    self.logger.info(f"[MinerU] request {options=}")

    headers = {"Accept": "application/json"}
    # Keep the upload handle in a ``with`` block so it is closed on every path.
    with open(pdf_file_path, "rb") as pdf_stream:
        files = {"files": (pdf_file_name + ".pdf", pdf_stream, "application/pdf")}
        try:
            self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse backend={options.backend} server_url={data.get('server_url')}")
            if callback:
                callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse")
            response = requests.post(url=f"{self.mineru_api}/file_parse", files=files, data=data, headers=headers,
                                     timeout=1800)

            response.raise_for_status()
            # Compare only the media type so a header such as
            # "application/zip; charset=binary" is still recognised.
            content_type = response.headers.get("Content-Type") or ""
            if content_type.split(";", 1)[0].strip().lower() == "application/zip":
                self.logger.info(f"[MinerU] zip file returned, saving to {output_zip_path}...")

                if callback:
                    callback(0.30, f"[MinerU] zip file returned, saving to {output_zip_path}...")

                with open(output_zip_path, "wb") as f:
                    f.write(response.content)

                self.logger.info(f"[MinerU] Unzip to {output_path}...")
                self._extract_zip_no_root(output_zip_path, output_path, pdf_file_name + "/")

                if callback:
                    callback(0.40, f"[MinerU] Unzip to {output_path}...")
            else:
                self.logger.warning(f"[MinerU] not zip returned from api: {response.headers.get('Content-Type')}")
        except Exception as e:
            raise RuntimeError(f"[MinerU] api failed with exception {e}") from e
    self.logger.info("[MinerU] Api completed successfully.")
    return Path(output_path)
297
+
298
+ def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
299
+ self.page_from = page_from
300
+ self.page_to = page_to
301
+ try:
302
+ with pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) as pdf:
303
+ self.pdf = pdf
304
+ self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).original for _, p in
305
+ enumerate(self.pdf.pages[page_from:page_to])]
306
+ except Exception as e:
307
+ self.page_images = None
308
+ self.total_page = 0
309
+ self.logger.exception(e)
310
+
311
+ def _line_tag(self, bx):
312
+ pn = [bx["page_idx"] + 1]
313
+ positions = bx.get("bbox", (0, 0, 0, 0))
314
+ x0, top, x1, bott = positions
315
+
316
+ if hasattr(self, "page_images") and self.page_images and len(self.page_images) > bx["page_idx"]:
317
+ page_width, page_height = self.page_images[bx["page_idx"]].size
318
+ x0 = (x0 / 1000.0) * page_width
319
+ x1 = (x1 / 1000.0) * page_width
320
+ top = (top / 1000.0) * page_height
321
+ bott = (bott / 1000.0) * page_height
322
+
323
+ return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format("-".join([str(p) for p in pn]), x0, x1, top, bott)
324
+
325
def crop(self, text, ZM=1, need_position=False):
    """Stitch the page regions referenced by position tags in *text* into one image.

    The tags are parsed with ``extract_positions``. One extra strip above the
    first region and one below the last region are added as visual context
    and dimmed; all pieces are stacked vertically with a small gap.

    Args:
        text: String containing ``@@...##`` position tags.
        ZM: Not referenced in this method body.
        need_position: When true, also return the list of cropped positions.

    Returns:
        The composed image, or ``(image, positions)`` when *need_position* is
        true. Each position is ``(page, x0, x1, y0, y1)`` with the page index
        offset by ``self.page_from``; context strips are not included.
        When nothing can be cropped the result is ``None`` (or ``(None, None)``
        when *need_position* is true).
    """
    imgs = []
    poss = self.extract_positions(text)
    if not poss:
        if need_position:
            return None, None
        return

    # Without rendered pages there is nothing to crop from.
    if not getattr(self, "page_images", None):
        self.logger.warning("[MinerU] crop called without page images; skipping image generation.")
        if need_position:
            return None, None
        return

    page_count = len(self.page_images)

    # Drop page indices that do not correspond to a rendered page.
    filtered_poss = []
    for pns, left, right, top, bottom in poss:
        if not pns:
            self.logger.warning("[MinerU] Empty page index list in crop; skipping this position.")
            continue
        valid_pns = [p for p in pns if 0 <= p < page_count]
        if not valid_pns:
            self.logger.warning(f"[MinerU] All page indices {pns} out of range for {page_count} pages; skipping.")
            continue
        filtered_poss.append((valid_pns, left, right, top, bottom))

    poss = filtered_poss
    if not poss:
        self.logger.warning("[MinerU] No valid positions after filtering; skip cropping.")
        if need_position:
            return None, None
        return

    # Every piece is cropped to the width of the widest region (at least 6).
    max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
    GAP = 6
    # Context strip above the first region: up to 120 units tall, ending GAP
    # above the region's top, clamped at the top of the page.
    pos = poss[0]
    first_page_idx = pos[0][0]
    poss.insert(0, ([first_page_idx], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
    pos = poss[-1]
    last_page_idx = pos[0][-1]
    if not (0 <= last_page_idx < page_count):
        self.logger.warning(
            f"[MinerU] Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
        if need_position:
            return None, None
        return
    # Context strip below the last region, clamped at the bottom of its page.
    last_page_height = self.page_images[last_page_idx].size[1]
    poss.append(
        (
            [last_page_idx],
            pos[1],
            pos[2],
            min(last_page_height, pos[4] + GAP),
            min(last_page_height, pos[4] + 120),
        )
    )

    positions = []
    for ii, (pns, left, right, top, bottom) in enumerate(poss):
        right = left + max_width

        # Guarantee a non-empty crop height.
        if bottom <= top:
            bottom = top + 2

        # For a region spanning several pages, extend ``bottom`` by the height
        # of the page preceding each continuation page.
        for pn in pns[1:]:
            if 0 <= pn - 1 < page_count:
                bottom += self.page_images[pn - 1].size[1]
            else:
                self.logger.warning(
                    f"[MinerU] Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")

        if not (0 <= pns[0] < page_count):
            self.logger.warning(
                f"[MinerU] Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.")
            continue

        # Part of the region on its first page.
        img0 = self.page_images[pns[0]]
        x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
        crop0 = img0.crop((x0, y0, x1, y1))
        imgs.append(crop0)
        # Only real regions (not the two context strips) are reported.
        if 0 < ii < len(poss) - 1:
            positions.append((pns[0] + self.page_from, x0, x1, y0, y1))

        # Remaining height carries over to the following pages, each cropped
        # from its top edge.
        bottom -= img0.size[1]
        for pn in pns[1:]:
            if not (0 <= pn < page_count):
                self.logger.warning(
                    f"[MinerU] Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
                continue
            page = self.page_images[pn]
            x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
            cimgp = page.crop((x0, y0, x1, y1))
            imgs.append(cimgp)
            if 0 < ii < len(poss) - 1:
                positions.append((pn + self.page_from, x0, x1, y0, y1))
            bottom -= page.size[1]

    if not imgs:
        if need_position:
            return None, None
        return

    # Canvas: total height of all pieces plus one GAP per piece, width of the
    # widest piece, light-grey background.
    height = 0
    for img in imgs:
        height += img.size[1] + GAP
    height = int(height)
    width = int(np.max([i.size[0] for i in imgs]))
    pic = Image.new("RGB", (width, height), (245, 245, 245))
    height = 0
    for ii, img in enumerate(imgs):
        # First and last pieces are dimmed by compositing a black overlay
        # with alpha 128 on top of them.
        if ii == 0 or ii + 1 == len(imgs):
            img = img.convert("RGBA")
            overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
            overlay.putalpha(128)
            img = Image.alpha_composite(img, overlay).convert("RGB")
        pic.paste(img, (0, int(height)))
        height += img.size[1] + GAP

    if need_position:
        return pic, positions
    return pic
447
+
448
+ @staticmethod
449
+ def extract_positions(txt: str):
450
+ poss = []
451
+ for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
452
+ pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
453
+ left, right, top, bottom = float(left), float(right), float(top), float(bottom)
454
+ poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
455
+ return poss
456
+
457
+ def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[
458
+ dict[str, Any]]:
459
+ json_file = None
460
+ subdir = None
461
+ attempted = []
462
+
463
+ # mirror MinerU's sanitize_filename to align ZIP naming
464
+ def _sanitize_filename(name: str) -> str:
465
+ sanitized = re.sub(r"[/\\\.]{2,}|[/\\]", "", name)
466
+ sanitized = re.sub(r"[^\w.-]", "_", sanitized, flags=re.UNICODE)
467
+ if sanitized.startswith("."):
468
+ sanitized = "_" + sanitized[1:]
469
+ return sanitized or "unnamed"
470
+
471
+ safe_stem = _sanitize_filename(file_stem)
472
+ allowed_names = {f"{file_stem}_content_list.json", f"{safe_stem}_content_list.json"}
473
+ self.logger.info(f"[MinerU] Expected output files: {', '.join(sorted(allowed_names))}")
474
+ self.logger.info(f"[MinerU] Searching output in: {output_dir}")
475
+
476
+ jf = output_dir / f"{file_stem}_content_list.json"
477
+ self.logger.info(f"[MinerU] Trying original path: {jf}")
478
+ attempted.append(jf)
479
+ if jf.exists():
480
+ subdir = output_dir
481
+ json_file = jf
482
+ else:
483
+ alt = output_dir / f"{safe_stem}_content_list.json"
484
+ self.logger.info(f"[MinerU] Trying sanitized filename: {alt}")
485
+ attempted.append(alt)
486
+ if alt.exists():
487
+ subdir = output_dir
488
+ json_file = alt
489
+ else:
490
+ nested_alt = output_dir / safe_stem / f"{safe_stem}_content_list.json"
491
+ self.logger.info(f"[MinerU] Trying sanitized nested path: {nested_alt}")
492
+ attempted.append(nested_alt)
493
+ if nested_alt.exists():
494
+ subdir = nested_alt.parent
495
+ json_file = nested_alt
496
+
497
+ if not json_file:
498
+ raise FileNotFoundError(f"[MinerU] Missing output file, tried: {', '.join(str(p) for p in attempted)}")
499
+
500
+ with open(json_file, "r", encoding="utf-8") as f:
501
+ data = json.load(f)
502
+
503
+ for item in data:
504
+ for key in ("img_path", "table_img_path", "equation_img_path"):
505
+ if key in item and item[key]:
506
+ item[key] = str((subdir / item[key]).resolve())
507
+ return data
508
+
509
+ def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None):
510
+ sections = []
511
+ for output in outputs:
512
+ match output["type"]:
513
+ case MinerUContentType.TEXT:
514
+ section = output.get("text", "")
515
+ case MinerUContentType.TABLE:
516
+ section = output.get("table_body", "") + "\n".join(output.get("table_caption", [])) + "\n".join(
517
+ output.get("table_footnote", []))
518
+ if not section.strip():
519
+ section = "FAILED TO PARSE TABLE"
520
+ case MinerUContentType.IMAGE:
521
+ section = "".join(output.get("image_caption", [])) + "\n" + "".join(
522
+ output.get("image_footnote", []))
523
+ case MinerUContentType.EQUATION:
524
+ section = output.get("text", "")
525
+ case MinerUContentType.CODE:
526
+ section = output.get("code_body", "") + "\n".join(output.get("code_caption", []))
527
+ case MinerUContentType.LIST:
528
+ section = "\n".join(output.get("list_items", []))
529
+ case MinerUContentType.DISCARDED:
530
+ continue # Skip discarded blocks entirely
531
+
532
+ if section and parse_method == "manual":
533
+ sections.append((section, output["type"], self._line_tag(output)))
534
+ elif section and parse_method == "paper":
535
+ sections.append((section + self._line_tag(output), output["type"]))
536
+ else:
537
+ sections.append((section, self._line_tag(output)))
538
+ return sections
539
+
540
+ def _transfer_to_tables(self, outputs: list[dict[str, Any]]):
541
+ return []
542
+
543
def parse_pdf(
    self,
    filepath: str | PathLike[str],
    binary: BytesIO | bytes,
    callback: Optional[Callable] = None,
    *,
    output_dir: Optional[str] = None,
    backend: str = "pipeline",
    server_url: Optional[str] = None,
    delete_output: bool = True,
    parse_method: str = "raw",
    **kwargs,
) -> tuple:
    """Parse a PDF through MinerU and return ``(sections, tables)``.

    Args:
        filepath: Path (or just a name, when *binary* is given) of the PDF.
            Spaces are removed from the file name because MinerU cannot
            handle them. When *binary* is empty and the cleaned name differs,
            the file on disk is renamed accordingly.
        binary: PDF content as ``bytes`` or ``BytesIO``. When truthy it is
            written to a temporary file that is used instead of *filepath*.
        callback: Optional progress hook called as ``callback(progress, message)``.
        output_dir: Directory for MinerU results; a temporary directory is
            created when omitted.
        backend: MinerU backend identifier.
        server_url: Model-server URL for this request.
        delete_output: Remove the temporary output directory afterwards
            (only applies when *output_dir* was not supplied).
        parse_method: Section layout selector passed to ``_transfer_to_sections``.
        **kwargs: ``parser_config`` (dict with optional ``mineru_lang``,
            ``mineru_parse_method``, ``mineru_formula_enable``,
            ``mineru_table_enable``) and ``lang``. The language may be a key
            of ``LANGUAGE_TO_MINERU_MAP`` or a MinerU language code; anything
            else falls back to ``"ch"``.

    Returns:
        ``(sections, tables)`` as produced by ``_transfer_to_sections`` and
        ``_transfer_to_tables``.

    Raises:
        FileNotFoundError: If no binary is given and the PDF does not exist,
            or MinerU produced no content list.
        ValueError: If backend, language or method is not a known value.
        RuntimeError: If the MinerU API call fails.
    """
    import shutil

    temp_dir = None
    out_dir = None
    created_tmp_dir = False

    # ``or {}`` tolerates an explicit ``parser_config=None``.
    parser_cfg = kwargs.get("parser_config") or {}
    lang = parser_cfg.get("mineru_lang") or kwargs.get("lang", "English")
    mineru_lang_code = LANGUAGE_TO_MINERU_MAP.get(lang)
    if mineru_lang_code is None:
        # Accept a MinerU code (e.g. "en") passed directly instead of
        # silently mapping it to Chinese.
        try:
            mineru_lang_code = MinerULanguage(lang).value
        except ValueError:
            mineru_lang_code = "ch"  # Defaults to Chinese if not matched
    mineru_method_raw_str = parser_cfg.get("mineru_parse_method", "auto")
    enable_formula = parser_cfg.get("mineru_formula_enable", True)
    enable_table = parser_cfg.get("mineru_table_enable", True)

    # remove spaces, or mineru crash, and _read_output fail too
    file_path = Path(filepath)
    pdf_file_name = file_path.stem.replace(" ", "") + ".pdf"
    pdf_file_path_valid = os.path.join(file_path.parent, pdf_file_name)

    try:
        if binary:
            temp_dir = Path(tempfile.mkdtemp(prefix="mineru_bin_pdf_"))
            temp_pdf = temp_dir / pdf_file_name
            # The annotation allows BytesIO; ``write`` needs a bytes-like object.
            payload = binary.getvalue() if isinstance(binary, BytesIO) else binary
            with open(temp_pdf, "wb") as f:
                f.write(payload)
            pdf = temp_pdf
            self.logger.info(f"[MinerU] Received binary PDF -> {temp_pdf}")
            if callback:
                callback(0.15, f"[MinerU] Received binary PDF -> {temp_pdf}")
        else:
            # Compare as paths so "a.pdf" vs "./a.pdf" or str vs PathLike does
            # not trigger a pointless move.
            if Path(pdf_file_path_valid) != file_path:
                self.logger.info(f"[MinerU] Remove all space in file name: {pdf_file_path_valid}")
                shutil.move(filepath, pdf_file_path_valid)
            pdf = Path(pdf_file_path_valid)
            if not pdf.exists():
                if callback:
                    callback(-1, f"[MinerU] PDF not found: {pdf}")
                raise FileNotFoundError(f"[MinerU] PDF not found: {pdf}")

        if output_dir:
            out_dir = Path(output_dir)
            out_dir.mkdir(parents=True, exist_ok=True)
        else:
            out_dir = Path(tempfile.mkdtemp(prefix="mineru_pdf_"))
            created_tmp_dir = True

        self.logger.info(f"[MinerU] Output directory: {out_dir} backend={backend} api={self.mineru_api} server_url={server_url or self.mineru_server_url}")
        if callback:
            callback(0.15, f"[MinerU] Output directory: {out_dir}")

        # Render page images first; they are needed to scale position tags.
        self.__images__(pdf, zoomin=1)

        options = MinerUParseOptions(
            backend=MinerUBackend(backend),
            lang=MinerULanguage(mineru_lang_code),
            method=MinerUParseMethod(mineru_method_raw_str),
            server_url=server_url,
            delete_output=delete_output,
            parse_method=parse_method,
            formula_enable=enable_formula,
            table_enable=enable_table,
        )
        final_out_dir = self._run_mineru(pdf, out_dir, options, callback=callback)
        outputs = self._read_output(final_out_dir, pdf.stem, method=mineru_method_raw_str, backend=backend)
        self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
        if callback:
            callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")

        return self._transfer_to_sections(outputs, parse_method), self._transfer_to_tables(outputs)
    finally:
        # Best-effort cleanup; failures here must not mask the real outcome.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
        if delete_output and created_tmp_dir and out_dir is not None and out_dir.exists():
            shutil.rmtree(out_dir, ignore_errors=True)
635
+
636
+
637
if __name__ == "__main__":
    # Manual smoke test: parse the PDF given on the command line.
    # NOTE(review): the environment variable names are taken from the messages
    # emitted by check_installation; confirm they match the deployment.
    if len(sys.argv) < 2:
        sys.exit("usage: mineru_parser.py <pdf-path>")

    parser = MinerUParser(
        "mineru",
        mineru_api=os.environ.get("MINERU_APISERVER", ""),
        mineru_server_url=os.environ.get("MINERU_SERVER_URL", ""),
    )
    ok, reason = parser.check_installation()
    print("MinerU available:", ok)
    if not ok:
        print(reason)

    filepath = sys.argv[1]
    with open(filepath, "rb") as file:
        sections, tables = parser.parse_pdf(filepath=filepath, binary=file.read())
    for section in sections:
        print(section)
    for table in tables:
        print(table)