deepdoc-lib 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. deepdoc/README.md +122 -0
  2. deepdoc/README_zh.md +116 -0
  3. deepdoc/__init__.py +43 -0
  4. deepdoc/_version.py +34 -0
  5. deepdoc/common/__init__.py +52 -0
  6. deepdoc/common/config_utils.py +63 -0
  7. deepdoc/common/connection_utils.py +73 -0
  8. deepdoc/common/file_utils.py +19 -0
  9. deepdoc/common/misc_utils.py +44 -0
  10. deepdoc/common/model_store.py +369 -0
  11. deepdoc/common/settings.py +42 -0
  12. deepdoc/common/tiktoken_cache.py +84 -0
  13. deepdoc/common/token_utils.py +96 -0
  14. deepdoc/config.py +149 -0
  15. deepdoc/depend/find_codec.py +42 -0
  16. deepdoc/depend/nltk_manager.py +114 -0
  17. deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
  18. deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
  19. deepdoc/depend/prompts.py +35 -0
  20. deepdoc/depend/rag_tokenizer.py +578 -0
  21. deepdoc/depend/simple_cv_model.py +469 -0
  22. deepdoc/depend/surname.py +91 -0
  23. deepdoc/depend/timeout.py +73 -0
  24. deepdoc/depend/vision_llm_chunk.py +35 -0
  25. deepdoc/dict/README.md +19 -0
  26. deepdoc/dict/huqie.txt +555629 -0
  27. deepdoc/download_models.py +169 -0
  28. deepdoc/llm_adapter/__init__.py +15 -0
  29. deepdoc/llm_adapter/adapter.py +223 -0
  30. deepdoc/llm_adapter/utils.py +104 -0
  31. deepdoc/llm_adapter/vision.py +163 -0
  32. deepdoc/parser/__init__.py +42 -0
  33. deepdoc/parser/docling_parser.py +889 -0
  34. deepdoc/parser/docx_parser.py +150 -0
  35. deepdoc/parser/excel_parser.py +270 -0
  36. deepdoc/parser/figure_parser.py +182 -0
  37. deepdoc/parser/html_parser.py +221 -0
  38. deepdoc/parser/json_parser.py +179 -0
  39. deepdoc/parser/markdown_parser.py +321 -0
  40. deepdoc/parser/mineru_parser.py +646 -0
  41. deepdoc/parser/pdf_parser.py +1591 -0
  42. deepdoc/parser/ppt_parser.py +96 -0
  43. deepdoc/parser/resume/__init__.py +109 -0
  44. deepdoc/parser/resume/entities/__init__.py +15 -0
  45. deepdoc/parser/resume/entities/corporations.py +128 -0
  46. deepdoc/parser/resume/entities/degrees.py +44 -0
  47. deepdoc/parser/resume/entities/industries.py +712 -0
  48. deepdoc/parser/resume/entities/regions.py +789 -0
  49. deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
  50. deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
  51. deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
  52. deepdoc/parser/resume/entities/res/good_corp.json +911 -0
  53. deepdoc/parser/resume/entities/res/good_sch.json +595 -0
  54. deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
  55. deepdoc/parser/resume/entities/res/schools.csv +5713 -0
  56. deepdoc/parser/resume/entities/schools.py +91 -0
  57. deepdoc/parser/resume/step_one.py +189 -0
  58. deepdoc/parser/resume/step_two.py +692 -0
  59. deepdoc/parser/tcadp_parser.py +538 -0
  60. deepdoc/parser/txt_parser.py +64 -0
  61. deepdoc/parser/utils.py +33 -0
  62. deepdoc/vision/__init__.py +90 -0
  63. deepdoc/vision/layout_recognizer.py +481 -0
  64. deepdoc/vision/ocr.py +757 -0
  65. deepdoc/vision/operators.py +733 -0
  66. deepdoc/vision/postprocess.py +370 -0
  67. deepdoc/vision/recognizer.py +451 -0
  68. deepdoc/vision/seeit.py +87 -0
  69. deepdoc/vision/t_ocr.py +101 -0
  70. deepdoc/vision/t_recognizer.py +186 -0
  71. deepdoc/vision/table_structure_recognizer.py +617 -0
  72. deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
  73. deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
  74. deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
  75. deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
  76. deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
  77. deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
  78. scripts/download_models.py +10 -0
@@ -0,0 +1,889 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ from __future__ import annotations
17
+
18
+ import base64
19
+ import logging
20
+ import re
21
+ from dataclasses import dataclass
22
+ from enum import Enum
23
+ from io import BytesIO
24
+ from os import PathLike
25
+ from pathlib import Path
26
+ from typing import Any, Callable, Iterable, Optional
27
+ from urllib.parse import unquote
28
+
29
+ import pdfplumber
30
+ from PIL import Image
31
+
32
# Docling is optional: if importing it fails for any reason, the converter name
# is bound to None instead of propagating the error.
try:
    from docling.document_converter import DocumentConverter
except Exception:
    DocumentConverter = None

# When the full PDF parser cannot be imported, an empty stand-in class is
# defined under the same name so the module can still be loaded.
try:
    from deepdoc.parser.pdf_parser import RAGFlowPdfParser
except Exception:
    class RAGFlowPdfParser:
        """Empty placeholder used when the real RAGFlowPdfParser is unavailable."""
42
+
43
+
44
class DoclingContentType(str, Enum):
    """Content kinds distinguished by this module.

    Members also subclass ``str``, so each one compares equal to its raw value.
    """

    IMAGE = "image"
    TABLE = "table"
    TEXT = "text"
    EQUATION = "equation"
49
+
50
+
51
+ @dataclass
52
+ class _BBox:
53
+ page_no: int
54
+ x0: float
55
+ y0: float
56
+ x1: float
57
+ y1: float
58
+
59
+
60
+ class DoclingParser(RAGFlowPdfParser):
61
+ def __init__(self):
62
+ self.logger = logging.getLogger(self.__class__.__name__)
63
+ self.page_images: list[Image.Image] = []
64
+ self.page_from = 0
65
+ self.page_to = 10_000
66
+ self.outlines = []
67
+
68
+
69
def check_installation(self) -> bool:
    """Report whether Docling is importable and a converter can be built.

    Returns:
        True when ``DocumentConverter()`` can be constructed. False, after
        logging the reason, when the import failed or construction raised.
    """
    if DocumentConverter is None:
        self.logger.warning("[Docling] 'docling' is not importable, please: pip install docling")
        return False

    try:
        # The instance is discarded; only successful construction matters.
        DocumentConverter()
    except Exception as e:
        self.logger.error(f"[Docling] init DocumentConverter failed: {e}")
        return False
    return True
79
+
80
+ def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
81
+ self.page_from = page_from
82
+ self.page_to = page_to
83
+ bytes_io = None
84
+ try:
85
+ if not isinstance(fnm, (str, PathLike)):
86
+ bytes_io = BytesIO(fnm)
87
+
88
+ opener = pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(bytes_io)
89
+ with opener as pdf:
90
+ pages = pdf.pages[page_from:page_to]
91
+ self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).original for p in pages]
92
+ except Exception as e:
93
+ self.page_images = []
94
+ self.logger.exception(e)
95
+ finally:
96
+ if bytes_io:
97
+ bytes_io.close()
98
+
99
+ def _make_line_tag(self,bbox: _BBox) -> str:
100
+ if bbox is None:
101
+ return ""
102
+ x0,x1, top, bott = bbox.x0, bbox.x1, bbox.y0, bbox.y1
103
+ if hasattr(self, "page_images") and self.page_images and len(self.page_images) >= bbox.page_no:
104
+ _, page_height = self.page_images[bbox.page_no-1].size
105
+ top, bott = page_height-top ,page_height-bott
106
+ return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format(
107
+ bbox.page_no, x0,x1, top, bott
108
+ )
109
+
110
+ @staticmethod
111
+ def extract_positions(txt: str) -> list[tuple[list[int], float, float, float, float]]:
112
+ poss = []
113
+ for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
114
+ pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
115
+ left, right, top, bottom = float(left), float(right), float(top), float(bottom)
116
+ poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
117
+ return poss
118
+
119
def crop(self, text: str, ZM: int = 1, need_position: bool = False):
    """Cut the regions named by the position tags in ``text`` out of the page images.

    The crops are stacked vertically on a light-grey canvas. An extra strip
    above the first region and one below the last region are added as
    context; those two strips are composited with a half-transparent black
    overlay before being pasted.

    Args:
        text: Text containing position tags readable by ``extract_positions``.
        ZM: Unused by this method.
        need_position: When True, also return the positions of the tagged regions.

    Returns:
        The stacked image, or ``(image, positions)`` when ``need_position`` is
        True. ``None`` / ``(None, None)`` when no tag is found.
    """
    empty = (None, None) if need_position else None
    tagged = self.extract_positions(text)
    if not tagged:
        return empty

    GAP = 6
    # Context strip above the first tagged region (at most 120 units tall).
    first_pages, first_left, first_right, first_top, _ = tagged[0]
    lead = ([first_pages[0]], first_left, first_right, max(0, first_top - 120), max(first_top - GAP, 0))
    # Context strip below the last tagged region, clamped to that page's height.
    last_pages, last_left, last_right, _, last_bottom = tagged[-1]
    last_height = self.page_images[last_pages[-1]].size[1]
    trail = (
        [last_pages[-1]],
        last_left,
        last_right,
        min(last_height, last_bottom + GAP),
        min(last_height, last_bottom + 120),
    )
    regions = [lead, *tagged, trail]
    final_idx = len(regions) - 1

    pieces = []
    positions = []
    for idx, (pages, left, right, top, bottom) in enumerate(regions):
        # Only the tagged regions (not the two context strips) report positions.
        is_tagged = 0 < idx < final_idx
        if bottom <= top:
            bottom = top + 4
        head = self.page_images[pages[0]]
        box = (int(left), int(top), int(right), int(min(bottom, head.size[1])))
        pieces.append(head.crop(box))
        if is_tagged:
            positions.append((pages[0] + self.page_from, box[0], box[2], box[1], box[3]))

        # Whatever extends past the first page is taken from the following pages.
        overflow = bottom - head.size[1]
        for page_idx in pages[1:]:
            if overflow <= 0:
                break
            page = self.page_images[page_idx]
            box = (int(left), 0, int(right), int(min(overflow, page.size[1])))
            pieces.append(page.crop(box))
            if is_tagged:
                positions.append((page_idx + self.page_from, box[0], box[2], box[1], box[3]))
            overflow -= page.size[1]

    if not pieces:
        return empty

    total_height = sum(piece.size[1] + GAP for piece in pieces)
    max_width = max(piece.size[0] for piece in pieces)
    canvas = Image.new("RGB", (max_width, int(total_height)), (245, 245, 245))
    offset = 0
    last_piece = len(pieces) - 1
    for idx, piece in enumerate(pieces):
        if idx in (0, last_piece):
            piece = piece.convert("RGBA")
            overlay = Image.new("RGBA", piece.size, (0, 0, 0, 0))
            overlay.putalpha(128)
            piece = Image.alpha_composite(piece, overlay).convert("RGB")
        canvas.paste(piece, (0, int(offset)))
        offset += piece.size[1] + GAP

    if need_position:
        return canvas, positions
    return canvas
170
+
171
def _iter_doc_items(self, doc, has_bbox: bool = True) -> Iterable[tuple[str, Any, Optional[_BBox], str]]:
    """
    Iterate over document items (texts, equations).

    All text items are yielded first (in ``doc.texts`` order), followed by all
    formula items.

    Args:
        doc: Docling document object
        has_bbox: Whether the document format supports bbox (PDF=True, DOCX/PPTX=False)

    Yields:
        Tuple of (content_type, text, bbox, label) where:
        - content_type: DoclingContentType value
        - text: Text content (text items with blank text are skipped)
        - bbox: Bounding box (None for DOCX/PPTX or when no usable provenance exists)
        - label: Item label for text items; always the string "FORMULA" for equations
    """

    def locate(item):
        # Build a _BBox from the item's first provenance record, if any.
        if not has_bbox:
            return None
        prov = getattr(item, "prov", None)
        if not prov:
            return None
        # prov is normally a list of provenance records; a single record
        # object is accepted as well.
        first = prov[0] if isinstance(prov, (list, tuple)) else prov
        pn = getattr(first, "page_no", None)
        bb = getattr(first, "bbox", None)
        if not pn or not bb:
            return None
        coords = [getattr(bb, side, None) for side in ("l", "t", "r", "b")]
        if any(c is None for c in coords):
            return None
        return _BBox(page_no=int(pn), x0=coords[0], y0=coords[1], x1=coords[2], y1=coords[3])

    for t in getattr(doc, "texts", []):
        label = getattr(t, "label", "")
        # Any parent reference is accepted for these labels; DOCX/PPTX items
        # are not necessarily direct children of "#/body".
        if label not in ("section_header", "text", "list_item"):
            continue
        text = getattr(t, "text", "") or ""
        if not text.strip():
            continue
        yield (DoclingContentType.TEXT.value, text, locate(t), label)

    for item in getattr(doc, "texts", []):
        raw_label = getattr(item, "label", "")
        # Compare on the underlying value, case-insensitively, so that both the
        # literal "FORMULA" and an enum/str label valued "formula" match.
        if str(getattr(raw_label, "value", raw_label)).lower() != "formula":
            continue
        text = getattr(item, "text", "") or ""
        yield (DoclingContentType.EQUATION.value, text, locate(item), "FORMULA")
220
+
221
+ def _label_to_style(self, label: str) -> str:
222
+ """
223
+ Map Docling label to Word style name.
224
+
225
+ Args:
226
+ label: Docling label (e.g., "section_header", "text", "list_item")
227
+
228
+ Returns:
229
+ Word-style name (e.g., "Heading", "Normal", "List Item")
230
+ """
231
+ label_to_style_map = {
232
+ "section_header": "Heading",
233
+ "text": "Normal",
234
+ "list_item": "List Item",
235
+ "FORMULA": "Equation",
236
+ }
237
+ return label_to_style_map.get(label, "Normal")
238
+
239
def _transfer_to_sections(self, doc, parse_method: str, has_bbox: bool = True) -> list[tuple[str, str]]:
    """
    Convert the document's text and equation items into section tuples.

    Args:
        doc: Docling document object
        parse_method: Parsing method ("raw", "manual", "paper")
        has_bbox: Whether the document format supports bbox

    Returns:
        One entry per item, shaped by ``parse_method``:
        - "manual": (text, content_type, tag)
        - "paper": (text + tag, content_type)
        - anything else: (text, tag)
        The tag is a position tag when the item has a bounding box, otherwise
        the style name derived from its label.
    """
    text_kind = DoclingContentType.TEXT.value
    equation_kind = DoclingContentType.EQUATION.value

    collected: list[tuple[str, str]] = []
    for kind, content, box, label in self._iter_doc_items(doc, has_bbox=has_bbox):
        if kind not in (text_kind, equation_kind):
            continue
        body = content.strip()
        # Blank text items are dropped; equations are kept even when blank.
        if kind == text_kind and not body:
            continue

        if isinstance(box, _BBox):
            marker = self._make_line_tag(box)
        else:
            marker = self._label_to_style(label)

        if parse_method == "manual":
            entry = (body, kind, marker)
        elif parse_method == "paper":
            entry = (body + marker, kind)
        else:
            entry = (body, marker)
        collected.append(entry)
    return collected
279
+
280
def cropout_docling_table(self, page_no: int, bbox: tuple[float, float, float, float], zoomin: int = 1):
    """Crop the area described by ``bbox`` out of the rendered page image.

    Args:
        page_no: Page number; the image index is ``page_no - 1 - self.page_from``.
        bbox: ``(left, top, right, bottom)``; the two vertical values are
            subtracted from the image height before cropping.
        zoomin: Unused by this method.

    Returns:
        ``(image, [position])`` where position is ``(page, x0, x1, y0, y1)``,
        or ``(None, "")`` when no page image is available or cropping fails.
    """
    nothing = (None, "")
    images = getattr(self, "page_images", None)
    if not images:
        return nothing

    local_idx = page_no - 1 - getattr(self, "page_from", 0)
    if not 0 <= local_idx < len(images):
        return nothing

    page_img = images[local_idx]
    width, height = page_img.size
    left, top, right, bott = bbox

    # Clamp to the image and keep the box at least one unit wide and tall.
    x0 = max(0.0, min(float(left), width - 1))
    y0 = max(0.0, min(float(height - top), height - 1))
    x1 = max(x0 + 1.0, min(float(right), width))
    y1 = max(y0 + 1.0, min(float(height - bott), height))

    try:
        cropped = page_img.crop((int(x0), int(y0), int(x1), int(y1))).convert("RGB")
    except Exception:
        return nothing

    return cropped, [(page_no - 1 if page_no > 0 else 0, x0, x1, y0, y1)]
307
+
308
+ def _transfer_to_tables(self, doc, has_bbox: bool = True):
309
+ """
310
+ Transfer document tables and pictures to tables format.
311
+
312
+ Args:
313
+ doc: Docling document object
314
+ has_bbox: Whether the document format supports bbox
315
+ """
316
+ tables = []
317
+ for tab in getattr(doc, "tables", []):
318
+ img = None
319
+ positions = ""
320
+ if has_bbox and getattr(tab, "prov", None):
321
+ pn = getattr(tab.prov[0], "page_no", None)
322
+ bb = getattr(tab.prov[0], "bbox", None)
323
+ if pn is not None and bb is not None:
324
+ left = getattr(bb, "l", None)
325
+ top = getattr(bb, "t", None)
326
+ right = getattr(bb, "r", None)
327
+ bott = getattr(bb, "b", None)
328
+ if None not in (left, top, right, bott):
329
+ img, positions = self.cropout_docling_table(int(pn), (float(left), float(top), float(right), float(bott)))
330
+ html = ""
331
+ try:
332
+ html = tab.export_to_html(doc=doc)
333
+ except Exception:
334
+ pass
335
+ tables.append(((img, html), positions if positions else ""))
336
+
337
+ # Handle pictures (for PDF with bbox)
338
+ if has_bbox:
339
+ for pic in getattr(doc, "pictures", []):
340
+ img = None
341
+ positions = ""
342
+ if getattr(pic, "prov", None):
343
+ pn = getattr(pic.prov[0], "page_no", None)
344
+ bb = getattr(pic.prov[0], "bbox", None)
345
+ if pn is not None and bb is not None:
346
+ left = getattr(bb, "l", None)
347
+ top = getattr(bb, "t", None)
348
+ right = getattr(bb, "r", None)
349
+ bott = getattr(bb, "b", None)
350
+ if None not in (left, top, right, bott):
351
+ img, positions = self.cropout_docling_table(int(pn), (float(left), float(top), float(right), float(bott)))
352
+ captions = ""
353
+ try:
354
+ captions = pic.caption_text(doc=doc)
355
+ except Exception:
356
+ pass
357
+ tables.append(((img, [captions]), positions if positions else ""))
358
+
359
+ return tables
360
+
361
+ def _extract_image_from_data_uri(self, data_uri: str) -> Optional[Image.Image]:
362
+ """
363
+ Extract PIL Image from base64 data URI.
364
+
365
+ Args:
366
+ data_uri: Data URI string (e.g., "data:image/png;base64,...")
367
+
368
+ Returns:
369
+ PIL Image object or None if extraction fails
370
+ """
371
+ try:
372
+ # Parse data URI: data:image/png;base64,<base64_data>
373
+ if not data_uri.startswith("data:"):
374
+ return None
375
+
376
+ # Extract base64 part
377
+ if "," in data_uri:
378
+ base64_data = data_uri.split(",", 1)[1]
379
+ else:
380
+ return None
381
+
382
+ # Decode base64
383
+ image_data = base64.b64decode(base64_data)
384
+
385
+ # Create PIL Image
386
+ img = Image.open(BytesIO(image_data))
387
+ return img.convert("RGB")
388
+ except Exception as e:
389
+ self.logger.warning(f"[Docling] Failed to extract image from data URI: {e}")
390
+ return None
391
+
392
+ def _find_element_caption(
393
+ self, doc, element, element_type: str, element_idx: int, caption_keywords: list[str]
394
+ ) -> str:
395
+ """
396
+ Find caption for an element (picture or table) by checking document structure.
397
+
398
+ For DOCX, captions are text items that follow the element in the parent's children list.
399
+
400
+ Args:
401
+ doc: Docling document object
402
+ element: The element object (picture or table)
403
+ element_type: Type of element ("picture" or "table")
404
+ element_idx: Index of the element in doc.pictures or doc.tables
405
+ caption_keywords: List of keywords to identify captions (e.g., ["图表", "figure"] for pictures)
406
+
407
+ Returns:
408
+ Caption text or empty string
409
+ """
410
+ try:
411
+ if not hasattr(element, "parent") or not element.parent:
412
+ return ""
413
+
414
+ parent_ref = str(element.parent.cref) if hasattr(element.parent, "cref") else ""
415
+ if not parent_ref or not parent_ref.startswith("#/texts/"):
416
+ return ""
417
+
418
+ # Find parent text item
419
+ parent_idx = int(parent_ref.split("/")[-1])
420
+ if parent_idx >= len(doc.texts):
421
+ return ""
422
+
423
+ parent_text = doc.texts[parent_idx]
424
+ if not hasattr(parent_text, "children") or not parent_text.children:
425
+ return ""
426
+
427
+ # Find element in children list
428
+ element_ref = f"#/{element_type}s/{element_idx}"
429
+ element_idx_in_children = None
430
+ for idx, child in enumerate(parent_text.children):
431
+ child_ref = str(child.cref) if hasattr(child, "cref") else ""
432
+ if child_ref == element_ref:
433
+ element_idx_in_children = idx
434
+ break
435
+
436
+ if element_idx_in_children is None:
437
+ return ""
438
+
439
+ # Check next item after element (potential caption)
440
+ if element_idx_in_children + 1 < len(parent_text.children):
441
+ next_child = parent_text.children[element_idx_in_children + 1]
442
+ next_ref = str(next_child.cref) if hasattr(next_child, "cref") else ""
443
+
444
+ if next_ref.startswith("#/texts/"):
445
+ text_idx = int(next_ref.split("/")[-1])
446
+ if text_idx < len(doc.texts):
447
+ caption_text = doc.texts[text_idx]
448
+ text = getattr(caption_text, "text", "") or getattr(caption_text, "orig", "")
449
+ # Check if it looks like a caption based on keywords
450
+ if text and any(keyword in text.lower() for keyword in caption_keywords):
451
+ return text.strip()
452
+
453
+ return ""
454
+ except Exception as e:
455
+ self.logger.warning(f"[Docling] Failed to find {element_type} caption: {e}")
456
+ return ""
457
+
458
+ def _find_picture_caption(self, doc, picture_idx: int) -> str:
459
+ """
460
+ Find caption for a picture by checking document structure.
461
+
462
+ For DOCX, captions are not directly in PictureItem.captions,
463
+ but are text items that follow the picture in the parent's children list.
464
+
465
+ Args:
466
+ doc: Docling document object
467
+ picture_idx: Index of the picture in doc.pictures
468
+
469
+ Returns:
470
+ Caption text or empty string
471
+ """
472
+ try:
473
+ pic = doc.pictures[picture_idx]
474
+ return self._find_element_caption(
475
+ doc, pic, "picture", picture_idx, ["图表", "figure", "图", "fig"]
476
+ )
477
+ except Exception as e:
478
+ self.logger.warning(f"[Docling] Failed to find picture caption: {e}")
479
+ return ""
480
+
481
+ def _find_table_caption(self, doc, table_idx: int) -> str:
482
+ """
483
+ Find caption for a table by checking document structure.
484
+
485
+ For DOCX, captions are text items that follow the table in the document structure.
486
+ Similar to picture captions, but we also check for "Table" keywords.
487
+
488
+ Args:
489
+ doc: Docling document object
490
+ table_idx: Index of the table in doc.tables
491
+
492
+ Returns:
493
+ Caption text or empty string
494
+ """
495
+ try:
496
+ tab = doc.tables[table_idx]
497
+ return self._find_element_caption(
498
+ doc, tab, "table", table_idx, ["表", "table", "表格"]
499
+ )
500
+ except Exception as e:
501
+ self.logger.warning(f"[Docling] Failed to find table caption: {e}")
502
+ return ""
503
+
504
+ def _transfer_to_tables_docx(self, doc) -> list[tuple[tuple, str]]:
505
+ """
506
+ Transfer DOCX document tables and pictures to tables format.
507
+ DOCX doesn't have bbox, so we handle pictures differently.
508
+
509
+ Args:
510
+ doc: Docling document object
511
+
512
+ Returns:
513
+ List of ((image, html_or_captions), positions) tuples
514
+ """
515
+ tables = []
516
+
517
+ # Handle tables
518
+ for idx, tab in enumerate(getattr(doc, "tables", [])):
519
+ html = ""
520
+ try:
521
+ html = tab.export_to_html(doc=doc)
522
+ except Exception:
523
+ pass
524
+
525
+ # Find table caption through document structure
526
+ caption = self._find_table_caption(doc, idx)
527
+
528
+ # Also try direct caption_text method (might work for some cases)
529
+ if not caption:
530
+ try:
531
+ caption = tab.caption_text(doc=doc) if hasattr(tab, "caption_text") else ""
532
+ except Exception:
533
+ pass
534
+
535
+ # DOCX tables don't have bbox, so no image or positions
536
+ # Format: ((None, html_or_captions), positions)
537
+ # For tables with caption, we store as dict: {"caption": caption, "html": html}
538
+ # For tables without caption, we store as string: html
539
+ if caption:
540
+ # Store caption and html together in a dict format
541
+ table_data = {"caption": caption, "html": html}
542
+ tables.append(((None, table_data), ""))
543
+ else:
544
+ tables.append(((None, html), ""))
545
+
546
+ # Handle pictures
547
+ for idx, pic in enumerate(getattr(doc, "pictures", [])):
548
+ img = None
549
+ captions = ""
550
+
551
+ # Extract image from data URI
552
+ if hasattr(pic, "image") and pic.image:
553
+ if hasattr(pic.image, "uri"):
554
+ data_uri = str(pic.image.uri)
555
+ img = self._extract_image_from_data_uri(data_uri)
556
+
557
+ # Find caption through document structure
558
+ caption = self._find_picture_caption(doc, idx)
559
+ if caption:
560
+ captions = caption
561
+
562
+ # Also try direct caption_text method (might work for some cases)
563
+ if not captions:
564
+ try:
565
+ captions = pic.caption_text(doc=doc)
566
+ except Exception:
567
+ pass
568
+
569
+ # DOCX pictures don't have bbox positions
570
+ tables.append(((img, [captions] if captions else []), ""))
571
+
572
+ return tables
573
+
574
def parse_pdf(
    self,
    filepath: str | PathLike[str],
    binary: BytesIO | bytes | None = None,
    callback: Optional[Callable] = None,
    *,
    output_dir: Optional[str] = None,
    lang: Optional[str] = None,
    method: str = "auto",
    delete_output: bool = True,
    parse_method: str = "raw"
):
    """
    Parse a PDF file using Docling.

    Args:
        filepath: Path to the PDF. When ``binary`` is given, only its file name
            is used (falling back to "input.pdf") to name the temporary copy.
        binary: Optional binary content of the file; it is written to a
            temporary file whose path is handed to the converter.
        callback: Optional progress callback, called as ``callback(progress, message)``
        output_dir: Directory for the temporary copy; defaults to
            ``.docling_tmp`` under the current working directory
        lang: Not used by this method
        method: Not used by this method
        delete_output: Whether to delete the temporary copy, on success and on failure
        parse_method: Output format ("raw", "manual", "paper")

    Returns:
        Tuple of (sections, tables)

    Raises:
        RuntimeError: If Docling is not available
        FileNotFoundError: If no ``binary`` is given and ``filepath`` does not exist
    """
    if not self.check_installation():
        raise RuntimeError("Docling not available, please install `docling`")

    tmp_pdf = None
    if binary is not None:
        tmpdir = Path(output_dir) if output_dir else Path.cwd() / ".docling_tmp"
        tmpdir.mkdir(parents=True, exist_ok=True)
        # A missing/empty filepath falls back to a generic name.
        name = (Path(filepath).name if filepath else "") or "input.pdf"
        tmp_pdf = tmpdir / name
        src_path = tmp_pdf
    else:
        src_path = Path(filepath)
        if not src_path.exists():
            raise FileNotFoundError(f"PDF not found: {src_path}")

    try:
        if tmp_pdf is not None:
            with open(tmp_pdf, "wb") as f:
                if isinstance(binary, (bytes, bytearray)):
                    f.write(binary)
                else:
                    f.write(binary.getbuffer())

        if callback:
            callback(0.1, f"[Docling] Converting: {src_path}")

        # Page rendering is best-effort; conversion proceeds without images.
        try:
            self.__images__(str(src_path), zoomin=1)
        except Exception as e:
            self.logger.warning(f"[Docling] render pages failed: {e}")

        conv = DocumentConverter()
        conv_res = conv.convert(str(src_path))
        doc = conv_res.document
        if callback:
            callback(0.7, f"[Docling] Parsed doc: {getattr(doc, 'num_pages', 'n/a')} pages")

        sections = self._transfer_to_sections(doc, parse_method=parse_method, has_bbox=True)
        tables = self._transfer_to_tables(doc, has_bbox=True)

        if callback:
            callback(0.95, f"[Docling] Sections: {len(sections)}, Tables: {len(tables)}")
    finally:
        # Remove the temporary copy even when conversion fails.
        if tmp_pdf is not None and delete_output:
            try:
                tmp_pdf.unlink(missing_ok=True)
            except Exception as e:
                self.logger.warning(f"[Docling] Failed to remove temporary file {tmp_pdf}: {e}")

    if callback:
        callback(1.0, "[Docling] Done.")
    return sections, tables
635
+
636
def parse_docx(
    self,
    filepath: str | PathLike[str],
    binary: BytesIO | bytes | None = None,
    callback: Optional[Callable] = None,
    *,
    output_dir: Optional[str] = None,
    lang: Optional[str] = None,
    method: str = "auto",
    delete_output: bool = True,
    parse_method: str = "raw"
):
    """
    Parse DOCX file using Docling.

    Args:
        filepath: Path to DOCX file
        binary: Optional binary content of the file
        callback: Optional progress callback function
        output_dir: Optional temporary output directory
        lang: Optional language hint (not used for DOCX)
        method: Parsing method (not used for DOCX)
        delete_output: Whether to delete the temporary file, on success and on failure
        parse_method: Output format ("raw", "manual", "paper")

    Returns:
        Tuple of (sections, tables) where:
        - sections: List of (text, tag) or (text, type, tag) tuples
        - tables: List of ((image, html_or_captions), positions) tuples

    Raises:
        RuntimeError: If Docling is not available
        FileNotFoundError: If no ``binary`` is given and ``filepath`` does not exist
    """
    if not self.check_installation():
        raise RuntimeError("Docling not available, please install `docling`")

    tmp_docx = None
    if binary is not None:
        tmpdir = Path(output_dir) if output_dir else Path.cwd() / ".docling_tmp"
        tmpdir.mkdir(parents=True, exist_ok=True)
        name = Path(filepath).name if filepath else "input.docx"
        if not name.endswith(".docx"):
            name = name + ".docx"
        tmp_docx = tmpdir / name
        src_path = tmp_docx
    else:
        src_path = Path(filepath)
        if not src_path.exists():
            raise FileNotFoundError(f"DOCX not found: {src_path}")

    try:
        if tmp_docx is not None:
            with open(tmp_docx, "wb") as f:
                if isinstance(binary, (bytes, bytearray)):
                    f.write(binary)
                else:
                    f.write(binary.getbuffer())

        if callback:
            callback(0.1, f"[Docling] Converting DOCX: {src_path}")

        try:
            conv = DocumentConverter()
            conv_res = conv.convert(str(src_path))
            doc = conv_res.document
        except Exception as e:
            self.logger.error(f"[Docling] Failed to convert DOCX: {e}")
            raise

        if callback:
            callback(0.5, f"[Docling] Parsed DOCX: {len(getattr(doc, 'texts', []))} text items")

        # DOCX doesn't have bbox, so use has_bbox=False
        sections = self._transfer_to_sections(doc, parse_method=parse_method, has_bbox=False)
        tables = self._transfer_to_tables_docx(doc)

        if callback:
            callback(0.9, f"[Docling] Sections: {len(sections)}, Tables: {len(tables)}")
    finally:
        # Remove the temporary copy even when conversion fails.
        if tmp_docx is not None and delete_output:
            try:
                tmp_docx.unlink(missing_ok=True)
            except Exception as e:
                self.logger.warning(f"[Docling] Failed to remove temporary file {tmp_docx}: {e}")

    if callback:
        callback(1.0, "[Docling] Done.")
    return sections, tables
717
+
718
def parse_pptx(
    self,
    filepath: str | PathLike[str],
    binary: BytesIO | bytes | None = None,
    callback: Optional[Callable] = None,
    *,
    output_dir: Optional[str] = None,
    lang: Optional[str] = None,
    method: str = "auto",
    delete_output: bool = True,
    parse_method: str = "raw"
):
    """
    Parse PPTX file using Docling (preliminary support).

    Args:
        filepath: Path to PPTX file
        binary: Optional binary content of the file
        callback: Optional progress callback function
        output_dir: Optional temporary output directory
        lang: Optional language hint (not used for PPTX)
        method: Parsing method (not used for PPTX)
        delete_output: Whether to delete the temporary file, on success and on failure
        parse_method: Output format ("raw", "manual", "paper")

    Returns:
        Tuple of (sections, tables) where:
        - sections: List of (text, tag) or (text, type, tag) tuples
        - tables: List of ((image, html_or_captions), positions) tuples

    Raises:
        RuntimeError: If Docling is not available
        FileNotFoundError: If no ``binary`` is given and ``filepath`` does not exist
    """
    if not self.check_installation():
        raise RuntimeError("Docling not available, please install `docling`")

    tmp_pptx = None
    if binary is not None:
        tmpdir = Path(output_dir) if output_dir else Path.cwd() / ".docling_tmp"
        tmpdir.mkdir(parents=True, exist_ok=True)
        name = Path(filepath).name if filepath else "input.pptx"
        if not name.endswith(".pptx"):
            name = name + ".pptx"
        tmp_pptx = tmpdir / name
        src_path = tmp_pptx
    else:
        src_path = Path(filepath)
        if not src_path.exists():
            raise FileNotFoundError(f"PPTX not found: {src_path}")

    try:
        if tmp_pptx is not None:
            with open(tmp_pptx, "wb") as f:
                if isinstance(binary, (bytes, bytearray)):
                    f.write(binary)
                else:
                    f.write(binary.getbuffer())

        if callback:
            callback(0.1, f"[Docling] Converting PPTX: {src_path}")

        try:
            conv = DocumentConverter()
            conv_res = conv.convert(str(src_path))
            doc = conv_res.document
        except Exception as e:
            self.logger.error(f"[Docling] Failed to convert PPTX: {e}")
            raise

        if callback:
            callback(0.5, f"[Docling] Parsed PPTX: {len(getattr(doc, 'texts', []))} text items")

        # PPTX doesn't have bbox like DOCX
        sections = self._transfer_to_sections(doc, parse_method=parse_method, has_bbox=False)
        tables = self._transfer_to_tables_docx(doc)

        if callback:
            callback(0.9, f"[Docling] Sections: {len(sections)}, Tables: {len(tables)}")
    finally:
        # Remove the temporary copy even when conversion fails.
        if tmp_pptx is not None and delete_output:
            try:
                tmp_pptx.unlink(missing_ok=True)
            except Exception as e:
                self.logger.warning(f"[Docling] Failed to remove temporary file {tmp_pptx}: {e}")

    if callback:
        callback(1.0, "[Docling] Done.")
    return sections, tables
799
+
800
def parse_xlsx(
    self,
    filepath: str | PathLike[str],
    binary: BytesIO | bytes | None = None,
    callback: Optional[Callable] = None,
    *,
    output_dir: Optional[str] = None,
    lang: Optional[str] = None,
    method: str = "auto",
    delete_output: bool = True,
    parse_method: str = "raw"
):
    """
    Parse XLSX file using Docling (preliminary support).

    Args:
        filepath: Path to XLSX file
        binary: Optional binary content of the file
        callback: Optional progress callback function
        output_dir: Optional temporary output directory
        lang: Optional language hint (not used for XLSX)
        method: Parsing method (not used for XLSX)
        delete_output: Whether to delete the temporary file, on success and on failure
        parse_method: Output format ("raw", "manual", "paper")

    Returns:
        Tuple of (sections, tables) where:
        - sections: List of (text, tag) or (text, type, tag) tuples (usually empty for XLSX)
        - tables: List of ((image, html), positions) tuples

    Raises:
        RuntimeError: If Docling is not available
        FileNotFoundError: If no ``binary`` is given and ``filepath`` does not exist
    """
    if not self.check_installation():
        raise RuntimeError("Docling not available, please install `docling`")

    tmp_xlsx = None
    if binary is not None:
        tmpdir = Path(output_dir) if output_dir else Path.cwd() / ".docling_tmp"
        tmpdir.mkdir(parents=True, exist_ok=True)
        name = Path(filepath).name if filepath else "input.xlsx"
        if not name.endswith(".xlsx"):
            name = name + ".xlsx"
        tmp_xlsx = tmpdir / name
        src_path = tmp_xlsx
    else:
        src_path = Path(filepath)
        if not src_path.exists():
            raise FileNotFoundError(f"XLSX not found: {src_path}")

    try:
        if tmp_xlsx is not None:
            with open(tmp_xlsx, "wb") as f:
                if isinstance(binary, (bytes, bytearray)):
                    f.write(binary)
                else:
                    f.write(binary.getbuffer())

        if callback:
            callback(0.1, f"[Docling] Converting XLSX: {src_path}")

        try:
            conv = DocumentConverter()
            conv_res = conv.convert(str(src_path))
            doc = conv_res.document
        except Exception as e:
            self.logger.error(f"[Docling] Failed to convert XLSX: {e}")
            raise

        if callback:
            callback(0.5, f"[Docling] Parsed XLSX: {len(getattr(doc, 'tables', []))} tables")

        # XLSX is primarily tables, minimal text sections
        sections = self._transfer_to_sections(doc, parse_method=parse_method, has_bbox=False)
        # Use DOCX table handler (no bbox, similar structure)
        tables = self._transfer_to_tables_docx(doc)

        if callback:
            callback(0.9, f"[Docling] Sections: {len(sections)}, Tables: {len(tables)}")
    finally:
        # Remove the temporary copy even when conversion fails.
        if tmp_xlsx is not None and delete_output:
            try:
                tmp_xlsx.unlink(missing_ok=True)
            except Exception as e:
                self.logger.warning(f"[Docling] Failed to remove temporary file {tmp_xlsx}: {e}")

    if callback:
        callback(1.0, "[Docling] Done.")
    return sections, tables
882
+
883
+
884
if __name__ == "__main__":
    # Manual smoke test: report Docling availability, then parse a sample PDF
    # and print how many sections and tables were produced.
    logging.basicConfig(level=logging.INFO)

    parser = DoclingParser()
    available = parser.check_installation()
    print("Docling available:", available)

    sections, tables = parser.parse_pdf(
        filepath="test_docling/toc.pdf",
        binary=None,
    )
    print(len(sections), len(tables))