deepdoc-lib 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepdoc/README.md +122 -0
- deepdoc/README_zh.md +116 -0
- deepdoc/__init__.py +43 -0
- deepdoc/_version.py +34 -0
- deepdoc/common/__init__.py +52 -0
- deepdoc/common/config_utils.py +63 -0
- deepdoc/common/connection_utils.py +73 -0
- deepdoc/common/file_utils.py +19 -0
- deepdoc/common/misc_utils.py +44 -0
- deepdoc/common/model_store.py +369 -0
- deepdoc/common/settings.py +42 -0
- deepdoc/common/tiktoken_cache.py +84 -0
- deepdoc/common/token_utils.py +96 -0
- deepdoc/config.py +149 -0
- deepdoc/depend/find_codec.py +42 -0
- deepdoc/depend/nltk_manager.py +114 -0
- deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
- deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
- deepdoc/depend/prompts.py +35 -0
- deepdoc/depend/rag_tokenizer.py +578 -0
- deepdoc/depend/simple_cv_model.py +469 -0
- deepdoc/depend/surname.py +91 -0
- deepdoc/depend/timeout.py +73 -0
- deepdoc/depend/vision_llm_chunk.py +35 -0
- deepdoc/dict/README.md +19 -0
- deepdoc/dict/huqie.txt +555629 -0
- deepdoc/download_models.py +169 -0
- deepdoc/llm_adapter/__init__.py +15 -0
- deepdoc/llm_adapter/adapter.py +223 -0
- deepdoc/llm_adapter/utils.py +104 -0
- deepdoc/llm_adapter/vision.py +163 -0
- deepdoc/parser/__init__.py +42 -0
- deepdoc/parser/docling_parser.py +889 -0
- deepdoc/parser/docx_parser.py +150 -0
- deepdoc/parser/excel_parser.py +270 -0
- deepdoc/parser/figure_parser.py +182 -0
- deepdoc/parser/html_parser.py +221 -0
- deepdoc/parser/json_parser.py +179 -0
- deepdoc/parser/markdown_parser.py +321 -0
- deepdoc/parser/mineru_parser.py +646 -0
- deepdoc/parser/pdf_parser.py +1591 -0
- deepdoc/parser/ppt_parser.py +96 -0
- deepdoc/parser/resume/__init__.py +109 -0
- deepdoc/parser/resume/entities/__init__.py +15 -0
- deepdoc/parser/resume/entities/corporations.py +128 -0
- deepdoc/parser/resume/entities/degrees.py +44 -0
- deepdoc/parser/resume/entities/industries.py +712 -0
- deepdoc/parser/resume/entities/regions.py +789 -0
- deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
- deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
- deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
- deepdoc/parser/resume/entities/res/good_corp.json +911 -0
- deepdoc/parser/resume/entities/res/good_sch.json +595 -0
- deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
- deepdoc/parser/resume/entities/res/schools.csv +5713 -0
- deepdoc/parser/resume/entities/schools.py +91 -0
- deepdoc/parser/resume/step_one.py +189 -0
- deepdoc/parser/resume/step_two.py +692 -0
- deepdoc/parser/tcadp_parser.py +538 -0
- deepdoc/parser/txt_parser.py +64 -0
- deepdoc/parser/utils.py +33 -0
- deepdoc/vision/__init__.py +90 -0
- deepdoc/vision/layout_recognizer.py +481 -0
- deepdoc/vision/ocr.py +757 -0
- deepdoc/vision/operators.py +733 -0
- deepdoc/vision/postprocess.py +370 -0
- deepdoc/vision/recognizer.py +451 -0
- deepdoc/vision/seeit.py +87 -0
- deepdoc/vision/t_ocr.py +101 -0
- deepdoc/vision/t_recognizer.py +186 -0
- deepdoc/vision/table_structure_recognizer.py +617 -0
- deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
- deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
- deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
- deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
- deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
- deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
- scripts/download_models.py +10 -0
|
@@ -0,0 +1,889 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import base64
|
|
19
|
+
import logging
|
|
20
|
+
import re
|
|
21
|
+
from dataclasses import dataclass
|
|
22
|
+
from enum import Enum
|
|
23
|
+
from io import BytesIO
|
|
24
|
+
from os import PathLike
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
from typing import Any, Callable, Iterable, Optional
|
|
27
|
+
from urllib.parse import unquote
|
|
28
|
+
|
|
29
|
+
import pdfplumber
|
|
30
|
+
from PIL import Image
|
|
31
|
+
|
|
32
|
+
try:
|
|
33
|
+
from docling.document_converter import DocumentConverter
|
|
34
|
+
except Exception:
|
|
35
|
+
DocumentConverter = None
|
|
36
|
+
|
|
37
|
+
try:
|
|
38
|
+
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
|
39
|
+
except Exception:
|
|
40
|
+
class RAGFlowPdfParser:
|
|
41
|
+
pass
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class DoclingContentType(str, Enum):
    """Kinds of content this parser distinguishes in a Docling document.

    The ``str`` mix-in makes every member compare equal to its plain string
    value, so callers may work with either the member or ``member.value``.
    """

    IMAGE = "image"
    TABLE = "table"
    TEXT = "text"
    EQUATION = "equation"
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
|
|
52
|
+
class _BBox:
|
|
53
|
+
page_no: int
|
|
54
|
+
x0: float
|
|
55
|
+
y0: float
|
|
56
|
+
x1: float
|
|
57
|
+
y1: float
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class DoclingParser(RAGFlowPdfParser):
|
|
61
|
+
def __init__(self):
|
|
62
|
+
self.logger = logging.getLogger(self.__class__.__name__)
|
|
63
|
+
self.page_images: list[Image.Image] = []
|
|
64
|
+
self.page_from = 0
|
|
65
|
+
self.page_to = 10_000
|
|
66
|
+
self.outlines = []
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def check_installation(self) -> bool:
    """Report whether Docling can be used.

    Returns:
        ``False`` (after logging) when ``docling`` could not be imported or
        when constructing a ``DocumentConverter`` raises; ``True`` otherwise.
    """
    if DocumentConverter is None:
        self.logger.warning("[Docling] 'docling' is not importable, please: pip install docling")
        return False

    # Importable is not enough: make sure a converter can actually be built.
    try:
        DocumentConverter()
    except Exception as e:
        self.logger.error(f"[Docling] init DocumentConverter failed: {e}")
        return False
    return True
|
|
79
|
+
|
|
80
|
+
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
    """Render the requested page window of a PDF into ``self.page_images``.

    Args:
        fnm: A path (``str``/``PathLike``) or raw PDF content that can be
            wrapped in ``BytesIO``.
        zoomin: Multiplier applied to the base resolution of 72.
        page_from: Index of the first page to render (slice start).
        page_to: Slice end of the page window (exclusive).
        callback: Accepted for signature compatibility; not used here.

    Any failure is logged and leaves ``self.page_images`` empty instead of
    propagating to the caller.
    """
    self.page_from = page_from
    self.page_to = page_to
    buffer = None
    try:
        if isinstance(fnm, (str, PathLike)):
            source = fnm
        else:
            # Raw content: pdfplumber needs a file-like object.
            buffer = BytesIO(fnm)
            source = buffer

        with pdfplumber.open(source) as pdf:
            rendered = []
            for page in pdf.pages[page_from:page_to]:
                rendered.append(page.to_image(resolution=72 * zoomin, antialias=True).original)
            self.page_images = rendered
    except Exception as e:
        self.page_images = []
        self.logger.exception(e)
    finally:
        if buffer:
            buffer.close()
|
|
98
|
+
|
|
99
|
+
def _make_line_tag(self,bbox: _BBox) -> str:
|
|
100
|
+
if bbox is None:
|
|
101
|
+
return ""
|
|
102
|
+
x0,x1, top, bott = bbox.x0, bbox.x1, bbox.y0, bbox.y1
|
|
103
|
+
if hasattr(self, "page_images") and self.page_images and len(self.page_images) >= bbox.page_no:
|
|
104
|
+
_, page_height = self.page_images[bbox.page_no-1].size
|
|
105
|
+
top, bott = page_height-top ,page_height-bott
|
|
106
|
+
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format(
|
|
107
|
+
bbox.page_no, x0,x1, top, bott
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
@staticmethod
|
|
111
|
+
def extract_positions(txt: str) -> list[tuple[list[int], float, float, float, float]]:
|
|
112
|
+
poss = []
|
|
113
|
+
for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
|
|
114
|
+
pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
|
|
115
|
+
left, right, top, bottom = float(left), float(right), float(top), float(bottom)
|
|
116
|
+
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
|
|
117
|
+
return poss
|
|
118
|
+
|
|
119
|
+
def crop(self, text: str, ZM: int = 1, need_position: bool = False):
    """Build one picture from the page regions referenced by position tags.

    The tagged regions are cropped from ``self.page_images`` and stacked
    vertically, preceded and followed by a dimmed strip of surrounding
    page content (up to 120 px above the first and below the last region).

    Args:
        text: Text containing ``@@...##`` position tags.
        ZM: Accepted for signature compatibility; not used here.
        need_position: When true, also return the cropped rectangles.

    Returns:
        The stacked image, or ``(image, positions)`` when ``need_position``
        is true, where each position is ``(page, x0, x1, y0, y1)`` with
        ``page`` offset by ``self.page_from``. Returns ``None`` (or
        ``(None, None)``) when no tag refers to a rendered page.
    """
    nothing = (None, None) if need_position else None
    pages = getattr(self, "page_images", None) or []
    page_count = len(pages)

    # Keep only positions that point at rendered pages. Page rendering is
    # allowed to fail (leaving page_images empty) and tags may name pages
    # outside the rendered window; indexing those used to raise IndexError.
    # A multi-page tag is truncated at its first unavailable page.
    poss = []
    for pns, left, right, top, bottom in self.extract_positions(text):
        usable = []
        for pn in pns:
            if not 0 <= pn < page_count:
                break
            usable.append(pn)
        if usable:
            poss.append((usable, left, right, top, bottom))
    if not poss:
        return nothing

    GAP = 6
    # Context strip above the first region: from 120 px above it to GAP px above it.
    pos = poss[0]
    poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
    # Context strip below the last region, clamped to that page's height.
    pos = poss[-1]
    last_height = pages[pos[0][-1]].size[1]
    poss.append(([pos[0][-1]], pos[1], pos[2], min(last_height, pos[4] + GAP), min(last_height, pos[4] + 120)))

    imgs = []
    positions = []
    last_index = len(poss) - 1
    for ii, (pns, left, right, top, bottom) in enumerate(poss):
        if bottom <= top:
            # Degenerate region: give it a minimal height.
            bottom = top + 4
        # Only the real tagged regions (not the two context strips) are reported.
        is_tagged = 0 < ii < last_index

        img0 = pages[pns[0]]
        x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
        imgs.append(img0.crop((x0, y0, x1, y1)))
        if is_tagged:
            positions.append((pns[0] + self.page_from, x0, x1, y0, y1))

        # A region taller than its first page continues at the top of the following pages.
        remain_bottom = bottom - img0.size[1]
        for pn in pns[1:]:
            if remain_bottom <= 0:
                break
            page = pages[pn]
            x0, y0, x1, y1 = int(left), 0, int(right), int(min(remain_bottom, page.size[1]))
            imgs.append(page.crop((x0, y0, x1, y1)))
            if is_tagged:
                positions.append((pn + self.page_from, x0, x1, y0, y1))
            remain_bottom -= page.size[1]

    height = sum(i.size[1] + GAP for i in imgs)
    width = max(i.size[0] for i in imgs)
    pic = Image.new("RGB", (width, int(height)), (245, 245, 245))
    h = 0
    for ii, img in enumerate(imgs):
        if ii == 0 or ii + 1 == len(imgs):
            # Dim the leading and trailing context strips with a half-transparent overlay.
            img = img.convert("RGBA")
            overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
            overlay.putalpha(128)
            img = Image.alpha_composite(img, overlay).convert("RGB")
        pic.paste(img, (0, int(h)))
        h += img.size[1] + GAP

    return (pic, positions) if need_position else pic
|
|
170
|
+
|
|
171
|
+
def _iter_doc_items(self, doc, has_bbox: bool = True) -> Iterable[tuple[str, Any, Optional[_BBox], str]]:
    """
    Iterate over document items (texts, equations).

    All text items are yielded first, followed by all formula items.

    Args:
        doc: Docling document object
        has_bbox: Whether the document format supports bbox (PDF=True, DOCX/PPTX=False)

    Yields:
        Tuple of (content_type, text, bbox, label) where:
        - content_type: DoclingContentType value
        - text: Text content
        - bbox: Bounding box built from the item's first provenance entry
          (None when has_bbox is False or the entry is incomplete)
        - label: Docling label of the item (e.g., "section_header", "text", "list_item")
    """

    def first_bbox(item) -> Optional[_BBox]:
        # Build a _BBox from the first provenance entry, if it is complete.
        prov = getattr(item, "prov", None)
        if not has_bbox or not prov:
            return None
        head = prov[0]
        pn = getattr(head, "page_no", None)
        bb = getattr(head, "bbox", None)
        if not pn or not bb:
            return None
        edges = [getattr(bb, side, None) for side in ("l", "t", "r", "b")]
        if any(edge is None for edge in edges):
            return None
        return _BBox(page_no=int(pn), x0=edges[0], y0=edges[1], x1=edges[2], y1=edges[3])

    texts = getattr(doc, "texts", []) or []

    # Accept section_header, text, and list_item labels regardless of the
    # item's parent reference: for DOCX/PPTX the parent is not always "#/body".
    for t in texts:
        label = getattr(t, "label", "")
        if label not in ("section_header", "text", "list_item"):
            continue
        text = getattr(t, "text", "") or ""
        if not text.strip():
            continue
        yield (DoclingContentType.TEXT.value, text, first_bbox(t), label)

    for item in texts:
        item_label = getattr(item, "label", "")
        # NOTE(review): Docling's formula label value appears to be the
        # lowercase "formula" (DocItemLabel.FORMULA), so the comparison is
        # case-insensitive; the original "FORMULA" spelling still matches.
        label_name = str(getattr(item_label, "value", item_label)).lower()
        if label_name != "formula":
            continue
        text = getattr(item, "text", "") or ""
        # `prov` is a list (as in the text loop above); the bbox must be read
        # from its first entry, not from the list object itself.
        yield (DoclingContentType.EQUATION.value, text, first_bbox(item), item_label)
|
|
220
|
+
|
|
221
|
+
def _label_to_style(self, label: str) -> str:
|
|
222
|
+
"""
|
|
223
|
+
Map Docling label to Word style name.
|
|
224
|
+
|
|
225
|
+
Args:
|
|
226
|
+
label: Docling label (e.g., "section_header", "text", "list_item")
|
|
227
|
+
|
|
228
|
+
Returns:
|
|
229
|
+
Word-style name (e.g., "Heading", "Normal", "List Item")
|
|
230
|
+
"""
|
|
231
|
+
label_to_style_map = {
|
|
232
|
+
"section_header": "Heading",
|
|
233
|
+
"text": "Normal",
|
|
234
|
+
"list_item": "List Item",
|
|
235
|
+
"FORMULA": "Equation",
|
|
236
|
+
}
|
|
237
|
+
return label_to_style_map.get(label, "Normal")
|
|
238
|
+
|
|
239
|
+
def _transfer_to_sections(self, doc, parse_method: str, has_bbox: bool = True) -> list[tuple[str, str]]:
    """
    Convert the document's text and equation items into section tuples.

    Args:
        doc: Docling document object
        parse_method: Parsing method ("raw", "manual", "paper")
        has_bbox: Whether the document format supports bbox

    Returns:
        One tuple per kept item, shaped by parse_method:
        - "manual": (text, content_type, tag)
        - "paper": (text + tag, content_type)
        - anything else: (text, tag)
        where tag is a position tag when the item carries a bbox
        (e.g., "@@1\t0.0\t100.0\t0.0\t50.0##") and a style name otherwise
        (e.g., "Heading", "Normal").
    """
    text_kind = DoclingContentType.TEXT.value
    equation_kind = DoclingContentType.EQUATION.value

    sections: list[tuple[str, str]] = []
    for typ, payload, bbox, label in self._iter_doc_items(doc, has_bbox=has_bbox):
        if typ not in (text_kind, equation_kind):
            continue

        section = payload.strip()
        # Blank text items are dropped; equations are kept even when blank.
        if typ == text_kind and not section:
            continue

        # Items with a bbox get a position tag; the rest get a style name
        # derived from their Docling label.
        tag = self._make_line_tag(bbox) if isinstance(bbox, _BBox) else self._label_to_style(label)

        if parse_method == "manual":
            entry = (section, typ, tag)
        elif parse_method == "paper":
            entry = (section + tag, typ)
        else:
            entry = (section, tag)
        sections.append(entry)
    return sections
|
|
279
|
+
|
|
280
|
+
def cropout_docling_table(self, page_no: int, bbox: tuple[float, float, float, float], zoomin: int = 1):
    """Crop a region given as ``(left, top, right, bottom)`` from a rendered page.

    Args:
        page_no: Page number; the image used is
            ``page_images[(page_no - 1) - page_from]``.
        bbox: ``(left, top, right, bottom)``; the two vertical values are
            subtracted from the page height before cropping.
        zoomin: Accepted for signature compatibility; not used here.

    Returns:
        ``(image, [(page_index, x0, x1, y0, y1)])`` on success, or
        ``(None, "")`` when no page image is available for ``page_no`` or
        the crop itself fails.
    """
    nothing = (None, "")

    pages = getattr(self, "page_images", None)
    if not pages:
        return nothing

    idx = (page_no - 1) - getattr(self, "page_from", 0)
    if not 0 <= idx < len(pages):
        return nothing

    page_img = pages[idx]
    W, H = page_img.size
    left, top, right, bott = bbox

    # Clamp the start corner inside the page, then force the end corner to
    # lie at least one pixel beyond it so the crop is never empty.
    x0 = max(0.0, min(float(left), W - 1))
    y0 = max(0.0, min(float(H - top), H - 1))
    x1 = max(x0 + 1.0, min(float(right), W))
    y1 = max(y0 + 1.0, min(float(H - bott), H))

    try:
        crop = page_img.crop((int(x0), int(y0), int(x1), int(y1))).convert("RGB")
    except Exception:
        return nothing

    return crop, [(max(page_no - 1, 0), x0, x1, y0, y1)]
|
|
307
|
+
|
|
308
|
+
def _transfer_to_tables(self, doc, has_bbox: bool = True):
|
|
309
|
+
"""
|
|
310
|
+
Transfer document tables and pictures to tables format.
|
|
311
|
+
|
|
312
|
+
Args:
|
|
313
|
+
doc: Docling document object
|
|
314
|
+
has_bbox: Whether the document format supports bbox
|
|
315
|
+
"""
|
|
316
|
+
tables = []
|
|
317
|
+
for tab in getattr(doc, "tables", []):
|
|
318
|
+
img = None
|
|
319
|
+
positions = ""
|
|
320
|
+
if has_bbox and getattr(tab, "prov", None):
|
|
321
|
+
pn = getattr(tab.prov[0], "page_no", None)
|
|
322
|
+
bb = getattr(tab.prov[0], "bbox", None)
|
|
323
|
+
if pn is not None and bb is not None:
|
|
324
|
+
left = getattr(bb, "l", None)
|
|
325
|
+
top = getattr(bb, "t", None)
|
|
326
|
+
right = getattr(bb, "r", None)
|
|
327
|
+
bott = getattr(bb, "b", None)
|
|
328
|
+
if None not in (left, top, right, bott):
|
|
329
|
+
img, positions = self.cropout_docling_table(int(pn), (float(left), float(top), float(right), float(bott)))
|
|
330
|
+
html = ""
|
|
331
|
+
try:
|
|
332
|
+
html = tab.export_to_html(doc=doc)
|
|
333
|
+
except Exception:
|
|
334
|
+
pass
|
|
335
|
+
tables.append(((img, html), positions if positions else ""))
|
|
336
|
+
|
|
337
|
+
# Handle pictures (for PDF with bbox)
|
|
338
|
+
if has_bbox:
|
|
339
|
+
for pic in getattr(doc, "pictures", []):
|
|
340
|
+
img = None
|
|
341
|
+
positions = ""
|
|
342
|
+
if getattr(pic, "prov", None):
|
|
343
|
+
pn = getattr(pic.prov[0], "page_no", None)
|
|
344
|
+
bb = getattr(pic.prov[0], "bbox", None)
|
|
345
|
+
if pn is not None and bb is not None:
|
|
346
|
+
left = getattr(bb, "l", None)
|
|
347
|
+
top = getattr(bb, "t", None)
|
|
348
|
+
right = getattr(bb, "r", None)
|
|
349
|
+
bott = getattr(bb, "b", None)
|
|
350
|
+
if None not in (left, top, right, bott):
|
|
351
|
+
img, positions = self.cropout_docling_table(int(pn), (float(left), float(top), float(right), float(bott)))
|
|
352
|
+
captions = ""
|
|
353
|
+
try:
|
|
354
|
+
captions = pic.caption_text(doc=doc)
|
|
355
|
+
except Exception:
|
|
356
|
+
pass
|
|
357
|
+
tables.append(((img, [captions]), positions if positions else ""))
|
|
358
|
+
|
|
359
|
+
return tables
|
|
360
|
+
|
|
361
|
+
def _extract_image_from_data_uri(self, data_uri: str) -> Optional[Image.Image]:
|
|
362
|
+
"""
|
|
363
|
+
Extract PIL Image from base64 data URI.
|
|
364
|
+
|
|
365
|
+
Args:
|
|
366
|
+
data_uri: Data URI string (e.g., "data:image/png;base64,...")
|
|
367
|
+
|
|
368
|
+
Returns:
|
|
369
|
+
PIL Image object or None if extraction fails
|
|
370
|
+
"""
|
|
371
|
+
try:
|
|
372
|
+
# Parse data URI: data:image/png;base64,<base64_data>
|
|
373
|
+
if not data_uri.startswith("data:"):
|
|
374
|
+
return None
|
|
375
|
+
|
|
376
|
+
# Extract base64 part
|
|
377
|
+
if "," in data_uri:
|
|
378
|
+
base64_data = data_uri.split(",", 1)[1]
|
|
379
|
+
else:
|
|
380
|
+
return None
|
|
381
|
+
|
|
382
|
+
# Decode base64
|
|
383
|
+
image_data = base64.b64decode(base64_data)
|
|
384
|
+
|
|
385
|
+
# Create PIL Image
|
|
386
|
+
img = Image.open(BytesIO(image_data))
|
|
387
|
+
return img.convert("RGB")
|
|
388
|
+
except Exception as e:
|
|
389
|
+
self.logger.warning(f"[Docling] Failed to extract image from data URI: {e}")
|
|
390
|
+
return None
|
|
391
|
+
|
|
392
|
+
def _find_element_caption(
|
|
393
|
+
self, doc, element, element_type: str, element_idx: int, caption_keywords: list[str]
|
|
394
|
+
) -> str:
|
|
395
|
+
"""
|
|
396
|
+
Find caption for an element (picture or table) by checking document structure.
|
|
397
|
+
|
|
398
|
+
For DOCX, captions are text items that follow the element in the parent's children list.
|
|
399
|
+
|
|
400
|
+
Args:
|
|
401
|
+
doc: Docling document object
|
|
402
|
+
element: The element object (picture or table)
|
|
403
|
+
element_type: Type of element ("picture" or "table")
|
|
404
|
+
element_idx: Index of the element in doc.pictures or doc.tables
|
|
405
|
+
caption_keywords: List of keywords to identify captions (e.g., ["图表", "figure"] for pictures)
|
|
406
|
+
|
|
407
|
+
Returns:
|
|
408
|
+
Caption text or empty string
|
|
409
|
+
"""
|
|
410
|
+
try:
|
|
411
|
+
if not hasattr(element, "parent") or not element.parent:
|
|
412
|
+
return ""
|
|
413
|
+
|
|
414
|
+
parent_ref = str(element.parent.cref) if hasattr(element.parent, "cref") else ""
|
|
415
|
+
if not parent_ref or not parent_ref.startswith("#/texts/"):
|
|
416
|
+
return ""
|
|
417
|
+
|
|
418
|
+
# Find parent text item
|
|
419
|
+
parent_idx = int(parent_ref.split("/")[-1])
|
|
420
|
+
if parent_idx >= len(doc.texts):
|
|
421
|
+
return ""
|
|
422
|
+
|
|
423
|
+
parent_text = doc.texts[parent_idx]
|
|
424
|
+
if not hasattr(parent_text, "children") or not parent_text.children:
|
|
425
|
+
return ""
|
|
426
|
+
|
|
427
|
+
# Find element in children list
|
|
428
|
+
element_ref = f"#/{element_type}s/{element_idx}"
|
|
429
|
+
element_idx_in_children = None
|
|
430
|
+
for idx, child in enumerate(parent_text.children):
|
|
431
|
+
child_ref = str(child.cref) if hasattr(child, "cref") else ""
|
|
432
|
+
if child_ref == element_ref:
|
|
433
|
+
element_idx_in_children = idx
|
|
434
|
+
break
|
|
435
|
+
|
|
436
|
+
if element_idx_in_children is None:
|
|
437
|
+
return ""
|
|
438
|
+
|
|
439
|
+
# Check next item after element (potential caption)
|
|
440
|
+
if element_idx_in_children + 1 < len(parent_text.children):
|
|
441
|
+
next_child = parent_text.children[element_idx_in_children + 1]
|
|
442
|
+
next_ref = str(next_child.cref) if hasattr(next_child, "cref") else ""
|
|
443
|
+
|
|
444
|
+
if next_ref.startswith("#/texts/"):
|
|
445
|
+
text_idx = int(next_ref.split("/")[-1])
|
|
446
|
+
if text_idx < len(doc.texts):
|
|
447
|
+
caption_text = doc.texts[text_idx]
|
|
448
|
+
text = getattr(caption_text, "text", "") or getattr(caption_text, "orig", "")
|
|
449
|
+
# Check if it looks like a caption based on keywords
|
|
450
|
+
if text and any(keyword in text.lower() for keyword in caption_keywords):
|
|
451
|
+
return text.strip()
|
|
452
|
+
|
|
453
|
+
return ""
|
|
454
|
+
except Exception as e:
|
|
455
|
+
self.logger.warning(f"[Docling] Failed to find {element_type} caption: {e}")
|
|
456
|
+
return ""
|
|
457
|
+
|
|
458
|
+
def _find_picture_caption(self, doc, picture_idx: int) -> str:
|
|
459
|
+
"""
|
|
460
|
+
Find caption for a picture by checking document structure.
|
|
461
|
+
|
|
462
|
+
For DOCX, captions are not directly in PictureItem.captions,
|
|
463
|
+
but are text items that follow the picture in the parent's children list.
|
|
464
|
+
|
|
465
|
+
Args:
|
|
466
|
+
doc: Docling document object
|
|
467
|
+
picture_idx: Index of the picture in doc.pictures
|
|
468
|
+
|
|
469
|
+
Returns:
|
|
470
|
+
Caption text or empty string
|
|
471
|
+
"""
|
|
472
|
+
try:
|
|
473
|
+
pic = doc.pictures[picture_idx]
|
|
474
|
+
return self._find_element_caption(
|
|
475
|
+
doc, pic, "picture", picture_idx, ["图表", "figure", "图", "fig"]
|
|
476
|
+
)
|
|
477
|
+
except Exception as e:
|
|
478
|
+
self.logger.warning(f"[Docling] Failed to find picture caption: {e}")
|
|
479
|
+
return ""
|
|
480
|
+
|
|
481
|
+
def _find_table_caption(self, doc, table_idx: int) -> str:
|
|
482
|
+
"""
|
|
483
|
+
Find caption for a table by checking document structure.
|
|
484
|
+
|
|
485
|
+
For DOCX, captions are text items that follow the table in the document structure.
|
|
486
|
+
Similar to picture captions, but we also check for "Table" keywords.
|
|
487
|
+
|
|
488
|
+
Args:
|
|
489
|
+
doc: Docling document object
|
|
490
|
+
table_idx: Index of the table in doc.tables
|
|
491
|
+
|
|
492
|
+
Returns:
|
|
493
|
+
Caption text or empty string
|
|
494
|
+
"""
|
|
495
|
+
try:
|
|
496
|
+
tab = doc.tables[table_idx]
|
|
497
|
+
return self._find_element_caption(
|
|
498
|
+
doc, tab, "table", table_idx, ["表", "table", "表格"]
|
|
499
|
+
)
|
|
500
|
+
except Exception as e:
|
|
501
|
+
self.logger.warning(f"[Docling] Failed to find table caption: {e}")
|
|
502
|
+
return ""
|
|
503
|
+
|
|
504
|
+
def _transfer_to_tables_docx(self, doc) -> list[tuple[tuple, str]]:
|
|
505
|
+
"""
|
|
506
|
+
Transfer DOCX document tables and pictures to tables format.
|
|
507
|
+
DOCX doesn't have bbox, so we handle pictures differently.
|
|
508
|
+
|
|
509
|
+
Args:
|
|
510
|
+
doc: Docling document object
|
|
511
|
+
|
|
512
|
+
Returns:
|
|
513
|
+
List of ((image, html_or_captions), positions) tuples
|
|
514
|
+
"""
|
|
515
|
+
tables = []
|
|
516
|
+
|
|
517
|
+
# Handle tables
|
|
518
|
+
for idx, tab in enumerate(getattr(doc, "tables", [])):
|
|
519
|
+
html = ""
|
|
520
|
+
try:
|
|
521
|
+
html = tab.export_to_html(doc=doc)
|
|
522
|
+
except Exception:
|
|
523
|
+
pass
|
|
524
|
+
|
|
525
|
+
# Find table caption through document structure
|
|
526
|
+
caption = self._find_table_caption(doc, idx)
|
|
527
|
+
|
|
528
|
+
# Also try direct caption_text method (might work for some cases)
|
|
529
|
+
if not caption:
|
|
530
|
+
try:
|
|
531
|
+
caption = tab.caption_text(doc=doc) if hasattr(tab, "caption_text") else ""
|
|
532
|
+
except Exception:
|
|
533
|
+
pass
|
|
534
|
+
|
|
535
|
+
# DOCX tables don't have bbox, so no image or positions
|
|
536
|
+
# Format: ((None, html_or_captions), positions)
|
|
537
|
+
# For tables with caption, we store as dict: {"caption": caption, "html": html}
|
|
538
|
+
# For tables without caption, we store as string: html
|
|
539
|
+
if caption:
|
|
540
|
+
# Store caption and html together in a dict format
|
|
541
|
+
table_data = {"caption": caption, "html": html}
|
|
542
|
+
tables.append(((None, table_data), ""))
|
|
543
|
+
else:
|
|
544
|
+
tables.append(((None, html), ""))
|
|
545
|
+
|
|
546
|
+
# Handle pictures
|
|
547
|
+
for idx, pic in enumerate(getattr(doc, "pictures", [])):
|
|
548
|
+
img = None
|
|
549
|
+
captions = ""
|
|
550
|
+
|
|
551
|
+
# Extract image from data URI
|
|
552
|
+
if hasattr(pic, "image") and pic.image:
|
|
553
|
+
if hasattr(pic.image, "uri"):
|
|
554
|
+
data_uri = str(pic.image.uri)
|
|
555
|
+
img = self._extract_image_from_data_uri(data_uri)
|
|
556
|
+
|
|
557
|
+
# Find caption through document structure
|
|
558
|
+
caption = self._find_picture_caption(doc, idx)
|
|
559
|
+
if caption:
|
|
560
|
+
captions = caption
|
|
561
|
+
|
|
562
|
+
# Also try direct caption_text method (might work for some cases)
|
|
563
|
+
if not captions:
|
|
564
|
+
try:
|
|
565
|
+
captions = pic.caption_text(doc=doc)
|
|
566
|
+
except Exception:
|
|
567
|
+
pass
|
|
568
|
+
|
|
569
|
+
# DOCX pictures don't have bbox positions
|
|
570
|
+
tables.append(((img, [captions] if captions else []), ""))
|
|
571
|
+
|
|
572
|
+
return tables
|
|
573
|
+
|
|
574
|
+
def parse_pdf(
    self,
    filepath: str | PathLike[str],
    binary: BytesIO | bytes | None = None,
    callback: Optional[Callable] = None,
    *,
    output_dir: Optional[str] = None,
    lang: Optional[str] = None,
    method: str = "auto",
    delete_output: bool = True,
    parse_method: str = "raw"
):
    """
    Parse a PDF file using Docling.

    Args:
        filepath: Path to the PDF; when binary is given, only its file name
            is used to name the temporary copy ("input.pdf" if empty/None)
        binary: Optional binary content of the file (bytes or BytesIO)
        callback: Optional progress callback, called as callback(progress, message)
        output_dir: Directory for the temporary copy (default: "./.docling_tmp")
        lang: Accepted for signature compatibility; not used here
        method: Accepted for signature compatibility; not used here
        delete_output: Whether to delete the temporary copy afterwards
        parse_method: Output format ("raw", "manual", "paper")

    Returns:
        Tuple of (sections, tables) where:
        - sections: List of (text, tag) or (text, type, tag) tuples
        - tables: List of ((image, html_or_captions), positions) tuples

    Raises:
        RuntimeError: If Docling is not available.
        FileNotFoundError: If no binary is given and filepath does not exist.
    """
    if not self.check_installation():
        raise RuntimeError("Docling not available, please install `docling`")

    temp_path = None
    if binary is not None:
        tmpdir = Path(output_dir) if output_dir else Path.cwd() / ".docling_tmp"
        tmpdir.mkdir(parents=True, exist_ok=True)
        # Guard against filepath=None, as parse_docx does.
        name = (Path(filepath).name if filepath else "") or "input.pdf"
        temp_path = tmpdir / name
        src_path = temp_path
    else:
        src_path = Path(filepath)
        if not src_path.exists():
            raise FileNotFoundError(f"PDF not found: {src_path}")

    try:
        if temp_path is not None:
            with open(temp_path, "wb") as f:
                if isinstance(binary, (bytes, bytearray)):
                    f.write(binary)
                else:
                    f.write(binary.getbuffer())

        if callback:
            callback(0.1, f"[Docling] Converting: {src_path}")

        # Page bitmaps are only needed for cropping; a render failure must
        # not prevent the conversion itself.
        try:
            self.__images__(str(src_path), zoomin=1)
        except Exception as e:
            self.logger.warning(f"[Docling] render pages failed: {e}")

        conv = DocumentConverter()
        conv_res = conv.convert(str(src_path))
        doc = conv_res.document
        if callback:
            callback(0.7, f"[Docling] Parsed doc: {getattr(doc, 'num_pages', 'n/a')} pages")

        sections = self._transfer_to_sections(doc, parse_method=parse_method, has_bbox=True)
        tables = self._transfer_to_tables(doc, has_bbox=True)

        if callback:
            callback(0.95, f"[Docling] Sections: {len(sections)}, Tables: {len(tables)}")
    finally:
        # Remove the temporary copy even when conversion fails, so failed
        # runs do not leave files behind.
        if temp_path is not None and delete_output:
            try:
                temp_path.unlink(missing_ok=True)
            except OSError:
                # Best-effort cleanup; never mask the original error.
                pass

    if callback:
        callback(1.0, "[Docling] Done.")
    return sections, tables
|
|
635
|
+
|
|
636
|
+
def parse_docx(
    self,
    filepath: str | PathLike[str],
    binary: BytesIO | bytes | None = None,
    callback: Optional[Callable] = None,
    *,
    output_dir: Optional[str] = None,
    lang: Optional[str] = None,
    method: str = "auto",
    delete_output: bool = True,
    parse_method: str = "raw"
):
    """
    Parse DOCX file using Docling.

    Args:
        filepath: Path to DOCX file; when binary is given, only its file
            name is used to name the temporary copy
        binary: Optional binary content of the file
        callback: Optional progress callback function
        output_dir: Optional temporary output directory (default: "./.docling_tmp")
        lang: Optional language hint (not used for DOCX)
        method: Parsing method (not used for DOCX)
        delete_output: Whether to delete temporary files
        parse_method: Output format ("raw", "manual", "paper")

    Returns:
        Tuple of (sections, tables) where:
        - sections: List of (text, tag) or (text, type, tag) tuples
        - tables: List of ((image, html_or_captions), positions) tuples

    Raises:
        RuntimeError: If Docling is not available.
        FileNotFoundError: If no binary is given and filepath does not exist.
    """
    if not self.check_installation():
        raise RuntimeError("Docling not available, please install `docling`")

    temp_path = None
    if binary is not None:
        tmpdir = Path(output_dir) if output_dir else Path.cwd() / ".docling_tmp"
        tmpdir.mkdir(parents=True, exist_ok=True)
        name = Path(filepath).name if filepath else "input.docx"
        # Make sure the temporary copy carries the .docx extension.
        if not name.endswith(".docx"):
            name = name + ".docx"
        temp_path = tmpdir / name
        src_path = temp_path
    else:
        src_path = Path(filepath)
        if not src_path.exists():
            raise FileNotFoundError(f"DOCX not found: {src_path}")

    try:
        if temp_path is not None:
            with open(temp_path, "wb") as f:
                if isinstance(binary, (bytes, bytearray)):
                    f.write(binary)
                else:
                    f.write(binary.getbuffer())

        if callback:
            callback(0.1, f"[Docling] Converting DOCX: {src_path}")

        try:
            conv = DocumentConverter()
            conv_res = conv.convert(str(src_path))
            doc = conv_res.document
        except Exception as e:
            self.logger.error(f"[Docling] Failed to convert DOCX: {e}")
            raise

        if callback:
            callback(0.5, f"[Docling] Parsed DOCX: {len(getattr(doc, 'texts', []))} text items")

        # DOCX doesn't have bbox, so use has_bbox=False
        sections = self._transfer_to_sections(doc, parse_method=parse_method, has_bbox=False)
        tables = self._transfer_to_tables_docx(doc)

        if callback:
            callback(0.9, f"[Docling] Sections: {len(sections)}, Tables: {len(tables)}")
    finally:
        # Remove the temporary copy even when conversion fails, so failed
        # runs do not leave files behind.
        if temp_path is not None and delete_output:
            try:
                temp_path.unlink(missing_ok=True)
            except OSError:
                # Best-effort cleanup; never mask the original error.
                pass

    if callback:
        callback(1.0, "[Docling] Done.")
    return sections, tables
|
|
717
|
+
|
|
718
|
+
def parse_pptx(
|
|
719
|
+
self,
|
|
720
|
+
filepath: str | PathLike[str],
|
|
721
|
+
binary: BytesIO | bytes | None = None,
|
|
722
|
+
callback: Optional[Callable] = None,
|
|
723
|
+
*,
|
|
724
|
+
output_dir: Optional[str] = None,
|
|
725
|
+
lang: Optional[str] = None,
|
|
726
|
+
method: str = "auto",
|
|
727
|
+
delete_output: bool = True,
|
|
728
|
+
parse_method: str = "raw"
|
|
729
|
+
):
|
|
730
|
+
"""
|
|
731
|
+
Parse PPTX file using Docling (preliminary support).
|
|
732
|
+
|
|
733
|
+
Args:
|
|
734
|
+
filepath: Path to PPTX file
|
|
735
|
+
binary: Optional binary content of the file
|
|
736
|
+
callback: Optional progress callback function
|
|
737
|
+
output_dir: Optional temporary output directory
|
|
738
|
+
lang: Optional language hint (not used for PPTX)
|
|
739
|
+
method: Parsing method (not used for PPTX)
|
|
740
|
+
delete_output: Whether to delete temporary files
|
|
741
|
+
parse_method: Output format ("raw", "manual", "paper")
|
|
742
|
+
|
|
743
|
+
Returns:
|
|
744
|
+
Tuple of (sections, tables) where:
|
|
745
|
+
- sections: List of (text, tag) or (text, type, tag) tuples
|
|
746
|
+
- tables: List of ((image, html_or_captions), positions) tuples
|
|
747
|
+
"""
|
|
748
|
+
if not self.check_installation():
|
|
749
|
+
raise RuntimeError("Docling not available, please install `docling`")
|
|
750
|
+
|
|
751
|
+
if binary is not None:
|
|
752
|
+
tmpdir = Path(output_dir) if output_dir else Path.cwd() / ".docling_tmp"
|
|
753
|
+
tmpdir.mkdir(parents=True, exist_ok=True)
|
|
754
|
+
name = Path(filepath).name if filepath else "input.pptx"
|
|
755
|
+
if not name.endswith(".pptx"):
|
|
756
|
+
name = name + ".pptx"
|
|
757
|
+
tmp_pptx = tmpdir / name
|
|
758
|
+
with open(tmp_pptx, "wb") as f:
|
|
759
|
+
if isinstance(binary, (bytes, bytearray)):
|
|
760
|
+
f.write(binary)
|
|
761
|
+
else:
|
|
762
|
+
f.write(binary.getbuffer())
|
|
763
|
+
src_path = tmp_pptx
|
|
764
|
+
else:
|
|
765
|
+
src_path = Path(filepath)
|
|
766
|
+
if not src_path.exists():
|
|
767
|
+
raise FileNotFoundError(f"PPTX not found: {src_path}")
|
|
768
|
+
|
|
769
|
+
if callback:
|
|
770
|
+
callback(0.1, f"[Docling] Converting PPTX: {src_path}")
|
|
771
|
+
|
|
772
|
+
try:
|
|
773
|
+
conv = DocumentConverter()
|
|
774
|
+
conv_res = conv.convert(str(src_path))
|
|
775
|
+
doc = conv_res.document
|
|
776
|
+
except Exception as e:
|
|
777
|
+
self.logger.error(f"[Docling] Failed to convert PPTX: {e}")
|
|
778
|
+
raise
|
|
779
|
+
|
|
780
|
+
if callback:
|
|
781
|
+
callback(0.5, f"[Docling] Parsed PPTX: {len(getattr(doc, 'texts', []))} text items")
|
|
782
|
+
|
|
783
|
+
# PPTX doesn't have bbox like DOCX
|
|
784
|
+
sections = self._transfer_to_sections(doc, parse_method=parse_method, has_bbox=False)
|
|
785
|
+
tables = self._transfer_to_tables_docx(doc)
|
|
786
|
+
|
|
787
|
+
if callback:
|
|
788
|
+
callback(0.9, f"[Docling] Sections: {len(sections)}, Tables: {len(tables)}")
|
|
789
|
+
|
|
790
|
+
if binary is not None and delete_output:
|
|
791
|
+
try:
|
|
792
|
+
Path(src_path).unlink(missing_ok=True)
|
|
793
|
+
except Exception:
|
|
794
|
+
pass
|
|
795
|
+
|
|
796
|
+
if callback:
|
|
797
|
+
callback(1.0, "[Docling] Done.")
|
|
798
|
+
return sections, tables
|
|
799
|
+
|
|
800
|
+
def parse_xlsx(
|
|
801
|
+
self,
|
|
802
|
+
filepath: str | PathLike[str],
|
|
803
|
+
binary: BytesIO | bytes | None = None,
|
|
804
|
+
callback: Optional[Callable] = None,
|
|
805
|
+
*,
|
|
806
|
+
output_dir: Optional[str] = None,
|
|
807
|
+
lang: Optional[str] = None,
|
|
808
|
+
method: str = "auto",
|
|
809
|
+
delete_output: bool = True,
|
|
810
|
+
parse_method: str = "raw"
|
|
811
|
+
):
|
|
812
|
+
"""
|
|
813
|
+
Parse XLSX file using Docling (preliminary support).
|
|
814
|
+
|
|
815
|
+
Args:
|
|
816
|
+
filepath: Path to XLSX file
|
|
817
|
+
binary: Optional binary content of the file
|
|
818
|
+
callback: Optional progress callback function
|
|
819
|
+
output_dir: Optional temporary output directory
|
|
820
|
+
lang: Optional language hint (not used for XLSX)
|
|
821
|
+
method: Parsing method (not used for XLSX)
|
|
822
|
+
delete_output: Whether to delete temporary files
|
|
823
|
+
parse_method: Output format ("raw", "manual", "paper")
|
|
824
|
+
|
|
825
|
+
Returns:
|
|
826
|
+
Tuple of (sections, tables) where:
|
|
827
|
+
- sections: List of (text, tag) or (text, type, tag) tuples (usually empty for XLSX)
|
|
828
|
+
- tables: List of ((image, html), positions) tuples
|
|
829
|
+
"""
|
|
830
|
+
if not self.check_installation():
|
|
831
|
+
raise RuntimeError("Docling not available, please install `docling`")
|
|
832
|
+
|
|
833
|
+
if binary is not None:
|
|
834
|
+
tmpdir = Path(output_dir) if output_dir else Path.cwd() / ".docling_tmp"
|
|
835
|
+
tmpdir.mkdir(parents=True, exist_ok=True)
|
|
836
|
+
name = Path(filepath).name if filepath else "input.xlsx"
|
|
837
|
+
if not name.endswith(".xlsx"):
|
|
838
|
+
name = name + ".xlsx"
|
|
839
|
+
tmp_xlsx = tmpdir / name
|
|
840
|
+
with open(tmp_xlsx, "wb") as f:
|
|
841
|
+
if isinstance(binary, (bytes, bytearray)):
|
|
842
|
+
f.write(binary)
|
|
843
|
+
else:
|
|
844
|
+
f.write(binary.getbuffer())
|
|
845
|
+
src_path = tmp_xlsx
|
|
846
|
+
else:
|
|
847
|
+
src_path = Path(filepath)
|
|
848
|
+
if not src_path.exists():
|
|
849
|
+
raise FileNotFoundError(f"XLSX not found: {src_path}")
|
|
850
|
+
|
|
851
|
+
if callback:
|
|
852
|
+
callback(0.1, f"[Docling] Converting XLSX: {src_path}")
|
|
853
|
+
|
|
854
|
+
try:
|
|
855
|
+
conv = DocumentConverter()
|
|
856
|
+
conv_res = conv.convert(str(src_path))
|
|
857
|
+
doc = conv_res.document
|
|
858
|
+
except Exception as e:
|
|
859
|
+
self.logger.error(f"[Docling] Failed to convert XLSX: {e}")
|
|
860
|
+
raise
|
|
861
|
+
|
|
862
|
+
if callback:
|
|
863
|
+
callback(0.5, f"[Docling] Parsed XLSX: {len(getattr(doc, 'tables', []))} tables")
|
|
864
|
+
|
|
865
|
+
# XLSX is primarily tables, minimal text sections
|
|
866
|
+
sections = self._transfer_to_sections(doc, parse_method=parse_method, has_bbox=False)
|
|
867
|
+
# Use DOCX table handler (no bbox, similar structure)
|
|
868
|
+
tables = self._transfer_to_tables_docx(doc)
|
|
869
|
+
|
|
870
|
+
if callback:
|
|
871
|
+
callback(0.9, f"[Docling] Sections: {len(sections)}, Tables: {len(tables)}")
|
|
872
|
+
|
|
873
|
+
if binary is not None and delete_output:
|
|
874
|
+
try:
|
|
875
|
+
Path(src_path).unlink(missing_ok=True)
|
|
876
|
+
except Exception:
|
|
877
|
+
pass
|
|
878
|
+
|
|
879
|
+
if callback:
|
|
880
|
+
callback(1.0, "[Docling] Done.")
|
|
881
|
+
return sections, tables
|
|
882
|
+
|
|
883
|
+
|
|
884
|
+
if __name__ == "__main__":
    # Manual smoke test: report whether Docling is installed, then parse a
    # sample PDF from disk and print how many sections and tables came back.
    logging.basicConfig(level=logging.INFO)
    docling_parser = DoclingParser()
    print("Docling available:", docling_parser.check_installation())
    parsed_sections, parsed_tables = docling_parser.parse_pdf(
        filepath="test_docling/toc.pdf",
        binary=None,
    )
    print(len(parsed_sections), len(parsed_tables))
|