docling 2.17.0__py3-none-any.whl → 2.18.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/html_backend.py +18 -18
- docling/backend/md_backend.py +88 -35
- docling/backend/mspowerpoint_backend.py +39 -27
- docling/backend/msword_backend.py +172 -130
- docling/datamodel/document.py +2 -0
- docling/datamodel/settings.py +16 -1
- docling/document_converter.py +12 -2
- docling/models/table_structure_model.py +9 -5
- docling/pipeline/base_pipeline.py +3 -1
- docling/utils/glm_utils.py +4 -0
- {docling-2.17.0.dist-info → docling-2.18.0.dist-info}/METADATA +8 -3
- {docling-2.17.0.dist-info → docling-2.18.0.dist-info}/RECORD +15 -15
- {docling-2.17.0.dist-info → docling-2.18.0.dist-info}/WHEEL +1 -1
- {docling-2.17.0.dist-info → docling-2.18.0.dist-info}/LICENSE +0 -0
- {docling-2.17.0.dist-info → docling-2.18.0.dist-info}/entry_points.txt +0 -0
@@ -2,21 +2,28 @@ import logging
|
|
2
2
|
import re
|
3
3
|
from io import BytesIO
|
4
4
|
from pathlib import Path
|
5
|
-
from typing import
|
5
|
+
from typing import Any, Optional, Union
|
6
6
|
|
7
|
-
import docx
|
8
7
|
from docling_core.types.doc import (
|
9
8
|
DocItemLabel,
|
10
9
|
DoclingDocument,
|
11
10
|
DocumentOrigin,
|
12
11
|
GroupLabel,
|
13
12
|
ImageRef,
|
13
|
+
NodeItem,
|
14
14
|
TableCell,
|
15
15
|
TableData,
|
16
16
|
)
|
17
|
+
from docx import Document
|
18
|
+
from docx.document import Document as DocxDocument
|
19
|
+
from docx.oxml.table import CT_Tc
|
20
|
+
from docx.oxml.xmlchemy import BaseOxmlElement
|
21
|
+
from docx.table import Table, _Cell
|
22
|
+
from docx.text.paragraph import Paragraph
|
17
23
|
from lxml import etree
|
18
24
|
from lxml.etree import XPath
|
19
25
|
from PIL import Image, UnidentifiedImageError
|
26
|
+
from typing_extensions import override
|
20
27
|
|
21
28
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
22
29
|
from docling.datamodel.base_models import InputFormat
|
@@ -26,7 +33,10 @@ _log = logging.getLogger(__name__)
|
|
26
33
|
|
27
34
|
|
28
35
|
class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
29
|
-
|
36
|
+
@override
|
37
|
+
def __init__(
|
38
|
+
self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
|
39
|
+
) -> None:
|
30
40
|
super().__init__(in_doc, path_or_stream)
|
31
41
|
self.XML_KEY = (
|
32
42
|
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
|
@@ -36,19 +46,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
36
46
|
}
|
37
47
|
# self.initialise(path_or_stream)
|
38
48
|
# Word file:
|
39
|
-
self.path_or_stream = path_or_stream
|
40
|
-
self.valid = False
|
49
|
+
self.path_or_stream: Union[BytesIO, Path] = path_or_stream
|
50
|
+
self.valid: bool = False
|
41
51
|
# Initialise the parents for the hierarchy
|
42
|
-
self.max_levels = 10
|
43
|
-
self.level_at_new_list = None
|
44
|
-
self.parents = {}
|
52
|
+
self.max_levels: int = 10
|
53
|
+
self.level_at_new_list: Optional[int] = None
|
54
|
+
self.parents: dict[int, Optional[NodeItem]] = {}
|
45
55
|
for i in range(-1, self.max_levels):
|
46
56
|
self.parents[i] = None
|
47
57
|
|
48
58
|
self.level = 0
|
49
59
|
self.listIter = 0
|
50
60
|
|
51
|
-
self.history = {
|
61
|
+
self.history: dict[str, Any] = {
|
52
62
|
"names": [None],
|
53
63
|
"levels": [None],
|
54
64
|
"numids": [None],
|
@@ -58,9 +68,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
58
68
|
self.docx_obj = None
|
59
69
|
try:
|
60
70
|
if isinstance(self.path_or_stream, BytesIO):
|
61
|
-
self.docx_obj =
|
71
|
+
self.docx_obj = Document(self.path_or_stream)
|
62
72
|
elif isinstance(self.path_or_stream, Path):
|
63
|
-
self.docx_obj =
|
73
|
+
self.docx_obj = Document(str(self.path_or_stream))
|
64
74
|
|
65
75
|
self.valid = True
|
66
76
|
except Exception as e:
|
@@ -68,13 +78,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
68
78
|
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
|
69
79
|
) from e
|
70
80
|
|
81
|
+
@override
|
71
82
|
def is_valid(self) -> bool:
|
72
83
|
return self.valid
|
73
84
|
|
74
85
|
@classmethod
|
86
|
+
@override
|
75
87
|
def supports_pagination(cls) -> bool:
|
76
88
|
return False
|
77
89
|
|
90
|
+
@override
|
78
91
|
def unload(self):
|
79
92
|
if isinstance(self.path_or_stream, BytesIO):
|
80
93
|
self.path_or_stream.close()
|
@@ -82,11 +95,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
82
95
|
self.path_or_stream = None
|
83
96
|
|
84
97
|
@classmethod
|
85
|
-
|
98
|
+
@override
|
99
|
+
def supported_formats(cls) -> set[InputFormat]:
|
86
100
|
return {InputFormat.DOCX}
|
87
101
|
|
102
|
+
@override
|
88
103
|
def convert(self) -> DoclingDocument:
|
89
|
-
|
104
|
+
"""Parses the DOCX into a structured document model.
|
105
|
+
|
106
|
+
Returns:
|
107
|
+
The parsed document.
|
108
|
+
"""
|
90
109
|
|
91
110
|
origin = DocumentOrigin(
|
92
111
|
filename=self.file.name or "file",
|
@@ -104,23 +123,29 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
104
123
|
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
105
124
|
)
|
106
125
|
|
107
|
-
def update_history(
|
126
|
+
def update_history(
|
127
|
+
self,
|
128
|
+
name: str,
|
129
|
+
level: Optional[int],
|
130
|
+
numid: Optional[int],
|
131
|
+
ilevel: Optional[int],
|
132
|
+
):
|
108
133
|
self.history["names"].append(name)
|
109
134
|
self.history["levels"].append(level)
|
110
135
|
|
111
136
|
self.history["numids"].append(numid)
|
112
137
|
self.history["indents"].append(ilevel)
|
113
138
|
|
114
|
-
def prev_name(self):
|
139
|
+
def prev_name(self) -> Optional[str]:
|
115
140
|
return self.history["names"][-1]
|
116
141
|
|
117
|
-
def prev_level(self):
|
142
|
+
def prev_level(self) -> Optional[int]:
|
118
143
|
return self.history["levels"][-1]
|
119
144
|
|
120
|
-
def prev_numid(self):
|
145
|
+
def prev_numid(self) -> Optional[int]:
|
121
146
|
return self.history["numids"][-1]
|
122
147
|
|
123
|
-
def prev_indent(self):
|
148
|
+
def prev_indent(self) -> Optional[int]:
|
124
149
|
return self.history["indents"][-1]
|
125
150
|
|
126
151
|
def get_level(self) -> int:
|
@@ -130,13 +155,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
130
155
|
return k
|
131
156
|
return 0
|
132
157
|
|
133
|
-
def walk_linear(
|
158
|
+
def walk_linear(
|
159
|
+
self,
|
160
|
+
body: BaseOxmlElement,
|
161
|
+
docx_obj: DocxDocument,
|
162
|
+
doc: DoclingDocument,
|
163
|
+
) -> DoclingDocument:
|
134
164
|
for element in body:
|
135
165
|
tag_name = etree.QName(element).localname
|
136
166
|
# Check for Inline Images (blip elements)
|
137
167
|
namespaces = {
|
138
168
|
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
139
169
|
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
|
170
|
+
"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
|
140
171
|
}
|
141
172
|
xpath_expr = XPath(".//a:blip", namespaces=namespaces)
|
142
173
|
drawing_blip = xpath_expr(element)
|
@@ -149,7 +180,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
149
180
|
_log.debug("could not parse a table, broken docx table")
|
150
181
|
|
151
182
|
elif drawing_blip:
|
152
|
-
self.handle_pictures(
|
183
|
+
self.handle_pictures(docx_obj, drawing_blip, doc)
|
184
|
+
# Check for the sdt containers, like table of contents
|
185
|
+
elif tag_name in ["sdt"]:
|
186
|
+
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
187
|
+
if sdt_content is not None:
|
188
|
+
# Iterate paragraphs, runs, or text inside <w:sdtContent>.
|
189
|
+
paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
|
190
|
+
for p in paragraphs:
|
191
|
+
self.handle_text_elements(p, docx_obj, doc)
|
153
192
|
# Check for Text
|
154
193
|
elif tag_name in ["p"]:
|
155
194
|
# "tcPr", "sectPr"
|
@@ -158,7 +197,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
158
197
|
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
159
198
|
return doc
|
160
199
|
|
161
|
-
def str_to_int(self, s, default=0):
|
200
|
+
def str_to_int(self, s: Optional[str], default: Optional[int] = 0) -> Optional[int]:
|
162
201
|
if s is None:
|
163
202
|
return None
|
164
203
|
try:
|
@@ -166,7 +205,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
166
205
|
except ValueError:
|
167
206
|
return default
|
168
207
|
|
169
|
-
def split_text_and_number(self, input_string):
|
208
|
+
def split_text_and_number(self, input_string: str) -> list[str]:
|
170
209
|
match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
|
171
210
|
if match:
|
172
211
|
parts = list(filter(None, match.groups()))
|
@@ -174,7 +213,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
174
213
|
else:
|
175
214
|
return [input_string]
|
176
215
|
|
177
|
-
def get_numId_and_ilvl(
|
216
|
+
def get_numId_and_ilvl(
|
217
|
+
self, paragraph: Paragraph
|
218
|
+
) -> tuple[Optional[int], Optional[int]]:
|
178
219
|
# Access the XML element of the paragraph
|
179
220
|
numPr = paragraph._element.find(
|
180
221
|
".//w:numPr", namespaces=paragraph._element.nsmap
|
@@ -187,13 +228,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
187
228
|
numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None
|
188
229
|
ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None
|
189
230
|
|
190
|
-
return self.str_to_int(numId,
|
191
|
-
ilvl, default=None
|
192
|
-
)
|
231
|
+
return self.str_to_int(numId, None), self.str_to_int(ilvl, None)
|
193
232
|
|
194
233
|
return None, None # If the paragraph is not part of a list
|
195
234
|
|
196
|
-
def get_label_and_level(self, paragraph):
|
235
|
+
def get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
|
197
236
|
if paragraph.style is None:
|
198
237
|
return "Normal", None
|
199
238
|
label = paragraph.style.style_id
|
@@ -209,20 +248,25 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
209
248
|
|
210
249
|
if "Heading" in label and len(parts) == 2:
|
211
250
|
parts.sort()
|
212
|
-
label_str = ""
|
213
|
-
label_level = 0
|
251
|
+
label_str: str = ""
|
252
|
+
label_level: Optional[int] = 0
|
214
253
|
if parts[0] == "Heading":
|
215
254
|
label_str = parts[0]
|
216
|
-
label_level = self.str_to_int(parts[1],
|
255
|
+
label_level = self.str_to_int(parts[1], None)
|
217
256
|
if parts[1] == "Heading":
|
218
257
|
label_str = parts[1]
|
219
|
-
label_level = self.str_to_int(parts[0],
|
258
|
+
label_level = self.str_to_int(parts[0], None)
|
220
259
|
return label_str, label_level
|
221
260
|
else:
|
222
261
|
return label, None
|
223
262
|
|
224
|
-
def handle_text_elements(
|
225
|
-
|
263
|
+
def handle_text_elements(
|
264
|
+
self,
|
265
|
+
element: BaseOxmlElement,
|
266
|
+
docx_obj: DocxDocument,
|
267
|
+
doc: DoclingDocument,
|
268
|
+
) -> None:
|
269
|
+
paragraph = Paragraph(element, docx_obj)
|
226
270
|
|
227
271
|
if paragraph.text is None:
|
228
272
|
return
|
@@ -240,13 +284,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
240
284
|
numid = None
|
241
285
|
|
242
286
|
# Handle lists
|
243
|
-
if
|
287
|
+
if (
|
288
|
+
numid is not None
|
289
|
+
and ilevel is not None
|
290
|
+
and p_style_id not in ["Title", "Heading"]
|
291
|
+
):
|
244
292
|
self.add_listitem(
|
245
|
-
element,
|
246
|
-
docx_obj,
|
247
293
|
doc,
|
248
|
-
p_style_id,
|
249
|
-
p_level,
|
250
294
|
numid,
|
251
295
|
ilevel,
|
252
296
|
text,
|
@@ -254,20 +298,30 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
254
298
|
)
|
255
299
|
self.update_history(p_style_id, p_level, numid, ilevel)
|
256
300
|
return
|
257
|
-
elif
|
258
|
-
|
259
|
-
|
301
|
+
elif (
|
302
|
+
numid is None
|
303
|
+
and self.prev_numid() is not None
|
304
|
+
and p_style_id not in ["Title", "Heading"]
|
305
|
+
): # Close list
|
306
|
+
if self.level_at_new_list:
|
307
|
+
for key in range(len(self.parents)):
|
308
|
+
if key >= self.level_at_new_list:
|
309
|
+
self.parents[key] = None
|
310
|
+
self.level = self.level_at_new_list - 1
|
311
|
+
self.level_at_new_list = None
|
312
|
+
else:
|
313
|
+
for key in range(len(self.parents)):
|
260
314
|
self.parents[key] = None
|
261
|
-
|
262
|
-
|
315
|
+
self.level = 0
|
316
|
+
|
263
317
|
if p_style_id in ["Title"]:
|
264
|
-
for key
|
318
|
+
for key in range(len(self.parents)):
|
265
319
|
self.parents[key] = None
|
266
320
|
self.parents[0] = doc.add_text(
|
267
321
|
parent=None, label=DocItemLabel.TITLE, text=text
|
268
322
|
)
|
269
323
|
elif "Heading" in p_style_id:
|
270
|
-
self.add_header(
|
324
|
+
self.add_header(doc, p_level, text)
|
271
325
|
|
272
326
|
elif p_style_id in [
|
273
327
|
"Paragraph",
|
@@ -295,7 +349,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
295
349
|
self.update_history(p_style_id, p_level, numid, ilevel)
|
296
350
|
return
|
297
351
|
|
298
|
-
def add_header(
|
352
|
+
def add_header(
|
353
|
+
self, doc: DoclingDocument, curr_level: Optional[int], text: str
|
354
|
+
) -> None:
|
299
355
|
level = self.get_level()
|
300
356
|
if isinstance(curr_level, int):
|
301
357
|
if curr_level > level:
|
@@ -308,7 +364,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
308
364
|
)
|
309
365
|
elif curr_level < level:
|
310
366
|
# remove the tail
|
311
|
-
for key
|
367
|
+
for key in range(len(self.parents)):
|
312
368
|
if key >= curr_level:
|
313
369
|
self.parents[key] = None
|
314
370
|
|
@@ -327,22 +383,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
327
383
|
|
328
384
|
def add_listitem(
|
329
385
|
self,
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
p_style_id,
|
334
|
-
p_level,
|
335
|
-
numid,
|
336
|
-
ilevel,
|
386
|
+
doc: DoclingDocument,
|
387
|
+
numid: int,
|
388
|
+
ilevel: int,
|
337
389
|
text: str,
|
338
|
-
is_numbered=False,
|
339
|
-
):
|
340
|
-
# is_numbered = is_numbered
|
390
|
+
is_numbered: bool = False,
|
391
|
+
) -> None:
|
341
392
|
enum_marker = ""
|
342
393
|
|
343
394
|
level = self.get_level()
|
395
|
+
prev_indent = self.prev_indent()
|
344
396
|
if self.prev_numid() is None: # Open new list
|
345
|
-
self.level_at_new_list = level
|
397
|
+
self.level_at_new_list = level
|
346
398
|
|
347
399
|
self.parents[level] = doc.add_group(
|
348
400
|
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
|
@@ -361,10 +413,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
361
413
|
)
|
362
414
|
|
363
415
|
elif (
|
364
|
-
self.prev_numid() == numid
|
416
|
+
self.prev_numid() == numid
|
417
|
+
and self.level_at_new_list is not None
|
418
|
+
and prev_indent is not None
|
419
|
+
and prev_indent < ilevel
|
365
420
|
): # Open indented list
|
366
421
|
for i in range(
|
367
|
-
self.level_at_new_list +
|
422
|
+
self.level_at_new_list + prev_indent + 1,
|
368
423
|
self.level_at_new_list + ilevel + 1,
|
369
424
|
):
|
370
425
|
# Determine if this is an unordered list or an ordered list.
|
@@ -393,7 +448,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
393
448
|
text=text,
|
394
449
|
)
|
395
450
|
|
396
|
-
elif
|
451
|
+
elif (
|
452
|
+
self.prev_numid() == numid
|
453
|
+
and self.level_at_new_list is not None
|
454
|
+
and prev_indent is not None
|
455
|
+
and ilevel < prev_indent
|
456
|
+
): # Close list
|
397
457
|
for k, v in self.parents.items():
|
398
458
|
if k > self.level_at_new_list + ilevel:
|
399
459
|
self.parents[k] = None
|
@@ -411,7 +471,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
411
471
|
)
|
412
472
|
self.listIter = 0
|
413
473
|
|
414
|
-
elif self.prev_numid() == numid or
|
474
|
+
elif self.prev_numid() == numid or prev_indent == ilevel:
|
415
475
|
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
416
476
|
self.listIter += 1
|
417
477
|
if is_numbered:
|
@@ -425,31 +485,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
425
485
|
)
|
426
486
|
return
|
427
487
|
|
428
|
-
def handle_tables(
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
return 1 # Default is 1 (no colspan)
|
436
|
-
|
437
|
-
# Function to check if a cell has a rowspan (vMerge)
|
438
|
-
def get_rowspan(cell):
|
439
|
-
v_merge = cell._element.xpath("@w:vMerge")
|
440
|
-
if v_merge:
|
441
|
-
return v_merge[
|
442
|
-
0
|
443
|
-
] # 'restart' indicates the beginning of a rowspan, others are continuation
|
444
|
-
return 1
|
445
|
-
|
446
|
-
table = docx.table.Table(element, docx_obj)
|
447
|
-
|
488
|
+
def handle_tables(
|
489
|
+
self,
|
490
|
+
element: BaseOxmlElement,
|
491
|
+
docx_obj: DocxDocument,
|
492
|
+
doc: DoclingDocument,
|
493
|
+
) -> None:
|
494
|
+
table: Table = Table(element, docx_obj)
|
448
495
|
num_rows = len(table.rows)
|
449
|
-
num_cols =
|
450
|
-
|
451
|
-
# Calculate the max number of columns
|
452
|
-
num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells))
|
496
|
+
num_cols = len(table.columns)
|
497
|
+
_log.debug(f"Table grid with {num_rows} rows and {num_cols} columns")
|
453
498
|
|
454
499
|
if num_rows == 1 and num_cols == 1:
|
455
500
|
cell_element = table.rows[0].cells[0]
|
@@ -458,59 +503,56 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
458
503
|
self.walk_linear(cell_element._element, docx_obj, doc)
|
459
504
|
return
|
460
505
|
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
465
|
-
|
506
|
+
data = TableData(num_rows=num_rows, num_cols=num_cols)
|
507
|
+
cell_set: set[CT_Tc] = set()
|
466
508
|
for row_idx, row in enumerate(table.rows):
|
509
|
+
_log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells")
|
467
510
|
col_idx = 0
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
col_span=col_span,
|
498
|
-
start_row_offset_idx=row_idx,
|
499
|
-
end_row_offset_idx=row_idx + row_span,
|
511
|
+
while col_idx < num_cols:
|
512
|
+
cell: _Cell = row.cells[col_idx]
|
513
|
+
_log.debug(
|
514
|
+
f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
|
515
|
+
)
|
516
|
+
if cell is None or cell._tc in cell_set:
|
517
|
+
_log.debug(f" skipped since repeated content")
|
518
|
+
col_idx += cell.grid_span
|
519
|
+
continue
|
520
|
+
else:
|
521
|
+
cell_set.add(cell._tc)
|
522
|
+
|
523
|
+
spanned_idx = row_idx
|
524
|
+
spanned_tc: Optional[CT_Tc] = cell._tc
|
525
|
+
while spanned_tc == cell._tc:
|
526
|
+
spanned_idx += 1
|
527
|
+
spanned_tc = (
|
528
|
+
table.rows[spanned_idx].cells[col_idx]._tc
|
529
|
+
if spanned_idx < num_rows
|
530
|
+
else None
|
531
|
+
)
|
532
|
+
_log.debug(f" spanned before row {spanned_idx}")
|
533
|
+
|
534
|
+
table_cell = TableCell(
|
535
|
+
text=cell.text,
|
536
|
+
row_span=spanned_idx - row_idx,
|
537
|
+
col_span=cell.grid_span,
|
538
|
+
start_row_offset_idx=row.grid_cols_before + row_idx,
|
539
|
+
end_row_offset_idx=row.grid_cols_before + spanned_idx,
|
500
540
|
start_col_offset_idx=col_idx,
|
501
|
-
end_col_offset_idx=col_idx +
|
541
|
+
end_col_offset_idx=col_idx + cell.grid_span,
|
502
542
|
col_header=False,
|
503
543
|
row_header=False,
|
504
544
|
)
|
505
|
-
|
506
|
-
|
545
|
+
data.table_cells.append(table_cell)
|
546
|
+
col_idx += cell.grid_span
|
507
547
|
|
508
548
|
level = self.get_level()
|
509
549
|
doc.add_table(data=data, parent=self.parents[level - 1])
|
510
550
|
return
|
511
551
|
|
512
|
-
def handle_pictures(
|
513
|
-
|
552
|
+
def handle_pictures(
|
553
|
+
self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
|
554
|
+
) -> None:
|
555
|
+
def get_docx_image(drawing_blip):
|
514
556
|
rId = drawing_blip[0].get(
|
515
557
|
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
|
516
558
|
)
|
@@ -520,11 +562,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
520
562
|
image_data = image_part.blob # Get the binary image data
|
521
563
|
return image_data
|
522
564
|
|
523
|
-
image_data = get_docx_image(element, drawing_blip)
|
524
|
-
image_bytes = BytesIO(image_data)
|
525
565
|
level = self.get_level()
|
526
566
|
# Open the BytesIO object with PIL to create an Image
|
527
567
|
try:
|
568
|
+
image_data = get_docx_image(drawing_blip)
|
569
|
+
image_bytes = BytesIO(image_data)
|
528
570
|
pil_image = Image.open(image_bytes)
|
529
571
|
doc.add_picture(
|
530
572
|
parent=self.parents[level - 1],
|
docling/datamodel/document.py
CHANGED
@@ -157,6 +157,8 @@ class InputDocument(BaseModel):
|
|
157
157
|
self.page_count = self._backend.page_count()
|
158
158
|
if not self.page_count <= self.limits.max_num_pages:
|
159
159
|
self.valid = False
|
160
|
+
elif self.page_count < self.limits.page_range[0]:
|
161
|
+
self.valid = False
|
160
162
|
|
161
163
|
except (FileNotFoundError, OSError) as e:
|
162
164
|
self.valid = False
|
docling/datamodel/settings.py
CHANGED
@@ -1,13 +1,28 @@
|
|
1
1
|
import sys
|
2
2
|
from pathlib import Path
|
3
|
+
from typing import Annotated, Tuple
|
3
4
|
|
4
|
-
from pydantic import BaseModel
|
5
|
+
from pydantic import BaseModel, PlainValidator
|
5
6
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
6
7
|
|
7
8
|
|
9
|
+
def _validate_page_range(v: Tuple[int, int]) -> Tuple[int, int]:
|
10
|
+
if v[0] < 1 or v[1] < v[0]:
|
11
|
+
raise ValueError(
|
12
|
+
"Invalid page range: start must be ≥ 1 and end must be ≥ start."
|
13
|
+
)
|
14
|
+
return v
|
15
|
+
|
16
|
+
|
17
|
+
PageRange = Annotated[Tuple[int, int], PlainValidator(_validate_page_range)]
|
18
|
+
|
19
|
+
DEFAULT_PAGE_RANGE: PageRange = (1, sys.maxsize)
|
20
|
+
|
21
|
+
|
8
22
|
class DocumentLimits(BaseModel):
|
9
23
|
max_num_pages: int = sys.maxsize
|
10
24
|
max_file_size: int = sys.maxsize
|
25
|
+
page_range: PageRange = DEFAULT_PAGE_RANGE
|
11
26
|
|
12
27
|
|
13
28
|
class BatchConcurrencySettings(BaseModel):
|
docling/document_converter.py
CHANGED
@@ -1,9 +1,10 @@
|
|
1
1
|
import logging
|
2
|
+
import math
|
2
3
|
import sys
|
3
4
|
import time
|
4
5
|
from functools import partial
|
5
6
|
from pathlib import Path
|
6
|
-
from typing import Dict, Iterable, Iterator, List, Optional, Type, Union
|
7
|
+
from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
|
7
8
|
|
8
9
|
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
9
10
|
|
@@ -31,7 +32,12 @@ from docling.datamodel.document import (
|
|
31
32
|
_DocumentConversionInput,
|
32
33
|
)
|
33
34
|
from docling.datamodel.pipeline_options import PipelineOptions
|
34
|
-
from docling.datamodel.settings import
|
35
|
+
from docling.datamodel.settings import (
|
36
|
+
DEFAULT_PAGE_RANGE,
|
37
|
+
DocumentLimits,
|
38
|
+
PageRange,
|
39
|
+
settings,
|
40
|
+
)
|
35
41
|
from docling.exceptions import ConversionError
|
36
42
|
from docling.pipeline.base_pipeline import BasePipeline
|
37
43
|
from docling.pipeline.simple_pipeline import SimplePipeline
|
@@ -184,6 +190,7 @@ class DocumentConverter:
|
|
184
190
|
raises_on_error: bool = True,
|
185
191
|
max_num_pages: int = sys.maxsize,
|
186
192
|
max_file_size: int = sys.maxsize,
|
193
|
+
page_range: PageRange = DEFAULT_PAGE_RANGE,
|
187
194
|
) -> ConversionResult:
|
188
195
|
all_res = self.convert_all(
|
189
196
|
source=[source],
|
@@ -191,6 +198,7 @@ class DocumentConverter:
|
|
191
198
|
max_num_pages=max_num_pages,
|
192
199
|
max_file_size=max_file_size,
|
193
200
|
headers=headers,
|
201
|
+
page_range=page_range,
|
194
202
|
)
|
195
203
|
return next(all_res)
|
196
204
|
|
@@ -202,10 +210,12 @@ class DocumentConverter:
|
|
202
210
|
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
|
203
211
|
max_num_pages: int = sys.maxsize,
|
204
212
|
max_file_size: int = sys.maxsize,
|
213
|
+
page_range: PageRange = DEFAULT_PAGE_RANGE,
|
205
214
|
) -> Iterator[ConversionResult]:
|
206
215
|
limits = DocumentLimits(
|
207
216
|
max_num_pages=max_num_pages,
|
208
217
|
max_file_size=max_file_size,
|
218
|
+
page_range=page_range,
|
209
219
|
)
|
210
220
|
conv_input = _DocumentConversionInput(
|
211
221
|
path_or_stream_iterator=source, limits=limits, headers=headers
|
@@ -209,12 +209,16 @@ class TableStructureModel(BasePageModel):
|
|
209
209
|
tc.bbox = tc.bbox.scaled(1 / self.scale)
|
210
210
|
table_cells.append(tc)
|
211
211
|
|
212
|
+
assert "predict_details" in table_out
|
213
|
+
|
212
214
|
# Retrieving cols/rows, after post processing:
|
213
|
-
num_rows = table_out["predict_details"]
|
214
|
-
num_cols = table_out["predict_details"]
|
215
|
-
otsl_seq =
|
216
|
-
"
|
217
|
-
|
215
|
+
num_rows = table_out["predict_details"].get("num_rows", 0)
|
216
|
+
num_cols = table_out["predict_details"].get("num_cols", 0)
|
217
|
+
otsl_seq = (
|
218
|
+
table_out["predict_details"]
|
219
|
+
.get("prediction", {})
|
220
|
+
.get("rs_seq", [])
|
221
|
+
)
|
218
222
|
|
219
223
|
tbl = Table(
|
220
224
|
otsl_seq=otsl_seq,
|