docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +33 -37
- docling/backend/asciidoc_backend.py +431 -0
- docling/backend/docling_parse_backend.py +20 -16
- docling/backend/docling_parse_v2_backend.py +248 -0
- docling/backend/html_backend.py +429 -0
- docling/backend/md_backend.py +346 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +496 -0
- docling/backend/pdf_backend.py +78 -0
- docling/backend/pypdfium2_backend.py +16 -11
- docling/cli/main.py +96 -65
- docling/datamodel/base_models.py +79 -193
- docling/datamodel/document.py +405 -320
- docling/datamodel/pipeline_options.py +19 -3
- docling/datamodel/settings.py +16 -1
- docling/document_converter.py +240 -251
- docling/models/base_model.py +28 -0
- docling/models/base_ocr_model.py +40 -10
- docling/models/ds_glm_model.py +244 -30
- docling/models/easyocr_model.py +57 -42
- docling/models/layout_model.py +158 -116
- docling/models/page_assemble_model.py +127 -101
- docling/models/page_preprocessing_model.py +79 -0
- docling/models/table_structure_model.py +162 -116
- docling/models/tesseract_ocr_cli_model.py +76 -59
- docling/models/tesseract_ocr_model.py +90 -58
- docling/pipeline/base_pipeline.py +189 -0
- docling/pipeline/simple_pipeline.py +56 -0
- docling/pipeline/standard_pdf_pipeline.py +201 -0
- docling/utils/export.py +4 -3
- docling/utils/layout_utils.py +17 -11
- docling/utils/profiling.py +62 -0
- docling-2.4.1.dist-info/METADATA +154 -0
- docling-2.4.1.dist-info/RECORD +45 -0
- docling/pipeline/base_model_pipeline.py +0 -18
- docling/pipeline/standard_model_pipeline.py +0 -66
- docling-1.19.1.dist-info/METADATA +0 -380
- docling-1.19.1.dist-info/RECORD +0 -34
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,398 @@
|
|
1
|
+
import logging
|
2
|
+
from io import BytesIO
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Set, Union
|
5
|
+
|
6
|
+
from docling_core.types.doc import (
|
7
|
+
BoundingBox,
|
8
|
+
CoordOrigin,
|
9
|
+
DocItemLabel,
|
10
|
+
DoclingDocument,
|
11
|
+
DocumentOrigin,
|
12
|
+
GroupLabel,
|
13
|
+
ProvenanceItem,
|
14
|
+
Size,
|
15
|
+
TableCell,
|
16
|
+
TableData,
|
17
|
+
)
|
18
|
+
from pptx import Presentation
|
19
|
+
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
20
|
+
|
21
|
+
from docling.backend.abstract_backend import (
|
22
|
+
DeclarativeDocumentBackend,
|
23
|
+
PaginatedDocumentBackend,
|
24
|
+
)
|
25
|
+
from docling.datamodel.base_models import InputFormat
|
26
|
+
from docling.datamodel.document import InputDocument
|
27
|
+
|
28
|
+
# Module-level logger, named after this module via ``__name__``.
_log = logging.getLogger(__name__)
|
29
|
+
|
30
|
+
|
31
|
+
class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
|
32
|
+
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
    """Open the PowerPoint source with python-pptx.

    Args:
        in_doc: Input document descriptor, forwarded to the parent
            initializer.
        path_or_stream: In-memory stream or filesystem path of the PPTX
            file.

    Raises:
        RuntimeError: If loading the presentation fails; the original
            exception is chained as the cause.
    """
    super().__init__(in_doc, path_or_stream)
    # XML namespaces used when querying the raw OOXML of shapes.
    self.namespaces = {
        "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
        "c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
        "p": "http://schemas.openxmlformats.org/presentationml/2006/main",
    }
    # Powerpoint file:
    self.path_or_stream = path_or_stream

    self.pptx_obj = None
    self.valid = False
    try:
        if isinstance(self.path_or_stream, BytesIO):
            self.pptx_obj = Presentation(self.path_or_stream)
        elif isinstance(self.path_or_stream, Path):
            self.pptx_obj = Presentation(str(self.path_or_stream))
    except Exception as e:
        raise RuntimeError(
            f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
        ) from e

    # Only report the backend as valid when a presentation was actually
    # loaded. A source that is neither BytesIO nor Path leaves pptx_obj as
    # None, and claiming validity would make page_count() fail its assertion.
    self.valid = self.pptx_obj is not None
|
57
|
+
|
58
|
+
def page_count(self) -> int:
|
59
|
+
if self.is_valid():
|
60
|
+
assert self.pptx_obj is not None
|
61
|
+
return len(self.pptx_obj.slides)
|
62
|
+
else:
|
63
|
+
return 0
|
64
|
+
|
65
|
+
def is_valid(self) -> bool:
|
66
|
+
return self.valid
|
67
|
+
|
68
|
+
@classmethod
|
69
|
+
def supports_pagination(cls) -> bool:
|
70
|
+
return True # True? if so, how to handle pages...
|
71
|
+
|
72
|
+
def unload(self):
|
73
|
+
if isinstance(self.path_or_stream, BytesIO):
|
74
|
+
self.path_or_stream.close()
|
75
|
+
|
76
|
+
self.path_or_stream = None
|
77
|
+
|
78
|
+
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
    """Return the set of input formats handled by this backend (PPTX only)."""
    formats: Set[InputFormat] = {InputFormat.PPTX}
    return formats
|
81
|
+
|
82
|
+
def convert(self) -> DoclingDocument:
    """Parse the PPTX into a structured document model.

    Builds the origin record from the file name and document hash, creates
    an empty ``DoclingDocument`` named after the file stem, and fills it by
    walking the presentation slide by slide.

    Returns:
        The document returned by ``walk_linear``.
    """
    # Fall back to "file" when the file name or stem is empty.
    source_name = self.file.name or "file"
    origin = DocumentOrigin(
        filename=source_name,
        mimetype="application/vnd.ms-powerpoint",
        binary_hash=self.document_hash,
    )

    # must add origin information
    empty_doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
    return self.walk_linear(self.pptx_obj, empty_doc)
|
98
|
+
|
99
|
+
def generate_prov(self, shape, slide_ind, text=""):
    """Build the provenance record for a shape on a slide.

    Args:
        shape: Shape exposing ``left``, ``top``, ``width`` and ``height``.
        slide_ind: Zero-based slide index; stored as the 1-based ``page_no``.
        text: Text whose length sets the end of the character span.

    Returns:
        A ``ProvenanceItem`` with the page number, the character span
        ``[0, len(text)]`` and the bounding box of the shape.
    """
    x0 = shape.left
    y0 = shape.top
    x1 = x0 + shape.width
    y1 = y0 + shape.height
    # NOTE(review): the box is tagged BOTTOMLEFT although it is built from
    # the shape's `top` offset -- confirm the intended coordinate origin.
    bbox = BoundingBox.from_tuple([x0, y0, x1, y1], origin=CoordOrigin.BOTTOMLEFT)

    return ProvenanceItem(
        page_no=slide_ind + 1, charspan=[0, len(text)], bbox=bbox
    )
|
112
|
+
|
113
|
+
def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
    """Add the text of a shape to the document as text items and list items.

    Every paragraph of the shape's text frame is emitted as one item:
    paragraphs carrying a bullet (``a:buChar``) or auto-number
    (``a:buAutoNum``) element become list items inside a single list group
    that is created on first use; all other paragraphs become text items
    labelled TITLE, SECTION_HEADER (subtitle placeholders) or PARAGRAPH.

    Args:
        shape: Shape with a text frame.
        parent_slide: Group item of the slide; parent of created items.
        slide_ind: Zero-based slide index, used for provenance.
        doc: Document being populated.
    """
    ns = {"a": self.namespaces["a"]}
    # One provenance record (shape box, full shape text) is shared by every
    # item created from this shape.
    prov = self.generate_prov(shape, slide_ind, shape.text.strip())

    # The label for non-list text depends only on the shape, so resolve it
    # once. Subtitles are section headers, matching handle_title(); this
    # label was previously never assigned, so subtitles came out as
    # plain paragraphs.
    doc_label = DocItemLabel.PARAGRAPH
    if shape.is_placeholder:
        placeholder_type = shape.placeholder_format.type
        if placeholder_type in [
            PP_PLACEHOLDER.CENTER_TITLE,
            PP_PLACEHOLDER.TITLE,
        ]:
            doc_label = DocItemLabel.TITLE
        elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
            doc_label = DocItemLabel.SECTION_HEADER

    # First pass: decide whether the list group (if one is needed) is an
    # ordered or an unordered list.
    is_a_list = False
    bullet_type = "None"
    list_label = GroupLabel.LIST
    for paragraph in shape.text_frame.paragraphs:
        # Check if paragraph is a bullet point using the `element` XML
        p = paragraph._element
        if p.find(".//a:buChar", namespaces=ns) is not None:
            bullet_type = "Bullet"
            is_a_list = True
        elif p.find(".//a:buAutoNum", namespaces=ns) is not None:
            bullet_type = "Numbered"
            is_a_list = True
        else:
            is_a_list = False

        if paragraph.level > 0:
            # Most likely a sub-list
            is_a_list = True

        if is_a_list and bullet_type == "Numbered":
            list_label = GroupLabel.ORDERED_LIST

    if is_a_list:
        _log.debug("LIST DETECTED!")
    else:
        _log.debug("No List")

    # Second pass: emit one item per paragraph that has non-blank runs.
    list_group = None
    enum_list_item_value = 0
    for paragraph in shape.text_frame.paragraphs:
        p = paragraph._element
        enum_list_item_value += 1

        # Bullet / numbering markup belongs to the paragraph, so it is
        # looked up once here instead of once per run.
        is_numbered = False
        if p.find(".//a:buChar", namespaces=ns) is not None:
            is_list_item = True
        elif p.find(".//a:buAutoNum", namespaces=ns) is not None:
            is_list_item = True
            is_numbered = True
        else:
            is_list_item = False

        # Runs that are empty or whitespace-only are skipped entirely.
        run_texts = [
            e.text
            for e in p.iterfind(".//a:r", namespaces=ns)
            if e.text.strip()
        ]
        if not run_texts:
            continue
        paragraph_text = "".join(run_texts)

        if not is_list_item:
            # Plain text interrupts numbering: restart the counter.
            enum_list_item_value = 0
            doc.add_text(
                label=doc_label,
                parent=parent_slide,
                text=paragraph_text,
                prov=prov,
            )
            continue

        # Set marker and enumerated arguments if this is an enumeration element.
        enum_marker = f"{enum_list_item_value}." if is_numbered else ""
        if list_group is None:
            list_group = doc.add_group(
                label=list_label, name="list", parent=parent_slide
            )
        doc.add_list_item(
            marker=enum_marker,
            enumerated=is_numbered,
            parent=list_group,
            text=paragraph_text,
            prov=prov,
        )
|
246
|
+
|
247
|
+
def handle_title(self, shape, parent_slide, slide_ind, doc):
    """Add a title or subtitle placeholder to the document.

    Title and centre-title placeholders are added with the TITLE label,
    subtitle placeholders with the SECTION_HEADER label. Shapes whose
    stripped text is empty, and other placeholder types, add nothing.

    Args:
        shape: Placeholder shape; its ``placeholder_format`` is read.
        parent_slide: Group item of the slide; parent of the created item.
        slide_ind: Zero-based slide index, used for provenance.
        doc: Document being populated.
    """
    kind = shape.placeholder_format.type
    stripped = shape.text.strip()
    prov = self.generate_prov(shape, slide_ind, stripped)

    if not stripped:
        return

    if kind in (PP_PLACEHOLDER.CENTER_TITLE, PP_PLACEHOLDER.TITLE):
        _log.info(f"Title found: {shape.text}")
        doc.add_text(
            label=DocItemLabel.TITLE, parent=parent_slide, text=stripped, prov=prov
        )
    elif kind == PP_PLACEHOLDER.SUBTITLE:
        _log.info(f"Subtitle found: {shape.text}")
        # Subtitles are stored as section headers.
        doc.add_text(
            label=DocItemLabel.SECTION_HEADER,
            parent=parent_slide,
            text=stripped,
            prov=prov,
        )
|
269
|
+
|
270
|
+
def handle_pictures(self, shape, parent_slide, slide_ind, doc):
    """Add a picture item for the shape, without a caption.

    Args:
        shape: Picture shape; only its geometry is used, for provenance.
        parent_slide: Group item of the slide; parent of the picture.
        slide_ind: Zero-based slide index, used for provenance.
        doc: Document being populated.
    """
    # A picture carries no text, so the provenance char span is empty.
    picture_prov = self.generate_prov(shape, slide_ind, "")
    doc.add_picture(parent=parent_slide, caption=None, prov=picture_prov)
|
275
|
+
|
276
|
+
def handle_tables(self, shape, parent_slide, slide_ind, doc):
    """Convert the table held by a shape into a document table.

    Cell spans are read from the ``rowSpan`` / ``gridSpan`` attributes of
    the cell's XML element and default to 1 when absent. Only cells whose
    stripped text is non-empty are stored, and the table is added to the
    document only when at least one such cell exists.

    Args:
        shape: Shape to inspect; nothing happens unless ``has_table`` is set.
        parent_slide: Group item of the slide; parent of the table.
        slide_ind: Zero-based slide index, used for provenance.
        doc: Document being populated.
    """
    if not shape.has_table:
        return

    table = shape.table
    # XML element of the shape that contains the table.
    shape_xml = shape._element
    prov = self.generate_prov(shape, slide_ind, "")

    row_total = len(table.rows)
    col_total = 0
    filled_cells = []

    for r, row in enumerate(table.rows):
        col_total = max(col_total, len(row.cells))
        for c, cell in enumerate(row.cells):
            # Locate the 'tc' element of this cell (XPath positions are 1-based).
            matches = shape_xml.xpath(f".//a:tbl/a:tr[{r + 1}]/a:tc[{c + 1}]")
            if not matches:
                continue  # If no cell XML is found, skip

            tc = matches[0]
            row_span = int(tc.get("rowSpan", 1))  # Vertical span
            col_span = int(tc.get("gridSpan", 1))  # Horizontal span

            cell_text = cell.text.strip()
            table_cell = TableCell(
                text=cell_text,
                row_span=row_span,
                col_span=col_span,
                start_row_offset_idx=r,
                end_row_offset_idx=r + row_span,
                start_col_offset_idx=c,
                end_col_offset_idx=c + col_span,
                col_header=False,
                row_header=False,
            )
            if cell_text:
                filled_cells.append(table_cell)

    # Initialize Docling TableData, then populate it with the kept cells.
    data = TableData(num_rows=row_total, num_cols=col_total, table_cells=[])
    data.table_cells.extend(filled_cells)

    if filled_cells:
        # The table is not fully empty: create the Docling table.
        doc.add_table(parent=parent_slide, data=data, prov=prov)
|
339
|
+
|
340
|
+
def walk_linear(self, pptx_obj, doc) -> DoclingDocument:
    """Walk the presentation slide by slide and fill the document.

    For every slide a top-level group ``slide-<index>`` and a page of the
    presentation's slide size are added. Each shape on the slide is then
    dispatched to the table, picture and text handlers.

    Args:
        pptx_obj: Loaded presentation exposing ``slides``, ``slide_width``
            and ``slide_height``.
        doc: Document to populate.

    Returns:
        The same ``doc`` object, after population.
    """
    # Units of size in PPTX by default are EMU units (English Metric Units)
    slide_width = pptx_obj.slide_width
    slide_height = pptx_obj.slide_height

    # Loop through each slide. The enumerate index is the slide position,
    # so no per-slide `slides.index(...)` lookup is needed.
    for slide_ind, slide in enumerate(pptx_obj.slides):
        # Slide groups sit at the top level of the document (no parent).
        parent_slide = doc.add_group(
            name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=None
        )

        # Pages are numbered from 1.
        size = Size(width=slide_width, height=slide_height)
        doc.add_page(page_no=slide_ind + 1, size=size)

        # Loop through each shape in the slide
        for shape in slide.shapes:

            if shape.has_table:
                # Handle Tables
                self.handle_tables(shape, parent_slide, slide_ind, doc)

            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                # Handle Pictures
                self.handle_pictures(shape, parent_slide, slide_ind, doc)

            # If shape doesn't have any text, move on to the next shape
            if not hasattr(shape, "text"):
                continue
            if shape.text is None:
                continue
            if len(shape.text.strip()) == 0:
                continue
            if not shape.has_text_frame:
                # Logger.warn() is deprecated and removed in Python 3.13.
                _log.warning("Warning: shape has text but not text_frame")
                continue

            # Handle text elements, including titles and lists
            # (bullet lists, numbered lists)
            self.handle_text_elements(shape, parent_slide, slide_ind, doc)

    return doc
|