docling 1.19.0__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +32 -37
- docling/backend/docling_parse_backend.py +16 -12
- docling/backend/docling_parse_v2_backend.py +240 -0
- docling/backend/html_backend.py +425 -0
- docling/backend/mspowerpoint_backend.py +375 -0
- docling/backend/msword_backend.py +509 -0
- docling/backend/pdf_backend.py +78 -0
- docling/backend/pypdfium2_backend.py +15 -10
- docling/cli/main.py +61 -60
- docling/datamodel/base_models.py +73 -193
- docling/datamodel/document.py +379 -324
- docling/datamodel/pipeline_options.py +16 -0
- docling/datamodel/settings.py +1 -0
- docling/document_converter.py +215 -252
- docling/models/base_model.py +25 -0
- docling/models/base_ocr_model.py +19 -6
- docling/models/ds_glm_model.py +220 -22
- docling/models/easyocr_model.py +45 -40
- docling/models/layout_model.py +130 -114
- docling/models/page_assemble_model.py +119 -95
- docling/models/page_preprocessing_model.py +61 -0
- docling/models/table_structure_model.py +122 -111
- docling/models/tesseract_ocr_cli_model.py +65 -58
- docling/models/tesseract_ocr_model.py +58 -50
- docling/pipeline/base_pipeline.py +190 -0
- docling/pipeline/simple_pipeline.py +59 -0
- docling/pipeline/standard_pdf_pipeline.py +198 -0
- docling/utils/export.py +4 -3
- docling/utils/layout_utils.py +17 -11
- docling-2.1.0.dist-info/METADATA +149 -0
- docling-2.1.0.dist-info/RECORD +42 -0
- docling/pipeline/base_model_pipeline.py +0 -18
- docling/pipeline/standard_model_pipeline.py +0 -66
- docling-1.19.0.dist-info/METADATA +0 -380
- docling-1.19.0.dist-info/RECORD +0 -34
- {docling-1.19.0.dist-info → docling-2.1.0.dist-info}/LICENSE +0 -0
- {docling-1.19.0.dist-info → docling-2.1.0.dist-info}/WHEEL +0 -0
- {docling-1.19.0.dist-info → docling-2.1.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,375 @@
|
|
1
|
+
import logging
|
2
|
+
from io import BytesIO
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Set, Union
|
5
|
+
|
6
|
+
from docling_core.types.doc import (
|
7
|
+
BoundingBox,
|
8
|
+
CoordOrigin,
|
9
|
+
DocItemLabel,
|
10
|
+
DoclingDocument,
|
11
|
+
DocumentOrigin,
|
12
|
+
GroupLabel,
|
13
|
+
ProvenanceItem,
|
14
|
+
Size,
|
15
|
+
TableCell,
|
16
|
+
TableData,
|
17
|
+
)
|
18
|
+
from pptx import Presentation
|
19
|
+
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
20
|
+
|
21
|
+
from docling.backend.abstract_backend import (
|
22
|
+
DeclarativeDocumentBackend,
|
23
|
+
PaginatedDocumentBackend,
|
24
|
+
)
|
25
|
+
from docling.datamodel.base_models import InputFormat
|
26
|
+
from docling.datamodel.document import InputDocument
|
27
|
+
|
28
|
+
# Module-level logger, named after this module.
_log = logging.getLogger(__name__)
|
29
|
+
|
30
|
+
|
31
|
+
class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
|
32
|
+
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
    """Open the presentation with python-pptx.

    Args:
        in_doc: Input document descriptor, forwarded to the base-class
            initializer.
        path_or_stream: In-memory stream or filesystem path of the PPTX file.

    Raises:
        RuntimeError: If loading the presentation raises any exception; the
            original exception is chained as the cause.
    """
    super().__init__(in_doc, path_or_stream)

    # Namespace prefixes used when querying the raw slide XML.
    self.namespaces = {
        "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
        "c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
        "p": "http://schemas.openxmlformats.org/presentationml/2006/main",
    }

    # Powerpoint file:
    self.path_or_stream = path_or_stream
    self.pptx_obj = None
    self.valid = False

    source = self.path_or_stream
    try:
        if isinstance(source, BytesIO):
            self.pptx_obj = Presentation(source)
        elif isinstance(source, Path):
            # python-pptx is handed the path as a plain string.
            self.pptx_obj = Presentation(str(source))

        # NOTE(review): this is also reached when `source` is neither a
        # BytesIO nor a Path, leaving `pptx_obj` as None while `valid` is True.
        self.valid = True
    except Exception as e:
        # `self.document_hash` is presumably set by the base initializer
        # -- TODO confirm.
        raise RuntimeError(
            f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
        ) from e
|
57
|
+
|
58
|
+
def page_count(self) -> int:
    """Return the number of slides, or 0 when the backend is not valid."""
    if not self.is_valid():
        return 0

    assert self.pptx_obj is not None
    return len(self.pptx_obj.slides)
|
64
|
+
|
65
|
+
def is_valid(self) -> bool:
    """Return the `valid` flag recorded by the initializer."""
    return self.valid
|
67
|
+
|
68
|
+
@classmethod
def supports_pagination(cls) -> bool:
    """Report that this backend is paginated."""
    # Open question carried over from the original author: if this stays
    # True, how should pages be handled?
    return True
|
71
|
+
|
72
|
+
def unload(self):
    """Close the input stream, if there is one, and drop the reference to it."""
    source = self.path_or_stream
    if isinstance(source, BytesIO):
        source.close()

    self.path_or_stream = None
|
77
|
+
|
78
|
+
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
    """Return the set of input formats handled by this backend (PPTX only)."""
    formats: Set[InputFormat] = {InputFormat.PPTX}
    return formats
|
81
|
+
|
82
|
+
def convert(self) -> DoclingDocument:
    """Parse the loaded PPTX into a structured document model.

    The document is named after the file stem when the input is a path,
    and "stream" otherwise (including when the path has no file name).

    Returns:
        The DoclingDocument populated by `walk_linear`.
    """
    source = self.path_or_stream
    # Only a Path carries a file name; for a stream it stays empty.
    fname = source.name if isinstance(source, Path) else ""
    docname = Path(fname).stem if fname else "stream"

    # The document must carry origin information.
    origin = DocumentOrigin(
        filename=fname,
        mimetype="application/vnd.ms-powerpoint",
        binary_hash=self.document_hash,
    )
    doc = DoclingDocument(name=docname, origin=origin)

    return self.walk_linear(self.pptx_obj, doc)
|
105
|
+
|
106
|
+
def generate_prov(self, shape, slide_ind, text=""):
    """Build the provenance record for a shape on a slide.

    Args:
        shape: Shape exposing `left`, `top`, `width` and `height`.
        slide_ind: Zero-based slide index; the page number is `slide_ind + 1`.
        text: Text whose length sets the end of the character span.

    Returns:
        A ProvenanceItem with the shape's bounding box, the 1-based page
        number and the character span `[0, len(text)]`.
    """
    x0, y0 = shape.left, shape.top
    x1, y1 = x0 + shape.width, y0 + shape.height

    # NOTE(review): the box is tagged BOTTOMLEFT although `top` is presumably
    # measured from the top edge of the slide -- confirm the intended origin.
    bbox = BoundingBox.from_tuple([x0, y0, x1, y1], origin=CoordOrigin.BOTTOMLEFT)

    return ProvenanceItem(
        page_no=slide_ind + 1, charspan=[0, len(text)], bbox=bbox
    )
|
119
|
+
|
120
|
+
def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
    """Add the paragraphs of a text shape to the document.

    Each paragraph that contains at least one non-blank run is emitted
    exactly once, either as a list item (when the paragraph XML carries a
    bullet or auto-number marker) or as a text item labelled TITLE,
    SECTION_HEADER or PARAGRAPH according to the shape's placeholder type.

    Args:
        shape: Shape with a text frame.
        parent_slide: Group item of the slide; parent of new groups and texts.
        slide_ind: Zero-based slide index, used for provenance.
        doc: Document being populated.
    """
    ns = {"a": self.namespaces["a"]}
    enum_list_item_value = 0

    for paragraph in shape.text_frame.paragraphs:
        enum_list_item_value += 1

        # Bullet / numbering markers are read from the paragraph's raw XML.
        p = paragraph._element
        has_bullet = p.find(".//a:buChar", namespaces=ns) is not None
        is_numbered = (
            not has_bullet
            and p.find(".//a:buAutoNum", namespaces=ns) is not None
        )
        is_list_item = has_bullet or is_numbered
        # An indented paragraph is most likely part of a sub-list, so a list
        # group is opened for it even without an explicit marker.
        is_a_list = is_list_item or paragraph.level > 0

        list_text = paragraph.text.strip()
        # NOTE(review): the character span is derived from the text of the
        # whole shape, not of this paragraph -- confirm that is intended.
        prov = self.generate_prov(shape, slide_ind, shape.text.strip())

        new_list = None
        if is_a_list:
            list_label = GroupLabel.ORDERED_LIST if is_numbered else GroupLabel.LIST
            new_list = doc.add_group(
                label=list_label, name="list", parent=parent_slide
            )
            _log.debug("LIST DETECTED!")
        else:
            _log.debug("No List")

        # The emitted text is the whole paragraph, so it must be added once
        # per paragraph and not once per run.
        has_text = any(
            run.text.strip() for run in p.iterfind(".//a:r", namespaces=ns)
        )
        if not has_text:
            continue

        if is_list_item:
            # The marker is the running paragraph number, also for bullets.
            doc.add_list_item(
                marker=f"{enum_list_item_value}.",
                enumerated=is_numbered,
                parent=new_list,
                text=list_text,
                prov=prov,
            )
            continue

        # Titles and subtitles get their own labels; all other text is a
        # plain paragraph.
        doc_label = DocItemLabel.PARAGRAPH
        if shape.is_placeholder:
            placeholder_type = shape.placeholder_format.type
            if placeholder_type in (
                PP_PLACEHOLDER.CENTER_TITLE,
                PP_PLACEHOLDER.TITLE,
            ):
                doc_label = DocItemLabel.TITLE
            elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
                doc_label = DocItemLabel.SECTION_HEADER

        # A non-list paragraph restarts the item numbering.
        enum_list_item_value = 0

        doc.add_text(
            label=doc_label,
            parent=parent_slide,
            text=list_text,
            prov=prov,
        )
|
223
|
+
|
224
|
+
def handle_title(self, shape, parent_slide, slide_ind, doc):
    """Add a title or subtitle placeholder to the document.

    Title and centre-title placeholders become TITLE items; subtitle
    placeholders become SECTION_HEADER items. Shapes with blank text or any
    other placeholder type add nothing.

    Args:
        shape: Placeholder shape (its `placeholder_format` is read).
        parent_slide: Group item of the slide; parent of the new text item.
        slide_ind: Zero-based slide index, used for provenance.
        doc: Document being populated.
    """
    kind = shape.placeholder_format.type
    txt = shape.text.strip()
    prov = self.generate_prov(shape, slide_ind, txt)

    if not txt:
        return

    if kind in (PP_PLACEHOLDER.CENTER_TITLE, PP_PLACEHOLDER.TITLE):
        _log.info(f"Title found: {shape.text}")
        label = DocItemLabel.TITLE
    elif kind == PP_PLACEHOLDER.SUBTITLE:
        _log.info(f"Subtitle found: {shape.text}")
        # Subtitles are stored as section headers.
        label = DocItemLabel.SECTION_HEADER
    else:
        return

    doc.add_text(label=label, parent=parent_slide, text=txt, prov=prov)
|
246
|
+
|
247
|
+
def handle_pictures(self, shape, parent_slide, slide_ind, doc):
    """Add a picture item, without caption, for a picture shape.

    Args:
        shape: Picture shape; only its geometry is used (for provenance).
        parent_slide: Group item of the slide; parent of the new picture.
        slide_ind: Zero-based slide index, used for provenance.
        doc: Document being populated.
    """
    doc.add_picture(
        parent=parent_slide,
        caption=None,
        prov=self.generate_prov(shape, slide_ind, ""),
    )
|
252
|
+
|
253
|
+
def handle_tables(self, shape, parent_slide, slide_ind, doc):
    """Add the table held by a shape to the document, under its slide.

    Only cells with non-blank text are recorded; a table without any such
    cell is not added at all.

    Args:
        shape: Shape that may contain a table (`has_table`).
        parent_slide: Group item of the slide; parent of the new table.
        slide_ind: Zero-based slide index, used for provenance.
        doc: Document being populated.
    """
    if not shape.has_table:
        return

    table = shape.table
    # Raw XML of the shape: the span attributes are read directly from the
    # <a:tc> elements.
    table_xml = shape._element
    prov = self.generate_prov(shape, slide_ind, "")

    num_rows = len(table.rows)
    num_cols = 0
    tcells = []

    for row_idx, row in enumerate(table.rows):
        num_cols = max(num_cols, len(row.cells))
        for col_idx, cell in enumerate(row.cells):
            # XPath positions are 1-based.
            matches = table_xml.xpath(
                f".//a:tbl/a:tr[{row_idx + 1}]/a:tc[{col_idx + 1}]"
            )
            if not matches:
                continue  # No XML node for this cell, skip it

            cell_text = cell.text.strip()
            if not cell_text:
                continue  # Empty cells are not recorded

            cell_xml = matches[0]
            # A missing span attribute means the cell spans one row/column.
            row_span = int(cell_xml.get("rowSpan", 1))  # Vertical span
            col_span = int(cell_xml.get("gridSpan", 1))  # Horizontal span

            tcells.append(
                TableCell(
                    text=cell_text,
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=row_idx,
                    end_row_offset_idx=row_idx + row_span,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + col_span,
                    col_header=False,
                    row_header=False,
                )
            )

    if not tcells:
        return

    data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
    # Attach the table to its slide group, like pictures and text items.
    doc.add_table(parent=parent_slide, data=data, prov=prov)
|
316
|
+
|
317
|
+
def walk_linear(self, pptx_obj, doc) -> DoclingDocument:
|
318
|
+
# Units of size in PPTX by default are EMU units (English Metric Units)
|
319
|
+
slide_width = pptx_obj.slide_width
|
320
|
+
slide_height = pptx_obj.slide_height
|
321
|
+
|
322
|
+
text_content = [] # type: ignore
|
323
|
+
|
324
|
+
max_levels = 10
|
325
|
+
parents = {} # type: ignore
|
326
|
+
for i in range(0, max_levels):
|
327
|
+
parents[i] = None
|
328
|
+
|
329
|
+
# Loop through each slide
|
330
|
+
for slide_num, slide in enumerate(pptx_obj.slides):
|
331
|
+
slide_ind = pptx_obj.slides.index(slide)
|
332
|
+
parent_slide = doc.add_group(
|
333
|
+
name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=parents[0]
|
334
|
+
)
|
335
|
+
|
336
|
+
size = Size(width=slide_width, height=slide_height)
|
337
|
+
parent_page = doc.add_page(page_no=slide_ind + 1, size=size)
|
338
|
+
# parent_page = doc.add_page(page_no=slide_ind, size=size, hash=hash)
|
339
|
+
|
340
|
+
# Loop through each shape in the slide
|
341
|
+
for shape in slide.shapes:
|
342
|
+
|
343
|
+
if shape.has_table:
|
344
|
+
# Handle Tables
|
345
|
+
self.handle_tables(shape, parent_slide, slide_ind, doc)
|
346
|
+
|
347
|
+
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
348
|
+
# Handle Tables
|
349
|
+
self.handle_pictures(shape, parent_slide, slide_ind, doc)
|
350
|
+
|
351
|
+
# If shape doesn't have any text, move on to the next shape
|
352
|
+
if not hasattr(shape, "text"):
|
353
|
+
continue
|
354
|
+
if shape.text is None:
|
355
|
+
continue
|
356
|
+
if len(shape.text.strip()) == 0:
|
357
|
+
continue
|
358
|
+
if not shape.has_text_frame:
|
359
|
+
_log.warn("Warning: shape has text but not text_frame")
|
360
|
+
continue
|
361
|
+
|
362
|
+
# if shape.is_placeholder:
|
363
|
+
# Handle Titles (Headers) and Subtitles
|
364
|
+
# Check if the shape is a placeholder (titles are placeholders)
|
365
|
+
# self.handle_title(shape, parent_slide, slide_ind, doc)
|
366
|
+
# self.handle_text_elements(shape, parent_slide, slide_ind, doc)
|
367
|
+
# else:
|
368
|
+
|
369
|
+
# Handle other text elements, including lists (bullet lists, numbered lists)
|
370
|
+
self.handle_text_elements(shape, parent_slide, slide_ind, doc)
|
371
|
+
|
372
|
+
# figures...
|
373
|
+
# doc.add_figure(data=BaseFigureData(), parent=self.parents[self.level], caption=None)
|
374
|
+
|
375
|
+
return doc
|