docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. docling/backend/abstract_backend.py +33 -37
  2. docling/backend/asciidoc_backend.py +431 -0
  3. docling/backend/docling_parse_backend.py +20 -16
  4. docling/backend/docling_parse_v2_backend.py +248 -0
  5. docling/backend/html_backend.py +429 -0
  6. docling/backend/md_backend.py +346 -0
  7. docling/backend/mspowerpoint_backend.py +398 -0
  8. docling/backend/msword_backend.py +496 -0
  9. docling/backend/pdf_backend.py +78 -0
  10. docling/backend/pypdfium2_backend.py +16 -11
  11. docling/cli/main.py +96 -65
  12. docling/datamodel/base_models.py +79 -193
  13. docling/datamodel/document.py +405 -320
  14. docling/datamodel/pipeline_options.py +19 -3
  15. docling/datamodel/settings.py +16 -1
  16. docling/document_converter.py +240 -251
  17. docling/models/base_model.py +28 -0
  18. docling/models/base_ocr_model.py +40 -10
  19. docling/models/ds_glm_model.py +244 -30
  20. docling/models/easyocr_model.py +57 -42
  21. docling/models/layout_model.py +158 -116
  22. docling/models/page_assemble_model.py +127 -101
  23. docling/models/page_preprocessing_model.py +79 -0
  24. docling/models/table_structure_model.py +162 -116
  25. docling/models/tesseract_ocr_cli_model.py +76 -59
  26. docling/models/tesseract_ocr_model.py +90 -58
  27. docling/pipeline/base_pipeline.py +189 -0
  28. docling/pipeline/simple_pipeline.py +56 -0
  29. docling/pipeline/standard_pdf_pipeline.py +201 -0
  30. docling/utils/export.py +4 -3
  31. docling/utils/layout_utils.py +17 -11
  32. docling/utils/profiling.py +62 -0
  33. docling-2.4.1.dist-info/METADATA +154 -0
  34. docling-2.4.1.dist-info/RECORD +45 -0
  35. docling/pipeline/base_model_pipeline.py +0 -18
  36. docling/pipeline/standard_model_pipeline.py +0 -66
  37. docling-1.19.1.dist-info/METADATA +0 -380
  38. docling-1.19.1.dist-info/RECORD +0 -34
  39. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
  40. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
  41. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,398 @@
1
+ import logging
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Set, Union
5
+
6
+ from docling_core.types.doc import (
7
+ BoundingBox,
8
+ CoordOrigin,
9
+ DocItemLabel,
10
+ DoclingDocument,
11
+ DocumentOrigin,
12
+ GroupLabel,
13
+ ProvenanceItem,
14
+ Size,
15
+ TableCell,
16
+ TableData,
17
+ )
18
+ from pptx import Presentation
19
+ from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
20
+
21
+ from docling.backend.abstract_backend import (
22
+ DeclarativeDocumentBackend,
23
+ PaginatedDocumentBackend,
24
+ )
25
+ from docling.datamodel.base_models import InputFormat
26
+ from docling.datamodel.document import InputDocument
27
+
28
+ _log = logging.getLogger(__name__)  # module-level logger, named after this module
29
+
30
+
31
class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
    """Backend that converts a PowerPoint (PPTX) file into a ``DoclingDocument``.

    The presentation is opened with python-pptx. Every slide is emitted as one
    page plus one ``GroupLabel.CHAPTER`` group, and the shapes found on the
    slide (text, list items, tables, pictures) are added beneath that group.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Open the presentation given as a path or an in-memory stream.

        Args:
            in_doc: The input document this backend belongs to.
            path_or_stream: Location of the PPTX file, or its bytes.

        Raises:
            RuntimeError: If python-pptx cannot load the presentation.
        """
        super().__init__(in_doc, path_or_stream)
        # XML namespaces needed to query the raw DrawingML / PresentationML tree.
        self.namespaces = {
            "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
            "c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
            "p": "http://schemas.openxmlformats.org/presentationml/2006/main",
        }
        # Powerpoint file:
        self.path_or_stream = path_or_stream

        self.pptx_obj = None
        self.valid = False
        try:
            if isinstance(self.path_or_stream, BytesIO):
                self.pptx_obj = Presentation(self.path_or_stream)
            elif isinstance(self.path_or_stream, Path):
                self.pptx_obj = Presentation(str(self.path_or_stream))
        except Exception as e:
            raise RuntimeError(
                f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
            ) from e

        # Only report the backend as valid when a presentation was actually
        # loaded; an unsupported input type leaves pptx_obj as None.
        self.valid = self.pptx_obj is not None

    def page_count(self) -> int:
        """Return the number of slides, or 0 when the backend is not valid."""
        if not self.is_valid():
            return 0
        assert self.pptx_obj is not None
        return len(self.pptx_obj.slides)

    def is_valid(self) -> bool:
        """Return whether the presentation was loaded successfully."""
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        """Report that this backend is paginated (one page per slide)."""
        return True  # True? if so, how to handle pages...

    def unload(self):
        """Close the underlying stream (if any) and drop the reference to it."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()

        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of input formats handled by this backend."""
        return {InputFormat.PPTX}

    def convert(self) -> DoclingDocument:
        """Parse the PPTX into a structured document model.

        Returns:
            A ``DoclingDocument`` whose origin records the file name, mimetype
            and binary hash, populated by ``walk_linear``.
        """
        origin = DocumentOrigin(
            filename=self.file.name or "file",
            mimetype="application/vnd.ms-powerpoint",
            binary_hash=self.document_hash,
        )

        doc = DoclingDocument(
            name=self.file.stem or "file", origin=origin
        )  # must add origin information
        return self.walk_linear(self.pptx_obj, doc)

    def generate_prov(self, shape, slide_ind, text=""):
        """Build the provenance item (page, bounding box, char span) of a shape.

        Args:
            shape: Shape exposing ``left``, ``top``, ``width`` and ``height``.
            slide_ind: Zero-based slide index; the page number is ``slide_ind + 1``.
            text: Text of the shape; only its length is used, for the char span.

        Returns:
            A ``ProvenanceItem`` describing where the shape sits in the document.
        """
        left = shape.left
        top = shape.top
        # NOTE(review): the box is tagged BOTTOMLEFT although it is built from
        # `top` and `top + height` - confirm which origin consumers expect.
        shape_bbox = BoundingBox.from_tuple(
            [left, top, left + shape.width, top + shape.height],
            origin=CoordOrigin.BOTTOMLEFT,
        )
        return ProvenanceItem(
            page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox
        )

    def _bullet_kind(self, p):
        """Return "Bullet", "Numbered" or None for a paragraph XML element."""
        ns = {"a": self.namespaces["a"]}
        if p.find(".//a:buChar", namespaces=ns) is not None:
            return "Bullet"
        if p.find(".//a:buAutoNum", namespaces=ns) is not None:
            return "Numbered"
        return None

    @staticmethod
    def _text_label(shape):
        """Pick the label for plain text: TITLE, SECTION_HEADER or PARAGRAPH."""
        if shape.is_placeholder:
            placeholder_type = shape.placeholder_format.type
            if placeholder_type in (
                PP_PLACEHOLDER.CENTER_TITLE,
                PP_PLACEHOLDER.TITLE,
            ):
                return DocItemLabel.TITLE
            if placeholder_type == PP_PLACEHOLDER.SUBTITLE:
                return DocItemLabel.SECTION_HEADER
        return DocItemLabel.PARAGRAPH

    def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
        """Add the paragraphs of a text shape as text items and list items.

        Paragraphs carrying a bullet character or auto-numbering become list
        items under a single list group created on demand; all other
        paragraphs become text items labelled by ``_text_label``.
        Runs that contain only whitespace are skipped.

        Args:
            shape: Shape that has a text frame.
            parent_slide: Group item of the slide the shape belongs to.
            slide_ind: Zero-based slide index.
            doc: Document being populated.
        """
        prov = self.generate_prov(shape, slide_ind, shape.text.strip())
        ns = {"a": self.namespaces["a"]}

        # Classify every paragraph once; the bullet markup lives on the
        # paragraph element, so it is the same for all runs of a paragraph.
        paragraphs = list(shape.text_frame.paragraphs)
        bullet_kinds = [self._bullet_kind(par._element) for par in paragraphs]

        # A single numbered paragraph makes the whole group an ordered list.
        if "Numbered" in bullet_kinds:
            list_label = GroupLabel.ORDERED_LIST
        else:
            list_label = GroupLabel.LIST

        # Indented paragraphs (level > 0) are most likely sub-lists.
        if any(
            kind is not None or par.level > 0
            for par, kind in zip(paragraphs, bullet_kinds)
        ):
            _log.debug("LIST DETECTED!")
        else:
            _log.debug("No List")

        new_list = None
        enum_list_item_value = 0
        for paragraph, bullet_kind in zip(paragraphs, bullet_kinds):
            # The counter advances on every paragraph and restarts after any
            # plain-text paragraph, so numbering is relative to that paragraph.
            enum_list_item_value += 1
            p = paragraph._element
            run_texts = [
                e.text for e in p.iterfind(".//a:r", namespaces=ns) if e.text.strip()
            ]
            if not run_texts:
                continue
            text = "".join(run_texts)

            if bullet_kind is None:
                enum_list_item_value = 0
                doc.add_text(
                    label=self._text_label(shape),
                    parent=parent_slide,
                    text=text,
                    prov=prov,
                )
                continue

            is_numbered = bullet_kind == "Numbered"
            enum_marker = f"{enum_list_item_value}." if is_numbered else ""
            if new_list is None:
                new_list = doc.add_group(
                    label=list_label, name="list", parent=parent_slide
                )
            doc.add_list_item(
                marker=enum_marker,
                enumerated=is_numbered,
                parent=new_list,
                text=text,
                prov=prov,
            )

    def handle_title(self, shape, parent_slide, slide_ind, doc):
        """Add a title or subtitle placeholder as a text item.

        Title placeholders are labelled TITLE and subtitle placeholders
        SECTION_HEADER; other placeholder types and empty text are ignored.
        """
        placeholder_type = shape.placeholder_format.type
        txt = shape.text.strip()
        prov = self.generate_prov(shape, slide_ind, txt)

        if not txt:
            return

        if placeholder_type in (PP_PLACEHOLDER.CENTER_TITLE, PP_PLACEHOLDER.TITLE):
            _log.info(f"Title found: {shape.text}")
            doc.add_text(
                label=DocItemLabel.TITLE, parent=parent_slide, text=txt, prov=prov
            )
        elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
            _log.info(f"Subtitle found: {shape.text}")
            # Subtitles are emitted as SECTION_HEADER items.
            doc.add_text(
                label=DocItemLabel.SECTION_HEADER,
                parent=parent_slide,
                text=txt,
                prov=prov,
            )

    def handle_pictures(self, shape, parent_slide, slide_ind, doc):
        """Add a picture item (without caption) for a picture shape."""
        prov = self.generate_prov(shape, slide_ind, "")
        doc.add_picture(parent=parent_slide, caption=None, prov=prov)

    def handle_tables(self, shape, parent_slide, slide_ind, doc):
        """Add a table item for a shape that holds a table.

        Row and column spans are read from the ``rowSpan`` / ``gridSpan``
        attributes of each cell's XML element. Cells with empty text are
        left out, and a table without any non-empty cell is not added.
        """
        if not shape.has_table:
            return

        table = shape.table
        # XML element of the shape that contains the table.
        table_xml = shape._element
        prov = self.generate_prov(shape, slide_ind, "")

        num_rows = len(table.rows)
        num_cols = 0
        tcells = []
        for row_idx, row in enumerate(table.rows):
            num_cols = max(num_cols, len(row.cells))
            for col_idx, cell in enumerate(row.cells):
                # Locate the 'tc' element of this cell (XPath is 1-based).
                matches = table_xml.xpath(
                    f".//a:tbl/a:tr[{row_idx + 1}]/a:tc[{col_idx + 1}]"
                )
                if not matches:
                    continue  # If no cell XML is found, skip

                cell_xml = matches[0]
                row_span = int(cell_xml.get("rowSpan", 1))  # Vertical span
                col_span = int(cell_xml.get("gridSpan", 1))  # Horizontal span

                cell_text = cell.text.strip()
                if not cell_text:
                    continue

                tcells.append(
                    TableCell(
                        text=cell_text,
                        row_span=row_span,
                        col_span=col_span,
                        start_row_offset_idx=row_idx,
                        end_row_offset_idx=row_idx + row_span,
                        start_col_offset_idx=col_idx,
                        end_col_offset_idx=col_idx + col_span,
                        col_header=False,
                        row_header=False,
                    )
                )

        if not tcells:
            # Fully empty table: nothing to add.
            return

        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
        doc.add_table(parent=parent_slide, data=data, prov=prov)

    def walk_linear(self, pptx_obj, doc) -> DoclingDocument:
        """Walk all slides in order and add their content to ``doc``.

        Args:
            pptx_obj: Loaded python-pptx presentation.
            doc: Document to populate.

        Returns:
            The same ``doc``, with one page and one CHAPTER group per slide.
        """
        # Units of size in PPTX by default are EMU units (English Metric Units)
        slide_width = pptx_obj.slide_width
        slide_height = pptx_obj.slide_height

        # Loop through each slide
        for slide_ind, slide in enumerate(pptx_obj.slides):
            parent_slide = doc.add_group(
                name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=None
            )
            doc.add_page(
                page_no=slide_ind + 1,
                size=Size(width=slide_width, height=slide_height),
            )

            # Loop through each shape in the slide
            for shape in slide.shapes:
                if shape.has_table:
                    # Handle Tables
                    self.handle_tables(shape, parent_slide, slide_ind, doc)

                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                    # Handle Pictures
                    self.handle_pictures(shape, parent_slide, slide_ind, doc)

                # If shape doesn't have any text, move on to the next shape
                shape_text = getattr(shape, "text", None)
                if shape_text is None or not shape_text.strip():
                    continue
                if not shape.has_text_frame:
                    _log.warning("Warning: shape has text but not text_frame")
                    continue

                # Handle text elements, including titles and lists
                # (bullet lists, numbered lists)
                self.handle_text_elements(shape, parent_slide, slide_ind, doc)

        return doc