docling 1.20.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. docling/backend/abstract_backend.py +32 -37
  2. docling/backend/docling_parse_backend.py +16 -12
  3. docling/backend/docling_parse_v2_backend.py +15 -11
  4. docling/backend/html_backend.py +425 -0
  5. docling/backend/mspowerpoint_backend.py +375 -0
  6. docling/backend/msword_backend.py +509 -0
  7. docling/backend/pdf_backend.py +78 -0
  8. docling/backend/pypdfium2_backend.py +15 -10
  9. docling/cli/main.py +61 -60
  10. docling/datamodel/base_models.py +73 -193
  11. docling/datamodel/document.py +364 -318
  12. docling/datamodel/pipeline_options.py +13 -0
  13. docling/datamodel/settings.py +1 -0
  14. docling/document_converter.py +215 -252
  15. docling/models/base_model.py +25 -0
  16. docling/models/base_ocr_model.py +10 -5
  17. docling/models/ds_glm_model.py +209 -20
  18. docling/models/easyocr_model.py +4 -1
  19. docling/models/layout_model.py +73 -61
  20. docling/models/page_assemble_model.py +21 -5
  21. docling/models/page_preprocessing_model.py +57 -0
  22. docling/models/table_structure_model.py +34 -32
  23. docling/models/tesseract_ocr_cli_model.py +8 -5
  24. docling/models/tesseract_ocr_model.py +8 -5
  25. docling/pipeline/base_pipeline.py +190 -0
  26. docling/pipeline/simple_pipeline.py +59 -0
  27. docling/pipeline/standard_pdf_pipeline.py +198 -0
  28. docling/utils/export.py +4 -3
  29. docling/utils/layout_utils.py +17 -11
  30. docling-2.0.0.dist-info/METADATA +149 -0
  31. docling-2.0.0.dist-info/RECORD +42 -0
  32. docling/pipeline/base_model_pipeline.py +0 -18
  33. docling/pipeline/standard_model_pipeline.py +0 -66
  34. docling-1.20.0.dist-info/METADATA +0 -380
  35. docling-1.20.0.dist-info/RECORD +0 -35
  36. {docling-1.20.0.dist-info → docling-2.0.0.dist-info}/LICENSE +0 -0
  37. {docling-1.20.0.dist-info → docling-2.0.0.dist-info}/WHEEL +0 -0
  38. {docling-1.20.0.dist-info → docling-2.0.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,375 @@
1
+ import logging
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Set, Union
5
+
6
+ from docling_core.types.doc import (
7
+ BoundingBox,
8
+ CoordOrigin,
9
+ DocItemLabel,
10
+ DoclingDocument,
11
+ DocumentOrigin,
12
+ GroupLabel,
13
+ ProvenanceItem,
14
+ Size,
15
+ TableCell,
16
+ TableData,
17
+ )
18
+ from pptx import Presentation
19
+ from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
20
+
21
+ from docling.backend.abstract_backend import (
22
+ DeclarativeDocumentBackend,
23
+ PaginatedDocumentBackend,
24
+ )
25
+ from docling.datamodel.base_models import InputFormat
26
+ from docling.datamodel.document import InputDocument
27
+
28
+ _log = logging.getLogger(__name__)
29
+
30
+
31
+ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
32
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
    """Open a PowerPoint presentation from a file path or an in-memory stream.

    Args:
        in_doc: Input document descriptor; forwarded to the base class.
        path_or_stream: A ``BytesIO`` holding the presentation bytes, or a
            ``Path`` pointing at the presentation file.

    Raises:
        RuntimeError: If ``Presentation`` raises while opening the source.
            The original exception is chained as the cause.
    """
    super().__init__(in_doc, path_or_stream)
    # XML namespaces used when querying the raw slide XML.
    self.namespaces = {
        "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
        "c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
        "p": "http://schemas.openxmlformats.org/presentationml/2006/main",
    }
    # Powerpoint file:
    self.path_or_stream = path_or_stream

    self.pptx_obj = None
    self.valid = False
    try:
        if isinstance(self.path_or_stream, BytesIO):
            self.pptx_obj = Presentation(self.path_or_stream)
        elif isinstance(self.path_or_stream, Path):
            self.pptx_obj = Presentation(str(self.path_or_stream))
    except Exception as e:
        raise RuntimeError(
            f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
        ) from e

    # Only report the backend as valid when a presentation was really loaded.
    # An input that is neither BytesIO nor Path leaves pptx_obj as None and
    # must not be advertised as usable.
    self.valid = self.pptx_obj is not None
57
+
58
def page_count(self) -> int:
    """Return the number of slides, or 0 when the backend is not valid."""
    if not self.is_valid():
        return 0

    assert self.pptx_obj is not None
    return len(self.pptx_obj.slides)
64
+
65
def is_valid(self) -> bool:
    """Return the validity flag recorded on this backend instance."""
    return self.valid
67
+
68
@classmethod
def supports_pagination(cls) -> bool:
    """Report that this backend is paginated (always ``True``)."""
    # Open question carried over from the original author: True? if so,
    # how to handle pages...
    return True
71
+
72
def unload(self):
    """Close the source if it is an in-memory stream, then drop the reference."""
    source = self.path_or_stream
    if isinstance(source, BytesIO):
        source.close()

    self.path_or_stream = None
77
+
78
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
    """Return the set of input formats this backend handles (PPTX only)."""
    formats = {InputFormat.PPTX}
    return formats
81
+
82
def convert(self) -> DoclingDocument:
    """Parse the loaded PPTX into a structured ``DoclingDocument``.

    The document is named after the file stem when the source is a ``Path``
    and ``"stream"`` otherwise. Its content is filled in by ``walk_linear``.
    """
    # A stream carries no file name, so the origin's filename stays empty.
    source = self.path_or_stream
    fname = source.name if isinstance(source, Path) else ""

    origin = DocumentOrigin(
        filename=fname,
        mimetype="application/vnd.ms-powerpoint",
        binary_hash=self.document_hash,
    )
    docname = Path(fname).stem if len(fname) > 0 else "stream"

    # The origin must be attached when the document is created.
    doc = DoclingDocument(name=docname, origin=origin)
    return self.walk_linear(self.pptx_obj, doc)
105
+
106
def generate_prov(self, shape, slide_ind, text=""):
    """Build a ``ProvenanceItem`` locating ``shape`` on its slide.

    Args:
        shape: Object exposing ``left``, ``top``, ``width`` and ``height``.
        slide_ind: Zero-based slide index; stored as a one-based ``page_no``.
        text: Text whose length sets the end of the character span.

    Returns:
        The provenance item with the shape's bounding box and ``[0, len(text)]``
        as its character span.
    """
    x0 = shape.left
    y0 = shape.top
    x1 = x0 + shape.width
    y1 = y0 + shape.height

    # NOTE(review): the box is tagged BOTTOMLEFT although it is built from
    # left/top offsets -- confirm this is the intended coordinate origin.
    bbox = BoundingBox.from_tuple([x0, y0, x1, y1], origin=CoordOrigin.BOTTOMLEFT)

    return ProvenanceItem(
        page_no=slide_ind + 1, charspan=[0, len(text)], bbox=bbox
    )
119
+
120
def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
    """Add the paragraphs of a text shape to ``doc`` as list items or text.

    For every paragraph in ``shape.text_frame``:

    * a paragraph whose XML contains ``a:buChar`` (bullet) or ``a:buAutoNum``
      (numbering) is added as a list item inside a newly created list group;
    * any other paragraph is added as a text item labelled TITLE,
      SECTION_HEADER (subtitle placeholder) or PARAGRAPH.

    A paragraph is emitted only if at least one of its ``a:r`` runs holds
    non-whitespace text, and it is emitted exactly once.

    Args:
        shape: Shape with a text frame.
        parent_slide: Group item of the slide; parent of new groups and texts.
        slide_ind: Zero-based index of the slide, used for provenance.
        doc: Document being populated.
    """
    ns = {"a": self.namespaces["a"]}
    enum_list_item_value = 0

    for paragraph in shape.text_frame.paragraphs:
        enum_list_item_value += 1
        p = paragraph._element

        # Bullet detection depends only on the paragraph, so do it once here
        # rather than once per run. A bullet character takes precedence over
        # auto-numbering, as in the original if/elif chain.
        has_bullet = p.find(".//a:buChar", namespaces=ns) is not None
        is_numbered = (
            not has_bullet
            and p.find(".//a:buAutoNum", namespaces=ns) is not None
        )
        is_list_item = has_bullet or is_numbered

        # An indented paragraph is most likely a sub-list, even without an
        # explicit bullet marker.
        is_a_list = is_list_item or paragraph.level > 0

        list_text = paragraph.text.strip()
        prov = self.generate_prov(shape, slide_ind, shape.text.strip())

        if is_a_list:
            _log.debug("LIST DETECTED!")
            # Ordered list for auto-numbered paragraphs, plain list otherwise.
            list_label = GroupLabel.ORDERED_LIST if is_numbered else GroupLabel.LIST
            new_list = doc.add_group(
                label=list_label, name="list", parent=parent_slide
            )
        else:
            _log.debug("No List")
            new_list = None

        # Emit the paragraph once if any run carries visible text. Looping
        # over the runs and emitting inside the loop would add the full
        # paragraph text once per run.
        has_text = any(
            len(run.text.strip()) > 0
            for run in p.iterfind(".//a:r", namespaces=ns)
        )
        if not has_text:
            continue

        if is_list_item:
            # Set marker and enumerated arguments if this is an enumeration element.
            doc.add_list_item(
                marker=str(enum_list_item_value) + ".",
                enumerated=is_numbered,
                parent=new_list,
                text=list_text,
                prov=prov,
            )
            continue

        # Assign the label: TITLE or SECTION_HEADER for the matching
        # placeholders, PARAGRAPH for everything else.
        doc_label = DocItemLabel.PARAGRAPH
        if shape.is_placeholder:
            placeholder_type = shape.placeholder_format.type
            if placeholder_type in [
                PP_PLACEHOLDER.CENTER_TITLE,
                PP_PLACEHOLDER.TITLE,
            ]:
                doc_label = DocItemLabel.TITLE
            elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
                # The original had a bare expression here, so the label was
                # never assigned.
                doc_label = DocItemLabel.SECTION_HEADER

        # A non-list paragraph restarts the enumeration counter.
        enum_list_item_value = 0

        doc.add_text(
            label=doc_label,
            parent=parent_slide,
            text=list_text,
            prov=prov,
        )
    return
223
+
224
def handle_title(self, shape, parent_slide, slide_ind, doc):
    """Add a title or subtitle placeholder shape to ``doc``.

    Title and centre-title placeholders become TITLE items; subtitle
    placeholders become SECTION_HEADER items. Shapes with blank text, or with
    any other placeholder type, add nothing.
    """
    kind = shape.placeholder_format.type
    txt = shape.text.strip()
    prov = self.generate_prov(shape, slide_ind, txt)

    if len(txt.strip()) == 0:
        return

    if kind in [PP_PLACEHOLDER.CENTER_TITLE, PP_PLACEHOLDER.TITLE]:
        _log.info(f"Title found: {shape.text}")
        doc.add_text(
            label=DocItemLabel.TITLE, parent=parent_slide, text=txt, prov=prov
        )
        return

    if kind == PP_PLACEHOLDER.SUBTITLE:
        _log.info(f"Subtitle found: {shape.text}")
        # No dedicated SUBTITLE label is available, so SECTION_HEADER is used.
        doc.add_text(
            label=DocItemLabel.SECTION_HEADER,
            parent=parent_slide,
            text=txt,
            prov=prov,
        )
    return
246
+
247
def handle_pictures(self, shape, parent_slide, slide_ind, doc):
    """Add a picture item for ``shape`` under the slide group, without caption."""
    # Pictures carry no text, so provenance is built with an empty string.
    picture_prov = self.generate_prov(shape, slide_ind, "")
    doc.add_picture(
        parent=parent_slide,
        caption=None,
        prov=picture_prov,
    )
    return
252
+
253
def handle_tables(self, shape, parent_slide, slide_ind, doc):
    """Convert the table held by ``shape`` and add it under the slide group.

    Only cells with non-blank text are recorded. A table whose cells are all
    blank is not added at all. Row and column spans are read from the
    ``rowSpan`` / ``gridSpan`` attributes of each ``a:tc`` element and default
    to 1 when the attribute is absent.

    Args:
        shape: Shape exposing ``has_table``, ``table`` and ``_element``.
        parent_slide: Group item of the slide; becomes the table's parent.
        slide_ind: Zero-based index of the slide, used for provenance.
        doc: Document being populated.
    """
    if not shape.has_table:
        return

    table = shape.table
    # XML element of the shape that contains the table.
    table_xml = shape._element
    prov = self.generate_prov(shape, slide_ind, "")

    num_rows = len(table.rows)
    num_cols = 0
    tcells = []

    for row_idx, row in enumerate(table.rows):
        num_cols = max(num_cols, len(row.cells))
        for col_idx, cell in enumerate(row.cells):
            # Look up the cell's 'tc' element (XPath positions are 1-based).
            matches = table_xml.xpath(
                f".//a:tbl/a:tr[{row_idx + 1}]/a:tc[{col_idx + 1}]"
            )
            if not matches:
                continue  # If no cell XML is found, skip

            cell_text = cell.text.strip()
            if len(cell_text) == 0:
                continue  # Blank cells are not recorded

            cell_xml = matches[0]
            raw_row_span = cell_xml.get("rowSpan")  # Vertical span
            raw_col_span = cell_xml.get("gridSpan")  # Horizontal span
            row_span = 1 if raw_row_span is None else int(raw_row_span)
            col_span = 1 if raw_col_span is None else int(raw_col_span)

            tcells.append(
                TableCell(
                    text=cell_text,
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=row_idx,
                    end_row_offset_idx=row_idx + row_span,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + col_span,
                    col_header=False,
                    row_header=False,
                )
            )

    if len(tcells) > 0:
        # If table is not fully empty, create the Docling table. It is attached
        # to the slide group, the same parent used for pictures and text.
        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
        doc.add_table(data=data, prov=prov, parent=parent_slide)
    return
316
+
317
def walk_linear(self, pptx_obj, doc) -> DoclingDocument:
    """Walk every slide of ``pptx_obj`` in order and populate ``doc``.

    Each slide becomes a CHAPTER group named ``slide-<index>`` plus a page
    whose number is the slide index + 1. Within a slide, tables and pictures
    are dispatched to their handlers, and shapes with non-blank text go to
    ``handle_text_elements``.

    Args:
        pptx_obj: Loaded presentation exposing ``slide_width``,
            ``slide_height`` and ``slides``.
        doc: Document to populate.

    Returns:
        The same ``doc`` object, populated.
    """
    # Units of size in PPTX by default are EMU units (English Metric Units)
    slide_width = pptx_obj.slide_width
    slide_height = pptx_obj.slide_height

    # The enumerate index is the slide's position, so no per-slide
    # slides.index() lookup is needed.
    for slide_ind, slide in enumerate(pptx_obj.slides):
        # Slide groups are attached at the top level of the document.
        parent_slide = doc.add_group(
            name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=None
        )

        size = Size(width=slide_width, height=slide_height)
        doc.add_page(page_no=slide_ind + 1, size=size)

        # Loop through each shape in the slide
        for shape in slide.shapes:
            if shape.has_table:
                # Handle tables
                self.handle_tables(shape, parent_slide, slide_ind, doc)

            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                # Handle pictures
                self.handle_pictures(shape, parent_slide, slide_ind, doc)

            # If shape doesn't have any text, move on to the next shape
            if not hasattr(shape, "text"):
                continue
            if shape.text is None:
                continue
            if len(shape.text.strip()) == 0:
                continue
            if not shape.has_text_frame:
                # Logger.warn is a deprecated alias of Logger.warning.
                _log.warning("Warning: shape has text but not text_frame")
                continue

            # Handle text elements, including titles and lists
            # (bullet lists, numbered lists)
            self.handle_text_elements(shape, parent_slide, slide_ind, doc)

    return doc