docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,398 @@
1
+ import logging
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Union
5
+
6
+ from docling_core.types.doc import (
7
+ BoundingBox,
8
+ CoordOrigin,
9
+ DocItemLabel,
10
+ DoclingDocument,
11
+ DocumentOrigin,
12
+ GroupLabel,
13
+ ImageRef,
14
+ ProvenanceItem,
15
+ Size,
16
+ TableCell,
17
+ TableData,
18
+ )
19
+ from docling_core.types.doc.document import ContentLayer
20
+ from PIL import Image, UnidentifiedImageError
21
+ from pptx import Presentation
22
+ from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
23
+ from pptx.oxml.text import CT_TextLineBreak
24
+
25
+ from docling.backend.abstract_backend import (
26
+ DeclarativeDocumentBackend,
27
+ PaginatedDocumentBackend,
28
+ )
29
+ from docling.datamodel.base_models import InputFormat
30
+ from docling.datamodel.document import InputDocument
31
+
32
+ _log = logging.getLogger(__name__)
33
+
34
+
35
class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
    """Backend that converts a PPTX presentation into a ``DoclingDocument``.

    Every slide becomes one page plus one CHAPTER group. The shapes found on
    the slide (text, lists, tables, pictures, nested groups) and the slide's
    speaker notes are added as children of that group.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Open the presentation with python-pptx.

        Args:
            in_doc: The input document descriptor handed to the base class.
            path_or_stream: Either an in-memory stream or a filesystem path.

        Raises:
            RuntimeError: If python-pptx cannot load the document.
        """
        super().__init__(in_doc, path_or_stream)
        # XML namespaces used when querying the raw DrawingML elements.
        self.namespaces = {
            "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
            "c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
            "p": "http://schemas.openxmlformats.org/presentationml/2006/main",
        }
        # Powerpoint file:
        self.path_or_stream = path_or_stream

        self.pptx_obj = None
        self.valid = False
        try:
            if isinstance(self.path_or_stream, BytesIO):
                self.pptx_obj = Presentation(self.path_or_stream)
            elif isinstance(self.path_or_stream, Path):
                self.pptx_obj = Presentation(str(self.path_or_stream))
        except Exception as e:
            raise RuntimeError(
                f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
            ) from e

        # Only claim validity when a presentation object was actually created;
        # an unsupported source type leaves pptx_obj as None.
        self.valid = self.pptx_obj is not None

    def page_count(self) -> int:
        """Return the number of slides, or 0 when the backend is not valid."""
        if not self.is_valid():
            return 0
        assert self.pptx_obj is not None
        return len(self.pptx_obj.slides)

    def is_valid(self) -> bool:
        """Return whether the presentation was loaded successfully."""
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        """Report pagination support; each slide is exposed as one page."""
        return True

    def unload(self):
        """Close an in-memory source stream and drop the reference to the source."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()

        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> set[InputFormat]:
        """Return the input formats handled by this backend (PPTX only)."""
        return {InputFormat.PPTX}

    def convert(self) -> DoclingDocument:
        """Parse the PPTX into a structured ``DoclingDocument``."""
        # NOTE(review): "application/vnd.ms-powerpoint" is the legacy .ppt MIME
        # type; kept unchanged because consumers may match on it - confirm.
        origin = DocumentOrigin(
            filename=self.file.name or "file",
            mimetype="application/vnd.ms-powerpoint",
            binary_hash=self.document_hash,
        )

        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
        return self.walk_linear(self.pptx_obj, doc)

    def generate_prov(
        self, shape, slide_ind, text="", slide_size=Size(width=1, height=1)
    ):
        """Build the provenance item (page number, char span, bbox) for a shape.

        Args:
            shape: The python-pptx shape whose geometry is recorded.
            slide_ind: Zero-based slide index; the page number is ``slide_ind + 1``.
            text: Text of the element; only its length is used for the char span.
            slide_size: Fallback extent used when the shape has no own geometry.

        Returns:
            A ``ProvenanceItem`` describing where the element sits on the slide.
        """
        geometry = (shape.left, shape.top, shape.width, shape.height)
        if any(value is None for value in geometry):
            # No explicit geometry on the shape: fall back to the whole slide.
            left, top = 0, 0
            width, height = slide_size.width, slide_size.height
        else:
            # Compare against None (not truthiness) so that a shape placed at
            # x == 0 keeps its real bounding box.
            left, top, width, height = geometry

        # NOTE(review): the box is tagged BOTTOMLEFT although python-pptx
        # offsets appear to be measured from the slide's top-left - confirm.
        shape_bbox = BoundingBox.from_tuple(
            (left, top, left + width, top + height), origin=CoordOrigin.BOTTOMLEFT
        )
        return ProvenanceItem(
            page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox
        )

    def _list_kind(self, paragraph):
        """Return ``(is_list_item, bullet_type)`` for a paragraph.

        ``bullet_type`` is ``"Bullet"``, ``"Numbered"`` or ``"None"``.
        """
        ns = {"a": self.namespaces["a"]}
        p = paragraph._element
        if p.find(".//a:buChar", namespaces=ns) is not None:
            return (True, "Bullet")
        if p.find(".//a:buAutoNum", namespaces=ns) is not None:
            return (True, "Numbered")
        if paragraph.level > 0:
            # Most likely a sub-list
            return (True, "None")
        return (False, "None")

    @staticmethod
    def _text_label(shape):
        """Pick the label for non-list text: TITLE, SECTION_HEADER or PARAGRAPH."""
        if shape.is_placeholder:
            placeholder_type = shape.placeholder_format.type
            if placeholder_type in (PP_PLACEHOLDER.CENTER_TITLE, PP_PLACEHOLDER.TITLE):
                return DocItemLabel.TITLE
            if placeholder_type == PP_PLACEHOLDER.SUBTITLE:
                # Subtitles are emitted as section headers, as in handle_title.
                return DocItemLabel.SECTION_HEADER
        return DocItemLabel.PARAGRAPH

    def handle_text_elements(
        self, shape, parent_slide, slide_ind, doc: DoclingDocument, slide_size
    ):
        """Add the paragraphs of a text shape to ``doc``.

        List paragraphs (bulleted, numbered or indented) are collected into a
        single list group per shape; every other paragraph is added as a title,
        section header or plain paragraph depending on the placeholder type.
        All items of the shape share one provenance entry.
        """
        list_group = None
        enum_list_item_value = 0
        prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)

        for paragraph in shape.text_frame.paragraphs:
            is_a_list, bullet_type = self._list_kind(paragraph)

            # Line breaks inside a paragraph are flattened to single spaces.
            p_text = "".join(
                " " if isinstance(child, CT_TextLineBreak) else child.text
                for child in paragraph._element.content_children
            )

            if not is_a_list:
                doc.add_text(
                    label=self._text_label(shape),
                    parent=parent_slide,
                    text=p_text,
                    prov=prov,
                )
                continue

            if list_group is None:
                # The first list paragraph of the shape opens the list group.
                list_group = doc.add_list_group(name="list", parent=parent_slide)
                enum_list_item_value = 0

            enumerated = bullet_type == "Numbered"
            enum_marker = ""
            if enumerated:
                enum_list_item_value += 1
                enum_marker = str(enum_list_item_value) + "."

            doc.add_list_item(
                marker=enum_marker,
                enumerated=enumerated,
                parent=list_group,
                text=p_text,
                prov=prov,
            )

    def handle_title(self, shape, parent_slide, slide_ind, doc, slide_size=None):
        """Add a title or subtitle placeholder shape to ``doc``.

        Args:
            shape: Placeholder shape to inspect.
            parent_slide: Group item of the slide the shape belongs to.
            slide_ind: Zero-based slide index.
            doc: Document being built.
            slide_size: Optional slide extent forwarded to ``generate_prov``;
                when omitted, ``generate_prov``'s own default is used.
        """
        placeholder_type = shape.placeholder_format.type
        txt = shape.text.strip()
        if slide_size is None:
            prov = self.generate_prov(shape, slide_ind, txt)
        else:
            prov = self.generate_prov(shape, slide_ind, txt, slide_size)

        if not txt:
            return

        if placeholder_type in (PP_PLACEHOLDER.CENTER_TITLE, PP_PLACEHOLDER.TITLE):
            _log.info("Title found: %s", shape.text)
            doc.add_text(
                label=DocItemLabel.TITLE, parent=parent_slide, text=txt, prov=prov
            )
        elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
            _log.info("Subtitle found: %s", shape.text)
            # Subtitles are stored as section headers.
            doc.add_text(
                label=DocItemLabel.SECTION_HEADER,
                parent=parent_slide,
                text=txt,
                prov=prov,
            )

    def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size):
        """Add the embedded image of a picture shape to ``doc``.

        Pictures without an embedded image and images Pillow cannot decode are
        skipped with a warning instead of aborting the conversion.
        """
        try:
            image = shape.image
        except (AttributeError, ValueError) as e:
            # python-pptx raises ValueError for linked (non-embedded) pictures.
            _log.warning("Warning: shape has no embedded image: %s", e)
            return

        try:
            image_bytes = image.blob
            im_dpi, _ = image.dpi
            pil_image = Image.open(BytesIO(image_bytes))

            prov = self.generate_prov(shape, slide_ind, "", slide_size)
            doc.add_picture(
                parent=parent_slide,
                image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
                caption=None,
                prov=prov,
            )
        except (UnidentifiedImageError, OSError) as e:
            _log.warning("Warning: image cannot be loaded by Pillow: %s", e)

    def handle_tables(self, shape, parent_slide, slide_ind, doc, slide_size):
        """Add the table held by a shape to ``doc``.

        Only cells with non-empty text are recorded; a table whose cells are
        all empty is not added. Row and column spans are read from the
        ``rowSpan`` / ``gridSpan`` attributes of the cell XML, and cells of the
        first row are flagged as column headers.
        """
        if not shape.has_table:
            return

        table = shape.table
        # XML element of the shape that contains the table.
        table_xml = shape._element
        prov = self.generate_prov(shape, slide_ind, "", slide_size)

        num_rows = len(table.rows)
        num_cols = 0
        tcells = []

        for row_idx, row in enumerate(table.rows):
            num_cols = max(num_cols, len(row.cells))
            for col_idx, cell in enumerate(row.cells):
                cell_text = cell.text.strip()
                if not cell_text:
                    continue  # empty cells are not recorded

                # XPath positions are 1-based; this is the 'tc' element.
                matches = table_xml.xpath(
                    f".//a:tbl/a:tr[{row_idx + 1}]/a:tc[{col_idx + 1}]"
                )
                if not matches:
                    continue  # If no cell XML is found, skip

                cell_xml = matches[0]
                row_span = int(cell_xml.get("rowSpan") or 1)  # Vertical span
                col_span = int(cell_xml.get("gridSpan") or 1)  # Horizontal span

                tcells.append(
                    TableCell(
                        text=cell_text,
                        row_span=row_span,
                        col_span=col_span,
                        start_row_offset_idx=row_idx,
                        end_row_offset_idx=row_idx + row_span,
                        start_col_offset_idx=col_idx,
                        end_col_offset_idx=col_idx + col_span,
                        column_header=row_idx == 0,
                        row_header=False,
                    )
                )

        if tcells:
            # Table is not fully empty: create the Docling table.
            data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
            doc.add_table(parent=parent_slide, data=data, prov=prov)

    def _walk_shape(self, shape, parent_slide, slide_ind, doc, slide_size):
        """Dispatch one shape (recursing into groups) to the matching handlers."""
        if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
            for grouped_shape in shape.shapes:
                self._walk_shape(grouped_shape, parent_slide, slide_ind, doc, slide_size)

        if shape.has_table:
            self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)

        if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
            self.handle_pictures(shape, parent_slide, slide_ind, doc, slide_size)

        # If the shape doesn't carry any text, there is nothing more to do.
        text = getattr(shape, "text", None)
        if text is None or not text.strip():
            return
        if not shape.has_text_frame:
            _log.warning("Warning: shape has text but not text_frame")
            return

        # Remaining text elements, including bullet and numbered lists.
        self.handle_text_elements(shape, parent_slide, slide_ind, doc, slide_size)

    def walk_linear(self, pptx_obj, doc) -> DoclingDocument:
        """Walk all slides in order and add their content to ``doc``.

        Args:
            pptx_obj: The loaded python-pptx presentation.
            doc: Document to populate.

        Returns:
            The same ``doc`` instance, populated with one page and one CHAPTER
            group per slide.
        """
        # Units of size in PPTX by default are EMU units (English Metric Units)
        slide_width = pptx_obj.slide_width
        slide_height = pptx_obj.slide_height

        for slide_ind, slide in enumerate(pptx_obj.slides):
            parent_slide = doc.add_group(
                name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=None
            )

            slide_size = Size(width=slide_width, height=slide_height)
            doc.add_page(page_no=slide_ind + 1, size=slide_size)

            for shape in slide.shapes:
                self._walk_shape(shape, parent_slide, slide_ind, doc, slide_size)

            # Speaker notes are attached as furniture text with an empty bbox.
            if not slide.has_notes_slide:
                continue
            notes_frame = slide.notes_slide.notes_text_frame
            if notes_frame is None:
                continue
            notes_text = notes_frame.text.strip()
            if not notes_text:
                continue

            prov = ProvenanceItem(
                page_no=slide_ind + 1,
                charspan=[0, len(notes_text)],
                bbox=BoundingBox(l=0, t=0, r=0, b=0),
            )
            doc.add_text(
                label=DocItemLabel.TEXT,
                parent=parent_slide,
                text=notes_text,
                prov=prov,
                content_layer=ContentLayer.FURNITURE,
            )

        return doc