docling 2.17.0__py3-none-any.whl → 2.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,9 @@
1
1
  import logging
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import Set, Union
4
+ from typing import Optional, Set, Union
5
5
 
6
- from bs4 import BeautifulSoup
6
+ from bs4 import BeautifulSoup, Tag
7
7
  from docling_core.types.doc import (
8
8
  DocItemLabel,
9
9
  DoclingDocument,
@@ -24,7 +24,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
24
24
  def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
25
25
  super().__init__(in_doc, path_or_stream)
26
26
  _log.debug("About to init HTML backend...")
27
- self.soup = None
27
+ self.soup: Optional[Tag] = None
28
28
  # HTML file:
29
29
  self.path_or_stream = path_or_stream
30
30
  # Initialise the parents for the hierarchy
@@ -89,7 +89,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
89
89
  )
90
90
  return doc
91
91
 
92
- def walk(self, element, doc):
92
+ def walk(self, element: Tag, doc: DoclingDocument):
93
93
  try:
94
94
  # Iterate over elements in the body of the document
95
95
  for idx, element in enumerate(element.children):
@@ -106,7 +106,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
106
106
 
107
107
  return doc
108
108
 
109
- def analyse_element(self, element, idx, doc):
109
+ def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument):
110
110
  """
111
111
  if element.name!=None:
112
112
  _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
@@ -136,7 +136,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
136
136
  else:
137
137
  self.walk(element, doc)
138
138
 
139
- def get_direct_text(self, item):
139
+ def get_direct_text(self, item: Tag):
140
140
  """Get the direct text of the <li> element (ignoring nested lists)."""
141
141
  text = item.find(string=True, recursive=False)
142
142
  if isinstance(text, str):
@@ -145,7 +145,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
145
145
  return ""
146
146
 
147
147
  # Function to recursively extract text from all child nodes
148
- def extract_text_recursively(self, item):
148
+ def extract_text_recursively(self, item: Tag):
149
149
  result = []
150
150
 
151
151
  if isinstance(item, str):
@@ -166,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
166
166
 
167
167
  return "".join(result) + " "
168
168
 
169
- def handle_header(self, element, idx, doc):
169
+ def handle_header(self, element: Tag, idx: int, doc: DoclingDocument):
170
170
  """Handles header tags (h1, h2, etc.)."""
171
171
  hlevel = int(element.name.replace("h", ""))
172
172
  slevel = hlevel - 1
@@ -208,7 +208,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
208
208
  level=hlevel,
209
209
  )
210
210
 
211
- def handle_code(self, element, idx, doc):
211
+ def handle_code(self, element: Tag, idx: int, doc: DoclingDocument):
212
212
  """Handles monospace code snippets (pre)."""
213
213
  if element.text is None:
214
214
  return
@@ -216,9 +216,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
216
216
  label = DocItemLabel.CODE
217
217
  if len(text) == 0:
218
218
  return
219
- doc.add_code(parent=self.parents[self.level], label=label, text=text)
219
+ doc.add_code(parent=self.parents[self.level], text=text)
220
220
 
221
- def handle_paragraph(self, element, idx, doc):
221
+ def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument):
222
222
  """Handles paragraph tags (p)."""
223
223
  if element.text is None:
224
224
  return
@@ -228,7 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
228
228
  return
229
229
  doc.add_text(parent=self.parents[self.level], label=label, text=text)
230
230
 
231
- def handle_list(self, element, idx, doc):
231
+ def handle_list(self, element: Tag, idx: int, doc: DoclingDocument):
232
232
  """Handles list tags (ul, ol) and their list items."""
233
233
 
234
234
  if element.name == "ul":
@@ -250,7 +250,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
250
250
  self.parents[self.level + 1] = None
251
251
  self.level -= 1
252
252
 
253
- def handle_listitem(self, element, idx, doc):
253
+ def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument):
254
254
  """Handles listitem tags (li)."""
255
255
  nested_lists = element.find(["ul", "ol"])
256
256
 
@@ -304,7 +304,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
304
304
  else:
305
305
  _log.warn("list-item has no text: ", element)
306
306
 
307
- def handle_table(self, element, idx, doc):
307
+ def handle_table(self, element: Tag, idx: int, doc: DoclingDocument):
308
308
  """Handles table tags."""
309
309
 
310
310
  nested_tables = element.find("table")
@@ -377,7 +377,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
377
377
 
378
378
  doc.add_table(data=data, parent=self.parents[self.level])
379
379
 
380
- def get_list_text(self, list_element, level=0):
380
+ def get_list_text(self, list_element: Tag, level=0):
381
381
  """Recursively extract text from <ul> or <ol> with proper indentation."""
382
382
  result = []
383
383
  bullet_char = "*" # Default bullet character for unordered lists
@@ -403,7 +403,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
403
403
 
404
404
  return result
405
405
 
406
- def extract_table_cell_text(self, cell):
406
+ def extract_table_cell_text(self, cell: Tag):
407
407
  """Extract text from a table cell, including lists with indents."""
408
408
  contains_lists = cell.find(["ul", "ol"])
409
409
  if contains_lists is None:
@@ -414,7 +414,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
414
414
  )
415
415
  return cell.text
416
416
 
417
- def handle_figure(self, element, idx, doc):
417
+ def handle_figure(self, element: Tag, idx: int, doc: DoclingDocument):
418
418
  """Handles image tags (img)."""
419
419
 
420
420
  # Extract the image URI from the <img> tag
@@ -437,6 +437,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
437
437
  caption=fig_caption,
438
438
  )
439
439
 
440
- def handle_image(self, element, idx, doc):
440
+ def handle_image(self, element: Tag, idx, doc: DoclingDocument):
441
441
  """Handles image tags (img)."""
442
442
  doc.add_picture(parent=self.parents[self.level], caption=None)
@@ -6,6 +6,7 @@ from pathlib import Path
6
6
  from typing import List, Optional, Set, Union
7
7
 
8
8
  import marko
9
+ import marko.element
9
10
  import marko.ext
10
11
  import marko.ext.gfm
11
12
  import marko.inline
@@ -23,14 +24,19 @@ from docling_core.types.doc import (
23
24
  from marko import Markdown
24
25
 
25
26
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
27
+ from docling.backend.html_backend import HTMLDocumentBackend
26
28
  from docling.datamodel.base_models import InputFormat
27
29
  from docling.datamodel.document import InputDocument
28
30
 
29
31
  _log = logging.getLogger(__name__)
30
32
 
33
+ _MARKER_BODY = "DOCLING_DOC_MD_HTML_EXPORT"
34
+ _START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
35
+ _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
36
+
31
37
 
32
38
  class MarkdownDocumentBackend(DeclarativeDocumentBackend):
33
- def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
39
+ def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
34
40
  # This regex will match any sequence of underscores
35
41
  pattern = r"_+"
36
42
 
@@ -66,6 +72,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
66
72
  self.in_table = False
67
73
  self.md_table_buffer: list[str] = []
68
74
  self.inline_texts: list[str] = []
75
+ self._html_blocks: int = 0
69
76
 
70
77
  try:
71
78
  if isinstance(self.path_or_stream, BytesIO):
@@ -74,7 +81,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
74
81
  # very long sequences of underscores will lead to unnecessarily long processing times.
75
82
  # In any proper Markdown files, underscores have to be escaped,
76
83
  # otherwise they represent emphasis (bold or italic)
77
- self.markdown = self.shorten_underscore_sequences(text_stream)
84
+ self.markdown = self._shorten_underscore_sequences(text_stream)
78
85
  if isinstance(self.path_or_stream, Path):
79
86
  with open(self.path_or_stream, "r", encoding="utf-8") as f:
80
87
  md_content = f.read()
@@ -82,7 +89,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
82
89
  # very long sequences of underscores will lead to unnecessarily long processing times.
83
90
  # In any proper Markdown files, underscores have to be escaped,
84
91
  # otherwise they represent emphasis (bold or italic)
85
- self.markdown = self.shorten_underscore_sequences(md_content)
92
+ self.markdown = self._shorten_underscore_sequences(md_content)
86
93
  self.valid = True
87
94
 
88
95
  _log.debug(self.markdown)
@@ -92,7 +99,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
92
99
  ) from e
93
100
  return
94
101
 
95
- def close_table(self, doc: DoclingDocument):
102
+ def _close_table(self, doc: DoclingDocument):
96
103
  if self.in_table:
97
104
  _log.debug("=== TABLE START ===")
98
105
  for md_table_row in self.md_table_buffer:
@@ -149,30 +156,35 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
149
156
  doc.add_table(data=table_data)
150
157
  return
151
158
 
152
- def process_inline_text(
153
- self, parent_element: Optional[NodeItem], doc: DoclingDocument
159
+ def _process_inline_text(
160
+ self, parent_item: Optional[NodeItem], doc: DoclingDocument
154
161
  ):
155
162
  txt = " ".join(self.inline_texts)
156
163
  if len(txt) > 0:
157
164
  doc.add_text(
158
165
  label=DocItemLabel.PARAGRAPH,
159
- parent=parent_element,
166
+ parent=parent_item,
160
167
  text=txt,
161
168
  )
162
169
  self.inline_texts = []
163
170
 
164
- def iterate_elements(
171
+ def _iterate_elements(
165
172
  self,
166
- element: marko.block.Element,
173
+ element: marko.element.Element,
167
174
  depth: int,
168
175
  doc: DoclingDocument,
169
- parent_element: Optional[NodeItem] = None,
176
+ visited: Set[marko.element.Element],
177
+ parent_item: Optional[NodeItem] = None,
170
178
  ):
179
+
180
+ if element in visited:
181
+ return
182
+
171
183
  # Iterates over all elements in the AST
172
184
  # Check for different element types and process relevant details
173
- if isinstance(element, marko.block.Heading):
174
- self.close_table(doc)
175
- self.process_inline_text(parent_element, doc)
185
+ if isinstance(element, marko.block.Heading) and len(element.children) > 0:
186
+ self._close_table(doc)
187
+ self._process_inline_text(parent_item, doc)
176
188
  _log.debug(
177
189
  f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
178
190
  )
@@ -200,41 +212,48 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
200
212
  traverse(element)
201
213
  snippet_text = "".join(strings)
202
214
  if len(snippet_text) > 0:
203
- parent_element = doc.add_text(
204
- label=doc_label, parent=parent_element, text=snippet_text
215
+ parent_item = doc.add_text(
216
+ label=doc_label, parent=parent_item, text=snippet_text
205
217
  )
206
218
 
207
219
  elif isinstance(element, marko.block.List):
208
- self.close_table(doc)
209
- self.process_inline_text(parent_element, doc)
220
+ has_non_empty_list_items = False
221
+ for child in element.children:
222
+ if isinstance(child, marko.block.ListItem) and len(child.children) > 0:
223
+ has_non_empty_list_items = True
224
+ break
225
+
226
+ self._close_table(doc)
227
+ self._process_inline_text(parent_item, doc)
210
228
  _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
211
- list_label = GroupLabel.LIST
212
- if element.ordered:
213
- list_label = GroupLabel.ORDERED_LIST
214
- parent_element = doc.add_group(
215
- label=list_label, name=f"list", parent=parent_element
216
- )
229
+ if has_non_empty_list_items:
230
+ label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
231
+ parent_item = doc.add_group(
232
+ label=label, name=f"list", parent=parent_item
233
+ )
217
234
 
218
- elif isinstance(element, marko.block.ListItem):
219
- self.close_table(doc)
220
- self.process_inline_text(parent_element, doc)
235
+ elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
236
+ self._close_table(doc)
237
+ self._process_inline_text(parent_item, doc)
221
238
  _log.debug(" - List item")
222
239
 
223
- snippet_text = str(element.children[0].children[0].children) # type: ignore
240
+ first_child = element.children[0]
241
+ snippet_text = str(first_child.children[0].children) # type: ignore
224
242
  is_numbered = False
225
243
  if (
226
- parent_element is not None
227
- and isinstance(parent_element, DocItem)
228
- and parent_element.label == GroupLabel.ORDERED_LIST
244
+ parent_item is not None
245
+ and isinstance(parent_item, DocItem)
246
+ and parent_item.label == GroupLabel.ORDERED_LIST
229
247
  ):
230
248
  is_numbered = True
231
249
  doc.add_list_item(
232
- enumerated=is_numbered, parent=parent_element, text=snippet_text
250
+ enumerated=is_numbered, parent=parent_item, text=snippet_text
233
251
  )
252
+ visited.add(first_child)
234
253
 
235
254
  elif isinstance(element, marko.inline.Image):
236
- self.close_table(doc)
237
- self.process_inline_text(parent_element, doc)
255
+ self._close_table(doc)
256
+ self._process_inline_text(parent_item, doc)
238
257
  _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
239
258
 
240
259
  fig_caption: Optional[TextItem] = None
@@ -243,50 +262,44 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
243
262
  label=DocItemLabel.CAPTION, text=element.title
244
263
  )
245
264
 
246
- doc.add_picture(parent=parent_element, caption=fig_caption)
265
+ doc.add_picture(parent=parent_item, caption=fig_caption)
247
266
 
248
- elif isinstance(element, marko.block.Paragraph):
249
- self.process_inline_text(parent_element, doc)
267
+ elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
268
+ self._process_inline_text(parent_item, doc)
250
269
 
251
270
  elif isinstance(element, marko.inline.RawText):
252
271
  _log.debug(f" - Paragraph (raw text): {element.children}")
253
- snippet_text = str(element.children).strip()
272
+ snippet_text = element.children.strip()
254
273
  # Detect start of the table:
255
274
  if "|" in snippet_text:
256
275
  # most likely part of the markdown table
257
276
  self.in_table = True
258
277
  if len(self.md_table_buffer) > 0:
259
- self.md_table_buffer[len(self.md_table_buffer) - 1] += str(
260
- snippet_text
261
- )
278
+ self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
262
279
  else:
263
280
  self.md_table_buffer.append(snippet_text)
264
281
  else:
265
- self.close_table(doc)
266
- self.in_table = False
282
+ self._close_table(doc)
267
283
  # most likely just inline text
268
284
  self.inline_texts.append(str(element.children))
269
285
 
270
286
  elif isinstance(element, marko.inline.CodeSpan):
271
- self.close_table(doc)
272
- self.process_inline_text(parent_element, doc)
287
+ self._close_table(doc)
288
+ self._process_inline_text(parent_item, doc)
273
289
  _log.debug(f" - Code Span: {element.children}")
274
290
  snippet_text = str(element.children).strip()
275
- doc.add_code(parent=parent_element, text=snippet_text)
276
-
277
- elif isinstance(element, marko.block.CodeBlock):
278
- self.close_table(doc)
279
- self.process_inline_text(parent_element, doc)
280
- _log.debug(f" - Code Block: {element.children}")
281
- snippet_text = str(element.children[0].children).strip() # type: ignore
282
- doc.add_code(parent=parent_element, text=snippet_text)
283
-
284
- elif isinstance(element, marko.block.FencedCode):
285
- self.close_table(doc)
286
- self.process_inline_text(parent_element, doc)
291
+ doc.add_code(parent=parent_item, text=snippet_text)
292
+
293
+ elif (
294
+ isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
295
+ and len(element.children) > 0
296
+ and isinstance((first_child := element.children[0]), marko.inline.RawText)
297
+ and len(snippet_text := (first_child.children.strip())) > 0
298
+ ):
299
+ self._close_table(doc)
300
+ self._process_inline_text(parent_item, doc)
287
301
  _log.debug(f" - Code Block: {element.children}")
288
- snippet_text = str(element.children[0].children).strip() # type: ignore
289
- doc.add_code(parent=parent_element, text=snippet_text)
302
+ doc.add_code(parent=parent_item, text=snippet_text)
290
303
 
291
304
  elif isinstance(element, marko.inline.LineBreak):
292
305
  if self.in_table:
@@ -294,29 +307,42 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
294
307
  self.md_table_buffer.append("")
295
308
 
296
309
  elif isinstance(element, marko.block.HTMLBlock):
297
- self.process_inline_text(parent_element, doc)
298
- self.close_table(doc)
310
+ self._html_blocks += 1
311
+ self._process_inline_text(parent_item, doc)
312
+ self._close_table(doc)
299
313
  _log.debug("HTML Block: {}".format(element))
300
314
  if (
301
- len(element.children) > 0
315
+ len(element.body) > 0
302
316
  ): # If Marko doesn't return any content for HTML block, skip it
303
- snippet_text = str(element.children).strip()
304
- doc.add_text(
305
- label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
306
- )
317
+ html_block = element.body.strip()
318
+
319
+ # wrap in markers to enable post-processing in convert()
320
+ text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
321
+ doc.add_code(parent=parent_item, text=text_to_add)
307
322
  else:
308
323
  if not isinstance(element, str):
309
- self.close_table(doc)
324
+ self._close_table(doc)
310
325
  _log.debug("Some other element: {}".format(element))
311
326
 
327
+ processed_block_types = (
328
+ marko.block.Heading,
329
+ marko.block.CodeBlock,
330
+ marko.block.FencedCode,
331
+ marko.inline.RawText,
332
+ )
333
+
312
334
  # Iterate through the element's children (if any)
313
- if not isinstance(element, marko.block.ListItem):
314
- if not isinstance(element, marko.block.Heading):
315
- if not isinstance(element, marko.block.FencedCode):
316
- # if not isinstance(element, marko.block.Paragraph):
317
- if hasattr(element, "children"):
318
- for child in element.children:
319
- self.iterate_elements(child, depth + 1, doc, parent_element)
335
+ if hasattr(element, "children") and not isinstance(
336
+ element, processed_block_types
337
+ ):
338
+ for child in element.children:
339
+ self._iterate_elements(
340
+ element=child,
341
+ depth=depth + 1,
342
+ doc=doc,
343
+ visited=visited,
344
+ parent_item=parent_item,
345
+ )
320
346
 
321
347
  def is_valid(self) -> bool:
322
348
  return self.valid
@@ -350,8 +376,51 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
350
376
  marko_parser = Markdown()
351
377
  parsed_ast = marko_parser.parse(self.markdown)
352
378
  # Start iterating from the root of the AST
353
- self.iterate_elements(parsed_ast, 0, doc, None)
354
- self.process_inline_text(None, doc) # handle last hanging inline text
379
+ self._iterate_elements(
380
+ element=parsed_ast,
381
+ depth=0,
382
+ doc=doc,
383
+ parent_item=None,
384
+ visited=set(),
385
+ )
386
+ self._process_inline_text(None, doc) # handle last hanging inline text
387
+ self._close_table(doc=doc) # handle any last hanging table
388
+
389
+ # if HTML blocks were detected, export to HTML and delegate to HTML backend
390
+ if self._html_blocks > 0:
391
+
392
+ # export to HTML
393
+ html_backend_cls = HTMLDocumentBackend
394
+ html_str = doc.export_to_html()
395
+
396
+ def _restore_original_html(txt, regex):
397
+ _txt, count = re.subn(regex, "", txt)
398
+ if count != self._html_blocks:
399
+ raise RuntimeError(
400
+ "An internal error has occurred during Markdown conversion."
401
+ )
402
+ return _txt
403
+
404
+ # restore original HTML by removing previously added markers
405
+ for regex in [
406
+ rf"<pre>\s*<code>\s*{_START_MARKER}",
407
+ rf"{_STOP_MARKER}\s*</code>\s*</pre>",
408
+ ]:
409
+ html_str = _restore_original_html(txt=html_str, regex=regex)
410
+ self._html_blocks = 0
411
+
412
+ # delegate to HTML backend
413
+ stream = BytesIO(bytes(html_str, encoding="utf-8"))
414
+ in_doc = InputDocument(
415
+ path_or_stream=stream,
416
+ format=InputFormat.HTML,
417
+ backend=html_backend_cls,
418
+ filename=self.file.name,
419
+ )
420
+ html_backend_obj = html_backend_cls(
421
+ in_doc=in_doc, path_or_stream=stream
422
+ )
423
+ doc = html_backend_obj.convert()
355
424
  else:
356
425
  raise RuntimeError(
357
426
  f"Cannot convert md with {self.document_hash} because the backend failed to init."
@@ -98,21 +98,28 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
98
98
 
99
99
  return doc
100
100
 
101
- def generate_prov(self, shape, slide_ind, text=""):
102
- left = shape.left
103
- top = shape.top
104
- width = shape.width
105
- height = shape.height
101
+ def generate_prov(
102
+ self, shape, slide_ind, text="", slide_size=Size(width=1, height=1)
103
+ ):
104
+ if shape.left:
105
+ left = shape.left
106
+ top = shape.top
107
+ width = shape.width
108
+ height = shape.height
109
+ else:
110
+ left = 0
111
+ top = 0
112
+ width = slide_size.width
113
+ height = slide_size.height
106
114
  shape_bbox = [left, top, left + width, top + height]
107
115
  shape_bbox = BoundingBox.from_tuple(shape_bbox, origin=CoordOrigin.BOTTOMLEFT)
108
- # prov = [{"bbox": shape_bbox, "page": parent_slide, "span": [0, len(text)]}]
109
116
  prov = ProvenanceItem(
110
117
  page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox
111
118
  )
112
119
 
113
120
  return prov
114
121
 
115
- def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
122
+ def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
116
123
  is_a_list = False
117
124
  is_list_group_created = False
118
125
  enum_list_item_value = 0
@@ -121,7 +128,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
121
128
  list_text = ""
122
129
  list_label = GroupLabel.LIST
123
130
  doc_label = DocItemLabel.LIST_ITEM
124
- prov = self.generate_prov(shape, slide_ind, shape.text.strip())
131
+ prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
125
132
 
126
133
  # Identify if shape contains lists
127
134
  for paragraph in shape.text_frame.paragraphs:
@@ -270,18 +277,17 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
270
277
  )
271
278
  return
272
279
 
273
- def handle_pictures(self, shape, parent_slide, slide_ind, doc):
274
- # Get the image bytes
275
- image = shape.image
276
- image_bytes = image.blob
277
- im_dpi, _ = image.dpi
278
-
280
+ def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size):
279
281
  # Open it with PIL
280
282
  try:
283
+ # Get the image bytes
284
+ image = shape.image
285
+ image_bytes = image.blob
286
+ im_dpi, _ = image.dpi
281
287
  pil_image = Image.open(BytesIO(image_bytes))
282
288
 
283
289
  # shape has picture
284
- prov = self.generate_prov(shape, slide_ind, "")
290
+ prov = self.generate_prov(shape, slide_ind, "", slide_size)
285
291
  doc.add_picture(
286
292
  parent=parent_slide,
287
293
  image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
@@ -292,13 +298,13 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
292
298
  _log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
293
299
  return
294
300
 
295
- def handle_tables(self, shape, parent_slide, slide_ind, doc):
301
+ def handle_tables(self, shape, parent_slide, slide_ind, doc, slide_size):
296
302
  # Handling tables, images, charts
297
303
  if shape.has_table:
298
304
  table = shape.table
299
305
  table_xml = shape._element
300
306
 
301
- prov = self.generate_prov(shape, slide_ind, "")
307
+ prov = self.generate_prov(shape, slide_ind, "", slide_size)
302
308
 
303
309
  num_cols = 0
304
310
  num_rows = len(table.rows)
@@ -375,17 +381,19 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
375
381
  name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=parents[0]
376
382
  )
377
383
 
378
- size = Size(width=slide_width, height=slide_height)
379
- parent_page = doc.add_page(page_no=slide_ind + 1, size=size)
384
+ slide_size = Size(width=slide_width, height=slide_height)
385
+ parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
380
386
 
381
- def handle_shapes(shape, parent_slide, slide_ind, doc):
382
- handle_groups(shape, parent_slide, slide_ind, doc)
387
+ def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
388
+ handle_groups(shape, parent_slide, slide_ind, doc, slide_size)
383
389
  if shape.has_table:
384
390
  # Handle Tables
385
- self.handle_tables(shape, parent_slide, slide_ind, doc)
391
+ self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
386
392
  if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
387
393
  # Handle Pictures
388
- self.handle_pictures(shape, parent_slide, slide_ind, doc)
394
+ self.handle_pictures(
395
+ shape, parent_slide, slide_ind, doc, slide_size
396
+ )
389
397
  # If shape doesn't have any text, move on to the next shape
390
398
  if not hasattr(shape, "text"):
391
399
  return
@@ -397,16 +405,20 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
397
405
  _log.warning("Warning: shape has text but not text_frame")
398
406
  return
399
407
  # Handle other text elements, including lists (bullet lists, numbered lists)
400
- self.handle_text_elements(shape, parent_slide, slide_ind, doc)
408
+ self.handle_text_elements(
409
+ shape, parent_slide, slide_ind, doc, slide_size
410
+ )
401
411
  return
402
412
 
403
- def handle_groups(shape, parent_slide, slide_ind, doc):
413
+ def handle_groups(shape, parent_slide, slide_ind, doc, slide_size):
404
414
  if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
405
415
  for groupedshape in shape.shapes:
406
- handle_shapes(groupedshape, parent_slide, slide_ind, doc)
416
+ handle_shapes(
417
+ groupedshape, parent_slide, slide_ind, doc, slide_size
418
+ )
407
419
 
408
420
  # Loop through each shape in the slide
409
421
  for shape in slide.shapes:
410
- handle_shapes(shape, parent_slide, slide_ind, doc)
422
+ handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)
411
423
 
412
424
  return doc