docling 2.16.0__py3-none-any.whl → 2.18.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,9 @@
1
1
  import logging
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import Set, Union
4
+ from typing import Optional, Set, Union
5
5
 
6
- from bs4 import BeautifulSoup
6
+ from bs4 import BeautifulSoup, Tag
7
7
  from docling_core.types.doc import (
8
8
  DocItemLabel,
9
9
  DoclingDocument,
@@ -24,7 +24,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
24
24
  def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
25
25
  super().__init__(in_doc, path_or_stream)
26
26
  _log.debug("About to init HTML backend...")
27
- self.soup = None
27
+ self.soup: Optional[Tag] = None
28
28
  # HTML file:
29
29
  self.path_or_stream = path_or_stream
30
30
  # Initialise the parents for the hierarchy
@@ -78,17 +78,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
78
78
 
79
79
  if self.is_valid():
80
80
  assert self.soup is not None
81
+ content = self.soup.body or self.soup
81
82
  # Replace <br> tags with newline characters
82
- for br in self.soup.body.find_all("br"):
83
+ for br in content.find_all("br"):
83
84
  br.replace_with("\n")
84
- doc = self.walk(self.soup.body, doc)
85
+ doc = self.walk(content, doc)
85
86
  else:
86
87
  raise RuntimeError(
87
88
  f"Cannot convert doc with {self.document_hash} because the backend failed to init."
88
89
  )
89
90
  return doc
90
91
 
91
- def walk(self, element, doc):
92
+ def walk(self, element: Tag, doc: DoclingDocument):
92
93
  try:
93
94
  # Iterate over elements in the body of the document
94
95
  for idx, element in enumerate(element.children):
@@ -105,7 +106,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
105
106
 
106
107
  return doc
107
108
 
108
- def analyse_element(self, element, idx, doc):
109
+ def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument):
109
110
  """
110
111
  if element.name!=None:
111
112
  _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
@@ -135,7 +136,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
135
136
  else:
136
137
  self.walk(element, doc)
137
138
 
138
- def get_direct_text(self, item):
139
+ def get_direct_text(self, item: Tag):
139
140
  """Get the direct text of the <li> element (ignoring nested lists)."""
140
141
  text = item.find(string=True, recursive=False)
141
142
  if isinstance(text, str):
@@ -144,7 +145,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
144
145
  return ""
145
146
 
146
147
  # Function to recursively extract text from all child nodes
147
- def extract_text_recursively(self, item):
148
+ def extract_text_recursively(self, item: Tag):
148
149
  result = []
149
150
 
150
151
  if isinstance(item, str):
@@ -165,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
165
166
 
166
167
  return "".join(result) + " "
167
168
 
168
- def handle_header(self, element, idx, doc):
169
+ def handle_header(self, element: Tag, idx: int, doc: DoclingDocument):
169
170
  """Handles header tags (h1, h2, etc.)."""
170
171
  hlevel = int(element.name.replace("h", ""))
171
172
  slevel = hlevel - 1
@@ -207,7 +208,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
207
208
  level=hlevel,
208
209
  )
209
210
 
210
- def handle_code(self, element, idx, doc):
211
+ def handle_code(self, element: Tag, idx: int, doc: DoclingDocument):
211
212
  """Handles monospace code snippets (pre)."""
212
213
  if element.text is None:
213
214
  return
@@ -215,9 +216,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
215
216
  label = DocItemLabel.CODE
216
217
  if len(text) == 0:
217
218
  return
218
- doc.add_code(parent=self.parents[self.level], label=label, text=text)
219
+ doc.add_code(parent=self.parents[self.level], text=text)
219
220
 
220
- def handle_paragraph(self, element, idx, doc):
221
+ def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument):
221
222
  """Handles paragraph tags (p)."""
222
223
  if element.text is None:
223
224
  return
@@ -227,7 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
227
228
  return
228
229
  doc.add_text(parent=self.parents[self.level], label=label, text=text)
229
230
 
230
- def handle_list(self, element, idx, doc):
231
+ def handle_list(self, element: Tag, idx: int, doc: DoclingDocument):
231
232
  """Handles list tags (ul, ol) and their list items."""
232
233
 
233
234
  if element.name == "ul":
@@ -249,7 +250,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
249
250
  self.parents[self.level + 1] = None
250
251
  self.level -= 1
251
252
 
252
- def handle_listitem(self, element, idx, doc):
253
+ def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument):
253
254
  """Handles listitem tags (li)."""
254
255
  nested_lists = element.find(["ul", "ol"])
255
256
 
@@ -303,7 +304,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
303
304
  else:
304
305
  _log.warn("list-item has no text: ", element)
305
306
 
306
- def handle_table(self, element, idx, doc):
307
+ def handle_table(self, element: Tag, idx: int, doc: DoclingDocument):
307
308
  """Handles table tags."""
308
309
 
309
310
  nested_tables = element.find("table")
@@ -376,7 +377,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
376
377
 
377
378
  doc.add_table(data=data, parent=self.parents[self.level])
378
379
 
379
- def get_list_text(self, list_element, level=0):
380
+ def get_list_text(self, list_element: Tag, level=0):
380
381
  """Recursively extract text from <ul> or <ol> with proper indentation."""
381
382
  result = []
382
383
  bullet_char = "*" # Default bullet character for unordered lists
@@ -402,7 +403,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
402
403
 
403
404
  return result
404
405
 
405
- def extract_table_cell_text(self, cell):
406
+ def extract_table_cell_text(self, cell: Tag):
406
407
  """Extract text from a table cell, including lists with indents."""
407
408
  contains_lists = cell.find(["ul", "ol"])
408
409
  if contains_lists is None:
@@ -413,7 +414,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
413
414
  )
414
415
  return cell.text
415
416
 
416
- def handle_figure(self, element, idx, doc):
417
+ def handle_figure(self, element: Tag, idx: int, doc: DoclingDocument):
417
418
  """Handles image tags (img)."""
418
419
 
419
420
  # Extract the image URI from the <img> tag
@@ -436,6 +437,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
436
437
  caption=fig_caption,
437
438
  )
438
439
 
439
- def handle_image(self, element, idx, doc):
440
+ def handle_image(self, element: Tag, idx, doc: DoclingDocument):
440
441
  """Handles image tags (img)."""
441
442
  doc.add_picture(parent=self.parents[self.level], caption=None)
@@ -6,6 +6,7 @@ from pathlib import Path
6
6
  from typing import List, Optional, Set, Union
7
7
 
8
8
  import marko
9
+ import marko.element
9
10
  import marko.ext
10
11
  import marko.ext.gfm
11
12
  import marko.inline
@@ -23,11 +24,16 @@ from docling_core.types.doc import (
23
24
  from marko import Markdown
24
25
 
25
26
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
27
+ from docling.backend.html_backend import HTMLDocumentBackend
26
28
  from docling.datamodel.base_models import InputFormat
27
29
  from docling.datamodel.document import InputDocument
28
30
 
29
31
  _log = logging.getLogger(__name__)
30
32
 
33
+ _MARKER_BODY = "DOCLING_DOC_MD_HTML_EXPORT"
34
+ _START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
35
+ _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
36
+
31
37
 
32
38
  class MarkdownDocumentBackend(DeclarativeDocumentBackend):
33
39
  def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
@@ -65,7 +71,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
65
71
 
66
72
  self.in_table = False
67
73
  self.md_table_buffer: list[str] = []
68
- self.inline_text_buffer = ""
74
+ self.inline_texts: list[str] = []
75
+ self._html_blocks: int = 0
69
76
 
70
77
  try:
71
78
  if isinstance(self.path_or_stream, BytesIO):
@@ -152,26 +159,25 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
152
159
  def process_inline_text(
153
160
  self, parent_element: Optional[NodeItem], doc: DoclingDocument
154
161
  ):
155
- # self.inline_text_buffer += str(text_in)
156
- txt = self.inline_text_buffer.strip()
162
+ txt = " ".join(self.inline_texts)
157
163
  if len(txt) > 0:
158
164
  doc.add_text(
159
165
  label=DocItemLabel.PARAGRAPH,
160
166
  parent=parent_element,
161
167
  text=txt,
162
168
  )
163
- self.inline_text_buffer = ""
169
+ self.inline_texts = []
164
170
 
165
171
  def iterate_elements(
166
172
  self,
167
- element: marko.block.Element,
173
+ element: marko.element.Element,
168
174
  depth: int,
169
175
  doc: DoclingDocument,
170
176
  parent_element: Optional[NodeItem] = None,
171
177
  ):
172
178
  # Iterates over all elements in the AST
173
179
  # Check for different element types and process relevant details
174
- if isinstance(element, marko.block.Heading):
180
+ if isinstance(element, marko.block.Heading) and len(element.children) > 0:
175
181
  self.close_table(doc)
176
182
  self.process_inline_text(parent_element, doc)
177
183
  _log.debug(
@@ -206,17 +212,22 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
206
212
  )
207
213
 
208
214
  elif isinstance(element, marko.block.List):
215
+ has_non_empty_list_items = False
216
+ for child in element.children:
217
+ if isinstance(child, marko.block.ListItem) and len(child.children) > 0:
218
+ has_non_empty_list_items = True
219
+ break
220
+
209
221
  self.close_table(doc)
210
222
  self.process_inline_text(parent_element, doc)
211
223
  _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
212
- list_label = GroupLabel.LIST
213
- if element.ordered:
214
- list_label = GroupLabel.ORDERED_LIST
215
- parent_element = doc.add_group(
216
- label=list_label, name=f"list", parent=parent_element
217
- )
224
+ if has_non_empty_list_items:
225
+ label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
226
+ parent_element = doc.add_group(
227
+ label=label, name=f"list", parent=parent_element
228
+ )
218
229
 
219
- elif isinstance(element, marko.block.ListItem):
230
+ elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
220
231
  self.close_table(doc)
221
232
  self.process_inline_text(parent_element, doc)
222
233
  _log.debug(" - List item")
@@ -246,29 +257,25 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
246
257
 
247
258
  doc.add_picture(parent=parent_element, caption=fig_caption)
248
259
 
249
- elif isinstance(element, marko.block.Paragraph):
260
+ elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
250
261
  self.process_inline_text(parent_element, doc)
251
262
 
252
263
  elif isinstance(element, marko.inline.RawText):
253
264
  _log.debug(f" - Paragraph (raw text): {element.children}")
254
- snippet_text = str(element.children).strip()
265
+ snippet_text = element.children.strip()
255
266
  # Detect start of the table:
256
267
  if "|" in snippet_text:
257
268
  # most likely part of the markdown table
258
269
  self.in_table = True
259
270
  if len(self.md_table_buffer) > 0:
260
- self.md_table_buffer[len(self.md_table_buffer) - 1] += str(
261
- snippet_text
262
- )
271
+ self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
263
272
  else:
264
273
  self.md_table_buffer.append(snippet_text)
265
274
  else:
266
275
  self.close_table(doc)
267
276
  self.in_table = False
268
277
  # most likely just inline text
269
- self.inline_text_buffer += str(
270
- element.children
271
- ) # do not strip an inline text, as it may contain important spaces
278
+ self.inline_texts.append(str(element.children))
272
279
 
273
280
  elif isinstance(element, marko.inline.CodeSpan):
274
281
  self.close_table(doc)
@@ -277,50 +284,55 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
277
284
  snippet_text = str(element.children).strip()
278
285
  doc.add_code(parent=parent_element, text=snippet_text)
279
286
 
280
- elif isinstance(element, marko.block.CodeBlock):
281
- self.close_table(doc)
282
- self.process_inline_text(parent_element, doc)
283
- _log.debug(f" - Code Block: {element.children}")
284
- snippet_text = str(element.children[0].children).strip() # type: ignore
285
- doc.add_code(parent=parent_element, text=snippet_text)
286
-
287
- elif isinstance(element, marko.block.FencedCode):
287
+ elif (
288
+ isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
289
+ and len(element.children) > 0
290
+ and isinstance((first_child := element.children[0]), marko.inline.RawText)
291
+ and len(snippet_text := (first_child.children.strip())) > 0
292
+ ):
288
293
  self.close_table(doc)
289
294
  self.process_inline_text(parent_element, doc)
290
295
  _log.debug(f" - Code Block: {element.children}")
291
- snippet_text = str(element.children[0].children).strip() # type: ignore
292
296
  doc.add_code(parent=parent_element, text=snippet_text)
293
297
 
294
298
  elif isinstance(element, marko.inline.LineBreak):
295
- self.process_inline_text(parent_element, doc)
296
299
  if self.in_table:
297
300
  _log.debug("Line break in a table")
298
301
  self.md_table_buffer.append("")
299
302
 
300
303
  elif isinstance(element, marko.block.HTMLBlock):
304
+ self._html_blocks += 1
301
305
  self.process_inline_text(parent_element, doc)
302
306
  self.close_table(doc)
303
307
  _log.debug("HTML Block: {}".format(element))
304
308
  if (
305
- len(element.children) > 0
309
+ len(element.body) > 0
306
310
  ): # If Marko doesn't return any content for HTML block, skip it
307
- snippet_text = str(element.children).strip()
308
- doc.add_text(
309
- label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
310
- )
311
+ html_block = element.body.strip()
312
+
313
+ # wrap in markers to enable post-processing in convert()
314
+ text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
315
+ doc.add_code(parent=parent_element, text=text_to_add)
311
316
  else:
312
317
  if not isinstance(element, str):
313
318
  self.close_table(doc)
314
319
  _log.debug("Some other element: {}".format(element))
315
320
 
321
+ processed_block_types = (
322
+ marko.block.ListItem,
323
+ marko.block.Heading,
324
+ marko.block.CodeBlock,
325
+ marko.block.FencedCode,
326
+ # marko.block.Paragraph,
327
+ marko.inline.RawText,
328
+ )
329
+
316
330
  # Iterate through the element's children (if any)
317
- if not isinstance(element, marko.block.ListItem):
318
- if not isinstance(element, marko.block.Heading):
319
- if not isinstance(element, marko.block.FencedCode):
320
- # if not isinstance(element, marko.block.Paragraph):
321
- if hasattr(element, "children"):
322
- for child in element.children:
323
- self.iterate_elements(child, depth + 1, doc, parent_element)
331
+ if hasattr(element, "children") and not isinstance(
332
+ element, processed_block_types
333
+ ):
334
+ for child in element.children:
335
+ self.iterate_elements(child, depth + 1, doc, parent_element)
324
336
 
325
337
  def is_valid(self) -> bool:
326
338
  return self.valid
@@ -356,6 +368,43 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
356
368
  # Start iterating from the root of the AST
357
369
  self.iterate_elements(parsed_ast, 0, doc, None)
358
370
  self.process_inline_text(None, doc) # handle last hanging inline text
371
+ self.close_table(doc=doc) # handle any last hanging table
372
+
373
+ # if HTML blocks were detected, export to HTML and delegate to HTML backend
374
+ if self._html_blocks > 0:
375
+
376
+ # export to HTML
377
+ html_backend_cls = HTMLDocumentBackend
378
+ html_str = doc.export_to_html()
379
+
380
+ def _restore_original_html(txt, regex):
381
+ _txt, count = re.subn(regex, "", txt)
382
+ if count != self._html_blocks:
383
+ raise RuntimeError(
384
+ "An internal error has occurred during Markdown conversion."
385
+ )
386
+ return _txt
387
+
388
+ # restore original HTML by removing previously added markers
389
+ for regex in [
390
+ rf"<pre>\s*<code>\s*{_START_MARKER}",
391
+ rf"{_STOP_MARKER}\s*</code>\s*</pre>",
392
+ ]:
393
+ html_str = _restore_original_html(txt=html_str, regex=regex)
394
+ self._html_blocks = 0
395
+
396
+ # delegate to HTML backend
397
+ stream = BytesIO(bytes(html_str, encoding="utf-8"))
398
+ in_doc = InputDocument(
399
+ path_or_stream=stream,
400
+ format=InputFormat.HTML,
401
+ backend=html_backend_cls,
402
+ filename=self.file.name,
403
+ )
404
+ html_backend_obj = html_backend_cls(
405
+ in_doc=in_doc, path_or_stream=stream
406
+ )
407
+ doc = html_backend_obj.convert()
359
408
  else:
360
409
  raise RuntimeError(
361
410
  f"Cannot convert md with {self.document_hash} because the backend failed to init."
@@ -98,21 +98,28 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
98
98
 
99
99
  return doc
100
100
 
101
- def generate_prov(self, shape, slide_ind, text=""):
102
- left = shape.left
103
- top = shape.top
104
- width = shape.width
105
- height = shape.height
101
+ def generate_prov(
102
+ self, shape, slide_ind, text="", slide_size=Size(width=1, height=1)
103
+ ):
104
+ if shape.left:
105
+ left = shape.left
106
+ top = shape.top
107
+ width = shape.width
108
+ height = shape.height
109
+ else:
110
+ left = 0
111
+ top = 0
112
+ width = slide_size.width
113
+ height = slide_size.height
106
114
  shape_bbox = [left, top, left + width, top + height]
107
115
  shape_bbox = BoundingBox.from_tuple(shape_bbox, origin=CoordOrigin.BOTTOMLEFT)
108
- # prov = [{"bbox": shape_bbox, "page": parent_slide, "span": [0, len(text)]}]
109
116
  prov = ProvenanceItem(
110
117
  page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox
111
118
  )
112
119
 
113
120
  return prov
114
121
 
115
- def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
122
+ def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
116
123
  is_a_list = False
117
124
  is_list_group_created = False
118
125
  enum_list_item_value = 0
@@ -121,7 +128,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
121
128
  list_text = ""
122
129
  list_label = GroupLabel.LIST
123
130
  doc_label = DocItemLabel.LIST_ITEM
124
- prov = self.generate_prov(shape, slide_ind, shape.text.strip())
131
+ prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
125
132
 
126
133
  # Identify if shape contains lists
127
134
  for paragraph in shape.text_frame.paragraphs:
@@ -270,18 +277,17 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
270
277
  )
271
278
  return
272
279
 
273
- def handle_pictures(self, shape, parent_slide, slide_ind, doc):
274
- # Get the image bytes
275
- image = shape.image
276
- image_bytes = image.blob
277
- im_dpi, _ = image.dpi
278
-
280
+ def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size):
279
281
  # Open it with PIL
280
282
  try:
283
+ # Get the image bytes
284
+ image = shape.image
285
+ image_bytes = image.blob
286
+ im_dpi, _ = image.dpi
281
287
  pil_image = Image.open(BytesIO(image_bytes))
282
288
 
283
289
  # shape has picture
284
- prov = self.generate_prov(shape, slide_ind, "")
290
+ prov = self.generate_prov(shape, slide_ind, "", slide_size)
285
291
  doc.add_picture(
286
292
  parent=parent_slide,
287
293
  image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
@@ -292,13 +298,13 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
292
298
  _log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
293
299
  return
294
300
 
295
- def handle_tables(self, shape, parent_slide, slide_ind, doc):
301
+ def handle_tables(self, shape, parent_slide, slide_ind, doc, slide_size):
296
302
  # Handling tables, images, charts
297
303
  if shape.has_table:
298
304
  table = shape.table
299
305
  table_xml = shape._element
300
306
 
301
- prov = self.generate_prov(shape, slide_ind, "")
307
+ prov = self.generate_prov(shape, slide_ind, "", slide_size)
302
308
 
303
309
  num_cols = 0
304
310
  num_rows = len(table.rows)
@@ -375,17 +381,19 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
375
381
  name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=parents[0]
376
382
  )
377
383
 
378
- size = Size(width=slide_width, height=slide_height)
379
- parent_page = doc.add_page(page_no=slide_ind + 1, size=size)
384
+ slide_size = Size(width=slide_width, height=slide_height)
385
+ parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
380
386
 
381
- def handle_shapes(shape, parent_slide, slide_ind, doc):
382
- handle_groups(shape, parent_slide, slide_ind, doc)
387
+ def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
388
+ handle_groups(shape, parent_slide, slide_ind, doc, slide_size)
383
389
  if shape.has_table:
384
390
  # Handle Tables
385
- self.handle_tables(shape, parent_slide, slide_ind, doc)
391
+ self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
386
392
  if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
387
393
  # Handle Pictures
388
- self.handle_pictures(shape, parent_slide, slide_ind, doc)
394
+ self.handle_pictures(
395
+ shape, parent_slide, slide_ind, doc, slide_size
396
+ )
389
397
  # If shape doesn't have any text, move on to the next shape
390
398
  if not hasattr(shape, "text"):
391
399
  return
@@ -397,16 +405,20 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
397
405
  _log.warning("Warning: shape has text but not text_frame")
398
406
  return
399
407
  # Handle other text elements, including lists (bullet lists, numbered lists)
400
- self.handle_text_elements(shape, parent_slide, slide_ind, doc)
408
+ self.handle_text_elements(
409
+ shape, parent_slide, slide_ind, doc, slide_size
410
+ )
401
411
  return
402
412
 
403
- def handle_groups(shape, parent_slide, slide_ind, doc):
413
+ def handle_groups(shape, parent_slide, slide_ind, doc, slide_size):
404
414
  if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
405
415
  for groupedshape in shape.shapes:
406
- handle_shapes(groupedshape, parent_slide, slide_ind, doc)
416
+ handle_shapes(
417
+ groupedshape, parent_slide, slide_ind, doc, slide_size
418
+ )
407
419
 
408
420
  # Loop through each shape in the slide
409
421
  for shape in slide.shapes:
410
- handle_shapes(shape, parent_slide, slide_ind, doc)
422
+ handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)
411
423
 
412
424
  return doc