docling 2.36.1__py3-none-any.whl → 2.38.0__py3-none-any.whl

This diff compares the contents of two publicly available versions of a package, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (40):
  1. docling/backend/asciidoc_backend.py +39 -18
  2. docling/backend/docling_parse_backend.py +61 -59
  3. docling/backend/docling_parse_v2_backend.py +72 -62
  4. docling/backend/docling_parse_v4_backend.py +21 -19
  5. docling/backend/md_backend.py +101 -81
  6. docling/backend/mspowerpoint_backend.py +72 -113
  7. docling/backend/msword_backend.py +99 -80
  8. docling/backend/noop_backend.py +51 -0
  9. docling/backend/pypdfium2_backend.py +127 -53
  10. docling/cli/main.py +82 -14
  11. docling/datamodel/asr_model_specs.py +92 -0
  12. docling/datamodel/base_models.py +21 -4
  13. docling/datamodel/document.py +3 -1
  14. docling/datamodel/pipeline_options.py +15 -2
  15. docling/datamodel/pipeline_options_asr_model.py +57 -0
  16. docling/datamodel/pipeline_options_vlm_model.py +4 -4
  17. docling/document_converter.py +8 -0
  18. docling/models/api_vlm_model.py +3 -1
  19. docling/models/base_model.py +1 -1
  20. docling/models/base_ocr_model.py +33 -11
  21. docling/models/easyocr_model.py +1 -1
  22. docling/models/layout_model.py +2 -3
  23. docling/models/ocr_mac_model.py +1 -1
  24. docling/models/page_preprocessing_model.py +3 -6
  25. docling/models/rapid_ocr_model.py +1 -1
  26. docling/models/readingorder_model.py +3 -3
  27. docling/models/tesseract_ocr_cli_model.py +4 -3
  28. docling/models/tesseract_ocr_model.py +1 -1
  29. docling/models/vlm_models_inline/hf_transformers_model.py +4 -1
  30. docling/models/vlm_models_inline/mlx_model.py +3 -1
  31. docling/pipeline/asr_pipeline.py +253 -0
  32. docling/pipeline/base_pipeline.py +11 -0
  33. docling/pipeline/standard_pdf_pipeline.py +0 -1
  34. docling/utils/layout_postprocessor.py +11 -6
  35. {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/METADATA +7 -4
  36. {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/RECORD +40 -36
  37. {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/WHEEL +0 -0
  38. {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/entry_points.txt +0 -0
  39. {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/licenses/LICENSE +0 -0
  40. {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/top_level.txt +0 -0
@@ -1,17 +1,15 @@
1
1
  import logging
2
2
  import re
3
3
  import warnings
4
+ from copy import deepcopy
4
5
  from io import BytesIO
5
6
  from pathlib import Path
6
7
  from typing import List, Optional, Set, Union
7
8
 
8
9
  import marko
9
10
  import marko.element
10
- import marko.ext
11
- import marko.ext.gfm
12
11
  import marko.inline
13
12
  from docling_core.types.doc import (
14
- DocItem,
15
13
  DocItemLabel,
16
14
  DoclingDocument,
17
15
  DocumentOrigin,
@@ -21,7 +19,9 @@ from docling_core.types.doc import (
21
19
  TableData,
22
20
  TextItem,
23
21
  )
22
+ from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
24
23
  from marko import Markdown
24
+ from pydantic import AnyUrl, TypeAdapter
25
25
 
26
26
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
27
27
  from docling.backend.html_backend import HTMLDocumentBackend
@@ -71,7 +71,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
71
71
 
72
72
  self.in_table = False
73
73
  self.md_table_buffer: list[str] = []
74
- self.inline_texts: list[str] = []
75
74
  self._html_blocks: int = 0
76
75
 
77
76
  try:
@@ -156,25 +155,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
156
155
  doc.add_table(data=table_data)
157
156
  return
158
157
 
159
- def _process_inline_text(
160
- self, parent_item: Optional[NodeItem], doc: DoclingDocument
161
- ):
162
- txt = " ".join(self.inline_texts)
163
- if len(txt) > 0:
164
- doc.add_text(
165
- label=DocItemLabel.PARAGRAPH,
166
- parent=parent_item,
167
- text=txt,
168
- )
169
- self.inline_texts = []
170
-
171
158
  def _iterate_elements( # noqa: C901
172
159
  self,
160
+ *,
173
161
  element: marko.element.Element,
174
162
  depth: int,
175
163
  doc: DoclingDocument,
176
164
  visited: Set[marko.element.Element],
177
165
  parent_item: Optional[NodeItem] = None,
166
+ formatting: Optional[Formatting] = None,
167
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
178
168
  ):
179
169
  if element in visited:
180
170
  return
@@ -183,44 +173,32 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
183
173
  # Check for different element types and process relevant details
184
174
  if isinstance(element, marko.block.Heading) and len(element.children) > 0:
185
175
  self._close_table(doc)
186
- self._process_inline_text(parent_item, doc)
187
176
  _log.debug(
188
177
  f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
189
178
  )
179
+
180
+ if len(element.children) == 1:
181
+ child = element.children[0]
182
+ snippet_text = str(child.children) # type: ignore
183
+ visited.add(child)
184
+ else:
185
+ snippet_text = "" # inline group will be created
186
+
190
187
  if element.level == 1:
191
- doc_label = DocItemLabel.TITLE
188
+ parent_item = doc.add_title(
189
+ text=snippet_text,
190
+ parent=parent_item,
191
+ formatting=formatting,
192
+ hyperlink=hyperlink,
193
+ )
192
194
  else:
193
- doc_label = DocItemLabel.SECTION_HEADER
194
-
195
- # Header could have arbitrary inclusion of bold, italic or emphasis,
196
- # hence we need to traverse the tree to get full text of a header
197
- strings: List[str] = []
198
-
199
- # Define a recursive function to traverse the tree
200
- def traverse(node: marko.block.BlockElement):
201
- # Check if the node has a "children" attribute
202
- if hasattr(node, "children"):
203
- # If "children" is a list, continue traversal
204
- if isinstance(node.children, list):
205
- for child in node.children:
206
- traverse(child)
207
- # If "children" is text, add it to header text
208
- elif isinstance(node.children, str):
209
- strings.append(node.children)
210
-
211
- traverse(element)
212
- snippet_text = "".join(strings)
213
- if len(snippet_text) > 0:
214
- if doc_label == DocItemLabel.SECTION_HEADER:
215
- parent_item = doc.add_heading(
216
- text=snippet_text,
217
- level=element.level - 1,
218
- parent=parent_item,
219
- )
220
- else:
221
- parent_item = doc.add_text(
222
- label=doc_label, parent=parent_item, text=snippet_text
223
- )
195
+ parent_item = doc.add_heading(
196
+ text=snippet_text,
197
+ level=element.level - 1,
198
+ parent=parent_item,
199
+ formatting=formatting,
200
+ hyperlink=hyperlink,
201
+ )
224
202
 
225
203
  elif isinstance(element, marko.block.List):
226
204
  has_non_empty_list_items = False
@@ -230,7 +208,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
230
208
  break
231
209
 
232
210
  self._close_table(doc)
233
- self._process_inline_text(parent_item, doc)
234
211
  _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
235
212
  if has_non_empty_list_items:
236
213
  label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
@@ -240,41 +217,60 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
240
217
 
241
218
  elif (
242
219
  isinstance(element, marko.block.ListItem)
243
- and len(element.children) > 0
244
- and isinstance((first_child := element.children[0]), marko.block.Paragraph)
220
+ and len(element.children) == 1
221
+ and isinstance((child := element.children[0]), marko.block.Paragraph)
222
+ and len(child.children) > 0
245
223
  ):
246
224
  self._close_table(doc)
247
- self._process_inline_text(parent_item, doc)
248
225
  _log.debug(" - List item")
249
226
 
250
- snippet_text = str(first_child.children[0].children) # type: ignore
251
- is_numbered = False
252
- if (
253
- parent_item is not None
254
- and isinstance(parent_item, DocItem)
255
- and parent_item.label == GroupLabel.ORDERED_LIST
256
- ):
257
- is_numbered = True
258
- doc.add_list_item(
259
- enumerated=is_numbered, parent=parent_item, text=snippet_text
227
+ if len(child.children) == 1:
228
+ snippet_text = str(child.children[0].children) # type: ignore
229
+ visited.add(child)
230
+ else:
231
+ snippet_text = "" # inline group will be created
232
+ is_numbered = isinstance(parent_item, OrderedList)
233
+ if not isinstance(parent_item, (OrderedList, UnorderedList)):
234
+ _log.warning("ListItem would have not had a list parent, adding one.")
235
+ parent_item = doc.add_unordered_list(parent=parent_item)
236
+ parent_item = doc.add_list_item(
237
+ enumerated=is_numbered,
238
+ parent=parent_item,
239
+ text=snippet_text,
240
+ formatting=formatting,
241
+ hyperlink=hyperlink,
260
242
  )
261
- visited.add(first_child)
262
243
 
263
244
  elif isinstance(element, marko.inline.Image):
264
245
  self._close_table(doc)
265
- self._process_inline_text(parent_item, doc)
266
246
  _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
267
247
 
268
248
  fig_caption: Optional[TextItem] = None
269
249
  if element.title is not None and element.title != "":
270
250
  fig_caption = doc.add_text(
271
- label=DocItemLabel.CAPTION, text=element.title
251
+ label=DocItemLabel.CAPTION,
252
+ text=element.title,
253
+ formatting=formatting,
254
+ hyperlink=hyperlink,
272
255
  )
273
256
 
274
257
  doc.add_picture(parent=parent_item, caption=fig_caption)
275
258
 
276
- elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
277
- self._process_inline_text(parent_item, doc)
259
+ elif isinstance(element, marko.inline.Emphasis):
260
+ _log.debug(f" - Emphasis: {element.children}")
261
+ formatting = deepcopy(formatting) if formatting else Formatting()
262
+ formatting.italic = True
263
+
264
+ elif isinstance(element, marko.inline.StrongEmphasis):
265
+ _log.debug(f" - StrongEmphasis: {element.children}")
266
+ formatting = deepcopy(formatting) if formatting else Formatting()
267
+ formatting.bold = True
268
+
269
+ elif isinstance(element, marko.inline.Link):
270
+ _log.debug(f" - Link: {element.children}")
271
+ hyperlink = TypeAdapter(Optional[Union[AnyUrl, Path]]).validate_python(
272
+ element.dest
273
+ )
278
274
 
279
275
  elif isinstance(element, marko.inline.RawText):
280
276
  _log.debug(f" - Paragraph (raw text): {element.children}")
@@ -287,28 +283,41 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
287
283
  self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
288
284
  else:
289
285
  self.md_table_buffer.append(snippet_text)
290
- else:
286
+ elif snippet_text:
291
287
  self._close_table(doc)
292
- # most likely just inline text
293
- self.inline_texts.append(str(element.children))
288
+ doc.add_text(
289
+ label=DocItemLabel.TEXT,
290
+ parent=parent_item,
291
+ text=snippet_text,
292
+ formatting=formatting,
293
+ hyperlink=hyperlink,
294
+ )
294
295
 
295
296
  elif isinstance(element, marko.inline.CodeSpan):
296
297
  self._close_table(doc)
297
- self._process_inline_text(parent_item, doc)
298
298
  _log.debug(f" - Code Span: {element.children}")
299
299
  snippet_text = str(element.children).strip()
300
- doc.add_code(parent=parent_item, text=snippet_text)
300
+ doc.add_code(
301
+ parent=parent_item,
302
+ text=snippet_text,
303
+ formatting=formatting,
304
+ hyperlink=hyperlink,
305
+ )
301
306
 
302
307
  elif (
303
308
  isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
304
309
  and len(element.children) > 0
305
- and isinstance((first_child := element.children[0]), marko.inline.RawText)
306
- and len(snippet_text := (first_child.children.strip())) > 0
310
+ and isinstance((child := element.children[0]), marko.inline.RawText)
311
+ and len(snippet_text := (child.children.strip())) > 0
307
312
  ):
308
313
  self._close_table(doc)
309
- self._process_inline_text(parent_item, doc)
310
314
  _log.debug(f" - Code Block: {element.children}")
311
- doc.add_code(parent=parent_item, text=snippet_text)
315
+ doc.add_code(
316
+ parent=parent_item,
317
+ text=snippet_text,
318
+ formatting=formatting,
319
+ hyperlink=hyperlink,
320
+ )
312
321
 
313
322
  elif isinstance(element, marko.inline.LineBreak):
314
323
  if self.in_table:
@@ -317,7 +326,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
317
326
 
318
327
  elif isinstance(element, marko.block.HTMLBlock):
319
328
  self._html_blocks += 1
320
- self._process_inline_text(parent_item, doc)
321
329
  self._close_table(doc)
322
330
  _log.debug(f"HTML Block: {element}")
323
331
  if (
@@ -327,14 +335,25 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
327
335
 
328
336
  # wrap in markers to enable post-processing in convert()
329
337
  text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
330
- doc.add_code(parent=parent_item, text=text_to_add)
338
+ doc.add_code(
339
+ parent=parent_item,
340
+ text=text_to_add,
341
+ formatting=formatting,
342
+ hyperlink=hyperlink,
343
+ )
331
344
  else:
332
345
  if not isinstance(element, str):
333
346
  self._close_table(doc)
334
347
  _log.debug(f"Some other element: {element}")
335
348
 
349
+ if (
350
+ isinstance(element, (marko.block.Paragraph, marko.block.Heading))
351
+ and len(element.children) > 1
352
+ ):
353
+ parent_item = doc.add_inline_group(parent=parent_item)
354
+
336
355
  processed_block_types = (
337
- marko.block.Heading,
356
+ # marko.block.Heading,
338
357
  marko.block.CodeBlock,
339
358
  marko.block.FencedCode,
340
359
  marko.inline.RawText,
@@ -351,6 +370,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
351
370
  doc=doc,
352
371
  visited=visited,
353
372
  parent_item=parent_item,
373
+ formatting=formatting,
374
+ hyperlink=hyperlink,
354
375
  )
355
376
 
356
377
  def is_valid(self) -> bool:
@@ -392,7 +413,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
392
413
  parent_item=None,
393
414
  visited=set(),
394
415
  )
395
- self._process_inline_text(None, doc) # handle last hanging inline text
396
416
  self._close_table(doc=doc) # handle any last hanging table
397
417
 
398
418
  # if HTML blocks were detected, export to HTML and delegate to HTML backend
@@ -20,6 +20,7 @@ from docling_core.types.doc.document import ContentLayer
20
20
  from PIL import Image, UnidentifiedImageError
21
21
  from pptx import Presentation
22
22
  from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
23
+ from pptx.oxml.text import CT_TextLineBreak
23
24
 
24
25
  from docling.backend.abstract_backend import (
25
26
  DeclarativeDocumentBackend,
@@ -120,136 +121,91 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
120
121
 
121
122
  return prov
122
123
 
123
- def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size): # noqa: C901
124
- is_a_list = False
124
+ def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
125
125
  is_list_group_created = False
126
126
  enum_list_item_value = 0
127
127
  new_list = None
128
- bullet_type = "None"
129
- list_label = GroupLabel.LIST
130
128
  doc_label = DocItemLabel.LIST_ITEM
131
129
  prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
132
130
 
133
- # Identify if shape contains lists
134
- for paragraph in shape.text_frame.paragraphs:
135
- # Check if paragraph is a bullet point using the `element` XML
131
+ def is_list_item(paragraph):
132
+ """Check if the paragraph is a list item."""
136
133
  p = paragraph._element
137
134
  if (
138
135
  p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
139
136
  is not None
140
137
  ):
141
- bullet_type = "Bullet"
142
- is_a_list = True
138
+ return (True, "Bullet")
143
139
  elif (
144
140
  p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
145
141
  is not None
146
142
  ):
147
- bullet_type = "Numbered"
148
- is_a_list = True
149
- else:
150
- is_a_list = False
151
-
152
- if paragraph.level > 0:
143
+ return (True, "Numbered")
144
+ elif paragraph.level > 0:
153
145
  # Most likely a sub-list
154
- is_a_list = True
155
-
156
- if is_a_list:
157
- # Determine if this is an unordered list or an ordered list.
158
- # Set GroupLabel.ORDERED_LIST when it fits.
159
- if bullet_type == "Numbered":
160
- list_label = GroupLabel.ORDERED_LIST
161
-
162
- if is_a_list:
163
- _log.debug("LIST DETECTED!")
146
+ return (True, "None")
164
147
  else:
165
- _log.debug("No List")
166
-
167
- # If there is a list inside of the shape, create a new docling list to assign list items to
168
- # if is_a_list:
169
- # new_list = doc.add_group(
170
- # label=list_label, name=f"list", parent=parent_slide
171
- # )
148
+ return (False, "None")
172
149
 
173
150
  # Iterate through paragraphs to build up text
174
151
  for paragraph in shape.text_frame.paragraphs:
175
- # p_text = paragraph.text.strip()
152
+ is_a_list, bullet_type = is_list_item(paragraph)
176
153
  p = paragraph._element
177
- enum_list_item_value += 1
178
- inline_paragraph_text = ""
179
- inline_list_item_text = ""
180
-
181
- for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
182
- if len(e.text.strip()) > 0:
183
- e_is_a_list_item = False
184
- is_numbered = False
185
- if (
186
- p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
187
- is not None
188
- ):
189
- bullet_type = "Bullet"
190
- e_is_a_list_item = True
191
- elif (
192
- p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
193
- is not None
194
- ):
195
- bullet_type = "Numbered"
196
- is_numbered = True
197
- e_is_a_list_item = True
198
- else:
199
- e_is_a_list_item = False
200
-
201
- if e_is_a_list_item:
202
- if len(inline_paragraph_text) > 0:
203
- # output accumulated inline text:
204
- doc.add_text(
205
- label=doc_label,
206
- parent=parent_slide,
207
- text=inline_paragraph_text,
208
- prov=prov,
209
- )
210
- # Set marker and enumerated arguments if this is an enumeration element.
211
- inline_list_item_text += e.text
212
- # print(e.text)
213
- else:
214
- # Assign proper label to the text, depending if it's a Title or Section Header
215
- # For other types of text, assign - PARAGRAPH
216
- doc_label = DocItemLabel.PARAGRAPH
217
- if shape.is_placeholder:
218
- placeholder_type = shape.placeholder_format.type
219
- if placeholder_type in [
220
- PP_PLACEHOLDER.CENTER_TITLE,
221
- PP_PLACEHOLDER.TITLE,
222
- ]:
223
- # It's a title
224
- doc_label = DocItemLabel.TITLE
225
- elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
226
- DocItemLabel.SECTION_HEADER
227
- enum_list_item_value = 0
228
- inline_paragraph_text += e.text
229
-
230
- if len(inline_paragraph_text) > 0:
231
- # output accumulated inline text:
232
- doc.add_text(
233
- label=doc_label,
234
- parent=parent_slide,
235
- text=inline_paragraph_text,
236
- prov=prov,
237
- )
238
154
 
239
- if len(inline_list_item_text) > 0:
155
+ # Convert line breaks to spaces and accumulate text
156
+ p_text = ""
157
+ for e in p.content_children:
158
+ if isinstance(e, CT_TextLineBreak):
159
+ p_text += " "
160
+ else:
161
+ p_text += e.text
162
+
163
+ if is_a_list:
240
164
  enum_marker = ""
241
- if is_numbered:
242
- enum_marker = str(enum_list_item_value) + "."
165
+ enumerated = bullet_type == "Numbered"
166
+
243
167
  if not is_list_group_created:
244
168
  new_list = doc.add_group(
245
- label=list_label, name="list", parent=parent_slide
169
+ label=GroupLabel.ORDERED_LIST
170
+ if enumerated
171
+ else GroupLabel.LIST,
172
+ name="list",
173
+ parent=parent_slide,
246
174
  )
247
175
  is_list_group_created = True
176
+ enum_list_item_value = 0
177
+
178
+ if enumerated:
179
+ enum_list_item_value += 1
180
+ enum_marker = str(enum_list_item_value) + "."
181
+
248
182
  doc.add_list_item(
249
183
  marker=enum_marker,
250
- enumerated=is_numbered,
184
+ enumerated=enumerated,
251
185
  parent=new_list,
252
- text=inline_list_item_text,
186
+ text=p_text,
187
+ prov=prov,
188
+ )
189
+ else: # is paragraph not a list item
190
+ # Assign proper label to the text, depending if it's a Title or Section Header
191
+ # For other types of text, assign - PARAGRAPH
192
+ doc_label = DocItemLabel.PARAGRAPH
193
+ if shape.is_placeholder:
194
+ placeholder_type = shape.placeholder_format.type
195
+ if placeholder_type in [
196
+ PP_PLACEHOLDER.CENTER_TITLE,
197
+ PP_PLACEHOLDER.TITLE,
198
+ ]:
199
+ # It's a title
200
+ doc_label = DocItemLabel.TITLE
201
+ elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
202
+ DocItemLabel.SECTION_HEADER
203
+
204
+ # output accumulated inline text:
205
+ doc.add_text(
206
+ label=doc_label,
207
+ parent=parent_slide,
208
+ text=p_text,
253
209
  prov=prov,
254
210
  )
255
211
  return
@@ -423,18 +379,21 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
423
379
  # Handle notes slide
424
380
  if slide.has_notes_slide:
425
381
  notes_slide = slide.notes_slide
426
- notes_text = notes_slide.notes_text_frame.text.strip()
427
- if notes_text:
428
- bbox = BoundingBox(l=0, t=0, r=0, b=0)
429
- prov = ProvenanceItem(
430
- page_no=slide_ind + 1, charspan=[0, len(notes_text)], bbox=bbox
431
- )
432
- doc.add_text(
433
- label=DocItemLabel.TEXT,
434
- parent=parent_slide,
435
- text=notes_text,
436
- prov=prov,
437
- content_layer=ContentLayer.FURNITURE,
438
- )
382
+ if notes_slide.notes_text_frame is not None:
383
+ notes_text = notes_slide.notes_text_frame.text.strip()
384
+ if notes_text:
385
+ bbox = BoundingBox(l=0, t=0, r=0, b=0)
386
+ prov = ProvenanceItem(
387
+ page_no=slide_ind + 1,
388
+ charspan=[0, len(notes_text)],
389
+ bbox=bbox,
390
+ )
391
+ doc.add_text(
392
+ label=DocItemLabel.TEXT,
393
+ parent=parent_slide,
394
+ text=notes_text,
395
+ prov=prov,
396
+ content_layer=ContentLayer.FURNITURE,
397
+ )
439
398
 
440
399
  return doc