docling 2.36.1__py3-none-any.whl → 2.38.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +39 -18
- docling/backend/docling_parse_backend.py +61 -59
- docling/backend/docling_parse_v2_backend.py +72 -62
- docling/backend/docling_parse_v4_backend.py +21 -19
- docling/backend/md_backend.py +101 -81
- docling/backend/mspowerpoint_backend.py +72 -113
- docling/backend/msword_backend.py +99 -80
- docling/backend/noop_backend.py +51 -0
- docling/backend/pypdfium2_backend.py +127 -53
- docling/cli/main.py +82 -14
- docling/datamodel/asr_model_specs.py +92 -0
- docling/datamodel/base_models.py +21 -4
- docling/datamodel/document.py +3 -1
- docling/datamodel/pipeline_options.py +15 -2
- docling/datamodel/pipeline_options_asr_model.py +57 -0
- docling/datamodel/pipeline_options_vlm_model.py +4 -4
- docling/document_converter.py +8 -0
- docling/models/api_vlm_model.py +3 -1
- docling/models/base_model.py +1 -1
- docling/models/base_ocr_model.py +33 -11
- docling/models/easyocr_model.py +1 -1
- docling/models/layout_model.py +2 -3
- docling/models/ocr_mac_model.py +1 -1
- docling/models/page_preprocessing_model.py +3 -6
- docling/models/rapid_ocr_model.py +1 -1
- docling/models/readingorder_model.py +3 -3
- docling/models/tesseract_ocr_cli_model.py +4 -3
- docling/models/tesseract_ocr_model.py +1 -1
- docling/models/vlm_models_inline/hf_transformers_model.py +4 -1
- docling/models/vlm_models_inline/mlx_model.py +3 -1
- docling/pipeline/asr_pipeline.py +253 -0
- docling/pipeline/base_pipeline.py +11 -0
- docling/pipeline/standard_pdf_pipeline.py +0 -1
- docling/utils/layout_postprocessor.py +11 -6
- {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/METADATA +7 -4
- {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/RECORD +40 -36
- {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/WHEEL +0 -0
- {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/entry_points.txt +0 -0
- {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/top_level.txt +0 -0
docling/backend/md_backend.py
CHANGED
@@ -1,17 +1,15 @@
|
|
1
1
|
import logging
|
2
2
|
import re
|
3
3
|
import warnings
|
4
|
+
from copy import deepcopy
|
4
5
|
from io import BytesIO
|
5
6
|
from pathlib import Path
|
6
7
|
from typing import List, Optional, Set, Union
|
7
8
|
|
8
9
|
import marko
|
9
10
|
import marko.element
|
10
|
-
import marko.ext
|
11
|
-
import marko.ext.gfm
|
12
11
|
import marko.inline
|
13
12
|
from docling_core.types.doc import (
|
14
|
-
DocItem,
|
15
13
|
DocItemLabel,
|
16
14
|
DoclingDocument,
|
17
15
|
DocumentOrigin,
|
@@ -21,7 +19,9 @@ from docling_core.types.doc import (
|
|
21
19
|
TableData,
|
22
20
|
TextItem,
|
23
21
|
)
|
22
|
+
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
|
24
23
|
from marko import Markdown
|
24
|
+
from pydantic import AnyUrl, TypeAdapter
|
25
25
|
|
26
26
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
27
27
|
from docling.backend.html_backend import HTMLDocumentBackend
|
@@ -71,7 +71,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
71
71
|
|
72
72
|
self.in_table = False
|
73
73
|
self.md_table_buffer: list[str] = []
|
74
|
-
self.inline_texts: list[str] = []
|
75
74
|
self._html_blocks: int = 0
|
76
75
|
|
77
76
|
try:
|
@@ -156,25 +155,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
156
155
|
doc.add_table(data=table_data)
|
157
156
|
return
|
158
157
|
|
159
|
-
def _process_inline_text(
|
160
|
-
self, parent_item: Optional[NodeItem], doc: DoclingDocument
|
161
|
-
):
|
162
|
-
txt = " ".join(self.inline_texts)
|
163
|
-
if len(txt) > 0:
|
164
|
-
doc.add_text(
|
165
|
-
label=DocItemLabel.PARAGRAPH,
|
166
|
-
parent=parent_item,
|
167
|
-
text=txt,
|
168
|
-
)
|
169
|
-
self.inline_texts = []
|
170
|
-
|
171
158
|
def _iterate_elements( # noqa: C901
|
172
159
|
self,
|
160
|
+
*,
|
173
161
|
element: marko.element.Element,
|
174
162
|
depth: int,
|
175
163
|
doc: DoclingDocument,
|
176
164
|
visited: Set[marko.element.Element],
|
177
165
|
parent_item: Optional[NodeItem] = None,
|
166
|
+
formatting: Optional[Formatting] = None,
|
167
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
178
168
|
):
|
179
169
|
if element in visited:
|
180
170
|
return
|
@@ -183,44 +173,32 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
183
173
|
# Check for different element types and process relevant details
|
184
174
|
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
|
185
175
|
self._close_table(doc)
|
186
|
-
self._process_inline_text(parent_item, doc)
|
187
176
|
_log.debug(
|
188
177
|
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
|
189
178
|
)
|
179
|
+
|
180
|
+
if len(element.children) == 1:
|
181
|
+
child = element.children[0]
|
182
|
+
snippet_text = str(child.children) # type: ignore
|
183
|
+
visited.add(child)
|
184
|
+
else:
|
185
|
+
snippet_text = "" # inline group will be created
|
186
|
+
|
190
187
|
if element.level == 1:
|
191
|
-
|
188
|
+
parent_item = doc.add_title(
|
189
|
+
text=snippet_text,
|
190
|
+
parent=parent_item,
|
191
|
+
formatting=formatting,
|
192
|
+
hyperlink=hyperlink,
|
193
|
+
)
|
192
194
|
else:
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
def traverse(node: marko.block.BlockElement):
|
201
|
-
# Check if the node has a "children" attribute
|
202
|
-
if hasattr(node, "children"):
|
203
|
-
# If "children" is a list, continue traversal
|
204
|
-
if isinstance(node.children, list):
|
205
|
-
for child in node.children:
|
206
|
-
traverse(child)
|
207
|
-
# If "children" is text, add it to header text
|
208
|
-
elif isinstance(node.children, str):
|
209
|
-
strings.append(node.children)
|
210
|
-
|
211
|
-
traverse(element)
|
212
|
-
snippet_text = "".join(strings)
|
213
|
-
if len(snippet_text) > 0:
|
214
|
-
if doc_label == DocItemLabel.SECTION_HEADER:
|
215
|
-
parent_item = doc.add_heading(
|
216
|
-
text=snippet_text,
|
217
|
-
level=element.level - 1,
|
218
|
-
parent=parent_item,
|
219
|
-
)
|
220
|
-
else:
|
221
|
-
parent_item = doc.add_text(
|
222
|
-
label=doc_label, parent=parent_item, text=snippet_text
|
223
|
-
)
|
195
|
+
parent_item = doc.add_heading(
|
196
|
+
text=snippet_text,
|
197
|
+
level=element.level - 1,
|
198
|
+
parent=parent_item,
|
199
|
+
formatting=formatting,
|
200
|
+
hyperlink=hyperlink,
|
201
|
+
)
|
224
202
|
|
225
203
|
elif isinstance(element, marko.block.List):
|
226
204
|
has_non_empty_list_items = False
|
@@ -230,7 +208,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
230
208
|
break
|
231
209
|
|
232
210
|
self._close_table(doc)
|
233
|
-
self._process_inline_text(parent_item, doc)
|
234
211
|
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
235
212
|
if has_non_empty_list_items:
|
236
213
|
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
|
@@ -240,41 +217,60 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
240
217
|
|
241
218
|
elif (
|
242
219
|
isinstance(element, marko.block.ListItem)
|
243
|
-
and len(element.children)
|
244
|
-
and isinstance((
|
220
|
+
and len(element.children) == 1
|
221
|
+
and isinstance((child := element.children[0]), marko.block.Paragraph)
|
222
|
+
and len(child.children) > 0
|
245
223
|
):
|
246
224
|
self._close_table(doc)
|
247
|
-
self._process_inline_text(parent_item, doc)
|
248
225
|
_log.debug(" - List item")
|
249
226
|
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
):
|
257
|
-
|
258
|
-
|
259
|
-
|
227
|
+
if len(child.children) == 1:
|
228
|
+
snippet_text = str(child.children[0].children) # type: ignore
|
229
|
+
visited.add(child)
|
230
|
+
else:
|
231
|
+
snippet_text = "" # inline group will be created
|
232
|
+
is_numbered = isinstance(parent_item, OrderedList)
|
233
|
+
if not isinstance(parent_item, (OrderedList, UnorderedList)):
|
234
|
+
_log.warning("ListItem would have not had a list parent, adding one.")
|
235
|
+
parent_item = doc.add_unordered_list(parent=parent_item)
|
236
|
+
parent_item = doc.add_list_item(
|
237
|
+
enumerated=is_numbered,
|
238
|
+
parent=parent_item,
|
239
|
+
text=snippet_text,
|
240
|
+
formatting=formatting,
|
241
|
+
hyperlink=hyperlink,
|
260
242
|
)
|
261
|
-
visited.add(first_child)
|
262
243
|
|
263
244
|
elif isinstance(element, marko.inline.Image):
|
264
245
|
self._close_table(doc)
|
265
|
-
self._process_inline_text(parent_item, doc)
|
266
246
|
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
267
247
|
|
268
248
|
fig_caption: Optional[TextItem] = None
|
269
249
|
if element.title is not None and element.title != "":
|
270
250
|
fig_caption = doc.add_text(
|
271
|
-
label=DocItemLabel.CAPTION,
|
251
|
+
label=DocItemLabel.CAPTION,
|
252
|
+
text=element.title,
|
253
|
+
formatting=formatting,
|
254
|
+
hyperlink=hyperlink,
|
272
255
|
)
|
273
256
|
|
274
257
|
doc.add_picture(parent=parent_item, caption=fig_caption)
|
275
258
|
|
276
|
-
elif isinstance(element, marko.
|
277
|
-
|
259
|
+
elif isinstance(element, marko.inline.Emphasis):
|
260
|
+
_log.debug(f" - Emphasis: {element.children}")
|
261
|
+
formatting = deepcopy(formatting) if formatting else Formatting()
|
262
|
+
formatting.italic = True
|
263
|
+
|
264
|
+
elif isinstance(element, marko.inline.StrongEmphasis):
|
265
|
+
_log.debug(f" - StrongEmphasis: {element.children}")
|
266
|
+
formatting = deepcopy(formatting) if formatting else Formatting()
|
267
|
+
formatting.bold = True
|
268
|
+
|
269
|
+
elif isinstance(element, marko.inline.Link):
|
270
|
+
_log.debug(f" - Link: {element.children}")
|
271
|
+
hyperlink = TypeAdapter(Optional[Union[AnyUrl, Path]]).validate_python(
|
272
|
+
element.dest
|
273
|
+
)
|
278
274
|
|
279
275
|
elif isinstance(element, marko.inline.RawText):
|
280
276
|
_log.debug(f" - Paragraph (raw text): {element.children}")
|
@@ -287,28 +283,41 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
287
283
|
self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
|
288
284
|
else:
|
289
285
|
self.md_table_buffer.append(snippet_text)
|
290
|
-
|
286
|
+
elif snippet_text:
|
291
287
|
self._close_table(doc)
|
292
|
-
|
293
|
-
|
288
|
+
doc.add_text(
|
289
|
+
label=DocItemLabel.TEXT,
|
290
|
+
parent=parent_item,
|
291
|
+
text=snippet_text,
|
292
|
+
formatting=formatting,
|
293
|
+
hyperlink=hyperlink,
|
294
|
+
)
|
294
295
|
|
295
296
|
elif isinstance(element, marko.inline.CodeSpan):
|
296
297
|
self._close_table(doc)
|
297
|
-
self._process_inline_text(parent_item, doc)
|
298
298
|
_log.debug(f" - Code Span: {element.children}")
|
299
299
|
snippet_text = str(element.children).strip()
|
300
|
-
doc.add_code(
|
300
|
+
doc.add_code(
|
301
|
+
parent=parent_item,
|
302
|
+
text=snippet_text,
|
303
|
+
formatting=formatting,
|
304
|
+
hyperlink=hyperlink,
|
305
|
+
)
|
301
306
|
|
302
307
|
elif (
|
303
308
|
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
|
304
309
|
and len(element.children) > 0
|
305
|
-
and isinstance((
|
306
|
-
and len(snippet_text := (
|
310
|
+
and isinstance((child := element.children[0]), marko.inline.RawText)
|
311
|
+
and len(snippet_text := (child.children.strip())) > 0
|
307
312
|
):
|
308
313
|
self._close_table(doc)
|
309
|
-
self._process_inline_text(parent_item, doc)
|
310
314
|
_log.debug(f" - Code Block: {element.children}")
|
311
|
-
doc.add_code(
|
315
|
+
doc.add_code(
|
316
|
+
parent=parent_item,
|
317
|
+
text=snippet_text,
|
318
|
+
formatting=formatting,
|
319
|
+
hyperlink=hyperlink,
|
320
|
+
)
|
312
321
|
|
313
322
|
elif isinstance(element, marko.inline.LineBreak):
|
314
323
|
if self.in_table:
|
@@ -317,7 +326,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
317
326
|
|
318
327
|
elif isinstance(element, marko.block.HTMLBlock):
|
319
328
|
self._html_blocks += 1
|
320
|
-
self._process_inline_text(parent_item, doc)
|
321
329
|
self._close_table(doc)
|
322
330
|
_log.debug(f"HTML Block: {element}")
|
323
331
|
if (
|
@@ -327,14 +335,25 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
327
335
|
|
328
336
|
# wrap in markers to enable post-processing in convert()
|
329
337
|
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
|
330
|
-
doc.add_code(
|
338
|
+
doc.add_code(
|
339
|
+
parent=parent_item,
|
340
|
+
text=text_to_add,
|
341
|
+
formatting=formatting,
|
342
|
+
hyperlink=hyperlink,
|
343
|
+
)
|
331
344
|
else:
|
332
345
|
if not isinstance(element, str):
|
333
346
|
self._close_table(doc)
|
334
347
|
_log.debug(f"Some other element: {element}")
|
335
348
|
|
349
|
+
if (
|
350
|
+
isinstance(element, (marko.block.Paragraph, marko.block.Heading))
|
351
|
+
and len(element.children) > 1
|
352
|
+
):
|
353
|
+
parent_item = doc.add_inline_group(parent=parent_item)
|
354
|
+
|
336
355
|
processed_block_types = (
|
337
|
-
marko.block.Heading,
|
356
|
+
# marko.block.Heading,
|
338
357
|
marko.block.CodeBlock,
|
339
358
|
marko.block.FencedCode,
|
340
359
|
marko.inline.RawText,
|
@@ -351,6 +370,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
351
370
|
doc=doc,
|
352
371
|
visited=visited,
|
353
372
|
parent_item=parent_item,
|
373
|
+
formatting=formatting,
|
374
|
+
hyperlink=hyperlink,
|
354
375
|
)
|
355
376
|
|
356
377
|
def is_valid(self) -> bool:
|
@@ -392,7 +413,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
392
413
|
parent_item=None,
|
393
414
|
visited=set(),
|
394
415
|
)
|
395
|
-
self._process_inline_text(None, doc) # handle last hanging inline text
|
396
416
|
self._close_table(doc=doc) # handle any last hanging table
|
397
417
|
|
398
418
|
# if HTML blocks were detected, export to HTML and delegate to HTML backend
|
docling/backend/mspowerpoint_backend.py
CHANGED
@@ -20,6 +20,7 @@ from docling_core.types.doc.document import ContentLayer
|
|
20
20
|
from PIL import Image, UnidentifiedImageError
|
21
21
|
from pptx import Presentation
|
22
22
|
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
23
|
+
from pptx.oxml.text import CT_TextLineBreak
|
23
24
|
|
24
25
|
from docling.backend.abstract_backend import (
|
25
26
|
DeclarativeDocumentBackend,
|
@@ -120,136 +121,91 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
120
121
|
|
121
122
|
return prov
|
122
123
|
|
123
|
-
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
|
124
|
-
is_a_list = False
|
124
|
+
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
|
125
125
|
is_list_group_created = False
|
126
126
|
enum_list_item_value = 0
|
127
127
|
new_list = None
|
128
|
-
bullet_type = "None"
|
129
|
-
list_label = GroupLabel.LIST
|
130
128
|
doc_label = DocItemLabel.LIST_ITEM
|
131
129
|
prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
|
132
130
|
|
133
|
-
|
134
|
-
|
135
|
-
# Check if paragraph is a bullet point using the `element` XML
|
131
|
+
def is_list_item(paragraph):
|
132
|
+
"""Check if the paragraph is a list item."""
|
136
133
|
p = paragraph._element
|
137
134
|
if (
|
138
135
|
p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
|
139
136
|
is not None
|
140
137
|
):
|
141
|
-
|
142
|
-
is_a_list = True
|
138
|
+
return (True, "Bullet")
|
143
139
|
elif (
|
144
140
|
p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
|
145
141
|
is not None
|
146
142
|
):
|
147
|
-
|
148
|
-
|
149
|
-
else:
|
150
|
-
is_a_list = False
|
151
|
-
|
152
|
-
if paragraph.level > 0:
|
143
|
+
return (True, "Numbered")
|
144
|
+
elif paragraph.level > 0:
|
153
145
|
# Most likely a sub-list
|
154
|
-
|
155
|
-
|
156
|
-
if is_a_list:
|
157
|
-
# Determine if this is an unordered list or an ordered list.
|
158
|
-
# Set GroupLabel.ORDERED_LIST when it fits.
|
159
|
-
if bullet_type == "Numbered":
|
160
|
-
list_label = GroupLabel.ORDERED_LIST
|
161
|
-
|
162
|
-
if is_a_list:
|
163
|
-
_log.debug("LIST DETECTED!")
|
146
|
+
return (True, "None")
|
164
147
|
else:
|
165
|
-
|
166
|
-
|
167
|
-
# If there is a list inside of the shape, create a new docling list to assign list items to
|
168
|
-
# if is_a_list:
|
169
|
-
# new_list = doc.add_group(
|
170
|
-
# label=list_label, name=f"list", parent=parent_slide
|
171
|
-
# )
|
148
|
+
return (False, "None")
|
172
149
|
|
173
150
|
# Iterate through paragraphs to build up text
|
174
151
|
for paragraph in shape.text_frame.paragraphs:
|
175
|
-
|
152
|
+
is_a_list, bullet_type = is_list_item(paragraph)
|
176
153
|
p = paragraph._element
|
177
|
-
enum_list_item_value += 1
|
178
|
-
inline_paragraph_text = ""
|
179
|
-
inline_list_item_text = ""
|
180
|
-
|
181
|
-
for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
|
182
|
-
if len(e.text.strip()) > 0:
|
183
|
-
e_is_a_list_item = False
|
184
|
-
is_numbered = False
|
185
|
-
if (
|
186
|
-
p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
|
187
|
-
is not None
|
188
|
-
):
|
189
|
-
bullet_type = "Bullet"
|
190
|
-
e_is_a_list_item = True
|
191
|
-
elif (
|
192
|
-
p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
|
193
|
-
is not None
|
194
|
-
):
|
195
|
-
bullet_type = "Numbered"
|
196
|
-
is_numbered = True
|
197
|
-
e_is_a_list_item = True
|
198
|
-
else:
|
199
|
-
e_is_a_list_item = False
|
200
|
-
|
201
|
-
if e_is_a_list_item:
|
202
|
-
if len(inline_paragraph_text) > 0:
|
203
|
-
# output accumulated inline text:
|
204
|
-
doc.add_text(
|
205
|
-
label=doc_label,
|
206
|
-
parent=parent_slide,
|
207
|
-
text=inline_paragraph_text,
|
208
|
-
prov=prov,
|
209
|
-
)
|
210
|
-
# Set marker and enumerated arguments if this is an enumeration element.
|
211
|
-
inline_list_item_text += e.text
|
212
|
-
# print(e.text)
|
213
|
-
else:
|
214
|
-
# Assign proper label to the text, depending if it's a Title or Section Header
|
215
|
-
# For other types of text, assign - PARAGRAPH
|
216
|
-
doc_label = DocItemLabel.PARAGRAPH
|
217
|
-
if shape.is_placeholder:
|
218
|
-
placeholder_type = shape.placeholder_format.type
|
219
|
-
if placeholder_type in [
|
220
|
-
PP_PLACEHOLDER.CENTER_TITLE,
|
221
|
-
PP_PLACEHOLDER.TITLE,
|
222
|
-
]:
|
223
|
-
# It's a title
|
224
|
-
doc_label = DocItemLabel.TITLE
|
225
|
-
elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
|
226
|
-
DocItemLabel.SECTION_HEADER
|
227
|
-
enum_list_item_value = 0
|
228
|
-
inline_paragraph_text += e.text
|
229
|
-
|
230
|
-
if len(inline_paragraph_text) > 0:
|
231
|
-
# output accumulated inline text:
|
232
|
-
doc.add_text(
|
233
|
-
label=doc_label,
|
234
|
-
parent=parent_slide,
|
235
|
-
text=inline_paragraph_text,
|
236
|
-
prov=prov,
|
237
|
-
)
|
238
154
|
|
239
|
-
|
155
|
+
# Convert line breaks to spaces and accumulate text
|
156
|
+
p_text = ""
|
157
|
+
for e in p.content_children:
|
158
|
+
if isinstance(e, CT_TextLineBreak):
|
159
|
+
p_text += " "
|
160
|
+
else:
|
161
|
+
p_text += e.text
|
162
|
+
|
163
|
+
if is_a_list:
|
240
164
|
enum_marker = ""
|
241
|
-
|
242
|
-
|
165
|
+
enumerated = bullet_type == "Numbered"
|
166
|
+
|
243
167
|
if not is_list_group_created:
|
244
168
|
new_list = doc.add_group(
|
245
|
-
label=
|
169
|
+
label=GroupLabel.ORDERED_LIST
|
170
|
+
if enumerated
|
171
|
+
else GroupLabel.LIST,
|
172
|
+
name="list",
|
173
|
+
parent=parent_slide,
|
246
174
|
)
|
247
175
|
is_list_group_created = True
|
176
|
+
enum_list_item_value = 0
|
177
|
+
|
178
|
+
if enumerated:
|
179
|
+
enum_list_item_value += 1
|
180
|
+
enum_marker = str(enum_list_item_value) + "."
|
181
|
+
|
248
182
|
doc.add_list_item(
|
249
183
|
marker=enum_marker,
|
250
|
-
enumerated=
|
184
|
+
enumerated=enumerated,
|
251
185
|
parent=new_list,
|
252
|
-
text=
|
186
|
+
text=p_text,
|
187
|
+
prov=prov,
|
188
|
+
)
|
189
|
+
else: # is paragraph not a list item
|
190
|
+
# Assign proper label to the text, depending if it's a Title or Section Header
|
191
|
+
# For other types of text, assign - PARAGRAPH
|
192
|
+
doc_label = DocItemLabel.PARAGRAPH
|
193
|
+
if shape.is_placeholder:
|
194
|
+
placeholder_type = shape.placeholder_format.type
|
195
|
+
if placeholder_type in [
|
196
|
+
PP_PLACEHOLDER.CENTER_TITLE,
|
197
|
+
PP_PLACEHOLDER.TITLE,
|
198
|
+
]:
|
199
|
+
# It's a title
|
200
|
+
doc_label = DocItemLabel.TITLE
|
201
|
+
elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
|
202
|
+
DocItemLabel.SECTION_HEADER
|
203
|
+
|
204
|
+
# output accumulated inline text:
|
205
|
+
doc.add_text(
|
206
|
+
label=doc_label,
|
207
|
+
parent=parent_slide,
|
208
|
+
text=p_text,
|
253
209
|
prov=prov,
|
254
210
|
)
|
255
211
|
return
|
@@ -423,18 +379,21 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
423
379
|
# Handle notes slide
|
424
380
|
if slide.has_notes_slide:
|
425
381
|
notes_slide = slide.notes_slide
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
382
|
+
if notes_slide.notes_text_frame is not None:
|
383
|
+
notes_text = notes_slide.notes_text_frame.text.strip()
|
384
|
+
if notes_text:
|
385
|
+
bbox = BoundingBox(l=0, t=0, r=0, b=0)
|
386
|
+
prov = ProvenanceItem(
|
387
|
+
page_no=slide_ind + 1,
|
388
|
+
charspan=[0, len(notes_text)],
|
389
|
+
bbox=bbox,
|
390
|
+
)
|
391
|
+
doc.add_text(
|
392
|
+
label=DocItemLabel.TEXT,
|
393
|
+
parent=parent_slide,
|
394
|
+
text=notes_text,
|
395
|
+
prov=prov,
|
396
|
+
content_layer=ContentLayer.FURNITURE,
|
397
|
+
)
|
439
398
|
|
440
399
|
return doc
|