docling 2.17.0__py3-none-any.whl → 2.18.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/html_backend.py +18 -18
- docling/backend/md_backend.py +88 -35
- docling/backend/mspowerpoint_backend.py +39 -27
- docling/backend/msword_backend.py +172 -130
- docling/datamodel/document.py +2 -0
- docling/datamodel/settings.py +16 -1
- docling/document_converter.py +12 -2
- docling/models/table_structure_model.py +9 -5
- docling/pipeline/base_pipeline.py +3 -1
- docling/utils/glm_utils.py +4 -0
- {docling-2.17.0.dist-info → docling-2.18.0.dist-info}/METADATA +8 -3
- {docling-2.17.0.dist-info → docling-2.18.0.dist-info}/RECORD +15 -15
- {docling-2.17.0.dist-info → docling-2.18.0.dist-info}/WHEEL +1 -1
- {docling-2.17.0.dist-info → docling-2.18.0.dist-info}/LICENSE +0 -0
- {docling-2.17.0.dist-info → docling-2.18.0.dist-info}/entry_points.txt +0 -0
docling/backend/html_backend.py
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
import logging
|
2
2
|
from io import BytesIO
|
3
3
|
from pathlib import Path
|
4
|
-
from typing import Set, Union
|
4
|
+
from typing import Optional, Set, Union
|
5
5
|
|
6
|
-
from bs4 import BeautifulSoup
|
6
|
+
from bs4 import BeautifulSoup, Tag
|
7
7
|
from docling_core.types.doc import (
|
8
8
|
DocItemLabel,
|
9
9
|
DoclingDocument,
|
@@ -24,7 +24,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
24
24
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
25
25
|
super().__init__(in_doc, path_or_stream)
|
26
26
|
_log.debug("About to init HTML backend...")
|
27
|
-
self.soup = None
|
27
|
+
self.soup: Optional[Tag] = None
|
28
28
|
# HTML file:
|
29
29
|
self.path_or_stream = path_or_stream
|
30
30
|
# Initialise the parents for the hierarchy
|
@@ -89,7 +89,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
89
89
|
)
|
90
90
|
return doc
|
91
91
|
|
92
|
-
def walk(self, element, doc):
|
92
|
+
def walk(self, element: Tag, doc: DoclingDocument):
|
93
93
|
try:
|
94
94
|
# Iterate over elements in the body of the document
|
95
95
|
for idx, element in enumerate(element.children):
|
@@ -106,7 +106,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
106
106
|
|
107
107
|
return doc
|
108
108
|
|
109
|
-
def analyse_element(self, element, idx, doc):
|
109
|
+
def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument):
|
110
110
|
"""
|
111
111
|
if element.name!=None:
|
112
112
|
_log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
|
@@ -136,7 +136,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
136
136
|
else:
|
137
137
|
self.walk(element, doc)
|
138
138
|
|
139
|
-
def get_direct_text(self, item):
|
139
|
+
def get_direct_text(self, item: Tag):
|
140
140
|
"""Get the direct text of the <li> element (ignoring nested lists)."""
|
141
141
|
text = item.find(string=True, recursive=False)
|
142
142
|
if isinstance(text, str):
|
@@ -145,7 +145,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
145
145
|
return ""
|
146
146
|
|
147
147
|
# Function to recursively extract text from all child nodes
|
148
|
-
def extract_text_recursively(self, item):
|
148
|
+
def extract_text_recursively(self, item: Tag):
|
149
149
|
result = []
|
150
150
|
|
151
151
|
if isinstance(item, str):
|
@@ -166,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
166
166
|
|
167
167
|
return "".join(result) + " "
|
168
168
|
|
169
|
-
def handle_header(self, element, idx, doc):
|
169
|
+
def handle_header(self, element: Tag, idx: int, doc: DoclingDocument):
|
170
170
|
"""Handles header tags (h1, h2, etc.)."""
|
171
171
|
hlevel = int(element.name.replace("h", ""))
|
172
172
|
slevel = hlevel - 1
|
@@ -208,7 +208,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
208
208
|
level=hlevel,
|
209
209
|
)
|
210
210
|
|
211
|
-
def handle_code(self, element, idx, doc):
|
211
|
+
def handle_code(self, element: Tag, idx: int, doc: DoclingDocument):
|
212
212
|
"""Handles monospace code snippets (pre)."""
|
213
213
|
if element.text is None:
|
214
214
|
return
|
@@ -216,9 +216,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
216
216
|
label = DocItemLabel.CODE
|
217
217
|
if len(text) == 0:
|
218
218
|
return
|
219
|
-
doc.add_code(parent=self.parents[self.level],
|
219
|
+
doc.add_code(parent=self.parents[self.level], text=text)
|
220
220
|
|
221
|
-
def handle_paragraph(self, element, idx, doc):
|
221
|
+
def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument):
|
222
222
|
"""Handles paragraph tags (p)."""
|
223
223
|
if element.text is None:
|
224
224
|
return
|
@@ -228,7 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
228
228
|
return
|
229
229
|
doc.add_text(parent=self.parents[self.level], label=label, text=text)
|
230
230
|
|
231
|
-
def handle_list(self, element, idx, doc):
|
231
|
+
def handle_list(self, element: Tag, idx: int, doc: DoclingDocument):
|
232
232
|
"""Handles list tags (ul, ol) and their list items."""
|
233
233
|
|
234
234
|
if element.name == "ul":
|
@@ -250,7 +250,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
250
250
|
self.parents[self.level + 1] = None
|
251
251
|
self.level -= 1
|
252
252
|
|
253
|
-
def handle_listitem(self, element, idx, doc):
|
253
|
+
def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument):
|
254
254
|
"""Handles listitem tags (li)."""
|
255
255
|
nested_lists = element.find(["ul", "ol"])
|
256
256
|
|
@@ -304,7 +304,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
304
304
|
else:
|
305
305
|
_log.warn("list-item has no text: ", element)
|
306
306
|
|
307
|
-
def handle_table(self, element, idx, doc):
|
307
|
+
def handle_table(self, element: Tag, idx: int, doc: DoclingDocument):
|
308
308
|
"""Handles table tags."""
|
309
309
|
|
310
310
|
nested_tables = element.find("table")
|
@@ -377,7 +377,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
377
377
|
|
378
378
|
doc.add_table(data=data, parent=self.parents[self.level])
|
379
379
|
|
380
|
-
def get_list_text(self, list_element, level=0):
|
380
|
+
def get_list_text(self, list_element: Tag, level=0):
|
381
381
|
"""Recursively extract text from <ul> or <ol> with proper indentation."""
|
382
382
|
result = []
|
383
383
|
bullet_char = "*" # Default bullet character for unordered lists
|
@@ -403,7 +403,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
403
403
|
|
404
404
|
return result
|
405
405
|
|
406
|
-
def extract_table_cell_text(self, cell):
|
406
|
+
def extract_table_cell_text(self, cell: Tag):
|
407
407
|
"""Extract text from a table cell, including lists with indents."""
|
408
408
|
contains_lists = cell.find(["ul", "ol"])
|
409
409
|
if contains_lists is None:
|
@@ -414,7 +414,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
414
414
|
)
|
415
415
|
return cell.text
|
416
416
|
|
417
|
-
def handle_figure(self, element, idx, doc):
|
417
|
+
def handle_figure(self, element: Tag, idx: int, doc: DoclingDocument):
|
418
418
|
"""Handles image tags (img)."""
|
419
419
|
|
420
420
|
# Extract the image URI from the <img> tag
|
@@ -437,6 +437,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
437
437
|
caption=fig_caption,
|
438
438
|
)
|
439
439
|
|
440
|
-
def handle_image(self, element, idx, doc):
|
440
|
+
def handle_image(self, element: Tag, idx, doc: DoclingDocument):
|
441
441
|
"""Handles image tags (img)."""
|
442
442
|
doc.add_picture(parent=self.parents[self.level], caption=None)
|
docling/backend/md_backend.py
CHANGED
@@ -6,6 +6,7 @@ from pathlib import Path
|
|
6
6
|
from typing import List, Optional, Set, Union
|
7
7
|
|
8
8
|
import marko
|
9
|
+
import marko.element
|
9
10
|
import marko.ext
|
10
11
|
import marko.ext.gfm
|
11
12
|
import marko.inline
|
@@ -23,11 +24,16 @@ from docling_core.types.doc import (
|
|
23
24
|
from marko import Markdown
|
24
25
|
|
25
26
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
27
|
+
from docling.backend.html_backend import HTMLDocumentBackend
|
26
28
|
from docling.datamodel.base_models import InputFormat
|
27
29
|
from docling.datamodel.document import InputDocument
|
28
30
|
|
29
31
|
_log = logging.getLogger(__name__)
|
30
32
|
|
33
|
+
_MARKER_BODY = "DOCLING_DOC_MD_HTML_EXPORT"
|
34
|
+
_START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
|
35
|
+
_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
|
36
|
+
|
31
37
|
|
32
38
|
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
33
39
|
def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
|
@@ -66,6 +72,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
66
72
|
self.in_table = False
|
67
73
|
self.md_table_buffer: list[str] = []
|
68
74
|
self.inline_texts: list[str] = []
|
75
|
+
self._html_blocks: int = 0
|
69
76
|
|
70
77
|
try:
|
71
78
|
if isinstance(self.path_or_stream, BytesIO):
|
@@ -163,14 +170,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
163
170
|
|
164
171
|
def iterate_elements(
|
165
172
|
self,
|
166
|
-
element: marko.
|
173
|
+
element: marko.element.Element,
|
167
174
|
depth: int,
|
168
175
|
doc: DoclingDocument,
|
169
176
|
parent_element: Optional[NodeItem] = None,
|
170
177
|
):
|
171
178
|
# Iterates over all elements in the AST
|
172
179
|
# Check for different element types and process relevant details
|
173
|
-
if isinstance(element, marko.block.Heading):
|
180
|
+
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
|
174
181
|
self.close_table(doc)
|
175
182
|
self.process_inline_text(parent_element, doc)
|
176
183
|
_log.debug(
|
@@ -205,17 +212,22 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
205
212
|
)
|
206
213
|
|
207
214
|
elif isinstance(element, marko.block.List):
|
215
|
+
has_non_empty_list_items = False
|
216
|
+
for child in element.children:
|
217
|
+
if isinstance(child, marko.block.ListItem) and len(child.children) > 0:
|
218
|
+
has_non_empty_list_items = True
|
219
|
+
break
|
220
|
+
|
208
221
|
self.close_table(doc)
|
209
222
|
self.process_inline_text(parent_element, doc)
|
210
223
|
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
)
|
224
|
+
if has_non_empty_list_items:
|
225
|
+
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
|
226
|
+
parent_element = doc.add_group(
|
227
|
+
label=label, name=f"list", parent=parent_element
|
228
|
+
)
|
217
229
|
|
218
|
-
elif isinstance(element, marko.block.ListItem):
|
230
|
+
elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
|
219
231
|
self.close_table(doc)
|
220
232
|
self.process_inline_text(parent_element, doc)
|
221
233
|
_log.debug(" - List item")
|
@@ -245,20 +257,18 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
245
257
|
|
246
258
|
doc.add_picture(parent=parent_element, caption=fig_caption)
|
247
259
|
|
248
|
-
elif isinstance(element, marko.block.Paragraph):
|
260
|
+
elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
|
249
261
|
self.process_inline_text(parent_element, doc)
|
250
262
|
|
251
263
|
elif isinstance(element, marko.inline.RawText):
|
252
264
|
_log.debug(f" - Paragraph (raw text): {element.children}")
|
253
|
-
snippet_text =
|
265
|
+
snippet_text = element.children.strip()
|
254
266
|
# Detect start of the table:
|
255
267
|
if "|" in snippet_text:
|
256
268
|
# most likely part of the markdown table
|
257
269
|
self.in_table = True
|
258
270
|
if len(self.md_table_buffer) > 0:
|
259
|
-
self.md_table_buffer[len(self.md_table_buffer) - 1] +=
|
260
|
-
snippet_text
|
261
|
-
)
|
271
|
+
self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
|
262
272
|
else:
|
263
273
|
self.md_table_buffer.append(snippet_text)
|
264
274
|
else:
|
@@ -274,18 +284,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
274
284
|
snippet_text = str(element.children).strip()
|
275
285
|
doc.add_code(parent=parent_element, text=snippet_text)
|
276
286
|
|
277
|
-
elif
|
287
|
+
elif (
|
288
|
+
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
|
289
|
+
and len(element.children) > 0
|
290
|
+
and isinstance((first_child := element.children[0]), marko.inline.RawText)
|
291
|
+
and len(snippet_text := (first_child.children.strip())) > 0
|
292
|
+
):
|
278
293
|
self.close_table(doc)
|
279
294
|
self.process_inline_text(parent_element, doc)
|
280
295
|
_log.debug(f" - Code Block: {element.children}")
|
281
|
-
snippet_text = str(element.children[0].children).strip() # type: ignore
|
282
|
-
doc.add_code(parent=parent_element, text=snippet_text)
|
283
|
-
|
284
|
-
elif isinstance(element, marko.block.FencedCode):
|
285
|
-
self.close_table(doc)
|
286
|
-
self.process_inline_text(parent_element, doc)
|
287
|
-
_log.debug(f" - Code Block: {element.children}")
|
288
|
-
snippet_text = str(element.children[0].children).strip() # type: ignore
|
289
296
|
doc.add_code(parent=parent_element, text=snippet_text)
|
290
297
|
|
291
298
|
elif isinstance(element, marko.inline.LineBreak):
|
@@ -294,29 +301,38 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
294
301
|
self.md_table_buffer.append("")
|
295
302
|
|
296
303
|
elif isinstance(element, marko.block.HTMLBlock):
|
304
|
+
self._html_blocks += 1
|
297
305
|
self.process_inline_text(parent_element, doc)
|
298
306
|
self.close_table(doc)
|
299
307
|
_log.debug("HTML Block: {}".format(element))
|
300
308
|
if (
|
301
|
-
len(element.
|
309
|
+
len(element.body) > 0
|
302
310
|
): # If Marko doesn't return any content for HTML block, skip it
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
311
|
+
html_block = element.body.strip()
|
312
|
+
|
313
|
+
# wrap in markers to enable post-processing in convert()
|
314
|
+
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
|
315
|
+
doc.add_code(parent=parent_element, text=text_to_add)
|
307
316
|
else:
|
308
317
|
if not isinstance(element, str):
|
309
318
|
self.close_table(doc)
|
310
319
|
_log.debug("Some other element: {}".format(element))
|
311
320
|
|
321
|
+
processed_block_types = (
|
322
|
+
marko.block.ListItem,
|
323
|
+
marko.block.Heading,
|
324
|
+
marko.block.CodeBlock,
|
325
|
+
marko.block.FencedCode,
|
326
|
+
# marko.block.Paragraph,
|
327
|
+
marko.inline.RawText,
|
328
|
+
)
|
329
|
+
|
312
330
|
# Iterate through the element's children (if any)
|
313
|
-
if
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
for child in element.children:
|
319
|
-
self.iterate_elements(child, depth + 1, doc, parent_element)
|
331
|
+
if hasattr(element, "children") and not isinstance(
|
332
|
+
element, processed_block_types
|
333
|
+
):
|
334
|
+
for child in element.children:
|
335
|
+
self.iterate_elements(child, depth + 1, doc, parent_element)
|
320
336
|
|
321
337
|
def is_valid(self) -> bool:
|
322
338
|
return self.valid
|
@@ -352,6 +368,43 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
352
368
|
# Start iterating from the root of the AST
|
353
369
|
self.iterate_elements(parsed_ast, 0, doc, None)
|
354
370
|
self.process_inline_text(None, doc) # handle last hanging inline text
|
371
|
+
self.close_table(doc=doc) # handle any last hanging table
|
372
|
+
|
373
|
+
# if HTML blocks were detected, export to HTML and delegate to HTML backend
|
374
|
+
if self._html_blocks > 0:
|
375
|
+
|
376
|
+
# export to HTML
|
377
|
+
html_backend_cls = HTMLDocumentBackend
|
378
|
+
html_str = doc.export_to_html()
|
379
|
+
|
380
|
+
def _restore_original_html(txt, regex):
|
381
|
+
_txt, count = re.subn(regex, "", txt)
|
382
|
+
if count != self._html_blocks:
|
383
|
+
raise RuntimeError(
|
384
|
+
"An internal error has occurred during Markdown conversion."
|
385
|
+
)
|
386
|
+
return _txt
|
387
|
+
|
388
|
+
# restore original HTML by removing previouly added markers
|
389
|
+
for regex in [
|
390
|
+
rf"<pre>\s*<code>\s*{_START_MARKER}",
|
391
|
+
rf"{_STOP_MARKER}\s*</code>\s*</pre>",
|
392
|
+
]:
|
393
|
+
html_str = _restore_original_html(txt=html_str, regex=regex)
|
394
|
+
self._html_blocks = 0
|
395
|
+
|
396
|
+
# delegate to HTML backend
|
397
|
+
stream = BytesIO(bytes(html_str, encoding="utf-8"))
|
398
|
+
in_doc = InputDocument(
|
399
|
+
path_or_stream=stream,
|
400
|
+
format=InputFormat.HTML,
|
401
|
+
backend=html_backend_cls,
|
402
|
+
filename=self.file.name,
|
403
|
+
)
|
404
|
+
html_backend_obj = html_backend_cls(
|
405
|
+
in_doc=in_doc, path_or_stream=stream
|
406
|
+
)
|
407
|
+
doc = html_backend_obj.convert()
|
355
408
|
else:
|
356
409
|
raise RuntimeError(
|
357
410
|
f"Cannot convert md with {self.document_hash} because the backend failed to init."
|
docling/backend/mspowerpoint_backend.py
CHANGED
@@ -98,21 +98,28 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
98
98
|
|
99
99
|
return doc
|
100
100
|
|
101
|
-
def generate_prov(
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
101
|
+
def generate_prov(
|
102
|
+
self, shape, slide_ind, text="", slide_size=Size(width=1, height=1)
|
103
|
+
):
|
104
|
+
if shape.left:
|
105
|
+
left = shape.left
|
106
|
+
top = shape.top
|
107
|
+
width = shape.width
|
108
|
+
height = shape.height
|
109
|
+
else:
|
110
|
+
left = 0
|
111
|
+
top = 0
|
112
|
+
width = slide_size.width
|
113
|
+
height = slide_size.height
|
106
114
|
shape_bbox = [left, top, left + width, top + height]
|
107
115
|
shape_bbox = BoundingBox.from_tuple(shape_bbox, origin=CoordOrigin.BOTTOMLEFT)
|
108
|
-
# prov = [{"bbox": shape_bbox, "page": parent_slide, "span": [0, len(text)]}]
|
109
116
|
prov = ProvenanceItem(
|
110
117
|
page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox
|
111
118
|
)
|
112
119
|
|
113
120
|
return prov
|
114
121
|
|
115
|
-
def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
|
122
|
+
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
|
116
123
|
is_a_list = False
|
117
124
|
is_list_group_created = False
|
118
125
|
enum_list_item_value = 0
|
@@ -121,7 +128,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
121
128
|
list_text = ""
|
122
129
|
list_label = GroupLabel.LIST
|
123
130
|
doc_label = DocItemLabel.LIST_ITEM
|
124
|
-
prov = self.generate_prov(shape, slide_ind, shape.text.strip())
|
131
|
+
prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
|
125
132
|
|
126
133
|
# Identify if shape contains lists
|
127
134
|
for paragraph in shape.text_frame.paragraphs:
|
@@ -270,18 +277,17 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
270
277
|
)
|
271
278
|
return
|
272
279
|
|
273
|
-
def handle_pictures(self, shape, parent_slide, slide_ind, doc):
|
274
|
-
# Get the image bytes
|
275
|
-
image = shape.image
|
276
|
-
image_bytes = image.blob
|
277
|
-
im_dpi, _ = image.dpi
|
278
|
-
|
280
|
+
def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size):
|
279
281
|
# Open it with PIL
|
280
282
|
try:
|
283
|
+
# Get the image bytes
|
284
|
+
image = shape.image
|
285
|
+
image_bytes = image.blob
|
286
|
+
im_dpi, _ = image.dpi
|
281
287
|
pil_image = Image.open(BytesIO(image_bytes))
|
282
288
|
|
283
289
|
# shape has picture
|
284
|
-
prov = self.generate_prov(shape, slide_ind, "")
|
290
|
+
prov = self.generate_prov(shape, slide_ind, "", slide_size)
|
285
291
|
doc.add_picture(
|
286
292
|
parent=parent_slide,
|
287
293
|
image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
|
@@ -292,13 +298,13 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
292
298
|
_log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
|
293
299
|
return
|
294
300
|
|
295
|
-
def handle_tables(self, shape, parent_slide, slide_ind, doc):
|
301
|
+
def handle_tables(self, shape, parent_slide, slide_ind, doc, slide_size):
|
296
302
|
# Handling tables, images, charts
|
297
303
|
if shape.has_table:
|
298
304
|
table = shape.table
|
299
305
|
table_xml = shape._element
|
300
306
|
|
301
|
-
prov = self.generate_prov(shape, slide_ind, "")
|
307
|
+
prov = self.generate_prov(shape, slide_ind, "", slide_size)
|
302
308
|
|
303
309
|
num_cols = 0
|
304
310
|
num_rows = len(table.rows)
|
@@ -375,17 +381,19 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
375
381
|
name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=parents[0]
|
376
382
|
)
|
377
383
|
|
378
|
-
|
379
|
-
parent_page = doc.add_page(page_no=slide_ind + 1, size=
|
384
|
+
slide_size = Size(width=slide_width, height=slide_height)
|
385
|
+
parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
|
380
386
|
|
381
|
-
def handle_shapes(shape, parent_slide, slide_ind, doc):
|
382
|
-
handle_groups(shape, parent_slide, slide_ind, doc)
|
387
|
+
def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
|
388
|
+
handle_groups(shape, parent_slide, slide_ind, doc, slide_size)
|
383
389
|
if shape.has_table:
|
384
390
|
# Handle Tables
|
385
|
-
self.handle_tables(shape, parent_slide, slide_ind, doc)
|
391
|
+
self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
|
386
392
|
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
387
393
|
# Handle Pictures
|
388
|
-
self.handle_pictures(
|
394
|
+
self.handle_pictures(
|
395
|
+
shape, parent_slide, slide_ind, doc, slide_size
|
396
|
+
)
|
389
397
|
# If shape doesn't have any text, move on to the next shape
|
390
398
|
if not hasattr(shape, "text"):
|
391
399
|
return
|
@@ -397,16 +405,20 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
397
405
|
_log.warning("Warning: shape has text but not text_frame")
|
398
406
|
return
|
399
407
|
# Handle other text elements, including lists (bullet lists, numbered lists)
|
400
|
-
self.handle_text_elements(
|
408
|
+
self.handle_text_elements(
|
409
|
+
shape, parent_slide, slide_ind, doc, slide_size
|
410
|
+
)
|
401
411
|
return
|
402
412
|
|
403
|
-
def handle_groups(shape, parent_slide, slide_ind, doc):
|
413
|
+
def handle_groups(shape, parent_slide, slide_ind, doc, slide_size):
|
404
414
|
if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
|
405
415
|
for groupedshape in shape.shapes:
|
406
|
-
handle_shapes(
|
416
|
+
handle_shapes(
|
417
|
+
groupedshape, parent_slide, slide_ind, doc, slide_size
|
418
|
+
)
|
407
419
|
|
408
420
|
# Loop through each shape in the slide
|
409
421
|
for shape in slide.shapes:
|
410
|
-
handle_shapes(shape, parent_slide, slide_ind, doc)
|
422
|
+
handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)
|
411
423
|
|
412
424
|
return doc
|