docling-2.16.0-py3-none-any.whl → docling-2.18.0-py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- docling/backend/html_backend.py +21 -20
- docling/backend/md_backend.py +92 -43
- docling/backend/mspowerpoint_backend.py +39 -27
- docling/backend/msword_backend.py +172 -130
- docling/backend/xml/uspto_backend.py +25 -25
- docling/cli/main.py +18 -3
- docling/datamodel/document.py +4 -0
- docling/datamodel/pipeline_options.py +1 -0
- docling/datamodel/settings.py +16 -1
- docling/document_converter.py +12 -2
- docling/models/rapid_ocr_model.py +1 -0
- docling/models/table_structure_model.py +9 -5
- docling/models/tesseract_ocr_cli_model.py +72 -4
- docling/models/tesseract_ocr_model.py +37 -37
- docling/pipeline/base_pipeline.py +3 -1
- docling/utils/glm_utils.py +4 -0
- docling/utils/ocr_utils.py +9 -0
- {docling-2.16.0.dist-info → docling-2.18.0.dist-info}/METADATA +20 -12
- {docling-2.16.0.dist-info → docling-2.18.0.dist-info}/RECORD +22 -21
- {docling-2.16.0.dist-info → docling-2.18.0.dist-info}/WHEEL +1 -1
- {docling-2.16.0.dist-info → docling-2.18.0.dist-info}/LICENSE +0 -0
- {docling-2.16.0.dist-info → docling-2.18.0.dist-info}/entry_points.txt +0 -0
docling/backend/html_backend.py
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
import logging
|
2
2
|
from io import BytesIO
|
3
3
|
from pathlib import Path
|
4
|
-
from typing import Set, Union
|
4
|
+
from typing import Optional, Set, Union
|
5
5
|
|
6
|
-
from bs4 import BeautifulSoup
|
6
|
+
from bs4 import BeautifulSoup, Tag
|
7
7
|
from docling_core.types.doc import (
|
8
8
|
DocItemLabel,
|
9
9
|
DoclingDocument,
|
@@ -24,7 +24,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
24
24
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
25
25
|
super().__init__(in_doc, path_or_stream)
|
26
26
|
_log.debug("About to init HTML backend...")
|
27
|
-
self.soup = None
|
27
|
+
self.soup: Optional[Tag] = None
|
28
28
|
# HTML file:
|
29
29
|
self.path_or_stream = path_or_stream
|
30
30
|
# Initialise the parents for the hierarchy
|
@@ -78,17 +78,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
78
78
|
|
79
79
|
if self.is_valid():
|
80
80
|
assert self.soup is not None
|
81
|
+
content = self.soup.body or self.soup
|
81
82
|
# Replace <br> tags with newline characters
|
82
|
-
for br in
|
83
|
+
for br in content.find_all("br"):
|
83
84
|
br.replace_with("\n")
|
84
|
-
doc = self.walk(
|
85
|
+
doc = self.walk(content, doc)
|
85
86
|
else:
|
86
87
|
raise RuntimeError(
|
87
88
|
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
88
89
|
)
|
89
90
|
return doc
|
90
91
|
|
91
|
-
def walk(self, element, doc):
|
92
|
+
def walk(self, element: Tag, doc: DoclingDocument):
|
92
93
|
try:
|
93
94
|
# Iterate over elements in the body of the document
|
94
95
|
for idx, element in enumerate(element.children):
|
@@ -105,7 +106,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
105
106
|
|
106
107
|
return doc
|
107
108
|
|
108
|
-
def analyse_element(self, element, idx, doc):
|
109
|
+
def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument):
|
109
110
|
"""
|
110
111
|
if element.name!=None:
|
111
112
|
_log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
|
@@ -135,7 +136,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
135
136
|
else:
|
136
137
|
self.walk(element, doc)
|
137
138
|
|
138
|
-
def get_direct_text(self, item):
|
139
|
+
def get_direct_text(self, item: Tag):
|
139
140
|
"""Get the direct text of the <li> element (ignoring nested lists)."""
|
140
141
|
text = item.find(string=True, recursive=False)
|
141
142
|
if isinstance(text, str):
|
@@ -144,7 +145,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
144
145
|
return ""
|
145
146
|
|
146
147
|
# Function to recursively extract text from all child nodes
|
147
|
-
def extract_text_recursively(self, item):
|
148
|
+
def extract_text_recursively(self, item: Tag):
|
148
149
|
result = []
|
149
150
|
|
150
151
|
if isinstance(item, str):
|
@@ -165,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
165
166
|
|
166
167
|
return "".join(result) + " "
|
167
168
|
|
168
|
-
def handle_header(self, element, idx, doc):
|
169
|
+
def handle_header(self, element: Tag, idx: int, doc: DoclingDocument):
|
169
170
|
"""Handles header tags (h1, h2, etc.)."""
|
170
171
|
hlevel = int(element.name.replace("h", ""))
|
171
172
|
slevel = hlevel - 1
|
@@ -207,7 +208,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
207
208
|
level=hlevel,
|
208
209
|
)
|
209
210
|
|
210
|
-
def handle_code(self, element, idx, doc):
|
211
|
+
def handle_code(self, element: Tag, idx: int, doc: DoclingDocument):
|
211
212
|
"""Handles monospace code snippets (pre)."""
|
212
213
|
if element.text is None:
|
213
214
|
return
|
@@ -215,9 +216,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
215
216
|
label = DocItemLabel.CODE
|
216
217
|
if len(text) == 0:
|
217
218
|
return
|
218
|
-
doc.add_code(parent=self.parents[self.level],
|
219
|
+
doc.add_code(parent=self.parents[self.level], text=text)
|
219
220
|
|
220
|
-
def handle_paragraph(self, element, idx, doc):
|
221
|
+
def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument):
|
221
222
|
"""Handles paragraph tags (p)."""
|
222
223
|
if element.text is None:
|
223
224
|
return
|
@@ -227,7 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
227
228
|
return
|
228
229
|
doc.add_text(parent=self.parents[self.level], label=label, text=text)
|
229
230
|
|
230
|
-
def handle_list(self, element, idx, doc):
|
231
|
+
def handle_list(self, element: Tag, idx: int, doc: DoclingDocument):
|
231
232
|
"""Handles list tags (ul, ol) and their list items."""
|
232
233
|
|
233
234
|
if element.name == "ul":
|
@@ -249,7 +250,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
249
250
|
self.parents[self.level + 1] = None
|
250
251
|
self.level -= 1
|
251
252
|
|
252
|
-
def handle_listitem(self, element, idx, doc):
|
253
|
+
def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument):
|
253
254
|
"""Handles listitem tags (li)."""
|
254
255
|
nested_lists = element.find(["ul", "ol"])
|
255
256
|
|
@@ -303,7 +304,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
303
304
|
else:
|
304
305
|
_log.warn("list-item has no text: ", element)
|
305
306
|
|
306
|
-
def handle_table(self, element, idx, doc):
|
307
|
+
def handle_table(self, element: Tag, idx: int, doc: DoclingDocument):
|
307
308
|
"""Handles table tags."""
|
308
309
|
|
309
310
|
nested_tables = element.find("table")
|
@@ -376,7 +377,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
376
377
|
|
377
378
|
doc.add_table(data=data, parent=self.parents[self.level])
|
378
379
|
|
379
|
-
def get_list_text(self, list_element, level=0):
|
380
|
+
def get_list_text(self, list_element: Tag, level=0):
|
380
381
|
"""Recursively extract text from <ul> or <ol> with proper indentation."""
|
381
382
|
result = []
|
382
383
|
bullet_char = "*" # Default bullet character for unordered lists
|
@@ -402,7 +403,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
402
403
|
|
403
404
|
return result
|
404
405
|
|
405
|
-
def extract_table_cell_text(self, cell):
|
406
|
+
def extract_table_cell_text(self, cell: Tag):
|
406
407
|
"""Extract text from a table cell, including lists with indents."""
|
407
408
|
contains_lists = cell.find(["ul", "ol"])
|
408
409
|
if contains_lists is None:
|
@@ -413,7 +414,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
413
414
|
)
|
414
415
|
return cell.text
|
415
416
|
|
416
|
-
def handle_figure(self, element, idx, doc):
|
417
|
+
def handle_figure(self, element: Tag, idx: int, doc: DoclingDocument):
|
417
418
|
"""Handles image tags (img)."""
|
418
419
|
|
419
420
|
# Extract the image URI from the <img> tag
|
@@ -436,6 +437,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
436
437
|
caption=fig_caption,
|
437
438
|
)
|
438
439
|
|
439
|
-
def handle_image(self, element, idx, doc):
|
440
|
+
def handle_image(self, element: Tag, idx, doc: DoclingDocument):
|
440
441
|
"""Handles image tags (img)."""
|
441
442
|
doc.add_picture(parent=self.parents[self.level], caption=None)
|
docling/backend/md_backend.py
CHANGED
@@ -6,6 +6,7 @@ from pathlib import Path
|
|
6
6
|
from typing import List, Optional, Set, Union
|
7
7
|
|
8
8
|
import marko
|
9
|
+
import marko.element
|
9
10
|
import marko.ext
|
10
11
|
import marko.ext.gfm
|
11
12
|
import marko.inline
|
@@ -23,11 +24,16 @@ from docling_core.types.doc import (
|
|
23
24
|
from marko import Markdown
|
24
25
|
|
25
26
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
27
|
+
from docling.backend.html_backend import HTMLDocumentBackend
|
26
28
|
from docling.datamodel.base_models import InputFormat
|
27
29
|
from docling.datamodel.document import InputDocument
|
28
30
|
|
29
31
|
_log = logging.getLogger(__name__)
|
30
32
|
|
33
|
+
_MARKER_BODY = "DOCLING_DOC_MD_HTML_EXPORT"
|
34
|
+
_START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
|
35
|
+
_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
|
36
|
+
|
31
37
|
|
32
38
|
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
33
39
|
def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
|
@@ -65,7 +71,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
65
71
|
|
66
72
|
self.in_table = False
|
67
73
|
self.md_table_buffer: list[str] = []
|
68
|
-
self.
|
74
|
+
self.inline_texts: list[str] = []
|
75
|
+
self._html_blocks: int = 0
|
69
76
|
|
70
77
|
try:
|
71
78
|
if isinstance(self.path_or_stream, BytesIO):
|
@@ -152,26 +159,25 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
152
159
|
def process_inline_text(
|
153
160
|
self, parent_element: Optional[NodeItem], doc: DoclingDocument
|
154
161
|
):
|
155
|
-
|
156
|
-
txt = self.inline_text_buffer.strip()
|
162
|
+
txt = " ".join(self.inline_texts)
|
157
163
|
if len(txt) > 0:
|
158
164
|
doc.add_text(
|
159
165
|
label=DocItemLabel.PARAGRAPH,
|
160
166
|
parent=parent_element,
|
161
167
|
text=txt,
|
162
168
|
)
|
163
|
-
self.
|
169
|
+
self.inline_texts = []
|
164
170
|
|
165
171
|
def iterate_elements(
|
166
172
|
self,
|
167
|
-
element: marko.
|
173
|
+
element: marko.element.Element,
|
168
174
|
depth: int,
|
169
175
|
doc: DoclingDocument,
|
170
176
|
parent_element: Optional[NodeItem] = None,
|
171
177
|
):
|
172
178
|
# Iterates over all elements in the AST
|
173
179
|
# Check for different element types and process relevant details
|
174
|
-
if isinstance(element, marko.block.Heading):
|
180
|
+
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
|
175
181
|
self.close_table(doc)
|
176
182
|
self.process_inline_text(parent_element, doc)
|
177
183
|
_log.debug(
|
@@ -206,17 +212,22 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
206
212
|
)
|
207
213
|
|
208
214
|
elif isinstance(element, marko.block.List):
|
215
|
+
has_non_empty_list_items = False
|
216
|
+
for child in element.children:
|
217
|
+
if isinstance(child, marko.block.ListItem) and len(child.children) > 0:
|
218
|
+
has_non_empty_list_items = True
|
219
|
+
break
|
220
|
+
|
209
221
|
self.close_table(doc)
|
210
222
|
self.process_inline_text(parent_element, doc)
|
211
223
|
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
)
|
224
|
+
if has_non_empty_list_items:
|
225
|
+
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
|
226
|
+
parent_element = doc.add_group(
|
227
|
+
label=label, name=f"list", parent=parent_element
|
228
|
+
)
|
218
229
|
|
219
|
-
elif isinstance(element, marko.block.ListItem):
|
230
|
+
elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
|
220
231
|
self.close_table(doc)
|
221
232
|
self.process_inline_text(parent_element, doc)
|
222
233
|
_log.debug(" - List item")
|
@@ -246,29 +257,25 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
246
257
|
|
247
258
|
doc.add_picture(parent=parent_element, caption=fig_caption)
|
248
259
|
|
249
|
-
elif isinstance(element, marko.block.Paragraph):
|
260
|
+
elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
|
250
261
|
self.process_inline_text(parent_element, doc)
|
251
262
|
|
252
263
|
elif isinstance(element, marko.inline.RawText):
|
253
264
|
_log.debug(f" - Paragraph (raw text): {element.children}")
|
254
|
-
snippet_text =
|
265
|
+
snippet_text = element.children.strip()
|
255
266
|
# Detect start of the table:
|
256
267
|
if "|" in snippet_text:
|
257
268
|
# most likely part of the markdown table
|
258
269
|
self.in_table = True
|
259
270
|
if len(self.md_table_buffer) > 0:
|
260
|
-
self.md_table_buffer[len(self.md_table_buffer) - 1] +=
|
261
|
-
snippet_text
|
262
|
-
)
|
271
|
+
self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
|
263
272
|
else:
|
264
273
|
self.md_table_buffer.append(snippet_text)
|
265
274
|
else:
|
266
275
|
self.close_table(doc)
|
267
276
|
self.in_table = False
|
268
277
|
# most likely just inline text
|
269
|
-
self.
|
270
|
-
element.children
|
271
|
-
) # do not strip an inline text, as it may contain important spaces
|
278
|
+
self.inline_texts.append(str(element.children))
|
272
279
|
|
273
280
|
elif isinstance(element, marko.inline.CodeSpan):
|
274
281
|
self.close_table(doc)
|
@@ -277,50 +284,55 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
277
284
|
snippet_text = str(element.children).strip()
|
278
285
|
doc.add_code(parent=parent_element, text=snippet_text)
|
279
286
|
|
280
|
-
elif
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
snippet_text
|
285
|
-
|
286
|
-
|
287
|
-
elif isinstance(element, marko.block.FencedCode):
|
287
|
+
elif (
|
288
|
+
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
|
289
|
+
and len(element.children) > 0
|
290
|
+
and isinstance((first_child := element.children[0]), marko.inline.RawText)
|
291
|
+
and len(snippet_text := (first_child.children.strip())) > 0
|
292
|
+
):
|
288
293
|
self.close_table(doc)
|
289
294
|
self.process_inline_text(parent_element, doc)
|
290
295
|
_log.debug(f" - Code Block: {element.children}")
|
291
|
-
snippet_text = str(element.children[0].children).strip() # type: ignore
|
292
296
|
doc.add_code(parent=parent_element, text=snippet_text)
|
293
297
|
|
294
298
|
elif isinstance(element, marko.inline.LineBreak):
|
295
|
-
self.process_inline_text(parent_element, doc)
|
296
299
|
if self.in_table:
|
297
300
|
_log.debug("Line break in a table")
|
298
301
|
self.md_table_buffer.append("")
|
299
302
|
|
300
303
|
elif isinstance(element, marko.block.HTMLBlock):
|
304
|
+
self._html_blocks += 1
|
301
305
|
self.process_inline_text(parent_element, doc)
|
302
306
|
self.close_table(doc)
|
303
307
|
_log.debug("HTML Block: {}".format(element))
|
304
308
|
if (
|
305
|
-
len(element.
|
309
|
+
len(element.body) > 0
|
306
310
|
): # If Marko doesn't return any content for HTML block, skip it
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
+
html_block = element.body.strip()
|
312
|
+
|
313
|
+
# wrap in markers to enable post-processing in convert()
|
314
|
+
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
|
315
|
+
doc.add_code(parent=parent_element, text=text_to_add)
|
311
316
|
else:
|
312
317
|
if not isinstance(element, str):
|
313
318
|
self.close_table(doc)
|
314
319
|
_log.debug("Some other element: {}".format(element))
|
315
320
|
|
321
|
+
processed_block_types = (
|
322
|
+
marko.block.ListItem,
|
323
|
+
marko.block.Heading,
|
324
|
+
marko.block.CodeBlock,
|
325
|
+
marko.block.FencedCode,
|
326
|
+
# marko.block.Paragraph,
|
327
|
+
marko.inline.RawText,
|
328
|
+
)
|
329
|
+
|
316
330
|
# Iterate through the element's children (if any)
|
317
|
-
if
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
for child in element.children:
|
323
|
-
self.iterate_elements(child, depth + 1, doc, parent_element)
|
331
|
+
if hasattr(element, "children") and not isinstance(
|
332
|
+
element, processed_block_types
|
333
|
+
):
|
334
|
+
for child in element.children:
|
335
|
+
self.iterate_elements(child, depth + 1, doc, parent_element)
|
324
336
|
|
325
337
|
def is_valid(self) -> bool:
|
326
338
|
return self.valid
|
@@ -356,6 +368,43 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
356
368
|
# Start iterating from the root of the AST
|
357
369
|
self.iterate_elements(parsed_ast, 0, doc, None)
|
358
370
|
self.process_inline_text(None, doc) # handle last hanging inline text
|
371
|
+
self.close_table(doc=doc) # handle any last hanging table
|
372
|
+
|
373
|
+
# if HTML blocks were detected, export to HTML and delegate to HTML backend
|
374
|
+
if self._html_blocks > 0:
|
375
|
+
|
376
|
+
# export to HTML
|
377
|
+
html_backend_cls = HTMLDocumentBackend
|
378
|
+
html_str = doc.export_to_html()
|
379
|
+
|
380
|
+
def _restore_original_html(txt, regex):
|
381
|
+
_txt, count = re.subn(regex, "", txt)
|
382
|
+
if count != self._html_blocks:
|
383
|
+
raise RuntimeError(
|
384
|
+
"An internal error has occurred during Markdown conversion."
|
385
|
+
)
|
386
|
+
return _txt
|
387
|
+
|
388
|
+
# restore original HTML by removing previouly added markers
|
389
|
+
for regex in [
|
390
|
+
rf"<pre>\s*<code>\s*{_START_MARKER}",
|
391
|
+
rf"{_STOP_MARKER}\s*</code>\s*</pre>",
|
392
|
+
]:
|
393
|
+
html_str = _restore_original_html(txt=html_str, regex=regex)
|
394
|
+
self._html_blocks = 0
|
395
|
+
|
396
|
+
# delegate to HTML backend
|
397
|
+
stream = BytesIO(bytes(html_str, encoding="utf-8"))
|
398
|
+
in_doc = InputDocument(
|
399
|
+
path_or_stream=stream,
|
400
|
+
format=InputFormat.HTML,
|
401
|
+
backend=html_backend_cls,
|
402
|
+
filename=self.file.name,
|
403
|
+
)
|
404
|
+
html_backend_obj = html_backend_cls(
|
405
|
+
in_doc=in_doc, path_or_stream=stream
|
406
|
+
)
|
407
|
+
doc = html_backend_obj.convert()
|
359
408
|
else:
|
360
409
|
raise RuntimeError(
|
361
410
|
f"Cannot convert md with {self.document_hash} because the backend failed to init."
|
docling/backend/mspowerpoint_backend.py
CHANGED
@@ -98,21 +98,28 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
98
98
|
|
99
99
|
return doc
|
100
100
|
|
101
|
-
def generate_prov(
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
101
|
+
def generate_prov(
|
102
|
+
self, shape, slide_ind, text="", slide_size=Size(width=1, height=1)
|
103
|
+
):
|
104
|
+
if shape.left:
|
105
|
+
left = shape.left
|
106
|
+
top = shape.top
|
107
|
+
width = shape.width
|
108
|
+
height = shape.height
|
109
|
+
else:
|
110
|
+
left = 0
|
111
|
+
top = 0
|
112
|
+
width = slide_size.width
|
113
|
+
height = slide_size.height
|
106
114
|
shape_bbox = [left, top, left + width, top + height]
|
107
115
|
shape_bbox = BoundingBox.from_tuple(shape_bbox, origin=CoordOrigin.BOTTOMLEFT)
|
108
|
-
# prov = [{"bbox": shape_bbox, "page": parent_slide, "span": [0, len(text)]}]
|
109
116
|
prov = ProvenanceItem(
|
110
117
|
page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox
|
111
118
|
)
|
112
119
|
|
113
120
|
return prov
|
114
121
|
|
115
|
-
def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
|
122
|
+
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
|
116
123
|
is_a_list = False
|
117
124
|
is_list_group_created = False
|
118
125
|
enum_list_item_value = 0
|
@@ -121,7 +128,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
121
128
|
list_text = ""
|
122
129
|
list_label = GroupLabel.LIST
|
123
130
|
doc_label = DocItemLabel.LIST_ITEM
|
124
|
-
prov = self.generate_prov(shape, slide_ind, shape.text.strip())
|
131
|
+
prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
|
125
132
|
|
126
133
|
# Identify if shape contains lists
|
127
134
|
for paragraph in shape.text_frame.paragraphs:
|
@@ -270,18 +277,17 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
270
277
|
)
|
271
278
|
return
|
272
279
|
|
273
|
-
def handle_pictures(self, shape, parent_slide, slide_ind, doc):
|
274
|
-
# Get the image bytes
|
275
|
-
image = shape.image
|
276
|
-
image_bytes = image.blob
|
277
|
-
im_dpi, _ = image.dpi
|
278
|
-
|
280
|
+
def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size):
|
279
281
|
# Open it with PIL
|
280
282
|
try:
|
283
|
+
# Get the image bytes
|
284
|
+
image = shape.image
|
285
|
+
image_bytes = image.blob
|
286
|
+
im_dpi, _ = image.dpi
|
281
287
|
pil_image = Image.open(BytesIO(image_bytes))
|
282
288
|
|
283
289
|
# shape has picture
|
284
|
-
prov = self.generate_prov(shape, slide_ind, "")
|
290
|
+
prov = self.generate_prov(shape, slide_ind, "", slide_size)
|
285
291
|
doc.add_picture(
|
286
292
|
parent=parent_slide,
|
287
293
|
image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
|
@@ -292,13 +298,13 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
292
298
|
_log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
|
293
299
|
return
|
294
300
|
|
295
|
-
def handle_tables(self, shape, parent_slide, slide_ind, doc):
|
301
|
+
def handle_tables(self, shape, parent_slide, slide_ind, doc, slide_size):
|
296
302
|
# Handling tables, images, charts
|
297
303
|
if shape.has_table:
|
298
304
|
table = shape.table
|
299
305
|
table_xml = shape._element
|
300
306
|
|
301
|
-
prov = self.generate_prov(shape, slide_ind, "")
|
307
|
+
prov = self.generate_prov(shape, slide_ind, "", slide_size)
|
302
308
|
|
303
309
|
num_cols = 0
|
304
310
|
num_rows = len(table.rows)
|
@@ -375,17 +381,19 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
375
381
|
name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=parents[0]
|
376
382
|
)
|
377
383
|
|
378
|
-
|
379
|
-
parent_page = doc.add_page(page_no=slide_ind + 1, size=
|
384
|
+
slide_size = Size(width=slide_width, height=slide_height)
|
385
|
+
parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
|
380
386
|
|
381
|
-
def handle_shapes(shape, parent_slide, slide_ind, doc):
|
382
|
-
handle_groups(shape, parent_slide, slide_ind, doc)
|
387
|
+
def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
|
388
|
+
handle_groups(shape, parent_slide, slide_ind, doc, slide_size)
|
383
389
|
if shape.has_table:
|
384
390
|
# Handle Tables
|
385
|
-
self.handle_tables(shape, parent_slide, slide_ind, doc)
|
391
|
+
self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
|
386
392
|
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
387
393
|
# Handle Pictures
|
388
|
-
self.handle_pictures(
|
394
|
+
self.handle_pictures(
|
395
|
+
shape, parent_slide, slide_ind, doc, slide_size
|
396
|
+
)
|
389
397
|
# If shape doesn't have any text, move on to the next shape
|
390
398
|
if not hasattr(shape, "text"):
|
391
399
|
return
|
@@ -397,16 +405,20 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
397
405
|
_log.warning("Warning: shape has text but not text_frame")
|
398
406
|
return
|
399
407
|
# Handle other text elements, including lists (bullet lists, numbered lists)
|
400
|
-
self.handle_text_elements(
|
408
|
+
self.handle_text_elements(
|
409
|
+
shape, parent_slide, slide_ind, doc, slide_size
|
410
|
+
)
|
401
411
|
return
|
402
412
|
|
403
|
-
def handle_groups(shape, parent_slide, slide_ind, doc):
|
413
|
+
def handle_groups(shape, parent_slide, slide_ind, doc, slide_size):
|
404
414
|
if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
|
405
415
|
for groupedshape in shape.shapes:
|
406
|
-
handle_shapes(
|
416
|
+
handle_shapes(
|
417
|
+
groupedshape, parent_slide, slide_ind, doc, slide_size
|
418
|
+
)
|
407
419
|
|
408
420
|
# Loop through each shape in the slide
|
409
421
|
for shape in slide.shapes:
|
410
|
-
handle_shapes(shape, parent_slide, slide_ind, doc)
|
422
|
+
handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)
|
411
423
|
|
412
424
|
return doc
|