docling 2.37.0__py3-none-any.whl → 2.38.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/md_backend.py +185 -80
- docling/backend/msword_backend.py +76 -63
- docling/backend/noop_backend.py +51 -0
- docling/cli/main.py +82 -14
- docling/datamodel/asr_model_specs.py +92 -0
- docling/datamodel/base_models.py +12 -2
- docling/datamodel/document.py +3 -1
- docling/datamodel/pipeline_options.py +13 -2
- docling/datamodel/pipeline_options_asr_model.py +57 -0
- docling/datamodel/pipeline_options_vlm_model.py +2 -3
- docling/document_converter.py +8 -0
- docling/models/api_vlm_model.py +3 -1
- docling/models/base_model.py +1 -1
- docling/models/readingorder_model.py +1 -1
- docling/models/vlm_models_inline/hf_transformers_model.py +3 -1
- docling/models/vlm_models_inline/mlx_model.py +3 -1
- docling/pipeline/asr_pipeline.py +253 -0
- docling/pipeline/base_pipeline.py +11 -0
- {docling-2.37.0.dist-info → docling-2.38.1.dist-info}/METADATA +7 -4
- {docling-2.37.0.dist-info → docling-2.38.1.dist-info}/RECORD +24 -20
- {docling-2.37.0.dist-info → docling-2.38.1.dist-info}/WHEEL +0 -0
- {docling-2.37.0.dist-info → docling-2.38.1.dist-info}/entry_points.txt +0 -0
- {docling-2.37.0.dist-info → docling-2.38.1.dist-info}/licenses/LICENSE +0 -0
- {docling-2.37.0.dist-info → docling-2.38.1.dist-info}/top_level.txt +0 -0
docling/backend/md_backend.py
CHANGED
@@ -1,17 +1,16 @@
|
|
1
1
|
import logging
|
2
2
|
import re
|
3
3
|
import warnings
|
4
|
+
from copy import deepcopy
|
5
|
+
from enum import Enum
|
4
6
|
from io import BytesIO
|
5
7
|
from pathlib import Path
|
6
|
-
from typing import List, Optional, Set, Union
|
8
|
+
from typing import List, Literal, Optional, Set, Union
|
7
9
|
|
8
10
|
import marko
|
9
11
|
import marko.element
|
10
|
-
import marko.ext
|
11
|
-
import marko.ext.gfm
|
12
12
|
import marko.inline
|
13
13
|
from docling_core.types.doc import (
|
14
|
-
DocItem,
|
15
14
|
DocItemLabel,
|
16
15
|
DoclingDocument,
|
17
16
|
DocumentOrigin,
|
@@ -21,7 +20,10 @@ from docling_core.types.doc import (
|
|
21
20
|
TableData,
|
22
21
|
TextItem,
|
23
22
|
)
|
23
|
+
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
|
24
24
|
from marko import Markdown
|
25
|
+
from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
|
26
|
+
from typing_extensions import Annotated
|
25
27
|
|
26
28
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
27
29
|
from docling.backend.html_backend import HTMLDocumentBackend
|
@@ -35,6 +37,31 @@ _START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
|
|
35
37
|
_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
|
36
38
|
|
37
39
|
|
40
|
+
class _PendingCreationType(str, Enum):
|
41
|
+
"""CoordOrigin."""
|
42
|
+
|
43
|
+
HEADING = "heading"
|
44
|
+
LIST_ITEM = "list_item"
|
45
|
+
|
46
|
+
|
47
|
+
class _HeadingCreationPayload(BaseModel):
|
48
|
+
kind: Literal["heading"] = "heading"
|
49
|
+
level: int
|
50
|
+
|
51
|
+
|
52
|
+
class _ListItemCreationPayload(BaseModel):
|
53
|
+
kind: Literal["list_item"] = "list_item"
|
54
|
+
|
55
|
+
|
56
|
+
_CreationPayload = Annotated[
|
57
|
+
Union[
|
58
|
+
_HeadingCreationPayload,
|
59
|
+
_ListItemCreationPayload,
|
60
|
+
],
|
61
|
+
Field(discriminator="kind"),
|
62
|
+
]
|
63
|
+
|
64
|
+
|
38
65
|
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
39
66
|
def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
|
40
67
|
# This regex will match any sequence of underscores
|
@@ -71,7 +98,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
71
98
|
|
72
99
|
self.in_table = False
|
73
100
|
self.md_table_buffer: list[str] = []
|
74
|
-
self.inline_texts: list[str] = []
|
75
101
|
self._html_blocks: int = 0
|
76
102
|
|
77
103
|
try:
|
@@ -156,25 +182,65 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
156
182
|
doc.add_table(data=table_data)
|
157
183
|
return
|
158
184
|
|
159
|
-
def
|
160
|
-
self,
|
185
|
+
def _create_list_item(
|
186
|
+
self,
|
187
|
+
doc: DoclingDocument,
|
188
|
+
parent_item: Optional[NodeItem],
|
189
|
+
text: str,
|
190
|
+
formatting: Optional[Formatting] = None,
|
191
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
192
|
+
):
|
193
|
+
if not isinstance(parent_item, (OrderedList, UnorderedList)):
|
194
|
+
_log.warning("ListItem would have not had a list parent, adding one.")
|
195
|
+
parent_item = doc.add_unordered_list(parent=parent_item)
|
196
|
+
item = doc.add_list_item(
|
197
|
+
text=text,
|
198
|
+
enumerated=(isinstance(parent_item, OrderedList)),
|
199
|
+
parent=parent_item,
|
200
|
+
formatting=formatting,
|
201
|
+
hyperlink=hyperlink,
|
202
|
+
)
|
203
|
+
return item
|
204
|
+
|
205
|
+
def _create_heading_item(
|
206
|
+
self,
|
207
|
+
doc: DoclingDocument,
|
208
|
+
parent_item: Optional[NodeItem],
|
209
|
+
text: str,
|
210
|
+
level: int,
|
211
|
+
formatting: Optional[Formatting] = None,
|
212
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
161
213
|
):
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
label=DocItemLabel.PARAGRAPH,
|
214
|
+
if level == 1:
|
215
|
+
item = doc.add_title(
|
216
|
+
text=text,
|
166
217
|
parent=parent_item,
|
167
|
-
|
218
|
+
formatting=formatting,
|
219
|
+
hyperlink=hyperlink,
|
168
220
|
)
|
169
|
-
|
221
|
+
else:
|
222
|
+
item = doc.add_heading(
|
223
|
+
text=text,
|
224
|
+
level=level - 1,
|
225
|
+
parent=parent_item,
|
226
|
+
formatting=formatting,
|
227
|
+
hyperlink=hyperlink,
|
228
|
+
)
|
229
|
+
return item
|
170
230
|
|
171
231
|
def _iterate_elements( # noqa: C901
|
172
232
|
self,
|
233
|
+
*,
|
173
234
|
element: marko.element.Element,
|
174
235
|
depth: int,
|
175
236
|
doc: DoclingDocument,
|
176
237
|
visited: Set[marko.element.Element],
|
238
|
+
creation_stack: list[
|
239
|
+
_CreationPayload
|
240
|
+
], # stack for lazy item creation triggered deep in marko's AST (on RawText)
|
177
241
|
parent_item: Optional[NodeItem] = None,
|
242
|
+
formatting: Optional[Formatting] = None,
|
243
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
178
244
|
):
|
179
245
|
if element in visited:
|
180
246
|
return
|
@@ -183,44 +249,21 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
183
249
|
# Check for different element types and process relevant details
|
184
250
|
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
|
185
251
|
self._close_table(doc)
|
186
|
-
self._process_inline_text(parent_item, doc)
|
187
252
|
_log.debug(
|
188
253
|
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
|
189
254
|
)
|
190
|
-
|
191
|
-
|
255
|
+
|
256
|
+
if len(element.children) > 1: # inline group will be created further down
|
257
|
+
parent_item = self._create_heading_item(
|
258
|
+
doc=doc,
|
259
|
+
parent_item=parent_item,
|
260
|
+
text="",
|
261
|
+
level=element.level,
|
262
|
+
formatting=formatting,
|
263
|
+
hyperlink=hyperlink,
|
264
|
+
)
|
192
265
|
else:
|
193
|
-
|
194
|
-
|
195
|
-
# Header could have arbitrary inclusion of bold, italic or emphasis,
|
196
|
-
# hence we need to traverse the tree to get full text of a header
|
197
|
-
strings: List[str] = []
|
198
|
-
|
199
|
-
# Define a recursive function to traverse the tree
|
200
|
-
def traverse(node: marko.block.BlockElement):
|
201
|
-
# Check if the node has a "children" attribute
|
202
|
-
if hasattr(node, "children"):
|
203
|
-
# If "children" is a list, continue traversal
|
204
|
-
if isinstance(node.children, list):
|
205
|
-
for child in node.children:
|
206
|
-
traverse(child)
|
207
|
-
# If "children" is text, add it to header text
|
208
|
-
elif isinstance(node.children, str):
|
209
|
-
strings.append(node.children)
|
210
|
-
|
211
|
-
traverse(element)
|
212
|
-
snippet_text = "".join(strings)
|
213
|
-
if len(snippet_text) > 0:
|
214
|
-
if doc_label == DocItemLabel.SECTION_HEADER:
|
215
|
-
parent_item = doc.add_heading(
|
216
|
-
text=snippet_text,
|
217
|
-
level=element.level - 1,
|
218
|
-
parent=parent_item,
|
219
|
-
)
|
220
|
-
else:
|
221
|
-
parent_item = doc.add_text(
|
222
|
-
label=doc_label, parent=parent_item, text=snippet_text
|
223
|
-
)
|
266
|
+
creation_stack.append(_HeadingCreationPayload(level=element.level))
|
224
267
|
|
225
268
|
elif isinstance(element, marko.block.List):
|
226
269
|
has_non_empty_list_items = False
|
@@ -230,7 +273,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
230
273
|
break
|
231
274
|
|
232
275
|
self._close_table(doc)
|
233
|
-
self._process_inline_text(parent_item, doc)
|
234
276
|
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
235
277
|
if has_non_empty_list_items:
|
236
278
|
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
|
@@ -240,41 +282,54 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
240
282
|
|
241
283
|
elif (
|
242
284
|
isinstance(element, marko.block.ListItem)
|
243
|
-
and len(element.children)
|
244
|
-
and isinstance((
|
285
|
+
and len(element.children) == 1
|
286
|
+
and isinstance((child := element.children[0]), marko.block.Paragraph)
|
287
|
+
and len(child.children) > 0
|
245
288
|
):
|
246
289
|
self._close_table(doc)
|
247
|
-
self._process_inline_text(parent_item, doc)
|
248
290
|
_log.debug(" - List item")
|
249
291
|
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
)
|
261
|
-
visited.add(first_child)
|
292
|
+
if len(child.children) > 1: # inline group will be created further down
|
293
|
+
parent_item = self._create_list_item(
|
294
|
+
doc=doc,
|
295
|
+
parent_item=parent_item,
|
296
|
+
text="",
|
297
|
+
formatting=formatting,
|
298
|
+
hyperlink=hyperlink,
|
299
|
+
)
|
300
|
+
else:
|
301
|
+
creation_stack.append(_ListItemCreationPayload())
|
262
302
|
|
263
303
|
elif isinstance(element, marko.inline.Image):
|
264
304
|
self._close_table(doc)
|
265
|
-
self._process_inline_text(parent_item, doc)
|
266
305
|
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
267
306
|
|
268
307
|
fig_caption: Optional[TextItem] = None
|
269
308
|
if element.title is not None and element.title != "":
|
270
309
|
fig_caption = doc.add_text(
|
271
|
-
label=DocItemLabel.CAPTION,
|
310
|
+
label=DocItemLabel.CAPTION,
|
311
|
+
text=element.title,
|
312
|
+
formatting=formatting,
|
313
|
+
hyperlink=hyperlink,
|
272
314
|
)
|
273
315
|
|
274
316
|
doc.add_picture(parent=parent_item, caption=fig_caption)
|
275
317
|
|
276
|
-
elif isinstance(element, marko.
|
277
|
-
|
318
|
+
elif isinstance(element, marko.inline.Emphasis):
|
319
|
+
_log.debug(f" - Emphasis: {element.children}")
|
320
|
+
formatting = deepcopy(formatting) if formatting else Formatting()
|
321
|
+
formatting.italic = True
|
322
|
+
|
323
|
+
elif isinstance(element, marko.inline.StrongEmphasis):
|
324
|
+
_log.debug(f" - StrongEmphasis: {element.children}")
|
325
|
+
formatting = deepcopy(formatting) if formatting else Formatting()
|
326
|
+
formatting.bold = True
|
327
|
+
|
328
|
+
elif isinstance(element, marko.inline.Link):
|
329
|
+
_log.debug(f" - Link: {element.children}")
|
330
|
+
hyperlink = TypeAdapter(Optional[Union[AnyUrl, Path]]).validate_python(
|
331
|
+
element.dest
|
332
|
+
)
|
278
333
|
|
279
334
|
elif isinstance(element, marko.inline.RawText):
|
280
335
|
_log.debug(f" - Paragraph (raw text): {element.children}")
|
@@ -287,28 +342,66 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
287
342
|
self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
|
288
343
|
else:
|
289
344
|
self.md_table_buffer.append(snippet_text)
|
290
|
-
|
345
|
+
elif snippet_text:
|
291
346
|
self._close_table(doc)
|
292
|
-
|
293
|
-
|
347
|
+
|
348
|
+
if creation_stack:
|
349
|
+
while len(creation_stack) > 0:
|
350
|
+
to_create = creation_stack.pop()
|
351
|
+
if isinstance(to_create, _ListItemCreationPayload):
|
352
|
+
parent_item = self._create_list_item(
|
353
|
+
doc=doc,
|
354
|
+
parent_item=parent_item,
|
355
|
+
text=snippet_text,
|
356
|
+
formatting=formatting,
|
357
|
+
hyperlink=hyperlink,
|
358
|
+
)
|
359
|
+
elif isinstance(to_create, _HeadingCreationPayload):
|
360
|
+
# not keeping as parent_item as logic for correctly tracking
|
361
|
+
# that not implemented yet (section components not captured
|
362
|
+
# as heading children in marko)
|
363
|
+
self._create_heading_item(
|
364
|
+
doc=doc,
|
365
|
+
parent_item=parent_item,
|
366
|
+
text=snippet_text,
|
367
|
+
level=to_create.level,
|
368
|
+
formatting=formatting,
|
369
|
+
hyperlink=hyperlink,
|
370
|
+
)
|
371
|
+
else:
|
372
|
+
doc.add_text(
|
373
|
+
label=DocItemLabel.TEXT,
|
374
|
+
parent=parent_item,
|
375
|
+
text=snippet_text,
|
376
|
+
formatting=formatting,
|
377
|
+
hyperlink=hyperlink,
|
378
|
+
)
|
294
379
|
|
295
380
|
elif isinstance(element, marko.inline.CodeSpan):
|
296
381
|
self._close_table(doc)
|
297
|
-
self._process_inline_text(parent_item, doc)
|
298
382
|
_log.debug(f" - Code Span: {element.children}")
|
299
383
|
snippet_text = str(element.children).strip()
|
300
|
-
doc.add_code(
|
384
|
+
doc.add_code(
|
385
|
+
parent=parent_item,
|
386
|
+
text=snippet_text,
|
387
|
+
formatting=formatting,
|
388
|
+
hyperlink=hyperlink,
|
389
|
+
)
|
301
390
|
|
302
391
|
elif (
|
303
392
|
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
|
304
393
|
and len(element.children) > 0
|
305
|
-
and isinstance((
|
306
|
-
and len(snippet_text := (
|
394
|
+
and isinstance((child := element.children[0]), marko.inline.RawText)
|
395
|
+
and len(snippet_text := (child.children.strip())) > 0
|
307
396
|
):
|
308
397
|
self._close_table(doc)
|
309
|
-
self._process_inline_text(parent_item, doc)
|
310
398
|
_log.debug(f" - Code Block: {element.children}")
|
311
|
-
doc.add_code(
|
399
|
+
doc.add_code(
|
400
|
+
parent=parent_item,
|
401
|
+
text=snippet_text,
|
402
|
+
formatting=formatting,
|
403
|
+
hyperlink=hyperlink,
|
404
|
+
)
|
312
405
|
|
313
406
|
elif isinstance(element, marko.inline.LineBreak):
|
314
407
|
if self.in_table:
|
@@ -317,7 +410,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
317
410
|
|
318
411
|
elif isinstance(element, marko.block.HTMLBlock):
|
319
412
|
self._html_blocks += 1
|
320
|
-
self._process_inline_text(parent_item, doc)
|
321
413
|
self._close_table(doc)
|
322
414
|
_log.debug(f"HTML Block: {element}")
|
323
415
|
if (
|
@@ -327,14 +419,24 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
327
419
|
|
328
420
|
# wrap in markers to enable post-processing in convert()
|
329
421
|
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
|
330
|
-
doc.add_code(
|
422
|
+
doc.add_code(
|
423
|
+
parent=parent_item,
|
424
|
+
text=text_to_add,
|
425
|
+
formatting=formatting,
|
426
|
+
hyperlink=hyperlink,
|
427
|
+
)
|
331
428
|
else:
|
332
429
|
if not isinstance(element, str):
|
333
430
|
self._close_table(doc)
|
334
431
|
_log.debug(f"Some other element: {element}")
|
335
432
|
|
433
|
+
if (
|
434
|
+
isinstance(element, (marko.block.Paragraph, marko.block.Heading))
|
435
|
+
and len(element.children) > 1
|
436
|
+
):
|
437
|
+
parent_item = doc.add_inline_group(parent=parent_item)
|
438
|
+
|
336
439
|
processed_block_types = (
|
337
|
-
marko.block.Heading,
|
338
440
|
marko.block.CodeBlock,
|
339
441
|
marko.block.FencedCode,
|
340
442
|
marko.inline.RawText,
|
@@ -350,7 +452,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
350
452
|
depth=depth + 1,
|
351
453
|
doc=doc,
|
352
454
|
visited=visited,
|
455
|
+
creation_stack=creation_stack,
|
353
456
|
parent_item=parent_item,
|
457
|
+
formatting=formatting,
|
458
|
+
hyperlink=hyperlink,
|
354
459
|
)
|
355
460
|
|
356
461
|
def is_valid(self) -> bool:
|
@@ -391,8 +496,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
391
496
|
doc=doc,
|
392
497
|
parent_item=None,
|
393
498
|
visited=set(),
|
499
|
+
creation_stack=[],
|
394
500
|
)
|
395
|
-
self._process_inline_text(None, doc) # handle last hanging inline text
|
396
501
|
self._close_table(doc=doc) # handle any last hanging table
|
397
502
|
|
398
503
|
# if HTML blocks were detected, export to HTML and delegate to HTML backend
|
@@ -14,7 +14,7 @@ from docling_core.types.doc import (
|
|
14
14
|
TableCell,
|
15
15
|
TableData,
|
16
16
|
)
|
17
|
-
from docling_core.types.doc.document import Formatting
|
17
|
+
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
|
18
18
|
from docx import Document
|
19
19
|
from docx.document import Document as DocxDocument
|
20
20
|
from docx.oxml.table import CT_Tc
|
@@ -84,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
84
84
|
self.valid = True
|
85
85
|
except Exception as e:
|
86
86
|
raise RuntimeError(
|
87
|
-
f"
|
87
|
+
f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
|
88
88
|
) from e
|
89
89
|
|
90
90
|
@override
|
@@ -251,9 +251,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
251
251
|
self._handle_tables(element, docx_obj, doc)
|
252
252
|
except Exception:
|
253
253
|
_log.debug("could not parse a table, broken docx table")
|
254
|
-
|
254
|
+
# Check for Image
|
255
255
|
elif drawing_blip:
|
256
256
|
self._handle_pictures(docx_obj, drawing_blip, doc)
|
257
|
+
# Check for Text after the Image
|
258
|
+
if (
|
259
|
+
tag_name in ["p"]
|
260
|
+
and element.find(".//w:t", namespaces=namespaces) is not None
|
261
|
+
):
|
262
|
+
self._handle_text_elements(element, docx_obj, doc)
|
257
263
|
# Check for the sdt containers, like table of contents
|
258
264
|
elif tag_name in ["sdt"]:
|
259
265
|
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
@@ -268,6 +274,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
268
274
|
self._handle_text_elements(element, docx_obj, doc)
|
269
275
|
else:
|
270
276
|
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
277
|
+
|
271
278
|
return doc
|
272
279
|
|
273
280
|
def _str_to_int(
|
@@ -390,7 +397,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
390
397
|
if isinstance(c, Hyperlink):
|
391
398
|
text = c.text
|
392
399
|
hyperlink = Path(c.address)
|
393
|
-
format =
|
400
|
+
format = (
|
401
|
+
self._get_format_from_run(c.runs[0])
|
402
|
+
if c.runs and len(c.runs) > 0
|
403
|
+
else None
|
404
|
+
)
|
394
405
|
elif isinstance(c, Run):
|
395
406
|
text = c.text
|
396
407
|
hyperlink = None
|
@@ -578,7 +589,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
578
589
|
all_paragraphs = []
|
579
590
|
|
580
591
|
# Sort paragraphs within each container, then process containers
|
581
|
-
for
|
592
|
+
for paragraphs in container_paragraphs.values():
|
582
593
|
# Sort by vertical position within each container
|
583
594
|
sorted_container_paragraphs = sorted(
|
584
595
|
paragraphs,
|
@@ -689,14 +700,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
689
700
|
doc: DoclingDocument,
|
690
701
|
) -> None:
|
691
702
|
paragraph = Paragraph(element, docx_obj)
|
692
|
-
|
703
|
+
paragraph_elements = self._get_paragraph_elements(paragraph)
|
693
704
|
text, equations = self._handle_equations_in_text(
|
694
705
|
element=element, text=paragraph.text
|
695
706
|
)
|
696
707
|
|
697
708
|
if text is None:
|
698
709
|
return
|
699
|
-
paragraph_elements = self._get_paragraph_elements(paragraph)
|
700
710
|
text = text.strip()
|
701
711
|
|
702
712
|
# Common styles for bullet and numbered lists.
|
@@ -912,6 +922,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
912
922
|
)
|
913
923
|
return
|
914
924
|
|
925
|
+
def _add_formatted_list_item(
|
926
|
+
self,
|
927
|
+
doc: DoclingDocument,
|
928
|
+
elements: list,
|
929
|
+
marker: str,
|
930
|
+
enumerated: bool,
|
931
|
+
level: int,
|
932
|
+
) -> None:
|
933
|
+
# This should not happen by construction
|
934
|
+
if not isinstance(self.parents[level], (OrderedList, UnorderedList)):
|
935
|
+
return
|
936
|
+
if len(elements) == 1:
|
937
|
+
text, format, hyperlink = elements[0]
|
938
|
+
doc.add_list_item(
|
939
|
+
marker=marker,
|
940
|
+
enumerated=enumerated,
|
941
|
+
parent=self.parents[level],
|
942
|
+
text=text,
|
943
|
+
formatting=format,
|
944
|
+
hyperlink=hyperlink,
|
945
|
+
)
|
946
|
+
else:
|
947
|
+
new_item = doc.add_list_item(
|
948
|
+
marker=marker,
|
949
|
+
enumerated=enumerated,
|
950
|
+
parent=self.parents[level],
|
951
|
+
text="",
|
952
|
+
)
|
953
|
+
new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
|
954
|
+
for text, format, hyperlink in elements:
|
955
|
+
doc.add_text(
|
956
|
+
label=DocItemLabel.TEXT,
|
957
|
+
parent=new_parent,
|
958
|
+
text=text,
|
959
|
+
formatting=format,
|
960
|
+
hyperlink=hyperlink,
|
961
|
+
)
|
962
|
+
|
915
963
|
def _add_list_item(
|
916
964
|
self,
|
917
965
|
*,
|
@@ -921,6 +969,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
921
969
|
elements: list,
|
922
970
|
is_numbered: bool = False,
|
923
971
|
) -> None:
|
972
|
+
# TODO: this method is always called with is_numbered. Numbered lists should be properly addressed.
|
973
|
+
if not elements:
|
974
|
+
return None
|
924
975
|
enum_marker = ""
|
925
976
|
|
926
977
|
level = self._get_level()
|
@@ -937,21 +988,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
937
988
|
if is_numbered:
|
938
989
|
enum_marker = str(self.listIter) + "."
|
939
990
|
is_numbered = True
|
940
|
-
|
941
|
-
doc
|
942
|
-
prev_parent=self.parents[level],
|
943
|
-
paragraph_elements=elements,
|
991
|
+
self._add_formatted_list_item(
|
992
|
+
doc, elements, enum_marker, is_numbered, level
|
944
993
|
)
|
945
|
-
for text, format, hyperlink in elements:
|
946
|
-
doc.add_list_item(
|
947
|
-
marker=enum_marker,
|
948
|
-
enumerated=is_numbered,
|
949
|
-
parent=new_parent,
|
950
|
-
text=text,
|
951
|
-
formatting=format,
|
952
|
-
hyperlink=hyperlink,
|
953
|
-
)
|
954
|
-
|
955
994
|
elif (
|
956
995
|
self._prev_numid() == numid
|
957
996
|
and self.level_at_new_list is not None
|
@@ -981,28 +1020,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
981
1020
|
if is_numbered:
|
982
1021
|
enum_marker = str(self.listIter) + "."
|
983
1022
|
is_numbered = True
|
984
|
-
|
985
|
-
|
986
|
-
|
987
|
-
|
988
|
-
|
1023
|
+
self._add_formatted_list_item(
|
1024
|
+
doc,
|
1025
|
+
elements,
|
1026
|
+
enum_marker,
|
1027
|
+
is_numbered,
|
1028
|
+
self.level_at_new_list + ilevel,
|
989
1029
|
)
|
990
|
-
for text, format, hyperlink in elements:
|
991
|
-
doc.add_list_item(
|
992
|
-
marker=enum_marker,
|
993
|
-
enumerated=is_numbered,
|
994
|
-
parent=new_parent,
|
995
|
-
text=text,
|
996
|
-
formatting=format,
|
997
|
-
hyperlink=hyperlink,
|
998
|
-
)
|
999
1030
|
elif (
|
1000
1031
|
self._prev_numid() == numid
|
1001
1032
|
and self.level_at_new_list is not None
|
1002
1033
|
and prev_indent is not None
|
1003
1034
|
and ilevel < prev_indent
|
1004
1035
|
): # Close list
|
1005
|
-
for k
|
1036
|
+
for k in self.parents:
|
1006
1037
|
if k > self.level_at_new_list + ilevel:
|
1007
1038
|
self.parents[k] = None
|
1008
1039
|
|
@@ -1011,20 +1042,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
1011
1042
|
if is_numbered:
|
1012
1043
|
enum_marker = str(self.listIter) + "."
|
1013
1044
|
is_numbered = True
|
1014
|
-
|
1015
|
-
doc
|
1016
|
-
|
1017
|
-
|
1045
|
+
self._add_formatted_list_item(
|
1046
|
+
doc,
|
1047
|
+
elements,
|
1048
|
+
enum_marker,
|
1049
|
+
is_numbered,
|
1050
|
+
self.level_at_new_list + ilevel,
|
1018
1051
|
)
|
1019
|
-
for text, format, hyperlink in elements:
|
1020
|
-
doc.add_list_item(
|
1021
|
-
marker=enum_marker,
|
1022
|
-
enumerated=is_numbered,
|
1023
|
-
parent=new_parent,
|
1024
|
-
text=text,
|
1025
|
-
formatting=format,
|
1026
|
-
hyperlink=hyperlink,
|
1027
|
-
)
|
1028
1052
|
self.listIter = 0
|
1029
1053
|
|
1030
1054
|
elif self._prev_numid() == numid or prev_indent == ilevel:
|
@@ -1033,21 +1057,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
1033
1057
|
if is_numbered:
|
1034
1058
|
enum_marker = str(self.listIter) + "."
|
1035
1059
|
is_numbered = True
|
1036
|
-
|
1037
|
-
doc
|
1038
|
-
prev_parent=self.parents[level - 1],
|
1039
|
-
paragraph_elements=elements,
|
1060
|
+
self._add_formatted_list_item(
|
1061
|
+
doc, elements, enum_marker, is_numbered, level - 1
|
1040
1062
|
)
|
1041
|
-
|
1042
|
-
# Add the list item to the parent group
|
1043
|
-
doc.add_list_item(
|
1044
|
-
marker=enum_marker,
|
1045
|
-
enumerated=is_numbered,
|
1046
|
-
parent=new_parent,
|
1047
|
-
text=text,
|
1048
|
-
formatting=format,
|
1049
|
-
hyperlink=hyperlink,
|
1050
|
-
)
|
1063
|
+
|
1051
1064
|
return
|
1052
1065
|
|
1053
1066
|
def _handle_tables(
|