docling-core 2.24.1__py3-none-any.whl → 2.26.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling_core/experimental/serializer/base.py +23 -2
- docling_core/experimental/serializer/common.py +79 -34
- docling_core/experimental/serializer/doctags.py +83 -47
- docling_core/experimental/serializer/html.py +931 -0
- docling_core/experimental/serializer/html_styles.py +212 -0
- docling_core/experimental/serializer/markdown.py +95 -57
- docling_core/transforms/chunker/base.py +8 -2
- docling_core/transforms/chunker/hierarchical_chunker.py +130 -109
- docling_core/transforms/chunker/hybrid_chunker.py +54 -12
- docling_core/types/doc/base.py +4 -1
- docling_core/types/doc/document.py +738 -490
- docling_core/types/doc/labels.py +2 -0
- docling_core/types/doc/page.py +12 -17
- docling_core/types/doc/tokens.py +3 -0
- {docling_core-2.24.1.dist-info → docling_core-2.26.0.dist-info}/METADATA +1 -1
- {docling_core-2.24.1.dist-info → docling_core-2.26.0.dist-info}/RECORD +19 -17
- {docling_core-2.24.1.dist-info → docling_core-2.26.0.dist-info}/LICENSE +0 -0
- {docling_core-2.24.1.dist-info → docling_core-2.26.0.dist-info}/WHEEL +0 -0
- {docling_core-2.24.1.dist-info → docling_core-2.26.0.dist-info}/entry_points.txt +0 -0
docling_core/types/doc/document.py

@@ -3,7 +3,6 @@
 import base64
 import copy
 import hashlib
-import html
 import itertools
 import json
 import logging
@@ -12,17 +11,12 @@ import os
 import re
 import sys
 import typing
-import warnings
 from enum import Enum
 from io import BytesIO
 from pathlib import Path
 from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
-from urllib.parse import
-from xml.etree.cElementTree import SubElement, tostring
-from xml.sax.saxutils import unescape
+from urllib.parse import unquote

-import latex2mathml.converter
-import latex2mathml.exceptions
 import pandas as pd
 import yaml
 from PIL import Image as PILImage
@@ -49,13 +43,10 @@ from docling_core.types.doc.labels import (
     GraphCellLabel,
     GraphLinkLabel,
     GroupLabel,
+    PictureClassificationLabel,
 )
 from docling_core.types.doc.tokens import _LOC_PREFIX, DocumentToken, TableToken
-from docling_core.types.doc.utils import (
-    get_html_tag_with_text_direction,
-    get_text_direction,
-    relative_path,
-)
+from docling_core.types.doc.utils import relative_path

 _logger = logging.getLogger(__name__)

@@ -290,22 +281,6 @@ class PictureScatterChartData(PictureChartData):
     points: List[ChartPoint]


-PictureDataType = Annotated[
-    Union[
-        PictureClassificationData,
-        PictureDescriptionData,
-        PictureMoleculeData,
-        PictureMiscData,
-        PictureLineChartData,
-        PictureBarChartData,
-        PictureStackedBarChartData,
-        PicturePieChartData,
-        PictureScatterChartData,
-    ],
-    Field(discriminator="kind"),
-]
-
-
 class TableCell(BaseModel):
     """TableCell."""

@@ -391,6 +366,35 @@ class TableData(BaseModel): # TBD
         return table_data


+class PictureTabularChartData(PictureChartData):
+    """Base class for picture chart data.
+
+    Attributes:
+        title (str): The title of the chart.
+        chart_data (TableData): Chart data in the table format.
+    """
+
+    kind: Literal["tabular_chart_data"] = "tabular_chart_data"
+    chart_data: TableData
+
+
+PictureDataType = Annotated[
+    Union[
+        PictureClassificationData,
+        PictureDescriptionData,
+        PictureMoleculeData,
+        PictureMiscData,
+        PictureTabularChartData,
+        PictureLineChartData,
+        PictureBarChartData,
+        PictureStackedBarChartData,
+        PicturePieChartData,
+        PictureScatterChartData,
+    ],
+    Field(discriminator="kind"),
+]
+
+
 class DocumentOrigin(BaseModel):
     """FileSource."""

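The new PictureTabularChartData annotation and the extended PictureDataType union above let a chart picture carry its underlying table. A minimal sketch of attaching one; field names come from this diff and the existing TableCell/TableData models, while the import paths are assumptions:

# Sketch: attach tabular chart data to a picture (illustrative only).
from docling_core.types.doc.document import (
    DoclingDocument,
    PictureTabularChartData,
    TableCell,
    TableData,
)

doc = DoclingDocument(name="chart-demo")
pic = doc.add_picture()

cells = [
    TableCell(text="year", start_row_offset_idx=0, end_row_offset_idx=1,
              start_col_offset_idx=0, end_col_offset_idx=1, column_header=True),
    TableCell(text="sales", start_row_offset_idx=0, end_row_offset_idx=1,
              start_col_offset_idx=1, end_col_offset_idx=2, column_header=True),
    TableCell(text="2024", start_row_offset_idx=1, end_row_offset_idx=2,
              start_col_offset_idx=0, end_col_offset_idx=1),
    TableCell(text="42", start_row_offset_idx=1, end_row_offset_idx=2,
              start_col_offset_idx=1, end_col_offset_idx=2),
]
pic.annotations.append(
    PictureTabularChartData(
        title="sales per year",
        chart_data=TableData(num_rows=2, num_cols=2, table_cells=cells),
    )
)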
@@ -458,8 +462,12 @@ class RefItem(BaseModel):
         populate_by_name=True,
     )

+    def _split_ref_to_path(self):
+        """Get the path of the reference."""
+        return self.cref.split("/")
+
     def resolve(self, doc: "DoclingDocument"):
-        """
+        """Resolve the path in the document."""
         path_components = self.cref.split("/")
         if (num_comps := len(path_components)) == 3:
             _, path, index_str = path_components
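The small _split_ref_to_path helper and the documented resolve() make the reference scheme explicit: a cref such as "#/texts/0" names the collection and the index of an item. A minimal sketch, with import paths assumed:

# Sketch: resolving a JSON-pointer-style reference back to its item.
from docling_core.types.doc.document import DoclingDocument, RefItem
from docling_core.types.doc.labels import DocItemLabel

doc = DoclingDocument(name="ref-demo")
doc.add_text(label=DocItemLabel.TEXT, text="hello")

item = RefItem(cref="#/texts/0").resolve(doc)
assert item.text == "hello"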
@@ -542,25 +550,32 @@ class DocTagsDocument(BaseModel):

     @classmethod
     def from_doctags_and_image_pairs(
-        cls,
+        cls,
+        doctags: typing.Sequence[Union[Path, str]],
+        images: Optional[List[Union[Path, PILImage.Image]]],
     ):
         """from_doctags_and_image_pairs."""
-        if len(doctags) != len(images):
+        if images is not None and len(doctags) != len(images):
             raise ValueError("Number of page doctags must be equal to page images!")
         doctags_doc = cls()

         pages = []
-
+
+        for ix, dt in enumerate(doctags):
             if isinstance(dt, Path):
                 with dt.open("r") as fp:
                     dt = fp.read()
             elif isinstance(dt, str):
                 pass

-
-
-
-
+            img = None
+            if images is not None:
+                img = images[ix]
+
+            if isinstance(img, Path):
+                img = PILImage.open(img)
+            elif isinstance(img, PILImage.Image):
+                pass

             page = DocTagsPage(tokens=dt, image=img)
             pages.append(page)
@@ -568,6 +583,25 @@ class DocTagsDocument(BaseModel):
         doctags_doc.pages = pages
         return doctags_doc

+    @classmethod
+    def from_multipage_doctags_and_images(
+        cls,
+        doctags: Union[Path, str],
+        images: Optional[List[Union[Path, PILImage.Image]]],
+    ):
+        """From doctags with `<page_break>` and corresponding list of page images."""
+        if isinstance(doctags, Path):
+            with doctags.open("r") as fp:
+                doctags = fp.read()
+        dt_list = (
+            doctags.removeprefix(f"<{DocumentToken.DOCUMENT.value}>")
+            .removesuffix(f"</{DocumentToken.DOCUMENT.value}>")
+            .split(f"<{DocumentToken.PAGE_BREAK.value}>")
+        )
+        dt_list = [el.strip() for el in dt_list]
+
+        return cls.from_doctags_and_image_pairs(dt_list, images)
+

 class ProvenanceItem(BaseModel):
     """ProvenanceItem."""
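from_multipage_doctags_and_images splits a single DocTags string on the <page_break> token and reuses the pairwise constructor. A minimal sketch, assuming the usual <doctag> wrapper and <loc_...> coordinate tokens:

# Sketch: one multi-page DocTags string, no page images.
from docling_core.types.doc.document import DocTagsDocument

doctags = (
    "<doctag><text><loc_10><loc_10><loc_200><loc_20>page one</text><page_break>"
    "<text><loc_10><loc_10><loc_200><loc_20>page two</text></doctag>"
)
dt_doc = DocTagsDocument.from_multipage_doctags_and_images(doctags, images=None)
print(len(dt_doc.pages))  # -> 2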
@@ -598,10 +632,98 @@ class NodeItem(BaseModel):

     model_config = ConfigDict(extra="forbid")

-    def get_ref(self):
+    def get_ref(self) -> RefItem:
         """get_ref."""
         return RefItem(cref=self.self_ref)

+    def _get_parent_ref(
+        self, doc: "DoclingDocument", stack: list[int]
+    ) -> Optional[RefItem]:
+        """get_parent_ref."""
+        if len(stack) == 0:
+            return self.parent
+        elif len(stack) > 0 and stack[0] < len(self.children):
+            item = self.children[stack[0]].resolve(doc)
+            return item._get_parent_ref(doc=doc, stack=stack[1:])
+
+        return None
+
+    def _delete_child(self, doc: "DoclingDocument", stack: list[int]) -> bool:
+        """Delete child node in tree."""
+        if len(stack) == 1 and stack[0] < len(self.children):
+            del self.children[stack[0]]
+            return True
+        elif len(stack) > 1 and stack[0] < len(self.children):
+            item = self.children[stack[0]].resolve(doc)
+            return item._delete_child(doc=doc, stack=stack[1:])
+
+        return False
+
+    def _update_child(
+        self, doc: "DoclingDocument", stack: list[int], new_ref: RefItem
+    ) -> bool:
+        """Update child node in tree."""
+        if len(stack) == 1 and stack[0] < len(self.children):
+            # ensure the parent is correct
+            new_item = new_ref.resolve(doc=doc)
+            new_item.parent = self.get_ref()
+
+            self.children[stack[0]] = new_ref
+            return True
+        elif len(stack) > 1 and stack[0] < len(self.children):
+            item = self.children[stack[0]].resolve(doc)
+            return item._update_child(doc=doc, stack=stack[1:], new_ref=new_ref)
+
+        return False
+
+    def _add_child(
+        self, doc: "DoclingDocument", stack: list[int], new_ref: RefItem
+    ) -> bool:
+        """Append child to node identified by stack."""
+        if len(stack) == 0:
+
+            # ensure the parent is correct
+            new_item = new_ref.resolve(doc=doc)
+            new_item.parent = self.get_ref()
+
+            self.children.append(new_ref)
+            return True
+        elif len(stack) > 0 and stack[0] < len(self.children):
+            item = self.children[stack[0]].resolve(doc)
+            return item._add_child(doc=doc, stack=stack[1:], new_ref=new_ref)
+
+        return False
+
+    def _add_sibling(
+        self,
+        doc: "DoclingDocument",
+        stack: list[int],
+        new_ref: RefItem,
+        after: bool = True,
+    ) -> bool:
+        """Add sibling node in tree."""
+        if len(stack) == 1 and stack[0] < len(self.children) and (not after):
+            # ensure the parent is correct
+            new_item = new_ref.resolve(doc=doc)
+            new_item.parent = self.get_ref()
+
+            self.children.insert(stack[0], new_ref)
+            return True
+        elif len(stack) == 1 and stack[0] < len(self.children) and (after):
+            # ensure the parent is correct
+            new_item = new_ref.resolve(doc=doc)
+            new_item.parent = self.get_ref()
+
+            self.children.insert(stack[0] + 1, new_ref)
+            return True
+        elif len(stack) > 1 and stack[0] < len(self.children):
+            item = self.children[stack[0]].resolve(doc)
+            return item._add_sibling(
+                doc=doc, stack=stack[1:], new_ref=new_ref, after=after
+            )
+
+        return False
+

 class GroupItem(NodeItem):  # Container type, can't be a leaf node
     """GroupItem."""
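The private helpers above address a node by a "stack" of child indices measured from doc.body. A sketch of what such a stack denotes, walking it manually (the helpers themselves are internal; import paths assumed):

# Sketch: a stack like [0, 0] means body.children[0], then its children[0].
from docling_core.types.doc.document import DoclingDocument
from docling_core.types.doc.labels import DocItemLabel, GroupLabel

doc = DoclingDocument(name="stack-demo")
group = doc.add_group(label=GroupLabel.SECTION, name="sec")
doc.add_text(label=DocItemLabel.TEXT, text="inside", parent=group)

node = doc.body
for index in [0, 0]:
    node = node.children[index].resolve(doc)
print(node.text)  # -> "inside"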
@@ -722,7 +844,9 @@ class TextItem(DocItem):
     text: str  # sanitized representation

     formatting: Optional[Formatting] = None
-    hyperlink: Optional[Union[AnyUrl, Path]] =
+    hyperlink: Optional[Union[AnyUrl, Path]] = Field(
+        union_mode="left_to_right", default=None
+    )

     @deprecated("Use export_to_doctags() instead.")
     def export_to_document_tokens(self, *args, **kwargs):
@@ -925,7 +1049,9 @@ class FormulaItem(TextItem):
 class PictureItem(FloatingItem):
     """PictureItem."""

-    label: typing.Literal[DocItemLabel.PICTURE] =
+    label: typing.Literal[DocItemLabel.PICTURE, DocItemLabel.CHART] = (
+        DocItemLabel.PICTURE
+    )

     annotations: List[PictureDataType] = []

@@ -992,54 +1118,19 @@ class PictureItem(FloatingItem):
         image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
     ) -> str:
         """Export picture to HTML format."""
-
-
-
-
-        caption_text = ""
-        if len(text) > 0:
-            caption_text = get_html_tag_with_text_direction(
-                html_tag="figcaption", text=text
-            )
-
-        default_response = f"<figure>{caption_text}</figure>"
-
-        if image_mode == ImageRefMode.PLACEHOLDER:
-            return default_response
-
-        elif image_mode == ImageRefMode.EMBEDDED:
-            # short-cut: we already have the image in base64
-            if (
-                isinstance(self.image, ImageRef)
-                and isinstance(self.image.uri, AnyUrl)
-                and self.image.uri.scheme == "data"
-            ):
-                img_text = f'<img src="{self.image.uri}">'
-                return f"<figure>{caption_text}{img_text}</figure>"
-
-            # get the self.image._pil or crop it out of the page-image
-            img = self.get_image(doc)
-
-            if img is not None:
-                imgb64 = self._image_to_base64(img)
-                img_text = f'<img src="data:image/png;base64,{imgb64}">'
-
-                return f"<figure>{caption_text}{img_text}</figure>"
-            else:
-                return default_response
-
-        elif image_mode == ImageRefMode.REFERENCED:
-
-            if not isinstance(self.image, ImageRef) or (
-                isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "data"
-            ):
-                return default_response
-
-            img_text = f'<img src="{quote(str(self.image.uri))}">'
-            return f"<figure>{caption_text}{img_text}</figure>"
+        from docling_core.experimental.serializer.html import (
+            HTMLDocSerializer,
+            HTMLParams,
+        )

-
-
+        serializer = HTMLDocSerializer(
+            doc=doc,
+            params=HTMLParams(
+                image_mode=image_mode,
+            ),
+        )
+        text = serializer.serialize(item=self).text
+        return text

     @deprecated("Use export_to_doctags() instead.")
     def export_to_document_tokens(self, *args, **kwargs):
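Picture HTML export is now a thin wrapper over the experimental HTMLDocSerializer shown in the added lines. A minimal sketch of the call; the exact markup produced depends on the serializer, and import paths are assumptions:

# Sketch: per-item HTML export through the serializer-backed method.
from docling_core.types.doc.document import DoclingDocument, ImageRefMode

doc = DoclingDocument(name="pic-demo")
pic = doc.add_picture()
fragment = pic.export_to_html(doc=doc, image_mode=ImageRefMode.PLACEHOLDER)
print(fragment)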
@@ -1190,81 +1281,18 @@ class TableItem(FloatingItem):
         add_caption: bool = True,
     ) -> str:
         """Export the table as html."""
-        if doc is None:
-
-                "The `doc` argument will be mandatory in a future version. "
-                "It must be provided to include a caption.",
-                DeprecationWarning,
-            )
-
-        nrows = self.data.num_rows
-        ncols = self.data.num_cols
-
-        text = ""
-        if doc is not None and add_caption and len(self.captions):
-            text = html.escape(self.caption_text(doc))
-
-        if len(self.data.table_cells) == 0:
-            return ""
-
-        body = ""
-
-        for i in range(nrows):
-            body += "<tr>"
-            for j in range(ncols):
-                cell: TableCell = self.data.grid[i][j]
-
-                rowspan, rowstart = (
-                    cell.row_span,
-                    cell.start_row_offset_idx,
-                )
-                colspan, colstart = (
-                    cell.col_span,
-                    cell.start_col_offset_idx,
-                )
-
-                if rowstart != i:
-                    continue
-                if colstart != j:
-                    continue
-
-                content = html.escape(cell.text.strip())
-                celltag = "td"
-                if cell.column_header:
-                    celltag = "th"
-
-                opening_tag = f"{celltag}"
-                if rowspan > 1:
-                    opening_tag += f' rowspan="{rowspan}"'
-                if colspan > 1:
-                    opening_tag += f' colspan="{colspan}"'
-
-                text_dir = get_text_direction(content)
-                if text_dir == "rtl":
-                    opening_tag += f' dir="{dir}"'
-
-                body += f"<{opening_tag}>{content}</{celltag}>"
-            body += "</tr>"
-
-        # dir = get_text_direction(text)
-
-        if len(text) > 0 and len(body) > 0:
-            caption_text = get_html_tag_with_text_direction(
-                html_tag="caption", text=text
-            )
-            body = f"<table>{caption_text}<tbody>{body}</tbody></table>"
+        if doc is not None:
+            from docling_core.experimental.serializer.html import HTMLDocSerializer

-
-
-
-            caption_text = get_html_tag_with_text_direction(
-                html_tag="caption", text=text
-            )
-            body = f"<table>{caption_text}</table>"
+            serializer = HTMLDocSerializer(doc=doc)
+            text = serializer.serialize(item=self).text
+            return text
         else:
-
-
-
+            _logger.error(
+                "Usage of TableItem.export_to_html() without `doc` argument is "
+                "deprecated.",
+            )
+            return ""

     def export_to_otsl(
         self,
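Table HTML export now requires the owning document; without it the method only logs an error and returns an empty string. A minimal sketch, with import paths assumed:

# Sketch: the doc argument is effectively mandatory for table HTML export now.
from docling_core.types.doc.document import DoclingDocument, TableData

doc = DoclingDocument(name="table-demo")
table = doc.add_table(data=TableData())

print(table.export_to_html(doc=doc))  # serializer-based <table> markup
print(table.export_to_html())         # "" plus an error log, per the new branch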
@@ -1539,76 +1567,6 @@ class PageItem(BaseModel):
 class DoclingDocument(BaseModel):
     """DoclingDocument."""

-    _HTML_DEFAULT_HEAD: str = r"""<head>
-    <link rel="icon" type="image/png"
-    href="https://raw.githubusercontent.com/docling-project/docling/refs/heads/main/docs/assets/logo.svg"/>
-    <meta charset="UTF-8">
-    <title>
-    Powered by Docling
-    </title>
-    <style>
-    html {
-    background-color: LightGray;
-    }
-    body {
-    margin: 0 auto;
-    width:800px;
-    padding: 30px;
-    background-color: White;
-    font-family: Arial, sans-serif;
-    box-shadow: 10px 10px 10px grey;
-    }
-    figure{
-    display: block;
-    width: 100%;
-    margin: 0px;
-    margin-top: 10px;
-    margin-bottom: 10px;
-    }
-    img {
-    display: block;
-    margin: auto;
-    margin-top: 10px;
-    margin-bottom: 10px;
-    max-width: 640px;
-    max-height: 640px;
-    }
-    table {
-    min-width:500px;
-    background-color: White;
-    border-collapse: collapse;
-    cell-padding: 5px;
-    margin: auto;
-    margin-top: 10px;
-    margin-bottom: 10px;
-    }
-    th, td {
-    border: 1px solid black;
-    padding: 8px;
-    }
-    th {
-    font-weight: bold;
-    }
-    table tr:nth-child(even) td{
-    background-color: LightGray;
-    }
-    math annotation {
-    display: none;
-    }
-    .formula-not-decoded {
-    background: repeating-linear-gradient(
-    45deg, /* Angle of the stripes */
-    LightGray, /* First color */
-    LightGray 10px, /* Length of the first color */
-    White 10px, /* Second color */
-    White 20px /* Length of the second color */
-    );
-    margin: 0;
-    text-align: center;
-    }
-    </style>
-    </head>"""
-
     schema_name: typing.Literal["DoclingDocument"] = "DoclingDocument"
     version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
         CURRENT_VERSION
@@ -1655,6 +1613,364 @@ class DoclingDocument(BaseModel):
                     item["content_layer"] = "furniture"
         return data

+    # ---------------------------
+    # Public Manipulation methods
+    # ---------------------------
+
+    def append_child_item(
+        self, *, child: NodeItem, parent: Optional[NodeItem] = None
+    ) -> None:
+        """Adds an item."""
+        if len(child.children) > 0:
+            raise ValueError("Can not append a child with children")
+
+        parent = parent if parent is not None else self.body
+
+        success, stack = self._get_stack_of_item(item=parent)
+
+        if not success:
+            raise ValueError(
+                f"Could not resolve the parent node in the document tree: {parent}"
+            )
+
+        # Append the item to the attributes of the doc
+        self._append_item(item=child, parent_ref=parent.get_ref())
+
+        # Update the tree of the doc
+        success = self.body._add_child(doc=self, new_ref=child.get_ref(), stack=stack)
+
+        # Clean the attribute (orphan) if not successful
+        if not success:
+            self._pop_item(item=child)
+            raise ValueError(f"Could not append child: {child} to parent: {parent}")
+
+    def insert_item_after_sibling(
+        self, *, new_item: NodeItem, sibling: NodeItem
+    ) -> None:
+        """Inserts an item, given its node_item instance, after other as a sibling."""
+        self._insert_item_at_refitem(item=new_item, ref=sibling.get_ref(), after=True)
+
+    def insert_item_before_sibling(
+        self, *, new_item: NodeItem, sibling: NodeItem
+    ) -> None:
+        """Inserts an item, given its node_item instance, before other as a sibling."""
+        self._insert_item_at_refitem(item=new_item, ref=sibling.get_ref(), after=False)
+
+    def delete_items(self, *, node_items: List[NodeItem]) -> None:
+        """Deletes an item, given its instance or ref, and any children it has."""
+        refs = []
+        for _ in node_items:
+            refs.append(_.get_ref())
+
+        self._delete_items(refs=refs)
+
+    def replace_item(self, *, new_item: NodeItem, old_item: NodeItem) -> None:
+        """Replace item with new item."""
+        self.insert_item_after_sibling(new_item=new_item, sibling=old_item)
+        self.delete_items(node_items=[old_item])
+
+    # ----------------------------
+    # Private Manipulation methods
+    # ----------------------------
+
+    def _get_stack_of_item(self, item: NodeItem) -> tuple[bool, list[int]]:
+        """Find the stack indices of the item."""
+        return self._get_stack_of_refitem(ref=item.get_ref())
+
+    def _get_stack_of_refitem(self, ref: RefItem) -> tuple[bool, list[int]]:
+        """Find the stack indices of the reference."""
+        if ref == self.body.get_ref():
+            return (True, [])
+
+        node = ref.resolve(doc=self)
+        parent_ref = node._get_parent_ref(doc=self, stack=[])
+
+        if parent_ref is None:
+            return (False, [])
+
+        stack: list[int] = []
+        while parent_ref is not None:
+            parent = parent_ref.resolve(doc=self)
+
+            index = parent.children.index(node.get_ref())
+            stack.insert(0, index)  # prepend the index
+
+            node = parent
+            parent_ref = node._get_parent_ref(doc=self, stack=[])
+
+        return (True, stack)
+
+    def _insert_item_at_refitem(
+        self, item: NodeItem, ref: RefItem, after: bool
+    ) -> RefItem:
+        """Insert node-item using the self-reference."""
+        success, stack = self._get_stack_of_refitem(ref=ref)
+
+        if not success:
+            raise ValueError(
+                f"Could not insert at {ref.cref}: could not find the stack"
+            )
+
+        return self._insert_item_at_stack(item=item, stack=stack, after=after)
+
+    def _append_item(self, *, item: NodeItem, parent_ref: RefItem) -> RefItem:
+        """Append item of its type."""
+        cref: str = ""  # to be updated
+
+        if isinstance(item, TextItem):
+            item_label = "texts"
+            item_index = len(self.texts)
+
+            cref = f"#/{item_label}/{item_index}"
+
+            item.self_ref = cref
+            item.parent = parent_ref
+
+            self.texts.append(item)
+
+        elif isinstance(item, TableItem):
+            item_label = "tables"
+            item_index = len(self.tables)
+
+            cref = f"#/{item_label}/{item_index}"
+
+            item.self_ref = cref
+            item.parent = parent_ref
+
+            self.tables.append(item)
+
+        elif isinstance(item, PictureItem):
+            item_label = "pictures"
+            item_index = len(self.pictures)
+
+            cref = f"#/{item_label}/{item_index}"
+
+            item.self_ref = cref
+            item.parent = parent_ref
+
+            self.pictures.append(item)
+
+        elif isinstance(item, KeyValueItem):
+            item_label = "key_value_items"
+            item_index = len(self.key_value_items)
+
+            cref = f"#/{item_label}/{item_index}"
+
+            item.self_ref = cref
+            item.parent = parent_ref
+
+            self.key_value_items.append(item)
+
+        elif isinstance(item, FormItem):
+            item_label = "form_items"
+            item_index = len(self.form_items)
+
+            cref = f"#/{item_label}/{item_index}"
+
+            item.self_ref = cref
+            item.parent = parent_ref
+
+            self.form_items.append(item)
+        else:
+            raise ValueError(f"Item {item} is not supported for insertion")
+
+        return RefItem(cref=cref)
+
+    def _pop_item(self, *, item: NodeItem):
+        """Pop the last item of its type."""
+        path = item.self_ref.split("/")
+
+        if len(path) != 3:
+            raise ValueError(f"Can not pop item with path: {path}")
+
+        item_label = path[1]
+        item_index = int(path[2])
+
+        if (
+            len(self.__getattribute__(item_label)) + 1 == item_index
+        ):  # we can only pop the last item
+            del self.__getattribute__(item_label)[item_index]
+        else:
+            msg = f"index:{item_index}, len:{len(self.__getattribute__(item_label))}"
+            raise ValueError(f"Failed to pop: item is not last ({msg})")
+
+    def _insert_item_at_stack(
+        self, item: NodeItem, stack: list[int], after: bool
+    ) -> RefItem:
+        """Insert node-item using the self-reference."""
+        parent_ref = self.body._get_parent_ref(doc=self, stack=stack)
+
+        if parent_ref is None:
+            raise ValueError(f"Could not find a parent at stack: {stack}")
+
+        new_ref = self._append_item(item=item, parent_ref=parent_ref)
+
+        success = self.body._add_sibling(
+            doc=self, stack=stack, new_ref=new_ref, after=after
+        )
+
+        if not success:
+            self._pop_item(item=item)
+
+        return item.get_ref()
+
+    def _delete_items(self, refs: list[RefItem]) -> bool:
+        """Delete document item using the self-reference."""
+        to_be_deleted_items: dict[tuple[int, ...], str] = {}  # stack to cref
+
+        # Identify the to_be_deleted_items
+        for item, stack in self._iterate_items_with_stack(with_groups=True):
+            ref = item.get_ref()
+
+            if ref in refs:
+                to_be_deleted_items[tuple(stack)] = ref.cref
+
+            substacks = [stack[0 : i + 1] for i in range(len(stack) - 1)]
+            for substack in substacks:
+                if tuple(substack) in to_be_deleted_items:
+                    to_be_deleted_items[tuple(stack)] = ref.cref
+
+        if len(to_be_deleted_items) == 0:
+            raise ValueError("Nothing to be deleted ...")
+
+        # Clean the tree, reverse the order to not have to update
+        for stack_, ref_ in reversed(sorted(to_be_deleted_items.items())):
+            success = self.body._delete_child(doc=self, stack=list(stack_))
+
+            if not success:
+                del to_be_deleted_items[stack_]
+            else:
+                _logger.info(f"deleted item in tree at stack: {stack_} => {ref_}")
+
+        # Create a new lookup of the orphans:
+        # dict of item_label (`texts`, `tables`, ...) to a
+        # dict of item_label with delta (default = -1).
+        lookup: dict[str, dict[int, int]] = {}
+
+        for stack_, ref_ in to_be_deleted_items.items():
+            path = ref_.split("/")
+            if len(path) == 3:
+
+                item_label = path[1]
+                item_index = int(path[2])
+
+                if item_label not in lookup:
+                    lookup[item_label] = {}
+
+                lookup[item_label][item_index] = -1
+
+        # Remove the orphans in reverse order
+        for item_label, item_inds in lookup.items():
+            for item_index, val in reversed(
+                sorted(item_inds.items())
+            ):  # make sure you delete the last in the list first!
+                _logger.debug(f"deleting item in doc for {item_label} for {item_index}")
+                del self.__getattribute__(item_label)[item_index]
+
+        self._update_breadth_first_with_lookup(
+            node=self.body, refs_to_be_deleted=refs, lookup=lookup
+        )
+
+        return True
+
+    # Update the references
+    def _update_ref_with_lookup(
+        self, item_label: str, item_index: int, lookup: dict[str, dict[int, int]]
+    ) -> RefItem:
+        """Update ref with lookup."""
+        if item_label not in lookup:  # Nothing to be done
+            return RefItem(cref=f"#/{item_label}/{item_index}")
+
+        # Count how many items have been deleted in front of you
+        delta = sum(
+            val if item_index >= key else 0 for key, val in lookup[item_label].items()
+        )
+        new_index = item_index + delta
+
+        return RefItem(cref=f"#/{item_label}/{new_index}")
+
+    def _update_refitems_with_lookup(
+        self,
+        ref_items: list[RefItem],
+        refs_to_be_deleted: list[RefItem],
+        lookup: dict[str, dict[int, int]],
+    ) -> list[RefItem]:
+        """Update refitems with lookup."""
+        new_refitems = []
+        for ref_item in ref_items:
+
+            if (
+                ref_item not in refs_to_be_deleted
+            ):  # if ref_item is in ref, then delete/skip them
+                path = ref_item._split_ref_to_path()
+                if len(path) == 3:
+                    new_refitems.append(
+                        self._update_ref_with_lookup(
+                            item_label=path[1],
+                            item_index=int(path[2]),
+                            lookup=lookup,
+                        )
+                    )
+                else:
+                    new_refitems.append(ref_item)
+
+        return new_refitems
+
+    def _update_breadth_first_with_lookup(
+        self,
+        node: NodeItem,
+        refs_to_be_deleted: list[RefItem],
+        lookup: dict[str, dict[int, int]],
+    ):
+        """Update breadth first with lookup."""
+        # Update the captions, references and footnote references
+        if isinstance(node, FloatingItem):
+            node.captions = self._update_refitems_with_lookup(
+                ref_items=node.captions,
+                refs_to_be_deleted=refs_to_be_deleted,
+                lookup=lookup,
+            )
+            node.references = self._update_refitems_with_lookup(
+                ref_items=node.references,
+                refs_to_be_deleted=refs_to_be_deleted,
+                lookup=lookup,
+            )
+            node.footnotes = self._update_refitems_with_lookup(
+                ref_items=node.footnotes,
+                refs_to_be_deleted=refs_to_be_deleted,
+                lookup=lookup,
+            )
+
+        # Update the self_ref reference
+        if node.parent is not None:
+            path = node.parent._split_ref_to_path()
+            if len(path) == 3:
+                node.parent = self._update_ref_with_lookup(
+                    item_label=path[1], item_index=int(path[2]), lookup=lookup
+                )
+
+        # Update the parent reference
+        if node.self_ref is not None:
+            path = node.self_ref.split("/")
+            if len(path) == 3:
+                _ref = self._update_ref_with_lookup(
+                    item_label=path[1], item_index=int(path[2]), lookup=lookup
+                )
+                node.self_ref = _ref.cref
+
+        # Update the child references
+        node.children = self._update_refitems_with_lookup(
+            ref_items=node.children,
+            refs_to_be_deleted=refs_to_be_deleted,
+            lookup=lookup,
+        )
+
+        for i, child_ref in enumerate(node.children):
+            node = child_ref.resolve(self)
+            self._update_breadth_first_with_lookup(
+                node=node, refs_to_be_deleted=refs_to_be_deleted, lookup=lookup
+            )
+
     ###################################
     # TODO: refactor add* methods below
     ###################################
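The new public manipulation methods edit the document tree in place: append under a parent, insert relative to a sibling, delete subtrees, or replace an item. A sketch under those method names; constructing a TextItem directly, with the field values shown, is an assumption made for illustration:

# Sketch: editing an existing DoclingDocument with the new manipulation API.
from docling_core.types.doc.document import DoclingDocument, TextItem
from docling_core.types.doc.labels import DocItemLabel

doc = DoclingDocument(name="edit-demo")
old = doc.add_text(label=DocItemLabel.TEXT, text="draft paragraph")

new = TextItem(
    self_ref="#/",  # placeholder; the document assigns the real reference
    label=DocItemLabel.TEXT,
    orig="final paragraph",
    text="final paragraph",
)
doc.replace_item(new_item=new, old_item=old)  # insert after sibling, then delete old

appendix = TextItem(
    self_ref="#/", label=DocItemLabel.TEXT, orig="appendix", text="appendix"
)
doc.append_child_item(child=appendix)  # parent defaults to doc.body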
@@ -2293,21 +2609,33 @@ class DoclingDocument(BaseModel):
         included_content_layers: Optional[set[ContentLayer]] = None,
         _level: int = 0,  # fixed parameter, carries through the node nesting level
     ) -> typing.Iterable[Tuple[NodeItem, int]]:  # tuple of node and level
-        """
-
-
-
-
-
-
-        :
-
-
+        """Iterate elements with level."""
+        for item, stack in self._iterate_items_with_stack(
+            root=root,
+            with_groups=with_groups,
+            traverse_pictures=traverse_pictures,
+            page_no=page_no,
+            included_content_layers=included_content_layers,
+        ):
+            yield item, len(stack)
+
+    def _iterate_items_with_stack(
+        self,
+        root: Optional[NodeItem] = None,
+        with_groups: bool = False,
+        traverse_pictures: bool = False,
+        page_no: Optional[int] = None,
+        included_content_layers: Optional[set[ContentLayer]] = None,
+        _stack: Optional[list[int]] = None,
+    ) -> typing.Iterable[Tuple[NodeItem, list[int]]]:  # tuple of node and level
+        """Iterate elements with stack."""
         my_layers = (
             included_content_layers
             if included_content_layers is not None
             else DEFAULT_CONTENT_LAYERS
         )
+        my_stack: list[int] = _stack if _stack is not None else []
+
         if not root:
             root = self.body

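iterate_items keeps its (item, level) contract for callers; the level is now simply the length of the internal child-index stack. A minimal sketch, with import paths assumed:

# Sketch: public iteration is unchanged for callers.
from docling_core.types.doc.document import DoclingDocument
from docling_core.types.doc.labels import DocItemLabel

doc = DoclingDocument(name="walk-demo")
doc.add_text(label=DocItemLabel.TEXT, text="hello")

for item, level in doc.iterate_items(with_groups=True):
    print(level, item.self_ref)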
@@ -2327,25 +2655,31 @@ class DoclingDocument(BaseModel):
         )

         if should_yield:
-            yield root,
+            yield root, my_stack

         # Handle picture traversal - only traverse children if requested
         if isinstance(root, PictureItem) and not traverse_pictures:
             return

+        my_stack.append(-1)
+
         # Traverse children
-        for child_ref in root.children:
+        for child_ind, child_ref in enumerate(root.children):
+            my_stack[-1] = child_ind
             child = child_ref.resolve(self)
+
             if isinstance(child, NodeItem):
-                yield from self.
+                yield from self._iterate_items_with_stack(
                     child,
                     with_groups=with_groups,
                     traverse_pictures=traverse_pictures,
                     page_no=page_no,
-
+                    _stack=my_stack,
                     included_content_layers=my_layers,
                 )

+        my_stack.pop()
+
     def _clear_picture_pil_cache(self):
         """Clear cache storage of all images."""
         for item, level in self.iterate_items(with_groups=False):
@@ -2618,6 +2952,7 @@ class DoclingDocument(BaseModel):
         strict_text: bool = False,
         escape_underscores: bool = True,
         image_placeholder: str = "<!-- image -->",
+        enable_chart_tables: bool = True,
         image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
         indent: int = 4,
         text_width: int = -1,
@@ -2685,6 +3020,7 @@ class DoclingDocument(BaseModel):
             stop_idx=to_element,
             escape_underscores=escape_underscores,
             image_placeholder=image_placeholder,
+            enable_chart_tables=enable_chart_tables,
             image_mode=image_mode,
             indent=indent,
             wrap_width=text_width if text_width > 0 else None,
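The new enable_chart_tables flag is threaded into the Markdown serializer parameters above, so charts that carry tabular data can be emitted as Markdown tables or suppressed. A minimal sketch:

# Sketch: toggling chart tables in Markdown export.
from docling_core.types.doc.document import DoclingDocument

doc = DoclingDocument(name="md-demo")
md_default = doc.export_to_markdown()                         # chart tables on
md_plain = doc.export_to_markdown(enable_chart_tables=False)  # placeholder only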
@@ -2735,12 +3071,14 @@ class DoclingDocument(BaseModel):
         formula_to_mathml: bool = True,
         page_no: Optional[int] = None,
         html_lang: str = "en",
-        html_head: str =
+        html_head: str = "null",  # should be deprecated
         included_content_layers: Optional[set[ContentLayer]] = None,
+        split_page_view: bool = False,
     ):
         """Save to HTML."""
         if isinstance(filename, str):
             filename = Path(filename)
+
         artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)

         if image_mode == ImageRefMode.REFERENCED:
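save_as_html forwards the new split_page_view flag to export_to_html, where it selects the split-page output style. A minimal sketch; the file name is illustrative and import paths are assumed:

# Sketch: writing a side-by-side page/content HTML view.
from docling_core.types.doc.document import DoclingDocument, ImageRefMode

doc = DoclingDocument(name="html-demo")
doc.save_as_html("out.html", image_mode=ImageRefMode.EMBEDDED, split_page_view=True)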
@@ -2760,6 +3098,7 @@ class DoclingDocument(BaseModel):
             html_lang=html_lang,
             html_head=html_head,
             included_content_layers=included_content_layers,
+            split_page_view=split_page_view,
         )

         with open(filename, "w", encoding="utf-8") as fw:
@@ -2808,245 +3147,51 @@ class DoclingDocument(BaseModel):
         formula_to_mathml: bool = True,
         page_no: Optional[int] = None,
         html_lang: str = "en",
-        html_head: str =
+        html_head: str = "null",  # should be deprecated ...
         included_content_layers: Optional[set[ContentLayer]] = None,
+        split_page_view: bool = False,
     ) -> str:
         r"""Serialize to HTML."""
-
+        from docling_core.experimental.serializer.html import (
+            HTMLDocSerializer,
+            HTMLOutputStyle,
+            HTMLParams,
+        )
+
+        my_labels = labels if labels is not None else DOCUMENT_TOKENS_EXPORT_LABELS
         my_layers = (
             included_content_layers
             if included_content_layers is not None
             else DEFAULT_CONTENT_LAYERS
         )

-
-
-
-            in_ordered_list: List[bool],
-            html_texts: list[str],
-        ):
-
-            if len(in_ordered_list) == 0:
-                return (in_ordered_list, html_texts)
-
-            while curr_level < prev_level and len(in_ordered_list) > 0:
-                if in_ordered_list[-1]:
-                    html_texts.append("</ol>")
-                else:
-                    html_texts.append("</ul>")
-
-                prev_level -= 1
-                in_ordered_list.pop()  # = in_ordered_list[:-1]
-
-            return (in_ordered_list, html_texts)
-
-        head_lines = [
-            "<!DOCTYPE html>",
-            f'<html lang="{html_lang}">',
-            html_head,
-        ]
-        html_texts: list[str] = []
-
-        prev_level = 0  # Track the previous item's level
-
-        in_ordered_list: List[bool] = []  # False
-
-        def _prepare_tag_content(
-            text: str, do_escape_html=True, do_replace_newline=True
-        ) -> str:
-            if do_escape_html:
-                text = html.escape(text, quote=False)
-            if do_replace_newline:
-                text = text.replace("\n", "<br>")
-            return text
-
-        for ix, (item, curr_level) in enumerate(
-            self.iterate_items(
-                self.body,
-                with_groups=True,
-                page_no=page_no,
-                included_content_layers=my_layers,
-            )
-        ):
-            # If we've moved to a lower level, we're exiting one or more groups
-            if curr_level < prev_level and len(in_ordered_list) > 0:
-                # Calculate how many levels we've exited
-                # level_difference = previous_level - level
-                # Decrement list_nesting_level for each list group we've exited
-                # list_nesting_level = max(0, list_nesting_level - level_difference)
-
-                in_ordered_list, html_texts = close_lists(
-                    curr_level=curr_level,
-                    prev_level=prev_level,
-                    in_ordered_list=in_ordered_list,
-                    html_texts=html_texts,
-                )
-
-                prev_level = curr_level  # Update previous_level for next iteration
+        output_style = HTMLOutputStyle.SINGLE_COLUMN
+        if split_page_view:
+            output_style = HTMLOutputStyle.SPLIT_PAGE

-
-
-            if
-
-
-
-
-
-
-
-
-
-                # Increment list nesting level when entering a new list
-                in_ordered_list.append(True)
-
-            elif isinstance(item, GroupItem) and item.label in [
-                GroupLabel.LIST,
-            ]:
-
-                text = "<ul>"
-                html_texts.append(text)
-
-                # Increment list nesting level when entering a new list
-                in_ordered_list.append(False)
-
-            elif isinstance(item, GroupItem):
-                continue
-
-            elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
-                text_inner = _prepare_tag_content(item.text)
-                text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)
-
-                html_texts.append(text)
-
-            elif isinstance(item, SectionHeaderItem):
-
-                section_level: int = min(item.level + 1, 6)
-
-                text = get_html_tag_with_text_direction(
-                    html_tag=f"h{section_level}",
-                    text=_prepare_tag_content(item.text),
-                )
-                html_texts.append(text)
-
-            elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
-
-                math_formula = _prepare_tag_content(
-                    item.text, do_escape_html=False, do_replace_newline=False
-                )
-                text = ""
-
-                def _image_fallback(item: TextItem):
-                    item_image = item.get_image(doc=self)
-                    if item_image is not None:
-                        img_ref = ImageRef.from_pil(item_image, dpi=72)
-                        return (
-                            "<figure>"
-                            f'<img src="{img_ref.uri}" alt="{item.orig}" />'
-                            "</figure>"
-                        )
-
-                img_fallback = _image_fallback(item)
-
-                # If the formula is not processed correcty, use its image
-                if (
-                    item.text == ""
-                    and item.orig != ""
-                    and image_mode == ImageRefMode.EMBEDDED
-                    and len(item.prov) > 0
-                    and img_fallback is not None
-                ):
-                    text = img_fallback
-
-                # Building a math equation in MathML format
-                # ref https://www.w3.org/TR/wai-aria-1.1/#math
-                elif formula_to_mathml and len(math_formula) > 0:
-                    try:
-                        mathml_element = latex2mathml.converter.convert_to_element(
-                            math_formula, display="block"
-                        )
-                        annotation = SubElement(
-                            mathml_element, "annotation", dict(encoding="TeX")
-                        )
-                        annotation.text = math_formula
-                        mathml = unescape(tostring(mathml_element, encoding="unicode"))
-                        text = f"<div>{mathml}</div>"
-                    except Exception as err:
-                        _logger.warning(
-                            "Malformed formula cannot be rendered. "
-                            f"Error {err.__class__.__name__}, formula={math_formula}"
-                        )
-                        if (
-                            image_mode == ImageRefMode.EMBEDDED
-                            and len(item.prov) > 0
-                            and img_fallback is not None
-                        ):
-                            text = img_fallback
-                        else:
-                            text = f"<pre>{math_formula}</pre>"
-
-                elif math_formula != "":
-                    text = f"<pre>{math_formula}</pre>"
-
-                if text != "":
-                    html_texts.append(text)
-                else:
-                    html_texts.append(
-                        '<div class="formula-not-decoded">Formula not decoded</div>'
-                    )
-
-            elif isinstance(item, ListItem):
-                text = get_html_tag_with_text_direction(
-                    html_tag="li", text=_prepare_tag_content(item.text)
-                )
-                html_texts.append(text)
-
-            elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
-                text = get_html_tag_with_text_direction(
-                    html_tag="li", text=_prepare_tag_content(item.text)
-                )
-                html_texts.append(text)
-
-            elif isinstance(item, CodeItem):
-                code_text = _prepare_tag_content(
-                    item.text, do_escape_html=False, do_replace_newline=False
-                )
-                text = f"<pre><code>{code_text}</code></pre>"
-                html_texts.append(text)
-
-            elif isinstance(item, TextItem):
-
-                text = get_html_tag_with_text_direction(
-                    html_tag="p", text=_prepare_tag_content(item.text)
-                )
-                html_texts.append(text)
-
-            elif isinstance(item, TableItem):
-
-                text = item.export_to_html(doc=self, add_caption=True)
-                html_texts.append(text)
-
-            elif isinstance(item, PictureItem):
-
-                html_texts.append(
-                    item.export_to_html(
-                        doc=self, add_caption=True, image_mode=image_mode
-                    )
-                )
-
-            elif isinstance(item, DocItem) and item.label in my_labels:
-                continue
-
-        html_texts.append("</html>")
+        params = HTMLParams(
+            labels=my_labels,
+            layers=my_layers,
+            pages={page_no} if page_no is not None else None,
+            start_idx=from_element,
+            stop_idx=to_element,
+            image_mode=image_mode,
+            formula_to_mathml=formula_to_mathml,
+            html_head=html_head,
+            html_lang=html_lang,
+            output_style=output_style,
+        )

-
-
-        lines.extend(html_texts)
+        if html_head == "null":
+            params.html_head = None

-
-
+        serializer = HTMLDocSerializer(
+            doc=self,
+            params=params,
+        )
+        ser_res = serializer.serialize()

-        return
+        return ser_res.text

     def load_from_doctags(  # noqa: C901
         self,
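export_to_html now just configures HTMLParams and delegates to HTMLDocSerializer; the same serializer can also be driven directly with the parameters shown above. A minimal sketch:

# Sketch: equivalent high-level and direct serializer calls.
from docling_core.experimental.serializer.html import HTMLDocSerializer, HTMLParams
from docling_core.types.doc.document import DoclingDocument, ImageRefMode

doc = DoclingDocument(name="html-demo")
html_text = doc.export_to_html(image_mode=ImageRefMode.PLACEHOLDER)

ser = HTMLDocSerializer(doc=doc, params=HTMLParams(image_mode=ImageRefMode.PLACEHOLDER))
html_text_direct = ser.serialize().text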
@@ -3077,6 +3222,8 @@ class DoclingDocument(BaseModel):
         def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
             """Extract <loc_...> coords from the chunk, normalized by / 500."""
             coords = re.findall(r"<loc_(\d+)>", text_chunk)
+            if len(coords) > 4:
+                coords = coords[:4]
             if len(coords) == 4:
                 l, t, r, b = map(float, coords)
                 return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
@@ -3107,11 +3254,28 @@ class DoclingDocument(BaseModel):

         def otsl_parse_texts(texts, tokens):
             split_word = TableToken.OTSL_NL.value
+            # CLEAN tokens from extra tags, only structural OTSL allowed
+            clean_tokens = []
+            for t in tokens:
+                if t in [
+                    TableToken.OTSL_ECEL.value,
+                    TableToken.OTSL_FCEL.value,
+                    TableToken.OTSL_LCEL.value,
+                    TableToken.OTSL_UCEL.value,
+                    TableToken.OTSL_XCEL.value,
+                    TableToken.OTSL_NL.value,
+                    TableToken.OTSL_CHED.value,
+                    TableToken.OTSL_RHED.value,
+                    TableToken.OTSL_SROW.value,
+                ]:
+                    clean_tokens.append(t)
+            tokens = clean_tokens
             split_row_tokens = [
                 list(y)
                 for x, y in itertools.groupby(tokens, lambda z: z == split_word)
                 if not x
             ]
+
             table_cells = []
             r_idx = 0
             c_idx = 0
@@ -3263,6 +3427,40 @@ class DoclingDocument(BaseModel):
                 table_cells=table_cells,
             )

+        def extract_chart_type(text_chunk: str):
+            label = None
+            chart_labels = [
+                PictureClassificationLabel.PIE_CHART,
+                PictureClassificationLabel.BAR_CHART,
+                PictureClassificationLabel.STACKED_BAR_CHART,
+                PictureClassificationLabel.LINE_CHART,
+                PictureClassificationLabel.FLOW_CHART,
+                PictureClassificationLabel.SCATTER_CHART,
+                PictureClassificationLabel.HEATMAP,
+                "line",
+                "dot_line",
+                "vbar_categorical",
+                "hbar_categorical",
+            ]
+
+            # Current SmolDocling can predict different labels:
+            chart_labels_mapping = {
+                "line": PictureClassificationLabel.LINE_CHART,
+                "dot_line": PictureClassificationLabel.LINE_CHART,
+                "vbar_categorical": PictureClassificationLabel.BAR_CHART,
+                "hbar_categorical": PictureClassificationLabel.BAR_CHART,
+            }
+
+            for clabel in chart_labels:
+                tag = f"<{clabel}>"
+                if tag in text_chunk:
+                    if clabel in chart_labels_mapping:
+                        label = PictureClassificationLabel(chart_labels_mapping[clabel])
+                    else:
+                        label = PictureClassificationLabel(clabel)
+                    break
+            return label
+
         def parse_key_value_item(
             tokens: str, image: Optional[PILImage.Image] = None
         ) -> Tuple[GraphData, Optional[ProvenanceItem]]:
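extract_chart_type is local to load_from_doctags; it scans a chunk for a chart-class tag and normalizes legacy SmolDocling names onto PictureClassificationLabel values. A standalone sketch of that mapping idea; the chunk string is made up:

# Sketch: normalizing legacy chart tags, mirroring the helper above.
from docling_core.types.doc.labels import PictureClassificationLabel

legacy_to_label = {
    "vbar_categorical": PictureClassificationLabel.BAR_CHART,
    "dot_line": PictureClassificationLabel.LINE_CHART,
}
chunk = "<chart><vbar_categorical><loc_1><loc_1><loc_2><loc_2></chart>"
detected = next(
    (lbl for tag, lbl in legacy_to_label.items() if f"<{tag}>" in chunk), None
)
print(detected)  # PictureClassificationLabel.BAR_CHART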
@@ -3394,10 +3592,9 @@ class DoclingDocument(BaseModel):
             rf"{DocumentToken.ORDERED_LIST.value}|"
             rf"{DocumentToken.UNORDERED_LIST.value}|"
             rf"{DocItemLabel.KEY_VALUE_REGION}|"
+            rf"{DocumentToken.CHART.value}|"
             rf"{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
         )
-
-        # DocumentToken.OTSL
         pattern = re.compile(tag_pattern, re.DOTALL)

         # Go through each match in order
@@ -3405,18 +3602,17 @@ class DoclingDocument(BaseModel):
             full_chunk = match.group(0)
             tag_name = match.group("tag")

-            bbox = extract_bounding_box(full_chunk)
+            bbox = extract_bounding_box(full_chunk)  # Extracts first bbox
             doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)

             if tag_name == DocumentToken.OTSL.value:
                 table_data = parse_table_content(full_chunk)
-                bbox = extract_bounding_box(full_chunk) if image else None
                 caption, caption_bbox = extract_caption(full_chunk)
                 if caption is not None and caption_bbox is not None:
                     caption.prov.append(
                         ProvenanceItem(
                             bbox=caption_bbox.resize_by_scale(pg_width, pg_height),
-                            charspan=(0,
+                            charspan=(0, len(caption.text)),
                             page_no=page_no,
                         )
                     )
@@ -3430,8 +3626,13 @@ class DoclingDocument(BaseModel):
                 else:
                     self.add_table(data=table_data, caption=caption)

-            elif tag_name
-
+            elif tag_name in [DocItemLabel.PICTURE, DocItemLabel.CHART]:
+                caption, caption_bbox = extract_caption(full_chunk)
+                table_data = None
+                chart_type = None
+                if tag_name == DocumentToken.CHART.value:
+                    table_data = parse_table_content(full_chunk)
+                    chart_type = extract_chart_type(full_chunk)
                 if image:
                     if bbox:
                         im_width, im_height = image.size
@@ -3455,30 +3656,77 @@ class DoclingDocument(BaseModel):
                             ),
                         )
                         # If there is a caption to an image, add it as well
-                        if
-
-
-
-
+                        if caption is not None and caption_bbox is not None:
+                            caption.prov.append(
+                                ProvenanceItem(
+                                    bbox=caption_bbox.resize_by_scale(
+                                        pg_width, pg_height
+                                    ),
+                                    charspan=(0, len(caption.text)),
+                                    page_no=page_no,
+                                )
                             )
-                            pic.captions.append(
+                            pic.captions.append(caption.get_ref())
+                        pic_title = "picture"
+                        if chart_type is not None:
+                            pic.annotations.append(
+                                PictureClassificationData(
+                                    provenance="load_from_doctags",
+                                    predicted_classes=[
+                                        # chart_type
+                                        PictureClassificationClass(
+                                            class_name=chart_type, confidence=1.0
+                                        )
+                                    ],
+                                )
+                            )
+                            pic_title = chart_type
+                        if table_data is not None:
+                            # Add chart data as PictureTabularChartData
+                            pd = PictureTabularChartData(
+                                chart_data=table_data, title=pic_title
+                            )
+                            pic.annotations.append(pd)
                 else:
                     if bbox:
                         # In case we don't have access to an binary of an image
-                        self.add_picture(
+                        pic = self.add_picture(
                             parent=None,
                             prov=ProvenanceItem(
                                 bbox=bbox, charspan=(0, 0), page_no=page_no
                             ),
                         )
                         # If there is a caption to an image, add it as well
-                        if
-
-
-
-
+                        if caption is not None and caption_bbox is not None:
+                            caption.prov.append(
+                                ProvenanceItem(
+                                    bbox=caption_bbox.resize_by_scale(
+                                        pg_width, pg_height
+                                    ),
+                                    charspan=(0, len(caption.text)),
+                                    page_no=page_no,
+                                )
+                            )
+                            pic.captions.append(caption.get_ref())
+                        if chart_type is not None:
+                            pic.annotations.append(
+                                PictureClassificationData(
+                                    provenance="load_from_doctags",
+                                    predicted_classes=[
+                                        # chart_type
+                                        PictureClassificationClass(
+                                            class_name=chart_type, confidence=1.0
+                                        )
+                                    ],
+                                )
                             )
-
+                        if table_data is not None:
+                            # Add chart data as PictureTabularChartData
+                            pd = PictureTabularChartData(
+                                chart_data=table_data, title=pic_title
+                            )
+                            pic.annotations.append(pd)

             elif tag_name == DocItemLabel.KEY_VALUE_REGION:
                 key_value_data, kv_item_prov = parse_key_value_item(
                     full_chunk, image