docling-core 2.21.2__py3-none-any.whl → 2.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/experimental/__init__.py +6 -0
- docling_core/experimental/serializer/__init__.py +6 -0
- docling_core/experimental/serializer/base.py +227 -0
- docling_core/experimental/serializer/common.py +353 -0
- docling_core/experimental/serializer/markdown.py +461 -0
- docling_core/types/doc/document.py +779 -330
- docling_core/types/doc/page.py +1238 -0
- docling_core/types/doc/tokens.py +1 -0
- {docling_core-2.21.2.dist-info → docling_core-2.23.0.dist-info}/METADATA +1 -1
- {docling_core-2.21.2.dist-info → docling_core-2.23.0.dist-info}/RECORD +13 -7
- {docling_core-2.21.2.dist-info → docling_core-2.23.0.dist-info}/LICENSE +0 -0
- {docling_core-2.21.2.dist-info → docling_core-2.23.0.dist-info}/WHEEL +0 -0
- {docling_core-2.21.2.dist-info → docling_core-2.23.0.dist-info}/entry_points.txt +0 -0
|
@@ -4,13 +4,13 @@ import base64
|
|
|
4
4
|
import copy
|
|
5
5
|
import hashlib
|
|
6
6
|
import html
|
|
7
|
+
import itertools
|
|
7
8
|
import json
|
|
8
9
|
import logging
|
|
9
10
|
import mimetypes
|
|
10
11
|
import os
|
|
11
12
|
import re
|
|
12
13
|
import sys
|
|
13
|
-
import textwrap
|
|
14
14
|
import typing
|
|
15
15
|
import warnings
|
|
16
16
|
from enum import Enum
|
|
@@ -37,7 +37,7 @@ from pydantic import (
|
|
|
37
37
|
model_validator,
|
|
38
38
|
)
|
|
39
39
|
from tabulate import tabulate
|
|
40
|
-
from typing_extensions import Annotated, Self
|
|
40
|
+
from typing_extensions import Annotated, Self, deprecated
|
|
41
41
|
|
|
42
42
|
from docling_core.search.package import VERSION_PATTERN
|
|
43
43
|
from docling_core.types.base import _JSON_POINTER_REGEX
|
|
@@ -61,7 +61,7 @@ _logger = logging.getLogger(__name__)
|
|
|
61
61
|
|
|
62
62
|
Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
|
|
63
63
|
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
|
|
64
|
-
CURRENT_VERSION: Final = "1.
|
|
64
|
+
CURRENT_VERSION: Final = "1.3.0"
|
|
65
65
|
|
|
66
66
|
DEFAULT_EXPORT_LABELS = {
|
|
67
67
|
DocItemLabel.TITLE,
|
|
@@ -86,6 +86,8 @@ DOCUMENT_TOKENS_EXPORT_LABELS.update(
|
|
|
86
86
|
[
|
|
87
87
|
DocItemLabel.FOOTNOTE,
|
|
88
88
|
DocItemLabel.CAPTION,
|
|
89
|
+
DocItemLabel.KEY_VALUE_REGION,
|
|
90
|
+
DocItemLabel.FORM,
|
|
89
91
|
]
|
|
90
92
|
)
|
|
91
93
|
|
|
@@ -522,6 +524,49 @@ class ImageRef(BaseModel):
|
|
|
522
524
|
)
|
|
523
525
|
|
|
524
526
|
|
|
527
|
+
class DocTagsPage(BaseModel):
|
|
528
|
+
"""DocTagsPage."""
|
|
529
|
+
|
|
530
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
531
|
+
|
|
532
|
+
tokens: str
|
|
533
|
+
image: Optional[PILImage.Image] = None
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
class DocTagsDocument(BaseModel):
|
|
537
|
+
"""DocTagsDocument."""
|
|
538
|
+
|
|
539
|
+
pages: List[DocTagsPage] = []
|
|
540
|
+
|
|
541
|
+
@classmethod
|
|
542
|
+
def from_doctags_and_image_pairs(
|
|
543
|
+
cls, doctags: List[Union[Path, str]], images: List[Union[Path, PILImage.Image]]
|
|
544
|
+
):
|
|
545
|
+
"""from_doctags_and_image_pairs."""
|
|
546
|
+
if len(doctags) != len(images):
|
|
547
|
+
raise ValueError("Number of page doctags must be equal to page images!")
|
|
548
|
+
doctags_doc = cls()
|
|
549
|
+
|
|
550
|
+
pages = []
|
|
551
|
+
for dt, img in zip(doctags, images):
|
|
552
|
+
if isinstance(dt, Path):
|
|
553
|
+
with dt.open("r") as fp:
|
|
554
|
+
dt = fp.read()
|
|
555
|
+
elif isinstance(dt, str):
|
|
556
|
+
pass
|
|
557
|
+
|
|
558
|
+
if isinstance(img, Path):
|
|
559
|
+
img = PILImage.open(img)
|
|
560
|
+
elif isinstance(dt, PILImage.Image):
|
|
561
|
+
pass
|
|
562
|
+
|
|
563
|
+
page = DocTagsPage(tokens=dt, image=img)
|
|
564
|
+
pages.append(page)
|
|
565
|
+
|
|
566
|
+
doctags_doc.pages = pages
|
|
567
|
+
return doctags_doc
|
|
568
|
+
|
|
569
|
+
|
|
525
570
|
class ProvenanceItem(BaseModel):
|
|
526
571
|
"""ProvenanceItem."""
|
|
527
572
|
|
|
@@ -563,9 +608,30 @@ class GroupItem(NodeItem): # Container type, can't be a leaf node
|
|
|
563
608
|
"group" # Name of the group, e.g. "Introduction Chapter",
|
|
564
609
|
# "Slide 5", "Navigation menu list", ...
|
|
565
610
|
)
|
|
611
|
+
# TODO narrow down to allowed values, i.e. excluding those used for subtypes
|
|
566
612
|
label: GroupLabel = GroupLabel.UNSPECIFIED
|
|
567
613
|
|
|
568
614
|
|
|
615
|
+
class UnorderedList(GroupItem):
|
|
616
|
+
"""UnorderedList."""
|
|
617
|
+
|
|
618
|
+
label: typing.Literal[GroupLabel.LIST] = GroupLabel.LIST # type: ignore[assignment]
|
|
619
|
+
|
|
620
|
+
|
|
621
|
+
class OrderedList(GroupItem):
|
|
622
|
+
"""OrderedList."""
|
|
623
|
+
|
|
624
|
+
label: typing.Literal[GroupLabel.ORDERED_LIST] = (
|
|
625
|
+
GroupLabel.ORDERED_LIST # type: ignore[assignment]
|
|
626
|
+
)
|
|
627
|
+
|
|
628
|
+
|
|
629
|
+
class InlineGroup(GroupItem):
|
|
630
|
+
"""InlineGroup."""
|
|
631
|
+
|
|
632
|
+
label: typing.Literal[GroupLabel.INLINE] = GroupLabel.INLINE
|
|
633
|
+
|
|
634
|
+
|
|
569
635
|
class DocItem(
|
|
570
636
|
NodeItem
|
|
571
637
|
): # Base type for any element that carries content, can be a leaf node
|
|
@@ -626,6 +692,15 @@ class DocItem(
|
|
|
626
692
|
return page_image.crop(crop_bbox.as_tuple())
|
|
627
693
|
|
|
628
694
|
|
|
695
|
+
class Formatting(BaseModel):
|
|
696
|
+
"""Formatting."""
|
|
697
|
+
|
|
698
|
+
bold: bool = False
|
|
699
|
+
italic: bool = False
|
|
700
|
+
underline: bool = False
|
|
701
|
+
strikethrough: bool = False
|
|
702
|
+
|
|
703
|
+
|
|
629
704
|
class TextItem(DocItem):
|
|
630
705
|
"""TextItem."""
|
|
631
706
|
|
|
@@ -634,18 +709,19 @@ class TextItem(DocItem):
|
|
|
634
709
|
DocItemLabel.CHECKBOX_SELECTED,
|
|
635
710
|
DocItemLabel.CHECKBOX_UNSELECTED,
|
|
636
711
|
DocItemLabel.FOOTNOTE,
|
|
637
|
-
DocItemLabel.FORMULA,
|
|
638
712
|
DocItemLabel.PAGE_FOOTER,
|
|
639
713
|
DocItemLabel.PAGE_HEADER,
|
|
640
714
|
DocItemLabel.PARAGRAPH,
|
|
641
715
|
DocItemLabel.REFERENCE,
|
|
642
716
|
DocItemLabel.TEXT,
|
|
643
|
-
DocItemLabel.TITLE,
|
|
644
717
|
]
|
|
645
718
|
|
|
646
719
|
orig: str # untreated representation
|
|
647
720
|
text: str # sanitized representation
|
|
648
721
|
|
|
722
|
+
formatting: Optional[Formatting] = None
|
|
723
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None
|
|
724
|
+
|
|
649
725
|
def export_to_document_tokens(
|
|
650
726
|
self,
|
|
651
727
|
doc: "DoclingDocument",
|
|
@@ -683,6 +759,14 @@ class TextItem(DocItem):
|
|
|
683
759
|
return body
|
|
684
760
|
|
|
685
761
|
|
|
762
|
+
class TitleItem(TextItem):
|
|
763
|
+
"""TitleItem."""
|
|
764
|
+
|
|
765
|
+
label: typing.Literal[DocItemLabel.TITLE] = (
|
|
766
|
+
DocItemLabel.TITLE # type: ignore[assignment]
|
|
767
|
+
)
|
|
768
|
+
|
|
769
|
+
|
|
686
770
|
class SectionHeaderItem(TextItem):
|
|
687
771
|
"""SectionItem."""
|
|
688
772
|
|
|
@@ -818,6 +902,14 @@ class CodeItem(FloatingItem, TextItem):
|
|
|
818
902
|
return body
|
|
819
903
|
|
|
820
904
|
|
|
905
|
+
class FormulaItem(TextItem):
|
|
906
|
+
"""FormulaItem."""
|
|
907
|
+
|
|
908
|
+
label: typing.Literal[DocItemLabel.FORMULA] = (
|
|
909
|
+
DocItemLabel.FORMULA # type: ignore[assignment]
|
|
910
|
+
)
|
|
911
|
+
|
|
912
|
+
|
|
821
913
|
class PictureItem(FloatingItem):
|
|
822
914
|
"""PictureItem."""
|
|
823
915
|
|
|
@@ -856,54 +948,34 @@ class PictureItem(FloatingItem):
|
|
|
856
948
|
def export_to_markdown(
|
|
857
949
|
self,
|
|
858
950
|
doc: "DoclingDocument",
|
|
859
|
-
add_caption: bool = True,
|
|
951
|
+
add_caption: bool = True, # deprecated
|
|
860
952
|
image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
|
|
861
953
|
image_placeholder: str = "<!-- image -->",
|
|
862
954
|
) -> str:
|
|
863
955
|
"""Export picture to Markdown format."""
|
|
864
|
-
|
|
865
|
-
error_response = (
|
|
866
|
-
"<!-- 🖼️❌ Image not available. "
|
|
867
|
-
"Please use `PdfPipelineOptions(generate_picture_images=True)`"
|
|
868
|
-
" -->"
|
|
869
|
-
)
|
|
956
|
+
from docling_core.experimental.serializer.markdown import MarkdownDocSerializer
|
|
870
957
|
|
|
871
|
-
if
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
# short-cut: we already have the image in base64
|
|
877
|
-
if (
|
|
878
|
-
isinstance(self.image, ImageRef)
|
|
879
|
-
and isinstance(self.image.uri, AnyUrl)
|
|
880
|
-
and self.image.uri.scheme == "data"
|
|
881
|
-
):
|
|
882
|
-
text = f""
|
|
883
|
-
return text
|
|
884
|
-
|
|
885
|
-
# get the self.image._pil or crop it out of the page-image
|
|
886
|
-
img = self.get_image(doc)
|
|
887
|
-
|
|
888
|
-
if img is not None:
|
|
889
|
-
imgb64 = self._image_to_base64(img)
|
|
890
|
-
text = f""
|
|
891
|
-
|
|
892
|
-
return text
|
|
893
|
-
else:
|
|
894
|
-
return error_response
|
|
895
|
-
|
|
896
|
-
elif image_mode == ImageRefMode.REFERENCED:
|
|
897
|
-
if not isinstance(self.image, ImageRef) or (
|
|
898
|
-
isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "data"
|
|
899
|
-
):
|
|
900
|
-
return default_response
|
|
901
|
-
|
|
902
|
-
text = f")})"
|
|
903
|
-
return text
|
|
958
|
+
if not add_caption:
|
|
959
|
+
_logger.warning(
|
|
960
|
+
"Argument `add_caption` is deprecated and will be ignored.",
|
|
961
|
+
)
|
|
904
962
|
|
|
905
|
-
|
|
906
|
-
|
|
963
|
+
serializer = MarkdownDocSerializer(
|
|
964
|
+
doc=self,
|
|
965
|
+
image_mode=image_mode,
|
|
966
|
+
)
|
|
967
|
+
text = (
|
|
968
|
+
serializer.picture_serializer.serialize(
|
|
969
|
+
item=self,
|
|
970
|
+
doc_serializer=serializer,
|
|
971
|
+
doc=doc,
|
|
972
|
+
image_mode=image_mode,
|
|
973
|
+
image_placeholder=image_placeholder,
|
|
974
|
+
).text
|
|
975
|
+
if serializer.picture_serializer
|
|
976
|
+
else ""
|
|
977
|
+
)
|
|
978
|
+
return text
|
|
907
979
|
|
|
908
980
|
def export_to_html(
|
|
909
981
|
self,
|
|
@@ -1003,6 +1075,20 @@ class PictureItem(FloatingItem):
|
|
|
1003
1075
|
predicted_class = classifications[0].predicted_classes[0].class_name
|
|
1004
1076
|
body += DocumentToken.get_picture_classification_token(predicted_class)
|
|
1005
1077
|
|
|
1078
|
+
smiles_annotations = [
|
|
1079
|
+
ann for ann in self.annotations if isinstance(ann, PictureMoleculeData)
|
|
1080
|
+
]
|
|
1081
|
+
if len(smiles_annotations) > 0:
|
|
1082
|
+
body += (
|
|
1083
|
+
"<"
|
|
1084
|
+
+ DocumentToken.SMILES.value
|
|
1085
|
+
+ ">"
|
|
1086
|
+
+ smiles_annotations[0].smi
|
|
1087
|
+
+ "</"
|
|
1088
|
+
+ DocumentToken.SMILES.value
|
|
1089
|
+
+ ">"
|
|
1090
|
+
)
|
|
1091
|
+
|
|
1006
1092
|
if add_caption and len(self.captions):
|
|
1007
1093
|
text = self.caption_text(doc)
|
|
1008
1094
|
|
|
@@ -1078,33 +1164,58 @@ class TableItem(FloatingItem):
|
|
|
1078
1164
|
|
|
1079
1165
|
return df
|
|
1080
1166
|
|
|
1081
|
-
def export_to_markdown(self) -> str:
|
|
1167
|
+
def export_to_markdown(self, doc: Optional["DoclingDocument"] = None) -> str:
|
|
1082
1168
|
"""Export the table as markdown."""
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1169
|
+
if doc is not None:
|
|
1170
|
+
from docling_core.experimental.serializer.markdown import (
|
|
1171
|
+
MarkdownDocSerializer,
|
|
1172
|
+
)
|
|
1087
1173
|
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
|
|
1174
|
+
serializer = MarkdownDocSerializer(
|
|
1175
|
+
doc=doc,
|
|
1176
|
+
)
|
|
1177
|
+
text = (
|
|
1178
|
+
serializer.table_serializer.serialize(
|
|
1179
|
+
item=self,
|
|
1180
|
+
doc_serializer=serializer,
|
|
1181
|
+
doc=doc,
|
|
1182
|
+
).text
|
|
1183
|
+
if serializer.table_serializer
|
|
1184
|
+
else ""
|
|
1185
|
+
)
|
|
1186
|
+
return text
|
|
1187
|
+
else:
|
|
1188
|
+
_logger.warning(
|
|
1189
|
+
"Usage of TableItem.export_to_markdown() without `doc` argument is "
|
|
1190
|
+
"deprecated.",
|
|
1191
|
+
)
|
|
1093
1192
|
|
|
1094
|
-
table
|
|
1193
|
+
table = []
|
|
1194
|
+
for row in self.data.grid:
|
|
1195
|
+
tmp = []
|
|
1196
|
+
for col in row:
|
|
1197
|
+
|
|
1198
|
+
# make sure that md tables are not broken
|
|
1199
|
+
# due to newline chars in the text
|
|
1200
|
+
text = col.text
|
|
1201
|
+
text = text.replace("\n", " ")
|
|
1202
|
+
tmp.append(text)
|
|
1203
|
+
|
|
1204
|
+
table.append(tmp)
|
|
1205
|
+
|
|
1206
|
+
res = ""
|
|
1207
|
+
if len(table) > 1 and len(table[0]) > 0:
|
|
1208
|
+
try:
|
|
1209
|
+
res = tabulate(table[1:], headers=table[0], tablefmt="github")
|
|
1210
|
+
except ValueError:
|
|
1211
|
+
res = tabulate(
|
|
1212
|
+
table[1:],
|
|
1213
|
+
headers=table[0],
|
|
1214
|
+
tablefmt="github",
|
|
1215
|
+
disable_numparse=True,
|
|
1216
|
+
)
|
|
1095
1217
|
|
|
1096
|
-
|
|
1097
|
-
if len(table) > 1 and len(table[0]) > 0:
|
|
1098
|
-
try:
|
|
1099
|
-
md_table = tabulate(table[1:], headers=table[0], tablefmt="github")
|
|
1100
|
-
except ValueError:
|
|
1101
|
-
md_table = tabulate(
|
|
1102
|
-
table[1:],
|
|
1103
|
-
headers=table[0],
|
|
1104
|
-
tablefmt="github",
|
|
1105
|
-
disable_numparse=True,
|
|
1106
|
-
)
|
|
1107
|
-
return md_table
|
|
1218
|
+
return res
|
|
1108
1219
|
|
|
1109
1220
|
def export_to_html(
|
|
1110
1221
|
self,
|
|
@@ -1397,10 +1508,6 @@ class KeyValueItem(FloatingItem):
|
|
|
1397
1508
|
|
|
1398
1509
|
graph: GraphData
|
|
1399
1510
|
|
|
1400
|
-
def _export_to_markdown(self) -> str:
|
|
1401
|
-
# TODO add actual implementation
|
|
1402
|
-
return "<!-- missing-key-value-item -->"
|
|
1403
|
-
|
|
1404
1511
|
|
|
1405
1512
|
class FormItem(FloatingItem):
|
|
1406
1513
|
"""FormItem."""
|
|
@@ -1409,17 +1516,15 @@ class FormItem(FloatingItem):
|
|
|
1409
1516
|
|
|
1410
1517
|
graph: GraphData
|
|
1411
1518
|
|
|
1412
|
-
def _export_to_markdown(self) -> str:
|
|
1413
|
-
# TODO add actual implementation
|
|
1414
|
-
return "<!-- missing-form-item -->"
|
|
1415
|
-
|
|
1416
1519
|
|
|
1417
1520
|
ContentItem = Annotated[
|
|
1418
1521
|
Union[
|
|
1419
1522
|
TextItem,
|
|
1523
|
+
TitleItem,
|
|
1420
1524
|
SectionHeaderItem,
|
|
1421
1525
|
ListItem,
|
|
1422
1526
|
CodeItem,
|
|
1527
|
+
FormulaItem,
|
|
1423
1528
|
PictureItem,
|
|
1424
1529
|
TableItem,
|
|
1425
1530
|
KeyValueItem,
|
|
@@ -1530,8 +1635,10 @@ class DoclingDocument(BaseModel):
|
|
|
1530
1635
|
) # List[RefItem] = []
|
|
1531
1636
|
body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []
|
|
1532
1637
|
|
|
1533
|
-
groups: List[GroupItem] = []
|
|
1534
|
-
texts: List[
|
|
1638
|
+
groups: List[Union[OrderedList, UnorderedList, InlineGroup, GroupItem]] = []
|
|
1639
|
+
texts: List[
|
|
1640
|
+
Union[TitleItem, SectionHeaderItem, ListItem, CodeItem, FormulaItem, TextItem]
|
|
1641
|
+
] = []
|
|
1535
1642
|
pictures: List[PictureItem] = []
|
|
1536
1643
|
tables: List[TableItem] = []
|
|
1537
1644
|
key_value_items: List[KeyValueItem] = []
|
|
@@ -1555,6 +1662,68 @@ class DoclingDocument(BaseModel):
|
|
|
1555
1662
|
item["content_layer"] = "furniture"
|
|
1556
1663
|
return data
|
|
1557
1664
|
|
|
1665
|
+
###################################
|
|
1666
|
+
# TODO: refactor add* methods below
|
|
1667
|
+
###################################
|
|
1668
|
+
|
|
1669
|
+
def add_ordered_list(
|
|
1670
|
+
self,
|
|
1671
|
+
name: Optional[str] = None,
|
|
1672
|
+
parent: Optional[NodeItem] = None,
|
|
1673
|
+
content_layer: Optional[ContentLayer] = None,
|
|
1674
|
+
) -> GroupItem:
|
|
1675
|
+
"""add_ordered_list."""
|
|
1676
|
+
_parent = parent or self.body
|
|
1677
|
+
cref = f"#/groups/{len(self.groups)}"
|
|
1678
|
+
group = OrderedList(self_ref=cref, parent=_parent.get_ref())
|
|
1679
|
+
if name is not None:
|
|
1680
|
+
group.name = name
|
|
1681
|
+
if content_layer:
|
|
1682
|
+
group.content_layer = content_layer
|
|
1683
|
+
|
|
1684
|
+
self.groups.append(group)
|
|
1685
|
+
_parent.children.append(RefItem(cref=cref))
|
|
1686
|
+
return group
|
|
1687
|
+
|
|
1688
|
+
def add_unordered_list(
|
|
1689
|
+
self,
|
|
1690
|
+
name: Optional[str] = None,
|
|
1691
|
+
parent: Optional[NodeItem] = None,
|
|
1692
|
+
content_layer: Optional[ContentLayer] = None,
|
|
1693
|
+
) -> GroupItem:
|
|
1694
|
+
"""add_unordered_list."""
|
|
1695
|
+
_parent = parent or self.body
|
|
1696
|
+
cref = f"#/groups/{len(self.groups)}"
|
|
1697
|
+
group = UnorderedList(self_ref=cref, parent=_parent.get_ref())
|
|
1698
|
+
if name is not None:
|
|
1699
|
+
group.name = name
|
|
1700
|
+
if content_layer:
|
|
1701
|
+
group.content_layer = content_layer
|
|
1702
|
+
|
|
1703
|
+
self.groups.append(group)
|
|
1704
|
+
_parent.children.append(RefItem(cref=cref))
|
|
1705
|
+
return group
|
|
1706
|
+
|
|
1707
|
+
def add_inline_group(
|
|
1708
|
+
self,
|
|
1709
|
+
name: Optional[str] = None,
|
|
1710
|
+
parent: Optional[NodeItem] = None,
|
|
1711
|
+
content_layer: Optional[ContentLayer] = None,
|
|
1712
|
+
# marker: Optional[UnorderedList.ULMarker] = None,
|
|
1713
|
+
) -> GroupItem:
|
|
1714
|
+
"""add_inline_group."""
|
|
1715
|
+
_parent = parent or self.body
|
|
1716
|
+
cref = f"#/groups/{len(self.groups)}"
|
|
1717
|
+
group = InlineGroup(self_ref=cref, parent=_parent.get_ref())
|
|
1718
|
+
if name is not None:
|
|
1719
|
+
group.name = name
|
|
1720
|
+
if content_layer:
|
|
1721
|
+
group.content_layer = content_layer
|
|
1722
|
+
|
|
1723
|
+
self.groups.append(group)
|
|
1724
|
+
_parent.children.append(RefItem(cref=cref))
|
|
1725
|
+
return group
|
|
1726
|
+
|
|
1558
1727
|
def add_group(
|
|
1559
1728
|
self,
|
|
1560
1729
|
label: Optional[GroupLabel] = None,
|
|
@@ -1569,6 +1738,25 @@ class DoclingDocument(BaseModel):
|
|
|
1569
1738
|
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1570
1739
|
|
|
1571
1740
|
"""
|
|
1741
|
+
if label == GroupLabel.LIST:
|
|
1742
|
+
return self.add_unordered_list(
|
|
1743
|
+
name=name,
|
|
1744
|
+
parent=parent,
|
|
1745
|
+
content_layer=content_layer,
|
|
1746
|
+
)
|
|
1747
|
+
elif label == GroupLabel.ORDERED_LIST:
|
|
1748
|
+
return self.add_ordered_list(
|
|
1749
|
+
name=name,
|
|
1750
|
+
parent=parent,
|
|
1751
|
+
content_layer=content_layer,
|
|
1752
|
+
)
|
|
1753
|
+
elif label == GroupLabel.INLINE:
|
|
1754
|
+
return self.add_inline_group(
|
|
1755
|
+
name=name,
|
|
1756
|
+
parent=parent,
|
|
1757
|
+
content_layer=content_layer,
|
|
1758
|
+
)
|
|
1759
|
+
|
|
1572
1760
|
if not parent:
|
|
1573
1761
|
parent = self.body
|
|
1574
1762
|
|
|
@@ -1597,6 +1785,8 @@ class DoclingDocument(BaseModel):
|
|
|
1597
1785
|
prov: Optional[ProvenanceItem] = None,
|
|
1598
1786
|
parent: Optional[NodeItem] = None,
|
|
1599
1787
|
content_layer: Optional[ContentLayer] = None,
|
|
1788
|
+
formatting: Optional[Formatting] = None,
|
|
1789
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
|
1600
1790
|
):
|
|
1601
1791
|
"""add_list_item.
|
|
1602
1792
|
|
|
@@ -1624,6 +1814,8 @@ class DoclingDocument(BaseModel):
|
|
|
1624
1814
|
parent=parent.get_ref(),
|
|
1625
1815
|
enumerated=enumerated,
|
|
1626
1816
|
marker=marker,
|
|
1817
|
+
formatting=formatting,
|
|
1818
|
+
hyperlink=hyperlink,
|
|
1627
1819
|
)
|
|
1628
1820
|
if prov:
|
|
1629
1821
|
list_item.prov.append(prov)
|
|
@@ -1643,6 +1835,8 @@ class DoclingDocument(BaseModel):
|
|
|
1643
1835
|
prov: Optional[ProvenanceItem] = None,
|
|
1644
1836
|
parent: Optional[NodeItem] = None,
|
|
1645
1837
|
content_layer: Optional[ContentLayer] = None,
|
|
1838
|
+
formatting: Optional[Formatting] = None,
|
|
1839
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
|
1646
1840
|
):
|
|
1647
1841
|
"""add_text.
|
|
1648
1842
|
|
|
@@ -1662,6 +1856,8 @@ class DoclingDocument(BaseModel):
|
|
|
1662
1856
|
prov=prov,
|
|
1663
1857
|
parent=parent,
|
|
1664
1858
|
content_layer=content_layer,
|
|
1859
|
+
formatting=formatting,
|
|
1860
|
+
hyperlink=hyperlink,
|
|
1665
1861
|
)
|
|
1666
1862
|
|
|
1667
1863
|
elif label in [DocItemLabel.LIST_ITEM]:
|
|
@@ -1671,15 +1867,31 @@ class DoclingDocument(BaseModel):
|
|
|
1671
1867
|
prov=prov,
|
|
1672
1868
|
parent=parent,
|
|
1673
1869
|
content_layer=content_layer,
|
|
1870
|
+
formatting=formatting,
|
|
1871
|
+
hyperlink=hyperlink,
|
|
1872
|
+
)
|
|
1873
|
+
|
|
1874
|
+
elif label in [DocItemLabel.TITLE]:
|
|
1875
|
+
return self.add_title(
|
|
1876
|
+
text=text,
|
|
1877
|
+
orig=orig,
|
|
1878
|
+
prov=prov,
|
|
1879
|
+
parent=parent,
|
|
1880
|
+
content_layer=content_layer,
|
|
1881
|
+
formatting=formatting,
|
|
1882
|
+
hyperlink=hyperlink,
|
|
1674
1883
|
)
|
|
1675
1884
|
|
|
1676
1885
|
elif label in [DocItemLabel.SECTION_HEADER]:
|
|
1677
1886
|
return self.add_heading(
|
|
1678
1887
|
text=text,
|
|
1679
1888
|
orig=orig,
|
|
1889
|
+
# NOTE: we do not / cannot pass the level here, lossy path..
|
|
1680
1890
|
prov=prov,
|
|
1681
1891
|
parent=parent,
|
|
1682
1892
|
content_layer=content_layer,
|
|
1893
|
+
formatting=formatting,
|
|
1894
|
+
hyperlink=hyperlink,
|
|
1683
1895
|
)
|
|
1684
1896
|
|
|
1685
1897
|
elif label in [DocItemLabel.CODE]:
|
|
@@ -1689,6 +1901,18 @@ class DoclingDocument(BaseModel):
|
|
|
1689
1901
|
prov=prov,
|
|
1690
1902
|
parent=parent,
|
|
1691
1903
|
content_layer=content_layer,
|
|
1904
|
+
formatting=formatting,
|
|
1905
|
+
hyperlink=hyperlink,
|
|
1906
|
+
)
|
|
1907
|
+
elif label in [DocItemLabel.FORMULA]:
|
|
1908
|
+
return self.add_formula(
|
|
1909
|
+
text=text,
|
|
1910
|
+
orig=orig,
|
|
1911
|
+
prov=prov,
|
|
1912
|
+
parent=parent,
|
|
1913
|
+
content_layer=content_layer,
|
|
1914
|
+
formatting=formatting,
|
|
1915
|
+
hyperlink=hyperlink,
|
|
1692
1916
|
)
|
|
1693
1917
|
|
|
1694
1918
|
else:
|
|
@@ -1707,6 +1931,8 @@ class DoclingDocument(BaseModel):
|
|
|
1707
1931
|
orig=orig,
|
|
1708
1932
|
self_ref=cref,
|
|
1709
1933
|
parent=parent.get_ref(),
|
|
1934
|
+
formatting=formatting,
|
|
1935
|
+
hyperlink=hyperlink,
|
|
1710
1936
|
)
|
|
1711
1937
|
if prov:
|
|
1712
1938
|
text_item.prov.append(prov)
|
|
@@ -1808,11 +2034,14 @@ class DoclingDocument(BaseModel):
|
|
|
1808
2034
|
prov: Optional[ProvenanceItem] = None,
|
|
1809
2035
|
parent: Optional[NodeItem] = None,
|
|
1810
2036
|
content_layer: Optional[ContentLayer] = None,
|
|
2037
|
+
formatting: Optional[Formatting] = None,
|
|
2038
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
|
1811
2039
|
):
|
|
1812
2040
|
"""add_title.
|
|
1813
2041
|
|
|
1814
2042
|
:param text: str:
|
|
1815
2043
|
:param orig: Optional[str]: (Default value = None)
|
|
2044
|
+
:param level: LevelNumber: (Default value = 1)
|
|
1816
2045
|
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1817
2046
|
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1818
2047
|
"""
|
|
@@ -1824,22 +2053,23 @@ class DoclingDocument(BaseModel):
|
|
|
1824
2053
|
|
|
1825
2054
|
text_index = len(self.texts)
|
|
1826
2055
|
cref = f"#/texts/{text_index}"
|
|
1827
|
-
|
|
1828
|
-
label=DocItemLabel.TITLE,
|
|
2056
|
+
item = TitleItem(
|
|
1829
2057
|
text=text,
|
|
1830
2058
|
orig=orig,
|
|
1831
2059
|
self_ref=cref,
|
|
1832
2060
|
parent=parent.get_ref(),
|
|
2061
|
+
formatting=formatting,
|
|
2062
|
+
hyperlink=hyperlink,
|
|
1833
2063
|
)
|
|
1834
2064
|
if prov:
|
|
1835
|
-
|
|
2065
|
+
item.prov.append(prov)
|
|
1836
2066
|
if content_layer:
|
|
1837
|
-
|
|
2067
|
+
item.content_layer = content_layer
|
|
1838
2068
|
|
|
1839
|
-
self.texts.append(
|
|
2069
|
+
self.texts.append(item)
|
|
1840
2070
|
parent.children.append(RefItem(cref=cref))
|
|
1841
2071
|
|
|
1842
|
-
return
|
|
2072
|
+
return item
|
|
1843
2073
|
|
|
1844
2074
|
def add_code(
|
|
1845
2075
|
self,
|
|
@@ -1850,6 +2080,8 @@ class DoclingDocument(BaseModel):
|
|
|
1850
2080
|
prov: Optional[ProvenanceItem] = None,
|
|
1851
2081
|
parent: Optional[NodeItem] = None,
|
|
1852
2082
|
content_layer: Optional[ContentLayer] = None,
|
|
2083
|
+
formatting: Optional[Formatting] = None,
|
|
2084
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
|
1853
2085
|
):
|
|
1854
2086
|
"""add_code.
|
|
1855
2087
|
|
|
@@ -1874,6 +2106,8 @@ class DoclingDocument(BaseModel):
|
|
|
1874
2106
|
orig=orig,
|
|
1875
2107
|
self_ref=cref,
|
|
1876
2108
|
parent=parent.get_ref(),
|
|
2109
|
+
formatting=formatting,
|
|
2110
|
+
hyperlink=hyperlink,
|
|
1877
2111
|
)
|
|
1878
2112
|
if code_language:
|
|
1879
2113
|
code_item.code_language = code_language
|
|
@@ -1889,6 +2123,50 @@ class DoclingDocument(BaseModel):
|
|
|
1889
2123
|
|
|
1890
2124
|
return code_item
|
|
1891
2125
|
|
|
2126
|
+
def add_formula(
|
|
2127
|
+
self,
|
|
2128
|
+
text: str,
|
|
2129
|
+
orig: Optional[str] = None,
|
|
2130
|
+
prov: Optional[ProvenanceItem] = None,
|
|
2131
|
+
parent: Optional[NodeItem] = None,
|
|
2132
|
+
content_layer: Optional[ContentLayer] = None,
|
|
2133
|
+
formatting: Optional[Formatting] = None,
|
|
2134
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
|
2135
|
+
):
|
|
2136
|
+
"""add_formula.
|
|
2137
|
+
|
|
2138
|
+
:param text: str:
|
|
2139
|
+
:param orig: Optional[str]: (Default value = None)
|
|
2140
|
+
:param level: LevelNumber: (Default value = 1)
|
|
2141
|
+
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
2142
|
+
:param parent: Optional[NodeItem]: (Default value = None)
|
|
2143
|
+
"""
|
|
2144
|
+
if not parent:
|
|
2145
|
+
parent = self.body
|
|
2146
|
+
|
|
2147
|
+
if not orig:
|
|
2148
|
+
orig = text
|
|
2149
|
+
|
|
2150
|
+
text_index = len(self.texts)
|
|
2151
|
+
cref = f"#/texts/{text_index}"
|
|
2152
|
+
section_header_item = FormulaItem(
|
|
2153
|
+
text=text,
|
|
2154
|
+
orig=orig,
|
|
2155
|
+
self_ref=cref,
|
|
2156
|
+
parent=parent.get_ref(),
|
|
2157
|
+
formatting=formatting,
|
|
2158
|
+
hyperlink=hyperlink,
|
|
2159
|
+
)
|
|
2160
|
+
if prov:
|
|
2161
|
+
section_header_item.prov.append(prov)
|
|
2162
|
+
if content_layer:
|
|
2163
|
+
section_header_item.content_layer = content_layer
|
|
2164
|
+
|
|
2165
|
+
self.texts.append(section_header_item)
|
|
2166
|
+
parent.children.append(RefItem(cref=cref))
|
|
2167
|
+
|
|
2168
|
+
return section_header_item
|
|
2169
|
+
|
|
1892
2170
|
def add_heading(
|
|
1893
2171
|
self,
|
|
1894
2172
|
text: str,
|
|
@@ -1897,6 +2175,8 @@ class DoclingDocument(BaseModel):
|
|
|
1897
2175
|
prov: Optional[ProvenanceItem] = None,
|
|
1898
2176
|
parent: Optional[NodeItem] = None,
|
|
1899
2177
|
content_layer: Optional[ContentLayer] = None,
|
|
2178
|
+
formatting: Optional[Formatting] = None,
|
|
2179
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
|
1900
2180
|
):
|
|
1901
2181
|
"""add_heading.
|
|
1902
2182
|
|
|
@@ -1921,6 +2201,8 @@ class DoclingDocument(BaseModel):
|
|
|
1921
2201
|
orig=orig,
|
|
1922
2202
|
self_ref=cref,
|
|
1923
2203
|
parent=parent.get_ref(),
|
|
2204
|
+
formatting=formatting,
|
|
2205
|
+
hyperlink=hyperlink,
|
|
1924
2206
|
)
|
|
1925
2207
|
if prov:
|
|
1926
2208
|
section_header_item.prov.append(prov)
|
|
@@ -2276,10 +2558,10 @@ class DoclingDocument(BaseModel):
|
|
|
2276
2558
|
self,
|
|
2277
2559
|
filename: Path,
|
|
2278
2560
|
artifacts_dir: Optional[Path] = None,
|
|
2279
|
-
delim: str = "\n\n",
|
|
2561
|
+
delim: str = "\n\n",
|
|
2280
2562
|
from_element: int = 0,
|
|
2281
2563
|
to_element: int = sys.maxsize,
|
|
2282
|
-
labels: set[DocItemLabel] =
|
|
2564
|
+
labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
|
|
2283
2565
|
strict_text: bool = False,
|
|
2284
2566
|
escaping_underscores: bool = True,
|
|
2285
2567
|
image_placeholder: str = "<!-- image -->",
|
|
@@ -2319,10 +2601,10 @@ class DoclingDocument(BaseModel):
|
|
|
2319
2601
|
|
|
2320
2602
|
def export_to_markdown( # noqa: C901
|
|
2321
2603
|
self,
|
|
2322
|
-
delim: str = "\n\n",
|
|
2604
|
+
delim: str = "\n\n",
|
|
2323
2605
|
from_element: int = 0,
|
|
2324
2606
|
to_element: int = sys.maxsize,
|
|
2325
|
-
labels: set[DocItemLabel] =
|
|
2607
|
+
labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
|
|
2326
2608
|
strict_text: bool = False,
|
|
2327
2609
|
escaping_underscores: bool = True,
|
|
2328
2610
|
image_placeholder: str = "<!-- image -->",
|
|
@@ -2337,9 +2619,8 @@ class DoclingDocument(BaseModel):
|
|
|
2337
2619
|
Operates on a slice of the document's body as defined through arguments
|
|
2338
2620
|
from_element and to_element; defaulting to the whole document.
|
|
2339
2621
|
|
|
2340
|
-
:param delim:
|
|
2341
|
-
|
|
2342
|
-
:type delim: str = "\n"
|
|
2622
|
+
:param delim: Deprecated.
|
|
2623
|
+
:type delim: str = "\n\n"
|
|
2343
2624
|
:param from_element: Body slicing start index (inclusive).
|
|
2344
2625
|
(Default value = 0).
|
|
2345
2626
|
:type from_element: int = 0
|
|
@@ -2347,9 +2628,8 @@ class DoclingDocument(BaseModel):
|
|
|
2347
2628
|
(exclusive). (Default value = maxint).
|
|
2348
2629
|
:type to_element: int = sys.maxsize
|
|
2349
2630
|
:param labels: The set of document labels to include in the export.
|
|
2350
|
-
:type labels: set[DocItemLabel] =
|
|
2351
|
-
:param strict_text:
|
|
2352
|
-
of the document. (Default value = False).
|
|
2631
|
+
:type labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS
|
|
2632
|
+
:param strict_text: Deprecated.
|
|
2353
2633
|
:type strict_text: bool = False
|
|
2354
2634
|
:param escaping_underscores: bool: Whether to escape underscores in the
|
|
2355
2635
|
text content of the document. (Default value = True).
|
|
@@ -2366,250 +2646,48 @@ class DoclingDocument(BaseModel):
|
|
|
2366
2646
|
:returns: The exported Markdown representation.
|
|
2367
2647
|
:rtype: str
|
|
2368
2648
|
"""
|
|
2369
|
-
|
|
2370
|
-
|
|
2371
|
-
|
|
2372
|
-
|
|
2373
|
-
|
|
2374
|
-
|
|
2375
|
-
|
|
2649
|
+
from docling_core.experimental.serializer.markdown import (
|
|
2650
|
+
MarkdownDocSerializer,
|
|
2651
|
+
MarkdownListSerializer,
|
|
2652
|
+
MarkdownTextSerializer,
|
|
2653
|
+
)
|
|
2654
|
+
|
|
2655
|
+
serializer = MarkdownDocSerializer(
|
|
2656
|
+
doc=self,
|
|
2657
|
+
start=from_element,
|
|
2658
|
+
stop=to_element,
|
|
2376
2659
|
image_placeholder=image_placeholder,
|
|
2377
2660
|
image_mode=image_mode,
|
|
2378
|
-
|
|
2379
|
-
|
|
2380
|
-
|
|
2381
|
-
|
|
2382
|
-
|
|
2383
|
-
|
|
2384
|
-
|
|
2661
|
+
labels=labels,
|
|
2662
|
+
layers=included_content_layers,
|
|
2663
|
+
pages={page_no} if page_no is not None else None,
|
|
2664
|
+
escaping_underscores=escaping_underscores,
|
|
2665
|
+
text_serializer=MarkdownTextSerializer(
|
|
2666
|
+
wrap_width=text_width if text_width > 0 else None,
|
|
2667
|
+
),
|
|
2668
|
+
list_serializer=MarkdownListSerializer(
|
|
2669
|
+
indent=indent,
|
|
2670
|
+
),
|
|
2385
2671
|
)
|
|
2386
|
-
|
|
2387
|
-
|
|
2388
|
-
def _get_markdown_components( # noqa: C901
|
|
2389
|
-
self,
|
|
2390
|
-
node: NodeItem,
|
|
2391
|
-
from_element: int,
|
|
2392
|
-
to_element: int,
|
|
2393
|
-
labels: set[DocItemLabel],
|
|
2394
|
-
strict_text: bool,
|
|
2395
|
-
escaping_underscores: bool,
|
|
2396
|
-
image_placeholder: str,
|
|
2397
|
-
image_mode: ImageRefMode,
|
|
2398
|
-
indent: int,
|
|
2399
|
-
text_width: int,
|
|
2400
|
-
page_no: Optional[int],
|
|
2401
|
-
included_content_layers: set[ContentLayer],
|
|
2402
|
-
list_level: int,
|
|
2403
|
-
is_inline_scope: bool,
|
|
2404
|
-
visited: set[str], # refs of visited items
|
|
2405
|
-
) -> list[str]:
|
|
2406
|
-
components: list[str] = [] # components to concatenate
|
|
2407
|
-
|
|
2408
|
-
# Our export markdown doesn't contain any emphasis styling:
|
|
2409
|
-
# Bold, Italic, or Bold-Italic
|
|
2410
|
-
# Hence, any underscore that we print into Markdown is coming from document text
|
|
2411
|
-
# That means we need to escape it, to properly reflect content in the markdown
|
|
2412
|
-
# However, we need to preserve underscores in image URLs
|
|
2413
|
-
# to maintain their validity
|
|
2414
|
-
# For example:  should remain unchanged
|
|
2415
|
-
def _escape_underscores(text):
|
|
2416
|
-
"""Escape underscores but leave them intact in the URL.."""
|
|
2417
|
-
# Firstly, identify all the URL patterns.
|
|
2418
|
-
url_pattern = r"!\[.*?\]\((.*?)\)"
|
|
2419
|
-
# Matches both inline ($...$) and block ($$...$$) LaTeX equations:
|
|
2420
|
-
latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
|
|
2421
|
-
combined_pattern = f"({url_pattern})|({latex_pattern})"
|
|
2422
|
-
|
|
2423
|
-
parts = []
|
|
2424
|
-
last_end = 0
|
|
2425
|
-
|
|
2426
|
-
for match in re.finditer(combined_pattern, text):
|
|
2427
|
-
# Text to add before the URL (needs to be escaped)
|
|
2428
|
-
before_url = text[last_end : match.start()]
|
|
2429
|
-
parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))
|
|
2430
|
-
|
|
2431
|
-
# Add the full URL part (do not escape)
|
|
2432
|
-
parts.append(match.group(0))
|
|
2433
|
-
last_end = match.end()
|
|
2434
|
-
|
|
2435
|
-
# Add the final part of the text (which needs to be escaped)
|
|
2436
|
-
if last_end < len(text):
|
|
2437
|
-
parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))
|
|
2438
|
-
|
|
2439
|
-
return "".join(parts)
|
|
2440
|
-
|
|
2441
|
-
def _ingest_text(text: str, do_escape_html=True, do_escape_underscores=True):
|
|
2442
|
-
if do_escape_underscores and escaping_underscores:
|
|
2443
|
-
text = _escape_underscores(text)
|
|
2444
|
-
if do_escape_html:
|
|
2445
|
-
text = html.escape(text, quote=False)
|
|
2446
|
-
if text:
|
|
2447
|
-
components.append(text)
|
|
2672
|
+
ser_res = serializer.serialize()
|
|
2448
2673
|
|
|
2449
|
-
|
|
2450
|
-
|
|
2451
|
-
|
|
2452
|
-
|
|
2453
|
-
|
|
2454
|
-
|
|
2674
|
+
if delim != "\n\n":
|
|
2675
|
+
_logger.warning(
|
|
2676
|
+
"Parameter `delim` has been deprecated and will be ignored.",
|
|
2677
|
+
)
|
|
2678
|
+
if strict_text:
|
|
2679
|
+
_logger.warning(
|
|
2680
|
+
"Parameter `strict_text` has been deprecated and will be ignored.",
|
|
2455
2681
|
)
|
|
2456
|
-
):
|
|
2457
|
-
if item.self_ref in visited:
|
|
2458
|
-
continue
|
|
2459
|
-
else:
|
|
2460
|
-
visited.add(item.self_ref)
|
|
2461
|
-
|
|
2462
|
-
if ix < from_element or to_element <= ix:
|
|
2463
|
-
continue # skip as many items as you want
|
|
2464
|
-
|
|
2465
|
-
elif (isinstance(item, DocItem)) and (item.label not in labels):
|
|
2466
|
-
continue # skip any label that is not whitelisted
|
|
2467
|
-
|
|
2468
|
-
elif isinstance(item, GroupItem):
|
|
2469
|
-
if item.label in [
|
|
2470
|
-
GroupLabel.LIST,
|
|
2471
|
-
GroupLabel.ORDERED_LIST,
|
|
2472
|
-
]:
|
|
2473
|
-
comps = self._get_markdown_components(
|
|
2474
|
-
node=item,
|
|
2475
|
-
from_element=from_element,
|
|
2476
|
-
to_element=to_element,
|
|
2477
|
-
labels=labels,
|
|
2478
|
-
strict_text=strict_text,
|
|
2479
|
-
escaping_underscores=escaping_underscores,
|
|
2480
|
-
image_placeholder=image_placeholder,
|
|
2481
|
-
image_mode=image_mode,
|
|
2482
|
-
indent=indent,
|
|
2483
|
-
text_width=text_width,
|
|
2484
|
-
page_no=page_no,
|
|
2485
|
-
included_content_layers=included_content_layers,
|
|
2486
|
-
list_level=list_level + 1,
|
|
2487
|
-
is_inline_scope=is_inline_scope,
|
|
2488
|
-
visited=visited,
|
|
2489
|
-
)
|
|
2490
|
-
indent_str = list_level * indent * " "
|
|
2491
|
-
is_ol = item.label == GroupLabel.ORDERED_LIST
|
|
2492
|
-
text = "\n".join(
|
|
2493
|
-
[
|
|
2494
|
-
# avoid additional marker on already evaled sublists
|
|
2495
|
-
(
|
|
2496
|
-
c
|
|
2497
|
-
if c and c[0] == " "
|
|
2498
|
-
else f"{indent_str}{f'{i + 1}.' if is_ol else '-'} {c}"
|
|
2499
|
-
)
|
|
2500
|
-
for i, c in enumerate(comps)
|
|
2501
|
-
]
|
|
2502
|
-
)
|
|
2503
|
-
_ingest_text(
|
|
2504
|
-
text=text,
|
|
2505
|
-
# special chars have already been escaped as needed
|
|
2506
|
-
do_escape_html=False,
|
|
2507
|
-
do_escape_underscores=False,
|
|
2508
|
-
)
|
|
2509
|
-
elif item.label == GroupLabel.INLINE:
|
|
2510
|
-
comps = self._get_markdown_components(
|
|
2511
|
-
node=item,
|
|
2512
|
-
from_element=from_element,
|
|
2513
|
-
to_element=to_element,
|
|
2514
|
-
labels=labels,
|
|
2515
|
-
strict_text=strict_text,
|
|
2516
|
-
escaping_underscores=escaping_underscores,
|
|
2517
|
-
image_placeholder=image_placeholder,
|
|
2518
|
-
image_mode=image_mode,
|
|
2519
|
-
indent=indent,
|
|
2520
|
-
text_width=text_width,
|
|
2521
|
-
page_no=page_no,
|
|
2522
|
-
included_content_layers=included_content_layers,
|
|
2523
|
-
list_level=list_level,
|
|
2524
|
-
is_inline_scope=True,
|
|
2525
|
-
visited=visited,
|
|
2526
|
-
)
|
|
2527
|
-
text = " ".join(comps)
|
|
2528
|
-
_ingest_text(
|
|
2529
|
-
text=text,
|
|
2530
|
-
# special chars have already been escaped as needed
|
|
2531
|
-
do_escape_html=False,
|
|
2532
|
-
do_escape_underscores=False,
|
|
2533
|
-
)
|
|
2534
|
-
else:
|
|
2535
|
-
continue
|
|
2536
|
-
|
|
2537
|
-
elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
|
|
2538
|
-
marker = "" if strict_text else "#"
|
|
2539
|
-
text = f"{marker} {item.text}"
|
|
2540
|
-
_ingest_text(text.strip())
|
|
2541
|
-
|
|
2542
|
-
elif (
|
|
2543
|
-
isinstance(item, TextItem)
|
|
2544
|
-
and item.label in [DocItemLabel.SECTION_HEADER]
|
|
2545
|
-
) or isinstance(item, SectionHeaderItem):
|
|
2546
|
-
marker = ""
|
|
2547
|
-
if not strict_text:
|
|
2548
|
-
marker = "#" * level
|
|
2549
|
-
if len(marker) < 2:
|
|
2550
|
-
marker = "##"
|
|
2551
|
-
text = f"{marker} {item.text}"
|
|
2552
|
-
_ingest_text(text.strip())
|
|
2553
|
-
|
|
2554
|
-
elif isinstance(item, CodeItem):
|
|
2555
|
-
text = f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
|
|
2556
|
-
_ingest_text(text, do_escape_underscores=False, do_escape_html=False)
|
|
2557
|
-
|
|
2558
|
-
elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
|
|
2559
|
-
if item.text != "":
|
|
2560
|
-
_ingest_text(
|
|
2561
|
-
f"${item.text}$" if is_inline_scope else f"$${item.text}$$",
|
|
2562
|
-
do_escape_underscores=False,
|
|
2563
|
-
do_escape_html=False,
|
|
2564
|
-
)
|
|
2565
|
-
elif item.orig != "":
|
|
2566
|
-
_ingest_text(
|
|
2567
|
-
"<!-- formula-not-decoded -->",
|
|
2568
|
-
do_escape_underscores=False,
|
|
2569
|
-
do_escape_html=False,
|
|
2570
|
-
)
|
|
2571
|
-
|
|
2572
|
-
elif isinstance(item, TextItem):
|
|
2573
|
-
if len(item.text) and text_width > 0:
|
|
2574
|
-
text = item.text
|
|
2575
|
-
wrapped_text = textwrap.fill(text, width=text_width)
|
|
2576
|
-
_ingest_text(wrapped_text)
|
|
2577
|
-
elif len(item.text):
|
|
2578
|
-
_ingest_text(item.text)
|
|
2579
|
-
|
|
2580
|
-
elif isinstance(item, TableItem) and not strict_text:
|
|
2581
|
-
if caption_text := item.caption_text(self):
|
|
2582
|
-
_ingest_text(caption_text)
|
|
2583
|
-
md_table = item.export_to_markdown()
|
|
2584
|
-
_ingest_text(md_table)
|
|
2585
|
-
|
|
2586
|
-
elif isinstance(item, PictureItem) and not strict_text:
|
|
2587
|
-
_ingest_text(item.caption_text(self))
|
|
2588
|
-
|
|
2589
|
-
line = item.export_to_markdown(
|
|
2590
|
-
doc=self,
|
|
2591
|
-
image_placeholder=image_placeholder,
|
|
2592
|
-
image_mode=image_mode,
|
|
2593
|
-
)
|
|
2594
|
-
|
|
2595
|
-
_ingest_text(line, do_escape_html=False, do_escape_underscores=False)
|
|
2596
|
-
|
|
2597
|
-
elif isinstance(item, (KeyValueItem, FormItem)):
|
|
2598
|
-
text = item._export_to_markdown()
|
|
2599
|
-
_ingest_text(text, do_escape_html=False, do_escape_underscores=False)
|
|
2600
|
-
|
|
2601
|
-
elif isinstance(item, DocItem):
|
|
2602
|
-
text = "<!-- missing-text -->"
|
|
2603
|
-
_ingest_text(text, do_escape_html=False, do_escape_underscores=False)
|
|
2604
2682
|
|
|
2605
|
-
return
|
|
2683
|
+
return ser_res.text
|
|
2606
2684
|
|
|
2607
2685
|
def export_to_text( # noqa: C901
|
|
2608
2686
|
self,
|
|
2609
2687
|
delim: str = "\n\n",
|
|
2610
2688
|
from_element: int = 0,
|
|
2611
2689
|
to_element: int = 1000000,
|
|
2612
|
-
labels: set[DocItemLabel] =
|
|
2690
|
+
labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
|
|
2613
2691
|
) -> str:
|
|
2614
2692
|
"""export_to_text."""
|
|
2615
2693
|
return self.export_to_markdown(
|
|
@@ -2936,7 +3014,378 @@ class DoclingDocument(BaseModel):
|
|
|
2936
3014
|
|
|
2937
3015
|
return html_text
|
|
2938
3016
|
|
|
2939
|
-
def
|
|
3017
|
+
def load_from_doctags(  # noqa: C901
    self,
    doctag_document: DocTagsDocument,
) -> "DoclingDocument":
    r"""Populate this document from the pages of a DocTags document.

    Every page of ``doctag_document`` provides a DocTags string (``tokens``)
    and an optional page image (``image``). Each recognized root-level
    ``<tag>...</tag>`` chunk is converted, in order of appearance, into a
    table, a picture (with optional caption), a list group with its items,
    or a text item carrying the label mapped from the tag name.

    Location tokens (``<loc_N>``) are only interpreted when the page has an
    image; they are divided by 500 and then scaled to the image size.

    Args:
        doctag_document: Container whose ``pages`` are iterated; pages are
            numbered starting at 1 in the generated provenance.

    Returns:
        This document (``self``), with the parsed items appended.
    """
    # Maps the recognized tag to a Docling label.
    # Tags missing from this mapping fall back to DocItemLabel.PARAGRAPH.
    tag_to_doclabel = {
        "title": DocItemLabel.TITLE,
        "document_index": DocItemLabel.DOCUMENT_INDEX,
        "otsl": DocItemLabel.TABLE,
        "section_header_level_1": DocItemLabel.SECTION_HEADER,
        "checkbox_selected": DocItemLabel.CHECKBOX_SELECTED,
        "checkbox_unselected": DocItemLabel.CHECKBOX_UNSELECTED,
        "text": DocItemLabel.TEXT,
        "page_header": DocItemLabel.PAGE_HEADER,
        "page_footer": DocItemLabel.PAGE_FOOTER,
        "formula": DocItemLabel.FORMULA,
        "caption": DocItemLabel.CAPTION,
        "picture": DocItemLabel.PICTURE,
        "list_item": DocItemLabel.LIST_ITEM,
        "footnote": DocItemLabel.FOOTNOTE,
        "code": DocItemLabel.CODE,
    }

    def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
        """Extract <loc_...> coords from the chunk, normalized by / 500.

        Returns None unless the chunk holds exactly four location tokens.
        """
        coords = re.findall(r"<loc_(\d+)>", text_chunk)
        if len(coords) == 4:
            l, t, r, b = map(float, coords)
            return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
        return None

    def extract_inner_text(text_chunk: str) -> str:
        """Strip all <...> tags inside the chunk to get the raw text content."""
        return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()

    def otsl_parse_texts(texts, tokens):
        """Build table cells from OTSL tokens.

        ``texts`` is the mixed sequence of cell tokens and cell contents;
        ``tokens`` holds the tokens only and is split into rows at every
        new-line token. Returns ``(table_cells, split_row_tokens)``.
        """
        split_word = TableToken.OTSL_NL.value
        split_row_tokens = [
            list(y)
            for x, y in itertools.groupby(tokens, lambda z: z == split_word)
            if not x
        ]
        table_cells = []
        r_idx = 0
        c_idx = 0

        # Tokens that open a new cell (and therefore create a TableCell).
        cell_start_tokens = [
            TableToken.OTSL_FCEL.value,
            TableToken.OTSL_ECEL.value,
            TableToken.OTSL_CHED.value,
            TableToken.OTSL_RHED.value,
            TableToken.OTSL_SROW.value,
        ]
        # Tokens that only mark a grid position as covered by a span.
        horizontal_span_tokens = [
            TableToken.OTSL_LCEL.value,
            TableToken.OTSL_XCEL.value,
        ]
        vertical_span_tokens = [
            TableToken.OTSL_UCEL.value,
            TableToken.OTSL_XCEL.value,
        ]
        # Every token that occupies one column of the grid.
        column_tokens = cell_start_tokens + [
            TableToken.OTSL_LCEL.value,
            TableToken.OTSL_UCEL.value,
            TableToken.OTSL_XCEL.value,
        ]

        def count_right(tokens, c_idx, r_idx, which_tokens):
            """Count consecutive `which_tokens` in row r_idx from c_idx on."""
            span = 0
            row = tokens[r_idx]
            # Bounds are checked first so short rows end the span instead
            # of raising IndexError.
            while c_idx + span < len(row) and row[c_idx + span] in which_tokens:
                span += 1
            return span

        def count_down(tokens, c_idx, r_idx, which_tokens):
            """Count consecutive `which_tokens` in column c_idx from r_idx on."""
            span = 0
            # Rows may have different lengths; a row that is too short to
            # have column c_idx terminates the span.
            while (
                r_idx + span < len(tokens)
                and c_idx < len(tokens[r_idx + span])
                and tokens[r_idx + span][c_idx] in which_tokens
            ):
                span += 1
            return span

        for i, text in enumerate(texts):
            cell_text = ""
            if text in cell_start_tokens:
                row_span = 1
                col_span = 1
                right_offset = 1
                if text != TableToken.OTSL_ECEL.value:
                    # Non-empty cells are followed by their text content.
                    cell_text = texts[i + 1]
                    right_offset = 2

                # Check next element(s) for lcel / ucel / xcel,
                # set properly row_span, col_span
                next_right_cell = ""
                if i + right_offset < len(texts):
                    next_right_cell = texts[i + right_offset]

                next_bottom_cell = ""
                if r_idx + 1 < len(split_row_tokens):
                    if c_idx < len(split_row_tokens[r_idx + 1]):
                        next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]

                if next_right_cell in horizontal_span_tokens:
                    # we have a horizontal spanning cell or 2d spanning cell
                    col_span += count_right(
                        split_row_tokens,
                        c_idx + 1,
                        r_idx,
                        horizontal_span_tokens,
                    )
                if next_bottom_cell in vertical_span_tokens:
                    # we have a vertical spanning cell or 2d spanning cell
                    row_span += count_down(
                        split_row_tokens,
                        c_idx,
                        r_idx + 1,
                        vertical_span_tokens,
                    )

                table_cells.append(
                    TableCell(
                        text=cell_text.strip(),
                        row_span=row_span,
                        col_span=col_span,
                        start_row_offset_idx=r_idx,
                        end_row_offset_idx=r_idx + row_span,
                        start_col_offset_idx=c_idx,
                        end_col_offset_idx=c_idx + col_span,
                    )
                )
            if text in column_tokens:
                c_idx += 1
            if text == TableToken.OTSL_NL.value:
                r_idx += 1
                c_idx = 0
        return table_cells, split_row_tokens

    def otsl_extract_tokens_and_text(s: str):
        """Split an OTSL chunk into its tokens and its mixed token/text parts.

        Location tokens and the enclosing <otsl> / </otsl> tags are dropped
        from both results.
        """
        # Pattern to match anything enclosed by < >
        # (including the angle brackets themselves)
        pattern = r"(<[^>]+>)"
        otsl_wrappers = [
            rf"<{DocumentToken.OTSL.value}>",
            rf"</{DocumentToken.OTSL.value}>",
        ]

        def is_dropped(part: str) -> bool:
            return (
                part.startswith(rf"<{DocumentToken.LOC.value}")
                or part in otsl_wrappers
            )

        # Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
        tokens = [
            token for token in re.findall(pattern, s) if not is_dropped(token)
        ]
        # Split the string by those tokens to get the in-between text;
        # the capturing group keeps the tokens themselves in the result.
        text_parts = [part for part in re.split(pattern, s) if not is_dropped(part)]
        # Remove any empty or purely whitespace strings from text_parts
        text_parts = [part for part in text_parts if part.strip()]

        return tokens, text_parts

    def parse_table_content(otsl_content: str) -> TableData:
        """Convert one <otsl>...</otsl> chunk into TableData."""
        tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
        table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)

        return TableData(
            num_rows=len(split_row_tokens),
            num_cols=(
                max(len(row) for row in split_row_tokens) if split_row_tokens else 0
            ),
            table_cells=table_cells,
        )

    # Regex for root level recognized tags. It does not depend on the page,
    # so it is compiled once for the whole document.
    tag_pattern = (
        rf"<(?P<tag>{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|"
        rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|"
        rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|"
        rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
        rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
        rf"{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
        rf"{DocItemLabel.SECTION_HEADER}_level_1|"
        rf"{DocumentToken.ORDERED_LIST.value}|"
        rf"{DocumentToken.UNORDERED_LIST.value}|"
        rf"{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
    )
    pattern = re.compile(tag_pattern, re.DOTALL)

    # Regex for the items nested inside a list chunk.
    list_item_pattern = rf"<(?P<tag>{DocItemLabel.LIST_ITEM})>.*?</(?P=tag)>"
    li_pattern = re.compile(list_item_pattern, re.DOTALL)

    for pg_idx, doctag_page in enumerate(doctag_document.pages):
        page_doctags = doctag_page.tokens
        image = doctag_page.image

        page_no = pg_idx + 1

        # Without an image the page size defaults to 1 x 1.
        if image is not None:
            pg_width = image.width
            pg_height = image.height
        else:
            pg_width = 1
            pg_height = 1

        # For this page:
        # 1. Find all <tag>...</tag> blocks in the entire string
        #    (multi-line friendly), in the order they appear.
        # 2. For each chunk, extract the bounding box (if any) and inner text.
        # 3. Add the item to this document with the right label.
        for match in pattern.finditer(page_doctags):
            full_chunk = match.group(0)
            tag_name = match.group("tag")

            # Locations are only read when the page image is available.
            bbox = extract_bounding_box(full_chunk) if image else None
            doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)

            if tag_name == DocumentToken.OTSL.value:
                table_data = parse_table_content(full_chunk)

                if bbox:
                    prov = ProvenanceItem(
                        bbox=bbox.resize_by_scale(pg_width, pg_height),
                        charspan=(0, 0),
                        page_no=page_no,
                    )
                    self.add_table(data=table_data, prov=prov)
                else:
                    self.add_table(data=table_data)

            elif tag_name == DocItemLabel.PICTURE:
                text_caption_content = extract_inner_text(full_chunk)
                if image:
                    if bbox:
                        im_width, im_height = image.size

                        # bbox is normalized, so scale it to pixel coordinates
                        # before cropping the picture out of the page image.
                        crop_box = (
                            int(bbox.l * im_width),
                            int(bbox.t * im_height),
                            int(bbox.r * im_width),
                            int(bbox.b * im_height),
                        )
                        cropped_image = image.crop(crop_box)
                        pic = self.add_picture(
                            parent=None,
                            image=ImageRef.from_pil(image=cropped_image, dpi=72),
                            prov=(
                                ProvenanceItem(
                                    bbox=bbox.resize_by_scale(pg_width, pg_height),
                                    charspan=(0, 0),
                                    page_no=page_no,
                                )
                            ),
                        )
                        # If there is a caption to an image, add it as well
                        if len(text_caption_content) > 0:
                            caption_item = self.add_text(
                                label=DocItemLabel.CAPTION,
                                text=text_caption_content,
                                parent=None,
                            )
                            pic.captions.append(caption_item.get_ref())
                else:
                    # NOTE(review): bbox is None whenever `image` is falsy
                    # (see the assignment above), so this branch looks
                    # unreachable at the moment - confirm the intent.
                    if bbox:
                        # In case we don't have access to a binary of an image.
                        # The result must be bound to `pic`, otherwise the
                        # caption handling below raises NameError.
                        pic = self.add_picture(
                            parent=None,
                            prov=ProvenanceItem(
                                bbox=bbox, charspan=(0, 0), page_no=page_no
                            ),
                        )
                        # If there is a caption to an image, add it as well
                        if len(text_caption_content) > 0:
                            caption_item = self.add_text(
                                label=DocItemLabel.CAPTION,
                                text=text_caption_content,
                                parent=None,
                            )
                            pic.captions.append(caption_item.get_ref())
            elif tag_name in [
                DocumentToken.ORDERED_LIST.value,
                DocumentToken.UNORDERED_LIST.value,
            ]:
                is_ordered = tag_name == DocumentToken.ORDERED_LIST.value
                list_label = GroupLabel.ORDERED_LIST if is_ordered else GroupLabel.LIST
                enum_marker = ""
                enum_value = 0

                # Add list group:
                new_list = self.add_group(label=list_label, name="list")
                # Process list items
                for li_match in li_pattern.finditer(full_chunk):
                    enum_value += 1
                    if is_ordered:
                        # Ordered items get "1.", "2.", ... markers;
                        # unordered items keep the empty marker.
                        enum_marker = str(enum_value) + "."

                    li_full_chunk = li_match.group(0)
                    li_bbox = extract_bounding_box(li_full_chunk) if image else None
                    text_content = extract_inner_text(li_full_chunk)
                    # Add list item
                    self.add_list_item(
                        marker=enum_marker,
                        enumerated=is_ordered,
                        parent=new_list,
                        text=text_content,
                        prov=(
                            ProvenanceItem(
                                bbox=li_bbox.resize_by_scale(pg_width, pg_height),
                                charspan=(0, len(text_content)),
                                page_no=page_no,
                            )
                            if li_bbox
                            else None
                        ),
                    )
            else:
                # For everything else, treat as text
                text_content = extract_inner_text(full_chunk)
                self.add_text(
                    label=doc_label,
                    text=text_content,
                    prov=(
                        ProvenanceItem(
                            bbox=bbox.resize_by_scale(pg_width, pg_height),
                            charspan=(0, len(text_content)),
                            page_no=page_no,
                        )
                        if bbox
                        else None
                    ),
                )
    return self
|
|
3382
|
+
|
|
3383
|
+
@deprecated("Use save_as_doctags instead.")
def save_as_document_tokens(self, *args, **kwargs):
    r"""Forward all arguments to ``save_as_doctags`` (deprecated alias)."""
    # Kept only for backward compatibility: positional and keyword
    # arguments are passed through unchanged and the result is returned.
    result = self.save_as_doctags(*args, **kwargs)
    return result
|
|
3387
|
+
|
|
3388
|
+
def save_as_doctags(
|
|
2940
3389
|
self,
|
|
2941
3390
|
filename: Path,
|
|
2942
3391
|
delim: str = "",
|
|
@@ -2952,7 +3401,7 @@ class DoclingDocument(BaseModel):
|
|
|
2952
3401
|
add_table_cell_location: bool = False,
|
|
2953
3402
|
add_table_cell_text: bool = True,
|
|
2954
3403
|
):
|
|
2955
|
-
r"""Save the document content to
|
|
3404
|
+
r"""Save the document content to DocTags format."""
|
|
2956
3405
|
out = self.export_to_document_tokens(
|
|
2957
3406
|
delim=delim,
|
|
2958
3407
|
from_element=from_element,
|