docling-core 2.22.0__py3-none-any.whl → 2.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/experimental/__init__.py +6 -0
- docling_core/experimental/serializer/__init__.py +6 -0
- docling_core/experimental/serializer/base.py +227 -0
- docling_core/experimental/serializer/common.py +353 -0
- docling_core/experimental/serializer/markdown.py +461 -0
- docling_core/types/doc/document.py +347 -327
- docling_core/types/doc/page.py +1238 -0
- {docling_core-2.22.0.dist-info → docling_core-2.23.0.dist-info}/METADATA +1 -1
- {docling_core-2.22.0.dist-info → docling_core-2.23.0.dist-info}/RECORD +12 -6
- {docling_core-2.22.0.dist-info → docling_core-2.23.0.dist-info}/LICENSE +0 -0
- {docling_core-2.22.0.dist-info → docling_core-2.23.0.dist-info}/WHEEL +0 -0
- {docling_core-2.22.0.dist-info → docling_core-2.23.0.dist-info}/entry_points.txt +0 -0
|
@@ -11,7 +11,6 @@ import mimetypes
|
|
|
11
11
|
import os
|
|
12
12
|
import re
|
|
13
13
|
import sys
|
|
14
|
-
import textwrap
|
|
15
14
|
import typing
|
|
16
15
|
import warnings
|
|
17
16
|
from enum import Enum
|
|
@@ -62,7 +61,7 @@ _logger = logging.getLogger(__name__)
|
|
|
62
61
|
|
|
63
62
|
Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
|
|
64
63
|
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
|
|
65
|
-
CURRENT_VERSION: Final = "1.
|
|
64
|
+
CURRENT_VERSION: Final = "1.3.0"
|
|
66
65
|
|
|
67
66
|
DEFAULT_EXPORT_LABELS = {
|
|
68
67
|
DocItemLabel.TITLE,
|
|
@@ -87,6 +86,8 @@ DOCUMENT_TOKENS_EXPORT_LABELS.update(
|
|
|
87
86
|
[
|
|
88
87
|
DocItemLabel.FOOTNOTE,
|
|
89
88
|
DocItemLabel.CAPTION,
|
|
89
|
+
DocItemLabel.KEY_VALUE_REGION,
|
|
90
|
+
DocItemLabel.FORM,
|
|
90
91
|
]
|
|
91
92
|
)
|
|
92
93
|
|
|
@@ -607,9 +608,30 @@ class GroupItem(NodeItem): # Container type, can't be a leaf node
|
|
|
607
608
|
"group" # Name of the group, e.g. "Introduction Chapter",
|
|
608
609
|
# "Slide 5", "Navigation menu list", ...
|
|
609
610
|
)
|
|
611
|
+
# TODO narrow down to allowed values, i.e. excluding those used for subtypes
|
|
610
612
|
label: GroupLabel = GroupLabel.UNSPECIFIED
|
|
611
613
|
|
|
612
614
|
|
|
615
|
+
class UnorderedList(GroupItem):
|
|
616
|
+
"""UnorderedList."""
|
|
617
|
+
|
|
618
|
+
label: typing.Literal[GroupLabel.LIST] = GroupLabel.LIST # type: ignore[assignment]
|
|
619
|
+
|
|
620
|
+
|
|
621
|
+
class OrderedList(GroupItem):
|
|
622
|
+
"""OrderedList."""
|
|
623
|
+
|
|
624
|
+
label: typing.Literal[GroupLabel.ORDERED_LIST] = (
|
|
625
|
+
GroupLabel.ORDERED_LIST # type: ignore[assignment]
|
|
626
|
+
)
|
|
627
|
+
|
|
628
|
+
|
|
629
|
+
class InlineGroup(GroupItem):
|
|
630
|
+
"""InlineGroup."""
|
|
631
|
+
|
|
632
|
+
label: typing.Literal[GroupLabel.INLINE] = GroupLabel.INLINE
|
|
633
|
+
|
|
634
|
+
|
|
613
635
|
class DocItem(
|
|
614
636
|
NodeItem
|
|
615
637
|
): # Base type for any element that carries content, can be a leaf node
|
|
@@ -670,6 +692,15 @@ class DocItem(
|
|
|
670
692
|
return page_image.crop(crop_bbox.as_tuple())
|
|
671
693
|
|
|
672
694
|
|
|
695
|
+
class Formatting(BaseModel):
|
|
696
|
+
"""Formatting."""
|
|
697
|
+
|
|
698
|
+
bold: bool = False
|
|
699
|
+
italic: bool = False
|
|
700
|
+
underline: bool = False
|
|
701
|
+
strikethrough: bool = False
|
|
702
|
+
|
|
703
|
+
|
|
673
704
|
class TextItem(DocItem):
|
|
674
705
|
"""TextItem."""
|
|
675
706
|
|
|
@@ -678,18 +709,19 @@ class TextItem(DocItem):
|
|
|
678
709
|
DocItemLabel.CHECKBOX_SELECTED,
|
|
679
710
|
DocItemLabel.CHECKBOX_UNSELECTED,
|
|
680
711
|
DocItemLabel.FOOTNOTE,
|
|
681
|
-
DocItemLabel.FORMULA,
|
|
682
712
|
DocItemLabel.PAGE_FOOTER,
|
|
683
713
|
DocItemLabel.PAGE_HEADER,
|
|
684
714
|
DocItemLabel.PARAGRAPH,
|
|
685
715
|
DocItemLabel.REFERENCE,
|
|
686
716
|
DocItemLabel.TEXT,
|
|
687
|
-
DocItemLabel.TITLE,
|
|
688
717
|
]
|
|
689
718
|
|
|
690
719
|
orig: str # untreated representation
|
|
691
720
|
text: str # sanitized representation
|
|
692
721
|
|
|
722
|
+
formatting: Optional[Formatting] = None
|
|
723
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None
|
|
724
|
+
|
|
693
725
|
def export_to_document_tokens(
|
|
694
726
|
self,
|
|
695
727
|
doc: "DoclingDocument",
|
|
@@ -727,6 +759,14 @@ class TextItem(DocItem):
|
|
|
727
759
|
return body
|
|
728
760
|
|
|
729
761
|
|
|
762
|
+
class TitleItem(TextItem):
|
|
763
|
+
"""TitleItem."""
|
|
764
|
+
|
|
765
|
+
label: typing.Literal[DocItemLabel.TITLE] = (
|
|
766
|
+
DocItemLabel.TITLE # type: ignore[assignment]
|
|
767
|
+
)
|
|
768
|
+
|
|
769
|
+
|
|
730
770
|
class SectionHeaderItem(TextItem):
|
|
731
771
|
"""SectionItem."""
|
|
732
772
|
|
|
@@ -862,6 +902,14 @@ class CodeItem(FloatingItem, TextItem):
|
|
|
862
902
|
return body
|
|
863
903
|
|
|
864
904
|
|
|
905
|
+
class FormulaItem(TextItem):
|
|
906
|
+
"""FormulaItem."""
|
|
907
|
+
|
|
908
|
+
label: typing.Literal[DocItemLabel.FORMULA] = (
|
|
909
|
+
DocItemLabel.FORMULA # type: ignore[assignment]
|
|
910
|
+
)
|
|
911
|
+
|
|
912
|
+
|
|
865
913
|
class PictureItem(FloatingItem):
|
|
866
914
|
"""PictureItem."""
|
|
867
915
|
|
|
@@ -900,54 +948,34 @@ class PictureItem(FloatingItem):
|
|
|
900
948
|
def export_to_markdown(
|
|
901
949
|
self,
|
|
902
950
|
doc: "DoclingDocument",
|
|
903
|
-
add_caption: bool = True,
|
|
951
|
+
add_caption: bool = True, # deprecated
|
|
904
952
|
image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
|
|
905
953
|
image_placeholder: str = "<!-- image -->",
|
|
906
954
|
) -> str:
|
|
907
955
|
"""Export picture to Markdown format."""
|
|
908
|
-
|
|
909
|
-
error_response = (
|
|
910
|
-
"<!-- 🖼️❌ Image not available. "
|
|
911
|
-
"Please use `PdfPipelineOptions(generate_picture_images=True)`"
|
|
912
|
-
" -->"
|
|
913
|
-
)
|
|
956
|
+
from docling_core.experimental.serializer.markdown import MarkdownDocSerializer
|
|
914
957
|
|
|
915
|
-
if
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
# short-cut: we already have the image in base64
|
|
921
|
-
if (
|
|
922
|
-
isinstance(self.image, ImageRef)
|
|
923
|
-
and isinstance(self.image.uri, AnyUrl)
|
|
924
|
-
and self.image.uri.scheme == "data"
|
|
925
|
-
):
|
|
926
|
-
text = f"![Image]({self.image.uri})"
|
|
927
|
-
return text
|
|
928
|
-
|
|
929
|
-
# get the self.image._pil or crop it out of the page-image
|
|
930
|
-
img = self.get_image(doc)
|
|
931
|
-
|
|
932
|
-
if img is not None:
|
|
933
|
-
imgb64 = self._image_to_base64(img)
|
|
934
|
-
text = f"![Image](data:image/png;base64,{imgb64})"
|
|
935
|
-
|
|
936
|
-
return text
|
|
937
|
-
else:
|
|
938
|
-
return error_response
|
|
939
|
-
|
|
940
|
-
elif image_mode == ImageRefMode.REFERENCED:
|
|
941
|
-
if not isinstance(self.image, ImageRef) or (
|
|
942
|
-
isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "data"
|
|
943
|
-
):
|
|
944
|
-
return default_response
|
|
945
|
-
|
|
946
|
-
text = f"![Image]({quote(str(self.image.uri))})"
|
|
947
|
-
return text
|
|
958
|
+
if not add_caption:
|
|
959
|
+
_logger.warning(
|
|
960
|
+
"Argument `add_caption` is deprecated and will be ignored.",
|
|
961
|
+
)
|
|
948
962
|
|
|
949
|
-
|
|
950
|
-
|
|
963
|
+
serializer = MarkdownDocSerializer(
|
|
964
|
+
doc=self,
|
|
965
|
+
image_mode=image_mode,
|
|
966
|
+
)
|
|
967
|
+
text = (
|
|
968
|
+
serializer.picture_serializer.serialize(
|
|
969
|
+
item=self,
|
|
970
|
+
doc_serializer=serializer,
|
|
971
|
+
doc=doc,
|
|
972
|
+
image_mode=image_mode,
|
|
973
|
+
image_placeholder=image_placeholder,
|
|
974
|
+
).text
|
|
975
|
+
if serializer.picture_serializer
|
|
976
|
+
else ""
|
|
977
|
+
)
|
|
978
|
+
return text
|
|
951
979
|
|
|
952
980
|
def export_to_html(
|
|
953
981
|
self,
|
|
@@ -1136,33 +1164,58 @@ class TableItem(FloatingItem):
|
|
|
1136
1164
|
|
|
1137
1165
|
return df
|
|
1138
1166
|
|
|
1139
|
-
def export_to_markdown(self) -> str:
|
|
1167
|
+
def export_to_markdown(self, doc: Optional["DoclingDocument"] = None) -> str:
|
|
1140
1168
|
"""Export the table as markdown."""
|
|
1141
|
-
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
|
|
1169
|
+
if doc is not None:
|
|
1170
|
+
from docling_core.experimental.serializer.markdown import (
|
|
1171
|
+
MarkdownDocSerializer,
|
|
1172
|
+
)
|
|
1145
1173
|
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
|
|
1174
|
+
serializer = MarkdownDocSerializer(
|
|
1175
|
+
doc=doc,
|
|
1176
|
+
)
|
|
1177
|
+
text = (
|
|
1178
|
+
serializer.table_serializer.serialize(
|
|
1179
|
+
item=self,
|
|
1180
|
+
doc_serializer=serializer,
|
|
1181
|
+
doc=doc,
|
|
1182
|
+
).text
|
|
1183
|
+
if serializer.table_serializer
|
|
1184
|
+
else ""
|
|
1185
|
+
)
|
|
1186
|
+
return text
|
|
1187
|
+
else:
|
|
1188
|
+
_logger.warning(
|
|
1189
|
+
"Usage of TableItem.export_to_markdown() without `doc` argument is "
|
|
1190
|
+
"deprecated.",
|
|
1191
|
+
)
|
|
1151
1192
|
|
|
1152
|
-
table
|
|
1193
|
+
table = []
|
|
1194
|
+
for row in self.data.grid:
|
|
1195
|
+
tmp = []
|
|
1196
|
+
for col in row:
|
|
1197
|
+
|
|
1198
|
+
# make sure that md tables are not broken
|
|
1199
|
+
# due to newline chars in the text
|
|
1200
|
+
text = col.text
|
|
1201
|
+
text = text.replace("\n", " ")
|
|
1202
|
+
tmp.append(text)
|
|
1203
|
+
|
|
1204
|
+
table.append(tmp)
|
|
1205
|
+
|
|
1206
|
+
res = ""
|
|
1207
|
+
if len(table) > 1 and len(table[0]) > 0:
|
|
1208
|
+
try:
|
|
1209
|
+
res = tabulate(table[1:], headers=table[0], tablefmt="github")
|
|
1210
|
+
except ValueError:
|
|
1211
|
+
res = tabulate(
|
|
1212
|
+
table[1:],
|
|
1213
|
+
headers=table[0],
|
|
1214
|
+
tablefmt="github",
|
|
1215
|
+
disable_numparse=True,
|
|
1216
|
+
)
|
|
1153
1217
|
|
|
1154
|
-
|
|
1155
|
-
if len(table) > 1 and len(table[0]) > 0:
|
|
1156
|
-
try:
|
|
1157
|
-
md_table = tabulate(table[1:], headers=table[0], tablefmt="github")
|
|
1158
|
-
except ValueError:
|
|
1159
|
-
md_table = tabulate(
|
|
1160
|
-
table[1:],
|
|
1161
|
-
headers=table[0],
|
|
1162
|
-
tablefmt="github",
|
|
1163
|
-
disable_numparse=True,
|
|
1164
|
-
)
|
|
1165
|
-
return md_table
|
|
1218
|
+
return res
|
|
1166
1219
|
|
|
1167
1220
|
def export_to_html(
|
|
1168
1221
|
self,
|
|
@@ -1455,10 +1508,6 @@ class KeyValueItem(FloatingItem):
|
|
|
1455
1508
|
|
|
1456
1509
|
graph: GraphData
|
|
1457
1510
|
|
|
1458
|
-
def _export_to_markdown(self) -> str:
|
|
1459
|
-
# TODO add actual implementation
|
|
1460
|
-
return "<!-- missing-key-value-item -->"
|
|
1461
|
-
|
|
1462
1511
|
|
|
1463
1512
|
class FormItem(FloatingItem):
|
|
1464
1513
|
"""FormItem."""
|
|
@@ -1467,17 +1516,15 @@ class FormItem(FloatingItem):
|
|
|
1467
1516
|
|
|
1468
1517
|
graph: GraphData
|
|
1469
1518
|
|
|
1470
|
-
def _export_to_markdown(self) -> str:
|
|
1471
|
-
# TODO add actual implementation
|
|
1472
|
-
return "<!-- missing-form-item -->"
|
|
1473
|
-
|
|
1474
1519
|
|
|
1475
1520
|
ContentItem = Annotated[
|
|
1476
1521
|
Union[
|
|
1477
1522
|
TextItem,
|
|
1523
|
+
TitleItem,
|
|
1478
1524
|
SectionHeaderItem,
|
|
1479
1525
|
ListItem,
|
|
1480
1526
|
CodeItem,
|
|
1527
|
+
FormulaItem,
|
|
1481
1528
|
PictureItem,
|
|
1482
1529
|
TableItem,
|
|
1483
1530
|
KeyValueItem,
|
|
@@ -1588,8 +1635,10 @@ class DoclingDocument(BaseModel):
|
|
|
1588
1635
|
) # List[RefItem] = []
|
|
1589
1636
|
body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []
|
|
1590
1637
|
|
|
1591
|
-
groups: List[GroupItem] = []
|
|
1592
|
-
texts: List[
|
|
1638
|
+
groups: List[Union[OrderedList, UnorderedList, InlineGroup, GroupItem]] = []
|
|
1639
|
+
texts: List[
|
|
1640
|
+
Union[TitleItem, SectionHeaderItem, ListItem, CodeItem, FormulaItem, TextItem]
|
|
1641
|
+
] = []
|
|
1593
1642
|
pictures: List[PictureItem] = []
|
|
1594
1643
|
tables: List[TableItem] = []
|
|
1595
1644
|
key_value_items: List[KeyValueItem] = []
|
|
@@ -1613,6 +1662,68 @@ class DoclingDocument(BaseModel):
|
|
|
1613
1662
|
item["content_layer"] = "furniture"
|
|
1614
1663
|
return data
|
|
1615
1664
|
|
|
1665
|
+
###################################
|
|
1666
|
+
# TODO: refactor add* methods below
|
|
1667
|
+
###################################
|
|
1668
|
+
|
|
1669
|
+
def add_ordered_list(
|
|
1670
|
+
self,
|
|
1671
|
+
name: Optional[str] = None,
|
|
1672
|
+
parent: Optional[NodeItem] = None,
|
|
1673
|
+
content_layer: Optional[ContentLayer] = None,
|
|
1674
|
+
) -> GroupItem:
|
|
1675
|
+
"""add_ordered_list."""
|
|
1676
|
+
_parent = parent or self.body
|
|
1677
|
+
cref = f"#/groups/{len(self.groups)}"
|
|
1678
|
+
group = OrderedList(self_ref=cref, parent=_parent.get_ref())
|
|
1679
|
+
if name is not None:
|
|
1680
|
+
group.name = name
|
|
1681
|
+
if content_layer:
|
|
1682
|
+
group.content_layer = content_layer
|
|
1683
|
+
|
|
1684
|
+
self.groups.append(group)
|
|
1685
|
+
_parent.children.append(RefItem(cref=cref))
|
|
1686
|
+
return group
|
|
1687
|
+
|
|
1688
|
+
def add_unordered_list(
|
|
1689
|
+
self,
|
|
1690
|
+
name: Optional[str] = None,
|
|
1691
|
+
parent: Optional[NodeItem] = None,
|
|
1692
|
+
content_layer: Optional[ContentLayer] = None,
|
|
1693
|
+
) -> GroupItem:
|
|
1694
|
+
"""add_unordered_list."""
|
|
1695
|
+
_parent = parent or self.body
|
|
1696
|
+
cref = f"#/groups/{len(self.groups)}"
|
|
1697
|
+
group = UnorderedList(self_ref=cref, parent=_parent.get_ref())
|
|
1698
|
+
if name is not None:
|
|
1699
|
+
group.name = name
|
|
1700
|
+
if content_layer:
|
|
1701
|
+
group.content_layer = content_layer
|
|
1702
|
+
|
|
1703
|
+
self.groups.append(group)
|
|
1704
|
+
_parent.children.append(RefItem(cref=cref))
|
|
1705
|
+
return group
|
|
1706
|
+
|
|
1707
|
+
def add_inline_group(
|
|
1708
|
+
self,
|
|
1709
|
+
name: Optional[str] = None,
|
|
1710
|
+
parent: Optional[NodeItem] = None,
|
|
1711
|
+
content_layer: Optional[ContentLayer] = None,
|
|
1712
|
+
# marker: Optional[UnorderedList.ULMarker] = None,
|
|
1713
|
+
) -> GroupItem:
|
|
1714
|
+
"""add_inline_group."""
|
|
1715
|
+
_parent = parent or self.body
|
|
1716
|
+
cref = f"#/groups/{len(self.groups)}"
|
|
1717
|
+
group = InlineGroup(self_ref=cref, parent=_parent.get_ref())
|
|
1718
|
+
if name is not None:
|
|
1719
|
+
group.name = name
|
|
1720
|
+
if content_layer:
|
|
1721
|
+
group.content_layer = content_layer
|
|
1722
|
+
|
|
1723
|
+
self.groups.append(group)
|
|
1724
|
+
_parent.children.append(RefItem(cref=cref))
|
|
1725
|
+
return group
|
|
1726
|
+
|
|
1616
1727
|
def add_group(
|
|
1617
1728
|
self,
|
|
1618
1729
|
label: Optional[GroupLabel] = None,
|
|
@@ -1627,6 +1738,25 @@ class DoclingDocument(BaseModel):
|
|
|
1627
1738
|
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1628
1739
|
|
|
1629
1740
|
"""
|
|
1741
|
+
if label == GroupLabel.LIST:
|
|
1742
|
+
return self.add_unordered_list(
|
|
1743
|
+
name=name,
|
|
1744
|
+
parent=parent,
|
|
1745
|
+
content_layer=content_layer,
|
|
1746
|
+
)
|
|
1747
|
+
elif label == GroupLabel.ORDERED_LIST:
|
|
1748
|
+
return self.add_ordered_list(
|
|
1749
|
+
name=name,
|
|
1750
|
+
parent=parent,
|
|
1751
|
+
content_layer=content_layer,
|
|
1752
|
+
)
|
|
1753
|
+
elif label == GroupLabel.INLINE:
|
|
1754
|
+
return self.add_inline_group(
|
|
1755
|
+
name=name,
|
|
1756
|
+
parent=parent,
|
|
1757
|
+
content_layer=content_layer,
|
|
1758
|
+
)
|
|
1759
|
+
|
|
1630
1760
|
if not parent:
|
|
1631
1761
|
parent = self.body
|
|
1632
1762
|
|
|
@@ -1655,6 +1785,8 @@ class DoclingDocument(BaseModel):
|
|
|
1655
1785
|
prov: Optional[ProvenanceItem] = None,
|
|
1656
1786
|
parent: Optional[NodeItem] = None,
|
|
1657
1787
|
content_layer: Optional[ContentLayer] = None,
|
|
1788
|
+
formatting: Optional[Formatting] = None,
|
|
1789
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
|
1658
1790
|
):
|
|
1659
1791
|
"""add_list_item.
|
|
1660
1792
|
|
|
@@ -1682,6 +1814,8 @@ class DoclingDocument(BaseModel):
|
|
|
1682
1814
|
parent=parent.get_ref(),
|
|
1683
1815
|
enumerated=enumerated,
|
|
1684
1816
|
marker=marker,
|
|
1817
|
+
formatting=formatting,
|
|
1818
|
+
hyperlink=hyperlink,
|
|
1685
1819
|
)
|
|
1686
1820
|
if prov:
|
|
1687
1821
|
list_item.prov.append(prov)
|
|
@@ -1701,6 +1835,8 @@ class DoclingDocument(BaseModel):
|
|
|
1701
1835
|
prov: Optional[ProvenanceItem] = None,
|
|
1702
1836
|
parent: Optional[NodeItem] = None,
|
|
1703
1837
|
content_layer: Optional[ContentLayer] = None,
|
|
1838
|
+
formatting: Optional[Formatting] = None,
|
|
1839
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
|
1704
1840
|
):
|
|
1705
1841
|
"""add_text.
|
|
1706
1842
|
|
|
@@ -1720,6 +1856,8 @@ class DoclingDocument(BaseModel):
|
|
|
1720
1856
|
prov=prov,
|
|
1721
1857
|
parent=parent,
|
|
1722
1858
|
content_layer=content_layer,
|
|
1859
|
+
formatting=formatting,
|
|
1860
|
+
hyperlink=hyperlink,
|
|
1723
1861
|
)
|
|
1724
1862
|
|
|
1725
1863
|
elif label in [DocItemLabel.LIST_ITEM]:
|
|
@@ -1729,15 +1867,31 @@ class DoclingDocument(BaseModel):
|
|
|
1729
1867
|
prov=prov,
|
|
1730
1868
|
parent=parent,
|
|
1731
1869
|
content_layer=content_layer,
|
|
1870
|
+
formatting=formatting,
|
|
1871
|
+
hyperlink=hyperlink,
|
|
1872
|
+
)
|
|
1873
|
+
|
|
1874
|
+
elif label in [DocItemLabel.TITLE]:
|
|
1875
|
+
return self.add_title(
|
|
1876
|
+
text=text,
|
|
1877
|
+
orig=orig,
|
|
1878
|
+
prov=prov,
|
|
1879
|
+
parent=parent,
|
|
1880
|
+
content_layer=content_layer,
|
|
1881
|
+
formatting=formatting,
|
|
1882
|
+
hyperlink=hyperlink,
|
|
1732
1883
|
)
|
|
1733
1884
|
|
|
1734
1885
|
elif label in [DocItemLabel.SECTION_HEADER]:
|
|
1735
1886
|
return self.add_heading(
|
|
1736
1887
|
text=text,
|
|
1737
1888
|
orig=orig,
|
|
1889
|
+
# NOTE: we do not / cannot pass the level here, lossy path..
|
|
1738
1890
|
prov=prov,
|
|
1739
1891
|
parent=parent,
|
|
1740
1892
|
content_layer=content_layer,
|
|
1893
|
+
formatting=formatting,
|
|
1894
|
+
hyperlink=hyperlink,
|
|
1741
1895
|
)
|
|
1742
1896
|
|
|
1743
1897
|
elif label in [DocItemLabel.CODE]:
|
|
@@ -1747,6 +1901,18 @@ class DoclingDocument(BaseModel):
|
|
|
1747
1901
|
prov=prov,
|
|
1748
1902
|
parent=parent,
|
|
1749
1903
|
content_layer=content_layer,
|
|
1904
|
+
formatting=formatting,
|
|
1905
|
+
hyperlink=hyperlink,
|
|
1906
|
+
)
|
|
1907
|
+
elif label in [DocItemLabel.FORMULA]:
|
|
1908
|
+
return self.add_formula(
|
|
1909
|
+
text=text,
|
|
1910
|
+
orig=orig,
|
|
1911
|
+
prov=prov,
|
|
1912
|
+
parent=parent,
|
|
1913
|
+
content_layer=content_layer,
|
|
1914
|
+
formatting=formatting,
|
|
1915
|
+
hyperlink=hyperlink,
|
|
1750
1916
|
)
|
|
1751
1917
|
|
|
1752
1918
|
else:
|
|
@@ -1765,6 +1931,8 @@ class DoclingDocument(BaseModel):
|
|
|
1765
1931
|
orig=orig,
|
|
1766
1932
|
self_ref=cref,
|
|
1767
1933
|
parent=parent.get_ref(),
|
|
1934
|
+
formatting=formatting,
|
|
1935
|
+
hyperlink=hyperlink,
|
|
1768
1936
|
)
|
|
1769
1937
|
if prov:
|
|
1770
1938
|
text_item.prov.append(prov)
|
|
@@ -1866,11 +2034,14 @@ class DoclingDocument(BaseModel):
|
|
|
1866
2034
|
prov: Optional[ProvenanceItem] = None,
|
|
1867
2035
|
parent: Optional[NodeItem] = None,
|
|
1868
2036
|
content_layer: Optional[ContentLayer] = None,
|
|
2037
|
+
formatting: Optional[Formatting] = None,
|
|
2038
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
|
1869
2039
|
):
|
|
1870
2040
|
"""add_title.
|
|
1871
2041
|
|
|
1872
2042
|
:param text: str:
|
|
1873
2043
|
:param orig: Optional[str]: (Default value = None)
|
|
2044
|
+
:param level: LevelNumber: (Default value = 1)
|
|
1874
2045
|
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1875
2046
|
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1876
2047
|
"""
|
|
@@ -1882,22 +2053,23 @@ class DoclingDocument(BaseModel):
|
|
|
1882
2053
|
|
|
1883
2054
|
text_index = len(self.texts)
|
|
1884
2055
|
cref = f"#/texts/{text_index}"
|
|
1885
|
-
|
|
1886
|
-
label=DocItemLabel.TITLE,
|
|
2056
|
+
item = TitleItem(
|
|
1887
2057
|
text=text,
|
|
1888
2058
|
orig=orig,
|
|
1889
2059
|
self_ref=cref,
|
|
1890
2060
|
parent=parent.get_ref(),
|
|
2061
|
+
formatting=formatting,
|
|
2062
|
+
hyperlink=hyperlink,
|
|
1891
2063
|
)
|
|
1892
2064
|
if prov:
|
|
1893
|
-
|
|
2065
|
+
item.prov.append(prov)
|
|
1894
2066
|
if content_layer:
|
|
1895
|
-
|
|
2067
|
+
item.content_layer = content_layer
|
|
1896
2068
|
|
|
1897
|
-
self.texts.append(
|
|
2069
|
+
self.texts.append(item)
|
|
1898
2070
|
parent.children.append(RefItem(cref=cref))
|
|
1899
2071
|
|
|
1900
|
-
return
|
|
2072
|
+
return item
|
|
1901
2073
|
|
|
1902
2074
|
def add_code(
|
|
1903
2075
|
self,
|
|
@@ -1908,6 +2080,8 @@ class DoclingDocument(BaseModel):
|
|
|
1908
2080
|
prov: Optional[ProvenanceItem] = None,
|
|
1909
2081
|
parent: Optional[NodeItem] = None,
|
|
1910
2082
|
content_layer: Optional[ContentLayer] = None,
|
|
2083
|
+
formatting: Optional[Formatting] = None,
|
|
2084
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
|
1911
2085
|
):
|
|
1912
2086
|
"""add_code.
|
|
1913
2087
|
|
|
@@ -1932,6 +2106,8 @@ class DoclingDocument(BaseModel):
|
|
|
1932
2106
|
orig=orig,
|
|
1933
2107
|
self_ref=cref,
|
|
1934
2108
|
parent=parent.get_ref(),
|
|
2109
|
+
formatting=formatting,
|
|
2110
|
+
hyperlink=hyperlink,
|
|
1935
2111
|
)
|
|
1936
2112
|
if code_language:
|
|
1937
2113
|
code_item.code_language = code_language
|
|
@@ -1947,6 +2123,50 @@ class DoclingDocument(BaseModel):
|
|
|
1947
2123
|
|
|
1948
2124
|
return code_item
|
|
1949
2125
|
|
|
2126
|
+
def add_formula(
|
|
2127
|
+
self,
|
|
2128
|
+
text: str,
|
|
2129
|
+
orig: Optional[str] = None,
|
|
2130
|
+
prov: Optional[ProvenanceItem] = None,
|
|
2131
|
+
parent: Optional[NodeItem] = None,
|
|
2132
|
+
content_layer: Optional[ContentLayer] = None,
|
|
2133
|
+
formatting: Optional[Formatting] = None,
|
|
2134
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
|
2135
|
+
):
|
|
2136
|
+
"""add_formula.
|
|
2137
|
+
|
|
2138
|
+
:param text: str:
|
|
2139
|
+
:param orig: Optional[str]: (Default value = None)
|
|
2140
|
+
:param level: LevelNumber: (Default value = 1)
|
|
2141
|
+
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
2142
|
+
:param parent: Optional[NodeItem]: (Default value = None)
|
|
2143
|
+
"""
|
|
2144
|
+
if not parent:
|
|
2145
|
+
parent = self.body
|
|
2146
|
+
|
|
2147
|
+
if not orig:
|
|
2148
|
+
orig = text
|
|
2149
|
+
|
|
2150
|
+
text_index = len(self.texts)
|
|
2151
|
+
cref = f"#/texts/{text_index}"
|
|
2152
|
+
section_header_item = FormulaItem(
|
|
2153
|
+
text=text,
|
|
2154
|
+
orig=orig,
|
|
2155
|
+
self_ref=cref,
|
|
2156
|
+
parent=parent.get_ref(),
|
|
2157
|
+
formatting=formatting,
|
|
2158
|
+
hyperlink=hyperlink,
|
|
2159
|
+
)
|
|
2160
|
+
if prov:
|
|
2161
|
+
section_header_item.prov.append(prov)
|
|
2162
|
+
if content_layer:
|
|
2163
|
+
section_header_item.content_layer = content_layer
|
|
2164
|
+
|
|
2165
|
+
self.texts.append(section_header_item)
|
|
2166
|
+
parent.children.append(RefItem(cref=cref))
|
|
2167
|
+
|
|
2168
|
+
return section_header_item
|
|
2169
|
+
|
|
1950
2170
|
def add_heading(
|
|
1951
2171
|
self,
|
|
1952
2172
|
text: str,
|
|
@@ -1955,6 +2175,8 @@ class DoclingDocument(BaseModel):
|
|
|
1955
2175
|
prov: Optional[ProvenanceItem] = None,
|
|
1956
2176
|
parent: Optional[NodeItem] = None,
|
|
1957
2177
|
content_layer: Optional[ContentLayer] = None,
|
|
2178
|
+
formatting: Optional[Formatting] = None,
|
|
2179
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
|
1958
2180
|
):
|
|
1959
2181
|
"""add_heading.
|
|
1960
2182
|
|
|
@@ -1979,6 +2201,8 @@ class DoclingDocument(BaseModel):
|
|
|
1979
2201
|
orig=orig,
|
|
1980
2202
|
self_ref=cref,
|
|
1981
2203
|
parent=parent.get_ref(),
|
|
2204
|
+
formatting=formatting,
|
|
2205
|
+
hyperlink=hyperlink,
|
|
1982
2206
|
)
|
|
1983
2207
|
if prov:
|
|
1984
2208
|
section_header_item.prov.append(prov)
|
|
@@ -2334,10 +2558,10 @@ class DoclingDocument(BaseModel):
|
|
|
2334
2558
|
self,
|
|
2335
2559
|
filename: Path,
|
|
2336
2560
|
artifacts_dir: Optional[Path] = None,
|
|
2337
|
-
delim: str = "\n\n",
|
|
2561
|
+
delim: str = "\n\n",
|
|
2338
2562
|
from_element: int = 0,
|
|
2339
2563
|
to_element: int = sys.maxsize,
|
|
2340
|
-
labels: set[DocItemLabel] =
|
|
2564
|
+
labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
|
|
2341
2565
|
strict_text: bool = False,
|
|
2342
2566
|
escaping_underscores: bool = True,
|
|
2343
2567
|
image_placeholder: str = "<!-- image -->",
|
|
@@ -2377,10 +2601,10 @@ class DoclingDocument(BaseModel):
|
|
|
2377
2601
|
|
|
2378
2602
|
def export_to_markdown( # noqa: C901
|
|
2379
2603
|
self,
|
|
2380
|
-
delim: str = "\n\n",
|
|
2604
|
+
delim: str = "\n\n",
|
|
2381
2605
|
from_element: int = 0,
|
|
2382
2606
|
to_element: int = sys.maxsize,
|
|
2383
|
-
labels: set[DocItemLabel] =
|
|
2607
|
+
labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
|
|
2384
2608
|
strict_text: bool = False,
|
|
2385
2609
|
escaping_underscores: bool = True,
|
|
2386
2610
|
image_placeholder: str = "<!-- image -->",
|
|
@@ -2395,9 +2619,8 @@ class DoclingDocument(BaseModel):
|
|
|
2395
2619
|
Operates on a slice of the document's body as defined through arguments
|
|
2396
2620
|
from_element and to_element; defaulting to the whole document.
|
|
2397
2621
|
|
|
2398
|
-
:param delim:
|
|
2399
|
-
|
|
2400
|
-
:type delim: str = "\n"
|
|
2622
|
+
:param delim: Deprecated.
|
|
2623
|
+
:type delim: str = "\n\n"
|
|
2401
2624
|
:param from_element: Body slicing start index (inclusive).
|
|
2402
2625
|
(Default value = 0).
|
|
2403
2626
|
:type from_element: int = 0
|
|
@@ -2405,9 +2628,8 @@ class DoclingDocument(BaseModel):
|
|
|
2405
2628
|
(exclusive). (Default value = maxint).
|
|
2406
2629
|
:type to_element: int = sys.maxsize
|
|
2407
2630
|
:param labels: The set of document labels to include in the export.
|
|
2408
|
-
:type labels: set[DocItemLabel] =
|
|
2409
|
-
:param strict_text:
|
|
2410
|
-
of the document. (Default value = False).
|
|
2631
|
+
:type labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS
|
|
2632
|
+
:param strict_text: Deprecated.
|
|
2411
2633
|
:type strict_text: bool = False
|
|
2412
2634
|
:param escaping_underscores: bool: Whether to escape underscores in the
|
|
2413
2635
|
text content of the document. (Default value = True).
|
|
@@ -2424,250 +2646,48 @@ class DoclingDocument(BaseModel):
|
|
|
2424
2646
|
:returns: The exported Markdown representation.
|
|
2425
2647
|
:rtype: str
|
|
2426
2648
|
"""
|
|
2427
|
-
|
|
2428
|
-
|
|
2429
|
-
|
|
2430
|
-
|
|
2431
|
-
|
|
2432
|
-
|
|
2433
|
-
|
|
2649
|
+
from docling_core.experimental.serializer.markdown import (
|
|
2650
|
+
MarkdownDocSerializer,
|
|
2651
|
+
MarkdownListSerializer,
|
|
2652
|
+
MarkdownTextSerializer,
|
|
2653
|
+
)
|
|
2654
|
+
|
|
2655
|
+
serializer = MarkdownDocSerializer(
|
|
2656
|
+
doc=self,
|
|
2657
|
+
start=from_element,
|
|
2658
|
+
stop=to_element,
|
|
2434
2659
|
image_placeholder=image_placeholder,
|
|
2435
2660
|
image_mode=image_mode,
|
|
2436
|
-
|
|
2437
|
-
|
|
2438
|
-
|
|
2439
|
-
|
|
2440
|
-
|
|
2441
|
-
|
|
2442
|
-
|
|
2661
|
+
labels=labels,
|
|
2662
|
+
layers=included_content_layers,
|
|
2663
|
+
pages={page_no} if page_no is not None else None,
|
|
2664
|
+
escaping_underscores=escaping_underscores,
|
|
2665
|
+
text_serializer=MarkdownTextSerializer(
|
|
2666
|
+
wrap_width=text_width if text_width > 0 else None,
|
|
2667
|
+
),
|
|
2668
|
+
list_serializer=MarkdownListSerializer(
|
|
2669
|
+
indent=indent,
|
|
2670
|
+
),
|
|
2443
2671
|
)
|
|
2444
|
-
|
|
2445
|
-
|
|
2446
|
-
def _get_markdown_components( # noqa: C901
|
|
2447
|
-
self,
|
|
2448
|
-
node: NodeItem,
|
|
2449
|
-
from_element: int,
|
|
2450
|
-
to_element: int,
|
|
2451
|
-
labels: set[DocItemLabel],
|
|
2452
|
-
strict_text: bool,
|
|
2453
|
-
escaping_underscores: bool,
|
|
2454
|
-
image_placeholder: str,
|
|
2455
|
-
image_mode: ImageRefMode,
|
|
2456
|
-
indent: int,
|
|
2457
|
-
text_width: int,
|
|
2458
|
-
page_no: Optional[int],
|
|
2459
|
-
included_content_layers: set[ContentLayer],
|
|
2460
|
-
list_level: int,
|
|
2461
|
-
is_inline_scope: bool,
|
|
2462
|
-
visited: set[str], # refs of visited items
|
|
2463
|
-
) -> list[str]:
|
|
2464
|
-
components: list[str] = [] # components to concatenate
|
|
2465
|
-
|
|
2466
|
-
# Our export markdown doesn't contain any emphasis styling:
|
|
2467
|
-
# Bold, Italic, or Bold-Italic
|
|
2468
|
-
# Hence, any underscore that we print into Markdown is coming from document text
|
|
2469
|
-
# That means we need to escape it, to properly reflect content in the markdown
|
|
2470
|
-
# However, we need to preserve underscores in image URLs
|
|
2471
|
-
# to maintain their validity
|
|
2472
|
-
# For example: ![image](path/to_image.png) should remain unchanged
|
|
2473
|
-
def _escape_underscores(text):
|
|
2474
|
-
"""Escape underscores but leave them intact in the URL.."""
|
|
2475
|
-
# Firstly, identify all the URL patterns.
|
|
2476
|
-
url_pattern = r"!\[.*?\]\((.*?)\)"
|
|
2477
|
-
# Matches both inline ($...$) and block ($$...$$) LaTeX equations:
|
|
2478
|
-
latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
|
|
2479
|
-
combined_pattern = f"({url_pattern})|({latex_pattern})"
|
|
2480
|
-
|
|
2481
|
-
parts = []
|
|
2482
|
-
last_end = 0
|
|
2483
|
-
|
|
2484
|
-
for match in re.finditer(combined_pattern, text):
|
|
2485
|
-
# Text to add before the URL (needs to be escaped)
|
|
2486
|
-
before_url = text[last_end : match.start()]
|
|
2487
|
-
parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))
|
|
2488
|
-
|
|
2489
|
-
# Add the full URL part (do not escape)
|
|
2490
|
-
parts.append(match.group(0))
|
|
2491
|
-
last_end = match.end()
|
|
2492
|
-
|
|
2493
|
-
# Add the final part of the text (which needs to be escaped)
|
|
2494
|
-
if last_end < len(text):
|
|
2495
|
-
parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))
|
|
2496
|
-
|
|
2497
|
-
return "".join(parts)
|
|
2498
|
-
|
|
2499
|
-
def _ingest_text(text: str, do_escape_html=True, do_escape_underscores=True):
    """Escape ``text`` as requested and append it to ``components``.

    Underscore escaping runs only when both ``do_escape_underscores`` and the
    enclosing scope's ``escaping_underscores`` are truthy. HTML escaping
    (quotes left untouched) runs when ``do_escape_html`` is truthy. Text that
    ends up empty is not appended.
    """
    processed = text
    if do_escape_underscores and escaping_underscores:
        processed = _escape_underscores(processed)
    if do_escape_html:
        processed = html.escape(processed, quote=False)
    if not processed:
        return
    # `components` is the output list owned by the enclosing scope.
    components.append(processed)
|
|
2672
|
+
ser_res = serializer.serialize()
|
|
2506
2673
|
|
|
2507
|
-
|
|
2508
|
-
|
|
2509
|
-
|
|
2510
|
-
|
|
2511
|
-
|
|
2512
|
-
|
|
2674
|
+
if delim != "\n\n":
|
|
2675
|
+
_logger.warning(
|
|
2676
|
+
"Parameter `delim` has been deprecated and will be ignored.",
|
|
2677
|
+
)
|
|
2678
|
+
if strict_text:
|
|
2679
|
+
_logger.warning(
|
|
2680
|
+
"Parameter `strict_text` has been deprecated and will be ignored.",
|
|
2513
2681
|
)
|
|
2514
|
-
):
|
|
2515
|
-
if item.self_ref in visited:
|
|
2516
|
-
continue
|
|
2517
|
-
else:
|
|
2518
|
-
visited.add(item.self_ref)
|
|
2519
|
-
|
|
2520
|
-
if ix < from_element or to_element <= ix:
|
|
2521
|
-
continue # skip as many items as you want
|
|
2522
|
-
|
|
2523
|
-
elif (isinstance(item, DocItem)) and (item.label not in labels):
|
|
2524
|
-
continue # skip any label that is not whitelisted
|
|
2525
|
-
|
|
2526
|
-
elif isinstance(item, GroupItem):
|
|
2527
|
-
if item.label in [
|
|
2528
|
-
GroupLabel.LIST,
|
|
2529
|
-
GroupLabel.ORDERED_LIST,
|
|
2530
|
-
]:
|
|
2531
|
-
comps = self._get_markdown_components(
|
|
2532
|
-
node=item,
|
|
2533
|
-
from_element=from_element,
|
|
2534
|
-
to_element=to_element,
|
|
2535
|
-
labels=labels,
|
|
2536
|
-
strict_text=strict_text,
|
|
2537
|
-
escaping_underscores=escaping_underscores,
|
|
2538
|
-
image_placeholder=image_placeholder,
|
|
2539
|
-
image_mode=image_mode,
|
|
2540
|
-
indent=indent,
|
|
2541
|
-
text_width=text_width,
|
|
2542
|
-
page_no=page_no,
|
|
2543
|
-
included_content_layers=included_content_layers,
|
|
2544
|
-
list_level=list_level + 1,
|
|
2545
|
-
is_inline_scope=is_inline_scope,
|
|
2546
|
-
visited=visited,
|
|
2547
|
-
)
|
|
2548
|
-
indent_str = list_level * indent * " "
|
|
2549
|
-
is_ol = item.label == GroupLabel.ORDERED_LIST
|
|
2550
|
-
text = "\n".join(
|
|
2551
|
-
[
|
|
2552
|
-
# avoid additional marker on already evaled sublists
|
|
2553
|
-
(
|
|
2554
|
-
c
|
|
2555
|
-
if c and c[0] == " "
|
|
2556
|
-
else f"{indent_str}{f'{i + 1}.' if is_ol else '-'} {c}"
|
|
2557
|
-
)
|
|
2558
|
-
for i, c in enumerate(comps)
|
|
2559
|
-
]
|
|
2560
|
-
)
|
|
2561
|
-
_ingest_text(
|
|
2562
|
-
text=text,
|
|
2563
|
-
# special chars have already been escaped as needed
|
|
2564
|
-
do_escape_html=False,
|
|
2565
|
-
do_escape_underscores=False,
|
|
2566
|
-
)
|
|
2567
|
-
elif item.label == GroupLabel.INLINE:
|
|
2568
|
-
comps = self._get_markdown_components(
|
|
2569
|
-
node=item,
|
|
2570
|
-
from_element=from_element,
|
|
2571
|
-
to_element=to_element,
|
|
2572
|
-
labels=labels,
|
|
2573
|
-
strict_text=strict_text,
|
|
2574
|
-
escaping_underscores=escaping_underscores,
|
|
2575
|
-
image_placeholder=image_placeholder,
|
|
2576
|
-
image_mode=image_mode,
|
|
2577
|
-
indent=indent,
|
|
2578
|
-
text_width=text_width,
|
|
2579
|
-
page_no=page_no,
|
|
2580
|
-
included_content_layers=included_content_layers,
|
|
2581
|
-
list_level=list_level,
|
|
2582
|
-
is_inline_scope=True,
|
|
2583
|
-
visited=visited,
|
|
2584
|
-
)
|
|
2585
|
-
text = " ".join(comps)
|
|
2586
|
-
_ingest_text(
|
|
2587
|
-
text=text,
|
|
2588
|
-
# special chars have already been escaped as needed
|
|
2589
|
-
do_escape_html=False,
|
|
2590
|
-
do_escape_underscores=False,
|
|
2591
|
-
)
|
|
2592
|
-
else:
|
|
2593
|
-
continue
|
|
2594
|
-
|
|
2595
|
-
elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
|
|
2596
|
-
marker = "" if strict_text else "#"
|
|
2597
|
-
text = f"{marker} {item.text}"
|
|
2598
|
-
_ingest_text(text.strip())
|
|
2599
|
-
|
|
2600
|
-
elif (
|
|
2601
|
-
isinstance(item, TextItem)
|
|
2602
|
-
and item.label in [DocItemLabel.SECTION_HEADER]
|
|
2603
|
-
) or isinstance(item, SectionHeaderItem):
|
|
2604
|
-
marker = ""
|
|
2605
|
-
if not strict_text:
|
|
2606
|
-
marker = "#" * level
|
|
2607
|
-
if len(marker) < 2:
|
|
2608
|
-
marker = "##"
|
|
2609
|
-
text = f"{marker} {item.text}"
|
|
2610
|
-
_ingest_text(text.strip())
|
|
2611
|
-
|
|
2612
|
-
elif isinstance(item, CodeItem):
|
|
2613
|
-
text = f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
|
|
2614
|
-
_ingest_text(text, do_escape_underscores=False, do_escape_html=False)
|
|
2615
|
-
|
|
2616
|
-
elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
|
|
2617
|
-
if item.text != "":
|
|
2618
|
-
_ingest_text(
|
|
2619
|
-
f"${item.text}$" if is_inline_scope else f"$${item.text}$$",
|
|
2620
|
-
do_escape_underscores=False,
|
|
2621
|
-
do_escape_html=False,
|
|
2622
|
-
)
|
|
2623
|
-
elif item.orig != "":
|
|
2624
|
-
_ingest_text(
|
|
2625
|
-
"<!-- formula-not-decoded -->",
|
|
2626
|
-
do_escape_underscores=False,
|
|
2627
|
-
do_escape_html=False,
|
|
2628
|
-
)
|
|
2629
|
-
|
|
2630
|
-
elif isinstance(item, TextItem):
|
|
2631
|
-
if len(item.text) and text_width > 0:
|
|
2632
|
-
text = item.text
|
|
2633
|
-
wrapped_text = textwrap.fill(text, width=text_width)
|
|
2634
|
-
_ingest_text(wrapped_text)
|
|
2635
|
-
elif len(item.text):
|
|
2636
|
-
_ingest_text(item.text)
|
|
2637
|
-
|
|
2638
|
-
elif isinstance(item, TableItem) and not strict_text:
|
|
2639
|
-
if caption_text := item.caption_text(self):
|
|
2640
|
-
_ingest_text(caption_text)
|
|
2641
|
-
md_table = item.export_to_markdown()
|
|
2642
|
-
_ingest_text(md_table)
|
|
2643
|
-
|
|
2644
|
-
elif isinstance(item, PictureItem) and not strict_text:
|
|
2645
|
-
_ingest_text(item.caption_text(self))
|
|
2646
|
-
|
|
2647
|
-
line = item.export_to_markdown(
|
|
2648
|
-
doc=self,
|
|
2649
|
-
image_placeholder=image_placeholder,
|
|
2650
|
-
image_mode=image_mode,
|
|
2651
|
-
)
|
|
2652
|
-
|
|
2653
|
-
_ingest_text(line, do_escape_html=False, do_escape_underscores=False)
|
|
2654
|
-
|
|
2655
|
-
elif isinstance(item, (KeyValueItem, FormItem)):
|
|
2656
|
-
text = item._export_to_markdown()
|
|
2657
|
-
_ingest_text(text, do_escape_html=False, do_escape_underscores=False)
|
|
2658
|
-
|
|
2659
|
-
elif isinstance(item, DocItem):
|
|
2660
|
-
text = "<!-- missing-text -->"
|
|
2661
|
-
_ingest_text(text, do_escape_html=False, do_escape_underscores=False)
|
|
2662
2682
|
|
|
2663
|
-
return
|
|
2683
|
+
return ser_res.text
|
|
2664
2684
|
|
|
2665
2685
|
def export_to_text( # noqa: C901
|
|
2666
2686
|
self,
|
|
2667
2687
|
delim: str = "\n\n",
|
|
2668
2688
|
from_element: int = 0,
|
|
2669
2689
|
to_element: int = 1000000,
|
|
2670
|
-
labels: set[DocItemLabel] =
|
|
2690
|
+
labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
|
|
2671
2691
|
) -> str:
|
|
2672
2692
|
"""export_to_text."""
|
|
2673
2693
|
return self.export_to_markdown(
|