docling-core 2.22.0__py3-none-any.whl → 2.23.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -11,7 +11,6 @@ import mimetypes
11
11
  import os
12
12
  import re
13
13
  import sys
14
- import textwrap
15
14
  import typing
16
15
  import warnings
17
16
  from enum import Enum
@@ -62,7 +61,7 @@ _logger = logging.getLogger(__name__)
62
61
 
63
62
  Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
64
63
  LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
65
- CURRENT_VERSION: Final = "1.2.0"
64
+ CURRENT_VERSION: Final = "1.3.0"
66
65
 
67
66
  DEFAULT_EXPORT_LABELS = {
68
67
  DocItemLabel.TITLE,
@@ -87,6 +86,8 @@ DOCUMENT_TOKENS_EXPORT_LABELS.update(
87
86
  [
88
87
  DocItemLabel.FOOTNOTE,
89
88
  DocItemLabel.CAPTION,
89
+ DocItemLabel.KEY_VALUE_REGION,
90
+ DocItemLabel.FORM,
90
91
  ]
91
92
  )
92
93
 
@@ -607,9 +608,30 @@ class GroupItem(NodeItem): # Container type, can't be a leaf node
607
608
  "group" # Name of the group, e.g. "Introduction Chapter",
608
609
  # "Slide 5", "Navigation menu list", ...
609
610
  )
611
+ # TODO narrow down to allowed values, i.e. excluding those used for subtypes
610
612
  label: GroupLabel = GroupLabel.UNSPECIFIED
611
613
 
612
614
 
615
+ class UnorderedList(GroupItem):
616
+ """UnorderedList."""
617
+
618
+ label: typing.Literal[GroupLabel.LIST] = GroupLabel.LIST # type: ignore[assignment]
619
+
620
+
621
+ class OrderedList(GroupItem):
622
+ """OrderedList."""
623
+
624
+ label: typing.Literal[GroupLabel.ORDERED_LIST] = (
625
+ GroupLabel.ORDERED_LIST # type: ignore[assignment]
626
+ )
627
+
628
+
629
+ class InlineGroup(GroupItem):
630
+ """InlineGroup."""
631
+
632
+ label: typing.Literal[GroupLabel.INLINE] = GroupLabel.INLINE
633
+
634
+
613
635
  class DocItem(
614
636
  NodeItem
615
637
  ): # Base type for any element that carries content, can be a leaf node
@@ -670,6 +692,15 @@ class DocItem(
670
692
  return page_image.crop(crop_bbox.as_tuple())
671
693
 
672
694
 
695
+ class Formatting(BaseModel):
696
+ """Formatting."""
697
+
698
+ bold: bool = False
699
+ italic: bool = False
700
+ underline: bool = False
701
+ strikethrough: bool = False
702
+
703
+
673
704
  class TextItem(DocItem):
674
705
  """TextItem."""
675
706
 
@@ -678,18 +709,19 @@ class TextItem(DocItem):
678
709
  DocItemLabel.CHECKBOX_SELECTED,
679
710
  DocItemLabel.CHECKBOX_UNSELECTED,
680
711
  DocItemLabel.FOOTNOTE,
681
- DocItemLabel.FORMULA,
682
712
  DocItemLabel.PAGE_FOOTER,
683
713
  DocItemLabel.PAGE_HEADER,
684
714
  DocItemLabel.PARAGRAPH,
685
715
  DocItemLabel.REFERENCE,
686
716
  DocItemLabel.TEXT,
687
- DocItemLabel.TITLE,
688
717
  ]
689
718
 
690
719
  orig: str # untreated representation
691
720
  text: str # sanitized representation
692
721
 
722
+ formatting: Optional[Formatting] = None
723
+ hyperlink: Optional[Union[AnyUrl, Path]] = None
724
+
693
725
  def export_to_document_tokens(
694
726
  self,
695
727
  doc: "DoclingDocument",
@@ -727,6 +759,14 @@ class TextItem(DocItem):
727
759
  return body
728
760
 
729
761
 
762
+ class TitleItem(TextItem):
763
+ """TitleItem."""
764
+
765
+ label: typing.Literal[DocItemLabel.TITLE] = (
766
+ DocItemLabel.TITLE # type: ignore[assignment]
767
+ )
768
+
769
+
730
770
  class SectionHeaderItem(TextItem):
731
771
  """SectionItem."""
732
772
 
@@ -862,6 +902,14 @@ class CodeItem(FloatingItem, TextItem):
862
902
  return body
863
903
 
864
904
 
905
+ class FormulaItem(TextItem):
906
+ """FormulaItem."""
907
+
908
+ label: typing.Literal[DocItemLabel.FORMULA] = (
909
+ DocItemLabel.FORMULA # type: ignore[assignment]
910
+ )
911
+
912
+
865
913
  class PictureItem(FloatingItem):
866
914
  """PictureItem."""
867
915
 
@@ -900,54 +948,34 @@ class PictureItem(FloatingItem):
900
948
  def export_to_markdown(
901
949
  self,
902
950
  doc: "DoclingDocument",
903
- add_caption: bool = True,
951
+ add_caption: bool = True, # deprecated
904
952
  image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
905
953
  image_placeholder: str = "<!-- image -->",
906
954
  ) -> str:
907
955
  """Export picture to Markdown format."""
908
- default_response = image_placeholder
909
- error_response = (
910
- "<!-- 🖼️❌ Image not available. "
911
- "Please use `PdfPipelineOptions(generate_picture_images=True)`"
912
- " -->"
913
- )
956
+ from docling_core.experimental.serializer.markdown import MarkdownDocSerializer
914
957
 
915
- if image_mode == ImageRefMode.PLACEHOLDER:
916
- return default_response
917
-
918
- elif image_mode == ImageRefMode.EMBEDDED:
919
-
920
- # short-cut: we already have the image in base64
921
- if (
922
- isinstance(self.image, ImageRef)
923
- and isinstance(self.image.uri, AnyUrl)
924
- and self.image.uri.scheme == "data"
925
- ):
926
- text = f"![Image]({self.image.uri})"
927
- return text
928
-
929
- # get the self.image._pil or crop it out of the page-image
930
- img = self.get_image(doc)
931
-
932
- if img is not None:
933
- imgb64 = self._image_to_base64(img)
934
- text = f"![Image](data:image/png;base64,{imgb64})"
935
-
936
- return text
937
- else:
938
- return error_response
939
-
940
- elif image_mode == ImageRefMode.REFERENCED:
941
- if not isinstance(self.image, ImageRef) or (
942
- isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "data"
943
- ):
944
- return default_response
945
-
946
- text = f"![Image]({quote(str(self.image.uri))})"
947
- return text
958
+ if not add_caption:
959
+ _logger.warning(
960
+ "Argument `add_caption` is deprecated and will be ignored.",
961
+ )
948
962
 
949
- else:
950
- return default_response
963
+ serializer = MarkdownDocSerializer(
964
+ doc=self,
965
+ image_mode=image_mode,
966
+ )
967
+ text = (
968
+ serializer.picture_serializer.serialize(
969
+ item=self,
970
+ doc_serializer=serializer,
971
+ doc=doc,
972
+ image_mode=image_mode,
973
+ image_placeholder=image_placeholder,
974
+ ).text
975
+ if serializer.picture_serializer
976
+ else ""
977
+ )
978
+ return text
951
979
 
952
980
  def export_to_html(
953
981
  self,
@@ -1136,33 +1164,58 @@ class TableItem(FloatingItem):
1136
1164
 
1137
1165
  return df
1138
1166
 
1139
- def export_to_markdown(self) -> str:
1167
+ def export_to_markdown(self, doc: Optional["DoclingDocument"] = None) -> str:
1140
1168
  """Export the table as markdown."""
1141
- table = []
1142
- for row in self.data.grid:
1143
- tmp = []
1144
- for col in row:
1169
+ if doc is not None:
1170
+ from docling_core.experimental.serializer.markdown import (
1171
+ MarkdownDocSerializer,
1172
+ )
1145
1173
 
1146
- # make sure that md tables are not broken
1147
- # due to newline chars in the text
1148
- text = col.text
1149
- text = text.replace("\n", " ")
1150
- tmp.append(text)
1174
+ serializer = MarkdownDocSerializer(
1175
+ doc=doc,
1176
+ )
1177
+ text = (
1178
+ serializer.table_serializer.serialize(
1179
+ item=self,
1180
+ doc_serializer=serializer,
1181
+ doc=doc,
1182
+ ).text
1183
+ if serializer.table_serializer
1184
+ else ""
1185
+ )
1186
+ return text
1187
+ else:
1188
+ _logger.warning(
1189
+ "Usage of TableItem.export_to_markdown() without `doc` argument is "
1190
+ "deprecated.",
1191
+ )
1151
1192
 
1152
- table.append(tmp)
1193
+ table = []
1194
+ for row in self.data.grid:
1195
+ tmp = []
1196
+ for col in row:
1197
+
1198
+ # make sure that md tables are not broken
1199
+ # due to newline chars in the text
1200
+ text = col.text
1201
+ text = text.replace("\n", " ")
1202
+ tmp.append(text)
1203
+
1204
+ table.append(tmp)
1205
+
1206
+ res = ""
1207
+ if len(table) > 1 and len(table[0]) > 0:
1208
+ try:
1209
+ res = tabulate(table[1:], headers=table[0], tablefmt="github")
1210
+ except ValueError:
1211
+ res = tabulate(
1212
+ table[1:],
1213
+ headers=table[0],
1214
+ tablefmt="github",
1215
+ disable_numparse=True,
1216
+ )
1153
1217
 
1154
- md_table = ""
1155
- if len(table) > 1 and len(table[0]) > 0:
1156
- try:
1157
- md_table = tabulate(table[1:], headers=table[0], tablefmt="github")
1158
- except ValueError:
1159
- md_table = tabulate(
1160
- table[1:],
1161
- headers=table[0],
1162
- tablefmt="github",
1163
- disable_numparse=True,
1164
- )
1165
- return md_table
1218
+ return res
1166
1219
 
1167
1220
  def export_to_html(
1168
1221
  self,
@@ -1455,10 +1508,6 @@ class KeyValueItem(FloatingItem):
1455
1508
 
1456
1509
  graph: GraphData
1457
1510
 
1458
- def _export_to_markdown(self) -> str:
1459
- # TODO add actual implementation
1460
- return "<!-- missing-key-value-item -->"
1461
-
1462
1511
 
1463
1512
  class FormItem(FloatingItem):
1464
1513
  """FormItem."""
@@ -1467,17 +1516,15 @@ class FormItem(FloatingItem):
1467
1516
 
1468
1517
  graph: GraphData
1469
1518
 
1470
- def _export_to_markdown(self) -> str:
1471
- # TODO add actual implementation
1472
- return "<!-- missing-form-item -->"
1473
-
1474
1519
 
1475
1520
  ContentItem = Annotated[
1476
1521
  Union[
1477
1522
  TextItem,
1523
+ TitleItem,
1478
1524
  SectionHeaderItem,
1479
1525
  ListItem,
1480
1526
  CodeItem,
1527
+ FormulaItem,
1481
1528
  PictureItem,
1482
1529
  TableItem,
1483
1530
  KeyValueItem,
@@ -1501,7 +1548,7 @@ class DoclingDocument(BaseModel):
1501
1548
 
1502
1549
  _HTML_DEFAULT_HEAD: str = r"""<head>
1503
1550
  <link rel="icon" type="image/png"
1504
- href="https://ds4sd.github.io/docling/assets/logo.png"/>
1551
+ href="https://raw.githubusercontent.com/docling-project/docling/refs/heads/main/docs/assets/logo.svg"/>
1505
1552
  <meta charset="UTF-8">
1506
1553
  <title>
1507
1554
  Powered by Docling
@@ -1588,8 +1635,10 @@ class DoclingDocument(BaseModel):
1588
1635
  ) # List[RefItem] = []
1589
1636
  body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []
1590
1637
 
1591
- groups: List[GroupItem] = []
1592
- texts: List[Union[SectionHeaderItem, ListItem, TextItem, CodeItem]] = []
1638
+ groups: List[Union[OrderedList, UnorderedList, InlineGroup, GroupItem]] = []
1639
+ texts: List[
1640
+ Union[TitleItem, SectionHeaderItem, ListItem, CodeItem, FormulaItem, TextItem]
1641
+ ] = []
1593
1642
  pictures: List[PictureItem] = []
1594
1643
  tables: List[TableItem] = []
1595
1644
  key_value_items: List[KeyValueItem] = []
@@ -1613,6 +1662,68 @@ class DoclingDocument(BaseModel):
1613
1662
  item["content_layer"] = "furniture"
1614
1663
  return data
1615
1664
 
1665
+ ###################################
1666
+ # TODO: refactor add* methods below
1667
+ ###################################
1668
+
1669
+ def add_ordered_list(
1670
+ self,
1671
+ name: Optional[str] = None,
1672
+ parent: Optional[NodeItem] = None,
1673
+ content_layer: Optional[ContentLayer] = None,
1674
+ ) -> GroupItem:
1675
+ """add_ordered_list."""
1676
+ _parent = parent or self.body
1677
+ cref = f"#/groups/{len(self.groups)}"
1678
+ group = OrderedList(self_ref=cref, parent=_parent.get_ref())
1679
+ if name is not None:
1680
+ group.name = name
1681
+ if content_layer:
1682
+ group.content_layer = content_layer
1683
+
1684
+ self.groups.append(group)
1685
+ _parent.children.append(RefItem(cref=cref))
1686
+ return group
1687
+
1688
+ def add_unordered_list(
1689
+ self,
1690
+ name: Optional[str] = None,
1691
+ parent: Optional[NodeItem] = None,
1692
+ content_layer: Optional[ContentLayer] = None,
1693
+ ) -> GroupItem:
1694
+ """add_unordered_list."""
1695
+ _parent = parent or self.body
1696
+ cref = f"#/groups/{len(self.groups)}"
1697
+ group = UnorderedList(self_ref=cref, parent=_parent.get_ref())
1698
+ if name is not None:
1699
+ group.name = name
1700
+ if content_layer:
1701
+ group.content_layer = content_layer
1702
+
1703
+ self.groups.append(group)
1704
+ _parent.children.append(RefItem(cref=cref))
1705
+ return group
1706
+
1707
+ def add_inline_group(
1708
+ self,
1709
+ name: Optional[str] = None,
1710
+ parent: Optional[NodeItem] = None,
1711
+ content_layer: Optional[ContentLayer] = None,
1712
+ # marker: Optional[UnorderedList.ULMarker] = None,
1713
+ ) -> GroupItem:
1714
+ """add_inline_group."""
1715
+ _parent = parent or self.body
1716
+ cref = f"#/groups/{len(self.groups)}"
1717
+ group = InlineGroup(self_ref=cref, parent=_parent.get_ref())
1718
+ if name is not None:
1719
+ group.name = name
1720
+ if content_layer:
1721
+ group.content_layer = content_layer
1722
+
1723
+ self.groups.append(group)
1724
+ _parent.children.append(RefItem(cref=cref))
1725
+ return group
1726
+
1616
1727
  def add_group(
1617
1728
  self,
1618
1729
  label: Optional[GroupLabel] = None,
@@ -1627,6 +1738,25 @@ class DoclingDocument(BaseModel):
1627
1738
  :param parent: Optional[NodeItem]: (Default value = None)
1628
1739
 
1629
1740
  """
1741
+ if label == GroupLabel.LIST:
1742
+ return self.add_unordered_list(
1743
+ name=name,
1744
+ parent=parent,
1745
+ content_layer=content_layer,
1746
+ )
1747
+ elif label == GroupLabel.ORDERED_LIST:
1748
+ return self.add_ordered_list(
1749
+ name=name,
1750
+ parent=parent,
1751
+ content_layer=content_layer,
1752
+ )
1753
+ elif label == GroupLabel.INLINE:
1754
+ return self.add_inline_group(
1755
+ name=name,
1756
+ parent=parent,
1757
+ content_layer=content_layer,
1758
+ )
1759
+
1630
1760
  if not parent:
1631
1761
  parent = self.body
1632
1762
 
@@ -1655,6 +1785,8 @@ class DoclingDocument(BaseModel):
1655
1785
  prov: Optional[ProvenanceItem] = None,
1656
1786
  parent: Optional[NodeItem] = None,
1657
1787
  content_layer: Optional[ContentLayer] = None,
1788
+ formatting: Optional[Formatting] = None,
1789
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
1658
1790
  ):
1659
1791
  """add_list_item.
1660
1792
 
@@ -1682,6 +1814,8 @@ class DoclingDocument(BaseModel):
1682
1814
  parent=parent.get_ref(),
1683
1815
  enumerated=enumerated,
1684
1816
  marker=marker,
1817
+ formatting=formatting,
1818
+ hyperlink=hyperlink,
1685
1819
  )
1686
1820
  if prov:
1687
1821
  list_item.prov.append(prov)
@@ -1701,6 +1835,8 @@ class DoclingDocument(BaseModel):
1701
1835
  prov: Optional[ProvenanceItem] = None,
1702
1836
  parent: Optional[NodeItem] = None,
1703
1837
  content_layer: Optional[ContentLayer] = None,
1838
+ formatting: Optional[Formatting] = None,
1839
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
1704
1840
  ):
1705
1841
  """add_text.
1706
1842
 
@@ -1720,6 +1856,8 @@ class DoclingDocument(BaseModel):
1720
1856
  prov=prov,
1721
1857
  parent=parent,
1722
1858
  content_layer=content_layer,
1859
+ formatting=formatting,
1860
+ hyperlink=hyperlink,
1723
1861
  )
1724
1862
 
1725
1863
  elif label in [DocItemLabel.LIST_ITEM]:
@@ -1729,15 +1867,31 @@ class DoclingDocument(BaseModel):
1729
1867
  prov=prov,
1730
1868
  parent=parent,
1731
1869
  content_layer=content_layer,
1870
+ formatting=formatting,
1871
+ hyperlink=hyperlink,
1872
+ )
1873
+
1874
+ elif label in [DocItemLabel.TITLE]:
1875
+ return self.add_title(
1876
+ text=text,
1877
+ orig=orig,
1878
+ prov=prov,
1879
+ parent=parent,
1880
+ content_layer=content_layer,
1881
+ formatting=formatting,
1882
+ hyperlink=hyperlink,
1732
1883
  )
1733
1884
 
1734
1885
  elif label in [DocItemLabel.SECTION_HEADER]:
1735
1886
  return self.add_heading(
1736
1887
  text=text,
1737
1888
  orig=orig,
1889
+ # NOTE: we do not / cannot pass the level here, lossy path..
1738
1890
  prov=prov,
1739
1891
  parent=parent,
1740
1892
  content_layer=content_layer,
1893
+ formatting=formatting,
1894
+ hyperlink=hyperlink,
1741
1895
  )
1742
1896
 
1743
1897
  elif label in [DocItemLabel.CODE]:
@@ -1747,6 +1901,18 @@ class DoclingDocument(BaseModel):
1747
1901
  prov=prov,
1748
1902
  parent=parent,
1749
1903
  content_layer=content_layer,
1904
+ formatting=formatting,
1905
+ hyperlink=hyperlink,
1906
+ )
1907
+ elif label in [DocItemLabel.FORMULA]:
1908
+ return self.add_formula(
1909
+ text=text,
1910
+ orig=orig,
1911
+ prov=prov,
1912
+ parent=parent,
1913
+ content_layer=content_layer,
1914
+ formatting=formatting,
1915
+ hyperlink=hyperlink,
1750
1916
  )
1751
1917
 
1752
1918
  else:
@@ -1765,6 +1931,8 @@ class DoclingDocument(BaseModel):
1765
1931
  orig=orig,
1766
1932
  self_ref=cref,
1767
1933
  parent=parent.get_ref(),
1934
+ formatting=formatting,
1935
+ hyperlink=hyperlink,
1768
1936
  )
1769
1937
  if prov:
1770
1938
  text_item.prov.append(prov)
@@ -1866,11 +2034,14 @@ class DoclingDocument(BaseModel):
1866
2034
  prov: Optional[ProvenanceItem] = None,
1867
2035
  parent: Optional[NodeItem] = None,
1868
2036
  content_layer: Optional[ContentLayer] = None,
2037
+ formatting: Optional[Formatting] = None,
2038
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
1869
2039
  ):
1870
2040
  """add_title.
1871
2041
 
1872
2042
  :param text: str:
1873
2043
  :param orig: Optional[str]: (Default value = None)
2044
+ :param level: LevelNumber: (Default value = 1)
1874
2045
  :param prov: Optional[ProvenanceItem]: (Default value = None)
1875
2046
  :param parent: Optional[NodeItem]: (Default value = None)
1876
2047
  """
@@ -1882,22 +2053,23 @@ class DoclingDocument(BaseModel):
1882
2053
 
1883
2054
  text_index = len(self.texts)
1884
2055
  cref = f"#/texts/{text_index}"
1885
- text_item = TextItem(
1886
- label=DocItemLabel.TITLE,
2056
+ item = TitleItem(
1887
2057
  text=text,
1888
2058
  orig=orig,
1889
2059
  self_ref=cref,
1890
2060
  parent=parent.get_ref(),
2061
+ formatting=formatting,
2062
+ hyperlink=hyperlink,
1891
2063
  )
1892
2064
  if prov:
1893
- text_item.prov.append(prov)
2065
+ item.prov.append(prov)
1894
2066
  if content_layer:
1895
- text_item.content_layer = content_layer
2067
+ item.content_layer = content_layer
1896
2068
 
1897
- self.texts.append(text_item)
2069
+ self.texts.append(item)
1898
2070
  parent.children.append(RefItem(cref=cref))
1899
2071
 
1900
- return text_item
2072
+ return item
1901
2073
 
1902
2074
  def add_code(
1903
2075
  self,
@@ -1908,6 +2080,8 @@ class DoclingDocument(BaseModel):
1908
2080
  prov: Optional[ProvenanceItem] = None,
1909
2081
  parent: Optional[NodeItem] = None,
1910
2082
  content_layer: Optional[ContentLayer] = None,
2083
+ formatting: Optional[Formatting] = None,
2084
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
1911
2085
  ):
1912
2086
  """add_code.
1913
2087
 
@@ -1932,6 +2106,8 @@ class DoclingDocument(BaseModel):
1932
2106
  orig=orig,
1933
2107
  self_ref=cref,
1934
2108
  parent=parent.get_ref(),
2109
+ formatting=formatting,
2110
+ hyperlink=hyperlink,
1935
2111
  )
1936
2112
  if code_language:
1937
2113
  code_item.code_language = code_language
@@ -1947,6 +2123,50 @@ class DoclingDocument(BaseModel):
1947
2123
 
1948
2124
  return code_item
1949
2125
 
2126
+ def add_formula(
2127
+ self,
2128
+ text: str,
2129
+ orig: Optional[str] = None,
2130
+ prov: Optional[ProvenanceItem] = None,
2131
+ parent: Optional[NodeItem] = None,
2132
+ content_layer: Optional[ContentLayer] = None,
2133
+ formatting: Optional[Formatting] = None,
2134
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
2135
+ ):
2136
+ """add_formula.
2137
+
2138
+ :param text: str:
2139
+ :param orig: Optional[str]: (Default value = None)
2140
+ :param level: LevelNumber: (Default value = 1)
2141
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
2142
+ :param parent: Optional[NodeItem]: (Default value = None)
2143
+ """
2144
+ if not parent:
2145
+ parent = self.body
2146
+
2147
+ if not orig:
2148
+ orig = text
2149
+
2150
+ text_index = len(self.texts)
2151
+ cref = f"#/texts/{text_index}"
2152
+ section_header_item = FormulaItem(
2153
+ text=text,
2154
+ orig=orig,
2155
+ self_ref=cref,
2156
+ parent=parent.get_ref(),
2157
+ formatting=formatting,
2158
+ hyperlink=hyperlink,
2159
+ )
2160
+ if prov:
2161
+ section_header_item.prov.append(prov)
2162
+ if content_layer:
2163
+ section_header_item.content_layer = content_layer
2164
+
2165
+ self.texts.append(section_header_item)
2166
+ parent.children.append(RefItem(cref=cref))
2167
+
2168
+ return section_header_item
2169
+
1950
2170
  def add_heading(
1951
2171
  self,
1952
2172
  text: str,
@@ -1955,6 +2175,8 @@ class DoclingDocument(BaseModel):
1955
2175
  prov: Optional[ProvenanceItem] = None,
1956
2176
  parent: Optional[NodeItem] = None,
1957
2177
  content_layer: Optional[ContentLayer] = None,
2178
+ formatting: Optional[Formatting] = None,
2179
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
1958
2180
  ):
1959
2181
  """add_heading.
1960
2182
 
@@ -1979,6 +2201,8 @@ class DoclingDocument(BaseModel):
1979
2201
  orig=orig,
1980
2202
  self_ref=cref,
1981
2203
  parent=parent.get_ref(),
2204
+ formatting=formatting,
2205
+ hyperlink=hyperlink,
1982
2206
  )
1983
2207
  if prov:
1984
2208
  section_header_item.prov.append(prov)
@@ -2334,10 +2558,10 @@ class DoclingDocument(BaseModel):
2334
2558
  self,
2335
2559
  filename: Path,
2336
2560
  artifacts_dir: Optional[Path] = None,
2337
- delim: str = "\n\n", # TODO: deprecate
2561
+ delim: str = "\n\n",
2338
2562
  from_element: int = 0,
2339
2563
  to_element: int = sys.maxsize,
2340
- labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
2564
+ labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
2341
2565
  strict_text: bool = False,
2342
2566
  escaping_underscores: bool = True,
2343
2567
  image_placeholder: str = "<!-- image -->",
@@ -2377,10 +2601,10 @@ class DoclingDocument(BaseModel):
2377
2601
 
2378
2602
  def export_to_markdown( # noqa: C901
2379
2603
  self,
2380
- delim: str = "\n\n", # TODO deprecate
2604
+ delim: str = "\n\n",
2381
2605
  from_element: int = 0,
2382
2606
  to_element: int = sys.maxsize,
2383
- labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
2607
+ labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
2384
2608
  strict_text: bool = False,
2385
2609
  escaping_underscores: bool = True,
2386
2610
  image_placeholder: str = "<!-- image -->",
@@ -2395,9 +2619,8 @@ class DoclingDocument(BaseModel):
2395
2619
  Operates on a slice of the document's body as defined through arguments
2396
2620
  from_element and to_element; defaulting to the whole document.
2397
2621
 
2398
- :param delim: Delimiter to use when concatenating the various
2399
- Markdown parts. (Default value = "\n").
2400
- :type delim: str = "\n"
2622
+ :param delim: Deprecated.
2623
+ :type delim: str = "\n\n"
2401
2624
  :param from_element: Body slicing start index (inclusive).
2402
2625
  (Default value = 0).
2403
2626
  :type from_element: int = 0
@@ -2405,9 +2628,8 @@ class DoclingDocument(BaseModel):
2405
2628
  (exclusive). (Default value = maxint).
2406
2629
  :type to_element: int = sys.maxsize
2407
2630
  :param labels: The set of document labels to include in the export.
2408
- :type labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS
2409
- :param strict_text: bool: Whether to only include the text content
2410
- of the document. (Default value = False).
2631
+ :type labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS
2632
+ :param strict_text: Deprecated.
2411
2633
  :type strict_text: bool = False
2412
2634
  :param escaping_underscores: bool: Whether to escape underscores in the
2413
2635
  text content of the document. (Default value = True).
@@ -2424,250 +2646,48 @@ class DoclingDocument(BaseModel):
2424
2646
  :returns: The exported Markdown representation.
2425
2647
  :rtype: str
2426
2648
  """
2427
- comps = self._get_markdown_components(
2428
- node=self.body,
2429
- from_element=from_element,
2430
- to_element=to_element,
2431
- labels=labels,
2432
- strict_text=strict_text,
2433
- escaping_underscores=escaping_underscores,
2649
+ from docling_core.experimental.serializer.markdown import (
2650
+ MarkdownDocSerializer,
2651
+ MarkdownListSerializer,
2652
+ MarkdownTextSerializer,
2653
+ )
2654
+
2655
+ serializer = MarkdownDocSerializer(
2656
+ doc=self,
2657
+ start=from_element,
2658
+ stop=to_element,
2434
2659
  image_placeholder=image_placeholder,
2435
2660
  image_mode=image_mode,
2436
- indent=indent,
2437
- text_width=text_width,
2438
- page_no=page_no,
2439
- included_content_layers=included_content_layers,
2440
- list_level=0,
2441
- is_inline_scope=False,
2442
- visited=set(),
2661
+ labels=labels,
2662
+ layers=included_content_layers,
2663
+ pages={page_no} if page_no is not None else None,
2664
+ escaping_underscores=escaping_underscores,
2665
+ text_serializer=MarkdownTextSerializer(
2666
+ wrap_width=text_width if text_width > 0 else None,
2667
+ ),
2668
+ list_serializer=MarkdownListSerializer(
2669
+ indent=indent,
2670
+ ),
2443
2671
  )
2444
- return delim.join(comps)
2445
-
2446
- def _get_markdown_components( # noqa: C901
2447
- self,
2448
- node: NodeItem,
2449
- from_element: int,
2450
- to_element: int,
2451
- labels: set[DocItemLabel],
2452
- strict_text: bool,
2453
- escaping_underscores: bool,
2454
- image_placeholder: str,
2455
- image_mode: ImageRefMode,
2456
- indent: int,
2457
- text_width: int,
2458
- page_no: Optional[int],
2459
- included_content_layers: set[ContentLayer],
2460
- list_level: int,
2461
- is_inline_scope: bool,
2462
- visited: set[str], # refs of visited items
2463
- ) -> list[str]:
2464
- components: list[str] = [] # components to concatenate
2465
-
2466
- # Our export markdown doesn't contain any emphasis styling:
2467
- # Bold, Italic, or Bold-Italic
2468
- # Hence, any underscore that we print into Markdown is coming from document text
2469
- # That means we need to escape it, to properly reflect content in the markdown
2470
- # However, we need to preserve underscores in image URLs
2471
- # to maintain their validity
2472
- # For example: ![image](path/to_image.png) should remain unchanged
2473
- def _escape_underscores(text):
2474
- """Escape underscores but leave them intact in the URL.."""
2475
- # Firstly, identify all the URL patterns.
2476
- url_pattern = r"!\[.*?\]\((.*?)\)"
2477
- # Matches both inline ($...$) and block ($$...$$) LaTeX equations:
2478
- latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
2479
- combined_pattern = f"({url_pattern})|({latex_pattern})"
2480
-
2481
- parts = []
2482
- last_end = 0
2483
-
2484
- for match in re.finditer(combined_pattern, text):
2485
- # Text to add before the URL (needs to be escaped)
2486
- before_url = text[last_end : match.start()]
2487
- parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))
2488
-
2489
- # Add the full URL part (do not escape)
2490
- parts.append(match.group(0))
2491
- last_end = match.end()
2492
-
2493
- # Add the final part of the text (which needs to be escaped)
2494
- if last_end < len(text):
2495
- parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))
2496
-
2497
- return "".join(parts)
2498
-
2499
- def _ingest_text(text: str, do_escape_html=True, do_escape_underscores=True):
2500
- if do_escape_underscores and escaping_underscores:
2501
- text = _escape_underscores(text)
2502
- if do_escape_html:
2503
- text = html.escape(text, quote=False)
2504
- if text:
2505
- components.append(text)
2672
+ ser_res = serializer.serialize()
2506
2673
 
2507
- for ix, (item, level) in enumerate(
2508
- self.iterate_items(
2509
- node,
2510
- with_groups=True,
2511
- page_no=page_no,
2512
- included_content_layers=included_content_layers,
2674
+ if delim != "\n\n":
2675
+ _logger.warning(
2676
+ "Parameter `delim` has been deprecated and will be ignored.",
2677
+ )
2678
+ if strict_text:
2679
+ _logger.warning(
2680
+ "Parameter `strict_text` has been deprecated and will be ignored.",
2513
2681
  )
2514
- ):
2515
- if item.self_ref in visited:
2516
- continue
2517
- else:
2518
- visited.add(item.self_ref)
2519
-
2520
- if ix < from_element or to_element <= ix:
2521
- continue # skip as many items as you want
2522
-
2523
- elif (isinstance(item, DocItem)) and (item.label not in labels):
2524
- continue # skip any label that is not whitelisted
2525
-
2526
- elif isinstance(item, GroupItem):
2527
- if item.label in [
2528
- GroupLabel.LIST,
2529
- GroupLabel.ORDERED_LIST,
2530
- ]:
2531
- comps = self._get_markdown_components(
2532
- node=item,
2533
- from_element=from_element,
2534
- to_element=to_element,
2535
- labels=labels,
2536
- strict_text=strict_text,
2537
- escaping_underscores=escaping_underscores,
2538
- image_placeholder=image_placeholder,
2539
- image_mode=image_mode,
2540
- indent=indent,
2541
- text_width=text_width,
2542
- page_no=page_no,
2543
- included_content_layers=included_content_layers,
2544
- list_level=list_level + 1,
2545
- is_inline_scope=is_inline_scope,
2546
- visited=visited,
2547
- )
2548
- indent_str = list_level * indent * " "
2549
- is_ol = item.label == GroupLabel.ORDERED_LIST
2550
- text = "\n".join(
2551
- [
2552
- # avoid additional marker on already evaled sublists
2553
- (
2554
- c
2555
- if c and c[0] == " "
2556
- else f"{indent_str}{f'{i + 1}.' if is_ol else '-'} {c}"
2557
- )
2558
- for i, c in enumerate(comps)
2559
- ]
2560
- )
2561
- _ingest_text(
2562
- text=text,
2563
- # special chars have already been escaped as needed
2564
- do_escape_html=False,
2565
- do_escape_underscores=False,
2566
- )
2567
- elif item.label == GroupLabel.INLINE:
2568
- comps = self._get_markdown_components(
2569
- node=item,
2570
- from_element=from_element,
2571
- to_element=to_element,
2572
- labels=labels,
2573
- strict_text=strict_text,
2574
- escaping_underscores=escaping_underscores,
2575
- image_placeholder=image_placeholder,
2576
- image_mode=image_mode,
2577
- indent=indent,
2578
- text_width=text_width,
2579
- page_no=page_no,
2580
- included_content_layers=included_content_layers,
2581
- list_level=list_level,
2582
- is_inline_scope=True,
2583
- visited=visited,
2584
- )
2585
- text = " ".join(comps)
2586
- _ingest_text(
2587
- text=text,
2588
- # special chars have already been escaped as needed
2589
- do_escape_html=False,
2590
- do_escape_underscores=False,
2591
- )
2592
- else:
2593
- continue
2594
-
2595
- elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
2596
- marker = "" if strict_text else "#"
2597
- text = f"{marker} {item.text}"
2598
- _ingest_text(text.strip())
2599
-
2600
- elif (
2601
- isinstance(item, TextItem)
2602
- and item.label in [DocItemLabel.SECTION_HEADER]
2603
- ) or isinstance(item, SectionHeaderItem):
2604
- marker = ""
2605
- if not strict_text:
2606
- marker = "#" * level
2607
- if len(marker) < 2:
2608
- marker = "##"
2609
- text = f"{marker} {item.text}"
2610
- _ingest_text(text.strip())
2611
-
2612
- elif isinstance(item, CodeItem):
2613
- text = f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
2614
- _ingest_text(text, do_escape_underscores=False, do_escape_html=False)
2615
-
2616
- elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
2617
- if item.text != "":
2618
- _ingest_text(
2619
- f"${item.text}$" if is_inline_scope else f"$${item.text}$$",
2620
- do_escape_underscores=False,
2621
- do_escape_html=False,
2622
- )
2623
- elif item.orig != "":
2624
- _ingest_text(
2625
- "<!-- formula-not-decoded -->",
2626
- do_escape_underscores=False,
2627
- do_escape_html=False,
2628
- )
2629
-
2630
- elif isinstance(item, TextItem):
2631
- if len(item.text) and text_width > 0:
2632
- text = item.text
2633
- wrapped_text = textwrap.fill(text, width=text_width)
2634
- _ingest_text(wrapped_text)
2635
- elif len(item.text):
2636
- _ingest_text(item.text)
2637
-
2638
- elif isinstance(item, TableItem) and not strict_text:
2639
- if caption_text := item.caption_text(self):
2640
- _ingest_text(caption_text)
2641
- md_table = item.export_to_markdown()
2642
- _ingest_text(md_table)
2643
-
2644
- elif isinstance(item, PictureItem) and not strict_text:
2645
- _ingest_text(item.caption_text(self))
2646
-
2647
- line = item.export_to_markdown(
2648
- doc=self,
2649
- image_placeholder=image_placeholder,
2650
- image_mode=image_mode,
2651
- )
2652
-
2653
- _ingest_text(line, do_escape_html=False, do_escape_underscores=False)
2654
-
2655
- elif isinstance(item, (KeyValueItem, FormItem)):
2656
- text = item._export_to_markdown()
2657
- _ingest_text(text, do_escape_html=False, do_escape_underscores=False)
2658
-
2659
- elif isinstance(item, DocItem):
2660
- text = "<!-- missing-text -->"
2661
- _ingest_text(text, do_escape_html=False, do_escape_underscores=False)
2662
2682
 
2663
- return components
2683
+ return ser_res.text
2664
2684
 
2665
2685
  def export_to_text( # noqa: C901
2666
2686
  self,
2667
2687
  delim: str = "\n\n",
2668
2688
  from_element: int = 0,
2669
2689
  to_element: int = 1000000,
2670
- labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
2690
+ labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
2671
2691
  ) -> str:
2672
2692
  """export_to_text."""
2673
2693
  return self.export_to_markdown(