docling-core 2.21.2__py3-none-any.whl → 2.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -4,13 +4,13 @@ import base64
4
4
  import copy
5
5
  import hashlib
6
6
  import html
7
+ import itertools
7
8
  import json
8
9
  import logging
9
10
  import mimetypes
10
11
  import os
11
12
  import re
12
13
  import sys
13
- import textwrap
14
14
  import typing
15
15
  import warnings
16
16
  from enum import Enum
@@ -37,7 +37,7 @@ from pydantic import (
37
37
  model_validator,
38
38
  )
39
39
  from tabulate import tabulate
40
- from typing_extensions import Annotated, Self
40
+ from typing_extensions import Annotated, Self, deprecated
41
41
 
42
42
  from docling_core.search.package import VERSION_PATTERN
43
43
  from docling_core.types.base import _JSON_POINTER_REGEX
@@ -61,7 +61,7 @@ _logger = logging.getLogger(__name__)
61
61
 
62
62
  Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
63
63
  LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
64
- CURRENT_VERSION: Final = "1.2.0"
64
+ CURRENT_VERSION: Final = "1.3.0"
65
65
 
66
66
  DEFAULT_EXPORT_LABELS = {
67
67
  DocItemLabel.TITLE,
@@ -86,6 +86,8 @@ DOCUMENT_TOKENS_EXPORT_LABELS.update(
86
86
  [
87
87
  DocItemLabel.FOOTNOTE,
88
88
  DocItemLabel.CAPTION,
89
+ DocItemLabel.KEY_VALUE_REGION,
90
+ DocItemLabel.FORM,
89
91
  ]
90
92
  )
91
93
 
@@ -522,6 +524,49 @@ class ImageRef(BaseModel):
522
524
  )
523
525
 
524
526
 
527
class DocTagsPage(BaseModel):
    """One page of DocTags markup, optionally paired with an image."""

    # The `image` field holds a PIL image, which is not a pydantic-native type,
    # so arbitrary types have to be allowed on this model.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # DocTags markup of the page, kept as a single string.
    tokens: str
    # Image associated with the page; may be absent.
    image: Optional[PILImage.Image] = None
534
+
535
+
536
class DocTagsDocument(BaseModel):
    """Ordered collection of `DocTagsPage` objects."""

    # Pages in the order in which they were supplied.
    pages: List[DocTagsPage] = []

    @classmethod
    def from_doctags_and_image_pairs(
        cls, doctags: List[Union[Path, str]], images: List[Union[Path, PILImage.Image]]
    ):
        """Build a document by pairing each DocTags entry with its image.

        :param doctags: One entry per page. A ``Path`` is read as a text file,
            a ``str`` is used verbatim.
        :param images: One entry per page. A ``Path`` is opened with
            ``PILImage.open``, any other value is passed through unchanged.
        :raises ValueError: If ``doctags`` and ``images`` differ in length.
        :returns: A new instance holding one page per input pair.
        """
        if len(doctags) != len(images):
            raise ValueError("Number of page doctags must be equal to page images!")

        pages = []
        for dt, img in zip(doctags, images):
            # Only paths need resolving; plain strings are already the tokens.
            tokens = dt.read_text() if isinstance(dt, Path) else dt
            # Only paths need opening; in-memory images are used as they are.
            image = PILImage.open(img) if isinstance(img, Path) else img
            pages.append(DocTagsPage(tokens=tokens, image=image))

        return cls(pages=pages)
568
+
569
+
525
570
  class ProvenanceItem(BaseModel):
526
571
  """ProvenanceItem."""
527
572
 
@@ -563,9 +608,30 @@ class GroupItem(NodeItem): # Container type, can't be a leaf node
563
608
  "group" # Name of the group, e.g. "Introduction Chapter",
564
609
  # "Slide 5", "Navigation menu list", ...
565
610
  )
611
+ # TODO narrow down to allowed values, i.e. excluding those used for subtypes
566
612
  label: GroupLabel = GroupLabel.UNSPECIFIED
567
613
 
568
614
 
615
class UnorderedList(GroupItem):
    """Group item whose label is fixed to ``GroupLabel.LIST``."""

    # The label is narrowed to a single literal value, which is also the default.
    label: typing.Literal[GroupLabel.LIST] = GroupLabel.LIST  # type: ignore[assignment]
619
+
620
+
621
class OrderedList(GroupItem):
    """Group item whose label is fixed to ``GroupLabel.ORDERED_LIST``."""

    # The label is narrowed to a single literal value, which is also the default.
    label: typing.Literal[GroupLabel.ORDERED_LIST] = (
        GroupLabel.ORDERED_LIST  # type: ignore[assignment]
    )
627
+
628
+
629
class InlineGroup(GroupItem):
    """Group item whose label is fixed to ``GroupLabel.INLINE``."""

    # The label is narrowed to a single literal value, which is also the default.
    label: typing.Literal[GroupLabel.INLINE] = GroupLabel.INLINE
633
+
634
+
569
635
  class DocItem(
570
636
  NodeItem
571
637
  ): # Base type for any element that carries content, can be a leaf node
@@ -626,6 +692,15 @@ class DocItem(
626
692
  return page_image.crop(crop_bbox.as_tuple())
627
693
 
628
694
 
695
class Formatting(BaseModel):
    """Set of text-style flags; every flag defaults to ``False``."""

    bold: bool = False
    italic: bool = False
    underline: bool = False
    strikethrough: bool = False
702
+
703
+
629
704
  class TextItem(DocItem):
630
705
  """TextItem."""
631
706
 
@@ -634,18 +709,19 @@ class TextItem(DocItem):
634
709
  DocItemLabel.CHECKBOX_SELECTED,
635
710
  DocItemLabel.CHECKBOX_UNSELECTED,
636
711
  DocItemLabel.FOOTNOTE,
637
- DocItemLabel.FORMULA,
638
712
  DocItemLabel.PAGE_FOOTER,
639
713
  DocItemLabel.PAGE_HEADER,
640
714
  DocItemLabel.PARAGRAPH,
641
715
  DocItemLabel.REFERENCE,
642
716
  DocItemLabel.TEXT,
643
- DocItemLabel.TITLE,
644
717
  ]
645
718
 
646
719
  orig: str # untreated representation
647
720
  text: str # sanitized representation
648
721
 
722
+ formatting: Optional[Formatting] = None
723
+ hyperlink: Optional[Union[AnyUrl, Path]] = None
724
+
649
725
  def export_to_document_tokens(
650
726
  self,
651
727
  doc: "DoclingDocument",
@@ -683,6 +759,14 @@ class TextItem(DocItem):
683
759
  return body
684
760
 
685
761
 
762
class TitleItem(TextItem):
    """Text item whose label is fixed to ``DocItemLabel.TITLE``."""

    # The label is narrowed to a single literal value, which is also the default.
    label: typing.Literal[DocItemLabel.TITLE] = (
        DocItemLabel.TITLE  # type: ignore[assignment]
    )
768
+
769
+
686
770
  class SectionHeaderItem(TextItem):
687
771
  """SectionItem."""
688
772
 
@@ -818,6 +902,14 @@ class CodeItem(FloatingItem, TextItem):
818
902
  return body
819
903
 
820
904
 
905
class FormulaItem(TextItem):
    """Text item whose label is fixed to ``DocItemLabel.FORMULA``."""

    # The label is narrowed to a single literal value, which is also the default.
    label: typing.Literal[DocItemLabel.FORMULA] = (
        DocItemLabel.FORMULA  # type: ignore[assignment]
    )
911
+
912
+
821
913
  class PictureItem(FloatingItem):
822
914
  """PictureItem."""
823
915
 
@@ -856,54 +948,34 @@ class PictureItem(FloatingItem):
856
948
def export_to_markdown(
    self,
    doc: "DoclingDocument",
    add_caption: bool = True,  # deprecated
    image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
    image_placeholder: str = "<!-- image -->",
) -> str:
    """Export picture to Markdown format.

    :param doc: Document that this picture belongs to.
    :param add_caption: Deprecated and ignored; a warning is logged when it
        is set to ``False``.
    :param image_mode: Forwarded to the serializer and to the picture
        serializer.
    :param image_placeholder: Forwarded to the picture serializer.
    :returns: Text produced by the picture serializer, or an empty string
        when the document serializer has no picture serializer.
    """
    # Imported locally, as in the other export methods that use the serializer.
    from docling_core.experimental.serializer.markdown import MarkdownDocSerializer

    if not add_caption:
        _logger.warning(
            "Argument `add_caption` is deprecated and will be ignored.",
        )

    # The serializer is built around the document; the picture itself is
    # handed over separately as `item` below.
    serializer = MarkdownDocSerializer(
        doc=doc,
        image_mode=image_mode,
    )
    picture_serializer = serializer.picture_serializer
    if not picture_serializer:
        return ""

    return picture_serializer.serialize(
        item=self,
        doc_serializer=serializer,
        doc=doc,
        image_mode=image_mode,
        image_placeholder=image_placeholder,
    ).text
907
979
 
908
980
  def export_to_html(
909
981
  self,
@@ -1003,6 +1075,20 @@ class PictureItem(FloatingItem):
1003
1075
  predicted_class = classifications[0].predicted_classes[0].class_name
1004
1076
  body += DocumentToken.get_picture_classification_token(predicted_class)
1005
1077
 
1078
+ smiles_annotations = [
1079
+ ann for ann in self.annotations if isinstance(ann, PictureMoleculeData)
1080
+ ]
1081
+ if len(smiles_annotations) > 0:
1082
+ body += (
1083
+ "<"
1084
+ + DocumentToken.SMILES.value
1085
+ + ">"
1086
+ + smiles_annotations[0].smi
1087
+ + "</"
1088
+ + DocumentToken.SMILES.value
1089
+ + ">"
1090
+ )
1091
+
1006
1092
  if add_caption and len(self.captions):
1007
1093
  text = self.caption_text(doc)
1008
1094
 
@@ -1078,33 +1164,58 @@ class TableItem(FloatingItem):
1078
1164
 
1079
1165
  return df
1080
1166
 
1081
def export_to_markdown(self, doc: Optional["DoclingDocument"] = None) -> str:
    """Export the table as markdown.

    :param doc: Document that this table belongs to. When given, rendering is
        delegated to the Markdown table serializer. When omitted, a
        deprecation warning is logged and the table grid is rendered
        directly with ``tabulate``.
    :returns: The Markdown text; an empty string when no table serializer is
        available or when the grid has no data rows below its first row.
    """
    if doc is None:
        _logger.warning(
            "Usage of TableItem.export_to_markdown() without `doc` argument is "
            "deprecated.",
        )

        # Newlines inside a cell would break the Markdown row, so they are
        # replaced by spaces.
        rows = [
            [cell.text.replace("\n", " ") for cell in grid_row]
            for grid_row in self.data.grid
        ]

        if len(rows) <= 1 or len(rows[0]) == 0:
            return ""

        header, body = rows[0], rows[1:]
        try:
            return tabulate(body, headers=header, tablefmt="github")
        except ValueError:
            # Retry with number parsing switched off.
            return tabulate(
                body,
                headers=header,
                tablefmt="github",
                disable_numparse=True,
            )

    from docling_core.experimental.serializer.markdown import (
        MarkdownDocSerializer,
    )

    serializer = MarkdownDocSerializer(
        doc=doc,
    )
    table_serializer = serializer.table_serializer
    if not table_serializer:
        return ""

    return table_serializer.serialize(
        item=self,
        doc_serializer=serializer,
        doc=doc,
    ).text
1108
1219
 
1109
1220
  def export_to_html(
1110
1221
  self,
@@ -1397,10 +1508,6 @@ class KeyValueItem(FloatingItem):
1397
1508
 
1398
1509
  graph: GraphData
1399
1510
 
1400
- def _export_to_markdown(self) -> str:
1401
- # TODO add actual implementation
1402
- return "<!-- missing-key-value-item -->"
1403
-
1404
1511
 
1405
1512
  class FormItem(FloatingItem):
1406
1513
  """FormItem."""
@@ -1409,17 +1516,15 @@ class FormItem(FloatingItem):
1409
1516
 
1410
1517
  graph: GraphData
1411
1518
 
1412
- def _export_to_markdown(self) -> str:
1413
- # TODO add actual implementation
1414
- return "<!-- missing-form-item -->"
1415
-
1416
1519
 
1417
1520
  ContentItem = Annotated[
1418
1521
  Union[
1419
1522
  TextItem,
1523
+ TitleItem,
1420
1524
  SectionHeaderItem,
1421
1525
  ListItem,
1422
1526
  CodeItem,
1527
+ FormulaItem,
1423
1528
  PictureItem,
1424
1529
  TableItem,
1425
1530
  KeyValueItem,
@@ -1530,8 +1635,10 @@ class DoclingDocument(BaseModel):
1530
1635
  ) # List[RefItem] = []
1531
1636
  body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []
1532
1637
 
1533
- groups: List[GroupItem] = []
1534
- texts: List[Union[SectionHeaderItem, ListItem, TextItem, CodeItem]] = []
1638
+ groups: List[Union[OrderedList, UnorderedList, InlineGroup, GroupItem]] = []
1639
+ texts: List[
1640
+ Union[TitleItem, SectionHeaderItem, ListItem, CodeItem, FormulaItem, TextItem]
1641
+ ] = []
1535
1642
  pictures: List[PictureItem] = []
1536
1643
  tables: List[TableItem] = []
1537
1644
  key_value_items: List[KeyValueItem] = []
@@ -1555,6 +1662,68 @@ class DoclingDocument(BaseModel):
1555
1662
  item["content_layer"] = "furniture"
1556
1663
  return data
1557
1664
 
1665
+ ###################################
1666
+ # TODO: refactor add* methods below
1667
+ ###################################
1668
+
1669
def add_ordered_list(
    self,
    name: Optional[str] = None,
    parent: Optional[NodeItem] = None,
    content_layer: Optional[ContentLayer] = None,
) -> GroupItem:
    """Append a new ``OrderedList`` group to the document.

    :param name: Optional[str]: Group name; left at the model default when
        ``None``. (Default value = None)
    :param parent: Optional[NodeItem]: Node that receives the new group as a
        child; the document body is used when not given. (Default value = None)
    :param content_layer: Optional[ContentLayer]: Set on the group only when
        given. (Default value = None)
    :returns: The newly created group.
    """
    owner = parent if parent else self.body
    # The group's reference is its position in `self.groups`.
    ref = f"#/groups/{len(self.groups)}"

    ordered = OrderedList(self_ref=ref, parent=owner.get_ref())
    if content_layer:
        ordered.content_layer = content_layer
    if name is not None:
        ordered.name = name

    self.groups.append(ordered)
    owner.children.append(RefItem(cref=ref))
    return ordered
1687
+
1688
def add_unordered_list(
    self,
    name: Optional[str] = None,
    parent: Optional[NodeItem] = None,
    content_layer: Optional[ContentLayer] = None,
) -> GroupItem:
    """Append a new ``UnorderedList`` group to the document.

    :param name: Optional[str]: Group name; left at the model default when
        ``None``. (Default value = None)
    :param parent: Optional[NodeItem]: Node that receives the new group as a
        child; the document body is used when not given. (Default value = None)
    :param content_layer: Optional[ContentLayer]: Set on the group only when
        given. (Default value = None)
    :returns: The newly created group.
    """
    owner = parent if parent else self.body
    # The group's reference is its position in `self.groups`.
    ref = f"#/groups/{len(self.groups)}"

    unordered = UnorderedList(self_ref=ref, parent=owner.get_ref())
    if content_layer:
        unordered.content_layer = content_layer
    if name is not None:
        unordered.name = name

    self.groups.append(unordered)
    owner.children.append(RefItem(cref=ref))
    return unordered
1706
+
1707
def add_inline_group(
    self,
    name: Optional[str] = None,
    parent: Optional[NodeItem] = None,
    content_layer: Optional[ContentLayer] = None,
    # marker: Optional[UnorderedList.ULMarker] = None,
) -> GroupItem:
    """Append a new ``InlineGroup`` group to the document.

    :param name: Optional[str]: Group name; left at the model default when
        ``None``. (Default value = None)
    :param parent: Optional[NodeItem]: Node that receives the new group as a
        child; the document body is used when not given. (Default value = None)
    :param content_layer: Optional[ContentLayer]: Set on the group only when
        given. (Default value = None)
    :returns: The newly created group.
    """
    owner = parent if parent else self.body
    # The group's reference is its position in `self.groups`.
    ref = f"#/groups/{len(self.groups)}"

    inline = InlineGroup(self_ref=ref, parent=owner.get_ref())
    if content_layer:
        inline.content_layer = content_layer
    if name is not None:
        inline.name = name

    self.groups.append(inline)
    owner.children.append(RefItem(cref=ref))
    return inline
1726
+
1558
1727
  def add_group(
1559
1728
  self,
1560
1729
  label: Optional[GroupLabel] = None,
@@ -1569,6 +1738,25 @@ class DoclingDocument(BaseModel):
1569
1738
  :param parent: Optional[NodeItem]: (Default value = None)
1570
1739
 
1571
1740
  """
1741
+ if label == GroupLabel.LIST:
1742
+ return self.add_unordered_list(
1743
+ name=name,
1744
+ parent=parent,
1745
+ content_layer=content_layer,
1746
+ )
1747
+ elif label == GroupLabel.ORDERED_LIST:
1748
+ return self.add_ordered_list(
1749
+ name=name,
1750
+ parent=parent,
1751
+ content_layer=content_layer,
1752
+ )
1753
+ elif label == GroupLabel.INLINE:
1754
+ return self.add_inline_group(
1755
+ name=name,
1756
+ parent=parent,
1757
+ content_layer=content_layer,
1758
+ )
1759
+
1572
1760
  if not parent:
1573
1761
  parent = self.body
1574
1762
 
@@ -1597,6 +1785,8 @@ class DoclingDocument(BaseModel):
1597
1785
  prov: Optional[ProvenanceItem] = None,
1598
1786
  parent: Optional[NodeItem] = None,
1599
1787
  content_layer: Optional[ContentLayer] = None,
1788
+ formatting: Optional[Formatting] = None,
1789
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
1600
1790
  ):
1601
1791
  """add_list_item.
1602
1792
 
@@ -1624,6 +1814,8 @@ class DoclingDocument(BaseModel):
1624
1814
  parent=parent.get_ref(),
1625
1815
  enumerated=enumerated,
1626
1816
  marker=marker,
1817
+ formatting=formatting,
1818
+ hyperlink=hyperlink,
1627
1819
  )
1628
1820
  if prov:
1629
1821
  list_item.prov.append(prov)
@@ -1643,6 +1835,8 @@ class DoclingDocument(BaseModel):
1643
1835
  prov: Optional[ProvenanceItem] = None,
1644
1836
  parent: Optional[NodeItem] = None,
1645
1837
  content_layer: Optional[ContentLayer] = None,
1838
+ formatting: Optional[Formatting] = None,
1839
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
1646
1840
  ):
1647
1841
  """add_text.
1648
1842
 
@@ -1662,6 +1856,8 @@ class DoclingDocument(BaseModel):
1662
1856
  prov=prov,
1663
1857
  parent=parent,
1664
1858
  content_layer=content_layer,
1859
+ formatting=formatting,
1860
+ hyperlink=hyperlink,
1665
1861
  )
1666
1862
 
1667
1863
  elif label in [DocItemLabel.LIST_ITEM]:
@@ -1671,15 +1867,31 @@ class DoclingDocument(BaseModel):
1671
1867
  prov=prov,
1672
1868
  parent=parent,
1673
1869
  content_layer=content_layer,
1870
+ formatting=formatting,
1871
+ hyperlink=hyperlink,
1872
+ )
1873
+
1874
+ elif label in [DocItemLabel.TITLE]:
1875
+ return self.add_title(
1876
+ text=text,
1877
+ orig=orig,
1878
+ prov=prov,
1879
+ parent=parent,
1880
+ content_layer=content_layer,
1881
+ formatting=formatting,
1882
+ hyperlink=hyperlink,
1674
1883
  )
1675
1884
 
1676
1885
  elif label in [DocItemLabel.SECTION_HEADER]:
1677
1886
  return self.add_heading(
1678
1887
  text=text,
1679
1888
  orig=orig,
1889
+ # NOTE: we do not / cannot pass the level here, lossy path..
1680
1890
  prov=prov,
1681
1891
  parent=parent,
1682
1892
  content_layer=content_layer,
1893
+ formatting=formatting,
1894
+ hyperlink=hyperlink,
1683
1895
  )
1684
1896
 
1685
1897
  elif label in [DocItemLabel.CODE]:
@@ -1689,6 +1901,18 @@ class DoclingDocument(BaseModel):
1689
1901
  prov=prov,
1690
1902
  parent=parent,
1691
1903
  content_layer=content_layer,
1904
+ formatting=formatting,
1905
+ hyperlink=hyperlink,
1906
+ )
1907
+ elif label in [DocItemLabel.FORMULA]:
1908
+ return self.add_formula(
1909
+ text=text,
1910
+ orig=orig,
1911
+ prov=prov,
1912
+ parent=parent,
1913
+ content_layer=content_layer,
1914
+ formatting=formatting,
1915
+ hyperlink=hyperlink,
1692
1916
  )
1693
1917
 
1694
1918
  else:
@@ -1707,6 +1931,8 @@ class DoclingDocument(BaseModel):
1707
1931
  orig=orig,
1708
1932
  self_ref=cref,
1709
1933
  parent=parent.get_ref(),
1934
+ formatting=formatting,
1935
+ hyperlink=hyperlink,
1710
1936
  )
1711
1937
  if prov:
1712
1938
  text_item.prov.append(prov)
@@ -1808,11 +2034,14 @@ class DoclingDocument(BaseModel):
1808
2034
  prov: Optional[ProvenanceItem] = None,
1809
2035
  parent: Optional[NodeItem] = None,
1810
2036
  content_layer: Optional[ContentLayer] = None,
2037
+ formatting: Optional[Formatting] = None,
2038
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
1811
2039
  ):
1812
2040
  """add_title.
1813
2041
 
1814
2042
  :param text: str:
1815
2043
  :param orig: Optional[str]: (Default value = None)
2044
+ :param level: LevelNumber: (Default value = 1)
1816
2045
  :param prov: Optional[ProvenanceItem]: (Default value = None)
1817
2046
  :param parent: Optional[NodeItem]: (Default value = None)
1818
2047
  """
@@ -1824,22 +2053,23 @@ class DoclingDocument(BaseModel):
1824
2053
 
1825
2054
  text_index = len(self.texts)
1826
2055
  cref = f"#/texts/{text_index}"
1827
- text_item = TextItem(
1828
- label=DocItemLabel.TITLE,
2056
+ item = TitleItem(
1829
2057
  text=text,
1830
2058
  orig=orig,
1831
2059
  self_ref=cref,
1832
2060
  parent=parent.get_ref(),
2061
+ formatting=formatting,
2062
+ hyperlink=hyperlink,
1833
2063
  )
1834
2064
  if prov:
1835
- text_item.prov.append(prov)
2065
+ item.prov.append(prov)
1836
2066
  if content_layer:
1837
- text_item.content_layer = content_layer
2067
+ item.content_layer = content_layer
1838
2068
 
1839
- self.texts.append(text_item)
2069
+ self.texts.append(item)
1840
2070
  parent.children.append(RefItem(cref=cref))
1841
2071
 
1842
- return text_item
2072
+ return item
1843
2073
 
1844
2074
  def add_code(
1845
2075
  self,
@@ -1850,6 +2080,8 @@ class DoclingDocument(BaseModel):
1850
2080
  prov: Optional[ProvenanceItem] = None,
1851
2081
  parent: Optional[NodeItem] = None,
1852
2082
  content_layer: Optional[ContentLayer] = None,
2083
+ formatting: Optional[Formatting] = None,
2084
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
1853
2085
  ):
1854
2086
  """add_code.
1855
2087
 
@@ -1874,6 +2106,8 @@ class DoclingDocument(BaseModel):
1874
2106
  orig=orig,
1875
2107
  self_ref=cref,
1876
2108
  parent=parent.get_ref(),
2109
+ formatting=formatting,
2110
+ hyperlink=hyperlink,
1877
2111
  )
1878
2112
  if code_language:
1879
2113
  code_item.code_language = code_language
@@ -1889,6 +2123,50 @@ class DoclingDocument(BaseModel):
1889
2123
 
1890
2124
  return code_item
1891
2125
 
2126
def add_formula(
    self,
    text: str,
    orig: Optional[str] = None,
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[NodeItem] = None,
    content_layer: Optional[ContentLayer] = None,
    formatting: Optional[Formatting] = None,
    hyperlink: Optional[Union[AnyUrl, Path]] = None,
):
    """Append a new ``FormulaItem`` to the document texts.

    :param text: str: Text of the formula.
    :param orig: Optional[str]: Original representation; ``text`` is used
        when this is empty or not given. (Default value = None)
    :param prov: Optional[ProvenanceItem]: Appended to the item's provenance
        list when given. (Default value = None)
    :param parent: Optional[NodeItem]: Node that receives the new item as a
        child; the document body is used when not given. (Default value = None)
    :param content_layer: Optional[ContentLayer]: Set on the item only when
        given. (Default value = None)
    :param formatting: Optional[Formatting]: Passed through to the item.
        (Default value = None)
    :param hyperlink: Optional[Union[AnyUrl, Path]]: Passed through to the
        item. (Default value = None)
    :returns: The newly created formula item.
    """
    owner = parent if parent else self.body
    raw = orig if orig else text

    # The item's reference is its position in `self.texts`.
    ref = f"#/texts/{len(self.texts)}"
    formula_item = FormulaItem(
        text=text,
        orig=raw,
        self_ref=ref,
        parent=owner.get_ref(),
        formatting=formatting,
        hyperlink=hyperlink,
    )
    if prov:
        formula_item.prov.append(prov)
    if content_layer:
        formula_item.content_layer = content_layer

    self.texts.append(formula_item)
    owner.children.append(RefItem(cref=ref))

    return formula_item
2169
+
1892
2170
  def add_heading(
1893
2171
  self,
1894
2172
  text: str,
@@ -1897,6 +2175,8 @@ class DoclingDocument(BaseModel):
1897
2175
  prov: Optional[ProvenanceItem] = None,
1898
2176
  parent: Optional[NodeItem] = None,
1899
2177
  content_layer: Optional[ContentLayer] = None,
2178
+ formatting: Optional[Formatting] = None,
2179
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
1900
2180
  ):
1901
2181
  """add_heading.
1902
2182
 
@@ -1921,6 +2201,8 @@ class DoclingDocument(BaseModel):
1921
2201
  orig=orig,
1922
2202
  self_ref=cref,
1923
2203
  parent=parent.get_ref(),
2204
+ formatting=formatting,
2205
+ hyperlink=hyperlink,
1924
2206
  )
1925
2207
  if prov:
1926
2208
  section_header_item.prov.append(prov)
@@ -2276,10 +2558,10 @@ class DoclingDocument(BaseModel):
2276
2558
  self,
2277
2559
  filename: Path,
2278
2560
  artifacts_dir: Optional[Path] = None,
2279
- delim: str = "\n\n", # TODO: deprecate
2561
+ delim: str = "\n\n",
2280
2562
  from_element: int = 0,
2281
2563
  to_element: int = sys.maxsize,
2282
- labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
2564
+ labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
2283
2565
  strict_text: bool = False,
2284
2566
  escaping_underscores: bool = True,
2285
2567
  image_placeholder: str = "<!-- image -->",
@@ -2319,10 +2601,10 @@ class DoclingDocument(BaseModel):
2319
2601
 
2320
2602
  def export_to_markdown( # noqa: C901
2321
2603
  self,
2322
- delim: str = "\n\n", # TODO deprecate
2604
+ delim: str = "\n\n",
2323
2605
  from_element: int = 0,
2324
2606
  to_element: int = sys.maxsize,
2325
- labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
2607
+ labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
2326
2608
  strict_text: bool = False,
2327
2609
  escaping_underscores: bool = True,
2328
2610
  image_placeholder: str = "<!-- image -->",
@@ -2337,9 +2619,8 @@ class DoclingDocument(BaseModel):
2337
2619
  Operates on a slice of the document's body as defined through arguments
2338
2620
  from_element and to_element; defaulting to the whole document.
2339
2621
 
2340
- :param delim: Delimiter to use when concatenating the various
2341
- Markdown parts. (Default value = "\n").
2342
- :type delim: str = "\n"
2622
+ :param delim: Deprecated.
2623
+ :type delim: str = "\n\n"
2343
2624
  :param from_element: Body slicing start index (inclusive).
2344
2625
  (Default value = 0).
2345
2626
  :type from_element: int = 0
@@ -2347,9 +2628,8 @@ class DoclingDocument(BaseModel):
2347
2628
  (exclusive). (Default value = maxint).
2348
2629
  :type to_element: int = sys.maxsize
2349
2630
  :param labels: The set of document labels to include in the export.
2350
- :type labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS
2351
- :param strict_text: bool: Whether to only include the text content
2352
- of the document. (Default value = False).
2631
+ :type labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS
2632
+ :param strict_text: Deprecated.
2353
2633
  :type strict_text: bool = False
2354
2634
  :param escaping_underscores: bool: Whether to escape underscores in the
2355
2635
  text content of the document. (Default value = True).
@@ -2366,250 +2646,48 @@ class DoclingDocument(BaseModel):
2366
2646
  :returns: The exported Markdown representation.
2367
2647
  :rtype: str
2368
2648
  """
2369
- comps = self._get_markdown_components(
2370
- node=self.body,
2371
- from_element=from_element,
2372
- to_element=to_element,
2373
- labels=labels,
2374
- strict_text=strict_text,
2375
- escaping_underscores=escaping_underscores,
2649
+ from docling_core.experimental.serializer.markdown import (
2650
+ MarkdownDocSerializer,
2651
+ MarkdownListSerializer,
2652
+ MarkdownTextSerializer,
2653
+ )
2654
+
2655
+ serializer = MarkdownDocSerializer(
2656
+ doc=self,
2657
+ start=from_element,
2658
+ stop=to_element,
2376
2659
  image_placeholder=image_placeholder,
2377
2660
  image_mode=image_mode,
2378
- indent=indent,
2379
- text_width=text_width,
2380
- page_no=page_no,
2381
- included_content_layers=included_content_layers,
2382
- list_level=0,
2383
- is_inline_scope=False,
2384
- visited=set(),
2661
+ labels=labels,
2662
+ layers=included_content_layers,
2663
+ pages={page_no} if page_no is not None else None,
2664
+ escaping_underscores=escaping_underscores,
2665
+ text_serializer=MarkdownTextSerializer(
2666
+ wrap_width=text_width if text_width > 0 else None,
2667
+ ),
2668
+ list_serializer=MarkdownListSerializer(
2669
+ indent=indent,
2670
+ ),
2385
2671
  )
2386
- return delim.join(comps)
2387
-
2388
- def _get_markdown_components( # noqa: C901
2389
- self,
2390
- node: NodeItem,
2391
- from_element: int,
2392
- to_element: int,
2393
- labels: set[DocItemLabel],
2394
- strict_text: bool,
2395
- escaping_underscores: bool,
2396
- image_placeholder: str,
2397
- image_mode: ImageRefMode,
2398
- indent: int,
2399
- text_width: int,
2400
- page_no: Optional[int],
2401
- included_content_layers: set[ContentLayer],
2402
- list_level: int,
2403
- is_inline_scope: bool,
2404
- visited: set[str], # refs of visited items
2405
- ) -> list[str]:
2406
- components: list[str] = [] # components to concatenate
2407
-
2408
- # Our export markdown doesn't contain any emphasis styling:
2409
- # Bold, Italic, or Bold-Italic
2410
- # Hence, any underscore that we print into Markdown is coming from document text
2411
- # That means we need to escape it, to properly reflect content in the markdown
2412
- # However, we need to preserve underscores in image URLs
2413
- # to maintain their validity
2414
- # For example: ![image](path/to_image.png) should remain unchanged
2415
- def _escape_underscores(text):
2416
- """Escape underscores but leave them intact in the URL.."""
2417
- # Firstly, identify all the URL patterns.
2418
- url_pattern = r"!\[.*?\]\((.*?)\)"
2419
- # Matches both inline ($...$) and block ($$...$$) LaTeX equations:
2420
- latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
2421
- combined_pattern = f"({url_pattern})|({latex_pattern})"
2422
-
2423
- parts = []
2424
- last_end = 0
2425
-
2426
- for match in re.finditer(combined_pattern, text):
2427
- # Text to add before the URL (needs to be escaped)
2428
- before_url = text[last_end : match.start()]
2429
- parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))
2430
-
2431
- # Add the full URL part (do not escape)
2432
- parts.append(match.group(0))
2433
- last_end = match.end()
2434
-
2435
- # Add the final part of the text (which needs to be escaped)
2436
- if last_end < len(text):
2437
- parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))
2438
-
2439
- return "".join(parts)
2440
-
2441
- def _ingest_text(text: str, do_escape_html=True, do_escape_underscores=True):
2442
- if do_escape_underscores and escaping_underscores:
2443
- text = _escape_underscores(text)
2444
- if do_escape_html:
2445
- text = html.escape(text, quote=False)
2446
- if text:
2447
- components.append(text)
2672
+ ser_res = serializer.serialize()
2448
2673
 
2449
- for ix, (item, level) in enumerate(
2450
- self.iterate_items(
2451
- node,
2452
- with_groups=True,
2453
- page_no=page_no,
2454
- included_content_layers=included_content_layers,
2674
+ if delim != "\n\n":
2675
+ _logger.warning(
2676
+ "Parameter `delim` has been deprecated and will be ignored.",
2677
+ )
2678
+ if strict_text:
2679
+ _logger.warning(
2680
+ "Parameter `strict_text` has been deprecated and will be ignored.",
2455
2681
  )
2456
- ):
2457
- if item.self_ref in visited:
2458
- continue
2459
- else:
2460
- visited.add(item.self_ref)
2461
-
2462
- if ix < from_element or to_element <= ix:
2463
- continue # skip as many items as you want
2464
-
2465
- elif (isinstance(item, DocItem)) and (item.label not in labels):
2466
- continue # skip any label that is not whitelisted
2467
-
2468
- elif isinstance(item, GroupItem):
2469
- if item.label in [
2470
- GroupLabel.LIST,
2471
- GroupLabel.ORDERED_LIST,
2472
- ]:
2473
- comps = self._get_markdown_components(
2474
- node=item,
2475
- from_element=from_element,
2476
- to_element=to_element,
2477
- labels=labels,
2478
- strict_text=strict_text,
2479
- escaping_underscores=escaping_underscores,
2480
- image_placeholder=image_placeholder,
2481
- image_mode=image_mode,
2482
- indent=indent,
2483
- text_width=text_width,
2484
- page_no=page_no,
2485
- included_content_layers=included_content_layers,
2486
- list_level=list_level + 1,
2487
- is_inline_scope=is_inline_scope,
2488
- visited=visited,
2489
- )
2490
- indent_str = list_level * indent * " "
2491
- is_ol = item.label == GroupLabel.ORDERED_LIST
2492
- text = "\n".join(
2493
- [
2494
- # avoid additional marker on already evaled sublists
2495
- (
2496
- c
2497
- if c and c[0] == " "
2498
- else f"{indent_str}{f'{i + 1}.' if is_ol else '-'} {c}"
2499
- )
2500
- for i, c in enumerate(comps)
2501
- ]
2502
- )
2503
- _ingest_text(
2504
- text=text,
2505
- # special chars have already been escaped as needed
2506
- do_escape_html=False,
2507
- do_escape_underscores=False,
2508
- )
2509
- elif item.label == GroupLabel.INLINE:
2510
- comps = self._get_markdown_components(
2511
- node=item,
2512
- from_element=from_element,
2513
- to_element=to_element,
2514
- labels=labels,
2515
- strict_text=strict_text,
2516
- escaping_underscores=escaping_underscores,
2517
- image_placeholder=image_placeholder,
2518
- image_mode=image_mode,
2519
- indent=indent,
2520
- text_width=text_width,
2521
- page_no=page_no,
2522
- included_content_layers=included_content_layers,
2523
- list_level=list_level,
2524
- is_inline_scope=True,
2525
- visited=visited,
2526
- )
2527
- text = " ".join(comps)
2528
- _ingest_text(
2529
- text=text,
2530
- # special chars have already been escaped as needed
2531
- do_escape_html=False,
2532
- do_escape_underscores=False,
2533
- )
2534
- else:
2535
- continue
2536
-
2537
- elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
2538
- marker = "" if strict_text else "#"
2539
- text = f"{marker} {item.text}"
2540
- _ingest_text(text.strip())
2541
-
2542
- elif (
2543
- isinstance(item, TextItem)
2544
- and item.label in [DocItemLabel.SECTION_HEADER]
2545
- ) or isinstance(item, SectionHeaderItem):
2546
- marker = ""
2547
- if not strict_text:
2548
- marker = "#" * level
2549
- if len(marker) < 2:
2550
- marker = "##"
2551
- text = f"{marker} {item.text}"
2552
- _ingest_text(text.strip())
2553
-
2554
- elif isinstance(item, CodeItem):
2555
- text = f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
2556
- _ingest_text(text, do_escape_underscores=False, do_escape_html=False)
2557
-
2558
- elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
2559
- if item.text != "":
2560
- _ingest_text(
2561
- f"${item.text}$" if is_inline_scope else f"$${item.text}$$",
2562
- do_escape_underscores=False,
2563
- do_escape_html=False,
2564
- )
2565
- elif item.orig != "":
2566
- _ingest_text(
2567
- "<!-- formula-not-decoded -->",
2568
- do_escape_underscores=False,
2569
- do_escape_html=False,
2570
- )
2571
-
2572
- elif isinstance(item, TextItem):
2573
- if len(item.text) and text_width > 0:
2574
- text = item.text
2575
- wrapped_text = textwrap.fill(text, width=text_width)
2576
- _ingest_text(wrapped_text)
2577
- elif len(item.text):
2578
- _ingest_text(item.text)
2579
-
2580
- elif isinstance(item, TableItem) and not strict_text:
2581
- if caption_text := item.caption_text(self):
2582
- _ingest_text(caption_text)
2583
- md_table = item.export_to_markdown()
2584
- _ingest_text(md_table)
2585
-
2586
- elif isinstance(item, PictureItem) and not strict_text:
2587
- _ingest_text(item.caption_text(self))
2588
-
2589
- line = item.export_to_markdown(
2590
- doc=self,
2591
- image_placeholder=image_placeholder,
2592
- image_mode=image_mode,
2593
- )
2594
-
2595
- _ingest_text(line, do_escape_html=False, do_escape_underscores=False)
2596
-
2597
- elif isinstance(item, (KeyValueItem, FormItem)):
2598
- text = item._export_to_markdown()
2599
- _ingest_text(text, do_escape_html=False, do_escape_underscores=False)
2600
-
2601
- elif isinstance(item, DocItem):
2602
- text = "<!-- missing-text -->"
2603
- _ingest_text(text, do_escape_html=False, do_escape_underscores=False)
2604
2682
 
2605
- return components
2683
+ return ser_res.text
2606
2684
 
2607
2685
  def export_to_text( # noqa: C901
2608
2686
  self,
2609
2687
  delim: str = "\n\n",
2610
2688
  from_element: int = 0,
2611
2689
  to_element: int = 1000000,
2612
- labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
2690
+ labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
2613
2691
  ) -> str:
2614
2692
  """export_to_text."""
2615
2693
  return self.export_to_markdown(
@@ -2936,7 +3014,378 @@ class DoclingDocument(BaseModel):
2936
3014
 
2937
3015
  return html_text
2938
3016
 
2939
- def save_as_document_tokens(
3017
+ def load_from_doctags( # noqa: C901
3018
+ self,
3019
+ doctag_document: DocTagsDocument,
3020
+ ) -> "DoclingDocument":
3021
+ r"""Load Docling document from lists of DocTags and Images."""
3022
+ # Maps the recognized tag to a Docling label.
3023
+ # Code items will be given DocItemLabel.CODE
3024
+ tag_to_doclabel = {
3025
+ "title": DocItemLabel.TITLE,
3026
+ "document_index": DocItemLabel.DOCUMENT_INDEX,
3027
+ "otsl": DocItemLabel.TABLE,
3028
+ "section_header_level_1": DocItemLabel.SECTION_HEADER,
3029
+ "checkbox_selected": DocItemLabel.CHECKBOX_SELECTED,
3030
+ "checkbox_unselected": DocItemLabel.CHECKBOX_UNSELECTED,
3031
+ "text": DocItemLabel.TEXT,
3032
+ "page_header": DocItemLabel.PAGE_HEADER,
3033
+ "page_footer": DocItemLabel.PAGE_FOOTER,
3034
+ "formula": DocItemLabel.FORMULA,
3035
+ "caption": DocItemLabel.CAPTION,
3036
+ "picture": DocItemLabel.PICTURE,
3037
+ "list_item": DocItemLabel.LIST_ITEM,
3038
+ "footnote": DocItemLabel.FOOTNOTE,
3039
+ "code": DocItemLabel.CODE,
3040
+ }
3041
+
3042
+ def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
3043
+ """Extract <loc_...> coords from the chunk, normalized by / 500."""
3044
+ coords = re.findall(r"<loc_(\d+)>", text_chunk)
3045
+ if len(coords) == 4:
3046
+ l, t, r, b = map(float, coords)
3047
+ return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
3048
+ return None
3049
+
3050
+ def extract_inner_text(text_chunk: str) -> str:
3051
+ """Strip all <...> tags inside the chunk to get the raw text content."""
3052
+ return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()
3053
+
3054
+ def otsl_parse_texts(texts, tokens):
3055
+ split_word = TableToken.OTSL_NL.value
3056
+ split_row_tokens = [
3057
+ list(y)
3058
+ for x, y in itertools.groupby(tokens, lambda z: z == split_word)
3059
+ if not x
3060
+ ]
3061
+ table_cells = []
3062
+ r_idx = 0
3063
+ c_idx = 0
3064
+
3065
+ def count_right(tokens, c_idx, r_idx, which_tokens):
3066
+ span = 0
3067
+ c_idx_iter = c_idx
3068
+ while tokens[r_idx][c_idx_iter] in which_tokens:
3069
+ c_idx_iter += 1
3070
+ span += 1
3071
+ if c_idx_iter >= len(tokens[r_idx]):
3072
+ return span
3073
+ return span
3074
+
3075
+ def count_down(tokens, c_idx, r_idx, which_tokens):
3076
+ span = 0
3077
+ r_idx_iter = r_idx
3078
+ while tokens[r_idx_iter][c_idx] in which_tokens:
3079
+ r_idx_iter += 1
3080
+ span += 1
3081
+ if r_idx_iter >= len(tokens):
3082
+ return span
3083
+ return span
3084
+
3085
+ for i, text in enumerate(texts):
3086
+ cell_text = ""
3087
+ if text in [
3088
+ TableToken.OTSL_FCEL.value,
3089
+ TableToken.OTSL_ECEL.value,
3090
+ TableToken.OTSL_CHED.value,
3091
+ TableToken.OTSL_RHED.value,
3092
+ TableToken.OTSL_SROW.value,
3093
+ ]:
3094
+ row_span = 1
3095
+ col_span = 1
3096
+ right_offset = 1
3097
+ if text != TableToken.OTSL_ECEL.value:
3098
+ cell_text = texts[i + 1]
3099
+ right_offset = 2
3100
+
3101
+ # Check next element(s) for lcel / ucel / xcel,
3102
+ # set properly row_span, col_span
3103
+ next_right_cell = ""
3104
+ if i + right_offset < len(texts):
3105
+ next_right_cell = texts[i + right_offset]
3106
+
3107
+ next_bottom_cell = ""
3108
+ if r_idx + 1 < len(split_row_tokens):
3109
+ if c_idx < len(split_row_tokens[r_idx + 1]):
3110
+ next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
3111
+
3112
+ if next_right_cell in [
3113
+ TableToken.OTSL_LCEL.value,
3114
+ TableToken.OTSL_XCEL.value,
3115
+ ]:
3116
+ # we have horisontal spanning cell or 2d spanning cell
3117
+ col_span += count_right(
3118
+ split_row_tokens,
3119
+ c_idx + 1,
3120
+ r_idx,
3121
+ [TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
3122
+ )
3123
+ if next_bottom_cell in [
3124
+ TableToken.OTSL_UCEL.value,
3125
+ TableToken.OTSL_XCEL.value,
3126
+ ]:
3127
+ # we have a vertical spanning cell or 2d spanning cell
3128
+ row_span += count_down(
3129
+ split_row_tokens,
3130
+ c_idx,
3131
+ r_idx + 1,
3132
+ [TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
3133
+ )
3134
+
3135
+ table_cells.append(
3136
+ TableCell(
3137
+ text=cell_text.strip(),
3138
+ row_span=row_span,
3139
+ col_span=col_span,
3140
+ start_row_offset_idx=r_idx,
3141
+ end_row_offset_idx=r_idx + row_span,
3142
+ start_col_offset_idx=c_idx,
3143
+ end_col_offset_idx=c_idx + col_span,
3144
+ )
3145
+ )
3146
+ if text in [
3147
+ TableToken.OTSL_FCEL.value,
3148
+ TableToken.OTSL_ECEL.value,
3149
+ TableToken.OTSL_CHED.value,
3150
+ TableToken.OTSL_RHED.value,
3151
+ TableToken.OTSL_SROW.value,
3152
+ TableToken.OTSL_LCEL.value,
3153
+ TableToken.OTSL_UCEL.value,
3154
+ TableToken.OTSL_XCEL.value,
3155
+ ]:
3156
+ c_idx += 1
3157
+ if text == TableToken.OTSL_NL.value:
3158
+ r_idx += 1
3159
+ c_idx = 0
3160
+ return table_cells, split_row_tokens
3161
+
3162
+ def otsl_extract_tokens_and_text(s: str):
3163
+ # Pattern to match anything enclosed by < >
3164
+ # (including the angle brackets themselves)
3165
+ pattern = r"(<[^>]+>)"
3166
+ # Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
3167
+ tokens = re.findall(pattern, s)
3168
+ # Remove any tokens that start with "<loc_"
3169
+ tokens = [
3170
+ token
3171
+ for token in tokens
3172
+ if not (
3173
+ token.startswith(rf"<{DocumentToken.LOC.value}")
3174
+ or token
3175
+ in [
3176
+ rf"<{DocumentToken.OTSL.value}>",
3177
+ rf"</{DocumentToken.OTSL.value}>",
3178
+ ]
3179
+ )
3180
+ ]
3181
+ # Split the string by those tokens to get the in-between text
3182
+ text_parts = re.split(pattern, s)
3183
+ text_parts = [
3184
+ token
3185
+ for token in text_parts
3186
+ if not (
3187
+ token.startswith(rf"<{DocumentToken.LOC.value}")
3188
+ or token
3189
+ in [
3190
+ rf"<{DocumentToken.OTSL.value}>",
3191
+ rf"</{DocumentToken.OTSL.value}>",
3192
+ ]
3193
+ )
3194
+ ]
3195
+ # Remove any empty or purely whitespace strings from text_parts
3196
+ text_parts = [part for part in text_parts if part.strip()]
3197
+
3198
+ return tokens, text_parts
3199
+
3200
+ def parse_table_content(otsl_content: str) -> TableData:
3201
+ tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
3202
+ table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
3203
+
3204
+ return TableData(
3205
+ num_rows=len(split_row_tokens),
3206
+ num_cols=(
3207
+ max(len(row) for row in split_row_tokens) if split_row_tokens else 0
3208
+ ),
3209
+ table_cells=table_cells,
3210
+ )
3211
+
3212
+ # doc = DoclingDocument(name="Document")
3213
+ for pg_idx, doctag_page in enumerate(doctag_document.pages):
3214
+ page_doctags = doctag_page.tokens
3215
+ image = doctag_page.image
3216
+
3217
+ page_no = pg_idx + 1
3218
+ # bounding_boxes = []
3219
+
3220
+ if image is not None:
3221
+ pg_width = image.width
3222
+ pg_height = image.height
3223
+ else:
3224
+ pg_width = 1
3225
+ pg_height = 1
3226
+
3227
+ """
3228
+ 1. Finds all <tag>...</tag>
3229
+ blocks in the entire string (multi-line friendly)
3230
+ in the order they appear.
3231
+ 2. For each chunk, extracts bounding box (if any) and inner text.
3232
+ 3. Adds the item to a DoclingDocument structure with the right label.
3233
+ 4. Tracks bounding boxes+color in a separate list for later visualization.
3234
+ """
3235
+
3236
+ # Regex for root level recognized tags
3237
+ tag_pattern = (
3238
+ rf"<(?P<tag>{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|"
3239
+ rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|"
3240
+ rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|"
3241
+ rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
3242
+ rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
3243
+ rf"{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
3244
+ rf"{DocItemLabel.SECTION_HEADER}_level_1|"
3245
+ rf"{DocumentToken.ORDERED_LIST.value}|"
3246
+ rf"{DocumentToken.UNORDERED_LIST.value}|"
3247
+ rf"{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
3248
+ )
3249
+
3250
+ # DocumentToken.OTSL
3251
+ pattern = re.compile(tag_pattern, re.DOTALL)
3252
+
3253
+ # Go through each match in order
3254
+ for match in pattern.finditer(page_doctags):
3255
+ full_chunk = match.group(0)
3256
+ tag_name = match.group("tag")
3257
+
3258
+ bbox = extract_bounding_box(full_chunk) if image else None
3259
+ doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
3260
+
3261
+ if tag_name == DocumentToken.OTSL.value:
3262
+ table_data = parse_table_content(full_chunk)
3263
+ bbox = extract_bounding_box(full_chunk) if image else None
3264
+
3265
+ if bbox:
3266
+ prov = ProvenanceItem(
3267
+ bbox=bbox.resize_by_scale(pg_width, pg_height),
3268
+ charspan=(0, 0),
3269
+ page_no=page_no,
3270
+ )
3271
+ self.add_table(data=table_data, prov=prov)
3272
+ else:
3273
+ self.add_table(data=table_data)
3274
+
3275
+ elif tag_name == DocItemLabel.PICTURE:
3276
+ text_caption_content = extract_inner_text(full_chunk)
3277
+ if image:
3278
+ if bbox:
3279
+ im_width, im_height = image.size
3280
+
3281
+ crop_box = (
3282
+ int(bbox.l * im_width),
3283
+ int(bbox.t * im_height),
3284
+ int(bbox.r * im_width),
3285
+ int(bbox.b * im_height),
3286
+ )
3287
+ cropped_image = image.crop(crop_box)
3288
+ pic = self.add_picture(
3289
+ parent=None,
3290
+ image=ImageRef.from_pil(image=cropped_image, dpi=72),
3291
+ prov=(
3292
+ ProvenanceItem(
3293
+ bbox=bbox.resize_by_scale(pg_width, pg_height),
3294
+ charspan=(0, 0),
3295
+ page_no=page_no,
3296
+ )
3297
+ ),
3298
+ )
3299
+ # If there is a caption to an image, add it as well
3300
+ if len(text_caption_content) > 0:
3301
+ caption_item = self.add_text(
3302
+ label=DocItemLabel.CAPTION,
3303
+ text=text_caption_content,
3304
+ parent=None,
3305
+ )
3306
+ pic.captions.append(caption_item.get_ref())
3307
+ else:
3308
+ if bbox:
3309
+ # In case we don't have access to an binary of an image
3310
+ self.add_picture(
3311
+ parent=None,
3312
+ prov=ProvenanceItem(
3313
+ bbox=bbox, charspan=(0, 0), page_no=page_no
3314
+ ),
3315
+ )
3316
+ # If there is a caption to an image, add it as well
3317
+ if len(text_caption_content) > 0:
3318
+ caption_item = self.add_text(
3319
+ label=DocItemLabel.CAPTION,
3320
+ text=text_caption_content,
3321
+ parent=None,
3322
+ )
3323
+ pic.captions.append(caption_item.get_ref())
3324
+ elif tag_name in [
3325
+ DocumentToken.ORDERED_LIST.value,
3326
+ DocumentToken.UNORDERED_LIST.value,
3327
+ ]:
3328
+ list_label = GroupLabel.LIST
3329
+ enum_marker = ""
3330
+ enum_value = 0
3331
+ if tag_name == DocumentToken.ORDERED_LIST.value:
3332
+ list_label = GroupLabel.ORDERED_LIST
3333
+
3334
+ list_item_pattern = (
3335
+ rf"<(?P<tag>{DocItemLabel.LIST_ITEM})>.*?</(?P=tag)>"
3336
+ )
3337
+ li_pattern = re.compile(list_item_pattern, re.DOTALL)
3338
+ # Add list group:
3339
+ new_list = self.add_group(label=list_label, name="list")
3340
+ # Pricess list items
3341
+ for li_match in li_pattern.finditer(full_chunk):
3342
+ enum_value += 1
3343
+ if tag_name == DocumentToken.ORDERED_LIST.value:
3344
+ enum_marker = str(enum_value) + "."
3345
+
3346
+ li_full_chunk = li_match.group(0)
3347
+ li_bbox = extract_bounding_box(li_full_chunk) if image else None
3348
+ text_content = extract_inner_text(li_full_chunk)
3349
+ # Add list item
3350
+ self.add_list_item(
3351
+ marker=enum_marker,
3352
+ enumerated=(tag_name == DocumentToken.ORDERED_LIST.value),
3353
+ parent=new_list,
3354
+ text=text_content,
3355
+ prov=(
3356
+ ProvenanceItem(
3357
+ bbox=li_bbox.resize_by_scale(pg_width, pg_height),
3358
+ charspan=(0, len(text_content)),
3359
+ page_no=page_no,
3360
+ )
3361
+ if li_bbox
3362
+ else None
3363
+ ),
3364
+ )
3365
+ else:
3366
+ # For everything else, treat as text
3367
+ text_content = extract_inner_text(full_chunk)
3368
+ self.add_text(
3369
+ label=doc_label,
3370
+ text=text_content,
3371
+ prov=(
3372
+ ProvenanceItem(
3373
+ bbox=bbox.resize_by_scale(pg_width, pg_height),
3374
+ charspan=(0, len(text_content)),
3375
+ page_no=page_no,
3376
+ )
3377
+ if bbox
3378
+ else None
3379
+ ),
3380
+ )
3381
+ return self
3382
+
3383
+ @deprecated("Use save_as_doctags instead.")
3384
+ def save_as_document_tokens(self, *args, **kwargs):
3385
+ r"""Save the document content to a DocumentToken format."""
3386
+ return self.save_as_doctags(*args, **kwargs)
3387
+
3388
+ def save_as_doctags(
2940
3389
  self,
2941
3390
  filename: Path,
2942
3391
  delim: str = "",
@@ -2952,7 +3401,7 @@ class DoclingDocument(BaseModel):
2952
3401
  add_table_cell_location: bool = False,
2953
3402
  add_table_cell_text: bool = True,
2954
3403
  ):
2955
- r"""Save the document content to a DocumentToken format."""
3404
+ r"""Save the document content to DocTags format."""
2956
3405
  out = self.export_to_document_tokens(
2957
3406
  delim=delim,
2958
3407
  from_element=from_element,