PyPI - docling-core - Versions diffs - 2.21.2__py3-none-any.whl → 2.23.0__py3-none-any.whl - Mend - Supply Chain Defender

docling-core 2.21.2py3-none-any.whl → 2.23.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling-core might be problematic. Click here for more details.

Files changed (13) hide show

docling_core/types/doc/document.py CHANGED Viewed

@@ -4,13 +4,13 @@ import base64
 import copy
 import hashlib
 import html
+import itertools
 import json
 import logging
 import mimetypes
 import os
 import re
 import sys
-import textwrap
 import typing
 import warnings
 from enum import Enum
@@ -37,7 +37,7 @@ from pydantic import (
     model_validator,
 )
 from tabulate import tabulate
-from typing_extensions import Annotated, Self
+from typing_extensions import Annotated, Self, deprecated
 from docling_core.search.package import VERSION_PATTERN
 from docling_core.types.base import _JSON_POINTER_REGEX
@@ -61,7 +61,7 @@ _logger = logging.getLogger(__name__)
 Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
 LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
-CURRENT_VERSION: Final = "1.2.0"
+CURRENT_VERSION: Final = "1.3.0"
 DEFAULT_EXPORT_LABELS = {
     DocItemLabel.TITLE,
@@ -86,6 +86,8 @@ DOCUMENT_TOKENS_EXPORT_LABELS.update(
     [
         DocItemLabel.FOOTNOTE,
         DocItemLabel.CAPTION,
+        DocItemLabel.KEY_VALUE_REGION,
+        DocItemLabel.FORM,
     ]
 )
@@ -522,6 +524,49 @@ class ImageRef(BaseModel):
         )
+class DocTagsPage(BaseModel):
+    """DocTagsPage."""
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    tokens: str
+    image: Optional[PILImage.Image] = None
+class DocTagsDocument(BaseModel):
+    """DocTagsDocument."""
+    pages: List[DocTagsPage] = []
+    @classmethod
+    def from_doctags_and_image_pairs(
+        cls, doctags: List[Union[Path, str]], images: List[Union[Path, PILImage.Image]]
+    ):
+        """from_doctags_and_image_pairs."""
+        if len(doctags) != len(images):
+            raise ValueError("Number of page doctags must be equal to page images!")
+        doctags_doc = cls()
+        pages = []
+        for dt, img in zip(doctags, images):
+            if isinstance(dt, Path):
+                with dt.open("r") as fp:
+                    dt = fp.read()
+            elif isinstance(dt, str):
+                pass
+            if isinstance(img, Path):
+                img = PILImage.open(img)
+            elif isinstance(dt, PILImage.Image):
+                pass
+            page = DocTagsPage(tokens=dt, image=img)
+            pages.append(page)
+        doctags_doc.pages = pages
+        return doctags_doc
 class ProvenanceItem(BaseModel):
     """ProvenanceItem."""
@@ -563,9 +608,30 @@ class GroupItem(NodeItem):  # Container type, can't be a leaf node
         "group"  # Name of the group, e.g. "Introduction Chapter",
         # "Slide 5", "Navigation menu list", ...
     )
+    # TODO narrow down to allowed values, i.e. excluding those used for subtypes
     label: GroupLabel = GroupLabel.UNSPECIFIED
+class UnorderedList(GroupItem):
+    """UnorderedList."""
+    label: typing.Literal[GroupLabel.LIST] = GroupLabel.LIST  # type: ignore[assignment]
+class OrderedList(GroupItem):
+    """OrderedList."""
+    label: typing.Literal[GroupLabel.ORDERED_LIST] = (
+        GroupLabel.ORDERED_LIST  # type: ignore[assignment]
+    )
+class InlineGroup(GroupItem):
+    """InlineGroup."""
+    label: typing.Literal[GroupLabel.INLINE] = GroupLabel.INLINE
 class DocItem(
     NodeItem
 ):  # Base type for any element that carries content, can be a leaf node
@@ -626,6 +692,15 @@ class DocItem(
         return page_image.crop(crop_bbox.as_tuple())
+class Formatting(BaseModel):
+    """Formatting."""
+    bold: bool = False
+    italic: bool = False
+    underline: bool = False
+    strikethrough: bool = False
 class TextItem(DocItem):
     """TextItem."""
@@ -634,18 +709,19 @@ class TextItem(DocItem):
         DocItemLabel.CHECKBOX_SELECTED,
         DocItemLabel.CHECKBOX_UNSELECTED,
         DocItemLabel.FOOTNOTE,
-        DocItemLabel.FORMULA,
         DocItemLabel.PAGE_FOOTER,
         DocItemLabel.PAGE_HEADER,
         DocItemLabel.PARAGRAPH,
         DocItemLabel.REFERENCE,
         DocItemLabel.TEXT,
-        DocItemLabel.TITLE,
     ]
     orig: str  # untreated representation
     text: str  # sanitized representation
+    formatting: Optional[Formatting] = None
+    hyperlink: Optional[Union[AnyUrl, Path]] = None
     def export_to_document_tokens(
         self,
         doc: "DoclingDocument",
@@ -683,6 +759,14 @@ class TextItem(DocItem):
         return body
+class TitleItem(TextItem):
+    """TitleItem."""
+    label: typing.Literal[DocItemLabel.TITLE] = (
+        DocItemLabel.TITLE  # type: ignore[assignment]
+    )
 class SectionHeaderItem(TextItem):
     """SectionItem."""
@@ -818,6 +902,14 @@ class CodeItem(FloatingItem, TextItem):
         return body
+class FormulaItem(TextItem):
+    """FormulaItem."""
+    label: typing.Literal[DocItemLabel.FORMULA] = (
+        DocItemLabel.FORMULA  # type: ignore[assignment]
+    )
 class PictureItem(FloatingItem):
     """PictureItem."""
@@ -856,54 +948,34 @@ class PictureItem(FloatingItem):
     def export_to_markdown(
         self,
         doc: "DoclingDocument",
-        add_caption: bool = True,
+        add_caption: bool = True,  # deprecated
         image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
         image_placeholder: str = "<!-- image -->",
     ) -> str:
         """Export picture to Markdown format."""
-        default_response = image_placeholder
-        error_response = (
-            "<!-- 🖼️❌ Image not available. "
-            "Please use `PdfPipelineOptions(generate_picture_images=True)`"
-            " -->"
-        )
+        from docling_core.experimental.serializer.markdown import MarkdownDocSerializer
-        if image_mode == ImageRefMode.PLACEHOLDER:
-            return default_response
-        elif image_mode == ImageRefMode.EMBEDDED:
-            # short-cut: we already have the image in base64
-            if (
-                isinstance(self.image, ImageRef)
-                and isinstance(self.image.uri, AnyUrl)
-                and self.image.uri.scheme == "data"
-            ):
-                text = f"![Image]({self.image.uri})"
-                return text
-            # get the self.image._pil or crop it out of the page-image
-            img = self.get_image(doc)
-            if img is not None:
-                imgb64 = self._image_to_base64(img)
-                text = f"![Image](data:image/png;base64,{imgb64})"
-                return text
-            else:
-                return error_response
-        elif image_mode == ImageRefMode.REFERENCED:
-            if not isinstance(self.image, ImageRef) or (
-                isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "data"
-            ):
-                return default_response
-            text = f"![Image]({quote(str(self.image.uri))})"
-            return text
+        if not add_caption:
+            _logger.warning(
+                "Argument `add_caption` is deprecated and will be ignored.",
+            )
-        else:
-            return default_response
+        serializer = MarkdownDocSerializer(
+            doc=self,
+            image_mode=image_mode,
+        )
+        text = (
+            serializer.picture_serializer.serialize(
+                item=self,
+                doc_serializer=serializer,
+                doc=doc,
+                image_mode=image_mode,
+                image_placeholder=image_placeholder,
+            ).text
+            if serializer.picture_serializer
+            else ""
+        )
+        return text
     def export_to_html(
         self,
@@ -1003,6 +1075,20 @@ class PictureItem(FloatingItem):
             predicted_class = classifications[0].predicted_classes[0].class_name
             body += DocumentToken.get_picture_classification_token(predicted_class)
+        smiles_annotations = [
+            ann for ann in self.annotations if isinstance(ann, PictureMoleculeData)
+        ]
+        if len(smiles_annotations) > 0:
+            body += (
+                "<"
+                + DocumentToken.SMILES.value
+                + ">"
+                + smiles_annotations[0].smi
+                + "</"
+                + DocumentToken.SMILES.value
+                + ">"
+            )
         if add_caption and len(self.captions):
             text = self.caption_text(doc)
@@ -1078,33 +1164,58 @@ class TableItem(FloatingItem):
         return df
-    def export_to_markdown(self) -> str:
+    def export_to_markdown(self, doc: Optional["DoclingDocument"] = None) -> str:
         """Export the table as markdown."""
-        table = []
-        for row in self.data.grid:
-            tmp = []
-            for col in row:
+        if doc is not None:
+            from docling_core.experimental.serializer.markdown import (
+                MarkdownDocSerializer,
+            )
-                # make sure that md tables are not broken
-                # due to newline chars in the text
-                text = col.text
-                text = text.replace("\n", " ")
-                tmp.append(text)
+            serializer = MarkdownDocSerializer(
+                doc=doc,
+            )
+            text = (
+                serializer.table_serializer.serialize(
+                    item=self,
+                    doc_serializer=serializer,
+                    doc=doc,
+                ).text
+                if serializer.table_serializer
+                else ""
+            )
+            return text
+        else:
+            _logger.warning(
+                "Usage of TableItem.export_to_markdown() without `doc` argument is "
+                "deprecated.",
+            )
-            table.append(tmp)
+            table = []
+            for row in self.data.grid:
+                tmp = []
+                for col in row:
+                    # make sure that md tables are not broken
+                    # due to newline chars in the text
+                    text = col.text
+                    text = text.replace("\n", " ")
+                    tmp.append(text)
+                table.append(tmp)
+            res = ""
+            if len(table) > 1 and len(table[0]) > 0:
+                try:
+                    res = tabulate(table[1:], headers=table[0], tablefmt="github")
+                except ValueError:
+                    res = tabulate(
+                        table[1:],
+                        headers=table[0],
+                        tablefmt="github",
+                        disable_numparse=True,
+                    )
-        md_table = ""
-        if len(table) > 1 and len(table[0]) > 0:
-            try:
-                md_table = tabulate(table[1:], headers=table[0], tablefmt="github")
-            except ValueError:
-                md_table = tabulate(
-                    table[1:],
-                    headers=table[0],
-                    tablefmt="github",
-                    disable_numparse=True,
-                )
-        return md_table
+        return res
     def export_to_html(
         self,
@@ -1397,10 +1508,6 @@ class KeyValueItem(FloatingItem):
     graph: GraphData
-    def _export_to_markdown(self) -> str:
-        # TODO add actual implementation
-        return "<!-- missing-key-value-item -->"
 class FormItem(FloatingItem):
     """FormItem."""
@@ -1409,17 +1516,15 @@ class FormItem(FloatingItem):
     graph: GraphData
-    def _export_to_markdown(self) -> str:
-        # TODO add actual implementation
-        return "<!-- missing-form-item -->"
 ContentItem = Annotated[
     Union[
         TextItem,
+        TitleItem,
         SectionHeaderItem,
         ListItem,
         CodeItem,
+        FormulaItem,
         PictureItem,
         TableItem,
         KeyValueItem,
@@ -1530,8 +1635,10 @@ class DoclingDocument(BaseModel):
     )  # List[RefItem] = []
     body: GroupItem = GroupItem(name="_root_", self_ref="#/body")  # List[RefItem] = []
-    groups: List[GroupItem] = []
-    texts: List[Union[SectionHeaderItem, ListItem, TextItem, CodeItem]] = []
+    groups: List[Union[OrderedList, UnorderedList, InlineGroup, GroupItem]] = []
+    texts: List[
+        Union[TitleItem, SectionHeaderItem, ListItem, CodeItem, FormulaItem, TextItem]
+    ] = []
     pictures: List[PictureItem] = []
     tables: List[TableItem] = []
     key_value_items: List[KeyValueItem] = []
@@ -1555,6 +1662,68 @@ class DoclingDocument(BaseModel):
                     item["content_layer"] = "furniture"
         return data
+    ###################################
+    # TODO: refactor add* methods below
+    ###################################
+    def add_ordered_list(
+        self,
+        name: Optional[str] = None,
+        parent: Optional[NodeItem] = None,
+        content_layer: Optional[ContentLayer] = None,
+    ) -> GroupItem:
+        """add_ordered_list."""
+        _parent = parent or self.body
+        cref = f"#/groups/{len(self.groups)}"
+        group = OrderedList(self_ref=cref, parent=_parent.get_ref())
+        if name is not None:
+            group.name = name
+        if content_layer:
+            group.content_layer = content_layer
+        self.groups.append(group)
+        _parent.children.append(RefItem(cref=cref))
+        return group
+    def add_unordered_list(
+        self,
+        name: Optional[str] = None,
+        parent: Optional[NodeItem] = None,
+        content_layer: Optional[ContentLayer] = None,
+    ) -> GroupItem:
+        """add_unordered_list."""
+        _parent = parent or self.body
+        cref = f"#/groups/{len(self.groups)}"
+        group = UnorderedList(self_ref=cref, parent=_parent.get_ref())
+        if name is not None:
+            group.name = name
+        if content_layer:
+            group.content_layer = content_layer
+        self.groups.append(group)
+        _parent.children.append(RefItem(cref=cref))
+        return group
+    def add_inline_group(
+        self,
+        name: Optional[str] = None,
+        parent: Optional[NodeItem] = None,
+        content_layer: Optional[ContentLayer] = None,
+        # marker: Optional[UnorderedList.ULMarker] = None,
+    ) -> GroupItem:
+        """add_inline_group."""
+        _parent = parent or self.body
+        cref = f"#/groups/{len(self.groups)}"
+        group = InlineGroup(self_ref=cref, parent=_parent.get_ref())
+        if name is not None:
+            group.name = name
+        if content_layer:
+            group.content_layer = content_layer
+        self.groups.append(group)
+        _parent.children.append(RefItem(cref=cref))
+        return group
     def add_group(
         self,
         label: Optional[GroupLabel] = None,
@@ -1569,6 +1738,25 @@ class DoclingDocument(BaseModel):
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
+        if label == GroupLabel.LIST:
+            return self.add_unordered_list(
+                name=name,
+                parent=parent,
+                content_layer=content_layer,
+            )
+        elif label == GroupLabel.ORDERED_LIST:
+            return self.add_ordered_list(
+                name=name,
+                parent=parent,
+                content_layer=content_layer,
+            )
+        elif label == GroupLabel.INLINE:
+            return self.add_inline_group(
+                name=name,
+                parent=parent,
+                content_layer=content_layer,
+            )
         if not parent:
             parent = self.body
@@ -1597,6 +1785,8 @@ class DoclingDocument(BaseModel):
         prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
+        formatting: Optional[Formatting] = None,
+        hyperlink: Optional[Union[AnyUrl, Path]] = None,
     ):
         """add_list_item.
@@ -1624,6 +1814,8 @@ class DoclingDocument(BaseModel):
             parent=parent.get_ref(),
             enumerated=enumerated,
             marker=marker,
+            formatting=formatting,
+            hyperlink=hyperlink,
         )
         if prov:
             list_item.prov.append(prov)
@@ -1643,6 +1835,8 @@ class DoclingDocument(BaseModel):
         prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
+        formatting: Optional[Formatting] = None,
+        hyperlink: Optional[Union[AnyUrl, Path]] = None,
     ):
         """add_text.
@@ -1662,6 +1856,8 @@ class DoclingDocument(BaseModel):
                 prov=prov,
                 parent=parent,
                 content_layer=content_layer,
+                formatting=formatting,
+                hyperlink=hyperlink,
             )
         elif label in [DocItemLabel.LIST_ITEM]:
@@ -1671,15 +1867,31 @@ class DoclingDocument(BaseModel):
                 prov=prov,
                 parent=parent,
                 content_layer=content_layer,
+                formatting=formatting,
+                hyperlink=hyperlink,
+            )
+        elif label in [DocItemLabel.TITLE]:
+            return self.add_title(
+                text=text,
+                orig=orig,
+                prov=prov,
+                parent=parent,
+                content_layer=content_layer,
+                formatting=formatting,
+                hyperlink=hyperlink,
             )
         elif label in [DocItemLabel.SECTION_HEADER]:
             return self.add_heading(
                 text=text,
                 orig=orig,
+                # NOTE: we do not / cannot pass the level here, lossy path..
                 prov=prov,
                 parent=parent,
                 content_layer=content_layer,
+                formatting=formatting,
+                hyperlink=hyperlink,
             )
         elif label in [DocItemLabel.CODE]:
@@ -1689,6 +1901,18 @@ class DoclingDocument(BaseModel):
                 prov=prov,
                 parent=parent,
                 content_layer=content_layer,
+                formatting=formatting,
+                hyperlink=hyperlink,
+            )
+        elif label in [DocItemLabel.FORMULA]:
+            return self.add_formula(
+                text=text,
+                orig=orig,
+                prov=prov,
+                parent=parent,
+                content_layer=content_layer,
+                formatting=formatting,
+                hyperlink=hyperlink,
             )
         else:
@@ -1707,6 +1931,8 @@ class DoclingDocument(BaseModel):
                 orig=orig,
                 self_ref=cref,
                 parent=parent.get_ref(),
+                formatting=formatting,
+                hyperlink=hyperlink,
             )
             if prov:
                 text_item.prov.append(prov)
@@ -1808,11 +2034,14 @@ class DoclingDocument(BaseModel):
         prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
+        formatting: Optional[Formatting] = None,
+        hyperlink: Optional[Union[AnyUrl, Path]] = None,
     ):
         """add_title.
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
+        :param level: LevelNumber:  (Default value = 1)
         :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
@@ -1824,22 +2053,23 @@ class DoclingDocument(BaseModel):
         text_index = len(self.texts)
         cref = f"#/texts/{text_index}"
-        text_item = TextItem(
-            label=DocItemLabel.TITLE,
+        item = TitleItem(
             text=text,
             orig=orig,
             self_ref=cref,
             parent=parent.get_ref(),
+            formatting=formatting,
+            hyperlink=hyperlink,
         )
         if prov:
-            text_item.prov.append(prov)
+            item.prov.append(prov)
         if content_layer:
-            text_item.content_layer = content_layer
+            item.content_layer = content_layer
-        self.texts.append(text_item)
+        self.texts.append(item)
         parent.children.append(RefItem(cref=cref))
-        return text_item
+        return item
     def add_code(
         self,
@@ -1850,6 +2080,8 @@ class DoclingDocument(BaseModel):
         prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
+        formatting: Optional[Formatting] = None,
+        hyperlink: Optional[Union[AnyUrl, Path]] = None,
     ):
         """add_code.
@@ -1874,6 +2106,8 @@ class DoclingDocument(BaseModel):
             orig=orig,
             self_ref=cref,
             parent=parent.get_ref(),
+            formatting=formatting,
+            hyperlink=hyperlink,
         )
         if code_language:
             code_item.code_language = code_language
@@ -1889,6 +2123,50 @@ class DoclingDocument(BaseModel):
         return code_item
+    def add_formula(
+        self,
+        text: str,
+        orig: Optional[str] = None,
+        prov: Optional[ProvenanceItem] = None,
+        parent: Optional[NodeItem] = None,
+        content_layer: Optional[ContentLayer] = None,
+        formatting: Optional[Formatting] = None,
+        hyperlink: Optional[Union[AnyUrl, Path]] = None,
+    ):
+        """add_formula.
+        :param text: str:
+        :param orig: Optional[str]:  (Default value = None)
+        :param level: LevelNumber:  (Default value = 1)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param parent: Optional[NodeItem]:  (Default value = None)
+        """
+        if not parent:
+            parent = self.body
+        if not orig:
+            orig = text
+        text_index = len(self.texts)
+        cref = f"#/texts/{text_index}"
+        section_header_item = FormulaItem(
+            text=text,
+            orig=orig,
+            self_ref=cref,
+            parent=parent.get_ref(),
+            formatting=formatting,
+            hyperlink=hyperlink,
+        )
+        if prov:
+            section_header_item.prov.append(prov)
+        if content_layer:
+            section_header_item.content_layer = content_layer
+        self.texts.append(section_header_item)
+        parent.children.append(RefItem(cref=cref))
+        return section_header_item
     def add_heading(
         self,
         text: str,
@@ -1897,6 +2175,8 @@ class DoclingDocument(BaseModel):
         prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
+        formatting: Optional[Formatting] = None,
+        hyperlink: Optional[Union[AnyUrl, Path]] = None,
     ):
         """add_heading.
@@ -1921,6 +2201,8 @@ class DoclingDocument(BaseModel):
             orig=orig,
             self_ref=cref,
             parent=parent.get_ref(),
+            formatting=formatting,
+            hyperlink=hyperlink,
         )
         if prov:
             section_header_item.prov.append(prov)
@@ -2276,10 +2558,10 @@ class DoclingDocument(BaseModel):
         self,
         filename: Path,
         artifacts_dir: Optional[Path] = None,
-        delim: str = "\n\n",  # TODO: deprecate
+        delim: str = "\n\n",
         from_element: int = 0,
         to_element: int = sys.maxsize,
-        labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
+        labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
         strict_text: bool = False,
         escaping_underscores: bool = True,
         image_placeholder: str = "<!-- image -->",
@@ -2319,10 +2601,10 @@ class DoclingDocument(BaseModel):
     def export_to_markdown(  # noqa: C901
         self,
-        delim: str = "\n\n",  # TODO deprecate
+        delim: str = "\n\n",
         from_element: int = 0,
         to_element: int = sys.maxsize,
-        labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
+        labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
         strict_text: bool = False,
         escaping_underscores: bool = True,
         image_placeholder: str = "<!-- image -->",
@@ -2337,9 +2619,8 @@ class DoclingDocument(BaseModel):
         Operates on a slice of the document's body as defined through arguments
         from_element and to_element; defaulting to the whole document.
-        :param delim: Delimiter to use when concatenating the various
-                Markdown parts. (Default value = "\n").
-        :type delim: str = "\n"
+        :param delim: Deprecated.
+        :type delim: str = "\n\n"
         :param from_element: Body slicing start index (inclusive).
                 (Default value = 0).
         :type from_element: int = 0
@@ -2347,9 +2628,8 @@ class DoclingDocument(BaseModel):
                 (exclusive). (Default value = maxint).
         :type to_element: int = sys.maxsize
         :param labels: The set of document labels to include in the export.
-        :type labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS
-        :param strict_text: bool: Whether to only include the text content
-            of the document. (Default value = False).
+        :type labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS
+        :param strict_text: Deprecated.
         :type strict_text: bool = False
         :param escaping_underscores: bool: Whether to escape underscores in the
             text content of the document. (Default value = True).
@@ -2366,250 +2646,48 @@ class DoclingDocument(BaseModel):
         :returns: The exported Markdown representation.
         :rtype: str
         """
-        comps = self._get_markdown_components(
-            node=self.body,
-            from_element=from_element,
-            to_element=to_element,
-            labels=labels,
-            strict_text=strict_text,
-            escaping_underscores=escaping_underscores,
+        from docling_core.experimental.serializer.markdown import (
+            MarkdownDocSerializer,
+            MarkdownListSerializer,
+            MarkdownTextSerializer,
+        )
+        serializer = MarkdownDocSerializer(
+            doc=self,
+            start=from_element,
+            stop=to_element,
             image_placeholder=image_placeholder,
             image_mode=image_mode,
-            indent=indent,
-            text_width=text_width,
-            page_no=page_no,
-            included_content_layers=included_content_layers,
-            list_level=0,
-            is_inline_scope=False,
-            visited=set(),
+            labels=labels,
+            layers=included_content_layers,
+            pages={page_no} if page_no is not None else None,
+            escaping_underscores=escaping_underscores,
+            text_serializer=MarkdownTextSerializer(
+                wrap_width=text_width if text_width > 0 else None,
+            ),
+            list_serializer=MarkdownListSerializer(
+                indent=indent,
+            ),
         )
-        return delim.join(comps)
-    def _get_markdown_components(  # noqa: C901
-        self,
-        node: NodeItem,
-        from_element: int,
-        to_element: int,
-        labels: set[DocItemLabel],
-        strict_text: bool,
-        escaping_underscores: bool,
-        image_placeholder: str,
-        image_mode: ImageRefMode,
-        indent: int,
-        text_width: int,
-        page_no: Optional[int],
-        included_content_layers: set[ContentLayer],
-        list_level: int,
-        is_inline_scope: bool,
-        visited: set[str],  # refs of visited items
-    ) -> list[str]:
-        components: list[str] = []  # components to concatenate
-        # Our export markdown doesn't contain any emphasis styling:
-        # Bold, Italic, or Bold-Italic
-        # Hence, any underscore that we print into Markdown is coming from document text
-        # That means we need to escape it, to properly reflect content in the markdown
-        # However, we need to preserve underscores in image URLs
-        # to maintain their validity
-        # For example: ![image](path/to_image.png) should remain unchanged
-        def _escape_underscores(text):
-            """Escape underscores but leave them intact in the URL.."""
-            # Firstly, identify all the URL patterns.
-            url_pattern = r"!\[.*?\]\((.*?)\)"
-            # Matches both inline ($...$) and block ($$...$$) LaTeX equations:
-            latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
-            combined_pattern = f"({url_pattern})|({latex_pattern})"
-            parts = []
-            last_end = 0
-            for match in re.finditer(combined_pattern, text):
-                # Text to add before the URL (needs to be escaped)
-                before_url = text[last_end : match.start()]
-                parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))
-                # Add the full URL part (do not escape)
-                parts.append(match.group(0))
-                last_end = match.end()
-            # Add the final part of the text (which needs to be escaped)
-            if last_end < len(text):
-                parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))
-            return "".join(parts)
-        def _ingest_text(text: str, do_escape_html=True, do_escape_underscores=True):
-            if do_escape_underscores and escaping_underscores:
-                text = _escape_underscores(text)
-            if do_escape_html:
-                text = html.escape(text, quote=False)
-            if text:
-                components.append(text)
+        ser_res = serializer.serialize()
-        for ix, (item, level) in enumerate(
-            self.iterate_items(
-                node,
-                with_groups=True,
-                page_no=page_no,
-                included_content_layers=included_content_layers,
+        if delim != "\n\n":
+            _logger.warning(
+                "Parameter `delim` has been deprecated and will be ignored.",
+            )
+        if strict_text:
+            _logger.warning(
+                "Parameter `strict_text` has been deprecated and will be ignored.",
             )
-        ):
-            if item.self_ref in visited:
-                continue
-            else:
-                visited.add(item.self_ref)
-            if ix < from_element or to_element <= ix:
-                continue  # skip as many items as you want
-            elif (isinstance(item, DocItem)) and (item.label not in labels):
-                continue  # skip any label that is not whitelisted
-            elif isinstance(item, GroupItem):
-                if item.label in [
-                    GroupLabel.LIST,
-                    GroupLabel.ORDERED_LIST,
-                ]:
-                    comps = self._get_markdown_components(
-                        node=item,
-                        from_element=from_element,
-                        to_element=to_element,
-                        labels=labels,
-                        strict_text=strict_text,
-                        escaping_underscores=escaping_underscores,
-                        image_placeholder=image_placeholder,
-                        image_mode=image_mode,
-                        indent=indent,
-                        text_width=text_width,
-                        page_no=page_no,
-                        included_content_layers=included_content_layers,
-                        list_level=list_level + 1,
-                        is_inline_scope=is_inline_scope,
-                        visited=visited,
-                    )
-                    indent_str = list_level * indent * " "
-                    is_ol = item.label == GroupLabel.ORDERED_LIST
-                    text = "\n".join(
-                        [
-                            # avoid additional marker on already evaled sublists
-                            (
-                                c
-                                if c and c[0] == " "
-                                else f"{indent_str}{f'{i + 1}.' if is_ol else '-'} {c}"
-                            )
-                            for i, c in enumerate(comps)
-                        ]
-                    )
-                    _ingest_text(
-                        text=text,
-                        # special chars have already been escaped as needed
-                        do_escape_html=False,
-                        do_escape_underscores=False,
-                    )
-                elif item.label == GroupLabel.INLINE:
-                    comps = self._get_markdown_components(
-                        node=item,
-                        from_element=from_element,
-                        to_element=to_element,
-                        labels=labels,
-                        strict_text=strict_text,
-                        escaping_underscores=escaping_underscores,
-                        image_placeholder=image_placeholder,
-                        image_mode=image_mode,
-                        indent=indent,
-                        text_width=text_width,
-                        page_no=page_no,
-                        included_content_layers=included_content_layers,
-                        list_level=list_level,
-                        is_inline_scope=True,
-                        visited=visited,
-                    )
-                    text = " ".join(comps)
-                    _ingest_text(
-                        text=text,
-                        # special chars have already been escaped as needed
-                        do_escape_html=False,
-                        do_escape_underscores=False,
-                    )
-                else:
-                    continue
-            elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
-                marker = "" if strict_text else "#"
-                text = f"{marker} {item.text}"
-                _ingest_text(text.strip())
-            elif (
-                isinstance(item, TextItem)
-                and item.label in [DocItemLabel.SECTION_HEADER]
-            ) or isinstance(item, SectionHeaderItem):
-                marker = ""
-                if not strict_text:
-                    marker = "#" * level
-                    if len(marker) < 2:
-                        marker = "##"
-                text = f"{marker} {item.text}"
-                _ingest_text(text.strip())
-            elif isinstance(item, CodeItem):
-                text = f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
-                _ingest_text(text, do_escape_underscores=False, do_escape_html=False)
-            elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
-                if item.text != "":
-                    _ingest_text(
-                        f"${item.text}$" if is_inline_scope else f"$${item.text}$$",
-                        do_escape_underscores=False,
-                        do_escape_html=False,
-                    )
-                elif item.orig != "":
-                    _ingest_text(
-                        "<!-- formula-not-decoded -->",
-                        do_escape_underscores=False,
-                        do_escape_html=False,
-                    )
-            elif isinstance(item, TextItem):
-                if len(item.text) and text_width > 0:
-                    text = item.text
-                    wrapped_text = textwrap.fill(text, width=text_width)
-                    _ingest_text(wrapped_text)
-                elif len(item.text):
-                    _ingest_text(item.text)
-            elif isinstance(item, TableItem) and not strict_text:
-                if caption_text := item.caption_text(self):
-                    _ingest_text(caption_text)
-                md_table = item.export_to_markdown()
-                _ingest_text(md_table)
-            elif isinstance(item, PictureItem) and not strict_text:
-                _ingest_text(item.caption_text(self))
-                line = item.export_to_markdown(
-                    doc=self,
-                    image_placeholder=image_placeholder,
-                    image_mode=image_mode,
-                )
-                _ingest_text(line, do_escape_html=False, do_escape_underscores=False)
-            elif isinstance(item, (KeyValueItem, FormItem)):
-                text = item._export_to_markdown()
-                _ingest_text(text, do_escape_html=False, do_escape_underscores=False)
-            elif isinstance(item, DocItem):
-                text = "<!-- missing-text -->"
-                _ingest_text(text, do_escape_html=False, do_escape_underscores=False)
-        return components
+        return ser_res.text
     def export_to_text(  # noqa: C901
         self,
         delim: str = "\n\n",
         from_element: int = 0,
         to_element: int = 1000000,
-        labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
+        labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
     ) -> str:
         """export_to_text."""
         return self.export_to_markdown(
@@ -2936,7 +3014,378 @@ class DoclingDocument(BaseModel):
         return html_text
-    def save_as_document_tokens(
+    def load_from_doctags(  # noqa: C901
+        self,
+        doctag_document: DocTagsDocument,
+    ) -> "DoclingDocument":
+        r"""Load Docling document from lists of DocTags and Images."""
+        # Maps the recognized tag to a Docling label.
+        # Code items will be given DocItemLabel.CODE
+        tag_to_doclabel = {
+            "title": DocItemLabel.TITLE,
+            "document_index": DocItemLabel.DOCUMENT_INDEX,
+            "otsl": DocItemLabel.TABLE,
+            "section_header_level_1": DocItemLabel.SECTION_HEADER,
+            "checkbox_selected": DocItemLabel.CHECKBOX_SELECTED,
+            "checkbox_unselected": DocItemLabel.CHECKBOX_UNSELECTED,
+            "text": DocItemLabel.TEXT,
+            "page_header": DocItemLabel.PAGE_HEADER,
+            "page_footer": DocItemLabel.PAGE_FOOTER,
+            "formula": DocItemLabel.FORMULA,
+            "caption": DocItemLabel.CAPTION,
+            "picture": DocItemLabel.PICTURE,
+            "list_item": DocItemLabel.LIST_ITEM,
+            "footnote": DocItemLabel.FOOTNOTE,
+            "code": DocItemLabel.CODE,
+        }
+        def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
+            """Extract <loc_...> coords from the chunk, normalized by / 500."""
+            coords = re.findall(r"<loc_(\d+)>", text_chunk)
+            if len(coords) == 4:
+                l, t, r, b = map(float, coords)
+                return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
+            return None
+        def extract_inner_text(text_chunk: str) -> str:
+            """Strip all <...> tags inside the chunk to get the raw text content."""
+            return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()
+        def otsl_parse_texts(texts, tokens):
+            split_word = TableToken.OTSL_NL.value
+            split_row_tokens = [
+                list(y)
+                for x, y in itertools.groupby(tokens, lambda z: z == split_word)
+                if not x
+            ]
+            table_cells = []
+            r_idx = 0
+            c_idx = 0
+            def count_right(tokens, c_idx, r_idx, which_tokens):
+                span = 0
+                c_idx_iter = c_idx
+                while tokens[r_idx][c_idx_iter] in which_tokens:
+                    c_idx_iter += 1
+                    span += 1
+                    if c_idx_iter >= len(tokens[r_idx]):
+                        return span
+                return span
+            def count_down(tokens, c_idx, r_idx, which_tokens):
+                span = 0
+                r_idx_iter = r_idx
+                while tokens[r_idx_iter][c_idx] in which_tokens:
+                    r_idx_iter += 1
+                    span += 1
+                    if r_idx_iter >= len(tokens):
+                        return span
+                return span
+            for i, text in enumerate(texts):
+                cell_text = ""
+                if text in [
+                    TableToken.OTSL_FCEL.value,
+                    TableToken.OTSL_ECEL.value,
+                    TableToken.OTSL_CHED.value,
+                    TableToken.OTSL_RHED.value,
+                    TableToken.OTSL_SROW.value,
+                ]:
+                    row_span = 1
+                    col_span = 1
+                    right_offset = 1
+                    if text != TableToken.OTSL_ECEL.value:
+                        cell_text = texts[i + 1]
+                        right_offset = 2
+                    # Check next element(s) for lcel / ucel / xcel,
+                    # set properly row_span, col_span
+                    next_right_cell = ""
+                    if i + right_offset < len(texts):
+                        next_right_cell = texts[i + right_offset]
+                    next_bottom_cell = ""
+                    if r_idx + 1 < len(split_row_tokens):
+                        if c_idx < len(split_row_tokens[r_idx + 1]):
+                            next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
+                    if next_right_cell in [
+                        TableToken.OTSL_LCEL.value,
+                        TableToken.OTSL_XCEL.value,
+                    ]:
+                        # we have horisontal spanning cell or 2d spanning cell
+                        col_span += count_right(
+                            split_row_tokens,
+                            c_idx + 1,
+                            r_idx,
+                            [TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
+                        )
+                    if next_bottom_cell in [
+                        TableToken.OTSL_UCEL.value,
+                        TableToken.OTSL_XCEL.value,
+                    ]:
+                        # we have a vertical spanning cell or 2d spanning cell
+                        row_span += count_down(
+                            split_row_tokens,
+                            c_idx,
+                            r_idx + 1,
+                            [TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
+                        )
+                    table_cells.append(
+                        TableCell(
+                            text=cell_text.strip(),
+                            row_span=row_span,
+                            col_span=col_span,
+                            start_row_offset_idx=r_idx,
+                            end_row_offset_idx=r_idx + row_span,
+                            start_col_offset_idx=c_idx,
+                            end_col_offset_idx=c_idx + col_span,
+                        )
+                    )
+                if text in [
+                    TableToken.OTSL_FCEL.value,
+                    TableToken.OTSL_ECEL.value,
+                    TableToken.OTSL_CHED.value,
+                    TableToken.OTSL_RHED.value,
+                    TableToken.OTSL_SROW.value,
+                    TableToken.OTSL_LCEL.value,
+                    TableToken.OTSL_UCEL.value,
+                    TableToken.OTSL_XCEL.value,
+                ]:
+                    c_idx += 1
+                if text == TableToken.OTSL_NL.value:
+                    r_idx += 1
+                    c_idx = 0
+            return table_cells, split_row_tokens
+        def otsl_extract_tokens_and_text(s: str):
+            # Pattern to match anything enclosed by < >
+            # (including the angle brackets themselves)
+            pattern = r"(<[^>]+>)"
+            # Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
+            tokens = re.findall(pattern, s)
+            # Remove any tokens that start with "<loc_"
+            tokens = [
+                token
+                for token in tokens
+                if not (
+                    token.startswith(rf"<{DocumentToken.LOC.value}")
+                    or token
+                    in [
+                        rf"<{DocumentToken.OTSL.value}>",
+                        rf"</{DocumentToken.OTSL.value}>",
+                    ]
+                )
+            ]
+            # Split the string by those tokens to get the in-between text
+            text_parts = re.split(pattern, s)
+            text_parts = [
+                token
+                for token in text_parts
+                if not (
+                    token.startswith(rf"<{DocumentToken.LOC.value}")
+                    or token
+                    in [
+                        rf"<{DocumentToken.OTSL.value}>",
+                        rf"</{DocumentToken.OTSL.value}>",
+                    ]
+                )
+            ]
+            # Remove any empty or purely whitespace strings from text_parts
+            text_parts = [part for part in text_parts if part.strip()]
+            return tokens, text_parts
+        def parse_table_content(otsl_content: str) -> TableData:
+            tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
+            table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
+            return TableData(
+                num_rows=len(split_row_tokens),
+                num_cols=(
+                    max(len(row) for row in split_row_tokens) if split_row_tokens else 0
+                ),
+                table_cells=table_cells,
+            )
+        # doc = DoclingDocument(name="Document")
+        for pg_idx, doctag_page in enumerate(doctag_document.pages):
+            page_doctags = doctag_page.tokens
+            image = doctag_page.image
+            page_no = pg_idx + 1
+            # bounding_boxes = []
+            if image is not None:
+                pg_width = image.width
+                pg_height = image.height
+            else:
+                pg_width = 1
+                pg_height = 1
+            """
+            1. Finds all <tag>...</tag>
+               blocks in the entire string (multi-line friendly)
+               in the order they appear.
+            2. For each chunk, extracts bounding box (if any) and inner text.
+            3. Adds the item to a DoclingDocument structure with the right label.
+            4. Tracks bounding boxes+color in a separate list for later visualization.
+            """
+            # Regex for root level recognized tags
+            tag_pattern = (
+                rf"<(?P<tag>{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|"
+                rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|"
+                rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|"
+                rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
+                rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
+                rf"{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
+                rf"{DocItemLabel.SECTION_HEADER}_level_1|"
+                rf"{DocumentToken.ORDERED_LIST.value}|"
+                rf"{DocumentToken.UNORDERED_LIST.value}|"
+                rf"{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
+            )
+            # DocumentToken.OTSL
+            pattern = re.compile(tag_pattern, re.DOTALL)
+            # Go through each match in order
+            for match in pattern.finditer(page_doctags):
+                full_chunk = match.group(0)
+                tag_name = match.group("tag")
+                bbox = extract_bounding_box(full_chunk) if image else None
+                doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
+                if tag_name == DocumentToken.OTSL.value:
+                    table_data = parse_table_content(full_chunk)
+                    bbox = extract_bounding_box(full_chunk) if image else None
+                    if bbox:
+                        prov = ProvenanceItem(
+                            bbox=bbox.resize_by_scale(pg_width, pg_height),
+                            charspan=(0, 0),
+                            page_no=page_no,
+                        )
+                        self.add_table(data=table_data, prov=prov)
+                    else:
+                        self.add_table(data=table_data)
+                elif tag_name == DocItemLabel.PICTURE:
+                    text_caption_content = extract_inner_text(full_chunk)
+                    if image:
+                        if bbox:
+                            im_width, im_height = image.size
+                            crop_box = (
+                                int(bbox.l * im_width),
+                                int(bbox.t * im_height),
+                                int(bbox.r * im_width),
+                                int(bbox.b * im_height),
+                            )
+                            cropped_image = image.crop(crop_box)
+                            pic = self.add_picture(
+                                parent=None,
+                                image=ImageRef.from_pil(image=cropped_image, dpi=72),
+                                prov=(
+                                    ProvenanceItem(
+                                        bbox=bbox.resize_by_scale(pg_width, pg_height),
+                                        charspan=(0, 0),
+                                        page_no=page_no,
+                                    )
+                                ),
+                            )
+                            # If there is a caption to an image, add it as well
+                            if len(text_caption_content) > 0:
+                                caption_item = self.add_text(
+                                    label=DocItemLabel.CAPTION,
+                                    text=text_caption_content,
+                                    parent=None,
+                                )
+                                pic.captions.append(caption_item.get_ref())
+                    else:
+                        if bbox:
+                            # In case we don't have access to an binary of an image
+                            self.add_picture(
+                                parent=None,
+                                prov=ProvenanceItem(
+                                    bbox=bbox, charspan=(0, 0), page_no=page_no
+                                ),
+                            )
+                            # If there is a caption to an image, add it as well
+                            if len(text_caption_content) > 0:
+                                caption_item = self.add_text(
+                                    label=DocItemLabel.CAPTION,
+                                    text=text_caption_content,
+                                    parent=None,
+                                )
+                                pic.captions.append(caption_item.get_ref())
+                elif tag_name in [
+                    DocumentToken.ORDERED_LIST.value,
+                    DocumentToken.UNORDERED_LIST.value,
+                ]:
+                    list_label = GroupLabel.LIST
+                    enum_marker = ""
+                    enum_value = 0
+                    if tag_name == DocumentToken.ORDERED_LIST.value:
+                        list_label = GroupLabel.ORDERED_LIST
+                    list_item_pattern = (
+                        rf"<(?P<tag>{DocItemLabel.LIST_ITEM})>.*?</(?P=tag)>"
+                    )
+                    li_pattern = re.compile(list_item_pattern, re.DOTALL)
+                    # Add list group:
+                    new_list = self.add_group(label=list_label, name="list")
+                    # Pricess list items
+                    for li_match in li_pattern.finditer(full_chunk):
+                        enum_value += 1
+                        if tag_name == DocumentToken.ORDERED_LIST.value:
+                            enum_marker = str(enum_value) + "."
+                        li_full_chunk = li_match.group(0)
+                        li_bbox = extract_bounding_box(li_full_chunk) if image else None
+                        text_content = extract_inner_text(li_full_chunk)
+                        # Add list item
+                        self.add_list_item(
+                            marker=enum_marker,
+                            enumerated=(tag_name == DocumentToken.ORDERED_LIST.value),
+                            parent=new_list,
+                            text=text_content,
+                            prov=(
+                                ProvenanceItem(
+                                    bbox=li_bbox.resize_by_scale(pg_width, pg_height),
+                                    charspan=(0, len(text_content)),
+                                    page_no=page_no,
+                                )
+                                if li_bbox
+                                else None
+                            ),
+                        )
+                else:
+                    # For everything else, treat as text
+                    text_content = extract_inner_text(full_chunk)
+                    self.add_text(
+                        label=doc_label,
+                        text=text_content,
+                        prov=(
+                            ProvenanceItem(
+                                bbox=bbox.resize_by_scale(pg_width, pg_height),
+                                charspan=(0, len(text_content)),
+                                page_no=page_no,
+                            )
+                            if bbox
+                            else None
+                        ),
+                    )
+        return self
+    @deprecated("Use save_as_doctags instead.")
+    def save_as_document_tokens(self, *args, **kwargs):
+        r"""Save the document content to a DocumentToken format."""
+        return self.save_as_doctags(*args, **kwargs)
+    def save_as_doctags(
         self,
         filename: Path,
         delim: str = "",
@@ -2952,7 +3401,7 @@ class DoclingDocument(BaseModel):
         add_table_cell_location: bool = False,
         add_table_cell_text: bool = True,
     ):
-        r"""Save the document content to a DocumentToken format."""
+        r"""Save the document content to DocTags format."""
         out = self.export_to_document_tokens(
             delim=delim,
             from_element=from_element,