PyPI - docling-core - Versions diffs - 2.23.3__py3-none-any.whl → 2.24.0__py3-none-any.whl - Mend - Supply Chain Defender

docling-core 2.23.3__py3-none-any.whl → 2.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling-core might be problematic. Click here for more details.

Files changed (12) hide show

docling_core/types/doc/document.py CHANGED Viewed

@@ -50,7 +50,7 @@ from docling_core.types.doc.labels import (
     GraphLinkLabel,
     GroupLabel,
 )
-from docling_core.types.doc.tokens import DocumentToken, TableToken
+from docling_core.types.doc.tokens import _LOC_PREFIX, DocumentToken, TableToken
 from docling_core.types.doc.utils import (
     get_html_tag_with_text_direction,
     get_text_direction,
@@ -79,6 +79,7 @@ DEFAULT_EXPORT_LABELS = {
     DocItemLabel.REFERENCE,
     DocItemLabel.PAGE_HEADER,
     DocItemLabel.PAGE_FOOTER,
+    DocItemLabel.KEY_VALUE_REGION,
 }
 DOCUMENT_TOKENS_EXPORT_LABELS = DEFAULT_EXPORT_LABELS.copy()
@@ -414,6 +415,7 @@ class DocumentOrigin(BaseModel):
         "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
         "text/asciidoc",
         "text/markdown",
+        "text/csv",
     ]
     @field_validator("binary_hash", mode="before")
@@ -643,7 +645,7 @@ class DocItem(
     def get_location_tokens(
         self,
         doc: "DoclingDocument",
-        new_line: str,
+        new_line: str = "",  # deprecated
         xsize: int = 500,
         ysize: int = 500,
     ) -> str:
@@ -662,7 +664,7 @@ class DocItem(
                 xsize=xsize,
                 ysize=ysize,
             )
-            location += f"{loc_str}{new_line}"
+            location += loc_str
         return location
@@ -722,10 +724,15 @@ class TextItem(DocItem):
     formatting: Optional[Formatting] = None
     hyperlink: Optional[Union[AnyUrl, Path]] = None
-    def export_to_document_tokens(
+    @deprecated("Use export_to_doctags() instead.")
+    def export_to_document_tokens(self, *args, **kwargs):
+        r"""Export to DocTags format."""
+        return self.export_to_doctags(*args, **kwargs)
+    def export_to_doctags(
         self,
         doc: "DoclingDocument",
-        new_line: str = "",
+        new_line: str = "",  # deprecated
         xsize: int = 500,
         ysize: int = 500,
         add_location: bool = True,
@@ -734,29 +741,29 @@ class TextItem(DocItem):
         r"""Export text element to document tokens format.
         :param doc: "DoclingDocument":
-        :param new_line: str (Default value = "")
+        :param new_line: str (Default value = "")  Deprecated
         :param xsize: int:  (Default value = 500)
         :param ysize: int:  (Default value = 500)
         :param add_location: bool:  (Default value = True)
         :param add_content: bool:  (Default value = True)
         """
-        body = f"<{self.label.value}>{new_line}"
+        from docling_core.experimental.serializer.doctags import (
+            DocTagsDocSerializer,
+            DocTagsParams,
+        )
-        if add_location:
-            body += self.get_location_tokens(
-                doc=doc,
-                new_line=new_line,
+        serializer = DocTagsDocSerializer(
+            doc=doc,
+            params=DocTagsParams(
                 xsize=xsize,
                 ysize=ysize,
-            )
-        if add_content and self.text is not None:
-            body += f"{self.text.strip()}{new_line}"
-        body += f"</{self.label.value}>\n"
-        return body
+                add_location=add_location,
+                add_content=add_content,
+            ),
+        )
+        text = serializer.serialize(item=self).text
+        return text
 class TitleItem(TextItem):
@@ -775,10 +782,15 @@ class SectionHeaderItem(TextItem):
     )
     level: LevelNumber = 1
-    def export_to_document_tokens(
+    @deprecated("Use export_to_doctags() instead.")
+    def export_to_document_tokens(self, *args, **kwargs):
+        r"""Export to DocTags format."""
+        return self.export_to_doctags(*args, **kwargs)
+    def export_to_doctags(
         self,
         doc: "DoclingDocument",
-        new_line: str = "",
+        new_line: str = "",  # deprecated
         xsize: int = 500,
         ysize: int = 500,
         add_location: bool = True,
@@ -787,34 +799,29 @@ class SectionHeaderItem(TextItem):
         r"""Export text element to document tokens format.
         :param doc: "DoclingDocument":
-        :param new_line: str (Default value = "")
+        :param new_line: str (Default value = "")  Deprecated
         :param xsize: int:  (Default value = 500)
         :param ysize: int:  (Default value = 500)
         :param add_location: bool:  (Default value = True)
         :param add_content: bool:  (Default value = True)
         """
-        body = f"<{self.label.value}_level_{self.level}>{new_line}"
-        # TODO: This must be done through an explicit mapping.
-        # assert DocumentToken.is_known_token(
-        #    body
-        # ), f"failed DocumentToken.is_known_token({body})"
+        from docling_core.experimental.serializer.doctags import (
+            DocTagsDocSerializer,
+            DocTagsParams,
+        )
-        if add_location:
-            body += self.get_location_tokens(
-                doc=doc,
-                new_line=new_line,
+        serializer = DocTagsDocSerializer(
+            doc=doc,
+            params=DocTagsParams(
                 xsize=xsize,
                 ysize=ysize,
-            )
-        if add_content and self.text is not None:
-            body += f"{self.text.strip()}{new_line}"
-        body += f"</{self.label.value}_level_{self.level}>\n"
-        return body
+                add_location=add_location,
+                add_content=add_content,
+            ),
+        )
+        text = serializer.serialize(item=self).text
+        return text
 class ListItem(TextItem):
@@ -865,10 +872,15 @@ class CodeItem(FloatingItem, TextItem):
     )
     code_language: CodeLanguageLabel = CodeLanguageLabel.UNKNOWN
-    def export_to_document_tokens(
+    @deprecated("Use export_to_doctags() instead.")
+    def export_to_document_tokens(self, *args, **kwargs):
+        r"""Export to DocTags format."""
+        return self.export_to_doctags(*args, **kwargs)
+    def export_to_doctags(
         self,
         doc: "DoclingDocument",
-        new_line: str = "",
+        new_line: str = "",  # deprecated
         xsize: int = 500,
         ysize: int = 500,
         add_location: bool = True,
@@ -877,29 +889,29 @@ class CodeItem(FloatingItem, TextItem):
         r"""Export text element to document tokens format.
         :param doc: "DoclingDocument":
-        :param new_line: str (Default value = "")
+        :param new_line: str (Default value = "")  Deprecated
         :param xsize: int:  (Default value = 500)
         :param ysize: int:  (Default value = 500)
         :param add_location: bool:  (Default value = True)
         :param add_content: bool:  (Default value = True)
         """
-        body = f"<{self.label.value}>{new_line}"
+        from docling_core.experimental.serializer.doctags import (
+            DocTagsDocSerializer,
+            DocTagsParams,
+        )
-        if add_location:
-            body += self.get_location_tokens(
-                doc=doc,
-                new_line=new_line,
+        serializer = DocTagsDocSerializer(
+            doc=doc,
+            params=DocTagsParams(
                 xsize=xsize,
                 ysize=ysize,
-            )
-        if add_content and self.text is not None:
-            body += f"<_{self.code_language.value}_>{self.text}{new_line}"
-        body += f"</{self.label.value}>\n"
-        return body
+                add_location=add_location,
+                add_content=add_content,
+            ),
+        )
+        text = serializer.serialize(item=self).text
+        return text
 class FormulaItem(TextItem):
@@ -953,7 +965,10 @@ class PictureItem(FloatingItem):
         image_placeholder: str = "<!-- image -->",
     ) -> str:
         """Export picture to Markdown format."""
-        from docling_core.experimental.serializer.markdown import MarkdownDocSerializer
+        from docling_core.experimental.serializer.markdown import (
+            MarkdownDocSerializer,
+            MarkdownParams,
+        )
         if not add_caption:
             _logger.warning(
@@ -961,20 +976,13 @@ class PictureItem(FloatingItem):
             )
         serializer = MarkdownDocSerializer(
-            doc=self,
-            image_mode=image_mode,
-        )
-        text = (
-            serializer.picture_serializer.serialize(
-                item=self,
-                doc_serializer=serializer,
-                doc=doc,
+            doc=doc,
+            params=MarkdownParams(
                 image_mode=image_mode,
                 image_placeholder=image_placeholder,
-            ).text
-            if serializer.picture_serializer
-            else ""
+            ),
         )
+        text = serializer.serialize(item=self).text
         return text
     def export_to_html(
@@ -1033,10 +1041,15 @@ class PictureItem(FloatingItem):
         else:
             return default_response
-    def export_to_document_tokens(
+    @deprecated("Use export_to_doctags() instead.")
+    def export_to_document_tokens(self, *args, **kwargs):
+        r"""Export to DocTags format."""
+        return self.export_to_doctags(*args, **kwargs)
+    def export_to_doctags(
         self,
         doc: "DoclingDocument",
-        new_line: str = "",
+        new_line: str = "",  # deprecated
         xsize: int = 500,
         ysize: int = 500,
         add_location: bool = True,
@@ -1046,7 +1059,7 @@ class PictureItem(FloatingItem):
         r"""Export picture to document tokens format.
         :param doc: "DoclingDocument":
-        :param new_line: str (Default value = "")
+        :param new_line: str (Default value = "")  Deprecated
         :param xsize: int:  (Default value = 500)
         :param ysize: int:  (Default value = 500)
         :param add_location: bool:  (Default value = True)
@@ -1055,59 +1068,23 @@ class PictureItem(FloatingItem):
         :param # not used at the moment
         """
-        body = f"<{self.label.value}>{new_line}"
-        if add_location:
-            body += self.get_location_tokens(
-                doc=doc,
-                new_line=new_line,
+        from docling_core.experimental.serializer.doctags import (
+            DocTagsDocSerializer,
+            DocTagsParams,
+        )
+        serializer = DocTagsDocSerializer(
+            doc=doc,
+            params=DocTagsParams(
                 xsize=xsize,
                 ysize=ysize,
-            )
-        classifications = [
-            ann
-            for ann in self.annotations
-            if isinstance(ann, PictureClassificationData)
-        ]
-        if len(classifications) > 0:
-            # ! TODO: currently this code assumes class_name is of type 'str'
-            # ! TODO: when it will change to an ENUM --> adapt code
-            predicted_class = classifications[0].predicted_classes[0].class_name
-            body += DocumentToken.get_picture_classification_token(predicted_class)
-        smiles_annotations = [
-            ann for ann in self.annotations if isinstance(ann, PictureMoleculeData)
-        ]
-        if len(smiles_annotations) > 0:
-            body += (
-                "<"
-                + DocumentToken.SMILES.value
-                + ">"
-                + smiles_annotations[0].smi
-                + "</"
-                + DocumentToken.SMILES.value
-                + ">"
-            )
-        if add_caption and len(self.captions):
-            text = self.caption_text(doc)
-            if len(text):
-                body += f"<{DocItemLabel.CAPTION.value}>"
-                for caption in self.captions:
-                    body += caption.resolve(doc).get_location_tokens(
-                        doc=doc,
-                        new_line=new_line,
-                        xsize=xsize,
-                        ysize=ysize,
-                    )
-                body += f"{text.strip()}"
-                body += f"</{DocItemLabel.CAPTION.value}>"
-                body += f"{new_line}"
-        body += f"</{self.label.value}>\n"
-        return body
+                add_location=add_location,
+                add_content=add_content,
+                add_caption=add_caption,
+            ),
+        )
+        text = serializer.serialize(item=self).text
+        return text
 class TableItem(FloatingItem):
@@ -1171,18 +1148,8 @@ class TableItem(FloatingItem):
                 MarkdownDocSerializer,
             )
-            serializer = MarkdownDocSerializer(
-                doc=doc,
-            )
-            text = (
-                serializer.table_serializer.serialize(
-                    item=self,
-                    doc_serializer=serializer,
-                    doc=doc,
-                ).text
-                if serializer.table_serializer
-                else ""
-            )
+            serializer = MarkdownDocSerializer(doc=doc)
+            text = serializer.serialize(item=self).text
             return text
         else:
             _logger.warning(
@@ -1391,10 +1358,15 @@ class TableItem(FloatingItem):
             body_str = "".join(body)
         return body_str
-    def export_to_document_tokens(
+    @deprecated("Use export_to_doctags() instead.")
+    def export_to_document_tokens(self, *args, **kwargs):
+        r"""Export to DocTags format."""
+        return self.export_to_doctags(*args, **kwargs)
+    def export_to_doctags(
         self,
         doc: "DoclingDocument",
-        new_line: str = "",
+        new_line: str = "",  # deprecated
         xsize: int = 500,
         ysize: int = 500,
         add_location: bool = True,
@@ -1405,7 +1377,7 @@ class TableItem(FloatingItem):
         r"""Export table to document tokens format.
         :param doc: "DoclingDocument":
-        :param new_line: str (Default value = "")
+        :param new_line: str (Default value = "")  Deprecated
         :param xsize: int:  (Default value = 500)
         :param ysize: int:  (Default value = 500)
         :param add_location: bool:  (Default value = True)
@@ -1414,39 +1386,24 @@ class TableItem(FloatingItem):
         :param add_caption: bool:  (Default value = True)
         """
-        otsl_tag = DocumentToken.OTSL.value
-        body = f"<{otsl_tag}>{new_line}"
+        from docling_core.experimental.serializer.doctags import (
+            DocTagsDocSerializer,
+            DocTagsParams,
+        )
-        if add_location:
-            body += self.get_location_tokens(
-                doc=doc,
-                new_line=new_line,
+        serializer = DocTagsDocSerializer(
+            doc=doc,
+            params=DocTagsParams(
                 xsize=xsize,
                 ysize=ysize,
-            )
-        body += self.export_to_otsl(doc, add_cell_location, add_cell_text, xsize, ysize)
-        if add_caption and len(self.captions):
-            text = self.caption_text(doc)
-            if len(text):
-                body += f"<{DocItemLabel.CAPTION.value}>"
-                for caption in self.captions:
-                    body += caption.resolve(doc).get_location_tokens(
-                        doc=doc,
-                        new_line=new_line,
-                        xsize=xsize,
-                        ysize=ysize,
-                    )
-                body += f"{text.strip()}"
-                body += f"</{DocItemLabel.CAPTION.value}>"
-                body += f"{new_line}"
-        body += f"</{otsl_tag}>\n"
-        return body
+                add_location=add_location,
+                add_caption=add_caption,
+                add_table_cell_location=add_cell_location,
+                add_table_cell_text=add_cell_text,
+            ),
+        )
+        text = serializer.serialize(item=self).text
+        return text
 class GraphCell(BaseModel):
@@ -1508,6 +1465,42 @@ class KeyValueItem(FloatingItem):
     graph: GraphData
+    def export_to_document_tokens(
+        self,
+        doc: "DoclingDocument",
+        new_line: str = "",  # deprecated
+        xsize: int = 500,
+        ysize: int = 500,
+        add_location: bool = True,
+        add_content: bool = True,
+    ):
+        r"""Export key value item to document tokens format.
+        :param doc: "DoclingDocument":
+        :param new_line: str (Default value = "")  Deprecated
+        :param xsize: int:  (Default value = 500)
+        :param ysize: int:  (Default value = 500)
+        :param add_location: bool:  (Default value = True)
+        :param add_content: bool:  (Default value = True)
+        """
+        from docling_core.experimental.serializer.doctags import (
+            DocTagsDocSerializer,
+            DocTagsParams,
+        )
+        serializer = DocTagsDocSerializer(
+            doc=doc,
+            params=DocTagsParams(
+                xsize=xsize,
+                ysize=ysize,
+                add_location=add_location,
+                add_content=add_content,
+            ),
+        )
+        text = serializer.serialize(item=self).text
+        return text
 class FormItem(FloatingItem):
     """FormItem."""
@@ -2297,7 +2290,7 @@ class DoclingDocument(BaseModel):
         with_groups: bool = False,
         traverse_pictures: bool = False,
         page_no: Optional[int] = None,
-        included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
+        included_content_layers: Optional[set[ContentLayer]] = None,
         _level: int = 0,  # fixed parameter, carries through the node nesting level
     ) -> typing.Iterable[Tuple[NodeItem, int]]:  # tuple of node and level
         """iterate_elements.
@@ -2310,6 +2303,11 @@ class DoclingDocument(BaseModel):
         :param # fixed parameter:
         :param carries through the node nesting level:
         """
+        my_layers = (
+            included_content_layers
+            if included_content_layers is not None
+            else DEFAULT_CONTENT_LAYERS
+        )
         if not root:
             root = self.body
@@ -2325,7 +2323,7 @@ class DoclingDocument(BaseModel):
                     or any(prov.page_no == page_no for prov in root.prov)
                 )
             )
-            and root.content_layer in included_content_layers
+            and root.content_layer in my_layers
         )
         if should_yield:
@@ -2345,7 +2343,7 @@ class DoclingDocument(BaseModel):
                     traverse_pictures=traverse_pictures,
                     page_no=page_no,
                     _level=_level + 1,
-                    included_content_layers=included_content_layers,
+                    included_content_layers=my_layers,
                 )
     def _clear_picture_pil_cache(self):
@@ -2475,12 +2473,14 @@ class DoclingDocument(BaseModel):
     def save_as_json(
         self,
-        filename: Path,
+        filename: Union[str, Path],
         artifacts_dir: Optional[Path] = None,
         image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
         indent: int = 2,
     ):
         """Save as json."""
+        if isinstance(filename, str):
+            filename = Path(filename)
         artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
         if image_mode == ImageRefMode.REFERENCED:
@@ -2495,7 +2495,7 @@ class DoclingDocument(BaseModel):
             json.dump(out, fw, indent=indent)
     @classmethod
-    def load_from_json(cls, filename: Path) -> "DoclingDocument":
+    def load_from_json(cls, filename: Union[str, Path]) -> "DoclingDocument":
         """load_from_json.
         :param filename: The filename to load a saved DoclingDocument from a .json.
@@ -2505,17 +2505,21 @@ class DoclingDocument(BaseModel):
         :rtype: DoclingDocument
         """
+        if isinstance(filename, str):
+            filename = Path(filename)
         with open(filename, "r", encoding="utf-8") as f:
             return cls.model_validate_json(f.read())
     def save_as_yaml(
         self,
-        filename: Path,
+        filename: Union[str, Path],
         artifacts_dir: Optional[Path] = None,
         image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
         default_flow_style: bool = False,
     ):
         """Save as yaml."""
+        if isinstance(filename, str):
+            filename = Path(filename)
         artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
         if image_mode == ImageRefMode.REFERENCED:
@@ -2530,7 +2534,7 @@ class DoclingDocument(BaseModel):
             yaml.dump(out, fw, default_flow_style=default_flow_style)
     @classmethod
-    def load_from_yaml(cls, filename: Path) -> "DoclingDocument":
+    def load_from_yaml(cls, filename: Union[str, Path]) -> "DoclingDocument":
         """load_from_yaml.
         Args:
@@ -2539,6 +2543,8 @@ class DoclingDocument(BaseModel):
         Returns:
             DoclingDocument: the loaded DoclingDocument
         """
+        if isinstance(filename, str):
+            filename = Path(filename)
         with open(filename, encoding="utf-8") as f:
             data = yaml.load(f, Loader=yaml.FullLoader)
         return DoclingDocument.model_validate(data)
@@ -2556,12 +2562,12 @@ class DoclingDocument(BaseModel):
     def save_as_markdown(
         self,
-        filename: Path,
+        filename: Union[str, Path],
         artifacts_dir: Optional[Path] = None,
         delim: str = "\n\n",
         from_element: int = 0,
         to_element: int = sys.maxsize,
-        labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
+        labels: Optional[set[DocItemLabel]] = None,
         strict_text: bool = False,
         escaping_underscores: bool = True,
         image_placeholder: str = "<!-- image -->",
@@ -2569,9 +2575,12 @@ class DoclingDocument(BaseModel):
         indent: int = 4,
         text_width: int = -1,
         page_no: Optional[int] = None,
-        included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
+        included_content_layers: Optional[set[ContentLayer]] = None,
+        page_break_placeholder: Optional[str] = None,
     ):
         """Save to markdown."""
+        if isinstance(filename, str):
+            filename = Path(filename)
         artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
         if image_mode == ImageRefMode.REFERENCED:
@@ -2587,13 +2596,14 @@ class DoclingDocument(BaseModel):
             to_element=to_element,
             labels=labels,
             strict_text=strict_text,
-            escaping_underscores=escaping_underscores,
+            escape_underscores=escaping_underscores,
             image_placeholder=image_placeholder,
             image_mode=image_mode,
             indent=indent,
             text_width=text_width,
             page_no=page_no,
             included_content_layers=included_content_layers,
+            page_break_placeholder=page_break_placeholder,
         )
         with open(filename, "w", encoding="utf-8") as fw:
@@ -2604,15 +2614,16 @@ class DoclingDocument(BaseModel):
         delim: str = "\n\n",
         from_element: int = 0,
         to_element: int = sys.maxsize,
-        labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
+        labels: Optional[set[DocItemLabel]] = None,
         strict_text: bool = False,
-        escaping_underscores: bool = True,
+        escape_underscores: bool = True,
         image_placeholder: str = "<!-- image -->",
         image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
         indent: int = 4,
         text_width: int = -1,
         page_no: Optional[int] = None,
-        included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
+        included_content_layers: Optional[set[ContentLayer]] = None,
+        page_break_placeholder: Optional[str] = None,  # e.g. "<!-- page break -->",
     ) -> str:
         r"""Serialize to Markdown.
@@ -2627,8 +2638,9 @@ class DoclingDocument(BaseModel):
         :param to_element: Body slicing stop index
                 (exclusive). (Default value = maxint).
         :type to_element: int = sys.maxsize
-        :param labels: The set of document labels to include in the export.
-        :type labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS
+        :param labels: The set of document labels to include in the export. None falls
+            back to the system-defined default.
+        :type labels: Optional[set[DocItemLabel]] = None
         :param strict_text: Deprecated.
         :type strict_text: bool = False
         :param escaping_underscores: bool: Whether to escape underscores in the
@@ -2643,30 +2655,40 @@ class DoclingDocument(BaseModel):
         :param indent: The indent in spaces of the nested lists.
             (Default value = 4).
         :type indent: int = 4
+        :param included_content_layers: The set of layels to include in the export. None
+            falls back to the system-defined default.
+        :type included_content_layers: Optional[set[ContentLayer]] = None
+        :param page_break_placeholder: The placeholder to include for marking page
+            breaks. None means no page break placeholder will be used.
+        :type page_break_placeholder: Optional[str] = None
         :returns: The exported Markdown representation.
         :rtype: str
         """
         from docling_core.experimental.serializer.markdown import (
             MarkdownDocSerializer,
-            MarkdownListSerializer,
-            MarkdownTextSerializer,
+            MarkdownParams,
         )
+        my_labels = labels if labels is not None else DOCUMENT_TOKENS_EXPORT_LABELS
+        my_layers = (
+            included_content_layers
+            if included_content_layers is not None
+            else DEFAULT_CONTENT_LAYERS
+        )
         serializer = MarkdownDocSerializer(
             doc=self,
-            start=from_element,
-            stop=to_element,
-            image_placeholder=image_placeholder,
-            image_mode=image_mode,
-            labels=labels,
-            layers=included_content_layers,
-            pages={page_no} if page_no is not None else None,
-            escaping_underscores=escaping_underscores,
-            text_serializer=MarkdownTextSerializer(
-                wrap_width=text_width if text_width > 0 else None,
-            ),
-            list_serializer=MarkdownListSerializer(
+            params=MarkdownParams(
+                labels=my_labels,
+                layers=my_layers,
+                pages={page_no} if page_no is not None else None,
+                start_idx=from_element,
+                stop_idx=to_element,
+                escape_underscores=escape_underscores,
+                image_placeholder=image_placeholder,
+                image_mode=image_mode,
                 indent=indent,
+                wrap_width=text_width if text_width > 0 else None,
+                page_break_placeholder=page_break_placeholder,
             ),
         )
         ser_res = serializer.serialize()
@@ -2687,34 +2709,38 @@ class DoclingDocument(BaseModel):
         delim: str = "\n\n",
         from_element: int = 0,
         to_element: int = 1000000,
-        labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
+        labels: Optional[set[DocItemLabel]] = None,
     ) -> str:
         """export_to_text."""
+        my_labels = labels if labels is not None else DOCUMENT_TOKENS_EXPORT_LABELS
         return self.export_to_markdown(
-            delim,
-            from_element,
-            to_element,
-            labels,
+            delim=delim,
+            from_element=from_element,
+            to_element=to_element,
+            labels=my_labels,
             strict_text=True,
-            escaping_underscores=False,
+            escape_underscores=False,
             image_placeholder="",
         )
     def save_as_html(
         self,
-        filename: Path,
+        filename: Union[str, Path],
         artifacts_dir: Optional[Path] = None,
         from_element: int = 0,
         to_element: int = sys.maxsize,
-        labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
+        labels: Optional[set[DocItemLabel]] = None,
         image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
         formula_to_mathml: bool = True,
         page_no: Optional[int] = None,
         html_lang: str = "en",
         html_head: str = _HTML_DEFAULT_HEAD,
-        included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
+        included_content_layers: Optional[set[ContentLayer]] = None,
     ):
         """Save to HTML."""
+        if isinstance(filename, str):
+            filename = Path(filename)
         artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
         if image_mode == ImageRefMode.REFERENCED:
@@ -2740,8 +2766,10 @@ class DoclingDocument(BaseModel):
             fw.write(html_out)
     def _get_output_paths(
-        self, filename: Path, artifacts_dir: Optional[Path] = None
+        self, filename: Union[str, Path], artifacts_dir: Optional[Path] = None
     ) -> Tuple[Path, Optional[Path]]:
+        if isinstance(filename, str):
+            filename = Path(filename)
         if artifacts_dir is None:
             # Remove the extension and add '_pictures'
             artifacts_dir = filename.with_suffix("")
@@ -2775,15 +2803,21 @@ class DoclingDocument(BaseModel):
         self,
         from_element: int = 0,
         to_element: int = sys.maxsize,
-        labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
+        labels: Optional[set[DocItemLabel]] = None,
         image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
         formula_to_mathml: bool = True,
         page_no: Optional[int] = None,
         html_lang: str = "en",
         html_head: str = _HTML_DEFAULT_HEAD,
-        included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
+        included_content_layers: Optional[set[ContentLayer]] = None,
     ) -> str:
         r"""Serialize to HTML."""
+        my_labels = labels if labels is not None else DEFAULT_EXPORT_LABELS
+        my_layers = (
+            included_content_layers
+            if included_content_layers is not None
+            else DEFAULT_CONTENT_LAYERS
+        )
         def close_lists(
             curr_level: int,
@@ -2831,7 +2865,7 @@ class DoclingDocument(BaseModel):
                 self.body,
                 with_groups=True,
                 page_no=page_no,
-                included_content_layers=included_content_layers,
+                included_content_layers=my_layers,
             )
         ):
             # If we've moved to a lower level, we're exiting one or more groups
@@ -2853,7 +2887,7 @@ class DoclingDocument(BaseModel):
             if ix < from_element or to_element <= ix:
                 continue  # skip as many items as you want
-            if (isinstance(item, DocItem)) and (item.label not in labels):
+            if (isinstance(item, DocItem)) and (item.label not in my_labels):
                 continue  # skip any label that is not whitelisted
             if isinstance(item, GroupItem) and item.label in [
@@ -3000,7 +3034,7 @@ class DoclingDocument(BaseModel):
                     )
                 )
-            elif isinstance(item, DocItem) and item.label in labels:
+            elif isinstance(item, DocItem) and item.label in my_labels:
                 continue
         html_texts.append("</html>")
@@ -3037,6 +3071,7 @@ class DoclingDocument(BaseModel):
             "list_item": DocItemLabel.LIST_ITEM,
             "footnote": DocItemLabel.FOOTNOTE,
             "code": DocItemLabel.CODE,
+            "key_value_region": DocItemLabel.KEY_VALUE_REGION,
         }
         def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
@@ -3189,7 +3224,7 @@ class DoclingDocument(BaseModel):
                 token
                 for token in tokens
                 if not (
-                    token.startswith(rf"<{DocumentToken.LOC.value}")
+                    token.startswith(rf"<{_LOC_PREFIX}")
                     or token
                     in [
                         rf"<{DocumentToken.OTSL.value}>",
@@ -3203,7 +3238,7 @@ class DoclingDocument(BaseModel):
                 token
                 for token in text_parts
                 if not (
-                    token.startswith(rf"<{DocumentToken.LOC.value}")
+                    token.startswith(rf"<{_LOC_PREFIX}")
                     or token
                     in [
                         rf"<{DocumentToken.OTSL.value}>",
@@ -3228,6 +3263,95 @@ class DoclingDocument(BaseModel):
                 table_cells=table_cells,
             )
+        def parse_key_value_item(
+            tokens: str, image: Optional[PILImage.Image] = None
+        ) -> Tuple[GraphData, Optional[ProvenanceItem]]:
+            r"""Parse a ``<key_value_region>`` DocTags chunk into graph data.
+
+            Every ``<key_N>...</key_N>`` or ``<value_N>...</value_N>`` element
+            becomes one ``GraphCell`` whose text is the element content with the
+            ``<loc_*>`` and ``<link_*>`` tokens stripped. Each ``<link_M>`` token
+            found inside a cell is a predicted link from that cell to cell ``M``;
+            predictions whose target id matches no parsed cell are dropped.
+
+            :param tokens: str: The DocTags chunk of one key-value region.
+            :param image: Optional page image. Its width and height scale the
+                bounding boxes; without an image no provenance is produced.
+            :returns: The parsed ``GraphData`` and the provenance of the whole
+                region (``None`` when no region bounding box is available).
+            """
+            if image is not None:
+                pg_width = image.width
+                pg_height = image.height
+            else:
+                pg_width = 1
+                pg_height = 1
+            # Location tokens of the region itself sit between the opening tag
+            # and the first <key...> element.
+            start_locs_match = re.search(r"<key_value_region>(.*?)<key", tokens)
+            if start_locs_match:
+                overall_locs = start_locs_match.group(1)
+                overall_bbox = extract_bounding_box(overall_locs) if image else None
+                # NOTE(review): page_no is hard-coded to 1 here and for the
+                # cells below -- confirm this is intended for multi-page input.
+                overall_prov = (
+                    ProvenanceItem(
+                        bbox=overall_bbox.resize_by_scale(pg_width, pg_height),
+                        charspan=(0, 0),
+                        page_no=1,
+                    )
+                    if overall_bbox
+                    else None
+                )
+            else:
+                overall_prov = None
+            # here we assumed the labels as only key or value, later on we can update
+            # it to have unspecified, checkbox etc.
+            cell_pattern = re.compile(
+                r"<(?P<label>key|value)_(?P<id>\d+)>"
+                r"(?P<content>.*?)"
+                r"</(?P=label)_(?P=id)>",
+                re.DOTALL,
+            )
+            cells: List["GraphCell"] = []
+            links: List["GraphLink"] = []
+            raw_link_predictions = []
+            for cell_match in cell_pattern.finditer(tokens):
+                cell_label_str = cell_match.group("label")  # "key" or "value"
+                cell_id = int(cell_match.group("id"))
+                raw_content = cell_match.group("content")
+                # link tokens
+                link_matches = re.findall(r"<link_(\d+)>", raw_content)
+                cell_bbox = extract_bounding_box(raw_content) if image else None
+                cell_prov = None
+                if cell_bbox is not None:
+                    cell_prov = ProvenanceItem(
+                        bbox=cell_bbox.resize_by_scale(pg_width, pg_height),
+                        charspan=(0, 0),
+                        page_no=1,
+                    )
+                cleaned_text = re.sub(r"<loc_\d+>", "", raw_content)
+                cleaned_text = re.sub(r"<link_\d+>", "", cleaned_text).strip()
+                cell_obj = GraphCell(
+                    label=GraphCellLabel(cell_label_str),
+                    cell_id=cell_id,
+                    text=cleaned_text,
+                    orig=cleaned_text,
+                    prov=cell_prov,
+                    item_ref=None,
+                )
+                cells.append(cell_obj)
+                # Links may point at cells that appear later in the chunk, so
+                # only record the raw prediction here and validate afterwards.
+                for target_str in link_matches:
+                    raw_link_predictions.append((cell_id, int(target_str)))
+            # Build the id set once, after every cell is known.
+            cell_ids = {cell.cell_id for cell in cells}
+            for source_id, target_id in raw_link_predictions:
+                # basic check to validate the prediction
+                if target_id not in cell_ids:
+                    continue
+                link_obj = GraphLink(
+                    label=GraphLinkLabel.TO_VALUE,
+                    source_cell_id=source_id,
+                    target_cell_id=target_id,
+                )
+                links.append(link_obj)
+            return (GraphData(cells=cells, links=links), overall_prov)
         # doc = DoclingDocument(name="Document")
         for pg_idx, doctag_page in enumerate(doctag_document.pages):
             page_doctags = doctag_page.tokens
@@ -3243,6 +3367,12 @@ class DoclingDocument(BaseModel):
                 pg_width = 1
                 pg_height = 1
+            self.add_page(
+                page_no=page_no,
+                size=Size(width=pg_width, height=pg_height),
+                image=ImageRef.from_pil(image=image, dpi=72) if image else None,
+            )
             """
             1. Finds all <tag>...</tag>
                blocks in the entire string (multi-line friendly)
@@ -3263,6 +3393,7 @@ class DoclingDocument(BaseModel):
                 rf"{DocItemLabel.SECTION_HEADER}_level_1|"
                 rf"{DocumentToken.ORDERED_LIST.value}|"
                 rf"{DocumentToken.UNORDERED_LIST.value}|"
+                rf"{DocItemLabel.KEY_VALUE_REGION}|"
                 rf"{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
             )
@@ -3348,6 +3479,11 @@ class DoclingDocument(BaseModel):
                                     parent=None,
                                 )
                                 pic.captions.append(caption_item.get_ref())
+                elif tag_name == DocItemLabel.KEY_VALUE_REGION:
+                    key_value_data, kv_item_prov = parse_key_value_item(
+                        full_chunk, image
+                    )
+                    self.add_key_values(graph=key_value_data, prov=kv_item_prov)
                 elif tag_name in [
                     DocumentToken.ORDERED_LIST.value,
                     DocumentToken.UNORDERED_LIST.value,
@@ -3392,18 +3528,25 @@ class DoclingDocument(BaseModel):
                 else:
                     # For everything else, treat as text
                     text_content = extract_inner_text(full_chunk)
+                    element_prov = (
+                        ProvenanceItem(
+                            bbox=bbox.resize_by_scale(pg_width, pg_height),
+                            charspan=(0, len(text_content)),
+                            page_no=page_no,
+                        )
+                        if bbox
+                        else None
+                    )
+                    content_layer = ContentLayer.BODY
+                    if tag_name in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
+                        content_layer = ContentLayer.FURNITURE
                     self.add_text(
                         label=doc_label,
                         text=text_content,
-                        prov=(
-                            ProvenanceItem(
-                                bbox=bbox.resize_by_scale(pg_width, pg_height),
-                                charspan=(0, len(text_content)),
-                                page_no=page_no,
-                            )
-                            if bbox
-                            else None
-                        ),
+                        prov=element_prov,
+                        content_layer=content_layer,
                     )
         return self
@@ -3414,11 +3557,11 @@ class DoclingDocument(BaseModel):
     def save_as_doctags(
         self,
-        filename: Path,
+        filename: Union[str, Path],
         delim: str = "",
         from_element: int = 0,
         to_element: int = sys.maxsize,
-        labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
+        labels: Optional[set[DocItemLabel]] = None,
         xsize: int = 500,
         ysize: int = 500,
         add_location: bool = True,
@@ -3427,9 +3570,12 @@ class DoclingDocument(BaseModel):
         # table specific flags
         add_table_cell_location: bool = False,
         add_table_cell_text: bool = True,
+        minified: bool = False,
     ):
         r"""Save the document content to DocTags format."""
-        out = self.export_to_document_tokens(
+        if isinstance(filename, str):
+            filename = Path(filename)
+        out = self.export_to_doctags(
             delim=delim,
             from_element=from_element,
             to_element=to_element,
@@ -3442,17 +3588,23 @@ class DoclingDocument(BaseModel):
             # table specific flags
             add_table_cell_location=add_table_cell_location,
             add_table_cell_text=add_table_cell_text,
+            minified=minified,
         )
         with open(filename, "w", encoding="utf-8") as fw:
             fw.write(out)
-    def export_to_document_tokens(  # noqa: C901
+    @deprecated("Use export_to_doctags() instead.")
+    def export_to_document_tokens(self, *args, **kwargs):
+        r"""Export to DocTags format.
+
+        Deprecated alias of ``export_to_doctags()``: all positional and keyword
+        arguments are forwarded unchanged and its result is returned.
+        """
+        return self.export_to_doctags(*args, **kwargs)
+    def export_to_doctags(  # noqa: C901
         self,
-        delim: str = "",
+        delim: str = "",  # deprecated
         from_element: int = 0,
         to_element: int = sys.maxsize,
-        labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
+        labels: Optional[set[DocItemLabel]] = None,
         xsize: int = 500,
         ysize: int = 500,
         add_location: bool = True,
@@ -3461,13 +3613,14 @@ class DoclingDocument(BaseModel):
         # table specific flags
         add_table_cell_location: bool = False,
         add_table_cell_text: bool = True,
+        minified: bool = False,
     ) -> str:
         r"""Exports the document content to a DocumentToken format.
         Operates on a slice of the document's body as defined through arguments
         from_element and to_element; defaulting to the whole main_text.
-        :param delim: str:  (Default value = "")
+        :param delim: str:  (Default value = "")  Deprecated
         :param from_element: int:  (Default value = 0)
         :param to_element: Optional[int]:  (Default value = None)
         :param labels: set[DocItemLabel]
@@ -3478,199 +3631,40 @@ class DoclingDocument(BaseModel):
         :param add_page_index: bool:  (Default value = True)
         :param # table specific flagsadd_table_cell_location: bool
         :param add_table_cell_text: bool:  (Default value = True)
+        :param minified: bool:  (Default value = False)
         :returns: The content of the document formatted as a DocTags string.
         :rtype: str
         """
-        def _close_lists(
-            current_level: int,
-            previous_level: int,
-            ordered_list_stack: List[bool],
-            output_parts: List[str],
-        ) -> List[bool]:
-            """Close open list tags until the nesting level matches item's level."""
-            while current_level < previous_level and ordered_list_stack:
-                last_is_ordered = ordered_list_stack.pop()
-                if last_is_ordered:
-                    output_parts.append(f"</{DocumentToken.ORDERED_LIST.value}>\n")
-                else:
-                    output_parts.append(f"</{DocumentToken.UNORDERED_LIST.value}>\n")
-                previous_level -= 1
-            return ordered_list_stack
-        def _add_page_break_if_needed(
-            output_parts: List[str],
-            item,
-            prev_page_no,
-            page_break_enabled: bool,
-        ):
-            """Inserts a page-break token.
-            Inserts a page-break token if the item's page number is different
-            from the previous item and page breaks are enabled.
-            Returns the updated output_parts list and the current page number.
-            """
-            if not page_break_enabled:
-                return output_parts, prev_page_no
-            if not item.prov:
-                return output_parts, prev_page_no
-            current_page_no = item.prov[0].page_no
-            if prev_page_no is None:
-                return output_parts, current_page_no
-            if current_page_no != prev_page_no:
-                output_parts.append(f"<{DocumentToken.PAGE_BREAK.value}>\n")
-            return output_parts, current_page_no
-        def _get_standalone_captions(document_body):
-            """Identify captions that are not attached to any table or figure."""
-            all_captions = set()
-            matched_captions = set()
-            for item, _ in self.iterate_items(document_body, with_groups=True):
-                if item.label == DocItemLabel.CAPTION:
-                    all_captions.update([item.self_ref])
-                if item.label in [DocItemLabel.PICTURE, DocItemLabel.TABLE]:
-                    matched_captions.update([caption.cref for caption in item.captions])
-            return all_captions - matched_captions
-        # Initialization
-        output_parts: List[str] = []
-        ordered_list_stack: List[bool] = []
-        previous_level = 0
-        previous_page_no = None
-        # Precompute standalone captions
-        standalone_captions = _get_standalone_captions(self.body)
-        # Begin document
-        output_parts.append(f"<{DocumentToken.DOCUMENT.value}>{delim}")
-        for ix, (item, current_level) in enumerate(
-            self.iterate_items(
-                self.body,
-                with_groups=True,
-                included_content_layers={
-                    ContentLayer.BODY,
-                    ContentLayer.FURNITURE,
-                },
-            )
-        ):
-            # Close lists if we've moved to a lower nesting level
-            if current_level < previous_level and ordered_list_stack:
-                ordered_list_stack = _close_lists(
-                    current_level,
-                    previous_level,
-                    ordered_list_stack,
-                    output_parts,
-                )
-            previous_level = current_level
-            # Skip items outside the specified element range
-            if ix < from_element or ix >= to_element:
-                continue
-            # Skip items whose label is not in the allowed set
-            if isinstance(item, DocItem) and (item.label not in labels):
-                continue
-            # Skip captions that are not standalone as they will be included below
-            # by the export functions of Table and Picture
-            if (
-                isinstance(item, TextItem)
-                and item.label == DocItemLabel.CAPTION
-                and item.self_ref not in standalone_captions
-            ):
-                continue
-            # Handle list groups
-            if isinstance(item, GroupItem):
-                if item.label == GroupLabel.ORDERED_LIST:
-                    output_parts.append(f"<{DocumentToken.ORDERED_LIST.value}>{delim}")
-                    ordered_list_stack.append(True)
-                elif item.label == GroupLabel.LIST:
-                    output_parts.append(
-                        f"<{DocumentToken.UNORDERED_LIST.value}>{delim}"
-                    )
-                    ordered_list_stack.append(False)
-                continue
-            # For other item types, optionally insert page-break if the page changed
-            output_parts, previous_page_no = _add_page_break_if_needed(
-                output_parts, item, previous_page_no, add_page_index
-            )
-            if isinstance(item, SectionHeaderItem):
-                output_parts.append(
-                    item.export_to_document_tokens(
-                        doc=self,
-                        new_line=delim,
-                        xsize=xsize,
-                        ysize=ysize,
-                        add_location=add_location,
-                        add_content=add_content,
-                    )
-                )
-            elif isinstance(item, CodeItem):
-                output_parts.append(
-                    item.export_to_document_tokens(
-                        doc=self,
-                        new_line=delim,
-                        xsize=xsize,
-                        ysize=ysize,
-                        add_location=add_location,
-                        add_content=add_content,
-                    )
-                )
-            elif isinstance(item, TextItem):
-                output_parts.append(
-                    item.export_to_document_tokens(
-                        doc=self,
-                        new_line=delim,
-                        xsize=xsize,
-                        ysize=ysize,
-                        add_location=add_location,
-                        add_content=add_content,
-                    )
-                )
-            elif isinstance(item, TableItem):
-                output_parts.append(
-                    item.export_to_document_tokens(
-                        doc=self,
-                        new_line=delim,
-                        xsize=xsize,
-                        ysize=ysize,
-                        add_location=add_location,
-                        add_cell_location=add_table_cell_location,
-                        add_cell_text=add_table_cell_text,
-                        add_caption=True,
-                    )
-                )
-            elif isinstance(item, PictureItem):
-                output_parts.append(
-                    item.export_to_document_tokens(
-                        doc=self,
-                        new_line=delim,
-                        xsize=xsize,
-                        ysize=ysize,
-                        add_caption=True,
-                        add_location=add_location,
-                        add_content=add_content,
-                    )
-                )
-        # End any lists that might still be open
-        ordered_list_stack = _close_lists(
-            0, previous_level, ordered_list_stack, output_parts
+        from docling_core.experimental.serializer.doctags import (
+            DocTagsDocSerializer,
+            DocTagsParams,
         )
-        # End document
-        output_parts.append(f"</{DocumentToken.DOCUMENT.value}>")
-        return "".join(output_parts)
+        my_labels = labels if labels is not None else DOCUMENT_TOKENS_EXPORT_LABELS
+        serializer = DocTagsDocSerializer(
+            doc=self,
+            params=DocTagsParams(
+                labels=my_labels,
+                # layers=...,  # not exposed
+                start_idx=from_element,
+                stop_idx=to_element,
+                xsize=xsize,
+                ysize=ysize,
+                add_location=add_location,
+                # add_caption=...,  # not exposed
+                add_content=add_content,
+                add_page_break=add_page_index,
+                add_table_cell_location=add_table_cell_location,
+                add_table_cell_text=add_table_cell_text,
+                mode=(
+                    DocTagsParams.Mode.MINIFIED
+                    if minified
+                    else DocTagsParams.Mode.HUMAN_FRIENDLY
+                ),
+            ),
+        )
+        ser_res = serializer.serialize()
+        return ser_res.text
     def _export_to_indented_text(
         self,