PyPI - docling-core - Versions diffs - 2.25.0__py3-none-any.whl → 2.26.0__py3-none-any.whl - Mend

docling-core: 2.25.0 (py3-none-any.whl) → 2.26.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling-core might be problematic. Click here for more details.

Files changed (18)

docling_core/experimental/serializer/base.py +23 -2
docling_core/experimental/serializer/common.py +79 -34
docling_core/experimental/serializer/doctags.py +83 -47
docling_core/experimental/serializer/html.py +931 -0
docling_core/experimental/serializer/html_styles.py +212 -0
docling_core/experimental/serializer/markdown.py +95 -57
docling_core/transforms/chunker/base.py +8 -2
docling_core/transforms/chunker/hierarchical_chunker.py +130 -109
docling_core/transforms/chunker/hybrid_chunker.py +54 -12
docling_core/types/doc/document.py +702 -482
docling_core/types/doc/labels.py +2 -0
docling_core/types/doc/page.py +12 -17
docling_core/types/doc/tokens.py +3 -0
{docling_core-2.25.0.dist-info → docling_core-2.26.0.dist-info}/METADATA +1 -1
{docling_core-2.25.0.dist-info → docling_core-2.26.0.dist-info}/RECORD +18 -16
{docling_core-2.25.0.dist-info → docling_core-2.26.0.dist-info}/LICENSE +0 -0
{docling_core-2.25.0.dist-info → docling_core-2.26.0.dist-info}/WHEEL +0 -0
{docling_core-2.25.0.dist-info → docling_core-2.26.0.dist-info}/entry_points.txt +0 -0

docling_core/types/doc/document.py CHANGED

@@ -3,7 +3,6 @@
 import base64
 import copy
 import hashlib
-import html
 import itertools
 import json
 import logging
@@ -12,17 +11,12 @@ import os
 import re
 import sys
 import typing
-import warnings
 from enum import Enum
 from io import BytesIO
 from pathlib import Path
 from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
-from urllib.parse import quote, unquote
-from xml.etree.cElementTree import SubElement, tostring
-from xml.sax.saxutils import unescape
+from urllib.parse import unquote
-import latex2mathml.converter
-import latex2mathml.exceptions
 import pandas as pd
 import yaml
 from PIL import Image as PILImage
@@ -49,13 +43,10 @@ from docling_core.types.doc.labels import (
     GraphCellLabel,
     GraphLinkLabel,
     GroupLabel,
+    PictureClassificationLabel,
 )
 from docling_core.types.doc.tokens import _LOC_PREFIX, DocumentToken, TableToken
-from docling_core.types.doc.utils import (
-    get_html_tag_with_text_direction,
-    get_text_direction,
-    relative_path,
-)
+from docling_core.types.doc.utils import relative_path
 _logger = logging.getLogger(__name__)
@@ -290,22 +281,6 @@ class PictureScatterChartData(PictureChartData):
     points: List[ChartPoint]
-PictureDataType = Annotated[
-    Union[
-        PictureClassificationData,
-        PictureDescriptionData,
-        PictureMoleculeData,
-        PictureMiscData,
-        PictureLineChartData,
-        PictureBarChartData,
-        PictureStackedBarChartData,
-        PicturePieChartData,
-        PictureScatterChartData,
-    ],
-    Field(discriminator="kind"),
-]
 class TableCell(BaseModel):
     """TableCell."""
@@ -391,6 +366,35 @@ class TableData(BaseModel):  # TBD
         return table_data
+class PictureTabularChartData(PictureChartData):
+    """Base class for picture chart data.
+    Attributes:
+        title (str): The title of the chart.
+        chart_data (TableData): Chart data in the table format.
+    """
+    kind: Literal["tabular_chart_data"] = "tabular_chart_data"
+    chart_data: TableData
+PictureDataType = Annotated[
+    Union[
+        PictureClassificationData,
+        PictureDescriptionData,
+        PictureMoleculeData,
+        PictureMiscData,
+        PictureTabularChartData,
+        PictureLineChartData,
+        PictureBarChartData,
+        PictureStackedBarChartData,
+        PicturePieChartData,
+        PictureScatterChartData,
+    ],
+    Field(discriminator="kind"),
+]
 class DocumentOrigin(BaseModel):
     """FileSource."""
@@ -458,8 +462,12 @@ class RefItem(BaseModel):
         populate_by_name=True,
     )
+    def _split_ref_to_path(self):
+        """Get the path of the reference."""
+        return self.cref.split("/")
     def resolve(self, doc: "DoclingDocument"):
-        """resolve."""
+        """Resolve the path in the document."""
         path_components = self.cref.split("/")
         if (num_comps := len(path_components)) == 3:
             _, path, index_str = path_components
@@ -624,10 +632,98 @@ class NodeItem(BaseModel):
     model_config = ConfigDict(extra="forbid")
-    def get_ref(self):
+    def get_ref(self) -> RefItem:
         """get_ref."""
         return RefItem(cref=self.self_ref)
+    def _get_parent_ref(
+        self, doc: "DoclingDocument", stack: list[int]
+    ) -> Optional[RefItem]:
+        """get_parent_ref."""
+        if len(stack) == 0:
+            return self.parent
+        elif len(stack) > 0 and stack[0] < len(self.children):
+            item = self.children[stack[0]].resolve(doc)
+            return item._get_parent_ref(doc=doc, stack=stack[1:])
+        return None
+    def _delete_child(self, doc: "DoclingDocument", stack: list[int]) -> bool:
+        """Delete child node in tree."""
+        if len(stack) == 1 and stack[0] < len(self.children):
+            del self.children[stack[0]]
+            return True
+        elif len(stack) > 1 and stack[0] < len(self.children):
+            item = self.children[stack[0]].resolve(doc)
+            return item._delete_child(doc=doc, stack=stack[1:])
+        return False
+    def _update_child(
+        self, doc: "DoclingDocument", stack: list[int], new_ref: RefItem
+    ) -> bool:
+        """Update child node in tree."""
+        if len(stack) == 1 and stack[0] < len(self.children):
+            # ensure the parent is correct
+            new_item = new_ref.resolve(doc=doc)
+            new_item.parent = self.get_ref()
+            self.children[stack[0]] = new_ref
+            return True
+        elif len(stack) > 1 and stack[0] < len(self.children):
+            item = self.children[stack[0]].resolve(doc)
+            return item._update_child(doc=doc, stack=stack[1:], new_ref=new_ref)
+        return False
+    def _add_child(
+        self, doc: "DoclingDocument", stack: list[int], new_ref: RefItem
+    ) -> bool:
+        """Append child to node identified by stack."""
+        if len(stack) == 0:
+            # ensure the parent is correct
+            new_item = new_ref.resolve(doc=doc)
+            new_item.parent = self.get_ref()
+            self.children.append(new_ref)
+            return True
+        elif len(stack) > 0 and stack[0] < len(self.children):
+            item = self.children[stack[0]].resolve(doc)
+            return item._add_child(doc=doc, stack=stack[1:], new_ref=new_ref)
+        return False
+    def _add_sibling(
+        self,
+        doc: "DoclingDocument",
+        stack: list[int],
+        new_ref: RefItem,
+        after: bool = True,
+    ) -> bool:
+        """Add sibling node in tree."""
+        if len(stack) == 1 and stack[0] < len(self.children) and (not after):
+            # ensure the parent is correct
+            new_item = new_ref.resolve(doc=doc)
+            new_item.parent = self.get_ref()
+            self.children.insert(stack[0], new_ref)
+            return True
+        elif len(stack) == 1 and stack[0] < len(self.children) and (after):
+            # ensure the parent is correct
+            new_item = new_ref.resolve(doc=doc)
+            new_item.parent = self.get_ref()
+            self.children.insert(stack[0] + 1, new_ref)
+            return True
+        elif len(stack) > 1 and stack[0] < len(self.children):
+            item = self.children[stack[0]].resolve(doc)
+            return item._add_sibling(
+                doc=doc, stack=stack[1:], new_ref=new_ref, after=after
+            )
+        return False
 class GroupItem(NodeItem):  # Container type, can't be a leaf node
     """GroupItem."""
@@ -953,7 +1049,9 @@ class FormulaItem(TextItem):
 class PictureItem(FloatingItem):
     """PictureItem."""
-    label: typing.Literal[DocItemLabel.PICTURE] = DocItemLabel.PICTURE
+    label: typing.Literal[DocItemLabel.PICTURE, DocItemLabel.CHART] = (
+        DocItemLabel.PICTURE
+    )
     annotations: List[PictureDataType] = []
@@ -1020,54 +1118,19 @@ class PictureItem(FloatingItem):
         image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
     ) -> str:
         """Export picture to HTML format."""
-        text = ""
-        if add_caption and len(self.captions):
-            text = self.caption_text(doc)
-        caption_text = ""
-        if len(text) > 0:
-            caption_text = get_html_tag_with_text_direction(
-                html_tag="figcaption", text=text
-            )
-        default_response = f"<figure>{caption_text}</figure>"
-        if image_mode == ImageRefMode.PLACEHOLDER:
-            return default_response
-        elif image_mode == ImageRefMode.EMBEDDED:
-            # short-cut: we already have the image in base64
-            if (
-                isinstance(self.image, ImageRef)
-                and isinstance(self.image.uri, AnyUrl)
-                and self.image.uri.scheme == "data"
-            ):
-                img_text = f'<img src="{self.image.uri}">'
-                return f"<figure>{caption_text}{img_text}</figure>"
-            # get the self.image._pil or crop it out of the page-image
-            img = self.get_image(doc)
-            if img is not None:
-                imgb64 = self._image_to_base64(img)
-                img_text = f'<img src="data:image/png;base64,{imgb64}">'
-                return f"<figure>{caption_text}{img_text}</figure>"
-            else:
-                return default_response
-        elif image_mode == ImageRefMode.REFERENCED:
-            if not isinstance(self.image, ImageRef) or (
-                isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "data"
-            ):
-                return default_response
-            img_text = f'<img src="{quote(str(self.image.uri))}">'
-            return f"<figure>{caption_text}{img_text}</figure>"
+        from docling_core.experimental.serializer.html import (
+            HTMLDocSerializer,
+            HTMLParams,
+        )
-        else:
-            return default_response
+        serializer = HTMLDocSerializer(
+            doc=doc,
+            params=HTMLParams(
+                image_mode=image_mode,
+            ),
+        )
+        text = serializer.serialize(item=self).text
+        return text
     @deprecated("Use export_to_doctags() instead.")
     def export_to_document_tokens(self, *args, **kwargs):
@@ -1218,81 +1281,18 @@ class TableItem(FloatingItem):
         add_caption: bool = True,
     ) -> str:
         """Export the table as html."""
-        if doc is None:
-            warnings.warn(
-                "The `doc` argument will be mandatory in a future version. "
-                "It must be provided to include a caption.",
-                DeprecationWarning,
-            )
-        nrows = self.data.num_rows
-        ncols = self.data.num_cols
-        text = ""
-        if doc is not None and add_caption and len(self.captions):
-            text = html.escape(self.caption_text(doc))
-        if len(self.data.table_cells) == 0:
-            return ""
-        body = ""
-        for i in range(nrows):
-            body += "<tr>"
-            for j in range(ncols):
-                cell: TableCell = self.data.grid[i][j]
-                rowspan, rowstart = (
-                    cell.row_span,
-                    cell.start_row_offset_idx,
-                )
-                colspan, colstart = (
-                    cell.col_span,
-                    cell.start_col_offset_idx,
-                )
-                if rowstart != i:
-                    continue
-                if colstart != j:
-                    continue
-                content = html.escape(cell.text.strip())
-                celltag = "td"
-                if cell.column_header:
-                    celltag = "th"
-                opening_tag = f"{celltag}"
-                if rowspan > 1:
-                    opening_tag += f' rowspan="{rowspan}"'
-                if colspan > 1:
-                    opening_tag += f' colspan="{colspan}"'
-                text_dir = get_text_direction(content)
-                if text_dir == "rtl":
-                    opening_tag += f' dir="{dir}"'
-                body += f"<{opening_tag}>{content}</{celltag}>"
-            body += "</tr>"
-        # dir = get_text_direction(text)
-        if len(text) > 0 and len(body) > 0:
-            caption_text = get_html_tag_with_text_direction(
-                html_tag="caption", text=text
-            )
-            body = f"<table>{caption_text}<tbody>{body}</tbody></table>"
+        if doc is not None:
+            from docling_core.experimental.serializer.html import HTMLDocSerializer
-        elif len(text) == 0 and len(body) > 0:
-            body = f"<table><tbody>{body}</tbody></table>"
-        elif len(text) > 0 and len(body) == 0:
-            caption_text = get_html_tag_with_text_direction(
-                html_tag="caption", text=text
-            )
-            body = f"<table>{caption_text}</table>"
+            serializer = HTMLDocSerializer(doc=doc)
+            text = serializer.serialize(item=self).text
+            return text
         else:
-            body = "<table></table>"
-        return body
+            _logger.error(
+                "Usage of TableItem.export_to_html() without `doc` argument is "
+                "deprecated.",
+            )
+            return ""
     def export_to_otsl(
         self,
@@ -1567,76 +1567,6 @@ class PageItem(BaseModel):
 class DoclingDocument(BaseModel):
     """DoclingDocument."""
-    _HTML_DEFAULT_HEAD: str = r"""<head>
-    <link rel="icon" type="image/png"
-    href="https://raw.githubusercontent.com/docling-project/docling/refs/heads/main/docs/assets/logo.svg"/>
-    <meta charset="UTF-8">
-    <title>
-    Powered by Docling
-    </title>
-    <style>
-    html {
-    background-color: LightGray;
-    }
-    body {
-    margin: 0 auto;
-    width:800px;
-    padding: 30px;
-    background-color: White;
-    font-family: Arial, sans-serif;
-    box-shadow: 10px 10px 10px grey;
-    }
-    figure{
-    display: block;
-    width: 100%;
-    margin: 0px;
-    margin-top: 10px;
-    margin-bottom: 10px;
-    }
-    img {
-    display: block;
-    margin: auto;
-    margin-top: 10px;
-    margin-bottom: 10px;
-    max-width: 640px;
-    max-height: 640px;
-    }
-    table {
-    min-width:500px;
-    background-color: White;
-    border-collapse: collapse;
-    cell-padding: 5px;
-    margin: auto;
-    margin-top: 10px;
-    margin-bottom: 10px;
-    }
-    th, td {
-    border: 1px solid black;
-    padding: 8px;
-    }
-    th {
-    font-weight: bold;
-    }
-    table tr:nth-child(even) td{
-    background-color: LightGray;
-    }
-    math annotation {
-    display: none;
-    }
-    .formula-not-decoded {
-    background: repeating-linear-gradient(
-    45deg, /* Angle of the stripes */
-    LightGray, /* First color */
-    LightGray 10px, /* Length of the first color */
-    White 10px, /* Second color */
-    White 20px /* Length of the second color */
-    );
-    margin: 0;
-    text-align: center;
-    }
-    </style>
-    </head>"""
     schema_name: typing.Literal["DoclingDocument"] = "DoclingDocument"
     version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
         CURRENT_VERSION
@@ -1683,6 +1613,364 @@ class DoclingDocument(BaseModel):
                     item["content_layer"] = "furniture"
         return data
+    # ---------------------------
+    # Public Manipulation methods
+    # ---------------------------
+    def append_child_item(
+        self, *, child: NodeItem, parent: Optional[NodeItem] = None
+    ) -> None:
+        """Adds an item."""
+        if len(child.children) > 0:
+            raise ValueError("Can not append a child with children")
+        parent = parent if parent is not None else self.body
+        success, stack = self._get_stack_of_item(item=parent)
+        if not success:
+            raise ValueError(
+                f"Could not resolve the parent node in the document tree: {parent}"
+            )
+        # Append the item to the attributes of the doc
+        self._append_item(item=child, parent_ref=parent.get_ref())
+        # Update the tree of the doc
+        success = self.body._add_child(doc=self, new_ref=child.get_ref(), stack=stack)
+        # Clean the attribute (orphan) if not successful
+        if not success:
+            self._pop_item(item=child)
+            raise ValueError(f"Could not append child: {child} to parent: {parent}")
+    def insert_item_after_sibling(
+        self, *, new_item: NodeItem, sibling: NodeItem
+    ) -> None:
+        """Inserts an item, given its node_item instance, after other as a sibling."""
+        self._insert_item_at_refitem(item=new_item, ref=sibling.get_ref(), after=True)
+    def insert_item_before_sibling(
+        self, *, new_item: NodeItem, sibling: NodeItem
+    ) -> None:
+        """Inserts an item, given its node_item instance, before other as a sibling."""
+        self._insert_item_at_refitem(item=new_item, ref=sibling.get_ref(), after=False)
+    def delete_items(self, *, node_items: List[NodeItem]) -> None:
+        """Deletes an item, given its instance or ref, and any children it has."""
+        refs = []
+        for _ in node_items:
+            refs.append(_.get_ref())
+        self._delete_items(refs=refs)
+    def replace_item(self, *, new_item: NodeItem, old_item: NodeItem) -> None:
+        """Replace item with new item."""
+        self.insert_item_after_sibling(new_item=new_item, sibling=old_item)
+        self.delete_items(node_items=[old_item])
+    # ----------------------------
+    # Private Manipulation methods
+    # ----------------------------
+    def _get_stack_of_item(self, item: NodeItem) -> tuple[bool, list[int]]:
+        """Find the stack indices of the item."""
+        return self._get_stack_of_refitem(ref=item.get_ref())
+    def _get_stack_of_refitem(self, ref: RefItem) -> tuple[bool, list[int]]:
+        """Find the stack indices of the reference."""
+        if ref == self.body.get_ref():
+            return (True, [])
+        node = ref.resolve(doc=self)
+        parent_ref = node._get_parent_ref(doc=self, stack=[])
+        if parent_ref is None:
+            return (False, [])
+        stack: list[int] = []
+        while parent_ref is not None:
+            parent = parent_ref.resolve(doc=self)
+            index = parent.children.index(node.get_ref())
+            stack.insert(0, index)  # prepend the index
+            node = parent
+            parent_ref = node._get_parent_ref(doc=self, stack=[])
+        return (True, stack)
+    def _insert_item_at_refitem(
+        self, item: NodeItem, ref: RefItem, after: bool
+    ) -> RefItem:
+        """Insert node-item using the self-reference."""
+        success, stack = self._get_stack_of_refitem(ref=ref)
+        if not success:
+            raise ValueError(
+                f"Could not insert at {ref.cref}: could not find the stack"
+            )
+        return self._insert_item_at_stack(item=item, stack=stack, after=after)
+    def _append_item(self, *, item: NodeItem, parent_ref: RefItem) -> RefItem:
+        """Append item of its type."""
+        cref: str = ""  # to be updated
+        if isinstance(item, TextItem):
+            item_label = "texts"
+            item_index = len(self.texts)
+            cref = f"#/{item_label}/{item_index}"
+            item.self_ref = cref
+            item.parent = parent_ref
+            self.texts.append(item)
+        elif isinstance(item, TableItem):
+            item_label = "tables"
+            item_index = len(self.tables)
+            cref = f"#/{item_label}/{item_index}"
+            item.self_ref = cref
+            item.parent = parent_ref
+            self.tables.append(item)
+        elif isinstance(item, PictureItem):
+            item_label = "pictures"
+            item_index = len(self.pictures)
+            cref = f"#/{item_label}/{item_index}"
+            item.self_ref = cref
+            item.parent = parent_ref
+            self.pictures.append(item)
+        elif isinstance(item, KeyValueItem):
+            item_label = "key_value_items"
+            item_index = len(self.key_value_items)
+            cref = f"#/{item_label}/{item_index}"
+            item.self_ref = cref
+            item.parent = parent_ref
+            self.key_value_items.append(item)
+        elif isinstance(item, FormItem):
+            item_label = "form_items"
+            item_index = len(self.form_items)
+            cref = f"#/{item_label}/{item_index}"
+            item.self_ref = cref
+            item.parent = parent_ref
+            self.form_items.append(item)
+        else:
+            raise ValueError(f"Item {item} is not supported for insertion")
+        return RefItem(cref=cref)
+    def _pop_item(self, *, item: NodeItem):
+        """Pop the last item of its type."""
+        path = item.self_ref.split("/")
+        if len(path) != 3:
+            raise ValueError(f"Can not pop item with path: {path}")
+        item_label = path[1]
+        item_index = int(path[2])
+        if (
+            len(self.__getattribute__(item_label)) + 1 == item_index
+        ):  # we can only pop the last item
+            del self.__getattribute__(item_label)[item_index]
+        else:
+            msg = f"index:{item_index}, len:{len(self.__getattribute__(item_label))}"
+            raise ValueError(f"Failed to pop: item is not last ({msg})")
+    def _insert_item_at_stack(
+        self, item: NodeItem, stack: list[int], after: bool
+    ) -> RefItem:
+        """Insert node-item using the self-reference."""
+        parent_ref = self.body._get_parent_ref(doc=self, stack=stack)
+        if parent_ref is None:
+            raise ValueError(f"Could not find a parent at stack: {stack}")
+        new_ref = self._append_item(item=item, parent_ref=parent_ref)
+        success = self.body._add_sibling(
+            doc=self, stack=stack, new_ref=new_ref, after=after
+        )
+        if not success:
+            self._pop_item(item=item)
+        return item.get_ref()
+    def _delete_items(self, refs: list[RefItem]) -> bool:
+        """Delete document item using the self-reference."""
+        to_be_deleted_items: dict[tuple[int, ...], str] = {}  # stack to cref
+        # Identify the to_be_deleted_items
+        for item, stack in self._iterate_items_with_stack(with_groups=True):
+            ref = item.get_ref()
+            if ref in refs:
+                to_be_deleted_items[tuple(stack)] = ref.cref
+            substacks = [stack[0 : i + 1] for i in range(len(stack) - 1)]
+            for substack in substacks:
+                if tuple(substack) in to_be_deleted_items:
+                    to_be_deleted_items[tuple(stack)] = ref.cref
+        if len(to_be_deleted_items) == 0:
+            raise ValueError("Nothing to be deleted ...")
+        # Clean the tree, reverse the order to not have to update
+        for stack_, ref_ in reversed(sorted(to_be_deleted_items.items())):
+            success = self.body._delete_child(doc=self, stack=list(stack_))
+            if not success:
+                del to_be_deleted_items[stack_]
+            else:
+                _logger.info(f"deleted item in tree at stack: {stack_} => {ref_}")
+        # Create a new lookup of the orphans:
+        # dict of item_label (`texts`, `tables`, ...) to a
+        # dict of item_label with delta (default = -1).
+        lookup: dict[str, dict[int, int]] = {}
+        for stack_, ref_ in to_be_deleted_items.items():
+            path = ref_.split("/")
+            if len(path) == 3:
+                item_label = path[1]
+                item_index = int(path[2])
+                if item_label not in lookup:
+                    lookup[item_label] = {}
+                lookup[item_label][item_index] = -1
+        # Remove the orphans in reverse order
+        for item_label, item_inds in lookup.items():
+            for item_index, val in reversed(
+                sorted(item_inds.items())
+            ):  # make sure you delete the last in the list first!
+                _logger.debug(f"deleting item in doc for {item_label} for {item_index}")
+                del self.__getattribute__(item_label)[item_index]
+        self._update_breadth_first_with_lookup(
+            node=self.body, refs_to_be_deleted=refs, lookup=lookup
+        )
+        return True
+    # Update the references
+    def _update_ref_with_lookup(
+        self, item_label: str, item_index: int, lookup: dict[str, dict[int, int]]
+    ) -> RefItem:
+        """Update ref with lookup."""
+        if item_label not in lookup:  # Nothing to be done
+            return RefItem(cref=f"#/{item_label}/{item_index}")
+        # Count how many items have been deleted in front of you
+        delta = sum(
+            val if item_index >= key else 0 for key, val in lookup[item_label].items()
+        )
+        new_index = item_index + delta
+        return RefItem(cref=f"#/{item_label}/{new_index}")
+    def _update_refitems_with_lookup(
+        self,
+        ref_items: list[RefItem],
+        refs_to_be_deleted: list[RefItem],
+        lookup: dict[str, dict[int, int]],
+    ) -> list[RefItem]:
+        """Update refitems with lookup."""
+        new_refitems = []
+        for ref_item in ref_items:
+            if (
+                ref_item not in refs_to_be_deleted
+            ):  # if ref_item is in ref, then delete/skip them
+                path = ref_item._split_ref_to_path()
+                if len(path) == 3:
+                    new_refitems.append(
+                        self._update_ref_with_lookup(
+                            item_label=path[1],
+                            item_index=int(path[2]),
+                            lookup=lookup,
+                        )
+                    )
+                else:
+                    new_refitems.append(ref_item)
+        return new_refitems
+    def _update_breadth_first_with_lookup(
+        self,
+        node: NodeItem,
+        refs_to_be_deleted: list[RefItem],
+        lookup: dict[str, dict[int, int]],
+    ):
+        """Update breadth first with lookup."""
+        # Update the captions, references and footnote references
+        if isinstance(node, FloatingItem):
+            node.captions = self._update_refitems_with_lookup(
+                ref_items=node.captions,
+                refs_to_be_deleted=refs_to_be_deleted,
+                lookup=lookup,
+            )
+            node.references = self._update_refitems_with_lookup(
+                ref_items=node.references,
+                refs_to_be_deleted=refs_to_be_deleted,
+                lookup=lookup,
+            )
+            node.footnotes = self._update_refitems_with_lookup(
+                ref_items=node.footnotes,
+                refs_to_be_deleted=refs_to_be_deleted,
+                lookup=lookup,
+            )
+        # Update the self_ref reference
+        if node.parent is not None:
+            path = node.parent._split_ref_to_path()
+            if len(path) == 3:
+                node.parent = self._update_ref_with_lookup(
+                    item_label=path[1], item_index=int(path[2]), lookup=lookup
+                )
+        # Update the parent reference
+        if node.self_ref is not None:
+            path = node.self_ref.split("/")
+            if len(path) == 3:
+                _ref = self._update_ref_with_lookup(
+                    item_label=path[1], item_index=int(path[2]), lookup=lookup
+                )
+                node.self_ref = _ref.cref
+        # Update the child references
+        node.children = self._update_refitems_with_lookup(
+            ref_items=node.children,
+            refs_to_be_deleted=refs_to_be_deleted,
+            lookup=lookup,
+        )
+        for i, child_ref in enumerate(node.children):
+            node = child_ref.resolve(self)
+            self._update_breadth_first_with_lookup(
+                node=node, refs_to_be_deleted=refs_to_be_deleted, lookup=lookup
+            )
     ###################################
     # TODO: refactor add* methods below
     ###################################
@@ -2321,21 +2609,33 @@ class DoclingDocument(BaseModel):
         included_content_layers: Optional[set[ContentLayer]] = None,
         _level: int = 0,  # fixed parameter, carries through the node nesting level
     ) -> typing.Iterable[Tuple[NodeItem, int]]:  # tuple of node and level
-        """iterate_elements.
-        :param root: Optional[NodeItem]:  (Default value = None)
-        :param with_groups: bool:  (Default value = False)
-        :param traverse_pictures: bool:  (Default value = False)
-        :param page_no: Optional[int]:  (Default value = None)
-        :param _level:  (Default value = 0)
-        :param # fixed parameter:
-        :param carries through the node nesting level:
-        """
+        """Iterate elements with level."""
+        for item, stack in self._iterate_items_with_stack(
+            root=root,
+            with_groups=with_groups,
+            traverse_pictures=traverse_pictures,
+            page_no=page_no,
+            included_content_layers=included_content_layers,
+        ):
+            yield item, len(stack)
+    def _iterate_items_with_stack(
+        self,
+        root: Optional[NodeItem] = None,
+        with_groups: bool = False,
+        traverse_pictures: bool = False,
+        page_no: Optional[int] = None,
+        included_content_layers: Optional[set[ContentLayer]] = None,
+        _stack: Optional[list[int]] = None,
+    ) -> typing.Iterable[Tuple[NodeItem, list[int]]]:  # tuple of node and level
+        """Iterate elements with stack."""
         my_layers = (
             included_content_layers
             if included_content_layers is not None
             else DEFAULT_CONTENT_LAYERS
         )
+        my_stack: list[int] = _stack if _stack is not None else []
         if not root:
             root = self.body
@@ -2355,25 +2655,31 @@ class DoclingDocument(BaseModel):
         )
         if should_yield:
-            yield root, _level
+            yield root, my_stack
         # Handle picture traversal - only traverse children if requested
         if isinstance(root, PictureItem) and not traverse_pictures:
             return
+        my_stack.append(-1)
         # Traverse children
-        for child_ref in root.children:
+        for child_ind, child_ref in enumerate(root.children):
+            my_stack[-1] = child_ind
             child = child_ref.resolve(self)
             if isinstance(child, NodeItem):
-                yield from self.iterate_items(
+                yield from self._iterate_items_with_stack(
                     child,
                     with_groups=with_groups,
                     traverse_pictures=traverse_pictures,
                     page_no=page_no,
-                    _level=_level + 1,
+                    _stack=my_stack,
                     included_content_layers=my_layers,
                 )
+        my_stack.pop()
     def _clear_picture_pil_cache(self):
         """Clear cache storage of all images."""
         for item, level in self.iterate_items(with_groups=False):
@@ -2646,6 +2952,7 @@ class DoclingDocument(BaseModel):
         strict_text: bool = False,
         escape_underscores: bool = True,
         image_placeholder: str = "<!-- image -->",
+        enable_chart_tables: bool = True,
         image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
         indent: int = 4,
         text_width: int = -1,
@@ -2713,6 +3020,7 @@ class DoclingDocument(BaseModel):
                 stop_idx=to_element,
                 escape_underscores=escape_underscores,
                 image_placeholder=image_placeholder,
+                enable_chart_tables=enable_chart_tables,
                 image_mode=image_mode,
                 indent=indent,
                 wrap_width=text_width if text_width > 0 else None,
@@ -2763,12 +3071,14 @@ class DoclingDocument(BaseModel):
         formula_to_mathml: bool = True,
         page_no: Optional[int] = None,
         html_lang: str = "en",
-        html_head: str = _HTML_DEFAULT_HEAD,
+        html_head: str = "null",  # should be deprecated
         included_content_layers: Optional[set[ContentLayer]] = None,
+        split_page_view: bool = False,
     ):
         """Save to HTML."""
         if isinstance(filename, str):
             filename = Path(filename)
         artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
         if image_mode == ImageRefMode.REFERENCED:
@@ -2788,6 +3098,7 @@ class DoclingDocument(BaseModel):
             html_lang=html_lang,
             html_head=html_head,
             included_content_layers=included_content_layers,
+            split_page_view=split_page_view,
         )
         with open(filename, "w", encoding="utf-8") as fw:
@@ -2836,245 +3147,51 @@ class DoclingDocument(BaseModel):
         formula_to_mathml: bool = True,
         page_no: Optional[int] = None,
         html_lang: str = "en",
-        html_head: str = _HTML_DEFAULT_HEAD,
+        html_head: str = "null",  # should be deprecated ...
         included_content_layers: Optional[set[ContentLayer]] = None,
+        split_page_view: bool = False,
     ) -> str:
         r"""Serialize to HTML."""
-        my_labels = labels if labels is not None else DEFAULT_EXPORT_LABELS
+        from docling_core.experimental.serializer.html import (
+            HTMLDocSerializer,
+            HTMLOutputStyle,
+            HTMLParams,
+        )
+        my_labels = labels if labels is not None else DOCUMENT_TOKENS_EXPORT_LABELS
         my_layers = (
             included_content_layers
             if included_content_layers is not None
             else DEFAULT_CONTENT_LAYERS
         )
-        def close_lists(
-            curr_level: int,
-            prev_level: int,
-            in_ordered_list: List[bool],
-            html_texts: list[str],
-        ):
-            if len(in_ordered_list) == 0:
-                return (in_ordered_list, html_texts)
-            while curr_level < prev_level and len(in_ordered_list) > 0:
-                if in_ordered_list[-1]:
-                    html_texts.append("</ol>")
-                else:
-                    html_texts.append("</ul>")
-                prev_level -= 1
-                in_ordered_list.pop()  # = in_ordered_list[:-1]
-            return (in_ordered_list, html_texts)
-        head_lines = [
-            "<!DOCTYPE html>",
-            f'<html lang="{html_lang}">',
-            html_head,
-        ]
-        html_texts: list[str] = []
-        prev_level = 0  # Track the previous item's level
-        in_ordered_list: List[bool] = []  # False
-        def _prepare_tag_content(
-            text: str, do_escape_html=True, do_replace_newline=True
-        ) -> str:
-            if do_escape_html:
-                text = html.escape(text, quote=False)
-            if do_replace_newline:
-                text = text.replace("\n", "<br>")
-            return text
-        for ix, (item, curr_level) in enumerate(
-            self.iterate_items(
-                self.body,
-                with_groups=True,
-                page_no=page_no,
-                included_content_layers=my_layers,
-            )
-        ):
-            # If we've moved to a lower level, we're exiting one or more groups
-            if curr_level < prev_level and len(in_ordered_list) > 0:
-                # Calculate how many levels we've exited
-                # level_difference = previous_level - level
-                # Decrement list_nesting_level for each list group we've exited
-                # list_nesting_level = max(0, list_nesting_level - level_difference)
-                in_ordered_list, html_texts = close_lists(
-                    curr_level=curr_level,
-                    prev_level=prev_level,
-                    in_ordered_list=in_ordered_list,
-                    html_texts=html_texts,
-                )
-            prev_level = curr_level  # Update previous_level for next iteration
-            if ix < from_element or to_element <= ix:
-                continue  # skip as many items as you want
-            if (isinstance(item, DocItem)) and (item.label not in my_labels):
-                continue  # skip any label that is not whitelisted
+        output_style = HTMLOutputStyle.SINGLE_COLUMN
+        if split_page_view:
+            output_style = HTMLOutputStyle.SPLIT_PAGE
-            if isinstance(item, GroupItem) and item.label in [
-                GroupLabel.ORDERED_LIST,
-            ]:
-                text = "<ol>"
-                html_texts.append(text)
-                # Increment list nesting level when entering a new list
-                in_ordered_list.append(True)
-            elif isinstance(item, GroupItem) and item.label in [
-                GroupLabel.LIST,
-            ]:
-                text = "<ul>"
-                html_texts.append(text)
-                # Increment list nesting level when entering a new list
-                in_ordered_list.append(False)
-            elif isinstance(item, GroupItem):
-                continue
-            elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
-                text_inner = _prepare_tag_content(item.text)
-                text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)
-                html_texts.append(text)
-            elif isinstance(item, SectionHeaderItem):
-                section_level: int = min(item.level + 1, 6)
-                text = get_html_tag_with_text_direction(
-                    html_tag=f"h{section_level}",
-                    text=_prepare_tag_content(item.text),
-                )
-                html_texts.append(text)
-            elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
-                math_formula = _prepare_tag_content(
-                    item.text, do_escape_html=False, do_replace_newline=False
-                )
-                text = ""
-                def _image_fallback(item: TextItem):
-                    item_image = item.get_image(doc=self)
-                    if item_image is not None:
-                        img_ref = ImageRef.from_pil(item_image, dpi=72)
-                        return (
-                            "<figure>"
-                            f'<img src="{img_ref.uri}" alt="{item.orig}" />'
-                            "</figure>"
-                        )
-                img_fallback = _image_fallback(item)
-                # If the formula is not processed correcty, use its image
-                if (
-                    item.text == ""
-                    and item.orig != ""
-                    and image_mode == ImageRefMode.EMBEDDED
-                    and len(item.prov) > 0
-                    and img_fallback is not None
-                ):
-                    text = img_fallback
-                # Building a math equation in MathML format
-                # ref https://www.w3.org/TR/wai-aria-1.1/#math
-                elif formula_to_mathml and len(math_formula) > 0:
-                    try:
-                        mathml_element = latex2mathml.converter.convert_to_element(
-                            math_formula, display="block"
-                        )
-                        annotation = SubElement(
-                            mathml_element, "annotation", dict(encoding="TeX")
-                        )
-                        annotation.text = math_formula
-                        mathml = unescape(tostring(mathml_element, encoding="unicode"))
-                        text = f"<div>{mathml}</div>"
-                    except Exception as err:
-                        _logger.warning(
-                            "Malformed formula cannot be rendered. "
-                            f"Error {err.__class__.__name__}, formula={math_formula}"
-                        )
-                        if (
-                            image_mode == ImageRefMode.EMBEDDED
-                            and len(item.prov) > 0
-                            and img_fallback is not None
-                        ):
-                            text = img_fallback
-                        else:
-                            text = f"<pre>{math_formula}</pre>"
-                elif math_formula != "":
-                    text = f"<pre>{math_formula}</pre>"
-                if text != "":
-                    html_texts.append(text)
-                else:
-                    html_texts.append(
-                        '<div class="formula-not-decoded">Formula not decoded</div>'
-                    )
-            elif isinstance(item, ListItem):
-                text = get_html_tag_with_text_direction(
-                    html_tag="li", text=_prepare_tag_content(item.text)
-                )
-                html_texts.append(text)
-            elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
-                text = get_html_tag_with_text_direction(
-                    html_tag="li", text=_prepare_tag_content(item.text)
-                )
-                html_texts.append(text)
-            elif isinstance(item, CodeItem):
-                code_text = _prepare_tag_content(
-                    item.text, do_escape_html=False, do_replace_newline=False
-                )
-                text = f"<pre><code>{code_text}</code></pre>"
-                html_texts.append(text)
-            elif isinstance(item, TextItem):
-                text = get_html_tag_with_text_direction(
-                    html_tag="p", text=_prepare_tag_content(item.text)
-                )
-                html_texts.append(text)
-            elif isinstance(item, TableItem):
-                text = item.export_to_html(doc=self, add_caption=True)
-                html_texts.append(text)
-            elif isinstance(item, PictureItem):
-                html_texts.append(
-                    item.export_to_html(
-                        doc=self, add_caption=True, image_mode=image_mode
-                    )
-                )
-            elif isinstance(item, DocItem) and item.label in my_labels:
-                continue
-        html_texts.append("</html>")
+        params = HTMLParams(
+            labels=my_labels,
+            layers=my_layers,
+            pages={page_no} if page_no is not None else None,
+            start_idx=from_element,
+            stop_idx=to_element,
+            image_mode=image_mode,
+            formula_to_mathml=formula_to_mathml,
+            html_head=html_head,
+            html_lang=html_lang,
+            output_style=output_style,
+        )
-        lines = []
-        lines.extend(head_lines)
-        lines.extend(html_texts)
+        if html_head == "null":
+            params.html_head = None
-        delim = "\n"
-        html_text = (delim.join(lines)).strip()
+        serializer = HTMLDocSerializer(
+            doc=self,
+            params=params,
+        )
+        ser_res = serializer.serialize()
-        return html_text
+        return ser_res.text
     def load_from_doctags(  # noqa: C901
         self,
@@ -3105,6 +3222,8 @@ class DoclingDocument(BaseModel):
         def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
             """Extract <loc_...> coords from the chunk, normalized by / 500."""
             coords = re.findall(r"<loc_(\d+)>", text_chunk)
+            if len(coords) > 4:
+                coords = coords[:4]
             if len(coords) == 4:
                 l, t, r, b = map(float, coords)
                 return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
@@ -3135,11 +3254,28 @@ class DoclingDocument(BaseModel):
         def otsl_parse_texts(texts, tokens):
             split_word = TableToken.OTSL_NL.value
+            # CLEAN tokens from extra tags, only structural OTSL allowed
+            clean_tokens = []
+            for t in tokens:
+                if t in [
+                    TableToken.OTSL_ECEL.value,
+                    TableToken.OTSL_FCEL.value,
+                    TableToken.OTSL_LCEL.value,
+                    TableToken.OTSL_UCEL.value,
+                    TableToken.OTSL_XCEL.value,
+                    TableToken.OTSL_NL.value,
+                    TableToken.OTSL_CHED.value,
+                    TableToken.OTSL_RHED.value,
+                    TableToken.OTSL_SROW.value,
+                ]:
+                    clean_tokens.append(t)
+            tokens = clean_tokens
             split_row_tokens = [
                 list(y)
                 for x, y in itertools.groupby(tokens, lambda z: z == split_word)
                 if not x
             ]
             table_cells = []
             r_idx = 0
             c_idx = 0
@@ -3291,6 +3427,40 @@ class DoclingDocument(BaseModel):
                 table_cells=table_cells,
             )
+        def extract_chart_type(text_chunk: str):
+            """Return the chart classification named by a tag in ``text_chunk``.
+
+            Each candidate is probed as a ``<name>`` substring, in a fixed
+            order; the first one present decides the result. ``None`` is
+            returned when no candidate tag occurs in the chunk.
+            """
+            # Current SmolDocling can predict different labels; these aliases
+            # are folded onto the classification labels they stand for.
+            alias_to_label = {
+                "line": PictureClassificationLabel.LINE_CHART,
+                "dot_line": PictureClassificationLabel.LINE_CHART,
+                "vbar_categorical": PictureClassificationLabel.BAR_CHART,
+                "hbar_categorical": PictureClassificationLabel.BAR_CHART,
+            }
+            # Probe order matters: classification labels first, then the
+            # aliases in the order they are declared above.
+            candidates = [
+                PictureClassificationLabel.PIE_CHART,
+                PictureClassificationLabel.BAR_CHART,
+                PictureClassificationLabel.STACKED_BAR_CHART,
+                PictureClassificationLabel.LINE_CHART,
+                PictureClassificationLabel.FLOW_CHART,
+                PictureClassificationLabel.SCATTER_CHART,
+                PictureClassificationLabel.HEATMAP,
+                *alias_to_label,
+            ]
+            for candidate in candidates:
+                if f"<{candidate}>" not in text_chunk:
+                    continue
+                # Aliases resolve through the table; anything else is used as is.
+                resolved = alias_to_label.get(candidate, candidate)
+                return PictureClassificationLabel(resolved)
+            return None
         def parse_key_value_item(
             tokens: str, image: Optional[PILImage.Image] = None
         ) -> Tuple[GraphData, Optional[ProvenanceItem]]:
@@ -3422,10 +3592,9 @@ class DoclingDocument(BaseModel):
                 rf"{DocumentToken.ORDERED_LIST.value}|"
                 rf"{DocumentToken.UNORDERED_LIST.value}|"
                 rf"{DocItemLabel.KEY_VALUE_REGION}|"
+                rf"{DocumentToken.CHART.value}|"
                 rf"{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
             )
-            # DocumentToken.OTSL
             pattern = re.compile(tag_pattern, re.DOTALL)
             # Go through each match in order
@@ -3433,18 +3602,17 @@ class DoclingDocument(BaseModel):
                 full_chunk = match.group(0)
                 tag_name = match.group("tag")
-                bbox = extract_bounding_box(full_chunk) if image else None
+                bbox = extract_bounding_box(full_chunk)  # Extracts first bbox
                 doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
                 if tag_name == DocumentToken.OTSL.value:
                     table_data = parse_table_content(full_chunk)
-                    bbox = extract_bounding_box(full_chunk) if image else None
                     caption, caption_bbox = extract_caption(full_chunk)
                     if caption is not None and caption_bbox is not None:
                         caption.prov.append(
                             ProvenanceItem(
                                 bbox=caption_bbox.resize_by_scale(pg_width, pg_height),
-                                charspan=(0, 0),
+                                charspan=(0, len(caption.text)),
                                 page_no=page_no,
                             )
                         )
@@ -3458,8 +3626,13 @@ class DoclingDocument(BaseModel):
                     else:
                         self.add_table(data=table_data, caption=caption)
-                elif tag_name == DocItemLabel.PICTURE:
-                    text_caption_content = extract_inner_text(full_chunk)
+                elif tag_name in [DocItemLabel.PICTURE, DocItemLabel.CHART]:
+                    caption, caption_bbox = extract_caption(full_chunk)
+                    table_data = None
+                    chart_type = None
+                    if tag_name == DocumentToken.CHART.value:
+                        table_data = parse_table_content(full_chunk)
+                        chart_type = extract_chart_type(full_chunk)
                     if image:
                         if bbox:
                             im_width, im_height = image.size
@@ -3483,30 +3656,77 @@ class DoclingDocument(BaseModel):
                                 ),
                             )
                             # If there is a caption to an image, add it as well
-                            if len(text_caption_content) > 0:
-                                caption_item = self.add_text(
-                                    label=DocItemLabel.CAPTION,
-                                    text=text_caption_content,
-                                    parent=None,
+                            if caption is not None and caption_bbox is not None:
+                                caption.prov.append(
+                                    ProvenanceItem(
+                                        bbox=caption_bbox.resize_by_scale(
+                                            pg_width, pg_height
+                                        ),
+                                        charspan=(0, len(caption.text)),
+                                        page_no=page_no,
+                                    )
                                 )
-                                pic.captions.append(caption_item.get_ref())
+                                pic.captions.append(caption.get_ref())
+                            pic_title = "picture"
+                            if chart_type is not None:
+                                pic.annotations.append(
+                                    PictureClassificationData(
+                                        provenance="load_from_doctags",
+                                        predicted_classes=[
+                                            # chart_type
+                                            PictureClassificationClass(
+                                                class_name=chart_type, confidence=1.0
+                                            )
+                                        ],
+                                    )
+                                )
+                                pic_title = chart_type
+                            if table_data is not None:
+                                # Add chart data as PictureTabularChartData
+                                pd = PictureTabularChartData(
+                                    chart_data=table_data, title=pic_title
+                                )
+                                pic.annotations.append(pd)
                     else:
                         if bbox:
                             # In case we don't have access to an binary of an image
-                            self.add_picture(
+                            pic = self.add_picture(
                                 parent=None,
                                 prov=ProvenanceItem(
                                     bbox=bbox, charspan=(0, 0), page_no=page_no
                                 ),
                             )
                             # If there is a caption to an image, add it as well
-                            if len(text_caption_content) > 0:
-                                caption_item = self.add_text(
-                                    label=DocItemLabel.CAPTION,
-                                    text=text_caption_content,
-                                    parent=None,
+                            if caption is not None and caption_bbox is not None:
+                                caption.prov.append(
+                                    ProvenanceItem(
+                                        bbox=caption_bbox.resize_by_scale(
+                                            pg_width, pg_height
+                                        ),
+                                        charspan=(0, len(caption.text)),
+                                        page_no=page_no,
+                                    )
+                                )
+                                pic.captions.append(caption.get_ref())
+                            if chart_type is not None:
+                                pic.annotations.append(
+                                    PictureClassificationData(
+                                        provenance="load_from_doctags",
+                                        predicted_classes=[
+                                            # chart_type
+                                            PictureClassificationClass(
+                                                class_name=chart_type, confidence=1.0
+                                            )
+                                        ],
+                                    )
                                 )
-                                pic.captions.append(caption_item.get_ref())
+                            if table_data is not None:
+                                # Add chart data as PictureTabularChartData
+                                pd = PictureTabularChartData(
+                                    chart_data=table_data, title=pic_title
+                                )
+                                pic.annotations.append(pd)
                 elif tag_name == DocItemLabel.KEY_VALUE_REGION:
                     key_value_data, kv_item_prov = parse_key_value_item(
                         full_chunk, image

docling-core 2.25.0__py3-none-any.whl → 2.26.0__py3-none-any.whl

Potentially problematic release.

docling-core 2.25.0py3-none-any.whl → 2.26.0py3-none-any.whl