docling-core 1.1.4__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic; consult the registry's release details page for more information.

@@ -0,0 +1,6 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Data transformations package."""
@@ -0,0 +1,15 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define the chunker types."""
7
+
8
+ from docling_core.transforms.chunker.base import ( # noqa
9
+ BaseChunker,
10
+ Chunk,
11
+ ChunkWithMetadata,
12
+ )
13
+ from docling_core.transforms.chunker.hierarchical_chunker import ( # noqa
14
+ HierarchicalChunker,
15
+ )
@@ -0,0 +1,45 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define base classes for chunking."""
7
+ from abc import ABC, abstractmethod
8
+ from typing import Iterator, Optional
9
+
10
+ from pydantic import BaseModel
11
+
12
+ from docling_core.types import BoundingBox, Document
13
+
14
+
15
class Chunk(BaseModel):
    """Data model for Chunk."""

    # JSON-path of the originating item within the document,
    # e.g. "$.main-text[3]" (see HierarchicalChunker._create_path)
    path: str
    # the chunk's text payload
    text: str
20
+
21
+
22
class ChunkWithMetadata(Chunk):
    """Data model for Chunk including metadata.

    `page` and `bbox` carry provenance information when the source item
    has any, and are None otherwise. The defaults are stated explicitly
    because pydantic v2 treats an `Optional[...]` annotation *without* a
    default as a required field — without `= None` these "optional"
    metadata fields would have to be passed on every construction.
    """

    # 1-based page number of the source item, if provenance is available
    page: Optional[int] = None
    # bounding box of the source item, if provenance is available
    bbox: Optional[BoundingBox] = None
27
+
28
+
29
class BaseChunker(BaseModel, ABC):
    """Base class for Chunker.

    Concrete chunkers (e.g. HierarchicalChunker) subclass this and
    implement `chunk`. Being a pydantic model, configuration options are
    declared as model fields on the subclass.
    """

    @abstractmethod
    def chunk(self, dl_doc: Document, **kwargs) -> Iterator[Chunk]:
        """Chunk the provided document.

        Args:
            dl_doc (Document): document to chunk

        Raises:
            NotImplementedError: in this abstract implementation

        Yields:
            Iterator[Chunk]: iterator over extracted chunks
        """
        raise NotImplementedError()
@@ -0,0 +1,337 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Chunker implementation leveraging the document structure."""
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ from enum import Enum
12
+ from typing import Any, Iterator, Optional, Union
13
+
14
+ import pandas as pd
15
+ from pydantic import BaseModel, PositiveInt
16
+
17
+ from docling_core.transforms.chunker import BaseChunker, Chunk, ChunkWithMetadata
18
+ from docling_core.types import BaseText
19
+ from docling_core.types import Document as DLDocument
20
+ from docling_core.types import Ref, Table
21
+
22
+ _logger = logging.getLogger(__name__)
23
+
24
+
25
class HierarchicalChunker(BaseChunker):
    """Chunker implementation leveraging the document layout."""

    # whether to emit ChunkWithMetadata (page/bbox) instead of plain Chunk
    include_metadata: bool = True
    # chunks whose concatenated text is shorter than this are dropped
    min_chunk_len: PositiveInt = 64

    class _NodeType(str, Enum):
        # normalized values of main-text item `obj_type`
        PARAGRAPH = "paragraph"
        SUBTITLE_LEVEL_1 = "subtitle-level-1"
        TABLE = "table"
        CAPTION = "caption"

    class _NodeName(str, Enum):
        # normalized values of main-text item `name`
        TITLE = "title"
        REFERENCE = "reference"
        LIST_ITEM = "list-item"
        SUBTITLE_LEVEL_1 = "subtitle-level-1"

    # item types eligible for chunking
    _allowed_types: list[str] = [
        _NodeType.PARAGRAPH,
        _NodeType.SUBTITLE_LEVEL_1,
        _NodeType.TABLE,
        _NodeType.CAPTION,
    ]
    # per-type item names excluded from chunking (e.g. bibliography entries)
    _disallowed_names_by_type: dict[str, list[str]] = {
        _NodeType.PARAGRAPH: [
            _NodeName.REFERENCE,
        ],
    }

    @classmethod
    def _norm(cls, text: Optional[str]) -> Optional[str]:
        """Lowercase *text*, passing None through unchanged."""
        return text.lower() if text is not None else None

    @classmethod
    def _convert_table_to_dataframe(cls, table: Table) -> Optional[pd.DataFrame]:
        """Convert a Table's cell texts to a DataFrame, or None if it has no data."""
        if table.data:
            table_content = [[cell.text for cell in row] for row in table.data]
            return pd.DataFrame(table_content)
        else:
            return None

    @classmethod
    def _triplet_serialize(cls, table) -> Optional[str]:
        """Serialize *table* as "row, col = value" triplets joined by ". ".

        Row 0 is taken as the column headers and column 0 as the row
        headers; tables smaller than 2x2 (or without data) yield None.
        """
        output_text: Optional[str] = None
        table_df = cls._convert_table_to_dataframe(table)
        if table_df is not None and table_df.shape[0] > 1 and table_df.shape[1] > 1:
            rows = [item.strip() for item in table_df.iloc[:, 0].to_list()]
            cols = [item.strip() for item in table_df.iloc[0, :].to_list()]
            nrows = table_df.shape[0]
            ncols = table_df.shape[1]
            # one triplet per body cell (headers excluded)
            texts = [
                f"{rows[i]}, {cols[j]} = {table_df.iloc[i, j].strip()}"
                for i in range(1, nrows)
                for j in range(1, ncols)
            ]
            output_text = ". ".join(texts)

        return output_text

    @classmethod
    def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
        """Build the JSON-path string for position *pos* under *path_prefix*."""
        return f"$.{path_prefix}[{pos}]"

    class _MainTextItemNode(BaseModel):
        # parent/children are indexes into doc.main_text
        parent: Optional[int] = None
        children: list[int] = []

    class _TitleInfo(BaseModel):
        # title text and the JSON-path where it was found
        text: str
        path_in_doc: str

    class _GlobalContext(BaseModel):
        title: Optional[_HC._TitleInfo] = None

    class _DocContext(BaseModel):
        dmap: dict[int, _HC._MainTextItemNode]  # main text element context
        glob: _HC._GlobalContext  # global context

        @classmethod
        def from_doc(cls, doc: DLDocument) -> _HC._DocContext:
            """Build the parent/child map and global context for *doc*.

            Section headers (subtitle-level-1) become parents of the items
            following them; an item immediately followed by list-items
            becomes the parent of that run of list-items. The document
            title is taken from the description if present, otherwise from
            the first title-named (or first header) item.
            """
            dmap: dict[int, _HC._MainTextItemNode] = {}
            glob: _HC._GlobalContext = _HC._GlobalContext()
            if doc.description.title:
                glob.title = _HC._TitleInfo(
                    text=doc.description.title,
                    path_in_doc="description.title",
                )

            parent = None
            if doc.main_text:
                idx = 0
                while idx < len(doc.main_text):
                    item = doc.main_text[idx]
                    # first item named "title" wins, if no title known yet
                    if (
                        not glob.title
                        and isinstance(item, BaseText)
                        and _HC._norm(item.name) == _HC._NodeName.TITLE
                    ):
                        glob.title = _HC._TitleInfo(
                            text=item.text,
                            path_in_doc=_HC._create_path(idx),
                        )

                    # start of a subtitle-level-1 parent
                    if (
                        isinstance(item, BaseText)
                        and _HC._norm(item.obj_type) == _HC._NodeType.SUBTITLE_LEVEL_1
                    ):
                        dmap[idx] = _HC._MainTextItemNode(parent=None)
                        parent = idx
                        # fall back to the first header as document title
                        if not glob.title:
                            glob.title = _HC._TitleInfo(
                                text=item.text,
                                path_in_doc=_HC._create_path(idx),
                            )

                    # start of a list parent
                    elif (
                        isinstance(item, BaseText)
                        and _HC._norm(item.name) != _HC._NodeName.LIST_ITEM
                        and idx + 1 < len(doc.main_text)
                        and _HC._norm(doc.main_text[idx + 1].name)
                        == _HC._NodeName.LIST_ITEM
                    ):
                        if parent is not None:
                            dmap[parent].children.append(idx)
                        dmap[idx] = _HC._MainTextItemNode(parent=parent)

                        # have all children register locally
                        li = idx + 1
                        while (
                            li < len(doc.main_text)
                            and _HC._norm(doc.main_text[li].name)
                            == _HC._NodeName.LIST_ITEM
                        ):
                            dmap[idx].children.append(li)
                            dmap[li] = _HC._MainTextItemNode(parent=idx)
                            li += 1
                        # resume scanning after the consumed list-items
                        idx = li
                        continue

                    # normal case
                    else:
                        if parent is not None:
                            dmap[parent].children.append(idx)
                        dmap[idx] = _HC._MainTextItemNode(parent=parent)

                    idx += 1
            else:
                # no main_text: nothing to map
                pass
            return cls(
                dmap=dmap,
                glob=glob,
            )

    class _TextEntry(BaseModel):
        # one text fragment plus its JSON-path of origin
        text: str
        path: str

    def _build_chunk_impl(
        self, doc: DLDocument, doc_map: _DocContext, idx: int, rec: bool = False
    ) -> list[_TextEntry]:
        """Collect the text entries making up the chunk rooted at *idx*.

        Ancestor texts are prepended via recursion (rec=True marks the
        recursive calls); list-item children are appended to their parent.
        Returns [] for items that must not form a chunk of their own.
        """
        if doc.main_text:
            item = doc.main_text[idx]
            item_type = _HC._norm(item.obj_type)
            item_name = _HC._norm(item.name)
            if (
                item_type not in self._allowed_types
                or item_name in self._disallowed_names_by_type.get(item_type, [])
            ):
                return []

            c2p = doc_map.dmap

            text_entries: list[_HC._TextEntry] = []
            if (
                isinstance(item, Ref)
                and item_type == _HC._NodeType.TABLE
                and doc.tables
            ):
                # resolve table reference
                ref_nr = int(item.ref.split("/")[2])  # e.g. '#/tables/0'
                table = doc.tables[ref_nr]
                ser_out = _HC._triplet_serialize(table)
                if table.data:
                    text_entries = (
                        [
                            self._TextEntry(
                                text=ser_out,
                                path=self._create_path(idx),
                            )
                        ]
                        if ser_out
                        else []
                    )
                else:
                    return []
            elif isinstance(item, BaseText):
                text_entries = [
                    self._TextEntry(
                        text=item.text,
                        path=self._create_path(idx),
                    )
                ]

            # squash in any children of type list-item
            if not rec:
                if (
                    c2p[idx].children
                    and _HC._norm(doc.main_text[c2p[idx].children[0]].name)
                    == _HC._NodeName.LIST_ITEM
                ):
                    text_entries = text_entries + [
                        self._TextEntry(
                            text=doc.main_text[c].text,  # type: ignore[union-attr]
                            path=self._create_path(c),
                        )
                        for c in c2p[idx].children
                        if isinstance(doc.main_text[c], BaseText)
                        and _HC._norm(doc.main_text[c].name) == _HC._NodeName.LIST_ITEM
                    ]
                # list-items and headers are emitted via their parent, not alone
                elif item_name in [
                    _HC._NodeName.LIST_ITEM,
                    _HC._NodeName.SUBTITLE_LEVEL_1,
                ]:
                    return []

            if (parent := c2p[idx].parent) is not None:
                # prepend with ancestors
                return (
                    self._build_chunk_impl(
                        doc=doc, doc_map=doc_map, idx=parent, rec=True
                    )
                    + text_entries
                )
            else:
                # if root, augment with title (if available and different)
                return text_entries
        else:
            return []

    def _build_chunk(
        self,
        doc: DLDocument,
        doc_map: _DocContext,
        idx: int,
        delim: str,
        rec: bool = False,
    ) -> Optional[Chunk]:
        """Assemble the chunk for main-text index *idx*, or None.

        Returns None when the concatenated text is shorter than
        min_chunk_len or the item is a non-table reference.
        """
        texts = self._build_chunk_impl(doc=doc, doc_map=doc_map, idx=idx, rec=rec)
        concat = delim.join([t.text for t in texts if t.text])
        assert doc.main_text is not None
        if len(concat) >= self.min_chunk_len:
            orig_item = doc.main_text[idx]
            item: Union[BaseText, Table]
            if isinstance(orig_item, Ref):
                if _HC._norm(orig_item.obj_type) == _HC._NodeType.TABLE and doc.tables:
                    pos = int(orig_item.ref.split("/")[2])
                    item = doc.tables[pos]
                    # path points at the resolved table, not the reference
                    path = self._create_path(pos, path_prefix="tables")
                else:  # currently disregarding non-table references
                    return None
            else:
                item = orig_item
                path = self._create_path(idx)

            if self.include_metadata:
                return ChunkWithMetadata(
                    text=concat,
                    path=path,
                    # provenance of the first prov entry, if any
                    page=item.prov[0].page if item.prov else None,
                    bbox=item.prov[0].bbox if item.prov else None,
                )
            else:
                return Chunk(
                    text=concat,
                    path=path,
                )
        else:
            return None

    def chunk(self, dl_doc: DLDocument, delim="\n", **kwargs: Any) -> Iterator[Chunk]:
        r"""Chunk the provided document.

        Args:
            dl_doc (DLDocument): document to chunk
            delim (str, optional): delimiter to use when concatenating sub-items.
                Defaults to "\n".

        Yields:
            Iterator[Chunk]: iterator over extracted chunks
        """
        if dl_doc.main_text:
            # extract doc structure incl. metadata for
            # each item (e.g. parent, children)
            doc_ctx = self._DocContext.from_doc(doc=dl_doc)
            _logger.debug(f"{doc_ctx.model_dump()=}")

            for i, item in enumerate(dl_doc.main_text):
                # only text items and table references can seed a chunk
                if (
                    isinstance(item, BaseText)
                    or _HC._norm(item.obj_type) == _HC._NodeType.TABLE
                ):
                    chunk = self._build_chunk(
                        doc=dl_doc, doc_map=doc_ctx, idx=i, delim=delim
                    )
                    if chunk:
                        _logger.info(f"{i=}, {chunk=}")
                        yield chunk


# short alias used inside the class body; nested classes cannot reference
# the (not yet bound) enclosing class name directly in annotations
_HC = HierarchicalChunker
@@ -131,6 +131,7 @@ class GlmTableCell(TableCell):
131
131
  class BaseCell(AliasModel):
132
132
  """Base cell."""
133
133
 
134
+ # FIXME: we need to check why we have bounding_box (this should be in prov)
134
135
  bounding_box: Optional[BoundingBoxContainer] = Field(
135
136
  default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
136
137
  )
@@ -152,6 +153,11 @@ class Table(BaseCell):
152
153
  model: Optional[str] = None
153
154
 
154
155
 
156
# FIXME: let's add some figure specific data-types later
class Figure(BaseCell):
    """Figure.

    Currently a plain BaseCell; exists as a distinct type so figures can
    be told apart from other cells (e.g. in MinimalDocument.figures).
    """
159
+
160
+
155
161
  class BaseText(AliasModel):
156
162
  """Base model for text objects."""
157
163
 
@@ -6,7 +6,8 @@
6
6
  """Models for the Docling Document data type."""
7
7
 
8
8
  from datetime import datetime
9
- from typing import Generic, Optional, Union
9
+ from enum import Enum
10
+ from typing import Generic, Optional, Tuple, Union
10
11
 
11
12
  from pydantic import (
12
13
  AnyHttpUrl,
@@ -35,6 +36,7 @@ from docling_core.types.doc.base import (
35
36
  BaseCell,
36
37
  BaseText,
37
38
  BitmapObject,
39
+ Figure,
38
40
  PageDimensions,
39
41
  PageReference,
40
42
  Ref,
@@ -275,7 +277,7 @@ class MinimalDocument(
275
277
  main_text: Optional[list[Union[Ref, BaseText]]] = Field(
276
278
  default=None, alias="main-text"
277
279
  )
278
- figures: Optional[list[BaseCell]] = None
280
+ figures: Optional[list[Figure]] = None
279
281
  tables: Optional[list[Table]] = None
280
282
 
281
283
 
@@ -345,6 +347,107 @@ class CCSDocument(
345
347
  return data
346
348
 
347
349
 
350
+ class DocumentToken(Enum):
351
+ """Class to represent an LLM friendly representation of a Document."""
352
+
353
+ BEG_DOCUMENT = "<document>"
354
+ END_DOCUMENT = "</document>"
355
+
356
+ BEG_TITLE = "<title>"
357
+ END_TITLE = "</title>"
358
+
359
+ BEG_ABSTRACT = "<abstract>"
360
+ END_ABSTRACT = "</abstract>"
361
+
362
+ BEG_DOI = "<doi>"
363
+ END_DOI = "</doi>"
364
+ BEG_DATE = "<date>"
365
+ END_DATE = "</date>"
366
+
367
+ BEG_AUTHORS = "<authors>"
368
+ END_AUTHORS = "</authors>"
369
+ BEG_AUTHOR = "<author>"
370
+ END_AUTHOR = "</author>"
371
+
372
+ BEG_AFFILIATIONS = "<affiliations>"
373
+ END_AFFILIATIONS = "</affiliations>"
374
+ BEG_AFFILIATION = "<affiliation>"
375
+ END_AFFILIATION = "</affiliation>"
376
+
377
+ BEG_HEADER = "<section-header>"
378
+ END_HEADER = "</section-header>"
379
+ BEG_TEXT = "<text>"
380
+ END_TEXT = "</text>"
381
+ BEG_PARAGRAPH = "<paragraph>"
382
+ END_PARAGRAPH = "</paragraph>"
383
+ BEG_TABLE = "<table>"
384
+ END_TABLE = "</table>"
385
+ BEG_FIGURE = "<figure>"
386
+ END_FIGURE = "</figure>"
387
+ BEG_CAPTION = "<caption>"
388
+ END_CAPTION = "</caption>"
389
+ BEG_EQUATION = "<equation>"
390
+ END_EQUATION = "</equation>"
391
+ BEG_LIST = "<list>"
392
+ END_LIST = "</list>"
393
+ BEG_LISTITEM = "<list-item>"
394
+ END_LISTITEM = "</list-item>"
395
+
396
+ BEG_LOCATION = "<location>"
397
+ END_LOCATION = "</location>"
398
+ BEG_GROUP = "<group>"
399
+ END_GROUP = "</group>"
400
+
401
+ @classmethod
402
+ def get_special_tokens(
403
+ cls,
404
+ max_rows: int = 100,
405
+ max_cols: int = 100,
406
+ max_pages: int = 1000,
407
+ page_dimension: Tuple[int, int] = (100, 100),
408
+ ):
409
+ """Function to get all special document tokens."""
410
+ special_tokens = [token.value for token in cls]
411
+
412
+ # Adding dynamically generated row and col tokens
413
+ for i in range(0, max_rows):
414
+ special_tokens += [f"<row_{i}>", f"</row_{i}>"]
415
+
416
+ for i in range(0, max_cols):
417
+ special_tokens += [f"<col_{i}>", f"</col_{i}>"]
418
+
419
+ for i in range(6):
420
+ special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
421
+
422
+ # Adding dynamically generated page-tokens
423
+ for i in range(0, max_pages):
424
+ special_tokens.append(f"<page_{i}>")
425
+
426
+ # Adding dynamically generated location-tokens
427
+ for i in range(0, max(page_dimension[0], page_dimension[1])):
428
+ special_tokens.append(f"<loc_{i}>")
429
+
430
+ return special_tokens
431
+
432
+ @staticmethod
433
+ def get_page_token(page: int):
434
+ """Function to get page tokens."""
435
+ return f"<page_{page}>"
436
+
437
+ @staticmethod
438
+ def get_location_token(val: float, rnorm: int = 100):
439
+ """Function to get location tokens."""
440
+ val_ = round(rnorm * val)
441
+
442
+ if val_ < 0:
443
+ return "<loc_0>"
444
+
445
+ if val_ > rnorm:
446
+ return f"<loc_{rnorm}>"
447
+
448
+ return f"<loc_{val_}>"
449
+
450
+
348
451
  class ExportedCCSDocument(
349
452
  MinimalDocument,
350
453
  Generic[
@@ -427,6 +530,14 @@ class ExportedCCSDocument(
427
530
  delim: str = "\n\n",
428
531
  main_text_start: int = 0,
429
532
  main_text_stop: Optional[int] = None,
533
+ main_text_labels: list[str] = [
534
+ "title",
535
+ "subtitle-level-1",
536
+ "paragraph",
537
+ "caption",
538
+ "table",
539
+ ],
540
+ strict_text: bool = False,
430
541
  ) -> str:
431
542
  r"""Serialize to Markdown.
432
543
 
@@ -461,12 +572,7 @@ class ExportedCCSDocument(
461
572
  continue
462
573
 
463
574
  item_type = item.obj_type
464
- if isinstance(item, BaseText) and item_type in {
465
- "title",
466
- "subtitle-level-1",
467
- "paragraph",
468
- "caption",
469
- }:
575
+ if isinstance(item, BaseText) and item_type in main_text_labels:
470
576
  text = item.text
471
577
 
472
578
  # ignore repeated text
@@ -477,20 +583,31 @@ class ExportedCCSDocument(
477
583
 
478
584
  # first title match
479
585
  if item_type == "title" and not has_title:
480
- markdown_text = f"# {text}"
586
+ if strict_text:
587
+ markdown_text = f"{text}"
588
+ else:
589
+ markdown_text = f"# {text}"
481
590
  has_title = True
482
591
 
483
592
  # secondary titles
484
593
  elif item_type in {"title", "subtitle-level-1"} or (
485
594
  has_title and item_type == "title"
486
595
  ):
487
- markdown_text = f"## {text}"
596
+ if strict_text:
597
+ markdown_text = f"{text}"
598
+ else:
599
+ markdown_text = f"## {text}"
488
600
 
489
601
  # normal text
490
602
  else:
491
603
  markdown_text = text
492
604
 
493
- elif isinstance(item, Table) and item.data:
605
+ elif (
606
+ isinstance(item, Table)
607
+ and item.data
608
+ and item_type in main_text_labels
609
+ and not strict_text
610
+ ):
494
611
  table = []
495
612
  for row in item.data:
496
613
  tmp = []
@@ -518,3 +635,157 @@ class ExportedCCSDocument(
518
635
 
519
636
  result = delim.join(md_texts)
520
637
  return result
638
+
639
    def export_to_document_tokens(
        self,
        delim: str = "\n\n",
        main_text_start: int = 0,
        main_text_stop: Optional[int] = None,
        main_text_labels: list[str] = [
            "title",
            "subtitle-level-1",
            "paragraph",
            "caption",
            "table",
            "figure",
        ],
        page_tagging: bool = True,
        location_tagging: bool = True,
        location_dimensions: Tuple[int, int] = (100, 100),
        add_new_line: bool = True,
    ) -> str:
        r"""Exports the document content to an DocumentToken format.

        Operates on a slice of the document's main_text as defined through arguments
        main_text_start and main_text_stop; defaulting to the whole main_text.

        Args:
            delim (str, optional): The delimiter used to separate text blocks in the
                exported XML. Default is two newline characters ("\n\n").
            main_text_start (int, optional): The starting index of the main text to
                be included in the XML. Default is 0 (the beginning of the text).
            main_text_stop (Optional[int], optional): The stopping index of the main
                text. If set to None, the export includes text up to the end.
                Default is None.
            main_text_labels (list[str], optional): A list of text labels that
                categorize the different sections of the document (e.g., "title",
                "subtitle-level-1", "paragraph", "caption"). Default labels are
                "title", "subtitle-level-1", "paragraph", and "caption".
            page_tagging (bool, optional): Whether to embed a page token inside
                each location tag. Only effective when location_tagging is on.
                Default is True.
            location_tagging (bool, optional): Determines whether to include
                location-based tagging in the XML. If True, the exported XML will
                contain information about the locations of the text elements.
                Default is True.
            location_dimensions (Tuple[int, int], optional): Specifies the dimensions
                (width and height) for the location tagging, if enabled.
                Default is [100, 100].
            add_new_line (bool, optional): Whether to add new line characters after
                each text block. If True, a new line is added after each block of
                text in the XML. Default is True.

        Returns:
            str: The content of the document formatted as an XML string.
        """
        xml_str = DocumentToken.BEG_DOCUMENT.value

        new_line = ""
        if add_new_line:
            new_line = "\n"

        if self.main_text is not None:
            for orig_item in self.main_text[main_text_start:main_text_stop]:

                # references are resolved to their target item first
                item = (
                    self._resolve_ref(orig_item)
                    if isinstance(orig_item, Ref)
                    else orig_item
                )

                if item is None:
                    continue

                prov = item.prov

                loc_str = ""  # default is zero
                if (
                    location_tagging
                    and self.page_dimensions is not None
                    and prov is not None
                    and len(prov) > 0
                ):

                    # NOTE(review): only prov[0] is used; additional prov
                    # entries are ignored for location tagging
                    page = prov[0].page
                    page_dim = self.page_dimensions[page - 1]

                    page_w = float(page_dim.width)
                    page_h = float(page_dim.height)

                    # normalize bbox corners to [0, 1] page coordinates
                    x0 = float(prov[0].bbox[0]) / float(page_w)
                    y0 = float(prov[0].bbox[1]) / float(page_h)
                    x1 = float(prov[0].bbox[2]) / float(page_w)
                    y1 = float(prov[0].bbox[3]) / float(page_h)

                    page_tok = ""
                    if page_tagging:
                        page_tok = DocumentToken.get_page_token(page=page)

                    # min/max ordering makes the tokens orientation-independent
                    x0_tok = DocumentToken.get_location_token(
                        val=min(x0, x1), rnorm=location_dimensions[0]
                    )
                    y0_tok = DocumentToken.get_location_token(
                        val=min(y0, y1), rnorm=location_dimensions[1]
                    )
                    x1_tok = DocumentToken.get_location_token(
                        val=max(x0, x1), rnorm=location_dimensions[0]
                    )
                    y1_tok = DocumentToken.get_location_token(
                        val=max(y0, y1), rnorm=location_dimensions[1]
                    )

                    # update
                    loc_str = f"{DocumentToken.BEG_LOCATION.value}"
                    loc_str += f"{page_tok}"
                    loc_str += f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
                    loc_str += f"{DocumentToken.END_LOCATION.value}"

                item_type = item.obj_type
                if isinstance(item, BaseText) and (item_type in main_text_labels):
                    text = item.text

                    xml_str += f"<{item_type}>{loc_str}{text}</{item_type}>{new_line}"

                elif isinstance(item, Table) and (item_type in main_text_labels):

                    xml_str += f"<{item_type}>{loc_str}"

                    # table caption, if present
                    if item.text is not None and len(item.text) > 0:
                        xml_str += f"{DocumentToken.BEG_CAPTION.value}"
                        xml_str += (
                            f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
                        )

                    # table body as <row_i><col_j>...</col_j></row_i>
                    if item.data is not None and len(item.data) > 0:
                        for i, row in enumerate(item.data):
                            xml_str += f"<row_{i}>"
                            for j, col in enumerate(row):
                                text = col.text
                                xml_str += f"<col_{j}>{text}</col_{j}>"

                            xml_str += f"</row_{i}>{new_line}"

                    xml_str += f"</{item_type}>{new_line}"

                elif isinstance(item, Figure) and (item_type in main_text_labels):

                    xml_str += f"<{item_type}>{loc_str}"

                    # figure caption, if present
                    if item.text is not None and len(item.text) > 0:
                        xml_str += f"{DocumentToken.BEG_CAPTION.value}"
                        xml_str += (
                            f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
                        )

                    xml_str += f"</{item_type}>{new_line}"

        xml_str += DocumentToken.END_DOCUMENT.value

        return xml_str
@@ -4,6 +4,7 @@
4
4
  #
5
5
 
6
6
  """Define the model Statement."""
7
+ from enum import Enum
7
8
  from typing import Generic
8
9
 
9
10
  from pydantic import Field
@@ -21,6 +22,39 @@ from docling_core.types.rec.attribute import Attribute
21
22
  from docling_core.types.rec.subject import Subject
22
23
 
23
24
 
25
class StatementToken(Enum):
    """Class to represent an LLM friendly representation of statements.

    Members are paired begin/end tags; declaration order is the order in
    which `get_special_tokens` returns the values.
    """

    # statements envelope and single statement
    BEG_STATEMENTS = "<statements>"
    END_STATEMENTS = "</statements>"

    BEG_STATEMENT = "<statement>"
    END_STATEMENT = "</statement>"

    # provenance
    BEG_PROV = "<prov>"
    END_PROV = "</prov>"

    # subject / predicate structure
    BEG_SUBJECT = "<subject>"
    END_SUBJECT = "</subject>"

    BEG_PREDICATE = "<predicate>"
    END_PREDICATE = "</predicate>"

    # predicate components
    BEG_PROPERTY = "<property>"
    END_PROPERTY = "</property>"

    BEG_VALUE = "<value>"
    END_VALUE = "</value>"

    BEG_UNIT = "<unit>"
    END_UNIT = "</unit>"

    @classmethod
    def get_special_tokens(cls):
        """Function to get all special statements tokens."""
        return [member.value for member in cls]
+
57
+
24
58
  class Statement(
25
59
  Attribute,
26
60
  Generic[
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 1.1.4
3
+ Version: 1.3.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -28,6 +28,7 @@ Classifier: Typing :: Typed
28
28
  Requires-Dist: json-schema-for-humans (>=1.0.0,<2.0.0)
29
29
  Requires-Dist: jsonref (>=1.1.0,<2.0.0)
30
30
  Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
31
+ Requires-Dist: pandas (>=2.2.2,<3.0.0)
31
32
  Requires-Dist: pydantic (>=2.6.0,<3.0.0)
32
33
  Requires-Dist: pyproject-toml (>=0.0.10,<0.0.11)
33
34
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
@@ -13,14 +13,18 @@ docling_core/search/json_schema_to_search_mapper.py,sha256=9crSFuSbcXrJej7j1rYWK
13
13
  docling_core/search/mapping.py,sha256=6rqG7LgYSeWmooKNEcRa5gFDLp1ZdzPqDGlwTA5gpOk,724
14
14
  docling_core/search/meta.py,sha256=wSurrsqdP1N3gQKx027fVdzVmc33a7Y6rPl-FClQvtA,3318
15
15
  docling_core/search/package.py,sha256=Lz2ml2eDy5t0ZimnGTq-DXHAn-f18w0bn4H5xrhs75A,1841
16
+ docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9ACDd57ds,106
17
+ docling_core/transforms/chunker/__init__.py,sha256=xZ5ELOB8tbCoJY1dKUvOrFqxYyoHmmCNUSHxrrRi8a4,317
18
+ docling_core/transforms/chunker/base.py,sha256=y1YswRxkdIaNX3Ek7asa1D__KuErRgRKcB8CZ_fQ1uM,970
19
+ docling_core/transforms/chunker/hierarchical_chunker.py,sha256=lAeHgJ4relA1EU0YVOKeuX6mLASmA-SZ5_ChgliSCKk,11996
16
20
  docling_core/types/__init__.py,sha256=6mrAEKRW85uHJwNQBufwjPcMWCjm3oocA6MaO4_NLgg,805
17
21
  docling_core/types/base.py,sha256=fNtfQ20NKa_RBNBWbq0DfO8o0zC1Cec8UAMu0Znsltk,8170
18
22
  docling_core/types/doc/__init__.py,sha256=Pzj_8rft6SJTVTCHgXRwHtuZjL6LK_6dcBWjikL9biY,125
19
- docling_core/types/doc/base.py,sha256=Vwh-8Q8n9meFxbrbMUx2zNzt1JnUo3Y3Hpwmmf82IlM,5206
23
+ docling_core/types/doc/base.py,sha256=Jqw5vqiJSJPseq4TUXsARGtH5h095VnA5IJsxvcobns,5387
20
24
  docling_core/types/doc/doc_ann.py,sha256=8pV2efUglw19jxl4_oqB__mSxjWvtGIcllyCdqA-b2s,1196
21
25
  docling_core/types/doc/doc_ocr.py,sha256=6PC0C-OczF-MyfgRxEI1xs3PWgNOzi7i2yEQbTqZz0I,1387
22
26
  docling_core/types/doc/doc_raw.py,sha256=Y69G6IiauNDaoT-5el4xo1ypWpnBJQ75akGGkCMTZSc,3888
23
- docling_core/types/doc/document.py,sha256=kpnBa3cjhH0SKdDaZDUuNIFX7VnPZOHhoB2FlDhwq2g,17187
27
+ docling_core/types/doc/document.py,sha256=6puIPc3aK3kecklCFqVgJXgU4gvGPqjuXePx82poFYE,26934
24
28
  docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
25
29
  docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
26
30
  docling_core/types/nlp/__init__.py,sha256=hGcztAeVK7xkRBqRRvc4zbY4PGeJ0r0QrEsetnSx9nI,119
@@ -31,7 +35,7 @@ docling_core/types/rec/attribute.py,sha256=PzPdaPhP5NWbFo8rYOoBl3Vfyx4zJUxN6ZpXl
31
35
  docling_core/types/rec/base.py,sha256=jhTfInNGyB9NUw7o33PElrFGL80TqhU8MLcLZNZYj3E,3222
32
36
  docling_core/types/rec/predicate.py,sha256=4iDwXl9c4jzHTDIlRNE88yvDzKA9_od0xjPUUUP5IjI,3959
33
37
  docling_core/types/rec/record.py,sha256=r1QgPepwH3YjmMHlwwmeK00ZHEJnAsvyOMeXFY_D9_Q,2750
34
- docling_core/types/rec/statement.py,sha256=BXkuKBz0BL7eiowL_aaYxsz_WBLfR4hfgiqTby4TRnk,920
38
+ docling_core/types/rec/statement.py,sha256=YwcV4CbVaAbzNwh14yJ_6Py3Ww0XnUJrEEUiKRdCZ5o,1701
35
39
  docling_core/types/rec/subject.py,sha256=wX9qsihwDbR7ZNSzY3vQymxi0eN1nxxsonrhSZzsMhA,2565
36
40
  docling_core/utils/__init__.py,sha256=VauNNpWRHG0_ISKrsy5-gTxicrdQZSau6qMfuMl3iqk,120
37
41
  docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,874
@@ -39,8 +43,8 @@ docling_core/utils/ds_generate_docs.py,sha256=0xGBagdC_PGjyeHXYZo90VnVrSTMZgHb0S
39
43
  docling_core/utils/ds_generate_jsonschema.py,sha256=EhNQutqWJFWuN-yl9UUPFZ7DJTvGqg54qBIvUMHTHdA,1647
40
44
  docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
41
45
  docling_core/utils/validators.py,sha256=fBdyWX4PvFh7o_d25ZTs4iwmeo75QTbrxsvXv2kXkTg,2777
42
- docling_core-1.1.4.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
43
- docling_core-1.1.4.dist-info/METADATA,sha256=nrVfDBk66tXsL8wbyBiE3XcGJcpc0TT5lnRoB41qH5Y,5393
44
- docling_core-1.1.4.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
45
- docling_core-1.1.4.dist-info/entry_points.txt,sha256=XHhtJEkdUuLxXSNxLdFIzx_siQ3z2UFQEKp-P8VYAE4,189
46
- docling_core-1.1.4.dist-info/RECORD,,
46
+ docling_core-1.3.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
47
+ docling_core-1.3.0.dist-info/METADATA,sha256=gD3LDYHPJeRhUO7-OA21cU6EV4PKwvxrZzOjAdUcWB0,5432
48
+ docling_core-1.3.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
49
+ docling_core-1.3.0.dist-info/entry_points.txt,sha256=XHhtJEkdUuLxXSNxLdFIzx_siQ3z2UFQEKp-P8VYAE4,189
50
+ docling_core-1.3.0.dist-info/RECORD,,