docling-core 1.4.1__tar.gz → 1.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-1.4.1 → docling_core-1.6.0}/PKG-INFO +1 -1
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/doc/base.py +205 -12
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/doc/document.py +89 -199
- docling_core-1.6.0/docling_core/types/doc/tokens.py +202 -0
- docling_core-1.6.0/docling_core/utils/file.py +54 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/pyproject.toml +2 -1
- {docling_core-1.4.1 → docling_core-1.6.0}/LICENSE +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/README.md +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/__init__.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/py.typed +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/search/__init__.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/search/mapping.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/search/meta.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/search/package.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/__init__.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/base.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/doc/doc_ann.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/doc/doc_ocr.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/doc/doc_raw.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/utils/alias.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/utils/ds_generate_docs.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/utils/ds_generate_jsonschema.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/utils/validate.py +0 -0
- {docling_core-1.4.1 → docling_core-1.6.0}/docling_core/utils/validators.py +0 -0
|
@@ -10,6 +10,7 @@ import pandas as pd
|
|
|
10
10
|
from pydantic import BaseModel, Field, PositiveInt, StrictStr
|
|
11
11
|
|
|
12
12
|
from docling_core.search.mapping import es_field
|
|
13
|
+
from docling_core.types.doc.tokens import DocumentToken
|
|
13
14
|
from docling_core.utils.alias import AliasModel
|
|
14
15
|
|
|
15
16
|
CellData = tuple[float, float, float, float, str, str]
|
|
@@ -132,10 +133,6 @@ class GlmTableCell(TableCell):
|
|
|
132
133
|
class BaseCell(AliasModel):
|
|
133
134
|
"""Base cell."""
|
|
134
135
|
|
|
135
|
-
# FIXME: we need to check why we have bounding_box (this should be in prov)
|
|
136
|
-
bounding_box: Optional[BoundingBoxContainer] = Field(
|
|
137
|
-
default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
|
|
138
|
-
)
|
|
139
136
|
prov: Optional[list[Prov]] = None
|
|
140
137
|
text: Optional[str] = Field(
|
|
141
138
|
default=None, json_schema_extra=es_field(term_vector="with_positions_offsets")
|
|
@@ -144,6 +141,38 @@ class BaseCell(AliasModel):
|
|
|
144
141
|
alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
145
142
|
)
|
|
146
143
|
|
|
144
|
+
def get_location_tokens(
|
|
145
|
+
self,
|
|
146
|
+
new_line: str,
|
|
147
|
+
page_w: float,
|
|
148
|
+
page_h: float,
|
|
149
|
+
xsize: int = 100,
|
|
150
|
+
ysize: int = 100,
|
|
151
|
+
add_page_index: bool = True,
|
|
152
|
+
) -> str:
|
|
153
|
+
"""Get the location string for the BaseCell."""
|
|
154
|
+
if self.prov is None:
|
|
155
|
+
return ""
|
|
156
|
+
|
|
157
|
+
location = ""
|
|
158
|
+
for prov in self.prov:
|
|
159
|
+
|
|
160
|
+
page_i = -1
|
|
161
|
+
if add_page_index:
|
|
162
|
+
page_i = prov.page
|
|
163
|
+
|
|
164
|
+
loc_str = DocumentToken.get_location(
|
|
165
|
+
bbox=prov.bbox,
|
|
166
|
+
page_w=page_w,
|
|
167
|
+
page_h=page_h,
|
|
168
|
+
xsize=xsize,
|
|
169
|
+
ysize=ysize,
|
|
170
|
+
page_i=page_i,
|
|
171
|
+
)
|
|
172
|
+
location += f"{loc_str}{new_line}"
|
|
173
|
+
|
|
174
|
+
return location
|
|
175
|
+
|
|
147
176
|
|
|
148
177
|
class Table(BaseCell):
|
|
149
178
|
"""Table."""
|
|
@@ -153,6 +182,11 @@ class Table(BaseCell):
|
|
|
153
182
|
data: Optional[list[list[Union[GlmTableCell, TableCell]]]] = None
|
|
154
183
|
model: Optional[str] = None
|
|
155
184
|
|
|
185
|
+
# FIXME: we need to check why we have bounding_box (this should be in prov)
|
|
186
|
+
bounding_box: Optional[BoundingBoxContainer] = Field(
|
|
187
|
+
default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
|
|
188
|
+
)
|
|
189
|
+
|
|
156
190
|
def _get_tablecell_span(self, cell: TableCell, ix: int):
|
|
157
191
|
if cell.spans is None:
|
|
158
192
|
span = set()
|
|
@@ -249,26 +283,185 @@ class Table(BaseCell):
|
|
|
249
283
|
|
|
250
284
|
return body
|
|
251
285
|
|
|
286
|
+
def export_to_document_tokens(
|
|
287
|
+
self,
|
|
288
|
+
new_line: str = "\n",
|
|
289
|
+
page_w: float = 0.0,
|
|
290
|
+
page_h: float = 0.0,
|
|
291
|
+
xsize: int = 100,
|
|
292
|
+
ysize: int = 100,
|
|
293
|
+
add_location: bool = True,
|
|
294
|
+
add_caption: bool = True,
|
|
295
|
+
add_content: bool = True,
|
|
296
|
+
add_cell_location: bool = True,
|
|
297
|
+
add_cell_label: bool = True,
|
|
298
|
+
add_cell_text: bool = True,
|
|
299
|
+
add_page_index: bool = True,
|
|
300
|
+
):
|
|
301
|
+
"""Export table to document tokens format."""
|
|
302
|
+
body = f"{DocumentToken.BEG_TABLE.value}{new_line}"
|
|
303
|
+
|
|
304
|
+
if add_location:
|
|
305
|
+
body += self.get_location_tokens(
|
|
306
|
+
new_line=new_line,
|
|
307
|
+
page_w=page_w,
|
|
308
|
+
page_h=page_h,
|
|
309
|
+
xsize=xsize,
|
|
310
|
+
ysize=ysize,
|
|
311
|
+
add_page_index=add_page_index,
|
|
312
|
+
)
|
|
313
|
+
|
|
314
|
+
if add_caption and self.text is not None and len(self.text) > 0:
|
|
315
|
+
body += f"{DocumentToken.BEG_CAPTION.value}"
|
|
316
|
+
body += f"{self.text.strip()}"
|
|
317
|
+
body += f"{DocumentToken.END_CAPTION.value}"
|
|
318
|
+
body += f"{new_line}"
|
|
319
|
+
|
|
320
|
+
if add_content and self.data is not None and len(self.data) > 0:
|
|
321
|
+
for i, row in enumerate(self.data):
|
|
322
|
+
body += f"<row_{i}>"
|
|
323
|
+
for j, col in enumerate(row):
|
|
324
|
+
|
|
325
|
+
text = ""
|
|
326
|
+
if add_cell_text:
|
|
327
|
+
text = col.text.strip()
|
|
328
|
+
|
|
329
|
+
cell_loc = ""
|
|
330
|
+
if (
|
|
331
|
+
col.bbox is not None
|
|
332
|
+
and add_cell_location
|
|
333
|
+
and add_page_index
|
|
334
|
+
and self.prov is not None
|
|
335
|
+
and len(self.prov) > 0
|
|
336
|
+
):
|
|
337
|
+
cell_loc = DocumentToken.get_location(
|
|
338
|
+
bbox=col.bbox,
|
|
339
|
+
page_w=page_w,
|
|
340
|
+
page_h=page_h,
|
|
341
|
+
xsize=xsize,
|
|
342
|
+
ysize=ysize,
|
|
343
|
+
page_i=self.prov[0].page,
|
|
344
|
+
)
|
|
345
|
+
elif (
|
|
346
|
+
col.bbox is not None
|
|
347
|
+
and add_cell_location
|
|
348
|
+
and not add_page_index
|
|
349
|
+
):
|
|
350
|
+
cell_loc = DocumentToken.get_location(
|
|
351
|
+
bbox=col.bbox,
|
|
352
|
+
page_w=page_w,
|
|
353
|
+
page_h=page_h,
|
|
354
|
+
xsize=xsize,
|
|
355
|
+
ysize=ysize,
|
|
356
|
+
page_i=-1,
|
|
357
|
+
)
|
|
358
|
+
|
|
359
|
+
cell_label = ""
|
|
360
|
+
if (
|
|
361
|
+
add_cell_label
|
|
362
|
+
and col.obj_type is not None
|
|
363
|
+
and len(col.obj_type) > 0
|
|
364
|
+
):
|
|
365
|
+
cell_label = f"<{col.obj_type}>"
|
|
366
|
+
|
|
367
|
+
body += f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>"
|
|
368
|
+
|
|
369
|
+
body += f"</row_{i}>{new_line}"
|
|
370
|
+
|
|
371
|
+
body += f"{DocumentToken.END_TABLE.value}{new_line}"
|
|
372
|
+
|
|
373
|
+
return body
|
|
374
|
+
|
|
252
375
|
|
|
253
376
|
# FIXME: let's add some figure specific data-types later
|
|
254
377
|
class Figure(BaseCell):
|
|
255
378
|
"""Figure."""
|
|
256
379
|
|
|
380
|
+
# FIXME: we need to check why we have bounding_box (this should be in prov)
|
|
381
|
+
bounding_box: Optional[BoundingBoxContainer] = Field(
|
|
382
|
+
default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
|
|
383
|
+
)
|
|
384
|
+
|
|
385
|
+
def export_to_document_tokens(
|
|
386
|
+
self,
|
|
387
|
+
new_line: str = "\n",
|
|
388
|
+
page_w: float = 0.0,
|
|
389
|
+
page_h: float = 0.0,
|
|
390
|
+
xsize: int = 100,
|
|
391
|
+
ysize: int = 100,
|
|
392
|
+
add_location: bool = True,
|
|
393
|
+
add_caption: bool = True,
|
|
394
|
+
add_content: bool = True, # not used at the moment
|
|
395
|
+
add_page_index: bool = True,
|
|
396
|
+
):
|
|
397
|
+
"""Export figure to document tokens format."""
|
|
398
|
+
body = f"{DocumentToken.BEG_FIGURE.value}{new_line}"
|
|
399
|
+
|
|
400
|
+
if add_location:
|
|
401
|
+
body += self.get_location_tokens(
|
|
402
|
+
new_line=new_line,
|
|
403
|
+
page_w=page_w,
|
|
404
|
+
page_h=page_h,
|
|
405
|
+
xsize=xsize,
|
|
406
|
+
ysize=ysize,
|
|
407
|
+
add_page_index=add_page_index,
|
|
408
|
+
)
|
|
409
|
+
|
|
410
|
+
if add_caption and self.text is not None and len(self.text) > 0:
|
|
411
|
+
body += f"{DocumentToken.BEG_CAPTION.value}"
|
|
412
|
+
body += f"{self.text.strip()}"
|
|
413
|
+
body += f"{DocumentToken.END_CAPTION.value}"
|
|
414
|
+
body += f"{new_line}"
|
|
415
|
+
|
|
416
|
+
body += f"{DocumentToken.END_FIGURE.value}{new_line}"
|
|
417
|
+
|
|
418
|
+
return body
|
|
419
|
+
|
|
257
420
|
|
|
258
|
-
class BaseText(
|
|
421
|
+
class BaseText(BaseCell):
|
|
259
422
|
"""Base model for text objects."""
|
|
260
423
|
|
|
261
|
-
|
|
262
|
-
json_schema_extra=es_field(term_vector="with_positions_offsets")
|
|
263
|
-
)
|
|
264
|
-
obj_type: StrictStr = Field(
|
|
265
|
-
alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
266
|
-
)
|
|
424
|
+
# FIXME: do we need these ???
|
|
267
425
|
name: Optional[StrictStr] = Field(
|
|
268
426
|
default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
269
427
|
)
|
|
270
428
|
font: Optional[str] = None
|
|
271
|
-
|
|
429
|
+
|
|
430
|
+
def export_to_document_tokens(
|
|
431
|
+
self,
|
|
432
|
+
new_line: str = "\n",
|
|
433
|
+
page_w: float = 0.0,
|
|
434
|
+
page_h: float = 0.0,
|
|
435
|
+
xsize: int = 100,
|
|
436
|
+
ysize: int = 100,
|
|
437
|
+
add_location: bool = True,
|
|
438
|
+
add_content: bool = True,
|
|
439
|
+
add_page_index: bool = True,
|
|
440
|
+
):
|
|
441
|
+
"""Export text element to document tokens format."""
|
|
442
|
+
body = f"<{self.obj_type}>"
|
|
443
|
+
# body = f"<{self.name}>"
|
|
444
|
+
|
|
445
|
+
assert DocumentToken.is_known_token(
|
|
446
|
+
body
|
|
447
|
+
), f"failed DocumentToken.is_known_token({body})"
|
|
448
|
+
|
|
449
|
+
if add_location:
|
|
450
|
+
body += self.get_location_tokens(
|
|
451
|
+
new_line="",
|
|
452
|
+
page_w=page_w,
|
|
453
|
+
page_h=page_h,
|
|
454
|
+
xsize=xsize,
|
|
455
|
+
ysize=ysize,
|
|
456
|
+
add_page_index=add_page_index,
|
|
457
|
+
)
|
|
458
|
+
|
|
459
|
+
if add_content and self.text is not None:
|
|
460
|
+
body += self.text.strip()
|
|
461
|
+
|
|
462
|
+
body += f"</{self.obj_type}>{new_line}"
|
|
463
|
+
|
|
464
|
+
return body
|
|
272
465
|
|
|
273
466
|
|
|
274
467
|
class ListItem(BaseText):
|
|
@@ -6,8 +6,7 @@
|
|
|
6
6
|
"""Models for the Docling Document data type."""
|
|
7
7
|
|
|
8
8
|
from datetime import datetime
|
|
9
|
-
from
|
|
10
|
-
from typing import Generic, Optional, Tuple, Union
|
|
9
|
+
from typing import Generic, Optional, Union
|
|
11
10
|
|
|
12
11
|
from pydantic import (
|
|
13
12
|
AnyHttpUrl,
|
|
@@ -43,6 +42,7 @@ from docling_core.types.doc.base import (
|
|
|
43
42
|
S3Data,
|
|
44
43
|
Table,
|
|
45
44
|
)
|
|
45
|
+
from docling_core.types.doc.tokens import DocumentToken
|
|
46
46
|
from docling_core.utils.alias import AliasModel
|
|
47
47
|
|
|
48
48
|
|
|
@@ -347,107 +347,6 @@ class CCSDocument(
|
|
|
347
347
|
return data
|
|
348
348
|
|
|
349
349
|
|
|
350
|
-
class DocumentToken(Enum):
|
|
351
|
-
"""Class to represent an LLM friendly representation of a Document."""
|
|
352
|
-
|
|
353
|
-
BEG_DOCUMENT = "<document>"
|
|
354
|
-
END_DOCUMENT = "</document>"
|
|
355
|
-
|
|
356
|
-
BEG_TITLE = "<title>"
|
|
357
|
-
END_TITLE = "</title>"
|
|
358
|
-
|
|
359
|
-
BEG_ABSTRACT = "<abstract>"
|
|
360
|
-
END_ABSTRACT = "</abstract>"
|
|
361
|
-
|
|
362
|
-
BEG_DOI = "<doi>"
|
|
363
|
-
END_DOI = "</doi>"
|
|
364
|
-
BEG_DATE = "<date>"
|
|
365
|
-
END_DATE = "</date>"
|
|
366
|
-
|
|
367
|
-
BEG_AUTHORS = "<authors>"
|
|
368
|
-
END_AUTHORS = "</authors>"
|
|
369
|
-
BEG_AUTHOR = "<author>"
|
|
370
|
-
END_AUTHOR = "</author>"
|
|
371
|
-
|
|
372
|
-
BEG_AFFILIATIONS = "<affiliations>"
|
|
373
|
-
END_AFFILIATIONS = "</affiliations>"
|
|
374
|
-
BEG_AFFILIATION = "<affiliation>"
|
|
375
|
-
END_AFFILIATION = "</affiliation>"
|
|
376
|
-
|
|
377
|
-
BEG_HEADER = "<section-header>"
|
|
378
|
-
END_HEADER = "</section-header>"
|
|
379
|
-
BEG_TEXT = "<text>"
|
|
380
|
-
END_TEXT = "</text>"
|
|
381
|
-
BEG_PARAGRAPH = "<paragraph>"
|
|
382
|
-
END_PARAGRAPH = "</paragraph>"
|
|
383
|
-
BEG_TABLE = "<table>"
|
|
384
|
-
END_TABLE = "</table>"
|
|
385
|
-
BEG_FIGURE = "<figure>"
|
|
386
|
-
END_FIGURE = "</figure>"
|
|
387
|
-
BEG_CAPTION = "<caption>"
|
|
388
|
-
END_CAPTION = "</caption>"
|
|
389
|
-
BEG_EQUATION = "<equation>"
|
|
390
|
-
END_EQUATION = "</equation>"
|
|
391
|
-
BEG_LIST = "<list>"
|
|
392
|
-
END_LIST = "</list>"
|
|
393
|
-
BEG_LISTITEM = "<list-item>"
|
|
394
|
-
END_LISTITEM = "</list-item>"
|
|
395
|
-
|
|
396
|
-
BEG_LOCATION = "<location>"
|
|
397
|
-
END_LOCATION = "</location>"
|
|
398
|
-
BEG_GROUP = "<group>"
|
|
399
|
-
END_GROUP = "</group>"
|
|
400
|
-
|
|
401
|
-
@classmethod
|
|
402
|
-
def get_special_tokens(
|
|
403
|
-
cls,
|
|
404
|
-
max_rows: int = 100,
|
|
405
|
-
max_cols: int = 100,
|
|
406
|
-
max_pages: int = 1000,
|
|
407
|
-
page_dimension: Tuple[int, int] = (100, 100),
|
|
408
|
-
):
|
|
409
|
-
"""Function to get all special document tokens."""
|
|
410
|
-
special_tokens = [token.value for token in cls]
|
|
411
|
-
|
|
412
|
-
# Adding dynamically generated row and col tokens
|
|
413
|
-
for i in range(0, max_rows + 1):
|
|
414
|
-
special_tokens += [f"<row_{i}>", f"</row_{i}>"]
|
|
415
|
-
|
|
416
|
-
for i in range(0, max_cols + 1):
|
|
417
|
-
special_tokens += [f"<col_{i}>", f"</col_{i}>"]
|
|
418
|
-
|
|
419
|
-
for i in range(6):
|
|
420
|
-
special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
|
|
421
|
-
|
|
422
|
-
# Adding dynamically generated page-tokens
|
|
423
|
-
for i in range(0, max_pages + 1):
|
|
424
|
-
special_tokens.append(f"<page_{i}>")
|
|
425
|
-
|
|
426
|
-
# Adding dynamically generated location-tokens
|
|
427
|
-
for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
|
|
428
|
-
special_tokens.append(f"<loc_{i}>")
|
|
429
|
-
|
|
430
|
-
return special_tokens
|
|
431
|
-
|
|
432
|
-
@staticmethod
|
|
433
|
-
def get_page_token(page: int):
|
|
434
|
-
"""Function to get page tokens."""
|
|
435
|
-
return f"<page_{page}>"
|
|
436
|
-
|
|
437
|
-
@staticmethod
|
|
438
|
-
def get_location_token(val: float, rnorm: int = 100):
|
|
439
|
-
"""Function to get location tokens."""
|
|
440
|
-
val_ = round(rnorm * val)
|
|
441
|
-
|
|
442
|
-
if val_ < 0:
|
|
443
|
-
return "<loc_0>"
|
|
444
|
-
|
|
445
|
-
if val_ > rnorm:
|
|
446
|
-
return f"<loc_{rnorm}>"
|
|
447
|
-
|
|
448
|
-
return f"<loc_{val_}>"
|
|
449
|
-
|
|
450
|
-
|
|
451
350
|
class ExportedCCSDocument(
|
|
452
351
|
MinimalDocument,
|
|
453
352
|
Generic[
|
|
@@ -525,7 +424,17 @@ class ExportedCCSDocument(
|
|
|
525
424
|
|
|
526
425
|
return result
|
|
527
426
|
|
|
528
|
-
def
|
|
427
|
+
def get_map_to_page_dimensions(self):
|
|
428
|
+
"""Get a map from page-index (start at 1) to page-dim [width, height]."""
|
|
429
|
+
pagedims = {}
|
|
430
|
+
|
|
431
|
+
if self.page_dimensions is not None:
|
|
432
|
+
for _ in self.page_dimensions:
|
|
433
|
+
pagedims[_.page] = [_.width, _.height]
|
|
434
|
+
|
|
435
|
+
return pagedims
|
|
436
|
+
|
|
437
|
+
def export_to_markdown( # noqa: C901
|
|
529
438
|
self,
|
|
530
439
|
delim: str = "\n\n",
|
|
531
440
|
main_text_start: int = 0,
|
|
@@ -536,8 +445,10 @@ class ExportedCCSDocument(
|
|
|
536
445
|
"paragraph",
|
|
537
446
|
"caption",
|
|
538
447
|
"table",
|
|
448
|
+
"figure",
|
|
539
449
|
],
|
|
540
450
|
strict_text: bool = False,
|
|
451
|
+
image_placeholder: str = "<!-- image -->",
|
|
541
452
|
) -> str:
|
|
542
453
|
r"""Serialize to Markdown.
|
|
543
454
|
|
|
@@ -551,6 +462,12 @@ class ExportedCCSDocument(
|
|
|
551
462
|
Defaults to 0.
|
|
552
463
|
main_text_end (Optional[int], optional): Main-text slicing stop index
|
|
553
464
|
(exclusive). Defaults to None.
|
|
465
|
+
main_text_labels (list[str], optional): The labels to include in the
|
|
466
|
+
markdown.
|
|
467
|
+
strict_text (bool, optional): if true, the output will be only plain text
|
|
468
|
+
without any markdown styling. Defaults to False.
|
|
469
|
+
image_placeholder (str, optional): the placeholder to include to position
|
|
470
|
+
images in the markdown. Defaults to a markdown comment "<!-- image -->".
|
|
554
471
|
|
|
555
472
|
Returns:
|
|
556
473
|
str: The exported Markdown representation.
|
|
@@ -576,7 +493,7 @@ class ExportedCCSDocument(
|
|
|
576
493
|
text = item.text
|
|
577
494
|
|
|
578
495
|
# ignore repeated text
|
|
579
|
-
if prev_text == text:
|
|
496
|
+
if prev_text == text or text is None:
|
|
580
497
|
continue
|
|
581
498
|
else:
|
|
582
499
|
prev_text = text
|
|
@@ -630,6 +547,14 @@ class ExportedCCSDocument(
|
|
|
630
547
|
|
|
631
548
|
markdown_text = md_table
|
|
632
549
|
|
|
550
|
+
elif isinstance(item, Figure) and item_type in main_text_labels:
|
|
551
|
+
|
|
552
|
+
markdown_text = ""
|
|
553
|
+
if not strict_text:
|
|
554
|
+
markdown_text = f"{image_placeholder}"
|
|
555
|
+
if item.text:
|
|
556
|
+
markdown_text += "\n" + item.text
|
|
557
|
+
|
|
633
558
|
if markdown_text:
|
|
634
559
|
md_texts.append(markdown_text)
|
|
635
560
|
|
|
@@ -649,48 +574,32 @@ class ExportedCCSDocument(
|
|
|
649
574
|
"table",
|
|
650
575
|
"figure",
|
|
651
576
|
],
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
577
|
+
xsize: int = 100,
|
|
578
|
+
ysize: int = 100,
|
|
579
|
+
add_location: bool = True,
|
|
580
|
+
add_content: bool = True,
|
|
581
|
+
add_page_index: bool = True,
|
|
582
|
+
# table specific flags
|
|
583
|
+
add_table_cell_location: bool = False,
|
|
584
|
+
add_table_cell_label: bool = True,
|
|
585
|
+
add_table_cell_text: bool = True,
|
|
656
586
|
) -> str:
|
|
657
587
|
r"""Exports the document content to an DocumentToken format.
|
|
658
588
|
|
|
659
589
|
Operates on a slice of the document's main_text as defined through arguments
|
|
660
590
|
main_text_start and main_text_stop; defaulting to the whole main_text.
|
|
661
591
|
|
|
662
|
-
Args:
|
|
663
|
-
delim (str, optional): The delimiter used to separate text blocks in the
|
|
664
|
-
exported XML. Default is two newline characters ("\n\n").
|
|
665
|
-
main_text_start (int, optional): The starting index of the main text to
|
|
666
|
-
be included in the XML. Default is 0 (the beginning of the text).
|
|
667
|
-
main_text_stop (Optional[int], optional): The stopping index of the main
|
|
668
|
-
text. If set to None, the export includes text up to the end.
|
|
669
|
-
Default is None.
|
|
670
|
-
main_text_labels (list[str], optional): A list of text labels that
|
|
671
|
-
categorize the different sections of the document (e.g., "title",
|
|
672
|
-
"subtitle-level-1", "paragraph", "caption"). Default labels are
|
|
673
|
-
"title", "subtitle-level-1", "paragraph", and "caption".
|
|
674
|
-
location_tagging (bool, optional): Determines whether to include
|
|
675
|
-
location-based tagging in the XML. If True, the exported XML will
|
|
676
|
-
contain information about the locations of the text elements.
|
|
677
|
-
Default is True.
|
|
678
|
-
location_dimensions (Tuple[int, int], optional): Specifies the dimensions
|
|
679
|
-
(width and height) for the location tagging, if enabled.
|
|
680
|
-
Default is [100, 100].
|
|
681
|
-
add_new_line (bool, optional): Whether to add new line characters after
|
|
682
|
-
each text block. If True, a new line is added after each block of
|
|
683
|
-
text in the XML. Default is True.
|
|
684
|
-
|
|
685
592
|
Returns:
|
|
686
|
-
str: The content of the document formatted as
|
|
593
|
+
str: The content of the document formatted as a DocTags string.
|
|
687
594
|
"""
|
|
688
|
-
xml_str = DocumentToken.BEG_DOCUMENT.value
|
|
689
|
-
|
|
690
595
|
new_line = ""
|
|
691
|
-
if
|
|
596
|
+
if delim:
|
|
692
597
|
new_line = "\n"
|
|
693
598
|
|
|
599
|
+
doctags = f"{DocumentToken.BEG_DOCUMENT.value}{new_line}"
|
|
600
|
+
|
|
601
|
+
# pagedims = self.get_map_to_page_dimensions()
|
|
602
|
+
|
|
694
603
|
if self.main_text is not None:
|
|
695
604
|
for orig_item in self.main_text[main_text_start:main_text_stop]:
|
|
696
605
|
|
|
@@ -705,87 +614,68 @@ class ExportedCCSDocument(
|
|
|
705
614
|
|
|
706
615
|
prov = item.prov
|
|
707
616
|
|
|
708
|
-
|
|
617
|
+
page_i = -1
|
|
618
|
+
page_w = 0.0
|
|
619
|
+
page_h = 0.0
|
|
620
|
+
|
|
709
621
|
if (
|
|
710
|
-
|
|
622
|
+
add_location
|
|
711
623
|
and self.page_dimensions is not None
|
|
712
624
|
and prov is not None
|
|
713
625
|
and len(prov) > 0
|
|
714
626
|
):
|
|
715
627
|
|
|
716
|
-
|
|
717
|
-
page_dim = self.page_dimensions[
|
|
628
|
+
page_i = prov[0].page
|
|
629
|
+
page_dim = self.page_dimensions[page_i - 1]
|
|
718
630
|
|
|
719
631
|
page_w = float(page_dim.width)
|
|
720
632
|
page_h = float(page_dim.height)
|
|
721
633
|
|
|
722
|
-
x0 = float(prov[0].bbox[0]) / float(page_w)
|
|
723
|
-
y0 = float(prov[0].bbox[1]) / float(page_h)
|
|
724
|
-
x1 = float(prov[0].bbox[2]) / float(page_w)
|
|
725
|
-
y1 = float(prov[0].bbox[3]) / float(page_h)
|
|
726
|
-
|
|
727
|
-
page_tok = ""
|
|
728
|
-
if page_tagging:
|
|
729
|
-
page_tok = DocumentToken.get_page_token(page=page)
|
|
730
|
-
|
|
731
|
-
x0_tok = DocumentToken.get_location_token(
|
|
732
|
-
val=min(x0, x1), rnorm=location_dimensions[0]
|
|
733
|
-
)
|
|
734
|
-
y0_tok = DocumentToken.get_location_token(
|
|
735
|
-
val=min(y0, y1), rnorm=location_dimensions[1]
|
|
736
|
-
)
|
|
737
|
-
x1_tok = DocumentToken.get_location_token(
|
|
738
|
-
val=max(x0, x1), rnorm=location_dimensions[0]
|
|
739
|
-
)
|
|
740
|
-
y1_tok = DocumentToken.get_location_token(
|
|
741
|
-
val=max(y0, y1), rnorm=location_dimensions[1]
|
|
742
|
-
)
|
|
743
|
-
|
|
744
|
-
# update
|
|
745
|
-
loc_str = f"{DocumentToken.BEG_LOCATION.value}"
|
|
746
|
-
loc_str += f"{page_tok}"
|
|
747
|
-
loc_str += f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
|
|
748
|
-
loc_str += f"{DocumentToken.END_LOCATION.value}"
|
|
749
|
-
|
|
750
634
|
item_type = item.obj_type
|
|
751
635
|
if isinstance(item, BaseText) and (item_type in main_text_labels):
|
|
752
|
-
text = item.text
|
|
753
636
|
|
|
754
|
-
|
|
637
|
+
doctags += item.export_to_document_tokens(
|
|
638
|
+
new_line=new_line,
|
|
639
|
+
page_w=page_w,
|
|
640
|
+
page_h=page_h,
|
|
641
|
+
xsize=xsize,
|
|
642
|
+
ysize=ysize,
|
|
643
|
+
add_location=add_location,
|
|
644
|
+
add_content=add_content,
|
|
645
|
+
add_page_index=add_page_index,
|
|
646
|
+
)
|
|
755
647
|
|
|
756
648
|
elif isinstance(item, Table) and (item_type in main_text_labels):
|
|
757
649
|
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
xml_str += f"</row_{i}>{new_line}"
|
|
774
|
-
|
|
775
|
-
xml_str += f"</{item_type}>{new_line}"
|
|
650
|
+
doctags += item.export_to_document_tokens(
|
|
651
|
+
new_line=new_line,
|
|
652
|
+
page_w=page_w,
|
|
653
|
+
page_h=page_h,
|
|
654
|
+
xsize=xsize,
|
|
655
|
+
ysize=ysize,
|
|
656
|
+
add_caption=True,
|
|
657
|
+
add_location=add_location,
|
|
658
|
+
add_content=add_content,
|
|
659
|
+
add_cell_location=add_table_cell_location,
|
|
660
|
+
add_cell_label=add_table_cell_label,
|
|
661
|
+
add_cell_text=add_table_cell_text,
|
|
662
|
+
add_page_index=add_page_index,
|
|
663
|
+
)
|
|
776
664
|
|
|
777
665
|
elif isinstance(item, Figure) and (item_type in main_text_labels):
|
|
778
666
|
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
667
|
+
doctags += item.export_to_document_tokens(
|
|
668
|
+
new_line=new_line,
|
|
669
|
+
page_w=page_w,
|
|
670
|
+
page_h=page_h,
|
|
671
|
+
xsize=xsize,
|
|
672
|
+
ysize=ysize,
|
|
673
|
+
add_caption=True,
|
|
674
|
+
add_location=add_location,
|
|
675
|
+
add_content=add_content,
|
|
676
|
+
add_page_index=add_page_index,
|
|
677
|
+
)
|
|
788
678
|
|
|
789
|
-
|
|
679
|
+
doctags += DocumentToken.END_DOCUMENT.value
|
|
790
680
|
|
|
791
|
-
return
|
|
681
|
+
return doctags
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Tokens used in the docling document model."""
|
|
7
|
+
|
|
8
|
+
from enum import Enum
|
|
9
|
+
from typing import Annotated, Tuple
|
|
10
|
+
|
|
11
|
+
from pydantic import Field
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class TableToken(Enum):
|
|
15
|
+
"""Class to represent an LLM friendly representation of a Table."""
|
|
16
|
+
|
|
17
|
+
CELL_LABEL_COLUMN_HEADER = "<column_header>"
|
|
18
|
+
CELL_LABEL_ROW_HEADER = "<row_header>"
|
|
19
|
+
CELL_LABEL_SECTION_HEADERE = "<section_header>"
|
|
20
|
+
CELL_LABEL_DATA = "<data>"
|
|
21
|
+
|
|
22
|
+
OTSL_ECEL = "<ecel>" # empty cell
|
|
23
|
+
OTSL_FCEL = "<fcel>" # cell with content
|
|
24
|
+
OTSL_LCEL = "<lcel>" # left looking cell,
|
|
25
|
+
OTSL_UCEL = "<ucel>" # up looking cell,
|
|
26
|
+
OTSL_XCEL = "<xcel>" # 2d extension cell (cross cell),
|
|
27
|
+
OTSL_NL = "<nl>" # new line,
|
|
28
|
+
OTSL_CHED = "<ched>" # - column header cell,
|
|
29
|
+
OTSL_RHED = "<rhed>" # - row header cell,
|
|
30
|
+
OTSL_SROW = "<srow>" # - section row cell
|
|
31
|
+
|
|
32
|
+
@classmethod
|
|
33
|
+
def get_special_tokens(cls):
|
|
34
|
+
"""Function to get all special document tokens."""
|
|
35
|
+
special_tokens = [token.value for token in cls]
|
|
36
|
+
return special_tokens
|
|
37
|
+
|
|
38
|
+
@staticmethod
|
|
39
|
+
def is_known_token(label):
|
|
40
|
+
"""Function to check if label is in tokens."""
|
|
41
|
+
return label in TableToken.get_special_tokens()
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class DocumentToken(Enum):
|
|
45
|
+
"""Class to represent an LLM friendly representation of a Document."""
|
|
46
|
+
|
|
47
|
+
BEG_DOCUMENT = "<document>"
|
|
48
|
+
END_DOCUMENT = "</document>"
|
|
49
|
+
|
|
50
|
+
BEG_TITLE = "<title>"
|
|
51
|
+
END_TITLE = "</title>"
|
|
52
|
+
|
|
53
|
+
BEG_ABSTRACT = "<abstract>"
|
|
54
|
+
END_ABSTRACT = "</abstract>"
|
|
55
|
+
|
|
56
|
+
BEG_DOI = "<doi>"
|
|
57
|
+
END_DOI = "</doi>"
|
|
58
|
+
BEG_DATE = "<date>"
|
|
59
|
+
END_DATE = "</date>"
|
|
60
|
+
|
|
61
|
+
BEG_AUTHORS = "<authors>"
|
|
62
|
+
END_AUTHORS = "</authors>"
|
|
63
|
+
BEG_AUTHOR = "<author>"
|
|
64
|
+
END_AUTHOR = "</author>"
|
|
65
|
+
|
|
66
|
+
BEG_AFFILIATIONS = "<affiliations>"
|
|
67
|
+
END_AFFILIATIONS = "</affiliations>"
|
|
68
|
+
BEG_AFFILIATION = "<affiliation>"
|
|
69
|
+
END_AFFILIATION = "</affiliation>"
|
|
70
|
+
|
|
71
|
+
BEG_HEADER = "<section-header>"
|
|
72
|
+
END_HEADER = "</section-header>"
|
|
73
|
+
BEG_TEXT = "<text>"
|
|
74
|
+
END_TEXT = "</text>"
|
|
75
|
+
BEG_PARAGRAPH = "<paragraph>"
|
|
76
|
+
END_PARAGRAPH = "</paragraph>"
|
|
77
|
+
BEG_TABLE = "<table>"
|
|
78
|
+
END_TABLE = "</table>"
|
|
79
|
+
BEG_FIGURE = "<figure>"
|
|
80
|
+
END_FIGURE = "</figure>"
|
|
81
|
+
BEG_CAPTION = "<caption>"
|
|
82
|
+
END_CAPTION = "</caption>"
|
|
83
|
+
BEG_EQUATION = "<equation>"
|
|
84
|
+
END_EQUATION = "</equation>"
|
|
85
|
+
BEG_LIST = "<list>"
|
|
86
|
+
END_LIST = "</list>"
|
|
87
|
+
BEG_LISTITEM = "<list-item>"
|
|
88
|
+
END_LISTITEM = "</list-item>"
|
|
89
|
+
|
|
90
|
+
BEG_LOCATION = "<location>"
|
|
91
|
+
END_LOCATION = "</location>"
|
|
92
|
+
BEG_GROUP = "<group>"
|
|
93
|
+
END_GROUP = "</group>"
|
|
94
|
+
|
|
95
|
+
@classmethod
|
|
96
|
+
def get_special_tokens(
|
|
97
|
+
cls,
|
|
98
|
+
max_rows: int = 100,
|
|
99
|
+
max_cols: int = 100,
|
|
100
|
+
max_pages: int = 1000,
|
|
101
|
+
page_dimension: Tuple[int, int] = (100, 100),
|
|
102
|
+
):
|
|
103
|
+
"""Function to get all special document tokens."""
|
|
104
|
+
special_tokens = [token.value for token in cls]
|
|
105
|
+
|
|
106
|
+
# Adding dynamically generated row and col tokens
|
|
107
|
+
for i in range(0, max_rows + 1):
|
|
108
|
+
special_tokens += [f"<row_{i}>", f"</row_{i}>"]
|
|
109
|
+
|
|
110
|
+
for i in range(0, max_cols + 1):
|
|
111
|
+
special_tokens += [f"<col_{i}>", f"</col_{i}>"]
|
|
112
|
+
|
|
113
|
+
for i in range(6):
|
|
114
|
+
special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
|
|
115
|
+
|
|
116
|
+
# FIXME: this is synonym of section header
|
|
117
|
+
for i in range(6):
|
|
118
|
+
special_tokens += [f"<subtitle-level-{i}>", f"</subtitle-level-{i}>"]
|
|
119
|
+
|
|
120
|
+
# Adding dynamically generated page-tokens
|
|
121
|
+
for i in range(0, max_pages + 1):
|
|
122
|
+
special_tokens.append(f"<page_{i}>")
|
|
123
|
+
special_tokens.append(f"</page_{i}>")
|
|
124
|
+
|
|
125
|
+
# Adding dynamically generated location-tokens
|
|
126
|
+
for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
|
|
127
|
+
special_tokens.append(f"<loc_{i}>")
|
|
128
|
+
|
|
129
|
+
return special_tokens
|
|
130
|
+
|
|
131
|
+
@staticmethod
|
|
132
|
+
def is_known_token(label):
|
|
133
|
+
"""Function to check if label is in tokens."""
|
|
134
|
+
return label in DocumentToken.get_special_tokens()
|
|
135
|
+
|
|
136
|
+
@staticmethod
|
|
137
|
+
def get_row_token(row: int, beg=bool) -> str:
|
|
138
|
+
"""Function to get page tokens."""
|
|
139
|
+
if beg:
|
|
140
|
+
return f"<row_{row}>"
|
|
141
|
+
else:
|
|
142
|
+
return f"</row_{row}>"
|
|
143
|
+
|
|
144
|
+
@staticmethod
|
|
145
|
+
def get_col_token(col: int, beg=bool) -> str:
|
|
146
|
+
"""Function to get page tokens."""
|
|
147
|
+
if beg:
|
|
148
|
+
return f"<col_{col}>"
|
|
149
|
+
else:
|
|
150
|
+
return f"</col_{col}>"
|
|
151
|
+
|
|
152
|
+
@staticmethod
|
|
153
|
+
def get_page_token(page: int):
|
|
154
|
+
"""Function to get page tokens."""
|
|
155
|
+
return f"<page_{page}>"
|
|
156
|
+
|
|
157
|
+
@staticmethod
|
|
158
|
+
def get_location_token(val: float, rnorm: int = 100):
|
|
159
|
+
"""Function to get location tokens."""
|
|
160
|
+
val_ = round(rnorm * val)
|
|
161
|
+
|
|
162
|
+
if val_ < 0:
|
|
163
|
+
return "<loc_0>"
|
|
164
|
+
|
|
165
|
+
if val_ > rnorm:
|
|
166
|
+
return f"<loc_{rnorm}>"
|
|
167
|
+
|
|
168
|
+
return f"<loc_{val_}>"
|
|
169
|
+
|
|
170
|
+
@staticmethod
|
|
171
|
+
def get_location(
|
|
172
|
+
# bbox: Tuple[float, float, float, float],
|
|
173
|
+
bbox: Annotated[list[float], Field(min_length=4, max_length=4)],
|
|
174
|
+
page_w: float,
|
|
175
|
+
page_h: float,
|
|
176
|
+
xsize: int = 100,
|
|
177
|
+
ysize: int = 100,
|
|
178
|
+
page_i: int = -1,
|
|
179
|
+
):
|
|
180
|
+
"""Get the location string give bbox and page-dim."""
|
|
181
|
+
assert bbox[0] <= bbox[2], f"bbox[0]<=bbox[2] => {bbox[0]}<={bbox[2]}"
|
|
182
|
+
assert bbox[1] <= bbox[3], f"bbox[1]<=bbox[3] => {bbox[1]}<={bbox[3]}"
|
|
183
|
+
|
|
184
|
+
x0 = bbox[0] / page_w
|
|
185
|
+
y0 = bbox[1] / page_h
|
|
186
|
+
x1 = bbox[2] / page_w
|
|
187
|
+
y1 = bbox[3] / page_h
|
|
188
|
+
|
|
189
|
+
page_tok = ""
|
|
190
|
+
if page_i != -1:
|
|
191
|
+
page_tok = DocumentToken.get_page_token(page=page_i)
|
|
192
|
+
|
|
193
|
+
x0_tok = DocumentToken.get_location_token(val=min(x0, x1), rnorm=xsize)
|
|
194
|
+
y0_tok = DocumentToken.get_location_token(val=min(y0, y1), rnorm=ysize)
|
|
195
|
+
x1_tok = DocumentToken.get_location_token(val=max(x0, x1), rnorm=xsize)
|
|
196
|
+
y1_tok = DocumentToken.get_location_token(val=max(y0, y1), rnorm=ysize)
|
|
197
|
+
|
|
198
|
+
loc_str = f"{DocumentToken.BEG_LOCATION.value}"
|
|
199
|
+
loc_str += f"{page_tok}{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
|
|
200
|
+
loc_str += f"{DocumentToken.END_LOCATION.value}"
|
|
201
|
+
|
|
202
|
+
return loc_str
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""File-related utilities."""
|
|
7
|
+
|
|
8
|
+
import tempfile
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Union
|
|
11
|
+
|
|
12
|
+
import requests
|
|
13
|
+
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def resolve_file_source(
    source: Union[Path, AnyHttpUrl, str],
    timeout: float = 30.0,
) -> Path:
    """Resolves the source (URL, path) of a file to a local file path.

    If a URL is provided, the content is first downloaded to a temporary local file.

    Args:
        source (Path | AnyHttpUrl | str): The file input source. Can be a path or URL.
        timeout (float): Seconds to wait for the server during a download;
            without it an unresponsive host would hang the caller indefinitely.

    Raises:
        ValueError: If source is of unexpected type.

    Returns:
        Path: The local file path.

    Note:
        For URL sources, the file is written into a fresh temporary directory
        that is NOT cleaned up automatically; the caller owns its lifetime.
    """
    try:
        http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
        # stream=True avoids loading the whole body into memory; the timeout
        # bounds both connect and per-chunk read waits.
        res = requests.get(http_url, stream=True, timeout=timeout)
        res.raise_for_status()
        fname = None
        # try to get filename from response header
        if cont_disp := res.headers.get("Content-Disposition"):
            for par in cont_disp.strip().split(";"):
                # currently only handling directive "filename" (not "*filename")
                if (split := par.split("=")) and split[0].strip() == "filename":
                    fname = "=".join(split[1:]).strip().strip("'\"") or None
                    break
        # otherwise, use name from URL:
        if fname is None:
            fname = Path(http_url.path or "file").name
        local_path = Path(tempfile.mkdtemp()) / fname
        with open(local_path, "wb") as f:
            for chunk in res.iter_content(chunk_size=1024):  # using 1-KB chunks
                f.write(chunk)
    except ValidationError:
        # Not a URL: fall back to interpreting the source as a local path.
        try:
            local_path = TypeAdapter(Path).validate_python(source)
        except ValidationError:
            raise ValueError(f"Unexpected source type encountered: {type(source)}")
    return local_path
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "docling-core"
|
|
3
|
-
version = "1.
|
|
3
|
+
version = "1.6.0"
|
|
4
4
|
description = "A python library to define and validate data types in Docling."
|
|
5
5
|
license = "MIT"
|
|
6
6
|
authors = [
|
|
@@ -118,6 +118,7 @@ module = [
|
|
|
118
118
|
"jsonschema.*",
|
|
119
119
|
"json_schema_for_humans.*",
|
|
120
120
|
"pandas.*",
|
|
121
|
+
"requests.*",
|
|
121
122
|
"tabulate.*",
|
|
122
123
|
]
|
|
123
124
|
ignore_missing_imports = true
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-1.4.1 → docling_core-1.6.0}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-1.4.1 → docling_core-1.6.0}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-1.4.1 → docling_core-1.6.0}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|