PyPI - docling-core - Versions diffs - 2.6.1__py3-none-any.whl → 2.7.0__py3-none-any.whl - Mend

docling-core 2.6.1py3-none-any.whl → 2.7.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling-core might be problematic. Click here for more details.

Files changed (7) hide show

docling_core/types/doc/document.py CHANGED Viewed

@@ -37,8 +37,8 @@ from docling_core.types.base import _JSON_POINTER_REGEX
 from docling_core.types.doc import BoundingBox, Size
 from docling_core.types.doc.base import ImageRefMode
 from docling_core.types.doc.labels import DocItemLabel, GroupLabel
+from docling_core.types.doc.tokens import DocumentToken, TableToken
 from docling_core.types.doc.utils import relative_path
-from docling_core.types.legacy_doc.tokens import DocumentToken
 Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
 LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
@@ -1008,7 +1008,6 @@ class TableItem(FloatingItem):
                 DeprecationWarning,
             )
-        body = ""
         nrows = self.data.num_rows
         ncols = self.data.num_cols
@@ -1065,6 +1064,99 @@ class TableItem(FloatingItem):
         return body
+    def export_to_otsl(
+        self,
+        doc: "DoclingDocument",
+        add_cell_location: bool = True,
+        add_cell_text: bool = True,
+        xsize: int = 100,
+        ysize: int = 100,
+    ) -> str:
+        """Export the table as an OTSL token string.
+
+        Walks ``self.data.grid`` row by row, emits one structural token per
+        grid position and a new-line token at the end of every row.
+
+        Args:
+            doc: Document whose ``pages`` supply the page size used to build
+                the cell location strings.
+            add_cell_location: Append the location string after the token of
+                every non-empty cell at its own start position.
+            add_cell_text: Append the stripped cell text after the token of
+                every non-empty cell at its own start position.
+            xsize: Forwarded as ``xsize`` to ``DocumentToken.get_location``.
+            ysize: Forwarded as ``ysize`` to ``DocumentToken.get_location``.
+
+        Returns:
+            The concatenated tokens, or ``""`` when the table has no cells.
+        """
+        # Possible OTSL tokens...
+        #
+        # Empty and full cells:
+        # "ecel", "fcel"
+        #
+        # Cell spans (horizontal, vertical, 2d):
+        # "lcel", "ucel", "xcel"
+        #
+        # New line:
+        # "nl"
+        #
+        # Headers (column, row, section row):
+        # "ched", "rhed", "srow"
+        if len(self.data.table_cells) == 0:
+            return ""
+        page_no = self.prov[0].page_no if len(self.prov) > 0 else 0
+        # The page size is the same for every cell, so it is looked up once.
+        # It stays None when the document has no entry for this page; cells
+        # then get an empty location instead of failing on an unknown size.
+        page_size = None
+        if page_no in doc.pages:
+            page_size = doc.pages[page_no].size.as_tuple()
+        body = []
+        for i in range(self.data.num_rows):
+            for j in range(self.data.num_cols):
+                cell: TableCell = self.data.grid[i][j]
+                rowspan, rowstart = cell.row_span, cell.start_row_offset_idx
+                colspan, colstart = cell.col_span, cell.start_col_offset_idx
+                if rowstart == i and colstart == j:
+                    # Grid position where the cell itself starts.
+                    content = cell.text.strip()
+                    if len(content) == 0:
+                        body.append(str(TableToken.OTSL_ECEL.value))
+                        continue
+                    if cell.column_header:
+                        body.append(str(TableToken.OTSL_CHED.value))
+                    elif cell.row_header:
+                        body.append(str(TableToken.OTSL_RHED.value))
+                    elif cell.row_section:
+                        body.append(str(TableToken.OTSL_SROW.value))
+                    else:
+                        body.append(str(TableToken.OTSL_FCEL.value))
+                    if add_cell_location:
+                        # Only built when it is actually emitted.
+                        cell_loc = ""
+                        if cell.bbox is not None and page_size is not None:
+                            page_w, page_h = page_size
+                            cell_loc = DocumentToken.get_location(
+                                bbox=cell.bbox.to_bottom_left_origin(
+                                    page_h
+                                ).as_tuple(),
+                                page_w=page_w,
+                                page_h=page_h,
+                                xsize=xsize,
+                                ysize=ysize,
+                                page_i=page_no,
+                            )
+                        body.append(str(cell_loc))
+                    if add_cell_text:
+                        body.append(str(content))
+                else:
+                    # Grid position covered by a cell that starts elsewhere.
+                    add_cross_cell = False
+                    if rowstart != i:
+                        if colspan == 1:
+                            body.append(str(TableToken.OTSL_UCEL.value))
+                        else:
+                            add_cross_cell = True
+                    if colstart != j:
+                        if rowspan == 1:
+                            body.append(str(TableToken.OTSL_LCEL.value))
+                        else:
+                            add_cross_cell = True
+                    if add_cross_cell:
+                        body.append(str(TableToken.OTSL_XCEL.value))
+            body.append(str(TableToken.OTSL_NL.value))
+        # Join once after all rows; this also yields "" for a grid without rows.
+        return "".join(body)
     def export_to_document_tokens(
         self,
         doc: "DoclingDocument",

docling_core/types/doc/tokens.py ADDED Viewed

@@ -0,0 +1,202 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+"""Tokens used in the docling document model."""
+from enum import Enum
+from typing import Annotated, Tuple
+from pydantic import Field
+class TableToken(Enum):
+    """Special tokens of the LLM friendly representation of a table.
+
+    Holds the cell-label tokens and the OTSL structure tokens.
+    """
+    CELL_LABEL_COLUMN_HEADER = "<column_header>"
+    CELL_LABEL_ROW_HEADER = "<row_header>"
+    # Misspelled name kept as the canonical member for backward compatibility.
+    CELL_LABEL_SECTION_HEADERE = "<section_header>"
+    # Correctly spelled alias: same value, so it resolves to the member above
+    # and does not add an entry when iterating over the enum.
+    CELL_LABEL_SECTION_HEADER = "<section_header>"
+    CELL_LABEL_DATA = "<data>"
+    OTSL_ECEL = "<ecel>"  # empty cell
+    OTSL_FCEL = "<fcel>"  # cell with content
+    OTSL_LCEL = "<lcel>"  # left looking cell
+    OTSL_UCEL = "<ucel>"  # up looking cell
+    OTSL_XCEL = "<xcel>"  # 2d extension cell (cross cell)
+    OTSL_NL = "<nl>"  # new line
+    OTSL_CHED = "<ched>"  # column header cell
+    OTSL_RHED = "<rhed>"  # row header cell
+    OTSL_SROW = "<srow>"  # section row cell
+    @classmethod
+    def get_special_tokens(cls):
+        """Return the values of all members, in definition order."""
+        return [token.value for token in cls]
+    @staticmethod
+    def is_known_token(label):
+        """Return True if ``label`` equals the value of one of the members."""
+        return label in TableToken.get_special_tokens()
+class DocumentToken(Enum):
+    """Special tokens of the LLM friendly representation of a document.
+
+    The members are the fixed begin/end tags; the static helpers build the
+    numbered row, column, page and location tokens.
+    """
+    BEG_DOCUMENT = "<document>"
+    END_DOCUMENT = "</document>"
+    BEG_TITLE = "<title>"
+    END_TITLE = "</title>"
+    BEG_ABSTRACT = "<abstract>"
+    END_ABSTRACT = "</abstract>"
+    BEG_DOI = "<doi>"
+    END_DOI = "</doi>"
+    BEG_DATE = "<date>"
+    END_DATE = "</date>"
+    BEG_AUTHORS = "<authors>"
+    END_AUTHORS = "</authors>"
+    BEG_AUTHOR = "<author>"
+    END_AUTHOR = "</author>"
+    BEG_AFFILIATIONS = "<affiliations>"
+    END_AFFILIATIONS = "</affiliations>"
+    BEG_AFFILIATION = "<affiliation>"
+    END_AFFILIATION = "</affiliation>"
+    BEG_HEADER = "<section-header>"
+    END_HEADER = "</section-header>"
+    BEG_TEXT = "<text>"
+    END_TEXT = "</text>"
+    BEG_PARAGRAPH = "<paragraph>"
+    END_PARAGRAPH = "</paragraph>"
+    BEG_TABLE = "<table>"
+    END_TABLE = "</table>"
+    BEG_FIGURE = "<figure>"
+    END_FIGURE = "</figure>"
+    BEG_CAPTION = "<caption>"
+    END_CAPTION = "</caption>"
+    BEG_EQUATION = "<equation>"
+    END_EQUATION = "</equation>"
+    BEG_LIST = "<list>"
+    END_LIST = "</list>"
+    BEG_LISTITEM = "<list-item>"
+    END_LISTITEM = "</list-item>"
+    BEG_LOCATION = "<location>"
+    END_LOCATION = "</location>"
+    BEG_GROUP = "<group>"
+    END_GROUP = "</group>"
+    @classmethod
+    def get_special_tokens(
+        cls,
+        max_rows: int = 100,
+        max_cols: int = 100,
+        max_pages: int = 1000,
+        page_dimension: Tuple[int, int] = (100, 100),
+    ):
+        """Return the member values followed by the numbered tokens.
+
+        Args:
+            max_rows: Highest row index (inclusive) to generate tokens for.
+            max_cols: Highest column index (inclusive) to generate tokens for.
+            max_pages: Highest page index (inclusive) to generate tokens for.
+            page_dimension: Location tokens are generated from 0 up to and
+                including the larger of the two entries.
+
+        Returns:
+            A new list of token strings.
+        """
+        special_tokens = [token.value for token in cls]
+        # Adding dynamically generated row and col tokens
+        for i in range(0, max_rows + 1):
+            special_tokens += [f"<row_{i}>", f"</row_{i}>"]
+        for i in range(0, max_cols + 1):
+            special_tokens += [f"<col_{i}>", f"</col_{i}>"]
+        for i in range(6):
+            special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
+        # FIXME: this is synonym of section header
+        for i in range(6):
+            special_tokens += [f"<subtitle-level-{i}>", f"</subtitle-level-{i}>"]
+        # Adding dynamically generated page-tokens
+        for i in range(0, max_pages + 1):
+            special_tokens += [f"<page_{i}>", f"</page_{i}>"]
+        # Adding dynamically generated location-tokens
+        for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
+            special_tokens.append(f"<loc_{i}>")
+        return special_tokens
+    @staticmethod
+    def is_known_token(label):
+        """Return True if ``label`` is in the default special-token list."""
+        return label in DocumentToken.get_special_tokens()
+    @staticmethod
+    def get_row_token(row: int, beg: bool = True) -> str:
+        """Return the begin (``beg`` true) or end token of row ``row``."""
+        # The default used to be the ``bool`` class itself, which is truthy,
+        # so omitting ``beg`` still yields the begin token.
+        return f"<row_{row}>" if beg else f"</row_{row}>"
+    @staticmethod
+    def get_col_token(col: int, beg: bool = True) -> str:
+        """Return the begin (``beg`` true) or end token of column ``col``."""
+        # Same default as get_row_token: omitting ``beg`` means begin token.
+        return f"<col_{col}>" if beg else f"</col_{col}>"
+    @staticmethod
+    def get_page_token(page: int):
+        """Return the begin token of page ``page``."""
+        return f"<page_{page}>"
+    @staticmethod
+    def get_location_token(val: float, rnorm: int = 100):
+        """Return the location token for ``val`` scaled by ``rnorm``.
+
+        ``round(rnorm * val)`` is clamped to the range ``0..rnorm``.
+        """
+        val_ = round(rnorm * val)
+        if val_ < 0:
+            return "<loc_0>"
+        if val_ > rnorm:
+            return f"<loc_{rnorm}>"
+        return f"<loc_{val_}>"
+    @staticmethod
+    def get_location(
+        bbox: Annotated[list[float], Field(min_length=4, max_length=4)],
+        page_w: float,
+        page_h: float,
+        xsize: int = 100,
+        ysize: int = 100,
+        page_i: int = -1,
+    ):
+        """Get the location string given bbox and page-dim.
+
+        Args:
+            bbox: Four coordinates ``(x0, y0, x1, y1)`` with ``x0 <= x1`` and
+                ``y0 <= y1``.
+            page_w: Page width; the x coordinates are divided by it.
+            page_h: Page height; the y coordinates are divided by it.
+            xsize: ``rnorm`` used for the two x location tokens.
+            ysize: ``rnorm`` used for the two y location tokens.
+            page_i: Page index; a page token is included unless it is ``-1``.
+
+        Returns:
+            The location tokens wrapped in the begin/end location tags.
+
+        Raises:
+            AssertionError: If the bbox coordinates are not ordered.
+        """
+        # NOTE(review): assert statements are skipped when Python runs with -O.
+        assert bbox[0] <= bbox[2], f"bbox[0]<=bbox[2] => {bbox[0]}<={bbox[2]}"
+        assert bbox[1] <= bbox[3], f"bbox[1]<=bbox[3] => {bbox[1]}<={bbox[3]}"
+        x0 = bbox[0] / page_w
+        y0 = bbox[1] / page_h
+        x1 = bbox[2] / page_w
+        y1 = bbox[3] / page_h
+        page_tok = ""
+        if page_i != -1:
+            page_tok = DocumentToken.get_page_token(page=page_i)
+        x0_tok = DocumentToken.get_location_token(val=min(x0, x1), rnorm=xsize)
+        y0_tok = DocumentToken.get_location_token(val=min(y0, y1), rnorm=ysize)
+        x1_tok = DocumentToken.get_location_token(val=max(x0, x1), rnorm=xsize)
+        y1_tok = DocumentToken.get_location_token(val=max(y0, y1), rnorm=ysize)
+        loc_str = f"{DocumentToken.BEG_LOCATION.value}"
+        loc_str += f"{page_tok}{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
+        loc_str += f"{DocumentToken.END_LOCATION.value}"
+        return loc_str

{docling_core-2.6.1.dist-info → docling_core-2.7.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling-core
-Version: 2.6.1
+Version: 2.7.0
 Summary: A python library to define and validate data types in Docling.
 Home-page: https://ds4sd.github.io/
 License: MIT
@@ -29,7 +29,7 @@ Requires-Dist: jsonref (>=1.1.0,<2.0.0)
 Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
 Requires-Dist: pandas (>=2.1.4,<3.0.0)
 Requires-Dist: pillow (>=10.3.0,<11.0.0)
-Requires-Dist: pydantic (>=2.6.0,<2.10)
+Requires-Dist: pydantic (>=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2)
 Requires-Dist: pyyaml (>=5.1,<7.0.0)
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)
 Requires-Dist: typing-extensions (>=4.12.2,<5.0.0)

{docling_core-2.6.1.dist-info → docling_core-2.7.0.dist-info}/RECORD RENAMED Viewed

@@ -21,8 +21,9 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
 docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
 docling_core/types/doc/__init__.py,sha256=bEL4zKVOG7Wxm6xQrgF58mu-Teds9aSavuEAKVNhrTU,639
 docling_core/types/doc/base.py,sha256=_ttU8QI8wXDTQRUnN5n7L6D9wYFVLSAibxlFoMbgAsk,4557
-docling_core/types/doc/document.py,sha256=8qVhet6eQtvju286zUkdOU0NXnkZ0AoOVAysMEZ3Aws,87099
+docling_core/types/doc/document.py,sha256=LXmDD0qZiB34WTWSTklcdWndetOqumMFN3yJEqifb8M,90500
 docling_core/types/doc/labels.py,sha256=A8vWP82VAeXO1rlCO0oDKo_Hb8uDeQe0myOTY3P03hk,1596
+docling_core/types/doc/tokens.py,sha256=uU_MYW_p7ypf7eYICFBvxdnVaPZ7CQnvZmbJ6oPrtEA,6134
 docling_core/types/doc/utils.py,sha256=YDOh_ZD1Y7OmCEDdCLJ_MO5K3HA67nc_acfhOK6WztU,1439
 docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
 docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
@@ -51,8 +52,8 @@ docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6
 docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
 docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
 docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
-docling_core-2.6.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
-docling_core-2.6.1.dist-info/METADATA,sha256=aHtmbajidCAFKmJiAq-sSW-rSjZhHAMsqSEfRrpYBes,5519
-docling_core-2.6.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling_core-2.6.1.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
-docling_core-2.6.1.dist-info/RECORD,,
+docling_core-2.7.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
+docling_core-2.7.0.dist-info/METADATA,sha256=ht4UM23KfXIPp2aeUjSr9AUruTANa-kSt9kDwHQyeNk,5547
+docling_core-2.7.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling_core-2.7.0.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
+docling_core-2.7.0.dist-info/RECORD,,

{docling_core-2.6.1.dist-info → docling_core-2.7.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{docling_core-2.6.1.dist-info → docling_core-2.7.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{docling_core-2.6.1.dist-info → docling_core-2.7.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

docling-core 2.6.1__py3-none-any.whl → 2.7.0__py3-none-any.whl

Potentially problematic release.

docling-core 2.6.1py3-none-any.whl → 2.7.0py3-none-any.whl