docling-core 2.6.0__py3-none-any.whl → 2.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of docling-core has been flagged as potentially problematic.

@@ -37,8 +37,8 @@ from docling_core.types.base import _JSON_POINTER_REGEX
 from docling_core.types.doc import BoundingBox, Size
 from docling_core.types.doc.base import ImageRefMode
 from docling_core.types.doc.labels import DocItemLabel, GroupLabel
-from docling_core.types.legacy_doc.tokens import DocumentToken
-from docling_core.utils.file import relative_path
+from docling_core.types.doc.tokens import DocumentToken, TableToken
+from docling_core.types.doc.utils import relative_path
 
 Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
 LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
@@ -1008,7 +1008,6 @@ class TableItem(FloatingItem):
                 DeprecationWarning,
             )
 
-        body = ""
         nrows = self.data.num_rows
         ncols = self.data.num_cols
 
@@ -1065,6 +1064,99 @@ class TableItem(FloatingItem):
 
         return body
 
+    def export_to_otsl(
+        self,
+        doc: "DoclingDocument",
+        add_cell_location: bool = True,
+        add_cell_text: bool = True,
+        xsize: int = 100,
+        ysize: int = 100,
+    ) -> str:
+        """Export the table as OTSL."""
+        # Possible OTSL tokens...
+        #
+        # Empty and full cells:
+        # "ecel", "fcel"
+        #
+        # Cell spans (horisontal, vertical, 2d):
+        # "lcel", "ucel", "xcel"
+        #
+        # New line:
+        # "nl"
+        #
+        # Headers (column, row, section row):
+        # "ched", "rhed", "srow"
+
+        body = []
+        nrows = self.data.num_rows
+        ncols = self.data.num_cols
+        if len(self.data.table_cells) == 0:
+            return ""
+
+        page_no = 0
+        if len(self.prov) > 0:
+            page_no = self.prov[0].page_no
+
+        for i in range(nrows):
+            for j in range(ncols):
+                cell: TableCell = self.data.grid[i][j]
+                content = cell.text.strip()
+                rowspan, rowstart = (
+                    cell.row_span,
+                    cell.start_row_offset_idx,
+                )
+                colspan, colstart = (
+                    cell.col_span,
+                    cell.start_col_offset_idx,
+                )
+
+                if len(doc.pages.keys()):
+                    page_w, page_h = doc.pages[page_no].size.as_tuple()
+                cell_loc = ""
+                if cell.bbox is not None:
+                    cell_loc = DocumentToken.get_location(
+                        bbox=cell.bbox.to_bottom_left_origin(page_h).as_tuple(),
+                        page_w=page_w,
+                        page_h=page_h,
+                        xsize=xsize,
+                        ysize=ysize,
+                        page_i=page_no,
+                    )
+
+                if rowstart == i and colstart == j:
+                    if len(content) > 0:
+                        if cell.column_header:
+                            body.append(str(TableToken.OTSL_CHED.value))
+                        elif cell.row_header:
+                            body.append(str(TableToken.OTSL_RHED.value))
+                        elif cell.row_section:
+                            body.append(str(TableToken.OTSL_SROW.value))
+                        else:
+                            body.append(str(TableToken.OTSL_FCEL.value))
+                        if add_cell_location:
+                            body.append(str(cell_loc))
+                        if add_cell_text:
+                            body.append(str(content))
+                    else:
+                        body.append(str(TableToken.OTSL_ECEL.value))
+                else:
+                    add_cross_cell = False
+                    if rowstart != i:
+                        if colspan == 1:
+                            body.append(str(TableToken.OTSL_UCEL.value))
+                        else:
+                            add_cross_cell = True
+                    if colstart != j:
+                        if rowspan == 1:
+                            body.append(str(TableToken.OTSL_LCEL.value))
+                        else:
+                            add_cross_cell = True
+                    if add_cross_cell:
+                        body.append(str(TableToken.OTSL_XCEL.value))
+            body.append(str(TableToken.OTSL_NL.value))
+        body_str = "".join(body)
+        return body_str
+
     def export_to_document_tokens(
         self,
         doc: "DoclingDocument",
@@ -0,0 +1,202 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Tokens used in the docling document model."""
+
+from enum import Enum
+from typing import Annotated, Tuple
+
+from pydantic import Field
+
+
+class TableToken(Enum):
+    """Class to represent an LLM friendly representation of a Table."""
+
+    CELL_LABEL_COLUMN_HEADER = "<column_header>"
+    CELL_LABEL_ROW_HEADER = "<row_header>"
+    CELL_LABEL_SECTION_HEADERE = "<section_header>"
+    CELL_LABEL_DATA = "<data>"
+
+    OTSL_ECEL = "<ecel>"  # empty cell
+    OTSL_FCEL = "<fcel>"  # cell with content
+    OTSL_LCEL = "<lcel>"  # left looking cell,
+    OTSL_UCEL = "<ucel>"  # up looking cell,
+    OTSL_XCEL = "<xcel>"  # 2d extension cell (cross cell),
+    OTSL_NL = "<nl>"  # new line,
+    OTSL_CHED = "<ched>"  # - column header cell,
+    OTSL_RHED = "<rhed>"  # - row header cell,
+    OTSL_SROW = "<srow>"  # - section row cell
+
+    @classmethod
+    def get_special_tokens(cls):
+        """Function to get all special document tokens."""
+        special_tokens = [token.value for token in cls]
+        return special_tokens
+
+    @staticmethod
+    def is_known_token(label):
+        """Function to check if label is in tokens."""
+        return label in TableToken.get_special_tokens()
+
+
+class DocumentToken(Enum):
+    """Class to represent an LLM friendly representation of a Document."""
+
+    BEG_DOCUMENT = "<document>"
+    END_DOCUMENT = "</document>"
+
+    BEG_TITLE = "<title>"
+    END_TITLE = "</title>"
+
+    BEG_ABSTRACT = "<abstract>"
+    END_ABSTRACT = "</abstract>"
+
+    BEG_DOI = "<doi>"
+    END_DOI = "</doi>"
+    BEG_DATE = "<date>"
+    END_DATE = "</date>"
+
+    BEG_AUTHORS = "<authors>"
+    END_AUTHORS = "</authors>"
+    BEG_AUTHOR = "<author>"
+    END_AUTHOR = "</author>"
+
+    BEG_AFFILIATIONS = "<affiliations>"
+    END_AFFILIATIONS = "</affiliations>"
+    BEG_AFFILIATION = "<affiliation>"
+    END_AFFILIATION = "</affiliation>"
+
+    BEG_HEADER = "<section-header>"
+    END_HEADER = "</section-header>"
+    BEG_TEXT = "<text>"
+    END_TEXT = "</text>"
+    BEG_PARAGRAPH = "<paragraph>"
+    END_PARAGRAPH = "</paragraph>"
+    BEG_TABLE = "<table>"
+    END_TABLE = "</table>"
+    BEG_FIGURE = "<figure>"
+    END_FIGURE = "</figure>"
+    BEG_CAPTION = "<caption>"
+    END_CAPTION = "</caption>"
+    BEG_EQUATION = "<equation>"
+    END_EQUATION = "</equation>"
+    BEG_LIST = "<list>"
+    END_LIST = "</list>"
+    BEG_LISTITEM = "<list-item>"
+    END_LISTITEM = "</list-item>"
+
+    BEG_LOCATION = "<location>"
+    END_LOCATION = "</location>"
+    BEG_GROUP = "<group>"
+    END_GROUP = "</group>"
+
+    @classmethod
+    def get_special_tokens(
+        cls,
+        max_rows: int = 100,
+        max_cols: int = 100,
+        max_pages: int = 1000,
+        page_dimension: Tuple[int, int] = (100, 100),
+    ):
+        """Function to get all special document tokens."""
+        special_tokens = [token.value for token in cls]
+
+        # Adding dynamically generated row and col tokens
+        for i in range(0, max_rows + 1):
+            special_tokens += [f"<row_{i}>", f"</row_{i}>"]
+
+        for i in range(0, max_cols + 1):
+            special_tokens += [f"<col_{i}>", f"</col_{i}>"]
+
+        for i in range(6):
+            special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
+
+        # FIXME: this is synonym of section header
+        for i in range(6):
+            special_tokens += [f"<subtitle-level-{i}>", f"</subtitle-level-{i}>"]
+
+        # Adding dynamically generated page-tokens
+        for i in range(0, max_pages + 1):
+            special_tokens.append(f"<page_{i}>")
+            special_tokens.append(f"</page_{i}>")
+
+        # Adding dynamically generated location-tokens
+        for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
+            special_tokens.append(f"<loc_{i}>")
+
+        return special_tokens
+
+    @staticmethod
+    def is_known_token(label):
+        """Function to check if label is in tokens."""
+        return label in DocumentToken.get_special_tokens()
+
+    @staticmethod
+    def get_row_token(row: int, beg=bool) -> str:
+        """Function to get page tokens."""
+        if beg:
+            return f"<row_{row}>"
+        else:
+            return f"</row_{row}>"
+
+    @staticmethod
+    def get_col_token(col: int, beg=bool) -> str:
+        """Function to get page tokens."""
+        if beg:
+            return f"<col_{col}>"
+        else:
+            return f"</col_{col}>"
+
+    @staticmethod
+    def get_page_token(page: int):
+        """Function to get page tokens."""
+        return f"<page_{page}>"
+
+    @staticmethod
+    def get_location_token(val: float, rnorm: int = 100):
+        """Function to get location tokens."""
+        val_ = round(rnorm * val)
+
+        if val_ < 0:
+            return "<loc_0>"
+
+        if val_ > rnorm:
+            return f"<loc_{rnorm}>"
+
+        return f"<loc_{val_}>"
+
+    @staticmethod
+    def get_location(
+        # bbox: Tuple[float, float, float, float],
+        bbox: Annotated[list[float], Field(min_length=4, max_length=4)],
+        page_w: float,
+        page_h: float,
+        xsize: int = 100,
+        ysize: int = 100,
+        page_i: int = -1,
+    ):
+        """Get the location string give bbox and page-dim."""
+        assert bbox[0] <= bbox[2], f"bbox[0]<=bbox[2] => {bbox[0]}<={bbox[2]}"
+        assert bbox[1] <= bbox[3], f"bbox[1]<=bbox[3] => {bbox[1]}<={bbox[3]}"
+
+        x0 = bbox[0] / page_w
+        y0 = bbox[1] / page_h
+        x1 = bbox[2] / page_w
+        y1 = bbox[3] / page_h
+
+        page_tok = ""
+        if page_i != -1:
+            page_tok = DocumentToken.get_page_token(page=page_i)
+
+        x0_tok = DocumentToken.get_location_token(val=min(x0, x1), rnorm=xsize)
+        y0_tok = DocumentToken.get_location_token(val=min(y0, y1), rnorm=ysize)
+        x1_tok = DocumentToken.get_location_token(val=max(x0, x1), rnorm=xsize)
+        y1_tok = DocumentToken.get_location_token(val=max(y0, y1), rnorm=ysize)
+
+        loc_str = f"{DocumentToken.BEG_LOCATION.value}"
+        loc_str += f"{page_tok}{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
+        loc_str += f"{DocumentToken.END_LOCATION.value}"
+
+        return loc_str
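To make the token scheme concrete, here is an illustrative call to DocumentToken.get_location() with made-up page dimensions and bounding box (bottom-left origin, as the method expects); the expected output follows from the normalization code shown above:

from docling_core.types.doc.tokens import DocumentToken

loc = DocumentToken.get_location(
    bbox=[72.0, 700.0, 540.0, 720.0],  # x0, y0, x1, y1 on a 612x792 pt page (made up)
    page_w=612.0,
    page_h=792.0,
    xsize=100,
    ysize=100,
    page_i=1,
)
print(loc)
# <location><page_1><loc_12><loc_88><loc_88><loc_91></location>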
@@ -0,0 +1,48 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Utils for document types."""
+
+from pathlib import Path
+
+
+def relative_path(src: Path, target: Path) -> Path:
+    """Compute the relative path from `src` to `target`.
+
+    Args:
+        src (str | Path): The source directory or file path (must be absolute).
+        target (str | Path): The target directory or file path (must be absolute).
+
+    Returns:
+        Path: The relative path from `src` to `target`.
+
+    Raises:
+        ValueError: If either `src` or `target` is not an absolute path.
+    """
+    src = Path(src).resolve()
+    target = Path(target).resolve()
+
+    # Ensure both paths are absolute
+    if not src.is_absolute():
+        raise ValueError(f"The source path must be absolute: {src}")
+    if not target.is_absolute():
+        raise ValueError(f"The target path must be absolute: {target}")
+
+    # Find the common ancestor
+    common_parts = []
+    for src_part, target_part in zip(src.parts, target.parts):
+        if src_part == target_part:
+            common_parts.append(src_part)
+        else:
+            break
+
+    # Determine the path to go up from src to the common ancestor
+    up_segments = [".."] * (len(src.parts) - len(common_parts))
+
+    # Add the path from the common ancestor to the target
+    down_segments = target.parts[len(common_parts) :]
+
+    # Combine and return the result
+    return Path(*up_segments, *down_segments)
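A quick illustration of the relocated helper, using made-up absolute POSIX paths (on Windows, Path.resolve() would also prepend a drive letter):

from pathlib import Path

from docling_core.types.doc.utils import relative_path

# From an image directory back to a sibling assets folder.
rel = relative_path(
    Path("/data/docs/report/images"),
    Path("/data/docs/assets/logo.png"),
)
print(rel)  # ../../assets/logo.png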
@@ -15,6 +15,7 @@ import requests
 from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
 from typing_extensions import deprecated
 
+from docling_core.types.doc.utils import relative_path  # noqa
 from docling_core.types.io import DocumentStream
 
 
@@ -168,43 +169,3 @@ def resolve_file_source(
         source=source,
         headers=headers,
     )
-
-
-def relative_path(src: Path, target: Path) -> Path:
-    """Compute the relative path from `src` to `target`.
-
-    Args:
-        src (str | Path): The source directory or file path (must be absolute).
-        target (str | Path): The target directory or file path (must be absolute).
-
-    Returns:
-        Path: The relative path from `src` to `target`.
-
-    Raises:
-        ValueError: If either `src` or `target` is not an absolute path.
-    """
-    src = Path(src).resolve()
-    target = Path(target).resolve()
-
-    # Ensure both paths are absolute
-    if not src.is_absolute():
-        raise ValueError(f"The source path must be absolute: {src}")
-    if not target.is_absolute():
-        raise ValueError(f"The target path must be absolute: {target}")
-
-    # Find the common ancestor
-    common_parts = []
-    for src_part, target_part in zip(src.parts, target.parts):
-        if src_part == target_part:
-            common_parts.append(src_part)
-        else:
-            break
-
-    # Determine the path to go up from src to the common ancestor
-    up_segments = [".."] * (len(src.parts) - len(common_parts))
-
-    # Add the path from the common ancestor to the target
-    down_segments = target.parts[len(common_parts) :]
-
-    # Combine and return the result
-    return Path(*up_segments, *down_segments)
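Although the function body was removed from docling_core/utils/file.py, the `# noqa` re-export added in the hunk above suggests the old import path keeps working. A small sanity check; this is an assumption based on the diff, not verified against the released wheel:

# Both paths are expected to resolve to the same function object in 2.7.0,
# because utils/file.py now re-exports it (assumption from the diff above).
from docling_core.types.doc.utils import relative_path as new_relative_path
from docling_core.utils.file import relative_path as old_relative_path

assert new_relative_path is old_relative_path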
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling-core
-Version: 2.6.0
+Version: 2.7.0
 Summary: A python library to define and validate data types in Docling.
 Home-page: https://ds4sd.github.io/
 License: MIT
@@ -29,7 +29,7 @@ Requires-Dist: jsonref (>=1.1.0,<2.0.0)
 Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
 Requires-Dist: pandas (>=2.1.4,<3.0.0)
 Requires-Dist: pillow (>=10.3.0,<11.0.0)
-Requires-Dist: pydantic (>=2.6.0,<2.10)
+Requires-Dist: pydantic (>=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2)
 Requires-Dist: pyyaml (>=5.1,<7.0.0)
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)
 Requires-Dist: typing-extensions (>=4.12.2,<5.0.0)
@@ -21,8 +21,10 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
 docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
 docling_core/types/doc/__init__.py,sha256=bEL4zKVOG7Wxm6xQrgF58mu-Teds9aSavuEAKVNhrTU,639
 docling_core/types/doc/base.py,sha256=_ttU8QI8wXDTQRUnN5n7L6D9wYFVLSAibxlFoMbgAsk,4557
-docling_core/types/doc/document.py,sha256=K6ixUeB0vyrnd3_ljM0Ed_8JBdltLPCsrGz7IoLgjUI,87094
+docling_core/types/doc/document.py,sha256=LXmDD0qZiB34WTWSTklcdWndetOqumMFN3yJEqifb8M,90500
 docling_core/types/doc/labels.py,sha256=A8vWP82VAeXO1rlCO0oDKo_Hb8uDeQe0myOTY3P03hk,1596
+docling_core/types/doc/tokens.py,sha256=uU_MYW_p7ypf7eYICFBvxdnVaPZ7CQnvZmbJ6oPrtEA,6134
+docling_core/types/doc/utils.py,sha256=YDOh_ZD1Y7OmCEDdCLJ_MO5K3HA67nc_acfhOK6WztU,1439
 docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
 docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
 docling_core/types/io/__init__.py,sha256=7QYvFRaDE0AzBg8e7tvsVNlLBbCbAbQ9rP2TU8aXR1k,350
@@ -45,13 +47,13 @@ docling_core/types/rec/statement.py,sha256=YwcV4CbVaAbzNwh14yJ_6Py3Ww0XnUJrEEUiK
 docling_core/types/rec/subject.py,sha256=PRCERGTMs4YhR3_Ne6jogkm41zYg8uUWb1yFpM7atm4,2572
 docling_core/utils/__init__.py,sha256=VauNNpWRHG0_ISKrsy5-gTxicrdQZSau6qMfuMl3iqk,120
 docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,874
-docling_core/utils/file.py,sha256=B1Iu8buqk_Yz4bhrGf7NyFIiYlsa_MC37vZLwQHqKLU,6876
+docling_core/utils/file.py,sha256=GzX0pclvewwPoqHJSaVUuULzSJwJgkCUwgKgJ7G5ohQ,5628
 docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6LZDtC8,2290
 docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
 docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
 docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
-docling_core-2.6.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
-docling_core-2.6.0.dist-info/METADATA,sha256=LhnsqU5AgndZllazTDXe_acmPWQ6NuMuH_b6-d4K1gM,5519
-docling_core-2.6.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling_core-2.6.0.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
-docling_core-2.6.0.dist-info/RECORD,,
+docling_core-2.7.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
+docling_core-2.7.0.dist-info/METADATA,sha256=ht4UM23KfXIPp2aeUjSr9AUruTANa-kSt9kDwHQyeNk,5547
+docling_core-2.7.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling_core-2.7.0.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
+docling_core-2.7.0.dist-info/RECORD,,