PyPI - docling-core - Versions diffs - 1.7.2__py3-none-any.whl → 2.0.1__py3-none-any.whl - Mend

docling-core 1.7.2__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling-core might be problematic. Click here for more details.

Files changed (36) hide show

docling_core/transforms/chunker/__init__.py +2 -8
docling_core/transforms/chunker/base.py +27 -40
docling_core/transforms/chunker/hierarchical_chunker.py +144 -312
docling_core/types/__init__.py +3 -18
docling_core/types/doc/__init__.py +25 -0
docling_core/types/doc/base.py +136 -451
docling_core/types/doc/document.py +1289 -559
docling_core/types/{experimental → doc}/labels.py +4 -1
docling_core/types/legacy_doc/__init__.py +6 -0
docling_core/types/legacy_doc/base.py +485 -0
docling_core/types/{doc → legacy_doc}/doc_ann.py +1 -1
docling_core/types/{doc → legacy_doc}/doc_ocr.py +1 -1
docling_core/types/{doc → legacy_doc}/doc_raw.py +1 -1
docling_core/types/legacy_doc/document.py +715 -0
docling_core/types/rec/subject.py +1 -1
docling_core/utils/generate_docs.py +82 -0
docling_core/utils/{ds_generate_jsonschema.py → generate_jsonschema.py} +4 -4
docling_core/utils/validators.py +3 -3
{docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/METADATA +17 -17
{docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/RECORD +24 -31
docling_core-2.0.1.dist-info/entry_points.txt +5 -0
docling_core/transforms/id_generator/__init__.py +0 -12
docling_core/transforms/id_generator/base.py +0 -30
docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -27
docling_core/transforms/id_generator/uuid_generator.py +0 -34
docling_core/transforms/metadata_extractor/__init__.py +0 -13
docling_core/transforms/metadata_extractor/base.py +0 -59
docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +0 -59
docling_core/types/experimental/__init__.py +0 -30
docling_core/types/experimental/base.py +0 -167
docling_core/types/experimental/document.py +0 -1192
docling_core/utils/ds_generate_docs.py +0 -144
docling_core-1.7.2.dist-info/entry_points.txt +0 -5
docling_core/types/{doc → legacy_doc}/tokens.py +0 -0
{docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/LICENSE +0 -0
{docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/WHEEL +0 -0

docling_core/types/doc/base.py CHANGED Viewed

@@ -1,485 +1,170 @@
-#
-# Copyright IBM Corp. 2024 - 2024
-# SPDX-License-Identifier: MIT
-#
+"""Models for the base data types."""
-"""Define common models across CCS objects."""
-from typing import Annotated, List, Literal, Optional, Union
+import copy
+from enum import Enum
+from typing import Tuple
-import pandas as pd
-from pydantic import BaseModel, Field, PositiveInt, StrictStr
+from pydantic import BaseModel
-from docling_core.search.mapping import es_field
-from docling_core.types.doc.tokens import DocumentToken
-from docling_core.utils.alias import AliasModel
-CellData = tuple[float, float, float, float, str, str]
+class CoordOrigin(str, Enum):
+    """CoordOrigin."""
-CellHeader = tuple[
-    Literal["x0"],
-    Literal["y0"],
-    Literal["x1"],
-    Literal["y1"],
-    Literal["font"],
-    Literal["text"],
-]
+    TOPLEFT = "TOPLEFT"
+    BOTTOMLEFT = "BOTTOMLEFT"
-BoundingBox = Annotated[list[float], Field(min_length=4, max_length=4)]
-Span = Annotated[list[int], Field(min_length=2, max_length=2)]
+class Size(BaseModel):
+    """Size."""
+    width: float = 0.0
+    height: float = 0.0
-class CellsContainer(BaseModel):
-    """Cell container."""
+    def as_tuple(self):
+        """as_tuple."""
+        return (self.width, self.height)
-    data: Optional[list[CellData]] = None
-    header: CellHeader = ("x0", "y0", "x1", "y1", "font", "text")
+class BoundingBox(BaseModel):
+    """BoundingBox."""
-class S3Resource(BaseModel):
-    """Resource in a cloud object storage."""
+    l: float  # left
+    t: float  # top
+    r: float  # right
+    b: float  # bottom
-    mime: str
-    path: str
-    page: Optional[PositiveInt] = None
+    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
+    @property
+    def width(self):
+        """width."""
+        return self.r - self.l
-class S3Data(AliasModel):
-    """Data object in a cloud object storage."""
+    @property
+    def height(self):
+        """height."""
+        return abs(self.t - self.b)
-    pdf_document: Optional[list[S3Resource]] = Field(default=None, alias="pdf-document")
-    pdf_pages: Optional[list[S3Resource]] = Field(default=None, alias="pdf-pages")
-    pdf_images: Optional[list[S3Resource]] = Field(default=None, alias="pdf-images")
-    json_document: Optional[S3Resource] = Field(default=None, alias="json-document")
-    json_meta: Optional[S3Resource] = Field(default=None, alias="json-meta")
-    glm_json_document: Optional[S3Resource] = Field(
-        default=None, alias="glm-json-document"
-    )
-    figures: Optional[list[S3Resource]] = None
+    def scaled(self, scale: float) -> "BoundingBox":
+        """scaled.
+        :param scale: float:
-class S3Reference(AliasModel):
-    """References an s3 resource."""
+        """
+        out_bbox = copy.deepcopy(self)
+        out_bbox.l *= scale
+        out_bbox.r *= scale
+        out_bbox.t *= scale
+        out_bbox.b *= scale
-    ref_s3_data: StrictStr = Field(
-        alias="__ref_s3_data", examples=["#/_s3_data/figures/0"]
-    )
+        return out_bbox
+    def normalized(self, page_size: Size) -> "BoundingBox":
+        """normalized.
-class Prov(AliasModel):
-    """Provenance."""
+        :param page_size: Size:
-    bbox: BoundingBox
-    page: PositiveInt
-    span: Span
-    ref_s3_data: Optional[StrictStr] = Field(
-        default=None, alias="__ref_s3_data", json_schema_extra=es_field(suppress=True)
-    )
+        """
+        out_bbox = copy.deepcopy(self)
+        out_bbox.l /= page_size.width
+        out_bbox.r /= page_size.width
+        out_bbox.t /= page_size.height
+        out_bbox.b /= page_size.height
+        return out_bbox
-class BoundingBoxContainer(BaseModel):
-    """Bounding box container."""
+    def as_tuple(self):
+        """as_tuple."""
+        if self.coord_origin == CoordOrigin.TOPLEFT:
+            return (self.l, self.t, self.r, self.b)
+        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return (self.l, self.b, self.r, self.t)
-    min: BoundingBox
-    max: BoundingBox
+    @classmethod
+    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
+        """from_tuple.
+        :param coord: Tuple[float:
+        :param ...]:
+        :param origin: CoordOrigin:
-class BitmapObject(AliasModel):
-    """Bitmap object."""
+        """
+        if origin == CoordOrigin.TOPLEFT:
+            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
+            if r < l:
+                l, r = r, l
+            if b < t:
+                b, t = t, b
-    obj_type: str = Field(alias="type")
-    bounding_box: BoundingBoxContainer = Field(
-        json_schema_extra=es_field(suppress=True)
-    )
-    prov: Prov
+            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
+        elif origin == CoordOrigin.BOTTOMLEFT:
+            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
+            if r < l:
+                l, r = r, l
+            if b > t:
+                b, t = t, b
+            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
-class PageDimensions(BaseModel):
-    """Page dimensions."""
+    def area(self) -> float:
+        """area."""
+        area = (self.r - self.l) * (self.b - self.t)
+        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            area = -area
+        return area
-    height: float
-    page: PositiveInt
-    width: float
-class TableCell(AliasModel):
-    """Table cell."""
-    bbox: Optional[BoundingBox] = None
-    spans: Optional[list[Span]] = None
-    text: str = Field(json_schema_extra=es_field(term_vector="with_positions_offsets"))
-    obj_type: str = Field(alias="type")
-class GlmTableCell(TableCell):
-    """Glm Table cell."""
-    col: Optional[int] = Field(default=None, json_schema_extra=es_field(suppress=True))
-    col_header: bool = Field(
-        default=False, alias="col-header", json_schema_extra=es_field(suppress=True)
-    )
-    col_span: Optional[Span] = Field(
-        default=None, alias="col-span", json_schema_extra=es_field(suppress=True)
-    )
-    row: Optional[int] = Field(default=None, json_schema_extra=es_field(suppress=True))
-    row_header: bool = Field(
-        default=False, alias="row-header", json_schema_extra=es_field(suppress=True)
-    )
-    row_span: Optional[Span] = Field(
-        default=None, alias="row-span", json_schema_extra=es_field(suppress=True)
-    )
-class BaseCell(AliasModel):
-    """Base cell."""
-    prov: Optional[list[Prov]] = None
-    text: Optional[str] = Field(
-        default=None, json_schema_extra=es_field(term_vector="with_positions_offsets")
-    )
-    obj_type: str = Field(
-        alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    def get_location_tokens(
-        self,
-        new_line: str,
-        page_w: float,
-        page_h: float,
-        xsize: int = 100,
-        ysize: int = 100,
-        add_page_index: bool = True,
-    ) -> str:
-        """Get the location string for the BaseCell."""
-        if self.prov is None:
-            return ""
-        location = ""
-        for prov in self.prov:
-            page_i = -1
-            if add_page_index:
-                page_i = prov.page
-            loc_str = DocumentToken.get_location(
-                bbox=prov.bbox,
-                page_w=page_w,
-                page_h=page_h,
-                xsize=xsize,
-                ysize=ysize,
-                page_i=page_i,
-            )
-            location += f"{loc_str}{new_line}"
-        return location
-class Table(BaseCell):
-    """Table."""
-    num_cols: int = Field(alias="#-cols")
-    num_rows: int = Field(alias="#-rows")
-    data: Optional[list[list[Union[GlmTableCell, TableCell]]]] = None
-    model: Optional[str] = None
-    # FIXME: we need to check why we have bounding_box (this should be in prov)
-    bounding_box: Optional[BoundingBoxContainer] = Field(
-        default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
-    )
-    def _get_tablecell_span(self, cell: TableCell, ix: int):
-        if cell.spans is None:
-            span = set()
-        else:
-            span = set([s[ix] for s in cell.spans])
-        if len(span) == 0:
-            return 1, None, None
-        return len(span), min(span), max(span)
-    def export_to_dataframe(self) -> pd.DataFrame:
-        """Export the table as a Pandas DataFrame."""
-        if self.data is None or self.num_rows == 0 or self.num_cols == 0:
-            return pd.DataFrame()
-        # Count how many rows are column headers
-        num_headers = 0
-        for i, row in enumerate(self.data):
-            if len(row) == 0:
-                raise RuntimeError(f"Invalid table. {len(row)=} but {self.num_cols=}.")
-            any_header = False
-            for cell in row:
-                if cell.obj_type == "col_header":
-                    any_header = True
-                    break
-            if any_header:
-                num_headers += 1
-            else:
-                break
-        # Create the column names from all col_headers
-        columns: Optional[List[str]] = None
-        if num_headers > 0:
-            columns = ["" for _ in range(self.num_cols)]
-            for i in range(num_headers):
-                for j, cell in enumerate(self.data[i]):
-                    col_name = cell.text
-                    if columns[j] != "":
-                        col_name = f".{col_name}"
-                    columns[j] += col_name
-        # Create table data
-        table_data = [[cell.text for cell in row] for row in self.data[num_headers:]]
-        # Create DataFrame
-        df = pd.DataFrame(table_data, columns=columns)
-        return df
-    def export_to_html(self) -> str:
-        """Export the table as html."""
-        body = ""
-        nrows = self.num_rows
-        ncols = self.num_cols
-        if self.data is None:
-            return ""
-        for i in range(nrows):
-            body += "<tr>"
-            for j in range(ncols):
-                cell: TableCell = self.data[i][j]
-                rowspan, rowstart, rowend = self._get_tablecell_span(cell, 0)
-                colspan, colstart, colend = self._get_tablecell_span(cell, 1)
-                if rowstart is not None and rowstart != i:
-                    continue
-                if colstart is not None and colstart != j:
-                    continue
-                if rowstart is None:
-                    rowstart = i
-                if colstart is None:
-                    colstart = j
-                content = cell.text.strip()
-                label = cell.obj_type
-                celltag = "td"
-                if label in ["row_header", "row_multi_header", "row_title"]:
-                    pass
-                elif label in ["col_header", "col_multi_header"]:
-                    celltag = "th"
-                opening_tag = f"{celltag}"
-                if rowspan > 1:
-                    opening_tag += f' rowspan="{rowspan}"'
-                if colspan > 1:
-                    opening_tag += f' colspan="{colspan}"'
-                body += f"<{opening_tag}>{content}</{celltag}>"
-            body += "</tr>"
-        body = f"<table>{body}</table>"
-        return body
-    def export_to_document_tokens(
-        self,
-        new_line: str = "\n",
-        page_w: float = 0.0,
-        page_h: float = 0.0,
-        xsize: int = 100,
-        ysize: int = 100,
-        add_location: bool = True,
-        add_caption: bool = True,
-        add_content: bool = True,
-        add_cell_location: bool = True,
-        add_cell_label: bool = True,
-        add_cell_text: bool = True,
-        add_page_index: bool = True,
-    ):
-        """Export table to document tokens format."""
-        body = f"{DocumentToken.BEG_TABLE.value}{new_line}"
-        if add_location:
-            body += self.get_location_tokens(
-                new_line=new_line,
-                page_w=page_w,
-                page_h=page_h,
-                xsize=xsize,
-                ysize=ysize,
-                add_page_index=add_page_index,
-            )
-        if add_caption and self.text is not None and len(self.text) > 0:
-            body += f"{DocumentToken.BEG_CAPTION.value}"
-            body += f"{self.text.strip()}"
-            body += f"{DocumentToken.END_CAPTION.value}"
-            body += f"{new_line}"
-        if add_content and self.data is not None and len(self.data) > 0:
-            for i, row in enumerate(self.data):
-                body += f"<row_{i}>"
-                for j, col in enumerate(row):
-                    text = ""
-                    if add_cell_text:
-                        text = col.text.strip()
-                    cell_loc = ""
-                    if (
-                        col.bbox is not None
-                        and add_cell_location
-                        and add_page_index
-                        and self.prov is not None
-                        and len(self.prov) > 0
-                    ):
-                        cell_loc = DocumentToken.get_location(
-                            bbox=col.bbox,
-                            page_w=page_w,
-                            page_h=page_h,
-                            xsize=xsize,
-                            ysize=ysize,
-                            page_i=self.prov[0].page,
-                        )
-                    elif (
-                        col.bbox is not None
-                        and add_cell_location
-                        and not add_page_index
-                    ):
-                        cell_loc = DocumentToken.get_location(
-                            bbox=col.bbox,
-                            page_w=page_w,
-                            page_h=page_h,
-                            xsize=xsize,
-                            ysize=ysize,
-                            page_i=-1,
-                        )
-                    cell_label = ""
-                    if (
-                        add_cell_label
-                        and col.obj_type is not None
-                        and len(col.obj_type) > 0
-                    ):
-                        cell_label = f"<{col.obj_type}>"
-                    body += f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>"
-                body += f"</row_{i}>{new_line}"
-        body += f"{DocumentToken.END_TABLE.value}{new_line}"
-        return body
-# FIXME: let's add some figure specific data-types later
-class Figure(BaseCell):
-    """Figure."""
-    # FIXME: we need to check why we have bounding_box (this should be in prov)
-    bounding_box: Optional[BoundingBoxContainer] = Field(
-        default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
-    )
-    def export_to_document_tokens(
-        self,
-        new_line: str = "\n",
-        page_w: float = 0.0,
-        page_h: float = 0.0,
-        xsize: int = 100,
-        ysize: int = 100,
-        add_location: bool = True,
-        add_caption: bool = True,
-        add_content: bool = True,  # not used at the moment
-        add_page_index: bool = True,
-    ):
-        """Export figure to document tokens format."""
-        body = f"{DocumentToken.BEG_FIGURE.value}{new_line}"
-        if add_location:
-            body += self.get_location_tokens(
-                new_line=new_line,
-                page_w=page_w,
-                page_h=page_h,
-                xsize=xsize,
-                ysize=ysize,
-                add_page_index=add_page_index,
+    def intersection_area_with(self, other: "BoundingBox") -> float:
+        """intersection_area_with.
+        :param other: "BoundingBox":
+        """
+        # Calculate intersection coordinates
+        left = max(self.l, other.l)
+        top = max(self.t, other.t)
+        right = min(self.r, other.r)
+        bottom = min(self.b, other.b)
+        # Calculate intersection dimensions
+        width = right - left
+        height = bottom - top
+        # If the bounding boxes do not overlap, width or height will be negative
+        if width <= 0 or height <= 0:
+            return 0.0
+        return width * height
+    def to_bottom_left_origin(self, page_height) -> "BoundingBox":
+        """to_bottom_left_origin.
+        :param page_height:
+        """
+        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return self
+        elif self.coord_origin == CoordOrigin.TOPLEFT:
+            return BoundingBox(
+                l=self.l,
+                r=self.r,
+                t=page_height - self.t,
+                b=page_height - self.b,
+                coord_origin=CoordOrigin.BOTTOMLEFT,
             )
-        if add_caption and self.text is not None and len(self.text) > 0:
-            body += f"{DocumentToken.BEG_CAPTION.value}"
-            body += f"{self.text.strip()}"
-            body += f"{DocumentToken.END_CAPTION.value}"
-            body += f"{new_line}"
-        body += f"{DocumentToken.END_FIGURE.value}{new_line}"
-        return body
-class BaseText(BaseCell):
-    """Base model for text objects."""
-    # FIXME: do we need these ???
-    name: Optional[StrictStr] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    font: Optional[str] = None
-    def export_to_document_tokens(
-        self,
-        new_line: str = "\n",
-        page_w: float = 0.0,
-        page_h: float = 0.0,
-        xsize: int = 100,
-        ysize: int = 100,
-        add_location: bool = True,
-        add_content: bool = True,
-        add_page_index: bool = True,
-    ):
-        """Export text element to document tokens format."""
-        body = f"<{self.obj_type}>"
-        assert DocumentToken.is_known_token(
-            body
-        ), f"failed DocumentToken.is_known_token({body})"
-        if add_location:
-            body += self.get_location_tokens(
-                new_line="",
-                page_w=page_w,
-                page_h=page_h,
-                xsize=xsize,
-                ysize=ysize,
-                add_page_index=add_page_index,
+    def to_top_left_origin(self, page_height):
+        """to_top_left_origin.
+        :param page_height:
+        """
+        if self.coord_origin == CoordOrigin.TOPLEFT:
+            return self
+        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return BoundingBox(
+                l=self.l,
+                r=self.r,
+                t=page_height - self.t,  # self.b
+                b=page_height - self.b,  # self.t
+                coord_origin=CoordOrigin.TOPLEFT,
             )
-        if add_content and self.text is not None:
-            body += self.text.strip()
-        body += f"</{self.obj_type}>{new_line}"
-        return body
-class ListItem(BaseText):
-    """List item."""
-    identifier: str
-class Ref(AliasModel):
-    """Reference."""
-    name: str
-    obj_type: str = Field(alias="type")
-    ref: str = Field(alias="$ref")
-class PageReference(BaseModel):
-    """Page reference."""
-    hash: str = Field(json_schema_extra=es_field(type="keyword", ignore_above=8191))
-    model: str = Field(json_schema_extra=es_field(suppress=True))
-    page: PositiveInt = Field(json_schema_extra=es_field(type="short"))

docling-core 1.7.2__py3-none-any.whl → 2.0.1__py3-none-any.whl

Potentially problematic release.

docling-core 1.7.2__py3-none-any.whl → 2.0.1__py3-none-any.whl