docling-core 1.6.3__py3-none-any.whl → 1.7.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Note: this version of docling-core has been flagged as potentially problematic.

@@ -4,26 +4,48 @@
 #
 
 """Define base classes for chunking."""
+import re
 from abc import ABC, abstractmethod
-from typing import Iterator, Optional
+from typing import Final, Iterator, Optional
 
-from pydantic import BaseModel
+from pydantic import BaseModel, Field, field_validator
 
 from docling_core.types import BoundingBox, Document
+from docling_core.types.base import _JSON_POINTER_REGEX
+
+# (subset of) JSONPath format, e.g. "$.main-text[84]" (for migration purposes)
+_DEPRECATED_JSON_PATH_PATTERN: Final = re.compile(r"^\$\.([\w-]+)\[(\d+)\]$")
+
+
+def _create_path(pos: int, path_prefix: str = "main-text") -> str:
+    return f"#/{path_prefix}/{pos}"
 
 
 class Chunk(BaseModel):
     """Data model for Chunk."""
 
-    path: str
+    path: str = Field(pattern=_JSON_POINTER_REGEX)
     text: str
+    heading: Optional[str] = None
+
+    @field_validator("path", mode="before")
+    @classmethod
+    def _json_pointer_from_json_path(cls, path: str):
+        if (match := _DEPRECATED_JSON_PATH_PATTERN.match(path)) is not None:
+            groups = match.groups()
+            if len(groups) == 2 and groups[0] is not None and groups[1] is not None:
+                return _create_path(
+                    pos=int(groups[1]),
+                    path_prefix=groups[0],
+                )
+        return path
 
 
 class ChunkWithMetadata(Chunk):
     """Data model for Chunk including metadata."""
 
-    page: Optional[int]
-    bbox: Optional[BoundingBox]
+    page: Optional[int] = None
+    bbox: Optional[BoundingBox] = None
 
 
 class BaseChunker(BaseModel, ABC):
@@ -43,3 +65,10 @@ class BaseChunker(BaseModel, ABC):
             Iterator[Chunk]: iterator over extracted chunks
         """
         raise NotImplementedError()
+
+    @classmethod
+    def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
+        return _create_path(
+            pos=pos,
+            path_prefix=path_prefix,
+        )
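The `field_validator` above transparently migrates chunk paths from the deprecated JSONPath form to the new JSON Pointer fragment form. A minimal sketch of the resulting behavior (the import is the one the chunker hunks below use; the path and text values are illustrative):

    from docling_core.transforms.chunker import Chunk

    # Legacy JSONPath input is rewritten by the "before" validator:
    chunk = Chunk(path="$.main-text[84]", text="some text")
    assert chunk.path == "#/main-text/84"

    # New-style JSON Pointer input passes the Field pattern check unchanged:
    chunk = Chunk(path="#/main-text/84", text="some text")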
@@ -12,7 +12,7 @@ from enum import Enum
 from typing import Any, Iterator, Optional, Union
 
 import pandas as pd
-from pydantic import BaseModel, PositiveInt
+from pydantic import BaseModel, Field, PositiveInt
 
 from docling_core.transforms.chunker import BaseChunker, Chunk, ChunkWithMetadata
 from docling_core.types import BaseText
@@ -25,8 +25,17 @@ _logger = logging.getLogger(__name__)
 class HierarchicalChunker(BaseChunker):
     """Chunker implementation leveraging the document layout."""
 
-    include_metadata: bool = True
-    min_chunk_len: PositiveInt = 64
+    heading_as_metadata: bool = Field(
+        default=False,
+        description="Whether heading should be in metadata (instead of text)",
+    )
+    include_metadata: bool = Field(
+        default=True,
+        description="Whether to include extras in the metadata",
+    )
+    min_chunk_len: PositiveInt = Field(
+        default=64, description="Minimum chunk text length to consider (in chars)"
+    )
 
     class _NodeType(str, Enum):
         PARAGRAPH = "paragraph"
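As a hedged usage sketch: only the fields above and the `BaseChunker` interface are confirmed by this diff, so the exact `chunk()` call signature is an assumption. The new `heading_as_metadata` flag moves a level-1 subtitle out of the chunk text and into the new `heading` field:

    from docling_core.transforms.chunker import HierarchicalChunker

    chunker = HierarchicalChunker(heading_as_metadata=True, min_chunk_len=64)
    for chunk in chunker.chunk(doc):  # doc: an already-parsed docling Document
        print(chunk.path, chunk.heading, chunk.text[:40])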
@@ -82,10 +91,6 @@ class HierarchicalChunker(BaseChunker):
 
         return output_text
 
-    @classmethod
-    def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
-        return f"$.{path_prefix}[{pos}]"
-
     class _MainTextItemNode(BaseModel):
         parent: Optional[int] = None
         children: list[int] = []
@@ -184,7 +189,7 @@
 
     def _build_chunk_impl(
         self, doc: DLDocument, doc_map: _DocContext, idx: int, rec: bool = False
-    ) -> list[_TextEntry]:
+    ) -> tuple[list[_TextEntry], Optional[str]]:
         if doc.main_text:
             item = doc.main_text[idx]
             item_type = _HC._norm(item.obj_type)
@@ -193,7 +198,7 @@
                 item_type not in self._allowed_types
                 or item_name in self._disallowed_names_by_type.get(item_type, [])
             ):
-                return []
+                return [], None
 
             c2p = doc_map.dmap
 
@@ -219,7 +224,7 @@
                     else []
                 )
             else:
-                return []
+                return [], None
         elif isinstance(item, BaseText):
             text_entries = [
                 self._TextEntry(
@@ -248,21 +253,29 @@
                 _HC._NodeName.LIST_ITEM,
                 _HC._NodeName.SUBTITLE_LEVEL_1,
             ]:
-                return []
+                return [], None
 
             if (parent := c2p[idx].parent) is not None:
                 # prepend with ancestors
+
+                parent_res = self._build_chunk_impl(
+                    doc=doc, doc_map=doc_map, idx=parent, rec=True
+                )
                 return (
-                    self._build_chunk_impl(
-                        doc=doc, doc_map=doc_map, idx=parent, rec=True
-                    )
-                    + text_entries
+                    parent_res[0] + text_entries,  # expanded text
+                    parent_res[1],  # heading
                 )
             else:
-                # if root, augment with title (if available and different)
-                return text_entries
+                if (
+                    self.heading_as_metadata
+                    and isinstance(item, BaseText)
+                    and _HC._norm(item.obj_type) == _HC._NodeType.SUBTITLE_LEVEL_1
+                ):
+                    return [], text_entries[0].text
+                else:
+                    return text_entries, None
         else:
-            return []
+            return [], None
 
     def _build_chunk(
         self,
@@ -272,7 +285,9 @@
         delim: str,
         rec: bool = False,
     ) -> Optional[Chunk]:
-        texts = self._build_chunk_impl(doc=doc, doc_map=doc_map, idx=idx, rec=rec)
+        res = self._build_chunk_impl(doc=doc, doc_map=doc_map, idx=idx, rec=rec)
+        texts = res[0]
+        heading = res[1]
         concat = delim.join([t.text for t in texts if t.text])
         assert doc.main_text is not None
         if len(concat) >= self.min_chunk_len:
@@ -293,6 +308,7 @@
             return ChunkWithMetadata(
                 text=concat,
                 path=path,
+                heading=heading,
                 page=item.prov[0].page if item.prov else None,
                 bbox=item.prov[0].bbox if item.prov else None,
             )
@@ -300,6 +316,7 @@
             return Chunk(
                 text=concat,
                 path=path,
+                heading=heading,
             )
         else:
             return None
@@ -0,0 +1,12 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Define the ID generator types."""
+
+from docling_core.transforms.id_generator.base import BaseIDGenerator  # noqa
+from docling_core.transforms.id_generator.doc_hash_id_generator import (  # noqa
+    DocHashIDGenerator,
+)
+from docling_core.transforms.id_generator.uuid_generator import UUIDGenerator  # noqa
@@ -0,0 +1,30 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Base document ID generator module."""
+
+from abc import ABC, abstractmethod
+from typing import Any
+
+from docling_core.types import Document as DLDocument
+
+
+class BaseIDGenerator(ABC):
+    """Document ID generator base class."""
+
+    @abstractmethod
+    def generate_id(self, doc: DLDocument, *args: Any, **kwargs: Any) -> str:
+        """Generate an ID for the given document.
+
+        Args:
+            doc (DLDocument): document to generate ID for
+
+        Raises:
+            NotImplementedError: in this abstract implementation
+
+        Returns:
+            str: the generated ID
+        """
+        raise NotImplementedError()
@@ -0,0 +1,27 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Doc-hash-based ID generator module."""
+
+
+from typing import Any
+
+from docling_core.transforms.id_generator import BaseIDGenerator
+from docling_core.types import Document as DLDocument
+
+
+class DocHashIDGenerator(BaseIDGenerator):
+    """Doc-hash-based ID generator class."""
+
+    def generate_id(self, doc: DLDocument, *args: Any, **kwargs: Any) -> str:
+        """Generate an ID for the given document.
+
+        Args:
+            doc (DLDocument): document to generate ID for
+
+        Returns:
+            str: the generated ID
+        """
+        return doc.file_info.document_hash
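Usage is straightforward: the generator simply reuses the document's own hash as its ID. A small sketch (`doc` is assumed to be an already-loaded `Document`):

    from docling_core.transforms.id_generator import DocHashIDGenerator

    id_gen = DocHashIDGenerator()
    doc_id = id_gen.generate_id(doc=doc)  # equals doc.file_info.document_hash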
@@ -0,0 +1,34 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""UUID-based ID generator module."""
+
+from random import Random
+from typing import Annotated, Any, Optional
+from uuid import UUID
+
+from pydantic import BaseModel, Field
+
+from docling_core.transforms.id_generator import BaseIDGenerator
+from docling_core.types import Document as DLDocument
+
+
+class UUIDGenerator(BaseModel, BaseIDGenerator):
+    """UUID-based ID generator class."""
+
+    seed: Optional[int] = None
+    uuid_version: Annotated[int, Field(strict=True, ge=1, le=5)] = 4
+
+    def generate_id(self, doc: DLDocument, *args: Any, **kwargs: Any) -> str:
+        """Generate an ID for the given document.
+
+        Args:
+            doc (DLDocument): document to generate ID for
+
+        Returns:
+            str: the generated ID
+        """
+        rd = Random(x=self.seed)
+        return str(UUID(int=rd.getrandbits(128), version=self.uuid_version))
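Because a fresh `Random(x=self.seed)` is constructed inside every `generate_id()` call, a fixed seed makes the generator fully deterministic, which is handy for reproducible tests; without a seed, each call draws new random bits. A sketch (`doc` again assumed to be a loaded `Document`):

    from docling_core.transforms.id_generator import UUIDGenerator

    seeded = UUIDGenerator(seed=42)
    assert seeded.generate_id(doc=doc) == seeded.generate_id(doc=doc)  # stable

    unseeded = UUIDGenerator()
    print(unseeded.generate_id(doc=doc))  # varies per call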
@@ -0,0 +1,13 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Define the metadata extractor types."""
+
+from docling_core.transforms.metadata_extractor.base import (  # noqa
+    BaseMetadataExtractor,
+)
+from docling_core.transforms.metadata_extractor.simple_metadata_extractor import (  # noqa
+    SimpleMetadataExtractor,
+)
@@ -0,0 +1,59 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Base metadata extractor module."""
+
+
+from abc import ABC, abstractmethod
+from typing import Any
+
+from pydantic import BaseModel
+
+from docling_core.types import Document as DLDocument
+
+
+class BaseMetadataExtractor(BaseModel, ABC):
+    """Metadata extractor base class."""
+
+    @abstractmethod
+    def get_metadata(
+        self, doc: DLDocument, *args: Any, **kwargs: Any
+    ) -> dict[str, Any]:
+        """Extract metadata for the given document.
+
+        Args:
+            doc (DLDocument): document to extract metadata for
+
+        Raises:
+            NotImplementedError: in this abstract implementation
+
+        Returns:
+            dict[str, Any]: the extracted metadata
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def get_excluded_embed_metadata_keys(self) -> list[str]:
+        """Get metadata keys to exclude from embedding.
+
+        Raises:
+            NotImplementedError: in this abstract implementation
+
+        Returns:
+            list[str]: the metadata to exclude
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def get_excluded_llm_metadata_keys(self) -> list[str]:
+        """Get metadata keys to exclude from LLM generation.
+
+        Raises:
+            NotImplementedError: in this abstract implementation
+
+        Returns:
+            list[str]: the metadata to exclude
+        """
+        raise NotImplementedError()
@@ -0,0 +1,59 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Simple metadata extractor module."""
+
+
+from typing import Any, Final
+
+from docling_core.transforms.metadata_extractor import BaseMetadataExtractor
+from docling_core.types import Document as DLDocument
+
+_DL_DOC_HASH: Final[str] = "dl_doc_hash"
+_ORIGIN: Final[str] = "origin"
+
+
+class SimpleMetadataExtractor(BaseMetadataExtractor):
+    """Simple metadata extractor class."""
+
+    include_origin: bool = False
+
+    def get_metadata(
+        self, doc: DLDocument, origin: str, *args: Any, **kwargs: Any
+    ) -> dict[str, Any]:
+        """Extract metadata for the given document.
+
+        Args:
+            doc (DLDocument): document to extract metadata for
+            origin (str): the document origin
+
+        Returns:
+            dict[str, Any]: the extracted metadata
+        """
+        meta: dict[str, Any] = {
+            _DL_DOC_HASH: doc.file_info.document_hash,
+        }
+        if self.include_origin:
+            meta[_ORIGIN] = origin
+        return meta
+
+    def get_excluded_embed_metadata_keys(self) -> list[str]:
+        """Get metadata keys to exclude from embedding.
+
+        Returns:
+            list[str]: the metadata to exclude
+        """
+        excl_keys: list[str] = [_DL_DOC_HASH]
+        if self.include_origin:
+            excl_keys.append(_ORIGIN)
+        return excl_keys
+
+    def get_excluded_llm_metadata_keys(self) -> list[str]:
+        """Get metadata keys to exclude from LLM generation.
+
+        Returns:
+            list[str]: the metadata to exclude
+        """
+        return self.get_excluded_embed_metadata_keys()
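A sketch of the extractor in use (`doc` is assumed to be a loaded `Document`; the origin string is illustrative):

    from docling_core.transforms.metadata_extractor import SimpleMetadataExtractor

    extractor = SimpleMetadataExtractor(include_origin=True)
    meta = extractor.get_metadata(doc=doc, origin="https://example.com/report.pdf")
    # meta == {"dl_doc_hash": <document hash>, "origin": "https://example.com/report.pdf"}

    # Both keys are excluded from embedding and LLM generation alike:
    assert extractor.get_excluded_embed_metadata_keys() == ["dl_doc_hash", "origin"]
    assert extractor.get_excluded_llm_metadata_keys() == ["dl_doc_hash", "origin"]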
@@ -6,7 +6,7 @@
 """Define common models across types."""
 from datetime import datetime, timezone
 from enum import Enum
-from typing import Generic, Hashable, List, Literal, Optional, TypeVar
+from typing import Final, Generic, Hashable, List, Literal, Optional, TypeVar
 
 from pydantic import (
     AfterValidator,
@@ -28,6 +28,9 @@ from docling_core.search.package import VERSION_PATTERN
 from docling_core.utils.alias import AliasModel
 from docling_core.utils.validators import validate_datetime, validate_unique_list
 
+# (subset of) JSON Pointer URI fragment id format, e.g. "#/main-text/84":
+_JSON_POINTER_REGEX: Final[str] = r"^#(?:/([\w-]+)(?:/(\d+))?)?$"
+
 LanguageT = TypeVar("LanguageT", bound=str)
 IdentifierTypeT = TypeVar("IdentifierTypeT", bound=str)
 DescriptionAdvancedT = TypeVar("DescriptionAdvancedT", bound=BaseModel)
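The regex accepts the empty fragment, a collection, or a collection plus item index, and rejects the deprecated JSONPath form (which `Chunk` migrates via its validator instead). Illustrated directly:

    import re

    ptr = r"^#(?:/([\w-]+)(?:/(\d+))?)?$"
    assert re.match(ptr, "#/main-text/84").groups() == ("main-text", "84")
    assert re.match(ptr, "#/main-text") is not None
    assert re.match(ptr, "#") is not None
    assert re.match(ptr, "$.main-text[84]") is None  # deprecated JSONPath form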
@@ -440,7 +440,6 @@ class BaseText(BaseCell):
     ):
         """Export text element to document tokens format."""
         body = f"<{self.obj_type}>"
-        # body = f"<{self.name}>"
 
         assert DocumentToken.is_known_token(
             body
@@ -0,0 +1,30 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Package for models defined by the Document type."""
+
+from .base import BoundingBox, CoordOrigin, Size
+from .document import (
+    BasePictureData,
+    BaseTableData,
+    DescriptionItem,
+    DocItem,
+    DoclingDocument,
+    DocumentOrigin,
+    FloatingItem,
+    GroupItem,
+    ImageRef,
+    KeyValueItem,
+    NodeItem,
+    PageItem,
+    PictureItem,
+    ProvenanceItem,
+    RefItem,
+    SectionHeaderItem,
+    TableCell,
+    TableItem,
+    TextItem,
+)
+from .labels import DocItemLabel, GroupLabel, TableCellLabel
@@ -0,0 +1,167 @@
+"""Models for the base data types."""
+
+import copy
+from enum import Enum
+from typing import Tuple
+
+from pydantic import BaseModel
+
+
+class CoordOrigin(str, Enum):
+    """CoordOrigin."""
+
+    TOPLEFT = "TOPLEFT"
+    BOTTOMLEFT = "BOTTOMLEFT"
+
+
+class Size(BaseModel):
+    """Size."""
+
+    width: float = 0.0
+    height: float = 0.0
+
+    def as_tuple(self):
+        """as_tuple."""
+        return (self.width, self.height)
+
+
+class BoundingBox(BaseModel):
+    """BoundingBox."""
+
+    l: float  # left
+    t: float  # top
+    r: float  # right
+    b: float  # bottom
+
+    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
+
+    @property
+    def width(self):
+        """width."""
+        return self.r - self.l
+
+    @property
+    def height(self):
+        """height."""
+        return abs(self.t - self.b)
+
+    def scaled(self, scale: float) -> "BoundingBox":
+        """scaled.
+
+        :param scale: float:
+
+        """
+        out_bbox = copy.deepcopy(self)
+        out_bbox.l *= scale
+        out_bbox.r *= scale
+        out_bbox.t *= scale
+        out_bbox.b *= scale
+
+        return out_bbox
+
+    def normalized(self, page_size: Size) -> "BoundingBox":
+        """normalized.
+
+        :param page_size: Size:
+
+        """
+        out_bbox = copy.deepcopy(self)
+        out_bbox.l /= page_size.width
+        out_bbox.r /= page_size.width
+        out_bbox.t /= page_size.height
+        out_bbox.b /= page_size.height
+
+        return out_bbox
+
+    def as_tuple(self):
+        """as_tuple."""
+        if self.coord_origin == CoordOrigin.TOPLEFT:
+            return (self.l, self.t, self.r, self.b)
+        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return (self.l, self.b, self.r, self.t)
+
+    @classmethod
+    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
+        """from_tuple.
+
+        :param coord: Tuple[float:
+        :param ...]:
+        :param origin: CoordOrigin:
+
+        """
+        if origin == CoordOrigin.TOPLEFT:
+            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
+            if r < l:
+                l, r = r, l
+            if b < t:
+                b, t = t, b
+
+            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
+        elif origin == CoordOrigin.BOTTOMLEFT:
+            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
+            if r < l:
+                l, r = r, l
+            if b > t:
+                b, t = t, b
+
+            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
+
+    def area(self) -> float:
+        """area."""
+        return (self.r - self.l) * (self.b - self.t)
+
+    def intersection_area_with(self, other: "BoundingBox") -> float:
+        """intersection_area_with.
+
+        :param other: "BoundingBox":
+
+        """
+        # Calculate intersection coordinates
+        left = max(self.l, other.l)
+        top = max(self.t, other.t)
+        right = min(self.r, other.r)
+        bottom = min(self.b, other.b)
+
+        # Calculate intersection dimensions
+        width = right - left
+        height = bottom - top
+
+        # If the bounding boxes do not overlap, width or height will be negative
+        if width <= 0 or height <= 0:
+            return 0.0
+
+        return width * height
+
+    def to_bottom_left_origin(self, page_height) -> "BoundingBox":
+        """to_bottom_left_origin.
+
+        :param page_height:
+
+        """
+        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return self
+        elif self.coord_origin == CoordOrigin.TOPLEFT:
+            return BoundingBox(
+                l=self.l,
+                r=self.r,
+                t=page_height - self.t,
+                b=page_height - self.b,
+                coord_origin=CoordOrigin.BOTTOMLEFT,
+            )
+
+    def to_top_left_origin(self, page_height):
+        """to_top_left_origin.
+
+        :param page_height:
+
+        """
+        if self.coord_origin == CoordOrigin.TOPLEFT:
+            return self
+        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return BoundingBox(
+                l=self.l,
+                r=self.r,
+                t=page_height - self.t,  # self.b
+                b=page_height - self.b,  # self.t
+                coord_origin=CoordOrigin.TOPLEFT,
+            )
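A quick sketch of the new `BoundingBox` helpers (the `docling_core.types.doc` import path is an assumption based on the package `__init__` shown above):

    from docling_core.types.doc import BoundingBox

    # Two overlapping boxes in the default TOPLEFT origin (y grows downwards):
    a = BoundingBox(l=0, t=0, r=10, b=10)
    b = BoundingBox(l=5, t=5, r=15, b=15)
    assert a.area() == 100.0
    assert a.intersection_area_with(b) == 25.0

    # Flipping to a bottom-left origin on a page of height 100:
    flipped = a.to_bottom_left_origin(page_height=100)
    assert flipped.as_tuple() == (0, 90, 10, 100)  # (l, b, r, t) ordering

Note that `area()` assumes top-left ordering (`b >= t`); for a BOTTOMLEFT box the same formula would come out negative.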