PyPI - docling-core - Versions diffs - 1.7.0__tar.gz → 1.7.1__tar.gz - Mend

docling-core 1.7.0.tar.gz → 1.7.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling-core might be problematic. Click here for more details.

Files changed (63)

{docling_core-1.7.0 → docling_core-1.7.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling-core
-Version: 1.7.0
+Version: 1.7.1
 Summary: A python library to define and validate data types in Docling.
 Home-page: https://ds4sd.github.io/
 License: MIT

docling_core-1.7.1/docling_core/transforms/chunker/base.py ADDED Viewed

@@ -0,0 +1,74 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+"""Define base classes for chunking."""
+import re
+from abc import ABC, abstractmethod
+from typing import Final, Iterator, Optional
+from pydantic import BaseModel, Field, field_validator
+from docling_core.types import BoundingBox, Document
+from docling_core.types.base import _JSON_POINTER_REGEX
+# (subset of) JSONPath format, e.g. "$.main-text[84]" (for migration purposes)
+_DEPRECATED_JSON_PATH_PATTERN: Final = re.compile(r"^\$\.([\w-]+)\[(\d+)\]$")
+def _create_path(pos: int, path_prefix: str = "main-text") -> str:
+    return f"#/{path_prefix}/{pos}"
+class Chunk(BaseModel):
+    """Data model for Chunk."""
+    path: str = Field(pattern=_JSON_POINTER_REGEX)
+    text: str
+    heading: Optional[str] = None
+    @field_validator("path", mode="before")
+    @classmethod
+    def _json_pointer_from_json_path(cls, path: str):
+        if (match := _DEPRECATED_JSON_PATH_PATTERN.match(path)) is not None:
+            groups = match.groups()
+            if len(groups) == 2 and groups[0] is not None and groups[1] is not None:
+                return _create_path(
+                    pos=int(groups[1]),
+                    path_prefix=groups[0],
+                )
+        return path
+class ChunkWithMetadata(Chunk):
+    """Data model for Chunk including metadata."""
+    page: Optional[int] = None
+    bbox: Optional[BoundingBox] = None
+class BaseChunker(BaseModel, ABC):
+    """Base class for Chunker."""
+    @abstractmethod
+    def chunk(self, dl_doc: Document, **kwargs) -> Iterator[Chunk]:
+        """Chunk the provided document.
+        Args:
+            dl_doc (Document): document to chunk
+        Raises:
+            NotImplementedError: in this abstract implementation
+        Yields:
+            Iterator[Chunk]: iterator over extracted chunks
+        """
+        raise NotImplementedError()
+    @classmethod
+    def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
+        return _create_path(
+            pos=pos,
+            path_prefix=path_prefix,
+        )

{docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/chunker/hierarchical_chunker.py RENAMED Viewed

@@ -12,7 +12,7 @@ from enum import Enum
 from typing import Any, Iterator, Optional, Union
 import pandas as pd
-from pydantic import BaseModel, PositiveInt
+from pydantic import BaseModel, Field, PositiveInt
 from docling_core.transforms.chunker import BaseChunker, Chunk, ChunkWithMetadata
 from docling_core.types import BaseText
@@ -25,9 +25,17 @@ _logger = logging.getLogger(__name__)
 class HierarchicalChunker(BaseChunker):
     """Chunker implementation leveraging the document layout."""
-    include_metadata: bool = True
-    heading_as_metadata: bool = False
-    min_chunk_len: PositiveInt = 64
+    heading_as_metadata: bool = Field(
+        default=False,
+        description="Whether heading should be in metadata (instead of text)",
+    )
+    include_metadata: bool = Field(
+        default=True,
+        description="Whether to include extras in the metadata",
+    )
+    min_chunk_len: PositiveInt = Field(
+        default=64, description="Minimum chunk text length to consider (in chars)"
+    )
     class _NodeType(str, Enum):
         PARAGRAPH = "paragraph"
@@ -83,10 +91,6 @@ class HierarchicalChunker(BaseChunker):
         return output_text
-    @classmethod
-    def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
-        return f"$.{path_prefix}[{pos}]"
     class _MainTextItemNode(BaseModel):
         parent: Optional[int] = None
         children: list[int] = []
@@ -304,14 +308,15 @@ class HierarchicalChunker(BaseChunker):
                 return ChunkWithMetadata(
                     text=concat,
                     path=path,
+                    heading=heading,
                     page=item.prov[0].page if item.prov else None,
                     bbox=item.prov[0].bbox if item.prov else None,
-                    heading=heading,
                 )
             else:
                 return Chunk(
                     text=concat,
                     path=path,
+                    heading=heading,
                 )
         else:
             return None
@@ -327,11 +332,6 @@ class HierarchicalChunker(BaseChunker):
         Yields:
             Iterator[Chunk]: iterator over extracted chunks
         """
-        if (not self.include_metadata) and self.heading_as_metadata:
-            raise RuntimeError(
-                "To enable `heading_as_metadata`, also `include_metadata` must be True."
-            )
         if dl_doc.main_text:
             # extract doc structure incl. metadata for
             # each item (e.g. parent, children)

{docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/metadata_extractor/simple_metadata_extractor.py RENAMED Viewed

@@ -6,20 +6,18 @@
 """Simple metadata extractor module."""
-from enum import Enum
-from typing import Any
+from typing import Any, Final
 from docling_core.transforms.metadata_extractor import BaseMetadataExtractor
 from docling_core.types import Document as DLDocument
+_DL_DOC_HASH: Final[str] = "dl_doc_hash"
+_ORIGIN: Final[str] = "origin"
 class SimpleMetadataExtractor(BaseMetadataExtractor):
     """Simple metadata extractor class."""
-    class _Keys(str, Enum):
-        DL_DOC_HASH = "dl_doc_hash"
-        ORIGIN = "origin"
     include_origin: bool = False
     def get_metadata(
@@ -35,10 +33,10 @@ class SimpleMetadataExtractor(BaseMetadataExtractor):
             dict[str, Any]: the extracted metadata
         """
         meta: dict[str, Any] = {
-            self._Keys.DL_DOC_HASH: doc.file_info.document_hash,
+            _DL_DOC_HASH: doc.file_info.document_hash,
         }
         if self.include_origin:
-            meta[self._Keys.ORIGIN] = origin
+            meta[_ORIGIN] = origin
         return meta
     def get_excluded_embed_metadata_keys(self) -> list[str]:
@@ -47,9 +45,9 @@ class SimpleMetadataExtractor(BaseMetadataExtractor):
         Returns:
             list[str]: the metadata to exclude
         """
-        excl_keys: list[str] = [self._Keys.DL_DOC_HASH]
+        excl_keys: list[str] = [_DL_DOC_HASH]
         if self.include_origin:
-            excl_keys.append(self._Keys.ORIGIN)
+            excl_keys.append(_ORIGIN)
         return excl_keys
     def get_excluded_llm_metadata_keys(self) -> list[str]:

{docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/base.py RENAMED Viewed

@@ -6,7 +6,7 @@
 """Define common models across types."""
 from datetime import datetime, timezone
 from enum import Enum
-from typing import Generic, Hashable, List, Literal, Optional, TypeVar
+from typing import Final, Generic, Hashable, List, Literal, Optional, TypeVar
 from pydantic import (
     AfterValidator,
@@ -28,6 +28,9 @@ from docling_core.search.package import VERSION_PATTERN
 from docling_core.utils.alias import AliasModel
 from docling_core.utils.validators import validate_datetime, validate_unique_list
+# (subset of) JSON Pointer URI fragment id format, e.g. "#/main-text/84":
+_JSON_POINTER_REGEX: Final[str] = r"^#(?:/([\w-]+)(?:/(\d+))?)?$"
 LanguageT = TypeVar("LanguageT", bound=str)
 IdentifierTypeT = TypeVar("IdentifierTypeT", bound=str)
 DescriptionAdvancedT = TypeVar("DescriptionAdvancedT", bound=BaseModel)

{docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/experimental/document.py RENAMED Viewed

@@ -20,6 +20,7 @@ from tabulate import tabulate
 from typing_extensions import Annotated
 from docling_core.search.package import VERSION_PATTERN
+from docling_core.types.base import _JSON_POINTER_REGEX
 from docling_core.types.doc.tokens import DocumentToken
 from docling_core.types.experimental import BoundingBox, Size
 from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
@@ -28,9 +29,6 @@ Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
 LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
 CURRENT_VERSION: Final = "1.0.0"
-# (subset of) JSON Pointer URI fragment identifier format:
-_JSON_POINTER_REGEX = r"^#(/[\w\-]+(/\d+)?)?$"
 class BasePictureData(BaseModel):  # TBD
     """BasePictureData."""

{docling_core-1.7.0 → docling_core-1.7.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling-core"
-version = "1.7.0"
+version = "1.7.1"
 description = "A python library to define and validate data types in Docling."
 license = "MIT"
 authors = [

docling_core-1.7.0/docling_core/transforms/chunker/base.py DELETED Viewed

@@ -1,46 +0,0 @@
-#
-# Copyright IBM Corp. 2024 - 2024
-# SPDX-License-Identifier: MIT
-#
-"""Define base classes for chunking."""
-from abc import ABC, abstractmethod
-from typing import Iterator, Optional
-from pydantic import BaseModel
-from docling_core.types import BoundingBox, Document
-class Chunk(BaseModel):
-    """Data model for Chunk."""
-    path: str
-    text: str
-class ChunkWithMetadata(Chunk):
-    """Data model for Chunk including metadata."""
-    page: Optional[int] = None
-    bbox: Optional[BoundingBox] = None
-    heading: Optional[str] = None
-class BaseChunker(BaseModel, ABC):
-    """Base class for Chunker."""
-    @abstractmethod
-    def chunk(self, dl_doc: Document, **kwargs) -> Iterator[Chunk]:
-        """Chunk the provided document.
-        Args:
-            dl_doc (Document): document to chunk
-        Raises:
-            NotImplementedError: in this abstract implementation
-        Yields:
-            Iterator[Chunk]: iterator over extracted chunks
-        """
-        raise NotImplementedError()