docling-core 2.27.0__py3-none-any.whl → 2.28.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling_core/experimental/serializer/base.py +25 -19
- docling_core/experimental/serializer/common.py +17 -11
- docling_core/experimental/serializer/doctags.py +14 -11
- docling_core/experimental/serializer/html.py +21 -16
- docling_core/experimental/serializer/markdown.py +24 -16
- docling_core/transforms/chunker/hybrid_chunker.py +49 -31
- docling_core/transforms/chunker/tokenizer/__init__.py +1 -0
- docling_core/transforms/chunker/tokenizer/base.py +25 -0
- docling_core/transforms/chunker/tokenizer/huggingface.py +70 -0
- docling_core/transforms/chunker/tokenizer/openai.py +34 -0
- docling_core/transforms/visualizer/__init__.py +1 -0
- docling_core/transforms/visualizer/base.py +23 -0
- docling_core/transforms/visualizer/layout_visualizer.py +212 -0
- docling_core/transforms/visualizer/reading_order_visualizer.py +149 -0
- docling_core/types/doc/document.py +25 -3
- docling_core/types/doc/page.py +4 -3
- docling_core/types/legacy_doc/document.py +2 -2
- {docling_core-2.27.0.dist-info → docling_core-2.28.1.dist-info}/METADATA +4 -2
- {docling_core-2.27.0.dist-info → docling_core-2.28.1.dist-info}/RECORD +22 -14
- {docling_core-2.27.0.dist-info → docling_core-2.28.1.dist-info}/LICENSE +0 -0
- {docling_core-2.27.0.dist-info → docling_core-2.28.1.dist-info}/WHEEL +0 -0
- {docling_core-2.27.0.dist-info → docling_core-2.28.1.dist-info}/entry_points.txt +0 -0
docling_core/transforms/chunker/hybrid_chunker.py

```diff
@@ -8,27 +8,21 @@ import warnings
 from functools import cached_property
 from typing import Any, Iterable, Iterator, Optional, Union
 
-from pydantic import (
-    BaseModel,
-    ConfigDict,
-    PositiveInt,
-    TypeAdapter,
-    computed_field,
-    model_validator,
-)
-from typing_extensions import Self
+from pydantic import BaseModel, ConfigDict, Field, computed_field, model_validator
 
 from docling_core.transforms.chunker.hierarchical_chunker import (
     ChunkingSerializerProvider,
 )
+from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer
 
 try:
     import semchunk
-    from transformers import AutoTokenizer, PreTrainedTokenizerBase
 except ImportError:
     raise RuntimeError(
-        "Module requires 'chunking' extra; to install, run: "
-        "`pip install 'docling-core[chunking]'`"
+        "Extra required by module: 'chunking' by default (or 'chunking-openai' if "
+        "specifically using OpenAI tokenization); to install, run: "
+        "`pip install 'docling-core[chunking]'` or "
+        "`pip install 'docling-core[chunking-openai]'`"
     )
 
 from docling_core.experimental.serializer.base import (
@@ -45,6 +39,16 @@ from docling_core.transforms.chunker import (
 from docling_core.types import DoclingDocument
 
 
+def _get_default_tokenizer():
+    from docling_core.transforms.chunker.tokenizer.huggingface import (
+        HuggingFaceTokenizer,
+    )
+
+    return HuggingFaceTokenizer.from_pretrained(
+        model_name="sentence-transformers/all-MiniLM-L6-v2"
+    )
+
+
 class HybridChunker(BaseChunker):
     r"""Chunker doing tokenization-aware refinements on top of document layout chunking.
 
@@ -58,26 +62,40 @@ class HybridChunker(BaseChunker):
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    tokenizer: Union[PreTrainedTokenizerBase, str] = (
-        "sentence-transformers/all-MiniLM-L6-v2"
-    )
-    max_tokens: int = None  # type: ignore[assignment]
+    tokenizer: BaseTokenizer = Field(default_factory=_get_default_tokenizer)
     merge_peers: bool = True
 
     serializer_provider: BaseSerializerProvider = ChunkingSerializerProvider()
 
-    @model_validator(mode="after")
-    …
+    @model_validator(mode="before")
+    @classmethod
+    def _patch(cls, data: Any) -> Any:
+        if isinstance(data, dict) and (tokenizer := data.get("tokenizer")):
+            max_tokens = data.get("max_tokens")
+            if isinstance(tokenizer, BaseTokenizer):
+                pass
+            else:
+                from docling_core.transforms.chunker.tokenizer.huggingface import (
+                    HuggingFaceTokenizer,
+                )
+
+                if isinstance(tokenizer, str):
+                    data["tokenizer"] = HuggingFaceTokenizer.from_pretrained(
+                        model_name=tokenizer,
+                        max_tokens=max_tokens,
+                    )
+                else:
+                    # migrate previous HF-based tokenizers
+                    kwargs = {"tokenizer": tokenizer}
+                    if max_tokens is not None:
+                        kwargs["max_tokens"] = max_tokens
+                    data["tokenizer"] = HuggingFaceTokenizer(**kwargs)
+        return data
+
+    @property
+    def max_tokens(self) -> int:
+        """Get maximum number of tokens allowed."""
+        return self.tokenizer.get_max_tokens()
 
     @computed_field  # type: ignore[misc]
     @cached_property
@@ -92,7 +110,7 @@ class HybridChunker(BaseChunker):
             for t in text:
                 total += self._count_text_tokens(t)
             return total
-        return …
+        return self.tokenizer.count_tokens(text=text)
 
     class _ChunkLengthInfo(BaseModel):
         total_len: int
@@ -101,7 +119,7 @@ class HybridChunker(BaseChunker):
 
     def _count_chunk_tokens(self, doc_chunk: DocChunk):
         ser_txt = self.contextualize(chunk=doc_chunk)
-        return …
+        return self.tokenizer.count_tokens(text=ser_txt)
 
     def _doc_chunk_length(self, doc_chunk: DocChunk):
         text_length = self._count_text_tokens(doc_chunk.text)
@@ -198,7 +216,7 @@ class HybridChunker(BaseChunker):
         # captions:
         available_length = self.max_tokens - lengths.other_len
         sem_chunker = semchunk.chunkerify(
-            self.…
+            self.tokenizer.get_tokenizer(), chunk_size=available_length
        )
         if available_length <= 0:
             warnings.warn(
```
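Taken together, these hunks replace the chunker's raw HuggingFace tokenizer field with the new `BaseTokenizer` abstraction, while the `mode="before"` validator keeps the previous string- and `PreTrainedTokenizerBase`-based configuration working. A minimal usage sketch, assuming the `chunking` extra is installed (the 64-token cap is illustrative, not from the diff):

```python
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer

# New style: pass a BaseTokenizer instance explicitly.
tokenizer = HuggingFaceTokenizer.from_pretrained(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    max_tokens=64,  # illustrative cap, clamped against the model's own limit
)
chunker = HybridChunker(tokenizer=tokenizer)

# Old style still works: the "before" validator wraps a model name (or a bare
# HF tokenizer object) into a HuggingFaceTokenizer.
legacy_chunker = HybridChunker(
    tokenizer="sentence-transformers/all-MiniLM-L6-v2",
    max_tokens=64,
)
```

Note that `max_tokens` is now a read-only property delegating to the tokenizer, so the legacy keyword is absorbed into the constructed `HuggingFaceTokenizer` rather than stored on the chunker itself.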
docling_core/transforms/chunker/tokenizer/__init__.py

```diff
@@ -0,0 +1 @@
+"""Define the tokenizer types."""
```
docling_core/transforms/chunker/tokenizer/base.py

```diff
@@ -0,0 +1,25 @@
+"""Define base classes for tokenization."""
+
+from abc import ABC, abstractmethod
+from typing import Any
+
+from pydantic import BaseModel
+
+
+class BaseTokenizer(BaseModel, ABC):
+    """Base tokenizer class."""
+
+    @abstractmethod
+    def count_tokens(self, text: str) -> int:
+        """Get number of tokens for given text."""
+        ...
+
+    @abstractmethod
+    def get_max_tokens(self) -> int:
+        """Get maximum number of tokens allowed."""
+        ...
+
+    @abstractmethod
+    def get_tokenizer(self) -> Any:
+        """Get underlying tokenizer object."""
+        ...
```
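For reference, a hypothetical subclass sketch showing the contract the ABC imposes; `WhitespaceTokenizer` and its word-count heuristic are illustrative only, not part of the package:

```python
from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer


class WhitespaceTokenizer(BaseTokenizer):
    """Toy tokenizer treating whitespace-separated words as tokens."""

    max_tokens: int = 512

    def count_tokens(self, text: str) -> int:
        return len(text.split())

    def get_max_tokens(self) -> int:
        return self.max_tokens

    def get_tokenizer(self):
        # HybridChunker hands this to semchunk.chunkerify, which also accepts
        # a plain token-counting callable.
        return self.count_tokens
```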
docling_core/transforms/chunker/tokenizer/huggingface.py

```diff
@@ -0,0 +1,70 @@
+"""HuggingFace tokenization."""
+
+import sys
+from os import PathLike
+from typing import Optional, Union
+
+from pydantic import ConfigDict, PositiveInt, TypeAdapter, model_validator
+from typing_extensions import Self
+
+from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer
+
+try:
+    from transformers import AutoTokenizer, PreTrainedTokenizerBase
+except ImportError:
+    raise RuntimeError(
+        "Module requires 'chunking' extra; to install, run: "
+        "`pip install 'docling-core[chunking]'`"
+    )
+
+
+class HuggingFaceTokenizer(BaseTokenizer):
+    """HuggingFace tokenizer."""
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    tokenizer: PreTrainedTokenizerBase
+    max_tokens: int = None  # type: ignore[assignment]
+
+    @model_validator(mode="after")
+    def _patch(self) -> Self:
+        if hasattr(self.tokenizer, "model_max_length"):
+            model_max_tokens: PositiveInt = TypeAdapter(PositiveInt).validate_python(
+                self.tokenizer.model_max_length
+            )
+            user_max_tokens = self.max_tokens or sys.maxsize
+            self.max_tokens = min(model_max_tokens, user_max_tokens)
+        elif self.max_tokens is None:
+            raise ValueError(
+                "max_tokens must be defined as model does not define model_max_length"
+            )
+        return self
+
+    def count_tokens(self, text: str):
+        """Get number of tokens for given text."""
+        return len(self.tokenizer.tokenize(text=text))
+
+    def get_max_tokens(self):
+        """Get maximum number of tokens allowed."""
+        return self.max_tokens
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_name: Union[str, PathLike],
+        max_tokens: Optional[int] = None,
+        **kwargs,
+    ) -> Self:
+        """Create tokenizer from model name."""
+        my_kwargs = {
+            "tokenizer": AutoTokenizer.from_pretrained(
+                pretrained_model_name_or_path=model_name, **kwargs
+            ),
+        }
+        if max_tokens is not None:
+            my_kwargs["max_tokens"] = max_tokens
+        return cls(**my_kwargs)
+
+    def get_tokenizer(self):
+        """Get underlying tokenizer object."""
+        return self.tokenizer
```
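A usage sketch for the new class; the `_patch` validator clamps `max_tokens` to `min(model_max_length, user cap)` and raises if neither is available. Extra keyword arguments to `from_pretrained` are forwarded to `AutoTokenizer.from_pretrained`:

```python
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer

hf_tokenizer = HuggingFaceTokenizer.from_pretrained(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
print(hf_tokenizer.count_tokens(text="Docling turns documents into data."))
print(hf_tokenizer.get_max_tokens())  # model_max_length, absent a user cap
```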
docling_core/transforms/chunker/tokenizer/openai.py

```diff
@@ -0,0 +1,34 @@
+"""OpenAI tokenization."""
+
+from pydantic import ConfigDict
+
+from docling_core.transforms.chunker.hybrid_chunker import BaseTokenizer
+
+try:
+    import tiktoken
+except ImportError:
+    raise RuntimeError(
+        "Module requires 'chunking-openai' extra; to install, run: "
+        "`pip install 'docling-core[chunking-openai]'`"
+    )
+
+
+class OpenAITokenizer(BaseTokenizer):
+    """OpenAI tokenizer."""
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    tokenizer: tiktoken.Encoding
+    max_tokens: int
+
+    def count_tokens(self, text: str) -> int:
+        """Get number of tokens for given text."""
+        return len(self.tokenizer.encode(text=text))
+
+    def get_max_tokens(self) -> int:
+        """Get maximum number of tokens allowed."""
+        return self.max_tokens
+
+    def get_tokenizer(self) -> tiktoken.Encoding:
+        """Get underlying tokenizer object."""
+        return self.tokenizer
```
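A usage sketch, assuming the `chunking-openai` extra (i.e. `tiktoken`) is installed; the model name and token limit below are illustrative:

```python
import tiktoken

from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer

openai_tokenizer = OpenAITokenizer(
    tokenizer=tiktoken.encoding_for_model("gpt-4o"),  # illustrative model
    max_tokens=128 * 1024,  # required: tiktoken encodings carry no length limit
)
print(openai_tokenizer.count_tokens(text="Docling turns documents into data."))
```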
docling_core/transforms/visualizer/__init__.py

```diff
@@ -0,0 +1 @@
+"""Define the visualizer types."""
```
docling_core/transforms/visualizer/base.py

```diff
@@ -0,0 +1,23 @@
+"""Define base classes for visualization."""
+
+from abc import ABC, abstractmethod
+from typing import Optional
+
+from PIL.Image import Image
+from pydantic import BaseModel
+
+from docling_core.types.doc import DoclingDocument
+
+
+class BaseVisualizer(BaseModel, ABC):
+    """Visualize base class."""
+
+    @abstractmethod
+    def get_visualization(
+        self,
+        *,
+        doc: DoclingDocument,
+        **kwargs,
+    ) -> dict[Optional[int], Image]:
+        """Get visualization of the document as images by page."""
+        raise NotImplementedError()
```
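A hypothetical minimal implementation, to show what the contract expects: one PIL image per page number. `PageImageVisualizer` is illustrative and not part of the package:

```python
from typing import Optional

from PIL.Image import Image
from typing_extensions import override

from docling_core.transforms.visualizer.base import BaseVisualizer
from docling_core.types.doc import DoclingDocument


class PageImageVisualizer(BaseVisualizer):
    """Toy visualizer that just returns copies of the stored page images."""

    @override
    def get_visualization(
        self, *, doc: DoclingDocument, **kwargs
    ) -> dict[Optional[int], Image]:
        images: dict[Optional[int], Image] = {}
        for page_nr, page in doc.pages.items():
            if page.image is not None and page.image.pil_image is not None:
                images[page_nr] = page.image.pil_image.copy()
        return images
```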
docling_core/transforms/visualizer/layout_visualizer.py

```diff
@@ -0,0 +1,212 @@
+"""Define classes for layout visualization."""
+
+from copy import deepcopy
+from typing import Literal, Optional, Union
+
+from PIL import ImageDraw, ImageFont
+from PIL.Image import Image
+from PIL.ImageFont import FreeTypeFont
+from pydantic import BaseModel
+from typing_extensions import override
+
+from docling_core.transforms.visualizer.base import BaseVisualizer
+from docling_core.types.doc import DocItemLabel
+from docling_core.types.doc.base import CoordOrigin
+from docling_core.types.doc.document import ContentLayer, DocItem, DoclingDocument
+from docling_core.types.doc.page import BoundingRectangle, TextCell
+
+
+class _TLBoundingRectangle(BoundingRectangle):
+    coord_origin: Literal[CoordOrigin.TOPLEFT] = CoordOrigin.TOPLEFT
+
+
+class _TLTextCell(TextCell):
+    rect: _TLBoundingRectangle
+
+
+class _TLCluster(BaseModel):
+    id: int
+    label: DocItemLabel
+    brec: _TLBoundingRectangle
+    confidence: float = 1.0
+    cells: list[_TLTextCell] = []
+    children: list["_TLCluster"] = []  # Add child cluster support
+
+
+class LayoutVisualizer(BaseVisualizer):
+    """Layout visualizer."""
+
+    class Params(BaseModel):
+        """Layout visualization parameters."""
+
+        show_label: bool = True
+
+    base_visualizer: Optional[BaseVisualizer] = None
+    params: Params = Params()
+
+    def _draw_clusters(
+        self, image: Image, clusters: list[_TLCluster], scale_x: float, scale_y: float
+    ) -> None:
+        """Draw clusters on an image."""
+        draw = ImageDraw.Draw(image, "RGBA")
+        # Create a smaller font for the labels
+        font: Union[ImageFont.ImageFont, FreeTypeFont]
+        try:
+            font = ImageFont.truetype("arial.ttf", 12)
+        except OSError:
+            # Fallback to default font if arial is not available
+            font = ImageFont.load_default()
+        for c_tl in clusters:
+            all_clusters = [c_tl, *c_tl.children]
+            for c in all_clusters:
+                # Draw cells first (underneath)
+                cell_color = (0, 0, 0, 40)  # Transparent black for cells
+                for tc in c.cells:
+                    cx0, cy0, cx1, cy1 = tc.rect.to_bounding_box().as_tuple()
+                    cx0 *= scale_x
+                    cx1 *= scale_x
+                    cy0 *= scale_y
+                    cy1 *= scale_y
+
+                    draw.rectangle(
+                        [(cx0, cy0), (cx1, cy1)],
+                        outline=None,
+                        fill=cell_color,
+                    )
+                # Draw cluster rectangle
+                x0, y0, x1, y1 = c.brec.to_bounding_box().as_tuple()
+                x0 *= scale_x
+                x1 *= scale_x
+                y0 *= scale_y
+                y1 *= scale_y
+
+                cluster_fill_color = (*list(DocItemLabel.get_color(c.label)), 70)
+                cluster_outline_color = (
+                    *list(DocItemLabel.get_color(c.label)),
+                    255,
+                )
+                draw.rectangle(
+                    [(x0, y0), (x1, y1)],
+                    outline=cluster_outline_color,
+                    fill=cluster_fill_color,
+                )
+
+                if self.params.show_label:
+                    # Add label name and confidence
+                    label_text = f"{c.label.name} ({c.confidence:.2f})"
+                    # Create semi-transparent background for text
+                    text_bbox = draw.textbbox((x0, y0), label_text, font=font)
+                    text_bg_padding = 2
+                    draw.rectangle(
+                        [
+                            (
+                                text_bbox[0] - text_bg_padding,
+                                text_bbox[1] - text_bg_padding,
+                            ),
+                            (
+                                text_bbox[2] + text_bg_padding,
+                                text_bbox[3] + text_bg_padding,
+                            ),
+                        ],
+                        fill=(255, 255, 255, 180),  # Semi-transparent white
+                    )
+                    # Draw text
+                    draw.text(
+                        (x0, y0),
+                        label_text,
+                        fill=(0, 0, 0, 255),  # Solid black
+                        font=font,
+                    )
+
+    def _draw_doc_layout(
+        self, doc: DoclingDocument, images: Optional[dict[Optional[int], Image]] = None
+    ):
+        """Draw the document clusters and optionaly the reading order."""
+        clusters = []
+        my_images: dict[Optional[int], Image] = {}
+
+        if images is not None:
+            my_images = images
+
+        # Initialise `my_images` beforehand: sometimes, you have the
+        # page-images but no DocItems!
+        for page_nr, page in doc.pages.items():
+            page_image = doc.pages[page_nr].image
+            if page_image is None or (pil_img := page_image.pil_image) is None:
+                raise RuntimeError("Cannot visualize document without images")
+            elif page_nr not in my_images:
+                image = deepcopy(pil_img)
+                my_images[page_nr] = image
+
+        prev_image = None
+        prev_page_nr = None
+        for idx, (elem, _) in enumerate(
+            doc.iterate_items(
+                included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE}
+            )
+        ):
+            if not isinstance(elem, DocItem):
+                continue
+            if len(elem.prov) == 0:
+                continue  # Skip elements without provenances
+            prov = elem.prov[0]
+            page_nr = prov.page_no
+
+            if page_nr in my_images:
+                image = my_images[page_nr]
+            else:
+                raise RuntimeError(f"Cannot visualize page-image for {page_nr}")
+
+            if prev_page_nr is None or page_nr > prev_page_nr:  # new page begins
+                # complete previous drawing
+                if prev_page_nr is not None and prev_image and clusters:
+                    self._draw_clusters(
+                        image=prev_image,
+                        clusters=clusters,
+                        scale_x=prev_image.width / doc.pages[prev_page_nr].size.width,
+                        scale_y=prev_image.height / doc.pages[prev_page_nr].size.height,
+                    )
+                clusters = []
+
+            tlo_bbox = prov.bbox.to_top_left_origin(
+                page_height=doc.pages[prov.page_no].size.height
+            )
+            cluster = _TLCluster(
+                id=idx,
+                label=elem.label,
+                brec=_TLBoundingRectangle.from_bounding_box(bbox=tlo_bbox),
+                cells=[],
+            )
+            clusters.append(cluster)
+
+            prev_page_nr = page_nr
+            prev_image = image
+
+        # complete last drawing
+        if prev_page_nr is not None and prev_image and clusters:
+            self._draw_clusters(
+                image=prev_image,
+                clusters=clusters,
+                scale_x=prev_image.width / doc.pages[prev_page_nr].size.width,
+                scale_y=prev_image.height / doc.pages[prev_page_nr].size.height,
+            )
+
+        return my_images
+
+    @override
+    def get_visualization(
+        self,
+        *,
+        doc: DoclingDocument,
+        **kwargs,
+    ) -> dict[Optional[int], Image]:
+        """Get visualization of the document as images by page."""
+        base_images = (
+            self.base_visualizer.get_visualization(doc=doc, **kwargs)
+            if self.base_visualizer
+            else None
+        )
+        return self._draw_doc_layout(
+            doc=doc,
+            images=base_images,
+        )
```
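A usage sketch, assuming a `DoclingDocument` whose pages carry images (the input and output file names are illustrative):

```python
from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
from docling_core.types.doc import DoclingDocument

doc = DoclingDocument.load_from_json("doc.json")  # illustrative input file
viz = LayoutVisualizer()  # params default to Params(show_label=True)
for page_nr, image in viz.get_visualization(doc=doc).items():
    image.save(f"layout_page_{page_nr}.png")
```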
docling_core/transforms/visualizer/reading_order_visualizer.py

```diff
@@ -0,0 +1,149 @@
+"""Define classes for reading order visualization."""
+
+from copy import deepcopy
+from typing import Optional
+
+from PIL import ImageDraw
+from PIL.Image import Image
+from typing_extensions import override
+
+from docling_core.transforms.visualizer.base import BaseVisualizer
+from docling_core.types.doc.document import ContentLayer, DocItem, DoclingDocument
+
+
+class ReadingOrderVisualizer(BaseVisualizer):
+    """Reading order visualizer."""
+
+    base_visualizer: Optional[BaseVisualizer] = None
+
+    def _draw_arrow(
+        self,
+        draw: ImageDraw.ImageDraw,
+        arrow_coords: tuple[float, float, float, float],
+        line_width: int = 2,
+        color: str = "red",
+    ):
+        """Draw an arrow inside the given draw object."""
+        x0, y0, x1, y1 = arrow_coords
+
+        # Arrow parameters
+        start_point = (x0, y0)  # Starting point of the arrow
+        end_point = (x1, y1)  # Ending point of the arrow
+        arrowhead_length = 20  # Length of the arrowhead
+        arrowhead_width = 10  # Width of the arrowhead
+
+        # Draw the arrow shaft (line)
+        draw.line([start_point, end_point], fill=color, width=line_width)
+
+        # Calculate the arrowhead points
+        dx = end_point[0] - start_point[0]
+        dy = end_point[1] - start_point[1]
+        angle = (dx**2 + dy**2) ** 0.5 + 0.01  # Length of the arrow shaft
+
+        # Normalized direction vector for the arrow shaft
+        ux, uy = dx / angle, dy / angle
+
+        # Base of the arrowhead
+        base_x = end_point[0] - ux * arrowhead_length
+        base_y = end_point[1] - uy * arrowhead_length
+
+        # Left and right points of the arrowhead
+        left_x = base_x - uy * arrowhead_width
+        left_y = base_y + ux * arrowhead_width
+        right_x = base_x + uy * arrowhead_width
+        right_y = base_y - ux * arrowhead_width
+
+        # Draw the arrowhead (triangle)
+        draw.polygon(
+            [end_point, (left_x, left_y), (right_x, right_y)],
+            fill=color,
+        )
+        return draw
+
+    def _draw_doc_reading_order(
+        self,
+        doc: DoclingDocument,
+        images: Optional[dict[Optional[int], Image]] = None,
+    ):
+        """Draw the reading order."""
+        # draw = ImageDraw.Draw(image)
+        x0, y0 = None, None
+        my_images: dict[Optional[int], Image] = images or {}
+        prev_page = None
+        for elem, _ in doc.iterate_items(
+            included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE},
+        ):
+            if not isinstance(elem, DocItem):
+                continue
+            if len(elem.prov) == 0:
+                continue  # Skip elements without provenances
+            prov = elem.prov[0]
+            page_no = prov.page_no
+            image = my_images.get(page_no)
+
+            if image is None or prev_page is None or page_no > prev_page:
+                # new page begins
+                prev_page = page_no
+                x0 = y0 = None
+
+            if image is None:
+                page_image = doc.pages[page_no].image
+                if page_image is None or (pil_img := page_image.pil_image) is None:
+                    raise RuntimeError("Cannot visualize document without images")
+                else:
+                    image = deepcopy(pil_img)
+                    my_images[page_no] = image
+            draw = ImageDraw.Draw(image)
+
+            # if prov.page_no not in true_doc.pages or prov.page_no != 1:
+            #     logging.error(f"{prov.page_no} not in true_doc.pages -> skipping! ")
+            #     continue
+
+            tlo_bbox = prov.bbox.to_top_left_origin(
+                page_height=doc.pages[prov.page_no].size.height
+            )
+            ro_bbox = tlo_bbox.normalized(doc.pages[prov.page_no].size)
+            ro_bbox.l = round(ro_bbox.l * image.width)  # noqa: E741
+            ro_bbox.r = round(ro_bbox.r * image.width)
+            ro_bbox.t = round(ro_bbox.t * image.height)
+            ro_bbox.b = round(ro_bbox.b * image.height)
+
+            if ro_bbox.b > ro_bbox.t:
+                ro_bbox.b, ro_bbox.t = ro_bbox.t, ro_bbox.b
+
+            if x0 is None and y0 is None:
+                x0 = (ro_bbox.l + ro_bbox.r) / 2.0
+                y0 = (ro_bbox.b + ro_bbox.t) / 2.0
+            else:
+                assert x0 is not None
+                assert y0 is not None
+
+                x1 = (ro_bbox.l + ro_bbox.r) / 2.0
+                y1 = (ro_bbox.b + ro_bbox.t) / 2.0
+
+                draw = self._draw_arrow(
+                    draw=draw,
+                    arrow_coords=(x0, y0, x1, y1),
+                    line_width=2,
+                    color="red",
+                )
+                x0, y0 = x1, y1
+        return my_images
+
+    @override
+    def get_visualization(
+        self,
+        *,
+        doc: DoclingDocument,
+        **kwargs,
+    ) -> dict[Optional[int], Image]:
+        """Get visualization of the document as images by page."""
+        base_images = (
+            self.base_visualizer.get_visualization(doc=doc, **kwargs)
+            if self.base_visualizer
+            else None
+        )
+        return self._draw_doc_reading_order(
+            doc=doc,
+            images=base_images,
+        )
```
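The `base_visualizer` hook lets visualizers stack; a sketch drawing reading-order arrows on top of the layout boxes (file names illustrative):

```python
from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
from docling_core.transforms.visualizer.reading_order_visualizer import (
    ReadingOrderVisualizer,
)
from docling_core.types.doc import DoclingDocument

doc = DoclingDocument.load_from_json("doc.json")  # illustrative input file
viz = ReadingOrderVisualizer(base_visualizer=LayoutVisualizer())
for page_nr, image in viz.get_visualization(doc=doc).items():
    image.save(f"reading_order_page_{page_nr}.png")
```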