docling-core 2.28.1__tar.gz → 2.30.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of docling-core might be problematic.
- {docling_core-2.28.1 → docling_core-2.30.0}/PKG-INFO +1 -1
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/hierarchical_chunker.py +5 -5
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/hybrid_chunker.py +4 -4
- {docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/common.py +1 -1
- {docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/doctags.py +2 -2
- {docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/html.py +29 -3
- {docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/markdown.py +2 -2
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/visualizer/layout_visualizer.py +33 -30
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +52 -50
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/document.py +59 -38
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/labels.py +1 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/page.py +25 -4
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/legacy.py +1 -1
- {docling_core-2.28.1 → docling_core-2.30.0}/pyproject.toml +1 -1
- {docling_core-2.28.1 → docling_core-2.30.0}/LICENSE +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/README.md +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/cli/view.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/py.typed +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/search/package.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
- {docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/__init__.py +0 -0
- {docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/base.py +0 -0
- {docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/html_styles.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/visualizer/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/visualizer/base.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/base.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/validators.py +0 -0
{docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
@@ -14,19 +14,19 @@ from typing import Any, ClassVar, Final, Iterator, Literal, Optional
 from pydantic import ConfigDict, Field, StringConstraints, field_validator
 from typing_extensions import Annotated, override

-from docling_core.
+from docling_core.search.package import VERSION_PATTERN
+from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
+from docling_core.transforms.serializer.base import (
     BaseDocSerializer,
     BaseSerializerProvider,
     BaseTableSerializer,
     SerializationResult,
 )
-from docling_core.
-from docling_core.
+from docling_core.transforms.serializer.common import create_ser_result
+from docling_core.transforms.serializer.markdown import (
     MarkdownDocSerializer,
     MarkdownParams,
 )
-from docling_core.search.package import VERSION_PATTERN
-from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
 from docling_core.types import DoclingDocument as DLDocument
 from docling_core.types.doc.base import ImageRefMode
 from docling_core.types.doc.document import (
{docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
@@ -25,10 +25,6 @@ except ImportError:
         "`pip install 'docling-core[chunking-openai]'`"
     )

-from docling_core.experimental.serializer.base import (
-    BaseDocSerializer,
-    BaseSerializerProvider,
-)
 from docling_core.transforms.chunker import (
     BaseChunk,
     BaseChunker,
@@ -36,6 +32,10 @@ from docling_core.transforms.chunker import (
     DocMeta,
     HierarchicalChunker,
 )
+from docling_core.transforms.serializer.base import (
+    BaseDocSerializer,
+    BaseSerializerProvider,
+)
 from docling_core.types import DoclingDocument


{docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/common.py
RENAMED
@@ -14,7 +14,7 @@ from typing import Any, Iterable, Optional, Tuple, Union
 from pydantic import AnyUrl, BaseModel, NonNegativeInt, computed_field
 from typing_extensions import Self, override

-from docling_core.
+from docling_core.transforms.serializer.base import (
     BaseDocSerializer,
     BaseFallbackSerializer,
     BaseFormSerializer,
{docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/doctags.py
RENAMED
@@ -6,7 +6,7 @@ from typing import Any, Dict, List, Optional, Union
 from pydantic import BaseModel
 from typing_extensions import override

-from docling_core.
+from docling_core.transforms.serializer.base import (
     BaseDocSerializer,
     BaseFallbackSerializer,
     BaseFormSerializer,
@@ -18,7 +18,7 @@ from docling_core.experimental.serializer.base import (
     BaseTextSerializer,
     SerializationResult,
 )
-from docling_core.
+from docling_core.transforms.serializer.common import (
     CommonParams,
     DocSerializer,
     create_ser_result,
{docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/html.py
RENAMED
@@ -19,7 +19,7 @@ import latex2mathml.converter
 from pydantic import AnyUrl, BaseModel
 from typing_extensions import override

-from docling_core.
+from docling_core.transforms.serializer.base import (
     BaseDocSerializer,
     BaseFallbackSerializer,
     BaseFormSerializer,
@@ -31,12 +31,12 @@ from docling_core.experimental.serializer.base import (
     BaseTextSerializer,
     SerializationResult,
 )
-from docling_core.
+from docling_core.transforms.serializer.common import (
     CommonParams,
     DocSerializer,
     create_ser_result,
 )
-from docling_core.
+from docling_core.transforms.serializer.html_styles import (
     _get_css_for_single_column,
     _get_css_for_split_page,
 )
@@ -370,6 +370,13 @@ class HTMLPictureSerializer(BasePictureSerializer):
         **kwargs: Any,
     ) -> SerializationResult:
         """Export picture to HTML format."""
+
+        def get_img_row(imgb64: str, ind: int) -> str:
+            row = '<tr><td style="border: 2px solid black; padding: 8px;">'
+            row += f'<img src="data:image/png;base64,{imgb64}" alt="image {ind}">'
+            row += "</td></tr>\n"
+            return row
+
         params = HTMLParams(**kwargs)

         res_parts: list[SerializationResult] = []
@@ -393,6 +400,22 @@ class HTMLPictureSerializer(BasePictureSerializer):
                 and item.image.uri.scheme == "data"
             ):
                 img_text = f'<img src="{item.image.uri}">'
+            elif len(item.prov) > 1:  # more than 1 provenance
+
+                img_text = (
+                    '<table style="border-collapse: collapse; width: 100%;">\n'
+                )
+                for ind, prov in enumerate(item.prov):
+                    img = item.get_image(doc, prov_index=ind)
+
+                    if img is not None:
+                        imgb64 = item._image_to_base64(img)
+                        img_text += get_img_row(imgb64=imgb64, ind=ind)
+                    else:
+                        _logger.warning("Could not get image")
+
+                img_text += "</table>\n"
+
             else:
                 # get the item.image._pil or crop it out of the page-image
                 img = item.get_image(doc)
@@ -400,6 +423,9 @@ class HTMLPictureSerializer(BasePictureSerializer):
                 if img is not None:
                     imgb64 = item._image_to_base64(img)
                     img_text = f'<img src="data:image/png;base64,{imgb64}">'
+                else:
+                    _logger.warning("Could not get image")
+
             elif params.image_mode == ImageRefMode.REFERENCED:
                 if isinstance(item.image, ImageRef) and not (
                     isinstance(item.image.uri, AnyUrl)
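Net effect of the new branch: a picture with more than one provenance is serialized, under embedded image mode, as an HTML table holding one base64-encoded image row per provenance crop. A minimal sketch of exercising it through the document-level export; the input file name is illustrative and the sketch assumes the document carries page images:

    from docling_core.types import DoclingDocument
    from docling_core.types.doc.base import ImageRefMode

    # hypothetical input: a DoclingDocument JSON that includes page images
    doc = DoclingDocument.load_from_json("sample_with_page_images.json")

    # pictures with len(item.prov) > 1 are now emitted as a bordered table of
    # embedded <img> rows, one per provenance region, instead of a single crop
    html = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED)
    print(html[:500])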
{docling_core-2.28.1/docling_core/experimental → docling_core-2.30.0/docling_core/transforms}/serializer/markdown.py
RENAMED
@@ -14,7 +14,7 @@ from pydantic import AnyUrl, BaseModel, PositiveInt
 from tabulate import tabulate
 from typing_extensions import override

-from docling_core.
+from docling_core.transforms.serializer.base import (
     BaseDocSerializer,
     BaseFallbackSerializer,
     BaseFormSerializer,
@@ -26,7 +26,7 @@ from docling_core.experimental.serializer.base import (
     BaseTextSerializer,
     SerializationResult,
 )
-from docling_core.
+from docling_core.transforms.serializer.common import (
     CommonParams,
     DocSerializer,
     _PageBreakSerResult,
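Taken together, these renames move the serializer package out of docling_core.experimental and into docling_core.transforms; the class names themselves are unchanged. A minimal sketch of the downstream import update and of whole-document Markdown serialization, assuming the serializer constructor and serialize() behave as in the docling-core documentation:

    # docling-core 2.28.1 (old location, inferred from the renamed paths above):
    #   from docling_core.experimental.serializer.markdown import MarkdownDocSerializer
    # docling-core 2.30.0 (new location):
    from docling_core.transforms.serializer.markdown import (
        MarkdownDocSerializer,
        MarkdownParams,
    )
    from docling_core.types import DoclingDocument

    doc = DoclingDocument.load_from_json("sample.json")  # hypothetical input
    serializer = MarkdownDocSerializer(doc=doc, params=MarkdownParams())
    print(serializer.serialize().text)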
{docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/visualizer/layout_visualizer.py
RENAMED
@@ -149,38 +149,41 @@ class LayoutVisualizer(BaseVisualizer):
                 continue
             if len(elem.prov) == 0:
                 continue  # Skip elements without provenances
-            prov = elem.prov[0]
-            page_nr = prov.page_no
-
-            if page_nr in my_images:
-                image = my_images[page_nr]
-            else:
-                raise RuntimeError(f"Cannot visualize page-image for {page_nr}")
-
-            if prev_page_nr is None or page_nr > prev_page_nr:  # new page begins
-                # complete previous drawing
-                if prev_page_nr is not None and prev_image and clusters:
-                    self._draw_clusters(
-                        image=prev_image,
-                        clusters=clusters,
-                        scale_x=prev_image.width / doc.pages[prev_page_nr].size.width,
-                        scale_y=prev_image.height / doc.pages[prev_page_nr].size.height,
-                    )
-                clusters = []

- [old lines 171-180: content not preserved in this view]
+            for prov in elem.prov:
+                page_nr = prov.page_no
+
+                if page_nr in my_images:
+                    image = my_images[page_nr]
+                else:
+                    raise RuntimeError(f"Cannot visualize page-image for {page_nr}")
+
+                if prev_page_nr is None or page_nr > prev_page_nr:  # new page begins
+                    # complete previous drawing
+                    if prev_page_nr is not None and prev_image and clusters:
+                        self._draw_clusters(
+                            image=prev_image,
+                            clusters=clusters,
+                            scale_x=prev_image.width
+                            / doc.pages[prev_page_nr].size.width,
+                            scale_y=prev_image.height
+                            / doc.pages[prev_page_nr].size.height,
+                        )
+                    clusters = []
+
+                tlo_bbox = prov.bbox.to_top_left_origin(
+                    page_height=doc.pages[prov.page_no].size.height
+                )
+                cluster = _TLCluster(
+                    id=idx,
+                    label=elem.label,
+                    brec=_TLBoundingRectangle.from_bounding_box(bbox=tlo_bbox),
+                    cells=[],
+                )
+                clusters.append(cluster)

- [old lines 182-183: content not preserved in this view]
+                prev_page_nr = page_nr
+                prev_image = image

             # complete last drawing
             if prev_page_nr is not None and prev_image and clusters:
{docling_core-2.28.1 → docling_core-2.30.0}/docling_core/transforms/visualizer/reading_order_visualizer.py
RENAMED
@@ -77,57 +77,59 @@ class ReadingOrderVisualizer(BaseVisualizer):
                 continue
             if len(elem.prov) == 0:
                 continue  # Skip elements without provenances
- [old lines 80-105: content not preserved in this view]
-            ro_bbox.l = round(ro_bbox.l * image.width)  # noqa: E741
-            ro_bbox.r = round(ro_bbox.r * image.width)
-            ro_bbox.t = round(ro_bbox.t * image.height)
-            ro_bbox.b = round(ro_bbox.b * image.height)
-
-            if ro_bbox.b > ro_bbox.t:
-                ro_bbox.b, ro_bbox.t = ro_bbox.t, ro_bbox.b
-
-            if x0 is None and y0 is None:
-                x0 = (ro_bbox.l + ro_bbox.r) / 2.0
-                y0 = (ro_bbox.b + ro_bbox.t) / 2.0
-            else:
-                assert x0 is not None
-                assert y0 is not None
-
-                x1 = (ro_bbox.l + ro_bbox.r) / 2.0
-                y1 = (ro_bbox.b + ro_bbox.t) / 2.0
-
-                draw = self._draw_arrow(
-                    draw=draw,
-                    arrow_coords=(x0, y0, x1, y1),
-                    line_width=2,
-                    color="red",
+
+            for prov in elem.prov:
+                page_no = prov.page_no
+                image = my_images.get(page_no)
+
+                if image is None or prev_page is None or page_no > prev_page:
+                    # new page begins
+                    prev_page = page_no
+                    x0 = y0 = None
+
+                    if image is None:
+                        page_image = doc.pages[page_no].image
+                        if (
+                            page_image is None
+                            or (pil_img := page_image.pil_image) is None
+                        ):
+                            raise RuntimeError(
+                                "Cannot visualize document without images"
+                            )
+                        else:
+                            image = deepcopy(pil_img)
+                            my_images[page_no] = image
+                    draw = ImageDraw.Draw(image)
+
+                tlo_bbox = prov.bbox.to_top_left_origin(
+                    page_height=doc.pages[prov.page_no].size.height
                 )
-
+                ro_bbox = tlo_bbox.normalized(doc.pages[prov.page_no].size)
+                ro_bbox.l = round(ro_bbox.l * image.width)  # noqa: E741
+                ro_bbox.r = round(ro_bbox.r * image.width)
+                ro_bbox.t = round(ro_bbox.t * image.height)
+                ro_bbox.b = round(ro_bbox.b * image.height)
+
+                if ro_bbox.b > ro_bbox.t:
+                    ro_bbox.b, ro_bbox.t = ro_bbox.t, ro_bbox.b
+
+                if x0 is None and y0 is None:
+                    x0 = (ro_bbox.l + ro_bbox.r) / 2.0
+                    y0 = (ro_bbox.b + ro_bbox.t) / 2.0
+                else:
+                    assert x0 is not None
+                    assert y0 is not None
+
+                    x1 = (ro_bbox.l + ro_bbox.r) / 2.0
+                    y1 = (ro_bbox.b + ro_bbox.t) / 2.0
+
+                    draw = self._draw_arrow(
+                        draw=draw,
+                        arrow_coords=(x0, y0, x1, y1),
+                        line_width=2,
+                        color="red",
+                    )
+                    x0, y0 = x1, y1
         return my_images

     @override
{docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/document.py
RENAMED
@@ -790,7 +790,9 @@ class DocItem(

         return location

-    def get_image(
+    def get_image(
+        self, doc: "DoclingDocument", prov_index: int = 0
+    ) -> Optional[PILImage.Image]:
         """Returns the image of this DocItem.

         The function returns None if this DocItem has no valid provenance or
@@ -800,7 +802,7 @@ class DocItem(
         if not len(self.prov):
             return None

-        page = doc.pages.get(self.prov[
+        page = doc.pages.get(self.prov[prov_index].page_no)
         if page is None or page.size is None or page.image is None:
             return None

@@ -808,7 +810,7 @@ class DocItem(
         if not page_image:
             return None
         crop_bbox = (
-            self.prov[
+            self.prov[prov_index]
             .bbox.to_top_left_origin(page_height=page.size.height)
             .scale_to_size(old_size=page.size, new_size=page.image.size)
             # .scaled(scale=page_image.height / page.size.height)
@@ -872,7 +874,7 @@ class TextItem(DocItem):
         :param add_content: bool: (Default value = True)

         """
-        from docling_core.
+        from docling_core.transforms.serializer.doctags import (
             DocTagsDocSerializer,
             DocTagsParams,
         )
@@ -930,7 +932,7 @@ class SectionHeaderItem(TextItem):
         :param add_content: bool: (Default value = True)

         """
-        from docling_core.
+        from docling_core.transforms.serializer.doctags import (
             DocTagsDocSerializer,
             DocTagsParams,
         )
@@ -973,7 +975,9 @@ class FloatingItem(DocItem):
             text += cap.resolve(doc).text
         return text

-    def get_image(
+    def get_image(
+        self, doc: "DoclingDocument", prov_index: int = 0
+    ) -> Optional[PILImage.Image]:
         """Returns the image corresponding to this FloatingItem.

         This function returns the PIL image from self.image if one is available.
@@ -985,7 +989,7 @@ class FloatingItem(DocItem):
         """
         if self.image is not None:
             return self.image.pil_image
-        return super().get_image(doc=doc)
+        return super().get_image(doc=doc, prov_index=prov_index)


 class CodeItem(FloatingItem, TextItem):
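The new prov_index parameter lets callers crop a separate image for each provenance of an item, which is exactly what the HTML picture serializer above relies on. A minimal sketch; the input file name and output naming are illustrative, and the document is assumed to carry page images:

    from docling_core.types import DoclingDocument
    from docling_core.types.doc.document import DocItem

    doc = DoclingDocument.load_from_json("sample_with_page_images.json")  # hypothetical input

    for i, (item, _level) in enumerate(doc.iterate_items()):
        if not isinstance(item, DocItem):
            continue
        # items spanning several regions or pages carry one ProvenanceItem per region
        for ind, prov in enumerate(item.prov):
            img = item.get_image(doc, prov_index=ind)  # prov_index defaults to 0, as before
            if img is not None:
                img.save(f"item{i}_prov{ind}.png")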
@@ -1020,7 +1024,7 @@ class CodeItem(FloatingItem, TextItem):
         :param add_content: bool: (Default value = True)

         """
-        from docling_core.
+        from docling_core.transforms.serializer.doctags import (
             DocTagsDocSerializer,
             DocTagsParams,
         )
@@ -1073,7 +1077,7 @@ class PictureItem(FloatingItem):
             image_bytes = self.image._pil.tobytes()

             # Create a hash object (e.g., SHA-256)
-            hasher = hashlib.sha256()
+            hasher = hashlib.sha256(usedforsecurity=False)

             # Feed the image bytes into the hash object
             hasher.update(image_bytes)
@@ -1091,7 +1095,7 @@ class PictureItem(FloatingItem):
         image_placeholder: str = "<!-- image -->",
     ) -> str:
         """Export picture to Markdown format."""
-        from docling_core.
+        from docling_core.transforms.serializer.markdown import (
             MarkdownDocSerializer,
             MarkdownParams,
         )
@@ -1118,7 +1122,7 @@ class PictureItem(FloatingItem):
         image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
     ) -> str:
         """Export picture to HTML format."""
-        from docling_core.
+        from docling_core.transforms.serializer.html import (
             HTMLDocSerializer,
             HTMLParams,
         )
@@ -1159,7 +1163,7 @@ class PictureItem(FloatingItem):
         :param # not used at the moment

         """
-        from docling_core.
+        from docling_core.transforms.serializer.doctags import (
             DocTagsDocSerializer,
             DocTagsParams,
         )
@@ -1235,7 +1239,7 @@ class TableItem(FloatingItem):
     def export_to_markdown(self, doc: Optional["DoclingDocument"] = None) -> str:
         """Export the table as markdown."""
         if doc is not None:
-            from docling_core.
+            from docling_core.transforms.serializer.markdown import (
                 MarkdownDocSerializer,
             )

@@ -1282,7 +1286,7 @@ class TableItem(FloatingItem):
     ) -> str:
         """Export the table as html."""
         if doc is not None:
-            from docling_core.
+            from docling_core.transforms.serializer.html import HTMLDocSerializer

             serializer = HTMLDocSerializer(doc=doc)
             text = serializer.serialize(item=self).text
@@ -1414,7 +1418,7 @@ class TableItem(FloatingItem):
         :param add_caption: bool: (Default value = True)

         """
-        from docling_core.
+        from docling_core.transforms.serializer.doctags import (
             DocTagsDocSerializer,
             DocTagsParams,
         )
@@ -1512,7 +1516,7 @@ class KeyValueItem(FloatingItem):
         :param add_content: bool: (Default value = True)

         """
-        from docling_core.
+        from docling_core.transforms.serializer.doctags import (
             DocTagsDocSerializer,
             DocTagsParams,
         )
@@ -2657,16 +2661,25 @@ class DoclingDocument(BaseModel):
             if should_yield:
                 yield root, my_stack

-            # Handle picture traversal - only traverse children if requested
-            if isinstance(root, PictureItem) and not traverse_pictures:
-                return
-
             my_stack.append(-1)

+            allowed_pic_refs: set[str] = (
+                {r.cref for r in root.captions}
+                if (root_is_picture := isinstance(root, PictureItem))
+                else set()
+            )
+
             # Traverse children
             for child_ind, child_ref in enumerate(root.children):
-                my_stack[-1] = child_ind
                 child = child_ref.resolve(self)
+                if (
+                    root_is_picture
+                    and not traverse_pictures
+                    and isinstance(child, DocItem)
+                    and child.self_ref not in allowed_pic_refs
+                ):
+                    continue
+                my_stack[-1] = child_ind

                 if isinstance(child, NodeItem):
                     yield from self._iterate_items_with_stack(
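With this change, skipping picture sub-trees no longer drops everything below a PictureItem: caption items referenced by the picture are still yielded, while other items nested under it are skipped. A minimal sketch, assuming the public iterate_items method exposes the same traverse_pictures flag as the internal helper shown above (an assumption, not confirmed by this diff):

    from docling_core.types import DoclingDocument
    from docling_core.types.doc.document import TextItem

    doc = DoclingDocument.load_from_json("sample.json")  # hypothetical input

    # pictures are not traversed, yet their caption items still appear in the stream
    for item, level in doc.iterate_items(traverse_pictures=False):
        if isinstance(item, TextItem):
            print(level, item.label, item.text[:40])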
@@ -2999,7 +3012,7 @@ class DoclingDocument(BaseModel):
         :returns: The exported Markdown representation.
         :rtype: str
         """
-        from docling_core.
+        from docling_core.transforms.serializer.markdown import (
             MarkdownDocSerializer,
             MarkdownParams,
         )
@@ -3153,7 +3166,7 @@ class DoclingDocument(BaseModel):
         split_page_view: bool = False,
     ) -> str:
         r"""Serialize to HTML."""
-        from docling_core.
+        from docling_core.transforms.serializer.html import (
             HTMLDocSerializer,
             HTMLOutputStyle,
             HTMLParams,
@@ -3195,9 +3208,9 @@ class DoclingDocument(BaseModel):

         return ser_res.text

+    @staticmethod
     def load_from_doctags(  # noqa: C901
-
-        doctag_document: DocTagsDocument,
+        doctag_document: DocTagsDocument, document_name: str = "Document"
     ) -> "DoclingDocument":
         r"""Load Docling document from lists of DocTags and Images."""
         # Maps the recognized tag to a Docling label.
@@ -3221,6 +3234,8 @@ class DoclingDocument(BaseModel):
             "key_value_region": DocItemLabel.KEY_VALUE_REGION,
         }

+        doc = DoclingDocument(name=document_name)
+
         def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
             """Extract <loc_...> coords from the chunk, normalized by / 500."""
             coords = re.findall(r"<loc_(\d+)>", text_chunk)
@@ -3244,7 +3259,7 @@ class DoclingDocument(BaseModel):
                 caption_content = caption.group(1)
                 bbox = extract_bounding_box(caption_content)
                 caption_text = extract_inner_text(caption_content)
-                caption_item =
+                caption_item = doc.add_text(
                     label=DocItemLabel.CAPTION,
                     text=caption_text,
                     parent=None,
@@ -3567,7 +3582,7 @@ class DoclingDocument(BaseModel):
                 pg_width = 1
                 pg_height = 1

-
+            doc.add_page(
                 page_no=page_no,
                 size=Size(width=pg_width, height=pg_height),
                 image=ImageRef.from_pil(image=image, dpi=72) if image else None,
@@ -3595,7 +3610,9 @@ class DoclingDocument(BaseModel):
                 rf"{DocumentToken.UNORDERED_LIST.value}|"
                 rf"{DocItemLabel.KEY_VALUE_REGION}|"
                 rf"{DocumentToken.CHART.value}|"
-                rf"{DocumentToken.OTSL.value})
+                rf"{DocumentToken.OTSL.value})>"
+                rf"(?P<content>.*?)"
+                rf"(?:(?P<closed></(?P=tag)>)|(?P<eof>$))"
             )
             pattern = re.compile(tag_pattern, re.DOTALL)

@@ -3605,6 +3622,10 @@ class DoclingDocument(BaseModel):
                 tag_name = match.group("tag")

                 bbox = extract_bounding_box(full_chunk)  # Extracts first bbox
+                if not match.group("closed"):
+                    # no closing tag; only the existence of the item is recovered
+                    full_chunk = f"<{tag_name}></{tag_name}>"
+
                 doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)

                 if tag_name == DocumentToken.OTSL.value:
@@ -3624,9 +3645,9 @@ class DoclingDocument(BaseModel):
                             charspan=(0, 0),
                             page_no=page_no,
                         )
-
+                        doc.add_table(data=table_data, prov=prov, caption=caption)
                     else:
-
+                        doc.add_table(data=table_data, caption=caption)

                 elif tag_name in [DocItemLabel.PICTURE, DocItemLabel.CHART]:
                     caption, caption_bbox = extract_caption(full_chunk)
@@ -3646,7 +3667,7 @@ class DoclingDocument(BaseModel):
                             int(bbox.b * im_height),
                         )
                         cropped_image = image.crop(crop_box)
-                        pic =
+                        pic = doc.add_picture(
                             parent=None,
                             image=ImageRef.from_pil(image=cropped_image, dpi=72),
                             prov=(
@@ -3692,7 +3713,7 @@ class DoclingDocument(BaseModel):
                     else:
                         if bbox:
                             # In case we don't have access to an binary of an image
-                            pic =
+                            pic = doc.add_picture(
                                 parent=None,
                                 prov=ProvenanceItem(
                                     bbox=bbox, charspan=(0, 0), page_no=page_no
@@ -3733,7 +3754,7 @@ class DoclingDocument(BaseModel):
                     key_value_data, kv_item_prov = parse_key_value_item(
                         full_chunk, image
                     )
-
+                    doc.add_key_values(graph=key_value_data, prov=kv_item_prov)
                 elif tag_name in [
                     DocumentToken.ORDERED_LIST.value,
                     DocumentToken.UNORDERED_LIST.value,
@@ -3749,7 +3770,7 @@ class DoclingDocument(BaseModel):
                     )
                     li_pattern = re.compile(list_item_pattern, re.DOTALL)
                     # Add list group:
-                    new_list =
+                    new_list = doc.add_group(label=list_label, name="list")
                     # Pricess list items
                     for li_match in li_pattern.finditer(full_chunk):
                         enum_value += 1
@@ -3760,7 +3781,7 @@ class DoclingDocument(BaseModel):
                         li_bbox = extract_bounding_box(li_full_chunk) if image else None
                         text_content = extract_inner_text(li_full_chunk)
                         # Add list item
-
+                        doc.add_list_item(
                             marker=enum_marker,
                             enumerated=(tag_name == DocumentToken.ORDERED_LIST.value),
                             parent=new_list,
@@ -3792,13 +3813,13 @@ class DoclingDocument(BaseModel):
                     if tag_name in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
                         content_layer = ContentLayer.FURNITURE

-
+                    doc.add_text(
                         label=doc_label,
                         text=text_content,
                         prov=element_prov,
                         content_layer=content_layer,
                     )
-        return
+        return doc

     @deprecated("Use save_as_doctags instead.")
     def save_as_document_tokens(self, *args, **kwargs):
@@ -3885,7 +3906,7 @@ class DoclingDocument(BaseModel):
         :returns: The content of the document formatted as a DocTags string.
         :rtype: str
         """
-        from docling_core.
+        from docling_core.transforms.serializer.doctags import (
             DocTagsDocSerializer,
             DocTagsParams,
         )
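load_from_doctags is now a staticmethod that builds and returns a fresh DoclingDocument, taking an optional document_name instead of mutating an existing instance. A minimal usage sketch; the DocTags string is illustrative, and the DocTagsDocument construction follows the docling-core API as commonly documented (treat it as an assumption):

    from PIL import Image

    from docling_core.types.doc import DoclingDocument
    from docling_core.types.doc.document import DocTagsDocument

    # illustrative DocTags snippet and a placeholder page image
    doctags = "<doctag><text><loc_10><loc_10><loc_100><loc_20>Hello</text></doctag>"
    page_image = Image.new("RGB", (500, 500), "white")

    # pair each DocTags page string with its page image
    dt_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [page_image])

    # new in 2.30.0: staticmethod that returns the constructed document
    doc = DoclingDocument.load_from_doctags(dt_doc, document_name="demo")
    print(doc.export_to_markdown())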
{docling_core-2.28.1 → docling_core-2.30.0}/docling_core/types/doc/page.py
RENAMED
@@ -472,8 +472,27 @@ class SegmentedPage(BaseModel):
     word_cells: List[TextCell] = []
     textline_cells: List[TextCell] = []

+    # These flags are set to differentiate if above lists of this SegmentedPage
+    # are empty (page had no content) or if they have not been computed (i.e. textline_cells may be present
+    # but word_cells are not)
+    has_chars: bool = False
+    has_words: bool = False
+    has_lines: bool = False
+
     image: Optional[ImageRef] = None

+    @model_validator(mode="after")
+    def validate_page(self) -> "SegmentedPage":
+        """Validate page."""
+        if len(self.textline_cells) > 0:
+            self.has_lines = True
+        if len(self.word_cells) > 0:
+            self.has_words = True
+        if len(self.char_cells) > 0:
+            self.has_chars = True
+
+        return self
+
     def iterate_cells(self, unit_type: TextCellUnit) -> Iterator[TextCell]:
         """Iterate through text cells of the specified unit type.

@@ -579,13 +598,17 @@ class SegmentedPdfPage(SegmentedPage):
         with open(filename, "r", encoding="utf-8") as f:
             return cls.model_validate_json(f.read())

-    def crop_text(
+    def crop_text(
+        self, cell_unit: TextCellUnit, bbox: BoundingBox, eps: float = 1.0
+    ) -> str:
         """Extract text from cells within the specified bounding box.

         Args:
             cell_unit: Type of text unit to extract
             bbox: Bounding box to extract from
             eps: Epsilon value for position comparison
+        Returns:
+            Extracted text from the cells
         """
         selection = []
         for page_cell in self.iterate_cells(cell_unit):
@@ -605,7 +628,6 @@ class SegmentedPdfPage(SegmentedPage):

         text = ""
         for i, cell in enumerate(selection):
-
             if i == 0:
                 text += cell.text
             else:
@@ -619,6 +641,7 @@ class SegmentedPdfPage(SegmentedPage):
             else:
                 text += " "
             text += cell.text
+        return text

     def export_to_textlines(
         self,
@@ -640,7 +663,6 @@ class SegmentedPdfPage(SegmentedPage):
         """
         lines: List[str] = []
         for cell in self.iterate_cells(cell_unit):
-
             line = ""
             if add_location:
                 line += f"({cell.rect.r_x0:06.02f}, {cell.rect.r_y0:06.02f}) "
@@ -1104,7 +1126,6 @@ class SegmentedPdfPage(SegmentedPage):

         # Draw each rectangle by connecting its four points
         for line in self.lines:
-
             line.to_top_left_origin(page_height=page_height)
             for segment in line.iterate_segments():
                 draw.line(
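The new has_chars/has_words/has_lines flags distinguish "this granularity was never extracted" from "the page genuinely has no content at that granularity"; the model validator sets them from whichever cell lists are populated at construction time. crop_text also now explicitly returns the extracted string. A minimal sketch of reading the flags back from a serialized page; the file name is illustrative and the WORD enum member is an assumption:

    from pathlib import Path

    from docling_core.types.doc.page import SegmentedPdfPage, TextCellUnit

    # hypothetical file: a SegmentedPdfPage previously dumped as JSON by a PDF backend
    page = SegmentedPdfPage.model_validate_json(Path("page_0.json").read_text())

    if page.has_words:
        words = list(page.iterate_cells(TextCellUnit.WORD))  # WORD member assumed
        print(f"{len(words)} word cells on this page")
    elif page.has_lines:
        print("words were not extracted; only text lines are available")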
{docling_core-2.28.1 → docling_core-2.30.0}/docling_core/utils/legacy.py
RENAMED
@@ -47,7 +47,7 @@ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocu


 def _create_hash(string: str):
-    hasher = hashlib.sha256()
+    hasher = hashlib.sha256(usedforsecurity=False)
     hasher.update(string.encode("utf-8"))

     return hasher.hexdigest()
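Both this helper and the PictureItem image hashing above now pass usedforsecurity=False, which marks the digest as non-cryptographic so it keeps working on FIPS-restricted Python builds (the flag exists since Python 3.9). A minimal sketch of the same pattern:

    import hashlib

    def content_fingerprint(data: bytes) -> str:
        # usedforsecurity=False: the hash is only a content identifier, not a
        # security primitive, so FIPS-enforcing OpenSSL builds will not reject it
        hasher = hashlib.sha256(data, usedforsecurity=False)
        return hasher.hexdigest()

    print(content_fingerprint(b"docling"))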