docling-core 2.0.1__tar.gz → 2.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic; see the registry's advisory details for more information.

Files changed (55)
  1. {docling_core-2.0.1 → docling_core-2.2.0}/PKG-INFO +2 -2
  2. {docling_core-2.0.1 → docling_core-2.2.0}/README.md +1 -1
  3. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/transforms/chunker/__init__.py +4 -1
  4. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/transforms/chunker/base.py +1 -1
  5. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/transforms/chunker/hierarchical_chunker.py +58 -6
  6. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/doc/__init__.py +1 -1
  7. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/doc/base.py +7 -0
  8. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/doc/document.py +252 -112
  9. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/utils/file.py +17 -4
  10. {docling_core-2.0.1 → docling_core-2.2.0}/pyproject.toml +1 -1
  11. {docling_core-2.0.1 → docling_core-2.2.0}/LICENSE +0 -0
  12. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/__init__.py +0 -0
  13. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/py.typed +0 -0
  14. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  15. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  16. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  17. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  18. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  19. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  20. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  21. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  22. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/search/__init__.py +0 -0
  23. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  24. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/search/mapping.py +0 -0
  25. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/search/meta.py +0 -0
  26. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/search/package.py +0 -0
  27. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/transforms/__init__.py +0 -0
  28. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/__init__.py +0 -0
  29. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/base.py +0 -0
  30. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/doc/labels.py +0 -0
  31. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/gen/__init__.py +0 -0
  32. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/gen/generic.py +0 -0
  33. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  34. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/legacy_doc/base.py +0 -0
  35. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  36. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  37. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  38. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/legacy_doc/document.py +0 -0
  39. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  40. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/nlp/__init__.py +0 -0
  41. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/nlp/qa.py +0 -0
  42. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/nlp/qa_labels.py +0 -0
  43. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/rec/__init__.py +0 -0
  44. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/rec/attribute.py +0 -0
  45. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/rec/base.py +0 -0
  46. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/rec/predicate.py +0 -0
  47. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/rec/record.py +0 -0
  48. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/rec/statement.py +0 -0
  49. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/types/rec/subject.py +0 -0
  50. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/utils/__init__.py +0 -0
  51. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/utils/alias.py +0 -0
  52. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/utils/generate_docs.py +0 -0
  53. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/utils/generate_jsonschema.py +0 -0
  54. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/utils/validate.py +0 -0
  55. {docling_core-2.0.1 → docling_core-2.2.0}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.0.1
3
+ Version: 2.2.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -95,7 +95,7 @@ poetry run pytest test
95
95
 
96
96
  Docling Core contains 3 top-level data types:
97
97
 
98
- - **DoclingDocument** for publications like books, articles, reports, or patents. When Docling converts an unstructured PDF document, the generated JSON follows this schema.
98
+ - **DoclingDocument** for publications like books, articles, reports, or patents. The JSON that can be exported using Docling follows this schema.
99
99
  The DoclingDocument type also models the metadata that may be attached to the converted document.
100
100
  Check [DoclingDocument](docs/DoclingDocument.json) for the full JSON schema.
101
101
  - **Record** for structured database records, centered on an entity or _subject_ that is provided with a list of attributes.
@@ -59,7 +59,7 @@ poetry run pytest test
59
59
 
60
60
  Docling Core contains 3 top-level data types:
61
61
 
62
- - **DoclingDocument** for publications like books, articles, reports, or patents. When Docling converts an unstructured PDF document, the generated JSON follows this schema.
62
+ - **DoclingDocument** for publications like books, articles, reports, or patents. The JSON that can be exported using Docling follows this schema.
63
63
  The DoclingDocument type also models the metadata that may be attached to the converted document.
64
64
  Check [DoclingDocument](docs/DoclingDocument.json) for the full JSON schema.
65
65
  - **Record** for structured database records, centered on an entity or _subject_ that is provided with a list of attributes.
@@ -6,4 +6,7 @@
6
6
  """Define the chunker types."""
7
7
 
8
8
  from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
9
- from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
9
+ from docling_core.transforms.chunker.hierarchical_chunker import (
10
+ DocMeta,
11
+ HierarchicalChunker,
12
+ )
@@ -13,7 +13,7 @@ from docling_core.types.doc import DoclingDocument as DLDocument
13
13
 
14
14
 
15
15
  class BaseMeta(BaseModel):
16
- """Metadata base class."""
16
+ """Chunk metadata base class."""
17
17
 
18
18
  excluded_embed: ClassVar[list[str]] = []
19
19
  excluded_llm: ClassVar[list[str]] = []
@@ -8,15 +8,19 @@
8
8
  from __future__ import annotations
9
9
 
10
10
  import logging
11
- from typing import Any, ClassVar, Iterator, Optional
11
+ import re
12
+ from typing import Any, ClassVar, Final, Iterator, Literal, Optional
12
13
 
13
14
  from pandas import DataFrame
14
- from pydantic import Field
15
+ from pydantic import Field, StringConstraints, field_validator
16
+ from typing_extensions import Annotated
15
17
 
18
+ from docling_core.search.package import VERSION_PATTERN
16
19
  from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
17
- from docling_core.types.doc import DoclingDocument as DLDocument
20
+ from docling_core.types import DoclingDocument as DLDocument
18
21
  from docling_core.types.doc.document import (
19
22
  DocItem,
23
+ DocumentOrigin,
20
24
  LevelNumber,
21
25
  ListItem,
22
26
  SectionHeaderItem,
@@ -25,16 +29,31 @@ from docling_core.types.doc.document import (
25
29
  )
26
30
  from docling_core.types.doc.labels import DocItemLabel
27
31
 
32
+ _VERSION: Final = "1.0.0"
33
+
34
+ _KEY_SCHEMA_NAME = "schema_name"
35
+ _KEY_VERSION = "version"
28
36
  _KEY_DOC_ITEMS = "doc_items"
29
37
  _KEY_HEADINGS = "headings"
30
38
  _KEY_CAPTIONS = "captions"
39
+ _KEY_ORIGIN = "origin"
31
40
 
32
41
  _logger = logging.getLogger(__name__)
33
42
 
34
43
 
35
44
  class DocMeta(BaseMeta):
36
- """Data model for Hierarchical Chunker metadata."""
45
+ """Data model for Hierarchical Chunker chunk metadata."""
37
46
 
47
+ schema_name: Literal["docling_core.transforms.chunker.DocMeta"] = Field(
48
+ default="docling_core.transforms.chunker.DocMeta",
49
+ alias=_KEY_SCHEMA_NAME,
50
+ )
51
+ version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
52
+ Field(
53
+ default=_VERSION,
54
+ alias=_KEY_VERSION,
55
+ )
56
+ )
38
57
  doc_items: list[DocItem] = Field(
39
58
  alias=_KEY_DOC_ITEMS,
40
59
  min_length=1,
@@ -49,9 +68,39 @@ class DocMeta(BaseMeta):
49
68
  alias=_KEY_CAPTIONS,
50
69
  min_length=1,
51
70
  )
71
+ origin: Optional[DocumentOrigin] = Field(
72
+ default=None,
73
+ alias=_KEY_ORIGIN,
74
+ )
52
75
 
53
- excluded_embed: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
54
- excluded_llm: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
76
+ excluded_embed: ClassVar[list[str]] = [
77
+ _KEY_SCHEMA_NAME,
78
+ _KEY_VERSION,
79
+ _KEY_DOC_ITEMS,
80
+ _KEY_ORIGIN,
81
+ ]
82
+ excluded_llm: ClassVar[list[str]] = [
83
+ _KEY_SCHEMA_NAME,
84
+ _KEY_VERSION,
85
+ _KEY_DOC_ITEMS,
86
+ _KEY_ORIGIN,
87
+ ]
88
+
89
+ @field_validator(_KEY_VERSION)
90
+ @classmethod
91
+ def check_version_is_compatible(cls, v: str) -> str:
92
+ """Check if this meta item version is compatible with current version."""
93
+ current_match = re.match(VERSION_PATTERN, _VERSION)
94
+ doc_match = re.match(VERSION_PATTERN, v)
95
+ if (
96
+ doc_match is None
97
+ or current_match is None
98
+ or doc_match["major"] != current_match["major"]
99
+ or doc_match["minor"] > current_match["minor"]
100
+ ):
101
+ raise ValueError(f"incompatible version {v} with schema version {_VERSION}")
102
+ else:
103
+ return _VERSION
55
104
 
56
105
 
57
106
  class DocChunk(BaseChunk):
@@ -129,6 +178,7 @@ class HierarchicalChunker(BaseChunker):
129
178
  for k in sorted(heading_by_level)
130
179
  ]
131
180
  or None,
181
+ origin=dl_doc.origin,
132
182
  ),
133
183
  )
134
184
  list_items = [] # reset
@@ -171,6 +221,7 @@ class HierarchicalChunker(BaseChunker):
171
221
  headings=[heading_by_level[k] for k in sorted(heading_by_level)]
172
222
  or None,
173
223
  captions=captions,
224
+ origin=dl_doc.origin,
174
225
  ),
175
226
  )
176
227
  yield c
@@ -182,5 +233,6 @@ class HierarchicalChunker(BaseChunker):
182
233
  doc_items=list_items,
183
234
  headings=[heading_by_level[k] for k in sorted(heading_by_level)]
184
235
  or None,
236
+ origin=dl_doc.origin,
185
237
  ),
186
238
  )
@@ -5,7 +5,7 @@
5
5
 
6
6
  """Package for models defined by the Document type."""
7
7
 
8
- from .base import BoundingBox, CoordOrigin, Size
8
+ from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
9
9
  from .document import (
10
10
  DocItem,
11
11
  DoclingDocument,
@@ -7,6 +7,13 @@ from typing import Tuple
7
7
  from pydantic import BaseModel
8
8
 
9
9
 
10
+ class ImageRefMode(str, Enum):
11
+ """ImageRefMode."""
12
+
13
+ PLACEHOLDER = "placeholder"
14
+ EMBEDDED = "embedded"
15
+
16
+
10
17
  class CoordOrigin(str, Enum):
11
18
  """CoordOrigin."""
12
19
 
@@ -3,6 +3,7 @@
3
3
  import base64
4
4
  import mimetypes
5
5
  import re
6
+ import sys
6
7
  import typing
7
8
  from io import BytesIO
8
9
  from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
@@ -25,6 +26,7 @@ from typing_extensions import Annotated, Self
25
26
  from docling_core.search.package import VERSION_PATTERN
26
27
  from docling_core.types.base import _JSON_POINTER_REGEX
27
28
  from docling_core.types.doc import BoundingBox, Size
29
+ from docling_core.types.doc.base import ImageRefMode
28
30
  from docling_core.types.doc.labels import DocItemLabel, GroupLabel
29
31
  from docling_core.types.legacy_doc.tokens import DocumentToken
30
32
 
@@ -215,6 +217,7 @@ class DocumentOrigin(BaseModel):
215
217
  "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
216
218
  "application/vnd.openxmlformats-officedocument.presentationml.presentation",
217
219
  "text/asciidoc",
220
+ "text/markdown",
218
221
  ]
219
222
 
220
223
  @field_validator("binary_hash", mode="before")
@@ -588,7 +591,13 @@ class TableItem(FloatingItem):
588
591
  for row in self.data.grid:
589
592
  tmp = []
590
593
  for col in row:
591
- tmp.append(col.text)
594
+
595
+ # make sure that md tables are not broken
596
+ # due to newline chars in the text
597
+ text = col.text
598
+ text = text.replace("\n", " ")
599
+ tmp.append(text)
600
+
592
601
  table.append(tmp)
593
602
 
594
603
  md_table = ""
@@ -1108,12 +1117,14 @@ class DoclingDocument(BaseModel):
1108
1117
 
1109
1118
  def export_to_markdown( # noqa: C901
1110
1119
  self,
1111
- delim: str = "\n\n",
1120
+ delim: str = "\n",
1112
1121
  from_element: int = 0,
1113
- to_element: Optional[int] = None,
1122
+ to_element: int = sys.maxsize,
1114
1123
  labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
1115
1124
  strict_text: bool = False,
1116
1125
  image_placeholder: str = "<!-- image -->",
1126
+ image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
1127
+ indent: int = 4,
1117
1128
  ) -> str:
1118
1129
  r"""Serialize to Markdown.
1119
1130
 
@@ -1143,136 +1154,150 @@ class DoclingDocument(BaseModel):
1143
1154
  :param strict_text: bool: (Default value = False)
1144
1155
  :param image_placeholder str: (Default value = "<!-- image -->")
1145
1156
  the placeholder to include to position images in the markdown.
1157
+ :param indent: int (default=4): indent of the nested lists
1146
1158
  :returns: The exported Markdown representation.
1147
1159
  :rtype: str
1148
1160
  """
1149
- has_title = False
1150
- prev_text = ""
1151
- md_texts: list[str] = []
1161
+ mdtexts: list[str] = []
1162
+ list_nesting_level = 0 # Track the current list nesting level
1163
+ previous_level = 0 # Track the previous item's level
1164
+ in_list = False # Track if we're currently processing list items
1152
1165
 
1153
- # collect all captions embedded in table and figure objects
1154
- # to avoid repeating them
1155
- embedded_captions = set()
1156
- skip_count = 0
1157
- for ix, (item, level) in enumerate(self.iterate_items(self.body)):
1158
- if skip_count < from_element:
1159
- skip_count += 1
1160
- continue # skip as many items as you want
1161
-
1162
- if to_element and ix >= to_element:
1163
- break
1166
+ for ix, (item, level) in enumerate(
1167
+ self.iterate_items(self.body, with_groups=True)
1168
+ ):
1169
+ # If we've moved to a lower level, we're exiting one or more groups
1170
+ if level < previous_level:
1171
+ # Calculate how many levels we've exited
1172
+ level_difference = previous_level - level
1173
+ # Decrement list_nesting_level for each list group we've exited
1174
+ list_nesting_level = max(0, list_nesting_level - level_difference)
1164
1175
 
1165
- if (
1166
- isinstance(item, (TableItem, PictureItem))
1167
- and len(item.captions) > 0
1168
- and item.label in labels
1169
- ):
1170
- caption = item.caption_text(self)
1171
- if caption:
1172
- embedded_captions.add(caption)
1176
+ previous_level = level # Update previous_level for next iteration
1173
1177
 
1174
- skip_count = 0
1175
- for ix, (item, level) in enumerate(self.iterate_items(self.body)):
1176
- if skip_count < from_element:
1177
- skip_count += 1
1178
+ if ix < from_element and to_element <= ix:
1178
1179
  continue # skip as many items as you want
1179
1180
 
1180
- if to_element and ix >= to_element:
1181
- break
1182
-
1183
- markdown_text = ""
1184
-
1185
- if isinstance(item, DocItem):
1186
- item_type = item.label
1187
-
1188
- if isinstance(item, TextItem) and item_type in labels:
1189
- text = item.text
1190
-
1191
- # skip captions of they are embedded in the actual
1192
- # floating object
1193
- if item_type == DocItemLabel.CAPTION and text in embedded_captions:
1194
- continue
1195
-
1196
- # ignore repeated text
1197
- if prev_text == text or text is None:
1198
- continue
1199
- else:
1200
- prev_text = text
1201
-
1202
- # first title match
1203
- if item_type == DocItemLabel.TITLE and not has_title:
1204
- if strict_text:
1205
- markdown_text = f"{text}"
1206
- else:
1207
- markdown_text = f"# {text}"
1208
- has_title = True
1209
-
1210
- # secondary titles
1211
- elif item_type in {
1212
- DocItemLabel.TITLE,
1213
- DocItemLabel.SECTION_HEADER,
1214
- } or (has_title and item_type == DocItemLabel.TITLE):
1215
- if strict_text:
1216
- markdown_text = f"{text}"
1217
- else:
1218
- markdown_text = f"## {text}"
1219
-
1220
- # secondary titles
1221
- elif isinstance(item, ListItem):
1222
- if item.enumerated:
1223
- marker = item.marker
1224
- else:
1225
- marker = "-"
1226
-
1227
- markdown_text = f"{marker} {text}"
1228
-
1229
- # normal text
1230
- else:
1231
- markdown_text = text
1232
-
1233
- elif isinstance(item, TableItem) and item.data and item_type in labels:
1234
- parts = []
1235
-
1236
- # Compute the caption
1237
- if caption := item.caption_text(self):
1238
- parts.append(caption)
1239
- parts.append("\n")
1181
+ # Handle newlines between different types of content
1182
+ if (
1183
+ len(mdtexts) > 0
1184
+ and not isinstance(item, (ListItem, GroupItem))
1185
+ and in_list
1186
+ ):
1187
+ mdtexts[-1] += "\n"
1188
+ in_list = False
1240
1189
 
1241
- # Rendered the item
1242
- if not strict_text:
1243
- md_table = item.export_to_markdown()
1244
- if md_table:
1245
- parts.append(item.export_to_markdown())
1190
+ if isinstance(item, GroupItem) and item.label in [
1191
+ GroupLabel.LIST,
1192
+ GroupLabel.ORDERED_LIST,
1193
+ ]:
1246
1194
 
1247
- # Combine parts
1248
- markdown_text = "\n".join(parts)
1195
+ if list_nesting_level == 0: # Check if we're on the top level.
1196
+ # In that case a new list starts directly after another list.
1197
+ mdtexts.append("\n") # Add a blank line
1249
1198
 
1250
- elif isinstance(item, PictureItem) and item_type in labels:
1251
- parts = []
1199
+ # Increment list nesting level when entering a new list
1200
+ list_nesting_level += 1
1201
+ in_list = True
1202
+ continue
1252
1203
 
1253
- # Compute the caption
1254
- if caption := item.caption_text(self):
1255
- parts.append(caption)
1256
- parts.append("\n")
1204
+ elif isinstance(item, GroupItem):
1205
+ continue
1257
1206
 
1258
- # Rendered the item
1259
- if not strict_text:
1260
- parts.append(f"{image_placeholder}")
1207
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
1208
+ in_list = False
1209
+ marker = "" if strict_text else "#"
1210
+ text = f"{marker} {item.text}\n"
1211
+ mdtexts.append(text.strip())
1212
+
1213
+ elif (
1214
+ isinstance(item, TextItem)
1215
+ and item.label in [DocItemLabel.SECTION_HEADER]
1216
+ ) or isinstance(item, SectionHeaderItem):
1217
+ in_list = False
1218
+ marker = ""
1219
+ if not strict_text:
1220
+ marker = "#" * level
1221
+ if len(marker) < 2:
1222
+ marker = "##"
1223
+ text = f"{marker} {item.text}\n"
1224
+ mdtexts.append(text.strip() + "\n")
1225
+
1226
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
1227
+ in_list = False
1228
+ text = f"```\n{item.text}\n```\n"
1229
+ mdtexts.append(text)
1230
+
1231
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
1232
+ # captions are printed in picture and table ... skipping for now
1233
+ continue
1261
1234
 
1262
- # Combine parts
1263
- markdown_text = "\n".join(parts)
1235
+ elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
1236
+ in_list = True
1237
+ # Calculate indent based on list_nesting_level
1238
+ # -1 because level 1 needs no indent
1239
+ list_indent = " " * (indent * (list_nesting_level - 1))
1240
+
1241
+ marker = ""
1242
+ if strict_text:
1243
+ marker = ""
1244
+ elif item.enumerated:
1245
+ marker = item.marker
1246
+ else:
1247
+ marker = "-" # Markdown needs only dash as item marker.
1248
+
1249
+ text = f"{list_indent}{marker} {item.text}"
1250
+ mdtexts.append(text)
1251
+
1252
+ elif isinstance(item, TextItem) and item.label in labels:
1253
+ in_list = False
1254
+ if len(item.text):
1255
+ text = f"{item.text}\n"
1256
+ mdtexts.append(text)
1257
+
1258
+ elif isinstance(item, TableItem) and not strict_text:
1259
+ in_list = False
1260
+ mdtexts.append(item.caption_text(self))
1261
+ md_table = item.export_to_markdown()
1262
+ mdtexts.append("\n" + md_table + "\n")
1263
+
1264
+ elif isinstance(item, PictureItem) and not strict_text:
1265
+ in_list = False
1266
+ mdtexts.append(item.caption_text(self))
1267
+
1268
+ if image_mode == ImageRefMode.PLACEHOLDER:
1269
+ mdtexts.append("\n" + image_placeholder + "\n")
1270
+ elif image_mode == ImageRefMode.EMBEDDED and isinstance(
1271
+ item.image, ImageRef
1272
+ ):
1273
+ text = f"![Local Image]({item.image.uri})\n"
1274
+ mdtexts.append(text)
1275
+ elif image_mode == ImageRefMode.EMBEDDED and not isinstance(
1276
+ item.image, ImageRef
1277
+ ):
1278
+ text = (
1279
+ "<!-- 🖼️❌ Image not available. "
1280
+ "Please use `PdfPipelineOptions(generate_picture_images=True)`"
1281
+ " --> "
1282
+ )
1283
+ mdtexts.append(text)
1264
1284
 
1265
- if markdown_text:
1266
- md_texts.append(markdown_text)
1285
+ elif isinstance(item, DocItem) and item.label in labels:
1286
+ in_list = False
1287
+ text = "<missing-text>"
1288
+ mdtexts.append(text)
1267
1289
 
1268
- result = delim.join(md_texts)
1269
- return result
1290
+ mdtext = (delim.join(mdtexts)).strip()
1291
+ mdtext = re.sub(
1292
+ r"\n\n\n+", "\n\n", mdtext
1293
+ ) # remove cases of double or more empty lines.
1294
+ return mdtext
1270
1295
 
1271
1296
  def export_to_text( # noqa: C901
1272
1297
  self,
1273
1298
  delim: str = "\n\n",
1274
1299
  from_element: int = 0,
1275
- to_element: Optional[int] = None,
1300
+ to_element: int = 1000000,
1276
1301
  labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
1277
1302
  ) -> str:
1278
1303
  """export_to_text."""
@@ -1399,6 +1424,121 @@ class DoclingDocument(BaseModel):
1399
1424
 
1400
1425
  return doctags
1401
1426
 
1427
+ def _export_to_indented_text(
1428
+ self, indent=" ", max_text_len: int = -1, explicit_tables: bool = False
1429
+ ):
1430
+ """Export the document to indented text to expose hierarchy."""
1431
+ result = []
1432
+
1433
+ def get_text(text: str, max_text_len: int):
1434
+
1435
+ middle = " ... "
1436
+
1437
+ if max_text_len == -1:
1438
+ return text
1439
+ elif len(text) < max_text_len + len(middle):
1440
+ return text
1441
+ else:
1442
+ tbeg = int((max_text_len - len(middle)) / 2)
1443
+ tend = int(max_text_len - tbeg)
1444
+
1445
+ return text[0:tbeg] + middle + text[-tend:]
1446
+
1447
+ for i, (item, level) in enumerate(self.iterate_items(with_groups=True)):
1448
+ if isinstance(item, GroupItem):
1449
+ result.append(
1450
+ indent * level
1451
+ + f"item-{i} at level {level}: {item.label}: group {item.name}"
1452
+ )
1453
+
1454
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
1455
+ text = get_text(text=item.text, max_text_len=max_text_len)
1456
+
1457
+ result.append(
1458
+ indent * level + f"item-{i} at level {level}: {item.label}: {text}"
1459
+ )
1460
+
1461
+ elif isinstance(item, SectionHeaderItem):
1462
+ text = get_text(text=item.text, max_text_len=max_text_len)
1463
+
1464
+ result.append(
1465
+ indent * level + f"item-{i} at level {level}: {item.label}: {text}"
1466
+ )
1467
+
1468
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
1469
+ text = get_text(text=item.text, max_text_len=max_text_len)
1470
+
1471
+ result.append(
1472
+ indent * level + f"item-{i} at level {level}: {item.label}: {text}"
1473
+ )
1474
+
1475
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
1476
+ # captions are printed in picture and table ... skipping for now
1477
+ continue
1478
+
1479
+ elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
1480
+ text = get_text(text=item.text, max_text_len=max_text_len)
1481
+
1482
+ result.append(
1483
+ indent * level + f"item-{i} at level {level}: {item.label}: {text}"
1484
+ )
1485
+
1486
+ elif isinstance(item, TextItem):
1487
+ text = get_text(text=item.text, max_text_len=max_text_len)
1488
+
1489
+ result.append(
1490
+ indent * level + f"item-{i} at level {level}: {item.label}: {text}"
1491
+ )
1492
+
1493
+ elif isinstance(item, TableItem):
1494
+
1495
+ result.append(
1496
+ indent * level
1497
+ + f"item-{i} at level {level}: {item.label} with "
1498
+ + f"[{item.data.num_rows}x{item.data.num_cols}]"
1499
+ )
1500
+
1501
+ for _ in item.captions:
1502
+ caption = _.resolve(self)
1503
+ result.append(
1504
+ indent * (level + 1)
1505
+ + f"item-{i} at level {level + 1}: {caption.label}: "
1506
+ + f"{caption.text}"
1507
+ )
1508
+
1509
+ if explicit_tables:
1510
+ grid: list[list[str]] = []
1511
+ for i, row in enumerate(item.data.grid):
1512
+ grid.append([])
1513
+ for j, cell in enumerate(row):
1514
+ if j < 10:
1515
+ text = get_text(text=cell.text, max_text_len=16)
1516
+ grid[-1].append(text)
1517
+
1518
+ result.append("\n" + tabulate(grid) + "\n")
1519
+
1520
+ elif isinstance(item, PictureItem):
1521
+
1522
+ result.append(
1523
+ indent * level + f"item-{i} at level {level}: {item.label}"
1524
+ )
1525
+
1526
+ for _ in item.captions:
1527
+ caption = _.resolve(self)
1528
+ result.append(
1529
+ indent * (level + 1)
1530
+ + f"item-{i} at level {level + 1}: {caption.label}: "
1531
+ + f"{caption.text}"
1532
+ )
1533
+
1534
+ elif isinstance(item, DocItem):
1535
+ result.append(
1536
+ indent * (level + 1)
1537
+ + f"item-{i} at level {level}: {item.label}: ignored"
1538
+ )
1539
+
1540
+ return "\n".join(result)
1541
+
1402
1542
  def add_page(
1403
1543
  self, page_no: int, size: Size, image: Optional[ImageRef] = None
1404
1544
  ) -> PageItem:
@@ -5,15 +5,18 @@
5
5
 
6
6
  """File-related utilities."""
7
7
 
8
+ import importlib
8
9
  import tempfile
9
10
  from pathlib import Path
10
- from typing import Union
11
+ from typing import Dict, Optional, Union
11
12
 
12
13
  import requests
13
14
  from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
14
15
 
15
16
 
16
- def resolve_file_source(source: Union[Path, AnyHttpUrl, str]) -> Path:
17
+ def resolve_file_source(
18
+ source: Union[Path, AnyHttpUrl, str], headers: Optional[Dict[str, str]] = None
19
+ ) -> Path:
17
20
  """Resolves the source (URL, path) of a file to a local file path.
18
21
 
19
22
  If a URL is provided, the content is first downloaded to a temporary local file.
@@ -29,7 +32,17 @@ def resolve_file_source(source: Union[Path, AnyHttpUrl, str]) -> Path:
29
32
  """
30
33
  try:
31
34
  http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
32
- res = requests.get(http_url, stream=True)
35
+
36
+ # make all header keys lower case
37
+ _headers = headers or {}
38
+ req_headers = {k.lower(): v for k, v in _headers.items()}
39
+ # add user-agent is not set
40
+ if "user-agent" not in req_headers:
41
+ agent_name = f"docling-core/{importlib.metadata.version('docling-core')}"
42
+ req_headers["user-agent"] = agent_name
43
+
44
+ # fetch the page
45
+ res = requests.get(http_url, stream=True, headers=req_headers)
33
46
  res.raise_for_status()
34
47
  fname = None
35
48
  # try to get filename from response header
@@ -41,7 +54,7 @@ def resolve_file_source(source: Union[Path, AnyHttpUrl, str]) -> Path:
41
54
  break
42
55
  # otherwise, use name from URL:
43
56
  if fname is None:
44
- fname = Path(http_url.path or "file").name
57
+ fname = Path(http_url.path or "").name or "file"
45
58
  local_path = Path(tempfile.mkdtemp()) / fname
46
59
  with open(local_path, "wb") as f:
47
60
  for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "2.0.1"
3
+ version = "2.2.0"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
File without changes