PyPI - docling - Versions diffs - 2.9.0__tar.gz → 2.10.0__tar.gz - Mend

docling 2.9.0.tar.gz → 2.10.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52)

{docling-2.9.0 → docling-2.10.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.9.0
+Version: 2.10.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -25,10 +25,10 @@ Provides-Extra: rapidocr
 Provides-Extra: tesserocr
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
-Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
-Requires-Dist: docling-core[chunking] (>=2.8.0,<3.0.0)
+Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
+Requires-Dist: docling-core[chunking] (>=2.9.0,<3.0.0)
 Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
-Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
+Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)

{docling-2.9.0 → docling-2.10.0}/docling/backend/docling_parse_backend.py RENAMED Viewed

@@ -6,7 +6,7 @@ from typing import Iterable, List, Optional, Union
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
-from docling_parse.docling_parse import pdf_parser_v1
+from docling_parse.pdf_parsers import pdf_parser_v1
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage

{docling-2.9.0 → docling-2.10.0}/docling/backend/docling_parse_v2_backend.py RENAMED Viewed

@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_parse.docling_parse import pdf_parser_v2
+from docling_parse.pdf_parsers import pdf_parser_v2
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage
@@ -210,12 +210,14 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
         self.parser = pdf_parser_v2("fatal")
         success = False
-        if isinstance(path_or_stream, BytesIO):
+        if isinstance(self.path_or_stream, BytesIO):
             success = self.parser.load_document_from_bytesio(
-                self.document_hash, path_or_stream
+                self.document_hash, self.path_or_stream
+            )
+        elif isinstance(self.path_or_stream, Path):
+            success = self.parser.load_document(
+                self.document_hash, str(self.path_or_stream)
             )
-        elif isinstance(path_or_stream, Path):
-            success = self.parser.load_document(self.document_hash, str(path_or_stream))
         if not success:
             raise RuntimeError(

{docling-2.9.0 → docling-2.10.0}/docling/cli/main.py RENAMED Viewed

@@ -208,7 +208,7 @@ def convert(
     ] = None,
     pdf_backend: Annotated[
         PdfBackend, typer.Option(..., help="The PDF backend to use.")
-    ] = PdfBackend.DLPARSE_V1,
+    ] = PdfBackend.DLPARSE_V2,
     table_mode: Annotated[
         TableFormerMode,
         typer.Option(..., help="The mode to use in the table structure model."),
@@ -372,11 +372,13 @@ def convert(
         else:
             raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
+        pdf_format_option = PdfFormatOption(
+            pipeline_options=pipeline_options,
+            backend=backend,  # pdf_backend
+        )
         format_options: Dict[InputFormat, FormatOption] = {
-            InputFormat.PDF: PdfFormatOption(
-                pipeline_options=pipeline_options,
-                backend=backend,  # pdf_backend
-            )
+            InputFormat.PDF: pdf_format_option,
+            InputFormat.IMAGE: pdf_format_option,
         }
         doc_converter = DocumentConverter(
             allowed_formats=from_formats,

docling-2.10.0/docling/datamodel/document.py ADDED Viewed

@@ -0,0 +1,309 @@
+import logging
+import re
+from enum import Enum
+from io import BytesIO
+from pathlib import Path, PurePath
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
+import filetype
+from docling_core.types.doc import (
+    DocItem,
+    DocItemLabel,
+    DoclingDocument,
+    PictureItem,
+    SectionHeaderItem,
+    TableItem,
+    TextItem,
+)
+from docling_core.types.doc.document import ListItem
+from docling_core.types.legacy_doc.base import (
+    BaseText,
+    Figure,
+    GlmTableCell,
+    PageDimensions,
+    PageReference,
+    Prov,
+    Ref,
+)
+from docling_core.types.legacy_doc.base import Table as DsSchemaTable
+from docling_core.types.legacy_doc.base import TableCell
+from docling_core.types.legacy_doc.document import (
+    CCSDocumentDescription as DsDocumentDescription,
+)
+from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
+from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
+from docling_core.utils.file import resolve_source_to_stream
+from docling_core.utils.legacy import docling_document_to_legacy
+from pydantic import BaseModel
+from typing_extensions import deprecated
+from docling.backend.abstract_backend import (
+    AbstractDocumentBackend,
+    PaginatedDocumentBackend,
+)
+from docling.datamodel.base_models import (
+    AssembledUnit,
+    ConversionStatus,
+    DocumentStream,
+    ErrorItem,
+    FormatToExtensions,
+    FormatToMimeType,
+    InputFormat,
+    MimeTypeToFormat,
+    Page,
+)
+from docling.datamodel.settings import DocumentLimits
+from docling.utils.profiling import ProfilingItem
+from docling.utils.utils import create_file_hash, create_hash
+if TYPE_CHECKING:
+    from docling.document_converter import FormatOption
+_log = logging.getLogger(__name__)
+layout_label_to_ds_type = {
+    DocItemLabel.TITLE: "title",
+    DocItemLabel.DOCUMENT_INDEX: "table-of-contents",
+    DocItemLabel.SECTION_HEADER: "subtitle-level-1",
+    DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
+    DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
+    DocItemLabel.CAPTION: "caption",
+    DocItemLabel.PAGE_HEADER: "page-header",
+    DocItemLabel.PAGE_FOOTER: "page-footer",
+    DocItemLabel.FOOTNOTE: "footnote",
+    DocItemLabel.TABLE: "table",
+    DocItemLabel.FORMULA: "equation",
+    DocItemLabel.LIST_ITEM: "paragraph",
+    DocItemLabel.CODE: "paragraph",
+    DocItemLabel.PICTURE: "figure",
+    DocItemLabel.TEXT: "paragraph",
+    DocItemLabel.PARAGRAPH: "paragraph",
+}
+_EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
+class InputDocument(BaseModel):
+    file: PurePath
+    document_hash: str  # = None
+    valid: bool = True
+    limits: DocumentLimits = DocumentLimits()
+    format: InputFormat  # = None
+    filesize: Optional[int] = None
+    page_count: int = 0
+    _backend: AbstractDocumentBackend  # Internal PDF backend used
+    def __init__(
+        self,
+        path_or_stream: Union[BytesIO, Path],
+        format: InputFormat,
+        backend: Type[AbstractDocumentBackend],
+        filename: Optional[str] = None,
+        limits: Optional[DocumentLimits] = None,
+    ):
+        super().__init__(
+            file="", document_hash="", format=InputFormat.PDF
+        )  # initialize with dummy values
+        self.limits = limits or DocumentLimits()
+        self.format = format
+        try:
+            if isinstance(path_or_stream, Path):
+                self.file = path_or_stream
+                self.filesize = path_or_stream.stat().st_size
+                if self.filesize > self.limits.max_file_size:
+                    self.valid = False
+                else:
+                    self.document_hash = create_file_hash(path_or_stream)
+                    self._init_doc(backend, path_or_stream)
+            elif isinstance(path_or_stream, BytesIO):
+                assert (
+                    filename is not None
+                ), "Can't construct InputDocument from stream without providing filename arg."
+                self.file = PurePath(filename)
+                self.filesize = path_or_stream.getbuffer().nbytes
+                if self.filesize > self.limits.max_file_size:
+                    self.valid = False
+                else:
+                    self.document_hash = create_file_hash(path_or_stream)
+                    self._init_doc(backend, path_or_stream)
+            else:
+                raise RuntimeError(
+                    f"Unexpected type path_or_stream: {type(path_or_stream)}"
+                )
+            # For paginated backends, check if the maximum page count is exceeded.
+            if self.valid and self._backend.is_valid():
+                if self._backend.supports_pagination() and isinstance(
+                    self._backend, PaginatedDocumentBackend
+                ):
+                    self.page_count = self._backend.page_count()
+                    if not self.page_count <= self.limits.max_num_pages:
+                        self.valid = False
+        except (FileNotFoundError, OSError) as e:
+            self.valid = False
+            _log.exception(
+                f"File {self.file.name} not found or cannot be opened.", exc_info=e
+            )
+            # raise
+        except RuntimeError as e:
+            self.valid = False
+            _log.exception(
+                f"An unexpected error occurred while opening the document {self.file.name}",
+                exc_info=e,
+            )
+            # raise
+    def _init_doc(
+        self,
+        backend: Type[AbstractDocumentBackend],
+        path_or_stream: Union[BytesIO, Path],
+    ) -> None:
+        self._backend = backend(self, path_or_stream=path_or_stream)
+        if not self._backend.is_valid():
+            self.valid = False
+class DocumentFormat(str, Enum):
+    V2 = "v2"
+    V1 = "v1"
+class ConversionResult(BaseModel):
+    input: InputDocument
+    status: ConversionStatus = ConversionStatus.PENDING  # failure, success
+    errors: List[ErrorItem] = []  # structure to keep errors
+    pages: List[Page] = []
+    assembled: AssembledUnit = AssembledUnit()
+    timings: Dict[str, ProfilingItem] = {}
+    document: DoclingDocument = _EMPTY_DOCLING_DOC
+    @property
+    @deprecated("Use document instead.")
+    def legacy_document(self):
+        return docling_document_to_legacy(self.document)
+class _DummyBackend(AbstractDocumentBackend):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+    def is_valid(self) -> bool:
+        return False
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        return set()
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        return False
+    def unload(self):
+        return super().unload()
+class _DocumentConversionInput(BaseModel):
+    path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
+    limits: Optional[DocumentLimits] = DocumentLimits()
+    def docs(
+        self, format_options: Dict[InputFormat, "FormatOption"]
+    ) -> Iterable[InputDocument]:
+        for item in self.path_or_stream_iterator:
+            obj = resolve_source_to_stream(item) if isinstance(item, str) else item
+            format = self._guess_format(obj)
+            backend: Type[AbstractDocumentBackend]
+            if format not in format_options.keys():
+                _log.error(
+                    f"Input document {obj.name} does not match any allowed format."
+                )
+                backend = _DummyBackend
+            else:
+                backend = format_options[format].backend
+            if isinstance(obj, Path):
+                yield InputDocument(
+                    path_or_stream=obj,
+                    format=format,
+                    filename=obj.name,
+                    limits=self.limits,
+                    backend=backend,
+                )
+            elif isinstance(obj, DocumentStream):
+                yield InputDocument(
+                    path_or_stream=obj.stream,
+                    format=format,
+                    filename=obj.name,
+                    limits=self.limits,
+                    backend=backend,
+                )
+            else:
+                raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
+    def _guess_format(self, obj: Union[Path, DocumentStream]):
+        content = b""  # empty binary blob
+        format = None
+        if isinstance(obj, Path):
+            mime = filetype.guess_mime(str(obj))
+            if mime is None:
+                ext = obj.suffix[1:]
+                mime = self._mime_from_extension(ext)
+            if mime is None:  # must guess from
+                with obj.open("rb") as f:
+                    content = f.read(1024)  # Read first 1KB
+        elif isinstance(obj, DocumentStream):
+            content = obj.stream.read(8192)
+            obj.stream.seek(0)
+            mime = filetype.guess_mime(content)
+            if mime is None:
+                ext = (
+                    obj.name.rsplit(".", 1)[-1]
+                    if ("." in obj.name and not obj.name.startswith("."))
+                    else ""
+                )
+                mime = self._mime_from_extension(ext)
+        mime = mime or self._detect_html_xhtml(content)
+        mime = mime or "text/plain"
+        format = MimeTypeToFormat.get(mime)
+        return format
+    def _mime_from_extension(self, ext):
+        mime = None
+        if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
+            mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
+        elif ext in FormatToExtensions[InputFormat.HTML]:
+            mime = FormatToMimeType[InputFormat.HTML][0]
+        elif ext in FormatToExtensions[InputFormat.MD]:
+            mime = FormatToMimeType[InputFormat.MD][0]
+        return mime
+    def _detect_html_xhtml(self, content):
+        content_str = content.decode("ascii", errors="ignore").lower()
+        # Remove XML comments
+        content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
+        content_str = content_str.lstrip()
+        if re.match(r"<\?xml", content_str):
+            if "xhtml" in content_str[:1000]:
+                return "application/xhtml+xml"
+        if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
+            return "text/html"
+        return None

{docling-2.9.0 → docling-2.10.0}/docling/datamodel/pipeline_options.py RENAMED Viewed

@@ -143,7 +143,11 @@ class PdfPipelineOptions(PipelineOptions):
     table_structure_options: TableStructureOptions = TableStructureOptions()
     ocr_options: Union[
-        EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions
+        EasyOcrOptions,
+        TesseractCliOcrOptions,
+        TesseractOcrOptions,
+        OcrMacOptions,
+        RapidOcrOptions,
     ] = Field(EasyOcrOptions(), discriminator="kind")
     images_scale: float = 1.0

{docling-2.9.0 → docling-2.10.0}/docling/document_converter.py RENAMED Viewed

@@ -9,7 +9,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.asciidoc_backend import AsciiDocBackend
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
@@ -84,12 +84,12 @@ class HTMLFormatOption(FormatOption):
 class PdfFormatOption(FormatOption):
     pipeline_cls: Type = StandardPdfPipeline
-    backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
+    backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
 class ImageFormatOption(FormatOption):
     pipeline_cls: Type = StandardPdfPipeline
-    backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
+    backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
 def _get_default_option(format: InputFormat) -> FormatOption:
@@ -113,10 +113,10 @@ def _get_default_option(format: InputFormat) -> FormatOption:
             pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
         ),
         InputFormat.IMAGE: FormatOption(
-            pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+            pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
         ),
         InputFormat.PDF: FormatOption(
-            pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+            pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
         ),
     }
     if (options := format_to_default_options.get(format)) is not None:

{docling-2.9.0 → docling-2.10.0}/docling/models/ds_glm_model.py RENAMED Viewed

@@ -4,7 +4,6 @@ from pathlib import Path
 from typing import List, Union
 from deepsearch_glm.nlp_utils import init_nlp_model
-from deepsearch_glm.utils.doc_utils import to_docling_document
 from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
 from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
 from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
@@ -29,6 +28,7 @@ from pydantic import BaseModel, ConfigDict
 from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
 from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
 from docling.datamodel.settings import settings
+from docling.utils.glm_utils import to_docling_document
 from docling.utils.profiling import ProfilingScope, TimeRecorder
 from docling.utils.utils import create_hash
@@ -232,7 +232,7 @@ class GlmModel:
     def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
         with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
             ds_doc = self._to_legacy_document(conv_res)
-            ds_doc_dict = ds_doc.model_dump(by_alias=True)
+            ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True)
             glm_doc = self.model.apply_on_doc(ds_doc_dict)

{docling-2.9.0 → docling-2.10.0}/docling/pipeline/standard_pdf_pipeline.py RENAMED Viewed

@@ -97,7 +97,9 @@ class StandardPdfPipeline(PaginatedPipeline):
         local_dir: Optional[Path] = None, force: bool = False
     ) -> Path:
         from huggingface_hub import snapshot_download
+        from huggingface_hub.utils import disable_progress_bars
+        disable_progress_bars()
         download_path = snapshot_download(
             repo_id="ds4sd/docling-models",
             force_download=force,

docling 2.9.0__tar.gz → 2.10.0__tar.gz

docling 2.9.0.tar.gz → 2.10.0.tar.gz