PyPI - docling - Versions diffs - 2.48.0__tar.gz → 2.50.0__tar.gz - Mend

docling 2.48.0__tar.gz → 2.50.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (144)

{docling-2.48.0 → docling-2.50.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.48.0
+Version: 2.50.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT
@@ -28,7 +28,7 @@ License-File: LICENSE
 Requires-Dist: pydantic<3.0.0,>=2.0.0
 Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
 Requires-Dist: docling-parse<5.0.0,>=4.2.2
-Requires-Dist: docling-ibm-models<4,>=3.9.0
+Requires-Dist: docling-ibm-models<4,>=3.9.1
 Requires-Dist: filetype<2.0.0,>=1.2.0
 Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
 Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
@@ -51,6 +51,7 @@ Requires-Dist: pluggy<2.0.0,>=1.0.0
 Requires-Dist: pylatexenc<3.0,>=2.10
 Requires-Dist: scipy<2.0.0,>=1.6.0
 Requires-Dist: accelerate<2,>=1.0.0
+Requires-Dist: polyfactory>=2.22.2
 Provides-Extra: tesserocr
 Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
 Provides-Extra: ocrmac
@@ -60,6 +61,7 @@ Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
 Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
 Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
 Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and sys_platform == "linux" and platform_machine == "x86_64") and extra == "vlm"
+Requires-Dist: qwen-vl-utils>=0.0.11; extra == "vlm"
 Provides-Extra: rapidocr
 Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14" and extra == "rapidocr"
 Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"

{docling-2.48.0 → docling-2.50.0}/docling/backend/html_backend.py RENAMED Viewed

@@ -467,13 +467,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
     @contextmanager
     def _use_hyperlink(self, tag: Tag):
+        old_hyperlink: Union[AnyUrl, Path, None] = None
+        new_hyperlink: Union[AnyUrl, Path, None] = None
         this_href = tag.get("href")
         if this_href is None:
             yield None
         else:
             if isinstance(this_href, str) and this_href:
-                old_hyperlink: Union[AnyUrl, Path, None] = self.hyperlink
-                new_hyperlink: Union[AnyUrl, Path, None] = None
+                old_hyperlink = self.hyperlink
                 if self.original_url is not None:
                     this_href = urljoin(str(self.original_url), str(this_href))
                 # ugly fix for relative links since pydantic does not support them.

{docling-2.48.0 → docling-2.50.0}/docling/backend/msexcel_backend.py RENAMED Viewed

@@ -1,10 +1,11 @@
 import logging
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Union, cast
+from typing import Any, Optional, Union, cast
 from docling_core.types.doc import (
     BoundingBox,
+    ContentLayer,
     CoordOrigin,
     DocItem,
     DoclingDocument,
@@ -197,6 +198,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
                     parent=None,
                     label=GroupLabel.SECTION,
                     name=f"sheet: {sheet_name}",
+                    content_layer=self._get_sheet_content_layer(sheet),
                 )
                 doc = self._convert_sheet(doc, sheet)
                 width, height = self._find_page_size(doc, page_no)
@@ -237,6 +239,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
         """
         if self.workbook is not None:
+            content_layer = self._get_sheet_content_layer(sheet)
             tables = self._find_data_tables(sheet)
             for excel_table in tables:
@@ -282,6 +285,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
                             origin=CoordOrigin.TOPLEFT,
                         ),
                     ),
+                    content_layer=content_layer,
                 )
         return doc
@@ -486,6 +490,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
             The updated DoclingDocument.
         """
         if self.workbook is not None:
+            content_layer = self._get_sheet_content_layer(sheet)
             # Iterate over byte images in the sheet
             for item in sheet._images:  # type: ignore[attr-defined]
                 try:
@@ -511,6 +516,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
                                 anchor, origin=CoordOrigin.TOPLEFT
                             ),
                         ),
+                        content_layer=content_layer,
                     )
                 except Exception:
                     _log.error("could not extract the image from excel sheets")
@@ -536,3 +542,11 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
                 bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b
         return (right - left, bottom - top)
+    @staticmethod
+    def _get_sheet_content_layer(sheet: Worksheet) -> Optional[ContentLayer]:
+        return (
+            None
+            if sheet.sheet_state == Worksheet.SHEETSTATE_VISIBLE
+            else ContentLayer.INVISIBLE
+        )

{docling-2.48.0 → docling-2.50.0}/docling/backend/pypdfium2_backend.py RENAMED Viewed

@@ -254,16 +254,38 @@ class PyPdfiumPageBackend(PdfPageBackend):
     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
         AREA_THRESHOLD = 0  # 32 * 32
         page_size = self.get_size()
+        rotation = self._ppage.get_rotation()
         with pypdfium2_lock:
             for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
                 pos = obj.get_pos()
+                if rotation == 90:
+                    pos = (
+                        pos[1],
+                        page_size.height - pos[2],
+                        pos[3],
+                        page_size.height - pos[0],
+                    )
+                elif rotation == 180:
+                    pos = (
+                        page_size.width - pos[2],
+                        page_size.height - pos[3],
+                        page_size.width - pos[0],
+                        page_size.height - pos[1],
+                    )
+                elif rotation == 270:
+                    pos = (
+                        page_size.width - pos[3],
+                        pos[0],
+                        page_size.width - pos[1],
+                        pos[2],
+                    )
                 cropbox = BoundingBox.from_tuple(
                     pos, origin=CoordOrigin.BOTTOMLEFT
                 ).to_top_left_origin(page_height=page_size.height)
                 if cropbox.area() > AREA_THRESHOLD:
                     cropbox = cropbox.scaled(scale=scale)
                     yield cropbox
     def get_text_in_rect(self, bbox: BoundingBox) -> str:

{docling-2.48.0 → docling-2.50.0}/docling/datamodel/base_models.py RENAMED Viewed

@@ -1,7 +1,7 @@
 import math
 from collections import defaultdict
 from enum import Enum
-from typing import TYPE_CHECKING, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union
 import numpy as np
 from docling_core.types.doc import (
@@ -32,6 +32,18 @@ from pydantic import (
 if TYPE_CHECKING:
     from docling.backend.pdf_backend import PdfPageBackend
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.datamodel.pipeline_options import PipelineOptions
+class BaseFormatOption(BaseModel):
+    """Base class for format options used by _DocumentConversionInput."""
+    pipeline_options: Optional[PipelineOptions] = None
+    backend: Type[AbstractDocumentBackend]
+    model_config = ConfigDict(arbitrary_types_allowed=True)
 class ConversionStatus(str, Enum):
     PENDING = "pending"

{docling-2.48.0 → docling-2.50.0}/docling/datamodel/document.py RENAMED Viewed

@@ -2,12 +2,13 @@ import csv
 import logging
 import re
 import tarfile
-from collections.abc import Iterable
+from collections.abc import Iterable, Mapping
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
 from typing import (
     TYPE_CHECKING,
+    Any,
     Dict,
     List,
     Literal,
@@ -72,7 +73,7 @@ from docling.utils.profiling import ProfilingItem
 from docling.utils.utils import create_file_hash
 if TYPE_CHECKING:
-    from docling.document_converter import FormatOption
+    from docling.datamodel.base_models import BaseFormatOption
 _log = logging.getLogger(__name__)
@@ -238,7 +239,8 @@ class _DocumentConversionInput(BaseModel):
     limits: Optional[DocumentLimits] = DocumentLimits()
     def docs(
-        self, format_options: Dict[InputFormat, "FormatOption"]
+        self,
+        format_options: Mapping[InputFormat, "BaseFormatOption"],
     ) -> Iterable[InputDocument]:
         for item in self.path_or_stream_iterator:
             obj = (

docling-2.50.0/docling/datamodel/extraction.py ADDED Viewed

@@ -0,0 +1,39 @@
+"""Data models for document extraction functionality."""
+from typing import Any, Dict, List, Optional, Type, Union
+from pydantic import BaseModel, Field
+from docling.datamodel.base_models import ConversionStatus, ErrorItem
+from docling.datamodel.document import InputDocument
+class ExtractedPageData(BaseModel):
+    """Data model for extracted content from a single page."""
+    page_no: int = Field(..., description="1-indexed page number")
+    extracted_data: Optional[Dict[str, Any]] = Field(
+        None, description="Extracted structured data from the page"
+    )
+    raw_text: Optional[str] = Field(None, description="Raw extracted text")
+    errors: List[str] = Field(
+        default_factory=list,
+        description="Any errors encountered during extraction for this page",
+    )
+class ExtractionResult(BaseModel):
+    """Result of document extraction."""
+    input: InputDocument
+    status: ConversionStatus = ConversionStatus.PENDING
+    errors: List[ErrorItem] = []
+    # Pages field - always a list for consistency
+    pages: List[ExtractedPageData] = Field(
+        default_factory=list, description="Extracted data from each page"
+    )
+# Type alias for template parameters that can be string, dict, or BaseModel
+ExtractionTemplateType = Union[str, Dict[str, Any], BaseModel, Type[BaseModel]]

{docling-2.48.0 → docling-2.50.0}/docling/datamodel/pipeline_options.py RENAMED Viewed

@@ -37,6 +37,7 @@ from docling.datamodel.pipeline_options_vlm_model import (
 from docling.datamodel.vlm_model_specs import (
     GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
     GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
+    NU_EXTRACT_2B_TRANSFORMERS,
     SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
     SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
     VlmModelType,
@@ -113,6 +114,7 @@ class RapidOcrOptions(OcrOptions):
     cls_model_path: Optional[str] = None  # same default as rapidocr
     rec_model_path: Optional[str] = None  # same default as rapidocr
     rec_keys_path: Optional[str] = None  # same default as rapidocr
+    rec_font_path: Optional[str] = None  # same default as rapidocr
     model_config = ConfigDict(
         extra="forbid",
@@ -246,12 +248,9 @@ class OcrEngine(str, Enum):
     RAPIDOCR = "rapidocr"
-class PipelineOptions(BaseModel):
+class PipelineOptions(BaseOptions):
     """Base pipeline options."""
-    create_legacy_output: bool = (
-        True  # This default will be set to False on a future version of docling
-    )
     document_timeout: Optional[float] = None
     accelerator_options: AcceleratorOptions = AcceleratorOptions()
     enable_remote_services: bool = False
@@ -284,10 +283,10 @@ class LayoutOptions(BaseModel):
     keep_empty_clusters: bool = (
         False  # Whether to keep clusters that contain no text cells
     )
+    model_spec: LayoutModelConfig = DOCLING_LAYOUT_HERON
     skip_cell_assignment: bool = (
         False  # Skip cell-to-cluster assignment for VLM-only processing
     )
-    model_spec: LayoutModelConfig = DOCLING_LAYOUT_V2
 class AsrPipelineOptions(PipelineOptions):
@@ -295,6 +294,13 @@ class AsrPipelineOptions(PipelineOptions):
     artifacts_path: Optional[Union[Path, str]] = None
+class VlmExtractionPipelineOptions(PipelineOptions):
+    """Options for extraction pipeline."""
+    artifacts_path: Optional[Union[Path, str]] = None
+    vlm_options: Union[InlineVlmOptions] = NU_EXTRACT_2B_TRANSFORMERS
 class PdfPipelineOptions(PaginatedPipelineOptions):
     """Options for the PDF pipeline."""

{docling-2.48.0 → docling-2.50.0}/docling/datamodel/vlm_model_specs.py RENAMED Viewed

@@ -247,6 +247,23 @@ DOLPHIN_TRANSFORMERS = InlineVlmOptions(
     temperature=0.0,
 )
+# NuExtract
+NU_EXTRACT_2B_TRANSFORMERS = InlineVlmOptions(
+    repo_id="numind/NuExtract-2.0-2B",
+    prompt="",  # This won't be used, template is passed separately
+    torch_dtype="bfloat16",
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+    response_format=ResponseFormat.PLAINTEXT,
+    supported_devices=[
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.MPS,
+    ],
+    scale=2.0,
+    temperature=0.0,
+)
 class VlmModelType(str, Enum):
     SMOLDOCLING = "smoldocling"

{docling-2.48.0 → docling-2.50.0}/docling/document_converter.py RENAMED Viewed

@@ -28,6 +28,7 @@ from docling.backend.noop_backend import NoOpBackend
 from docling.backend.xml.jats_backend import JatsDocumentBackend
 from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
 from docling.datamodel.base_models import (
+    BaseFormatOption,
     ConversionStatus,
     DoclingComponentType,
     DocumentStream,
@@ -57,12 +58,8 @@ _log = logging.getLogger(__name__)
 _PIPELINE_CACHE_LOCK = threading.Lock()
-class FormatOption(BaseModel):
+class FormatOption(BaseFormatOption):
     pipeline_cls: Type[BasePipeline]
-    pipeline_options: Optional[PipelineOptions] = None
-    backend: Type[AbstractDocumentBackend]
-    model_config = ConfigDict(arbitrary_types_allowed=True)
     @model_validator(mode="after")
     def set_optional_field_default(self) -> "FormatOption":
@@ -191,7 +188,7 @@ class DocumentConverter:
         self.allowed_formats = (
             allowed_formats if allowed_formats is not None else list(InputFormat)
         )
-        self.format_to_options = {
+        self.format_to_options: Dict[InputFormat, FormatOption] = {
             format: (
                 _get_default_option(format=format)
                 if (custom_option := (format_options or {}).get(format)) is None

docling 2.48.0__tar.gz → 2.50.0__tar.gz

docling 2.48.0__tar.gz → 2.50.0__tar.gz