PyPI - docling - Versions diffs - 2.42.2__tar.gz → 2.43.0__tar.gz - Mend

docling 2.42.2__tar.gz → 2.43.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (135) hide show

{docling-2.42.2 → docling-2.43.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.42.2
+Version: 2.43.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT
@@ -28,9 +28,9 @@ License-File: LICENSE
 Requires-Dist: pydantic<3.0.0,>=2.0.0
 Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
 Requires-Dist: docling-parse<5.0.0,>=4.0.0
-Requires-Dist: docling-ibm-models<4,>=3.6.0
+Requires-Dist: docling-ibm-models<4,>=3.9.0
 Requires-Dist: filetype<2.0.0,>=1.2.0
-Requires-Dist: pypdfium2<5.0.0,>=4.30.0
+Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
 Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
 Requires-Dist: huggingface_hub<1,>=0.23
 Requires-Dist: requests<3.0.0,>=2.32.2

{docling-2.42.2 → docling-2.43.0}/docling/backend/html_backend.py RENAMED Viewed

@@ -1,6 +1,5 @@
 import logging
 import re
-import traceback
 from io import BytesIO
 from pathlib import Path
 from typing import Final, Optional, Union, cast
@@ -144,11 +143,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         )
         # reset context
         self.ctx = _Context()
-        try:
-            self._walk(content, doc)
-        except Exception:
-            print(traceback.format_exc())
+        self._walk(content, doc)
         return doc

{docling-2.42.2 → docling-2.43.0}/docling/backend/md_backend.py RENAMED Viewed

@@ -5,7 +5,7 @@ from copy import deepcopy
 from enum import Enum
 from io import BytesIO
 from pathlib import Path
-from typing import List, Literal, Optional, Set, Union
+from typing import Literal, Optional, Union, cast
 import marko
 import marko.element
@@ -14,6 +14,7 @@ from docling_core.types.doc import (
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
+    ListItem,
     NodeItem,
     TableCell,
     TableData,
@@ -89,7 +90,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
-        _log.debug("MD INIT!!!")
+        _log.debug("Starting MarkdownDocumentBackend...")
         # Markdown file:
         self.path_or_stream = path_or_stream
@@ -131,7 +132,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             for md_table_row in self.md_table_buffer:
                 _log.debug(md_table_row)
             _log.debug("=== TABLE END ===")
-            tcells: List[TableCell] = []
+            tcells: list[TableCell] = []
             result_table = []
             for n, md_table_row in enumerate(self.md_table_buffer):
                 data = []
@@ -232,11 +233,12 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         element: marko.element.Element,
         depth: int,
         doc: DoclingDocument,
-        visited: Set[marko.element.Element],
+        visited: set[marko.element.Element],
         creation_stack: list[
             _CreationPayload
         ],  # stack for lazy item creation triggered deep in marko's AST (on RawText)
         list_ordered_flag_by_ref: dict[str, bool],
+        list_last_item_by_ref: dict[str, ListItem],
         parent_item: Optional[NodeItem] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -279,7 +281,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         elif (
             isinstance(element, marko.block.ListItem)
-            and len(element.children) == 1
+            and len(element.children) > 0
             and isinstance((child := element.children[0]), marko.block.Paragraph)
             and len(child.children) > 0
         ):
@@ -291,7 +293,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 if parent_item
                 else False
             )
-            if len(child.children) > 1:  # inline group will be created further down
+            non_list_children: list[marko.element.Element] = [
+                item
+                for item in child.children
+                if not isinstance(item, marko.block.ListItem)
+            ]
+            if len(non_list_children) > 1:  # inline group will be created further down
+                parent_ref: Optional[str] = (
+                    parent_item.self_ref if parent_item else None
+                )
                 parent_item = self._create_list_item(
                     doc=doc,
                     parent_item=parent_item,
@@ -300,6 +310,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     formatting=formatting,
                     hyperlink=hyperlink,
                 )
+                if parent_ref:
+                    list_last_item_by_ref[parent_ref] = cast(ListItem, parent_item)
             else:
                 creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))
@@ -334,9 +346,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 element.dest
             )
-        elif isinstance(element, marko.inline.RawText):
-            _log.debug(f" - Paragraph (raw text): {element.children}")
-            snippet_text = element.children.strip()
+        elif isinstance(element, (marko.inline.RawText, marko.inline.Literal)):
+            _log.debug(f" - RawText/Literal: {element.children}")
+            snippet_text = (
+                element.children.strip() if isinstance(element.children, str) else ""
+            )
             # Detect start of the table:
             if "|" in snippet_text or self.in_table:
                 # most likely part of the markdown table
@@ -359,6 +373,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                                 if parent_item
                                 else False
                             )
+                            parent_ref = parent_item.self_ref if parent_item else None
                             parent_item = self._create_list_item(
                                 doc=doc,
                                 parent_item=parent_item,
@@ -367,6 +382,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                                 formatting=formatting,
                                 hyperlink=hyperlink,
                             )
+                            if parent_ref:
+                                list_last_item_by_ref[parent_ref] = cast(
+                                    ListItem, parent_item
+                                )
                         elif isinstance(to_create, _HeadingCreationPayload):
                             # not keeping as parent_item as logic for correctly tracking
                             # that not implemented yet (section components not captured
@@ -458,6 +478,17 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             element, processed_block_types
         ):
             for child in element.children:
+                if (
+                    isinstance(element, marko.block.ListItem)
+                    and isinstance(child, marko.block.List)
+                    and parent_item
+                    and list_last_item_by_ref.get(parent_item.self_ref, None)
+                ):
+                    _log.debug(
+                        f"walking into new List hanging from item of parent list {parent_item.self_ref}"
+                    )
+                    parent_item = list_last_item_by_ref[parent_item.self_ref]
                 self._iterate_elements(
                     element=child,
                     depth=depth + 1,
@@ -465,6 +496,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     visited=visited,
                     creation_stack=creation_stack,
                     list_ordered_flag_by_ref=list_ordered_flag_by_ref,
+                    list_last_item_by_ref=list_last_item_by_ref,
                     parent_item=parent_item,
                     formatting=formatting,
                     hyperlink=hyperlink,
@@ -483,7 +515,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         return False
     @classmethod
-    def supported_formats(cls) -> Set[InputFormat]:
+    def supported_formats(cls) -> set[InputFormat]:
         return {InputFormat.MD}
     def convert(self) -> DoclingDocument:
@@ -510,6 +542,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 visited=set(),
                 creation_stack=[],
                 list_ordered_flag_by_ref={},
+                list_last_item_by_ref={},
             )
             self._close_table(doc=doc)  # handle any last hanging table
@@ -534,7 +567,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 ]:
                     html_str = _restore_original_html(txt=html_str, regex=regex)
                 self._html_blocks = 0
                 # delegate to HTML backend
                 stream = BytesIO(bytes(html_str, encoding="utf-8"))
                 in_doc = InputDocument(

{docling-2.42.2 → docling-2.43.0}/docling/datamodel/pipeline_options.py RENAMED Viewed

@@ -332,3 +332,18 @@ class ProcessingPipeline(str, Enum):
     STANDARD = "standard"
     VLM = "vlm"
     ASR = "asr"
+class ThreadedPdfPipelineOptions(PdfPipelineOptions):
+    """Pipeline options for the threaded PDF pipeline with batching and backpressure control"""
+    # Batch sizes for different stages
+    ocr_batch_size: int = 4
+    layout_batch_size: int = 4
+    table_batch_size: int = 4
+    # Timing control
+    batch_timeout_seconds: float = 2.0
+    # Backpressure and queue control
+    queue_max_size: int = 100

{docling-2.42.2 → docling-2.43.0}/docling/datamodel/settings.py RENAMED Viewed

@@ -26,18 +26,13 @@ class DocumentLimits(BaseModel):
 class BatchConcurrencySettings(BaseModel):
-    doc_batch_size: int = 2
-    doc_batch_concurrency: int = 2
-    page_batch_size: int = 4
-    page_batch_concurrency: int = 2
-    elements_batch_size: int = 16
-    # doc_batch_size: int = 1
-    # doc_batch_concurrency: int = 1
-    # page_batch_size: int = 1
-    # page_batch_concurrency: int = 1
-    # model_concurrency: int = 2
+    doc_batch_size: int = 1  # Number of documents processed in one batch. Should be >= doc_batch_concurrency
+    doc_batch_concurrency: int = 1  # Number of parallel threads processing documents. Warning: Experimental! No benefit expected without free-threaded python.
+    page_batch_size: int = 4  # Number of pages processed in one batch.
+    page_batch_concurrency: int = 1  # Currently unused.
+    elements_batch_size: int = (
+        16  # Number of elements processed in one batch, in enrichment models.
+    )
     # To force models into single core: export OMP_NUM_THREADS=1

{docling-2.42.2 → docling-2.43.0}/docling/document_converter.py RENAMED Viewed

@@ -4,6 +4,7 @@ import sys
 import threading
 import time
 from collections.abc import Iterable, Iterator
+from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Type, Union
@@ -284,24 +285,33 @@ class DocumentConverter:
             settings.perf.doc_batch_size,  # pass format_options
         ):
             _log.info("Going to convert document batch...")
+            process_func = partial(
+                self._process_document, raises_on_error=raises_on_error
+            )
-            # parallel processing only within input_batch
-            # with ThreadPoolExecutor(
-            #    max_workers=settings.perf.doc_batch_concurrency
-            # ) as pool:
-            #   yield from pool.map(self.process_document, input_batch)
-            # Note: PDF backends are not thread-safe, thread pool usage was disabled.
-            for item in map(
-                partial(self._process_document, raises_on_error=raises_on_error),
-                input_batch,
+            if (
+                settings.perf.doc_batch_concurrency > 1
+                and settings.perf.doc_batch_size > 1
             ):
-                elapsed = time.monotonic() - start_time
-                start_time = time.monotonic()
-                _log.info(
-                    f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
-                )
-                yield item
+                with ThreadPoolExecutor(
+                    max_workers=settings.perf.doc_batch_concurrency
+                ) as pool:
+                    for item in pool.map(
+                        process_func,
+                        input_batch,
+                    ):
+                        yield item
+            else:
+                for item in map(
+                    process_func,
+                    input_batch,
+                ):
+                    elapsed = time.monotonic() - start_time
+                    start_time = time.monotonic()
+                    _log.info(
+                        f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
+                    )
+                    yield item
     def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
         """Retrieve or initialize a pipeline, reusing instances based on class and options."""
@@ -330,7 +340,7 @@ class DocumentConverter:
                     f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
                 )
-        return self.initialized_pipelines[cache_key]
+            return self.initialized_pipelines[cache_key]
     def _process_document(
         self, in_doc: InputDocument, raises_on_error: bool

{docling-2.42.2 → docling-2.43.0}/docling/models/layout_model.py RENAMED Viewed

@@ -3,7 +3,7 @@ import logging
 import warnings
 from collections.abc import Iterable
 from pathlib import Path
-from typing import Optional
+from typing import List, Optional, Union
 import numpy as np
 from docling_core.types.doc import DocItemLabel
@@ -148,72 +148,90 @@ class LayoutModel(BasePageModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-        for page in page_batch:
+        # Convert to list to allow multiple iterations
+        pages = list(page_batch)
+        # Separate valid and invalid pages
+        valid_pages = []
+        valid_page_images: List[Union[Image.Image, np.ndarray]] = []
+        for page in pages:
             assert page._backend is not None
             if not page._backend.is_valid():
-                yield page
-            else:
-                with TimeRecorder(conv_res, "layout"):
-                    assert page.size is not None
-                    page_image = page.get_image(scale=1.0)
-                    assert page_image is not None
-                    clusters = []
-                    for ix, pred_item in enumerate(
-                        self.layout_predictor.predict(page_image)
-                    ):
-                        label = DocItemLabel(
-                            pred_item["label"]
-                            .lower()
-                            .replace(" ", "_")
-                            .replace("-", "_")
-                        )  # Temporary, until docling-ibm-model uses docling-core types
-                        cluster = Cluster(
-                            id=ix,
-                            label=label,
-                            confidence=pred_item["confidence"],
-                            bbox=BoundingBox.model_validate(pred_item),
-                            cells=[],
-                        )
-                        clusters.append(cluster)
-                    if settings.debug.visualize_raw_layout:
-                        self.draw_clusters_and_cells_side_by_side(
-                            conv_res, page, clusters, mode_prefix="raw"
-                        )
-                    # Apply postprocessing
-                    processed_clusters, processed_cells = LayoutPostprocessor(
-                        page, clusters, self.options
-                    ).postprocess()
-                    # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
-                    with warnings.catch_warnings():
-                        warnings.filterwarnings(
-                            "ignore",
-                            "Mean of empty slice|invalid value encountered in scalar divide",
-                            RuntimeWarning,
-                            "numpy",
-                        )
-                        conv_res.confidence.pages[page.page_no].layout_score = float(
-                            np.mean([c.confidence for c in processed_clusters])
-                        )
-                        conv_res.confidence.pages[page.page_no].ocr_score = float(
-                            np.mean(
-                                [c.confidence for c in processed_cells if c.from_ocr]
-                            )
-                        )
-                    page.predictions.layout = LayoutPrediction(
-                        clusters=processed_clusters
-                    )
-                if settings.debug.visualize_layout:
-                    self.draw_clusters_and_cells_side_by_side(
-                        conv_res, page, processed_clusters, mode_prefix="postprocessed"
-                    )
+                continue
+            assert page.size is not None
+            page_image = page.get_image(scale=1.0)
+            assert page_image is not None
+            valid_pages.append(page)
+            valid_page_images.append(page_image)
+        # Process all valid pages with batch prediction
+        batch_predictions = []
+        if valid_page_images:
+            with TimeRecorder(conv_res, "layout"):
+                batch_predictions = self.layout_predictor.predict_batch(  # type: ignore[attr-defined]
+                    valid_page_images
+                )
+        # Process each page with its predictions
+        valid_page_idx = 0
+        for page in pages:
+            assert page._backend is not None
+            if not page._backend.is_valid():
                 yield page
+                continue
+            page_predictions = batch_predictions[valid_page_idx]
+            valid_page_idx += 1
+            clusters = []
+            for ix, pred_item in enumerate(page_predictions):
+                label = DocItemLabel(
+                    pred_item["label"].lower().replace(" ", "_").replace("-", "_")
+                )  # Temporary, until docling-ibm-model uses docling-core types
+                cluster = Cluster(
+                    id=ix,
+                    label=label,
+                    confidence=pred_item["confidence"],
+                    bbox=BoundingBox.model_validate(pred_item),
+                    cells=[],
+                )
+                clusters.append(cluster)
+            if settings.debug.visualize_raw_layout:
+                self.draw_clusters_and_cells_side_by_side(
+                    conv_res, page, clusters, mode_prefix="raw"
+                )
+            # Apply postprocessing
+            processed_clusters, processed_cells = LayoutPostprocessor(
+                page, clusters, self.options
+            ).postprocess()
+            # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
+            with warnings.catch_warnings():
+                warnings.filterwarnings(
+                    "ignore",
+                    "Mean of empty slice|invalid value encountered in scalar divide",
+                    RuntimeWarning,
+                    "numpy",
+                )
+                conv_res.confidence.pages[page.page_no].layout_score = float(
+                    np.mean([c.confidence for c in processed_clusters])
+                )
+                conv_res.confidence.pages[page.page_no].ocr_score = float(
+                    np.mean([c.confidence for c in processed_cells if c.from_ocr])
+                )
+            page.predictions.layout = LayoutPrediction(clusters=processed_clusters)
+            if settings.debug.visualize_layout:
+                self.draw_clusters_and_cells_side_by_side(
+                    conv_res, page, processed_clusters, mode_prefix="postprocessed"
+                )
+            yield page

docling 2.42.2__tar.gz → 2.43.0__tar.gz

docling 2.42.2__tar.gz → 2.43.0__tar.gz