docling 2.42.1__py3-none-any.whl → 2.43.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.

docling/backend/html_backend.py
@@ -1,11 +1,10 @@
  import logging
  import re
- import traceback
  from io import BytesIO
  from pathlib import Path
  from typing import Final, Optional, Union, cast

- from bs4 import BeautifulSoup, NavigableString, Tag
+ from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
  from bs4.element import PreformattedString
  from docling_core.types.doc import (
      DocItem,
@@ -144,11 +143,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
              )
              # reset context
              self.ctx = _Context()
-
-             try:
-                 self._walk(content, doc)
-             except Exception:
-                 print(traceback.format_exc())
+             self._walk(content, doc)

          return doc

@@ -297,7 +292,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
              ):
                  parts.append(child)
              elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
-                 text_part = child.get_text()
+                 text_part = HTMLDocumentBackend.get_text(child)
                  if text_part:
                      parts.append(text_part)
          li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
@@ -417,6 +412,36 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                  content_layer=self.content_layer,
              )

+     @staticmethod
+     def get_text(item: PageElement) -> str:
+         """Concatenate all child strings of a PageElement.
+
+         This method is equivalent to `PageElement.get_text()` but also considers
+         certain tags. When called on <p> or <li> tags, it returns the text with a
+         trailing space, otherwise the text is concatenated without separators.
+         """
+
+         def _extract_text_recursively(item: PageElement) -> list[str]:
+             """Recursively extract text from all child nodes."""
+             result: list[str] = []
+
+             if isinstance(item, NavigableString):
+                 result = [item]
+             elif isinstance(item, Tag):
+                 tag = cast(Tag, item)
+                 parts: list[str] = []
+                 for child in tag:
+                     parts.extend(_extract_text_recursively(child))
+                 result.append(
+                     "".join(parts) + " " if tag.name in {"p", "li"} else "".join(parts)
+                 )
+
+             return result
+
+         parts: list[str] = _extract_text_recursively(item)
+
+         return "".join(parts)
+
      @staticmethod
      def _get_cell_spans(cell: Tag) -> tuple[int, int]:
          """Extract colspan and rowspan values from a table cell tag.
@@ -510,9 +535,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                  formula.replace_with(NavigableString(math_formula))

              # TODO: extract content correctly from table-cells with lists
-             text = html_cell.text
-
-             # label = html_cell.name
+             text = HTMLDocumentBackend.get_text(html_cell).strip()
              col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
              if row_header:
                  row_span -= 1

docling/backend/md_backend.py
@@ -5,7 +5,7 @@ from copy import deepcopy
  from enum import Enum
  from io import BytesIO
  from pathlib import Path
- from typing import List, Literal, Optional, Set, Union
+ from typing import Literal, Optional, Union, cast

  import marko
  import marko.element
@@ -14,6 +14,7 @@ from docling_core.types.doc import (
      DocItemLabel,
      DoclingDocument,
      DocumentOrigin,
+     ListItem,
      NodeItem,
      TableCell,
      TableData,
@@ -89,7 +90,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
      def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
          super().__init__(in_doc, path_or_stream)

-         _log.debug("MD INIT!!!")
+         _log.debug("Starting MarkdownDocumentBackend...")

          # Markdown file:
          self.path_or_stream = path_or_stream
@@ -131,7 +132,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
              for md_table_row in self.md_table_buffer:
                  _log.debug(md_table_row)
              _log.debug("=== TABLE END ===")
-             tcells: List[TableCell] = []
+             tcells: list[TableCell] = []
              result_table = []
              for n, md_table_row in enumerate(self.md_table_buffer):
                  data = []
@@ -232,11 +233,12 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
          element: marko.element.Element,
          depth: int,
          doc: DoclingDocument,
-         visited: Set[marko.element.Element],
+         visited: set[marko.element.Element],
          creation_stack: list[
              _CreationPayload
          ],  # stack for lazy item creation triggered deep in marko's AST (on RawText)
          list_ordered_flag_by_ref: dict[str, bool],
+         list_last_item_by_ref: dict[str, ListItem],
          parent_item: Optional[NodeItem] = None,
          formatting: Optional[Formatting] = None,
          hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -279,7 +281,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):

          elif (
              isinstance(element, marko.block.ListItem)
-             and len(element.children) == 1
+             and len(element.children) > 0
              and isinstance((child := element.children[0]), marko.block.Paragraph)
              and len(child.children) > 0
          ):
@@ -291,7 +293,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                  if parent_item
                  else False
              )
-             if len(child.children) > 1:  # inline group will be created further down
+             non_list_children: list[marko.element.Element] = [
+                 item
+                 for item in child.children
+                 if not isinstance(item, marko.block.ListItem)
+             ]
+             if len(non_list_children) > 1:  # inline group will be created further down
+                 parent_ref: Optional[str] = (
+                     parent_item.self_ref if parent_item else None
+                 )
              parent_item = self._create_list_item(
                  doc=doc,
                  parent_item=parent_item,
@@ -300,6 +310,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                      formatting=formatting,
                      hyperlink=hyperlink,
                  )
+                 if parent_ref:
+                     list_last_item_by_ref[parent_ref] = cast(ListItem, parent_item)
              else:
                  creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))

@@ -334,9 +346,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                  element.dest
              )

-         elif isinstance(element, marko.inline.RawText):
-             _log.debug(f" - Paragraph (raw text): {element.children}")
-             snippet_text = element.children.strip()
+         elif isinstance(element, (marko.inline.RawText, marko.inline.Literal)):
+             _log.debug(f" - RawText/Literal: {element.children}")
+             snippet_text = (
+                 element.children.strip() if isinstance(element.children, str) else ""
+             )
              # Detect start of the table:
              if "|" in snippet_text or self.in_table:
                  # most likely part of the markdown table
@@ -359,6 +373,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                              if parent_item
                              else False
                          )
+                         parent_ref = parent_item.self_ref if parent_item else None
                          parent_item = self._create_list_item(
                              doc=doc,
                              parent_item=parent_item,
@@ -367,6 +382,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                              formatting=formatting,
                              hyperlink=hyperlink,
                          )
+                         if parent_ref:
+                             list_last_item_by_ref[parent_ref] = cast(
+                                 ListItem, parent_item
+                             )
+
                      elif isinstance(to_create, _HeadingCreationPayload):
                          # not keeping as parent_item as logic for correctly tracking
                          # that not implemented yet (section components not captured
@@ -458,6 +478,17 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
              element, processed_block_types
          ):
              for child in element.children:
+                 if (
+                     isinstance(element, marko.block.ListItem)
+                     and isinstance(child, marko.block.List)
+                     and parent_item
+                     and list_last_item_by_ref.get(parent_item.self_ref, None)
+                 ):
+                     _log.debug(
+                         f"walking into new List hanging from item of parent list {parent_item.self_ref}"
+                     )
+                     parent_item = list_last_item_by_ref[parent_item.self_ref]
+
                  self._iterate_elements(
                      element=child,
                      depth=depth + 1,
@@ -465,6 +496,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                      visited=visited,
                      creation_stack=creation_stack,
                      list_ordered_flag_by_ref=list_ordered_flag_by_ref,
+                     list_last_item_by_ref=list_last_item_by_ref,
                      parent_item=parent_item,
                      formatting=formatting,
                      hyperlink=hyperlink,
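Together with the `list_last_item_by_ref` bookkeeping above, a nested sublist is now attached to the last created list item rather than to the enclosing list. A quick way to exercise the new path (a sketch; the file name is invented):

```python
from pathlib import Path

from docling.document_converter import DocumentConverter

# an invented sample file with a nested Markdown list
Path("nested.md").write_text("- parent\n    - child one\n    - child two\n")

result = DocumentConverter().convert("nested.md")
print(result.document.export_to_markdown())
```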
@@ -483,7 +515,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
          return False

      @classmethod
-     def supported_formats(cls) -> Set[InputFormat]:
+     def supported_formats(cls) -> set[InputFormat]:
          return {InputFormat.MD}

      def convert(self) -> DoclingDocument:
@@ -510,6 +542,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
              visited=set(),
              creation_stack=[],
              list_ordered_flag_by_ref={},
+             list_last_item_by_ref={},
          )
          self._close_table(doc=doc)  # handle any last hanging table

@@ -534,7 +567,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
              ]:
                  html_str = _restore_original_html(txt=html_str, regex=regex)
              self._html_blocks = 0
-
              # delegate to HTML backend
              stream = BytesIO(bytes(html_str, encoding="utf-8"))
              in_doc = InputDocument(

docling/backend/msword_backend.py
@@ -1104,8 +1104,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                  )
                  _log.debug(f" spanned before row {spanned_idx}")

+                 # Detect equations in cell text
+                 text, equations = self._handle_equations_in_text(
+                     element=cell._element, text=cell.text
+                 )
+                 if len(equations) == 0:
+                     text = cell.text
+                 else:
+                     text = text.replace("<eq>", "$").replace("</eq>", "$")
+
                  table_cell = TableCell(
-                     text=cell.text,
+                     text=text,
                      row_span=spanned_idx - row_idx,
                      col_span=cell.grid_span,
                      start_row_offset_idx=row.grid_cols_before + row_idx,

docling/backend/pdf_backend.py
@@ -57,7 +57,31 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
          if self.input_format is InputFormat.IMAGE:
              buf = BytesIO()
              img = Image.open(self.path_or_stream)
-             img.save(buf, "PDF")
+
+             # Handle multi-page TIFF images
+             if hasattr(img, "n_frames") and img.n_frames > 1:
+                 # Extract all frames from multi-page image
+                 frames = []
+                 try:
+                     for i in range(img.n_frames):
+                         img.seek(i)
+                         frame = img.copy().convert("RGB")
+                         frames.append(frame)
+                 except EOFError:
+                     pass
+
+                 # Save as multi-page PDF
+                 if frames:
+                     frames[0].save(
+                         buf, "PDF", save_all=True, append_images=frames[1:]
+                     )
+                 else:
+                     # Fallback to single page if frame extraction fails
+                     img.convert("RGB").save(buf, "PDF")
+             else:
+                 # Single page image - convert to RGB and save
+                 img.convert("RGB").save(buf, "PDF")
+
              buf.seek(0)
              self.path_or_stream = buf
          else:
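The Pillow idiom used here, `save_all=True` plus `append_images`, is what turns a list of frames into a multi-page PDF; a self-contained sketch with synthetic frames:

```python
from io import BytesIO

from PIL import Image

# two synthetic frames standing in for the pages of a multi-page TIFF
frames = [Image.new("RGB", (100, 100), color) for color in ("white", "gray")]

buf = BytesIO()
frames[0].save(buf, "PDF", save_all=True, append_images=frames[1:])
print(buf.getbuffer().nbytes)  # the buffer now holds a two-page PDF
```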

docling/datamodel/pipeline_options.py
@@ -332,3 +332,18 @@ class ProcessingPipeline(str, Enum):
      STANDARD = "standard"
      VLM = "vlm"
      ASR = "asr"
+
+
+ class ThreadedPdfPipelineOptions(PdfPipelineOptions):
+     """Pipeline options for the threaded PDF pipeline with batching and backpressure control"""
+
+     # Batch sizes for different stages
+     ocr_batch_size: int = 4
+     layout_batch_size: int = 4
+     table_batch_size: int = 4
+
+     # Timing control
+     batch_timeout_seconds: float = 2.0
+
+     # Backpressure and queue control
+     queue_max_size: int = 100

docling/datamodel/settings.py
@@ -26,18 +26,13 @@ class DocumentLimits(BaseModel):


  class BatchConcurrencySettings(BaseModel):
-     doc_batch_size: int = 2
-     doc_batch_concurrency: int = 2
-     page_batch_size: int = 4
-     page_batch_concurrency: int = 2
-     elements_batch_size: int = 16
-
-     # doc_batch_size: int = 1
-     # doc_batch_concurrency: int = 1
-     # page_batch_size: int = 1
-     # page_batch_concurrency: int = 1
-
-     # model_concurrency: int = 2
+     doc_batch_size: int = 1  # Number of documents processed in one batch. Should be >= doc_batch_concurrency
+     doc_batch_concurrency: int = 1  # Number of parallel threads processing documents. Warning: Experimental! No benefit expected without free-threaded python.
+     page_batch_size: int = 4  # Number of pages processed in one batch.
+     page_batch_concurrency: int = 1  # Currently unused.
+     elements_batch_size: int = (
+         16  # Number of elements processed in one batch, in enrichment models.
+     )

  # To force models into single core: export OMP_NUM_THREADS=1


docling/document_converter.py
@@ -4,6 +4,7 @@ import sys
  import threading
  import time
  from collections.abc import Iterable, Iterator
+ from concurrent.futures import ThreadPoolExecutor
  from functools import partial
  from pathlib import Path
  from typing import Dict, List, Optional, Tuple, Type, Union
@@ -284,24 +285,33 @@ class DocumentConverter:
              settings.perf.doc_batch_size,  # pass format_options
          ):
              _log.info("Going to convert document batch...")
+             process_func = partial(
+                 self._process_document, raises_on_error=raises_on_error
+             )

-             # parallel processing only within input_batch
-             # with ThreadPoolExecutor(
-             #     max_workers=settings.perf.doc_batch_concurrency
-             # ) as pool:
-             #     yield from pool.map(self.process_document, input_batch)
-             # Note: PDF backends are not thread-safe, thread pool usage was disabled.
-
-             for item in map(
-                 partial(self._process_document, raises_on_error=raises_on_error),
-                 input_batch,
+             if (
+                 settings.perf.doc_batch_concurrency > 1
+                 and settings.perf.doc_batch_size > 1
              ):
-                 elapsed = time.monotonic() - start_time
-                 start_time = time.monotonic()
-                 _log.info(
-                     f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
-                 )
-                 yield item
+                 with ThreadPoolExecutor(
+                     max_workers=settings.perf.doc_batch_concurrency
+                 ) as pool:
+                     for item in pool.map(
+                         process_func,
+                         input_batch,
+                     ):
+                         yield item
+             else:
+                 for item in map(
+                     process_func,
+                     input_batch,
+                 ):
+                     elapsed = time.monotonic() - start_time
+                     start_time = time.monotonic()
+                     _log.info(
+                         f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
+                     )
+                     yield item

      def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
          """Retrieve or initialize a pipeline, reusing instances based on class and options."""
@@ -330,7 +340,7 @@ class DocumentConverter:
                      f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
                  )

-                 return self.initialized_pipelines[cache_key]
+             return self.initialized_pipelines[cache_key]

      def _process_document(
          self, in_doc: InputDocument, raises_on_error: bool

docling/models/layout_model.py
@@ -3,7 +3,7 @@ import logging
  import warnings
  from collections.abc import Iterable
  from pathlib import Path
- from typing import Optional
+ from typing import List, Optional, Union

  import numpy as np
  from docling_core.types.doc import DocItemLabel
@@ -148,72 +148,90 @@ class LayoutModel(BasePageModel):
      def __call__(
          self, conv_res: ConversionResult, page_batch: Iterable[Page]
      ) -> Iterable[Page]:
-         for page in page_batch:
+         # Convert to list to allow multiple iterations
+         pages = list(page_batch)
+
+         # Separate valid and invalid pages
+         valid_pages = []
+         valid_page_images: List[Union[Image.Image, np.ndarray]] = []
+
+         for page in pages:
              assert page._backend is not None
              if not page._backend.is_valid():
-                 yield page
-             else:
-                 with TimeRecorder(conv_res, "layout"):
-                     assert page.size is not None
-                     page_image = page.get_image(scale=1.0)
-                     assert page_image is not None
-
-                     clusters = []
-                     for ix, pred_item in enumerate(
-                         self.layout_predictor.predict(page_image)
-                     ):
-                         label = DocItemLabel(
-                             pred_item["label"]
-                             .lower()
-                             .replace(" ", "_")
-                             .replace("-", "_")
-                         )  # Temporary, until docling-ibm-model uses docling-core types
-                         cluster = Cluster(
-                             id=ix,
-                             label=label,
-                             confidence=pred_item["confidence"],
-                             bbox=BoundingBox.model_validate(pred_item),
-                             cells=[],
-                         )
-                         clusters.append(cluster)
-
-                     if settings.debug.visualize_raw_layout:
-                         self.draw_clusters_and_cells_side_by_side(
-                             conv_res, page, clusters, mode_prefix="raw"
-                         )
-
-                     # Apply postprocessing
-
-                     processed_clusters, processed_cells = LayoutPostprocessor(
-                         page, clusters, self.options
-                     ).postprocess()
-                     # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
-
-                     with warnings.catch_warnings():
-                         warnings.filterwarnings(
-                             "ignore",
-                             "Mean of empty slice|invalid value encountered in scalar divide",
-                             RuntimeWarning,
-                             "numpy",
-                         )
-
-                         conv_res.confidence.pages[page.page_no].layout_score = float(
-                             np.mean([c.confidence for c in processed_clusters])
-                         )
-
-                         conv_res.confidence.pages[page.page_no].ocr_score = float(
-                             np.mean(
-                                 [c.confidence for c in processed_cells if c.from_ocr]
-                             )
-                         )
-
-                     page.predictions.layout = LayoutPrediction(
-                         clusters=processed_clusters
-                     )
-
-                     if settings.debug.visualize_layout:
-                         self.draw_clusters_and_cells_side_by_side(
-                             conv_res, page, processed_clusters, mode_prefix="postprocessed"
-                         )
+                 continue

+             assert page.size is not None
+             page_image = page.get_image(scale=1.0)
+             assert page_image is not None
+
+             valid_pages.append(page)
+             valid_page_images.append(page_image)
+
+         # Process all valid pages with batch prediction
+         batch_predictions = []
+         if valid_page_images:
+             with TimeRecorder(conv_res, "layout"):
+                 batch_predictions = self.layout_predictor.predict_batch(  # type: ignore[attr-defined]
+                     valid_page_images
+                 )
+
+         # Process each page with its predictions
+         valid_page_idx = 0
+         for page in pages:
+             assert page._backend is not None
+             if not page._backend.is_valid():
                  yield page
+                 continue
+
+             page_predictions = batch_predictions[valid_page_idx]
+             valid_page_idx += 1
+
+             clusters = []
+             for ix, pred_item in enumerate(page_predictions):
+                 label = DocItemLabel(
+                     pred_item["label"].lower().replace(" ", "_").replace("-", "_")
+                 )  # Temporary, until docling-ibm-model uses docling-core types
+                 cluster = Cluster(
+                     id=ix,
+                     label=label,
+                     confidence=pred_item["confidence"],
+                     bbox=BoundingBox.model_validate(pred_item),
+                     cells=[],
+                 )
+                 clusters.append(cluster)
+
+             if settings.debug.visualize_raw_layout:
+                 self.draw_clusters_and_cells_side_by_side(
+                     conv_res, page, clusters, mode_prefix="raw"
+                 )
+
+             # Apply postprocessing
+             processed_clusters, processed_cells = LayoutPostprocessor(
+                 page, clusters, self.options
+             ).postprocess()
+             # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
+
+             with warnings.catch_warnings():
+                 warnings.filterwarnings(
+                     "ignore",
+                     "Mean of empty slice|invalid value encountered in scalar divide",
+                     RuntimeWarning,
+                     "numpy",
+                 )
+
+                 conv_res.confidence.pages[page.page_no].layout_score = float(
+                     np.mean([c.confidence for c in processed_clusters])
+                 )
+
+                 conv_res.confidence.pages[page.page_no].ocr_score = float(
+                     np.mean([c.confidence for c in processed_cells if c.from_ocr])
+                 )
+
+             page.predictions.layout = LayoutPrediction(clusters=processed_clusters)
+
+             if settings.debug.visualize_layout:
+                 self.draw_clusters_and_cells_side_by_side(
+                     conv_res, page, processed_clusters, mode_prefix="postprocessed"
+                 )
+
+             yield page
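The shape of this refactor (batch-predict over the valid items, then re-walk the original sequence so ordering and invalid entries are preserved) distils into a few generic lines; a sketch with stand-in types, independent of docling:

```python
from typing import Callable, Iterable, Iterator, List, Optional, Tuple, TypeVar

T = TypeVar("T")
P = TypeVar("P")


def batch_preserving_order(
    items: Iterable[T],
    is_valid: Callable[[T], bool],
    predict_batch: Callable[[List[T]], List[P]],
) -> Iterator[Tuple[T, Optional[P]]]:
    """Run one batched prediction over valid items, yield every item in order."""
    materialized = list(items)  # allow two passes over the input
    valid = [x for x in materialized if is_valid(x)]
    preds = iter(predict_batch(valid) if valid else [])
    for x in materialized:
        yield (x, next(preds)) if is_valid(x) else (x, None)
```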

docling/pipeline/base_pipeline.py
@@ -217,7 +217,13 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
          return conv_res

      def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
-         status = ConversionStatus.SUCCESS
+         status = conv_res.status
+         if status in [
+             ConversionStatus.PENDING,
+             ConversionStatus.STARTED,
+         ]:  # preserves ConversionStatus.PARTIAL_SUCCESS
+             status = ConversionStatus.SUCCESS
+
          for page in conv_res.pages:
              if page._backend is None or not page._backend.is_valid():
                  conv_res.errors.append(

docling/pipeline/threaded_standard_pdf_pipeline.py (new file)
@@ -0,0 +1,605 @@
+ # threaded_standard_pdf_pipeline.py
+ """Thread-safe, production-ready PDF pipeline
+ ================================================
+ A self-contained, thread-safe PDF conversion pipeline exploiting parallelism between pipeline stages and models.
+
+ * **Per-run isolation** - every :py:meth:`execute` call uses its own bounded queues and worker
+   threads so that concurrent invocations never share mutable state.
+ * **Deterministic run identifiers** - pages are tracked with an internal *run-id* instead of
+   relying on :pyfunc:`id`, which may clash after garbage collection.
+ * **Explicit back-pressure & shutdown** - producers block on full queues; queue *close()*
+   propagates downstream so stages terminate deterministically without sentinels.
+ * **Minimal shared state** - heavyweight models are initialised once per pipeline instance
+   and only read by worker threads; no runtime mutability is exposed.
+ * **Strict typing & clean API usage** - code is fully annotated and respects *coding_rules.md*.
+ """
+
+ from __future__ import annotations
+
+ import itertools
+ import logging
+ import threading
+ import time
+ from collections import defaultdict, deque
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import Any, Iterable, List, Optional, Sequence, Tuple
+
+ from docling.backend.abstract_backend import AbstractDocumentBackend
+ from docling.backend.pdf_backend import PdfDocumentBackend
+ from docling.datamodel.base_models import AssembledUnit, ConversionStatus, Page
+ from docling.datamodel.document import ConversionResult
+ from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
+ from docling.datamodel.settings import settings
+ from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
+ from docling.models.document_picture_classifier import (
+     DocumentPictureClassifier,
+     DocumentPictureClassifierOptions,
+ )
+ from docling.models.factories import get_ocr_factory, get_picture_description_factory
+ from docling.models.layout_model import LayoutModel
+ from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
+ from docling.models.page_preprocessing_model import (
+     PagePreprocessingModel,
+     PagePreprocessingOptions,
+ )
+ from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+ from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
+ from docling.models.table_structure_model import TableStructureModel
+ from docling.pipeline.base_pipeline import BasePipeline
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
+ from docling.utils.utils import chunkify
+
+ _log = logging.getLogger(__name__)
+
+ # ──────────────────────────────────────────────────────────────────────────────
+ # Helper data structures
+ # ──────────────────────────────────────────────────────────────────────────────
+
+
+ @dataclass
+ class ThreadedItem:
+     """Envelope that travels between pipeline stages."""
+
+     payload: Optional[Page]
+     run_id: int  # Unique per *execute* call, monotonic across pipeline instance
+     page_no: int
+     conv_res: ConversionResult
+     error: Optional[Exception] = None
+     is_failed: bool = False
+
+
+ @dataclass
+ class ProcessingResult:
+     """Aggregated outcome of a pipeline run."""
+
+     pages: List[Page] = field(default_factory=list)
+     failed_pages: List[Tuple[int, Exception]] = field(default_factory=list)
+     total_expected: int = 0
+
+     @property
+     def success_count(self) -> int:
+         return len(self.pages)
+
+     @property
+     def failure_count(self) -> int:
+         return len(self.failed_pages)
+
+     @property
+     def is_partial_success(self) -> bool:
+         return 0 < self.success_count < self.total_expected
+
+     @property
+     def is_complete_failure(self) -> bool:
+         return self.success_count == 0 and self.failure_count > 0
+
+
+ class ThreadedQueue:
+     """Bounded queue with blocking put/get_batch and explicit *close()* semantics."""
+
+     __slots__ = ("_closed", "_items", "_lock", "_max", "_not_empty", "_not_full")
+
+     def __init__(self, max_size: int) -> None:
+         self._max: int = max_size
+         self._items: deque[ThreadedItem] = deque()
+         self._lock = threading.Lock()
+         self._not_full = threading.Condition(self._lock)
+         self._not_empty = threading.Condition(self._lock)
+         self._closed = False
+
+     # ---------------------------------------------------------------- put()
+     def put(self, item: ThreadedItem, timeout: Optional[float] | None = None) -> bool:
+         """Block until queue accepts *item* or is closed. Returns *False* if closed."""
+         with self._not_full:
+             if self._closed:
+                 return False
+             start = time.monotonic()
+             while len(self._items) >= self._max and not self._closed:
+                 if timeout is not None:
+                     remaining = timeout - (time.monotonic() - start)
+                     if remaining <= 0:
+                         return False
+                     self._not_full.wait(remaining)
+                 else:
+                     self._not_full.wait()
+             if self._closed:
+                 return False
+             self._items.append(item)
+             self._not_empty.notify()
+             return True
+
+     # ------------------------------------------------------------ get_batch()
+     def get_batch(
+         self, size: int, timeout: Optional[float] | None = None
+     ) -> List[ThreadedItem]:
+         """Return up to *size* items. Blocks until ≥1 item present or queue closed/timeout."""
+         with self._not_empty:
+             start = time.monotonic()
+             while not self._items and not self._closed:
+                 if timeout is not None:
+                     remaining = timeout - (time.monotonic() - start)
+                     if remaining <= 0:
+                         return []
+                     self._not_empty.wait(remaining)
+                 else:
+                     self._not_empty.wait()
+             batch: List[ThreadedItem] = []
+             while self._items and len(batch) < size:
+                 batch.append(self._items.popleft())
+             if batch:
+                 self._not_full.notify_all()
+             return batch
+
+     # ---------------------------------------------------------------- close()
+     def close(self) -> None:
+         with self._lock:
+             self._closed = True
+             self._not_empty.notify_all()
+             self._not_full.notify_all()
+
+     # -------------------------------------------------------------- property
+     @property
+     def closed(self) -> bool:
+         return self._closed
+
+
+ class ThreadedPipelineStage:
+     """A single pipeline stage backed by one worker thread."""
+
+     def __init__(
+         self,
+         *,
+         name: str,
+         model: Any,
+         batch_size: int,
+         batch_timeout: float,
+         queue_max_size: int,
+     ) -> None:
+         self.name = name
+         self.model = model
+         self.batch_size = batch_size
+         self.batch_timeout = batch_timeout
+         self.input_queue = ThreadedQueue(queue_max_size)
+         self._outputs: list[ThreadedQueue] = []
+         self._thread: Optional[threading.Thread] = None
+         self._running = False
+
+     # ---------------------------------------------------------------- wiring
+     def add_output_queue(self, q: ThreadedQueue) -> None:
+         self._outputs.append(q)
+
+     # -------------------------------------------------------------- lifecycle
+     def start(self) -> None:
+         if self._running:
+             return
+         self._running = True
+         self._thread = threading.Thread(
+             target=self._run, name=f"Stage-{self.name}", daemon=False
+         )
+         self._thread.start()
+
+     def stop(self) -> None:
+         if not self._running:
+             return
+         self._running = False
+         self.input_queue.close()
+         if self._thread is not None:
+             self._thread.join(timeout=30.0)
+             if self._thread.is_alive():
+                 _log.warning("Stage %s did not terminate cleanly within 30s", self.name)
+
+     # ------------------------------------------------------------------ _run
+     def _run(self) -> None:
+         try:
+             while self._running:
+                 batch = self.input_queue.get_batch(self.batch_size, self.batch_timeout)
+                 if not batch and self.input_queue.closed:
+                     break
+                 processed = self._process_batch(batch)
+                 self._emit(processed)
+         except Exception:  # pragma: no cover - top-level guard
+             _log.exception("Fatal error in stage %s", self.name)
+         finally:
+             for q in self._outputs:
+                 q.close()
+
+     # ----------------------------------------------------- _process_batch()
+     def _process_batch(self, batch: Sequence[ThreadedItem]) -> list[ThreadedItem]:
+         """Run *model* on *batch* grouped by run_id to maximise batching."""
+         groups: dict[int, list[ThreadedItem]] = defaultdict(list)
+         for itm in batch:
+             groups[itm.run_id].append(itm)
+
+         result: list[ThreadedItem] = []
+         for rid, items in groups.items():
+             good: list[ThreadedItem] = [i for i in items if not i.is_failed]
+             if not good:
+                 result.extend(items)
+                 continue
+             try:
+                 # Filter out None payloads and ensure type safety
+                 pages_with_payloads = [
+                     (i, i.payload) for i in good if i.payload is not None
+                 ]
+                 if len(pages_with_payloads) != len(good):
+                     # Some items have None payloads, mark all as failed
+                     for it in items:
+                         it.is_failed = True
+                         it.error = RuntimeError("Page payload is None")
+                     result.extend(items)
+                     continue
+
+                 pages: List[Page] = [payload for _, payload in pages_with_payloads]
+                 processed_pages = list(self.model(good[0].conv_res, pages))  # type: ignore[arg-type]
+                 if len(processed_pages) != len(pages):  # strict mismatch guard
+                     raise RuntimeError(
+                         f"Model {self.name} returned wrong number of pages"
+                     )
+                 for idx, page in enumerate(processed_pages):
+                     result.append(
+                         ThreadedItem(
+                             payload=page,
+                             run_id=rid,
+                             page_no=good[idx].page_no,
+                             conv_res=good[idx].conv_res,
+                         )
+                     )
+             except Exception as exc:
+                 _log.error("Stage %s failed for run %d: %s", self.name, rid, exc)
+                 for it in items:
+                     it.is_failed = True
+                     it.error = exc
+                 result.extend(items)
+         return result
+
+     # -------------------------------------------------------------- _emit()
+     def _emit(self, items: Iterable[ThreadedItem]) -> None:
+         for item in items:
+             for q in self._outputs:
+                 if not q.put(item):
+                     _log.error("Output queue closed while emitting from %s", self.name)
+
+
+ @dataclass
+ class RunContext:
+     """Wiring for a single *execute* call."""
+
+     stages: list[ThreadedPipelineStage]
+     first_stage: ThreadedPipelineStage
+     output_queue: ThreadedQueue
+
+
+ # ──────────────────────────────────────────────────────────────────────────────
+ # Main pipeline
+ # ──────────────────────────────────────────────────────────────────────────────
+
+
+ class ThreadedStandardPdfPipeline(BasePipeline):
+     """High-performance PDF pipeline with multi-threaded stages."""
+
+     def __init__(self, pipeline_options: ThreadedPdfPipelineOptions) -> None:
+         super().__init__(pipeline_options)
+         self.pipeline_options: ThreadedPdfPipelineOptions = pipeline_options
+         self._run_seq = itertools.count(1)  # deterministic, monotonic run ids
+
+         # initialise heavy models once
+         self._init_models()
+
+     # ────────────────────────────────────────────────────────────────────────
+     # Heavy-model initialisation & helpers
+     # ────────────────────────────────────────────────────────────────────────
+
+     def _init_models(self) -> None:
+         art_path = self._resolve_artifacts_path()
+         self.keep_images = (
+             self.pipeline_options.generate_page_images
+             or self.pipeline_options.generate_picture_images
+             or self.pipeline_options.generate_table_images
+         )
+         self.preprocessing_model = PagePreprocessingModel(
+             options=PagePreprocessingOptions(
+                 images_scale=self.pipeline_options.images_scale
+             )
+         )
+         self.ocr_model = self._make_ocr_model(art_path)
+         self.layout_model = LayoutModel(
+             artifacts_path=art_path,
+             accelerator_options=self.pipeline_options.accelerator_options,
+             options=self.pipeline_options.layout_options,
+         )
+         self.table_model = TableStructureModel(
+             enabled=self.pipeline_options.do_table_structure,
+             artifacts_path=art_path,
+             options=self.pipeline_options.table_structure_options,
+             accelerator_options=self.pipeline_options.accelerator_options,
+         )
+         self.assemble_model = PageAssembleModel(options=PageAssembleOptions())
+         self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
+
+         # --- optional enrichment ------------------------------------------------
+         self.enrichment_pipe = []
+         code_formula = CodeFormulaModel(
+             enabled=self.pipeline_options.do_code_enrichment
+             or self.pipeline_options.do_formula_enrichment,
+             artifacts_path=art_path,
+             options=CodeFormulaModelOptions(
+                 do_code_enrichment=self.pipeline_options.do_code_enrichment,
+                 do_formula_enrichment=self.pipeline_options.do_formula_enrichment,
+             ),
+             accelerator_options=self.pipeline_options.accelerator_options,
+         )
+         if code_formula.enabled:
+             self.enrichment_pipe.append(code_formula)
+
+         picture_classifier = DocumentPictureClassifier(
+             enabled=self.pipeline_options.do_picture_classification,
+             artifacts_path=art_path,
+             options=DocumentPictureClassifierOptions(),
+             accelerator_options=self.pipeline_options.accelerator_options,
+         )
+         if picture_classifier.enabled:
+             self.enrichment_pipe.append(picture_classifier)
+
+         picture_descr = self._make_picture_description_model(art_path)
+         if picture_descr and picture_descr.enabled:
+             self.enrichment_pipe.append(picture_descr)
+
+         self.keep_backend = any(
+             (
+                 self.pipeline_options.do_formula_enrichment,
+                 self.pipeline_options.do_code_enrichment,
+                 self.pipeline_options.do_picture_classification,
+                 self.pipeline_options.do_picture_description,
+             )
+         )
+
+     # ---------------------------------------------------------------- helpers
+     def _resolve_artifacts_path(self) -> Optional[Path]:
+         if self.pipeline_options.artifacts_path:
+             p = Path(self.pipeline_options.artifacts_path).expanduser()
+         elif settings.artifacts_path:
+             p = Path(settings.artifacts_path).expanduser()
+         else:
+             return None
+         if not p.is_dir():
+             raise RuntimeError(
+                 f"{p} does not exist or is not a directory containing the required models"
+             )
+         return p
+
+     def _make_ocr_model(self, art_path: Optional[Path]) -> Any:
+         factory = get_ocr_factory(
+             allow_external_plugins=self.pipeline_options.allow_external_plugins
+         )
+         return factory.create_instance(
+             options=self.pipeline_options.ocr_options,
+             enabled=self.pipeline_options.do_ocr,
+             artifacts_path=art_path,
+             accelerator_options=self.pipeline_options.accelerator_options,
+         )
+
+     def _make_picture_description_model(
+         self, art_path: Optional[Path]
+     ) -> Optional[PictureDescriptionBaseModel]:
+         factory = get_picture_description_factory(
+             allow_external_plugins=self.pipeline_options.allow_external_plugins
+         )
+         return factory.create_instance(
+             options=self.pipeline_options.picture_description_options,
+             enabled=self.pipeline_options.do_picture_description,
+             enable_remote_services=self.pipeline_options.enable_remote_services,
+             artifacts_path=art_path,
+             accelerator_options=self.pipeline_options.accelerator_options,
+         )
+
+     # ────────────────────────────────────────────────────────────────────────
+     # Build - thread pipeline
+     # ────────────────────────────────────────────────────────────────────────
+
+     def _create_run_ctx(self) -> RunContext:
+         opts = self.pipeline_options
+         preprocess = ThreadedPipelineStage(
+             name="preprocess",
+             model=self.preprocessing_model,
+             batch_size=1,
+             batch_timeout=opts.batch_timeout_seconds,
+             queue_max_size=opts.queue_max_size,
+         )
+         ocr = ThreadedPipelineStage(
+             name="ocr",
+             model=self.ocr_model,
+             batch_size=opts.ocr_batch_size,
+             batch_timeout=opts.batch_timeout_seconds,
+             queue_max_size=opts.queue_max_size,
+         )
+         layout = ThreadedPipelineStage(
+             name="layout",
+             model=self.layout_model,
+             batch_size=opts.layout_batch_size,
+             batch_timeout=opts.batch_timeout_seconds,
+             queue_max_size=opts.queue_max_size,
+         )
+         table = ThreadedPipelineStage(
+             name="table",
+             model=self.table_model,
+             batch_size=opts.table_batch_size,
+             batch_timeout=opts.batch_timeout_seconds,
+             queue_max_size=opts.queue_max_size,
+         )
+         assemble = ThreadedPipelineStage(
+             name="assemble",
+             model=self.assemble_model,
+             batch_size=1,
+             batch_timeout=opts.batch_timeout_seconds,
+             queue_max_size=opts.queue_max_size,
+         )
+
+         # wire stages
+         output_q = ThreadedQueue(opts.queue_max_size)
+         preprocess.add_output_queue(ocr.input_queue)
+         ocr.add_output_queue(layout.input_queue)
+         layout.add_output_queue(table.input_queue)
+         table.add_output_queue(assemble.input_queue)
+         assemble.add_output_queue(output_q)
+
+         stages = [preprocess, ocr, layout, table, assemble]
+         return RunContext(stages=stages, first_stage=preprocess, output_queue=output_q)
+
+     # --------------------------------------------------------------------- build
+     def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
+         """Stream-build the document while interleaving producer and consumer work."""
+         run_id = next(self._run_seq)
+         assert isinstance(conv_res.input._backend, PdfDocumentBackend)
+         backend = conv_res.input._backend
+
+         # preload & initialise pages -------------------------------------------------------------
+         start_page, end_page = conv_res.input.limits.page_range
+         pages: list[Page] = []
+         for i in range(conv_res.input.page_count):
+             if start_page - 1 <= i <= end_page - 1:
+                 page = Page(page_no=i)
+                 page._backend = backend.load_page(i)
+                 if page._backend and page._backend.is_valid():
+                     page.size = page._backend.get_size()
+                     conv_res.pages.append(page)
+                     pages.append(page)
+
+         if not pages:
+             conv_res.status = ConversionStatus.FAILURE
+             return conv_res
+
+         total_pages: int = len(pages)
+         ctx: RunContext = self._create_run_ctx()
+         for st in ctx.stages:
+             st.start()
+
+         proc = ProcessingResult(total_expected=total_pages)
+         fed_idx: int = 0  # number of pages successfully queued
+         batch_size: int = 32  # drain chunk
+         try:
+             while proc.success_count + proc.failure_count < total_pages:
+                 # 1) feed - try to enqueue until the first queue is full
+                 while fed_idx < total_pages:
+                     ok = ctx.first_stage.input_queue.put(
+                         ThreadedItem(
+                             payload=pages[fed_idx],
+                             run_id=run_id,
+                             page_no=pages[fed_idx].page_no,
+                             conv_res=conv_res,
+                         ),
+                         timeout=0.0,  # non-blocking try-put
+                     )
+                     if ok:
+                         fed_idx += 1
+                         if fed_idx == total_pages:
+                             ctx.first_stage.input_queue.close()
+                     else:  # queue full - switch to draining
+                         break
+
+                 # 2) drain - pull whatever is ready from the output side
+                 out_batch = ctx.output_queue.get_batch(batch_size, timeout=0.05)
+                 for itm in out_batch:
+                     if itm.run_id != run_id:
+                         continue
+                     if itm.is_failed or itm.error:
+                         proc.failed_pages.append(
+                             (itm.page_no, itm.error or RuntimeError("unknown error"))
+                         )
+                     else:
+                         assert itm.payload is not None
+                         proc.pages.append(itm.payload)
+
+                 # 3) failure safety - downstream closed early -> mark missing pages failed
+                 if not out_batch and ctx.output_queue.closed:
+                     missing = total_pages - (proc.success_count + proc.failure_count)
+                     if missing > 0:
+                         proc.failed_pages.extend(
+                             [(-1, RuntimeError("pipeline terminated early"))] * missing
+                         )
+                     break
+         finally:
+             for st in ctx.stages:
+                 st.stop()
+             ctx.output_queue.close()
+
+         self._integrate_results(conv_res, proc)
+         return conv_res
+
+     # ---------------------------------------------------- integrate_results()
+     def _integrate_results(
+         self, conv_res: ConversionResult, proc: ProcessingResult
+     ) -> None:
+         page_map = {p.page_no: p for p in proc.pages}
+         conv_res.pages = [
+             page_map.get(p.page_no, p)
+             for p in conv_res.pages
+             if p.page_no in page_map
+             or not any(fp == p.page_no for fp, _ in proc.failed_pages)
+         ]
+         if proc.is_complete_failure:
+             conv_res.status = ConversionStatus.FAILURE
+         elif proc.is_partial_success:
+             conv_res.status = ConversionStatus.PARTIAL_SUCCESS
+         else:
+             conv_res.status = ConversionStatus.SUCCESS
+         if not self.keep_images:
+             for p in conv_res.pages:
+                 p._image_cache = {}
+         if not self.keep_backend:
+             for p in conv_res.pages:
+                 if p._backend is not None:
+                     p._backend.unload()
+
+     # ---------------------------------------------------------------- assemble
+     def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
+         elements, headers, body = [], [], []
+         with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
+             for p in conv_res.pages:
+                 if p.assembled:
+                     elements.extend(p.assembled.elements)
+                     headers.extend(p.assembled.headers)
+                     body.extend(p.assembled.body)
+             conv_res.assembled = AssembledUnit(
+                 elements=elements, headers=headers, body=body
+             )
+             conv_res.document = self.reading_order_model(conv_res)
+         return conv_res
+
+     # ---------------------------------------------------------------- misc
+     @classmethod
+     def get_default_options(cls) -> ThreadedPdfPipelineOptions:
+         return ThreadedPdfPipelineOptions()
+
+     @classmethod
+     def is_backend_supported(cls, backend: AbstractDocumentBackend) -> bool:
+         return isinstance(backend, PdfDocumentBackend)
+
+     def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
+         return conv_res.status
+
+     def _unload(self, conv_res: ConversionResult) -> None:
+         for p in conv_res.pages:
+             if p._backend is not None:
+                 p._backend.unload()
+         if conv_res.input._backend:
+             conv_res.input._backend.unload()
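`ThreadedQueue` is self-contained, so its feed/drain contract is easy to exercise outside the pipeline. A sketch (Python annotations are not enforced at runtime, so plain strings stand in for `ThreadedItem` here):

```python
import threading

from docling.pipeline.threaded_standard_pdf_pipeline import ThreadedQueue

q = ThreadedQueue(max_size=2)


def producer() -> None:
    for word in ("alpha", "beta", "gamma"):
        q.put(word)  # blocks while the queue is full (back-pressure)
    q.close()  # lets the consumer's get_batch() return empty once drained


t = threading.Thread(target=producer)
t.start()
while True:
    batch = q.get_batch(size=2)
    if not batch:
        break  # queue closed and fully drained
    print(batch)
t.join()
```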

docling-2.42.1.dist-info/METADATA → docling-2.43.0.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: docling
- Version: 2.42.1
+ Version: 2.43.0
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
  License-Expression: MIT
@@ -28,9 +28,9 @@ License-File: LICENSE
  Requires-Dist: pydantic<3.0.0,>=2.0.0
  Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
  Requires-Dist: docling-parse<5.0.0,>=4.0.0
- Requires-Dist: docling-ibm-models<4,>=3.6.0
+ Requires-Dist: docling-ibm-models<4,>=3.9.0
  Requires-Dist: filetype<2.0.0,>=1.2.0
- Requires-Dist: pypdfium2<5.0.0,>=4.30.0
+ Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
  Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
  Requires-Dist: huggingface_hub<1,>=0.23
  Requires-Dist: requests<3.0.0,>=2.32.2
@@ -89,6 +89,7 @@ Dynamic: license-file
  [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
  [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
+ [![Chat with Dosu](https://dosu.dev/dosu-chat-badge.svg)](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
  [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
  [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)


docling-2.42.1.dist-info/RECORD → docling-2.43.0.dist-info/RECORD
@@ -1,5 +1,5 @@
  docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- docling/document_converter.py,sha256=9aH8B30_jOYN4P_ySCCvtgEb3GoIpec15r7lEAFlMDU,14469
+ docling/document_converter.py,sha256=pYlozCp6X1iGO75m3KSudMfrSCrXihTlRpKARFN67BI,14757
  docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
  docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -9,13 +9,13 @@ docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE
  docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
  docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
  docling/backend/docling_parse_v4_backend.py,sha256=qR_WRVq9JGtRioWCw6MnLWgbvXbC6Y1yds7Ol1-E6UQ,6550
- docling/backend/html_backend.py,sha256=gGkm3i7FpW2WCJ-_GPpOJNh1LUq1_-vRGyGURuPagck,19284
- docling/backend/md_backend.py,sha256=mfwGj8g2hGC-Q_HREtl_Web65uMVXD-Ie1nRqWTXzF0,21013
+ docling/backend/html_backend.py,sha256=Nuzyp6kyjd0g_MsBEPiWdFWU5w9UM60yWSluwU5C0M4,20310
+ docling/backend/md_backend.py,sha256=qCI7SD9hnWWGrkG_drpzQv2Z7DVBG4Tsq3hhTsYV790,22562
  docling/backend/msexcel_backend.py,sha256=cq8MQ2RSh6pqCiVrldjOerSww7dOPTWmCQoCBI57i6w,18579
  docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
- docling/backend/msword_backend.py,sha256=7mzPCF4bGWZPst5ntoV3aSxH5WUu2nBP-l8lgQT3tdw,44544
+ docling/backend/msword_backend.py,sha256=DxMgPcq-Ao1vq7X2v8qqWeMs9MryPw_Jw3YRAAdXBtM,44904
  docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
- docling/backend/pdf_backend.py,sha256=KE9TMuFO5WX-o5A_DAd4tEaLi4HMZ4XjKdpllItVkWM,2238
+ docling/backend/pdf_backend.py,sha256=sUBrCz1zvt6E7sVl4xHtrkpTBClOK0vBV2lLi_TRHNg,3237
  docling/backend/pypdfium2_backend.py,sha256=8dVniLHgiTdJuDbYr66kPp6Ccv5ZDlqDMEbA2xIfS7U,13370
  docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -37,10 +37,10 @@ docling/datamodel/asr_model_specs.py,sha256=Wg7z3zm_wXIWu122iPVy0RMECsA_JCFHrlFF
  docling/datamodel/base_models.py,sha256=9FslHkGUNmBp264LpLL_2JTfDAdaikldYs3SiQOHb5A,11828
  docling/datamodel/document.py,sha256=CA_dgt4V_phze5HXpfgfKNBKd1cPC1o3WE_IENX63EM,16252
  docling/datamodel/layout_model_specs.py,sha256=GSkJ-Z_0PVgwWGi7C7TsxbzRjlrWS9ZrHJjHumv-Z5U,2339
- docling/datamodel/pipeline_options.py,sha256=nlejeQjnJx2RBMkCukDECHGuVEOol9hbsSLUi2ee9hY,10134
+ docling/datamodel/pipeline_options.py,sha256=TaBmCBRjSxyoh79UkpEkPzokLYS8BA2QJam86g9pT5g,10544
  docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
  docling/datamodel/pipeline_options_vlm_model.py,sha256=z-pUqwRA8nJp6C3SEXZLem2zvSYdgavaAVYa8wkAIZY,2400
- docling/datamodel/settings.py,sha256=ajMz7Ao2m0ZGYkfArqTDDbiF89O408mtgeh06PUi0MA,1900
+ docling/datamodel/settings.py,sha256=c0MTw6pO5be_BKxHKYl4SaBJAw_qL-aapxp-g5HHj1A,2084
  docling/datamodel/vlm_model_specs.py,sha256=--jZexGeu-s_lWp7y_WwWEf6CD1J4XqADrS1-OY_pWM,4737
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  docling/models/api_vlm_model.py,sha256=foBvzaWeHFH1t-VdvRWLdiXiiofhvhjvHqRI0eNA_3w,2923
@@ -49,7 +49,7 @@ docling/models/base_ocr_model.py,sha256=kT8TylASOpPlY60rIG6VL6_eLVsfg5KvEVnZHzDW
  docling/models/code_formula_model.py,sha256=5uWh-eI-Ejmv3DujKJoKKgJBuvPLokt7AJ_ybt8VHEw,11373
  docling/models/document_picture_classifier.py,sha256=9JvoWeH5uQBC7levjM8zptk7UT-b8EQnD-2EnxTjTT4,6202
  docling/models/easyocr_model.py,sha256=ECPBd-48cCw5s935NsPJO_C_1QuK_yAUGloMM77WqIM,7387
- docling/models/layout_model.py,sha256=8bfLVKCS2A-ePTQK-T4M2K_Ah-jUVj71YOtwZvZ9rsU,8825
+ docling/models/layout_model.py,sha256=Nfbo6keMB4vVjGoZdFMqD9CmZcWh-0bE3LkRjJTDJQ0,9146
  docling/models/ocr_mac_model.py,sha256=y-1DSFDbACHpEwNTfQwzN9ab8r5j5rBFNPtQ48BzsrA,5396
  docling/models/page_assemble_model.py,sha256=TvN1naez7dUodLxpUUBzpuMCpqZBTf6YSpewxgjzmrg,6323
  docling/models/page_preprocessing_model.py,sha256=x8MI4mvjizqEqAb5511dtrNRCJSb-lSmwHw0tmHPFiI,5103
@@ -74,9 +74,10 @@ docling/models/vlm_models_inline/hf_transformers_model.py,sha256=LAnWFIHGblWln6D
  docling/models/vlm_models_inline/mlx_model.py,sha256=p-H6wG31iVRoOjsqYaCVa4pEzxMP3vzLcsUatMjDJDQ,5948
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  docling/pipeline/asr_pipeline.py,sha256=tQkhu9fXdkSuYIL22xzV2YRUlQh-9qktHBbs2qeXhJI,9070
- docling/pipeline/base_pipeline.py,sha256=14yQrDjsojl4RgbBjKFSEfVBYR_sULZfBI1uDzFLi8Y,9331
+ docling/pipeline/base_pipeline.py,sha256=iwUqmttXF9D2myXyCAaIqFuGjBFhPkjAybcSAGpww-Q,9525
  docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
  docling/pipeline/standard_pdf_pipeline.py,sha256=yFishq4Cu01BiBGHk3Irr7ogcTQKeSC0QZImQVAhIaY,12740
+ docling/pipeline/threaded_standard_pdf_pipeline.py,sha256=Rjdq1x2fRHBA0rMHJ6rqqHzxVVzgTEALBBj5d30oOZ8,26018
  docling/pipeline/vlm_pipeline.py,sha256=0lj8tbXNpYF8OLBoLqP2BZfFpTHi40RoHVfvO_Nah4Q,15349
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  docling/utils/accelerator_utils.py,sha256=DSajLxVx1JEVT0zt5de26llciLNlVfIDfSa2zYCFJzQ,2909
@@ -91,9 +92,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
  docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
  docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
- docling-2.42.1.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
- docling-2.42.1.dist-info/METADATA,sha256=d46NOPDEps6dVLLMh3tWBCEQv7b_bwQQ46ndyqVO-ag,10310
- docling-2.42.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- docling-2.42.1.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
- docling-2.42.1.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
- docling-2.42.1.dist-info/RECORD,,
+ docling-2.43.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+ docling-2.43.0.dist-info/METADATA,sha256=HS5J6rDKaZ_G_d4p10XgAwrNe-FjmHV-u5EmoTP4hro,10458
+ docling-2.43.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ docling-2.43.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
+ docling-2.43.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
+ docling-2.43.0.dist-info/RECORD,,