PyPI - docling - Versions diffs - 2.36.1__py3-none-any.whl → 2.38.0__py3-none-any.whl - Mend

docling-2.36.1-py3-none-any.whl → docling-2.38.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40)

docling/backend/asciidoc_backend.py +39 -18
docling/backend/docling_parse_backend.py +61 -59
docling/backend/docling_parse_v2_backend.py +72 -62
docling/backend/docling_parse_v4_backend.py +21 -19
docling/backend/md_backend.py +101 -81
docling/backend/mspowerpoint_backend.py +72 -113
docling/backend/msword_backend.py +99 -80
docling/backend/noop_backend.py +51 -0
docling/backend/pypdfium2_backend.py +127 -53
docling/cli/main.py +82 -14
docling/datamodel/asr_model_specs.py +92 -0
docling/datamodel/base_models.py +21 -4
docling/datamodel/document.py +3 -1
docling/datamodel/pipeline_options.py +15 -2
docling/datamodel/pipeline_options_asr_model.py +57 -0
docling/datamodel/pipeline_options_vlm_model.py +4 -4
docling/document_converter.py +8 -0
docling/models/api_vlm_model.py +3 -1
docling/models/base_model.py +1 -1
docling/models/base_ocr_model.py +33 -11
docling/models/easyocr_model.py +1 -1
docling/models/layout_model.py +2 -3
docling/models/ocr_mac_model.py +1 -1
docling/models/page_preprocessing_model.py +3 -6
docling/models/rapid_ocr_model.py +1 -1
docling/models/readingorder_model.py +3 -3
docling/models/tesseract_ocr_cli_model.py +4 -3
docling/models/tesseract_ocr_model.py +1 -1
docling/models/vlm_models_inline/hf_transformers_model.py +4 -1
docling/models/vlm_models_inline/mlx_model.py +3 -1
docling/pipeline/asr_pipeline.py +253 -0
docling/pipeline/base_pipeline.py +11 -0
docling/pipeline/standard_pdf_pipeline.py +0 -1
docling/utils/layout_postprocessor.py +11 -6
{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/METADATA +7 -4
{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/RECORD +40 -36
{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/WHEEL +0 -0
{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/entry_points.txt +0 -0
{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/licenses/LICENSE +0 -0
{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/top_level.txt +0 -0

docling/models/page_preprocessing_model.py CHANGED Viewed

@@ -2,7 +2,7 @@ import re
 import warnings
 from collections.abc import Iterable
 from pathlib import Path
-from typing import Optional
+from typing import Literal, Optional
 import numpy as np
 from PIL import ImageDraw
@@ -17,7 +17,6 @@ from docling.utils.profiling import TimeRecorder
 class PagePreprocessingOptions(BaseModel):
     images_scale: Optional[float]
-    create_parsed_page: bool
 class PagePreprocessingModel(BasePageModel):
@@ -66,10 +65,8 @@ class PagePreprocessingModel(BasePageModel):
     def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
         assert page._backend is not None
-        page.cells = list(page._backend.get_text_cells())
-        if self.options.create_parsed_page:
-            page.parsed_page = page._backend.get_segmented_page()
+        page.parsed_page = page._backend.get_segmented_page()
+        assert page.parsed_page is not None
         # Rate the text quality from the PDF parser, and aggregate on page
         text_scores = []

docling/models/rapid_ocr_model.py CHANGED Viewed

@@ -134,7 +134,7 @@ class RapidOcrModel(BaseOcrModel):
                             all_ocr_cells.extend(cells)
                     # Post-process the cells
-                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+                    self.post_process_cells(all_ocr_cells, page)
                 # DEBUG code:
                 if settings.debug.visualize_ocr:

docling/models/readingorder_model.py CHANGED Viewed

@@ -124,7 +124,7 @@ class ReadingOrderModel:
             page_no = page.page_no + 1
             size = page.size
-            assert size is not None
+            assert size is not None, "Page size is not initialized."
             out_doc.add_page(page_no=page_no, size=size)
@@ -334,12 +334,12 @@ class ReadingOrderModel:
             "Labels of merged elements must match."
         )
         prov = ProvenanceItem(
-            page_no=element.page_no + 1,
+            page_no=merged_elem.page_no + 1,
             charspan=(
                 len(new_item.text) + 1,
                 len(new_item.text) + 1 + len(merged_elem.text),
             ),
-            bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
+            bbox=merged_elem.cluster.bbox.to_bottom_left_origin(page_height),
         )
         new_item.text += f" {merged_elem.text}"
         new_item.orig += f" {merged_elem.text}"  # TODO: This is incomplete, we don't have the `orig` field of the merged element.

docling/models/tesseract_ocr_cli_model.py CHANGED Viewed

@@ -99,12 +99,12 @@ class TesseractOcrCliModel(BaseOcrModel):
         return name, version
-    def _run_tesseract(self, ifilename: str, osd: pd.DataFrame):
+    def _run_tesseract(self, ifilename: str, osd: Optional[pd.DataFrame]):
         r"""
         Run tesseract CLI
         """
         cmd = [self.options.tesseract_cmd]
-        if self._is_auto:
+        if self._is_auto and osd is not None:
             lang = self._parse_language(osd)
             if lang is not None:
                 cmd.append("-l")
@@ -231,6 +231,7 @@ class TesseractOcrCliModel(BaseOcrModel):
                                 fname = image_file.name
                                 high_res_image.save(image_file)
                             doc_orientation = 0
+                            df_osd: Optional[pd.DataFrame] = None
                             try:
                                 df_osd = self._perform_osd(fname)
                                 doc_orientation = _parse_orientation(df_osd)
@@ -305,7 +306,7 @@ class TesseractOcrCliModel(BaseOcrModel):
                             all_ocr_cells.append(cell)
                     # Post-process the cells
-                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+                    self.post_process_cells(all_ocr_cells, page)
                 # DEBUG code:
                 if settings.debug.visualize_ocr:

docling/models/tesseract_ocr_model.py CHANGED Viewed

@@ -235,7 +235,7 @@ class TesseractOcrModel(BaseOcrModel):
                         all_ocr_cells.extend(cells)
                     # Post-process the cells
-                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+                    self.post_process_cells(all_ocr_cells, page)
                 # DEBUG code:
                 if settings.debug.visualize_ocr:

docling/models/vlm_models_inline/hf_transformers_model.py CHANGED Viewed

@@ -99,6 +99,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
             self.vlm_model = model_cls.from_pretrained(
                 artifacts_path,
                 device_map=self.device,
+                torch_dtype=self.vlm_options.torch_dtype,
                 _attn_implementation=(
                     "flash_attention_2"
                     if self.device.startswith("cuda")
@@ -122,7 +123,9 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                 with TimeRecorder(conv_res, "vlm"):
                     assert page.size is not None
-                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
+                    hi_res_image = page.get_image(
+                        scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+                    )
                     # Define prompt structure
                     prompt = self.formulate_prompt()

docling/models/vlm_models_inline/mlx_model.py CHANGED Viewed

@@ -73,7 +73,9 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                 with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
                     assert page.size is not None
-                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
+                    hi_res_image = page.get_image(
+                        scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+                    )
                     if hi_res_image is not None:
                         im_width, im_height = hi_res_image.size

docling/pipeline/asr_pipeline.py ADDED Viewed

@@ -0,0 +1,253 @@
+import logging
+import os
+import re
+from io import BytesIO
+from pathlib import Path
+from typing import List, Optional, Union, cast
+from docling_core.types.doc import DoclingDocument, DocumentOrigin
+# import whisper  # type: ignore
+# import librosa
+# import numpy as np
+# import soundfile as sf  # type: ignore
+from docling_core.types.doc.labels import DocItemLabel
+from pydantic import BaseModel, Field, validator
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.noop_backend import NoOpBackend
+# from pydub import AudioSegment  # type: ignore
+# from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
+from docling.datamodel.accelerator_options import (
+    AcceleratorOptions,
+)
+from docling.datamodel.base_models import (
+    ConversionStatus,
+    FormatToMimeType,
+)
+from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.pipeline_options import (
+    AsrPipelineOptions,
+)
+from docling.datamodel.pipeline_options_asr_model import (
+    InlineAsrNativeWhisperOptions,
+    # AsrResponseFormat,
+    InlineAsrOptions,
+)
+from docling.datamodel.pipeline_options_vlm_model import (
+    InferenceFramework,
+)
+from docling.datamodel.settings import settings
+from docling.pipeline.base_pipeline import BasePipeline
+from docling.utils.accelerator_utils import decide_device
+from docling.utils.profiling import ProfilingScope, TimeRecorder
+_log = logging.getLogger(__name__)
+class _ConversationWord(BaseModel):
+    text: str
+    start_time: Optional[float] = Field(
+        None, description="Start time in seconds from video start"
+    )
+    end_time: Optional[float] = Field(
+        None, ge=0, description="End time in seconds from video start"
+    )
+class _ConversationItem(BaseModel):
+    text: str
+    start_time: Optional[float] = Field(
+        None, description="Start time in seconds from video start"
+    )
+    end_time: Optional[float] = Field(
+        None, ge=0, description="End time in seconds from video start"
+    )
+    speaker_id: Optional[int] = Field(None, description="Numeric speaker identifier")
+    speaker: Optional[str] = Field(
+        None, description="Speaker name, defaults to speaker-{speaker_id}"
+    )
+    words: Optional[list[_ConversationWord]] = Field(
+        None, description="Individual words with time-stamps"
+    )
+    def __lt__(self, other):
+        if not isinstance(other, _ConversationItem):
+            return NotImplemented
+        return self.start_time < other.start_time
+    def __eq__(self, other):
+        if not isinstance(other, _ConversationItem):
+            return NotImplemented
+        return self.start_time == other.start_time
+    def to_string(self) -> str:
+        """Format the conversation entry as a string"""
+        result = ""
+        if (self.start_time is not None) and (self.end_time is not None):
+            result += f"[time: {self.start_time}-{self.end_time}] "
+        if self.speaker is not None:
+            result += f"[speaker:{self.speaker}] "
+        result += self.text
+        return result
+class _NativeWhisperModel:
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        accelerator_options: AcceleratorOptions,
+        asr_options: InlineAsrNativeWhisperOptions,
+    ):
+        """
+        Transcriber using native Whisper.
+        """
+        self.enabled = enabled
+        _log.info(f"artifacts-path: {artifacts_path}")
+        _log.info(f"accelerator_options: {accelerator_options}")
+        if self.enabled:
+            try:
+                import whisper  # type: ignore
+            except ImportError:
+                raise ImportError(
+                    "whisper is not installed. Please install it via `pip install openai-whisper` or do `uv sync --extra asr`."
+                )
+            self.asr_options = asr_options
+            self.max_tokens = asr_options.max_new_tokens
+            self.temperature = asr_options.temperature
+            self.device = decide_device(
+                accelerator_options.device,
+                supported_devices=asr_options.supported_devices,
+            )
+            _log.info(f"Available device for Whisper: {self.device}")
+            self.model_name = asr_options.repo_id
+            _log.info(f"loading _NativeWhisperModel({self.model_name})")
+            if artifacts_path is not None:
+                _log.info(f"loading {self.model_name} from {artifacts_path}")
+                self.model = whisper.load_model(
+                    name=self.model_name,
+                    device=self.device,
+                    download_root=str(artifacts_path),
+                )
+            else:
+                self.model = whisper.load_model(
+                    name=self.model_name, device=self.device
+                )
+            self.verbose = asr_options.verbose
+            self.timestamps = asr_options.timestamps
+            self.word_timestamps = asr_options.word_timestamps
+    def run(self, conv_res: ConversionResult) -> ConversionResult:
+        audio_path: Path = Path(conv_res.input.file).resolve()
+        try:
+            conversation = self.transcribe(audio_path)
+            # Ensure we have a proper DoclingDocument
+            origin = DocumentOrigin(
+                filename=conv_res.input.file.name or "audio.wav",
+                mimetype="audio/x-wav",
+                binary_hash=conv_res.input.document_hash,
+            )
+            conv_res.document = DoclingDocument(
+                name=conv_res.input.file.stem or "audio.wav", origin=origin
+            )
+            for citem in conversation:
+                conv_res.document.add_text(
+                    label=DocItemLabel.TEXT, text=citem.to_string()
+                )
+            conv_res.status = ConversionStatus.SUCCESS
+            return conv_res
+        except Exception as exc:
+            _log.error(f"Audio tranciption has an error: {exc}")
+        conv_res.status = ConversionStatus.FAILURE
+        return conv_res
+    def transcribe(self, fpath: Path) -> list[_ConversationItem]:
+        result = self.model.transcribe(
+            str(fpath), verbose=self.verbose, word_timestamps=self.word_timestamps
+        )
+        convo: list[_ConversationItem] = []
+        for _ in result["segments"]:
+            item = _ConversationItem(
+                start_time=_["start"], end_time=_["end"], text=_["text"], words=[]
+            )
+            if "words" in _ and self.word_timestamps:
+                item.words = []
+                for __ in _["words"]:
+                    item.words.append(
+                        _ConversationWord(
+                            start_time=__["start"],
+                            end_time=__["end"],
+                            text=__["word"],
+                        )
+                    )
+            convo.append(item)
+        return convo
+class AsrPipeline(BasePipeline):
+    def __init__(self, pipeline_options: AsrPipelineOptions):
+        super().__init__(pipeline_options)
+        self.keep_backend = True
+        self.pipeline_options: AsrPipelineOptions = pipeline_options
+        artifacts_path: Optional[Path] = None
+        if pipeline_options.artifacts_path is not None:
+            artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
+        elif settings.artifacts_path is not None:
+            artifacts_path = Path(settings.artifacts_path).expanduser()
+        if artifacts_path is not None and not artifacts_path.is_dir():
+            raise RuntimeError(
+                f"The value of {artifacts_path=} is not valid. "
+                "When defined, it must point to a folder containing all models required by the pipeline."
+            )
+        if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions):
+            asr_options: InlineAsrNativeWhisperOptions = (
+                self.pipeline_options.asr_options
+            )
+            self._model = _NativeWhisperModel(
+                enabled=True,  # must be always enabled for this pipeline to make sense.
+                artifacts_path=artifacts_path,
+                accelerator_options=pipeline_options.accelerator_options,
+                asr_options=asr_options,
+            )
+        else:
+            _log.error(f"No model support for {self.pipeline_options.asr_options}")
+    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
+        status = ConversionStatus.SUCCESS
+        return status
+    @classmethod
+    def get_default_options(cls) -> AsrPipelineOptions:
+        return AsrPipelineOptions()
+    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
+        _log.info(f"start _build_document in AsrPipeline: {conv_res.input.file}")
+        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
+            self._model.run(conv_res=conv_res)
+        return conv_res
+    @classmethod
+    def is_backend_supported(cls, backend: AbstractDocumentBackend):
+        return isinstance(backend, NoOpBackend)

docling/pipeline/base_pipeline.py CHANGED Viewed

@@ -193,6 +193,17 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                 )
                 raise e
+            # Filter out uninitialized pages (those with size=None) that may remain
+            # after timeout or processing failures to prevent assertion errors downstream
+            initial_page_count = len(conv_res.pages)
+            conv_res.pages = [page for page in conv_res.pages if page.size is not None]
+            if len(conv_res.pages) < initial_page_count:
+                _log.info(
+                    f"Filtered out {initial_page_count - len(conv_res.pages)} uninitialized pages "
+                    f"due to timeout or processing failures"
+                )
         return conv_res
     def _unload(self, conv_res: ConversionResult) -> ConversionResult:

docling/pipeline/standard_pdf_pipeline.py CHANGED Viewed

@@ -72,7 +72,6 @@ class StandardPdfPipeline(PaginatedPipeline):
             PagePreprocessingModel(
                 options=PagePreprocessingOptions(
                     images_scale=pipeline_options.images_scale,
-                    create_parsed_page=pipeline_options.generate_parsed_pages,
                 )
             ),
             # OCR

docling/utils/layout_postprocessor.py CHANGED Viewed

@@ -8,7 +8,7 @@ from docling_core.types.doc import DocItemLabel, Size
 from docling_core.types.doc.page import TextCell
 from rtree import index
-from docling.datamodel.base_models import BoundingBox, Cluster
+from docling.datamodel.base_models import BoundingBox, Cluster, Page
 _log = logging.getLogger(__name__)
@@ -194,11 +194,11 @@ class LayoutPostprocessor:
         DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
     }
-    def __init__(self, cells: List[TextCell], clusters: List[Cluster], page_size: Size):
-        """Initialize processor with cells and clusters."""
-        """Initialize processor with cells and spatial indices."""
-        self.cells = cells
-        self.page_size = page_size
+    def __init__(self, page: Page, clusters: List[Cluster]) -> None:
+        """Initialize processor with page and clusters."""
+        self.cells = page.cells
+        self.page = page
+        self.page_size = page.size
         self.all_clusters = clusters
         self.regular_clusters = [
             c for c in clusters if c.label not in self.SPECIAL_TYPES
@@ -240,6 +240,10 @@ class LayoutPostprocessor:
             for child in cluster.children:
                 child.cells = self._sort_cells(child.cells)
+        assert self.page.parsed_page is not None
+        self.page.parsed_page.textline_cells = self.cells
+        self.page.parsed_page.has_lines = len(self.cells) > 0
         return final_clusters, self.cells
     def _process_regular_clusters(self) -> List[Cluster]:
@@ -301,6 +305,7 @@ class LayoutPostprocessor:
         special_clusters = self._handle_cross_type_overlaps(special_clusters)
         # Calculate page area from known page size
+        assert self.page_size is not None
         page_area = self.page_size.width * self.page_size.height
         if page_area > 0:
             # Filter out full-page pictures

{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.36.1
+Version: 2.38.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT
@@ -61,6 +61,8 @@ Requires-Dist: mlx-vlm>=0.1.22; (python_version >= "3.10" and sys_platform == "d
 Provides-Extra: rapidocr
 Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
 Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
+Provides-Extra: asr
+Requires-Dist: openai-whisper>=20240930; extra == "asr"
 Dynamic: license-file
 <p align="center">
@@ -93,14 +95,15 @@ Docling simplifies document processing, parsing diverse formats — including ad
 ## Features
-* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more
+* 🗂️  Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
 * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
 * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
-* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON
+* ↪️  Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
 * 🔒 Local execution capabilities for sensitive data and air-gapped environments
 * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
 * 🔍 Extensive OCR support for scanned PDFs and images
-* 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
+* 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
+* 🎙️  Support for Audio with Automatic Speech Recognition (ASR) models
 * 💻 Simple and convenient CLI
 ### Coming soon

{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/RECORD RENAMED Viewed

@@ -1,21 +1,22 @@
 docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/document_converter.py,sha256=bnUA9k1LCuCfNwCsneGQiGCvFdnX8W-vbpnu6U_fuuI,14003
+docling/document_converter.py,sha256=3jWywP_TLy-1PMvjJBUlnTM9FNzpBLRCHYA1RKFvGR4,14333
 docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
 docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/abstract_backend.py,sha256=1lNxzwDTn303aXduPDVmTyXn-5ZIoWMLYqNxANGWmQQ,1658
-docling/backend/asciidoc_backend.py,sha256=W-4MRcID6AU9Ax23q8FwDwGG-OOCrBoqcNf2Ch_WPUc,14041
+docling/backend/asciidoc_backend.py,sha256=RDNLrPJHxROiM7-NQdZn3DdvAyiPAndbSWcZo9PbCKU,14417
 docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE0,4536
-docling/backend/docling_parse_backend.py,sha256=bVSPmmiVXdCVfe-eLtDhbPQKBjkFR8rZJoRxdWIMdYU,7998
-docling/backend/docling_parse_v2_backend.py,sha256=R4YPCEs72GYg-Xc9VfizPv8QjtGmKOsQzVPNAU2RIK0,9376
-docling/backend/docling_parse_v4_backend.py,sha256=aWh-fd-lnuRGVGC_DG17QUptIsArv5V1gJo8QFbB5Ys,6263
+docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
+docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
+docling/backend/docling_parse_v4_backend.py,sha256=7tQvpCwpYoq98PNszDkrXaFhy5eWmQqMP4RjWWPLPgw,6197
 docling/backend/html_backend.py,sha256=3K-l5SUAAyqISNEb7nPst_I51xzYOVOkgmwXh3lv9sw,21063
-docling/backend/md_backend.py,sha256=JkY1qTvQFXjKSZGfD-83d-fZelorUG_l6mpJdYGqvX8,17210
+docling/backend/md_backend.py,sha256=ghIU_NSaENKrRu49Dn5GvjYtcAgEU7ZHbf-TeYg49nY,17673
 docling/backend/msexcel_backend.py,sha256=3j0WQfqDpgPXdPMCguefdv7arcNVDedPD6gl54cmLn8,18110
-docling/backend/mspowerpoint_backend.py,sha256=RwqfvvzrtM56L9uf7PR9lvlHJ-LyYGpkS1iVxkTl72Q,17203
-docling/backend/msword_backend.py,sha256=iB2yRg8hXtET2-Wjkv5pq0p9Y1SGQYIVCcWtOtXUILU,44621
+docling/backend/mspowerpoint_backend.py,sha256=0lsb8ZeQFxbDt7jZpSQyk5wYHYa3SP2T2y2dMI-o30o,15216
+docling/backend/msword_backend.py,sha256=C4qs4mQEt1JzonCg5v6_yUxdngzcTzSO9k1ik8_DW5Q,44855
+docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
 docling/backend/pdf_backend.py,sha256=KE9TMuFO5WX-o5A_DAd4tEaLi4HMZ4XjKdpllItVkWM,2238
-docling/backend/pypdfium2_backend.py,sha256=fUGRBupwTYftEgdIDWKphA2zdfb-SrUoUGENK6j-q-0,11002
+docling/backend/pypdfium2_backend.py,sha256=8dVniLHgiTdJuDbYr66kPp6Ccv5ZDlqDMEbA2xIfS7U,13370
 docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/docx/latex/latex_dict.py,sha256=tFJp4ScT_AkY2ON7nLEa560p601Jq2glcZvMKxxjn7w,6593
@@ -27,36 +28,38 @@ docling/backend/xml/jats_backend.py,sha256=ghGi9bHjx3BvaOtmzLw86-wZy4UxpQPOPQL4e
 docling/backend/xml/uspto_backend.py,sha256=nyAMr5ht7dclxkVDwsKNeiOhLQrUtRLS8JdscB2AVJg,70924
 docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
 docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/cli/main.py,sha256=fDGjepShl6KO_BdA6qUNyNBoCjqZUKRnmmkzesGtvVU,27202
+docling/cli/main.py,sha256=D2gEoArnQ2yQ9BesH9CkxZbYQyhZRGgjjNWYqmRRUtU,29617
 docling/cli/models.py,sha256=9yLGp6QRJGpR86U3SjmWAXDt3MvBaJLLY4xDVdsu3O8,4160
 docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/datamodel/accelerator_options.py,sha256=wv6dOFTVAwr9onkE-0pfUqX_fDb6gX53iPPE6o8nKjI,2511
-docling/datamodel/base_models.py,sha256=bkooSG4brZy2jt2dndkin3DHvfZ5HFp0C94yBGmCWeI,10568
-docling/datamodel/document.py,sha256=vPwiVU5zWCKbVYMq-TSmb7LTjijrqJq0FyAgDBa0XGA,16154
-docling/datamodel/pipeline_options.py,sha256=iMuwsa77hkAgjJWXBRAFEQGw9tGNMDQrPnSvE5mirNs,9081
-docling/datamodel/pipeline_options_vlm_model.py,sha256=-ZPAp2uSKMatDbjZPv9chT587B1aftfDVmi_FDb2aw8,1997
+docling/datamodel/asr_model_specs.py,sha256=L7ETXsUKVbPsVcPLhEIMxQjd4UzMGZBVsy74CLsZBkU,2181
+docling/datamodel/base_models.py,sha256=L35qXLmADZQNEzBC0M6K2xrfLyqrTqDlbPD6E6DkWMc,11146
+docling/datamodel/document.py,sha256=CA_dgt4V_phze5HXpfgfKNBKd1cPC1o3WE_IENX63EM,16252
+docling/datamodel/pipeline_options.py,sha256=N7my7hmvuX6EzlujHeF6RObPSrG_HjN_nfPzILTqP-E,9479
+docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
+docling/datamodel/pipeline_options_vlm_model.py,sha256=rtDMVtKFZbgQD269w8FvHMXEhdRBrsA4rVYk6A-M-b4,2063
 docling/datamodel/settings.py,sha256=ajMz7Ao2m0ZGYkfArqTDDbiF89O408mtgeh06PUi0MA,1900
 docling/datamodel/vlm_model_specs.py,sha256=--jZexGeu-s_lWp7y_WwWEf6CD1J4XqADrS1-OY_pWM,4737
 docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/models/api_vlm_model.py,sha256=w3P1wOsr3JvZsawbK1Z4uwnD5ehUMbcKGkyhcX83Okc,2738
-docling/models/base_model.py,sha256=Zx_nByGYkubTvvYiQxwiB6P8lc7wOD4ZTC2QIw6vCEg,2950
-docling/models/base_ocr_model.py,sha256=c6a2QzZnAMfQECQDz1JASecl_Z2F3i6P3ax6kHWcz6o,7221
+docling/models/api_vlm_model.py,sha256=GDDJGAia4SJjK7JFxsZy5oEU-D8yQo8Kb3NvvPbTvT0,2820
+docling/models/base_model.py,sha256=NNjIapqCruAEAWR-CCdsNgXc2QkwiPYAcaQ_ZYe1W28,2978
+docling/models/base_ocr_model.py,sha256=HtrefTq9Zy4UnUInMchPv0tbobiA7CQU5VUauKJD7IU,8006
 docling/models/code_formula_model.py,sha256=5uWh-eI-Ejmv3DujKJoKKgJBuvPLokt7AJ_ybt8VHEw,11373
 docling/models/document_picture_classifier.py,sha256=fkJLV7pMy3v6iNwOzVb6zdBU1dGtBM1ARHLIRPfoAG4,6124
-docling/models/easyocr_model.py,sha256=bTK-AQYc-WTzX8SRoMRwVjqlMigaJKGloaLUcH6RCKU,7406
-docling/models/layout_model.py,sha256=KdGhS4EMWKP6BwlhUJ0mdbhk2Fc78qwzqEZbTxyrbFM,8508
-docling/models/ocr_mac_model.py,sha256=CJOwz9h84crvZd3kQMLxYntpXz-1w2eLDjhGUnGIwMQ,5415
+docling/models/easyocr_model.py,sha256=ECPBd-48cCw5s935NsPJO_C_1QuK_yAUGloMM77WqIM,7387
+docling/models/layout_model.py,sha256=EJuRXW0rFdnNPS5AifdEsr812EATUqAioeMCVjw8PL0,8460
+docling/models/ocr_mac_model.py,sha256=y-1DSFDbACHpEwNTfQwzN9ab8r5j5rBFNPtQ48BzsrA,5396
 docling/models/page_assemble_model.py,sha256=TvN1naez7dUodLxpUUBzpuMCpqZBTf6YSpewxgjzmrg,6323
-docling/models/page_preprocessing_model.py,sha256=8cdhR9n3zcC8JxDen8WdPBx_GNk_5VICeHJo1-kP518,5186
+docling/models/page_preprocessing_model.py,sha256=x8MI4mvjizqEqAb5511dtrNRCJSb-lSmwHw0tmHPFiI,5103
 docling/models/picture_description_api_model.py,sha256=o3EkV5aHW_6WzE_fdj_VRnNCrS_btclO_ZCLAUqrfl0,2377
 docling/models/picture_description_base_model.py,sha256=kLthLhdlgwhootQ4_xhhcAk6A-vso5-qcsFJ3TcYfO0,2991
 docling/models/picture_description_vlm_model.py,sha256=7LeCx9ZdPxsmWJ468OtxCdAkH48A1HD0iwH9cs_7-1Q,3800
-docling/models/rapid_ocr_model.py,sha256=miTPn1YTWKtXUuddiVv0SjgkuNWHXCW3CZ6epDUmKjI,5935
-docling/models/readingorder_model.py,sha256=S9ru2ApY9sE-Uue3hptWHmbmElwo36bUbAikxCFpHYs,14574
+docling/models/rapid_ocr_model.py,sha256=AMdc66s_iWO4p6nQ0LNjQMUYVxrDSxMyLNPpjPYt6N8,5916
+docling/models/readingorder_model.py,sha256=QHb5fyiqmxU8lg4W5IzdukqHPh6V7rNw_57O4-z-Az4,14615
 docling/models/table_structure_model.py,sha256=dQf6u_zn5fHCkHzmTwYfCbRtZCBddsyAM0WNVBUUQzk,12473
-docling/models/tesseract_ocr_cli_model.py,sha256=oQZVWXQ6wRrFonRFwbWeW9nJ9FLQZdzSWErOp0mEff0,12698
-docling/models/tesseract_ocr_model.py,sha256=AjrZNwgVbV0IbzBJwI35YP0KxvqWJWJE0v_lgHJiQrk,10606
+docling/models/tesseract_ocr_cli_model.py,sha256=qcM3-n7Z_dm1CGBhVUcNr2XT41iXnU32zk4RqKHBl9I,12775
+docling/models/tesseract_ocr_model.py,sha256=9DPAE7XP7smej7HYhr7mdwpuxSjAcv_GPrYZG3bb1RA,10587
 docling/models/factories/__init__.py,sha256=x_EM5dDg_A3HBcBYzOoqwmA2AFLtJ1IzYDPX-R1A-Sg,868
 docling/models/factories/base_factory.py,sha256=MfWIljMETi5aaVR-6qLTelW8u1gwDAQsOwg3fu7O4Qc,4028
 docling/models/factories/ocr_factory.py,sha256=G5RkmkKvkl-ihpo6qSj8WC77VdlVSQ1s0ekwUX2ILts,316
@@ -66,19 +69,20 @@ docling/models/plugins/defaults.py,sha256=qslXGnRX07Z3GGttNriqaox0v0vXp4zs4KLurH
 docling/models/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/models/utils/hf_model_download.py,sha256=scBEfsM4yl7xPzqe7UtPvDh9RfQZQnuOhqQKilYBHls,984
 docling/models/vlm_models_inline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/models/vlm_models_inline/hf_transformers_model.py,sha256=SXSu6spu8zNCsrD32RU_irLs59ltF6PqbLVfpjDujmE,7285
-docling/models/vlm_models_inline/mlx_model.py,sha256=CFe1UNxQufZd5K4iaOW3HsplQBPb_1cENf3KIwWUSWw,5702
+docling/models/vlm_models_inline/hf_transformers_model.py,sha256=w9_N4ccjmYYK5yYQou0LSMGaj6gs8l0hULvXbkfYXSQ,7425
+docling/models/vlm_models_inline/mlx_model.py,sha256=qpyi6fGHm0vPqW2yeTsRBKOTTshNJ1LAPbH1SBDp8Y8,5784
 docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/pipeline/base_pipeline.py,sha256=DnuxAf7EQusdSRae0QUVth-0f2mSff8JZjX-2vazk00,8751
+docling/pipeline/asr_pipeline.py,sha256=tQkhu9fXdkSuYIL22xzV2YRUlQh-9qktHBbs2qeXhJI,9070
+docling/pipeline/base_pipeline.py,sha256=14yQrDjsojl4RgbBjKFSEfVBYR_sULZfBI1uDzFLi8Y,9331
 docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
-docling/pipeline/standard_pdf_pipeline.py,sha256=itCZPj7nMFAQtAlStfmWthpCIHZFUm9W5uTgvVi6PkQ,12738
+docling/pipeline/standard_pdf_pipeline.py,sha256=2Hqg2wnAXfbZbLUOQrRus8PMEuZ549jR1mfR86-CAB4,12659
 docling/pipeline/vlm_pipeline.py,sha256=IrjDbajCPmUPep_jATKNiABST4tQ8mvpkQz9mtBQ8qQ,15279
 docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/utils/accelerator_utils.py,sha256=Fww4UiTiuIB91iuPgUZTy-DYpCGRMI8YuCYKhFb0gjA,2905
 docling/utils/api_image_request.py,sha256=_CgdzmPqdsyXmyYUFGLZcXcoH586qC6A1p5vsNbj1Q0,1416
 docling/utils/export.py,sha256=VwVUnYDk3mhGmISDbVm306fwpGNnoojouStBD4UajXI,4673
 docling/utils/glm_utils.py,sha256=TKOWQqWAHsX_w4fvoAA7_2xCi_urhnp1DsmjY8_sk5w,12274
-docling/utils/layout_postprocessor.py,sha256=3WCmkPsPJ80xfWzAUeWb5L9BmuwJ79ztctvbbUs8AfI,24068
+docling/utils/layout_postprocessor.py,sha256=laTPGGj-hv16Zh1TRcn8NK0POKs7d3jeaV1pRR_TjIU,24228
 docling/utils/locks.py,sha256=RzqQtD5UispgV71pGN_nU6GYfeN11BN0Sh_Dq9ycqGo,52
 docling/utils/model_downloader.py,sha256=6TDxFOvMRYT8JyYyaQS_wXMJzNga61ImY3sFdks66qM,4004
 docling/utils/ocr_utils.py,sha256=AOaDAHr5S74d-IRVR_LKhKynUTIurAwLJ3wNeY58gPA,2326
@@ -86,9 +90,9 @@ docling/utils/orientation.py,sha256=xXlOfowL54FKwjsTFrM7y3ogk1wChLNn_-u74tYIf1s,
 docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
 docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
 docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
-docling-2.36.1.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
-docling-2.36.1.dist-info/METADATA,sha256=0Sl0LfCopUXdEd6mm2kqRpMgFoq6nvZBUXlIKeIqY_E,10036
-docling-2.36.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-docling-2.36.1.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
-docling-2.36.1.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
-docling-2.36.1.dist-info/RECORD,,
+docling-2.38.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.38.0.dist-info/METADATA,sha256=vT8Zko4wD8iyKUjLAJ83Cm7ntscjEk5ojHvcJXlvT5A,10273
+docling-2.38.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+docling-2.38.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
+docling-2.38.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
+docling-2.38.0.dist-info/RECORD,,

{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

docling 2.36.1__py3-none-any.whl → 2.38.0__py3-none-any.whl

docling-2.36.1-py3-none-any.whl → docling-2.38.0-py3-none-any.whl