PyPI - docling - Versions diffs - 2.37.0__tar.gz → 2.38.0__tar.gz - Mend

docling 2.37.0__tar.gz → 2.38.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (132) hide show

{docling-2.37.0 → docling-2.38.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.37.0
+Version: 2.38.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT
@@ -61,6 +61,8 @@ Requires-Dist: mlx-vlm>=0.1.22; (python_version >= "3.10" and sys_platform == "d
 Provides-Extra: rapidocr
 Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
 Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
+Provides-Extra: asr
+Requires-Dist: openai-whisper>=20240930; extra == "asr"
 Dynamic: license-file
 <p align="center">
@@ -93,14 +95,15 @@ Docling simplifies document processing, parsing diverse formats — including ad
 ## Features
-* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more
+* 🗂️  Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
 * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
 * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
-* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON
+* ↪️  Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
 * 🔒 Local execution capabilities for sensitive data and air-gapped environments
 * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
 * 🔍 Extensive OCR support for scanned PDFs and images
-* 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
+* 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
+* 🎙️  Support for Audio with Automatic Speech Recognition (ASR) models
 * 💻 Simple and convenient CLI
 ### Coming soon

{docling-2.37.0 → docling-2.38.0}/README.md RENAMED Viewed

@@ -28,14 +28,15 @@ Docling simplifies document processing, parsing diverse formats — including ad
 ## Features
-* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more
+* 🗂️  Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
 * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
 * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
-* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON
+* ↪️  Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
 * 🔒 Local execution capabilities for sensitive data and air-gapped environments
 * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
 * 🔍 Extensive OCR support for scanned PDFs and images
-* 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
+* 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
+* 🎙️  Support for Audio with Automatic Speech Recognition (ASR) models
 * 💻 Simple and convenient CLI
 ### Coming soon

{docling-2.37.0 → docling-2.38.0}/docling/backend/md_backend.py RENAMED Viewed

@@ -1,17 +1,15 @@
 import logging
 import re
 import warnings
+from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
 from typing import List, Optional, Set, Union
 import marko
 import marko.element
-import marko.ext
-import marko.ext.gfm
 import marko.inline
 from docling_core.types.doc import (
-    DocItem,
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
@@ -21,7 +19,9 @@ from docling_core.types.doc import (
     TableData,
     TextItem,
 )
+from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
 from marko import Markdown
+from pydantic import AnyUrl, TypeAdapter
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
@@ -71,7 +71,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         self.in_table = False
         self.md_table_buffer: list[str] = []
-        self.inline_texts: list[str] = []
         self._html_blocks: int = 0
         try:
@@ -156,25 +155,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 doc.add_table(data=table_data)
         return
-    def _process_inline_text(
-        self, parent_item: Optional[NodeItem], doc: DoclingDocument
-    ):
-        txt = " ".join(self.inline_texts)
-        if len(txt) > 0:
-            doc.add_text(
-                label=DocItemLabel.PARAGRAPH,
-                parent=parent_item,
-                text=txt,
-            )
-        self.inline_texts = []
     def _iterate_elements(  # noqa: C901
         self,
+        *,
         element: marko.element.Element,
         depth: int,
         doc: DoclingDocument,
         visited: Set[marko.element.Element],
         parent_item: Optional[NodeItem] = None,
+        formatting: Optional[Formatting] = None,
+        hyperlink: Optional[Union[AnyUrl, Path]] = None,
     ):
         if element in visited:
             return
@@ -183,44 +173,32 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         # Check for different element types and process relevant details
         if isinstance(element, marko.block.Heading) and len(element.children) > 0:
             self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
             _log.debug(
                 f" - Heading level {element.level}, content: {element.children[0].children}"  # type: ignore
             )
+            if len(element.children) == 1:
+                child = element.children[0]
+                snippet_text = str(child.children)  # type: ignore
+                visited.add(child)
+            else:
+                snippet_text = ""  # inline group will be created
             if element.level == 1:
-                doc_label = DocItemLabel.TITLE
+                parent_item = doc.add_title(
+                    text=snippet_text,
+                    parent=parent_item,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
+                )
             else:
-                doc_label = DocItemLabel.SECTION_HEADER
-            # Header could have arbitrary inclusion of bold, italic or emphasis,
-            # hence we need to traverse the tree to get full text of a header
-            strings: List[str] = []
-            # Define a recursive function to traverse the tree
-            def traverse(node: marko.block.BlockElement):
-                # Check if the node has a "children" attribute
-                if hasattr(node, "children"):
-                    # If "children" is a list, continue traversal
-                    if isinstance(node.children, list):
-                        for child in node.children:
-                            traverse(child)
-                    # If "children" is text, add it to header text
-                    elif isinstance(node.children, str):
-                        strings.append(node.children)
-            traverse(element)
-            snippet_text = "".join(strings)
-            if len(snippet_text) > 0:
-                if doc_label == DocItemLabel.SECTION_HEADER:
-                    parent_item = doc.add_heading(
-                        text=snippet_text,
-                        level=element.level - 1,
-                        parent=parent_item,
-                    )
-                else:
-                    parent_item = doc.add_text(
-                        label=doc_label, parent=parent_item, text=snippet_text
-                    )
+                parent_item = doc.add_heading(
+                    text=snippet_text,
+                    level=element.level - 1,
+                    parent=parent_item,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
+                )
         elif isinstance(element, marko.block.List):
             has_non_empty_list_items = False
@@ -230,7 +208,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     break
             self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
             _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
             if has_non_empty_list_items:
                 label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
@@ -240,41 +217,60 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         elif (
             isinstance(element, marko.block.ListItem)
-            and len(element.children) > 0
-            and isinstance((first_child := element.children[0]), marko.block.Paragraph)
+            and len(element.children) == 1
+            and isinstance((child := element.children[0]), marko.block.Paragraph)
+            and len(child.children) > 0
         ):
             self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
             _log.debug(" - List item")
-            snippet_text = str(first_child.children[0].children)  # type: ignore
-            is_numbered = False
-            if (
-                parent_item is not None
-                and isinstance(parent_item, DocItem)
-                and parent_item.label == GroupLabel.ORDERED_LIST
-            ):
-                is_numbered = True
-            doc.add_list_item(
-                enumerated=is_numbered, parent=parent_item, text=snippet_text
+            if len(child.children) == 1:
+                snippet_text = str(child.children[0].children)  # type: ignore
+                visited.add(child)
+            else:
+                snippet_text = ""  # inline group will be created
+            is_numbered = isinstance(parent_item, OrderedList)
+            if not isinstance(parent_item, (OrderedList, UnorderedList)):
+                _log.warning("ListItem would have not had a list parent, adding one.")
+                parent_item = doc.add_unordered_list(parent=parent_item)
+            parent_item = doc.add_list_item(
+                enumerated=is_numbered,
+                parent=parent_item,
+                text=snippet_text,
+                formatting=formatting,
+                hyperlink=hyperlink,
             )
-            visited.add(first_child)
         elif isinstance(element, marko.inline.Image):
             self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
             fig_caption: Optional[TextItem] = None
             if element.title is not None and element.title != "":
                 fig_caption = doc.add_text(
-                    label=DocItemLabel.CAPTION, text=element.title
+                    label=DocItemLabel.CAPTION,
+                    text=element.title,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
                 )
             doc.add_picture(parent=parent_item, caption=fig_caption)
-        elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
-            self._process_inline_text(parent_item, doc)
+        elif isinstance(element, marko.inline.Emphasis):
+            _log.debug(f" - Emphasis: {element.children}")
+            formatting = deepcopy(formatting) if formatting else Formatting()
+            formatting.italic = True
+        elif isinstance(element, marko.inline.StrongEmphasis):
+            _log.debug(f" - StrongEmphasis: {element.children}")
+            formatting = deepcopy(formatting) if formatting else Formatting()
+            formatting.bold = True
+        elif isinstance(element, marko.inline.Link):
+            _log.debug(f" - Link: {element.children}")
+            hyperlink = TypeAdapter(Optional[Union[AnyUrl, Path]]).validate_python(
+                element.dest
+            )
         elif isinstance(element, marko.inline.RawText):
             _log.debug(f" - Paragraph (raw text): {element.children}")
@@ -287,28 +283,41 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
                 else:
                     self.md_table_buffer.append(snippet_text)
-            else:
+            elif snippet_text:
                 self._close_table(doc)
-                # most likely just inline text
-                self.inline_texts.append(str(element.children))
+                doc.add_text(
+                    label=DocItemLabel.TEXT,
+                    parent=parent_item,
+                    text=snippet_text,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
+                )
         elif isinstance(element, marko.inline.CodeSpan):
             self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Code Span: {element.children}")
             snippet_text = str(element.children).strip()
-            doc.add_code(parent=parent_item, text=snippet_text)
+            doc.add_code(
+                parent=parent_item,
+                text=snippet_text,
+                formatting=formatting,
+                hyperlink=hyperlink,
+            )
         elif (
             isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
             and len(element.children) > 0
-            and isinstance((first_child := element.children[0]), marko.inline.RawText)
-            and len(snippet_text := (first_child.children.strip())) > 0
+            and isinstance((child := element.children[0]), marko.inline.RawText)
+            and len(snippet_text := (child.children.strip())) > 0
         ):
             self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Code Block: {element.children}")
-            doc.add_code(parent=parent_item, text=snippet_text)
+            doc.add_code(
+                parent=parent_item,
+                text=snippet_text,
+                formatting=formatting,
+                hyperlink=hyperlink,
+            )
         elif isinstance(element, marko.inline.LineBreak):
             if self.in_table:
@@ -317,7 +326,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         elif isinstance(element, marko.block.HTMLBlock):
             self._html_blocks += 1
-            self._process_inline_text(parent_item, doc)
             self._close_table(doc)
             _log.debug(f"HTML Block: {element}")
             if (
@@ -327,14 +335,25 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 # wrap in markers to enable post-processing in convert()
                 text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
-                doc.add_code(parent=parent_item, text=text_to_add)
+                doc.add_code(
+                    parent=parent_item,
+                    text=text_to_add,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
+                )
         else:
             if not isinstance(element, str):
                 self._close_table(doc)
                 _log.debug(f"Some other element: {element}")
+        if (
+            isinstance(element, (marko.block.Paragraph, marko.block.Heading))
+            and len(element.children) > 1
+        ):
+            parent_item = doc.add_inline_group(parent=parent_item)
         processed_block_types = (
-            marko.block.Heading,
+            # marko.block.Heading,
             marko.block.CodeBlock,
             marko.block.FencedCode,
             marko.inline.RawText,
@@ -351,6 +370,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     doc=doc,
                     visited=visited,
                     parent_item=parent_item,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
                 )
     def is_valid(self) -> bool:
@@ -392,7 +413,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 parent_item=None,
                 visited=set(),
             )
-            self._process_inline_text(None, doc)  # handle last hanging inline text
             self._close_table(doc=doc)  # handle any last hanging table
             # if HTML blocks were detected, export to HTML and delegate to HTML backend

{docling-2.37.0 → docling-2.38.0}/docling/backend/msword_backend.py RENAMED Viewed

@@ -14,7 +14,7 @@ from docling_core.types.doc import (
     TableCell,
     TableData,
 )
-from docling_core.types.doc.document import Formatting
+from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
 from docx import Document
 from docx.document import Document as DocxDocument
 from docx.oxml.table import CT_Tc
@@ -84,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             self.valid = True
         except Exception as e:
             raise RuntimeError(
-                f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
+                f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
             ) from e
     @override
@@ -251,9 +251,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                     self._handle_tables(element, docx_obj, doc)
                 except Exception:
                     _log.debug("could not parse a table, broken docx table")
+            # Check for Image
             elif drawing_blip:
                 self._handle_pictures(docx_obj, drawing_blip, doc)
+                # Check for Text after the Image
+                if (
+                    tag_name in ["p"]
+                    and element.find(".//w:t", namespaces=namespaces) is not None
+                ):
+                    self._handle_text_elements(element, docx_obj, doc)
             # Check for the sdt containers, like table of contents
             elif tag_name in ["sdt"]:
                 sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@@ -268,6 +274,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 self._handle_text_elements(element, docx_obj, doc)
             else:
                 _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
         return doc
     def _str_to_int(
@@ -578,7 +585,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         all_paragraphs = []
         # Sort paragraphs within each container, then process containers
-        for container_id, paragraphs in container_paragraphs.items():
+        for paragraphs in container_paragraphs.values():
             # Sort by vertical position within each container
             sorted_container_paragraphs = sorted(
                 paragraphs,
@@ -689,14 +696,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         doc: DoclingDocument,
     ) -> None:
         paragraph = Paragraph(element, docx_obj)
+        paragraph_elements = self._get_paragraph_elements(paragraph)
         text, equations = self._handle_equations_in_text(
             element=element, text=paragraph.text
         )
         if text is None:
             return
-        paragraph_elements = self._get_paragraph_elements(paragraph)
         text = text.strip()
         # Common styles for bullet and numbered lists.
@@ -912,6 +918,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         )
         return
+    def _add_formatted_list_item(
+        self,
+        doc: DoclingDocument,
+        elements: list,
+        marker: str,
+        enumerated: bool,
+        level: int,
+    ) -> None:
+        # This should not happen by construction
+        if not isinstance(self.parents[level], (OrderedList, UnorderedList)):
+            return
+        if len(elements) == 1:
+            text, format, hyperlink = elements[0]
+            doc.add_list_item(
+                marker=marker,
+                enumerated=enumerated,
+                parent=self.parents[level],
+                text=text,
+                formatting=format,
+                hyperlink=hyperlink,
+            )
+        else:
+            new_item = doc.add_list_item(
+                marker=marker,
+                enumerated=enumerated,
+                parent=self.parents[level],
+                text="",
+            )
+            new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
+            for text, format, hyperlink in elements:
+                doc.add_text(
+                    label=DocItemLabel.TEXT,
+                    parent=new_parent,
+                    text=text,
+                    formatting=format,
+                    hyperlink=hyperlink,
+                )
     def _add_list_item(
         self,
         *,
@@ -921,6 +965,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         elements: list,
         is_numbered: bool = False,
     ) -> None:
+        # TODO: this method is always called with is_numbered. Numbered lists should be properly addressed.
+        if not elements:
+            return None
         enum_marker = ""
         level = self._get_level()
@@ -937,21 +984,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             if is_numbered:
                 enum_marker = str(self.listIter) + "."
                 is_numbered = True
-            new_parent = self._create_or_reuse_parent(
-                doc=doc,
-                prev_parent=self.parents[level],
-                paragraph_elements=elements,
+            self._add_formatted_list_item(
+                doc, elements, enum_marker, is_numbered, level
             )
-            for text, format, hyperlink in elements:
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_parent,
-                    text=text,
-                    formatting=format,
-                    hyperlink=hyperlink,
-                )
         elif (
             self._prev_numid() == numid
             and self.level_at_new_list is not None
@@ -981,28 +1016,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             if is_numbered:
                 enum_marker = str(self.listIter) + "."
                 is_numbered = True
-            new_parent = self._create_or_reuse_parent(
-                doc=doc,
-                prev_parent=self.parents[self.level_at_new_list + ilevel],
-                paragraph_elements=elements,
+            self._add_formatted_list_item(
+                doc,
+                elements,
+                enum_marker,
+                is_numbered,
+                self.level_at_new_list + ilevel,
             )
-            for text, format, hyperlink in elements:
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_parent,
-                    text=text,
-                    formatting=format,
-                    hyperlink=hyperlink,
-                )
         elif (
             self._prev_numid() == numid
             and self.level_at_new_list is not None
             and prev_indent is not None
             and ilevel < prev_indent
         ):  # Close list
-            for k, v in self.parents.items():
+            for k in self.parents:
                 if k > self.level_at_new_list + ilevel:
                     self.parents[k] = None
@@ -1011,20 +1038,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             if is_numbered:
                 enum_marker = str(self.listIter) + "."
                 is_numbered = True
-            new_parent = self._create_or_reuse_parent(
-                doc=doc,
-                prev_parent=self.parents[self.level_at_new_list + ilevel],
-                paragraph_elements=elements,
+            self._add_formatted_list_item(
+                doc,
+                elements,
+                enum_marker,
+                is_numbered,
+                self.level_at_new_list + ilevel,
             )
-            for text, format, hyperlink in elements:
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_parent,
-                    text=text,
-                    formatting=format,
-                    hyperlink=hyperlink,
-                )
             self.listIter = 0
         elif self._prev_numid() == numid or prev_indent == ilevel:
@@ -1033,21 +1053,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             if is_numbered:
                 enum_marker = str(self.listIter) + "."
                 is_numbered = True
-            new_parent = self._create_or_reuse_parent(
-                doc=doc,
-                prev_parent=self.parents[level - 1],
-                paragraph_elements=elements,
+            self._add_formatted_list_item(
+                doc, elements, enum_marker, is_numbered, level - 1
             )
-            for text, format, hyperlink in elements:
-                # Add the list item to the parent group
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_parent,
-                    text=text,
-                    formatting=format,
-                    hyperlink=hyperlink,
-                )
         return
     def _handle_tables(

docling-2.38.0/docling/backend/noop_backend.py ADDED Viewed

@@ -0,0 +1,51 @@
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import Set, Union
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+_log = logging.getLogger(__name__)
+class NoOpBackend(AbstractDocumentBackend):
+    """
+    A no-op backend that only validates input existence.
+    Used e.g. for audio files where actual processing is handled by the ASR pipeline.
+    """
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)
+        _log.debug(f"NoOpBackend initialized for: {path_or_stream}")
+        # Validate input
+        try:
+            if isinstance(self.path_or_stream, BytesIO):
+                # Check if stream has content
+                self.valid = len(self.path_or_stream.getvalue()) > 0
+                _log.debug(
+                    f"BytesIO stream length: {len(self.path_or_stream.getvalue())}"
+                )
+            elif isinstance(self.path_or_stream, Path):
+                # Check if file exists
+                self.valid = self.path_or_stream.exists()
+                _log.debug(f"File exists: {self.valid}")
+            else:
+                self.valid = False
+        except Exception as e:
+            _log.error(f"NoOpBackend validation failed: {e}")
+            self.valid = False
+    def is_valid(self) -> bool:
+        return self.valid
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        return False
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        return set(InputFormat)

docling 2.37.0__tar.gz → 2.38.0__tar.gz

docling 2.37.0__tar.gz → 2.38.0__tar.gz