PyPI - docling - Versions diffs - 2.47.1__tar.gz → 2.49.0__tar.gz - Mend

docling 2.47.1__tar.gz → 2.49.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (144) hide show

{docling-2.47.1 → docling-2.49.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.47.1
+Version: 2.49.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT
@@ -51,6 +51,7 @@ Requires-Dist: pluggy<2.0.0,>=1.0.0
 Requires-Dist: pylatexenc<3.0,>=2.10
 Requires-Dist: scipy<2.0.0,>=1.6.0
 Requires-Dist: accelerate<2,>=1.0.0
+Requires-Dist: polyfactory>=2.22.2
 Provides-Extra: tesserocr
 Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
 Provides-Extra: ocrmac
@@ -60,9 +61,11 @@ Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
 Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
 Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
 Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and sys_platform == "linux" and platform_machine == "x86_64") and extra == "vlm"
+Requires-Dist: qwen-vl-utils>=0.0.11; extra == "vlm"
 Provides-Extra: rapidocr
-Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
+Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14" and extra == "rapidocr"
 Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
+Requires-Dist: modelscope>=1.29.0; extra == "rapidocr"
 Provides-Extra: asr
 Requires-Dist: openai-whisper>=20250625; extra == "asr"
 Dynamic: license-file

{docling-2.47.1 → docling-2.49.0}/docling/backend/html_backend.py RENAMED Viewed

@@ -1,5 +1,6 @@
 import logging
 import re
+import traceback
 from contextlib import contextmanager
 from copy import deepcopy
 from io import BytesIO
@@ -45,20 +46,22 @@ _BLOCK_TAGS: Final = {
     "h4",
     "h5",
     "h6",
+    "ol",
     "p",
     "pre",
-    "code",
-    "ul",
-    "ol",
     "summary",
     "table",
+    "ul",
 }
+_CODE_TAG_SET: Final = {"code", "kbd", "samp"}
 _FORMAT_TAG_MAP: Final = {
     "b": {"bold": True},
     "strong": {"bold": True},
     "i": {"italic": True},
     "em": {"italic": True},
+    "var": {"italic": True},
     # "mark",
     # "small",
     "s": {"strikethrough": True},
@@ -67,6 +70,7 @@ _FORMAT_TAG_MAP: Final = {
     "ins": {"underline": True},
     "sub": {"script": Script.SUB},
     "sup": {"script": Script.SUPER},
+    **{k: {} for k in _CODE_TAG_SET},
 }
@@ -79,6 +83,7 @@ class AnnotatedText(BaseModel):
     text: str
     hyperlink: Union[AnyUrl, Path, None] = None
     formatting: Union[Formatting, None] = None
+    code: bool = False
 class AnnotatedTextList(list):
@@ -86,10 +91,12 @@ class AnnotatedTextList(list):
         current_h = None
         current_text = ""
         current_f = None
+        current_code = False
         for at in self:
             t = at.text
             h = at.hyperlink
             f = at.formatting
+            c = at.code
             current_text += t.strip() + " "
             if f is not None and current_f is None:
                 current_f = f
@@ -103,8 +110,13 @@ class AnnotatedTextList(list):
                 _log.warning(
                     f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
                 )
+            current_code = c if c else current_code
         return AnnotatedText(
-            text=current_text.strip(), hyperlink=current_h, formatting=current_f
+            text=current_text.strip(),
+            hyperlink=current_h,
+            formatting=current_f,
+            code=current_code,
         )
     def simplify_text_elements(self) -> "AnnotatedTextList":
@@ -114,9 +126,14 @@ class AnnotatedTextList(list):
         text = self[0].text
         hyperlink = self[0].hyperlink
         formatting = self[0].formatting
+        code = self[0].code
         last_elm = text
         for i in range(1, len(self)):
-            if hyperlink == self[i].hyperlink and formatting == self[i].formatting:
+            if (
+                hyperlink == self[i].hyperlink
+                and formatting == self[i].formatting
+                and code == self[i].code
+            ):
                 sep = " "
                 if not self[i].text.strip() or not last_elm.strip():
                     sep = ""
@@ -124,15 +141,20 @@ class AnnotatedTextList(list):
                 last_elm = self[i].text
             else:
                 simplified.append(
-                    AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
+                    AnnotatedText(
+                        text=text, hyperlink=hyperlink, formatting=formatting, code=code
+                    )
                 )
                 text = self[i].text
                 last_elm = text
                 hyperlink = self[i].hyperlink
                 formatting = self[i].formatting
+                code = self[i].code
         if text:
             simplified.append(
-                AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
+                AnnotatedText(
+                    text=text, hyperlink=hyperlink, formatting=formatting, code=code
+                )
             )
         return simplified
@@ -174,7 +196,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         self.ctx = _Context()
         for i in range(self.max_levels):
             self.parents[i] = None
-        self.hyperlink = None
+        self.hyperlink: Union[AnyUrl, Path, None] = None
         self.original_url = original_url
         self.format_tags: list[str] = []
@@ -235,9 +257,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 orig=title_text,
                 content_layer=ContentLayer.FURNITURE,
             )
-        # remove scripts/styles
+        # remove script and style tags
         for tag in self.soup(["script", "style"]):
             tag.decompose()
+        # remove any hidden tag
+        for tag in self.soup(hidden=True):
+            tag.decompose()
         content = self.soup.body or self.soup
         # normalize <br> tags
         for br in content("br"):
@@ -268,7 +294,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         def flush_buffer():
             if not buffer:
                 return
-            annotated_text_list = buffer.simplify_text_elements()
+            annotated_text_list: AnnotatedTextList = buffer.simplify_text_elements()
             parts = annotated_text_list.split_by_newline()
             buffer.clear()
@@ -276,20 +302,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 return
             for annotated_text_list in parts:
-                with self.use_inline_group(annotated_text_list, doc):
+                with self._use_inline_group(annotated_text_list, doc):
                     for annotated_text in annotated_text_list:
                         if annotated_text.text.strip():
                             seg_clean = HTMLDocumentBackend._clean_unicode(
                                 annotated_text.text.strip()
                             )
-                            doc.add_text(
-                                parent=self.parents[self.level],
-                                label=DocItemLabel.TEXT,
-                                text=seg_clean,
-                                content_layer=self.content_layer,
-                                formatting=annotated_text.formatting,
-                                hyperlink=annotated_text.hyperlink,
-                            )
+                            if annotated_text.code:
+                                doc.add_code(
+                                    parent=self.parents[self.level],
+                                    text=seg_clean,
+                                    content_layer=self.content_layer,
+                                    formatting=annotated_text.formatting,
+                                    hyperlink=annotated_text.hyperlink,
+                                )
+                            else:
+                                doc.add_text(
+                                    parent=self.parents[self.level],
+                                    label=DocItemLabel.TEXT,
+                                    text=seg_clean,
+                                    content_layer=self.content_layer,
+                                    formatting=annotated_text.formatting,
+                                    hyperlink=annotated_text.hyperlink,
+                                )
         for node in element.contents:
             if isinstance(node, Tag):
@@ -298,10 +333,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     flush_buffer()
                     self._emit_image(node, doc)
                 elif name in _FORMAT_TAG_MAP:
-                    with self.use_format([name]):
+                    with self._use_format([name]):
                         self._walk(node, doc)
                 elif name == "a":
-                    with self.use_hyperlink(node):
+                    with self._use_hyperlink(node):
                         self._walk(node, doc)
                 elif name in _BLOCK_TAGS:
                     flush_buffer()
@@ -367,8 +402,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             this_parent = item.parent
             while this_parent is not None:
                 if this_parent.name == "a" and this_parent.get("href"):
-                    with self.use_format(format_tags):
-                        with self.use_hyperlink(this_parent):
+                    with self._use_format(format_tags):
+                        with self._use_hyperlink(this_parent):
                             return self._extract_text_and_hyperlink_recursively(
                                 item, ignore_list
                             )
@@ -379,6 +414,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         if isinstance(item, NavigableString):
             text = item.strip()
+            code = any(code_tag in self.format_tags for code_tag in _CODE_TAG_SET)
             if text:
                 return AnnotatedTextList(
                     [
@@ -386,6 +422,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                             text=text,
                             hyperlink=self.hyperlink,
                             formatting=self._formatting,
+                            code=code,
                         )
                     ]
                 )
@@ -396,6 +433,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                             text="\n",
                             hyperlink=self.hyperlink,
                             formatting=self._formatting,
+                            code=code,
                         )
                     ]
                 )
@@ -405,14 +443,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         if not ignore_list or (tag.name not in ["ul", "ol"]):
             for child in tag:
                 if isinstance(child, Tag) and child.name in _FORMAT_TAG_MAP:
-                    with self.use_format([child.name]):
+                    with self._use_format([child.name]):
                         result.extend(
                             self._extract_text_and_hyperlink_recursively(
                                 child, ignore_list, keep_newlines=keep_newlines
                             )
                         )
                 elif isinstance(child, Tag) and child.name == "a":
-                    with self.use_hyperlink(child):
+                    with self._use_hyperlink(child):
                         result.extend(
                             self._extract_text_and_hyperlink_recursively(
                                 child, ignore_list, keep_newlines=keep_newlines
@@ -428,29 +466,30 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         return result
     @contextmanager
-    def use_hyperlink(self, tag):
+    def _use_hyperlink(self, tag: Tag):
         this_href = tag.get("href")
         if this_href is None:
             yield None
         else:
-            if this_href:
-                old_hyperlink = self.hyperlink
+            if isinstance(this_href, str) and this_href:
+                old_hyperlink: Union[AnyUrl, Path, None] = self.hyperlink
+                new_hyperlink: Union[AnyUrl, Path, None] = None
                 if self.original_url is not None:
-                    this_href = urljoin(self.original_url, this_href)
+                    this_href = urljoin(str(self.original_url), str(this_href))
                 # ugly fix for relative links since pydantic does not support them.
                 try:
-                    AnyUrl(this_href)
+                    new_hyperlink = AnyUrl(this_href)
                 except ValidationError:
-                    this_href = Path(this_href)
-                self.hyperlink = this_href
+                    new_hyperlink = Path(this_href)
+                self.hyperlink = new_hyperlink
             try:
                 yield None
             finally:
-                if this_href:
+                if new_hyperlink:
                     self.hyperlink = old_hyperlink
     @contextmanager
-    def use_format(self, tags: list[str]):
+    def _use_format(self, tags: list[str]):
         if not tags:
             yield None
         else:
@@ -461,7 +500,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 self.format_tags = self.format_tags[: -len(tags)]
     @contextmanager
-    def use_inline_group(
+    def _use_inline_group(
         self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
     ):
         """Create an inline group for annotated texts.
@@ -473,9 +512,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         Args:
             annotated_text_list (AnnotatedTextList): Annotated text
             doc (DoclingDocument): Currently used document
-        Yields:
-            None: _description_
         """
         if len(annotated_text_list) > 1:
             inline_fmt = doc.add_group(
@@ -493,6 +529,57 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         else:
             yield None
+    @contextmanager
+    def _use_details(self, tag: Tag, doc: DoclingDocument):
+        """Create a group with the content of a details tag.
+        While the context manager is active, the hierarchy level is set one
+        level higher as the cuurent parent.
+        Args:
+            tag: The details tag.
+            doc: Currently used document.
+        """
+        self.parents[self.level + 1] = doc.add_group(
+            name=tag.name,
+            label=GroupLabel.SECTION,
+            parent=self.parents[self.level],
+            content_layer=self.content_layer,
+        )
+        self.level += 1
+        try:
+            yield None
+        finally:
+            self.parents[self.level + 1] = None
+            self.level -= 1
+    @contextmanager
+    def _use_footer(self, tag: Tag, doc: DoclingDocument):
+        """Create a group with a footer.
+        Create a group with the content of a footer tag. While the context manager
+        is active, the hierarchy level is set one level higher as the cuurent parent.
+        Args:
+            tag: The footer tag.
+            doc: Currently used document.
+        """
+        current_layer = self.content_layer
+        self.content_layer = ContentLayer.FURNITURE
+        self.parents[self.level + 1] = doc.add_group(
+            name=tag.name,
+            label=GroupLabel.SECTION,
+            parent=self.parents[self.level],
+            content_layer=self.content_layer,
+        )
+        self.level += 1
+        try:
+            yield None
+        finally:
+            self.parents[self.level + 1] = None
+            self.level -= 1
+            self.content_layer = current_layer
     def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
         tag_name = tag.name.lower()
         # set default content layer to BODY as soon as we encounter a heading
@@ -611,20 +698,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                             content_layer=self.content_layer,
                         )
                         self.level += 1
-                        with self.use_inline_group(min_parts, doc):
+                        with self._use_inline_group(min_parts, doc):
                             for annotated_text in min_parts:
                                 li_text = re.sub(
                                     r"\s+|\n+", " ", annotated_text.text
                                 ).strip()
                                 li_clean = HTMLDocumentBackend._clean_unicode(li_text)
-                                doc.add_text(
-                                    parent=self.parents[self.level],
-                                    label=DocItemLabel.TEXT,
-                                    text=li_clean,
-                                    content_layer=self.content_layer,
-                                    formatting=annotated_text.formatting,
-                                    hyperlink=annotated_text.hyperlink,
-                                )
+                                if annotated_text.code:
+                                    doc.add_code(
+                                        parent=self.parents[self.level],
+                                        text=li_clean,
+                                        content_layer=self.content_layer,
+                                        formatting=annotated_text.formatting,
+                                        hyperlink=annotated_text.hyperlink,
+                                    )
+                                else:
+                                    doc.add_text(
+                                        parent=self.parents[self.level],
+                                        label=DocItemLabel.TEXT,
+                                        text=li_clean,
+                                        content_layer=self.content_layer,
+                                        formatting=annotated_text.formatting,
+                                        hyperlink=annotated_text.hyperlink,
+                                    )
                         # 4) recurse into any nested lists, attaching them to this <li> item
                         for sublist in li({"ul", "ol"}, recursive=False):
@@ -687,20 +783,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             text_list = self._extract_text_and_hyperlink_recursively(
                 tag, find_parent_annotation=True
             )
-            annotated_texts = text_list.simplify_text_elements()
+            annotated_texts: AnnotatedTextList = text_list.simplify_text_elements()
             for part in annotated_texts.split_by_newline():
-                with self.use_inline_group(part, doc):
+                with self._use_inline_group(part, doc):
                     for annotated_text in part:
                         if seg := annotated_text.text.strip():
                             seg_clean = HTMLDocumentBackend._clean_unicode(seg)
-                            doc.add_text(
-                                parent=self.parents[self.level],
-                                label=DocItemLabel.TEXT,
-                                text=seg_clean,
-                                content_layer=self.content_layer,
-                                formatting=annotated_text.formatting,
-                                hyperlink=annotated_text.hyperlink,
-                            )
+                            if annotated_text.code:
+                                doc.add_code(
+                                    parent=self.parents[self.level],
+                                    text=seg_clean,
+                                    content_layer=self.content_layer,
+                                    formatting=annotated_text.formatting,
+                                    hyperlink=annotated_text.hyperlink,
+                                )
+                            else:
+                                doc.add_text(
+                                    parent=self.parents[self.level],
+                                    label=DocItemLabel.TEXT,
+                                    text=seg_clean,
+                                    content_layer=self.content_layer,
+                                    formatting=annotated_text.formatting,
+                                    hyperlink=annotated_text.hyperlink,
+                                )
             for img_tag in tag("img"):
                 if isinstance(img_tag, Tag):
@@ -718,13 +823,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     content_layer=self.content_layer,
                 )
-        elif tag_name in {"pre", "code"}:
+        elif tag_name in {"pre"}:
             # handle monospace code snippets (pre).
             text_list = self._extract_text_and_hyperlink_recursively(
-                tag, find_parent_annotation=True
+                tag, find_parent_annotation=True, keep_newlines=True
             )
             annotated_texts = text_list.simplify_text_elements()
-            with self.use_inline_group(annotated_texts, doc):
+            with self._use_inline_group(annotated_texts, doc):
                 for annotated_text in annotated_texts:
                     text_clean = HTMLDocumentBackend._clean_unicode(
                         annotated_text.text.strip()
@@ -737,22 +842,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                         hyperlink=annotated_text.hyperlink,
                     )
-        elif tag_name in {"details", "footer"}:
-            if tag_name == "footer":
-                current_layer = self.content_layer
-                self.content_layer = ContentLayer.FURNITURE
-            self.parents[self.level + 1] = doc.add_group(
-                name=tag_name,
-                label=GroupLabel.SECTION,
-                parent=self.parents[self.level],
-                content_layer=self.content_layer,
-            )
-            self.level += 1
-            self._walk(tag, doc)
-            self.parents[self.level + 1] = None
-            self.level -= 1
-            if tag_name == "footer":
-                self.content_layer = current_layer
+        elif tag_name == "footer":
+            with self._use_footer(tag, doc):
+                self._walk(tag, doc)
+        elif tag_name == "details":
+            with self._use_details(tag, doc):
+                self._walk(tag, doc)
     def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
         figure = img_tag.find_parent("figure")

{docling-2.47.1 → docling-2.49.0}/docling/backend/msexcel_backend.py RENAMED Viewed

@@ -1,10 +1,11 @@
 import logging
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Union, cast
+from typing import Any, Optional, Union, cast
 from docling_core.types.doc import (
     BoundingBox,
+    ContentLayer,
     CoordOrigin,
     DocItem,
     DoclingDocument,
@@ -197,6 +198,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
                     parent=None,
                     label=GroupLabel.SECTION,
                     name=f"sheet: {sheet_name}",
+                    content_layer=self._get_sheet_content_layer(sheet),
                 )
                 doc = self._convert_sheet(doc, sheet)
                 width, height = self._find_page_size(doc, page_no)
@@ -237,6 +239,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
         """
         if self.workbook is not None:
+            content_layer = self._get_sheet_content_layer(sheet)
             tables = self._find_data_tables(sheet)
             for excel_table in tables:
@@ -282,6 +285,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
                             origin=CoordOrigin.TOPLEFT,
                         ),
                     ),
+                    content_layer=content_layer,
                 )
         return doc
@@ -486,6 +490,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
             The updated DoclingDocument.
         """
         if self.workbook is not None:
+            content_layer = self._get_sheet_content_layer(sheet)
             # Iterate over byte images in the sheet
             for item in sheet._images:  # type: ignore[attr-defined]
                 try:
@@ -511,6 +516,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
                                 anchor, origin=CoordOrigin.TOPLEFT
                             ),
                         ),
+                        content_layer=content_layer,
                     )
                 except Exception:
                     _log.error("could not extract the image from excel sheets")
@@ -536,3 +542,11 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
                 bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b
         return (right - left, bottom - top)
+    @staticmethod
+    def _get_sheet_content_layer(sheet: Worksheet) -> Optional[ContentLayer]:
+        return (
+            None
+            if sheet.sheet_state == Worksheet.SHEETSTATE_VISIBLE
+            else ContentLayer.INVISIBLE
+        )

{docling-2.47.1 → docling-2.49.0}/docling/backend/pypdfium2_backend.py RENAMED Viewed

@@ -254,16 +254,38 @@ class PyPdfiumPageBackend(PdfPageBackend):
     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
         AREA_THRESHOLD = 0  # 32 * 32
         page_size = self.get_size()
+        rotation = self._ppage.get_rotation()
         with pypdfium2_lock:
             for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
                 pos = obj.get_pos()
+                if rotation == 90:
+                    pos = (
+                        pos[1],
+                        page_size.height - pos[2],
+                        pos[3],
+                        page_size.height - pos[0],
+                    )
+                elif rotation == 180:
+                    pos = (
+                        page_size.width - pos[2],
+                        page_size.height - pos[3],
+                        page_size.width - pos[0],
+                        page_size.height - pos[1],
+                    )
+                elif rotation == 270:
+                    pos = (
+                        page_size.width - pos[3],
+                        pos[0],
+                        page_size.width - pos[1],
+                        pos[2],
+                    )
                 cropbox = BoundingBox.from_tuple(
                     pos, origin=CoordOrigin.BOTTOMLEFT
                 ).to_top_left_origin(page_height=page_size.height)
                 if cropbox.area() > AREA_THRESHOLD:
                     cropbox = cropbox.scaled(scale=scale)
                     yield cropbox
     def get_text_in_rect(self, bbox: BoundingBox) -> str:

{docling-2.47.1 → docling-2.49.0}/docling/datamodel/base_models.py RENAMED Viewed

@@ -1,7 +1,7 @@
 import math
 from collections import defaultdict
 from enum import Enum
-from typing import TYPE_CHECKING, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union
 import numpy as np
 from docling_core.types.doc import (
@@ -32,6 +32,18 @@ from pydantic import (
 if TYPE_CHECKING:
     from docling.backend.pdf_backend import PdfPageBackend
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.datamodel.pipeline_options import PipelineOptions
+class BaseFormatOption(BaseModel):
+    """Base class for format options used by _DocumentConversionInput."""
+    pipeline_options: Optional[PipelineOptions] = None
+    backend: Type[AbstractDocumentBackend]
+    model_config = ConfigDict(arbitrary_types_allowed=True)
 class ConversionStatus(str, Enum):
     PENDING = "pending"

docling 2.47.1__tar.gz → 2.49.0__tar.gz

docling 2.47.1__tar.gz → 2.49.0__tar.gz