PyPI - docling - Versions diffs - 2.15.0__tar.gz → 2.16.0__tar.gz - Mend

docling 2.15.0__tar.gz → 2.16.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61) hide show

{docling-2.15.0 → docling-2.16.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.15.0
+Version: 2.16.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -26,9 +26,9 @@ Provides-Extra: tesserocr
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
-Requires-Dist: docling-core[chunking] (>=2.13.1,<3.0.0)
-Requires-Dist: docling-ibm-models (>=3.1.0,<4.0.0)
-Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
+Requires-Dist: docling-core[chunking] (>=2.15.1,<3.0.0)
+Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
+Requires-Dist: docling-parse (>=3.1.0,<4.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -39,13 +39,14 @@ Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (ex
 Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
 Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
 Requires-Dist: pandas (>=2.1.4,<3.0.0)
+Requires-Dist: pillow (>=10.0.0,<11.0.0)
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)
 Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
 Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
 Requires-Dist: python-docx (>=1.1.2,<2.0.0)
 Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
 Requires-Dist: rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; (python_version < "3.13") and (extra == "rapidocr")
-Requires-Dist: requests (>=2.32.3,<3.0.0)
+Requires-Dist: requests (>=2.32.2,<3.0.0)
 Requires-Dist: rtree (>=1.3.0,<2.0.0)
 Requires-Dist: scipy (>=1.6.0,<2.0.0)
 Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
@@ -84,7 +85,7 @@ Docling parses documents and exports them to the desired format with ease and sp
 * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
 * 📑 Advanced PDF document understanding including page layout, reading order & table structures
 * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
-* 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
+* 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
 * 🔍 OCR support for scanned PDFs
 * 💻 Simple and convenient CLI
@@ -94,7 +95,6 @@ Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty
 * ♾️ Equation & code extraction
 * 📝 Metadata extraction, including title, authors, references & language
-* 🦜🔗 Native LangChain extension
 ## Installation

{docling-2.15.0 → docling-2.16.0}/README.md RENAMED Viewed

@@ -29,7 +29,7 @@ Docling parses documents and exports them to the desired format with ease and sp
 * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
 * 📑 Advanced PDF document understanding including page layout, reading order & table structures
 * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
-* 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
+* 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
 * 🔍 OCR support for scanned PDFs
 * 💻 Simple and convenient CLI
@@ -39,7 +39,6 @@ Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty
 * ♾️ Equation & code extraction
 * 📝 Metadata extraction, including title, authors, references & language
-* 🦜🔗 Native LangChain extension
 ## Installation

{docling-2.15.0 → docling-2.16.0}/docling/backend/abstract_backend.py RENAMED Viewed

@@ -27,7 +27,6 @@ class AbstractDocumentBackend(ABC):
     def supports_pagination(cls) -> bool:
         pass
-    @abstractmethod
     def unload(self):
         if isinstance(self.path_or_stream, BytesIO):
             self.path_or_stream.close()

{docling-2.15.0 → docling-2.16.0}/docling/backend/asciidoc_backend.py RENAMED Viewed

@@ -24,7 +24,6 @@ _log = logging.getLogger(__name__)
 class AsciiDocBackend(DeclarativeDocumentBackend):
     def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)

{docling-2.15.0 → docling-2.16.0}/docling/backend/docling_parse_backend.py RENAMED Viewed

@@ -132,7 +132,7 @@ class DoclingParsePageBackend(PdfPageBackend):
         return cells
     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
-        AREA_THRESHOLD = 32 * 32
+        AREA_THRESHOLD = 0  # 32 * 32
         for i in range(len(self._dpage["images"])):
             bitmap = self._dpage["images"][i]
@@ -163,7 +163,7 @@ class DoclingParsePageBackend(PdfPageBackend):
                 l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
             )
         else:
-            padbox = cropbox.to_bottom_left_origin(page_size.height)
+            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
             padbox.r = page_size.width - padbox.r
             padbox.t = page_size.height - padbox.t

{docling-2.15.0 → docling-2.16.0}/docling/backend/docling_parse_v2_backend.py RENAMED Viewed

@@ -140,7 +140,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
         return cells
     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
-        AREA_THRESHOLD = 32 * 32
+        AREA_THRESHOLD = 0  # 32 * 32
         images = self._dpage["sanitized"]["images"]["data"]
         images_header = self._dpage["sanitized"]["images"]["header"]
@@ -178,7 +178,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
                 l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
             )
         else:
-            padbox = cropbox.to_bottom_left_origin(page_size.height)
+            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
             padbox.r = page_size.width - padbox.r
             padbox.t = page_size.height - padbox.t

{docling-2.15.0 → docling-2.16.0}/docling/backend/html_backend.py RENAMED Viewed

@@ -215,7 +215,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         label = DocItemLabel.CODE
         if len(text) == 0:
             return
-        doc.add_text(parent=self.parents[self.level], label=label, text=text)
+        doc.add_code(parent=self.parents[self.level], label=label, text=text)
     def handle_paragraph(self, element, idx, doc):
         """Handles paragraph tags (p)."""

docling-2.16.0/docling/backend/json/docling_json_backend.py ADDED Viewed

@@ -0,0 +1,58 @@
+from io import BytesIO
+from pathlib import Path
+from typing import Union
+from docling_core.types.doc import DoclingDocument
+from typing_extensions import override
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+class DoclingJSONBackend(DeclarativeDocumentBackend):
+    @override
+    def __init__(
+        self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
+    ) -> None:
+        super().__init__(in_doc, path_or_stream)
+        # given we need to store any actual conversion exception for raising it from
+        # convert(), this captures the successful result or the actual error in a
+        # mutually exclusive way:
+        self._doc_or_err = self._get_doc_or_err()
+    @override
+    def is_valid(self) -> bool:
+        return isinstance(self._doc_or_err, DoclingDocument)
+    @classmethod
+    @override
+    def supports_pagination(cls) -> bool:
+        return False
+    @classmethod
+    @override
+    def supported_formats(cls) -> set[InputFormat]:
+        return {InputFormat.JSON_DOCLING}
+    def _get_doc_or_err(self) -> Union[DoclingDocument, Exception]:
+        try:
+            json_data: Union[str, bytes]
+            if isinstance(self.path_or_stream, Path):
+                with open(self.path_or_stream, encoding="utf-8") as f:
+                    json_data = f.read()
+            elif isinstance(self.path_or_stream, BytesIO):
+                json_data = self.path_or_stream.getvalue()
+            else:
+                raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}")
+            return DoclingDocument.model_validate_json(json_data=json_data)
+        except Exception as e:
+            return e
+    @override
+    def convert(self) -> DoclingDocument:
+        if isinstance(self._doc_or_err, DoclingDocument):
+            return self._doc_or_err
+        else:
+            raise self._doc_or_err

{docling-2.15.0 → docling-2.16.0}/docling/backend/md_backend.py RENAMED Viewed

@@ -3,19 +3,22 @@ import re
 import warnings
 from io import BytesIO
 from pathlib import Path
-from typing import Set, Union
+from typing import List, Optional, Set, Union
 import marko
 import marko.ext
 import marko.ext.gfm
 import marko.inline
 from docling_core.types.doc import (
+    DocItem,
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
     GroupLabel,
+    NodeItem,
     TableCell,
     TableData,
+    TextItem,
 )
 from marko import Markdown
@@ -27,8 +30,7 @@ _log = logging.getLogger(__name__)
 class MarkdownDocumentBackend(DeclarativeDocumentBackend):
-    def shorten_underscore_sequences(self, markdown_text, max_length=10):
+    def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
         # This regex will match any sequence of underscores
         pattern = r"_+"
@@ -90,13 +92,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             ) from e
         return
-    def close_table(self, doc=None):
+    def close_table(self, doc: DoclingDocument):
         if self.in_table:
             _log.debug("=== TABLE START ===")
             for md_table_row in self.md_table_buffer:
                 _log.debug(md_table_row)
             _log.debug("=== TABLE END ===")
-            tcells = []
+            tcells: List[TableCell] = []
             result_table = []
             for n, md_table_row in enumerate(self.md_table_buffer):
                 data = []
@@ -137,15 +139,19 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             self.in_table = False
             self.md_table_buffer = []  # clean table markdown buffer
             # Initialize Docling TableData
-            data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
+            table_data = TableData(
+                num_rows=num_rows, num_cols=num_cols, table_cells=tcells
+            )
             # Populate
             for tcell in tcells:
-                data.table_cells.append(tcell)
+                table_data.table_cells.append(tcell)
             if len(tcells) > 0:
-                doc.add_table(data=data)
+                doc.add_table(data=table_data)
         return
-    def process_inline_text(self, parent_element, doc=None):
+    def process_inline_text(
+        self, parent_element: Optional[NodeItem], doc: DoclingDocument
+    ):
         # self.inline_text_buffer += str(text_in)
         txt = self.inline_text_buffer.strip()
         if len(txt) > 0:
@@ -156,14 +162,20 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             )
         self.inline_text_buffer = ""
-    def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
+    def iterate_elements(
+        self,
+        element: marko.block.Element,
+        depth: int,
+        doc: DoclingDocument,
+        parent_element: Optional[NodeItem] = None,
+    ):
         # Iterates over all elements in the AST
         # Check for different element types and process relevant details
         if isinstance(element, marko.block.Heading):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(
-                f" - Heading level {element.level}, content: {element.children[0].children}"
+                f" - Heading level {element.level}, content: {element.children[0].children}"  # type: ignore
             )
             if element.level == 1:
                 doc_label = DocItemLabel.TITLE
@@ -172,10 +184,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             # Header could have arbitrary inclusion of bold, italic or emphasis,
             # hence we need to traverse the tree to get full text of a header
-            strings = []
+            strings: List[str] = []
             # Define a recursive function to traverse the tree
-            def traverse(node):
+            def traverse(node: marko.block.BlockElement):
                 # Check if the node has a "children" attribute
                 if hasattr(node, "children"):
                     # If "children" is a list, continue traversal
@@ -209,9 +221,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             self.process_inline_text(parent_element, doc)
             _log.debug(" - List item")
-            snippet_text = str(element.children[0].children[0].children)
+            snippet_text = str(element.children[0].children[0].children)  # type: ignore
             is_numbered = False
-            if parent_element.label == GroupLabel.ORDERED_LIST:
+            if (
+                parent_element is not None
+                and isinstance(parent_element, DocItem)
+                and parent_element.label == GroupLabel.ORDERED_LIST
+            ):
                 is_numbered = True
             doc.add_list_item(
                 enumerated=is_numbered, parent=parent_element, text=snippet_text
@@ -221,7 +237,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
-            doc.add_picture(parent=parent_element, caption=element.title)
+            fig_caption: Optional[TextItem] = None
+            if element.title is not None and element.title != "":
+                fig_caption = doc.add_text(
+                    label=DocItemLabel.CAPTION, text=element.title
+                )
+            doc.add_picture(parent=parent_element, caption=fig_caption)
         elif isinstance(element, marko.block.Paragraph):
             self.process_inline_text(parent_element, doc)
@@ -252,27 +275,21 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Code Span: {element.children}")
             snippet_text = str(element.children).strip()
-            doc.add_text(
-                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
-            )
+            doc.add_code(parent=parent_element, text=snippet_text)
         elif isinstance(element, marko.block.CodeBlock):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Code Block: {element.children}")
-            snippet_text = str(element.children[0].children).strip()
-            doc.add_text(
-                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
-            )
+            snippet_text = str(element.children[0].children).strip()  # type: ignore
+            doc.add_code(parent=parent_element, text=snippet_text)
         elif isinstance(element, marko.block.FencedCode):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Code Block: {element.children}")
-            snippet_text = str(element.children[0].children).strip()
-            doc.add_text(
-                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
-            )
+            snippet_text = str(element.children[0].children).strip()  # type: ignore
+            doc.add_code(parent=parent_element, text=snippet_text)
         elif isinstance(element, marko.inline.LineBreak):
             self.process_inline_text(parent_element, doc)

{docling-2.15.0 → docling-2.16.0}/docling/backend/msexcel_backend.py RENAMED Viewed

@@ -26,6 +26,7 @@ _log = logging.getLogger(__name__)
 from typing import Any, List
+from PIL import Image as PILImage
 from pydantic import BaseModel
@@ -44,7 +45,6 @@ class ExcelTable(BaseModel):
 class MsExcelDocumentBackend(DeclarativeDocumentBackend):
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
@@ -326,49 +326,61 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
         self, doc: DoclingDocument, sheet: Worksheet
     ) -> DoclingDocument:
-        # FIXME: mypy does not agree with _images ...
-        """
-        # Iterate over images in the sheet
-        for idx, image in enumerate(sheet._images):  # Access embedded images
+        # Iterate over byte images in the sheet
+        for idx, image in enumerate(sheet._images):  # type: ignore
-            image_bytes = BytesIO(image.ref.blob)
-            pil_image = Image.open(image_bytes)
+            try:
+                pil_image = PILImage.open(image.ref)
-            doc.add_picture(
-                parent=self.parents[0],
-                image=ImageRef.from_pil(image=pil_image, dpi=72),
-                caption=None,
-            )
-        """
+                doc.add_picture(
+                    parent=self.parents[0],
+                    image=ImageRef.from_pil(image=pil_image, dpi=72),
+                    caption=None,
+                )
+            except:
+                _log.error("could not extract the image from excel sheets")
-        # FIXME: mypy does not agree with _charts ...
         """
-        for idx, chart in enumerate(sheet._charts):  # Access embedded charts
-            chart_path = f"chart_{idx + 1}.png"
-            _log.info(
-                f"Chart found, but dynamic rendering is required for: {chart_path}"
-            )
+        for idx, chart in enumerate(sheet._charts):  # type: ignore
+            try:
+                chart_path = f"chart_{idx + 1}.png"
+                _log.info(
+                    f"Chart found, but dynamic rendering is required for: {chart_path}"
+                )
-            _log.info(f"Chart {idx + 1}:")
-            # Chart type
-            _log.info(f"Type: {type(chart).__name__}")
-            # Title
-            if chart.title:
-                _log.info(f"Title: {chart.title}")
-            else:
-                _log.info("No title")
-            # Data series
-            for series in chart.series:
-                _log.info(" => series ...")
-                _log.info(f"Data Series: {series.title}")
-                _log.info(f"Values: {series.values}")
-                _log.info(f"Categories: {series.categories}")
+                _log.info(f"Chart {idx + 1}:")
-            # Position
-            # _log.info(f"Anchor Cell: {chart.anchor}")
+                # Chart type
+                # _log.info(f"Type: {type(chart).__name__}")
+                print(f"Type: {type(chart).__name__}")
+                # Extract series data
+                for series_idx, series in enumerate(chart.series):
+                    #_log.info(f"Series {series_idx + 1}:")
+                    print(f"Series {series_idx + 1} type: {type(series).__name__}")
+                    #print(f"x-values: {series.xVal}")
+                    #print(f"y-values: {series.yVal}")
+                    print(f"xval type: {type(series.xVal).__name__}")
+                    xvals = []
+                    for _ in series.xVal.numLit.pt:
+                        print(f"xval type: {type(_).__name__}")
+                        if hasattr(_, 'v'):
+                            xvals.append(_.v)
+                    print(f"x-values: {xvals}")
+                    yvals = []
+                    for _ in series.yVal:
+                        if hasattr(_, 'v'):
+                            yvals.append(_.v)
+                    print(f"y-values: {yvals}")
+            except Exception as exc:
+                print(exc)
+                continue
         """
         return doc

{docling-2.15.0 → docling-2.16.0}/docling/backend/msword_backend.py RENAMED Viewed

@@ -26,7 +26,6 @@ _log = logging.getLogger(__name__)
 class MsWordDocumentBackend(DeclarativeDocumentBackend):
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
         self.XML_KEY = (

{docling-2.15.0 → docling-2.16.0}/docling/backend/pdf_backend.py RENAMED Viewed

@@ -12,7 +12,6 @@ from docling.datamodel.document import InputDocument
 class PdfPageBackend(ABC):
     @abstractmethod
     def get_text_in_rect(self, bbox: BoundingBox) -> str:
         pass
@@ -45,7 +44,6 @@ class PdfPageBackend(ABC):
 class PdfDocumentBackend(PaginatedDocumentBackend):
     def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)

{docling-2.15.0 → docling-2.16.0}/docling/backend/pypdfium2_backend.py RENAMED Viewed

@@ -39,7 +39,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
         return self.valid
     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
-        AREA_THRESHOLD = 32 * 32
+        AREA_THRESHOLD = 0  # 32 * 32
         for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
             pos = obj.get_pos()
             cropbox = BoundingBox.from_tuple(
@@ -210,7 +210,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
                 l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
             )
         else:
-            padbox = cropbox.to_bottom_left_origin(page_size.height)
+            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
             padbox.r = page_size.width - padbox.r
             padbox.t = page_size.height - padbox.t

{docling-2.15.0 → docling-2.16.0}/docling/datamodel/base_models.py RENAMED Viewed

@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Union
 from docling_core.types.doc import (
     BoundingBox,
     DocItemLabel,
+    NodeItem,
     PictureDataType,
     Size,
     TableCell,
@@ -40,6 +41,7 @@ class InputFormat(str, Enum):
     MD = "md"
     XLSX = "xlsx"
     XML_USPTO = "xml_uspto"
+    JSON_DOCLING = "json_docling"
 class OutputFormat(str, Enum):
@@ -61,6 +63,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
     InputFormat.XLSX: ["xlsx"],
     InputFormat.XML_USPTO: ["xml", "txt"],
+    InputFormat.JSON_DOCLING: ["json"],
 }
 FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -89,6 +92,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
         "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
     ],
     InputFormat.XML_USPTO: ["application/xml", "text/plain"],
+    InputFormat.JSON_DOCLING: ["application/json"],
 }
 MimeTypeToFormat: dict[str, list[InputFormat]] = {
@@ -201,6 +205,13 @@ class AssembledUnit(BaseModel):
     headers: List[PageElement] = []
+class ItemAndImageEnrichmentElement(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    item: NodeItem
+    image: Image
 class Page(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
@@ -219,12 +230,28 @@ class Page(BaseModel):
         {}
     )  # Cache of images in different scales. By default it is cleared during assembling.
-    def get_image(self, scale: float = 1.0) -> Optional[Image]:
+    def get_image(
+        self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
+    ) -> Optional[Image]:
         if self._backend is None:
             return self._image_cache.get(scale, None)
         if not scale in self._image_cache:
-            self._image_cache[scale] = self._backend.get_page_image(scale=scale)
-        return self._image_cache[scale]
+            if cropbox is None:
+                self._image_cache[scale] = self._backend.get_page_image(scale=scale)
+            else:
+                return self._backend.get_page_image(scale=scale, cropbox=cropbox)
+        if cropbox is None:
+            return self._image_cache[scale]
+        else:
+            page_im = self._image_cache[scale]
+            assert self.size is not None
+            return page_im.crop(
+                cropbox.to_top_left_origin(page_height=self.size.height)
+                .scaled(scale=scale)
+                .as_tuple()
+            )
     @property
     def image(self) -> Optional[Image]:

{docling-2.15.0 → docling-2.16.0}/docling/datamodel/document.py RENAMED Viewed

@@ -350,6 +350,8 @@ class _DocumentConversionInput(BaseModel):
             mime = FormatToMimeType[InputFormat.HTML][0]
         elif ext in FormatToExtensions[InputFormat.MD]:
             mime = FormatToMimeType[InputFormat.MD][0]
+        elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
+            mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
         return mime
     @staticmethod

{docling-2.15.0 → docling-2.16.0}/docling/datamodel/pipeline_options.py RENAMED Viewed

@@ -1,17 +1,11 @@
 import logging
 import os
-import warnings
 from enum import Enum
 from pathlib import Path
-from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Type, Union
+from typing import Any, List, Literal, Optional, Union
-from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
-from pydantic_settings import (
-    BaseSettings,
-    PydanticBaseSettingsSource,
-    SettingsConfigDict,
-)
-from typing_extensions import deprecated
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+from pydantic_settings import BaseSettings, SettingsConfigDict
 _log = logging.getLogger(__name__)
@@ -139,7 +133,7 @@ class EasyOcrOptions(OcrOptions):
     use_gpu: Optional[bool] = None
-    confidence_threshold: float = 0.65
+    confidence_threshold: float = 0.5
     model_storage_directory: Optional[str] = None
     recog_network: Optional[str] = "standard"
@@ -225,6 +219,9 @@ class PdfPipelineOptions(PipelineOptions):
     artifacts_path: Optional[Union[Path, str]] = None
     do_table_structure: bool = True  # True: perform table structure extraction
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
+    do_code_enrichment: bool = False  # True: perform code OCR
+    do_formula_enrichment: bool = False  # True: perform formula OCR, return Latex code
+    do_picture_classification: bool = False  # True: classify pictures in documents
     table_structure_options: TableStructureOptions = TableStructureOptions()
     ocr_options: Union[

docling 2.15.0__tar.gz → 2.16.0__tar.gz

docling 2.15.0__tar.gz → 2.16.0__tar.gz