PyPI - docling - Versions diffs - 1.6.2__tar.gz → 1.7.0__tar.gz - Mend

docling 1.6.2.tar.gz → 1.7.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

{docling-1.6.2 → docling-1.7.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 1.6.2
+Version: 1.7.0
 Summary: Docling PDF conversion package
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -23,7 +23,7 @@ Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=0.19.0,<1)
 Requires-Dist: docling-core (>=1.1.2,<2.0.0)
 Requires-Dist: docling-ibm-models (>=1.1.2,<2.0.0)
-Requires-Dist: docling-parse (>=0.2.0,<0.3.0)
+Requires-Dist: docling-parse (>=1.0.0,<2.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)

{docling-1.6.2 → docling-1.7.0}/docling/backend/abstract_backend.py RENAMED Viewed

@@ -39,8 +39,9 @@ class PdfPageBackend(ABC):
 class PdfDocumentBackend(ABC):
     @abstractmethod
-    def __init__(self, path_or_stream: Union[BytesIO, Path]):
-        pass
+    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+        self.path_or_stream = path_or_stream
+        self.document_hash = document_hash
     @abstractmethod
     def load_page(self, page_no: int) -> PdfPageBackend:
@@ -56,4 +57,7 @@ class PdfDocumentBackend(ABC):
     @abstractmethod
     def unload(self):
-        pass
+        if isinstance(self.path_or_stream, BytesIO):
+            self.path_or_stream.close()
+        self.path_or_stream = None

{docling-1.6.2 → docling-1.7.0}/docling/backend/docling_parse_backend.py RENAMED Viewed

@@ -1,6 +1,5 @@
 import logging
 import random
-import time
 from io import BytesIO
 from pathlib import Path
 from typing import Iterable, Optional, Union
@@ -17,11 +16,14 @@ _log = logging.getLogger(__name__)
 class DoclingParsePageBackend(PdfPageBackend):
-    def __init__(self, page_obj: PdfPage, docling_page_obj):
+    def __init__(
+        self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
+    ):
         super().__init__(page_obj)
         self._ppage = page_obj
-        self._dpage = docling_page_obj
-        self.text_page = None
+        parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
+        self._dpage = parsed_page["pages"][0]
     def get_text_in_rect(self, bbox: BoundingBox) -> str:
         # Find intersecting cells on the page
@@ -168,40 +170,39 @@ class DoclingParsePageBackend(PdfPageBackend):
     def unload(self):
         self._ppage = None
         self._dpage = None
-        self.text_page = None
 class DoclingParseDocumentBackend(PdfDocumentBackend):
-    def __init__(self, path_or_stream: Union[BytesIO, Path]):
-        super().__init__(path_or_stream)
-        self._pdoc = pdfium.PdfDocument(path_or_stream)
-        # Parsing cells with docling_parser call
-        parser = pdf_parser()
+    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+        super().__init__(path_or_stream, document_hash)
-        start_pb_time = time.time()
+        self._pdoc = pdfium.PdfDocument(path_or_stream)
+        self.parser = pdf_parser()
+        success = False
         if isinstance(path_or_stream, BytesIO):
-            self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
-        else:
-            self._parser_doc = parser.find_cells(str(path_or_stream))
+            success = self.parser.load_document_from_bytesio(
+                document_hash, path_or_stream
+            )
+        elif isinstance(path_or_stream, Path):
+            success = self.parser.load_document(document_hash, str(path_or_stream))
-        end_pb_time = time.time() - start_pb_time
-        _log.info(
-            f"Time to parse {path_or_stream.name} with docling-parse: time={end_pb_time:.3f}"
-        )
+        if not success:
+            raise RuntimeError("docling-parse could not load this document.")
     def page_count(self) -> int:
-        return len(self._parser_doc["pages"])
+        return len(self._pdoc)  # To be replaced with docling-parse API
     def load_page(self, page_no: int) -> DoclingParsePageBackend:
         return DoclingParsePageBackend(
-            self._pdoc[page_no], self._parser_doc["pages"][page_no]
+            self.parser, self.document_hash, page_no, self._pdoc[page_no]
         )
     def is_valid(self) -> bool:
         return self.page_count() > 0
     def unload(self):
+        super().unload()
+        self.parser.unload_document(self.document_hash)
         self._pdoc.close()
         self._pdoc = None
-        self._parser_doc = None

{docling-1.6.2 → docling-1.7.0}/docling/backend/pypdfium2_backend.py RENAMED Viewed

@@ -215,8 +215,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
 class PyPdfiumDocumentBackend(PdfDocumentBackend):
-    def __init__(self, path_or_stream: Union[BytesIO, Path]):
-        super().__init__(path_or_stream)
+    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+        super().__init__(path_or_stream, document_hash)
         self._pdoc = pdfium.PdfDocument(path_or_stream)
     def page_count(self) -> int:
@@ -229,5 +229,6 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
         return self.page_count() > 0
     def unload(self):
+        super().unload()
         self._pdoc.close()
         self._pdoc = None

{docling-1.6.2 → docling-1.7.0}/docling/datamodel/document.py RENAMED Viewed

@@ -79,7 +79,9 @@ class InputDocument(BaseModel):
                     self.valid = False
                 else:
                     self.document_hash = create_file_hash(path_or_stream)
-                    self._backend = pdf_backend(path_or_stream=path_or_stream)
+                    self._backend = pdf_backend(
+                        path_or_stream=path_or_stream, document_hash=self.document_hash
+                    )
             elif isinstance(path_or_stream, BytesIO):
                 self.file = PurePath(filename)
@@ -89,7 +91,9 @@ class InputDocument(BaseModel):
                     self.valid = False
                 else:
                     self.document_hash = create_file_hash(path_or_stream)
-                    self._backend = pdf_backend(path_or_stream=path_or_stream)
+                    self._backend = pdf_backend(
+                        path_or_stream=path_or_stream, document_hash=self.document_hash
+                    )
             if self.document_hash and self._backend.page_count() > 0:
                 self.page_count = self._backend.page_count()

{docling-1.6.2 → docling-1.7.0}/docling/document_converter.py RENAMED Viewed

@@ -141,6 +141,8 @@ class DocumentConverter:
         start_doc_time = time.time()
         converted_doc = ConvertedDocument(input=in_doc)
+        _log.info(f"Processing document {in_doc.file.name}")
         if not in_doc.valid:
             converted_doc.status = ConversionStatus.FAILURE
             return converted_doc

{docling-1.6.2 → docling-1.7.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "1.6.2"  # DO NOT EDIT, updated automatically
+version = "1.7.0"  # DO NOT EDIT, updated automatically
 description = "Docling PDF conversion package"
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"
@@ -32,7 +32,7 @@ pydantic-settings = "^2.3.0"
 huggingface_hub = ">=0.23,<1"
 requests = "^2.32.3"
 easyocr = "^1.7"
-docling-parse = "^0.2.0"
+docling-parse = "^1.0.0"
 certifi = ">=2024.7.4"
 rtree = "^1.3.0"
 scipy = "^1.14.1"