PyPI - docling - Versions diffs - 1.6.3__py3-none-any.whl → 1.7.1__py3-none-any.whl - Mend

docling 1.6.3__py3-none-any.whl → 1.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

docling/backend/abstract_backend.py CHANGED Viewed

@@ -39,8 +39,9 @@ class PdfPageBackend(ABC):
 class PdfDocumentBackend(ABC):
     @abstractmethod
-    def __init__(self, path_or_stream: Union[BytesIO, Path]):
-        pass
+    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+        self.path_or_stream = path_or_stream
+        self.document_hash = document_hash
     @abstractmethod
     def load_page(self, page_no: int) -> PdfPageBackend:
@@ -56,4 +57,7 @@ class PdfDocumentBackend(ABC):
     @abstractmethod
     def unload(self):
-        pass
+        if isinstance(self.path_or_stream, BytesIO):
+            self.path_or_stream.close()
+        self.path_or_stream = None

docling/backend/docling_parse_backend.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import logging
 import random
-import time
 from io import BytesIO
 from pathlib import Path
 from typing import Iterable, Optional, Union
@@ -17,13 +16,26 @@ _log = logging.getLogger(__name__)
 class DoclingParsePageBackend(PdfPageBackend):
-    def __init__(self, page_obj: PdfPage, docling_page_obj):
+    def __init__(
+        self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
+    ):
         super().__init__(page_obj)
         self._ppage = page_obj
-        self._dpage = docling_page_obj
-        self.text_page = None
+        parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
+        self._dpage = None
+        self.broken_page = "pages" not in parsed_page
+        if not self.broken_page:
+            self._dpage = parsed_page["pages"][0]
+        else:
+            raise RuntimeError(
+                f"Page {page_no} of document {document_hash} could not be parsed."
+            )
     def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        if self.broken_page:
+            return ""
         # Find intersecting cells on the page
         text_piece = ""
         page_size = self.get_size()
@@ -58,6 +70,9 @@ class DoclingParsePageBackend(PdfPageBackend):
         cells = []
         cell_counter = 0
+        if self.broken_page:
+            return cells
         page_size = self.get_size()
         parser_width = self._dpage["width"]
@@ -168,38 +183,39 @@ class DoclingParsePageBackend(PdfPageBackend):
     def unload(self):
         self._ppage = None
         self._dpage = None
-        self.text_page = None
 class DoclingParseDocumentBackend(PdfDocumentBackend):
-    def __init__(self, path_or_stream: Union[BytesIO, Path]):
-        super().__init__(path_or_stream)
-        self._pdoc = pdfium.PdfDocument(path_or_stream)
-        # Parsing cells with docling_parser call
-        parser = pdf_parser()
+    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+        super().__init__(path_or_stream, document_hash)
-        start_pb_time = time.time()
+        self._pdoc = pdfium.PdfDocument(path_or_stream)
+        self.parser = pdf_parser()
+        success = False
         if isinstance(path_or_stream, BytesIO):
-            self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
-        else:
-            self._parser_doc = parser.find_cells(str(path_or_stream))
+            success = self.parser.load_document_from_bytesio(
+                document_hash, path_or_stream
+            )
+        elif isinstance(path_or_stream, Path):
+            success = self.parser.load_document(document_hash, str(path_or_stream))
-        end_pb_time = time.time() - start_pb_time
-        _log.info(f"Time to parse with docling-parse: time={end_pb_time:.3f}")
+        if not success:
+            raise RuntimeError("docling-parse could not load this document.")
     def page_count(self) -> int:
-        return len(self._parser_doc["pages"])
+        return len(self._pdoc)  # To be replaced with docling-parse API
     def load_page(self, page_no: int) -> DoclingParsePageBackend:
         return DoclingParsePageBackend(
-            self._pdoc[page_no], self._parser_doc["pages"][page_no]
+            self.parser, self.document_hash, page_no, self._pdoc[page_no]
         )
     def is_valid(self) -> bool:
         return self.page_count() > 0
     def unload(self):
+        super().unload()
+        self.parser.unload_document(self.document_hash)
         self._pdoc.close()
         self._pdoc = None
-        self._parser_doc = None

docling/backend/pypdfium2_backend.py CHANGED Viewed

@@ -215,8 +215,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
 class PyPdfiumDocumentBackend(PdfDocumentBackend):
-    def __init__(self, path_or_stream: Union[BytesIO, Path]):
-        super().__init__(path_or_stream)
+    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+        super().__init__(path_or_stream, document_hash)
         self._pdoc = pdfium.PdfDocument(path_or_stream)
     def page_count(self) -> int:
@@ -229,5 +229,6 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
         return self.page_count() > 0
     def unload(self):
+        super().unload()
         self._pdoc.close()
         self._pdoc = None

docling/datamodel/document.py CHANGED Viewed

@@ -79,7 +79,9 @@ class InputDocument(BaseModel):
                     self.valid = False
                 else:
                     self.document_hash = create_file_hash(path_or_stream)
-                    self._backend = pdf_backend(path_or_stream=path_or_stream)
+                    self._backend = pdf_backend(
+                        path_or_stream=path_or_stream, document_hash=self.document_hash
+                    )
             elif isinstance(path_or_stream, BytesIO):
                 self.file = PurePath(filename)
@@ -89,7 +91,9 @@ class InputDocument(BaseModel):
                     self.valid = False
                 else:
                     self.document_hash = create_file_hash(path_or_stream)
-                    self._backend = pdf_backend(path_or_stream=path_or_stream)
+                    self._backend = pdf_backend(
+                        path_or_stream=path_or_stream, document_hash=self.document_hash
+                    )
             if self.document_hash and self._backend.page_count() > 0:
                 self.page_count = self._backend.page_count()

docling/document_converter.py CHANGED Viewed

@@ -141,6 +141,8 @@ class DocumentConverter:
         start_doc_time = time.time()
         converted_doc = ConvertedDocument(input=in_doc)
+        _log.info(f"Processing document {in_doc.file.name}")
         if not in_doc.valid:
             converted_doc.status = ConversionStatus.FAILURE
             return converted_doc

{docling-1.6.3.dist-info → docling-1.7.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 1.6.3
+Version: 1.7.1
 Summary: Docling PDF conversion package
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -23,7 +23,7 @@ Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=0.19.0,<1)
 Requires-Dist: docling-core (>=1.1.2,<2.0.0)
 Requires-Dist: docling-ibm-models (>=1.1.2,<2.0.0)
-Requires-Dist: docling-parse (>=0.2.0,<0.3.0)
+Requires-Dist: docling-parse (>=1.1.1,<2.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)

{docling-1.6.3.dist-info → docling-1.7.1.dist-info}/RECORD RENAMED Viewed

@@ -1,13 +1,13 @@
 docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/backend/abstract_backend.py,sha256=ZfEHaBPGM1cmqrhaEoU3MHhnHU11NhOnhtFEIbVMYDo,1221
-docling/backend/docling_parse_backend.py,sha256=ELDJeC0bHYWEtkMcvcPxTMIbTBLO1N9VLeqsardlXg4,6880
-docling/backend/pypdfium2_backend.py,sha256=xUiIYgd7i22YDx4-W2hfPUaQFszW0gcT6pavG5qZ8LE,8062
+docling/backend/abstract_backend.py,sha256=wvrywm1pPt79L5Dt_da5QGmf9dDzjBGj1rSaUQxqI8s,1432
+docling/backend/docling_parse_backend.py,sha256=hXyF2VPPdLs7APWEXTlfz0wI86rUGYa67Q73zgTB-Ug,7438
+docling/backend/pypdfium2_backend.py,sha256=84AnFah8Ztk-j8_9MTHalPU3a9fClrEz7A_rfzWDkFc,8122
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/datamodel/base_models.py,sha256=5VHit5h7OleKnbhvy-sWDxQLizEdNrGUBrypyzwHyAE,8604
-docling/datamodel/document.py,sha256=Dgi9pSwXCgIoR26MKiRDiVMyMaFKdvGSKq2Fm5Lef9M,13173
+docling/datamodel/document.py,sha256=J97KeT8fJRKijUorDky-xA2FoOGBXOjrReYjdeo8NK4,13333
 docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
-docling/document_converter.py,sha256=UFSELvUSWsr8s0VByu4lNuzu7bn7zZauJTL3FTSLSBg,10371
+docling/document_converter.py,sha256=Tx4BwtOxpwizmXgJl2nK6I-6m0V3fygHwYFomJTH2Ns,10433
 docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/models/base_ocr_model.py,sha256=Ipl82a3AV2OsgMQSMEMpnWJ6MXcmyIQzmp52PmTaB0g,4465
 docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
@@ -21,7 +21,7 @@ docling/pipeline/standard_model_pipeline.py,sha256=UTjyaEXvz9htYZz-IMTkn11cZwNjg
 docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
 docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
-docling-1.6.3.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
-docling-1.6.3.dist-info/METADATA,sha256=DeRKK5TVCv9rp3eQfZkXfZXwKLi4df2l10qXKcm3ISQ,7229
-docling-1.6.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling-1.6.3.dist-info/RECORD,,
+docling-1.7.1.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
+docling-1.7.1.dist-info/METADATA,sha256=ADdVabYgc4VEIGKhM-tI6XBU_CG9tzKl_au69TZ9LbY,7229
+docling-1.7.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling-1.7.1.dist-info/RECORD,,

{docling-1.6.3.dist-info → docling-1.7.1.dist-info}/LICENSE RENAMED Viewed

File without changes

{docling-1.6.3.dist-info → docling-1.7.1.dist-info}/WHEEL RENAMED Viewed

File without changes

docling 1.6.3__py3-none-any.whl → 1.7.1__py3-none-any.whl

docling 1.6.3__py3-none-any.whl → 1.7.1__py3-none-any.whl