PyPI - docling - Versions diffs - 1.7.1__tar.gz → 1.8.1__tar.gz - Mend

docling 1.7.1.tar.gz → 1.8.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

{docling-1.7.1 → docling-1.8.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 1.7.1
+Version: 1.8.1
 Summary: Docling PDF conversion package
 Home-page: https://github.com/DS4SD/docling
 License: MIT

{docling-1.7.1 → docling-1.8.1}/docling/backend/abstract_backend.py RENAMED Viewed

@@ -7,8 +7,6 @@ from PIL import Image
 class PdfPageBackend(ABC):
-    def __init__(self, page_obj: Any) -> object:
-        pass
     @abstractmethod
     def get_text_in_rect(self, bbox: "BoundingBox") -> str:
@@ -32,6 +30,10 @@ class PdfPageBackend(ABC):
     def get_size(self) -> "PageSize":
         pass
+    @abstractmethod
+    def is_valid(self) -> bool:
+        pass
     @abstractmethod
     def unload(self):
         pass

{docling-1.7.1 → docling-1.8.1}/docling/backend/docling_parse_backend.py RENAMED Viewed

@@ -19,22 +19,23 @@ class DoclingParsePageBackend(PdfPageBackend):
     def __init__(
         self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
     ):
-        super().__init__(page_obj)
         self._ppage = page_obj
         parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
         self._dpage = None
-        self.broken_page = "pages" not in parsed_page
-        if not self.broken_page:
+        self.valid = "pages" in parsed_page
+        if self.valid:
             self._dpage = parsed_page["pages"][0]
         else:
-            raise RuntimeError(
-                f"Page {page_no} of document {document_hash} could not be parsed."
+            _log.info(
+                f"An error occured when loading page {page_no} of document {document_hash}."
             )
+    def is_valid(self) -> bool:
+        return self.valid
     def get_text_in_rect(self, bbox: BoundingBox) -> str:
-        if self.broken_page:
+        if not self.valid:
             return ""
         # Find intersecting cells on the page
         text_piece = ""
@@ -70,7 +71,7 @@ class DoclingParsePageBackend(PdfPageBackend):
         cells = []
         cell_counter = 0
-        if self.broken_page:
+        if not self.valid:
             return cells
         page_size = self.get_size()
@@ -201,7 +202,9 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
             success = self.parser.load_document(document_hash, str(path_or_stream))
         if not success:
-            raise RuntimeError("docling-parse could not load this document.")
+            raise RuntimeError(
+                f"docling-parse could not load document {document_hash}."
+            )
     def page_count(self) -> int:
         return len(self._pdoc)  # To be replaced with docling-parse API

{docling-1.7.1 → docling-1.8.1}/docling/backend/pypdfium2_backend.py RENAMED Viewed

@@ -1,3 +1,4 @@
+import logging
 import random
 from io import BytesIO
 from pathlib import Path
@@ -7,17 +8,32 @@ import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage
+from pypdfium2._helpers.misc import PdfiumError
 from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
 from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
+_log = logging.getLogger(__name__)
 class PyPdfiumPageBackend(PdfPageBackend):
-    def __init__(self, page_obj: PdfPage):
-        super().__init__(page_obj)
-        self._ppage = page_obj
+    def __init__(
+        self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
+    ):
+        self.valid = True  # No better way to tell from pypdfium.
+        try:
+            self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
+        except PdfiumError as e:
+            _log.info(
+                f"An exception occured when loading page {page_no} of document {document_hash}.",
+                exc_info=True,
+            )
+            self.valid = False
         self.text_page = None
+    def is_valid(self) -> bool:
+        return self.valid
     def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
         AREA_THRESHOLD = 32 * 32
         for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
@@ -217,13 +233,18 @@ class PyPdfiumPageBackend(PdfPageBackend):
 class PyPdfiumDocumentBackend(PdfDocumentBackend):
     def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
         super().__init__(path_or_stream, document_hash)
-        self._pdoc = pdfium.PdfDocument(path_or_stream)
+        try:
+            self._pdoc = pdfium.PdfDocument(path_or_stream)
+        except PdfiumError as e:
+            raise RuntimeError(
+                f"pypdfium could not load document {document_hash}"
+            ) from e
     def page_count(self) -> int:
         return len(self._pdoc)
     def load_page(self, page_no: int) -> PyPdfiumPageBackend:
-        return PyPdfiumPageBackend(self._pdoc[page_no])
+        return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
     def is_valid(self) -> bool:
         return self.page_count() > 0

{docling-1.7.1 → docling-1.8.1}/docling/datamodel/base_models.py RENAMED Viewed

@@ -16,7 +16,7 @@ class ConversionStatus(str, Enum):
     STARTED = auto()
     FAILURE = auto()
     SUCCESS = auto()
-    SUCCESS_WITH_ERRORS = auto()
+    PARTIAL_SUCCESS = auto()
 class DocInputType(str, Enum):
@@ -29,6 +29,18 @@ class CoordOrigin(str, Enum):
     BOTTOMLEFT = auto()
+class DoclingComponentType(str, Enum):
+    PDF_BACKEND = auto()
+    MODEL = auto()
+    DOC_ASSEMBLER = auto()
+class ErrorItem(BaseModel):
+    component_type: DoclingComponentType
+    module_name: str
+    error_message: str
 class PageSize(BaseModel):
     width: float = 0.0
     height: float = 0.0

{docling-1.7.1 → docling-1.8.1}/docling/datamodel/document.py RENAMED Viewed

@@ -19,6 +19,7 @@ from docling.datamodel.base_models import (
     AssembledUnit,
     ConversionStatus,
     DocumentStream,
+    ErrorItem,
     FigureElement,
     Page,
     PageElement,
@@ -118,7 +119,7 @@ class ConvertedDocument(BaseModel):
     input: InputDocument
     status: ConversionStatus = ConversionStatus.PENDING  # failure, success
-    errors: List[Dict] = []  # structure to keep errors
+    errors: List[ErrorItem] = []  # structure to keep errors
     pages: List[Page] = []
     assembled: Optional[AssembledUnit] = None

{docling-1.7.1 → docling-1.8.1}/docling/document_converter.py RENAMED Viewed

@@ -16,6 +16,8 @@ from docling.datamodel.base_models import (
     AssembledUnit,
     AssembleOptions,
     ConversionStatus,
+    DoclingComponentType,
+    ErrorItem,
     Page,
     PipelineOptions,
 )
@@ -86,7 +88,7 @@ class DocumentConverter:
             # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
             yield from map(self.process_document, input_batch)
-    def convert_single(self, source: Path | AnyHttpUrl | str) -> Document:
+    def convert_single(self, source: Path | AnyHttpUrl | str) -> ConvertedDocument:
         """Convert a single document.
         Args:
@@ -131,11 +133,10 @@ class DocumentConverter:
             converted_doc: ConvertedDocument = next(converted_docs_iter)
         if converted_doc.status not in {
             ConversionStatus.SUCCESS,
-            ConversionStatus.SUCCESS_WITH_ERRORS,
+            ConversionStatus.PARTIAL_SUCCESS,
         }:
             raise RuntimeError(f"Conversion failed with status: {converted_doc.status}")
-        doc = converted_doc.to_ds_document()
-        return doc
+        return converted_doc
     def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
         start_doc_time = time.time()
@@ -157,7 +158,6 @@ class DocumentConverter:
             for page_batch in chunkify(
                 converted_doc.pages, settings.perf.page_batch_size
             ):
                 start_pb_time = time.time()
                 # Pipeline
@@ -205,12 +205,27 @@ class DocumentConverter:
             converted_doc.pages = all_assembled_pages
             self.assemble_doc(converted_doc)
-            converted_doc.status = ConversionStatus.SUCCESS
+            status = ConversionStatus.SUCCESS
+            for page in converted_doc.pages:
+                if not page._backend.is_valid():
+                    converted_doc.errors.append(
+                        ErrorItem(
+                            component_type=DoclingComponentType.PDF_BACKEND,
+                            module_name=type(page._backend).__name__,
+                            error_message=f"Page {page.page_no} failed to parse.",
+                        )
+                    )
+                    status = ConversionStatus.PARTIAL_SUCCESS
+            converted_doc.status = status
         except Exception as e:
             converted_doc.status = ConversionStatus.FAILURE
             trace = "\n".join(traceback.format_exception(e))
-            _log.info(f"Encountered an error during conversion: {trace}")
+            _log.info(
+                f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
+                f"{trace}"
+            )
         end_doc_time = time.time() - start_doc_time
         _log.info(
@@ -230,7 +245,9 @@ class DocumentConverter:
     # Generate the page image and store it in the page object
     def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
         # default scale
-        page.get_image(scale=1.0)
+        page.get_image(
+            scale=1.0
+        )  # puts the page image on the image cache at default scale
         # user requested scales
         if self.assemble_options.images_scale is not None:

{docling-1.7.1 → docling-1.8.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "1.7.1"  # DO NOT EDIT, updated automatically
+version = "1.8.1"  # DO NOT EDIT, updated automatically
 description = "Docling PDF conversion package"
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"