PyPI - docling - Versions diffs - 1.8.0__tar.gz → 1.8.2__tar.gz - Mend

docling 1.8.0.tar.gz → 1.8.2.tar.gz

This diff shows the differences between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.

Files changed (27) hide show

{docling-1.8.0 → docling-1.8.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 1.8.0
+Version: 1.8.2
 Summary: Docling PDF conversion package
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -87,10 +87,10 @@ To convert individual PDF documents, use `convert_single()`, for example:
 ```python
 from docling.document_converter import DocumentConverter
-source = "https://arxiv.org/pdf/2206.01062"  # PDF path or URL
+source = "https://arxiv.org/pdf/2408.09869"  # PDF path or URL
 converter = DocumentConverter()
-doc = converter.convert_single(source)
-print(doc.export_to_markdown())  # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
+result = converter.convert_single(source)
+print(result.render_as_markdown())  # output: "## Docling Technical Report[...]"
 ```
 ### Convert a batch of documents
@@ -156,7 +156,7 @@ You can convert PDFs from a binary stream instead of from the filesystem as foll
 buf = BytesIO(your_binary_stream)
 docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
 conv_input = DocumentConversionInput.from_streams(docs)
-converted_docs = doc_converter.convert(conv_input)
+results = doc_converter.convert(conv_input)
 ```
 ### Limit resource usage

{docling-1.8.0 → docling-1.8.2}/README.md RENAMED Viewed

@@ -49,10 +49,10 @@ To convert individual PDF documents, use `convert_single()`, for example:
 ```python
 from docling.document_converter import DocumentConverter
-source = "https://arxiv.org/pdf/2206.01062"  # PDF path or URL
+source = "https://arxiv.org/pdf/2408.09869"  # PDF path or URL
 converter = DocumentConverter()
-doc = converter.convert_single(source)
-print(doc.export_to_markdown())  # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
+result = converter.convert_single(source)
+print(result.render_as_markdown())  # output: "## Docling Technical Report[...]"
 ```
 ### Convert a batch of documents
@@ -118,7 +118,7 @@ You can convert PDFs from a binary stream instead of from the filesystem as foll
 buf = BytesIO(your_binary_stream)
 docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
 conv_input = DocumentConversionInput.from_streams(docs)
-converted_docs = doc_converter.convert(conv_input)
+results = doc_converter.convert(conv_input)
 ```
 ### Limit resource usage

{docling-1.8.0 → docling-1.8.2}/docling/datamodel/base_models.py RENAMED Viewed

@@ -247,9 +247,9 @@ PageElement = Union[TextElement, TableElement, FigureElement]
 class AssembledUnit(BaseModel):
-    elements: List[PageElement]
-    body: List[PageElement]
-    headers: List[PageElement]
+    elements: List[PageElement] = []
+    body: List[PageElement] = []
+    headers: List[PageElement] = []
 class Page(BaseModel):

{docling-1.8.0 → docling-1.8.2}/docling/datamodel/document.py RENAMED Viewed

@@ -12,6 +12,7 @@ from docling_core.types import PageDimensions, PageReference, Prov, Ref
 from docling_core.types import Table as DsSchemaTable
 from docling_core.types import TableCell
 from pydantic import BaseModel
+from typing_extensions import deprecated
 from docling.backend.abstract_backend import PdfDocumentBackend
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
@@ -49,6 +50,15 @@ layout_label_to_ds_type = {
     "Text": "paragraph",
 }
+_EMPTY_DOC = DsDocument(
+    _name="",
+    description=DsDocumentDescription(logs=[]),
+    file_info=DsFileInfoObject(
+        filename="",
+        document_hash="",
+    ),
+)
 class InputDocument(BaseModel):
     file: PurePath = None
@@ -115,6 +125,7 @@ class InputDocument(BaseModel):
             # raise
+@deprecated("Use `ConversionResult` instead.")
 class ConvertedDocument(BaseModel):
     input: InputDocument
@@ -122,11 +133,11 @@ class ConvertedDocument(BaseModel):
     errors: List[ErrorItem] = []  # structure to keep errors
     pages: List[Page] = []
-    assembled: Optional[AssembledUnit] = None
+    assembled: AssembledUnit = AssembledUnit()
-    output: Optional[DsDocument] = None
+    output: DsDocument = _EMPTY_DOC
-    def to_ds_document(self) -> DsDocument:
+    def _to_ds_document(self) -> DsDocument:
         title = ""
         desc = DsDocumentDescription(logs=[])
@@ -297,16 +308,10 @@ class ConvertedDocument(BaseModel):
         return ds_doc
     def render_as_dict(self):
-        if self.output:
-            return self.output.model_dump(by_alias=True, exclude_none=True)
-        else:
-            return {}
+        return self.output.model_dump(by_alias=True, exclude_none=True)
     def render_as_markdown(self):
-        if self.output:
-            return self.output.export_to_markdown()
-        else:
-            return ""
+        return self.output.export_to_markdown()
     def render_element_images(
         self, element_types: Tuple[PageElement] = (FigureElement,)
@@ -323,6 +328,10 @@ class ConvertedDocument(BaseModel):
                 yield element, cropped_im
+class ConversionResult(ConvertedDocument):
+    pass
 class DocumentConversionInput(BaseModel):
     _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None

{docling-1.8.0 → docling-1.8.2}/docling/document_converter.py RENAMED Viewed

@@ -7,7 +7,6 @@ from pathlib import Path
 from typing import Iterable, Optional, Type, Union
 import requests
-from docling_core.types import Document
 from PIL import ImageDraw
 from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
@@ -22,7 +21,7 @@ from docling.datamodel.base_models import (
     PipelineOptions,
 )
 from docling.datamodel.document import (
-    ConvertedDocument,
+    ConversionResult,
     DocumentConversionInput,
     InputDocument,
 )
@@ -73,7 +72,7 @@ class DocumentConverter:
         return Path(download_path)
-    def convert(self, input: DocumentConversionInput) -> Iterable[ConvertedDocument]:
+    def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
         for input_batch in chunkify(
             input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
@@ -86,9 +85,9 @@ class DocumentConverter:
             #   yield from pool.map(self.process_document, input_batch)
             # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
-            yield from map(self.process_document, input_batch)
+            yield from map(self._process_document, input_batch)
-    def convert_single(self, source: Path | AnyHttpUrl | str) -> Document:
+    def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
         """Convert a single document.
         Args:
@@ -99,7 +98,7 @@ class DocumentConverter:
             RuntimeError: If conversion fails.
         Returns:
-            Document: The converted document object.
+            ConversionResult: The conversion result object.
         """
         with tempfile.TemporaryDirectory() as temp_dir:
             try:
@@ -129,52 +128,49 @@ class DocumentConverter:
                         f"Unexpected file path type encountered: {type(source)}"
                     )
             conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
-            converted_docs_iter = self.convert(conv_inp)
-            converted_doc: ConvertedDocument = next(converted_docs_iter)
-        if converted_doc.status not in {
+            conv_res_iter = self.convert(conv_inp)
+            conv_res: ConversionResult = next(conv_res_iter)
+        if conv_res.status not in {
             ConversionStatus.SUCCESS,
-            ConversionStatus.SUCCESS_WITH_ERRORS,
+            ConversionStatus.PARTIAL_SUCCESS,
         }:
-            raise RuntimeError(f"Conversion failed with status: {converted_doc.status}")
-        doc = converted_doc.to_ds_document()
-        return doc
+            raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
+        return conv_res
-    def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
+    def _process_document(self, in_doc: InputDocument) -> ConversionResult:
         start_doc_time = time.time()
-        converted_doc = ConvertedDocument(input=in_doc)
+        conv_res = ConversionResult(input=in_doc)
         _log.info(f"Processing document {in_doc.file.name}")
         if not in_doc.valid:
-            converted_doc.status = ConversionStatus.FAILURE
-            return converted_doc
+            conv_res.status = ConversionStatus.FAILURE
+            return conv_res
         for i in range(0, in_doc.page_count):
-            converted_doc.pages.append(Page(page_no=i))
+            conv_res.pages.append(Page(page_no=i))
         all_assembled_pages = []
         try:
             # Iterate batches of pages (page_batch_size) in the doc
-            for page_batch in chunkify(
-                converted_doc.pages, settings.perf.page_batch_size
-            ):
+            for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
                 start_pb_time = time.time()
                 # Pipeline
                 # 1. Initialise the page resources
                 init_pages = map(
-                    functools.partial(self.initialize_page, in_doc), page_batch
+                    functools.partial(self._initialize_page, in_doc), page_batch
                 )
                 # 2. Populate page image
                 pages_with_images = map(
-                    functools.partial(self.populate_page_images, in_doc), init_pages
+                    functools.partial(self._populate_page_images, in_doc), init_pages
                 )
                 # 3. Populate programmatic page cells
                 pages_with_cells = map(
-                    functools.partial(self.parse_page_cells, in_doc),
+                    functools.partial(self._parse_page_cells, in_doc),
                     pages_with_images,
                 )
@@ -203,13 +199,13 @@ class DocumentConverter:
             # Free up mem resources of PDF backend
             in_doc._backend.unload()
-            converted_doc.pages = all_assembled_pages
-            self.assemble_doc(converted_doc)
+            conv_res.pages = all_assembled_pages
+            self._assemble_doc(conv_res)
             status = ConversionStatus.SUCCESS
-            for page in converted_doc.pages:
+            for page in conv_res.pages:
                 if not page._backend.is_valid():
-                    converted_doc.errors.append(
+                    conv_res.errors.append(
                         ErrorItem(
                             component_type=DoclingComponentType.PDF_BACKEND,
                             module_name=type(page._backend).__name__,
@@ -218,10 +214,10 @@ class DocumentConverter:
                     )
                     status = ConversionStatus.PARTIAL_SUCCESS
-            converted_doc.status = status
+            conv_res.status = status
         except Exception as e:
-            converted_doc.status = ConversionStatus.FAILURE
+            conv_res.status = ConversionStatus.FAILURE
             trace = "\n".join(traceback.format_exception(e))
             _log.info(
                 f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
@@ -233,10 +229,10 @@ class DocumentConverter:
             f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
         )
-        return converted_doc
+        return conv_res
     # Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
-    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
+    def _initialize_page(self, doc: InputDocument, page: Page) -> Page:
         page._backend = doc._backend.load_page(page.page_no)
         page.size = page._backend.get_size()
         page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
@@ -244,7 +240,7 @@ class DocumentConverter:
         return page
     # Generate the page image and store it in the page object
-    def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
+    def _populate_page_images(self, doc: InputDocument, page: Page) -> Page:
         # default scale
         page.get_image(
             scale=1.0
@@ -260,7 +256,7 @@ class DocumentConverter:
         return page
     # Extract and populate the page cells and store it in the page object
-    def parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
+    def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
         page.cells = page._backend.get_text_cells()
         # DEBUG code:
@@ -275,12 +271,12 @@ class DocumentConverter:
         return page
-    def assemble_doc(self, converted_doc: ConvertedDocument):
+    def _assemble_doc(self, conv_res: ConversionResult):
         all_elements = []
         all_headers = []
         all_body = []
-        for p in converted_doc.pages:
+        for p in conv_res.pages:
             for el in p.assembled.body:
                 all_body.append(el)
@@ -289,8 +285,8 @@ class DocumentConverter:
             for el in p.assembled.elements:
                 all_elements.append(el)
-        converted_doc.assembled = AssembledUnit(
+        conv_res.assembled = AssembledUnit(
             elements=all_elements, headers=all_headers, body=all_body
         )
-        converted_doc.output = self.glm_model(converted_doc)
+        conv_res.output = self.glm_model(conv_res)

{docling-1.8.0 → docling-1.8.2}/docling/models/ds_glm_model.py RENAMED Viewed

@@ -10,7 +10,7 @@ from docling_core.types import Ref
 from PIL import ImageDraw
 from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
-from docling.datamodel.document import ConvertedDocument
+from docling.datamodel.document import ConversionResult
 class GlmModel:
@@ -20,8 +20,8 @@ class GlmModel:
         model = init_nlp_model(model_names="language;term;reference")
         self.model = model
-    def __call__(self, document: ConvertedDocument) -> DsDocument:
-        ds_doc = document.to_ds_document()
+    def __call__(self, conv_res: ConversionResult) -> DsDocument:
+        ds_doc = conv_res._to_ds_document()
         ds_doc_dict = ds_doc.model_dump(by_alias=True)
         glm_doc = self.model.apply_on_doc(ds_doc_dict)
@@ -34,7 +34,7 @@ class GlmModel:
         # DEBUG code:
         def draw_clusters_and_cells(ds_document, page_no):
             clusters_to_draw = []
-            image = copy.deepcopy(document.pages[page_no].image)
+            image = copy.deepcopy(conv_res.pages[page_no].image)
             for ix, elem in enumerate(ds_document.main_text):
                 if isinstance(elem, BaseText):
                     prov = elem.prov[0]
@@ -56,7 +56,7 @@ class GlmModel:
                             bbox=BoundingBox.from_tuple(
                                 coord=prov.bbox,
                                 origin=CoordOrigin.BOTTOMLEFT,
-                            ).to_top_left_origin(document.pages[page_no].size.height),
+                            ).to_top_left_origin(conv_res.pages[page_no].size.height),
                         )
                     )

{docling-1.8.0 → docling-1.8.2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "1.8.0"  # DO NOT EDIT, updated automatically
+version = "1.8.2"  # DO NOT EDIT, updated automatically
 description = "Docling PDF conversion package"
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"