PyPI - docling - Versions diffs - 1.2.0.tar.gz → 1.3.0.tar.gz - Mend

docling 1.2.0.tar.gz → 1.3.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

{docling-1.2.0 → docling-1.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 1.2.0
+Version: 1.3.0
 Summary: Docling PDF conversion package
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -21,6 +21,7 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Provides-Extra: easyocr
 Provides-Extra: ocr
+Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=0.19.0,<1)
 Requires-Dist: docling-core (>=1.1.2,<2.0.0)
 Requires-Dist: docling-ibm-models (>=1.1.0,<2.0.0)
@@ -93,17 +94,21 @@ print(doc.export_to_markdown())  # output: "## DocLayNet: A Large Human-Annotate
 ### Convert a batch of documents
-For an example of batch-converting documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
+For an example of batch-converting documents, see [batch_convert.py](https://github.com/DS4SD/docling/blob/main/examples/batch_convert.py).
 From a local repo clone, you can run it with:
 ```
-python examples/convert.py
+python examples/batch_convert.py
 ```
 The output of the above command will be written to `./scratch`.
 ### Adjust pipeline features
+The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
+one can adjust the conversion pipeline and features.
 #### Control pipeline options
 You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:

{docling-1.2.0 → docling-1.3.0}/README.md RENAMED Viewed

@@ -56,17 +56,21 @@ print(doc.export_to_markdown())  # output: "## DocLayNet: A Large Human-Annotate
 ### Convert a batch of documents
-For an example of batch-converting documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
+For an example of batch-converting documents, see [batch_convert.py](https://github.com/DS4SD/docling/blob/main/examples/batch_convert.py).
 From a local repo clone, you can run it with:
 ```
-python examples/convert.py
+python examples/batch_convert.py
 ```
 The output of the above command will be written to `./scratch`.
 ### Adjust pipeline features
+The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
+one can adjust the conversion pipeline and features.
 #### Control pipeline options
 You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:

{docling-1.2.0 → docling-1.3.0}/docling/backend/abstract_backend.py RENAMED Viewed

@@ -35,7 +35,7 @@ class PdfPageBackend(ABC):
 class PdfDocumentBackend(ABC):
     @abstractmethod
-    def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
+    def __init__(self, path_or_stream: Union[BytesIO, Path]):
         pass
     @abstractmethod

{docling-1.2.0 → docling-1.3.0}/docling/backend/docling_parse_backend.py RENAMED Viewed

@@ -146,11 +146,12 @@ class DoclingParsePageBackend(PdfPageBackend):
 class DoclingParseDocumentBackend(PdfDocumentBackend):
-    def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
+    def __init__(self, path_or_stream: Union[BytesIO, Path]):
         super().__init__(path_or_stream)
         self._pdoc = pdfium.PdfDocument(path_or_stream)
         # Parsing cells with docling_parser call
-        print("PARSING WITH DOCLING PARSE")
+        if isinstance(path_or_stream, BytesIO):
+            raise NotImplemented("This backend does not support byte streams yet.")
         parser = pdf_parser()
         self._parser_doc = parser.find_cells(str(path_or_stream))

{docling-1.2.0 → docling-1.3.0}/docling/backend/pypdfium2_backend.py RENAMED Viewed

@@ -199,7 +199,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
 class PyPdfiumDocumentBackend(PdfDocumentBackend):
-    def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
+    def __init__(self, path_or_stream: Union[BytesIO, Path]):
         super().__init__(path_or_stream)
         self._pdoc = pdfium.PdfDocument(path_or_stream)

{docling-1.2.0 → docling-1.3.0}/docling/datamodel/base_models.py RENAMED Viewed

@@ -265,3 +265,9 @@ class PipelineOptions(BaseModel):
     do_ocr: bool = False  # True: perform OCR, replace programmatic PDF text
     table_structure_options: TableStructureOptions = TableStructureOptions()
+class AssembleOptions(BaseModel):
+    keep_page_images: bool = (
+        False  # False: page images are removed in the assemble step
+    )

{docling-1.2.0 → docling-1.3.0}/docling/document_converter.py RENAMED Viewed

@@ -14,6 +14,7 @@ from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
 from docling.backend.abstract_backend import PdfDocumentBackend
 from docling.datamodel.base_models import (
     AssembledUnit,
+    AssembleOptions,
     ConversionStatus,
     Page,
     PipelineOptions,
@@ -44,6 +45,7 @@ class DocumentConverter:
         pipeline_options: PipelineOptions = PipelineOptions(),
         pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
         pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
+        assemble_options: AssembleOptions = AssembleOptions(),
     ):
         if not artifacts_path:
             artifacts_path = self.download_models_hf()
@@ -57,6 +59,7 @@ class DocumentConverter:
         self.page_assemble_model = PageAssembleModel(config={})
         self.glm_model = GlmModel(config={})
         self.pdf_backend = pdf_backend
+        self.assemble_options = assemble_options
     @staticmethod
     def download_models_hf(
@@ -174,17 +177,23 @@ class DocumentConverter:
                     pages_with_images,
                 )
+                # 4. Run pipeline stages
                 pipeline_pages = self.model_pipeline.apply(pages_with_cells)
-                # 7. Assemble page elements (per page)
+                # 5. Assemble page elements (per page)
                 assembled_pages = self.page_assemble_model(pipeline_pages)
                 # exhaust assembled_pages
                 for assembled_page in assembled_pages:
                     # Free up mem resources before moving on with next batch
-                    assembled_page.image = (
-                        None  # Comment this if you want to visualize page images
-                    )
+                    # Remove page images (can be disabled)
+                    if not self.assemble_options.keep_page_images:
+                        assembled_page.image = (
+                            None  # Comment this if you want to visualize page images
+                        )
+                    # Unload backend
                     assembled_page._backend.unload()
                     all_assembled_pages.append(assembled_page)

{docling-1.2.0 → docling-1.3.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "1.2.0"  # DO NOT EDIT, updated automatically
+version = "1.3.0"  # DO NOT EDIT, updated automatically
 description = "Docling PDF conversion package"
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"
@@ -33,6 +33,7 @@ huggingface_hub = ">=0.23,<1"
 requests = "^2.32.3"
 easyocr = { version = "^1.7", optional = true }
 docling-parse = "^0.0.1"
+certifi = ">=2024.7.4"
 [tool.poetry.group.dev.dependencies]
 black = {extras = ["jupyter"], version = "^24.4.2"}