PyPI - docling - Versions diffs - 2.3.0__py3-none-any.whl → 2.4.0__py3-none-any.whl - Mend

docling 2.3.0__py3-none-any.whl → 2.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

docling/cli/main.py CHANGED Viewed

@@ -5,12 +5,15 @@ import time
 import warnings
 from enum import Enum
 from pathlib import Path
-from typing import Annotated, Dict, Iterable, List, Optional
+from typing import Annotated, Dict, Iterable, List, Optional, Type
 import typer
 from docling_core.utils.file import resolve_file_source
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
+from docling.backend.pdf_backend import PdfDocumentBackend
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import (
     ConversionStatus,
     FormatToExtensions,
@@ -22,6 +25,7 @@ from docling.datamodel.pipeline_options import (
     EasyOcrOptions,
     OcrOptions,
     PdfPipelineOptions,
+    TableFormerMode,
     TesseractCliOcrOptions,
     TesseractOcrOptions,
 )
@@ -58,9 +62,10 @@ def version_callback(value: bool):
 # Define an enum for the backend options
-class Backend(str, Enum):
+class PdfBackend(str, Enum):
     PYPDFIUM2 = "pypdfium2"
-    DOCLING = "docling"
+    DLPARSE_V1 = "dlparse_v1"
+    DLPARSE_V2 = "dlparse_v2"
 # Define an enum for the ocr engines
@@ -90,28 +95,28 @@ def export_documents(
             # Export Deep Search document JSON format:
             if export_json:
                 fname = output_dir / f"{doc_filename}.json"
-                with fname.open("w") as fp:
+                with fname.open("w", encoding="utf8") as fp:
                     _log.info(f"writing JSON output to {fname}")
                     fp.write(json.dumps(conv_res.document.export_to_dict()))
             # Export Text format:
             if export_txt:
                 fname = output_dir / f"{doc_filename}.txt"
-                with fname.open("w") as fp:
+                with fname.open("w", encoding="utf8") as fp:
                     _log.info(f"writing Text output to {fname}")
                     fp.write(conv_res.document.export_to_markdown(strict_text=True))
             # Export Markdown format:
             if export_md:
                 fname = output_dir / f"{doc_filename}.md"
-                with fname.open("w") as fp:
+                with fname.open("w", encoding="utf8") as fp:
                     _log.info(f"writing Markdown output to {fname}")
                     fp.write(conv_res.document.export_to_markdown())
             # Export Document Tags format:
             if export_doctags:
                 fname = output_dir / f"{doc_filename}.doctags"
-                with fname.open("w") as fp:
+                with fname.open("w", encoding="utf8") as fp:
                     _log.info(f"writing Doc Tags output to {fname}")
                     fp.write(conv_res.document.export_to_document_tokens())
@@ -151,6 +156,17 @@ def convert(
     ocr_engine: Annotated[
         OcrEngine, typer.Option(..., help="The OCR engine to use.")
     ] = OcrEngine.EASYOCR,
+    pdf_backend: Annotated[
+        PdfBackend, typer.Option(..., help="The PDF backend to use.")
+    ] = PdfBackend.DLPARSE_V1,
+    table_mode: Annotated[
+        TableFormerMode,
+        typer.Option(..., help="The mode to use in the table structure model."),
+    ] = TableFormerMode.FAST,
+    artifacts_path: Annotated[
+        Optional[Path],
+        typer.Option(..., help="If provided, the location of the model artifacts."),
+    ] = None,
     abort_on_error: Annotated[
         bool,
         typer.Option(
@@ -217,11 +233,25 @@ def convert(
         do_table_structure=True,
     )
     pipeline_options.table_structure_options.do_cell_matching = True  # do_cell_matching
+    pipeline_options.table_structure_options.mode = table_mode
+    if artifacts_path is not None:
+        pipeline_options.artifacts_path = artifacts_path
+    match pdf_backend:
+        case PdfBackend.DLPARSE_V1:
+            backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
+        case PdfBackend.DLPARSE_V2:
+            backend = DoclingParseV2DocumentBackend
+        case PdfBackend.PYPDFIUM2:
+            backend = PyPdfiumDocumentBackend
+        case _:
+            raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
     format_options: Dict[InputFormat, FormatOption] = {
         InputFormat.PDF: PdfFormatOption(
             pipeline_options=pipeline_options,
-            backend=DoclingParseDocumentBackend,  # pdf_backend
+            backend=backend,  # pdf_backend
         )
     }
     doc_converter = DocumentConverter(

docling/datamodel/pipeline_options.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from enum import Enum, auto
+from enum import Enum
 from pathlib import Path
 from typing import List, Literal, Optional, Union
@@ -6,8 +6,8 @@ from pydantic import BaseModel, ConfigDict, Field
 class TableFormerMode(str, Enum):
-    FAST = auto()
-    ACCURATE = auto()
+    FAST = "fast"
+    ACCURATE = "accurate"
 class TableStructureOptions(BaseModel):

docling/document_converter.py CHANGED Viewed

@@ -139,6 +139,10 @@ class DocumentConverter:
         self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
+    def initialize_pipeline(self, format: InputFormat):
+        """Initialize the conversion pipeline for the selected format."""
+        self._get_pipeline(doc_format=format)
     @validate_call(config=ConfigDict(strict=True))
     def convert(
         self,
@@ -219,13 +223,13 @@ class DocumentConverter:
                 else:
                     _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
-    def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
+    def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
         assert self.format_to_options is not None
-        fopt = self.format_to_options.get(doc.format)
+        fopt = self.format_to_options.get(doc_format)
         if fopt is None:
-            raise RuntimeError(f"Could not get pipeline for document {doc.file}")
+            raise RuntimeError(f"Could not get pipeline for {doc_format}")
         else:
             pipeline_class = fopt.pipeline_cls
             pipeline_options = fopt.pipeline_options
@@ -256,7 +260,7 @@ class DocumentConverter:
         self, in_doc: InputDocument, raises_on_error: bool
     ) -> ConversionResult:
         if in_doc.valid:
-            pipeline = self._get_pipeline(in_doc)
+            pipeline = self._get_pipeline(in_doc.format)
             if pipeline is None:  # Can't find a default pipeline. Should this raise?
                 if raises_on_error:
                     raise RuntimeError(

{docling-2.3.0.dist-info → docling-2.4.0.dist-info}/METADATA RENAMED Viewed

@@ -1,10 +1,10 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.3.0
-Summary: Docling PDF conversion package
+Version: 2.4.0
+Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
-Keywords: docling,convert,document,pdf,layout model,segmentation,table structure,table former
+Keywords: docling,convert,document,pdf,docx,html,markdown,layout model,segmentation,table structure,table former
 Author: Christoph Auer
 Author-email: cau@zurich.ibm.com
 Requires-Python: >=3.10,<4.0
@@ -23,9 +23,9 @@ Provides-Extra: tesserocr
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
-Requires-Dist: docling-core (>=2.2.3,<3.0.0)
-Requires-Dist: docling-ibm-models (>=2.0.1,<3.0.0)
-Requires-Dist: docling-parse (>=2.0.0,<3.0.0)
+Requires-Dist: docling-core (>=2.3.0,<3.0.0)
+Requires-Dist: docling-ibm-models (>=2.0.3,<3.0.0)
+Requires-Dist: docling-parse (>=2.0.2,<3.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -41,10 +41,6 @@ Requires-Dist: requests (>=2.32.3,<3.0.0)
 Requires-Dist: rtree (>=1.3.0,<2.0.0)
 Requires-Dist: scipy (>=1.14.1,<2.0.0)
 Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
-Requires-Dist: torch (>=2.2.2,<2.3.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
-Requires-Dist: torch (>=2.2.2,<3.0.0) ; sys_platform != "darwin" or platform_machine != "x86_64"
-Requires-Dist: torchvision (>=0,<1) ; sys_platform != "darwin" or platform_machine != "x86_64"
-Requires-Dist: torchvision (>=0.17.2,<0.18.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
 Requires-Dist: typer (>=0.12.5,<0.13.0)
 Project-URL: Repository, https://github.com/DS4SD/docling
 Description-Content-Type: text/markdown

{docling-2.3.0.dist-info → docling-2.4.0.dist-info}/RECORD RENAMED Viewed

@@ -11,13 +11,13 @@ docling/backend/msword_backend.py,sha256=FAUdP74QxGKo2xMZQ4WQGYwtpIBCTJ_FG17PBpR
 docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
 docling/backend/pypdfium2_backend.py,sha256=MJX6fQqwK3r967fyAAs-RA_YIkeQvhgsLkQAgaBTgaE,8995
 docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/cli/main.py,sha256=NRVGz0z-3EBwYNMJGVnLtDBcfOeutaUyYdkM0ymRnGA,8008
+docling/cli/main.py,sha256=IOeIpGoK_5AeE_6LYTU_nfZjqpZ5xeGaTCB8Vfsama0,9334
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/datamodel/base_models.py,sha256=fmkS6iTxGZCTtNCo2zsgMmBC11Ogf2Ht-mNIlZ9GP-o,5375
 docling/datamodel/document.py,sha256=9dQf_J18X_MEWs-Mg3Ed6BykFPJ79ETmkkxcssY-vYo,20698
-docling/datamodel/pipeline_options.py,sha256=WNjluKC-Ww63ifkGMHwws8zIDHnOS1z5Hw7_j3S0qao,2446
+docling/datamodel/pipeline_options.py,sha256=PqQ4VjMDN16oWZSUYtskQEH366504OZmnjinCaOWmMc,2444
 docling/datamodel/settings.py,sha256=2-sYEnKLV_giGygUlBtiBd4CJYN5T9-3BdL6NpWkUYw,1155
-docling/document_converter.py,sha256=Y0Tngh-seNSty7Ov71DDAJzbBgruoEdwYPunVn7DT00,10413
+docling/document_converter.py,sha256=U52_rZQDm2wzrnsuUrvsfX2MnmOWFFhjBzfS8tEvt6Y,10595
 docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
 docling/models/base_ocr_model.py,sha256=Ti0glL-_DVRfmP3MpywYVmkNf5RP6qhRg_UKzJuV1Dc,5663
@@ -38,8 +38,8 @@ docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
 docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
 docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
 docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
-docling-2.3.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
-docling-2.3.0.dist-info/METADATA,sha256=e3LTQgbktuUHzQlI4qXDhIDMGOX0duC1EJWws6j6_y8,6373
-docling-2.3.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling-2.3.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
-docling-2.3.0.dist-info/RECORD,,
+docling-2.4.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.4.0.dist-info/METADATA,sha256=9o2Nd020wn0UeQ7d0ABRQt6UnVagPxTFson9bDzcbEA,6116
+docling-2.4.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling-2.4.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
+docling-2.4.0.dist-info/RECORD,,

{docling-2.3.0.dist-info → docling-2.4.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{docling-2.3.0.dist-info → docling-2.4.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{docling-2.3.0.dist-info → docling-2.4.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

docling 2.3.0__py3-none-any.whl → 2.4.0__py3-none-any.whl

docling 2.3.0__py3-none-any.whl → 2.4.0__py3-none-any.whl