PyPI - docling - Versions diffs - 2.10.0__tar.gz → 2.11.0__tar.gz - Mend

docling 2.10.0.tar.gz → 2.11.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

{docling-2.10.0 → docling-2.11.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.10.0
+Version: 2.11.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT

{docling-2.10.0 → docling-2.11.0}/docling/cli/main.py RENAMED Viewed

@@ -27,8 +27,10 @@ from docling.datamodel.base_models import (
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
     EasyOcrOptions,
+    OcrEngine,
     OcrMacOptions,
     OcrOptions,
+    PdfBackend,
     PdfPipelineOptions,
     RapidOcrOptions,
     TableFormerMode,
@@ -68,22 +70,6 @@ def version_callback(value: bool):
         raise typer.Exit()
-# Define an enum for the backend options
-class PdfBackend(str, Enum):
-    PYPDFIUM2 = "pypdfium2"
-    DLPARSE_V1 = "dlparse_v1"
-    DLPARSE_V2 = "dlparse_v2"
-# Define an enum for the ocr engines
-class OcrEngine(str, Enum):
-    EASYOCR = "easyocr"
-    TESSERACT_CLI = "tesseract_cli"
-    TESSERACT = "tesseract"
-    OCRMAC = "ocrmac"
-    RAPIDOCR = "rapidocr"
 def export_documents(
     conv_results: Iterable[ConversionResult],
     output_dir: Path,
@@ -264,6 +250,13 @@ def convert(
             help="Show version information.",
         ),
     ] = None,
+    document_timeout: Annotated[
+        Optional[float],
+        typer.Option(
+            ...,
+            help="The timeout for processing each document, in seconds.",
+        ),
+    ] = None,
 ):
     if verbose == 0:
         logging.basicConfig(level=logging.WARNING)
@@ -347,6 +340,7 @@ def convert(
             do_ocr=ocr,
             ocr_options=ocr_options,
             do_table_structure=True,
+            document_timeout=document_timeout,
         )
         pipeline_options.table_structure_options.do_cell_matching = (
             True  # do_cell_matching

{docling-2.10.0 → docling-2.11.0}/docling/datamodel/base_models.py RENAMED Viewed

@@ -19,12 +19,12 @@ if TYPE_CHECKING:
 class ConversionStatus(str, Enum):
-    PENDING = auto()
-    STARTED = auto()
-    FAILURE = auto()
-    SUCCESS = auto()
-    PARTIAL_SUCCESS = auto()
-    SKIPPED = auto()
+    PENDING = "pending"
+    STARTED = "started"
+    FAILURE = "failure"
+    SUCCESS = "success"
+    PARTIAL_SUCCESS = "partial_success"
+    SKIPPED = "skipped"
 class InputFormat(str, Enum):
@@ -89,15 +89,15 @@ MimeTypeToFormat = {
 class DocInputType(str, Enum):
-    PATH = auto()
-    STREAM = auto()
+    PATH = "path"
+    STREAM = "stream"
 class DoclingComponentType(str, Enum):
-    DOCUMENT_BACKEND = auto()
-    MODEL = auto()
-    DOC_ASSEMBLER = auto()
-    USER_INPUT = auto()
+    DOCUMENT_BACKEND = "document_backend"
+    MODEL = "model"
+    DOC_ASSEMBLER = "doc_assembler"
+    USER_INPUT = "user_input"
 class ErrorItem(BaseModel):

{docling-2.10.0 → docling-2.11.0}/docling/datamodel/pipeline_options.py RENAMED Viewed

@@ -126,12 +126,33 @@ class OcrMacOptions(OcrOptions):
     )
+# Define an enum for the backend options
+class PdfBackend(str, Enum):
+    """Enum of valid PDF backends."""
+    PYPDFIUM2 = "pypdfium2"
+    DLPARSE_V1 = "dlparse_v1"
+    DLPARSE_V2 = "dlparse_v2"
+# Define an enum for the ocr engines
+class OcrEngine(str, Enum):
+    """Enum of valid OCR engines."""
+    EASYOCR = "easyocr"
+    TESSERACT_CLI = "tesseract_cli"
+    TESSERACT = "tesseract"
+    OCRMAC = "ocrmac"
+    RAPIDOCR = "rapidocr"
 class PipelineOptions(BaseModel):
     """Base pipeline options."""
     create_legacy_output: bool = (
-        True  # This defautl will be set to False on a future version of docling
+        True  # This default will be set to False on a future version of docling
     )
+    document_timeout: Optional[float] = None
 class PdfPipelineOptions(PipelineOptions):

{docling-2.10.0 → docling-2.11.0}/docling/models/ds_glm_model.py RENAMED Viewed

@@ -3,8 +3,7 @@ import random
 from pathlib import Path
 from typing import List, Union
-from deepsearch_glm.nlp_utils import init_nlp_model
-from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
+from deepsearch_glm.andromeda_nlp import nlp_model
 from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
 from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
 from docling_core.types.legacy_doc.base import (
@@ -43,9 +42,7 @@ class GlmModel:
     def __init__(self, options: GlmOptions):
         self.options = options
-        if self.options.model_names != "":
-            load_pretrained_nlp_models()
-        self.model = init_nlp_model(model_names=self.options.model_names)
+        self.model = nlp_model(loglevel="error", text_ordering=True)
     def _to_legacy_document(self, conv_res) -> DsDocument:
         title = ""

{docling-2.10.0 → docling-2.11.0}/docling/models/rapid_ocr_model.py RENAMED Viewed

@@ -118,24 +118,25 @@ class RapidOcrModel(BaseOcrModel):
                         del high_res_image
                         del im
-                        cells = [
-                            OcrCell(
-                                id=ix,
-                                text=line[1],
-                                confidence=line[2],
-                                bbox=BoundingBox.from_tuple(
-                                    coord=(
-                                        (line[0][0][0] / self.scale) + ocr_rect.l,
-                                        (line[0][0][1] / self.scale) + ocr_rect.t,
-                                        (line[0][2][0] / self.scale) + ocr_rect.l,
-                                        (line[0][2][1] / self.scale) + ocr_rect.t,
+                        if result is not None:
+                            cells = [
+                                OcrCell(
+                                    id=ix,
+                                    text=line[1],
+                                    confidence=line[2],
+                                    bbox=BoundingBox.from_tuple(
+                                        coord=(
+                                            (line[0][0][0] / self.scale) + ocr_rect.l,
+                                            (line[0][0][1] / self.scale) + ocr_rect.t,
+                                            (line[0][2][0] / self.scale) + ocr_rect.l,
+                                            (line[0][2][1] / self.scale) + ocr_rect.t,
+                                        ),
+                                        origin=CoordOrigin.TOPLEFT,
                                     ),
-                                    origin=CoordOrigin.TOPLEFT,
-                                ),
-                            )
-                            for ix, line in enumerate(result)
-                        ]
-                        all_ocr_cells.extend(cells)
+                                )
+                                for ix, line in enumerate(result)
+                            ]
+                            all_ocr_cells.extend(cells)
                     # Post-process the cells
                     page.cells = self.post_process_cells(all_ocr_cells, page.cells)

{docling-2.10.0 → docling-2.11.0}/docling/pipeline/base_pipeline.py RENAMED Viewed

@@ -126,6 +126,7 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
             # conv_res.status = ConversionStatus.FAILURE
             # return conv_res
+        total_elapsed_time = 0.0
         with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
             for i in range(0, conv_res.input.page_count):
@@ -136,7 +137,7 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                 for page_batch in chunkify(
                     conv_res.pages, settings.perf.page_batch_size
                 ):
-                    start_pb_time = time.time()
+                    start_batch_time = time.monotonic()
                     # 1. Initialise the page resources
                     init_pages = map(
@@ -149,8 +150,21 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                     for p in pipeline_pages:  # Must exhaust!
                         pass
-                    end_pb_time = time.time() - start_pb_time
-                    _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
+                    end_batch_time = time.monotonic()
+                    total_elapsed_time += end_batch_time - start_batch_time
+                    if (
+                        self.pipeline_options.document_timeout is not None
+                        and total_elapsed_time > self.pipeline_options.document_timeout
+                    ):
+                        _log.warning(
+                            f"Document processing time ({total_elapsed_time:.3f} seconds) exceeded the specified timeout of {self.pipeline_options.document_timeout:.3f} seconds"
+                        )
+                        conv_res.status = ConversionStatus.PARTIAL_SUCCESS
+                        break
+                    _log.debug(
+                        f"Finished converting page batch time={end_batch_time:.3f}"
+                    )
             except Exception as e:
                 conv_res.status = ConversionStatus.FAILURE

{docling-2.10.0 → docling-2.11.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "2.10.0"  # DO NOT EDIT, updated automatically
+version = "2.11.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"