PyPI - docling - Versions diffs - 2.26.0__py3-none-any.whl → 2.28.0__py3-none-any.whl - Mend

docling 2.26.0__py3-none-any.whl → 2.28.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

docling/backend/asciidoc_backend.py +1 -1
docling/backend/csv_backend.py +1 -1
docling/backend/docling_parse_backend.py +21 -13
docling/backend/docling_parse_v2_backend.py +20 -12
docling/backend/docling_parse_v4_backend.py +192 -0
docling/backend/docx/__init__.py +0 -0
docling/backend/docx/latex/__init__.py +0 -0
docling/backend/docx/latex/latex_dict.py +271 -0
docling/backend/docx/latex/omml.py +453 -0
docling/backend/html_backend.py +7 -7
docling/backend/md_backend.py +1 -1
docling/backend/msexcel_backend.py +2 -45
docling/backend/mspowerpoint_backend.py +19 -1
docling/backend/msword_backend.py +68 -3
docling/backend/pdf_backend.py +7 -2
docling/backend/pypdfium2_backend.py +52 -30
docling/backend/xml/uspto_backend.py +1 -1
docling/cli/main.py +135 -53
docling/cli/models.py +1 -1
docling/datamodel/base_models.py +8 -10
docling/datamodel/pipeline_options.py +54 -32
docling/document_converter.py +5 -5
docling/models/base_model.py +9 -1
docling/models/base_ocr_model.py +27 -16
docling/models/easyocr_model.py +28 -13
docling/models/factories/__init__.py +27 -0
docling/models/factories/base_factory.py +122 -0
docling/models/factories/ocr_factory.py +11 -0
docling/models/factories/picture_description_factory.py +11 -0
docling/models/hf_mlx_model.py +137 -0
docling/models/ocr_mac_model.py +39 -11
docling/models/page_preprocessing_model.py +4 -0
docling/models/picture_description_api_model.py +20 -3
docling/models/picture_description_base_model.py +19 -3
docling/models/picture_description_vlm_model.py +14 -2
docling/models/plugins/__init__.py +0 -0
docling/models/plugins/defaults.py +28 -0
docling/models/rapid_ocr_model.py +34 -13
docling/models/table_structure_model.py +13 -4
docling/models/tesseract_ocr_cli_model.py +40 -15
docling/models/tesseract_ocr_model.py +37 -12
docling/pipeline/standard_pdf_pipeline.py +25 -78
docling/pipeline/vlm_pipeline.py +78 -398
docling/utils/export.py +8 -6
docling/utils/layout_postprocessor.py +26 -23
docling/utils/visualization.py +1 -1
{docling-2.26.0.dist-info → docling-2.28.0.dist-info}/METADATA +47 -23
docling-2.28.0.dist-info/RECORD +84 -0
{docling-2.26.0.dist-info → docling-2.28.0.dist-info}/entry_points.txt +3 -0
docling-2.26.0.dist-info/RECORD +0 -72
{docling-2.26.0.dist-info → docling-2.28.0.dist-info}/LICENSE +0 -0
{docling-2.26.0.dist-info → docling-2.28.0.dist-info}/WHEEL +0 -0

docling/models/tesseract_ocr_model.py CHANGED Viewed

@@ -1,11 +1,17 @@
 import logging
-from typing import Iterable
+from pathlib import Path
+from typing import Iterable, Optional, Type
 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, TextCell
-from docling.datamodel.base_models import Cell, OcrCell, Page
+from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import TesseractOcrOptions
+from docling.datamodel.pipeline_options import (
+    AcceleratorOptions,
+    OcrOptions,
+    TesseractOcrOptions,
+)
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.utils.ocr_utils import map_tesseract_script
@@ -15,8 +21,19 @@ _log = logging.getLogger(__name__)
 class TesseractOcrModel(BaseOcrModel):
-    def __init__(self, enabled: bool, options: TesseractOcrOptions):
-        super().__init__(enabled=enabled, options=options)
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        options: TesseractOcrOptions,
+        accelerator_options: AcceleratorOptions,
+    ):
+        super().__init__(
+            enabled=enabled,
+            artifacts_path=artifacts_path,
+            options=options,
+            accelerator_options=accelerator_options,
+        )
         self.options: TesseractOcrOptions
         self.scale = 3  # multiplier for 72 dpi == 216 dpi.
@@ -31,14 +48,14 @@ class TesseractOcrModel(BaseOcrModel):
                 "Note that tesserocr might have to be manually compiled for working with "
                 "your Tesseract installation. The Docling documentation provides examples for it. "
                 "Alternatively, Docling has support for other OCR engines. See the documentation: "
-                "https://ds4sd.github.io/docling/installation/"
+                "https://docling-project.github.io/docling/installation/"
             )
             missing_langs_errmsg = (
                 "tesserocr is not correctly configured. No language models have been detected. "
                 "Please ensure that the TESSDATA_PREFIX envvar points to tesseract languages dir. "
                 "You can find more information how to setup other OCR engines in Docling "
                 "documentation: "
-                "https://ds4sd.github.io/docling/installation/"
+                "https://docling-project.github.io/docling/installation/"
             )
             try:
@@ -173,13 +190,17 @@ class TesseractOcrModel(BaseOcrModel):
                             top = (box["y"] + box["h"]) / self.scale
                             cells.append(
-                                OcrCell(
-                                    id=ix,
+                                TextCell(
+                                    index=ix,
                                     text=text,
+                                    orig=text,
+                                    from_ocr=True,
                                     confidence=confidence,
-                                    bbox=BoundingBox.from_tuple(
-                                        coord=(left, top, right, bottom),
-                                        origin=CoordOrigin.TOPLEFT,
+                                    rect=BoundingRectangle.from_bounding_box(
+                                        BoundingBox.from_tuple(
+                                            coord=(left, top, right, bottom),
+                                            origin=CoordOrigin.TOPLEFT,
+                                        ),
                                     ),
                                 )
                             )
@@ -195,3 +216,7 @@ class TesseractOcrModel(BaseOcrModel):
                     self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
                 yield page
+    @classmethod
+    def get_options_type(cls) -> Type[OcrOptions]:
+        return TesseractOcrOptions

docling/pipeline/standard_pdf_pipeline.py CHANGED Viewed

@@ -10,16 +10,7 @@ from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import AssembledUnit, Page
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import (
-    EasyOcrOptions,
-    OcrMacOptions,
-    PdfPipelineOptions,
-    PictureDescriptionApiOptions,
-    PictureDescriptionVlmOptions,
-    RapidOcrOptions,
-    TesseractCliOcrOptions,
-    TesseractOcrOptions,
-)
+from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
@@ -27,22 +18,16 @@ from docling.models.document_picture_classifier import (
     DocumentPictureClassifier,
     DocumentPictureClassifierOptions,
 )
-from docling.models.easyocr_model import EasyOcrModel
+from docling.models.factories import get_ocr_factory, get_picture_description_factory
 from docling.models.layout_model import LayoutModel
-from docling.models.ocr_mac_model import OcrMacModel
 from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
 from docling.models.page_preprocessing_model import (
     PagePreprocessingModel,
     PagePreprocessingOptions,
 )
-from docling.models.picture_description_api_model import PictureDescriptionApiModel
 from docling.models.picture_description_base_model import PictureDescriptionBaseModel
-from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
-from docling.models.rapid_ocr_model import RapidOcrModel
 from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
 from docling.models.table_structure_model import TableStructureModel
-from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
-from docling.models.tesseract_ocr_model import TesseractOcrModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.model_downloader import download_models
 from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -78,16 +63,14 @@ class StandardPdfPipeline(PaginatedPipeline):
         self.glm_model = ReadingOrderModel(options=ReadingOrderOptions())
-        if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None:
-            raise RuntimeError(
-                f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
-            )
+        ocr_model = self.get_ocr_model(artifacts_path=artifacts_path)
         self.build_pipe = [
             # Pre-processing
             PagePreprocessingModel(
                 options=PagePreprocessingOptions(
-                    images_scale=pipeline_options.images_scale
+                    images_scale=pipeline_options.images_scale,
+                    create_parsed_page=pipeline_options.generate_parsed_pages,
                 )
             ),
             # OCR
@@ -163,66 +146,30 @@ class StandardPdfPipeline(PaginatedPipeline):
         output_dir = download_models(output_dir=local_dir, force=force, progress=False)
         return output_dir
-    def get_ocr_model(
-        self, artifacts_path: Optional[Path] = None
-    ) -> Optional[BaseOcrModel]:
-        if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions):
-            return EasyOcrModel(
-                enabled=self.pipeline_options.do_ocr,
-                artifacts_path=artifacts_path,
-                options=self.pipeline_options.ocr_options,
-                accelerator_options=self.pipeline_options.accelerator_options,
-            )
-        elif isinstance(self.pipeline_options.ocr_options, TesseractCliOcrOptions):
-            return TesseractOcrCliModel(
-                enabled=self.pipeline_options.do_ocr,
-                options=self.pipeline_options.ocr_options,
-            )
-        elif isinstance(self.pipeline_options.ocr_options, TesseractOcrOptions):
-            return TesseractOcrModel(
-                enabled=self.pipeline_options.do_ocr,
-                options=self.pipeline_options.ocr_options,
-            )
-        elif isinstance(self.pipeline_options.ocr_options, RapidOcrOptions):
-            return RapidOcrModel(
-                enabled=self.pipeline_options.do_ocr,
-                options=self.pipeline_options.ocr_options,
-                accelerator_options=self.pipeline_options.accelerator_options,
-            )
-        elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
-            if "darwin" != sys.platform:
-                raise RuntimeError(
-                    f"The specified OCR type is only supported on Mac: {self.pipeline_options.ocr_options.kind}."
-                )
-            return OcrMacModel(
-                enabled=self.pipeline_options.do_ocr,
-                options=self.pipeline_options.ocr_options,
-            )
-        return None
+    def get_ocr_model(self, artifacts_path: Optional[Path] = None) -> BaseOcrModel:
+        factory = get_ocr_factory(
+            allow_external_plugins=self.pipeline_options.allow_external_plugins
+        )
+        return factory.create_instance(
+            options=self.pipeline_options.ocr_options,
+            enabled=self.pipeline_options.do_ocr,
+            artifacts_path=artifacts_path,
+            accelerator_options=self.pipeline_options.accelerator_options,
+        )
     def get_picture_description_model(
         self, artifacts_path: Optional[Path] = None
     ) -> Optional[PictureDescriptionBaseModel]:
-        if isinstance(
-            self.pipeline_options.picture_description_options,
-            PictureDescriptionApiOptions,
-        ):
-            return PictureDescriptionApiModel(
-                enabled=self.pipeline_options.do_picture_description,
-                enable_remote_services=self.pipeline_options.enable_remote_services,
-                options=self.pipeline_options.picture_description_options,
-            )
-        elif isinstance(
-            self.pipeline_options.picture_description_options,
-            PictureDescriptionVlmOptions,
-        ):
-            return PictureDescriptionVlmModel(
-                enabled=self.pipeline_options.do_picture_description,
-                artifacts_path=artifacts_path,
-                options=self.pipeline_options.picture_description_options,
-                accelerator_options=self.pipeline_options.accelerator_options,
-            )
-        return None
+        factory = get_picture_description_factory(
+            allow_external_plugins=self.pipeline_options.allow_external_plugins
+        )
+        return factory.create_instance(
+            options=self.pipeline_options.picture_description_options,
+            enabled=self.pipeline_options.do_picture_description,
+            enable_remote_services=self.pipeline_options.enable_remote_services,
+            artifacts_path=artifacts_path,
+            accelerator_options=self.pipeline_options.accelerator_options,
+        )
     def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
         with TimeRecorder(conv_res, "page_init"):

docling 2.26.0__py3-none-any.whl → 2.28.0__py3-none-any.whl

docling 2.26.0__py3-none-any.whl → 2.28.0__py3-none-any.whl