PyPI - docling - Versions diffs - 2.69.0__py3-none-any.whl - Mend

docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling might be problematic. Click here for more details.

Files changed (138) hide show

docling/__init__.py +0 -0
docling/backend/__init__.py +0 -0
docling/backend/abstract_backend.py +84 -0
docling/backend/asciidoc_backend.py +443 -0
docling/backend/csv_backend.py +125 -0
docling/backend/docling_parse_backend.py +237 -0
docling/backend/docling_parse_v2_backend.py +276 -0
docling/backend/docling_parse_v4_backend.py +260 -0
docling/backend/docx/__init__.py +0 -0
docling/backend/docx/drawingml/utils.py +131 -0
docling/backend/docx/latex/__init__.py +0 -0
docling/backend/docx/latex/latex_dict.py +274 -0
docling/backend/docx/latex/omml.py +459 -0
docling/backend/html_backend.py +1502 -0
docling/backend/image_backend.py +188 -0
docling/backend/json/__init__.py +0 -0
docling/backend/json/docling_json_backend.py +58 -0
docling/backend/md_backend.py +618 -0
docling/backend/mets_gbs_backend.py +399 -0
docling/backend/msexcel_backend.py +686 -0
docling/backend/mspowerpoint_backend.py +398 -0
docling/backend/msword_backend.py +1663 -0
docling/backend/noop_backend.py +51 -0
docling/backend/pdf_backend.py +82 -0
docling/backend/pypdfium2_backend.py +417 -0
docling/backend/webvtt_backend.py +572 -0
docling/backend/xml/__init__.py +0 -0
docling/backend/xml/jats_backend.py +819 -0
docling/backend/xml/uspto_backend.py +1905 -0
docling/chunking/__init__.py +12 -0
docling/cli/__init__.py +0 -0
docling/cli/main.py +974 -0
docling/cli/models.py +196 -0
docling/cli/tools.py +17 -0
docling/datamodel/__init__.py +0 -0
docling/datamodel/accelerator_options.py +69 -0
docling/datamodel/asr_model_specs.py +494 -0
docling/datamodel/backend_options.py +102 -0
docling/datamodel/base_models.py +493 -0
docling/datamodel/document.py +699 -0
docling/datamodel/extraction.py +39 -0
docling/datamodel/layout_model_specs.py +91 -0
docling/datamodel/pipeline_options.py +457 -0
docling/datamodel/pipeline_options_asr_model.py +78 -0
docling/datamodel/pipeline_options_vlm_model.py +136 -0
docling/datamodel/settings.py +65 -0
docling/datamodel/vlm_model_specs.py +365 -0
docling/document_converter.py +559 -0
docling/document_extractor.py +327 -0
docling/exceptions.py +10 -0
docling/experimental/__init__.py +5 -0
docling/experimental/datamodel/__init__.py +1 -0
docling/experimental/datamodel/table_crops_layout_options.py +13 -0
docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
docling/experimental/models/__init__.py +3 -0
docling/experimental/models/table_crops_layout_model.py +114 -0
docling/experimental/pipeline/__init__.py +1 -0
docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
docling/models/__init__.py +0 -0
docling/models/base_layout_model.py +39 -0
docling/models/base_model.py +230 -0
docling/models/base_ocr_model.py +241 -0
docling/models/base_table_model.py +45 -0
docling/models/extraction/__init__.py +0 -0
docling/models/extraction/nuextract_transformers_model.py +305 -0
docling/models/factories/__init__.py +47 -0
docling/models/factories/base_factory.py +122 -0
docling/models/factories/layout_factory.py +7 -0
docling/models/factories/ocr_factory.py +11 -0
docling/models/factories/picture_description_factory.py +11 -0
docling/models/factories/table_factory.py +7 -0
docling/models/picture_description_base_model.py +149 -0
docling/models/plugins/__init__.py +0 -0
docling/models/plugins/defaults.py +60 -0
docling/models/stages/__init__.py +0 -0
docling/models/stages/code_formula/__init__.py +0 -0
docling/models/stages/code_formula/code_formula_model.py +342 -0
docling/models/stages/layout/__init__.py +0 -0
docling/models/stages/layout/layout_model.py +249 -0
docling/models/stages/ocr/__init__.py +0 -0
docling/models/stages/ocr/auto_ocr_model.py +132 -0
docling/models/stages/ocr/easyocr_model.py +200 -0
docling/models/stages/ocr/ocr_mac_model.py +145 -0
docling/models/stages/ocr/rapid_ocr_model.py +328 -0
docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
docling/models/stages/page_assemble/__init__.py +0 -0
docling/models/stages/page_assemble/page_assemble_model.py +156 -0
docling/models/stages/page_preprocessing/__init__.py +0 -0
docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
docling/models/stages/picture_classifier/__init__.py +0 -0
docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
docling/models/stages/picture_description/__init__.py +0 -0
docling/models/stages/picture_description/picture_description_api_model.py +66 -0
docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
docling/models/stages/reading_order/__init__.py +0 -0
docling/models/stages/reading_order/readingorder_model.py +431 -0
docling/models/stages/table_structure/__init__.py +0 -0
docling/models/stages/table_structure/table_structure_model.py +305 -0
docling/models/utils/__init__.py +0 -0
docling/models/utils/generation_utils.py +157 -0
docling/models/utils/hf_model_download.py +45 -0
docling/models/vlm_pipeline_models/__init__.py +1 -0
docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
docling/models/vlm_pipeline_models/mlx_model.py +325 -0
docling/models/vlm_pipeline_models/vllm_model.py +344 -0
docling/pipeline/__init__.py +0 -0
docling/pipeline/asr_pipeline.py +431 -0
docling/pipeline/base_extraction_pipeline.py +72 -0
docling/pipeline/base_pipeline.py +326 -0
docling/pipeline/extraction_vlm_pipeline.py +207 -0
docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
docling/pipeline/simple_pipeline.py +55 -0
docling/pipeline/standard_pdf_pipeline.py +859 -0
docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
docling/pipeline/vlm_pipeline.py +416 -0
docling/py.typed +1 -0
docling/utils/__init__.py +0 -0
docling/utils/accelerator_utils.py +97 -0
docling/utils/api_image_request.py +205 -0
docling/utils/deepseekocr_utils.py +388 -0
docling/utils/export.py +146 -0
docling/utils/glm_utils.py +361 -0
docling/utils/layout_postprocessor.py +683 -0
docling/utils/locks.py +3 -0
docling/utils/model_downloader.py +168 -0
docling/utils/ocr_utils.py +69 -0
docling/utils/orientation.py +65 -0
docling/utils/profiling.py +65 -0
docling/utils/utils.py +65 -0
docling/utils/visualization.py +85 -0
docling-2.69.0.dist-info/METADATA +237 -0
docling-2.69.0.dist-info/RECORD +138 -0
docling-2.69.0.dist-info/WHEEL +5 -0
docling-2.69.0.dist-info/entry_points.txt +6 -0
docling-2.69.0.dist-info/licenses/LICENSE +21 -0
docling-2.69.0.dist-info/top_level.txt +1 -0

docling/models/stages/ocr/auto_ocr_model.py ADDED Viewed

@@ -0,0 +1,132 @@
+import logging
+import sys
+from collections.abc import Iterable
+from pathlib import Path
+from typing import Optional, Type
+from docling.datamodel.accelerator_options import AcceleratorOptions
+from docling.datamodel.base_models import Page
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    OcrAutoOptions,
+    OcrMacOptions,
+    OcrOptions,
+    RapidOcrOptions,
+)
+from docling.models.base_ocr_model import BaseOcrModel
+from docling.models.stages.ocr.easyocr_model import EasyOcrModel
+from docling.models.stages.ocr.ocr_mac_model import OcrMacModel
+from docling.models.stages.ocr.rapid_ocr_model import RapidOcrModel
+_log = logging.getLogger(__name__)  # module-level logger named after this module
+class OcrAutoModel(BaseOcrModel):
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        options: OcrAutoOptions,
+        accelerator_options: AcceleratorOptions,
+    ):
+        super().__init__(
+            enabled=enabled,
+            artifacts_path=artifacts_path,
+            options=options,
+            accelerator_options=accelerator_options,
+        )
+        self.options: OcrAutoOptions
+        self._engine: Optional[BaseOcrModel] = None
+        if self.enabled:
+            if "darwin" == sys.platform:
+                try:
+                    from ocrmac import ocrmac
+                    self._engine = OcrMacModel(
+                        enabled=self.enabled,
+                        artifacts_path=artifacts_path,
+                        options=OcrMacOptions(
+                            bitmap_area_threshold=self.options.bitmap_area_threshold,
+                            force_full_page_ocr=self.options.force_full_page_ocr,
+                        ),
+                        accelerator_options=accelerator_options,
+                    )
+                    _log.info("Auto OCR model selected ocrmac.")
+                except ImportError:
+                    _log.info("ocrmac cannot be used because ocrmac is not installed.")
+            if self._engine is None:
+                try:
+                    import onnxruntime
+                    from rapidocr import EngineType, RapidOCR  # type: ignore
+                    self._engine = RapidOcrModel(
+                        enabled=self.enabled,
+                        artifacts_path=artifacts_path,
+                        options=RapidOcrOptions(
+                            backend="onnxruntime",
+                            bitmap_area_threshold=self.options.bitmap_area_threshold,
+                            force_full_page_ocr=self.options.force_full_page_ocr,
+                        ),
+                        accelerator_options=accelerator_options,
+                    )
+                    _log.info("Auto OCR model selected rapidocr with onnxruntime.")
+                except ImportError:
+                    _log.info(
+                        "rapidocr cannot be used because onnxruntime is not installed."
+                    )
+            if self._engine is None:
+                try:
+                    import easyocr
+                    self._engine = EasyOcrModel(
+                        enabled=self.enabled,
+                        artifacts_path=artifacts_path,
+                        options=EasyOcrOptions(
+                            bitmap_area_threshold=self.options.bitmap_area_threshold,
+                            force_full_page_ocr=self.options.force_full_page_ocr,
+                        ),
+                        accelerator_options=accelerator_options,
+                    )
+                    _log.info("Auto OCR model selected easyocr.")
+                except ImportError:
+                    _log.info("easyocr cannot be used because it is not installed.")
+            if self._engine is None:
+                try:
+                    import torch
+                    from rapidocr import EngineType, RapidOCR  # type: ignore
+                    self._engine = RapidOcrModel(
+                        enabled=self.enabled,
+                        artifacts_path=artifacts_path,
+                        options=RapidOcrOptions(
+                            backend="torch",
+                            bitmap_area_threshold=self.options.bitmap_area_threshold,
+                            force_full_page_ocr=self.options.force_full_page_ocr,
+                        ),
+                        accelerator_options=accelerator_options,
+                    )
+                    _log.info("Auto OCR model selected rapidocr with torch.")
+                except ImportError:
+                    _log.info(
+                        "rapidocr cannot be used because rapidocr or torch is not installed."
+                    )
+            if self._engine is None:
+                _log.warning("No OCR engine found. Please review the install details.")
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        if not self.enabled or self._engine is None:
+            yield from page_batch
+            return
+        yield from self._engine(conv_res, page_batch)
+    @classmethod
+    def get_options_type(cls) -> Type[OcrOptions]:
+        return OcrAutoOptions

docling/models/stages/ocr/easyocr_model.py ADDED Viewed

@@ -0,0 +1,200 @@
+import logging
+import warnings
+import zipfile
+from collections.abc import Iterable
+from pathlib import Path
+from typing import List, Optional, Type
+import numpy
+from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, TextCell
+from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.base_models import Page
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    OcrOptions,
+)
+from docling.datamodel.settings import settings
+from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.accelerator_utils import decide_device
+from docling.utils.profiling import TimeRecorder
+from docling.utils.utils import download_url_with_progress
+_log = logging.getLogger(__name__)  # module-level logger named after this module
+class EasyOcrModel(BaseOcrModel):
+    _model_repo_folder = "EasyOcr"
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        options: EasyOcrOptions,
+        accelerator_options: AcceleratorOptions,
+    ):
+        super().__init__(
+            enabled=enabled,
+            artifacts_path=artifacts_path,
+            options=options,
+            accelerator_options=accelerator_options,
+        )
+        self.options: EasyOcrOptions
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+        if self.enabled:
+            try:
+                import easyocr
+            except ImportError:
+                raise ImportError(
+                    "EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. "
+                    "Alternatively, Docling has support for other OCR engines. See the documentation."
+                )
+            if self.options.use_gpu is None:
+                device = decide_device(accelerator_options.device)
+                # Enable easyocr GPU if running on CUDA, MPS
+                use_gpu = any(
+                    device.startswith(x)
+                    for x in [
+                        AcceleratorDevice.CUDA.value,
+                        AcceleratorDevice.MPS.value,
+                    ]
+                )
+            else:
+                warnings.warn(
+                    "Deprecated field. Better to set the `accelerator_options.device` in `pipeline_options`. "
+                    "When `use_gpu and accelerator_options.device == AcceleratorDevice.CUDA` the GPU is used "
+                    "to run EasyOCR. Otherwise, EasyOCR runs in CPU."
+                )
+                use_gpu = self.options.use_gpu
+            download_enabled = self.options.download_enabled
+            model_storage_directory = self.options.model_storage_directory
+            if artifacts_path is not None and model_storage_directory is None:
+                download_enabled = False
+                model_storage_directory = str(artifacts_path / self._model_repo_folder)
+            with warnings.catch_warnings():
+                if self.options.suppress_mps_warnings:
+                    warnings.filterwarnings("ignore", message=".*pin_memory.*MPS.*")
+                self.reader = easyocr.Reader(
+                    lang_list=self.options.lang,
+                    gpu=use_gpu,
+                    model_storage_directory=model_storage_directory,
+                    recog_network=self.options.recog_network,
+                    download_enabled=download_enabled,
+                    verbose=False,
+                )
+    @staticmethod
+    def download_models(
+        detection_models: List[str] = ["craft"],
+        recognition_models: List[str] = ["english_g2", "latin_g2"],
+        local_dir: Optional[Path] = None,
+        force: bool = False,
+        progress: bool = False,
+    ) -> Path:
+        # Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
+        from easyocr.config import (
+            detection_models as det_models_dict,
+            recognition_models as rec_models_dict,
+        )
+        if local_dir is None:
+            local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
+        local_dir.mkdir(parents=True, exist_ok=True)
+        # Collect models to download
+        download_list = []
+        for model_name in detection_models:
+            if model_name in det_models_dict:
+                download_list.append(det_models_dict[model_name])
+        for model_name in recognition_models:
+            if model_name in rec_models_dict["gen2"]:
+                download_list.append(rec_models_dict["gen2"][model_name])
+        # Download models
+        for model_details in download_list:
+            buf = download_url_with_progress(model_details["url"], progress=progress)
+            with zipfile.ZipFile(buf, "r") as zip_ref:
+                zip_ref.extractall(local_dir)
+        return local_dir
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        if not self.enabled:
+            yield from page_batch
+            return
+        for page in page_batch:
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "ocr"):
+                    ocr_rects = self.get_ocr_rects(page)
+                    all_ocr_cells = []
+                    for ocr_rect in ocr_rects:
+                        # Skip zero area boxes
+                        if ocr_rect.area() == 0:
+                            continue
+                        high_res_image = page._backend.get_page_image(
+                            scale=self.scale, cropbox=ocr_rect
+                        )
+                        im = numpy.array(high_res_image)
+                        with warnings.catch_warnings():
+                            if self.options.suppress_mps_warnings:
+                                warnings.filterwarnings(
+                                    "ignore", message=".*pin_memory.*MPS.*"
+                                )
+                            result = self.reader.readtext(im)
+                        del high_res_image
+                        del im
+                        cells = [
+                            TextCell(
+                                index=ix,
+                                text=line[1],
+                                orig=line[1],
+                                from_ocr=True,
+                                confidence=line[2],
+                                rect=BoundingRectangle.from_bounding_box(
+                                    BoundingBox.from_tuple(
+                                        coord=(
+                                            (line[0][0][0] / self.scale) + ocr_rect.l,
+                                            (line[0][0][1] / self.scale) + ocr_rect.t,
+                                            (line[0][2][0] / self.scale) + ocr_rect.l,
+                                            (line[0][2][1] / self.scale) + ocr_rect.t,
+                                        ),
+                                        origin=CoordOrigin.TOPLEFT,
+                                    )
+                                ),
+                            )
+                            for ix, line in enumerate(result)
+                            if line[2] >= self.options.confidence_threshold
+                        ]
+                        all_ocr_cells.extend(cells)
+                    # Post-process the cells
+                    self.post_process_cells(all_ocr_cells, page)
+                # DEBUG code:
+                if settings.debug.visualize_ocr:
+                    self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
+                yield page
+    @classmethod
+    def get_options_type(cls) -> Type[OcrOptions]:
+        return EasyOcrOptions

docling/models/stages/ocr/ocr_mac_model.py ADDED Viewed

@@ -0,0 +1,145 @@
+import logging
+import sys
+import tempfile
+from collections.abc import Iterable
+from pathlib import Path
+from typing import Optional, Type
+from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle, TextCell
+from docling.datamodel.accelerator_options import AcceleratorOptions
+from docling.datamodel.base_models import Page
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    OcrMacOptions,
+    OcrOptions,
+)
+from docling.datamodel.settings import settings
+from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.profiling import TimeRecorder
+_log = logging.getLogger(__name__)  # module-level logger named after this module
+class OcrMacModel(BaseOcrModel):
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        options: OcrMacOptions,
+        accelerator_options: AcceleratorOptions,
+    ):
+        super().__init__(
+            enabled=enabled,
+            artifacts_path=artifacts_path,
+            options=options,
+            accelerator_options=accelerator_options,
+        )
+        self.options: OcrMacOptions
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+        if self.enabled:
+            if "darwin" != sys.platform:
+                raise RuntimeError("OcrMac is only supported on Mac.")
+            install_errmsg = (
+                "ocrmac is not correctly installed. "
+                "Please install it via `pip install ocrmac` to use this OCR engine. "
+                "Alternatively, Docling has support for other OCR engines. See the documentation: "
+                "https://docling-project.github.io/docling/installation/"
+            )
+            try:
+                from ocrmac import ocrmac
+            except ImportError:
+                raise ImportError(install_errmsg)
+            self.reader_RIL = ocrmac.OCR
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        if not self.enabled:
+            yield from page_batch
+            return
+        for page in page_batch:
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "ocr"):
+                    ocr_rects = self.get_ocr_rects(page)
+                    all_ocr_cells = []
+                    for ocr_rect in ocr_rects:
+                        # Skip zero area boxes
+                        if ocr_rect.area() == 0:
+                            continue
+                        high_res_image = page._backend.get_page_image(
+                            scale=self.scale, cropbox=ocr_rect
+                        )
+                        with tempfile.NamedTemporaryFile(
+                            suffix=".png", mode="w"
+                        ) as image_file:
+                            fname = image_file.name
+                            high_res_image.save(fname)
+                            boxes = self.reader_RIL(
+                                fname,
+                                recognition_level=self.options.recognition,
+                                framework=self.options.framework,
+                                language_preference=self.options.lang,
+                            ).recognize()
+                        im_width, im_height = high_res_image.size
+                        cells = []
+                        for ix, (text, confidence, box) in enumerate(boxes):
+                            x = float(box[0])
+                            y = float(box[1])
+                            w = float(box[2])
+                            h = float(box[3])
+                            x1 = x * im_width
+                            y2 = (1 - y) * im_height
+                            x2 = x1 + w * im_width
+                            y1 = y2 - h * im_height
+                            left = x1 / self.scale
+                            top = y1 / self.scale
+                            right = x2 / self.scale
+                            bottom = y2 / self.scale
+                            cells.append(
+                                TextCell(
+                                    index=ix,
+                                    text=text,
+                                    orig=text,
+                                    from_ocr=True,
+                                    confidence=confidence,
+                                    rect=BoundingRectangle.from_bounding_box(
+                                        BoundingBox.from_tuple(
+                                            coord=(left, top, right, bottom),
+                                            origin=CoordOrigin.TOPLEFT,
+                                        )
+                                    ),
+                                )
+                            )
+                        # del high_res_image
+                        all_ocr_cells.extend(cells)
+                    # Post-process the cells
+                    self.post_process_cells(all_ocr_cells, page)
+                # DEBUG code:
+                if settings.debug.visualize_ocr:
+                    self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
+                yield page
+    @classmethod
+    def get_options_type(cls) -> Type[OcrOptions]:
+        return OcrMacOptions