PyPI - docling - Versions diffs - 2.69.0__py3-none-any.whl - Mend

docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling might be problematic. Click here for more details.

Files changed (138) hide show

docling/__init__.py +0 -0
docling/backend/__init__.py +0 -0
docling/backend/abstract_backend.py +84 -0
docling/backend/asciidoc_backend.py +443 -0
docling/backend/csv_backend.py +125 -0
docling/backend/docling_parse_backend.py +237 -0
docling/backend/docling_parse_v2_backend.py +276 -0
docling/backend/docling_parse_v4_backend.py +260 -0
docling/backend/docx/__init__.py +0 -0
docling/backend/docx/drawingml/utils.py +131 -0
docling/backend/docx/latex/__init__.py +0 -0
docling/backend/docx/latex/latex_dict.py +274 -0
docling/backend/docx/latex/omml.py +459 -0
docling/backend/html_backend.py +1502 -0
docling/backend/image_backend.py +188 -0
docling/backend/json/__init__.py +0 -0
docling/backend/json/docling_json_backend.py +58 -0
docling/backend/md_backend.py +618 -0
docling/backend/mets_gbs_backend.py +399 -0
docling/backend/msexcel_backend.py +686 -0
docling/backend/mspowerpoint_backend.py +398 -0
docling/backend/msword_backend.py +1663 -0
docling/backend/noop_backend.py +51 -0
docling/backend/pdf_backend.py +82 -0
docling/backend/pypdfium2_backend.py +417 -0
docling/backend/webvtt_backend.py +572 -0
docling/backend/xml/__init__.py +0 -0
docling/backend/xml/jats_backend.py +819 -0
docling/backend/xml/uspto_backend.py +1905 -0
docling/chunking/__init__.py +12 -0
docling/cli/__init__.py +0 -0
docling/cli/main.py +974 -0
docling/cli/models.py +196 -0
docling/cli/tools.py +17 -0
docling/datamodel/__init__.py +0 -0
docling/datamodel/accelerator_options.py +69 -0
docling/datamodel/asr_model_specs.py +494 -0
docling/datamodel/backend_options.py +102 -0
docling/datamodel/base_models.py +493 -0
docling/datamodel/document.py +699 -0
docling/datamodel/extraction.py +39 -0
docling/datamodel/layout_model_specs.py +91 -0
docling/datamodel/pipeline_options.py +457 -0
docling/datamodel/pipeline_options_asr_model.py +78 -0
docling/datamodel/pipeline_options_vlm_model.py +136 -0
docling/datamodel/settings.py +65 -0
docling/datamodel/vlm_model_specs.py +365 -0
docling/document_converter.py +559 -0
docling/document_extractor.py +327 -0
docling/exceptions.py +10 -0
docling/experimental/__init__.py +5 -0
docling/experimental/datamodel/__init__.py +1 -0
docling/experimental/datamodel/table_crops_layout_options.py +13 -0
docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
docling/experimental/models/__init__.py +3 -0
docling/experimental/models/table_crops_layout_model.py +114 -0
docling/experimental/pipeline/__init__.py +1 -0
docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
docling/models/__init__.py +0 -0
docling/models/base_layout_model.py +39 -0
docling/models/base_model.py +230 -0
docling/models/base_ocr_model.py +241 -0
docling/models/base_table_model.py +45 -0
docling/models/extraction/__init__.py +0 -0
docling/models/extraction/nuextract_transformers_model.py +305 -0
docling/models/factories/__init__.py +47 -0
docling/models/factories/base_factory.py +122 -0
docling/models/factories/layout_factory.py +7 -0
docling/models/factories/ocr_factory.py +11 -0
docling/models/factories/picture_description_factory.py +11 -0
docling/models/factories/table_factory.py +7 -0
docling/models/picture_description_base_model.py +149 -0
docling/models/plugins/__init__.py +0 -0
docling/models/plugins/defaults.py +60 -0
docling/models/stages/__init__.py +0 -0
docling/models/stages/code_formula/__init__.py +0 -0
docling/models/stages/code_formula/code_formula_model.py +342 -0
docling/models/stages/layout/__init__.py +0 -0
docling/models/stages/layout/layout_model.py +249 -0
docling/models/stages/ocr/__init__.py +0 -0
docling/models/stages/ocr/auto_ocr_model.py +132 -0
docling/models/stages/ocr/easyocr_model.py +200 -0
docling/models/stages/ocr/ocr_mac_model.py +145 -0
docling/models/stages/ocr/rapid_ocr_model.py +328 -0
docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
docling/models/stages/page_assemble/__init__.py +0 -0
docling/models/stages/page_assemble/page_assemble_model.py +156 -0
docling/models/stages/page_preprocessing/__init__.py +0 -0
docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
docling/models/stages/picture_classifier/__init__.py +0 -0
docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
docling/models/stages/picture_description/__init__.py +0 -0
docling/models/stages/picture_description/picture_description_api_model.py +66 -0
docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
docling/models/stages/reading_order/__init__.py +0 -0
docling/models/stages/reading_order/readingorder_model.py +431 -0
docling/models/stages/table_structure/__init__.py +0 -0
docling/models/stages/table_structure/table_structure_model.py +305 -0
docling/models/utils/__init__.py +0 -0
docling/models/utils/generation_utils.py +157 -0
docling/models/utils/hf_model_download.py +45 -0
docling/models/vlm_pipeline_models/__init__.py +1 -0
docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
docling/models/vlm_pipeline_models/mlx_model.py +325 -0
docling/models/vlm_pipeline_models/vllm_model.py +344 -0
docling/pipeline/__init__.py +0 -0
docling/pipeline/asr_pipeline.py +431 -0
docling/pipeline/base_extraction_pipeline.py +72 -0
docling/pipeline/base_pipeline.py +326 -0
docling/pipeline/extraction_vlm_pipeline.py +207 -0
docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
docling/pipeline/simple_pipeline.py +55 -0
docling/pipeline/standard_pdf_pipeline.py +859 -0
docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
docling/pipeline/vlm_pipeline.py +416 -0
docling/py.typed +1 -0
docling/utils/__init__.py +0 -0
docling/utils/accelerator_utils.py +97 -0
docling/utils/api_image_request.py +205 -0
docling/utils/deepseekocr_utils.py +388 -0
docling/utils/export.py +146 -0
docling/utils/glm_utils.py +361 -0
docling/utils/layout_postprocessor.py +683 -0
docling/utils/locks.py +3 -0
docling/utils/model_downloader.py +168 -0
docling/utils/ocr_utils.py +69 -0
docling/utils/orientation.py +65 -0
docling/utils/profiling.py +65 -0
docling/utils/utils.py +65 -0
docling/utils/visualization.py +85 -0
docling-2.69.0.dist-info/METADATA +237 -0
docling-2.69.0.dist-info/RECORD +138 -0
docling-2.69.0.dist-info/WHEEL +5 -0
docling-2.69.0.dist-info/entry_points.txt +6 -0
docling-2.69.0.dist-info/licenses/LICENSE +21 -0
docling-2.69.0.dist-info/top_level.txt +1 -0

docling/pipeline/legacy_standard_pdf_pipeline.py ADDED Viewed

@@ -0,0 +1,262 @@
+import logging
+import warnings
+from pathlib import Path
+from typing import Optional, cast
+import numpy as np
+from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.pdf_backend import PdfDocumentBackend
+from docling.datamodel.base_models import AssembledUnit, Page
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.layout_model_specs import LayoutModelConfig
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.datamodel.settings import settings
+from docling.models.base_ocr_model import BaseOcrModel
+from docling.models.factories import (
+    get_layout_factory,
+    get_ocr_factory,
+    get_table_structure_factory,
+)
+from docling.models.stages.code_formula.code_formula_model import (
+    CodeFormulaModel,
+    CodeFormulaModelOptions,
+)
+from docling.models.stages.page_assemble.page_assemble_model import (
+    PageAssembleModel,
+    PageAssembleOptions,
+)
+from docling.models.stages.page_preprocessing.page_preprocessing_model import (
+    PagePreprocessingModel,
+    PagePreprocessingOptions,
+)
+from docling.models.stages.reading_order.readingorder_model import (
+    ReadingOrderModel,
+    ReadingOrderOptions,
+)
+from docling.pipeline.base_pipeline import PaginatedPipeline
+from docling.utils.model_downloader import download_models
+from docling.utils.profiling import ProfilingScope, TimeRecorder
+_log = logging.getLogger(__name__)
+class LegacyStandardPdfPipeline(PaginatedPipeline):
+    def __init__(self, pipeline_options: PdfPipelineOptions):
+        super().__init__(pipeline_options)
+        self.pipeline_options: PdfPipelineOptions
+        with warnings.catch_warnings():  # deprecated generate_table_images
+            warnings.filterwarnings("ignore", category=DeprecationWarning)
+            self.keep_images = (
+                self.pipeline_options.generate_page_images
+                or self.pipeline_options.generate_picture_images
+                or self.pipeline_options.generate_table_images
+            )
+        self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
+        ocr_model = self.get_ocr_model(artifacts_path=self.artifacts_path)
+        layout_factory = get_layout_factory(
+            allow_external_plugins=self.pipeline_options.allow_external_plugins
+        )
+        layout_model = layout_factory.create_instance(
+            options=pipeline_options.layout_options,
+            artifacts_path=self.artifacts_path,
+            accelerator_options=pipeline_options.accelerator_options,
+        )
+        table_factory = get_table_structure_factory(
+            allow_external_plugins=self.pipeline_options.allow_external_plugins
+        )
+        table_model = table_factory.create_instance(
+            options=pipeline_options.table_structure_options,
+            enabled=pipeline_options.do_table_structure,
+            artifacts_path=self.artifacts_path,
+            accelerator_options=pipeline_options.accelerator_options,
+        )
+        self.build_pipe = [
+            # Pre-processing
+            PagePreprocessingModel(
+                options=PagePreprocessingOptions(
+                    images_scale=pipeline_options.images_scale,
+                )
+            ),
+            # OCR
+            ocr_model,
+            # Layout model
+            layout_model,
+            # Table structure model
+            table_model,
+            # Page assemble
+            PageAssembleModel(options=PageAssembleOptions()),
+        ]
+        self.enrichment_pipe = [
+            # Code Formula Enrichment Model
+            CodeFormulaModel(
+                enabled=pipeline_options.do_code_enrichment
+                or pipeline_options.do_formula_enrichment,
+                artifacts_path=self.artifacts_path,
+                options=CodeFormulaModelOptions(
+                    do_code_enrichment=pipeline_options.do_code_enrichment,
+                    do_formula_enrichment=pipeline_options.do_formula_enrichment,
+                ),
+                accelerator_options=pipeline_options.accelerator_options,
+            ),
+            *self.enrichment_pipe,
+        ]
+        if (
+            self.pipeline_options.do_formula_enrichment
+            or self.pipeline_options.do_code_enrichment
+            or self.pipeline_options.do_picture_classification
+            or self.pipeline_options.do_picture_description
+        ):
+            self.keep_backend = True
+    @staticmethod
+    def download_models_hf(
+        local_dir: Optional[Path] = None, force: bool = False
+    ) -> Path:
+        warnings.warn(
+            "The usage of LegacyStandardPdfPipeline.download_models_hf() is deprecated "
+            "use instead the utility `docling-tools models download`, or "
+            "the upstream method docling.utils.models_downloader.download_all()",
+            DeprecationWarning,
+            stacklevel=3,
+        )
+        output_dir = download_models(output_dir=local_dir, force=force, progress=False)
+        return output_dir
+    def get_ocr_model(self, artifacts_path: Optional[Path] = None) -> BaseOcrModel:
+        factory = get_ocr_factory(
+            allow_external_plugins=self.pipeline_options.allow_external_plugins
+        )
+        return factory.create_instance(
+            options=self.pipeline_options.ocr_options,
+            enabled=self.pipeline_options.do_ocr,
+            artifacts_path=artifacts_path,
+            accelerator_options=self.pipeline_options.accelerator_options,
+        )
+    def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
+        with TimeRecorder(conv_res, "page_init"):
+            page._backend = conv_res.input._backend.load_page(page.page_no - 1)  # type: ignore
+            if page._backend is not None and page._backend.is_valid():
+                page.size = page._backend.get_size()
+        return page
+    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
+        all_elements = []
+        all_headers = []
+        all_body = []
+        with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
+            for p in conv_res.pages:
+                if p.assembled is not None:
+                    for el in p.assembled.body:
+                        all_body.append(el)
+                    for el in p.assembled.headers:
+                        all_headers.append(el)
+                    for el in p.assembled.elements:
+                        all_elements.append(el)
+            conv_res.assembled = AssembledUnit(
+                elements=all_elements, headers=all_headers, body=all_body
+            )
+            conv_res.document = self.reading_order_model(conv_res)
+            # Generate page images in the output
+            if self.pipeline_options.generate_page_images:
+                for page in conv_res.pages:
+                    assert page.image is not None
+                    page_no = page.page_no
+                    conv_res.document.pages[page_no].image = ImageRef.from_pil(
+                        page.image, dpi=int(72 * self.pipeline_options.images_scale)
+                    )
+            # Generate images of the requested element types
+            with warnings.catch_warnings():  # deprecated generate_table_images
+                warnings.filterwarnings("ignore", category=DeprecationWarning)
+                if (
+                    self.pipeline_options.generate_picture_images
+                    or self.pipeline_options.generate_table_images
+                ):
+                    scale = self.pipeline_options.images_scale
+                    for element, _level in conv_res.document.iterate_items():
+                        if not isinstance(element, DocItem) or len(element.prov) == 0:
+                            continue
+                        if (
+                            isinstance(element, PictureItem)
+                            and self.pipeline_options.generate_picture_images
+                        ) or (
+                            isinstance(element, TableItem)
+                            and self.pipeline_options.generate_table_images
+                        ):
+                            page_ix = element.prov[0].page_no - 1
+                            page = next(
+                                (p for p in conv_res.pages if p.page_no == page_ix),
+                                cast("Page", None),
+                            )
+                            assert page is not None
+                            assert page.size is not None
+                            assert page.image is not None
+                            crop_bbox = (
+                                element.prov[0]
+                                .bbox.scaled(scale=scale)
+                                .to_top_left_origin(
+                                    page_height=page.size.height * scale
+                                )
+                            )
+                            cropped_im = page.image.crop(crop_bbox.as_tuple())
+                            element.image = ImageRef.from_pil(
+                                cropped_im, dpi=int(72 * scale)
+                            )
+            # Aggregate confidence values for document:
+            if len(conv_res.pages) > 0:
+                with warnings.catch_warnings():
+                    warnings.filterwarnings(
+                        "ignore",
+                        category=RuntimeWarning,
+                        message="Mean of empty slice|All-NaN slice encountered",
+                    )
+                    conv_res.confidence.layout_score = float(
+                        np.nanmean(
+                            [c.layout_score for c in conv_res.confidence.pages.values()]
+                        )
+                    )
+                    conv_res.confidence.parse_score = float(
+                        np.nanquantile(
+                            [c.parse_score for c in conv_res.confidence.pages.values()],
+                            q=0.1,  # parse score should relate to worst 10% of pages.
+                        )
+                    )
+                    conv_res.confidence.table_score = float(
+                        np.nanmean(
+                            [c.table_score for c in conv_res.confidence.pages.values()]
+                        )
+                    )
+                    conv_res.confidence.ocr_score = float(
+                        np.nanmean(
+                            [c.ocr_score for c in conv_res.confidence.pages.values()]
+                        )
+                    )
+        return conv_res
+    @classmethod
+    def get_default_options(cls) -> PdfPipelineOptions:
+        return PdfPipelineOptions()
+    @classmethod
+    def is_backend_supported(cls, backend: AbstractDocumentBackend):
+        return isinstance(backend, PdfDocumentBackend)

docling/pipeline/simple_pipeline.py ADDED Viewed

@@ -0,0 +1,55 @@
+import logging
+from docling.backend.abstract_backend import (
+    AbstractDocumentBackend,
+    DeclarativeDocumentBackend,
+)
+from docling.datamodel.base_models import ConversionStatus
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import ConvertPipelineOptions
+from docling.pipeline.base_pipeline import ConvertPipeline
+from docling.utils.profiling import ProfilingScope, TimeRecorder
+_log = logging.getLogger(__name__)
+class SimplePipeline(ConvertPipeline):
+    """SimpleModelPipeline.
+    This class is used at the moment for formats / backends
+    which produce straight DoclingDocument output.
+    """
+    def __init__(self, pipeline_options: ConvertPipelineOptions):
+        super().__init__(pipeline_options)
+    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
+        if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
+            raise RuntimeError(
+                f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "
+                f"Can not convert this with simple pipeline. "
+                f"Please check your format configuration on DocumentConverter."
+            )
+            # conv_res.status = ConversionStatus.FAILURE
+            # return conv_res
+        # Instead of running a page-level pipeline to build up the document structure,
+        # the backend is expected to be of type DeclarativeDocumentBackend, which can output
+        # a DoclingDocument straight.
+        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
+            conv_res.document = conv_res.input._backend.convert()
+        return conv_res
+    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
+        # This is called only if the previous steps didn't raise.
+        # Since we don't have anything else to evaluate, we can
+        # safely return SUCCESS.
+        return ConversionStatus.SUCCESS
+    @classmethod
+    def get_default_options(cls) -> ConvertPipelineOptions:
+        return ConvertPipelineOptions()
+    @classmethod
+    def is_backend_supported(cls, backend: AbstractDocumentBackend):
+        return isinstance(backend, DeclarativeDocumentBackend)