PyPI - docling - Versions diffs - 2.69.0__py3-none-any.whl - Mend

docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling might be problematic. Click here for more details.

Files changed (138) hide show

docling/__init__.py +0 -0
docling/backend/__init__.py +0 -0
docling/backend/abstract_backend.py +84 -0
docling/backend/asciidoc_backend.py +443 -0
docling/backend/csv_backend.py +125 -0
docling/backend/docling_parse_backend.py +237 -0
docling/backend/docling_parse_v2_backend.py +276 -0
docling/backend/docling_parse_v4_backend.py +260 -0
docling/backend/docx/__init__.py +0 -0
docling/backend/docx/drawingml/utils.py +131 -0
docling/backend/docx/latex/__init__.py +0 -0
docling/backend/docx/latex/latex_dict.py +274 -0
docling/backend/docx/latex/omml.py +459 -0
docling/backend/html_backend.py +1502 -0
docling/backend/image_backend.py +188 -0
docling/backend/json/__init__.py +0 -0
docling/backend/json/docling_json_backend.py +58 -0
docling/backend/md_backend.py +618 -0
docling/backend/mets_gbs_backend.py +399 -0
docling/backend/msexcel_backend.py +686 -0
docling/backend/mspowerpoint_backend.py +398 -0
docling/backend/msword_backend.py +1663 -0
docling/backend/noop_backend.py +51 -0
docling/backend/pdf_backend.py +82 -0
docling/backend/pypdfium2_backend.py +417 -0
docling/backend/webvtt_backend.py +572 -0
docling/backend/xml/__init__.py +0 -0
docling/backend/xml/jats_backend.py +819 -0
docling/backend/xml/uspto_backend.py +1905 -0
docling/chunking/__init__.py +12 -0
docling/cli/__init__.py +0 -0
docling/cli/main.py +974 -0
docling/cli/models.py +196 -0
docling/cli/tools.py +17 -0
docling/datamodel/__init__.py +0 -0
docling/datamodel/accelerator_options.py +69 -0
docling/datamodel/asr_model_specs.py +494 -0
docling/datamodel/backend_options.py +102 -0
docling/datamodel/base_models.py +493 -0
docling/datamodel/document.py +699 -0
docling/datamodel/extraction.py +39 -0
docling/datamodel/layout_model_specs.py +91 -0
docling/datamodel/pipeline_options.py +457 -0
docling/datamodel/pipeline_options_asr_model.py +78 -0
docling/datamodel/pipeline_options_vlm_model.py +136 -0
docling/datamodel/settings.py +65 -0
docling/datamodel/vlm_model_specs.py +365 -0
docling/document_converter.py +559 -0
docling/document_extractor.py +327 -0
docling/exceptions.py +10 -0
docling/experimental/__init__.py +5 -0
docling/experimental/datamodel/__init__.py +1 -0
docling/experimental/datamodel/table_crops_layout_options.py +13 -0
docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
docling/experimental/models/__init__.py +3 -0
docling/experimental/models/table_crops_layout_model.py +114 -0
docling/experimental/pipeline/__init__.py +1 -0
docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
docling/models/__init__.py +0 -0
docling/models/base_layout_model.py +39 -0
docling/models/base_model.py +230 -0
docling/models/base_ocr_model.py +241 -0
docling/models/base_table_model.py +45 -0
docling/models/extraction/__init__.py +0 -0
docling/models/extraction/nuextract_transformers_model.py +305 -0
docling/models/factories/__init__.py +47 -0
docling/models/factories/base_factory.py +122 -0
docling/models/factories/layout_factory.py +7 -0
docling/models/factories/ocr_factory.py +11 -0
docling/models/factories/picture_description_factory.py +11 -0
docling/models/factories/table_factory.py +7 -0
docling/models/picture_description_base_model.py +149 -0
docling/models/plugins/__init__.py +0 -0
docling/models/plugins/defaults.py +60 -0
docling/models/stages/__init__.py +0 -0
docling/models/stages/code_formula/__init__.py +0 -0
docling/models/stages/code_formula/code_formula_model.py +342 -0
docling/models/stages/layout/__init__.py +0 -0
docling/models/stages/layout/layout_model.py +249 -0
docling/models/stages/ocr/__init__.py +0 -0
docling/models/stages/ocr/auto_ocr_model.py +132 -0
docling/models/stages/ocr/easyocr_model.py +200 -0
docling/models/stages/ocr/ocr_mac_model.py +145 -0
docling/models/stages/ocr/rapid_ocr_model.py +328 -0
docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
docling/models/stages/page_assemble/__init__.py +0 -0
docling/models/stages/page_assemble/page_assemble_model.py +156 -0
docling/models/stages/page_preprocessing/__init__.py +0 -0
docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
docling/models/stages/picture_classifier/__init__.py +0 -0
docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
docling/models/stages/picture_description/__init__.py +0 -0
docling/models/stages/picture_description/picture_description_api_model.py +66 -0
docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
docling/models/stages/reading_order/__init__.py +0 -0
docling/models/stages/reading_order/readingorder_model.py +431 -0
docling/models/stages/table_structure/__init__.py +0 -0
docling/models/stages/table_structure/table_structure_model.py +305 -0
docling/models/utils/__init__.py +0 -0
docling/models/utils/generation_utils.py +157 -0
docling/models/utils/hf_model_download.py +45 -0
docling/models/vlm_pipeline_models/__init__.py +1 -0
docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
docling/models/vlm_pipeline_models/mlx_model.py +325 -0
docling/models/vlm_pipeline_models/vllm_model.py +344 -0
docling/pipeline/__init__.py +0 -0
docling/pipeline/asr_pipeline.py +431 -0
docling/pipeline/base_extraction_pipeline.py +72 -0
docling/pipeline/base_pipeline.py +326 -0
docling/pipeline/extraction_vlm_pipeline.py +207 -0
docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
docling/pipeline/simple_pipeline.py +55 -0
docling/pipeline/standard_pdf_pipeline.py +859 -0
docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
docling/pipeline/vlm_pipeline.py +416 -0
docling/py.typed +1 -0
docling/utils/__init__.py +0 -0
docling/utils/accelerator_utils.py +97 -0
docling/utils/api_image_request.py +205 -0
docling/utils/deepseekocr_utils.py +388 -0
docling/utils/export.py +146 -0
docling/utils/glm_utils.py +361 -0
docling/utils/layout_postprocessor.py +683 -0
docling/utils/locks.py +3 -0
docling/utils/model_downloader.py +168 -0
docling/utils/ocr_utils.py +69 -0
docling/utils/orientation.py +65 -0
docling/utils/profiling.py +65 -0
docling/utils/utils.py +65 -0
docling/utils/visualization.py +85 -0
docling-2.69.0.dist-info/METADATA +237 -0
docling-2.69.0.dist-info/RECORD +138 -0
docling-2.69.0.dist-info/WHEEL +5 -0
docling-2.69.0.dist-info/entry_points.txt +6 -0
docling-2.69.0.dist-info/licenses/LICENSE +21 -0
docling-2.69.0.dist-info/top_level.txt +1 -0

docling/cli/models.py ADDED Viewed

@@ -0,0 +1,196 @@
+import logging
+import warnings
+from enum import Enum
+from pathlib import Path
+from typing import Annotated, Optional
+import typer
+from rich.console import Console
+from rich.logging import RichHandler
+from docling.datamodel.settings import settings
+from docling.models.utils.hf_model_download import download_hf_model
+from docling.utils.model_downloader import download_models
+# Silence noisy third-party warnings. `module` is interpreted as a regular
+# expression matched against the start of the emitting module's name.
+warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
+warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
+# Rich consoles: one on the default stream, one bound to stderr.
+console = Console()
+err_console = Console(stderr=True)
+# Typer application that groups the model-management commands of this module.
+app = typer.Typer(
+    name="Docling models helper",
+    no_args_is_help=True,
+    add_completion=False,
+    pretty_exceptions_enable=False,
+)
+# Identifiers of the models that the `download` command can fetch.
+# Each member is translated into the matching `with_<value>` flag of
+# `download_models` inside `download`.
+class _AvailableModels(str, Enum):
+    LAYOUT = "layout"
+    TABLEFORMER = "tableformer"
+    CODE_FORMULA = "code_formula"
+    PICTURE_CLASSIFIER = "picture_classifier"
+    SMOLVLM = "smolvlm"
+    GRANITEDOCLING = "granitedocling"
+    GRANITEDOCLING_MLX = "granitedocling_mlx"
+    SMOLDOCLING = "smoldocling"
+    SMOLDOCLING_MLX = "smoldocling_mlx"
+    GRANITE_VISION = "granite_vision"
+    RAPIDOCR = "rapidocr"
+    EASYOCR = "easyocr"
+# Models fetched by `download` when no explicit models are passed and
+# `--all` is not set.
+_default_models = [
+    _AvailableModels.LAYOUT,
+    _AvailableModels.TABLEFORMER,
+    _AvailableModels.CODE_FORMULA,
+    _AvailableModels.PICTURE_CLASSIFIER,
+    _AvailableModels.RAPIDOCR,
+]
+@app.command("download")
+def download(
+    output_dir: Annotated[
+        Path,
+        typer.Option(
+            ...,
+            "-o",
+            "--output-dir",
+            help="The directory where to download the models.",
+        ),
+    ] = (settings.cache_dir / "models"),
+    force: Annotated[
+        bool, typer.Option(..., help="If true, the download will be forced.")
+    ] = False,
+    models: Annotated[
+        Optional[list[_AvailableModels]],
+        typer.Argument(
+            help="Models to download (default behavior: a predefined set of models will be downloaded).",
+        ),
+    ] = None,
+    all: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            "--all",
+            help="If true, all available models will be downloaded (mutually exclusive with passing specific models).",
+            show_default=True,
+        ),
+    ] = False,
+    quiet: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            "-q",
+            "--quiet",
+            help="No extra output is generated, the CLI prints only the directory with the cached models.",
+        ),
+    ] = False,
+):
+    if models and all:
+        raise typer.BadParameter(
+            "Cannot simultaneously set 'all' parameter and specify models to download."
+        )
+    if not quiet:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="[blue]%(message)s[/blue]",
+            datefmt="[%X]",
+            handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
+        )
+    to_download = models or (list(_AvailableModels) if all else _default_models)
+    output_dir = download_models(
+        output_dir=output_dir,
+        force=force,
+        progress=(not quiet),
+        with_layout=_AvailableModels.LAYOUT in to_download,
+        with_tableformer=_AvailableModels.TABLEFORMER in to_download,
+        with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
+        with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
+        with_smolvlm=_AvailableModels.SMOLVLM in to_download,
+        with_granitedocling=_AvailableModels.GRANITEDOCLING in to_download,
+        with_granitedocling_mlx=_AvailableModels.GRANITEDOCLING_MLX in to_download,
+        with_smoldocling=_AvailableModels.SMOLDOCLING in to_download,
+        with_smoldocling_mlx=_AvailableModels.SMOLDOCLING_MLX in to_download,
+        with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
+        with_rapidocr=_AvailableModels.RAPIDOCR in to_download,
+        with_easyocr=_AvailableModels.EASYOCR in to_download,
+    )
+    if quiet:
+        typer.echo(output_dir)
+    else:
+        typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green")
+        console.print(
+            "\n",
+            "Docling can now be configured for running offline using the local artifacts.\n\n",
+            "Using the CLI:",
+            f"`docling --artifacts-path={output_dir} FILE`",
+            "\n",
+            "Using Python: see the documentation at <https://docling-project.github.io/docling/usage>.",
+        )
+@app.command("download-hf-repo")
+def download_hf_repo(
+    models: Annotated[
+        list[str],
+        typer.Argument(
+            help="Specific models to download from HuggingFace identified by their repo id. For example: docling-project/docling-models .",
+        ),
+    ],
+    output_dir: Annotated[
+        Path,
+        typer.Option(
+            ...,
+            "-o",
+            "--output-dir",
+            help="The directory where to download the models.",
+        ),
+    ] = (settings.cache_dir / "models"),
+    force: Annotated[
+        bool, typer.Option(..., help="If true, the download will be forced.")
+    ] = False,
+    quiet: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            "-q",
+            "--quiet",
+            help="No extra output is generated, the CLI prints only the directory with the cached models.",
+        ),
+    ] = False,
+):
+    if not quiet:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="[blue]%(message)s[/blue]",
+            datefmt="[%X]",
+            handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
+        )
+    for item in models:
+        typer.secho(f"\nDownloading {item} model from HuggingFace...")
+        download_hf_model(
+            repo_id=item,
+            # would be better to reuse "repo_cache_folder" property: https://github.com/docling-project/docling/blob/main/docling/datamodel/pipeline_options_vlm_model.py#L76
+            # but creating options objects seams like an overkill
+            local_dir=output_dir / item.replace("/", "--"),
+            force=force,
+            progress=(not quiet),
+        )
+    if quiet:
+        typer.echo(output_dir)
+    else:
+        typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green")
+# Click command object derived from the Typer app.
+click_app = typer.main.get_command(app)
+# Run the CLI when this module is executed as a script.
+if __name__ == "__main__":
+    app()

docling/cli/tools.py ADDED Viewed

@@ -0,0 +1,17 @@
+import typer
+from docling.cli.models import app as models_app
+# Top-level helper CLI; the models app is mounted under the "models" name.
+app = typer.Typer(
+    name="Docling helpers",
+    no_args_is_help=True,
+    add_completion=False,
+    pretty_exceptions_enable=False,
+)
+app.add_typer(models_app, name="models")
+# Click command object derived from the Typer app.
+click_app = typer.main.get_command(app)
+# Run the CLI when this module is executed as a script.
+if __name__ == "__main__":
+    app()

docling/datamodel/__init__.py ADDED Viewed

File without changes

docling/datamodel/accelerator_options.py ADDED Viewed

@@ -0,0 +1,69 @@
+import logging
+import os
+import re
+from enum import Enum
+from typing import Any, Union
+from pydantic import field_validator, model_validator
+from pydantic_settings import BaseSettings, SettingsConfigDict
+_log = logging.getLogger(__name__)
+class AcceleratorDevice(str, Enum):
+    """Devices to run model inference"""
+    # The str mixin makes each member compare equal to its plain string value.
+    AUTO = "auto"
+    CPU = "cpu"
+    CUDA = "cuda"
+    MPS = "mps"
+    XPU = "xpu"
+class AcceleratorOptions(BaseSettings):
+    model_config = SettingsConfigDict(
+        env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
+    )
+    num_threads: int = 4
+    device: Union[str, AcceleratorDevice] = "auto"
+    cuda_use_flash_attention2: bool = False
+    @field_validator("device")
+    def validate_device(cls, value):
+        # "auto", "cpu", "cuda", "mps", "xpu", or "cuda:N"
+        if value in {d.value for d in AcceleratorDevice} or re.match(
+            r"^cuda(:\d+)?$", value
+        ):
+            return value
+        raise ValueError(
+            "Invalid device option. Use 'auto', 'cpu', 'mps', 'xpu', 'cuda', or 'cuda:N'."
+        )
+    @model_validator(mode="before")
+    @classmethod
+    def check_alternative_envvars(cls, data: Any) -> Any:
+        r"""
+        Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
+        The alternative envvar is used only if it is valid and the regular envvar is not set.
+        Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
+        the same functionality. In case the alias envvar is set and the user tries to override the
+        parameter in settings initialization, Pydantic treats the parameter provided in __init__()
+        as an extra input instead of simply overwriting the evvar value for that parameter.
+        """
+        if isinstance(data, dict):
+            input_num_threads = data.get("num_threads")
+            # Check if to set the num_threads from the alternative envvar
+            if input_num_threads is None:
+                docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
+                omp_num_threads = os.getenv("OMP_NUM_THREADS")
+                if docling_num_threads is None and omp_num_threads is not None:
+                    try:
+                        data["num_threads"] = int(omp_num_threads)
+                    except ValueError:
+                        _log.error(
+                            "Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
+                            omp_num_threads,
+                        )
+        return data