PyPI - docling - Versions diffs - 2.18.0__py3-none-any.whl → 2.19.0__py3-none-any.whl - Mend

docling 2.18.0 (py3-none-any.whl) → 2.19.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

docling/backend/md_backend.py +62 -46
docling/backend/msword_backend.py +1 -1
docling/cli/main.py +8 -0
docling/cli/models.py +105 -0
docling/cli/tools.py +17 -0
docling/datamodel/settings.py +2 -0
docling/models/base_model.py +3 -0
docling/models/code_formula_model.py +15 -9
docling/models/document_picture_classifier.py +11 -8
docling/models/easyocr_model.py +50 -3
docling/models/layout_model.py +49 -3
docling/models/table_structure_model.py +44 -2
docling/pipeline/base_pipeline.py +1 -1
docling/pipeline/standard_pdf_pipeline.py +25 -24
docling/utils/model_downloader.py +72 -0
docling/utils/utils.py +24 -0
{docling-2.18.0.dist-info → docling-2.19.0.dist-info}/METADATA +5 -4
{docling-2.18.0.dist-info → docling-2.19.0.dist-info}/RECORD +21 -18
{docling-2.18.0.dist-info → docling-2.19.0.dist-info}/entry_points.txt +1 -0
{docling-2.18.0.dist-info → docling-2.19.0.dist-info}/LICENSE +0 -0
{docling-2.18.0.dist-info → docling-2.19.0.dist-info}/WHEEL +0 -0

docling/backend/md_backend.py CHANGED Viewed

@@ -36,7 +36,7 @@ _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
 class MarkdownDocumentBackend(DeclarativeDocumentBackend):
-    def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
+    def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
         # This regex will match any sequence of underscores
         pattern = r"_+"
@@ -81,7 +81,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 # very long sequences of underscores will lead to unnecessary long processing times.
                 # In any proper Markdown files, underscores have to be escaped,
                 # otherwise they represent emphasis (bold or italic)
-                self.markdown = self.shorten_underscore_sequences(text_stream)
+                self.markdown = self._shorten_underscore_sequences(text_stream)
             if isinstance(self.path_or_stream, Path):
                 with open(self.path_or_stream, "r", encoding="utf-8") as f:
                     md_content = f.read()
@@ -89,7 +89,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     # very long sequences of underscores will lead to unnecessary long processing times.
                     # In any proper Markdown files, underscores have to be escaped,
                     # otherwise they represent emphasis (bold or italic)
-                    self.markdown = self.shorten_underscore_sequences(md_content)
+                    self.markdown = self._shorten_underscore_sequences(md_content)
             self.valid = True
             _log.debug(self.markdown)
@@ -99,7 +99,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             ) from e
         return
-    def close_table(self, doc: DoclingDocument):
+    def _close_table(self, doc: DoclingDocument):
         if self.in_table:
             _log.debug("=== TABLE START ===")
             for md_table_row in self.md_table_buffer:
@@ -156,30 +156,35 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 doc.add_table(data=table_data)
         return
-    def process_inline_text(
-        self, parent_element: Optional[NodeItem], doc: DoclingDocument
+    def _process_inline_text(
+        self, parent_item: Optional[NodeItem], doc: DoclingDocument
     ):
         txt = " ".join(self.inline_texts)
         if len(txt) > 0:
             doc.add_text(
                 label=DocItemLabel.PARAGRAPH,
-                parent=parent_element,
+                parent=parent_item,
                 text=txt,
             )
         self.inline_texts = []
-    def iterate_elements(
+    def _iterate_elements(
         self,
         element: marko.element.Element,
         depth: int,
         doc: DoclingDocument,
-        parent_element: Optional[NodeItem] = None,
+        visited: Set[marko.element.Element],
+        parent_item: Optional[NodeItem] = None,
     ):
+        if element in visited:
+            return
         # Iterates over all elements in the AST
         # Check for different element types and process relevant details
         if isinstance(element, marko.block.Heading) and len(element.children) > 0:
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(
                 f" - Heading level {element.level}, content: {element.children[0].children}"  # type: ignore
             )
@@ -207,8 +212,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             traverse(element)
             snippet_text = "".join(strings)
             if len(snippet_text) > 0:
-                parent_element = doc.add_text(
-                    label=doc_label, parent=parent_element, text=snippet_text
+                parent_item = doc.add_text(
+                    label=doc_label, parent=parent_item, text=snippet_text
                 )
         elif isinstance(element, marko.block.List):
@@ -218,35 +223,37 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     has_non_empty_list_items = True
                     break
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
             if has_non_empty_list_items:
                 label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
-                parent_element = doc.add_group(
-                    label=label, name=f"list", parent=parent_element
+                parent_item = doc.add_group(
+                    label=label, name=f"list", parent=parent_item
                 )
         elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(" - List item")
-            snippet_text = str(element.children[0].children[0].children)  # type: ignore
+            first_child = element.children[0]
+            snippet_text = str(first_child.children[0].children)  # type: ignore
             is_numbered = False
             if (
-                parent_element is not None
-                and isinstance(parent_element, DocItem)
-                and parent_element.label == GroupLabel.ORDERED_LIST
+                parent_item is not None
+                and isinstance(parent_item, DocItem)
+                and parent_item.label == GroupLabel.ORDERED_LIST
             ):
                 is_numbered = True
             doc.add_list_item(
-                enumerated=is_numbered, parent=parent_element, text=snippet_text
+                enumerated=is_numbered, parent=parent_item, text=snippet_text
             )
+            visited.add(first_child)
         elif isinstance(element, marko.inline.Image):
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
             fig_caption: Optional[TextItem] = None
@@ -255,10 +262,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     label=DocItemLabel.CAPTION, text=element.title
                 )
-            doc.add_picture(parent=parent_element, caption=fig_caption)
+            doc.add_picture(parent=parent_item, caption=fig_caption)
         elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
-            self.process_inline_text(parent_element, doc)
+            self._process_inline_text(parent_item, doc)
         elif isinstance(element, marko.inline.RawText):
             _log.debug(f" - Paragraph (raw text): {element.children}")
@@ -272,17 +279,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 else:
                     self.md_table_buffer.append(snippet_text)
             else:
-                self.close_table(doc)
-                self.in_table = False
+                self._close_table(doc)
                 # most likely just inline text
                 self.inline_texts.append(str(element.children))
         elif isinstance(element, marko.inline.CodeSpan):
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Code Span: {element.children}")
             snippet_text = str(element.children).strip()
-            doc.add_code(parent=parent_element, text=snippet_text)
+            doc.add_code(parent=parent_item, text=snippet_text)
         elif (
             isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
@@ -290,10 +296,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             and isinstance((first_child := element.children[0]), marko.inline.RawText)
             and len(snippet_text := (first_child.children.strip())) > 0
         ):
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Code Block: {element.children}")
-            doc.add_code(parent=parent_element, text=snippet_text)
+            doc.add_code(parent=parent_item, text=snippet_text)
         elif isinstance(element, marko.inline.LineBreak):
             if self.in_table:
@@ -302,8 +308,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         elif isinstance(element, marko.block.HTMLBlock):
             self._html_blocks += 1
-            self.process_inline_text(parent_element, doc)
-            self.close_table(doc)
+            self._process_inline_text(parent_item, doc)
+            self._close_table(doc)
             _log.debug("HTML Block: {}".format(element))
             if (
                 len(element.body) > 0
@@ -312,18 +318,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 # wrap in markers to enable post-processing in convert()
                 text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
-                doc.add_code(parent=parent_element, text=text_to_add)
+                doc.add_code(parent=parent_item, text=text_to_add)
         else:
             if not isinstance(element, str):
-                self.close_table(doc)
+                self._close_table(doc)
                 _log.debug("Some other element: {}".format(element))
         processed_block_types = (
-            marko.block.ListItem,
             marko.block.Heading,
             marko.block.CodeBlock,
             marko.block.FencedCode,
-            # marko.block.Paragraph,
             marko.inline.RawText,
         )
@@ -332,7 +336,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             element, processed_block_types
         ):
             for child in element.children:
-                self.iterate_elements(child, depth + 1, doc, parent_element)
+                self._iterate_elements(
+                    element=child,
+                    depth=depth + 1,
+                    doc=doc,
+                    visited=visited,
+                    parent_item=parent_item,
+                )
     def is_valid(self) -> bool:
         return self.valid
@@ -366,9 +376,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             marko_parser = Markdown()
             parsed_ast = marko_parser.parse(self.markdown)
             # Start iterating from the root of the AST
-            self.iterate_elements(parsed_ast, 0, doc, None)
-            self.process_inline_text(None, doc)  # handle last hanging inline text
-            self.close_table(doc=doc)  # handle any last hanging table
+            self._iterate_elements(
+                element=parsed_ast,
+                depth=0,
+                doc=doc,
+                parent_item=None,
+                visited=set(),
+            )
+            self._process_inline_text(None, doc)  # handle last hanging inline text
+            self._close_table(doc=doc)  # handle any last hanging table
             # if HTML blocks were detected, export to HTML and delegate to HTML backend
             if self._html_blocks > 0:

docling/backend/msword_backend.py CHANGED Viewed

@@ -242,7 +242,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             parts = label.split(":")
             if len(parts) == 2:
-                return parts[0], int(parts[1])
+                return parts[0], self.str_to_int(parts[1], None)
         parts = self.split_text_and_number(label)

docling/cli/main.py CHANGED Viewed

@@ -219,6 +219,13 @@ def convert(
         bool,
         typer.Option(..., help="Enable the formula enrichment model in the pipeline."),
     ] = False,
+    enrich_picture_classes: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            help="Enable the picture classification enrichment model in the pipeline.",
+        ),
+    ] = False,
     artifacts_path: Annotated[
         Optional[Path],
         typer.Option(..., help="If provided, the location of the model artifacts."),
@@ -375,6 +382,7 @@ def convert(
             do_table_structure=True,
             do_code_enrichment=enrich_code,
             do_formula_enrichment=enrich_formula,
+            do_picture_classification=enrich_picture_classes,
             document_timeout=document_timeout,
         )
         pipeline_options.table_structure_options.do_cell_matching = (

docling/cli/models.py ADDED Viewed

@@ -0,0 +1,105 @@
+import logging
+import warnings
+from enum import Enum
+from pathlib import Path
+from typing import Annotated, Optional
+import typer
+from rich.console import Console
+from rich.logging import RichHandler
+from docling.datamodel.settings import settings
+from docling.utils.model_downloader import download_models
+warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
+warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
+console = Console()
+err_console = Console(stderr=True)
+app = typer.Typer(
+    name="Docling models helper",
+    no_args_is_help=True,
+    add_completion=False,
+    pretty_exceptions_enable=False,
+)
+class _AvailableModels(str, Enum):
+    LAYOUT = "layout"
+    TABLEFORMER = "tableformer"
+    CODE_FORMULA = "code_formula"
+    PICTURE_CLASSIFIER = "picture_classifier"
+    EASYOCR = "easyocr"
+@app.command("download")
+def download(
+    output_dir: Annotated[
+        Path,
+        typer.Option(
+            ...,
+            "-o",
+            "--output-dir",
+            help="The directory where all the models are downloaded.",
+        ),
+    ] = (settings.cache_dir / "models"),
+    force: Annotated[
+        bool, typer.Option(..., help="If true, the download will be forced")
+    ] = False,
+    models: Annotated[
+        Optional[list[_AvailableModels]],
+        typer.Argument(
+            help=f"Models to download (default behavior: all will be downloaded)",
+        ),
+    ] = None,
+    quiet: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            "-q",
+            "--quiet",
+            help="No extra output is generated, the CLI prints only the directory with the cached models.",
+        ),
+    ] = False,
+):
+    if not quiet:
+        FORMAT = "%(message)s"
+        logging.basicConfig(
+            level=logging.INFO,
+            format="[blue]%(message)s[/blue]",
+            datefmt="[%X]",
+            handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
+        )
+    to_download = models or [m for m in _AvailableModels]
+    output_dir = download_models(
+        output_dir=output_dir,
+        force=force,
+        progress=(not quiet),
+        with_layout=_AvailableModels.LAYOUT in to_download,
+        with_tableformer=_AvailableModels.TABLEFORMER in to_download,
+        with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
+        with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
+        with_easyocr=_AvailableModels.EASYOCR in to_download,
+    )
+    if quiet:
+        typer.echo(output_dir)
+    else:
+        typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green")
+        console.print(
+            "\n",
+            "Docling can now be configured for running offline using the local artifacts.\n\n",
+            "Using the CLI:",
+            f"`docling --artifacts-path={output_dir} FILE`",
+            "\n",
+            "Using Python: see the documentation at <https://ds4sd.github.io/docling/usage>.",
+        )
+click_app = typer.main.get_command(app)
+if __name__ == "__main__":
+    app()

docling/cli/tools.py ADDED Viewed

@@ -0,0 +1,17 @@
+import typer
+from docling.cli.models import app as models_app
+app = typer.Typer(
+    name="Docling helpers",
+    no_args_is_help=True,
+    add_completion=False,
+    pretty_exceptions_enable=False,
+)
+app.add_typer(models_app, name="models")
+click_app = typer.main.get_command(app)
+if __name__ == "__main__":
+    app()

docling/datamodel/settings.py CHANGED Viewed

@@ -61,5 +61,7 @@ class AppSettings(BaseSettings):
     perf: BatchConcurrencySettings
     debug: DebugSettings
+    cache_dir: Path = Path.home() / ".cache" / "docling"
 settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())

docling/models/base_model.py CHANGED Viewed

@@ -6,6 +6,7 @@ from typing_extensions import TypeVar
 from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
 from docling.datamodel.document import ConversionResult
+from docling.datamodel.settings import settings
 class BasePageModel(ABC):
@@ -21,6 +22,8 @@ EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
 class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
+    elements_batch_size: int = settings.perf.elements_batch_size
     @abstractmethod
     def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
         pass

docling/models/code_formula_model.py CHANGED Viewed

@@ -2,6 +2,7 @@ import re
 from pathlib import Path
 from typing import Iterable, List, Literal, Optional, Tuple, Union
+import numpy as np
 from docling_core.types.doc import (
     CodeItem,
     DocItemLabel,
@@ -61,13 +62,15 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
         Processes the given batch of elements and enriches them with predictions.
     """
+    _model_repo_folder = "ds4sd--CodeFormula"
+    elements_batch_size = 5
     images_scale = 1.66  # = 120 dpi, aligned with training data resolution
     expansion_factor = 0.03
     def __init__(
         self,
         enabled: bool,
-        artifacts_path: Optional[Union[Path, str]],
+        artifacts_path: Optional[Path],
         options: CodeFormulaModelOptions,
         accelerator_options: AcceleratorOptions,
     ):
@@ -96,29 +99,32 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
             )
             if artifacts_path is None:
-                artifacts_path = self.download_models_hf()
+                artifacts_path = self.download_models()
             else:
-                artifacts_path = Path(artifacts_path)
+                artifacts_path = artifacts_path / self._model_repo_folder
             self.code_formula_model = CodeFormulaPredictor(
-                artifacts_path=artifacts_path,
+                artifacts_path=str(artifacts_path),
                 device=device,
                 num_threads=accelerator_options.num_threads,
             )
     @staticmethod
-    def download_models_hf(
-        local_dir: Optional[Path] = None, force: bool = False
+    def download_models(
+        local_dir: Optional[Path] = None,
+        force: bool = False,
+        progress: bool = False,
     ) -> Path:
         from huggingface_hub import snapshot_download
         from huggingface_hub.utils import disable_progress_bars
-        disable_progress_bars()
+        if not progress:
+            disable_progress_bars()
         download_path = snapshot_download(
             repo_id="ds4sd/CodeFormula",
             force_download=force,
             local_dir=local_dir,
-            revision="v1.0.0",
+            revision="v1.0.1",
         )
         return Path(download_path)
@@ -226,7 +232,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
             return
         labels: List[str] = []
-        images: List[Image.Image] = []
+        images: List[Union[Image.Image, np.ndarray]] = []
         elements: List[TextItem] = []
         for el in element_batch:
             assert isinstance(el.item, TextItem)

docling/models/document_picture_classifier.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from pathlib import Path
 from typing import Iterable, List, Literal, Optional, Tuple, Union
+import numpy as np
 from docling_core.types.doc import (
     DoclingDocument,
     NodeItem,
@@ -55,12 +56,13 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
         Processes a batch of elements and adds classification annotations.
     """
+    _model_repo_folder = "ds4sd--DocumentFigureClassifier"
     images_scale = 2
     def __init__(
         self,
         enabled: bool,
-        artifacts_path: Optional[Union[Path, str]],
+        artifacts_path: Optional[Path],
         options: DocumentPictureClassifierOptions,
         accelerator_options: AcceleratorOptions,
     ):
@@ -88,24 +90,25 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
             )
             if artifacts_path is None:
-                artifacts_path = self.download_models_hf()
+                artifacts_path = self.download_models()
             else:
-                artifacts_path = Path(artifacts_path)
+                artifacts_path = artifacts_path / self._model_repo_folder
             self.document_picture_classifier = DocumentFigureClassifierPredictor(
-                artifacts_path=artifacts_path,
+                artifacts_path=str(artifacts_path),
                 device=device,
                 num_threads=accelerator_options.num_threads,
             )
     @staticmethod
-    def download_models_hf(
-        local_dir: Optional[Path] = None, force: bool = False
+    def download_models(
+        local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
     ) -> Path:
         from huggingface_hub import snapshot_download
         from huggingface_hub.utils import disable_progress_bars
-        disable_progress_bars()
+        if not progress:
+            disable_progress_bars()
         download_path = snapshot_download(
             repo_id="ds4sd/DocumentFigureClassifier",
             force_download=force,
@@ -159,7 +162,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
                 yield element
             return
-        images: List[Image.Image] = []
+        images: List[Union[Image.Image, np.ndarray]] = []
         elements: List[PictureItem] = []
         for el in element_batch:
             assert isinstance(el, PictureItem)

docling/models/easyocr_model.py CHANGED Viewed

@@ -1,7 +1,10 @@
 import logging
 import warnings
-from typing import Iterable
+import zipfile
+from pathlib import Path
+from typing import Iterable, List, Optional
+import httpx
 import numpy
 import torch
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -17,14 +20,18 @@ from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
+from docling.utils.utils import download_url_with_progress
 _log = logging.getLogger(__name__)
 class EasyOcrModel(BaseOcrModel):
+    _model_repo_folder = "EasyOcr"
     def __init__(
         self,
         enabled: bool,
+        artifacts_path: Optional[Path],
         options: EasyOcrOptions,
         accelerator_options: AcceleratorOptions,
     ):
@@ -62,15 +69,55 @@ class EasyOcrModel(BaseOcrModel):
                 )
                 use_gpu = self.options.use_gpu
+            download_enabled = self.options.download_enabled
+            model_storage_directory = self.options.model_storage_directory
+            if artifacts_path is not None and model_storage_directory is None:
+                download_enabled = False
+                model_storage_directory = str(artifacts_path / self._model_repo_folder)
             self.reader = easyocr.Reader(
                 lang_list=self.options.lang,
                 gpu=use_gpu,
-                model_storage_directory=self.options.model_storage_directory,
+                model_storage_directory=model_storage_directory,
                 recog_network=self.options.recog_network,
-                download_enabled=self.options.download_enabled,
+                download_enabled=download_enabled,
                 verbose=False,
             )
+    @staticmethod
+    def download_models(
+        detection_models: List[str] = ["craft"],
+        recognition_models: List[str] = ["english_g2", "latin_g2"],
+        local_dir: Optional[Path] = None,
+        force: bool = False,
+        progress: bool = False,
+    ) -> Path:
+        # Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
+        from easyocr.config import detection_models as det_models_dict
+        from easyocr.config import recognition_models as rec_models_dict
+        if local_dir is None:
+            local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
+        local_dir.mkdir(parents=True, exist_ok=True)
+        # Collect models to download
+        download_list = []
+        for model_name in detection_models:
+            if model_name in det_models_dict:
+                download_list.append(det_models_dict[model_name])
+        for model_name in recognition_models:
+            if model_name in rec_models_dict["gen2"]:
+                download_list.append(rec_models_dict["gen2"][model_name])
+        # Download models
+        for model_details in download_list:
+            buf = download_url_with_progress(model_details["url"], progress=progress)
+            with zipfile.ZipFile(buf, "r") as zip_ref:
+                zip_ref.extractall(local_dir)
+        return local_dir
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:

docling/models/layout_model.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import copy
 import logging
+import warnings
 from pathlib import Path
-from typing import Iterable
+from typing import Iterable, Optional, Union
 from docling_core.types.doc import DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
@@ -21,6 +22,8 @@ _log = logging.getLogger(__name__)
 class LayoutModel(BasePageModel):
+    _model_repo_folder = "ds4sd--docling-models"
+    _model_path = "model_artifacts/layout"
     TEXT_ELEM_LABELS = [
         DocItemLabel.TEXT,
@@ -42,15 +45,56 @@ class LayoutModel(BasePageModel):
     FORMULA_LABEL = DocItemLabel.FORMULA
     CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]
-    def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions):
+    def __init__(
+        self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions
+    ):
         device = decide_device(accelerator_options.device)
+        if artifacts_path is None:
+            artifacts_path = self.download_models() / self._model_path
+        else:
+            # will become the default in the future
+            if (artifacts_path / self._model_repo_folder).exists():
+                artifacts_path = (
+                    artifacts_path / self._model_repo_folder / self._model_path
+                )
+            elif (artifacts_path / self._model_path).exists():
+                warnings.warn(
+                    "The usage of artifacts_path containing directly "
+                    f"{self._model_path} is deprecated. Please point "
+                    "the artifacts_path to the parent containing "
+                    f"the {self._model_repo_folder} folder.",
+                    DeprecationWarning,
+                    stacklevel=3,
+                )
+                artifacts_path = artifacts_path / self._model_path
         self.layout_predictor = LayoutPredictor(
             artifact_path=str(artifacts_path),
             device=device,
             num_threads=accelerator_options.num_threads,
         )
+    @staticmethod
+    def download_models(
+        local_dir: Optional[Path] = None,
+        force: bool = False,
+        progress: bool = False,
+    ) -> Path:
+        from huggingface_hub import snapshot_download
+        from huggingface_hub.utils import disable_progress_bars
+        if not progress:
+            disable_progress_bars()
+        download_path = snapshot_download(
+            repo_id="ds4sd/docling-models",
+            force_download=force,
+            local_dir=local_dir,
+            revision="v2.1.0",
+        )
+        return Path(download_path)
     def draw_clusters_and_cells_side_by_side(
         self, conv_res, page, clusters, mode_prefix: str, show: bool = False
     ):
@@ -106,10 +150,12 @@ class LayoutModel(BasePageModel):
             else:
                 with TimeRecorder(conv_res, "layout"):
                     assert page.size is not None
+                    page_image = page.get_image(scale=1.0)
+                    assert page_image is not None
                     clusters = []
                     for ix, pred_item in enumerate(
-                        self.layout_predictor.predict(page.get_image(scale=1.0))
+                        self.layout_predictor.predict(page_image)
                     ):
                         label = DocItemLabel(
                             pred_item["label"]

docling/models/table_structure_model.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import copy
+import warnings
 from pathlib import Path
-from typing import Iterable
+from typing import Iterable, Optional, Union
 import numpy
 from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
@@ -22,10 +23,13 @@ from docling.utils.profiling import TimeRecorder
 class TableStructureModel(BasePageModel):
+    _model_repo_folder = "ds4sd--docling-models"
+    _model_path = "model_artifacts/tableformer"
     def __init__(
         self,
         enabled: bool,
-        artifacts_path: Path,
+        artifacts_path: Optional[Path],
         options: TableStructureOptions,
         accelerator_options: AcceleratorOptions,
     ):
@@ -35,6 +39,26 @@ class TableStructureModel(BasePageModel):
         self.enabled = enabled
         if self.enabled:
+            if artifacts_path is None:
+                artifacts_path = self.download_models() / self._model_path
+            else:
+                # will become the default in the future
+                if (artifacts_path / self._model_repo_folder).exists():
+                    artifacts_path = (
+                        artifacts_path / self._model_repo_folder / self._model_path
+                    )
+                elif (artifacts_path / self._model_path).exists():
+                    warnings.warn(
+                        "The usage of artifacts_path containing directly "
+                        f"{self._model_path} is deprecated. Please point "
+                        "the artifacts_path to the parent containing "
+                        f"the {self._model_repo_folder} folder.",
+                        DeprecationWarning,
+                        stacklevel=3,
+                    )
+                    artifacts_path = artifacts_path / self._model_path
             if self.mode == TableFormerMode.ACCURATE:
                 artifacts_path = artifacts_path / "accurate"
             else:
@@ -58,6 +82,24 @@ class TableStructureModel(BasePageModel):
             )
             self.scale = 2.0  # Scale up table input images to 144 dpi
+    @staticmethod
+    def download_models(
+        local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
+    ) -> Path:
+        from huggingface_hub import snapshot_download
+        from huggingface_hub.utils import disable_progress_bars
+        if not progress:
+            disable_progress_bars()
+        download_path = snapshot_download(
+            repo_id="ds4sd/docling-models",
+            force_download=force,
+            local_dir=local_dir,
+            revision="v2.1.0",
+        )
+        return Path(download_path)
     def draw_table_and_cells(
         self,
         conv_res: ConversionResult,

docling/pipeline/base_pipeline.py CHANGED Viewed

@@ -79,7 +79,7 @@ class BasePipeline(ABC):
             for model in self.enrichment_pipe:
                 for element_batch in chunkify(
                     _prepare_elements(conv_res, model),
-                    settings.perf.elements_batch_size,
+                    model.elements_batch_size,
                 ):
                     for element in model(
                         doc=conv_res.document, element_batch=element_batch

docling/pipeline/standard_pdf_pipeline.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import logging
 import sys
+import warnings
 from pathlib import Path
 from typing import Optional
@@ -17,6 +18,7 @@ from docling.datamodel.pipeline_options import (
     TesseractCliOcrOptions,
     TesseractOcrOptions,
 )
+from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
 from docling.models.document_picture_classifier import (
@@ -37,23 +39,23 @@ from docling.models.table_structure_model import TableStructureModel
 from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
 from docling.models.tesseract_ocr_model import TesseractOcrModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
+from docling.utils.model_downloader import download_models
 from docling.utils.profiling import ProfilingScope, TimeRecorder
 _log = logging.getLogger(__name__)
 class StandardPdfPipeline(PaginatedPipeline):
-    _layout_model_path = "model_artifacts/layout"
-    _table_model_path = "model_artifacts/tableformer"
+    _layout_model_path = LayoutModel._model_path
+    _table_model_path = TableStructureModel._model_path
     def __init__(self, pipeline_options: PdfPipelineOptions):
         super().__init__(pipeline_options)
         self.pipeline_options: PdfPipelineOptions
-        if pipeline_options.artifacts_path is None:
-            self.artifacts_path = self.download_models_hf()
-        else:
-            self.artifacts_path = Path(pipeline_options.artifacts_path)
+        artifacts_path: Optional[Path] = None
+        if pipeline_options.artifacts_path is not None:
+            artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
         self.keep_images = (
             self.pipeline_options.generate_page_images
@@ -63,7 +65,7 @@ class StandardPdfPipeline(PaginatedPipeline):
         self.glm_model = GlmModel(options=GlmOptions())
-        if (ocr_model := self.get_ocr_model()) is None:
+        if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None:
             raise RuntimeError(
                 f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
             )
@@ -79,15 +81,13 @@ class StandardPdfPipeline(PaginatedPipeline):
             ocr_model,
             # Layout model
             LayoutModel(
-                artifacts_path=self.artifacts_path
-                / StandardPdfPipeline._layout_model_path,
+                artifacts_path=artifacts_path,
                 accelerator_options=pipeline_options.accelerator_options,
             ),
             # Table structure model
             TableStructureModel(
                 enabled=pipeline_options.do_table_structure,
-                artifacts_path=self.artifacts_path
-                / StandardPdfPipeline._table_model_path,
+                artifacts_path=artifacts_path,
                 options=pipeline_options.table_structure_options,
                 accelerator_options=pipeline_options.accelerator_options,
             ),
@@ -101,7 +101,7 @@ class StandardPdfPipeline(PaginatedPipeline):
             CodeFormulaModel(
                 enabled=pipeline_options.do_code_enrichment
                 or pipeline_options.do_formula_enrichment,
-                artifacts_path=pipeline_options.artifacts_path,
+                artifacts_path=artifacts_path,
                 options=CodeFormulaModelOptions(
                     do_code_enrichment=pipeline_options.do_code_enrichment,
                     do_formula_enrichment=pipeline_options.do_formula_enrichment,
@@ -111,7 +111,7 @@ class StandardPdfPipeline(PaginatedPipeline):
             # Document Picture Classifier
             DocumentPictureClassifier(
                 enabled=pipeline_options.do_picture_classification,
-                artifacts_path=pipeline_options.artifacts_path,
+                artifacts_path=artifacts_path,
                 options=DocumentPictureClassifierOptions(),
                 accelerator_options=pipeline_options.accelerator_options,
             ),
@@ -127,23 +127,24 @@ class StandardPdfPipeline(PaginatedPipeline):
     def download_models_hf(
         local_dir: Optional[Path] = None, force: bool = False
     ) -> Path:
-        from huggingface_hub import snapshot_download
-        from huggingface_hub.utils import disable_progress_bars
-        disable_progress_bars()
-        download_path = snapshot_download(
-            repo_id="ds4sd/docling-models",
-            force_download=force,
-            local_dir=local_dir,
-            revision="v2.1.0",
+        warnings.warn(
+            "The usage of StandardPdfPipeline.download_models_hf() is deprecated "
+            "use instead the utility `docling-tools models download`, or "
+            "the upstream method docling.utils.models_downloader.download_all()",
+            DeprecationWarning,
+            stacklevel=3,
         )
-        return Path(download_path)
+        output_dir = download_models(output_dir=local_dir, force=force, progress=False)
+        return output_dir
-    def get_ocr_model(self) -> Optional[BaseOcrModel]:
+    def get_ocr_model(
+        self, artifacts_path: Optional[Path] = None
+    ) -> Optional[BaseOcrModel]:
         if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions):
             return EasyOcrModel(
                 enabled=self.pipeline_options.do_ocr,
+                artifacts_path=artifacts_path,
                 options=self.pipeline_options.ocr_options,
                 accelerator_options=self.pipeline_options.accelerator_options,
             )

docling/utils/model_downloader.py ADDED Viewed

@@ -0,0 +1,72 @@
+import logging
+from pathlib import Path
+from typing import Optional
+from docling.datamodel.settings import settings
+from docling.models.code_formula_model import CodeFormulaModel
+from docling.models.document_picture_classifier import DocumentPictureClassifier
+from docling.models.easyocr_model import EasyOcrModel
+from docling.models.layout_model import LayoutModel
+from docling.models.table_structure_model import TableStructureModel
+_log = logging.getLogger(__name__)
+def download_models(
+    output_dir: Optional[Path] = None,
+    *,
+    force: bool = False,
+    progress: bool = False,
+    with_layout: bool = True,
+    with_tableformer: bool = True,
+    with_code_formula: bool = True,
+    with_picture_classifier: bool = True,
+    with_easyocr: bool = True,
+):
+    if output_dir is None:
+        output_dir = settings.cache_dir / "models"
+    # Make sure the folder exists
+    output_dir.mkdir(exist_ok=True, parents=True)
+    if with_layout:
+        _log.info(f"Downloading layout model...")
+        LayoutModel.download_models(
+            local_dir=output_dir / LayoutModel._model_repo_folder,
+            force=force,
+            progress=progress,
+        )
+    if with_tableformer:
+        _log.info(f"Downloading tableformer model...")
+        TableStructureModel.download_models(
+            local_dir=output_dir / TableStructureModel._model_repo_folder,
+            force=force,
+            progress=progress,
+        )
+    if with_picture_classifier:
+        _log.info(f"Downloading picture classifier model...")
+        DocumentPictureClassifier.download_models(
+            local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
+            force=force,
+            progress=progress,
+        )
+    if with_code_formula:
+        _log.info(f"Downloading code formula model...")
+        CodeFormulaModel.download_models(
+            local_dir=output_dir / CodeFormulaModel._model_repo_folder,
+            force=force,
+            progress=progress,
+        )
+    if with_easyocr:
+        _log.info(f"Downloading easyocr models...")
+        EasyOcrModel.download_models(
+            local_dir=output_dir / EasyOcrModel._model_repo_folder,
+            force=force,
+            progress=progress,
+        )
+    return output_dir

docling/utils/utils.py CHANGED Viewed

@@ -4,6 +4,9 @@ from itertools import islice
 from pathlib import Path
 from typing import List, Union
+import requests
+from tqdm import tqdm
 def chunkify(iterator, chunk_size):
     """Yield successive chunks of chunk_size from the iterable."""
@@ -39,3 +42,24 @@ def create_hash(string: str):
     hasher.update(string.encode("utf-8"))
     return hasher.hexdigest()
+def download_url_with_progress(url: str, progress: bool = False) -> BytesIO:
+    buf = BytesIO()
+    with requests.get(url, stream=True, allow_redirects=True) as response:
+        total_size = int(response.headers.get("content-length", 0))
+        progress_bar = tqdm(
+            total=total_size,
+            unit="B",
+            unit_scale=True,
+            unit_divisor=1024,
+            disable=(not progress),
+        )
+        for chunk in response.iter_content(10 * 1024):
+            buf.write(chunk)
+            progress_bar.update(len(chunk))
+        progress_bar.close()
+    buf.seek(0)
+    return buf

{docling-2.18.0.dist-info → docling-2.19.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.18.0
+Version: 2.19.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -24,12 +24,12 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Provides-Extra: ocrmac
 Provides-Extra: rapidocr
 Provides-Extra: tesserocr
-Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
+Requires-Dist: beautifulsoup4 (>=4.12.3,<4.13.0)
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
-Requires-Dist: docling-core[chunking] (>=2.17.0,<3.0.0)
+Requires-Dist: docling-core[chunking] (>=2.17.2,<3.0.0)
 Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
-Requires-Dist: docling-parse (>=3.1.0,<4.0.0)
+Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -52,6 +52,7 @@ Requires-Dist: rtree (>=1.3.0,<2.0.0)
 Requires-Dist: scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"
 Requires-Dist: scipy (>=1.6.0,<2.0.0) ; python_version >= "3.10"
 Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
+Requires-Dist: tqdm (>=4.65.0,<5.0.0)
 Requires-Dist: typer (>=0.12.5,<0.13.0)
 Project-URL: Repository, https://github.com/DS4SD/docling
 Description-Content-Type: text/markdown

{docling-2.18.0.dist-info → docling-2.19.0.dist-info}/RECORD RENAMED Viewed

@@ -7,10 +7,10 @@ docling/backend/docling_parse_v2_backend.py,sha256=IpwrBrtLGwNRl5AYO-o3NjEfNRsAk
 docling/backend/html_backend.py,sha256=YTPLZiEEEuGaP6G62skK3wXJ0KftuqBCl8erNXeJyoE,15893
 docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
-docling/backend/md_backend.py,sha256=d7XAFHzFO9qhrCJA3raWEmZ8WXSYyy3KOE57oMeqKGc,16502
+docling/backend/md_backend.py,sha256=NaVfcnEH-5bwVovjn76EobF6B6Wm8AhaTZ4E8k0TUPo,16826
 docling/backend/msexcel_backend.py,sha256=lyJc4ShJGAN2ZfNTTuhdYTF-44cZsGyn_8Djstp3IEU,12700
 docling/backend/mspowerpoint_backend.py,sha256=esAyaaQe17BQFweGAGJHvImKETefY0BpvfpUSECC49w,16424
-docling/backend/msword_backend.py,sha256=0iR1l3eLplPv3CPT7iGwQb50LIVf3C32KZFzwAkARrc,20573
+docling/backend/msword_backend.py,sha256=V4miLIcOH8DDlSCm25F_DALBW60Uf9JoSS0TB4yrQBw,20591
 docling/backend/pdf_backend.py,sha256=17Pr8dWsD1C4FYUprrwMM9trDGW-JYLjrcScx1Ul4io,2048
 docling/backend/pypdfium2_backend.py,sha256=QSPfp903ZtSpoNqPmcIek0HmvETrJ1kkwrdxnF5pjS0,9014
 docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -18,45 +18,48 @@ docling/backend/xml/pubmed_backend.py,sha256=LMnpowjnxa5SydfNC00Ll840BYraL8dCJu-
 docling/backend/xml/uspto_backend.py,sha256=a5GxWLj2SUR5Of8TWJinhef1gKyaQSjHPVXvGiN8yG8,70324
 docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
 docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/cli/main.py,sha256=K5C2yQIoM40_W3YU8a7SmneY-hWbNp_JOFPLk0NPcDI,16098
+docling/cli/main.py,sha256=qShZI1f7WWn5T16YtFTeYY1CUucNjyGefIekCWvkAqc,16366
+docling/cli/models.py,sha256=cjP13QZfgHZWPVJw3kJvSszJdDrRVWx-sJipZRfHEuQ,3102
+docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/datamodel/base_models.py,sha256=vewP1X99qfAwiUsiC2m8CBDGiQPsGyp_WkKJHYPoYn4,7026
 docling/datamodel/document.py,sha256=HkmvQKW3QSx3tAqPTnXiJbD_y1EVwR-WE3n6Gq8g1NY,13428
 docling/datamodel/pipeline_options.py,sha256=f9-VQFgOdahyclGQgH_T8ZYBopkWsF_fbWbxo39ux3g,7888
-docling/datamodel/settings.py,sha256=uN9jeXMwx--tJb-DFU7nr77g0Iou13YAVDzsymTvbHg,1759
+docling/datamodel/settings.py,sha256=pJi9OBqZQhsNi7RwJWQFRDKGhm3u679iN76psA3VtsY,1817
 docling/document_converter.py,sha256=qaldb7Thqk59RdE-RTGtj1M7l5UzaBdnxIvGoQ7lTeo,12876
 docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
 docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/models/base_model.py,sha256=H5X-exVaAN-XMTzxpgUc-rwH-D8Uk7-VuZtq2soNGXI,2567
+docling/models/base_model.py,sha256=IIf_PA933bdwHst3g_MOC4oiYQcSCIVOnxnCnN1NxEQ,2681
 docling/models/base_ocr_model.py,sha256=YiUMvdjnHw9SHjnfJKT5INrPMoIGEf_Z2OApfl_VRTE,6919
-docling/models/code_formula_model.py,sha256=bOIKJvckZ0QpnDZ-CDiYv-CvuGvaGzJgp2PiYAidKBQ,8422
-docling/models/document_picture_classifier.py,sha256=RLB80ueqWZ86hdXtTKmSynCU13nT-T10vUp2sky9110,6078
+docling/models/code_formula_model.py,sha256=6grbRPWaLljadheT5s4omdT6hmXfin4gJU17csWvhjY,8611
+docling/models/document_picture_classifier.py,sha256=6I_j6fG5fnhIV6rqN31LYikNTZyg5isXrVs0GIqHDaY,6235
 docling/models/ds_glm_model.py,sha256=CkhsP0cEWwm4wb1g3cLFriVGpVtELiUK3REDMkPwAMw,13028
-docling/models/easyocr_model.py,sha256=Kakb20ioBxDmNsIqoGvSSs_vbqAWN3QQNHYtEi-eErg,4990
-docling/models/layout_model.py,sha256=3Fw7OM6g0j7NgItKsQOgFOCd1q6lp1DacN_db7f6QCw,6090
+docling/models/easyocr_model.py,sha256=dDy0iaR4KUrq7eFIQclMqUYap1B06PG4nC6RMlGYhSw,6886
+docling/models/layout_model.py,sha256=7fQWipGV1HDrvbP4uOKa9QAicQl89jp7lailQmbFL3w,7804
 docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
 docling/models/page_assemble_model.py,sha256=c5KLKwkUIdW0JcDHizWsqrpb5x_3DK28x82Q8o-3VJM,5968
 docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
 docling/models/rapid_ocr_model.py,sha256=2HXmurNRPP6qyqn7U5h9NQIs8zi0TMHf56CpcKQk0fU,5038
-docling/models/table_structure_model.py,sha256=qZgoBrBh7H-RJGCTtaRGcj79g2WzZiUBTPnHqJZ-bLA,9557
+docling/models/table_structure_model.py,sha256=UIqWlw_9JNfGsO86c00rPb4GCg-yNliKEwyhCqlsZbM,11225
 docling/models/tesseract_ocr_cli_model.py,sha256=b2Is5x2gZLS6mQWnKe0y7p6UU6hRTHDfoH4D2RQ5mx0,9310
 docling/models/tesseract_ocr_model.py,sha256=BN85u-4a-xzUY7Iw21Ib8L8kx4mgbDGiUtxBelLiJm8,8513
 docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/pipeline/base_pipeline.py,sha256=lK8PQiydWJ9M16kIVL7U1A2iryTRFrN5WSucVo2ohFQ,8757
+docling/pipeline/base_pipeline.py,sha256=9ABK-Cr235bxE5vweoIA5rgBZV_EF8qFxAqLI27H_Pg,8749
 docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
-docling/pipeline/standard_pdf_pipeline.py,sha256=Qefg1JSiFwipypi8TZPJ50WgXTLjwkC0wvYAl02RM2o,10480
+docling/pipeline/standard_pdf_pipeline.py,sha256=xVGLYmh677hKBSRCoHYAVn7drmowba2QGI8f-eEC5gs,10624
 docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/utils/accelerator_utils.py,sha256=ZjULCn-qhxqx3frF-rJmAlWdzqgUMxH5utLHbSPev80,1367
 docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
 docling/utils/glm_utils.py,sha256=uyCoFTX9FbS1Ke0aSlkdzGLUt08dZfkgriWadkyLiiA,11856
 docling/utils/layout_postprocessor.py,sha256=urRzeF9PrKiMBvA6DdHHwyLxG06CMhelgJeV5B1l6l0,24258
+docling/utils/model_downloader.py,sha256=5jChSE88byGj7LvGNnB01qBw6n9ODJjnAS66PobRSJc,2267
 docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,263
 docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
-docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
+docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
 docling/utils/visualization.py,sha256=4pn-80fVuE04ken7hUg5Ar47ndRSL9MWBgdHM-1g1zU,2735
-docling-2.18.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
-docling-2.18.0.dist-info/METADATA,sha256=rBP1Z7m0HMpC-HjR360i2JNuIA9lqknRPjUab1mtVic,8403
-docling-2.18.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-docling-2.18.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
-docling-2.18.0.dist-info/RECORD,,
+docling-2.19.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.19.0.dist-info/METADATA,sha256=deXdwXb0i_n3pyEDbVGNQNw4APYoUVtXnkHmC-frXWI,8442
+docling-2.19.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+docling-2.19.0.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
+docling-2.19.0.dist-info/RECORD,,

{docling-2.18.0.dist-info → docling-2.19.0.dist-info}/entry_points.txt RENAMED Viewed

@@ -1,3 +1,4 @@
 [console_scripts]
 docling=docling.cli.main:app
+docling-tools=docling.cli.tools:app

{docling-2.18.0.dist-info → docling-2.19.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{docling-2.18.0.dist-info → docling-2.19.0.dist-info}/WHEEL RENAMED Viewed

File without changes

docling 2.18.0__py3-none-any.whl → 2.19.0__py3-none-any.whl

docling 2.18.0py3-none-any.whl → 2.19.0py3-none-any.whl