PyPI - docling - Versions diffs - 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl - Mend

docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

docling/backend/abstract_backend.py +33 -37
docling/backend/asciidoc_backend.py +431 -0
docling/backend/docling_parse_backend.py +20 -16
docling/backend/docling_parse_v2_backend.py +248 -0
docling/backend/html_backend.py +429 -0
docling/backend/md_backend.py +346 -0
docling/backend/mspowerpoint_backend.py +398 -0
docling/backend/msword_backend.py +496 -0
docling/backend/pdf_backend.py +78 -0
docling/backend/pypdfium2_backend.py +16 -11
docling/cli/main.py +96 -65
docling/datamodel/base_models.py +79 -193
docling/datamodel/document.py +405 -320
docling/datamodel/pipeline_options.py +19 -3
docling/datamodel/settings.py +16 -1
docling/document_converter.py +240 -251
docling/models/base_model.py +28 -0
docling/models/base_ocr_model.py +40 -10
docling/models/ds_glm_model.py +244 -30
docling/models/easyocr_model.py +57 -42
docling/models/layout_model.py +158 -116
docling/models/page_assemble_model.py +127 -101
docling/models/page_preprocessing_model.py +79 -0
docling/models/table_structure_model.py +162 -116
docling/models/tesseract_ocr_cli_model.py +76 -59
docling/models/tesseract_ocr_model.py +90 -58
docling/pipeline/base_pipeline.py +189 -0
docling/pipeline/simple_pipeline.py +56 -0
docling/pipeline/standard_pdf_pipeline.py +201 -0
docling/utils/export.py +4 -3
docling/utils/layout_utils.py +17 -11
docling/utils/profiling.py +62 -0
docling-2.4.1.dist-info/METADATA +154 -0
docling-2.4.1.dist-info/RECORD +45 -0
docling/pipeline/base_model_pipeline.py +0 -18
docling/pipeline/standard_model_pipeline.py +0 -66
docling-1.19.1.dist-info/METADATA +0 -380
docling-1.19.1.dist-info/RECORD +0 -34
{docling-1.19.1.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
{docling-1.19.1.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
{docling-1.19.1.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0

docling/utils/layout_utils.py CHANGED Viewed

@@ -2,6 +2,7 @@ import copy
 import logging
 import networkx as nx
+from docling_core.types.doc import DocItemLabel
 logger = logging.getLogger("layout_utils")
@@ -370,7 +371,7 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
             "Treating cluster " + str(ix) + ", type " + str(new_cluster["type"])
         )
         logger.debug("  with cells: " + str(new_cluster["cell_ids"]))
-        if len(cluster["cell_ids"]) == 0 and cluster["type"] != "Picture":
+        if len(cluster["cell_ids"]) == 0 and cluster["type"] != DocItemLabel.PICTURE:
             logger.debug("  Empty non-picture, removed")
             continue  ## Skip this former cluster, now without cells.
         new_bbox = adapt_bbox(raw_cells, new_cluster, orphan_cell_indices)
@@ -380,14 +381,14 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
 def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
-    if not (cluster["type"] in ["Table", "Picture"]):
+    if not (cluster["type"] in [DocItemLabel.TABLE, DocItemLabel.PICTURE]):
         ## A text-like cluster. The bbox only needs to be around the text cells:
         logger.debug("    Initial bbox: " + str(cluster["bbox"]))
         new_bbox = surrounding_list(
             [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
         )
         logger.debug("  New bounding box:" + str(new_bbox))
-    if cluster["type"] == "Picture":
+    if cluster["type"] == DocItemLabel.PICTURE:
         ## We only make the bbox completely comprise included text cells:
         logger.debug("  Picture")
         if len(cluster["cell_ids"]) != 0:
@@ -587,7 +588,7 @@ def set_orphan_as_text(
     max_id = -1
     figures = []
     for cluster in cluster_predictions:
-        if cluster["type"] == "Picture":
+        if cluster["type"] == DocItemLabel.PICTURE:
             figures.append(cluster)
         if cluster["id"] > max_id:
@@ -638,13 +639,13 @@ def set_orphan_as_text(
             # if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans:
             if fig_flag == False and lines_detector == False:
                 # get class from low confidence detections if not set as text:
-                class_type = "Text"
+                class_type = DocItemLabel.TEXT
                 for cluster in cluster_predictions_low:
                     intersection = compute_intersection(
                         orph_cell["bbox"], cluster["bbox"]
                     )
-                    class_type = "Text"
+                    class_type = DocItemLabel.TEXT
                     if (
                         cluster["confidence"] > 0.1
                         and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4
@@ -718,7 +719,9 @@ def merge_cells(cluster_predictions):
                     if cluster["id"] == node:
                         lines.append(cluster)
                         cluster_predictions.remove(cluster)
-            new_merged_cluster = build_cluster_from_lines(lines, "Text", max_id)
+            new_merged_cluster = build_cluster_from_lines(
+                lines, DocItemLabel.TEXT, max_id
+            )
             cluster_predictions.append(new_merged_cluster)
     return cluster_predictions
@@ -753,9 +756,9 @@ def clean_up_clusters(
                 # remove clusters that might appear inside tables, or images (such as pdf cells in graphs)
                 elif img_table == True:
                     if (
-                        cluster_1["type"] == "Text"
-                        and cluster_2["type"] == "Picture"
-                        or cluster_2["type"] == "Table"
+                        cluster_1["type"] == DocItemLabel.TEXT
+                        and cluster_2["type"] == DocItemLabel.PICTURE
+                        or cluster_2["type"] == DocItemLabel.TABLE
                     ):
                         if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > 0.5:
                             DuplicateDeletedClusterIDs.append(cluster_1["id"])
@@ -771,7 +774,10 @@ def clean_up_clusters(
                             DuplicateDeletedClusterIDs.append(cluster_1["id"])
             # remove tables that have one pdf cell
             if one_cell_table == True:
-                if cluster_1["type"] == "Table" and len(cluster_1["cell_ids"]) < 2:
+                if (
+                    cluster_1["type"] == DocItemLabel.TABLE
+                    and len(cluster_1["cell_ids"]) < 2
+                ):
                     DuplicateDeletedClusterIDs.append(cluster_1["id"])
     DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs))

docling/utils/profiling.py ADDED Viewed

@@ -0,0 +1,62 @@
+import time
+from datetime import datetime
+from enum import Enum
+from typing import TYPE_CHECKING, List
+import numpy as np
+from pydantic import BaseModel
+from docling.datamodel.settings import settings
+if TYPE_CHECKING:
+    from docling.datamodel.document import ConversionResult
+class ProfilingScope(str, Enum):
+    PAGE = "page"
+    DOCUMENT = "document"
+class ProfilingItem(BaseModel):
+    scope: ProfilingScope
+    count: int = 0
+    times: List[float] = []
+    start_timestamps: List[datetime] = []
+    def avg(self) -> float:
+        return np.average(self.times)  # type: ignore
+    def std(self) -> float:
+        return np.std(self.times)  # type: ignore
+    def mean(self) -> float:
+        return np.mean(self.times)  # type: ignore
+    def percentile(self, perc: float) -> float:
+        return np.percentile(self.times, perc)  # type: ignore
+class TimeRecorder:
+    def __init__(
+        self,
+        conv_res: "ConversionResult",
+        key: str,
+        scope: ProfilingScope = ProfilingScope.PAGE,
+    ):
+        if settings.debug.profile_pipeline_timings:
+            if key not in conv_res.timings.keys():
+                conv_res.timings[key] = ProfilingItem(scope=scope)
+            self.conv_res = conv_res
+            self.key = key
+    def __enter__(self):
+        if settings.debug.profile_pipeline_timings:
+            self.start = time.monotonic()
+            self.conv_res.timings[self.key].start_timestamps.append(datetime.utcnow())
+        return self
+    def __exit__(self, *args):
+        if settings.debug.profile_pipeline_timings:
+            elapsed = time.monotonic() - self.start
+            self.conv_res.timings[self.key].times.append(elapsed)
+            self.conv_res.timings[self.key].count += 1

docling-2.4.1.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,154 @@
+Metadata-Version: 2.1
+Name: docling
+Version: 2.4.1
+Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
+Home-page: https://github.com/DS4SD/docling
+License: MIT
+Keywords: docling,convert,document,pdf,docx,html,markdown,layout model,segmentation,table structure,table former
+Author: Christoph Auer
+Author-email: cau@zurich.ibm.com
+Requires-Python: >=3.10,<4.0
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: MacOS :: MacOS X
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Provides-Extra: tesserocr
+Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
+Requires-Dist: certifi (>=2024.7.4)
+Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
+Requires-Dist: docling-core (>=2.3.0,<3.0.0)
+Requires-Dist: docling-ibm-models (>=2.0.3,<3.0.0)
+Requires-Dist: docling-parse (>=2.0.2,<3.0.0)
+Requires-Dist: easyocr (>=1.7,<2.0)
+Requires-Dist: filetype (>=1.2.0,<2.0.0)
+Requires-Dist: huggingface_hub (>=0.23,<1)
+Requires-Dist: marko (>=2.1.2,<3.0.0)
+Requires-Dist: pandas (>=2.1.4,<3.0.0)
+Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
+Requires-Dist: pydantic (>=2.0.0,<3.0.0)
+Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
+Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
+Requires-Dist: python-docx (>=1.1.2,<2.0.0)
+Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
+Requires-Dist: requests (>=2.32.3,<3.0.0)
+Requires-Dist: rtree (>=1.3.0,<2.0.0)
+Requires-Dist: scipy (>=1.14.1,<2.0.0)
+Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
+Requires-Dist: typer (>=0.12.5,<0.13.0)
+Project-URL: Repository, https://github.com/DS4SD/docling
+Description-Content-Type: text/markdown
+<p align="center">
+  <a href="https://github.com/ds4sd/docling">
+    <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/docling_processing.png" width="100%"/>
+  </a>
+</p>
+# Docling
+<p align="center">
+  <a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
+</p>
+[![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
+[![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/)
+[![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
+![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
+[![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
+[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+[![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
+[![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
+[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
+[![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
+Docling parses documents and exports them to the desired format with ease and speed.
+## Features
+* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc, Markdown) and exports to Markdown and JSON
+* 📑 Advanced PDF document understanding including page layout, reading order & table structures
+* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
+* 🤖 Easy integration with LlamaIndex 🦙 & LangChain 🦜🔗 for powerful RAG / QA applications
+* 🔍 OCR support for scanned PDFs
+* 💻 Simple and convenient CLI
+Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty examples and unlock the full power of Docling!
+### Coming soon
+* ♾️ Equation & code extraction
+* 📝 Metadata extraction, including title, authors, references & language
+* 🦜🔗 Native LangChain extension
+## Installation
+To use Docling, simply install `docling` from your package manager, e.g. pip:
+```bash
+pip install docling
+```
+Works on macOS, Linux and Windows environments. Both x86_64 and arm64 architectures.
+More [detailed installation instructions](https://ds4sd.github.io/docling/installation/) are available in the docs.
+## Getting started
+To convert individual documents, use `convert()`, for example:
+```python
+from docling.document_converter import DocumentConverter
+source = "https://arxiv.org/pdf/2408.09869"  # document per local path or URL
+converter = DocumentConverter()
+result = converter.convert(source)
+print(result.document.export_to_markdown())  # output: "## Docling Technical Report[...]"
+```
+Check out [Getting started](https://ds4sd.github.io/docling/).
+You will find lots of tuning options to leverage all the advanced capabilities.
+## Get help and support
+Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
+## Technical report
+For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
+## Contributing
+Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
+## References
+If you use Docling in your projects, please consider citing the following:
+```bib
+@techreport{Docling,
+  author = {Deep Search Team},
+  month = {8},
+  title = {Docling Technical Report},
+  url = {https://arxiv.org/abs/2408.09869},
+  eprint = {2408.09869},
+  doi = {10.48550/arXiv.2408.09869},
+  version = {1.0.0},
+  year = {2024}
+}
+```
+## License
+The Docling codebase is under MIT license.
+For individual model usage, please refer to the model licenses found in the original packages.
+## IBM ❤️ Open Source AI
+Docling has been brought to you by IBM.

docling-2.4.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,45 @@
+docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq_G2RdU,1678
+docling/backend/asciidoc_backend.py,sha256=kXZxOLk_LvLFVZwnJVVwjmvc3QWZ0iiG7VnwjgtC3hI,14051
+docling/backend/docling_parse_backend.py,sha256=csWy6ZGxDuZfNr0YTrUU40DXqelN_TJksWIYoXxZMjU,7633
+docling/backend/docling_parse_v2_backend.py,sha256=gUr9_fwHbkj238oYQPJ9AxpjFL2jGvhjBlBQPblmSAg,8589
+docling/backend/html_backend.py,sha256=p3WlYta1f3e4osmvVR12KIUYLJimveTX8UwEkyPt7_g,15161
+docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
+docling/backend/mspowerpoint_backend.py,sha256=J472AIH_IXvGg3D0FDmXhue1At_VSBD6n15c64Kxttw,15446
+docling/backend/msword_backend.py,sha256=FAUdP74QxGKo2xMZQ4WQGYwtpIBCTJ_FG17PBpRwhxI,17230
+docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
+docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
+docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/cli/main.py,sha256=IOeIpGoK_5AeE_6LYTU_nfZjqpZ5xeGaTCB8Vfsama0,9334
+docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/datamodel/base_models.py,sha256=fmkS6iTxGZCTtNCo2zsgMmBC11Ogf2Ht-mNIlZ9GP-o,5375
+docling/datamodel/document.py,sha256=9dQf_J18X_MEWs-Mg3Ed6BykFPJ79ETmkkxcssY-vYo,20698
+docling/datamodel/pipeline_options.py,sha256=PqQ4VjMDN16oWZSUYtskQEH366504OZmnjinCaOWmMc,2444
+docling/datamodel/settings.py,sha256=2-sYEnKLV_giGygUlBtiBd4CJYN5T9-3BdL6NpWkUYw,1155
+docling/document_converter.py,sha256=U52_rZQDm2wzrnsuUrvsfX2MnmOWFFhjBzfS8tEvt6Y,10595
+docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
+docling/models/base_ocr_model.py,sha256=Ti0glL-_DVRfmP3MpywYVmkNf5RP6qhRg_UKzJuV1Dc,5663
+docling/models/ds_glm_model.py,sha256=2OpWW8MMzCIshrtP36gDSRPYOCjv1ex34FqxD2nYjP4,11986
+docling/models/easyocr_model.py,sha256=23hWq484qVS3nkch6nRRWowfQamN-McFZgfbHfp5Vuo,3818
+docling/models/layout_model.py,sha256=ZvbTSyxvXB5yLHNEti0Wv3trz0vwGuHySI5TCdApb0U,14011
+docling/models/page_assemble_model.py,sha256=kSGNiRKhmzkpFH7xCiT3rulMsgJmUXFa6Th_eB-cLEk,7103
+docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
+docling/models/table_structure_model.py,sha256=-ANSQpiN2avt3B9sbi7dHcoULUJbMBalAR5xxlrM7To,8421
+docling/models/tesseract_ocr_cli_model.py,sha256=ZflwQcD7YjhPqEB8bbgNgP14OBD4NNEJefUS8Lbr5X0,6511
+docling/models/tesseract_ocr_model.py,sha256=X9qlzwaTZLtSGXFIZuD7MO6EzFmHl1D-FjktUBko6us,6234
+docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/pipeline/base_pipeline.py,sha256=IF1XWYgUGbdB4-teLkmM4Hvg_UNEfPrGuhExMRTUsk8,7168
+docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
+docling/pipeline/standard_pdf_pipeline.py,sha256=h59eA0CLMYuuJoH-0SyCRkYEregNs6i0pa46Ioqf8kU,7947
+docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
+docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
+docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
+docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
+docling-2.4.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.4.1.dist-info/METADATA,sha256=gomJT0uGaDrAANMI7fSJv2iUhmk0CcvlfiCP89VwCAo,6530
+docling-2.4.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling-2.4.1.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
+docling-2.4.1.dist-info/RECORD,,

docling/pipeline/base_model_pipeline.py DELETED Viewed

@@ -1,18 +0,0 @@
-from pathlib import Path
-from typing import Callable, Iterable, List
-from docling.datamodel.base_models import Page
-from docling.datamodel.pipeline_options import PipelineOptions
-class BaseModelPipeline:
-    def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
-        self.model_pipe: List[Callable] = []
-        self.artifacts_path = artifacts_path
-        self.pipeline_options = pipeline_options
-    def apply(self, page_batch: Iterable[Page]) -> Iterable[Page]:
-        for model in self.model_pipe:
-            page_batch = model(page_batch)
-        yield from page_batch

docling/pipeline/standard_model_pipeline.py DELETED Viewed

@@ -1,66 +0,0 @@
-from pathlib import Path
-from docling.datamodel.pipeline_options import (
-    EasyOcrOptions,
-    PipelineOptions,
-    TesseractCliOcrOptions,
-    TesseractOcrOptions,
-)
-from docling.models.base_ocr_model import BaseOcrModel
-from docling.models.easyocr_model import EasyOcrModel
-from docling.models.layout_model import LayoutModel
-from docling.models.table_structure_model import TableStructureModel
-from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
-from docling.models.tesseract_ocr_model import TesseractOcrModel
-from docling.pipeline.base_model_pipeline import BaseModelPipeline
-class StandardModelPipeline(BaseModelPipeline):
-    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
-    _table_model_path = "model_artifacts/tableformer"
-    def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
-        super().__init__(artifacts_path, pipeline_options)
-        ocr_model: BaseOcrModel
-        if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
-            ocr_model = EasyOcrModel(
-                enabled=pipeline_options.do_ocr,
-                options=pipeline_options.ocr_options,
-            )
-        elif isinstance(pipeline_options.ocr_options, TesseractCliOcrOptions):
-            ocr_model = TesseractOcrCliModel(
-                enabled=pipeline_options.do_ocr,
-                options=pipeline_options.ocr_options,
-            )
-        elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
-            ocr_model = TesseractOcrModel(
-                enabled=pipeline_options.do_ocr,
-                options=pipeline_options.ocr_options,
-            )
-        else:
-            raise RuntimeError(
-                f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
-            )
-        self.model_pipe = [
-            # OCR
-            ocr_model,
-            # Layout
-            LayoutModel(
-                config={
-                    "artifacts_path": artifacts_path
-                    / StandardModelPipeline._layout_model_path
-                }
-            ),
-            # Table structure
-            TableStructureModel(
-                config={
-                    "artifacts_path": artifacts_path
-                    / StandardModelPipeline._table_model_path,
-                    "enabled": pipeline_options.do_table_structure,
-                    "mode": pipeline_options.table_structure_options.mode,
-                    "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
-                }
-            ),
-        ]

docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl

docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl