PyPI - docling - Versions diffs - 1.20.0__py3-none-any.whl → 2.0.0__py3-none-any.whl - Mend

docling 1.20.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

docling/backend/abstract_backend.py +32 -37
docling/backend/docling_parse_backend.py +16 -12
docling/backend/docling_parse_v2_backend.py +15 -11
docling/backend/html_backend.py +425 -0
docling/backend/mspowerpoint_backend.py +375 -0
docling/backend/msword_backend.py +509 -0
docling/backend/pdf_backend.py +78 -0
docling/backend/pypdfium2_backend.py +15 -10
docling/cli/main.py +61 -60
docling/datamodel/base_models.py +73 -193
docling/datamodel/document.py +364 -318
docling/datamodel/pipeline_options.py +13 -0
docling/datamodel/settings.py +1 -0
docling/document_converter.py +215 -252
docling/models/base_model.py +25 -0
docling/models/base_ocr_model.py +10 -5
docling/models/ds_glm_model.py +209 -20
docling/models/easyocr_model.py +4 -1
docling/models/layout_model.py +73 -61
docling/models/page_assemble_model.py +21 -5
docling/models/page_preprocessing_model.py +57 -0
docling/models/table_structure_model.py +34 -32
docling/models/tesseract_ocr_cli_model.py +8 -5
docling/models/tesseract_ocr_model.py +8 -5
docling/pipeline/base_pipeline.py +190 -0
docling/pipeline/simple_pipeline.py +59 -0
docling/pipeline/standard_pdf_pipeline.py +198 -0
docling/utils/export.py +4 -3
docling/utils/layout_utils.py +17 -11
docling-2.0.0.dist-info/METADATA +149 -0
docling-2.0.0.dist-info/RECORD +42 -0
docling/pipeline/base_model_pipeline.py +0 -18
docling/pipeline/standard_model_pipeline.py +0 -66
docling-1.20.0.dist-info/METADATA +0 -380
docling-1.20.0.dist-info/RECORD +0 -35
{docling-1.20.0.dist-info → docling-2.0.0.dist-info}/LICENSE +0 -0
{docling-1.20.0.dist-info → docling-2.0.0.dist-info}/WHEEL +0 -0
{docling-1.20.0.dist-info → docling-2.0.0.dist-info}/entry_points.txt +0 -0

docling/utils/layout_utils.py CHANGED Viewed

@@ -2,6 +2,7 @@ import copy
 import logging
 import networkx as nx
+from docling_core.types.doc import DocItemLabel
 logger = logging.getLogger("layout_utils")
@@ -370,7 +371,7 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
             "Treating cluster " + str(ix) + ", type " + str(new_cluster["type"])
         )
         logger.debug("  with cells: " + str(new_cluster["cell_ids"]))
-        if len(cluster["cell_ids"]) == 0 and cluster["type"] != "Picture":
+        if len(cluster["cell_ids"]) == 0 and cluster["type"] != DocItemLabel.PICTURE:
             logger.debug("  Empty non-picture, removed")
             continue  ## Skip this former cluster, now without cells.
         new_bbox = adapt_bbox(raw_cells, new_cluster, orphan_cell_indices)
@@ -380,14 +381,14 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
 def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
-    if not (cluster["type"] in ["Table", "Picture"]):
+    if not (cluster["type"] in [DocItemLabel.TABLE, DocItemLabel.PICTURE]):
         ## A text-like cluster. The bbox only needs to be around the text cells:
         logger.debug("    Initial bbox: " + str(cluster["bbox"]))
         new_bbox = surrounding_list(
             [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
         )
         logger.debug("  New bounding box:" + str(new_bbox))
-    if cluster["type"] == "Picture":
+    if cluster["type"] == DocItemLabel.PICTURE:
         ## We only make the bbox completely comprise included text cells:
         logger.debug("  Picture")
         if len(cluster["cell_ids"]) != 0:
@@ -587,7 +588,7 @@ def set_orphan_as_text(
     max_id = -1
     figures = []
     for cluster in cluster_predictions:
-        if cluster["type"] == "Picture":
+        if cluster["type"] == DocItemLabel.PICTURE:
             figures.append(cluster)
         if cluster["id"] > max_id:
@@ -638,13 +639,13 @@ def set_orphan_as_text(
             # if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans:
             if fig_flag == False and lines_detector == False:
                 # get class from low confidence detections if not set as text:
-                class_type = "Text"
+                class_type = DocItemLabel.TEXT
                 for cluster in cluster_predictions_low:
                     intersection = compute_intersection(
                         orph_cell["bbox"], cluster["bbox"]
                     )
-                    class_type = "Text"
+                    class_type = DocItemLabel.TEXT
                     if (
                         cluster["confidence"] > 0.1
                         and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4
@@ -718,7 +719,9 @@ def merge_cells(cluster_predictions):
                     if cluster["id"] == node:
                         lines.append(cluster)
                         cluster_predictions.remove(cluster)
-            new_merged_cluster = build_cluster_from_lines(lines, "Text", max_id)
+            new_merged_cluster = build_cluster_from_lines(
+                lines, DocItemLabel.TEXT, max_id
+            )
             cluster_predictions.append(new_merged_cluster)
     return cluster_predictions
@@ -753,9 +756,9 @@ def clean_up_clusters(
                 # remove clusters that might appear inside tables, or images (such as pdf cells in graphs)
                 elif img_table == True:
                     if (
-                        cluster_1["type"] == "Text"
-                        and cluster_2["type"] == "Picture"
-                        or cluster_2["type"] == "Table"
+                        cluster_1["type"] == DocItemLabel.TEXT
+                        and cluster_2["type"] == DocItemLabel.PICTURE
+                        or cluster_2["type"] == DocItemLabel.TABLE
                     ):
                         if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > 0.5:
                             DuplicateDeletedClusterIDs.append(cluster_1["id"])
@@ -771,7 +774,10 @@ def clean_up_clusters(
                             DuplicateDeletedClusterIDs.append(cluster_1["id"])
             # remove tables that have one pdf cell
             if one_cell_table == True:
-                if cluster_1["type"] == "Table" and len(cluster_1["cell_ids"]) < 2:
+                if (
+                    cluster_1["type"] == DocItemLabel.TABLE
+                    and len(cluster_1["cell_ids"]) < 2
+                ):
                     DuplicateDeletedClusterIDs.append(cluster_1["id"])
     DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs))

docling-2.0.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,149 @@
+Metadata-Version: 2.1
+Name: docling
+Version: 2.0.0
+Summary: Docling PDF conversion package
+Home-page: https://github.com/DS4SD/docling
+License: MIT
+Keywords: docling,convert,document,pdf,layout model,segmentation,table structure,table former
+Author: Christoph Auer
+Author-email: cau@zurich.ibm.com
+Requires-Python: >=3.10,<4.0
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: MacOS :: MacOS X
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Provides-Extra: tesserocr
+Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
+Requires-Dist: certifi (>=2024.7.4)
+Requires-Dist: deepsearch-glm (>=0.25.0,<0.26.0)
+Requires-Dist: docling-core (>=2.0.0,<3.0.0)
+Requires-Dist: docling-ibm-models (>=2.0.1,<3.0.0)
+Requires-Dist: docling-parse (>=1.6.0,<2.0.0)
+Requires-Dist: easyocr (>=1.7,<2.0)
+Requires-Dist: filetype (>=1.2.0,<2.0.0)
+Requires-Dist: huggingface_hub (>=0.23,<1)
+Requires-Dist: pandas (>=2.1.4,<3.0.0)
+Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
+Requires-Dist: pydantic (>=2.0.0,<3.0.0)
+Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
+Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
+Requires-Dist: python-docx (>=1.1.2,<2.0.0)
+Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
+Requires-Dist: requests (>=2.32.3,<3.0.0)
+Requires-Dist: rtree (>=1.3.0,<2.0.0)
+Requires-Dist: scipy (>=1.14.1,<2.0.0)
+Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
+Requires-Dist: torch (>=2.2.2,<2.3.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
+Requires-Dist: torch (>=2.2.2,<3.0.0) ; sys_platform != "darwin" or platform_machine != "x86_64"
+Requires-Dist: torchvision (>=0,<1) ; sys_platform != "darwin" or platform_machine != "x86_64"
+Requires-Dist: torchvision (>=0.17.2,<0.18.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
+Requires-Dist: typer (>=0.12.5,<0.13.0)
+Project-URL: Repository, https://github.com/DS4SD/docling
+Description-Content-Type: text/markdown
+<p align="center">
+  <a href="https://github.com/ds4sd/docling">
+    <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/logo.png" width="150" />
+  </a>
+</p>
+# Docling
+[![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
+[![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/)
+[![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
+![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
+[![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
+[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+[![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
+[![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
+[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
+[![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
+Docling parses documents and exports them to the desired format with ease and speed.
+## Features
+* 🗂️ Multi-format support for input (PDF, DOCX etc.) & output (Markdown, JSON etc.)
+* 📑 Advanced PDF document understanding incl. page layout, reading order & table structures
+* 📝 Metadata extraction, including title, authors, references & language
+* 🤖 Seamless LlamaIndex 🦙 & LangChain 🦜🔗 integration for powerful RAG / QA applications
+* 🔍 OCR support for scanned PDFs
+* 💻 Simple and convenient CLI
+Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty of examples and unlock the full power of Docling!
+## Installation
+To use Docling, simply install `docling` from your package manager, e.g. pip:
+```bash
+pip install docling
+```
+Works on macOS, Linux and Windows environments, on both x86_64 and arm64 architectures.
+More [detailed installation instructions](https://ds4sd.github.io/docling/installation/) are available in the docs.
+## Getting started
+To convert individual documents, use `convert()`, for example:
+```python
+from docling.document_converter import DocumentConverter
+source = "https://arxiv.org/pdf/2408.09869"  # PDF path or URL
+converter = DocumentConverter()
+result = converter.convert(source)
+print(result.document.export_to_markdown())  # output: "## Docling Technical Report[...]"
+print(result.document.export_to_document_tokens())  # output: "<document><title><page_1><loc_20>..."
+```
+Check out [Getting started](https://ds4sd.github.io/docling/).
+You will find lots of tuning options to leverage all the advanced capabilities.
+## Get help and support
+Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
+## Technical report
+For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
+## Contributing
+Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
+## References
+If you use Docling in your projects, please consider citing the following:
+```bib
+@techreport{Docling,
+  author = {Deep Search Team},
+  month = {8},
+  title = {Docling Technical Report},
+  url = {https://arxiv.org/abs/2408.09869},
+  eprint = {2408.09869},
+  doi = {10.48550/arXiv.2408.09869},
+  version = {1.0.0},
+  year = {2024}
+}
+```
+## License
+The Docling codebase is under MIT license.
+For individual model usage, please refer to the model licenses found in the original packages.

docling-2.0.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,42 @@
+docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/backend/abstract_backend.py,sha256=8Lh1gf1P9AnzlwB989OVBgLmokTpfI0LxYRfuvYTqoo,1646
+docling/backend/docling_parse_backend.py,sha256=UgBpopZIP5YkhwhybiqDnqVsSqv9DAAPFkafhfL0pPo,7623
+docling/backend/docling_parse_v2_backend.py,sha256=VY7MsiyqjN3Vl0UkyezriiVJMLbLRrQVuKjWaTgIUwY,8336
+docling/backend/html_backend.py,sha256=MlhEXaA0tgX_tLuQLnkex43gsKqpqHWnbkssxY4n_kc,14753
+docling/backend/mspowerpoint_backend.py,sha256=2UYfMMeWwgDtvIKQELCA-bYv5Z-rGvbMiBNcidNL_uE,14332
+docling/backend/msword_backend.py,sha256=4SDqZAZxLr6VV50OU3MRBAV8SwZMCyJCUbNVMVUpitc,17659
+docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
+docling/backend/pypdfium2_backend.py,sha256=MJX6fQqwK3r967fyAAs-RA_YIkeQvhgsLkQAgaBTgaE,8995
+docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/cli/main.py,sha256=NRVGz0z-3EBwYNMJGVnLtDBcfOeutaUyYdkM0ymRnGA,8008
+docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/datamodel/base_models.py,sha256=Ha-DoRZoksjHSZHWqUSiQ79MTBEfY5ur8U_LVtyBRYU,5153
+docling/datamodel/document.py,sha256=GCARkUuv8TNtFO934E7KujOsTkBFqLXX5bogNprVXEM,19411
+docling/datamodel/pipeline_options.py,sha256=mez7CiJMtm-xhOmZ-2-M_Q3YwC6EzHytWfg0E3tiVio,2329
+docling/datamodel/settings.py,sha256=KBFVeQviR1hoCFjA1ZwuLuQ6EAAYR7saIa6EUYiOkHI,767
+docling/document_converter.py,sha256=S_t9hs2uZfXC38LC0hTaAihrSJIrCvnTiuY5SvUccgk,9587
+docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/models/base_model.py,sha256=wSBGAIAbLqrqP_SMtkzXMuyFvvzjVU6iCqgSNnGIR4Y,603
+docling/models/base_ocr_model.py,sha256=N5pOQ4RQSWPU-bPZ81FySDdBnwNG64-6K0ldK6ENU0U,4672
+docling/models/ds_glm_model.py,sha256=nUBHTsE-eRtrtPE6v_N4iZGr43bXIsOfb_8NFUMWJQk,11057
+docling/models/easyocr_model.py,sha256=URhHzxwnBuErf6sskWyEWauX-Kne0upnrAguzKQi3SI,3090
+docling/models/layout_model.py,sha256=B4Veff9V0WxcQXTBYzJM6rE7B3lszUI7zmg7EFE0WxU,12245
+docling/models/page_assemble_model.py,sha256=ovwSki52w1rlrc7MgMbjh1Uc5H8XBCz9S2nHE44mzYU,6030
+docling/models/page_preprocessing_model.py,sha256=PJ_jASz3w0Lus_Ep4NN5Vq_Redq7x8vAyVR8qXCb6Eg,1817
+docling/models/table_structure_model.py,sha256=qcjXXiNZcMWjr6ys02sToKZlAr8S0rAJNICbBjK9Ijo,6426
+docling/models/tesseract_ocr_cli_model.py,sha256=l-gRDU273opgack9fAxHaXPEdX5IdD5ZTnu6VsfKIWc,5665
+docling/models/tesseract_ocr_model.py,sha256=tEEq-URSYnyQru7RoD5fx-s1trwMxPCcwJx94M4iuxc,4676
+docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/pipeline/base_pipeline.py,sha256=7DTzVvM_jVHCxyY-BuuGRhmUsD_sgX4DD00oBFJWdB8,6723
+docling/pipeline/simple_pipeline.py,sha256=pxce0-3He5Lqa-xXT-7h173XVOSMZiMHl6HOfAJmQ7o,2162
+docling/pipeline/standard_pdf_pipeline.py,sha256=_gRGR9tsy55_tptFj-AiEJEedxhJ0iIjHb5qaj36d28,7506
+docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
+docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
+docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
+docling-2.0.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.0.0.dist-info/METADATA,sha256=RyawmIT2dz9la0DH8KsW749TNq4BpiSIndVEz83wauQ,6235
+docling-2.0.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling-2.0.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
+docling-2.0.0.dist-info/RECORD,,

docling/pipeline/base_model_pipeline.py DELETED Viewed

@@ -1,18 +0,0 @@
-from pathlib import Path
-from typing import Callable, Iterable, List
-from docling.datamodel.base_models import Page
-from docling.datamodel.pipeline_options import PipelineOptions
-class BaseModelPipeline:
-    def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
-        self.model_pipe: List[Callable] = []
-        self.artifacts_path = artifacts_path
-        self.pipeline_options = pipeline_options
-    def apply(self, page_batch: Iterable[Page]) -> Iterable[Page]:
-        for model in self.model_pipe:
-            page_batch = model(page_batch)
-        yield from page_batch

docling/pipeline/standard_model_pipeline.py DELETED Viewed

@@ -1,66 +0,0 @@
-from pathlib import Path
-from docling.datamodel.pipeline_options import (
-    EasyOcrOptions,
-    PipelineOptions,
-    TesseractCliOcrOptions,
-    TesseractOcrOptions,
-)
-from docling.models.base_ocr_model import BaseOcrModel
-from docling.models.easyocr_model import EasyOcrModel
-from docling.models.layout_model import LayoutModel
-from docling.models.table_structure_model import TableStructureModel
-from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
-from docling.models.tesseract_ocr_model import TesseractOcrModel
-from docling.pipeline.base_model_pipeline import BaseModelPipeline
-class StandardModelPipeline(BaseModelPipeline):
-    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
-    _table_model_path = "model_artifacts/tableformer"
-    def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
-        super().__init__(artifacts_path, pipeline_options)
-        ocr_model: BaseOcrModel
-        if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
-            ocr_model = EasyOcrModel(
-                enabled=pipeline_options.do_ocr,
-                options=pipeline_options.ocr_options,
-            )
-        elif isinstance(pipeline_options.ocr_options, TesseractCliOcrOptions):
-            ocr_model = TesseractOcrCliModel(
-                enabled=pipeline_options.do_ocr,
-                options=pipeline_options.ocr_options,
-            )
-        elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
-            ocr_model = TesseractOcrModel(
-                enabled=pipeline_options.do_ocr,
-                options=pipeline_options.ocr_options,
-            )
-        else:
-            raise RuntimeError(
-                f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
-            )
-        self.model_pipe = [
-            # OCR
-            ocr_model,
-            # Layout
-            LayoutModel(
-                config={
-                    "artifacts_path": artifacts_path
-                    / StandardModelPipeline._layout_model_path
-                }
-            ),
-            # Table structure
-            TableStructureModel(
-                config={
-                    "artifacts_path": artifacts_path
-                    / StandardModelPipeline._table_model_path,
-                    "enabled": pipeline_options.do_table_structure,
-                    "mode": pipeline_options.table_structure_options.mode,
-                    "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
-                }
-            ),
-        ]

docling 1.20.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

docling 1.20.0__py3-none-any.whl → 2.0.0__py3-none-any.whl