docling 1.19.1__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +32 -37
- docling/backend/docling_parse_backend.py +16 -12
- docling/backend/docling_parse_v2_backend.py +240 -0
- docling/backend/html_backend.py +425 -0
- docling/backend/mspowerpoint_backend.py +375 -0
- docling/backend/msword_backend.py +509 -0
- docling/backend/pdf_backend.py +78 -0
- docling/backend/pypdfium2_backend.py +15 -10
- docling/cli/main.py +61 -60
- docling/datamodel/base_models.py +73 -193
- docling/datamodel/document.py +364 -318
- docling/datamodel/pipeline_options.py +13 -0
- docling/datamodel/settings.py +1 -0
- docling/document_converter.py +215 -252
- docling/models/base_model.py +25 -0
- docling/models/base_ocr_model.py +10 -5
- docling/models/ds_glm_model.py +209 -20
- docling/models/easyocr_model.py +4 -1
- docling/models/layout_model.py +73 -61
- docling/models/page_assemble_model.py +21 -5
- docling/models/page_preprocessing_model.py +57 -0
- docling/models/table_structure_model.py +34 -32
- docling/models/tesseract_ocr_cli_model.py +8 -5
- docling/models/tesseract_ocr_model.py +8 -5
- docling/pipeline/base_pipeline.py +190 -0
- docling/pipeline/simple_pipeline.py +59 -0
- docling/pipeline/standard_pdf_pipeline.py +198 -0
- docling/utils/export.py +4 -3
- docling/utils/layout_utils.py +17 -11
- docling-2.0.0.dist-info/METADATA +149 -0
- docling-2.0.0.dist-info/RECORD +42 -0
- docling/pipeline/base_model_pipeline.py +0 -18
- docling/pipeline/standard_model_pipeline.py +0 -66
- docling-1.19.1.dist-info/METADATA +0 -380
- docling-1.19.1.dist-info/RECORD +0 -34
- {docling-1.19.1.dist-info → docling-2.0.0.dist-info}/LICENSE +0 -0
- {docling-1.19.1.dist-info → docling-2.0.0.dist-info}/WHEEL +0 -0
- {docling-1.19.1.dist-info → docling-2.0.0.dist-info}/entry_points.txt +0 -0
docling/utils/layout_utils.py
CHANGED
@@ -2,6 +2,7 @@ import copy
|
|
2
2
|
import logging
|
3
3
|
|
4
4
|
import networkx as nx
|
5
|
+
from docling_core.types.doc import DocItemLabel
|
5
6
|
|
6
7
|
logger = logging.getLogger("layout_utils")
|
7
8
|
|
@@ -370,7 +371,7 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
|
|
370
371
|
"Treating cluster " + str(ix) + ", type " + str(new_cluster["type"])
|
371
372
|
)
|
372
373
|
logger.debug(" with cells: " + str(new_cluster["cell_ids"]))
|
373
|
-
if len(cluster["cell_ids"]) == 0 and cluster["type"] !=
|
374
|
+
if len(cluster["cell_ids"]) == 0 and cluster["type"] != DocItemLabel.PICTURE:
|
374
375
|
logger.debug(" Empty non-picture, removed")
|
375
376
|
continue ## Skip this former cluster, now without cells.
|
376
377
|
new_bbox = adapt_bbox(raw_cells, new_cluster, orphan_cell_indices)
|
@@ -380,14 +381,14 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
|
|
380
381
|
|
381
382
|
|
382
383
|
def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
|
383
|
-
if not (cluster["type"] in [
|
384
|
+
if not (cluster["type"] in [DocItemLabel.TABLE, DocItemLabel.PICTURE]):
|
384
385
|
## A text-like cluster. The bbox only needs to be around the text cells:
|
385
386
|
logger.debug(" Initial bbox: " + str(cluster["bbox"]))
|
386
387
|
new_bbox = surrounding_list(
|
387
388
|
[raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
|
388
389
|
)
|
389
390
|
logger.debug(" New bounding box:" + str(new_bbox))
|
390
|
-
if cluster["type"] ==
|
391
|
+
if cluster["type"] == DocItemLabel.PICTURE:
|
391
392
|
## We only make the bbox completely comprise included text cells:
|
392
393
|
logger.debug(" Picture")
|
393
394
|
if len(cluster["cell_ids"]) != 0:
|
@@ -587,7 +588,7 @@ def set_orphan_as_text(
|
|
587
588
|
max_id = -1
|
588
589
|
figures = []
|
589
590
|
for cluster in cluster_predictions:
|
590
|
-
if cluster["type"] ==
|
591
|
+
if cluster["type"] == DocItemLabel.PICTURE:
|
591
592
|
figures.append(cluster)
|
592
593
|
|
593
594
|
if cluster["id"] > max_id:
|
@@ -638,13 +639,13 @@ def set_orphan_as_text(
|
|
638
639
|
# if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans:
|
639
640
|
if fig_flag == False and lines_detector == False:
|
640
641
|
# get class from low confidence detections if not set as text:
|
641
|
-
class_type =
|
642
|
+
class_type = DocItemLabel.TEXT
|
642
643
|
|
643
644
|
for cluster in cluster_predictions_low:
|
644
645
|
intersection = compute_intersection(
|
645
646
|
orph_cell["bbox"], cluster["bbox"]
|
646
647
|
)
|
647
|
-
class_type =
|
648
|
+
class_type = DocItemLabel.TEXT
|
648
649
|
if (
|
649
650
|
cluster["confidence"] > 0.1
|
650
651
|
and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4
|
@@ -718,7 +719,9 @@ def merge_cells(cluster_predictions):
|
|
718
719
|
if cluster["id"] == node:
|
719
720
|
lines.append(cluster)
|
720
721
|
cluster_predictions.remove(cluster)
|
721
|
-
new_merged_cluster = build_cluster_from_lines(
|
722
|
+
new_merged_cluster = build_cluster_from_lines(
|
723
|
+
lines, DocItemLabel.TEXT, max_id
|
724
|
+
)
|
722
725
|
cluster_predictions.append(new_merged_cluster)
|
723
726
|
return cluster_predictions
|
724
727
|
|
@@ -753,9 +756,9 @@ def clean_up_clusters(
|
|
753
756
|
# remove clusters that might appear inside tables, or images (such as pdf cells in graphs)
|
754
757
|
elif img_table == True:
|
755
758
|
if (
|
756
|
-
cluster_1["type"] ==
|
757
|
-
and cluster_2["type"] ==
|
758
|
-
or cluster_2["type"] ==
|
759
|
+
cluster_1["type"] == DocItemLabel.TEXT
|
760
|
+
and cluster_2["type"] == DocItemLabel.PICTURE
|
761
|
+
or cluster_2["type"] == DocItemLabel.TABLE
|
759
762
|
):
|
760
763
|
if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > 0.5:
|
761
764
|
DuplicateDeletedClusterIDs.append(cluster_1["id"])
|
@@ -771,7 +774,10 @@ def clean_up_clusters(
|
|
771
774
|
DuplicateDeletedClusterIDs.append(cluster_1["id"])
|
772
775
|
# remove tables that have one pdf cell
|
773
776
|
if one_cell_table == True:
|
774
|
-
if
|
777
|
+
if (
|
778
|
+
cluster_1["type"] == DocItemLabel.TABLE
|
779
|
+
and len(cluster_1["cell_ids"]) < 2
|
780
|
+
):
|
775
781
|
DuplicateDeletedClusterIDs.append(cluster_1["id"])
|
776
782
|
|
777
783
|
DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs))
|
@@ -0,0 +1,149 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: docling
|
3
|
+
Version: 2.0.0
|
4
|
+
Summary: Docling PDF conversion package
|
5
|
+
Home-page: https://github.com/DS4SD/docling
|
6
|
+
License: MIT
|
7
|
+
Keywords: docling,convert,document,pdf,layout model,segmentation,table structure,table former
|
8
|
+
Author: Christoph Auer
|
9
|
+
Author-email: cau@zurich.ibm.com
|
10
|
+
Requires-Python: >=3.10,<4.0
|
11
|
+
Classifier: Development Status :: 5 - Production/Stable
|
12
|
+
Classifier: Intended Audience :: Developers
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
15
|
+
Classifier: Operating System :: MacOS :: MacOS X
|
16
|
+
Classifier: Operating System :: POSIX :: Linux
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
22
|
+
Provides-Extra: tesserocr
|
23
|
+
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
24
|
+
Requires-Dist: certifi (>=2024.7.4)
|
25
|
+
Requires-Dist: deepsearch-glm (>=0.25.0,<0.26.0)
|
26
|
+
Requires-Dist: docling-core (>=2.0.0,<3.0.0)
|
27
|
+
Requires-Dist: docling-ibm-models (>=2.0.1,<3.0.0)
|
28
|
+
Requires-Dist: docling-parse (>=1.6.0,<2.0.0)
|
29
|
+
Requires-Dist: easyocr (>=1.7,<2.0)
|
30
|
+
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
31
|
+
Requires-Dist: huggingface_hub (>=0.23,<1)
|
32
|
+
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
33
|
+
Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
|
34
|
+
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
35
|
+
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
36
|
+
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
37
|
+
Requires-Dist: python-docx (>=1.1.2,<2.0.0)
|
38
|
+
Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
|
39
|
+
Requires-Dist: requests (>=2.32.3,<3.0.0)
|
40
|
+
Requires-Dist: rtree (>=1.3.0,<2.0.0)
|
41
|
+
Requires-Dist: scipy (>=1.14.1,<2.0.0)
|
42
|
+
Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
|
43
|
+
Requires-Dist: torch (>=2.2.2,<2.3.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
|
44
|
+
Requires-Dist: torch (>=2.2.2,<3.0.0) ; sys_platform != "darwin" or platform_machine != "x86_64"
|
45
|
+
Requires-Dist: torchvision (>=0,<1) ; sys_platform != "darwin" or platform_machine != "x86_64"
|
46
|
+
Requires-Dist: torchvision (>=0.17.2,<0.18.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
|
47
|
+
Requires-Dist: typer (>=0.12.5,<0.13.0)
|
48
|
+
Project-URL: Repository, https://github.com/DS4SD/docling
|
49
|
+
Description-Content-Type: text/markdown
|
50
|
+
|
51
|
+
<p align="center">
|
52
|
+
<a href="https://github.com/ds4sd/docling">
|
53
|
+
<img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/logo.png" width="150" />
|
54
|
+
</a>
|
55
|
+
</p>
|
56
|
+
|
57
|
+
# Docling
|
58
|
+
|
59
|
+
[arXiv](https://arxiv.org/abs/2408.09869)
|
60
|
+
[Docs](https://ds4sd.github.io/docling/)
|
61
|
+
[PyPI version](https://pypi.org/project/docling/)
|
62
|
+

|
63
|
+
[Poetry](https://python-poetry.org/)
|
64
|
+
[Code style: black](https://github.com/psf/black)
|
65
|
+
[Imports: isort](https://pycqa.github.io/isort/)
|
66
|
+
[Pydantic v2](https://pydantic.dev)
|
67
|
+
[pre-commit](https://github.com/pre-commit/pre-commit)
|
68
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
69
|
+
|
70
|
+
Docling parses documents and exports them to the desired format with ease and speed.
|
71
|
+
|
72
|
+
## Features
|
73
|
+
|
74
|
+
* 🗂️ Multi-format support for input (PDF, DOCX etc.) & output (Markdown, JSON etc.)
|
75
|
+
* 📑 Advanced PDF document understanding incl. page layout, reading order & table structures
|
76
|
+
* 📝 Metadata extraction, including title, authors, references & language
|
77
|
+
* 🤖 Seamless LlamaIndex 🦙 & LangChain 🦜🔗 integration for powerful RAG / QA applications
|
78
|
+
* 🔍 OCR support for scanned PDFs
|
79
|
+
* 💻 Simple and convenient CLI
|
80
|
+
|
81
|
+
Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty of examples and unlock the full power of Docling!
|
82
|
+
|
83
|
+
|
84
|
+
## Installation
|
85
|
+
|
86
|
+
To use Docling, simply install `docling` from your package manager, e.g. pip:
|
87
|
+
```bash
|
88
|
+
pip install docling
|
89
|
+
```
|
90
|
+
|
91
|
+
Works on macOS, Linux and Windows environments, on both x86_64 and arm64 architectures.
|
92
|
+
|
93
|
+
More [detailed installation instructions](https://ds4sd.github.io/docling/installation/) are available in the docs.
|
94
|
+
|
95
|
+
## Getting started
|
96
|
+
|
97
|
+
To convert individual documents, use `convert()`, for example:
|
98
|
+
|
99
|
+
```python
|
100
|
+
from docling.document_converter import DocumentConverter
|
101
|
+
|
102
|
+
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
103
|
+
converter = DocumentConverter()
|
104
|
+
result = converter.convert(source)
|
105
|
+
print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
|
106
|
+
print(result.document.export_to_document_tokens()) # output: "<document><title><page_1><loc_20>..."
|
107
|
+
```
|
108
|
+
|
109
|
+
|
110
|
+
Check out [Getting started](https://ds4sd.github.io/docling/).
|
111
|
+
You will find lots of tuning options to leverage all the advanced capabilities.
|
112
|
+
|
113
|
+
|
114
|
+
## Get help and support
|
115
|
+
|
116
|
+
Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
|
117
|
+
|
118
|
+
|
119
|
+
## Technical report
|
120
|
+
|
121
|
+
For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
|
122
|
+
|
123
|
+
## Contributing
|
124
|
+
|
125
|
+
Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
|
126
|
+
|
127
|
+
|
128
|
+
## References
|
129
|
+
|
130
|
+
If you use Docling in your projects, please consider citing the following:
|
131
|
+
|
132
|
+
```bib
|
133
|
+
@techreport{Docling,
|
134
|
+
author = {Deep Search Team},
|
135
|
+
month = {8},
|
136
|
+
title = {Docling Technical Report},
|
137
|
+
url = {https://arxiv.org/abs/2408.09869},
|
138
|
+
eprint = {2408.09869},
|
139
|
+
doi = {10.48550/arXiv.2408.09869},
|
140
|
+
version = {1.0.0},
|
141
|
+
year = {2024}
|
142
|
+
}
|
143
|
+
```
|
144
|
+
|
145
|
+
## License
|
146
|
+
|
147
|
+
The Docling codebase is under MIT license.
|
148
|
+
For individual model usage, please refer to the model licenses found in the original packages.
|
149
|
+
|
@@ -0,0 +1,42 @@
|
|
1
|
+
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
+
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
+
docling/backend/abstract_backend.py,sha256=8Lh1gf1P9AnzlwB989OVBgLmokTpfI0LxYRfuvYTqoo,1646
|
4
|
+
docling/backend/docling_parse_backend.py,sha256=UgBpopZIP5YkhwhybiqDnqVsSqv9DAAPFkafhfL0pPo,7623
|
5
|
+
docling/backend/docling_parse_v2_backend.py,sha256=VY7MsiyqjN3Vl0UkyezriiVJMLbLRrQVuKjWaTgIUwY,8336
|
6
|
+
docling/backend/html_backend.py,sha256=MlhEXaA0tgX_tLuQLnkex43gsKqpqHWnbkssxY4n_kc,14753
|
7
|
+
docling/backend/mspowerpoint_backend.py,sha256=2UYfMMeWwgDtvIKQELCA-bYv5Z-rGvbMiBNcidNL_uE,14332
|
8
|
+
docling/backend/msword_backend.py,sha256=4SDqZAZxLr6VV50OU3MRBAV8SwZMCyJCUbNVMVUpitc,17659
|
9
|
+
docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
|
10
|
+
docling/backend/pypdfium2_backend.py,sha256=MJX6fQqwK3r967fyAAs-RA_YIkeQvhgsLkQAgaBTgaE,8995
|
11
|
+
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
|
+
docling/cli/main.py,sha256=NRVGz0z-3EBwYNMJGVnLtDBcfOeutaUyYdkM0ymRnGA,8008
|
13
|
+
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
+
docling/datamodel/base_models.py,sha256=Ha-DoRZoksjHSZHWqUSiQ79MTBEfY5ur8U_LVtyBRYU,5153
|
15
|
+
docling/datamodel/document.py,sha256=GCARkUuv8TNtFO934E7KujOsTkBFqLXX5bogNprVXEM,19411
|
16
|
+
docling/datamodel/pipeline_options.py,sha256=mez7CiJMtm-xhOmZ-2-M_Q3YwC6EzHytWfg0E3tiVio,2329
|
17
|
+
docling/datamodel/settings.py,sha256=KBFVeQviR1hoCFjA1ZwuLuQ6EAAYR7saIa6EUYiOkHI,767
|
18
|
+
docling/document_converter.py,sha256=S_t9hs2uZfXC38LC0hTaAihrSJIrCvnTiuY5SvUccgk,9587
|
19
|
+
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
|
+
docling/models/base_model.py,sha256=wSBGAIAbLqrqP_SMtkzXMuyFvvzjVU6iCqgSNnGIR4Y,603
|
21
|
+
docling/models/base_ocr_model.py,sha256=N5pOQ4RQSWPU-bPZ81FySDdBnwNG64-6K0ldK6ENU0U,4672
|
22
|
+
docling/models/ds_glm_model.py,sha256=nUBHTsE-eRtrtPE6v_N4iZGr43bXIsOfb_8NFUMWJQk,11057
|
23
|
+
docling/models/easyocr_model.py,sha256=URhHzxwnBuErf6sskWyEWauX-Kne0upnrAguzKQi3SI,3090
|
24
|
+
docling/models/layout_model.py,sha256=B4Veff9V0WxcQXTBYzJM6rE7B3lszUI7zmg7EFE0WxU,12245
|
25
|
+
docling/models/page_assemble_model.py,sha256=ovwSki52w1rlrc7MgMbjh1Uc5H8XBCz9S2nHE44mzYU,6030
|
26
|
+
docling/models/page_preprocessing_model.py,sha256=PJ_jASz3w0Lus_Ep4NN5Vq_Redq7x8vAyVR8qXCb6Eg,1817
|
27
|
+
docling/models/table_structure_model.py,sha256=qcjXXiNZcMWjr6ys02sToKZlAr8S0rAJNICbBjK9Ijo,6426
|
28
|
+
docling/models/tesseract_ocr_cli_model.py,sha256=l-gRDU273opgack9fAxHaXPEdX5IdD5ZTnu6VsfKIWc,5665
|
29
|
+
docling/models/tesseract_ocr_model.py,sha256=tEEq-URSYnyQru7RoD5fx-s1trwMxPCcwJx94M4iuxc,4676
|
30
|
+
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
31
|
+
docling/pipeline/base_pipeline.py,sha256=7DTzVvM_jVHCxyY-BuuGRhmUsD_sgX4DD00oBFJWdB8,6723
|
32
|
+
docling/pipeline/simple_pipeline.py,sha256=pxce0-3He5Lqa-xXT-7h173XVOSMZiMHl6HOfAJmQ7o,2162
|
33
|
+
docling/pipeline/standard_pdf_pipeline.py,sha256=_gRGR9tsy55_tptFj-AiEJEedxhJ0iIjHb5qaj36d28,7506
|
34
|
+
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
35
|
+
docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
|
36
|
+
docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
|
37
|
+
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
38
|
+
docling-2.0.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
39
|
+
docling-2.0.0.dist-info/METADATA,sha256=RyawmIT2dz9la0DH8KsW749TNq4BpiSIndVEz83wauQ,6235
|
40
|
+
docling-2.0.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
41
|
+
docling-2.0.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
42
|
+
docling-2.0.0.dist-info/RECORD,,
|
@@ -1,18 +0,0 @@
|
|
1
|
-
from pathlib import Path
|
2
|
-
from typing import Callable, Iterable, List
|
3
|
-
|
4
|
-
from docling.datamodel.base_models import Page
|
5
|
-
from docling.datamodel.pipeline_options import PipelineOptions
|
6
|
-
|
7
|
-
|
8
|
-
class BaseModelPipeline:
|
9
|
-
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
|
10
|
-
self.model_pipe: List[Callable] = []
|
11
|
-
self.artifacts_path = artifacts_path
|
12
|
-
self.pipeline_options = pipeline_options
|
13
|
-
|
14
|
-
def apply(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
15
|
-
for model in self.model_pipe:
|
16
|
-
page_batch = model(page_batch)
|
17
|
-
|
18
|
-
yield from page_batch
|
@@ -1,66 +0,0 @@
|
|
1
|
-
from pathlib import Path
|
2
|
-
|
3
|
-
from docling.datamodel.pipeline_options import (
|
4
|
-
EasyOcrOptions,
|
5
|
-
PipelineOptions,
|
6
|
-
TesseractCliOcrOptions,
|
7
|
-
TesseractOcrOptions,
|
8
|
-
)
|
9
|
-
from docling.models.base_ocr_model import BaseOcrModel
|
10
|
-
from docling.models.easyocr_model import EasyOcrModel
|
11
|
-
from docling.models.layout_model import LayoutModel
|
12
|
-
from docling.models.table_structure_model import TableStructureModel
|
13
|
-
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
14
|
-
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
15
|
-
from docling.pipeline.base_model_pipeline import BaseModelPipeline
|
16
|
-
|
17
|
-
|
18
|
-
class StandardModelPipeline(BaseModelPipeline):
|
19
|
-
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
|
20
|
-
_table_model_path = "model_artifacts/tableformer"
|
21
|
-
|
22
|
-
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
|
23
|
-
super().__init__(artifacts_path, pipeline_options)
|
24
|
-
|
25
|
-
ocr_model: BaseOcrModel
|
26
|
-
if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
|
27
|
-
ocr_model = EasyOcrModel(
|
28
|
-
enabled=pipeline_options.do_ocr,
|
29
|
-
options=pipeline_options.ocr_options,
|
30
|
-
)
|
31
|
-
elif isinstance(pipeline_options.ocr_options, TesseractCliOcrOptions):
|
32
|
-
ocr_model = TesseractOcrCliModel(
|
33
|
-
enabled=pipeline_options.do_ocr,
|
34
|
-
options=pipeline_options.ocr_options,
|
35
|
-
)
|
36
|
-
elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
|
37
|
-
ocr_model = TesseractOcrModel(
|
38
|
-
enabled=pipeline_options.do_ocr,
|
39
|
-
options=pipeline_options.ocr_options,
|
40
|
-
)
|
41
|
-
else:
|
42
|
-
raise RuntimeError(
|
43
|
-
f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
|
44
|
-
)
|
45
|
-
|
46
|
-
self.model_pipe = [
|
47
|
-
# OCR
|
48
|
-
ocr_model,
|
49
|
-
# Layout
|
50
|
-
LayoutModel(
|
51
|
-
config={
|
52
|
-
"artifacts_path": artifacts_path
|
53
|
-
/ StandardModelPipeline._layout_model_path
|
54
|
-
}
|
55
|
-
),
|
56
|
-
# Table structure
|
57
|
-
TableStructureModel(
|
58
|
-
config={
|
59
|
-
"artifacts_path": artifacts_path
|
60
|
-
/ StandardModelPipeline._table_model_path,
|
61
|
-
"enabled": pipeline_options.do_table_structure,
|
62
|
-
"mode": pipeline_options.table_structure_options.mode,
|
63
|
-
"do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
|
64
|
-
}
|
65
|
-
),
|
66
|
-
]
|