docling 1.20.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. docling/backend/abstract_backend.py +32 -37
  2. docling/backend/docling_parse_backend.py +16 -12
  3. docling/backend/docling_parse_v2_backend.py +15 -11
  4. docling/backend/html_backend.py +425 -0
  5. docling/backend/mspowerpoint_backend.py +375 -0
  6. docling/backend/msword_backend.py +509 -0
  7. docling/backend/pdf_backend.py +78 -0
  8. docling/backend/pypdfium2_backend.py +15 -10
  9. docling/cli/main.py +61 -60
  10. docling/datamodel/base_models.py +73 -193
  11. docling/datamodel/document.py +364 -318
  12. docling/datamodel/pipeline_options.py +13 -0
  13. docling/datamodel/settings.py +1 -0
  14. docling/document_converter.py +215 -252
  15. docling/models/base_model.py +25 -0
  16. docling/models/base_ocr_model.py +10 -5
  17. docling/models/ds_glm_model.py +209 -20
  18. docling/models/easyocr_model.py +4 -1
  19. docling/models/layout_model.py +73 -61
  20. docling/models/page_assemble_model.py +21 -5
  21. docling/models/page_preprocessing_model.py +57 -0
  22. docling/models/table_structure_model.py +34 -32
  23. docling/models/tesseract_ocr_cli_model.py +8 -5
  24. docling/models/tesseract_ocr_model.py +8 -5
  25. docling/pipeline/base_pipeline.py +190 -0
  26. docling/pipeline/simple_pipeline.py +59 -0
  27. docling/pipeline/standard_pdf_pipeline.py +198 -0
  28. docling/utils/export.py +4 -3
  29. docling/utils/layout_utils.py +17 -11
  30. docling-2.0.0.dist-info/METADATA +149 -0
  31. docling-2.0.0.dist-info/RECORD +42 -0
  32. docling/pipeline/base_model_pipeline.py +0 -18
  33. docling/pipeline/standard_model_pipeline.py +0 -66
  34. docling-1.20.0.dist-info/METADATA +0 -380
  35. docling-1.20.0.dist-info/RECORD +0 -35
  36. {docling-1.20.0.dist-info → docling-2.0.0.dist-info}/LICENSE +0 -0
  37. {docling-1.20.0.dist-info → docling-2.0.0.dist-info}/WHEEL +0 -0
  38. {docling-1.20.0.dist-info → docling-2.0.0.dist-info}/entry_points.txt +0 -0
@@ -2,6 +2,7 @@ import copy
2
2
  import logging
3
3
 
4
4
  import networkx as nx
5
+ from docling_core.types.doc import DocItemLabel
5
6
 
6
7
  logger = logging.getLogger("layout_utils")
7
8
 
@@ -370,7 +371,7 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
370
371
  "Treating cluster " + str(ix) + ", type " + str(new_cluster["type"])
371
372
  )
372
373
  logger.debug(" with cells: " + str(new_cluster["cell_ids"]))
373
- if len(cluster["cell_ids"]) == 0 and cluster["type"] != "Picture":
374
+ if len(cluster["cell_ids"]) == 0 and cluster["type"] != DocItemLabel.PICTURE:
374
375
  logger.debug(" Empty non-picture, removed")
375
376
  continue ## Skip this former cluster, now without cells.
376
377
  new_bbox = adapt_bbox(raw_cells, new_cluster, orphan_cell_indices)
@@ -380,14 +381,14 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
380
381
 
381
382
 
382
383
  def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
383
- if not (cluster["type"] in ["Table", "Picture"]):
384
+ if not (cluster["type"] in [DocItemLabel.TABLE, DocItemLabel.PICTURE]):
384
385
  ## A text-like cluster. The bbox only needs to be around the text cells:
385
386
  logger.debug(" Initial bbox: " + str(cluster["bbox"]))
386
387
  new_bbox = surrounding_list(
387
388
  [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
388
389
  )
389
390
  logger.debug(" New bounding box:" + str(new_bbox))
390
- if cluster["type"] == "Picture":
391
+ if cluster["type"] == DocItemLabel.PICTURE:
391
392
  ## We only make the bbox completely comprise included text cells:
392
393
  logger.debug(" Picture")
393
394
  if len(cluster["cell_ids"]) != 0:
@@ -587,7 +588,7 @@ def set_orphan_as_text(
587
588
  max_id = -1
588
589
  figures = []
589
590
  for cluster in cluster_predictions:
590
- if cluster["type"] == "Picture":
591
+ if cluster["type"] == DocItemLabel.PICTURE:
591
592
  figures.append(cluster)
592
593
 
593
594
  if cluster["id"] > max_id:
@@ -638,13 +639,13 @@ def set_orphan_as_text(
638
639
  # if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans:
639
640
  if fig_flag == False and lines_detector == False:
640
641
  # get class from low confidence detections if not set as text:
641
- class_type = "Text"
642
+ class_type = DocItemLabel.TEXT
642
643
 
643
644
  for cluster in cluster_predictions_low:
644
645
  intersection = compute_intersection(
645
646
  orph_cell["bbox"], cluster["bbox"]
646
647
  )
647
- class_type = "Text"
648
+ class_type = DocItemLabel.TEXT
648
649
  if (
649
650
  cluster["confidence"] > 0.1
650
651
  and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4
@@ -718,7 +719,9 @@ def merge_cells(cluster_predictions):
718
719
  if cluster["id"] == node:
719
720
  lines.append(cluster)
720
721
  cluster_predictions.remove(cluster)
721
- new_merged_cluster = build_cluster_from_lines(lines, "Text", max_id)
722
+ new_merged_cluster = build_cluster_from_lines(
723
+ lines, DocItemLabel.TEXT, max_id
724
+ )
722
725
  cluster_predictions.append(new_merged_cluster)
723
726
  return cluster_predictions
724
727
 
@@ -753,9 +756,9 @@ def clean_up_clusters(
753
756
  # remove clusters that might appear inside tables, or images (such as pdf cells in graphs)
754
757
  elif img_table == True:
755
758
  if (
756
- cluster_1["type"] == "Text"
757
- and cluster_2["type"] == "Picture"
758
- or cluster_2["type"] == "Table"
759
+ cluster_1["type"] == DocItemLabel.TEXT
760
+ and cluster_2["type"] == DocItemLabel.PICTURE
761
+ or cluster_2["type"] == DocItemLabel.TABLE
759
762
  ):
760
763
  if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > 0.5:
761
764
  DuplicateDeletedClusterIDs.append(cluster_1["id"])
@@ -771,7 +774,10 @@ def clean_up_clusters(
771
774
  DuplicateDeletedClusterIDs.append(cluster_1["id"])
772
775
  # remove tables that have one pdf cell
773
776
  if one_cell_table == True:
774
- if cluster_1["type"] == "Table" and len(cluster_1["cell_ids"]) < 2:
777
+ if (
778
+ cluster_1["type"] == DocItemLabel.TABLE
779
+ and len(cluster_1["cell_ids"]) < 2
780
+ ):
775
781
  DuplicateDeletedClusterIDs.append(cluster_1["id"])
776
782
 
777
783
  DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs))
@@ -0,0 +1,149 @@
1
+ Metadata-Version: 2.1
2
+ Name: docling
3
+ Version: 2.0.0
4
+ Summary: Docling PDF conversion package
5
+ Home-page: https://github.com/DS4SD/docling
6
+ License: MIT
7
+ Keywords: docling,convert,document,pdf,layout model,segmentation,table structure,table former
8
+ Author: Christoph Auer
9
+ Author-email: cau@zurich.ibm.com
10
+ Requires-Python: >=3.10,<4.0
11
+ Classifier: Development Status :: 5 - Production/Stable
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: MacOS :: MacOS X
16
+ Classifier: Operating System :: POSIX :: Linux
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Provides-Extra: tesserocr
23
+ Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
24
+ Requires-Dist: certifi (>=2024.7.4)
25
+ Requires-Dist: deepsearch-glm (>=0.25.0,<0.26.0)
26
+ Requires-Dist: docling-core (>=2.0.0,<3.0.0)
27
+ Requires-Dist: docling-ibm-models (>=2.0.1,<3.0.0)
28
+ Requires-Dist: docling-parse (>=1.6.0,<2.0.0)
29
+ Requires-Dist: easyocr (>=1.7,<2.0)
30
+ Requires-Dist: filetype (>=1.2.0,<2.0.0)
31
+ Requires-Dist: huggingface_hub (>=0.23,<1)
32
+ Requires-Dist: pandas (>=2.1.4,<3.0.0)
33
+ Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
34
+ Requires-Dist: pydantic (>=2.0.0,<3.0.0)
35
+ Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
36
+ Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
37
+ Requires-Dist: python-docx (>=1.1.2,<2.0.0)
38
+ Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
39
+ Requires-Dist: requests (>=2.32.3,<3.0.0)
40
+ Requires-Dist: rtree (>=1.3.0,<2.0.0)
41
+ Requires-Dist: scipy (>=1.14.1,<2.0.0)
42
+ Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
43
+ Requires-Dist: torch (>=2.2.2,<2.3.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
44
+ Requires-Dist: torch (>=2.2.2,<3.0.0) ; sys_platform != "darwin" or platform_machine != "x86_64"
45
+ Requires-Dist: torchvision (>=0,<1) ; sys_platform != "darwin" or platform_machine != "x86_64"
46
+ Requires-Dist: torchvision (>=0.17.2,<0.18.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
47
+ Requires-Dist: typer (>=0.12.5,<0.13.0)
48
+ Project-URL: Repository, https://github.com/DS4SD/docling
49
+ Description-Content-Type: text/markdown
50
+
51
+ <p align="center">
52
+ <a href="https://github.com/ds4sd/docling">
53
+ <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/logo.png" width="150" />
54
+ </a>
55
+ </p>
56
+
57
+ # Docling
58
+
59
+ [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
60
+ [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/)
61
+ [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
62
+ ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
63
+ [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
64
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
65
+ [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
66
+ [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
67
+ [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
68
+ [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
69
+
70
+ Docling parses documents and exports them to the desired format with ease and speed.
71
+
72
+ ## Features
73
+
74
+ * 🗂️ Multi-format support for input (PDF, DOCX etc.) & output (Markdown, JSON etc.)
75
+ * 📑 Advanced PDF document understanding incl. page layout, reading order & table structures
76
+ * 📝 Metadata extraction, including title, authors, references & language
77
+ * 🤖 Seamless LlamaIndex 🦙 & LangChain 🦜🔗 integration for powerful RAG / QA applications
78
+ * 🔍 OCR support for scanned PDFs
79
+ * 💻 Simple and convenient CLI
80
+
81
+ Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty of examples and unlock the full power of Docling!
82
+
83
+
84
+ ## Installation
85
+
86
+ To use Docling, simply install `docling` from your package manager, e.g. pip:
87
+ ```bash
88
+ pip install docling
89
+ ```
90
+
91
+ Works on macOS, Linux and Windows environments, on both x86_64 and arm64 architectures.
92
+
93
+ More [detailed installation instructions](https://ds4sd.github.io/docling/installation/) are available in the docs.
94
+
95
+ ## Getting started
96
+
97
+ To convert individual documents, use `convert()`, for example:
98
+
99
+ ```python
100
+ from docling.document_converter import DocumentConverter
101
+
102
+ source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
103
+ converter = DocumentConverter()
104
+ result = converter.convert(source)
105
+ print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
106
+ print(result.document.export_to_document_tokens()) # output: "<document><title><page_1><loc_20>..."
107
+ ```
108
+
109
+
110
+ Check out [Getting started](https://ds4sd.github.io/docling/).
111
+ You will find lots of tuning options to leverage all the advanced capabilities.
112
+
113
+
114
+ ## Get help and support
115
+
116
+ Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
117
+
118
+
119
+ ## Technical report
120
+
121
+ For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
122
+
123
+ ## Contributing
124
+
125
+ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
126
+
127
+
128
+ ## References
129
+
130
+ If you use Docling in your projects, please consider citing the following:
131
+
132
+ ```bib
133
+ @techreport{Docling,
134
+ author = {Deep Search Team},
135
+ month = {8},
136
+ title = {Docling Technical Report},
137
+ url = {https://arxiv.org/abs/2408.09869},
138
+ eprint = {2408.09869},
139
+ doi = {10.48550/arXiv.2408.09869},
140
+ version = {1.0.0},
141
+ year = {2024}
142
+ }
143
+ ```
144
+
145
+ ## License
146
+
147
+ The Docling codebase is under MIT license.
148
+ For individual model usage, please refer to the model licenses found in the original packages.
149
+
@@ -0,0 +1,42 @@
1
+ docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ docling/backend/abstract_backend.py,sha256=8Lh1gf1P9AnzlwB989OVBgLmokTpfI0LxYRfuvYTqoo,1646
4
+ docling/backend/docling_parse_backend.py,sha256=UgBpopZIP5YkhwhybiqDnqVsSqv9DAAPFkafhfL0pPo,7623
5
+ docling/backend/docling_parse_v2_backend.py,sha256=VY7MsiyqjN3Vl0UkyezriiVJMLbLRrQVuKjWaTgIUwY,8336
6
+ docling/backend/html_backend.py,sha256=MlhEXaA0tgX_tLuQLnkex43gsKqpqHWnbkssxY4n_kc,14753
7
+ docling/backend/mspowerpoint_backend.py,sha256=2UYfMMeWwgDtvIKQELCA-bYv5Z-rGvbMiBNcidNL_uE,14332
8
+ docling/backend/msword_backend.py,sha256=4SDqZAZxLr6VV50OU3MRBAV8SwZMCyJCUbNVMVUpitc,17659
9
+ docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
10
+ docling/backend/pypdfium2_backend.py,sha256=MJX6fQqwK3r967fyAAs-RA_YIkeQvhgsLkQAgaBTgaE,8995
11
+ docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
+ docling/cli/main.py,sha256=NRVGz0z-3EBwYNMJGVnLtDBcfOeutaUyYdkM0ymRnGA,8008
13
+ docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
+ docling/datamodel/base_models.py,sha256=Ha-DoRZoksjHSZHWqUSiQ79MTBEfY5ur8U_LVtyBRYU,5153
15
+ docling/datamodel/document.py,sha256=GCARkUuv8TNtFO934E7KujOsTkBFqLXX5bogNprVXEM,19411
16
+ docling/datamodel/pipeline_options.py,sha256=mez7CiJMtm-xhOmZ-2-M_Q3YwC6EzHytWfg0E3tiVio,2329
17
+ docling/datamodel/settings.py,sha256=KBFVeQviR1hoCFjA1ZwuLuQ6EAAYR7saIa6EUYiOkHI,767
18
+ docling/document_converter.py,sha256=S_t9hs2uZfXC38LC0hTaAihrSJIrCvnTiuY5SvUccgk,9587
19
+ docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
+ docling/models/base_model.py,sha256=wSBGAIAbLqrqP_SMtkzXMuyFvvzjVU6iCqgSNnGIR4Y,603
21
+ docling/models/base_ocr_model.py,sha256=N5pOQ4RQSWPU-bPZ81FySDdBnwNG64-6K0ldK6ENU0U,4672
22
+ docling/models/ds_glm_model.py,sha256=nUBHTsE-eRtrtPE6v_N4iZGr43bXIsOfb_8NFUMWJQk,11057
23
+ docling/models/easyocr_model.py,sha256=URhHzxwnBuErf6sskWyEWauX-Kne0upnrAguzKQi3SI,3090
24
+ docling/models/layout_model.py,sha256=B4Veff9V0WxcQXTBYzJM6rE7B3lszUI7zmg7EFE0WxU,12245
25
+ docling/models/page_assemble_model.py,sha256=ovwSki52w1rlrc7MgMbjh1Uc5H8XBCz9S2nHE44mzYU,6030
26
+ docling/models/page_preprocessing_model.py,sha256=PJ_jASz3w0Lus_Ep4NN5Vq_Redq7x8vAyVR8qXCb6Eg,1817
27
+ docling/models/table_structure_model.py,sha256=qcjXXiNZcMWjr6ys02sToKZlAr8S0rAJNICbBjK9Ijo,6426
28
+ docling/models/tesseract_ocr_cli_model.py,sha256=l-gRDU273opgack9fAxHaXPEdX5IdD5ZTnu6VsfKIWc,5665
29
+ docling/models/tesseract_ocr_model.py,sha256=tEEq-URSYnyQru7RoD5fx-s1trwMxPCcwJx94M4iuxc,4676
30
+ docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
+ docling/pipeline/base_pipeline.py,sha256=7DTzVvM_jVHCxyY-BuuGRhmUsD_sgX4DD00oBFJWdB8,6723
32
+ docling/pipeline/simple_pipeline.py,sha256=pxce0-3He5Lqa-xXT-7h173XVOSMZiMHl6HOfAJmQ7o,2162
33
+ docling/pipeline/standard_pdf_pipeline.py,sha256=_gRGR9tsy55_tptFj-AiEJEedxhJ0iIjHb5qaj36d28,7506
34
+ docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
+ docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
36
+ docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
37
+ docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
38
+ docling-2.0.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
39
+ docling-2.0.0.dist-info/METADATA,sha256=RyawmIT2dz9la0DH8KsW749TNq4BpiSIndVEz83wauQ,6235
40
+ docling-2.0.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
41
+ docling-2.0.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
42
+ docling-2.0.0.dist-info/RECORD,,
@@ -1,18 +0,0 @@
1
- from pathlib import Path
2
- from typing import Callable, Iterable, List
3
-
4
- from docling.datamodel.base_models import Page
5
- from docling.datamodel.pipeline_options import PipelineOptions
6
-
7
-
8
- class BaseModelPipeline:
9
- def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
10
- self.model_pipe: List[Callable] = []
11
- self.artifacts_path = artifacts_path
12
- self.pipeline_options = pipeline_options
13
-
14
- def apply(self, page_batch: Iterable[Page]) -> Iterable[Page]:
15
- for model in self.model_pipe:
16
- page_batch = model(page_batch)
17
-
18
- yield from page_batch
@@ -1,66 +0,0 @@
1
- from pathlib import Path
2
-
3
- from docling.datamodel.pipeline_options import (
4
- EasyOcrOptions,
5
- PipelineOptions,
6
- TesseractCliOcrOptions,
7
- TesseractOcrOptions,
8
- )
9
- from docling.models.base_ocr_model import BaseOcrModel
10
- from docling.models.easyocr_model import EasyOcrModel
11
- from docling.models.layout_model import LayoutModel
12
- from docling.models.table_structure_model import TableStructureModel
13
- from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
14
- from docling.models.tesseract_ocr_model import TesseractOcrModel
15
- from docling.pipeline.base_model_pipeline import BaseModelPipeline
16
-
17
-
18
- class StandardModelPipeline(BaseModelPipeline):
19
- _layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
20
- _table_model_path = "model_artifacts/tableformer"
21
-
22
- def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
23
- super().__init__(artifacts_path, pipeline_options)
24
-
25
- ocr_model: BaseOcrModel
26
- if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
27
- ocr_model = EasyOcrModel(
28
- enabled=pipeline_options.do_ocr,
29
- options=pipeline_options.ocr_options,
30
- )
31
- elif isinstance(pipeline_options.ocr_options, TesseractCliOcrOptions):
32
- ocr_model = TesseractOcrCliModel(
33
- enabled=pipeline_options.do_ocr,
34
- options=pipeline_options.ocr_options,
35
- )
36
- elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
37
- ocr_model = TesseractOcrModel(
38
- enabled=pipeline_options.do_ocr,
39
- options=pipeline_options.ocr_options,
40
- )
41
- else:
42
- raise RuntimeError(
43
- f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
44
- )
45
-
46
- self.model_pipe = [
47
- # OCR
48
- ocr_model,
49
- # Layout
50
- LayoutModel(
51
- config={
52
- "artifacts_path": artifacts_path
53
- / StandardModelPipeline._layout_model_path
54
- }
55
- ),
56
- # Table structure
57
- TableStructureModel(
58
- config={
59
- "artifacts_path": artifacts_path
60
- / StandardModelPipeline._table_model_path,
61
- "enabled": pipeline_options.do_table_structure,
62
- "mode": pipeline_options.table_structure_options.mode,
63
- "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
64
- }
65
- ),
66
- ]