docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. docling/backend/abstract_backend.py +33 -37
  2. docling/backend/asciidoc_backend.py +431 -0
  3. docling/backend/docling_parse_backend.py +20 -16
  4. docling/backend/docling_parse_v2_backend.py +248 -0
  5. docling/backend/html_backend.py +429 -0
  6. docling/backend/md_backend.py +346 -0
  7. docling/backend/mspowerpoint_backend.py +398 -0
  8. docling/backend/msword_backend.py +496 -0
  9. docling/backend/pdf_backend.py +78 -0
  10. docling/backend/pypdfium2_backend.py +16 -11
  11. docling/cli/main.py +96 -65
  12. docling/datamodel/base_models.py +79 -193
  13. docling/datamodel/document.py +405 -320
  14. docling/datamodel/pipeline_options.py +19 -3
  15. docling/datamodel/settings.py +16 -1
  16. docling/document_converter.py +240 -251
  17. docling/models/base_model.py +28 -0
  18. docling/models/base_ocr_model.py +40 -10
  19. docling/models/ds_glm_model.py +244 -30
  20. docling/models/easyocr_model.py +57 -42
  21. docling/models/layout_model.py +158 -116
  22. docling/models/page_assemble_model.py +127 -101
  23. docling/models/page_preprocessing_model.py +79 -0
  24. docling/models/table_structure_model.py +162 -116
  25. docling/models/tesseract_ocr_cli_model.py +76 -59
  26. docling/models/tesseract_ocr_model.py +90 -58
  27. docling/pipeline/base_pipeline.py +189 -0
  28. docling/pipeline/simple_pipeline.py +56 -0
  29. docling/pipeline/standard_pdf_pipeline.py +201 -0
  30. docling/utils/export.py +4 -3
  31. docling/utils/layout_utils.py +17 -11
  32. docling/utils/profiling.py +62 -0
  33. docling-2.4.1.dist-info/METADATA +154 -0
  34. docling-2.4.1.dist-info/RECORD +45 -0
  35. docling/pipeline/base_model_pipeline.py +0 -18
  36. docling/pipeline/standard_model_pipeline.py +0 -66
  37. docling-1.19.1.dist-info/METADATA +0 -380
  38. docling-1.19.1.dist-info/RECORD +0 -34
  39. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
  40. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
  41. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
@@ -2,6 +2,7 @@ import copy
2
2
  import logging
3
3
 
4
4
  import networkx as nx
5
+ from docling_core.types.doc import DocItemLabel
5
6
 
6
7
  logger = logging.getLogger("layout_utils")
7
8
 
@@ -370,7 +371,7 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
370
371
  "Treating cluster " + str(ix) + ", type " + str(new_cluster["type"])
371
372
  )
372
373
  logger.debug(" with cells: " + str(new_cluster["cell_ids"]))
373
- if len(cluster["cell_ids"]) == 0 and cluster["type"] != "Picture":
374
+ if len(cluster["cell_ids"]) == 0 and cluster["type"] != DocItemLabel.PICTURE:
374
375
  logger.debug(" Empty non-picture, removed")
375
376
  continue ## Skip this former cluster, now without cells.
376
377
  new_bbox = adapt_bbox(raw_cells, new_cluster, orphan_cell_indices)
@@ -380,14 +381,14 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
380
381
 
381
382
 
382
383
  def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
383
- if not (cluster["type"] in ["Table", "Picture"]):
384
+ if not (cluster["type"] in [DocItemLabel.TABLE, DocItemLabel.PICTURE]):
384
385
  ## A text-like cluster. The bbox only needs to be around the text cells:
385
386
  logger.debug(" Initial bbox: " + str(cluster["bbox"]))
386
387
  new_bbox = surrounding_list(
387
388
  [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
388
389
  )
389
390
  logger.debug(" New bounding box:" + str(new_bbox))
390
- if cluster["type"] == "Picture":
391
+ if cluster["type"] == DocItemLabel.PICTURE:
391
392
  ## We only make the bbox completely comprise included text cells:
392
393
  logger.debug(" Picture")
393
394
  if len(cluster["cell_ids"]) != 0:
@@ -587,7 +588,7 @@ def set_orphan_as_text(
587
588
  max_id = -1
588
589
  figures = []
589
590
  for cluster in cluster_predictions:
590
- if cluster["type"] == "Picture":
591
+ if cluster["type"] == DocItemLabel.PICTURE:
591
592
  figures.append(cluster)
592
593
 
593
594
  if cluster["id"] > max_id:
@@ -638,13 +639,13 @@ def set_orphan_as_text(
638
639
  # if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans:
639
640
  if fig_flag == False and lines_detector == False:
640
641
  # get class from low confidence detections if not set as text:
641
- class_type = "Text"
642
+ class_type = DocItemLabel.TEXT
642
643
 
643
644
  for cluster in cluster_predictions_low:
644
645
  intersection = compute_intersection(
645
646
  orph_cell["bbox"], cluster["bbox"]
646
647
  )
647
- class_type = "Text"
648
+ class_type = DocItemLabel.TEXT
648
649
  if (
649
650
  cluster["confidence"] > 0.1
650
651
  and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4
@@ -718,7 +719,9 @@ def merge_cells(cluster_predictions):
718
719
  if cluster["id"] == node:
719
720
  lines.append(cluster)
720
721
  cluster_predictions.remove(cluster)
721
- new_merged_cluster = build_cluster_from_lines(lines, "Text", max_id)
722
+ new_merged_cluster = build_cluster_from_lines(
723
+ lines, DocItemLabel.TEXT, max_id
724
+ )
722
725
  cluster_predictions.append(new_merged_cluster)
723
726
  return cluster_predictions
724
727
 
@@ -753,9 +756,9 @@ def clean_up_clusters(
753
756
  # remove clusters that might appear inside tables, or images (such as pdf cells in graphs)
754
757
  elif img_table == True:
755
758
  if (
756
- cluster_1["type"] == "Text"
757
- and cluster_2["type"] == "Picture"
758
- or cluster_2["type"] == "Table"
759
+ cluster_1["type"] == DocItemLabel.TEXT
760
+ and cluster_2["type"] == DocItemLabel.PICTURE
761
+ or cluster_2["type"] == DocItemLabel.TABLE
759
762
  ):
760
763
  if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > 0.5:
761
764
  DuplicateDeletedClusterIDs.append(cluster_1["id"])
@@ -771,7 +774,10 @@ def clean_up_clusters(
771
774
  DuplicateDeletedClusterIDs.append(cluster_1["id"])
772
775
  # remove tables that have one pdf cell
773
776
  if one_cell_table == True:
774
- if cluster_1["type"] == "Table" and len(cluster_1["cell_ids"]) < 2:
777
+ if (
778
+ cluster_1["type"] == DocItemLabel.TABLE
779
+ and len(cluster_1["cell_ids"]) < 2
780
+ ):
775
781
  DuplicateDeletedClusterIDs.append(cluster_1["id"])
776
782
 
777
783
  DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs))
@@ -0,0 +1,62 @@
1
+ import time
2
+ from datetime import datetime
3
+ from enum import Enum
4
+ from typing import TYPE_CHECKING, List
5
+
6
+ import numpy as np
7
+ from pydantic import BaseModel
8
+
9
+ from docling.datamodel.settings import settings
10
+
11
+ if TYPE_CHECKING:
12
+ from docling.datamodel.document import ConversionResult
13
+
14
+
15
class ProfilingScope(str, Enum):
    """Label describing the granularity a timing entry belongs to.

    The ``str`` mix-in makes every member compare equal to (and behave as)
    its plain string value.
    """

    # Page-level scope; this is the default scope used by TimeRecorder.
    PAGE = "page"
    # Document-level scope.
    DOCUMENT = "document"
18
+
19
+
20
class ProfilingItem(BaseModel):
    """Accumulated timing samples recorded under one profiling key.

    Attributes are filled in by ``TimeRecorder``: each completed timed
    section appends one elapsed duration to ``times``, one start timestamp
    to ``start_timestamps`` and increments ``count``.

    All statistics return a plain ``float``. When no sample has been
    recorded yet they return NaN directly instead of routing an empty list
    through NumPy, which would emit "empty slice" RuntimeWarnings.
    """

    scope: ProfilingScope
    count: int = 0
    # NOTE(review): relies on pydantic copying mutable field defaults per
    # instance, so these lists are not shared between items -- confirm
    # against the pinned pydantic version.
    times: List[float] = []
    start_timestamps: List[datetime] = []

    def avg(self) -> float:
        """Return the unweighted average of the recorded times (NaN if none)."""
        if not self.times:
            return float("nan")
        return float(np.average(self.times))

    def std(self) -> float:
        """Return the population standard deviation of the times (NaN if none)."""
        if not self.times:
            return float("nan")
        return float(np.std(self.times))

    def mean(self) -> float:
        """Return the arithmetic mean of the recorded times (NaN if none)."""
        if not self.times:
            return float("nan")
        return float(np.mean(self.times))

    def percentile(self, perc: float) -> float:
        """Return the ``perc``-th percentile of the recorded times (NaN if none).

        ``perc`` is passed straight to ``numpy.percentile`` and is therefore
        expected to lie in the range 0..100.
        """
        if not self.times:
            return float("nan")
        return float(np.percentile(self.times, perc))
37
+
38
+
39
class TimeRecorder:
    """Context manager that times a code section into ``conv_res.timings``.

    When ``settings.debug.profile_pipeline_timings`` is truthy at
    construction time, entering the context stores a start timestamp and
    leaving it appends the elapsed monotonic time to the ``ProfilingItem``
    registered under ``key``. When the switch is off, the recorder does
    nothing.

    The switch is read exactly once, in ``__init__``. Re-reading it in
    ``__enter__``/``__exit__`` would break if the setting changed in
    between: the recorder would then touch state that was never set up.
    """

    def __init__(
        self,
        conv_res: "ConversionResult",
        key: str,
        scope: ProfilingScope = ProfilingScope.PAGE,
    ):
        """Bind the recorder to ``conv_res`` and make sure ``key`` has an item.

        Args:
            conv_res: Object whose ``timings`` mapping receives the samples.
            key: Name under which the samples are stored.
            scope: Scope assigned to the item if it has to be created.
        """
        # Snapshot the global switch so all three phases agree with each other.
        self._enabled = bool(settings.debug.profile_pipeline_timings)
        self.conv_res = conv_res
        self.key = key
        if self._enabled and key not in conv_res.timings:
            conv_res.timings[key] = ProfilingItem(scope=scope)

    def __enter__(self):
        """Start the clock and log the start timestamp; returns ``self``."""
        if self._enabled:
            # Local import: only ``datetime`` is imported at module level.
            from datetime import timezone

            self.start = time.monotonic()
            # Same naive-UTC value as the deprecated ``datetime.utcnow()``.
            started_at = datetime.now(timezone.utc).replace(tzinfo=None)
            self.conv_res.timings[self.key].start_timestamps.append(started_at)
        return self

    def __exit__(self, *args):
        """Record the elapsed time; never suppresses an in-flight exception."""
        if self._enabled:
            elapsed = time.monotonic() - self.start
            item = self.conv_res.timings[self.key]
            item.times.append(elapsed)
            item.count += 1
@@ -0,0 +1,154 @@
1
+ Metadata-Version: 2.1
2
+ Name: docling
3
+ Version: 2.4.1
4
+ Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
+ Home-page: https://github.com/DS4SD/docling
6
+ License: MIT
7
+ Keywords: docling,convert,document,pdf,docx,html,markdown,layout model,segmentation,table structure,table former
8
+ Author: Christoph Auer
9
+ Author-email: cau@zurich.ibm.com
10
+ Requires-Python: >=3.10,<4.0
11
+ Classifier: Development Status :: 5 - Production/Stable
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: MacOS :: MacOS X
16
+ Classifier: Operating System :: POSIX :: Linux
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Provides-Extra: tesserocr
23
+ Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
24
+ Requires-Dist: certifi (>=2024.7.4)
25
+ Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
26
+ Requires-Dist: docling-core (>=2.3.0,<3.0.0)
27
+ Requires-Dist: docling-ibm-models (>=2.0.3,<3.0.0)
28
+ Requires-Dist: docling-parse (>=2.0.2,<3.0.0)
29
+ Requires-Dist: easyocr (>=1.7,<2.0)
30
+ Requires-Dist: filetype (>=1.2.0,<2.0.0)
31
+ Requires-Dist: huggingface_hub (>=0.23,<1)
32
+ Requires-Dist: marko (>=2.1.2,<3.0.0)
33
+ Requires-Dist: pandas (>=2.1.4,<3.0.0)
34
+ Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
35
+ Requires-Dist: pydantic (>=2.0.0,<3.0.0)
36
+ Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
37
+ Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
38
+ Requires-Dist: python-docx (>=1.1.2,<2.0.0)
39
+ Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
40
+ Requires-Dist: requests (>=2.32.3,<3.0.0)
41
+ Requires-Dist: rtree (>=1.3.0,<2.0.0)
42
+ Requires-Dist: scipy (>=1.14.1,<2.0.0)
43
+ Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
44
+ Requires-Dist: typer (>=0.12.5,<0.13.0)
45
+ Project-URL: Repository, https://github.com/DS4SD/docling
46
+ Description-Content-Type: text/markdown
47
+
48
+ <p align="center">
49
+ <a href="https://github.com/ds4sd/docling">
50
+ <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/docling_processing.png" width="100%"/>
51
+ </a>
52
+ </p>
53
+
54
+ # Docling
55
+
56
+ <p align="center">
57
+ <a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
58
+ </p>
59
+
60
+ [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
61
+ [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/)
62
+ [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
63
+ ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
64
+ [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
65
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
66
+ [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
67
+ [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
68
+ [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
69
+ [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
70
+
71
+ Docling parses documents and exports them to the desired format with ease and speed.
72
+
73
+ ## Features
74
+
75
+ * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc, Markdown) and exports to Markdown and JSON
76
+ * 📑 Advanced PDF document understanding including page layout, reading order & table structures
77
+ * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
78
+ * 🤖 Easy integration with LlamaIndex 🦙 & LangChain 🦜🔗 for powerful RAG / QA applications
79
+ * 🔍 OCR support for scanned PDFs
80
+ * 💻 Simple and convenient CLI
81
+
82
+ Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty of examples and unlock the full power of Docling!
83
+
84
+ ### Coming soon
85
+
86
+ * ♾️ Equation & code extraction
87
+ * 📝 Metadata extraction, including title, authors, references & language
88
+ * 🦜🔗 Native LangChain extension
89
+
90
+ ## Installation
91
+
92
+ To use Docling, simply install `docling` from your package manager, e.g. pip:
93
+ ```bash
94
+ pip install docling
95
+ ```
96
+
97
+ Works on macOS, Linux, and Windows environments, on both x86_64 and arm64 architectures.
98
+
99
+ More [detailed installation instructions](https://ds4sd.github.io/docling/installation/) are available in the docs.
100
+
101
+ ## Getting started
102
+
103
+ To convert individual documents, use `convert()`, for example:
104
+
105
+ ```python
106
+ from docling.document_converter import DocumentConverter
107
+
108
+ source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL
109
+ converter = DocumentConverter()
110
+ result = converter.convert(source)
111
+ print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
112
+ ```
113
+
114
+ Check out [Getting started](https://ds4sd.github.io/docling/).
115
+ You will find lots of tuning options to leverage all the advanced capabilities.
116
+
117
+ ## Get help and support
118
+
119
+ Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
120
+
121
+ ## Technical report
122
+
123
+ For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
124
+
125
+ ## Contributing
126
+
127
+ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
128
+
129
+ ## References
130
+
131
+ If you use Docling in your projects, please consider citing the following:
132
+
133
+ ```bib
134
+ @techreport{Docling,
135
+ author = {Deep Search Team},
136
+ month = {8},
137
+ title = {Docling Technical Report},
138
+ url = {https://arxiv.org/abs/2408.09869},
139
+ eprint = {2408.09869},
140
+ doi = {10.48550/arXiv.2408.09869},
141
+ version = {1.0.0},
142
+ year = {2024}
143
+ }
144
+ ```
145
+
146
+ ## License
147
+
148
+ The Docling codebase is under MIT license.
149
+ For individual model usage, please refer to the model licenses found in the original packages.
150
+
151
+ ## IBM ❤️ Open Source AI
152
+
153
+ Docling has been brought to you by IBM.
154
+
@@ -0,0 +1,45 @@
1
+ docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq_G2RdU,1678
4
+ docling/backend/asciidoc_backend.py,sha256=kXZxOLk_LvLFVZwnJVVwjmvc3QWZ0iiG7VnwjgtC3hI,14051
5
+ docling/backend/docling_parse_backend.py,sha256=csWy6ZGxDuZfNr0YTrUU40DXqelN_TJksWIYoXxZMjU,7633
6
+ docling/backend/docling_parse_v2_backend.py,sha256=gUr9_fwHbkj238oYQPJ9AxpjFL2jGvhjBlBQPblmSAg,8589
7
+ docling/backend/html_backend.py,sha256=p3WlYta1f3e4osmvVR12KIUYLJimveTX8UwEkyPt7_g,15161
8
+ docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
9
+ docling/backend/mspowerpoint_backend.py,sha256=J472AIH_IXvGg3D0FDmXhue1At_VSBD6n15c64Kxttw,15446
10
+ docling/backend/msword_backend.py,sha256=FAUdP74QxGKo2xMZQ4WQGYwtpIBCTJ_FG17PBpRwhxI,17230
11
+ docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
12
+ docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
13
+ docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
+ docling/cli/main.py,sha256=IOeIpGoK_5AeE_6LYTU_nfZjqpZ5xeGaTCB8Vfsama0,9334
15
+ docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
+ docling/datamodel/base_models.py,sha256=fmkS6iTxGZCTtNCo2zsgMmBC11Ogf2Ht-mNIlZ9GP-o,5375
17
+ docling/datamodel/document.py,sha256=9dQf_J18X_MEWs-Mg3Ed6BykFPJ79ETmkkxcssY-vYo,20698
18
+ docling/datamodel/pipeline_options.py,sha256=PqQ4VjMDN16oWZSUYtskQEH366504OZmnjinCaOWmMc,2444
19
+ docling/datamodel/settings.py,sha256=2-sYEnKLV_giGygUlBtiBd4CJYN5T9-3BdL6NpWkUYw,1155
20
+ docling/document_converter.py,sha256=U52_rZQDm2wzrnsuUrvsfX2MnmOWFFhjBzfS8tEvt6Y,10595
21
+ docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
+ docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
23
+ docling/models/base_ocr_model.py,sha256=Ti0glL-_DVRfmP3MpywYVmkNf5RP6qhRg_UKzJuV1Dc,5663
24
+ docling/models/ds_glm_model.py,sha256=2OpWW8MMzCIshrtP36gDSRPYOCjv1ex34FqxD2nYjP4,11986
25
+ docling/models/easyocr_model.py,sha256=23hWq484qVS3nkch6nRRWowfQamN-McFZgfbHfp5Vuo,3818
26
+ docling/models/layout_model.py,sha256=ZvbTSyxvXB5yLHNEti0Wv3trz0vwGuHySI5TCdApb0U,14011
27
+ docling/models/page_assemble_model.py,sha256=kSGNiRKhmzkpFH7xCiT3rulMsgJmUXFa6Th_eB-cLEk,7103
28
+ docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
29
+ docling/models/table_structure_model.py,sha256=-ANSQpiN2avt3B9sbi7dHcoULUJbMBalAR5xxlrM7To,8421
30
+ docling/models/tesseract_ocr_cli_model.py,sha256=ZflwQcD7YjhPqEB8bbgNgP14OBD4NNEJefUS8Lbr5X0,6511
31
+ docling/models/tesseract_ocr_model.py,sha256=X9qlzwaTZLtSGXFIZuD7MO6EzFmHl1D-FjktUBko6us,6234
32
+ docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
+ docling/pipeline/base_pipeline.py,sha256=IF1XWYgUGbdB4-teLkmM4Hvg_UNEfPrGuhExMRTUsk8,7168
34
+ docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
35
+ docling/pipeline/standard_pdf_pipeline.py,sha256=h59eA0CLMYuuJoH-0SyCRkYEregNs6i0pa46Ioqf8kU,7947
36
+ docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
+ docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
38
+ docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
39
+ docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
40
+ docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
41
+ docling-2.4.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
42
+ docling-2.4.1.dist-info/METADATA,sha256=gomJT0uGaDrAANMI7fSJv2iUhmk0CcvlfiCP89VwCAo,6530
43
+ docling-2.4.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
44
+ docling-2.4.1.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
45
+ docling-2.4.1.dist-info/RECORD,,
@@ -1,18 +0,0 @@
1
- from pathlib import Path
2
- from typing import Callable, Iterable, List
3
-
4
- from docling.datamodel.base_models import Page
5
- from docling.datamodel.pipeline_options import PipelineOptions
6
-
7
-
8
- class BaseModelPipeline:
9
- def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
10
- self.model_pipe: List[Callable] = []
11
- self.artifacts_path = artifacts_path
12
- self.pipeline_options = pipeline_options
13
-
14
- def apply(self, page_batch: Iterable[Page]) -> Iterable[Page]:
15
- for model in self.model_pipe:
16
- page_batch = model(page_batch)
17
-
18
- yield from page_batch
@@ -1,66 +0,0 @@
1
- from pathlib import Path
2
-
3
- from docling.datamodel.pipeline_options import (
4
- EasyOcrOptions,
5
- PipelineOptions,
6
- TesseractCliOcrOptions,
7
- TesseractOcrOptions,
8
- )
9
- from docling.models.base_ocr_model import BaseOcrModel
10
- from docling.models.easyocr_model import EasyOcrModel
11
- from docling.models.layout_model import LayoutModel
12
- from docling.models.table_structure_model import TableStructureModel
13
- from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
14
- from docling.models.tesseract_ocr_model import TesseractOcrModel
15
- from docling.pipeline.base_model_pipeline import BaseModelPipeline
16
-
17
-
18
- class StandardModelPipeline(BaseModelPipeline):
19
- _layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
20
- _table_model_path = "model_artifacts/tableformer"
21
-
22
- def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
23
- super().__init__(artifacts_path, pipeline_options)
24
-
25
- ocr_model: BaseOcrModel
26
- if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
27
- ocr_model = EasyOcrModel(
28
- enabled=pipeline_options.do_ocr,
29
- options=pipeline_options.ocr_options,
30
- )
31
- elif isinstance(pipeline_options.ocr_options, TesseractCliOcrOptions):
32
- ocr_model = TesseractOcrCliModel(
33
- enabled=pipeline_options.do_ocr,
34
- options=pipeline_options.ocr_options,
35
- )
36
- elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
37
- ocr_model = TesseractOcrModel(
38
- enabled=pipeline_options.do_ocr,
39
- options=pipeline_options.ocr_options,
40
- )
41
- else:
42
- raise RuntimeError(
43
- f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
44
- )
45
-
46
- self.model_pipe = [
47
- # OCR
48
- ocr_model,
49
- # Layout
50
- LayoutModel(
51
- config={
52
- "artifacts_path": artifacts_path
53
- / StandardModelPipeline._layout_model_path
54
- }
55
- ),
56
- # Table structure
57
- TableStructureModel(
58
- config={
59
- "artifacts_path": artifacts_path
60
- / StandardModelPipeline._table_model_path,
61
- "enabled": pipeline_options.do_table_structure,
62
- "mode": pipeline_options.table_structure_options.mode,
63
- "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
64
- }
65
- ),
66
- ]