docling-2.69.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from collections.abc import Iterable
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Literal, Optional, Type, TypedDict
|
|
5
|
+
|
|
6
|
+
import numpy
|
|
7
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
|
8
|
+
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
|
9
|
+
|
|
10
|
+
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
|
11
|
+
from docling.datamodel.base_models import Page
|
|
12
|
+
from docling.datamodel.document import ConversionResult
|
|
13
|
+
from docling.datamodel.pipeline_options import (
|
|
14
|
+
OcrOptions,
|
|
15
|
+
RapidOcrOptions,
|
|
16
|
+
)
|
|
17
|
+
from docling.datamodel.settings import settings
|
|
18
|
+
from docling.models.base_ocr_model import BaseOcrModel
|
|
19
|
+
from docling.utils.accelerator_utils import decide_device
|
|
20
|
+
from docling.utils.profiling import TimeRecorder
|
|
21
|
+
from docling.utils.utils import download_url_with_progress
|
|
22
|
+
|
|
23
|
+
_log = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
# Engine names accepted as keys of a default-model table.
_ModelPathEngines = Literal["onnxruntime", "torch"]

# Names of the individual files (models, key list, font) tracked per engine.
_ModelPathTypes = Literal[
    "det_model_path",
    "cls_model_path",
    "rec_model_path",
    "rec_keys_path",
    "font_path",
]


class _ModelPathDetail(TypedDict):
    """Describe where one model file comes from and where it is stored."""

    # Address the file is downloaded from.
    url: str
    # Location of the file relative to the local model folder.
    path: str
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class RapidOcrModel(BaseOcrModel):
    """OCR model that recognizes page text with the RapidOCR engine.

    The class carries a table of default model files for the ``onnxruntime``
    and ``torch`` engines, can download those files via
    :meth:`download_models`, and, when called on a batch of pages, renders
    each OCR rectangle of a page, runs RapidOCR on it and converts the
    detections into ``TextCell`` objects.
    """

    _model_repo_folder = "RapidOcr"
    # from https://github.com/RapidAI/RapidOCR/blob/main/python/rapidocr/default_models.yaml
    # matching the default config in https://github.com/RapidAI/RapidOCR/blob/main/python/rapidocr/config.yaml
    # and naming f"{file_info.engine_type.value}.{file_info.ocr_version.value}.{file_info.task_type.value}"
    _default_models: dict[
        _ModelPathEngines, dict[_ModelPathTypes, _ModelPathDetail]
    ] = {
        "onnxruntime": {
            "det_model_path": {
                "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/onnx/PP-OCRv4/det/ch_PP-OCRv4_det_infer.onnx",
                "path": "onnx/PP-OCRv4/det/ch_PP-OCRv4_det_infer.onnx",
            },
            "cls_model_path": {
                "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/onnx/PP-OCRv4/cls/ch_ppocr_mobile_v2.0_cls_infer.onnx",
                "path": "onnx/PP-OCRv4/cls/ch_ppocr_mobile_v2.0_cls_infer.onnx",
            },
            "rec_model_path": {
                "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/onnx/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer.onnx",
                "path": "onnx/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer.onnx",
            },
            # NOTE(review): this URL is pinned to v2.0.7 while every other
            # entry uses v3.4.0 -- confirm this is intentional.
            "rec_keys_path": {
                "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v2.0.7/paddle/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer/ppocr_keys_v1.txt",
                "path": "paddle/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer/ppocr_keys_v1.txt",
            },
            "font_path": {
                "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/resources/fonts/FZYTK.TTF",
                "path": "fonts/FZYTK.TTF",
            },
        },
        "torch": {
            "det_model_path": {
                "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/det/ch_PP-OCRv4_det_infer.pth",
                "path": "torch/PP-OCRv4/det/ch_PP-OCRv4_det_infer.pth",
            },
            "cls_model_path": {
                "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/cls/ch_ptocr_mobile_v2.0_cls_infer.pth",
                "path": "torch/PP-OCRv4/cls/ch_ptocr_mobile_v2.0_cls_infer.pth",
            },
            "rec_model_path": {
                "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer.pth",
                "path": "torch/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer.pth",
            },
            "rec_keys_path": {
                "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/paddle/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer/ppocr_keys_v1.txt",
                "path": "paddle/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer/ppocr_keys_v1.txt",
            },
            "font_path": {
                "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/resources/fonts/FZYTK.TTF",
                "path": "fonts/FZYTK.TTF",
            },
        },
    }

    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        options: RapidOcrOptions,
        accelerator_options: AcceleratorOptions,
    ):
        """Configure the model and, when enabled, build the RapidOCR reader.

        Args:
            enabled: When false, no engine is created and ``__call__`` passes
                pages through untouched.
            artifacts_path: Optional root folder of local model files. When
                given, every model path not set in ``options`` falls back to
                the default file below ``artifacts_path / "RapidOcr"``.
            options: RapidOCR options (backend, model paths, score
                threshold, extra ``rapidocr_params``).
            accelerator_options: Device and thread settings forwarded to the
                engine parameters.

        Raises:
            ImportError: If the model is enabled but ``rapidocr`` cannot be
                imported.
        """
        super().__init__(
            enabled=enabled,
            artifacts_path=artifacts_path,
            options=options,
            accelerator_options=accelerator_options,
        )
        self.options: RapidOcrOptions

        self.scale = 3  # multiplier for 72 dpi == 216 dpi.

        if self.enabled:
            try:
                from rapidocr import EngineType, RapidOCR  # type: ignore
            except ImportError as exc:
                raise ImportError(
                    "RapidOCR is not installed. Please install it via `pip install rapidocr onnxruntime` to use this OCR engine. "
                    "Alternatively, Docling has support for other OCR engines. See the documentation."
                ) from exc

            # Decide the accelerator devices
            device = decide_device(accelerator_options.device)
            use_cuda = str(AcceleratorDevice.CUDA.value).lower() in device
            use_dml = accelerator_options.device == AcceleratorDevice.AUTO
            intra_op_num_threads = accelerator_options.num_threads
            gpu_id = 0
            if use_cuda and ":" in device:
                gpu_id = int(device.split(":")[1])

            # Unknown backend names fall back to ONNX Runtime.
            engine_aliases = {
                "onnxruntime": EngineType.ONNXRUNTIME,
                "openvino": EngineType.OPENVINO,
                "paddle": EngineType.PADDLE,
                "torch": EngineType.TORCH,
            }
            backend_enum = engine_aliases.get(
                self.options.backend, EngineType.ONNXRUNTIME
            )

            # Start from the user-provided paths; insertion order is kept so
            # the existence warnings below come out in a stable order.
            model_paths = {
                "det_model_path": self.options.det_model_path,
                "cls_model_path": self.options.cls_model_path,
                "rec_model_path": self.options.rec_model_path,
                "rec_keys_path": self.options.rec_keys_path,
                "font_path": self.options.font_path,
            }
            if artifacts_path is not None:
                repo_dir = artifacts_path / self._model_repo_folder
                for path_key, configured in model_paths.items():
                    # The default table is only consulted for paths the user
                    # left unset, exactly like the original `a or b` form.
                    # NOTE(review): the table has no "openvino"/"paddle"
                    # entries, so this lookup raises KeyError for those
                    # backends when a path is unset -- confirm intended.
                    if not configured:
                        model_paths[path_key] = (
                            repo_dir
                            / self._default_models[backend_enum.value][path_key][
                                "path"
                            ]
                        )

            # Warn about every configured file that is missing on disk. The
            # detection model is included here; previously `rec_keys_path`
            # was listed twice and `det_model_path` was never checked.
            for model_path in model_paths.values():
                if model_path is None:
                    continue
                if not Path(model_path).exists():
                    _log.warning(f"The provided model path {model_path} is not found.")

            font_path = model_paths["font_path"]
            params = {
                # Global settings (these are still correct)
                "Global.text_score": self.options.text_score,
                "Global.font_path": font_path,
                # "Global.verbose": self.options.print_verbose,
                # Detection model settings
                "Det.model_path": model_paths["det_model_path"],
                "Det.use_cuda": use_cuda,
                "Det.use_dml": use_dml,
                "Det.intra_op_num_threads": intra_op_num_threads,
                # Classification model settings
                "Cls.model_path": model_paths["cls_model_path"],
                "Cls.use_cuda": use_cuda,
                "Cls.use_dml": use_dml,
                "Cls.intra_op_num_threads": intra_op_num_threads,
                # Recognition model settings
                "Rec.model_path": model_paths["rec_model_path"],
                "Rec.font_path": font_path,
                "Rec.rec_keys_path": model_paths["rec_keys_path"],
                "Rec.use_cuda": use_cuda,
                "Rec.use_dml": use_dml,
                "Rec.intra_op_num_threads": intra_op_num_threads,
                "Det.engine_type": backend_enum,
                "Cls.engine_type": backend_enum,
                "Rec.engine_type": backend_enum,
                "EngineConfig.paddle.use_cuda": use_cuda,
                "EngineConfig.paddle.gpu_id": gpu_id,
                "EngineConfig.torch.use_cuda": use_cuda,
                "EngineConfig.torch.gpu_id": gpu_id,
            }

            if self.options.rec_font_path is not None:
                _log.warning(
                    "The 'rec_font_path' option for RapidOCR is deprecated. Please use 'font_path' instead."
                )

            # User-supplied parameters win over everything computed above.
            user_params = self.options.rapidocr_params
            if user_params:
                _log.debug("Overwriting RapidOCR params with user-provided values.")
                params.update(user_params)

            self.reader = RapidOCR(
                params=params,
            )

    @staticmethod
    def download_models(
        backend: _ModelPathEngines,
        local_dir: Optional[Path] = None,
        force: bool = False,
        progress: bool = False,
    ) -> Path:
        """Download the default model files of ``backend`` into ``local_dir``.

        Args:
            backend: Engine whose entries of ``_default_models`` are fetched.
            local_dir: Target folder; defaults to
                ``settings.cache_dir / "models" / "RapidOcr"``.
            force: Re-download files that already exist locally.
            progress: Forwarded to ``download_url_with_progress``.

        Returns:
            The folder the files were written to.
        """
        if local_dir is None:
            local_dir = settings.cache_dir / "models" / RapidOcrModel._model_repo_folder

        local_dir.mkdir(parents=True, exist_ok=True)

        # Download models
        for model_details in RapidOcrModel._default_models[backend].values():
            output_path = local_dir / model_details["path"]
            if output_path.exists() and not force:
                continue
            output_path.parent.mkdir(exist_ok=True, parents=True)
            buf = download_url_with_progress(model_details["url"], progress=progress)
            with output_path.open("wb") as fw:
                fw.write(buf.read())

        return local_dir

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        """Run OCR on every valid page of ``page_batch`` and yield the pages.

        Pages are yielded unchanged when the model is disabled or when the
        page backend reports itself as invalid. For the remaining pages each
        non-empty OCR rectangle is rendered at ``self.scale``, passed to
        RapidOCR, and the detections are handed to ``post_process_cells``.
        """
        if not self.enabled:
            yield from page_batch
            return

        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
            else:
                with TimeRecorder(conv_res, "ocr"):
                    ocr_rects = self.get_ocr_rects(page)

                    all_ocr_cells = []
                    for ocr_rect in ocr_rects:
                        # Skip zero area boxes
                        if ocr_rect.area() == 0:
                            continue
                        high_res_image = page._backend.get_page_image(
                            scale=self.scale, cropbox=ocr_rect
                        )
                        im = numpy.array(high_res_image)
                        result = self.reader(
                            im,
                            use_det=self.options.use_det,
                            use_cls=self.options.use_cls,
                            use_rec=self.options.use_rec,
                        )
                        if result is None or result.boxes is None:
                            _log.warning("RapidOCR returned empty result!")
                            continue
                        ocr_lines = list(
                            zip(result.boxes.tolist(), result.txts, result.scores)
                        )

                        # Release the rendered image before building cells.
                        del high_res_image
                        del im

                        # Each box is a list of corner points; corner 0 and
                        # corner 2 are used as the two opposite corners.
                        # Coordinates are scaled back from the rendered image
                        # and shifted by the rectangle's offset on the page.
                        cells = [
                            TextCell(
                                index=ix,
                                text=text,
                                orig=text,
                                confidence=score,
                                from_ocr=True,
                                rect=BoundingRectangle.from_bounding_box(
                                    BoundingBox.from_tuple(
                                        coord=(
                                            (box[0][0] / self.scale) + ocr_rect.l,
                                            (box[0][1] / self.scale) + ocr_rect.t,
                                            (box[2][0] / self.scale) + ocr_rect.l,
                                            (box[2][1] / self.scale) + ocr_rect.t,
                                        ),
                                        origin=CoordOrigin.TOPLEFT,
                                    )
                                ),
                            )
                            for ix, (box, text, score) in enumerate(ocr_lines)
                        ]
                        all_ocr_cells.extend(cells)

                    # Post-process the cells
                    self.post_process_cells(all_ocr_cells, page)

                # DEBUG code:
                if settings.debug.visualize_ocr:
                    self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)

                yield page

    @classmethod
    def get_options_type(cls) -> Type[OcrOptions]:
        """Return the options class this model is configured with."""
        return RapidOcrOptions
|
|
@@ -0,0 +1,331 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import io
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import subprocess
|
|
6
|
+
import tempfile
|
|
7
|
+
from collections.abc import Iterable
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from subprocess import DEVNULL, PIPE, Popen
|
|
10
|
+
from typing import List, Optional, Tuple, Type
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
|
14
|
+
from docling_core.types.doc.page import TextCell
|
|
15
|
+
|
|
16
|
+
from docling.datamodel.accelerator_options import AcceleratorOptions
|
|
17
|
+
from docling.datamodel.base_models import Page
|
|
18
|
+
from docling.datamodel.document import ConversionResult
|
|
19
|
+
from docling.datamodel.pipeline_options import (
|
|
20
|
+
OcrOptions,
|
|
21
|
+
TesseractCliOcrOptions,
|
|
22
|
+
)
|
|
23
|
+
from docling.datamodel.settings import settings
|
|
24
|
+
from docling.models.base_ocr_model import BaseOcrModel
|
|
25
|
+
from docling.utils.ocr_utils import (
|
|
26
|
+
map_tesseract_script,
|
|
27
|
+
parse_tesseract_orientation,
|
|
28
|
+
tesseract_box_to_bounding_rectangle,
|
|
29
|
+
)
|
|
30
|
+
from docling.utils.profiling import TimeRecorder
|
|
31
|
+
|
|
32
|
+
_log = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class TesseractOcrCliModel(BaseOcrModel):
    """OCR model that shells out to the ``tesseract`` command-line tool.

    Each OCR rectangle of a page is rendered to a temporary PNG file, an
    orientation/script detection pass is run on it, and the TSV output of a
    second tesseract run is converted into ``TextCell`` objects.
    """

    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        options: TesseractCliOcrOptions,
        accelerator_options: AcceleratorOptions,
    ):
        """Store the options and, when enabled, probe the tesseract binary.

        Args:
            enabled: When false, tesseract is never invoked and ``__call__``
                passes pages through untouched.
            artifacts_path: Forwarded to the base class.
            options: Tesseract CLI options (command, languages, tessdata
                path, page segmentation mode).
            accelerator_options: Forwarded to the base class.

        Raises:
            RuntimeError: If the model is enabled and querying the version or
                the installed languages fails for any reason.
        """
        super().__init__(
            enabled=enabled,
            artifacts_path=artifacts_path,
            options=options,
            accelerator_options=accelerator_options,
        )
        self.options: TesseractCliOcrOptions

        self.scale = 3  # multiplier for 72 dpi == 216 dpi.

        self._name: Optional[str] = None
        self._version: Optional[str] = None
        self._tesseract_languages: Optional[List[str]] = None
        self._script_prefix: Optional[str] = None
        # "auto" among the languages enables per-image language detection.
        self._is_auto: bool = "auto" in self.options.lang

        if self.enabled:
            try:
                self._get_name_and_version()
                self._set_languages_and_prefix()

            except Exception as exc:
                raise RuntimeError(
                    f"Tesseract is not available, aborting: {exc} "
                    "Install tesseract on your system and the tesseract binary is discoverable. "
                    "The actual command for Tesseract can be specified in `pipeline_options.ocr_options.tesseract_cmd='tesseract'`. "
                    "Alternatively, Docling has support for other OCR engines. See the documentation."
                ) from exc

    def _get_name_and_version(self) -> Tuple[str, str]:
        """Return the program name and version reported by ``--version``.

        The result is cached on the instance after the first call. The first
        output line is split at its first space only, so a version banner
        with extra words no longer fails to unpack; a missing version part
        is reported as ``"XXX"``.
        """
        if self._name is not None and self._version is not None:
            return self._name, self._version  # type: ignore

        cmd = [self.options.tesseract_cmd, "--version"]

        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        # communicate() waits for the process to exit; no extra wait() needed.
        stdout, stderr = proc.communicate()

        # HACK: Windows versions of Tesseract output the version to stdout, Linux versions
        # to stderr, so check both.
        version_line = (
            (stdout.decode("utf8").strip() or stderr.decode("utf8").strip())
            .split("\n")[0]
            .strip()
        )

        # If everything else fails...
        if not version_line:
            version_line = "tesseract XXX"

        name, _, version = version_line.partition(" ")
        version = version.strip() or "XXX"

        self._name = name
        self._version = version

        return name, version

    def _run_tesseract(
        self, ifilename: str, osd: Optional[pd.DataFrame]
    ) -> pd.DataFrame:
        """Run the tesseract CLI on ``ifilename`` and return its text rows.

        Args:
            ifilename: Path of the image file to recognize.
            osd: Result of :meth:`_perform_osd`, used to pick the language
                in auto mode; may be ``None``.

        Returns:
            The TSV output as a DataFrame, restricted to rows whose ``text``
            column is neither missing nor blank.

        Raises:
            subprocess.CalledProcessError: If tesseract exits with a non-zero
                status. stderr is captured so it is available on the
                exception for the caller's error log.
        """
        cmd = [self.options.tesseract_cmd]
        if self._is_auto and osd is not None:
            lang = self._parse_language(osd)
            if lang is not None:
                cmd.append("-l")
                cmd.append(lang)
        elif self.options.lang is not None and len(self.options.lang) > 0:
            cmd.append("-l")
            cmd.append("+".join(self.options.lang))

        if self.options.path is not None:
            cmd.append("--tessdata-dir")
            cmd.append(self.options.path)

        # Add PSM option if specified in the configuration
        if self.options.psm is not None:
            cmd.extend(["--psm", str(self.options.psm)])

        cmd += [ifilename, "stdout", "tsv"]
        _log.info("command: {}".format(" ".join(cmd)))

        # stderr is piped (not discarded) so CalledProcessError.stderr
        # carries tesseract's diagnostics.
        output = subprocess.run(cmd, stdout=PIPE, stderr=PIPE, check=True)

        # Decode the byte string to a regular string
        decoded_data = output.stdout.decode("utf-8")

        # Read the TSV file generated by Tesseract
        df_result = pd.read_csv(
            io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t"
        )

        # Filter rows that contain actual text (ignore header or empty rows)
        df_filtered = df_result[
            df_result["text"].notna() & (df_result["text"].apply(str).str.strip() != "")
        ]

        return df_filtered

    def _perform_osd(self, ifilename: str) -> pd.DataFrame:
        """Run tesseract in PSM 0 mode (orientation and script detection).

        Returns:
            A DataFrame with columns ``key`` and ``value``, obtained by
            splitting each output line at ``:``.

        Raises:
            subprocess.CalledProcessError: If tesseract exits with a non-zero
                status.
        """
        cmd = [self.options.tesseract_cmd]
        cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
        _log.info("command: {}".format(" ".join(cmd)))
        output = subprocess.run(cmd, capture_output=True, check=True)
        decoded_data = output.stdout.decode("utf-8")
        df_detected = pd.read_csv(
            io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
        )
        return df_detected

    def _parse_language(self, df_osd: pd.DataFrame) -> Optional[str]:
        """Derive the tesseract language name from an OSD result.

        Returns:
            The script prefix joined with the mapped script name, or ``None``
            when no ``Script`` row is present or the resulting language is
            not among the installed ones.
        """
        assert self._tesseract_languages is not None
        scripts = df_osd.loc[df_osd["key"] == "Script"].value.tolist()
        if len(scripts) == 0:
            _log.warning("Tesseract cannot detect the script of the page")
            return None

        script = map_tesseract_script(scripts[0].strip())
        lang = f"{self._script_prefix}{script}"

        # Check if the detected language has been installed
        if lang not in self._tesseract_languages:
            msg = f"Tesseract detected the script '{script}' and language '{lang}'."
            msg += " However this language is not installed in your system and will be ignored."
            _log.warning(msg)
            return None

        _log.debug(
            f"Using tesseract model for the detected script '{script}' and language '{lang}'"
        )
        return lang

    def _set_languages_and_prefix(self) -> None:
        """Read the installed tesseract languages and decide the script prefix.

        The first line of the ``--list-langs`` output is dropped and the
        remaining lines are stored as the language list. The prefix is
        ``"script/"`` when any language starts with it, otherwise empty.
        """
        # Get all languages
        cmd = [self.options.tesseract_cmd]
        cmd.append("--list-langs")
        _log.info("command: {}".format(" ".join(cmd)))
        output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
        decoded_data = output.stdout.decode("utf-8")
        df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
        self._tesseract_languages = df_list[0].tolist()[1:]

        # Decide the script prefix
        if any(lang.startswith("script/") for lang in self._tesseract_languages):
            script_prefix = "script/"
        else:
            script_prefix = ""

        self._script_prefix = script_prefix

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        """Run tesseract on every valid page of ``page_batch`` and yield them.

        Pages are yielded unchanged when the model is disabled or when the
        page backend reports itself as invalid. A rectangle is skipped when
        OCR fails, or when OSD fails while in auto-language mode; the
        temporary image file is removed in every case.
        """
        if not self.enabled:
            yield from page_batch
            return

        for page_i, page in enumerate(page_batch):
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
            else:
                with TimeRecorder(conv_res, "ocr"):
                    ocr_rects = self.get_ocr_rects(page)

                    all_ocr_cells = []
                    for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
                        # Skip zero area boxes
                        if ocr_rect.area() == 0:
                            continue
                        high_res_image = page._backend.get_page_image(
                            scale=self.scale, cropbox=ocr_rect
                        )
                        # Bound before the try so the cleanup below never
                        # touches an undefined (or stale) name.
                        fname: Optional[str] = None
                        try:
                            with tempfile.NamedTemporaryFile(
                                suffix=".png", mode="w+b", delete=False
                            ) as image_file:
                                fname = image_file.name
                                high_res_image.save(image_file)
                            doc_orientation = 0
                            df_osd: Optional[pd.DataFrame] = None
                            try:
                                df_osd = self._perform_osd(fname)
                                doc_orientation = _parse_orientation(df_osd)
                            except subprocess.CalledProcessError as exc:
                                _log.error(
                                    "OSD failed (doc %s, page: %s, "
                                    "OCR rectangle: %s, processed image file %s):\n %s",
                                    conv_res.input.file,
                                    page_i,
                                    ocr_rect_i,
                                    fname,
                                    exc.stderr,
                                )
                                # Skipping if OSD fail when in auto mode, otherwise proceed
                                # to OCR in the hope OCR will succeed while OSD failed
                                if self._is_auto:
                                    continue
                            if doc_orientation != 0:
                                # Undo the detected rotation and overwrite the
                                # temporary file with the upright image.
                                high_res_image = high_res_image.rotate(
                                    -doc_orientation, expand=True
                                )
                                high_res_image.save(fname)
                            try:
                                df_result = self._run_tesseract(fname, df_osd)
                            except subprocess.CalledProcessError as exc:
                                _log.error(
                                    "tesseract OCR failed (doc %s, page: %s, "
                                    "OCR rectangle: %s, processed image file %s):\n %s",
                                    conv_res.input.file,
                                    page_i,
                                    ocr_rect_i,
                                    fname,
                                    exc.stderr,
                                )
                                continue
                        finally:
                            if fname is not None and os.path.exists(fname):
                                os.remove(fname)

                        # Convert every recognized row (bounding box and text)
                        for ix, row in df_result.iterrows():
                            text = row["text"]
                            conf = row["conf"]

                            left, top = float(row["left"]), float(row["top"])
                            right = left + float(row["width"])
                            bottom = top + float(row["height"])
                            bbox = BoundingBox(
                                l=left,
                                t=top,
                                r=right,
                                b=bottom,
                                coord_origin=CoordOrigin.TOPLEFT,
                            )
                            rect = tesseract_box_to_bounding_rectangle(
                                bbox,
                                original_offset=ocr_rect,
                                scale=self.scale,
                                orientation=doc_orientation,
                                im_size=high_res_image.size,
                            )
                            cell = TextCell(
                                index=ix,
                                text=str(text),
                                orig=str(text),
                                from_ocr=True,
                                # The confidence column is divided by 100
                                # before being stored on the cell.
                                confidence=conf / 100.0,
                                rect=rect,
                            )
                            all_ocr_cells.append(cell)

                    # Post-process the cells
                    self.post_process_cells(all_ocr_cells, page)

                # DEBUG code:
                if settings.debug.visualize_ocr:
                    self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)

                yield page

    @classmethod
    def get_options_type(cls) -> Type[OcrOptions]:
        """Return the options class this model is configured with."""
        return TesseractCliOcrOptions
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def _parse_orientation(df_osd: pd.DataFrame) -> int:
    """Extract the page orientation from an OSD result table.

    Takes the first ``value`` whose ``key`` equals "Orientation in degrees",
    strips it and hands it to ``parse_tesseract_orientation``. An
    ``IndexError`` is raised when no such row exists.
    """
    # Work on the raw arrays rather than on pandas indexing.
    keys = df_osd["key"].to_numpy()
    values = df_osd["value"].to_numpy()
    matching_values = values[keys == "Orientation in degrees"]
    raw_orientation = matching_values[0]
    return parse_tesseract_orientation(raw_orientation.strip())
|