docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
docling/models/stages/ocr/rapid_ocr_model.py
@@ -0,0 +1,328 @@
1
+ import logging
2
+ from collections.abc import Iterable
3
+ from pathlib import Path
4
+ from typing import Literal, Optional, Type, TypedDict
5
+
6
+ import numpy
7
+ from docling_core.types.doc import BoundingBox, CoordOrigin
8
+ from docling_core.types.doc.page import BoundingRectangle, TextCell
9
+
10
+ from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
11
+ from docling.datamodel.base_models import Page
12
+ from docling.datamodel.document import ConversionResult
13
+ from docling.datamodel.pipeline_options import (
14
+ OcrOptions,
15
+ RapidOcrOptions,
16
+ )
17
+ from docling.datamodel.settings import settings
18
+ from docling.models.base_ocr_model import BaseOcrModel
19
+ from docling.utils.accelerator_utils import decide_device
20
+ from docling.utils.profiling import TimeRecorder
21
+ from docling.utils.utils import download_url_with_progress
22
+
23
# Module-level logger, named after this module.
_log = logging.getLogger(__name__)

# Engine names used as the outer keys of RapidOcrModel._default_models.
_ModelPathEngines = Literal["onnxruntime", "torch"]
# Artifact kinds used as the inner keys of RapidOcrModel._default_models;
# the names mirror the corresponding path attributes read from the options.
_ModelPathTypes = Literal[
    "det_model_path", "cls_model_path", "rec_model_path", "rec_keys_path", "font_path"
]
29
+
30
+
31
class _ModelPathDetail(TypedDict):
    """Download source and storage location of a single model artifact."""

    # URL the artifact is downloaded from.
    url: str
    # Location of the artifact relative to a base model directory.
    path: str
34
+
35
+
36
class RapidOcrModel(BaseOcrModel):
    """OCR model that runs RapidOCR on the OCR regions of each page."""

    # Sub-folder (below the artifacts/cache directory) that holds the models.
    _model_repo_folder = "RapidOcr"
    # from https://github.com/RapidAI/RapidOCR/blob/main/python/rapidocr/default_models.yaml
    # matching the default config in https://github.com/RapidAI/RapidOCR/blob/main/python/rapidocr/config.yaml
    # and naming f"{file_info.engine_type.value}.{file_info.ocr_version.value}.{file_info.task_type.value}"
    #
    # Layout: engine name -> artifact kind -> {"url": download URL,
    # "path": location relative to the model folder}. Only the engines listed
    # here have default artifacts.
    _default_models: dict[
        _ModelPathEngines, dict[_ModelPathTypes, _ModelPathDetail]
    ] = {
        "onnxruntime": {
            "det_model_path": {
                "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/onnx/PP-OCRv4/det/ch_PP-OCRv4_det_infer.onnx",
                "path": "onnx/PP-OCRv4/det/ch_PP-OCRv4_det_infer.onnx",
            },
            "cls_model_path": {
                "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/onnx/PP-OCRv4/cls/ch_ppocr_mobile_v2.0_cls_infer.onnx",
                "path": "onnx/PP-OCRv4/cls/ch_ppocr_mobile_v2.0_cls_infer.onnx",
            },
            "rec_model_path": {
                "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/onnx/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer.onnx",
                "path": "onnx/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer.onnx",
            },
            # NOTE(review): this URL is pinned to v2.0.7 while every other
            # entry (including the same file under "torch") uses v3.4.0 —
            # confirm whether the older revision is intentional.
            "rec_keys_path": {
                "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v2.0.7/paddle/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer/ppocr_keys_v1.txt",
                "path": "paddle/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer/ppocr_keys_v1.txt",
            },
            "font_path": {
                "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/resources/fonts/FZYTK.TTF",
                "path": "fonts/FZYTK.TTF",
            },
        },
        "torch": {
            "det_model_path": {
                "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/det/ch_PP-OCRv4_det_infer.pth",
                "path": "torch/PP-OCRv4/det/ch_PP-OCRv4_det_infer.pth",
            },
            "cls_model_path": {
                "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/cls/ch_ptocr_mobile_v2.0_cls_infer.pth",
                "path": "torch/PP-OCRv4/cls/ch_ptocr_mobile_v2.0_cls_infer.pth",
            },
            "rec_model_path": {
                "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer.pth",
                "path": "torch/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer.pth",
            },
            "rec_keys_path": {
                "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/paddle/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer/ppocr_keys_v1.txt",
                "path": "paddle/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer/ppocr_keys_v1.txt",
            },
            "font_path": {
                "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/resources/fonts/FZYTK.TTF",
                "path": "fonts/FZYTK.TTF",
            },
        },
    }
89
+
90
def __init__(
    self,
    enabled: bool,
    artifacts_path: Optional[Path],
    options: RapidOcrOptions,
    accelerator_options: AcceleratorOptions,
):
    """Set up the RapidOCR reader for the configured backend and device.

    Args:
        enabled: When false, only the base class is initialised and no
            RapidOCR engine is created.
        artifacts_path: Optional directory with pre-downloaded models. When
            given, every model path that is not set in ``options`` is
            resolved to ``artifacts_path / "RapidOcr" / <default path>``
            for backends that have an entry in ``_default_models``.
        options: RapidOCR options (backend, model paths, text score and
            extra ``rapidocr_params`` that override the computed ones).
        accelerator_options: Device and thread settings forwarded to the
            RapidOCR engines.

    Raises:
        ImportError: If the ``rapidocr`` package cannot be imported.
    """
    super().__init__(
        enabled=enabled,
        artifacts_path=artifacts_path,
        options=options,
        accelerator_options=accelerator_options,
    )
    self.options: RapidOcrOptions

    self.scale = 3  # multiplier for 72 dpi == 216 dpi.

    if not self.enabled:
        return

    try:
        from rapidocr import EngineType, RapidOCR  # type: ignore
    except ImportError as exc:
        raise ImportError(
            "RapidOCR is not installed. Please install it via `pip install rapidocr onnxruntime` to use this OCR engine. "
            "Alternatively, Docling has support for other OCR engines. See the documentation."
        ) from exc

    # Decide the accelerator devices
    device = decide_device(accelerator_options.device)
    use_cuda = str(AcceleratorDevice.CUDA.value).lower() in device
    # use_dml is switched on only when the requested device is AUTO.
    use_dml = accelerator_options.device == AcceleratorDevice.AUTO
    intra_op_num_threads = accelerator_options.num_threads
    gpu_id = 0
    if use_cuda and ":" in device:
        gpu_id = int(device.split(":")[1])

    engine_by_name = {
        "onnxruntime": EngineType.ONNXRUNTIME,
        "openvino": EngineType.OPENVINO,
        "paddle": EngineType.PADDLE,
        "torch": EngineType.TORCH,
    }
    # Unknown backend names fall back to onnxruntime.
    backend_enum = engine_by_name.get(self.options.backend, EngineType.ONNXRUNTIME)

    # Paths explicitly configured by the user always take precedence.
    model_paths = {
        "det_model_path": self.options.det_model_path,
        "cls_model_path": self.options.cls_model_path,
        "rec_model_path": self.options.rec_model_path,
        "rec_keys_path": self.options.rec_keys_path,
        "font_path": self.options.font_path,
    }
    if artifacts_path is not None:
        # Not every backend has a default model table (only the keys of
        # _default_models do); look it up without raising a KeyError.
        defaults = self._default_models.get(backend_enum.value, {})
        repo_dir = artifacts_path / self._model_repo_folder
        for key, configured in list(model_paths.items()):
            if configured or key not in defaults:
                continue
            model_paths[key] = repo_dir / defaults[key]["path"]
        if not defaults and not all(model_paths.values()):
            _log.warning(
                "No default model files are known for the RapidOCR backend '%s'; "
                "unset model paths are passed to RapidOCR as None.",
                self.options.backend,
            )

    # Warn about every configured/resolved path that does not exist,
    # including the detection model.
    for model_path in model_paths.values():
        if model_path is None:
            continue
        if not Path(model_path).exists():
            _log.warning(f"The provided model path {model_path} is not found.")

    font_path = model_paths["font_path"]
    params = {
        # Global settings (these are still correct)
        "Global.text_score": self.options.text_score,
        "Global.font_path": font_path,
        # "Global.verbose": self.options.print_verbose,
        # Detection model settings
        "Det.model_path": model_paths["det_model_path"],
        "Det.use_cuda": use_cuda,
        "Det.use_dml": use_dml,
        "Det.intra_op_num_threads": intra_op_num_threads,
        # Classification model settings
        "Cls.model_path": model_paths["cls_model_path"],
        "Cls.use_cuda": use_cuda,
        "Cls.use_dml": use_dml,
        "Cls.intra_op_num_threads": intra_op_num_threads,
        # Recognition model settings
        "Rec.model_path": model_paths["rec_model_path"],
        "Rec.font_path": font_path,
        "Rec.rec_keys_path": model_paths["rec_keys_path"],
        "Rec.use_cuda": use_cuda,
        "Rec.use_dml": use_dml,
        "Rec.intra_op_num_threads": intra_op_num_threads,
        "Det.engine_type": backend_enum,
        "Cls.engine_type": backend_enum,
        "Rec.engine_type": backend_enum,
        "EngineConfig.paddle.use_cuda": use_cuda,
        "EngineConfig.paddle.gpu_id": gpu_id,
        "EngineConfig.torch.use_cuda": use_cuda,
        "EngineConfig.torch.gpu_id": gpu_id,
    }

    if self.options.rec_font_path is not None:
        _log.warning(
            "The 'rec_font_path' option for RapidOCR is deprecated. Please use 'font_path' instead."
        )
    # User-supplied parameters win over everything computed above.
    user_params = self.options.rapidocr_params
    if user_params:
        _log.debug("Overwriting RapidOCR params with user-provided values.")
        params.update(user_params)

    self.reader = RapidOCR(
        params=params,
    )
224
+
225
@staticmethod
def download_models(
    backend: _ModelPathEngines,
    local_dir: Optional[Path] = None,
    force: bool = False,
    progress: bool = False,
) -> Path:
    """Download the default model artifacts of ``backend`` into ``local_dir``.

    Args:
        backend: Engine whose entry in ``_default_models`` is downloaded.
        local_dir: Target directory; defaults to
            ``settings.cache_dir / "models" / "RapidOcr"``.
        force: Re-download artifacts even if the target file exists.
        progress: Forwarded to ``download_url_with_progress``.

    Returns:
        The directory the artifacts were stored in.
    """
    if local_dir is None:
        local_dir = settings.cache_dir / "models" / RapidOcrModel._model_repo_folder

    local_dir.mkdir(parents=True, exist_ok=True)

    # Download models
    for model_details in RapidOcrModel._default_models[backend].values():
        output_path = local_dir / model_details["path"]
        if output_path.exists() and not force:
            continue
        output_path.parent.mkdir(exist_ok=True, parents=True)
        buf = download_url_with_progress(model_details["url"], progress=progress)
        # Write to a sibling file first and rename afterwards, so an
        # interrupted write never leaves a truncated model that a later run
        # would treat as "already downloaded".
        partial_path = output_path.with_name(output_path.name + ".part")
        try:
            with partial_path.open("wb") as fw:
                fw.write(buf.read())
            partial_path.replace(output_path)
        finally:
            # No-op after a successful rename; cleans up after a failure.
            partial_path.unlink(missing_ok=True)

    return local_dir
248
+
249
def __call__(
    self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
    """Run RapidOCR over the OCR regions of each page and yield the pages.

    Pages are passed through untouched when the model is disabled or when
    the page backend is not valid. Otherwise every non-empty OCR rectangle
    is rendered at ``self.scale``, recognised, and the resulting boxes are
    scaled back and shifted by the rectangle origin before being handed to
    ``post_process_cells``.
    """
    if not self.enabled:
        yield from page_batch
        return

    for page in page_batch:
        assert page._backend is not None
        if not page._backend.is_valid():
            yield page
            continue

        with TimeRecorder(conv_res, "ocr"):
            ocr_rects = self.get_ocr_rects(page)

            all_ocr_cells = []
            for ocr_rect in ocr_rects:
                # Zero-area rectangles carry nothing to recognise.
                if ocr_rect.area() == 0:
                    continue

                crop = page._backend.get_page_image(
                    scale=self.scale, cropbox=ocr_rect
                )
                pixels = numpy.array(crop)
                ocr_output = self.reader(
                    pixels,
                    use_det=self.options.use_det,
                    use_cls=self.options.use_cls,
                    use_rec=self.options.use_rec,
                )
                if ocr_output is None or ocr_output.boxes is None:
                    _log.warning("RapidOCR returned empty result!")
                    continue
                detections = list(
                    zip(ocr_output.boxes.tolist(), ocr_output.txts, ocr_output.scores)
                )

                # The rendered crop is no longer needed once the results
                # have been copied out.
                del crop
                del pixels

                cells = []
                for ix, detection in enumerate(detections):
                    quad, text, score = detection[0], detection[1], detection[2]
                    # Points 0 and 2 of the box are used as the two corners
                    # of the bounding box.
                    first_corner, opposite_corner = quad[0], quad[2]
                    bbox = BoundingBox.from_tuple(
                        coord=(
                            (first_corner[0] / self.scale) + ocr_rect.l,
                            (first_corner[1] / self.scale) + ocr_rect.t,
                            (opposite_corner[0] / self.scale) + ocr_rect.l,
                            (opposite_corner[1] / self.scale) + ocr_rect.t,
                        ),
                        origin=CoordOrigin.TOPLEFT,
                    )
                    cells.append(
                        TextCell(
                            index=ix,
                            text=text,
                            orig=text,
                            confidence=score,
                            from_ocr=True,
                            rect=BoundingRectangle.from_bounding_box(bbox),
                        )
                    )
                all_ocr_cells.extend(cells)

            # Post-process the cells
            self.post_process_cells(all_ocr_cells, page)

        # DEBUG code:
        if settings.debug.visualize_ocr:
            self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)

        yield page
325
+
326
@classmethod
def get_options_type(cls) -> Type[OcrOptions]:
    """Return the options class associated with this OCR model."""
    return RapidOcrOptions
docling/models/stages/ocr/tesseract_ocr_cli_model.py
@@ -0,0 +1,331 @@
1
+ import csv
2
+ import io
3
+ import logging
4
+ import os
5
+ import subprocess
6
+ import tempfile
7
+ from collections.abc import Iterable
8
+ from pathlib import Path
9
+ from subprocess import DEVNULL, PIPE, Popen
10
+ from typing import List, Optional, Tuple, Type
11
+
12
+ import pandas as pd
13
+ from docling_core.types.doc import BoundingBox, CoordOrigin
14
+ from docling_core.types.doc.page import TextCell
15
+
16
+ from docling.datamodel.accelerator_options import AcceleratorOptions
17
+ from docling.datamodel.base_models import Page
18
+ from docling.datamodel.document import ConversionResult
19
+ from docling.datamodel.pipeline_options import (
20
+ OcrOptions,
21
+ TesseractCliOcrOptions,
22
+ )
23
+ from docling.datamodel.settings import settings
24
+ from docling.models.base_ocr_model import BaseOcrModel
25
+ from docling.utils.ocr_utils import (
26
+ map_tesseract_script,
27
+ parse_tesseract_orientation,
28
+ tesseract_box_to_bounding_rectangle,
29
+ )
30
+ from docling.utils.profiling import TimeRecorder
31
+
32
+ _log = logging.getLogger(__name__)
33
+
34
+
35
+ class TesseractOcrCliModel(BaseOcrModel):
36
def __init__(
    self,
    enabled: bool,
    artifacts_path: Optional[Path],
    options: TesseractCliOcrOptions,
    accelerator_options: AcceleratorOptions,
):
    """Initialise the model and, when enabled, probe the tesseract binary.

    Args:
        enabled: When false, the tesseract binary is not queried at all.
        artifacts_path: Forwarded to the base class.
        options: Tesseract CLI options (command, languages, tessdata path,
            page segmentation mode).
        accelerator_options: Forwarded to the base class.

    Raises:
        RuntimeError: If querying the tesseract version or its installed
            languages fails for any reason; the original exception is
            attached as the cause.
    """
    super().__init__(
        enabled=enabled,
        artifacts_path=artifacts_path,
        options=options,
        accelerator_options=accelerator_options,
    )
    self.options: TesseractCliOcrOptions

    self.scale = 3  # multiplier for 72 dpi == 216 dpi.

    # Lazily filled by _get_name_and_version / _set_languages_and_prefix.
    self._name: Optional[str] = None
    self._version: Optional[str] = None
    self._tesseract_languages: Optional[List[str]] = None
    self._script_prefix: Optional[str] = None
    # "auto" in the language list enables per-image language detection.
    self._is_auto: bool = "auto" in self.options.lang

    if not self.enabled:
        return

    try:
        self._get_name_and_version()
        self._set_languages_and_prefix()
    except Exception as exc:
        # Chain explicitly so the traceback shows the real failure as the
        # direct cause of this error.
        raise RuntimeError(
            f"Tesseract is not available, aborting: {exc} "
            "Install tesseract on your system and the tesseract binary is discoverable. "
            "The actual command for Tesseract can be specified in `pipeline_options.ocr_options.tesseract_cmd='tesseract'`. "
            "Alternatively, Docling has support for other OCR engines. See the documentation."
        ) from exc
71
+
72
def _get_name_and_version(self) -> Tuple[str, str]:
    """Return the program name and version reported by ``<cmd> --version``.

    The result is cached on the instance. Only the first two
    whitespace-separated tokens of the first output line are used, so
    banners with trailing text do not raise. When no usable output is
    produced the placeholders ``"tesseract"`` / ``"XXX"`` are used.

    Returns:
        A ``(name, version)`` tuple.
    """
    if self._name is not None and self._version is not None:
        return self._name, self._version  # type: ignore

    cmd = [self.options.tesseract_cmd, "--version"]

    # The exit status is deliberately not checked, only the output is used.
    completed = subprocess.run(cmd, stdout=PIPE, stderr=PIPE)

    # HACK: Windows versions of Tesseract output the version to stdout, Linux versions
    # to stderr, so check both.
    banner = (
        completed.stdout.decode("utf8").strip()
        or completed.stderr.decode("utf8").strip()
    )
    tokens = banner.split("\n")[0].split()

    # If everything else fails...
    name = tokens[0] if tokens else "tesseract"
    version = tokens[1] if len(tokens) > 1 else "XXX"

    self._name = name
    self._version = version

    return name, version
101
+
102
def _run_tesseract(self, ifilename: str, osd: Optional[pd.DataFrame]):
    r"""
    Run the tesseract CLI on an image and return the recognised words.

    Args:
        ifilename: Path of the image file to process.
        osd: Result of ``_perform_osd`` for the same image, or ``None``.
            Only used in auto-language mode to pick the ``-l`` argument.

    Returns:
        The rows of tesseract's TSV output whose ``text`` field is
        present and not blank.

    Raises:
        subprocess.CalledProcessError: If tesseract exits with a non-zero
            status; its ``stderr`` attribute holds the captured output.
    """
    cmd = [self.options.tesseract_cmd]
    if self._is_auto and osd is not None:
        lang = self._parse_language(osd)
        if lang is not None:
            cmd.extend(["-l", lang])
    elif self.options.lang is not None and len(self.options.lang) > 0:
        cmd.extend(["-l", "+".join(self.options.lang)])

    if self.options.path is not None:
        cmd.extend(["--tessdata-dir", self.options.path])

    # Add PSM option if specified in the configuration
    if self.options.psm is not None:
        cmd.extend(["--psm", str(self.options.psm)])

    cmd += [ifilename, "stdout", "tsv"]
    _log.info("command: %s", " ".join(cmd))

    # Capture stderr instead of discarding it, so that a failing run
    # carries tesseract's own message in CalledProcessError.stderr.
    output = subprocess.run(cmd, stdout=PIPE, stderr=PIPE, check=True)

    # Decode the byte string to a regular string
    decoded_data = output.stdout.decode("utf-8")

    # Read the TSV produced by Tesseract. keep_default_na=False keeps
    # recognised words such as "NA", "null" or "nan" as text instead of
    # turning them into missing values that the filter below would drop.
    df_result = pd.read_csv(
        io.StringIO(decoded_data),
        quoting=csv.QUOTE_NONE,
        sep="\t",
        keep_default_na=False,
    )

    # Filter rows that contain actual text (ignore header or empty rows)
    text_column = df_result["text"]
    has_text = text_column.notna() & (text_column.apply(str).str.strip() != "")

    return df_result[has_text]
149
+
150
def _perform_osd(self, ifilename: str) -> pd.DataFrame:
    r"""
    Run tesseract with ``--psm 0 -l osd`` on an image.

    The ``key: value`` lines printed by tesseract are returned as a
    two-column dataframe (``key``, ``value``). A non-zero exit status
    raises ``subprocess.CalledProcessError``.
    """
    osd_cmd = [
        self.options.tesseract_cmd,
        "--psm",
        "0",
        "-l",
        "osd",
        ifilename,
        "stdout",
    ]
    _log.info("command: {}".format(" ".join(osd_cmd)))

    completed = subprocess.run(osd_cmd, capture_output=True, check=True)
    osd_report = io.StringIO(completed.stdout.decode("utf-8"))

    # Each output line is split at ":" into the two named columns.
    return pd.read_csv(osd_report, sep=":", header=None, names=["key", "value"])
164
+
165
def _parse_language(self, df_osd: pd.DataFrame) -> Optional[str]:
    """Derive the tesseract language to use from an OSD result.

    Takes the first ``Script`` entry of ``df_osd``, maps it with
    ``map_tesseract_script`` and prepends the script prefix. Returns
    ``None`` (after logging a warning) when no script was reported or when
    the resulting language is not among the installed ones.
    """
    assert self._tesseract_languages is not None

    script_rows = df_osd.loc[df_osd["key"] == "Script"]
    detected_scripts = script_rows["value"].tolist()
    if not detected_scripts:
        _log.warning("Tesseract cannot detect the script of the page")
        return None

    script = map_tesseract_script(detected_scripts[0].strip())
    lang = f"{self._script_prefix}{script}"

    # Only languages that are actually installed can be used.
    if lang in self._tesseract_languages:
        _log.debug(
            f"Using tesseract model for the detected script '{script}' and language '{lang}'"
        )
        return lang

    _log.warning(
        f"Tesseract detected the script '{script}' and language '{lang}'."
        " However this language is not installed in your system and will be ignored."
    )
    return None
186
+
187
def _set_languages_and_prefix(self):
    r"""
    Query ``<cmd> --list-langs`` and store the result on the instance.

    Sets ``self._tesseract_languages`` to the listed entries (the first
    output line is skipped) and ``self._script_prefix`` to ``"script/"``
    when any listed language starts with that prefix, otherwise to ``""``.
    """
    list_cmd = [self.options.tesseract_cmd, "--list-langs"]
    _log.info("command: {}".format(" ".join(list_cmd)))

    completed = subprocess.run(list_cmd, stdout=PIPE, stderr=DEVNULL, check=True)
    listing = pd.read_csv(io.StringIO(completed.stdout.decode("utf-8")), header=None)

    # Drop the first row — presumably the banner line printed before the
    # language names.
    installed = listing[0].tolist()[1:]
    self._tesseract_languages = installed

    # Decide the script prefix
    uses_script_folder = any(lang.startswith("script/") for lang in installed)
    self._script_prefix = "script/" if uses_script_folder else ""
207
+
208
def __call__(
    self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
    """Run the tesseract CLI over the OCR regions of each page.

    Pages are passed through untouched when the model is disabled or the
    page backend is not valid. For every non-empty OCR rectangle the region
    is rendered to a temporary PNG, its orientation is detected (OSD), the
    image is rotated upright if needed, and tesseract's TSV rows are
    converted into text cells. The temporary file is always removed.

    OSD or OCR failures are logged and the affected rectangle is skipped;
    an OSD failure only skips the rectangle in auto-language mode.
    """
    if not self.enabled:
        yield from page_batch
        return

    for page_i, page in enumerate(page_batch):
        assert page._backend is not None
        if not page._backend.is_valid():
            yield page
            continue

        with TimeRecorder(conv_res, "ocr"):
            ocr_rects = self.get_ocr_rects(page)

            all_ocr_cells = []
            for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
                # Skip zero area boxes
                if ocr_rect.area() == 0:
                    continue
                high_res_image = page._backend.get_page_image(
                    scale=self.scale, cropbox=ocr_rect
                )
                # Bound before the try block so the cleanup in `finally`
                # never sees an unbound name if creating the file fails.
                fname: Optional[str] = None
                try:
                    # delete=False: the file must outlive the `with` block
                    # so the tesseract subprocess can read it by name.
                    with tempfile.NamedTemporaryFile(
                        suffix=".png", mode="w+b", delete=False
                    ) as image_file:
                        fname = image_file.name
                        high_res_image.save(image_file)
                    doc_orientation = 0
                    df_osd: Optional[pd.DataFrame] = None
                    try:
                        df_osd = self._perform_osd(fname)
                        doc_orientation = _parse_orientation(df_osd)
                    except subprocess.CalledProcessError as exc:
                        _log.error(
                            "OSD failed (doc %s, page: %s, "
                            "OCR rectangle: %s, processed image file %s):\n %s",
                            conv_res.input.file,
                            page_i,
                            ocr_rect_i,
                            fname,  # log the path, not the file object
                            exc.stderr,
                        )
                        # Skipping if OSD fail when in auto mode, otherwise proceed
                        # to OCR in the hope OCR will succeed while OSD failed
                        if self._is_auto:
                            continue
                    if doc_orientation != 0:
                        # Rotate the crop upright and overwrite the file
                        # that tesseract is going to read.
                        high_res_image = high_res_image.rotate(
                            -doc_orientation, expand=True
                        )
                        high_res_image.save(fname)
                    try:
                        df_result = self._run_tesseract(fname, df_osd)
                    except subprocess.CalledProcessError as exc:
                        _log.error(
                            "tesseract OCR failed (doc %s, page: %s, "
                            "OCR rectangle: %s, processed image file %s):\n %s",
                            conv_res.input.file,
                            page_i,
                            ocr_rect_i,
                            fname,  # log the path, not the file object
                            exc.stderr,
                        )
                        continue
                finally:
                    if fname is not None and os.path.exists(fname):
                        os.remove(fname)

                # Convert every TSV row (bounding box and text) to a cell.
                for ix, row in df_result.iterrows():
                    text = row["text"]
                    conf = row["conf"]

                    left, top = float(row["left"]), float(row["top"])
                    right = left + float(row["width"])
                    bottom = top + float(row["height"])
                    bbox = BoundingBox(
                        l=left,
                        t=top,
                        r=right,
                        b=bottom,
                        coord_origin=CoordOrigin.TOPLEFT,
                    )
                    rect = tesseract_box_to_bounding_rectangle(
                        bbox,
                        original_offset=ocr_rect,
                        scale=self.scale,
                        orientation=doc_orientation,
                        im_size=high_res_image.size,
                    )
                    cell = TextCell(
                        index=ix,
                        text=str(text),
                        orig=str(text),
                        from_ocr=True,
                        # The TSV confidence is divided by 100 before use.
                        confidence=conf / 100.0,
                        rect=rect,
                    )
                    all_ocr_cells.append(cell)

            # Post-process the cells
            self.post_process_cells(all_ocr_cells, page)

        # DEBUG code:
        if settings.debug.visualize_ocr:
            self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)

        yield page
320
+
321
@classmethod
def get_options_type(cls) -> Type[OcrOptions]:
    """Return the options class associated with this OCR model."""
    return TesseractCliOcrOptions
324
+
325
+
326
def _parse_orientation(df_osd: pd.DataFrame) -> int:
    """Extract the page orientation from an OSD result dataframe.

    Looks up the first row whose ``key`` is ``"Orientation in degrees"``
    and passes its stripped ``value`` to ``parse_tesseract_orientation``.
    Raises ``IndexError`` when no such row exists.
    """
    # Plain numpy arrays are used for the lookup; the dataframe layout
    # (columns "key" and "value") is assumed to be fixed.
    keys = df_osd["key"].to_numpy()
    values = df_osd["value"].to_numpy()
    raw_orientation = values[keys == "Orientation in degrees"][0]
    return parse_tesseract_orientation(raw_orientation.strip())