docling-2.69.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
docling/models/stages/ocr/tesseract_ocr_model.py
@@ -0,0 +1,262 @@
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from typing import Iterable, Optional, Type
+
+from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import TextCell
+
+from docling.datamodel.accelerator_options import AcceleratorOptions
+from docling.datamodel.base_models import Page
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    OcrOptions,
+    TesseractOcrOptions,
+)
+from docling.datamodel.settings import settings
+from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.ocr_utils import (
+    map_tesseract_script,
+    parse_tesseract_orientation,
+    tesseract_box_to_bounding_rectangle,
+)
+from docling.utils.profiling import TimeRecorder
+
+_log = logging.getLogger(__name__)
+
+
+class TesseractOcrModel(BaseOcrModel):
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        options: TesseractOcrOptions,
+        accelerator_options: AcceleratorOptions,
+    ):
+        super().__init__(
+            enabled=enabled,
+            artifacts_path=artifacts_path,
+            options=options,
+            accelerator_options=accelerator_options,
+        )
+        self.options: TesseractOcrOptions
+        self._is_auto: bool = "auto" in self.options.lang
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+        self.reader = None
+        self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
+
+        if self.enabled:
+            install_errmsg = (
+                "tesserocr is not correctly installed. "
+                "Please install it via `pip install tesserocr` to use this OCR engine. "
+                "Note that tesserocr might have to be manually compiled for working with "
+                "your Tesseract installation. The Docling documentation provides examples for it. "
+                "Alternatively, Docling has support for other OCR engines. See the documentation: "
+                "https://docling-project.github.io/docling/installation/"
+            )
+            missing_langs_errmsg = (
+                "tesserocr is not correctly configured. No language models have been detected. "
+                "Please ensure that the TESSDATA_PREFIX envvar points to tesseract languages dir. "
+                "You can find more information how to setup other OCR engines in Docling "
+                "documentation: "
+                "https://docling-project.github.io/docling/installation/"
+            )
+
+            try:
+                import tesserocr
+            except ImportError:
+                raise ImportError(install_errmsg)
+            try:
+                tesseract_version = tesserocr.tesseract_version()
+            except Exception:
+                raise ImportError(install_errmsg)
+
+            _, self._tesserocr_languages = tesserocr.get_languages()
+            if not self._tesserocr_languages:
+                raise ImportError(missing_langs_errmsg)
+
+            # Initialize the tesseractAPI
+            _log.debug("Initializing TesserOCR: %s", tesseract_version)
+            lang = "+".join(self.options.lang)
+
+            if any(lang.startswith("script/") for lang in self._tesserocr_languages):
+                self.script_prefix = "script/"
+            else:
+                self.script_prefix = ""
+
+            tesserocr_kwargs = {
+                "init": True,
+                "oem": tesserocr.OEM.DEFAULT,
+            }
+
+            self.osd_reader = None
+
+            if self.options.path is not None:
+                tesserocr_kwargs["path"] = self.options.path
+
+            # Set main OCR reader with configurable PSM
+            main_psm = (
+                self.options.psm if self.options.psm is not None else tesserocr.PSM.AUTO
+            )
+            if lang == "auto":
+                self.reader = tesserocr.PyTessBaseAPI(psm=main_psm, **tesserocr_kwargs)
+            else:
+                self.reader = tesserocr.PyTessBaseAPI(
+                    lang=lang,
+                    psm=main_psm,
+                    **tesserocr_kwargs,
+                )
+            # OSD reader must use PSM.OSD_ONLY for orientation detection
+            self.osd_reader = tesserocr.PyTessBaseAPI(
+                lang="osd", psm=tesserocr.PSM.OSD_ONLY, **tesserocr_kwargs
+            )
+            self.reader_RIL = tesserocr.RIL
+
+    def __del__(self):
+        if self.reader is not None:
+            # Finalize the tesseractAPI
+            self.reader.End()
+        for script in self.script_readers:
+            self.script_readers[script].End()
+
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        if not self.enabled:
+            yield from page_batch
+            return
+
+        for page_i, page in enumerate(page_batch):
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "ocr"):
+                    assert self.reader is not None
+                    assert self.osd_reader is not None
+                    assert self._tesserocr_languages is not None
+
+                    ocr_rects = self.get_ocr_rects(page)
+
+                    all_ocr_cells = []
+                    for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
+                        # Skip zero area boxes
+                        if ocr_rect.area() == 0:
+                            continue
+                        high_res_image = page._backend.get_page_image(
+                            scale=self.scale, cropbox=ocr_rect
+                        )
+
+                        local_reader = self.reader
+                        self.osd_reader.SetImage(high_res_image)
+
+                        doc_orientation = 0
+                        osd = self.osd_reader.DetectOrientationScript()
+
+                        # No text, or Orientation and Script detection failure
+                        if osd is None:
+                            _log.error(
+                                "OSD failed for doc (doc %s, page: %s, "
+                                "OCR rectangle: %s)",
+                                conv_res.input.file,
+                                page_i,
+                                ocr_rect_i,
+                            )
+                            # Skipping if OSD fail when in auto mode, otherwise proceed
+                            # to OCR in the hope OCR will succeed while OSD failed
+                            if self._is_auto:
+                                continue
+                        else:
+                            doc_orientation = parse_tesseract_orientation(
+                                osd["orient_deg"]
+                            )
+                            if doc_orientation != 0:
+                                high_res_image = high_res_image.rotate(
+                                    -doc_orientation, expand=True
+                                )
+                            if self._is_auto:
+                                script = osd["script_name"]
+                                script = map_tesseract_script(script)
+                                lang = f"{self.script_prefix}{script}"
+
+                                # Check if the detected language is present in the system
+                                if lang not in self._tesserocr_languages:
+                                    msg = f"Tesseract detected the script '{script}' and language '{lang}'."
+                                    msg += " However this language is not installed in your system and will be ignored."
+                                    _log.warning(msg)
+                                else:
+                                    if script not in self.script_readers:
+                                        import tesserocr
+
+                                        self.script_readers[script] = (
+                                            tesserocr.PyTessBaseAPI(
+                                                path=self.reader.GetDatapath(),
+                                                lang=lang,
+                                                psm=self.options.psm
+                                                if self.options.psm is not None
+                                                else tesserocr.PSM.AUTO,
+                                                init=True,
+                                                oem=tesserocr.OEM.DEFAULT,
+                                            )
+                                        )
+                                    local_reader = self.script_readers[script]
+
+                        local_reader.SetImage(high_res_image)
+                        boxes = local_reader.GetComponentImages(
+                            self.reader_RIL.TEXTLINE, True
+                        )
+
+                        cells = []
+                        for ix, (im, box, _, _) in enumerate(boxes):
+                            # Set the area of interest. Tesseract uses Bottom-Left for the origin
+                            local_reader.SetRectangle(
+                                box["x"], box["y"], box["w"], box["h"]
+                            )
+
+                            # Extract text within the bounding box
+                            text = local_reader.GetUTF8Text().strip()
+                            confidence = local_reader.MeanTextConf()
+                            left, top = box["x"], box["y"]
+                            right = left + box["w"]
+                            bottom = top + box["h"]
+                            bbox = BoundingBox(
+                                l=left,
+                                t=top,
+                                r=right,
+                                b=bottom,
+                                coord_origin=CoordOrigin.TOPLEFT,
+                            )
+                            rect = tesseract_box_to_bounding_rectangle(
+                                bbox,
+                                original_offset=ocr_rect,
+                                scale=self.scale,
+                                orientation=doc_orientation,
+                                im_size=high_res_image.size,
+                            )
+                            cells.append(
+                                TextCell(
+                                    index=ix,
+                                    text=text,
+                                    orig=text,
+                                    from_ocr=True,
+                                    confidence=confidence,
+                                    rect=rect,
+                                )
+                            )
+
+                        # del high_res_image
+                        all_ocr_cells.extend(cells)
+
+                    # Post-process the cells
+                    self.post_process_cells(all_ocr_cells, page)
+
+                # DEBUG code:
+                if settings.debug.visualize_ocr:
+                    self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
+
+                yield page
+
+    @classmethod
+    def get_options_type(cls) -> Type[OcrOptions]:
+        return TesseractOcrOptions
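For orientation, below is a minimal, hypothetical usage sketch (not part of the wheel) showing how this OCR stage could be constructed directly, based on the constructor signature in the diff above. It assumes a working tesserocr/Tesseract installation with language packs available; the ConversionResult and Page batch are normally supplied by the standard PDF pipeline, so the actual call is only indicated in a comment.

    # Hypothetical sketch; the keyword values below are illustrative assumptions.
    from docling.datamodel.accelerator_options import AcceleratorOptions
    from docling.datamodel.pipeline_options import TesseractOcrOptions
    from docling.models.stages.ocr.tesseract_ocr_model import TesseractOcrModel

    ocr_stage = TesseractOcrModel(
        enabled=True,
        artifacts_path=None,  # tessdata is located via options.path / TESSDATA_PREFIX
        options=TesseractOcrOptions(lang=["auto"]),  # "auto" enables OSD-based script detection
        accelerator_options=AcceleratorOptions(),
    )
    # pages = ocr_stage(conv_res, page_batch)  # lazily yields pages enriched with OCR TextCells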
docling/models/stages/page_assemble/page_assemble_model.py
@@ -0,0 +1,156 @@
+import logging
+import re
+from collections.abc import Iterable
+from typing import List
+
+import numpy as np
+from pydantic import BaseModel
+
+from docling.datamodel.base_models import (
+    AssembledUnit,
+    ContainerElement,
+    FigureElement,
+    Page,
+    PageElement,
+    Table,
+    TextElement,
+)
+from docling.datamodel.document import ConversionResult
+from docling.models.base_model import BasePageModel
+from docling.models.stages.layout.layout_model import LayoutModel
+from docling.utils.profiling import TimeRecorder
+
+_log = logging.getLogger(__name__)
+
+
+class PageAssembleOptions(BaseModel):
+    pass
+
+
+class PageAssembleModel(BasePageModel):
+    def __init__(self, options: PageAssembleOptions):
+        self.options = options
+
+    def sanitize_text(self, lines):
+        if len(lines) <= 1:
+            return " ".join(lines)
+
+        for ix, line in enumerate(lines[1:]):
+            prev_line = lines[ix]
+
+            if prev_line.endswith("-"):
+                prev_words = re.findall(r"\b[\w]+\b", prev_line)
+                line_words = re.findall(r"\b[\w]+\b", line)
+
+                if (
+                    len(prev_words)
+                    and len(line_words)
+                    and prev_words[-1].isalnum()
+                    and line_words[0].isalnum()
+                ):
+                    lines[ix] = prev_line[:-1]
+            else:
+                lines[ix] += " "
+
+        sanitized_text = "".join(lines)
+
+        # Text normalization
+        sanitized_text = sanitized_text.replace("⁄", "/")  # noqa: RUF001
+        sanitized_text = sanitized_text.replace("’", "'")  # noqa: RUF001
+        sanitized_text = sanitized_text.replace("‘", "'")  # noqa: RUF001
+        sanitized_text = sanitized_text.replace("“", '"')
+        sanitized_text = sanitized_text.replace("”", '"')
+        sanitized_text = sanitized_text.replace("•", "·")
+
+        return sanitized_text.strip()  # Strip any leading or trailing whitespace
+
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        for page in page_batch:
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "page_assemble"):
+                    assert page.predictions.layout is not None
+
+                    # assembles some JSON output page by page.
+
+                    elements: List[PageElement] = []
+                    headers: List[PageElement] = []
+                    body: List[PageElement] = []
+
+                    for cluster in page.predictions.layout.clusters:
+                        # _log.info("Cluster label seen:", cluster.label)
+                        if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
+                            textlines = [
+                                cell.text.replace("\x02", "-").strip()
+                                for cell in cluster.cells
+                                if len(cell.text.strip()) > 0
+                            ]
+                            text = self.sanitize_text(textlines)
+                            text_el = TextElement(
+                                label=cluster.label,
+                                id=cluster.id,
+                                text=text,
+                                page_no=page.page_no,
+                                cluster=cluster,
+                            )
+                            elements.append(text_el)
+
+                            if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
+                                headers.append(text_el)
+                            else:
+                                body.append(text_el)
+                        elif cluster.label in LayoutModel.TABLE_LABELS:
+                            tbl = None
+                            if page.predictions.tablestructure:
+                                tbl = page.predictions.tablestructure.table_map.get(
+                                    cluster.id, None
+                                )
+                            if not tbl:  # fallback: add table without structure, if it isn't present
+                                tbl = Table(
+                                    label=cluster.label,
+                                    id=cluster.id,
+                                    text="",
+                                    otsl_seq=[],
+                                    table_cells=[],
+                                    cluster=cluster,
+                                    page_no=page.page_no,
+                                )
+
+                            elements.append(tbl)
+                            body.append(tbl)
+                        elif cluster.label == LayoutModel.FIGURE_LABEL:
+                            fig = None
+                            if page.predictions.figures_classification:
+                                fig = page.predictions.figures_classification.figure_map.get(
+                                    cluster.id, None
+                                )
+                            if not fig:  # fallback: add figure without classification, if it isn't present
+                                fig = FigureElement(
+                                    label=cluster.label,
+                                    id=cluster.id,
+                                    text="",
+                                    data=None,
+                                    cluster=cluster,
+                                    page_no=page.page_no,
+                                )
+                            elements.append(fig)
+                            body.append(fig)
+                        elif cluster.label in LayoutModel.CONTAINER_LABELS:
+                            container_el = ContainerElement(
+                                label=cluster.label,
+                                id=cluster.id,
+                                page_no=page.page_no,
+                                cluster=cluster,
+                            )
+                            elements.append(container_el)
+                            body.append(container_el)
+
+                    page.assembled = AssembledUnit(
+                        elements=elements, headers=headers, body=body
+                    )
+
+                yield page
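As a quick illustration of the text cleanup performed by this stage, the following hypothetical snippet (not part of the wheel) exercises sanitize_text() from the diff above: a line ending in a hyphen is merged with the next line, and curly quotes are normalized. The expected output is derived directly from the logic shown in the diff.

    # Hypothetical snippet exercising sanitize_text().
    from docling.models.stages.page_assemble.page_assemble_model import (
        PageAssembleModel,
        PageAssembleOptions,
    )

    assembler = PageAssembleModel(options=PageAssembleOptions())
    print(assembler.sanitize_text(["This is informa-", "tion about “results”."]))
    # -> This is information about "results".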
docling/models/stages/page_preprocessing/page_preprocessing_model.py
@@ -0,0 +1,145 @@
+import re
+import warnings
+from collections.abc import Iterable
+from pathlib import Path
+from typing import Literal, Optional
+
+import numpy as np
+from PIL import ImageDraw
+from pydantic import BaseModel
+
+from docling.datamodel.base_models import Page
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.settings import settings
+from docling.models.base_model import BasePageModel
+from docling.utils.profiling import TimeRecorder
+
+
+class PagePreprocessingOptions(BaseModel):
+    images_scale: Optional[float]
+    skip_cell_extraction: bool = (
+        False  # Skip text cell extraction for VLM-only processing
+    )
+
+
+class PagePreprocessingModel(BasePageModel):
+    def __init__(self, options: PagePreprocessingOptions):
+        self.options = options
+
+        # Pre-compiled regex patterns for efficiency
+        self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
+        self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
+        self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
+        self.SLASH_NUMBER_GARBAGE_RE = re.compile(
+            r"(?:/\w+\s*){2,}"
+        )  # Two or more "/token " sequences
+
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        for page in page_batch:
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "page_parse"):
+                    page = self._populate_page_images(page)
+                    if not self.options.skip_cell_extraction:
+                        page = self._parse_page_cells(conv_res, page)
+                yield page
+
+    # Generate the page image and store it in the page object
+    def _populate_page_images(self, page: Page) -> Page:
+        # default scale
+        page.get_image(
+            scale=1.0
+        )  # puts the page image on the image cache at default scale
+
+        images_scale = self.options.images_scale
+        # user requested scales
+        if images_scale is not None:
+            page._default_image_scale = images_scale
+            page.get_image(
+                scale=images_scale
+            )  # this will trigger storing the image in the internal cache
+
+        return page
+
+    # Extract and populate the page cells and store it in the page object
+    def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
+        assert page._backend is not None
+
+        page.parsed_page = page._backend.get_segmented_page()
+        assert page.parsed_page is not None
+
+        # Rate the text quality from the PDF parser, and aggregate on page
+        text_scores = []
+        for c in page.cells:
+            score = self.rate_text_quality(c.text)
+            text_scores.append(score)
+
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore", "Mean of empty slice", RuntimeWarning, "numpy"
+            )
+            conv_res.confidence.pages[page.page_no].parse_score = float(
+                np.nanquantile(
+                    text_scores, q=0.10
+                )  # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
+            )
+
+        # DEBUG code:
+        def draw_text_boxes(image, cells, show: bool = False):
+            draw = ImageDraw.Draw(image.copy())
+            for c in cells:
+                x0, y0, x1, y1 = (
+                    c.to_bounding_box().l,
+                    c.to_bounding_box().t,
+                    c.to_bounding_box().r,
+                    c.to_bounding_box().b,
+                )
+
+                draw.rectangle([(x0, y0), (x1, y1)], outline="red")
+            if show:
+                image.show()
+            else:
+                out_path: Path = (
+                    Path(settings.debug.debug_output_path)
+                    / f"debug_{conv_res.input.file.stem}"
+                )
+                out_path.mkdir(parents=True, exist_ok=True)
+
+                out_file = out_path / f"cells_page_{page.page_no:05}.png"
+                image.save(str(out_file), format="png")
+
+        if settings.debug.visualize_cells:
+            draw_text_boxes(page.get_image(scale=1.0), page.cells)
+
+        return page
+
+    def rate_text_quality(self, text: str) -> float:
+        # Hard errors: if any of these patterns are found, return 0.0 immediately.
+        blacklist_chars = ["�"]
+        if (
+            any(text.find(c) >= 0 for c in blacklist_chars)
+            or self.GLYPH_RE.search(text)
+            or self.SLASH_G_RE.search(text)
+            or self.SLASH_NUMBER_GARBAGE_RE.match(
+                text
+            )  # Check if text is mostly slash-number pattern
+        ):
+            return 0.0
+
+        penalty = 0.0
+
+        # Apply a penalty only if the fragmented words pattern occurs at least three times.
+        frag_matches = self.FRAG_RE.findall(text)
+        if len(frag_matches) >= 3:
+            penalty += 0.1 * len(frag_matches)
+
+        # Additional heuristic: if the average token length is below 2, add a penalty.
+        # tokens = text.split()
+        # if tokens and (sum(map(len, tokens)) / len(tokens)) < 2:
+        #     penalty += 0.2
+
+        return max(1.0 - penalty, 0.0)
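To make the parse-quality heuristic concrete, here is a hypothetical snippet (not part of the wheel) calling rate_text_quality() from the diff above: ordinary text scores 1.0, while text containing a hard-error pattern such as a GLYPH<...> code scores 0.0. The scores follow directly from the regex checks shown in the diff.

    # Hypothetical snippet; images_scale is a required field and is left unset here.
    from docling.models.stages.page_preprocessing.page_preprocessing_model import (
        PagePreprocessingModel,
        PagePreprocessingOptions,
    )

    preproc = PagePreprocessingModel(PagePreprocessingOptions(images_scale=None))
    print(preproc.rate_text_quality("A perfectly ordinary sentence."))  # 1.0
    print(preproc.rate_text_quality("GLYPH<0f12>GLYPH<00a3> garbled"))  # 0.0 (hard error)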