docling 2.1.0__py3-none-any.whl → 2.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +1 -0
- docling/backend/asciidoc_backend.py +431 -0
- docling/backend/docling_parse_backend.py +4 -4
- docling/backend/docling_parse_v2_backend.py +12 -4
- docling/backend/html_backend.py +61 -57
- docling/backend/md_backend.py +346 -0
- docling/backend/mspowerpoint_backend.py +62 -39
- docling/backend/msword_backend.py +12 -25
- docling/backend/pypdfium2_backend.py +1 -1
- docling/cli/main.py +38 -8
- docling/datamodel/base_models.py +16 -10
- docling/datamodel/document.py +36 -6
- docling/datamodel/pipeline_options.py +3 -3
- docling/datamodel/settings.py +15 -1
- docling/document_converter.py +38 -12
- docling/models/base_model.py +4 -1
- docling/models/base_ocr_model.py +21 -4
- docling/models/ds_glm_model.py +27 -11
- docling/models/easyocr_model.py +49 -39
- docling/models/layout_model.py +87 -61
- docling/models/page_assemble_model.py +102 -100
- docling/models/page_preprocessing_model.py +25 -7
- docling/models/table_structure_model.py +125 -90
- docling/models/tesseract_ocr_cli_model.py +62 -52
- docling/models/tesseract_ocr_model.py +76 -52
- docling/pipeline/base_pipeline.py +68 -69
- docling/pipeline/simple_pipeline.py +8 -11
- docling/pipeline/standard_pdf_pipeline.py +59 -56
- docling/utils/profiling.py +62 -0
- {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/METADATA +27 -22
- docling-2.4.1.dist-info/RECORD +45 -0
- docling-2.1.0.dist-info/RECORD +0 -42
- {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
- {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
- {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
docling/models/easyocr_model.py
CHANGED
@@ -5,8 +5,11 @@ import numpy
|
|
5
5
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
6
6
|
|
7
7
|
from docling.datamodel.base_models import OcrCell, Page
|
8
|
+
from docling.datamodel.document import ConversionResult
|
8
9
|
from docling.datamodel.pipeline_options import EasyOcrOptions
|
10
|
+
from docling.datamodel.settings import settings
|
9
11
|
from docling.models.base_ocr_model import BaseOcrModel
|
12
|
+
from docling.utils.profiling import TimeRecorder
|
10
13
|
|
11
14
|
_log = logging.getLogger(__name__)
|
12
15
|
|
@@ -33,58 +36,65 @@ class EasyOcrModel(BaseOcrModel):
|
|
33
36
|
download_enabled=self.options.download_enabled,
|
34
37
|
)
|
35
38
|
|
36
|
-
def __call__(
|
39
|
+
def __call__(
|
40
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
41
|
+
) -> Iterable[Page]:
|
37
42
|
|
38
43
|
if not self.enabled:
|
39
44
|
yield from page_batch
|
40
45
|
return
|
41
46
|
|
42
47
|
for page in page_batch:
|
48
|
+
|
43
49
|
assert page._backend is not None
|
44
50
|
if not page._backend.is_valid():
|
45
51
|
yield page
|
46
52
|
else:
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
im = numpy.array(high_res_image)
|
58
|
-
result = self.reader.readtext(im)
|
59
|
-
|
60
|
-
del high_res_image
|
61
|
-
del im
|
62
|
-
|
63
|
-
cells = [
|
64
|
-
OcrCell(
|
65
|
-
id=ix,
|
66
|
-
text=line[1],
|
67
|
-
confidence=line[2],
|
68
|
-
bbox=BoundingBox.from_tuple(
|
69
|
-
coord=(
|
70
|
-
(line[0][0][0] / self.scale) + ocr_rect.l,
|
71
|
-
(line[0][0][1] / self.scale) + ocr_rect.t,
|
72
|
-
(line[0][2][0] / self.scale) + ocr_rect.l,
|
73
|
-
(line[0][2][1] / self.scale) + ocr_rect.t,
|
74
|
-
),
|
75
|
-
origin=CoordOrigin.TOPLEFT,
|
76
|
-
),
|
53
|
+
with TimeRecorder(conv_res, "ocr"):
|
54
|
+
ocr_rects = self.get_ocr_rects(page)
|
55
|
+
|
56
|
+
all_ocr_cells = []
|
57
|
+
for ocr_rect in ocr_rects:
|
58
|
+
# Skip zero area boxes
|
59
|
+
if ocr_rect.area() == 0:
|
60
|
+
continue
|
61
|
+
high_res_image = page._backend.get_page_image(
|
62
|
+
scale=self.scale, cropbox=ocr_rect
|
77
63
|
)
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
64
|
+
im = numpy.array(high_res_image)
|
65
|
+
result = self.reader.readtext(im)
|
66
|
+
|
67
|
+
del high_res_image
|
68
|
+
del im
|
69
|
+
|
70
|
+
cells = [
|
71
|
+
OcrCell(
|
72
|
+
id=ix,
|
73
|
+
text=line[1],
|
74
|
+
confidence=line[2],
|
75
|
+
bbox=BoundingBox.from_tuple(
|
76
|
+
coord=(
|
77
|
+
(line[0][0][0] / self.scale) + ocr_rect.l,
|
78
|
+
(line[0][0][1] / self.scale) + ocr_rect.t,
|
79
|
+
(line[0][2][0] / self.scale) + ocr_rect.l,
|
80
|
+
(line[0][2][1] / self.scale) + ocr_rect.t,
|
81
|
+
),
|
82
|
+
origin=CoordOrigin.TOPLEFT,
|
83
|
+
),
|
84
|
+
)
|
85
|
+
for ix, line in enumerate(result)
|
86
|
+
]
|
87
|
+
all_ocr_cells.extend(cells)
|
88
|
+
|
89
|
+
## Remove OCR cells which overlap with programmatic cells.
|
90
|
+
filtered_ocr_cells = self.filter_ocr_cells(
|
91
|
+
all_ocr_cells, page.cells
|
92
|
+
)
|
84
93
|
|
85
|
-
|
94
|
+
page.cells.extend(filtered_ocr_cells)
|
86
95
|
|
87
96
|
# DEBUG code:
|
88
|
-
|
97
|
+
if settings.debug.visualize_ocr:
|
98
|
+
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
89
99
|
|
90
100
|
yield page
|
docling/models/layout_model.py
CHANGED
@@ -16,8 +16,11 @@ from docling.datamodel.base_models import (
|
|
16
16
|
LayoutPrediction,
|
17
17
|
Page,
|
18
18
|
)
|
19
|
+
from docling.datamodel.document import ConversionResult
|
20
|
+
from docling.datamodel.settings import settings
|
19
21
|
from docling.models.base_model import BasePageModel
|
20
22
|
from docling.utils import layout_utils as lu
|
23
|
+
from docling.utils.profiling import TimeRecorder
|
21
24
|
|
22
25
|
_log = logging.getLogger(__name__)
|
23
26
|
|
@@ -271,74 +274,97 @@ class LayoutModel(BasePageModel):
|
|
271
274
|
|
272
275
|
return clusters_out_new, cells_out_new
|
273
276
|
|
274
|
-
def __call__(
|
277
|
+
def __call__(
|
278
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
279
|
+
) -> Iterable[Page]:
|
280
|
+
|
275
281
|
for page in page_batch:
|
276
282
|
assert page._backend is not None
|
277
283
|
if not page._backend.is_valid():
|
278
284
|
yield page
|
279
285
|
else:
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
# TODO: Remove, postprocess should take care of it anyway.
|
300
|
-
for cell in page.cells:
|
301
|
-
for cluster in clusters:
|
302
|
-
if not cell.bbox.area() > 0:
|
303
|
-
overlap_frac = 0.0
|
304
|
-
else:
|
305
|
-
overlap_frac = (
|
306
|
-
cell.bbox.intersection_area_with(cluster.bbox)
|
307
|
-
/ cell.bbox.area()
|
308
|
-
)
|
309
|
-
|
310
|
-
if overlap_frac > 0.5:
|
311
|
-
cluster.cells.append(cell)
|
312
|
-
|
313
|
-
# Pre-sort clusters
|
314
|
-
# clusters = self.sort_clusters_by_cell_order(clusters)
|
315
|
-
|
316
|
-
# DEBUG code:
|
317
|
-
def draw_clusters_and_cells():
|
318
|
-
image = copy.deepcopy(page.image)
|
319
|
-
draw = ImageDraw.Draw(image)
|
320
|
-
for c in clusters:
|
321
|
-
x0, y0, x1, y1 = c.bbox.as_tuple()
|
322
|
-
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
|
323
|
-
|
324
|
-
cell_color = (
|
325
|
-
random.randint(30, 140),
|
326
|
-
random.randint(30, 140),
|
327
|
-
random.randint(30, 140),
|
286
|
+
with TimeRecorder(conv_res, "layout"):
|
287
|
+
assert page.size is not None
|
288
|
+
|
289
|
+
clusters = []
|
290
|
+
for ix, pred_item in enumerate(
|
291
|
+
self.layout_predictor.predict(page.get_image(scale=1.0))
|
292
|
+
):
|
293
|
+
label = DocItemLabel(
|
294
|
+
pred_item["label"]
|
295
|
+
.lower()
|
296
|
+
.replace(" ", "_")
|
297
|
+
.replace("-", "_")
|
298
|
+
) # Temporary, until docling-ibm-model uses docling-core types
|
299
|
+
cluster = Cluster(
|
300
|
+
id=ix,
|
301
|
+
label=label,
|
302
|
+
confidence=pred_item["confidence"],
|
303
|
+
bbox=BoundingBox.model_validate(pred_item),
|
304
|
+
cells=[],
|
328
305
|
)
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
306
|
+
clusters.append(cluster)
|
307
|
+
|
308
|
+
# Map cells to clusters
|
309
|
+
# TODO: Remove, postprocess should take care of it anyway.
|
310
|
+
for cell in page.cells:
|
311
|
+
for cluster in clusters:
|
312
|
+
if not cell.bbox.area() > 0:
|
313
|
+
overlap_frac = 0.0
|
314
|
+
else:
|
315
|
+
overlap_frac = (
|
316
|
+
cell.bbox.intersection_area_with(cluster.bbox)
|
317
|
+
/ cell.bbox.area()
|
318
|
+
)
|
319
|
+
|
320
|
+
if overlap_frac > 0.5:
|
321
|
+
cluster.cells.append(cell)
|
322
|
+
|
323
|
+
# Pre-sort clusters
|
324
|
+
# clusters = self.sort_clusters_by_cell_order(clusters)
|
325
|
+
|
326
|
+
# DEBUG code:
|
327
|
+
def draw_clusters_and_cells(show: bool = False):
|
328
|
+
image = copy.deepcopy(page.image)
|
329
|
+
if image is not None:
|
330
|
+
draw = ImageDraw.Draw(image)
|
331
|
+
for c in clusters:
|
332
|
+
x0, y0, x1, y1 = c.bbox.as_tuple()
|
333
|
+
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
|
334
|
+
|
335
|
+
cell_color = (
|
336
|
+
random.randint(30, 140),
|
337
|
+
random.randint(30, 140),
|
338
|
+
random.randint(30, 140),
|
339
|
+
)
|
340
|
+
for tc in c.cells: # [:1]:
|
341
|
+
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
342
|
+
draw.rectangle(
|
343
|
+
[(x0, y0), (x1, y1)], outline=cell_color
|
344
|
+
)
|
345
|
+
if show:
|
346
|
+
image.show()
|
347
|
+
else:
|
348
|
+
out_path: Path = (
|
349
|
+
Path(settings.debug.debug_output_path)
|
350
|
+
/ f"debug_{conv_res.input.file.stem}"
|
351
|
+
)
|
352
|
+
out_path.mkdir(parents=True, exist_ok=True)
|
353
|
+
|
354
|
+
out_file = (
|
355
|
+
out_path / f"layout_page_{page.page_no:05}.png"
|
356
|
+
)
|
357
|
+
image.save(str(out_file), format="png")
|
358
|
+
|
359
|
+
# draw_clusters_and_cells()
|
360
|
+
|
361
|
+
clusters, page.cells = self.postprocess(
|
362
|
+
clusters, page.cells, page.size.height
|
363
|
+
)
|
339
364
|
|
340
|
-
|
365
|
+
page.predictions.layout = LayoutPrediction(clusters=clusters)
|
341
366
|
|
342
|
-
|
367
|
+
if settings.debug.visualize_layout:
|
368
|
+
draw_clusters_and_cells()
|
343
369
|
|
344
370
|
yield page
|
docling/models/page_assemble_model.py
CHANGED
@@ -12,8 +12,10 @@ from docling.datamodel.base_models import (
|
|
12
12
|
Table,
|
13
13
|
TextElement,
|
14
14
|
)
|
15
|
+
from docling.datamodel.document import ConversionResult
|
15
16
|
from docling.models.base_model import BasePageModel
|
16
17
|
from docling.models.layout_model import LayoutModel
|
18
|
+
from docling.utils.profiling import TimeRecorder
|
17
19
|
|
18
20
|
_log = logging.getLogger(__name__)
|
19
21
|
|
@@ -51,122 +53,122 @@ class PageAssembleModel(BasePageModel):
|
|
51
53
|
|
52
54
|
return sanitized_text.strip() # Strip any leading or trailing whitespace
|
53
55
|
|
54
|
-
def __call__(
|
56
|
+
def __call__(
|
57
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
58
|
+
) -> Iterable[Page]:
|
55
59
|
for page in page_batch:
|
56
60
|
assert page._backend is not None
|
57
61
|
if not page._backend.is_valid():
|
58
62
|
yield page
|
59
63
|
else:
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
text=text,
|
82
|
-
page_no=page.page_no,
|
83
|
-
cluster=cluster,
|
84
|
-
)
|
85
|
-
elements.append(text_el)
|
86
|
-
|
87
|
-
if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
|
88
|
-
headers.append(text_el)
|
89
|
-
else:
|
90
|
-
body.append(text_el)
|
91
|
-
elif cluster.label == LayoutModel.TABLE_LABEL:
|
92
|
-
tbl = None
|
93
|
-
if page.predictions.tablestructure:
|
94
|
-
tbl = page.predictions.tablestructure.table_map.get(
|
95
|
-
cluster.id, None
|
96
|
-
)
|
97
|
-
if (
|
98
|
-
not tbl
|
99
|
-
): # fallback: add table without structure, if it isn't present
|
100
|
-
tbl = Table(
|
64
|
+
with TimeRecorder(conv_res, "page_assemble"):
|
65
|
+
|
66
|
+
assert page.predictions.layout is not None
|
67
|
+
|
68
|
+
# assembles some JSON output page by page.
|
69
|
+
|
70
|
+
elements: List[PageElement] = []
|
71
|
+
headers: List[PageElement] = []
|
72
|
+
body: List[PageElement] = []
|
73
|
+
|
74
|
+
for cluster in page.predictions.layout.clusters:
|
75
|
+
# _log.info("Cluster label seen:", cluster.label)
|
76
|
+
if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
|
77
|
+
|
78
|
+
textlines = [
|
79
|
+
cell.text.replace("\x02", "-").strip()
|
80
|
+
for cell in cluster.cells
|
81
|
+
if len(cell.text.strip()) > 0
|
82
|
+
]
|
83
|
+
text = self.sanitize_text(textlines)
|
84
|
+
text_el = TextElement(
|
101
85
|
label=cluster.label,
|
102
86
|
id=cluster.id,
|
103
|
-
text=
|
104
|
-
otsl_seq=[],
|
105
|
-
table_cells=[],
|
106
|
-
cluster=cluster,
|
87
|
+
text=text,
|
107
88
|
page_no=page.page_no,
|
89
|
+
cluster=cluster,
|
108
90
|
)
|
91
|
+
elements.append(text_el)
|
92
|
+
|
93
|
+
if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
|
94
|
+
headers.append(text_el)
|
95
|
+
else:
|
96
|
+
body.append(text_el)
|
97
|
+
elif cluster.label == LayoutModel.TABLE_LABEL:
|
98
|
+
tbl = None
|
99
|
+
if page.predictions.tablestructure:
|
100
|
+
tbl = page.predictions.tablestructure.table_map.get(
|
101
|
+
cluster.id, None
|
102
|
+
)
|
103
|
+
if (
|
104
|
+
not tbl
|
105
|
+
): # fallback: add table without structure, if it isn't present
|
106
|
+
tbl = Table(
|
107
|
+
label=cluster.label,
|
108
|
+
id=cluster.id,
|
109
|
+
text="",
|
110
|
+
otsl_seq=[],
|
111
|
+
table_cells=[],
|
112
|
+
cluster=cluster,
|
113
|
+
page_no=page.page_no,
|
114
|
+
)
|
109
115
|
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
page.predictions.figures_classification.figure_map.get(
|
116
|
+
elements.append(tbl)
|
117
|
+
body.append(tbl)
|
118
|
+
elif cluster.label == LayoutModel.FIGURE_LABEL:
|
119
|
+
fig = None
|
120
|
+
if page.predictions.figures_classification:
|
121
|
+
fig = page.predictions.figures_classification.figure_map.get(
|
117
122
|
cluster.id, None
|
118
123
|
)
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
)
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
equation = (
|
137
|
-
page.predictions.equations_prediction.equation_map.get(
|
124
|
+
if (
|
125
|
+
not fig
|
126
|
+
): # fallback: add figure without classification, if it isn't present
|
127
|
+
fig = FigureElement(
|
128
|
+
label=cluster.label,
|
129
|
+
id=cluster.id,
|
130
|
+
text="",
|
131
|
+
data=None,
|
132
|
+
cluster=cluster,
|
133
|
+
page_no=page.page_no,
|
134
|
+
)
|
135
|
+
elements.append(fig)
|
136
|
+
body.append(fig)
|
137
|
+
elif cluster.label == LayoutModel.FORMULA_LABEL:
|
138
|
+
equation = None
|
139
|
+
if page.predictions.equations_prediction:
|
140
|
+
equation = page.predictions.equations_prediction.equation_map.get(
|
138
141
|
cluster.id, None
|
139
142
|
)
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
)
|
158
|
-
|
159
|
-
body.append(equation)
|
143
|
+
if (
|
144
|
+
not equation
|
145
|
+
): # fallback: add empty formula, if it isn't present
|
146
|
+
text = self.sanitize_text(
|
147
|
+
[
|
148
|
+
cell.text.replace("\x02", "-").strip()
|
149
|
+
for cell in cluster.cells
|
150
|
+
if len(cell.text.strip()) > 0
|
151
|
+
]
|
152
|
+
)
|
153
|
+
equation = TextElement(
|
154
|
+
label=cluster.label,
|
155
|
+
id=cluster.id,
|
156
|
+
cluster=cluster,
|
157
|
+
page_no=page.page_no,
|
158
|
+
text=text,
|
159
|
+
)
|
160
|
+
elements.append(equation)
|
161
|
+
body.append(equation)
|
160
162
|
|
161
|
-
|
162
|
-
|
163
|
-
|
163
|
+
page.assembled = AssembledUnit(
|
164
|
+
elements=elements, headers=headers, body=body
|
165
|
+
)
|
164
166
|
|
165
|
-
|
166
|
-
|
167
|
-
|
167
|
+
# Remove page images (can be disabled)
|
168
|
+
if not self.options.keep_images:
|
169
|
+
page._image_cache = {}
|
168
170
|
|
169
|
-
|
170
|
-
|
171
|
+
# Unload backend
|
172
|
+
page._backend.unload()
|
171
173
|
|
172
174
|
yield page
|
docling/models/page_preprocessing_model.py
CHANGED
@@ -1,10 +1,14 @@
|
|
1
|
+
from pathlib import Path
|
1
2
|
from typing import Iterable, Optional
|
2
3
|
|
3
4
|
from PIL import ImageDraw
|
4
5
|
from pydantic import BaseModel
|
5
6
|
|
6
7
|
from docling.datamodel.base_models import Page
|
8
|
+
from docling.datamodel.document import ConversionResult
|
9
|
+
from docling.datamodel.settings import settings
|
7
10
|
from docling.models.base_model import BasePageModel
|
11
|
+
from docling.utils.profiling import TimeRecorder
|
8
12
|
|
9
13
|
|
10
14
|
class PagePreprocessingOptions(BaseModel):
|
@@ -15,14 +19,17 @@ class PagePreprocessingModel(BasePageModel):
|
|
15
19
|
def __init__(self, options: PagePreprocessingOptions):
|
16
20
|
self.options = options
|
17
21
|
|
18
|
-
def __call__(
|
22
|
+
def __call__(
|
23
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
24
|
+
) -> Iterable[Page]:
|
19
25
|
for page in page_batch:
|
20
26
|
assert page._backend is not None
|
21
27
|
if not page._backend.is_valid():
|
22
28
|
yield page
|
23
29
|
else:
|
24
|
-
|
25
|
-
|
30
|
+
with TimeRecorder(conv_res, "page_parse"):
|
31
|
+
page = self._populate_page_images(page)
|
32
|
+
page = self._parse_page_cells(conv_res, page)
|
26
33
|
yield page
|
27
34
|
|
28
35
|
# Generate the page image and store it in the page object
|
@@ -43,19 +50,30 @@ class PagePreprocessingModel(BasePageModel):
|
|
43
50
|
return page
|
44
51
|
|
45
52
|
# Extract and populate the page cells and store it in the page object
|
46
|
-
def _parse_page_cells(self, page: Page) -> Page:
|
53
|
+
def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
|
47
54
|
assert page._backend is not None
|
48
55
|
|
49
56
|
page.cells = list(page._backend.get_text_cells())
|
50
57
|
|
51
58
|
# DEBUG code:
|
52
|
-
def draw_text_boxes(image, cells):
|
59
|
+
def draw_text_boxes(image, cells, show: bool = False):
|
53
60
|
draw = ImageDraw.Draw(image)
|
54
61
|
for c in cells:
|
55
62
|
x0, y0, x1, y1 = c.bbox.as_tuple()
|
56
63
|
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
57
|
-
|
64
|
+
if show:
|
65
|
+
image.show()
|
66
|
+
else:
|
67
|
+
out_path: Path = (
|
68
|
+
Path(settings.debug.debug_output_path)
|
69
|
+
/ f"debug_{conv_res.input.file.stem}"
|
70
|
+
)
|
71
|
+
out_path.mkdir(parents=True, exist_ok=True)
|
72
|
+
|
73
|
+
out_file = out_path / f"cells_page_{page.page_no:05}.png"
|
74
|
+
image.save(str(out_file), format="png")
|
58
75
|
|
59
|
-
|
76
|
+
if settings.debug.visualize_cells:
|
77
|
+
draw_text_boxes(page.get_image(scale=1.0), page.cells)
|
60
78
|
|
61
79
|
return page
|