docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +33 -37
- docling/backend/asciidoc_backend.py +431 -0
- docling/backend/docling_parse_backend.py +20 -16
- docling/backend/docling_parse_v2_backend.py +248 -0
- docling/backend/html_backend.py +429 -0
- docling/backend/md_backend.py +346 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +496 -0
- docling/backend/pdf_backend.py +78 -0
- docling/backend/pypdfium2_backend.py +16 -11
- docling/cli/main.py +96 -65
- docling/datamodel/base_models.py +79 -193
- docling/datamodel/document.py +405 -320
- docling/datamodel/pipeline_options.py +19 -3
- docling/datamodel/settings.py +16 -1
- docling/document_converter.py +240 -251
- docling/models/base_model.py +28 -0
- docling/models/base_ocr_model.py +40 -10
- docling/models/ds_glm_model.py +244 -30
- docling/models/easyocr_model.py +57 -42
- docling/models/layout_model.py +158 -116
- docling/models/page_assemble_model.py +127 -101
- docling/models/page_preprocessing_model.py +79 -0
- docling/models/table_structure_model.py +162 -116
- docling/models/tesseract_ocr_cli_model.py +76 -59
- docling/models/tesseract_ocr_model.py +90 -58
- docling/pipeline/base_pipeline.py +189 -0
- docling/pipeline/simple_pipeline.py +56 -0
- docling/pipeline/standard_pdf_pipeline.py +201 -0
- docling/utils/export.py +4 -3
- docling/utils/layout_utils.py +17 -11
- docling/utils/profiling.py +62 -0
- docling-2.4.1.dist-info/METADATA +154 -0
- docling-2.4.1.dist-info/RECORD +45 -0
- docling/pipeline/base_model_pipeline.py +0 -18
- docling/pipeline/standard_model_pipeline.py +0 -66
- docling-1.19.1.dist-info/METADATA +0 -380
- docling-1.19.1.dist-info/RECORD +0 -34
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,79 @@
|
|
1
|
+
from pathlib import Path
|
2
|
+
from typing import Iterable, Optional
|
3
|
+
|
4
|
+
from PIL import ImageDraw
|
5
|
+
from pydantic import BaseModel
|
6
|
+
|
7
|
+
from docling.datamodel.base_models import Page
|
8
|
+
from docling.datamodel.document import ConversionResult
|
9
|
+
from docling.datamodel.settings import settings
|
10
|
+
from docling.models.base_model import BasePageModel
|
11
|
+
from docling.utils.profiling import TimeRecorder
|
12
|
+
|
13
|
+
|
14
|
+
class PagePreprocessingOptions(BaseModel):
|
15
|
+
images_scale: Optional[float]
|
16
|
+
|
17
|
+
|
18
|
+
class PagePreprocessingModel(BasePageModel):
|
19
|
+
def __init__(self, options: PagePreprocessingOptions):
|
20
|
+
self.options = options
|
21
|
+
|
22
|
+
def __call__(
|
23
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
24
|
+
) -> Iterable[Page]:
|
25
|
+
for page in page_batch:
|
26
|
+
assert page._backend is not None
|
27
|
+
if not page._backend.is_valid():
|
28
|
+
yield page
|
29
|
+
else:
|
30
|
+
with TimeRecorder(conv_res, "page_parse"):
|
31
|
+
page = self._populate_page_images(page)
|
32
|
+
page = self._parse_page_cells(conv_res, page)
|
33
|
+
yield page
|
34
|
+
|
35
|
+
# Generate the page image and store it in the page object
|
36
|
+
def _populate_page_images(self, page: Page) -> Page:
|
37
|
+
# default scale
|
38
|
+
page.get_image(
|
39
|
+
scale=1.0
|
40
|
+
) # puts the page image on the image cache at default scale
|
41
|
+
|
42
|
+
images_scale = self.options.images_scale
|
43
|
+
# user requested scales
|
44
|
+
if images_scale is not None:
|
45
|
+
page._default_image_scale = images_scale
|
46
|
+
page.get_image(
|
47
|
+
scale=images_scale
|
48
|
+
) # this will trigger storing the image in the internal cache
|
49
|
+
|
50
|
+
return page
|
51
|
+
|
52
|
+
# Extract and populate the page cells and store it in the page object
|
53
|
+
def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
|
54
|
+
assert page._backend is not None
|
55
|
+
|
56
|
+
page.cells = list(page._backend.get_text_cells())
|
57
|
+
|
58
|
+
# DEBUG code:
|
59
|
+
def draw_text_boxes(image, cells, show: bool = False):
|
60
|
+
draw = ImageDraw.Draw(image)
|
61
|
+
for c in cells:
|
62
|
+
x0, y0, x1, y1 = c.bbox.as_tuple()
|
63
|
+
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
64
|
+
if show:
|
65
|
+
image.show()
|
66
|
+
else:
|
67
|
+
out_path: Path = (
|
68
|
+
Path(settings.debug.debug_output_path)
|
69
|
+
/ f"debug_{conv_res.input.file.stem}"
|
70
|
+
)
|
71
|
+
out_path.mkdir(parents=True, exist_ok=True)
|
72
|
+
|
73
|
+
out_file = out_path / f"cells_page_{page.page_no:05}.png"
|
74
|
+
image.save(str(out_file), format="png")
|
75
|
+
|
76
|
+
if settings.debug.visualize_cells:
|
77
|
+
draw_text_boxes(page.get_image(scale=1.0), page.cells)
|
78
|
+
|
79
|
+
return page
|
@@ -1,31 +1,30 @@
|
|
1
1
|
import copy
|
2
2
|
from pathlib import Path
|
3
|
-
from typing import Iterable
|
3
|
+
from typing import Iterable
|
4
4
|
|
5
5
|
import numpy
|
6
|
+
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
6
7
|
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
7
8
|
from PIL import ImageDraw
|
8
9
|
|
9
|
-
from docling.datamodel.base_models import
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
)
|
16
|
-
from docling.datamodel.pipeline_options import TableFormerMode
|
10
|
+
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
|
11
|
+
from docling.datamodel.document import ConversionResult
|
12
|
+
from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
|
13
|
+
from docling.datamodel.settings import settings
|
14
|
+
from docling.models.base_model import BasePageModel
|
15
|
+
from docling.utils.profiling import TimeRecorder
|
17
16
|
|
18
17
|
|
19
|
-
class TableStructureModel:
|
20
|
-
def __init__(
|
21
|
-
self
|
22
|
-
|
23
|
-
self.
|
18
|
+
class TableStructureModel(BasePageModel):
|
19
|
+
def __init__(
|
20
|
+
self, enabled: bool, artifacts_path: Path, options: TableStructureOptions
|
21
|
+
):
|
22
|
+
self.options = options
|
23
|
+
self.do_cell_matching = self.options.do_cell_matching
|
24
|
+
self.mode = self.options.mode
|
24
25
|
|
25
|
-
self.enabled =
|
26
|
+
self.enabled = enabled
|
26
27
|
if self.enabled:
|
27
|
-
artifacts_path: Path = config["artifacts_path"]
|
28
|
-
|
29
28
|
if self.mode == TableFormerMode.ACCURATE:
|
30
29
|
artifacts_path = artifacts_path / "fat"
|
31
30
|
|
@@ -39,7 +38,15 @@ class TableStructureModel:
|
|
39
38
|
self.tf_predictor = TFPredictor(self.tm_config)
|
40
39
|
self.scale = 2.0 # Scale up table input images to 144 dpi
|
41
40
|
|
42
|
-
def draw_table_and_cells(
|
41
|
+
def draw_table_and_cells(
|
42
|
+
self,
|
43
|
+
conv_res: ConversionResult,
|
44
|
+
page: Page,
|
45
|
+
tbl_list: Iterable[Table],
|
46
|
+
show: bool = False,
|
47
|
+
):
|
48
|
+
assert page._backend is not None
|
49
|
+
|
43
50
|
image = (
|
44
51
|
page._backend.get_page_image()
|
45
52
|
) # make new image to avoid drawing on the saved ones
|
@@ -50,111 +57,150 @@ class TableStructureModel:
|
|
50
57
|
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
51
58
|
|
52
59
|
for tc in table_element.table_cells:
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
(
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
60
|
+
if tc.bbox is not None:
|
61
|
+
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
62
|
+
if tc.column_header:
|
63
|
+
width = 3
|
64
|
+
else:
|
65
|
+
width = 1
|
66
|
+
draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
|
67
|
+
draw.text(
|
68
|
+
(x0 + 3, y0 + 3),
|
69
|
+
text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
|
70
|
+
fill="black",
|
71
|
+
)
|
72
|
+
|
73
|
+
if show:
|
74
|
+
image.show()
|
75
|
+
else:
|
76
|
+
out_path: Path = (
|
77
|
+
Path(settings.debug.debug_output_path)
|
78
|
+
/ f"debug_{conv_res.input.file.stem}"
|
79
|
+
)
|
80
|
+
out_path.mkdir(parents=True, exist_ok=True)
|
81
|
+
|
82
|
+
out_file = out_path / f"table_struct_page_{page.page_no:05}.png"
|
83
|
+
image.save(str(out_file), format="png")
|
84
|
+
|
85
|
+
def __call__(
|
86
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
87
|
+
) -> Iterable[Page]:
|
68
88
|
|
69
89
|
if not self.enabled:
|
70
90
|
yield from page_batch
|
71
91
|
return
|
72
92
|
|
73
93
|
for page in page_batch:
|
74
|
-
|
75
|
-
page.
|
76
|
-
|
77
|
-
in_tables = [
|
78
|
-
(
|
79
|
-
cluster,
|
80
|
-
[
|
81
|
-
round(cluster.bbox.l) * self.scale,
|
82
|
-
round(cluster.bbox.t) * self.scale,
|
83
|
-
round(cluster.bbox.r) * self.scale,
|
84
|
-
round(cluster.bbox.b) * self.scale,
|
85
|
-
],
|
86
|
-
)
|
87
|
-
for cluster in page.predictions.layout.clusters
|
88
|
-
if cluster.label == "Table"
|
89
|
-
]
|
90
|
-
if not len(in_tables):
|
94
|
+
assert page._backend is not None
|
95
|
+
if not page._backend.is_valid():
|
91
96
|
yield page
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
tbl = TableElement(
|
145
|
-
otsl_seq=otsl_seq,
|
146
|
-
table_cells=table_cells,
|
147
|
-
num_rows=num_rows,
|
148
|
-
num_cols=num_cols,
|
149
|
-
id=table_cluster.id,
|
150
|
-
page_no=page.page_no,
|
151
|
-
cluster=table_cluster,
|
152
|
-
label="Table",
|
97
|
+
else:
|
98
|
+
with TimeRecorder(conv_res, "table_structure"):
|
99
|
+
|
100
|
+
assert page.predictions.layout is not None
|
101
|
+
assert page.size is not None
|
102
|
+
|
103
|
+
page.predictions.tablestructure = (
|
104
|
+
TableStructurePrediction()
|
105
|
+
) # dummy
|
106
|
+
|
107
|
+
in_tables = [
|
108
|
+
(
|
109
|
+
cluster,
|
110
|
+
[
|
111
|
+
round(cluster.bbox.l) * self.scale,
|
112
|
+
round(cluster.bbox.t) * self.scale,
|
113
|
+
round(cluster.bbox.r) * self.scale,
|
114
|
+
round(cluster.bbox.b) * self.scale,
|
115
|
+
],
|
116
|
+
)
|
117
|
+
for cluster in page.predictions.layout.clusters
|
118
|
+
if cluster.label == DocItemLabel.TABLE
|
119
|
+
]
|
120
|
+
if not len(in_tables):
|
121
|
+
yield page
|
122
|
+
continue
|
123
|
+
|
124
|
+
tokens = []
|
125
|
+
for c in page.cells:
|
126
|
+
for cluster, _ in in_tables:
|
127
|
+
if c.bbox.area() > 0:
|
128
|
+
if (
|
129
|
+
c.bbox.intersection_area_with(cluster.bbox)
|
130
|
+
/ c.bbox.area()
|
131
|
+
> 0.2
|
132
|
+
):
|
133
|
+
# Only allow non empty stings (spaces) into the cells of a table
|
134
|
+
if len(c.text.strip()) > 0:
|
135
|
+
new_cell = copy.deepcopy(c)
|
136
|
+
new_cell.bbox = new_cell.bbox.scaled(
|
137
|
+
scale=self.scale
|
138
|
+
)
|
139
|
+
|
140
|
+
tokens.append(new_cell.model_dump())
|
141
|
+
|
142
|
+
page_input = {
|
143
|
+
"tokens": tokens,
|
144
|
+
"width": page.size.width * self.scale,
|
145
|
+
"height": page.size.height * self.scale,
|
146
|
+
}
|
147
|
+
page_input["image"] = numpy.asarray(
|
148
|
+
page.get_image(scale=self.scale)
|
153
149
|
)
|
154
150
|
|
155
|
-
|
151
|
+
table_clusters, table_bboxes = zip(*in_tables)
|
152
|
+
|
153
|
+
if len(table_bboxes):
|
154
|
+
tf_output = self.tf_predictor.multi_table_predict(
|
155
|
+
page_input, table_bboxes, do_matching=self.do_cell_matching
|
156
|
+
)
|
157
|
+
|
158
|
+
for table_cluster, table_out in zip(table_clusters, tf_output):
|
159
|
+
table_cells = []
|
160
|
+
for element in table_out["tf_responses"]:
|
161
|
+
|
162
|
+
if not self.do_cell_matching:
|
163
|
+
the_bbox = BoundingBox.model_validate(
|
164
|
+
element["bbox"]
|
165
|
+
).scaled(1 / self.scale)
|
166
|
+
text_piece = page._backend.get_text_in_rect(
|
167
|
+
the_bbox
|
168
|
+
)
|
169
|
+
element["bbox"]["token"] = text_piece
|
170
|
+
|
171
|
+
tc = TableCell.model_validate(element)
|
172
|
+
if self.do_cell_matching and tc.bbox is not None:
|
173
|
+
tc.bbox = tc.bbox.scaled(1 / self.scale)
|
174
|
+
table_cells.append(tc)
|
175
|
+
|
176
|
+
# Retrieving cols/rows, after post processing:
|
177
|
+
num_rows = table_out["predict_details"]["num_rows"]
|
178
|
+
num_cols = table_out["predict_details"]["num_cols"]
|
179
|
+
otsl_seq = table_out["predict_details"]["prediction"][
|
180
|
+
"rs_seq"
|
181
|
+
]
|
182
|
+
|
183
|
+
tbl = Table(
|
184
|
+
otsl_seq=otsl_seq,
|
185
|
+
table_cells=table_cells,
|
186
|
+
num_rows=num_rows,
|
187
|
+
num_cols=num_cols,
|
188
|
+
id=table_cluster.id,
|
189
|
+
page_no=page.page_no,
|
190
|
+
cluster=table_cluster,
|
191
|
+
label=DocItemLabel.TABLE,
|
192
|
+
)
|
193
|
+
|
194
|
+
page.predictions.tablestructure.table_map[
|
195
|
+
table_cluster.id
|
196
|
+
] = tbl
|
197
|
+
|
198
|
+
# For debugging purposes:
|
199
|
+
if settings.debug.visualize_tables:
|
200
|
+
self.draw_table_and_cells(
|
201
|
+
conv_res,
|
202
|
+
page,
|
203
|
+
page.predictions.tablestructure.table_map.values(),
|
204
|
+
)
|
156
205
|
|
157
|
-
|
158
|
-
# self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())
|
159
|
-
|
160
|
-
yield page
|
206
|
+
yield page
|
@@ -2,13 +2,17 @@ import io
|
|
2
2
|
import logging
|
3
3
|
import tempfile
|
4
4
|
from subprocess import DEVNULL, PIPE, Popen
|
5
|
-
from typing import Iterable, Tuple
|
5
|
+
from typing import Iterable, Optional, Tuple
|
6
6
|
|
7
7
|
import pandas as pd
|
8
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
8
9
|
|
9
|
-
from docling.datamodel.base_models import
|
10
|
+
from docling.datamodel.base_models import OcrCell, Page
|
11
|
+
from docling.datamodel.document import ConversionResult
|
10
12
|
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
13
|
+
from docling.datamodel.settings import settings
|
11
14
|
from docling.models.base_ocr_model import BaseOcrModel
|
15
|
+
from docling.utils.profiling import TimeRecorder
|
12
16
|
|
13
17
|
_log = logging.getLogger(__name__)
|
14
18
|
|
@@ -21,8 +25,8 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
21
25
|
|
22
26
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
23
27
|
|
24
|
-
self._name = None
|
25
|
-
self._version = None
|
28
|
+
self._name: Optional[str] = None
|
29
|
+
self._version: Optional[str] = None
|
26
30
|
|
27
31
|
if self.enabled:
|
28
32
|
try:
|
@@ -39,7 +43,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
39
43
|
def _get_name_and_version(self) -> Tuple[str, str]:
|
40
44
|
|
41
45
|
if self._name != None and self._version != None:
|
42
|
-
return self._name, self._version
|
46
|
+
return self._name, self._version # type: ignore
|
43
47
|
|
44
48
|
cmd = [self.options.tesseract_cmd, "--version"]
|
45
49
|
|
@@ -101,67 +105,80 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
101
105
|
|
102
106
|
return df_filtered
|
103
107
|
|
104
|
-
def __call__(
|
108
|
+
def __call__(
|
109
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
110
|
+
) -> Iterable[Page]:
|
105
111
|
|
106
112
|
if not self.enabled:
|
107
113
|
yield from page_batch
|
108
114
|
return
|
109
115
|
|
110
116
|
for page in page_batch:
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
117
|
+
assert page._backend is not None
|
118
|
+
if not page._backend.is_valid():
|
119
|
+
yield page
|
120
|
+
else:
|
121
|
+
with TimeRecorder(conv_res, "ocr"):
|
122
|
+
|
123
|
+
ocr_rects = self.get_ocr_rects(page)
|
124
|
+
|
125
|
+
all_ocr_cells = []
|
126
|
+
for ocr_rect in ocr_rects:
|
127
|
+
# Skip zero area boxes
|
128
|
+
if ocr_rect.area() == 0:
|
129
|
+
continue
|
130
|
+
high_res_image = page._backend.get_page_image(
|
131
|
+
scale=self.scale, cropbox=ocr_rect
|
132
|
+
)
|
133
|
+
|
134
|
+
with tempfile.NamedTemporaryFile(
|
135
|
+
suffix=".png", mode="w"
|
136
|
+
) as image_file:
|
137
|
+
fname = image_file.name
|
138
|
+
high_res_image.save(fname)
|
139
|
+
|
140
|
+
df = self._run_tesseract(fname)
|
141
|
+
|
142
|
+
# _log.info(df)
|
143
|
+
|
144
|
+
# Print relevant columns (bounding box and text)
|
145
|
+
for ix, row in df.iterrows():
|
146
|
+
text = row["text"]
|
147
|
+
conf = row["conf"]
|
148
|
+
|
149
|
+
l = float(row["left"])
|
150
|
+
b = float(row["top"])
|
151
|
+
w = float(row["width"])
|
152
|
+
h = float(row["height"])
|
153
|
+
|
154
|
+
t = b + h
|
155
|
+
r = l + w
|
156
|
+
|
157
|
+
cell = OcrCell(
|
158
|
+
id=ix,
|
159
|
+
text=text,
|
160
|
+
confidence=conf / 100.0,
|
161
|
+
bbox=BoundingBox.from_tuple(
|
162
|
+
coord=(
|
163
|
+
(l / self.scale) + ocr_rect.l,
|
164
|
+
(b / self.scale) + ocr_rect.t,
|
165
|
+
(r / self.scale) + ocr_rect.l,
|
166
|
+
(t / self.scale) + ocr_rect.t,
|
167
|
+
),
|
168
|
+
origin=CoordOrigin.TOPLEFT,
|
169
|
+
),
|
170
|
+
)
|
171
|
+
all_ocr_cells.append(cell)
|
172
|
+
|
173
|
+
## Remove OCR cells which overlap with programmatic cells.
|
174
|
+
filtered_ocr_cells = self.filter_ocr_cells(
|
175
|
+
all_ocr_cells, page.cells
|
156
176
|
)
|
157
|
-
all_ocr_cells.append(cell)
|
158
|
-
|
159
|
-
## Remove OCR cells which overlap with programmatic cells.
|
160
|
-
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
|
161
177
|
|
162
|
-
|
178
|
+
page.cells.extend(filtered_ocr_cells)
|
163
179
|
|
164
|
-
|
165
|
-
|
180
|
+
# DEBUG code:
|
181
|
+
if settings.debug.visualize_ocr:
|
182
|
+
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
166
183
|
|
167
|
-
|
184
|
+
yield page
|