docling 1.19.0__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +32 -37
- docling/backend/docling_parse_backend.py +16 -12
- docling/backend/docling_parse_v2_backend.py +240 -0
- docling/backend/html_backend.py +425 -0
- docling/backend/mspowerpoint_backend.py +375 -0
- docling/backend/msword_backend.py +509 -0
- docling/backend/pdf_backend.py +78 -0
- docling/backend/pypdfium2_backend.py +15 -10
- docling/cli/main.py +61 -60
- docling/datamodel/base_models.py +73 -193
- docling/datamodel/document.py +379 -324
- docling/datamodel/pipeline_options.py +16 -0
- docling/datamodel/settings.py +1 -0
- docling/document_converter.py +215 -252
- docling/models/base_model.py +25 -0
- docling/models/base_ocr_model.py +19 -6
- docling/models/ds_glm_model.py +220 -22
- docling/models/easyocr_model.py +45 -40
- docling/models/layout_model.py +130 -114
- docling/models/page_assemble_model.py +119 -95
- docling/models/page_preprocessing_model.py +61 -0
- docling/models/table_structure_model.py +122 -111
- docling/models/tesseract_ocr_cli_model.py +65 -58
- docling/models/tesseract_ocr_model.py +58 -50
- docling/pipeline/base_pipeline.py +190 -0
- docling/pipeline/simple_pipeline.py +59 -0
- docling/pipeline/standard_pdf_pipeline.py +198 -0
- docling/utils/export.py +4 -3
- docling/utils/layout_utils.py +17 -11
- docling-2.1.0.dist-info/METADATA +149 -0
- docling-2.1.0.dist-info/RECORD +42 -0
- docling/pipeline/base_model_pipeline.py +0 -18
- docling/pipeline/standard_model_pipeline.py +0 -66
- docling-1.19.0.dist-info/METADATA +0 -380
- docling-1.19.0.dist-info/RECORD +0 -34
- {docling-1.19.0.dist-info → docling-2.1.0.dist-info}/LICENSE +0 -0
- {docling-1.19.0.dist-info → docling-2.1.0.dist-info}/WHEEL +0 -0
- {docling-1.19.0.dist-info → docling-2.1.0.dist-info}/entry_points.txt +0 -0
@@ -3,29 +3,25 @@ from pathlib import Path
|
|
3
3
|
from typing import Iterable, List
|
4
4
|
|
5
5
|
import numpy
|
6
|
+
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
6
7
|
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
7
8
|
from PIL import ImageDraw
|
8
9
|
|
9
|
-
from docling.datamodel.base_models import
|
10
|
-
|
11
|
-
|
12
|
-
TableCell,
|
13
|
-
TableElement,
|
14
|
-
TableStructurePrediction,
|
15
|
-
)
|
16
|
-
from docling.datamodel.pipeline_options import TableFormerMode
|
10
|
+
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
|
11
|
+
from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
|
12
|
+
from docling.models.base_model import BasePageModel
|
17
13
|
|
18
14
|
|
19
|
-
class TableStructureModel:
|
20
|
-
def __init__(
|
21
|
-
self
|
22
|
-
|
23
|
-
self.
|
15
|
+
class TableStructureModel(BasePageModel):
|
16
|
+
def __init__(
|
17
|
+
self, enabled: bool, artifacts_path: Path, options: TableStructureOptions
|
18
|
+
):
|
19
|
+
self.options = options
|
20
|
+
self.do_cell_matching = self.options.do_cell_matching
|
21
|
+
self.mode = self.options.mode
|
24
22
|
|
25
|
-
self.enabled =
|
23
|
+
self.enabled = enabled
|
26
24
|
if self.enabled:
|
27
|
-
artifacts_path: Path = config["artifacts_path"]
|
28
|
-
|
29
25
|
if self.mode == TableFormerMode.ACCURATE:
|
30
26
|
artifacts_path = artifacts_path / "fat"
|
31
27
|
|
@@ -39,7 +35,9 @@ class TableStructureModel:
|
|
39
35
|
self.tf_predictor = TFPredictor(self.tm_config)
|
40
36
|
self.scale = 2.0 # Scale up table input images to 144 dpi
|
41
37
|
|
42
|
-
def draw_table_and_cells(self, page: Page, tbl_list: List[
|
38
|
+
def draw_table_and_cells(self, page: Page, tbl_list: List[Table]):
|
39
|
+
assert page._backend is not None
|
40
|
+
|
43
41
|
image = (
|
44
42
|
page._backend.get_page_image()
|
45
43
|
) # make new image to avoid drawing on the saved ones
|
@@ -50,17 +48,18 @@ class TableStructureModel:
|
|
50
48
|
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
51
49
|
|
52
50
|
for tc in table_element.table_cells:
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
(
|
61
|
-
|
62
|
-
|
63
|
-
|
51
|
+
if tc.bbox is not None:
|
52
|
+
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
53
|
+
if tc.column_header:
|
54
|
+
width = 3
|
55
|
+
else:
|
56
|
+
width = 1
|
57
|
+
draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
|
58
|
+
draw.text(
|
59
|
+
(x0 + 3, y0 + 3),
|
60
|
+
text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
|
61
|
+
fill="black",
|
62
|
+
)
|
64
63
|
|
65
64
|
image.show()
|
66
65
|
|
@@ -71,90 +70,102 @@ class TableStructureModel:
|
|
71
70
|
return
|
72
71
|
|
73
72
|
for page in page_batch:
|
74
|
-
|
75
|
-
page.
|
76
|
-
|
77
|
-
in_tables = [
|
78
|
-
(
|
79
|
-
cluster,
|
80
|
-
[
|
81
|
-
round(cluster.bbox.l) * self.scale,
|
82
|
-
round(cluster.bbox.t) * self.scale,
|
83
|
-
round(cluster.bbox.r) * self.scale,
|
84
|
-
round(cluster.bbox.b) * self.scale,
|
85
|
-
],
|
86
|
-
)
|
87
|
-
for cluster in page.predictions.layout.clusters
|
88
|
-
if cluster.label == "Table"
|
89
|
-
]
|
90
|
-
if not len(in_tables):
|
73
|
+
assert page._backend is not None
|
74
|
+
if not page._backend.is_valid():
|
91
75
|
yield page
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
table_cells=table_cells,
|
147
|
-
num_rows=num_rows,
|
148
|
-
num_cols=num_cols,
|
149
|
-
id=table_cluster.id,
|
150
|
-
page_no=page.page_no,
|
151
|
-
cluster=table_cluster,
|
152
|
-
label="Table",
|
76
|
+
else:
|
77
|
+
|
78
|
+
assert page.predictions.layout is not None
|
79
|
+
assert page.size is not None
|
80
|
+
|
81
|
+
page.predictions.tablestructure = TableStructurePrediction() # dummy
|
82
|
+
|
83
|
+
in_tables = [
|
84
|
+
(
|
85
|
+
cluster,
|
86
|
+
[
|
87
|
+
round(cluster.bbox.l) * self.scale,
|
88
|
+
round(cluster.bbox.t) * self.scale,
|
89
|
+
round(cluster.bbox.r) * self.scale,
|
90
|
+
round(cluster.bbox.b) * self.scale,
|
91
|
+
],
|
92
|
+
)
|
93
|
+
for cluster in page.predictions.layout.clusters
|
94
|
+
if cluster.label == DocItemLabel.TABLE
|
95
|
+
]
|
96
|
+
if not len(in_tables):
|
97
|
+
yield page
|
98
|
+
continue
|
99
|
+
|
100
|
+
tokens = []
|
101
|
+
for c in page.cells:
|
102
|
+
for cluster, _ in in_tables:
|
103
|
+
if c.bbox.area() > 0:
|
104
|
+
if (
|
105
|
+
c.bbox.intersection_area_with(cluster.bbox)
|
106
|
+
/ c.bbox.area()
|
107
|
+
> 0.2
|
108
|
+
):
|
109
|
+
# Only allow non empty stings (spaces) into the cells of a table
|
110
|
+
if len(c.text.strip()) > 0:
|
111
|
+
new_cell = copy.deepcopy(c)
|
112
|
+
new_cell.bbox = new_cell.bbox.scaled(
|
113
|
+
scale=self.scale
|
114
|
+
)
|
115
|
+
|
116
|
+
tokens.append(new_cell.model_dump())
|
117
|
+
|
118
|
+
page_input = {
|
119
|
+
"tokens": tokens,
|
120
|
+
"width": page.size.width * self.scale,
|
121
|
+
"height": page.size.height * self.scale,
|
122
|
+
}
|
123
|
+
page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
|
124
|
+
|
125
|
+
table_clusters, table_bboxes = zip(*in_tables)
|
126
|
+
|
127
|
+
if len(table_bboxes):
|
128
|
+
tf_output = self.tf_predictor.multi_table_predict(
|
129
|
+
page_input, table_bboxes, do_matching=self.do_cell_matching
|
153
130
|
)
|
154
131
|
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
132
|
+
for table_cluster, table_out in zip(table_clusters, tf_output):
|
133
|
+
table_cells = []
|
134
|
+
for element in table_out["tf_responses"]:
|
135
|
+
|
136
|
+
if not self.do_cell_matching:
|
137
|
+
the_bbox = BoundingBox.model_validate(
|
138
|
+
element["bbox"]
|
139
|
+
).scaled(1 / self.scale)
|
140
|
+
text_piece = page._backend.get_text_in_rect(the_bbox)
|
141
|
+
element["bbox"]["token"] = text_piece
|
142
|
+
|
143
|
+
tc = TableCell.model_validate(element)
|
144
|
+
if self.do_cell_matching and tc.bbox is not None:
|
145
|
+
tc.bbox = tc.bbox.scaled(1 / self.scale)
|
146
|
+
table_cells.append(tc)
|
147
|
+
|
148
|
+
# Retrieving cols/rows, after post processing:
|
149
|
+
num_rows = table_out["predict_details"]["num_rows"]
|
150
|
+
num_cols = table_out["predict_details"]["num_cols"]
|
151
|
+
otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
|
152
|
+
|
153
|
+
tbl = Table(
|
154
|
+
otsl_seq=otsl_seq,
|
155
|
+
table_cells=table_cells,
|
156
|
+
num_rows=num_rows,
|
157
|
+
num_cols=num_cols,
|
158
|
+
id=table_cluster.id,
|
159
|
+
page_no=page.page_no,
|
160
|
+
cluster=table_cluster,
|
161
|
+
label=DocItemLabel.TABLE,
|
162
|
+
)
|
163
|
+
|
164
|
+
page.predictions.tablestructure.table_map[table_cluster.id] = (
|
165
|
+
tbl
|
166
|
+
)
|
167
|
+
|
168
|
+
# For debugging purposes:
|
169
|
+
# self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())
|
159
170
|
|
160
|
-
|
171
|
+
yield page
|
@@ -1,12 +1,13 @@
|
|
1
1
|
import io
|
2
2
|
import logging
|
3
3
|
import tempfile
|
4
|
-
from subprocess import PIPE, Popen
|
5
|
-
from typing import Iterable, Tuple
|
4
|
+
from subprocess import DEVNULL, PIPE, Popen
|
5
|
+
from typing import Iterable, Optional, Tuple
|
6
6
|
|
7
7
|
import pandas as pd
|
8
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
8
9
|
|
9
|
-
from docling.datamodel.base_models import
|
10
|
+
from docling.datamodel.base_models import OcrCell, Page
|
10
11
|
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
11
12
|
from docling.models.base_ocr_model import BaseOcrModel
|
12
13
|
|
@@ -21,8 +22,8 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
21
22
|
|
22
23
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
23
24
|
|
24
|
-
self._name = None
|
25
|
-
self._version = None
|
25
|
+
self._name: Optional[str] = None
|
26
|
+
self._version: Optional[str] = None
|
26
27
|
|
27
28
|
if self.enabled:
|
28
29
|
try:
|
@@ -39,7 +40,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
39
40
|
def _get_name_and_version(self) -> Tuple[str, str]:
|
40
41
|
|
41
42
|
if self._name != None and self._version != None:
|
42
|
-
return self._name, self._version
|
43
|
+
return self._name, self._version # type: ignore
|
43
44
|
|
44
45
|
cmd = [self.options.tesseract_cmd, "--version"]
|
45
46
|
|
@@ -81,7 +82,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
81
82
|
cmd += [ifilename, "stdout", "tsv"]
|
82
83
|
_log.info("command: {}".format(" ".join(cmd)))
|
83
84
|
|
84
|
-
proc = Popen(cmd, stdout=PIPE)
|
85
|
+
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
85
86
|
output, _ = proc.communicate()
|
86
87
|
|
87
88
|
# _log.info(output)
|
@@ -108,60 +109,66 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
108
109
|
return
|
109
110
|
|
110
111
|
for page in page_batch:
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
112
|
+
assert page._backend is not None
|
113
|
+
if not page._backend.is_valid():
|
114
|
+
yield page
|
115
|
+
else:
|
116
|
+
ocr_rects = self.get_ocr_rects(page)
|
117
|
+
|
118
|
+
all_ocr_cells = []
|
119
|
+
for ocr_rect in ocr_rects:
|
120
|
+
# Skip zero area boxes
|
121
|
+
if ocr_rect.area() == 0:
|
122
|
+
continue
|
123
|
+
high_res_image = page._backend.get_page_image(
|
124
|
+
scale=self.scale, cropbox=ocr_rect
|
125
|
+
)
|
121
126
|
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
(
|
151
|
-
|
152
|
-
|
127
|
+
with tempfile.NamedTemporaryFile(
|
128
|
+
suffix=".png", mode="w"
|
129
|
+
) as image_file:
|
130
|
+
fname = image_file.name
|
131
|
+
high_res_image.save(fname)
|
132
|
+
|
133
|
+
df = self._run_tesseract(fname)
|
134
|
+
|
135
|
+
# _log.info(df)
|
136
|
+
|
137
|
+
# Print relevant columns (bounding box and text)
|
138
|
+
for ix, row in df.iterrows():
|
139
|
+
text = row["text"]
|
140
|
+
conf = row["conf"]
|
141
|
+
|
142
|
+
l = float(row["left"])
|
143
|
+
b = float(row["top"])
|
144
|
+
w = float(row["width"])
|
145
|
+
h = float(row["height"])
|
146
|
+
|
147
|
+
t = b + h
|
148
|
+
r = l + w
|
149
|
+
|
150
|
+
cell = OcrCell(
|
151
|
+
id=ix,
|
152
|
+
text=text,
|
153
|
+
confidence=conf / 100.0,
|
154
|
+
bbox=BoundingBox.from_tuple(
|
155
|
+
coord=(
|
156
|
+
(l / self.scale) + ocr_rect.l,
|
157
|
+
(b / self.scale) + ocr_rect.t,
|
158
|
+
(r / self.scale) + ocr_rect.l,
|
159
|
+
(t / self.scale) + ocr_rect.t,
|
160
|
+
),
|
161
|
+
origin=CoordOrigin.TOPLEFT,
|
153
162
|
),
|
154
|
-
|
155
|
-
)
|
156
|
-
)
|
157
|
-
all_ocr_cells.append(cell)
|
163
|
+
)
|
164
|
+
all_ocr_cells.append(cell)
|
158
165
|
|
159
|
-
|
160
|
-
|
166
|
+
## Remove OCR cells which overlap with programmatic cells.
|
167
|
+
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
|
161
168
|
|
162
|
-
|
169
|
+
page.cells.extend(filtered_ocr_cells)
|
163
170
|
|
164
|
-
|
165
|
-
|
171
|
+
# DEBUG code:
|
172
|
+
# self.draw_ocr_rects_and_cells(page, ocr_rects)
|
166
173
|
|
167
|
-
|
174
|
+
yield page
|
@@ -1,19 +1,19 @@
|
|
1
1
|
import logging
|
2
2
|
from typing import Iterable
|
3
3
|
|
4
|
-
import
|
4
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
5
5
|
|
6
|
-
from docling.datamodel.base_models import
|
7
|
-
from docling.datamodel.pipeline_options import
|
6
|
+
from docling.datamodel.base_models import OcrCell, Page
|
7
|
+
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
8
8
|
from docling.models.base_ocr_model import BaseOcrModel
|
9
9
|
|
10
10
|
_log = logging.getLogger(__name__)
|
11
11
|
|
12
12
|
|
13
13
|
class TesseractOcrModel(BaseOcrModel):
|
14
|
-
def __init__(self, enabled: bool, options:
|
14
|
+
def __init__(self, enabled: bool, options: TesseractOcrOptions):
|
15
15
|
super().__init__(enabled=enabled, options=options)
|
16
|
-
self.options:
|
16
|
+
self.options: TesseractOcrOptions
|
17
17
|
|
18
18
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
19
19
|
self.reader = None
|
@@ -68,55 +68,63 @@ class TesseractOcrModel(BaseOcrModel):
|
|
68
68
|
return
|
69
69
|
|
70
70
|
for page in page_batch:
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
if ocr_rect.area() == 0:
|
77
|
-
continue
|
78
|
-
high_res_image = page._backend.get_page_image(
|
79
|
-
scale=self.scale, cropbox=ocr_rect
|
80
|
-
)
|
71
|
+
assert page._backend is not None
|
72
|
+
if not page._backend.is_valid():
|
73
|
+
yield page
|
74
|
+
else:
|
75
|
+
assert self.reader is not None
|
81
76
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
# Extract text within the bounding box
|
92
|
-
text = self.reader.GetUTF8Text().strip()
|
93
|
-
confidence = self.reader.MeanTextConf()
|
94
|
-
left = box["x"] / self.scale
|
95
|
-
bottom = box["y"] / self.scale
|
96
|
-
right = (box["x"] + box["w"]) / self.scale
|
97
|
-
top = (box["y"] + box["h"]) / self.scale
|
98
|
-
|
99
|
-
cells.append(
|
100
|
-
OcrCell(
|
101
|
-
id=ix,
|
102
|
-
text=text,
|
103
|
-
confidence=confidence,
|
104
|
-
bbox=BoundingBox.from_tuple(
|
105
|
-
coord=(left, top, right, bottom),
|
106
|
-
origin=CoordOrigin.TOPLEFT,
|
107
|
-
),
|
108
|
-
)
|
77
|
+
ocr_rects = self.get_ocr_rects(page)
|
78
|
+
|
79
|
+
all_ocr_cells = []
|
80
|
+
for ocr_rect in ocr_rects:
|
81
|
+
# Skip zero area boxes
|
82
|
+
if ocr_rect.area() == 0:
|
83
|
+
continue
|
84
|
+
high_res_image = page._backend.get_page_image(
|
85
|
+
scale=self.scale, cropbox=ocr_rect
|
109
86
|
)
|
110
87
|
|
111
|
-
|
112
|
-
|
88
|
+
# Retrieve text snippets with their bounding boxes
|
89
|
+
self.reader.SetImage(high_res_image)
|
90
|
+
boxes = self.reader.GetComponentImages(
|
91
|
+
self.reader_RIL.TEXTLINE, True
|
92
|
+
)
|
93
|
+
|
94
|
+
cells = []
|
95
|
+
for ix, (im, box, _, _) in enumerate(boxes):
|
96
|
+
# Set the area of interest. Tesseract uses Bottom-Left for the origin
|
97
|
+
self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
|
98
|
+
|
99
|
+
# Extract text within the bounding box
|
100
|
+
text = self.reader.GetUTF8Text().strip()
|
101
|
+
confidence = self.reader.MeanTextConf()
|
102
|
+
left = box["x"] / self.scale
|
103
|
+
bottom = box["y"] / self.scale
|
104
|
+
right = (box["x"] + box["w"]) / self.scale
|
105
|
+
top = (box["y"] + box["h"]) / self.scale
|
106
|
+
|
107
|
+
cells.append(
|
108
|
+
OcrCell(
|
109
|
+
id=ix,
|
110
|
+
text=text,
|
111
|
+
confidence=confidence,
|
112
|
+
bbox=BoundingBox.from_tuple(
|
113
|
+
coord=(left, top, right, bottom),
|
114
|
+
origin=CoordOrigin.TOPLEFT,
|
115
|
+
),
|
116
|
+
)
|
117
|
+
)
|
118
|
+
|
119
|
+
# del high_res_image
|
120
|
+
all_ocr_cells.extend(cells)
|
113
121
|
|
114
|
-
|
115
|
-
|
122
|
+
## Remove OCR cells which overlap with programmatic cells.
|
123
|
+
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
|
116
124
|
|
117
|
-
|
125
|
+
page.cells.extend(filtered_ocr_cells)
|
118
126
|
|
119
|
-
|
120
|
-
|
127
|
+
# DEBUG code:
|
128
|
+
# self.draw_ocr_rects_and_cells(page, ocr_rects)
|
121
129
|
|
122
|
-
|
130
|
+
yield page
|