docling 2.1.0__py3-none-any.whl → 2.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +1 -0
- docling/backend/asciidoc_backend.py +431 -0
- docling/backend/docling_parse_backend.py +4 -4
- docling/backend/docling_parse_v2_backend.py +12 -4
- docling/backend/html_backend.py +61 -57
- docling/backend/md_backend.py +346 -0
- docling/backend/mspowerpoint_backend.py +62 -39
- docling/backend/msword_backend.py +12 -25
- docling/backend/pypdfium2_backend.py +1 -1
- docling/cli/main.py +38 -8
- docling/datamodel/base_models.py +16 -10
- docling/datamodel/document.py +36 -6
- docling/datamodel/pipeline_options.py +3 -3
- docling/datamodel/settings.py +15 -1
- docling/document_converter.py +38 -12
- docling/models/base_model.py +4 -1
- docling/models/base_ocr_model.py +21 -4
- docling/models/ds_glm_model.py +27 -11
- docling/models/easyocr_model.py +49 -39
- docling/models/layout_model.py +87 -61
- docling/models/page_assemble_model.py +102 -100
- docling/models/page_preprocessing_model.py +25 -7
- docling/models/table_structure_model.py +125 -90
- docling/models/tesseract_ocr_cli_model.py +62 -52
- docling/models/tesseract_ocr_model.py +76 -52
- docling/pipeline/base_pipeline.py +68 -69
- docling/pipeline/simple_pipeline.py +8 -11
- docling/pipeline/standard_pdf_pipeline.py +59 -56
- docling/utils/profiling.py +62 -0
- {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/METADATA +27 -22
- docling-2.4.1.dist-info/RECORD +45 -0
- docling-2.1.0.dist-info/RECORD +0 -42
- {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
- {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
- {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
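The hunks below touch three model files: `docling/models/table_structure_model.py`, `docling/models/tesseract_ocr_cli_model.py`, and `docling/models/tesseract_ocr_model.py`. The common pattern is that each model's `__call__` now receives the `ConversionResult` alongside the page batch, wraps its work in a `TimeRecorder(conv_res, ...)` scope from the new `docling/utils/profiling.py`, and gates debug output behind `settings.debug.*` flags. A minimal usage sketch, assuming the `settings.debug` fields seen in the hunks (`visualize_tables`, `visualize_ocr`, `debug_output_path`) are plain assignable attributes and that `DocumentConverter.convert()` accepts a file path:

```python
# Hedged sketch: turn on the debug visualizations referenced in the hunks below.
# The flag names come from the diff; treating them as assignable attributes and
# the input file name are assumptions, not verified against docling 2.4.1.
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter

settings.debug.visualize_tables = True          # saves table_struct_page_*.png
settings.debug.visualize_ocr = True             # saves OCR overlay images
settings.debug.debug_output_path = "debug_out"  # root for debug_<doc-stem>/ dirs

conv_res = DocumentConverter().convert("sample.pdf")  # hypothetical input file
```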
docling/models/table_structure_model.py

```diff
@@ -1,6 +1,6 @@
 import copy
 from pathlib import Path
-from typing import Iterable
+from typing import Iterable
 
 import numpy
 from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
@@ -8,8 +8,11 @@ from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
 from PIL import ImageDraw
 
 from docling.datamodel.base_models import Page, Table, TableStructurePrediction
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
+from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
+from docling.utils.profiling import TimeRecorder
 
 
 class TableStructureModel(BasePageModel):
@@ -35,7 +38,13 @@ class TableStructureModel(BasePageModel):
         self.tf_predictor = TFPredictor(self.tm_config)
         self.scale = 2.0  # Scale up table input images to 144 dpi
 
-    def draw_table_and_cells(
+    def draw_table_and_cells(
+        self,
+        conv_res: ConversionResult,
+        page: Page,
+        tbl_list: Iterable[Table],
+        show: bool = False,
+    ):
         assert page._backend is not None
 
         image = (
@@ -61,9 +70,21 @@ class TableStructureModel(BasePageModel):
                         fill="black",
                     )
 
-
+        if show:
+            image.show()
+        else:
+            out_path: Path = (
+                Path(settings.debug.debug_output_path)
+                / f"debug_{conv_res.input.file.stem}"
+            )
+            out_path.mkdir(parents=True, exist_ok=True)
+
+            out_file = out_path / f"table_struct_page_{page.page_no:05}.png"
+            image.save(str(out_file), format="png")
 
-    def __call__(
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
 
         if not self.enabled:
             yield from page_batch
@@ -74,98 +95,112 @@ class TableStructureModel(BasePageModel):
             if not page._backend.is_valid():
                 yield page
             else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                with TimeRecorder(conv_res, "table_structure"):
+
+                    assert page.predictions.layout is not None
+                    assert page.size is not None
+
+                    page.predictions.tablestructure = (
+                        TableStructurePrediction()
+                    )  # dummy
+
+                    in_tables = [
+                        (
+                            cluster,
+                            [
+                                round(cluster.bbox.l) * self.scale,
+                                round(cluster.bbox.t) * self.scale,
+                                round(cluster.bbox.r) * self.scale,
+                                round(cluster.bbox.b) * self.scale,
+                            ],
+                        )
+                        for cluster in page.predictions.layout.clusters
+                        if cluster.label == DocItemLabel.TABLE
+                    ]
+                    if not len(in_tables):
+                        yield page
+                        continue
+
+                    tokens = []
+                    for c in page.cells:
+                        for cluster, _ in in_tables:
+                            if c.bbox.area() > 0:
+                                if (
+                                    c.bbox.intersection_area_with(cluster.bbox)
+                                    / c.bbox.area()
+                                    > 0.2
+                                ):
+                                    # Only allow non empty stings (spaces) into the cells of a table
+                                    if len(c.text.strip()) > 0:
+                                        new_cell = copy.deepcopy(c)
+                                        new_cell.bbox = new_cell.bbox.scaled(
+                                            scale=self.scale
+                                        )
+
+                                        tokens.append(new_cell.model_dump())
+
+                    page_input = {
+                        "tokens": tokens,
+                        "width": page.size.width * self.scale,
+                        "height": page.size.height * self.scale,
+                    }
+                    page_input["image"] = numpy.asarray(
+                        page.get_image(scale=self.scale)
                     )
-                    for cluster in page.predictions.layout.clusters
-                    if cluster.label == DocItemLabel.TABLE
-                ]
-                if not len(in_tables):
-                    yield page
-                    continue
-
-                tokens = []
-                for c in page.cells:
-                    for cluster, _ in in_tables:
-                        if c.bbox.area() > 0:
-                            if (
-                                c.bbox.intersection_area_with(cluster.bbox)
-                                / c.bbox.area()
-                                > 0.2
-                            ):
-                                # Only allow non empty stings (spaces) into the cells of a table
-                                if len(c.text.strip()) > 0:
-                                    new_cell = copy.deepcopy(c)
-                                    new_cell.bbox = new_cell.bbox.scaled(
-                                        scale=self.scale
-                                    )
-
-                                    tokens.append(new_cell.model_dump())
 
-
-                    "tokens": tokens,
-                    "width": page.size.width * self.scale,
-                    "height": page.size.height * self.scale,
-                }
-                page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
+                    table_clusters, table_bboxes = zip(*in_tables)
 
-
-
-
-                tf_output = self.tf_predictor.multi_table_predict(
-                    page_input, table_bboxes, do_matching=self.do_cell_matching
-                )
-
-                for table_cluster, table_out in zip(table_clusters, tf_output):
-                    table_cells = []
-                    for element in table_out["tf_responses"]:
-
-                        if not self.do_cell_matching:
-                            the_bbox = BoundingBox.model_validate(
-                                element["bbox"]
-                            ).scaled(1 / self.scale)
-                            text_piece = page._backend.get_text_in_rect(the_bbox)
-                            element["bbox"]["token"] = text_piece
-
-                        tc = TableCell.model_validate(element)
-                        if self.do_cell_matching and tc.bbox is not None:
-                            tc.bbox = tc.bbox.scaled(1 / self.scale)
-                        table_cells.append(tc)
-
-                    # Retrieving cols/rows, after post processing:
-                    num_rows = table_out["predict_details"]["num_rows"]
-                    num_cols = table_out["predict_details"]["num_cols"]
-                    otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
-
-                    tbl = Table(
-                        otsl_seq=otsl_seq,
-                        table_cells=table_cells,
-                        num_rows=num_rows,
-                        num_cols=num_cols,
-                        id=table_cluster.id,
-                        page_no=page.page_no,
-                        cluster=table_cluster,
-                        label=DocItemLabel.TABLE,
+                    if len(table_bboxes):
+                        tf_output = self.tf_predictor.multi_table_predict(
+                            page_input, table_bboxes, do_matching=self.do_cell_matching
                         )
 
-
-
-
+                        for table_cluster, table_out in zip(table_clusters, tf_output):
+                            table_cells = []
+                            for element in table_out["tf_responses"]:
+
+                                if not self.do_cell_matching:
+                                    the_bbox = BoundingBox.model_validate(
+                                        element["bbox"]
+                                    ).scaled(1 / self.scale)
+                                    text_piece = page._backend.get_text_in_rect(
+                                        the_bbox
+                                    )
+                                    element["bbox"]["token"] = text_piece
+
+                                tc = TableCell.model_validate(element)
+                                if self.do_cell_matching and tc.bbox is not None:
+                                    tc.bbox = tc.bbox.scaled(1 / self.scale)
+                                table_cells.append(tc)
+
+                            # Retrieving cols/rows, after post processing:
+                            num_rows = table_out["predict_details"]["num_rows"]
+                            num_cols = table_out["predict_details"]["num_cols"]
+                            otsl_seq = table_out["predict_details"]["prediction"][
+                                "rs_seq"
+                            ]
+
+                            tbl = Table(
+                                otsl_seq=otsl_seq,
+                                table_cells=table_cells,
+                                num_rows=num_rows,
+                                num_cols=num_cols,
+                                id=table_cluster.id,
+                                page_no=page.page_no,
+                                cluster=table_cluster,
+                                label=DocItemLabel.TABLE,
+                            )
+
+                            page.predictions.tablestructure.table_map[
+                                table_cluster.id
+                            ] = tbl
 
                 # For debugging purposes:
-
+                if settings.debug.visualize_tables:
+                    self.draw_table_and_cells(
+                        conv_res,
+                        page,
+                        page.predictions.tablestructure.table_map.values(),
+                    )
 
                 yield page
```
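`TimeRecorder(conv_res, "table_structure")` comes from the new `docling/utils/profiling.py` (listed above with +62 lines), whose body is not shown in this diff. As a rough sketch of the pattern only — the class and field names below are assumptions, not docling's actual implementation:

```python
# Hedged sketch of a TimeRecorder-style scope; "timings" and the class name are
# made up for illustration and are NOT docling's profiling.py API.
import time
from collections import defaultdict


class SketchTimeRecorder:
    """Accumulate wall-clock time for a named scope onto a result object."""

    def __init__(self, conv_res, scope_name: str):
        self.conv_res = conv_res
        self.scope_name = scope_name

    def __enter__(self):
        self._start = time.monotonic()
        return self

    def __exit__(self, exc_type, exc, tb):
        elapsed = time.monotonic() - self._start
        if not hasattr(self.conv_res, "timings"):
            self.conv_res.timings = defaultdict(list)
        self.conv_res.timings[self.scope_name].append(elapsed)
        return False  # never swallow exceptions from the timed block
```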
docling/models/tesseract_ocr_cli_model.py

```diff
@@ -8,8 +8,11 @@ import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
 
 from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractCliOcrOptions
+from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.profiling import TimeRecorder
 
 _log = logging.getLogger(__name__)
 
@@ -102,7 +105,9 @@ class TesseractOcrCliModel(BaseOcrModel):
 
         return df_filtered
 
-    def __call__(
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
 
         if not self.enabled:
             yield from page_batch
@@ -113,62 +118,67 @@ class TesseractOcrCliModel(BaseOcrModel):
             if not page._backend.is_valid():
                 yield page
             else:
-
-
-                all_ocr_cells = []
-                for ocr_rect in ocr_rects:
-                    # Skip zero area boxes
-                    if ocr_rect.area() == 0:
-                        continue
-                    high_res_image = page._backend.get_page_image(
-                        scale=self.scale, cropbox=ocr_rect
-                    )
+                with TimeRecorder(conv_res, "ocr"):
 
-
-
-
-
-
-
-
-
-
-
-
-                    # Print relevant columns (bounding box and text)
-                    for ix, row in df.iterrows():
-                        text = row["text"]
-                        conf = row["conf"]
-
-                        l = float(row["left"])
-                        b = float(row["top"])
-                        w = float(row["width"])
-                        h = float(row["height"])
-
-                        t = b + h
-                        r = l + w
-
-                        cell = OcrCell(
-                            id=ix,
-                            text=text,
-                            confidence=conf / 100.0,
-                            bbox=BoundingBox.from_tuple(
-                                coord=(
-                                    (l / self.scale) + ocr_rect.l,
-                                    (b / self.scale) + ocr_rect.t,
-                                    (r / self.scale) + ocr_rect.l,
-                                    (t / self.scale) + ocr_rect.t,
-                                ),
-                                origin=CoordOrigin.TOPLEFT,
+                    ocr_rects = self.get_ocr_rects(page)
+
+                    all_ocr_cells = []
+                    for ocr_rect in ocr_rects:
+                        # Skip zero area boxes
+                        if ocr_rect.area() == 0:
+                            continue
+                        high_res_image = page._backend.get_page_image(
+                            scale=self.scale, cropbox=ocr_rect
                         )
-                        all_ocr_cells.append(cell)
 
-
-
+                        with tempfile.NamedTemporaryFile(
+                            suffix=".png", mode="w"
+                        ) as image_file:
+                            fname = image_file.name
+                            high_res_image.save(fname)
+
+                            df = self._run_tesseract(fname)
+
+                        # _log.info(df)
+
+                        # Print relevant columns (bounding box and text)
+                        for ix, row in df.iterrows():
+                            text = row["text"]
+                            conf = row["conf"]
+
+                            l = float(row["left"])
+                            b = float(row["top"])
+                            w = float(row["width"])
+                            h = float(row["height"])
+
+                            t = b + h
+                            r = l + w
+
+                            cell = OcrCell(
+                                id=ix,
+                                text=text,
+                                confidence=conf / 100.0,
+                                bbox=BoundingBox.from_tuple(
+                                    coord=(
+                                        (l / self.scale) + ocr_rect.l,
+                                        (b / self.scale) + ocr_rect.t,
+                                        (r / self.scale) + ocr_rect.l,
+                                        (t / self.scale) + ocr_rect.t,
+                                    ),
+                                    origin=CoordOrigin.TOPLEFT,
+                                ),
+                            )
+                            all_ocr_cells.append(cell)
+
+                    ## Remove OCR cells which overlap with programmatic cells.
+                    filtered_ocr_cells = self.filter_ocr_cells(
+                        all_ocr_cells, page.cells
+                    )
 
-
+                    page.cells.extend(filtered_ocr_cells)
 
                 # DEBUG code:
-
+                if settings.debug.visualize_ocr:
+                    self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
 
                 yield page
```
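`self._run_tesseract(fname)` is not part of this hunk; judging by the columns the loop reads (`left`, `top`, `width`, `height`, `conf`, `text`), it returns Tesseract's TSV output as a pandas DataFrame. A hedged stand-alone sketch of that kind of call — the exact flags docling passes are not visible in this diff:

```python
# Hedged sketch: run the tesseract CLI on a saved page image and parse its TSV
# output. This is NOT docling's _run_tesseract; flags and filtering are assumptions.
import io
import subprocess

import pandas as pd


def run_tesseract_tsv(image_path: str, lang: str = "eng") -> pd.DataFrame:
    proc = subprocess.run(
        ["tesseract", image_path, "stdout", "-l", lang, "tsv"],
        capture_output=True,
        text=True,
        check=True,
    )
    df = pd.read_csv(io.StringIO(proc.stdout), sep="\t", quoting=3)  # QUOTE_NONE
    # Word-level rows carry a real confidence; structural rows have conf == -1.
    return df[df["conf"] != -1]
```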
docling/models/tesseract_ocr_model.py

```diff
@@ -4,8 +4,11 @@ from typing import Iterable
 from docling_core.types.doc import BoundingBox, CoordOrigin
 
 from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractOcrOptions
+from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.profiling import TimeRecorder
 
 _log = logging.getLogger(__name__)
 
@@ -19,25 +22,37 @@ class TesseractOcrModel(BaseOcrModel):
         self.reader = None
 
         if self.enabled:
-
+            install_errmsg = (
                 "tesserocr is not correctly installed. "
                 "Please install it via `pip install tesserocr` to use this OCR engine. "
-                "Note that tesserocr might have to be manually compiled for working with"
+                "Note that tesserocr might have to be manually compiled for working with "
                 "your Tesseract installation. The Docling documentation provides examples for it. "
-                "Alternatively, Docling has support for other OCR engines. See the documentation
+                "Alternatively, Docling has support for other OCR engines. See the documentation: "
+                "https://ds4sd.github.io/docling/installation/"
             )
+            missing_langs_errmsg = (
+                "tesserocr is not correctly configured. No language models have been detected. "
+                "Please ensure that the TESSDATA_PREFIX envvar points to tesseract languages dir. "
+                "You can find more information how to setup other OCR engines in Docling "
+                "documentation: "
+                "https://ds4sd.github.io/docling/installation/"
+            )
+
             try:
                 import tesserocr
             except ImportError:
-                raise ImportError(
-
+                raise ImportError(install_errmsg)
             try:
                 tesseract_version = tesserocr.tesseract_version()
-                _log.debug("Initializing TesserOCR: %s", tesseract_version)
             except:
-                raise ImportError(
+                raise ImportError(install_errmsg)
+
+            _, tesserocr_languages = tesserocr.get_languages()
+            if not tesserocr_languages:
+                raise ImportError(missing_langs_errmsg)
 
             # Initialize the tesseractAPI
+            _log.debug("Initializing TesserOCR: %s", tesseract_version)
             lang = "+".join(self.options.lang)
             if self.options.path is not None:
                 self.reader = tesserocr.PyTessBaseAPI(
@@ -61,7 +76,9 @@ class TesseractOcrModel(BaseOcrModel):
         # Finalize the tesseractAPI
         self.reader.End()
 
-    def __call__(
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
 
         if not self.enabled:
             yield from page_batch
@@ -72,59 +89,66 @@ class TesseractOcrModel(BaseOcrModel):
             if not page._backend.is_valid():
                 yield page
             else:
-
+                with TimeRecorder(conv_res, "ocr"):
 
-
+                    assert self.reader is not None
 
-
-                for ocr_rect in ocr_rects:
-                    # Skip zero area boxes
-                    if ocr_rect.area() == 0:
-                        continue
-                    high_res_image = page._backend.get_page_image(
-                        scale=self.scale, cropbox=ocr_rect
-                    )
+                    ocr_rects = self.get_ocr_rects(page)
 
-
-
-
-
-
+                    all_ocr_cells = []
+                    for ocr_rect in ocr_rects:
+                        # Skip zero area boxes
+                        if ocr_rect.area() == 0:
+                            continue
+                        high_res_image = page._backend.get_page_image(
+                            scale=self.scale, cropbox=ocr_rect
+                        )
 
-
-
-
-
-
-                        # Extract text within the bounding box
-                        text = self.reader.GetUTF8Text().strip()
-                        confidence = self.reader.MeanTextConf()
-                        left = box["x"] / self.scale
-                        bottom = box["y"] / self.scale
-                        right = (box["x"] + box["w"]) / self.scale
-                        top = (box["y"] + box["h"]) / self.scale
-
-                        cells.append(
-                            OcrCell(
-                                id=ix,
-                                text=text,
-                                confidence=confidence,
-                                bbox=BoundingBox.from_tuple(
-                                    coord=(left, top, right, bottom),
-                                    origin=CoordOrigin.TOPLEFT,
-                                ),
-                            )
+                        # Retrieve text snippets with their bounding boxes
+                        self.reader.SetImage(high_res_image)
+                        boxes = self.reader.GetComponentImages(
+                            self.reader_RIL.TEXTLINE, True
                         )
 
-
-
+                        cells = []
+                        for ix, (im, box, _, _) in enumerate(boxes):
+                            # Set the area of interest. Tesseract uses Bottom-Left for the origin
+                            self.reader.SetRectangle(
+                                box["x"], box["y"], box["w"], box["h"]
+                            )
+
+                            # Extract text within the bounding box
+                            text = self.reader.GetUTF8Text().strip()
+                            confidence = self.reader.MeanTextConf()
+                            left = box["x"] / self.scale
+                            bottom = box["y"] / self.scale
+                            right = (box["x"] + box["w"]) / self.scale
+                            top = (box["y"] + box["h"]) / self.scale
+
+                            cells.append(
+                                OcrCell(
+                                    id=ix,
+                                    text=text,
+                                    confidence=confidence,
+                                    bbox=BoundingBox.from_tuple(
+                                        coord=(left, top, right, bottom),
+                                        origin=CoordOrigin.TOPLEFT,
+                                    ),
+                                )
+                            )
+
+                        # del high_res_image
+                        all_ocr_cells.extend(cells)
 
-
-
+                    ## Remove OCR cells which overlap with programmatic cells.
+                    filtered_ocr_cells = self.filter_ocr_cells(
+                        all_ocr_cells, page.cells
+                    )
 
-
+                    page.cells.extend(filtered_ocr_cells)
 
                 # DEBUG code:
-
+                if settings.debug.visualize_ocr:
+                    self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
 
                 yield page
```
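The new `tesserocr.get_languages()` guard fails fast with an explicit `ImportError` that points at `TESSDATA_PREFIX` when no language models are found. The same check in isolation (error wording paraphrased, not docling's exact message):

```python
# Hedged sketch of the language-availability check added above; the tesserocr
# calls are the ones used in the diff, the error text is paraphrased.
import tesserocr

tessdata_path, languages = tesserocr.get_languages()
if not languages:
    raise ImportError(
        "tesserocr found no language models; point TESSDATA_PREFIX at your "
        "tessdata directory (see https://ds4sd.github.io/docling/installation/)."
    )
print(f"tessdata: {tessdata_path}, available languages: {languages}")
```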