docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +33 -37
- docling/backend/asciidoc_backend.py +431 -0
- docling/backend/docling_parse_backend.py +20 -16
- docling/backend/docling_parse_v2_backend.py +248 -0
- docling/backend/html_backend.py +429 -0
- docling/backend/md_backend.py +346 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +496 -0
- docling/backend/pdf_backend.py +78 -0
- docling/backend/pypdfium2_backend.py +16 -11
- docling/cli/main.py +96 -65
- docling/datamodel/base_models.py +79 -193
- docling/datamodel/document.py +405 -320
- docling/datamodel/pipeline_options.py +19 -3
- docling/datamodel/settings.py +16 -1
- docling/document_converter.py +240 -251
- docling/models/base_model.py +28 -0
- docling/models/base_ocr_model.py +40 -10
- docling/models/ds_glm_model.py +244 -30
- docling/models/easyocr_model.py +57 -42
- docling/models/layout_model.py +158 -116
- docling/models/page_assemble_model.py +127 -101
- docling/models/page_preprocessing_model.py +79 -0
- docling/models/table_structure_model.py +162 -116
- docling/models/tesseract_ocr_cli_model.py +76 -59
- docling/models/tesseract_ocr_model.py +90 -58
- docling/pipeline/base_pipeline.py +189 -0
- docling/pipeline/simple_pipeline.py +56 -0
- docling/pipeline/standard_pdf_pipeline.py +201 -0
- docling/utils/export.py +4 -3
- docling/utils/layout_utils.py +17 -11
- docling/utils/profiling.py +62 -0
- docling-2.4.1.dist-info/METADATA +154 -0
- docling-2.4.1.dist-info/RECORD +45 -0
- docling/pipeline/base_model_pipeline.py +0 -18
- docling/pipeline/standard_model_pipeline.py +0 -66
- docling-1.19.1.dist-info/METADATA +0 -380
- docling-1.19.1.dist-info/RECORD +0 -34
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
docling/models/base_ocr_model.py
CHANGED
@@ -1,27 +1,33 @@
|
|
1
1
|
import copy
|
2
2
|
import logging
|
3
3
|
from abc import abstractmethod
|
4
|
-
from
|
4
|
+
from pathlib import Path
|
5
|
+
from typing import Iterable, List
|
5
6
|
|
6
7
|
import numpy as np
|
8
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
7
9
|
from PIL import Image, ImageDraw
|
8
10
|
from rtree import index
|
9
11
|
from scipy.ndimage import find_objects, label
|
10
12
|
|
11
|
-
from docling.datamodel.base_models import
|
13
|
+
from docling.datamodel.base_models import OcrCell, Page
|
14
|
+
from docling.datamodel.document import ConversionResult
|
12
15
|
from docling.datamodel.pipeline_options import OcrOptions
|
16
|
+
from docling.datamodel.settings import settings
|
17
|
+
from docling.models.base_model import BasePageModel
|
13
18
|
|
14
19
|
_log = logging.getLogger(__name__)
|
15
20
|
|
16
21
|
|
17
|
-
class BaseOcrModel:
|
22
|
+
class BaseOcrModel(BasePageModel):
|
18
23
|
def __init__(self, enabled: bool, options: OcrOptions):
|
19
24
|
self.enabled = enabled
|
20
25
|
self.options = options
|
21
26
|
|
22
27
|
# Computes the optimum amount and coordinates of rectangles to OCR on a given page
|
23
|
-
def get_ocr_rects(self, page: Page) ->
|
28
|
+
def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
|
24
29
|
BITMAP_COVERAGE_TRESHOLD = 0.75
|
30
|
+
assert page.size is not None
|
25
31
|
|
26
32
|
def find_ocr_rects(size, bitmap_rects):
|
27
33
|
image = Image.new(
|
@@ -60,11 +66,14 @@ class BaseOcrModel:
|
|
60
66
|
|
61
67
|
return (area_frac, bounding_boxes) # fraction covered # boxes
|
62
68
|
|
63
|
-
|
69
|
+
if page._backend is not None:
|
70
|
+
bitmap_rects = page._backend.get_bitmap_rects()
|
71
|
+
else:
|
72
|
+
bitmap_rects = []
|
64
73
|
coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
|
65
74
|
|
66
75
|
# return full-page rectangle if sufficiently covered with bitmaps
|
67
|
-
if coverage > BITMAP_COVERAGE_TRESHOLD:
|
76
|
+
if coverage > max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold):
|
68
77
|
return [
|
69
78
|
BoundingBox(
|
70
79
|
l=0,
|
@@ -75,7 +84,15 @@ class BaseOcrModel:
|
|
75
84
|
)
|
76
85
|
]
|
77
86
|
# return individual rectangles if the bitmap coverage is smaller
|
78
|
-
|
87
|
+
else: # coverage <= BITMAP_COVERAGE_TRESHOLD:
|
88
|
+
|
89
|
+
# skip OCR if the bitmap area on the page is smaller than the options threshold
|
90
|
+
ocr_rects = [
|
91
|
+
rect
|
92
|
+
for rect in ocr_rects
|
93
|
+
if rect.area() / (page.size.width * page.size.height)
|
94
|
+
> self.options.bitmap_area_threshold
|
95
|
+
]
|
79
96
|
return ocr_rects
|
80
97
|
|
81
98
|
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
|
@@ -100,7 +117,7 @@ class BaseOcrModel:
|
|
100
117
|
]
|
101
118
|
return filtered_ocr_cells
|
102
119
|
|
103
|
-
def draw_ocr_rects_and_cells(self, page, ocr_rects):
|
120
|
+
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
|
104
121
|
image = copy.deepcopy(page.image)
|
105
122
|
draw = ImageDraw.Draw(image, "RGBA")
|
106
123
|
|
@@ -117,8 +134,21 @@ class BaseOcrModel:
|
|
117
134
|
if isinstance(tc, OcrCell):
|
118
135
|
color = "magenta"
|
119
136
|
draw.rectangle([(x0, y0), (x1, y1)], outline=color)
|
120
|
-
|
137
|
+
|
138
|
+
if show:
|
139
|
+
image.show()
|
140
|
+
else:
|
141
|
+
out_path: Path = (
|
142
|
+
Path(settings.debug.debug_output_path)
|
143
|
+
/ f"debug_{conv_res.input.file.stem}"
|
144
|
+
)
|
145
|
+
out_path.mkdir(parents=True, exist_ok=True)
|
146
|
+
|
147
|
+
out_file = out_path / f"ocr_page_{page.page_no:05}.png"
|
148
|
+
image.save(str(out_file), format="png")
|
121
149
|
|
122
150
|
@abstractmethod
|
123
|
-
def __call__(
|
151
|
+
def __call__(
|
152
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
153
|
+
) -> Iterable[Page]:
|
124
154
|
pass
|
docling/models/ds_glm_model.py
CHANGED
@@ -1,54 +1,256 @@
|
|
1
1
|
import copy
|
2
2
|
import random
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import List, Union
|
3
5
|
|
4
6
|
from deepsearch_glm.nlp_utils import init_nlp_model
|
5
|
-
from deepsearch_glm.utils.doc_utils import
|
7
|
+
from deepsearch_glm.utils.doc_utils import to_docling_document
|
6
8
|
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
|
7
|
-
from docling_core.types import
|
8
|
-
from docling_core.types import
|
9
|
-
from docling_core.types import
|
9
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
|
10
|
+
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
|
11
|
+
from docling_core.types.legacy_doc.base import (
|
12
|
+
Figure,
|
13
|
+
PageDimensions,
|
14
|
+
PageReference,
|
15
|
+
Prov,
|
16
|
+
Ref,
|
17
|
+
)
|
18
|
+
from docling_core.types.legacy_doc.base import Table as DsSchemaTable
|
19
|
+
from docling_core.types.legacy_doc.base import TableCell
|
20
|
+
from docling_core.types.legacy_doc.document import BaseText
|
21
|
+
from docling_core.types.legacy_doc.document import (
|
22
|
+
CCSDocumentDescription as DsDocumentDescription,
|
23
|
+
)
|
24
|
+
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
25
|
+
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
10
26
|
from PIL import ImageDraw
|
27
|
+
from pydantic import BaseModel, ConfigDict
|
11
28
|
|
12
|
-
from docling.datamodel.base_models import
|
13
|
-
from docling.datamodel.document import ConversionResult
|
29
|
+
from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
|
30
|
+
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
|
31
|
+
from docling.datamodel.settings import settings
|
32
|
+
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
33
|
+
from docling.utils.utils import create_hash
|
34
|
+
|
35
|
+
|
36
|
+
class GlmOptions(BaseModel):
|
37
|
+
model_config = ConfigDict(protected_namespaces=())
|
38
|
+
|
39
|
+
model_names: str = "" # e.g. "language;term;reference"
|
14
40
|
|
15
41
|
|
16
42
|
class GlmModel:
|
17
|
-
def __init__(self,
|
18
|
-
self.
|
19
|
-
|
20
|
-
"model_names", ""
|
21
|
-
) # "language;term;reference"
|
43
|
+
def __init__(self, options: GlmOptions):
|
44
|
+
self.options = options
|
45
|
+
|
22
46
|
load_pretrained_nlp_models()
|
23
|
-
|
24
|
-
|
25
|
-
|
47
|
+
self.model = init_nlp_model(model_names=self.options.model_names)
|
48
|
+
|
49
|
+
def _to_legacy_document(self, conv_res) -> DsDocument:
|
50
|
+
title = ""
|
51
|
+
desc: DsDocumentDescription = DsDocumentDescription(logs=[])
|
52
|
+
|
53
|
+
page_hashes = [
|
54
|
+
PageReference(
|
55
|
+
hash=create_hash(conv_res.input.document_hash + ":" + str(p.page_no)),
|
56
|
+
page=p.page_no + 1,
|
57
|
+
model="default",
|
58
|
+
)
|
59
|
+
for p in conv_res.pages
|
60
|
+
]
|
61
|
+
|
62
|
+
file_info = DsFileInfoObject(
|
63
|
+
filename=conv_res.input.file.name,
|
64
|
+
document_hash=conv_res.input.document_hash,
|
65
|
+
num_pages=conv_res.input.page_count,
|
66
|
+
page_hashes=page_hashes,
|
67
|
+
)
|
68
|
+
|
69
|
+
main_text: List[Union[Ref, BaseText]] = []
|
70
|
+
tables: List[DsSchemaTable] = []
|
71
|
+
figures: List[Figure] = []
|
72
|
+
|
73
|
+
page_no_to_page = {p.page_no: p for p in conv_res.pages}
|
74
|
+
|
75
|
+
for element in conv_res.assembled.elements:
|
76
|
+
# Convert bboxes to lower-left origin.
|
77
|
+
target_bbox = DsBoundingBox(
|
78
|
+
element.cluster.bbox.to_bottom_left_origin(
|
79
|
+
page_no_to_page[element.page_no].size.height
|
80
|
+
).as_tuple()
|
81
|
+
)
|
82
|
+
|
83
|
+
if isinstance(element, TextElement):
|
84
|
+
main_text.append(
|
85
|
+
BaseText(
|
86
|
+
text=element.text,
|
87
|
+
obj_type=layout_label_to_ds_type.get(element.label),
|
88
|
+
name=element.label,
|
89
|
+
prov=[
|
90
|
+
Prov(
|
91
|
+
bbox=target_bbox,
|
92
|
+
page=element.page_no + 1,
|
93
|
+
span=[0, len(element.text)],
|
94
|
+
)
|
95
|
+
],
|
96
|
+
)
|
97
|
+
)
|
98
|
+
elif isinstance(element, Table):
|
99
|
+
index = len(tables)
|
100
|
+
ref_str = f"#/tables/{index}"
|
101
|
+
main_text.append(
|
102
|
+
Ref(
|
103
|
+
name=element.label,
|
104
|
+
obj_type=layout_label_to_ds_type.get(element.label),
|
105
|
+
ref=ref_str,
|
106
|
+
),
|
107
|
+
)
|
108
|
+
|
109
|
+
# Initialise empty table data grid (only empty cells)
|
110
|
+
table_data = [
|
111
|
+
[
|
112
|
+
TableCell(
|
113
|
+
text="",
|
114
|
+
# bbox=[0,0,0,0],
|
115
|
+
spans=[[i, j]],
|
116
|
+
obj_type="body",
|
117
|
+
)
|
118
|
+
for j in range(element.num_cols)
|
119
|
+
]
|
120
|
+
for i in range(element.num_rows)
|
121
|
+
]
|
26
122
|
|
27
|
-
|
28
|
-
|
29
|
-
|
123
|
+
# Overwrite cells in table data for which there is actual cell content.
|
124
|
+
for cell in element.table_cells:
|
125
|
+
for i in range(
|
126
|
+
min(cell.start_row_offset_idx, element.num_rows),
|
127
|
+
min(cell.end_row_offset_idx, element.num_rows),
|
128
|
+
):
|
129
|
+
for j in range(
|
130
|
+
min(cell.start_col_offset_idx, element.num_cols),
|
131
|
+
min(cell.end_col_offset_idx, element.num_cols),
|
132
|
+
):
|
133
|
+
celltype = "body"
|
134
|
+
if cell.column_header:
|
135
|
+
celltype = "col_header"
|
136
|
+
elif cell.row_header:
|
137
|
+
celltype = "row_header"
|
138
|
+
elif cell.row_section:
|
139
|
+
celltype = "row_section"
|
30
140
|
|
31
|
-
|
32
|
-
|
33
|
-
|
141
|
+
def make_spans(cell):
|
142
|
+
for rspan in range(
|
143
|
+
min(cell.start_row_offset_idx, element.num_rows),
|
144
|
+
min(cell.end_row_offset_idx, element.num_rows),
|
145
|
+
):
|
146
|
+
for cspan in range(
|
147
|
+
min(
|
148
|
+
cell.start_col_offset_idx, element.num_cols
|
149
|
+
),
|
150
|
+
min(cell.end_col_offset_idx, element.num_cols),
|
151
|
+
):
|
152
|
+
yield [rspan, cspan]
|
153
|
+
|
154
|
+
spans = list(make_spans(cell))
|
155
|
+
if cell.bbox is not None:
|
156
|
+
bbox = cell.bbox.to_bottom_left_origin(
|
157
|
+
page_no_to_page[element.page_no].size.height
|
158
|
+
).as_tuple()
|
159
|
+
else:
|
160
|
+
bbox = None
|
161
|
+
|
162
|
+
table_data[i][j] = TableCell(
|
163
|
+
text=cell.text,
|
164
|
+
bbox=bbox,
|
165
|
+
# col=j,
|
166
|
+
# row=i,
|
167
|
+
spans=spans,
|
168
|
+
obj_type=celltype,
|
169
|
+
# col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
|
170
|
+
# row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
|
171
|
+
)
|
172
|
+
|
173
|
+
tables.append(
|
174
|
+
DsSchemaTable(
|
175
|
+
num_cols=element.num_cols,
|
176
|
+
num_rows=element.num_rows,
|
177
|
+
obj_type=layout_label_to_ds_type.get(element.label),
|
178
|
+
data=table_data,
|
179
|
+
prov=[
|
180
|
+
Prov(
|
181
|
+
bbox=target_bbox,
|
182
|
+
page=element.page_no + 1,
|
183
|
+
span=[0, 0],
|
184
|
+
)
|
185
|
+
],
|
186
|
+
)
|
187
|
+
)
|
188
|
+
|
189
|
+
elif isinstance(element, FigureElement):
|
190
|
+
index = len(figures)
|
191
|
+
ref_str = f"#/figures/{index}"
|
192
|
+
main_text.append(
|
193
|
+
Ref(
|
194
|
+
name=element.label,
|
195
|
+
obj_type=layout_label_to_ds_type.get(element.label),
|
196
|
+
ref=ref_str,
|
197
|
+
),
|
198
|
+
)
|
199
|
+
figures.append(
|
200
|
+
Figure(
|
201
|
+
prov=[
|
202
|
+
Prov(
|
203
|
+
bbox=target_bbox,
|
204
|
+
page=element.page_no + 1,
|
205
|
+
span=[0, 0],
|
206
|
+
)
|
207
|
+
],
|
208
|
+
obj_type=layout_label_to_ds_type.get(element.label),
|
209
|
+
# data=[[]],
|
210
|
+
)
|
211
|
+
)
|
212
|
+
|
213
|
+
page_dimensions = [
|
214
|
+
PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
|
215
|
+
for p in conv_res.pages
|
216
|
+
if p.size is not None
|
217
|
+
]
|
218
|
+
|
219
|
+
ds_doc: DsDocument = DsDocument(
|
220
|
+
name=title,
|
221
|
+
description=desc,
|
222
|
+
file_info=file_info,
|
223
|
+
main_text=main_text,
|
224
|
+
tables=tables,
|
225
|
+
figures=figures,
|
226
|
+
page_dimensions=page_dimensions,
|
34
227
|
)
|
35
228
|
|
36
|
-
|
229
|
+
return ds_doc
|
230
|
+
|
231
|
+
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
|
232
|
+
with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
|
233
|
+
ds_doc = self._to_legacy_document(conv_res)
|
234
|
+
ds_doc_dict = ds_doc.model_dump(by_alias=True)
|
235
|
+
|
236
|
+
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
237
|
+
|
238
|
+
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
37
239
|
|
38
240
|
# DEBUG code:
|
39
|
-
def draw_clusters_and_cells(ds_document, page_no):
|
241
|
+
def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
|
40
242
|
clusters_to_draw = []
|
41
243
|
image = copy.deepcopy(conv_res.pages[page_no].image)
|
42
244
|
for ix, elem in enumerate(ds_document.main_text):
|
43
245
|
if isinstance(elem, BaseText):
|
44
|
-
prov = elem.prov[0]
|
246
|
+
prov = elem.prov[0] # type: ignore
|
45
247
|
elif isinstance(elem, Ref):
|
46
248
|
_, arr, index = elem.ref.split("/")
|
47
|
-
index = int(index)
|
249
|
+
index = int(index) # type: ignore
|
48
250
|
if arr == "tables":
|
49
251
|
prov = ds_document.tables[index].prov[0]
|
50
252
|
elif arr == "figures":
|
51
|
-
prov = ds_document.
|
253
|
+
prov = ds_document.pictures[index].prov[0]
|
52
254
|
else:
|
53
255
|
prov = None
|
54
256
|
|
@@ -58,7 +260,7 @@ class GlmModel:
|
|
58
260
|
id=ix,
|
59
261
|
label=elem.name,
|
60
262
|
bbox=BoundingBox.from_tuple(
|
61
|
-
coord=prov.bbox,
|
263
|
+
coord=prov.bbox, # type: ignore
|
62
264
|
origin=CoordOrigin.BOTTOMLEFT,
|
63
265
|
).to_top_left_origin(conv_res.pages[page_no].size.height),
|
64
266
|
)
|
@@ -78,9 +280,21 @@ class GlmModel:
|
|
78
280
|
for tc in c.cells: # [:1]:
|
79
281
|
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
80
282
|
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
81
|
-
image.show()
|
82
283
|
|
83
|
-
|
84
|
-
|
284
|
+
if show:
|
285
|
+
image.show()
|
286
|
+
else:
|
287
|
+
out_path: Path = (
|
288
|
+
Path(settings.debug.debug_output_path)
|
289
|
+
/ f"debug_{conv_res.input.file.stem}"
|
290
|
+
)
|
291
|
+
out_path.mkdir(parents=True, exist_ok=True)
|
292
|
+
|
293
|
+
out_file = out_path / f"doc_page_{page_no:05}.png"
|
294
|
+
image.save(str(out_file), format="png")
|
295
|
+
|
296
|
+
# for item in ds_doc.page_dimensions:
|
297
|
+
# page_no = item.page
|
298
|
+
# draw_clusters_and_cells(ds_doc, page_no)
|
85
299
|
|
86
|
-
return
|
300
|
+
return docling_doc
|
docling/models/easyocr_model.py
CHANGED
@@ -2,10 +2,14 @@ import logging
|
|
2
2
|
from typing import Iterable
|
3
3
|
|
4
4
|
import numpy
|
5
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
5
6
|
|
6
|
-
from docling.datamodel.base_models import
|
7
|
+
from docling.datamodel.base_models import OcrCell, Page
|
8
|
+
from docling.datamodel.document import ConversionResult
|
7
9
|
from docling.datamodel.pipeline_options import EasyOcrOptions
|
10
|
+
from docling.datamodel.settings import settings
|
8
11
|
from docling.models.base_ocr_model import BaseOcrModel
|
12
|
+
from docling.utils.profiling import TimeRecorder
|
9
13
|
|
10
14
|
_log = logging.getLogger(__name__)
|
11
15
|
|
@@ -32,54 +36,65 @@ class EasyOcrModel(BaseOcrModel):
|
|
32
36
|
download_enabled=self.options.download_enabled,
|
33
37
|
)
|
34
38
|
|
35
|
-
def __call__(
|
39
|
+
def __call__(
|
40
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
41
|
+
) -> Iterable[Page]:
|
36
42
|
|
37
43
|
if not self.enabled:
|
38
44
|
yield from page_batch
|
39
45
|
return
|
40
46
|
|
41
47
|
for page in page_batch:
|
42
|
-
ocr_rects = self.get_ocr_rects(page)
|
43
|
-
|
44
|
-
all_ocr_cells = []
|
45
|
-
for ocr_rect in ocr_rects:
|
46
|
-
# Skip zero area boxes
|
47
|
-
if ocr_rect.area() == 0:
|
48
|
-
continue
|
49
|
-
high_res_image = page._backend.get_page_image(
|
50
|
-
scale=self.scale, cropbox=ocr_rect
|
51
|
-
)
|
52
|
-
im = numpy.array(high_res_image)
|
53
|
-
result = self.reader.readtext(im)
|
54
|
-
|
55
|
-
del high_res_image
|
56
|
-
del im
|
57
|
-
|
58
|
-
cells = [
|
59
|
-
OcrCell(
|
60
|
-
id=ix,
|
61
|
-
text=line[1],
|
62
|
-
confidence=line[2],
|
63
|
-
bbox=BoundingBox.from_tuple(
|
64
|
-
coord=(
|
65
|
-
(line[0][0][0] / self.scale) + ocr_rect.l,
|
66
|
-
(line[0][0][1] / self.scale) + ocr_rect.t,
|
67
|
-
(line[0][2][0] / self.scale) + ocr_rect.l,
|
68
|
-
(line[0][2][1] / self.scale) + ocr_rect.t,
|
69
|
-
),
|
70
|
-
origin=CoordOrigin.TOPLEFT,
|
71
|
-
),
|
72
|
-
)
|
73
|
-
for ix, line in enumerate(result)
|
74
|
-
]
|
75
|
-
all_ocr_cells.extend(cells)
|
76
48
|
|
77
|
-
|
78
|
-
|
49
|
+
assert page._backend is not None
|
50
|
+
if not page._backend.is_valid():
|
51
|
+
yield page
|
52
|
+
else:
|
53
|
+
with TimeRecorder(conv_res, "ocr"):
|
54
|
+
ocr_rects = self.get_ocr_rects(page)
|
55
|
+
|
56
|
+
all_ocr_cells = []
|
57
|
+
for ocr_rect in ocr_rects:
|
58
|
+
# Skip zero area boxes
|
59
|
+
if ocr_rect.area() == 0:
|
60
|
+
continue
|
61
|
+
high_res_image = page._backend.get_page_image(
|
62
|
+
scale=self.scale, cropbox=ocr_rect
|
63
|
+
)
|
64
|
+
im = numpy.array(high_res_image)
|
65
|
+
result = self.reader.readtext(im)
|
66
|
+
|
67
|
+
del high_res_image
|
68
|
+
del im
|
69
|
+
|
70
|
+
cells = [
|
71
|
+
OcrCell(
|
72
|
+
id=ix,
|
73
|
+
text=line[1],
|
74
|
+
confidence=line[2],
|
75
|
+
bbox=BoundingBox.from_tuple(
|
76
|
+
coord=(
|
77
|
+
(line[0][0][0] / self.scale) + ocr_rect.l,
|
78
|
+
(line[0][0][1] / self.scale) + ocr_rect.t,
|
79
|
+
(line[0][2][0] / self.scale) + ocr_rect.l,
|
80
|
+
(line[0][2][1] / self.scale) + ocr_rect.t,
|
81
|
+
),
|
82
|
+
origin=CoordOrigin.TOPLEFT,
|
83
|
+
),
|
84
|
+
)
|
85
|
+
for ix, line in enumerate(result)
|
86
|
+
]
|
87
|
+
all_ocr_cells.extend(cells)
|
88
|
+
|
89
|
+
## Remove OCR cells which overlap with programmatic cells.
|
90
|
+
filtered_ocr_cells = self.filter_ocr_cells(
|
91
|
+
all_ocr_cells, page.cells
|
92
|
+
)
|
79
93
|
|
80
|
-
|
94
|
+
page.cells.extend(filtered_ocr_cells)
|
81
95
|
|
82
|
-
|
83
|
-
|
96
|
+
# DEBUG code:
|
97
|
+
if settings.debug.visualize_ocr:
|
98
|
+
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
84
99
|
|
85
|
-
|
100
|
+
yield page
|