python-doctr 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctr/__init__.py +1 -1
- doctr/contrib/__init__.py +0 -0
- doctr/contrib/artefacts.py +131 -0
- doctr/contrib/base.py +105 -0
- doctr/datasets/datasets/pytorch.py +2 -2
- doctr/datasets/generator/base.py +6 -5
- doctr/datasets/imgur5k.py +1 -1
- doctr/datasets/loader.py +1 -6
- doctr/datasets/utils.py +2 -1
- doctr/datasets/vocabs.py +9 -2
- doctr/file_utils.py +26 -12
- doctr/io/elements.py +40 -6
- doctr/io/html.py +2 -2
- doctr/io/image/pytorch.py +6 -8
- doctr/io/image/tensorflow.py +1 -1
- doctr/io/pdf.py +5 -2
- doctr/io/reader.py +6 -0
- doctr/models/__init__.py +0 -1
- doctr/models/_utils.py +57 -20
- doctr/models/builder.py +71 -13
- doctr/models/classification/mobilenet/pytorch.py +45 -9
- doctr/models/classification/mobilenet/tensorflow.py +38 -7
- doctr/models/classification/predictor/pytorch.py +18 -11
- doctr/models/classification/predictor/tensorflow.py +16 -10
- doctr/models/classification/textnet/pytorch.py +3 -3
- doctr/models/classification/textnet/tensorflow.py +3 -3
- doctr/models/classification/zoo.py +39 -15
- doctr/models/detection/__init__.py +1 -0
- doctr/models/detection/_utils/__init__.py +1 -0
- doctr/models/detection/_utils/base.py +66 -0
- doctr/models/detection/differentiable_binarization/base.py +4 -3
- doctr/models/detection/differentiable_binarization/pytorch.py +2 -2
- doctr/models/detection/differentiable_binarization/tensorflow.py +14 -18
- doctr/models/detection/fast/__init__.py +6 -0
- doctr/models/detection/fast/base.py +257 -0
- doctr/models/detection/fast/pytorch.py +442 -0
- doctr/models/detection/fast/tensorflow.py +428 -0
- doctr/models/detection/linknet/base.py +4 -3
- doctr/models/detection/predictor/pytorch.py +15 -1
- doctr/models/detection/predictor/tensorflow.py +15 -1
- doctr/models/detection/zoo.py +21 -4
- doctr/models/factory/hub.py +3 -12
- doctr/models/kie_predictor/base.py +9 -3
- doctr/models/kie_predictor/pytorch.py +41 -20
- doctr/models/kie_predictor/tensorflow.py +36 -16
- doctr/models/modules/layers/pytorch.py +89 -10
- doctr/models/modules/layers/tensorflow.py +88 -10
- doctr/models/modules/transformer/pytorch.py +2 -2
- doctr/models/predictor/base.py +77 -50
- doctr/models/predictor/pytorch.py +31 -20
- doctr/models/predictor/tensorflow.py +27 -17
- doctr/models/preprocessor/pytorch.py +4 -4
- doctr/models/preprocessor/tensorflow.py +3 -2
- doctr/models/recognition/master/pytorch.py +2 -2
- doctr/models/recognition/parseq/pytorch.py +4 -3
- doctr/models/recognition/parseq/tensorflow.py +4 -3
- doctr/models/recognition/sar/pytorch.py +7 -6
- doctr/models/recognition/sar/tensorflow.py +3 -9
- doctr/models/recognition/vitstr/pytorch.py +1 -1
- doctr/models/recognition/zoo.py +1 -1
- doctr/models/zoo.py +2 -2
- doctr/py.typed +0 -0
- doctr/transforms/functional/base.py +1 -1
- doctr/transforms/functional/pytorch.py +4 -4
- doctr/transforms/modules/base.py +37 -15
- doctr/transforms/modules/pytorch.py +66 -8
- doctr/transforms/modules/tensorflow.py +63 -7
- doctr/utils/fonts.py +7 -5
- doctr/utils/geometry.py +35 -12
- doctr/utils/metrics.py +33 -174
- doctr/utils/reconstitution.py +126 -0
- doctr/utils/visualization.py +5 -118
- doctr/version.py +1 -1
- {python_doctr-0.8.0.dist-info → python_doctr-0.9.0.dist-info}/METADATA +96 -91
- {python_doctr-0.8.0.dist-info → python_doctr-0.9.0.dist-info}/RECORD +79 -75
- {python_doctr-0.8.0.dist-info → python_doctr-0.9.0.dist-info}/WHEEL +1 -1
- doctr/models/artefacts/__init__.py +0 -2
- doctr/models/artefacts/barcode.py +0 -74
- doctr/models/artefacts/face.py +0 -63
- doctr/models/obj_detection/__init__.py +0 -1
- doctr/models/obj_detection/faster_rcnn/__init__.py +0 -4
- doctr/models/obj_detection/faster_rcnn/pytorch.py +0 -81
- {python_doctr-0.8.0.dist-info → python_doctr-0.9.0.dist-info}/LICENSE +0 -0
- {python_doctr-0.8.0.dist-info → python_doctr-0.9.0.dist-info}/top_level.txt +0 -0
- {python_doctr-0.8.0.dist-info → python_doctr-0.9.0.dist-info}/zip-safe +0 -0
doctr/utils/metrics.py
CHANGED
|
@@ -5,16 +5,14 @@
|
|
|
5
5
|
|
|
6
6
|
from typing import Dict, List, Optional, Tuple
|
|
7
7
|
|
|
8
|
-
import cv2
|
|
9
8
|
import numpy as np
|
|
9
|
+
from anyascii import anyascii
|
|
10
10
|
from scipy.optimize import linear_sum_assignment
|
|
11
|
-
from unidecode import unidecode
|
|
11
|
+
from shapely.geometry import Polygon
|
|
12
12
|
|
|
13
13
|
__all__ = [
|
|
14
14
|
"TextMatch",
|
|
15
15
|
"box_iou",
|
|
16
|
-
"box_ioa",
|
|
17
|
-
"mask_iou",
|
|
18
16
|
"polygon_iou",
|
|
19
17
|
"nms",
|
|
20
18
|
"LocalizationConfusion",
|
|
@@ -34,16 +32,16 @@ def string_match(word1: str, word2: str) -> Tuple[bool, bool, bool, bool]:
|
|
|
34
32
|
Returns:
|
|
35
33
|
-------
|
|
36
34
|
a tuple with booleans specifying respectively whether the raw strings, their lower-case counterparts, their
|
|
37
|
-
unidecode counterparts and their lower-case unidecode counterparts match
|
|
35
|
+
anyascii counterparts and their lower-case anyascii counterparts match
|
|
38
36
|
"""
|
|
39
37
|
raw_match = word1 == word2
|
|
40
38
|
caseless_match = word1.lower() == word2.lower()
|
|
41
|
-
unidecode_match = unidecode(word1) == unidecode(word2)
|
|
39
|
+
anyascii_match = anyascii(word1) == anyascii(word2)
|
|
42
40
|
|
|
43
41
|
# Warning: the order is important here otherwise the pair ("EUR", "€") cannot be matched
|
|
44
|
-
unicase_match = unidecode(word1).lower() == unidecode(word2).lower()
|
|
42
|
+
unicase_match = anyascii(word1).lower() == anyascii(word2).lower()
|
|
45
43
|
|
|
46
|
-
return raw_match, caseless_match, unidecode_match, unicase_match
|
|
44
|
+
return raw_match, caseless_match, anyascii_match, unicase_match
|
|
47
45
|
|
|
48
46
|
|
|
49
47
|
class TextMatch:
|
|
@@ -94,10 +92,10 @@ class TextMatch:
|
|
|
94
92
|
raise AssertionError("prediction size does not match with ground-truth labels size")
|
|
95
93
|
|
|
96
94
|
for gt_word, pred_word in zip(gt, pred):
|
|
97
|
-
_raw, _caseless, _unidecode, _unicase = string_match(gt_word, pred_word)
|
|
95
|
+
_raw, _caseless, _anyascii, _unicase = string_match(gt_word, pred_word)
|
|
98
96
|
self.raw += int(_raw)
|
|
99
97
|
self.caseless += int(_caseless)
|
|
100
|
-
self.unidecode += int(_unidecode)
|
|
98
|
+
self.anyascii += int(_anyascii)
|
|
101
99
|
self.unicase += int(_unicase)
|
|
102
100
|
|
|
103
101
|
self.total += len(gt)
|
|
@@ -107,8 +105,8 @@ class TextMatch:
|
|
|
107
105
|
|
|
108
106
|
Returns
|
|
109
107
|
-------
|
|
110
|
-
a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode
|
|
111
|
-
counterpart and its lower-case unidecode counterpart
|
|
108
|
+
a dictionary with the exact match score for the raw data, its lower-case counterpart, its anyascii
|
|
109
|
+
counterpart and its lower-case anyascii counterpart
|
|
112
110
|
"""
|
|
113
111
|
if self.total == 0:
|
|
114
112
|
raise AssertionError("you need to update the metric before getting the summary")
|
|
@@ -116,14 +114,14 @@ class TextMatch:
|
|
|
116
114
|
return dict(
|
|
117
115
|
raw=self.raw / self.total,
|
|
118
116
|
caseless=self.caseless / self.total,
|
|
119
|
-
unidecode=self.unidecode / self.total,
|
|
117
|
+
anyascii=self.anyascii / self.total,
|
|
120
118
|
unicase=self.unicase / self.total,
|
|
121
119
|
)
|
|
122
120
|
|
|
123
121
|
def reset(self) -> None:
|
|
124
122
|
self.raw = 0
|
|
125
123
|
self.caseless = 0
|
|
126
|
-
self.unidecode = 0
|
|
124
|
+
self.anyascii = 0
|
|
127
125
|
self.unicase = 0
|
|
128
126
|
self.total = 0
|
|
129
127
|
|
|
@@ -158,66 +156,7 @@ def box_iou(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray:
|
|
|
158
156
|
return iou_mat
|
|
159
157
|
|
|
160
158
|
|
|
161
|
-
def box_ioa(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray:
|
|
162
|
-
"""Computes the IoA (intersection over area) between two sets of bounding boxes:
|
|
163
|
-
ioa(i, j) = inter(i, j) / area(i)
|
|
164
|
-
|
|
165
|
-
Args:
|
|
166
|
-
----
|
|
167
|
-
boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax)
|
|
168
|
-
boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax)
|
|
169
|
-
|
|
170
|
-
Returns:
|
|
171
|
-
-------
|
|
172
|
-
the IoA matrix of shape (N, M)
|
|
173
|
-
"""
|
|
174
|
-
ioa_mat: np.ndarray = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32)
|
|
175
|
-
|
|
176
|
-
if boxes_1.shape[0] > 0 and boxes_2.shape[0] > 0:
|
|
177
|
-
l1, t1, r1, b1 = np.split(boxes_1, 4, axis=1)
|
|
178
|
-
l2, t2, r2, b2 = np.split(boxes_2, 4, axis=1)
|
|
179
|
-
|
|
180
|
-
left = np.maximum(l1, l2.T)
|
|
181
|
-
top = np.maximum(t1, t2.T)
|
|
182
|
-
right = np.minimum(r1, r2.T)
|
|
183
|
-
bot = np.minimum(b1, b2.T)
|
|
184
|
-
|
|
185
|
-
intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf)
|
|
186
|
-
area = (r1 - l1) * (b1 - t1)
|
|
187
|
-
ioa_mat = intersection / area
|
|
188
|
-
|
|
189
|
-
return ioa_mat
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
def mask_iou(masks_1: np.ndarray, masks_2: np.ndarray) -> np.ndarray:
|
|
193
|
-
"""Computes the IoU between two sets of boolean masks
|
|
194
|
-
|
|
195
|
-
Args:
|
|
196
|
-
----
|
|
197
|
-
masks_1: boolean masks of shape (N, H, W)
|
|
198
|
-
masks_2: boolean masks of shape (M, H, W)
|
|
199
|
-
|
|
200
|
-
Returns:
|
|
201
|
-
-------
|
|
202
|
-
the IoU matrix of shape (N, M)
|
|
203
|
-
"""
|
|
204
|
-
if masks_1.shape[1:] != masks_2.shape[1:]:
|
|
205
|
-
raise AssertionError("both boolean masks should have the same spatial shape")
|
|
206
|
-
|
|
207
|
-
iou_mat: np.ndarray = np.zeros((masks_1.shape[0], masks_2.shape[0]), dtype=np.float32)
|
|
208
|
-
|
|
209
|
-
if masks_1.shape[0] > 0 and masks_2.shape[0] > 0:
|
|
210
|
-
axes = tuple(range(2, masks_1.ndim + 1))
|
|
211
|
-
intersection = np.logical_and(masks_1[:, None, ...], masks_2[None, ...]).sum(axis=axes)
|
|
212
|
-
union = np.logical_or(masks_1[:, None, ...], masks_2[None, ...]).sum(axis=axes)
|
|
213
|
-
iou_mat = intersection / union
|
|
214
|
-
|
|
215
|
-
return iou_mat
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
def polygon_iou(
|
|
219
|
-
polys_1: np.ndarray, polys_2: np.ndarray, mask_shape: Tuple[int, int], use_broadcasting: bool = False
|
|
220
|
-
) -> np.ndarray:
|
|
159
|
+
def polygon_iou(polys_1: np.ndarray, polys_2: np.ndarray) -> np.ndarray:
|
|
221
160
|
"""Computes the IoU between two sets of rotated bounding boxes
|
|
222
161
|
|
|
223
162
|
Args:
|
|
@@ -234,80 +173,18 @@ def polygon_iou(
|
|
|
234
173
|
if polys_1.ndim != 3 or polys_2.ndim != 3:
|
|
235
174
|
raise AssertionError("expects boxes to be in format (N, 4, 2)")
|
|
236
175
|
|
|
237
|
-
iou_mat: np.ndarray = np.zeros((polys_1.shape[0], polys_2.shape[0]), dtype=np.float32)
|
|
238
|
-
|
|
239
|
-
if polys_1.shape[0] > 0 and polys_2.shape[0] > 0:
|
|
240
|
-
if use_broadcasting:
|
|
241
|
-
masks_1 = rbox_to_mask(polys_1, shape=mask_shape)
|
|
242
|
-
masks_2 = rbox_to_mask(polys_2, shape=mask_shape)
|
|
243
|
-
iou_mat = mask_iou(masks_1, masks_2)
|
|
244
|
-
else:
|
|
245
|
-
# Save memory by doing the computation for each pair
|
|
246
|
-
for idx, b1 in enumerate(polys_1):
|
|
247
|
-
m1 = _rbox_to_mask(b1, mask_shape)
|
|
248
|
-
for _idx, b2 in enumerate(polys_2):
|
|
249
|
-
m2 = _rbox_to_mask(b2, mask_shape)
|
|
250
|
-
iou_mat[idx, _idx] = np.logical_and(m1, m2).sum() / np.logical_or(m1, m2).sum()
|
|
251
|
-
|
|
252
|
-
return iou_mat
|
|
176
|
+
iou_mat = np.zeros((polys_1.shape[0], polys_2.shape[0]), dtype=np.float32)
|
|
253
177
|
|
|
178
|
+
shapely_polys_1 = [Polygon(poly) for poly in polys_1]
|
|
179
|
+
shapely_polys_2 = [Polygon(poly) for poly in polys_2]
|
|
254
180
|
|
|
255
|
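-
def _rbox_to_mask(box: np.ndarray, shape: Tuple[int, int]) -> np.ndarray:
|
|
256
|
-
"""Converts a rotated bounding box to a boolean mask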
|
|
181
|
+
for i, poly1 in enumerate(shapely_polys_1):
|
|
182
|
+
for j, poly2 in enumerate(shapely_polys_2):
|
|
183
|
+
intersection_area = poly1.intersection(poly2).area
|
|
184
|
+
union_area = poly1.area + poly2.area - intersection_area
|
|
185
|
+
iou_mat[i, j] = intersection_area / union_area
|
|
257
186
|
|
|
258
|
-
Args:
|
|
259
|
-
----
|
|
260
|
-
box: rotated bounding box of shape (4, 2)
|
|
261
|
-
shape: spatial shapes of the output masks
|
|
262
|
-
|
|
263
|
-
Returns:
|
|
264
|
-
-------
|
|
265
|
-
the boolean mask of the specified shape
|
|
266
|
-
"""
|
|
267
|
-
mask: np.ndarray = np.zeros(shape, dtype=np.uint8)
|
|
268
|
-
# Get absolute coords
|
|
269
|
-
if not np.issubdtype(box.dtype, np.integer):
|
|
270
|
-
abs_box = box.copy()
|
|
271
|
-
abs_box[:, 0] = abs_box[:, 0] * shape[1]
|
|
272
|
-
abs_box[:, 1] = abs_box[:, 1] * shape[0]
|
|
273
|
-
abs_box = abs_box.round().astype(int)
|
|
274
|
-
else:
|
|
275
|
-
abs_box = box
|
|
276
|
-
abs_box[2:] = abs_box[2:] + 1
|
|
277
|
-
cv2.fillPoly(mask, [abs_box - 1], 1.0) # type: ignore[call-overload]
|
|
278
|
-
|
|
279
|
-
return mask.astype(bool)
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
def rbox_to_mask(boxes: np.ndarray, shape: Tuple[int, int]) -> np.ndarray:
|
|
283
|
-
"""Converts rotated bounding boxes to boolean masks
|
|
284
|
-
|
|
285
|
-
Args:
|
|
286
|
-
----
|
|
287
|
-
boxes: rotated bounding boxes of shape (N, 4, 2)
|
|
288
|
-
shape: spatial shapes of the output masks
|
|
289
|
-
|
|
290
|
-
Returns:
|
|
291
|
-
-------
|
|
292
|
-
the boolean masks of shape (N, H, W)
|
|
293
|
-
"""
|
|
294
|
-
masks: np.ndarray = np.zeros((boxes.shape[0], *shape), dtype=np.uint8)
|
|
295
|
-
|
|
296
|
-
if boxes.shape[0] > 0:
|
|
297
|
-
# Get absolute coordinates
|
|
298
|
-
if not np.issubdtype(boxes.dtype, np.integer):
|
|
299
|
-
abs_boxes = boxes.copy()
|
|
300
|
-
abs_boxes[:, :, 0] = abs_boxes[:, :, 0] * shape[1]
|
|
301
|
-
abs_boxes[:, :, 1] = abs_boxes[:, :, 1] * shape[0]
|
|
302
|
-
abs_boxes = abs_boxes.round().astype(int)
|
|
303
|
-
else:
|
|
304
|
-
abs_boxes = boxes
|
|
305
|
-
abs_boxes[:, 2:] = abs_boxes[:, 2:] + 1
|
|
306
|
-
|
|
307
|
-
# TODO: optimize slicing to improve vectorization
|
|
308
|
-
for idx, _box in enumerate(abs_boxes):
|
|
309
|
-
cv2.fillPoly(masks[idx], [_box - 1], 1.0) # type: ignore[call-overload]
|
|
310
|
-
return masks.astype(bool)
|
|
187
|
+
return iou_mat
|
|
311
188
|
|
|
312
189
|
|
|
313
190
|
def nms(boxes: np.ndarray, thresh: float = 0.5) -> List[int]:
|
|
@@ -386,21 +263,15 @@ class LocalizationConfusion:
|
|
|
386
263
|
----
|
|
387
264
|
iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
|
|
388
265
|
use_polygons: if set to True, predictions and targets will be expected to have rotated format
|
|
389
|
-
mask_shape: if use_polygons is True, describes the spatial shape of the image used
|
|
390
|
-
use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory
|
|
391
266
|
"""
|
|
392
267
|
|
|
393
268
|
def __init__(
|
|
394
269
|
self,
|
|
395
270
|
iou_thresh: float = 0.5,
|
|
396
271
|
use_polygons: bool = False,
|
|
397
|
-
mask_shape: Tuple[int, int] = (1024, 1024),
|
|
398
|
-
use_broadcasting: bool = True,
|
|
399
272
|
) -> None:
|
|
400
273
|
self.iou_thresh = iou_thresh
|
|
401
274
|
self.use_polygons = use_polygons
|
|
402
|
-
self.mask_shape = mask_shape
|
|
403
|
-
self.use_broadcasting = use_broadcasting
|
|
404
275
|
self.reset()
|
|
405
276
|
|
|
406
277
|
def update(self, gts: np.ndarray, preds: np.ndarray) -> None:
|
|
@@ -414,7 +285,7 @@ class LocalizationConfusion:
|
|
|
414
285
|
if preds.shape[0] > 0:
|
|
415
286
|
# Compute IoU
|
|
416
287
|
if self.use_polygons:
|
|
417
|
-
iou_mat = polygon_iou(gts, preds, self.mask_shape, self.use_broadcasting)
|
|
288
|
+
iou_mat = polygon_iou(gts, preds)
|
|
418
289
|
else:
|
|
419
290
|
iou_mat = box_iou(gts, preds)
|
|
420
291
|
self.tot_iou += float(iou_mat.max(axis=0).sum())
|
|
@@ -441,7 +312,7 @@ class LocalizationConfusion:
|
|
|
441
312
|
precision = self.matches / self.num_preds if self.num_preds > 0 else None
|
|
442
313
|
|
|
443
314
|
# mean IoU
|
|
444
|
-
mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
|
|
315
|
+
mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
|
|
445
316
|
|
|
446
317
|
return recall, precision, mean_iou
|
|
447
318
|
|
|
@@ -492,21 +363,15 @@ class OCRMetric:
|
|
|
492
363
|
----
|
|
493
364
|
iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
|
|
494
365
|
use_polygons: if set to True, predictions and targets will be expected to have rotated format
|
|
495
|
-
mask_shape: if use_polygons is True, describes the spatial shape of the image used
|
|
496
|
-
use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory
|
|
497
366
|
"""
|
|
498
367
|
|
|
499
368
|
def __init__(
|
|
500
369
|
self,
|
|
501
370
|
iou_thresh: float = 0.5,
|
|
502
371
|
use_polygons: bool = False,
|
|
503
|
-
mask_shape: Tuple[int, int] = (1024, 1024),
|
|
504
|
-
use_broadcasting: bool = True,
|
|
505
372
|
) -> None:
|
|
506
373
|
self.iou_thresh = iou_thresh
|
|
507
374
|
self.use_polygons = use_polygons
|
|
508
|
-
self.mask_shape = mask_shape
|
|
509
|
-
self.use_broadcasting = use_broadcasting
|
|
510
375
|
self.reset()
|
|
511
376
|
|
|
512
377
|
def update(
|
|
@@ -533,7 +398,7 @@ class OCRMetric:
|
|
|
533
398
|
# Compute IoU
|
|
534
399
|
if pred_boxes.shape[0] > 0:
|
|
535
400
|
if self.use_polygons:
|
|
536
|
-
iou_mat = polygon_iou(gt_boxes, pred_boxes, self.mask_shape, self.use_broadcasting)
|
|
401
|
+
iou_mat = polygon_iou(gt_boxes, pred_boxes)
|
|
537
402
|
else:
|
|
538
403
|
iou_mat = box_iou(gt_boxes, pred_boxes)
|
|
539
404
|
|
|
@@ -544,10 +409,10 @@ class OCRMetric:
|
|
|
544
409
|
is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh
|
|
545
410
|
# String comparison
|
|
546
411
|
for gt_idx, pred_idx in zip(gt_indices[is_kept], pred_indices[is_kept]):
|
|
547
|
-
_raw, _caseless, _unidecode, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
|
|
412
|
+
_raw, _caseless, _anyascii, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
|
|
548
413
|
self.raw_matches += int(_raw)
|
|
549
414
|
self.caseless_matches += int(_caseless)
|
|
550
|
-
self.unidecode_matches += int(_unidecode)
|
|
415
|
+
self.anyascii_matches += int(_anyascii)
|
|
551
416
|
self.unicase_matches += int(_unicase)
|
|
552
417
|
|
|
553
418
|
self.num_gts += gt_boxes.shape[0]
|
|
@@ -564,7 +429,7 @@ class OCRMetric:
|
|
|
564
429
|
recall = dict(
|
|
565
430
|
raw=self.raw_matches / self.num_gts if self.num_gts > 0 else None,
|
|
566
431
|
caseless=self.caseless_matches / self.num_gts if self.num_gts > 0 else None,
|
|
567
|
-
unidecode=self.unidecode_matches / self.num_gts if self.num_gts > 0 else None,
|
|
432
|
+
anyascii=self.anyascii_matches / self.num_gts if self.num_gts > 0 else None,
|
|
568
433
|
unicase=self.unicase_matches / self.num_gts if self.num_gts > 0 else None,
|
|
569
434
|
)
|
|
570
435
|
|
|
@@ -572,12 +437,12 @@ class OCRMetric:
|
|
|
572
437
|
precision = dict(
|
|
573
438
|
raw=self.raw_matches / self.num_preds if self.num_preds > 0 else None,
|
|
574
439
|
caseless=self.caseless_matches / self.num_preds if self.num_preds > 0 else None,
|
|
575
|
-
unidecode=self.unidecode_matches / self.num_preds if self.num_preds > 0 else None,
|
|
440
|
+
anyascii=self.anyascii_matches / self.num_preds if self.num_preds > 0 else None,
|
|
576
441
|
unicase=self.unicase_matches / self.num_preds if self.num_preds > 0 else None,
|
|
577
442
|
)
|
|
578
443
|
|
|
579
444
|
# mean IoU (overall detected boxes)
|
|
580
|
-
mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
|
|
445
|
+
mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
|
|
581
446
|
|
|
582
447
|
return recall, precision, mean_iou
|
|
583
448
|
|
|
@@ -587,7 +452,7 @@ class OCRMetric:
|
|
|
587
452
|
self.tot_iou = 0.0
|
|
588
453
|
self.raw_matches = 0
|
|
589
454
|
self.caseless_matches = 0
|
|
590
|
-
self.unidecode_matches = 0
|
|
455
|
+
self.anyascii_matches = 0
|
|
591
456
|
self.unicase_matches = 0
|
|
592
457
|
|
|
593
458
|
|
|
@@ -631,21 +496,15 @@ class DetectionMetric:
|
|
|
631
496
|
----
|
|
632
497
|
iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
|
|
633
498
|
use_polygons: if set to True, predictions and targets will be expected to have rotated format
|
|
634
|
-
mask_shape: if use_polygons is True, describes the spatial shape of the image used
|
|
635
|
-
use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory
|
|
636
499
|
"""
|
|
637
500
|
|
|
638
501
|
def __init__(
|
|
639
502
|
self,
|
|
640
503
|
iou_thresh: float = 0.5,
|
|
641
504
|
use_polygons: bool = False,
|
|
642
|
-
mask_shape: Tuple[int, int] = (1024, 1024),
|
|
643
|
-
use_broadcasting: bool = True,
|
|
644
505
|
) -> None:
|
|
645
506
|
self.iou_thresh = iou_thresh
|
|
646
507
|
self.use_polygons = use_polygons
|
|
647
|
-
self.mask_shape = mask_shape
|
|
648
|
-
self.use_broadcasting = use_broadcasting
|
|
649
508
|
self.reset()
|
|
650
509
|
|
|
651
510
|
def update(
|
|
@@ -672,7 +531,7 @@ class DetectionMetric:
|
|
|
672
531
|
# Compute IoU
|
|
673
532
|
if pred_boxes.shape[0] > 0:
|
|
674
533
|
if self.use_polygons:
|
|
675
|
-
iou_mat = polygon_iou(gt_boxes, pred_boxes, self.mask_shape, self.use_broadcasting)
|
|
534
|
+
iou_mat = polygon_iou(gt_boxes, pred_boxes)
|
|
676
535
|
else:
|
|
677
536
|
iou_mat = box_iou(gt_boxes, pred_boxes)
|
|
678
537
|
|
|
@@ -701,7 +560,7 @@ class DetectionMetric:
|
|
|
701
560
|
precision = self.num_matches / self.num_preds if self.num_preds > 0 else None
|
|
702
561
|
|
|
703
562
|
# mean IoU (overall detected boxes)
|
|
704
|
-
mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
|
|
563
|
+
mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
|
|
705
564
|
|
|
706
565
|
return recall, precision, mean_iou
|
|
707
566
|
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
# Copyright (C) 2021-2024, Mindee.
|
|
2
|
+
|
|
3
|
+
# This program is licensed under the Apache License 2.0.
|
|
4
|
+
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
|
|
5
|
+
from typing import Any, Dict, Optional
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
from anyascii import anyascii
|
|
9
|
+
from PIL import Image, ImageDraw
|
|
10
|
+
|
|
11
|
+
from .fonts import get_font
|
|
12
|
+
|
|
13
|
+
__all__ = ["synthesize_page", "synthesize_kie_page"]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def synthesize_page(
|
|
17
|
+
page: Dict[str, Any],
|
|
18
|
+
draw_proba: bool = False,
|
|
19
|
+
font_family: Optional[str] = None,
|
|
20
|
+
) -> np.ndarray:
|
|
21
|
+
"""Draw a the content of the element page (OCR response) on a blank page.
|
|
22
|
+
|
|
23
|
+
Args:
|
|
24
|
+
----
|
|
25
|
+
page: exported Page object to represent
|
|
26
|
+
draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
|
|
27
|
+
font_size: size of the font, default font = 13
|
|
28
|
+
font_family: family of the font
|
|
29
|
+
|
|
30
|
+
Returns:
|
|
31
|
+
-------
|
|
32
|
+
the synthesized page
|
|
33
|
+
"""
|
|
34
|
+
# Draw template
|
|
35
|
+
h, w = page["dimensions"]
|
|
36
|
+
response = 255 * np.ones((h, w, 3), dtype=np.int32)
|
|
37
|
+
|
|
38
|
+
# Draw each word
|
|
39
|
+
for block in page["blocks"]:
|
|
40
|
+
for line in block["lines"]:
|
|
41
|
+
for word in line["words"]:
|
|
42
|
+
# Get absolute word geometry
|
|
43
|
+
(xmin, ymin), (xmax, ymax) = word["geometry"]
|
|
44
|
+
xmin, xmax = int(round(w * xmin)), int(round(w * xmax))
|
|
45
|
+
ymin, ymax = int(round(h * ymin)), int(round(h * ymax))
|
|
46
|
+
|
|
47
|
+
# White drawing context adapted to font size, 0.75 factor to convert pts --> pix
|
|
48
|
+
font = get_font(font_family, int(0.75 * (ymax - ymin)))
|
|
49
|
+
img = Image.new("RGB", (xmax - xmin, ymax - ymin), color=(255, 255, 255))
|
|
50
|
+
d = ImageDraw.Draw(img)
|
|
51
|
+
# Draw in black the value of the word
|
|
52
|
+
try:
|
|
53
|
+
d.text((0, 0), word["value"], font=font, fill=(0, 0, 0))
|
|
54
|
+
except UnicodeEncodeError:
|
|
55
|
+
# When character cannot be encoded, use its anyascii version
|
|
56
|
+
d.text((0, 0), anyascii(word["value"]), font=font, fill=(0, 0, 0))
|
|
57
|
+
|
|
58
|
+
# Colorize if draw_proba
|
|
59
|
+
if draw_proba:
|
|
60
|
+
p = int(255 * word["confidence"])
|
|
61
|
+
mask = np.where(np.array(img) == 0, 1, 0)
|
|
62
|
+
proba: np.ndarray = np.array([255 - p, 0, p])
|
|
63
|
+
color = mask * proba[np.newaxis, np.newaxis, :]
|
|
64
|
+
white_mask = 255 * (1 - mask)
|
|
65
|
+
img = color + white_mask
|
|
66
|
+
|
|
67
|
+
# Write to response page
|
|
68
|
+
response[ymin:ymax, xmin:xmax, :] = np.array(img)
|
|
69
|
+
|
|
70
|
+
return response
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def synthesize_kie_page(
|
|
74
|
+
page: Dict[str, Any],
|
|
75
|
+
draw_proba: bool = False,
|
|
76
|
+
font_family: Optional[str] = None,
|
|
77
|
+
) -> np.ndarray:
|
|
78
|
+
"""Draw a the content of the element page (OCR response) on a blank page.
|
|
79
|
+
|
|
80
|
+
Args:
|
|
81
|
+
----
|
|
82
|
+
page: exported Page object to represent
|
|
83
|
+
draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
|
|
84
|
+
font_size: size of the font, default font = 13
|
|
85
|
+
font_family: family of the font
|
|
86
|
+
|
|
87
|
+
Returns:
|
|
88
|
+
-------
|
|
89
|
+
the synthesized page
|
|
90
|
+
"""
|
|
91
|
+
# Draw template
|
|
92
|
+
h, w = page["dimensions"]
|
|
93
|
+
response = 255 * np.ones((h, w, 3), dtype=np.int32)
|
|
94
|
+
|
|
95
|
+
# Draw each word
|
|
96
|
+
for predictions in page["predictions"].values():
|
|
97
|
+
for prediction in predictions:
|
|
98
|
+
# Get aboslute word geometry
|
|
99
|
+
(xmin, ymin), (xmax, ymax) = prediction["geometry"]
|
|
100
|
+
xmin, xmax = int(round(w * xmin)), int(round(w * xmax))
|
|
101
|
+
ymin, ymax = int(round(h * ymin)), int(round(h * ymax))
|
|
102
|
+
|
|
103
|
+
# White drawing context adapted to font size, 0.75 factor to convert pts --> pix
|
|
104
|
+
font = get_font(font_family, int(0.75 * (ymax - ymin)))
|
|
105
|
+
img = Image.new("RGB", (xmax - xmin, ymax - ymin), color=(255, 255, 255))
|
|
106
|
+
d = ImageDraw.Draw(img)
|
|
107
|
+
# Draw in black the value of the word
|
|
108
|
+
try:
|
|
109
|
+
d.text((0, 0), prediction["value"], font=font, fill=(0, 0, 0))
|
|
110
|
+
except UnicodeEncodeError:
|
|
111
|
+
# When character cannot be encoded, use its anyascii version
|
|
112
|
+
d.text((0, 0), anyascii(prediction["value"]), font=font, fill=(0, 0, 0))
|
|
113
|
+
|
|
114
|
+
# Colorize if draw_proba
|
|
115
|
+
if draw_proba:
|
|
116
|
+
p = int(255 * prediction["confidence"])
|
|
117
|
+
mask = np.where(np.array(img) == 0, 1, 0)
|
|
118
|
+
proba: np.ndarray = np.array([255 - p, 0, p])
|
|
119
|
+
color = mask * proba[np.newaxis, np.newaxis, :]
|
|
120
|
+
white_mask = 255 * (1 - mask)
|
|
121
|
+
img = color + white_mask
|
|
122
|
+
|
|
123
|
+
# Write to response page
|
|
124
|
+
response[ymin:ymax, xmin:xmax, :] = np.array(img)
|
|
125
|
+
|
|
126
|
+
return response
|
doctr/utils/visualization.py
CHANGED
|
@@ -9,16 +9,12 @@ from typing import Any, Dict, List, Optional, Tuple, Union
|
|
|
9
9
|
import cv2
|
|
10
10
|
import matplotlib.patches as patches
|
|
11
11
|
import matplotlib.pyplot as plt
|
|
12
|
-
import mplcursors
|
|
13
12
|
import numpy as np
|
|
14
13
|
from matplotlib.figure import Figure
|
|
15
|
-
from PIL import Image, ImageDraw
|
|
16
|
-
from unidecode import unidecode
|
|
17
14
|
|
|
18
15
|
from .common_types import BoundingBox, Polygon4P
|
|
19
|
-
from .fonts import get_font
|
|
20
16
|
|
|
21
|
-
__all__ = ["visualize_page", "synthesize_page", "visualize_kie_page", "synthesize_kie_page", "draw_boxes"]
|
|
17
|
+
__all__ = ["visualize_page", "visualize_kie_page", "draw_boxes"]
|
|
22
18
|
|
|
23
19
|
|
|
24
20
|
def rect_patch(
|
|
@@ -281,6 +277,8 @@ def visualize_page(
|
|
|
281
277
|
artists.append(rect)
|
|
282
278
|
|
|
283
279
|
if interactive:
|
|
280
|
+
import mplcursors
|
|
281
|
+
|
|
284
282
|
# Create mlp Cursor to hover patches in artists
|
|
285
283
|
mplcursors.Cursor(artists, hover=2).connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label()))
|
|
286
284
|
fig.tight_layout(pad=0.0)
|
|
@@ -288,63 +286,6 @@ def visualize_page(
|
|
|
288
286
|
return fig
|
|
289
287
|
|
|
290
288
|
|
|
291
|
-
def synthesize_page(
|
|
292
|
-
page: Dict[str, Any],
|
|
293
|
-
draw_proba: bool = False,
|
|
294
|
-
font_family: Optional[str] = None,
|
|
295
|
-
) -> np.ndarray:
|
|
296
|
-
"""Draw a the content of the element page (OCR response) on a blank page.
|
|
297
|
-
|
|
298
|
-
Args:
|
|
299
|
-
----
|
|
300
|
-
page: exported Page object to represent
|
|
301
|
-
draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
|
|
302
|
-
font_size: size of the font, default font = 13
|
|
303
|
-
font_family: family of the font
|
|
304
|
-
|
|
305
|
-
Returns:
|
|
306
|
-
-------
|
|
307
|
-
the synthesized page
|
|
308
|
-
"""
|
|
309
|
-
# Draw template
|
|
310
|
-
h, w = page["dimensions"]
|
|
311
|
-
response = 255 * np.ones((h, w, 3), dtype=np.int32)
|
|
312
|
-
|
|
313
|
-
# Draw each word
|
|
314
|
-
for block in page["blocks"]:
|
|
315
|
-
for line in block["lines"]:
|
|
316
|
-
for word in line["words"]:
|
|
317
|
-
# Get aboslute word geometry
|
|
318
|
-
(xmin, ymin), (xmax, ymax) = word["geometry"]
|
|
319
|
-
xmin, xmax = int(round(w * xmin)), int(round(w * xmax))
|
|
320
|
-
ymin, ymax = int(round(h * ymin)), int(round(h * ymax))
|
|
321
|
-
|
|
322
|
-
# White drawing context adapted to font size, 0.75 factor to convert pts --> pix
|
|
323
|
-
font = get_font(font_family, int(0.75 * (ymax - ymin)))
|
|
324
|
-
img = Image.new("RGB", (xmax - xmin, ymax - ymin), color=(255, 255, 255))
|
|
325
|
-
d = ImageDraw.Draw(img)
|
|
326
|
-
# Draw in black the value of the word
|
|
327
|
-
try:
|
|
328
|
-
d.text((0, 0), word["value"], font=font, fill=(0, 0, 0))
|
|
329
|
-
except UnicodeEncodeError:
|
|
330
|
-
# When character cannot be encoded, use its unidecode version
|
|
331
|
-
d.text((0, 0), unidecode(word["value"]), font=font, fill=(0, 0, 0))
|
|
332
|
-
|
|
333
|
-
# Colorize if draw_proba
|
|
334
|
-
if draw_proba:
|
|
335
|
-
p = int(255 * word["confidence"])
|
|
336
|
-
mask = np.where(np.array(img) == 0, 1, 0)
|
|
337
|
-
proba: np.ndarray = np.array([255 - p, 0, p])
|
|
338
|
-
color = mask * proba[np.newaxis, np.newaxis, :]
|
|
339
|
-
white_mask = 255 * (1 - mask)
|
|
340
|
-
img = color + white_mask
|
|
341
|
-
|
|
342
|
-
# Write to response page
|
|
343
|
-
response[ymin:ymax, xmin:xmax, :] = np.array(img)
|
|
344
|
-
|
|
345
|
-
return response
|
|
346
|
-
|
|
347
|
-
|
|
348
289
|
def visualize_kie_page(
|
|
349
290
|
page: Dict[str, Any],
|
|
350
291
|
image: np.ndarray,
|
|
@@ -413,6 +354,8 @@ def visualize_kie_page(
|
|
|
413
354
|
artists.append(rect)
|
|
414
355
|
|
|
415
356
|
if interactive:
|
|
357
|
+
import mplcursors
|
|
358
|
+
|
|
416
359
|
# Create mlp Cursor to hover patches in artists
|
|
417
360
|
mplcursors.Cursor(artists, hover=2).connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label()))
|
|
418
361
|
fig.tight_layout(pad=0.0)
|
|
@@ -420,62 +363,6 @@ def visualize_kie_page(
|
|
|
420
363
|
return fig
|
|
421
364
|
|
|
422
365
|
|
|
423
|
-
def synthesize_kie_page(
|
|
424
|
-
page: Dict[str, Any],
|
|
425
|
-
draw_proba: bool = False,
|
|
426
|
-
font_family: Optional[str] = None,
|
|
427
|
-
) -> np.ndarray:
|
|
428
|
-
"""Draw a the content of the element page (OCR response) on a blank page.
|
|
429
|
-
|
|
430
|
-
Args:
|
|
431
|
-
----
|
|
432
|
-
page: exported Page object to represent
|
|
433
|
-
draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
|
|
434
|
-
font_size: size of the font, default font = 13
|
|
435
|
-
font_family: family of the font
|
|
436
|
-
|
|
437
|
-
Returns:
|
|
438
|
-
-------
|
|
439
|
-
the synthesized page
|
|
440
|
-
"""
|
|
441
|
-
# Draw template
|
|
442
|
-
h, w = page["dimensions"]
|
|
443
|
-
response = 255 * np.ones((h, w, 3), dtype=np.int32)
|
|
444
|
-
|
|
445
|
-
# Draw each word
|
|
446
|
-
for predictions in page["predictions"].values():
|
|
447
|
-
for prediction in predictions:
|
|
448
|
-
# Get aboslute word geometry
|
|
449
|
-
(xmin, ymin), (xmax, ymax) = prediction["geometry"]
|
|
450
|
-
xmin, xmax = int(round(w * xmin)), int(round(w * xmax))
|
|
451
|
-
ymin, ymax = int(round(h * ymin)), int(round(h * ymax))
|
|
452
|
-
|
|
453
|
-
# White drawing context adapted to font size, 0.75 factor to convert pts --> pix
|
|
454
|
-
font = get_font(font_family, int(0.75 * (ymax - ymin)))
|
|
455
|
-
img = Image.new("RGB", (xmax - xmin, ymax - ymin), color=(255, 255, 255))
|
|
456
|
-
d = ImageDraw.Draw(img)
|
|
457
|
-
# Draw in black the value of the word
|
|
458
|
-
try:
|
|
459
|
-
d.text((0, 0), prediction["value"], font=font, fill=(0, 0, 0))
|
|
460
|
-
except UnicodeEncodeError:
|
|
461
|
-
# When character cannot be encoded, use its unidecode version
|
|
462
|
-
d.text((0, 0), unidecode(prediction["value"]), font=font, fill=(0, 0, 0))
|
|
463
|
-
|
|
464
|
-
# Colorize if draw_proba
|
|
465
|
-
if draw_proba:
|
|
466
|
-
p = int(255 * prediction["confidence"])
|
|
467
|
-
mask = np.where(np.array(img) == 0, 1, 0)
|
|
468
|
-
proba: np.ndarray = np.array([255 - p, 0, p])
|
|
469
|
-
color = mask * proba[np.newaxis, np.newaxis, :]
|
|
470
|
-
white_mask = 255 * (1 - mask)
|
|
471
|
-
img = color + white_mask
|
|
472
|
-
|
|
473
|
-
# Write to response page
|
|
474
|
-
response[ymin:ymax, xmin:xmax, :] = np.array(img)
|
|
475
|
-
|
|
476
|
-
return response
|
|
477
|
-
|
|
478
|
-
|
|
479
366
|
def draw_boxes(boxes: np.ndarray, image: np.ndarray, color: Optional[Tuple[int, int, int]] = None, **kwargs) -> None:
|
|
480
367
|
"""Draw an array of relative straight boxes on an image
|
|
481
368
|
|
doctr/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = 'v0.8.0'
|
|
1
|
+
__version__ = 'v0.9.0'
|