python-doctr 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff shows the changes between two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- doctr/__init__.py +1 -1
- doctr/contrib/__init__.py +0 -0
- doctr/contrib/artefacts.py +131 -0
- doctr/contrib/base.py +105 -0
- doctr/datasets/cord.py +10 -1
- doctr/datasets/datasets/pytorch.py +2 -2
- doctr/datasets/funsd.py +11 -1
- doctr/datasets/generator/base.py +6 -5
- doctr/datasets/ic03.py +11 -1
- doctr/datasets/ic13.py +10 -1
- doctr/datasets/iiit5k.py +26 -16
- doctr/datasets/imgur5k.py +11 -2
- doctr/datasets/loader.py +1 -6
- doctr/datasets/sroie.py +11 -1
- doctr/datasets/svhn.py +11 -1
- doctr/datasets/svt.py +11 -1
- doctr/datasets/synthtext.py +11 -1
- doctr/datasets/utils.py +9 -3
- doctr/datasets/vocabs.py +15 -4
- doctr/datasets/wildreceipt.py +12 -1
- doctr/file_utils.py +45 -12
- doctr/io/elements.py +52 -10
- doctr/io/html.py +2 -2
- doctr/io/image/pytorch.py +6 -8
- doctr/io/image/tensorflow.py +1 -1
- doctr/io/pdf.py +5 -2
- doctr/io/reader.py +6 -0
- doctr/models/__init__.py +0 -1
- doctr/models/_utils.py +57 -20
- doctr/models/builder.py +73 -15
- doctr/models/classification/magc_resnet/tensorflow.py +13 -6
- doctr/models/classification/mobilenet/pytorch.py +47 -9
- doctr/models/classification/mobilenet/tensorflow.py +51 -14
- doctr/models/classification/predictor/pytorch.py +28 -17
- doctr/models/classification/predictor/tensorflow.py +26 -16
- doctr/models/classification/resnet/tensorflow.py +21 -8
- doctr/models/classification/textnet/pytorch.py +3 -3
- doctr/models/classification/textnet/tensorflow.py +11 -5
- doctr/models/classification/vgg/tensorflow.py +9 -3
- doctr/models/classification/vit/tensorflow.py +10 -4
- doctr/models/classification/zoo.py +55 -19
- doctr/models/detection/_utils/__init__.py +1 -0
- doctr/models/detection/_utils/base.py +66 -0
- doctr/models/detection/differentiable_binarization/base.py +4 -3
- doctr/models/detection/differentiable_binarization/pytorch.py +2 -2
- doctr/models/detection/differentiable_binarization/tensorflow.py +34 -12
- doctr/models/detection/fast/base.py +6 -5
- doctr/models/detection/fast/pytorch.py +4 -4
- doctr/models/detection/fast/tensorflow.py +15 -12
- doctr/models/detection/linknet/base.py +4 -3
- doctr/models/detection/linknet/tensorflow.py +23 -11
- doctr/models/detection/predictor/pytorch.py +15 -1
- doctr/models/detection/predictor/tensorflow.py +17 -3
- doctr/models/detection/zoo.py +7 -2
- doctr/models/factory/hub.py +8 -18
- doctr/models/kie_predictor/base.py +13 -3
- doctr/models/kie_predictor/pytorch.py +45 -20
- doctr/models/kie_predictor/tensorflow.py +44 -17
- doctr/models/modules/layers/pytorch.py +2 -3
- doctr/models/modules/layers/tensorflow.py +6 -8
- doctr/models/modules/transformer/pytorch.py +2 -2
- doctr/models/modules/transformer/tensorflow.py +0 -2
- doctr/models/modules/vision_transformer/pytorch.py +1 -1
- doctr/models/modules/vision_transformer/tensorflow.py +1 -1
- doctr/models/predictor/base.py +97 -58
- doctr/models/predictor/pytorch.py +35 -20
- doctr/models/predictor/tensorflow.py +35 -18
- doctr/models/preprocessor/pytorch.py +4 -4
- doctr/models/preprocessor/tensorflow.py +3 -2
- doctr/models/recognition/crnn/tensorflow.py +8 -6
- doctr/models/recognition/master/pytorch.py +2 -2
- doctr/models/recognition/master/tensorflow.py +9 -4
- doctr/models/recognition/parseq/pytorch.py +4 -3
- doctr/models/recognition/parseq/tensorflow.py +14 -11
- doctr/models/recognition/sar/pytorch.py +7 -6
- doctr/models/recognition/sar/tensorflow.py +10 -12
- doctr/models/recognition/vitstr/pytorch.py +1 -1
- doctr/models/recognition/vitstr/tensorflow.py +9 -4
- doctr/models/recognition/zoo.py +1 -1
- doctr/models/utils/pytorch.py +1 -1
- doctr/models/utils/tensorflow.py +15 -15
- doctr/models/zoo.py +2 -2
- doctr/py.typed +0 -0
- doctr/transforms/functional/base.py +1 -1
- doctr/transforms/functional/pytorch.py +5 -5
- doctr/transforms/modules/base.py +37 -15
- doctr/transforms/modules/pytorch.py +73 -14
- doctr/transforms/modules/tensorflow.py +78 -19
- doctr/utils/fonts.py +7 -5
- doctr/utils/geometry.py +141 -31
- doctr/utils/metrics.py +34 -175
- doctr/utils/reconstitution.py +212 -0
- doctr/utils/visualization.py +5 -118
- doctr/version.py +1 -1
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/METADATA +85 -81
- python_doctr-0.10.0.dist-info/RECORD +173 -0
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/WHEEL +1 -1
- doctr/models/artefacts/__init__.py +0 -2
- doctr/models/artefacts/barcode.py +0 -74
- doctr/models/artefacts/face.py +0 -63
- doctr/models/obj_detection/__init__.py +0 -1
- doctr/models/obj_detection/faster_rcnn/__init__.py +0 -4
- doctr/models/obj_detection/faster_rcnn/pytorch.py +0 -81
- python_doctr-0.8.1.dist-info/RECORD +0 -173
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/LICENSE +0 -0
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/top_level.txt +0 -0
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/zip-safe +0 -0
doctr/utils/geometry.py
CHANGED
@@ -20,11 +20,13 @@ __all__ = [
     "rotate_boxes",
     "compute_expanded_shape",
     "rotate_image",
+    "remove_image_padding",
     "estimate_page_angle",
     "convert_to_relative_coords",
     "rotate_abs_geoms",
     "extract_crops",
     "extract_rcrops",
+    "detach_scores",
 ]
 
 
@@ -57,6 +59,28 @@ def polygon_to_bbox(polygon: Polygon4P) -> BoundingBox:
     return (min(x), min(y)), (max(x), max(y))
 
 
+def detach_scores(boxes: List[np.ndarray]) -> Tuple[List[np.ndarray], List[np.ndarray]]:
+    """Detach the objectness scores from box predictions
+
+    Args:
+    ----
+        boxes: list of arrays with boxes of shape (N, 5) or (N, 5, 2)
+
+    Returns:
+    -------
+        a tuple of two lists: the first one contains the boxes without the objectness scores,
+        the second one contains the objectness scores
+    """
+
+    def _detach(boxes: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+        if boxes.ndim == 2:
+            return boxes[:, :-1], boxes[:, -1]
+        return boxes[:, :-1], boxes[:, -1, -1]
+
+    loc_preds, obj_scores = zip(*(_detach(box) for box in boxes))
+    return list(loc_preds), list(obj_scores)
+
+
 def resolve_enclosing_bbox(bboxes: Union[List[BoundingBox], np.ndarray]) -> Union[BoundingBox, np.ndarray]:
     """Compute enclosing bbox either from:
 
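The new detach_scores helper gives callers one place to split the trailing objectness score off both straight boxes (N, 5) and rotated polygons (N, 5, 2). A minimal usage sketch (the input array below is hypothetical, not taken from this diff):

    import numpy as np
    from doctr.utils.geometry import detach_scores

    # Two hypothetical straight boxes, each (xmin, ymin, xmax, ymax, score)
    preds = np.array([[0.1, 0.1, 0.4, 0.2, 0.9], [0.5, 0.5, 0.9, 0.7, 0.8]])
    loc_preds, scores = detach_scores([preds])
    # loc_preds[0].shape == (2, 4); scores[0] -> array([0.9, 0.8])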
@@ -64,18 +88,18 @@ def resolve_enclosing_bbox(bboxes: Union[List[BoundingBox], np.ndarray]) -> Union[BoundingBox, np.ndarray]:
     ----
         bboxes: boxes in one of the following formats:
 
-        - an array of boxes: (*, 5), where boxes have this shape:
-        (xmin, ymin, xmax, ymax, score)
+        - an array of boxes: (*, 4), where boxes have this shape:
+        (xmin, ymin, xmax, ymax)
 
         - a list of BoundingBox
 
     Returns:
     -------
-        a (1, 5) array (enclosing boxarray), or a BoundingBox
+        a (1, 4) array (enclosing boxarray), or a BoundingBox
     """
     if isinstance(bboxes, np.ndarray):
-        xmin, ymin, xmax, ymax, score = np.split(bboxes, 5, axis=1)
-        return np.array([xmin.min(), ymin.min(), xmax.max(), ymax.max(), score.mean()])
+        xmin, ymin, xmax, ymax = np.split(bboxes, 4, axis=1)
+        return np.array([xmin.min(), ymin.min(), xmax.max(), ymax.max()])
     else:
         x, y = zip(*[point for box in bboxes for point in box])
         return (min(x), min(y)), (max(x), max(y))
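With scores detached upstream, resolve_enclosing_bbox now operates on plain (*, 4) arrays. A quick check of the np.split path (the array is invented for illustration):

    import numpy as np

    bboxes = np.array([[0.1, 0.2, 0.3, 0.4], [0.05, 0.3, 0.5, 0.6]])
    xmin, ymin, xmax, ymax = np.split(bboxes, 4, axis=1)
    enclosing = np.array([xmin.min(), ymin.min(), xmax.max(), ymax.max()])
    # -> array([0.05, 0.2, 0.5, 0.6])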
@@ -88,21 +112,21 @@ def resolve_enclosing_rbbox(rbboxes: List[np.ndarray], intermed_size: int = 1024) -> np.ndarray:
     ----
         rbboxes: boxes in one of the following formats:
 
-        - an array of boxes: (*, 5, 2), where boxes have this shape:
-        (x1, y1), (x2, y2), (x3, y3), (x4, y4), (score, score)
+        - an array of boxes: (*, 4, 2), where boxes have this shape:
+        (x1, y1), (x2, y2), (x3, y3), (x4, y4)
 
         - a list of BoundingBox
         intermed_size: size of the intermediate image
 
     Returns:
     -------
-        a (5, 2) array (enclosing rotated box)
+        a (4, 2) array (enclosing rotated box)
     """
     cloud: np.ndarray = np.concatenate(rbboxes, axis=0)
     # Convert to absolute for minAreaRect
     cloud *= intermed_size
     rect = cv2.minAreaRect(cloud.astype(np.int32))
-    return cv2.boxPoints(rect) / intermed_size  # type: ignore[operator]
+    return cv2.boxPoints(rect) / intermed_size  # type: ignore[return-value]
 
 
 def rotate_abs_points(points: np.ndarray, angle: float = 0.0) -> np.ndarray:
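Apart from the docstring and the mypy error code, resolve_enclosing_rbbox is unchanged: it still scales the relative point cloud by intermed_size so cv2.minAreaRect can work on integer coordinates, then scales the result back. A sketch of that round trip (the points are invented):

    import cv2
    import numpy as np

    # Two hypothetical rotated word boxes in relative coordinates, each (4, 2)
    b1 = np.array([[0.1, 0.1], [0.3, 0.1], [0.3, 0.2], [0.1, 0.2]], dtype=np.float32)
    b2 = np.array([[0.4, 0.3], [0.6, 0.3], [0.6, 0.5], [0.4, 0.5]], dtype=np.float32)
    cloud = np.concatenate([b1, b2], axis=0) * 1024  # intermed_size
    rect = cv2.minAreaRect(cloud.astype(np.int32))
    enclosing = cv2.boxPoints(rect) / 1024  # (4, 2) relative enclosing rotated box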
@@ -232,7 +256,7 @@ def rotate_boxes(
 
     Args:
     ----
-        loc_preds: (N, 5) or (N, 4, 2) array of RELATIVE boxes
+        loc_preds: (N, 4) or (N, 4, 2) array of RELATIVE boxes
         angle: angle between -90 and +90 degrees
         orig_shape: shape of the origin image
         min_angle: minimum angle to rotate boxes
@@ -320,7 +344,7 @@ def rotate_image(
     # Pad height
     else:
         h_pad, w_pad = int(rot_img.shape[1] * image.shape[0] / image.shape[1] - rot_img.shape[0]), 0
-    rot_img = np.pad(rot_img, ((h_pad // 2, h_pad - h_pad // 2), (w_pad // 2, w_pad - w_pad // 2), (0, 0)))
+    rot_img = np.pad(rot_img, ((h_pad // 2, h_pad - h_pad // 2), (w_pad // 2, w_pad - w_pad // 2), (0, 0)))  # type: ignore[assignment]
     if preserve_origin_shape:
         # rescale
         rot_img = cv2.resize(rot_img, image.shape[:-1][::-1], interpolation=cv2.INTER_LINEAR)
@@ -328,6 +352,26 @@ def rotate_image(
     return rot_img
 
 
+def remove_image_padding(image: np.ndarray) -> np.ndarray:
+    """Remove black border padding from an image
+
+    Args:
+    ----
+        image: numpy tensor to remove padding from
+
+    Returns:
+    -------
+        Image with padding removed
+    """
+    # Find the bounding box of the non-black region
+    rows = np.any(image, axis=1)
+    cols = np.any(image, axis=0)
+    rmin, rmax = np.where(rows)[0][[0, -1]]
+    cmin, cmax = np.where(cols)[0][[0, -1]]
+
+    return image[rmin : rmax + 1, cmin : cmax + 1]
+
+
 def estimate_page_angle(polys: np.ndarray) -> float:
     """Takes a batch of rotated previously ORIENTED polys (N, 4, 2) (rectified by the classifier) and return the
     estimated angle ccw in degrees
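The new remove_image_padding crops to the bounding box of all non-zero pixels, so it assumes the padding is strictly black. A toy check of that behaviour (the array is made up for illustration):

    import numpy as np

    img = np.zeros((6, 6, 3), dtype=np.uint8)
    img[2:4, 1:5] = 255  # the only non-black content
    rows, cols = np.any(img, axis=1), np.any(img, axis=0)
    rmin, rmax = np.where(rows)[0][[0, -1]]
    cmin, cmax = np.where(cols)[0][[0, -1]]
    assert img[rmin : rmax + 1, cmin : cmax + 1].shape == (2, 4, 3)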
@@ -408,7 +452,7 @@ def extract_crops(img: np.ndarray, boxes: np.ndarray, channels_last: bool = True) -> List[np.ndarray]:
 
 
 def extract_rcrops(
-    img: np.ndarray, polys: np.ndarray, dtype=np.float32, channels_last: bool = True
+    img: np.ndarray, polys: np.ndarray, dtype=np.float32, channels_last: bool = True, assume_horizontal: bool = False
 ) -> List[np.ndarray]:
     """Created cropped images from list of rotated bounding boxes
 
@@ -418,6 +462,7 @@ def extract_rcrops(
         polys: bounding boxes of shape (N, 4, 2)
         dtype: target data type of bounding boxes
         channels_last: whether the channel dimensions is the last one instead of the last one
+        assume_horizontal: whether the boxes are assumed to be only horizontally oriented
 
     Returns:
     -------
@@ -435,22 +480,87 @@ def extract_rcrops(
     _boxes[:, :, 0] *= width
     _boxes[:, :, 1] *= height
 
-    src_pts = _boxes[:, :3].astype(np.float32)
-    # Preserve size
-    d1 = np.linalg.norm(src_pts[:, 0] - src_pts[:, 1], axis=-1)
-    d2 = np.linalg.norm(src_pts[:, 1] - src_pts[:, 2], axis=-1)
-    # (N, 3, 2)
-    dst_pts = np.zeros((_boxes.shape[0], 3, 2), dtype=dtype)
-    dst_pts[:, 1, 0] = dst_pts[:, 2, 0] = d1 - 1
-    dst_pts[:, 2, 1] = d2 - 1
-    # Use a warp transformation to extract the crop
-    crops = [
-        cv2.warpAffine(
-            img if channels_last else img.transpose(1, 2, 0),
-            # Transformation matrix
-            cv2.getAffineTransform(src_pts[idx], dst_pts[idx]),
-            (int(d1[idx]), int(d2[idx])),
-        )
-        for idx in range(_boxes.shape[0])
-    ]
-    return crops
+    src_img = img if channels_last else img.transpose(1, 2, 0)
+
+    # Handle only horizontal oriented boxes
+    if assume_horizontal:
+        crops = []
+
+        for box in _boxes:
+            # Calculate the centroid of the quadrilateral
+            centroid = np.mean(box, axis=0)
+
+            # Divide the points into left and right
+            left_points = box[box[:, 0] < centroid[0]]
+            right_points = box[box[:, 0] >= centroid[0]]
+
+            # Sort the left points according to the y-axis
+            left_points = left_points[np.argsort(left_points[:, 1])]
+            top_left_pt = left_points[0]
+            bottom_left_pt = left_points[-1]
+            # Sort the right points according to the y-axis
+            right_points = right_points[np.argsort(right_points[:, 1])]
+            top_right_pt = right_points[0]
+            bottom_right_pt = right_points[-1]
+            box_points = np.array(
+                [top_left_pt, bottom_left_pt, top_right_pt, bottom_right_pt],
+                dtype=dtype,
+            )
+
+            # Get the width and height of the rectangle that will contain the warped quadrilateral
+            width_upper = np.linalg.norm(top_right_pt - top_left_pt)
+            width_lower = np.linalg.norm(bottom_right_pt - bottom_left_pt)
+            height_left = np.linalg.norm(bottom_left_pt - top_left_pt)
+            height_right = np.linalg.norm(bottom_right_pt - top_right_pt)
+
+            # Get the maximum width and height
+            rect_width = max(int(width_upper), int(width_lower))
+            rect_height = max(int(height_left), int(height_right))
+
+            dst_pts = np.array(
+                [
+                    [0, 0],  # top-left
+                    # bottom-left
+                    [0, rect_height - 1],
+                    # top-right
+                    [rect_width - 1, 0],
+                    # bottom-right
+                    [rect_width - 1, rect_height - 1],
+                ],
+                dtype=dtype,
+            )
+
+            # Get the perspective transform matrix using the box points
+            affine_mat = cv2.getPerspectiveTransform(box_points, dst_pts)
+
+            # Perform the perspective warp to get the rectified crop
+            crop = cv2.warpPerspective(
+                src_img,
+                affine_mat,
+                (rect_width, rect_height),
+            )
+
+            # Add the crop to the list of crops
+            crops.append(crop)
+
+    # Handle any oriented boxes
+    else:
+        src_pts = _boxes[:, :3].astype(np.float32)
+        # Preserve size
+        d1 = np.linalg.norm(src_pts[:, 0] - src_pts[:, 1], axis=-1)
+        d2 = np.linalg.norm(src_pts[:, 1] - src_pts[:, 2], axis=-1)
+        # (N, 3, 2)
+        dst_pts = np.zeros((_boxes.shape[0], 3, 2), dtype=dtype)
+        dst_pts[:, 1, 0] = dst_pts[:, 2, 0] = d1 - 1
+        dst_pts[:, 2, 1] = d2 - 1
+        # Use a warp transformation to extract the crop
+        crops = [
+            cv2.warpAffine(
+                src_img,
+                # Transformation matrix
+                cv2.getAffineTransform(src_pts[idx], dst_pts[idx]),
+                (int(d1[idx]), int(d2[idx])),
+            )
+            for idx in range(_boxes.shape[0])
+        ]
    return crops  # type: ignore[return-value]
doctr/utils/metrics.py
CHANGED
@@ -5,16 +5,14 @@
 
 from typing import Dict, List, Optional, Tuple
 
-import cv2
 import numpy as np
+from anyascii import anyascii
 from scipy.optimize import linear_sum_assignment
-from unidecode import unidecode
+from shapely.geometry import Polygon
 
 __all__ = [
     "TextMatch",
     "box_iou",
-    "box_ioa",
-    "mask_iou",
     "polygon_iou",
     "nms",
     "LocalizationConfusion",
@@ -34,16 +32,16 @@ def string_match(word1: str, word2: str) -> Tuple[bool, bool, bool, bool]:
     Returns:
     -------
        a tuple with booleans specifying respectively whether the raw strings, their lower-case counterparts, their
-       unidecode counterparts and their lower-case unidecode counterparts match
+       anyascii counterparts and their lower-case anyascii counterparts match
    """
    raw_match = word1 == word2
    caseless_match = word1.lower() == word2.lower()
-   unidecode_match = unidecode(word1) == unidecode(word2)
+   anyascii_match = anyascii(word1) == anyascii(word2)
 
    # Warning: the order is important here otherwise the pair ("EUR", "€") cannot be matched
-   unicase_match = unidecode(word1).lower() == unidecode(word2).lower()
+   unicase_match = anyascii(word1).lower() == anyascii(word2).lower()
 
-   return raw_match, caseless_match, unidecode_match, unicase_match
+   return raw_match, caseless_match, anyascii_match, unicase_match
 
 
 class TextMatch:
@@ -94,10 +92,10 @@ class TextMatch:
             raise AssertionError("prediction size does not match with ground-truth labels size")
 
         for gt_word, pred_word in zip(gt, pred):
-            _raw, _caseless, _unidecode, _unicase = string_match(gt_word, pred_word)
+            _raw, _caseless, _anyascii, _unicase = string_match(gt_word, pred_word)
             self.raw += int(_raw)
             self.caseless += int(_caseless)
-            self.unidecode += int(_unidecode)
+            self.anyascii += int(_anyascii)
             self.unicase += int(_unicase)
 
         self.total += len(gt)
@@ -107,8 +105,8 @@ class TextMatch:
 
         Returns
         -------
-            a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode
-            counterpart and its lower-case unidecode counterpart
+            a dictionary with the exact match score for the raw data, its lower-case counterpart, its anyascii
+            counterpart and its lower-case anyascii counterpart
        """
        if self.total == 0:
            raise AssertionError("you need to update the metric before getting the summary")
@@ -116,14 +114,14 @@ class TextMatch:
        return dict(
            raw=self.raw / self.total,
            caseless=self.caseless / self.total,
-           unidecode=self.unidecode / self.total,
+           anyascii=self.anyascii / self.total,
            unicase=self.unicase / self.total,
        )
 
    def reset(self) -> None:
        self.raw = 0
        self.caseless = 0
-       self.unidecode = 0
+       self.anyascii = 0
        self.unicase = 0
        self.total = 0
 
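The unidecode dependency is replaced by anyascii across string_match, TextMatch and OCRMetric, keeping the four match levels intact. A sketch on the pair cited in the source comment (assuming, as that comment implies, that anyascii("€") transliterates to "EUR"):

    from anyascii import anyascii

    word1, word2 = "EUR", "€"
    raw_match = word1 == word2                               # False
    caseless_match = word1.lower() == word2.lower()          # False
    anyascii_match = anyascii(word1) == anyascii(word2)      # True if anyascii("€") == "EUR"
    unicase_match = anyascii(word1).lower() == anyascii(word2).lower()  # True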
@@ -151,73 +149,14 @@ def box_iou(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray:
         right = np.minimum(r1, r2.T)
         bot = np.minimum(b1, b2.T)
 
-        intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf)
+        intersection = np.clip(right - left, 0, np.inf) * np.clip(bot - top, 0, np.inf)
         union = (r1 - l1) * (b1 - t1) + ((r2 - l2) * (b2 - t2)).T - intersection
         iou_mat = intersection / union
 
     return iou_mat
 
 
-def box_ioa(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray:
-    """Computes the IoA (intersection over area) between two sets of bounding boxes:
-    ioa(i, j) = inter(i, j) / area(i)
-
-    Args:
-    ----
-        boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax)
-        boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax)
-
-    Returns:
-    -------
-        the IoA matrix of shape (N, M)
-    """
-    ioa_mat: np.ndarray = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32)
-
-    if boxes_1.shape[0] > 0 and boxes_2.shape[0] > 0:
-        l1, t1, r1, b1 = np.split(boxes_1, 4, axis=1)
-        l2, t2, r2, b2 = np.split(boxes_2, 4, axis=1)
-
-        left = np.maximum(l1, l2.T)
-        top = np.maximum(t1, t2.T)
-        right = np.minimum(r1, r2.T)
-        bot = np.minimum(b1, b2.T)
-
-        intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf)
-        area = (r1 - l1) * (b1 - t1)
-        ioa_mat = intersection / area
-
-    return ioa_mat
-
-
-def mask_iou(masks_1: np.ndarray, masks_2: np.ndarray) -> np.ndarray:
-    """Computes the IoU between two sets of boolean masks
-
-    Args:
-    ----
-        masks_1: boolean masks of shape (N, H, W)
-        masks_2: boolean masks of shape (M, H, W)
-
-    Returns:
-    -------
-        the IoU matrix of shape (N, M)
-    """
-    if masks_1.shape[1:] != masks_2.shape[1:]:
-        raise AssertionError("both boolean masks should have the same spatial shape")
-
-    iou_mat: np.ndarray = np.zeros((masks_1.shape[0], masks_2.shape[0]), dtype=np.float32)
-
-    if masks_1.shape[0] > 0 and masks_2.shape[0] > 0:
-        axes = tuple(range(2, masks_1.ndim + 1))
-        intersection = np.logical_and(masks_1[:, None, ...], masks_2[None, ...]).sum(axis=axes)
-        union = np.logical_or(masks_1[:, None, ...], masks_2[None, ...]).sum(axis=axes)
-        iou_mat = intersection / union
-
-    return iou_mat
-
-
-def polygon_iou(
-    polys_1: np.ndarray, polys_2: np.ndarray, mask_shape: Tuple[int, int], use_broadcasting: bool = False
-) -> np.ndarray:
+def polygon_iou(polys_1: np.ndarray, polys_2: np.ndarray) -> np.ndarray:
     """Computes the IoU between two sets of rotated bounding boxes
 
     Args:
@@ -234,80 +173,18 @@ def polygon_iou(
     if polys_1.ndim != 3 or polys_2.ndim != 3:
         raise AssertionError("expects boxes to be in format (N, 4, 2)")
 
-    iou_mat: np.ndarray = np.zeros((polys_1.shape[0], polys_2.shape[0]), dtype=np.float32)
-
-    if polys_1.shape[0] > 0 and polys_2.shape[0] > 0:
-        if use_broadcasting:
-            masks_1 = rbox_to_mask(polys_1, shape=mask_shape)
-            masks_2 = rbox_to_mask(polys_2, shape=mask_shape)
-            iou_mat = mask_iou(masks_1, masks_2)
-        else:
-            # Save memory by doing the computation for each pair
-            for idx, b1 in enumerate(polys_1):
-                m1 = _rbox_to_mask(b1, mask_shape)
-                for _idx, b2 in enumerate(polys_2):
-                    m2 = _rbox_to_mask(b2, mask_shape)
-                    iou_mat[idx, _idx] = np.logical_and(m1, m2).sum() / np.logical_or(m1, m2).sum()
-
-    return iou_mat
-
-
-def _rbox_to_mask(box: np.ndarray, shape: Tuple[int, int]) -> np.ndarray:
-    """Converts a rotated bounding box to a boolean mask
-
-    Args:
-    ----
-        box: rotated bounding box of shape (4, 2)
-        shape: spatial shapes of the output masks
-
-    Returns:
-    -------
-        the boolean mask of the specified shape
-    """
-    mask: np.ndarray = np.zeros(shape, dtype=np.uint8)
-    # Get absolute coords
-    if not np.issubdtype(box.dtype, np.integer):
-        abs_box = box.copy()
-        abs_box[:, 0] = abs_box[:, 0] * shape[1]
-        abs_box[:, 1] = abs_box[:, 1] * shape[0]
-        abs_box = abs_box.round().astype(int)
-    else:
-        abs_box = box
-        abs_box[2:] = abs_box[2:] + 1
-    cv2.fillPoly(mask, [abs_box - 1], 1.0)  # type: ignore[call-overload]
-
-    return mask.astype(bool)
-
-
-def rbox_to_mask(boxes: np.ndarray, shape: Tuple[int, int]) -> np.ndarray:
-    """Converts rotated bounding boxes to boolean masks
-
-    Args:
-    ----
-        boxes: rotated bounding boxes of shape (N, 4, 2)
-        shape: spatial shapes of the output masks
-
-    Returns:
-    -------
-        the boolean masks of shape (N, H, W)
-    """
-    masks: np.ndarray = np.zeros((boxes.shape[0], *shape), dtype=np.uint8)
-
-    if boxes.shape[0] > 0:
-        # Get absolute coordinates
-        if not np.issubdtype(boxes.dtype, np.integer):
-            abs_boxes = boxes.copy()
-            abs_boxes[:, :, 0] = abs_boxes[:, :, 0] * shape[1]
-            abs_boxes[:, :, 1] = abs_boxes[:, :, 1] * shape[0]
-            abs_boxes = abs_boxes.round().astype(int)
-        else:
-            abs_boxes = boxes
-            abs_boxes[:, 2:] = abs_boxes[:, 2:] + 1
-
-        # TODO: optimize slicing to improve vectorization
-        for idx, _box in enumerate(abs_boxes):
-            cv2.fillPoly(masks[idx], [_box - 1], 1.0)  # type: ignore[call-overload]
-    return masks.astype(bool)
+    iou_mat = np.zeros((polys_1.shape[0], polys_2.shape[0]), dtype=np.float32)
+
+    shapely_polys_1 = [Polygon(poly) for poly in polys_1]
+    shapely_polys_2 = [Polygon(poly) for poly in polys_2]
+
+    for i, poly1 in enumerate(shapely_polys_1):
+        for j, poly2 in enumerate(shapely_polys_2):
+            intersection_area = poly1.intersection(poly2).area
+            union_area = poly1.area + poly2.area - intersection_area
+            iou_mat[i, j] = intersection_area / union_area
+
+    return iou_mat
 
 
 def nms(boxes: np.ndarray, thresh: float = 0.5) -> List[int]:
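The rewritten polygon_iou trades mask rasterization for exact Shapely geometry, which is why mask_shape and use_broadcasting disappear from every metric below. A minimal sketch of the new computation (the quads are invented for illustration):

    from shapely.geometry import Polygon

    p1 = Polygon([(0.0, 0.0), (0.5, 0.0), (0.5, 0.5), (0.0, 0.5)])
    p2 = Polygon([(0.25, 0.25), (0.75, 0.25), (0.75, 0.75), (0.25, 0.75)])
    inter = p1.intersection(p2).area             # 0.0625
    iou = inter / (p1.area + p2.area - inter)    # 0.0625 / 0.4375 ≈ 0.143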
@@ -386,21 +263,15 @@ class LocalizationConfusion:
     ----
         iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
         use_polygons: if set to True, predictions and targets will be expected to have rotated format
-        mask_shape: if use_polygons is True, describes the spatial shape of the image used
-        use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory
     """
 
     def __init__(
         self,
         iou_thresh: float = 0.5,
         use_polygons: bool = False,
-        mask_shape: Tuple[int, int] = (1024, 1024),
-        use_broadcasting: bool = True,
     ) -> None:
         self.iou_thresh = iou_thresh
         self.use_polygons = use_polygons
-        self.mask_shape = mask_shape
-        self.use_broadcasting = use_broadcasting
         self.reset()
 
     def update(self, gts: np.ndarray, preds: np.ndarray) -> None:
@@ -414,7 +285,7 @@ class LocalizationConfusion:
         if preds.shape[0] > 0:
             # Compute IoU
             if self.use_polygons:
-                iou_mat = polygon_iou(gts, preds, self.mask_shape, self.use_broadcasting)
+                iou_mat = polygon_iou(gts, preds)
             else:
                 iou_mat = box_iou(gts, preds)
             self.tot_iou += float(iou_mat.max(axis=0).sum())
@@ -441,7 +312,7 @@ class LocalizationConfusion:
         precision = self.matches / self.num_preds if self.num_preds > 0 else None
 
         # mean IoU
-        mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
+        mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
 
         return recall, precision, mean_iou
 
@@ -492,21 +363,15 @@ class OCRMetric:
     ----
         iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
         use_polygons: if set to True, predictions and targets will be expected to have rotated format
-        mask_shape: if use_polygons is True, describes the spatial shape of the image used
-        use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory
     """
 
     def __init__(
         self,
         iou_thresh: float = 0.5,
         use_polygons: bool = False,
-        mask_shape: Tuple[int, int] = (1024, 1024),
-        use_broadcasting: bool = True,
     ) -> None:
         self.iou_thresh = iou_thresh
         self.use_polygons = use_polygons
-        self.mask_shape = mask_shape
-        self.use_broadcasting = use_broadcasting
         self.reset()
 
     def update(
@@ -533,7 +398,7 @@ class OCRMetric:
         # Compute IoU
         if pred_boxes.shape[0] > 0:
             if self.use_polygons:
-                iou_mat = polygon_iou(gt_boxes, pred_boxes, self.mask_shape, self.use_broadcasting)
+                iou_mat = polygon_iou(gt_boxes, pred_boxes)
             else:
                 iou_mat = box_iou(gt_boxes, pred_boxes)
 
@@ -544,10 +409,10 @@ class OCRMetric:
             is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh
             # String comparison
             for gt_idx, pred_idx in zip(gt_indices[is_kept], pred_indices[is_kept]):
-                _raw, _caseless, _unidecode, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
+                _raw, _caseless, _anyascii, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
                 self.raw_matches += int(_raw)
                 self.caseless_matches += int(_caseless)
-                self.unidecode_matches += int(_unidecode)
+                self.anyascii_matches += int(_anyascii)
                 self.unicase_matches += int(_unicase)
 
         self.num_gts += gt_boxes.shape[0]
@@ -564,7 +429,7 @@ class OCRMetric:
         recall = dict(
             raw=self.raw_matches / self.num_gts if self.num_gts > 0 else None,
             caseless=self.caseless_matches / self.num_gts if self.num_gts > 0 else None,
-            unidecode=self.unidecode_matches / self.num_gts if self.num_gts > 0 else None,
+            anyascii=self.anyascii_matches / self.num_gts if self.num_gts > 0 else None,
             unicase=self.unicase_matches / self.num_gts if self.num_gts > 0 else None,
         )
 
@@ -572,12 +437,12 @@ class OCRMetric:
         precision = dict(
             raw=self.raw_matches / self.num_preds if self.num_preds > 0 else None,
             caseless=self.caseless_matches / self.num_preds if self.num_preds > 0 else None,
-            unidecode=self.unidecode_matches / self.num_preds if self.num_preds > 0 else None,
+            anyascii=self.anyascii_matches / self.num_preds if self.num_preds > 0 else None,
             unicase=self.unicase_matches / self.num_preds if self.num_preds > 0 else None,
         )
 
         # mean IoU (overall detected boxes)
-        mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
+        mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
 
         return recall, precision, mean_iou
 
@@ -587,7 +452,7 @@ class OCRMetric:
         self.tot_iou = 0.0
         self.raw_matches = 0
         self.caseless_matches = 0
-        self.unidecode_matches = 0
+        self.anyascii_matches = 0
         self.unicase_matches = 0
 
 
@@ -631,21 +496,15 @@ class DetectionMetric:
     ----
         iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
         use_polygons: if set to True, predictions and targets will be expected to have rotated format
-        mask_shape: if use_polygons is True, describes the spatial shape of the image used
-        use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory
     """
 
     def __init__(
         self,
        iou_thresh: float = 0.5,
        use_polygons: bool = False,
-       mask_shape: Tuple[int, int] = (1024, 1024),
-       use_broadcasting: bool = True,
    ) -> None:
        self.iou_thresh = iou_thresh
        self.use_polygons = use_polygons
-       self.mask_shape = mask_shape
-       self.use_broadcasting = use_broadcasting
        self.reset()
 
    def update(
@@ -672,7 +531,7 @@ class DetectionMetric:
        # Compute IoU
        if pred_boxes.shape[0] > 0:
            if self.use_polygons:
-               iou_mat = polygon_iou(gt_boxes, pred_boxes, self.mask_shape, self.use_broadcasting)
+               iou_mat = polygon_iou(gt_boxes, pred_boxes)
            else:
                iou_mat = box_iou(gt_boxes, pred_boxes)
 
@@ -701,7 +560,7 @@ class DetectionMetric:
        precision = self.num_matches / self.num_preds if self.num_preds > 0 else None
 
        # mean IoU (overall detected boxes)
-       mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
+       mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
 
        return recall, precision, mean_iou
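After this change the three metric classes take only iou_thresh and use_polygons. A usage sketch against the 0.10.0 signatures (the arrays are hypothetical):

    import numpy as np
    from doctr.utils.metrics import LocalizationConfusion

    metric = LocalizationConfusion(iou_thresh=0.5)   # mask_shape/use_broadcasting are gone
    gts = np.array([[0.1, 0.1, 0.4, 0.3]])           # (N, 4) relative boxes
    preds = np.array([[0.12, 0.11, 0.38, 0.29]])
    metric.update(gts, preds)
    recall, precision, mean_iou = metric.summary()   # mean_iou is now rounded to 2 decimals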