python-doctr 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff shows the changes between two publicly released package versions, as they appear in their public registry. It is provided for informational purposes only.
- doctr/__init__.py +1 -1
- doctr/contrib/__init__.py +0 -0
- doctr/contrib/artefacts.py +131 -0
- doctr/contrib/base.py +105 -0
- doctr/datasets/datasets/pytorch.py +2 -2
- doctr/datasets/generator/base.py +6 -5
- doctr/datasets/imgur5k.py +1 -1
- doctr/datasets/loader.py +1 -6
- doctr/datasets/utils.py +2 -1
- doctr/datasets/vocabs.py +9 -2
- doctr/file_utils.py +26 -12
- doctr/io/elements.py +40 -6
- doctr/io/html.py +2 -2
- doctr/io/image/pytorch.py +6 -8
- doctr/io/image/tensorflow.py +1 -1
- doctr/io/pdf.py +5 -2
- doctr/io/reader.py +6 -0
- doctr/models/__init__.py +0 -1
- doctr/models/_utils.py +57 -20
- doctr/models/builder.py +71 -13
- doctr/models/classification/mobilenet/pytorch.py +45 -9
- doctr/models/classification/mobilenet/tensorflow.py +38 -7
- doctr/models/classification/predictor/pytorch.py +18 -11
- doctr/models/classification/predictor/tensorflow.py +16 -10
- doctr/models/classification/textnet/pytorch.py +3 -3
- doctr/models/classification/textnet/tensorflow.py +3 -3
- doctr/models/classification/zoo.py +39 -15
- doctr/models/detection/_utils/__init__.py +1 -0
- doctr/models/detection/_utils/base.py +66 -0
- doctr/models/detection/differentiable_binarization/base.py +4 -3
- doctr/models/detection/differentiable_binarization/pytorch.py +2 -2
- doctr/models/detection/fast/base.py +6 -5
- doctr/models/detection/fast/pytorch.py +4 -4
- doctr/models/detection/fast/tensorflow.py +4 -4
- doctr/models/detection/linknet/base.py +4 -3
- doctr/models/detection/predictor/pytorch.py +15 -1
- doctr/models/detection/predictor/tensorflow.py +15 -1
- doctr/models/detection/zoo.py +7 -2
- doctr/models/factory/hub.py +3 -12
- doctr/models/kie_predictor/base.py +9 -3
- doctr/models/kie_predictor/pytorch.py +41 -20
- doctr/models/kie_predictor/tensorflow.py +36 -16
- doctr/models/modules/layers/pytorch.py +2 -3
- doctr/models/modules/layers/tensorflow.py +6 -8
- doctr/models/modules/transformer/pytorch.py +2 -2
- doctr/models/predictor/base.py +77 -50
- doctr/models/predictor/pytorch.py +31 -20
- doctr/models/predictor/tensorflow.py +27 -17
- doctr/models/preprocessor/pytorch.py +4 -4
- doctr/models/preprocessor/tensorflow.py +3 -2
- doctr/models/recognition/master/pytorch.py +2 -2
- doctr/models/recognition/parseq/pytorch.py +4 -3
- doctr/models/recognition/parseq/tensorflow.py +4 -3
- doctr/models/recognition/sar/pytorch.py +7 -6
- doctr/models/recognition/sar/tensorflow.py +3 -9
- doctr/models/recognition/vitstr/pytorch.py +1 -1
- doctr/models/recognition/zoo.py +1 -1
- doctr/models/zoo.py +2 -2
- doctr/py.typed +0 -0
- doctr/transforms/functional/base.py +1 -1
- doctr/transforms/functional/pytorch.py +4 -4
- doctr/transforms/modules/base.py +37 -15
- doctr/transforms/modules/pytorch.py +66 -8
- doctr/transforms/modules/tensorflow.py +63 -7
- doctr/utils/fonts.py +7 -5
- doctr/utils/geometry.py +35 -12
- doctr/utils/metrics.py +33 -174
- doctr/utils/reconstitution.py +126 -0
- doctr/utils/visualization.py +5 -118
- doctr/version.py +1 -1
- {python_doctr-0.8.1.dist-info → python_doctr-0.9.0.dist-info}/METADATA +84 -80
- {python_doctr-0.8.1.dist-info → python_doctr-0.9.0.dist-info}/RECORD +76 -76
- {python_doctr-0.8.1.dist-info → python_doctr-0.9.0.dist-info}/WHEEL +1 -1
- doctr/models/artefacts/__init__.py +0 -2
- doctr/models/artefacts/barcode.py +0 -74
- doctr/models/artefacts/face.py +0 -63
- doctr/models/obj_detection/__init__.py +0 -1
- doctr/models/obj_detection/faster_rcnn/__init__.py +0 -4
- doctr/models/obj_detection/faster_rcnn/pytorch.py +0 -81
- {python_doctr-0.8.1.dist-info → python_doctr-0.9.0.dist-info}/LICENSE +0 -0
- {python_doctr-0.8.1.dist-info → python_doctr-0.9.0.dist-info}/top_level.txt +0 -0
- {python_doctr-0.8.1.dist-info → python_doctr-0.9.0.dist-info}/zip-safe +0 -0
doctr/transforms/modules/tensorflow.py CHANGED

@@ -4,7 +4,7 @@
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import random
-from typing import Any, Callable,
+from typing import Any, Callable, Iterable, List, Optional, Tuple, Union
 
 import numpy as np
 import tensorflow as tf
@@ -30,6 +30,7 @@ __all__ = [
     "GaussianNoise",
     "RandomHorizontalFlip",
     "RandomShadow",
+    "RandomResize",
 ]
 
 
@@ -457,10 +458,7 @@ class RandomHorizontalFlip(NestedObject):
     >>> from doctr.transforms import RandomHorizontalFlip
     >>> transfo = RandomHorizontalFlip(p=0.5)
     >>> image = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)
-    >>> target = {
-    >>>     "boxes": np.array([[0.1, 0.1, 0.4, 0.5]], dtype=np.float32),
-    >>>     "labels": np.ones(1, dtype=np.int64)
-    >>> }
+    >>> target = np.array([[0.1, 0.1, 0.4, 0.5]], dtype=np.float32)
    >>> out = transfo(image, target)
 
     Args:
@@ -472,12 +470,15 @@ class RandomHorizontalFlip(NestedObject):
         super().__init__()
         self.p = p
 
-    def __call__(self, img: Union[tf.Tensor, np.ndarray], target:
+    def __call__(self, img: Union[tf.Tensor, np.ndarray], target: np.ndarray) -> Tuple[tf.Tensor, np.ndarray]:
         if np.random.rand(1) <= self.p:
             _img = tf.image.flip_left_right(img)
             _target = target.copy()
             # Changing the relative bbox coordinates
-
+            if target.shape[1:] == (4,):
+                _target[:, ::2] = 1 - target[:, [2, 0]]
+            else:
+                _target[..., 0] = 1 - target[..., 0]
             return _img, _target
         return img, target
 
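The target handling above changes from a dict to a bare array. A minimal NumPy sketch of the new mirroring arithmetic (illustrative values, no TensorFlow required):

```python
import numpy as np

# Relative (xmin, ymin, xmax, ymax) boxes are mirrored by inverting and
# swapping the x-coordinates, exactly as in the hunk above.
target = np.array([[0.1, 0.1, 0.4, 0.5]], dtype=np.float32)

flipped = target.copy()
if target.shape[1:] == (4,):
    # new xmin = 1 - old xmax, new xmax = 1 - old xmin
    flipped[:, ::2] = 1 - target[:, [2, 0]]
else:
    # polygon targets of shape (N, 4, 2): only the x-coordinate is mirrored
    flipped[..., 0] = 1 - target[..., 0]

print(flipped)  # [[0.6 0.1 0.9 0.5]]
```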
@@ -515,3 +516,58 @@ class RandomShadow(NestedObject):
 
     def extra_repr(self) -> str:
         return f"opacity_range={self.opacity_range}"
+
+
+class RandomResize(NestedObject):
+    """Randomly resize the input image and align corresponding targets
+
+    >>> import tensorflow as tf
+    >>> from doctr.transforms import RandomResize
+    >>> transfo = RandomResize((0.3, 0.9), preserve_aspect_ratio=True, symmetric_pad=True, p=0.5)
+    >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
+
+    Args:
+    ----
+        scale_range: range of the resizing factor for width and height (independently)
+        preserve_aspect_ratio: whether to preserve the aspect ratio of the image,
+            given a float value, the aspect ratio will be preserved with this probability
+        symmetric_pad: whether to symmetrically pad the image,
+            given a float value, the symmetric padding will be applied with this probability
+        p: probability to apply the transformation
+    """
+
+    def __init__(
+        self,
+        scale_range: Tuple[float, float] = (0.3, 0.9),
+        preserve_aspect_ratio: Union[bool, float] = False,
+        symmetric_pad: Union[bool, float] = False,
+        p: float = 0.5,
+    ):
+        super().__init__()
+        self.scale_range = scale_range
+        self.preserve_aspect_ratio = preserve_aspect_ratio
+        self.symmetric_pad = symmetric_pad
+        self.p = p
+        self._resize = Resize
+
+    def __call__(self, img: tf.Tensor, target: np.ndarray) -> Tuple[tf.Tensor, np.ndarray]:
+        if np.random.rand(1) <= self.p:
+            scale_h = random.uniform(*self.scale_range)
+            scale_w = random.uniform(*self.scale_range)
+            new_size = (int(img.shape[-3] * scale_h), int(img.shape[-2] * scale_w))
+
+            _img, _target = self._resize(
+                new_size,
+                preserve_aspect_ratio=self.preserve_aspect_ratio
+                if isinstance(self.preserve_aspect_ratio, bool)
+                else bool(np.random.rand(1) <= self.symmetric_pad),
+                symmetric_pad=self.symmetric_pad
+                if isinstance(self.symmetric_pad, bool)
+                else bool(np.random.rand(1) <= self.symmetric_pad),
+            )(img, target)
+
+            return _img, _target
+        return img, target
+
+    def extra_repr(self) -> str:
+        return f"scale_range={self.scale_range}, preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}, p={self.p}"  # noqa: E501
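A usage sketch of the new transform, assuming doctr 0.9.0 with the TensorFlow backend installed; note that `__call__` takes both an image and a relative-coordinate target, unlike the one-argument docstring example above:

```python
import numpy as np
import tensorflow as tf

from doctr.transforms import RandomResize

# p=1.0 forces the transform to fire; float values for preserve_aspect_ratio /
# symmetric_pad are interpreted as probabilities (see __init__ above).
transfo = RandomResize(scale_range=(0.3, 0.9), preserve_aspect_ratio=0.5, symmetric_pad=0.5, p=1.0)

img = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)
target = np.array([[0.1, 0.1, 0.4, 0.5]], dtype=np.float32)

out_img, out_target = transfo(img, target)
print(out_img.shape, out_target.shape)
```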
doctr/utils/fonts.py CHANGED

@@ -5,14 +5,16 @@
 
 import logging
 import platform
-from typing import Optional
+from typing import Optional, Union
 
 from PIL import ImageFont
 
 __all__ = ["get_font"]
 
 
-def get_font(
+def get_font(
+    font_family: Optional[str] = None, font_size: int = 13
+) -> Union[ImageFont.FreeTypeFont, ImageFont.ImageFont]:
     """Resolves a compatible ImageFont for the system
 
     Args:
@@ -28,14 +30,14 @@ def get_font(font_family: Optional[str] = None, font_size: int = 13) -> ImageFon
     if font_family is None:
         try:
             font = ImageFont.truetype("FreeMono.ttf" if platform.system() == "Linux" else "Arial.ttf", font_size)
-        except OSError:
-            font = ImageFont.load_default()
+        except OSError:  # pragma: no cover
+            font = ImageFont.load_default()  # type: ignore[assignment]
             logging.warning(
                 "unable to load recommended font family. Loading default PIL font,"
                 "font size issues may be expected."
                 "To prevent this, it is recommended to specify the value of 'font_family'."
             )
-    else:
+    else:  # pragma: no cover
         font = ImageFont.truetype(font_family, font_size)
 
     return font
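A small usage sketch: since the return type is now `Union[ImageFont.FreeTypeFont, ImageFont.ImageFont]`, callers should not assume FreeType-only attributes when the PIL bitmap fallback is in play.

```python
from PIL import Image, ImageDraw

from doctr.utils.fonts import get_font

# Resolves a TrueType font when available, otherwise the default PIL font.
font = get_font(font_size=24)

img = Image.new("RGB", (200, 50), "white")
ImageDraw.Draw(img).text((10, 10), "hello doctr", fill="black", font=font)
```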
doctr/utils/geometry.py CHANGED

@@ -25,6 +25,7 @@ __all__ = [
     "rotate_abs_geoms",
     "extract_crops",
     "extract_rcrops",
+    "detach_scores",
 ]
 
 
@@ -57,6 +58,28 @@ def polygon_to_bbox(polygon: Polygon4P) -> BoundingBox:
     return (min(x), min(y)), (max(x), max(y))
 
 
+def detach_scores(boxes: List[np.ndarray]) -> Tuple[List[np.ndarray], List[np.ndarray]]:
+    """Detach the objectness scores from box predictions
+
+    Args:
+    ----
+        boxes: list of arrays with boxes of shape (N, 5) or (N, 5, 2)
+
+    Returns:
+    -------
+        a tuple of two lists: the first one contains the boxes without the objectness scores,
+        the second one contains the objectness scores
+    """
+
+    def _detach(boxes: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+        if boxes.ndim == 2:
+            return boxes[:, :-1], boxes[:, -1]
+        return boxes[:, :-1], boxes[:, -1, -1]
+
+    loc_preds, obj_scores = zip(*(_detach(box) for box in boxes))
+    return list(loc_preds), list(obj_scores)
+
+
 def resolve_enclosing_bbox(bboxes: Union[List[BoundingBox], np.ndarray]) -> Union[BoundingBox, np.ndarray]:
     """Compute enclosing bbox either from:
 
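A quick sketch of the new helper on a straight-box array; rotated predictions of shape (N, 5, 2) take the second branch of `_detach`:

```python
import numpy as np

from doctr.utils.geometry import detach_scores

# One straight box (xmin, ymin, xmax, ymax, score) -> shape (1, 5)
preds = np.array([[0.1, 0.1, 0.4, 0.5, 0.9]], dtype=np.float32)

loc_preds, obj_scores = detach_scores([preds])
print(loc_preds[0])   # [[0.1 0.1 0.4 0.5]]
print(obj_scores[0])  # [0.9]
```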
@@ -64,18 +87,18 @@ def resolve_enclosing_bbox(bboxes: Union[List[BoundingBox], np.ndarray]) -> Union[BoundingBox, np.ndarray]:
     ----
         bboxes: boxes in one of the following formats:
 
-        - an array of boxes: (*,
-          (xmin, ymin, xmax, ymax
+        - an array of boxes: (*, 4), where boxes have this shape:
+          (xmin, ymin, xmax, ymax)
 
         - a list of BoundingBox
 
     Returns:
     -------
-        a (1,
+        a (1, 4) array (enclosing boxarray), or a BoundingBox
     """
     if isinstance(bboxes, np.ndarray):
-        xmin, ymin, xmax, ymax
-        return np.array([xmin.min(), ymin.min(), xmax.max(), ymax.max()
+        xmin, ymin, xmax, ymax = np.split(bboxes, 4, axis=1)
+        return np.array([xmin.min(), ymin.min(), xmax.max(), ymax.max()])
     else:
         x, y = zip(*[point for box in bboxes for point in box])
         return (min(x), min(y)), (max(x), max(y))
@@ -88,21 +111,21 @@ def resolve_enclosing_rbbox(rbboxes: List[np.ndarray], intermed_size: int = 1024
     ----
         rbboxes: boxes in one of the following formats:
 
-        - an array of boxes: (*,
-          (
+        - an array of boxes: (*, 4, 2), where boxes have this shape:
+          (x1, y1), (x2, y2), (x3, y3), (x4, y4)
 
         - a list of BoundingBox
         intermed_size: size of the intermediate image
 
     Returns:
     -------
-        a (
+        a (4, 2) array (enclosing rotated box)
     """
     cloud: np.ndarray = np.concatenate(rbboxes, axis=0)
     # Convert to absolute for minAreaRect
     cloud *= intermed_size
     rect = cv2.minAreaRect(cloud.astype(np.int32))
-    return cv2.boxPoints(rect) / intermed_size  # type: ignore[
+    return cv2.boxPoints(rect) / intermed_size  # type: ignore[return-value]
 
 
 def rotate_abs_points(points: np.ndarray, angle: float = 0.0) -> np.ndarray:
@@ -232,7 +255,7 @@ def rotate_boxes(
 
     Args:
     ----
-        loc_preds: (N,
+        loc_preds: (N, 4) or (N, 4, 2) array of RELATIVE boxes
         angle: angle between -90 and +90 degrees
         orig_shape: shape of the origin image
         min_angle: minimum angle to rotate boxes
@@ -320,7 +343,7 @@ def rotate_image(
         # Pad height
         else:
             h_pad, w_pad = int(rot_img.shape[1] * image.shape[0] / image.shape[1] - rot_img.shape[0]), 0
-        rot_img = np.pad(rot_img, ((h_pad // 2, h_pad - h_pad // 2), (w_pad // 2, w_pad - w_pad // 2), (0, 0)))
+        rot_img = np.pad(rot_img, ((h_pad // 2, h_pad - h_pad // 2), (w_pad // 2, w_pad - w_pad // 2), (0, 0)))  # type: ignore[assignment]
         if preserve_origin_shape:
             # rescale
             rot_img = cv2.resize(rot_img, image.shape[:-1][::-1], interpolation=cv2.INTER_LINEAR)
@@ -453,4 +476,4 @@ def extract_rcrops(
         )
         for idx in range(_boxes.shape[0])
     ]
-    return crops
+    return crops  # type: ignore[return-value]
doctr/utils/metrics.py CHANGED

@@ -5,16 +5,14 @@
 
 from typing import Dict, List, Optional, Tuple
 
-import cv2
 import numpy as np
+from anyascii import anyascii
 from scipy.optimize import linear_sum_assignment
-from
+from shapely.geometry import Polygon
 
 __all__ = [
     "TextMatch",
     "box_iou",
-    "box_ioa",
-    "mask_iou",
     "polygon_iou",
     "nms",
     "LocalizationConfusion",
@@ -34,16 +32,16 @@ def string_match(word1: str, word2: str) -> Tuple[bool, bool, bool, bool]:
     Returns:
     -------
         a tuple with booleans specifying respectively whether the raw strings, their lower-case counterparts, their
-
+        anyascii counterparts and their lower-case anyascii counterparts match
     """
     raw_match = word1 == word2
     caseless_match = word1.lower() == word2.lower()
-
+    anyascii_match = anyascii(word1) == anyascii(word2)
 
     # Warning: the order is important here otherwise the pair ("EUR", "€") cannot be matched
-    unicase_match =
+    unicase_match = anyascii(word1).lower() == anyascii(word2).lower()
 
-    return raw_match, caseless_match,
+    return raw_match, caseless_match, anyascii_match, unicase_match
 
 
 class TextMatch:
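`string_match` now transliterates with anyascii before comparing. A short sketch of why the ordering comment above matters (assumes the anyascii package is installed):

```python
from anyascii import anyascii

raw_match = "EUR" == "€"                       # False
caseless_match = "EUR".lower() == "€".lower()  # False: "€" has no lower-case form
# Transliterate first, then lowercase: per the comment kept in the diff above,
# this ordering is what makes the pair ("EUR", "€") matchable.
unicase_match = anyascii("EUR").lower() == anyascii("€").lower()
print(raw_match, caseless_match, unicase_match)
```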
@@ -94,10 +92,10 @@ class TextMatch:
             raise AssertionError("prediction size does not match with ground-truth labels size")
 
         for gt_word, pred_word in zip(gt, pred):
-            _raw, _caseless,
+            _raw, _caseless, _anyascii, _unicase = string_match(gt_word, pred_word)
             self.raw += int(_raw)
             self.caseless += int(_caseless)
-            self.
+            self.anyascii += int(_anyascii)
             self.unicase += int(_unicase)
 
         self.total += len(gt)
@@ -107,8 +105,8 @@ class TextMatch:
 
         Returns
         -------
-            a dictionary with the exact match score for the raw data, its lower-case counterpart, its
-            counterpart and its lower-case
+            a dictionary with the exact match score for the raw data, its lower-case counterpart, its anyascii
+            counterpart and its lower-case anyascii counterpart
         """
         if self.total == 0:
             raise AssertionError("you need to update the metric before getting the summary")
@@ -116,14 +114,14 @@ class TextMatch:
         return dict(
             raw=self.raw / self.total,
             caseless=self.caseless / self.total,
-
+            anyascii=self.anyascii / self.total,
             unicase=self.unicase / self.total,
         )
 
     def reset(self) -> None:
         self.raw = 0
         self.caseless = 0
-        self.
+        self.anyascii = 0
         self.unicase = 0
         self.total = 0
 
@@ -158,66 +156,7 @@ def box_iou(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray:
     return iou_mat
 
 
-def box_ioa(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray:
-    """Computes the IoA (intersection over area) between two sets of bounding boxes:
-    ioa(i, j) = inter(i, j) / area(i)
-
-    Args:
-    ----
-        boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax)
-        boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax)
-
-    Returns:
-    -------
-        the IoA matrix of shape (N, M)
-    """
-    ioa_mat: np.ndarray = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32)
-
-    if boxes_1.shape[0] > 0 and boxes_2.shape[0] > 0:
-        l1, t1, r1, b1 = np.split(boxes_1, 4, axis=1)
-        l2, t2, r2, b2 = np.split(boxes_2, 4, axis=1)
-
-        left = np.maximum(l1, l2.T)
-        top = np.maximum(t1, t2.T)
-        right = np.minimum(r1, r2.T)
-        bot = np.minimum(b1, b2.T)
-
-        intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf)
-        area = (r1 - l1) * (b1 - t1)
-        ioa_mat = intersection / area
-
-    return ioa_mat
-
-
-def mask_iou(masks_1: np.ndarray, masks_2: np.ndarray) -> np.ndarray:
-    """Computes the IoU between two sets of boolean masks
-
-    Args:
-    ----
-        masks_1: boolean masks of shape (N, H, W)
-        masks_2: boolean masks of shape (M, H, W)
-
-    Returns:
-    -------
-        the IoU matrix of shape (N, M)
-    """
-    if masks_1.shape[1:] != masks_2.shape[1:]:
-        raise AssertionError("both boolean masks should have the same spatial shape")
-
-    iou_mat: np.ndarray = np.zeros((masks_1.shape[0], masks_2.shape[0]), dtype=np.float32)
-
-    if masks_1.shape[0] > 0 and masks_2.shape[0] > 0:
-        axes = tuple(range(2, masks_1.ndim + 1))
-        intersection = np.logical_and(masks_1[:, None, ...], masks_2[None, ...]).sum(axis=axes)
-        union = np.logical_or(masks_1[:, None, ...], masks_2[None, ...]).sum(axis=axes)
-        iou_mat = intersection / union
-
-    return iou_mat
-
-
-def polygon_iou(
-    polys_1: np.ndarray, polys_2: np.ndarray, mask_shape: Tuple[int, int], use_broadcasting: bool = False
-) -> np.ndarray:
+def polygon_iou(polys_1: np.ndarray, polys_2: np.ndarray) -> np.ndarray:
     """Computes the IoU between two sets of rotated bounding boxes
 
     Args:
@@ -234,80 +173,18 @@ def polygon_iou(
     if polys_1.ndim != 3 or polys_2.ndim != 3:
         raise AssertionError("expects boxes to be in format (N, 4, 2)")
 
-    iou_mat
-
-    if polys_1.shape[0] > 0 and polys_2.shape[0] > 0:
-        if use_broadcasting:
-            masks_1 = rbox_to_mask(polys_1, shape=mask_shape)
-            masks_2 = rbox_to_mask(polys_2, shape=mask_shape)
-            iou_mat = mask_iou(masks_1, masks_2)
-        else:
-            # Save memory by doing the computation for each pair
-            for idx, b1 in enumerate(polys_1):
-                m1 = _rbox_to_mask(b1, mask_shape)
-                for _idx, b2 in enumerate(polys_2):
-                    m2 = _rbox_to_mask(b2, mask_shape)
-                    iou_mat[idx, _idx] = np.logical_and(m1, m2).sum() / np.logical_or(m1, m2).sum()
-
-    return iou_mat
+    iou_mat = np.zeros((polys_1.shape[0], polys_2.shape[0]), dtype=np.float32)
 
+    shapely_polys_1 = [Polygon(poly) for poly in polys_1]
+    shapely_polys_2 = [Polygon(poly) for poly in polys_2]
 
-
-
+    for i, poly1 in enumerate(shapely_polys_1):
+        for j, poly2 in enumerate(shapely_polys_2):
+            intersection_area = poly1.intersection(poly2).area
+            union_area = poly1.area + poly2.area - intersection_area
+            iou_mat[i, j] = intersection_area / union_area
 
-
-    ----
-        box: rotated bounding box of shape (4, 2)
-        shape: spatial shapes of the output masks
-
-    Returns:
-    -------
-        the boolean mask of the specified shape
-    """
-    mask: np.ndarray = np.zeros(shape, dtype=np.uint8)
-    # Get absolute coords
-    if not np.issubdtype(box.dtype, np.integer):
-        abs_box = box.copy()
-        abs_box[:, 0] = abs_box[:, 0] * shape[1]
-        abs_box[:, 1] = abs_box[:, 1] * shape[0]
-        abs_box = abs_box.round().astype(int)
-    else:
-        abs_box = box
-        abs_box[2:] = abs_box[2:] + 1
-    cv2.fillPoly(mask, [abs_box - 1], 1.0)  # type: ignore[call-overload]
-
-    return mask.astype(bool)
-
-
-def rbox_to_mask(boxes: np.ndarray, shape: Tuple[int, int]) -> np.ndarray:
-    """Converts rotated bounding boxes to boolean masks
-
-    Args:
-    ----
-        boxes: rotated bounding boxes of shape (N, 4, 2)
-        shape: spatial shapes of the output masks
-
-    Returns:
-    -------
-        the boolean masks of shape (N, H, W)
-    """
-    masks: np.ndarray = np.zeros((boxes.shape[0], *shape), dtype=np.uint8)
-
-    if boxes.shape[0] > 0:
-        # Get absolute coordinates
-        if not np.issubdtype(boxes.dtype, np.integer):
-            abs_boxes = boxes.copy()
-            abs_boxes[:, :, 0] = abs_boxes[:, :, 0] * shape[1]
-            abs_boxes[:, :, 1] = abs_boxes[:, :, 1] * shape[0]
-            abs_boxes = abs_boxes.round().astype(int)
-        else:
-            abs_boxes = boxes
-            abs_boxes[:, 2:] = abs_boxes[:, 2:] + 1
-
-        # TODO: optimize slicing to improve vectorization
-        for idx, _box in enumerate(abs_boxes):
-            cv2.fillPoly(masks[idx], [_box - 1], 1.0)  # type: ignore[call-overload]
-    return masks.astype(bool)
+    return iou_mat
 
 
 def nms(boxes: np.ndarray, thresh: float = 0.5) -> List[int]:
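A sketch of the rewritten `polygon_iou` (Shapely-based, with the `mask_shape` / `use_broadcasting` arguments gone), assuming doctr 0.9.0: two unit squares offset by half a unit share 0.5 of area out of a union of 1.5.

```python
import numpy as np

from doctr.utils.metrics import polygon_iou

square = np.array([[[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0]]])  # (1, 4, 2)
shifted = square + np.array([0.5, 0.0])  # overlap area 0.5, union 1.5

print(polygon_iou(square, shifted))  # ~[[0.3333]]
```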
@@ -386,21 +263,15 @@ class LocalizationConfusion:
     ----
         iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
         use_polygons: if set to True, predictions and targets will be expected to have rotated format
-        mask_shape: if use_polygons is True, describes the spatial shape of the image used
-        use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory
     """
 
     def __init__(
         self,
         iou_thresh: float = 0.5,
         use_polygons: bool = False,
-        mask_shape: Tuple[int, int] = (1024, 1024),
-        use_broadcasting: bool = True,
     ) -> None:
         self.iou_thresh = iou_thresh
         self.use_polygons = use_polygons
-        self.mask_shape = mask_shape
-        self.use_broadcasting = use_broadcasting
         self.reset()
 
     def update(self, gts: np.ndarray, preds: np.ndarray) -> None:
@@ -414,7 +285,7 @@ class LocalizationConfusion:
         if preds.shape[0] > 0:
             # Compute IoU
             if self.use_polygons:
-                iou_mat = polygon_iou(gts, preds
+                iou_mat = polygon_iou(gts, preds)
             else:
                 iou_mat = box_iou(gts, preds)
             self.tot_iou += float(iou_mat.max(axis=0).sum())
@@ -441,7 +312,7 @@ class LocalizationConfusion:
         precision = self.matches / self.num_preds if self.num_preds > 0 else None
 
         # mean IoU
-        mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
+        mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
 
         return recall, precision, mean_iou
 
@@ -492,21 +363,15 @@ class OCRMetric:
     ----
         iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
         use_polygons: if set to True, predictions and targets will be expected to have rotated format
-        mask_shape: if use_polygons is True, describes the spatial shape of the image used
-        use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory
     """
 
     def __init__(
         self,
         iou_thresh: float = 0.5,
         use_polygons: bool = False,
-        mask_shape: Tuple[int, int] = (1024, 1024),
-        use_broadcasting: bool = True,
     ) -> None:
         self.iou_thresh = iou_thresh
         self.use_polygons = use_polygons
-        self.mask_shape = mask_shape
-        self.use_broadcasting = use_broadcasting
         self.reset()
 
     def update(
@@ -533,7 +398,7 @@ class OCRMetric:
         # Compute IoU
         if pred_boxes.shape[0] > 0:
             if self.use_polygons:
-                iou_mat = polygon_iou(gt_boxes, pred_boxes
+                iou_mat = polygon_iou(gt_boxes, pred_boxes)
             else:
                 iou_mat = box_iou(gt_boxes, pred_boxes)
 
@@ -544,10 +409,10 @@ class OCRMetric:
             is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh
             # String comparison
             for gt_idx, pred_idx in zip(gt_indices[is_kept], pred_indices[is_kept]):
-                _raw, _caseless,
+                _raw, _caseless, _anyascii, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
                 self.raw_matches += int(_raw)
                 self.caseless_matches += int(_caseless)
-                self.
+                self.anyascii_matches += int(_anyascii)
                 self.unicase_matches += int(_unicase)
 
         self.num_gts += gt_boxes.shape[0]
@@ -564,7 +429,7 @@ class OCRMetric:
         recall = dict(
             raw=self.raw_matches / self.num_gts if self.num_gts > 0 else None,
             caseless=self.caseless_matches / self.num_gts if self.num_gts > 0 else None,
-
+            anyascii=self.anyascii_matches / self.num_gts if self.num_gts > 0 else None,
             unicase=self.unicase_matches / self.num_gts if self.num_gts > 0 else None,
         )
 
@@ -572,12 +437,12 @@ class OCRMetric:
         precision = dict(
             raw=self.raw_matches / self.num_preds if self.num_preds > 0 else None,
             caseless=self.caseless_matches / self.num_preds if self.num_preds > 0 else None,
-
+            anyascii=self.anyascii_matches / self.num_preds if self.num_preds > 0 else None,
            unicase=self.unicase_matches / self.num_preds if self.num_preds > 0 else None,
         )
 
         # mean IoU (overall detected boxes)
-        mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
+        mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
 
         return recall, precision, mean_iou
 
@@ -587,7 +452,7 @@ class OCRMetric:
         self.tot_iou = 0.0
         self.raw_matches = 0
         self.caseless_matches = 0
-        self.
+        self.anyascii_matches = 0
         self.unicase_matches = 0
 
 
@@ -631,21 +496,15 @@ class DetectionMetric:
     ----
         iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
         use_polygons: if set to True, predictions and targets will be expected to have rotated format
-        mask_shape: if use_polygons is True, describes the spatial shape of the image used
-        use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory
     """
 
     def __init__(
         self,
         iou_thresh: float = 0.5,
         use_polygons: bool = False,
-        mask_shape: Tuple[int, int] = (1024, 1024),
-        use_broadcasting: bool = True,
     ) -> None:
         self.iou_thresh = iou_thresh
         self.use_polygons = use_polygons
-        self.mask_shape = mask_shape
-        self.use_broadcasting = use_broadcasting
         self.reset()
 
     def update(
@@ -672,7 +531,7 @@ class DetectionMetric:
         # Compute IoU
         if pred_boxes.shape[0] > 0:
             if self.use_polygons:
-                iou_mat = polygon_iou(gt_boxes, pred_boxes
+                iou_mat = polygon_iou(gt_boxes, pred_boxes)
             else:
                 iou_mat = box_iou(gt_boxes, pred_boxes)
 
@@ -701,7 +560,7 @@ class DetectionMetric:
         precision = self.num_matches / self.num_preds if self.num_preds > 0 else None
 
         # mean IoU (overall detected boxes)
-        mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
+        mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
 
         return recall, precision, mean_iou
 
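Finally, a hedged end-to-end sketch of the slimmed-down metric API (the `mask_shape` / `use_broadcasting` parameters are gone, and the summary now rounds the mean IoU to two decimals); the `summary()` method name is assumed from the surrounding context of the hunks above:

```python
import numpy as np

from doctr.utils.metrics import LocalizationConfusion

metric = LocalizationConfusion(iou_thresh=0.5)  # no mask_shape / use_broadcasting anymore

gts = np.array([[0.1, 0.1, 0.4, 0.5]], dtype=np.float32)
preds = np.array([[0.12, 0.1, 0.4, 0.5]], dtype=np.float32)
metric.update(gts, preds)

recall, precision, mean_iou = metric.summary()  # mean_iou rounded to 2 decimals
print(recall, precision, mean_iou)
```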