python-doctr 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82):
  1. doctr/__init__.py +1 -1
  2. doctr/contrib/__init__.py +0 -0
  3. doctr/contrib/artefacts.py +131 -0
  4. doctr/contrib/base.py +105 -0
  5. doctr/datasets/datasets/pytorch.py +2 -2
  6. doctr/datasets/generator/base.py +6 -5
  7. doctr/datasets/imgur5k.py +1 -1
  8. doctr/datasets/loader.py +1 -6
  9. doctr/datasets/utils.py +2 -1
  10. doctr/datasets/vocabs.py +9 -2
  11. doctr/file_utils.py +26 -12
  12. doctr/io/elements.py +40 -6
  13. doctr/io/html.py +2 -2
  14. doctr/io/image/pytorch.py +6 -8
  15. doctr/io/image/tensorflow.py +1 -1
  16. doctr/io/pdf.py +5 -2
  17. doctr/io/reader.py +6 -0
  18. doctr/models/__init__.py +0 -1
  19. doctr/models/_utils.py +57 -20
  20. doctr/models/builder.py +71 -13
  21. doctr/models/classification/mobilenet/pytorch.py +45 -9
  22. doctr/models/classification/mobilenet/tensorflow.py +38 -7
  23. doctr/models/classification/predictor/pytorch.py +18 -11
  24. doctr/models/classification/predictor/tensorflow.py +16 -10
  25. doctr/models/classification/textnet/pytorch.py +3 -3
  26. doctr/models/classification/textnet/tensorflow.py +3 -3
  27. doctr/models/classification/zoo.py +39 -15
  28. doctr/models/detection/_utils/__init__.py +1 -0
  29. doctr/models/detection/_utils/base.py +66 -0
  30. doctr/models/detection/differentiable_binarization/base.py +4 -3
  31. doctr/models/detection/differentiable_binarization/pytorch.py +2 -2
  32. doctr/models/detection/fast/base.py +6 -5
  33. doctr/models/detection/fast/pytorch.py +4 -4
  34. doctr/models/detection/fast/tensorflow.py +4 -4
  35. doctr/models/detection/linknet/base.py +4 -3
  36. doctr/models/detection/predictor/pytorch.py +15 -1
  37. doctr/models/detection/predictor/tensorflow.py +15 -1
  38. doctr/models/detection/zoo.py +7 -2
  39. doctr/models/factory/hub.py +3 -12
  40. doctr/models/kie_predictor/base.py +9 -3
  41. doctr/models/kie_predictor/pytorch.py +41 -20
  42. doctr/models/kie_predictor/tensorflow.py +36 -16
  43. doctr/models/modules/layers/pytorch.py +2 -3
  44. doctr/models/modules/layers/tensorflow.py +6 -8
  45. doctr/models/modules/transformer/pytorch.py +2 -2
  46. doctr/models/predictor/base.py +77 -50
  47. doctr/models/predictor/pytorch.py +31 -20
  48. doctr/models/predictor/tensorflow.py +27 -17
  49. doctr/models/preprocessor/pytorch.py +4 -4
  50. doctr/models/preprocessor/tensorflow.py +3 -2
  51. doctr/models/recognition/master/pytorch.py +2 -2
  52. doctr/models/recognition/parseq/pytorch.py +4 -3
  53. doctr/models/recognition/parseq/tensorflow.py +4 -3
  54. doctr/models/recognition/sar/pytorch.py +7 -6
  55. doctr/models/recognition/sar/tensorflow.py +3 -9
  56. doctr/models/recognition/vitstr/pytorch.py +1 -1
  57. doctr/models/recognition/zoo.py +1 -1
  58. doctr/models/zoo.py +2 -2
  59. doctr/py.typed +0 -0
  60. doctr/transforms/functional/base.py +1 -1
  61. doctr/transforms/functional/pytorch.py +4 -4
  62. doctr/transforms/modules/base.py +37 -15
  63. doctr/transforms/modules/pytorch.py +66 -8
  64. doctr/transforms/modules/tensorflow.py +63 -7
  65. doctr/utils/fonts.py +7 -5
  66. doctr/utils/geometry.py +35 -12
  67. doctr/utils/metrics.py +33 -174
  68. doctr/utils/reconstitution.py +126 -0
  69. doctr/utils/visualization.py +5 -118
  70. doctr/version.py +1 -1
  71. {python_doctr-0.8.1.dist-info → python_doctr-0.9.0.dist-info}/METADATA +84 -80
  72. {python_doctr-0.8.1.dist-info → python_doctr-0.9.0.dist-info}/RECORD +76 -76
  73. {python_doctr-0.8.1.dist-info → python_doctr-0.9.0.dist-info}/WHEEL +1 -1
  74. doctr/models/artefacts/__init__.py +0 -2
  75. doctr/models/artefacts/barcode.py +0 -74
  76. doctr/models/artefacts/face.py +0 -63
  77. doctr/models/obj_detection/__init__.py +0 -1
  78. doctr/models/obj_detection/faster_rcnn/__init__.py +0 -4
  79. doctr/models/obj_detection/faster_rcnn/pytorch.py +0 -81
  80. {python_doctr-0.8.1.dist-info → python_doctr-0.9.0.dist-info}/LICENSE +0 -0
  81. {python_doctr-0.8.1.dist-info → python_doctr-0.9.0.dist-info}/top_level.txt +0 -0
  82. {python_doctr-0.8.1.dist-info → python_doctr-0.9.0.dist-info}/zip-safe +0 -0
@@ -4,7 +4,7 @@
4
4
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
5
 
6
6
  import random
7
- from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
7
+ from typing import Any, Callable, Iterable, List, Optional, Tuple, Union
8
8
 
9
9
  import numpy as np
10
10
  import tensorflow as tf
@@ -30,6 +30,7 @@ __all__ = [
30
30
  "GaussianNoise",
31
31
  "RandomHorizontalFlip",
32
32
  "RandomShadow",
33
+ "RandomResize",
33
34
  ]
34
35
 
35
36
 
@@ -457,10 +458,7 @@ class RandomHorizontalFlip(NestedObject):
457
458
  >>> from doctr.transforms import RandomHorizontalFlip
458
459
  >>> transfo = RandomHorizontalFlip(p=0.5)
459
460
  >>> image = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)
460
- >>> target = {
461
- >>> "boxes": np.array([[0.1, 0.1, 0.4, 0.5] ], dtype= np.float32),
462
- >>> "labels": np.ones(1, dtype= np.int64)
463
- >>> }
461
+ >>> target = np.array([[0.1, 0.1, 0.4, 0.5] ], dtype= np.float32)
464
462
  >>> out = transfo(image, target)
465
463
 
466
464
  Args:
@@ -472,12 +470,15 @@ class RandomHorizontalFlip(NestedObject):
472
470
  super().__init__()
473
471
  self.p = p
474
472
 
475
- def __call__(self, img: Union[tf.Tensor, np.ndarray], target: Dict[str, Any]) -> Tuple[tf.Tensor, Dict[str, Any]]:
473
+ def __call__(self, img: Union[tf.Tensor, np.ndarray], target: np.ndarray) -> Tuple[tf.Tensor, np.ndarray]:
476
474
  if np.random.rand(1) <= self.p:
477
475
  _img = tf.image.flip_left_right(img)
478
476
  _target = target.copy()
479
477
  # Changing the relative bbox coordinates
480
- _target["boxes"][:, ::2] = 1 - target["boxes"][:, [2, 0]]
478
+ if target.shape[1:] == (4,):
479
+ _target[:, ::2] = 1 - target[:, [2, 0]]
480
+ else:
481
+ _target[..., 0] = 1 - target[..., 0]
481
482
  return _img, _target
482
483
  return img, target
483
484
 
@@ -515,3 +516,58 @@ class RandomShadow(NestedObject):
515
516
 
516
517
  def extra_repr(self) -> str:
517
518
  return f"opacity_range={self.opacity_range}"
519
+
520
+
521
+ class RandomResize(NestedObject):
522
+ """Randomly resize the input image and align corresponding targets
523
+
524
+ >>> import tensorflow as tf
525
+ >>> from doctr.transforms import RandomResize
526
+ >>> transfo = RandomResize((0.3, 0.9), preserve_aspect_ratio=True, symmetric_pad=True, p=0.5)
527
+ >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
528
+
529
+ Args:
530
+ ----
531
+ scale_range: range of the resizing factor for width and height (independently)
532
+ preserve_aspect_ratio: whether to preserve the aspect ratio of the image,
533
+ given a float value, the aspect ratio will be preserved with this probability
534
+ symmetric_pad: whether to symmetrically pad the image,
535
+ given a float value, the symmetric padding will be applied with this probability
536
+ p: probability to apply the transformation
537
+ """
538
+
539
+ def __init__(
540
+ self,
541
+ scale_range: Tuple[float, float] = (0.3, 0.9),
542
+ preserve_aspect_ratio: Union[bool, float] = False,
543
+ symmetric_pad: Union[bool, float] = False,
544
+ p: float = 0.5,
545
+ ):
546
+ super().__init__()
547
+ self.scale_range = scale_range
548
+ self.preserve_aspect_ratio = preserve_aspect_ratio
549
+ self.symmetric_pad = symmetric_pad
550
+ self.p = p
551
+ self._resize = Resize
552
+
553
+ def __call__(self, img: tf.Tensor, target: np.ndarray) -> Tuple[tf.Tensor, np.ndarray]:
554
+ if np.random.rand(1) <= self.p:
555
+ scale_h = random.uniform(*self.scale_range)
556
+ scale_w = random.uniform(*self.scale_range)
557
+ new_size = (int(img.shape[-3] * scale_h), int(img.shape[-2] * scale_w))
558
+
559
+ _img, _target = self._resize(
560
+ new_size,
561
+ preserve_aspect_ratio=self.preserve_aspect_ratio
562
+ if isinstance(self.preserve_aspect_ratio, bool)
563
+ else bool(np.random.rand(1) <= self.symmetric_pad),
564
+ symmetric_pad=self.symmetric_pad
565
+ if isinstance(self.symmetric_pad, bool)
566
+ else bool(np.random.rand(1) <= self.symmetric_pad),
567
+ )(img, target)
568
+
569
+ return _img, _target
570
+ return img, target
571
+
572
+ def extra_repr(self) -> str:
573
+ return f"scale_range={self.scale_range}, preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}, p={self.p}" # noqa: E501
doctr/utils/fonts.py CHANGED
@@ -5,14 +5,16 @@
5
5
 
6
6
  import logging
7
7
  import platform
8
- from typing import Optional
8
+ from typing import Optional, Union
9
9
 
10
10
  from PIL import ImageFont
11
11
 
12
12
  __all__ = ["get_font"]
13
13
 
14
14
 
15
- def get_font(font_family: Optional[str] = None, font_size: int = 13) -> ImageFont.ImageFont:
15
+ def get_font(
16
+ font_family: Optional[str] = None, font_size: int = 13
17
+ ) -> Union[ImageFont.FreeTypeFont, ImageFont.ImageFont]:
16
18
  """Resolves a compatible ImageFont for the system
17
19
 
18
20
  Args:
@@ -28,14 +30,14 @@ def get_font(font_family: Optional[str] = None, font_size: int = 13) -> ImageFon
28
30
  if font_family is None:
29
31
  try:
30
32
  font = ImageFont.truetype("FreeMono.ttf" if platform.system() == "Linux" else "Arial.ttf", font_size)
31
- except OSError:
32
- font = ImageFont.load_default()
33
+ except OSError: # pragma: no cover
34
+ font = ImageFont.load_default() # type: ignore[assignment]
33
35
  logging.warning(
34
36
  "unable to load recommended font family. Loading default PIL font,"
35
37
  "font size issues may be expected."
36
38
  "To prevent this, it is recommended to specify the value of 'font_family'."
37
39
  )
38
- else:
40
+ else: # pragma: no cover
39
41
  font = ImageFont.truetype(font_family, font_size)
40
42
 
41
43
  return font
doctr/utils/geometry.py CHANGED
@@ -25,6 +25,7 @@ __all__ = [
25
25
  "rotate_abs_geoms",
26
26
  "extract_crops",
27
27
  "extract_rcrops",
28
+ "detach_scores",
28
29
  ]
29
30
 
30
31
 
@@ -57,6 +58,28 @@ def polygon_to_bbox(polygon: Polygon4P) -> BoundingBox:
57
58
  return (min(x), min(y)), (max(x), max(y))
58
59
 
59
60
 
61
+ def detach_scores(boxes: List[np.ndarray]) -> Tuple[List[np.ndarray], List[np.ndarray]]:
62
+ """Detach the objectness scores from box predictions
63
+
64
+ Args:
65
+ ----
66
+ boxes: list of arrays with boxes of shape (N, 5) or (N, 5, 2)
67
+
68
+ Returns:
69
+ -------
70
+ a tuple of two lists: the first one contains the boxes without the objectness scores,
71
+ the second one contains the objectness scores
72
+ """
73
+
74
+ def _detach(boxes: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
75
+ if boxes.ndim == 2:
76
+ return boxes[:, :-1], boxes[:, -1]
77
+ return boxes[:, :-1], boxes[:, -1, -1]
78
+
79
+ loc_preds, obj_scores = zip(*(_detach(box) for box in boxes))
80
+ return list(loc_preds), list(obj_scores)
81
+
82
+
60
83
  def resolve_enclosing_bbox(bboxes: Union[List[BoundingBox], np.ndarray]) -> Union[BoundingBox, np.ndarray]:
61
84
  """Compute enclosing bbox either from:
62
85
 
@@ -64,18 +87,18 @@ def resolve_enclosing_bbox(bboxes: Union[List[BoundingBox], np.ndarray]) -> Unio
64
87
  ----
65
88
  bboxes: boxes in one of the following formats:
66
89
 
67
- - an array of boxes: (*, 5), where boxes have this shape:
68
- (xmin, ymin, xmax, ymax, score)
90
+ - an array of boxes: (*, 4), where boxes have this shape:
91
+ (xmin, ymin, xmax, ymax)
69
92
 
70
93
  - a list of BoundingBox
71
94
 
72
95
  Returns:
73
96
  -------
74
- a (1, 5) array (enclosing boxarray), or a BoundingBox
97
+ a (1, 4) array (enclosing boxarray), or a BoundingBox
75
98
  """
76
99
  if isinstance(bboxes, np.ndarray):
77
- xmin, ymin, xmax, ymax, score = np.split(bboxes, 5, axis=1)
78
- return np.array([xmin.min(), ymin.min(), xmax.max(), ymax.max(), score.mean()])
100
+ xmin, ymin, xmax, ymax = np.split(bboxes, 4, axis=1)
101
+ return np.array([xmin.min(), ymin.min(), xmax.max(), ymax.max()])
79
102
  else:
80
103
  x, y = zip(*[point for box in bboxes for point in box])
81
104
  return (min(x), min(y)), (max(x), max(y))
@@ -88,21 +111,21 @@ def resolve_enclosing_rbbox(rbboxes: List[np.ndarray], intermed_size: int = 1024
88
111
  ----
89
112
  rbboxes: boxes in one of the following formats:
90
113
 
91
- - an array of boxes: (*, 5), where boxes have this shape:
92
- (xmin, ymin, xmax, ymax, score)
114
+ - an array of boxes: (*, 4, 2), where boxes have this shape:
115
+ (x1, y1), (x2, y2), (x3, y3), (x4, y4)
93
116
 
94
117
  - a list of BoundingBox
95
118
  intermed_size: size of the intermediate image
96
119
 
97
120
  Returns:
98
121
  -------
99
- a (1, 5) array (enclosing boxarray), or a BoundingBox
122
+ a (4, 2) array (enclosing rotated box)
100
123
  """
101
124
  cloud: np.ndarray = np.concatenate(rbboxes, axis=0)
102
125
  # Convert to absolute for minAreaRect
103
126
  cloud *= intermed_size
104
127
  rect = cv2.minAreaRect(cloud.astype(np.int32))
105
- return cv2.boxPoints(rect) / intermed_size # type: ignore[operator]
128
+ return cv2.boxPoints(rect) / intermed_size # type: ignore[return-value]
106
129
 
107
130
 
108
131
  def rotate_abs_points(points: np.ndarray, angle: float = 0.0) -> np.ndarray:
@@ -232,7 +255,7 @@ def rotate_boxes(
232
255
 
233
256
  Args:
234
257
  ----
235
- loc_preds: (N, 5) or (N, 4, 2) array of RELATIVE boxes
258
+ loc_preds: (N, 4) or (N, 4, 2) array of RELATIVE boxes
236
259
  angle: angle between -90 and +90 degrees
237
260
  orig_shape: shape of the origin image
238
261
  min_angle: minimum angle to rotate boxes
@@ -320,7 +343,7 @@ def rotate_image(
320
343
  # Pad height
321
344
  else:
322
345
  h_pad, w_pad = int(rot_img.shape[1] * image.shape[0] / image.shape[1] - rot_img.shape[0]), 0
323
- rot_img = np.pad(rot_img, ((h_pad // 2, h_pad - h_pad // 2), (w_pad // 2, w_pad - w_pad // 2), (0, 0)))
346
+ rot_img = np.pad(rot_img, ((h_pad // 2, h_pad - h_pad // 2), (w_pad // 2, w_pad - w_pad // 2), (0, 0))) # type: ignore[assignment]
324
347
  if preserve_origin_shape:
325
348
  # rescale
326
349
  rot_img = cv2.resize(rot_img, image.shape[:-1][::-1], interpolation=cv2.INTER_LINEAR)
@@ -453,4 +476,4 @@ def extract_rcrops(
453
476
  )
454
477
  for idx in range(_boxes.shape[0])
455
478
  ]
456
- return crops
479
+ return crops # type: ignore[return-value]
doctr/utils/metrics.py CHANGED
@@ -5,16 +5,14 @@
5
5
 
6
6
  from typing import Dict, List, Optional, Tuple
7
7
 
8
- import cv2
9
8
  import numpy as np
9
+ from anyascii import anyascii
10
10
  from scipy.optimize import linear_sum_assignment
11
- from unidecode import unidecode
11
+ from shapely.geometry import Polygon
12
12
 
13
13
  __all__ = [
14
14
  "TextMatch",
15
15
  "box_iou",
16
- "box_ioa",
17
- "mask_iou",
18
16
  "polygon_iou",
19
17
  "nms",
20
18
  "LocalizationConfusion",
@@ -34,16 +32,16 @@ def string_match(word1: str, word2: str) -> Tuple[bool, bool, bool, bool]:
34
32
  Returns:
35
33
  -------
36
34
  a tuple with booleans specifying respectively whether the raw strings, their lower-case counterparts, their
37
- unidecode counterparts and their lower-case unidecode counterparts match
35
+ anyascii counterparts and their lower-case anyascii counterparts match
38
36
  """
39
37
  raw_match = word1 == word2
40
38
  caseless_match = word1.lower() == word2.lower()
41
- unidecode_match = unidecode(word1) == unidecode(word2)
39
+ anyascii_match = anyascii(word1) == anyascii(word2)
42
40
 
43
41
  # Warning: the order is important here otherwise the pair ("EUR", "€") cannot be matched
44
- unicase_match = unidecode(word1).lower() == unidecode(word2).lower()
42
+ unicase_match = anyascii(word1).lower() == anyascii(word2).lower()
45
43
 
46
- return raw_match, caseless_match, unidecode_match, unicase_match
44
+ return raw_match, caseless_match, anyascii_match, unicase_match
47
45
 
48
46
 
49
47
  class TextMatch:
@@ -94,10 +92,10 @@ class TextMatch:
94
92
  raise AssertionError("prediction size does not match with ground-truth labels size")
95
93
 
96
94
  for gt_word, pred_word in zip(gt, pred):
97
- _raw, _caseless, _unidecode, _unicase = string_match(gt_word, pred_word)
95
+ _raw, _caseless, _anyascii, _unicase = string_match(gt_word, pred_word)
98
96
  self.raw += int(_raw)
99
97
  self.caseless += int(_caseless)
100
- self.unidecode += int(_unidecode)
98
+ self.anyascii += int(_anyascii)
101
99
  self.unicase += int(_unicase)
102
100
 
103
101
  self.total += len(gt)
@@ -107,8 +105,8 @@ class TextMatch:
107
105
 
108
106
  Returns
109
107
  -------
110
- a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode
111
- counterpart and its lower-case unidecode counterpart
108
+ a dictionary with the exact match score for the raw data, its lower-case counterpart, its anyascii
109
+ counterpart and its lower-case anyascii counterpart
112
110
  """
113
111
  if self.total == 0:
114
112
  raise AssertionError("you need to update the metric before getting the summary")
@@ -116,14 +114,14 @@ class TextMatch:
116
114
  return dict(
117
115
  raw=self.raw / self.total,
118
116
  caseless=self.caseless / self.total,
119
- unidecode=self.unidecode / self.total,
117
+ anyascii=self.anyascii / self.total,
120
118
  unicase=self.unicase / self.total,
121
119
  )
122
120
 
123
121
  def reset(self) -> None:
124
122
  self.raw = 0
125
123
  self.caseless = 0
126
- self.unidecode = 0
124
+ self.anyascii = 0
127
125
  self.unicase = 0
128
126
  self.total = 0
129
127
 
@@ -158,66 +156,7 @@ def box_iou(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray:
158
156
  return iou_mat
159
157
 
160
158
 
161
- def box_ioa(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray:
162
- """Computes the IoA (intersection over area) between two sets of bounding boxes:
163
- ioa(i, j) = inter(i, j) / area(i)
164
-
165
- Args:
166
- ----
167
- boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax)
168
- boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax)
169
-
170
- Returns:
171
- -------
172
- the IoA matrix of shape (N, M)
173
- """
174
- ioa_mat: np.ndarray = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32)
175
-
176
- if boxes_1.shape[0] > 0 and boxes_2.shape[0] > 0:
177
- l1, t1, r1, b1 = np.split(boxes_1, 4, axis=1)
178
- l2, t2, r2, b2 = np.split(boxes_2, 4, axis=1)
179
-
180
- left = np.maximum(l1, l2.T)
181
- top = np.maximum(t1, t2.T)
182
- right = np.minimum(r1, r2.T)
183
- bot = np.minimum(b1, b2.T)
184
-
185
- intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf)
186
- area = (r1 - l1) * (b1 - t1)
187
- ioa_mat = intersection / area
188
-
189
- return ioa_mat
190
-
191
-
192
- def mask_iou(masks_1: np.ndarray, masks_2: np.ndarray) -> np.ndarray:
193
- """Computes the IoU between two sets of boolean masks
194
-
195
- Args:
196
- ----
197
- masks_1: boolean masks of shape (N, H, W)
198
- masks_2: boolean masks of shape (M, H, W)
199
-
200
- Returns:
201
- -------
202
- the IoU matrix of shape (N, M)
203
- """
204
- if masks_1.shape[1:] != masks_2.shape[1:]:
205
- raise AssertionError("both boolean masks should have the same spatial shape")
206
-
207
- iou_mat: np.ndarray = np.zeros((masks_1.shape[0], masks_2.shape[0]), dtype=np.float32)
208
-
209
- if masks_1.shape[0] > 0 and masks_2.shape[0] > 0:
210
- axes = tuple(range(2, masks_1.ndim + 1))
211
- intersection = np.logical_and(masks_1[:, None, ...], masks_2[None, ...]).sum(axis=axes)
212
- union = np.logical_or(masks_1[:, None, ...], masks_2[None, ...]).sum(axis=axes)
213
- iou_mat = intersection / union
214
-
215
- return iou_mat
216
-
217
-
218
- def polygon_iou(
219
- polys_1: np.ndarray, polys_2: np.ndarray, mask_shape: Tuple[int, int], use_broadcasting: bool = False
220
- ) -> np.ndarray:
159
+ def polygon_iou(polys_1: np.ndarray, polys_2: np.ndarray) -> np.ndarray:
221
160
  """Computes the IoU between two sets of rotated bounding boxes
222
161
 
223
162
  Args:
@@ -234,80 +173,18 @@ def polygon_iou(
234
173
  if polys_1.ndim != 3 or polys_2.ndim != 3:
235
174
  raise AssertionError("expects boxes to be in format (N, 4, 2)")
236
175
 
237
- iou_mat: np.ndarray = np.zeros((polys_1.shape[0], polys_2.shape[0]), dtype=np.float32)
238
-
239
- if polys_1.shape[0] > 0 and polys_2.shape[0] > 0:
240
- if use_broadcasting:
241
- masks_1 = rbox_to_mask(polys_1, shape=mask_shape)
242
- masks_2 = rbox_to_mask(polys_2, shape=mask_shape)
243
- iou_mat = mask_iou(masks_1, masks_2)
244
- else:
245
- # Save memory by doing the computation for each pair
246
- for idx, b1 in enumerate(polys_1):
247
- m1 = _rbox_to_mask(b1, mask_shape)
248
- for _idx, b2 in enumerate(polys_2):
249
- m2 = _rbox_to_mask(b2, mask_shape)
250
- iou_mat[idx, _idx] = np.logical_and(m1, m2).sum() / np.logical_or(m1, m2).sum()
251
-
252
- return iou_mat
176
+ iou_mat = np.zeros((polys_1.shape[0], polys_2.shape[0]), dtype=np.float32)
253
177
 
178
+ shapely_polys_1 = [Polygon(poly) for poly in polys_1]
179
+ shapely_polys_2 = [Polygon(poly) for poly in polys_2]
254
180
 
255
- def _rbox_to_mask(box: np.ndarray, shape: Tuple[int, int]) -> np.ndarray:
256
- """Converts a rotated bounding box to a boolean mask
181
+ for i, poly1 in enumerate(shapely_polys_1):
182
+ for j, poly2 in enumerate(shapely_polys_2):
183
+ intersection_area = poly1.intersection(poly2).area
184
+ union_area = poly1.area + poly2.area - intersection_area
185
+ iou_mat[i, j] = intersection_area / union_area
257
186
 
258
- Args:
259
- ----
260
- box: rotated bounding box of shape (4, 2)
261
- shape: spatial shapes of the output masks
262
-
263
- Returns:
264
- -------
265
- the boolean mask of the specified shape
266
- """
267
- mask: np.ndarray = np.zeros(shape, dtype=np.uint8)
268
- # Get absolute coords
269
- if not np.issubdtype(box.dtype, np.integer):
270
- abs_box = box.copy()
271
- abs_box[:, 0] = abs_box[:, 0] * shape[1]
272
- abs_box[:, 1] = abs_box[:, 1] * shape[0]
273
- abs_box = abs_box.round().astype(int)
274
- else:
275
- abs_box = box
276
- abs_box[2:] = abs_box[2:] + 1
277
- cv2.fillPoly(mask, [abs_box - 1], 1.0) # type: ignore[call-overload]
278
-
279
- return mask.astype(bool)
280
-
281
-
282
- def rbox_to_mask(boxes: np.ndarray, shape: Tuple[int, int]) -> np.ndarray:
283
- """Converts rotated bounding boxes to boolean masks
284
-
285
- Args:
286
- ----
287
- boxes: rotated bounding boxes of shape (N, 4, 2)
288
- shape: spatial shapes of the output masks
289
-
290
- Returns:
291
- -------
292
- the boolean masks of shape (N, H, W)
293
- """
294
- masks: np.ndarray = np.zeros((boxes.shape[0], *shape), dtype=np.uint8)
295
-
296
- if boxes.shape[0] > 0:
297
- # Get absolute coordinates
298
- if not np.issubdtype(boxes.dtype, np.integer):
299
- abs_boxes = boxes.copy()
300
- abs_boxes[:, :, 0] = abs_boxes[:, :, 0] * shape[1]
301
- abs_boxes[:, :, 1] = abs_boxes[:, :, 1] * shape[0]
302
- abs_boxes = abs_boxes.round().astype(int)
303
- else:
304
- abs_boxes = boxes
305
- abs_boxes[:, 2:] = abs_boxes[:, 2:] + 1
306
-
307
- # TODO: optimize slicing to improve vectorization
308
- for idx, _box in enumerate(abs_boxes):
309
- cv2.fillPoly(masks[idx], [_box - 1], 1.0) # type: ignore[call-overload]
310
- return masks.astype(bool)
187
+ return iou_mat
311
188
 
312
189
 
313
190
  def nms(boxes: np.ndarray, thresh: float = 0.5) -> List[int]:
@@ -386,21 +263,15 @@ class LocalizationConfusion:
386
263
  ----
387
264
  iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
388
265
  use_polygons: if set to True, predictions and targets will be expected to have rotated format
389
- mask_shape: if use_polygons is True, describes the spatial shape of the image used
390
- use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory
391
266
  """
392
267
 
393
268
  def __init__(
394
269
  self,
395
270
  iou_thresh: float = 0.5,
396
271
  use_polygons: bool = False,
397
- mask_shape: Tuple[int, int] = (1024, 1024),
398
- use_broadcasting: bool = True,
399
272
  ) -> None:
400
273
  self.iou_thresh = iou_thresh
401
274
  self.use_polygons = use_polygons
402
- self.mask_shape = mask_shape
403
- self.use_broadcasting = use_broadcasting
404
275
  self.reset()
405
276
 
406
277
  def update(self, gts: np.ndarray, preds: np.ndarray) -> None:
@@ -414,7 +285,7 @@ class LocalizationConfusion:
414
285
  if preds.shape[0] > 0:
415
286
  # Compute IoU
416
287
  if self.use_polygons:
417
- iou_mat = polygon_iou(gts, preds, self.mask_shape, self.use_broadcasting)
288
+ iou_mat = polygon_iou(gts, preds)
418
289
  else:
419
290
  iou_mat = box_iou(gts, preds)
420
291
  self.tot_iou += float(iou_mat.max(axis=0).sum())
@@ -441,7 +312,7 @@ class LocalizationConfusion:
441
312
  precision = self.matches / self.num_preds if self.num_preds > 0 else None
442
313
 
443
314
  # mean IoU
444
- mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
315
+ mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
445
316
 
446
317
  return recall, precision, mean_iou
447
318
 
@@ -492,21 +363,15 @@ class OCRMetric:
492
363
  ----
493
364
  iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
494
365
  use_polygons: if set to True, predictions and targets will be expected to have rotated format
495
- mask_shape: if use_polygons is True, describes the spatial shape of the image used
496
- use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory
497
366
  """
498
367
 
499
368
  def __init__(
500
369
  self,
501
370
  iou_thresh: float = 0.5,
502
371
  use_polygons: bool = False,
503
- mask_shape: Tuple[int, int] = (1024, 1024),
504
- use_broadcasting: bool = True,
505
372
  ) -> None:
506
373
  self.iou_thresh = iou_thresh
507
374
  self.use_polygons = use_polygons
508
- self.mask_shape = mask_shape
509
- self.use_broadcasting = use_broadcasting
510
375
  self.reset()
511
376
 
512
377
  def update(
@@ -533,7 +398,7 @@ class OCRMetric:
533
398
  # Compute IoU
534
399
  if pred_boxes.shape[0] > 0:
535
400
  if self.use_polygons:
536
- iou_mat = polygon_iou(gt_boxes, pred_boxes, self.mask_shape, self.use_broadcasting)
401
+ iou_mat = polygon_iou(gt_boxes, pred_boxes)
537
402
  else:
538
403
  iou_mat = box_iou(gt_boxes, pred_boxes)
539
404
 
@@ -544,10 +409,10 @@ class OCRMetric:
544
409
  is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh
545
410
  # String comparison
546
411
  for gt_idx, pred_idx in zip(gt_indices[is_kept], pred_indices[is_kept]):
547
- _raw, _caseless, _unidecode, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
412
+ _raw, _caseless, _anyascii, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
548
413
  self.raw_matches += int(_raw)
549
414
  self.caseless_matches += int(_caseless)
550
- self.unidecode_matches += int(_unidecode)
415
+ self.anyascii_matches += int(_anyascii)
551
416
  self.unicase_matches += int(_unicase)
552
417
 
553
418
  self.num_gts += gt_boxes.shape[0]
@@ -564,7 +429,7 @@ class OCRMetric:
564
429
  recall = dict(
565
430
  raw=self.raw_matches / self.num_gts if self.num_gts > 0 else None,
566
431
  caseless=self.caseless_matches / self.num_gts if self.num_gts > 0 else None,
567
- unidecode=self.unidecode_matches / self.num_gts if self.num_gts > 0 else None,
432
+ anyascii=self.anyascii_matches / self.num_gts if self.num_gts > 0 else None,
568
433
  unicase=self.unicase_matches / self.num_gts if self.num_gts > 0 else None,
569
434
  )
570
435
 
@@ -572,12 +437,12 @@ class OCRMetric:
572
437
  precision = dict(
573
438
  raw=self.raw_matches / self.num_preds if self.num_preds > 0 else None,
574
439
  caseless=self.caseless_matches / self.num_preds if self.num_preds > 0 else None,
575
- unidecode=self.unidecode_matches / self.num_preds if self.num_preds > 0 else None,
440
+ anyascii=self.anyascii_matches / self.num_preds if self.num_preds > 0 else None,
576
441
  unicase=self.unicase_matches / self.num_preds if self.num_preds > 0 else None,
577
442
  )
578
443
 
579
444
  # mean IoU (overall detected boxes)
580
- mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
445
+ mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
581
446
 
582
447
  return recall, precision, mean_iou
583
448
 
@@ -587,7 +452,7 @@ class OCRMetric:
587
452
  self.tot_iou = 0.0
588
453
  self.raw_matches = 0
589
454
  self.caseless_matches = 0
590
- self.unidecode_matches = 0
455
+ self.anyascii_matches = 0
591
456
  self.unicase_matches = 0
592
457
 
593
458
 
@@ -631,21 +496,15 @@ class DetectionMetric:
631
496
  ----
632
497
  iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
633
498
  use_polygons: if set to True, predictions and targets will be expected to have rotated format
634
- mask_shape: if use_polygons is True, describes the spatial shape of the image used
635
- use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory
636
499
  """
637
500
 
638
501
  def __init__(
639
502
  self,
640
503
  iou_thresh: float = 0.5,
641
504
  use_polygons: bool = False,
642
- mask_shape: Tuple[int, int] = (1024, 1024),
643
- use_broadcasting: bool = True,
644
505
  ) -> None:
645
506
  self.iou_thresh = iou_thresh
646
507
  self.use_polygons = use_polygons
647
- self.mask_shape = mask_shape
648
- self.use_broadcasting = use_broadcasting
649
508
  self.reset()
650
509
 
651
510
  def update(
@@ -672,7 +531,7 @@ class DetectionMetric:
672
531
  # Compute IoU
673
532
  if pred_boxes.shape[0] > 0:
674
533
  if self.use_polygons:
675
- iou_mat = polygon_iou(gt_boxes, pred_boxes, self.mask_shape, self.use_broadcasting)
534
+ iou_mat = polygon_iou(gt_boxes, pred_boxes)
676
535
  else:
677
536
  iou_mat = box_iou(gt_boxes, pred_boxes)
678
537
 
@@ -701,7 +560,7 @@ class DetectionMetric:
701
560
  precision = self.num_matches / self.num_preds if self.num_preds > 0 else None
702
561
 
703
562
  # mean IoU (overall detected boxes)
704
- mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
563
+ mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
705
564
 
706
565
  return recall, precision, mean_iou
707
566