python-doctr 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. doctr/__init__.py +1 -1
  2. doctr/contrib/__init__.py +0 -0
  3. doctr/contrib/artefacts.py +131 -0
  4. doctr/contrib/base.py +105 -0
  5. doctr/datasets/cord.py +10 -1
  6. doctr/datasets/datasets/pytorch.py +2 -2
  7. doctr/datasets/funsd.py +11 -1
  8. doctr/datasets/generator/base.py +6 -5
  9. doctr/datasets/ic03.py +11 -1
  10. doctr/datasets/ic13.py +10 -1
  11. doctr/datasets/iiit5k.py +26 -16
  12. doctr/datasets/imgur5k.py +11 -2
  13. doctr/datasets/loader.py +1 -6
  14. doctr/datasets/sroie.py +11 -1
  15. doctr/datasets/svhn.py +11 -1
  16. doctr/datasets/svt.py +11 -1
  17. doctr/datasets/synthtext.py +11 -1
  18. doctr/datasets/utils.py +9 -3
  19. doctr/datasets/vocabs.py +15 -4
  20. doctr/datasets/wildreceipt.py +12 -1
  21. doctr/file_utils.py +45 -12
  22. doctr/io/elements.py +52 -10
  23. doctr/io/html.py +2 -2
  24. doctr/io/image/pytorch.py +6 -8
  25. doctr/io/image/tensorflow.py +1 -1
  26. doctr/io/pdf.py +5 -2
  27. doctr/io/reader.py +6 -0
  28. doctr/models/__init__.py +0 -1
  29. doctr/models/_utils.py +57 -20
  30. doctr/models/builder.py +73 -15
  31. doctr/models/classification/magc_resnet/tensorflow.py +13 -6
  32. doctr/models/classification/mobilenet/pytorch.py +47 -9
  33. doctr/models/classification/mobilenet/tensorflow.py +51 -14
  34. doctr/models/classification/predictor/pytorch.py +28 -17
  35. doctr/models/classification/predictor/tensorflow.py +26 -16
  36. doctr/models/classification/resnet/tensorflow.py +21 -8
  37. doctr/models/classification/textnet/pytorch.py +3 -3
  38. doctr/models/classification/textnet/tensorflow.py +11 -5
  39. doctr/models/classification/vgg/tensorflow.py +9 -3
  40. doctr/models/classification/vit/tensorflow.py +10 -4
  41. doctr/models/classification/zoo.py +55 -19
  42. doctr/models/detection/_utils/__init__.py +1 -0
  43. doctr/models/detection/_utils/base.py +66 -0
  44. doctr/models/detection/differentiable_binarization/base.py +4 -3
  45. doctr/models/detection/differentiable_binarization/pytorch.py +2 -2
  46. doctr/models/detection/differentiable_binarization/tensorflow.py +34 -12
  47. doctr/models/detection/fast/base.py +6 -5
  48. doctr/models/detection/fast/pytorch.py +4 -4
  49. doctr/models/detection/fast/tensorflow.py +15 -12
  50. doctr/models/detection/linknet/base.py +4 -3
  51. doctr/models/detection/linknet/tensorflow.py +23 -11
  52. doctr/models/detection/predictor/pytorch.py +15 -1
  53. doctr/models/detection/predictor/tensorflow.py +17 -3
  54. doctr/models/detection/zoo.py +7 -2
  55. doctr/models/factory/hub.py +8 -18
  56. doctr/models/kie_predictor/base.py +13 -3
  57. doctr/models/kie_predictor/pytorch.py +45 -20
  58. doctr/models/kie_predictor/tensorflow.py +44 -17
  59. doctr/models/modules/layers/pytorch.py +2 -3
  60. doctr/models/modules/layers/tensorflow.py +6 -8
  61. doctr/models/modules/transformer/pytorch.py +2 -2
  62. doctr/models/modules/transformer/tensorflow.py +0 -2
  63. doctr/models/modules/vision_transformer/pytorch.py +1 -1
  64. doctr/models/modules/vision_transformer/tensorflow.py +1 -1
  65. doctr/models/predictor/base.py +97 -58
  66. doctr/models/predictor/pytorch.py +35 -20
  67. doctr/models/predictor/tensorflow.py +35 -18
  68. doctr/models/preprocessor/pytorch.py +4 -4
  69. doctr/models/preprocessor/tensorflow.py +3 -2
  70. doctr/models/recognition/crnn/tensorflow.py +8 -6
  71. doctr/models/recognition/master/pytorch.py +2 -2
  72. doctr/models/recognition/master/tensorflow.py +9 -4
  73. doctr/models/recognition/parseq/pytorch.py +4 -3
  74. doctr/models/recognition/parseq/tensorflow.py +14 -11
  75. doctr/models/recognition/sar/pytorch.py +7 -6
  76. doctr/models/recognition/sar/tensorflow.py +10 -12
  77. doctr/models/recognition/vitstr/pytorch.py +1 -1
  78. doctr/models/recognition/vitstr/tensorflow.py +9 -4
  79. doctr/models/recognition/zoo.py +1 -1
  80. doctr/models/utils/pytorch.py +1 -1
  81. doctr/models/utils/tensorflow.py +15 -15
  82. doctr/models/zoo.py +2 -2
  83. doctr/py.typed +0 -0
  84. doctr/transforms/functional/base.py +1 -1
  85. doctr/transforms/functional/pytorch.py +5 -5
  86. doctr/transforms/modules/base.py +37 -15
  87. doctr/transforms/modules/pytorch.py +73 -14
  88. doctr/transforms/modules/tensorflow.py +78 -19
  89. doctr/utils/fonts.py +7 -5
  90. doctr/utils/geometry.py +141 -31
  91. doctr/utils/metrics.py +34 -175
  92. doctr/utils/reconstitution.py +212 -0
  93. doctr/utils/visualization.py +5 -118
  94. doctr/version.py +1 -1
  95. {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/METADATA +85 -81
  96. python_doctr-0.10.0.dist-info/RECORD +173 -0
  97. {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/WHEEL +1 -1
  98. doctr/models/artefacts/__init__.py +0 -2
  99. doctr/models/artefacts/barcode.py +0 -74
  100. doctr/models/artefacts/face.py +0 -63
  101. doctr/models/obj_detection/__init__.py +0 -1
  102. doctr/models/obj_detection/faster_rcnn/__init__.py +0 -4
  103. doctr/models/obj_detection/faster_rcnn/pytorch.py +0 -81
  104. python_doctr-0.8.1.dist-info/RECORD +0 -173
  105. {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/LICENSE +0 -0
  106. {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/top_level.txt +0 -0
  107. {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/zip-safe +0 -0
doctr/utils/geometry.py CHANGED
@@ -20,11 +20,13 @@ __all__ = [
20
20
  "rotate_boxes",
21
21
  "compute_expanded_shape",
22
22
  "rotate_image",
23
+ "remove_image_padding",
23
24
  "estimate_page_angle",
24
25
  "convert_to_relative_coords",
25
26
  "rotate_abs_geoms",
26
27
  "extract_crops",
27
28
  "extract_rcrops",
29
+ "detach_scores",
28
30
  ]
29
31
 
30
32
 
@@ -57,6 +59,28 @@ def polygon_to_bbox(polygon: Polygon4P) -> BoundingBox:
57
59
  return (min(x), min(y)), (max(x), max(y))
58
60
 
59
61
 
62
+ def detach_scores(boxes: List[np.ndarray]) -> Tuple[List[np.ndarray], List[np.ndarray]]:
63
+ """Detach the objectness scores from box predictions
64
+
65
+ Args:
66
+ ----
67
+ boxes: list of arrays with boxes of shape (N, 5) or (N, 5, 2)
68
+
69
+ Returns:
70
+ -------
71
+ a tuple of two lists: the first one contains the boxes without the objectness scores,
72
+ the second one contains the objectness scores
73
+ """
74
+
75
+ def _detach(boxes: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
76
+ if boxes.ndim == 2:
77
+ return boxes[:, :-1], boxes[:, -1]
78
+ return boxes[:, :-1], boxes[:, -1, -1]
79
+
80
+ loc_preds, obj_scores = zip(*(_detach(box) for box in boxes))
81
+ return list(loc_preds), list(obj_scores)
82
+
83
+
60
84
  def resolve_enclosing_bbox(bboxes: Union[List[BoundingBox], np.ndarray]) -> Union[BoundingBox, np.ndarray]:
61
85
  """Compute enclosing bbox either from:
62
86
 
@@ -64,18 +88,18 @@ def resolve_enclosing_bbox(bboxes: Union[List[BoundingBox], np.ndarray]) -> Unio
64
88
  ----
65
89
  bboxes: boxes in one of the following formats:
66
90
 
67
- - an array of boxes: (*, 5), where boxes have this shape:
68
- (xmin, ymin, xmax, ymax, score)
91
+ - an array of boxes: (*, 4), where boxes have this shape:
92
+ (xmin, ymin, xmax, ymax)
69
93
 
70
94
  - a list of BoundingBox
71
95
 
72
96
  Returns:
73
97
  -------
74
- a (1, 5) array (enclosing boxarray), or a BoundingBox
98
+ a (1, 4) array (enclosing boxarray), or a BoundingBox
75
99
  """
76
100
  if isinstance(bboxes, np.ndarray):
77
- xmin, ymin, xmax, ymax, score = np.split(bboxes, 5, axis=1)
78
- return np.array([xmin.min(), ymin.min(), xmax.max(), ymax.max(), score.mean()])
101
+ xmin, ymin, xmax, ymax = np.split(bboxes, 4, axis=1)
102
+ return np.array([xmin.min(), ymin.min(), xmax.max(), ymax.max()])
79
103
  else:
80
104
  x, y = zip(*[point for box in bboxes for point in box])
81
105
  return (min(x), min(y)), (max(x), max(y))
@@ -88,21 +112,21 @@ def resolve_enclosing_rbbox(rbboxes: List[np.ndarray], intermed_size: int = 1024
88
112
  ----
89
113
  rbboxes: boxes in one of the following formats:
90
114
 
91
- - an array of boxes: (*, 5), where boxes have this shape:
92
- (xmin, ymin, xmax, ymax, score)
115
+ - an array of boxes: (*, 4, 2), where boxes have this shape:
116
+ (x1, y1), (x2, y2), (x3, y3), (x4, y4)
93
117
 
94
118
  - a list of BoundingBox
95
119
  intermed_size: size of the intermediate image
96
120
 
97
121
  Returns:
98
122
  -------
99
- a (1, 5) array (enclosing boxarray), or a BoundingBox
123
+ a (4, 2) array (enclosing rotated box)
100
124
  """
101
125
  cloud: np.ndarray = np.concatenate(rbboxes, axis=0)
102
126
  # Convert to absolute for minAreaRect
103
127
  cloud *= intermed_size
104
128
  rect = cv2.minAreaRect(cloud.astype(np.int32))
105
- return cv2.boxPoints(rect) / intermed_size # type: ignore[operator]
129
+ return cv2.boxPoints(rect) / intermed_size # type: ignore[return-value]
106
130
 
107
131
 
108
132
  def rotate_abs_points(points: np.ndarray, angle: float = 0.0) -> np.ndarray:
@@ -232,7 +256,7 @@ def rotate_boxes(
232
256
 
233
257
  Args:
234
258
  ----
235
- loc_preds: (N, 5) or (N, 4, 2) array of RELATIVE boxes
259
+ loc_preds: (N, 4) or (N, 4, 2) array of RELATIVE boxes
236
260
  angle: angle between -90 and +90 degrees
237
261
  orig_shape: shape of the origin image
238
262
  min_angle: minimum angle to rotate boxes
@@ -320,7 +344,7 @@ def rotate_image(
320
344
  # Pad height
321
345
  else:
322
346
  h_pad, w_pad = int(rot_img.shape[1] * image.shape[0] / image.shape[1] - rot_img.shape[0]), 0
323
- rot_img = np.pad(rot_img, ((h_pad // 2, h_pad - h_pad // 2), (w_pad // 2, w_pad - w_pad // 2), (0, 0)))
347
+ rot_img = np.pad(rot_img, ((h_pad // 2, h_pad - h_pad // 2), (w_pad // 2, w_pad - w_pad // 2), (0, 0))) # type: ignore[assignment]
324
348
  if preserve_origin_shape:
325
349
  # rescale
326
350
  rot_img = cv2.resize(rot_img, image.shape[:-1][::-1], interpolation=cv2.INTER_LINEAR)
@@ -328,6 +352,26 @@ def rotate_image(
328
352
  return rot_img
329
353
 
330
354
 
355
+ def remove_image_padding(image: np.ndarray) -> np.ndarray:
356
+ """Remove black border padding from an image
357
+
358
+ Args:
359
+ ----
360
+ image: numpy tensor to remove padding from
361
+
362
+ Returns:
363
+ -------
364
+ Image with padding removed
365
+ """
366
+ # Find the bounding box of the non-black region
367
+ rows = np.any(image, axis=1)
368
+ cols = np.any(image, axis=0)
369
+ rmin, rmax = np.where(rows)[0][[0, -1]]
370
+ cmin, cmax = np.where(cols)[0][[0, -1]]
371
+
372
+ return image[rmin : rmax + 1, cmin : cmax + 1]
373
+
374
+
331
375
  def estimate_page_angle(polys: np.ndarray) -> float:
332
376
  """Takes a batch of rotated previously ORIENTED polys (N, 4, 2) (rectified by the classifier) and return the
333
377
  estimated angle ccw in degrees
@@ -408,7 +452,7 @@ def extract_crops(img: np.ndarray, boxes: np.ndarray, channels_last: bool = True
408
452
 
409
453
 
410
454
  def extract_rcrops(
411
- img: np.ndarray, polys: np.ndarray, dtype=np.float32, channels_last: bool = True
455
+ img: np.ndarray, polys: np.ndarray, dtype=np.float32, channels_last: bool = True, assume_horizontal: bool = False
412
456
  ) -> List[np.ndarray]:
413
457
  """Created cropped images from list of rotated bounding boxes
414
458
 
@@ -418,6 +462,7 @@ def extract_rcrops(
418
462
  polys: bounding boxes of shape (N, 4, 2)
419
463
  dtype: target data type of bounding boxes
420
464
  channels_last: whether the channel dimensions is the last one instead of the last one
465
+ assume_horizontal: whether the boxes are assumed to be only horizontally oriented
421
466
 
422
467
  Returns:
423
468
  -------
@@ -435,22 +480,87 @@ def extract_rcrops(
435
480
  _boxes[:, :, 0] *= width
436
481
  _boxes[:, :, 1] *= height
437
482
 
438
- src_pts = _boxes[:, :3].astype(np.float32)
439
- # Preserve size
440
- d1 = np.linalg.norm(src_pts[:, 0] - src_pts[:, 1], axis=-1)
441
- d2 = np.linalg.norm(src_pts[:, 1] - src_pts[:, 2], axis=-1)
442
- # (N, 3, 2)
443
- dst_pts = np.zeros((_boxes.shape[0], 3, 2), dtype=dtype)
444
- dst_pts[:, 1, 0] = dst_pts[:, 2, 0] = d1 - 1
445
- dst_pts[:, 2, 1] = d2 - 1
446
- # Use a warp transformation to extract the crop
447
- crops = [
448
- cv2.warpAffine(
449
- img if channels_last else img.transpose(1, 2, 0),
450
- # Transformation matrix
451
- cv2.getAffineTransform(src_pts[idx], dst_pts[idx]),
452
- (int(d1[idx]), int(d2[idx])),
453
- )
454
- for idx in range(_boxes.shape[0])
455
- ]
456
- return crops
483
+ src_img = img if channels_last else img.transpose(1, 2, 0)
484
+
485
+ # Handle only horizontal oriented boxes
486
+ if assume_horizontal:
487
+ crops = []
488
+
489
+ for box in _boxes:
490
+ # Calculate the centroid of the quadrilateral
491
+ centroid = np.mean(box, axis=0)
492
+
493
+ # Divide the points into left and right
494
+ left_points = box[box[:, 0] < centroid[0]]
495
+ right_points = box[box[:, 0] >= centroid[0]]
496
+
497
+ # Sort the left points according to the y-axis
498
+ left_points = left_points[np.argsort(left_points[:, 1])]
499
+ top_left_pt = left_points[0]
500
+ bottom_left_pt = left_points[-1]
501
+ # Sort the right points according to the y-axis
502
+ right_points = right_points[np.argsort(right_points[:, 1])]
503
+ top_right_pt = right_points[0]
504
+ bottom_right_pt = right_points[-1]
505
+ box_points = np.array(
506
+ [top_left_pt, bottom_left_pt, top_right_pt, bottom_right_pt],
507
+ dtype=dtype,
508
+ )
509
+
510
+ # Get the width and height of the rectangle that will contain the warped quadrilateral
511
+ width_upper = np.linalg.norm(top_right_pt - top_left_pt)
512
+ width_lower = np.linalg.norm(bottom_right_pt - bottom_left_pt)
513
+ height_left = np.linalg.norm(bottom_left_pt - top_left_pt)
514
+ height_right = np.linalg.norm(bottom_right_pt - top_right_pt)
515
+
516
+ # Get the maximum width and height
517
+ rect_width = max(int(width_upper), int(width_lower))
518
+ rect_height = max(int(height_left), int(height_right))
519
+
520
+ dst_pts = np.array(
521
+ [
522
+ [0, 0], # top-left
523
+ # bottom-left
524
+ [0, rect_height - 1],
525
+ # top-right
526
+ [rect_width - 1, 0],
527
+ # bottom-right
528
+ [rect_width - 1, rect_height - 1],
529
+ ],
530
+ dtype=dtype,
531
+ )
532
+
533
+ # Get the perspective transform matrix using the box points
534
+ affine_mat = cv2.getPerspectiveTransform(box_points, dst_pts)
535
+
536
+ # Perform the perspective warp to get the rectified crop
537
+ crop = cv2.warpPerspective(
538
+ src_img,
539
+ affine_mat,
540
+ (rect_width, rect_height),
541
+ )
542
+
543
+ # Add the crop to the list of crops
544
+ crops.append(crop)
545
+
546
+ # Handle any oriented boxes
547
+ else:
548
+ src_pts = _boxes[:, :3].astype(np.float32)
549
+ # Preserve size
550
+ d1 = np.linalg.norm(src_pts[:, 0] - src_pts[:, 1], axis=-1)
551
+ d2 = np.linalg.norm(src_pts[:, 1] - src_pts[:, 2], axis=-1)
552
+ # (N, 3, 2)
553
+ dst_pts = np.zeros((_boxes.shape[0], 3, 2), dtype=dtype)
554
+ dst_pts[:, 1, 0] = dst_pts[:, 2, 0] = d1 - 1
555
+ dst_pts[:, 2, 1] = d2 - 1
556
+ # Use a warp transformation to extract the crop
557
+ crops = [
558
+ cv2.warpAffine(
559
+ src_img,
560
+ # Transformation matrix
561
+ cv2.getAffineTransform(src_pts[idx], dst_pts[idx]),
562
+ (int(d1[idx]), int(d2[idx])),
563
+ )
564
+ for idx in range(_boxes.shape[0])
565
+ ]
566
+ return crops # type: ignore[return-value]
doctr/utils/metrics.py CHANGED
@@ -5,16 +5,14 @@
5
5
 
6
6
  from typing import Dict, List, Optional, Tuple
7
7
 
8
- import cv2
9
8
  import numpy as np
9
+ from anyascii import anyascii
10
10
  from scipy.optimize import linear_sum_assignment
11
- from unidecode import unidecode
11
+ from shapely.geometry import Polygon
12
12
 
13
13
  __all__ = [
14
14
  "TextMatch",
15
15
  "box_iou",
16
- "box_ioa",
17
- "mask_iou",
18
16
  "polygon_iou",
19
17
  "nms",
20
18
  "LocalizationConfusion",
@@ -34,16 +32,16 @@ def string_match(word1: str, word2: str) -> Tuple[bool, bool, bool, bool]:
34
32
  Returns:
35
33
  -------
36
34
  a tuple with booleans specifying respectively whether the raw strings, their lower-case counterparts, their
37
- unidecode counterparts and their lower-case unidecode counterparts match
35
+ anyascii counterparts and their lower-case anyascii counterparts match
38
36
  """
39
37
  raw_match = word1 == word2
40
38
  caseless_match = word1.lower() == word2.lower()
41
- unidecode_match = unidecode(word1) == unidecode(word2)
39
+ anyascii_match = anyascii(word1) == anyascii(word2)
42
40
 
43
41
  # Warning: the order is important here otherwise the pair ("EUR", "€") cannot be matched
44
- unicase_match = unidecode(word1).lower() == unidecode(word2).lower()
42
+ unicase_match = anyascii(word1).lower() == anyascii(word2).lower()
45
43
 
46
- return raw_match, caseless_match, unidecode_match, unicase_match
44
+ return raw_match, caseless_match, anyascii_match, unicase_match
47
45
 
48
46
 
49
47
  class TextMatch:
@@ -94,10 +92,10 @@ class TextMatch:
94
92
  raise AssertionError("prediction size does not match with ground-truth labels size")
95
93
 
96
94
  for gt_word, pred_word in zip(gt, pred):
97
- _raw, _caseless, _unidecode, _unicase = string_match(gt_word, pred_word)
95
+ _raw, _caseless, _anyascii, _unicase = string_match(gt_word, pred_word)
98
96
  self.raw += int(_raw)
99
97
  self.caseless += int(_caseless)
100
- self.unidecode += int(_unidecode)
98
+ self.anyascii += int(_anyascii)
101
99
  self.unicase += int(_unicase)
102
100
 
103
101
  self.total += len(gt)
@@ -107,8 +105,8 @@ class TextMatch:
107
105
 
108
106
  Returns
109
107
  -------
110
- a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode
111
- counterpart and its lower-case unidecode counterpart
108
+ a dictionary with the exact match score for the raw data, its lower-case counterpart, its anyascii
109
+ counterpart and its lower-case anyascii counterpart
112
110
  """
113
111
  if self.total == 0:
114
112
  raise AssertionError("you need to update the metric before getting the summary")
@@ -116,14 +114,14 @@ class TextMatch:
116
114
  return dict(
117
115
  raw=self.raw / self.total,
118
116
  caseless=self.caseless / self.total,
119
- unidecode=self.unidecode / self.total,
117
+ anyascii=self.anyascii / self.total,
120
118
  unicase=self.unicase / self.total,
121
119
  )
122
120
 
123
121
  def reset(self) -> None:
124
122
  self.raw = 0
125
123
  self.caseless = 0
126
- self.unidecode = 0
124
+ self.anyascii = 0
127
125
  self.unicase = 0
128
126
  self.total = 0
129
127
 
@@ -151,73 +149,14 @@ def box_iou(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray:
151
149
  right = np.minimum(r1, r2.T)
152
150
  bot = np.minimum(b1, b2.T)
153
151
 
154
- intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf)
152
+ intersection = np.clip(right - left, 0, np.inf) * np.clip(bot - top, 0, np.inf)
155
153
  union = (r1 - l1) * (b1 - t1) + ((r2 - l2) * (b2 - t2)).T - intersection
156
154
  iou_mat = intersection / union
157
155
 
158
156
  return iou_mat
159
157
 
160
158
 
161
- def box_ioa(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray:
162
- """Computes the IoA (intersection over area) between two sets of bounding boxes:
163
- ioa(i, j) = inter(i, j) / area(i)
164
-
165
- Args:
166
- ----
167
- boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax)
168
- boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax)
169
-
170
- Returns:
171
- -------
172
- the IoA matrix of shape (N, M)
173
- """
174
- ioa_mat: np.ndarray = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32)
175
-
176
- if boxes_1.shape[0] > 0 and boxes_2.shape[0] > 0:
177
- l1, t1, r1, b1 = np.split(boxes_1, 4, axis=1)
178
- l2, t2, r2, b2 = np.split(boxes_2, 4, axis=1)
179
-
180
- left = np.maximum(l1, l2.T)
181
- top = np.maximum(t1, t2.T)
182
- right = np.minimum(r1, r2.T)
183
- bot = np.minimum(b1, b2.T)
184
-
185
- intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf)
186
- area = (r1 - l1) * (b1 - t1)
187
- ioa_mat = intersection / area
188
-
189
- return ioa_mat
190
-
191
-
192
- def mask_iou(masks_1: np.ndarray, masks_2: np.ndarray) -> np.ndarray:
193
- """Computes the IoU between two sets of boolean masks
194
-
195
- Args:
196
- ----
197
- masks_1: boolean masks of shape (N, H, W)
198
- masks_2: boolean masks of shape (M, H, W)
199
-
200
- Returns:
201
- -------
202
- the IoU matrix of shape (N, M)
203
- """
204
- if masks_1.shape[1:] != masks_2.shape[1:]:
205
- raise AssertionError("both boolean masks should have the same spatial shape")
206
-
207
- iou_mat: np.ndarray = np.zeros((masks_1.shape[0], masks_2.shape[0]), dtype=np.float32)
208
-
209
- if masks_1.shape[0] > 0 and masks_2.shape[0] > 0:
210
- axes = tuple(range(2, masks_1.ndim + 1))
211
- intersection = np.logical_and(masks_1[:, None, ...], masks_2[None, ...]).sum(axis=axes)
212
- union = np.logical_or(masks_1[:, None, ...], masks_2[None, ...]).sum(axis=axes)
213
- iou_mat = intersection / union
214
-
215
- return iou_mat
216
-
217
-
218
- def polygon_iou(
219
- polys_1: np.ndarray, polys_2: np.ndarray, mask_shape: Tuple[int, int], use_broadcasting: bool = False
220
- ) -> np.ndarray:
159
+ def polygon_iou(polys_1: np.ndarray, polys_2: np.ndarray) -> np.ndarray:
221
160
  """Computes the IoU between two sets of rotated bounding boxes
222
161
 
223
162
  Args:
@@ -234,80 +173,18 @@ def polygon_iou(
234
173
  if polys_1.ndim != 3 or polys_2.ndim != 3:
235
174
  raise AssertionError("expects boxes to be in format (N, 4, 2)")
236
175
 
237
- iou_mat: np.ndarray = np.zeros((polys_1.shape[0], polys_2.shape[0]), dtype=np.float32)
238
-
239
- if polys_1.shape[0] > 0 and polys_2.shape[0] > 0:
240
- if use_broadcasting:
241
- masks_1 = rbox_to_mask(polys_1, shape=mask_shape)
242
- masks_2 = rbox_to_mask(polys_2, shape=mask_shape)
243
- iou_mat = mask_iou(masks_1, masks_2)
244
- else:
245
- # Save memory by doing the computation for each pair
246
- for idx, b1 in enumerate(polys_1):
247
- m1 = _rbox_to_mask(b1, mask_shape)
248
- for _idx, b2 in enumerate(polys_2):
249
- m2 = _rbox_to_mask(b2, mask_shape)
250
- iou_mat[idx, _idx] = np.logical_and(m1, m2).sum() / np.logical_or(m1, m2).sum()
251
-
252
- return iou_mat
176
+ iou_mat = np.zeros((polys_1.shape[0], polys_2.shape[0]), dtype=np.float32)
253
177
 
178
+ shapely_polys_1 = [Polygon(poly) for poly in polys_1]
179
+ shapely_polys_2 = [Polygon(poly) for poly in polys_2]
254
180
 
255
- def _rbox_to_mask(box: np.ndarray, shape: Tuple[int, int]) -> np.ndarray:
256
- """Converts a rotated bounding box to a boolean mask
181
+ for i, poly1 in enumerate(shapely_polys_1):
182
+ for j, poly2 in enumerate(shapely_polys_2):
183
+ intersection_area = poly1.intersection(poly2).area
184
+ union_area = poly1.area + poly2.area - intersection_area
185
+ iou_mat[i, j] = intersection_area / union_area
257
186
 
258
- Args:
259
- ----
260
- box: rotated bounding box of shape (4, 2)
261
- shape: spatial shapes of the output masks
262
-
263
- Returns:
264
- -------
265
- the boolean mask of the specified shape
266
- """
267
- mask: np.ndarray = np.zeros(shape, dtype=np.uint8)
268
- # Get absolute coords
269
- if not np.issubdtype(box.dtype, np.integer):
270
- abs_box = box.copy()
271
- abs_box[:, 0] = abs_box[:, 0] * shape[1]
272
- abs_box[:, 1] = abs_box[:, 1] * shape[0]
273
- abs_box = abs_box.round().astype(int)
274
- else:
275
- abs_box = box
276
- abs_box[2:] = abs_box[2:] + 1
277
- cv2.fillPoly(mask, [abs_box - 1], 1.0) # type: ignore[call-overload]
278
-
279
- return mask.astype(bool)
280
-
281
-
282
- def rbox_to_mask(boxes: np.ndarray, shape: Tuple[int, int]) -> np.ndarray:
283
- """Converts rotated bounding boxes to boolean masks
284
-
285
- Args:
286
- ----
287
- boxes: rotated bounding boxes of shape (N, 4, 2)
288
- shape: spatial shapes of the output masks
289
-
290
- Returns:
291
- -------
292
- the boolean masks of shape (N, H, W)
293
- """
294
- masks: np.ndarray = np.zeros((boxes.shape[0], *shape), dtype=np.uint8)
295
-
296
- if boxes.shape[0] > 0:
297
- # Get absolute coordinates
298
- if not np.issubdtype(boxes.dtype, np.integer):
299
- abs_boxes = boxes.copy()
300
- abs_boxes[:, :, 0] = abs_boxes[:, :, 0] * shape[1]
301
- abs_boxes[:, :, 1] = abs_boxes[:, :, 1] * shape[0]
302
- abs_boxes = abs_boxes.round().astype(int)
303
- else:
304
- abs_boxes = boxes
305
- abs_boxes[:, 2:] = abs_boxes[:, 2:] + 1
306
-
307
- # TODO: optimize slicing to improve vectorization
308
- for idx, _box in enumerate(abs_boxes):
309
- cv2.fillPoly(masks[idx], [_box - 1], 1.0) # type: ignore[call-overload]
310
- return masks.astype(bool)
187
+ return iou_mat
311
188
 
312
189
 
313
190
  def nms(boxes: np.ndarray, thresh: float = 0.5) -> List[int]:
@@ -386,21 +263,15 @@ class LocalizationConfusion:
386
263
  ----
387
264
  iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
388
265
  use_polygons: if set to True, predictions and targets will be expected to have rotated format
389
- mask_shape: if use_polygons is True, describes the spatial shape of the image used
390
- use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory
391
266
  """
392
267
 
393
268
  def __init__(
394
269
  self,
395
270
  iou_thresh: float = 0.5,
396
271
  use_polygons: bool = False,
397
- mask_shape: Tuple[int, int] = (1024, 1024),
398
- use_broadcasting: bool = True,
399
272
  ) -> None:
400
273
  self.iou_thresh = iou_thresh
401
274
  self.use_polygons = use_polygons
402
- self.mask_shape = mask_shape
403
- self.use_broadcasting = use_broadcasting
404
275
  self.reset()
405
276
 
406
277
  def update(self, gts: np.ndarray, preds: np.ndarray) -> None:
@@ -414,7 +285,7 @@ class LocalizationConfusion:
414
285
  if preds.shape[0] > 0:
415
286
  # Compute IoU
416
287
  if self.use_polygons:
417
- iou_mat = polygon_iou(gts, preds, self.mask_shape, self.use_broadcasting)
288
+ iou_mat = polygon_iou(gts, preds)
418
289
  else:
419
290
  iou_mat = box_iou(gts, preds)
420
291
  self.tot_iou += float(iou_mat.max(axis=0).sum())
@@ -441,7 +312,7 @@ class LocalizationConfusion:
441
312
  precision = self.matches / self.num_preds if self.num_preds > 0 else None
442
313
 
443
314
  # mean IoU
444
- mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
315
+ mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
445
316
 
446
317
  return recall, precision, mean_iou
447
318
 
@@ -492,21 +363,15 @@ class OCRMetric:
492
363
  ----
493
364
  iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
494
365
  use_polygons: if set to True, predictions and targets will be expected to have rotated format
495
- mask_shape: if use_polygons is True, describes the spatial shape of the image used
496
- use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory
497
366
  """
498
367
 
499
368
  def __init__(
500
369
  self,
501
370
  iou_thresh: float = 0.5,
502
371
  use_polygons: bool = False,
503
- mask_shape: Tuple[int, int] = (1024, 1024),
504
- use_broadcasting: bool = True,
505
372
  ) -> None:
506
373
  self.iou_thresh = iou_thresh
507
374
  self.use_polygons = use_polygons
508
- self.mask_shape = mask_shape
509
- self.use_broadcasting = use_broadcasting
510
375
  self.reset()
511
376
 
512
377
  def update(
@@ -533,7 +398,7 @@ class OCRMetric:
533
398
  # Compute IoU
534
399
  if pred_boxes.shape[0] > 0:
535
400
  if self.use_polygons:
536
- iou_mat = polygon_iou(gt_boxes, pred_boxes, self.mask_shape, self.use_broadcasting)
401
+ iou_mat = polygon_iou(gt_boxes, pred_boxes)
537
402
  else:
538
403
  iou_mat = box_iou(gt_boxes, pred_boxes)
539
404
 
@@ -544,10 +409,10 @@ class OCRMetric:
544
409
  is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh
545
410
  # String comparison
546
411
  for gt_idx, pred_idx in zip(gt_indices[is_kept], pred_indices[is_kept]):
547
- _raw, _caseless, _unidecode, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
412
+ _raw, _caseless, _anyascii, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
548
413
  self.raw_matches += int(_raw)
549
414
  self.caseless_matches += int(_caseless)
550
- self.unidecode_matches += int(_unidecode)
415
+ self.anyascii_matches += int(_anyascii)
551
416
  self.unicase_matches += int(_unicase)
552
417
 
553
418
  self.num_gts += gt_boxes.shape[0]
@@ -564,7 +429,7 @@ class OCRMetric:
564
429
  recall = dict(
565
430
  raw=self.raw_matches / self.num_gts if self.num_gts > 0 else None,
566
431
  caseless=self.caseless_matches / self.num_gts if self.num_gts > 0 else None,
567
- unidecode=self.unidecode_matches / self.num_gts if self.num_gts > 0 else None,
432
+ anyascii=self.anyascii_matches / self.num_gts if self.num_gts > 0 else None,
568
433
  unicase=self.unicase_matches / self.num_gts if self.num_gts > 0 else None,
569
434
  )
570
435
 
@@ -572,12 +437,12 @@ class OCRMetric:
572
437
  precision = dict(
573
438
  raw=self.raw_matches / self.num_preds if self.num_preds > 0 else None,
574
439
  caseless=self.caseless_matches / self.num_preds if self.num_preds > 0 else None,
575
- unidecode=self.unidecode_matches / self.num_preds if self.num_preds > 0 else None,
440
+ anyascii=self.anyascii_matches / self.num_preds if self.num_preds > 0 else None,
576
441
  unicase=self.unicase_matches / self.num_preds if self.num_preds > 0 else None,
577
442
  )
578
443
 
579
444
  # mean IoU (overall detected boxes)
580
- mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
445
+ mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
581
446
 
582
447
  return recall, precision, mean_iou
583
448
 
@@ -587,7 +452,7 @@ class OCRMetric:
587
452
  self.tot_iou = 0.0
588
453
  self.raw_matches = 0
589
454
  self.caseless_matches = 0
590
- self.unidecode_matches = 0
455
+ self.anyascii_matches = 0
591
456
  self.unicase_matches = 0
592
457
 
593
458
 
@@ -631,21 +496,15 @@ class DetectionMetric:
631
496
  ----
632
497
  iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
633
498
  use_polygons: if set to True, predictions and targets will be expected to have rotated format
634
- mask_shape: if use_polygons is True, describes the spatial shape of the image used
635
- use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory
636
499
  """
637
500
 
638
501
  def __init__(
639
502
  self,
640
503
  iou_thresh: float = 0.5,
641
504
  use_polygons: bool = False,
642
- mask_shape: Tuple[int, int] = (1024, 1024),
643
- use_broadcasting: bool = True,
644
505
  ) -> None:
645
506
  self.iou_thresh = iou_thresh
646
507
  self.use_polygons = use_polygons
647
- self.mask_shape = mask_shape
648
- self.use_broadcasting = use_broadcasting
649
508
  self.reset()
650
509
 
651
510
  def update(
@@ -672,7 +531,7 @@ class DetectionMetric:
672
531
  # Compute IoU
673
532
  if pred_boxes.shape[0] > 0:
674
533
  if self.use_polygons:
675
- iou_mat = polygon_iou(gt_boxes, pred_boxes, self.mask_shape, self.use_broadcasting)
534
+ iou_mat = polygon_iou(gt_boxes, pred_boxes)
676
535
  else:
677
536
  iou_mat = box_iou(gt_boxes, pred_boxes)
678
537
 
@@ -701,7 +560,7 @@ class DetectionMetric:
701
560
  precision = self.num_matches / self.num_preds if self.num_preds > 0 else None
702
561
 
703
562
  # mean IoU (overall detected boxes)
704
- mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
563
+ mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
705
564
 
706
565
  return recall, precision, mean_iou
707
566