nrtk_albumentations-2.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nrtk-albumentations might be problematic.

Files changed (62)
  1. albumentations/__init__.py +21 -0
  2. albumentations/augmentations/__init__.py +23 -0
  3. albumentations/augmentations/blur/__init__.py +0 -0
  4. albumentations/augmentations/blur/functional.py +438 -0
  5. albumentations/augmentations/blur/transforms.py +1633 -0
  6. albumentations/augmentations/crops/__init__.py +0 -0
  7. albumentations/augmentations/crops/functional.py +494 -0
  8. albumentations/augmentations/crops/transforms.py +3647 -0
  9. albumentations/augmentations/dropout/__init__.py +0 -0
  10. albumentations/augmentations/dropout/channel_dropout.py +134 -0
  11. albumentations/augmentations/dropout/coarse_dropout.py +567 -0
  12. albumentations/augmentations/dropout/functional.py +1017 -0
  13. albumentations/augmentations/dropout/grid_dropout.py +166 -0
  14. albumentations/augmentations/dropout/mask_dropout.py +274 -0
  15. albumentations/augmentations/dropout/transforms.py +461 -0
  16. albumentations/augmentations/dropout/xy_masking.py +186 -0
  17. albumentations/augmentations/geometric/__init__.py +0 -0
  18. albumentations/augmentations/geometric/distortion.py +1238 -0
  19. albumentations/augmentations/geometric/flip.py +752 -0
  20. albumentations/augmentations/geometric/functional.py +4151 -0
  21. albumentations/augmentations/geometric/pad.py +676 -0
  22. albumentations/augmentations/geometric/resize.py +956 -0
  23. albumentations/augmentations/geometric/rotate.py +864 -0
  24. albumentations/augmentations/geometric/transforms.py +1962 -0
  25. albumentations/augmentations/mixing/__init__.py +0 -0
  26. albumentations/augmentations/mixing/domain_adaptation.py +787 -0
  27. albumentations/augmentations/mixing/domain_adaptation_functional.py +453 -0
  28. albumentations/augmentations/mixing/functional.py +878 -0
  29. albumentations/augmentations/mixing/transforms.py +832 -0
  30. albumentations/augmentations/other/__init__.py +0 -0
  31. albumentations/augmentations/other/lambda_transform.py +180 -0
  32. albumentations/augmentations/other/type_transform.py +261 -0
  33. albumentations/augmentations/pixel/__init__.py +0 -0
  34. albumentations/augmentations/pixel/functional.py +4226 -0
  35. albumentations/augmentations/pixel/transforms.py +7556 -0
  36. albumentations/augmentations/spectrogram/__init__.py +0 -0
  37. albumentations/augmentations/spectrogram/transform.py +220 -0
  38. albumentations/augmentations/text/__init__.py +0 -0
  39. albumentations/augmentations/text/functional.py +272 -0
  40. albumentations/augmentations/text/transforms.py +299 -0
  41. albumentations/augmentations/transforms3d/__init__.py +0 -0
  42. albumentations/augmentations/transforms3d/functional.py +393 -0
  43. albumentations/augmentations/transforms3d/transforms.py +1422 -0
  44. albumentations/augmentations/utils.py +249 -0
  45. albumentations/core/__init__.py +0 -0
  46. albumentations/core/bbox_utils.py +920 -0
  47. albumentations/core/composition.py +1885 -0
  48. albumentations/core/hub_mixin.py +299 -0
  49. albumentations/core/keypoints_utils.py +521 -0
  50. albumentations/core/label_manager.py +339 -0
  51. albumentations/core/pydantic.py +239 -0
  52. albumentations/core/serialization.py +352 -0
  53. albumentations/core/transforms_interface.py +976 -0
  54. albumentations/core/type_definitions.py +127 -0
  55. albumentations/core/utils.py +605 -0
  56. albumentations/core/validation.py +129 -0
  57. albumentations/pytorch/__init__.py +1 -0
  58. albumentations/pytorch/transforms.py +189 -0
  59. nrtk_albumentations-2.1.0.dist-info/METADATA +196 -0
  60. nrtk_albumentations-2.1.0.dist-info/RECORD +62 -0
  61. nrtk_albumentations-2.1.0.dist-info/WHEEL +4 -0
  62. nrtk_albumentations-2.1.0.dist-info/licenses/LICENSE +21 -0
albumentations/augmentations/geometric/functional.py
@@ -0,0 +1,4151 @@
"""Functional implementations of geometric image transformations.

This module provides low-level functions for geometric operations such as rotation,
resizing, flipping, perspective transforms, and affine transformations on images,
bounding boxes and keypoints.
"""

from __future__ import annotations

import math
from collections import defaultdict
from collections.abc import Mapping, Sequence
from typing import Any, Literal, cast
from warnings import warn

import cv2
import numpy as np
from albucore import (
    get_num_channels,
    hflip,
    maybe_process_in_chunks,
    preserve_channel_dim,
    vflip,
)

from albumentations.augmentations.utils import angle_2pi_range, handle_empty_array
from albumentations.core.bbox_utils import (
    bboxes_from_masks,
    bboxes_to_mask,
    denormalize_bboxes,
    mask_to_bboxes,
    masks_from_bboxes,
    normalize_bboxes,
)
from albumentations.core.type_definitions import (
    NUM_BBOXES_COLUMNS_IN_ALBUMENTATIONS,
    NUM_KEYPOINTS_COLUMNS_IN_ALBUMENTATIONS,
    NUM_MULTI_CHANNEL_DIMENSIONS,
    REFLECT_BORDER_MODES,
)

PAIR = 2

ROT90_180_FACTOR = 2
ROT90_270_FACTOR = 3


@handle_empty_array("bboxes")
def bboxes_rot90(bboxes: np.ndarray, factor: int) -> np.ndarray:
    """Rotates bounding boxes by 90 degrees CCW (see np.rot90).

    Args:
        bboxes (np.ndarray): Array of bounding boxes with shape (num_boxes, 4+),
            with coordinates normalized to [0, 1].
        factor (int): Number of 90-degree rotations (0, 1, 2, or 3).

    Returns:
        np.ndarray: Rotated bounding boxes.

    """
    if factor == 0:
        return bboxes

    rotated_bboxes = bboxes.copy()
    x_min, y_min, x_max, y_max = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3]

    if factor == 1:
        rotated_bboxes[:, 0] = y_min
        rotated_bboxes[:, 1] = 1 - x_max
        rotated_bboxes[:, 2] = y_max
        rotated_bboxes[:, 3] = 1 - x_min
    elif factor == ROT90_180_FACTOR:
        rotated_bboxes[:, 0] = 1 - x_max
        rotated_bboxes[:, 1] = 1 - y_max
        rotated_bboxes[:, 2] = 1 - x_min
        rotated_bboxes[:, 3] = 1 - y_min
    elif factor == ROT90_270_FACTOR:
        rotated_bboxes[:, 0] = 1 - y_max
        rotated_bboxes[:, 1] = x_min
        rotated_bboxes[:, 2] = 1 - y_min
        rotated_bboxes[:, 3] = x_max

    return rotated_bboxes

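# NOTE (editorial sketch, not part of the package source): because the boxes are
# normalized to [0, 1], four successive 90-degree rotations compose to the
# identity. A quick self-check, assuming this module is importable:
#
#     >>> box = np.array([[0.1, 0.2, 0.3, 0.4]])
#     >>> out = box
#     >>> for _ in range(4):
#     ...     out = bboxes_rot90(out, 1)
#     >>> bool(np.allclose(out, box))
#     True
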
@handle_empty_array("bboxes")
def bboxes_d4(
    bboxes: np.ndarray,
    group_member: Literal["e", "r90", "r180", "r270", "v", "hvt", "h", "t"],
) -> np.ndarray:
    """Applies a `D_4` symmetry group transformation to bounding boxes.

    The function transforms bounding boxes according to the specified group member from the `D_4` group.
    These transformations include rotations and reflections, specified to work on an image's bounding box given
    its dimensions.

    Args:
        bboxes (np.ndarray): A numpy array of bounding boxes with shape (num_bboxes, 4+).
            Each row represents a bounding box (x_min, y_min, x_max, y_max, ...).
        group_member (Literal["e", "r90", "r180", "r270", "v", "hvt", "h", "t"]): A string identifier for the
            `D_4` group transformation to apply.

    Returns:
        np.ndarray: The transformed bounding boxes.

    Raises:
        ValueError: If an invalid group member is specified.

    """
    transformations = {
        "e": lambda x: x,  # Identity transformation
        "r90": lambda x: bboxes_rot90(x, 1),  # Rotate 90 degrees
        "r180": lambda x: bboxes_rot90(x, 2),  # Rotate 180 degrees
        "r270": lambda x: bboxes_rot90(x, 3),  # Rotate 270 degrees
        "v": lambda x: bboxes_vflip(x),  # Vertical flip
        "hvt": lambda x: bboxes_transpose(
            bboxes_rot90(x, 2),
        ),  # Reflect over anti-diagonal
        "h": lambda x: bboxes_hflip(x),  # Horizontal flip
        "t": lambda x: bboxes_transpose(x),  # Transpose (reflect over main diagonal)
    }

    # Execute the appropriate transformation
    if group_member in transformations:
        return transformations[group_member](bboxes)

    raise ValueError(f"Invalid group member: {group_member}")


@handle_empty_array("keypoints")
@angle_2pi_range
def keypoints_rot90(
    keypoints: np.ndarray,
    factor: Literal[0, 1, 2, 3],
    image_shape: tuple[int, int],
) -> np.ndarray:
    """Rotate keypoints by 90 degrees counter-clockwise (CCW) a specified number of times.

    Args:
        keypoints (np.ndarray): An array of keypoints with shape (N, 4+) in the format (x, y, z, angle, ...).
        factor (Literal[0, 1, 2, 3]): The number of 90 degree CCW rotations to apply. Must be in the range [0, 3].
        image_shape (tuple[int, int]): The shape of the image (height, width).

    Returns:
        np.ndarray: The rotated keypoints with the same shape as the input.

    """
    if factor == 0:
        return keypoints

    height, width = image_shape[:2]
    rotated_keypoints = keypoints.copy().astype(np.float32)

    x, y, angle = keypoints[:, 0], keypoints[:, 1], keypoints[:, 3]

    if factor == 1:
        rotated_keypoints[:, 0] = y
        rotated_keypoints[:, 1] = width - 1 - x
        rotated_keypoints[:, 3] = angle - np.pi / 2
    elif factor == ROT90_180_FACTOR:
        rotated_keypoints[:, 0] = width - 1 - x
        rotated_keypoints[:, 1] = height - 1 - y
        rotated_keypoints[:, 3] = angle - np.pi
    elif factor == ROT90_270_FACTOR:
        rotated_keypoints[:, 0] = height - 1 - y
        rotated_keypoints[:, 1] = x
        rotated_keypoints[:, 3] = angle + np.pi / 2

    return rotated_keypoints

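# NOTE (editorial sketch, not part of the package source): after one 90-degree
# CCW rotation the new x is the old y, and the new y is (width - 1 - x), since
# the rotated image has shape (width, height):
#
#     >>> kps = np.array([[10.0, 20.0, 0.0, 0.0, 1.0]])  # (x, y, z, angle, scale)
#     >>> keypoints_rot90(kps, 1, (100, 200))[0, :2]  # -> [20., 189.]
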
@handle_empty_array("keypoints")
def keypoints_d4(
    keypoints: np.ndarray,
    group_member: Literal["e", "r90", "r180", "r270", "v", "hvt", "h", "t"],
    image_shape: tuple[int, int],
    **params: Any,
) -> np.ndarray:
    """Applies a `D_4` symmetry group transformation to keypoints.

    This function adjusts keypoint coordinates according to the specified `D_4` group transformation,
    which includes rotations and reflections suitable for image processing tasks. These transformations account
    for the dimensions of the image to ensure the keypoint remains within its boundaries.

    Args:
        keypoints (np.ndarray): An array of keypoints with shape (N, 4+) in the format (x, y, z, angle, ...).
        group_member (Literal["e", "r90", "r180", "r270", "v", "hvt", "h", "t"]): A string identifier for
            the `D_4` group transformation to apply.
            Valid values are 'e', 'r90', 'r180', 'r270', 'v', 'hvt', 'h', 't'.
        image_shape (tuple[int, int]): The shape of the image.
        params (Any): Not used.

    Returns:
        np.ndarray: The transformed keypoints.

    Raises:
        ValueError: If an invalid group member is specified, indicating that the specified transformation
            does not exist.

    """
    rows, cols = image_shape[:2]
    transformations = {
        "e": lambda x: x,  # Identity transformation
        "r90": lambda x: keypoints_rot90(x, 1, image_shape),  # Rotate 90 degrees
        "r180": lambda x: keypoints_rot90(x, 2, image_shape),  # Rotate 180 degrees
        "r270": lambda x: keypoints_rot90(x, 3, image_shape),  # Rotate 270 degrees
        "v": lambda x: keypoints_vflip(x, rows),  # Vertical flip
        "hvt": lambda x: keypoints_transpose(
            keypoints_rot90(x, 2, image_shape),
        ),  # Reflect over anti-diagonal
        "h": lambda x: keypoints_hflip(x, cols),  # Horizontal flip
        "t": lambda x: keypoints_transpose(x),  # Transpose (reflect over main diagonal)
    }
    # Execute the appropriate transformation
    if group_member in transformations:
        return transformations[group_member](keypoints)

    raise ValueError(f"Invalid group member: {group_member}")


@preserve_channel_dim
def resize(
    img: np.ndarray,
    target_shape: tuple[int, int],
    interpolation: int,
) -> np.ndarray:
    """Resize an image to the specified dimensions.

    This function resizes an input image to the target shape using the specified
    interpolation method. If the image is already the target size, it is returned unchanged.

    Args:
        img (np.ndarray): Input image to resize.
        target_shape (tuple[int, int]): Target (height, width) dimensions.
        interpolation (int): Interpolation method to use (cv2 interpolation flag).
            Examples: cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_NEAREST, etc.

    Returns:
        np.ndarray: Resized image with shape target_shape + original channel dimensions.

    """
    if target_shape == img.shape[:2]:
        return img

    height, width = target_shape[:2]
    resize_fn = maybe_process_in_chunks(
        cv2.resize,
        dsize=(width, height),
        interpolation=interpolation,
    )
    return resize_fn(img)


@preserve_channel_dim
def scale(img: np.ndarray, scale: float, interpolation: int) -> np.ndarray:
    """Scale an image by a factor while preserving aspect ratio.

    This function scales both height and width dimensions of the image by the same factor.

    Args:
        img (np.ndarray): Input image to scale.
        scale (float): Scale factor. Values > 1 will enlarge the image, values < 1 will shrink it.
        interpolation (int): Interpolation method to use (cv2 interpolation flag).

    Returns:
        np.ndarray: Scaled image.

    """
    height, width = img.shape[:2]
    new_size = int(height * scale), int(width * scale)
    return resize(img, new_size, interpolation)

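# NOTE (editorial sketch, not part of the package source): both dimensions are
# multiplied by the factor and truncated by int(), so a 100x200 image scaled by
# 0.5 becomes 50x100:
#
#     >>> img = np.zeros((100, 200, 3), dtype=np.uint8)
#     >>> scale(img, 0.5, cv2.INTER_LINEAR).shape
#     (50, 100, 3)
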
@handle_empty_array("keypoints")
def keypoints_scale(
    keypoints: np.ndarray,
    scale_x: float,
    scale_y: float,
) -> np.ndarray:
    """Scale keypoints by given factors.

    Args:
        keypoints (np.ndarray): Array of keypoints with shape (num_keypoints, 5+)
            in the format (x, y, z, angle, scale, ...)
        scale_x (float): Scale factor for x coordinates
        scale_y (float): Scale factor for y coordinates

    Returns:
        np.ndarray: Scaled keypoints

    """
    # Extract x, y, z, angle, and scale
    x, y, z, angle, scale = (
        keypoints[:, 0],
        keypoints[:, 1],
        keypoints[:, 2],
        keypoints[:, 3],
        keypoints[:, 4],
    )

    # Scale x and y
    x_scaled = x * scale_x
    y_scaled = y * scale_y

    # Scale the keypoint scale by the maximum of scale_x and scale_y
    scale_scaled = scale * max(scale_x, scale_y)

    # Create the output array
    scaled_keypoints = np.column_stack([x_scaled, y_scaled, z, angle, scale_scaled])

    # If there are additional columns, preserve them
    if keypoints.shape[1] > NUM_KEYPOINTS_COLUMNS_IN_ALBUMENTATIONS:
        return np.column_stack(
            [scaled_keypoints, keypoints[:, NUM_KEYPOINTS_COLUMNS_IN_ALBUMENTATIONS:]],
        )

    return scaled_keypoints

@preserve_channel_dim
def perspective(
    img: np.ndarray,
    matrix: np.ndarray,
    max_width: int,
    max_height: int,
    border_val: float | list[float] | np.ndarray,
    border_mode: int,
    keep_size: bool,
    interpolation: int,
) -> np.ndarray:
    """Apply perspective transformation to an image.

    This function warps an image according to a perspective transformation matrix.
    It can either maintain the original dimensions or use the specified max dimensions.

    Args:
        img (np.ndarray): Input image to transform.
        matrix (np.ndarray): 3x3 perspective transformation matrix.
        max_width (int): Maximum width of the output image if keep_size is False.
        max_height (int): Maximum height of the output image if keep_size is False.
        border_val (float | list[float] | np.ndarray): Border value(s) to fill areas outside the transformed image.
        border_mode (int): OpenCV border mode (e.g., cv2.BORDER_CONSTANT, cv2.BORDER_REFLECT).
        keep_size (bool): If True, maintain the original image dimensions.
        interpolation (int): Interpolation method for resampling (cv2 interpolation flag).

    Returns:
        np.ndarray: Perspective-transformed image.

    """
    if not keep_size:
        perspective_func = maybe_process_in_chunks(
            cv2.warpPerspective,
            M=matrix,
            dsize=(max_width, max_height),
            borderMode=border_mode,
            borderValue=border_val,
            flags=interpolation,
        )
    else:
        height, width = img.shape[:2]

        scale_x = width / max_width
        scale_y = height / max_height
        scale_matrix = np.array([[scale_x, 0, 0], [0, scale_y, 0], [0, 0, 1]])
        adjusted_matrix = np.dot(scale_matrix, matrix)

        perspective_func = maybe_process_in_chunks(
            cv2.warpPerspective,
            M=adjusted_matrix,
            dsize=(width, height),
            borderMode=border_mode,
            borderValue=border_val,
            flags=interpolation,
        )

    return perspective_func(img)

@handle_empty_array("bboxes")
def perspective_bboxes(
    bboxes: np.ndarray,
    image_shape: tuple[int, int],
    matrix: np.ndarray,
    max_width: int,
    max_height: int,
    keep_size: bool,
) -> np.ndarray:
    """Applies perspective transformation to bounding boxes.

    This function transforms bounding boxes using the given perspective transformation matrix.
    It handles bounding boxes with additional attributes beyond the standard coordinates.

    Args:
        bboxes (np.ndarray): An array of bounding boxes with shape (num_bboxes, 4+).
            Each row represents a bounding box (x_min, y_min, x_max, y_max, ...).
            Additional columns beyond the first 4 are preserved unchanged.
        image_shape (tuple[int, int]): The shape of the image (height, width).
        matrix (np.ndarray): The perspective transformation matrix.
        max_width (int): The maximum width of the output image.
        max_height (int): The maximum height of the output image.
        keep_size (bool): If True, maintains the original image size after transformation.

    Returns:
        np.ndarray: An array of transformed bounding boxes with the same shape as input.
            The first 4 columns contain the transformed coordinates, and any
            additional columns are preserved from the input.

    Note:
        - This function modifies only the coordinate columns (first 4) of the input bounding boxes.
        - Any additional attributes (columns beyond the first 4) are kept unchanged.
        - The function handles denormalization and renormalization of coordinates internally.

    Examples:
        >>> bboxes = np.array([[0.1, 0.1, 0.3, 0.3, 1], [0.5, 0.5, 0.8, 0.8, 2]])
        >>> image_shape = (100, 100)
        >>> matrix = np.array([[1.5, 0.2, -20], [-0.1, 1.3, -10], [0.002, 0.001, 1]])
        >>> transformed_bboxes = perspective_bboxes(bboxes, image_shape, matrix, 150, 150, False)

    """
    height, width = image_shape[:2]
    transformed_bboxes = bboxes.copy()
    denormalized_coords = denormalize_bboxes(bboxes[:, :4], image_shape)

    x_min, y_min, x_max, y_max = denormalized_coords.T
    points = np.array(
        [[x_min, y_min], [x_max, y_min], [x_max, y_max], [x_min, y_max]],
    ).transpose(2, 0, 1)
    points_reshaped = points.reshape(-1, 1, 2)

    transformed_points = cv2.perspectiveTransform(
        points_reshaped.astype(np.float32),
        matrix,
    )
    transformed_points = transformed_points.reshape(-1, 4, 2)

    new_coords = np.array(
        [[np.min(box[:, 0]), np.min(box[:, 1]), np.max(box[:, 0]), np.max(box[:, 1])] for box in transformed_points],
    )

    if keep_size:
        scale_x, scale_y = width / max_width, height / max_height
        new_coords[:, [0, 2]] *= scale_x
        new_coords[:, [1, 3]] *= scale_y
        output_shape = image_shape
    else:
        output_shape = (max_height, max_width)

    normalized_coords = normalize_bboxes(new_coords, output_shape)
    transformed_bboxes[:, :4] = normalized_coords

    return transformed_bboxes

def rotation2d_matrix_to_euler_angles(matrix: np.ndarray, y_up: bool) -> float:
    """Extract the rotation angle from a 2x2 rotation matrix.

    Args:
        matrix (np.ndarray): Rotation matrix.
        y_up (bool): Whether the Y axis points up (True) or down (False).

    Returns:
        float: Rotation angle in radians.

    """
    if y_up:
        return np.arctan2(matrix[1, 0], matrix[0, 0])
    return np.arctan2(-matrix[1, 0], matrix[0, 0])

@handle_empty_array("keypoints")
@angle_2pi_range
def perspective_keypoints(
    keypoints: np.ndarray,
    image_shape: tuple[int, int],
    matrix: np.ndarray,
    max_width: int,
    max_height: int,
    keep_size: bool,
) -> np.ndarray:
    """Apply perspective transformation to keypoints.

    Args:
        keypoints (np.ndarray): Array of shape (N, 5+) in format [x, y, z, angle, scale, ...]
        image_shape (tuple[int, int]): Original image shape (height, width)
        matrix (np.ndarray): 3x3 perspective transformation matrix
        max_width (int): Maximum width after transformation
        max_height (int): Maximum height after transformation
        keep_size (bool): Whether to keep original size

    Returns:
        np.ndarray: Transformed keypoints array with same shape as input

    """
    keypoints = keypoints.copy().astype(np.float32)

    height, width = image_shape[:2]

    x, y, z, angle, scale = (
        keypoints[:, 0],
        keypoints[:, 1],
        keypoints[:, 2],
        keypoints[:, 3],
        keypoints[:, 4],
    )

    # Reshape keypoints for perspective transform
    keypoint_vector = np.column_stack((x, y)).astype(np.float32).reshape(-1, 1, 2)

    # Apply perspective transform
    transformed_points = cv2.perspectiveTransform(keypoint_vector, matrix).squeeze()

    # Unsqueeze if we have a single keypoint
    if transformed_points.ndim == 1:
        transformed_points = transformed_points[np.newaxis, :]

    x, y = transformed_points[:, 0], transformed_points[:, 1]

    # Update angles
    angle += rotation2d_matrix_to_euler_angles(matrix[:2, :2], y_up=True)

    # Calculate scale factors
    scale_x = np.sign(matrix[0, 0]) * np.sqrt(matrix[0, 0] ** 2 + matrix[0, 1] ** 2)
    scale_y = np.sign(matrix[1, 1]) * np.sqrt(matrix[1, 0] ** 2 + matrix[1, 1] ** 2)
    scale *= max(scale_x, scale_y)

    if keep_size:
        scale_x = width / max_width
        scale_y = height / max_height
        x *= scale_x
        y *= scale_y
        scale *= max(scale_x, scale_y)

    # Create the output array with unchanged z coordinate
    transformed_keypoints = np.column_stack([x, y, z, angle, scale])

    # If there are additional columns, preserve them
    if keypoints.shape[1] > NUM_KEYPOINTS_COLUMNS_IN_ALBUMENTATIONS:
        return np.column_stack(
            [
                transformed_keypoints,
                keypoints[:, NUM_KEYPOINTS_COLUMNS_IN_ALBUMENTATIONS:],
            ],
        )

    return transformed_keypoints

def is_identity_matrix(matrix: np.ndarray) -> bool:
    """Check if the given matrix is an identity matrix.

    Args:
        matrix (np.ndarray): A 3x3 affine transformation matrix.

    Returns:
        bool: True if the matrix is an identity matrix, False otherwise.

    """
    return np.allclose(matrix, np.eye(3, dtype=matrix.dtype))

def warp_affine_with_value_extension(
    image: np.ndarray,
    matrix: np.ndarray,
    dsize: tuple[int, int],
    flags: int,
    border_mode: int,
    border_value: tuple[float, ...] | float,
) -> np.ndarray:
    """Warp affine with value extension.

    This function warps an image with a given affine transformation matrix.
    It also extends the value to a sequence of floats.

    Args:
        image (np.ndarray): The image to warp.
        matrix (np.ndarray): The affine transformation matrix.
        dsize (tuple[int, int]): The size of the output image.
        flags (int): The flags for the warp.
        border_mode (int): The border mode to use.
        border_value (tuple[float, ...] | float): The value to pad the image with.

    Returns:
        np.ndarray: The warped image.

    """
    num_channels = get_num_channels(image)
    extended_value = extend_value(border_value, num_channels)

    return cv2.warpAffine(
        image,
        matrix,
        dsize,
        flags=flags,
        borderMode=border_mode,
        borderValue=extended_value,
    )

@preserve_channel_dim
def warp_affine(
    image: np.ndarray,
    matrix: np.ndarray,
    interpolation: int,
    fill: tuple[float, ...] | float,
    border_mode: int,
    output_shape: tuple[int, int],
) -> np.ndarray:
    """Apply an affine transformation to an image.

    This function transforms an image using the specified affine transformation matrix.
    If the transformation matrix is an identity matrix, the original image is returned.

    Args:
        image (np.ndarray): Input image to transform.
        matrix (np.ndarray): 2x3 or 3x3 affine transformation matrix.
        interpolation (int): Interpolation method for resampling.
        fill (tuple[float, ...] | float): Border value(s) to fill areas outside the transformed image.
        border_mode (int): OpenCV border mode for handling pixels outside the image boundaries.
        output_shape (tuple[int, int]): Shape (height, width) of the output image.

    Returns:
        np.ndarray: Affine-transformed image with dimensions specified by output_shape.

    """
    if is_identity_matrix(matrix):
        return image

    height = int(np.round(output_shape[0]))
    width = int(np.round(output_shape[1]))

    cv2_matrix = matrix[:2, :]

    warp_fn = maybe_process_in_chunks(
        warp_affine_with_value_extension,
        matrix=cv2_matrix,
        dsize=(width, height),
        flags=interpolation,
        border_mode=border_mode,
        border_value=fill,
    )
    return warp_fn(image)

@handle_empty_array("keypoints")
@angle_2pi_range
def keypoints_affine(
    keypoints: np.ndarray,
    matrix: np.ndarray,
    image_shape: tuple[int, int],
    scale: dict[str, float],
    border_mode: int,
) -> np.ndarray:
    """Apply an affine transformation to keypoints.

    This function transforms keypoints using the given affine transformation matrix.
    It handles reflection padding if necessary, updates coordinates, angles, and scales.

    Args:
        keypoints (np.ndarray): Array of keypoints with shape (N, 5+) where N is the number of keypoints.
            Each keypoint is represented as [x, y, z, angle, scale, ...].
        matrix (np.ndarray): The 2x3 or 3x3 affine transformation matrix.
        image_shape (tuple[int, int]): Shape of the image (height, width).
        scale (dict[str, float]): Dictionary containing scale factors for x and y directions.
            Expected keys are 'x' and 'y'.
        border_mode (int): Border mode for handling keypoints near image edges.
            Use cv2.BORDER_REFLECT_101, cv2.BORDER_REFLECT, etc.

    Returns:
        np.ndarray: Transformed keypoints array with the same shape as input.

    Notes:
        - The function applies reflection padding if the mode is in REFLECT_BORDER_MODES.
        - Coordinates (x, y) are transformed using the affine matrix.
        - Angles are adjusted based on the rotation component of the affine transformation.
        - Scales are multiplied by the maximum of x and y scale factors.
        - The @angle_2pi_range decorator ensures angles remain in the [0, 2π] range.

    Examples:
        >>> keypoints = np.array([[100, 100, 0, 0, 1]])  # (x, y, z, angle, scale)
        >>> matrix = np.array([[1.5, 0, 10], [0, 1.2, 20]])
        >>> scale = {'x': 1.5, 'y': 1.2}
        >>> transformed_keypoints = keypoints_affine(keypoints, matrix, (480, 640), scale, cv2.BORDER_REFLECT_101)

    """
    keypoints = keypoints.copy().astype(np.float32)

    if is_identity_matrix(matrix):
        return keypoints

    if border_mode in REFLECT_BORDER_MODES:
        # Step 1: Compute affine transform padding
        pad_left, pad_right, pad_top, pad_bottom = calculate_affine_transform_padding(
            matrix,
            image_shape,
        )
        grid_dimensions = get_pad_grid_dimensions(
            pad_top,
            pad_bottom,
            pad_left,
            pad_right,
            image_shape,
        )
        keypoints = generate_reflected_keypoints(
            keypoints,
            grid_dimensions,
            image_shape,
            center_in_origin=True,
        )

    # Extract x, y coordinates (z is preserved)
    xy = keypoints[:, :2]

    # Ensure matrix is 2x3
    if matrix.shape == (3, 3):
        matrix = matrix[:2]

    # Transform x, y coordinates
    xy_transformed = cv2.transform(xy.reshape(-1, 1, 2), matrix).squeeze()

    # Calculate angle adjustment
    angle_adjustment = rotation2d_matrix_to_euler_angles(matrix[:2, :2], y_up=False)

    # Update angles (now at index 3)
    keypoints[:, 3] = keypoints[:, 3] + angle_adjustment

    # Update scales (now at index 4)
    max_scale = max(scale["x"], scale["y"])
    keypoints[:, 4] *= max_scale

    # Update x, y coordinates and preserve z
    keypoints[:, :2] = xy_transformed

    return keypoints

@handle_empty_array("points")
def apply_affine_to_points(points: np.ndarray, matrix: np.ndarray) -> np.ndarray:
    """Apply affine transformation to a set of points.

    This function handles potential division by zero by replacing zero values
    in the homogeneous coordinate with a small epsilon value.

    Args:
        points (np.ndarray): Array of points with shape (N, 2).
        matrix (np.ndarray): 3x3 affine transformation matrix.

    Returns:
        np.ndarray: Transformed points with shape (N, 2).

    """
    homogeneous_points = np.column_stack([points, np.ones(points.shape[0])])
    transformed_points = homogeneous_points @ matrix.T

    # Handle potential division by zero
    epsilon = np.finfo(transformed_points.dtype).eps
    transformed_points[:, 2] = np.where(
        np.abs(transformed_points[:, 2]) < epsilon,
        np.sign(transformed_points[:, 2]) * epsilon,
        transformed_points[:, 2],
    )

    return transformed_points[:, :2] / transformed_points[:, 2:]

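# NOTE (editorial sketch, not part of the package source): points are lifted to
# homogeneous coordinates [x, y, 1], multiplied by the 3x3 matrix, then divided
# by the third component. For a pure translation that divisor is 1:
#
#     >>> pts = np.array([[0.0, 0.0], [1.0, 2.0]])
#     >>> M = np.array([[1.0, 0.0, 5.0], [0.0, 1.0, 10.0], [0.0, 0.0, 1.0]])
#     >>> apply_affine_to_points(pts, M)  # -> [[5., 10.], [6., 12.]]
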
def calculate_affine_transform_padding(
    matrix: np.ndarray,
    image_shape: tuple[int, int],
) -> tuple[int, int, int, int]:
    """Calculate the necessary padding for an affine transformation to avoid empty spaces."""
    height, width = image_shape[:2]

    # Check for identity transform
    if is_identity_matrix(matrix):
        return (0, 0, 0, 0)

    # Original corners
    corners = np.array([[0, 0], [width, 0], [width, height], [0, height]])

    # Transform corners
    transformed_corners = apply_affine_to_points(corners, matrix)

    # Ensure transformed_corners is 2D
    transformed_corners = transformed_corners.reshape(-1, 2)

    # Find box that includes both original and transformed corners
    all_corners = np.vstack((corners, transformed_corners))
    min_x, min_y = all_corners.min(axis=0)
    max_x, max_y = all_corners.max(axis=0)

    # Compute the inverse transform
    inverse_matrix = np.linalg.inv(matrix)

    # Apply inverse transform to all corners of the bounding box
    bbox_corners = np.array(
        [[min_x, min_y], [max_x, min_y], [max_x, max_y], [min_x, max_y]],
    )
    inverse_corners = apply_affine_to_points(bbox_corners, inverse_matrix).reshape(
        -1,
        2,
    )

    min_x, min_y = inverse_corners.min(axis=0)
    max_x, max_y = inverse_corners.max(axis=0)

    pad_left = max(0, math.ceil(0 - min_x))
    pad_right = max(0, math.ceil(max_x - width))
    pad_top = max(0, math.ceil(0 - min_y))
    pad_bottom = max(0, math.ceil(max_y - height))

    return pad_left, pad_right, pad_top, pad_bottom

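# NOTE (editorial sketch, not part of the package source): translating a
# 100x100 image 50 px to the right exposes a 50 px strip on the left, so only
# pad_left is non-zero. Return order is (left, right, top, bottom):
#
#     >>> M = np.array([[1.0, 0.0, 50.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]])
#     >>> calculate_affine_transform_padding(M, (100, 100))
#     (50, 0, 0, 0)
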
@handle_empty_array("bboxes")
def bboxes_affine_largest_box(bboxes: np.ndarray, matrix: np.ndarray) -> np.ndarray:
    """Apply an affine transformation to bounding boxes and return the largest enclosing boxes.

    This function transforms each corner of every bounding box using the given affine transformation
    matrix, then computes the new bounding boxes that fully enclose the transformed corners.

    Args:
        bboxes (np.ndarray): An array of bounding boxes with shape (N, 4+) where N is the number of
            bounding boxes. Each row should contain [x_min, y_min, x_max, y_max]
            followed by any additional attributes (e.g., class labels).
        matrix (np.ndarray): The 3x3 affine transformation matrix to apply.

    Returns:
        np.ndarray: An array of transformed bounding boxes with the same shape as the input.
            Each row contains [new_x_min, new_y_min, new_x_max, new_y_max] followed by
            any additional attributes from the input bounding boxes.

    Note:
        - This function assumes that the input bounding boxes are in the format [x_min, y_min, x_max, y_max].
        - The resulting bounding boxes are the smallest axis-aligned boxes that completely
          enclose the transformed original boxes. They may be larger than the minimal possible
          bounding box if the original box becomes rotated.
        - Any additional attributes beyond the first 4 coordinates are preserved unchanged.
        - This method is called "largest box" because it returns the largest axis-aligned box
          that encloses all corners of the transformed bounding box.

    Examples:
        >>> bboxes = np.array([[10, 10, 20, 20, 1], [30, 30, 40, 40, 2]])  # Two boxes with class labels
        >>> matrix = np.array([[2, 0, 5], [0, 2, 5], [0, 0, 1]])  # Scale by 2 and translate by (5, 5)
        >>> transformed_bboxes = bboxes_affine_largest_box(bboxes, matrix)
        >>> print(transformed_bboxes)
        [[ 25.  25.  45.  45.   1.]
         [ 65.  65.  85.  85.   2.]]

    """
    # Extract corners of all bboxes
    x_min, y_min, x_max, y_max = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3]

    corners = (
        np.array([[x_min, y_min], [x_max, y_min], [x_max, y_max], [x_min, y_max]]).transpose(2, 0, 1).reshape(-1, 2)
    )

    # Transform all corners at once
    transformed_corners = apply_affine_to_points(corners, matrix).reshape(-1, 4, 2)

    # Compute new bounding boxes
    new_x_min = np.min(transformed_corners[:, :, 0], axis=1)
    new_x_max = np.max(transformed_corners[:, :, 0], axis=1)
    new_y_min = np.min(transformed_corners[:, :, 1], axis=1)
    new_y_max = np.max(transformed_corners[:, :, 1], axis=1)

    return np.column_stack([new_x_min, new_y_min, new_x_max, new_y_max, bboxes[:, 4:]])

@handle_empty_array("bboxes")
def bboxes_affine_ellipse(bboxes: np.ndarray, matrix: np.ndarray) -> np.ndarray:
    """Apply an affine transformation to bounding boxes using an ellipse approximation method.

    This function transforms bounding boxes by approximating each box with an ellipse,
    transforming points along the ellipse's circumference, and then computing the
    new bounding box that encloses the transformed ellipse.

    Args:
        bboxes (np.ndarray): An array of bounding boxes with shape (N, 4+) where N is the number of
            bounding boxes. Each row should contain [x_min, y_min, x_max, y_max]
            followed by any additional attributes (e.g., class labels).
        matrix (np.ndarray): The 3x3 affine transformation matrix to apply.

    Returns:
        np.ndarray: An array of transformed bounding boxes with the same shape as the input.
            Each row contains [new_x_min, new_y_min, new_x_max, new_y_max] followed by
            any additional attributes from the input bounding boxes.

    Note:
        - This function assumes that the input bounding boxes are in the format [x_min, y_min, x_max, y_max].
        - The ellipse approximation method can provide a tighter bounding box compared to the
          largest box method, especially for rotations.
        - 360 points are used to approximate each ellipse, which provides a good balance between
          accuracy and computational efficiency.
        - Any additional attributes beyond the first 4 coordinates are preserved unchanged.
        - This method may be more suitable for objects that are roughly elliptical in shape.

    """
    x_min, y_min, x_max, y_max = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3]
    bbox_width = (x_max - x_min) / 2
    bbox_height = (y_max - y_min) / 2
    center_x = x_min + bbox_width
    center_y = y_min + bbox_height

    angles = np.arange(0, 360, dtype=np.float32)
    cos_angles = np.cos(np.radians(angles))
    sin_angles = np.sin(np.radians(angles))

    # Generate points for all ellipses at once
    x = bbox_width[:, np.newaxis] * sin_angles + center_x[:, np.newaxis]
    y = bbox_height[:, np.newaxis] * cos_angles + center_y[:, np.newaxis]
    points = np.stack([x, y], axis=-1).reshape(-1, 2)

    # Transform all points at once using the helper function
    transformed_points = apply_affine_to_points(points, matrix)

    transformed_points = transformed_points.reshape(len(bboxes), -1, 2)

    # Compute new bounding boxes
    new_x_min = np.min(transformed_points[:, :, 0], axis=1)
    new_x_max = np.max(transformed_points[:, :, 0], axis=1)
    new_y_min = np.min(transformed_points[:, :, 1], axis=1)
    new_y_max = np.max(transformed_points[:, :, 1], axis=1)

    return np.column_stack([new_x_min, new_y_min, new_x_max, new_y_max, bboxes[:, 4:]])

@handle_empty_array("bboxes")
def bboxes_affine(
    bboxes: np.ndarray,
    matrix: np.ndarray,
    rotate_method: Literal["largest_box", "ellipse"],
    image_shape: tuple[int, int],
    border_mode: int,
    output_shape: tuple[int, int],
) -> np.ndarray:
    """Apply an affine transformation to bounding boxes.

    For reflection border modes (cv2.BORDER_REFLECT_101, cv2.BORDER_REFLECT), this function:
    1. Calculates necessary padding to avoid information loss
    2. Applies padding to the bounding boxes
    3. Adjusts the transformation matrix to account for padding
    4. Applies the affine transformation
    5. Validates the transformed bounding boxes

    For other border modes, it directly applies the affine transformation without padding.

    Args:
        bboxes (np.ndarray): Input bounding boxes
        matrix (np.ndarray): Affine transformation matrix
        rotate_method (Literal["largest_box", "ellipse"]): Method for rotating bounding boxes
        image_shape (tuple[int, int]): Shape of the input image
        border_mode (int): OpenCV border mode
        output_shape (tuple[int, int]): Shape of the output image

    Returns:
        np.ndarray: Transformed and normalized bounding boxes

    """
    if is_identity_matrix(matrix):
        return bboxes

    bboxes = denormalize_bboxes(bboxes, image_shape)

    if border_mode in REFLECT_BORDER_MODES:
        # Step 1: Compute affine transform padding
        pad_left, pad_right, pad_top, pad_bottom = calculate_affine_transform_padding(
            matrix,
            image_shape,
        )
        grid_dimensions = get_pad_grid_dimensions(
            pad_top,
            pad_bottom,
            pad_left,
            pad_right,
            image_shape,
        )
        bboxes = generate_reflected_bboxes(
            bboxes,
            grid_dimensions,
            image_shape,
            center_in_origin=True,
        )

    # Apply affine transform
    if rotate_method == "largest_box":
        transformed_bboxes = bboxes_affine_largest_box(bboxes, matrix)
    elif rotate_method == "ellipse":
        transformed_bboxes = bboxes_affine_ellipse(bboxes, matrix)
    else:
        raise ValueError(f"Method {rotate_method} is not a valid rotation method.")

    # Validate and normalize bboxes
    validated_bboxes = validate_bboxes(transformed_bboxes, output_shape)

    return normalize_bboxes(validated_bboxes, output_shape)

def to_distance_maps(
    keypoints: np.ndarray,
    image_shape: tuple[int, int],
    inverted: bool = False,
) -> np.ndarray:
    """Generate a ``(H,W,N)`` array of distance maps for ``N`` keypoints.

    The ``n``-th distance map contains at every location ``(y, x)`` the
    euclidean distance to the ``n``-th keypoint.
    This function can be used as a helper when augmenting keypoints with a
    method that only supports the augmentation of images.

    Args:
        keypoints (np.ndarray): A numpy array of shape (N, 2+) where N is the number of keypoints.
            Each row represents a keypoint's (x, y) coordinates.
        image_shape (tuple[int, int]): Shape of the image (height, width)
        inverted (bool): If ``True``, inverted distance maps are returned where each
            distance value d is replaced by ``d/(d+1)``, i.e. the distance
            maps have values in the range ``(0.0, 1.0]`` with ``1.0`` denoting
            exactly the position of the respective keypoint.

    Returns:
        np.ndarray: A float32 array of shape (H, W, N) containing ``N`` distance maps for ``N``
            keypoints. Each location ``(y, x, n)`` in the array denotes the
            euclidean distance at ``(y, x)`` to the ``n``-th keypoint.
            If `inverted` is ``True``, the distance ``d`` is replaced
            by ``d/(d+1)``. The height and width of the array match the
            height and width in ``image_shape``.

    """
    height, width = image_shape[:2]
    if len(keypoints) == 0:
        return np.zeros((height, width, 0), dtype=np.float32)

    # Create coordinate grids
    yy, xx = np.mgrid[:height, :width]

    # Convert keypoints to numpy array
    keypoints_array = np.array(keypoints)

    # Compute distances for all keypoints at once
    distances = np.sqrt(
        (xx[..., np.newaxis] - keypoints_array[:, 0]) ** 2 + (yy[..., np.newaxis] - keypoints_array[:, 1]) ** 2,
    )

    if inverted:
        return (1 / (distances + 1)).astype(np.float32)
    return distances.astype(np.float32)

def validate_if_not_found_coords(
    if_not_found_coords: Sequence[int] | dict[str, Any] | None,
) -> tuple[bool, float, float]:
    """Validate and process `if_not_found_coords` parameter."""
    if if_not_found_coords is None:
        return True, -1, -1
    if isinstance(if_not_found_coords, (tuple, list)):
        if len(if_not_found_coords) != PAIR:
            msg = "Expected tuple/list 'if_not_found_coords' to contain exactly two entries."
            raise ValueError(msg)
        return False, if_not_found_coords[0], if_not_found_coords[1]
    if isinstance(if_not_found_coords, dict):
        return False, if_not_found_coords["x"], if_not_found_coords["y"]

    msg = "Expected if_not_found_coords to be None, tuple, list, or dict."
    raise ValueError(msg)


def find_keypoint(
    position: tuple[int, int],
    distance_map: np.ndarray,
    threshold: float | None,
    inverted: bool,
) -> tuple[float, float] | None:
    """Determine if a valid keypoint can be found at the given position."""
    y, x = position
    value = distance_map[y, x]
    if not inverted and threshold is not None and value >= threshold:
        return None
    if inverted and threshold is not None and value <= threshold:
        return None
    return float(x), float(y)

def from_distance_maps(
    distance_maps: np.ndarray,
    inverted: bool,
    if_not_found_coords: Sequence[int] | dict[str, Any] | None = None,
    threshold: float | None = None,
) -> np.ndarray:
    """Convert distance maps back to keypoints coordinates.

    This function is the inverse of `to_distance_maps`. It takes distance maps generated for a set of keypoints
    and reconstructs the original keypoint coordinates. The function supports both regular and inverted distance maps,
    and can handle cases where keypoints are not found or fall outside a specified threshold.

    Args:
        distance_maps (np.ndarray): A 3D numpy array of shape (height, width, nb_keypoints) containing
            distance maps for each keypoint. Each channel represents the distance map for one keypoint.
        inverted (bool): If True, treats the distance maps as inverted (where higher values indicate
            closer proximity to keypoints). If False, treats them as regular distance maps (where lower
            values indicate closer proximity).
        if_not_found_coords (Sequence[int] | dict[str, Any] | None, optional): Coordinates to use for
            keypoints that are not found or fall outside the threshold. Can be:
            - None: Drop keypoints that are not found.
            - Sequence of two integers: Use these as (x, y) coordinates for not found keypoints.
            - Dict with 'x' and 'y' keys: Use these values for not found keypoints.
            Defaults to None.
        threshold (float | None, optional): A threshold value to determine valid keypoints. For inverted
            maps, values >= threshold are considered valid. For regular maps, values <= threshold are
            considered valid. If None, all keypoints are considered valid. Defaults to None.

    Returns:
        np.ndarray: A 2D numpy array of shape (nb_keypoints, 2) containing the (x, y) coordinates
            of the reconstructed keypoints. If `drop_if_not_found` is True (derived from if_not_found_coords),
            the output may have fewer rows than input keypoints.

    Raises:
        ValueError: If the input `distance_maps` is not a 3D array.

    Notes:
        - The function uses vectorized operations for improved performance, especially with large numbers of keypoints.
        - When `threshold` is None, all keypoints are considered valid, and `if_not_found_coords` is not used.
        - The function assumes that the input distance maps are properly normalized and scaled according to the
          original image dimensions.

    Examples:
        >>> distance_maps = np.random.rand(100, 100, 3)  # 3 keypoints
        >>> inverted = True
        >>> if_not_found_coords = [0, 0]
        >>> threshold = 0.5
        >>> keypoints = from_distance_maps(distance_maps, inverted, if_not_found_coords, threshold)
        >>> print(keypoints.shape)
        (3, 2)

    """
    if distance_maps.ndim != NUM_MULTI_CHANNEL_DIMENSIONS:
        msg = f"Expected three-dimensional input, got {distance_maps.ndim} dimensions and shape {distance_maps.shape}."
        raise ValueError(msg)
    height, width, nb_keypoints = distance_maps.shape

    drop_if_not_found, if_not_found_x, if_not_found_y = validate_if_not_found_coords(
        if_not_found_coords,
    )

    # Find the indices of max/min values for all keypoints at once
    if inverted:
        hitidx_flat = np.argmax(
            distance_maps.reshape(height * width, nb_keypoints),
            axis=0,
        )
    else:
        hitidx_flat = np.argmin(
            distance_maps.reshape(height * width, nb_keypoints),
            axis=0,
        )

    # Convert flat indices to 2D coordinates
    hitidx_y, hitidx_x = np.unravel_index(hitidx_flat, (height, width))

    # Create keypoints array
    keypoints = np.column_stack((hitidx_x, hitidx_y)).astype(float)

    if threshold is not None:
        # Check threshold condition
        if inverted:
            valid_mask = distance_maps[hitidx_y, hitidx_x, np.arange(nb_keypoints)] >= threshold
        else:
            valid_mask = distance_maps[hitidx_y, hitidx_x, np.arange(nb_keypoints)] <= threshold

        if not drop_if_not_found:
            # Replace invalid keypoints with if_not_found_coords
            keypoints[~valid_mask] = [if_not_found_x, if_not_found_y]
        else:
            # Keep only valid keypoints
            return keypoints[valid_mask]

    return keypoints

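# NOTE (editorial sketch, not part of the package source): keypoints at integer
# pixel positions survive a round trip through inverted distance maps, since
# the argmax of each map sits exactly on its keypoint:
#
#     >>> kps = np.array([[10.0, 20.0], [30.0, 40.0]])
#     >>> maps = to_distance_maps(kps, (64, 64), inverted=True)
#     >>> from_distance_maps(maps, inverted=True)  # -> [[10., 20.], [30., 40.]]
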
# Group elements for D4 symmetry group
D4_GROUP_ELEMENTS = ["e", "r90", "r180", "r270", "v", "hvt", "h", "t"]


def d4(img: np.ndarray, group_member: Literal["e", "r90", "r180", "r270", "v", "hvt", "h", "t"]) -> np.ndarray:
    """Applies a `D_4` symmetry group transformation to an image array.

    This function manipulates an image using transformations such as rotations and flips,
    corresponding to the `D_4` dihedral group symmetry operations.
    Each transformation is identified by a unique group member code.

    Args:
        img (np.ndarray): The input image array to transform.
        group_member (Literal["e", "r90", "r180", "r270", "v", "hvt", "h", "t"]): A string identifier indicating
            the specific transformation to apply. Valid codes include:
            - 'e': Identity (no transformation).
            - 'r90': Rotate 90 degrees counterclockwise.
            - 'r180': Rotate 180 degrees.
            - 'r270': Rotate 270 degrees counterclockwise.
            - 'v': Vertical flip.
            - 'hvt': Reflect over the anti-diagonal (second diagonal).
            - 'h': Horizontal flip.
            - 't': Transpose (reflect over the main diagonal).

    Returns:
        np.ndarray: The transformed image array.

    """
    # Execute the appropriate transformation
    return D4_TRANSFORMATIONS[group_member](img)


def transpose(img: np.ndarray) -> np.ndarray:
    """Transposes the first two dimensions of an array of any dimensionality.
    Retains the order of any additional dimensions.

    Args:
        img (np.ndarray): Input array.

    Returns:
        np.ndarray: Transposed array.

    """
    # Generate the new axes order
    new_axes = list(range(img.ndim))
    new_axes[0], new_axes[1] = 1, 0  # Swap the first two dimensions

    # Transpose the array using the new axes order
    return img.transpose(new_axes)


D4_TRANSFORMATIONS = {
    "e": lambda x: x,  # Identity transformation
    "r90": lambda x: rot90(x, 1),  # Rotate 90 degrees
    "r180": lambda x: rot90(x, 2),  # Rotate 180 degrees
    "r270": lambda x: rot90(x, 3),  # Rotate 270 degrees
    "v": vflip,  # Vertical flip
    "hvt": lambda x: transpose(rot90(x, 2)),  # Reflect over anti-diagonal
    "h": hflip,  # Horizontal flip
    "t": transpose,  # Transpose (reflect over main diagonal)
}

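# NOTE (editorial sketch, not part of the package source): 'r90' maps an (H, W)
# array to (W, H), and 'e' leaves the input untouched:
#
#     >>> img = np.arange(6).reshape(2, 3)
#     >>> d4(img, "r90").shape
#     (3, 2)
#     >>> bool(np.array_equal(d4(img, "e"), img))
#     True
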
def transpose_images(images: np.ndarray) -> np.ndarray:
    """Transpose a batch of images.

    Args:
        images (np.ndarray): Batch of images to transpose with shape:
            - (N, H, W) for grayscale images
            - (N, H, W, C) for multi-channel images
            where N is the batch size, H is height, W is width, C is channels

    Returns:
        np.ndarray: Transposed batch of images with shape:
            - (N, W, H) for grayscale images
            - (N, W, H, C) for multi-channel images

    """
    # Generate the new axes order
    new_axes = list(range(images.ndim))
    # Swap dimensions 1 and 2 (Height and Width), preserving batch dimension and channels
    new_axes[1], new_axes[2] = 2, 1

    # Transpose the array using the new axes order
    return images.transpose(new_axes)


def transpose_volumes(volumes: np.ndarray) -> np.ndarray:
    """Transpose a batch of volumes.

    Args:
        volumes (np.ndarray): Batch of volumes to transpose with shape:
            - (N, D, H, W) for grayscale volumes
            - (N, D, H, W, C) for multi-channel volumes
            where N is the batch size, D is depth, H is height, W is width, C is channels

    Returns:
        np.ndarray: Transposed batch of volumes with shape:
            - (N, D, W, H) for grayscale volumes
            - (N, D, W, H, C) for multi-channel volumes

    """
    # Generate the new axes order
    new_axes = list(range(volumes.ndim))
    # Swap dimensions 2 and 3 (Height and Width), preserving batch, depth and channels
    new_axes[2], new_axes[3] = 3, 2

    # Transpose the array using the new axes order
    return volumes.transpose(new_axes)


def rot90(img: np.ndarray, factor: Literal[0, 1, 2, 3]) -> np.ndarray:
    """Rotate an image 90 degrees counterclockwise a specified number of times.

    Args:
        img (np.ndarray): The input image to rotate.
        factor (Literal[0, 1, 2, 3]): The number of 90-degree rotations to apply.

    Returns:
        np.ndarray: The rotated image.

    """
    return np.rot90(img, factor)


def rot90_images(images: np.ndarray, factor: Literal[0, 1, 2, 3]) -> np.ndarray:
    """Rotate a batch of images 90 degrees counter-clockwise multiple times.

    Args:
        images (np.ndarray): Batch of images to rotate with shape:
            - (N, H, W) for grayscale images
            - (N, H, W, C) for multi-channel images
            where N is the batch size, H is height, W is width, C is channels
        factor (Literal[0, 1, 2, 3]): The number of 90-degree rotations to apply.

    Returns:
        np.ndarray: Rotated batch of images with shape:
            - (N, W, H) for grayscale images when factor is 1 or 3
            - (N, H, W) for grayscale images when factor is 0 or 2
            - (N, W, H, C) for multi-channel images when factor is 1 or 3
            - (N, H, W, C) for multi-channel images when factor is 0 or 2

    """
    # Axes 1 (height) and 2 (width) for rotation, preserving batch dimension
    return np.rot90(images, k=factor, axes=(1, 2))

@handle_empty_array("bboxes")
def bboxes_vflip(bboxes: np.ndarray) -> np.ndarray:
    """Flip bounding boxes vertically.

    Args:
        bboxes (np.ndarray): Array of bounding boxes with shape (num_boxes, 4+)

    Returns:
        np.ndarray: Vertically flipped bounding boxes

    """
    flipped_bboxes = bboxes.copy()
    flipped_bboxes[:, 1] = 1 - bboxes[:, 3]  # new y_min = 1 - y_max
    flipped_bboxes[:, 3] = 1 - bboxes[:, 1]  # new y_max = 1 - y_min

    return flipped_bboxes


@handle_empty_array("bboxes")
def bboxes_hflip(bboxes: np.ndarray) -> np.ndarray:
    """Flip bounding boxes horizontally.

    Args:
        bboxes (np.ndarray): Array of bounding boxes with shape (num_boxes, 4+)

    Returns:
        np.ndarray: Horizontally flipped bounding boxes

    """
    flipped_bboxes = bboxes.copy()
    flipped_bboxes[:, 0] = 1 - bboxes[:, 2]  # new x_min = 1 - x_max
    flipped_bboxes[:, 2] = 1 - bboxes[:, 0]  # new x_max = 1 - x_min

    return flipped_bboxes


@handle_empty_array("bboxes")
def bboxes_transpose(bboxes: np.ndarray) -> np.ndarray:
    """Transpose bounding boxes along the main diagonal.

    Args:
        bboxes (np.ndarray): Array of bounding boxes with shape (num_boxes, 4+)

    Returns:
        np.ndarray: Transposed bounding boxes

    """
    transposed_bboxes = bboxes.copy()
    transposed_bboxes[:, [0, 1, 2, 3]] = bboxes[:, [1, 0, 3, 2]]

    return transposed_bboxes

1369
+ @handle_empty_array("keypoints")
1370
+ @angle_2pi_range
1371
+ def keypoints_vflip(keypoints: np.ndarray, rows: int) -> np.ndarray:
1372
+ """Flip keypoints vertically.
1373
+
1374
+ Args:
1375
+ keypoints (np.ndarray): Array of keypoints with shape (num_keypoints, 2+)
1376
+ rows (int): Number of rows in the image
1377
+
1378
+ Returns:
1379
+ np.ndarray: Vertically flipped keypoints
1380
+
1381
+ """
1382
+ flipped_keypoints = keypoints.copy().astype(np.float32)
1383
+
1384
+ # Flip y-coordinates
1385
+ flipped_keypoints[:, 1] = (rows - 1) - keypoints[:, 1]
1386
+
1387
+ # Negate angles
1388
+ flipped_keypoints[:, 3] = -keypoints[:, 3]
1389
+
1390
+ return flipped_keypoints
1391
+
1392
+
1393
+ @handle_empty_array("keypoints")
1394
+ @angle_2pi_range
1395
+ def keypoints_hflip(keypoints: np.ndarray, cols: int) -> np.ndarray:
1396
+ """Flip keypoints horizontally.
1397
+
1398
+ Args:
1399
+ keypoints (np.ndarray): Array of keypoints with shape (num_keypoints, 4+), where column 3 holds the angle
1400
+ cols (int): Number of columns in the image
1401
+
1402
+ Returns:
1403
+ np.ndarray: Horizontally flipped keypoints
1404
+
1405
+ """
1406
+ flipped_keypoints = keypoints.copy().astype(np.float32)
1407
+
1408
+ # Flip x-coordinates
1409
+ flipped_keypoints[:, 0] = (cols - 1) - keypoints[:, 0]
1410
+
1411
+ # Adjust angles
1412
+ flipped_keypoints[:, 3] = np.pi - keypoints[:, 3]
1413
+
1414
+ return flipped_keypoints
1415
+
1416
+
1417
+ @handle_empty_array("keypoints")
1418
+ @angle_2pi_range
1419
+ def keypoints_transpose(keypoints: np.ndarray) -> np.ndarray:
1420
+ """Transpose keypoints along the main diagonal.
1421
+
1422
+ Args:
1423
+ keypoints (np.ndarray): Array of keypoints with shape (num_keypoints, 4+), where column 3 holds the angle
1424
+
1425
+ Returns:
1426
+ np.ndarray: Transposed keypoints
1427
+
1428
+ """
1429
+ transposed_keypoints = keypoints.copy()
1430
+
1431
+ # Swap x and y coordinates
1432
+ transposed_keypoints[:, [0, 1]] = keypoints[:, [1, 0]]
1433
+
1434
+ # Adjust angles to reflect the coordinate swap
1435
+ angles = keypoints[:, 3]
1436
+ transposed_keypoints[:, 3] = np.where(
1437
+ angles <= np.pi,
1438
+ np.pi / 2 - angles,
1439
+ 3 * np.pi / 2 - angles,
1440
+ )
1441
+
1442
+ return transposed_keypoints
1443
+
1444
+
1445
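The np.where branch above implements the diagonal reflection of the angle channel. A minimal sketch of just that arithmetic (the @angle_2pi_range decorator handles the final wrapping into [0, 2*pi)):

```python
import numpy as np

# In image coordinates (y points down), transposing swaps "down" (pi/2)
# with "right" (0); the diagonal direction pi/4 maps to itself.
angles = np.array([np.pi / 4, np.pi / 2])
swapped = np.where(angles <= np.pi, np.pi / 2 - angles, 3 * np.pi / 2 - angles)
print(swapped)  # [0.785... 0.] -> pi/4 unchanged, pi/2 becomes 0
```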
+ @preserve_channel_dim
1446
+ def pad(
1447
+ img: np.ndarray,
1448
+ min_height: int,
1449
+ min_width: int,
1450
+ border_mode: int,
1451
+ value: tuple[float, ...] | float | None,
1452
+ ) -> np.ndarray:
1453
+ """Pad an image to ensure minimum dimensions.
1454
+
1455
+ This function adds padding to an image if its dimensions are smaller than
1456
+ the specified minimum dimensions. Padding is added evenly on all sides.
1457
+
1458
+ Args:
1459
+ img (np.ndarray): Input image to pad.
1460
+ min_height (int): Minimum height of the output image.
1461
+ min_width (int): Minimum width of the output image.
1462
+ border_mode (int): OpenCV border mode for padding.
1463
+ value (tuple[float, ...] | float | None): Value(s) to fill the border pixels.
1464
+
1465
+ Returns:
1466
+ np.ndarray: Padded image with dimensions at least (min_height, min_width).
1467
+
1468
+ """
1469
+ height, width = img.shape[:2]
1470
+
1471
+ if height < min_height:
1472
+ h_pad_top = int((min_height - height) / 2.0)
1473
+ h_pad_bottom = min_height - height - h_pad_top
1474
+ else:
1475
+ h_pad_top = 0
1476
+ h_pad_bottom = 0
1477
+
1478
+ if width < min_width:
1479
+ w_pad_left = int((min_width - width) / 2.0)
1480
+ w_pad_right = min_width - width - w_pad_left
1481
+ else:
1482
+ w_pad_left = 0
1483
+ w_pad_right = 0
1484
+
1485
+ img = pad_with_params(
1486
+ img,
1487
+ h_pad_top,
1488
+ h_pad_bottom,
1489
+ w_pad_left,
1490
+ w_pad_right,
1491
+ border_mode,
1492
+ value,
1493
+ )
1494
+
1495
+ if img.shape[:2] != (max(min_height, height), max(min_width, width)):
1496
+ raise RuntimeError(
1497
+ f"Invalid result shape. Got: {img.shape[:2]}. Expected: {(max(min_height, height), max(min_width, width))}",
1498
+ )
1499
+
1500
+ return img
1501
+
1502
+
1503
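The even split assigns the extra pixel of an odd deficit to the bottom (and right) side. A quick sketch of the arithmetic:

```python
# A 95-pixel-high image padded to min_height=100: the 5-pixel deficit
# splits as 2 on top and 3 on the bottom.
height, min_height = 95, 100
h_pad_top = int((min_height - height) / 2.0)    # 2
h_pad_bottom = min_height - height - h_pad_top  # 3
print(h_pad_top, h_pad_bottom)  # 2 3
```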
+ def extend_value(value: tuple[float, ...] | float, num_channels: int) -> Sequence[float]:
1504
+ """Extend value to a sequence of floats.
1505
+
1506
+ Broadcast a scalar fill value to one entry per channel; a sequence is
1506
+ returned unchanged. Used when padding an image with a constant value.
1508
+
1509
+ Args:
1510
+ value (tuple[float, ...] | float): The value to extend.
1511
+ num_channels (int): The number of channels in the image.
1512
+
1513
+ Returns:
1514
+ Sequence[float]: The extended value.
1515
+
1516
+ """
1517
+ return [value] * num_channels if isinstance(value, float) else value
1518
+
1519
+
1520
+ def copy_make_border_with_value_extension(
1521
+ img: np.ndarray,
1522
+ top: int,
1523
+ bottom: int,
1524
+ left: int,
1525
+ right: int,
1526
+ border_mode: int,
1527
+ value: tuple[float, ...] | float,
1528
+ ) -> np.ndarray:
1529
+ """Copy and make border with value extension.
1530
+
1531
+ Wraps cv2.copyMakeBorder, first broadcasting a scalar fill value to one
1532
+ entry per channel so that constant padding works for any channel count.
1533
+
1534
+ Args:
1535
+ img (np.ndarray): The image to pad.
1536
+ top (int): The amount to pad the top of the image.
1537
+ bottom (int): The amount to pad the bottom of the image.
1538
+ left (int): The amount to pad the left of the image.
1539
+ right (int): The amount to pad the right of the image.
1540
+ border_mode (int): The border mode to use.
1541
+ value (tuple[float, ...] | float): The value to pad the image with.
1542
+
1543
+ Returns:
1544
+ np.ndarray: The padded image.
1545
+
1546
+ """
1547
+ # For empty (e.g., zero-channel) images, return an empty array of the padded size
1548
+ if img.size == 0:
1549
+ height, width = img.shape[:2]
1550
+ return np.zeros(
1551
+ (height + top + bottom, width + left + right, 0),
1552
+ dtype=img.dtype,
1553
+ )
1554
+
1555
+ num_channels = get_num_channels(img)
1556
+ extended_value = extend_value(value, num_channels)
1557
+
1558
+ return cv2.copyMakeBorder(
1559
+ img,
1560
+ top,
1561
+ bottom,
1562
+ left,
1563
+ right,
1564
+ borderType=border_mode,
1565
+ value=extended_value,
1566
+ )
1567
+
1568
+
1569
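A minimal sketch of why the extension matters: with BORDER_CONSTANT, a bare scalar passed to cv2.copyMakeBorder is treated as an OpenCV Scalar (v, 0, 0, 0), so only the first channel would get the fill value; broadcasting one entry per channel (mirroring extend_value) fills all channels:

```python
import cv2
import numpy as np

img = np.zeros((4, 4, 3), dtype=np.uint8)
value = 255.0
extended = [value] * img.shape[-1]  # [255.0, 255.0, 255.0], one per channel
padded = cv2.copyMakeBorder(
    img, 1, 1, 1, 1, borderType=cv2.BORDER_CONSTANT, value=extended
)
print(padded.shape)  # (6, 6, 3)
```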
+ @preserve_channel_dim
1570
+ def pad_with_params(
1571
+ img: np.ndarray,
1572
+ h_pad_top: int,
1573
+ h_pad_bottom: int,
1574
+ w_pad_left: int,
1575
+ w_pad_right: int,
1576
+ border_mode: int,
1577
+ value: tuple[float, ...] | float | None,
1578
+ ) -> np.ndarray:
1579
+ """Pad an image with explicitly defined padding on each side.
1580
+
1581
+ This function adds specified amounts of padding to each side of the image.
1582
+
1583
+ Args:
1584
+ img (np.ndarray): Input image to pad.
1585
+ h_pad_top (int): Number of pixels to add at the top.
1586
+ h_pad_bottom (int): Number of pixels to add at the bottom.
1587
+ w_pad_left (int): Number of pixels to add on the left.
1588
+ w_pad_right (int): Number of pixels to add on the right.
1589
+ border_mode (int): OpenCV border mode for padding.
1590
+ value (tuple[float, ...] | float | None): Value(s) to fill the border pixels.
1591
+
1592
+ Returns:
1593
+ np.ndarray: Padded image.
1594
+
1595
+ """
1596
+ pad_fn = maybe_process_in_chunks(
1597
+ copy_make_border_with_value_extension,
1598
+ top=h_pad_top,
1599
+ bottom=h_pad_bottom,
1600
+ left=w_pad_left,
1601
+ right=w_pad_right,
1602
+ border_mode=border_mode,
1603
+ value=value,
1604
+ )
1605
+
1606
+ return pad_fn(img)
1607
+
1608
+
1609
+ def pad_images_with_params(
1610
+ images: np.ndarray,
1611
+ h_pad_top: int,
1612
+ h_pad_bottom: int,
1613
+ w_pad_left: int,
1614
+ w_pad_right: int,
1615
+ border_mode: int,
1616
+ value: tuple[float, ...] | float | None,
1617
+ ) -> np.ndarray:
1618
+ """Pad a batch of images with explicitly defined padding on each side.
1619
+
1620
+ This function adds specified amounts of padding to each side of the image for each
1621
+ image in the batch.
1622
+
1623
+ Args:
1624
+ images (np.ndarray): Input batch of images to pad.
1625
+ h_pad_top (int): Number of pixels to add at the top.
1626
+ h_pad_bottom (int): Number of pixels to add at the bottom.
1627
+ w_pad_left (int): Number of pixels to add on the left.
1628
+ w_pad_right (int): Number of pixels to add on the right.
1629
+ border_mode (int): OpenCV border mode for padding.
1630
+ value (tuple[float, ...] | float | None): Value(s) to fill the border pixels.
1631
+
1632
+ Returns:
1633
+ np.ndarray: Padded batch of images.
1634
+
1635
+ """
1636
+ no_channel_dim = images.ndim == 3
1637
+ if no_channel_dim:
1638
+ images = images[..., np.newaxis]
1639
+
1640
+ cv2np_border_modes = {
1641
+ cv2.BORDER_CONSTANT: "constant",
1642
+ cv2.BORDER_REPLICATE: "edge",
1643
+ cv2.BORDER_REFLECT: "symmetric",
1644
+ cv2.BORDER_WRAP: "wrap",
1645
+ cv2.BORDER_REFLECT_101: "reflect",
1646
+ cv2.BORDER_REFLECT101: "reflect",
1647
+ cv2.BORDER_DEFAULT: "reflect", # same as cv2.BORDER_REFLECT_101
1648
+ }
1649
+ mode = cv2np_border_modes[border_mode]
1650
+
1651
+ pad_width = ((0, 0), (h_pad_top, h_pad_bottom), (w_pad_left, w_pad_right), (0, 0))
1652
+ if mode == "constant":
1653
+ constant_values = np.array(((0, 0), (value, value), (value, value), (0, 0)), dtype=object)
1654
+ kwargs = {"constant_values": constant_values}
1655
+ else:
1656
+ kwargs = {}
1657
+
1658
+ images = np.pad(images, pad_width=pad_width, mode=mode, **kwargs)
1659
+ if no_channel_dim:
1660
+ images = images[..., 0]
1661
+
1662
+ return images
1663
+
1664
+
1665
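A minimal sketch of the batched path: np.pad touches only the height and width axes, so the whole batch is padded in one call (mode "reflect" here corresponds to cv2.BORDER_REFLECT_101):

```python
import numpy as np

images = np.zeros((8, 10, 10, 3), dtype=np.uint8)  # (N, H, W, C)
padded = np.pad(images, ((0, 0), (2, 3), (1, 1), (0, 0)), mode="reflect")
print(padded.shape)  # (8, 15, 12, 3)
```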
+ @preserve_channel_dim
1666
+ def remap(
1667
+ img: np.ndarray,
1668
+ map_x: np.ndarray,
1669
+ map_y: np.ndarray,
1670
+ interpolation: int,
1671
+ border_mode: int,
1672
+ value: tuple[float, ...] | float | None = None,
1673
+ ) -> np.ndarray:
1674
+ """Remap an image according to given coordinate maps.
1675
+
1676
+ This function applies a generic geometrical transformation using
1677
+ mapping functions that specify, for each output pixel, the source position to sample.
1678
+
1679
+ Args:
1680
+ img (np.ndarray): Input image to transform.
1681
+ map_x (np.ndarray): Map of x-coordinates with same height and width as the input image.
1682
+ map_y (np.ndarray): Map of y-coordinates with same height and width as the input image.
1683
+ interpolation (int): Interpolation method for resampling.
1684
+ border_mode (int): OpenCV border mode for handling pixels outside the image boundaries.
1685
+ value (tuple[float, ...] | float | None, optional): Border value(s) if border_mode is BORDER_CONSTANT.
1686
+
1687
+ Returns:
1688
+ np.ndarray: Remapped image with the same shape as the input image.
1689
+
1690
+ """
1691
+ # Combine map_x and map_y into a single map array of type CV_32FC2
1692
+ map_xy = np.stack([map_x, map_y], axis=-1).astype(np.float32)
1693
+
1694
+ # Create remap function with chunks processing
1695
+ remap_func = maybe_process_in_chunks(
1696
+ cv2.remap,
1697
+ map1=map_xy,
1698
+ map2=None,
1699
+ interpolation=interpolation,
1700
+ borderMode=border_mode,
1701
+ borderValue=value,
1702
+ )
1703
+
1704
+ # Apply the remapping
1705
+ return remap_func(img)
1706
+
1707
+
1708
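A minimal sketch of the map convention: each output pixel samples the source at (map_x, map_y), so adding a constant to the identity x-map translates the image content:

```python
import cv2
import numpy as np

h, w = 50, 50
img = np.random.randint(0, 255, (h, w, 3), dtype=np.uint8)
map_x, map_y = np.meshgrid(
    np.arange(w, dtype=np.float32), np.arange(h, dtype=np.float32)
)
# Output pixel (y, x) samples source pixel (y, x + 10): content shifts left.
shifted = cv2.remap(img, map_x + 10, map_y, cv2.INTER_LINEAR,
                    borderMode=cv2.BORDER_CONSTANT, borderValue=0)
```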
+ def remap_keypoints_via_mask(
1709
+ keypoints: np.ndarray,
1710
+ map_x: np.ndarray,
1711
+ map_y: np.ndarray,
1712
+ image_shape: tuple[int, int],
1713
+ ) -> np.ndarray:
1714
+ """Remap keypoints using mask and cv2.remap method."""
1715
+ height, width = image_shape[:2]
1716
+
1717
+ # Handle empty keypoints array
1718
+ if len(keypoints) == 0:
1719
+ return np.zeros((0, keypoints.shape[1] if keypoints.ndim > 1 else 2))  # preserve column count when known
1720
+
1721
+ # Create mask where each keypoint has unique index
1722
+ kp_mask = np.zeros((height, width), dtype=np.int16)
1723
+ for idx, kp in enumerate(keypoints, start=1):
1724
+ x, y = round(kp[0]), round(kp[1])
1725
+ if 0 <= x < width and 0 <= y < height:
1726
+ # Note: cv2.circle takes (x,y) coordinates
1727
+ cv2.circle(kp_mask, (x, y), 1, idx, -1)
1728
+
1729
+ # Remap the mask
1730
+ transformed_kp_mask = cv2.remap(
1731
+ kp_mask,
1732
+ map_x.astype(np.float32),
1733
+ map_y.astype(np.float32),
1734
+ cv2.INTER_NEAREST,
1735
+ )
1736
+
1737
+ # Extract transformed keypoints
1738
+ new_points = []
1739
+ for idx, kp in enumerate(keypoints, start=1):
1740
+ # Find points with this index
1741
+ points = np.where(transformed_kp_mask == idx)
1742
+ if len(points[0]) > 0:
1743
+ # Convert back to (x,y) coordinates
1744
+ new_points.append(np.concatenate([[points[1][0], points[0][0]], kp[2:]]))
1745
+
1746
+ return np.array(new_points) if new_points else np.zeros((0, keypoints.shape[1]))
1747
+
1748
+
1749
+ @handle_empty_array("keypoints")
1750
+ def remap_keypoints(
1751
+ keypoints: np.ndarray,
1752
+ map_x: np.ndarray,
1753
+ map_y: np.ndarray,
1754
+ image_shape: tuple[int, int],
1755
+ ) -> np.ndarray:
1756
+ """Transform keypoints using coordinate mapping functions.
1757
+
1758
+ This function applies the inverse of the mapping defined by map_x and map_y
1759
+ to keypoint coordinates. Inversion is necessary because the maps specify,
1760
+ for each destination pixel, which source pixel to sample, whereas keypoints
1761
+ are given in source coordinates and must be pushed forward to the destination.
1762
+
1763
+ Args:
1764
+ keypoints (np.ndarray): Array of keypoints with shape (N, 2+), where
1765
+ the first two columns are x and y coordinates.
1766
+ map_x (np.ndarray): Map of x-coordinates with shape equal to image_shape.
1767
+ map_y (np.ndarray): Map of y-coordinates with shape equal to image_shape.
1768
+ image_shape (tuple[int, int]): Shape (height, width) of the original image.
1769
+
1770
+ Returns:
1771
+ np.ndarray: Transformed keypoints with the same shape as the input keypoints.
1772
+ Returns an empty array if input keypoints is empty.
1773
+
1774
+ """
1775
+ height, width = image_shape[:2]
1776
+
1777
+ # Extract x and y coordinates
1778
+ x, y = keypoints[:, 0], keypoints[:, 1]
1779
+
1780
+ # Clip coordinates to image boundaries
1781
+ x = np.clip(x, 0, width - 1)
1782
+ y = np.clip(y, 0, height - 1)
1783
+
1784
+ # Convert to integer indices
1785
+ x_idx, y_idx = x.astype(int), y.astype(int)
1786
+ inv_map_x, inv_map_y = generate_inverse_distortion_map(map_x, map_y, image_shape[:2])
1787
+ # Apply the inverse mapping
1788
+ new_x = inv_map_x[y_idx, x_idx]
1789
+ new_y = inv_map_y[y_idx, x_idx]
1790
+
1791
+ # Clip the new coordinates to ensure they're within the image bounds
1792
+ new_x = np.clip(new_x, 0, width - 1)
1793
+ new_y = np.clip(new_y, 0, height - 1)
1794
+
1795
+ # Create the transformed keypoints array
1796
+ return np.column_stack([new_x, new_y, keypoints[:, 2:]])
1797
+
1798
+
1799
+ def generate_inverse_distortion_map(
1800
+ map_x: np.ndarray,
1801
+ map_y: np.ndarray,
1802
+ shape: tuple[int, int],
1803
+ ) -> tuple[np.ndarray, np.ndarray]:
1804
+ """Generate inverse mapping for strong distortions."""
1805
+ h, w = shape
1806
+
1807
+ # Initialize inverse maps
1808
+ inv_map_x = np.zeros((h, w), dtype=np.float32)
1809
+ inv_map_y = np.zeros((h, w), dtype=np.float32)
1810
+
1811
+ # For each source point, record where it maps to
1812
+ for y in range(h):
1813
+ for x in range(w):
1814
+ # Get destination point
1815
+ dst_x = map_x[y, x]
1816
+ dst_y = map_y[y, x]
1817
+
1818
+ # If destination is within bounds
1819
+ if 0 <= dst_x < w and 0 <= dst_y < h:
1820
+ # Get neighborhood coordinates
1821
+ dst_x_floor = int(np.floor(dst_x))
1822
+ dst_x_ceil = min(dst_x_floor + 1, w - 1)
1823
+ dst_y_floor = int(np.floor(dst_y))
1824
+ dst_y_ceil = min(dst_y_floor + 1, h - 1)
1825
+
1826
+ # Fill neighborhood
1827
+ for ny in range(dst_y_floor, dst_y_ceil + 1):
1828
+ for nx in range(dst_x_floor, dst_x_ceil + 1):
1829
+ # Only update if empty or closer to pixel center
1830
+ if inv_map_x[ny, nx] == 0 or (
1831
+ abs(nx - dst_x) + abs(ny - dst_y)
1832
+ < abs(nx - inv_map_x[ny, nx]) + abs(ny - inv_map_y[ny, nx])
1833
+ ):
1834
+ inv_map_x[ny, nx] = x
1835
+ inv_map_y[ny, nx] = y
1836
+
1837
+ return inv_map_x, inv_map_y
1838
+
1839
+
1840
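The brute-force inversion can be sanity-checked on a uniform shift. A minimal sketch, assuming the helper above is in scope and reading map_x the way it does (map_x[y, x] is where column x lands):

```python
import numpy as np

h, w = 8, 8
map_x, map_y = np.meshgrid(
    np.arange(w, dtype=np.float32), np.arange(h, dtype=np.float32)
)
# A uniform +2 shift in x should invert to roughly c - 2 at column c.
inv_x, inv_y = generate_inverse_distortion_map(map_x + 2, map_y, (h, w))
print(inv_x[4, 6])  # ~4.0
```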
+ @handle_empty_array("bboxes")
1841
+ def remap_bboxes(
1842
+ bboxes: np.ndarray,
1843
+ map_x: np.ndarray,
1844
+ map_y: np.ndarray,
1845
+ image_shape: tuple[int, int],
1846
+ ) -> np.ndarray:
1847
+ """Remap bounding boxes using displacement maps."""
1848
+ # Convert bboxes to mask
1849
+ bbox_masks = bboxes_to_mask(bboxes, image_shape)
1850
+
1851
+ # Ensure maps are float32
1852
+ map_x = map_x.astype(np.float32)
1853
+ map_y = map_y.astype(np.float32)
1854
+
1855
+ transformed_masks = remap(bbox_masks, map_x, map_y, cv2.INTER_NEAREST, cv2.BORDER_CONSTANT, value=0)
1856
+
1857
+ # Convert masks back to bboxes
1858
+ return mask_to_bboxes(transformed_masks, bboxes)
1859
+
1860
+
1861
+ def generate_displacement_fields(
1862
+ image_shape: tuple[int, int],
1863
+ alpha: float,
1864
+ sigma: float,
1865
+ same_dxdy: bool,
1866
+ kernel_size: tuple[int, int],
1867
+ random_generator: np.random.Generator,
1868
+ noise_distribution: Literal["gaussian", "uniform"],
1869
+ ) -> tuple[np.ndarray, np.ndarray]:
1870
+ """Generate displacement fields for elastic transform.
1871
+
1872
+ This function generates displacement fields for elastic transform based on the provided parameters.
1873
+ It generates noise either from a Gaussian or uniform distribution and normalizes it to the range [-1, 1].
1874
+
1875
+ Args:
1876
+ image_shape (tuple[int, int]): The shape of the image as (height, width).
1877
+ alpha (float): The alpha parameter for the elastic transform.
1878
+ sigma (float): The sigma parameter for the elastic transform.
1879
+ same_dxdy (bool): Whether to use the same displacement field for both x and y directions.
1880
+ kernel_size (tuple[int, int]): The size of the kernel for the elastic transform.
1881
+ random_generator (np.random.Generator): The random number generator to use.
1882
+ noise_distribution (Literal["gaussian", "uniform"]): The distribution of the noise.
1883
+
1884
+ Returns:
1885
+ tuple[np.ndarray, np.ndarray]: The (dx, dy) displacement fields for the
1886
+ elastic transform, scaled by alpha. If same_dxdy is True, the same
1887
+ array is returned for both directions.
1888
+
1889
+ """
1890
+ # Pre-allocate memory and generate noise in one step
1891
+ if noise_distribution == "gaussian":
1892
+ # Generate and normalize in one step, directly as float32
1893
+ fields = random_generator.standard_normal(
1894
+ (1 if same_dxdy else 2, *image_shape[:2]),
1895
+ dtype=np.float32,
1896
+ )
1897
+ # Normalize inplace
1898
+ max_abs = np.abs(fields, out=np.empty_like(fields)).max()
1899
+ if max_abs > 1e-6:
1900
+ fields /= max_abs
1901
+ else: # uniform is already normalized to [-1, 1]
1902
+ fields = random_generator.uniform(
1903
+ -1,
1904
+ 1,
1905
+ size=(1 if same_dxdy else 2, *image_shape[:2]),
1906
+ ).astype(np.float32)
1907
+
1908
+ # Apply Gaussian blur using fast OpenCV operations
1909
+ # When kernel_size is (0,0) cv2.GaussianBlur uses automatic kernel size. Kernel == (0,0) is NOT a noop.
1910
+ # Reshape to 2D array (combining first dimension with height)
1911
+ shape = fields.shape
1912
+ fields = fields.reshape(-1, shape[-1])
1913
+
1914
+ # Apply blur to all fields at once
1915
+ cv2.GaussianBlur(
1916
+ fields,
1917
+ kernel_size,
1918
+ sigma,
1919
+ dst=fields,
1920
+ borderType=cv2.BORDER_REPLICATE,
1921
+ )
1922
+
1923
+ # Restore original shape
1924
+ fields = fields.reshape(shape)
1925
+
1926
+ # Scale by alpha inplace
1927
+ fields *= alpha
1928
+
1929
+ # Return views of the array to avoid copies
1930
+ return (fields[0], fields[0]) if same_dxdy else (fields[0], fields[1])
1931
+
1932
+
1933
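A minimal sketch of how the returned fields drive an elastic warp: they are added to an identity grid and passed to cv2.remap, the same pattern as the remap helper above (parameter values here are illustrative only):

```python
import cv2
import numpy as np

h, w = 64, 64
rng = np.random.default_rng(0)
dx, dy = generate_displacement_fields(
    (h, w), alpha=30.0, sigma=5.0, same_dxdy=False,
    kernel_size=(17, 17), random_generator=rng, noise_distribution="gaussian",
)
xx, yy = np.meshgrid(np.arange(w, dtype=np.float32), np.arange(h, dtype=np.float32))
img = np.random.randint(0, 255, (h, w, 3), dtype=np.uint8)
warped = cv2.remap(img, xx + dx, yy + dy, cv2.INTER_LINEAR,
                   borderMode=cv2.BORDER_REFLECT_101)
```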
+ @handle_empty_array("bboxes")
1934
+ def pad_bboxes(
1935
+ bboxes: np.ndarray,
1936
+ pad_top: int,
1937
+ pad_bottom: int,
1938
+ pad_left: int,
1939
+ pad_right: int,
1940
+ border_mode: int,
1941
+ image_shape: tuple[int, int],
1942
+ ) -> np.ndarray:
1943
+ """Pad bounding boxes by a given amount.
1944
+
1945
+ This function shifts bounding boxes to account for image padding and, for
1946
+ reflective border modes, also generates the reflected copies of each box.
1947
+
1948
+ Args:
1949
+ bboxes (np.ndarray): The bounding boxes to pad.
1950
+ pad_top (int): The amount to pad the top of the bounding boxes.
1951
+ pad_bottom (int): The amount to pad the bottom of the bounding boxes.
1952
+ pad_left (int): The amount to pad the left of the bounding boxes.
1953
+ pad_right (int): The amount to pad the right of the bounding boxes.
1954
+ border_mode (int): The border mode to use.
1955
+ image_shape (tuple[int, int]): The shape of the image as (height, width).
1956
+
1957
+ Returns:
1958
+ np.ndarray: The padded bounding boxes.
1959
+
1960
+ """
1961
+ if border_mode not in REFLECT_BORDER_MODES:
1962
+ shift_vector = np.array([pad_left, pad_top, pad_left, pad_top])
1963
+ return shift_bboxes(bboxes, shift_vector)
1964
+
1965
+ grid_dimensions = get_pad_grid_dimensions(
1966
+ pad_top,
1967
+ pad_bottom,
1968
+ pad_left,
1969
+ pad_right,
1970
+ image_shape,
1971
+ )
1972
+
1973
+ bboxes = generate_reflected_bboxes(bboxes, grid_dimensions, image_shape)
1974
+
1975
+ # Calculate the number of grid cells added on each side
1976
+ original_row, original_col = grid_dimensions["original_position"]
1977
+
1978
+ image_height, image_width = image_shape[:2]
1979
+
1980
+ # Subtract the offset based on the number of added grid cells
1981
+ left_shift = original_col * image_width - pad_left
1982
+ top_shift = original_row * image_height - pad_top
1983
+
1984
+ shift_vector = np.array([-left_shift, -top_shift, -left_shift, -top_shift])
1985
+
1986
+ bboxes = shift_bboxes(bboxes, shift_vector)
1987
+
1988
+ new_height = pad_top + pad_bottom + image_height
1989
+ new_width = pad_left + pad_right + image_width
1990
+
1991
+ return validate_bboxes(bboxes, (new_height, new_width))
1992
+
1993
+
1994
+ def validate_bboxes(bboxes: np.ndarray, image_shape: Sequence[int]) -> np.ndarray:
1995
+ """Validate bounding boxes and remove invalid ones.
1996
+
1997
+ Args:
1998
+ bboxes (np.ndarray): Array of bounding boxes with shape (n, 4) where each row is [x_min, y_min, x_max, y_max].
1999
+ image_shape (tuple[int, int]): Shape of the image as (height, width).
2000
+
2001
+ Returns:
2002
+ np.ndarray: Array of valid bounding boxes, potentially with fewer boxes than the input.
2003
+
2004
+ Examples:
2005
+ >>> bboxes = np.array([[10, 20, 30, 40], [-10, -10, 5, 5], [100, 100, 120, 120]])
2006
+ >>> valid_bboxes = validate_bboxes(bboxes, (100, 100))
2007
+ >>> print(valid_bboxes)
2008
+ [[10 20 30 40]]
2009
+
2010
+ """
2011
+ rows, cols = image_shape[:2]
2012
+
2013
+ x_min, y_min, x_max, y_max = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3]
2014
+
2015
+ valid_indices = (x_max > 0) & (y_max > 0) & (x_min < cols) & (y_min < rows)
2016
+
2017
+ return bboxes[valid_indices]
2018
+
2019
+
2020
+ def shift_bboxes(bboxes: np.ndarray, shift_vector: np.ndarray) -> np.ndarray:
2021
+ """Shift bounding boxes by a given vector.
2022
+
2023
+ Args:
2024
+ bboxes (np.ndarray): Array of bounding boxes with shape (n, m) where n is the number of bboxes
2025
+ and m >= 4. The first 4 columns are [x_min, y_min, x_max, y_max].
2026
+ shift_vector (np.ndarray): Vector to shift the bounding boxes by, with shape (4,) for
2027
+ [shift_x, shift_y, shift_x, shift_y].
2028
+
2029
+ Returns:
2030
+ np.ndarray: Shifted bounding boxes with the same shape as input.
2031
+
2032
+ """
2033
+ # Create a copy of the input array to avoid modifying it in-place
2034
+ shifted_bboxes = bboxes.copy()
2035
+
2036
+ # Add the shift vector to the first 4 columns
2037
+ shifted_bboxes[:, :4] += shift_vector
2038
+
2039
+ return shifted_bboxes
2040
+
2041
+
2042
+ def get_pad_grid_dimensions(
2043
+ pad_top: int,
2044
+ pad_bottom: int,
2045
+ pad_left: int,
2046
+ pad_right: int,
2047
+ image_shape: tuple[int, int],
2048
+ ) -> dict[str, tuple[int, int]]:
2049
+ """Calculate the dimensions of the grid needed for reflection padding and the position of the original image.
2050
+
2051
+ Args:
2052
+ pad_top (int): Number of pixels to pad above the image.
2053
+ pad_bottom (int): Number of pixels to pad below the image.
2054
+ pad_left (int): Number of pixels to pad to the left of the image.
2055
+ pad_right (int): Number of pixels to pad to the right of the image.
2056
+ image_shape (tuple[int, int]): Shape of the original image as (height, width).
2057
+
2058
+ Returns:
2059
+ dict[str, tuple[int, int]]: A dictionary containing:
2060
+ - 'grid_shape': A tuple (grid_rows, grid_cols) where:
2061
+ - grid_rows (int): Number of times the image needs to be repeated vertically.
2062
+ - grid_cols (int): Number of times the image needs to be repeated horizontally.
2063
+ - 'original_position': A tuple (original_row, original_col) where:
2064
+ - original_row (int): Row index of the original image in the grid.
2065
+ - original_col (int): Column index of the original image in the grid.
2066
+
2067
+ """
2068
+ rows, cols = image_shape[:2]
2069
+
2070
+ grid_rows = 1 + math.ceil(pad_top / rows) + math.ceil(pad_bottom / rows)
2071
+ grid_cols = 1 + math.ceil(pad_left / cols) + math.ceil(pad_right / cols)
2072
+ original_row = math.ceil(pad_top / rows)
2073
+ original_col = math.ceil(pad_left / cols)
2074
+
2075
+ return {
2076
+ "grid_shape": (grid_rows, grid_cols),
2077
+ "original_position": (original_row, original_col),
2078
+ }
2079
+
2080
+
2081
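A worked example of the arithmetic: padding a 100x100 image by 150 px on top, 10 px on the bottom, and 30 px on the left needs ceil(150/100) = 2 tile rows above and ceil(30/100) = 1 tile column to the left of the original tile:

```python
dims = get_pad_grid_dimensions(
    pad_top=150, pad_bottom=10, pad_left=30, pad_right=0, image_shape=(100, 100)
)
print(dims)  # {'grid_shape': (4, 2), 'original_position': (2, 1)}
```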
+ def generate_reflected_bboxes(
2082
+ bboxes: np.ndarray,
2083
+ grid_dims: dict[str, tuple[int, int]],
2084
+ image_shape: tuple[int, int],
2085
+ center_in_origin: bool = False,
2086
+ ) -> np.ndarray:
2087
+ """Generate reflected bounding boxes for the entire reflection grid.
2088
+
2089
+ Args:
2090
+ bboxes (np.ndarray): Original bounding boxes.
2091
+ grid_dims (dict[str, tuple[int, int]]): Grid dimensions and original position.
2092
+ image_shape (tuple[int, int]): Shape of the original image as (height, width).
2093
+ center_in_origin (bool): If True, center the grid at the origin. Default is False.
2094
+
2095
+ Returns:
2096
+ np.ndarray: Array of reflected and shifted bounding boxes for the entire grid.
2097
+
2098
+ """
2099
+ rows, cols = image_shape[:2]
2100
+ grid_rows, grid_cols = grid_dims["grid_shape"]
2101
+ original_row, original_col = grid_dims["original_position"]
2102
+
2103
+ # Prepare flipped versions of bboxes
2104
+ bboxes_hflipped = flip_bboxes(bboxes, flip_horizontal=True, image_shape=image_shape)
2105
+ bboxes_vflipped = flip_bboxes(bboxes, flip_vertical=True, image_shape=image_shape)
2106
+ bboxes_hvflipped = flip_bboxes(
2107
+ bboxes,
2108
+ flip_horizontal=True,
2109
+ flip_vertical=True,
2110
+ image_shape=image_shape,
2111
+ )
2112
+
2113
+ # Shift all versions to the original position
2114
+ shift_vector = np.array(
2115
+ [
2116
+ original_col * cols,
2117
+ original_row * rows,
2118
+ original_col * cols,
2119
+ original_row * rows,
2120
+ ],
2121
+ )
2122
+ bboxes = shift_bboxes(bboxes, shift_vector)
2123
+ bboxes_hflipped = shift_bboxes(bboxes_hflipped, shift_vector)
2124
+ bboxes_vflipped = shift_bboxes(bboxes_vflipped, shift_vector)
2125
+ bboxes_hvflipped = shift_bboxes(bboxes_hvflipped, shift_vector)
2126
+
2127
+ new_bboxes = []
2128
+
2129
+ for grid_row in range(grid_rows):
2130
+ for grid_col in range(grid_cols):
2131
+ # Determine which version of bboxes to use based on grid position
2132
+ if (grid_row - original_row) % 2 == 0 and (grid_col - original_col) % 2 == 0:
2133
+ current_bboxes = bboxes
2134
+ elif (grid_row - original_row) % 2 == 0:
2135
+ current_bboxes = bboxes_hflipped
2136
+ elif (grid_col - original_col) % 2 == 0:
2137
+ current_bboxes = bboxes_vflipped
2138
+ else:
2139
+ current_bboxes = bboxes_hvflipped
2140
+
2141
+ # Shift to the current grid cell
2142
+ cell_shift = np.array(
2143
+ [
2144
+ (grid_col - original_col) * cols,
2145
+ (grid_row - original_row) * rows,
2146
+ (grid_col - original_col) * cols,
2147
+ (grid_row - original_row) * rows,
2148
+ ],
2149
+ )
2150
+ shifted_bboxes = shift_bboxes(current_bboxes, cell_shift)
2151
+
2152
+ new_bboxes.append(shifted_bboxes)
2153
+
2154
+ result = np.vstack(new_bboxes)
2155
+
2156
+ return shift_bboxes(result, -shift_vector) if center_in_origin else result
2157
+
2158
+
2159
+ @handle_empty_array("bboxes")
2160
+ def flip_bboxes(
2161
+ bboxes: np.ndarray,
2162
+ flip_horizontal: bool = False,
2163
+ flip_vertical: bool = False,
2164
+ image_shape: tuple[int, int] = (0, 0),
2165
+ ) -> np.ndarray:
2166
+ """Flip bounding boxes horizontally and/or vertically.
2167
+
2168
+ Args:
2169
+ bboxes (np.ndarray): Array of bounding boxes with shape (n, m) where each row is
2170
+ [x_min, y_min, x_max, y_max, ...].
2171
+ flip_horizontal (bool): Whether to flip horizontally.
2172
+ flip_vertical (bool): Whether to flip vertically.
2173
+ image_shape (tuple[int, int]): Shape of the image as (height, width).
2174
+
2175
+ Returns:
2176
+ np.ndarray: Flipped bounding boxes.
2177
+
2178
+ """
2179
+ rows, cols = image_shape[:2]
2180
+ flipped_bboxes = bboxes.copy()
2181
+ if flip_horizontal:
2182
+ flipped_bboxes[:, [0, 2]] = cols - flipped_bboxes[:, [2, 0]]
2183
+ if flip_vertical:
2184
+ flipped_bboxes[:, [1, 3]] = rows - flipped_bboxes[:, [3, 1]]
2185
+ return flipped_bboxes
2186
+
2187
+
2188
+ @preserve_channel_dim
2189
+ def distort_image(
2190
+ image: np.ndarray,
2191
+ generated_mesh: np.ndarray,
2192
+ interpolation: int,
2193
+ ) -> np.ndarray:
2194
+ """Apply perspective distortion to an image based on a generated mesh.
2195
+
2196
+ This function applies a perspective transformation to each cell of the image defined by the
2197
+ generated mesh. The distortion is applied using OpenCV's perspective transformation and
2198
+ blending techniques.
2199
+
2200
+ Args:
2201
+ image (np.ndarray): The input image to be distorted. Can be a 2D grayscale image or a
2202
+ 3D color image.
2203
+ generated_mesh (np.ndarray): A 2D array where each row represents a quadrilateral cell
2204
+ as [x1, y1, x2, y2, dst_x1, dst_y1, dst_x2, dst_y2, dst_x3, dst_y3, dst_x4, dst_y4].
2205
+ The first four values define the source rectangle, and the last eight values
2206
+ define the destination quadrilateral.
2207
+ interpolation (int): Interpolation method to be used in the perspective transformation.
2208
+ Should be one of the OpenCV interpolation flags (e.g., cv2.INTER_LINEAR).
2209
+
2210
+ Returns:
2211
+ np.ndarray: The distorted image with the same shape and dtype as the input image.
2212
+
2213
+ Note:
2214
+ - The function preserves the channel dimension of the input image.
2215
+ - Each cell of the generated mesh is transformed independently and then blended into the output image.
2216
+ - The distortion is applied using perspective transformation, which allows for more complex
2217
+ distortions compared to affine transformations.
2218
+
2219
+ Examples:
2220
+ >>> image = np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8)
2221
+ >>> mesh = np.array([[0, 0, 50, 50, 5, 5, 45, 5, 45, 45, 5, 45]])
2222
+ >>> distorted = distort_image(image, mesh, cv2.INTER_LINEAR)
2223
+ >>> distorted.shape
2224
+ (100, 100, 3)
2225
+
2226
+ """
2227
+ distorted_image = np.zeros_like(image)
2228
+
2229
+ for mesh in generated_mesh:
2230
+ # Extract source rectangle and destination quadrilateral
2231
+ x1, y1, x2, y2 = mesh[:4] # Source rectangle
2232
+ dst_quad = mesh[4:].reshape(4, 2) # Destination quadrilateral
2233
+
2234
+ # Convert source rectangle to quadrilateral
2235
+ src_quad = np.array(
2236
+ [
2237
+ [x1, y1], # Top-left
2238
+ [x2, y1], # Top-right
2239
+ [x2, y2], # Bottom-right
2240
+ [x1, y2], # Bottom-left
2241
+ ],
2242
+ dtype=np.float32,
2243
+ )
2244
+
2245
+ # Calculate Perspective transformation matrix
2246
+ perspective_mat = cv2.getPerspectiveTransform(src_quad, dst_quad)
2247
+
2248
+ # Apply Perspective transformation
2249
+ warped = cv2.warpPerspective(
2250
+ image,
2251
+ perspective_mat,
2252
+ (image.shape[1], image.shape[0]),
2253
+ flags=interpolation,
2254
+ )
2255
+
2256
+ # Create mask for the transformed region
2257
+ mask = np.zeros(image.shape[:2], dtype=np.uint8)
2258
+ cv2.fillConvexPoly(mask, np.int32(dst_quad), 255)
2259
+
2260
+ # Copy only the warped quadrilateral area to the output image
2261
+ distorted_image = cv2.copyTo(warped, mask, distorted_image)
2262
+
2263
+ return distorted_image
2264
+
2265
+
2266
+ @handle_empty_array("bboxes")
2267
+ def bbox_distort_image(
2268
+ bboxes: np.ndarray,
2269
+ generated_mesh: np.ndarray,
2270
+ image_shape: tuple[int, int],
2271
+ ) -> np.ndarray:
2272
+ """Distort bounding boxes based on a generated mesh.
2273
+
2274
+ This function applies a perspective transformation to each bounding box based on the provided generated mesh.
2275
+ It ensures that the bounding boxes are clipped to the image boundaries after transformation.
2276
+
2277
+ Args:
2278
+ bboxes (np.ndarray): The bounding boxes to distort.
2279
+ generated_mesh (np.ndarray): The generated mesh to distort the bounding boxes with.
2280
+ image_shape (tuple[int, int]): The shape of the image as (height, width).
2281
+
2282
+ Returns:
2283
+ np.ndarray: The distorted bounding boxes.
2284
+
2285
+ """
2286
+ bboxes = bboxes.copy()
2287
+ masks = masks_from_bboxes(bboxes, image_shape)
2288
+
2289
+ transformed_masks = cv2.merge(
2290
+ [distort_image(mask, generated_mesh, cv2.INTER_NEAREST) for mask in masks],
2291
+ )
2292
+
2293
+ if transformed_masks.ndim == NUM_MULTI_CHANNEL_DIMENSIONS:
2294
+ transformed_masks = transformed_masks.transpose(2, 0, 1)
2295
+
2296
+ # Recompute bbox coordinates from the transformed masks
2297
+ bboxes[:, :4] = bboxes_from_masks(transformed_masks)
2298
+
2299
+ return bboxes
2300
+
2301
+
2302
+ @handle_empty_array("keypoints")
2303
+ def distort_image_keypoints(
2304
+ keypoints: np.ndarray,
2305
+ generated_mesh: np.ndarray,
2306
+ image_shape: tuple[int, int],
2307
+ ) -> np.ndarray:
2308
+ """Distort keypoints based on a generated mesh.
2309
+
2310
+ This function applies a perspective transformation to each keypoint based on the provided generated mesh.
2311
+ It ensures that the keypoints are clipped to the image boundaries after transformation.
2312
+
2313
+ Args:
2314
+ keypoints (np.ndarray): The keypoints to distort.
2315
+ generated_mesh (np.ndarray): The generated mesh to distort the keypoints with.
2316
+ image_shape (tuple[int, int]): The shape of the image as (height, width).
2317
+
2318
+ Returns:
2319
+ np.ndarray: The distorted keypoints.
2320
+
2321
+ """
2322
+ distorted_keypoints = keypoints.copy()
2323
+ height, width = image_shape[:2]
2324
+
2325
+ for mesh in generated_mesh:
2326
+ x1, y1, x2, y2 = mesh[:4] # Source rectangle
2327
+ dst_quad = mesh[4:].reshape(4, 2) # Destination quadrilateral
2328
+
2329
+ src_quad = np.array(
2330
+ [
2331
+ [x1, y1], # Top-left
2332
+ [x2, y1], # Top-right
2333
+ [x2, y2], # Bottom-right
2334
+ [x1, y2], # Bottom-left
2335
+ ],
2336
+ dtype=np.float32,
2337
+ )
2338
+
2339
+ perspective_mat = cv2.getPerspectiveTransform(src_quad, dst_quad)
2340
+
2341
+ mask = (keypoints[:, 0] >= x1) & (keypoints[:, 0] < x2) & (keypoints[:, 1] >= y1) & (keypoints[:, 1] < y2)
2342
+ cell_keypoints = keypoints[mask]
2343
+
2344
+ if len(cell_keypoints) > 0:
2345
+ # Convert to float32 before applying the transformation
2346
+ points_float32 = cell_keypoints[:, :2].astype(np.float32).reshape(-1, 1, 2)
2347
+ transformed_points = cv2.perspectiveTransform(
2348
+ points_float32,
2349
+ perspective_mat,
2350
+ ).reshape(-1, 2)
2351
+
2352
+ # Update distorted keypoints
2353
+ distorted_keypoints[mask, :2] = transformed_points
2354
+
2355
+ # Clip keypoints to image boundaries
2356
+ distorted_keypoints[:, 0] = np.clip(
2357
+ distorted_keypoints[:, 0],
2358
+ 0,
2359
+ width - 1,
2360
+ out=distorted_keypoints[:, 0],
2361
+ )
2362
+ distorted_keypoints[:, 1] = np.clip(
2363
+ distorted_keypoints[:, 1],
2364
+ 0,
2365
+ height - 1,
2366
+ out=distorted_keypoints[:, 1],
2367
+ )
2368
+
2369
+ return distorted_keypoints
2370
+
2371
+
2372
+ def generate_distorted_grid_polygons(
2373
+ dimensions: np.ndarray,
2374
+ magnitude: int,
2375
+ random_generator: np.random.Generator,
2376
+ ) -> np.ndarray:
2377
+ """Generate distorted grid polygons based on input dimensions and magnitude.
2378
+
2379
+ This function creates a grid of polygons and applies random distortions to the internal vertices,
2380
+ while keeping the boundary vertices fixed. The distortion is applied consistently across shared
2381
+ vertices to avoid gaps or overlaps in the resulting grid.
2382
+
2383
+ Args:
2384
+ dimensions (np.ndarray): A 3D array of shape (grid_height, grid_width, 4) where each element
2385
+ is [x_min, y_min, x_max, y_max] representing the dimensions of a grid cell.
2386
+ magnitude (int): Maximum pixel-wise displacement for distortion. The actual displacement
2387
+ will be randomly chosen in the range [-magnitude, magnitude].
2388
+ random_generator (np.random.Generator): A random number generator.
2389
+
2390
+ Returns:
2391
+ np.ndarray: A 2D array of shape (total_cells, 8) where each row represents a distorted polygon
2392
+ as [x1, y1, x2, y1, x2, y2, x1, y2]. The total_cells is equal to grid_height * grid_width.
2393
+
2394
+ Note:
2395
+ - Only internal grid points are distorted; boundary points remain fixed.
2396
+ - The function ensures consistent distortion across shared vertices of adjacent cells.
2397
+ - The distortion is applied to the following points of each internal cell:
2398
+ * Bottom-right of the cell above and to the left
2399
+ * Bottom-left of the cell above
2400
+ * Top-right of the cell to the left
2401
+ * Top-left of the current cell
2402
+ - Each square represents a cell, and the X marks indicate the coordinates where displacement occurs.
2403
+ +--+--+--+--+
2404
+ | | | | |
2405
+ +--X--X--X--+
2406
+ | | | | |
2407
+ +--X--X--X--+
2408
+ | | | | |
2409
+ +--X--X--X--+
2410
+ | | | | |
2411
+ +--+--+--+--+
2412
+ - For each X, the coordinates of the left, right, top, and bottom edges
2413
+ in the four adjacent cells are displaced.
2414
+
2415
+ Examples:
2416
+ >>> dimensions = np.array([[[0, 0, 50, 50], [50, 0, 100, 50]],
2417
+ ... [[0, 50, 50, 100], [50, 50, 100, 100]]])
2418
+ >>> distorted = generate_distorted_grid_polygons(dimensions, magnitude=10, random_generator=np.random.default_rng())
2419
+ >>> distorted.shape
2420
+ (4, 8)
2421
+
2422
+ """
2423
+ grid_height, grid_width = dimensions.shape[:2]
2424
+ total_cells = grid_height * grid_width
2425
+
2426
+ # Initialize polygons
2427
+ polygons = np.zeros((total_cells, 8), dtype=np.float32)
2428
+ polygons[:, 0:2] = dimensions.reshape(-1, 4)[:, [0, 1]] # x1, y1
2429
+ polygons[:, 2:4] = dimensions.reshape(-1, 4)[:, [2, 1]] # x2, y1
2430
+ polygons[:, 4:6] = dimensions.reshape(-1, 4)[:, [2, 3]] # x2, y2
2431
+ polygons[:, 6:8] = dimensions.reshape(-1, 4)[:, [0, 3]] # x1, y2
2432
+
2433
+ # Generate displacements for internal grid points only
2434
+ internal_points_height, internal_points_width = grid_height - 1, grid_width - 1
2435
+ displacements = random_generator.integers(
2436
+ -magnitude,
2437
+ magnitude + 1,
2438
+ size=(internal_points_height, internal_points_width, 2),
2439
+ ).astype(np.float32)
2440
+
2441
+ # Apply displacements to internal polygon vertices
2442
+ for i in range(1, grid_height):
2443
+ for j in range(1, grid_width):
2444
+ dx, dy = displacements[i - 1, j - 1]
2445
+
2446
+ # Bottom-right of cell (i-1, j-1)
2447
+ polygons[(i - 1) * grid_width + (j - 1), 4:6] += [dx, dy]
2448
+
2449
+ # Bottom-left of cell (i-1, j)
2450
+ polygons[(i - 1) * grid_width + j, 6:8] += [dx, dy]
2451
+
2452
+ # Top-right of cell (i, j-1)
2453
+ polygons[i * grid_width + (j - 1), 2:4] += [dx, dy]
2454
+
2455
+ # Top-left of cell (i, j)
2456
+ polygons[i * grid_width + j, 0:2] += [dx, dy]
2457
+
2458
+ return polygons
2459
+
2460
+
2461
+ @handle_empty_array("keypoints")
2462
+ def pad_keypoints(
2463
+ keypoints: np.ndarray,
2464
+ pad_top: int,
2465
+ pad_bottom: int,
2466
+ pad_left: int,
2467
+ pad_right: int,
2468
+ border_mode: int,
2469
+ image_shape: tuple[int, int],
2470
+ ) -> np.ndarray:
2471
+ """Pad keypoints by a given amount.
2472
+
2473
+ This function shifts keypoints to account for image padding and, for
2474
+ reflective border modes, also generates the reflected copies of each keypoint.
2475
+
2476
+ Args:
2477
+ keypoints (np.ndarray): The keypoints to pad.
2478
+ pad_top (int): The amount to pad the top of the keypoints.
2479
+ pad_bottom (int): The amount to pad the bottom of the keypoints.
2480
+ pad_left (int): The amount to pad the left of the keypoints.
2481
+ pad_right (int): The amount to pad the right of the keypoints.
2482
+ border_mode (int): The border mode to use.
2483
+ image_shape (tuple[int, int]): The shape of the image as (height, width).
2484
+
2485
+ Returns:
2486
+ np.ndarray: The padded keypoints.
2487
+
2488
+ """
2489
+ if border_mode not in REFLECT_BORDER_MODES:
2490
+ shift_vector = np.array([pad_left, pad_top, 0])
2491
+ return shift_keypoints(keypoints, shift_vector)
2492
+
2493
+ grid_dimensions = get_pad_grid_dimensions(
2494
+ pad_top,
2495
+ pad_bottom,
2496
+ pad_left,
2497
+ pad_right,
2498
+ image_shape,
2499
+ )
2500
+
2501
+ keypoints = generate_reflected_keypoints(keypoints, grid_dimensions, image_shape)
2502
+
2503
+ rows, cols = image_shape[:2]
2504
+
2505
+ # Calculate the number of grid cells added on each side
2506
+ original_row, original_col = grid_dimensions["original_position"]
2507
+
2508
+ # Subtract the offset based on the number of added grid cells
2509
+ keypoints[:, 0] -= original_col * cols - pad_left # x
2510
+ keypoints[:, 1] -= original_row * rows - pad_top # y
2511
+
2512
+ new_height = pad_top + pad_bottom + rows
2513
+ new_width = pad_left + pad_right + cols
2514
+
2515
+ return validate_keypoints(keypoints, (new_height, new_width))
2516
+
2517
+
2518
+ def validate_keypoints(
2519
+ keypoints: np.ndarray,
2520
+ image_shape: tuple[int, int],
2521
+ ) -> np.ndarray:
2522
+ """Validate keypoints and remove those that fall outside the image boundaries.
2523
+
2524
+ Args:
2525
+ keypoints (np.ndarray): Array of keypoints with shape (N, M) where N is the number of keypoints
2526
+ and M >= 2. The first two columns represent x and y coordinates.
2527
+ image_shape (tuple[int, int]): Shape of the image as (height, width).
2528
+
2529
+ Returns:
2530
+ np.ndarray: Array of valid keypoints that fall within the image boundaries.
2531
+
2532
+ Note:
2533
+ This function only checks the x and y coordinates (first two columns) of the keypoints.
2534
+ Any additional columns (e.g., angle, scale) are preserved for valid keypoints.
2535
+
2536
+ """
2537
+ rows, cols = image_shape[:2]
2538
+
2539
+ x, y = keypoints[:, 0], keypoints[:, 1]
2540
+
2541
+ valid_indices = (x >= 0) & (x < cols) & (y >= 0) & (y < rows)
2542
+
2543
+ return keypoints[valid_indices]
2544
+
2545
+
2546
+ def shift_keypoints(keypoints: np.ndarray, shift_vector: np.ndarray) -> np.ndarray:
2547
+ """Shift keypoints by a given shift vector.
2548
+
2549
+ This function shifts the keypoints by a given shift vector.
2550
+ It only shifts the x, y and z coordinates of the keypoints.
2551
+
2552
+ Args:
2553
+ keypoints (np.ndarray): The keypoints to shift.
2554
+ shift_vector (np.ndarray): The shift vector to apply to the keypoints.
2555
+
2556
+ Returns:
2557
+ np.ndarray: The shifted keypoints.
2558
+
2559
+ """
2560
+ shifted_keypoints = keypoints.copy()
2561
+ shifted_keypoints[:, :3] += shift_vector[:3] # Only shift x, y and z
2562
+ return shifted_keypoints
2563
+
2564
+
2565
+ def generate_reflected_keypoints(
2566
+ keypoints: np.ndarray,
2567
+ grid_dims: dict[str, tuple[int, int]],
2568
+ image_shape: tuple[int, int],
2569
+ center_in_origin: bool = False,
2570
+ ) -> np.ndarray:
2571
+ """Generate reflected keypoints for the entire reflection grid.
2572
+
2573
+ This function creates a grid of keypoints by reflecting and shifting the original keypoints.
2574
+ It handles both centered and non-centered grids based on the `center_in_origin` parameter.
2575
+
2576
+ Args:
2577
+ keypoints (np.ndarray): Original keypoints array of shape (N, 5+), where N is the number of keypoints,
2578
+ and each keypoint is represented by at least 5 values (x, y, z, angle, scale, ...).
2579
+ grid_dims (dict[str, tuple[int, int]]): A dictionary containing grid dimensions and original position.
2580
+ It should have the following keys:
2581
+ - "grid_shape": tuple[int, int] representing (grid_rows, grid_cols)
2582
+ - "original_position": tuple[int, int] representing (original_row, original_col)
2583
+ image_shape (tuple[int, int]): Shape of the original image as (height, width).
2584
+ center_in_origin (bool, optional): If True, center the grid at the origin. Default is False.
2585
+
2586
+ Returns:
2587
+ np.ndarray: Array of reflected and shifted keypoints for the entire grid. The shape is
2588
+ (N * grid_rows * grid_cols, 5+), where N is the number of original keypoints.
2589
+
2590
+ Note:
2591
+ - The function handles keypoint flipping and shifting to create a grid of reflected keypoints.
2592
+ - It preserves the angle and scale information of the keypoints during transformations.
2593
+ - The resulting grid can be either centered at the origin or positioned based on the original grid.
2594
+
2595
+ """
2596
+ grid_rows, grid_cols = grid_dims["grid_shape"]
2597
+ original_row, original_col = grid_dims["original_position"]
2598
+
2599
+ # Prepare flipped versions of keypoints
2600
+ keypoints_hflipped = flip_keypoints(
2601
+ keypoints,
2602
+ flip_horizontal=True,
2603
+ image_shape=image_shape,
2604
+ )
2605
+ keypoints_vflipped = flip_keypoints(
2606
+ keypoints,
2607
+ flip_vertical=True,
2608
+ image_shape=image_shape,
2609
+ )
2610
+ keypoints_hvflipped = flip_keypoints(
2611
+ keypoints,
2612
+ flip_horizontal=True,
2613
+ flip_vertical=True,
2614
+ image_shape=image_shape,
2615
+ )
2616
+
2617
+ rows, cols = image_shape[:2]
2618
+
2619
+ # Shift all versions to the original position
2620
+ shift_vector = np.array(
2621
+ [original_col * cols, original_row * rows, 0, 0, 0],
2622
+ ) # Only shift x and y
2623
+ keypoints = shift_keypoints(keypoints, shift_vector)
2624
+ keypoints_hflipped = shift_keypoints(keypoints_hflipped, shift_vector)
2625
+ keypoints_vflipped = shift_keypoints(keypoints_vflipped, shift_vector)
2626
+ keypoints_hvflipped = shift_keypoints(keypoints_hvflipped, shift_vector)
2627
+
2628
+ new_keypoints = []
2629
+
2630
+ for grid_row in range(grid_rows):
2631
+ for grid_col in range(grid_cols):
2632
+ # Determine which version of keypoints to use based on grid position
2633
+ if (grid_row - original_row) % 2 == 0 and (grid_col - original_col) % 2 == 0:
2634
+ current_keypoints = keypoints
2635
+ elif (grid_row - original_row) % 2 == 0:
2636
+ current_keypoints = keypoints_hflipped
2637
+ elif (grid_col - original_col) % 2 == 0:
2638
+ current_keypoints = keypoints_vflipped
2639
+ else:
2640
+ current_keypoints = keypoints_hvflipped
2641
+
2642
+ # Shift to the current grid cell
2643
+ cell_shift = np.array(
2644
+ [
2645
+ (grid_col - original_col) * cols,
2646
+ (grid_row - original_row) * rows,
2647
+ 0,
2648
+ 0,
2649
+ 0,
2650
+ ],
2651
+ )
2652
+ shifted_keypoints = shift_keypoints(current_keypoints, cell_shift)
2653
+
2654
+ new_keypoints.append(shifted_keypoints)
2655
+
2656
+ result = np.vstack(new_keypoints)
2657
+
2658
+ return shift_keypoints(result, -shift_vector) if center_in_origin else result
2659
+
2660
+
2661
+ @handle_empty_array("keypoints")
2662
+ def flip_keypoints(
2663
+ keypoints: np.ndarray,
2664
+ flip_horizontal: bool = False,
2665
+ flip_vertical: bool = False,
2666
+ image_shape: tuple[int, int] = (0, 0),
2667
+ ) -> np.ndarray:
2668
+ """Flip keypoints horizontally or vertically.
2669
+
2670
+ This function flips keypoints horizontally or vertically based on the provided parameters.
2671
+ The keypoint angle (column 3) is negated for each flip that is applied.
2672
+
2673
+ Args:
2674
+ keypoints (np.ndarray): The keypoints to flip.
2675
+ flip_horizontal (bool): Whether to flip horizontally.
2676
+ flip_vertical (bool): Whether to flip vertically.
2677
+ image_shape (tuple[int, int]): The shape of the image as (height, width).
2678
+
2679
+ Returns:
2680
+ np.ndarray: The flipped keypoints.
2681
+
2682
+ """
2683
+ rows, cols = image_shape[:2]
2684
+ flipped_keypoints = keypoints.copy()
2685
+ if flip_horizontal:
2686
+ flipped_keypoints[:, 0] = cols - flipped_keypoints[:, 0]
2687
+ flipped_keypoints[:, 3] = -flipped_keypoints[:, 3] # Flip angle
2688
+ if flip_vertical:
2689
+ flipped_keypoints[:, 1] = rows - flipped_keypoints[:, 1]
2690
+ flipped_keypoints[:, 3] = -flipped_keypoints[:, 3] # Flip angle
2691
+ return flipped_keypoints
2692
+
2693
+
2694
+ def create_affine_transformation_matrix(
2695
+ translate: Mapping[str, float],
2696
+ shear: dict[str, float],
2697
+ scale: dict[str, float],
2698
+ rotate: float,
2699
+ shift: tuple[float, float],
2700
+ ) -> np.ndarray:
2701
+ """Create an affine transformation matrix combining translation, shear, scale, and rotation.
2702
+
2703
+ Args:
2704
+ translate (dict[str, float]): Translation in x and y directions.
2705
+ shear (dict[str, float]): Shear in x and y directions (in degrees).
2706
+ scale (dict[str, float]): Scale factors for x and y directions.
2707
+ rotate (float): Rotation angle in degrees.
2708
+ shift (tuple[float, float]): Shift to apply before and after transformations.
2709
+
2710
+ Returns:
2711
+ np.ndarray: The resulting 3x3 affine transformation matrix.
2712
+
2713
+ """
2714
+ # Convert angles to radians
2715
+ rotate_rad = np.deg2rad(rotate % 360)
2716
+
2717
+ shear_x_rad = np.deg2rad(shear["x"])
2718
+ shear_y_rad = np.deg2rad(shear["y"])
2719
+
2720
+ # Create individual transformation matrices
2721
+ # 1. Shift to top-left
2722
+ m_shift_topleft = np.array([[1, 0, -shift[0]], [0, 1, -shift[1]], [0, 0, 1]])
2723
+
2724
+ # 2. Scale
2725
+ m_scale = np.array([[scale["x"], 0, 0], [0, scale["y"], 0], [0, 0, 1]])
2726
+
2727
+ # 3. Rotation
2728
+ m_rotate = np.array(
2729
+ [
2730
+ [np.cos(rotate_rad), np.sin(rotate_rad), 0],
2731
+ [-np.sin(rotate_rad), np.cos(rotate_rad), 0],
2732
+ [0, 0, 1],
2733
+ ],
2734
+ )
2735
+
2736
+ # 4. Shear
2737
+ m_shear = np.array(
2738
+ [[1, np.tan(shear_x_rad), 0], [np.tan(shear_y_rad), 1, 0], [0, 0, 1]],
2739
+ )
2740
+
2741
+ # 5. Translation
2742
+ m_translate = np.array([[1, 0, translate["x"]], [0, 1, translate["y"]], [0, 0, 1]])
2743
+
2744
+ # 6. Shift back to center
2745
+ m_shift_center = np.array([[1, 0, shift[0]], [0, 1, shift[1]], [0, 0, 1]])
2746
+
2747
+ # Combine all transformations
2748
+ # The order is important: transformations are applied from right to left
2749
+ m = m_shift_center @ m_translate @ m_shear @ m_rotate @ m_scale @ m_shift_topleft
2750
+
2751
+ # Ensure the last row is exactly [0, 0, 1]
2752
+ m[2] = [0, 0, 1]
2753
+
2754
+ return m
2755
+
2756
+
2757
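A minimal sanity check of the composition order: with unit scale and zero rotation, shear, and translation, the two shift matrices cancel and the result is the identity:

```python
import numpy as np

m = create_affine_transformation_matrix(
    translate={"x": 0.0, "y": 0.0},
    shear={"x": 0.0, "y": 0.0},
    scale={"x": 1.0, "y": 1.0},
    rotate=0.0,
    shift=(49.5, 49.5),  # e.g. center() of a 100x100 image
)
print(np.allclose(m, np.eye(3)))  # True
```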
+ def compute_transformed_image_bounds(
2758
+ matrix: np.ndarray,
2759
+ image_shape: tuple[int, int],
2760
+ ) -> tuple[np.ndarray, np.ndarray]:
2761
+ """Compute the bounds of an image after applying an affine transformation.
2762
+
2763
+ Args:
2764
+ matrix (np.ndarray): The 3x3 affine transformation matrix.
2765
+ image_shape (Tuple[int, int]): The shape of the image as (height, width).
2766
+
2767
+ Returns:
2768
+ tuple[np.ndarray, np.ndarray]: A tuple containing:
2769
+ - min_coords: An array with the minimum x and y coordinates.
2770
+ - max_coords: An array with the maximum x and y coordinates.
2771
+
2772
+ """
2773
+ height, width = image_shape[:2]
2774
+
2775
+ # Define the corners of the image
2776
+ corners = np.array([[0, 0, 1], [width, 0, 1], [width, height, 1], [0, height, 1]])
2777
+
2778
+ # Transform the corners
2779
+ transformed_corners = corners @ matrix.T
2780
+ transformed_corners = transformed_corners[:, :2] / transformed_corners[:, 2:]
2781
+
2782
+ # Calculate the bounding box of the transformed corners
2783
+ min_coords = np.floor(transformed_corners.min(axis=0)).astype(int)
2784
+ max_coords = np.ceil(transformed_corners.max(axis=0)).astype(int)
2785
+
2786
+ return min_coords, max_coords
2787
+
2788
+
2789
+ def compute_affine_warp_output_shape(
2790
+ matrix: np.ndarray,
2791
+ input_shape: tuple[int, ...],
2792
+ ) -> tuple[np.ndarray, tuple[int, int]]:
2793
+ """Compute the output shape of an affine warp.
2794
+
2795
+ This function computes the output shape of an affine warp based on the input matrix and input shape.
2796
+ It calculates the transformed image bounds and then determines the output shape based on the input shape.
2797
+
2798
+ Args:
2799
+ matrix (np.ndarray): The 3x3 affine transformation matrix.
2800
+ input_shape (tuple[int, ...]): The shape of the input image as (height, width, ...).
2801
+
2802
+ Returns:
2803
+ tuple[np.ndarray, tuple[int, int]]: A tuple containing:
2804
+ - matrix: The 3x3 affine transformation matrix.
2805
+ - output_shape: The output shape of the affine warp.
2806
+
2807
+ """
2808
+ height, width = input_shape[:2]
2809
+
2810
+ if height == 0 or width == 0:
2811
+ return matrix, cast("tuple[int, int]", input_shape[:2])
2812
+
2813
+ min_coords, max_coords = compute_transformed_image_bounds(matrix, (height, width))
2814
+ minc, minr = min_coords
2815
+ maxc, maxr = max_coords
2816
+
2817
+ out_height = maxr - minr + 1
2818
+ out_width = maxc - minc + 1
2819
+
2820
+ if len(input_shape) == NUM_MULTI_CHANNEL_DIMENSIONS:
2821
+ output_shape = np.ceil((out_height, out_width, input_shape[2]))
2822
+ else:
2823
+ output_shape = np.ceil((out_height, out_width))
2824
+
2825
+ output_shape_tuple = tuple(int(v) for v in output_shape.tolist())
2826
+
2827
+ # fit output image in new shape
2828
+ translation = np.array([[1, 0, -minc], [0, 1, -minr], [0, 0, 1]])
2829
+ matrix = translation @ matrix
2830
+
2831
+ return matrix, cast("tuple[int, int]", output_shape_tuple)
2832
+
2833
+
2834
+ def center(image_shape: tuple[int, int]) -> tuple[float, float]:
2835
+ """Calculate the center coordinates if image. Used by images, masks and keypoints.
2836
+
2837
+ Args:
2838
+ image_shape (tuple[int, int]): The shape of the image.
2839
+
2840
+ Returns:
2841
+ tuple[float, float]: center_x, center_y
2842
+
2843
+ """
2844
+ height, width = image_shape[:2]
2845
+ return width / 2 - 0.5, height / 2 - 0.5
2846
+
2847
+
2848
+ def center_bbox(image_shape: tuple[int, int]) -> tuple[float, float]:
2849
+ """Calculate the center coordinates for of image for bounding boxes.
2850
+
2851
+ Args:
2852
+ image_shape (tuple[int, int]): The shape of the image.
2853
+
2854
+ Returns:
2855
+ tuple[float, float]: center_x, center_y
2856
+
2857
+ """
2858
+ height, width = image_shape[:2]
2859
+ return width / 2, height / 2
2860
+
2861
+
2862
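The two helpers differ by half a pixel on purpose. A quick sketch of the convention, assuming both are in scope:

```python
# Pixel-grid entities (images, masks, keypoints) use the center of the
# middle pixel; bounding boxes use the geometric center of the image plane.
print(center((100, 100)))       # (49.5, 49.5)
print(center_bbox((100, 100)))  # (50.0, 50.0)
```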
+ def generate_grid(
2863
+ image_shape: tuple[int, int],
2864
+ steps_x: list[float],
2865
+ steps_y: list[float],
2866
+ num_steps: int,
2867
+ ) -> tuple[np.ndarray, np.ndarray]:
2868
+ """Generate a distorted grid for image transformation based on given step sizes.
2869
+
2870
+ This function creates two 2D arrays (map_x and map_y) that represent a distorted version
2871
+ of the original image grid. These arrays can be used with OpenCV's remap function to
2872
+ apply grid distortion to an image.
2873
+
2874
+ Args:
2875
+ image_shape (tuple[int, int]): The shape of the image as (height, width).
2876
+ steps_x (list[float]): List of step sizes for the x-axis distortion. The length
2877
+ should be num_steps + 1. Each value represents the relative step size for
2878
+ a segment of the grid in the x direction.
2879
+ steps_y (list[float]): List of step sizes for the y-axis distortion. The length
2880
+ should be num_steps + 1. Each value represents the relative step size for
2881
+ a segment of the grid in the y direction.
2882
+ num_steps (int): The number of steps to divide each axis into. This determines
2883
+ the granularity of the distortion grid.
2884
+
2885
+ Returns:
2886
+ tuple[np.ndarray, np.ndarray]: A tuple containing two 2D numpy arrays:
2887
+ - map_x: A 2D array of float32 values representing the x-coordinates
2888
+ of the distorted grid.
2889
+ - map_y: A 2D array of float32 values representing the y-coordinates
2890
+ of the distorted grid.
2891
+
2892
+ Note:
2893
+ - The function generates a grid where each cell can be distorted independently.
2894
+ - The distortion is controlled by the steps_x and steps_y parameters, which
2895
+ determine how much each grid line is shifted.
2896
+ - The resulting map_x and map_y can be used directly with cv2.remap() to
2897
+ apply the distortion to an image.
2898
+ - The distortion is applied smoothly across each grid cell using linear
2899
+ interpolation.
2900
+
2901
+ Examples:
2902
+ >>> image_shape = (100, 100)
2903
+ >>> steps_x = [1.1, 0.9, 1.0, 1.2, 0.95, 1.05]
2904
+ >>> steps_y = [0.9, 1.1, 1.0, 1.1, 0.9, 1.0]
2905
+ >>> num_steps = 5
2906
+ >>> map_x, map_y = generate_grid(image_shape, steps_x, steps_y, num_steps)
2907
+ >>> distorted_image = cv2.remap(image, map_x, map_y, cv2.INTER_LINEAR)
2908
+
2909
+ """
2910
+ height, width = image_shape[:2]
2911
+ x_step = width // num_steps
2912
+ xx = np.zeros(width, np.float32)
2913
+ prev = 0.0
2914
+ for idx, step in enumerate(steps_x):
2915
+ x = idx * x_step
2916
+ start = int(x)
2917
+ end = min(int(x) + x_step, width)
2918
+ cur = prev + x_step * step
2919
+ xx[start:end] = np.linspace(prev, cur, end - start)
2920
+ prev = cur
2921
+
2922
+ y_step = height // num_steps
2923
+ yy = np.zeros(height, np.float32)
2924
+ prev = 0.0
2925
+ for idx, step in enumerate(steps_y):
2926
+ y = idx * y_step
2927
+ start = int(y)
2928
+ end = min(int(y) + y_step, height)
2929
+ cur = prev + y_step * step
2930
+ yy[start:end] = np.linspace(prev, cur, end - start)
2931
+ prev = cur
2932
+
2933
+ return np.meshgrid(xx, yy)
2934
+
2935
+
2936
+ def normalize_grid_distortion_steps(
2937
+ image_shape: tuple[int, int],
2938
+ num_steps: int,
2939
+ x_steps: list[float],
2940
+ y_steps: list[float],
2941
+ ) -> dict[str, np.ndarray]:
2942
+ """Normalize the grid distortion steps.
2943
+
2944
+ This function compensates for a smaller final grid step when the image size is not
2945
+ evenly divisible by num_steps, then rescales the steps so that the distortion
2946
+ never leaves the image bounds.
2947
+
2948
+ Args:
2949
+ image_shape (tuple[int, int]): The shape of the image as (height, width).
2950
+ num_steps (int): The number of steps to divide each axis into. This determines
2951
+ the granularity of the distortion grid.
2952
+ x_steps (list[float]): List of step sizes for the x-axis distortion. The length
2953
+ should be num_steps + 1. Each value represents the relative step size for
2954
+ a segment of the grid in the x direction.
2955
+ y_steps (list[float]): List of step sizes for the y-axis distortion. The length
2956
+ should be num_steps + 1. Each value represents the relative step size for
2957
+ a segment of the grid in the y direction.
2958
+
2959
+ Returns:
2960
+ dict[str, np.ndarray]: A dictionary containing the normalized step sizes for the x and y axes.
2961
+
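+ Examples:
+ Illustrative sketch: with uniform unit steps, the normalized steps for each axis
+ sum to size / (size // num_steps):
+ >>> params = normalize_grid_distortion_steps((100, 100), 5, [1.0] * 6, [1.0] * 6)
+ >>> float(np.sum(params["steps_x"]))
+ 5.0
+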
2962
+ """
2963
+ height, width = image_shape[:2]
2964
+
2965
+ # compensate for smaller last steps in source image.
2966
+ x_step = width // num_steps
2967
+ last_x_step = min(width, ((num_steps + 1) * x_step)) - (num_steps * x_step)
2968
+ x_steps[-1] *= last_x_step / x_step
2969
+
2970
+ y_step = height // num_steps
2971
+ last_y_step = min(height, ((num_steps + 1) * y_step)) - (num_steps * y_step)
2972
+ y_steps[-1] *= last_y_step / y_step
2973
+
2974
+ # now normalize such that distortion never leaves image bounds.
2975
+ tx = width / math.floor(width / num_steps)
2976
+ ty = height / math.floor(height / num_steps)
2977
+ x_steps = np.array(x_steps) * (tx / np.sum(x_steps))
2978
+ y_steps = np.array(y_steps) * (ty / np.sum(y_steps))
2979
+
2980
+ return {"steps_x": x_steps, "steps_y": y_steps}
2981
+
2982
+
2983
+ def almost_equal_intervals(n: int, parts: int) -> np.ndarray:
2984
+ """Generates an array of nearly equal integer intervals that sum up to `n`.
2985
+
2986
+ This function divides the number `n` into `parts` nearly equal parts. It ensures that
2987
+ the sum of all parts equals `n`, and the difference between any two parts is at most one.
2988
+ This is useful for distributing a total amount into nearly equal discrete parts.
2989
+
2990
+ Args:
2991
+ n (int): The total value to be split.
2992
+ parts (int): The number of parts to split into.
2993
+
2994
+ Returns:
2995
+ np.ndarray: An array of integers where each integer represents the size of a part.
2996
+
2997
+ Examples:
2998
+ >>> almost_equal_intervals(20, 3)
2999
+ array([7, 7, 6]) # Splits 20 into three parts: 7, 7, and 6
3000
+ >>> almost_equal_intervals(16, 4)
3001
+ array([4, 4, 4, 4]) # Splits 16 into four equal parts
3002
+
3003
+ """
3004
+ part_size, remainder = divmod(n, parts)
3005
+ # Create an array with the base part size and adjust the first `remainder` parts by adding 1
3006
+ return np.array(
3007
+ [part_size + 1 if i < remainder else part_size for i in range(parts)],
3008
+ )
3009
+
3010
+
3011
+ def generate_shuffled_splits(
3012
+ size: int,
3013
+ divisions: int,
3014
+ random_generator: np.random.Generator,
3015
+ ) -> np.ndarray:
3016
+ """Generate shuffled splits for a given dimension size and number of divisions.
3017
+
3018
+ Args:
3019
+ size (int): Total size of the dimension (height or width).
3020
+ divisions (int): Number of divisions (rows or columns).
3021
+ random_generator (np.random.Generator): The random generator to use for shuffling
3022
+ the interval sizes before the cumulative edges are computed.
3023
+
3024
+ Returns:
3025
+ np.ndarray: Cumulative edges of the shuffled intervals.
3026
+
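+ Examples:
+ Illustrative sketch; the interval order depends on the generator:
+ >>> edges = generate_shuffled_splits(10, 3, np.random.default_rng(0))
+ >>> int(edges[0]), int(edges[-1])  # edges always span 0 to the full size
+ (0, 10)
+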
3027
+ """
3028
+ intervals = almost_equal_intervals(size, divisions)
3029
+ random_generator.shuffle(intervals)
3030
+ return np.insert(np.cumsum(intervals), 0, 0)
3031
+
3032
+
3033
+ def split_uniform_grid(
3034
+ image_shape: tuple[int, int],
3035
+ grid: tuple[int, int],
3036
+ random_generator: np.random.Generator,
3037
+ ) -> np.ndarray:
3038
+ """Splits an image shape into a uniform grid specified by the grid dimensions.
3039
+
3040
+ Args:
3041
+ image_shape (tuple[int, int]): The shape of the image as (height, width).
3042
+ grid (tuple[int, int]): The grid size as (rows, columns).
3043
+ random_generator (np.random.Generator): The random generator used to shuffle
3044
+ the interval sizes along each axis.
3045
+
3046
+ Returns:
3047
+ np.ndarray: An array containing the tiles' coordinates in the format (start_y, start_x, end_y, end_x).
3048
+
3049
+ Note:
3050
+ The function uses `generate_shuffled_splits` to generate the splits for the height and width of the image.
3051
+ The splits are then used to calculate the coordinates of the tiles.
3052
+
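+ Examples:
+ Illustrative sketch on a tiny image:
+ >>> tiles = split_uniform_grid((4, 6), (2, 2), np.random.default_rng(0))
+ >>> tiles.shape  # one (start_y, start_x, end_y, end_x) row per tile
+ (4, 4)
+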
3053
+ """
3054
+ n_rows, n_cols = grid
3055
+
3056
+ height_splits = generate_shuffled_splits(
3057
+ image_shape[0],
3058
+ grid[0],
3059
+ random_generator=random_generator,
3060
+ )
3061
+ width_splits = generate_shuffled_splits(
3062
+ image_shape[1],
3063
+ grid[1],
3064
+ random_generator=random_generator,
3065
+ )
3066
+
3067
+ # Calculate tile coordinates
3068
+ tiles = [
3069
+ (height_splits[i], width_splits[j], height_splits[i + 1], width_splits[j + 1])
3070
+ for i in range(n_rows)
3071
+ for j in range(n_cols)
3072
+ ]
3073
+
3074
+ return np.array(tiles, dtype=np.int16)
3075
+
3076
+
3077
+ def generate_perspective_points(
3078
+ image_shape: tuple[int, int],
3079
+ scale: float,
3080
+ random_generator: np.random.Generator,
3081
+ ) -> np.ndarray:
3082
+ """Generate perspective points for a given image shape and scale.
3083
+
3084
+ This function generates perspective points for a given image shape and scale.
3085
+ It uses a normal distribution to generate the points, and then modulates them to be within the image bounds.
3086
+
3087
+ Args:
3088
+ image_shape (tuple[int, int]): The shape of the image as (height, width).
3089
+ scale (float): The scale of the perspective points.
3090
+ random_generator (np.random.Generator): The random generator to use for generating the points.
3091
+
3092
+ Returns:
3093
+ np.ndarray: The perspective points.
3094
+
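+ Examples:
+ Illustrative sketch; the exact jitter depends on the generator:
+ >>> pts = generate_perspective_points((100, 200), 0.05, np.random.default_rng(0))
+ >>> pts.shape  # four (x, y) corner points in pixel coordinates
+ (4, 2)
+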
3095
+ """
3096
+ height, width = image_shape[:2]
3097
+ points = random_generator.normal(0, scale, (4, 2))
3098
+ points = np.mod(np.abs(points), 0.32)
3099
+
3100
+ # top left -- no changes needed, just use jitter
3101
+ # top right
3102
+ points[1, 0] = 1.0 - points[1, 0] # w = 1.0 - jitter
3103
+ # bottom right
3104
+ points[2] = 1.0 - points[2] # w = 1.0 - jitter
3105
+ # bottom left
3106
+ points[3, 1] = 1.0 - points[3, 1] # h = 1.0 - jitter
3107
+
3108
+ points[:, 0] *= width
3109
+ points[:, 1] *= height
3110
+
3111
+ return points
3112
+
3113
+
3114
+ def order_points(pts: np.ndarray) -> np.ndarray:
3115
+ """Order points in a clockwise manner.
3116
+
3117
+ This function orders the points in a clockwise manner, ensuring that the points are in the correct
3118
+ order for perspective transformation.
3119
+
3120
+ Args:
3121
+ pts (np.ndarray): The points to order.
3122
+
3123
+ Returns:
3124
+ np.ndarray: The ordered points.
3125
+
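+ Examples:
+ Small illustrative example (added for clarity):
+ >>> pts = np.array([[90, 90], [10, 10], [90, 10], [10, 90]], dtype=np.float32)
+ >>> order_points(pts)  # [top-left, top-right, bottom-right, bottom-left]
+ array([[10., 10.],
+ [90., 10.],
+ [90., 90.],
+ [10., 90.]], dtype=float32)
+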
3126
+ """
3127
+ pts = np.array(sorted(pts, key=lambda x: x[0]))
3128
+ left = pts[:2] # points with smallest x coordinate - left points
3129
+ right = pts[2:] # points with greatest x coordinate - right points
3130
+
3131
+ if left[0][1] < left[1][1]:
3132
+ tl, bl = left
3133
+ else:
3134
+ bl, tl = left
3135
+
3136
+ if right[0][1] < right[1][1]:
3137
+ tr, br = right
3138
+ else:
3139
+ br, tr = right
3140
+
3141
+ return np.array([tl, tr, br, bl], dtype=np.float32)
3142
+
3143
+
3144
+ def compute_perspective_params(
3145
+ points: np.ndarray,
3146
+ image_shape: tuple[int, int],
3147
+ ) -> tuple[np.ndarray, int, int]:
3148
+ """Compute perspective transformation parameters.
3149
+
3150
+ This function computes the perspective transformation parameters for a given set of points.
3151
+ It adjusts the points to ensure that the transformed image retains its original dimensions.
3152
+
3153
+ Args:
3154
+ points (np.ndarray): The points to compute the perspective transformation parameters for.
3155
+ image_shape (tuple[int, int]): The shape of the image.
3156
+
3157
+ Returns:
3158
+ tuple[np.ndarray, int, int]: The perspective transformation parameters and the maximum
3159
+ dimensions of the transformed image.
3160
+
3161
+ """
3162
+ height, width = image_shape
3163
+ top_left, top_right, bottom_right, bottom_left = points
3164
+
3165
+ def adjust_dimension(
3166
+ dim1: np.ndarray,
3167
+ dim2: np.ndarray,
3168
+ min_size: int = 2,
3169
+ ) -> float:
3170
+ size = np.sqrt(np.sum((dim1 - dim2) ** 2))
3171
+ if size < min_size:
3172
+ step_size = (min_size - size) / 2
3173
+ dim1[dim1 > dim2] += step_size
3174
+ dim2[dim1 > dim2] -= step_size
3175
+ dim1[dim1 <= dim2] -= step_size
3176
+ dim2[dim1 <= dim2] += step_size
3177
+ size = min_size
3178
+ return size
3179
+
3180
+ max_width = max(
3181
+ adjust_dimension(top_right, top_left),
3182
+ adjust_dimension(bottom_right, bottom_left),
3183
+ )
3184
+ max_height = max(
3185
+ adjust_dimension(bottom_right, top_right),
3186
+ adjust_dimension(bottom_left, top_left),
3187
+ )
3188
+
3189
+ dst = np.array([[0, 0], [width, 0], [width, height], [0, height]], dtype=np.float32)
3190
+ matrix = cv2.getPerspectiveTransform(points, dst)
3191
+
3192
+ return matrix, int(max_width), int(max_height)
3193
+
3194
+
3195
+ def expand_transform(
3196
+ matrix: np.ndarray,
3197
+ shape: tuple[int, int],
3198
+ ) -> tuple[np.ndarray, int, int]:
3199
+ """Expand a transformation matrix to include padding.
3200
+
3201
+ This function expands a transformation matrix to include padding, ensuring that the transformed
3202
+ image retains its original dimensions. It first calculates the destination points of the transformed
3203
+ image, then adjusts the matrix to include padding, and finally returns the expanded matrix and the
3204
+ maximum dimensions of the transformed image.
3205
+
3206
+ Args:
3207
+ matrix (np.ndarray): The transformation matrix to expand.
3208
+ shape (tuple[int, int]): The shape of the image.
3209
+
3210
+ Returns:
3211
+ tuple[np.ndarray, int, int]: The expanded matrix and the maximum dimensions of the transformed image.
3212
+
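+ Examples:
+ Illustrative sketch chaining the helpers in this module:
+ >>> rng = np.random.default_rng(0)
+ >>> pts = order_points(generate_perspective_points((100, 100), 0.05, rng))
+ >>> matrix, _, _ = compute_perspective_params(pts, (100, 100))
+ >>> expanded, max_width, max_height = expand_transform(matrix, (100, 100))
+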
3213
+ """
3214
+ height, width = shape[:2]
3215
+ rect = np.array(
3216
+ [[0, 0], [width, 0], [width, height], [0, height]],
3217
+ dtype=np.float32,
3218
+ )
3219
+ dst = cv2.perspectiveTransform(np.array([rect]), matrix)[0]
3220
+
3221
+ dst -= dst.min(axis=0, keepdims=True)
3222
+ dst = np.around(dst, decimals=0)
3223
+
3224
+ matrix_expanded = cv2.getPerspectiveTransform(rect, dst)
3225
+ max_width, max_height = dst.max(axis=0)
3226
+ return matrix_expanded, int(max_width), int(max_height)
3227
+
3228
+
3229
+ def create_piecewise_affine_maps(
3230
+ image_shape: tuple[int, int],
3231
+ grid: tuple[int, int],
3232
+ scale: float,
3233
+ absolute_scale: bool,
3234
+ random_generator: np.random.Generator,
3235
+ ) -> tuple[np.ndarray | None, np.ndarray | None]:
3236
+ """Create maps for piecewise affine transformation using OpenCV's remap function.
3237
+
3238
+ This function creates maps for piecewise affine transformation using OpenCV's remap function.
3239
+ It generates the control points for the transformation, then uses the remap function to create
3240
+ the transformation maps.
3241
+
3242
+ Args:
3243
+ image_shape (tuple[int, int]): The shape of the image as (height, width).
3244
+ grid (tuple[int, int]): The grid size as (rows, columns).
3245
+ scale (float): The scale of the transformation.
3246
+ absolute_scale (bool): Whether to use absolute scale.
3247
+ random_generator (np.random.Generator): The random generator to use for generating the points.
3248
+
3249
+ Returns:
3250
+ tuple[np.ndarray | None, np.ndarray | None]: The transformation maps.
3251
+
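+ Examples:
+ Illustrative sketch, assuming image is a 64x64 numpy array; the maps are built
+ with per-pixel loops, so keep test images small:
+ >>> rng = np.random.default_rng(0)
+ >>> map_x, map_y = create_piecewise_affine_maps((64, 64), (4, 4), 2.0, True, rng)
+ >>> warped = cv2.remap(image, map_x, map_y, cv2.INTER_LINEAR)
+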
3252
+ """
3253
+ height, width = image_shape[:2]
3254
+ nb_rows, nb_cols = grid
3255
+
3256
+ # Input validation
3257
+ if height <= 0 or width <= 0 or nb_rows <= 0 or nb_cols <= 0:
3258
+ raise ValueError("Dimensions must be positive")
3259
+ if scale <= 0:
3260
+ return None, None
3261
+
3262
+ # Create source points grid
3263
+ y = np.linspace(0, height - 1, nb_rows, dtype=np.float32)
3264
+ x = np.linspace(0, width - 1, nb_cols, dtype=np.float32)
3265
+ xx_src, yy_src = np.meshgrid(x, y)
3266
+
3267
+ # Initialize destination maps at full resolution
3268
+ map_x = np.zeros((height, width), dtype=np.float32)
3269
+ map_y = np.zeros((height, width), dtype=np.float32)
3270
+
3271
+ # Generate jitter for control points
3272
+ jitter_scale = scale / 3 if absolute_scale else scale * min(width, height) / 3
3273
+
3274
+ jitter = random_generator.normal(0, jitter_scale, (nb_rows, nb_cols, 2)).astype(
3275
+ np.float32,
3276
+ )
3277
+
3278
+ # Create control points with jitter
3279
+ control_points = np.zeros((nb_rows * nb_cols, 4), dtype=np.float32)
3280
+ for i in range(nb_rows):
3281
+ for j in range(nb_cols):
3282
+ idx = i * nb_cols + j
3283
+ # Source points
3284
+ control_points[idx, 0] = xx_src[i, j]
3285
+ control_points[idx, 1] = yy_src[i, j]
3286
+ # Destination points with jitter
3287
+ control_points[idx, 2] = np.clip(
3288
+ xx_src[i, j] + jitter[i, j, 1],
3289
+ 0,
3290
+ width - 1,
3291
+ )
3292
+ control_points[idx, 3] = np.clip(
3293
+ yy_src[i, j] + jitter[i, j, 0],
3294
+ 0,
3295
+ height - 1,
3296
+ )
3297
+
3298
+ # Create full resolution maps
3299
+ for i in range(height):
3300
+ for j in range(width):
3301
+ # Inverse-distance weighting over all control points
3302
+ dx = j - control_points[:, 0]
3303
+ dy = i - control_points[:, 1]
3304
+ dist = dx * dx + dy * dy
3305
+ weights = 1 / (dist + 1e-8)
3306
+ weights = weights / np.sum(weights)
3307
+
3308
+ map_x[i, j] = np.sum(weights * control_points[:, 2])
3309
+ map_y[i, j] = np.sum(weights * control_points[:, 3])
3310
+
3311
+ # Ensure output is within bounds
3312
+ map_x = np.clip(map_x, 0, width - 1, out=map_x)
3313
+ map_y = np.clip(map_y, 0, height - 1, out=map_y)
3314
+
3315
+ return map_x, map_y
3316
+
3317
+
3318
+ @handle_empty_array("bboxes")
3319
+ def bboxes_piecewise_affine(
3320
+ bboxes: np.ndarray,
3321
+ map_x: np.ndarray,
3322
+ map_y: np.ndarray,
3323
+ border_mode: int,
3324
+ image_shape: tuple[int, int],
3325
+ ) -> np.ndarray:
3326
+ """Apply a piecewise affine transformation to bounding boxes.
3327
+
3328
+ This function applies a piecewise affine transformation to the bounding boxes of an image.
3329
+ It first converts the bounding boxes to masks, then applies the transformation, and finally
3330
+ converts the transformed masks back to bounding boxes.
3331
+
3332
+ Args:
3333
+ bboxes (np.ndarray): The bounding boxes to transform.
3334
+ map_x (np.ndarray): The x-coordinates of the transformation.
3335
+ map_y (np.ndarray): The y-coordinates of the transformation.
3336
+ border_mode (int): The border mode to use for the transformation.
3337
+ image_shape (tuple[int, int]): The shape of the image.
3338
+
3339
+ Returns:
3340
+ np.ndarray: The transformed bounding boxes.
3341
+
3342
+ """
3343
+ masks = masks_from_bboxes(bboxes, image_shape).transpose(1, 2, 0)
3344
+
3345
+ map_xy = np.stack([map_x, map_y], axis=-1).astype(np.float32)
3346
+
3347
+ # Call remap with the combined map and empty second map
3348
+ transformed_masks = cv2.remap(
3349
+ masks,
3350
+ map_xy,
3351
+ None,
3352
+ cv2.INTER_NEAREST,
3353
+ borderMode=border_mode,
3354
+ borderValue=0,
3355
+ )
3356
+
3357
+ if transformed_masks.ndim == NUM_MULTI_CHANNEL_DIMENSIONS:
3358
+ transformed_masks = transformed_masks.transpose(2, 0, 1)
3359
+
3360
+ # Recompute bbox coordinates from the transformed masks
3361
+ bboxes[:, :4] = bboxes_from_masks(transformed_masks)
3362
+
3363
+ return bboxes
3364
+
3365
+
3366
+ def get_dimension_padding(
3367
+ current_size: int,
3368
+ min_size: int | None,
3369
+ divisor: int | None,
3370
+ ) -> tuple[int, int]:
3371
+ """Calculate padding for a single dimension.
3372
+
3373
+ Args:
3374
+ current_size (int): Current size of the dimension
3375
+ min_size (int | None): Minimum size requirement, if any
3376
+ divisor (int | None): Divisor for padding to make size divisible, if any
3377
+
3378
+ Returns:
3379
+ tuple[int, int]: (pad_before, pad_after)
3380
+
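+ Examples:
+ Small illustrative examples (added for clarity):
+ >>> get_dimension_padding(100, 128, None)  # pad up to a minimum size
+ (14, 14)
+ >>> get_dimension_padding(100, None, 32)  # pad to the next multiple of 32
+ (14, 14)
+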
3381
+ """
3382
+ if min_size is not None:
3383
+ if current_size < min_size:
3384
+ pad_before = int((min_size - current_size) / 2.0)
3385
+ pad_after = min_size - current_size - pad_before
3386
+ return pad_before, pad_after
3387
+ elif divisor is not None:
3388
+ remainder = current_size % divisor
3389
+ if remainder > 0:
3390
+ total_pad = divisor - remainder
3391
+ pad_before = total_pad // 2
3392
+ pad_after = total_pad - pad_before
3393
+ return pad_before, pad_after
3394
+
3395
+ return 0, 0
3396
+
3397
+
3398
+ def get_padding_params(
3399
+ image_shape: tuple[int, int],
3400
+ min_height: int | None,
3401
+ min_width: int | None,
3402
+ pad_height_divisor: int | None,
3403
+ pad_width_divisor: int | None,
3404
+ ) -> tuple[int, int, int, int]:
3405
+ """Calculate padding parameters based on target dimensions.
3406
+
3407
+ Args:
3408
+ image_shape (tuple[int, int]): (height, width) of the image
3409
+ min_height (int | None): Minimum height requirement, if any
3410
+ min_width (int | None): Minimum width requirement, if any
3411
+ pad_height_divisor (int | None): Divisor for height padding, if any
3412
+ pad_width_divisor (int | None): Divisor for width padding, if any
3413
+
3414
+ Returns:
3415
+ tuple[int, int, int, int]: (pad_top, pad_bottom, pad_left, pad_right)
3416
+
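+ Examples:
+ Small illustrative example (added for clarity):
+ >>> get_padding_params((100, 100), 128, None, None, 32)
+ (14, 14, 14, 14)
+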
3417
+ """
3418
+ rows, cols = image_shape[:2]
3419
+
3420
+ h_pad_top, h_pad_bottom = get_dimension_padding(
3421
+ rows,
3422
+ min_height,
3423
+ pad_height_divisor,
3424
+ )
3425
+ w_pad_left, w_pad_right = get_dimension_padding(cols, min_width, pad_width_divisor)
3426
+
3427
+ return h_pad_top, h_pad_bottom, w_pad_left, w_pad_right
3428
+
3429
+
3430
+ def adjust_padding_by_position(
3431
+ h_top: int,
3432
+ h_bottom: int,
3433
+ w_left: int,
3434
+ w_right: int,
3435
+ position: Literal["center", "top_left", "top_right", "bottom_left", "bottom_right", "random"],
3436
+ py_random: np.random.RandomState,
3437
+ ) -> tuple[int, int, int, int]:
3438
+ """Adjust padding values based on desired position."""
3439
+ if position == "center":
3440
+ return h_top, h_bottom, w_left, w_right
3441
+
3442
+ if position == "top_left":
3443
+ return 0, h_top + h_bottom, 0, w_left + w_right
3444
+
3445
+ if position == "top_right":
3446
+ return 0, h_top + h_bottom, w_left + w_right, 0
3447
+
3448
+ if position == "bottom_left":
3449
+ return h_top + h_bottom, 0, 0, w_left + w_right
3450
+
3451
+ if position == "bottom_right":
3452
+ return h_top + h_bottom, 0, w_left + w_right, 0
3453
+
3454
+ if position == "random":
3455
+ h_pad = h_top + h_bottom
3456
+ w_pad = w_left + w_right
3457
+ h_top = py_random.randint(0, h_pad)
3458
+ h_bottom = h_pad - h_top
3459
+ w_left = py_random.randint(0, w_pad)
3460
+ w_right = w_pad - w_left
3461
+ return h_top, h_bottom, w_left, w_right
3462
+
3463
+ raise ValueError(f"Unknown position: {position}")
3464
+
3465
+
3466
+ def swap_tiles_on_keypoints(
3467
+ keypoints: np.ndarray,
3468
+ tiles: np.ndarray,
3469
+ mapping: np.ndarray,
3470
+ ) -> np.ndarray:
3471
+ """Swap the positions of keypoints based on a tile mapping.
3472
+
3473
+ This function takes a set of keypoints and repositions them according to a mapping of tile swaps.
3474
+ Keypoints are moved from their original tiles to new positions in the swapped tiles.
3475
+
3476
+ Args:
3477
+ keypoints (np.ndarray): A 2D numpy array of shape (N, 2) where N is the number of keypoints.
3478
+ Each row represents a keypoint's (x, y) coordinates.
3479
+ tiles (np.ndarray): A 2D numpy array of shape (M, 4) where M is the number of tiles.
3480
+ Each row represents a tile's (start_y, start_x, end_y, end_x) coordinates.
3481
+ mapping (np.ndarray): A 1D numpy array of shape (M,) where M is the number of tiles.
3482
+ Each element i contains the index of the tile that tile i should be swapped with.
3483
+
3484
+ Returns:
3485
+ np.ndarray: A 2D numpy array of the same shape as the input keypoints, containing the new positions
3486
+ of the keypoints after the tile swap.
3487
+
3488
+ Raises:
3489
+ RuntimeWarning: If any keypoint is not found within any tile.
3490
+
3491
+ Notes:
3492
+ - Keypoints that do not fall within any tile will remain unchanged.
3493
+ - The function assumes that the tiles do not overlap and cover the entire image space.
3494
+
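+ Examples:
+ Small illustrative example: two stacked tiles swapped top-to-bottom:
+ >>> tiles = np.array([[0, 0, 50, 100], [50, 0, 100, 100]])
+ >>> swap_tiles_on_keypoints(np.array([[10, 10]]), tiles, np.array([1, 0]))
+ array([[10, 60]])
+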
3495
+ """
3496
+ if not keypoints.size:
3497
+ return keypoints
3498
+
3499
+ # Broadcast keypoints and tiles for vectorized comparison
3500
+ kp_x = keypoints[:, 0][:, np.newaxis] # Shape: (num_keypoints, 1)
3501
+ kp_y = keypoints[:, 1][:, np.newaxis] # Shape: (num_keypoints, 1)
3502
+
3503
+ start_y, start_x, end_y, end_x = tiles.T # Each shape: (num_tiles,)
3504
+
3505
+ # Check if each keypoint is inside each tile
3506
+ in_tile = (kp_y >= start_y) & (kp_y < end_y) & (kp_x >= start_x) & (kp_x < end_x)
3507
+
3508
+ # Find which tile each keypoint belongs to
3509
+ tile_indices = np.argmax(in_tile, axis=1)
3510
+
3511
+ # Check if any keypoint is not in any tile
3512
+ not_in_any_tile = ~np.any(in_tile, axis=1)
3513
+ if np.any(not_in_any_tile):
3514
+ warn(
3515
+ "Some keypoints are not in any tile. They will be returned unchanged. This is unexpected and should be "
3516
+ "investigated.",
3517
+ RuntimeWarning,
3518
+ stacklevel=2,
3519
+ )
3520
+
3521
+ # Get the new tile indices
3522
+ new_tile_indices = np.array(mapping)[tile_indices]
3523
+
3524
+ # Calculate the offsets
3525
+ old_start_x = tiles[tile_indices, 1]
3526
+ old_start_y = tiles[tile_indices, 0]
3527
+ new_start_x = tiles[new_tile_indices, 1]
3528
+ new_start_y = tiles[new_tile_indices, 0]
3529
+
3530
+ # Apply the transformation
3531
+ new_keypoints = keypoints.copy()
3532
+ new_keypoints[:, 0] = (keypoints[:, 0] - old_start_x) + new_start_x
3533
+ new_keypoints[:, 1] = (keypoints[:, 1] - old_start_y) + new_start_y
3534
+
3535
+ # Keep original coordinates for keypoints not in any tile
3536
+ new_keypoints[not_in_any_tile] = keypoints[not_in_any_tile]
3537
+
3538
+ return new_keypoints
3539
+
3540
+
3541
+ def swap_tiles_on_image(
3542
+ image: np.ndarray,
3543
+ tiles: np.ndarray,
3544
+ mapping: list[int] | None = None,
3545
+ ) -> np.ndarray:
3546
+ """Swap tiles on the image according to the new format.
3547
+
3548
+ Args:
3549
+ image (np.ndarray): Input image.
3550
+ tiles (np.ndarray): Array of tiles with each tile as [start_y, start_x, end_y, end_x].
3551
+ mapping (list[int] | None): list of new tile indices.
3552
+
3553
+ Returns:
3554
+ np.ndarray: Output image with tiles swapped according to the mapping.
3555
+
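+ Examples:
+ Small illustrative example: swapping the two rows of a 2x2 image:
+ >>> img = np.array([[1, 2], [3, 4]], dtype=np.uint8)
+ >>> tiles = np.array([[0, 0, 1, 2], [1, 0, 2, 2]])
+ >>> swap_tiles_on_image(img, tiles, mapping=[1, 0])
+ array([[3, 4],
+ [1, 2]], dtype=uint8)
+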
3556
+ """
3557
+ # If no tiles or no mapping are provided, return a copy of the original image
3558
+ if tiles.size == 0 or mapping is None:
3559
+ return image.copy()
3560
+
3561
+ # Allocate the output image; each tile region is filled from the original image
3562
+ new_image = np.empty_like(image)
3563
+ for num, new_index in enumerate(mapping):
3564
+ start_y, start_x, end_y, end_x = tiles[new_index]
3565
+ start_y_orig, start_x_orig, end_y_orig, end_x_orig = tiles[num]
3566
+ # Assign the corresponding tile from the original image to the new image
3567
+ new_image[start_y:end_y, start_x:end_x] = image[
3568
+ start_y_orig:end_y_orig,
3569
+ start_x_orig:end_x_orig,
3570
+ ]
3571
+
3572
+ return new_image
3573
+
3574
+
3575
+ def is_valid_component(
3576
+ component_area: float,
3577
+ original_area: float,
3578
+ min_area: float | None,
3579
+ min_visibility: float | None,
3580
+ ) -> bool:
3581
+ """Validate if a component meets the minimum requirements."""
3582
+ visibility = component_area / original_area
3583
+ return (min_area is None or component_area >= min_area) and (min_visibility is None or visibility >= min_visibility)
3584
+
3585
+
3586
+ @handle_empty_array("bboxes")
3587
+ def bboxes_grid_shuffle(
3588
+ bboxes: np.ndarray,
3589
+ tiles: np.ndarray,
3590
+ mapping: list[int],
3591
+ image_shape: tuple[int, int],
3592
+ min_area: float,
3593
+ min_visibility: float,
3594
+ ) -> np.ndarray:
3595
+ """Shuffle bounding boxes according to grid mapping.
3596
+
3597
+ Args:
3598
+ bboxes (np.ndarray): Array of bounding boxes with shape (num_boxes, 4+)
3599
+ tiles (np.ndarray): Array of grid tiles
3600
+ mapping (list[int]): Mapping of tile indices
3601
+ image_shape (tuple[int, int]): Shape of the image (height, width)
3602
+ min_area (float): Minimum area of a bounding box to keep
3603
+ min_visibility (float): Minimum visibility ratio of a bounding box to keep
3604
+
3605
+ Returns:
3606
+ np.ndarray: Shuffled bounding boxes
3607
+
3608
+ """
3609
+ # Convert bboxes to masks
3610
+ masks = masks_from_bboxes(bboxes, image_shape)
3611
+
3612
+ # Apply grid shuffle to each mask and handle split components
3613
+ all_component_masks = []
3614
+ extra_bbox_data = [] # Store additional bbox data for each component
3615
+
3616
+ for idx, mask in enumerate(masks):
3617
+ original_area = np.sum(mask) # Get original mask area
3618
+
3619
+ # Shuffle the mask
3620
+ shuffled_mask = swap_tiles_on_image(mask, tiles, mapping)
3621
+
3622
+ # Find connected components
3623
+ num_components, components = cv2.connectedComponents(
3624
+ shuffled_mask.astype(np.uint8),
3625
+ )
3626
+
3627
+ # For each component, create a separate binary mask
3628
+ for comp_idx in range(1, num_components): # Skip background (0)
3629
+ component_mask = (components == comp_idx).astype(np.uint8)
3630
+
3631
+ # Calculate area and visibility ratio
3632
+ component_area = np.sum(component_mask)
3633
+ # Check if component meets minimum requirements
3634
+ if is_valid_component(
3635
+ component_area,
3636
+ original_area,
3637
+ min_area,
3638
+ min_visibility,
3639
+ ):
3640
+ all_component_masks.append(component_mask)
3641
+ # Append additional bbox data for this component
3642
+ if bboxes.shape[1] > NUM_BBOXES_COLUMNS_IN_ALBUMENTATIONS:
3643
+ extra_bbox_data.append(bboxes[idx, 4:])
3644
+
3645
+ # Convert all component masks to bboxes
3646
+ if all_component_masks:
3647
+ all_component_masks = np.array(all_component_masks)
3648
+ shuffled_bboxes = bboxes_from_masks(all_component_masks)
3649
+
3650
+ # Add back additional bbox data if present
3651
+ if extra_bbox_data:
3652
+ extra_bbox_data = np.array(extra_bbox_data)
3653
+ return np.column_stack([shuffled_bboxes, extra_bbox_data])
3654
+ else:
3655
+ # Handle case where no valid components were found
3656
+ return np.zeros((0, bboxes.shape[1]), dtype=bboxes.dtype)
3657
+
3658
+ return shuffled_bboxes
3659
+
3660
+
3661
+ def create_shape_groups(tiles: np.ndarray) -> dict[tuple[int, int], list[int]]:
3662
+ """Groups tiles by their shape and stores the indices for each shape."""
3663
+ shape_groups = defaultdict(list)
3664
+ for index, (start_y, start_x, end_y, end_x) in enumerate(tiles):
3665
+ shape = (end_y - start_y, end_x - start_x)
3666
+ shape_groups[shape].append(index)
3667
+ return shape_groups
3668
+
3669
+
3670
+ def shuffle_tiles_within_shape_groups(
3671
+ shape_groups: dict[tuple[int, int], list[int]],
3672
+ random_generator: np.random.Generator,
3673
+ ) -> list[int]:
3674
+ """Shuffles indices within each group of similar shapes and creates a list where each
3675
+ index points to the index of the tile it should be mapped to.
3676
+
3677
+ Args:
3678
+ shape_groups (dict[tuple[int, int], list[int]]): Groups of tile indices categorized by shape.
3679
+ random_generator (np.random.Generator): The random generator to use for shuffling
3680
+ the indices within each shape group.
3681
+
3682
+ Returns:
3683
+ list[int]: A list where each index is mapped to the new index of the tile after shuffling.
3684
+
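+ Examples:
+ Illustrative sketch: four same-shaped tiles form a single group, so the
+ resulting mapping is a permutation of their indices:
+ >>> tiles = np.array([[0, 0, 50, 50], [0, 50, 50, 100], [50, 0, 100, 50], [50, 50, 100, 100]])
+ >>> groups = create_shape_groups(tiles)
+ >>> sorted(shuffle_tiles_within_shape_groups(groups, np.random.default_rng(0)))
+ [0, 1, 2, 3]
+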
3685
+ """
3686
+ # Initialize the output list with the same size as the total number of tiles, filled with -1
3687
+ num_tiles = sum(len(indices) for indices in shape_groups.values())
3688
+ mapping = [-1] * num_tiles
3689
+
3690
3692
+ for indices in shape_groups.values():
3693
+ shuffled_indices = indices.copy()
3694
+ random_generator.shuffle(shuffled_indices)
3695
+
3696
+ for old, new in zip(indices, shuffled_indices):
3697
+ mapping[old] = new
3698
+
3699
+ return mapping
3700
+
3701
+
3702
+ def compute_pairwise_distances(
3703
+ points1: np.ndarray,
3704
+ points2: np.ndarray,
3705
+ ) -> np.ndarray:
3706
+ """Compute pairwise distances between two sets of points.
3707
+
3708
+ Args:
3709
+ points1 (np.ndarray): First set of points with shape (N, 2)
3710
+ points2 (np.ndarray): Second set of points with shape (M, 2)
3711
+
3712
+ Returns:
3713
+ np.ndarray: Matrix of pairwise squared distances with shape (N, M)
3714
+
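+ Examples:
+ Small illustrative example (the values are squared Euclidean distances):
+ >>> compute_pairwise_distances(np.array([[0.0, 0.0]]), np.array([[3.0, 4.0]]))
+ array([[25.]], dtype=float32)
+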
3715
+ """
3716
+ points1 = np.ascontiguousarray(points1, dtype=np.float32)
3717
+ points2 = np.ascontiguousarray(points2, dtype=np.float32)
3718
+
3719
+ # Compute squared terms
3720
+ p1_squared = cv2.multiply(points1, points1).sum(axis=1, keepdims=True)
3721
+ p2_squared = cv2.multiply(points2, points2).sum(axis=1)[None, :]
3722
+
3723
+ # Compute dot product
3724
+ dot_product = cv2.gemm(points1, points2.T, 1, None, 0)
3725
+
3726
+ return p1_squared + p2_squared - 2 * dot_product
3727
+
3728
+
3729
+ def compute_tps_weights(
3730
+ src_points: np.ndarray,
3731
+ dst_points: np.ndarray,
3732
+ ) -> tuple[np.ndarray, np.ndarray]:
3733
+ """Compute Thin Plate Spline weights.
3734
+
3735
+ Args:
3736
+ src_points (np.ndarray): Source control points with shape (num_points, 2)
3737
+ dst_points (np.ndarray): Destination control points with shape (num_points, 2)
3738
+
3739
+ Returns:
3740
+ tuple[np.ndarray, np.ndarray]: Tuple of (nonlinear_weights, affine_weights)
3741
+ - nonlinear_weights: TPS kernel weights for nonlinear deformation (num_points, 2)
3742
+ - affine_weights: Weights for affine transformation (3, 2)
3743
+ [constant term, x scale/shear, y scale/shear]
3744
+
3745
+ Note:
3746
+ The TPS interpolation is decomposed into:
3747
+ 1. Nonlinear part (controlled by kernel weights)
3748
+ 2. Affine part (global scaling, rotation, translation)
3749
+
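+ Examples:
+ Illustrative sketch: fit weights on control points, then apply them with
+ tps_transform (defined below); the warped points approximate dst_points:
+ >>> src = np.array([[0, 0], [1, 0], [0, 1], [1, 1]], dtype=np.float32)
+ >>> nonlinear_w, affine_w = compute_tps_weights(src, src + 0.25)
+ >>> warped = tps_transform(src, src, nonlinear_w, affine_w)
+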
3750
+ """
3751
+ num_points = src_points.shape[0]
3752
+
3753
+ # Compute pairwise distances
3754
+ distances = compute_pairwise_distances(src_points, src_points)
3755
+
3756
+ kernel_matrix = np.where(
3757
+ distances > 0,
3758
+ distances * cv2.log(distances + 1e-6),  # d * log(d) with d the squared distance, matching tps_transform
3759
+ 0,
3760
+ ).astype(np.float32)
3761
+
3762
+ # Build system matrix efficiently
3763
+ affine_terms = np.empty((num_points, 3), dtype=np.float32)
3764
+ affine_terms[:, 0] = 1
3765
+ affine_terms[:, 1:] = src_points
3766
+
3767
+ # Construct system matrix
3768
+ system_matrix = np.zeros((num_points + 3, num_points + 3), dtype=np.float32)
3769
+ system_matrix[:num_points, :num_points] = kernel_matrix
3770
+ system_matrix[:num_points, num_points:] = affine_terms
3771
+ system_matrix[num_points:, :num_points] = affine_terms.T
3772
+
3773
+ # Prepare target coordinates
3774
+ target = np.zeros((num_points + 3, 2), dtype=np.float32)
3775
+ target[:num_points] = dst_points
3776
+
3777
+ weights = cv2.solve(system_matrix, target, flags=cv2.DECOMP_LU)[1]
3778
+
3779
+ return weights[:num_points], weights[num_points:]
3780
+
3781
+
3782
+ def tps_transform(
3783
+ target_points: np.ndarray,
3784
+ control_points: np.ndarray,
3785
+ nonlinear_weights: np.ndarray,
3786
+ affine_weights: np.ndarray,
3787
+ ) -> np.ndarray:
3788
+ """Apply TPS transformation with consistent types."""
3789
+ # Ensure float32 type for all inputs
3790
+ target_points = np.ascontiguousarray(target_points, dtype=np.float32)
3791
+ control_points = np.ascontiguousarray(control_points, dtype=np.float32)
3792
+ nonlinear_weights = np.ascontiguousarray(nonlinear_weights, dtype=np.float32)
3793
+ affine_weights = np.ascontiguousarray(affine_weights, dtype=np.float32)
3794
+
3795
+ distances = compute_pairwise_distances(target_points, control_points)
3796
+
3797
+ # Ensure kernel matrix is float32
3798
+ kernel_matrix = np.where(
3799
+ distances > 0,
3800
+ distances * cv2.log(distances + 1e-6),
3801
+ 0,
3802
+ ).astype(np.float32)
3803
+
3804
+ # Prepare affine terms
3805
+ num_points = len(target_points)
3806
+ affine_terms = np.empty((num_points, 3), dtype=np.float32)
3807
+ affine_terms[:, 0] = 1
3808
+ affine_terms[:, 1:] = target_points
3809
+
3810
+ # Matrix multiplications with consistent float32 type
3811
+ nonlinear_part = cv2.gemm(kernel_matrix, nonlinear_weights, 1, None, 0)
3812
+ affine_part = cv2.gemm(affine_terms, affine_weights, 1, None, 0)
3813
+
3814
+ return nonlinear_part + affine_part
3815
+
3816
+
3817
+ def get_camera_matrix_distortion_maps(
3818
+ image_shape: tuple[int, int],
3819
+ k: float,
3820
+ ) -> tuple[np.ndarray, np.ndarray]:
3821
+ """Generate distortion maps using camera matrix model.
3822
+
3823
+ Args:
3824
+ image_shape (tuple[int, int]): Image shape (height, width)
3825
+ k (float): Distortion coefficient
3826
+
3827
+ Returns:
3828
+ tuple[np.ndarray, np.ndarray]: Tuple of (map_x, map_y) distortion maps
3829
+
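+ Examples:
+ Illustrative sketch, assuming image is a loaded numpy array:
+ >>> map_x, map_y = get_camera_matrix_distortion_maps(image.shape[:2], k=0.2)
+ >>> distorted = cv2.remap(image, map_x, map_y, cv2.INTER_LINEAR)
+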
3830
+ """
3831
+ height, width = image_shape[:2]
3832
+
3833
+ center_x, center_y = width / 2, height / 2
3834
+
3835
+ camera_matrix = np.array(
3836
+ [[width, 0, center_x], [0, height, center_y], [0, 0, 1]],
3837
+ dtype=np.float32,
3838
+ )
3839
+ distortion = np.array([k, k, 0, 0, 0], dtype=np.float32)
3840
+ return cv2.initUndistortRectifyMap(
3841
+ camera_matrix,
3842
+ distortion,
3843
+ None,
3844
+ None,
3845
+ (width, height),
3846
+ cv2.CV_32FC1,
3847
+ )
3848
+
3849
+
3850
+ def get_fisheye_distortion_maps(
3851
+ image_shape: tuple[int, int],
3852
+ k: float,
3853
+ ) -> tuple[np.ndarray, np.ndarray]:
3854
+ """Generate distortion maps using fisheye model.
3855
+
3856
+ Args:
3857
+ image_shape (tuple[int, int]): Image shape (height, width)
3858
+ k (float): Distortion coefficient
3859
+
3860
+ Returns:
3861
+ tuple[np.ndarray, np.ndarray]: Tuple of (map_x, map_y) distortion maps
3862
+
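+ Examples:
+ Illustrative sketch, assuming image is a loaded numpy array:
+ >>> map_x, map_y = get_fisheye_distortion_maps(image.shape[:2], k=0.5)
+ >>> distorted = cv2.remap(image, map_x, map_y, cv2.INTER_LINEAR)
+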
3863
+ """
3864
+ height, width = image_shape[:2]
3865
+
3866
+ center_x, center_y = width / 2, height / 2
3867
+ # Create coordinate grid
3868
+ y, x = np.mgrid[:height, :width].astype(np.float32)
3869
+
3870
+ x = x - center_x
3871
+ y = y - center_y
3872
+
3873
+ # Calculate polar coordinates
3874
+ r = np.sqrt(x * x + y * y)
3875
+ theta = np.arctan2(y, x)
3876
+
3877
+ # Normalize radius by the maximum possible radius to keep distortion in check
3878
+ max_radius = math.sqrt(max(center_x, width - center_x) ** 2 + max(center_y, height - center_y) ** 2)
3879
+ r_norm = r / max_radius
3880
+
3881
+ # Apply fisheye distortion to normalized radius
3882
+ r_dist = r * (1 + k * r_norm * r_norm)
3883
+
3884
+ # Convert back to cartesian coordinates
3885
+ map_x = r_dist * np.cos(theta) + center_x
3886
+ map_y = r_dist * np.sin(theta) + center_y
3887
+
3888
+ return map_x, map_y
3889
+
3890
+
3891
+ def generate_control_points(num_control_points: int) -> np.ndarray:
3892
+ """Generate control points for TPS transformation.
3893
+
3894
+ Args:
3895
+ num_control_points (int): Number of control points per side
3896
+
3897
+ Returns:
3898
+ np.ndarray: Control points with shape (N, 2)
3899
+
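+ Examples:
+ Small illustrative examples (added for clarity):
+ >>> generate_control_points(2).shape  # 4 corners + center
+ (5, 2)
+ >>> generate_control_points(3).shape  # regular 3x3 grid
+ (9, 2)
+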
3900
+ """
3901
+ if num_control_points == 2:
3902
+ # Generate 4 corners + center point similar to Kornia
3903
+ return np.array(
3904
+ [
3905
+ [0, 0], # top-left
3906
+ [0, 1], # bottom-left
3907
+ [1, 0], # top-right
3908
+ [1, 1], # bottom-right
3909
+ [0.5, 0.5], # center
3910
+ ],
3911
+ dtype=np.float32,
3912
+ )
3913
+
3914
+ # Generate regular grid
3915
+ x = np.linspace(0, 1, num_control_points)
3916
+ y = np.linspace(0, 1, num_control_points)
3917
+ return np.stack(np.meshgrid(x, y), axis=-1).reshape(-1, 2)
3918
+
3919
+
3920
+ def volume_hflip(volume: np.ndarray) -> np.ndarray:
3921
+ """Perform horizontal flip on a volume (numpy array).
3922
+
3923
+ Flips the volume along the width axis (axis=2). Handles inputs with
3924
+ shapes (D, H, W) or (D, H, W, C).
3925
+
3926
+ Args:
3927
+ volume (np.ndarray): Input volume.
3928
+
3929
+ Returns:
3930
+ np.ndarray: Horizontally flipped volume.
3931
+
3932
+ """
3933
+ return np.flip(volume, axis=2)
3934
+
3935
+
3936
+ def volume_vflip(volume: np.ndarray) -> np.ndarray:
3937
+ """Perform vertical flip on a volume (numpy array).
3938
+
3939
+ Flips the volume along the height axis (axis=1). Handles inputs with
3940
+ shapes (D, H, W) or (D, H, W, C).
3941
+
3942
+ Args:
3943
+ volume (np.ndarray): Input volume.
3944
+
3945
+ Returns:
3946
+ np.ndarray: Vertically flipped volume.
3947
+
3948
+ """
3949
+ return np.flip(volume, axis=1)
3950
+
3951
+
3952
+ def volumes_hflip(volumes: np.ndarray) -> np.ndarray:
3953
+ """Perform horizontal flip on a batch of volumes (numpy array).
3954
+
3955
+ Flips the volumes along the width axis (axis=3). Handles inputs with
3956
+ shapes (B, D, H, W) or (B, D, H, W, C).
3957
+
3958
+ Args:
3959
+ volumes (np.ndarray): Input batch of volumes.
3960
+
3961
+ Returns:
3962
+ np.ndarray: Horizontally flipped batch of volumes.
3963
+
3964
+ """
3965
+ # Width axis is 3 for both (B, D, H, W) and (B, D, H, W, C)
3966
+ return np.flip(volumes, axis=3)
3967
+
3968
+
3969
+ def volumes_vflip(volumes: np.ndarray) -> np.ndarray:
3970
+ """Perform vertical flip on a batch of volumes (numpy array).
3971
+
3972
+ Flips the volumes along the height axis (axis=2). Handles inputs with
3973
+ shapes (B, D, H, W) or (B, D, H, W, C).
3974
+
3975
+ Args:
3976
+ volumes (np.ndarray): Input batch of volumes.
3977
+
3978
+ Returns:
3979
+ np.ndarray: Vertically flipped batch of volumes.
3980
+
3981
+ """
3982
+ # Height axis is 2 for both (B, D, H, W) and (B, D, H, W, C)
3983
+ return np.flip(volumes, axis=2)
3984
+
3985
+
3986
+ def volume_rot90(volume: np.ndarray, factor: Literal[0, 1, 2, 3]) -> np.ndarray:
3987
+ """Rotate a volume 90 degrees counter-clockwise multiple times.
3988
+
3989
+ Rotates the volume in the height-width plane (axes 1 and 2).
3990
+ Handles inputs with shapes (D, H, W) or (D, H, W, C).
3991
+
3992
+ Args:
3993
+ volume (np.ndarray): Input volume.
3994
+ factor (Literal[0, 1, 2, 3]): Number of 90-degree rotations.
3995
+
3996
+ Returns:
3997
+ np.ndarray: Rotated volume.
3998
+
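+ Examples:
+ Small illustrative example: height and width swap for odd factors:
+ >>> volume_rot90(np.zeros((4, 32, 64)), 1).shape
+ (4, 64, 32)
+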
3999
+ """
4000
+ # Axes 1 (height) and 2 (width) for rotation
4001
+ return np.rot90(volume, k=factor, axes=(1, 2))
4002
+
4003
+
4004
+ def volumes_rot90(volumes: np.ndarray, factor: Literal[0, 1, 2, 3]) -> np.ndarray:
4005
+ """Rotate a batch of volumes 90 degrees counter-clockwise multiple times.
4006
+
4007
+ Rotates the volumes in the height-width plane (axes 2 and 3).
4008
+ Handles inputs with shapes (B, D, H, W) or (B, D, H, W, C).
4009
+
4010
+ Args:
4011
+ volumes (np.ndarray): Input batch of volumes.
4012
+ factor (Literal[0, 1, 2, 3]): Number of 90-degree rotations.
4013
+
4014
+ Returns:
4015
+ np.ndarray: Rotated batch of volumes.
4016
+
4017
+ """
4018
+ # Axes 2 (height) and 3 (width) for rotation
4019
+ return np.rot90(volumes, k=factor, axes=(2, 3))
4020
+
4021
+
4022
+ @preserve_channel_dim
4023
+ def erode(img: np.ndarray, kernel: np.ndarray) -> np.ndarray:
4024
+ """Apply erosion to an image.
4025
+
4026
+ This function applies erosion to an image using the cv2.erode function.
4027
+
4028
+ Args:
4029
+ img (np.ndarray): Input image as a numpy array.
4030
+ kernel (np.ndarray): Kernel as a numpy array.
4031
+
4032
+ Returns:
4033
+ np.ndarray: The eroded image.
4034
+
4035
+ """
4036
+ return cv2.erode(img, kernel, iterations=1)
4037
+
4038
+
4039
+ @preserve_channel_dim
4040
+ def dilate(img: np.ndarray, kernel: np.ndarray) -> np.ndarray:
4041
+ """Apply dilation to an image.
4042
+
4043
+ This function applies dilation to an image using the cv2.dilate function.
4044
+
4045
+ Args:
4046
+ img (np.ndarray): Input image as a numpy array.
4047
+ kernel (np.ndarray): Kernel as a numpy array.
4048
+
4049
+ Returns:
4050
+ np.ndarray: The dilated image.
4051
+
4052
+ """
4053
+ return cv2.dilate(img, kernel, iterations=1)
4054
+
4055
+
4056
+ def morphology(
4057
+ img: np.ndarray,
4058
+ kernel: np.ndarray,
4059
+ operation: Literal["dilation", "erosion"],
4060
+ ) -> np.ndarray:
4061
+ """Apply morphology to an image.
4062
+
4063
+ This function applies morphology to an image using the cv2.morphologyEx function.
4064
+
4065
+ Args:
4066
+ img (np.ndarray): Input image as a numpy array.
4067
+ kernel (np.ndarray): Kernel as a numpy array.
4068
+ operation (Literal["dilation", "erosion"]): The operation to apply.
4069
+
4070
+ Returns:
4071
+ np.ndarray: The morphology applied to the image.
4072
+
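+ Examples:
+ Illustrative sketch, assuming img is a loaded numpy array:
+ >>> kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
+ >>> thickened = morphology(img, kernel, "dilation")
+ >>> thinned = morphology(img, kernel, "erosion")
+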
4073
+ """
4074
+ if operation == "dilation":
4075
+ return dilate(img, kernel)
4076
+ if operation == "erosion":
4077
+ return erode(img, kernel)
4078
+
4079
+ raise ValueError(f"Unsupported operation: {operation}")
4080
+
4081
+
4082
+ @handle_empty_array("bboxes")
4083
+ def bboxes_morphology(
4084
+ bboxes: np.ndarray,
4085
+ kernel: np.ndarray,
4086
+ operation: Literal["dilation", "erosion"],
4087
+ image_shape: tuple[int, int],
4088
+ ) -> np.ndarray:
4089
+ """Apply morphology to bounding boxes.
4090
+
4091
+ This function applies morphology to bounding boxes by first converting the bounding
4092
+ boxes to a mask and then applying the morphology to the mask.
4093
+
4094
+ Args:
4095
+ bboxes (np.ndarray): Bounding boxes as a numpy array.
4096
+ kernel (np.ndarray): Kernel as a numpy array.
4097
+ operation (Literal["dilation", "erosion"]): The operation to apply.
4098
+ image_shape (tuple[int, int]): The shape of the image.
4099
+
4100
+ Returns:
4101
+ np.ndarray: The morphology applied to the bounding boxes.
4102
+
4103
+ """
4104
+ bboxes = bboxes.copy()
4105
+ masks = masks_from_bboxes(bboxes, image_shape)
4106
+ masks = morphology(masks, kernel, operation)
4107
+ bboxes[:, :4] = bboxes_from_masks(masks)
4108
+ return bboxes
4109
+
4110
+
4111
+ D4_TRANSFORMATIONS_IMAGES = {
4112
+ "e": lambda x: x, # Identity transformation
4113
+ "r90": lambda x: rot90_images(x, 1), # Rotate 90 degrees
4114
+ "r180": lambda x: rot90_images(x, 2), # Rotate 180 degrees
4115
+ "r270": lambda x: rot90_images(x, 3), # Rotate 270 degrees
4116
+ "v": vflip, # Vertical flip (already batch-aware)
4117
+ "hvt": lambda x: transpose_images(rot90_images(x, 2)), # Reflect over anti-diagonal
4118
+ "h": hflip, # Horizontal flip (already batch-aware)
4119
+ "t": transpose_images, # Transpose (reflect over main diagonal)
4120
+ }
4121
+
4122
+
4123
+ def d4_images(img: np.ndarray, group_member: Literal["e", "r90", "r180", "r270", "v", "hvt", "h", "t"]) -> np.ndarray:
4124
+ """Applies a `D_4` symmetry group transformation to a batch of images.
4125
+
4126
+ This function manipulates a batch of images using transformations such as rotations and flips,
4127
+ corresponding to the `D_4` dihedral group symmetry operations.
4128
+ Each transformation is identified by a unique group member code.
4129
+
4130
+ Args:
4131
+ img (np.ndarray): The input batch of images to transform with shape:
4132
+ - (N, H, W) for grayscale images
4133
+ - (N, H, W, C) for multi-channel images
4134
+ where N is the batch size, H is height, W is width, C is channels
4135
+ group_member (Literal["e", "r90", "r180", "r270", "v", "hvt", "h", "t"]): A string identifier indicating
4136
+ the specific transformation to apply. Valid codes include:
4137
+ - 'e': Identity (no transformation).
4138
+ - 'r90': Rotate 90 degrees counterclockwise.
4139
+ - 'r180': Rotate 180 degrees.
4140
+ - 'r270': Rotate 270 degrees counterclockwise.
4141
+ - 'v': Vertical flip.
4142
+ - 'hvt': Reflect over the anti-diagonal.
4143
+ - 'h': Horizontal flip.
4144
+ - 't': Transpose (reflect over the main diagonal).
4145
+
4146
+ Returns:
4147
+ np.ndarray: The transformed batch of images.
4148
+
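+ Examples:
+ Illustrative sketch, assuming rotations act in the (H, W) plane:
+ >>> batch = np.zeros((8, 64, 32))
+ >>> d4_images(batch, "r90").shape
+ (8, 32, 64)
+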
4149
+ """
4150
+ # Execute the appropriate transformation
4151
+ return D4_TRANSFORMATIONS_IMAGES[group_member](img)