nrtk-albumentations 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nrtk-albumentations might be problematic.

Files changed (62)
  1. albumentations/__init__.py +21 -0
  2. albumentations/augmentations/__init__.py +23 -0
  3. albumentations/augmentations/blur/__init__.py +0 -0
  4. albumentations/augmentations/blur/functional.py +438 -0
  5. albumentations/augmentations/blur/transforms.py +1633 -0
  6. albumentations/augmentations/crops/__init__.py +0 -0
  7. albumentations/augmentations/crops/functional.py +494 -0
  8. albumentations/augmentations/crops/transforms.py +3647 -0
  9. albumentations/augmentations/dropout/__init__.py +0 -0
  10. albumentations/augmentations/dropout/channel_dropout.py +134 -0
  11. albumentations/augmentations/dropout/coarse_dropout.py +567 -0
  12. albumentations/augmentations/dropout/functional.py +1017 -0
  13. albumentations/augmentations/dropout/grid_dropout.py +166 -0
  14. albumentations/augmentations/dropout/mask_dropout.py +274 -0
  15. albumentations/augmentations/dropout/transforms.py +461 -0
  16. albumentations/augmentations/dropout/xy_masking.py +186 -0
  17. albumentations/augmentations/geometric/__init__.py +0 -0
  18. albumentations/augmentations/geometric/distortion.py +1238 -0
  19. albumentations/augmentations/geometric/flip.py +752 -0
  20. albumentations/augmentations/geometric/functional.py +4151 -0
  21. albumentations/augmentations/geometric/pad.py +676 -0
  22. albumentations/augmentations/geometric/resize.py +956 -0
  23. albumentations/augmentations/geometric/rotate.py +864 -0
  24. albumentations/augmentations/geometric/transforms.py +1962 -0
  25. albumentations/augmentations/mixing/__init__.py +0 -0
  26. albumentations/augmentations/mixing/domain_adaptation.py +787 -0
  27. albumentations/augmentations/mixing/domain_adaptation_functional.py +453 -0
  28. albumentations/augmentations/mixing/functional.py +878 -0
  29. albumentations/augmentations/mixing/transforms.py +832 -0
  30. albumentations/augmentations/other/__init__.py +0 -0
  31. albumentations/augmentations/other/lambda_transform.py +180 -0
  32. albumentations/augmentations/other/type_transform.py +261 -0
  33. albumentations/augmentations/pixel/__init__.py +0 -0
  34. albumentations/augmentations/pixel/functional.py +4226 -0
  35. albumentations/augmentations/pixel/transforms.py +7556 -0
  36. albumentations/augmentations/spectrogram/__init__.py +0 -0
  37. albumentations/augmentations/spectrogram/transform.py +220 -0
  38. albumentations/augmentations/text/__init__.py +0 -0
  39. albumentations/augmentations/text/functional.py +272 -0
  40. albumentations/augmentations/text/transforms.py +299 -0
  41. albumentations/augmentations/transforms3d/__init__.py +0 -0
  42. albumentations/augmentations/transforms3d/functional.py +393 -0
  43. albumentations/augmentations/transforms3d/transforms.py +1422 -0
  44. albumentations/augmentations/utils.py +249 -0
  45. albumentations/core/__init__.py +0 -0
  46. albumentations/core/bbox_utils.py +920 -0
  47. albumentations/core/composition.py +1885 -0
  48. albumentations/core/hub_mixin.py +299 -0
  49. albumentations/core/keypoints_utils.py +521 -0
  50. albumentations/core/label_manager.py +339 -0
  51. albumentations/core/pydantic.py +239 -0
  52. albumentations/core/serialization.py +352 -0
  53. albumentations/core/transforms_interface.py +976 -0
  54. albumentations/core/type_definitions.py +127 -0
  55. albumentations/core/utils.py +605 -0
  56. albumentations/core/validation.py +129 -0
  57. albumentations/pytorch/__init__.py +1 -0
  58. albumentations/pytorch/transforms.py +189 -0
  59. nrtk_albumentations-2.1.0.dist-info/METADATA +196 -0
  60. nrtk_albumentations-2.1.0.dist-info/RECORD +62 -0
  61. nrtk_albumentations-2.1.0.dist-info/WHEEL +4 -0
  62. nrtk_albumentations-2.1.0.dist-info/licenses/LICENSE +21 -0
albumentations/augmentations/crops/transforms.py
@@ -0,0 +1,3647 @@
"""Transform classes for cropping operations on images and other data types.

This module provides various crop transforms that can be applied to images, masks,
bounding boxes, and keypoints. The transforms include simple cropping, random cropping,
center cropping, cropping near bounding boxes, and other specialized cropping operations
that maintain the integrity of bounding boxes. These transforms are designed to work within
the albumentations pipeline and can be used for data augmentation in computer vision tasks.
"""

from __future__ import annotations

import math
from collections.abc import Sequence
from typing import Annotated, Any, Literal, Union, cast

import cv2
import numpy as np
from pydantic import AfterValidator, Field, model_validator
from typing_extensions import Self

from albumentations.augmentations.geometric import functional as fgeometric
from albumentations.core.bbox_utils import denormalize_bboxes, normalize_bboxes, union_of_bboxes
from albumentations.core.pydantic import (
    OnePlusIntRangeType,
    ZeroOneRangeType,
    check_range_bounds,
    nondecreasing,
)
from albumentations.core.transforms_interface import BaseTransformInitSchema, DualTransform
from albumentations.core.type_definitions import (
    ALL_TARGETS,
    NUM_MULTI_CHANNEL_DIMENSIONS,
    PAIR,
    PercentType,
    PxType,
)

from . import functional as fcrops

__all__ = [
    "AtLeastOneBBoxRandomCrop",
    "BBoxSafeRandomCrop",
    "CenterCrop",
    "Crop",
    "CropAndPad",
    "CropNonEmptyMaskIfExists",
    "RandomCrop",
    "RandomCropFromBorders",
    "RandomCropNearBBox",
    "RandomResizedCrop",
    "RandomSizedBBoxSafeCrop",
    "RandomSizedCrop",
]


class CropSizeError(Exception):
    pass

class BaseCrop(DualTransform):
    """Base class for transforms that only perform cropping.

    This abstract class provides the foundation for all cropping transformations.
    It handles cropping of different data types including images, masks, bounding boxes,
    keypoints, and volumes while keeping their spatial relationships intact.

    Child classes must implement the `get_params_dependent_on_data` method to determine
    crop coordinates based on transform-specific logic. This method should return a dictionary
    containing at least a 'crop_coords' key with a tuple value (x_min, y_min, x_max, y_max).

    Args:
        p (float): Probability of applying the transform. Default: 1.0.

    Targets:
        image, mask, bboxes, keypoints, volume, mask3d

    Image types:
        uint8, float32

    Note:
        This class is not meant to be used directly. Instead, use or create derived
        transforms that implement the specific cropping behavior required.

    Examples:
        >>> import numpy as np
        >>> import albumentations as A
        >>> from albumentations.augmentations.crops.transforms import BaseCrop
        >>>
        >>> # Example of a custom crop transform that inherits from BaseCrop
        >>> class CustomCenterCrop(BaseCrop):
        ...     '''A simple custom center crop with configurable size'''
        ...     def __init__(self, crop_height, crop_width, p=1.0):
        ...         super().__init__(p=p)
        ...         self.crop_height = crop_height
        ...         self.crop_width = crop_width
        ...
        ...     def get_params_dependent_on_data(self, params, data):
        ...         '''Calculate crop coordinates based on center of image'''
        ...         image_height, image_width = params["shape"][:2]
        ...
        ...         # Calculate center crop coordinates
        ...         x_min = max(0, (image_width - self.crop_width) // 2)
        ...         y_min = max(0, (image_height - self.crop_height) // 2)
        ...         x_max = min(image_width, x_min + self.crop_width)
        ...         y_max = min(image_height, y_min + self.crop_height)
        ...
        ...         return {"crop_coords": (x_min, y_min, x_max, y_max)}
        >>>
        >>> # Prepare sample data
        >>> image = np.random.randint(0, 256, (100, 100, 3), dtype=np.uint8)
        >>> mask = np.random.randint(0, 2, (100, 100), dtype=np.uint8)
        >>> bboxes = np.array([[10, 10, 50, 50], [40, 40, 80, 80]], dtype=np.float32)
        >>> bbox_labels = [1, 2]
        >>> keypoints = np.array([[20, 30], [60, 70]], dtype=np.float32)
        >>> keypoint_labels = [0, 1]
        >>>
        >>> # Use the custom transform in a pipeline
        >>> transform = A.Compose(
        ...     [CustomCenterCrop(crop_height=80, crop_width=80)],
        ...     bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bbox_labels']),
        ...     keypoint_params=A.KeypointParams(format='xy', label_fields=['keypoint_labels'])
        ... )
        >>>
        >>> # Apply the transform to data
        >>> result = transform(
        ...     image=image,
        ...     mask=mask,
        ...     bboxes=bboxes,
        ...     bbox_labels=bbox_labels,
        ...     keypoints=keypoints,
        ...     keypoint_labels=keypoint_labels
        ... )
        >>>
        >>> # Get the transformed data
        >>> transformed_image = result['image']  # Will be 80x80
        >>> transformed_mask = result['mask']  # Will be 80x80
        >>> transformed_bboxes = result['bboxes']  # Bounding boxes adjusted to the cropped area
        >>> transformed_bbox_labels = result['bbox_labels']  # Labels for bboxes that remain after cropping
        >>> transformed_keypoints = result['keypoints']  # Keypoints adjusted to the cropped area
        >>> transformed_keypoint_labels = result['keypoint_labels']  # Labels for keypoints that remain after cropping

    """

    _targets = ALL_TARGETS

    def apply(
        self,
        img: np.ndarray,
        crop_coords: tuple[int, int, int, int],
        **params: Any,
    ) -> np.ndarray:
        """Apply the crop transform to an image.

        Args:
            img (np.ndarray): The image to apply the crop transform to.
            crop_coords (tuple[int, int, int, int]): The coordinates of the crop.
            params (dict[str, Any]): Additional parameters for the transform.

        Returns:
            np.ndarray: The cropped image.

        """
        return fcrops.crop(img, x_min=crop_coords[0], y_min=crop_coords[1], x_max=crop_coords[2], y_max=crop_coords[3])

    def apply_to_bboxes(
        self,
        bboxes: np.ndarray,
        crop_coords: tuple[int, int, int, int],
        **params: Any,
    ) -> np.ndarray:
        """Apply the crop transform to bounding boxes.

        Args:
            bboxes (np.ndarray): The bounding boxes to apply the crop transform to.
            crop_coords (tuple[int, int, int, int]): The coordinates of the crop.
            params (dict[str, Any]): Additional parameters for the transform.

        Returns:
            np.ndarray: The cropped bounding boxes.

        """
        return fcrops.crop_bboxes_by_coords(bboxes, crop_coords, params["shape"][:2])

    def apply_to_keypoints(
        self,
        keypoints: np.ndarray,
        crop_coords: tuple[int, int, int, int],
        **params: Any,
    ) -> np.ndarray:
        """Apply the crop transform to keypoints.

        Args:
            keypoints (np.ndarray): The keypoints to apply the crop transform to.
            crop_coords (tuple[int, int, int, int]): The coordinates of the crop.
            params (dict[str, Any]): Additional parameters for the transform.

        Returns:
            np.ndarray: The cropped keypoints.

        """
        return fcrops.crop_keypoints_by_coords(keypoints, crop_coords)

    def apply_to_images(
        self,
        images: np.ndarray,
        crop_coords: tuple[int, int, int, int],
        **params: Any,
    ) -> np.ndarray:
        return fcrops.volume_crop_yx(images, crop_coords[0], crop_coords[1], crop_coords[2], crop_coords[3])

    def apply_to_volume(
        self,
        volume: np.ndarray,
        crop_coords: tuple[int, int, int, int],
        **params: Any,
    ) -> np.ndarray:
        return self.apply_to_images(volume, crop_coords, **params)

    def apply_to_volumes(
        self,
        volumes: np.ndarray,
        crop_coords: tuple[int, int, int, int],
        **params: Any,
    ) -> np.ndarray:
        return fcrops.volumes_crop_yx(volumes, crop_coords[0], crop_coords[1], crop_coords[2], crop_coords[3])

    def apply_to_mask3d(
        self,
        mask3d: np.ndarray,
        crop_coords: tuple[int, int, int, int],
        **params: Any,
    ) -> np.ndarray:
        return self.apply_to_images(mask3d, crop_coords, **params)

    def apply_to_masks3d(
        self,
        masks3d: np.ndarray,
        crop_coords: tuple[int, int, int, int],
        **params: Any,
    ) -> np.ndarray:
        return self.apply_to_volumes(masks3d, crop_coords, **params)

    @staticmethod
    def _clip_bbox(bbox: tuple[int, int, int, int], image_shape: tuple[int, int]) -> tuple[int, int, int, int]:
        height, width = image_shape[:2]
        x_min, y_min, x_max, y_max = bbox
        x_min = np.clip(x_min, 0, width)
        y_min = np.clip(y_min, 0, height)

        x_max = np.clip(x_max, x_min, width)
        y_max = np.clip(y_max, y_min, height)
        return x_min, y_min, x_max, y_max

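# Illustrative sketch (not part of the released file): BaseCrop._clip_bbox keeps
# a crop window inside the image. The minima are clipped to the image, and the
# maxima are clipped no lower than the already-clipped minima, so a window may
# collapse to zero size but never invert. Assuming a 100x120 (height x width)
# image:
#
#     BaseCrop._clip_bbox((-10, -5, 130, 90), (100, 120))   # -> (0, 0, 120, 90)
#     BaseCrop._clip_bbox((150, 20, 200, 40), (100, 120))   # -> (120, 20, 120, 40)
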
class BaseCropAndPad(BaseCrop):
    """Base class for transforms that need both cropping and padding.

    This abstract class extends BaseCrop by adding padding capabilities. It's the foundation
    for transforms that may need to both crop parts of the input and add padding, such as when
    converting inputs to a specific target size. The class handles the complexities of applying
    these operations to different data types (images, masks, bounding boxes, keypoints) while
    maintaining their spatial relationships.

    Child classes must implement the `get_params_dependent_on_data` method to determine
    crop coordinates and padding parameters based on transform-specific logic.

    Args:
        pad_if_needed (bool): Whether to pad the input if the crop size exceeds input dimensions.
        border_mode (int): OpenCV border mode used for padding.
        fill (tuple[float, ...] | float): Value to fill the padded area if border_mode is BORDER_CONSTANT.
            For multi-channel images, this can be a tuple with a value for each channel.
        fill_mask (tuple[float, ...] | float): Value to fill the padded area in masks.
        pad_position (Literal["center", "top_left", "top_right", "bottom_left", "bottom_right", "random"]):
            Position of padding when pad_if_needed is True.
        p (float): Probability of applying the transform. Default: 1.0.

    Targets:
        image, mask, bboxes, keypoints, volume, mask3d

    Image types:
        uint8, float32

    Note:
        This class is not meant to be used directly. Instead, use or create derived
        transforms that implement the specific cropping and padding behavior required.

    Examples:
        >>> import numpy as np
        >>> import cv2
        >>> import albumentations as A
        >>> from albumentations.augmentations.crops.transforms import BaseCropAndPad
        >>>
        >>> # Example of a custom transform that inherits from BaseCropAndPad
        >>> # This transform crops to a fixed size, padding if needed to maintain dimensions
        >>> class CustomFixedSizeCrop(BaseCropAndPad):
        ...     '''A custom fixed-size crop that pads if needed to maintain output size'''
        ...     def __init__(
        ...         self,
        ...         height=224,
        ...         width=224,
        ...         offset_x=0,  # Offset for crop position
        ...         offset_y=0,  # Offset for crop position
        ...         pad_if_needed=True,
        ...         border_mode=cv2.BORDER_CONSTANT,
        ...         fill=0,
        ...         fill_mask=0,
        ...         pad_position="center",
        ...         p=1.0,
        ...     ):
        ...         super().__init__(
        ...             pad_if_needed=pad_if_needed,
        ...             border_mode=border_mode,
        ...             fill=fill,
        ...             fill_mask=fill_mask,
        ...             pad_position=pad_position,
        ...             p=p,
        ...         )
        ...         self.height = height
        ...         self.width = width
        ...         self.offset_x = offset_x
        ...         self.offset_y = offset_y
        ...
        ...     def get_params_dependent_on_data(self, params, data):
        ...         '''Calculate crop coordinates and padding if needed'''
        ...         image_shape = params["shape"][:2]
        ...         image_height, image_width = image_shape
        ...
        ...         # Calculate crop coordinates with offsets
        ...         x_min = self.offset_x
        ...         y_min = self.offset_y
        ...         x_max = min(x_min + self.width, image_width)
        ...         y_max = min(y_min + self.height, image_height)
        ...
        ...         # Get padding params if needed
        ...         pad_params = self._get_pad_params(
        ...             image_shape,
        ...             (self.height, self.width)
        ...         ) if self.pad_if_needed else None
        ...
        ...         return {
        ...             "crop_coords": (x_min, y_min, x_max, y_max),
        ...             "pad_params": pad_params,
        ...         }
        >>>
        >>> # Prepare sample data
        >>> image = np.random.randint(0, 256, (100, 100, 3), dtype=np.uint8)
        >>> mask = np.random.randint(0, 2, (100, 100), dtype=np.uint8)
        >>> bboxes = np.array([[10, 10, 50, 50], [40, 40, 80, 80]], dtype=np.float32)
        >>> bbox_labels = [1, 2]
        >>> keypoints = np.array([[20, 30], [60, 70]], dtype=np.float32)
        >>> keypoint_labels = [0, 1]
        >>>
        >>> # Use the custom transform in a pipeline
        >>> # This will create a 224x224 crop with padding as needed
        >>> transform = A.Compose(
        ...     [CustomFixedSizeCrop(
        ...         height=224,
        ...         width=224,
        ...         offset_x=20,
        ...         offset_y=10,
        ...         fill=127,  # Gray color for padding
        ...         fill_mask=0
        ...     )],
        ...     bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bbox_labels']),
        ...     keypoint_params=A.KeypointParams(format='xy', label_fields=['keypoint_labels']))
        >>>
        >>> # Apply the transform to data
        >>> result = transform(
        ...     image=image,
        ...     mask=mask,
        ...     bboxes=bboxes,
        ...     bbox_labels=bbox_labels,
        ...     keypoints=keypoints,
        ...     keypoint_labels=keypoint_labels
        ... )
        >>>
        >>> # Get the transformed data
        >>> transformed_image = result['image']  # Will be 224x224 with padding
        >>> transformed_mask = result['mask']  # Will be 224x224 with padding
        >>> transformed_bboxes = result['bboxes']  # Bounding boxes adjusted to the cropped and padded area
        >>> transformed_bbox_labels = result['bbox_labels']  # Bounding box labels after crop
        >>> transformed_keypoints = result['keypoints']  # Keypoints adjusted to the cropped and padded area
        >>> transformed_keypoint_labels = result['keypoint_labels']  # Keypoint labels after crop

    """

    class InitSchema(BaseTransformInitSchema):
        pad_if_needed: bool
        border_mode: Literal[
            cv2.BORDER_CONSTANT,
            cv2.BORDER_REPLICATE,
            cv2.BORDER_REFLECT,
            cv2.BORDER_WRAP,
            cv2.BORDER_REFLECT_101,
        ]
        fill: tuple[float, ...] | float
        fill_mask: tuple[float, ...] | float
        pad_position: Literal["center", "top_left", "top_right", "bottom_left", "bottom_right", "random"]

    def __init__(
        self,
        pad_if_needed: bool,
        border_mode: Literal[
            cv2.BORDER_CONSTANT,
            cv2.BORDER_REPLICATE,
            cv2.BORDER_REFLECT,
            cv2.BORDER_WRAP,
            cv2.BORDER_REFLECT_101,
        ],
        fill: tuple[float, ...] | float,
        fill_mask: tuple[float, ...] | float,
        pad_position: Literal["center", "top_left", "top_right", "bottom_left", "bottom_right", "random"],
        p: float,
    ):
        super().__init__(p=p)
        self.pad_if_needed = pad_if_needed
        self.border_mode = border_mode
        self.fill = fill
        self.fill_mask = fill_mask
        self.pad_position = pad_position

    def _get_pad_params(self, image_shape: tuple[int, int], target_shape: tuple[int, int]) -> dict[str, Any] | None:
        """Calculate padding parameters if needed."""
        if not self.pad_if_needed:
            return None

        h_pad_top, h_pad_bottom, w_pad_left, w_pad_right = fgeometric.get_padding_params(
            image_shape=image_shape,
            min_height=target_shape[0],
            min_width=target_shape[1],
            pad_height_divisor=None,
            pad_width_divisor=None,
        )

        if h_pad_top == h_pad_bottom == w_pad_left == w_pad_right == 0:
            return None

        h_pad_top, h_pad_bottom, w_pad_left, w_pad_right = fgeometric.adjust_padding_by_position(
            h_top=h_pad_top,
            h_bottom=h_pad_bottom,
            w_left=w_pad_left,
            w_right=w_pad_right,
            position=self.pad_position,
            py_random=self.py_random,
        )

        return {
            "pad_top": h_pad_top,
            "pad_bottom": h_pad_bottom,
            "pad_left": w_pad_left,
            "pad_right": w_pad_right,
        }

    def apply(
        self,
        img: np.ndarray,
        crop_coords: tuple[int, int, int, int],
        **params: Any,
    ) -> np.ndarray:
        """Apply the crop and pad transform to an image.

        Args:
            img (np.ndarray): The image to apply the crop and pad transform to.
            crop_coords (tuple[int, int, int, int]): The coordinates of the crop.
            params (dict[str, Any]): Additional parameters for the transform.

        Returns:
            np.ndarray: The cropped and padded image.

        """
        pad_params = params.get("pad_params")
        if pad_params is not None:
            img = fgeometric.pad_with_params(
                img,
                pad_params["pad_top"],
                pad_params["pad_bottom"],
                pad_params["pad_left"],
                pad_params["pad_right"],
                border_mode=self.border_mode,
                value=self.fill,
            )
        return BaseCrop.apply(self, img, crop_coords, **params)

    def apply_to_mask(
        self,
        mask: np.ndarray,
        crop_coords: Any,
        **params: Any,
    ) -> np.ndarray:
        """Apply the crop and pad transform to a mask.

        Args:
            mask (np.ndarray): The mask to apply the crop and pad transform to.
            crop_coords (tuple[int, int, int, int]): The coordinates of the crop.
            params (dict[str, Any]): Additional parameters for the transform.

        Returns:
            np.ndarray: The cropped and padded mask.

        """
        pad_params = params.get("pad_params")
        if pad_params is not None:
            mask = fgeometric.pad_with_params(
                mask,
                pad_params["pad_top"],
                pad_params["pad_bottom"],
                pad_params["pad_left"],
                pad_params["pad_right"],
                border_mode=self.border_mode,
                value=self.fill_mask,
            )
        # Note: super().apply would apply the padding twice, as it loops back to this.apply
        return BaseCrop.apply(self, mask, crop_coords=crop_coords, **params)

    def apply_to_images(
        self,
        images: np.ndarray,
        crop_coords: tuple[int, int, int, int],
        **params: Any,
    ) -> np.ndarray:
        pad_params = params.get("pad_params")
        if pad_params is not None:
            images = fcrops.pad_along_axes(
                images,
                pad_params["pad_top"],
                pad_params["pad_bottom"],
                pad_params["pad_left"],
                pad_params["pad_right"],
                h_axis=1,
                w_axis=2,
                border_mode=self.border_mode,
                pad_value=self.fill,
            )
        return BaseCrop.apply_to_images(self, images, crop_coords, **params)

    def apply_to_volume(
        self,
        volume: np.ndarray,
        crop_coords: tuple[int, int, int, int],
        **params: Any,
    ) -> np.ndarray:
        return self.apply_to_images(volume, crop_coords, **params)

    def apply_to_volumes(
        self,
        volumes: np.ndarray,
        crop_coords: tuple[int, int, int, int],
        **params: Any,
    ) -> np.ndarray:
        pad_params = params.get("pad_params")
        if pad_params is not None:
            volumes = fcrops.pad_along_axes(
                volumes,
                pad_params["pad_top"],
                pad_params["pad_bottom"],
                pad_params["pad_left"],
                pad_params["pad_right"],
                h_axis=2,
                w_axis=3,
                border_mode=self.border_mode,
                pad_value=self.fill,
            )
        return BaseCrop.apply_to_volumes(self, volumes, crop_coords, **params)

    def apply_to_mask3d(
        self,
        mask3d: np.ndarray,
        crop_coords: tuple[int, int, int, int],
        **params: Any,
    ) -> np.ndarray:
        return self.apply_to_images(mask3d, crop_coords, **params)

    def apply_to_masks3d(
        self,
        masks3d: np.ndarray,
        crop_coords: tuple[int, int, int, int],
        **params: Any,
    ) -> np.ndarray:
        return self.apply_to_volumes(masks3d, crop_coords, **params)

    def apply_to_bboxes(
        self,
        bboxes: np.ndarray,
        crop_coords: tuple[int, int, int, int],
        **params: Any,
    ) -> np.ndarray:
        """Apply the crop and pad transform to bounding boxes.

        Args:
            bboxes (np.ndarray): The bounding boxes to apply the crop and pad transform to.
            crop_coords (tuple[int, int, int, int]): The coordinates of the crop.
            params (dict[str, Any]): Additional parameters for the transform.

        Returns:
            np.ndarray: The cropped and padded bounding boxes.

        """
        pad_params = params.get("pad_params")
        image_shape = params["shape"][:2]

        if pad_params is not None:
            # First denormalize bboxes to absolute coordinates
            bboxes_np = denormalize_bboxes(bboxes, image_shape)

            # Apply padding to bboxes (already works with absolute coordinates)
            bboxes_np = fgeometric.pad_bboxes(
                bboxes_np,
                pad_params["pad_top"],
                pad_params["pad_bottom"],
                pad_params["pad_left"],
                pad_params["pad_right"],
                self.border_mode,
                image_shape=image_shape,
            )

            # Update shape to padded dimensions
            padded_height = image_shape[0] + pad_params["pad_top"] + pad_params["pad_bottom"]
            padded_width = image_shape[1] + pad_params["pad_left"] + pad_params["pad_right"]
            padded_shape = (padded_height, padded_width)

            bboxes_np = normalize_bboxes(bboxes_np, padded_shape)

            params["shape"] = padded_shape

            return BaseCrop.apply_to_bboxes(self, bboxes_np, crop_coords, **params)

        # If no padding, use original function behavior
        return BaseCrop.apply_to_bboxes(self, bboxes, crop_coords, **params)

    def apply_to_keypoints(
        self,
        keypoints: np.ndarray,
        crop_coords: tuple[int, int, int, int],
        **params: Any,
    ) -> np.ndarray:
        """Apply the crop and pad transform to keypoints.

        Args:
            keypoints (np.ndarray): The keypoints to apply the crop and pad transform to.
            crop_coords (tuple[int, int, int, int]): The coordinates of the crop.
            params (dict[str, Any]): Additional parameters for the transform.

        Returns:
            np.ndarray: The cropped and padded keypoints.

        """
        pad_params = params.get("pad_params")
        image_shape = params["shape"][:2]

        if pad_params is not None:
            # Calculate padded dimensions
            padded_height = image_shape[0] + pad_params["pad_top"] + pad_params["pad_bottom"]
            padded_width = image_shape[1] + pad_params["pad_left"] + pad_params["pad_right"]

            # First apply padding to keypoints using original image shape
            keypoints = fgeometric.pad_keypoints(
                keypoints,
                pad_params["pad_top"],
                pad_params["pad_bottom"],
                pad_params["pad_left"],
                pad_params["pad_right"],
                self.border_mode,
                image_shape=image_shape,
            )

            # Update image shape for subsequent crop operation
            params = {**params, "shape": (padded_height, padded_width)}

        return BaseCrop.apply_to_keypoints(self, keypoints, crop_coords, **params)

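# Illustrative sketch (not part of the released file): the pad_params dictionary
# that _get_pad_params produces drives every apply_to_* override above. Padding
# a 100x100 image up to a 120x130 (height x width) target needs 20 extra rows
# and 30 extra columns; assuming pad_position="center" splits each delta evenly,
# the method would return
#
#     {"pad_top": 10, "pad_bottom": 10, "pad_left": 15, "pad_right": 15}
#
# Each target is padded with these amounts first and cropped second, which is
# why images, masks, bboxes, and keypoints stay aligned.
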
class RandomCrop(BaseCropAndPad):
    """Crop a random part of the input.

    Args:
        height (int): height of the crop.
        width (int): width of the crop.
        pad_if_needed (bool): Whether to pad if crop size exceeds image size. Default: False.
        border_mode (OpenCV flag): OpenCV border mode used for padding. Default: cv2.BORDER_CONSTANT.
        fill (tuple[float, ...] | float): Padding value for images if border_mode is
            cv2.BORDER_CONSTANT. Default: 0.
        fill_mask (tuple[float, ...] | float): Padding value for masks if border_mode is
            cv2.BORDER_CONSTANT. Default: 0.
        pad_position (Literal['center', 'top_left', 'top_right', 'bottom_left', 'bottom_right', 'random']):
            Position of padding. Default: 'center'.
        p (float): Probability of applying the transform. Default: 1.0.

    Targets:
        image, mask, bboxes, keypoints, volume, mask3d

    Image types:
        uint8, float32

    Note:
        If pad_if_needed is True and crop size exceeds image dimensions, the image will be padded
        before applying the random crop.

    Examples:
        >>> import numpy as np
        >>> import albumentations as A
        >>> import cv2
        >>>
        >>> # Prepare sample data
        >>> image = np.random.randint(0, 256, (100, 100, 3), dtype=np.uint8)
        >>> mask = np.random.randint(0, 2, (100, 100), dtype=np.uint8)
        >>> bboxes = np.array([[10, 10, 50, 50], [40, 40, 80, 80]], dtype=np.float32)
        >>> bbox_labels = [1, 2]
        >>> keypoints = np.array([[20, 30], [60, 70]], dtype=np.float32)
        >>> keypoint_labels = [0, 1]
        >>>
        >>> # Example 1: Basic random crop
        >>> transform = A.Compose([
        ...     A.RandomCrop(height=64, width=64),
        ... ], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bbox_labels']),
        ...    keypoint_params=A.KeypointParams(format='xy', label_fields=['keypoint_labels']))
        >>>
        >>> # Apply the transform
        >>> transformed = transform(
        ...     image=image,
        ...     mask=mask,
        ...     bboxes=bboxes,
        ...     bbox_labels=bbox_labels,
        ...     keypoints=keypoints,
        ...     keypoint_labels=keypoint_labels
        ... )
        >>>
        >>> # Get the transformed data
        >>> transformed_image = transformed['image']  # Will be 64x64
        >>> transformed_mask = transformed['mask']  # Will be 64x64
        >>> transformed_bboxes = transformed['bboxes']  # Bounding boxes adjusted to the cropped area
        >>> transformed_bbox_labels = transformed['bbox_labels']  # Labels for boxes that remain after cropping
        >>> transformed_keypoints = transformed['keypoints']  # Keypoints adjusted to the cropped area
        >>> transformed_keypoint_labels = transformed['keypoint_labels']  # Labels for keypoints that remain
        >>>
        >>> # Example 2: Random crop with padding when needed
        >>> # This is useful when you want to crop to a size larger than some images
        >>> transform_padded = A.Compose([
        ...     A.RandomCrop(
        ...         height=120,  # Larger than original image height
        ...         width=120,  # Larger than original image width
        ...         pad_if_needed=True,
        ...         border_mode=cv2.BORDER_CONSTANT,
        ...         fill=0,  # Black padding for image
        ...         fill_mask=0  # Zero padding for mask
        ...     ),
        ... ], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bbox_labels']),
        ...    keypoint_params=A.KeypointParams(format='xy', label_fields=['keypoint_labels']))
        >>>
        >>> # Apply the padded transform
        >>> padded_transformed = transform_padded(
        ...     image=image,
        ...     mask=mask,
        ...     bboxes=bboxes,
        ...     bbox_labels=bbox_labels,
        ...     keypoints=keypoints,
        ...     keypoint_labels=keypoint_labels
        ... )
        >>>
        >>> # The result will be 120x120 with padding
        >>> padded_image = padded_transformed['image']
        >>> padded_mask = padded_transformed['mask']
        >>> padded_bboxes = padded_transformed['bboxes']  # Coordinates adjusted to the new dimensions

    """

    class InitSchema(BaseCropAndPad.InitSchema):
        height: Annotated[int, Field(ge=1)]
        width: Annotated[int, Field(ge=1)]
        border_mode: Literal[
            cv2.BORDER_CONSTANT,
            cv2.BORDER_REPLICATE,
            cv2.BORDER_REFLECT,
            cv2.BORDER_WRAP,
            cv2.BORDER_REFLECT_101,
        ]

        fill: tuple[float, ...] | float
        fill_mask: tuple[float, ...] | float

    def __init__(
        self,
        height: int,
        width: int,
        pad_if_needed: bool = False,
        pad_position: Literal["center", "top_left", "top_right", "bottom_left", "bottom_right", "random"] = "center",
        border_mode: Literal[
            cv2.BORDER_CONSTANT,
            cv2.BORDER_REPLICATE,
            cv2.BORDER_REFLECT,
            cv2.BORDER_WRAP,
            cv2.BORDER_REFLECT_101,
        ] = cv2.BORDER_CONSTANT,
        fill: tuple[float, ...] | float = 0.0,
        fill_mask: tuple[float, ...] | float = 0.0,
        p: float = 1.0,
    ):
        super().__init__(
            pad_if_needed=pad_if_needed,
            border_mode=border_mode,
            fill=fill,
            fill_mask=fill_mask,
            pad_position=pad_position,
            p=p,
        )
        self.height = height
        self.width = width

    def get_params_dependent_on_data(
        self,
        params: dict[str, Any],
        data: dict[str, Any],
    ) -> dict[str, Any]:  # Changed return type to be more flexible
        """Get parameters that depend on input data.

        Args:
            params (dict[str, Any]): Parameters.
            data (dict[str, Any]): Input data.

        Returns:
            dict[str, Any]: Dictionary with parameters.

        """
        image_shape = params["shape"][:2]
        image_height, image_width = image_shape

        if not self.pad_if_needed and (self.height > image_height or self.width > image_width):
            raise CropSizeError(
                f"Crop size (height, width) exceeds image dimensions (height, width):"
                f" {(self.height, self.width)} vs {image_shape[:2]}",
            )

        # Get padding params first if needed
        pad_params = self._get_pad_params(image_shape, (self.height, self.width))

        # If padding is needed, adjust the image shape for crop calculation
        if pad_params is not None:
            pad_top = pad_params["pad_top"]
            pad_bottom = pad_params["pad_bottom"]
            pad_left = pad_params["pad_left"]
            pad_right = pad_params["pad_right"]

            padded_height = image_height + pad_top + pad_bottom
            padded_width = image_width + pad_left + pad_right
            padded_shape = (padded_height, padded_width)

            # Get random crop coordinates based on padded dimensions
            h_start = self.py_random.random()
            w_start = self.py_random.random()
            crop_coords = fcrops.get_crop_coords(padded_shape, (self.height, self.width), h_start, w_start)
        else:
            # Get random crop coordinates based on original dimensions
            h_start = self.py_random.random()
            w_start = self.py_random.random()
            crop_coords = fcrops.get_crop_coords(image_shape, (self.height, self.width), h_start, w_start)

        return {
            "crop_coords": crop_coords,
            "pad_params": pad_params,
        }

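# Illustrative sketch (not part of the released file): h_start and w_start above
# are uniform floats in [0, 1) that pick where the fixed-size window lands.
# Assuming the conventional mapping inside fcrops.get_crop_coords,
#
#     y_min = int((image_height - crop_height + 1) * h_start)
#     x_min = int((image_width - crop_width + 1) * w_start)
#     crop_coords = (x_min, y_min, x_min + crop_width, y_min + crop_height)
#
# so a 64x64 crop of a 100x100 image with h_start=0.5 starts at y_min == 18.
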
class CenterCrop(BaseCropAndPad):
    """Crop the central part of the input.

    This transform crops the center of the input image, mask, bounding boxes, and keypoints to the specified dimensions.
    It's useful when you want to focus on the central region of the input, discarding peripheral information.

    Args:
        height (int): The height of the crop. Must be greater than 0.
        width (int): The width of the crop. Must be greater than 0.
        pad_if_needed (bool): Whether to pad if crop size exceeds image size. Default: False.
        border_mode (OpenCV flag): OpenCV border mode used for padding. Default: cv2.BORDER_CONSTANT.
        fill (tuple[float, ...] | float): Padding value for images if border_mode is
            cv2.BORDER_CONSTANT. Default: 0.
        fill_mask (tuple[float, ...] | float): Padding value for masks if border_mode is
            cv2.BORDER_CONSTANT. Default: 0.
        pad_position (Literal['center', 'top_left', 'top_right', 'bottom_left', 'bottom_right', 'random']):
            Position of padding. Default: 'center'.
        p (float): Probability of applying the transform. Default: 1.0.

    Targets:
        image, mask, bboxes, keypoints, volume, mask3d

    Image types:
        uint8, float32

    Note:
        - If pad_if_needed is False and crop size exceeds image dimensions, it will raise a CropSizeError.
        - If pad_if_needed is True and crop size exceeds image dimensions, the image will be padded.
        - For bounding boxes and keypoints, coordinates are adjusted appropriately for both padding and cropping.

    Examples:
        >>> import numpy as np
        >>> import albumentations as A
        >>> import cv2
        >>>
        >>> # Prepare sample data
        >>> image = np.random.randint(0, 256, (100, 100, 3), dtype=np.uint8)
        >>> mask = np.random.randint(0, 2, (100, 100), dtype=np.uint8)
        >>> bboxes = np.array([[10, 10, 50, 50], [40, 40, 80, 80]], dtype=np.float32)
        >>> bbox_labels = [1, 2]
        >>> keypoints = np.array([[20, 30], [60, 70]], dtype=np.float32)
        >>> keypoint_labels = [0, 1]
        >>>
        >>> # Example 1: Basic center crop without padding
        >>> transform = A.Compose([
        ...     A.CenterCrop(height=64, width=64),
        ... ], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bbox_labels']),
        ...    keypoint_params=A.KeypointParams(format='xy', label_fields=['keypoint_labels']))
        >>>
        >>> # Apply the transform
        >>> transformed = transform(
        ...     image=image,
        ...     mask=mask,
        ...     bboxes=bboxes,
        ...     bbox_labels=bbox_labels,
        ...     keypoints=keypoints,
        ...     keypoint_labels=keypoint_labels
        ... )
        >>>
        >>> # Get the transformed data
        >>> transformed_image = transformed['image']  # Will be 64x64
        >>> transformed_mask = transformed['mask']  # Will be 64x64
        >>> transformed_bboxes = transformed['bboxes']  # Bounding boxes adjusted to the cropped area
        >>> transformed_bbox_labels = transformed['bbox_labels']  # Labels for boxes that remain after cropping
        >>> transformed_keypoints = transformed['keypoints']  # Keypoints adjusted to the cropped area
        >>> transformed_keypoint_labels = transformed['keypoint_labels']  # Labels for keypoints that remain
        >>>
        >>> # Example 2: Center crop with padding when needed
        >>> transform_padded = A.Compose([
        ...     A.CenterCrop(
        ...         height=120,  # Larger than original image height
        ...         width=120,  # Larger than original image width
        ...         pad_if_needed=True,
        ...         border_mode=cv2.BORDER_CONSTANT,
        ...         fill=0,  # Black padding for image
        ...         fill_mask=0  # Zero padding for mask
        ...     ),
        ... ], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bbox_labels']),
        ...    keypoint_params=A.KeypointParams(format='xy', label_fields=['keypoint_labels']))
        >>>
        >>> # Apply the padded transform
        >>> padded_transformed = transform_padded(
        ...     image=image,
        ...     mask=mask,
        ...     bboxes=bboxes,
        ...     bbox_labels=bbox_labels,
        ...     keypoints=keypoints,
        ...     keypoint_labels=keypoint_labels
        ... )
        >>>
        >>> # The result will be 120x120 with padding
        >>> padded_image = padded_transformed['image']
        >>> padded_mask = padded_transformed['mask']
        >>> padded_bboxes = padded_transformed['bboxes']  # Coordinates adjusted to the new dimensions
        >>> padded_keypoints = padded_transformed['keypoints']  # Coordinates adjusted to the new dimensions

    """

    class InitSchema(BaseCropAndPad.InitSchema):
        height: Annotated[int, Field(ge=1)]
        width: Annotated[int, Field(ge=1)]
        border_mode: Literal[
            cv2.BORDER_CONSTANT,
            cv2.BORDER_REPLICATE,
            cv2.BORDER_REFLECT,
            cv2.BORDER_WRAP,
            cv2.BORDER_REFLECT_101,
        ]

        fill: tuple[float, ...] | float
        fill_mask: tuple[float, ...] | float

    def __init__(
        self,
        height: int,
        width: int,
        pad_if_needed: bool = False,
        pad_position: Literal["center", "top_left", "top_right", "bottom_left", "bottom_right", "random"] = "center",
        border_mode: Literal[
            cv2.BORDER_CONSTANT,
            cv2.BORDER_REPLICATE,
            cv2.BORDER_REFLECT,
            cv2.BORDER_WRAP,
            cv2.BORDER_REFLECT_101,
        ] = cv2.BORDER_CONSTANT,
        fill: tuple[float, ...] | float = 0.0,
        fill_mask: tuple[float, ...] | float = 0.0,
        p: float = 1.0,
    ):
        super().__init__(
            pad_if_needed=pad_if_needed,
            border_mode=border_mode,
            fill=fill,
            fill_mask=fill_mask,
            pad_position=pad_position,
            p=p,
        )
        self.height = height
        self.width = width

    def get_params_dependent_on_data(
        self,
        params: dict[str, Any],
        data: dict[str, Any],
    ) -> dict[str, Any]:
        """Get the parameters dependent on the data.

        Args:
            params (dict[str, Any]): The parameters of the transform.
            data (dict[str, Any]): The data of the transform.

        """
        image_shape = params["shape"][:2]
        image_height, image_width = image_shape

        if not self.pad_if_needed and (self.height > image_height or self.width > image_width):
            raise CropSizeError(
                f"Crop size (height, width) exceeds image dimensions (height, width):"
                f" {(self.height, self.width)} vs {image_shape[:2]}",
            )

        # Get padding params first if needed
        pad_params = self._get_pad_params(image_shape, (self.height, self.width))

        # If padding is needed, adjust the image shape for crop calculation
        if pad_params is not None:
            pad_top = pad_params["pad_top"]
            pad_bottom = pad_params["pad_bottom"]
            pad_left = pad_params["pad_left"]
            pad_right = pad_params["pad_right"]

            padded_height = image_height + pad_top + pad_bottom
            padded_width = image_width + pad_left + pad_right
            padded_shape = (padded_height, padded_width)

            # Get crop coordinates based on padded dimensions
            crop_coords = fcrops.get_center_crop_coords(padded_shape, (self.height, self.width))
        else:
            # Get crop coordinates based on original dimensions
            crop_coords = fcrops.get_center_crop_coords(image_shape, (self.height, self.width))

        return {
            "crop_coords": crop_coords,
            "pad_params": pad_params,
        }

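# Illustrative sketch (not part of the released file): assuming the usual
# floor-division centering inside fcrops.get_center_crop_coords, a 64x64 crop
# of a 100x100 image resolves to
#
#     y_min = (100 - 64) // 2   # 18
#     x_min = (100 - 64) // 2   # 18
#     crop_coords = (18, 18, 82, 82)
#
# With pad_if_needed=True the same arithmetic simply runs on the padded shape.
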
class Crop(BaseCropAndPad):
    """Crop a specific region from the input image.

    This transform crops a rectangular region from the input image, mask, bounding boxes, and keypoints
    based on specified coordinates. It's useful when you want to extract a specific area of interest
    from your inputs.

    Args:
        x_min (int): Minimum x-coordinate of the crop region (left edge). Must be >= 0. Default: 0.
        y_min (int): Minimum y-coordinate of the crop region (top edge). Must be >= 0. Default: 0.
        x_max (int): Maximum x-coordinate of the crop region (right edge). Must be > x_min. Default: 1024.
        y_max (int): Maximum y-coordinate of the crop region (bottom edge). Must be > y_min. Default: 1024.
        pad_if_needed (bool): Whether to pad if crop coordinates exceed image dimensions. Default: False.
        border_mode (OpenCV flag): OpenCV border mode used for padding. Default: cv2.BORDER_CONSTANT.
        fill (tuple[float, ...] | float): Padding value if border_mode is cv2.BORDER_CONSTANT. Default: 0.
        fill_mask (tuple[float, ...] | float): Padding value for masks. Default: 0.
        pad_position (Literal['center', 'top_left', 'top_right', 'bottom_left', 'bottom_right', 'random']):
            Position of padding. Default: 'center'.
        p (float): Probability of applying the transform. Default: 1.0.

    Targets:
        image, mask, bboxes, keypoints, volume, mask3d

    Image types:
        uint8, float32

    Note:
        - The crop coordinates are applied as follows: x_min <= x < x_max and y_min <= y < y_max.
        - If pad_if_needed is False and crop region extends beyond image boundaries, it will be clipped.
        - If pad_if_needed is True, image will be padded to accommodate the full crop region.
        - For bounding boxes and keypoints, coordinates are adjusted appropriately for both padding and cropping.

    Examples:
        >>> import numpy as np
        >>> import albumentations as A
        >>> import cv2
        >>>
        >>> # Prepare sample data
        >>> image = np.random.randint(0, 256, (100, 100, 3), dtype=np.uint8)
        >>> mask = np.random.randint(0, 2, (100, 100), dtype=np.uint8)
        >>> bboxes = np.array([[10, 10, 50, 50], [40, 40, 80, 80]], dtype=np.float32)
        >>> bbox_labels = [1, 2]
        >>> keypoints = np.array([[20, 30], [60, 70]], dtype=np.float32)
        >>> keypoint_labels = [0, 1]
        >>>
        >>> # Example 1: Basic crop with fixed coordinates
        >>> transform = A.Compose([
        ...     A.Crop(x_min=20, y_min=20, x_max=80, y_max=80),
        ... ], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bbox_labels']),
        ...    keypoint_params=A.KeypointParams(format='xy', label_fields=['keypoint_labels']))
        >>>
        >>> # Apply the transform
        >>> transformed = transform(
        ...     image=image,
        ...     mask=mask,
        ...     bboxes=bboxes,
        ...     bbox_labels=bbox_labels,
        ...     keypoints=keypoints,
        ...     keypoint_labels=keypoint_labels
        ... )
        >>>
        >>> # Get the transformed data
        >>> transformed_image = transformed['image']  # Will be 60x60 - cropped from (20,20) to (80,80)
        >>> transformed_mask = transformed['mask']  # Will be 60x60
        >>> transformed_bboxes = transformed['bboxes']  # Bounding boxes adjusted to the cropped area
        >>> transformed_bbox_labels = transformed['bbox_labels']  # Labels for boxes that remain after cropping
        >>>
        >>> # Example 2: Crop with padding when the crop region extends beyond image dimensions
        >>> transform_padded = A.Compose([
        ...     A.Crop(
        ...         x_min=50, y_min=50, x_max=150, y_max=150,  # Extends beyond the 100x100 image
        ...         pad_if_needed=True,
        ...         border_mode=cv2.BORDER_CONSTANT,
        ...         fill=0,  # Black padding for image
        ...         fill_mask=0  # Zero padding for mask
        ...     ),
        ... ], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bbox_labels']),
        ...    keypoint_params=A.KeypointParams(format='xy', label_fields=['keypoint_labels']))
        >>>
        >>> # Apply the padded transform
        >>> padded_transformed = transform_padded(
        ...     image=image,
        ...     mask=mask,
        ...     bboxes=bboxes,
        ...     bbox_labels=bbox_labels,
        ...     keypoints=keypoints,
        ...     keypoint_labels=keypoint_labels
        ... )
        >>>
        >>> # The result will be 100x100 (50:150, 50:150) with padding on right and bottom
        >>> padded_image = padded_transformed['image']  # 100x100 with 50 pixels of original + 50 pixels of padding
        >>> padded_mask = padded_transformed['mask']
        >>> padded_bboxes = padded_transformed['bboxes']  # Coordinates adjusted to the cropped and padded area
        >>>
        >>> # Example 3: Crop with reflection padding and custom position
        >>> transform_reflect = A.Compose([
        ...     A.Crop(
        ...         x_min=-20, y_min=-20, x_max=80, y_max=80,  # Negative coordinates (outside image)
        ...         pad_if_needed=True,
        ...         border_mode=cv2.BORDER_REFLECT_101,  # Reflect image for padding
        ...         pad_position="top_left"  # Apply padding at top-left
        ...     ),
        ... ], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bbox_labels']))
        >>>
        >>> # The resulting crop will use reflection padding for the negative coordinates
        >>> reflect_result = transform_reflect(
        ...     image=image,
        ...     bboxes=bboxes,
        ...     bbox_labels=bbox_labels
        ... )

    """

    class InitSchema(BaseCropAndPad.InitSchema):
        x_min: Annotated[int, Field(ge=0)]
        y_min: Annotated[int, Field(ge=0)]
        x_max: Annotated[int, Field(gt=0)]
        y_max: Annotated[int, Field(gt=0)]
        border_mode: Literal[
            cv2.BORDER_CONSTANT,
            cv2.BORDER_REPLICATE,
            cv2.BORDER_REFLECT,
            cv2.BORDER_WRAP,
            cv2.BORDER_REFLECT_101,
        ]

        fill: tuple[float, ...] | float
        fill_mask: tuple[float, ...] | float

        @model_validator(mode="after")
        def _validate_coordinates(self) -> Self:
            if not self.x_min < self.x_max:
                msg = "x_max must be greater than x_min"
                raise ValueError(msg)
            if not self.y_min < self.y_max:
                msg = "y_max must be greater than y_min"
                raise ValueError(msg)

            return self

    def __init__(
        self,
        x_min: int = 0,
        y_min: int = 0,
        x_max: int = 1024,
        y_max: int = 1024,
        pad_if_needed: bool = False,
        pad_position: Literal["center", "top_left", "top_right", "bottom_left", "bottom_right", "random"] = "center",
        border_mode: Literal[
            cv2.BORDER_CONSTANT,
            cv2.BORDER_REPLICATE,
            cv2.BORDER_REFLECT,
            cv2.BORDER_WRAP,
            cv2.BORDER_REFLECT_101,
        ] = cv2.BORDER_CONSTANT,
        fill: tuple[float, ...] | float = 0,
        fill_mask: tuple[float, ...] | float = 0,
        p: float = 1.0,
    ):
        super().__init__(
            pad_if_needed=pad_if_needed,
            border_mode=border_mode,
            fill=fill,
            fill_mask=fill_mask,
            pad_position=pad_position,
            p=p,
        )
        self.x_min = x_min
        self.y_min = y_min
        self.x_max = x_max
        self.y_max = y_max

    # New helper function for computing minimum padding
    def _compute_min_padding(self, image_height: int, image_width: int) -> tuple[int, int, int, int]:
        pad_top = 0
        pad_bottom = max(0, self.y_max - image_height)
        pad_left = 0
        pad_right = max(0, self.x_max - image_width)
        return pad_top, pad_bottom, pad_left, pad_right

    # New helper function for distributing and adjusting padding
    def _compute_adjusted_padding(self, pad_top: int, pad_bottom: int, pad_left: int, pad_right: int) -> dict[str, int]:
        delta_h = pad_top + pad_bottom
        delta_w = pad_left + pad_right
        pad_top_dist = delta_h // 2
        pad_bottom_dist = delta_h - pad_top_dist
        pad_left_dist = delta_w // 2
        pad_right_dist = delta_w - pad_left_dist

        (pad_top_adj, pad_bottom_adj, pad_left_adj, pad_right_adj) = fgeometric.adjust_padding_by_position(
            h_top=pad_top_dist,
            h_bottom=pad_bottom_dist,
            w_left=pad_left_dist,
            w_right=pad_right_dist,
            position=self.pad_position,
            py_random=self.py_random,
        )

        final_top = max(pad_top_adj, pad_top)
        final_bottom = max(pad_bottom_adj, pad_bottom)
        final_left = max(pad_left_adj, pad_left)
        final_right = max(pad_right_adj, pad_right)

        return {
            "pad_top": final_top,
            "pad_bottom": final_bottom,
            "pad_left": final_left,
            "pad_right": final_right,
        }

    def get_params_dependent_on_data(self, params: dict[str, Any], data: dict[str, Any]) -> dict[str, Any]:
        """Get parameters for crop.

        Args:
            params (dict): Dictionary with parameters for crop.
            data (dict): Dictionary with data.

        Returns:
            dict: Dictionary with parameters for crop.

        """
        image_shape = params["shape"][:2]
        image_height, image_width = image_shape

        if not self.pad_if_needed:
            return {"crop_coords": (self.x_min, self.y_min, self.x_max, self.y_max), "pad_params": None}

        pad_top, pad_bottom, pad_left, pad_right = self._compute_min_padding(image_height, image_width)
        pad_params = None

        if any([pad_top, pad_bottom, pad_left, pad_right]):
            pad_params = self._compute_adjusted_padding(pad_top, pad_bottom, pad_left, pad_right)

        return {"crop_coords": (self.x_min, self.y_min, self.x_max, self.y_max), "pad_params": pad_params}

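# Worked example (not part of the released file) for the two padding helpers
# above. For Crop(x_min=50, y_min=50, x_max=150, y_max=150, pad_if_needed=True,
# pad_position="center") on a 100x100 image:
#
#     _compute_min_padding(100, 100)    # -> (0, 50, 0, 50)
#
# _compute_adjusted_padding then redistributes each total (50 rows, 50 columns)
# evenly to 25/25 per side (assuming "center" leaves that split unchanged) and
# takes the element-wise max with the minimum padding:
#
#     {"pad_top": 25, "pad_bottom": 50, "pad_left": 25, "pad_right": 50}
#
# The max step guarantees the requested crop window is always covered, even
# when the chosen pad_position alone would under-pad one side.
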
+ class CropNonEmptyMaskIfExists(BaseCrop):
1286
+ """Crop area with mask if mask is non-empty, else make random crop.
1287
+
1288
+ This transform attempts to crop a region containing a mask (non-zero pixels). If the mask is empty or not provided,
1289
+ it falls back to a random crop. This is particularly useful for segmentation tasks where you want to focus on
1290
+ regions of interest defined by the mask.
1291
+
1292
+ Args:
1293
+ height (int): Vertical size of crop in pixels. Must be > 0.
1294
+ width (int): Horizontal size of crop in pixels. Must be > 0.
1295
+ ignore_values (list of int, optional): Values to ignore in mask, `0` values are always ignored.
1296
+ For example, if background value is 5, set `ignore_values=[5]` to ignore it. Default: None.
1297
+ ignore_channels (list of int, optional): Channels to ignore in mask.
1298
+ For example, if background is the first channel, set `ignore_channels=[0]` to ignore it. Default: None.
1299
+ p (float): Probability of applying the transform. Default: 1.0.
1300
+
1301
+ Targets:
1302
+ image, mask, bboxes, keypoints, volume, mask3d
1303
+
1304
+ Image types:
1305
+ uint8, float32
1306
+
1307
+ Note:
1308
+ - If a mask is provided, the transform will try to crop an area containing non-zero (or non-ignored) pixels.
1309
+ - If no suitable area is found in the mask or no mask is provided, it will perform a random crop.
1310
+ - The crop size (height, width) must not exceed the original image dimensions.
1311
+ - Bounding boxes and keypoints are also cropped along with the image and mask.
1312
+
1313
+ Raises:
1314
+ ValueError: If the specified crop size is larger than the input image dimensions.
1315
+
1316
+ Example:
1317
+ >>> import numpy as np
1318
+ >>> import albumentations as A
1319
+ >>> image = np.random.randint(0, 256, (100, 100, 3), dtype=np.uint8)
1320
+ >>> mask = np.zeros((100, 100), dtype=np.uint8)
1321
+ >>> mask[25:75, 25:75] = 1 # Create a non-empty region in the mask
1322
+ >>> transform = A.Compose([
1323
+ ... A.CropNonEmptyMaskIfExists(height=50, width=50, p=1.0),
1324
+ ... ])
1325
+ >>> transformed = transform(image=image, mask=mask)
1326
+ >>> transformed_image = transformed['image']
1327
+ >>> transformed_mask = transformed['mask']
1328
+ # The resulting crop will likely include part of the non-zero region in the mask
1329
+
1330
+ Raises:
1331
+ ValueError: If the specified crop size is larger than the input image dimensions.
1332
+
1333
+ Examples:
+ >>> import numpy as np
+ >>> import albumentations as A
+ >>>
+ >>> # Prepare sample data
+ >>> image = np.random.randint(0, 256, (100, 100, 3), dtype=np.uint8)
+ >>> # Create a mask with non-empty region in the center
+ >>> mask = np.zeros((100, 100), dtype=np.uint8)
+ >>> mask[25:75, 25:75] = 1 # Create a non-empty region in the mask
+ >>>
+ >>> # Create bounding boxes and keypoints in the mask region
+ >>> bboxes = np.array([
+ ... [20, 20, 60, 60], # Box overlapping with non-empty region
+ ... [30, 30, 70, 70], # Box mostly inside non-empty region
+ ... ], dtype=np.float32)
+ >>> bbox_labels = ['cat', 'dog']
+ >>>
+ >>> # Add some keypoints inside mask region
+ >>> keypoints = np.array([
+ ... [40, 40], # Inside non-empty region
+ ... [60, 60], # At edge of non-empty region
+ ... [90, 90] # Outside non-empty region
+ ... ], dtype=np.float32)
+ >>> keypoint_labels = ['eye', 'nose', 'ear']
+ >>>
+ >>> # Define transform that will crop around the non-empty mask region
+ >>> transform = A.Compose([
+ ... A.CropNonEmptyMaskIfExists(
+ ... height=50,
+ ... width=50,
+ ... ignore_values=None,
+ ... ignore_channels=None,
+ ... p=1.0
+ ... ),
+ ... ], bbox_params=A.BboxParams(
+ ... format='pascal_voc',
+ ... label_fields=['bbox_labels']
+ ... ), keypoint_params=A.KeypointParams(
+ ... format='xy',
+ ... label_fields=['keypoint_labels']
+ ... ))
+ >>>
+ >>> # Apply the transform
+ >>> transformed = transform(
+ ... image=image,
+ ... mask=mask,
+ ... bboxes=bboxes,
+ ... bbox_labels=bbox_labels,
+ ... keypoints=keypoints,
+ ... keypoint_labels=keypoint_labels
+ ... )
+ >>>
+ >>> # Get the transformed data
+ >>> transformed_image = transformed['image'] # 50x50 image centered on mask region
+ >>> transformed_mask = transformed['mask'] # 50x50 mask showing part of non-empty region
+ >>> transformed_bboxes = transformed['bboxes'] # Bounding boxes adjusted to new coordinates
+ >>> transformed_bbox_labels = transformed['bbox_labels'] # Labels preserved for visible boxes
+ >>> transformed_keypoints = transformed['keypoints'] # Keypoints adjusted to new coordinates
+ >>> transformed_keypoint_labels = transformed['keypoint_labels'] # Labels for visible keypoints
+
+ """
+
+ class InitSchema(BaseCrop.InitSchema):
+ ignore_values: list[int] | None
+ ignore_channels: list[int] | None
+ height: Annotated[int, Field(ge=1)]
+ width: Annotated[int, Field(ge=1)]
+
+ def __init__(
+ self,
+ height: int,
+ width: int,
+ ignore_values: list[int] | None = None,
+ ignore_channels: list[int] | None = None,
+ p: float = 1.0,
+ ):
+ super().__init__(p=p)
+
+ self.height = height
+ self.width = width
+ self.ignore_values = ignore_values
+ self.ignore_channels = ignore_channels
+
+ def _preprocess_mask(self, mask: np.ndarray) -> np.ndarray:
+ mask_height, mask_width = mask.shape[:2]
+
+ if self.ignore_values is not None:
+ ignore_values_np = np.array(self.ignore_values)
+ mask = np.where(np.isin(mask, ignore_values_np), 0, mask)
+
+ if mask.ndim == NUM_MULTI_CHANNEL_DIMENSIONS and self.ignore_channels is not None:
+ target_channels = np.array([ch for ch in range(mask.shape[-1]) if ch not in self.ignore_channels])
+ mask = np.take(mask, target_channels, axis=-1)
+
+ if self.height > mask_height or self.width > mask_width:
+ raise ValueError(
+ f"Crop size ({self.height},{self.width}) is larger than image ({mask_height},{mask_width})",
+ )
+
+ return mask
+
+ def get_params_dependent_on_data(
+ self,
+ params: dict[str, Any],
+ data: dict[str, Any],
+ ) -> dict[str, Any]:
+ """Get crop coordinates based on mask content.
+
+ Args:
+ params (dict[str, Any]): The parameters of the transform.
+ data (dict[str, Any]): The data of the transform.
+
+ """
+ if "mask" in data:
+ mask = self._preprocess_mask(data["mask"])
+ elif "masks" in data and len(data["masks"]):
+ masks = data["masks"]
+ mask = self._preprocess_mask(np.copy(masks[0]))
+ for m in masks[1:]:
+ mask |= self._preprocess_mask(m)
+ else:
+ msg = "Can not find mask for CropNonEmptyMaskIfExists"
+ raise RuntimeError(msg)
+
+ mask_height, mask_width = mask.shape[:2]
+
+ if mask.any():
+ # Find non-zero regions in mask
+ mask_sum = mask.sum(axis=-1) if mask.ndim == NUM_MULTI_CHANNEL_DIMENSIONS else mask
+ non_zero_yx = np.argwhere(mask_sum)
+ y, x = self.py_random.choice(non_zero_yx)
+
+ # Calculate crop coordinates centered around chosen point
+ x_min = x - self.py_random.randint(0, self.width - 1)
+ y_min = y - self.py_random.randint(0, self.height - 1)
+ x_min = np.clip(x_min, 0, mask_width - self.width)
+ y_min = np.clip(y_min, 0, mask_height - self.height)
+ else:
+ # Random crop if no non-zero regions
+ x_min = self.py_random.randint(0, mask_width - self.width)
+ y_min = self.py_random.randint(0, mask_height - self.height)
+
+ x_max = x_min + self.width
+ y_max = y_min + self.height
+
+ return {"crop_coords": (x_min, y_min, x_max, y_max)}
+
+
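+ # Standalone sketch (not part of the packaged module; assumes only numpy and
+ # the stdlib): the offset-and-clip arithmetic above guarantees that the
+ # sampled non-zero mask pixel always lands inside the resulting crop window.
+ import random
+
+ import numpy as np
+
+ mask = np.zeros((100, 100), dtype=np.uint8)
+ mask[25:75, 25:75] = 1
+ crop_h, crop_w = 50, 50
+ y, x = random.choice(np.argwhere(mask))  # a random non-zero pixel
+ x_min = np.clip(x - random.randint(0, crop_w - 1), 0, mask.shape[1] - crop_w)
+ y_min = np.clip(y - random.randint(0, crop_h - 1), 0, mask.shape[0] - crop_h)
+ assert x_min <= x < x_min + crop_w and y_min <= y < y_min + crop_h
+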
+ class BaseRandomSizedCropInitSchema(BaseTransformInitSchema):
+ size: Annotated[tuple[int, int], AfterValidator(check_range_bounds(1, None))]
+
+
+ class _BaseRandomSizedCrop(DualTransform):
+ """Base class for transforms that crop an image randomly and resize it to a specific size.
+
+ This abstract class provides the foundation for RandomSizedCrop and RandomResizedCrop transforms.
+ It handles cropping and resizing for different data types (image, mask, bboxes, keypoints) while
+ maintaining their spatial relationships.
+
+ Child classes must implement the `get_params_dependent_on_data` method to determine how the
+ crop coordinates are selected according to transform-specific parameters and logic.
+
+ Args:
+ size (tuple[int, int]): Target size (height, width) after cropping and resizing.
+ interpolation (OpenCV flag): Flag that is used to specify the interpolation algorithm
+ for image resizing. Default: cv2.INTER_LINEAR.
+ mask_interpolation (OpenCV flag): Flag that is used to specify the interpolation
+ algorithm for mask resizing. Default: cv2.INTER_NEAREST.
+ area_for_downscale (Literal[None, "image", "image_mask"]): Controls automatic use of INTER_AREA interpolation
+ for downscaling. Options:
+ - None: No automatic interpolation selection, always use the specified interpolation method
+ - "image": Use INTER_AREA when downscaling images, retain specified interpolation for upscaling and masks
+ - "image_mask": Use INTER_AREA when downscaling both images and masks
+ Default: None.
+ p (float): Probability of applying the transform. Default: 1.0.
+
+ Targets:
+ image, mask, bboxes, keypoints, volume, mask3d
+
+ Image types:
+ uint8, float32
+
+ Note:
+ This class is not meant to be used directly. Instead, use derived transforms
+ like RandomSizedCrop or RandomResizedCrop that implement specific crop selection
+ strategies.
+ When area_for_downscale is set, INTER_AREA interpolation will be used automatically for
+ downscaling (when the crop is larger than the target size), which provides better quality for size reduction.
+
+ Examples:
+ >>> import numpy as np
+ >>> import albumentations as A
+ >>> import cv2
+ >>>
+ >>> # Example of a custom transform that inherits from _BaseRandomSizedCrop
+ >>> class CustomRandomCrop(_BaseRandomSizedCrop):
+ ... def __init__(
+ ... self,
+ ... size=(224, 224),
+ ... custom_parameter=0.5,
+ ... interpolation=cv2.INTER_LINEAR,
+ ... mask_interpolation=cv2.INTER_NEAREST,
+ ... area_for_downscale="image",
+ ... p=1.0
+ ... ):
+ ... super().__init__(
+ ... size=size,
+ ... interpolation=interpolation,
+ ... mask_interpolation=mask_interpolation,
+ ... area_for_downscale=area_for_downscale,
+ ... p=p,
+ ... )
+ ... self.custom_parameter = custom_parameter
+ ...
+ ... def get_params_dependent_on_data(self, params, data):
+ ... # Custom logic to select crop coordinates
+ ... image_height, image_width = params["shape"][:2]
+ ...
+ ... # Simple example: calculate crop size based on custom_parameter
+ ... crop_height = int(image_height * self.custom_parameter)
+ ... crop_width = int(image_width * self.custom_parameter)
+ ...
+ ... # Random position (randint bounds are inclusive)
+ ... y1 = self.py_random.randint(0, image_height - crop_height)
+ ... x1 = self.py_random.randint(0, image_width - crop_width)
+ ... y2 = y1 + crop_height
+ ... x2 = x1 + crop_width
+ ...
+ ... return {"crop_coords": (x1, y1, x2, y2)}
+ >>>
+ >>> # Prepare sample data
+ >>> image = np.random.randint(0, 256, (100, 100, 3), dtype=np.uint8)
+ >>> mask = np.random.randint(0, 2, (100, 100), dtype=np.uint8)
+ >>> bboxes = np.array([[10, 10, 50, 50], [40, 40, 80, 80]], dtype=np.float32)
+ >>> bbox_labels = [1, 2]
+ >>> keypoints = np.array([[20, 30], [60, 70]], dtype=np.float32)
+ >>> keypoint_labels = [0, 1]
+ >>>
+ >>> # Create a pipeline with our custom transform
+ >>> transform = A.Compose(
+ ... [CustomRandomCrop(size=(64, 64), custom_parameter=0.6, area_for_downscale="image")],
+ ... bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bbox_labels']),
+ ... keypoint_params=A.KeypointParams(format='xy', label_fields=['keypoint_labels'])
+ ... )
+ >>>
+ >>> # Apply the transform
+ >>> transformed = transform(
+ ... image=image,
+ ... mask=mask,
+ ... bboxes=bboxes,
+ ... bbox_labels=bbox_labels,
+ ... keypoints=keypoints,
+ ... keypoint_labels=keypoint_labels
+ ... )
+ >>>
+ >>> # Get the transformed data
+ >>> transformed_image = transformed['image'] # Will be 64x64
+ >>> transformed_mask = transformed['mask'] # Will be 64x64
+ >>> transformed_bboxes = transformed['bboxes'] # Bounding boxes adjusted to new dimensions
+ >>> transformed_bbox_labels = transformed['bbox_labels'] # Labels for bboxes that remain after cropping
+ >>> transformed_keypoints = transformed['keypoints'] # Keypoints adjusted to new dimensions
+ >>> transformed_keypoint_labels = transformed['keypoint_labels'] # Labels for keypoints that remain
+
+ """
+
+ class InitSchema(BaseRandomSizedCropInitSchema):
+ interpolation: Literal[
+ cv2.INTER_NEAREST,
+ cv2.INTER_NEAREST_EXACT,
+ cv2.INTER_LINEAR,
+ cv2.INTER_CUBIC,
+ cv2.INTER_AREA,
+ cv2.INTER_LANCZOS4,
+ cv2.INTER_LINEAR_EXACT,
+ ]
+ mask_interpolation: Literal[
+ cv2.INTER_NEAREST,
+ cv2.INTER_NEAREST_EXACT,
+ cv2.INTER_LINEAR,
+ cv2.INTER_CUBIC,
+ cv2.INTER_AREA,
+ cv2.INTER_LANCZOS4,
+ cv2.INTER_LINEAR_EXACT,
+ ]
+ area_for_downscale: Literal[None, "image", "image_mask"]
+
+ def __init__(
+ self,
+ size: tuple[int, int],
+ interpolation: Literal[
+ cv2.INTER_NEAREST,
+ cv2.INTER_NEAREST_EXACT,
+ cv2.INTER_LINEAR,
+ cv2.INTER_CUBIC,
+ cv2.INTER_AREA,
+ cv2.INTER_LANCZOS4,
+ cv2.INTER_LINEAR_EXACT,
+ ] = cv2.INTER_LINEAR,
+ mask_interpolation: Literal[
+ cv2.INTER_NEAREST,
+ cv2.INTER_NEAREST_EXACT,
+ cv2.INTER_LINEAR,
+ cv2.INTER_CUBIC,
+ cv2.INTER_AREA,
+ cv2.INTER_LANCZOS4,
+ cv2.INTER_LINEAR_EXACT,
+ ] = cv2.INTER_NEAREST,
+ area_for_downscale: Literal[None, "image", "image_mask"] = None,
+ p: float = 1.0,
+ ):
+ super().__init__(p=p)
+ self.size = size
+ self.interpolation = interpolation
+ self.mask_interpolation = mask_interpolation
+ self.area_for_downscale = area_for_downscale
+
+ def _get_interpolation_for_resize(self, crop_shape: tuple[int, int], target_type: str) -> int:
+ """Get the appropriate interpolation method for resizing.
+
+ Args:
+ crop_shape: Shape of the crop (height, width)
+ target_type: Either "image" or "mask" to determine base interpolation
+
+ Returns:
+ OpenCV interpolation flag
+
+ """
+ crop_height, crop_width = crop_shape
+ target_height, target_width = self.size
+
+ # Determine if this is downscaling
+ is_downscale = (crop_height > target_height) or (crop_width > target_width)
+
+ # Use INTER_AREA for downscaling if configured (for both images and masks,
+ # matching the documented behavior of area_for_downscale)
+ if is_downscale and (
+ (target_type == "image" and self.area_for_downscale in ["image", "image_mask"])
+ or (target_type == "mask" and self.area_for_downscale == "image_mask")
+ ):
+ return cv2.INTER_AREA
+ # Get base interpolation
+ return self.interpolation if target_type == "image" else self.mask_interpolation
+
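+ # Standalone sketch of the selection rule above (not part of the packaged
+ # module; assumes cv2 is installed and uses the class defaults INTER_LINEAR /
+ # INTER_NEAREST): INTER_AREA is substituted only when the crop is downscaled
+ # and area_for_downscale opts the target type in.
+ import cv2
+
+ def pick_interpolation(crop_hw, target_hw, target_type, area_for_downscale):
+     is_downscale = crop_hw[0] > target_hw[0] or crop_hw[1] > target_hw[1]
+     opted_in = {"image": ("image", "image_mask"), "mask": ("image_mask",)}
+     if is_downscale and area_for_downscale in opted_in[target_type]:
+         return cv2.INTER_AREA
+     return cv2.INTER_LINEAR if target_type == "image" else cv2.INTER_NEAREST
+
+ assert pick_interpolation((100, 100), (64, 64), "image", "image") == cv2.INTER_AREA
+ assert pick_interpolation((100, 100), (64, 64), "mask", "image") == cv2.INTER_NEAREST
+ assert pick_interpolation((32, 32), (64, 64), "image", "image") == cv2.INTER_LINEAR
+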
+ def apply(
+ self,
+ img: np.ndarray,
+ crop_coords: tuple[int, int, int, int],
+ **params: Any,
+ ) -> np.ndarray:
+ """Apply the crop to the image.
+
+ Args:
+ img (np.ndarray): The image to crop.
+ crop_coords (tuple[int, int, int, int]): The coordinates of the crop.
+ **params (Any): Additional parameters.
+
+ """
+ crop = fcrops.crop(img, *crop_coords)
+ interpolation = self._get_interpolation_for_resize(crop.shape[:2], "image")
+ return fgeometric.resize(crop, self.size, interpolation)
+
+ def apply_to_mask(
+ self,
+ mask: np.ndarray,
+ crop_coords: tuple[int, int, int, int],
+ **params: Any,
+ ) -> np.ndarray:
+ """Apply the crop to the mask.
+
+ Args:
+ mask (np.ndarray): The mask to crop.
+ crop_coords (tuple[int, int, int, int]): The coordinates of the crop.
+ **params (Any): Additional parameters.
+
+ """
+ crop = fcrops.crop(mask, *crop_coords)
+ interpolation = self._get_interpolation_for_resize(crop.shape[:2], "mask")
+ return fgeometric.resize(crop, self.size, interpolation)
+
+ def apply_to_bboxes(
+ self,
+ bboxes: np.ndarray,
+ crop_coords: tuple[int, int, int, int],
+ **params: Any,
+ ) -> np.ndarray:
+ """Apply the crop to the bounding boxes.
+
+ Args:
+ bboxes (np.ndarray): The bounding boxes to crop.
+ crop_coords (tuple[int, int, int, int]): The coordinates of the crop.
+ **params (Any): Additional parameters.
+
+ """
+ return fcrops.crop_bboxes_by_coords(bboxes, crop_coords, params["shape"])
+
+ def apply_to_keypoints(
+ self,
+ keypoints: np.ndarray,
+ crop_coords: tuple[int, int, int, int],
+ **params: Any,
+ ) -> np.ndarray:
+ """Apply the crop to the keypoints.
+
+ Args:
+ keypoints (np.ndarray): The keypoints to crop.
+ crop_coords (tuple[int, int, int, int]): The coordinates of the crop.
+ **params (Any): Additional parameters.
+
+ """
+ # First, crop the keypoints
+ cropped_keypoints = fcrops.crop_keypoints_by_coords(keypoints, crop_coords)
+
+ # Calculate the dimensions of the crop
+ crop_height = crop_coords[3] - crop_coords[1]
+ crop_width = crop_coords[2] - crop_coords[0]
+
+ # Calculate scaling factors
+ scale_x = self.size[1] / crop_width
+ scale_y = self.size[0] / crop_height
+
+ # Scale the cropped keypoints
+ return fgeometric.keypoints_scale(cropped_keypoints, scale_x, scale_y)
+
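+ # Standalone numeric check (not part of the packaged module) of the keypoint
+ # mapping used above: subtract the crop origin, then scale each axis by
+ # target_size / crop_size.
+ crop_x_min, crop_y_min, crop_x_max, crop_y_max = 20, 10, 80, 70  # 60x60 crop
+ target_h, target_w = 30, 30
+ kp_x, kp_y = 50.0, 40.0  # a keypoint inside the crop
+ scale_x = target_w / (crop_x_max - crop_x_min)  # 0.5
+ scale_y = target_h / (crop_y_max - crop_y_min)  # 0.5
+ assert ((kp_x - crop_x_min) * scale_x, (kp_y - crop_y_min) * scale_y) == (15.0, 15.0)
+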
+ def apply_to_images(
+ self,
+ images: np.ndarray,
+ crop_coords: tuple[int, int, int, int],
+ **params: Any,
+ ) -> np.ndarray:
+ """Apply the crop and resize to a volume/images.
+
+ This method crops the volume first (reducing data size), then resizes
+ each slice of the cropped volume to the target size.
+
+ Args:
+ images (np.ndarray): The volume/images to crop and resize with shape (D, H, W) or (D, H, W, C).
+ crop_coords (tuple[int, int, int, int]): The coordinates of the crop.
+ **params (Any): Additional parameters.
+
+ """
+ # First crop the volume using volume_crop_yx (reduces data size)
+ crop = fcrops.volume_crop_yx(images, *crop_coords)
+
+ # Get interpolation method based on crop dimensions
+ interpolation = self._get_interpolation_for_resize(crop.shape[1:3], "image")
+
+ # Then resize the smaller cropped volume using the selected interpolation
+ return np.stack([fgeometric.resize(crop[i], self.size, interpolation) for i in range(images.shape[0])])
+
+ def apply_to_volume(
+ self,
+ volume: np.ndarray,
+ crop_coords: tuple[int, int, int, int],
+ **params: Any,
+ ) -> np.ndarray:
+ """Apply the crop and resize to a volume.
+
+ Args:
+ volume (np.ndarray): The volume to crop.
+ crop_coords (tuple[int, int, int, int]): The coordinates of the crop.
+ **params (Any): Additional parameters.
+
+ """
+ return self.apply_to_images(volume, crop_coords, **params)
+
+ def apply_to_mask3d(
+ self,
+ mask3d: np.ndarray,
+ crop_coords: tuple[int, int, int, int],
+ **params: Any,
+ ) -> np.ndarray:
+ """Apply the crop and resize to a mask3d.
+
+ Args:
+ mask3d (np.ndarray): The mask3d to crop.
+ crop_coords (tuple[int, int, int, int]): The coordinates of the crop.
+ **params (Any): Additional parameters.
+
+ """
+ return self.apply_to_images(mask3d, crop_coords, **params)
+
+
+ class RandomSizedCrop(_BaseRandomSizedCrop):
+ """Crop a random part of the input and rescale it to a specific size.
+
+ This transform first crops a random portion of the input and then resizes it to a specified size.
+ The size of the random crop is controlled by the 'min_max_height' parameter.
+
+ Args:
+ min_max_height (tuple[int, int]): Minimum and maximum height of the crop in pixels.
+ size (tuple[int, int]): Target size for the output image, i.e. (height, width) after crop and resize.
+ w2h_ratio (float): Aspect ratio (width/height) of crop. Default: 1.0
+ interpolation (OpenCV flag): Flag that is used to specify the interpolation algorithm. Should be one of:
+ cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_LANCZOS4.
+ Default: cv2.INTER_LINEAR.
+ mask_interpolation (OpenCV flag): Flag that is used to specify the interpolation algorithm for mask.
+ Should be one of: cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_LANCZOS4.
+ Default: cv2.INTER_NEAREST.
+ area_for_downscale (Literal[None, "image", "image_mask"]): Controls automatic use of INTER_AREA interpolation
+ for downscaling. Options:
+ - None: No automatic interpolation selection, always use the specified interpolation method
+ - "image": Use INTER_AREA when downscaling images, retain specified interpolation for upscaling and masks
+ - "image_mask": Use INTER_AREA when downscaling both images and masks
+ Default: None.
+ p (float): Probability of applying the transform. Default: 1.0
+
+ Targets:
+ image, mask, bboxes, keypoints, volume, mask3d
+
+ Image types:
+ uint8, float32
+
+ Note:
+ - The crop size is randomly selected for each execution within the range specified by 'min_max_height'.
+ - The aspect ratio of the crop is determined by the 'w2h_ratio' parameter.
+ - After cropping, the result is resized to the specified 'size'.
+ - Bounding boxes that end up fully outside the cropped area will be removed.
+ - Keypoints that end up outside the cropped area will be removed.
+ - This transform differs from RandomResizedCrop in that it allows more control over the crop size
+ through the 'min_max_height' parameter, rather than using a scale parameter.
+ - When area_for_downscale is set, INTER_AREA interpolation will be used automatically for
+ downscaling (when the crop is larger than the target size), which provides better quality for size reduction.
+
+ Mathematical Details:
+ 1. A random crop height h is sampled from the range [min_max_height[0], min_max_height[1]].
+ 2. The crop width w is calculated as: w = h * w2h_ratio
+ 3. A random location for the crop is selected within the input image.
+ 4. The image is cropped to the size (h, w).
+ 5. The crop is then resized to the specified 'size'.
+
+ Examples:
+ >>> import numpy as np
+ >>> import albumentations as A
+ >>> import cv2
+ >>>
+ >>> # Prepare sample data
+ >>> image = np.random.randint(0, 256, (100, 100, 3), dtype=np.uint8)
+ >>> mask = np.random.randint(0, 2, (100, 100), dtype=np.uint8)
+ >>> bboxes = np.array([[10, 10, 50, 50], [40, 40, 80, 80]], dtype=np.float32)
+ >>> bbox_labels = [1, 2]
+ >>> keypoints = np.array([[20, 30], [60, 70]], dtype=np.float32)
+ >>> keypoint_labels = [0, 1]
+ >>>
+ >>> # Define transform with parameters as tuples
+ >>> transform = A.Compose([
+ ... A.RandomSizedCrop(
+ ... min_max_height=(50, 80),
+ ... size=(64, 64),
+ ... w2h_ratio=1.0,
+ ... interpolation=cv2.INTER_LINEAR,
+ ... mask_interpolation=cv2.INTER_NEAREST,
+ ... area_for_downscale="image", # Use INTER_AREA for image downscaling
+ ... p=1.0
+ ... ),
+ ... ], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bbox_labels']),
+ ... keypoint_params=A.KeypointParams(format='xy', label_fields=['keypoint_labels']))
+ >>>
+ >>> # Apply the transform
+ >>> transformed = transform(
+ ... image=image,
+ ... mask=mask,
+ ... bboxes=bboxes,
+ ... bbox_labels=bbox_labels,
+ ... keypoints=keypoints,
+ ... keypoint_labels=keypoint_labels
+ ... )
+ >>>
+ >>> # Get the transformed data
+ >>> transformed_image = transformed['image'] # Shape: (64, 64, 3)
+ >>> transformed_mask = transformed['mask'] # Shape: (64, 64)
+ >>> transformed_bboxes = transformed['bboxes'] # Bounding boxes adjusted to new crop and size
+ >>> transformed_bbox_labels = transformed['bbox_labels'] # Labels for the preserved bboxes
+ >>> transformed_keypoints = transformed['keypoints'] # Keypoints adjusted to new crop and size
+ >>> transformed_keypoint_labels = transformed['keypoint_labels'] # Labels for the preserved keypoints
+
+ """
+
+ _targets = ALL_TARGETS
+
+ class InitSchema(BaseTransformInitSchema):
+ interpolation: Literal[
+ cv2.INTER_NEAREST,
+ cv2.INTER_NEAREST_EXACT,
+ cv2.INTER_LINEAR,
+ cv2.INTER_CUBIC,
+ cv2.INTER_AREA,
+ cv2.INTER_LANCZOS4,
+ cv2.INTER_LINEAR_EXACT,
+ ]
+ mask_interpolation: Literal[
+ cv2.INTER_NEAREST,
+ cv2.INTER_NEAREST_EXACT,
+ cv2.INTER_LINEAR,
+ cv2.INTER_CUBIC,
+ cv2.INTER_AREA,
+ cv2.INTER_LANCZOS4,
+ cv2.INTER_LINEAR_EXACT,
+ ]
+ min_max_height: OnePlusIntRangeType
+ w2h_ratio: Annotated[float, Field(gt=0)]
+ size: Annotated[tuple[int, int], AfterValidator(check_range_bounds(1, None))]
+ area_for_downscale: Literal[None, "image", "image_mask"]
+
+ def __init__(
+ self,
+ min_max_height: tuple[int, int],
+ size: tuple[int, int],
+ w2h_ratio: float = 1.0,
+ interpolation: Literal[
+ cv2.INTER_NEAREST,
+ cv2.INTER_NEAREST_EXACT,
+ cv2.INTER_LINEAR,
+ cv2.INTER_CUBIC,
+ cv2.INTER_AREA,
+ cv2.INTER_LANCZOS4,
+ cv2.INTER_LINEAR_EXACT,
+ ] = cv2.INTER_LINEAR,
+ mask_interpolation: Literal[
+ cv2.INTER_NEAREST,
+ cv2.INTER_NEAREST_EXACT,
+ cv2.INTER_LINEAR,
+ cv2.INTER_CUBIC,
+ cv2.INTER_AREA,
+ cv2.INTER_LANCZOS4,
+ cv2.INTER_LINEAR_EXACT,
+ ] = cv2.INTER_NEAREST,
+ area_for_downscale: Literal[None, "image", "image_mask"] = None,
+ p: float = 1.0,
+ ):
+ super().__init__(
+ size=size,
+ interpolation=interpolation,
+ mask_interpolation=mask_interpolation,
+ area_for_downscale=area_for_downscale,
+ p=p,
+ )
+ self.min_max_height = min_max_height
+ self.w2h_ratio = w2h_ratio
+
+ def get_params_dependent_on_data(
+ self,
+ params: dict[str, Any],
+ data: dict[str, Any],
+ ) -> dict[str, tuple[int, int, int, int]]:
+ """Get the parameters dependent on the data.
+
+ Args:
+ params (dict[str, Any]): The parameters of the transform.
+ data (dict[str, Any]): The data of the transform.
+
+ """
+ image_shape = params["shape"][:2]
+
+ crop_height = self.py_random.randint(*self.min_max_height)
+ crop_width = int(crop_height * self.w2h_ratio)
+
+ crop_shape = (crop_height, crop_width)
+
+ h_start = self.py_random.random()
+ w_start = self.py_random.random()
+
+ crop_coords = fcrops.get_crop_coords(image_shape, crop_shape, h_start, w_start)
+
+ return {"crop_coords": crop_coords}
+
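+ # Standalone sketch (not part of the packaged module) of the sampling above,
+ # assuming get_crop_coords maps the fractional offsets h_start/w_start onto
+ # the valid range of crop origins:
+ import random
+
+ min_max_height, w2h_ratio = (50, 80), 1.0
+ image_h, image_w = 100, 100
+ crop_h = random.randint(*min_max_height)
+ crop_w = int(crop_h * w2h_ratio)
+ y1 = int(random.random() * (image_h - crop_h + 1))
+ x1 = int(random.random() * (image_w - crop_w + 1))
+ crop_coords = (x1, y1, x1 + crop_w, y1 + crop_h)  # later resized to `size`
+ assert crop_coords[2] <= image_w and crop_coords[3] <= image_h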
+
+ class RandomResizedCrop(_BaseRandomSizedCrop):
+ """Crop a random part of the input and rescale it to a specified size.
+
+ This transform first crops a random portion of the input image (or mask, bounding boxes, keypoints)
+ and then resizes the crop to a specified size. It's particularly useful for training neural networks
+ on images of varying sizes and aspect ratios.
+
+ Args:
+ size (tuple[int, int]): Target size for the output image, i.e. (height, width) after crop and resize.
+ scale (tuple[float, float]): Range of the random size of the crop relative to the input size.
+ For example, (0.08, 1.0) means the crop size will be between 8% and 100% of the input size.
+ Default: (0.08, 1.0)
+ ratio (tuple[float, float]): Range of aspect ratios of the random crop.
+ For example, (0.75, 1.3333) allows crop aspect ratios from 3:4 to 4:3.
+ Default: (0.75, 1.3333333333333333)
+ interpolation (OpenCV flag): Flag that is used to specify the interpolation algorithm. Should be one of:
+ cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_LANCZOS4.
+ Default: cv2.INTER_LINEAR
+ mask_interpolation (OpenCV flag): Flag that is used to specify the interpolation algorithm for mask.
+ Should be one of: cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_LANCZOS4.
+ Default: cv2.INTER_NEAREST
+ area_for_downscale (Literal[None, "image", "image_mask"]): Controls automatic use of INTER_AREA interpolation
+ for downscaling. Options:
+ - None: No automatic interpolation selection, always use the specified interpolation method
+ - "image": Use INTER_AREA when downscaling images, retain specified interpolation for upscaling and masks
+ - "image_mask": Use INTER_AREA when downscaling both images and masks
+ Default: None.
+ p (float): Probability of applying the transform. Default: 1.0
+
+ Targets:
+ image, mask, bboxes, keypoints, volume, mask3d
+
+ Image types:
+ uint8, float32
+
+ Note:
+ - This transform attempts to crop a random area with an aspect ratio and relative size
+ specified by 'ratio' and 'scale' parameters. If it fails to find a suitable crop after
+ 10 attempts, it will return a crop from the center of the image.
+ - The crop's aspect ratio is defined as width / height.
+ - Bounding boxes that end up fully outside the cropped area will be removed.
+ - Keypoints that end up outside the cropped area will be removed.
+ - After cropping, the result is resized to the specified size.
+ - When area_for_downscale is set, INTER_AREA interpolation will be used automatically for
+ downscaling (when the crop is larger than the target size), which provides better quality for size reduction.
+
+ Mathematical Details:
+ 1. A target area A is sampled from the range [scale[0] * input_area, scale[1] * input_area].
+ 2. A target aspect ratio r is sampled from the range [ratio[0], ratio[1]].
+ 3. The crop width and height are computed as:
+ w = sqrt(A * r)
+ h = sqrt(A / r)
+ 4. If w and h are within the input image dimensions, the crop is accepted.
+ Otherwise, steps 1-3 are repeated (up to 10 times).
+ 5. If no valid crop is found after 10 attempts, a centered crop is taken.
+ 6. The crop is then resized to the specified size.
+
+ Examples:
+ >>> import numpy as np
+ >>> import albumentations as A
+ >>> import cv2
+ >>>
+ >>> # Prepare sample data
+ >>> image = np.random.randint(0, 256, (100, 100, 3), dtype=np.uint8)
+ >>> mask = np.random.randint(0, 2, (100, 100), dtype=np.uint8)
+ >>> bboxes = np.array([[10, 10, 50, 50], [40, 40, 80, 80]], dtype=np.float32)
+ >>> bbox_labels = [1, 2]
+ >>> keypoints = np.array([[20, 30], [60, 70]], dtype=np.float32)
+ >>> keypoint_labels = [0, 1]
+ >>>
+ >>> # Define transform with parameters as tuples
+ >>> transform = A.Compose([
+ ... A.RandomResizedCrop(
+ ... size=(64, 64),
+ ... scale=(0.5, 0.9), # Crop size will be 50-90% of original image
+ ... ratio=(0.75, 1.33), # Aspect ratio will vary from 3:4 to 4:3
+ ... interpolation=cv2.INTER_LINEAR,
+ ... mask_interpolation=cv2.INTER_NEAREST,
+ ... area_for_downscale="image", # Use INTER_AREA for image downscaling
+ ... p=1.0
+ ... ),
+ ... ], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bbox_labels']),
+ ... keypoint_params=A.KeypointParams(format='xy', label_fields=['keypoint_labels']))
+ >>>
+ >>> # Apply the transform
+ >>> transformed = transform(
+ ... image=image,
+ ... mask=mask,
+ ... bboxes=bboxes,
+ ... bbox_labels=bbox_labels,
+ ... keypoints=keypoints,
+ ... keypoint_labels=keypoint_labels
+ ... )
+ >>>
+ >>> # Get the transformed data
+ >>> transformed_image = transformed['image'] # Shape: (64, 64, 3)
+ >>> transformed_mask = transformed['mask'] # Shape: (64, 64)
+ >>> transformed_bboxes = transformed['bboxes'] # Bounding boxes adjusted to new crop and size
+ >>> transformed_bbox_labels = transformed['bbox_labels'] # Labels for the preserved bboxes
+ >>> transformed_keypoints = transformed['keypoints'] # Keypoints adjusted to new crop and size
+ >>> transformed_keypoint_labels = transformed['keypoint_labels'] # Labels for the preserved keypoints
+
+ """
+
+ _targets = ALL_TARGETS
+
+ class InitSchema(BaseTransformInitSchema):
+ scale: Annotated[tuple[float, float], AfterValidator(check_range_bounds(0, 1)), AfterValidator(nondecreasing)]
+ ratio: Annotated[
+ tuple[float, float],
+ AfterValidator(check_range_bounds(0, None)),
+ AfterValidator(nondecreasing),
+ ]
+ size: Annotated[tuple[int, int], AfterValidator(check_range_bounds(1, None))]
+ interpolation: Literal[
+ cv2.INTER_NEAREST,
+ cv2.INTER_NEAREST_EXACT,
+ cv2.INTER_LINEAR,
+ cv2.INTER_CUBIC,
+ cv2.INTER_AREA,
+ cv2.INTER_LANCZOS4,
+ cv2.INTER_LINEAR_EXACT,
+ ]
+ mask_interpolation: Literal[
+ cv2.INTER_NEAREST,
+ cv2.INTER_NEAREST_EXACT,
+ cv2.INTER_LINEAR,
+ cv2.INTER_CUBIC,
+ cv2.INTER_AREA,
+ cv2.INTER_LANCZOS4,
+ cv2.INTER_LINEAR_EXACT,
+ ]
+ area_for_downscale: Literal[None, "image", "image_mask"]
+
+ def __init__(
+ self,
+ size: tuple[int, int],
+ scale: tuple[float, float] = (0.08, 1.0),
+ ratio: tuple[float, float] = (0.75, 1.3333333333333333),
+ interpolation: Literal[
+ cv2.INTER_NEAREST,
+ cv2.INTER_NEAREST_EXACT,
+ cv2.INTER_LINEAR,
+ cv2.INTER_CUBIC,
+ cv2.INTER_AREA,
+ cv2.INTER_LANCZOS4,
+ cv2.INTER_LINEAR_EXACT,
+ ] = cv2.INTER_LINEAR,
+ mask_interpolation: Literal[
+ cv2.INTER_NEAREST,
+ cv2.INTER_NEAREST_EXACT,
+ cv2.INTER_LINEAR,
+ cv2.INTER_CUBIC,
+ cv2.INTER_AREA,
+ cv2.INTER_LANCZOS4,
+ cv2.INTER_LINEAR_EXACT,
+ ] = cv2.INTER_NEAREST,
+ area_for_downscale: Literal[None, "image", "image_mask"] = None,
+ p: float = 1.0,
+ ):
+ super().__init__(
+ size=size,
+ interpolation=interpolation,
+ mask_interpolation=mask_interpolation,
+ area_for_downscale=area_for_downscale,
+ p=p,
+ )
+ self.scale = scale
+ self.ratio = ratio
+
+ def get_params_dependent_on_data(
+ self,
+ params: dict[str, Any],
+ data: dict[str, Any],
+ ) -> dict[str, tuple[int, int, int, int]]:
+ """Get the parameters dependent on the data.
+
+ Args:
+ params (dict[str, Any]): The parameters of the transform.
+ data (dict[str, Any]): The data of the transform.
+
+ """
+ image_shape = params["shape"][:2]
+ image_height, image_width = image_shape
+
+ area = image_height * image_width
+
+ # Pre-compute constants to avoid repeated calculations
+ scale_min_area = self.scale[0] * area
+ scale_max_area = self.scale[1] * area
+ log_ratio_min = math.log(self.ratio[0])
+ log_ratio_max = math.log(self.ratio[1])
+
+ for _ in range(10):
+ target_area = self.py_random.uniform(scale_min_area, scale_max_area)
+ aspect_ratio = math.exp(self.py_random.uniform(log_ratio_min, log_ratio_max))
+
+ width = round(math.sqrt(target_area * aspect_ratio))
+ height = round(math.sqrt(target_area / aspect_ratio))
+
+ if 0 < width <= image_width and 0 < height <= image_height:
+ h_start = self.py_random.random()
+ w_start = self.py_random.random()
+ crop_coords = fcrops.get_crop_coords(image_shape, (height, width), h_start, w_start)
+ return {"crop_coords": crop_coords}
+
+ # Fallback to central crop - use proper function
+ in_ratio = image_width / image_height
+ if in_ratio < self.ratio[0]:
+ width = image_width
+ height = round(image_width / self.ratio[0])
+ elif in_ratio > self.ratio[1]:
+ height = image_height
+ width = round(height * self.ratio[1])
+ else: # whole image
+ width = image_width
+ height = image_height
+
+ crop_coords = fcrops.get_center_crop_coords(image_shape, (height, width))
+ return {"crop_coords": crop_coords}
+
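+ # Standalone numeric sketch (not part of the packaged module) of steps 1-3 of
+ # the sampling loop above: draw a target area and a log-uniform aspect ratio,
+ # then recover the crop width and height from them.
+ import math
+ import random
+
+ image_h = image_w = 100
+ scale, ratio = (0.5, 0.9), (0.75, 1.3333333333333333)
+ area = image_h * image_w
+ target_area = random.uniform(scale[0] * area, scale[1] * area)
+ aspect_ratio = math.exp(random.uniform(math.log(ratio[0]), math.log(ratio[1])))
+ w = round(math.sqrt(target_area * aspect_ratio))
+ h = round(math.sqrt(target_area / aspect_ratio))
+ # Up to rounding, w * h recovers target_area and w / h recovers aspect_ratio.
+ assert abs(w * h - target_area) / target_area < 0.05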
+
+ class RandomCropNearBBox(BaseCrop):
+ """Crop bbox from image with random shift by x,y coordinates
+
+ Args:
+ max_part_shift (float, (float, float)): Max shift in `height` and `width` dimensions relative
+ to `cropping_bbox` dimension.
+ If max_part_shift is a single float, the range will be (0, max_part_shift).
+ Default (0, 0.3).
+ cropping_bbox_key (str): Additional target key for cropping box. Default `cropping_bbox`.
+ p (float): probability of applying the transform. Default: 1.
+
+ Targets:
+ image, mask, bboxes, keypoints, volume, mask3d
+
+ Image types:
+ uint8, float32
+
+ Examples:
+ >>> aug = Compose([RandomCropNearBBox(max_part_shift=(0.1, 0.5), cropping_bbox_key='test_bbox')],
+ ... bbox_params=BboxParams("pascal_voc"))
+ >>> result = aug(image=image, bboxes=bboxes, test_bbox=[0, 5, 10, 20])
+
+ """
+
+ _targets = ALL_TARGETS
+
+ class InitSchema(BaseTransformInitSchema):
+ max_part_shift: ZeroOneRangeType
+ cropping_bbox_key: str
+
+ def __init__(
+ self,
+ max_part_shift: tuple[float, float] | float = (0, 0.3),
+ cropping_bbox_key: str = "cropping_bbox",
+ p: float = 1.0,
+ ):
+ super().__init__(p=p)
+ self.max_part_shift = cast("tuple[float, float]", max_part_shift)
+ self.cropping_bbox_key = cropping_bbox_key
+
+ def get_params_dependent_on_data(
+ self,
+ params: dict[str, Any],
+ data: dict[str, Any],
+ ) -> dict[str, tuple[float, ...]]:
+ """Get the parameters dependent on the data.
+
+ Args:
+ params (dict[str, Any]): The parameters of the transform.
+ data (dict[str, Any]): The data of the transform.
+
+ """
+ bbox = data[self.cropping_bbox_key]
+
+ image_shape = params["shape"][:2]
+
+ bbox = self._clip_bbox(bbox, image_shape)
+
+ h_max_shift = round((bbox[3] - bbox[1]) * self.max_part_shift[0])
+ w_max_shift = round((bbox[2] - bbox[0]) * self.max_part_shift[1])
+
+ x_min = bbox[0] - self.py_random.randint(-w_max_shift, w_max_shift)
+ x_max = bbox[2] + self.py_random.randint(-w_max_shift, w_max_shift)
+
+ y_min = bbox[1] - self.py_random.randint(-h_max_shift, h_max_shift)
+ y_max = bbox[3] + self.py_random.randint(-h_max_shift, h_max_shift)
+
+ crop_coords = self._clip_bbox((x_min, y_min, x_max, y_max), image_shape)
+
+ if crop_coords[0] == crop_coords[2] or crop_coords[1] == crop_coords[3]:
+ crop_shape = (bbox[3] - bbox[1], bbox[2] - bbox[0])
+ crop_coords = fcrops.get_center_crop_coords(image_shape, crop_shape)
+
+ return {"crop_coords": crop_coords}
+
+ @property
+ def targets_as_params(self) -> list[str]:
+ """Get the targets as parameters.
+
+ Returns:
+ list[str]: The targets as parameters.
+
+ """
+ return [self.cropping_bbox_key]
+
+
+ class BBoxSafeRandomCrop(BaseCrop):
+ """Crop an area from image while ensuring all bounding boxes are preserved in the crop.
+
+ Similar to AtLeastOneBboxRandomCrop, but with a key difference:
+ - BBoxSafeRandomCrop ensures ALL bounding boxes are preserved in the crop when erosion_rate=0.0
+ - AtLeastOneBboxRandomCrop ensures AT LEAST ONE bounding box is present in the crop
+
+ This makes BBoxSafeRandomCrop more suitable for scenarios where:
+ - You need to preserve all objects in the scene
+ - Losing any bounding box would be problematic (e.g., rare object classes)
+ - You're training a model that needs to detect multiple objects simultaneously
+
+ The algorithm:
+ 1. If bounding boxes exist:
+ - Computes the union of all bounding boxes
+ - Applies erosion based on erosion_rate to this union
+ - Clips the eroded union to valid image coordinates [0,1]
+ - Randomly samples crop coordinates within the clipped union area
+ 2. If no bounding boxes exist:
+ - Computes crop height based on erosion_rate
+ - Sets crop width to maintain original aspect ratio
+ - Randomly places the crop within the image
+
+ Args:
+ erosion_rate (float): Controls how much the valid crop region can deviate from the bbox union.
+ Must be in range [0.0, 1.0].
+ - 0.0: crop must contain the exact bbox union (safest option that guarantees all boxes are preserved)
+ - 1.0: crop can deviate maximally from the bbox union (increases likelihood of cutting off some boxes)
+ Defaults to 0.0.
+ p (float, optional): Probability of applying the transform. Defaults to 1.0.
+
+ Targets:
+ image, mask, bboxes, keypoints, volume, mask3d
+
+ Image types:
+ uint8, float32
+
+ Raises:
+ CropSizeError: If requested crop size exceeds image dimensions
+
+ Examples:
+ >>> import numpy as np
+ >>> import albumentations as A
+ >>>
+ >>> # Prepare sample data
+ >>> image = np.random.randint(0, 256, (100, 100, 3), dtype=np.uint8)
+ >>> mask = np.random.randint(0, 2, (100, 100), dtype=np.uint8)
+ >>> bboxes = np.array([[10, 10, 50, 50], [40, 40, 80, 80]], dtype=np.float32)
+ >>> bbox_labels = [1, 2]
+ >>> keypoints = np.array([[20, 30], [60, 70]], dtype=np.float32)
+ >>> keypoint_labels = [0, 1]
+ >>>
+ >>> # Define transform with erosion_rate parameter
+ >>> transform = A.Compose([
+ ... A.BBoxSafeRandomCrop(erosion_rate=0.2),
+ ... ], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bbox_labels']),
+ ... keypoint_params=A.KeypointParams(format='xy', label_fields=['keypoint_labels']))
+ >>>
+ >>> # Apply the transform
+ >>> result = transform(
+ ... image=image,
+ ... mask=mask,
+ ... bboxes=bboxes,
+ ... bbox_labels=bbox_labels,
+ ... keypoints=keypoints,
+ ... keypoint_labels=keypoint_labels
+ ... )
+ >>>
+ >>> # Get the transformed data
+ >>> transformed_image = result['image'] # Cropped image containing all bboxes
+ >>> transformed_mask = result['mask'] # Cropped mask
+ >>> transformed_bboxes = result['bboxes'] # All bounding boxes preserved with adjusted coordinates
+ >>> transformed_bbox_labels = result['bbox_labels'] # Original labels preserved
+ >>> transformed_keypoints = result['keypoints'] # Keypoints with adjusted coordinates
+ >>> transformed_keypoint_labels = result['keypoint_labels'] # Original keypoint labels preserved
+ >>>
+ >>> # Example with a different erosion_rate
+ >>> transform_more_flexible = A.Compose([
+ ... A.BBoxSafeRandomCrop(erosion_rate=0.5), # More flexibility in crop placement
+ ... ], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bbox_labels']))
+ >>>
+ >>> # Apply transform with only image and bboxes
+ >>> result_bboxes_only = transform_more_flexible(
+ ... image=image,
+ ... bboxes=bboxes,
+ ... bbox_labels=bbox_labels
+ ... )
+ >>> transformed_image = result_bboxes_only['image']
+ >>> transformed_bboxes = result_bboxes_only['bboxes'] # All bboxes still preserved
+
+ Note:
+ - IMPORTANT: Using erosion_rate > 0.0 may result in some bounding boxes being cut off,
+ particularly narrow boxes at the boundary of the union area. For guaranteed preservation
+ of all bounding boxes, use erosion_rate=0.0.
+ - Aspect ratio is preserved only when no bounding boxes are present
+ - May be more restrictive in crop placement compared to AtLeastOneBboxRandomCrop
+ - The crop size is determined by the bounding boxes when present
+
+ """
+
+ _targets = ALL_TARGETS
+
+ class InitSchema(BaseTransformInitSchema):
+ erosion_rate: float = Field(
+ ge=0.0,
+ le=1.0,
+ )
+
+ def __init__(self, erosion_rate: float = 0.0, p: float = 1.0):
+ super().__init__(p=p)
+ self.erosion_rate = erosion_rate
+
+ def _get_coords_no_bbox(self, image_shape: tuple[int, int]) -> tuple[int, int, int, int]:
+ image_height, image_width = image_shape
+
+ erosive_h = int(image_height * (1.0 - self.erosion_rate))
+ crop_height = image_height if erosive_h >= image_height else self.py_random.randint(erosive_h, image_height)
+
+ crop_width = int(crop_height * image_width / image_height)
+
+ h_start = self.py_random.random()
+ w_start = self.py_random.random()
+
+ crop_shape = (crop_height, crop_width)
+
+ return fcrops.get_crop_coords(image_shape, crop_shape, h_start, w_start)
+
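+ # Standalone sketch (not part of the packaged module) of the no-bbox branch
+ # above: erosion_rate bounds the minimum crop height, and the width is chosen
+ # to preserve the image aspect ratio.
+ import random
+
+ image_h, image_w, erosion_rate = 300, 200, 0.2
+ erosive_h = int(image_h * (1.0 - erosion_rate))  # 240
+ crop_h = image_h if erosive_h >= image_h else random.randint(erosive_h, image_h)
+ crop_w = int(crop_h * image_w / image_h)  # keeps the 200:300 aspect ratio
+ assert 240 <= crop_h <= 300
+ assert abs(crop_w / crop_h - image_w / image_h) < 0.01
+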
+ def get_params_dependent_on_data(
+ self,
+ params: dict[str, Any],
+ data: dict[str, Any],
+ ) -> dict[str, tuple[int, int, int, int]]:
+ """Get the parameters dependent on the data.
+
+ Args:
+ params (dict[str, Any]): The parameters of the transform.
+ data (dict[str, Any]): The data of the transform.
+
+ """
+ image_shape = params["shape"][:2]
+
+ if len(data["bboxes"]) == 0: # unlikely case: this class is intended for use with bboxes
+ crop_coords = self._get_coords_no_bbox(image_shape)
+ return {"crop_coords": crop_coords}
+
+ bbox_union = union_of_bboxes(bboxes=data["bboxes"], erosion_rate=self.erosion_rate)
+
+ if bbox_union is None:
+ crop_coords = self._get_coords_no_bbox(image_shape)
+ return {"crop_coords": crop_coords}
+
+ x_min, y_min, x_max, y_max = bbox_union
+
+ x_min = np.clip(x_min, 0, 1)
+ y_min = np.clip(y_min, 0, 1)
+ x_max = np.clip(x_max, x_min, 1)
+ y_max = np.clip(y_max, y_min, 1)
+
+ image_height, image_width = image_shape
+
+ crop_x_min = int(x_min * self.py_random.random() * image_width)
+ crop_y_min = int(y_min * self.py_random.random() * image_height)
+
+ bbox_xmax = x_max + (1 - x_max) * self.py_random.random()
+ bbox_ymax = y_max + (1 - y_max) * self.py_random.random()
+ crop_x_max = int(bbox_xmax * image_width)
+ crop_y_max = int(bbox_ymax * image_height)
+
+ return {"crop_coords": (crop_x_min, crop_y_min, crop_x_max, crop_y_max)}
+
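+ # Standalone sketch (not part of the packaged module) of the sampling above
+ # for a normalized bbox union: the crop origin is drawn in [0, union_min] and
+ # the far edge in [union_max, 1], so with erosion_rate=0 the whole union
+ # always stays inside the crop.
+ import random
+
+ x_min, y_min, x_max, y_max = 0.2, 0.3, 0.7, 0.8  # normalized bbox union
+ image_h = image_w = 100
+ crop_x_min = int(x_min * random.random() * image_w)  # in [0, 20)
+ crop_y_min = int(y_min * random.random() * image_h)  # in [0, 30)
+ crop_x_max = int((x_max + (1 - x_max) * random.random()) * image_w)  # in [70, 100)
+ crop_y_max = int((y_max + (1 - y_max) * random.random()) * image_h)  # in [80, 100)
+ assert crop_x_min <= 20 and crop_y_min <= 30
+ assert crop_x_max >= 70 and crop_y_max >= 80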
+
+ class RandomSizedBBoxSafeCrop(BBoxSafeRandomCrop):
+ """Crop a random part of the input and rescale it to a specific size without loss of bounding boxes.
+
+ This transform first attempts to crop a random portion of the input image while ensuring that all bounding boxes
+ remain within the cropped area. It then resizes the crop to the specified size. This is particularly useful for
+ object detection tasks where preserving all objects in the image is crucial while also standardizing the image size.
+
+ Args:
+ height (int): Height of the output image after resizing.
+ width (int): Width of the output image after resizing.
+ erosion_rate (float): A value between 0.0 and 1.0 that determines the minimum allowable size of the crop
+ as a fraction of the original image size. For example, an erosion_rate of 0.2 means the crop will be
+ at least 80% of the original image height and width. Default: 0.0 (no minimum size).
+ interpolation (OpenCV flag): Flag that is used to specify the interpolation algorithm. Should be one of:
+ cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_LANCZOS4.
+ Default: cv2.INTER_LINEAR.
+ mask_interpolation (OpenCV flag): Flag that is used to specify the interpolation algorithm for mask.
+ Should be one of: cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_LANCZOS4.
+ Default: cv2.INTER_NEAREST.
+ p (float): Probability of applying the transform. Default: 1.0.
+
+ Targets:
+ image, mask, bboxes, keypoints, volume, mask3d
+
+ Image types:
+ uint8, float32
+
+ Note:
+ - This transform ensures that all bounding boxes in the original image are fully contained within the
+ cropped area. If it's not possible to find such a crop (e.g., when bounding boxes are too spread out),
+ it will default to cropping the entire image.
+ - After cropping, the result is resized to the specified (height, width) size.
+ - Bounding box coordinates are adjusted to match the new image size.
+ - Keypoints are moved along with the crop and scaled to the new image size.
+ - If there are no bounding boxes in the image, it will fall back to a random crop.
+
+ Mathematical Details:
+ 1. A crop region is selected that includes all bounding boxes.
+ 2. The crop size is determined by the erosion_rate:
+ min_crop_size = (1 - erosion_rate) * original_size
+ 3. If the selected crop is smaller than min_crop_size, it's expanded to meet this requirement.
+ 4. The crop is then resized to the specified (height, width) size.
+ 5. Bounding box coordinates are transformed to match the new image size:
+ new_coord = (old_coord - crop_start) * (new_size / crop_size)
+
+ Examples:
+ >>> import numpy as np
+ >>> import albumentations as A
+ >>> import cv2
+ >>>
+ >>> # Prepare sample data
+ >>> image = np.random.randint(0, 256, (300, 300, 3), dtype=np.uint8)
+ >>> mask = np.random.randint(0, 2, (300, 300), dtype=np.uint8)
+ >>>
+ >>> # Create bounding boxes with some overlap and separation
+ >>> bboxes = np.array([
+ ... [10, 10, 80, 80], # top-left box
+ ... [100, 100, 200, 200], # center box
+ ... [210, 210, 290, 290] # bottom-right box
+ ... ], dtype=np.float32)
+ >>> bbox_labels = ['cat', 'dog', 'bird']
+ >>>
+ >>> # Create keypoints inside the bounding boxes
+ >>> keypoints = np.array([
+ ... [45, 45], # inside first box
+ ... [150, 150], # inside second box
+ ... [250, 250] # inside third box
+ ... ], dtype=np.float32)
+ >>> keypoint_labels = ['nose', 'eye', 'tail']
+ >>>
+ >>> # Example 1: Basic usage with default parameters
+ >>> transform_basic = A.Compose([
+ ... A.RandomSizedBBoxSafeCrop(height=224, width=224, p=1.0),
+ ... ], bbox_params=A.BboxParams(
+ ... format='pascal_voc',
+ ... label_fields=['bbox_labels']
+ ... ), keypoint_params=A.KeypointParams(
+ ... format='xy',
+ ... label_fields=['keypoint_labels']
+ ... ))
+ >>>
+ >>> # Apply the transform
+ >>> result_basic = transform_basic(
+ ... image=image,
+ ... mask=mask,
+ ... bboxes=bboxes,
+ ... bbox_labels=bbox_labels,
+ ... keypoints=keypoints,
+ ... keypoint_labels=keypoint_labels
+ ... )
+ >>>
+ >>> # Access the transformed data
+ >>> transformed_image = result_basic['image'] # Shape will be (224, 224, 3)
+ >>> transformed_mask = result_basic['mask'] # Shape will be (224, 224)
+ >>> transformed_bboxes = result_basic['bboxes'] # All original bounding boxes preserved
+ >>> transformed_bbox_labels = result_basic['bbox_labels'] # Original labels preserved
+ >>> transformed_keypoints = result_basic['keypoints'] # Keypoints adjusted to new coordinates
+ >>> transformed_keypoint_labels = result_basic['keypoint_labels'] # Original labels preserved
+ >>>
+ >>> # Example 2: With erosion_rate for more flexibility in crop placement
+ >>> transform_erosion = A.Compose([
+ ... A.RandomSizedBBoxSafeCrop(
+ ... height=256,
+ ... width=256,
+ ... erosion_rate=0.2, # Allows 20% flexibility in crop placement
+ ... interpolation=cv2.INTER_CUBIC, # Higher quality interpolation
+ ... mask_interpolation=cv2.INTER_NEAREST, # Preserve mask edges
+ ... p=1.0
+ ... ),
+ ... ], bbox_params=A.BboxParams(
+ ... format='pascal_voc',
+ ... label_fields=['bbox_labels'],
+ ... min_visibility=0.3 # Only keep bboxes with at least 30% visibility
+ ... ), keypoint_params=A.KeypointParams(
+ ... format='xy',
+ ... label_fields=['keypoint_labels'],
+ ... remove_invisible=True # Remove keypoints outside the crop
+ ... ))
+ >>>
+ >>> # Apply the transform with erosion
+ >>> result_erosion = transform_erosion(
+ ... image=image,
+ ... mask=mask,
+ ... bboxes=bboxes,
+ ... bbox_labels=bbox_labels,
+ ... keypoints=keypoints,
+ ... keypoint_labels=keypoint_labels
+ ... )
+ >>>
+ >>> # With erosion_rate=0.2, the crop has more flexibility in placement
+ >>> # while still ensuring all bounding boxes are included
+
+ """
+
+
+ _targets = ALL_TARGETS
+
+ class InitSchema(BaseTransformInitSchema):
+ height: Annotated[int, Field(ge=1)]
+ width: Annotated[int, Field(ge=1)]
+ erosion_rate: float = Field(
+ ge=0.0,
+ le=1.0,
+ )
+ interpolation: Literal[
+ cv2.INTER_NEAREST,
+ cv2.INTER_NEAREST_EXACT,
+ cv2.INTER_LINEAR,
+ cv2.INTER_CUBIC,
+ cv2.INTER_AREA,
+ cv2.INTER_LANCZOS4,
+ cv2.INTER_LINEAR_EXACT,
+ ]
+ mask_interpolation: Literal[
+ cv2.INTER_NEAREST,
+ cv2.INTER_NEAREST_EXACT,
+ cv2.INTER_LINEAR,
+ cv2.INTER_CUBIC,
+ cv2.INTER_AREA,
+ cv2.INTER_LANCZOS4,
+ cv2.INTER_LINEAR_EXACT,
+ ]
+
+ def __init__(
+ self,
+ height: int,
+ width: int,
+ erosion_rate: float = 0.0,
+ interpolation: Literal[
+ cv2.INTER_NEAREST,
+ cv2.INTER_NEAREST_EXACT,
+ cv2.INTER_LINEAR,
+ cv2.INTER_CUBIC,
+ cv2.INTER_AREA,
+ cv2.INTER_LANCZOS4,
+ cv2.INTER_LINEAR_EXACT,
+ ] = cv2.INTER_LINEAR,
+ mask_interpolation: Literal[
+ cv2.INTER_NEAREST,
+ cv2.INTER_NEAREST_EXACT,
+ cv2.INTER_LINEAR,
+ cv2.INTER_CUBIC,
+ cv2.INTER_AREA,
+ cv2.INTER_LANCZOS4,
+ cv2.INTER_LINEAR_EXACT,
+ ] = cv2.INTER_NEAREST,
+ p: float = 1.0,
+ ):
+ super().__init__(erosion_rate=erosion_rate, p=p)
+ self.height = height
+ self.width = width
+ self.interpolation = interpolation
+ self.mask_interpolation = mask_interpolation
+
+ def apply(
+ self,
+ img: np.ndarray,
+ crop_coords: tuple[int, int, int, int],
+ **params: Any,
+ ) -> np.ndarray:
+ """Apply the crop and resize transform to an image.
+
+ Args:
+ img (np.ndarray): The image to apply the crop and resize transform to.
+ crop_coords (tuple[int, int, int, int]): The parameters for the crop.
+ params (dict[str, Any]): Additional parameters for the transform.
+
+ """
+ crop = fcrops.crop(img, *crop_coords)
+ return fgeometric.resize(crop, (self.height, self.width), self.interpolation)
+
+ def apply_to_mask(
+ self,
+ mask: np.ndarray,
+ crop_coords: tuple[int, int, int, int],
+ **params: Any,
+ ) -> np.ndarray:
+ """Apply the crop and resize transform to a mask.
+
+ Args:
+ mask (np.ndarray): The mask to apply the crop and resize transform to.
+ crop_coords (tuple[int, int, int, int]): The parameters for the crop.
+ params (dict[str, Any]): Additional parameters for the transform.
+
+ """
+ crop = fcrops.crop(mask, *crop_coords)
+ return fgeometric.resize(crop, (self.height, self.width), self.mask_interpolation)
+
+ def apply_to_keypoints(
+ self,
+ keypoints: np.ndarray,
+ crop_coords: tuple[int, int, int, int],
+ **params: Any,
+ ) -> np.ndarray:
+ """Apply the crop and resize transform to keypoints.
+
+ Args:
+ keypoints (np.ndarray): The keypoints to apply the crop and resize transform to.
+ crop_coords (tuple[int, int, int, int]): The parameters for the crop.
+ params (dict[str, Any]): Additional parameters for the transform.
+
+ Returns:
+ np.ndarray: The keypoints after the crop and resize transform.
+
+ """
+ keypoints = fcrops.crop_keypoints_by_coords(keypoints, crop_coords)
+
+ crop_height = crop_coords[3] - crop_coords[1]
+ crop_width = crop_coords[2] - crop_coords[0]
+
+ scale_y = self.height / crop_height
+ scale_x = self.width / crop_width
+ return fgeometric.keypoints_scale(keypoints, scale_x=scale_x, scale_y=scale_y)
+
+
+ class CropAndPad(DualTransform):
+ """Crop and pad images by pixel amounts or fractions of image sizes.
+
+ This transform allows for simultaneous cropping and padding of images. Cropping removes pixels from the sides
+ (i.e., extracts a subimage), while padding adds pixels to the sides (e.g., black pixels). The amount of
+ cropping/padding can be specified either in absolute pixels or as a fraction of the image size.
+
+ Args:
+ px (int, tuple of int, tuple of tuples of int, or None):
+ The number of pixels to crop (negative values) or pad (positive values) on each side of the image.
+ Either this or the parameter `percent` may be set, not both at the same time.
+ - If int: crop/pad all sides by this value.
+ - If tuple of 2 ints: crop/pad by (top/bottom, left/right).
+ - If tuple of 4 ints: crop/pad by (top, right, bottom, left).
+ - Each int can also be a tuple of 2 ints for a range, or a list of ints for discrete choices.
+ Default: None.
+
+ percent (float, tuple of float, tuple of tuples of float, or None):
+ The fraction of the image size to crop (negative values) or pad (positive values) on each side.
+ Either this or the parameter `px` may be set, not both at the same time.
+ - If float: crop/pad all sides by this fraction.
+ - If tuple of 2 floats: crop/pad by (top/bottom, left/right) fractions.
+ - If tuple of 4 floats: crop/pad by (top, right, bottom, left) fractions.
+ - Each float can also be a tuple of 2 floats for a range, or a list of floats for discrete choices.
+ Default: None.
+
+ border_mode (int):
+ OpenCV border mode used for padding. Default: cv2.BORDER_CONSTANT.
+
+ fill (tuple[float, ...] | float):
+ The constant value to use for padding if border_mode is cv2.BORDER_CONSTANT.
+ Default: 0.
+
+ fill_mask (tuple[float, ...] | float):
+ Same as fill but used for mask padding. Default: 0.
+
+ keep_size (bool):
+ If True, the output image will be resized to the input image size after cropping/padding.
+ Default: True.
+
+ sample_independently (bool):
+ If True and ranges are used for px/percent, sample a value for each side independently.
+ If False, sample one value and use it for all sides. Default: True.
+
+ interpolation (int):
+ OpenCV interpolation flag used for resizing if keep_size is True.
+ Default: cv2.INTER_LINEAR.
+
+ mask_interpolation (int):
+ OpenCV interpolation flag used for resizing if keep_size is True.
+ Should be one of: cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_LANCZOS4.
+ Default: cv2.INTER_NEAREST.
+
+ p (float):
+ Probability of applying the transform. Default: 1.0.
+
+ Targets:
+ image, mask, bboxes, keypoints, volume, mask3d
+
+ Image types:
+ uint8, float32
+
+ Note:
+ - This transform will never crop images below a height or width of 1.
+ - When using pixel values (px), the image will be cropped/padded by exactly that many pixels.
+ - When using percentages (percent), the amount of crop/pad will be calculated based on the image size.
+ - Bounding boxes that end up fully outside the image after cropping will be removed.
+ - Keypoints that end up outside the image after cropping will be removed.
+
+ Examples:
+ >>> import numpy as np
+ >>> import albumentations as A
+ >>> import cv2
+ >>>
+ >>> # Prepare sample data
+ >>> image = np.random.randint(0, 256, (100, 100, 3), dtype=np.uint8)
+ >>> mask = np.random.randint(0, 2, (100, 100), dtype=np.uint8)
+ >>> bboxes = np.array([[10, 10, 50, 50], [40, 40, 80, 80]], dtype=np.float32)
+ >>> bbox_labels = [1, 2]
+ >>> keypoints = np.array([[20, 30], [60, 70]], dtype=np.float32)
+ >>> keypoint_labels = [0, 1]
+ >>>
+ >>> # Example 1: Using px parameter with specific values for each side
+ >>> # Crop 10px from top, pad 20px on right, pad 30px on bottom, crop 40px from left
+ >>> transform_px = A.Compose([
+ ... A.CropAndPad(
+ ... px=(-10, 20, 30, -40), # (top, right, bottom, left)
+ ... border_mode=cv2.BORDER_CONSTANT,
+ ... fill=128, # Gray padding color
+ ... fill_mask=0,
+ ... keep_size=False, # Don't resize back to original dimensions
+ ... p=1.0
+ ... ),
+ ... ], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bbox_labels']),
+ ... keypoint_params=A.KeypointParams(format='xy', label_fields=['keypoint_labels']))
+ >>>
+ >>> # Apply the transform
+ >>> result_px = transform_px(
+ ... image=image,
+ ... mask=mask,
+ ... bboxes=bboxes,
+ ... bbox_labels=bbox_labels,
+ ... keypoints=keypoints,
+ ... keypoint_labels=keypoint_labels
+ ... )
+ >>>
+ >>> # Get the transformed data with px parameters
+ >>> transformed_image_px = result_px['image'] # Shape will be different from original
+ >>> transformed_mask_px = result_px['mask']
+ >>> transformed_bboxes_px = result_px['bboxes'] # Adjusted to new dimensions
+ >>> transformed_bbox_labels_px = result_px['bbox_labels'] # Bounding box labels after crop
+ >>> transformed_keypoints_px = result_px['keypoints'] # Adjusted to new dimensions
+ >>> transformed_keypoint_labels_px = result_px['keypoint_labels'] # Keypoint labels after crop
+ >>>
+ >>> # Example 2: Using percent parameter as a single value
+ >>> # This will pad all sides by 10% of image dimensions
+ >>> transform_percent = A.Compose([
+ ... A.CropAndPad(
+ ... percent=0.1, # Pad all sides by 10%
+ ... border_mode=cv2.BORDER_REFLECT, # Use reflection padding
+ ... keep_size=True, # Resize back to original dimensions
+ ... p=1.0
+ ... ),
+ ... ], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bbox_labels']),
2854
+ ... keypoint_params=A.KeypointParams(format='xy', label_fields=['keypoint_labels']))
2855
+ >>>
2856
+ >>> # Apply the transform
2857
+ >>> result_percent = transform_percent(
2858
+ ... image=image,
2859
+ ... mask=mask,
2860
+ ... bboxes=bboxes,
2861
+ ... bbox_labels=bbox_labels,
2862
+ ... keypoints=keypoints,
2863
+ ... keypoint_labels=keypoint_labels
2864
+ ... )
2865
+ >>>
2866
+ >>> # Get the transformed data with percent parameters
2867
+ >>> # Since keep_size=True, image dimensions remain the same (100x100)
2868
+ >>> transformed_image_pct = result_percent['image']
2869
+ >>> transformed_mask_pct = result_percent['mask']
2870
+ >>> transformed_bboxes_pct = result_percent['bboxes']
2871
+ >>> transformed_bbox_labels_pct = result_percent['bbox_labels']
2872
+ >>> transformed_keypoints_pct = result_percent['keypoints']
2873
+ >>> transformed_keypoint_labels_pct = result_percent['keypoint_labels']
2874
+ >>>
2875
+ >>> # Example 3: Random padding within a range
2876
+ >>> # Pad top and bottom by 5-15%, left and right by 10-20%
2877
+ >>> transform_random = A.Compose([
2878
+ ... A.CropAndPad(
2879
+ ... percent=[(0.05, 0.15), (0.1, 0.2), (0.05, 0.15), (0.1, 0.2)], # (top, right, bottom, left)
2880
+ ... sample_independently=True, # Sample each side independently
2881
+ ... border_mode=cv2.BORDER_CONSTANT,
2882
+ ... fill=0, # Black padding
2883
+ ... keep_size=False,
2884
+ ... p=1.0
2885
+ ... ),
2886
+ ... ], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bbox_labels']),
2887
+ ... keypoint_params=A.KeypointParams(format='xy', label_fields=['keypoint_labels']))
2888
+ >>>
2889
+ >>> # Result dimensions will vary based on the random padding values chosen
2890
+
2891
+ """
2892
+
2893
+ _targets = ALL_TARGETS
2894
+
2895
+ class InitSchema(BaseTransformInitSchema):
2896
+ px: PxType | None
2897
+ percent: PercentType | None
2898
+ keep_size: bool
2899
+ sample_independently: bool
2900
+ interpolation: Literal[
2901
+ cv2.INTER_NEAREST,
2902
+ cv2.INTER_NEAREST_EXACT,
2903
+ cv2.INTER_LINEAR,
2904
+ cv2.INTER_CUBIC,
2905
+ cv2.INTER_AREA,
2906
+ cv2.INTER_LANCZOS4,
2907
+ cv2.INTER_LINEAR_EXACT,
2908
+ ]
2909
+ mask_interpolation: Literal[
2910
+ cv2.INTER_NEAREST,
2911
+ cv2.INTER_NEAREST_EXACT,
2912
+ cv2.INTER_LINEAR,
2913
+ cv2.INTER_CUBIC,
2914
+ cv2.INTER_AREA,
2915
+ cv2.INTER_LANCZOS4,
2916
+ cv2.INTER_LINEAR_EXACT,
2917
+ ]
2918
+ fill: tuple[float, ...] | float
2919
+ fill_mask: tuple[float, ...] | float
2920
+ border_mode: Literal[
2921
+ cv2.BORDER_CONSTANT,
2922
+ cv2.BORDER_REPLICATE,
2923
+ cv2.BORDER_REFLECT,
2924
+ cv2.BORDER_WRAP,
2925
+ cv2.BORDER_REFLECT_101,
2926
+ ]
2927
+
2928
+ @model_validator(mode="after")
2929
+ def _check_px_percent(self) -> Self:
2930
+ if self.px is None and self.percent is None:
2931
+ msg = "Both px and percent parameters cannot be None simultaneously."
2932
+ raise ValueError(msg)
2933
+ if self.px is not None and self.percent is not None:
2934
+ msg = "Only px or percent may be set!"
2935
+ raise ValueError(msg)
2936
+
2937
+ return self
2938
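+
+         # Sketch of the validation contract above, with hypothetical calls:
+         # CropAndPad(px=10)              -> valid, pads every side by 10 pixels
+         # CropAndPad(percent=-0.1)       -> valid, crops 10% from every side
+         # CropAndPad()                   -> ValueError: neither px nor percent is set
+         # CropAndPad(px=10, percent=0.1) -> ValueError: both are set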
+
+     def __init__(
+         self,
+         px: int | list[int] | None = None,
+         percent: float | list[float] | None = None,
+         keep_size: bool = True,
+         sample_independently: bool = True,
+         interpolation: Literal[
+             cv2.INTER_NEAREST,
+             cv2.INTER_NEAREST_EXACT,
+             cv2.INTER_LINEAR,
+             cv2.INTER_CUBIC,
+             cv2.INTER_AREA,
+             cv2.INTER_LANCZOS4,
+             cv2.INTER_LINEAR_EXACT,
+         ] = cv2.INTER_LINEAR,
+         mask_interpolation: Literal[
+             cv2.INTER_NEAREST,
+             cv2.INTER_NEAREST_EXACT,
+             cv2.INTER_LINEAR,
+             cv2.INTER_CUBIC,
+             cv2.INTER_AREA,
+             cv2.INTER_LANCZOS4,
+             cv2.INTER_LINEAR_EXACT,
+         ] = cv2.INTER_NEAREST,
+         border_mode: Literal[
+             cv2.BORDER_CONSTANT,
+             cv2.BORDER_REPLICATE,
+             cv2.BORDER_REFLECT,
+             cv2.BORDER_WRAP,
+             cv2.BORDER_REFLECT_101,
+         ] = cv2.BORDER_CONSTANT,
+         fill: tuple[float, ...] | float = 0,
+         fill_mask: tuple[float, ...] | float = 0,
+         p: float = 1.0,
+     ):
+         super().__init__(p=p)
+
+         self.px = px
+         self.percent = percent
+
+         self.border_mode = border_mode
+         self.fill = fill
+         self.fill_mask = fill_mask
+
+         self.keep_size = keep_size
+         self.sample_independently = sample_independently
+
+         self.interpolation = interpolation
+         self.mask_interpolation = mask_interpolation
+
+     def apply(
+         self,
+         img: np.ndarray,
+         crop_params: Sequence[int],
+         pad_params: Sequence[int],
+         fill: tuple[float, ...] | float,
+         **params: Any,
+     ) -> np.ndarray:
+         """Apply the crop and pad transform to an image.
+
+         Args:
+             img (np.ndarray): The image to apply the crop and pad transform to.
+             crop_params (Sequence[int]): The parameters for the crop.
+             pad_params (Sequence[int]): The parameters for the pad.
+             fill (tuple[float, ...] | float): The value to fill the image with.
+             params (dict[str, Any]): Additional parameters for the transform.
+
+         Returns:
+             np.ndarray: The image after the crop and pad transform.
+
+         """
+         return fcrops.crop_and_pad(
+             img,
+             crop_params,
+             pad_params,
+             fill,
+             params["shape"][:2],
+             self.interpolation,
+             self.border_mode,
+             self.keep_size,
+         )
+
+     def apply_to_mask(
+         self,
+         mask: np.ndarray,
+         crop_params: Sequence[int],
+         pad_params: Sequence[int],
+         fill_mask: tuple[float, ...] | float,
+         **params: Any,
+     ) -> np.ndarray:
+         """Apply the crop and pad transform to a mask.
+
+         Args:
+             mask (np.ndarray): The mask to apply the crop and pad transform to.
+             crop_params (Sequence[int]): The parameters for the crop.
+             pad_params (Sequence[int]): The parameters for the pad.
+             fill_mask (tuple[float, ...] | float): The value to fill the mask with.
+             params (dict[str, Any]): Additional parameters for the transform.
+
+         Returns:
+             np.ndarray: The mask after the crop and pad transform.
+
+         """
+         return fcrops.crop_and_pad(
+             mask,
+             crop_params,
+             pad_params,
+             fill_mask,
+             params["shape"][:2],
+             self.mask_interpolation,
+             self.border_mode,
+             self.keep_size,
+         )
+
+     def apply_to_bboxes(
+         self,
+         bboxes: np.ndarray,
+         crop_params: tuple[int, int, int, int],
+         pad_params: tuple[int, int, int, int],
+         result_shape: tuple[int, int],
+         **params: Any,
+     ) -> np.ndarray:
+         """Apply the crop and pad transform to bounding boxes.
+
+         Args:
+             bboxes (np.ndarray): The bounding boxes to apply the crop and pad transform to.
+             crop_params (tuple[int, int, int, int]): The parameters for the crop.
+             pad_params (tuple[int, int, int, int]): The parameters for the pad.
+             result_shape (tuple[int, int]): The shape of the result.
+             params (dict[str, Any]): Additional parameters for the transform.
+
+         Returns:
+             np.ndarray: The bounding boxes after the crop and pad transform.
+
+         """
+         return fcrops.crop_and_pad_bboxes(bboxes, crop_params, pad_params, params["shape"][:2], result_shape)
+
+     def apply_to_keypoints(
+         self,
+         keypoints: np.ndarray,
+         crop_params: tuple[int, int, int, int],
+         pad_params: tuple[int, int, int, int],
+         result_shape: tuple[int, int],
+         **params: Any,
+     ) -> np.ndarray:
+         """Apply the crop and pad transform to keypoints.
+
+         Args:
+             keypoints (np.ndarray): The keypoints to apply the crop and pad transform to.
+             crop_params (tuple[int, int, int, int]): The parameters for the crop.
+             pad_params (tuple[int, int, int, int]): The parameters for the pad.
+             result_shape (tuple[int, int]): The shape of the result.
+             params (dict[str, Any]): Additional parameters for the transform.
+
+         Returns:
+             np.ndarray: The keypoints after the crop and pad transform.
+
+         """
+         return fcrops.crop_and_pad_keypoints(
+             keypoints,
+             crop_params,
+             pad_params,
+             params["shape"][:2],
+             result_shape,
+             self.keep_size,
+         )
+
+     @staticmethod
+     def __prevent_zero(val1: int, val2: int, max_val: int) -> tuple[int, int]:
+         # Give back ("regain") enough of the two crop amounts, split as evenly
+         # as possible between both sides, so the axis keeps a positive size.
+         regain = abs(max_val) + 1
+         regain1 = regain // 2
+         regain2 = regain // 2
+         if regain1 + regain2 < regain:
+             regain1 += 1
+
+         if regain1 > val1:
+             diff = regain1 - val1
+             regain1 = val1
+             regain2 += diff
+         elif regain2 > val2:
+             diff = regain2 - val2
+             regain2 = val2
+             regain1 += diff
+
+         return val1 - regain1, val2 - regain2
+
+     @staticmethod
+     def _prevent_zero(crop_params: list[int], height: int, width: int) -> list[int]:
+         # Clamp the per-side crop amounts so the image is never cropped below
+         # a height or width of 1 pixel.
+         top, right, bottom, left = crop_params
+
+         remaining_height = height - (top + bottom)
+         remaining_width = width - (left + right)
+
+         if remaining_height < 1:
+             top, bottom = CropAndPad.__prevent_zero(top, bottom, height)
+         if remaining_width < 1:
+             left, right = CropAndPad.__prevent_zero(left, right, width)
+
+         return [max(top, 0), max(right, 0), max(bottom, 0), max(left, 0)]
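+
+     # A minimal worked example of the helpers above, with hypothetical numbers:
+     # for height=100 and requested crops top=60, bottom=50 the remaining height
+     # would be -10, so __prevent_zero(60, 50, 100) computes regain=101,
+     # regain1=51, regain2=50 and returns top=9, bottom=0, leaving 91 rows --
+     # the image is never cropped away entirely.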
+
+     def get_params_dependent_on_data(self, params: dict[str, Any], data: dict[str, Any]) -> dict[str, Any]:
+         """Get the parameters for the crop.
+
+         Args:
+             params (dict[str, Any]): The parameters for the transform.
+             data (dict[str, Any]): The data for the transform.
+
+         Returns:
+             dict[str, Any]: The parameters for the crop.
+
+         """
+         height, width = params["shape"][:2]
+
+         if self.px is not None:
+             new_params = self._get_px_params()
+         else:
+             percent_params = self._get_percent_params()
+             new_params = [
+                 int(percent_params[0] * height),
+                 int(percent_params[1] * width),
+                 int(percent_params[2] * height),
+                 int(percent_params[3] * width),
+             ]
+
+         pad_params = [max(i, 0) for i in new_params]
+
+         crop_params = self._prevent_zero([-min(i, 0) for i in new_params], height, width)
+
+         top, right, bottom, left = crop_params
+         crop_params = [left, top, width - right, height - bottom]  # (x_min, y_min, x_max, y_max)
+         result_rows = crop_params[3] - crop_params[1]
+         result_cols = crop_params[2] - crop_params[0]
+         if result_cols == width and result_rows == height:
+             crop_params = []
+
+         top, right, bottom, left = pad_params
+         pad_params = [top, bottom, left, right]
+         if any(pad_params):
+             result_rows += top + bottom
+             result_cols += left + right
+         else:
+             pad_params = []
+
+         # pad_params is an empty list (falsy) when no padding is applied,
+         # so there is no need to sample fill values in that case.
+         return {
+             "crop_params": crop_params or None,
+             "pad_params": pad_params or None,
+             "fill": None if not pad_params else self._get_pad_value(self.fill),
+             "fill_mask": None
+             if not pad_params
+             else self._get_pad_value(cast("Union[tuple[float, ...], float]", self.fill_mask)),
+             "result_shape": (result_rows, result_cols),
+         }
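+
+     # A worked example of the bookkeeping above, with hypothetical numbers: on a
+     # 100x100 image with sampled new_params = [-10, 20, 30, -40] (top, right,
+     # bottom, left), pad_params becomes [0, 30, 0, 20] (top, bottom, left, right)
+     # and crop_params becomes [40, 10, 100, 100] (x_min, y_min, x_max, y_max),
+     # so result_shape is (90 + 30, 60 + 20) = (120, 80).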
+
+     def _get_px_params(self) -> list[int]:
+         if self.px is None:
+             msg = "px is not set"
+             raise ValueError(msg)
+
+         if isinstance(self.px, int):
+             return [self.px] * 4
+         if len(self.px) == PAIR:
+             if self.sample_independently:
+                 return [self.py_random.randrange(*self.px) for _ in range(4)]
+             px = self.py_random.randrange(*self.px)
+             return [px] * 4
+         if isinstance(self.px[0], int):
+             return self.px
+         if len(self.px[0]) == PAIR:
+             return [self.py_random.randrange(*i) for i in self.px]
+
+         return [self.py_random.choice(i) for i in self.px]
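+
+     # Sampling behaviour of _get_px_params, with hypothetical px values:
+     # px=10                     -> [10, 10, 10, 10]
+     # px=(-10, 10)              -> four independent draws from [-10, 10)
+     #                              (one shared draw if sample_independently=False)
+     # px=(1, 2, 3, 4)           -> used as-is for (top, right, bottom, left)
+     # px=((0, 5), (0, 5), (0, 5), (0, 5)) -> one draw per side from each range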
+
+     def _get_percent_params(self) -> list[float]:
+         if self.percent is None:
+             msg = "percent is not set"
+             raise ValueError(msg)
+
+         if isinstance(self.percent, float):
+             params = [self.percent] * 4
+         elif len(self.percent) == PAIR:
+             if self.sample_independently:
+                 params = [self.py_random.uniform(*self.percent) for _ in range(4)]
+             else:
+                 pct = self.py_random.uniform(*self.percent)
+                 params = [pct] * 4
+         elif isinstance(self.percent[0], (int, float)):
+             params = self.percent
+         elif len(self.percent[0]) == PAIR:
+             params = [self.py_random.uniform(*i) for i in self.percent]
+         else:
+             params = [self.py_random.choice(i) for i in self.percent]
+
+         return params  # params = [top, right, bottom, left]
+
+     def _get_pad_value(
+         self,
+         fill: Sequence[float] | float,
+     ) -> int | float:
+         if isinstance(fill, (list, tuple)):
+             if len(fill) == PAIR:
+                 a, b = fill
+                 if isinstance(a, int) and isinstance(b, int):
+                     return self.py_random.randint(a, b)
+                 return self.py_random.uniform(a, b)
+             return self.py_random.choice(fill)
+
+         if isinstance(fill, (int, float)):
+             return fill
+
+         msg = "fill should be a number, or a list or tuple of numbers."
+         raise ValueError(msg)
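+
+     # Sampling behaviour of _get_pad_value, with hypothetical fill values:
+     # fill=128            -> always 128
+     # fill=(0, 255)       -> random int in [0, 255] (two int endpoints = range)
+     # fill=(0.0, 1.0)     -> random float in [0.0, 1.0]
+     # fill=[0, 128, 255]  -> one of the listed values, chosen at random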
+
+
+ class RandomCropFromBorders(BaseCrop):
+     """Randomly crops the input from its borders without resizing.
+
+     This transform randomly crops parts of the input (image, mask, bounding boxes, or keypoints)
+     from each of its borders. The amount of cropping is specified as a fraction of the input's
+     dimensions for each side independently.
+
+     Args:
+         crop_left (float): The maximum fraction of width to crop from the left side.
+             Must be in the range [0.0, 1.0]. Default: 0.1
+         crop_right (float): The maximum fraction of width to crop from the right side.
+             Must be in the range [0.0, 1.0]. Default: 0.1
+         crop_top (float): The maximum fraction of height to crop from the top.
+             Must be in the range [0.0, 1.0]. Default: 0.1
+         crop_bottom (float): The maximum fraction of height to crop from the bottom.
+             Must be in the range [0.0, 1.0]. Default: 0.1
+         p (float): Probability of applying the transform. Default: 1.0
+
+     Targets:
+         image, mask, bboxes, keypoints, volume, mask3d
+
+     Image types:
+         uint8, float32
+
+     Note:
+         - The actual amount of cropping for each side is randomly chosen between 0 and
+           the specified maximum for each application of the transform.
+         - The sum of crop_left and crop_right must not exceed 1.0, and the sum of
+           crop_top and crop_bottom must not exceed 1.0. Otherwise, a ValueError will be raised.
+         - This transform does not resize the input after cropping, so the output dimensions
+           will be smaller than the input dimensions.
+         - Bounding boxes that end up fully outside the cropped area will be removed.
+         - Keypoints that end up outside the cropped area will be removed.
+
+     Examples:
+         >>> import numpy as np
+         >>> import albumentations as A
+         >>>
+         >>> # Prepare sample data
+         >>> image = np.random.randint(0, 256, (100, 100, 3), dtype=np.uint8)
+         >>> mask = np.random.randint(0, 2, (100, 100), dtype=np.uint8)
+         >>> bboxes = np.array([[10, 10, 50, 50], [40, 40, 80, 80]], dtype=np.float32)
+         >>> bbox_labels = [1, 2]
+         >>> keypoints = np.array([[20, 30], [60, 70]], dtype=np.float32)
+         >>> keypoint_labels = [0, 1]
+         >>>
+         >>> # Define transform with crop fractions for each border
+         >>> transform = A.Compose([
+         ...     A.RandomCropFromBorders(
+         ...         crop_left=0.1,  # Max 10% crop from left
+         ...         crop_right=0.2,  # Max 20% crop from right
+         ...         crop_top=0.15,  # Max 15% crop from top
+         ...         crop_bottom=0.05,  # Max 5% crop from bottom
+         ...         p=1.0
+         ...     ),
+         ... ], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bbox_labels']),
+         ...    keypoint_params=A.KeypointParams(format='xy', label_fields=['keypoint_labels']))
+         >>>
+         >>> # Apply transform
+         >>> result = transform(
+         ...     image=image,
+         ...     mask=mask,
+         ...     bboxes=bboxes,
+         ...     bbox_labels=bbox_labels,
+         ...     keypoints=keypoints,
+         ...     keypoint_labels=keypoint_labels
+         ... )
+         >>>
+         >>> # Access transformed data
+         >>> transformed_image = result['image']  # Reduced size image with borders cropped
+         >>> transformed_mask = result['mask']  # Reduced size mask with borders cropped
+         >>> transformed_bboxes = result['bboxes']  # Bounding boxes adjusted to new dimensions
+         >>> transformed_bbox_labels = result['bbox_labels']  # Bounding box labels after crop
+         >>> transformed_keypoints = result['keypoints']  # Keypoints adjusted to new dimensions
+         >>> transformed_keypoint_labels = result['keypoint_labels']  # Keypoint labels after crop
+         >>>
+         >>> # The resulting output shapes will be smaller, with dimensions reduced by
+         >>> # the random crop amounts from each side (within the specified maximums)
+         >>> print("Original image shape: (100, 100, 3)")
+         >>> print(f"Transformed image shape: {transformed_image.shape}")  # e.g., (85, 75, 3)
+
+     """
+
+     _targets = ALL_TARGETS
+
+     class InitSchema(BaseTransformInitSchema):
+         crop_left: float = Field(ge=0.0, le=1.0)
+         crop_right: float = Field(ge=0.0, le=1.0)
+         crop_top: float = Field(ge=0.0, le=1.0)
+         crop_bottom: float = Field(ge=0.0, le=1.0)
+
+         @model_validator(mode="after")
+         def _validate_crop_values(self) -> Self:
+             if self.crop_left + self.crop_right > 1.0:
+                 msg = "The sum of crop_left and crop_right must be <= 1."
+                 raise ValueError(msg)
+             if self.crop_top + self.crop_bottom > 1.0:
+                 msg = "The sum of crop_top and crop_bottom must be <= 1."
+                 raise ValueError(msg)
+             return self
+
+     def __init__(
+         self,
+         crop_left: float = 0.1,
+         crop_right: float = 0.1,
+         crop_top: float = 0.1,
+         crop_bottom: float = 0.1,
+         p: float = 1.0,
+     ):
+         super().__init__(p=p)
+         self.crop_left = crop_left
+         self.crop_right = crop_right
+         self.crop_top = crop_top
+         self.crop_bottom = crop_bottom
+
+     def get_params_dependent_on_data(
+         self,
+         params: dict[str, Any],
+         data: dict[str, Any],
+     ) -> dict[str, tuple[int, int, int, int]]:
+         """Get the parameters for the crop.
+
+         Args:
+             params (dict[str, Any]): The parameters for the transform.
+             data (dict[str, Any]): The data for the transform.
+
+         Returns:
+             dict[str, tuple[int, int, int, int]]: The parameters for the crop.
+
+         """
+         height, width = params["shape"][:2]
+
+         x_min = self.py_random.randint(0, int(self.crop_left * width))
+         x_max = self.py_random.randint(max(x_min + 1, int((1 - self.crop_right) * width)), width)
+
+         y_min = self.py_random.randint(0, int(self.crop_top * height))
+         y_max = self.py_random.randint(max(y_min + 1, int((1 - self.crop_bottom) * height)), height)
+
+         crop_coords = x_min, y_min, x_max, y_max
+
+         return {"crop_coords": crop_coords}
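+
+     # A worked example of the sampling above, with hypothetical numbers: for a
+     # 100x100 image with crop_left=0.1, crop_right=0.2, crop_top=0.15,
+     # crop_bottom=0.05, x_min is drawn from [0, 10], x_max from [80, 100],
+     # y_min from [0, 15] and y_max from [95, 100], so the crop is always at
+     # least 70 pixels wide and 80 pixels tall.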
+
+
+ class AtLeastOneBBoxRandomCrop(BaseCrop):
+     """Crop an area from the image while ensuring at least one bounding box is present in the crop.
+
+     Similar to BBoxSafeRandomCrop, but with a key difference:
+     - BBoxSafeRandomCrop ensures ALL bounding boxes are preserved in the crop
+     - AtLeastOneBBoxRandomCrop ensures AT LEAST ONE bounding box is present in the crop
+
+     This makes AtLeastOneBBoxRandomCrop more flexible for scenarios where:
+     - You want to focus on individual objects rather than all objects
+     - You're willing to lose some bounding boxes to get more varied crops
+     - The image has many bounding boxes and keeping all of them would be too restrictive
+
+     The algorithm:
+     1. If bounding boxes exist:
+         - Randomly selects a reference bounding box from available boxes
+         - Computes an eroded version of this box (shrunk by erosion_factor)
+         - Calculates valid crop bounds that ensure overlap with the eroded box
+         - Randomly samples crop coordinates within these bounds
+     2. If no bounding boxes exist:
+         - Uses full image dimensions as valid bounds
+         - Randomly samples crop coordinates within these bounds
+
+     Args:
+         height (int): Fixed height of the crop
+         width (int): Fixed width of the crop
+         erosion_factor (float, optional): Factor by which to erode (shrink) the reference
+             bounding box when computing valid crop regions. Must be in range [0.0, 1.0].
+             - 0.0 means no erosion (crop must fully contain the reference box)
+             - 1.0 means maximum erosion (crop can be anywhere that intersects the reference box)
+             Defaults to 0.0.
+         p (float, optional): Probability of applying the transform. Defaults to 1.0.
+
+     Targets:
+         image, mask, bboxes, keypoints, volume, mask3d
+
+     Image types:
+         uint8, float32
+
+     Raises:
+         CropSizeError: If requested crop size exceeds image dimensions
+
+     Examples:
+         >>> import numpy as np
+         >>> import albumentations as A
+         >>> import cv2
+         >>>
+         >>> # Prepare sample data
+         >>> image = np.random.randint(0, 256, (300, 300, 3), dtype=np.uint8)
+         >>> mask = np.random.randint(0, 2, (300, 300), dtype=np.uint8)
+         >>> # Create multiple bounding boxes - the transform will ensure at least one is in the crop
+         >>> bboxes = np.array([
+         ...     [30, 50, 100, 140],    # first box
+         ...     [150, 120, 270, 250],  # second box
+         ...     [200, 30, 280, 90]     # third box
+         ... ], dtype=np.float32)
+         >>> bbox_labels = [1, 2, 3]
+         >>> keypoints = np.array([
+         ...     [50, 70],    # keypoint inside first box
+         ...     [190, 170],  # keypoint inside second box
+         ...     [240, 60]    # keypoint inside third box
+         ... ], dtype=np.float32)
+         >>> keypoint_labels = [0, 1, 2]
+         >>>
+         >>> # Define a transform with a moderate erosion_factor
+         >>> transform = A.Compose([
+         ...     A.AtLeastOneBBoxRandomCrop(
+         ...         height=200,
+         ...         width=200,
+         ...         erosion_factor=0.2,  # Allows moderate flexibility in crop placement
+         ...         p=1.0
+         ...     ),
+         ... ], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bbox_labels']),
+         ...    keypoint_params=A.KeypointParams(format='xy', label_fields=['keypoint_labels']))
+         >>>
+         >>> # Apply the transform
+         >>> transformed = transform(
+         ...     image=image,
+         ...     mask=mask,
+         ...     bboxes=bboxes,
+         ...     bbox_labels=bbox_labels,
+         ...     keypoints=keypoints,
+         ...     keypoint_labels=keypoint_labels
+         ... )
+         >>>
+         >>> # Get the transformed data
+         >>> transformed_image = transformed['image']  # Shape: (200, 200, 3)
+         >>> transformed_mask = transformed['mask']  # Shape: (200, 200)
+         >>> transformed_bboxes = transformed['bboxes']  # At least one bbox is guaranteed
+         >>> transformed_bbox_labels = transformed['bbox_labels']  # Labels for the preserved bboxes
+         >>> transformed_keypoints = transformed['keypoints']  # Only keypoints in crop are kept
+         >>> transformed_keypoint_labels = transformed['keypoint_labels']  # Their labels
+         >>>
+         >>> # Verify that at least one bounding box was preserved
+         >>> assert len(transformed_bboxes) > 0, "Should have at least one bbox in the crop"
+         >>>
+         >>> # With erosion_factor=0.0, the crop must fully contain the selected reference bbox
+         >>> conservative_transform = A.Compose([
+         ...     A.AtLeastOneBBoxRandomCrop(
+         ...         height=200,
+         ...         width=200,
+         ...         erosion_factor=0.0,  # No erosion - crop must fully contain a bbox
+         ...         p=1.0
+         ...     ),
+         ... ], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bbox_labels']))
+         >>>
+         >>> # With erosion_factor=1.0, the crop only needs to intersect the selected reference bbox
+         >>> flexible_transform = A.Compose([
+         ...     A.AtLeastOneBBoxRandomCrop(
+         ...         height=200,
+         ...         width=200,
+         ...         erosion_factor=1.0,  # Maximum erosion - crop only needs to intersect a bbox
+         ...         p=1.0
+         ...     ),
+         ... ], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bbox_labels']))
+
+     Note:
+         - Uses fixed crop dimensions (height and width)
+         - Bounding boxes that end up partially outside the crop will be adjusted
+         - Bounding boxes that end up completely outside the crop will be removed
+         - If no bounding boxes are provided, acts as a regular random crop
+
+     """
+
+     _targets = ALL_TARGETS
+
+     class InitSchema(BaseCrop.InitSchema):
+         height: Annotated[int, Field(ge=1)]
+         width: Annotated[int, Field(ge=1)]
+         erosion_factor: Annotated[float, Field(ge=0.0, le=1.0)]
+
+     def __init__(
+         self,
+         height: int,
+         width: int,
+         erosion_factor: float = 0.0,
+         p: float = 1.0,
+     ):
+         super().__init__(p=p)
+         self.height = height
+         self.width = width
+         self.erosion_factor = erosion_factor
+
+     def get_params_dependent_on_data(
+         self,
+         params: dict[str, Any],
+         data: dict[str, Any],
+     ) -> dict[str, tuple[int, int, int, int]]:
+         """Get the parameters for the crop.
+
+         Args:
+             params (dict[str, Any]): The parameters for the transform.
+             data (dict[str, Any]): The data for the transform.
+
+         Returns:
+             dict[str, tuple[int, int, int, int]]: The parameters for the crop.
+
+         """
+         image_height, image_width = params["shape"][:2]
+         bboxes = data.get("bboxes", [])
+
+         if self.height > image_height or self.width > image_width:
+             raise CropSizeError(
+                 "Crop size (height, width) exceeds image dimensions (height, width):"
+                 f" {(self.height, self.width)} vs {image_height, image_width}",
+             )
+
+         if len(bboxes) > 0:
+             bboxes = denormalize_bboxes(bboxes, shape=(image_height, image_width))
+
+             # Pick a bbox amongst all possible as our reference bbox.
+             reference_bbox = self.py_random.choice(bboxes)
+
+             bbox_x1, bbox_y1, bbox_x2, bbox_y2 = reference_bbox[:4]
+
+             # Compute valid crop bounds:
+             # erosion_factor = 0.0: crop must fully contain the bbox
+             # erosion_factor = 1.0: crop can be anywhere that intersects the bbox
+             if self.erosion_factor < 1.0:
+                 # Regular case: compute eroded box dimensions
+                 bbox_width = bbox_x2 - bbox_x1
+                 bbox_height = bbox_y2 - bbox_y1
+                 eroded_width = bbox_width * (1.0 - self.erosion_factor)
+                 eroded_height = bbox_height * (1.0 - self.erosion_factor)
+
+                 min_crop_x = np.clip(
+                     a=bbox_x1 + eroded_width - self.width,
+                     a_min=0.0,
+                     a_max=image_width - self.width,
+                 )
+                 max_crop_x = np.clip(
+                     a=bbox_x2 - eroded_width,
+                     a_min=0.0,
+                     a_max=image_width - self.width,
+                 )
+
+                 min_crop_y = np.clip(
+                     a=bbox_y1 + eroded_height - self.height,
+                     a_min=0.0,
+                     a_max=image_height - self.height,
+                 )
+                 max_crop_y = np.clip(
+                     a=bbox_y2 - eroded_height,
+                     a_min=0.0,
+                     a_max=image_height - self.height,
+                 )
+             else:
+                 # Maximum erosion case: crop can be anywhere that intersects the bbox
+                 min_crop_x = np.clip(
+                     a=bbox_x1 - self.width,  # leftmost position that still intersects
+                     a_min=0.0,
+                     a_max=image_width - self.width,
+                 )
+                 max_crop_x = np.clip(
+                     a=bbox_x2,  # rightmost position that still intersects
+                     a_min=0.0,
+                     a_max=image_width - self.width,
+                 )
+
+                 min_crop_y = np.clip(
+                     a=bbox_y1 - self.height,  # topmost position that still intersects
+                     a_min=0.0,
+                     a_max=image_height - self.height,
+                 )
+                 max_crop_y = np.clip(
+                     a=bbox_y2,  # bottommost position that still intersects
+                     a_min=0.0,
+                     a_max=image_height - self.height,
+                 )
+         else:
+             # If there are no bboxes, just crop anywhere in the image.
+             min_crop_x = 0.0
+             max_crop_x = image_width - self.width
+
+             min_crop_y = 0.0
+             max_crop_y = image_height - self.height
+
+         # Randomly draw the upper-left corner of the crop.
+         crop_x1 = int(self.py_random.uniform(a=min_crop_x, b=max_crop_x))
+         crop_y1 = int(self.py_random.uniform(a=min_crop_y, b=max_crop_y))
+
+         crop_x2 = crop_x1 + self.width
+         crop_y2 = crop_y1 + self.height
+
+         return {"crop_coords": (crop_x1, crop_y1, crop_x2, crop_y2)}