nrtk-albumentations 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nrtk-albumentations might be problematic.
Files changed (62)
  1. albumentations/__init__.py +21 -0
  2. albumentations/augmentations/__init__.py +23 -0
  3. albumentations/augmentations/blur/__init__.py +0 -0
  4. albumentations/augmentations/blur/functional.py +438 -0
  5. albumentations/augmentations/blur/transforms.py +1633 -0
  6. albumentations/augmentations/crops/__init__.py +0 -0
  7. albumentations/augmentations/crops/functional.py +494 -0
  8. albumentations/augmentations/crops/transforms.py +3647 -0
  9. albumentations/augmentations/dropout/__init__.py +0 -0
  10. albumentations/augmentations/dropout/channel_dropout.py +134 -0
  11. albumentations/augmentations/dropout/coarse_dropout.py +567 -0
  12. albumentations/augmentations/dropout/functional.py +1017 -0
  13. albumentations/augmentations/dropout/grid_dropout.py +166 -0
  14. albumentations/augmentations/dropout/mask_dropout.py +274 -0
  15. albumentations/augmentations/dropout/transforms.py +461 -0
  16. albumentations/augmentations/dropout/xy_masking.py +186 -0
  17. albumentations/augmentations/geometric/__init__.py +0 -0
  18. albumentations/augmentations/geometric/distortion.py +1238 -0
  19. albumentations/augmentations/geometric/flip.py +752 -0
  20. albumentations/augmentations/geometric/functional.py +4151 -0
  21. albumentations/augmentations/geometric/pad.py +676 -0
  22. albumentations/augmentations/geometric/resize.py +956 -0
  23. albumentations/augmentations/geometric/rotate.py +864 -0
  24. albumentations/augmentations/geometric/transforms.py +1962 -0
  25. albumentations/augmentations/mixing/__init__.py +0 -0
  26. albumentations/augmentations/mixing/domain_adaptation.py +787 -0
  27. albumentations/augmentations/mixing/domain_adaptation_functional.py +453 -0
  28. albumentations/augmentations/mixing/functional.py +878 -0
  29. albumentations/augmentations/mixing/transforms.py +832 -0
  30. albumentations/augmentations/other/__init__.py +0 -0
  31. albumentations/augmentations/other/lambda_transform.py +180 -0
  32. albumentations/augmentations/other/type_transform.py +261 -0
  33. albumentations/augmentations/pixel/__init__.py +0 -0
  34. albumentations/augmentations/pixel/functional.py +4226 -0
  35. albumentations/augmentations/pixel/transforms.py +7556 -0
  36. albumentations/augmentations/spectrogram/__init__.py +0 -0
  37. albumentations/augmentations/spectrogram/transform.py +220 -0
  38. albumentations/augmentations/text/__init__.py +0 -0
  39. albumentations/augmentations/text/functional.py +272 -0
  40. albumentations/augmentations/text/transforms.py +299 -0
  41. albumentations/augmentations/transforms3d/__init__.py +0 -0
  42. albumentations/augmentations/transforms3d/functional.py +393 -0
  43. albumentations/augmentations/transforms3d/transforms.py +1422 -0
  44. albumentations/augmentations/utils.py +249 -0
  45. albumentations/core/__init__.py +0 -0
  46. albumentations/core/bbox_utils.py +920 -0
  47. albumentations/core/composition.py +1885 -0
  48. albumentations/core/hub_mixin.py +299 -0
  49. albumentations/core/keypoints_utils.py +521 -0
  50. albumentations/core/label_manager.py +339 -0
  51. albumentations/core/pydantic.py +239 -0
  52. albumentations/core/serialization.py +352 -0
  53. albumentations/core/transforms_interface.py +976 -0
  54. albumentations/core/type_definitions.py +127 -0
  55. albumentations/core/utils.py +605 -0
  56. albumentations/core/validation.py +129 -0
  57. albumentations/pytorch/__init__.py +1 -0
  58. albumentations/pytorch/transforms.py +189 -0
  59. nrtk_albumentations-2.1.0.dist-info/METADATA +196 -0
  60. nrtk_albumentations-2.1.0.dist-info/RECORD +62 -0
  61. nrtk_albumentations-2.1.0.dist-info/WHEEL +4 -0
  62. nrtk_albumentations-2.1.0.dist-info/licenses/LICENSE +21 -0
albumentations/augmentations/spectrogram/transform.py (new file)
@@ -0,0 +1,220 @@
+"""Transforms for spectrogram augmentation.
+
+This module provides transforms specifically designed for augmenting spectrograms
+in audio processing tasks. Includes time reversal, time masking, and frequency
+masking transforms commonly used in audio machine learning applications.
+"""
+
+from __future__ import annotations
+
+from warnings import warn
+
+from pydantic import Field
+
+from albumentations.augmentations.dropout.xy_masking import XYMasking
+from albumentations.augmentations.geometric.flip import HorizontalFlip
+from albumentations.core.transforms_interface import BaseTransformInitSchema
+from albumentations.core.type_definitions import ALL_TARGETS
+
+__all__ = [
+    "FrequencyMasking",
+    "TimeMasking",
+    "TimeReverse",
+]
+
+
+class TimeReverse(HorizontalFlip):
+    """Reverse the time axis of a spectrogram image, also known as time inversion.
+
+    Time inversion of a spectrogram is analogous to the random flip of an image,
+    an augmentation technique widely used in the visual domain. This can be relevant
+    in the context of audio classification tasks when working with spectrograms.
+    The technique was successfully applied in the AudioCLIP paper, which extended
+    CLIP to handle image, text, and audio inputs.
+
+    This transform is implemented as a subclass of HorizontalFlip since reversing
+    time in a spectrogram is equivalent to flipping the image horizontally.
+
+    Args:
+        p (float): probability of applying the transform. Default: 0.5.
+
+    Targets:
+        image, mask, bboxes, keypoints, volume, mask3d
+
+    Image types:
+        uint8, float32
+
+    Number of channels:
+        Any
+
+    Note:
+        This transform is functionally identical to HorizontalFlip but provides
+        a more semantically meaningful name when working with spectrograms and
+        other time-series visualizations.
+
+    References:
+        - AudioCLIP paper: https://arxiv.org/abs/2106.13043
+        - Audiomentations: https://iver56.github.io/audiomentations/waveform_transforms/reverse/
+
+    """
+
+    _targets = ALL_TARGETS
+
+    class InitSchema(BaseTransformInitSchema):
+        pass
+
+    def __init__(
+        self,
+        p: float = 0.5,
+    ):
+        warn(
+            "TimeReverse is an alias for HorizontalFlip transform. "
+            "Consider using HorizontalFlip directly from albumentations.HorizontalFlip. ",
+            UserWarning,
+            stacklevel=2,
+        )
+        super().__init__(p=p)
+
+
+class TimeMasking(XYMasking):
+    """Apply masking to a spectrogram in the time domain.
+
+    This transform masks random segments along the time axis of a spectrogram,
+    implementing the time masking technique proposed in the SpecAugment paper.
+    Time masking helps in training models to be robust against temporal variations
+    and missing information in audio signals.
+
+    This is a specialized version of XYMasking configured for time masking only.
+    For more advanced use cases (e.g., multiple masks, frequency masking, or custom
+    fill values), consider using XYMasking directly.
+
+    Args:
+        time_mask_param (int): Maximum possible length of the mask in the time domain.
+            Must be a positive integer. Length of the mask is uniformly sampled from (0, time_mask_param).
+        p (float): probability of applying the transform. Default: 0.5.
+
+    Targets:
+        image, mask, bboxes, keypoints, volume, mask3d
+
+    Image types:
+        uint8, float32
+
+    Number of channels:
+        Any
+
+    Note:
+        This transform is implemented as a subset of XYMasking with fixed parameters:
+        - Single horizontal mask (num_masks_x=1)
+        - No vertical masks (num_masks_y=0)
+        - Zero fill value
+        - Random mask length up to time_mask_param
+
+        For more flexibility, including:
+        - Multiple masks
+        - Custom fill values
+        - Frequency masking
+        - Combined time-frequency masking
+        Consider using albumentations.XYMasking directly.
+
+    References:
+        - SpecAugment paper: https://arxiv.org/abs/1904.08779
+        - Original implementation: https://pytorch.org/audio/stable/transforms.html#timemask
+
+    """
+
+    class InitSchema(BaseTransformInitSchema):
+        time_mask_param: int = Field(gt=0)
+
+    def __init__(
+        self,
+        time_mask_param: int = 40,
+        p: float = 0.5,
+    ):
+        warn(
+            "TimeMasking is a specialized version of XYMasking. "
+            "For more flexibility (multiple masks, custom fill values, frequency masking), "
+            "consider using XYMasking directly from albumentations.XYMasking.",
+            UserWarning,
+            stacklevel=2,
+        )
+        super().__init__(
+            num_masks_x=1,
+            num_masks_y=0,
+            mask_x_length=(0, time_mask_param),
+            fill=0,
+            fill_mask=0,
+            p=p,
+        )
+        self.time_mask_param = time_mask_param
+
+
+class FrequencyMasking(XYMasking):
+    """Apply masking to a spectrogram in the frequency domain.
+
+    This transform masks random segments along the frequency axis of a spectrogram,
+    implementing the frequency masking technique proposed in the SpecAugment paper.
+    Frequency masking helps in training models to be robust against frequency variations
+    and missing spectral information in audio signals.
+
+    This is a specialized version of XYMasking configured for frequency masking only.
+    For more advanced use cases (e.g., multiple masks, time masking, or custom
+    fill values), consider using XYMasking directly.
+
+    Args:
+        freq_mask_param (int): Maximum possible length of the mask in the frequency domain.
+            Must be a positive integer. Length of the mask is uniformly sampled from (0, freq_mask_param).
+        p (float): probability of applying the transform. Default: 0.5.
+
+    Targets:
+        image, mask, bboxes, keypoints, volume, mask3d
+
+    Image types:
+        uint8, float32
+
+    Number of channels:
+        Any
+
+    Note:
+        This transform is implemented as a subset of XYMasking with fixed parameters:
+        - Single vertical mask (num_masks_y=1)
+        - No horizontal masks (num_masks_x=0)
+        - Zero fill value
+        - Random mask length up to freq_mask_param
+
+        For more flexibility, including:
+        - Multiple masks
+        - Custom fill values
+        - Time masking
+        - Combined time-frequency masking
+        Consider using albumentations.XYMasking directly.
+
+    References:
+        - SpecAugment paper: https://arxiv.org/abs/1904.08779
+        - Original implementation: https://pytorch.org/audio/stable/transforms.html#freqmask
+
+    """
+
+    class InitSchema(BaseTransformInitSchema):
+        freq_mask_param: int = Field(gt=0)
+
+    def __init__(
+        self,
+        freq_mask_param: int = 30,
+        p: float = 0.5,
+    ):
+        warn(
+            "FrequencyMasking is a specialized version of XYMasking. "
+            "For more flexibility (multiple masks, custom fill values, time masking), "
+            "consider using XYMasking directly from albumentations.XYMasking.",
+            UserWarning,
+            stacklevel=2,
+        )
+        super().__init__(
+            p=p,
+            fill=0,
+            fill_mask=0,
+            mask_y_length=(0, freq_mask_param),
+            num_masks_x=0,
+            num_masks_y=1,
+        )
+        self.freq_mask_param = freq_mask_param
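
The three transforms above are thin wrappers: TimeReverse over HorizontalFlip, and TimeMasking/FrequencyMasking over XYMasking with fixed parameters. The following is a minimal usage sketch, not part of the package diff; it assumes the wheel is imported under its albumentations namespace (as the file list indicates), and the spectrogram shape is purely illustrative.

import numpy as np

import albumentations as A

# Illustrative mel-spectrogram: (freq_bins, time_steps), single channel, float32.
spectrogram = np.random.rand(128, 512).astype(np.float32)

# SpecAugment-style pipeline built from the transforms defined in this file.
augment = A.Compose(
    [
        A.TimeReverse(p=0.5),  # flips the time axis (equivalent to HorizontalFlip)
        A.TimeMasking(time_mask_param=40, p=0.5),  # masks up to 40 time steps
        A.FrequencyMasking(freq_mask_param=30, p=0.5),  # masks up to 30 frequency bins
    ],
)
augmented = augment(image=spectrogram)["image"]

# Per the docstring, TimeMasking(time_mask_param=40) is a fixed configuration of XYMasking:
equivalent = A.XYMasking(
    num_masks_x=1,
    num_masks_y=0,
    mask_x_length=(0, 40),
    fill=0,
    fill_mask=0,
    p=0.5,
)

Each wrapper also emits a UserWarning at construction time pointing to the more general transform, as shown in the __init__ bodies above.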
albumentations/augmentations/text/functional.py (new file)
@@ -0,0 +1,272 @@
+"""Functional implementations for text manipulation and rendering.
+
+This module provides utility functions for manipulating text in strings and
+rendering text onto images. Includes functions for word manipulation, text drawing,
+and handling text regions in images.
+"""
+
+from __future__ import annotations
+
+import random
+from typing import TYPE_CHECKING, Any
+
+import cv2
+import numpy as np
+from albucore import (
+    MONO_CHANNEL_DIMENSIONS,
+    NUM_MULTI_CHANNEL_DIMENSIONS,
+    NUM_RGB_CHANNELS,
+    preserve_channel_dim,
+    uint8_io,
+)
+
+from albumentations.core.type_definitions import PAIR
+
+# Importing wordnet and other dependencies only for type checking
+if TYPE_CHECKING:
+    from PIL import Image
+
+
+def delete_random_words(words: list[str], num_words: int, py_random: random.Random) -> str:
+    """Delete a specified number of random words from a list.
+
+    This function randomly removes words from the input list and joins the remaining
+    words with spaces to form a new string.
+
+    Args:
+        words (list[str]): List of words to process.
+        num_words (int): Number of words to delete.
+        py_random (random.Random): Random number generator for reproducibility.
+
+    Returns:
+        str: New string with specified words removed. Returns empty string if
+            num_words is greater than or equal to the length of words.
+
+    """
+    if num_words >= len(words):
+        return ""
+
+    indices_to_delete = py_random.sample(range(len(words)), num_words)
+    new_words = [word for idx, word in enumerate(words) if idx not in indices_to_delete]
+    return " ".join(new_words)
+
+
+def swap_random_words(words: list[str], num_words: int, py_random: random.Random) -> str:
+    """Swap random pairs of words in a list of words.
+
+    This function randomly selects pairs of words and swaps their positions
+    a specified number of times.
+
+    Args:
+        words (list[str]): List of words to process.
+        num_words (int): Number of swaps to perform.
+        py_random (random.Random): Random number generator for reproducibility.
+
+    Returns:
+        str: New string with words swapped. If num_words is 0 or the list has fewer
+            than 2 words, returns the original string.
+
+    """
+    if num_words == 0 or len(words) < PAIR:
+        return " ".join(words)
+
+    words = words.copy()
+
+    for _ in range(num_words):
+        idx1, idx2 = py_random.sample(range(len(words)), 2)
+        words[idx1], words[idx2] = words[idx2], words[idx1]
+    return " ".join(words)
+
+
+def insert_random_stopwords(
+    words: list[str],
+    num_insertions: int,
+    stopwords: tuple[str, ...] | None,
+    py_random: random.Random,
+) -> str:
+    """Insert random stopwords into a list of words.
+
+    This function randomly inserts stopwords at random positions in the
+    list of words a specified number of times.
+
+    Args:
+        words (list[str]): List of words to process.
+        num_insertions (int): Number of stopwords to insert.
+        stopwords (tuple[str, ...] | None): Tuple of stopwords to choose from.
+            If None, default stopwords will be used.
+        py_random (random.Random): Random number generator for reproducibility.
+
+    Returns:
+        str: New string with stopwords inserted.
+
+    """
+    if stopwords is None:
+        stopwords = ("and", "the", "is", "in", "at", "of")  # Default stopwords if none provided
+
+    for _ in range(num_insertions):
+        idx = py_random.randint(0, len(words))
+        words.insert(idx, py_random.choice(stopwords))
+    return " ".join(words)
+
+
+def convert_image_to_pil(image: np.ndarray) -> Image:
+    """Convert a NumPy array image to a PIL image."""
+    try:
+        from PIL import Image
+    except ImportError:
+        raise ImportError("Pillow is not installed") from ImportError
+
+    if image.ndim == MONO_CHANNEL_DIMENSIONS:  # (height, width)
+        return Image.fromarray(image)
+    if image.ndim == NUM_MULTI_CHANNEL_DIMENSIONS and image.shape[2] == 1:  # (height, width, 1)
+        return Image.fromarray(image[:, :, 0], mode="L")
+    if image.ndim == NUM_MULTI_CHANNEL_DIMENSIONS and image.shape[2] == NUM_RGB_CHANNELS:  # (height, width, 3)
+        return Image.fromarray(image)
+
+    raise TypeError(f"Unsupported image shape: {image.shape}")
+
+
+def draw_text_on_pil_image(pil_image: Image, metadata_list: list[dict[str, Any]]) -> Image:
+    """Draw text on a PIL image."""
+    try:
+        from PIL import ImageDraw
+    except ImportError:
+        raise ImportError("Pillow is not installed") from ImportError
+
+    draw = ImageDraw.Draw(pil_image)
+    for metadata in metadata_list:
+        bbox_coords = metadata["bbox_coords"]
+        text = metadata["text"]
+        font = metadata["font"]
+        font_color = metadata["font_color"]
+
+        # Adapt font_color based on image mode
+        if pil_image.mode == "L":  # Grayscale
+            # For grayscale images, use only the first value or average the RGB values
+            if isinstance(font_color, tuple):
+                if len(font_color) >= 3:
+                    # Average RGB values for grayscale
+                    font_color = int(sum(font_color[:3]) / 3)
+                elif len(font_color) == 1:
+                    font_color = int(font_color[0])
+        # For RGB and other modes, ensure font_color is a tuple of integers
+        elif isinstance(font_color, tuple):
+            font_color = tuple(int(c) for c in font_color)
+
+        position = bbox_coords[:2]
+        draw.text(position, text, font=font, fill=font_color)
+    return pil_image
+
+
+def draw_text_on_multi_channel_image(image: np.ndarray, metadata_list: list[dict[str, Any]]) -> np.ndarray:
+    """Draw text on a multi-channel image with more than three channels."""
+    try:
+        from PIL import Image, ImageDraw
+    except ImportError:
+        raise ImportError("Pillow is not installed") from ImportError
+
+    channels = [Image.fromarray(image[:, :, i]) for i in range(image.shape[2])]
+    pil_images = [ImageDraw.Draw(channel) for channel in channels]
+
+    for metadata in metadata_list:
+        bbox_coords = metadata["bbox_coords"]
+        text = metadata["text"]
+        font = metadata["font"]
+        font_color = metadata["font_color"]
+
+        # Handle font_color as tuple[float, ...]
+        # Ensure we have enough color values for all channels
+        if len(font_color) < image.shape[2]:
+            # If fewer values than channels, pad with zeros
+            font_color = tuple(list(font_color) + [0] * (image.shape[2] - len(font_color)))
+        elif len(font_color) > image.shape[2]:
+            # If more values than channels, truncate
+            font_color = font_color[: image.shape[2]]
+
+        # Convert to integers for PIL
+        font_color = [int(c) for c in font_color]
+
+        position = bbox_coords[:2]
+
+        # For each channel, use the corresponding color value
+        for channel_id, pil_image in enumerate(pil_images):
+            # For single-channel PIL images, color must be an integer
+            pil_image.text(position, text, font=font, fill=font_color[channel_id])
+
+    return np.stack([np.array(channel) for channel in channels], axis=2)
+
+
+@uint8_io
+@preserve_channel_dim
+def render_text(image: np.ndarray, metadata_list: list[dict[str, Any]], clear_bg: bool) -> np.ndarray:
+    """Render text onto an image based on provided metadata.
+
+    This function draws text on an image using metadata that specifies text content,
+    position, font, and color. It can optionally clear the background before rendering.
+    The function handles different image types (grayscale, RGB, multi-channel).
+
+    Args:
+        image (np.ndarray): Image to draw text on.
+        metadata_list (list[dict[str, Any]]): List of metadata dictionaries containing:
+            - bbox_coords: Bounding box coordinates (x_min, y_min, x_max, y_max)
+            - text: Text string to render
+            - font: PIL ImageFont object
+            - font_color: Color for the text
+        clear_bg (bool): Whether to clear (inpaint) the background under the text.
+
+    Returns:
+        np.ndarray: Image with text rendered on it.
+
+    """
+    # First clean background under boxes using seamless clone if clear_bg is True
+    if clear_bg:
+        image = inpaint_text_background(image, metadata_list)
+
+    if len(image.shape) == MONO_CHANNEL_DIMENSIONS or (
+        len(image.shape) == NUM_MULTI_CHANNEL_DIMENSIONS and image.shape[2] in {1, NUM_RGB_CHANNELS}
+    ):
+        pil_image = convert_image_to_pil(image)
+        pil_image = draw_text_on_pil_image(pil_image, metadata_list)
+        return np.array(pil_image)
+
+    return draw_text_on_multi_channel_image(image, metadata_list)
+
+
+def inpaint_text_background(
+    image: np.ndarray,
+    metadata_list: list[dict[str, Any]],
+    method: int = cv2.INPAINT_TELEA,
+) -> np.ndarray:
+    """Inpaint (clear) regions in an image where text will be rendered.
+
+    This function creates a clean background for text by inpainting rectangular
+    regions specified in the metadata. It removes any existing content in those
+    regions to provide a clean slate for rendering text.
+
+    Args:
+        image (np.ndarray): Image to inpaint.
+        metadata_list (list[dict[str, Any]]): List of metadata dictionaries containing:
+            - bbox_coords: Bounding box coordinates (x_min, y_min, x_max, y_max)
+        method (int, optional): Inpainting method to use. Defaults to cv2.INPAINT_TELEA.
+            Options include:
+            - cv2.INPAINT_TELEA: Fast Marching Method
+            - cv2.INPAINT_NS: Navier-Stokes method
+
+    Returns:
+        np.ndarray: Image with specified regions inpainted.
+
+    """
+    result_image = image.copy()
+    mask = np.zeros((image.shape[0], image.shape[1]), dtype=np.uint8)
+
+    for metadata in metadata_list:
+        x_min, y_min, x_max, y_max = metadata["bbox_coords"]
+
+        # Black out the region
+        result_image[y_min:y_max, x_min:x_max] = 0
+
+        # Update the mask to indicate the region to inpaint
+        mask[y_min:y_max, x_min:x_max] = 255
+
+    # Inpaint the blacked-out regions
+    return cv2.inpaint(result_image, mask, inpaintRadius=3, flags=method)
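
For orientation, a short sketch (not part of the diff) of how the word-level helpers above behave; the sentence and seed are illustrative, and the import path follows the file list entry albumentations/augmentations/text/functional.py.

import random

from albumentations.augmentations.text.functional import (
    delete_random_words,
    insert_random_stopwords,
    swap_random_words,
)

rng = random.Random(0)  # seeded generator, matching the py_random parameter these helpers expect
words = "the quick brown fox jumps over the lazy dog".split()

# Remove two random words and rejoin the remainder with spaces.
print(delete_random_words(words, num_words=2, py_random=rng))

# Swap one random pair of words; this helper copies the list before swapping.
print(swap_random_words(words, num_words=1, py_random=rng))

# Insert two stopwords at random positions; stopwords=None falls back to the default tuple.
# Note that this helper inserts into the list it is given, so pass a copy if that matters.
print(insert_random_stopwords(list(words), num_insertions=2, stopwords=None, py_random=rng))

The image-side helpers (render_text, inpaint_text_background) additionally require Pillow fonts and bbox metadata, and are normally driven by the text transform defined in albumentations/augmentations/text/transforms.py rather than called directly.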