stouputils 1.14.3__py3-none-any.whl → 1.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. stouputils/data_science/config/get.py +51 -51
  2. stouputils/data_science/data_processing/image/__init__.py +66 -66
  3. stouputils/data_science/data_processing/image/auto_contrast.py +79 -79
  4. stouputils/data_science/data_processing/image/axis_flip.py +58 -58
  5. stouputils/data_science/data_processing/image/bias_field_correction.py +74 -74
  6. stouputils/data_science/data_processing/image/binary_threshold.py +73 -73
  7. stouputils/data_science/data_processing/image/blur.py +59 -59
  8. stouputils/data_science/data_processing/image/brightness.py +54 -54
  9. stouputils/data_science/data_processing/image/canny.py +110 -110
  10. stouputils/data_science/data_processing/image/clahe.py +92 -92
  11. stouputils/data_science/data_processing/image/common.py +30 -30
  12. stouputils/data_science/data_processing/image/contrast.py +53 -53
  13. stouputils/data_science/data_processing/image/curvature_flow_filter.py +74 -74
  14. stouputils/data_science/data_processing/image/denoise.py +378 -378
  15. stouputils/data_science/data_processing/image/histogram_equalization.py +123 -123
  16. stouputils/data_science/data_processing/image/invert.py +64 -64
  17. stouputils/data_science/data_processing/image/laplacian.py +60 -60
  18. stouputils/data_science/data_processing/image/median_blur.py +52 -52
  19. stouputils/data_science/data_processing/image/noise.py +59 -59
  20. stouputils/data_science/data_processing/image/normalize.py +65 -65
  21. stouputils/data_science/data_processing/image/random_erase.py +66 -66
  22. stouputils/data_science/data_processing/image/resize.py +69 -69
  23. stouputils/data_science/data_processing/image/rotation.py +80 -80
  24. stouputils/data_science/data_processing/image/salt_pepper.py +68 -68
  25. stouputils/data_science/data_processing/image/sharpening.py +55 -55
  26. stouputils/data_science/data_processing/image/shearing.py +64 -64
  27. stouputils/data_science/data_processing/image/threshold.py +64 -64
  28. stouputils/data_science/data_processing/image/translation.py +71 -71
  29. stouputils/data_science/data_processing/image/zoom.py +83 -83
  30. stouputils/data_science/data_processing/image_augmentation.py +118 -118
  31. stouputils/data_science/data_processing/image_preprocess.py +183 -183
  32. stouputils/data_science/data_processing/prosthesis_detection.py +359 -359
  33. stouputils/data_science/data_processing/technique.py +481 -481
  34. stouputils/data_science/dataset/__init__.py +45 -45
  35. stouputils/data_science/dataset/dataset.py +292 -292
  36. stouputils/data_science/dataset/dataset_loader.py +135 -135
  37. stouputils/data_science/dataset/grouping_strategy.py +296 -296
  38. stouputils/data_science/dataset/image_loader.py +100 -100
  39. stouputils/data_science/dataset/xy_tuple.py +696 -696
  40. stouputils/data_science/metric_dictionnary.py +106 -106
  41. stouputils/data_science/mlflow_utils.py +206 -206
  42. stouputils/data_science/models/abstract_model.py +149 -149
  43. stouputils/data_science/models/all.py +85 -85
  44. stouputils/data_science/models/keras/all.py +38 -38
  45. stouputils/data_science/models/keras/convnext.py +62 -62
  46. stouputils/data_science/models/keras/densenet.py +50 -50
  47. stouputils/data_science/models/keras/efficientnet.py +60 -60
  48. stouputils/data_science/models/keras/mobilenet.py +56 -56
  49. stouputils/data_science/models/keras/resnet.py +52 -52
  50. stouputils/data_science/models/keras/squeezenet.py +233 -233
  51. stouputils/data_science/models/keras/vgg.py +42 -42
  52. stouputils/data_science/models/keras/xception.py +38 -38
  53. stouputils/data_science/models/keras_utils/callbacks/__init__.py +20 -20
  54. stouputils/data_science/models/keras_utils/callbacks/colored_progress_bar.py +219 -219
  55. stouputils/data_science/models/keras_utils/callbacks/learning_rate_finder.py +148 -148
  56. stouputils/data_science/models/keras_utils/callbacks/model_checkpoint_v2.py +31 -31
  57. stouputils/data_science/models/keras_utils/callbacks/progressive_unfreezing.py +249 -249
  58. stouputils/data_science/models/keras_utils/callbacks/warmup_scheduler.py +66 -66
  59. stouputils/data_science/models/keras_utils/losses/__init__.py +12 -12
  60. stouputils/data_science/models/keras_utils/losses/next_generation_loss.py +56 -56
  61. stouputils/data_science/models/keras_utils/visualizations.py +416 -416
  62. stouputils/data_science/models/sandbox.py +116 -116
  63. stouputils/data_science/range_tuple.py +234 -234
  64. stouputils/data_science/utils.py +285 -285
  65. stouputils/decorators.py +53 -39
  66. stouputils/decorators.pyi +2 -2
  67. stouputils/installer/__init__.py +18 -18
  68. stouputils/installer/linux.py +144 -144
  69. stouputils/installer/main.py +223 -223
  70. stouputils/installer/windows.py +136 -136
  71. stouputils/io.py +16 -9
  72. stouputils/print.py +229 -2
  73. stouputils/print.pyi +90 -1
  74. stouputils/py.typed +1 -1
  75. {stouputils-1.14.3.dist-info → stouputils-1.15.0.dist-info}/METADATA +1 -1
  76. {stouputils-1.14.3.dist-info → stouputils-1.15.0.dist-info}/RECORD +78 -78
  77. {stouputils-1.14.3.dist-info → stouputils-1.15.0.dist-info}/WHEEL +1 -1
  78. {stouputils-1.14.3.dist-info → stouputils-1.15.0.dist-info}/entry_points.txt +0 -0
stouputils/data_science/dataset/grouping_strategy.py
@@ -1,296 +1,296 @@
"""
This module contains the GroupingStrategy class, which provides a strategy for grouping images when loading a dataset.

There are two strategies: NONE and CONCATENATE.
Refer to the docstrings of the GroupingStrategy class for more information.
"""
# pyright: reportUnknownMemberType=false

# Imports
from __future__ import annotations

import os
from enum import Enum
from typing import Any

import numpy as np
from numpy.typing import NDArray

from ...decorators import handle_error
from ...io import clean_path
from ...parallel import multiprocessing
from ...print import warning
from ..config.get import DataScienceConfig
from .image_loader import load_images_from_directory
from .xy_tuple import XyTuple


# Grouping strategy class for the dataset
class GroupingStrategy(Enum):
    """ Grouping strategy for the dataset """

    NONE = 0
    """ Default behavior: a subfolder "subject1" is a group of images; all images are grouped together (list of features)
    and the label is the class of the folder above (class1).

    Example file tree:

    - dataset/class1/subject1/image1.png
    - dataset/class1/subject1/image2.png
    - dataset/class1/subject1/image3.png

    Example data (for binary classification):

    - features = [features_image1, features_image2, features_image3] where
      features_image1, features_image2, features_image3 are NDArray[Any] of shape `(224, 224, 3)`
    - labels = [1.0, 0.0]

    If subjects do not have the same number of images,
    the missing images are padded with zeros so that all features have the same shape.

    This strategy preserves the relationship between images of the same subject when splitting the dataset,
    ensuring that all images from the same subject stay together in either the train or the test set.
    """

    CONCATENATE = 1
    """ A subfolder "subject1" is a group of images; all images are concatenated into a single feature (NDArray[Any])
    and the label is the class of the folder above (class1).

    Example file tree:

    - dataset/class1/subject1/image1.png
    - dataset/class1/subject1/image2.png
    - dataset/class1/subject1/image3.png

    Example data (for binary classification):

    - features will have a shape of `(224, 224, 3*num_images)` (if RGB images).
      Notice that the concatenation is done along the last axis.
    - labels = [1.0, 0.0]

    If subjects do not have the same number of images,
    the missing images are padded with zeros so that all features have the same shape.
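
    A minimal sketch of the concatenation, mirroring the steps performed in
    image_dataset_from_directory (shapes are illustrative, assuming three RGB 224x224 images):

    .. code-block:: python

        > import numpy as np
        > images = [np.zeros((224, 224, 3)) for _ in range(3)]
        > stacked = np.transpose(np.array(images), (1, 2, 0, 3))
        > stacked.reshape(stacked.shape[0], stacked.shape[1], -1).shape
        (224, 224, 9)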
    """

    @staticmethod
    def _load_folder(
        folder_path: str,
        class_idx: int,
        num_classes: int,
        kwargs: dict[str, Any]
    ) -> tuple[list[NDArray[Any]], NDArray[Any], tuple[str, ...]]:
        """ Load images from a single folder.

        Args:
            folder_path (str): Path to the folder
            class_idx (int): Index of the class
            num_classes (int): Total number of classes
            kwargs (dict[str, Any]): Additional arguments for load_images_from_directory

        Returns:
            tuple[list[NDArray[Any]], NDArray[Any], tuple[str, ...]]: Tuple containing (images, one-hot label, filepaths)

        Examples:
            .. code-block:: python

                > images, label, filepaths = GroupingStrategy._load_folder(
                    folder_path="data/pizza/pizza1",
                    class_idx=0,
                    num_classes=2,
                    kwargs={"color_mode": "grayscale"}
                )
        """
        # Load images from the folder
        images_and_paths: list[tuple[NDArray[Any], str]] = load_images_from_directory(folder_path, **kwargs)
        images, paths = zip(*images_and_paths, strict=True) if images_and_paths else ([], [])
        images: list[NDArray[Any]]
        paths: list[str]

        # Create a one-hot encoded label vector
        label: NDArray[Any] = np.zeros(num_classes)
        label[class_idx] = 1.0

        return list(images), label, tuple(paths)

    @staticmethod
    @handle_error(error_log=DataScienceConfig.ERROR_LOG)
    def image_dataset_from_directory(
        grouping_strategy: GroupingStrategy,
        path: str,
        seed: int,
        **kwargs: Any
    ) -> tuple[XyTuple, tuple[str, ...], GroupingStrategy]:
        """ Load images from a directory while keeping groups of images together.

        Args:
            grouping_strategy (GroupingStrategy): Grouping strategy to use
            path (str): Path to the dataset directory
            seed (int): Random seed for shuffling
            **kwargs (Any): Additional arguments passed to load_images_from_directory

        Returns:
            XyTuple: XyTuple with the data
            tuple[str, ...]: Tuple of class names (strings)
            GroupingStrategy: Grouping strategy used (it may have been adjusted)

        Examples:
            .. code-block:: python

                > data = GroupingStrategy.image_dataset_from_directory(
                    grouping_strategy=GroupingStrategy.NONE,
                    path="data/pizza",
                    seed=42,
                    color_mode="grayscale"
                )
                > all_data: XyTuple = data[0]
                > all_labels: tuple[str, ...] = data[1]
146
+ > all_labels: tuple[str, ...] = data[1]
147
+ """
148
+ # Get all subdirectories (classes)
149
+ path = clean_path(path)
150
+ class_dirs: tuple[str, ...] = tuple(d for d in os.listdir(path) if os.path.isdir(f"{path}/{d}"))
151
+
152
+ # Check if there are subfolders in each class
153
+ any_subfolders: bool = any(
154
+ os.path.isdir(f"{path}/{class_dir}/{sub_dir}")
155
+ for class_dir in class_dirs for sub_dir in os.listdir(f"{path}/{class_dir}")
156
+ )
157
+
158
+ # Verify if wrong grouping strategy, then adjust it
159
+ if grouping_strategy != GroupingStrategy.NONE and not any_subfolders:
160
+ warning(
161
+ f"Strategy is {grouping_strategy.name} but there are no subfolders in each class, adjusting to NONE "
162
+ "as there is no way to group the images together, that just doesn't make sense"
163
+ )
164
+ grouping_strategy = GroupingStrategy.NONE
165
+
        # Prepare multiprocessing arguments
        queue: list[tuple[str, int, int, dict[str, Any]]] = []
        for class_idx, class_dir in enumerate(class_dirs):
            class_path: str = f"{path}/{class_dir}"

            # Add subfolders (class1/subject1/) to the queue
            sub_folders: list[str] = [d for d in os.listdir(class_path) if os.path.isdir(f"{class_path}/{d}")]
            for sub_folder in sub_folders:
                folder_path: str = f"{class_path}/{sub_folder}"
                queue.append((folder_path, class_idx, len(class_dirs), kwargs))

            # Add files located directly in the class folder to the queue
            files: list[str] = [f for f in os.listdir(class_path) if os.path.isfile(f"{class_path}/{f}")]
            for file in files:
                queue.append((f"{class_path}/{file}", class_idx, len(class_dirs), kwargs))

        # Process folders in parallel
        splitted: list[str] = path.split('/')
        description: str = f".../{splitted[-1]}" if len(splitted) > 2 else path
        extracted_folders: list[tuple[list[NDArray[Any]], NDArray[Any], tuple[str, ...]]] = multiprocessing(
            GroupingStrategy._load_folder,
            queue,
            use_starmap=True,
            desc=f"Loading dataset '{description}'"
        )

        # Prepare the result containers
        all_X: list[list[NDArray[Any]]] = []
        all_y: list[NDArray[Any]] = []
        all_filenames: list[tuple[str, ...]] = []

        # For each extracted folder (typically one subject)
        for images, label, filepaths in extracted_folders:
            if not images:
                continue  # Skip if no images are found

            to_append_X: list[NDArray[Any]] = []
            to_append_filepaths: list[str] = []

            # For each image of the subject, collect the data
            for image, filepath in zip(images, filepaths, strict=True):
                to_append_X.append(image)
                to_append_filepaths.append(filepath)

            # Append the subject if there are images
            if to_append_X:

                # If concatenate strategy, combine images along the channel axis
                if grouping_strategy == GroupingStrategy.CONCATENATE:
                    # Step 1: Make an array of shape (num_images, height, width, channels)
                    images_array = np.array(to_append_X)

                    # Step 2: Transpose to move channels next to num_images
                    # From (num_images, height, width, channels) to (height, width, num_images, channels)
                    images_array = np.transpose(images_array, (1, 2, 0, 3))

                    # Step 3: Reshape to combine the num_images and channels dimensions
                    # From (height, width, num_images, channels) to (height, width, num_images * channels)
                    images_array = images_array.reshape(images_array.shape[0], images_array.shape[1], -1)
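                    # e.g. three RGB 224x224 images become a single (224, 224, 9) array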

                    # Step 4: Add the single concatenated feature array
                    all_X.append([images_array])

                # Else, just add the images
                else:
                    all_X.append(to_append_X)

                all_y.append(label)
                all_filenames.append(tuple(to_append_filepaths))

        # Fix different sizes of images
        if grouping_strategy == GroupingStrategy.CONCATENATE:
            all_X = GroupingStrategy.fix_different_sizes(all_X, grouping_strategy)

        # Shuffle the data
        combined = list(zip(all_X, all_y, all_filenames, strict=True))
        np.random.seed(seed)
        np.random.shuffle(combined)  # pyright: ignore [reportArgumentType]
        all_X, all_y, all_filenames = zip(*combined, strict=True)

        # Create a XyTuple and return it
        return XyTuple(all_X, all_y, tuple(all_filenames)), class_dirs, grouping_strategy

    @staticmethod
    def fix_different_sizes(data: list[list[NDArray[Any]]], grouping_strategy: GroupingStrategy) -> list[list[NDArray[Any]]]:
        """ Fix different sizes of images in a list of lists of numpy arrays.

        For the CONCATENATE strategy, images with fewer channels than the group maximum
        get extra channels along shape[-1] (existing channels are repeated, then sliced).
        Data for other strategies is returned unchanged.

        Args:
            data (list[list[NDArray[Any]]]): List of lists of numpy arrays
            grouping_strategy (GroupingStrategy): Grouping strategy used

        Returns:
            list[list[NDArray[Any]]]: List of lists of numpy arrays with consistent shapes

        Examples:
            >>> # Concatenate grouping strategy
            >>> data = [[np.zeros((7, 224, 224, 3))], [np.zeros((1, 224, 224, 1))]]
            >>> data = GroupingStrategy.fix_different_sizes(data, GroupingStrategy.CONCATENATE)
            >>> data[0][0].shape
            (7, 224, 224, 3)
            >>> data[1][0].shape
            (1, 224, 224, 3)
            >>> data[1][0].shape[0] == data[0][0].shape[0]
            False
            >>> data[1][0].shape[-1] == data[0][0].shape[-1]
            True
        """
        # Align the channel count of images that have fewer channels than others
        if grouping_strategy == GroupingStrategy.CONCATENATE:
            # Find the maximum number of channels across all images in all groups
            max_num_channels: int = max(x.shape[-1] for group in data for x in group)

            for i, group in enumerate(data):
                for j, image in enumerate(group):
                    if image.shape[-1] < max_num_channels:
                        # Calculate how many times to repeat the channels
                        repeat_count: int = int(np.ceil(max_num_channels / image.shape[-1]))

                        # Repeat the channels and then slice to get exactly the right number
                        repeated_channels = np.repeat(image, repeat_count, axis=-1)
                        data[i][j] = repeated_channels[..., :max_num_channels]
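                        # e.g. 2 channels with a group maximum of 3: repeat twice -> 4 channels, slice -> 3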

        # Return the fixed list of lists of numpy arrays
        return data
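
For reference, a minimal usage sketch of the loader shown in this hunk; the dataset path, color_mode value and printed class names are illustrative assumptions, and the import path simply mirrors file 37 in the list above:

from stouputils.data_science.dataset.grouping_strategy import GroupingStrategy

# Load a dataset where each class folder may contain one subfolder per subject;
# the strategy is adjusted to NONE automatically when no subfolders exist
data, class_names, used_strategy = GroupingStrategy.image_dataset_from_directory(
    grouping_strategy=GroupingStrategy.CONCATENATE,
    path="data/pizza",  # illustrative path
    seed=42,
    color_mode="grayscale",  # forwarded to load_images_from_directory
)
print(class_names)  # tuple of class folder names, e.g. ("class1", "class2")
print(used_strategy)  # GroupingStrategy actually applied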