spacr 1.0.9__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spacr/__init__.py +3 -2
- spacr/app_classify.py +10 -0
- spacr/app_mask.py +9 -0
- spacr/app_measure.py +9 -0
- spacr/app_sequencing.py +9 -0
- spacr/core.py +172 -1
- spacr/deep_spacr.py +296 -7
- spacr/gui.py +68 -0
- spacr/gui_core.py +319 -10
- spacr/gui_elements.py +772 -13
- spacr/gui_utils.py +304 -12
- spacr/io.py +887 -71
- spacr/logger.py +36 -0
- spacr/measure.py +206 -28
- spacr/ml.py +606 -142
- spacr/plot.py +797 -131
- spacr/sequencing.py +363 -8
- spacr/settings.py +1158 -38
- spacr/sp_stats.py +80 -12
- spacr/spacr_cellpose.py +115 -2
- spacr/submodules.py +747 -19
- spacr/timelapse.py +237 -53
- spacr/toxo.py +132 -6
- spacr/utils.py +2422 -80
- {spacr-1.0.9.dist-info → spacr-1.1.1.dist-info}/METADATA +31 -17
- {spacr-1.0.9.dist-info → spacr-1.1.1.dist-info}/RECORD +30 -30
- {spacr-1.0.9.dist-info → spacr-1.1.1.dist-info}/LICENSE +0 -0
- {spacr-1.0.9.dist-info → spacr-1.1.1.dist-info}/WHEEL +0 -0
- {spacr-1.0.9.dist-info → spacr-1.1.1.dist-info}/entry_points.txt +0 -0
- {spacr-1.0.9.dist-info → spacr-1.1.1.dist-info}/top_level.txt +0 -0
spacr/io.py
CHANGED
```diff
@@ -25,11 +25,51 @@ from sklearn.model_selection import train_test_split
 from pylibCZIrw import czi as pyczi
 
 def process_non_tif_non_2D_images(folder):
-    """
-
+    """
+    Process and standardize image files in a folder by converting or splitting them into grayscale TIFFs.
+
+    This function supports various image formats (PNG, JPEG, CZI, ND2, TIFF) and ensures all output images
+    are grayscale TIFF files saved with consistent naming based on dimensions (channel, z-plane, timepoint).
+
+    For 2D grayscale images in non-TIFF formats, it converts them to TIFF.
+    For 3D, 4D, or 5D images, it splits them into individual grayscale channels and saves them with suffixes
+    (_C#, _Z#, _T#) to indicate channel, z-stack, and time point respectively.
+
+    Args:
+        folder (str): Path to the folder containing image files to be processed.
+
+    Supported file extensions:
+        - .tif, .tiff
+        - .png
+        - .jpg, .jpeg
+        - .czi
+        - .nd2
+
+    Output:
+        - Saves standardized grayscale TIFF images in the same folder with descriptive filenames.
+        - Prints a log message for each file processed or skipped.
+    """
     # Helper function to save grayscale images
     def save_grayscale_images(image, base_name, folder, dtype, channel=None, z=None, t=None):
-        """
+        """
+        Save a single grayscale image slice as a TIFF file with a descriptive filename.
+
+        The output filename is constructed from the base name and optionally includes
+        suffixes for channel (C#), z-plane (Z#), and timepoint (T#) to reflect its position
+        in a multidimensional dataset.
+
+        Args:
+            image (np.ndarray): The grayscale image array to save.
+            base_name (str): The base filename (without extension).
+            folder (str): Directory in which to save the output TIFF.
+            dtype (np.dtype): Desired data type to cast the image before saving (e.g., np.uint8, np.uint16).
+            channel (int, optional): Channel index (1-based) to include in the filename.
+            z (int, optional): Z-plane index (1-based) to include in the filename.
+            t (int, optional): Timepoint index (1-based) to include in the filename.
+
+        Output:
+            Saves the image as a `.tif` file in the specified folder with the constructed filename.
+        """
         suffix = ""
         if channel is not None:
             suffix += f"_C{channel}"
@@ -43,7 +83,27 @@ def process_non_tif_non_2D_images(folder):
 
     # Function to handle splitting of multi-dimensional images into grayscale channels
     def split_channels(image, folder, base_name, dtype):
-        """
+        """
+        Split and save multi-dimensional image data into individual grayscale TIFF files.
+
+        This function handles 3D, 4D, and 5D images by separating each channel (and optionally
+        z-slices and timepoints) and saving each as an individual grayscale image using the
+        `save_grayscale_images` function.
+
+        Args:
+            image (np.ndarray): Input image array with shape:
+                - 3D: (height, width, channels)
+                - 4D: (height, width, channels, z)
+                - 5D: (height, width, channels, z, t)
+            folder (str): Output directory where the grayscale images will be saved.
+            base_name (str): Base name used for constructing output filenames.
+            dtype (np.dtype): Desired data type to cast images before saving.
+
+        Note:
+            2D grayscale images are ignored, as they should be handled separately.
+            The output TIFF filenames will include suffixes like `_C1`, `_Z1`, `_T1` to
+            indicate channel, z-plane, and timepoint respectively.
+        """
         if image.ndim == 2:
             # Grayscale image, already processed separately
             return
@@ -68,7 +128,29 @@ def process_non_tif_non_2D_images(folder):
 
     # Function to load images in various formats
     def load_image(file_path):
-        """
+        """
+        Load an image file of various supported formats and return it as a NumPy array.
+
+        Supports TIFF, PNG, JPEG, CZI, and ND2 image formats. Converts the image to a NumPy array
+        and returns it along with its data type for further processing.
+
+        Args:
+            file_path (str): Path to the image file.
+
+        Returns:
+            tuple: A tuple (image, dtype) where:
+                - image (np.ndarray): Loaded image as a NumPy array.
+                - dtype: Data type of the image (e.g., np.uint16, np.float32, etc.).
+
+        Raises:
+            ValueError: If the file extension is not supported.
+
+        Supported formats:
+            - .tif, .tiff (TIFF)
+            - .png, .jpg, .jpeg (standard image formats)
+            - .czi (Zeiss CZI microscopy format)
+            - .nd2 (Nikon ND2 microscopy format)
+        """
         ext = os.path.splitext(file_path)[1].lower()
 
         if ext in ['.tif', '.tiff']:
@@ -94,7 +176,21 @@ def process_non_tif_non_2D_images(folder):
 
     # Function to check if an image is grayscale and save it as a TIFF if it isn't already
     def convert_grayscale_to_tiff(image, filename, folder, dtype):
-        """
+        """
+        Convert a grayscale image to TIFF format and save it, preserving the original bit depth.
+
+        This function is intended for grayscale (2D) images in non-TIFF formats (e.g., PNG, JPEG).
+        It converts the image to the specified dtype and saves it as a TIFF in the specified folder.
+
+        Args:
+            image (np.ndarray): Grayscale image as a NumPy array.
+            filename (str): Original filename (used to derive output name).
+            folder (str): Destination folder where the TIFF image will be saved.
+            dtype (np.dtype): Data type to cast the image before saving.
+
+        Returns:
+            None
+        """
        base_name = os.path.splitext(filename)[0]
        output_filename = os.path.join(folder, f"{base_name}.tif")
        tifffile.imwrite(output_filename, image.astype(dtype))
```
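The new docstring pins down the public entry point's contract. A minimal usage sketch, assuming spacr 1.1.1 is installed; the folder path is a placeholder:

```python
# Standardize a folder of mixed-format microscopy images in place.
# Output grayscale TIFFs land in the same folder with _C#/_Z#/_T# suffixes.
from spacr.io import process_non_tif_non_2D_images

process_non_tif_non_2D_images("/data/experiment1/images")
```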
```diff
@@ -130,7 +226,20 @@ def process_non_tif_non_2D_images(folder):
             print(f"Error processing {filename}: {str(e)}")
 
 def _load_images_and_labels(image_files, label_files, invert=False):
-
+    """
+    Load image and label files from disk and optionally normalize intensity.
+
+    Args:
+        image_files (list[str]): List of paths to image files.
+        label_files (list[str]): List of paths to label (mask) files.
+        invert (bool): If True, invert the intensity of input images.
+
+    Returns:
+        images (list[np.ndarray]): List of loaded image arrays.
+        labels (list[np.ndarray]): List of loaded label arrays.
+        image_names (list[str]): List of image file names (no paths).
+        label_names (list[str]): List of label file names (no paths).
+    """
     from .utils import invert_image
 
     images = []
@@ -191,7 +300,29 @@ def _load_images_and_labels(image_files, label_files, invert=False):
 def _load_normalized_images_and_labels(image_files, label_files, channels=None, percentiles=None,
                                        invert=False, visualize=False, remove_background=False,
                                        background=0, Signal_to_noise=10, target_height=None, target_width=None):
-
+    """
+    Load, normalize, and optionally resize images and labels for downstream analysis.
+
+    Args:
+        image_files (list[str]): List of paths to image files.
+        label_files (list[str] or None): List of paths to label (mask) files.
+        channels (list[int] or None): Indices of image channels to retain.
+        percentiles (list[int, int] or None): Percentile range for intensity normalization.
+        invert (bool): If True, invert image intensity.
+        visualize (bool): If True, display plots of raw and normalized images.
+        remove_background (bool): If True, zero pixels below `background` threshold.
+        background (float): Background intensity threshold.
+        Signal_to_noise (float): Minimum signal-to-noise ratio used to detect saturation.
+        target_height (int or None): Target height for image resizing.
+        target_width (int or None): Target width for image resizing.
+
+    Returns:
+        normalized_images (list[np.ndarray]): List of normalized image arrays.
+        labels (list[np.ndarray]): List of label arrays (resized if needed).
+        image_names (list[str]): List of image file names.
+        label_names (list[str]): List of label file names.
+        orig_dims (list[Tuple[int, int]]): Original dimensions of each image before resizing.
+    """
     from .plot import normalize_and_visualize, plot_resize
     from .utils import invert_image, apply_mask
     from skimage.transform import resize as resizescikit
@@ -297,23 +428,54 @@ def _load_normalized_images_and_labels(image_files, label_files, channels=None,
 
 class CombineLoaders:
     """
-    A class that combines multiple data loaders into a single
+    A class that combines multiple PyTorch data loaders into a single iterable.
+
+    This class allows iteration over a mixed sequence of batches from several
+    data loaders, yielding a tuple with the loader index and the corresponding batch.
+    Once a loader is exhausted, it is removed from the iteration pool.
 
     Args:
-        train_loaders (list): A list of
+        train_loaders (list): A list of PyTorch DataLoader objects.
 
     Raises:
-        StopIteration:
+        StopIteration: When all data loaders are exhausted.
     """
 
     def __init__(self, train_loaders):
+        """
+        Initialize the CombineLoaders instance.
+
+        Converts each data loader into an iterator for independent traversal.
+
+        Args:
+            train_loaders (list): List of torch.utils.data.DataLoader instances.
+        """
         self.train_loaders = train_loaders
         self.loader_iters = [iter(loader) for loader in train_loaders]
 
     def __iter__(self):
+        """
+        Return the iterator object (self).
+
+        Returns:
+            CombineLoaders: The iterator object itself.
+        """
         return self
 
     def __next__(self):
+        """
+        Return the next batch from the available data loaders.
+
+        Data loaders are shuffled at each step to randomize the batch source.
+        If a data loader is exhausted, it is removed from the pool.
+
+        Returns:
+            tuple: A tuple (i, batch) where i is the index of the originating loader,
+                and batch is the next batch of data from that loader.
+
+        Raises:
+            StopIteration: When all loaders have been exhausted.
+        """
         while self.loader_iters:
             random.shuffle(self.loader_iters)
             for i, loader_iter in enumerate(self.loader_iters):
@@ -329,14 +491,26 @@ class CombineLoaders:
 
 class CombinedDataset(Dataset):
     """
-    A dataset that combines multiple datasets into one.
+    A dataset that combines multiple datasets into one seamless dataset.
+
+    This class supports optional shuffling across datasets and presents
+    a unified indexing interface for training or evaluation.
 
     Args:
-        datasets (list): A list of
-        shuffle (bool, optional): Whether to shuffle the
+        datasets (list): A list of PyTorch Dataset objects to combine.
+        shuffle (bool, optional): Whether to shuffle the indices for data access. Defaults to True.
     """
 
     def __init__(self, datasets, shuffle=True):
+        """
+        Initialize the CombinedDataset.
+
+        Computes lengths of each dataset and optionally shuffles the access indices.
+
+        Args:
+            datasets (list): A list of datasets to be combined.
+            shuffle (bool, optional): Whether to shuffle the combined dataset. Defaults to True.
+        """
         self.datasets = datasets
         self.lengths = [len(dataset) for dataset in datasets]
         self.total_length = sum(self.lengths)
@@ -347,6 +521,17 @@ class CombinedDataset(Dataset):
         else:
             self.indices = None
     def __getitem__(self, index):
+        """
+        Retrieve an item from the combined dataset.
+
+        The method accounts for shuffling and maps the index to the appropriate dataset.
+
+        Args:
+            index (int): Index of the item in the combined dataset.
+
+        Returns:
+            Any: The item retrieved from the corresponding sub-dataset.
+        """
         if self.shuffle:
             index = self.indices[index]
         for dataset, length in zip(self.datasets, self.lengths):
@@ -354,6 +539,12 @@ class CombinedDataset(Dataset):
                 return dataset[index]
             index -= length
     def __len__(self):
+        """
+        Return the total length of the combined dataset.
+
+        Returns:
+            int: Total number of items across all datasets.
+        """
         return self.total_length
 
 class NoClassDataset(Dataset):
```
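A short sketch of the `CombinedDataset` behavior the new docstring describes, using two toy `TensorDataset`s (datasets and sizes are illustrative):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
from spacr.io import CombinedDataset

# Two toy datasets of different sizes, presented as one index space.
ds_a = TensorDataset(torch.randn(8, 3), torch.zeros(8))
ds_b = TensorDataset(torch.randn(4, 3), torch.ones(4))

combined = CombinedDataset([ds_a, ds_b], shuffle=True)
print(len(combined))  # 12: total items across both datasets
loader = DataLoader(combined, batch_size=4)
```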
```diff
@@ -431,9 +622,38 @@ class NoClassDataset(Dataset):
             img = ToTensor()(img)
         return img, self.filenames[index]
 
-
 class spacrDataset(Dataset):
+    """
+    Custom PyTorch Dataset for loading labeled image data organized by class folders or from specified file lists.
+
+    This dataset supports loading images either from directory structures organized by class or from explicit
+    file and label lists. It supports optional preloading of all images into memory for faster access.
+
+    Args:
+        data_dir (str): Root directory containing subfolders for each class.
+        loader_classes (list[str]): List of class names corresponding to subfolder names in `data_dir`.
+        transform (callable, optional): Transform to apply to images (e.g., torchvision transforms).
+        shuffle (bool): Whether to shuffle the dataset. Default is True.
+        pin_memory (bool): If True, pre-load all images into memory using multiprocessing. Default is False.
+        specific_files (list[str], optional): Specific image file paths to load instead of scanning `data_dir`.
+        specific_labels (list[int], optional): Corresponding labels for `specific_files`.
+    """
     def __init__(self, data_dir, loader_classes, transform=None, shuffle=True, pin_memory=False, specific_files=None, specific_labels=None):
+        """
+        Initialize the spacrDataset.
+
+        Constructs the dataset either by scanning the data directory or using provided file paths and labels.
+        Optionally shuffles and preloads images into memory.
+
+        Args:
+            data_dir (str): Directory containing class subfolders.
+            loader_classes (list): List of class names.
+            transform (callable, optional): Transform function to apply to images.
+            shuffle (bool): Whether to shuffle the dataset. Default is True.
+            pin_memory (bool): Whether to preload images into memory. Default is False.
+            specific_files (list[str], optional): List of file paths to use directly.
+            specific_labels (list[int], optional): List of labels corresponding to specific files.
+        """
         self.data_dir = data_dir
         self.classes = loader_classes
         self.transform = transform
@@ -463,23 +683,59 @@ class spacrDataset(Dataset):
             self.images = None
 
     def load_image(self, img_path):
+        """
+        Load and return a single image with orientation correction.
+
+        Args:
+            img_path (str): Path to the image file.
+
+        Returns:
+            PIL.Image: Loaded RGB image.
+        """
         img = Image.open(img_path).convert('RGB')
         img = ImageOps.exif_transpose(img)  # Handle image orientation
         return img
 
     def __len__(self):
+        """
+        Return the number of samples in the dataset.
+
+        Returns:
+            int: Total number of images.
+        """
         return len(self.filenames)
 
     def shuffle_dataset(self):
+        """
+        Shuffle the dataset filenames and labels in unison.
+        """
         combined = list(zip(self.filenames, self.labels))
         random.shuffle(combined)
         self.filenames, self.labels = zip(*combined)
 
     def get_plate(self, filepath):
+        """
+        Extract the plate identifier from the filename.
+
+        Args:
+            filepath (str): Full path to the file.
+
+        Returns:
+            str: Plate ID extracted from the filename.
+        """
         filename = os.path.basename(filepath)
         return filename.split('_')[0]
 
     def __getitem__(self, index):
+        """
+        Retrieve an image, its label, and the filename.
+
+        Args:
+            index (int): Index of the image to retrieve.
+
+        Returns:
+            tuple: (image, label, filename)
+        """
         if self.pin_memory:
             img = self.images[index]
         else:
@@ -491,7 +747,29 @@ class spacrDataset(Dataset):
         return img, label, filename
 
 class spacrDataLoader(DataLoader):
+    """
+    Custom DataLoader with background batch preloading support using multiprocessing.
+
+    This class extends `torch.utils.data.DataLoader` and adds asynchronous background
+    preloading of a specified number of batches using a separate process or in-place loading
+    if `pin_memory=True`.
+
+    Args:
+        *args: Arguments passed to the base DataLoader.
+        preload_batches (int): Number of batches to preload in a background process. Default is 1.
+        **kwargs: Keyword arguments passed to the base DataLoader. Supports all standard DataLoader arguments.
+    """
     def __init__(self, *args, preload_batches=1, **kwargs):
+        """
+        Initialize the spacrDataLoader.
+
+        Sets up the queue and multiprocessing process for background preloading of batches.
+
+        Args:
+            *args: Arguments passed to torch.utils.data.DataLoader.
+            preload_batches (int): Number of batches to preload. Default is 1.
+            **kwargs: Keyword arguments passed to the base DataLoader.
+        """
         super().__init__(*args, **kwargs)
         self.preload_batches = preload_batches
         self.batch_queue = Queue(maxsize=preload_batches)
@@ -502,6 +780,12 @@ class spacrDataLoader(DataLoader):
         atexit.register(self.cleanup)
 
     def _preload_next_batches(self):
+        """
+        Internal method to fetch the next N batches and put them in the queue.
+
+        If `pin_memory` is True, batches are pinned to CUDA memory.
+        Stops if the iterator is exhausted or the stop event is set.
+        """
         try:
             for _ in range(self.preload_batches):
                 if self._stop_event:
@@ -514,6 +798,11 @@ class spacrDataLoader(DataLoader):
             pass
 
     def _start_preloading(self):
+        """
+        Start a new background process to preload batches.
+
+        If `pin_memory` is True, loading is done in the main thread instead.
+        """
         if self.process is None or not self.process.is_alive():
             self._iterator = iter(super().__iter__())
             if not self.pin_memory:
@@ -523,6 +812,15 @@ class spacrDataLoader(DataLoader):
                 self._preload_next_batches()  # Directly load if pin_memory is True
 
     def _pin_memory_batch(self, batch):
+        """
+        Recursively pin memory for all tensors in the batch.
+
+        Args:
+            batch: A batch of data, possibly a tuple, list, or tensor.
+
+        Returns:
+            The batch with pinned memory (if applicable).
+        """
         if isinstance(batch, (list, tuple)):
             return [b.pin_memory() if isinstance(b, torch.Tensor) else b for b in batch]
         elif isinstance(batch, torch.Tensor):
@@ -531,10 +829,24 @@ class spacrDataLoader(DataLoader):
             return batch
 
     def __iter__(self):
+        """
+        Return the iterator and initiate background preloading.
+
+        Returns:
+            self
+        """
         self._start_preloading()
         return self
 
     def __next__(self):
+        """
+        Return the next batch from the queue.
+
+        If the queue is empty and the process has exited, raises StopIteration.
+
+        Returns:
+            The next batch of data.
+        """
         if self.process and not self.process.is_alive() and self.batch_queue.empty():
             raise StopIteration
 
```
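Putting the two new docstrings together, a hedged usage sketch for `spacrDataset` plus `spacrDataLoader`; the folder layout (one subfolder per class) and the class names are assumptions:

```python
from torchvision import transforms
from spacr.io import spacrDataset, spacrDataLoader

# Assumes /data/train/nc and /data/train/pc each hold images of one class.
tfm = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])
dataset = spacrDataset("/data/train", loader_classes=["nc", "pc"], transform=tfm)
loader = spacrDataLoader(dataset, batch_size=16, preload_batches=2)

# __getitem__ returns (image, label, filename), so batches unpack the same way.
images, labels, filenames = next(iter(loader))
```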
```diff
@@ -554,51 +866,45 @@ class spacrDataLoader(DataLoader):
             raise StopIteration
 
     def cleanup(self):
+        """
+        Cleanup method to terminate background preloading processes.
+
+        Ensures graceful shutdown of worker processes at exit.
+        """
         self._stop_event = True
         if self.process and self.process.is_alive():
             self.process.terminate()
             self.process.join()
 
     def __del__(self):
+        """
+        Destructor to ensure cleanup is called when the object is deleted.
+        """
         self.cleanup()
 
-class
-
-
-        self.transform = transform
-        self.shuffle = shuffle
-        self.load_to_memory = load_to_memory
-        self.filenames = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, f))]
-        if self.shuffle:
-            self.shuffle_dataset()
-        if self.load_to_memory:
-            self.images = [self.load_image(f) for f in self.filenames]
-
-    def load_image(self, img_path):
-        img = Image.open(img_path).convert('RGB')
-        return img
-
-    def __len__(self):
+class TarImageDataset(Dataset):
+    """
+    A PyTorch Dataset for loading images directly from a .tar archive without extraction.
 
-
+    This is useful for large datasets stored as compressed tar archives, enabling on-the-fly
+    access to individual image files without unpacking the archive to disk.
 
-
-
-
+    Args:
+        tar_path (str): Path to the .tar archive containing image files.
+        transform (callable, optional): Optional transform to be applied on a sample.
 
-
-
-
-        else:
-            img = self.load_image(self.filenames[index])
-            if self.transform is not None:
-                img = self.transform(img)
-            else:
-                img = ToTensor()(img)
-        return img, self.filenames[index]
+    Attributes:
+        members (List[TarInfo]): List of image members in the tar archive.
+    """
 
-class TarImageDataset(Dataset):
     def __init__(self, tar_path, transform=None):
+        """
+        Initialize the dataset and index image members from the tar archive.
+
+        Args:
+            tar_path (str): Path to the .tar file.
+            transform (callable, optional): Transform function to apply to each image.
+        """
         self.tar_path = tar_path
         self.transform = transform
 
@@ -607,9 +913,24 @@ class TarImageDataset(Dataset):
             self.members = [m for m in f.getmembers() if m.isfile()]
 
     def __len__(self):
+        """
+        Return the number of image files in the archive.
+
+        Returns:
+            int: Number of image files.
+        """
         return len(self.members)
 
     def __getitem__(self, idx):
+        """
+        Retrieve an image by index directly from the tar archive.
+
+        Args:
+            idx (int): Index of the image to retrieve.
+
+        Returns:
+            tuple: (PIL.Image.Image or transformed image, str) where the string is the file name.
+        """
         with tarfile.open(self.tar_path, 'r') as f:
             m = self.members[idx]
             img_file = f.extractfile(m)
```
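A minimal sketch of the documented `TarImageDataset`, assuming a tar archive of image files; the path is a placeholder:

```python
from torchvision.transforms import ToTensor
from spacr.io import TarImageDataset

dataset = TarImageDataset("/data/cells.tar", transform=ToTensor())
img, name = dataset[0]        # transformed image plus the member's file name
print(len(dataset), name)
```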
```diff
@@ -621,8 +942,24 @@ class TarImageDataset(Dataset):
         return img, m.name
 
 def load_images_from_paths(images_by_key):
-
+    """
+    Load images from a dictionary mapping keys to lists of image file paths.
 
+    Each key in the input dictionary corresponds to a list of file paths. The function
+    loads each image as a NumPy array and returns a new dictionary with the same keys,
+    where each value is a list of loaded images.
+
+    Args:
+        images_by_key (dict): A dictionary where each key maps to a list of image file paths (str).
+
+    Returns:
+        dict: A dictionary where each key maps to a list of NumPy arrays representing the loaded images.
+
+    Notes:
+        - Images are loaded using PIL and converted to NumPy arrays.
+        - Any image that fails to load will be skipped, and an error message will be printed.
+    """
+    images_dict = {}
     for key, paths in images_by_key.items():
         images_dict[key] = []
         for path in paths:
@@ -796,7 +1133,33 @@ def _generate_time_lists(file_list):
     return sorted_file_lists
 
 def _move_to_chan_folder(src, regex, timelapse=False, metadata_type=''):
-
+    """
+    Organize image files in a source directory into channel-specific subfolders
+    based on metadata extracted from filenames using a regular expression.
+
+    This function assumes filenames contain fields like plate ID, well ID, field ID,
+    channel ID, and time point. It parses these from the filename using the provided
+    regex, reformats the filename, and moves the file into a subdirectory named after
+    the channel ID.
+
+    Args:
+        src (str or Path): Path to the source directory containing image files.
+        regex (str): Regular expression to extract metadata from filenames.
+            Expected named groups: plateID, wellID, fieldID, chanID, timeID.
+        timelapse (bool, optional): Whether to include the timeID in the new filename. Defaults to False.
+        metadata_type (str, optional): Special handling for specific metadata types.
+            If 'cq1', converts wellID to CQ1 format. Defaults to ''.
+
+    Notes:
+        - Only `.tif` and `.png` files are processed.
+        - Files are copied into folders named after their channel ID.
+        - A backup of the original files is moved to a new `orig/` folder.
+        - Skips files that do not match the regex or are missing required groups.
+        - Issues warnings if destination files already exist.
+
+    Returns:
+        None
+    """
     from .utils import _safe_int_convert, _convert_cq1_well_id
 
     src_path = src
```
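The `_move_to_chan_folder` docstring names the regex groups it expects. An illustrative pattern for a hypothetical filename layout such as `plate1_A01_f01_ch1_t001.tif` (the helper is internal and normally driven by `preprocess_img_data`):

```python
# Named groups match what the docstring lists: plateID, wellID, fieldID, chanID, timeID.
regex = (r"(?P<plateID>.+)_(?P<wellID>[A-P]\d{2})"
         r"_f(?P<fieldID>\d+)_ch(?P<chanID>\d+)_t(?P<timeID>\d+)\.tif")
```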
```diff
@@ -860,9 +1223,28 @@ def _move_to_chan_folder(src, regex, timelapse=False, metadata_type=''):
 
 def _merge_channels(src, plot=False):
     """
-    Merge
-
+    Merge single-channel image files from multiple folders into multi-channel NumPy arrays.
+
+    This function assumes the source directory `src` contains subdirectories named as channel
+    identifiers (e.g., '0', '01', ..., '100'), each holding single-channel image files
+    with identical filenames. It merges images with the same name across these folders
+    into a single multi-channel `.npy` file stored in the `stack/` subdirectory.
 
+    Args:
+        src (str or Path): Path to the parent directory containing channel subfolders.
+        plot (bool, optional): If True, plot the merged arrays after processing using `plot_arrays`. Defaults to False.
+
+    Returns:
+        int: The number of matching channel folders that were merged.
+
+    Notes:
+        - Only processes if `stack/` directory is empty.
+        - Output is saved as `.npy` files in `src/stack/`.
+        - Channel folders must be named as integers or zero-padded strings from '0' to '100'.
+        - Files are matched by filename across all channel folders.
+        - Skips if a file is not present in all channels or is not a file.
+        - Uses `_merge_file` to perform the merging operation.
+    """
     from .plot import plot_arrays
     from .utils import print_progress
 
@@ -1131,9 +1513,6 @@ def _normalize_img_batch(stack, channels, save_dtype, settings):
     return normalized_stack.astype(save_dtype)
 
 def concatenate_and_normalize(src, channels, save_dtype=np.float32, settings={}):
-    from .utils import print_progress
-    from .plot import plot_arrays
-
     """
     Concatenates and normalizes channel data from multiple files and saves the normalized data.
 
@@ -1153,7 +1532,9 @@ def concatenate_and_normalize(src, channels, save_dtype=np.float32, settings={})
     Returns:
         str: The directory path where the concatenated and normalized channel data is saved.
     """
-
+    from .utils import print_progress
+    from .plot import plot_arrays
+
     channels = [item for item in channels if item is not None]
 
     print(f"Generating concatenated and normalized channel data for channels: {channels}")
@@ -1528,12 +1909,7 @@ def delete_empty_subdirectories(folder_path):
             #print(f"Skipping non-empty directory: {full_dir_path}")
 
 #@log_function_call
-def preprocess_img_data(settings):
-
-    from .plot import plot_arrays
-    from .utils import _run_test_mode, _get_regex
-    from .settings import set_default_settings_preprocess_img_data
-
+def preprocess_img_data(settings):
     """
     Preprocesses image data by converting z-stack images to maximum intensity projection (MIP) images.
 
@@ -1560,6 +1936,11 @@ def preprocess_img_data(settings):
     Returns:
         None
     """
+
+    from .plot import plot_arrays
+    from .utils import _run_test_mode, _get_regex
+    from .settings import set_default_settings_preprocess_img_data
+
     src = settings['src']
 
     if len(os.listdir(src)) < 100:
@@ -2159,6 +2540,32 @@ def _results_to_csv(src, df, df_well):
     return cells, wells
 
 def read_plot_model_stats(train_file_path, val_file_path ,save=False):
+    def read_plot_model_stats(train_file_path, val_file_path, save=False):
+        """
+        Reads training and validation statistics from CSV files, generates plots for various metrics,
+        and optionally saves the plots as PDF files.
+        Args:
+            train_file_path (str): Path to the CSV file containing training statistics.
+            val_file_path (str): Path to the CSV file containing validation statistics.
+            save (bool, optional): If True, saves the plots as PDF files in the same directory as
+                the training file. If False, displays the plots interactively. Defaults to False.
+        Metrics Plotted:
+            - accuracy
+            - neg_accuracy
+            - pos_accuracy
+            - loss
+            - prauc
+            - optimal_threshold
+        Notes:
+            - The CSV files should have a column named 'epoch' and columns corresponding to the
+              metrics listed above.
+            - The plots are saved with filenames corresponding to the metric name (e.g., 'accuracy.pdf').
+        Raises:
+            FileNotFoundError: If the specified CSV files do not exist.
+            ValueError: If the CSV files do not contain the required columns.
+        Example:
+            >>> read_plot_model_stats("train_stats.csv", "val_stats.csv", save=True)
+        """
 
     def _plot_and_save(train_df, val_df, column='accuracy', save=False, path=None, dpi=600):
 
@@ -2295,6 +2702,30 @@ def _save_progress(dst, train_df, validation_df):
     return
 
 def _copy_missclassified(df):
+    """
+    Copies misclassified images to designated folders based on their classification.
+
+    This function identifies rows in the given DataFrame where the 'true_label'
+    does not match the 'predicted_label'. It then copies the corresponding files
+    to a "missclassified" directory, organizing them into subdirectories
+    ("pc" or "nc") based on the presence of "pc" in the original file path.
+
+    Args:
+        df (pandas.DataFrame): A DataFrame containing at least the following columns:
+            - 'filename': The file path of the image.
+            - 'true_label': The actual label of the image.
+            - 'predicted_label': The predicted label of the image.
+
+    Side Effects:
+        - Creates directories for storing misclassified images if they do not exist.
+        - Copies files from their original locations to the appropriate "missclassified" subdirectory.
+
+    Prints:
+        A message indicating the number of misclassified images copied.
+
+    Returns:
+        None
+    """
     misclassified = df[df['true_label'] != df['predicted_label']]
     for _, row in misclassified.iterrows():
         original_path = row['filename']
@@ -2310,6 +2741,20 @@ def _copy_missclassified(df):
     return
 
 def _read_db(db_loc, tables):
+    """
+    Reads data from specified tables in a SQLite database and applies metadata corrections.
+    Args:
+        db_loc (str): The file path to the SQLite database.
+        tables (list of str): A list of table names to read from the database.
+    Returns:
+        list of pandas.DataFrame: A list of DataFrames, each containing the data from one of the specified tables.
+    Notes:
+        - The function assumes the presence of utility functions `rename_columns_in_db` and `correct_metadata`
+          in the `utils` module.
+        - `rename_columns_in_db` is called to preprocess the database before reading.
+        - `correct_metadata` is applied to each DataFrame after reading.
+        - The database connection is closed after all tables are read.
+    """
 
     from .utils import rename_columns_in_db, correct_metadata
 
@@ -2325,6 +2770,31 @@ def _read_db(db_loc, tables):
     return dfs
 
 def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=10, pathogen_limit=10, change_plate=False):
+    """
+    Reads and merges data from multiple locations and tables, processes the data, and returns a merged DataFrame
+    along with a list of object-specific DataFrames.
+    Args:
+        locs (list): List of file paths or locations containing the data to be read.
+        tables (list): List of table names to be extracted and processed.
+        verbose (bool, optional): If True, prints detailed information about the processing steps. Defaults to False.
+        nuclei_limit (int or bool, optional): Limit on the number of nuclei per cell. If False, only single nuclei
+            per cell are retained. Defaults to 10.
+        pathogen_limit (int, float, or bool, optional): Limit on the number of pathogens per cell. If False, only
+            single pathogens per cell are retained. Defaults to 10.
+        change_plate (bool, optional): If True, assigns unique plate IDs to each location. Defaults to False.
+    Returns:
+        tuple:
+            - pd.DataFrame: A merged DataFrame containing processed data from all specified tables.
+            - list: A list of DataFrames for individual object types (e.g., cell, cytoplasm, nucleus, pathogen)
+              if they exist in the input data.
+    Notes:
+        - The function processes data from multiple tables such as 'cell', 'cytoplasm', 'nucleus', 'pathogen',
+          and 'png_list', if available.
+        - Data is grouped and merged based on unique identifiers such as 'prcfo' (plate, row, column, field, object).
+        - Metadata is generated and merged with the final DataFrame.
+        - The function handles missing data and applies limits on nuclei and pathogens per cell if specified.
+        - Verbose mode provides detailed logs of the processing steps and the resulting data dimensions.
+    """
 
     from .utils import _split_data
 
@@ -2459,6 +2929,15 @@ def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=10, pathogen_
     return merged_df, obj_df_ls
 
 def _read_mask(mask_path):
+    """
+    Reads a mask image from the specified file path and ensures it is of type uint16.
+
+    Parameters:
+        mask_path (str): The file path to the mask image.
+
+    Returns:
+        numpy.ndarray: The mask image as a NumPy array with dtype uint16.
+    """
     mask = imageio2.imread(mask_path)
     if mask.dtype != np.uint16:
         mask = img_as_uint(mask)
@@ -2466,10 +2945,24 @@ def _read_mask(mask_path):
 
 def convert_numpy_to_tiff(folder_path, limit=None):
     """
-    Converts all
-
-
-
+    Converts all .npy files in a folder to .tiff images and saves them in a 'tiff' subdirectory.
+
+    This function searches for `.npy` files in the specified folder, loads each as a NumPy array,
+    and writes it as a `.tiff` image using `tifffile.imwrite`. The resulting images are saved in
+    a `tiff` subdirectory within the input folder. Optionally, processing can be limited to a
+    specific number of files.
+
+    Parameters
+    ----------
+    folder_path : str
+        The path to the directory containing `.npy` files to be converted.
+    limit : int, optional
+        Maximum number of `.npy` files to convert. If None (default), all `.npy` files are converted.
+
+    Returns
+    -------
+    None
+        The function saves the converted TIFF files to disk and prints status messages.
     """
     # Create the subdirectory 'tiff' within the specified folder if it doesn't already exist
     tiff_subdir = os.path.join(folder_path, 'tiff')
```
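A one-line sketch of the documented converter; the folder is a placeholder and `limit` is optional:

```python
from spacr.io import convert_numpy_to_tiff

# Convert at most ten .npy stacks; TIFFs are written to /data/stacks/tiff/.
convert_numpy_to_tiff("/data/stacks", limit=10)
```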
```diff
@@ -2502,6 +2995,27 @@ def convert_numpy_to_tiff(folder_path, limit=None):
     return
 
 def generate_cellpose_train_test(src, test_split=0.1):
+    """
+    Splits a directory of TIFF images and corresponding Cellpose masks into training and test sets.
+
+    This function searches the `src` directory for TIFF images and ensures that corresponding
+    masks exist in the `src/masks/` folder. It then shuffles and splits the dataset into
+    training and test sets based on the specified `test_split` ratio. The resulting subsets
+    are copied into `train/` and `test/` folders (with `masks/` subfolders) located
+    in the parent directory of `src`.
+
+    Parameters
+    ----------
+    src : str
+        Path to the directory containing TIFF images and a subdirectory `masks/` with corresponding mask files.
+    test_split : float, optional
+        Proportion of the dataset to be used for testing (default is 0.1, i.e., 10%).
+
+    Returns
+    -------
+    None
+        Files are copied to disk and progress messages are printed.
+    """
     mask_src = os.path.join(src, 'masks')
     img_paths = glob.glob(os.path.join(src, '*.tif'))
     img_filenames = [os.path.basename(file) for file in img_paths]
```
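A hedged sketch of the documented split, assuming `src` holds `*.tif` images and a `masks/` subfolder with matching filenames:

```python
from spacr.io import generate_cellpose_train_test

# 90/10 train/test split; train/ and test/ folders are created next to src.
generate_cellpose_train_test("/data/cellpose_raw", test_split=0.1)
```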
```diff
@@ -2575,7 +3089,28 @@ def parse_gz_files(folder_path):
     return samples_dict
 
 def generate_dataset(settings={}):
+    """
+    Generates a tar archive containing a dataset of images collected from one or more database sources.
+
+    This function selects image paths from one or more SQLite databases, optionally samples from them,
+    and writes the images into a tar archive using multiprocessing to parallelize the process. Temporary tar
+    files are created and merged into a final tar file. The function also logs and saves dataset settings.
+
+    Parameters
+    ----------
+    settings : dict, optional
+        Dictionary of user-defined settings. The following keys are used:
+
+        - 'src' (str or list of str): Path(s) to the source folder(s) containing the database(s).
+        - 'experiment' (str): Name of the experiment, used to name the output tar.
+        - 'sample' (int or list, optional): If int, randomly sample that many images; if list, sample per src index.
+        - 'file_metadata' (str, optional): Metadata column name used to filter/select files.
 
+    Returns
+    -------
+    str
+        Path to the final tar archive containing the dataset.
+    """
     from .utils import initiate_counter, add_images_to_tar, save_settings, generate_path_list_from_db, correct_paths
     from .settings import set_generate_dataset_defaults
 
@@ -2792,7 +3327,35 @@ def generate_loaders(src, mode='train', image_size=224, batch_size=32, classes=[
     return train_loaders, val_loaders, train_fig
 
 def generate_training_dataset(settings):
-
+    """
+    Generate a training dataset from a SQLite database using measurement-based or annotation/metadata-based selection.
+
+    Depending on the `settings`, this function selects images corresponding to high and low phenotypes (e.g., recruitment)
+    or based on metadata or manual annotation. Selected image paths are grouped by class and returned for further use
+    (e.g., saving to folders or training models).
+
+    Parameters
+    ----------
+    settings : dict
+        Configuration dictionary with the following required keys:
+
+        - 'class_metadata' (list of str): Metadata conditions to define classes (e.g., treatment names).
+        - 'channel_of_interest' (int): Channel index used for computing recruitment scores.
+        - 'png_type' (str): Type of PNG to retrieve ('raw', 'outline', etc.).
+        - 'size' (int): Number of images to sample per class.
+        - 'nuclei_limit' (int or None): Minimum nucleus size for filtering (used in _read_and_merge_data).
+        - 'pathogen_limit' (int or None): Minimum pathogen size for filtering.
+        - 'custom_measurement' (list of str or None): If provided, defines custom numerator and denominator columns.
+        - 'classes' (list of str): Treatments to annotate using `annotate_conditions`.
+        - 'metadata_type_by' (str): Column in the DB to use for metadata classification ('columnID' or 'rowID').
+        - 'tables' (list of str): Tables to extract from database, e.g., ['cell', 'nucleus'].
+        - 'dataset_mode' (str): Either 'annotation' or 'metadata'. Controls how class sizes are determined.
+
+    Returns
+    -------
+    list of list of str
+        A list where each sublist contains paths to PNGs belonging to one class (e.g., low vs high recruitment).
+    """
     # Function to filter png_list_df by prcfo present in df without merging
     def filter_png_list(db_path, settings, tables = ['cell', 'nucleus', 'pathogen', 'cytoplasm']):
         df, _ = _read_and_merge_data(locs=[db_path],
@@ -2975,6 +3538,41 @@ def generate_training_dataset(settings):
     return train_class_dir, test_class_dir
 
 def training_dataset_from_annotation(db_path, dst, annotation_column='test', annotated_classes=(1, 2)):
+    """
+    Extracts image paths from a database and groups them into class-based lists based on annotation values.
+
+    This function reads from a SQLite database (`png_list` table), extracts image paths and corresponding
+    class annotations, and groups them by the specified `annotated_classes`. If only one class is provided,
+    it automatically generates a second class by sampling the remaining entries not in the target class
+    to create a balanced binary dataset.
+
+    Parameters
+    ----------
+    db_path : str
+        Path to the SQLite database file containing the `png_list` table.
+
+    dst : str
+        Output path (currently unused in the function, included for compatibility with caller).
+
+    annotation_column : str, default='test'
+        Column name in the `png_list` table that contains class annotations.
+
+    annotated_classes : tuple of int, default=(1, 2)
+        Class labels to extract from the annotation column.
+
+    Returns
+    -------
+    class_paths : list of list of str
+        A list where each sublist contains the file paths for images belonging to one class.
+        The number of sublists equals the number of unique classes returned.
+
+    Notes
+    -----
+    - If only one annotated class is provided, the function creates a balanced second class
+      from non-annotated images.
+    - This function does not copy or move any files — it only collects and returns path lists.
+    - All path and annotation data is assumed to be stored in the `png_list` table of the SQLite DB.
+    """
     all_paths = []
 
     # Connect to the database and retrieve the image paths and annotations
@@ -3030,6 +3628,51 @@ def training_dataset_from_annotation(db_path, dst, annotation_column='test', ann
     return class_paths
 
 def training_dataset_from_annotation_metadata(db_path, dst, annotation_column='test', annotated_classes=(1, 2), metadata_type_by='columnID', class_metadata=['c1','c2']):
+    """
+    Extracts annotated image paths from a database, filtered by metadata location (row/column).
+
+    This function reads image paths and annotations from a SQLite database (`png_list` table), filters them
+    by metadata (either `row_name` or `column_name`), and organizes them into class-specific lists based on
+    annotation values. If only one class is specified, the function samples a balanced second class from
+    remaining entries.
+
+    Parameters
+    ----------
+    db_path : str
+        Path to the SQLite database containing the `png_list` table.
+
+    dst : str
+        Output directory (unused in this function but required for compatibility).
+
+    annotation_column : str, default='test'
+        The column name in `png_list` storing the annotation labels.
+
+    annotated_classes : tuple of int, default=(1, 2)
+        Annotation values to be used for splitting data into separate class groups.
+
+    metadata_type_by : str, {'rowID', 'columnID'}, default='columnID'
+        Which metadata field to filter by — either 'rowID' (uses `row_name`) or 'columnID' (uses `column_name`).
+
+    class_metadata : list of str, default=['c1', 'c2']
+        The metadata values to include (e.g., specific row or column identifiers to filter on).
+
+    Returns
+    -------
+    class_paths : list of list of str
+        A list where each sublist contains paths to images in one class.
+
+    Raises
+    ------
+    ValueError
+        If `metadata_type_by` is not one of 'rowID' or 'columnID'.
+
+    Notes
+    -----
+    - If only one class is specified in `annotated_classes`, a second class is constructed by sampling
+      from non-target annotations in the filtered set to ensure balanced class representation.
+    - This function assumes that `png_path`, `annotation_column`, `row_name`, and `column_name` exist
+      in the `png_list` table.
+    """
     all_paths = []
 
     # Connect to the database and retrieve the image paths and annotations
@@ -3099,6 +3742,44 @@ def training_dataset_from_annotation_metadata(db_path, dst, annotation_column='t
     return class_paths
 
 def generate_dataset_from_lists(dst, class_data, classes, test_split=0.1):
+    """
+    Generates a train/test image dataset directory structure from class-wise path lists.
+
+    This function creates `train` and `test` subdirectories under the given destination directory (`dst`)
+    and copies the image files into class-specific folders after performing a train-test split.
+
+    Parameters
+    ----------
+    dst : str
+        Destination directory where the dataset will be created. Subdirectories for each class will be made under `train/` and `test/`.
+
+    class_data : list of list of str
+        A list where each sublist contains paths to image files belonging to a specific class.
+
+    classes : list of str
+        Class names corresponding to the order of `class_data`.
+
+    test_split : float, default=0.1
+        Proportion of data to be used for the test set. The remainder is used for training.
+
+    Returns
+    -------
+    train_dir : str
+        Path to the top-level training directory.
+
+    test_dir : str
+        Path to the top-level test directory.
+
+    Raises
+    ------
+    ValueError
+        If the number of class labels does not match the number of class data lists.
+
+    Notes
+    -----
+    The train/test split is deterministic (random_state=42).
+    File copying is timed and progress is reported via `print_progress`.
+    """
     from .utils import print_progress
     # Make sure that the length of class_data matches the length of classes
     if len(class_data) != len(classes):
```
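The two docstrings above compose into a small pipeline: collect class-wise path lists from the `png_list` table, then materialize `train/` and `test/` folders. Database path, column, and class names below are placeholders:

```python
from spacr.io import training_dataset_from_annotation, generate_dataset_from_lists

class_paths = training_dataset_from_annotation(
    "/data/measurements.db",
    dst="/data/dataset",
    annotation_column="test",
    annotated_classes=(1, 2),
)
train_dir, test_dir = generate_dataset_from_lists(
    "/data/dataset",
    class_data=class_paths,
    classes=["class1", "class2"],
    test_split=0.1,
)
```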
```diff
@@ -3146,7 +3827,38 @@ def generate_dataset_from_lists(dst, class_data, classes, test_split=0.1):
     return os.path.join(dst, 'train'), os.path.join(dst, 'test')
 
 def convert_separate_files_to_yokogawa(folder, regex):
-
+    """
+    Converts image files from a folder into Yokogawa-style naming format with optional MIP across Z-slices.
+
+    This function parses filenames using a provided regex, extracts metadata such as well ID, channel, field,
+    timepoint, and slice, and renames the images to the Yokogawa convention:
+    `plateX_WELL_TttttFfffL01Ccc.tif`. If multiple Z-slices exist, it computes a maximum intensity projection (MIP).
+
+    Parameters
+    ----------
+    folder : str
+        Path to the folder containing input TIFF images.
+
+    regex : str
+        Regular expression with named capture groups:
+        - 'plateID' (optional)
+        - 'wellID' (required)
+        - 'fieldID' (optional)
+        - 'timeID' (optional)
+        - 'chanID' (optional)
+        - 'sliceID' (optional)
+
+    Returns
+    -------
+    None
+        Saves renamed TIFF files and a CSV log (`rename_log.csv`) in the same folder.
+
+    Notes
+    -----
+    - Automatically assigns new well names (`plateX_WELL`) if missing or non-standard.
+    - Groups images by region (plate, well, field, time, channel) and performs MIP if multiple slices are present.
+    - Skips files that do not match the regex or are missing required metadata.
+    """
     ROWS = "ABCDEFGHIJKLMNOP"
     COLS = [f"{i:02d}" for i in range(1, 25)]
     WELLS = [f"{r}{c}" for r in ROWS for c in COLS]
```
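An illustrative call with a regex covering the named groups the docstring lists (only `wellID` is required); the filename layout `A01_s1_w1_z1.tif` is hypothetical:

```python
from spacr.io import convert_separate_files_to_yokogawa

regex = (r"(?P<wellID>[A-P]\d{2})_s(?P<fieldID>\d+)"
         r"_w(?P<chanID>\d+)_z(?P<sliceID>\d+)\.tif")
convert_separate_files_to_yokogawa("/data/raw", regex)
```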
```diff
@@ -3236,10 +3948,52 @@ def convert_separate_files_to_yokogawa(folder, regex):
 
 def convert_to_yokogawa(folder):
     """
-
-
+    Converts microscopy image files in a folder to Yokogawa-style TIFF filenames.
+
+    This function processes raw microscopy images in various formats (ND2, CZI, LIF, TIFF, PNG, JPEG, BMP)
+    and converts them into a standardized Yokogawa naming scheme using maximum intensity projections (MIPs).
+    Each image is assigned a unique well location (e.g., plate1_A01) across one or more 384-well plates.
+    The output files are saved in the same directory with renamed filenames. A CSV log is generated
+    to track the mapping between original files and the renamed TIFFs.
+
+    Parameters
+    ----------
+    folder : str
+        Path to the directory containing the input microscopy files.
+
+    Supported Formats
+    -----------------
+    - `.nd2` : Nikon ND2 format (processed using ND2Reader)
+    - `.czi` : Zeiss CZI format (processed using pyczi)
+    - `.lif` : Leica LIF format (processed using readlif)
+    - `.tif`, `.tiff`, `.png`, `.jpg`, `.jpeg`, `.bmp` : Image files (processed using tifffile)
+
+    Behavior
+    --------
+    - Computes maximum intensity projections across Z-stacks.
+    - Generates Yokogawa-style filenames: `plateX_<WELL>_T####F###L01C##.tif`
+    - Handles timepoints, Z-stacks, channels, fields, and scenes depending on format.
+    - Avoids reusing well positions across multiple files and scenes.
+    - Skips malformed or incomplete image structures.
+    - Logs all renamed output files to `rename_log.csv` in the same folder.
+
+    Output
+    ------
+    - Converted TIFF images saved in the input folder with Yokogawa-style filenames.
+    - A CSV log `rename_log.csv` containing columns:
+      'Original File', 'Renamed TIFF', 'ext', 'time', 'field', 'channel', 'z', 'scene', 'slice', 'well'
+
+    Notes
+    -----
+    - Requires `ND2Reader`, `pyczi`, `readlif`, `tifffile`, and `pandas`.
+    - Handles multi-dimensional images (2D, 3D, 4D).
+    - Images with unsupported dimensions or structure are skipped with warnings.
+
+    Example
+    -------
+    >>> convert_to_yokogawa("/path/to/raw_images")
+    Processing complete. Files saved in /path/to/raw_images and rename log saved as rename_log.csv.
     """
-
     def _get_next_well(used_wells):
         """
         Determines the next available well position across multiple 384-well plates.
@@ -3477,6 +4231,22 @@ def convert_to_yokogawa(folder):
     print(f"Processing complete. Files saved in {folder} and rename log saved as {csv_path}.")
 
 def apply_augmentation(image, method):
+    """
+    Applies the specified augmentation method to the given image.
+
+    Parameters:
+        image (numpy.ndarray): The input image to be augmented.
+        method (str): The augmentation method to apply. Supported methods are:
+            - 'rotate90': Rotates the image 90 degrees clockwise.
+            - 'rotate180': Rotates the image 180 degrees.
+            - 'rotate270': Rotates the image 90 degrees counterclockwise.
+            - 'flip_h': Flips the image horizontally.
+            - 'flip_v': Flips the image vertically.
+
+    Returns:
+        numpy.ndarray: The augmented image. If the method is not recognized,
+            the original image is returned unchanged.
+    """
     if method == 'rotate90':
         return cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
     elif method == 'rotate180':
```
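A tiny sketch cycling through the supported method keys (requires OpenCV and NumPy; the image is a dummy array):

```python
import numpy as np
from spacr.io import apply_augmentation

img = np.zeros((64, 64), dtype=np.uint16)
for method in ("rotate90", "rotate180", "rotate270", "flip_h", "flip_v"):
    out = apply_augmentation(img, method)  # unknown keys return img unchanged
```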
```diff
@@ -3490,6 +4260,24 @@ def apply_augmentation(image, method):
     return image
 
 def process_instruction(entry):
+    """
+    Processes a single image/mask entry by reading, optionally augmenting, and saving both image and mask.
+
+    Parameters
+    ----------
+    entry : dict
+        A dictionary with the following keys:
+        - 'src_img' (str): Path to the source image file.
+        - 'src_msk' (str): Path to the source mask file.
+        - 'dst_img' (str): Path to save the processed image.
+        - 'dst_msk' (str): Path to save the processed mask.
+        - 'augment' (str or None): Augmentation identifier to apply (e.g., 'rotate90', 'flip', or None).
+
+    Returns
+    -------
+    int
+        Returns 1 upon successful completion.
+    """
     img = tifffile.imread(entry["src_img"])
     msk = tifffile.imread(entry["src_msk"])
     if entry["augment"]:
@@ -3500,7 +4288,35 @@ def process_instruction(entry):
     return 1
 
 def prepare_cellpose_dataset(input_root, augment_data=False, train_fraction=0.8, n_jobs=None):
-
+    """
+    Prepare a training and testing dataset for Cellpose from multiple subdirectories containing TIFF images and corresponding masks.
+
+    This function scans all subfolders in `input_root` that contain a "masks/" directory, finds image-mask pairs,
+    and splits them into train/test sets. Optionally, it augments data using rotation and flipping to balance dataset sizes
+    across all datasets. All output is saved in a standardized format to a new "cellpose_dataset/" folder inside `input_root`.
+
+    Parameters
+    ----------
+    input_root : str
+        Path to the folder containing one or more datasets. Each dataset should have a 'masks/' subfolder with mask files
+        matching the TIFF filenames.
+
+    augment_data : bool, optional
+        If True, perform data augmentation (rotation/flipping) to increase or equalize the number of samples per dataset.
+        Default is False.
+
+    train_fraction : float, optional
+        Fraction of data to use for training. The rest will go to testing. Default is 0.8 (i.e., 80% train, 20% test).
+
+    n_jobs : int or None, optional
+        Number of parallel worker processes. If None, uses all available CPUs minus one.
+
+    Returns
+    -------
+    None
+        All output TIFFs are saved under `input_root/cellpose_dataset/train/` and `.../test/` folders with consistent naming.
+        A progress bar is printed to track the status of preprocessing.
+    """
     from .utils import print_progress
 
     time_ls = []
```
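Finally, a hedged sketch of the documented dataset builder; `input_root` is a placeholder holding dataset folders that each contain a `masks/` subfolder:

```python
from spacr.io import prepare_cellpose_dataset

prepare_cellpose_dataset(
    "/data/annotated",      # contains e.g. exp1/masks/, exp2/masks/
    augment_data=True,
    train_fraction=0.8,
    n_jobs=4,
)
```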