spacr 1.0.9__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
spacr/io.py CHANGED
@@ -25,11 +25,51 @@ from sklearn.model_selection import train_test_split
25
25
  from pylibCZIrw import czi as pyczi
26
26
 
27
27
  def process_non_tif_non_2D_images(folder):
28
- """Processes all images in the folder and splits them into grayscale channels, preserving bit depth."""
29
-
28
+ """
29
+ Process and standardize image files in a folder by converting or splitting them into grayscale TIFFs.
30
+
31
+ This function supports various image formats (PNG, JPEG, CZI, ND2, TIFF) and ensures all output images
32
+ are grayscale TIFF files saved with consistent naming based on dimensions (channel, z-plane, timepoint).
33
+
34
+ For 2D grayscale images in non-TIFF formats, it converts them to TIFF.
35
+ For 3D, 4D, or 5D images, it splits them into individual grayscale channels and saves them with suffixes
36
+ (_C#, _Z#, _T#) to indicate channel, z-stack, and time point respectively.
37
+
38
+ Args:
39
+ folder (str): Path to the folder containing image files to be processed.
40
+
41
+ Supported file extensions:
42
+ - .tif, .tiff
43
+ - .png
44
+ - .jpg, .jpeg
45
+ - .czi
46
+ - .nd2
47
+
48
+ Output:
49
+ - Saves standardized grayscale TIFF images in the same folder with descriptive filenames.
50
+ - Prints a log message for each file processed or skipped.
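+
+ Example (illustrative; the folder path is hypothetical):
+ >>> process_non_tif_non_2D_images("/data/experiment1/raw_images")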
51
+ """
30
52
  # Helper function to save grayscale images
31
53
  def save_grayscale_images(image, base_name, folder, dtype, channel=None, z=None, t=None):
32
- """Save grayscale images with appropriate suffix based on channel, z, and t, preserving bit depth."""
54
+ """
55
+ Save a single grayscale image slice as a TIFF file with a descriptive filename.
56
+
57
+ The output filename is constructed from the base name and optionally includes
58
+ suffixes for channel (C#), z-plane (Z#), and timepoint (T#) to reflect its position
59
+ in a multidimensional dataset.
60
+
61
+ Args:
62
+ image (np.ndarray): The grayscale image array to save.
63
+ base_name (str): The base filename (without extension).
64
+ folder (str): Directory in which to save the output TIFF.
65
+ dtype (np.dtype): Data type to cast the image to before saving (e.g., np.uint8, np.uint16).
66
+ channel (int, optional): Channel index (1-based) to include in the filename.
67
+ z (int, optional): Z-plane index (1-based) to include in the filename.
68
+ t (int, optional): Timepoint index (1-based) to include in the filename.
69
+
70
+ Output:
71
+ Saves the image as a `.tif` file in the specified folder with the constructed filename.
72
+ """
33
73
  suffix = ""
34
74
  if channel is not None:
35
75
  suffix += f"_C{channel}"
@@ -43,7 +83,27 @@ def process_non_tif_non_2D_images(folder):
43
83
 
44
84
  # Function to handle splitting of multi-dimensional images into grayscale channels
45
85
  def split_channels(image, folder, base_name, dtype):
46
- """Splits the image into channels and handles 3D, 4D, and 5D image cases."""
86
+ """
87
+ Split and save multi-dimensional image data into individual grayscale TIFF files.
88
+
89
+ This function handles 3D, 4D, and 5D images by separating each channel (and optionally
90
+ z-slices and timepoints) and saving each as an individual grayscale image using the
91
+ `save_grayscale_images` function.
92
+
93
+ Args:
94
+ image (np.ndarray): Input image array with shape:
95
+ - 3D: (height, width, channels)
96
+ - 4D: (height, width, channels, z)
97
+ - 5D: (height, width, channels, z, t)
98
+ folder (str): Output directory where the grayscale images will be saved.
99
+ base_name (str): Base name used for constructing output filenames.
100
+ dtype (np.dtype): Data type to cast images to before saving.
101
+
102
+ Note:
103
+ 2D grayscale images are ignored, as they should be handled separately.
104
+ The output TIFF filenames will include suffixes like `_C1`, `_Z1`, `_T1` to
105
+ indicate channel, z-plane, and timepoint respectively.
106
+ """
47
107
  if image.ndim == 2:
48
108
  # Grayscale image, already processed separately
49
109
  return
@@ -68,7 +128,29 @@ def process_non_tif_non_2D_images(folder):
68
128
 
69
129
  # Function to load images in various formats
70
130
  def load_image(file_path):
71
- """Loads image from various formats and returns it as a numpy array along with its dtype."""
131
+ """
132
+ Load an image file of various supported formats and return it as a NumPy array.
133
+
134
+ Supports TIFF, PNG, JPEG, CZI, and ND2 image formats. Converts the image to a NumPy array
135
+ and returns it along with its data type for further processing.
136
+
137
+ Args:
138
+ file_path (str): Path to the image file.
139
+
140
+ Returns:
141
+ tuple: A tuple (image, dtype) where:
142
+ - image (np.ndarray): Loaded image as a NumPy array.
143
+ - dtype: Data type of the image (e.g., np.uint16, np.float32, etc.).
144
+
145
+ Raises:
146
+ ValueError: If the file extension is not supported.
147
+
148
+ Supported formats:
149
+ - .tif, .tiff (TIFF)
150
+ - .png, .jpg, .jpeg (standard image formats)
151
+ - .czi (Zeiss CZI microscopy format)
152
+ - .nd2 (Nikon ND2 microscopy format)
153
+ """
72
154
  ext = os.path.splitext(file_path)[1].lower()
73
155
 
74
156
  if ext in ['.tif', '.tiff']:
@@ -94,7 +176,21 @@ def process_non_tif_non_2D_images(folder):
94
176
 
95
177
  # Function to check if an image is grayscale and save it as a TIFF if it isn't already
96
178
  def convert_grayscale_to_tiff(image, filename, folder, dtype):
97
- """Convert grayscale images that are not in TIFF format to TIFF, preserving bit depth."""
179
+ """
180
+ Convert a grayscale image to TIFF format and save it, preserving the original bit depth.
181
+
182
+ This function is intended for grayscale (2D) images in non-TIFF formats (e.g., PNG, JPEG).
183
+ It converts the image to the specified dtype and saves it as a TIFF in the specified folder.
184
+
185
+ Args:
186
+ image (np.ndarray): Grayscale image as a NumPy array.
187
+ filename (str): Original filename (used to derive output name).
188
+ folder (str): Destination folder where the TIFF image will be saved.
189
+ dtype (np.dtype): Data type to cast the image to before saving.
190
+
191
+ Returns:
192
+ None
193
+ """
98
194
  base_name = os.path.splitext(filename)[0]
99
195
  output_filename = os.path.join(folder, f"{base_name}.tif")
100
196
  tifffile.imwrite(output_filename, image.astype(dtype))
@@ -130,7 +226,20 @@ def process_non_tif_non_2D_images(folder):
130
226
  print(f"Error processing {filename}: {str(e)}")
131
227
 
132
228
  def _load_images_and_labels(image_files, label_files, invert=False):
133
-
229
+ """
230
+ Load image and label files from disk and optionally normalize intensity.
231
+
232
+ Args:
233
+ image_files (list[str]): List of paths to image files.
234
+ label_files (list[str]): List of paths to label (mask) files.
235
+ invert (bool): If True, invert the intensity of input images.
236
+
237
+ Returns:
238
+ images (list[np.ndarray]): List of loaded image arrays.
239
+ labels (list[np.ndarray]): List of loaded label arrays.
240
+ image_names (list[str]): List of image file names (no paths).
241
+ label_names (list[str]): List of label file names (no paths).
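+
+ Example (illustrative; the file paths are hypothetical):
+ >>> images, labels, image_names, label_names = _load_images_and_labels(
+ ... ["imgs/plate1_A01.tif"], ["masks/plate1_A01.tif"], invert=False)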
242
+ """
134
243
  from .utils import invert_image
135
244
 
136
245
  images = []
@@ -191,7 +300,29 @@ def _load_images_and_labels(image_files, label_files, invert=False):
191
300
  def _load_normalized_images_and_labels(image_files, label_files, channels=None, percentiles=None,
192
301
  invert=False, visualize=False, remove_background=False,
193
302
  background=0, Signal_to_noise=10, target_height=None, target_width=None):
194
-
303
+ """
304
+ Load, normalize, and optionally resize images and labels for downstream analysis.
305
+
306
+ Args:
307
+ image_files (list[str]): List of paths to image files.
308
+ label_files (list[str] or None): List of paths to label (mask) files.
309
+ channels (list[int] or None): Indices of image channels to retain.
310
+ percentiles (list of two ints or None): Lower and upper percentile bounds for intensity normalization.
311
+ invert (bool): If True, invert image intensity.
312
+ visualize (bool): If True, display plots of raw and normalized images.
313
+ remove_background (bool): If True, zero pixels below `background` threshold.
314
+ background (float): Background intensity threshold.
315
+ Signal_to_noise (float): Minimum signal-to-noise ratio used to detect saturation.
316
+ target_height (int or None): Target height for image resizing.
317
+ target_width (int or None): Target width for image resizing.
318
+
319
+ Returns:
320
+ normalized_images (list[np.ndarray]): List of normalized image arrays.
321
+ labels (list[np.ndarray]): List of label arrays (resized if needed).
322
+ image_names (list[str]): List of image file names.
323
+ label_names (list[str]): List of label file names.
324
+ orig_dims (list[tuple[int, int]]): Original dimensions of each image before resizing.
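+
+ Example (illustrative sketch; paths and parameter values are hypothetical):
+ >>> norm_imgs, labels, names, label_names, dims = _load_normalized_images_and_labels(
+ ... image_files=["imgs/plate1_A01.tif"], label_files=None, channels=[0],
+ ... percentiles=[2, 98], target_height=512, target_width=512)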
325
+ """
195
326
  from .plot import normalize_and_visualize, plot_resize
196
327
  from .utils import invert_image, apply_mask
197
328
  from skimage.transform import resize as resizescikit
@@ -297,23 +428,54 @@ def _load_normalized_images_and_labels(image_files, label_files, channels=None,
297
428
 
298
429
  class CombineLoaders:
299
430
  """
300
- A class that combines multiple data loaders into a single iterator.
431
+ A class that combines multiple PyTorch data loaders into a single iterable.
432
+
433
+ This class allows iteration over a mixed sequence of batches from several
434
+ data loaders, yielding a tuple with the loader index and the corresponding batch.
435
+ Once a loader is exhausted, it is removed from the iteration pool.
301
436
 
302
437
  Args:
303
- train_loaders (list): A list of data loaders.
438
+ train_loaders (list): A list of PyTorch DataLoader objects.
304
439
 
305
440
  Raises:
306
- StopIteration: If all data loaders have been exhausted.
441
+ StopIteration: When all data loaders are exhausted.
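+
+ Example (illustrative; assumes two small PyTorch DataLoaders):
+ >>> import torch
+ >>> from torch.utils.data import DataLoader, TensorDataset
+ >>> a = DataLoader(TensorDataset(torch.zeros(4, 3)), batch_size=2)
+ >>> b = DataLoader(TensorDataset(torch.ones(6, 3)), batch_size=2)
+ >>> for i, batch in CombineLoaders([a, b]):
+ ... print(i, batch[0].shape) # loader index and batch shape, in random order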
307
442
  """
308
443
 
309
444
  def __init__(self, train_loaders):
445
+ """
446
+ Initialize the CombineLoaders instance.
447
+
448
+ Converts each data loader into an iterator for independent traversal.
449
+
450
+ Args:
451
+ train_loaders (list): List of torch.utils.data.DataLoader instances.
452
+ """
310
453
  self.train_loaders = train_loaders
311
454
  self.loader_iters = [iter(loader) for loader in train_loaders]
312
455
 
313
456
  def __iter__(self):
457
+ """
458
+ Return the iterator object (self).
459
+
460
+ Returns:
461
+ CombineLoaders: The iterator object itself.
462
+ """
314
463
  return self
315
464
 
316
465
  def __next__(self):
466
+ """
467
+ Return the next batch from the available data loaders.
468
+
469
+ Data loaders are shuffled at each step to randomize the batch source.
470
+ If a data loader is exhausted, it is removed from the pool.
471
+
472
+ Returns:
473
+ tuple: A tuple (i, batch) where i is the index of the originating loader,
474
+ and batch is the next batch of data from that loader.
475
+
476
+ Raises:
477
+ StopIteration: When all loaders have been exhausted.
478
+ """
317
479
  while self.loader_iters:
318
480
  random.shuffle(self.loader_iters)
319
481
  for i, loader_iter in enumerate(self.loader_iters):
@@ -329,14 +491,26 @@ class CombineLoaders:
329
491
 
330
492
  class CombinedDataset(Dataset):
331
493
  """
332
- A dataset that combines multiple datasets into one.
494
+ A dataset that combines multiple datasets into a single indexable dataset.
495
+
496
+ This class supports optional shuffling across datasets and presents
497
+ a unified indexing interface for training or evaluation.
333
498
 
334
499
  Args:
335
- datasets (list): A list of datasets to be combined.
336
- shuffle (bool, optional): Whether to shuffle the combined dataset. Defaults to True.
500
+ datasets (list): A list of PyTorch Dataset objects to combine.
501
+ shuffle (bool, optional): Whether to shuffle the indices for data access. Defaults to True.
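+
+ Example (illustrative; assumes two small in-memory datasets):
+ >>> import torch
+ >>> from torch.utils.data import TensorDataset
+ >>> ds = CombinedDataset([TensorDataset(torch.zeros(3, 2)), TensorDataset(torch.ones(5, 2))])
+ >>> len(ds)
+ 8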
337
502
  """
338
503
 
339
504
  def __init__(self, datasets, shuffle=True):
505
+ """
506
+ Initialize the CombinedDataset.
507
+
508
+ Computes lengths of each dataset and optionally shuffles the access indices.
509
+
510
+ Args:
511
+ datasets (list): A list of datasets to be combined.
512
+ shuffle (bool, optional): Whether to shuffle the combined dataset. Defaults to True.
513
+ """
340
514
  self.datasets = datasets
341
515
  self.lengths = [len(dataset) for dataset in datasets]
342
516
  self.total_length = sum(self.lengths)
@@ -347,6 +521,17 @@ class CombinedDataset(Dataset):
347
521
  else:
348
522
  self.indices = None
349
523
  def __getitem__(self, index):
524
+ """
525
+ Retrieve an item from the combined dataset.
526
+
527
+ The method accounts for shuffling and maps the index to the appropriate dataset.
528
+
529
+ Args:
530
+ index (int): Index of the item in the combined dataset.
531
+
532
+ Returns:
533
+ Any: The item retrieved from the corresponding sub-dataset.
534
+ """
350
535
  if self.shuffle:
351
536
  index = self.indices[index]
352
537
  for dataset, length in zip(self.datasets, self.lengths):
@@ -354,6 +539,12 @@ class CombinedDataset(Dataset):
354
539
  return dataset[index]
355
540
  index -= length
356
541
  def __len__(self):
542
+ """
543
+ Return the total length of the combined dataset.
544
+
545
+ Returns:
546
+ int: Total number of items across all datasets.
547
+ """
357
548
  return self.total_length
358
549
 
359
550
  class NoClassDataset(Dataset):
@@ -431,9 +622,38 @@ class NoClassDataset(Dataset):
431
622
  img = ToTensor()(img)
432
623
  return img, self.filenames[index]
433
624
 
434
-
435
625
  class spacrDataset(Dataset):
626
+ """
627
+ Custom PyTorch Dataset for loading labeled image data organized by class folders or from specified file lists.
628
+
629
+ This dataset supports loading images either from directory structures organized by class or from explicit
630
+ file and label lists. It supports optional preloading of all images into memory for faster access.
631
+
632
+ Args:
633
+ data_dir (str): Root directory containing subfolders for each class.
634
+ loader_classes (list[str]): List of class names corresponding to subfolder names in `data_dir`.
635
+ transform (callable, optional): Transform to apply to images (e.g., torchvision transforms).
636
+ shuffle (bool): Whether to shuffle the dataset. Default is True.
637
+ pin_memory (bool): If True, pre-load all images into memory using multiprocessing. Default is False.
638
+ specific_files (list[str], optional): Specific image file paths to load instead of scanning `data_dir`.
639
+ specific_labels (list[int], optional): Corresponding labels for `specific_files`.
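+
+ Example (illustrative; the directory layout and class names are hypothetical):
+ >>> dataset = spacrDataset("data/train", ["nc", "pc"], transform=None, pin_memory=False)
+ >>> img, label, filename = dataset[0]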
640
+ """
436
641
  def __init__(self, data_dir, loader_classes, transform=None, shuffle=True, pin_memory=False, specific_files=None, specific_labels=None):
642
+ """
643
+ Initialize the spacrDataset.
644
+
645
+ Constructs the dataset either by scanning the data directory or using provided file paths and labels.
646
+ Optionally shuffles and preloads images into memory.
647
+
648
+ Args:
649
+ data_dir (str): Directory containing class subfolders.
650
+ loader_classes (list): List of class names.
651
+ transform (callable, optional): Transform function to apply to images.
652
+ shuffle (bool): Whether to shuffle the dataset. Default is True.
653
+ pin_memory (bool): Whether to preload images into memory. Default is False.
654
+ specific_files (list[str], optional): List of file paths to use directly.
655
+ specific_labels (list[int], optional): List of labels corresponding to specific files.
656
+ """
437
657
  self.data_dir = data_dir
438
658
  self.classes = loader_classes
439
659
  self.transform = transform
@@ -463,23 +683,59 @@ class spacrDataset(Dataset):
463
683
  self.images = None
464
684
 
465
685
  def load_image(self, img_path):
686
+ """
687
+ Load and return a single image with orientation correction.
688
+
689
+ Args:
690
+ img_path (str): Path to the image file.
691
+
692
+ Returns:
693
+ PIL.Image: Loaded RGB image.
694
+ """
466
695
  img = Image.open(img_path).convert('RGB')
467
696
  img = ImageOps.exif_transpose(img) # Handle image orientation
468
697
  return img
469
698
 
470
699
  def __len__(self):
700
+ """
701
+ Return the number of samples in the dataset.
702
+
703
+ Returns:
704
+ int: Total number of images.
705
+ """
471
706
  return len(self.filenames)
472
707
 
473
708
  def shuffle_dataset(self):
709
+ """
710
+ Shuffle the dataset filenames and labels in unison.
711
+ """
474
712
  combined = list(zip(self.filenames, self.labels))
475
713
  random.shuffle(combined)
476
714
  self.filenames, self.labels = zip(*combined)
477
715
 
478
716
  def get_plate(self, filepath):
717
+ """
718
+ Extract the plate identifier from the filename.
719
+
720
+ Args:
721
+ filepath (str): Full path to the file.
722
+
723
+ Returns:
724
+ str: Plate ID extracted from the filename.
725
+ """
479
726
  filename = os.path.basename(filepath)
480
727
  return filename.split('_')[0]
481
728
 
482
729
  def __getitem__(self, index):
730
+ """
731
+ Retrieve an image, its label, and the filename.
732
+
733
+ Args:
734
+ index (int): Index of the image to retrieve.
735
+
736
+ Returns:
737
+ tuple: (image, label, filename)
738
+ """
483
739
  if self.pin_memory:
484
740
  img = self.images[index]
485
741
  else:
@@ -491,7 +747,29 @@ class spacrDataset(Dataset):
491
747
  return img, label, filename
492
748
 
493
749
  class spacrDataLoader(DataLoader):
750
+ """
751
+ Custom DataLoader with background batch preloading support using multiprocessing.
752
+
753
+ This class extends `torch.utils.data.DataLoader` and adds asynchronous background
754
+ preloading of a specified number of batches using a separate process or in-place loading
755
+ if `pin_memory=True`.
756
+
757
+ Args:
758
+ *args: Arguments passed to the base DataLoader.
759
+ preload_batches (int): Number of batches to preload in a background process. Default is 1.
760
+ **kwargs: Keyword arguments passed to the base DataLoader. Supports all standard DataLoader arguments.
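+
+ Example (illustrative; assumes `dataset` is an existing spacrDataset):
+ >>> loader = spacrDataLoader(dataset, batch_size=32, shuffle=True, preload_batches=2)
+ >>> images, labels, filenames = next(iter(loader))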
761
+ """
494
762
  def __init__(self, *args, preload_batches=1, **kwargs):
763
+ """
764
+ Initialize the spacrDataLoader.
765
+
766
+ Sets up the queue and multiprocessing process for background preloading of batches.
767
+
768
+ Args:
769
+ *args: Arguments passed to torch.utils.data.DataLoader.
770
+ preload_batches (int): Number of batches to preload. Default is 1.
771
+ **kwargs: Keyword arguments passed to the base DataLoader.
772
+ """
495
773
  super().__init__(*args, **kwargs)
496
774
  self.preload_batches = preload_batches
497
775
  self.batch_queue = Queue(maxsize=preload_batches)
@@ -502,6 +780,12 @@ class spacrDataLoader(DataLoader):
502
780
  atexit.register(self.cleanup)
503
781
 
504
782
  def _preload_next_batches(self):
783
+ """
784
+ Internal method to fetch the next N batches and put them in the queue.
785
+
786
+ If `pin_memory` is True, batches are pinned to CUDA memory.
787
+ Stops if the iterator is exhausted or the stop event is set.
788
+ """
505
789
  try:
506
790
  for _ in range(self.preload_batches):
507
791
  if self._stop_event:
@@ -514,6 +798,11 @@ class spacrDataLoader(DataLoader):
514
798
  pass
515
799
 
516
800
  def _start_preloading(self):
801
+ """
802
+ Start a new background process to preload batches.
803
+
804
+ If `pin_memory` is True, loading is done in the main thread instead.
805
+ """
517
806
  if self.process is None or not self.process.is_alive():
518
807
  self._iterator = iter(super().__iter__())
519
808
  if not self.pin_memory:
@@ -523,6 +812,15 @@ class spacrDataLoader(DataLoader):
523
812
  self._preload_next_batches() # Directly load if pin_memory is True
524
813
 
525
814
  def _pin_memory_batch(self, batch):
815
+ """
816
+ Recursively pin memory for all tensors in the batch.
817
+
818
+ Args:
819
+ batch: A batch of data, possibly a tuple, list, or tensor.
820
+
821
+ Returns:
822
+ The batch with pinned memory (if applicable).
823
+ """
526
824
  if isinstance(batch, (list, tuple)):
527
825
  return [b.pin_memory() if isinstance(b, torch.Tensor) else b for b in batch]
528
826
  elif isinstance(batch, torch.Tensor):
@@ -531,10 +829,24 @@ class spacrDataLoader(DataLoader):
531
829
  return batch
532
830
 
533
831
  def __iter__(self):
832
+ """
833
+ Return the iterator and initiate background preloading.
834
+
835
+ Returns:
836
+ self
837
+ """
534
838
  self._start_preloading()
535
839
  return self
536
840
 
537
841
  def __next__(self):
842
+ """
843
+ Return the next batch from the queue.
844
+
845
+ If the queue is empty and the process has exited, raises StopIteration.
846
+
847
+ Returns:
848
+ The next batch of data.
849
+ """
538
850
  if self.process and not self.process.is_alive() and self.batch_queue.empty():
539
851
  raise StopIteration
540
852
 
@@ -554,51 +866,45 @@ class spacrDataLoader(DataLoader):
554
866
  raise StopIteration
555
867
 
556
868
  def cleanup(self):
869
+ """
870
+ Cleanup method to terminate background preloading processes.
871
+
872
+ Ensures graceful shutdown of worker processes at exit.
873
+ """
557
874
  self._stop_event = True
558
875
  if self.process and self.process.is_alive():
559
876
  self.process.terminate()
560
877
  self.process.join()
561
878
 
562
879
  def __del__(self):
880
+ """
881
+ Destructor to ensure cleanup is called when the object is deleted.
882
+ """
563
883
  self.cleanup()
564
884
 
565
- class NoClassDataset_v1(Dataset):
566
- def __init__(self, data_dir, transform=None, shuffle=True, load_to_memory=False):
567
- self.data_dir = data_dir
568
- self.transform = transform
569
- self.shuffle = shuffle
570
- self.load_to_memory = load_to_memory
571
- self.filenames = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, f))]
572
- if self.shuffle:
573
- self.shuffle_dataset()
574
- if self.load_to_memory:
575
- self.images = [self.load_image(f) for f in self.filenames]
576
-
577
- def load_image(self, img_path):
578
- img = Image.open(img_path).convert('RGB')
579
- return img
580
-
581
- def __len__(self):
885
+ class TarImageDataset(Dataset):
886
+ """
887
+ A PyTorch Dataset for loading images directly from a .tar archive without extraction.
582
888
 
583
- return len(self.filenames)
889
+ This is useful for large datasets stored as compressed tar archives, enabling on-the-fly
890
+ access to individual image files without unpacking the archive to disk.
584
891
 
585
- def shuffle_dataset(self):
586
- if self.shuffle:
587
- random.shuffle(self.filenames)
892
+ Args:
893
+ tar_path (str): Path to the .tar archive containing image files.
894
+ transform (callable, optional): Optional transform to be applied on a sample.
588
895
 
589
- def __getitem__(self, index):
590
- if self.load_to_memory:
591
- img = self.images[index]
592
- else:
593
- img = self.load_image(self.filenames[index])
594
- if self.transform is not None:
595
- img = self.transform(img)
596
- else:
597
- img = ToTensor()(img)
598
- return img, self.filenames[index]
896
+ Attributes:
897
+ members (List[TarInfo]): List of image members in the tar archive.
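+
+ Example (illustrative; the archive path is hypothetical):
+ >>> dataset = TarImageDataset("images.tar")
+ >>> img, member_name = dataset[0]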
898
+ """
599
899
 
600
- class TarImageDataset(Dataset):
601
900
  def __init__(self, tar_path, transform=None):
901
+ """
902
+ Initialize the dataset and index image members from the tar archive.
903
+
904
+ Args:
905
+ tar_path (str): Path to the .tar file.
906
+ transform (callable, optional): Transform function to apply to each image.
907
+ """
602
908
  self.tar_path = tar_path
603
909
  self.transform = transform
604
910
 
@@ -607,9 +913,24 @@ class TarImageDataset(Dataset):
607
913
  self.members = [m for m in f.getmembers() if m.isfile()]
608
914
 
609
915
  def __len__(self):
916
+ """
917
+ Return the number of image files in the archive.
918
+
919
+ Returns:
920
+ int: Number of image files.
921
+ """
610
922
  return len(self.members)
611
923
 
612
924
  def __getitem__(self, idx):
925
+ """
926
+ Retrieve an image by index directly from the tar archive.
927
+
928
+ Args:
929
+ idx (int): Index of the image to retrieve.
930
+
931
+ Returns:
932
+ tuple: (PIL.Image.Image or transformed image, str) where the string is the file name.
933
+ """
613
934
  with tarfile.open(self.tar_path, 'r') as f:
614
935
  m = self.members[idx]
615
936
  img_file = f.extractfile(m)
@@ -621,8 +942,24 @@ class TarImageDataset(Dataset):
621
942
  return img, m.name
622
943
 
623
944
  def load_images_from_paths(images_by_key):
624
- images_dict = {}
945
+ """
946
+ Load images from a dictionary mapping keys to lists of image file paths.
625
947
 
948
+ Each key in the input dictionary corresponds to a list of file paths. The function
949
+ loads each image as a NumPy array and returns a new dictionary with the same keys,
950
+ where each value is a list of loaded images.
951
+
952
+ Args:
953
+ images_by_key (dict): A dictionary where each key maps to a list of image file paths (str).
954
+
955
+ Returns:
956
+ dict: A dictionary where each key maps to a list of NumPy arrays representing the loaded images.
957
+
958
+ Notes:
959
+ - Images are loaded using PIL and converted to NumPy arrays.
960
+ - Any image that fails to load will be skipped, and an error message will be printed.
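+
+ Example (illustrative; keys and paths are hypothetical):
+ >>> images_dict = load_images_from_paths({"plate1_A01": ["plate1_A01_C1.png", "plate1_A01_C2.png"]})
+ >>> len(images_dict["plate1_A01"])
+ 2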
961
+ """
962
+ images_dict = {}
626
963
  for key, paths in images_by_key.items():
627
964
  images_dict[key] = []
628
965
  for path in paths:
@@ -796,7 +1133,33 @@ def _generate_time_lists(file_list):
796
1133
  return sorted_file_lists
797
1134
 
798
1135
  def _move_to_chan_folder(src, regex, timelapse=False, metadata_type=''):
799
-
1136
+ """
1137
+ Organize image files in a source directory into channel-specific subfolders
1138
+ based on metadata extracted from filenames using a regular expression.
1139
+
1140
+ This function assumes filenames contain fields like plate ID, well ID, field ID,
1141
+ channel ID, and time point. It parses these from the filename using the provided
1142
+ regex, reformats the filename, and moves the file into a subdirectory named after
1143
+ the channel ID.
1144
+
1145
+ Args:
1146
+ src (str or Path): Path to the source directory containing image files.
1147
+ regex (str): Regular expression to extract metadata from filenames.
1148
+ Expected named groups: plateID, wellID, fieldID, chanID, timeID.
1149
+ timelapse (bool, optional): Whether to include the timeID in the new filename. Defaults to False.
1150
+ metadata_type (str, optional): Special handling for specific metadata types.
1151
+ If 'cq1', converts wellID to CQ1 format. Defaults to ''.
1152
+
1153
+ Notes:
1154
+ - Only `.tif` and `.png` files are processed.
1155
+ - Files are copied into folders named after their channel ID.
1156
+ - A backup of the original files is moved to a new `orig/` folder.
1157
+ - Skips files that do not match the regex or are missing required groups.
1158
+ - Issues warnings if destination files already exist.
1159
+
1160
+ Returns:
1161
+ None
1162
+ """
800
1163
  from .utils import _safe_int_convert, _convert_cq1_well_id
801
1164
 
802
1165
  src_path = src
@@ -860,9 +1223,28 @@ def _move_to_chan_folder(src, regex, timelapse=False, metadata_type=''):
860
1223
 
861
1224
  def _merge_channels(src, plot=False):
862
1225
  """
863
- Merge the channels in the given source directory and save the merged files in a 'stack' directory without using multiprocessing.
864
- """
1226
+ Merge single-channel image files from multiple folders into multi-channel NumPy arrays.
1227
+
1228
+ This function assumes the source directory `src` contains subdirectories named as channel
1229
+ identifiers (e.g., '0', '01', ..., '100'), each holding single-channel image files
1230
+ with identical filenames. It merges images with the same name across these folders
1231
+ into a single multi-channel `.npy` file stored in the `stack/` subdirectory.
865
1232
 
1233
+ Args:
1234
+ src (str or Path): Path to the parent directory containing channel subfolders.
1235
+ plot (bool, optional): If True, plot the merged arrays after processing using `plot_arrays`. Defaults to False.
1236
+
1237
+ Returns:
1238
+ int: The number of matching channel folders that were merged.
1239
+
1240
+ Notes:
1241
+ - Only processes if `stack/` directory is empty.
1242
+ - Output is saved as `.npy` files in `src/stack/`.
1243
+ - Channel folders must be named as integers or zero-padded strings from '0' to '100'.
1244
+ - Files are matched by filename across all channel folders.
1245
+ - Skips if a file is not present in all channels or is not a file.
1246
+ - Uses `_merge_file` to perform the merging operation.
1247
+ """
866
1248
  from .plot import plot_arrays
867
1249
  from .utils import print_progress
868
1250
 
@@ -1131,9 +1513,6 @@ def _normalize_img_batch(stack, channels, save_dtype, settings):
1131
1513
  return normalized_stack.astype(save_dtype)
1132
1514
 
1133
1515
  def concatenate_and_normalize(src, channels, save_dtype=np.float32, settings={}):
1134
- from .utils import print_progress
1135
- from .plot import plot_arrays
1136
-
1137
1516
  """
1138
1517
  Concatenates and normalizes channel data from multiple files and saves the normalized data.
1139
1518
 
@@ -1153,7 +1532,9 @@ def concatenate_and_normalize(src, channels, save_dtype=np.float32, settings={})
1153
1532
  Returns:
1154
1533
  str: The directory path where the concatenated and normalized channel data is saved.
1155
1534
  """
1156
-
1535
+ from .utils import print_progress
1536
+ from .plot import plot_arrays
1537
+
1157
1538
  channels = [item for item in channels if item is not None]
1158
1539
 
1159
1540
  print(f"Generating concatenated and normalized channel data for channels: {channels}")
@@ -1528,12 +1909,7 @@ def delete_empty_subdirectories(folder_path):
1528
1909
  #print(f"Skipping non-empty directory: {full_dir_path}")
1529
1910
 
1530
1911
  #@log_function_call
1531
- def preprocess_img_data(settings):
1532
-
1533
- from .plot import plot_arrays
1534
- from .utils import _run_test_mode, _get_regex
1535
- from .settings import set_default_settings_preprocess_img_data
1536
-
1912
+ def preprocess_img_data(settings):
1537
1913
  """
1538
1914
  Preprocesses image data by converting z-stack images to maximum intensity projection (MIP) images.
1539
1915
 
@@ -1560,6 +1936,11 @@ def preprocess_img_data(settings):
1560
1936
  Returns:
1561
1937
  None
1562
1938
  """
1939
+
1940
+ from .plot import plot_arrays
1941
+ from .utils import _run_test_mode, _get_regex
1942
+ from .settings import set_default_settings_preprocess_img_data
1943
+
1563
1944
  src = settings['src']
1564
1945
 
1565
1946
  if len(os.listdir(src)) < 100:
@@ -2159,6 +2540,32 @@ def _results_to_csv(src, df, df_well):
2159
2540
  return cells, wells
2160
2541
 
2161
2542
- def read_plot_model_stats(train_file_path, val_file_path ,save=False):
2543
+ def read_plot_model_stats(train_file_path, val_file_path, save=False):
2544
+ """
2545
+ Reads training and validation statistics from CSV files, generates plots for various metrics,
2546
+ and optionally saves the plots as PDF files.
2547
+ Args:
2548
+ train_file_path (str): Path to the CSV file containing training statistics.
2549
+ val_file_path (str): Path to the CSV file containing validation statistics.
2550
+ save (bool, optional): If True, saves the plots as PDF files in the same directory as
2551
+ the training file. If False, displays the plots interactively. Defaults to False.
2552
+ Metrics Plotted:
2553
+ - accuracy
2554
+ - neg_accuracy
2555
+ - pos_accuracy
2556
+ - loss
2557
+ - prauc
2558
+ - optimal_threshold
2559
+ Notes:
2560
+ - The CSV files should have a column named 'epoch' and columns corresponding to the
2561
+ metrics listed above.
2562
+ - The plots are saved with filenames corresponding to the metric name (e.g., 'accuracy.pdf').
2563
+ Raises:
2564
+ FileNotFoundError: If the specified CSV files do not exist.
2565
+ ValueError: If the CSV files do not contain the required columns.
2566
+ Example:
2567
+ >>> read_plot_model_stats("train_stats.csv", "val_stats.csv", save=True)
2568
+ """
2162
2569
 
2163
2570
  def _plot_and_save(train_df, val_df, column='accuracy', save=False, path=None, dpi=600):
2164
2571
 
@@ -2295,6 +2702,30 @@ def _save_progress(dst, train_df, validation_df):
2295
2702
  return
2296
2703
 
2297
2704
  def _copy_missclassified(df):
2705
+ """
2706
+ Copies misclassified images to designated folders based on their classification.
2707
+
2708
+ This function identifies rows in the given DataFrame where the 'true_label'
2709
+ does not match the 'predicted_label'. It then copies the corresponding files
2710
+ to a "missclassified" directory, organizing them into subdirectories
2711
+ ("pc" or "nc") based on the presence of "pc" in the original file path.
2712
+
2713
+ Args:
2714
+ df (pandas.DataFrame): A DataFrame containing at least the following columns:
2715
+ - 'filename': The file path of the image.
2716
+ - 'true_label': The actual label of the image.
2717
+ - 'predicted_label': The predicted label of the image.
2718
+
2719
+ Side Effects:
2720
+ - Creates directories for storing misclassified images if they do not exist.
2721
+ - Copies files from their original locations to the appropriate "missclassified" subdirectory.
2722
+
2723
+ Prints:
2724
+ A message indicating the number of misclassified images copied.
2725
+
2726
+ Returns:
2727
+ None
2728
+ """
2298
2729
  misclassified = df[df['true_label'] != df['predicted_label']]
2299
2730
  for _, row in misclassified.iterrows():
2300
2731
  original_path = row['filename']
@@ -2310,6 +2741,20 @@ def _copy_missclassified(df):
2310
2741
  return
2311
2742
 
2312
2743
  def _read_db(db_loc, tables):
2744
+ """
2745
+ Reads data from specified tables in a SQLite database and applies metadata corrections.
2746
+ Args:
2747
+ db_loc (str): The file path to the SQLite database.
2748
+ tables (list of str): A list of table names to read from the database.
2749
+ Returns:
2750
+ list of pandas.DataFrame: A list of DataFrames, each containing the data from one of the specified tables.
2751
+ Notes:
2752
+ - The function assumes the presence of utility functions `rename_columns_in_db` and `correct_metadata`
2753
+ in the `utils` module.
2754
+ - `rename_columns_in_db` is called to preprocess the database before reading.
2755
+ - `correct_metadata` is applied to each DataFrame after reading.
2756
+ - The database connection is closed after all tables are read.
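+
+ Example (illustrative; the database path is hypothetical):
+ >>> dfs = _read_db("measurements.db", tables=["cell", "nucleus"])
+ >>> cell_df, nucleus_df = dfs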
2757
+ """
2313
2758
 
2314
2759
  from .utils import rename_columns_in_db, correct_metadata
2315
2760
 
@@ -2325,6 +2770,31 @@ def _read_db(db_loc, tables):
2325
2770
  return dfs
2326
2771
 
2327
2772
  def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=10, pathogen_limit=10, change_plate=False):
2773
+ """
2774
+ Reads and merges data from multiple locations and tables, processes the data, and returns a merged DataFrame
2775
+ along with a list of object-specific DataFrames.
2776
+ Args:
2777
+ locs (list): List of file paths or locations containing the data to be read.
2778
+ tables (list): List of table names to be extracted and processed.
2779
+ verbose (bool, optional): If True, prints detailed information about the processing steps. Defaults to False.
2780
+ nuclei_limit (int or bool, optional): Limit on the number of nuclei per cell. If False, only single nuclei
2781
+ per cell are retained. Defaults to 10.
2782
+ pathogen_limit (int, float, or bool, optional): Limit on the number of pathogens per cell. If False, only
2783
+ single pathogens per cell are retained. Defaults to 10.
2784
+ change_plate (bool, optional): If True, assigns unique plate IDs to each location. Defaults to False.
2785
+ Returns:
2786
+ tuple:
2787
+ - pd.DataFrame: A merged DataFrame containing processed data from all specified tables.
2788
+ - list: A list of DataFrames for individual object types (e.g., cell, cytoplasm, nucleus, pathogen)
2789
+ if they exist in the input data.
2790
+ Notes:
2791
+ - The function processes data from multiple tables such as 'cell', 'cytoplasm', 'nucleus', 'pathogen',
2792
+ and 'png_list', if available.
2793
+ - Data is grouped and merged based on unique identifiers such as 'prcfo' (plate, row, column, field, object).
2794
+ - Metadata is generated and merged with the final DataFrame.
2795
+ - The function handles missing data and applies limits on nuclei and pathogens per cell if specified.
2796
+ - Verbose mode provides detailed logs of the processing steps and the resulting data dimensions.
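+
+ Example (illustrative; the database location is hypothetical):
+ >>> merged_df, obj_dfs = _read_and_merge_data(["plate1/measurements/measurements.db"],
+ ... tables=["cell", "nucleus"], verbose=True)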
2797
+ """
2328
2798
 
2329
2799
  from .utils import _split_data
2330
2800
 
@@ -2459,6 +2929,15 @@ def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=10, pathogen_
2459
2929
  return merged_df, obj_df_ls
2460
2930
 
2461
2931
  def _read_mask(mask_path):
2932
+ """
2933
+ Reads a mask image from the specified file path and ensures it is of type uint16.
2934
+
2935
+ Parameters:
2936
+ mask_path (str): The file path to the mask image.
2937
+
2938
+ Returns:
2939
+ numpy.ndarray: The mask image as a NumPy array with dtype uint16.
2940
+ """
2462
2941
  mask = imageio2.imread(mask_path)
2463
2942
  if mask.dtype != np.uint16:
2464
2943
  mask = img_as_uint(mask)
@@ -2466,10 +2945,24 @@ def _read_mask(mask_path):
2466
2945
 
2467
2946
  def convert_numpy_to_tiff(folder_path, limit=None):
2468
2947
  """
2469
- Converts all numpy files in a folder to TIFF format and saves them in a subdirectory 'tiff'.
2470
-
2471
- Args:
2472
- folder_path (str): The path to the folder containing numpy files.
2948
+ Converts all .npy files in a folder to .tiff images and saves them in a 'tiff' subdirectory.
2949
+
2950
+ This function searches for `.npy` files in the specified folder, loads each as a NumPy array,
2951
+ and writes it as a `.tiff` image using `tifffile.imwrite`. The resulting images are saved in
2952
+ a `tiff` subdirectory within the input folder. Optionally, processing can be limited to a
2953
+ specific number of files.
2954
+
2955
+ Parameters
2956
+ ----------
2957
+ folder_path : str
2958
+ The path to the directory containing `.npy` files to be converted.
2959
+ limit : int, optional
2960
+ Maximum number of `.npy` files to convert. If None (default), all `.npy` files are converted.
2961
+
2962
+ Returns
2963
+ -------
2964
+ None
2965
+ The function saves the converted TIFF files to disk and prints status messages.
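+
+ Example
+ -------
+ Illustrative; the folder path is hypothetical:
+
+ >>> convert_numpy_to_tiff("/data/stacks", limit=10)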
2473
2966
  """
2474
2967
  # Create the subdirectory 'tiff' within the specified folder if it doesn't already exist
2475
2968
  tiff_subdir = os.path.join(folder_path, 'tiff')
@@ -2502,6 +2995,27 @@ def convert_numpy_to_tiff(folder_path, limit=None):
2502
2995
  return
2503
2996
 
2504
2997
  def generate_cellpose_train_test(src, test_split=0.1):
2998
+ """
2999
+ Splits a directory of TIFF images and corresponding Cellpose masks into training and test sets.
3000
+
3001
+ This function searches the `src` directory for TIFF images and ensures that corresponding
3002
+ masks exist in the `src/masks/` folder. It then shuffles and splits the dataset into
3003
+ training and test sets based on the specified `test_split` ratio. The resulting subsets
3004
+ are copied into `train/` and `test/` folders (with `masks/` subfolders) located
3005
+ in the parent directory of `src`.
3006
+
3007
+ Parameters
3008
+ ----------
3009
+ src : str
3010
+ Path to the directory containing TIFF images and a subdirectory `masks/` with corresponding mask files.
3011
+ test_split : float, optional
3012
+ Proportion of the dataset to be used for testing (default is 0.1, i.e., 10%).
3013
+
3014
+ Returns
3015
+ -------
3016
+ None
3017
+ Files are copied to disk and progress messages are printed.
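+
+ Example
+ -------
+ Illustrative; the source path is hypothetical:
+
+ >>> generate_cellpose_train_test("/data/annotated", test_split=0.2)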
3018
+ """
2505
3019
  mask_src = os.path.join(src, 'masks')
2506
3020
  img_paths = glob.glob(os.path.join(src, '*.tif'))
2507
3021
  img_filenames = [os.path.basename(file) for file in img_paths]
@@ -2575,7 +3089,28 @@ def parse_gz_files(folder_path):
2575
3089
  return samples_dict
2576
3090
 
2577
3091
  def generate_dataset(settings={}):
3092
+ """
3093
+ Generates a tar archive containing a dataset of images collected from one or more database sources.
3094
+
3095
+ This function selects image paths from one or more SQLite databases, optionally samples from them,
3096
+ and writes the images into a tar archive using multiprocessing to parallelize the process. Temporary tar
3097
+ files are created and merged into a final tar file. The function also logs and saves dataset settings.
3098
+
3099
+ Parameters
3100
+ ----------
3101
+ settings : dict, optional
3102
+ Dictionary of user-defined settings. The following keys are used:
3103
+
3104
+ - 'src' (str or list of str): Path(s) to the source folder(s) containing the database(s).
3105
+ - 'experiment' (str): Name of the experiment, used to name the output tar.
3106
+ - 'sample' (int or list, optional): If int, randomly sample that many images; if list, sample per src index.
3107
+ - 'file_metadata' (str, optional): Metadata column name used to filter/select files.
2578
3108
 
3109
+ Returns
3110
+ -------
3111
+ str
3112
+ Path to the final tar archive containing the dataset.
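+
+ Example
+ -------
+ Illustrative; the settings below are hypothetical:
+
+ >>> tar_path = generate_dataset({'src': '/data/plate1', 'experiment': 'screen1', 'sample': 1000})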
3113
+ """
2579
3114
  from .utils import initiate_counter, add_images_to_tar, save_settings, generate_path_list_from_db, correct_paths
2580
3115
  from .settings import set_generate_dataset_defaults
2581
3116
 
@@ -2792,7 +3327,35 @@ def generate_loaders(src, mode='train', image_size=224, batch_size=32, classes=[
2792
3327
  return train_loaders, val_loaders, train_fig
2793
3328
 
2794
3329
  def generate_training_dataset(settings):
2795
-
3330
+ """
3331
+ Generate a training dataset from a SQLite database using measurement-based or annotation/metadata-based selection.
3332
+
3333
+ Depending on the `settings`, this function selects images corresponding to high and low phenotypes (e.g., recruitment)
3334
+ or based on metadata or manual annotation. Selected image paths are grouped by class and returned for further use
3335
+ (e.g., saving to folders or training models).
3336
+
3337
+ Parameters
3338
+ ----------
3339
+ settings : dict
3340
+ Configuration dictionary with the following required keys:
3341
+
3342
+ - 'class_metadata' (list of str): Metadata conditions to define classes (e.g., treatment names).
3343
+ - 'channel_of_interest' (int): Channel index used for computing recruitment scores.
3344
+ - 'png_type' (str): Type of PNG to retrieve ('raw', 'outline', etc.).
3345
+ - 'size' (int): Number of images to sample per class.
3346
+ - 'nuclei_limit' (int or None): Minimum nucleus size for filtering (used in _read_and_merge_data).
3347
+ - 'pathogen_limit' (int or None): Minimum pathogen size for filtering.
3348
+ - 'custom_measurement' (list of str or None): If provided, defines custom numerator and denominator columns.
3349
+ - 'classes' (list of str): Treatments to annotate using `annotate_conditions`.
3350
+ - 'metadata_type_by' (str): Column in the DB to use for metadata classification ('columnID' or 'rowID').
3351
+ - 'tables' (list of str): Tables to extract from database, e.g., ['cell', 'nucleus'].
3352
+ - 'dataset_mode' (str): Either 'annotation' or 'metadata'. Controls how class sizes are determined.
3353
+
3354
+ Returns
3355
+ -------
3356
+ list of list of str
3357
+ A list where each sublist contains paths to PNGs belonging to one class (e.g., low vs high recruitment).
3358
+ """
2796
3359
  # Function to filter png_list_df by prcfo present in df without merging
2797
3360
  def filter_png_list(db_path, settings, tables = ['cell', 'nucleus', 'pathogen', 'cytoplasm']):
2798
3361
  df, _ = _read_and_merge_data(locs=[db_path],
@@ -2975,6 +3538,41 @@ def generate_training_dataset(settings):
2975
3538
  return train_class_dir, test_class_dir
2976
3539
 
2977
3540
  def training_dataset_from_annotation(db_path, dst, annotation_column='test', annotated_classes=(1, 2)):
3541
+ """
3542
+ Extracts image paths from a database and groups them into class-based lists based on annotation values.
3543
+
3544
+ This function reads from a SQLite database (`png_list` table), extracts image paths and corresponding
3545
+ class annotations, and groups them by the specified `annotated_classes`. If only one class is provided,
3546
+ it automatically generates a second class by sampling the remaining entries not in the target class
3547
+ to create a balanced binary dataset.
3548
+
3549
+ Parameters
3550
+ ----------
3551
+ db_path : str
3552
+ Path to the SQLite database file containing the `png_list` table.
3553
+
3554
+ dst : str
3555
+ Output path (currently unused in the function, included for compatibility with caller).
3556
+
3557
+ annotation_column : str, default='test'
3558
+ Column name in the `png_list` table that contains class annotations.
3559
+
3560
+ annotated_classes : tuple of int, default=(1, 2)
3561
+ Class labels to extract from the annotation column.
3562
+
3563
+ Returns
3564
+ -------
3565
+ class_paths : list of list of str
3566
+ A list where each sublist contains the file paths for images belonging to one class.
3567
+ The number of sublists equals the number of unique classes returned.
3568
+
3569
+ Notes
3570
+ -----
3571
+ - If only one annotated class is provided, the function creates a balanced second class
3572
+ from non-annotated images.
3573
+ - This function does not copy or move any files; it only collects and returns path lists.
3574
+ - All path and annotation data is assumed to be stored in the `png_list` table of the SQLite DB.
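+
+ Example
+ -------
+ Illustrative; the database path is hypothetical:
+
+ >>> class_paths = training_dataset_from_annotation("measurements.db", "out",
+ ... annotation_column="test", annotated_classes=(1, 2))
+ >>> len(class_paths)
+ 2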
3575
+ """
2978
3576
  all_paths = []
2979
3577
 
2980
3578
  # Connect to the database and retrieve the image paths and annotations
@@ -3030,6 +3628,51 @@ def training_dataset_from_annotation(db_path, dst, annotation_column='test', ann
3030
3628
  return class_paths
3031
3629
 
3032
3630
  def training_dataset_from_annotation_metadata(db_path, dst, annotation_column='test', annotated_classes=(1, 2), metadata_type_by='columnID', class_metadata=['c1','c2']):
3631
+ """
3632
+ Extracts annotated image paths from a database, filtered by metadata location (row/column).
3633
+
3634
+ This function reads image paths and annotations from a SQLite database (`png_list` table), filters them
3635
+ by metadata (either `row_name` or `column_name`), and organizes them into class-specific lists based on
3636
+ annotation values. If only one class is specified, the function samples a balanced second class from
3637
+ remaining entries.
3638
+
3639
+ Parameters
3640
+ ----------
3641
+ db_path : str
3642
+ Path to the SQLite database containing the `png_list` table.
3643
+
3644
+ dst : str
3645
+ Output directory (unused in this function but required for compatibility).
3646
+
3647
+ annotation_column : str, default='test'
3648
+ The column name in `png_list` storing the annotation labels.
3649
+
3650
+ annotated_classes : tuple of int, default=(1, 2)
3651
+ Annotation values to be used for splitting data into separate class groups.
3652
+
3653
+ metadata_type_by : str, {'rowID', 'columnID'}, default='columnID'
3654
+ Which metadata field to filter by: either 'rowID' (uses `row_name`) or 'columnID' (uses `column_name`).
3655
+
3656
+ class_metadata : list of str, default=['c1', 'c2']
3657
+ The metadata values to include (e.g., specific row or column identifiers to filter on).
3658
+
3659
+ Returns
3660
+ -------
3661
+ class_paths : list of list of str
3662
+ A list where each sublist contains paths to images in one class.
3663
+
3664
+ Raises
3665
+ ------
3666
+ ValueError
3667
+ If `metadata_type_by` is not one of 'rowID' or 'columnID'.
3668
+
3669
+ Notes
3670
+ -----
3671
+ - If only one class is specified in `annotated_classes`, a second class is constructed by sampling
3672
+ from non-target annotations in the filtered set to ensure balanced class representation.
3673
+ - This function assumes that `png_path`, `annotation_column`, `row_name`, and `column_name` exist
3674
+ in the `png_list` table.
3675
+ """
3033
3676
  all_paths = []
3034
3677
 
3035
3678
  # Connect to the database and retrieve the image paths and annotations
@@ -3099,6 +3742,44 @@ def training_dataset_from_annotation_metadata(db_path, dst, annotation_column='t
3099
3742
  return class_paths
3100
3743
 
3101
3744
  def generate_dataset_from_lists(dst, class_data, classes, test_split=0.1):
3745
+ """
3746
+ Generates a train/test image dataset directory structure from class-wise path lists.
3747
+
3748
+ This function creates `train` and `test` subdirectories under the given destination directory (`dst`)
3749
+ and copies the image files into class-specific folders after performing a train-test split.
3750
+
3751
+ Parameters
3752
+ ----------
3753
+ dst : str
3754
+ Destination directory where the dataset will be created. Subdirectories for each class will be made under `train/` and `test/`.
3755
+
3756
+ class_data : list of list of str
3757
+ A list where each sublist contains paths to image files belonging to a specific class.
3758
+
3759
+ classes : list of str
3760
+ Class names corresponding to the order of `class_data`.
3761
+
3762
+ test_split : float, default=0.1
3763
+ Proportion of data to be used for the test set. The remainder is used for training.
3764
+
3765
+ Returns
3766
+ -------
3767
+ train_dir : str
3768
+ Path to the top-level training directory.
3769
+
3770
+ test_dir : str
3771
+ Path to the top-level test directory.
3772
+
3773
+ Raises
3774
+ ------
3775
+ ValueError
3776
+ If the number of class labels does not match the number of class data lists.
3777
+
3778
+ Notes
3779
+ -----
3780
+ The train/test split is deterministic (random_state=42).
3781
+ File copying is timed and progress is reported via `print_progress`.
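+
+ Example
+ -------
+ Illustrative; assumes `nc_paths` and `pc_paths` are lists of PNG paths:
+
+ >>> train_dir, test_dir = generate_dataset_from_lists("dataset",
+ ... class_data=[nc_paths, pc_paths], classes=["nc", "pc"], test_split=0.1)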
3782
+ """
3102
3783
  from .utils import print_progress
3103
3784
  # Make sure that the length of class_data matches the length of classes
3104
3785
  if len(class_data) != len(classes):
@@ -3146,7 +3827,38 @@ def generate_dataset_from_lists(dst, class_data, classes, test_split=0.1):
3146
3827
  return os.path.join(dst, 'train'), os.path.join(dst, 'test')
3147
3828
 
3148
3829
  def convert_separate_files_to_yokogawa(folder, regex):
3149
-
3830
+ """
3831
+ Converts image files from a folder into Yokogawa-style naming format with optional MIP across Z-slices.
3832
+
3833
+ This function parses filenames using a provided regex, extracts metadata such as well ID, channel, field,
3834
+ timepoint, and slice, and renames the images to the Yokogawa convention:
3835
+ `plateX_WELL_TttttFfffL01Ccc.tif`. If multiple Z-slices exist, it computes a maximum intensity projection (MIP).
3836
+
3837
+ Parameters
3838
+ ----------
3839
+ folder : str
3840
+ Path to the folder containing input TIFF images.
3841
+
3842
+ regex : str
3843
+ Regular expression with named capture groups:
3844
+ - 'plateID' (optional)
3845
+ - 'wellID' (required)
3846
+ - 'fieldID' (optional)
3847
+ - 'timeID' (optional)
3848
+ - 'chanID' (optional)
3849
+ - 'sliceID' (optional)
3850
+
3851
+ Returns
3852
+ -------
3853
+ None
3854
+ Saves renamed TIFF files and a CSV log (`rename_log.csv`) in the same folder.
3855
+
3856
+ Notes
3857
+ -----
3858
+ - Automatically assigns new well names (`plateX_WELL`) if missing or non-standard.
3859
+ - Groups images by region (plate, well, field, time, channel) and performs MIP if multiple slices are present.
3860
+ - Skips files that do not match the regex or are missing required metadata.
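+
+ Example
+ -------
+ Illustrative; the regex is a hypothetical pattern that must match your filenames:
+
+ >>> convert_separate_files_to_yokogawa("/data/raw",
+ ... r"(?P<wellID>[A-P]\d{2})_F(?P<fieldID>\d+)_C(?P<chanID>\d+)\.tif")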
3861
+ """
3150
3862
  ROWS = "ABCDEFGHIJKLMNOP"
3151
3863
  COLS = [f"{i:02d}" for i in range(1, 25)]
3152
3864
  WELLS = [f"{r}{c}" for r in ROWS for c in COLS]
@@ -3236,10 +3948,52 @@ def convert_separate_files_to_yokogawa(folder, regex):
3236
3948
 
3237
3949
  def convert_to_yokogawa(folder):
3238
3950
  """
3239
- Detects file type in the folder and converts them
3240
- to Yokogawa-style naming with Maximum Intensity Projection (MIP).
3951
+ Converts microscopy image files in a folder to Yokogawa-style TIFF filenames.
3952
+
3953
+ This function processes raw microscopy images in various formats (ND2, CZI, LIF, TIFF, PNG, JPEG, BMP)
3954
+ and converts them into a standardized Yokogawa naming scheme using maximum intensity projections (MIPs).
3955
+ Each image is assigned a unique well location (e.g., plate1_A01) across one or more 384-well plates.
3956
+ The output files are saved in the same directory with renamed filenames. A CSV log is generated
3957
+ to track the mapping between original files and the renamed TIFFs.
3958
+
3959
+ Parameters
3960
+ ----------
3961
+ folder : str
3962
+ Path to the directory containing the input microscopy files.
3963
+
3964
+ Supported Formats
3965
+ -----------------
3966
+ - `.nd2` : Nikon ND2 format (processed using ND2Reader)
3967
+ - `.czi` : Zeiss CZI format (processed using pyczi)
3968
+ - `.lif` : Leica LIF format (processed using readlif)
3969
+ - `.tif`, `.tiff`, `.png`, `.jpg`, `.jpeg`, `.bmp` : Image files (processed using tifffile)
3970
+
3971
+ Behavior
3972
+ --------
3973
+ - Computes maximum intensity projections across Z-stacks.
3974
+ - Generates Yokogawa-style filenames: `plateX_<WELL>_T####F###L01C##.tif`
3975
+ - Handles timepoints, Z-stacks, channels, fields, and scenes depending on format.
3976
+ - Avoids reusing well positions across multiple files and scenes.
3977
+ - Skips malformed or incomplete image structures.
3978
+ - Logs all renamed output files to `rename_log.csv` in the same folder.
3979
+
3980
+ Output
3981
+ ------
3982
+ - Converted TIFF images saved in the input folder with Yokogawa-style filenames.
3983
+ - A CSV log `rename_log.csv` containing columns:
3984
+ 'Original File', 'Renamed TIFF', 'ext', 'time', 'field', 'channel', 'z', 'scene', 'slice', 'well'
3985
+
3986
+ Notes
3987
+ -----
3988
+ - Requires `ND2Reader`, `pyczi`, `readlif`, `tifffile`, and `pandas`.
3989
+ - Handles multi-dimensional images (2D, 3D, 4D).
3990
+ - Images with unsupported dimensions or structure are skipped with warnings.
3991
+
3992
+ Example
3993
+ -------
3994
+ >>> convert_to_yokogawa("/path/to/raw_images")
3995
+ Processing complete. Files saved in /path/to/raw_images and rename log saved as rename_log.csv.
3241
3996
  """
3242
-
3243
3997
  def _get_next_well(used_wells):
3244
3998
  """
3245
3999
  Determines the next available well position across multiple 384-well plates.
@@ -3477,6 +4231,22 @@ def convert_to_yokogawa(folder):
3477
4231
  print(f"Processing complete. Files saved in {folder} and rename log saved as {csv_path}.")
3478
4232
 
3479
4233
  def apply_augmentation(image, method):
4234
+ """
4235
+ Applies the specified augmentation method to the given image.
4236
+
4237
+ Parameters:
4238
+ image (numpy.ndarray): The input image to be augmented.
4239
+ method (str): The augmentation method to apply. Supported methods are:
4240
+ - 'rotate90': Rotates the image 90 degrees clockwise.
4241
+ - 'rotate180': Rotates the image 180 degrees.
4242
+ - 'rotate270': Rotates the image 90 degrees counterclockwise.
4243
+ - 'flip_h': Flips the image horizontally.
4244
+ - 'flip_v': Flips the image vertically.
4245
+
4246
+ Returns:
4247
+ numpy.ndarray: The augmented image. If the method is not recognized,
4248
+ the original image is returned unchanged.
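+
+ Example (illustrative):
+ >>> import numpy as np
+ >>> img = np.zeros((4, 8), dtype=np.uint8)
+ >>> apply_augmentation(img, 'rotate90').shape
+ (8, 4)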
4249
+ """
3480
4250
  if method == 'rotate90':
3481
4251
  return cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
3482
4252
  elif method == 'rotate180':
@@ -3490,6 +4260,24 @@ def apply_augmentation(image, method):
3490
4260
  return image
3491
4261
 
3492
4262
  def process_instruction(entry):
4263
+ """
4264
+ Processes a single image/mask entry by reading, optionally augmenting, and saving both image and mask.
4265
+
4266
+ Parameters
4267
+ ----------
4268
+ entry : dict
4269
+ A dictionary with the following keys:
4270
+ - 'src_img' (str): Path to the source image file.
4271
+ - 'src_msk' (str): Path to the source mask file.
4272
+ - 'dst_img' (str): Path to save the processed image.
4273
+ - 'dst_msk' (str): Path to save the processed mask.
4274
+ - 'augment' (str or None): Augmentation identifier to apply (e.g., 'rotate90', 'flip', or None).
4275
+
4276
+ Returns
4277
+ -------
4278
+ int
4279
+ Returns 1 upon successful completion.
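+
+ Example
+ -------
+ Illustrative; all paths are hypothetical:
+
+ >>> process_instruction({"src_img": "a.tif", "src_msk": "masks/a.tif",
+ ... "dst_img": "out/a.tif", "dst_msk": "out/masks/a.tif", "augment": "flip_h"})
+ 1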
4280
+ """
3493
4281
  img = tifffile.imread(entry["src_img"])
3494
4282
  msk = tifffile.imread(entry["src_msk"])
3495
4283
  if entry["augment"]:
@@ -3500,7 +4288,35 @@ def process_instruction(entry):
3500
4288
  return 1
3501
4289
 
3502
4290
  def prepare_cellpose_dataset(input_root, augment_data=False, train_fraction=0.8, n_jobs=None):
3503
-
4291
+ """
4292
+ Prepare a training and testing dataset for Cellpose from multiple subdirectories containing TIFF images and corresponding masks.
4293
+
4294
+ This function scans all subfolders in `input_root` that contain a "masks/" directory, finds image-mask pairs,
4295
+ and splits them into train/test sets. Optionally, it augments data using rotation and flipping to balance dataset sizes
4296
+ across all datasets. All output is saved in a standardized format to a new "cellpose_dataset/" folder inside `input_root`.
4297
+
4298
+ Parameters
4299
+ ----------
4300
+ input_root : str
4301
+ Path to the folder containing one or more datasets. Each dataset should have a 'masks/' subfolder with mask files
4302
+ matching the TIFF filenames.
4303
+
4304
+ augment_data : bool, optional
4305
+ If True, perform data augmentation (rotation/flipping) to increase or equalize the number of samples per dataset.
4306
+ Default is False.
4307
+
4308
+ train_fraction : float, optional
4309
+ Fraction of data to use for training. The rest will go to testing. Default is 0.8 (i.e., 80% train, 20% test).
4310
+
4311
+ n_jobs : int or None, optional
4312
+ Number of parallel worker processes. If None, uses all available CPUs minus one.
4313
+
4314
+ Returns
4315
+ -------
4316
+ None
4317
+ All output TIFFs are saved under `input_root/cellpose_dataset/train/` and `.../test/` folders with consistent naming.
4318
+ A progress bar is printed to track the status of preprocessing.
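+
+ Example
+ -------
+ Illustrative; the input path is hypothetical:
+
+ >>> prepare_cellpose_dataset("/data/annotated_sets", augment_data=True,
+ ... train_fraction=0.8, n_jobs=4)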
4319
+ """
3504
4320
  from .utils import print_progress
3505
4321
 
3506
4322
  time_ls = []