spacr 0.2.4__py3-none-any.whl → 0.2.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. spacr/__init__.py +1 -11
  2. spacr/core.py +277 -349
  3. spacr/deep_spacr.py +248 -269
  4. spacr/gui.py +58 -54
  5. spacr/gui_core.py +689 -535
  6. spacr/gui_elements.py +1002 -153
  7. spacr/gui_utils.py +452 -107
  8. spacr/io.py +158 -91
  9. spacr/measure.py +199 -151
  10. spacr/plot.py +159 -47
  11. spacr/resources/font/open_sans/OFL.txt +93 -0
  12. spacr/resources/font/open_sans/OpenSans-Italic-VariableFont_wdth,wght.ttf +0 -0
  13. spacr/resources/font/open_sans/OpenSans-VariableFont_wdth,wght.ttf +0 -0
  14. spacr/resources/font/open_sans/README.txt +100 -0
  15. spacr/resources/font/open_sans/static/OpenSans-Bold.ttf +0 -0
  16. spacr/resources/font/open_sans/static/OpenSans-BoldItalic.ttf +0 -0
  17. spacr/resources/font/open_sans/static/OpenSans-ExtraBold.ttf +0 -0
  18. spacr/resources/font/open_sans/static/OpenSans-ExtraBoldItalic.ttf +0 -0
  19. spacr/resources/font/open_sans/static/OpenSans-Italic.ttf +0 -0
  20. spacr/resources/font/open_sans/static/OpenSans-Light.ttf +0 -0
  21. spacr/resources/font/open_sans/static/OpenSans-LightItalic.ttf +0 -0
  22. spacr/resources/font/open_sans/static/OpenSans-Medium.ttf +0 -0
  23. spacr/resources/font/open_sans/static/OpenSans-MediumItalic.ttf +0 -0
  24. spacr/resources/font/open_sans/static/OpenSans-Regular.ttf +0 -0
  25. spacr/resources/font/open_sans/static/OpenSans-SemiBold.ttf +0 -0
  26. spacr/resources/font/open_sans/static/OpenSans-SemiBoldItalic.ttf +0 -0
  27. spacr/resources/font/open_sans/static/OpenSans_Condensed-Bold.ttf +0 -0
  28. spacr/resources/font/open_sans/static/OpenSans_Condensed-BoldItalic.ttf +0 -0
  29. spacr/resources/font/open_sans/static/OpenSans_Condensed-ExtraBold.ttf +0 -0
  30. spacr/resources/font/open_sans/static/OpenSans_Condensed-ExtraBoldItalic.ttf +0 -0
  31. spacr/resources/font/open_sans/static/OpenSans_Condensed-Italic.ttf +0 -0
  32. spacr/resources/font/open_sans/static/OpenSans_Condensed-Light.ttf +0 -0
  33. spacr/resources/font/open_sans/static/OpenSans_Condensed-LightItalic.ttf +0 -0
  34. spacr/resources/font/open_sans/static/OpenSans_Condensed-Medium.ttf +0 -0
  35. spacr/resources/font/open_sans/static/OpenSans_Condensed-MediumItalic.ttf +0 -0
  36. spacr/resources/font/open_sans/static/OpenSans_Condensed-Regular.ttf +0 -0
  37. spacr/resources/font/open_sans/static/OpenSans_Condensed-SemiBold.ttf +0 -0
  38. spacr/resources/font/open_sans/static/OpenSans_Condensed-SemiBoldItalic.ttf +0 -0
  39. spacr/resources/font/open_sans/static/OpenSans_SemiCondensed-Bold.ttf +0 -0
  40. spacr/resources/font/open_sans/static/OpenSans_SemiCondensed-BoldItalic.ttf +0 -0
  41. spacr/resources/font/open_sans/static/OpenSans_SemiCondensed-ExtraBold.ttf +0 -0
  42. spacr/resources/font/open_sans/static/OpenSans_SemiCondensed-ExtraBoldItalic.ttf +0 -0
  43. spacr/resources/font/open_sans/static/OpenSans_SemiCondensed-Italic.ttf +0 -0
  44. spacr/resources/font/open_sans/static/OpenSans_SemiCondensed-Light.ttf +0 -0
  45. spacr/resources/font/open_sans/static/OpenSans_SemiCondensed-LightItalic.ttf +0 -0
  46. spacr/resources/font/open_sans/static/OpenSans_SemiCondensed-Medium.ttf +0 -0
  47. spacr/resources/font/open_sans/static/OpenSans_SemiCondensed-MediumItalic.ttf +0 -0
  48. spacr/resources/font/open_sans/static/OpenSans_SemiCondensed-Regular.ttf +0 -0
  49. spacr/resources/font/open_sans/static/OpenSans_SemiCondensed-SemiBold.ttf +0 -0
  50. spacr/resources/font/open_sans/static/OpenSans_SemiCondensed-SemiBoldItalic.ttf +0 -0
  51. spacr/resources/icons/logo.pdf +2786 -6
  52. spacr/resources/icons/logo_spacr.png +0 -0
  53. spacr/resources/icons/logo_spacr_1.png +0 -0
  54. spacr/sequencing.py +477 -587
  55. spacr/settings.py +217 -144
  56. spacr/utils.py +46 -46
  57. {spacr-0.2.4.dist-info → spacr-0.2.8.dist-info}/METADATA +46 -35
  58. spacr-0.2.8.dist-info/RECORD +100 -0
  59. {spacr-0.2.4.dist-info → spacr-0.2.8.dist-info}/WHEEL +1 -1
  60. spacr-0.2.4.dist-info/RECORD +0 -58
  61. {spacr-0.2.4.dist-info → spacr-0.2.8.dist-info}/LICENSE +0 -0
  62. {spacr-0.2.4.dist-info → spacr-0.2.8.dist-info}/entry_points.txt +0 -0
  63. {spacr-0.2.4.dist-info → spacr-0.2.8.dist-info}/top_level.txt +0 -0
spacr/io.py CHANGED
@@ -1,9 +1,9 @@
-import os, re, sqlite3, gc, torch, time, random, shutil, cv2, tarfile, cellpose, glob
+import os, re, sqlite3, gc, torch, time, random, shutil, cv2, tarfile, cellpose, glob, queue
 import numpy as np
 import pandas as pd
 import tifffile
-from PIL import Image
-from collections import defaultdict, Counter
+from PIL import Image, ImageOps
+from collections import defaultdict, Counter, deque
 from pathlib import Path
 from functools import partial
 from matplotlib.animation import FuncAnimation
@@ -17,12 +17,12 @@ import imageio.v2 as imageio2
 import matplotlib.pyplot as plt
 from io import BytesIO
 from IPython.display import display, clear_output
-from multiprocessing import Pool, cpu_count
-from torch.utils.data import Dataset
+from multiprocessing import Pool, cpu_count, Process, Queue
+from torch.utils.data import Dataset, DataLoader
 import matplotlib.pyplot as plt
 from torchvision.transforms import ToTensor
 import seaborn as sns
-
+import atexit

 from .logger import log_function_call

@@ -444,20 +444,7 @@ class NoClassDataset(Dataset):
         # Return both the image and its filename
         return img, self.filenames[index]

-class MyDataset(Dataset):
-    """
-    A custom dataset class for loading and processing image data.
-
-    Args:
-        data_dir (str): The directory path where the image data is stored.
-        loader_classes (list): A list of class names for the dataset.
-        transform (callable, optional): A function/transform to apply to the image data. Default is None.
-        shuffle (bool, optional): Whether to shuffle the dataset. Default is True.
-        pin_memory (bool, optional): Whether to pin the loaded images to memory. Default is False.
-        specific_files (list, optional): A list of specific file paths to include in the dataset. Default is None.
-        specific_labels (list, optional): A list of specific labels corresponding to the specific files. Default is None.
-    """
-
+class spacrDataset(Dataset):
     def __init__(self, data_dir, loader_classes, transform=None, shuffle=True, pin_memory=False, specific_files=None, specific_labels=None):
         self.data_dir = data_dir
         self.classes = loader_classes
@@ -466,7 +453,7 @@ class MyDataset(Dataset):
         self.pin_memory = pin_memory
         self.filenames = []
         self.labels = []
-
+
         if specific_files and specific_labels:
             self.filenames = specific_files
             self.labels = specific_labels
@@ -479,33 +466,113 @@ class MyDataset(Dataset):

         if self.shuffle:
             self.shuffle_dataset()
-
+
         if self.pin_memory:
-            self.images = [self.load_image(f) for f in self.filenames]
-
+            # Use multiprocessing to load images in parallel
+            with Pool(processes=cpu_count()) as pool:
+                self.images = pool.map(self.load_image, self.filenames)
+        else:
+            self.images = None
+
     def load_image(self, img_path):
         img = Image.open(img_path).convert('RGB')
+        img = ImageOps.exif_transpose(img)  # Handle image orientation
         return img
-
+
     def __len__(self):
         return len(self.filenames)
-
+
     def shuffle_dataset(self):
         combined = list(zip(self.filenames, self.labels))
         random.shuffle(combined)
         self.filenames, self.labels = zip(*combined)
-
+
     def get_plate(self, filepath):
-        filename = os.path.basename(filepath)  # Get just the filename from the full path
+        filename = os.path.basename(filepath)
         return filename.split('_')[0]
-
+
     def __getitem__(self, index):
+        if self.pin_memory:
+            img = self.images[index]
+        else:
+            img = self.load_image(self.filenames[index])
         label = self.labels[index]
         filename = self.filenames[index]
-        img = self.load_image(filename)
         if self.transform:
             img = self.transform(img)
         return img, label, filename
+
+class spacrDataLoader(DataLoader):
+    def __init__(self, *args, preload_batches=1, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.preload_batches = preload_batches
+        self.batch_queue = Queue(maxsize=preload_batches)
+        self.process = None
+        self.current_batch_index = 0
+        self._stop_event = False
+        self.pin_memory = kwargs.get('pin_memory', False)
+        atexit.register(self.cleanup)
+
+    def _preload_next_batches(self):
+        try:
+            for _ in range(self.preload_batches):
+                if self._stop_event:
+                    break
+                batch = next(self._iterator)
+                if self.pin_memory:
+                    batch = self._pin_memory_batch(batch)
+                self.batch_queue.put(batch)
+        except StopIteration:
+            pass
+
+    def _start_preloading(self):
+        if self.process is None or not self.process.is_alive():
+            self._iterator = iter(super().__iter__())
+            if not self.pin_memory:
+                self.process = Process(target=self._preload_next_batches)
+                self.process.start()
+            else:
+                self._preload_next_batches()  # Directly load if pin_memory is True
+
+    def _pin_memory_batch(self, batch):
+        if isinstance(batch, (list, tuple)):
+            return [b.pin_memory() if isinstance(b, torch.Tensor) else b for b in batch]
+        elif isinstance(batch, torch.Tensor):
+            return batch.pin_memory()
+        else:
+            return batch
+
+    def __iter__(self):
+        self._start_preloading()
+        return self
+
+    def __next__(self):
+        if self.process and not self.process.is_alive() and self.batch_queue.empty():
+            raise StopIteration
+
+        try:
+            if self.pin_memory:
+                next_batch = self.batch_queue.get(timeout=60)
+            else:
+                next_batch = self.batch_queue.get(timeout=60)
+            self.current_batch_index += 1
+
+            # Start preloading the next batches
+            if self.batch_queue.qsize() < self.preload_batches:
+                self._start_preloading()
+
+            return next_batch
+        except queue.Empty:
+            raise StopIteration
+
+    def cleanup(self):
+        self._stop_event = True
+        if self.process and self.process.is_alive():
+            self.process.terminate()
+            self.process.join()
+
+    def __del__(self):
+        self.cleanup()

 class NoClassDataset(Dataset):
     def __init__(self, data_dir, transform=None, shuffle=True, load_to_memory=False):
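The new spacrDataset/spacrDataLoader pair replaces MyDataset: the dataset can preload every image through a multiprocessing Pool when pin_memory is set, and the loader stages upcoming batches in a Queue filled by a background Process. A minimal usage sketch, assuming both classes are importable from spacr.io as defined above; the directory path and class names are hypothetical:

from torchvision import transforms
from spacr.io import spacrDataset, spacrDataLoader

dataset = spacrDataset(
    data_dir='data/train',            # hypothetical image directory
    loader_classes=['nc', 'pc'],      # hypothetical class names
    transform=transforms.ToTensor(),
    shuffle=True,
    pin_memory=False,                 # True preloads all images via a Pool
)

# preload_batches controls how many batches are staged ahead of consumption.
loader = spacrDataLoader(dataset, batch_size=32, preload_batches=2)
for imgs, labels, filenames in loader:
    pass  # training or inference step goes here

Since __getitem__ returns (img, label, filename), each batch unpacks into collated images, labels, and a tuple of filenames.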
@@ -588,20 +655,20 @@ def _rename_and_organize_image_files(src, regex, batch_size=100, pick_slice=Fals
     regular_expression = re.compile(regex)
     images_by_key = defaultdict(list)
     stack_path = os.path.join(src, 'stack')
+    files_processed = 0
     if not os.path.exists(stack_path) or (os.path.isdir(stack_path) and len(os.listdir(stack_path)) == 0):
         all_filenames = [filename for filename in os.listdir(src) if filename.endswith(img_format)]
-        print(f'All_files:{len(all_filenames)} in {src}')
+        print(f'All_files: {len(all_filenames)} in {src}')
         time_ls = []
-        processed = 0
-        for i in range(0, len(all_filenames), batch_size):
+
+        for idx in range(0, len(all_filenames), batch_size):
             start = time.time()
-            batch_filenames = all_filenames[i:i+batch_size]
-            processed += len(batch_filenames)
+            batch_filenames = all_filenames[idx:idx+batch_size]
             for filename in batch_filenames:
                 images_by_key = _extract_filename_metadata(batch_filenames, src, images_by_key, regular_expression, metadata_type, pick_slice, skip_mode)
-
+
             if pick_slice:
-                for key in images_by_key:
+                for i, key in enumerate(images_by_key):
                     plate, well, field, channel, mode = key
                     max_intensity_slice = max(images_by_key[key], key=lambda x: np.percentile(x, 90))
                     mip_image = Image.fromarray(max_intensity_slice)
@@ -609,21 +676,19 @@ def _rename_and_organize_image_files(src, regex, batch_size=100, pick_slice=Fals
                     os.makedirs(output_dir, exist_ok=True)
                     output_filename = f'{plate}_{well}_{field}.tif'
                     output_path = os.path.join(output_dir, output_filename)
-
-                    if os.path.exists(output_path):
-                        print(f'WARNING: A file with the same name already exists at location {output_filename}')
-                    else:
-                        mip_image.save(output_path)
-
+                    files_processed += 1
                     stop = time.time()
                     duration = stop - start
                     time_ls.append(duration)
-                    files_processed = processed
                     files_to_process = len(all_filenames)
                     print_progress(files_processed, files_to_process, n_jobs=1, time_ls=time_ls, batch_size=batch_size, operation_type='Preprocessing filenames')

+                    if os.path.exists(output_path):
+                        print(f'WARNING: A file with the same name already exists at location {output_filename}')
+                    else:
+                        mip_image.save(output_path)
             else:
-                for key, images in images_by_key.items():
+                for i, (key, images) in enumerate(images_by_key.items()):
                     mip = np.max(np.stack(images), axis=0)
                     mip_image = Image.fromarray(mip)
                     plate, well, field, channel = key[:4]
@@ -631,18 +696,17 @@ def _rename_and_organize_image_files(src, regex, batch_size=100, pick_slice=Fals
                     os.makedirs(output_dir, exist_ok=True)
                     output_filename = f'{plate}_{well}_{field}.tif'
                     output_path = os.path.join(output_dir, output_filename)
-
-                    if os.path.exists(output_path):
-                        print(f'WARNING: A file with the same name already exists at location {output_filename}')
-                    else:
-                        mip_image.save(output_path)
+                    files_processed += 1
                     stop = time.time()
                     duration = stop - start
                     time_ls.append(duration)
-                    files_processed = processed
                     files_to_process = len(all_filenames)
                     print_progress(files_processed, files_to_process, n_jobs=1, time_ls=time_ls, batch_size=batch_size, operation_type='Preprocessing filenames')

+                    if os.path.exists(output_path):
+                        print(f'WARNING: A file with the same name already exists at location {output_filename}')
+                    else:
+                        mip_image.save(output_path)
             images_by_key.clear()

     # Move original images to a new directory
@@ -656,6 +720,7 @@ def _rename_and_organize_image_files(src, regex, batch_size=100, pick_slice=Fals
                 print(f'WARNING: A file with the same name already exists at location {move}')
             else:
                 shutil.move(os.path.join(src, filename), move)
+    files_processed = 0
     return

 def _merge_file(chan_dirs, stack_dir, file_name):
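Both save branches above reduce a group of slices to one image before writing {plate}_{well}_{field}.tif: with pick_slice=True the single brightest slice is kept (ranked by its 90th-percentile intensity), otherwise a maximum-intensity projection is computed across slices. The two reductions in isolation, sketched with synthetic arrays:

import numpy as np
from PIL import Image

# Hypothetical stand-in for images_by_key[key]: 2D slices for one field/channel
slices = [np.random.randint(0, 65535, (64, 64), dtype=np.uint16) for _ in range(5)]

# pick_slice=True: keep the slice with the highest 90th-percentile intensity
brightest = max(slices, key=lambda x: np.percentile(x, 90))

# pick_slice=False: per-pixel maximum across all slices (a MIP)
mip = np.max(np.stack(slices), axis=0)

Image.fromarray(mip).save('plate1_A01_f1.tif')  # hypothetical output name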
@@ -975,7 +1040,7 @@ def _concatenate_channel(src, channels, randomize=True, timelapse=False, batch_s
             time_ls.append(duration)
             files_processed = i+1
             files_to_process = time_stack_path_lists
-            print_progress(files_processed, files_to_process, n_jobs=1, time_ls=None, batch_size=None, operation_type="Concatinating")
+            print_progress(files_processed, files_to_process, n_jobs=1, time_ls=time_ls, batch_size=batch_size, operation_type="Concatinating")
         stack = np.stack(stack_region)
         save_loc = os.path.join(channel_stack_loc, f'{name}.npz')
         np.savez(save_loc, data=stack, filenames=filenames_region)
@@ -1006,7 +1071,7 @@ def _concatenate_channel(src, channels, randomize=True, timelapse=False, batch_s
             time_ls.append(duration)
             files_processed = i+1
             files_to_process = nr_files
-            print_progress(files_processed, files_to_process, n_jobs=1, time_ls=None, batch_size=None, operation_type="Concatinating")
+            print_progress(files_processed, files_to_process, n_jobs=1, time_ls=time_ls, batch_size=batch_size, operation_type="Concatinating")
             if (i+1) % batch_size == 0 or i+1 == nr_files:
                 unique_shapes = {arr.shape[:-1] for arr in stack_ls}
                 if len(unique_shapes) > 1:
@@ -1104,7 +1169,7 @@ def _normalize_img_batch(stack, channels, save_dtype, settings):
         time_ls.append(duration)
         files_processed = i+1
         files_to_process = len(channels)
-        print_progress(files_processed, files_to_process, n_jobs=1, time_ls=None, batch_size=None, operation_type=f"Normalizing: Channel: {channel}")
+        print_progress(files_processed, files_to_process, n_jobs=1, time_ls=time_ls, batch_size=None, operation_type=f"Normalizing")

     return normalized_stack.astype(save_dtype)

@@ -1151,7 +1216,6 @@ def concatenate_and_normalize(src, channels, save_dtype=np.float32, settings={})
             parts = file.split('_')
             name = parts[0] + '_' + parts[1] + '_' + parts[2]
             array = np.load(path)
-            #array = np.take(array, channels, axis=2)
             stack_region.append(array)
             filenames_region.append(os.path.basename(path))
             stop = time.time()
@@ -1159,7 +1223,7 @@ def concatenate_and_normalize(src, channels, save_dtype=np.float32, settings={})
             time_ls.append(duration)
             files_processed = i+1
             files_to_process = len(time_stack_path_lists)
-            print_progress(files_processed, files_to_process, n_jobs=1, time_ls=None, batch_size=None, operation_type="Concatinating")
+            print_progress(files_processed, files_to_process, n_jobs=1, time_ls=time_ls, batch_size=None, operation_type="Concatinating")
         stack = np.stack(stack_region)

         normalized_stack = _normalize_img_batch(stack=stack,
@@ -1188,18 +1252,18 @@ def concatenate_and_normalize(src, channels, save_dtype=np.float32, settings={})
         stack_ls = []
         filenames_batch = []
         time_ls = []
+        files_processed = 0
         for i, path in enumerate(paths):
             start = time.time()
             array = np.load(path)
-            #array = np.take(array, channels, axis=2)
             stack_ls.append(array)
             filenames_batch.append(os.path.basename(path))
             stop = time.time()
             duration = stop - start
             time_ls.append(duration)
-            files_processed = i+1
+            files_processed += 1
             files_to_process = nr_files
-            print_progress(files_processed, files_to_process, n_jobs=1, time_ls=None, batch_size=None, operation_type="Concatinating")
+            print_progress(files_processed, files_to_process, n_jobs=1, time_ls=time_ls, batch_size=None, operation_type="Concatinating")

             if (i + 1) % settings['batch_size'] == 0 or i + 1 == nr_files:
                 unique_shapes = {arr.shape[:-1] for arr in stack_ls}
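The recurring fix in these hunks is that print_progress now receives the accumulated time_ls (and, where relevant, the real batch_size) instead of None, so the progress line can average actual per-file durations. The standardized loop skeleton, with a hypothetical process(path) step; print_progress is assumed to live in spacr.utils with the signature used above:

import time
from spacr.utils import print_progress  # assumed location of the helper

def process(path):
    pass  # hypothetical per-file work

paths = ['f1.npz', 'f2.npz', 'f3.npz']  # hypothetical inputs
time_ls = []
files_processed = 0
for path in paths:
    start = time.time()
    process(path)
    time_ls.append(time.time() - start)  # record every file's duration
    files_processed += 1
    # Passing time_ls (not None) lets the helper report a per-file average.
    print_progress(files_processed, len(paths), n_jobs=1, time_ls=time_ls,
                   batch_size=None, operation_type="Concatinating")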
@@ -1350,12 +1414,12 @@ def _normalize_stack(src, backgrounds=[100, 100, 100], remove_backgrounds=[False
                 average_time = np.mean(time_ls) if len(time_ls) > 0 else 0
                 print(f'channels:{chan_index}/{stack.shape[-1] - 1}, arrays:{array_index + 1}/{single_channel.shape[0]}, Signal:{upper:.1f}, noise:{lower:.1f}, Signal-to-noise:{average_stnr:.1f}, Time/channel:{average_time:.2f}sec')

-        stop = time.time()
-        duration = stop - start
-        time_ls.append(duration)
-        files_processed = file_index + 1
-        files_to_process = len(paths)
-        print_progress(files_processed, files_to_process, n_jobs=1, time_ls=time_ls, batch_size=None, operation_type="Normalizing")
+        #stop = time.time()
+        #duration = stop - start
+        #time_ls.append(duration)
+        #files_processed = file_index + 1
+        #files_to_process = len(paths)
+        #print_progress(files_processed, files_to_process, n_jobs=1, time_ls=time_ls, batch_size=None, operation_type="Normalizing")

         normalized_stack[:, :, :, channel] = arr_2d_normalized

@@ -1405,12 +1469,12 @@ def _normalize_timelapse(src, lower_percentile=2, save_dtype=np.float32):

             print(f'channels:{chan_index+1}/{stack.shape[-1]}, arrays:{array_index+1}/{single_channel.shape[0]}', end='\r')

-        stop = time.time()
-        duration = stop - start
-        time_ls.append(duration)
-        files_processed = file_index+1
-        files_to_process = len(paths)
-        print_progress(files_processed, files_to_process, n_jobs=1, time_ls=time_ls, batch_size=None, operation_type="Normalizing")
+        #stop = time.time()
+        #duration = stop - start
+        #time_ls.append(duration)
+        #files_processed = file_index+1
+        #files_to_process = len(paths)
+        #print_progress(files_processed, files_to_process, n_jobs=1, time_ls=time_ls, batch_size=None, operation_type="Normalizing")

         save_loc = os.path.join(output_fldr, f'{name}_norm_timelapse.npz')
         np.savez(save_loc, data=normalized_stack, filenames=filenames)
@@ -1620,8 +1684,8 @@ def preprocess_img_data(settings):
                                            save_dtype=np.float32,
                                            settings=settings)

-    if plot:
-        _plot_4D_arrays(src+'/norm_channel_stack', nr_npz=1, nr=nr)
+    #if plot:
+    #    _plot_4D_arrays(src+'/norm_channel_stack', nr_npz=1, nr=nr)

     return settings, src

@@ -1951,7 +2015,7 @@ def _load_and_concatenate_arrays(src, channels, cell_chann_dim, nucleus_chann_di
     all_imgs = len(os.listdir(reference_folder))
     time_ls = []
     # Iterate through each file in the reference folder
-    for filename in os.listdir(reference_folder):
+    for idx, filename in enumerate(os.listdir(reference_folder)):
         start = time.time()
         stack_ls = []
         if filename.endswith('.npy'):
@@ -2012,7 +2076,7 @@ def _load_and_concatenate_arrays(src, channels, cell_chann_dim, nucleus_chann_di
         stop = time.time()
         duration = stop - start
         time_ls.append(duration)
-        files_processed = count
+        files_processed = idx+1
         files_to_process = all_imgs
         print_progress(files_processed, files_to_process, n_jobs=1, time_ls=time_ls, batch_size=None, operation_type="Merging Arrays")

@@ -2295,18 +2359,27 @@ def _save_model(model, model_type, results_df, dst, epoch, epochs, intermedeate_

     def save_model_at_threshold(threshold, epoch, suffix=""):
         percentile = str(threshold * 100)
-        print(f'\rfound: {percentile}% accurate model')#, end='\r', flush=True)
-        torch.save(model, f'{dst}/{model_type}_epoch_{str(epoch)}{suffix}_acc_{percentile}_channels_{channels_str}.pth')
+        print(f'Found: {percentile}% accurate model')
+        model_path = f'{dst}/{model_type}_epoch_{str(epoch)}{suffix}_acc_{percentile}_channels_{channels_str}.pth'
+        torch.save(model, model_path)
+        return model_path

     if epoch % 100 == 0 or epoch == epochs:
-        torch.save(model, f'{dst}/{model_type}_epoch_{str(epoch)}_channels_{channels_str}.pth')
+        model_path = f'{dst}/{model_type}_epoch_{str(epoch)}_channels_{channels_str}.pth'
+        torch.save(model, model_path)
+        return model_path

     for threshold in intermedeate_save:
-        if results_df['neg_accuracy'].dropna().mean() >= threshold and results_df['pos_accuracy'].dropna().mean() >= threshold:
-            save_model_at_threshold(threshold, epoch)
-            break # Ensure we only save for the highest matching threshold
+        if results_df['neg_accuracy'] >= threshold and results_df['pos_accuracy'] >= threshold:
+            print(f"Nc class accuracy: {results_df['neg_accuracy']} Pc class Accuracy: {results_df['pos_accuracy']}")
+            model_path = save_model_at_threshold(threshold, epoch)
+            break
+    else:
+        model_path = None
+
+    return model_path

-def _save_progress(dst, results_df, train_metrics_df, epoch, epochs):
+def _save_progress(dst, results_df, result_type='train'):
     """
     Save the progress of the classification model.

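_save_model now returns the path of whichever checkpoint it wrote (or None), and the threshold loop compares results_df['neg_accuracy'] directly to each threshold rather than averaging a column, which assumes the caller now supplies scalar per-epoch accuracies. A sketch of the for/else selection with hypothetical values standing in for results_df:

intermedeate_save = [0.99, 0.98, 0.95]  # thresholds, highest first (name as in the diff)
results = {'neg_accuracy': 0.97, 'pos_accuracy': 0.96}  # hypothetical scalar accuracies

for threshold in intermedeate_save:
    if results['neg_accuracy'] >= threshold and results['pos_accuracy'] >= threshold:
        model_path = f'model_acc_{threshold * 100}.pth'  # stands in for save_model_at_threshold
        break
else:
    model_path = None  # for/else: runs only when no threshold matched

print(model_path)  # model_acc_95.0.pth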
@@ -2320,18 +2393,13 @@ def _save_progress(dst, results_df, train_metrics_df, epoch, epochs):
     """
     # Save accuracy, loss, PRAUC
     os.makedirs(dst, exist_ok=True)
-    results_path = os.path.join(dst, 'acc_loss_prauc.csv')
+    results_path = os.path.join(dst, f'{result_type}.csv')
     if not os.path.exists(results_path):
         results_df.to_csv(results_path, index=True, header=True, mode='w')
     else:
         results_df.to_csv(results_path, index=True, header=False, mode='a')
-
-    training_metrics_path = os.path.join(dst, 'training_metrics.csv')
-    if not os.path.exists(training_metrics_path):
-        train_metrics_df.to_csv(training_metrics_path, index=True, header=True, mode='w')
-    else:
-        train_metrics_df.to_csv(training_metrics_path, index=True, header=False, mode='a')
-    if epoch == epochs:
+
+    if result_type == 'train':
         read_plot_model_stats(results_path, save=True)
     return

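The simplified _save_progress writes a single {result_type}.csv, emitting the header only when the file is first created and appending header-less rows afterwards. The pattern in isolation, with a hypothetical metrics frame:

import os
import pandas as pd

def append_results(dst, results_df, result_type='train'):
    # Header once on creation; later calls append rows without repeating it.
    os.makedirs(dst, exist_ok=True)
    results_path = os.path.join(dst, f'{result_type}.csv')
    if not os.path.exists(results_path):
        results_df.to_csv(results_path, index=True, header=True, mode='w')
    else:
        results_df.to_csv(results_path, index=True, header=False, mode='a')
    return results_path

epoch_df = pd.DataFrame({'loss': [0.31], 'accuracy': [0.88]})  # hypothetical metrics
append_results('results', epoch_df)  # creates results/train.csv with a header
append_results('results', epoch_df)  # appends a second row, no header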
@@ -2550,7 +2618,6 @@ def _read_mask(mask_path):
     mask = img_as_uint(mask)
     return mask

-
 def convert_numpy_to_tiff(folder_path, limit=None):
     """
     Converts all numpy files in a folder to TIFF format and saves them in a subdirectory 'tiff'.