PyPI - spacr - Versions diffs - 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl - Mend

spacr-0.3.1-py3-none-any.whl → spacr-0.3.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41)

spacr/__init__.py +19 -3
spacr/cellpose.py +311 -0
spacr/core.py +245 -2494
spacr/deep_spacr.py +335 -163
spacr/gui.py +2 -0
spacr/gui_core.py +85 -65
spacr/gui_elements.py +110 -5
spacr/gui_utils.py +375 -7
spacr/io.py +680 -141
spacr/logger.py +28 -9
spacr/measure.py +108 -133
spacr/mediar.py +0 -3
spacr/ml.py +1051 -0
spacr/openai.py +37 -0
spacr/plot.py +707 -20
spacr/resources/data/lopit.csv +3833 -0
spacr/resources/data/toxoplasma_metadata.csv +8843 -0
spacr/resources/icons/convert.png +0 -0
spacr/resources/{models/cp/toxo_plaque_cyto_e25000_X1120_Y1120.CP_model → icons/dna_matrix.mp4} +0 -0
spacr/sequencing.py +241 -1311
spacr/settings.py +181 -50
spacr/sim.py +0 -2
spacr/submodules.py +349 -0
spacr/timelapse.py +0 -2
spacr/toxo.py +238 -0
spacr/utils.py +776 -182
{spacr-0.3.1.dist-info → spacr-0.3.3.dist-info}/METADATA +31 -22
{spacr-0.3.1.dist-info → spacr-0.3.3.dist-info}/RECORD +32 -33
spacr/chris.py +0 -50
spacr/graph_learning.py +0 -340
spacr/resources/MEDIAR/.git +0 -1
spacr/resources/MEDIAR_weights/.DS_Store +0 -0
spacr/resources/icons/.DS_Store +0 -0
spacr/resources/icons/spacr_logo_rotation.gif +0 -0
spacr/resources/models/cp/toxo_plaque_cyto_e25000_X1120_Y1120.CP_model_settings.csv +0 -23
spacr/resources/models/cp/toxo_pv_lumen.CP_model +0 -0
spacr/sim_app.py +0 -0
{spacr-0.3.1.dist-info → spacr-0.3.3.dist-info}/LICENSE +0 -0
{spacr-0.3.1.dist-info → spacr-0.3.3.dist-info}/WHEEL +0 -0
{spacr-0.3.1.dist-info → spacr-0.3.3.dist-info}/entry_points.txt +0 -0
{spacr-0.3.1.dist-info → spacr-0.3.3.dist-info}/top_level.txt +0 -0

spacr/io.py (changed)

@@ -1,30 +1,133 @@
-import os, re, sqlite3, gc, torch, time, random, shutil, cv2, tarfile, cellpose, glob, queue
+import os, re, sqlite3, gc, torch, time, random, shutil, cv2, tarfile, cellpose, glob, queue, tifffile, czifile, atexit, datetime
 import numpy as np
 import pandas as pd
-import tifffile
 from PIL import Image, ImageOps
-from collections import defaultdict, Counter, deque
+from collections import defaultdict, Counter
 from pathlib import Path
 from functools import partial
 from matplotlib.animation import FuncAnimation
 from IPython.display import display
 from skimage.util import img_as_uint
 from skimage.exposure import rescale_intensity
-from skimage import filters
 import skimage.measure as measure
 from skimage import exposure
 import imageio.v2 as imageio2
 import matplotlib.pyplot as plt
 from io import BytesIO
-from IPython.display import display, clear_output
-from multiprocessing import Pool, cpu_count, Process, Queue
-from torch.utils.data import Dataset, DataLoader
+from IPython.display import display
+from multiprocessing import Pool, cpu_count, Process, Queue, Value, Lock
+from torch.utils.data import Dataset, DataLoader, random_split
 import matplotlib.pyplot as plt
 from torchvision.transforms import ToTensor
-import seaborn as sns
-import atexit
-from .logger import log_function_call
+import seaborn as sns
+from nd2reader import ND2Reader
+from torchvision import transforms
+from sklearn.model_selection import train_test_split
+def process_non_tif_non_2D_images(folder):
+    """Processes all images in the folder and splits them into grayscale channels, preserving bit depth."""
+    # Helper function to save grayscale images
+    def save_grayscale_images(image, base_name, folder, dtype, channel=None, z=None, t=None):
+        """Save grayscale images with appropriate suffix based on channel, z, and t, preserving bit depth."""
+        suffix = ""
+        if channel is not None:
+            suffix += f"_C{channel}"
+        if z is not None:
+            suffix += f"_Z{z}"
+        if t is not None:
+            suffix += f"_T{t}"
+        output_filename = os.path.join(folder, f"{base_name}{suffix}.tif")
+        tifffile.imwrite(output_filename, image.astype(dtype))
+    # Function to handle splitting of multi-dimensional images into grayscale channels
+    def split_channels(image, folder, base_name, dtype):
+        """Splits the image into channels and handles 3D, 4D, and 5D image cases."""
+        if image.ndim == 2:
+            # Grayscale image, already processed separately
+            return
+        elif image.ndim == 3:
+            # 3D image: (height, width, channels)
+            for c in range(image.shape[2]):
+                save_grayscale_images(image[..., c], base_name, folder, dtype, channel=c+1)
+        elif image.ndim == 4:
+            # 4D image: (height, width, channels, Z-dimension)
+            for z in range(image.shape[3]):
+                for c in range(image.shape[2]):
+                    save_grayscale_images(image[..., c, z], base_name, folder, dtype, channel=c+1, z=z+1)
+        elif image.ndim == 5:
+            # 5D image: (height, width, channels, Z-dimension, Time)
+            for t in range(image.shape[4]):
+                for z in range(image.shape[3]):
+                    for c in range(image.shape[2]):
+                        save_grayscale_images(image[..., c, z, t], base_name, folder, dtype, channel=c+1, z=z+1, t=t+1)
+    # Function to load images in various formats
+    def load_image(file_path):
+        """Loads image from various formats and returns it as a numpy array along with its dtype."""
+        ext = os.path.splitext(file_path)[1].lower()
+        if ext in ['.tif', '.tiff']:
+            image = tifffile.imread(file_path)
+            return image, image.dtype
+        elif ext in ['.png', '.jpg', '.jpeg']:
+            image = Image.open(file_path)
+            return np.array(image), image.mode
+        elif ext == '.czi':
+            with czifile.CziFile(file_path) as czi:
+                image = czi.asarray()
+                return image, image.dtype
+        elif ext == '.nd2':
+            with ND2Reader(file_path) as nd2:
+                image = np.array(nd2)
+                return image, image.dtype
+        else:
+            raise ValueError(f"Unsupported file extension: {ext}")
+    # Function to check if an image is grayscale and save it as a TIFF if it isn't already
+    def convert_grayscale_to_tiff(image, filename, folder, dtype):
+        """Convert grayscale images that are not in TIFF format to TIFF, preserving bit depth."""
+        base_name = os.path.splitext(filename)[0]
+        output_filename = os.path.join(folder, f"{base_name}.tif")
+        tifffile.imwrite(output_filename, image.astype(dtype))
+        print(f"Converted grayscale image {filename} to TIFF with bit depth {dtype}.")
+    # Supported formats
+    supported_formats = ['.tif', '.tiff', '.png', '.jpg', '.jpeg', '.czi', '.nd2']
+    # Loop through all files in the folder
+    for filename in os.listdir(folder):
+        file_path = os.path.join(folder, filename)
+        ext = os.path.splitext(file_path)[1].lower()
+        if ext in supported_formats:
+            print(f"Processing {filename}")
+            try:
+                # Load the image and its dtype
+                image, dtype = load_image(file_path)
+                # If the image is grayscale (2D), convert it to TIFF if it's not already in TIFF format
+                if image.ndim == 2:
+                    if ext not in ['.tif', '.tiff']:
+                        convert_grayscale_to_tiff(image, filename, folder, dtype)
+                    else:
+                        print(f"Image {filename} is already grayscale and in TIFF format, skipping.")
+                    continue
+                # Otherwise, split channels and save images
+                base_name = os.path.splitext(filename)[0]
+                split_channels(image, folder, base_name, dtype)
+            except Exception as e:
+                print(f"Error processing {filename}: {str(e)}")
 def _load_images_and_labels(image_files, label_files, circular=False, invert=False):
@@ -632,6 +735,20 @@ class TarImageDataset(Dataset):
             img = self.transform(img)
         return img, m.name
+def load_images_from_paths(images_by_key):
+    images_dict = {}
+    for key, paths in images_by_key.items():
+        images_dict[key] = []
+        for path in paths:
+            try:
+                with Image.open(path) as img:
+                    images_dict[key].append(np.array(img))
+            except Exception as e:
+                print(f"Error loading image from {path}: {e}")
+    return images_dict
 #@log_function_call
 def _rename_and_organize_image_files(src, regex, batch_size=100, pick_slice=False, skip_mode='01', metadata_type='', img_format='.tif'):
@@ -657,15 +774,20 @@ def _rename_and_organize_image_files(src, regex, batch_size=100, pick_slice=Fals
     files_processed = 0
     if not os.path.exists(stack_path) or (os.path.isdir(stack_path) and len(os.listdir(stack_path)) == 0):
         all_filenames = [filename for filename in os.listdir(src) if filename.endswith(img_format)]
-        print(f'All_files: {len(all_filenames)} in {src}')
+        print(f'All files: {len(all_filenames)} in {src}')
         time_ls = []
-        for idx in range(0, len(all_filenames), batch_size):
+        image_paths_by_key = _extract_filename_metadata(all_filenames, src, regular_expression, metadata_type, pick_slice, skip_mode)
+        # Convert dictionary keys to a list for batching
+        batching_keys = list(image_paths_by_key.keys())
+        print(f'All unique FOV: {len(image_paths_by_key)} in {src}')
+        for idx in range(0, len(image_paths_by_key), batch_size):
             start = time.time()
-            batch_filenames = all_filenames[idx:idx+batch_size]
-            for filename in batch_filenames:
-                images_by_key = _extract_filename_metadata(batch_filenames, src, regular_expression, metadata_type, pick_slice, skip_mode)
+            # Select batch keys and create a subset of the dictionary for this batch
+            batch_keys = batching_keys[idx:idx+batch_size]
+            batch_images_by_key = {key: image_paths_by_key[key] for key in batch_keys}
+            images_by_key = load_images_from_paths(batch_images_by_key)
             if pick_slice:
                 for i, key in enumerate(images_by_key):
                     plate, well, field, channel, mode = key
@@ -682,10 +804,10 @@ def _rename_and_organize_image_files(src, regex, batch_size=100, pick_slice=Fals
                     files_to_process = len(all_filenames)
                     print_progress(files_processed, files_to_process, n_jobs=1, time_ls=time_ls, batch_size=batch_size, operation_type='Preprocessing filenames')
-                    #if os.path.exists(output_path):
-                    #    print(f'WARNING: A file with the same name already exists at location {output_filename}')
                     if not os.path.exists(output_path):
                         mip_image.save(output_path)
+                    else:
+                        print(f'WARNING: A file with the same name already exists at location {output_filename}')
             else:
                 for i, (key, images) in enumerate(images_by_key.items()):
                     plate, well, field, channel = key[:4]
@@ -702,10 +824,11 @@ def _rename_and_organize_image_files(src, regex, batch_size=100, pick_slice=Fals
                     files_to_process = len(all_filenames)
                     print_progress(files_processed, files_to_process, n_jobs=1, time_ls=time_ls, batch_size=batch_size, operation_type='Preprocessing filenames')
-                    #if os.path.exists(output_path):
-                    #    print(f'WARNING: A file with the same name already exists at location {output_filename}')
                     if not os.path.exists(output_path):
                         mip_image.save(output_path)
+                    else:
+                        print(f'WARNING: A file with the same name already exists at location {output_filename}')
             images_by_key.clear()
         # Move original images to a new directory
@@ -862,47 +985,6 @@ def _move_to_chan_folder(src, regex, timelapse=False, metadata_type=''):
                     shutil.move(os.path.join(src, filename), move)
     return
-def _merge_channels_v2(src, plot=False):
-    from .plot import plot_arrays
-    """
-    Merge the channels in the given source directory and save the merged files in a 'stack' directory.
-    Args:
-        src (str): The path to the source directory containing the channel folders.
-        plot (bool, optional): Whether to plot the merged arrays. Defaults to False.
-    Returns:
-        None
-    """
-    src = Path(src)
-    stack_dir = src / 'stack'
-    chan_dirs = [d for d in src.iterdir() if d.is_dir() and d.name in ['01', '02', '03', '04', '00', '1', '2', '3', '4','0']]
-    chan_dirs.sort(key=lambda x: x.name)
-    print(f'List of folders in src: {[d.name for d in chan_dirs]}. Single channel folders.')
-    start_time = time.time()
-    # First directory and its files
-    dir_files = list(chan_dirs[0].iterdir())
-    # Create the 'stack' directory if it doesn't exist
-    stack_dir.mkdir(exist_ok=True)
-    print(f'generated folder with merged arrays: {stack_dir}')
-    if _is_dir_empty(stack_dir):
-        with Pool(max(cpu_count() // 2, 1)) as pool:
-        #with Pool(cpu_count()) as pool:
-            merge_func = partial(_merge_file, chan_dirs, stack_dir)
-            pool.map(merge_func, dir_files)
-    avg_time = (time.time() - start_time) / len(dir_files)
-    print(f'Average Time: {avg_time:.3f} sec')
-    if plot:
-        plot_arrays(src+'/stack')
-    return
 def _merge_channels(src, plot=False):
     """
     Merge the channels in the given source directory and save the merged files in a 'stack' directory without using multiprocessing.
@@ -961,9 +1043,7 @@ def _mip_all(src, include_first_chan=True):
     Returns:
         None
     """
-    from .utils import normalize_to_dtype
     #print('========== generating MIPs ==========')
     # Iterate over each file in the specified directory (src).
     for filename in os.listdir(src):
@@ -1337,7 +1417,6 @@ def _get_lists_for_normalization(settings):
     return backgrounds, signal_to_noise, signal_thresholds, remove_background
 def _normalize_stack(src, backgrounds=[100, 100, 100], remove_backgrounds=[False, False, False], lower_percentile=2, save_dtype=np.float32, signal_to_noise=[5, 5, 5], signal_thresholds=[1000, 1000, 1000]):
-    from .utils import print_progress
     """
     Normalize the stack of images.
@@ -1430,7 +1509,6 @@ def _normalize_stack(src, backgrounds=[100, 100, 100], remove_backgrounds=[False
     return print(f'Saved stacks: {output_fldr}')
 def _normalize_timelapse(src, lower_percentile=2, save_dtype=np.float32):
-    from .utils import print_progress
     """
     Normalize the timelapse data by rescaling the intensity values based on percentiles.
@@ -1559,7 +1637,7 @@ def delete_empty_subdirectories(folder_path):
 #@log_function_call
 def preprocess_img_data(settings):
-    from .plot import plot_arrays, _plot_4D_arrays
+    from .plot import plot_arrays
     from .utils import _run_test_mode, _get_regex
     from .settings import set_default_settings_preprocess_img_data
@@ -2054,7 +2132,6 @@ def _load_and_concatenate_arrays(src, channels, cell_chann_dim, nucleus_chann_di
                     padded_shapes = [shape + (0,) * (max_tuple_length - len(shape)) for shape in unique_shapes]
                     # Now create a NumPy array and find the maximum dimensions
                     max_dims = np.max(np.array(padded_shapes), axis=0)
-                    #clear_output(wait=True)
                     print(f'Warning: arrays with multiple shapes found. Padding arrays to max X,Y dimentions {max_dims}')
                     #print(f'Warning: arrays with multiple shapes found. Padding arrays to max X,Y dimentions {max_dims}', end='\r', flush=True)
                     padded_stack_ls = []
@@ -2102,7 +2179,7 @@ def _read_db(db_loc, tables):
     conn.close()
     return dfs
-def _read_and_merge_data(locs, tables, verbose=False, include_multinucleated=False, include_multiinfected=False, include_noninfected=False):
+def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=False, pathogen_limit=False, uninfected=False):
     """
     Read and merge data from SQLite databases and perform data preprocessing.
@@ -2110,9 +2187,9 @@ def _read_and_merge_data(locs, tables, verbose=False, include_multinucleated=Fal
     - locs (list): A list of file paths to the SQLite database files.
     - tables (list): A list of table names to read from the databases.
     - verbose (bool): Whether to print verbose output. Default is False.
-    - include_multinucleated (bool): Whether to include multinucleated cells. Default is False.
-    - include_multiinfected (bool): Whether to include cells with multiple infections. Default is False.
-    - include_noninfected (bool): Whether to include non-infected cells. Default is False.
+    - nuclei_limit (bool): Whether to include multinucleated cells. Default is False.
+    - pathogen_limit (bool): Whether to include cells with multiple infections. Default is False.
+    - uninfected (bool): Whether to include non-infected cells. Default is False.
     Returns:
     - merged_df (pandas.DataFrame): The merged and preprocessed dataframe.
@@ -2187,7 +2264,7 @@ def _read_and_merge_data(locs, tables, verbose=False, include_multinucleated=Fal
         nucleus = nucleus.assign(cell_id=lambda x: 'o' + x['cell_id'].astype(int).astype(str))
         nucleus = nucleus.assign(prcfo = lambda x: x['prcf'] + '_' + x['cell_id'])
         nucleus['nucleus_prcfo_count'] = nucleus.groupby('prcfo')['prcfo'].transform('count')
-        if include_multinucleated == False:
+        if nuclei_limit == False:
             #nucleus = nucleus[~nucleus['prcfo'].duplicated()]
             nucleus = nucleus[nucleus['nucleus_prcfo_count']==1]
         nucleus_g_df, _ = _split_data(nucleus, 'prcfo', 'cell_id')
@@ -2203,9 +2280,9 @@ def _read_and_merge_data(locs, tables, verbose=False, include_multinucleated=Fal
         pathogens = pathogens.assign(cell_id=lambda x: 'o' + x['cell_id'].astype(int).astype(str))
         pathogens = pathogens.assign(prcfo = lambda x: x['prcf'] + '_' + x['cell_id'])
         pathogens['pathogen_prcfo_count'] = pathogens.groupby('prcfo')['prcfo'].transform('count')
-        if include_noninfected == False:
+        if uninfected == False:
             pathogens = pathogens[pathogens['pathogen_prcfo_count']>=1]
-        if include_multiinfected == False:
+        if pathogen_limit == False:
             pathogens = pathogens[pathogens['pathogen_prcfo_count']<=1]
         pathogens_g_df, _ = _split_data(pathogens, 'prcfo', 'cell_id')
         print(f'pathogens: {len(pathogens)}')
@@ -2267,12 +2344,8 @@ def _results_to_csv(src, df, df_well):
     wells.to_csv(wells_loc, index=True, header=True)
     cells.to_csv(cells_loc, index=True, header=True)
     return cells, wells
-###################################################
-#  Classify
-###################################################
-def read_plot_model_stats(file_path ,save=False):
+def read_plot_model_stats(train_file_path, val_file_path ,save=False):
     def _plot_and_save(train_df, val_df, column='accuracy', save=False, path=None, dpi=600):
@@ -2301,37 +2374,19 @@ def read_plot_model_stats(file_path ,save=False):
             plt.savefig(pdf_path, format='pdf', dpi=dpi)
         else:
             plt.show()
-    # Read the CSV into a dataframe
-    df = pd.read_csv(file_path, index_col=0)
-    # Split the dataframe into train and validation based on the index
-    train_df = df.filter(like='_train', axis=0).copy()
-    val_df = df.filter(like='_val', axis=0).copy()
-    fldr_1 = os.path.dirname(file_path)
-    train_csv_path = os.path.join(fldr_1, 'train.csv')
-    val_csv_path = os.path.join(fldr_1, 'validation.csv')
-    fldr_2 = os.path.dirname(fldr_1)
-    fldr_3 = os.path.dirname(fldr_2)
-    bn_1 = os.path.basename(fldr_1)
-    bn_2 = os.path.basename(fldr_2)
-    bn_3 = os.path.basename(fldr_3)
-    model_name = str(f'{bn_1}_{bn_2}_{bn_3}')
+    # Read the CSVs into DataFrames
+    train_df = pd.read_csv(train_file_path, index_col=0)
+    val_df = pd.read_csv(val_file_path, index_col=0)
-    # Extract epochs from index
-    train_df['epoch'] = [int(idx.split('_')[0]) for idx in train_df.index]
-    val_df['epoch'] = [int(idx.split('_')[0]) for idx in val_df.index]
-    # Save dataframes to a CSV file
-    train_df.to_csv(train_csv_path)
-    val_df.to_csv(val_csv_path)
+    # Get the folder path for saving plots
+    fldr_1 = os.path.dirname(train_file_path)
     if save:
         # Setting the style
         sns.set(style="whitegrid")
+    # Plot and save the results
     _plot_and_save(train_df, val_df, column='accuracy', save=save, path=fldr_1)
     _plot_and_save(train_df, val_df, column='neg_accuracy', save=save, path=fldr_1)
     _plot_and_save(train_df, val_df, column='pos_accuracy', save=save, path=fldr_1)
@@ -2379,50 +2434,53 @@ def _save_model(model, model_type, results_df, dst, epoch, epochs, intermedeate_
     return model_path
-def _save_progress(dst, results_df, result_type='train'):
+def _save_progress(dst, train_df, validation_df):
     """
     Save the progress of the classification model.
     Parameters:
     dst (str): The destination directory to save the progress.
-    results_df (pandas.DataFrame): The DataFrame containing accuracy, loss, and PRAUC.
-    train_metrics_df (pandas.DataFrame): The DataFrame containing training metrics.
+    train_df (pandas.DataFrame): The DataFrame containing training stats.
+    validation_df (pandas.DataFrame): The DataFrame containing validation stats (if available).
     Returns:
     None
     """
+    def _save_df_to_csv(file_path, df):
+        """
+        Save the given DataFrame to the specified CSV file, either creating a new file or appending to an existing one.
+        Parameters:
+        file_path (str): The file path where the CSV will be saved.
+        df (pandas.DataFrame): The DataFrame to save.
+        """
+        if not os.path.exists(file_path):
+            with open(file_path, 'w') as f:
+                df.to_csv(f, index=True, header=True)
+                f.flush()  # Ensure data is written to the file system
+        else:
+            with open(file_path, 'a') as f:
+                df.to_csv(f, index=True, header=False)
+                f.flush()
     # Save accuracy, loss, PRAUC
     os.makedirs(dst, exist_ok=True)
-    results_path = os.path.join(dst, f'{result_type}.csv')
-    if not os.path.exists(results_path):
-        results_df.to_csv(results_path, index=True, header=True, mode='w')
-    else:
-        results_df.to_csv(results_path, index=True, header=False, mode='a')
+    results_path_train = os.path.join(dst, 'train.csv')
+    results_path_validation = os.path.join(dst, 'validation.csv')
-    if result_type == 'train':
-        read_plot_model_stats(results_path, save=True)
-    return
+    # Save training data
+    _save_df_to_csv(results_path_train, train_df)
-def _save_settings(settings, src):
-    """
-    Save the settings dictionary to a CSV file.
+    # Save validation data if available
+    if validation_df is not None:
+        _save_df_to_csv(results_path_validation, validation_df)
-    Parameters:
-    - settings (dict): A dictionary containing the settings.
-    - src (str): The source directory where the settings file will be saved.
+        # Call read_plot_model_stats after ensuring the files are saved
+        read_plot_model_stats(results_path_train, results_path_validation, save=True)
-    Returns:
-    None
-    """
-    dst = os.path.join(src,'model')
-    settings_loc =  os.path.join(dst,'settings.csv')
-    os.makedirs(dst, exist_ok=True)
-    settings_df = pd.DataFrame(list(settings.items()), columns=['setting_key', 'setting_value'])
-    display(settings_df)
-    settings_df.to_csv(settings_loc, index=False)
     return
 def _copy_missclassified(df):
     misclassified = df[df['true_label'] != df['predicted_label']]
     for _, row in misclassified.iterrows():
@@ -2448,7 +2506,7 @@ def _read_db(db_loc, tables):
     conn.close() # Close the connection
     return dfs
-def _read_and_merge_data(locs, tables, verbose=False, include_multinucleated=False, include_multiinfected=False, include_noninfected=False):
+def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=False, pathogen_limit=False, uninfected=False):
     from .utils import _split_data
@@ -2533,7 +2591,7 @@ def _read_and_merge_data(locs, tables, verbose=False, include_multinucleated=Fal
         nucleus = nucleus.assign(cell_id=lambda x: 'o' + x['cell_id'].astype(int).astype(str))
         nucleus = nucleus.assign(prcfo = lambda x: x['prcf'] + '_' + x['cell_id'])
         nucleus['nucleus_prcfo_count'] = nucleus.groupby('prcfo')['prcfo'].transform('count')
-        if include_multinucleated == False:
+        if nuclei_limit == False:
             nucleus = nucleus[nucleus['nucleus_prcfo_count']==1]
         nucleus_g_df, _ = _split_data(nucleus, 'prcfo', 'cell_id')
         if verbose:
@@ -2559,20 +2617,30 @@ def _read_and_merge_data(locs, tables, verbose=False, include_multinucleated=Fal
         pathogens = pathogens.assign(cell_id=lambda x: 'o' + x['cell_id'].astype(int).astype(str))
         pathogens = pathogens.assign(prcfo = lambda x: x['prcf'] + '_' + x['cell_id'])
         pathogens['pathogen_prcfo_count'] = pathogens.groupby('prcfo')['prcfo'].transform('count')
-        if include_noninfected == False:
+        print(f"before noninfected: {len(pathogens)}")
+        if uninfected == False:
             pathogens = pathogens[pathogens['pathogen_prcfo_count']>=1]
-        if isinstance(include_multiinfected, bool):
-            if include_multiinfected == False:
+            print(f"after noninfected: {len(pathogens)}")
+        if isinstance(pathogen_limit, bool):
+            if pathogen_limit == False:
                 pathogens = pathogens[pathogens['pathogen_prcfo_count']<=1]
-        if isinstance(include_multiinfected, float):
-            pathogens = pathogens[pathogens['pathogen_prcfo_count']<=include_multiinfected]
+                print(f"after multiinfected Bool: {len(pathogens)}")
+        if isinstance(pathogen_limit, float):
+            pathogen_limit = int(pathogen_limit)
+        if isinstance(pathogen_limit, int):
+            pathogens = pathogens[pathogens['pathogen_prcfo_count']<=pathogen_limit]
+            print(f"afer multiinfected Float: {len(pathogens)}")
         if not 'cell' in tables:
             pathogens_g_df, metadata = _split_data(pathogens, 'prcfo', 'cell_id')
         else:
             pathogens_g_df, _ = _split_data(pathogens, 'prcfo', 'cell_id')
         if verbose:
             print(f'pathogens: {len(pathogens)}')
             print(f'pathogens grouped: {len(pathogens_g_df)}')
         if len(merged_df) == 0:
             merged_df = pathogens_g_df
         else:
@@ -2697,4 +2765,475 @@ def generate_cellpose_train_test(src, test_split=0.1):
             shutil.copy(img_path, new_img_path)
             shutil.copy(mask_path, new_mask_path)
             print(f'Copied {idx+1}/{len(ls)} images to {_type} set')#, end='\r', flush=True)
def parse_gz_files(folder_path):
    """
    Collect the .fastq.gz files in a folder, grouped by sample name.

    File names are split on '_': the first token is the sample name and the
    second token is the read direction. Only the exact tokens 'R1' and 'R2'
    are recorded. A file with any other second token still registers its
    sample, but adds no path to it. A file name without an underscore cannot
    be assigned a read direction and is skipped with a message.

    Args:
        folder_path (str): The path to the folder containing the .fastq.gz files.

    Returns:
        dict: Maps each sample name to a dict with up to two keys, 'R1' and
        'R2', whose values are `folder_path` joined with the file name.
    """
    samples_dict = {}
    # sorted() keeps the result independent of the OS directory order, which
    # matters when several files share a sample and read direction (last wins).
    for gz_file in sorted(os.listdir(folder_path)):
        if not gz_file.endswith('.fastq.gz'):
            continue
        parts = gz_file.split('_')
        if len(parts) < 2:
            print(f"Skipping {gz_file}: expected '<sample>_<R1|R2>_...' naming")
            continue
        sample_name, read_direction = parts[0], parts[1]
        sample_reads = samples_dict.setdefault(sample_name, {})
        if read_direction in ('R1', 'R2'):
            sample_reads[read_direction] = os.path.join(folder_path, gz_file)
    return samples_dict
def generate_dataset(settings=None):
    """
    Bundle images referenced by one or more measurement databases into a tar file.

    For every source folder the image paths are read from
    `<src>/measurements/measurements.db`, optionally subsampled, distributed
    over a pool of worker processes that each write a temporary tar file, and
    finally merged into a single tar under `<first src>/datasets`.

    Args:
        settings (dict, optional): Run settings; missing keys are filled in by
            `set_generate_dataset_defaults`. Keys read here:
            - 'src' (str | list[str]): source folder(s).
            - 'file_metadata': forwarded to `generate_path_list_from_db`.
            - 'sample' (int | list[int] | other): an int samples that many
              paths from all sources combined; a list gives one sample size
              per source (same order as 'src'); any other value keeps every
              path. Requested sizes larger than what is available are clamped.
            - 'experiment' (str): used in the tar file name.

    Returns:
        str: Path of the final tar file.

    Raises:
        ValueError: If 'src' is neither a path nor a non-empty list of paths,
            or if a list-valued 'sample' does not have one entry per source.
    """
    from .utils import initiate_counter, add_images_to_tar, save_settings, generate_path_list_from_db, correct_paths
    from .settings import set_generate_dataset_defaults

    # A None default avoids sharing one mutable dict between calls.
    if settings is None:
        settings = {}
    settings = set_generate_dataset_defaults(settings)
    save_settings(settings, 'generate_dataset', show=True)

    if isinstance(settings['src'], str):
        settings['src'] = [settings['src']]
    sources = settings['src']
    if not isinstance(sources, list) or not sources:
        raise ValueError("settings['src'] must be a path or a non-empty list of paths")

    # The archive is written next to the first source.
    dst = os.path.join(sources[0], 'datasets')

    paths_per_src = []
    for src in sources:
        db_path = os.path.join(src, 'measurements', 'measurements.db')
        paths = generate_path_list_from_db(db_path, file_metadata=settings['file_metadata'])
        # NOTE(review): the return value of correct_paths is discarded here, as
        # in the original code. Confirm it corrects `paths` in place.
        correct_paths(paths, src)
        paths_per_src.append(list(paths))
    all_paths = [path for paths in paths_per_src for path in paths]

    sample = settings['sample']
    if isinstance(sample, int):
        # Clamp so that asking for more images than exist does not raise.
        selected_paths = random.sample(all_paths, min(sample, len(all_paths)))
        print(f"Random selection of {len(selected_paths)} paths")
    elif isinstance(sample, list):
        if len(sample) != len(paths_per_src):
            raise ValueError("settings['sample'] must have one entry per source in settings['src']")
        selected_paths = []
        for count, paths in zip(sample, paths_per_src):
            selected_paths.extend(random.sample(paths, min(count, len(paths))))
        # Mix the sources so that no temporary tar holds a single source only.
        random.shuffle(selected_paths)
        print(f"Random selection of {len(selected_paths)} paths")
    else:
        selected_paths = all_paths
        random.shuffle(selected_paths)
        print(f"All paths: {len(selected_paths)} paths")

    total_images = len(selected_paths)
    print(f"Found {total_images} images")

    # Create a temp folder in dst
    temp_dir = os.path.join(dst, "temp_tars")
    os.makedirs(temp_dir, exist_ok=True)

    # Chunking the data: never start more workers than there are images.
    num_procs = max(1, min(max(2, cpu_count() - 2), total_images))
    chunk_size, remainder = divmod(total_images, num_procs)
    paths_chunks = []
    start = 0
    for i in range(num_procs):
        # The first `remainder` chunks take one extra path each.
        end = start + chunk_size + (1 if i < remainder else 0)
        paths_chunks.append(selected_paths[start:end])
        start = end

    temp_tar_files = [os.path.join(temp_dir, f"temp_{i}.tar") for i in range(num_procs)]
    print(f"Generating temporary tar files in {dst}")

    # Initialize shared counter and lock
    counter = Value('i', 0)
    lock = Lock()
    with Pool(processes=num_procs, initializer=initiate_counter, initargs=(counter, lock)) as pool:
        pool.starmap(add_images_to_tar, [(chunk, tar_path, total_images) for chunk, tar_path in zip(paths_chunks, temp_tar_files)])

    # Combine the temporary tar files into a final tar
    date_name = datetime.date.today().strftime('%y%m%d')
    if len(sources) > 1:
        date_name = f"{date_name}_combined"
    base_name = f"{date_name}_{settings['experiment']}"
    tar_name = os.path.join(dst, f"{base_name}.tar")
    if os.path.exists(tar_name):
        existing_name = os.path.basename(tar_name)
        # Count upwards until a free name is found, so that an existing
        # archive is never overwritten.
        number = 1
        while True:
            tar_name_2 = f"{base_name}_{settings['file_metadata']}_{number}.tar"
            if not os.path.exists(os.path.join(dst, tar_name_2)):
                break
            number += 1
        print(f"Warning: {existing_name} exists, saving as {tar_name_2} ")
        tar_name = os.path.join(dst, tar_name_2)

    print(f"Merging temporary files")
    with tarfile.open(tar_name, 'w') as final_tar:
        for temp_tar_path in temp_tar_files:
            with tarfile.open(temp_tar_path, 'r') as temp_tar:
                for member in temp_tar.getmembers():
                    file_obj = temp_tar.extractfile(member)
                    final_tar.addfile(member, file_obj)
            os.remove(temp_tar_path)

    # Delete the temp folder
    shutil.rmtree(temp_dir)
    print(f"\nSaved {total_images} images to {tar_name}")
    return tar_name
def generate_loaders(src, mode='train', image_size=224, batch_size=32, classes=['nc','pc'], n_jobs=None, validation_split=0.0, pin_memory=False, normalize=False, channels=[1, 2, 3], augment=False, verbose=False):
    """
    Generate data loaders for training and validation/test datasets.

    Parameters:
    - src (str): The source directory; data is read from `<src>/train` or `<src>/test`.
    - mode (str): The mode of operation. Options are 'train' or 'test'.
    - image_size (int): Side length of the square centre crop applied to every image.
    - batch_size (int): The batch size for the data loaders.
    - classes (list): The list of classes to consider (passed to spacrDataset; not modified here).
    - n_jobs (int): The number of worker processes for data loading. None loads in the main process.
    - validation_split (float): The fraction of data to use for validation. Forced to 0.0 in 'test' mode.
    - pin_memory (bool): Passed to spacrDataset and to every DataLoader.
    - normalize (bool): Whether to normalize with mean 0.5 and std 0.5 per channel.
    - channels (list): The channels to retain, given as letters ('r', 'g', 'b') and/or
      numbers (1 = red, 2 = green, 3 = blue). The result is always ordered 1, 2, 3.
    - augment (bool): Whether to pass the training split through `augment_dataset`.
      Only applied when validation_split > 0.
    - verbose (bool): Whether to print the selected channels.

    Returns:
    - train_loaders (DataLoader): Loader for the training (or test) data.
    - val_loaders (DataLoader | list): Loader for the validation split, or an empty
      list when validation_split is 0.
    - train_fig (None): Placeholder for a preview figure; always None.

    If `mode` is neither 'train' nor 'test', a message is printed and None is returned.
    """
    from .utils import SelectChannels, augment_dataset

    # Accept channel numbers as well as letter codes. The default [1, 2, 3]
    # previously matched none of the 'r'/'g'/'b' tests and selected no channel.
    channel_ids = {'r': 1, 'g': 2, 'b': 3}
    requested = list(channels)
    channels = [idx for key, idx in channel_ids.items() if key in requested or idx in requested]

    if verbose:
        print(f'Training a network on channels: {channels}')
        print(f'Channel 1: Red, Channel 2: Green, Channel 3: Blue')

    train_loaders = []
    val_loaders = []

    steps = [
        transforms.ToTensor(),
        transforms.CenterCrop(size=(image_size, image_size)),
        SelectChannels(channels)]
    if normalize:
        steps.append(transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)))
    transform = transforms.Compose(steps)

    if mode == 'train':
        print('Loading Train and validation datasets')
    elif mode == 'test':
        validation_split = 0.0
        print('Loading test dataset')
    else:
        print(f'mode:{mode} is not valid, use mode = train or test')
        return
    data_dir = os.path.join(src, mode)
    shuffle = True

    data = spacrDataset(data_dir, classes, transform=transform, shuffle=shuffle, pin_memory=pin_memory)

    # Honour n_jobs instead of a hard-coded worker count. persistent_workers
    # is only valid when at least one worker process exists.
    num_workers = n_jobs if n_jobs is not None else 0
    loader_kwargs = dict(batch_size=batch_size,
                         shuffle=shuffle,
                         num_workers=num_workers,
                         pin_memory=pin_memory,
                         persistent_workers=num_workers > 0)

    if validation_split > 0:
        train_size = int((1 - validation_split) * len(data))
        val_size = len(data) - train_size
        if not augment:
            print(f'Train data:{train_size}, Validation data:{val_size}')
        train_dataset, val_dataset = random_split(data, [train_size, val_size])
        if augment:
            print(f'Data before augmentation: Train: {len(train_dataset)}, Validataion:{len(val_dataset)}')
            train_dataset = augment_dataset(train_dataset, is_grayscale=(len(channels) == 1))
            print(f'Data after augmentation: Train: {len(train_dataset)}')
        print(f'Generating Dataloader with {num_workers} workers')
        train_loaders = DataLoader(train_dataset, **loader_kwargs)
        val_loaders = DataLoader(val_dataset, **loader_kwargs)
    else:
        train_loaders = DataLoader(data, **loader_kwargs)

    # No preview figure is generated; the slot is kept so callers can still
    # unpack three values.
    train_fig = None
    return train_loaders, val_loaders, train_fig
def generate_training_dataset(settings):
    """
    Build train/test image folders from one or more measurement databases.

    For every source folder, image paths are selected from
    `<src>/measurements/measurements.db` according to settings['dataset_mode']:
    - 'annotation': by the values of an annotation column in the png_list table.
    - 'metadata': by matching the settings['metadata_type_by'] column against
      the value lists in settings['class_metadata'].
    - 'measurement': by the lower and upper quartile of a 'recruitment' score.
    Within each source the classes are reduced to the same number of images;
    the per-class lists of all sources are then concatenated and handed to
    `generate_dataset_from_lists`.

    Args:
        settings (dict): Run settings; missing keys are filled in by
            `set_generate_training_dataset_defaults`. 'src' may be a single
            path or a list of paths.

    Returns:
        tuple: (train_dir, test_dir) as returned by `generate_dataset_from_lists`.

    Raises:
        ValueError: If 'src' is empty, 'dataset_mode' is unknown, or
            'custom_measurement' is set but is not a list.
    """
    from .io import _read_and_merge_data, _read_db
    from .utils import get_paths_from_db, annotate_conditions, save_settings
    from .settings import set_generate_training_dataset_defaults

    def read_measurements(db_path):
        """Return the merged measurement table of one database."""
        df, _ = _read_and_merge_data(locs=[db_path],
                                     tables=['cell', 'nucleus', 'pathogen', 'cytoplasm'],
                                     verbose=False,
                                     nuclei_limit=settings['nuclei_limit'],
                                     pathogen_limit=settings['pathogen_limit'],
                                     uninfected=settings['uninfected'])
        return df

    def filter_png_list(db_path, df=None):
        """Return the png_list rows whose 'prcfo' occurs in the index of `df`."""
        if df is None:
            df = read_measurements(db_path)
        [png_list_df] = _read_db(db_loc=db_path, tables=['png_list'])
        # copy() lets callers add columns without writing into a slice.
        return png_list_df[png_list_df['prcfo'].isin(df.index)].copy()

    def downsample_to_smallest(class_paths_ls):
        """Randomly reduce every class to the size of the smallest one."""
        size = min(len(paths) for paths in class_paths_ls)
        print(f'Using the smallest class size: {size}')
        return [random.sample(paths, size) if len(paths) > size else paths
                for paths in class_paths_ls]

    # Measurement-based selection logic
    def measurement_based_selection(db_path):
        df = read_measurements(db_path)
        print('length df 1', len(df))
        # Reuse the table that was just read instead of merging it a second time.
        png_list_df = filter_png_list(db_path, df)
        df = annotate_conditions(df, cells=['HeLa'], pathogens=['pathogen'], treatments=settings['classes'],
                                 treatment_loc=settings['class_metadata'])
        print('length df 2', len(df))

        custom = settings['custom_measurement']
        if custom:
            if not isinstance(custom, list):
                raise ValueError("custom_measurement should be a list.")
            if len(custom) == 2:
                # Two columns: score is their ratio.
                df['recruitment'] = df[f"{custom[0]}"] / df[f"{custom[1]}"]
            else:
                df['recruitment'] = df[f"{custom[0]}"]
        else:
            channel = settings['channel_of_interest']
            df['recruitment'] = df[f"pathogen_channel_{channel}_mean_intensity"] / df[f"cytoplasm_channel_{channel}_mean_intensity"]

        q25 = df['recruitment'].quantile(0.25)
        q75 = df['recruitment'].quantile(0.75)
        quartile_paths = []
        for subset in (df[df['recruitment'] <= q25], df[df['recruitment'] >= q75]):
            paths_df = get_paths_from_db(df=subset, png_df=png_list_df, image_type=settings['png_type'])
            quartile_paths.append(paths_df['png_path'].tolist())

        # Clamp to what is available so random.sample cannot raise, and keep
        # both quartiles the same size.
        size = min([settings['size']] + [len(paths) for paths in quartile_paths])
        if size < settings['size']:
            print(f"Requested {settings['size']} images per class, only {size} available")
        return [random.sample(paths, size) for paths in quartile_paths]

    # Metadata-based selection logic
    def metadata_based_selection(db_path):
        df = filter_png_list(db_path)
        df['metadata_based_class'] = pd.NA
        for i, class_ in enumerate(settings['classes']):
            ls = settings['class_metadata'][i]
            df.loc[df[settings['metadata_type_by']].isin(ls), 'metadata_based_class'] = class_
        class_paths_ls = []
        for class_ in settings['classes']:
            class_temp_df = df[df['metadata_based_class'] == class_]
            print(f'Found {len(class_temp_df)} images for class {class_}')
            class_paths_ls.append(class_temp_df['png_path'].tolist())
        return downsample_to_smallest(class_paths_ls)

    # Annotation-based selection logic
    def annotation_based_selection(db_path, dst):
        class_paths_ls = training_dataset_from_annotation(db_path, dst, settings['annotation_column'], annotated_classes=settings['annotated_classes'])
        return downsample_to_smallest(class_paths_ls)

    # Set default settings and save
    settings = set_generate_training_dataset_defaults(settings)
    save_settings(settings, 'cv_dataset', show=True)

    # Wrap a single path in a list; iterating the bare string would yield
    # its characters.
    sources = settings['src']
    if isinstance(sources, str):
        sources = [sources]
    if not sources:
        raise ValueError("settings['src'] must contain at least one source folder")

    mode = settings['dataset_mode']
    if mode not in ('annotation', 'metadata', 'measurement'):
        raise ValueError(f"Unknown dataset_mode: {mode}. Use 'annotation', 'metadata' or 'measurement'.")

    # The dataset is written next to the first source. An existing folder is
    # never reused: a numeric suffix is appended to the base name instead.
    folder = 'training_all' if len(sources) > 1 else 'training'
    base_dst = os.path.join(sources[0], 'datasets', folder)
    dst = base_dst
    suffix = 1
    while os.path.exists(dst):
        dst = f'{base_dst}_{suffix}'
        suffix += 1
    if dst != base_dst:
        print(f'Creating new directory for training: {dst}')

    class_path_list = None
    for src in sources:
        db_path = os.path.join(src, 'measurements', 'measurements.db')

        # Select dataset based on dataset mode
        if mode == 'annotation':
            class_paths_ls = annotation_based_selection(db_path, dst)
        elif mode == 'metadata':
            class_paths_ls = metadata_based_selection(db_path)
        else:
            class_paths_ls = measurement_based_selection(db_path)

        if class_path_list is None:
            class_path_list = [[] for _ in class_paths_ls]

        # Extend each list in class_path_list with the corresponding list from class_paths_ls
        for collected, new_paths in zip(class_path_list, class_paths_ls):
            collected.extend(new_paths)

    # Generate and return training and testing directories
    train_class_dir, test_class_dir = generate_dataset_from_lists(dst, class_data=class_path_list, classes=settings['classes'], test_split=settings['test_split'])
    return train_class_dir, test_class_dir
def training_dataset_from_annotation(db_path, dst, annotation_column='test', annotated_classes=(1, 2)):
    """
    Read image paths from the png_list table, grouped by annotation value.

    Args:
        db_path (str): Path to the SQLite database containing a 'png_list' table
            with a 'png_path' column.
        dst (str): Unused; kept so existing callers keep working.
        annotation_column (str): Name of the png_list column holding the annotation.
            Matched case-insensitively against the table's columns.
        annotated_classes (sequence): Annotation values to collect, one output
            list per value.

    Returns:
        list[list[str]]: One list of png paths per entry of `annotated_classes`,
        in the same order. A class without matching rows yields an empty list.

    Raises:
        sqlite3.OperationalError: If png_list has no column named `annotation_column`.
    """
    from contextlib import closing

    annotated_classes = tuple(annotated_classes)
    paths_by_class = {class_: [] for class_ in annotated_classes}

    # Connect to the database and retrieve the image paths and annotations
    print(f'Reading DataBase: {db_path}')
    # sqlite3's own context manager only scopes a transaction; closing() is
    # what actually releases the database file.
    with closing(sqlite3.connect(db_path)) as conn:
        # Resolve the column against the real schema before putting it into
        # the SQL text. Identifiers cannot be bound as '?' parameters.
        known_columns = {row[1].lower(): row[1] for row in conn.execute('PRAGMA table_info(png_list)')}
        column = known_columns.get(str(annotation_column).lower())
        if column is None:
            raise sqlite3.OperationalError(f'no such column in png_list: {annotation_column}')
        quoted_column = '"' + column.replace('"', '""') + '"'

        # Prepare the query with parameterized placeholders for annotated_classes
        placeholders = ','.join('?' * len(annotated_classes))
        query = f'SELECT png_path, {quoted_column} FROM png_list WHERE {quoted_column} IN ({placeholders})'
        for path, annotation in conn.execute(query, annotated_classes):
            # SQLite may match a value after type conversion (e.g. '1' vs 1);
            # keep only rows whose returned value equals a requested class.
            bucket = paths_by_class.get(annotation)
            if bucket is not None:
                bucket.append(path)

    class_paths = [list(paths_by_class[class_]) for class_ in annotated_classes]
    print(f'Generated a list of lists from annotation of {len(class_paths)} classes')
    return class_paths
def generate_dataset_from_lists(dst, class_data, classes, test_split=0.1):
    """
    Copy per-class image files into `<dst>/train/<class>` and `<dst>/test/<class>`.

    Each class list is split with `train_test_split` (fixed random_state=42).
    Files are copied under their base name, so two inputs with the same base
    name in one class end up as a single file.

    Args:
        dst (str): Root folder of the dataset; created if necessary.
        class_data (list[list[str]]): One list of file paths per class.
        classes (list): Class names, in the same order as `class_data`.
        test_split (float): Share of each class that goes to the test folder.
            A class with fewer than two files, or a test_split of 0, is copied
            to the train folder entirely.

    Returns:
        tuple: (`<dst>/train`, `<dst>/test`).

    Raises:
        ValueError: If `class_data` and `classes` differ in length.
    """
    from .utils import print_progress

    # Make sure that the length of class_data matches the length of classes
    if len(class_data) != len(classes):
        raise ValueError("class_data and classes must have the same length.")

    total_files = sum(len(data) for data in class_data)
    processed_files = 0
    train_root = os.path.join(dst, 'train')
    test_root = os.path.join(dst, 'test')

    for cls, data in zip(classes, class_data):
        # Create directories
        train_class_dir = os.path.join(train_root, f'{cls}')
        test_class_dir = os.path.join(test_root, f'{cls}')
        os.makedirs(train_class_dir, exist_ok=True)
        os.makedirs(test_class_dir, exist_ok=True)

        # Split the data. train_test_split rejects inputs it cannot divide
        # into two non-empty parts, so those go to the train set as a whole.
        if len(data) < 2 or not test_split:
            train_data, test_data = list(data), []
        else:
            train_data, test_data = train_test_split(data, test_size=test_split, shuffle=True, random_state=42)

        for label, paths, target_dir in (('Train', train_data, train_class_dir),
                                         ('Test', test_data, test_class_dir)):
            for path in paths:
                shutil.copy(path, os.path.join(target_dir, os.path.basename(path)))
                # Count first so the last report reads total/total.
                processed_files += 1
                print_progress(processed_files, total_files, n_jobs=1, time_ls=None, batch_size=None, operation_type=f"Copying files for {label} dataset")

    # Print summary
    for cls in classes:
        train_class_dir = os.path.join(train_root, f'{cls}')
        test_class_dir = os.path.join(test_root, f'{cls}')
        print(f'Train class {cls}: {len(os.listdir(train_class_dir))}, Test class {cls}: {len(os.listdir(test_class_dir))}')

    return train_root, test_root

spacr 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

spacr 0.3.1py3-none-any.whl → 0.3.3py3-none-any.whl