spacr 0.0.20__py3-none-any.whl → 0.0.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
spacr/core.py CHANGED
@@ -1,11 +1,13 @@
- import os, sqlite3, gc, torch, time, random, shutil, cv2, tarfile, datetime
+ import os, sqlite3, gc, torch, time, random, shutil, cv2, tarfile, datetime, shap, string

  # image and array processing
  import numpy as np
  import pandas as pd

+ from cellpose import train
  import cellpose
  from cellpose import models as cp_models
+ from cellpose.models import CellposeModel

  import statsmodels.formula.api as smf
  import statsmodels.api as sm
@@ -27,9 +29,17 @@ matplotlib.use('Agg')

  import torchvision.transforms as transforms
  from sklearn.model_selection import train_test_split
- from sklearn.ensemble import IsolationForest
+ from sklearn.ensemble import IsolationForest, RandomForestClassifier, HistGradientBoostingClassifier
  from .logger import log_function_call

+ from sklearn.linear_model import LogisticRegression
+ from sklearn.inspection import permutation_importance
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
+ from xgboost import XGBClassifier
+
+ from scipy.spatial.distance import cosine, euclidean, mahalanobis, cityblock, minkowski, chebyshev, hamming, jaccard, braycurtis
+ from sklearn.preprocessing import StandardScaler
+ import shap

  def analyze_plaques(folder):
      summary_data = []
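The expanded scikit-learn/XGBoost/SHAP imports above back the classifier and feature-attribution work added in this release. As orientation, this is how such pieces are conventionally combined; the snippet below is a self-contained sketch on synthetic data, not spacr's own API:

    import numpy as np
    import shap
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.inspection import permutation_importance
    from sklearn.preprocessing import StandardScaler

    rng = np.random.default_rng(0)
    X = StandardScaler().fit_transform(rng.normal(size=(200, 8)))  # 200 cells x 8 features
    y = (X[:, 0] + 0.5 * X[:, 3] > 0).astype(int)                  # synthetic binary phenotype

    model = RandomForestClassifier(n_estimators=100, random_state=0).fit(X, y)

    # Model-agnostic importance: accuracy drop when each feature is shuffled
    perm = permutation_importance(model, X, y, n_repeats=5, random_state=0)

    # SHAP attributions: per-sample, per-feature contributions from the fitted trees
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)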
@@ -67,74 +77,6 @@ def analyze_plaques(folder):

      print(f"Analysis completed and saved to database '{db_name}'.")

- def compare_masks(dir1, dir2, dir3, verbose=False):
-
-     from .io import _read_mask
-     from .plot import visualize_masks, plot_comparison_results
-     from .utils import extract_boundaries, boundary_f1_score, compute_segmentation_ap, jaccard_index, dice_coefficient
-
-     filenames = os.listdir(dir1)
-     results = []
-     cond_1 = os.path.basename(dir1)
-     cond_2 = os.path.basename(dir2)
-     cond_3 = os.path.basename(dir3)
-     for index, filename in enumerate(filenames):
-         print(f'Processing image:{index+1}', end='\r', flush=True)
-         path1, path2, path3 = os.path.join(dir1, filename), os.path.join(dir2, filename), os.path.join(dir3, filename)
-         if os.path.exists(path2) and os.path.exists(path3):
-
-             mask1, mask2, mask3 = _read_mask(path1), _read_mask(path2), _read_mask(path3)
-             boundary_true1, boundary_true2, boundary_true3 = extract_boundaries(mask1), extract_boundaries(mask2), extract_boundaries(mask3)
-
-             true_masks, pred_masks = [mask1], [mask2, mask3] # Assuming mask1 is the ground truth for simplicity
-             true_labels, pred_labels_1, pred_labels_2 = label(mask1), label(mask2), label(mask3)
-             average_precision_0, average_precision_1 = compute_segmentation_ap(mask1, mask2), compute_segmentation_ap(mask1, mask3)
-             ap_scores = [average_precision_0, average_precision_1]
-
-             if verbose:
-                 unique_values1, unique_values2, unique_values3 = np.unique(mask1), np.unique(mask2), np.unique(mask3)
-                 print(f"Unique values in mask 1: {unique_values1}, mask 2: {unique_values2}, mask 3: {unique_values3}")
-                 visualize_masks(boundary_true1, boundary_true2, boundary_true3, title=f"Boundaries - {filename}")
-
-             boundary_f1_12, boundary_f1_13, boundary_f1_23 = boundary_f1_score(mask1, mask2), boundary_f1_score(mask1, mask3), boundary_f1_score(mask2, mask3)
-
-             if (np.unique(mask1).size == 1 and np.unique(mask1)[0] == 0) and \
-                (np.unique(mask2).size == 1 and np.unique(mask2)[0] == 0) and \
-                (np.unique(mask3).size == 1 and np.unique(mask3)[0] == 0):
-                 continue
-
-             if verbose:
-                 unique_values4, unique_values5, unique_values6 = np.unique(boundary_f1_12), np.unique(boundary_f1_13), np.unique(boundary_f1_23)
-                 print(f"Unique values in boundary mask 1: {unique_values4}, mask 2: {unique_values5}, mask 3: {unique_values6}")
-                 visualize_masks(mask1, mask2, mask3, title=filename)
-
-             jaccard12 = jaccard_index(mask1, mask2)
-             dice12 = dice_coefficient(mask1, mask2)
-             jaccard13 = jaccard_index(mask1, mask3)
-             dice13 = dice_coefficient(mask1, mask3)
-             jaccard23 = jaccard_index(mask2, mask3)
-             dice23 = dice_coefficient(mask2, mask3)
-
-             results.append({
-                 'filename': filename,
-                 f'jaccard_{cond_1}_{cond_2}': jaccard12,
-                 f'dice_{cond_1}_{cond_2}': dice12,
-                 f'jaccard_{cond_1}_{cond_3}': jaccard13,
-                 f'dice_{cond_1}_{cond_3}': dice13,
-                 f'jaccard_{cond_2}_{cond_3}': jaccard23,
-                 f'dice_{cond_2}_{cond_3}': dice23,
-                 f'boundary_f1_{cond_1}_{cond_2}': boundary_f1_12,
-                 f'boundary_f1_{cond_1}_{cond_3}': boundary_f1_13,
-                 f'boundary_f1_{cond_2}_{cond_3}': boundary_f1_23,
-                 f'average_precision_{cond_1}_{cond_2}': ap_scores[0],
-                 f'average_precision_{cond_1}_{cond_3}': ap_scores[1]
-             })
-         else:
-             print(f'Cannot find {path1} or {path2} or {path3}')
-     fig = plot_comparison_results(results)
-     return results, fig
-
  def generate_cp_masks(settings):

      src = settings['src']
@@ -177,8 +119,146 @@ def train_cellpose(settings):
      from .utils import resize_images_and_labels

      img_src = settings['img_src']
-     mask_src= settings['mask_src']
-     secondary_image_dir = None
+     mask_src = os.path.join(img_src, 'mask')
+
+     model_name = settings['model_name']
+     model_type = settings['model_type']
+     learning_rate = settings['learning_rate']
+     weight_decay = settings['weight_decay']
+     batch_size = settings['batch_size']
+     n_epochs = settings['n_epochs']
+     from_scratch = settings['from_scratch']
+     diameter = settings['diameter']
+     verbose = settings['verbose']
+
+     channels = [0,0]
+     signal_thresholds = 1000
+     normalize = True
+     percentiles = [2,98]
+     circular = False
+     invert = False
+     resize = False
+     settings['width_height'] = [1000,1000]
+     target_height = settings['width_height'][1]
+     target_width = settings['width_height'][0]
+     rescale = False
+     grayscale = True
+     test = False
+
+     if test:
+         test_img_src = os.path.join(os.path.dirname(img_src), 'test')
+         test_mask_src = os.path.join(test_img_src, 'mask')
+
+     test_images, test_masks, test_image_names, test_mask_names = None, None, None, None
+     print(settings)
+
+     if from_scratch:
+         model_name = f'scratch_{model_name}_{model_type}_e{n_epochs}_X{target_width}_Y{target_height}.CP_model'
+     else:
+         if resize:
+             model_name = f'{model_name}_{model_type}_e{n_epochs}_X{target_width}_Y{target_height}.CP_model'
+         else:
+             model_name = f'{model_name}_{model_type}_e{n_epochs}.CP_model'
+
+     model_save_path = os.path.join(mask_src, 'models', 'cellpose_model')
+     print(model_save_path)
+     os.makedirs(model_save_path, exist_ok=True)
+
+     settings_df = pd.DataFrame(list(settings.items()), columns=['Key', 'Value'])
+     settings_csv = os.path.join(model_save_path, f'{model_name}_settings.csv')
+     settings_df.to_csv(settings_csv, index=False)
+
+     if from_scratch:
+         model = cp_models.CellposeModel(gpu=True, model_type=model_type, diam_mean=diameter, pretrained_model=None)
+     else:
+         model = cp_models.CellposeModel(gpu=True, model_type=model_type)
+
+     if normalize:
+
+         image_files = [os.path.join(img_src, f) for f in os.listdir(img_src) if f.endswith('.tif')]
+         label_files = [os.path.join(mask_src, f) for f in os.listdir(mask_src) if f.endswith('.tif')]
+         images, masks, image_names, mask_names = _load_normalized_images_and_labels(image_files, label_files, signal_thresholds, channels=channels, percentiles=percentiles, circular=circular, invert=invert, visualize=verbose)
+         images = [np.squeeze(img) if img.shape[-1] == 1 else img for img in images]
+
+         if test:
+             test_image_files = [os.path.join(test_img_src, f) for f in os.listdir(test_img_src) if f.endswith('.tif')]
+             test_label_files = [os.path.join(test_mask_src, f) for f in os.listdir(test_mask_src) if f.endswith('.tif')]
+             test_images, test_masks, test_image_names, test_mask_names = _load_normalized_images_and_labels(image_files=test_image_files, label_files=test_label_files, signal_thresholds=signal_thresholds, channels=channels, percentiles=percentiles, circular=circular, invert=invert, visualize=verbose)
+             test_images = [np.squeeze(img) if img.shape[-1] == 1 else img for img in test_images]
+
+     else:
+         images, masks, image_names, mask_names = _load_images_and_labels(img_src, mask_src, circular, invert)
+         images = [np.squeeze(img) if img.shape[-1] == 1 else img for img in images]
+
+         if test:
+             test_images, test_masks, test_image_names, test_mask_names = _load_images_and_labels(img_src=test_img_src, mask_src=test_mask_src, circular=circular, invert=invert)
+             test_images = [np.squeeze(img) if img.shape[-1] == 1 else img for img in test_images]
+
+     if resize:
+         images, masks = resize_images_and_labels(images, masks, target_height, target_width, show_example=True)
+
+     if model_type == 'cyto':
+         cp_channels = [0,1]
+     if model_type == 'cyto2':
+         cp_channels = [0,2]
+     if model_type == 'nucleus':
+         cp_channels = [0,0]
+     if grayscale:
+         cp_channels = [0,0]
+         images = [np.squeeze(img) if img.ndim == 3 and 1 in img.shape else img for img in images]
+
+     masks = [np.squeeze(mask) if mask.ndim == 3 and 1 in mask.shape else mask for mask in masks]
+
+     print(f'image shape: {images[0].shape}, mask shape: {masks[0].shape}')
+     save_every = int(n_epochs/10)
+     if save_every < 10:
+         save_every = n_epochs
+
+     train.train_seg(model.net,
+                     train_data=images,
+                     train_labels=masks,
+                     train_files=image_names,
+                     train_labels_files=mask_names,
+                     train_probs=None,
+                     test_data=test_images,
+                     test_labels=test_masks,
+                     test_files=test_image_names,
+                     test_labels_files=test_mask_names,
+                     test_probs=None,
+                     load_files=True,
+                     batch_size=batch_size,
+                     learning_rate=learning_rate,
+                     n_epochs=n_epochs,
+                     weight_decay=weight_decay,
+                     momentum=0.9,
+                     SGD=False,
+                     channels=cp_channels,
+                     channel_axis=None,
+                     #rgb=False,
+                     normalize=False,
+                     compute_flows=False,
+                     save_path=model_save_path,
+                     save_every=save_every,
+                     nimg_per_epoch=None,
+                     nimg_test_per_epoch=None,
+                     rescale=rescale,
+                     #scale_range=None,
+                     #bsize=224,
+                     min_train_masks=1,
+                     model_name=model_name)
+
+     return print(f"Model saved at: {model_save_path}/{model_name}")
+
+ def train_cellpose_v1(settings):
+
+     from .io import _load_normalized_images_and_labels, _load_images_and_labels
+     from .utils import resize_images_and_labels
+
+     img_src = settings['img_src']
+
+     mask_src = os.path.join(img_src, 'mask')
+
      model_name = settings['model_name']
      model_type = settings['model_type']
      learning_rate = settings['learning_rate']
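For reference, the rewritten train_cellpose() reads everything from a single settings dict. A call shaped like the following would exercise the keys it accesses; the values are illustrative placeholders, not recommendations:

    from spacr.core import train_cellpose

    settings = {
        'img_src': '/path/to/train_images',  # masks are expected in img_src/mask
        'model_name': 'my_model',            # hypothetical name
        'model_type': 'cyto',
        'learning_rate': 0.2,
        'weight_decay': 1e-5,
        'batch_size': 8,
        'n_epochs': 100,
        'from_scratch': False,
        'diameter': 30,
        'verbose': True,
    }
    train_cellpose(settings)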
@@ -186,7 +266,9 @@ def train_cellpose(settings):
      batch_size = settings['batch_size']
      n_epochs = settings['n_epochs']
      verbose = settings['verbose']
-     signal_thresholds = settings['signal_thresholds']
+
+     signal_thresholds = 100 #settings['signal_thresholds']
+
      channels = settings['channels']
      from_scratch = settings['from_scratch']
      diameter = settings['diameter']
@@ -199,7 +281,17 @@ def train_cellpose(settings):
      invert = settings['invert']
      percentiles = settings['percentiles']
      grayscale = settings['grayscale']
-
+
+     if model_type == 'cyto':
+         settings['diameter'] = 30
+         diameter = settings['diameter']
+         print(f'Cyto model must have diameter 30. Diameter set to 30')
+
+     if model_type == 'nuclei':
+         settings['diameter'] = 17
+         diameter = settings['diameter']
+         print(f'Nuclei model must have diameter 17. Diameter set to 17')
+
      print(settings)

      if from_scratch:
@@ -208,24 +300,24 @@ def train_cellpose(settings):
          model_name=f'{model_name}_{model_type}_e{n_epochs}_X{target_width}_Y{target_height}.CP_model'

      model_save_path = os.path.join(mask_src, 'models', 'cellpose_model')
-     os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
+     print(model_save_path)
+     os.makedirs(model_save_path, exist_ok=True)

      settings_df = pd.DataFrame(list(settings.items()), columns=['Key', 'Value'])
      settings_csv = os.path.join(model_save_path,f'{model_name}_settings.csv')
      settings_df.to_csv(settings_csv, index=False)

-     if model_type =='cyto':
-         if not from_scratch:
-             model = cp_models.CellposeModel(gpu=True, model_type=model_type)
-         else:
-             model = cp_models.CellposeModel(gpu=True, model_type=model_type, net_avg=False, diam_mean=diameter, pretrained_model=None)
-     if model_type !='cyto':
+     if not from_scratch:
          model = cp_models.CellposeModel(gpu=True, model_type=model_type)
-
-
-
-     if normalize:
-         images, masks, image_names, mask_names = _load_normalized_images_and_labels(image_dir=img_src, label_dir=mask_src, secondary_image_dir=secondary_image_dir, signal_thresholds=signal_thresholds, channels=channels, percentiles=percentiles, circular=circular, invert=invert, visualize=verbose)
+
+     else:
+         model = cp_models.CellposeModel(gpu=True, model_type=model_type, pretrained_model=None)
+
+     if normalize:
+         image_files = [os.path.join(img_src, f) for f in os.listdir(img_src) if f.endswith('.tif')]
+         label_files = [os.path.join(mask_src, f) for f in os.listdir(mask_src) if f.endswith('.tif')]
+
+         images, masks, image_names, mask_names = _load_normalized_images_and_labels(image_files, label_files, signal_thresholds, channels=channels, percentiles=percentiles, circular=circular, invert=invert, visualize=verbose)
          images = [np.squeeze(img) if img.shape[-1] == 1 else img for img in images]
      else:
          images, masks, image_names, mask_names = _load_images_and_labels(img_src, mask_src, circular, invert)
@@ -248,25 +340,86 @@ def train_cellpose(settings):

      print(f'image shape: {images[0].shape}, mask shape: {masks[0].shape}')
      save_every = int(n_epochs/10)
-     print('cellpose image input dtype', images[0].dtype)
-     print('cellpose mask input dtype', masks[0].dtype)
+     if save_every < 10:
+         save_every = n_epochs
+
+
+     #print('cellpose image input dtype', images[0].dtype)
+     #print('cellpose mask input dtype', masks[0].dtype)
+
      # Train the model
-     model.train(train_data=images, #(list of arrays (2D or 3D)) – images for training
-                 train_labels=masks, #(list of arrays (2D or 3D)) – labels for train_data, where 0=no masks; 1,2,…=mask labels can include flows as additional images
-                 train_files=image_names, #(list of strings) file names for images in train_data (to save flows for future runs)
-                 channels=cp_channels, #(list of ints (default, None)) – channels to use for training
-                 normalize=False, #(bool (default, True))normalize data so 0.0=1st percentile and 1.0=99th percentile of image intensities in each channel
-                 save_path=model_save_path, #(string (default, None)) – where to save trained model, if None it is not saved
-                 save_every=save_every, #(int (default, 100)) – save network every [save_every] epochs
-                 learning_rate=learning_rate, #(float or list/np.ndarray (default, 0.2)) – learning rate for training, if list, must be same length as n_epochs
-                 n_epochs=n_epochs, #(int (default, 500)) – how many times to go through whole training set during training
-                 weight_decay=weight_decay, #(float (default, 0.00001)) –
-                 SGD=True, #(bool (default, True)) – use SGD as optimization instead of RAdam
-                 batch_size=batch_size, #(int (optional, default 8)) – number of 224x224 patches to run simultaneously on the GPU (can make smaller or bigger depending on GPU memory usage)
-                 nimg_per_epoch=None, #(int (optional, default None)) – minimum number of images to train on per epoch, with a small training set (< 8 images) it may help to set to 8
-                 rescale=rescale, #(bool (default, True)) – whether or not to rescale images to diam_mean during training, if True it assumes you will fit a size model after training or resize your images accordingly, if False it will try to train the model to be scale-invariant (works worse)
-                 min_train_masks=1, #(int (default, 5)) – minimum number of masks an image must have to use in training set
-                 model_name=model_name) #(str (default, None)) – name of network, otherwise saved with name as params + training start time
+     #model.train(train_data=images, #(list of arrays (2D or 3D)) – images for training
+
+     #model.train(train_data=images, #(list of arrays (2D or 3D)) images for training
+     #            train_labels=masks, #(list of arrays (2D or 3D)) – labels for train_data, where 0=no masks; 1,2,…=mask labels can include flows as additional images
+     #            train_files=image_names, #(list of strings) – file names for images in train_data (to save flows for future runs)
+     #            channels=cp_channels, #(list of ints (default, None)) – channels to use for training
+     #            normalize=False, #(bool (default, True)) – normalize data so 0.0=1st percentile and 1.0=99th percentile of image intensities in each channel
+     #            save_path=model_save_path, #(string (default, None)) – where to save trained model, if None it is not saved
+     #            save_every=save_every, #(int (default, 100)) – save network every [save_every] epochs
+     #            learning_rate=learning_rate, #(float or list/np.ndarray (default, 0.2)) – learning rate for training, if list, must be same length as n_epochs
+     #            n_epochs=n_epochs, #(int (default, 500)) – how many times to go through whole training set during training
+     #            weight_decay=weight_decay, #(float (default, 0.00001)) –
+     #            SGD=True, #(bool (default, True)) – use SGD as optimization instead of RAdam
+     #            batch_size=batch_size, #(int (optional, default 8)) – number of 224x224 patches to run simultaneously on the GPU (can make smaller or bigger depending on GPU memory usage)
+     #            nimg_per_epoch=None, #(int (optional, default None)) – minimum number of images to train on per epoch, with a small training set (< 8 images) it may help to set to 8
+     #            rescale=rescale, #(bool (default, True)) – whether or not to rescale images to diam_mean during training, if True it assumes you will fit a size model after training or resize your images accordingly, if False it will try to train the model to be scale-invariant (works worse)
+     #            min_train_masks=1, #(int (default, 5)) – minimum number of masks an image must have to use in training set
+     #            model_name=model_name) #(str (default, None)) – name of network, otherwise saved with name as params + training start time
+
+
+     train.train_seg(model.net,
+                     train_data=images,
+                     train_labels=masks,
+                     train_files=image_names,
+                     train_labels_files=None,
+                     train_probs=None,
+                     test_data=None,
+                     test_labels=None,
+                     test_files=None,
+                     test_labels_files=None,
+                     test_probs=None,
+                     load_files=True,
+                     batch_size=batch_size,
+                     learning_rate=learning_rate,
+                     n_epochs=n_epochs,
+                     weight_decay=weight_decay,
+                     momentum=0.9,
+                     SGD=False,
+                     channels=cp_channels,
+                     channel_axis=None,
+                     #rgb=False,
+                     normalize=False,
+                     compute_flows=False,
+                     save_path=model_save_path,
+                     save_every=save_every,
+                     nimg_per_epoch=None,
+                     nimg_test_per_epoch=None,
+                     rescale=rescale,
+                     #scale_range=None,
+                     #bsize=224,
+                     min_train_masks=1,
+                     model_name=model_name)
+
+     #model_save_path = train.train_seg(model.net,
+     #                                  train_data=images,
+     #                                  train_files=image_names,
+     #                                  train_labels=masks,
+     #                                  channels=cp_channels,
+     #                                  normalize=False,
+     #                                  save_every=save_every,
+     #                                  learning_rate=learning_rate,
+     #                                  n_epochs=n_epochs,
+     #                                  #test_data=test_images,
+     #                                  #test_labels=test_labels,
+     #                                  weight_decay=weight_decay,
+     #                                  SGD=True,
+     #                                  batch_size=batch_size,
+     #                                  nimg_per_epoch=None,
+     #                                  rescale=rescale,
+     #                                  min_train_masks=1,
+     #                                  model_name=model_name)
+

      return print(f"Model saved at: {model_save_path}/{model_name}")

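The commented-out model.train(...) blocks above document the migration this hunk performs: Cellpose 3.x dropped CellposeModel.train in favour of cellpose.train.train_seg, which operates on the bare network (model.net). A minimal, self-contained sketch of the new entry point; the synthetic data exists only to keep the example runnable:

    import numpy as np
    from cellpose import models, train

    images = [np.random.rand(128, 128).astype('float32') for _ in range(4)]
    masks = [np.zeros((128, 128), dtype='uint16') for _ in range(4)]
    for m in masks:
        m[40:80, 40:80] = 1  # one dummy object per image

    model = models.CellposeModel(gpu=False, model_type='cyto')
    train.train_seg(model.net,
                    train_data=images,
                    train_labels=masks,
                    channels=[0, 0],    # grayscale
                    normalize=False,
                    n_epochs=1,
                    min_train_masks=1,  # the default of 5 would drop these sparse images
                    save_path='.',
                    model_name='demo.CP_model')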
@@ -926,30 +1079,38 @@ def annotate_results(pred_loc):
      display(df)
      return df

- def generate_dataset(src, file_type=None, experiment='TSG101_screen', sample=None):
+ def generate_dataset(src, file_metadata=None, experiment='TSG101_screen', sample=None):

-     from .utils import init_globals, add_images_to_tar
-
-     db_path = os.path.join(src, 'measurements','measurements.db')
+     from .utils import initiate_counter, add_images_to_tar
+
+     db_path = os.path.join(src, 'measurements', 'measurements.db')
      dst = os.path.join(src, 'datasets')
-
-     global total_images
      all_paths = []
-
+
      # Connect to the database and retrieve the image paths
      print(f'Reading DataBase: {db_path}')
-     with sqlite3.connect(db_path) as conn:
-         cursor = conn.cursor()
-         if file_type:
-             cursor.execute("SELECT png_path FROM png_list WHERE png_path LIKE ?", (f"%{file_type}%",))
-         else:
-             cursor.execute("SELECT png_path FROM png_list")
-         while True:
-             rows = cursor.fetchmany(1000)
-             if not rows:
-                 break
-             all_paths.extend([row[0] for row in rows])
-
+     try:
+         with sqlite3.connect(db_path) as conn:
+             cursor = conn.cursor()
+             if file_metadata:
+                 if isinstance(file_metadata, str):
+                     cursor.execute("SELECT png_path FROM png_list WHERE png_path LIKE ?", (f"%{file_metadata}%",))
+             else:
+                 cursor.execute("SELECT png_path FROM png_list")
+
+             while True:
+                 rows = cursor.fetchmany(1000)
+                 if not rows:
+                     break
+                 all_paths.extend([row[0] for row in rows])
+
+     except sqlite3.Error as e:
+         print(f"Database error: {e}")
+         return
+     except Exception as e:
+         print(f"Error: {e}")
+         return
+
      if isinstance(sample, int):
          selected_paths = random.sample(all_paths, sample)
          print(f'Random selection of {len(selected_paths)} paths')
@@ -957,23 +1118,18 @@ def generate_dataset(src, file_type=None, experiment='TSG101_screen', sample=Non
          selected_paths = all_paths
          random.shuffle(selected_paths)
          print(f'All paths: {len(selected_paths)} paths')
-
+
      total_images = len(selected_paths)
-     print(f'found {total_images} images')
-
+     print(f'Found {total_images} images')
+
      # Create a temp folder in dst
      temp_dir = os.path.join(dst, "temp_tars")
      os.makedirs(temp_dir, exist_ok=True)

      # Chunking the data
-     if len(selected_paths) > 10000:
-         num_procs = cpu_count()-2
-         chunk_size = len(selected_paths) // num_procs
-         remainder = len(selected_paths) % num_procs
-     else:
-         num_procs = 2
-         chunk_size = len(selected_paths) // 2
-         remainder = 0
+     num_procs = max(2, cpu_count() - 2)
+     chunk_size = len(selected_paths) // num_procs
+     remainder = len(selected_paths) % num_procs

      paths_chunks = []
      start = 0
@@ -983,45 +1139,43 @@ def generate_dataset(src, file_type=None, experiment='TSG101_screen', sample=Non
          start = end

      temp_tar_files = [os.path.join(temp_dir, f'temp_{i}.tar') for i in range(num_procs)]
-
-     # Initialize the shared objects
-     counter_ = Value('i', 0)
-     lock_ = Lock()

-     ctx = multiprocessing.get_context('spawn')
-
      print(f'Generating temporary tar files in {dst}')
-
+
+     # Initialize shared counter and lock
+     counter = Value('i', 0)
+     lock = Lock()
+
+     with Pool(processes=num_procs, initializer=initiate_counter, initargs=(counter, lock)) as pool:
+         pool.starmap(add_images_to_tar, [(paths_chunks[i], temp_tar_files[i], total_images) for i in range(num_procs)])
+
      # Combine the temporary tar files into a final tar
      date_name = datetime.date.today().strftime('%y%m%d')
-     tar_name = f'{date_name}_{experiment}_{file_type}.tar'
+     if not file_metadata is None:
+         tar_name = f'{date_name}_{experiment}_{file_metadata}.tar'
+     else:
+         tar_name = f'{date_name}_{experiment}.tar'
+     tar_name = os.path.join(dst, tar_name)
      if os.path.exists(tar_name):
          number = random.randint(1, 100)
-         tar_name_2 = f'{date_name}_{experiment}_{file_type}_{number}.tar'
-         print(f'Warning: {os.path.basename(tar_name)} exists saving as {os.path.basename(tar_name_2)} ')
-         tar_name = tar_name_2
-
-     # Add the counter and lock to the arguments for pool.map
+         tar_name_2 = f'{date_name}_{experiment}_{file_metadata}_{number}.tar'
+         print(f'Warning: {os.path.basename(tar_name)} exists, saving as {os.path.basename(tar_name_2)} ')
+         tar_name = os.path.join(dst, tar_name_2)
+
      print(f'Merging temporary files')
-     #with Pool(processes=num_procs, initializer=init_globals, initargs=(counter_, lock_)) as pool:
-     #    results = pool.map(add_images_to_tar, zip(paths_chunks, temp_tar_files))

-     with ctx.Pool(processes=num_procs, initializer=init_globals, initargs=(counter_, lock_)) as pool:
-         results = pool.map(add_images_to_tar, zip(paths_chunks, temp_tar_files))
-
-     with tarfile.open(os.path.join(dst, tar_name), 'w') as final_tar:
-         for tar_path in results:
-             with tarfile.open(tar_path, 'r') as t:
-                 for member in t.getmembers():
-                     t.extract(member, path=dst)
-                     final_tar.add(os.path.join(dst, member.name), arcname=member.name)
-                     os.remove(os.path.join(dst, member.name))
-             os.remove(tar_path)
+     with tarfile.open(tar_name, 'w') as final_tar:
+         for temp_tar_path in temp_tar_files:
+             with tarfile.open(temp_tar_path, 'r') as temp_tar:
+                 for member in temp_tar.getmembers():
+                     file_obj = temp_tar.extractfile(member)
+                     final_tar.addfile(member, file_obj)
+             os.remove(temp_tar_path)

      # Delete the temp folder
      shutil.rmtree(temp_dir)
-     print(f"\nSaved {total_images} images to {os.path.join(dst, tar_name)}")
-
+     print(f"\nSaved {total_images} images to {tar_name}")
+
  def apply_model_to_tar(tar_path, model_path, file_type='cell_png', image_size=224, batch_size=64, normalize=True, preload='images', num_workers=10, verbose=False):

      from .io import TarImageDataset, DataLoader
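The rewritten merge step above streams members directly between archives (extractfile/addfile) rather than extracting each file to disk and re-adding it. The pattern in isolation, with hypothetical file names:

    import tarfile

    def merge_tars(part_paths, out_path):
        # Copy every member of each part archive into the final tar
        # without touching the filesystem in between.
        with tarfile.open(out_path, 'w') as final_tar:
            for part in part_paths:
                with tarfile.open(part, 'r') as t:
                    for member in t.getmembers():
                        final_tar.addfile(member, t.extractfile(member))

    merge_tars(['temp_0.tar', 'temp_1.tar'], 'dataset.tar')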
@@ -1257,7 +1411,14 @@ def generate_training_dataset(src, mode='annotation', annotation_column='test',

      db_path = os.path.join(src, 'measurements','measurements.db')
      dst = os.path.join(src, 'datasets', 'training')
-
+
+     if os.path.exists(dst):
+         for i in range(1, 1000):
+             dst = os.path.join(src, 'datasets', f'training_{i}')
+             if not os.path.exists(dst):
+                 print(f'Creating new directory for training: {dst}')
+                 break
+
      if mode == 'annotation':
          class_paths_ls_2 = []
          class_paths_ls = training_dataset_from_annotation(db_path, dst, annotation_column, annotated_classes=annotated_classes)
@@ -1268,6 +1429,7 @@ def generate_training_dataset(src, mode='annotation', annotation_column='test',

      elif mode == 'metadata':
          class_paths_ls = []
+         class_len_ls = []
          [df] = _read_db(db_loc=db_path, tables=['png_list'])
          df['metadata_based_class'] = pd.NA
          for i, class_ in enumerate(classes):
@@ -1275,7 +1437,18 @@ def generate_training_dataset(src, mode='annotation', annotation_column='test',
              df.loc[df[metadata_type_by].isin(ls), 'metadata_based_class'] = class_

          for class_ in classes:
+             if size == None:
+                 c_s = []
+                 for c in classes:
+                     c_s_t_df = df[df['metadata_based_class'] == c]
+                     c_s.append(len(c_s_t_df))
+                     print(f'Found {len(c_s_t_df)} images for class {c}')
+                 size = min(c_s)
+                 print(f'Using the smallest class size: {size}')
+
              class_temp_df = df[df['metadata_based_class'] == class_]
+             class_len_ls.append(len(class_temp_df))
+             print(f'Found {len(class_temp_df)} images for class {class_}')
              class_paths_temp = random.sample(class_temp_df['png_path'].tolist(), size)
              class_paths_ls.append(class_paths_temp)

@@ -1332,7 +1505,7 @@ def generate_training_dataset(src, mode='annotation', annotation_column='test',

      return

- def generate_loaders(src, train_mode='erm', mode='train', image_size=224, batch_size=32, classes=['nc','pc'], num_workers=None, validation_split=0.0, max_show=2, pin_memory=False, normalize=False, verbose=False):
+ def generate_loaders_v1(src, train_mode='erm', mode='train', image_size=224, batch_size=32, classes=['nc','pc'], num_workers=None, validation_split=0.0, max_show=2, pin_memory=False, normalize=False, verbose=False):
      """
      Generate data loaders for training and validation/test datasets.

@@ -1463,56 +1636,223 @@ def generate_loaders(src, train_mode='erm', mode='train', image_size=224, batch_

      return train_loaders, val_loaders, plate_names

- def analyze_recruitment(src, metadata_settings, advanced_settings):
+ def generate_loaders(src, train_mode='erm', mode='train', image_size=224, batch_size=32, classes=['nc','pc'], num_workers=None, validation_split=0.0, max_show=2, pin_memory=False, normalize=False, channels=[1, 2, 3], verbose=False):
+
      """
-     Analyze recruitment data by grouping the DataFrame by well coordinates and plotting controls and recruitment data.
+     Generate data loaders for training and validation/test datasets.

      Parameters:
-     src (str): The source of the recruitment data.
-     metadata_settings (dict): The settings for metadata.
-     advanced_settings (dict): The advanced settings for recruitment analysis.
+     - src (str): The source directory containing the data.
+     - train_mode (str): The training mode. Options are 'erm' (Empirical Risk Minimization) or 'irm' (Invariant Risk Minimization).
+     - mode (str): The mode of operation. Options are 'train' or 'test'.
+     - image_size (int): The size of the input images.
+     - batch_size (int): The batch size for the data loaders.
+     - classes (list): The list of classes to consider.
+     - num_workers (int): The number of worker threads for data loading.
+     - validation_split (float): The fraction of data to use for validation when train_mode is 'erm'.
+     - max_show (int): The maximum number of images to show when verbose is True.
+     - pin_memory (bool): Whether to pin memory for faster data transfer.
+     - normalize (bool): Whether to normalize the input images.
+     - verbose (bool): Whether to print additional information and show images.
+     - channels (list): The list of channels to retain. Options are [1, 2, 3] for all channels, [1, 2] for red and green, etc.

      Returns:
-     None
+     - train_loaders (list): List of data loaders for training datasets.
+     - val_loaders (list): List of data loaders for validation datasets.
+     - plate_names (list): List of plate names (only applicable when train_mode is 'irm').
      """
-
-     from .io import _read_and_merge_data, _results_to_csv
-     from .plot import plot_merged, _plot_controls, _plot_recruitment
-     from .utils import _object_filter, annotate_conditions, _calculate_recruitment, _group_by_well
-
-     settings_dict = {**metadata_settings, **advanced_settings}
-     settings_df = pd.DataFrame(list(settings_dict.items()), columns=['Key', 'Value'])
-     settings_csv = os.path.join(src,'settings','analyze_settings.csv')
-     os.makedirs(os.path.join(src,'settings'), exist_ok=True)
-     settings_df.to_csv(settings_csv, index=False)

-     # metadata settings
-     target = metadata_settings['target']
-     cell_types = metadata_settings['cell_types']
-     cell_plate_metadata = metadata_settings['cell_plate_metadata']
-     pathogen_types = metadata_settings['pathogen_types']
-     pathogen_plate_metadata = metadata_settings['pathogen_plate_metadata']
-     treatments = metadata_settings['treatments']
-     treatment_plate_metadata = metadata_settings['treatment_plate_metadata']
-     metadata_types = metadata_settings['metadata_types']
-     channel_dims = metadata_settings['channel_dims']
-     cell_chann_dim = metadata_settings['cell_chann_dim']
-     cell_mask_dim = metadata_settings['cell_mask_dim']
-     nucleus_chann_dim = metadata_settings['nucleus_chann_dim']
-     nucleus_mask_dim = metadata_settings['nucleus_mask_dim']
-     pathogen_chann_dim = metadata_settings['pathogen_chann_dim']
-     pathogen_mask_dim = metadata_settings['pathogen_mask_dim']
-     channel_of_interest = metadata_settings['channel_of_interest']
-
-     # Advanced settings
-     plot = advanced_settings['plot']
-     plot_nr = advanced_settings['plot_nr']
-     plot_control = advanced_settings['plot_control']
-     figuresize = advanced_settings['figuresize']
-     remove_background = advanced_settings['remove_background']
-     backgrounds = advanced_settings['backgrounds']
-     include_noninfected = advanced_settings['include_noninfected']
-     include_multiinfected = advanced_settings['include_multiinfected']
+     from .io import MyDataset
+     from .plot import _imshow
+     from torchvision import transforms
+     from torch.utils.data import DataLoader, random_split
+     from collections import defaultdict
+     import os
+     import random
+     from PIL import Image
+     from torchvision.transforms import ToTensor
+
+     chans = []
+
+     if 'r' in channels:
+         chans.append(1)
+     if 'g' in channels:
+         chans.append(2)
+     if 'b' in channels:
+         chans.append(3)
+
+     channels = chans
+
+     if verbose:
+         print(f'Training a network on channels: {channels}')
+         print(f'Channel 1: Red, Channel 2: Green, Channel 3: Blue')
+
+     class SelectChannels:
+         def __init__(self, channels):
+             self.channels = channels
+
+         def __call__(self, img):
+             img = img.clone()
+             if 1 not in self.channels:
+                 img[0, :, :] = 0  # Zero out the red channel
+             if 2 not in self.channels:
+                 img[1, :, :] = 0  # Zero out the green channel
+             if 3 not in self.channels:
+                 img[2, :, :] = 0  # Zero out the blue channel
+             return img
+
+     plate_to_filenames = defaultdict(list)
+     plate_to_labels = defaultdict(list)
+     train_loaders = []
+     val_loaders = []
+     plate_names = []
+
+     if normalize:
+         transform = transforms.Compose([
+             transforms.ToTensor(),
+             transforms.CenterCrop(size=(image_size, image_size)),
+             SelectChannels(channels),
+             transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))])
+     else:
+         transform = transforms.Compose([
+             transforms.ToTensor(),
+             transforms.CenterCrop(size=(image_size, image_size)),
+             SelectChannels(channels)])
+
+     if mode == 'train':
+         data_dir = os.path.join(src, 'train')
+         shuffle = True
+         print('Generating Train and validation datasets')
+     elif mode == 'test':
+         data_dir = os.path.join(src, 'test')
+         val_loaders = []
+         validation_split = 0.0
+         shuffle = True
+         print('Generating test dataset')
+     else:
+         print(f'mode:{mode} is not valid, use mode = train or test')
+         return
+
+     if train_mode == 'erm':
+         data = MyDataset(data_dir, classes, transform=transform, shuffle=shuffle, pin_memory=pin_memory)
+         if validation_split > 0:
+             train_size = int((1 - validation_split) * len(data))
+             val_size = len(data) - train_size
+
+             print(f'Train data:{train_size}, Validation data:{val_size}')
+
+             train_dataset, val_dataset = random_split(data, [train_size, val_size])
+
+             train_loaders = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers if num_workers is not None else 0, pin_memory=pin_memory)
+             val_loaders = DataLoader(val_dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers if num_workers is not None else 0, pin_memory=pin_memory)
+         else:
+             train_loaders = DataLoader(data, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers if num_workers is not None else 0, pin_memory=pin_memory)
+
+     elif train_mode == 'irm':
+         data = MyDataset(data_dir, classes, transform=transform, shuffle=shuffle, pin_memory=pin_memory)
+
+         for filename, label in zip(data.filenames, data.labels):
+             plate = data.get_plate(filename)
+             plate_to_filenames[plate].append(filename)
+             plate_to_labels[plate].append(label)
+
+         for plate, filenames in plate_to_filenames.items():
+             labels = plate_to_labels[plate]
+             plate_data = MyDataset(data_dir, classes, specific_files=filenames, specific_labels=labels, transform=transform, shuffle=False, pin_memory=pin_memory)
+             plate_names.append(plate)
+
+             if validation_split > 0:
+                 train_size = int((1 - validation_split) * len(plate_data))
+                 val_size = len(plate_data) - train_size
+
+                 print(f'Train data:{train_size}, Validation data:{val_size}')
+
+                 train_dataset, val_dataset = random_split(plate_data, [train_size, val_size])
+
+                 train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers if num_workers is not None else 0, pin_memory=pin_memory)
+                 val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers if num_workers is not None else 0, pin_memory=pin_memory)
+
+                 train_loaders.append(train_loader)
+                 val_loaders.append(val_loader)
+             else:
+                 train_loader = DataLoader(plate_data, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers if num_workers is not None else 0, pin_memory=pin_memory)
+                 train_loaders.append(train_loader)
+                 val_loaders.append(None)
+
+     else:
+         print(f'train_mode:{train_mode} is not valid, use: train_mode = irm or erm')
+         return
+
+     if verbose:
+         if train_mode == 'erm':
+             for idx, (images, labels, filenames) in enumerate(train_loaders):
+                 if idx >= max_show:
+                     break
+                 images = images.cpu()
+                 label_strings = [str(label.item()) for label in labels]
+                 _imshow(images, label_strings, nrow=20, fontsize=12)
+         elif train_mode == 'irm':
+             for plate_name, train_loader in zip(plate_names, train_loaders):
+                 print(f'Plate: {plate_name} with {len(train_loader.dataset)} images')
+                 for idx, (images, labels, filenames) in enumerate(train_loader):
+                     if idx >= max_show:
+                         break
+                     images = images.cpu()
+                     label_strings = [str(label.item()) for label in labels]
+                     _imshow(images, label_strings, nrow=20, fontsize=12)
+
+     return train_loaders, val_loaders, plate_names
+
+ def analyze_recruitment(src, metadata_settings, advanced_settings):
+     """
+     Analyze recruitment data by grouping the DataFrame by well coordinates and plotting controls and recruitment data.
+
+     Parameters:
+     src (str): The source of the recruitment data.
+     metadata_settings (dict): The settings for metadata.
+     advanced_settings (dict): The advanced settings for recruitment analysis.
+
+     Returns:
+     None
+     """
+
+     from .io import _read_and_merge_data, _results_to_csv
+     from .plot import plot_merged, _plot_controls, _plot_recruitment
+     from .utils import _object_filter, annotate_conditions, _calculate_recruitment, _group_by_well
+
+     settings_dict = {**metadata_settings, **advanced_settings}
+     settings_df = pd.DataFrame(list(settings_dict.items()), columns=['Key', 'Value'])
+     settings_csv = os.path.join(src,'settings','analyze_settings.csv')
+     os.makedirs(os.path.join(src,'settings'), exist_ok=True)
+     settings_df.to_csv(settings_csv, index=False)
+
+     # metadata settings
+     target = metadata_settings['target']
+     cell_types = metadata_settings['cell_types']
+     cell_plate_metadata = metadata_settings['cell_plate_metadata']
+     pathogen_types = metadata_settings['pathogen_types']
+     pathogen_plate_metadata = metadata_settings['pathogen_plate_metadata']
+     treatments = metadata_settings['treatments']
+     treatment_plate_metadata = metadata_settings['treatment_plate_metadata']
+     metadata_types = metadata_settings['metadata_types']
+     channel_dims = metadata_settings['channel_dims']
+     cell_chann_dim = metadata_settings['cell_chann_dim']
+     cell_mask_dim = metadata_settings['cell_mask_dim']
+     nucleus_chann_dim = metadata_settings['nucleus_chann_dim']
+     nucleus_mask_dim = metadata_settings['nucleus_mask_dim']
+     pathogen_chann_dim = metadata_settings['pathogen_chann_dim']
+     pathogen_mask_dim = metadata_settings['pathogen_mask_dim']
+     channel_of_interest = metadata_settings['channel_of_interest']
+
+     # Advanced settings
+     plot = advanced_settings['plot']
+     plot_nr = advanced_settings['plot_nr']
+     plot_control = advanced_settings['plot_control']
+     figuresize = advanced_settings['figuresize']
+     remove_background = advanced_settings['remove_background']
+     backgrounds = advanced_settings['backgrounds']
+     include_noninfected = advanced_settings['include_noninfected']
+     include_multiinfected = advanced_settings['include_multiinfected']
      include_multinucleated = advanced_settings['include_multinucleated']
      cells_per_well = advanced_settings['cells_per_well']
      pathogen_size_range = advanced_settings['pathogen_size_range']
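The new generate_loaders() introduces a SelectChannels transform that zeroes out the RGB planes not requested, applied after ToTensor() has produced a (3, H, W) tensor; the surrounding code first maps 'r'/'g'/'b' letters onto the indices 1-3. A standalone sketch of the idea:

    import torch

    class SelectChannels:
        def __init__(self, channels):   # channels: subset of {1, 2, 3} = R, G, B
            self.channels = channels

        def __call__(self, img):        # img: (3, H, W) tensor from ToTensor()
            img = img.clone()
            for ch in (1, 2, 3):
                if ch not in self.channels:
                    img[ch - 1].zero_()  # blank the unused plane in place
            return img

    x = torch.rand(3, 8, 8)
    y = SelectChannels([1, 2])(x)       # keep red and green; blue becomes zero
    assert torch.all(y[2] == 0) and torch.equal(y[0], x[0])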
@@ -1569,15 +1909,30 @@ def analyze_recruitment(src, metadata_settings, advanced_settings):
      df = df.dropna(subset=['condition'])
      print(f'After dropping non-annotated wells: {len(df)} rows')
      files = df['file_name'].tolist()
+     print(f'found: {len(files)} files')
      files = [item + '.npy' for item in files]
      random.shuffle(files)
-
+
+     _max = 10**100
+
+     if cell_size_range is None and nucleus_size_range is None and pathogen_size_range is None:
+         filter_min_max = None
+     else:
+         if cell_size_range is None:
+             cell_size_range = [0,_max]
+         if nucleus_size_range is None:
+             nucleus_size_range = [0,_max]
+         if pathogen_size_range is None:
+             pathogen_size_range = [0,_max]
+
+         filter_min_max = [[cell_size_range[0],cell_size_range[1]],[nucleus_size_range[0],nucleus_size_range[1]],[pathogen_size_range[0],pathogen_size_range[1]]]
+
      if plot:
          plot_settings = {'include_noninfected':include_noninfected,
                           'include_multiinfected':include_multiinfected,
                           'include_multinucleated':include_multinucleated,
                           'remove_background':remove_background,
-                          'filter_min_max':[[cell_size_range[0],cell_size_range[1]],[nucleus_size_range[0],nucleus_size_range[1]],[pathogen_size_range[0],pathogen_size_range[1]]],
+                          'filter_min_max':filter_min_max,
                           'channel_dims':channel_dims,
                           'backgrounds':backgrounds,
                           'cell_mask_dim':mask_dims[0],
@@ -1640,6 +1995,7 @@ def preprocess_generate_masks(src, settings={}):
      from .plot import plot_merged, plot_arrays
      from .utils import _pivot_counts_table

+     settings['plot'] = False
      settings['fps'] = 2
      settings['remove_background'] = True
      settings['lower_quantile'] = 0.02
@@ -1655,6 +2011,15 @@ def preprocess_generate_masks(src, settings={}):
      settings['upscale'] = False
      settings['upscale_factor'] = 2.0

+     settings['randomize'] = True
+     settings['timelapse'] = False
+     settings['timelapse_displacement'] = None
+     settings['timelapse_memory'] = 3
+     settings['timelapse_frame_limits'] = None
+     settings['timelapse_remove_transient'] = False
+     settings['timelapse_mode'] = 'trackpy'
+     settings['timelapse_objects'] = ['cells']
+
      settings_df = pd.DataFrame(list(settings.items()), columns=['Key', 'Value'])
      settings_csv = os.path.join(src,'settings','preprocess_generate_masks_settings.csv')
      os.makedirs(os.path.join(src,'settings'), exist_ok=True)
@@ -1723,7 +2088,6 @@ def preprocess_generate_masks(src, settings={}):
                       'cell_mask_dim':cell_mask_dim,
                       'nucleus_mask_dim':nucleus_mask_dim,
                       'pathogen_mask_dim':pathogen_mask_dim,
-                      'overlay_chans':[0,2,3],
                       'outline_thickness':3,
                       'outline_color':'gbr',
                       'overlay_chans':overlay_channels,
@@ -1735,6 +2099,10 @@ def preprocess_generate_masks(src, settings={}):
                       'figuresize':20,
                       'cmap':'inferno',
                       'verbose':False}
+
+     if settings['test_mode'] == True:
+         plot_settings['nr'] = len(os.listdir(os.path.join(src,'merged')))
+
      try:
          fig = plot_merged(src=os.path.join(src,'merged'), settings=plot_settings)
      except Exception as e:
@@ -1747,26 +2115,61 @@ def preprocess_generate_masks(src, settings={}):
      print("Successfully completed run")
      return

- def identify_masks_finetune(src, dst, model_name, channels, diameter, batch_size, flow_threshold=30, cellprob_threshold=1, figuresize=25, cmap='inferno', verbose=False, plot=False, save=False, custom_model=None, signal_thresholds=1000, normalize=True, resize=False, target_height=None, target_width=None, rescale=True, resample=True, net_avg=False, invert=False, circular=False, percentiles=None, overlay=True, grayscale=False):
+ def identify_masks_finetune(settings):

      from .plot import print_mask_and_flows
      from .utils import get_files_from_dir, resize_images_and_labels
      from .io import _load_normalized_images_and_labels, _load_images_and_labels

+     src = settings['src']
+     dst = settings['dst']
+     model_name = settings['model_name']
+     diameter = settings['diameter']
+     batch_size = settings['batch_size']
+     flow_threshold = settings['flow_threshold']
+     cellprob_threshold = settings['cellprob_threshold']
+
+     verbose = settings['verbose']
+     plot = settings['plot']
+     save = settings['save']
+     custom_model = settings['custom_model']
+     overlay = settings['overlay']
+
+     figuresize = 25
+     cmap = 'inferno'
+     channels = [0,0]
+     signal_thresholds = 1000
+     normalize = True
+     percentiles = [2,98]
+     circular = False
+     invert = False
+     resize = False
+     settings['width_height'] = [1000,1000]
+     target_height = settings['width_height'][1]
+     target_width = settings['width_height'][0]
+     rescale = False
+     resample = False
+     grayscale = True
+     test = False
+
+     os.makedirs(dst, exist_ok=True)
+
+     if not custom_model is None:
+         if not os.path.exists(custom_model):
+             print(f'Custom model not found: {custom_model}')
+             return
+
      if not torch.cuda.is_available():
          print(f'Torch CUDA is not available, using CPU')

      device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

      if custom_model == None:
-         if model_name =='cyto':
-             model = cp_models.CellposeModel(gpu=True, model_type=model_name, net_avg=False, diam_mean=diameter, pretrained_model=None)
-         else:
-             model = cp_models.CellposeModel(gpu=True, model_type=model_name)
-
-     if custom_model != None:
-         model = cp_models.CellposeModel(gpu=torch.cuda.is_available(), model_type=None, pretrained_model=custom_model, diam_mean=diameter, device=device, net_avg=False) #Assuming diameter is defined elsewhere
-         print(f'loaded custom model:{custom_model}')
+         model = cp_models.CellposeModel(gpu=True, model_type=model_name, device=device)
+         print(f'Loaded model: {model_name}')
+     else:
+         model = cp_models.CellposeModel(gpu=torch.cuda.is_available(), model_type=None, pretrained_model=custom_model, diam_mean=diameter, device=device)
+         print("Pretrained Model Loaded:", model.pretrained_model)

      chans = [2, 1] if model_name == 'cyto2' else [0,0] if model_name == 'nucleus' else [1,0] if model_name == 'cyto' else [2, 0]

@@ -1778,14 +2181,16 @@ def identify_masks_finetune(src, dst, model_name, channels, diameter, batch_size
      if verbose == True:
          print(f'Cellpose settings: Model: {model_name}, channels: {channels}, cellpose_chans: {chans}, diameter:{diameter}, flow_threshold:{flow_threshold}, cellprob_threshold:{cellprob_threshold}')

-     all_image_files = get_files_from_dir(src, file_extension="*.tif")
+     all_image_files = [os.path.join(src, f) for f in os.listdir(src) if f.endswith('.tif')]
+
      random.shuffle(all_image_files)

      time_ls = []
      for i in range(0, len(all_image_files), batch_size):
          image_files = all_image_files[i:i+batch_size]
+
          if normalize:
-             images, _, image_names, _ = _load_normalized_images_and_labels(image_files=image_files, label_files=None, signal_thresholds=signal_thresholds, channels=channels, percentiles=percentiles, circular=circular, invert=invert, visualize=verbose)
+             images, _, image_names, _ = _load_normalized_images_and_labels(image_files=image_files, label_files=None, signal_thresholds=signal_thresholds, channels=channels, percentiles=percentiles, circular=circular, invert=invert, visualize=plot)
              images = [np.squeeze(img) if img.shape[-1] == 1 else img for img in images]
              orig_dims = [(image.shape[0], image.shape[1]) for image in images]
          else:
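identify_masks_finetune() likewise now takes a single settings dict; the keys it reads are shown in the hunk above. A call might look like this, with placeholder paths and illustrative values:

    from spacr.core import identify_masks_finetune

    settings = {
        'src': '/path/to/tif_images',
        'dst': '/path/to/output_masks',
        'model_name': 'cyto',
        'diameter': 30,
        'batch_size': 8,
        'flow_threshold': 0.4,
        'cellprob_threshold': 0,
        'verbose': False,
        'plot': False,
        'save': True,
        'custom_model': None,  # or a path to a fine-tuned .CP_model file
        'overlay': True,
    }
    identify_masks_finetune(settings)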
@@ -1806,8 +2211,7 @@ def identify_masks_finetune(src, dst, model_name, channels, diameter, batch_size
                               cellprob_threshold=cellprob_threshold,
                               rescale=rescale,
                               resample=resample,
-                              net_avg=net_avg,
-                              progress=False)
+                              progress=True)

          if len(output) == 4:
              mask, flows, _, _ = output
@@ -1882,7 +2286,6 @@ def identify_masks(src, object_type, model_name, batch_size, channels, diameter,

      #Note add logic that handles batches of size 1 as these will break the code batches must all be > 2 images
      gc.collect()
-     #print('========== generating masks ==========')

      if not torch.cuda.is_available():
          print(f'Torch CUDA is not available, using CPU')
@@ -2047,9 +2450,9 @@
      # Check if all elements in list1 are in list2
      return all(element in list2 for element in list1)

- def generate_cellpose_masks_v1(src, settings, object_type):
+ def generate_cellpose_masks(src, settings, object_type):

-     from .utils import _masks_to_masks_stack, _filter_cp_masks, _get_cellpose_batch_size, _get_object_settings, _get_cellpose_channels, mask_object_count
+     from .utils import _masks_to_masks_stack, _filter_cp_masks, _get_cellpose_batch_size, _get_object_settings, _get_cellpose_channels, _choose_model, mask_object_count
      from .io import _create_database, _save_object_counts_to_database, _check_masks, _get_avg_object_size
      from .timelapse import _npz_to_movie, _btrack_track_cells, _trackpy_track_cells
      from .plot import plot_masks
@@ -2079,15 +2482,12 @@ def generate_cellpose_masks_v1(src, settings, object_type):
      cellpose_channels = _get_cellpose_channels(src, settings['nucleus_channel'], settings['pathogen_channel'], settings['cell_channel'])
      if settings['verbose']:
          print(cellpose_channels)
+
      channels = cellpose_channels[object_type]
      cellpose_batch_size = _get_cellpose_batch_size()
-
      device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-     model = cp_models.Cellpose(gpu=True, model_type=model_name, device=device) #net_avg=net_avg
-     #dn = denoise.CellposeDenoiseModel(model_type=f"denoise_{model_name}", gpu=True, device=device)
-
+     model = _choose_model(model_name, device, object_type='cell', restore_type=None)
      chans = [2, 1] if model_name == 'cyto2' else [0,0] if model_name == 'nucleus' else [2,0] if model_name == 'cyto' else [2, 0] if model_name == 'cyto3' else [2, 0]
-
      paths = [os.path.join(src, file) for file in os.listdir(src) if file.endswith('.npz')]

      count_loc = os.path.dirname(src)+'/measurements/measurements.db'
@@ -2096,7 +2496,6 @@ def generate_cellpose_masks_v1(src, settings, object_type):

      average_sizes = []
      time_ls = []
-
      for file_index, path in enumerate(paths):
          name = os.path.basename(path)
          name, ext = os.path.splitext(name)
@@ -2210,23 +2609,45 @@ def generate_cellpose_masks_v1(src, settings, object_type):
                                                           mode=timelapse_mode)
                else:
                    mask_stack = _masks_to_masks_stack(masks)
-
            else:
                _save_object_counts_to_database(masks, object_type, batch_filenames, count_loc, added_string='_before_filtration')
-                mask_stack = _filter_cp_masks(masks=masks,
-                                              flows=flows,
-                                              filter_size=object_settings['filter_size'],
-                                              filter_intensity=object_settings['filter_intensity'],
-                                              minimum_size=object_settings['minimum_size'],
-                                              maximum_size=object_settings['maximum_size'],
-                                              remove_border_objects=object_settings['remove_border_objects'],
-                                              merge=False,
-                                              batch=batch,
-                                              plot=settings['plot'],
-                                              figuresize=figuresize)
-
-                _save_object_counts_to_database(mask_stack, object_type, batch_filenames, count_loc, added_string='_after_filtration')
+                if object_settings['merge'] and not settings['filter']:
+                    mask_stack = _filter_cp_masks(masks=masks,
+                                                  flows=flows,
+                                                  filter_size=False,
+                                                  filter_intensity=False,
+                                                  minimum_size=object_settings['minimum_size'],
+                                                  maximum_size=object_settings['maximum_size'],
+                                                  remove_border_objects=False,
+                                                  merge=object_settings['merge'],
+                                                  batch=batch,
+                                                  plot=settings['plot'],
+                                                  figuresize=figuresize)
+
+                if settings['filter']:
+                    mask_stack = _filter_cp_masks(masks=masks,
+                                                  flows=flows,
+                                                  filter_size=object_settings['filter_size'],
+                                                  filter_intensity=object_settings['filter_intensity'],
+                                                  minimum_size=object_settings['minimum_size'],
+                                                  maximum_size=object_settings['maximum_size'],
+                                                  remove_border_objects=object_settings['remove_border_objects'],
+                                                  merge=object_settings['merge'],
+                                                  batch=batch,
+                                                  plot=settings['plot'],
+                                                  figuresize=figuresize)
+
+                    _save_object_counts_to_database(mask_stack, object_type, batch_filenames, count_loc, added_string='_after_filtration')
+                else:
+                    mask_stack = _masks_to_masks_stack(masks)
 
+            if settings['plot']:
+                for idx, (mask, flow, image) in enumerate(zip(masks, flows[0], batch)):
+                    if idx == 0:
+                        num_objects = mask_object_count(mask)
+                        print(f'Number of objects: {num_objects}')
+                        plot_masks(batch=image, masks=mask, flows=flow, cmap='inferno', figuresize=figuresize, nr=1, file_type='.npz', print_object_number=True)
+
            if not np.any(mask_stack):
                average_obj_size = 0
            else:
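The added branching above replaces the old unconditional filtering pass: when `object_settings['merge']` is set but `settings['filter']` is off, `_filter_cp_masks` is called with every filter disabled so that only merging happens; when `settings['filter']` is on, the full size/intensity/border filtering runs and post-filtration counts are written to the database; otherwise the raw masks are stacked unchanged. Illustrative settings (key names taken from the code above, values made up):

    # Illustrative only: these key names are read by the branch above.
    settings = {'filter': True, 'plot': False}
    object_settings = {'merge': False,
                       'filter_size': True, 'filter_intensity': True,
                       'minimum_size': 100, 'maximum_size': 100000,
                       'remove_border_objects': True}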
@@ -2255,207 +2676,661 @@ def generate_cellpose_masks_v1(src, settings, object_type):
         torch.cuda.empty_cache()
     return
 
-def generate_cellpose_masks(src, settings, object_type):
-
-    from .utils import _masks_to_masks_stack, _filter_cp_masks, _get_cellpose_batch_size, _get_object_settings, _get_cellpose_channels, _choose_model, mask_object_count
-    from .io import _create_database, _save_object_counts_to_database, _check_masks, _get_avg_object_size
-    from .timelapse import _npz_to_movie, _btrack_track_cells, _trackpy_track_cells
-    from .plot import plot_masks
-
-    gc.collect()
-    if not torch.cuda.is_available():
-        print(f'Torch CUDA is not available, using CPU')
-
-    figuresize=25
-    timelapse = settings['timelapse']
-
-    if timelapse:
-        timelapse_displacement = settings['timelapse_displacement']
-        timelapse_frame_limits = settings['timelapse_frame_limits']
-        timelapse_memory = settings['timelapse_memory']
-        timelapse_remove_transient = settings['timelapse_remove_transient']
-        timelapse_mode = settings['timelapse_mode']
-        timelapse_objects = settings['timelapse_objects']
+def generate_masks_from_imgs(src, model, model_name, batch_size, diameter, cellprob_threshold, grayscale, save, normalize, channels, percentiles, circular, invert, plot, resize, target_height, target_width, verbose):
+    from .io import _load_images_and_labels, _load_normalized_images_and_labels
+    from .utils import resize_images_and_labels, resizescikit
+    from .plot import print_mask_and_flows
+
+    dst = os.path.join(src, model_name)
+    os.makedirs(dst, exist_ok=True)
 
-    batch_size = settings['batch_size']
-    cellprob_threshold = settings[f'{object_type}_CP_prob']
     flow_threshold = 30
-
-    object_settings = _get_object_settings(object_type, settings)
-    model_name = object_settings['model_name']
-
-    cellpose_channels = _get_cellpose_channels(src, settings['nucleus_channel'], settings['pathogen_channel'], settings['cell_channel'])
-    if settings['verbose']:
-        print(cellpose_channels)
+    chans = [2, 1] if model_name == 'cyto2' else [0,0] if model_name == 'nucleus' else [1,0] if model_name == 'cyto' else [2, 0]
 
-    channels = cellpose_channels[object_type]
-    cellpose_batch_size = _get_cellpose_batch_size()
-    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-    model = _choose_model(model_name, device, object_type='cell', restore_type=None)
-    chans = [2, 1] if model_name == 'cyto2' else [0,0] if model_name == 'nucleus' else [2,0] if model_name == 'cyto' else [2, 0] if model_name == 'cyto3' else [2, 0]
-    paths = [os.path.join(src, file) for file in os.listdir(src) if file.endswith('.npz')]
+    if grayscale:
+        chans=[0, 0]
 
-    count_loc = os.path.dirname(src)+'/measurements/measurements.db'
-    os.makedirs(os.path.dirname(src)+'/measurements', exist_ok=True)
-    _create_database(count_loc)
+    all_image_files = [os.path.join(src, f) for f in os.listdir(src) if f.endswith('.tif')]
+    random.shuffle(all_image_files)
+
+
+    if verbose == True:
+        print(f'Cellpose settings: Model: {model_name}, channels: {channels}, cellpose_chans: {chans}, diameter:{diameter}, flow_threshold:{flow_threshold}, cellprob_threshold:{cellprob_threshold}')
 
-    average_sizes = []
     time_ls = []
-    for file_index, path in enumerate(paths):
-        name = os.path.basename(path)
-        name, ext = os.path.splitext(name)
-        output_folder = os.path.join(os.path.dirname(path), object_type+'_mask_stack')
-        os.makedirs(output_folder, exist_ok=True)
-        overall_average_size = 0
-        with np.load(path) as data:
-            stack = data['data']
-            filenames = data['filenames']
-        if settings['timelapse']:
-
-            trackable_objects = ['cell','nucleus','pathogen']
-            if not all_elements_match(settings['timelapse_objects'], trackable_objects):
-                print(f'timelapse_objects {settings["timelapse_objects"]} must be a subset of {trackable_objects}')
-                return
+    for i in range(0, len(all_image_files), batch_size):
+        image_files = all_image_files[i:i+batch_size]
 
-            if len(stack) != batch_size:
-                print(f'Changed batch_size:{batch_size} to {len(stack)}, data length:{len(stack)}')
-                settings['timelapse_batch_size'] = len(stack)
-                batch_size = len(stack)
-                if isinstance(timelapse_frame_limits, list):
-                    if len(timelapse_frame_limits) >= 2:
-                        stack = stack[timelapse_frame_limits[0]: timelapse_frame_limits[1], :, :, :].astype(stack.dtype)
-                        filenames = filenames[timelapse_frame_limits[0]: timelapse_frame_limits[1]]
-                        batch_size = len(stack)
-                        print(f'Cut batch at indecies: {timelapse_frame_limits}, New batch_size: {batch_size} ')
+        if normalize:
+            images, _, image_names, _ = _load_normalized_images_and_labels(image_files=image_files, label_files=None, signal_thresholds=100, channels=channels, percentiles=percentiles, circular=circular, invert=invert, visualize=plot)
+            images = [np.squeeze(img) if img.shape[-1] == 1 else img for img in images]
+            orig_dims = [(image.shape[0], image.shape[1]) for image in images]
+        else:
+            images, _, image_names, _ = _load_images_and_labels(image_files=image_files, label_files=None, circular=circular, invert=invert)
+            images = [np.squeeze(img) if img.shape[-1] == 1 else img for img in images]
+            orig_dims = [(image.shape[0], image.shape[1]) for image in images]
+        if resize:
+            images, _ = resize_images_and_labels(images, None, target_height, target_width, True)
 
-        for i in range(0, stack.shape[0], batch_size):
-            mask_stack = []
+        for file_index, stack in enumerate(images):
             start = time.time()
+            output = model.eval(x=stack,
+                                normalize=False,
+                                channels=chans,
+                                channel_axis=3,
+                                diameter=diameter,
+                                flow_threshold=flow_threshold,
+                                cellprob_threshold=cellprob_threshold,
+                                rescale=False,
+                                resample=False,
+                                progress=True)
 
-            if stack.shape[3] == 1:
-                batch = stack[i: i+batch_size, :, :, [0,0]].astype(stack.dtype)
+            if len(output) == 4:
+                mask, flows, _, _ = output
+            elif len(output) == 3:
+                mask, flows, _ = output
             else:
-                batch = stack[i: i+batch_size, :, :, channels].astype(stack.dtype)
+                raise ValueError("Unexpected number of return values from model.eval()")
 
-            batch_filenames = filenames[i: i+batch_size].tolist()
+            if resize:
+                dims = orig_dims[file_index]
+                mask = resizescikit(mask, dims, order=0, preserve_range=True, anti_aliasing=False).astype(mask.dtype)
 
-            if not settings['plot']:
-                batch, batch_filenames = _check_masks(batch, batch_filenames, output_folder)
-            if batch.size == 0:
-                print(f'Processing {file_index}/{len(paths)}: Images/npz {batch.shape[0]}')
-                continue
-            if batch.max() > 1:
-                batch = batch / batch.max()
+            stop = time.time()
+            duration = (stop - start)
+            time_ls.append(duration)
+            average_time = np.mean(time_ls) if len(time_ls) > 0 else 0
+            print(f'Processing {file_index+1}/{len(images)} images : Time/image {average_time:.3f} sec', end='\r', flush=True)
+            if plot:
+                if resize:
+                    stack = resizescikit(stack, dims, preserve_range=True, anti_aliasing=False).astype(stack.dtype)
+                print_mask_and_flows(stack, mask, flows, overlay=True)
+            if save:
+                output_filename = os.path.join(dst, image_names[file_index])
+                cv2.imwrite(output_filename, mask)
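The 4-versus-3 branch in `generate_masks_from_imgs` accommodates both Cellpose APIs: `Cellpose.eval` returns `(masks, flows, styles, diams)` while `CellposeModel.eval` returns `(masks, flows, styles)` (true of cellpose 2.x/3.x; check your installed version). An equivalent, arity-agnostic unpacking, as a sketch:

    # Sketch: take only the leading two values so both 3- and 4-tuple
    # returns from model.eval() are handled the same way.
    output = model.eval(x=stack, channels=chans, diameter=diameter)
    mask, flows = output[0], output[1]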
 
-            if timelapse:
-                stitch_threshold=100.0
-                movie_path = os.path.join(os.path.dirname(src), 'movies')
-                os.makedirs(movie_path, exist_ok=True)
-                save_path = os.path.join(movie_path, f'timelapse_{object_type}_{name}.mp4')
-                _npz_to_movie(batch, batch_filenames, save_path, fps=2)
-            else:
-                stitch_threshold=0.0
 
-            print('batch.shape',batch.shape)
-            masks, flows, _, _ = model.eval(x=batch,
-                                            batch_size=cellpose_batch_size,
-                                            normalize=False,
-                                            channels=chans,
-                                            channel_axis=3,
-                                            diameter=object_settings['diameter'],
-                                            flow_threshold=flow_threshold,
-                                            cellprob_threshold=cellprob_threshold,
-                                            rescale=None,
-                                            resample=object_settings['resample'],
-                                            stitch_threshold=stitch_threshold)
+def check_cellpose_models(settings):
+
+    src = settings['src']
+    batch_size = settings['batch_size']
+    cellprob_threshold = settings['cellprob_threshold']
+    save = settings['save']
+    normalize = settings['normalize']
+    channels = settings['channels']
+    percentiles = settings['percentiles']
+    circular = settings['circular']
+    invert = settings['invert']
+    plot = settings['plot']
+    diameter = settings['diameter']
+    resize = settings['resize']
+    grayscale = settings['grayscale']
+    verbose = settings['verbose']
+    target_height = settings['width_height'][0]
+    target_width = settings['width_height'][1]
+
+    cellpose_models = ['cyto', 'nuclei', 'cyto2', 'cyto3']
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+    for model_name in cellpose_models:
+
+        model = cp_models.CellposeModel(gpu=True, model_type=model_name, device=device)
+        print(f'Using {model_name}')
+        generate_masks_from_imgs(src, model, model_name, batch_size, diameter, cellprob_threshold, grayscale, save, normalize, channels, percentiles, circular, invert, plot, resize, target_height, target_width, verbose)
+
+    return
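`check_cellpose_models` reads a flat settings dict whose keys are exactly those unpacked above. A hedged example call (paths and values are illustrative only):

    settings = {
        'src': '/path/to/tif_folder',     # folder containing .tif images
        'batch_size': 8,
        'cellprob_threshold': 0.0,
        'save': True,
        'normalize': True,
        'channels': [0, 1],
        'percentiles': None,
        'circular': False,
        'invert': False,
        'plot': False,
        'diameter': 30,
        'resize': False,
        'grayscale': True,
        'verbose': True,
        'width_height': [1024, 1024],     # [target_height, target_width]
    }
    check_cellpose_models(settings)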
+
+def compare_masks_v1(dir1, dir2, dir3, verbose=False):
+
+    from .io import _read_mask
+    from .plot import visualize_masks, plot_comparison_results
+    from .utils import extract_boundaries, boundary_f1_score, compute_segmentation_ap, jaccard_index, dice_coefficient
+
+    filenames = os.listdir(dir1)
+    results = []
+    cond_1 = os.path.basename(dir1)
+    cond_2 = os.path.basename(dir2)
+    cond_3 = os.path.basename(dir3)
+
+    for index, filename in enumerate(filenames):
+        print(f'Processing image:{index+1}', end='\r', flush=True)
+        path1, path2, path3 = os.path.join(dir1, filename), os.path.join(dir2, filename), os.path.join(dir3, filename)
+
+        print(path1)
+        print(path2)
+        print(path3)
+
+        if os.path.exists(path2) and os.path.exists(path3):
 
-            if timelapse:
+            mask1, mask2, mask3 = _read_mask(path1), _read_mask(path2), _read_mask(path3)
+            boundary_true1, boundary_true2, boundary_true3 = extract_boundaries(mask1), extract_boundaries(mask2), extract_boundaries(mask3)
+
+
+            true_masks, pred_masks = [mask1], [mask2, mask3] # Assuming mask1 is the ground truth for simplicity
+            true_labels, pred_labels_1, pred_labels_2 = label(mask1), label(mask2), label(mask3)
+            average_precision_0, average_precision_1 = compute_segmentation_ap(mask1, mask2), compute_segmentation_ap(mask1, mask3)
+            ap_scores = [average_precision_0, average_precision_1]
 
-                if settings['plot']:
-                    for idx, (mask, flow, image) in enumerate(zip(masks, flows[0], batch)):
-                        if idx == 0:
-                            num_objects = mask_object_count(mask)
-                            print(f'Number of objects: {num_objects}')
-                            plot_masks(batch=image, masks=mask, flows=flow, cmap='inferno', figuresize=figuresize, nr=1, file_type='.npz', print_object_number=True)
+            if verbose:
+                #unique_values1, unique_values2, unique_values3 = np.unique(mask1), np.unique(mask2), np.unique(mask3)
+                #print(f"Unique values in mask 1: {unique_values1}, mask 2: {unique_values2}, mask 3: {unique_values3}")
+                visualize_masks(boundary_true1, boundary_true2, boundary_true3, title=f"Boundaries - {filename}")
+
+            boundary_f1_12, boundary_f1_13, boundary_f1_23 = boundary_f1_score(mask1, mask2), boundary_f1_score(mask1, mask3), boundary_f1_score(mask2, mask3)
 
-                _save_object_counts_to_database(masks, object_type, batch_filenames, count_loc, added_string='_timelapse')
-                if object_type in timelapse_objects:
-                    if timelapse_mode == 'btrack':
-                        if not timelapse_displacement is None:
-                            radius = timelapse_displacement
-                        else:
-                            radius = 100
+            if (np.unique(mask1).size == 1 and np.unique(mask1)[0] == 0) and \
+               (np.unique(mask2).size == 1 and np.unique(mask2)[0] == 0) and \
+               (np.unique(mask3).size == 1 and np.unique(mask3)[0] == 0):
+                continue
+
+            if verbose:
+                #unique_values4, unique_values5, unique_values6 = np.unique(boundary_f1_12), np.unique(boundary_f1_13), np.unique(boundary_f1_23)
+                #print(f"Unique values in boundary mask 1: {unique_values4}, mask 2: {unique_values5}, mask 3: {unique_values6}")
+                visualize_masks(mask1, mask2, mask3, title=filename)
+
+            jaccard12 = jaccard_index(mask1, mask2)
+            dice12 = dice_coefficient(mask1, mask2)
+
+            jaccard13 = jaccard_index(mask1, mask3)
+            dice13 = dice_coefficient(mask1, mask3)
+
+            jaccard23 = jaccard_index(mask2, mask3)
+            dice23 = dice_coefficient(mask2, mask3)
 
-                    workers = os.cpu_count()-2
-                    if workers < 1:
-                        workers = 1
+            results.append({
+                f'filename': filename,
+                f'jaccard_{cond_1}_{cond_2}': jaccard12,
+                f'dice_{cond_1}_{cond_2}': dice12,
+                f'jaccard_{cond_1}_{cond_3}': jaccard13,
+                f'dice_{cond_1}_{cond_3}': dice13,
+                f'jaccard_{cond_2}_{cond_3}': jaccard23,
+                f'dice_{cond_2}_{cond_3}': dice23,
+                f'boundary_f1_{cond_1}_{cond_2}': boundary_f1_12,
+                f'boundary_f1_{cond_1}_{cond_3}': boundary_f1_13,
+                f'boundary_f1_{cond_2}_{cond_3}': boundary_f1_23,
+                f'average_precision_{cond_1}_{cond_2}': ap_scores[0],
+                f'average_precision_{cond_1}_{cond_3}': ap_scores[1]
+            })
+        else:
+            print(f'Cannot find {path1} or {path2} or {path3}')
+    fig = plot_comparison_results(results)
+    return results, fig
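`jaccard_index` and `dice_coefficient` come from `spacr.utils`, which is outside this diff; presumably they are the standard overlap scores. For binary masks A and B, Jaccard = |A∩B| / |A∪B| and Dice = 2|A∩B| / (|A| + |B|). A NumPy sketch of the usual forms, under that assumption:

    import numpy as np

    def jaccard_sketch(mask_a, mask_b):
        # Binarize labelled masks, then compute intersection over union.
        a, b = mask_a > 0, mask_b > 0
        union = np.logical_or(a, b).sum()
        return np.logical_and(a, b).sum() / union if union else 0.0

    def dice_sketch(mask_a, mask_b):
        # Dice weights the intersection twice relative to the combined areas.
        a, b = mask_a > 0, mask_b > 0
        total = a.sum() + b.sum()
        return 2 * np.logical_and(a, b).sum() / total if total else 0.0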
 
-                    mask_stack = _btrack_track_cells(src=src,
-                                                     name=name,
-                                                     batch_filenames=batch_filenames,
-                                                     object_type=object_type,
-                                                     plot=settings['plot'],
-                                                     save=settings['save'],
-                                                     masks_3D=masks,
-                                                     mode=timelapse_mode,
-                                                     timelapse_remove_transient=timelapse_remove_transient,
-                                                     radius=radius,
-                                                     workers=workers)
-                    if timelapse_mode == 'trackpy':
-                        mask_stack = _trackpy_track_cells(src=src,
-                                                          name=name,
-                                                          batch_filenames=batch_filenames,
-                                                          object_type=object_type,
-                                                          masks=masks,
-                                                          timelapse_displacement=timelapse_displacement,
-                                                          timelapse_memory=timelapse_memory,
-                                                          timelapse_remove_transient=timelapse_remove_transient,
-                                                          plot=settings['plot'],
-                                                          save=settings['save'],
-                                                          mode=timelapse_mode)
-                else:
-                    mask_stack = _masks_to_masks_stack(masks)
+def compare_cellpose_masks_v1(src, verbose=False):
+    from .io import _read_mask
+    from .plot import visualize_masks, plot_comparison_results, visualize_cellpose_masks
+    from .utils import extract_boundaries, boundary_f1_score, compute_segmentation_ap, jaccard_index
 
-            else:
-                _save_object_counts_to_database(masks, object_type, batch_filenames, count_loc, added_string='_before_filtration')
-                mask_stack = _filter_cp_masks(masks=masks,
-                                              flows=flows,
-                                              filter_size=object_settings['filter_size'],
-                                              filter_intensity=object_settings['filter_intensity'],
-                                              minimum_size=object_settings['minimum_size'],
-                                              maximum_size=object_settings['maximum_size'],
-                                              remove_border_objects=object_settings['remove_border_objects'],
-                                              merge=False,
-                                              batch=batch,
-                                              plot=settings['plot'],
-                                              figuresize=figuresize)
-
-                _save_object_counts_to_database(mask_stack, object_type, batch_filenames, count_loc, added_string='_after_filtration')
+    import os
+    import numpy as np
+    from skimage.measure import label
 
-            if not np.any(mask_stack):
-                average_obj_size = 0
-            else:
-                average_obj_size = _get_avg_object_size(mask_stack)
+    # Collect all subdirectories in src
+    dirs = [os.path.join(src, d) for d in os.listdir(src) if os.path.isdir(os.path.join(src, d))]
 
-            average_sizes.append(average_obj_size)
-            overall_average_size = np.mean(average_sizes) if len(average_sizes) > 0 else 0
+    dirs.sort() # Optional: sort directories if needed
 
-            stop = time.time()
-            duration = (stop - start)
-            time_ls.append(duration)
-            average_time = np.mean(time_ls) if len(time_ls) > 0 else 0
-            time_in_min = average_time/60
-            time_per_mask = average_time/batch_size
-            print(f'Processing {len(paths)} files with {batch_size} imgs: {(file_index+1)*(batch_size+1)}/{(len(paths))*(batch_size+1)}: Time/batch {time_in_min:.3f} min: Time/mask {time_per_mask:.3f}sec: {object_type} size: {overall_average_size:.3f} px2')
-            if not timelapse:
-                if settings['plot']:
-                    plot_masks(batch, mask_stack, flows, figuresize=figuresize, cmap='inferno', nr=batch_size)
-            if settings['save']:
-                for mask_index, mask in enumerate(mask_stack):
-                    output_filename = os.path.join(output_folder, batch_filenames[mask_index])
-                    np.save(output_filename, mask)
-                mask_stack = []
-                batch_filenames = []
-            gc.collect()
-            torch.cuda.empty_cache()
-    return
+    # Get common files in all directories
+    common_files = set(os.listdir(dirs[0]))
+    for d in dirs[1:]:
+        common_files.intersection_update(os.listdir(d))
+    common_files = list(common_files)
+
+    results = []
+    conditions = [os.path.basename(d) for d in dirs]
+
+    for index, filename in enumerate(common_files):
+        print(f'Processing image {index+1}/{len(common_files)}', end='\r', flush=True)
+        paths = [os.path.join(d, filename) for d in dirs]
+
+        # Check if file exists in all directories
+        if not all(os.path.exists(path) for path in paths):
+            print(f'Skipping {filename} as it is not present in all directories.')
+            continue
+
+        masks = [_read_mask(path) for path in paths]
+        boundaries = [extract_boundaries(mask) for mask in masks]
+
+        if verbose:
+            visualize_cellpose_masks(masks, titles=conditions, comparison_title=f"Masks Comparison for {filename}")
+
+        # Initialize data structure for results
+        file_results = {'filename': filename}
+
+        # Compare each mask with each other
+        for i in range(len(masks)):
+            for j in range(i + 1, len(masks)):
+                condition_i = conditions[i]
+                condition_j = conditions[j]
+                mask_i = masks[i]
+                mask_j = masks[j]
+
+                # Compute metrics
+                boundary_f1 = boundary_f1_score(mask_i, mask_j)
+                jaccard = jaccard_index(mask_i, mask_j)
+                average_precision = compute_segmentation_ap(mask_i, mask_j)
+
+                # Store results
+                file_results[f'jaccard_{condition_i}_{condition_j}'] = jaccard
+                file_results[f'boundary_f1_{condition_i}_{condition_j}'] = boundary_f1
+                file_results[f'average_precision_{condition_i}_{condition_j}'] = average_precision
+
+        results.append(file_results)
+
+    fig = plot_comparison_results(results)
+    return results, fig
+
+def compare_mask(args):
+    src, filename, dirs, conditions = args
+    paths = [os.path.join(d, filename) for d in dirs]
+
+    if not all(os.path.exists(path) for path in paths):
+        return None
+
+    from .io import _read_mask # Import here to avoid issues in multiprocessing
+    from .utils import extract_boundaries, boundary_f1_score, compute_segmentation_ap, jaccard_index
+    from .plot import plot_comparison_results
+
+    masks = [_read_mask(path) for path in paths]
+    file_results = {'filename': filename}
+
+    for i in range(len(masks)):
+        for j in range(i + 1, len(masks)):
+            mask_i, mask_j = masks[i], masks[j]
+            f1_score = boundary_f1_score(mask_i, mask_j)
+            jac_index = jaccard_index(mask_i, mask_j)
+            ap_score = compute_segmentation_ap(mask_i, mask_j)
+
+            file_results.update({
+                f'jaccard_{conditions[i]}_{conditions[j]}': jac_index,
+                f'boundary_f1_{conditions[i]}_{conditions[j]}': f1_score,
+                f'ap_{conditions[i]}_{conditions[j]}': ap_score
+            })
+
+    return file_results
+
+def compare_cellpose_masks(src, verbose=False, processes=None):
+    from .plot import visualize_cellpose_masks, plot_comparison_results
+    from .io import _read_mask
+    dirs = [os.path.join(src, d) for d in os.listdir(src) if os.path.isdir(os.path.join(src, d))]
+    dirs.sort() # Optional: sort directories if needed
+    conditions = [os.path.basename(d) for d in dirs]
+
+    # Get common files in all directories
+    common_files = set(os.listdir(dirs[0]))
+    for d in dirs[1:]:
+        common_files.intersection_update(os.listdir(d))
+    common_files = list(common_files)
+
+    # Create a pool of workers
+    with Pool(processes=processes) as pool:
+        args = [(src, filename, dirs, conditions) for filename in common_files]
+        results = pool.map(compare_mask, args)
+
+    # Filter out None results (from skipped files)
+    results = [res for res in results if res is not None]
+
+    if verbose:
+        for result in results:
+            filename = result['filename']
+            masks = [_read_mask(os.path.join(d, filename)) for d in dirs]
+            visualize_cellpose_masks(masks, titles=conditions, comparison_title=f"Masks Comparison for {filename}")
+
+    fig = plot_comparison_results(results)
+    return results, fig
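`compare_cellpose_masks` parallelizes the per-file comparison across a process pool; `Pool` is assumed to be imported from `multiprocessing` elsewhere in core.py, since no import for it appears in this hunk. Usage sketch, with an illustrative path:

    # Each subdirectory of src is treated as one condition; processes=None
    # lets multiprocessing default to os.cpu_count().
    results, fig = compare_cellpose_masks('/path/to/mask_dirs', verbose=False, processes=4)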
+
+
+def _calculate_similarity(df, features, col_to_compare, val1, val2):
+    """
+    Calculate similarity scores of each well to the positive and negative controls using various metrics.
+
+    Args:
+        df (pandas.DataFrame): DataFrame containing the data.
+        features (list): List of feature columns to use for similarity calculation.
+        col_to_compare (str): Column name to use for comparing groups.
+        val1, val2 (str): Values in col_to_compare to create subsets for comparison.
+
+    Returns:
+        pandas.DataFrame: DataFrame with similarity scores.
+    """
+    # Separate positive and negative control wells
+    pos_control = df[df[col_to_compare] == val1][features].mean()
+    neg_control = df[df[col_to_compare] == val2][features].mean()
+
+    # Standardize features for Mahalanobis distance
+    scaler = StandardScaler()
+    scaled_features = scaler.fit_transform(df[features])
+
+    # Regularize the covariance matrix to avoid singularity
+    cov_matrix = np.cov(scaled_features, rowvar=False)
+    inv_cov_matrix = None
+    try:
+        inv_cov_matrix = np.linalg.inv(cov_matrix)
+    except np.linalg.LinAlgError:
+        # Add a small value to the diagonal elements for regularization
+        epsilon = 1e-5
+        inv_cov_matrix = np.linalg.inv(cov_matrix + np.eye(cov_matrix.shape[0]) * epsilon)
+
+    # Calculate similarity scores
+    df['similarity_to_pos_euclidean'] = df[features].apply(lambda row: euclidean(row, pos_control), axis=1)
+    df['similarity_to_neg_euclidean'] = df[features].apply(lambda row: euclidean(row, neg_control), axis=1)
+    df['similarity_to_pos_cosine'] = df[features].apply(lambda row: cosine(row, pos_control), axis=1)
+    df['similarity_to_neg_cosine'] = df[features].apply(lambda row: cosine(row, neg_control), axis=1)
+    df['similarity_to_pos_mahalanobis'] = df[features].apply(lambda row: mahalanobis(row, pos_control, inv_cov_matrix), axis=1)
+    df['similarity_to_neg_mahalanobis'] = df[features].apply(lambda row: mahalanobis(row, neg_control, inv_cov_matrix), axis=1)
+    df['similarity_to_pos_manhattan'] = df[features].apply(lambda row: cityblock(row, pos_control), axis=1)
+    df['similarity_to_neg_manhattan'] = df[features].apply(lambda row: cityblock(row, neg_control), axis=1)
+    df['similarity_to_pos_minkowski'] = df[features].apply(lambda row: minkowski(row, pos_control, p=3), axis=1)
+    df['similarity_to_neg_minkowski'] = df[features].apply(lambda row: minkowski(row, neg_control, p=3), axis=1)
+    df['similarity_to_pos_chebyshev'] = df[features].apply(lambda row: chebyshev(row, pos_control), axis=1)
+    df['similarity_to_neg_chebyshev'] = df[features].apply(lambda row: chebyshev(row, neg_control), axis=1)
+    df['similarity_to_pos_hamming'] = df[features].apply(lambda row: hamming(row, pos_control), axis=1)
+    df['similarity_to_neg_hamming'] = df[features].apply(lambda row: hamming(row, neg_control), axis=1)
+    df['similarity_to_pos_jaccard'] = df[features].apply(lambda row: jaccard(row, pos_control), axis=1)
+    df['similarity_to_neg_jaccard'] = df[features].apply(lambda row: jaccard(row, neg_control), axis=1)
+    df['similarity_to_pos_braycurtis'] = df[features].apply(lambda row: braycurtis(row, pos_control), axis=1)
+    df['similarity_to_neg_braycurtis'] = df[features].apply(lambda row: braycurtis(row, neg_control), axis=1)
+
+    return df
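Two caveats worth noting when reading this function: the Mahalanobis inverse covariance appears to be computed from the *standardized* features while the rows and control vectors passed to `mahalanobis` are unstandardized, and SciPy's `hamming`/`jaccard` are defined for boolean vectors, so on continuous features they effectively score exact element matches. A usage sketch (column and value names illustrative):

    # Illustrative: score every row against 'c1' (positive) and 'c2' (negative) controls.
    features = [c for c in df.columns if 'channel_3' in c]   # illustrative feature subset
    df = _calculate_similarity(df, features, col_to_compare='col', val1='c1', val2='c2')
    print(df[['similarity_to_pos_euclidean', 'similarity_to_neg_euclidean']].head())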
+
+def _permutation_importance(df, feature_string='channel_3', col_to_compare='col', pos='c1', neg='c2', exclude=None, n_repeats=10, clean=True, nr_to_plot=30, n_estimators=100, test_size=0.2, random_state=42, model_type='xgboost', n_jobs=-1):
+    """
+    Calculates permutation importance for numerical features in the dataframe,
+    comparing groups based on specified column values, and uses the model to predict
+    the class for all other rows in the dataframe.
+
+    Args:
+        df (pandas.DataFrame): The DataFrame containing the data.
+        feature_string (str): String to filter features that contain this substring.
+        col_to_compare (str): Column name to use for comparing groups.
+        pos, neg (str): Values in col_to_compare to create subsets for comparison.
+        exclude (list or str, optional): Columns to exclude from features.
+        n_repeats (int): Number of repeats for permutation importance.
+        clean (bool): Whether to remove columns with a single value.
+        nr_to_plot (int): Number of top features to plot based on permutation importance.
+        n_estimators (int): Number of trees in the random forest, gradient boosting, or XGBoost model.
+        test_size (float): Proportion of the dataset to include in the test split.
+        random_state (int): Random seed for reproducibility.
+        model_type (str): Type of model to use ('random_forest', 'logistic_regression', 'gradient_boosting', 'xgboost').
+        n_jobs (int): Number of jobs to run in parallel for applicable models.
+
+    Returns:
+        list: [df, permutation_df, feature_importance_df, model, X_train, X_test, y_train, y_test] -
+            the dataframe with prediction and data-usage columns, the permutation importances,
+            the model feature importances, the fitted model, and the train/test splits.
+    """
+    if 'cells_per_well' in df.columns:
+        df = df.drop(columns=['cells_per_well'])
+
+    # Subset the dataframe based on specified column values
+    df1 = df[df[col_to_compare] == pos].copy()
+    df2 = df[df[col_to_compare] == neg].copy()
+
+    # Create target variable
+    df1['target'] = 0
+    df2['target'] = 1
+
+    # Combine the subsets for analysis
+    combined_df = pd.concat([df1, df2])
+
+    # Automatically select numerical features
+    features = combined_df.select_dtypes(include=[np.number]).columns.tolist()
+    features.remove('target')
+
+    if clean:
+        combined_df = combined_df.loc[:, combined_df.nunique() > 1]
+        features = [feature for feature in features if feature in combined_df.columns]
+
+    if feature_string is not None:
+        feature_list = ['channel_0', 'channel_1', 'channel_2', 'channel_3']
+
+        # Remove feature_string from the list if it exists
+        if feature_string in feature_list:
+            feature_list.remove(feature_string)
+
+        features = [feature for feature in features if feature_string in feature]
+
+        # Iterate through the list and remove columns from df
+        for feature_ in feature_list:
+            features = [feature for feature in features if feature_ not in feature]
+            print(f'After removing {feature_} features: {len(features)}')
+
+    if exclude:
+        if isinstance(exclude, list):
+            features = [feature for feature in features if feature not in exclude]
+        else:
+            features.remove(exclude)
+
+    X = combined_df[features]
+    y = combined_df['target']
+
+    # Split the data into training and testing sets
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
+
+    # Label the data in the original dataframe
+    combined_df['data_usage'] = 'train'
+    combined_df.loc[X_test.index, 'data_usage'] = 'test'
+
+    # Initialize the model based on model_type
+    if model_type == 'random_forest':
+        model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state, n_jobs=n_jobs)
+    elif model_type == 'logistic_regression':
+        model = LogisticRegression(max_iter=1000, random_state=random_state, n_jobs=n_jobs)
+    elif model_type == 'gradient_boosting':
+        model = HistGradientBoostingClassifier(max_iter=n_estimators, random_state=random_state) # Supports n_jobs internally
+    elif model_type == 'xgboost':
+        model = XGBClassifier(n_estimators=n_estimators, random_state=random_state, nthread=n_jobs, use_label_encoder=False, eval_metric='logloss')
+    else:
+        raise ValueError(f"Unsupported model_type: {model_type}")
+
+    model.fit(X_train, y_train)
+
+    perm_importance = permutation_importance(model, X_train, y_train, n_repeats=n_repeats, random_state=random_state, n_jobs=n_jobs)
+
+    # Create a DataFrame for permutation importances
+    permutation_df = pd.DataFrame({
+        'feature': [features[i] for i in perm_importance.importances_mean.argsort()],
+        'importance_mean': perm_importance.importances_mean[perm_importance.importances_mean.argsort()],
+        'importance_std': perm_importance.importances_std[perm_importance.importances_mean.argsort()]
+    }).tail(nr_to_plot)
+
+    # Plotting
+    fig, ax = plt.subplots()
+    ax.barh(permutation_df['feature'], permutation_df['importance_mean'], xerr=permutation_df['importance_std'], color="teal", align="center", alpha=0.6)
+    ax.set_xlabel('Permutation Importance')
+    plt.tight_layout()
+    plt.show()
+
+    # Feature importance for models that support it
+    if model_type in ['random_forest', 'xgboost', 'gradient_boosting']:
+        feature_importances = model.feature_importances_
+        feature_importance_df = pd.DataFrame({
+            'feature': features,
+            'importance': feature_importances
+        }).sort_values(by='importance', ascending=False).head(nr_to_plot)
+
+        # Plotting feature importance
+        fig, ax = plt.subplots()
+        ax.barh(feature_importance_df['feature'], feature_importance_df['importance'], color="blue", align="center", alpha=0.6)
+        ax.set_xlabel('Feature Importance')
+        plt.tight_layout()
+        plt.show()
+    else:
+        feature_importance_df = pd.DataFrame()
+
+    # Predicting the target variable for the test set
+    predictions_test = model.predict(X_test)
+    combined_df.loc[X_test.index, 'predictions'] = predictions_test
+
+    # Predicting the target variable for the training set
+    predictions_train = model.predict(X_train)
+    combined_df.loc[X_train.index, 'predictions'] = predictions_train
+
+    # Predicting the target variable for all other rows in the dataframe
+    X_all = df[features]
+    all_predictions = model.predict(X_all)
+    df['predictions'] = all_predictions
+
+    # Combine data usage labels back to the original dataframe
+    combined_data_usage = pd.concat([combined_df[['data_usage']], df[['predictions']]], axis=0)
+    df = df.join(combined_data_usage, how='left', rsuffix='_model')
+
+    # Calculating and printing the accuracy metrics
+    accuracy = accuracy_score(y_test, predictions_test)
+    precision = precision_score(y_test, predictions_test)
+    recall = recall_score(y_test, predictions_test)
+    f1 = f1_score(y_test, predictions_test)
+    print(f"Accuracy: {accuracy}")
+    print(f"Precision: {precision}")
+    print(f"Recall: {recall}")
+    print(f"F1 Score: {f1}")
+
+    # Printing class-specific accuracy metrics
+    print("\nClassification Report:")
+    print(classification_report(y_test, predictions_test))
+
+    df = _calculate_similarity(df, features, col_to_compare, pos, neg)
+
+    return [df, permutation_df, feature_importance_df, model, X_train, X_test, y_train, y_test]
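A hedged usage sketch of `_permutation_importance`; the column name `col` and the values `c1`/`c2` mirror the defaults above and stand in for real plate-layout metadata:

    # Train on channel-3 features only and unpack the 8-item return list.
    df_out, perm_df, fi_df, model, X_train, X_test, y_train, y_test = _permutation_importance(
        df, feature_string='channel_3', col_to_compare='col', pos='c1', neg='c2',
        model_type='xgboost', n_estimators=100, n_jobs=-1)
    print(perm_df.tail())   # strongest features by permutation importance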
+
+def _shap_analysis(model, X_train, X_test):
+    """
+    Performs SHAP analysis on the given model and data.
+
+    Args:
+        model: The trained model.
+        X_train (pandas.DataFrame): Training feature set.
+        X_test (pandas.DataFrame): Testing feature set.
+    """
+    explainer = shap.Explainer(model, X_train)
+    shap_values = explainer(X_test)
+
+    # Summary plot
+    shap.summary_plot(shap_values, X_test)
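`shap.Explainer` dispatches on the model type (tree ensembles get the fast tree explainer; other models fall back to a masker-based explainer), so the same call should work for all four `model_type` options above. SHAP evaluation scales with the number of rows; a common pattern, sketched with an illustrative subsample size:

    # Explain a subsample of the test set to keep runtime manageable.
    _shap_analysis(model, X_train, X_test.sample(min(len(X_test), 500), random_state=0))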
+
+def plate_heatmap(src, model_type='xgboost', variable='predictions', grouping='mean', min_max='allq', cmap='viridis', channel_of_interest=3, min_count=25, n_estimators=100, col_to_compare='col', pos='c1', neg='c2', exclude=None, n_repeats=10, clean=True, nr_to_plot=20, verbose=False, n_jobs=-1):
+    from .io import _read_and_merge_data
+    from .plot import _plot_plates
+
+    db_loc = [src+'/measurements/measurements.db']
+    tables = ['cell', 'nucleus', 'pathogen','cytoplasm']
+    include_multinucleated, include_multiinfected, include_noninfected = True, 2.0, True
+
+    df, _ = _read_and_merge_data(db_loc,
+                                 tables,
+                                 verbose=verbose,
+                                 include_multinucleated=include_multinucleated,
+                                 include_multiinfected=include_multiinfected,
+                                 include_noninfected=include_noninfected)
+
+    if not channel_of_interest is None:
+        df['recruitment'] = df[f'pathogen_channel_{channel_of_interest}_mean_intensity']/df[f'cytoplasm_channel_{channel_of_interest}_mean_intensity']
+        feature_string = f'channel_{channel_of_interest}'
+    else:
+        feature_string = None
+
+    output = _permutation_importance(df, feature_string, col_to_compare, pos, neg, exclude, n_repeats, clean, nr_to_plot, n_estimators=n_estimators, random_state=42, model_type=model_type, n_jobs=n_jobs)
+
+    _shap_analysis(output[3], output[4], output[5])
+
+    features = output[0].select_dtypes(include=[np.number]).columns.tolist()
+
+    if not variable in features:
+        raise ValueError(f"Variable {variable} not found in the dataframe. Please choose one of the following: {features}")
+
+    plate_heatmap = _plot_plates(output[0], variable, grouping, min_max, cmap, min_count)
+    return [output, plate_heatmap]
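Usage sketch for `plate_heatmap`; the path is illustrative and assumes the spacr layout with `measurements/measurements.db` under `src`:

    output, heatmap_fig = plate_heatmap('/path/to/experiment',
                                        model_type='xgboost',
                                        variable='predictions',
                                        channel_of_interest=3,
                                        pos='c1', neg='c2')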
+
+def join_measurments_and_annotation(src, tables = ['cell', 'nucleus', 'pathogen','cytoplasm']):
+
+    from .io import _read_and_merge_data, _read_db
+
+    db_loc = [src+'/measurements/measurements.db']
+    loc = src+'/measurements/measurements.db'
+    df, _ = _read_and_merge_data(db_loc,
+                                 tables,
+                                 verbose=True,
+                                 include_multinucleated=True,
+                                 include_multiinfected=True,
+                                 include_noninfected=True)
+
+    paths_df = _read_db(loc, tables=['png_list'])
+
+    merged_df = pd.merge(df, paths_df[0], on='prcfo', how='left')
+
+    return merged_df
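The merge key `prcfo` is the per-object identifier shared by the measurement tables and `png_list` (presumably plate/row/column/field/object); the left join keeps every measured object even when no PNG annotation exists for it. Usage sketch (path illustrative):

    merged_df = join_measurments_and_annotation('/path/to/experiment')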
+
+def jitterplot_by_annotation(src, x_column, y_column, plot_title='Jitter Plot', output_path=None, filter_column=None, filter_values=None):
+    """
+    Reads measurement and annotation data from the measurements database and creates a jitter plot of one column grouped by another column.
+
+    Args:
+        src (str): Path to the source data.
+        x_column (str): Name of the column to be used for the x-axis.
+        y_column (str): Name of the column to be used for the y-axis.
+        plot_title (str): Title of the plot. Default is 'Jitter Plot'.
+        output_path (str): Path to save the plot image. If None, the plot will be displayed. Default is None.
+        filter_column (str or list): Column(s) used to filter rows before plotting. Default is None.
+        filter_values (list): Allowed value(s) for each entry in filter_column. Default is None.
+
+    Returns:
+        pd.DataFrame: The filtered and balanced DataFrame.
+    """
+    # Load and merge the measurement and annotation tables into a DataFrame
+    df = join_measurments_and_annotation(src, tables=['cell', 'nucleus', 'pathogen', 'cytoplasm'])
+
+    # Print column names for debugging
+    print(f"Generated dataframe with: {df.shape[1]} columns and {df.shape[0]} rows")
+    #print("Columns in DataFrame:", df.columns.tolist())
+
+    # Replace NaN values with a specific label in x_column
+    df[x_column] = df[x_column].fillna('NaN')
+
+    # Filter the DataFrame if filter_column and filter_values are provided
+    if not filter_column is None:
+        if isinstance(filter_column, str):
+            df = df[df[filter_column].isin(filter_values)]
+        if isinstance(filter_column, list):
+            for i,val in enumerate(filter_column):
+                print(f'Rows before filtering on {val}: {len(df)}')
+                df = df[df[val].isin(filter_values[i])]
+
+    # Use the correct column names based on your DataFrame
+    required_columns = ['plate_x', 'row_x', 'col_x']
+    if not all(column in df.columns for column in required_columns):
+        raise KeyError(f"DataFrame does not contain the necessary columns: {required_columns}")
+
+    # Filter to retain rows with non-NaN values in x_column and with matching plate, row, col values
+    non_nan_df = df[df[x_column] != 'NaN']
+    retained_rows = df[df[['plate_x', 'row_x', 'col_x']].apply(tuple, axis=1).isin(non_nan_df[['plate_x', 'row_x', 'col_x']].apply(tuple, axis=1))]
+
+    # Determine the minimum count of examples across all groups in x_column
+    min_count = retained_rows[x_column].value_counts().min()
+    print(f'Found {min_count} annotated images')
+
+    # Randomly sample min_count examples from each group in x_column
+    balanced_df = retained_rows.groupby(x_column).apply(lambda x: x.sample(min_count, random_state=42)).reset_index(drop=True)
+
+    # Create the jitter plot
+    plt.figure(figsize=(10, 6))
+    jitter_plot = sns.stripplot(data=balanced_df, x=x_column, y=y_column, hue=x_column, jitter=True, palette='viridis', dodge=False)
+    plt.title(plot_title)
+    plt.xlabel(x_column)
+    plt.ylabel(y_column)
+
+    # Customize the x-axis labels
+    plt.xticks(rotation=45, ha='right')
+
+    # Adjust the position of the x-axis labels to be centered below the data
+    ax = plt.gca()
+    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='center')
+
+    # Save the plot to a file or display it
+    if output_path:
+        plt.savefig(output_path, bbox_inches='tight')
+        print(f"Jitter plot saved to {output_path}")
+    else:
+        plt.show()
+
+    return balanced_df
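Usage sketch for `jitterplot_by_annotation`; column names are illustrative and depend on the measurement schema and annotations present:

    balanced = jitterplot_by_annotation('/path/to/experiment',
                                        x_column='annotation', y_column='recruitment',
                                        plot_title='Recruitment by annotation class',
                                        output_path='jitter.png')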