PyPI - spacr - Versions diffs - 0.0.2__py3-none-any.whl → 0.0.6__py3-none-any.whl - Mend

spacr-0.0.2-py3-none-any.whl → spacr-0.0.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

spacr/__init__.py +2 -2
spacr/__main__.py +0 -2
spacr/alpha.py +803 -14
spacr/annotate_app.py +118 -120
spacr/chris.py +50 -0
spacr/core.py +1544 -533
spacr/deep_spacr.py +696 -0
spacr/foldseek.py +779 -0
spacr/get_alfafold_structures.py +72 -0
spacr/graph_learning.py +297 -253
spacr/gui.py +145 -0
spacr/gui_2.py +90 -0
spacr/gui_classify_app.py +70 -80
spacr/gui_mask_app.py +114 -91
spacr/gui_measure_app.py +109 -88
spacr/gui_utils.py +376 -32
spacr/io.py +441 -438
spacr/mask_app.py +116 -9
spacr/measure.py +169 -69
spacr/models/cp/toxo_pv_lumen.CP_model +0 -0
spacr/old_code.py +70 -2
spacr/plot.py +173 -17
spacr/sequencing.py +1130 -0
spacr/sim.py +630 -125
spacr/timelapse.py +139 -10
spacr/train.py +188 -21
spacr/umap.py +0 -689
spacr/utils.py +1360 -119
{spacr-0.0.2.dist-info → spacr-0.0.6.dist-info}/METADATA +17 -29
spacr-0.0.6.dist-info/RECORD +39 -0
{spacr-0.0.2.dist-info → spacr-0.0.6.dist-info}/WHEEL +1 -1
spacr-0.0.6.dist-info/entry_points.txt +9 -0
spacr-0.0.2.dist-info/RECORD +0 -31
spacr-0.0.2.dist-info/entry_points.txt +0 -7
{spacr-0.0.2.dist-info → spacr-0.0.6.dist-info}/LICENSE +0 -0
{spacr-0.0.2.dist-info → spacr-0.0.6.dist-info}/top_level.txt +0 -0

spacr/core.py CHANGED Viewed

@@ -1,10 +1,9 @@
-import os, sqlite3, gc, torch, time, random, shutil, cv2, tarfile, datetime
+import os, sqlite3, gc, torch, time, random, shutil, cv2, tarfile, datetime, shap
-# image and array processing
 import numpy as np
 import pandas as pd
-import cellpose
+from cellpose import train
 from cellpose import models as cp_models
 import statsmodels.formula.api as smf
@@ -14,23 +13,37 @@ from IPython.display import display
 from multiprocessing import Pool, cpu_count, Value, Lock
 import seaborn as sns
-import matplotlib.pyplot as plt
 from skimage.measure import regionprops, label
-import skimage.measure as measure
+from skimage.morphology import square
 from skimage.transform import resize as resizescikit
-from sklearn.model_selection import train_test_split
 from collections import defaultdict
-import multiprocessing
 from torch.utils.data import DataLoader, random_split
+from sklearn.cluster import KMeans
+from sklearn.decomposition import PCA
+from skimage import measure
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import  IsolationForest, RandomForestClassifier, HistGradientBoostingClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.inspection import permutation_importance
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
+from sklearn.preprocessing import StandardScaler
+from scipy.ndimage import binary_dilation
+from scipy.spatial.distance import cosine, euclidean, mahalanobis, cityblock, minkowski, chebyshev, hamming, jaccard, braycurtis
+import torchvision.transforms as transforms
+from xgboost import XGBClassifier
+import shap
+import matplotlib.pyplot as plt
 import matplotlib
 matplotlib.use('Agg')
+#import matplotlib.pyplot as plt
-import torchvision.transforms as transforms
-from sklearn.model_selection import train_test_split
-from sklearn.ensemble import  IsolationForest
 from .logger import log_function_call
 def analyze_plaques(folder):
     summary_data = []
     details_data = []
@@ -67,169 +80,95 @@ def analyze_plaques(folder):
     print(f"Analysis completed and saved to database '{db_name}'.")
-def compare_masks(dir1, dir2, dir3, verbose=False):
-    from .io import _read_mask
-    from .plot import visualize_masks, plot_comparison_results
-    from .utils import extract_boundaries, boundary_f1_score, compute_segmentation_ap, jaccard_index, dice_coefficient
-    filenames = os.listdir(dir1)
-    results = []
-    cond_1 = os.path.basename(dir1)
-    cond_2 = os.path.basename(dir2)
-    cond_3 = os.path.basename(dir3)
-    for index, filename in enumerate(filenames):
-        print(f'Processing image:{index+1}', end='\r', flush=True)
-        path1, path2, path3 = os.path.join(dir1, filename), os.path.join(dir2, filename), os.path.join(dir3, filename)
-        if os.path.exists(path2) and os.path.exists(path3):
-            mask1, mask2, mask3 = _read_mask(path1), _read_mask(path2), _read_mask(path3)
-            boundary_true1, boundary_true2, boundary_true3 = extract_boundaries(mask1), extract_boundaries(mask2), extract_boundaries(mask3)
-            true_masks, pred_masks = [mask1], [mask2, mask3]  # Assuming mask1 is the ground truth for simplicity
-            true_labels, pred_labels_1, pred_labels_2 = label(mask1), label(mask2), label(mask3)
-            average_precision_0, average_precision_1 = compute_segmentation_ap(mask1, mask2), compute_segmentation_ap(mask1, mask3)
-            ap_scores = [average_precision_0, average_precision_1]
-            if verbose:
-                unique_values1, unique_values2, unique_values3 = np.unique(mask1),  np.unique(mask2), np.unique(mask3)
-                print(f"Unique values in mask 1: {unique_values1}, mask 2: {unique_values2}, mask 3: {unique_values3}")
-                visualize_masks(boundary_true1, boundary_true2, boundary_true3, title=f"Boundaries - {filename}")
-            boundary_f1_12, boundary_f1_13, boundary_f1_23 = boundary_f1_score(mask1, mask2), boundary_f1_score(mask1, mask3), boundary_f1_score(mask2, mask3)
-            if (np.unique(mask1).size == 1 and np.unique(mask1)[0] == 0) and \
-               (np.unique(mask2).size == 1 and np.unique(mask2)[0] == 0) and \
-               (np.unique(mask3).size == 1 and np.unique(mask3)[0] == 0):
-                continue
-            if verbose:
-                unique_values4, unique_values5, unique_values6 = np.unique(boundary_f1_12), np.unique(boundary_f1_13), np.unique(boundary_f1_23)
-                print(f"Unique values in boundary mask 1: {unique_values4}, mask 2: {unique_values5}, mask 3: {unique_values6}")
-                visualize_masks(mask1, mask2, mask3, title=filename)
-            jaccard12 = jaccard_index(mask1, mask2)
-            dice12 = dice_coefficient(mask1, mask2)
-            jaccard13 = jaccard_index(mask1, mask3)
-            dice13 = dice_coefficient(mask1, mask3)
-            jaccard23 = jaccard_index(mask2, mask3)
-            dice23 = dice_coefficient(mask2, mask3)
-            results.append({
-                f'filename': filename,
-                f'jaccard_{cond_1}_{cond_2}': jaccard12,
-                f'dice_{cond_1}_{cond_2}': dice12,
-                f'jaccard_{cond_1}_{cond_3}': jaccard13,
-                f'dice_{cond_1}_{cond_3}': dice13,
-                f'jaccard_{cond_2}_{cond_3}': jaccard23,
-                f'dice_{cond_2}_{cond_3}': dice23,
-                f'boundary_f1_{cond_1}_{cond_2}': boundary_f1_12,
-                f'boundary_f1_{cond_1}_{cond_3}': boundary_f1_13,
-                f'boundary_f1_{cond_2}_{cond_3}': boundary_f1_23,
-                f'average_precision_{cond_1}_{cond_2}': ap_scores[0],
-                f'average_precision_{cond_1}_{cond_3}': ap_scores[1]
-            })
-        else:
-            print(f'Cannot find {path1} or {path2} or {path3}')
-    fig = plot_comparison_results(results)
-    return results, fig
-def generate_cp_masks(settings):
-    src = settings['src']
-    model_name = settings['model_name']
-    channels = settings['channels']
-    diameter = settings['diameter']
-    regex = '.tif'
-    #flow_threshold = 30
-    cellprob_threshold = settings['cellprob_threshold']
-    figuresize = 25
-    cmap = 'inferno'
-    verbose = settings['verbose']
-    plot = settings['plot']
-    save = settings['save']
-    custom_model = settings['custom_model']
-    signal_thresholds = 1000
-    normalize = settings['normalize']
-    resize = settings['resize']
-    target_height = settings['width_height'][1]
-    target_width = settings['width_height'][0]
-    rescale = settings['rescale']
-    resample = settings['resample']
-    net_avg = settings['net_avg']
-    invert = settings['invert']
-    circular = settings['circular']
-    percentiles = settings['percentiles']
-    overlay = settings['overlay']
-    grayscale = settings['grayscale']
-    flow_threshold = settings['flow_threshold']
-    batch_size = settings['batch_size']
-    dst = os.path.join(src,'masks')
-    os.makedirs(dst, exist_ok=True)
-    identify_masks(src, dst, model_name, channels, diameter, batch_size, flow_threshold, cellprob_threshold, figuresize, cmap, verbose, plot, save, custom_model, signal_thresholds, normalize, resize, target_height, target_width, rescale, resample, net_avg, invert, circular, percentiles, overlay, grayscale)
 def train_cellpose(settings):
     from .io import _load_normalized_images_and_labels, _load_images_and_labels
     from .utils import resize_images_and_labels
     img_src = settings['img_src']
-    mask_src= settings['mask_src']
-    secondary_image_dir = None
-    model_name = settings['model_name']
-    model_type = settings['model_type']
-    learning_rate = settings['learning_rate']
-    weight_decay = settings['weight_decay']
-    batch_size = settings['batch_size']
-    n_epochs = settings['n_epochs']
-    verbose = settings['verbose']
-    signal_thresholds = settings['signal_thresholds']
-    channels = settings['channels']
-    from_scratch = settings['from_scratch']
-    diameter = settings['diameter']
-    resize = settings['resize']
-    rescale = settings['rescale']
-    normalize = settings['normalize']
-    target_height = settings['width_height'][1]
-    target_width = settings['width_height'][0]
-    circular = settings['circular']
-    invert = settings['invert']
-    percentiles = settings['percentiles']
-    grayscale = settings['grayscale']
+    mask_src = os.path.join(img_src, 'masks')
+    model_name = settings.setdefault( 'model_name', '')
+    model_name = settings.setdefault('model_name', 'model_name')
+    model_type = settings.setdefault( 'model_type', 'cyto')
+    learning_rate = settings.setdefault( 'learning_rate', 0.01)
+    weight_decay = settings.setdefault( 'weight_decay', 1e-05)
+    batch_size = settings.setdefault( 'batch_size', 50)
+    n_epochs = settings.setdefault( 'n_epochs', 100)
+    from_scratch = settings.setdefault( 'from_scratch', False)
+    diameter = settings.setdefault( 'diameter', 40)
+    remove_background = settings.setdefault( 'remove_background', False)
+    background = settings.setdefault( 'background', 100)
+    Signal_to_noise = settings.setdefault( 'Signal_to_noise', 10)
+    verbose = settings.setdefault( 'verbose', False)
+    channels = settings.setdefault( 'channels', [0,0])
+    normalize = settings.setdefault( 'normalize', True)
+    percentiles = settings.setdefault( 'percentiles', None)
+    circular = settings.setdefault( 'circular', False)
+    invert = settings.setdefault( 'invert', False)
+    resize = settings.setdefault( 'resize', False)
+    if resize:
+        target_height = settings['width_height'][1]
+        target_width = settings['width_height'][0]
+    grayscale = settings.setdefault( 'grayscale', True)
+    rescale = settings.setdefault( 'channels', False)
+    test = settings.setdefault( 'test', False)
+    if test:
+        test_img_src = os.path.join(os.path.dirname(img_src), 'test')
+        test_mask_src = os.path.join(test_img_src, 'mask')
+    test_images, test_masks, test_image_names, test_mask_names = None,None,None,None,
     print(settings)
     if from_scratch:
         model_name=f'scratch_{model_name}_{model_type}_e{n_epochs}_X{target_width}_Y{target_height}.CP_model'
     else:
-        model_name=f'{model_name}_{model_type}_e{n_epochs}_X{target_width}_Y{target_height}.CP_model'
+        if resize:
+            model_name=f'{model_name}_{model_type}_e{n_epochs}_X{target_width}_Y{target_height}.CP_model'
+        else:
+            model_name=f'{model_name}_{model_type}_e{n_epochs}.CP_model'
     model_save_path = os.path.join(mask_src, 'models', 'cellpose_model')
-    os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
+    print(model_save_path)
+    os.makedirs(model_save_path, exist_ok=True)
     settings_df = pd.DataFrame(list(settings.items()), columns=['Key', 'Value'])
     settings_csv = os.path.join(model_save_path,f'{model_name}_settings.csv')
     settings_df.to_csv(settings_csv, index=False)
-    if model_type =='cyto':
-        if not from_scratch:
-            model = cp_models.CellposeModel(gpu=True, model_type=model_type)
-        else:
-            model = cp_models.CellposeModel(gpu=True, model_type=model_type, net_avg=False, diam_mean=diameter, pretrained_model=None)
-    if model_type !='cyto':
+    if from_scratch:
+        model = cp_models.CellposeModel(gpu=True, model_type=model_type, diam_mean=diameter, pretrained_model=None)
+    else:
         model = cp_models.CellposeModel(gpu=True, model_type=model_type)
-    if normalize:
-        images, masks, image_names, mask_names = _load_normalized_images_and_labels(image_dir=img_src, label_dir=mask_src, secondary_image_dir=secondary_image_dir, signal_thresholds=signal_thresholds, channels=channels, percentiles=percentiles,  circular=circular, invert=invert, visualize=verbose)
+    if normalize:
+        image_files = [os.path.join(img_src, f) for f in os.listdir(img_src) if f.endswith('.tif')]
+        label_files = [os.path.join(mask_src, f) for f in os.listdir(mask_src) if f.endswith('.tif')]
+        images, masks, image_names, mask_names = _load_normalized_images_and_labels(image_files, label_files, channels, percentiles,  circular, invert, verbose, remove_background, background, Signal_to_noise)
         images = [np.squeeze(img) if img.shape[-1] == 1 else img for img in images]
+        if test:
+            test_image_files = [os.path.join(test_img_src, f) for f in os.listdir(test_img_src) if f.endswith('.tif')]
+            test_label_files = [os.path.join(test_mask_src, f) for f in os.listdir(test_mask_src) if f.endswith('.tif')]
+            test_images, test_masks, test_image_names, test_mask_names = _load_normalized_images_and_labels(test_image_files, test_label_files, channels, percentiles,  circular, invert, verbose, remove_background, background, Signal_to_noise)
+            test_images = [np.squeeze(img) if img.shape[-1] == 1 else img for img in test_images]
     else:
         images, masks, image_names, mask_names = _load_images_and_labels(img_src, mask_src, circular, invert)
         images = [np.squeeze(img) if img.shape[-1] == 1 else img for img in images]
+        if test:
+            test_images, test_masks, test_image_names, test_mask_names = _load_images_and_labels(img_src=test_img_src, mask_src=test_mask_src, circular=circular, invert=invert)
+            test_images = [np.squeeze(img) if img.shape[-1] == 1 else img for img in test_images]
     if resize:
         images, masks = resize_images_and_labels(images, masks, target_height, target_width, show_example=True)
@@ -248,25 +187,41 @@ def train_cellpose(settings):
     print(f'image shape: {images[0].shape}, image type: images[0].shape mask shape: {masks[0].shape}, image type: masks[0].shape')
     save_every = int(n_epochs/10)
-    print('cellpose image input dtype', images[0].dtype)
-    print('cellpose mask input dtype', masks[0].dtype)
-    # Train the model
-    model.train(train_data=images, #(list of arrays (2D or 3D)) – images for training
-                train_labels=masks, #(list of arrays (2D or 3D)) – labels for train_data, where 0=no masks; 1,2,…=mask labels can include flows as additional images
-                train_files=image_names, #(list of strings) – file names for images in train_data (to save flows for future runs)
-                channels=cp_channels, #(list of ints (default, None)) – channels to use for training
-                normalize=False, #(bool (default, True)) – normalize data so 0.0=1st percentile and 1.0=99th percentile of image intensities in each channel
-                save_path=model_save_path, #(string (default, None)) – where to save trained model, if None it is not saved
-                save_every=save_every, #(int (default, 100)) – save network every [save_every] epochs
-                learning_rate=learning_rate, #(float or list/np.ndarray (default, 0.2)) – learning rate for training, if list, must be same length as n_epochs
-                n_epochs=n_epochs, #(int (default, 500)) – how many times to go through whole training set during training
-                weight_decay=weight_decay, #(float (default, 0.00001)) –
-                SGD=True, #(bool (default, True)) – use SGD as optimization instead of RAdam
-                batch_size=batch_size, #(int (optional, default 8)) – number of 224x224 patches to run simultaneously on the GPU (can make smaller or bigger depending on GPU memory usage)
-                nimg_per_epoch=None, #(int (optional, default None)) – minimum number of images to train on per epoch, with a small training set (< 8 images) it may help to set to 8
-                rescale=rescale, #(bool (default, True)) – whether or not to rescale images to diam_mean during training, if True it assumes you will fit a size model after training or resize your images accordingly, if False it will try to train the model to be scale-invariant (works worse)
-                min_train_masks=1, #(int (default, 5)) – minimum number of masks an image must have to use in training set
-                model_name=model_name) #(str (default, None)) – name of network, otherwise saved with name as params + training start time
+    if save_every < 10:
+        save_every = n_epochs
+    train.train_seg(model.net,
+                    train_data=images,
+                    train_labels=masks,
+                    train_files=image_names,
+                    train_labels_files=mask_names,
+                    train_probs=None,
+                    test_data=test_images,
+                    test_labels=test_masks,
+                    test_files=test_image_names,
+                    test_labels_files=test_mask_names,
+                    test_probs=None,
+                    load_files=True,
+                    batch_size=batch_size,
+                    learning_rate=learning_rate,
+                    n_epochs=n_epochs,
+                    weight_decay=weight_decay,
+                    momentum=0.9,
+                    SGD=False,
+                    channels=cp_channels,
+                    channel_axis=None,
+                    #rgb=False,
+                    normalize=False,
+                    compute_flows=False,
+                    save_path=model_save_path,
+                    save_every=save_every,
+                    nimg_per_epoch=None,
+                    nimg_test_per_epoch=None,
+                    rescale=rescale,
+                    #scale_range=None,
+                    #bsize=224,
+                    min_train_masks=1,
+                    model_name=model_name)
     return print(f"Model saved at: {model_save_path}/{model_name}")
@@ -831,15 +786,6 @@ def merge_pred_mes(src,
     if verbose:
         _plot_histograms_and_stats(df=joined_df)
-    #dv = joined_df.copy()
-    #if 'prc' not in dv.columns:
-    #dv['prc'] = dv['plate'] + '_' + dv['row'] + '_' + dv['col']
-    #dv = dv[['pred']].groupby('prc').mean()
-    #dv.set_index('prc', inplace=True)
-    #loc = '/mnt/data/CellVoyager/20x/tsg101/crispr_screen/all/measurements/dv.csv'
-    #dv.to_csv(loc, index=True, header=True, mode='w')
     return joined_df
@@ -926,30 +872,38 @@ def annotate_results(pred_loc):
     display(df)
     return df
-def generate_dataset(src, file_type=None, experiment='TSG101_screen', sample=None):
+def generate_dataset(src, file_metadata=None, experiment='TSG101_screen', sample=None):
-    from .utils import init_globals, add_images_to_tar
-    db_path = os.path.join(src, 'measurements','measurements.db')
+    from .utils import initiate_counter, add_images_to_tar
+    db_path = os.path.join(src, 'measurements', 'measurements.db')
     dst = os.path.join(src, 'datasets')
-    global total_images
     all_paths = []
     # Connect to the database and retrieve the image paths
     print(f'Reading DataBase: {db_path}')
-    with sqlite3.connect(db_path) as conn:
-        cursor = conn.cursor()
-        if file_type:
-            cursor.execute("SELECT png_path FROM png_list WHERE png_path LIKE ?", (f"%{file_type}%",))
-        else:
-            cursor.execute("SELECT png_path FROM png_list")
-        while True:
-            rows = cursor.fetchmany(1000)
-            if not rows:
-                break
-            all_paths.extend([row[0] for row in rows])
+    try:
+        with sqlite3.connect(db_path) as conn:
+            cursor = conn.cursor()
+            if file_metadata:
+                if isinstance(file_metadata, str):
+                    cursor.execute("SELECT png_path FROM png_list WHERE png_path LIKE ?", (f"%{file_metadata}%",))
+            else:
+                cursor.execute("SELECT png_path FROM png_list")
+            while True:
+                rows = cursor.fetchmany(1000)
+                if not rows:
+                    break
+                all_paths.extend([row[0] for row in rows])
+    except sqlite3.Error as e:
+        print(f"Database error: {e}")
+        return
+    except Exception as e:
+        print(f"Error: {e}")
+        return
     if isinstance(sample, int):
         selected_paths = random.sample(all_paths, sample)
         print(f'Random selection of {len(selected_paths)} paths')
@@ -957,23 +911,18 @@ def generate_dataset(src, file_type=None, experiment='TSG101_screen', sample=Non
         selected_paths = all_paths
         random.shuffle(selected_paths)
         print(f'All paths: {len(selected_paths)} paths')
     total_images = len(selected_paths)
-    print(f'found {total_images} images')
+    print(f'Found {total_images} images')
     # Create a temp folder in dst
     temp_dir = os.path.join(dst, "temp_tars")
     os.makedirs(temp_dir, exist_ok=True)
     # Chunking the data
-    if len(selected_paths) > 10000:
-        num_procs = cpu_count()-2
-        chunk_size = len(selected_paths) // num_procs
-        remainder = len(selected_paths) % num_procs
-    else:
-        num_procs = 2
-        chunk_size = len(selected_paths) // 2
-        remainder = 0
+    num_procs = max(2, cpu_count() - 2)
+    chunk_size = len(selected_paths) // num_procs
+    remainder = len(selected_paths) % num_procs
     paths_chunks = []
     start = 0
@@ -983,45 +932,43 @@ def generate_dataset(src, file_type=None, experiment='TSG101_screen', sample=Non
         start = end
     temp_tar_files = [os.path.join(temp_dir, f'temp_{i}.tar') for i in range(num_procs)]
-    # Initialize the shared objects
-    counter_ = Value('i', 0)
-    lock_ = Lock()
-    ctx = multiprocessing.get_context('spawn')
     print(f'Generating temporary tar files in {dst}')
+    # Initialize shared counter and lock
+    counter = Value('i', 0)
+    lock = Lock()
+    with Pool(processes=num_procs, initializer=initiate_counter, initargs=(counter, lock)) as pool:
+        pool.starmap(add_images_to_tar, [(paths_chunks[i], temp_tar_files[i], total_images) for i in range(num_procs)])
     # Combine the temporary tar files into a final tar
     date_name = datetime.date.today().strftime('%y%m%d')
-    tar_name = f'{date_name}_{experiment}_{file_type}.tar'
+    if not file_metadata is None:
+        tar_name = f'{date_name}_{experiment}_{file_metadata}.tar'
+    else:
+        tar_name = f'{date_name}_{experiment}.tar'
+    tar_name = os.path.join(dst, tar_name)
     if os.path.exists(tar_name):
         number = random.randint(1, 100)
-        tar_name_2 = f'{date_name}_{experiment}_{file_type}_{number}.tar'
-        print(f'Warning: {os.path.basename(tar_name)} exists saving as {os.path.basename(tar_name_2)} ')
-        tar_name = tar_name_2
-    # Add the counter and lock to the arguments for pool.map
+        tar_name_2 = f'{date_name}_{experiment}_{file_metadata}_{number}.tar'
+        print(f'Warning: {os.path.basename(tar_name)} exists, saving as {os.path.basename(tar_name_2)} ')
+        tar_name = os.path.join(dst, tar_name_2)
     print(f'Merging temporary files')
-    #with Pool(processes=num_procs, initializer=init_globals, initargs=(counter_, lock_)) as pool:
-    #    results = pool.map(add_images_to_tar, zip(paths_chunks, temp_tar_files))
-    with ctx.Pool(processes=num_procs, initializer=init_globals, initargs=(counter_, lock_)) as pool:
-        results = pool.map(add_images_to_tar, zip(paths_chunks, temp_tar_files))
-    with tarfile.open(os.path.join(dst, tar_name), 'w') as final_tar:
-        for tar_path in results:
-            with tarfile.open(tar_path, 'r') as t:
-                for member in t.getmembers():
-                    t.extract(member, path=dst)
-                    final_tar.add(os.path.join(dst, member.name), arcname=member.name)
-                    os.remove(os.path.join(dst, member.name))
-            os.remove(tar_path)
+    with tarfile.open(tar_name, 'w') as final_tar:
+        for temp_tar_path in temp_tar_files:
+            with tarfile.open(temp_tar_path, 'r') as temp_tar:
+                for member in temp_tar.getmembers():
+                    file_obj = temp_tar.extractfile(member)
+                    final_tar.addfile(member, file_obj)
+            os.remove(temp_tar_path)
     # Delete the temp folder
     shutil.rmtree(temp_dir)
-    print(f"\nSaved {total_images} images to {os.path.join(dst, tar_name)}")
+    print(f"\nSaved {total_images} images to {tar_name}")
 def apply_model_to_tar(tar_path, model_path, file_type='cell_png', image_size=224, batch_size=64, normalize=True, preload='images', num_workers=10, verbose=False):
     from .io import TarImageDataset, DataLoader
@@ -1128,7 +1075,6 @@ def apply_model(src, model_path, image_size=224, batch_size=64, normalize=True,
     torch.cuda.memory.empty_cache()
     return df
 def generate_training_data_file_list(src,
                         target='protein of interest',
                         cell_dim=4,
@@ -1257,7 +1203,14 @@ def generate_training_dataset(src, mode='annotation', annotation_column='test',
     db_path = os.path.join(src, 'measurements','measurements.db')
     dst = os.path.join(src, 'datasets', 'training')
+    if os.path.exists(dst):
+        for i in range(1, 1000):
+            dst = os.path.join(src, 'datasets', f'training_{i}')
+            if not os.path.exists(dst):
+                print(f'Creating new directory for training: {dst}')
+                break
     if mode == 'annotation':
         class_paths_ls_2 = []
         class_paths_ls = training_dataset_from_annotation(db_path, dst, annotation_column, annotated_classes=annotated_classes)
@@ -1268,6 +1221,7 @@ def generate_training_dataset(src, mode='annotation', annotation_column='test',
     elif mode == 'metadata':
         class_paths_ls = []
+        class_len_ls = []
         [df] = _read_db(db_loc=db_path, tables=['png_list'])
         df['metadata_based_class'] = pd.NA
         for i, class_ in enumerate(classes):
@@ -1275,7 +1229,18 @@ def generate_training_dataset(src, mode='annotation', annotation_column='test',
             df.loc[df[metadata_type_by].isin(ls), 'metadata_based_class'] = class_
         for class_ in classes:
+            if size == None:
+                c_s = []
+                for c in classes:
+                    c_s_t_df = df[df['metadata_based_class'] == c]
+                    c_s.append(len(c_s_t_df))
+                    print(f'Found {len(c_s_t_df)} images for class {c}')
+                size = min(c_s)
+                print(f'Using the smallest class size: {size}')
             class_temp_df = df[df['metadata_based_class'] == class_]
+            class_len_ls.append(len(class_temp_df))
+            print(f'Found {len(class_temp_df)} images for class {class_}')
             class_paths_temp = random.sample(class_temp_df['png_path'].tolist(), size)
             class_paths_ls.append(class_paths_temp)
@@ -1332,7 +1297,8 @@ def generate_training_dataset(src, mode='annotation', annotation_column='test',
     return
-def generate_loaders(src, train_mode='erm', mode='train', image_size=224, batch_size=32, classes=['nc','pc'], num_workers=None, validation_split=0.0, max_show=2, pin_memory=False, normalize=False, verbose=False):
+def generate_loaders(src, train_mode='erm', mode='train', image_size=224, batch_size=32, classes=['nc','pc'], num_workers=None, validation_split=0.0, max_show=2, pin_memory=False, normalize=False, channels=[1, 2, 3], verbose=False):
     """
     Generate data loaders for training and validation/test datasets.
@@ -1349,16 +1315,40 @@ def generate_loaders(src, train_mode='erm', mode='train', image_size=224, batch_
     - pin_memory (bool): Whether to pin memory for faster data transfer.
     - normalize (bool): Whether to normalize the input images.
     - verbose (bool): Whether to print additional information and show images.
+    - channels (list): The list of channels to retain. Options are [1, 2, 3] for all channels, [1, 2] for blue and green, etc.
     Returns:
     - train_loaders (list): List of data loaders for training datasets.
     - val_loaders (list): List of data loaders for validation datasets.
     - plate_names (list): List of plate names (only applicable when train_mode is 'irm').
     """
     from .io import MyDataset
     from .plot import _imshow
+    from torchvision import transforms
+    from torch.utils.data import DataLoader, random_split
+    from collections import defaultdict
+    import os
+    import random
+    from PIL import Image
+    from torchvision.transforms import ToTensor
+    from .utils import SelectChannels
+    chans = []
+    if 'r' in channels:
+        chans.append(1)
+    if 'g' in channels:
+        chans.append(2)
+    if 'b' in channels:
+        chans.append(3)
+    channels = chans
+    if verbose:
+        print(f'Training a network on channels: {channels}')
+        print(f'Channel 1: Red, Channel 2: Green, Channel 3: Blue')
     plate_to_filenames = defaultdict(list)
     plate_to_labels = defaultdict(list)
     train_loaders = []
@@ -1369,31 +1359,30 @@ def generate_loaders(src, train_mode='erm', mode='train', image_size=224, batch_
         transform = transforms.Compose([
             transforms.ToTensor(),
             transforms.CenterCrop(size=(image_size, image_size)),
+            SelectChannels(channels),
             transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))])
     else:
         transform = transforms.Compose([
             transforms.ToTensor(),
-            transforms.CenterCrop(size=(image_size, image_size))])
+            transforms.CenterCrop(size=(image_size, image_size)),
+            SelectChannels(channels)])
     if mode == 'train':
         data_dir = os.path.join(src, 'train')
         shuffle = True
-        print(f'Generating Train and validation datasets')
+        print('Generating Train and validation datasets')
     elif mode == 'test':
         data_dir = os.path.join(src, 'test')
         val_loaders = []
-        validation_split=0.0
+        validation_split = 0.0
         shuffle = True
-        print(f'Generating test dataset')
+        print('Generating test dataset')
     else:
         print(f'mode:{mode} is not valid, use mode = train or test')
         return
     if train_mode == 'erm':
         data = MyDataset(data_dir, classes, transform=transform, shuffle=shuffle, pin_memory=pin_memory)
-        #train_loaders = DataLoader(data, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers if num_workers is not None else 0, pin_memory=pin_memory)
         if validation_split > 0:
             train_size = int((1 - validation_split) * len(data))
             val_size = len(data) - train_size
@@ -1450,7 +1439,6 @@ def generate_loaders(src, train_mode='erm', mode='train', image_size=224, batch_
                 images = images.cpu()
                 label_strings = [str(label.item()) for label in labels]
                 _imshow(images, label_strings, nrow=20, fontsize=12)
         elif train_mode == 'irm':
             for plate_name, train_loader in zip(plate_names, train_loaders):
                 print(f'Plate: {plate_name} with {len(train_loader.dataset)} images')
@@ -1569,15 +1557,30 @@ def analyze_recruitment(src, metadata_settings, advanced_settings):
     df = df.dropna(subset=['condition'])
     print(f'After dropping non-annotated wells: {len(df)} rows')
     files = df['file_name'].tolist()
+    print(f'found: {len(files)} files')
     files = [item + '.npy' for item in files]
     random.shuffle(files)
+    _max = 10**100
+    if cell_size_range is None and nucleus_size_range is None and pathogen_size_range is None:
+        filter_min_max = None
+    else:
+        if cell_size_range is None:
+            cell_size_range = [0,_max]
+        if nucleus_size_range is None:
+            nucleus_size_range = [0,_max]
+        if pathogen_size_range is None:
+            pathogen_size_range = [0,_max]
+        filter_min_max = [[cell_size_range[0],cell_size_range[1]],[nucleus_size_range[0],nucleus_size_range[1]],[pathogen_size_range[0],pathogen_size_range[1]]]
     if plot:
         plot_settings = {'include_noninfected':include_noninfected,
                          'include_multiinfected':include_multiinfected,
                          'include_multinucleated':include_multinucleated,
                          'remove_background':remove_background,
-                         'filter_min_max':[[cell_size_range[0],cell_size_range[1]],[nucleus_size_range[0],nucleus_size_range[1]],[pathogen_size_range[0],pathogen_size_range[1]]],
+                         'filter_min_max':filter_min_max,
                          'channel_dims':channel_dims,
                          'backgrounds':backgrounds,
                          'cell_mask_dim':mask_dims[0],
@@ -1634,31 +1637,225 @@ def analyze_recruitment(src, metadata_settings, advanced_settings):
     cells,wells = _results_to_csv(src, df, df_well)
     return [cells,wells]
+def _overlap_percentages(label_image, object_mask):
+    """
+    Return the non-zero labels found under object_mask and the percentage of object_mask each covers.
+    Args:
+        label_image (ndarray): Integer label image.
+        object_mask (ndarray): Boolean mask of a single object, same shape as label_image.
+    Returns:
+        tuple: (labels, percentages) as two aligned arrays, labels in ascending order.
+    """
+    ids, counts = np.unique(label_image[object_mask], return_counts=True)
+    keep = ids != 0
+    return ids[keep], counts[keep] / np.count_nonzero(object_mask) * 100
+def _merge_cells_based_on_parasite_overlap(parasite_mask, cell_mask, nuclei_mask, overlap_threshold=5, perimeter_threshold=30):
+    """
+    Merge cells that share a parasite or a nucleus, then absorb nucleus-free cells into a neighbour.
+    Three passes are applied in order:
+      1. Cells overlapped by the same parasite are merged when at least one of them holds more
+         than overlap_threshold percent of that parasite.
+      2. Cells overlapped by the same nucleus are merged when every one of them holds more than
+         overlap_threshold percent of that nucleus.
+      3. A cell containing no nucleus pixels is merged into the neighbour it shares the most border
+         with, provided that border exceeds perimeter_threshold percent of the cell's perimeter.
+         The shared border is counted as the neighbour's pixels inside the 3x3 dilation of the cell.
+    Every pass reads label ids from, and writes them back to, the same re-labelled copy of
+    cell_mask, so ids never cross label spaces and the input arrays are left untouched.
+    Args:
+        parasite_mask (ndarray): Mask of parasites.
+        cell_mask (ndarray): Mask of cells.
+        nuclei_mask (ndarray): Mask of nuclei.
+        overlap_threshold (float): The percentage threshold for merging cells based on parasite or nucleus overlap.
+        perimeter_threshold (float): The percentage threshold for merging cells based on shared perimeter.
+    Returns:
+        ndarray: The re-labelled cell mask with the merges applied.
+    """
+    merged = label(cell_mask)
+    labeled_parasites = label(parasite_mask)
+    labeled_nuclei = label(nuclei_mask)
+    # Pass 1: merge cells based on parasite overlap
+    for parasite_id in range(1, np.max(labeled_parasites) + 1):
+        cell_ids, percentages = _overlap_percentages(merged, labeled_parasites == parasite_id)
+        if len(cell_ids) > 1 and np.any(percentages > overlap_threshold):
+            merged[np.isin(merged, cell_ids[1:])] = cell_ids[0]
+    # Pass 2: merge cells based on nucleus overlap
+    for nucleus_id in range(1, np.max(labeled_nuclei) + 1):
+        cell_ids, percentages = _overlap_percentages(merged, labeled_nuclei == nucleus_id)
+        if len(cell_ids) > 1 and np.all(percentages > overlap_threshold):
+            merged[np.isin(merged, cell_ids[1:])] = cell_ids[0]
+    # Pass 3: cells without nuclei are merged based on shared perimeter
+    snapshot = label(merged)  # Re-label after merging based on overlap
+    result = snapshot.copy()  # Live image: merges made here are visible to later cells
+    for region in regionprops(snapshot):
+        cell_label = region.label
+        cell_pixels = snapshot == cell_label
+        if np.any(nuclei_mask[cell_pixels] != 0):
+            continue
+        # Dilate the cell by one pixel and look at what the added ring touches
+        dilated_cell = binary_dilation(cell_pixels, structure=square(3))
+        neighbor_ids, shared_borders = np.unique(result[dilated_cell & ~cell_pixels], return_counts=True)
+        # Drop background and pixels of cells that were already merged into this one
+        keep = (neighbor_ids != 0) & (neighbor_ids != cell_label)
+        neighbor_ids, shared_borders = neighbor_ids[keep], shared_borders[keep]
+        if neighbor_ids.size == 0:
+            continue
+        perimeter = region.perimeter
+        if perimeter > 0:
+            shared_border_percentages = shared_borders / perimeter * 100
+        else:
+            # A zero perimeter (e.g. a one-pixel cell) makes any contact count as above threshold
+            shared_border_percentages = np.full(shared_borders.shape, np.inf)
+        best = np.argmax(shared_border_percentages)
+        if shared_border_percentages[best] > perimeter_threshold:
+            # Move everything currently carrying this id, including cells absorbed earlier
+            result[result == cell_label] = neighbor_ids[best]
+    # Relabel the merged cell mask
+    relabeled_cell_mask, _ = label(result, return_num=True)
+    return relabeled_cell_mask
+def adjust_cell_masks(parasite_folder, cell_folder, nuclei_folder, overlap_threshold=5, perimeter_threshold=30):
+    """
+    Merge and relabel the cell masks of every .npy file shared by the three folders.
+    For each parasite mask, the cell and nuclei masks with the same file name are loaded,
+    the cells are merged by _merge_cells_based_on_parasite_overlap, and the result is
+    written back over the cell mask file.
+    Args:
+        parasite_folder (str): Path to the folder containing parasite masks.
+        cell_folder (str): Path to the folder containing cell masks.
+        nuclei_folder (str): Path to the folder containing nuclei masks.
+        overlap_threshold (float): The percentage threshold for merging cells based on parasite overlap.
+        perimeter_threshold (float): The percentage threshold for merging cells based on shared perimeter.
+    Raises:
+        ValueError: If the folders hold different numbers of .npy files, or a parasite mask
+            has no identically named cell or nuclei mask.
+    """
+    def _npy_names(folder):
+        return sorted(name for name in os.listdir(folder) if name.endswith('.npy'))
+    parasite_files = _npy_names(parasite_folder)
+    cell_files = _npy_names(cell_folder)
+    nuclei_files = _npy_names(nuclei_folder)
+    # The three folders must hold the same number of masks
+    if len(parasite_files) != len(cell_files) or len(cell_files) != len(nuclei_files):
+        raise ValueError("The number of files in the folders do not match.")
+    # Files are paired across folders by name
+    for file_name in parasite_files:
+        cell_path = os.path.join(cell_folder, file_name)
+        nuclei_path = os.path.join(nuclei_folder, file_name)
+        if not os.path.exists(cell_path) or not os.path.exists(nuclei_path):
+            raise ValueError(f"Corresponding cell or nuclei mask file for {file_name} not found.")
+        merged_cell_mask = _merge_cells_based_on_parasite_overlap(
+            np.load(os.path.join(parasite_folder, file_name)),
+            np.load(cell_path),
+            np.load(nuclei_path),
+            overlap_threshold,
+            perimeter_threshold)
+        # The merged mask replaces the original cell mask on disk
+        np.save(cell_path, merged_cell_mask)
+def process_masks(mask_folder, image_folder, channel, batch_size=50, n_clusters=2, plot=False):
+    """
+    Cluster every object in a folder of masks and keep only objects of the most populated cluster.
+    Area, mean intensity, perimeter and eccentricity are measured for each labelled object of
+    each mask, all objects are clustered together with KMeans, and objects that fall outside the
+    cluster with the most members overall are zeroed out. The mask files are overwritten in place.
+    Args:
+        mask_folder (str): Folder with the .npy label masks to clean.
+        image_folder (str): Folder with .npy images named like the masks; read as image[:, :, channel].
+        channel (int): Image channel used for the intensity measurement.
+        batch_size (int): Number of files handled per batch.
+        n_clusters (int): Number of KMeans clusters.
+        plot (bool): If True, show a 2-component PCA scatter of the clustered objects.
+    """
+    feature_names = ('area', 'mean_intensity', 'perimeter', 'eccentricity')
+    def read_files_in_batches(folder, batch_size=50):
+        # Sorted so that both passes below visit the files in the same order
+        files = sorted(f for f in os.listdir(folder) if f.endswith('.npy'))
+        for i in range(0, len(files), batch_size):
+            yield files[i:i + batch_size]
+    def measure_morphology_and_intensity(mask, image):
+        return [{name: getattr(p, name) for name in feature_names} for p in measure.regionprops(mask, intensity_image=image)]
+    def to_feature_matrix(properties):
+        return np.array([[p[name] for name in feature_names] for p in properties])
+    def cluster_objects(properties, n_clusters=2):
+        return KMeans(n_clusters=n_clusters, random_state=0).fit(to_feature_matrix(properties))
+    def remove_objects_not_in_largest_cluster(mask, labels, largest_cluster_label):
+        cleaned_mask = np.zeros_like(mask)
+        # labels[k] belongs to the k-th region returned for this mask, so index by position;
+        # indexing by (region.label - 1) breaks as soon as the label ids are not 1..N
+        for position, region in enumerate(measure.regionprops(mask)):
+            if labels[position] == largest_cluster_label:
+                cleaned_mask[mask == region.label] = region.label
+        return cleaned_mask
+    def plot_clusters(properties, labels):
+        data_2d = PCA(n_components=2).fit_transform(to_feature_matrix(properties))
+        plt.scatter(data_2d[:, 0], data_2d[:, 1], c=labels, cmap='viridis')
+        plt.xlabel('PCA Component 1')
+        plt.ylabel('PCA Component 2')
+        plt.title('Object Clustering')
+        plt.show()
+    # Step 1: Accumulate properties over all files
+    all_properties = []
+    for batch in read_files_in_batches(mask_folder, batch_size):
+        for file in batch:
+            mask = np.load(os.path.join(mask_folder, file))
+            image = np.load(os.path.join(image_folder, file))[:, :, channel]
+            all_properties.extend(measure_morphology_and_intensity(mask, image))
+    if not all_properties:
+        # Nothing to cluster; leave the mask files as they are
+        print('No objects found in the masks; nothing to cluster.')
+        return
+    # Step 2: Perform clustering on accumulated properties
+    labels = cluster_objects(all_properties, n_clusters).labels_
+    # The cluster to keep is chosen once, from all objects of all files
+    largest_cluster_label = np.bincount(labels).argmax()
+    if plot:
+        # Step 3: Plot clusters using PCA
+        plot_clusters(all_properties, labels)
+    # Step 4: Remove objects not in the largest cluster and overwrite files in batches
+    label_index = 0
+    for batch in read_files_in_batches(mask_folder, batch_size):
+        for file in batch:
+            mask_path = os.path.join(mask_folder, file)
+            mask = np.load(mask_path)
+            # Same region count as in step 1, so the slices line up with the cluster labels
+            n_objects = len(measure.regionprops(mask))
+            mask_labels = labels[label_index:label_index + n_objects]
+            np.save(mask_path, remove_objects_not_in_largest_cluster(mask, mask_labels, largest_cluster_label))
+            label_index += n_objects
 def preprocess_generate_masks(src, settings={}):
     from .io import preprocess_img_data, _load_and_concatenate_arrays
     from .plot import plot_merged, plot_arrays
-    from .utils import _pivot_counts_table
-    settings['fps'] = 2
-    settings['remove_background'] = True
-    settings['lower_quantile'] = 0.02
-    settings['merge'] = False
-    settings['normalize_plots'] = True
-    settings['all_to_mip'] = False
-    settings['pick_slice'] = False
-    settings['skip_mode'] = src
-    settings['workers'] = os.cpu_count()-4
-    settings['verbose'] = True
-    settings['examples_to_plot'] = 1
-    settings['src'] = src
-    settings['upscale'] = False
-    settings['upscale_factor'] = 2.0
+    from .utils import _pivot_counts_table, set_default_settings_preprocess_generate_masks, set_default_plot_merge_settings, check_mask_folder
+    settings = set_default_settings_preprocess_generate_masks(src, settings)
     settings_df = pd.DataFrame(list(settings.items()), columns=['Key', 'Value'])
     settings_csv = os.path.join(src,'settings','preprocess_generate_masks_settings.csv')
     os.makedirs(os.path.join(src,'settings'), exist_ok=True)
     settings_df.to_csv(settings_csv, index=False)
+    if not settings['pathogen_channel'] is None:
+        custom_model_ls = ['toxo_pv_lumen','toxo_cyto']
+        if settings['pathogen_model'] not in custom_model_ls:
+            ValueError(f'Pathogen model must be {custom_model_ls} or None')
     if settings['timelapse']:
         settings['randomize'] = False
@@ -1667,24 +1864,50 @@ def preprocess_generate_masks(src, settings={}):
         if not settings['masks']:
             print(f'WARNING: channels for mask generation are defined when preprocess = True')
-    if isinstance(settings['merge'], bool):
-        settings['merge'] = [settings['merge']]*3
     if isinstance(settings['save'], bool):
         settings['save'] = [settings['save']]*3
+    if settings['verbose']:
+        settings_df = pd.DataFrame(list(settings.items()), columns=['setting_key', 'setting_value'])
+        settings_df['setting_value'] = settings_df['setting_value'].apply(str)
+        display(settings_df)
+    if settings['test_mode']:
+        print(f'Starting Test mode ...')
     if settings['preprocess']:
         settings, src = preprocess_img_data(settings)
     if settings['masks']:
         mask_src = os.path.join(src, 'norm_channel_stack')
         if settings['cell_channel'] != None:
-            generate_cellpose_masks(src=mask_src, settings=settings, object_type='cell')
+            if check_mask_folder(src, 'cell_mask_stack'):
+                generate_cellpose_masks(mask_src, settings, 'cell')
         if settings['nucleus_channel'] != None:
-            generate_cellpose_masks(src=mask_src, settings=settings, object_type='nucleus')
+            if check_mask_folder(src, 'nucleus_mask_stack'):
+                generate_cellpose_masks(mask_src, settings, 'nucleus')
         if settings['pathogen_channel'] != None:
-            generate_cellpose_masks(src=mask_src, settings=settings, object_type='pathogen')
+            if check_mask_folder(src, 'pathogen_mask_stack'):
+                generate_cellpose_masks(mask_src, settings, 'pathogen')
+        if settings['adjust_cells']:
+            if settings['pathogen_channel'] != None and settings['cell_channel'] != None and settings['nucleus_channel'] != None:
+                start = time.time()
+                cell_folder = os.path.join(mask_src, 'cell_mask_stack')
+                nuclei_folder = os.path.join(mask_src, 'nucleus_mask_stack')
+                parasite_folder = os.path.join(mask_src, 'pathogen_mask_stack')
+                #image_folder = os.path.join(src, 'stack')
+                #process_masks(cell_folder, image_folder, settings['cell_channel'], settings['batch_size'], n_clusters=2, plot=settings['plot'])
+                #process_masks(nuclei_folder, image_folder, settings['nucleus_channel'], settings['batch_size'], n_clusters=2, plot=settings['plot'])
+                #process_masks(parasite_folder, image_folder, settings['pathogen_channel'], settings['batch_size'], n_clusters=2, plot=settings['plot'])
+                adjust_cell_masks(parasite_folder, cell_folder, nuclei_folder, overlap_threshold=5, perimeter_threshold=30)
+                stop = time.time()
+                print(f'Cell mask adjustment: {stop-start} seconds')
         if os.path.exists(os.path.join(src,'measurements')):
             _pivot_counts_table(db_path=os.path.join(src,'measurements', 'measurements.db'))
@@ -1713,60 +1936,110 @@ def preprocess_generate_masks(src, settings={}):
                 overlay_channels = [settings['nucleus_channel'], settings['pathogen_channel'], settings['cell_channel']]
                 overlay_channels = [element for element in overlay_channels if element is not None]
-                plot_settings = {'include_noninfected':True,
-                                 'include_multiinfected':True,
-                                 'include_multinucleated':True,
-                                 'remove_background':False,
-                                 'filter_min_max':None,
-                                 'channel_dims':settings['channels'],
-                                 'backgrounds':[100,100,100,100],
-                                 'cell_mask_dim':cell_mask_dim,
-                                 'nucleus_mask_dim':nucleus_mask_dim,
-                                 'pathogen_mask_dim':pathogen_mask_dim,
-                                 'overlay_chans':[0,2,3],
-                                 'outline_thickness':3,
-                                 'outline_color':'gbr',
-                                 'overlay_chans':overlay_channels,
-                                 'overlay':True,
-                                 'normalization_percentiles':[1,99],
-                                 'normalize':True,
-                                 'print_object_number':True,
-                                 'nr':settings['examples_to_plot'],
-                                 'figuresize':20,
-                                 'cmap':'inferno',
-                                 'verbose':False}
+                plot_settings = set_default_plot_merge_settings()
+                plot_settings['channel_dims'] = settings['channels']
+                plot_settings['cell_mask_dim'] = cell_mask_dim
+                plot_settings['nucleus_mask_dim'] = nucleus_mask_dim
+                plot_settings['pathogen_mask_dim'] = pathogen_mask_dim
+                plot_settings['overlay_chans'] = overlay_channels
+                plot_settings['nr'] = settings['examples_to_plot']
+                if settings['test_mode'] == True:
+                    plot_settings['nr'] = len(os.path.join(src,'merged'))
                 try:
                     fig = plot_merged(src=os.path.join(src,'merged'), settings=plot_settings)
                 except Exception as e:
                     print(f'Failed to plot image mask overly. Error: {e}')
             else:
-                plot_arrays(src=os.path.join(src,'merged'), figuresize=50, cmap='inferno', nr=1, normalize=True, q1=1, q2=99)
+                plot_arrays(src=os.path.join(src,'merged'), figuresize=settings['figuresize'], cmap=settings['cmap'], nr=settings['examples_to_plot'], normalize=settings['normalize'], q1=1, q2=99)
     torch.cuda.empty_cache()
     gc.collect()
     print("Successfully completed run")
     return
-def identify_masks_finetune(src, dst, model_name, channels, diameter, batch_size, flow_threshold=30, cellprob_threshold=1, figuresize=25, cmap='inferno', verbose=False, plot=False, save=False, custom_model=None, signal_thresholds=1000, normalize=True, resize=False, target_height=None, target_width=None, rescale=True, resample=True, net_avg=False, invert=False, circular=False, percentiles=None, overlay=True, grayscale=False):
+def identify_masks_finetune(settings):
     from .plot import print_mask_and_flows
     from .utils import get_files_from_dir, resize_images_and_labels
     from .io import _load_normalized_images_and_labels, _load_images_and_labels
+    #User defined settings
+    src=settings['src']
+    dst=settings['dst']
+    settings.setdefault('model_name', 'cyto')
+    settings.setdefault('custom_model', None)
+    settings.setdefault('channels', [0,0])
+    settings.setdefault('background', 100)
+    settings.setdefault('remove_background', False)
+    settings.setdefault('Signal_to_noise', 10)
+    settings.setdefault('CP_prob', 0)
+    settings.setdefault('diameter', 30)
+    settings.setdefault('batch_size', 50)
+    settings.setdefault('flow_threshold', 0.4)
+    settings.setdefault('save', False)
+    settings.setdefault('verbose', False)
+    settings.setdefault('normalize', True)
+    settings.setdefault('percentiles', None)
+    settings.setdefault('circular', False)
+    settings.setdefault('invert', False)
+    settings.setdefault('resize', False)
+    settings.setdefault('target_height', None)
+    settings.setdefault('target_width', None)
+    settings.setdefault('rescale', False)
+    settings.setdefault('resample', False)
+    settings.setdefault('grayscale', True)
+    model_name=settings['model_name']
+    custom_model=settings['custom_model']
+    channels = settings['channels']
+    background = settings['background']
+    remove_background=settings['remove_background']
+    Signal_to_noise = settings['Signal_to_noise']
+    CP_prob = settings['CP_prob']
+    diameter=settings['diameter']
+    batch_size=settings['batch_size']
+    flow_threshold=settings['flow_threshold']
+    save=settings['save']
+    verbose=settings['verbose']
+    # static settings
+    normalize = settings['normalize']
+    percentiles = settings['percentiles']
+    circular = settings['circular']
+    invert = settings['invert']
+    resize = settings['resize']
+    if resize:
+        target_height = settings['target_height']
+        target_width = settings['target_width']
+    rescale = settings['rescale']
+    resample = settings['resample']
+    grayscale = settings['grayscale']
+    os.makedirs(dst, exist_ok=True)
+    if not custom_model is None:
+        if not os.path.exists(custom_model):
+            print(f'Custom model not found: {custom_model}')
+            return
     if not torch.cuda.is_available():
         print(f'Torch CUDA is not available, using CPU')
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     if custom_model == None:
-        if model_name =='cyto':
-            model = cp_models.CellposeModel(gpu=True, model_type=model_name, net_avg=False, diam_mean=diameter, pretrained_model=None)
-        else:
-            model = cp_models.CellposeModel(gpu=True, model_type=model_name)
-    if custom_model != None:
-        model = cp_models.CellposeModel(gpu=torch.cuda.is_available(), model_type=None, pretrained_model=custom_model, diam_mean=diameter, device=device, net_avg=False)  #Assuming diameter is defined elsewhere
-        print(f'loaded custom model:{custom_model}')
+        model = cp_models.CellposeModel(gpu=True, model_type=model_name, device=device)
+        print(f'Loaded model: {model_name}')
+    else:
+        model = cp_models.CellposeModel(gpu=torch.cuda.is_available(), model_type=None, pretrained_model=custom_model, diam_mean=diameter, device=device)
+        print("Pretrained Model Loaded:", model.pretrained_model)
     chans = [2, 1] if model_name == 'cyto2' else [0,0] if model_name == 'nucleus' else [1,0] if model_name == 'cyto' else [2, 0]
@@ -1776,16 +2049,18 @@ def identify_masks_finetune(src, dst, model_name, channels, diameter, batch_size
     print(f'Using channels: {chans} for model of type {model_name}')
     if verbose == True:
-        print(f'Cellpose settings: Model: {model_name}, channels: {channels}, cellpose_chans: {chans}, diameter:{diameter}, flow_threshold:{flow_threshold}, cellprob_threshold:{cellprob_threshold}')
+        print(f'Cellpose settings: Model: {model_name}, channels: {channels}, cellpose_chans: {chans}, diameter:{diameter}, flow_threshold:{flow_threshold}, cellprob_threshold:{CP_prob}')
-    all_image_files = get_files_from_dir(src, file_extension="*.tif")
+    all_image_files = [os.path.join(src, f) for f in os.listdir(src) if f.endswith('.tif')]
     random.shuffle(all_image_files)
     time_ls = []
     for i in range(0, len(all_image_files), batch_size):
         image_files = all_image_files[i:i+batch_size]
         if normalize:
-            images, _, image_names, _ = _load_normalized_images_and_labels(image_files=image_files, label_files=None, signal_thresholds=signal_thresholds, channels=channels, percentiles=percentiles,  circular=circular, invert=invert, visualize=verbose)
+            images, _, image_names, _ = _load_normalized_images_and_labels(image_files=image_files, label_files=None, channels=channels, percentiles=percentiles,  circular=circular, invert=invert, visualize=verbose, remove_background=remove_background, background=background, Signal_to_noise=Signal_to_noise)
             images = [np.squeeze(img) if img.shape[-1] == 1 else img for img in images]
             orig_dims = [(image.shape[0], image.shape[1]) for image in images]
         else:
@@ -1803,11 +2078,10 @@ def identify_masks_finetune(src, dst, model_name, channels, diameter, batch_size
                          channel_axis=3,
                          diameter=diameter,
                          flow_threshold=flow_threshold,
-                         cellprob_threshold=cellprob_threshold,
+                         cellprob_threshold=CP_prob,
                          rescale=rescale,
                          resample=resample,
-                         net_avg=net_avg,
-                         progress=False)
+                         progress=True)
             if len(output) == 4:
                 mask, flows, _, _ = output
@@ -1825,11 +2099,12 @@ def identify_masks_finetune(src, dst, model_name, channels, diameter, batch_size
             time_ls.append(duration)
             average_time = np.mean(time_ls) if len(time_ls) > 0 else 0
             print(f'Processing {file_index+1}/{len(images)} images : Time/image {average_time:.3f} sec', end='\r', flush=True)
-            if plot:
+            if verbose:
                 if resize:
                     stack = resizescikit(stack, dims, preserve_range=True, anti_aliasing=False).astype(stack.dtype)
-                print_mask_and_flows(stack, mask, flows, overlay=overlay)
+                print_mask_and_flows(stack, mask, flows, overlay=True)
             if save:
+                os.makedirs(dst, exist_ok=True)
                 output_filename = os.path.join(dst, image_names[file_index])
                 cv2.imwrite(output_filename, mask)
     return
@@ -1882,7 +2157,6 @@ def identify_masks(src, object_type, model_name, batch_size, channels, diameter,
     #Note add logic that handles batches of size 1 as these will break the code batches must all be > 2 images
     gc.collect()
-    #print('========== generating masks ==========')
     if not torch.cuda.is_available():
         print(f'Torch CUDA is not available, using CPU')
@@ -1972,8 +2246,6 @@ def identify_masks(src, object_type, model_name, batch_size, channels, diameter,
                     stitch_threshold=0.0
                 cellpose_batch_size = _get_cellpose_batch_size()
-                #model = cellpose.denoise.DenoiseModel(model_type=f"denoise_{model_name}", gpu=True)
                 masks, flows, _, _ = model.eval(x=batch,
                                                 batch_size=cellpose_batch_size,
@@ -2047,9 +2319,21 @@ def all_elements_match(list1, list2):
     # Check if all elements in list1 are in list2
     return all(element in list2 for element in list1)
-def generate_cellpose_masks_v1(src, settings, object_type):
+def prepare_batch_for_cellpose(batch):
+    """
+    Return the batch as float32 with every image whose maximum exceeds 1 divided by that maximum.
+    A batch that is already float32 is rescaled in place and returned as the same array;
+    any other dtype is converted to a new float32 array first.
+    Args:
+        batch (ndarray): Stack of images, indexed by image along the first axis.
+    Returns:
+        ndarray: The float32 batch.
+    """
+    batch = batch if batch.dtype == np.float32 else batch.astype(np.float32)
+    # Each image is scaled by its own peak, not by the peak of the whole batch
+    for index, frame in enumerate(batch):
+        peak = frame.max()
+        if peak > 1:
+            batch[index] = frame / peak
+    return batch
+def generate_cellpose_masks(src, settings, object_type):
-    from .utils import _masks_to_masks_stack, _filter_cp_masks, _get_cellpose_batch_size, _get_object_settings, _get_cellpose_channels, mask_object_count
+    from .utils import _masks_to_masks_stack, _filter_cp_masks, _get_cellpose_batch_size, _get_object_settings, _get_cellpose_channels, _choose_model, mask_object_count, set_default_settings_preprocess_generate_masks
     from .io import _create_database, _save_object_counts_to_database, _check_masks, _get_avg_object_size
     from .timelapse import _npz_to_movie, _btrack_track_cells, _trackpy_track_cells
     from .plot import plot_masks
@@ -2057,6 +2341,13 @@ def generate_cellpose_masks_v1(src, settings, object_type):
     gc.collect()
     if not torch.cuda.is_available():
         print(f'Torch CUDA is not available, using CPU')
+    settings = set_default_settings_preprocess_generate_masks(src, settings)
+    if settings['verbose']:
+        settings_df = pd.DataFrame(list(settings.items()), columns=['setting_key', 'setting_value'])
+        settings_df['setting_value'] = settings_df['setting_value'].apply(str)
+        display(settings_df)
     figuresize=25
     timelapse = settings['timelapse']
@@ -2071,23 +2362,26 @@ def generate_cellpose_masks_v1(src, settings, object_type):
     batch_size = settings['batch_size']
     cellprob_threshold = settings[f'{object_type}_CP_prob']
-    flow_threshold = 30
+    flow_threshold = settings[f'{object_type}_FT']
     object_settings = _get_object_settings(object_type, settings)
     model_name = object_settings['model_name']
     cellpose_channels = _get_cellpose_channels(src, settings['nucleus_channel'], settings['pathogen_channel'], settings['cell_channel'])
     if settings['verbose']:
         print(cellpose_channels)
     channels = cellpose_channels[object_type]
     cellpose_batch_size = _get_cellpose_batch_size()
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-    model = cp_models.Cellpose(gpu=True, model_type=model_name, device=device) #net_avg=net_avg
-    #dn = denoise.CellposeDenoiseModel(model_type=f"denoise_{model_name}", gpu=True, device=device)
+    if object_type == 'pathogen' and not settings['pathogen_model'] is None:
+        model_name = settings['pathogen_model']
+    model = _choose_model(model_name, device, object_type=object_type, restore_type=None, object_settings=object_settings)
     chans = [2, 1] if model_name == 'cyto2' else [0,0] if model_name == 'nucleus' else [2,0] if model_name == 'cyto' else [2, 0] if model_name == 'cyto3' else [2, 0]
     paths = [os.path.join(src, file) for file in os.listdir(src) if file.endswith('.npz')]
     count_loc = os.path.dirname(src)+'/measurements/measurements.db'
@@ -2096,7 +2390,6 @@ def generate_cellpose_masks_v1(src, settings, object_type):
     average_sizes = []
     time_ls = []
     for file_index, path in enumerate(paths):
         name = os.path.basename(path)
         name, ext = os.path.splitext(name)
@@ -2106,6 +2399,14 @@ def generate_cellpose_masks_v1(src, settings, object_type):
         with np.load(path) as data:
             stack = data['data']
             filenames = data['filenames']
+            for i, filename in enumerate(filenames):
+                output_path = os.path.join(output_folder, filename)
+                if os.path.exists(output_path):
+                    print(f"File {filename} already exists in the output folder. Skipping...")
+                    continue
         if settings['timelapse']:
             trackable_objects = ['cell','nucleus','pathogen']
@@ -2140,31 +2441,43 @@ def generate_cellpose_masks_v1(src, settings, object_type):
             if batch.size == 0:
                 print(f'Processing {file_index}/{len(paths)}: Images/npz {batch.shape[0]}')
                 continue
-            if batch.max() > 1:
-                batch = batch / batch.max()
+            batch = prepare_batch_for_cellpose(batch)
             if timelapse:
-                stitch_threshold=100.0
                 movie_path = os.path.join(os.path.dirname(src), 'movies')
                 os.makedirs(movie_path, exist_ok=True)
                 save_path = os.path.join(movie_path, f'timelapse_{object_type}_{name}.mp4')
                 _npz_to_movie(batch, batch_filenames, save_path, fps=2)
-            else:
-                stitch_threshold=0.0
-            print('batch.shape',batch.shape)
-            masks, flows, _, _ = model.eval(x=batch,
-                                            batch_size=cellpose_batch_size,
-                                            normalize=False,
-                                            channels=chans,
-                                            channel_axis=3,
-                                            diameter=object_settings['diameter'],
-                                            flow_threshold=flow_threshold,
-                                            cellprob_threshold=cellprob_threshold,
-                                            rescale=None,
-                                            resample=object_settings['resample'],
-                                            stitch_threshold=stitch_threshold)
+            if settings['verbose']:
+                print(f'Processing {file_index}/{len(paths)}: Images/npz {batch.shape[0]}')
+            #cellpose_normalize_dict = {'lowhigh':[0.0,1.0], #pass in normalization values for 0.0 and 1.0 as list [low, high] if None all other keys ignored
+            #                           'sharpen':object_settings['diameter']/4, #recommended to be 1/4-1/8 diameter of cells in pixels
+            #                           'normalize':True, #(if False, all following parameters ignored)
+            #                           'percentile':[2,98], #[perc_low, perc_high]
+            #                           'tile_norm':224, #normalize by tile set to e.g. 100 for normailize window to be 100 px
+            #                           'norm3D':True} #compute normalization across entire z-stack rather than plane-by-plane in stitching mode.
+            output = model.eval(x=batch,
+                                batch_size=cellpose_batch_size,
+                                normalize=False,
+                                channels=chans,
+                                channel_axis=3,
+                                diameter=object_settings['diameter'],
+                                flow_threshold=flow_threshold,
+                                cellprob_threshold=cellprob_threshold,
+                                rescale=None,
+                                resample=object_settings['resample'])
+            if len(output) == 4:
+                masks, flows, _, _ = output
+            elif len(output) == 3:
+                masks, flows, _ = output
+            else:
+                raise ValueError(f"Unexpected number of return values from model.eval(). Expected 3 or 4, got {len(output)}")
             if timelapse:
                 if settings['plot']:
                     for idx, (mask, flow, image) in enumerate(zip(masks, flows[0], batch)):
@@ -2210,23 +2523,45 @@ def generate_cellpose_masks_v1(src, settings, object_type):
                                                           mode=timelapse_mode)
                 else:
                     mask_stack = _masks_to_masks_stack(masks)
             else:
                 _save_object_counts_to_database(masks, object_type, batch_filenames, count_loc, added_string='_before_filtration')
-                mask_stack = _filter_cp_masks(masks=masks,
-                                              flows=flows,
-                                              filter_size=object_settings['filter_size'],
-                                              filter_intensity=object_settings['filter_intensity'],
-                                              minimum_size=object_settings['minimum_size'],
-                                              maximum_size=object_settings['maximum_size'],
-                                              remove_border_objects=object_settings['remove_border_objects'],
-                                              merge=False,
-                                              batch=batch,
-                                              plot=settings['plot'],
-                                              figuresize=figuresize)
-                _save_object_counts_to_database(mask_stack, object_type, batch_filenames, count_loc, added_string='_after_filtration')
+                if object_settings['merge'] and not settings['filter']:
+                    mask_stack = _filter_cp_masks(masks=masks,
+                                                flows=flows,
+                                                filter_size=False,
+                                                filter_intensity=False,
+                                                minimum_size=object_settings['minimum_size'],
+                                                maximum_size=object_settings['maximum_size'],
+                                                remove_border_objects=False,
+                                                merge=object_settings['merge'],
+                                                batch=batch,
+                                                plot=settings['plot'],
+                                                figuresize=figuresize)
+                if settings['filter']:
+                    mask_stack = _filter_cp_masks(masks=masks,
+                                                flows=flows,
+                                                filter_size=object_settings['filter_size'],
+                                                filter_intensity=object_settings['filter_intensity'],
+                                                minimum_size=object_settings['minimum_size'],
+                                                maximum_size=object_settings['maximum_size'],
+                                                remove_border_objects=object_settings['remove_border_objects'],
+                                                merge=object_settings['merge'],
+                                                batch=batch,
+                                                plot=settings['plot'],
+                                                figuresize=figuresize)
+                    _save_object_counts_to_database(mask_stack, object_type, batch_filenames, count_loc, added_string='_after_filtration')
+                else:
+                    mask_stack = _masks_to_masks_stack(masks)
+                    if settings['plot']:
+                        for idx, (mask, flow, image) in enumerate(zip(masks, flows[0], batch)):
+                            if idx == 0:
+                                num_objects = mask_object_count(mask)
+                                print(f'Number of objects, : {num_objects}')
+                                plot_masks(batch=image, masks=mask, flows=flow, cmap='inferno', figuresize=figuresize, nr=1, file_type='.npz', print_object_number=True)
             if not np.any(mask_stack):
                 average_obj_size = 0
             else:
@@ -2255,207 +2590,883 @@ def generate_cellpose_masks_v1(src, settings, object_type):
     torch.cuda.empty_cache()
     return
-def generate_cellpose_masks(src, settings, object_type):
+def generate_masks_from_imgs(src, model, model_name, batch_size, diameter, cellprob_threshold, flow_threshold, grayscale, save, normalize, channels, percentiles, circular, invert, plot, resize, target_height, target_width, remove_background, background, Signal_to_noise, verbose):
+    """
+    Segment every '.tif' image found in `src` with the given model and optionally save the masks.
+    Image files are visited in shuffled order, loaded `batch_size` files at a time and passed to
+    `model.eval` one image at a time. Masks are written to `<src>/<model_name>/` under the image
+    names returned by the loader.
+    Args:
+    src (str): Folder containing the '.tif' images.
+    model: Object whose `eval` method returns 3 or 4 values (masks and flows first).
+    model_name (str): Selects the channel pair passed to `model.eval` and names the output folder.
+    batch_size (int): Number of image files loaded per batch.
+    diameter, cellprob_threshold, flow_threshold: Forwarded unchanged to `model.eval`.
+    grayscale (bool): If True, the channel pair is forced to [0, 0] regardless of `model_name`.
+    save (bool): Write each mask with `cv2.imwrite`.
+    normalize (bool): Load with `_load_normalized_images_and_labels` instead of `_load_images_and_labels`.
+    channels, percentiles, remove_background, background, Signal_to_noise: Forwarded to the normalizing loader.
+    circular, invert: Forwarded to whichever loader is used.
+    plot (bool): Show image, mask and flows for every image (also forwarded to the normalizing loader).
+    resize (bool): Resize images to (target_height, target_width) before segmentation and resize the masks back to the original size afterwards.
+    verbose (bool): Print the segmentation settings once before processing.
+    Raises:
+    ValueError: If `model.eval` returns neither 3 nor 4 values.
+    """
-    from .utils import _masks_to_masks_stack, _filter_cp_masks, _get_cellpose_batch_size, _get_object_settings, _get_cellpose_channels, _choose_model, mask_object_count
-    from .io import _create_database, _save_object_counts_to_database, _check_masks, _get_avg_object_size
-    from .timelapse import _npz_to_movie, _btrack_track_cells, _trackpy_track_cells
-    from .plot import plot_masks
+    from .io import _load_images_and_labels, _load_normalized_images_and_labels
+    from .utils import resize_images_and_labels, resizescikit
+    from .plot import print_mask_and_flows
+    dst = os.path.join(src, model_name)
+    os.makedirs(dst, exist_ok=True)
+    # Channel pair per model name. 'nuclei' is the name check_cellpose_models passes in;
+    # 'nucleus' is kept so existing callers get the same pair as before.
+    chans_by_model = {'cyto2': [2, 1], 'nucleus': [0, 0], 'nuclei': [0, 0], 'cyto': [1, 0]}
+    chans = [0, 0] if grayscale else chans_by_model.get(model_name, [2, 0])
-    gc.collect()
-    if not torch.cuda.is_available():
-        print(f'Torch CUDA is not available, using CPU')
+    all_image_files = [os.path.join(src, f) for f in os.listdir(src) if f.endswith('.tif')]
+    random.shuffle(all_image_files)
-    figuresize=25
-    timelapse = settings['timelapse']
-    if timelapse:
-        timelapse_displacement = settings['timelapse_displacement']
-        timelapse_frame_limits = settings['timelapse_frame_limits']
-        timelapse_memory = settings['timelapse_memory']
-        timelapse_remove_transient = settings['timelapse_remove_transient']
-        timelapse_mode = settings['timelapse_mode']
-        timelapse_objects = settings['timelapse_objects']
-    batch_size = settings['batch_size']
-    cellprob_threshold = settings[f'{object_type}_CP_prob']
-    flow_threshold = 30
+    if verbose:
+        print(f'Cellpose settings: Model: {model_name}, channels: {channels}, cellpose_chans: {chans}, diameter:{diameter}, flow_threshold:{flow_threshold}, cellprob_threshold:{cellprob_threshold}')
-    object_settings = _get_object_settings(object_type, settings)
-    model_name = object_settings['model_name']
+    time_ls = []
+    for i in range(0, len(all_image_files), batch_size):
+        image_files = all_image_files[i:i+batch_size]
+        if normalize:
+            images, _, image_names, _ = _load_normalized_images_and_labels(image_files, None, channels, percentiles,  circular, invert, plot, remove_background, background, Signal_to_noise)
+        else:
+            images, _, image_names, _ = _load_images_and_labels(image_files, None, circular, invert)
+        # Drop a trailing singleton axis and remember each image's (height, width) before any resizing
+        images = [np.squeeze(img) if img.shape[-1] == 1 else img for img in images]
+        orig_dims = [(image.shape[0], image.shape[1]) for image in images]
+        if resize:
+            images, _ = resize_images_and_labels(images, None, target_height, target_width, True)
+        for file_index, stack in enumerate(images):
+            start = time.time()
+            output = model.eval(x=stack,
+                         normalize=False,
+                         channels=chans,
+                         channel_axis=3,
+                         diameter=diameter,
+                         flow_threshold=flow_threshold,
+                         cellprob_threshold=cellprob_threshold,
+                         rescale=False,
+                         resample=False,
+                         progress=False)
+            # eval may return 3 or 4 values; only the first two are used here
+            if len(output) == 4:
+                mask, flows, _, _ = output
+            elif len(output) == 3:
+                mask, flows, _ = output
+            else:
+                raise ValueError("Unexpected number of return values from model.eval()")
+            if resize:
+                # Bring the mask back to the size the image had before resizing
+                dims = orig_dims[file_index]
+                mask = resizescikit(mask, dims, order=0, preserve_range=True, anti_aliasing=False).astype(mask.dtype)
+            time_ls.append(time.time() - start)
+            average_time = np.mean(time_ls)
+            print(f'Processing {file_index+1}/{len(images)} images : Time/image {average_time:.3f} sec', end='\r', flush=True)
+            if plot:
+                if resize:
+                    stack = resizescikit(stack, dims, preserve_range=True, anti_aliasing=False).astype(stack.dtype)
+                print_mask_and_flows(stack, mask, flows, overlay=True)
+            if save:
+                output_filename = os.path.join(dst, image_names[file_index])
+                cv2.imwrite(output_filename, mask)
+def check_cellpose_models(settings):
+    """
+    Run several Cellpose model types on the images in settings['src'], one after the other.
+    Missing options are filled into `settings` in place before use. For each of 'cyto',
+    'nuclei', 'cyto2' and 'cyto3' a model is created and handed to `generate_masks_from_imgs`.
+    Args:
+    settings (dict): Must contain 'src'. Optional keys and their fallbacks: batch_size=10, CP_prob=0,
+    flow_threshold=0.4, save=True, normalize=True, channels=[0,0], percentiles=None, circular=False,
+    invert=False, plot=True, diameter=40, grayscale=True, remove_background=False, background=100,
+    Signal_to_noise=5, verbose=False, resize=False, target_height=None, target_width=None.
+    Returns:
+    None
+    """
-    cellpose_channels = _get_cellpose_channels(src, settings['nucleus_channel'], settings['pathogen_channel'], settings['cell_channel'])
+    src = settings['src']
+    # Options the caller did not provide; existing entries in `settings` are left untouched
+    fallback_settings = {
+        'batch_size': 10,
+        'CP_prob': 0,
+        'flow_threshold': 0.4,
+        'save': True,
+        'normalize': True,
+        'channels': [0,0],
+        'percentiles': None,
+        'circular': False,
+        'invert': False,
+        'plot': True,
+        'diameter': 40,
+        'grayscale': True,
+        'remove_background': False,
+        'background': 100,
+        'Signal_to_noise': 5,
+        'verbose': False,
+        'resize': False,
+        'target_height': None,
+        'target_width': None,
+    }
+    for option, fallback in fallback_settings.items():
+        settings.setdefault(option, fallback)
     if settings['verbose']:
-        print(cellpose_channels)
+        # Show the effective settings as a two-column table
+        overview = pd.DataFrame(list(settings.items()), columns=['setting_key', 'setting_value'])
+        overview['setting_value'] = overview['setting_value'].apply(str)
+        display(overview)
-    channels = cellpose_channels[object_type]
-    cellpose_batch_size = _get_cellpose_batch_size()
+    model_names = ('cyto', 'nuclei', 'cyto2', 'cyto3')
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-    model = _choose_model(model_name, device, object_type='cell', restore_type=None)
-    chans = [2, 1] if model_name == 'cyto2' else [0,0] if model_name == 'nucleus' else [2,0] if model_name == 'cyto' else [2, 0] if model_name == 'cyto3' else [2, 0]
-    paths = [os.path.join(src, file) for file in os.listdir(src) if file.endswith('.npz')]
-    count_loc = os.path.dirname(src)+'/measurements/measurements.db'
-    os.makedirs(os.path.dirname(src)+'/measurements', exist_ok=True)
-    _create_database(count_loc)
+    for model_name in model_names:
+        model = cp_models.CellposeModel(gpu=True, model_type=model_name, device=device)
+        print(f'Using {model_name}')
+        generate_masks_from_imgs(src=src,
+                                 model=model,
+                                 model_name=model_name,
+                                 batch_size=settings['batch_size'],
+                                 diameter=settings['diameter'],
+                                 cellprob_threshold=settings['CP_prob'],
+                                 flow_threshold=settings['flow_threshold'],
+                                 grayscale=settings['grayscale'],
+                                 save=settings['save'],
+                                 normalize=settings['normalize'],
+                                 channels=settings['channels'],
+                                 percentiles=settings['percentiles'],
+                                 circular=settings['circular'],
+                                 invert=settings['invert'],
+                                 plot=settings['plot'],
+                                 resize=settings['resize'],
+                                 target_height=settings['target_height'],
+                                 target_width=settings['target_width'],
+                                 remove_background=settings['remove_background'],
+                                 background=settings['background'],
+                                 Signal_to_noise=settings['Signal_to_noise'],
+                                 verbose=settings['verbose'])
+    return
+def save_results_and_figure(src, fig, results):
+    """
+    Write the comparison results and figure into `<src>/results`.
+    Args:
+    src (str): Parent folder; the 'results' subfolder is created if it does not exist.
+    fig: Object with a `savefig(path, format=...)` method.
+    results (pandas.DataFrame or DataFrame-compatible data): Saved as 'results.csv' without the index.
+    """
+    # Anything that is not already a DataFrame is converted before touching the disk
+    table = results if isinstance(results, pd.DataFrame) else pd.DataFrame(results)
+    out_dir = os.path.join(src, 'results')
+    os.makedirs(out_dir, exist_ok=True)
+    csv_file = os.path.join(out_dir, 'results.csv')
+    pdf_file = os.path.join(out_dir, 'model_comparison_plot.pdf')
+    table.to_csv(csv_file, index=False)
+    fig.savefig(pdf_file, format='pdf')
+    print(f'Saved figure to {pdf_file} and results to {csv_file}')
+def compare_mask(args):
+    """
+    Compare the masks of one file across several directories, pair by pair.
+    Takes a single tuple so that it can be mapped over a process pool.
+    Args:
+    args (tuple): (src, filename, dirs, conditions). `src` is unpacked but not used; `dirs` are the
+    folders expected to contain `filename`; `conditions` are the labels, in the same order as `dirs`,
+    used to build the result keys.
+    Returns:
+    dict or None: {'filename': filename} plus 'jaccard_<a>_<b>', 'boundary_f1_<a>_<b>' and 'ap_<a>_<b>'
+    for every pair of conditions, or None when the file is missing from any directory.
+    """
+    from itertools import combinations
+    _, filename, dirs, conditions = args
+    paths = [os.path.join(d, filename) for d in dirs]
+    if not all(os.path.exists(path) for path in paths):
+        return None
+    # Import here to avoid issues in multiprocessing; only the names actually used are imported
+    from .io import _read_mask
+    from .utils import boundary_f1_score, compute_segmentation_ap, jaccard_index
+    masks = [_read_mask(path) for path in paths]
+    file_results = {'filename': filename}
+    # Every unordered pair (i < j) of masks is scored once
+    for i, j in combinations(range(len(masks)), 2):
+        mask_i, mask_j = masks[i], masks[j]
+        f1_score = boundary_f1_score(mask_i, mask_j)
+        jac_index = jaccard_index(mask_i, mask_j)
+        ap_score = compute_segmentation_ap(mask_i, mask_j)
+        file_results.update({
+            f'jaccard_{conditions[i]}_{conditions[j]}': jac_index,
+            f'boundary_f1_{conditions[i]}_{conditions[j]}': f1_score,
+            f'ap_{conditions[i]}_{conditions[j]}': ap_score
+        })
-    average_sizes = []
-    time_ls = []
-    for file_index, path in enumerate(paths):
-        name = os.path.basename(path)
-        name, ext = os.path.splitext(name)
-        output_folder = os.path.join(os.path.dirname(path), object_type+'_mask_stack')
-        os.makedirs(output_folder, exist_ok=True)
-        overall_average_size = 0
-        with np.load(path) as data:
-            stack = data['data']
-            filenames = data['filenames']
-        if settings['timelapse']:
+    return file_results
-            trackable_objects = ['cell','nucleus','pathogen']
-            if not all_elements_match(settings['timelapse_objects'], trackable_objects):
-                print(f'timelapse_objects {settings["timelapse_objects"]} must be a subset of {trackable_objects}')
-                return
+def compare_cellpose_masks(src, verbose=False, processes=None, save=True):
+    """
+    Compare the masks stored in the subfolders of `src` and save the scores and a summary figure.
+    Every subfolder of `src` except 'results' is treated as one condition. Files whose names occur
+    in all of these folders are scored pairwise by `compare_mask` in a process pool; the scores are
+    plotted with `plot_comparison_results` and written out by `save_results_and_figure`.
+    Args:
+    src (str): Folder whose subfolders contain the masks to compare.
+    verbose (bool): If True, also visualize the masks of every compared file.
+    processes (int or None): Passed to `Pool(processes=...)`.
+    save (bool): Forwarded to `visualize_cellpose_masks`; only used when `verbose` is True.
+    Returns:
+    None
+    """
+    from .plot import visualize_cellpose_masks, plot_comparison_results
+    from .io import _read_mask
-            if len(stack) != batch_size:
-                print(f'Changed batch_size:{batch_size} to {len(stack)}, data length:{len(stack)}')
-                settings['timelapse_batch_size'] = len(stack)
-                batch_size = len(stack)
-                if isinstance(timelapse_frame_limits, list):
-                    if len(timelapse_frame_limits) >= 2:
-                        stack = stack[timelapse_frame_limits[0]: timelapse_frame_limits[1], :, :, :].astype(stack.dtype)
-                        filenames = filenames[timelapse_frame_limits[0]: timelapse_frame_limits[1]]
-                        batch_size = len(stack)
-                        print(f'Cut batch at indecies: {timelapse_frame_limits}, New batch_size: {batch_size} ')
+    # 'results' is where save_results_and_figure writes its output, so it is not a condition
+    dirs = sorted(os.path.join(src, d) for d in os.listdir(src) if os.path.isdir(os.path.join(src, d)) and d != 'results')
+    if not dirs:
+        # Without this guard dirs[0] below raises IndexError
+        print(f'No mask directories found in {src}, nothing to compare')
+        return
+    conditions = [os.path.basename(d) for d in dirs]
-        for i in range(0, stack.shape[0], batch_size):
-            mask_stack = []
-            start = time.time()
+    # Get common files in all directories; sorted so the result rows have a stable order between runs
+    common_files = set(os.listdir(dirs[0]))
+    for d in dirs[1:]:
+        common_files.intersection_update(os.listdir(d))
+    common_files = sorted(common_files)
-            if stack.shape[3] == 1:
-                batch = stack[i: i+batch_size, :, :, [0,0]].astype(stack.dtype)
-            else:
-                batch = stack[i: i+batch_size, :, :, channels].astype(stack.dtype)
+    # Create a pool of workers
+    with Pool(processes=processes) as pool:
+        args = [(src, filename, dirs, conditions) for filename in common_files]
+        results = pool.map(compare_mask, args)
-            batch_filenames = filenames[i: i+batch_size].tolist()
+    # Filter out None results (from skipped files)
+    results = [res for res in results if res is not None]
+    if verbose:
+        for result in results:
+            filename = result['filename']
+            masks = [_read_mask(os.path.join(d, filename)) for d in dirs]
+            visualize_cellpose_masks(masks, titles=conditions, filename=filename, save=save, src=src)
-            if not settings['plot']:
-                batch, batch_filenames = _check_masks(batch, batch_filenames, output_folder)
-            if batch.size == 0:
-                print(f'Processing {file_index}/{len(paths)}: Images/npz {batch.shape[0]}')
-                continue
-            if batch.max() > 1:
-                batch = batch / batch.max()
+    fig = plot_comparison_results(results)
+    save_results_and_figure(src, fig, results)
+    return
-            if timelapse:
-                stitch_threshold=100.0
-                movie_path = os.path.join(os.path.dirname(src), 'movies')
-                os.makedirs(movie_path, exist_ok=True)
-                save_path = os.path.join(movie_path, f'timelapse_{object_type}_{name}.mp4')
-                _npz_to_movie(batch, batch_filenames, save_path, fps=2)
-            else:
-                stitch_threshold=0.0
-            print('batch.shape',batch.shape)
-            masks, flows, _, _ = model.eval(x=batch,
-                                            batch_size=cellpose_batch_size,
-                                            normalize=False,
-                                            channels=chans,
-                                            channel_axis=3,
-                                            diameter=object_settings['diameter'],
-                                            flow_threshold=flow_threshold,
-                                            cellprob_threshold=cellprob_threshold,
-                                            rescale=None,
-                                            resample=object_settings['resample'],
-                                            stitch_threshold=stitch_threshold)
-            if timelapse:
+def _calculate_similarity(df, features, col_to_compare, val1, val2):
+    """
+    Score every row of `df` against the mean feature vectors of two reference groups.
+    For each metric two columns are added to `df` in place: 'similarity_to_pos_<metric>' and
+    'similarity_to_neg_<metric>'. The values are distances (smaller means closer), despite the column names.
+    Metrics: euclidean, cosine, mahalanobis, manhattan, minkowski (p=3), chebyshev, hamming, jaccard, braycurtis.
+    Args:
+    df (pandas.DataFrame): DataFrame containing the data; modified in place.
+    features (list): List of feature columns to use for the distance calculation.
+    col_to_compare (str): Column name to use for comparing groups.
+    val1, val2 (str): Values in col_to_compare selecting the positive (val1) and negative (val2) reference rows.
-                if settings['plot']:
-                    for idx, (mask, flow, image) in enumerate(zip(masks, flows[0], batch)):
-                        if idx == 0:
-                            num_objects = mask_object_count(mask)
-                            print(f'Number of objects: {num_objects}')
-                            plot_masks(batch=image, masks=mask, flows=flow, cmap='inferno', figuresize=figuresize, nr=1, file_type='.npz', print_object_number=True)
+    Returns:
+    pandas.DataFrame: The same DataFrame with the added distance columns.
+    """
+    # Mean feature vector of the positive and negative reference rows
+    pos_control = df[df[col_to_compare] == val1][features].mean()
+    neg_control = df[df[col_to_compare] == val2][features].mean()
+    # The Mahalanobis distance needs the inverse covariance of the same (unscaled) feature space
+    # that the rows and reference means are expressed in. atleast_2d keeps a single feature working,
+    # where np.cov returns a 0-d array.
+    cov_matrix = np.atleast_2d(np.cov(df[features].to_numpy(dtype=float), rowvar=False))
+    try:
+        inv_cov_matrix = np.linalg.inv(cov_matrix)
+    except np.linalg.LinAlgError:
+        # Add a small value to the diagonal elements for regularization
+        epsilon = 1e-5
+        inv_cov_matrix = np.linalg.inv(cov_matrix + np.eye(cov_matrix.shape[0]) * epsilon)
+    # Column suffix -> distance function taking (row, reference); order fixes the column order
+    metrics = {
+        'euclidean': euclidean,
+        'cosine': cosine,
+        'mahalanobis': lambda u, v: mahalanobis(u, v, inv_cov_matrix),
+        'manhattan': cityblock,
+        'minkowski': lambda u, v: minkowski(u, v, p=3),
+        'chebyshev': chebyshev,
+        'hamming': hamming,
+        'jaccard': jaccard,
+        'braycurtis': braycurtis,
+    }
+    for metric_name, distance in metrics.items():
+        for label, reference in (('pos', pos_control), ('neg', neg_control)):
+            # Defaults bind the current function and reference to this lambda
+            df[f'similarity_to_{label}_{metric_name}'] = df[features].apply(lambda row, fn=distance, ref=reference: fn(row, ref), axis=1)
+    return df
-                _save_object_counts_to_database(masks, object_type, batch_filenames, count_loc, added_string='_timelapse')
-                if object_type in timelapse_objects:
-                    if timelapse_mode == 'btrack':
-                        if not timelapse_displacement is None:
-                            radius = timelapse_displacement
-                        else:
-                            radius = 100
+def _permutation_importance(df, feature_string='channel_3', col_to_compare='col', pos='c1', neg='c2', exclude=None, n_repeats=10, clean=True, nr_to_plot=30, n_estimators=100, test_size=0.2, random_state=42, model_type='xgboost', n_jobs=-1):
+    """
+    Calculates permutation importance for numerical features in the dataframe,
+    comparing groups based on specified column values and uses the model to predict
+    the class for all other rows in the dataframe.
-                        workers = os.cpu_count()-2
-                        if workers < 1:
-                            workers = 1
+    Args:
+    df (pandas.DataFrame): The DataFrame containing the data.
+    feature_string (str): String to filter features that contain this substring.
+    col_to_compare (str): Column name to use for comparing groups.
+    pos, neg (str): Values in col_to_compare to create subsets for comparison.
+    exclude (list or str, optional): Columns to exclude from features.
+    n_repeats (int): Number of repeats for permutation importance.
+    clean (bool): Whether to remove columns with a single value.
+    nr_to_plot (int): Number of top features to plot based on permutation importance.
+    n_estimators (int): Number of trees in the random forest, gradient boosting, or XGBoost model.
+    test_size (float): Proportion of the dataset to include in the test split.
+    random_state (int): Random seed for reproducibility.
+    model_type (str): Type of model to use ('random_forest', 'logistic_regression', 'gradient_boosting', 'xgboost').
+    n_jobs (int): Number of jobs to run in parallel for applicable models.
-                        mask_stack = _btrack_track_cells(src=src,
-                                                         name=name,
-                                                         batch_filenames=batch_filenames,
-                                                         object_type=object_type,
-                                                         plot=settings['plot'],
-                                                         save=settings['save'],
-                                                         masks_3D=masks,
-                                                         mode=timelapse_mode,
-                                                         timelapse_remove_transient=timelapse_remove_transient,
-                                                         radius=radius,
-                                                         workers=workers)
-                    if timelapse_mode == 'trackpy':
-                        mask_stack = _trackpy_track_cells(src=src,
-                                                          name=name,
-                                                          batch_filenames=batch_filenames,
-                                                          object_type=object_type,
-                                                          masks=masks,
-                                                          timelapse_displacement=timelapse_displacement,
-                                                          timelapse_memory=timelapse_memory,
-                                                          timelapse_remove_transient=timelapse_remove_transient,
-                                                          plot=settings['plot'],
-                                                          save=settings['save'],
-                                                          mode=timelapse_mode)
-                else:
-                    mask_stack = _masks_to_masks_stack(masks)
+    Returns:
+    pandas.DataFrame: The original dataframe with added prediction and data usage columns.
+    pandas.DataFrame: DataFrame containing the importances and standard deviations.
+    """
+    from .utils import filter_dataframe_features
+    if 'cells_per_well' in df.columns:
+        df = df.drop(columns=['cells_per_well'])
+    # Subset the dataframe based on specified column values
+    df1 = df[df[col_to_compare] == pos].copy()
+    df2 = df[df[col_to_compare] == neg].copy()
+    # Create target variable
+    df1['target'] = 0
+    df2['target'] = 1
+    # Combine the subsets for analysis
+    combined_df = pd.concat([df1, df2])
+    if feature_string in ['channel_0', 'channel_1', 'channel_2', 'channel_3']:
+        channel_of_interest = int(feature_string.split('_')[-1])
+    elif not feature_string is 'morphology':
+        channel_of_interest = 'morphology'
+    _, features = filter_dataframe_features(combined_df, channel_of_interest, exclude)
+    X = combined_df[features]
+    y = combined_df['target']
+    # Split the data into training and testing sets
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
+    # Label the data in the original dataframe
+    combined_df['data_usage'] = 'train'
+    combined_df.loc[X_test.index, 'data_usage'] = 'test'
+    # Initialize the model based on model_type
+    if model_type == 'random_forest':
+        model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state, n_jobs=n_jobs)
+    elif model_type == 'logistic_regression':
+        model = LogisticRegression(max_iter=1000, random_state=random_state, n_jobs=n_jobs)
+    elif model_type == 'gradient_boosting':
+        model = HistGradientBoostingClassifier(max_iter=n_estimators, random_state=random_state)  # Supports n_jobs internally
+    elif model_type == 'xgboost':
+        model = XGBClassifier(n_estimators=n_estimators, random_state=random_state, nthread=n_jobs, use_label_encoder=False, eval_metric='logloss')
+    else:
+        raise ValueError(f"Unsupported model_type: {model_type}")
+    model.fit(X_train, y_train)
+    perm_importance = permutation_importance(model, X_train, y_train, n_repeats=n_repeats, random_state=random_state, n_jobs=n_jobs)
+    # Create a DataFrame for permutation importances
+    permutation_df = pd.DataFrame({
+        'feature': [features[i] for i in perm_importance.importances_mean.argsort()],
+        'importance_mean': perm_importance.importances_mean[perm_importance.importances_mean.argsort()],
+        'importance_std': perm_importance.importances_std[perm_importance.importances_mean.argsort()]
+    }).tail(nr_to_plot)
+    # Plotting
+    fig, ax = plt.subplots()
+    ax.barh(permutation_df['feature'], permutation_df['importance_mean'], xerr=permutation_df['importance_std'], color="teal", align="center", alpha=0.6)
+    ax.set_xlabel('Permutation Importance')
+    plt.tight_layout()
+    plt.show()
+    # Feature importance for models that support it
+    if model_type in ['random_forest', 'xgboost', 'gradient_boosting']:
+        feature_importances = model.feature_importances_
+        feature_importance_df = pd.DataFrame({
+            'feature': features,
+            'importance': feature_importances
+        }).sort_values(by='importance', ascending=False).head(nr_to_plot)
+        # Plotting feature importance
+        fig, ax = plt.subplots()
+        ax.barh(feature_importance_df['feature'], feature_importance_df['importance'], color="blue", align="center", alpha=0.6)
+        ax.set_xlabel('Feature Importance')
+        plt.tight_layout()
+        plt.show()
+    else:
+        feature_importance_df = pd.DataFrame()
+    # Predicting the target variable for the test set
+    predictions_test = model.predict(X_test)
+    combined_df.loc[X_test.index, 'predictions'] = predictions_test
+    # Predicting the target variable for the training set
+    predictions_train = model.predict(X_train)
+    combined_df.loc[X_train.index, 'predictions'] = predictions_train
+    # Predicting the target variable for all other rows in the dataframe
+    X_all = df[features]
+    all_predictions = model.predict(X_all)
+    df['predictions'] = all_predictions
+    # Combine data usage labels back to the original dataframe
+    combined_data_usage = pd.concat([combined_df[['data_usage']], df[['predictions']]], axis=0)
+    df = df.join(combined_data_usage, how='left', rsuffix='_model')
+    # Calculating and printing the accuracy metrics
+    accuracy = accuracy_score(y_test, predictions_test)
+    precision = precision_score(y_test, predictions_test)
+    recall = recall_score(y_test, predictions_test)
+    f1 = f1_score(y_test, predictions_test)
+    print(f"Accuracy: {accuracy}")
+    print(f"Precision: {precision}")
+    print(f"Recall: {recall}")
+    print(f"F1 Score: {f1}")
+    # Printing class-specific accuracy metrics
+    print("\nClassification Report:")
+    print(classification_report(y_test, predictions_test))
+    df = _calculate_similarity(df, features, col_to_compare, pos, neg)
+    return [df, permutation_df, feature_importance_df, model, X_train, X_test, y_train, y_test]
+def _shap_analysis(model, X_train, X_test):
+    """
+    Compute SHAP values for the test set and draw the SHAP summary plot.
+    An explainer is created from the model together with X_train, then applied to X_test; the resulting values are handed to shap.summary_plot.
+    Args:
+    model: The trained model to explain.
+    X_train (pandas.DataFrame): Training feature set, passed as the second argument to shap.Explainer.
+    X_test (pandas.DataFrame): Testing feature set the SHAP values are computed and plotted for.
+    Returns:
+    None
+    """
+    # Build the explainer and evaluate it on the test features in one step
+    test_shap_values = shap.Explainer(model, X_train)(X_test)
+    shap.summary_plot(test_shap_values, X_test)
+def plate_heatmap(src, model_type='xgboost', variable='predictions', grouping='mean', min_max='allq', cmap='viridis', channel_of_interest=3, min_count=25, n_estimators=100, col_to_compare='col', pos='c1', neg='c2', exclude=None, n_repeats=10, clean=True, nr_to_plot=20, verbose=False, n_jobs=-1):
+    """
+    Train a classifier on the measurements of one experiment and plot a per-plate heatmap of a chosen variable.
+    The 'cell', 'nucleus', 'pathogen' and 'cytoplasm' tables are read from src + '/measurements/measurements.db', the result is passed to _permutation_importance and _shap_analysis, and the dataframe returned by _permutation_importance is plotted with _plot_plates.
+    Args:
+    src (str): Source directory of the experiment.
+    model_type (str): Model type forwarded to _permutation_importance.
+    variable (str): Numeric column of the resulting dataframe to plot.
+    grouping, min_max, cmap, min_count: Forwarded unchanged to _plot_plates.
+    channel_of_interest (int or None): Channel used for the 'recruitment' column and the feature string; None skips both.
+    n_estimators, col_to_compare, pos, neg, exclude, n_repeats, clean, nr_to_plot, n_jobs: Forwarded to _permutation_importance.
+    verbose (bool): Forwarded to _read_and_merge_data.
+    Returns:
+    list: [output of _permutation_importance, result of _plot_plates].
+    Raises:
+    ValueError: If variable is not a numeric column of the resulting dataframe.
+    """
+    from .io import _read_and_merge_data
+    from .plot import _plot_plates
+    df, _ = _read_and_merge_data([src+'/measurements/measurements.db'],
+                                 ['cell', 'nucleus', 'pathogen','cytoplasm'],
+                                 verbose=verbose,
+                                 include_multinucleated=True,
+                                 include_multiinfected=2.0,
+                                 include_noninfected=True)
+    feature_string = None
+    if channel_of_interest is not None:
+        # Recruitment is the pathogen mean intensity divided by the cytoplasm mean intensity of the same channel
+        pathogen_intensity = df[f'pathogen_channel_{channel_of_interest}_mean_intensity']
+        cytoplasm_intensity = df[f'cytoplasm_channel_{channel_of_interest}_mean_intensity']
+        df['recruitment'] = pathogen_intensity/cytoplasm_intensity
+        feature_string = f'channel_{channel_of_interest}'
+    output = _permutation_importance(df, feature_string, col_to_compare, pos, neg, exclude, n_repeats, clean, nr_to_plot, n_estimators=n_estimators, random_state=42, model_type=model_type, n_jobs=n_jobs)
+    # Positions in output: 0 = dataframe, 3 = model, 4 = X_train, 5 = X_test
+    scored_df, fitted_model, train_features, test_features = output[0], output[3], output[4], output[5]
+    _shap_analysis(fitted_model, train_features, test_features)
+    numeric_columns = scored_df.select_dtypes(include=[np.number]).columns.tolist()
+    if variable not in numeric_columns:
+        raise ValueError(f"Variable {variable} not found in the dataframe. Please choose one of the following: {numeric_columns}")
+    heatmap = _plot_plates(scored_df, variable, grouping, min_max, cmap, min_count)
+    return [output, heatmap]
+def join_measurments_and_annotation(src, tables=None):
+    """
+    Join the merged measurement tables with the 'png_list' table of the same database.
+    Args:
+    src (str): Source directory; the database is read from src + '/measurements/measurements.db'.
+    tables (list): Measurement tables to merge. None (the default) selects ['cell', 'nucleus', 'pathogen','cytoplasm'].
+    Returns:
+    pandas.DataFrame: The merged measurements left-joined on 'prcfo' with the first dataframe returned by _read_db for 'png_list'.
+    """
+    from .io import _read_and_merge_data, _read_db
+    # Build the default list per call so the list handed to the helper is never shared between calls
+    if tables is None:
+        tables = ['cell', 'nucleus', 'pathogen','cytoplasm']
+    loc = src+'/measurements/measurements.db'
+    df, _ = _read_and_merge_data([loc],
+                                 tables,
+                                 verbose=True,
+                                 include_multinucleated=True,
+                                 include_multiinfected=True,
+                                 include_noninfected=True)
+    paths_df = _read_db(loc, tables=['png_list'])
+    merged_df = pd.merge(df, paths_df[0], on='prcfo', how='left')
+    return merged_df
+def jitterplot_by_annotation(src, x_column, y_column, plot_title='Jitter Plot', output_path=None, filter_column=None, filter_values=None):
+    """
+    Create a jitter plot of one measurement column grouped by an annotation column.
+    The data are loaded with join_measurments_and_annotation(src). Missing values in x_column are labelled 'NaN', every group in x_column is sampled down to the size of the smallest group, and the balanced data are drawn with seaborn.stripplot.
+    Args:
+    src (str): Path to the source data.
+    x_column (str): Name of the column to be used for the x-axis.
+    y_column (str): Name of the column to be used for the y-axis.
+    plot_title (str): Title of the plot. Default is 'Jitter Plot'.
+    output_path (str): Path to save the plot image. If None, the plot will be displayed. Default is None.
+    filter_column (str or list): Column, or list of columns, to filter on before plotting. Default is None (no filtering).
+    filter_values (list): Values to keep. For a single filter_column this is the list of accepted values; for a list of columns it holds one list of accepted values per column, in the same order.
+    Returns:
+    pd.DataFrame: The filtered and balanced DataFrame.
+    Raises:
+    ValueError: If filter_column is given without filter_values.
+    KeyError: If the DataFrame lacks the 'plate_x', 'row_x' or 'col_x' columns.
+    """
+    # Load measurements joined with the annotation table
+    df = join_measurments_and_annotation(src, tables=['cell', 'nucleus', 'pathogen', 'cytoplasm'])
+    print(f"Generated dataframe with: {df.shape[1]} columns and {df.shape[0]} rows")
+    # Replace NaN values with a specific label in x_column
+    df[x_column] = df[x_column].fillna('NaN')
+    # Filter the DataFrame if filter_column and filter_values are provided
+    if filter_column is not None:
+        if filter_values is None:
+            raise ValueError("filter_values must be provided when filter_column is set")
+        if isinstance(filter_column, str):
+            df = df[df[filter_column].isin(filter_values)]
+        elif isinstance(filter_column, list):
+            # One list of accepted values per filter column, matched by position
+            for i, val in enumerate(filter_column):
+                df = df[df[val].isin(filter_values[i])]
+    # The well identity columns carry the '_x' suffix in the joined dataframe
+    required_columns = ['plate_x', 'row_x', 'col_x']
+    if not all(column in df.columns for column in required_columns):
+        raise KeyError(f"DataFrame does not contain the necessary columns: {required_columns}")
+    # Filter to retain rows with non-NaN values in x_column and with matching plate, row, col values
+    non_nan_df = df[df[x_column] != 'NaN']
+    retained_rows = df[df[required_columns].apply(tuple, axis=1).isin(non_nan_df[required_columns].apply(tuple, axis=1))]
+    # Determine the minimum count of examples across all groups in x_column
+    min_count = retained_rows[x_column].value_counts().min()
+    print(f'Found {min_count} annotated images')
+    # Randomly sample min_count examples from each group in x_column
+    balanced_df = retained_rows.groupby(x_column).apply(lambda x: x.sample(min_count, random_state=42)).reset_index(drop=True)
+    # Create the jitter plot
+    plt.figure(figsize=(10, 6))
+    jitter_plot = sns.stripplot(data=balanced_df, x=x_column, y=y_column, hue=x_column, jitter=True, palette='viridis', dodge=False)
+    plt.title(plot_title)
+    plt.xlabel(x_column)
+    plt.ylabel(y_column)
+    # Customize the x-axis labels
+    plt.xticks(rotation=45, ha='right')
+    # Adjust the position of the x-axis labels to be centered below the data
+    ax = plt.gca()
+    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='center')
+    # Save the plot to a file or display it
+    if output_path:
+        plt.savefig(output_path, bbox_inches='tight')
+        print(f"Jitter plot saved to {output_path}")
+    else:
+        plt.show()
+    return balanced_df
+def generate_image_umap(settings=None):
+    """
+    Generate UMAP or tSNE embedding and visualize the data with clustering.
+    The settings actually used are those returned by get_umap_image_settings(settings); they are written to <src[0]>/settings/embedding_settings.csv and the resulting dataframe to <src[0]>/results/embedding_results.csv.
+    Parameters:
+    settings (dict): Dictionary containing the following keys (None is treated as an empty dictionary):
+        src (str or list): Source directory, or list of source directories, containing the data.
+        row_limit (int): Limit the number of rows to process.
+        tables (list): List of table names to read from the database.
+        visualize (str): Visualization type.
+        image_nr (int): Number of images to display.
+        dot_size (int): Size of dots in the scatter plot.
+        n_neighbors (int): Number of neighbors for UMAP.
+        figuresize (int): Size of the figure.
+        black_background (bool): Whether to use a black background.
+        remove_image_canvas (bool): Whether to remove the image canvas.
+        plot_outlines (bool): Whether to plot outlines.
+        plot_points (bool): Whether to plot points.
+        smooth_lines (bool): Whether to smooth lines.
+        verbose (bool): Whether to print verbose output.
+        embedding_by_controls (bool): Whether to use embedding from controls.
+        col_to_compare (str): Column to compare for control-based embedding.
+        pos (str): Positive control value.
+        neg (str): Negative control value.
+        clustering (str): Clustering method ('DBSCAN' or 'KMeans').
+        exclude (list): List of columns to exclude from the analysis.
+        plot_images (bool): Whether to plot images.
+        reduction_method (str): Dimensionality reduction method ('UMAP' or 'tSNE').
+        save_figure (bool): Whether to save the figure as a PDF.
+        Further keys read by this function: color_by, mix, exclude_conditions, filter_by, remove_highly_correlated, log_data, min_dist, metric, eps, min_samples, n_jobs, resnet_features, remove_cluster_noise, img_zoom, plot_by_cluster, plot_cluster_grids, analyze_clusters.
+    Returns:
+    pd.DataFrame: DataFrame with the original data, a 'cond' column and an additional column 'cluster' containing the cluster identity (or the values of the color_by column when color_by is set).
+    """
+    from .io import _read_and_join_tables
+    from .utils import get_db_paths, preprocess_data, reduction_and_clustering, remove_noise, generate_colors, correct_paths, plot_embedding, plot_clusters_grid, get_umap_image_settings
+    from .alpha import cluster_feature_analysis, generate_umap_from_images
+    # Use a fresh dictionary per call: this function writes into settings below, so a shared default could carry values over between calls
+    if settings is None:
+        settings = {}
+    settings = get_umap_image_settings(settings)
+    if isinstance(settings['src'], str):
+        settings['src'] = [settings['src']]
+    if settings['plot_images'] is False:
+        settings['black_background'] = False
+    if settings['color_by']:
+        settings['remove_cluster_noise'] = False
+        settings['plot_outlines'] = False
+        settings['smooth_lines'] = False
+    # Record the effective settings next to the first source directory
+    settings_df = pd.DataFrame(list(settings.items()), columns=['Key', 'Value'])
+    settings_dir = os.path.join(settings['src'][0],'settings')
+    settings_csv = os.path.join(settings_dir,'embedding_settings.csv')
+    os.makedirs(settings_dir, exist_ok=True)
+    settings_df.to_csv(settings_csv, index=False)
+    display(settings_df)
+    db_paths = get_db_paths(settings['src'])
+    tables = settings['tables'] + ['png_list']
+    all_df = pd.DataFrame()
+    for i,db_path in enumerate(db_paths):
+        df = _read_and_join_tables(db_path, table_names=tables)
+        df, _ = correct_paths(df, settings['src'][i])
+        all_df = pd.concat([all_df, df], axis=0)
+    all_df['cond'] = all_df['col'].apply(map_condition, neg=settings['neg'], pos=settings['pos'], mix=settings['mix'])
+    if settings['exclude_conditions']:
+        if isinstance(settings['exclude_conditions'], str):
+            settings['exclude_conditions'] = [settings['exclude_conditions']]
+        row_count_before = len(all_df)
+        all_df = all_df[~all_df['cond'].isin(settings['exclude_conditions'])]
+        if settings['verbose']:
+            print(f'Excluded {row_count_before - len(all_df)} rows after excluding: {settings["exclude_conditions"]}, rows left: {len(all_df)}')
+    if settings['row_limit'] is not None:
+        # Treat row_limit as an upper bound: sampling more rows than exist would raise
+        all_df = all_df.sample(n=min(settings['row_limit'], len(all_df)), random_state=42)
+    image_paths = all_df['png_path'].to_list()
+    if settings['embedding_by_controls']:
+        # Extract and reset the index for the column to compare
+        col_to_compare = all_df[settings['col_to_compare']].reset_index(drop=True)
+        # Preprocess the data to obtain numeric data
+        numeric_data = preprocess_data(all_df, settings['filter_by'], settings['remove_highly_correlated'], settings['log_data'], settings['exclude'])
+        # Convert numeric_data to a DataFrame with a fresh index so it aligns with col_to_compare
+        numeric_data_df = pd.DataFrame(numeric_data).reset_index(drop=True)
+        numeric_data_df[settings['col_to_compare']] = col_to_compare
+        # Subset the dataframe based on specified column values for controls
+        positive_control_df = numeric_data_df[numeric_data_df[settings['col_to_compare']] == settings['pos']].copy()
+        negative_control_df = numeric_data_df[numeric_data_df[settings['col_to_compare']] == settings['neg']].copy()
+        control_numeric_data_df = pd.concat([positive_control_df, negative_control_df])
+        # Drop the comparison column again and convert the control rows to a numpy array
+        control_numeric_data = control_numeric_data_df.drop(columns=[settings['col_to_compare']]).values
+        # Train the reducer on control data
+        _, _, reducer = reduction_and_clustering(control_numeric_data, settings['n_neighbors'], settings['min_dist'], settings['metric'], settings['eps'], settings['min_samples'], settings['clustering'], settings['reduction_method'], settings['verbose'], n_jobs=settings['n_jobs'], mode='fit', model=False)
+        # Apply the trained reducer to the entire dataset
+        numeric_data = preprocess_data(all_df, settings['filter_by'], settings['remove_highly_correlated'], settings['log_data'], settings['exclude'])
+        embedding, labels, _ = reduction_and_clustering(numeric_data, settings['n_neighbors'], settings['min_dist'], settings['metric'], settings['eps'], settings['min_samples'], settings['clustering'], settings['reduction_method'], settings['verbose'], n_jobs=settings['n_jobs'], mode=None, model=reducer)
+    else:
+        if settings['resnet_features']:
+            numeric_data, embedding, labels = generate_umap_from_images(image_paths, settings['n_neighbors'], settings['min_dist'], settings['metric'], settings['clustering'], settings['eps'], settings['min_samples'], settings['n_jobs'], settings['verbose'])
+        else:
+            # Fit the reduction and clustering directly on the entire dataset
+            numeric_data = preprocess_data(all_df, settings['filter_by'], settings['remove_highly_correlated'], settings['log_data'], settings['exclude'])
+            embedding, labels, _ = reduction_and_clustering(numeric_data, settings['n_neighbors'], settings['min_dist'], settings['metric'], settings['eps'], settings['min_samples'], settings['clustering'], settings['reduction_method'], settings['verbose'], n_jobs=settings['n_jobs'])
+    if settings['remove_cluster_noise']:
+        # Remove noise from the clusters (removes -1 labels from DBSCAN)
+        # NOTE(review): if remove_noise drops points, labels may no longer match the length of all_df and image_paths used below - confirm
+        embedding, labels = remove_noise(embedding, labels)
+    # Color by a dataframe column instead of the cluster labels when requested
+    if settings['color_by']:
+        labels = all_df[settings['color_by']]
+    # Generate colors for the clusters
+    colors = generate_colors(len(np.unique(labels)), settings['black_background'])
+    # Plot the embedding
+    umap_plt = plot_embedding(embedding, image_paths, labels, settings['image_nr'], settings['img_zoom'], colors, settings['plot_by_cluster'], settings['plot_outlines'], settings['plot_points'], settings['plot_images'], settings['smooth_lines'], settings['black_background'], settings['figuresize'], settings['dot_size'], settings['remove_image_canvas'], settings['verbose'])
+    if settings['plot_cluster_grids'] and settings['plot_images']:
+        grid_plt = plot_clusters_grid(embedding, labels, settings['image_nr'], image_paths, colors, settings['figuresize'], settings['black_background'], settings['verbose'])
+    # All outputs go to the results folder of the first source directory
+    results_dir = os.path.join(settings['src'][0], 'results')
+    # Save figure as PDF if required
+    if settings['save_figure']:
+        os.makedirs(results_dir, exist_ok=True)
+        reduction_method = settings['reduction_method'].upper()
+        embedding_path = os.path.join(results_dir, f'{reduction_method}_embedding.pdf')
+        umap_plt.savefig(embedding_path, format='pdf')
+        print(f'Saved {reduction_method} embedding to {embedding_path}')
+        if settings['plot_cluster_grids'] and settings['plot_images']:
+            grid_path = os.path.join(results_dir, f'{reduction_method}_grid.pdf')
+            grid_plt.savefig(grid_path, format='pdf')
+            print(f'Saved {reduction_method} grid to {grid_path}')
+    # Add cluster labels to the dataframe
+    all_df['cluster'] = labels
+    # Save the results to a CSV file
+    results_csv = os.path.join(results_dir,'embedding_results.csv')
+    os.makedirs(results_dir, exist_ok=True)
+    all_df.to_csv(results_csv, index=False)
+    print(f'Results saved to {results_csv}')
+    if settings['analyze_clusters']:
+        combined_results = cluster_feature_analysis(all_df)
+        cluster_results_csv = os.path.join(results_dir,'cluster_results.csv')
+        combined_results.to_csv(cluster_results_csv, index=False)
+        print(f'Cluster results saved to {cluster_results_csv}')
+    return all_df
+# Define the mapping function
+def map_condition(col_value, neg='c1', pos='c2', mix='c3'):
+    """
+    Translate a column value into its condition label.
+    Args:
+    col_value: Value to classify.
+    neg: Value labelled 'neg'. Default is 'c1'.
+    pos: Value labelled 'pos'. Default is 'c2'.
+    mix: Value labelled 'mix'. Default is 'c3'.
+    Returns:
+    str: 'neg', 'pos' or 'mix' for the first matching value (checked in that order), otherwise 'screen'.
+    """
+    # The candidates are compared in the order neg, pos, mix; the first equal one decides the label
+    for candidate, label in ((neg, 'neg'), (pos, 'pos'), (mix, 'mix')):
+        if col_value == candidate:
+            return label
+    return 'screen'
+def reducer_hyperparameter_search(settings=None, reduction_params=None, dbscan_params=None, kmeans_params=None, save=False):
+    """
+    Perform a hyperparameter search for UMAP or tSNE on the given data.
+    One subplot is drawn for every combination of a reduction parameter set and a clustering parameter set. The reduction method is chosen from the keys of reduction_params ('n_neighbors' selects UMAP, 'perplexity' selects tSNE) and overrides settings['reduction_method'] when they differ.
+    Parameters:
+    settings (dict): Dictionary containing the following keys (None is treated as an empty dictionary; the values actually used are those returned by get_umap_image_settings):
+        src (str): Source directory containing the data.
+        row_limit (int): Limit the number of rows to process.
+        tables (list): List of table names to read from the database.
+        filter_by (str): Column to filter the data.
+        sample_size (int): Number of samples to use for the hyperparameter search.
+        remove_highly_correlated (bool): Whether to remove highly correlated columns.
+        log_data (bool): Whether to log transform the data.
+        verbose (bool): Whether to print verbose output.
+        reduction_method (str): Dimensionality reduction method ('UMAP' or 'tSNE').
+        dot_size (int): Size of the points in the scatter plot.
+    reduction_params (dict or list): Dictionary, or list of dictionaries, containing hyperparameters to test for the reduction method. A float 'n_neighbors' or 'perplexity' is interpreted as a fraction of the number of rows.
+    dbscan_params (dict or list): Dictionary, or list of dictionaries, containing DBSCAN hyperparameters to test.
+    kmeans_params (dict or list): Dictionary, or list of dictionaries, containing KMeans hyperparameters to test.
+    save (bool): Whether to save the resulting plot as a file.
+    Returns:
+    None
+    Raises:
+    ValueError: If reduction_params is empty, mixes UMAP and tSNE keys, contains neither 'n_neighbors' nor 'perplexity', or if no clustering parameters are given.
+    """
+    from .io import _read_and_join_tables
+    from .utils import get_db_paths, preprocess_data, search_reduction_and_clustering, generate_colors, get_umap_image_settings
+    # Use a fresh dictionary per call: settings is written to below, so a shared default could carry values over between calls
+    if settings is None:
+        settings = {}
+    settings = get_umap_image_settings(settings)
+    pointsize = settings['dot_size']
+    if isinstance(dbscan_params, dict):
+        dbscan_params = [dbscan_params]
+    if isinstance(kmeans_params, dict):
+        kmeans_params = [kmeans_params]
+    if isinstance(reduction_params, dict):
+        reduction_params = [reduction_params]
+    if not reduction_params:
+        raise ValueError("reduction_params must contain at least one dictionary of reduction parameters.")
+    # Determine reduction method based on the keys in reduction_params; the conflict is checked first so it cannot be masked
+    uses_umap = any('n_neighbors' in param for param in reduction_params)
+    uses_tsne = any('perplexity' in param for param in reduction_params)
+    if uses_umap and uses_tsne:
+        raise ValueError("Reduction parameters must include 'n_neighbors' for UMAP or 'perplexity' for tSNE, not both.")
+    elif uses_umap:
+        reduction_method = 'umap'
+    elif uses_tsne:
+        reduction_method = 'tsne'
+    else:
+        raise ValueError("Reduction parameters must include 'n_neighbors' for UMAP or 'perplexity' for tSNE.")
+    if settings['reduction_method'].lower() != reduction_method:
+        settings['reduction_method'] = reduction_method
+        print(f'Changed reduction method to {reduction_method} based on the provided parameters.')
+    if settings['verbose']:
+        display(pd.DataFrame(list(settings.items()), columns=['Key', 'Value']))
+    db_paths = get_db_paths(settings['src'])
+    tables = settings['tables']
+    all_df = pd.DataFrame()
+    for db_path in db_paths:
+        df = _read_and_join_tables(db_path, table_names=tables)
+        all_df = pd.concat([all_df, df], axis=0)
+    all_df['cond'] = all_df['col'].apply(map_condition, neg=settings['neg'], pos=settings['pos'], mix=settings['mix'])
+    if settings['exclude_conditions']:
+        if isinstance(settings['exclude_conditions'], str):
+            settings['exclude_conditions'] = [settings['exclude_conditions']]
+        row_count_before = len(all_df)
+        all_df = all_df[~all_df['cond'].isin(settings['exclude_conditions'])]
+        if settings['verbose']:
+            print(f'Excluded {row_count_before - len(all_df)} rows after excluding: {settings["exclude_conditions"]}, rows left: {len(all_df)}')
+    if settings['row_limit'] is not None:
+        # Treat row_limit as an upper bound: sampling more rows than exist would raise
+        all_df = all_df.sample(n=min(settings['row_limit'], len(all_df)), random_state=42)
+    numeric_data = preprocess_data(all_df, settings['filter_by'], settings['remove_highly_correlated'], settings['log_data'], settings['exclude'])
+    # Combine DBSCAN and KMeans parameters; each set is copied so the caller's dictionaries are left untouched
+    clustering_params = []
+    if dbscan_params:
+        clustering_params.extend({**param, 'method': 'dbscan'} for param in dbscan_params)
+    if kmeans_params:
+        clustering_params.extend({**param, 'method': 'kmeans'} for param in kmeans_params)
+    if not clustering_params:
+        raise ValueError("Provide at least one set of dbscan_params or kmeans_params.")
+    print('Testing paramiters:', reduction_params)
+    print('Testing clustering paramiters:', clustering_params)
+    # Calculate the grid size
+    grid_rows = len(reduction_params)
+    grid_cols = len(clustering_params)
+    fig_width = grid_cols*10
+    fig_height = grid_rows*10
+    fig, axs = plt.subplots(grid_rows, grid_cols, figsize=(fig_width, fig_height))
+    # Make sure axs is always an array of axes
+    axs = np.atleast_1d(axs)
+    # Iterate through the Cartesian product of reduction and clustering hyperparameters
+    for i, reduction_param in enumerate(reduction_params):
+        for j, clustering_param in enumerate(clustering_params):
+            # With a single column or a single row axs is one-dimensional, otherwise it is indexed by [row, column]
+            if len(clustering_params) <= 1:
+                axs[i].axis('off')
+                ax = axs[i]
+            elif len(reduction_params) <= 1:
+                axs[j].axis('off')
+                ax = axs[j]
             else:
-                _save_object_counts_to_database(masks, object_type, batch_filenames, count_loc, added_string='_before_filtration')
-                mask_stack = _filter_cp_masks(masks=masks,
-                                              flows=flows,
-                                              filter_size=object_settings['filter_size'],
-                                              filter_intensity=object_settings['filter_intensity'],
-                                              minimum_size=object_settings['minimum_size'],
-                                              maximum_size=object_settings['maximum_size'],
-                                              remove_border_objects=object_settings['remove_border_objects'],
-                                              merge=False,
-                                              batch=batch,
-                                              plot=settings['plot'],
-                                              figuresize=figuresize)
+                ax = axs[i, j]
+            # Perform dimensionality reduction and clustering
+            if settings['reduction_method'].lower() == 'umap':
+                n_neighbors = reduction_param.get('n_neighbors', 15)
+                if isinstance(n_neighbors, float):
+                    n_neighbors = int(n_neighbors * len(numeric_data))
+                min_dist = reduction_param.get('min_dist', 0.1)
+                embedding, labels = search_reduction_and_clustering(numeric_data, n_neighbors, min_dist, settings['metric'],
+                                                                    clustering_param.get('eps', 0.5), clustering_param.get('min_samples', 5),
+                                                                    clustering_param['method'], settings['reduction_method'], settings['verbose'], reduction_param, n_jobs=settings['n_jobs'])
-                _save_object_counts_to_database(mask_stack, object_type, batch_filenames, count_loc, added_string='_after_filtration')
+            elif settings['reduction_method'].lower() == 'tsne':
+                perplexity = reduction_param.get('perplexity', 30)
-            if not np.any(mask_stack):
-                average_obj_size = 0
-            else:
-                average_obj_size = _get_avg_object_size(mask_stack)
+                if isinstance(perplexity, float):
+                    perplexity = int(perplexity * len(numeric_data))
-            average_sizes.append(average_obj_size)
-            overall_average_size = np.mean(average_sizes) if len(average_sizes) > 0 else 0
+                embedding, labels = search_reduction_and_clustering(numeric_data, perplexity, 0.1, settings['metric'],
+                                                                    clustering_param.get('eps', 0.5), clustering_param.get('min_samples', 5),
+                                                                    clustering_param['method'], settings['reduction_method'], settings['verbose'], reduction_param, n_jobs=settings['n_jobs'])
+            else:
+                raise ValueError(f"Unsupported reduction method: {settings['reduction_method']}. Supported methods are 'UMAP' and 'tSNE'")
+            # Plot the results
+            if settings['color_by']:
+                unique_groups = all_df[settings['color_by']].unique()
+                colors = generate_colors(len(unique_groups), False)
+                for group, color in zip(unique_groups, colors):
+                    indices = all_df[settings['color_by']] == group
+                    ax.scatter(embedding[indices, 0], embedding[indices, 1], s=pointsize, label=f"{group}", color=color)
+            else:
+                unique_labels = np.unique(labels)
+                colors = generate_colors(len(unique_labels), False)
+                for label, color in zip(unique_labels, colors):
+                    ax.scatter(embedding[labels == label, 0], embedding[labels == label, 1], s=pointsize, label=f"Cluster {label}", color=color)
+            ax.set_title(f"{settings['reduction_method']} {reduction_param}\n{clustering_param['method']} {clustering_param}")
+            ax.legend()
+    plt.tight_layout()
+    if save:
+        # settings['src'] is handed unchanged to get_db_paths and may be a list of directories; save next to the first one in that case
+        src_root = settings['src'][0] if isinstance(settings['src'], (list, tuple)) else settings['src']
+        results_dir = os.path.join(src_root, 'results')
+        os.makedirs(results_dir, exist_ok=True)
+        plt.savefig(os.path.join(results_dir, 'hyperparameter_search.pdf'))
+    else:
+        plt.show()
-        stop = time.time()
-        duration = (stop - start)
-        time_ls.append(duration)
-        average_time = np.mean(time_ls) if len(time_ls) > 0 else 0
-        time_in_min = average_time/60
-        time_per_mask = average_time/batch_size
-        print(f'Processing {len(paths)}  files with {batch_size} imgs: {(file_index+1)*(batch_size+1)}/{(len(paths))*(batch_size+1)}: Time/batch {time_in_min:.3f} min: Time/mask {time_per_mask:.3f}sec: {object_type} size: {overall_average_size:.3f} px2')
-        if not timelapse:
-            if settings['plot']:
-                plot_masks(batch, mask_stack, flows, figuresize=figuresize, cmap='inferno', nr=batch_size)
-        if settings['save']:
-            for mask_index, mask in enumerate(mask_stack):
-                output_filename = os.path.join(output_folder, batch_filenames[mask_index])
-                np.save(output_filename, mask)
-            mask_stack = []
-            batch_filenames = []
-        gc.collect()
-    torch.cuda.empty_cache()
     return

spacr 0.0.2__py3-none-any.whl → 0.0.6__py3-none-any.whl

spacr 0.0.2py3-none-any.whl → 0.0.6py3-none-any.whl