spacr 0.0.71__py3-none-any.whl → 0.0.80__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
spacr/utils.py CHANGED
@@ -43,6 +43,7 @@ from scipy.stats import fisher_exact
 from scipy.ndimage.filters import gaussian_filter
 from scipy.spatial import ConvexHull
 from scipy.interpolate import splprep, splev
+from scipy.ndimage import binary_dilation
 
 from sklearn.preprocessing import StandardScaler
 from skimage.exposure import rescale_intensity
@@ -55,6 +56,8 @@ from sklearn.preprocessing import StandardScaler
 from sklearn.cluster import DBSCAN
 from sklearn.cluster import KMeans
 from sklearn.manifold import TSNE
+from sklearn.cluster import KMeans
+from sklearn.decomposition import PCA
 
 import umap.umap_ as umap
 
@@ -62,6 +65,12 @@ from torchvision import models
 from torchvision.models.resnet import ResNet18_Weights, ResNet34_Weights, ResNet50_Weights, ResNet101_Weights, ResNet152_Weights
 import torchvision.transforms as transforms
 
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.preprocessing import StandardScaler
+from scipy.stats import f_oneway, kruskal
+from sklearn.cluster import KMeans
+from scipy import stats
+
 from .logger import log_function_call
 
 def check_mask_folder(src,mask_fldr):
@@ -529,48 +538,12 @@ def _annotate_conditions(df, cells=['HeLa'], cell_loc=None, pathogens=['rh'], pa
     df['condition'] = df['condition'].apply(lambda x: x if x else 'none')
     return df
 
-def normalize_to_dtype(array, p1=2, p2=98):
-    """
-    Normalize each image in the stack to its own percentiles.
-
-    Parameters:
-    - array: numpy array
-      The input stack to be normalized.
-    - p1: int, optional
-      The lower percentile value for normalization. Default is 2.
-    - p2: int, optional
-      The upper percentile value for normalization. Default is 98.
-
-    Returns:
-    - new_stack: numpy array
-      The normalized stack with the same shape as the input stack.
-    """
-    nimg = array.shape[2]
-    new_stack = np.empty_like(array)
-
-    for i in range(nimg):
-        img = array[:, :, i]
-        non_zero_img = img[img > 0]
-
-        if non_zero_img.size > 0:
-            img_min = np.percentile(non_zero_img, p1)
-            img_max = np.percentile(non_zero_img, p2)
-        else:
-            img_min = img.min()
-            img_max = img.max()
-
-        # Determine output range based on dtype
-        if np.issubdtype(array.dtype, np.integer):
-            out_range = (0, np.iinfo(array.dtype).max)
-        else:
-            out_range = (0.0, 1.0)
-
-        img = rescale_intensity(img, in_range=(img_min, img_max), out_range=out_range).astype(array.dtype)
-        new_stack[:, :, i] = img
-
-    return new_stack
+def is_list_of_lists(var):
+    if isinstance(var, list) and all(isinstance(i, list) for i in var):
+        return True
+    return False
 
-def normalize_to_dtype(array, p1=2, p2=98):
+def normalize_to_dtype(array, p1=2, p2=98, percentile_list=None):
     """
     Normalize each image in the stack to its own percentiles.
 
@@ -581,29 +554,40 @@ def normalize_to_dtype(array, p1=2, p2=98):
       The lower percentile value for normalization. Default is 2.
     - p2: int, optional
      The upper percentile value for normalization. Default is 98.
-
+    - percentile_list: list, optional
+      A list of pre-calculated [min, max] percentiles for each image in the stack. Default is None.
+
     Returns:
     - new_stack: numpy array
      The normalized stack with the same shape as the input stack.
     """
+
+    out_range = (0, np.iinfo(array.dtype).max)  # assumes an integer dtype; np.iinfo raises for float arrays
     nimg = array.shape[2]
-    new_stack = np.empty_like(array, dtype=np.float32)
+    new_stack = np.empty_like(array, dtype=array.dtype)
 
     for i in range(nimg):
         img = array[:, :, i]
         non_zero_img = img[img > 0]
-
-        if non_zero_img.size > 0:
-            img_min = np.percentile(non_zero_img, p1)
-            img_max = np.percentile(non_zero_img, p2)
+        if percentile_list is not None:
+            img_min, img_max = percentile_list[i]
+        elif non_zero_img.size > 0:
+            img_min = np.percentile(non_zero_img, p1)
+            img_max = np.percentile(non_zero_img, p2)
         else:
-            img_min = img.min()
-            img_max = img.max()
+            img_min = np.percentile(img, p1)
+            img_max = np.percentile(img, p2)
 
-        # Normalize to the range (0, 1) for visualization
-        img = rescale_intensity(img, in_range=(img_min, img_max), out_range=(0.0, 1.0))
+        # Rescale to the full output range of the array's dtype
+        img = rescale_intensity(img, in_range=(img_min, img_max), out_range=out_range)
         new_stack[:, :, i] = img
-
     return new_stack
 
 def _list_endpoint_subdirectories(base_dir):
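
For orientation, the reworked `normalize_to_dtype` can now reuse percentiles computed once up front (e.g. by `_get_percentiles`, updated in the next hunk) instead of recomputing them on every call. A minimal sketch of the intended pairing, assuming the function is importable from `spacr.utils` and given an integer-typed `(H, W, C)` stack; the array itself is illustrative only:

```python
import numpy as np
from spacr.utils import normalize_to_dtype  # assumes spacr >= 0.0.80

# Toy 3-channel uint16 stack (H, W, C), for illustration only
stack = np.random.randint(0, 65535, size=(256, 256, 3), dtype=np.uint16)

# Pre-computed [min, max] percentile pairs, one per channel, in the
# same format that _get_percentiles returns
percentiles = [[np.percentile(stack[:, :, i], 2),
                np.percentile(stack[:, :, i], 98)] for i in range(stack.shape[2])]

normalized = normalize_to_dtype(stack, percentile_list=percentiles)
assert normalized.dtype == stack.dtype  # 0.0.80 keeps the input dtype
```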
@@ -866,7 +850,7 @@ def _check_integrity(df):
     df['label_list'] = df['label_list'].astype(str)
     return df
 
-def _get_percentiles(array, q1=2, q2=98):
+def _get_percentiles(array, p1=2, p2=98):
     """
     Calculate the percentiles of each image in the given array.
 
@@ -889,15 +873,16 @@ def _get_percentiles(array, q1=2, q2=98):
         img = np.squeeze(array[:, :, v])
         non_zero_img = img[img > 0]
         if non_zero_img.size > 0:  # check if there are non-zero values
-            img_min = np.percentile(non_zero_img, q1)
-            img_max = np.percentile(non_zero_img, q2)
+            img_min = np.percentile(non_zero_img, p1)
+            img_max = np.percentile(non_zero_img, p2)
             percentiles.append([img_min, img_max])
-        else:  # if there are no non-zero values, just use the image as it is
-            img_min, img_max = img.min(), img.max()
+        else:  # if there are no non-zero values, fall back to percentiles of the full image
+            img_min = np.percentile(img, p1)
+            img_max = np.percentile(img, p2)
             percentiles.append([img_min, img_max])
     return percentiles
 
-def _crop_center(img, cell_mask, new_width, new_height, normalize=(2,98)):
+def _crop_center(img, cell_mask, new_width, new_height):
     """
     Crop the image around the center of the cell mask.
 
@@ -910,8 +895,6 @@ def _crop_center(img, cell_mask, new_width, new_height, normalize=(2,98)):
      The desired width of the cropped image.
     - new_height: int
      The desired height of the cropped image.
-    - normalize: tuple, optional
-      The normalization range for the image pixel values. Default is (2, 98).
 
     Returns:
     - img: numpy.ndarray
@@ -921,19 +904,22 @@ def _crop_center(img, cell_mask, new_width, new_height, normalize=(2,98)):
     cell_mask[cell_mask != 0] = 1
     mask_3d = np.repeat(cell_mask[:, :, np.newaxis], img.shape[2], axis=2).astype(img.dtype)  # Create 3D mask
     img = np.multiply(img, mask_3d).astype(img.dtype)  # Multiply image with mask to zero pixel values outside of the mask
-    #centroid = np.round(ndi.measurements.center_of_mass(cell_mask)).astype(int) # Compute centroid of the mask
     centroid = np.round(ndi.center_of_mass(cell_mask)).astype(int)  # Compute centroid of the mask
+
     # Pad the image and mask to ensure the crop will not go out of bounds
     pad_width = max(new_width, new_height)
     img = np.pad(img, ((pad_width, pad_width), (pad_width, pad_width), (0, 0)), mode='constant')
     cell_mask = np.pad(cell_mask, ((pad_width, pad_width), (pad_width, pad_width)), mode='constant')
+
     # Update centroid coordinates due to padding
     centroid += pad_width
+
     # Compute bounding box
     start_y = max(0, centroid[0] - new_height // 2)
     end_y = min(start_y + new_height, img.shape[0])
     start_x = max(0, centroid[1] - new_width // 2)
     end_x = min(start_x + new_width, img.shape[1])
+
     # Crop to bounding box
     img = img[start_y:end_y, start_x:end_x, :]
     return img
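
With the `normalize` argument gone, `_crop_center` now only masks and crops. A small sketch of its behavior on toy arrays (shapes and label values are illustrative; the function is private, so importing it directly is for demonstration only):

```python
import numpy as np
from spacr.utils import _crop_center  # private helper; imported only to demonstrate

img = np.ones((100, 100, 2), dtype=np.uint16)     # toy 2-channel image
cell_mask = np.zeros((100, 100), dtype=np.uint16)
cell_mask[40:60, 40:60] = 7                       # one square "cell"

crop = _crop_center(img, cell_mask, new_width=32, new_height=32)
print(crop.shape)  # (32, 32, 2): a window centered on the mask centroid,
                   # with pixels outside the mask zeroed
```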
@@ -3434,105 +3420,6 @@ def reduction_and_clustering(numeric_data, n_neighbors, min_dist, metric, eps, m
 
     return embedding, labels, reducer
 
-def reduction_and_clustering_v1(numeric_data, n_neighbors, min_dist, metric, eps, min_samples, clustering, reduction_method='umap', verbose=False, embedding=None, n_jobs=-1):
-    """
-    Perform dimensionality reduction and clustering on the given data.
-
-    Parameters:
-    numeric_data (np.ndarray): Numeric data for embedding and clustering.
-    n_neighbors (int or float): Number of neighbors for UMAP or perplexity for t-SNE.
-    min_dist (float): Minimum distance for UMAP.
-    metric (str): Metric for UMAP and DBSCAN.
-    eps (float): Epsilon for DBSCAN.
-    min_samples (int): Minimum samples for DBSCAN or number of clusters for KMeans.
-    clustering (str): Clustering method ('DBSCAN' or 'KMeans').
-    reduction_method (str): Dimensionality reduction method ('UMAP' or 'tSNE').
-    verbose (bool): Whether to print verbose output.
-    embedding (np.ndarray, optional): Precomputed embedding. Default is None.
-
-    Returns:
-    tuple: embedding, labels
-    """
-
-    if verbose:
-        v=1
-    else:
-        v=0
-
-    if isinstance(n_neighbors, float):
-        n_neighbors = int(n_neighbors * len(numeric_data))
-
-    if n_neighbors <= 2:
-        n_neighbors = 2
-
-    if reduction_method == 'umap':
-        reducer = umap.UMAP(n_neighbors=n_neighbors,
-                            n_components=2,
-                            metric=metric,
-                            n_epochs=None,
-                            learning_rate=1.0,
-                            init='spectral',
-                            min_dist=min_dist,
-                            spread=1.0,
-                            set_op_mix_ratio=1.0,
-                            local_connectivity=1,
-                            repulsion_strength=1.0,
-                            negative_sample_rate=5,
-                            transform_queue_size=4.0,
-                            a=None,
-                            b=None,
-                            random_state=42,
-                            metric_kwds=None,
-                            angular_rp_forest=False,
-                            target_n_neighbors=-1,
-                            target_metric='categorical',
-                            target_metric_kwds=None,
-                            target_weight=0.5,
-                            transform_seed=42,
-                            n_jobs=n_jobs,
-                            verbose=verbose)
-
-    elif reduction_method == 'tsne':
-
-        #tsne_params.setdefault('n_components', 2)
-        #reducer = TSNE(**tsne_params)
-
-        reducer = TSNE(n_components=2,
-                       perplexity=n_neighbors,
-                       early_exaggeration=12.0,
-                       learning_rate=200.0,
-                       n_iter=1000,
-                       n_iter_without_progress=300,
-                       min_grad_norm=1e-7,
-                       metric=metric,
-                       init='random',
-                       verbose=v,
-                       random_state=42,
-                       method='barnes_hut',
-                       angle=0.5,
-                       n_jobs=n_jobs)
-
-    else:
-        raise ValueError(f"Unsupported reduction method: {reduction_method}. Supported methods are 'umap' and 'tsne'")
-
-    if embedding is None:
-        embedding = reducer.fit_transform(numeric_data)
-
-    if clustering == 'dbscan':
-        clustering_model = DBSCAN(eps=eps, min_samples=min_samples, metric=metric, n_jobs=n_jobs)
-    elif clustering == 'kmeans':
-        clustering_model = KMeans(n_clusters=min_samples, random_state=42)
-    else:
-        raise ValueError(f"Unsupported clustering method: {clustering}. Supported methods are 'dbscan' and 'kmeans'")
-
-    clustering_model.fit(embedding)
-    labels = clustering_model.labels_ if clustering == 'dbscan' else clustering_model.predict(embedding)
-
-    if verbose:
-        print(f'Embedding shape: {embedding.shape}')
-
-    return embedding, labels
-
 def remove_noise(embedding, labels):
     non_noise_indices = labels != -1
     embedding = embedding[non_noise_indices]
@@ -3744,30 +3631,6 @@ def correct_paths(df, base_path):
     image_paths = df['png_path'].to_list()
     return df, image_paths
 
-def correct_paths_v1(df, base_path):
-    if 'png_path' not in df.columns:
-        print("No 'png_path' column found in the dataframe.")
-        return df, None
-
-    image_paths = df['png_path'].to_list()
-
-    adjusted_image_paths = []
-    for path in image_paths:
-        if base_path not in path:
-            print(f"Adjusting path: {path}")
-            parts = path.split('data/')
-            if len(parts) > 1:
-                new_path = os.path.join(base_path, 'data', parts[1])
-                adjusted_image_paths.append(new_path)
-            else:
-                adjusted_image_paths.append(path)
-        else:
-            adjusted_image_paths.append(path)
-
-    df['png_path'] = adjusted_image_paths
-    image_paths = df['png_path'].to_list()
-    return df, image_paths
-
 def get_umap_image_settings(settings={}):
     settings.setdefault('src', 'path')
     settings.setdefault('row_limit', 1000)
@@ -3814,6 +3677,110 @@ def get_umap_image_settings(settings={}):
     settings.setdefault('verbose',True)
     return settings
 
+def get_measure_crop_settings(settings):
+
+    # Test mode
+    settings.setdefault('test_mode', False)
+    settings.setdefault('test_nr', 10)
+
+    # Measurement settings
+    settings.setdefault('save_measurements',True)
+    settings.setdefault('radial_dist', True)
+    settings.setdefault('calculate_correlation', True)
+    settings.setdefault('manders_thresholds', [15,85,95])
+    settings.setdefault('homogeneity', True)
+    settings.setdefault('homogeneity_distances', [8,16,32])
+
+    # Cropping settings
+    settings.setdefault('save_arrays', False)
+    settings.setdefault('save_png',True)
+    settings.setdefault('use_bounding_box',False)
+    settings.setdefault('png_size',[224,224])
+    settings.setdefault('png_dims',[0,1,2])
+    settings.setdefault('normalize',False)
+    settings.setdefault('normalize_by','png')
+    settings.setdefault('crop_mode',['cell'])
+    settings.setdefault('dialate_pngs', False)
+    settings.setdefault('dialate_png_ratios', [0.2])
+
+    # Timelapse settings
+    settings.setdefault('timelapse', False)
+    settings.setdefault('timelapse_objects', 'cell')
+
+    # Operational settings
+    settings.setdefault('plot',False)
+    settings.setdefault('plot_filtration',False)
+    settings.setdefault('representative_images', False)
+    settings.setdefault('max_workers', os.cpu_count()-2)
+
+    # Object settings
+    settings.setdefault('cell_mask_dim',None)
+    settings.setdefault('nucleus_mask_dim',None)
+    settings.setdefault('pathogen_mask_dim',None)
+    settings.setdefault('cytoplasm',False)
+    settings.setdefault('include_uninfected',True)
+    settings.setdefault('cell_min_size',0)
+    settings.setdefault('nucleus_min_size',0)
+    settings.setdefault('pathogen_min_size',0)
+    settings.setdefault('cytoplasm_min_size',0)
+    settings.setdefault('merge_edge_pathogen_cells', True)
+
+    # Miscellaneous settings
+    settings.setdefault('experiment', 'exp')
+    settings.setdefault('cells', 'HeLa')
+    settings.setdefault('cell_loc', None)
+    settings.setdefault('pathogens', ['ME49Dku80WT', 'ME49Dku80dgra8:GRA8', 'ME49Dku80dgra8', 'ME49Dku80TKO'])
+    settings.setdefault('pathogen_loc', [['c1', 'c2', 'c3', 'c4', 'c5', 'c6'], ['c7', 'c8', 'c9', 'c10', 'c11', 'c12'], ['c13', 'c14', 'c15', 'c16', 'c17', 'c18'], ['c19', 'c20', 'c21', 'c22', 'c23', 'c24']])
+    settings.setdefault('treatments', ['BR1', 'BR2', 'BR3'])
+    settings.setdefault('treatment_loc', [['c1', 'c2', 'c7', 'c8', 'c13', 'c14', 'c19', 'c20'], ['c3', 'c4', 'c9', 'c10', 'c15', 'c16', 'c21', 'c22'], ['c5', 'c6', 'c11', 'c12', 'c17', 'c18', 'c23', 'c24']])
+    settings.setdefault('channel_of_interest', 2)
+    settings.setdefault('compartments', ['pathogen', 'cytoplasm'])
+    settings.setdefault('measurement', 'mean_intensity')
+    settings.setdefault('nr_imgs', 32)
+    settings.setdefault('um_per_pixel', 0.1)
+
+    if settings['test_mode']:
+        settings['plot'] = True
+        settings['plot_filtration'] = True
+        test_imgs = settings['test_nr']
+        print(f'Test mode enabled with {test_imgs} images, plotting set to True')
+
+    return settings
+
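Since `get_measure_crop_settings` only fills in missing keys via `setdefault`, a caller can pass a minimal dict and inherit every other default. A sketch of a typical call (the keys shown are an assumed usage, and the path is hypothetical):

```python
from spacr.utils import get_measure_crop_settings  # assumes spacr >= 0.0.80

settings = {
    'input_folder': '/path/to/merged',  # hypothetical path
    'cell_mask_dim': 4,
    'test_mode': True,  # also forces 'plot' and 'plot_filtration' to True
}
settings = get_measure_crop_settings(settings)
print(settings['png_size'], settings['crop_mode'])  # [224, 224] ['cell']
```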
+def delete_folder(folder_path):
+    if os.path.exists(folder_path) and os.path.isdir(folder_path):
+        for root, dirs, files in os.walk(folder_path, topdown=False):
+            for name in files:
+                os.remove(os.path.join(root, name))
+            for name in dirs:
+                os.rmdir(os.path.join(root, name))
+        os.rmdir(folder_path)
+        print(f"Folder '{folder_path}' has been deleted.")
+    else:
+        print(f"Folder '{folder_path}' does not exist or is not a directory.")
+
+def measure_test_mode(settings):
+
+    if settings['test_mode']:
+        if os.path.basename(settings['input_folder']) != 'test':
+            all_files = os.listdir(settings['input_folder'])
+            random_files = random.sample(all_files, settings['test_nr'])
+
+            src = os.path.join(os.path.dirname(settings['input_folder']),'test', 'merged')
+            if os.path.exists(src):
+                delete_folder(src)
+            os.makedirs(src, exist_ok=True)
+
+            for file in random_files:
+                shutil.copy(os.path.join(settings['input_folder'], file), os.path.join(src,file))
+
+            settings['input_folder'] = src
+            print(f'Changed source folder to {src} for test mode')
+        else:
+            print(f'Test mode enabled, using source folder {settings["input_folder"]}')
+
+    return settings
+
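`measure_test_mode` stages a disposable subset next to the input folder so settings can be validated without a full run. Roughly, with hypothetical paths:

```python
from spacr.utils import measure_test_mode  # assumes spacr >= 0.0.80

settings = {'input_folder': '/data/exp1/merged', 'test_mode': True, 'test_nr': 10}
settings = measure_test_mode(settings)
# settings['input_folder'] now points at /data/exp1/test/merged, which holds
# 10 files randomly sampled (and copied) from the original folder
```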
 def preprocess_data(df, filter_by, remove_highly_correlated, log_data, exclude):
     """
     Preprocesses the given dataframe by applying filtering, removing highly correlated columns,
@@ -4003,6 +3970,326 @@ def search_reduction_and_clustering(numeric_data, n_neighbors, min_dist, metric,
     if verbose:
         print(f'Embedding shape: {embedding.shape}')
     return embedding, labels
+import torch
+import torchvision.transforms as transforms
+from torchvision.models import resnet50
+from PIL import Image
+import numpy as np
+import umap
+import pandas as pd
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.preprocessing import StandardScaler
+from scipy.stats import f_oneway, kruskal
+from sklearn.cluster import KMeans
+from scipy import stats
+
+def load_image(image_path):
+    """Load and preprocess an image."""
+    transform = transforms.Compose([
+        transforms.Resize((224, 224)),
+        transforms.ToTensor(),
+        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+    ])
+    image = Image.open(image_path).convert('RGB')
+    image = transform(image).unsqueeze(0)
+    return image
+
+def extract_features(image_paths, resnet=resnet50):
+    """Extract features from images using a pre-trained ResNet model."""
+    model = resnet(pretrained=True)
+    model = model.eval()
+    model = torch.nn.Sequential(*list(model.children())[:-1])  # Remove the last classification layer
+
+    features = []
+    for image_path in image_paths:
+        image = load_image(image_path)
+        with torch.no_grad():
+            feature = model(image).squeeze().numpy()
+        features.append(feature)
 
+    return np.array(features)
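
A sketch of the feature-extraction path these helpers add: each image is resized, normalized with the standard ImageNet statistics, and reduced to the 2048-dimensional output of ResNet-50's penultimate layer (the file names below are placeholders; the first call downloads the pretrained weights):

```python
from spacr.utils import extract_features  # assumes spacr >= 0.0.80

image_paths = ['cell_001.png', 'cell_002.png']  # placeholder paths
features = extract_features(image_paths)        # resnet50 by default
print(features.shape)                           # (2, 2048) for ResNet-50
```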
 
+def check_normality(series):
+    """Helper function to check if a feature is normally distributed."""
+    k2, p = stats.normaltest(series)
+    alpha = 0.05
+    if p < alpha:  # null hypothesis: x comes from a normal distribution
+        return False
+    return True
+
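`check_normality` is a thin wrapper around scipy's D'Agostino-Pearson `normaltest`, answering at a fixed alpha of 0.05; note the test needs at least 8 observations. A quick illustration on synthetic data (the outcomes are probabilistic, hence "likely"):

```python
import numpy as np
import pandas as pd
from spacr.utils import check_normality  # assumes spacr >= 0.0.80

rng = np.random.default_rng(0)
print(check_normality(pd.Series(rng.normal(size=500))))       # likely True
print(check_normality(pd.Series(rng.exponential(size=500))))  # likely False
```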
+def random_forest_feature_importance(all_df, cluster_col='cluster'):
+    """Random Forest feature importance."""
+    numeric_features = all_df.select_dtypes(include=[np.number]).columns.tolist()
+    if cluster_col in numeric_features:
+        numeric_features.remove(cluster_col)
+
+    X = all_df[numeric_features]
+    y = all_df[cluster_col]
+
+    scaler = StandardScaler()
+    X_scaled = scaler.fit_transform(X)
+
+    model = RandomForestClassifier(n_estimators=100, random_state=42)
+    model.fit(X_scaled, y)
+
+    feature_importances = model.feature_importances_
+
+    importance_df = pd.DataFrame({
+        'Feature': numeric_features,
+        'Importance': feature_importances
+    }).sort_values(by='Importance', ascending=False)
+
+    return importance_df
+
+def perform_statistical_tests(all_df, cluster_col='cluster'):
+    """Perform ANOVA or Kruskal-Wallis tests depending on normality of features."""
+    numeric_features = all_df.select_dtypes(include=[np.number]).columns.tolist()
+    if cluster_col in numeric_features:
+        numeric_features.remove(cluster_col)
+
+    anova_results = []
+    kruskal_results = []
+
+    for feature in numeric_features:
+        groups = [all_df[all_df[cluster_col] == label][feature] for label in np.unique(all_df[cluster_col])]
+
+        if check_normality(all_df[feature]):
+            stat, p = f_oneway(*groups)
+            anova_results.append((feature, stat, p))
+        else:
+            stat, p = kruskal(*groups)
+            kruskal_results.append((feature, stat, p))
+
+    anova_df = pd.DataFrame(anova_results, columns=['Feature', 'ANOVA_Statistic', 'ANOVA_pValue'])
+    kruskal_df = pd.DataFrame(kruskal_results, columns=['Feature', 'Kruskal_Statistic', 'Kruskal_pValue'])
+
+    return anova_df, kruskal_df
+
+def combine_results(rf_df, anova_df, kruskal_df):
+    """Combine the results into a single DataFrame."""
+    combined_df = rf_df.merge(anova_df, on='Feature', how='left')
+    combined_df = combined_df.merge(kruskal_df, on='Feature', how='left')
+    return combined_df
+
+def cluster_feature_analysis(all_df, cluster_col='cluster'):
+    """
+    Perform Random Forest feature importance, ANOVA for normally distributed features,
+    and Kruskal-Wallis for non-normally distributed features. Combine results into a single DataFrame.
+    """
+    rf_df = random_forest_feature_importance(all_df, cluster_col)
+    anova_df, kruskal_df = perform_statistical_tests(all_df, cluster_col)
+    combined_df = combine_results(rf_df, anova_df, kruskal_df)
+    return combined_df
+
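Taken together, `cluster_feature_analysis` returns one row per numeric feature, combining a random-forest importance with either an ANOVA or a Kruskal-Wallis p-value depending on which normality branch the feature took; whichever test a feature did not take is left as NaN. A sketch on a toy dataframe (column names are illustrative):

```python
import numpy as np
import pandas as pd
from spacr.utils import cluster_feature_analysis  # assumes spacr >= 0.0.80

rng = np.random.default_rng(42)
toy = pd.DataFrame({
    'area': rng.normal(100, 10, 300),
    'mean_intensity': rng.normal(0.5, 0.1, 300),
    'cluster': rng.integers(0, 3, 300),  # three hypothetical clusters
})
results = cluster_feature_analysis(toy, cluster_col='cluster')
print(results.columns.tolist())
# ['Feature', 'Importance', 'ANOVA_Statistic', 'ANOVA_pValue',
#  'Kruskal_Statistic', 'Kruskal_pValue']
```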
+def _merge_cells_based_on_parasite_overlap(parasite_mask, cell_mask, nuclei_mask, overlap_threshold=5, perimeter_threshold=30):
+    """
+    Merge cells in cell_mask if a parasite in parasite_mask overlaps with more than one cell,
+    and if cells share more than a specified perimeter percentage.
 
+    Args:
+        parasite_mask (ndarray): Mask of parasites.
+        cell_mask (ndarray): Mask of cells.
+        nuclei_mask (ndarray): Mask of nuclei.
+        overlap_threshold (float): The percentage threshold for merging cells based on parasite overlap.
+        perimeter_threshold (float): The percentage threshold for merging cells based on shared perimeter.
+
+    Returns:
+        ndarray: The modified cell mask (cell_mask) with unique labels.
+    """
+    labeled_cells = label(cell_mask)
+    labeled_parasites = label(parasite_mask)
+    labeled_nuclei = label(nuclei_mask)
+    num_parasites = np.max(labeled_parasites)
+    num_cells = np.max(labeled_cells)
+    num_nuclei = np.max(labeled_nuclei)
+
+    # Merge cells based on parasite overlap
+    for parasite_id in range(1, num_parasites + 1):
+        current_parasite_mask = labeled_parasites == parasite_id
+        overlapping_cell_labels = np.unique(labeled_cells[current_parasite_mask])
+        overlapping_cell_labels = overlapping_cell_labels[overlapping_cell_labels != 0]
+        if len(overlapping_cell_labels) > 1:
+
+            # Calculate the overlap percentages
+            overlap_percentages = [
+                np.sum(current_parasite_mask & (labeled_cells == cell_label)) / np.sum(current_parasite_mask) * 100
+                for cell_label in overlapping_cell_labels
+            ]
+            # Merge cells if overlap percentage is above the threshold
+            for cell_label, overlap_percentage in zip(overlapping_cell_labels, overlap_percentages):
+                if overlap_percentage > overlap_threshold:
+                    first_label = overlapping_cell_labels[0]
+                    for other_label in overlapping_cell_labels[1:]:
+                        if other_label != first_label:
+                            cell_mask[cell_mask == other_label] = first_label
+
+    # Merge cells based on nucleus overlap
+    for nucleus_id in range(1, num_nuclei + 1):
+        current_nucleus_mask = labeled_nuclei == nucleus_id
+        overlapping_cell_labels = np.unique(labeled_cells[current_nucleus_mask])
+        overlapping_cell_labels = overlapping_cell_labels[overlapping_cell_labels != 0]
+        if len(overlapping_cell_labels) > 1:
+
+            # Calculate the overlap percentages
+            overlap_percentages = [
+                np.sum(current_nucleus_mask & (labeled_cells == cell_label)) / np.sum(current_nucleus_mask) * 100
+                for cell_label in overlapping_cell_labels
+            ]
+            # Merge cells if overlap percentage is above the threshold for each cell
+            if all(overlap_percentage > overlap_threshold for overlap_percentage in overlap_percentages):
+                first_label = overlapping_cell_labels[0]
+                for other_label in overlapping_cell_labels[1:]:
+                    if other_label != first_label:
+                        cell_mask[cell_mask == other_label] = first_label
+
+    # Check for cells without nuclei and merge based on shared perimeter
+    labeled_cells = label(cell_mask)  # Re-label after merging based on overlap
+    cell_regions = regionprops(labeled_cells)
+    for region in cell_regions:
+        cell_label = region.label
+        cell_mask_binary = labeled_cells == cell_label
+        overlapping_nuclei = np.unique(nuclei_mask[cell_mask_binary])
+        overlapping_nuclei = overlapping_nuclei[overlapping_nuclei != 0]
+
+        if len(overlapping_nuclei) == 0:
+
+            # Cell does not overlap with any nucleus
+            perimeter = region.perimeter
+
+            # Dilate the cell to find neighbors
+            dilated_cell = binary_dilation(cell_mask_binary, structure=square(3))
+            neighbor_cells = np.unique(labeled_cells[dilated_cell])
+            neighbor_cells = neighbor_cells[(neighbor_cells != 0) & (neighbor_cells != cell_label)]
+
+            # Calculate shared border length with neighboring cells
+            shared_borders = [
+                np.sum((labeled_cells == neighbor_label) & dilated_cell) for neighbor_label in neighbor_cells
+            ]
+            shared_border_percentages = [shared_border / perimeter * 100 for shared_border in shared_borders]
+
+            # Merge with the neighbor cell with the largest shared border percentage above the threshold
+            if shared_borders:
+                max_shared_border_index = np.argmax(shared_border_percentages)
+                max_shared_border_percentage = shared_border_percentages[max_shared_border_index]
+                if max_shared_border_percentage > perimeter_threshold:
+                    cell_mask[labeled_cells == cell_label] = neighbor_cells[max_shared_border_index]
+
+    # Relabel the merged cell mask
+    relabeled_cell_mask, _ = label(cell_mask, return_num=True)
+    return relabeled_cell_mask
+
+def adjust_cell_masks(parasite_folder, cell_folder, nuclei_folder, overlap_threshold=5, perimeter_threshold=30):
+    """
+    Process all npy files in the given folders. Merge and relabel cells in cell masks
+    based on parasite overlap and cell perimeter sharing conditions.
+
+    Args:
+        parasite_folder (str): Path to the folder containing parasite masks.
+        cell_folder (str): Path to the folder containing cell masks.
+        nuclei_folder (str): Path to the folder containing nuclei masks.
+        overlap_threshold (float): The percentage threshold for merging cells based on parasite overlap.
+        perimeter_threshold (float): The percentage threshold for merging cells based on shared perimeter.
+    """
+
+    parasite_files = sorted([f for f in os.listdir(parasite_folder) if f.endswith('.npy')])
+    cell_files = sorted([f for f in os.listdir(cell_folder) if f.endswith('.npy')])
+    nuclei_files = sorted([f for f in os.listdir(nuclei_folder) if f.endswith('.npy')])
+
+    # Ensure there are matching files in all folders
+    if not (len(parasite_files) == len(cell_files) == len(nuclei_files)):
+        raise ValueError("The number of files in the folders does not match.")
+
+    # Match files by name
+    for file_name in parasite_files:
+        parasite_path = os.path.join(parasite_folder, file_name)
+        cell_path = os.path.join(cell_folder, file_name)
+        nuclei_path = os.path.join(nuclei_folder, file_name)
+        # Check if the corresponding cell and nuclei mask files exist
+        if not (os.path.exists(cell_path) and os.path.exists(nuclei_path)):
+            raise ValueError(f"Corresponding cell or nuclei mask file for {file_name} not found.")
+        # Load the masks
+        parasite_mask = np.load(parasite_path)
+        cell_mask = np.load(cell_path)
+        nuclei_mask = np.load(nuclei_path)
+        # Merge and relabel cells
+        merged_cell_mask = _merge_cells_based_on_parasite_overlap(parasite_mask, cell_mask, nuclei_mask, overlap_threshold, perimeter_threshold)
+
+        # Force 16-bit before saving
+        merged_cell_mask = merged_cell_mask.astype(np.uint16)
+
+        # Overwrite the original cell mask file with the merged result
+        np.save(cell_path, merged_cell_mask)
+
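`adjust_cell_masks` drives the merge over whole folders of `.npy` masks, matching files by name and overwriting the cell masks in place, so it is worth backing up the cell-mask folder first. A sketch of a call with hypothetical paths:

```python
from spacr.utils import adjust_cell_masks  # assumes spacr >= 0.0.80

adjust_cell_masks(
    parasite_folder='/data/exp1/masks/pathogen',  # hypothetical paths
    cell_folder='/data/exp1/masks/cell',          # overwritten in place
    nuclei_folder='/data/exp1/masks/nuclei',
    overlap_threshold=5,
    perimeter_threshold=30,
)
```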
+def process_masks(mask_folder, image_folder, channel, batch_size=50, n_clusters=2, plot=False):
+
+    def read_files_in_batches(folder, batch_size=50):
+        files = [f for f in os.listdir(folder) if f.endswith('.npy')]
+        files.sort()  # Sort to ensure matching order
+        for i in range(0, len(files), batch_size):
+            yield files[i:i + batch_size]
+
+    def measure_morphology_and_intensity(mask, image):
+        properties = measure.regionprops(mask, intensity_image=image)
+        properties_list = [{'area': p.area, 'mean_intensity': p.mean_intensity, 'perimeter': p.perimeter, 'eccentricity': p.eccentricity} for p in properties]
+        return properties_list
+
+    def cluster_objects(properties, n_clusters=2):
+        data = np.array([[p['area'], p['mean_intensity'], p['perimeter'], p['eccentricity']] for p in properties])
+        kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(data)
+        return kmeans
+
+    def remove_objects_not_in_largest_cluster(mask, labels, largest_cluster_label):
+        cleaned_mask = np.zeros_like(mask)
+        for region in measure.regionprops(mask):
+            if labels[region.label - 1] == largest_cluster_label:
+                cleaned_mask[mask == region.label] = region.label
+        return cleaned_mask
+
+    def plot_clusters(properties, labels):
+        data = np.array([[p['area'], p['mean_intensity'], p['perimeter'], p['eccentricity']] for p in properties])
+        pca = PCA(n_components=2)
+        data_2d = pca.fit_transform(data)
+        plt.scatter(data_2d[:, 0], data_2d[:, 1], c=labels, cmap='viridis')
+        plt.xlabel('PCA Component 1')
+        plt.ylabel('PCA Component 2')
+        plt.title('Object Clustering')
+        plt.show()
+
+    all_properties = []
+
+    # Step 1: Accumulate properties over all files
+    for batch in read_files_in_batches(mask_folder, batch_size):
+        mask_files = [os.path.join(mask_folder, file) for file in batch]
+        image_files = [os.path.join(image_folder, file) for file in batch]
+
+        masks = [np.load(file) for file in mask_files]
+        images = [np.load(file)[:, :, channel] for file in image_files]
+
+        for i, mask in enumerate(masks):
+            image = images[i]
+            # Measure morphology and intensity
+            properties = measure_morphology_and_intensity(mask, image)
+            all_properties.extend(properties)
+
+    # Step 2: Perform clustering on accumulated properties
+    kmeans = cluster_objects(all_properties, n_clusters)
+    labels = kmeans.labels_
+
+    if plot:
+        # Step 3: Plot clusters using PCA
+        plot_clusters(all_properties, labels)
+
+    # Step 4: Remove objects not in the largest cluster and overwrite files in batches
+    label_index = 0
+    for batch in read_files_in_batches(mask_folder, batch_size):
+        mask_files = [os.path.join(mask_folder, file) for file in batch]
+        masks = [np.load(file) for file in mask_files]
+
+        for i, mask in enumerate(masks):
+            # The mask doubles as its own intensity image here; only the per-file
+            # region count is needed to slice the global label vector
+            batch_properties = measure_morphology_and_intensity(mask, mask)
+            batch_labels = labels[label_index:label_index + len(batch_properties)]
+            largest_cluster_label = np.bincount(batch_labels).argmax()
+            cleaned_mask = remove_objects_not_in_largest_cluster(mask, batch_labels, largest_cluster_label)
+            np.save(mask_files[i], cleaned_mask)
+            label_index += len(batch_properties)
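
Finally, `process_masks` clusters every object across all mask files on four regionprops measurements (area, mean intensity, perimeter, eccentricity), then keeps only each file's most common cluster and overwrites the `.npy` masks. A sketch of a call (the paths and channel index are assumptions):

```python
from spacr.utils import process_masks  # assumes spacr >= 0.0.80

process_masks(
    mask_folder='/data/exp1/masks/cell',  # hypothetical paths; files are overwritten
    image_folder='/data/exp1/merged',
    channel=2,      # channel sliced from each image stack for mean_intensity
    batch_size=50,
    n_clusters=2,
    plot=True,      # show the PCA scatter of the clustered objects
)
```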