spacr 0.0.70__py3-none-any.whl → 0.0.80__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
spacr/utils.py CHANGED
@@ -43,6 +43,7 @@ from scipy.stats import fisher_exact
43
43
  from scipy.ndimage.filters import gaussian_filter
44
44
  from scipy.spatial import ConvexHull
45
45
  from scipy.interpolate import splprep, splev
46
+ from scipy.ndimage import binary_dilation
46
47
 
47
48
  from sklearn.preprocessing import StandardScaler
48
49
  from skimage.exposure import rescale_intensity
@@ -55,6 +56,8 @@ from sklearn.preprocessing import StandardScaler
55
56
  from sklearn.cluster import DBSCAN
56
57
  from sklearn.cluster import KMeans
57
58
  from sklearn.manifold import TSNE
59
+ from sklearn.cluster import KMeans
60
+ from sklearn.decomposition import PCA
58
61
 
59
62
  import umap.umap_ as umap
60
63
 
@@ -62,6 +65,12 @@ from torchvision import models
62
65
  from torchvision.models.resnet import ResNet18_Weights, ResNet34_Weights, ResNet50_Weights, ResNet101_Weights, ResNet152_Weights
63
66
  import torchvision.transforms as transforms
64
67
 
68
+ from sklearn.ensemble import RandomForestClassifier
69
+ from sklearn.preprocessing import StandardScaler
70
+ from scipy.stats import f_oneway, kruskal
71
+ from sklearn.cluster import KMeans
72
+ from scipy import stats
73
+
65
74
  from .logger import log_function_call
66
75
 
67
76
  def check_mask_folder(src,mask_fldr):
@@ -370,12 +379,10 @@ def mask_object_count(mask):
370
379
  Counts the number of objects in a given mask.
371
380
 
372
381
  Parameters:
373
- - mask: numpy.ndarray
374
- The mask containing object labels.
382
+ - mask: numpy.ndarray. The mask containing object labels.
375
383
 
376
384
  Returns:
377
- - int
378
- The number of objects in the mask.
385
+ - int. The number of objects in the mask.
379
386
  """
380
387
  unique_labels = np.unique(mask)
381
388
  num_objects = len(unique_labels[unique_labels!=0])
@@ -531,81 +538,56 @@ def _annotate_conditions(df, cells=['HeLa'], cell_loc=None, pathogens=['rh'], pa
531
538
  df['condition'] = df['condition'].apply(lambda x: x if x else 'none')
532
539
  return df
533
540
 
534
- def normalize_to_dtype(array, p1=2, p2=98):
535
- """
536
- Normalize each image in the stack to its own percentiles.
537
-
538
- Parameters:
539
- - array: numpy array
540
- The input stack to be normalized.
541
- - p1: int, optional
542
- The lower percentile value for normalization. Default is 2.
543
- - p2: int, optional
544
- The upper percentile value for normalization. Default is 98.
545
-
546
- Returns:
547
- - new_stack: numpy array
548
- The normalized stack with the same shape as the input stack.
549
- """
550
- nimg = array.shape[2]
551
- new_stack = np.empty_like(array)
552
-
553
- for i in range(nimg):
554
- img = array[:, :, i]
555
- non_zero_img = img[img > 0]
556
-
557
- if non_zero_img.size > 0:
558
- img_min = np.percentile(non_zero_img, p1)
559
- img_max = np.percentile(non_zero_img, p2)
560
- else:
561
- img_min = img.min()
562
- img_max = img.max()
563
-
564
- # Determine output range based on dtype
565
- if np.issubdtype(array.dtype, np.integer):
566
- out_range = (0, np.iinfo(array.dtype).max)
567
- else:
568
- out_range = (0.0, 1.0)
569
-
570
- img = rescale_intensity(img, in_range=(img_min, img_max), out_range=out_range).astype(array.dtype)
571
- new_stack[:, :, i] = img
572
-
573
- return new_stack
541
def is_list_of_lists(var):
    """
    Check whether a value is a list whose elements are all lists.

    Parameters:
    - var: any. The value to inspect.

    Returns:
    - bool. True when var is a list and every element of it is a list
    (an empty list therefore also yields True); False otherwise.
    """
    return isinstance(var, list) and all(isinstance(item, list) for item in var)
574
545
 
575
def normalize_to_dtype(array, p1=2, p2=98, percentile_list=None):
    """
    Normalize each image in the stack to its own percentiles.

    Parameters:
    - array: numpy array
    The input stack to be normalized. Images are taken along the third axis.
    - p1: int, optional
    The lower percentile value for normalization. Default is 2.
    - p2: int, optional
    The upper percentile value for normalization. Default is 98.
    - percentile_list: list, optional
    A list of pre-calculated [lower, upper] intensity bounds for each image in the stack.
    When given, p1 and p2 are ignored. Default is None.

    Returns:
    - new_stack: numpy array
    The normalized stack with the same shape and dtype as the input stack.
    """
    # Integer stacks are stretched over the full range of their dtype. Float
    # dtypes have no np.iinfo (it raises ValueError), so they are rescaled to
    # 0.0-1.0, as the earlier dtype-aware version of this function did.
    if np.issubdtype(array.dtype, np.integer):
        out_range = (0, np.iinfo(array.dtype).max)
    else:
        out_range = (0.0, 1.0)

    new_stack = np.empty_like(array)

    for i in range(array.shape[2]):
        img = array[:, :, i]

        if percentile_list is not None:
            # Caller supplied the intensity bounds for this image.
            img_min = percentile_list[i][0]
            img_max = percentile_list[i][1]
        else:
            # Ignore zero pixels when estimating the bounds, unless the image
            # has no positive pixels at all.
            non_zero_img = img[img > 0]
            reference = non_zero_img if non_zero_img.size > 0 else img
            img_min = np.percentile(reference, p1)
            img_max = np.percentile(reference, p2)

        # Stretch [img_min, img_max] onto the output range; assignment into
        # new_stack casts the result back to the stack's dtype.
        new_stack[:, :, i] = rescale_intensity(img, in_range=(img_min, img_max), out_range=out_range)

    return new_stack
610
592
 
611
593
  def _list_endpoint_subdirectories(base_dir):
@@ -868,7 +850,7 @@ def _check_integrity(df):
868
850
  df['label_list'] = df['label_list'].astype(str)
869
851
  return df
870
852
 
871
- def _get_percentiles(array, q1=2, q2=98):
853
+ def _get_percentiles(array, p1=2, p2=98):
872
854
  """
873
855
  Calculate the percentiles of each image in the given array.
874
856
 
@@ -891,15 +873,16 @@ def _get_percentiles(array, q1=2, q2=98):
891
873
  img = np.squeeze(array[:, :, v])
892
874
  non_zero_img = img[img > 0]
893
875
  if non_zero_img.size > 0: # check if there are non-zero values
894
- img_min = np.percentile(non_zero_img, q1) # change percentile from 0.02 to 2
895
- img_max = np.percentile(non_zero_img, q2) # change percentile from 0.98 to 98
876
+ img_min = np.percentile(non_zero_img, p1) # change percentile from 0.02 to 2
877
+ img_max = np.percentile(non_zero_img, p2) # change percentile from 0.98 to 98
896
878
  percentiles.append([img_min, img_max])
897
879
  else: # if there are no non-zero values, just use the image as it is
898
- img_min, img_max = img.min(), img.max()
880
+ img_min = np.percentile(img, p1) # change percentile from 0.02 to 2
881
+ img_max = np.percentile(img, p2) # change percentile from 0.98 to 98
899
882
  percentiles.append([img_min, img_max])
900
883
  return percentiles
901
884
 
902
- def _crop_center(img, cell_mask, new_width, new_height, normalize=(2,98)):
885
+ def _crop_center(img, cell_mask, new_width, new_height):
903
886
  """
904
887
  Crop the image around the center of the cell mask.
905
888
 
@@ -912,8 +895,6 @@ def _crop_center(img, cell_mask, new_width, new_height, normalize=(2,98)):
912
895
  The desired width of the cropped image.
913
896
  - new_height: int
914
897
  The desired height of the cropped image.
915
- - normalize: tuple, optional
916
- The normalization range for the image pixel values. Default is (2, 98).
917
898
 
918
899
  Returns:
919
900
  - img: numpy.ndarray
@@ -923,19 +904,22 @@ def _crop_center(img, cell_mask, new_width, new_height, normalize=(2,98)):
923
904
  cell_mask[cell_mask != 0] = 1
924
905
  mask_3d = np.repeat(cell_mask[:, :, np.newaxis], img.shape[2], axis=2).astype(img.dtype) # Create 3D mask
925
906
  img = np.multiply(img, mask_3d).astype(img.dtype) # Multiply image with mask to set pixel values outside of the mask to 0
926
- #centroid = np.round(ndi.measurements.center_of_mass(cell_mask)).astype(int) # Compute centroid of the mask
927
907
  centroid = np.round(ndi.center_of_mass(cell_mask)).astype(int) # Compute centroid of the mask
908
+
928
909
  # Pad the image and mask to ensure the crop will not go out of bounds
929
910
  pad_width = max(new_width, new_height)
930
911
  img = np.pad(img, ((pad_width, pad_width), (pad_width, pad_width), (0, 0)), mode='constant')
931
912
  cell_mask = np.pad(cell_mask, ((pad_width, pad_width), (pad_width, pad_width)), mode='constant')
913
+
932
914
  # Update centroid coordinates due to padding
933
915
  centroid += pad_width
916
+
934
917
  # Compute bounding box
935
918
  start_y = max(0, centroid[0] - new_height // 2)
936
919
  end_y = min(start_y + new_height, img.shape[0])
937
920
  start_x = max(0, centroid[1] - new_width // 2)
938
921
  end_x = min(start_x + new_width, img.shape[1])
922
+
939
923
  # Crop to bounding box
940
924
  img = img[start_y:end_y, start_x:end_x, :]
941
925
  return img
@@ -1485,52 +1469,18 @@ class SpatialAttention(nn.Module):
1485
1469
 
1486
1470
  # Multi-Scale Block with Attention
1487
1471
class MultiScaleBlockWithAttention(nn.Module):
    """
    A 3x3 convolution followed by a 1x1 convolution named spatial_attention.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        # kernel 3 with padding 1 and dilation 1 keeps the spatial size unchanged
        self.dilated_conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, dilation=1, padding=1)
        self.spatial_attention = nn.Conv2d(out_channels, out_channels, kernel_size=1)

    def custom_forward(self, x):
        """Apply the convolution, an in-place ReLU, then the 1x1 convolution."""
        activated = F.relu(self.dilated_conv1(x), inplace=True)
        return self.spatial_attention(activated)

    def forward(self, x):
        """Run custom_forward directly on x."""
        return self.custom_forward(x)
1534
1484
 
1535
1485
  # Final Classifier
1536
1486
  class CustomCellClassifier(nn.Module):
@@ -2258,25 +2208,6 @@ def MLR(merged_df, refine_model):
2258
2208
 
2259
2209
  return max_effects, max_effects_pvalues, model, df
2260
2210
 
2261
- #def normalize_to_dtype(array, q1=2, q2=98, percentiles=None):
2262
- # if len(array.shape) == 2:
2263
- # array = np.expand_dims(array, axis=-1)
2264
- # num_channels = array.shape[-1]
2265
- # new_stack = np.empty_like(array)
2266
- # for channel in range(num_channels):
2267
- # img = array[..., channel]
2268
- # non_zero_img = img[img > 0]
2269
- # if non_zero_img.size > 0:
2270
- # img_min = np.percentile(non_zero_img, q1)
2271
- # img_max = np.percentile(non_zero_img, q2)
2272
- # else:
2273
- # img_min, img_max = (percentiles[channel] if percentiles and channel < len(percentiles)
2274
- # else (img.min(), img.max()))
2275
- # new_stack[..., channel] = rescale_intensity(img, in_range=(img_min, img_max), out_range='dtype')
2276
- # if new_stack.shape[-1] == 1:
2277
- # new_stack = np.squeeze(new_stack, axis=-1)
2278
- # return new_stack
2279
-
2280
2211
  def get_files_from_dir(dir_path, file_extension="*"):
2281
2212
  return glob(os.path.join(dir_path, file_extension))
2282
2213
 
@@ -3489,105 +3420,6 @@ def reduction_and_clustering(numeric_data, n_neighbors, min_dist, metric, eps, m
3489
3420
 
3490
3421
  return embedding, labels, reducer
3491
3422
 
3492
- def reduction_and_clustering_v1(numeric_data, n_neighbors, min_dist, metric, eps, min_samples, clustering, reduction_method='umap', verbose=False, embedding=None, n_jobs=-1):
3493
- """
3494
- Perform dimensionality reduction and clustering on the given data.
3495
-
3496
- Parameters:
3497
- numeric_data (np.ndarray): Numeric data for embedding and clustering.
3498
- n_neighbors (int or float): Number of neighbors for UMAP or perplexity for t-SNE.
3499
- min_dist (float): Minimum distance for UMAP.
3500
- metric (str): Metric for UMAP and DBSCAN.
3501
- eps (float): Epsilon for DBSCAN.
3502
- min_samples (int): Minimum samples for DBSCAN or number of clusters for KMeans.
3503
- clustering (str): Clustering method ('DBSCAN' or 'KMeans').
3504
- reduction_method (str): Dimensionality reduction method ('UMAP' or 'tSNE').
3505
- verbose (bool): Whether to print verbose output.
3506
- embedding (np.ndarray, optional): Precomputed embedding. Default is None.
3507
-
3508
- Returns:
3509
- tuple: embedding, labels
3510
- """
3511
-
3512
- if verbose:
3513
- v=1
3514
- else:
3515
- v=0
3516
-
3517
- if isinstance(n_neighbors, float):
3518
- n_neighbors = int(n_neighbors * len(numeric_data))
3519
-
3520
- if n_neighbors <= 2:
3521
- n_neighbors = 2
3522
-
3523
- if reduction_method == 'umap':
3524
- reducer = umap.UMAP(n_neighbors=n_neighbors,
3525
- n_components=2,
3526
- metric=metric,
3527
- n_epochs=None,
3528
- learning_rate=1.0,
3529
- init='spectral',
3530
- min_dist=min_dist,
3531
- spread=1.0,
3532
- set_op_mix_ratio=1.0,
3533
- local_connectivity=1,
3534
- repulsion_strength=1.0,
3535
- negative_sample_rate=5,
3536
- transform_queue_size=4.0,
3537
- a=None,
3538
- b=None,
3539
- random_state=42,
3540
- metric_kwds=None,
3541
- angular_rp_forest=False,
3542
- target_n_neighbors=-1,
3543
- target_metric='categorical',
3544
- target_metric_kwds=None,
3545
- target_weight=0.5,
3546
- transform_seed=42,
3547
- n_jobs=n_jobs,
3548
- verbose=verbose)
3549
-
3550
- elif reduction_method == 'tsne':
3551
-
3552
- #tsne_params.setdefault('n_components', 2)
3553
- #reducer = TSNE(**tsne_params)
3554
-
3555
- reducer = TSNE(n_components=2,
3556
- perplexity=n_neighbors,
3557
- early_exaggeration=12.0,
3558
- learning_rate=200.0,
3559
- n_iter=1000,
3560
- n_iter_without_progress=300,
3561
- min_grad_norm=1e-7,
3562
- metric=metric,
3563
- init='random',
3564
- verbose=v,
3565
- random_state=42,
3566
- method='barnes_hut',
3567
- angle=0.5,
3568
- n_jobs=n_jobs)
3569
-
3570
- else:
3571
- raise ValueError(f"Unsupported reduction method: {reduction_method}. Supported methods are 'umap' and 'tsne'")
3572
-
3573
- if embedding is None:
3574
- embedding = reducer.fit_transform(numeric_data)
3575
-
3576
- if clustering == 'dbscan':
3577
- clustering_model = DBSCAN(eps=eps, min_samples=min_samples, metric=metric, n_jobs=n_jobs)
3578
- elif clustering == 'kmeans':
3579
- clustering_model = KMeans(n_clusters=min_samples, random_state=42)
3580
- else:
3581
- raise ValueError(f"Unsupported clustering method: {clustering}. Supported methods are 'dbscan' and 'kmeans'")
3582
-
3583
- clustering_model.fit(embedding)
3584
- labels = clustering_model.labels_ if clustering == 'dbscan' else clustering_model.predict(embedding)
3585
-
3586
- if verbose:
3587
- print(f'Embedding shape: {embedding.shape}')
3588
-
3589
- return embedding, labels
3590
-
3591
3423
  def remove_noise(embedding, labels):
3592
3424
  non_noise_indices = labels != -1
3593
3425
  embedding = embedding[non_noise_indices]
@@ -3799,30 +3631,6 @@ def correct_paths(df, base_path):
3799
3631
  image_paths = df['png_path'].to_list()
3800
3632
  return df, image_paths
3801
3633
 
3802
- def correct_paths_v1(df, base_path):
3803
- if 'png_path' not in df.columns:
3804
- print("No 'png_path' column found in the dataframe.")
3805
- return df, None
3806
-
3807
- image_paths = df['png_path'].to_list()
3808
-
3809
- adjusted_image_paths = []
3810
- for path in image_paths:
3811
- if base_path not in path:
3812
- print(f"Adjusting path: {path}")
3813
- parts = path.split('data/')
3814
- if len(parts) > 1:
3815
- new_path = os.path.join(base_path, 'data', parts[1])
3816
- adjusted_image_paths.append(new_path)
3817
- else:
3818
- adjusted_image_paths.append(path)
3819
- else:
3820
- adjusted_image_paths.append(path)
3821
-
3822
- df['png_path'] = adjusted_image_paths
3823
- image_paths = df['png_path'].to_list()
3824
- return df, image_paths
3825
-
3826
3634
  def get_umap_image_settings(settings={}):
3827
3635
  settings.setdefault('src', 'path')
3828
3636
  settings.setdefault('row_limit', 1000)
@@ -3869,25 +3677,129 @@ def get_umap_image_settings(settings={}):
3869
3677
  settings.setdefault('verbose',True)
3870
3678
  return settings
3871
3679
 
3680
def get_measure_crop_settings(settings):
    """
    Fill in default values for the measure/crop settings.

    Keys already present in settings are left untouched; missing keys are
    added in place. When 'test_mode' is true, 'plot' and 'plot_filtration'
    are forced to True and a notice is printed.

    Parameters:
    - settings: dict. The user supplied settings; modified in place.

    Returns:
    - settings: dict. The same dictionary with the defaults filled in.
    """
    # os.cpu_count() may return None, and on machines with one or two cores
    # "cpu_count() - 2" would be zero or negative, so keep at least one worker.
    default_workers = max(1, (os.cpu_count() or 1) - 2)

    defaults = {
        # Test mode
        'test_mode': False,
        'test_nr': 10,

        # Measurement settings
        'save_measurements': True,
        'radial_dist': True,
        'calculate_correlation': True,
        'manders_thresholds': [15,85,95],
        'homogeneity': True,
        'homogeneity_distances': [8,16,32],

        # Cropping settings
        'save_arrays': False,
        'save_png': True,
        'use_bounding_box': False,
        'png_size': [224,224],
        'png_dims': [0,1,2],
        'normalize': False,
        'normalize_by': 'png',
        'crop_mode': ['cell'],
        'dialate_pngs': False,
        'dialate_png_ratios': [0.2],

        # Timelapse settings
        'timelapse': False,
        'timelapse_objects': 'cell',

        # Operational settings
        'plot': False,
        'plot_filtration': False,
        'representative_images': False,
        'max_workers': default_workers,

        # Object settings
        'cell_mask_dim': None,
        'nucleus_mask_dim': None,
        'pathogen_mask_dim': None,
        'cytoplasm': False,
        'include_uninfected': True,
        'cell_min_size': 0,
        'nucleus_min_size': 0,
        'pathogen_min_size': 0,
        'cytoplasm_min_size': 0,
        'merge_edge_pathogen_cells': True,

        # Miscellaneous settings
        'experiment': 'exp',
        'cells': 'HeLa',
        'cell_loc': None,
        'pathogens': ['ME49Dku80WT', 'ME49Dku80dgra8:GRA8', 'ME49Dku80dgra8', 'ME49Dku80TKO'],
        'pathogen_loc': [['c1', 'c2', 'c3', 'c4', 'c5', 'c6'], ['c7', 'c8', 'c9', 'c10', 'c11', 'c12'], ['c13', 'c14', 'c15', 'c16', 'c17', 'c18'], ['c19', 'c20', 'c21', 'c22', 'c23', 'c24']],
        'treatments': ['BR1', 'BR2', 'BR3'],
        'treatment_loc': [['c1', 'c2', 'c7', 'c8', 'c13', 'c14', 'c19', 'c20'], ['c3', 'c4', 'c9', 'c10', 'c15', 'c16', 'c21', 'c22'], ['c5', 'c6', 'c11', 'c12', 'c17', 'c18', 'c23', 'c24']],
        'channel_of_interest': 2,
        'compartments': ['pathogen', 'cytoplasm'],
        'measurement': 'mean_intensity',
        'nr_imgs': 32,
        'um_per_pixel': 0.1,
    }

    # The dict literal is rebuilt on every call, so list defaults are never
    # shared between two settings dictionaries.
    for key, value in defaults.items():
        settings.setdefault(key, value)

    if settings['test_mode']:
        settings['plot'] = True
        settings['plot_filtration'] = True
        test_imgs = settings['test_nr']
        print(f'Test mode enabled with {test_imgs} images, plotting set to True')

    return settings
3749
+
3750
def delete_folder(folder_path):
    """
    Delete a folder and everything inside it, reporting the outcome on stdout.

    Parameters:
    - folder_path: str. Path of the folder to delete.

    Returns:
    - None. Nothing is deleted when the path does not exist or is not a directory.
    """
    import shutil

    # os.path.isdir is already False for a missing path.
    if os.path.isdir(folder_path):
        # rmtree unlinks symbolic links found inside the tree instead of
        # calling rmdir on them, which would raise and leave the folder
        # partially deleted.
        shutil.rmtree(folder_path)
        print(f"Folder '{folder_path}' has been deleted.")
    else:
        print(f"Folder '{folder_path}' does not exist or is not a directory.")
3761
+
3762
def measure_test_mode(settings):
    """
    Point the settings at a small random subset of the input when test mode is on.

    When settings['test_mode'] is true and the input folder is not itself named
    'test', up to settings['test_nr'] randomly chosen files are copied into
    '<parent of input_folder>/test/merged' (recreated on every call) and
    settings['input_folder'] is changed to that folder.

    Parameters:
    - settings: dict. Must contain 'test_mode' and, in test mode, 'input_folder' and 'test_nr'.
    Modified in place.

    Returns:
    - settings: dict. The same dictionary, possibly with an updated 'input_folder'.
    """
    if not settings['test_mode']:
        return settings

    input_folder = settings['input_folder']

    if os.path.basename(input_folder) == 'test':
        print(f'Test mode enabled, using source folder {settings["input_folder"]}')
        return settings

    # shutil.copy cannot copy directories, so only regular files are candidates.
    all_files = [name for name in os.listdir(input_folder)
                 if os.path.isfile(os.path.join(input_folder, name))]

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the sample size at the number of available files.
    sample_size = min(settings['test_nr'], len(all_files))
    random_files = random.sample(all_files, sample_size)

    src = os.path.join(os.path.dirname(input_folder), 'test', 'merged')
    if os.path.exists(src):
        # Start from an empty folder so earlier test runs do not leak in.
        delete_folder(src)
    os.makedirs(src, exist_ok=True)

    for file in random_files:
        shutil.copy(os.path.join(input_folder, file), os.path.join(src, file))

    settings['input_folder'] = src
    print(f'Changed source folder to {src} for test mode')

    return settings
3783
+
3872
3784
  def preprocess_data(df, filter_by, remove_highly_correlated, log_data, exclude):
3873
3785
  """
3874
3786
  Preprocesses the given dataframe by applying filtering, removing highly correlated columns,
3875
3787
  applying log transformation, filling NaN values, and scaling the numeric data.
3876
3788
 
3877
3789
  Args:
3878
- df (pandas.DataFrame): The input dataframe.
3879
- filter_by (str or None): The channel of interest to filter the dataframe by.
3880
- remove_highly_correlated (bool or float): Whether to remove highly correlated columns.
3881
- If a float is provided, it represents the correlation threshold.
3882
- log_data (bool): Whether to apply log transformation to the numeric data.
3883
- exclude (list or None): List of features to exclude from the filtering process.
3884
- verbose (bool): Whether to print verbose output during preprocessing.
3790
+ df (pandas.DataFrame): The input dataframe.
3791
+ filter_by (str or None): The channel of interest to filter the dataframe by.
3792
+ remove_highly_correlated (bool or float): Whether to remove highly correlated columns.
3793
+ If a float is provided, it represents the correlation threshold.
3794
+ log_data (bool): Whether to apply log transformation to the numeric data.
3795
+ exclude (list or None): List of features to exclude from the filtering process.
3796
+ verbose (bool): Whether to print verbose output during preprocessing.
3885
3797
 
3886
3798
  Returns:
3887
- numpy.ndarray: The preprocessed numeric data.
3799
+ numpy.ndarray: The preprocessed numeric data.
3888
3800
 
3889
3801
  Raises:
3890
- ValueError: If no numeric columns are available after filtering.
3802
+ ValueError: If no numeric columns are available after filtering.
3891
3803
 
3892
3804
  """
3893
3805
  # Apply filtering based on the `filter_by` parameter
@@ -3927,13 +3839,8 @@ def filter_dataframe_features(df, channel_of_interest, exclude=None):
3927
3839
 
3928
3840
  Parameters:
3929
3841
  - df (pandas.DataFrame): The input dataframe to be filtered.
3930
- - channel_of_interest (str, int, list, None): The channel(s) of interest to filter the dataframe.
3931
- If None, no filtering is applied. If 'morphology', only morphology features are included.
3932
- If an integer, only the specified channel is included. If a list, only the specified channels are included.
3933
- If a string, only the specified channel is included.
3934
- - exclude (str, list, None): The feature(s) to exclude from the filtered dataframe.
3935
- If None, no features are excluded. If a string, the specified feature is excluded.
3936
- If a list, the specified features are excluded.
3842
+ - channel_of_interest (str, int, list, None): The channel(s) of interest to filter the dataframe. If None, no filtering is applied. If 'morphology', only morphology features are included. If an integer, only the specified channel is included. If a list, only the specified channels are included. If a string, only the specified channel is included.
3843
+ - exclude (str, list, None): The feature(s) to exclude from the filtered dataframe. If None, no features are excluded. If a string, the specified feature is excluded. If a list, the specified features are excluded.
3937
3844
 
3938
3845
  Returns:
3939
3846
  - filtered_df (pandas.DataFrame): The filtered dataframe based on the specified parameters.
@@ -4063,6 +3970,326 @@ def search_reduction_and_clustering(numeric_data, n_neighbors, min_dist, metric,
4063
3970
  if verbose:
4064
3971
  print(f'Embedding shape: {embedding.shape}')
4065
3972
  return embedding, labels
3973
+ import torch
3974
+ import torchvision.transforms as transforms
3975
+ from torchvision.models import resnet50
3976
+ from PIL import Image
3977
+ import numpy as np
3978
+ import umap
3979
+ import pandas as pd
3980
+ from sklearn.ensemble import RandomForestClassifier
3981
+ from sklearn.preprocessing import StandardScaler
3982
+ from scipy.stats import f_oneway, kruskal
3983
+ from sklearn.cluster import KMeans
3984
+ from scipy import stats
3985
+
3986
def load_image(image_path):
    """Load an image file and turn it into a normalized single-image batch.

    The image is converted to RGB, resized to 224x224, converted to a tensor
    and normalized per channel with the mean/std values below (the values
    commonly used for ImageNet-trained torchvision models).

    Args:
        image_path (str): Path of the image file to open.

    Returns:
        torch.Tensor: The transformed image with an extra leading batch
        dimension added by ``unsqueeze(0)``.
    """
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    # Image.open is lazy and keeps the underlying file open; convert() reads
    # the pixel data into a new image, so the file can be closed right after.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert('RGB')
    return transform(image).unsqueeze(0)
3996
+
3997
def extract_features(image_paths, resnet=resnet50):
    """Compute one feature vector per image with a pre-trained ResNet.

    Args:
        image_paths (iterable of str): Paths of the images to embed.
        resnet (callable): Model constructor accepting ``pretrained=True``.
            Defaults to ``resnet50``.

    Returns:
        numpy.ndarray: Array built from the per-image feature arrays, in the
        order of ``image_paths``.
    """
    backbone = resnet(pretrained=True).eval()
    # Keep every child module except the last one (the classification layer).
    feature_layers = list(backbone.children())[:-1]
    model = torch.nn.Sequential(*feature_layers)

    def _embed(path):
        # Inference only: no gradients are needed for feature extraction.
        with torch.no_grad():
            return model(load_image(path)).squeeze().numpy()

    return np.array([_embed(path) for path in image_paths])
4011
+
4012
def check_normality(series):
    """Check whether a feature looks normally distributed.

    Uses ``scipy.stats.normaltest`` at a 5% significance level. Non-finite
    values (NaN/inf) are dropped first; otherwise the test yields a NaN
    p-value, which would fail the ``p < alpha`` comparison and wrongly
    report the feature as normal.

    Args:
        series (array-like): Numeric values of a single feature.

    Returns:
        bool: True if the null hypothesis of normality is not rejected.
        False if it is rejected or if the test cannot give a usable answer
        (fewer than 8 finite values, or an undefined p-value such as for
        constant data).
    """
    alpha = 0.05
    min_samples = 8  # the skewness test behind normaltest needs at least 8 observations

    values = np.asarray(series, dtype=float)
    values = values[np.isfinite(values)]
    if values.size < min_samples:
        return False

    _, p = stats.normaltest(values)
    # Null hypothesis: the values come from a normal distribution.
    return bool(np.isfinite(p) and p >= alpha)
4019
+
4020
def random_forest_feature_importance(all_df, cluster_col='cluster'):
    """Rank numeric features by Random Forest importance for predicting the cluster.

    Args:
        all_df (pandas.DataFrame): Table holding the features and the cluster column.
        cluster_col (str): Name of the column used as the classification target.

    Returns:
        pandas.DataFrame: Columns 'Feature' and 'Importance', sorted by
        descending importance.
    """
    # Every numeric column except the target itself is a candidate feature.
    numeric_columns = all_df.select_dtypes(include=[np.number]).columns
    numeric_features = [name for name in numeric_columns if name != cluster_col]

    target = all_df[cluster_col]
    scaled_features = StandardScaler().fit_transform(all_df[numeric_features])

    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(scaled_features, target)

    ranking = pd.DataFrame({
        'Feature': numeric_features,
        'Importance': forest.feature_importances_,
    })
    return ranking.sort_values(by='Importance', ascending=False)
4043
+
4044
def perform_statistical_tests(all_df, cluster_col='cluster'):
    """Compare every numeric feature across clusters.

    Features that pass ``check_normality`` are tested with one-way ANOVA
    (``f_oneway``); all others with Kruskal-Wallis (``kruskal``).

    Args:
        all_df (pandas.DataFrame): Table holding the features and the cluster column.
        cluster_col (str): Name of the column that defines the groups.

    Returns:
        tuple: ``(anova_df, kruskal_df)`` with columns
        ('Feature', 'ANOVA_Statistic', 'ANOVA_pValue') and
        ('Feature', 'Kruskal_Statistic', 'Kruskal_pValue') respectively.
    """
    numeric_columns = all_df.select_dtypes(include=[np.number]).columns
    numeric_features = [name for name in numeric_columns if name != cluster_col]

    # Group membership is the same for every feature, so build the masks once.
    cluster_values = all_df[cluster_col]
    cluster_masks = [cluster_values == cluster for cluster in np.unique(cluster_values)]

    anova_rows, kruskal_rows = [], []
    for feature in numeric_features:
        column = all_df[feature]
        samples = [column[mask] for mask in cluster_masks]

        if check_normality(column):
            statistic, p_value = f_oneway(*samples)
            anova_rows.append((feature, statistic, p_value))
        else:
            statistic, p_value = kruskal(*samples)
            kruskal_rows.append((feature, statistic, p_value))

    anova_df = pd.DataFrame(anova_rows, columns=['Feature', 'ANOVA_Statistic', 'ANOVA_pValue'])
    kruskal_df = pd.DataFrame(kruskal_rows, columns=['Feature', 'Kruskal_Statistic', 'Kruskal_pValue'])
    return anova_df, kruskal_df
4067
+
4068
def combine_results(rf_df, anova_df, kruskal_df):
    """Left-join the ANOVA and Kruskal-Wallis tables onto the importance table.

    All three tables are matched on the 'Feature' column; every row of
    ``rf_df`` is kept, and statistics missing for a feature come out as NaN.
    """
    combined_df = rf_df
    for test_table in (anova_df, kruskal_df):
        combined_df = combined_df.merge(test_table, on='Feature', how='left')
    return combined_df
4073
+
4074
def cluster_feature_analysis(all_df, cluster_col='cluster'):
    """Summarize how each numeric feature relates to the cluster assignment.

    Runs the Random Forest importance ranking and the per-feature statistical
    tests (ANOVA or Kruskal-Wallis, depending on the normality check), then
    joins everything into one table keyed by feature.

    Args:
        all_df (pandas.DataFrame): Table holding the features and the cluster column.
        cluster_col (str): Name of the cluster column.

    Returns:
        pandas.DataFrame: Importance ranking with the test statistics attached.
    """
    importance_table = random_forest_feature_importance(all_df, cluster_col)
    test_tables = perform_statistical_tests(all_df, cluster_col)
    # test_tables is (anova_df, kruskal_df), matching combine_results' order.
    return combine_results(importance_table, *test_tables)
4083
+
4084
def _merge_cells_based_on_parasite_overlap(parasite_mask, cell_mask, nuclei_mask, overlap_threshold=5, perimeter_threshold=30):
    """
    Merge cells that are bridged by a parasite or a nucleus, then attach
    nucleus-free cells to the neighbor they share the most border with.

    All merging is done on a relabeled working copy, so the label ids that are
    looked up and the label ids that are overwritten always belong to the same
    array, and ``cell_mask`` itself is left unmodified.

    Args:
        parasite_mask (ndarray): Mask of parasites.
        cell_mask (ndarray): Mask of cells.
        nuclei_mask (ndarray): Mask of nuclei.
        overlap_threshold (float): Percentage of an object's area (parasite or nucleus)
            that has to lie inside a cell for the overlap to count.
        perimeter_threshold (float): Minimum shared border, as a percentage of the
            nucleus-free cell's perimeter, required to merge it into a neighbor.

    Returns:
        ndarray: The merged cell mask, relabeled so every connected cell has a unique label.
    """
    merged = label(cell_mask)
    labeled_parasites = label(parasite_mask)
    labeled_nuclei = label(nuclei_mask)

    # Step 1: a parasite spanning several cells merges all of them as soon as
    # at least one of those cells holds more than `overlap_threshold` percent
    # of the parasite.
    for parasite_id in range(1, int(labeled_parasites.max()) + 1):
        cell_labels, percentages = _cell_overlap_percentages(merged, labeled_parasites == parasite_id)
        if len(cell_labels) > 1 and np.any(percentages > overlap_threshold):
            _merge_labels_into_first(merged, cell_labels)

    # Step 2: a nucleus spanning several cells merges them only when every
    # one of those cells holds more than `overlap_threshold` percent of it.
    for nucleus_id in range(1, int(labeled_nuclei.max()) + 1):
        cell_labels, percentages = _cell_overlap_percentages(merged, labeled_nuclei == nucleus_id)
        if len(cell_labels) > 1 and np.all(percentages > overlap_threshold):
            _merge_labels_into_first(merged, cell_labels)

    # Step 3: cells without any nucleus are merged into the neighbor sharing
    # the largest fraction of their perimeter, if above `perimeter_threshold`.
    labeled_cells = label(merged)  # snapshot used for geometry and neighbor lookup
    result = labeled_cells.copy()  # receives the merges
    footprint = np.ones((3, 3), dtype=bool)  # 3x3 neighborhood for the one-pixel dilation

    for region in regionprops(labeled_cells):
        cell_pixels = labeled_cells == region.label
        if np.any(nuclei_mask[cell_pixels]):
            continue  # the cell contains nucleus pixels; leave it alone

        # Pixels of other cells that fall inside the dilated cell approximate the shared border.
        dilated_cell = binary_dilation(cell_pixels, structure=footprint)
        neighbor_labels, border_lengths = np.unique(labeled_cells[dilated_cell], return_counts=True)
        is_neighbor = (neighbor_labels != 0) & (neighbor_labels != region.label)
        neighbor_labels = neighbor_labels[is_neighbor]
        border_lengths = border_lengths[is_neighbor]
        if neighbor_labels.size == 0:
            continue

        # Guard against a zero perimeter (possible for very small regions),
        # which would otherwise divide by zero.
        perimeter = region.perimeter or 1.0
        border_percentages = border_lengths / perimeter * 100
        best = int(np.argmax(border_percentages))
        if border_percentages[best] > perimeter_threshold:
            # Resolve both cells to their current ids in `result` so that
            # chained merges (A into B, later B into C) end up under one label.
            source = result[cell_pixels][0]
            target = result[labeled_cells == neighbor_labels[best]][0]
            if source != target:
                result[result == source] = target

    # Relabel the merged cell mask
    return label(result)


def _cell_overlap_percentages(labeled_cells, object_mask):
    """Return the cell labels under ``object_mask`` and the percentage of the object inside each."""
    cell_labels, pixel_counts = np.unique(labeled_cells[object_mask], return_counts=True)
    inside_cell = cell_labels != 0  # drop background
    return cell_labels[inside_cell], pixel_counts[inside_cell] / object_mask.sum() * 100


def _merge_labels_into_first(labeled_cells, cell_labels):
    """Relabel, in place, every cell in ``cell_labels[1:]`` with ``cell_labels[0]``."""
    labeled_cells[np.isin(labeled_cells, cell_labels[1:])] = cell_labels[0]
4180
+
4181
def adjust_cell_masks(parasite_folder, cell_folder, nuclei_folder, overlap_threshold=5, perimeter_threshold=30):
    """
    Process all npy files in the given folders. Merge and relabel cells in cell masks
    based on parasite overlap and cell perimeter sharing conditions.

    Files are matched by name across the three folders. Each cell mask file is
    overwritten with the merged result, stored as 16-bit unsigned integers.

    Args:
        parasite_folder (str): Path to the folder containing parasite masks.
        cell_folder (str): Path to the folder containing cell masks.
        nuclei_folder (str): Path to the folder containing nuclei masks.
        overlap_threshold (float): The percentage threshold for merging cells based on parasite overlap.
        perimeter_threshold (float): The percentage threshold for merging cells based on shared perimeter.

    Raises:
        ValueError: If the folders hold a different number of .npy files, or if a
            parasite mask has no same-named cell or nuclei mask.
    """

    def _npy_files(folder):
        return sorted(f for f in os.listdir(folder) if f.endswith('.npy'))

    parasite_files = _npy_files(parasite_folder)
    cell_files = _npy_files(cell_folder)
    nuclei_files = _npy_files(nuclei_folder)

    # Ensure there are matching files in all folders
    if not (len(parasite_files) == len(cell_files) == len(nuclei_files)):
        raise ValueError("The number of files in the folders do not match.")

    # Match files by name
    for file_name in parasite_files:
        parasite_path = os.path.join(parasite_folder, file_name)
        cell_path = os.path.join(cell_folder, file_name)
        nuclei_path = os.path.join(nuclei_folder, file_name)

        # Check if the corresponding cell and nuclei mask files exist
        if not (os.path.exists(cell_path) and os.path.exists(nuclei_path)):
            raise ValueError(f"Corresponding cell or nuclei mask file for {file_name} not found.")

        parasite_mask = np.load(parasite_path)
        cell_mask = np.load(cell_path)
        nuclei_mask = np.load(nuclei_path)

        merged_cell_mask = _merge_cells_based_on_parasite_overlap(
            parasite_mask, cell_mask, nuclei_mask, overlap_threshold, perimeter_threshold
        )

        # Force 16 bit. The converted array is what gets saved; previously the
        # cast was assigned to a misspelled name and silently discarded.
        merged_cell_mask = merged_cell_mask.astype(np.uint16)

        # Overwrite the original cell mask file with the merged result
        np.save(cell_path, merged_cell_mask)
4223
+
4224
def process_masks(mask_folder, image_folder, channel, batch_size=50, n_clusters=2, plot=False):
    """
    Cluster the objects of all masks by morphology and intensity, then keep, in
    each mask, only the objects belonging to that mask's most frequent cluster.

    Objects are measured (area, mean intensity, perimeter, eccentricity) over every
    .npy mask in ``mask_folder`` using the same-named image in ``image_folder``,
    clustered together with KMeans, and each mask file is overwritten with the
    cleaned mask.

    Args:
        mask_folder (str): Folder containing the .npy label masks (overwritten in place).
        image_folder (str): Folder containing same-named .npy images.
        channel (int): Index into the last axis of each image used as the intensity image.
        batch_size (int): Number of files loaded at a time.
        n_clusters (int): Number of KMeans clusters.
        plot (bool): If True, show a 2-component PCA scatter plot of the clustered objects.

    Returns:
        None. Returns early, without touching any file, when no objects are found.
    """
    feature_names = ('area', 'mean_intensity', 'perimeter', 'eccentricity')

    def read_files_in_batches(folder, batch_size=50):
        # Sorted so both passes over the folder visit the files in the same order.
        files = sorted(f for f in os.listdir(folder) if f.endswith('.npy'))
        for start in range(0, len(files), batch_size):
            yield files[start:start + batch_size]

    def measure_morphology_and_intensity(mask, image):
        regions = measure.regionprops(mask, intensity_image=image)
        return [{name: getattr(region, name) for name in feature_names} for region in regions]

    def feature_matrix(properties):
        return np.array([[p[name] for name in feature_names] for p in properties])

    def cluster_objects(properties, n_clusters=2):
        return KMeans(n_clusters=n_clusters, random_state=0).fit(feature_matrix(properties))

    def remove_objects_not_in_largest_cluster(mask, regions, region_clusters, largest_cluster_label):
        # Pair regions with clusters by position, not by `label - 1`: label ids
        # need not be consecutive, while regionprops order matches measurement order.
        kept_labels = [
            region.label
            for region, cluster in zip(regions, region_clusters)
            if cluster == largest_cluster_label
        ]
        cleaned_mask = np.zeros_like(mask)
        kept_pixels = np.isin(mask, kept_labels)
        cleaned_mask[kept_pixels] = mask[kept_pixels]
        return cleaned_mask

    def plot_clusters(properties, labels):
        data_2d = PCA(n_components=2).fit_transform(feature_matrix(properties))
        plt.scatter(data_2d[:, 0], data_2d[:, 1], c=labels, cmap='viridis')
        plt.xlabel('PCA Component 1')
        plt.ylabel('PCA Component 2')
        plt.title('Object Clustering')
        plt.show()

    all_properties = []

    # Step 1: Accumulate properties over all files
    for batch in read_files_in_batches(mask_folder, batch_size):
        masks = [np.load(os.path.join(mask_folder, file)) for file in batch]
        images = [np.load(os.path.join(image_folder, file))[:, :, channel] for file in batch]
        for mask, image in zip(masks, images):
            all_properties.extend(measure_morphology_and_intensity(mask, image))

    # Nothing to cluster (no mask files, or only empty masks): leave files untouched.
    if not all_properties:
        return

    # Step 2: Perform clustering on accumulated properties
    labels = cluster_objects(all_properties, n_clusters).labels_

    if plot:
        # Step 3: Plot clusters using PCA
        plot_clusters(all_properties, labels)

    # Step 4: Remove objects not in the largest cluster and overwrite files in batches
    label_index = 0
    for batch in read_files_in_batches(mask_folder, batch_size):
        mask_files = [os.path.join(mask_folder, file) for file in batch]
        masks = [np.load(file) for file in mask_files]

        for mask_file, mask in zip(mask_files, masks):
            regions = measure.regionprops(mask)
            # Slice out the cluster assignments belonging to this mask's objects.
            region_clusters = labels[label_index:label_index + len(regions)]
            label_index += len(regions)
            if not regions:
                continue  # mask without objects: nothing to remove, and bincount would be empty

            # NOTE(review): the "largest" cluster is chosen per mask, not over the
            # whole data set; this matches the previous behavior. Confirm intent.
            largest_cluster_label = np.bincount(region_clusters).argmax()
            cleaned_mask = remove_objects_not_in_largest_cluster(mask, regions, region_clusters, largest_cluster_label)
            np.save(mask_file, cleaned_mask)