spacr 0.0.71__py3-none-any.whl → 0.0.81__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spacr/__init__.py +4 -1
- spacr/__main__.py +0 -7
- spacr/annotate_app.py +74 -58
- spacr/core.py +7 -214
- spacr/io.py +0 -66
- spacr/measure.py +46 -59
- spacr/plot.py +117 -81
- spacr/sequencing.py +508 -491
- spacr/utils.py +469 -182
- {spacr-0.0.71.dist-info → spacr-0.0.81.dist-info}/METADATA +2 -1
- {spacr-0.0.71.dist-info → spacr-0.0.81.dist-info}/RECORD +15 -15
- {spacr-0.0.71.dist-info → spacr-0.0.81.dist-info}/LICENSE +0 -0
- {spacr-0.0.71.dist-info → spacr-0.0.81.dist-info}/WHEEL +0 -0
- {spacr-0.0.71.dist-info → spacr-0.0.81.dist-info}/entry_points.txt +0 -0
- {spacr-0.0.71.dist-info → spacr-0.0.81.dist-info}/top_level.txt +0 -0
spacr/utils.py
CHANGED
@@ -43,6 +43,7 @@ from scipy.stats import fisher_exact
 from scipy.ndimage.filters import gaussian_filter
 from scipy.spatial import ConvexHull
 from scipy.interpolate import splprep, splev
+from scipy.ndimage import binary_dilation
 
 from sklearn.preprocessing import StandardScaler
 from skimage.exposure import rescale_intensity
@@ -55,6 +56,8 @@ from sklearn.preprocessing import StandardScaler
 from sklearn.cluster import DBSCAN
 from sklearn.cluster import KMeans
 from sklearn.manifold import TSNE
+from sklearn.cluster import KMeans
+from sklearn.decomposition import PCA
 
 import umap.umap_ as umap
 
@@ -62,6 +65,12 @@ from torchvision import models
 from torchvision.models.resnet import ResNet18_Weights, ResNet34_Weights, ResNet50_Weights, ResNet101_Weights, ResNet152_Weights
 import torchvision.transforms as transforms
 
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.preprocessing import StandardScaler
+from scipy.stats import f_oneway, kruskal
+from sklearn.cluster import KMeans
+from scipy import stats
+
 from .logger import log_function_call
 
 def check_mask_folder(src,mask_fldr):
@@ -529,48 +538,12 @@ def _annotate_conditions(df, cells=['HeLa'], cell_loc=None, pathogens=['rh'], pa
     df['condition'] = df['condition'].apply(lambda x: x if x else 'none')
     return df
 
-def
-
-
-
-    Parameters:
-    - array: numpy array
-        The input stack to be normalized.
-    - p1: int, optional
-        The lower percentile value for normalization. Default is 2.
-    - p2: int, optional
-        The upper percentile value for normalization. Default is 98.
-
-    Returns:
-    - new_stack: numpy array
-        The normalized stack with the same shape as the input stack.
-    """
-    nimg = array.shape[2]
-    new_stack = np.empty_like(array)
-
-    for i in range(nimg):
-        img = array[:, :, i]
-        non_zero_img = img[img > 0]
-
-        if non_zero_img.size > 0:
-            img_min = np.percentile(non_zero_img, p1)
-            img_max = np.percentile(non_zero_img, p2)
-        else:
-            img_min = img.min()
-            img_max = img.max()
-
-        # Determine output range based on dtype
-        if np.issubdtype(array.dtype, np.integer):
-            out_range = (0, np.iinfo(array.dtype).max)
-        else:
-            out_range = (0.0, 1.0)
-
-        img = rescale_intensity(img, in_range=(img_min, img_max), out_range=out_range).astype(array.dtype)
-        new_stack[:, :, i] = img
-
-    return new_stack
+def is_list_of_lists(var):
+    if isinstance(var, list) and all(isinstance(i, list) for i in var):
+        return True
+    return False
 
-def normalize_to_dtype(array, p1=2, p2=98):
+def normalize_to_dtype(array, p1=2, p2=98, percentile_list=None):
     """
     Normalize each image in the stack to its own percentiles.
 
@@ -581,29 +554,40 @@ def normalize_to_dtype(array, p1=2, p2=98):
         The lower percentile value for normalization. Default is 2.
     - p2: int, optional
         The upper percentile value for normalization. Default is 98.
-
+    - percentile_list: list, optional
+        A list of pre-calculated percentiles for each image in the stack. Default is None.
+
     Returns:
     - new_stack: numpy array
        The normalized stack with the same shape as the input stack.
     """
+
+    out_range = (0, np.iinfo(array.dtype).max)
     nimg = array.shape[2]
-    new_stack = np.empty_like(array, dtype=
+    new_stack = np.empty_like(array, dtype=array.dtype)
 
     for i in range(nimg):
         img = array[:, :, i]
         non_zero_img = img[img > 0]
-
-
-
-
+        if not percentile_list is None:
+            percentiles = percentile_list[i]
+        else:
+            percentile_1 = p1
+            percentile_2 = p2
+        if percentile_list is None:
+            if non_zero_img.size > 0:
+                img_min = np.percentile(non_zero_img, percentile_1)
+                img_max = np.percentile(non_zero_img, percentile_2)
+            else:
+                img_min = np.percentile(img, percentile_1)
+                img_max = np.percentile(img, percentile_2)
         else:
-            img_min =
-            img_max =
+            img_min = percentiles[0]
+            img_max = percentiles[1]
 
         # Normalize to the range (0, 1) for visualization
-        img = rescale_intensity(img, in_range=(img_min, img_max), out_range=
+        img = rescale_intensity(img, in_range=(img_min, img_max), out_range=out_range)
         new_stack[:, :, i] = img
-
     return new_stack
 
 def _list_endpoint_subdirectories(base_dir):
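`normalize_to_dtype` now accepts a `percentile_list` of pre-computed `[min, max]` pairs, one per image in the stack, so percentiles calculated once (for example with `_get_percentiles`) can be reused across normalizations. A minimal sketch of how the two functions can be combined; the synthetic stack and the 2/98 percentiles are illustrative assumptions, not spacr defaults:

```python
import numpy as np
from spacr.utils import _get_percentiles, normalize_to_dtype

# Synthetic 16-bit stack with shape (height, width, n_images)
stack = (np.random.rand(256, 256, 3) * 65535).astype(np.uint16)

# Pre-compute one [min, max] pair per image from the non-zero pixels ...
percentiles = _get_percentiles(stack, p1=2, p2=98)

# ... and reuse them, so repeated normalizations of the same stack stay consistent
normalized = normalize_to_dtype(stack, percentile_list=percentiles)

# Equivalent single call that derives the percentiles internally
normalized_direct = normalize_to_dtype(stack, p1=2, p2=98)
```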
@@ -866,7 +850,7 @@ def _check_integrity(df):
     df['label_list'] = df['label_list'].astype(str)
     return df
 
-def _get_percentiles(array, q1=2, q2=98):
+def _get_percentiles(array, p1=2, p2=98):
     """
     Calculate the percentiles of each image in the given array.
 
@@ -889,15 +873,16 @@ def _get_percentiles(array, q1=2, q2=98):
         img = np.squeeze(array[:, :, v])
         non_zero_img = img[img > 0]
         if non_zero_img.size > 0: # check if there are non-zero values
-            img_min = np.percentile(non_zero_img,
-            img_max = np.percentile(non_zero_img,
+            img_min = np.percentile(non_zero_img, p1) # change percentile from 0.02 to 2
+            img_max = np.percentile(non_zero_img, p2) # change percentile from 0.98 to 98
             percentiles.append([img_min, img_max])
         else: # if there are no non-zero values, just use the image as it is
-            img_min
+            img_min = np.percentile(img, p1) # change percentile from 0.02 to 2
+            img_max = np.percentile(img, p2) # change percentile from 0.98 to 98
             percentiles.append([img_min, img_max])
     return percentiles
 
-def _crop_center(img, cell_mask, new_width, new_height, normalize=(2,98)):
+def _crop_center(img, cell_mask, new_width, new_height):
     """
     Crop the image around the center of the cell mask.
 
@@ -910,8 +895,6 @@ def _crop_center(img, cell_mask, new_width, new_height, normalize=(2,98)):
         The desired width of the cropped image.
     - new_height: int
         The desired height of the cropped image.
-    - normalize: tuple, optional
-        The normalization range for the image pixel values. Default is (2, 98).
 
     Returns:
     - img: numpy.ndarray
@@ -921,19 +904,22 @@ def _crop_center(img, cell_mask, new_width, new_height, normalize=(2,98)):
     cell_mask[cell_mask != 0] = 1
     mask_3d = np.repeat(cell_mask[:, :, np.newaxis], img.shape[2], axis=2).astype(img.dtype) # Create 3D mask
     img = np.multiply(img, mask_3d).astype(img.dtype) # Multiply image with mask to set pixel values outside of the mask to 0
-    #centroid = np.round(ndi.measurements.center_of_mass(cell_mask)).astype(int) # Compute centroid of the mask
     centroid = np.round(ndi.center_of_mass(cell_mask)).astype(int) # Compute centroid of the mask
+
     # Pad the image and mask to ensure the crop will not go out of bounds
     pad_width = max(new_width, new_height)
     img = np.pad(img, ((pad_width, pad_width), (pad_width, pad_width), (0, 0)), mode='constant')
     cell_mask = np.pad(cell_mask, ((pad_width, pad_width), (pad_width, pad_width)), mode='constant')
+
     # Update centroid coordinates due to padding
     centroid += pad_width
+
     # Compute bounding box
     start_y = max(0, centroid[0] - new_height // 2)
     end_y = min(start_y + new_height, img.shape[0])
     start_x = max(0, centroid[1] - new_width // 2)
     end_x = min(start_x + new_width, img.shape[1])
+
     # Crop to bounding box
     img = img[start_y:end_y, start_x:end_x, :]
     return img
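The updated `_crop_center` drops the unused `normalize` argument; the behaviour is otherwise unchanged: pixels outside the cell mask are zeroed, the image is padded by `max(new_width, new_height)` so the crop cannot leave the array, and a fixed-size window is cut around the mask centroid. A small usage sketch; the random image and circular mask below are fabricated for illustration:

```python
import numpy as np
from spacr.utils import _crop_center

# Fabricated example data: a 3-channel image and a single-cell mask
img = (np.random.rand(512, 512, 3) * 65535).astype(np.uint16)
yy, xx = np.ogrid[:512, :512]
cell_mask = ((yy - 300) ** 2 + (xx - 200) ** 2 < 80 ** 2).astype(np.uint16)

# Crop a 224x224 window centred on the mask centroid; the internal padding
# guarantees the window never falls outside the image
crop = _crop_center(img, cell_mask, new_width=224, new_height=224)
print(crop.shape)  # (224, 224, 3)
```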
@@ -3434,105 +3420,6 @@ def reduction_and_clustering(numeric_data, n_neighbors, min_dist, metric, eps, m
 
     return embedding, labels, reducer
 
-def reduction_and_clustering_v1(numeric_data, n_neighbors, min_dist, metric, eps, min_samples, clustering, reduction_method='umap', verbose=False, embedding=None, n_jobs=-1):
-    """
-    Perform dimensionality reduction and clustering on the given data.
-
-    Parameters:
-    numeric_data (np.ndarray): Numeric data for embedding and clustering.
-    n_neighbors (int or float): Number of neighbors for UMAP or perplexity for t-SNE.
-    min_dist (float): Minimum distance for UMAP.
-    metric (str): Metric for UMAP and DBSCAN.
-    eps (float): Epsilon for DBSCAN.
-    min_samples (int): Minimum samples for DBSCAN or number of clusters for KMeans.
-    clustering (str): Clustering method ('DBSCAN' or 'KMeans').
-    reduction_method (str): Dimensionality reduction method ('UMAP' or 'tSNE').
-    verbose (bool): Whether to print verbose output.
-    embedding (np.ndarray, optional): Precomputed embedding. Default is None.
-
-    Returns:
-    tuple: embedding, labels
-    """
-
-    if verbose:
-        v=1
-    else:
-        v=0
-
-    if isinstance(n_neighbors, float):
-        n_neighbors = int(n_neighbors * len(numeric_data))
-
-    if n_neighbors <= 2:
-        n_neighbors = 2
-
-    if reduction_method == 'umap':
-        reducer = umap.UMAP(n_neighbors=n_neighbors,
-                            n_components=2,
-                            metric=metric,
-                            n_epochs=None,
-                            learning_rate=1.0,
-                            init='spectral',
-                            min_dist=min_dist,
-                            spread=1.0,
-                            set_op_mix_ratio=1.0,
-                            local_connectivity=1,
-                            repulsion_strength=1.0,
-                            negative_sample_rate=5,
-                            transform_queue_size=4.0,
-                            a=None,
-                            b=None,
-                            random_state=42,
-                            metric_kwds=None,
-                            angular_rp_forest=False,
-                            target_n_neighbors=-1,
-                            target_metric='categorical',
-                            target_metric_kwds=None,
-                            target_weight=0.5,
-                            transform_seed=42,
-                            n_jobs=n_jobs,
-                            verbose=verbose)
-
-    elif reduction_method == 'tsne':
-
-        #tsne_params.setdefault('n_components', 2)
-        #reducer = TSNE(**tsne_params)
-
-        reducer = TSNE(n_components=2,
-                       perplexity=n_neighbors,
-                       early_exaggeration=12.0,
-                       learning_rate=200.0,
-                       n_iter=1000,
-                       n_iter_without_progress=300,
-                       min_grad_norm=1e-7,
-                       metric=metric,
-                       init='random',
-                       verbose=v,
-                       random_state=42,
-                       method='barnes_hut',
-                       angle=0.5,
-                       n_jobs=n_jobs)
-
-    else:
-        raise ValueError(f"Unsupported reduction method: {reduction_method}. Supported methods are 'umap' and 'tsne'")
-
-    if embedding is None:
-        embedding = reducer.fit_transform(numeric_data)
-
-    if clustering == 'dbscan':
-        clustering_model = DBSCAN(eps=eps, min_samples=min_samples, metric=metric, n_jobs=n_jobs)
-    elif clustering == 'kmeans':
-        clustering_model = KMeans(n_clusters=min_samples, random_state=42)
-    else:
-        raise ValueError(f"Unsupported clustering method: {clustering}. Supported methods are 'dbscan' and 'kmeans'")
-
-    clustering_model.fit(embedding)
-    labels = clustering_model.labels_ if clustering == 'dbscan' else clustering_model.predict(embedding)
-
-    if verbose:
-        print(f'Embedding shape: {embedding.shape}')
-
-    return embedding, labels
-
 def remove_noise(embedding, labels):
     non_noise_indices = labels != -1
     embedding = embedding[non_noise_indices]
@@ -3744,30 +3631,6 @@ def correct_paths(df, base_path):
     image_paths = df['png_path'].to_list()
     return df, image_paths
 
-def correct_paths_v1(df, base_path):
-    if 'png_path' not in df.columns:
-        print("No 'png_path' column found in the dataframe.")
-        return df, None
-
-    image_paths = df['png_path'].to_list()
-
-    adjusted_image_paths = []
-    for path in image_paths:
-        if base_path not in path:
-            print(f"Adjusting path: {path}")
-            parts = path.split('data/')
-            if len(parts) > 1:
-                new_path = os.path.join(base_path, 'data', parts[1])
-                adjusted_image_paths.append(new_path)
-            else:
-                adjusted_image_paths.append(path)
-        else:
-            adjusted_image_paths.append(path)
-
-    df['png_path'] = adjusted_image_paths
-    image_paths = df['png_path'].to_list()
-    return df, image_paths
-
 def get_umap_image_settings(settings={}):
     settings.setdefault('src', 'path')
     settings.setdefault('row_limit', 1000)
@@ -3814,6 +3677,110 @@ def get_umap_image_settings(settings={}):
     settings.setdefault('verbose',True)
     return settings
 
+def get_measure_crop_settings(settings):
+
+    # Test mode
+    settings.setdefault('test_mode', False)
+    settings.setdefault('test_nr', 10)
+
+    #measurement settings
+    settings.setdefault('save_measurements',True)
+    settings.setdefault('radial_dist', True)
+    settings.setdefault('calculate_correlation', True)
+    settings.setdefault('manders_thresholds', [15,85,95])
+    settings.setdefault('homogeneity', True)
+    settings.setdefault('homogeneity_distances', [8,16,32])
+
+    # Cropping settings
+    settings.setdefault('save_arrays', False)
+    settings.setdefault('save_png',True)
+    settings.setdefault('use_bounding_box',False)
+    settings.setdefault('png_size',[224,224])
+    settings.setdefault('png_dims',[0,1,2])
+    settings.setdefault('normalize',False)
+    settings.setdefault('normalize_by','png')
+    settings.setdefault('crop_mode',['cell'])
+    settings.setdefault('dialate_pngs', False)
+    settings.setdefault('dialate_png_ratios', [0.2])
+
+    # Timelapsed settings
+    settings.setdefault('timelapse', False)
+    settings.setdefault('timelapse_objects', 'cell')
+
+    # Operational settings
+    settings.setdefault('plot',False)
+    settings.setdefault('plot_filtration',False)
+    settings.setdefault('representative_images', False)
+    settings.setdefault('max_workers', os.cpu_count()-2)
+
+    # Object settings
+    settings.setdefault('cell_mask_dim',None)
+    settings.setdefault('nucleus_mask_dim',None)
+    settings.setdefault('pathogen_mask_dim',None)
+    settings.setdefault('cytoplasm',False)
+    settings.setdefault('include_uninfected',True)
+    settings.setdefault('cell_min_size',0)
+    settings.setdefault('nucleus_min_size',0)
+    settings.setdefault('pathogen_min_size',0)
+    settings.setdefault('cytoplasm_min_size',0)
+    settings.setdefault('merge_edge_pathogen_cells', True)
+
+    # Miscellaneous settings
+    settings.setdefault('experiment', 'exp')
+    settings.setdefault('cells', 'HeLa')
+    settings.setdefault('cell_loc', None)
+    settings.setdefault('pathogens', ['ME49Dku80WT', 'ME49Dku80dgra8:GRA8', 'ME49Dku80dgra8', 'ME49Dku80TKO'])
+    settings.setdefault('pathogen_loc', [['c1', 'c2', 'c3', 'c4', 'c5', 'c6'], ['c7', 'c8', 'c9', 'c10', 'c11', 'c12'], ['c13', 'c14', 'c15', 'c16', 'c17', 'c18'], ['c19', 'c20', 'c21', 'c22', 'c23', 'c24']])
+    settings.setdefault('treatments', ['BR1', 'BR2', 'BR3'])
+    settings.setdefault('treatment_loc', [['c1', 'c2', 'c7', 'c8', 'c13', 'c14', 'c19', 'c20'], ['c3', 'c4', 'c9', 'c10', 'c15', 'c16', 'c21', 'c22'], ['c5', 'c6', 'c11', 'c12', 'c17', 'c18', 'c23', 'c24']])
+    settings.setdefault('channel_of_interest', 2)
+    settings.setdefault('compartments', ['pathogen', 'cytoplasm'])
+    settings.setdefault('measurement', 'mean_intensity')
+    settings.setdefault('nr_imgs', 32)
+    settings.setdefault('um_per_pixel', 0.1)
+
+    if settings['test_mode']:
+        settings['plot'] = True
+        settings['plot_filtration'] = True
+        test_imgs = settings['test_nr']
+        print(f'Test mode enabled with {test_imgs} images, plotting set to True')
+
+    return settings
+
+def delete_folder(folder_path):
+    if os.path.exists(folder_path) and os.path.isdir(folder_path):
+        for root, dirs, files in os.walk(folder_path, topdown=False):
+            for name in files:
+                os.remove(os.path.join(root, name))
+            for name in dirs:
+                os.rmdir(os.path.join(root, name))
+        os.rmdir(folder_path)
+        print(f"Folder '{folder_path}' has been deleted.")
+    else:
+        print(f"Folder '{folder_path}' does not exist or is not a directory.")
+
+def measure_test_mode(settings):
+
+    if settings['test_mode']:
+        if not os.path.basename(settings['input_folder']) == 'test':
+            all_files = os.listdir(settings['input_folder'])
+            random_files = random.sample(all_files, settings['test_nr'])
+
+            src = os.path.join(os.path.dirname(settings['input_folder']),'test', 'merged')
+            if os.path.exists(src):
+                delete_folder(src)
+            os.makedirs(src, exist_ok=True)
+
+            for file in random_files:
+                shutil.copy(os.path.join(settings['input_folder'], file), os.path.join(src,file))
+
+            settings['input_folder'] = src
+            print(f'Changed source folder to {src} for test mode')
+        else:
+            print(f'Test mode enabled, using source folder {settings["input_folder"]}')
+
+    return settings
+
 def preprocess_data(df, filter_by, remove_highly_correlated, log_data, exclude):
     """
     Preprocesses the given dataframe by applying filtering, removing highly correlated columns,
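`get_measure_crop_settings` fills a settings dict with defaults for measurement, cropping, time-lapse and object filtering, and `measure_test_mode` copies a random sample of `test_nr` images into a `test/merged` folder and repoints `input_folder` at it when `test_mode` is on. A hedged usage sketch; the path and the mask-channel indices are placeholders, not spacr defaults:

```python
from spacr.utils import get_measure_crop_settings, measure_test_mode

# Minimal starting dict; every key not set here falls back to the defaults above
settings = {
    'input_folder': '/path/to/experiment/merged',  # placeholder path
    'test_mode': True,   # work on a random subset instead of the full folder
    'test_nr': 10,       # number of images copied into the test folder
    'cell_mask_dim': 4,  # example mask-channel indices, dataset dependent
    'nucleus_mask_dim': 5,
    'pathogen_mask_dim': 6,
}

settings = get_measure_crop_settings(settings)  # fill in the remaining defaults
settings = measure_test_mode(settings)          # copy test_nr files and repoint 'input_folder'
```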
@@ -4003,6 +3970,326 @@ def search_reduction_and_clustering(numeric_data, n_neighbors, min_dist, metric,
     if verbose:
         print(f'Embedding shape: {embedding.shape}')
     return embedding, labels
+
+import torch
+import torchvision.transforms as transforms
+from torchvision.models import resnet50
+from PIL import Image
+import numpy as np
+import umap
+import pandas as pd
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.preprocessing import StandardScaler
+from scipy.stats import f_oneway, kruskal
+from sklearn.cluster import KMeans
+from scipy import stats
+
+def load_image(image_path):
+    """Load and preprocess an image."""
+    transform = transforms.Compose([
+        transforms.Resize((224, 224)),
+        transforms.ToTensor(),
+        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+    ])
+    image = Image.open(image_path).convert('RGB')
+    image = transform(image).unsqueeze(0)
+    return image
+
+def extract_features(image_paths, resnet=resnet50):
+    """Extract features from images using a pre-trained ResNet model."""
+    model = resnet(pretrained=True)
+    model = model.eval()
+    model = torch.nn.Sequential(*list(model.children())[:-1]) # Remove the last classification layer
+
+    features = []
+    for image_path in image_paths:
+        image = load_image(image_path)
+        with torch.no_grad():
+            feature = model(image).squeeze().numpy()
+        features.append(feature)
 
+    return np.array(features)
 
+def check_normality(series):
+    """Helper function to check if a feature is normally distributed."""
+    k2, p = stats.normaltest(series)
+    alpha = 0.05
+    if p < alpha: # null hypothesis: x comes from a normal distribution
+        return False
+    return True
+
+def random_forest_feature_importance(all_df, cluster_col='cluster'):
+    """Random Forest feature importance."""
+    numeric_features = all_df.select_dtypes(include=[np.number]).columns.tolist()
+    if cluster_col in numeric_features:
+        numeric_features.remove(cluster_col)
+
+    X = all_df[numeric_features]
+    y = all_df[cluster_col]
+
+    scaler = StandardScaler()
+    X_scaled = scaler.fit_transform(X)
+
+    model = RandomForestClassifier(n_estimators=100, random_state=42)
+    model.fit(X_scaled, y)
+
+    feature_importances = model.feature_importances_
+
+    importance_df = pd.DataFrame({
+        'Feature': numeric_features,
+        'Importance': feature_importances
+    }).sort_values(by='Importance', ascending=False)
+
+    return importance_df
+
+def perform_statistical_tests(all_df, cluster_col='cluster'):
+    """Perform ANOVA or Kruskal-Wallis tests depending on normality of features."""
+    numeric_features = all_df.select_dtypes(include=[np.number]).columns.tolist()
+    if cluster_col in numeric_features:
+        numeric_features.remove(cluster_col)
+
+    anova_results = []
+    kruskal_results = []
+
+    for feature in numeric_features:
+        groups = [all_df[all_df[cluster_col] == label][feature] for label in np.unique(all_df[cluster_col])]
+
+        if check_normality(all_df[feature]):
+            stat, p = f_oneway(*groups)
+            anova_results.append((feature, stat, p))
+        else:
+            stat, p = kruskal(*groups)
+            kruskal_results.append((feature, stat, p))
+
+    anova_df = pd.DataFrame(anova_results, columns=['Feature', 'ANOVA_Statistic', 'ANOVA_pValue'])
+    kruskal_df = pd.DataFrame(kruskal_results, columns=['Feature', 'Kruskal_Statistic', 'Kruskal_pValue'])
+
+    return anova_df, kruskal_df
+
+def combine_results(rf_df, anova_df, kruskal_df):
+    """Combine the results into a single DataFrame."""
+    combined_df = rf_df.merge(anova_df, on='Feature', how='left')
+    combined_df = combined_df.merge(kruskal_df, on='Feature', how='left')
+    return combined_df
+
+def cluster_feature_analysis(all_df, cluster_col='cluster'):
+    """
+    Perform Random Forest feature importance, ANOVA for normally distributed features,
+    and Kruskal-Wallis for non-normally distributed features. Combine results into a single DataFrame.
+    """
+    rf_df = random_forest_feature_importance(all_df, cluster_col)
+    anova_df, kruskal_df = perform_statistical_tests(all_df, cluster_col)
+    combined_df = combine_results(rf_df, anova_df, kruskal_df)
+    return combined_df
+
+def _merge_cells_based_on_parasite_overlap(parasite_mask, cell_mask, nuclei_mask, overlap_threshold=5, perimeter_threshold=30):
+    """
+    Merge cells in cell_mask if a parasite in parasite_mask overlaps with more than one cell,
+    and if cells share more than a specified perimeter percentage.
 
+    Args:
+        parasite_mask (ndarray): Mask of parasites.
+        cell_mask (ndarray): Mask of cells.
+        nuclei_mask (ndarray): Mask of nuclei.
+        overlap_threshold (float): The percentage threshold for merging cells based on parasite overlap.
+        perimeter_threshold (float): The percentage threshold for merging cells based on shared perimeter.
+
+    Returns:
+        ndarray: The modified cell mask (cell_mask) with unique labels.
+    """
+    labeled_cells = label(cell_mask)
+    labeled_parasites = label(parasite_mask)
+    labeled_nuclei = label(nuclei_mask)
+    num_parasites = np.max(labeled_parasites)
+    num_cells = np.max(labeled_cells)
+    num_nuclei = np.max(labeled_nuclei)
+
+    # Merge cells based on parasite overlap
+    for parasite_id in range(1, num_parasites + 1):
+        current_parasite_mask = labeled_parasites == parasite_id
+        overlapping_cell_labels = np.unique(labeled_cells[current_parasite_mask])
+        overlapping_cell_labels = overlapping_cell_labels[overlapping_cell_labels != 0]
+        if len(overlapping_cell_labels) > 1:
+
+            # Calculate the overlap percentages
+            overlap_percentages = [
+                np.sum(current_parasite_mask & (labeled_cells == cell_label)) / np.sum(current_parasite_mask) * 100
+                for cell_label in overlapping_cell_labels
+            ]
+            # Merge cells if overlap percentage is above the threshold
+            for cell_label, overlap_percentage in zip(overlapping_cell_labels, overlap_percentages):
+                if overlap_percentage > overlap_threshold:
+                    first_label = overlapping_cell_labels[0]
+                    for other_label in overlapping_cell_labels[1:]:
+                        if other_label != first_label:
+                            cell_mask[cell_mask == other_label] = first_label
+
+    # Merge cells based on nucleus overlap
+    for nucleus_id in range(1, num_nuclei + 1):
+        current_nucleus_mask = labeled_nuclei == nucleus_id
+        overlapping_cell_labels = np.unique(labeled_cells[current_nucleus_mask])
+        overlapping_cell_labels = overlapping_cell_labels[overlapping_cell_labels != 0]
+        if len(overlapping_cell_labels) > 1:
+
+            # Calculate the overlap percentages
+            overlap_percentages = [
+                np.sum(current_nucleus_mask & (labeled_cells == cell_label)) / np.sum(current_nucleus_mask) * 100
+                for cell_label in overlapping_cell_labels
+            ]
+            # Merge cells if overlap percentage is above the threshold for each cell
+            if all(overlap_percentage > overlap_threshold for overlap_percentage in overlap_percentages):
+                first_label = overlapping_cell_labels[0]
+                for other_label in overlapping_cell_labels[1:]:
+                    if other_label != first_label:
+                        cell_mask[cell_mask == other_label] = first_label
+
+    # Check for cells without nuclei and merge based on shared perimeter
+    labeled_cells = label(cell_mask) # Re-label after merging based on overlap
+    cell_regions = regionprops(labeled_cells)
+    for region in cell_regions:
+        cell_label = region.label
+        cell_mask_binary = labeled_cells == cell_label
+        overlapping_nuclei = np.unique(nuclei_mask[cell_mask_binary])
+        overlapping_nuclei = overlapping_nuclei[overlapping_nuclei != 0]
+
+        if len(overlapping_nuclei) == 0:
+
+            # Cell does not overlap with any nucleus
+            perimeter = region.perimeter
+
+            # Dilate the cell to find neighbors
+            dilated_cell = binary_dilation(cell_mask_binary, structure=square(3))
+            neighbor_cells = np.unique(labeled_cells[dilated_cell])
+            neighbor_cells = neighbor_cells[(neighbor_cells != 0) & (neighbor_cells != cell_label)]
+
+            # Calculate shared border length with neighboring cells
+            shared_borders = [
+                np.sum((labeled_cells == neighbor_label) & dilated_cell) for neighbor_label in neighbor_cells
+            ]
+            shared_border_percentages = [shared_border / perimeter * 100 for shared_border in shared_borders]
+
+            # Merge with the neighbor cell with the largest shared border percentage above the threshold
+            if shared_borders:
+                max_shared_border_index = np.argmax(shared_border_percentages)
+                max_shared_border_percentage = shared_border_percentages[max_shared_border_index]
+                if max_shared_border_percentage > perimeter_threshold:
+                    cell_mask[labeled_cells == cell_label] = neighbor_cells[max_shared_border_index]
+
+    # Relabel the merged cell mask
+    relabeled_cell_mask, _ = label(cell_mask, return_num=True)
+    return relabeled_cell_mask
+
+def adjust_cell_masks(parasite_folder, cell_folder, nuclei_folder, overlap_threshold=5, perimeter_threshold=30):
+
+    """
+    Process all npy files in the given folders. Merge and relabel cells in cell masks
+    based on parasite overlap and cell perimeter sharing conditions.
+
+    Args:
+        parasite_folder (str): Path to the folder containing parasite masks.
+        cell_folder (str): Path to the folder containing cell masks.
+        nuclei_folder (str): Path to the folder containing nuclei masks.
+        overlap_threshold (float): The percentage threshold for merging cells based on parasite overlap.
+        perimeter_threshold (float): The percentage threshold for merging cells based on shared perimeter.
+    """
+
+    parasite_files = sorted([f for f in os.listdir(parasite_folder) if f.endswith('.npy')])
+    cell_files = sorted([f for f in os.listdir(cell_folder) if f.endswith('.npy')])
+    nuclei_files = sorted([f for f in os.listdir(nuclei_folder) if f.endswith('.npy')])
+
+    # Ensure there are matching files in all folders
+    if not (len(parasite_files) == len(cell_files) == len(nuclei_files)):
+        raise ValueError("The number of files in the folders do not match.")
+
+    # Match files by name
+    for file_name in parasite_files:
+        parasite_path = os.path.join(parasite_folder, file_name)
+        cell_path = os.path.join(cell_folder, file_name)
+        nuclei_path = os.path.join(nuclei_folder, file_name)
+        # Check if the corresponding cell and nuclei mask files exist
+        if not (os.path.exists(cell_path) and os.path.exists(nuclei_path)):
+            raise ValueError(f"Corresponding cell or nuclei mask file for {file_name} not found.")
+        # Load the masks
+        parasite_mask = np.load(parasite_path)
+        cell_mask = np.load(cell_path)
+        nuclei_mask = np.load(nuclei_path)
+        # Merge and relabel cells
+        merged_cell_mask = _merge_cells_based_on_parasite_overlap(parasite_mask, cell_mask, nuclei_mask, overlap_threshold, perimeter_threshold)
+
+        # Force 16 bit
+        mamerged_cell_masksk = merged_cell_mask.astype(np.uint16)
+
+        # Overwrite the original cell mask file with the merged result
+        np.save(cell_path, merged_cell_mask)
+
+def process_masks(mask_folder, image_folder, channel, batch_size=50, n_clusters=2, plot=False):
+
+    def read_files_in_batches(folder, batch_size=50):
+        files = [f for f in os.listdir(folder) if f.endswith('.npy')]
+        files.sort() # Sort to ensure matching order
+        for i in range(0, len(files), batch_size):
+            yield files[i:i + batch_size]
+
+    def measure_morphology_and_intensity(mask, image):
+        properties = measure.regionprops(mask, intensity_image=image)
+        properties_list = [{'area': p.area, 'mean_intensity': p.mean_intensity, 'perimeter': p.perimeter, 'eccentricity': p.eccentricity} for p in properties]
+        return properties_list
+
+    def cluster_objects(properties, n_clusters=2):
+        data = np.array([[p['area'], p['mean_intensity'], p['perimeter'], p['eccentricity']] for p in properties])
+        kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(data)
+        return kmeans
+
+    def remove_objects_not_in_largest_cluster(mask, labels, largest_cluster_label):
+        cleaned_mask = np.zeros_like(mask)
+        for region in measure.regionprops(mask):
+            if labels[region.label - 1] == largest_cluster_label:
+                cleaned_mask[mask == region.label] = region.label
+        return cleaned_mask
+
+    def plot_clusters(properties, labels):
+        data = np.array([[p['area'], p['mean_intensity'], p['perimeter'], p['eccentricity']] for p in properties])
+        pca = PCA(n_components=2)
+        data_2d = pca.fit_transform(data)
+        plt.scatter(data_2d[:, 0], data_2d[:, 1], c=labels, cmap='viridis')
+        plt.xlabel('PCA Component 1')
+        plt.ylabel('PCA Component 2')
+        plt.title('Object Clustering')
+        plt.show()
+
+    all_properties = []
+
+    # Step 1: Accumulate properties over all files
+    for batch in read_files_in_batches(mask_folder, batch_size):
+        mask_files = [os.path.join(mask_folder, file) for file in batch]
+        image_files = [os.path.join(image_folder, file) for file in batch]
+
+        masks = [np.load(file) for file in mask_files]
+        images = [np.load(file)[:, :, channel] for file in image_files]
+
+        for i, mask in enumerate(masks):
+            image = images[i]
+            # Measure morphology and intensity
+            properties = measure_morphology_and_intensity(mask, image)
+            all_properties.extend(properties)
+
+    # Step 2: Perform clustering on accumulated properties
+    kmeans = cluster_objects(all_properties, n_clusters)
+    labels = kmeans.labels_
+
+    if plot:
+        # Step 3: Plot clusters using PCA
+        plot_clusters(all_properties, labels)
+
+    # Step 4: Remove objects not in the largest cluster and overwrite files in batches
+    label_index = 0
+    for batch in read_files_in_batches(mask_folder, batch_size):
+        mask_files = [os.path.join(mask_folder, file) for file in batch]
+        masks = [np.load(file) for file in mask_files]
+
+        for i, mask in enumerate(masks):
+            batch_properties = measure_morphology_and_intensity(mask, mask)
+            batch_labels = labels[label_index:label_index + len(batch_properties)]
+            largest_cluster_label = np.bincount(batch_labels).argmax()
+            cleaned_mask = remove_objects_not_in_largest_cluster(mask, batch_labels, largest_cluster_label)
+            np.save(mask_files[i], cleaned_mask)
+            label_index += len(batch_properties)