spacr 0.0.70__py3-none-any.whl → 0.0.80__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spacr/__init__.py +4 -1
- spacr/__main__.py +0 -7
- spacr/annotate_app.py +75 -61
- spacr/core.py +39 -246
- spacr/foldseek.py +6 -6
- spacr/get_alfafold_structures.py +3 -3
- spacr/io.py +53 -116
- spacr/measure.py +46 -59
- spacr/plot.py +117 -81
- spacr/sequencing.py +508 -491
- spacr/sim.py +24 -29
- spacr/utils.py +487 -260
- {spacr-0.0.70.dist-info → spacr-0.0.80.dist-info}/METADATA +10 -8
- spacr-0.0.80.dist-info/RECORD +36 -0
- spacr/graph_learning_lap.py +0 -84
- spacr/train.py +0 -667
- spacr/umap.py +0 -0
- spacr-0.0.70.dist-info/RECORD +0 -39
- {spacr-0.0.70.dist-info → spacr-0.0.80.dist-info}/LICENSE +0 -0
- {spacr-0.0.70.dist-info → spacr-0.0.80.dist-info}/WHEEL +0 -0
- {spacr-0.0.70.dist-info → spacr-0.0.80.dist-info}/entry_points.txt +0 -0
- {spacr-0.0.70.dist-info → spacr-0.0.80.dist-info}/top_level.txt +0 -0
spacr/utils.py
CHANGED
@@ -43,6 +43,7 @@ from scipy.stats import fisher_exact
 from scipy.ndimage.filters import gaussian_filter
 from scipy.spatial import ConvexHull
 from scipy.interpolate import splprep, splev
+from scipy.ndimage import binary_dilation
 
 from sklearn.preprocessing import StandardScaler
 from skimage.exposure import rescale_intensity
@@ -55,6 +56,8 @@ from sklearn.preprocessing import StandardScaler
 from sklearn.cluster import DBSCAN
 from sklearn.cluster import KMeans
 from sklearn.manifold import TSNE
+from sklearn.cluster import KMeans
+from sklearn.decomposition import PCA
 
 import umap.umap_ as umap
 
@@ -62,6 +65,12 @@ from torchvision import models
 from torchvision.models.resnet import ResNet18_Weights, ResNet34_Weights, ResNet50_Weights, ResNet101_Weights, ResNet152_Weights
 import torchvision.transforms as transforms
 
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.preprocessing import StandardScaler
+from scipy.stats import f_oneway, kruskal
+from sklearn.cluster import KMeans
+from scipy import stats
+
 from .logger import log_function_call
 
 def check_mask_folder(src,mask_fldr):
@@ -370,12 +379,10 @@ def mask_object_count(mask):
     Counts the number of objects in a given mask.
 
     Parameters:
-    - mask: numpy.ndarray
-        The mask containing object labels.
+    - mask: numpy.ndarray. The mask containing object labels.
 
     Returns:
-    - int
-        The number of objects in the mask.
+    - int. The number of objects in the mask.
     """
     unique_labels = np.unique(mask)
     num_objects = len(unique_labels[unique_labels!=0])
@@ -531,81 +538,56 @@ def _annotate_conditions(df, cells=['HeLa'], cell_loc=None, pathogens=['rh'], pa
     df['condition'] = df['condition'].apply(lambda x: x if x else 'none')
     return df
 
-def
-
-
-
-    Parameters:
-    - array: numpy array
-        The input stack to be normalized.
-    - p1: int, optional
-        The lower percentile value for normalization. Default is 2.
-    - p2: int, optional
-        The upper percentile value for normalization. Default is 98.
-
-    Returns:
-    - new_stack: numpy array
-        The normalized stack with the same shape as the input stack.
-    """
-    nimg = array.shape[2]
-    new_stack = np.empty_like(array)
-
-    for i in range(nimg):
-        img = array[:, :, i]
-        non_zero_img = img[img > 0]
-
-        if non_zero_img.size > 0:
-            img_min = np.percentile(non_zero_img, p1)
-            img_max = np.percentile(non_zero_img, p2)
-        else:
-            img_min = img.min()
-            img_max = img.max()
-
-        # Determine output range based on dtype
-        if np.issubdtype(array.dtype, np.integer):
-            out_range = (0, np.iinfo(array.dtype).max)
-        else:
-            out_range = (0.0, 1.0)
-
-        img = rescale_intensity(img, in_range=(img_min, img_max), out_range=out_range).astype(array.dtype)
-        new_stack[:, :, i] = img
-
-    return new_stack
+def is_list_of_lists(var):
+    if isinstance(var, list) and all(isinstance(i, list) for i in var):
+        return True
+    return False
 
-def normalize_to_dtype(array, p1=2, p2=98):
+def normalize_to_dtype(array, p1=2, p2=98, percentile_list=None):
     """
     Normalize each image in the stack to its own percentiles.
 
     Parameters:
     - array: numpy array
-
+        The input stack to be normalized.
     - p1: int, optional
-
+        The lower percentile value for normalization. Default is 2.
     - p2: int, optional
-
-
+        The upper percentile value for normalization. Default is 98.
+    - percentile_list: list, optional
+        A list of pre-calculated percentiles for each image in the stack. Default is None.
+
     Returns:
     - new_stack: numpy array
-
+        The normalized stack with the same shape as the input stack.
     """
+
+    out_range = (0, np.iinfo(array.dtype).max)
     nimg = array.shape[2]
-    new_stack = np.empty_like(array, dtype=
+    new_stack = np.empty_like(array, dtype=array.dtype)
 
     for i in range(nimg):
         img = array[:, :, i]
         non_zero_img = img[img > 0]
-
-
-            img_min = np.percentile(non_zero_img, p1)
-            img_max = np.percentile(non_zero_img, p2)
+        if not percentile_list is None:
+            percentiles = percentile_list[i]
         else:
-
-
+            percentile_1 = p1
+            percentile_2 = p2
+        if percentile_list is None:
+            if non_zero_img.size > 0:
+                img_min = np.percentile(non_zero_img, percentile_1)
+                img_max = np.percentile(non_zero_img, percentile_2)
+            else:
+                img_min = np.percentile(img, percentile_1)
+                img_max = np.percentile(img, percentile_2)
+        else:
+            img_min = percentiles[0]
+            img_max = percentiles[1]
 
         # Normalize to the range (0, 1) for visualization
-        img = rescale_intensity(img, in_range=(img_min, img_max), out_range=
+        img = rescale_intensity(img, in_range=(img_min, img_max), out_range=out_range)
         new_stack[:, :, i] = img
-
     return new_stack
 
 def _list_endpoint_subdirectories(base_dir):
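Note on this hunk: normalize_to_dtype now takes an optional percentile_list, so per-image [min, max] percentiles can be computed once (e.g. with _get_percentiles, renamed to p1/p2 in the next hunk) and reused across calls. A minimal sketch of the intended call pattern, assuming an (H, W, N) integer stack and that both functions are importable from spacr.utils:

import numpy as np
from spacr.utils import _get_percentiles, normalize_to_dtype  # assumed import path

stack = np.random.randint(0, 65535, size=(64, 64, 3), dtype=np.uint16)

percentiles = _get_percentiles(stack, p1=2, p2=98)   # [[img_min, img_max], ...] per image
normed = normalize_to_dtype(stack, percentile_list=percentiles)

# Without percentile_list, each image's percentiles are recomputed internally:
normed2 = normalize_to_dtype(stack, p1=2, p2=98)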
@@ -868,7 +850,7 @@ def _check_integrity(df):
     df['label_list'] = df['label_list'].astype(str)
     return df
 
-def _get_percentiles(array, q1=2, q2=98):
+def _get_percentiles(array, p1=2, p2=98):
     """
     Calculate the percentiles of each image in the given array.
 
@@ -891,15 +873,16 @@ def _get_percentiles(array, q1=2, q2=98):
         img = np.squeeze(array[:, :, v])
         non_zero_img = img[img > 0]
         if non_zero_img.size > 0: # check if there are non-zero values
-            img_min = np.percentile(non_zero_img, q1)
-            img_max = np.percentile(non_zero_img, q2)
+            img_min = np.percentile(non_zero_img, p1) # change percentile from 0.02 to 2
+            img_max = np.percentile(non_zero_img, p2) # change percentile from 0.98 to 98
             percentiles.append([img_min, img_max])
         else: # if there are no non-zero values, just use the image as it is
-            img_min
+            img_min = np.percentile(img, p1) # change percentile from 0.02 to 2
+            img_max = np.percentile(img, p2) # change percentile from 0.98 to 98
             percentiles.append([img_min, img_max])
     return percentiles
 
-def _crop_center(img, cell_mask, new_width, new_height, normalize=(2,98)):
+def _crop_center(img, cell_mask, new_width, new_height):
     """
     Crop the image around the center of the cell mask.
 
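Note on this hunk: renaming q1/q2 to p1/p2 aligns _get_percentiles with normalize_to_dtype above, and the all-zero branch now falls back to percentiles of the raw image rather than a separate path. A toy check of that fallback (values invented for this note):

import numpy as np

img = np.zeros((4, 4))
img_min = np.percentile(img, 2)    # 0.0: an all-zero image yields a flat range
img_max = np.percentile(img, 98)   # 0.0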
@@ -912,8 +895,6 @@ def _crop_center(img, cell_mask, new_width, new_height, normalize=(2,98)):
         The desired width of the cropped image.
     - new_height: int
         The desired height of the cropped image.
-    - normalize: tuple, optional
-        The normalization range for the image pixel values. Default is (2, 98).
 
     Returns:
     - img: numpy.ndarray
@@ -923,19 +904,22 @@ def _crop_center(img, cell_mask, new_width, new_height, normalize=(2,98)):
     cell_mask[cell_mask != 0] = 1
     mask_3d = np.repeat(cell_mask[:, :, np.newaxis], img.shape[2], axis=2).astype(img.dtype) # Create 3D mask
     img = np.multiply(img, mask_3d).astype(img.dtype) # Multiply image with mask to set pixel values outside of the mask to 0
-    #centroid = np.round(ndi.measurements.center_of_mass(cell_mask)).astype(int) # Compute centroid of the mask
     centroid = np.round(ndi.center_of_mass(cell_mask)).astype(int) # Compute centroid of the mask
+
     # Pad the image and mask to ensure the crop will not go out of bounds
     pad_width = max(new_width, new_height)
     img = np.pad(img, ((pad_width, pad_width), (pad_width, pad_width), (0, 0)), mode='constant')
     cell_mask = np.pad(cell_mask, ((pad_width, pad_width), (pad_width, pad_width)), mode='constant')
+
     # Update centroid coordinates due to padding
     centroid += pad_width
+
     # Compute bounding box
     start_y = max(0, centroid[0] - new_height // 2)
     end_y = min(start_y + new_height, img.shape[0])
     start_x = max(0, centroid[1] - new_width // 2)
     end_x = min(start_x + new_width, img.shape[1])
+
     # Crop to bounding box
     img = img[start_y:end_y, start_x:end_x, :]
     return img
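Note on this hunk: padding by max(new_width, new_height) before cropping guarantees the centroid-centered window never leaves the array. A worked sketch of the bounding-box arithmetic (toy numbers, not from the diff):

new_height, new_width = 224, 224
pad_width = max(new_width, new_height)           # 224
centroid_y = 10 + pad_width                      # centroid shifts with the padding -> 234
start_y = max(0, centroid_y - new_height // 2)   # 234 - 112 = 122, >= 0 by construction
end_y = start_y + new_height                     # 346, still inside the padded image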
@@ -1485,52 +1469,18 @@ class SpatialAttention(nn.Module):
 
 # Multi-Scale Block with Attention
 class MultiScaleBlockWithAttention(nn.Module):
-    """
-    Multi-scale block with attention module.
-
-    Args:
-        in_channels (int): Number of input channels.
-        out_channels (int): Number of output channels.
-
-    Attributes:
-        dilated_conv1 (nn.Conv2d): Dilated convolution layer.
-        spatial_attention (nn.Conv2d): Spatial attention layer.
-
-    Methods:
-        custom_forward: Custom forward method for the module.
-        forward: Forward method for the module.
-    """
-
     def __init__(self, in_channels, out_channels):
         super(MultiScaleBlockWithAttention, self).__init__()
         self.dilated_conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, dilation=1, padding=1)
         self.spatial_attention = nn.Conv2d(out_channels, out_channels, kernel_size=1)
 
     def custom_forward(self, x):
-        """
-        Custom forward method for the module.
-
-        Args:
-            x (torch.Tensor): Input tensor.
-
-        Returns:
-            torch.Tensor: Output tensor.
-        """
         x1 = F.relu(self.dilated_conv1(x), inplace=True)
         x = self.spatial_attention(x1)
         return x
 
     def forward(self, x):
-
-        Forward method for the module.
-
-        Args:
-            x (torch.Tensor): Input tensor.
-
-        Returns:
-            torch.Tensor: Output tensor.
-        """
-        return checkpoint(self.custom_forward, x)
+        return self.custom_forward(x)
 
 # Final Classifier
 class CustomCellClassifier(nn.Module):
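Note on this hunk: forward previously routed through torch.utils.checkpoint, which saves memory by discarding intermediate activations and recomputing them during backward; the new code calls custom_forward directly. A minimal sketch of the trade-off (illustrative, assuming the old checkpoint import):

import torch
from torch.utils.checkpoint import checkpoint

def forward_old(self, x):
    # Activations are recomputed during backward: less memory, extra compute.
    return checkpoint(self.custom_forward, x)

def forward_new(self, x):
    # Activations are stored as usual: more memory, single forward pass.
    return self.custom_forward(x)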
@@ -2258,25 +2208,6 @@ def MLR(merged_df, refine_model):
 
     return max_effects, max_effects_pvalues, model, df
 
-#def normalize_to_dtype(array, q1=2, q2=98, percentiles=None):
-#    if len(array.shape) == 2:
-#        array = np.expand_dims(array, axis=-1)
-#    num_channels = array.shape[-1]
-#    new_stack = np.empty_like(array)
-#    for channel in range(num_channels):
-#        img = array[..., channel]
-#        non_zero_img = img[img > 0]
-#        if non_zero_img.size > 0:
-#            img_min = np.percentile(non_zero_img, q1)
-#            img_max = np.percentile(non_zero_img, q2)
-#        else:
-#            img_min, img_max = (percentiles[channel] if percentiles and channel < len(percentiles)
-#                                else (img.min(), img.max()))
-#        new_stack[..., channel] = rescale_intensity(img, in_range=(img_min, img_max), out_range='dtype')
-#    if new_stack.shape[-1] == 1:
-#        new_stack = np.squeeze(new_stack, axis=-1)
-#    return new_stack
-
 def get_files_from_dir(dir_path, file_extension="*"):
     return glob(os.path.join(dir_path, file_extension))
 
@@ -3489,105 +3420,6 @@ def reduction_and_clustering(numeric_data, n_neighbors, min_dist, metric, eps, m
 
     return embedding, labels, reducer
 
-def reduction_and_clustering_v1(numeric_data, n_neighbors, min_dist, metric, eps, min_samples, clustering, reduction_method='umap', verbose=False, embedding=None, n_jobs=-1):
-    """
-    Perform dimensionality reduction and clustering on the given data.
-
-    Parameters:
-    numeric_data (np.ndarray): Numeric data for embedding and clustering.
-    n_neighbors (int or float): Number of neighbors for UMAP or perplexity for t-SNE.
-    min_dist (float): Minimum distance for UMAP.
-    metric (str): Metric for UMAP and DBSCAN.
-    eps (float): Epsilon for DBSCAN.
-    min_samples (int): Minimum samples for DBSCAN or number of clusters for KMeans.
-    clustering (str): Clustering method ('DBSCAN' or 'KMeans').
-    reduction_method (str): Dimensionality reduction method ('UMAP' or 'tSNE').
-    verbose (bool): Whether to print verbose output.
-    embedding (np.ndarray, optional): Precomputed embedding. Default is None.
-
-    Returns:
-    tuple: embedding, labels
-    """
-
-    if verbose:
-        v=1
-    else:
-        v=0
-
-    if isinstance(n_neighbors, float):
-        n_neighbors = int(n_neighbors * len(numeric_data))
-
-    if n_neighbors <= 2:
-        n_neighbors = 2
-
-    if reduction_method == 'umap':
-        reducer = umap.UMAP(n_neighbors=n_neighbors,
-                            n_components=2,
-                            metric=metric,
-                            n_epochs=None,
-                            learning_rate=1.0,
-                            init='spectral',
-                            min_dist=min_dist,
-                            spread=1.0,
-                            set_op_mix_ratio=1.0,
-                            local_connectivity=1,
-                            repulsion_strength=1.0,
-                            negative_sample_rate=5,
-                            transform_queue_size=4.0,
-                            a=None,
-                            b=None,
-                            random_state=42,
-                            metric_kwds=None,
-                            angular_rp_forest=False,
-                            target_n_neighbors=-1,
-                            target_metric='categorical',
-                            target_metric_kwds=None,
-                            target_weight=0.5,
-                            transform_seed=42,
-                            n_jobs=n_jobs,
-                            verbose=verbose)
-
-    elif reduction_method == 'tsne':
-
-        #tsne_params.setdefault('n_components', 2)
-        #reducer = TSNE(**tsne_params)
-
-        reducer = TSNE(n_components=2,
-                       perplexity=n_neighbors,
-                       early_exaggeration=12.0,
-                       learning_rate=200.0,
-                       n_iter=1000,
-                       n_iter_without_progress=300,
-                       min_grad_norm=1e-7,
-                       metric=metric,
-                       init='random',
-                       verbose=v,
-                       random_state=42,
-                       method='barnes_hut',
-                       angle=0.5,
-                       n_jobs=n_jobs)
-
-    else:
-        raise ValueError(f"Unsupported reduction method: {reduction_method}. Supported methods are 'umap' and 'tsne'")
-
-    if embedding is None:
-        embedding = reducer.fit_transform(numeric_data)
-
-    if clustering == 'dbscan':
-        clustering_model = DBSCAN(eps=eps, min_samples=min_samples, metric=metric, n_jobs=n_jobs)
-    elif clustering == 'kmeans':
-        clustering_model = KMeans(n_clusters=min_samples, random_state=42)
-    else:
-        raise ValueError(f"Unsupported clustering method: {clustering}. Supported methods are 'dbscan' and 'kmeans'")
-
-    clustering_model.fit(embedding)
-    labels = clustering_model.labels_ if clustering == 'dbscan' else clustering_model.predict(embedding)
-
-    if verbose:
-        print(f'Embedding shape: {embedding.shape}')
-
-    return embedding, labels
-
 def remove_noise(embedding, labels):
     non_noise_indices = labels != -1
     embedding = embedding[non_noise_indices]
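Note on this hunk: the _v1 variant is removed; the retained reduction_and_clustering just above covers the same UMAP/t-SNE reduction with DBSCAN/KMeans clustering and additionally returns the fitted reducer. A hedged call sketch (argument values invented for this note):

embedding, labels, reducer = reduction_and_clustering(
    numeric_data, n_neighbors=15, min_dist=0.1, metric='euclidean',
    eps=0.5, min_samples=5, clustering='dbscan', reduction_method='umap')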
@@ -3799,30 +3631,6 @@ def correct_paths(df, base_path):
     image_paths = df['png_path'].to_list()
     return df, image_paths
 
-def correct_paths_v1(df, base_path):
-    if 'png_path' not in df.columns:
-        print("No 'png_path' column found in the dataframe.")
-        return df, None
-
-    image_paths = df['png_path'].to_list()
-
-    adjusted_image_paths = []
-    for path in image_paths:
-        if base_path not in path:
-            print(f"Adjusting path: {path}")
-            parts = path.split('data/')
-            if len(parts) > 1:
-                new_path = os.path.join(base_path, 'data', parts[1])
-                adjusted_image_paths.append(new_path)
-            else:
-                adjusted_image_paths.append(path)
-        else:
-            adjusted_image_paths.append(path)
-
-    df['png_path'] = adjusted_image_paths
-    image_paths = df['png_path'].to_list()
-    return df, image_paths
-
 def get_umap_image_settings(settings={}):
     settings.setdefault('src', 'path')
     settings.setdefault('row_limit', 1000)
@@ -3869,25 +3677,129 @@ def get_umap_image_settings(settings={}):
     settings.setdefault('verbose',True)
     return settings
 
+def get_measure_crop_settings(settings):
+
+    # Test mode
+    settings.setdefault('test_mode', False)
+    settings.setdefault('test_nr', 10)
+
+    # Measurement settings
+    settings.setdefault('save_measurements',True)
+    settings.setdefault('radial_dist', True)
+    settings.setdefault('calculate_correlation', True)
+    settings.setdefault('manders_thresholds', [15,85,95])
+    settings.setdefault('homogeneity', True)
+    settings.setdefault('homogeneity_distances', [8,16,32])
+
+    # Cropping settings
+    settings.setdefault('save_arrays', False)
+    settings.setdefault('save_png',True)
+    settings.setdefault('use_bounding_box',False)
+    settings.setdefault('png_size',[224,224])
+    settings.setdefault('png_dims',[0,1,2])
+    settings.setdefault('normalize',False)
+    settings.setdefault('normalize_by','png')
+    settings.setdefault('crop_mode',['cell'])
+    settings.setdefault('dialate_pngs', False)
+    settings.setdefault('dialate_png_ratios', [0.2])
+
+    # Timelapse settings
+    settings.setdefault('timelapse', False)
+    settings.setdefault('timelapse_objects', 'cell')
+
+    # Operational settings
+    settings.setdefault('plot',False)
+    settings.setdefault('plot_filtration',False)
+    settings.setdefault('representative_images', False)
+    settings.setdefault('max_workers', os.cpu_count()-2)
+
+    # Object settings
+    settings.setdefault('cell_mask_dim',None)
+    settings.setdefault('nucleus_mask_dim',None)
+    settings.setdefault('pathogen_mask_dim',None)
+    settings.setdefault('cytoplasm',False)
+    settings.setdefault('include_uninfected',True)
+    settings.setdefault('cell_min_size',0)
+    settings.setdefault('nucleus_min_size',0)
+    settings.setdefault('pathogen_min_size',0)
+    settings.setdefault('cytoplasm_min_size',0)
+    settings.setdefault('merge_edge_pathogen_cells', True)
+
+    # Miscellaneous settings
+    settings.setdefault('experiment', 'exp')
+    settings.setdefault('cells', 'HeLa')
+    settings.setdefault('cell_loc', None)
+    settings.setdefault('pathogens', ['ME49Dku80WT', 'ME49Dku80dgra8:GRA8', 'ME49Dku80dgra8', 'ME49Dku80TKO'])
+    settings.setdefault('pathogen_loc', [['c1', 'c2', 'c3', 'c4', 'c5', 'c6'], ['c7', 'c8', 'c9', 'c10', 'c11', 'c12'], ['c13', 'c14', 'c15', 'c16', 'c17', 'c18'], ['c19', 'c20', 'c21', 'c22', 'c23', 'c24']])
+    settings.setdefault('treatments', ['BR1', 'BR2', 'BR3'])
+    settings.setdefault('treatment_loc', [['c1', 'c2', 'c7', 'c8', 'c13', 'c14', 'c19', 'c20'], ['c3', 'c4', 'c9', 'c10', 'c15', 'c16', 'c21', 'c22'], ['c5', 'c6', 'c11', 'c12', 'c17', 'c18', 'c23', 'c24']])
+    settings.setdefault('channel_of_interest', 2)
+    settings.setdefault('compartments', ['pathogen', 'cytoplasm'])
+    settings.setdefault('measurement', 'mean_intensity')
+    settings.setdefault('nr_imgs', 32)
+    settings.setdefault('um_per_pixel', 0.1)
+
+    if settings['test_mode']:
+        settings['plot'] = True
+        settings['plot_filtration'] = True
+        test_imgs = settings['test_nr']
+        print(f'Test mode enabled with {test_imgs} images, plotting set to True')
+
+    return settings
+
+def delete_folder(folder_path):
+    if os.path.exists(folder_path) and os.path.isdir(folder_path):
+        for root, dirs, files in os.walk(folder_path, topdown=False):
+            for name in files:
+                os.remove(os.path.join(root, name))
+            for name in dirs:
+                os.rmdir(os.path.join(root, name))
+        os.rmdir(folder_path)
+        print(f"Folder '{folder_path}' has been deleted.")
+    else:
+        print(f"Folder '{folder_path}' does not exist or is not a directory.")
+
+def measure_test_mode(settings):
+
+    if settings['test_mode']:
+        if not os.path.basename(settings['input_folder']) == 'test':
+            all_files = os.listdir(settings['input_folder'])
+            random_files = random.sample(all_files, settings['test_nr'])
+
+            src = os.path.join(os.path.dirname(settings['input_folder']),'test', 'merged')
+            if os.path.exists(src):
+                delete_folder(src)
+            os.makedirs(src, exist_ok=True)
+
+            for file in random_files:
+                shutil.copy(os.path.join(settings['input_folder'], file), os.path.join(src,file))
+
+            settings['input_folder'] = src
+            print(f'Changed source folder to {src} for test mode')
+        else:
+            print(f'Test mode enabled, using source folder {settings["input_folder"]}')
+
+    return settings
+
 def preprocess_data(df, filter_by, remove_highly_correlated, log_data, exclude):
     """
     Preprocesses the given dataframe by applying filtering, removing highly correlated columns,
     applying log transformation, filling NaN values, and scaling the numeric data.
 
     Args:
-
-
-
-
-
-
-
+    df (pandas.DataFrame): The input dataframe.
+    filter_by (str or None): The channel of interest to filter the dataframe by.
+    remove_highly_correlated (bool or float): Whether to remove highly correlated columns.
+        If a float is provided, it represents the correlation threshold.
+    log_data (bool): Whether to apply log transformation to the numeric data.
+    exclude (list or None): List of features to exclude from the filtering process.
+    verbose (bool): Whether to print verbose output during preprocessing.
 
     Returns:
-
+    numpy.ndarray: The preprocessed numeric data.
 
     Raises:
-
+    ValueError: If no numeric columns are available after filtering.
 
     """
     # Apply filtering based on the `filter_by` parameter
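Note on this hunk: get_measure_crop_settings fills defaults in place via dict.setdefault, so callers only specify the keys they want to override, and measure_test_mode redirects input_folder to a sampled 'test'/'merged' copy when test_mode is set. A minimal sketch of the call pattern (the input path is a placeholder):

settings = {'input_folder': '/path/to/merged', 'png_size': [128, 128], 'test_mode': True}
settings = get_measure_crop_settings(settings)   # fills every missing key
assert settings['png_size'] == [128, 128]        # the caller's value is kept
settings = measure_test_mode(settings)           # copies test_nr random files, repoints input_folder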
@@ -3927,13 +3839,8 @@ def filter_dataframe_features(df, channel_of_interest, exclude=None):
 
     Parameters:
     - df (pandas.DataFrame): The input dataframe to be filtered.
-    - channel_of_interest (str, int, list, None): The channel(s) of interest to filter the dataframe.
-
-        If an integer, only the specified channel is included. If a list, only the specified channels are included.
-        If a string, only the specified channel is included.
-    - exclude (str, list, None): The feature(s) to exclude from the filtered dataframe.
-        If None, no features are excluded. If a string, the specified feature is excluded.
-        If a list, the specified features are excluded.
+    - channel_of_interest (str, int, list, None): The channel(s) of interest to filter the dataframe. If None, no filtering is applied. If 'morphology', only morphology features are included. If an integer, only the specified channel is included. If a list, only the specified channels are included. If a string, only the specified channel is included.
+    - exclude (str, list, None): The feature(s) to exclude from the filtered dataframe. If None, no features are excluded. If a string, the specified feature is excluded. If a list, the specified features are excluded.
 
     Returns:
     - filtered_df (pandas.DataFrame): The filtered dataframe based on the specified parameters.
@@ -4063,6 +3970,326 @@ def search_reduction_and_clustering(numeric_data, n_neighbors, min_dist, metric,
     if verbose:
         print(f'Embedding shape: {embedding.shape}')
     return embedding, labels
+import torch
+import torchvision.transforms as transforms
+from torchvision.models import resnet50
+from PIL import Image
+import numpy as np
+import umap
+import pandas as pd
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.preprocessing import StandardScaler
+from scipy.stats import f_oneway, kruskal
+from sklearn.cluster import KMeans
+from scipy import stats
+
+def load_image(image_path):
+    """Load and preprocess an image."""
+    transform = transforms.Compose([
+        transforms.Resize((224, 224)),
+        transforms.ToTensor(),
+        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+    ])
+    image = Image.open(image_path).convert('RGB')
+    image = transform(image).unsqueeze(0)
+    return image
+
+def extract_features(image_paths, resnet=resnet50):
+    """Extract features from images using a pre-trained ResNet model."""
+    model = resnet(pretrained=True)
+    model = model.eval()
+    model = torch.nn.Sequential(*list(model.children())[:-1])  # Remove the last classification layer
+
+    features = []
+    for image_path in image_paths:
+        image = load_image(image_path)
+        with torch.no_grad():
+            feature = model(image).squeeze().numpy()
+        features.append(feature)
+
+    return np.array(features)
+
+def check_normality(series):
+    """Helper function to check if a feature is normally distributed."""
+    k2, p = stats.normaltest(series)
+    alpha = 0.05
+    if p < alpha:  # null hypothesis: x comes from a normal distribution
+        return False
+    return True
+
+def random_forest_feature_importance(all_df, cluster_col='cluster'):
+    """Random Forest feature importance."""
+    numeric_features = all_df.select_dtypes(include=[np.number]).columns.tolist()
+    if cluster_col in numeric_features:
+        numeric_features.remove(cluster_col)
+
+    X = all_df[numeric_features]
+    y = all_df[cluster_col]
+
+    scaler = StandardScaler()
+    X_scaled = scaler.fit_transform(X)
+
+    model = RandomForestClassifier(n_estimators=100, random_state=42)
+    model.fit(X_scaled, y)
+
+    feature_importances = model.feature_importances_
 
+    importance_df = pd.DataFrame({
+        'Feature': numeric_features,
+        'Importance': feature_importances
+    }).sort_values(by='Importance', ascending=False)
 
+    return importance_df
+
+def perform_statistical_tests(all_df, cluster_col='cluster'):
+    """Perform ANOVA or Kruskal-Wallis tests depending on normality of features."""
+    numeric_features = all_df.select_dtypes(include=[np.number]).columns.tolist()
+    if cluster_col in numeric_features:
+        numeric_features.remove(cluster_col)
+
+    anova_results = []
+    kruskal_results = []
+
+    for feature in numeric_features:
+        groups = [all_df[all_df[cluster_col] == label][feature] for label in np.unique(all_df[cluster_col])]
+
+        if check_normality(all_df[feature]):
+            stat, p = f_oneway(*groups)
+            anova_results.append((feature, stat, p))
+        else:
+            stat, p = kruskal(*groups)
+            kruskal_results.append((feature, stat, p))
+
+    anova_df = pd.DataFrame(anova_results, columns=['Feature', 'ANOVA_Statistic', 'ANOVA_pValue'])
+    kruskal_df = pd.DataFrame(kruskal_results, columns=['Feature', 'Kruskal_Statistic', 'Kruskal_pValue'])
+
+    return anova_df, kruskal_df
+
+def combine_results(rf_df, anova_df, kruskal_df):
+    """Combine the results into a single DataFrame."""
+    combined_df = rf_df.merge(anova_df, on='Feature', how='left')
+    combined_df = combined_df.merge(kruskal_df, on='Feature', how='left')
+    return combined_df
+
+def cluster_feature_analysis(all_df, cluster_col='cluster'):
+    """
+    Perform Random Forest feature importance, ANOVA for normally distributed features,
+    and Kruskal-Wallis for non-normally distributed features. Combine results into a single DataFrame.
+    """
+    rf_df = random_forest_feature_importance(all_df, cluster_col)
+    anova_df, kruskal_df = perform_statistical_tests(all_df, cluster_col)
+    combined_df = combine_results(rf_df, anova_df, kruskal_df)
+    return combined_df
+
+def _merge_cells_based_on_parasite_overlap(parasite_mask, cell_mask, nuclei_mask, overlap_threshold=5, perimeter_threshold=30):
+    """
+    Merge cells in cell_mask if a parasite in parasite_mask overlaps with more than one cell,
+    and if cells share more than a specified perimeter percentage.
 
+    Args:
+        parasite_mask (ndarray): Mask of parasites.
+        cell_mask (ndarray): Mask of cells.
+        nuclei_mask (ndarray): Mask of nuclei.
+        overlap_threshold (float): The percentage threshold for merging cells based on parasite overlap.
+        perimeter_threshold (float): The percentage threshold for merging cells based on shared perimeter.
+
+    Returns:
+        ndarray: The modified cell mask (cell_mask) with unique labels.
+    """
+    labeled_cells = label(cell_mask)
+    labeled_parasites = label(parasite_mask)
+    labeled_nuclei = label(nuclei_mask)
+    num_parasites = np.max(labeled_parasites)
+    num_cells = np.max(labeled_cells)
+    num_nuclei = np.max(labeled_nuclei)
+
+    # Merge cells based on parasite overlap
+    for parasite_id in range(1, num_parasites + 1):
+        current_parasite_mask = labeled_parasites == parasite_id
+        overlapping_cell_labels = np.unique(labeled_cells[current_parasite_mask])
+        overlapping_cell_labels = overlapping_cell_labels[overlapping_cell_labels != 0]
+        if len(overlapping_cell_labels) > 1:
+
+            # Calculate the overlap percentages
+            overlap_percentages = [
+                np.sum(current_parasite_mask & (labeled_cells == cell_label)) / np.sum(current_parasite_mask) * 100
+                for cell_label in overlapping_cell_labels
+            ]
+            # Merge cells if overlap percentage is above the threshold
+            for cell_label, overlap_percentage in zip(overlapping_cell_labels, overlap_percentages):
+                if overlap_percentage > overlap_threshold:
+                    first_label = overlapping_cell_labels[0]
+                    for other_label in overlapping_cell_labels[1:]:
+                        if other_label != first_label:
+                            cell_mask[cell_mask == other_label] = first_label
+
+    # Merge cells based on nucleus overlap
+    for nucleus_id in range(1, num_nuclei + 1):
+        current_nucleus_mask = labeled_nuclei == nucleus_id
+        overlapping_cell_labels = np.unique(labeled_cells[current_nucleus_mask])
+        overlapping_cell_labels = overlapping_cell_labels[overlapping_cell_labels != 0]
+        if len(overlapping_cell_labels) > 1:
+
+            # Calculate the overlap percentages
+            overlap_percentages = [
+                np.sum(current_nucleus_mask & (labeled_cells == cell_label)) / np.sum(current_nucleus_mask) * 100
+                for cell_label in overlapping_cell_labels
+            ]
+            # Merge cells if overlap percentage is above the threshold for each cell
+            if all(overlap_percentage > overlap_threshold for overlap_percentage in overlap_percentages):
+                first_label = overlapping_cell_labels[0]
+                for other_label in overlapping_cell_labels[1:]:
+                    if other_label != first_label:
+                        cell_mask[cell_mask == other_label] = first_label
+
+    # Check for cells without nuclei and merge based on shared perimeter
+    labeled_cells = label(cell_mask)  # Re-label after merging based on overlap
+    cell_regions = regionprops(labeled_cells)
+    for region in cell_regions:
+        cell_label = region.label
+        cell_mask_binary = labeled_cells == cell_label
+        overlapping_nuclei = np.unique(nuclei_mask[cell_mask_binary])
+        overlapping_nuclei = overlapping_nuclei[overlapping_nuclei != 0]
+
+        if len(overlapping_nuclei) == 0:
+
+            # Cell does not overlap with any nucleus
+            perimeter = region.perimeter
+
+            # Dilate the cell to find neighbors
+            dilated_cell = binary_dilation(cell_mask_binary, structure=square(3))
+            neighbor_cells = np.unique(labeled_cells[dilated_cell])
+            neighbor_cells = neighbor_cells[(neighbor_cells != 0) & (neighbor_cells != cell_label)]
+
+            # Calculate shared border length with neighboring cells
+            shared_borders = [
+                np.sum((labeled_cells == neighbor_label) & dilated_cell) for neighbor_label in neighbor_cells
+            ]
+            shared_border_percentages = [shared_border / perimeter * 100 for shared_border in shared_borders]
+
+            # Merge with the neighbor cell with the largest shared border percentage above the threshold
+            if shared_borders:
+                max_shared_border_index = np.argmax(shared_border_percentages)
+                max_shared_border_percentage = shared_border_percentages[max_shared_border_index]
+                if max_shared_border_percentage > perimeter_threshold:
+                    cell_mask[labeled_cells == cell_label] = neighbor_cells[max_shared_border_index]
+
+    # Relabel the merged cell mask
+    relabeled_cell_mask, _ = label(cell_mask, return_num=True)
+    return relabeled_cell_mask
+
+def adjust_cell_masks(parasite_folder, cell_folder, nuclei_folder, overlap_threshold=5, perimeter_threshold=30):
+
+    """
+    Process all npy files in the given folders. Merge and relabel cells in cell masks
+    based on parasite overlap and cell perimeter sharing conditions.
+
+    Args:
+        parasite_folder (str): Path to the folder containing parasite masks.
+        cell_folder (str): Path to the folder containing cell masks.
+        nuclei_folder (str): Path to the folder containing nuclei masks.
+        overlap_threshold (float): The percentage threshold for merging cells based on parasite overlap.
+        perimeter_threshold (float): The percentage threshold for merging cells based on shared perimeter.
+    """
+
+    parasite_files = sorted([f for f in os.listdir(parasite_folder) if f.endswith('.npy')])
+    cell_files = sorted([f for f in os.listdir(cell_folder) if f.endswith('.npy')])
+    nuclei_files = sorted([f for f in os.listdir(nuclei_folder) if f.endswith('.npy')])
+
+    # Ensure there are matching files in all folders
+    if not (len(parasite_files) == len(cell_files) == len(nuclei_files)):
+        raise ValueError("The number of files in the folders do not match.")
+
+    # Match files by name
+    for file_name in parasite_files:
+        parasite_path = os.path.join(parasite_folder, file_name)
+        cell_path = os.path.join(cell_folder, file_name)
+        nuclei_path = os.path.join(nuclei_folder, file_name)
+        # Check if the corresponding cell and nuclei mask files exist
+        if not (os.path.exists(cell_path) and os.path.exists(nuclei_path)):
+            raise ValueError(f"Corresponding cell or nuclei mask file for {file_name} not found.")
+        # Load the masks
+        parasite_mask = np.load(parasite_path)
+        cell_mask = np.load(cell_path)
+        nuclei_mask = np.load(nuclei_path)
+        # Merge and relabel cells
+        merged_cell_mask = _merge_cells_based_on_parasite_overlap(parasite_mask, cell_mask, nuclei_mask, overlap_threshold, perimeter_threshold)
+
+        # Force 16 bit
+        merged_cell_mask = merged_cell_mask.astype(np.uint16)
+
+        # Overwrite the original cell mask file with the merged result
+        np.save(cell_path, merged_cell_mask)
+
+def process_masks(mask_folder, image_folder, channel, batch_size=50, n_clusters=2, plot=False):
+
+    def read_files_in_batches(folder, batch_size=50):
+        files = [f for f in os.listdir(folder) if f.endswith('.npy')]
+        files.sort()  # Sort to ensure matching order
+        for i in range(0, len(files), batch_size):
+            yield files[i:i + batch_size]
+
+    def measure_morphology_and_intensity(mask, image):
+        properties = measure.regionprops(mask, intensity_image=image)
+        properties_list = [{'area': p.area, 'mean_intensity': p.mean_intensity, 'perimeter': p.perimeter, 'eccentricity': p.eccentricity} for p in properties]
+        return properties_list
+
+    def cluster_objects(properties, n_clusters=2):
+        data = np.array([[p['area'], p['mean_intensity'], p['perimeter'], p['eccentricity']] for p in properties])
+        kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(data)
+        return kmeans
+
+    def remove_objects_not_in_largest_cluster(mask, labels, largest_cluster_label):
+        cleaned_mask = np.zeros_like(mask)
+        for region in measure.regionprops(mask):
+            if labels[region.label - 1] == largest_cluster_label:
+                cleaned_mask[mask == region.label] = region.label
+        return cleaned_mask
+
+    def plot_clusters(properties, labels):
+        data = np.array([[p['area'], p['mean_intensity'], p['perimeter'], p['eccentricity']] for p in properties])
+        pca = PCA(n_components=2)
+        data_2d = pca.fit_transform(data)
+        plt.scatter(data_2d[:, 0], data_2d[:, 1], c=labels, cmap='viridis')
+        plt.xlabel('PCA Component 1')
+        plt.ylabel('PCA Component 2')
+        plt.title('Object Clustering')
+        plt.show()
+
+    all_properties = []
+
+    # Step 1: Accumulate properties over all files
+    for batch in read_files_in_batches(mask_folder, batch_size):
+        mask_files = [os.path.join(mask_folder, file) for file in batch]
+        image_files = [os.path.join(image_folder, file) for file in batch]
+
+        masks = [np.load(file) for file in mask_files]
+        images = [np.load(file)[:, :, channel] for file in image_files]
+
+        for i, mask in enumerate(masks):
+            image = images[i]
+            # Measure morphology and intensity
+            properties = measure_morphology_and_intensity(mask, image)
+            all_properties.extend(properties)
+
+    # Step 2: Perform clustering on accumulated properties
+    kmeans = cluster_objects(all_properties, n_clusters)
+    labels = kmeans.labels_
+
+    if plot:
+        # Step 3: Plot clusters using PCA
+        plot_clusters(all_properties, labels)
+
+    # Step 4: Remove objects not in the largest cluster and overwrite files in batches
+    label_index = 0
+    for batch in read_files_in_batches(mask_folder, batch_size):
+        mask_files = [os.path.join(mask_folder, file) for file in batch]
+        masks = [np.load(file) for file in mask_files]
+
+        for i, mask in enumerate(masks):
+            batch_properties = measure_morphology_and_intensity(mask, mask)
+            batch_labels = labels[label_index:label_index + len(batch_properties)]
+            largest_cluster_label = np.bincount(batch_labels).argmax()
+            cleaned_mask = remove_objects_not_in_largest_cluster(mask, batch_labels, largest_cluster_label)
+            np.save(mask_files[i], cleaned_mask)
+            label_index += len(batch_properties)