gsMap 1.71__py3-none-any.whl → 1.71.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gsMap/GNN/__init__.py CHANGED
File without changes
@@ -1,75 +1,75 @@
1
- import numpy as np
2
- import pandas as pd
3
- import scipy.sparse as sp
4
- from sklearn.neighbors import NearestNeighbors
5
- import torch
6
-
7
- def cal_spatial_net(adata, n_neighbors=5, verbose=True):
8
- """Construct the spatial neighbor network."""
9
- if verbose:
10
- print('------Calculating spatial graph...')
11
- coor = pd.DataFrame(adata.obsm['spatial'], index=adata.obs.index)
12
- nbrs = NearestNeighbors(n_neighbors=n_neighbors).fit(coor)
13
- distances, indices = nbrs.kneighbors(coor)
14
- n_cells, n_neighbors = indices.shape
15
- cell_indices = np.arange(n_cells)
16
- cell1 = np.repeat(cell_indices, n_neighbors)
17
- cell2 = indices.flatten()
18
- distance = distances.flatten()
19
- knn_df = pd.DataFrame({'Cell1': cell1, 'Cell2': cell2, 'Distance': distance})
20
- knn_df = knn_df[knn_df['Distance'] > 0].copy()
21
- cell_id_map = dict(zip(cell_indices, coor.index))
22
- knn_df['Cell1'] = knn_df['Cell1'].map(cell_id_map)
23
- knn_df['Cell2'] = knn_df['Cell2'].map(cell_id_map)
24
- return knn_df
25
-
26
- def sparse_mx_to_torch_sparse_tensor(sparse_mx):
27
- """Convert a scipy sparse matrix to a torch sparse tensor."""
28
- sparse_mx = sparse_mx.tocoo().astype(np.float32)
29
- indices = torch.from_numpy(
30
- np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64)
31
- )
32
- values = torch.from_numpy(sparse_mx.data)
33
- shape = torch.Size(sparse_mx.shape)
34
- return torch.sparse_coo_tensor(indices, values, shape)
35
-
36
- def preprocess_graph(adj):
37
- """Symmetrically normalize the adjacency matrix."""
38
- adj = sp.coo_matrix(adj)
39
- adj_ = adj + sp.eye(adj.shape[0])
40
- rowsum = np.array(adj_.sum(1)).flatten()
41
- degree_mat_inv_sqrt = sp.diags(np.power(rowsum, -0.5))
42
- adj_normalized = adj_.dot(degree_mat_inv_sqrt).transpose().dot(degree_mat_inv_sqrt).tocoo()
43
- return sparse_mx_to_torch_sparse_tensor(adj_normalized)
44
-
45
- def construct_adjacency_matrix(adata, params, verbose=True):
46
- """Construct the adjacency matrix from spatial data."""
47
- spatial_net = cal_spatial_net(adata, n_neighbors=params.n_neighbors, verbose=verbose)
48
- if verbose:
49
- num_edges = spatial_net.shape[0]
50
- num_cells = adata.n_obs
51
- print(f'The graph contains {num_edges} edges, {num_cells} cells.')
52
- print(f'{num_edges / num_cells:.2f} neighbors per cell on average.')
53
- cell_ids = {cell: idx for idx, cell in enumerate(adata.obs.index)}
54
- spatial_net['Cell1'] = spatial_net['Cell1'].map(cell_ids)
55
- spatial_net['Cell2'] = spatial_net['Cell2'].map(cell_ids)
56
- if params.weighted_adj:
57
- distance_normalized = spatial_net['Distance'] / (spatial_net['Distance'].max() + 1)
58
- weights = np.exp(-0.5 * distance_normalized ** 2)
59
- adj_org = sp.coo_matrix(
60
- (weights, (spatial_net['Cell1'], spatial_net['Cell2'])),
61
- shape=(adata.n_obs, adata.n_obs)
62
- )
63
- else:
64
- adj_org = sp.coo_matrix(
65
- (np.ones(spatial_net.shape[0]), (spatial_net['Cell1'], spatial_net['Cell2'])),
66
- shape=(adata.n_obs, adata.n_obs)
67
- )
68
- adj_norm = preprocess_graph(adj_org)
69
- norm_value = adj_org.shape[0] ** 2 / ((adj_org.shape[0] ** 2 - adj_org.sum()) * 2)
70
- graph_dict = {
71
- "adj_org": adj_org,
72
- "adj_norm": adj_norm,
73
- "norm_value": norm_value
74
- }
75
- return graph_dict
1
+ import numpy as np
2
+ import pandas as pd
3
+ import scipy.sparse as sp
4
+ from sklearn.neighbors import NearestNeighbors
5
+ import torch
6
+
7
def cal_spatial_net(adata, n_neighbors=5, verbose=True):
    """Build the k-nearest-neighbour edge list of the spatial coordinates.

    Parameters
    ----------
    adata
        Object exposing ``obsm['spatial']`` (the coordinates) and
        ``obs.index`` (the cell labels).
    n_neighbors : int
        Forwarded to ``NearestNeighbors``.
    verbose : bool
        When true, print a progress message.

    Returns
    -------
    pandas.DataFrame
        One row per retained neighbour pair with columns ``Cell1`` and
        ``Cell2`` (cell labels) and ``Distance``. Pairs whose distance is
        zero are dropped.
    """
    if verbose:
        print('------Calculating spatial graph...')

    coords = pd.DataFrame(adata.obsm['spatial'], index=adata.obs.index)
    knn = NearestNeighbors(n_neighbors=n_neighbors)
    knn.fit(coords)
    dist_mat, nbr_mat = knn.kneighbors(coords)

    # One row per (cell, neighbour) pair, still in positional indices.
    n_cells, k = nbr_mat.shape
    edges = pd.DataFrame({
        'Cell1': np.repeat(np.arange(n_cells), k),
        'Cell2': nbr_mat.ravel(),
        'Distance': dist_mat.ravel(),
    })

    # Zero-distance pairs are discarded; presumably this removes each cell's
    # match with itself (it also removes exact coordinate duplicates).
    edges = edges.loc[edges['Distance'] > 0].copy()

    # Translate positional indices back to the cell labels.
    position_to_label = dict(enumerate(coords.index))
    for column in ('Cell1', 'Cell2'):
        edges[column] = edges[column].map(position_to_label)
    return edges
25
+
26
def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix into a float32 torch sparse COO tensor.

    Parameters
    ----------
    sparse_mx
        Any scipy sparse matrix providing ``tocoo()``.

    Returns
    -------
    torch.Tensor
        Sparse COO tensor with int64 indices, float32 values and the same
        shape as ``sparse_mx``.
    """
    coo = sparse_mx.tocoo().astype(np.float32)
    # Row 0 holds the row indices, row 1 the column indices.
    row_col = np.vstack((coo.row, coo.col)).astype(np.int64)
    return torch.sparse_coo_tensor(
        torch.from_numpy(row_col),
        torch.from_numpy(coo.data),
        torch.Size(coo.shape),
    )
35
+
36
def preprocess_graph(adj):
    """Add self-loops to ``adj`` and degree-normalize it.

    With ``A_hat = adj + I`` and ``D`` the diagonal matrix of the row sums
    of ``A_hat``, the result is ``(A_hat @ D^-1/2).T @ D^-1/2``, which
    equals ``D^-1/2 @ A_hat @ D^-1/2`` when ``adj`` is symmetric.

    Parameters
    ----------
    adj
        Square adjacency matrix accepted by ``scipy.sparse.coo_matrix``.

    Returns
    -------
    torch.Tensor
        The normalized matrix as a torch sparse tensor.
    """
    adj_coo = sp.coo_matrix(adj)
    with_self_loops = adj_coo + sp.eye(adj_coo.shape[0])
    degree = np.array(with_self_loops.sum(1)).flatten()
    d_inv_sqrt = sp.diags(np.power(degree, -0.5))
    scaled_columns = with_self_loops.dot(d_inv_sqrt)
    normalized = scaled_columns.transpose().dot(d_inv_sqrt).tocoo()
    return sparse_mx_to_torch_sparse_tensor(normalized)
44
+
45
def construct_adjacency_matrix(adata, params, verbose=True):
    """Build the raw and normalized spatial adjacency matrices.

    Parameters
    ----------
    adata
        Object exposing ``obsm['spatial']``, ``obs.index`` and ``n_obs``.
    params
        Object exposing ``n_neighbors`` and ``weighted_adj``. When
        ``weighted_adj`` is truthy the edges are weighted by
        ``exp(-0.5 * d**2)`` with ``d = Distance / (max(Distance) + 1)``;
        otherwise every edge has weight 1.
    verbose : bool
        When true, print graph statistics.

    Returns
    -------
    dict
        ``adj_org`` (scipy COO matrix), ``adj_norm`` (output of
        ``preprocess_graph``) and ``norm_value``
        (``n**2 / (2 * (n**2 - adj_org.sum()))``).
    """
    edges = cal_spatial_net(adata, n_neighbors=params.n_neighbors, verbose=verbose)

    if verbose:
        num_edges = edges.shape[0]
        num_cells = adata.n_obs
        print(f'The graph contains {num_edges} edges, {num_cells} cells.')
        print(f'{num_edges / num_cells:.2f} neighbors per cell on average.')

    # The edge list is keyed by cell label; map labels to matrix positions.
    label_to_position = {label: pos for pos, label in enumerate(adata.obs.index)}
    rows = edges['Cell1'].map(label_to_position)
    cols = edges['Cell2'].map(label_to_position)

    if params.weighted_adj:
        scaled_distance = edges['Distance'] / (edges['Distance'].max() + 1)
        values = np.exp(-0.5 * scaled_distance ** 2)
    else:
        values = np.ones(edges.shape[0])

    adj_org = sp.coo_matrix((values, (rows, cols)), shape=(adata.n_obs, adata.n_obs))
    adj_norm = preprocess_graph(adj_org)

    n_entries = adj_org.shape[0] ** 2
    norm_value = n_entries / ((n_entries - adj_org.sum()) * 2)

    return {
        "adj_org": adj_org,
        "adj_norm": adj_norm,
        "norm_value": norm_value,
    }
gsMap/GNN/model.py CHANGED
@@ -1,89 +1,90 @@
1
- import torch
2
- import torch.nn as nn
3
- import torch.nn.functional as F
4
- from torch_geometric.nn import GATConv
5
-
6
- def full_block(in_features, out_features, p_drop):
7
- return nn.Sequential(
8
- nn.Linear(in_features, out_features),
9
- nn.BatchNorm1d(out_features),
10
- nn.ELU(),
11
- nn.Dropout(p=p_drop)
12
- )
13
-
14
- class GATModel(nn.Module):
15
- def __init__(self, input_dim, params, num_classes=1):
16
- super().__init__()
17
- self.var = params.var
18
- self.num_classes = num_classes
19
- self.params = params
20
-
21
- # Encoder
22
- self.encoder = nn.Sequential(
23
- full_block(input_dim, params.feat_hidden1, params.p_drop),
24
- full_block(params.feat_hidden1, params.feat_hidden2, params.p_drop)
25
- )
26
-
27
- # GAT Layers
28
- self.gat1 = GATConv(
29
- in_channels=params.feat_hidden2,
30
- out_channels=params.gat_hidden1,
31
- heads=params.nheads,
32
- dropout=params.p_drop
33
- )
34
- self.gat2 = GATConv(
35
- in_channels=params.gat_hidden1 * params.nheads,
36
- out_channels=params.gat_hidden2,
37
- heads=1,
38
- concat=False,
39
- dropout=params.p_drop
40
- )
41
- if self.var:
42
- self.gat3 = GATConv(
43
- in_channels=params.gat_hidden1 * params.nheads,
44
- out_channels=params.gat_hidden2,
45
- heads=1,
46
- concat=False,
47
- dropout=params.p_drop
48
- )
49
-
50
- # Decoder
51
- self.decoder = nn.Sequential(
52
- full_block(params.gat_hidden2, params.feat_hidden2, params.p_drop),
53
- full_block(params.feat_hidden2, params.feat_hidden1, params.p_drop),
54
- nn.Linear(params.feat_hidden1, input_dim)
55
- )
56
-
57
- # Clustering Layer
58
- self.cluster = nn.Sequential(
59
- full_block(params.gat_hidden2, params.feat_hidden2, params.p_drop),
60
- nn.Linear(params.feat_hidden2, self.num_classes)
61
- )
62
-
63
- def encode(self, x, edge_index):
64
- x = self.encoder(x)
65
- x = self.gat1(x, edge_index)
66
- x = F.relu(x)
67
- x = F.dropout(x, p=self.params.p_drop, training=self.training)
68
-
69
- mu = self.gat2(x, edge_index)
70
- if self.var:
71
- logvar = self.gat3(x, edge_index)
72
- return mu, logvar
73
- else:
74
- return mu, None
75
-
76
- def reparameterize(self, mu, logvar):
77
- if self.training and logvar is not None:
78
- std = torch.exp(0.5 * logvar)
79
- eps = torch.randn_like(std)
80
- return eps * std + mu
81
- else:
82
- return mu
83
-
84
- def forward(self, x, edge_index):
85
- mu, logvar = self.encode(x, edge_index)
86
- z = self.reparameterize(mu, logvar)
87
- x_reconstructed = self.decoder(z)
88
- pred_label = F.softmax(self.cluster(z), dim=1)
89
- return pred_label, x_reconstructed, z, mu, logvar
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torch_geometric.nn import GATConv
5
+
6
def full_block(in_features, out_features, p_drop):
    """Return a Linear -> BatchNorm1d -> ELU -> Dropout block.

    Parameters
    ----------
    in_features : int
        Input width of the linear layer.
    out_features : int
        Output width of the linear layer and of the batch norm.
    p_drop : float
        Dropout probability.
    """
    layers = [
        nn.Linear(in_features, out_features),
        nn.BatchNorm1d(out_features),
        nn.ELU(),
        nn.Dropout(p=p_drop),
    ]
    return nn.Sequential(*layers)
13
+
14
class GATModel(nn.Module):
    """Feed-forward encoder, two GAT layers, a decoder and a cluster head.

    ``params`` must expose ``var``, ``p_drop``, ``feat_hidden1``,
    ``feat_hidden2``, ``gat_hidden1``, ``gat_hidden2`` and ``nheads``.
    When ``params.var`` is truthy an additional GAT layer produces a
    log-variance and the latent is sampled during training.
    """

    def __init__(self, input_dim, params, num_classes=1):
        super().__init__()
        self.var = params.var
        self.num_classes = num_classes
        self.params = params

        dropout = params.p_drop
        # Width of gat1's output (it is built with the default head concatenation).
        gat1_out = params.gat_hidden1 * params.nheads
        latent_gat_kwargs = dict(
            in_channels=gat1_out,
            out_channels=params.gat_hidden2,
            heads=1,
            concat=False,
            dropout=dropout,
        )

        # Encoder
        self.encoder = nn.Sequential(
            full_block(input_dim, params.feat_hidden1, dropout),
            full_block(params.feat_hidden1, params.feat_hidden2, dropout),
        )

        # GAT layers: gat2 yields the mean, optional gat3 the log-variance.
        self.gat1 = GATConv(
            in_channels=params.feat_hidden2,
            out_channels=params.gat_hidden1,
            heads=params.nheads,
            dropout=dropout,
        )
        self.gat2 = GATConv(**latent_gat_kwargs)
        if self.var:
            self.gat3 = GATConv(**latent_gat_kwargs)

        # Decoder: latent -> reconstruction of the input features.
        self.decoder = nn.Sequential(
            full_block(params.gat_hidden2, params.feat_hidden2, dropout),
            full_block(params.feat_hidden2, params.feat_hidden1, dropout),
            nn.Linear(params.feat_hidden1, input_dim),
        )

        # Clustering head: latent -> ``num_classes`` outputs.
        self.cluster = nn.Sequential(
            full_block(params.gat_hidden2, params.feat_hidden2, dropout),
            nn.Linear(params.feat_hidden2, self.num_classes),
        )

    def encode(self, x, edge_index):
        """Return ``(mu, logvar)``; ``logvar`` is ``None`` unless ``self.var``."""
        hidden = self.gat1(self.encoder(x), edge_index)
        hidden = F.dropout(
            F.relu(hidden), p=self.params.p_drop, training=self.training
        )
        mu = self.gat2(hidden, edge_index)
        logvar = self.gat3(hidden, edge_index) if self.var else None
        return mu, logvar

    def reparameterize(self, mu, logvar):
        """Sample ``mu + eps * std`` in training mode; otherwise return ``mu``."""
        if not self.training or logvar is None:
            return mu
        std = torch.exp(0.5 * logvar)
        noise = torch.randn_like(std)
        return noise * std + mu

    def forward(self, x, edge_index):
        """Return ``(pred_label, x_reconstructed, z, mu, logvar)``.

        ``pred_label`` is the raw output of the cluster head; no softmax is
        applied here.
        """
        mu, logvar = self.encode(x, edge_index)
        z = self.reparameterize(mu, logvar)
        x_reconstructed = self.decoder(z)
        pred_label = self.cluster(z)
        return pred_label, x_reconstructed, z, mu, logvar
gsMap/GNN/train.py CHANGED
File without changes
gsMap/__init__.py CHANGED
@@ -1,5 +1,5 @@
1
- '''
2
- Genetics-informed pathogenic spatial mapping
3
- '''
4
-
5
- __version__ = '1.71'
1
+ '''
2
+ Genetics-informed pathogenic spatial mapping
3
+ '''
4
+
5
+ __version__ = '1.71.1'
gsMap/__main__.py CHANGED
@@ -1,3 +1,3 @@
1
- from .main import main
2
- if __name__ == '__main__':
1
+ from .main import main
2
+ if __name__ == '__main__':
3
3
  main()
@@ -1,141 +1,141 @@
1
- import logging
2
- from pathlib import Path
3
-
4
- import numpy as np
5
- import pandas as pd
6
- import scanpy as sc
7
- import scipy as sp
8
-
9
- from gsMap.config import CauchyCombinationConfig
10
-
11
- logger = logging.getLogger(__name__)
12
-
13
- # The fun of cauchy combination
14
- def acat_test(pvalues, weights=None):
15
- '''acat_test()
16
- Aggregated Cauchy Assocaition Test
17
- A p-value combination method using the Cauchy distribution.
18
-
19
- Inspired by: https://github.com/yaowuliu/ACAT/blob/master/R/ACAT.R
20
- Inputs:
21
- pvalues: <list or numpy array>
22
- The p-values you want to combine.
23
- weights: <list or numpy array>, default=None
24
- The weights for each of the p-values. If None, equal weights are used.
25
-
26
- Returns:
27
- pval: <float>
28
- The ACAT combined p-value.
29
- '''
30
- if any(np.isnan(pvalues)):
31
- raise Exception("Cannot have NAs in the p-values.")
32
- if any([(i > 1) | (i < 0) for i in pvalues]):
33
- raise Exception("P-values must be between 0 and 1.")
34
- if any([i == 1 for i in pvalues]) & any([i == 0 for i in pvalues]):
35
- raise Exception("Cannot have both 0 and 1 p-values.")
36
- if any([i == 0 for i in pvalues]):
37
- logger.info("Warn: p-values are exactly 0.")
38
- return 0
39
- if any([i == 1 for i in pvalues]):
40
- logger.info("Warn: p-values are exactly 1.")
41
- return 1
42
- if weights == None:
43
- weights = [1 / len(pvalues) for i in pvalues]
44
- elif len(weights) != len(pvalues):
45
- raise Exception("Length of weights and p-values differs.")
46
- elif any([i < 0 for i in weights]):
47
- raise Exception("All weights must be positive.")
48
- else:
49
- weights = [i / len(weights) for i in weights]
50
-
51
- pvalues = np.array(pvalues)
52
- weights = np.array(weights)
53
-
54
- if any([i < 1e-16 for i in pvalues]) == False:
55
- cct_stat = sum(weights * np.tan((0.5 - pvalues) * np.pi))
56
- else:
57
- is_small = [i < (1e-16) for i in pvalues]
58
- is_large = [i >= (1e-16) for i in pvalues]
59
- cct_stat = sum((weights[is_small] / pvalues[is_small]) / np.pi)
60
- cct_stat += sum(weights[is_large] * np.tan((0.5 - pvalues[is_large]) * np.pi))
61
-
62
- if cct_stat > 1e15:
63
- pval = (1 / cct_stat) / np.pi
64
- else:
65
- pval = 1 - sp.stats.cauchy.cdf(cct_stat)
66
-
67
- return pval
68
-
69
-
70
- def run_Cauchy_combination(config:CauchyCombinationConfig):
71
- # Load the ldsc results
72
- logger.info(f'------Loading LDSC results of {config.ldsc_save_dir}...')
73
- ldsc_input_file= config.get_ldsc_result_file(config.trait_name)
74
- ldsc = pd.read_csv(ldsc_input_file, compression='gzip')
75
- ldsc.spot = ldsc.spot.astype(str).replace('\.0', '', regex=True)
76
- ldsc.index = ldsc.spot
77
- if config.meta is None:
78
- # Load the spatial data
79
- logger.info(f'------Loading ST data of {config.hdf5_with_latent_path}...')
80
- spe = sc.read_h5ad(f'{config.hdf5_with_latent_path}')
81
-
82
- common_cell = np.intersect1d(ldsc.index, spe.obs_names)
83
- spe = spe[common_cell]
84
- ldsc = ldsc.loc[common_cell]
85
-
86
- # Add the annotation
87
- ldsc['annotation'] = spe.obs.loc[ldsc.spot][config.annotation].to_list()
88
-
89
- elif config.meta is not None:
90
- # Or Load the additional annotation (just for the macaque data at this stage: 2023Nov25)
91
- logger.info(f'------Loading additional annotation...')
92
- meta = pd.read_csv(config.meta, index_col=0)
93
- meta = meta.loc[meta.slide == config.slide]
94
- meta.index = meta.cell_id.astype(str).replace('\.0', '', regex=True)
95
-
96
- common_cell = np.intersect1d(ldsc.index, meta.index)
97
- meta = meta.loc[common_cell]
98
- ldsc = ldsc.loc[common_cell]
99
-
100
- # Add the annotation
101
- ldsc['annotation'] = meta.loc[ldsc.spot][config.annotation].to_list()
102
- # Perform the Cauchy combination based on the given annotations
103
- p_cauchy = []
104
- p_median = []
105
- for ct in np.unique(ldsc.annotation):
106
- p_temp = ldsc.loc[ldsc['annotation'] == ct, 'p']
107
-
108
- # The Cauchy test is sensitive to very small p-values, so extreme outliers should be considered for removal...
109
- # to enhance robustness, particularly in cases where spot annotations may be incorrect.
110
- # p_cauchy_temp = acat_test(p_temp[p_temp != np.min(p_temp)])
111
- p_temp_log = -np.log10(p_temp)
112
- median_log = np.median(p_temp_log)
113
- IQR_log = np.percentile(p_temp_log, 75) - np.percentile(p_temp_log, 25)
114
-
115
- p_use = p_temp[p_temp_log < median_log + 3*IQR_log]
116
- n_remove = len(p_temp) - len(p_use)
117
-
118
- # Outlier: -log10(p) < median + 3IQR && len(outlier set) < 20
119
- if (0 < n_remove < 20):
120
- logger.info(f'Remove {n_remove}/{len(p_temp)} outliers (median + 3IQR) for {ct}.')
121
- p_cauchy_temp = acat_test(p_use)
122
- else:
123
- p_cauchy_temp = acat_test(p_temp)
124
-
125
- p_median_temp = np.median(p_temp)
126
-
127
- p_cauchy.append(p_cauchy_temp)
128
- p_median.append(p_median_temp)
129
- # p_tissue = pd.DataFrame(p_cauchy,p_median,np.unique(ldsc.annotation))
130
- data = {'p_cauchy': p_cauchy, 'p_median': p_median, 'annotation': np.unique(ldsc.annotation)}
131
- p_tissue = pd.DataFrame(data)
132
- p_tissue.columns = ['p_cauchy', 'p_median', 'annotation']
133
- # Save the results
134
- output_dir = Path(config.cauchy_save_dir)
135
- output_dir.mkdir(parents=True, exist_ok=True, mode=0o755)
136
- output_file = output_dir / f'{config.sample_name}_{config.trait_name}.Cauchy.csv.gz'
137
- p_tissue.to_csv(
138
- output_file,
139
- compression='gzip',
140
- index=False,
141
- )
1
+ import logging
2
+ from pathlib import Path
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+ import scanpy as sc
7
+ import scipy as sp
8
+
9
+ from gsMap.config import CauchyCombinationConfig
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ # The Cauchy combination function
14
def acat_test(pvalues, weights=None):
    """Aggregated Cauchy Association Test (ACAT).

    Combine p-values through a weighted sum of Cauchy-transformed values.
    Inspired by: https://github.com/yaowuliu/ACAT/blob/master/R/ACAT.R

    Parameters
    ----------
    pvalues : list, numpy array or pandas Series
        The p-values to combine; each must lie in [0, 1].
    weights : list or numpy array, optional
        Non-negative weight of each p-value. They are rescaled to sum to 1.
        If None, equal weights are used.

    Returns
    -------
    float
        The combined p-value. ``0`` (or ``1``) is returned directly when any
        input p-value is exactly 0 (or exactly 1).

    Raises
    ------
    ValueError
        If a p-value is NaN or outside [0, 1], if both 0 and 1 occur, if the
        weights have the wrong length, contain a negative value, or are all
        zero.

    Notes
    -----
    An empty input yields a zero statistic and therefore 0.5; callers should
    not pass empty input.
    """
    # Imported here so the function does not depend on ``scipy.stats`` being
    # reachable as an attribute of a bare ``import scipy``.
    from scipy.stats import cauchy

    pvalues = np.asarray(pvalues, dtype=float)
    n_pvalues = pvalues.size

    if np.isnan(pvalues).any():
        raise ValueError("Cannot have NAs in the p-values.")
    if ((pvalues > 1) | (pvalues < 0)).any():
        raise ValueError("P-values must be between 0 and 1.")

    has_zero = bool((pvalues == 0).any())
    has_one = bool((pvalues == 1).any())
    if has_zero and has_one:
        raise ValueError("Cannot have both 0 and 1 p-values.")
    if has_zero:
        logger.info("Warn: p-values are exactly 0.")
        return 0
    if has_one:
        logger.info("Warn: p-values are exactly 1.")
        return 1

    if weights is None:
        # Guard the division so that empty input does not raise.
        weights = np.full(n_pvalues, 1.0 / n_pvalues) if n_pvalues else np.zeros(0)
    else:
        weights = np.asarray(weights, dtype=float)
        if weights.size != n_pvalues:
            raise ValueError("Length of weights and p-values differs.")
        if (weights < 0).any():
            raise ValueError("All weights must be positive.")
        total = weights.sum()
        if n_pvalues and total == 0:
            raise ValueError("At least one weight must be greater than zero.")
        if total > 0:
            # The weights must sum to 1 for the statistic to be standard
            # Cauchy under the null.
            weights = weights / total

    # tan((0.5 - p) * pi) is numerically unstable for tiny p; use its
    # leading-order approximation 1 / (p * pi) there.
    is_small = pvalues < 1e-16
    cct_stat = np.sum(weights[is_small] / pvalues[is_small] / np.pi)
    cct_stat += np.sum(
        weights[~is_small] * np.tan((0.5 - pvalues[~is_small]) * np.pi)
    )

    if cct_stat > 1e15:
        # Tail approximation of the Cauchy survival function.
        pval = (1 / cct_stat) / np.pi
    else:
        # ``sf`` keeps precision that ``1 - cdf`` loses for small p-values.
        pval = cauchy.sf(cct_stat)

    return float(pval)
68
+
69
+
70
def run_Cauchy_combination(config:CauchyCombinationConfig):
    """Combine per-spot LDSC p-values into one p-value per annotation.

    Reads the gzip-compressed LDSC result returned by
    ``config.get_ldsc_result_file(config.trait_name)``, attaches an
    annotation to every spot, runs ``acat_test`` on the ``p`` column of each
    annotation group and writes a gzip-compressed CSV with the columns
    ``p_cauchy``, ``p_median`` and ``annotation`` to
    ``<config.cauchy_save_dir>/<sample_name>_<trait_name>.Cauchy.csv.gz``.

    The annotation column ``config.annotation`` is taken from the ``obs``
    table of the h5ad file at ``config.hdf5_with_latent_path`` when
    ``config.meta`` is None; otherwise from the CSV at ``config.meta``,
    restricted to the rows whose ``slide`` equals ``config.slide``.
    """
    # Load the LDSC results
    logger.info(f'------Loading LDSC results of {config.ldsc_save_dir}...')
    ldsc_input_file = config.get_ldsc_result_file(config.trait_name)
    ldsc = pd.read_csv(ldsc_input_file, compression='gzip')
    # Strip only a trailing ".0" (IDs that were parsed as floats). The pattern
    # is anchored so that a ".0" inside an ID is left untouched.
    ldsc['spot'] = ldsc['spot'].astype(str).replace(r'\.0$', '', regex=True)
    ldsc.index = ldsc.spot

    if config.meta is None:
        # Load the spatial data
        logger.info(f'------Loading ST data of {config.hdf5_with_latent_path}...')
        spe = sc.read_h5ad(f'{config.hdf5_with_latent_path}')

        common_cell = np.intersect1d(ldsc.index, spe.obs_names)
        spe = spe[common_cell]
        ldsc = ldsc.loc[common_cell]

        # Add the annotation
        ldsc['annotation'] = spe.obs.loc[ldsc.spot][config.annotation].to_list()
    else:
        # Or load the additional annotation (just for the macaque data at this stage: 2023Nov25)
        logger.info('------Loading additional annotation...')
        meta = pd.read_csv(config.meta, index_col=0)
        meta = meta.loc[meta.slide == config.slide]
        meta.index = meta.cell_id.astype(str).replace(r'\.0$', '', regex=True)

        common_cell = np.intersect1d(ldsc.index, meta.index)
        meta = meta.loc[common_cell]
        ldsc = ldsc.loc[common_cell]

        # Add the annotation
        ldsc['annotation'] = meta.loc[ldsc.spot][config.annotation].to_list()

    # Perform the Cauchy combination based on the given annotations
    annotations = np.unique(ldsc.annotation)
    p_cauchy = []
    p_median = []
    for ct in annotations:
        p_temp = ldsc.loc[ldsc['annotation'] == ct, 'p']

        # The Cauchy test is sensitive to very small p-values, so extreme
        # outliers are considered for removal to enhance robustness,
        # particularly in cases where spot annotations may be incorrect.
        p_temp_log = -np.log10(p_temp)
        median_log = np.median(p_temp_log)
        IQR_log = np.percentile(p_temp_log, 75) - np.percentile(p_temp_log, 25)

        # Keep spots with -log10(p) < median + 3*IQR; the rest are outliers.
        p_use = p_temp[p_temp_log < median_log + 3 * IQR_log]
        n_remove = len(p_temp) - len(p_use)

        # Drop the outliers only when there are fewer than 20 of them and at
        # least one p-value survives. With a zero IQR (e.g. a single-spot
        # group) the filter removes everything, and combining an empty set
        # would report a meaningless value.
        if 0 < n_remove < 20 and len(p_use) > 0:
            logger.info(f'Remove {n_remove}/{len(p_temp)} outliers (median + 3IQR) for {ct}.')
            p_cauchy_temp = acat_test(p_use)
        else:
            p_cauchy_temp = acat_test(p_temp)

        p_cauchy.append(p_cauchy_temp)
        p_median.append(np.median(p_temp))

    p_tissue = pd.DataFrame(
        {'p_cauchy': p_cauchy, 'p_median': p_median, 'annotation': annotations}
    )

    # Save the results
    output_dir = Path(config.cauchy_save_dir)
    output_dir.mkdir(parents=True, exist_ok=True, mode=0o755)
    output_file = output_dir / f'{config.sample_name}_{config.trait_name}.Cauchy.csv.gz'
    p_tissue.to_csv(
        output_file,
        compression='gzip',
        index=False,
    )