gsmap-1.60-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/GNN_VAE/__init__.py +0 -0
- gsMap/GNN_VAE/adjacency_matrix.py +95 -0
- gsMap/GNN_VAE/model.py +87 -0
- gsMap/GNN_VAE/train.py +97 -0
- gsMap/__init__.py +5 -0
- gsMap/__main__.py +3 -0
- gsMap/cauchy_combination_test.py +163 -0
- gsMap/config.py +734 -0
- gsMap/find_latent_representation.py +209 -0
- gsMap/format_sumstats.py +410 -0
- gsMap/generate_ldscore.py +551 -0
- gsMap/generate_r2_matrix.py +743 -0
- gsMap/jackknife.py +514 -0
- gsMap/latent_to_gene.py +257 -0
- gsMap/main.py +39 -0
- gsMap/make_annotations.py +560 -0
- gsMap/regression_read.py +294 -0
- gsMap/spatial_ldsc_multiple_sumstats.py +307 -0
- gsMap/visualize.py +154 -0
- gsmap-1.60.dist-info/LICENSE +21 -0
- gsmap-1.60.dist-info/METADATA +124 -0
- gsmap-1.60.dist-info/RECORD +24 -0
- gsmap-1.60.dist-info/WHEEL +4 -0
- gsmap-1.60.dist-info/entry_points.txt +3 -0
gsMap/GNN_VAE/__init__.py

File without changes

gsMap/GNN_VAE/adjacency_matrix.py
ADDED

@@ -0,0 +1,95 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jul  4 21:31:27 2023

@author: songliyang
"""
import numpy as np
import pandas as pd
import scipy.sparse as sp
import sklearn.neighbors
import torch


def Cal_Spatial_Net(adata, n_neighbors=5, verbose=True):
    """\
    Construct the spatial neighbor networks.
    """
    if verbose:
        print('------Calculating spatial graph...')
    coor = pd.DataFrame(adata.obsm['spatial'])
    coor.index = adata.obs.index
    nbrs = sklearn.neighbors.NearestNeighbors(n_neighbors=n_neighbors).fit(coor)
    distances, indices = nbrs.kneighbors(coor, return_distance=True)
    KNN_list = []
    for it in range(indices.shape[0]):
        KNN_list.append(pd.DataFrame(zip([it] * indices[it].shape[0], indices[it], distances[it])))
    KNN_df = pd.concat(KNN_list)
    KNN_df.columns = ['Cell1', 'Cell2', 'Distance']
    # Drop self-edges (distance 0) and translate integer positions back to cell names
    Spatial_Net = KNN_df.copy()
    Spatial_Net = Spatial_Net.loc[Spatial_Net['Distance'] > 0,]
    id_cell_trans = dict(zip(range(coor.shape[0]), np.array(coor.index)))
    Spatial_Net['Cell1'] = Spatial_Net['Cell1'].map(id_cell_trans)
    Spatial_Net['Cell2'] = Spatial_Net['Cell2'].map(id_cell_trans)
    return Spatial_Net


def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)


def preprocess_graph(adj):
    # Degree-normalize the self-looped adjacency, D^-1/2 (A + I) D^-1/2, as in GCN/VGAE
    adj = sp.coo_matrix(adj)
    adj_ = adj + sp.eye(adj.shape[0])
    rowsum = np.array(adj_.sum(1))
    degree_mat_inv_sqrt = sp.diags(np.power(rowsum, -0.5).flatten())
    adj_normalized = adj_.dot(degree_mat_inv_sqrt).transpose().dot(degree_mat_inv_sqrt).tocoo()
    return sparse_mx_to_torch_sparse_tensor(adj_normalized)


def Construct_Adjacency_Matrix(adata, Params, verbose=True):
    # Construct the neighbor graph
    Spatial_Net = Cal_Spatial_Net(adata, n_neighbors=Params.n_neighbors)
    if verbose:
        print('The graph contains %d edges, %d cells.' % (Spatial_Net.shape[0], adata.n_obs))
        print('%.4f neighbors per cell on average.' % (Spatial_Net.shape[0] / adata.n_obs))
    cells = np.array(adata.obs.index)
    cells_id_tran = dict(zip(cells, range(cells.shape[0])))
    G_df = Spatial_Net.copy()
    G_df['Cell1'] = G_df['Cell1'].map(cells_id_tran)
    G_df['Cell2'] = G_df['Cell2'].map(cells_id_tran)
    # Edge weights: a Gaussian kernel on normalized distances, or plain binary edges
    if Params.weighted_adj:
        distance_normalized = G_df.Distance / (max(G_df.Distance) + 1)
        adj_org = sp.coo_matrix((np.exp(-distance_normalized ** 2 / 2), (G_df['Cell1'], G_df['Cell2'])),
                                shape=(adata.n_obs, adata.n_obs))
    else:
        adj_org = sp.coo_matrix((np.ones(G_df.shape[0]), (G_df['Cell1'], G_df['Cell2'])),
                                shape=(adata.n_obs, adata.n_obs))
    adj_m1 = adj_org
    adj_norm_m1 = preprocess_graph(adj_m1)
    adj_label_m1 = adj_m1 + sp.eye(adj_m1.shape[0])
    norm_m1 = adj_m1.shape[0] * adj_m1.shape[0] / float((adj_m1.shape[0] * adj_m1.shape[0] - adj_m1.sum()) * 2)
    graph_dict = {
        "adj_org": adj_org,
        "adj_norm": adj_norm_m1,
        "norm_value": norm_m1
    }
    return graph_dict
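For orientation, here is a minimal usage sketch on a toy AnnData object. The SimpleNamespace stands in for the parameter object the released pipeline builds in gsMap.config; only the n_neighbors and weighted_adj fields are read by this module, and the random data is purely illustrative.

from types import SimpleNamespace

import numpy as np
import scanpy as sc

from gsMap.GNN_VAE.adjacency_matrix import Construct_Adjacency_Matrix

# Toy AnnData: random expression plus random 2-D spot coordinates
adata = sc.AnnData(X=np.random.rand(50, 10).astype(np.float32))
adata.obsm['spatial'] = np.random.rand(50, 2)

params = SimpleNamespace(n_neighbors=5, weighted_adj=False)  # assumed fields
graph_dict = Construct_Adjacency_Matrix(adata, params)
print(graph_dict['adj_norm'].shape)  # normalized torch sparse tensor, (50, 50)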
    
        gsMap/GNN_VAE/model.py
    ADDED
    
@@ -0,0 +1,87 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jul  3 11:42:44 2023

@author: songliyang
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv


def full_block(in_features, out_features, p_drop):
    return nn.Sequential(nn.Linear(in_features, out_features),
                         nn.BatchNorm1d(out_features),
                         nn.ELU(),
                         nn.Dropout(p=p_drop))


class GNN(nn.Module):
    def __init__(self, in_features, out_features, dr=0, act=F.relu, heads=1):
        super().__init__()
        self.conv1 = GATConv(in_features, out_features, heads)
        self.act = act
        self.dr = dr

    def forward(self, x, edge_index):
        out = self.conv1(x, edge_index)
        out = self.act(out)
        out = F.dropout(out, self.dr, self.training)
        return out


class GNN_VAE_Model(nn.Module):
    def __init__(self, input_dim, params, num_classes=1):
        super(GNN_VAE_Model, self).__init__()
        self.var = params.var
        self.num_classes = num_classes

        # Encoder
        self.encoder = nn.Sequential()
        self.encoder.add_module('encoder_L1', full_block(input_dim, params.feat_hidden1, params.p_drop))
        self.encoder.add_module('encoder_L2', full_block(params.feat_hidden1, params.feat_hidden2, params.p_drop))

        # GNN (GAT)
        self.gn1 = GNN(params.feat_hidden2, params.gcn_hidden1, params.p_drop, act=F.relu, heads=params.nheads)
        self.gn2 = GNN(params.gcn_hidden1 * params.nheads, params.gcn_hidden2, params.p_drop, act=lambda x: x)
        self.gn3 = GNN(params.gcn_hidden1 * params.nheads, params.gcn_hidden2, params.p_drop, act=lambda x: x)

        # Decoder
        self.decoder = nn.Sequential()
        self.decoder.add_module('decoder_L1', full_block(params.gcn_hidden2, params.feat_hidden2, params.p_drop))
        self.decoder.add_module('decoder_L2', full_block(params.feat_hidden2, params.feat_hidden1, params.p_drop))
        self.decoder.add_module('decoder_output', nn.Sequential(nn.Linear(params.feat_hidden1, input_dim)))

        # Cluster head
        self.cluster = nn.Sequential()
        self.cluster.add_module('cluster_L1', full_block(params.gcn_hidden2, params.feat_hidden2, params.p_drop))
        self.cluster.add_module('cluster_output', nn.Linear(params.feat_hidden2, self.num_classes))

    def encode(self, x, adj):
        feat_x = self.encoder(x)
        hidden1 = self.gn1(feat_x, adj)
        mu = self.gn2(hidden1, adj)
        if self.var:
            logvar = self.gn3(hidden1, adj)
            return mu, logvar
        else:
            return mu, None

    def reparameterize(self, mu, logvar):
        if self.training and logvar is not None:
            std = torch.exp(logvar)
            eps = torch.randn_like(std)
            return eps.mul(std).add_(mu)
        else:
            return mu

    def forward(self, x, adj):
        mu, logvar = self.encode(x, adj)
        gnn_z = self.reparameterize(mu, logvar)
        x_reconstructed = self.decoder(gnn_z)
        pred_label = F.softmax(self.cluster(gnn_z), dim=1)
        return pred_label, x_reconstructed, gnn_z, mu, logvar
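A minimal sketch of driving the model by hand. The hyper-parameter values and the random graph are assumptions for illustration; the field names match exactly what the class reads above, and eval mode makes reparameterize() return mu directly.

from types import SimpleNamespace

import torch

from gsMap.GNN_VAE.model import GNN_VAE_Model

# Hypothetical hyper-parameters; the released pipeline supplies these via gsMap.config
params = SimpleNamespace(feat_hidden1=64, feat_hidden2=32, gcn_hidden1=16,
                         gcn_hidden2=8, p_drop=0.1, nheads=2, var=True)
model = GNN_VAE_Model(input_dim=100, params=params, num_classes=3)
model.eval()

x = torch.randn(20, 100)                    # 20 cells x 100 input features
edge_index = torch.randint(0, 20, (2, 60))  # random COO edge list for GATConv
pred_label, x_rec, z, mu, logvar = model(x, edge_index)
print(z.shape)  # torch.Size([20, 8]) -- the latent embedding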
    
        gsMap/GNN_VAE/train.py
    ADDED
    
@@ -0,0 +1,97 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jul  4 19:58:58 2023

@author: songliyang
"""
import time

import torch
from progress.bar import Bar

from gsMap.GNN_VAE.model import GNN_VAE_Model


def reconstruction_loss(decoded, x):
    loss_fn = torch.nn.MSELoss()
    loss = loss_fn(decoded, x)
    return loss


def label_loss(pred_label, true_label):
    loss_fn = torch.nn.CrossEntropyLoss()
    loss = loss_fn(pred_label, true_label)
    return loss


class Model_Train:
    def __init__(self, node_X, graph_dict, params, label=None):
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        torch.cuda.empty_cache()

        self.params = params
        self.device = device
        self.epochs = params.epochs
        self.node_X = torch.FloatTensor(node_X.copy()).to(device)
        self.adj_norm = graph_dict["adj_norm"].to(device).coalesce()
        self.label = label
        self.num_classes = 1

        if self.label is not None:
            self.label = torch.tensor(self.label).to(self.device)
            self.num_classes = len(self.label.unique())

        # Set up the model
        self.model = GNN_VAE_Model(self.params.feat_cell, self.params, self.num_classes).to(device)
        self.optimizer = torch.optim.Adam(params=list(self.model.parameters()),
                                          lr=self.params.gcn_lr, weight_decay=self.params.gcn_decay)

    # Train
    def run_train(self):
        self.model.train()
        prev_loss = float('inf')

        bar = Bar('GAT-AE model train:', max=self.epochs)
        bar.check_tty = False
        for epoch in range(self.epochs):
            start_time = time.time()
            self.model.train()
            self.optimizer.zero_grad()
            pred_label, de_feat, latent_z, mu, logvar = self.model(self.node_X, self.adj_norm)
            loss_rec = reconstruction_loss(de_feat, self.node_X)

            # Check whether annotation was provided
            if self.label is not None:
                loss_pre = label_loss(pred_label, self.label)
                loss = (self.params.rec_w * loss_rec) + (self.params.label_w * loss_pre)
            else:
                loss = loss_rec

            loss.backward()
            self.optimizer.step()

            # Update the progress bar
            end_time = time.time()
            batch_time = end_time - start_time

            bar_str = '{} / {} | Left time: {batch_time:.2f} mins | Loss: {loss:.4f}'
            bar.suffix = bar_str.format(epoch + 1, self.epochs,
                                        batch_time=batch_time * (self.epochs - epoch) / 60, loss=loss.item())
            bar.next()

            # Check convergence
            if abs(loss.item() - prev_loss) <= self.params.convergence_threshold and epoch >= 200:
                print('\nConvergence reached. Training stopped.')
                break

            prev_loss = loss.item()

        bar.finish()

    def get_latent(self):
        self.model.eval()
        pred, de_fea, latent_z, mu, logvar = self.model(self.node_X, self.adj_norm)
        latent_z = latent_z.data.cpu().numpy()
        return latent_z
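Putting the two GNN_VAE modules together, a hedged end-to-end sketch. Every value below is an assumption for illustration (the released gsMap/find_latent_representation.py drives these classes with parameters from gsMap.config), and it presumes a torch_geometric version that accepts the torch sparse adjacency Model_Train hands to GATConv.

from types import SimpleNamespace

import numpy as np
import scanpy as sc

from gsMap.GNN_VAE.adjacency_matrix import Construct_Adjacency_Matrix
from gsMap.GNN_VAE.train import Model_Train

# Illustrative data and hyper-parameters, not the shipped defaults
n_cells, n_feat = 100, 50
adata = sc.AnnData(X=np.random.rand(n_cells, n_feat).astype(np.float32))
adata.obsm['spatial'] = np.random.rand(n_cells, 2)

params = SimpleNamespace(
    n_neighbors=5, weighted_adj=False,        # graph construction
    feat_cell=n_feat, feat_hidden1=32, feat_hidden2=16,
    gcn_hidden1=8, gcn_hidden2=4, p_drop=0.1, nheads=2, var=False,
    epochs=10, gcn_lr=1e-3, gcn_decay=1e-2,   # optimizer
    rec_w=1.0, label_w=1.0, convergence_threshold=1e-4,
)

graph_dict = Construct_Adjacency_Matrix(adata, params)
trainer = Model_Train(adata.X, graph_dict, params, label=None)  # unsupervised: reconstruction loss only
trainer.run_train()
latent = trainer.get_latent()  # numpy array of shape (100, 4)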
    
gsMap/__init__.py
ADDED

gsMap/__main__.py
ADDED

gsMap/cauchy_combination_test.py
ADDED

@@ -0,0 +1,163 @@
import argparse
from pathlib import Path

import numpy as np
import pandas as pd
import scanpy as sc
import scipy as sp

from gsMap.config import CauchyCombinationConfig, add_Cauchy_combination_args

# The function for the Cauchy combination
def acat_test(pvalues, weights=None):
    '''acat_test()
    Aggregated Cauchy Association Test
    A p-value combination method using the Cauchy distribution.

    Inspired by: https://github.com/yaowuliu/ACAT/blob/master/R/ACAT.R
    Inputs:
        pvalues: <list or numpy array>
            The p-values you want to combine.
        weights: <list or numpy array>, default=None
            The weights for each of the p-values. If None, equal weights are used.

    Returns:
        pval: <float>
            The ACAT combined p-value.
    '''
    if any(np.isnan(pvalues)):
        raise Exception("Cannot have NAs in the p-values.")
    if any([(i > 1) | (i < 0) for i in pvalues]):
        raise Exception("P-values must be between 0 and 1.")
    if any([i == 1 for i in pvalues]) & any([i == 0 for i in pvalues]):
        raise Exception("Cannot have both 0 and 1 p-values.")
    if any([i == 0 for i in pvalues]):
        print("Warn: p-values are exactly 0.")
        return 0
    if any([i == 1 for i in pvalues]):
        print("Warn: p-values are exactly 1.")
        return 1
    if weights is None:
        weights = [1 / len(pvalues) for i in pvalues]
    elif len(weights) != len(pvalues):
        raise Exception("Length of weights and p-values differs.")
    elif any([i < 0 for i in weights]):
        raise Exception("All weights must be positive.")
    else:
        weights = [i / len(weights) for i in weights]

    pvalues = np.array(pvalues)
    weights = np.array(weights)

    if not any([i < 1e-16 for i in pvalues]):
        cct_stat = sum(weights * np.tan((0.5 - pvalues) * np.pi))
    else:
        # Very small p-values use the approximation tan((0.5 - p) * pi) ~= 1 / (p * pi)
        is_small = [i < (1e-16) for i in pvalues]
        is_large = [i >= (1e-16) for i in pvalues]
        cct_stat = sum((weights[is_small] / pvalues[is_small]) / np.pi)
        cct_stat += sum(weights[is_large] * np.tan((0.5 - pvalues[is_large]) * np.pi))

    if cct_stat > 1e15:
        pval = (1 / cct_stat) / np.pi
    else:
        pval = 1 - sp.stats.cauchy.cdf(cct_stat)

    return pval


def run_Cauchy_combination(config: CauchyCombinationConfig):
    # Load the LDSC results
    print(f'------Loading LDSC results of {config.input_ldsc_dir}...')
    ldsc_input_file = Path(config.input_ldsc_dir) / f'{config.sample_name}_{config.trait_name}.csv.gz'
    ldsc = pd.read_csv(ldsc_input_file, compression='gzip')
    ldsc.spot = ldsc.spot.astype(str).replace(r'\.0', '', regex=True)
    ldsc.index = ldsc.spot
    if config.meta is None:
        # Load the spatial data
        print(f'------Loading ST data of {config.input_hdf5_path}...')
        spe = sc.read_h5ad(f'{config.input_hdf5_path}')

        common_cell = np.intersect1d(ldsc.index, spe.obs_names)
        spe = spe[common_cell,]
        ldsc = ldsc.loc[common_cell]

        # Add the annotation
        ldsc['annotation'] = spe.obs.loc[ldsc.spot][config.annotation].to_list()

    else:
        # Or load the additional annotation (just for the macaque data at this stage: 2023Nov25)
        print('------Loading additional annotation...')
        meta = pd.read_csv(config.meta, index_col=0)
        meta = meta.loc[meta.slide == config.slide]
        meta.index = meta.cell_id.astype(str).replace(r'\.0', '', regex=True)

        common_cell = np.intersect1d(ldsc.index, meta.index)
        meta = meta.loc[common_cell]
        ldsc = ldsc.loc[common_cell]

        # Add the annotation
        ldsc['annotation'] = meta.loc[ldsc.spot][config.annotation].to_list()

    # Perform the Cauchy combination based on the given annotations
    p_cauchy = []
    p_median = []
    for ct in np.unique(ldsc.annotation):
        p_temp = ldsc.loc[ldsc['annotation'] == ct, 'p']

        # The Cauchy test is sensitive to very small p-values, so extreme outliers are considered
        # for removal to enhance robustness, particularly when spot annotations may be incorrect.
        # p_cauchy_temp = acat_test(p_temp[p_temp != np.min(p_temp)])
        p_temp_log = -np.log10(p_temp)
        median_log = np.median(p_temp_log)
        IQR_log = np.percentile(p_temp_log, 75) - np.percentile(p_temp_log, 25)

        p_use = p_temp[p_temp_log < median_log + 3 * IQR_log]
        n_remove = len(p_temp) - len(p_use)

        # A spot is an outlier if -log10(p) >= median + 3 * IQR; drop them only if fewer than 20
        if 0 < n_remove < 20:
            print(f'Remove {n_remove}/{len(p_temp)} outliers (median + 3IQR) for {ct}.')
            p_cauchy_temp = acat_test(p_use)
        else:
            p_cauchy_temp = acat_test(p_temp)

        p_median_temp = np.median(p_temp)

        p_cauchy.append(p_cauchy_temp)
        p_median.append(p_median_temp)

    data = {'p_cauchy': p_cauchy, 'p_median': p_median, 'annotation': np.unique(ldsc.annotation)}
    p_tissue = pd.DataFrame(data)
    p_tissue.columns = ['p_cauchy', 'p_median', 'annotation']
    # Save the results
    output_dir = Path(config.output_cauchy_dir)
    output_dir.mkdir(parents=True, exist_ok=True, mode=0o755)
    output_file = output_dir / f'{config.sample_name}_{config.trait_name}.Cauchy.csv.gz'
    p_tissue.to_csv(
        output_file,
        compression='gzip',
        index=False,
    )


if __name__ == '__main__':
    TEST = True
    if TEST:
        test_dir = '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021'
        name = 'Cortex_151507'

        config = CauchyCombinationConfig(
            input_hdf5_path=f'{test_dir}/{name}/hdf5/{name}_add_latent.h5ad',
            input_ldsc_dir=f'{test_dir}/snake_workdir/{name}/ldsc/',
            sample_name=name,
            annotation='layer_guess',
            output_cauchy_dir=f'{test_dir}/snake_workdir/{name}/cauchy/',
            trait_name='adult1_adult2_onset_asthma',
        )
    else:
        parser = argparse.ArgumentParser(description="Run Cauchy Combination Analysis")
        add_Cauchy_combination_args(parser)
        args = parser.parse_args()
        config = CauchyCombinationConfig(**vars(args))
    run_Cauchy_combination(config)
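For reference, acat_test implements the standard ACAT combination. In notation (an editorial summary of the code above):

T = \sum_{i=1}^{k} w_i \tan\big((0.5 - p_i)\,\pi\big), \qquad
p_{\mathrm{ACAT}} = 1 - F_{\mathrm{Cauchy}}(T) = \frac{1}{2} - \frac{\arctan T}{\pi},

where a term with p_i < 1e-16 falls back to the small-p approximation w_i / (p_i \pi), and T > 1e15 triggers the tail approximation p \approx 1 / (T \pi). A quick, self-contained sanity check with arbitrary example values:

from gsMap.cauchy_combination_test import acat_test

pvals = [0.01, 0.20, 0.03, 0.50]
combined = acat_test(pvals)                        # equal weights
weighted = acat_test(pvals, weights=[2, 1, 1, 1])  # up-weight the first p-value
print(f'combined={combined:.4g}, weighted={weighted:.4g}')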