gsMap 1.71.2__py3-none-any.whl → 1.72.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,14 +1,15 @@
 import numpy as np
 import pandas as pd
 import scipy.sparse as sp
-from sklearn.neighbors import NearestNeighbors
 import torch
+from sklearn.neighbors import NearestNeighbors
+
 
 def cal_spatial_net(adata, n_neighbors=5, verbose=True):
     """Construct the spatial neighbor network."""
     if verbose:
-        print('------Calculating spatial graph...')
-    coor = pd.DataFrame(adata.obsm['spatial'], index=adata.obs.index)
+        print("------Calculating spatial graph...")
+    coor = pd.DataFrame(adata.obsm["spatial"], index=adata.obs.index)
     nbrs = NearestNeighbors(n_neighbors=n_neighbors).fit(coor)
     distances, indices = nbrs.kneighbors(coor)
     n_cells, n_neighbors = indices.shape
@@ -16,22 +17,22 @@ def cal_spatial_net(adata, n_neighbors=5, verbose=True):
     cell1 = np.repeat(cell_indices, n_neighbors)
     cell2 = indices.flatten()
     distance = distances.flatten()
-    knn_df = pd.DataFrame({'Cell1': cell1, 'Cell2': cell2, 'Distance': distance})
-    knn_df = knn_df[knn_df['Distance'] > 0].copy()
-    cell_id_map = dict(zip(cell_indices, coor.index))
-    knn_df['Cell1'] = knn_df['Cell1'].map(cell_id_map)
-    knn_df['Cell2'] = knn_df['Cell2'].map(cell_id_map)
+    knn_df = pd.DataFrame({"Cell1": cell1, "Cell2": cell2, "Distance": distance})
+    knn_df = knn_df[knn_df["Distance"] > 0].copy()
+    cell_id_map = dict(zip(cell_indices, coor.index, strict=False))
+    knn_df["Cell1"] = knn_df["Cell1"].map(cell_id_map)
+    knn_df["Cell2"] = knn_df["Cell2"].map(cell_id_map)
     return knn_df
 
+
 def sparse_mx_to_torch_sparse_tensor(sparse_mx):
     """Convert a scipy sparse matrix to a torch sparse tensor."""
     sparse_mx = sparse_mx.tocoo().astype(np.float32)
-    indices = torch.from_numpy(
-        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64)
-    )
+    indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
     values = torch.from_numpy(sparse_mx.data)
     shape = torch.Size(sparse_mx.shape)
-    return torch.sparse_coo_tensor(indices, values, shape)
+    return torch.sparse_coo_tensor(indices, values, shape)
+
 
 def preprocess_graph(adj):
     """Symmetrically normalize the adjacency matrix."""
@@ -42,34 +43,31 @@ def preprocess_graph(adj):
     adj_normalized = adj_.dot(degree_mat_inv_sqrt).transpose().dot(degree_mat_inv_sqrt).tocoo()
     return sparse_mx_to_torch_sparse_tensor(adj_normalized)
 
+
 def construct_adjacency_matrix(adata, params, verbose=True):
     """Construct the adjacency matrix from spatial data."""
     spatial_net = cal_spatial_net(adata, n_neighbors=params.n_neighbors, verbose=verbose)
     if verbose:
         num_edges = spatial_net.shape[0]
         num_cells = adata.n_obs
-        print(f'The graph contains {num_edges} edges, {num_cells} cells.')
-        print(f'{num_edges / num_cells:.2f} neighbors per cell on average.')
+        print(f"The graph contains {num_edges} edges, {num_cells} cells.")
+        print(f"{num_edges / num_cells:.2f} neighbors per cell on average.")
     cell_ids = {cell: idx for idx, cell in enumerate(adata.obs.index)}
-    spatial_net['Cell1'] = spatial_net['Cell1'].map(cell_ids)
-    spatial_net['Cell2'] = spatial_net['Cell2'].map(cell_ids)
+    spatial_net["Cell1"] = spatial_net["Cell1"].map(cell_ids)
+    spatial_net["Cell2"] = spatial_net["Cell2"].map(cell_ids)
     if params.weighted_adj:
-        distance_normalized = spatial_net['Distance'] / (spatial_net['Distance'].max() + 1)
-        weights = np.exp(-0.5 * distance_normalized ** 2)
+        distance_normalized = spatial_net["Distance"] / (spatial_net["Distance"].max() + 1)
+        weights = np.exp(-0.5 * distance_normalized**2)
         adj_org = sp.coo_matrix(
-            (weights, (spatial_net['Cell1'], spatial_net['Cell2'])),
-            shape=(adata.n_obs, adata.n_obs)
+            (weights, (spatial_net["Cell1"], spatial_net["Cell2"])),
+            shape=(adata.n_obs, adata.n_obs),
         )
     else:
         adj_org = sp.coo_matrix(
-            (np.ones(spatial_net.shape[0]), (spatial_net['Cell1'], spatial_net['Cell2'])),
-            shape=(adata.n_obs, adata.n_obs)
+            (np.ones(spatial_net.shape[0]), (spatial_net["Cell1"], spatial_net["Cell2"])),
+            shape=(adata.n_obs, adata.n_obs),
        )
     adj_norm = preprocess_graph(adj_org)
     norm_value = adj_org.shape[0] ** 2 / ((adj_org.shape[0] ** 2 - adj_org.sum()) * 2)
-    graph_dict = {
-        "adj_org": adj_org,
-        "adj_norm": adj_norm,
-        "norm_value": norm_value
-    }
+    graph_dict = {"adj_org": adj_org, "adj_norm": adj_norm, "norm_value": norm_value}
     return graph_dict
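
Note on `preprocess_graph`: it applies the standard GCN-style symmetric normalization, rescaling the adjacency matrix as D^(-1/2) · A · D^(-1/2) so that high-degree spots do not dominate message passing. A minimal dense-NumPy sketch of that operation (illustrative only: the package code works on scipy sparse matrices, and the `adj_` it normalizes may include self-loops, which this toy version omits):

```python
import numpy as np

# Toy 3-spot adjacency; gsMap builds the real one from the spatial KNN graph.
A = np.array([[0.0, 1.0, 1.0],
              [1.0, 0.0, 0.0],
              [1.0, 0.0, 0.0]])

deg = A.sum(axis=1)                       # spot degrees
D_inv_sqrt = np.diag(1.0 / np.sqrt(deg))  # D^(-1/2)
A_norm = D_inv_sqrt @ A @ D_inv_sqrt      # symmetric normalization
print(A_norm)
```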
gsMap/GNN/model.py CHANGED
@@ -3,14 +3,16 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch_geometric.nn import GATConv
 
+
 def full_block(in_features, out_features, p_drop):
     return nn.Sequential(
         nn.Linear(in_features, out_features),
         nn.BatchNorm1d(out_features),
         nn.ELU(),
-        nn.Dropout(p=p_drop)
+        nn.Dropout(p=p_drop),
     )
 
+
 class GATModel(nn.Module):
     def __init__(self, input_dim, params, num_classes=1):
         super().__init__()
@@ -21,7 +23,7 @@ class GATModel(nn.Module):
         # Encoder
         self.encoder = nn.Sequential(
             full_block(input_dim, params.feat_hidden1, params.p_drop),
-            full_block(params.feat_hidden1, params.feat_hidden2, params.p_drop)
+            full_block(params.feat_hidden1, params.feat_hidden2, params.p_drop),
         )
 
         # GAT Layers
@@ -29,14 +31,14 @@
             in_channels=params.feat_hidden2,
             out_channels=params.gat_hidden1,
             heads=params.nheads,
-            dropout=params.p_drop
+            dropout=params.p_drop,
         )
         self.gat2 = GATConv(
             in_channels=params.gat_hidden1 * params.nheads,
             out_channels=params.gat_hidden2,
             heads=1,
             concat=False,
-            dropout=params.p_drop
+            dropout=params.p_drop,
         )
         if self.var:
             self.gat3 = GATConv(
@@ -44,20 +46,20 @@
                 out_channels=params.gat_hidden2,
                 heads=1,
                 concat=False,
-                dropout=params.p_drop
+                dropout=params.p_drop,
             )
 
         # Decoder
         self.decoder = nn.Sequential(
             full_block(params.gat_hidden2, params.feat_hidden2, params.p_drop),
             full_block(params.feat_hidden2, params.feat_hidden1, params.p_drop),
-            nn.Linear(params.feat_hidden1, input_dim)
+            nn.Linear(params.feat_hidden1, input_dim),
         )
 
         # Clustering Layer
         self.cluster = nn.Sequential(
             full_block(params.gat_hidden2, params.feat_hidden2, params.p_drop),
-            nn.Linear(params.feat_hidden2, self.num_classes)
+            nn.Linear(params.feat_hidden2, self.num_classes),
         )
 
     def encode(self, x, edge_index):
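
The model.py changes are purely cosmetic (trailing commas added by the new formatter); the architecture is unchanged: an MLP encoder feeds `gat1`, whose `nheads` attention heads are concatenated, so `gat2` must take `gat_hidden1 * nheads` input channels. A small shape-check sketch, with hypothetical hyperparameter values standing in for `params`:

```python
import torch
from torch_geometric.nn import GATConv

# Hypothetical values standing in for params.feat_hidden2, params.gat_hidden1,
# params.gat_hidden2, and params.nheads.
feat_hidden2, gat_hidden1, gat_hidden2, nheads = 64, 32, 16, 4

gat1 = GATConv(feat_hidden2, gat_hidden1, heads=nheads, dropout=0.1)
gat2 = GATConv(gat_hidden1 * nheads, gat_hidden2, heads=1, concat=False, dropout=0.1)

x = torch.randn(10, feat_hidden2)                  # 10 nodes with encoded features
edge_index = torch.tensor([[0, 1, 2], [1, 2, 0]])  # toy edge list (COO format)
h = gat1(x, edge_index)                            # -> torch.Size([10, 128]): 32 channels * 4 heads
z = gat2(h, edge_index)                            # -> torch.Size([10, 16])
print(h.shape, z.shape)
```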
gsMap/GNN/train.py CHANGED
@@ -23,7 +23,7 @@ def label_loss(pred_label, true_label):
 class ModelTrainer:
     def __init__(self, node_x, graph_dict, params, label=None):
         """Initialize the ModelTrainer with data and hyperparameters."""
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.params = params
         self.epochs = params.epochs
         self.node_x = torch.FloatTensor(node_x).to(self.device)
@@ -38,17 +38,15 @@ class ModelTrainer:
         # Set up the model
         self.model = GATModel(self.params.feat_cell, self.params, self.num_classes).to(self.device)
         self.optimizer = torch.optim.Adam(
-            self.model.parameters(),
-            lr=self.params.gat_lr,
-            weight_decay=self.params.gcn_decay
+            self.model.parameters(), lr=self.params.gat_lr, weight_decay=self.params.gcn_decay
         )
 
     def run_train(self):
         """Train the model."""
         self.model.train()
-        prev_loss = float('inf')
-        logger.info('Start training...')
-        pbar = tqdm(range(self.epochs), desc='GAT-AE model train:', total=self.epochs)
+        prev_loss = float("inf")
+        logger.info("Start training...")
+        pbar = tqdm(range(self.epochs), desc="GAT-AE model train:", total=self.epochs)
         for epoch in range(self.epochs):
             start_time = time.time()
             self.optimizer.zero_grad()
@@ -67,18 +65,17 @@
             batch_time = time.time() - start_time
             left_time = batch_time * (self.epochs - epoch - 1) / 60  # in minutes
 
-            pbar.set_postfix({'Left time': f'{left_time:.2f} mins', 'Loss': f'{loss.item():.4f}'})
+            pbar.set_postfix({"Left time": f"{left_time:.2f} mins", "Loss": f"{loss.item():.4f}"})
             pbar.update(1)
 
             if abs(loss.item() - prev_loss) <= self.params.convergence_threshold and epoch >= 200:
                 pbar.close()
-                logger.info('Convergence reached. Training stopped.')
+                logger.info("Convergence reached. Training stopped.")
                 break
             prev_loss = loss.item()
         else:
             pbar.close()
-            logger.info('Max epochs reached. Training stopped.')
-
+            logger.info("Max epochs reached. Training stopped.")
 
     def get_latent(self):
         """Retrieve the latent representation from the model."""
gsMap/__init__.py CHANGED
@@ -1,5 +1,5 @@
-'''
+"""
 Genetics-informed pathogenic spatial mapping
-'''
+"""
 
-__version__ = '1.71.2'
+__version__ = "1.72.3"
gsMap/__main__.py CHANGED
@@ -1,3 +1,4 @@
 from .main import main
-if __name__ == '__main__':
-    main()
+
+if __name__ == "__main__":
+    main()
@@ -10,9 +10,10 @@ from gsMap.config import CauchyCombinationConfig
 
 logger = logging.getLogger(__name__)
 
+
 # The fun of cauchy combination
 def acat_test(pvalues, weights=None):
-    '''acat_test()
+    """acat_test()
     Aggregated Cauchy Assocaition Test
     A p-value combination method using the Cauchy distribution.
 
@@ -23,27 +24,28 @@ def acat_test(pvalues, weights=None):
     weights: <list or numpy array>, default=None
         The weights for each of the p-values. If None, equal weights are used.
 
-    Returns:
+    Returns
+    -------
     pval: <float>
         The ACAT combined p-value.
-    '''
+    """
     if any(np.isnan(pvalues)):
         raise Exception("Cannot have NAs in the p-values.")
-    if any([(i > 1) | (i < 0) for i in pvalues]):
+    if any((i > 1) | (i < 0) for i in pvalues):
         raise Exception("P-values must be between 0 and 1.")
-    if any([i == 1 for i in pvalues]) & any([i == 0 for i in pvalues]):
+    if any(i == 1 for i in pvalues) & any(i == 0 for i in pvalues):
         raise Exception("Cannot have both 0 and 1 p-values.")
-    if any([i == 0 for i in pvalues]):
+    if any(i == 0 for i in pvalues):
         logger.info("Warn: p-values are exactly 0.")
         return 0
-    if any([i == 1 for i in pvalues]):
+    if any(i == 1 for i in pvalues):
         logger.info("Warn: p-values are exactly 1.")
         return 1
-    if weights == None:
+    if weights is None:
         weights = [1 / len(pvalues) for i in pvalues]
     elif len(weights) != len(pvalues):
         raise Exception("Length of weights and p-values differs.")
-    elif any([i < 0 for i in weights]):
+    elif any(i < 0 for i in weights):
         raise Exception("All weights must be positive.")
     else:
         weights = [i / len(weights) for i in weights]
@@ -51,7 +53,7 @@ def acat_test(pvalues, weights=None):
     pvalues = np.array(pvalues)
     weights = np.array(weights)
 
-    if any([i < 1e-16 for i in pvalues]) == False:
+    if not any(i < 1e-16 for i in pvalues):
         cct_stat = sum(weights * np.tan((0.5 - pvalues) * np.pi))
     else:
         is_small = [i < (1e-16) for i in pvalues]
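
For context, when no p-value is below 1e-16, `cct_stat` is the weighted Cauchy statistic T = Σᵢ wᵢ · tan((0.5 − pᵢ) · π), and the combined p-value is then read off the standard Cauchy tail (the back-transform line falls outside the hunks shown here). A minimal equal-weight sketch of the same idea, not the package's exact code:

```python
import numpy as np
from scipy.stats import cauchy

def acat_sketch(pvalues):
    """Equal-weight ACAT: map p-values through the Cauchy quantile, average, invert."""
    p = np.asarray(pvalues, dtype=float)
    w = np.full(p.shape, 1.0 / len(p))         # equal weights summing to 1
    t = np.sum(w * np.tan((0.5 - p) * np.pi))  # Cauchy-transformed statistic
    return cauchy.sf(t)                        # survival function -> combined p-value

print(acat_sketch([0.01, 0.20, 0.60]))  # small inputs pull the combined p down
```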
@@ -67,75 +69,76 @@
     return pval
 
 
-def run_Cauchy_combination(config:CauchyCombinationConfig):
-    # Load the ldsc results
-    logger.info(f'------Loading LDSC results of {config.ldsc_save_dir}...')
-    ldsc_input_file= config.get_ldsc_result_file(config.trait_name)
-    ldsc = pd.read_csv(ldsc_input_file, compression='gzip')
-    ldsc.spot = ldsc.spot.astype(str).replace('\.0', '', regex=True)
-    ldsc.index = ldsc.spot
-    if config.meta is None:
-        # Load the spatial data
-        logger.info(f'------Loading ST data of {config.hdf5_with_latent_path}...')
-        spe = sc.read_h5ad(f'{config.hdf5_with_latent_path}')
-
-        common_cell = np.intersect1d(ldsc.index, spe.obs_names)
-        spe = spe[common_cell]
-        ldsc = ldsc.loc[common_cell]
-
-        # Add the annotation
-        ldsc['annotation'] = spe.obs.loc[ldsc.spot][config.annotation].to_list()
-
-    elif config.meta is not None:
-        # Or Load the additional annotation (just for the macaque data at this stage: 2023Nov25)
-        logger.info(f'------Loading additional annotation...')
-        meta = pd.read_csv(config.meta, index_col=0)
-        meta = meta.loc[meta.slide == config.slide]
-        meta.index = meta.cell_id.astype(str).replace('\.0', '', regex=True)
-
-        common_cell = np.intersect1d(ldsc.index, meta.index)
-        meta = meta.loc[common_cell]
-        ldsc = ldsc.loc[common_cell]
-
-        # Add the annotation
-        ldsc['annotation'] = meta.loc[ldsc.spot][config.annotation].to_list()
-    # Perform the Cauchy combination based on the given annotations
+def run_Cauchy_combination(config: CauchyCombinationConfig):
+    ldsc_list = []
+
+    for sample_name in config.sample_name_list:
+        config.sample_name = sample_name
+
+        # Load the LDSC results for the current sample
+        logger.info(f"------Loading LDSC results for sample {sample_name}...")
+        ldsc_input_file = config.get_ldsc_result_file(
+            trait_name=config.trait_name,
+        )
+        ldsc = pd.read_csv(ldsc_input_file, compression="gzip")
+        ldsc["spot"] = ldsc["spot"].astype(str)
+        ldsc.index = ldsc["spot"]
+
+        # Load the spatial transcriptomics (ST) data for the current sample
+        logger.info(f"------Loading ST data for sample {sample_name}...")
+        h5ad_file = config.hdf5_with_latent_path
+        adata = sc.read_h5ad(h5ad_file)
+
+        # Identify common cells between LDSC results and ST data
+        common_cells = np.intersect1d(ldsc.index, adata.obs_names)
+        adata = adata[common_cells]
+        ldsc = ldsc.loc[common_cells]
+
+        # Add annotations to the LDSC dataframe
+        ldsc["annotation"] = adata.obs.loc[ldsc.spot, config.annotation].to_list()
+        ldsc_list.append(ldsc)
+
+    # Concatenate all LDSC dataframes from different samples
+    ldsc_all = pd.concat(ldsc_list)
+
+    # Run the Cauchy combination
     p_cauchy = []
     p_median = []
-    for ct in np.unique(ldsc.annotation):
-        p_temp = ldsc.loc[ldsc['annotation'] == ct, 'p']
-
-        # The Cauchy test is sensitive to very small p-values, so extreme outliers should be considered for removal...
-        # to enhance robustness, particularly in cases where spot annotations may be incorrect.
-        # p_cauchy_temp = acat_test(p_temp[p_temp != np.min(p_temp)])
-        p_temp_log = -np.log10(p_temp)
-        median_log = np.median(p_temp_log)
-        IQR_log = np.percentile(p_temp_log, 75) - np.percentile(p_temp_log, 25)
-
-        p_use = p_temp[p_temp_log < median_log + 3*IQR_log]
-        n_remove = len(p_temp) - len(p_use)
-
-        # Outlier: -log10(p) < median + 3IQR && len(outlier set) < 20
-        if (0 < n_remove < 20):
-            logger.info(f'Remove {n_remove}/{len(p_temp)} outliers (median + 3IQR) for {ct}.')
-            p_cauchy_temp = acat_test(p_use)
+    annotations = ldsc_all["annotation"].unique()
+
+    for ct in annotations:
+        p_values = ldsc_all.loc[ldsc_all["annotation"] == ct, "p"]
+
+        # Handle extreme outliers to enhance robustness
+        p_values_log = -np.log10(p_values)
+        median_log = np.median(p_values_log)
+        iqr_log = np.percentile(p_values_log, 75) - np.percentile(p_values_log, 25)
+
+        p_values_filtered = p_values[p_values_log < median_log + 3 * iqr_log]
+        n_removed = len(p_values) - len(p_values_filtered)
+
+        # Remove outliers if the number is reasonable
+        if 0 < n_removed < 20:
+            logger.info(f"Removed {n_removed}/{len(p_values)} outliers (median + 3IQR) for {ct}.")
+            p_cauchy_temp = acat_test(p_values_filtered)
         else:
-            p_cauchy_temp = acat_test(p_temp)
-
-        p_median_temp = np.median(p_temp)
+            p_cauchy_temp = acat_test(p_values)
 
+        p_median_temp = np.median(p_values)
         p_cauchy.append(p_cauchy_temp)
         p_median.append(p_median_temp)
-    # p_tissue = pd.DataFrame(p_cauchy,p_median,np.unique(ldsc.annotation))
-    data = {'p_cauchy': p_cauchy, 'p_median': p_median, 'annotation': np.unique(ldsc.annotation)}
-    p_tissue = pd.DataFrame(data)
-    p_tissue.columns = ['p_cauchy', 'p_median', 'annotation']
+
+    # Prepare the results dataframe
+    results = pd.DataFrame({"annotation": annotations, "p_cauchy": p_cauchy, "p_median": p_median})
+    results.sort_values(by="p_cauchy", inplace=True)
+
     # Save the results
-    output_dir = Path(config.cauchy_save_dir)
-    output_dir.mkdir(parents=True, exist_ok=True, mode=0o755)
-    output_file = output_dir / f'{config.sample_name}_{config.trait_name}.Cauchy.csv.gz'
-    p_tissue.to_csv(
+    Path(config.output_file).parent.mkdir(parents=True, exist_ok=True, mode=0o755)
+    output_file = Path(config.output_file)
+    results.to_csv(
         output_file,
-        compression='gzip',
+        compression="gzip",
         index=False,
     )
+    logger.info(f"Cauchy combination results saved at {output_file}.")
+    return results
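
The rewritten `run_Cauchy_combination` now loops over `config.sample_name_list`, concatenates the per-sample LDSC tables, and, before combining p-values within each annotation, trims spots whose −log10(p) exceeds the annotation's median + 3·IQR (the code applies the trim only when fewer than 20 spots would be dropped, since ACAT is sensitive to extreme outliers from mislabeled spots). A sketch of just that filter on hypothetical data:

```python
import numpy as np
import pandas as pd

# Hypothetical per-spot p-values for one annotation; 1e-30 is an extreme outlier.
p_values = pd.Series([0.04, 0.03, 0.05, 0.02, 1e-30])

log_p = -np.log10(p_values)
median_log = np.median(log_p)
iqr_log = np.percentile(log_p, 75) - np.percentile(log_p, 25)

# Keep spots whose -log10(p) stays below median + 3*IQR; the outlier is dropped.
p_filtered = p_values[log_p < median_log + 3 * iqr_log]
print(p_filtered.tolist())  # [0.04, 0.03, 0.05, 0.02]
```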