PyPI - gsMap - Versions diffs - 1.62__py3-none-any.whl → 1.63__py3-none-any.whl - Mend

gsMap 1.62__py3-none-any.whl → 1.63__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

gsMap/GNN_VAE/adjacency_matrix.py +1 -1
gsMap/GNN_VAE/model.py +5 -5
gsMap/GNN_VAE/train.py +1 -1
gsMap/__init__.py +1 -1
gsMap/cauchy_combination_test.py +14 -36
gsMap/config.py +473 -404
gsMap/diagnosis.py +273 -0
gsMap/find_latent_representation.py +22 -86
gsMap/format_sumstats.py +79 -82
gsMap/generate_ldscore.py +145 -78
gsMap/latent_to_gene.py +65 -104
gsMap/main.py +1 -9
gsMap/report.py +160 -0
gsMap/run_all_mode.py +195 -0
gsMap/spatial_ldsc_multiple_sumstats.py +187 -112
gsMap/templates/report_template.html +198 -0
gsMap/utils/__init__.py +0 -0
gsMap/{generate_r2_matrix.py → utils/generate_r2_matrix.py} +1 -9
gsMap/{make_annotations.py → utils/make_annotations.py} +1 -43
gsMap/utils/manhattan_plot.py +639 -0
gsMap/{regression_read.py → utils/regression_read.py} +1 -1
gsMap/visualize.py +100 -55
{gsmap-1.62.dist-info → gsmap-1.63.dist-info}/METADATA +16 -46
gsmap-1.63.dist-info/RECORD +30 -0
gsmap-1.62.dist-info/RECORD +0 -24
/gsMap/{jackknife.py → utils/jackknife.py} +0 -0
{gsmap-1.62.dist-info → gsmap-1.63.dist-info}/LICENSE +0 -0
{gsmap-1.62.dist-info → gsmap-1.63.dist-info}/WHEEL +0 -0
{gsmap-1.62.dist-info → gsmap-1.63.dist-info}/entry_points.txt +0 -0

gsMap/generate_ldscore.py CHANGED Viewed

@@ -1,24 +1,19 @@
-import argparse
 import logging
+import warnings
 from pathlib import Path
 import numpy as np
-# %%
 import pandas as pd
 import pyranges as pr
+import zarr
 from scipy.sparse import csr_matrix
 from tqdm import trange
-from gsMap.config import GenerateLDScoreConfig, add_generate_ldscore_args
-# %%
-from gsMap.generate_r2_matrix import PlinkBEDFileWithR2Cache, getBlockLefts, ID_List_Factory
+from gsMap.config import GenerateLDScoreConfig
+from gsMap.utils.generate_r2_matrix import PlinkBEDFileWithR2Cache, getBlockLefts, ID_List_Factory
+warnings.filterwarnings("ignore", category=FutureWarning)
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.DEBUG)
-handler = logging.StreamHandler()
-handler.setFormatter(logging.Formatter(
-    '[{asctime}] {levelname:6s} {message}', style='{'))
-logger.addHandler(handler)
 # %%
@@ -30,7 +25,7 @@ def load_gtf(gtf_file, mk_score, window_size):
     print("Loading gtf data")
     #
     # Load GTF file
-    gtf = pr.read_gtf(gtf_file)
+    gtf = pr.read_gtf(gtf_file, )
     gtf = gtf.df
     #
     # Select the common genes
@@ -82,14 +77,16 @@ def load_bim(bfile_root, chrom):
     """
     Load the bim file.
     """
-    print("Loading bim data")
     bim = pd.read_csv(f'{bfile_root}.{chrom}.bim', sep='\t', header=None)
     bim.columns = ["CHR", "SNP", "CM", "BP", "A1", "A2"]
     #
     # Transform bim to PyRanges
     bim_pr = bim.copy()
     bim_pr.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]
-    bim_pr['End'] = bim_pr['Start']
+    bim_pr['End'] = bim_pr['Start'].copy()
+    bim_pr['Start'] = bim_pr['Start'] - 1  # Due to bim file is 1-based
     bim_pr = pr.PyRanges(bim_pr)
     bim_pr.Chromosome = f'chr{chrom}'
     return bim, bim_pr
@@ -110,6 +107,36 @@ def Overlaps_gtf_bim(gtf_pr, bim_pr):
 # %%
+def filter_snps_by_keep_snp(bim_df, keep_snp_file):
+    # Load the keep_snp file and filter the BIM DataFrame
+    keep_snp = pd.read_csv(keep_snp_file, header=None)[0].to_list()
+    filtered_bim_df = bim_df[bim_df['SNP'].isin(keep_snp)]
+    return filtered_bim_df
+def get_snp_counts(config):
+    snp_counts = {}
+    total_snp = 0
+    for chrom in range(1, 23):
+        bim_df, _ = load_bim(config.bfile_root, chrom)
+        if config.keep_snp_root:
+            keep_snp_file = f'{config.keep_snp_root}.{chrom}.snp'
+            filtered_bim_df = filter_snps_by_keep_snp(bim_df, keep_snp_file)
+        else:
+            filtered_bim_df = bim_df
+        snp_counts[chrom] = filtered_bim_df.shape[0]
+        total_snp += snp_counts[chrom]
+    snp_counts['total'] = total_snp
+    chrom_snp_length_array = np.array([snp_counts[chrom] for chrom in range(1, 23)]).cumsum()
+    snp_counts['chrom_snp_start_point'] = [0] + chrom_snp_length_array.tolist()
+    return snp_counts
 # %%
@@ -189,7 +216,7 @@ def calculate_ldscore_from_annotation(SNP_annotation_df, chrom, bfile_root, ld_w
 def calculate_ldscore_from_multiple_annotation(SNP_annotation_df_list, chrom, bfile_root, ld_wind=1, ld_unit='CM'):
-    SNP_annotation_df = pd.concat(SNP_annotation_df_list, axis=1)
+    SNP_annotation_df = pd.concat(SNP_annotation_df_list, axis=1).astype(np.float32, copy=False)
     snp_gene_weight_matrix = get_ldscore(bfile_root, chrom, SNP_annotation_df.values, ld_wind=ld_wind,
                                          ld_unit=ld_unit)
@@ -212,7 +239,7 @@ class S_LDSC_Boost:
     def __init__(self, config: GenerateLDScoreConfig):
         self.config = config
-        self.mk_score = load_marker_score(config.mkscore_feather_file)
+        self.mk_score = load_marker_score(config.mkscore_feather_path)
         # Load GTF and get common markers
         self.gtf_pr, self.mk_score_common = load_gtf(config.gtf_annotation_file, self.mk_score,
@@ -237,6 +264,25 @@ class S_LDSC_Boost:
         else:
             self.enhancer_pr = None
+        # create tha zarr file
+        if config.ldscore_save_format == 'zarr':
+            chrom_snp_length_dict = get_snp_counts(config)
+            self.chrom_snp_start_point = chrom_snp_length_dict['chrom_snp_start_point']
+            zarr_path = Path(config.ldscore_save_dir) / f'{config.sample_name}.ldscore.zarr'
+            if not zarr_path.exists():
+                self.zarr_file = zarr.open(zarr_path.as_posix(), mode='a', dtype=np.float16,
+                                           chunks=config.zarr_chunk_size,
+                                           shape=(chrom_snp_length_dict['total'], self.mk_score_common.shape[1]))
+                zarr_path.mkdir(parents=True, exist_ok=True)
+                # save spot names
+                self.zarr_file.attrs['spot_names'] = self.mk_score_common.columns.to_list()
+                # save chrom_snp_length_dict
+                self.zarr_file.attrs['chrom_snp_start_point'] = self.chrom_snp_start_point
+            else:
+                self.zarr_file = zarr.open(zarr_path.as_posix(), mode='a')
     def process_chromosome(self, chrom: int):
         self.snp_pass_maf = get_snp_pass_maf(self.config.bfile_root, chrom, maf_min=0.05)
@@ -252,9 +298,9 @@ class S_LDSC_Boost:
             self.keep_snp_mask = None
             self.snp_name = self.snp_gene_pair_dummy.index.to_list()
-        if self.config.additional_baseline_annotation_dir_path is not None:
-            additional_baseline_annotation_dir_path = Path(self.config.additional_baseline_annotation_dir_path)
-            additional_baseline_annotation_file_path = additional_baseline_annotation_dir_path / f'baseline.{chrom}.annot.gz'
+        if self.config.additional_baseline_annotation is not None:
+            additional_baseline_annotation = Path(self.config.additional_baseline_annotation)
+            additional_baseline_annotation_file_path = additional_baseline_annotation / f'baseline.{chrom}.annot.gz'
             assert additional_baseline_annotation_file_path.exists(), f'additional_baseline_annotation_file_path not exists: {additional_baseline_annotation_file_path}'
             additional_baseline_annotation_df = pd.read_csv(additional_baseline_annotation_file_path, sep='\t')
             additional_baseline_annotation_df.set_index('SNP', inplace=True)
@@ -274,7 +320,7 @@ class S_LDSC_Boost:
                 additional_baseline_annotation_df = additional_baseline_annotation_df.reindex(
                     self.snp_gene_pair_dummy.index)
-            # do this for saving the cpu time, by only calculate r2 once
+            # do this for saving the cpu time, only calculate r2 once
             self.snp_gene_weight_matrix, additional_baseline_annotation_ldscore = (
                 calculate_ldscore_from_multiple_annotation(
                     [self.snp_gene_pair_dummy, additional_baseline_annotation_df],
@@ -283,21 +329,24 @@ class S_LDSC_Boost:
                     ld_wind=self.config.ld_wind,
                     ld_unit=self.config.ld_unit))
+            additional_baseline_annotation_ldscore = additional_baseline_annotation_ldscore.loc[self.snp_name]
+            # print(additional_baseline_annotation_ldscore.index.to_list()==self.snp_name)
             ld_score_file = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.ldscore.feather'
             M_file_path = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M'
             M_5_file_path = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M_5_50'
             # save additional baseline annotation ldscore
-            self.save_ldscore(additional_baseline_annotation_ldscore.values,
-                              column_names=additional_baseline_annotation_ldscore.columns,
-                              save_file_name=ld_score_file,
-                              )
+            self.save_ldscore_to_feather(additional_baseline_annotation_ldscore.values,
+                                         column_names=additional_baseline_annotation_ldscore.columns,
+                                         save_file_name=ld_score_file,
+                                         )
             # caculate the M and save
             save_dir = Path(M_file_path).parent
             save_dir.mkdir(parents=True, exist_ok=True)
             M_chr_chunk = additional_baseline_annotation_df.values.sum(axis=0, keepdims=True)
-            M_5_chr_chunk = additional_baseline_annotation_df.loc[self.snp_pass_maf].values.sum(axis=0,keepdims=True)
+            M_5_chr_chunk = additional_baseline_annotation_df.loc[self.snp_pass_maf].values.sum(axis=0, keepdims=True)
             np.savetxt(M_file_path, M_chr_chunk, delimiter='\t', )
             np.savetxt(M_5_file_path, M_5_chr_chunk, delimiter='\t', )
@@ -307,13 +356,27 @@ class S_LDSC_Boost:
                                                                             self.config.bfile_root,
                                                                             ld_wind=self.config.ld_wind,
                                                                             ld_unit=self.config.ld_unit)
+        # only keep the snp in keep_snp_root
+        if self.keep_snp_mask is not None:
+            self.snp_gene_weight_matrix = self.snp_gene_weight_matrix[self.keep_snp_mask]
+        if self.config.save_pre_calculate_snp_gene_weight_matrix:
+            snp_gene_weight_matrix_save_dir = Path(self.config.ldscore_save_dir) / 'snp_gene_weight_matrix'
+            snp_gene_weight_matrix_save_dir.mkdir(parents=True, exist_ok=True)
+            logger.info(f'Saving snp_gene_weight_matrix for chr{chrom}...')
+            self.snp_gene_weight_matrix.reset_index().to_feather(
+                snp_gene_weight_matrix_save_dir / f'{chrom}.snp_gene_weight_matrix.feather')
         # convert to sparse
         self.snp_gene_weight_matrix = csr_matrix(self.snp_gene_weight_matrix)
+        logger.info(f'Compute snp_gene_weight_matrix finished. shape: {self.snp_gene_weight_matrix.shape}')
         # calculate baseline ld score
+        logger.info(f'Calculating baseline ld score for chr{chrom}...')
         self.calculate_ldscore_for_base_line(chrom, self.config.sample_name, self.config.ldscore_save_dir)
         # calculate ld score for annotation
+        logger.info(f'Calculating ld score for annotation for chr{chrom}...')
         self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(
             self.mk_score_common.loc[self.snp_gene_pair_dummy.columns[:-1]],
             chrom,
@@ -323,7 +386,6 @@ class S_LDSC_Boost:
     def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(self,
                                                               mk_score_chunk,
-                                                              save_file_name,
                                                               drop_dummy_na=True,
                                                               ):
@@ -332,20 +394,18 @@ class S_LDSC_Boost:
         else:
             ldscore_chr_chunk = self.snp_gene_weight_matrix @ mk_score_chunk
-        self.save_ldscore(ldscore_chr_chunk,
-                          column_names=mk_score_chunk.columns,
-                          save_file_name=save_file_name,
-                          )
+        return ldscore_chr_chunk
-    def save_ldscore(self, ldscore_chr_chunk: np.ndarray, column_names, save_file_name):
+    def save_ldscore_to_feather(self, ldscore_chr_chunk: np.ndarray, column_names, save_file_name):
         save_dir = Path(save_file_name).parent
         save_dir.mkdir(parents=True, exist_ok=True)
         ldscore_chr_chunk = ldscore_chr_chunk.astype(np.float16, copy=False)
         # avoid overflow of float16, if inf, set to max of float16
         ldscore_chr_chunk[np.isinf(ldscore_chr_chunk)] = np.finfo(np.float16).max
-        ldscore_chr_chunk = ldscore_chr_chunk if self.config.keep_snp_root is None else ldscore_chr_chunk[
-            self.keep_snp_mask]
+        # ldscore_chr_chunk = ldscore_chr_chunk if self.config.keep_snp_root is None else ldscore_chr_chunk[
+        #     self.keep_snp_mask]
         # save for each chunk
         df = pd.DataFrame(ldscore_chr_chunk,
                           index=self.snp_name,
@@ -354,6 +414,20 @@ class S_LDSC_Boost:
         df.index.name = 'SNP'
         df.reset_index().to_feather(save_file_name)
+    def save_ldscore_chunk_to_zarr(self, ldscore_chr_chunk: np.ndarray,
+                                   chrom: int, start_col_index,
+                                   ):
+        ldscore_chr_chunk = ldscore_chr_chunk.astype(np.float16, copy=False)
+        # avoid overflow of float16, if inf, set to max of float16
+        ldscore_chr_chunk[np.isinf(ldscore_chr_chunk)] = np.finfo(np.float16).max
+        # save for each chunk
+        chrom_snp_start_point = self.chrom_snp_start_point[chrom - 1]
+        chrom_snp_end_point = self.chrom_snp_start_point[chrom]
+        self.zarr_file[chrom_snp_start_point:chrom_snp_end_point,
+        start_col_index:start_col_index + ldscore_chr_chunk.shape[1]] = ldscore_chr_chunk
     def calculate_M_use_SNP_gene_pair_dummy_by_chunk(self,
                                                      mk_score_chunk,
                                                      M_file_path, M_5_file_path,
@@ -377,7 +451,6 @@ class S_LDSC_Boost:
         np.savetxt(M_file_path, M_chr_chunk, delimiter='\t', )
         np.savetxt(M_5_file_path, M_5_chr_chunk, delimiter='\t', )
     def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(self, mk_score_common, chrom, sample_name, save_dir):
         """
         Calculate the LD score using the SNP-gene weight matrix.
@@ -393,11 +466,23 @@ class S_LDSC_Boost:
             M_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M'
             M_5_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M_5_50'
-            self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
+            ldscore_chr_chunk = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
                 mk_score_chunk,
-                save_file_name=ld_score_file,
                 drop_dummy_na=True,
             )
+            if self.config.ldscore_save_format == 'feather':
+                self.save_ldscore_to_feather(ldscore_chr_chunk,
+                                             column_names=mk_score_chunk.columns,
+                                             save_file_name=ld_score_file,
+                                             )
+            elif self.config.ldscore_save_format == 'zarr':
+                self.save_ldscore_chunk_to_zarr(ldscore_chr_chunk,
+                                                chrom=chrom,
+                                                start_col_index=i,
+                                                )
+            else:
+                raise ValueError(f'Invalid ldscore_save_format: {self.config.ldscore_save_format}')
             self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
                 mk_score_chunk,
                 M_file,
@@ -417,11 +502,15 @@ class S_LDSC_Boost:
         M_file = f'{save_dir}/baseline/baseline.{chrom}.l2.M'
         M_5_file = f'{save_dir}/baseline/baseline.{chrom}.l2.M_5_50'
-        self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
+        ldscore_chr_chunk = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
             baseline_mk_score_df,
-            save_file_name=ld_score_file,
             drop_dummy_na=False,
         )
+        self.save_ldscore_to_feather(ldscore_chr_chunk,
+                                     column_names=baseline_mk_score_df.columns,
+                                     save_file_name=ld_score_file,
+                                     )
         # save baseline M
         self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
             baseline_mk_score_df,
@@ -435,6 +524,7 @@ class S_LDSC_Boost:
         Get the dummy matrix of SNP-gene pairs.
         """
         # Load the bim file
+        print("Loading bim data")
         bim, bim_pr = load_bim(self.config.bfile_root, chrom)
         if self.config.gene_window_enhancer_priority in ['gene_window_first', 'enhancer_first']:
@@ -468,6 +558,12 @@ class S_LDSC_Boost:
         else:
             raise ValueError('gtf_pr and enhancer_pr cannot be None at the same time')
+        # save the SNP_gene_pair to feather
+        SNP_gene_pair_save_path = Path(
+            self.config.ldscore_save_dir) / f'SNP_gene_pair/SNP_gene_pair_chr{chrom}.feather'
+        SNP_gene_pair_save_path.parent.mkdir(parents=True, exist_ok=True)
+        SNP_gene_pair.reset_index().to_feather(SNP_gene_pair_save_path)
         # Get the dummy matrix
         SNP_gene_pair_dummy = pd.get_dummies(SNP_gene_pair['gene_name'], dummy_na=True)
         return SNP_gene_pair_dummy
@@ -502,50 +598,21 @@ class S_LDSC_Boost:
 def run_generate_ldscore(config: GenerateLDScoreConfig):
+    if config.ldscore_save_format == 'quick_mode':
+        logger.info('Running in quick_mode. Skip the process of generating ldscore. Using the pre-calculated ldscore.')
+        ldscore_save_dir = config.ldscore_save_dir
+        # link the baseline annotation
+        baseline_annotation_dir = Path(config.baseline_annotation_dir)
+        (ldscore_save_dir / 'baseline').symlink_to(baseline_annotation_dir, target_is_directory=True)
+        # link the SNP_gene_pair
+        SNP_gene_pair_dir = Path(config.SNP_gene_pair_dir)
+        (ldscore_save_dir / 'SNP_gene_pair').symlink_to(SNP_gene_pair_dir, target_is_directory=True)
+        return
     s_ldsc_boost = S_LDSC_Boost(config)
     if config.chrom == 'all':
         for chrom in range(1, 23):
             s_ldsc_boost.process_chromosome(chrom)
     else:
         s_ldsc_boost.process_chromosome(config.chrom)
-# %%
-if __name__ == '__main__':
-    TEST = True
-    if TEST:
-        # %%
-        sample_name = 'Cortex_151507'
-        chrom = 'all'
-        save_dir = '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021/Cortex_151507/snp_annotation/test/0101/sparse'
-        # %%
-        gtf_file = '/storage/yangjianLab/songliyang/ReferenceGenome/GRCh37/gencode.v39lift37.annotation.gtf'
-        mkscore_feather_file = f'/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021/{sample_name}/gene_markers/{sample_name}_rank.feather'
-        bfile_root = '/storage/yangjianLab/sharedata/LDSC_resource/1000G_EUR_Phase3_plink/1000G.EUR.QC'
-        window_size = 50000
-        keep_snp_root = '/storage/yangjianLab/sharedata/LDSC_resource/hapmap3_snps/hm'
-        spots_per_chunk = 10_000
-        enhancer_annotation = '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/resource/epigenome/cleaned_data/by_tissue/BRN/ABC_roadmap_merged.bed'
-        # %%
-        config = GenerateLDScoreConfig(
-            sample_name=sample_name,
-            chrom=chrom,
-            ldscore_save_dir=save_dir,
-            gtf_annotation_file=gtf_file,
-            mkscore_feather_file=mkscore_feather_file,
-            bfile_root=bfile_root,
-            keep_snp_root=keep_snp_root,
-            gene_window_size=window_size,
-            spots_per_chunk=spots_per_chunk,
-            enhancer_annotation_file=enhancer_annotation,
-            gene_window_enhancer_priority='enhancer_first',
-            additional_baseline_annotation_dir_path='/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/resource/ldsc/baseline_v1.2/remove_base'
-        )
-        # %%
-        run_generate_ldscore(config)
-    else:
-        parser = argparse.ArgumentParser(description="Configuration for the application.")
-        add_generate_ldscore_args(parser)
-        args = parser.parse_args()
-        config = GenerateLDScoreConfig(**vars(args))
-        run_generate_ldscore(config)

gsMap/latent_to_gene.py CHANGED Viewed

@@ -1,8 +1,4 @@
-import argparse
 import logging
-import multiprocessing
-import pprint
-import time
 from pathlib import Path
 import numpy as np
@@ -14,14 +10,9 @@ from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.neighbors import NearestNeighbors
 from tqdm import tqdm
-from gsMap.config import add_latent_to_gene_args, LatentToGeneConfig
+from gsMap.config import LatentToGeneConfig
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.DEBUG)
-handler = logging.StreamHandler()
-handler.setFormatter(logging.Formatter(
-    '[{asctime}] {levelname:8s} {filename} {message}', style='{'))
-logger.addHandler(handler)
 def find_Neighbors(coor, num_neighbour):
@@ -49,31 +40,31 @@ def _build_spatial_net(adata, annotation, num_neighbour):
     """
     1 Build spatial neighbourhood matrix for each spot (cell) based on the spatial coord
     """
-    print(f'------Building spatial graph based on spatial coordinates...')
+    logger.info(f'------Building spatial graph based on spatial coordinates...')
     coor = pd.DataFrame(adata.obsm['spatial'])
     coor.index = adata.obs.index
     if not annotation is None:
-        print(f'Cell annotations are provided...')
+        logger.info(f'Cell annotations are provided...')
         spatial_net = pd.DataFrame()
         # Cells with annotations
         for ct in adata.obs[annotation].dropna().unique():
             coor_temp = coor.loc[adata.obs[annotation] == ct, :]
             spatial_net_temp = find_Neighbors(coor_temp, min(num_neighbour, coor_temp.shape[0]))
             spatial_net = pd.concat((spatial_net, spatial_net_temp), axis=0)
-            print(f'{ct}: {coor_temp.shape[0]} cells')
+            logger.info(f'{ct}: {coor_temp.shape[0]} cells')
         # Cells labeled as nan
         if pd.isnull(adata.obs[annotation]).any():
             cell_nan = adata.obs.index[np.where(pd.isnull(adata.obs[annotation]))[0]]
-            print(f'Nan: {len(cell_nan)} cells')
+            logger.info(f'Nan: {len(cell_nan)} cells')
             spatial_net_temp = find_Neighbors(coor, num_neighbour)
             spatial_net_temp = spatial_net_temp.loc[spatial_net_temp.Cell1.isin(cell_nan), :]
             spatial_net = pd.concat((spatial_net, spatial_net_temp), axis=0)
     else:
-        print(f'Cell annotations are not provided...')
+        logger.info(f'Cell annotations are not provided...')
         spatial_net = find_Neighbors(coor, num_neighbour)
     return spatial_net
@@ -117,7 +108,7 @@ def _compute_regional_mkscore(cell_tg, ):
         # Simultaneously consider the ratio of expression fractions and ranks
         gene_ranks_region = (gene_ranks_region * frac_region).values
-    mkscore = np.exp(gene_ranks_region ** 2) - 1
+    mkscore = np.exp(gene_ranks_region ** 1.5) - 1
     return mkscore.astype(np.float16, copy=False)
@@ -125,31 +116,39 @@ def run_latent_to_gene(config: LatentToGeneConfig):
     global adata, coor_latent, spatial_net, ranks, frac_whole, args, spatial_net_dict, expressed_mask
     args = config
     # Load and process the spatial data
-    print('------Loading the spatial data...')
-    adata = sc.read_h5ad(config.input_hdf5_with_latent_path)
+    logger.info('------Loading the spatial data...')
+    adata = sc.read_h5ad(config.hdf5_with_latent_path)
+    logger.info('------Ranking the spatial data...')
+    adata.layers['rank'] = rankdata(adata.X.toarray().astype(np.float32), axis=1).astype(np.float32)
     if not config.annotation is None:
-        print(f'------Cell annotations are provided as {config.annotation}...')
+        logger.info(f'------Cell annotations are provided as {config.annotation}...')
         adata = adata[~pd.isnull(adata.obs[config.annotation]), :]
-    # Homologs transformation
-    if not config.species is None:
-        print(f'------Transforming the {config.species} to HUMAN_GENE_SYM...')
-        homologs = pd.read_csv(config.gs_species, sep='\t')
-        homologs.index = homologs[config.species]
-        adata = adata[:, adata.var_names.isin(homologs[config.species])]
-        print(f'{adata.shape[1]} genes left after homologs transformation.')
-        adata.var_names = homologs.loc[adata.var_names, 'HUMAN_GENE_SYM']
-    # Process the data
-    if config.type == 'count':
-        adata.X = adata.layers[config.type]
-        sc.pp.normalize_total(adata, target_sum=1e4)
-        sc.pp.log1p(adata)
-    else:
-        adata.X = adata.layers[config.type]
-        # Remove cells that do not express any genes after transformation, and genes that are not expressed in any cells.
-    print(f'Number of cells, genes of the input data: {adata.shape[0]},{adata.shape[1]}')
+    # Homologs transformation
+    if not config.homolog_file is None:
+        logger.info(f'------Transforming the {config.species} to HUMAN_GENE_SYM...')
+        homologs = pd.read_csv(config.homolog_file, sep='\t')
+        if homologs.shape[1] != 2:
+            raise ValueError(
+                "Homologs file must have two columns: one for the species and one for the human gene symbol.")
+        homologs.columns = [config.species, 'HUMAN_GENE_SYM']
+        homologs.set_index(config.species, inplace=True)
+        adata = adata[:, adata.var_names.isin(homologs.index)]
+        # Log the number of genes left after homolog transformation
+        logger.info(f"{adata.shape[1]} genes retained after homolog transformation.")
+        if adata.shape[1] < 100:
+            raise ValueError("Too few genes retained in ST data (<100).")
+        adata.var_names = homologs.loc[adata.var_names, 'HUMAN_GENE_SYM'].values
+        # drop duplicated genes
+        adata = adata[:, ~adata.var_names.duplicated()]
+    # Remove cells that do not express any genes after transformation, and genes that are not expressed in any cells.
+    logger.info(f'Number of cells, genes of the input data: {adata.shape[0]},{adata.shape[1]}')
     adata = adata[adata.X.sum(axis=1) > 0, adata.X.sum(axis=0) > 0]
-    print(f'Number of cells, genes after transformation: {adata.shape[0]},{adata.shape[1]}')
+    logger.info(f'Number of cells, genes after transformation: {adata.shape[0]},{adata.shape[1]}')
     # Buid the spatial graph
     spatial_net = _build_spatial_net(adata, config.annotation, config.num_neighbour_spatial)
     spatial_net.set_index('Cell1', inplace=True)
@@ -163,27 +162,31 @@ def run_latent_to_gene(config: LatentToGeneConfig):
     cell_list = adata.obs.index.tolist()
     # Load the geometrical mean across slices
-    if not config.gM_slices is None:
-        print('Geometrical mean across multiple slices are provided.')
+    if config.gM_slices is not None:
+        logger.info('Geometrical mean across multiple slices is provided.')
         gM = pd.read_parquet(config.gM_slices)
-        # Select the common gene
+        if config.species is not None:
+            homologs = pd.read_csv(config.homolog_file, sep='\t', header=None)
+            if homologs.shape[1] < 2:
+                raise ValueError(
+                    "Homologs file must have at least two columns: one for the species and one for the human gene symbol.")
+            homologs.columns = [config.species, 'HUMAN_GENE_SYM']
+            homologs.set_index(config.species, inplace=True)
+            gM = gM.loc[gM.index.isin(homologs.index)]
+            gM.index = homologs.loc[gM.index, 'HUMAN_GENE_SYM'].values
         common_gene = np.intersect1d(adata.var_names, gM.index)
         gM = gM.loc[common_gene]
-        gM = gM['G_Mean'].to_list()
-        print('------Ranking the spatial data...')
+        gM = gM['G_Mean'].to_numpy()
         adata = adata[:, common_gene]
-        ranks = np.apply_along_axis(rankdata, 1, adata.X.toarray())
     else:
-        print('------Ranking the spatial data...')
-        ranks = rankdata(adata.X.toarray().astype(np.float32), axis=1).astype(np.float32)
-        gM = gmean(ranks, axis=0)
+        gM = gmean(adata.layers['rank'], axis=0)
     # Compute the fraction of each gene across cells
     expressed_mask = pd.DataFrame((adata.X > 0).toarray(), index=adata.obs.index, columns=adata.var.index)
-    # frac_whole = np.array((adata.X > 0).sum(axis=0))[0] / (adata.shape[0])
+    # frac_whole = np.array((adata_layer > 0).sum(axis=0))[0] / (adata.shape[0])
     frac_whole = np.array(expressed_mask.sum(axis=0)) / (adata.shape[0])
     # Normalize the geometrical mean
-    ranks = ranks / gM
+    ranks = adata.layers['rank'] / gM
     ranks = pd.DataFrame(ranks, index=adata.obs_names)
     ranks.columns = adata.var.index
     mk_score = [
@@ -192,66 +195,24 @@ def run_latent_to_gene(config: LatentToGeneConfig):
                             desc="Finding markers (Rank-based approach) | cells")
     ]
     # Normalize the marker scores
-    mk_score = pd.DataFrame(np.vstack(mk_score).T, index=adata.var.index, columns=cell_list)
+    mk_score = pd.DataFrame(np.vstack(mk_score).T, index=adata.var_names, columns=cell_list)
     # mk_score_normalized = mk_score.div(mk_score.sum())*1e+2
-    # Remove the mitochondrial genes
-    mt_genes = [gene for gene in mk_score.index if gene.startswith('MT-') or gene.startswith('mt-')]
-    mask = ~mk_score.index.isin(set(mt_genes))
-    mk_score = mk_score[mask]  # Apply the mask to mk_score
-    print(mk_score.shape)
+    # Remove the mitochondrial genes from mk_score
+    mt_gene_mask = ~adata.var_names.str.startswith(('MT-', 'mt-'))
+    mk_score = mk_score[mt_gene_mask]
+    adata = adata[:, mt_gene_mask]
+    # # Save the mk_score DataFrame to an adata layer
+    # adata.layers['mkscore'] = mk_score.values.T
     # Save the marker scores
-    print(f'------Saving marker scores ...')
-    output_file_path = Path(config.output_feather_path)
+    logger.info(f'------Saving marker scores ...')
+    output_file_path = Path(config.mkscore_feather_path)
     output_file_path.parent.mkdir(parents=True, exist_ok=True, mode=0o755)
     mk_score.reset_index(inplace=True)
     mk_score.rename(columns={mk_score.columns[0]: 'HUMAN_GENE_SYM'}, inplace=True)
     mk_score.to_feather(output_file_path)
-#%%
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description="Process latent to gene data.")
-    add_latent_to_gene_args(parser)
-    TEST = True
-    if TEST:
-        name = 'Cortex_151507'
-        test_dir = '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021'
-        args = parser.parse_args([
-            '--input_hdf5_with_latent_path', f'{test_dir}/{name}/hdf5/{name}_add_latent.h5ad',
-            '--sample_name', f'{name}',
-            '--output_feather_path', f'{test_dir}/{name}/gene_markers/{name}_rank.feather',
-            '--method', 'rank',
-            '--latent_representation', 'latent_GVAE',
-            '--type', 'count',
-            '--annotation', 'layer_guess',
-            '--num_neighbour', '51',
-            # '--no_expression_fraction',
-        ])
-        # config = LatentToGeneConfig(
-        #     **{'annotation': 'SubClass',
-        #        'fold': 1.0,
-        #        'gM_slices': None,
-        #        'gs_species': '/storage/yangjianLab/songliyang/SpatialData/homologs/macaque_human_homologs.txt',
-        #        'input_hdf5_with_latent_path': '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/macaque/T121_macaque1/find_latent_representations/T121_macaque1_add_latent.h5ad',
-        #        'latent_representation': 'latent_GVAE',
-        #        'method': 'rank',
-        #        'num_neighbour': 51,
-        #        'num_neighbour_spatial': 201,
-        #        'output_feather_path': '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/macaque/T121_macaque1/latent_to_gene/T121_macaque1_gene_marker_score.feather',
-        #        'pst': 0.2,
-        #        'sample_name': 'T121_macaque1',
-        #        'species': 'MACAQUE_GENE_SYM',
-        #        'type': 'SCT'}
-        # )
-    else:
-        args = parser.parse_args()
-        config = LatentToGeneConfig(**vars(args))
-    logger.info(f'Latent to gene for {args.sample_name}...')
-    pprint.pprint(config)
-    start_time = time.time()
-    run_latent_to_gene(config)
-    end_time = time.time()
-    logger.info(
-        f'Latent to gene for {config.sample_name} finished. Time spent: {(end_time - start_time) / 60:.2f} min.')
+    # Save the modified adata object to disk
+    adata.write(config.hdf5_with_latent_path)

gsMap 1.62__py3-none-any.whl → 1.63__py3-none-any.whl

gsMap 1.62__py3-none-any.whl → 1.63__py3-none-any.whl