gsMap 1.62-py3-none-any.whl → 1.64-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,27 +1,28 @@
-import os
-import numpy as np
-import pandas as pd
-
-import argparse
+import gc
 import logging
-import multiprocessing
+import os
 from collections import defaultdict
 from pathlib import Path
 
+import anndata as ad
+import numpy as np
+import pandas as pd
+import zarr
 from scipy.stats import norm
-from tqdm.contrib.concurrent import process_map
+from tqdm.contrib.concurrent import thread_map
 
-import gsMap.jackknife as jk
-from gsMap.config import add_spatial_ldsc_args, SpatialLDSCConfig
-from gsMap.regression_read import _read_sumstats, _read_w_ld, _read_ref_ld_v2, _read_M_v2
+import gsMap.utils.jackknife as jk
+from gsMap.config import SpatialLDSCConfig
+from gsMap.utils.regression_read import _read_sumstats, _read_w_ld, _read_ref_ld_v2
 
-logger = logging.getLogger(__name__)
+logger = logging.getLogger('gsMap.spatial_ldsc')
 
 
 # %%
 def _coef_new(jknife):
     # return coef[0], coef_se[0], z[0]]
-    est_ = jknife.est[0, 0] / Nbar
+    # est_ = jknife.est[0, 0] / Nbar
+    est_ = jknife.jknife_est[0, 0] / Nbar
     se_ = jknife.jknife_se[0, 0] / Nbar
     return est_, se_
 
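Note: 1.64 swaps tqdm's process_map for thread_map in the per-spot jackknife, so workers share the parent process's large global annotation arrays instead of pickling them into subprocesses. A minimal sketch of the thread_map call pattern (the squaring function is an illustrative stand-in for jackknife_for_processmap, not gsMap code):

    from tqdm.contrib.concurrent import thread_map

    def work(spot_id):
        # stand-in for jackknife_for_processmap: reads shared arrays, returns per-spot stats
        return spot_id * spot_id

    # max_workers and desc mirror the keyword arguments used later in this diff
    results = thread_map(work, range(100), max_workers=4, desc='demo')
    print(results[:5])  # [0, 1, 4, 9, 16]

Threads avoid per-task serialization of the inputs; since the heavy lifting here happens inside numpy/BLAS calls that release the GIL, a thread pool can still occupy multiple cores.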
@@ -68,16 +69,19 @@ def weights(ld, w_ld, N, M, hsq, intercept=1):
 
 def jackknife_for_processmap(spot_id):
     # calculate the initial weight for each spot
+    spot_spatial_annotation = spatial_annotation[:, spot_id]
+    spot_x_tot_precomputed = spot_spatial_annotation + ref_ld_baseline_column_sum
     initial_w = (
-        get_weight_optimized(sumstats, x_tot_precomputed_common_snp[:, spot_id], 10000, w_ld_common_snp, intercept=1)
+        get_weight_optimized(sumstats, x_tot_precomputed=spot_x_tot_precomputed,
+                             M_tot=10000, w_ld=w_ld_common_snp, intercept=1)
         .astype(np.float32)
         .reshape((-1, 1)))
 
     # apply the weight to baseline annotation, spatial annotation and CHISQ
     initial_w_scaled = initial_w / np.sum(initial_w)
     baseline_annotation_spot = baseline_annotation * initial_w_scaled
-    spatial_annotation_spot = spatial_annotation.iloc[:, spot_id].values.reshape((-1, 1)) * initial_w_scaled
-    CHISQ = sumstats.chisq.to_numpy(dtype=np.float32).reshape((-1, 1)).copy()
+    spatial_annotation_spot = spot_spatial_annotation.reshape((-1, 1)) * initial_w_scaled
+    CHISQ = sumstats.chisq.values.reshape((-1, 1))
     y = CHISQ * initial_w_scaled
 
     # run the jackknife
@@ -113,6 +117,9 @@ def _preprocess_sumstats(trait_name, sumstat_file_path, baseline_and_w_ld_common
         logger.warning(f'WARNING: number of SNPs less than 200k; for {trait_name} this is almost always bad.')
 
     sumstats = sumstats.loc[common_snp]
+
+    # get the common index position of baseline_and_w_ld_common_snp for quick access
+    sumstats['common_index_pos'] = pd.Index(baseline_and_w_ld_common_snp).get_indexer(sumstats.index)
     return sumstats
 
 
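Note: the new common_index_pos column stores integer positions rather than SNP labels, so later code can slice numpy or zarr arrays positionally instead of doing label-based .loc lookups. A minimal sketch of pd.Index.get_indexer (the rsIDs are illustrative):

    import pandas as pd

    reference = pd.Index(['rs1', 'rs2', 'rs3', 'rs4'])  # e.g. baseline_and_w_ld_common_snp
    query = pd.Index(['rs2', 'rs4'])                    # e.g. one trait's filtered SNPs
    print(reference.get_indexer(query))                 # [1 3]; labels absent from reference map to -1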
@@ -132,8 +139,77 @@ def _get_sumstats_from_sumstats_dict(sumstats_config_dict: dict, baseline_and_w_
     return sumstats_cleaned_dict
 
 
+class S_LDSC_Boost_with_pre_calculate_SNP_Gene_weight_matrix:
+    def __init__(self, config: SpatialLDSCConfig, common_snp_among_all_sumstats_pos):
+        self.config = config
+        mk_score = pd.read_feather(config.mkscore_feather_path).set_index('HUMAN_GENE_SYM')
+        mk_score_genes = mk_score.index
+
+        snp_gene_weight_adata = ad.read_h5ad(config.snp_gene_weight_adata_path)
+        common_genes = mk_score_genes.intersection(snp_gene_weight_adata.var.index)
+        common_snps = snp_gene_weight_adata.obs.index
+        # self.snp_gene_weight_adata = snp_gene_weight_adata[common_snp_among_all_sumstats:, common_genes.to_list()]
+        self.snp_gene_weight_matrix = snp_gene_weight_adata[common_snp_among_all_sumstats_pos, common_genes.to_list()].X
+        self.mk_score_common = mk_score.loc[common_genes]
+
+        # calculate the chunk number
+        self.chunk_starts = list(range(0, self.mk_score_common.shape[1], self.config.spots_per_chunk_quick_mode))
+
+    def fetch_ldscore_by_chunk(self, chunk_index):
+        chunk_start = self.chunk_starts[chunk_index]
+        mk_score_chunk = self.mk_score_common.iloc[:,
+                         chunk_start:chunk_start + self.config.spots_per_chunk_quick_mode]
+        ldscore_chunk = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
+            mk_score_chunk,
+            drop_dummy_na=False,
+        )
+
+        spots_name = self.mk_score_common.columns[chunk_start:chunk_start + self.config.spots_per_chunk_quick_mode]
+        return ldscore_chunk, spots_name
+
+    def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(self,
+                                                              mk_score_chunk,
+                                                              drop_dummy_na=True,
+                                                              ):
+
+        if drop_dummy_na:
+            ldscore_chr_chunk = self.snp_gene_weight_matrix[:, :-1] @ mk_score_chunk
+        else:
+            ldscore_chr_chunk = self.snp_gene_weight_matrix @ mk_score_chunk
+
+        return ldscore_chr_chunk
+
+
+def _get_sumstats_with_common_snp_from_sumstats_dict(sumstats_config_dict: dict, baseline_and_w_ld_common_snp: pd.Index,
+                                                     chisq_max=None):
+    # first validate that all sumstats files exist
+    logger.info('Validating sumstats files...')
+    for trait_name, sumstat_file_path in sumstats_config_dict.items():
+        if not os.path.exists(sumstat_file_path):
+            raise FileNotFoundError(f'{sumstat_file_path} not found')
+    # then load all sumstats
+    sumstats_cleaned_dict = {}
+    for trait_name, sumstat_file_path in sumstats_config_dict.items():
+        sumstats_cleaned_dict[trait_name] = _preprocess_sumstats(trait_name, sumstat_file_path,
+                                                                 baseline_and_w_ld_common_snp, chisq_max)
+    # get the common SNPs among all sumstats
+    common_snp_among_all_sumstats = None
+    for trait_name, sumstats in sumstats_cleaned_dict.items():
+        if common_snp_among_all_sumstats is None:
+            common_snp_among_all_sumstats = sumstats.index
+        else:
+            common_snp_among_all_sumstats = common_snp_among_all_sumstats.intersection(sumstats.index)
+
+    # keep only the SNPs shared by every sumstats table
+    for trait_name, sumstats in sumstats_cleaned_dict.items():
+        sumstats_cleaned_dict[trait_name] = sumstats.loc[common_snp_among_all_sumstats]
+
+    logger.info(f'Common SNPs among all sumstats: {len(common_snp_among_all_sumstats)}')
+    return sumstats_cleaned_dict, common_snp_among_all_sumstats
+
+
 def run_spatial_ldsc(config: SpatialLDSCConfig):
-    global spatial_annotation, baseline_annotation, n_blocks, Nbar, sumstats, x_tot_precomputed_common_snp, w_ld_common_snp
+    global spatial_annotation, baseline_annotation, n_blocks, Nbar, sumstats, ref_ld_baseline_column_sum, w_ld_common_snp
     # config
     n_blocks = config.n_blocks
     sample_name = config.sample_name
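Note: the quick_mode class above collapses per-spot LD score computation into a single matrix product: a fixed SNP × gene weight matrix times a gene × spot marker-score chunk yields a SNP × spot block of LD scores. A toy illustration with dense numpy arrays (the real weight matrix is loaded from an AnnData file and may be sparse, in which case the @ operator still applies):

    import numpy as np

    n_snp, n_gene, n_spot = 5, 3, 4
    snp_gene_weight = np.random.rand(n_snp, n_gene)  # fixed once per reference panel
    mk_score_chunk = np.random.rand(n_gene, n_spot)  # gene specificity scores for one chunk of spots

    ldscore_chunk = snp_gene_weight @ mk_score_chunk  # shape (n_snp, n_spot)
    assert ldscore_chunk.shape == (n_snp, n_spot)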
@@ -144,72 +220,107 @@ def run_spatial_ldsc(config: SpatialLDSCConfig):
     w_ld_cname = w_ld.columns[1]
     w_ld.set_index('SNP', inplace=True)
 
-    # Load the baseline annotations
-    ld_file_baseline = f'{config.ldscore_input_dir}/baseline/baseline.'
+    ld_file_baseline = f'{config.ldscore_save_dir}/baseline/baseline.'
+
     ref_ld_baseline = _read_ref_ld_v2(ld_file_baseline)
-    n_annot_baseline = len(ref_ld_baseline.columns)
-    M_annot_baseline = _read_M_v2(ld_file_baseline, n_annot_baseline, config.not_M_5_50)
+    # n_annot_baseline = len(ref_ld_baseline.columns)
+    # M_annot_baseline = _read_M_v2(ld_file_baseline, n_annot_baseline, config.not_M_5_50)
 
     # common snp between baseline and w_ld
     baseline_and_w_ld_common_snp = ref_ld_baseline.index.intersection(w_ld.index)
-    if len(baseline_and_w_ld_common_snp) < 200000:
-        logger.warning(f'WARNING: number of SNPs less than 200k; for {sample_name} this is almost always bad.')
-    ref_ld_baseline = ref_ld_baseline.loc[baseline_and_w_ld_common_snp]
+    baseline_and_w_ld_common_snp_pos = pd.Index(ref_ld_baseline.index).get_indexer(baseline_and_w_ld_common_snp)
+
+    # Clean the sumstats
+    sumstats_cleaned_dict, common_snp_among_all_sumstats = _get_sumstats_with_common_snp_from_sumstats_dict(
+        config.sumstats_config_dict, baseline_and_w_ld_common_snp,
+        chisq_max=config.chisq_max)
+    common_snp_among_all_sumstats_pos = ref_ld_baseline.index.get_indexer(common_snp_among_all_sumstats)
+
+    # ensure the order is monotonic
+    assert pd.Series(
+        common_snp_among_all_sumstats_pos).is_monotonic_increasing, 'common_snp_among_all_sumstats_pos is not monotonic increasing'
+
+    if len(common_snp_among_all_sumstats) < 200000:
+        logger.warning(
+            f'!!!!! WARNING: number of SNPs less than 200k; for {sample_name} this is almost always bad. Please check the sumstats files.')
+
+    ref_ld_baseline = ref_ld_baseline.loc[common_snp_among_all_sumstats]
+    w_ld = w_ld.loc[common_snp_among_all_sumstats]
 
     # load additional baseline annotations
     if config.use_additional_baseline_annotation:
-        ld_file_baseline_additional = f'{config.ldscore_input_dir}/additional_baseline/baseline.'
+        print('Using additional baseline annotations')
+        ld_file_baseline_additional = f'{config.ldscore_save_dir}/additional_baseline/baseline.'
         ref_ld_baseline_additional = _read_ref_ld_v2(ld_file_baseline_additional)
         n_annot_baseline_additional = len(ref_ld_baseline_additional.columns)
         logger.info(f'{len(ref_ld_baseline_additional.columns)} additional baseline annotations loaded')
         # M_annot_baseline_additional = _read_M_v2(ld_file_baseline_additional, n_annot_baseline_additional,
         #                                          config.not_M_5_50)
-        ref_ld_baseline_additional = ref_ld_baseline_additional.loc[baseline_and_w_ld_common_snp]
+        ref_ld_baseline_additional = ref_ld_baseline_additional.loc[common_snp_among_all_sumstats]
         ref_ld_baseline = pd.concat([ref_ld_baseline, ref_ld_baseline_additional], axis=1)
         del ref_ld_baseline_additional
 
-    w_ld = w_ld.loc[baseline_and_w_ld_common_snp]
-
-    # Clean the sumstats
-    sumstats_cleaned_dict = _get_sumstats_from_sumstats_dict(config.sumstats_config_dict, baseline_and_w_ld_common_snp,
-                                                             chisq_max=config.chisq_max)
+    # Detect available chunk files
+    if config.ldscore_save_format == 'quick_mode':
+        s_ldsc = S_LDSC_Boost_with_pre_calculate_SNP_Gene_weight_matrix(config, common_snp_among_all_sumstats_pos)
+        total_chunk_number_found = len(s_ldsc.chunk_starts)
+        print(f'Split data into {total_chunk_number_found} chunks')
+    else:
+        all_file = os.listdir(config.ldscore_save_dir)
+        total_chunk_number_found = sum('chunk' in name for name in all_file)
+        print(f'Find {total_chunk_number_found} chunked files in {config.ldscore_save_dir}')
 
-    # Detect avalable chunk files
-    all_file = os.listdir(config.ldscore_input_dir)
     if config.all_chunk is None:
-        all_chunk = sum('chunk' in name for name in all_file)
-        print(f'\t')
-        print(f'Find {all_chunk} chunked files')
+        if config.chunk_range is not None:
+            assert config.chunk_range[0] >= 1 and config.chunk_range[
+                1] <= total_chunk_number_found, 'Chunk range out of bound. It should be in [1, all_chunk]'
+            print(
+                f'chunk range provided, using chunked files from {config.chunk_range[0]} to {config.chunk_range[1]}')
+            start_chunk, end_chunk = config.chunk_range
+        else:
+            start_chunk, end_chunk = 1, total_chunk_number_found
     else:
         all_chunk = config.all_chunk
         print(f'using {all_chunk} chunked files by provided argument')
         print(f'\t')
         print(f'Input {all_chunk} chunked files')
+        start_chunk, end_chunk = 1, all_chunk
+
+    running_chunk_number = end_chunk - start_chunk + 1
 
     # Process each chunk
     output_dict = defaultdict(list)
-    for chunk_index in range(1, all_chunk + 1):
-        print(f'------Processing chunk-{chunk_index}')
-
-        # Load the spatial annotations for this chunk
-        ld_file_spatial = f'{config.ldscore_input_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.'
-        ref_ld_spatial = _read_ref_ld_v2(ld_file_spatial)
-        ref_ld_spatial = ref_ld_spatial.loc[baseline_and_w_ld_common_snp]
-        ref_ld_spatial = ref_ld_spatial.astype(np.float32, copy=False)
+    zarr_path = Path(config.ldscore_save_dir) / f'{config.sample_name}.ldscore.zarr'
+    if config.ldscore_save_format == 'zarr':
+        assert zarr_path.exists(), f'{zarr_path} not found, which is required for zarr format'
+        zarr_file = zarr.open(str(zarr_path))
+        spots_name = zarr_file.attrs['spot_names']
+
+    for chunk_index in range(start_chunk, end_chunk + 1):
+        if config.ldscore_save_format == 'feather':
+            ref_ld_spatial, spatial_annotation_cnames = load_ldscore_chunk_from_feather(chunk_index,
+                                                                                        common_snp_among_all_sumstats_pos,
+                                                                                        config,
+                                                                                        )
+        elif config.ldscore_save_format == 'zarr':
+            ref_ld_spatial = zarr_file.blocks[:, chunk_index - 1][common_snp_among_all_sumstats_pos]
+            start_spot = (chunk_index - 1) * zarr_file.chunks[1]
+            ref_ld_spatial = ref_ld_spatial.astype(np.float32, copy=False)
+            spatial_annotation_cnames = spots_name[start_spot:start_spot + zarr_file.chunks[1]]
+        elif config.ldscore_save_format == 'quick_mode':
+            ref_ld_spatial, spatial_annotation_cnames = s_ldsc.fetch_ldscore_by_chunk(chunk_index - 1)
+        else:
+            raise ValueError(f'Invalid ld score save format: {config.ldscore_save_format}')
 
         # get the x_tot_precomputed matrix by adding baseline and spatial annotation
-        x_tot_precomputed = ref_ld_spatial + ref_ld_baseline.sum(axis=1).values.reshape((-1, 1))
+        ref_ld_baseline_column_sum = ref_ld_baseline.sum(axis=1).values
+        # x_tot_precomputed = ref_ld_spatial + ref_ld_baseline_column_sum
 
         for trait_name, sumstats in sumstats_cleaned_dict.items():
-            logger.info(f'Processing {trait_name}...')
 
-            # filter ldscore by common snp
-            common_snp = sumstats.index
-            spatial_annotation = ref_ld_spatial.loc[common_snp].astype(np.float32, copy=False)
-            spatial_annotation_cnames = spatial_annotation.columns
-            baseline_annotation = ref_ld_baseline.loc[common_snp].astype(np.float32, copy=False)
-            w_ld_common_snp = w_ld.loc[common_snp].astype(np.float32, copy=False)
-            x_tot_precomputed_common_snp = x_tot_precomputed.loc[common_snp].values
+            spatial_annotation = ref_ld_spatial.astype(np.float32, copy=False)
+            baseline_annotation = ref_ld_baseline.copy().astype(np.float32, copy=False)
+            w_ld_common_snp = w_ld.astype(np.float32, copy=False)
 
             # weight the baseline annotation by N
             baseline_annotation = baseline_annotation * sumstats.N.values.reshape((-1, 1)) / sumstats.N.mean()
@@ -219,10 +330,11 @@ def run_spatial_ldsc(config: SpatialLDSCConfig):
             # Run the jackknife
             Nbar = sumstats.N.mean()
             chunk_size = spatial_annotation.shape[1]
-            out_chunk = process_map(jackknife_for_processmap, range(chunk_size),
-                                    max_workers=config.num_processes,
-                                    chunksize=10,
-                                    desc=f'LDSC chunk-{chunk_index}: {trait_name}')
+            out_chunk = thread_map(jackknife_for_processmap, range(chunk_size),
+                                   max_workers=config.num_processes,
+                                   chunksize=10,
+                                   desc=f'Chunk-{chunk_index}/Total-chunk-{running_chunk_number} for {trait_name}',
+                                   )
 
             # cache the results
             out_chunk = pd.DataFrame.from_records(out_chunk,
@@ -230,7 +342,8 @@ def run_spatial_ldsc(config: SpatialLDSCConfig):
                                                   index=spatial_annotation_cnames)
             # get the spots with nan
             nan_spots = out_chunk[out_chunk.isna().any(axis=1)].index
-            logger.info(f'Nan spots: {nan_spots} in chunk-{chunk_index} for {trait_name}. They are removed.')
+            if len(nan_spots) > 0:
+                logger.info(f'Nan spots: {nan_spots} in chunk-{chunk_index} for {trait_name}. They are removed.')
             # drop the nan
             out_chunk = out_chunk.dropna()
 
@@ -238,70 +351,32 @@ def run_spatial_ldsc(config: SpatialLDSCConfig):
             out_chunk['p'] = norm.sf(out_chunk['z'])
             output_dict[trait_name].append(out_chunk)
 
-            # garbage collection
-            del spatial_annotation
+        del ref_ld_spatial, spatial_annotation, baseline_annotation, w_ld_common_snp
+        gc.collect()
 
     # Save the results
-    out_dir = Path(config.ldsc_save_dir)
-    out_dir.mkdir(parents=True, exist_ok=True, mode=0o777)
+    out_dir = config.ldsc_save_dir
     for trait_name, out_chunk_list in output_dict.items():
         out_all = pd.concat(out_chunk_list, axis=0)
-        out_file_name = out_dir / f'{sample_name}_{trait_name}.csv.gz'
+        if running_chunk_number == total_chunk_number_found:
+            out_file_name = out_dir / f'{sample_name}_{trait_name}.csv.gz'
+        else:
+            out_file_name = out_dir / f'{sample_name}_{trait_name}_chunk{start_chunk}-{end_chunk}.csv.gz'
         out_all['spot'] = out_all.index
         out_all = out_all[['spot', 'beta', 'se', 'z', 'p']]
         out_all.to_csv(out_file_name, compression='gzip', index=False)
+
         logger.info(f'Output saved to {out_file_name} for {trait_name}')
     logger.info(f'------Spatial LDSC for {sample_name} finished!')
 
 
-# %%
-if __name__ == '__main__':
-    # Main function of analysis
-    parser = argparse.ArgumentParser(
-        description="Run Spatial LD Score Regression (LDSC) analysis for GWAS and spatial transcriptomic data."
-    )
-    parser = add_spatial_ldsc_args(parser)
-    TEST = True
-    if TEST:
-        gwas_root = "/storage/yangjianLab/songliyang/GWAS_trait/LDSC"
-        gwas_trait = "/storage/yangjianLab/songliyang/GWAS_trait/GWAS_Public_Use_MaxPower.csv"
-        root = "/storage/yangjianLab/songliyang/SpatialData/Data/Brain/Human/Nature_Neuroscience_2021/processed/h5ad"
-
-        name = 'Cortex_151507'
-        spe_name = name
-        # ld_pth = f"/storage/yangjianLab/songliyang/SpatialData/Data/Brain/Human/Nature_Neuroscience_2021/annotation/{spe_name}/snp_annotation"
-        ld_pth = f"/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021/snake_workdir/{name}/generate_ldscore"
-        out_pth = f"/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021/snake_workdir/{name}/ldsc"
-        gwas_file = "ADULT1_ADULT2_ONSET_ASTHMA"
-        # Prepare the arguments list using f-strings
-        args_list = [
-            "--h2", f"{gwas_root}/{gwas_file}.sumstats.gz",
-            "--w_file", "/storage/yangjianLab/sharedata/LDSC_resource/LDSC_SEG_ldscores/weights_hm3_no_hla/weights.",
-            "--sample_name", spe_name,
-            "--num_processes", '4',
-            "--ldscore_input_dir", ld_pth,
-            "--ldsc_save_dir", out_pth,
-            '--trait_name', 'adult1_adult2_onset_asthma'
-        ]
-        # args = parser.parse_args(args_list)
-    else:
-        args = parser.parse_args()
-
-    os.chdir('/storage/yangjianLab/chenwenhao/tmp/gsMap_Height_debug')
-    TASK_ID = 16
-    spe_name = f'E{TASK_ID}.5_E1S1'
-    config = SpatialLDSCConfig(**{'all_chunk': None,
-                                  'chisq_max': None,
-                                  # 'sumstats_file': '/storage/yangjianLab/songliyang/GWAS_trait/LDSC/GIANT_EUR_Height_2022_Nature.sumstats.gz',
-                                  'ldsc_save_dir': f'{spe_name}/ldsc_results_three_row_sum_sub_config_traits',
-                                  'ldscore_input_dir': '/storage/yangjianLab/songliyang/SpatialData/Data/Embryo/Mice/Cell_MOSTA/annotation/E16.5_E1S1/generate_ldscore_new',
-                                  'n_blocks': 200,
-                                  'not_M_5_50': False,
-                                  'num_processes': 15,
-                                  'sample_name': spe_name,
-                                  # 'trait_name': 'GIANT_EUR_Height_2022_Nature',
-                                  'sumstats_config_file': '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/src/gsMap/example/sumstats_config_sub.yaml',
-                                  'w_file': '/storage/yangjianLab/sharedata/LDSC_resource/LDSC_SEG_ldscores/weights_hm3_no_hla/weights.'
-                                  })
-    # config = SpatialLDSCConfig(**vars(args))
-    run_spatial_ldsc(config)
+def load_ldscore_chunk_from_feather(chunk_index, common_snp_among_all_sumstats_pos, config, ):
+    # Load the spatial annotations for this chunk
+    sample_name = config.sample_name
+    ld_file_spatial = f'{config.ldscore_save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.'
+    ref_ld_spatial = _read_ref_ld_v2(ld_file_spatial)
+    ref_ld_spatial = ref_ld_spatial.iloc[common_snp_among_all_sumstats_pos]
+    ref_ld_spatial = ref_ld_spatial.astype(np.float32, copy=False)
+
+    spatial_annotation_cnames = ref_ld_spatial.columns
+    return ref_ld_spatial.values, spatial_annotation_cnames
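Note: the zarr branch in the hunk above pulls one column-block of the (SNP x spot) LD score matrix per chunk via Array.blocks, then row-filters it with the precomputed integer SNP positions. A small self-contained sketch of that access pattern (paths, shapes, and names are illustrative; Array.blocks requires zarr-python >= 2.12):

    import numpy as np
    import zarr

    # write a toy (snp x spot) LD score matrix, chunked along the spot axis
    z = zarr.open('ldscore_demo.zarr', mode='w', shape=(100, 40), chunks=(100, 10), dtype='f4')
    z[:] = np.random.rand(100, 40).astype('f4')
    z.attrs['spot_names'] = [f'spot_{i}' for i in range(40)]

    # read back the second column-block and keep a subset of SNP rows
    snp_pos = np.array([0, 5, 7])   # stand-in for common_snp_among_all_sumstats_pos
    block = z.blocks[:, 1]          # spots 10..19, all SNPs, as a numpy array
    subset = block[snp_pos]         # positional row filter
    print(subset.shape)             # (3, 10)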
@@ -0,0 +1,198 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>{{ title }}</title>
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <!-- Bootstrap CSS -->
+    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
+    <!-- Custom Styles -->
+    <style>
+        body {
+            padding: 20px;
+            font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
+        }
+        .plot-container {
+            margin-bottom: 50px;
+        }
+        .section-description {
+            color: #6c757d;
+            font-size: 0.95rem;
+            margin-bottom: 20px;
+        }
+        .scrollable-table {
+            max-height: 400px;
+            overflow-y: auto;
+        }
+        .table thead th {
+            position: sticky;
+            top: 0;
+            background-color: #f8f9fa;
+        }
+        img {
+            max-width: 100%;
+            height: auto;
+            border: 1px solid #dee2e6;
+            border-radius: 5px;
+        }
+        .gene-select-label {
+            font-weight: bold;
+            margin-bottom: 10px;
+        }
+        .collapse-toggle {
+            cursor: pointer;
+            color: #0d6efd;
+            text-decoration: underline;
+        }
+    </style>
+</head>
+<body>
+<div class="container-fluid">
+    <h1 class="mb-4">{{ title }}</h1>
+
+    <!-- Genetic Spatial Mapping Plot -->
+    <div class="plot-container">
+        <h2>Genetic Spatial Mapping Plot</h2>
+        <p class="section-description">This plot shows the spatial genetic mapping results across different tissues.</p>
+        <div class="border rounded p-3">
+            {{ genetic_mapping_plot|safe }}
+        </div>
+    </div>
+
+    <!-- Cauchy Combination Result Table -->
+    <div class="plot-container">
+        <h2>Cauchy Combination Result</h2>
+        <p class="section-description">This table presents the results of the Cauchy combination test, summarizing the genetic associations.</p>
+        <div class="scrollable-table">
+            <table class="table table-hover table-bordered">
+                <thead class="table-light">
+                <tr>
+                    <th>Annotation</th>
+                    <th>P Cauchy</th>
+                    <th>P Median</th>
+                </tr>
+                </thead>
+                <tbody>
+                {% for row in cauchy_table %}
+                <tr>
+                    <td>{{ row.annotation }}</td>
+                    <td>{{ "%.4e"|format(row.p_cauchy) }}</td>
+                    <td>{{ "%.4e"|format(row.p_median) }}</td>
+                </tr>
+                {% endfor %}
+                </tbody>
+            </table>
+        </div>
+    </div>
+
+    <!-- Manhattan Plot -->
+    <div class="plot-container">
+        <h2>Diagnosis Manhattan Plot</h2>
+        <p class="section-description">The Manhattan plot shows the association of SNPs with the top associated gene across the genome.</p>
+        <div class="border rounded p-3">
+            {{ manhattan_plot|safe }}
+        </div>
+    </div>
+
+    <!-- Gene Expression and GSS Distribution -->
+    <div class="plot-container">
+        <h2>Gene Expression and GSS Distribution</h2>
+        <p class="section-description">Select a gene to view its expression distribution and gene specificity score (GSS).</p>
+        <label for="geneSelect" class="gene-select-label">Select a gene:</label>
+        <select id="geneSelect" class="form-select mb-4">
+            {% for gene in gene_plots %}
+            <option value="{{ gene.name }}">{{ gene.name }}</option>
+            {% endfor %}
+        </select>
+        <div id="genePlots" class="row">
+            <div class="col-md-6 mb-4">
+                <h5>Expression Distribution</h5>
+                <img src="{{ gene_plots[0].expression_plot }}" alt="{{ gene_plots[0].name }} Expression Distribution" id="expressionPlotImg" class="img-fluid">
+            </div>
+            <div class="col-md-6 mb-4">
+                <h5>Gene Specificity Score (GSS)</h5>
+                <img src="{{ gene_plots[0].gss_plot }}" alt="{{ gene_plots[0].name }} GSS Distribution" id="gssPlotImg" class="img-fluid">
+            </div>
+        </div>
+    </div>
+
+    <!-- Gene Diagnostic Info Table -->
+    <div class="plot-container">
+        <h2>Top 50 Gene Diagnostic Info</h2>
+        <p class="section-description">This table lists the top 50 genes based on diagnostic criteria, including the gene specificity score (GSS) and PCC.</p>
+        <div class="scrollable-table">
+            <table class="table table-hover table-bordered">
+                <thead class="table-light">
+                <tr>
+                    <th>Gene</th>
+                    <th>Annotation</th>
+                    <th>Median GSS</th>
+                    <th>PCC</th>
+                </tr>
+                </thead>
+                <tbody>
+                {% for row in gene_diagnostic_info %}
+                <tr>
+                    <td>{{ row.Gene }}</td>
+                    <td>{{ row.Annotation }}</td>
+                    <td>{{ "%.4f"|format(row.Median_GSS) }}</td>
+                    <td>{{ "%.4f"|format(row.PCC) }}</td>
+                </tr>
+                {% endfor %}
+                </tbody>
+            </table>
+        </div>
+    </div>
+
+    <!-- Running Info (collapsible) -->
+    <div class="plot-container">
+        <h2>Running Info</h2>
+        <p class="section-description">Click to view detailed run information and parameters.</p>
+        <p class="collapse-toggle" data-bs-toggle="collapse" href="#runningInfo" role="button" aria-expanded="false" aria-controls="runningInfo">
+            Show/Hide Running Info
+        </p>
+        <div class="collapse" id="runningInfo">
+            <div class="card card-body">
+                <p><strong>gsMap Version:</strong> {{ gsmap_version }}</p>
+                <p><strong>Parameters:</strong></p>
+                <ul class="mb-0">
+                    {% for key, value in parameters.items() %}
+                    <li><strong>{{ key }}:</strong> {{ value }}</li>
+                    {% endfor %}
+                </ul>
+            </div>
+        </div>
+    </div>
+</div>
+
+<!-- JavaScript for Gene Plots -->
+<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
+<script>
+    (function() {
+        const geneSelect = document.getElementById('geneSelect');
+        const expressionPlotImg = document.getElementById('expressionPlotImg');
+        const gssPlotImg = document.getElementById('gssPlotImg');
+
+        const genePlots = {
+            {% for gene in gene_plots %}
+            "{{ gene.name }}": {
+                expression_plot: "{{ gene.expression_plot }}",
+                gss_plot: "{{ gene.gss_plot }}"
+            }{% if not loop.last %},{% endif %}
+            {% endfor %}
+        };
+
+        geneSelect.addEventListener('change', function() {
+            const selectedGene = this.value;
+            const selectedGenePlots = genePlots[selectedGene];
+
+            // Update images
+            expressionPlotImg.src = selectedGenePlots.expression_plot;
+            expressionPlotImg.alt = `${selectedGene} Expression Distribution`;
+            gssPlotImg.src = selectedGenePlots.gss_plot;
+            gssPlotImg.alt = `${selectedGene} GSS Distribution`;
+        });
+    })();
+</script>
+</body>
+</html>
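Note: the new file above is a Jinja2 report template (placeholders such as {{ title }}, filters such as "%.4e"|format(...)). A minimal sketch of how a template like this might be rendered; the file name, loader path, and context values are assumptions for illustration, not gsMap's actual report code:

    from jinja2 import Environment, FileSystemLoader

    env = Environment(loader=FileSystemLoader('templates'))
    template = env.get_template('report.html')  # hypothetical location of the template above
    html = template.render(
        title='gsMap Report',
        genetic_mapping_plot='<div>plot html</div>',
        manhattan_plot='<div>plot html</div>',
        cauchy_table=[{'annotation': 'Layer1', 'p_cauchy': 1.2e-5, 'p_median': 3.4e-3}],
        gene_plots=[{'name': 'GFAP', 'expression_plot': 'GFAP_exp.png', 'gss_plot': 'GFAP_gss.png'}],
        gene_diagnostic_info=[{'Gene': 'GFAP', 'Annotation': 'Layer1', 'Median_GSS': 2.1, 'PCC': 0.83}],
        gsmap_version='1.64',
        parameters={'sample_name': 'sample_X'},
    )
    with open('gsMap_report.html', 'w') as fh:
        fh.write(html)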
File without changes
@@ -2,7 +2,7 @@ from pathlib import Path
 import bitarray as ba
 import numpy as np
 import pandas as pd
-from scipy.sparse import csr_matrix,csc_matrix
+from scipy.sparse import csr_matrix
 from scipy.sparse import save_npz, load_npz
 from tqdm import trange, tqdm
 
@@ -69,7 +69,7 @@ def ID_List_Factory(colnames, keepcol, fname_end, header=None, usecols=None):
                 raise ValueError('{f} filename must end in {f}'.format(f=end))
             comp = get_compression(fname)
             self.df = pd.read_csv(fname, header=self.header, usecols=self.usecols,
-                                  delim_whitespace=True, compression=comp)
+                                  sep='\s+', compression=comp)
             if self.colnames:
                 self.df.columns = self.colnames
             if self.keepcol is not None:
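Note: delim_whitespace=True was deprecated in pandas 2.2 in favor of sep='\s+'; both split on runs of whitespace, so the change above is behavior-preserving. A quick equivalence check with toy data:

    import io
    import pandas as pd

    text = 'SNP CHR BP\nrs1 1 1000\nrs2 1 2000\n'
    df = pd.read_csv(io.StringIO(text), sep=r'\s+')
    print(df.columns.tolist())  # ['SNP', 'CHR', 'BP']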
@@ -733,11 +733,3 @@ def generate_r2_matrix_cache(bfile_prefix, chromosome_list, r2_cache_dir, ld_win
                                 ld_wind_cm=ld_wind_cm,
                                 output_cache_file_dir=output_cache_file_prefix)
         print(f'Compute r2 matrix for chr{chr} done!')
-
-
-if __name__ == '__main__':
-    bfile_prefix = '/storage/yangjianLab/sharedata/LDSC_resource/1000G_EUR_Phase3_plink/1000G.EUR.QC'
-    chromosome_list = range(1, 22)
-    r2_cache_dir = Path('/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/r2_matrix')
-    ld_wind_cm = 1
-    generate_r2_matrix_cache(bfile_prefix, chromosome_list, r2_cache_dir, ld_wind_cm)