PyPI - gsMap - Versions diffs - 1.62__py3-none-any.whl → 1.63__py3-none-any.whl - Mend

gsMap 1.62__py3-none-any.whl → 1.63__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

gsMap/GNN_VAE/adjacency_matrix.py +1 -1
gsMap/GNN_VAE/model.py +5 -5
gsMap/GNN_VAE/train.py +1 -1
gsMap/__init__.py +1 -1
gsMap/cauchy_combination_test.py +14 -36
gsMap/config.py +473 -404
gsMap/diagnosis.py +273 -0
gsMap/find_latent_representation.py +22 -86
gsMap/format_sumstats.py +79 -82
gsMap/generate_ldscore.py +145 -78
gsMap/latent_to_gene.py +65 -104
gsMap/main.py +1 -9
gsMap/report.py +160 -0
gsMap/run_all_mode.py +195 -0
gsMap/spatial_ldsc_multiple_sumstats.py +187 -112
gsMap/templates/report_template.html +198 -0
gsMap/utils/__init__.py +0 -0
gsMap/{generate_r2_matrix.py → utils/generate_r2_matrix.py} +1 -9
gsMap/{make_annotations.py → utils/make_annotations.py} +1 -43
gsMap/utils/manhattan_plot.py +639 -0
gsMap/{regression_read.py → utils/regression_read.py} +1 -1
gsMap/visualize.py +100 -55
{gsmap-1.62.dist-info → gsmap-1.63.dist-info}/METADATA +16 -46
gsmap-1.63.dist-info/RECORD +30 -0
gsmap-1.62.dist-info/RECORD +0 -24
gsMap/{jackknife.py → utils/jackknife.py} +0 -0
{gsmap-1.62.dist-info → gsmap-1.63.dist-info}/LICENSE +0 -0
{gsmap-1.62.dist-info → gsmap-1.63.dist-info}/WHEEL +0 -0
{gsmap-1.62.dist-info → gsmap-1.63.dist-info}/entry_points.txt +0 -0

gsMap/format_sumstats.py CHANGED Viewed

@@ -1,17 +1,15 @@
-import os
 import numpy as np
-import pandas as pd
-import itertools as it
-import math
-import re
-import argparse
 import logging
-from scipy.stats import chi2
+import re
-from gsMap.config import FormatSumstatsConfig, add_format_sumstats_args
+import math
+import numpy as np
+import pandas as pd
+from scipy.stats import chi2
+from gsMap.config import FormatSumstatsConfig
-VALID_SNPS = set(['AC', 'AG', 'CA', 'CT', 'GA', 'GT', 'TC', 'TG'])
+VALID_SNPS = {'AC', 'AG', 'CA', 'CT', 'GA', 'GT', 'TC', 'TG'}
 logger = logging.getLogger(__name__)
 default_cnames = {
@@ -24,7 +22,7 @@ default_cnames = {
     # P-VALUE
     'P': 'P',
     'PVALUE': 'P',
-    'P_VALUE':  'P',
+    'P_VALUE': 'P',
     'PVAL': 'P',
     'P_VAL': 'P',
     'GC_PVALUE': 'P',
@@ -72,7 +70,7 @@ default_cnames = {
     'EFFECT': 'BETA',
     'b': 'BETA',
     'beta': 'BETA',
-    #SE
+    # SE
     'se': 'SE',
     # INFO
     'INFO': 'INFO',
@@ -103,7 +101,7 @@ def get_compression(fh):
     return compression
-def gwas_checkname(gwas,config):
+def gwas_checkname(gwas, config):
     '''
     Iterpret column names of gwas
     '''
@@ -114,21 +112,27 @@ def gwas_checkname(gwas,config):
     gwas.columns = list(mapped_cnames.values())
     # When column names are provided by users
-    name_updates = {'SNP': config.snp,'A1': config.a1,'A2': config.a2,'INFO': config.info,
-                    'BETA': config.beta,'SE': config.se,'P': config.p,'FRQ': config.frq,'N': config.n,
-                    'Z': config.z,'Chr': config.chr, 'Pos': config.pos,'OR':config.OR, 'SE_OR':config.se_OR}
+    name_updates = {'SNP': config.snp, 'A1': config.a1, 'A2': config.a2, 'INFO': config.info,
+                    'BETA': config.beta, 'SE': config.se, 'P': config.p, 'FRQ': config.frq, 'N': config.n,
+                    'Z': config.z, 'Chr': config.chr, 'Pos': config.pos, 'OR': config.OR, 'SE_OR': config.se_OR}
     for key, value in name_updates.items():
         if value is not None and value in gwas.columns:
             gwas.rename(columns={value: key}, inplace=True)
     new_name = gwas.columns
+    # check the name duplication
+    for head in new_name:
+        numc = list(new_name).count(head)
+        if numc > 1:
+            raise ValueError(f"Found {numc} different {head} columns, please check your {head} column.")
     name_dict = {new_name[i]: old_name[i] for i in range(len(new_name))}
     # When at OR scale
     if 'OR' in new_name and 'SE_OR' in new_name:
-        gwas['BETA'] = gwas.OR.apply(lambda x: math.log(x) if x > 0 else None)
-        gwas['SE'] = gwas.SE_OR.apply(lambda x: math.log(x) if x > 0 else None)
+        gwas['BETA'] = gwas.OR.apply(lambda x: math.log(x) if x > 0 else None)
+        gwas['SE'] = gwas.SE_OR.apply(lambda x: math.log(x) if x > 0 else None)
     interpreting = {
         "SNP": 'Variant ID (e.g., rs number).',
         "A1": 'Allele 1, interpreted as the effect allele for signed sumstat.',
@@ -142,7 +146,7 @@ def gwas_checkname(gwas,config):
         "N": 'Sample size.',
         "INFO": 'INFO score (imputation quality; higher → better imputation).',
         "FRQ": 'Allele frequency of A1.',
-        "Chr":'Chromsome.',
+        "Chr": 'Chromsome.',
         'Pos': 'SNP positions.'
     }
@@ -150,45 +154,46 @@ def gwas_checkname(gwas,config):
     for key, value in interpreting.items():
         if key in new_name:
             print(f'{name_dict[key]}: {interpreting[key]}')
     return gwas
-def gwas_checkformat(gwas,config):
+def gwas_checkformat(gwas, config):
     '''
     Check column names required for different format
     '''
-    if config.format=='gsMap':
-        condition1 = np.any(np.isin(['P', 'Z'],gwas.columns))
-        condition2 = np.all(np.isin(['BETA', 'SE'],gwas.columns))
+    if config.format == 'gsMap':
+        condition1 = np.any(np.isin(['P', 'Z'], gwas.columns))
+        condition2 = np.all(np.isin(['BETA', 'SE'], gwas.columns))
         if not (condition1 or condition2):
-            raise ValueError('To munge GWAS data into gsMap format, either P or Z values, or both BETA and SE values, are required.')
+            raise ValueError(
+                'To munge GWAS data into gsMap format, either P or Z values, or both BETA and SE values, are required.')
         else:
             if 'Z' in gwas.columns:
                 pass
             elif 'P' in gwas.columns:
-                gwas['Z'] = np.sqrt(chi2.isf(gwas.P, 1)) * np.where(gwas['BETA'] < 0, -1, 1)
+                gwas['Z'] = np.sqrt(chi2.isf(gwas.P, 1)) * np.where(gwas['BETA'] < 0, -1, 1)
             else:
                 gwas['Z'] = gwas.BETA / gwas.SE
-    elif config.format=='COJO':
-        condition = np.all(np.isin(['A1','A2','FRQ','BETA','SE','P','N'],gwas.columns))
-        if not condition:
+    elif config.format == 'COJO':
+        condition = np.all(np.isin(['A1', 'A2', 'FRQ', 'BETA', 'SE', 'P', 'N'], gwas.columns))
+        if not condition:
             raise ValueError('To munge GWAS data into COJO format, either A1|A2|FRQ|BETA|SE|P|N, are required.')
         else:
             gwas['Z'] = np.sqrt(chi2.isf(gwas.P, 1)) * np.where(gwas['BETA'] < 0, -1, 1)
     return gwas
-def filter_info(info,config):
+def filter_info(info, config):
     '''Remove INFO < args.info_min (default 0.9) and complain about out-of-bounds INFO.'''
     if type(info) is pd.Series:  # one INFO column
         jj = ((info > 2.0) | (info < 0)) & info.notnull()
         ii = info >= config.info_min
     elif type(info) is pd.DataFrame:  # several INFO columns
         jj = (((info > 2.0) & info.notnull()).any(axis=1) | (
-            (info < 0) & info.notnull()).any(axis=1))
+                (info < 0) & info.notnull()).any(axis=1))
         ii = (info.sum(axis=1) >= config.info_min * (len(info.columns)))
     else:
         raise ValueError('Expected pd.DataFrame or pd.Series.')
@@ -201,7 +206,7 @@ def filter_info(info,config):
     return ii
-def filter_frq(frq,config):
+def filter_frq(frq, config):
     '''
     Filter on MAF. Remove MAF < args.maf_min and out-of-bounds MAF.
     '''
@@ -216,7 +221,7 @@ def filter_frq(frq,config):
     return ii & ~jj
-def filter_pvals(P,config):
+def filter_pvals(P, config):
     '''Remove out-of-bounds P-values'''
     ii = (P > 0) & (P <= 1)
     bad_p = (~ii).sum()
@@ -232,17 +237,17 @@ def filter_alleles(a):
     return a.isin(VALID_SNPS)
-def gwas_qc(gwas,config):
+def gwas_qc(gwas, config):
     '''
     Filter out SNPs based on INFO, FRQ, MAF, N, and Genotypes.
     '''
     old = len(gwas)
     print(f'\nFiltering SNPs as follows:')
     # filter: SNPs with missing values
-    drops = {'NA': 0, 'P': 0, 'INFO': 0, 'FRQ': 0, 'A': 0, 'SNP': 0, 'Dup': 0, 'N':0}
+    drops = {'NA': 0, 'P': 0, 'INFO': 0, 'FRQ': 0, 'A': 0, 'SNP': 0, 'Dup': 0, 'N': 0}
     gwas = gwas.dropna(axis=0, how="any", subset=filter(
-                lambda x: x != 'INFO', gwas.columns)).reset_index(drop=True)
+        lambda x: x != 'INFO', gwas.columns)).reset_index(drop=True)
     drops['NA'] = old - len(gwas)
     print(f'Removed {drops["NA"]} SNPs with missing values.')
@@ -250,21 +255,21 @@ def gwas_qc(gwas,config):
     # filter: SNPs with Info < 0.9
     if 'INFO' in gwas.columns:
         old = len(gwas)
-        gwas = gwas.loc[filter_info(gwas['INFO'],config)]
+        gwas = gwas.loc[filter_info(gwas['INFO'], config)]
         drops['INFO'] = old - len(gwas)
         print(f'Removed {drops["INFO"]} SNPs with INFO <= 0.9.')
     # filter: SNPs with MAF <= 0.01
     if 'FRQ' in gwas.columns:
         old = len(gwas)
-        gwas = gwas.loc[filter_frq(gwas['FRQ'],config)]
+        gwas = gwas.loc[filter_frq(gwas['FRQ'], config)]
         drops['FRQ'] += old - len(gwas)
         print(f'Removed {drops["FRQ"]} SNPs with MAF <= 0.01.')
     # filter: P-value that out-of-bounds [0,1]
     if 'P' in gwas.columns:
         old = len(gwas)
-        gwas = gwas.loc[filter_pvals(gwas['P'],config)]
+        gwas = gwas.loc[filter_pvals(gwas['P'], config)]
         drops['P'] += old - len(gwas)
         print(f'Removed {drops["P"]} SNPs with out-of-bounds p-values.')
@@ -289,11 +294,11 @@ def gwas_qc(gwas,config):
     gwas = gwas[gwas.N >= n_min].reset_index(drop=True)
     drops['N'] += old - len(gwas)
     print(f'Removed {drops["N"]} SNPs with N < {n_min}.')
     return gwas
-def variant_to_rsid(gwas,config):
+def variant_to_rsid(gwas, config):
     '''
     Convert variant id (Chr, Pos) to rsid
     '''
@@ -303,42 +308,42 @@ def variant_to_rsid(gwas,config):
     chr_format = [re.sub(r'\d+', '', value) for value in chr_format][1]
     dtype = {'chr': str, 'pos': str, 'ref': str, 'alt': str, 'dbsnp': str}
-    chunk_iter = pd.read_csv(config.dbsnp, chunksize=config.chunksize, sep="\t",  skiprows=1,
+    chunk_iter = pd.read_csv(config.dbsnp, chunksize=config.chunksize, sep="\t", skiprows=1,
                              dtype=dtype, names=['chr', 'pos', 'ref', 'alt', 'dbsnp'])
     # Iterate over chunks
     matching_id = pd.DataFrame()
     for chunk in chunk_iter:
-        chunk['id'] = chr_format+chunk["chr"]+"_"+chunk["pos"]
-        matching_id = pd.concat([matching_id, chunk[chunk['id'].isin(unique_ids)][['dbsnp','id']]])
+        chunk['id'] = chr_format + chunk["chr"] + "_" + chunk["pos"]
+        matching_id = pd.concat([matching_id, chunk[chunk['id'].isin(unique_ids)][['dbsnp', 'id']]])
     matching_id = matching_id.drop_duplicates(subset='dbsnp').reset_index(drop=True)
     matching_id = matching_id.drop_duplicates(subset='id').reset_index(drop=True)
     matching_id.index = matching_id.id
     return matching_id
-def clean_SNP_id(gwas,config):
+def clean_SNP_id(gwas, config):
     '''
     Clean SNP id
     '''
     old = len(gwas)
     condition1 = 'SNP' in gwas.columns
-    condition2 = np.all(np.isin(['Chr', 'Pos'],gwas.columns))
+    condition2 = np.all(np.isin(['Chr', 'Pos'], gwas.columns))
     if not (condition1 or condition2):
-             raise ValueError('Either SNP rsid, or both SNP chromosome and position, are required.')
+        raise ValueError('Either SNP rsid, or both SNP chromosome and position, are required.')
     elif condition1:
         pass
     elif condition2:
         if config.dbsnp is None:
-             raise ValueError('To Convert SNP positions to rsid, dbsnp reference is required.')
+            raise ValueError('To Convert SNP positions to rsid, dbsnp reference is required.')
         else:
-            gwas['id'] = gwas["Chr"].astype(str)+"_"+gwas["Pos"].astype(str)
+            gwas['id'] = gwas["Chr"].astype(str) + "_" + gwas["Pos"].astype(str)
             gwas = gwas.drop_duplicates(subset='id').reset_index(drop=True)
             gwas.index = gwas.id
-            matching_id = variant_to_rsid(gwas,config)
+            matching_id = variant_to_rsid(gwas, config)
             gwas = gwas.loc[matching_id.id]
             gwas['SNP'] = matching_id.dbsnp
             num_fail = old - len(gwas)
@@ -347,7 +352,7 @@ def clean_SNP_id(gwas,config):
     return gwas
-def gwas_metadata(gwas,config):
+def gwas_metadata(gwas, config):
     '''
     Report key features of GWAS data
     '''
@@ -360,51 +365,43 @@ def gwas_metadata(gwas,config):
     print('Lambda GC = ' + str(round(CHISQ.median() / 0.4549, 3)))
     print('Max chi^2 = ' + str(round(CHISQ.max(), 3)))
-    print('{N} Genome-wide significant SNPs (some may have been removed by filtering).'.format(N=(CHISQ> 29).sum()))
+    print('{N} Genome-wide significant SNPs (some may have been removed by filtering).'.format(N=(CHISQ > 29).sum()))
-def gwas_format(config:FormatSumstatsConfig):
+def gwas_format(config: FormatSumstatsConfig):
     '''
     Format GWAS data
     '''
     print(f'------Formating gwas data for {config.sumstats}...')
-    gwas_file="/storage/yangjianLab/songliyang/GWAS_trait/COJO/Alcohol_Dependence.txt"
-    gwas = pd.read_csv(config.sumstats,delim_whitespace=True,
-                header=0,compression=get_compression(gwas_file),na_values=['.', 'NA'])
+    compression_type = get_compression(config.sumstats)
+    gwas = pd.read_csv(config.sumstats, delim_whitespace=True, header=0, compression=compression_type,
+                       na_values=['.', 'NA'])
     print(f'Read {len(gwas)} SNPs from {config.sumstats}.')
     # Check name and format
-    gwas = gwas_checkname(gwas,config)
-    gwas = gwas_checkformat(gwas,config)
+    gwas = gwas_checkname(gwas, config)
+    gwas = gwas_checkformat(gwas, config)
     # Clean the snp id
-    gwas = clean_SNP_id(gwas,config)
+    gwas = clean_SNP_id(gwas, config)
     # QC
-    gwas = gwas_qc(gwas,config)
+    gwas = gwas_qc(gwas, config)
     # Meta
-    gwas_metadata(gwas,config)
+    gwas_metadata(gwas, config)
     # Saving the data
-    if config.format=='COJO':
-        keep = ['SNP','A1','A2','FRQ','BETA','SE','P','N']
+    if config.format == 'COJO':
+        keep = ['SNP', 'A1', 'A2', 'FRQ', 'BETA', 'SE', 'P', 'N']
         appendix = '.cojo'
-    elif config.format=='gsMap':
-        keep = ["A1","A2","Z","N","SNP"]
+    elif config.format == 'gsMap':
+        keep = ["SNP", "A1", "A2", "Z", "N"]
         appendix = '.sumstats'
     if 'Chr' in gwas.columns and 'Pos' in gwas.columns and config.keep_chr_pos is True:
-        keep = keep + ['Chr','Pos']
+        keep = keep + ['Chr', 'Pos']
     gwas = gwas[keep]
-    out_name = config.out + appendix +'.gz'
+    out_name = config.out + appendix + '.gz'
     print(f'\nWriting summary statistics for {len(gwas)} SNPs to {out_name}.')
     gwas.to_csv(out_name, sep="\t", index=False,
-               float_format='%.3f', compression = 'gzip')
-if __name__ == '__main__':
-        parser = argparse.ArgumentParser(description="Visualization the results")
-        parser = add_format_sumstats_args(parser)
-        args = parser.parse_args()
-        config = FormatSumstatsConfig(**vars(args))
-        gwas_format(config)
+                float_format='%.3f', compression='gzip')

gsMap 1.62__py3-none-any.whl → 1.63__py3-none-any.whl

gsMap 1.62__py3-none-any.whl → 1.63__py3-none-any.whl