gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of gwaslab might be problematic.
- gwaslab/bd_common_data.py +6 -3
- gwaslab/bd_download.py +9 -9
- gwaslab/bd_get_hapmap3.py +43 -9
- gwaslab/data/formatbook.json +722 -721
- gwaslab/g_Log.py +22 -5
- gwaslab/g_Sumstats.py +110 -163
- gwaslab/g_SumstatsPair.py +76 -25
- gwaslab/g_SumstatsT.py +2 -2
- gwaslab/g_Sumstats_summary.py +3 -3
- gwaslab/g_version.py +10 -10
- gwaslab/hm_casting.py +36 -17
- gwaslab/hm_harmonize_sumstats.py +354 -221
- gwaslab/hm_rsid_to_chrpos.py +1 -1
- gwaslab/io_preformat_input.py +49 -43
- gwaslab/io_read_ldsc.py +49 -1
- gwaslab/io_to_formats.py +428 -295
- gwaslab/ldsc_irwls.py +198 -0
- gwaslab/ldsc_jackknife.py +514 -0
- gwaslab/ldsc_ldscore.py +417 -0
- gwaslab/ldsc_parse.py +294 -0
- gwaslab/ldsc_regressions.py +747 -0
- gwaslab/ldsc_sumstats.py +629 -0
- gwaslab/qc_check_datatype.py +3 -3
- gwaslab/qc_fix_sumstats.py +891 -778
- gwaslab/util_ex_calculate_ldmatrix.py +31 -13
- gwaslab/util_ex_gwascatalog.py +25 -25
- gwaslab/util_ex_ldproxyfinder.py +10 -10
- gwaslab/util_ex_ldsc.py +189 -0
- gwaslab/util_ex_process_ref.py +3 -3
- gwaslab/util_ex_run_coloc.py +26 -4
- gwaslab/util_in_calculate_gc.py +6 -6
- gwaslab/util_in_calculate_power.py +42 -43
- gwaslab/util_in_convert_h2.py +8 -8
- gwaslab/util_in_fill_data.py +30 -30
- gwaslab/util_in_filter_value.py +201 -74
- gwaslab/util_in_get_density.py +10 -10
- gwaslab/util_in_get_sig.py +445 -71
- gwaslab/viz_aux_annotate_plot.py +12 -12
- gwaslab/viz_aux_quickfix.py +42 -37
- gwaslab/viz_aux_reposition_text.py +10 -7
- gwaslab/viz_aux_save_figure.py +18 -8
- gwaslab/viz_plot_compare_af.py +32 -33
- gwaslab/viz_plot_compare_effect.py +63 -71
- gwaslab/viz_plot_miamiplot2.py +34 -26
- gwaslab/viz_plot_mqqplot.py +126 -75
- gwaslab/viz_plot_qqplot.py +11 -8
- gwaslab/viz_plot_regionalplot.py +36 -33
- gwaslab/viz_plot_rg_heatmap.py +28 -26
- gwaslab/viz_plot_stackedregional.py +40 -21
- gwaslab/viz_plot_trumpetplot.py +65 -61
- gwaslab-3.4.39.dist-info/LICENSE +674 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
- gwaslab-3.4.39.dist-info/RECORD +80 -0
- gwaslab-3.4.37.dist-info/RECORD +0 -72
- /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
@@ -12,6 +12,7 @@ def tofinemapping(sumstats,
                   study=None,
                   bfile=None,
                   vcf=None,
+                  loci=None,
                   out="./",
                   windowsizekb=1000,
                   n_cores=1,
@@ -27,8 +28,13 @@ def tofinemapping(sumstats,
         suffixes=[""]
     if getlead_args is None:
         getlead_args={"windowsizekb":1000}
-
-
+
+    if loci is None:
+        log.write(" -Loci were not provided. All significant loci will be automatically extracted...")
+        sig_df = getsig(sumstats,id="SNPID",chrom="CHR",pos="POS",p="P"+suffixes[0],**getlead_args)
+    else:
+        sig_df = sumstats.loc[sumstats["SNPID"].isin(loci),:]
+
     # Drop duplicate!!!!
     log.write(" -Dropping duplicated SNPIDs...")
     sumstats = sumstats.drop_duplicates(subset=["SNPID"]).copy()
@@ -170,6 +176,7 @@ def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, w
 def _align_sumstats_with_bim(row, locus_sumstats, ref_bim, log=Log(),suffixes=None):
     if suffixes is None:
         suffixes=[""]
+
     log.write(" -#variants in locus ({}): {}".format(row["SNPID"],len(locus_sumstats)))
     # convert category to string
     locus_sumstats["EA"] = locus_sumstats["EA"].astype("string")
@@ -180,28 +187,35 @@ def _align_sumstats_with_bim(row, locus_sumstats, ref_bim, log=Log(),suffixes=No
     combined_df = pd.merge(ref_bim, locus_sumstats, on="SNPID",how="inner")

     # match allele
-
-    log.write(" -#Variants with matched alleles:{}".format(sum(
+    perfect_match = ((combined_df["EA"] == combined_df["EA_bim"]) & (combined_df["NEA"] == combined_df["NEA_bim"]) )
+    log.write(" -#Variants with perfect matched alleles:{}".format(sum(perfect_match)))

     # fliipped allele
-    ea_mis_match = combined_df["EA"] != combined_df["EA_bim"]
-
+    #ea_mis_match = combined_df["EA"] != combined_df["EA_bim"]
+    flipped_match = ((combined_df["EA"] == combined_df["NEA_bim"])& (combined_df["NEA"] == combined_df["EA_bim"]))
+    log.write(" -#Variants with flipped alleles:{}".format(sum(flipped_match)))

-
-
+    allele_match = perfect_match | flipped_match
+    log.write(" -#Total Variants matched:{}".format(sum(allele_match)))
+
+    if row["SNPID"] not in combined_df.loc[perfect_match,"SNPID"].values:
+        log.warning("Lead variant was not available in reference!")

     # adjust statistics
     output_columns=["SNPID","CHR","POS","EA_bim","NEA_bim"]
     for suffix in suffixes:
         if ("BETA"+suffix in locus_sumstats.columns) and ("SE"+suffix in locus_sumstats.columns):
-
+            log.write(" -Flipping BETA{} for variants with flipped alleles...".format(suffix))
+            combined_df.loc[flipped_match,"BETA"+suffix] = - combined_df.loc[flipped_match,"BETA"+suffix]
             output_columns.append("BETA"+suffix)
             output_columns.append("SE"+suffix)
         if "Z" in locus_sumstats.columns:
-
+            log.write(" -Flipping Z{} for variants with flipped alleles...".format(suffix))
+            combined_df.loc[flipped_match,"Z"+suffix] = - combined_df.loc[flipped_match,"Z"+suffix]
             output_columns.append("Z"+suffix)
         if "EAF" in locus_sumstats.columns:
-
+            log.write(" -Flipping EAF{} for variants with flipped alleles...".format(suffix))
+            combined_df.loc[flipped_match,"EAF"+suffix] = 1 - combined_df.loc[flipped_match,"EAF"+suffix]
             output_columns.append("EAF"+suffix)
         if "N" in locus_sumstats.columns:
             output_columns.append("N"+suffix)
@@ -215,6 +229,7 @@ def _export_snplist_and_locus_sumstats(matched_sumstats, out, study, row, window
     matched_snp_list_path = "{}/{}_{}_{}.snplist.raw".format(out.rstrip("/"), study, row["SNPID"] ,windowsizekb)

     matched_sumstats["SNPID"].to_csv(matched_snp_list_path, index=None, header=None)
+    log.write(" -Exporting SNP list of {} to: {}...".format(len(matched_sumstats) ,matched_snp_list_path))

     # create locus-sumstats EA, NEA, (BETA, SE), Z
     matched_sumstats_path = "{}/{}_{}_{}.sumstats.gz".format(out.rstrip("/"), study, row["SNPID"] ,windowsizekb)
@@ -230,7 +245,10 @@ def _export_snplist_and_locus_sumstats(matched_sumstats, out, study, row, window
             to_export_columns.append("EAF"+suffix)
         if "N"+suffix in matched_sumstats.columns:
             to_export_columns.append("N"+suffix)
-
+
+    log.write(" -Exporting locus sumstats to: {}...".format(matched_sumstats_path))
+    log.write(" -Exported columns: {}...".format(["SNPID"]+to_export_columns))
+    matched_sumstats[ ["SNPID"]+to_export_columns].to_csv(matched_sumstats_path, index=None)
     return matched_snp_list_path, matched_sumstats_path

 def _check_snpid_order(snplist_path, matched_sumstats_snpid,log):
@@ -238,4 +256,4 @@ def _check_snpid_order(snplist_path, matched_sumstats_snpid,log):
     if list(matched_sumstats_snpid) == list(snpid_list):
         log.write(" -Sumstats SNPID order and LD matrix SNPID order are matched.")
     else:
-        log.
+        log.warning("Sumstats SNPID order and LD matrix SNPID order are not matched!")
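The hunks above (from gwaslab/util_ex_calculate_ldmatrix.py, per the file list) add an optional loci argument to tofinemapping: with loci=None the significant lead variants are extracted automatically via getsig, otherwise only the listed SNPIDs are processed. A minimal usage sketch, assuming the module path inferred from the file list and placeholder sumstats/reference paths:

import pandas as pd
from gwaslab.util_ex_calculate_ldmatrix import tofinemapping  # module path inferred from the file list

# harmonized sumstats with SNPID/CHR/POS/EA/NEA/BETA/SE/P columns (placeholder path)
sumstats = pd.read_csv("mystudy.harmonized.tsv", sep="\t")

# 1) loci=None (default): significant loci are picked automatically via getsig
tofinemapping(sumstats, study="mystudy", bfile="ref/plink_prefix", out="./finemap", windowsizekb=1000, n_cores=2)

# 2) loci given: only the listed lead SNPIDs are processed (placeholder SNPID)
tofinemapping(sumstats, study="mystudy", bfile="ref/plink_prefix", loci=["1:12345_A_G"], out="./finemap")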
gwaslab/util_ex_gwascatalog.py
CHANGED
@@ -9,26 +9,26 @@ def gwascatalog_trait(efo,source="NCBI",sig_level=5e-8,verbose=True,log=Log()):
     #https://www.ebi.ac.uk/gwas/rest/docs/api

     base_url = "https://www.ebi.ac.uk/gwas/rest/api/efoTraits/"+efo
-
-
-
-
+    log.write("Start to retrieve data from GWASCatalog...", verbose=verbose)
+    log.write(" -Please make sure your sumstats is based on GRCh38...", verbose=verbose)
+    log.write(" -Requesting (GET) trait information through the GWASCatalog API...", verbose=verbose)
+    log.write(" -EFO trait api: "+ base_url, verbose=verbose)
     text = requests.get(base_url)
-
-
-
-
-
-
+
+    log.write(" -Status code: {}".format(text.status_code), verbose=verbose)
+    if text.status_code!=200:
+        log.write(" -Status code is not 200. Access failed. Please check your internet or the GWAS Catalog sever status.", verbose=verbose)
+        log.write(" -Message:{}".format(text.text), verbose=verbose)
+        return 0

     api_response = json.loads(text.text)
-
-
+    log.write(" -Trait Name:",api_response["trait"], verbose=verbose)
+    log.write(" -Trait URL:",api_response["uri"], verbose=verbose)

     base_url = "https://www.ebi.ac.uk/gwas/rest/api/efoTraits/"+efo+"/associations?projection=associationByEfoTrait"
-
-
-
+    log.write(" -Requesting (GET) GWAS associations through the GWASCatalog API...", verbose=verbose)
+    log.write(" -associationsByTraitSummary API: "+ base_url, verbose=verbose)
+    log.write(" -Note: this step might take a while...", verbose=verbose)

     # get request and check status code of response
     raw_data = requests.get(base_url)
@@ -37,13 +37,13 @@ def gwascatalog_trait(efo,source="NCBI",sig_level=5e-8,verbose=True,log=Log()):
     is_proceed = check_request_status_code(raw_data.status_code,verbose=verbose,log=log)
     if is_proceed is False: return False

-
+    log.write(" -Loading json ...", verbose=verbose)
     # Transform API response from JSON into Python dictionary
     api_response = json.loads(raw_data.text)
-
+    log.write(" -Parsing json ...", verbose=verbose)
     # An
     records=list()
-
+    log.write(" -Number of reported associations for "+ efo +" in GWASCatalog:",len( api_response["_embedded"]["associations"]), verbose=verbose)

     for association in api_response["_embedded"]["associations"]:
         #association statistics:
@@ -126,12 +126,12 @@ def gwascatalog_trait(efo,source="NCBI",sig_level=5e-8,verbose=True,log=Log()):
             records.append(row)
     #rsid locations
     gwascatalog_lead_snps = pd.DataFrame(records,columns=["SNPID","CHR","POS","REPORT_GENENAME","CLOSEST_GENENAMES","FUNCTION_CLASS","OR","BETA","SE","P","TRAIT","STUDY","PUBMEDID","AUTHOR"])
-
-    sigs = gl.Sumstats(gwascatalog_lead_snps,fmt="gwaslab",other=['REPORT_GENENAME', 'CLOSEST_GENENAMES','TRAIT', 'STUDY', 'PUBMEDID','AUTHOR'],verbose=False)
+    log.write(" -Loading retrieved data into gwaslab Sumstats object ...", verbose=verbose)
+    sigs = gl.Sumstats(gwascatalog_lead_snps.copy(),fmt="gwaslab",other=['REPORT_GENENAME', 'CLOSEST_GENENAMES','TRAIT', 'STUDY', 'PUBMEDID','AUTHOR'],verbose=False)
     sigs.fix_pos(verbose=False)
     sigs.fix_chr(verbose=False)
     sigs.sort_coordinate(verbose=False)
-
+    log.write("Finished retrieving data from GWASCatalog...", verbose=verbose)
     #return gwaslab Sumstats object
     return sigs

@@ -142,14 +142,14 @@ def check_request_status_code(request_code,verbose=True,log=Log()):
     is_proceed=False

     if request_code == 200:
-
+        log.write(" -Status code 200 OK: Retrieved data from GWASCatalog successffully ...", verbose=verbose)
         is_proceed=True
     elif request_code == 404:
-
+        log.write(" -Status code 404 Not Found: The requested resource did not exist ...", verbose=verbose)
     elif request_code == 301:
-
+        log.write(" -Status code 301 Moved Permanently: The requested resource did not exist ...", verbose=verbose)
     elif request_code == 400:
-
+        log.write(" -Status code 400 Bad Request: The requested resource did not exist ...", verbose=verbose)

     return is_proceed

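The new logging in gwascatalog_trait also documents its flow: query the EFO trait endpoint, return 0 on a non-200 status, fetch the associationByEfoTrait projection, and load the records into a gl.Sumstats on GRCh38. A minimal sketch calling the module-level function directly, with a placeholder EFO id:

from gwaslab.util_ex_gwascatalog import gwascatalog_trait

sigs = gwascatalog_trait("EFO_0000000", sig_level=5e-8, verbose=True)   # placeholder EFO id
if sigs:                                  # 0 or False indicates a failed API request, per the code above
    print(sigs.data[["SNPID", "CHR", "POS", "P", "TRAIT", "PUBMEDID"]].head())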
gwaslab/util_ex_ldproxyfinder.py
CHANGED
@@ -46,11 +46,11 @@ def _extract_with_ld_proxy( snplist=None,
                             log=Log(),
                             verbose=True,
                             windowsizekb=100,
-                            ld_threshold=0.8
+                            ld_threshold=0.8
                             ):
     ### Load vcf#######################################################################################
-
-
+    log.write("Start to load reference genotype...", verbose=verbose)
+    log.write(" -reference vcf path : "+ vcf_path, verbose=verbose)
     if tabix is None:
         tabix = which("tabix")
     vcf_chr_dict = auto_check_vcf_chr_dict(vcf_path=vcf_path, vcf_chr_dict=vcf_chr_dict, verbose=verbose, log=log)
@@ -122,7 +122,7 @@ def _extract_with_ld_proxy( snplist=None,

         extracted_sumstats = pd.concat([extracted_sumstats, ld_proxies],ignore_index=True)

-
+    log.write("Finished loading reference genotype successfully!", verbose=verbose)
     return extracted_sumstats


@@ -139,13 +139,13 @@ def _get_rsq( row,
     ref_genotype = read_vcf(vcf_path,region=vcf_chr_dict[region[0]]+":"+str(region[1])+"-"+str(region[2]),tabix=tabix)

     if ref_genotype is None:
-
+        log.warning("No data was retrieved. Skipping ...", verbose=verbose)
         ref_genotype=dict()
         ref_genotype["variants/POS"]=np.array([],dtype="int64")
         return None

-
-
+    log.write(" -Retrieving index...", verbose=verbose)
+    log.write(" -Ref variants in the region: {}".format(len(ref_genotype["variants/POS"])), verbose=verbose)
     # match sumstats pos and ref pos:
     # get ref index for its first appearance of sumstats pos
     #######################################################################################
@@ -170,7 +170,7 @@ def _get_rsq( row,
         else:
             # no position match
             return None
-
+    log.write(" -Matching variants using POS, NEA, EA ...", verbose=verbose)

     sumstats["REFINDEX"] = sumstats.loc[:,["POS","NEA","EA"]].apply(lambda x: match_varaint(x), axis=1)
     log.write(" -Matched variants in sumstats and vcf:{} ".format(sum(~sumstats["REFINDEX"].isna())))
@@ -190,7 +190,7 @@ def _get_rsq( row,
     lead_snp_genotype = GenotypeArray([ref_genotype["calldata/GT"][lead_snp_ref_index]]).to_n_alt()
     other_snp_genotype = GenotypeArray(ref_genotype["calldata/GT"][other_snps_ref_index]).to_n_alt()

-
+    log.write(" -Calculating Rsq...", verbose=verbose)

     if len(other_snp_genotype)>1:
         valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype)[0],2)
@@ -198,7 +198,7 @@ def _get_rsq( row,
         valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype),2)
         sumstats.loc[~sumstats["REFINDEX"].isna(),"RSQ"] = valid_r2
     else:
-
+        log.write(" -Lead SNP not found in reference...", verbose=verbose)
         sumstats["RSQ"]=None

     sumstats["RSQ"] = sumstats["RSQ"].astype("float")
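The proxy finder's core is unchanged: _get_rsq reads the reference VCF around each locus, matches variants by POS/NEA/EA, and squares the Rogers-Huff r between the lead SNP genotype and the candidates. A toy sketch of that r² step with scikit-allel, using made-up genotypes and mirroring the GenotypeArray/to_n_alt/rogers_huff_r_between calls in the diff:

import numpy as np
import allel

# 3 variants x 4 samples, diploid genotypes (made up)
gt = allel.GenotypeArray([
    [[0, 0], [0, 1], [1, 1], [0, 1]],   # lead SNP
    [[0, 0], [0, 1], [1, 1], [0, 1]],   # perfectly correlated proxy
    [[1, 1], [0, 0], [0, 1], [0, 0]],   # weakly correlated variant
])
n_alt = gt.to_n_alt()                                    # allele-count coding, as in the diff
lead, others = n_alt[:1], n_alt[1:]
rsq = np.power(allel.rogers_huff_r_between(lead, others)[0], 2)   # r^2 of each candidate against the lead SNP
print(rsq)                                               # proxies with r^2 >= ld_threshold (default 0.8) are kept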
gwaslab/util_ex_ldsc.py
ADDED
@@ -0,0 +1,189 @@
+from gwaslab.ldsc_sumstats import estimate_h2
+from gwaslab.ldsc_sumstats import estimate_rg
+from gwaslab.g_Log import Log
+from gwaslab.qc_fix_sumstats import start_to
+from gwaslab.qc_fix_sumstats import finished
+from gwaslab.qc_fix_sumstats import skipped
+from gwaslab.io_read_ldsc import parse_ldsc_summary
+
+class ARGS():
+    def __init__(self, **args):
+
+        self.out = "ldsc"
+        self.bfile = None
+        self.l2 = False
+        self.extract = None
+        self.keep = None
+        self.ld_wind_snps = None
+        self.ld_wind_kb = None
+        self.ld_wind_cm = None
+        self.print_snps = None
+        self.annot =None
+        self.thin_annot = False
+        self.cts_bin = None
+        self.cts_breaks = None
+        self.cts_names = None
+        self.per_allele = False
+        self.pq_exp =None
+        self.no_print_annot = False
+
+        if "h2" in args.keys():
+            self.h2 = args["h2"]
+        else:
+            self.h2 = None
+
+        self.h2_cts = None
+
+        if "rg" in args.keys():
+            self.rg = args["rg"]
+        else:
+            self.rg = None
+
+        if "ref_ld" in args.keys():
+            self.ref_ld = args["ref_ld"]
+        else:
+            self.ref_ld = None
+
+        if "ref_ld_chr" in args.keys():
+            self.ref_ld_chr = args["ref_ld_chr"]
+        else:
+            self.ref_ld_chr = None
+
+        if "w_ld" in args.keys():
+            self.w_ld = args["w_ld"]
+        else:
+            self.w_ld = None
+
+        if "w_ld_chr" in args.keys():
+            self.w_ld_chr = args["w_ld_chr"]
+        else:
+            self.w_ld_chr = None
+
+        self.overlap_annot = False
+        self.print_coefficients = "ldsc"
+        self.frqfile = None
+        self.frqfile_chr = None
+        self.no_intercept = None
+        self.intercept_h2 = None
+        self.intercept_gencov = None
+        self.M = None
+        self.two_step = None
+        self.chisq_max = None
+        self.ref_ld_chr_cts = None
+        self.print_cov = None
+        self.print_delete_vals = False
+        self.chunk_size = 50
+        self.pickle = False
+        self.yes_really = False
+        self.invert_anyway = False
+        self.n_blocks = 200
+        self.not_M_5_50 = False
+        self.no_check_alleles = False
+        self.return_silly_things = False
+
+        if "samp_prev" in args.keys():
+            self.samp_prev = args["samp_prev"]
+        else:
+            self.samp_prev = None
+
+        if "pop_prev" in args.keys():
+            self.pop_prev = args["pop_prev"]
+        else:
+            self.pop_prev = None
+
+def _estimate_h2_by_ldsc(insumstats, log, verbose=True, **args):
+    sumstats = insumstats.copy()
+    ##start function with col checking##########################################################
+    _start_line = "run LD score regression"
+    _end_line = "running LD score regression"
+    _start_cols =[]
+    _start_function = ".estimate_h2_by_ldsc()"
+    _must_args ={}
+
+    is_enough_info = start_to(sumstats=sumstats,
+                              log=log,
+                              verbose=verbose,
+                              start_line=_start_line,
+                              end_line=_end_line,
+                              start_cols=_start_cols,
+                              start_function=_start_function,
+                              **_must_args)
+    if is_enough_info == False: return None
+    ############################################################################################
+    log.write(" -Run single variate LD score regression:", verbose=verbose)
+    log.write(" -Adopted from LDSC source code: https://github.com/bulik/ldsc", verbose=verbose)
+    log.write(" -Please cite LDSC: Bulik-Sullivan, et al. LD Score Regression Distinguishes Confounding from Polygenicity in Genome-Wide Association Studies. Nature Genetics, 2015.", verbose=verbose)
+    log.write(" -Arguments:", verbose=verbose)
+
+    for key, value in args.items():
+        log.write(" -{}:{}".format(key, value), verbose=verbose)
+
+    default_args = ARGS(**args)
+
+    if "Z" not in sumstats.columns:
+        sumstats["Z"] = sumstats["BETA"]/sumstats["SE"]
+
+    sumstats = sumstats.rename(columns={"EA":"A1","NEA":"A2","rsID":"SNP"})
+
+    log.write(" -LDSC log:", verbose=verbose)
+    summary = estimate_h2(sumstats, default_args, log)
+
+    log.write(" -Results have been stored in .ldsc_h2", verbose=verbose)
+    finished(log=log,verbose=verbose,end_line=_end_line)
+    return parse_ldsc_summary(summary)
+
+def _estimate_rg_by_ldsc(insumstats, other_traits ,log, verbose=True, **args):
+    sumstats = insumstats.copy()
+    ##start function with col checking##########################################################
+    _start_line = "run LD score regression for genetic correlation"
+    _end_line = "running LD score regression for genetic correlation"
+    _start_cols =[]
+    _start_function = ".estimate_rg_by_ldsc()"
+    _must_args ={}
+
+    is_enough_info = start_to(sumstats=sumstats,
+                              log=log,
+                              verbose=verbose,
+                              start_line=_start_line,
+                              end_line=_end_line,
+                              start_cols=_start_cols,
+                              start_function=_start_function,
+                              **_must_args)
+    if is_enough_info == False: return None
+    ############################################################################################
+    log.write(" -Run cross-trait LD score regression:", verbose=verbose)
+    log.write(" -Adopted from LDSC source code: https://github.com/bulik/ldsc", verbose=verbose)
+    log.write(" -Please cite LDSC: Bulik-Sullivan, B., et al. An Atlas of Genetic Correlations across Human Diseases and Traits. Nature Genetics, 2015.", verbose=verbose)
+    log.write(" -Arguments:", verbose=verbose)
+
+    for key, value in args.items():
+        log.write(" -{}:{}".format(key, value), verbose=verbose)
+
+    default_args = ARGS(**args)
+
+    if "Z" not in sumstats.columns:
+        sumstats["Z"] = sumstats["BETA"]/sumstats["SE"]
+
+    sumstats = sumstats.rename(columns={"EA":"A1","NEA":"A2","rsID":"SNP"})
+
+    other_traits_to_use = []
+    alias = default_args.rg.split(",")[1:]
+
+    for index, each_other_sumstats in enumerate(other_traits):
+        log.write(" -Processing sumstats with alias {} ({})".format(alias[index], each_other_sumstats.meta["gwaslab"]["study_name"]))
+        if "rsID" not in each_other_sumstats.data.columns:
+            to_append = each_other_sumstats.filter_hapmap3(verbose=False).data.rename(columns={"EA":"A1","NEA":"A2","rsID":"SNP"})
+        else:
+            to_append = each_other_sumstats.data.rename(columns={"EA":"A1","NEA":"A2","rsID":"SNP"})
+
+        if "Z" not in to_append.columns:
+            to_append["Z"] = to_append["BETA"]/to_append["SE"]
+
+        other_traits_to_use.append(to_append[["SNP","A1","A2","Z","N"]])
+
+    log.write(" -LDSC log:", verbose=verbose)
+    summary = estimate_rg(sumstats[["SNP","A1","A2","Z","N"]], other_traits_to_use, default_args, log)[1]
+
+    log.write(" -Results have been stored in .ldsc_rg", verbose=verbose)
+    finished(log=log,verbose=verbose,end_line=_end_line)
+    return summary
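util_ex_ldsc.py wraps the vendored LDSC code (the new ldsc_*.py modules above): it builds an LDSC-style args object, derives Z from BETA/SE when needed, renames EA/NEA/rsID to A1/A2/SNP, and calls estimate_h2 or estimate_rg. A rough sketch of calling the h2 helper directly, assuming a HapMap3-filtered sumstats table with rsID/EA/NEA/BETA/SE/N columns and placeholder LD-score directories; whether further keywords are required is not shown in the diff:

import pandas as pd
from gwaslab.g_Log import Log
from gwaslab.util_ex_ldsc import _estimate_h2_by_ldsc

sumstats = pd.read_csv("mystudy.hapmap3.tsv", sep="\t")   # placeholder path
h2_summary = _estimate_h2_by_ldsc(
    sumstats,
    Log(),
    ref_ld_chr="eur_w_ld_chr/",   # per-chromosome reference LD scores (placeholder path)
    w_ld_chr="eur_w_ld_chr/",     # regression weight LD scores (placeholder path)
)
print(h2_summary)                 # parsed LDSC summary returned by parse_ldsc_summary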
gwaslab/util_ex_process_ref.py
CHANGED
@@ -89,7 +89,7 @@ def _load_single_bim_to_ref_bims(bpfile_prefix, ref_bims, log):
                              sep="\s+",
                              usecols=[0,1,3,4,5],
                              header=None,
-                             dtype={1:"string",0:"category", 3:"int", 4:"string", 5:"string"}).rename(columns={1:"SNPID",0:"CHR_bim",3:"POS_bim",4:"
+                             dtype={1:"string",0:"category", 3:"int", 4:"string", 5:"string"}).rename(columns={1:"SNPID",0:"CHR_bim",3:"POS_bim",4:"EA_bim",5:"NEA_bim"})
     log.write(" -#variants in ref file: {}".format(len(single_bim)))
     ref_bims.append(single_bim)
     return ref_bims
@@ -104,7 +104,7 @@ def _load_single_pvar_to_ref_bims(bpfile_prefix, ref_bims, log):
                              usecols=[0,1,2,3,4],
                              header=None,
                              comment="#",
-                             dtype={2:"string",0:"category", 1:"int", 3:"string", 4:"string"}).rename(columns={2:"SNPID",0:"CHR_bim",1:"POS_bim",3:"
+                             dtype={2:"string",0:"category", 1:"int", 3:"string", 4:"string"}).rename(columns={2:"SNPID",0:"CHR_bim",1:"POS_bim",3:"EA_bim",4:"NEA_bim"})
     log.write(" -#variants in ref file: {}".format(len(single_bim)))
     ref_bims.append(single_bim)
     return ref_bims
@@ -265,7 +265,7 @@ def _process_vcf(ref_file_prefix,
         except subprocess.CalledProcessError as e:
             log.write(e.output)
         else:
-            log.write(" -Plink {} for CHR {} exists. Skipping...".format(convert ,i))
+            log.write(" -Plink {} for CHR {} exists: {}. Skipping...".format(convert ,i, bpfile_prefix))

     if load_bim == True:
         if convert == "bfile":
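For reference, the .bim-loading pattern shown in the updated lines of _load_single_bim_to_ref_bims, written out as standalone pandas with a placeholder path (PLINK .bim columns are CHR, ID, cM, POS, A1, A2):

import pandas as pd

single_bim = pd.read_csv(
    "ref/mystudy.bim", sep=r"\s+", usecols=[0, 1, 3, 4, 5], header=None,
    dtype={1: "string", 0: "category", 3: "int", 4: "string", 5: "string"},
).rename(columns={1: "SNPID", 0: "CHR_bim", 3: "POS_bim", 4: "EA_bim", 5: "NEA_bim"})

print(single_bim.columns.tolist())   # ['CHR_bim', 'SNPID', 'POS_bim', 'EA_bim', 'NEA_bim']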
gwaslab/util_ex_run_coloc.py
CHANGED
@@ -68,12 +68,16 @@ def _run_coloc_susie(filepath, r="Rscript",
    D1 <- list( "LD"=R, "beta"=df[,"BETA_1"],"varbeta"=df[,"SE_1"]**2,"snp"=df[,"SNPID"],"position"=df[,"POS"],"type"="{type1}","N"={n1}{d1_args})
    D2 <- list( "LD"=R, "beta"=df[,"BETA_2"],"varbeta"=df[,"SE_2"]**2,"snp"=df[,"SNPID"],"position"=df[,"POS"],"type"="{type2}","N"={n2}{d2_args})

+   abf <- coloc.abf(dataset1=D1,dataset2=D2)
+   write.csv(t(data.frame(abf$summary)) , "{output_prefix}.coloc.abf", row.names = FALSE)
+
    S1=runsusie(D1{susie_args})
    S2=runsusie(D2{susie_args})

    susie.res=coloc.susie(S1,S2{coloc_args})

    write.csv(susie.res$summary, "{output_prefix}.coloc.susie", row.names = FALSE)
+
    '''.format(sumstats_path = sumstats,
               ld_r_matrix_path = ld_r_matrix,
               fillna_script = "R[is.na(R)] <- 0" if fillldna==True else "",
@@ -87,7 +91,9 @@ def _run_coloc_susie(filepath, r="Rscript",
               coloc_args = coloc_args,
               output_prefix = output_prefix)

-    log.write(" -coloc script: {}".format("coloc.
+    log.write(" -coloc abf script: {}".format("coloc.abf(dataset1=D1,dataset2=D2)"), verbose=verbose)
+    log.write(" -coloc susie script: {}".format("coloc.susie(S1,S2)"), verbose=verbose)
+
    with open("_{}_{}_gwaslab_coloc_susie_temp.R".format(study,row["SNPID"]),"w") as file:
        file.write(rscript)

@@ -101,21 +107,37 @@ def _run_coloc_susie(filepath, r="Rscript",
        #plink_process.kill()
        log.write(" Running coloc.SuSieR from command line...", verbose=verbose)
        r_log+= output + "\n"
+
+        pip_cs = pd.read_csv("{}.coloc.abf".format(output_prefix))
+        if len(pip_cs)==0:
+            log.write(" -SuSieR result for {} is empty. Please check parameters.".format(output_prefix), verbose=verbose)
+        else:
+            pip_cs["Locus"] = row["SNPID"]
+            pip_cs["STUDY"] = row["study"]
+            pip_cs["hit1"] = row["SNPID"]
+            pip_cs["METHOD"] = "abf"
+            locus_pip_cs = pd.concat([locus_pip_cs,pip_cs],ignore_index=True)
+
        pip_cs = pd.read_csv("{}.coloc.susie".format(output_prefix))
        if len(pip_cs)==0:
            log.write(" -SuSieR result for {} is empty. Please check parameters.".format(output_prefix), verbose=verbose)
        else:
            pip_cs["Locus"] = row["SNPID"]
            pip_cs["STUDY"] = row["study"]
+            pip_cs["METHOD"] = "susie"
            locus_pip_cs = pd.concat([locus_pip_cs,pip_cs],ignore_index=True)
+
        os.remove("_{}_{}_gwaslab_coloc_susie_temp.R".format(study,row["SNPID"]))
+
        if delete == True:
-            os.remove("{}.
+            os.remove("{}.coloc.susie".format(output_prefix))
+            os.remove("{}.coloc.abf".format(output_prefix))
        else:
-            log.write(" -
+            log.write(" -coloc-abf result summary to: {}".format("{}.coloc.abf".format(output_prefix)), verbose=verbose)
+            log.write(" -coloc-susie result summary to: {}".format("{}.coloc.susie".format(output_prefix)), verbose=verbose)

    except subprocess.CalledProcessError as e:
        log.write(e.output)
        os.remove("_{}_{}_gwaslab_coloc_susie_temp.R".format(study,row["SNPID"]))
-    log.write("Finished
+    log.write("Finished clocalization using coloc and SuSiE.", verbose=verbose)
    return locus_pip_cs
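The coloc wrapper now writes two result files per locus, {prefix}.coloc.abf from coloc.abf and {prefix}.coloc.susie from coloc.susie, and tags the collected rows with a METHOD column. A small sketch of that collection step, with placeholder file names and lead SNPID:

import pandas as pd

locus_pip_cs = pd.DataFrame()
for method, path in [("abf", "mystudy_rs123_1000.coloc.abf"),
                     ("susie", "mystudy_rs123_1000.coloc.susie")]:
    res = pd.read_csv(path)
    if len(res) == 0:
        continue                      # empty result: nothing to collect for this method
    res["Locus"] = "rs123"            # lead SNPID of the locus (placeholder)
    res["METHOD"] = method            # new in 3.4.39: rows are tagged by colocalization method
    locus_pip_cs = pd.concat([locus_pip_cs, res], ignore_index=True)

print(locus_pip_cs.head())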
gwaslab/util_in_calculate_gc.py
CHANGED
@@ -12,34 +12,34 @@ def lambdaGC(insumstats,include_chrXYMT=True, x=23 ,y=24, mt=25, mode="P",level=
     sumstats=insumstats.loc[:,["CHR",mode]]

     if include_chrXYMT is False:
-
+        log.write(" -Excluding chrX, chrY, chrMT from lambda GC calculation.", verbose=verbose)
         xymt= [x,y,mt,"chrx","chry","chrmt","chrX","chrY","chrMT","chrM","M","x","y","mt","X","Y","MT"]
         sumstats = sumstats.loc[~sumstats["CHR"].isin(xymt),:]

     indata = sumstats[mode].values
     if len(indata) == 0:
-
+        log.write(" -No available variants to use for calculation.", verbose=verbose)
         return np.nan
     if mode=="p" or mode=="P":
         observedMedianChi2 = sp.stats.chi2.isf(np.nanmedian(indata),1)
         expectedMedianChi2 = sp.stats.chi2.ppf(level,1)
         lambdagc=observedMedianChi2/expectedMedianChi2
-
+        log.write(" -Lambda GC (P mode) at "+ str(1 - level)+ " is"," ","{:.5f}".format(lambdagc), verbose=verbose)
     elif mode=="mlog10p" or mode=="MLOG10P":
         observedMedianChi2 = sp.stats.chi2.isf( np.nanmedian(np.power(10,-indata)) ,1)
         expectedMedianChi2 = sp.stats.chi2.ppf(level,1)
         lambdagc=observedMedianChi2/expectedMedianChi2
-
+        log.write(" -Lambda GC (MLOG10P mode) at "+ str(1- level)+ " is"," ","{:.5f}".format(lambdagc), verbose=verbose)
     elif mode=="z" or mode=="Z":
         observedMedianChi2 = np.median((indata)**2)
         expectedMedianChi2 = sp.stats.chi2.ppf(level,1)
         lambdagc=observedMedianChi2/expectedMedianChi2
-        if verbose:log.write(" -Lambda GC (Z mode) at "+ str(1- level)+ " is"," ","{:.5f}".format(lambdagc))
+        if verbose:log.write(" -Lambda GC (Z mode) at "+ str(1- level)+ " is"," ","{:.5f}".format(lambdagc), verbose=verbose)
     elif mode=="chi2" or mode=="CHISQ":
         observedMedianChi2 = np.median(indata)
         expectedMedianChi2 = sp.stats.chi2.ppf(level,1)
         lambdagc=observedMedianChi2/expectedMedianChi2
-
+        log.write(" -Lambda GC (CHISQ mode) at "+ str(1- level)+ " is"," ","{:.5f}".format(lambdagc), verbose=verbose)
     else:
         return np.nan
     return lambdagc
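All four modes compute the same statistic: lambda GC is the 1-df chi-square quantile of the observed median test statistic divided by the expected median of a chi-square with 1 degree of freedom, chi2.ppf(0.5, 1) ≈ 0.4549. A quick sketch on simulated null P values, where lambda should come out close to 1:

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
p = rng.uniform(size=100_000)                              # null P values
observedMedianChi2 = stats.chi2.isf(np.nanmedian(p), 1)    # chi2(1) quantile of the observed median P
expectedMedianChi2 = stats.chi2.ppf(0.5, 1)                # ~0.4549, median of chi2(1)
print(observedMedianChi2 / expectedMedianChi2)             # lambda GC, ~1.0 under the null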
|