gwaslab 3.4.37__py3-none-any.whl → 3.4.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (37) hide show
  1. gwaslab/data/formatbook.json +722 -721
  2. gwaslab/g_Log.py +8 -0
  3. gwaslab/g_Sumstats.py +26 -147
  4. gwaslab/g_SumstatsPair.py +6 -2
  5. gwaslab/g_Sumstats_summary.py +3 -3
  6. gwaslab/g_version.py +2 -2
  7. gwaslab/hm_casting.py +29 -15
  8. gwaslab/hm_harmonize_sumstats.py +291 -163
  9. gwaslab/hm_rsid_to_chrpos.py +1 -1
  10. gwaslab/io_preformat_input.py +43 -37
  11. gwaslab/io_to_formats.py +428 -295
  12. gwaslab/qc_check_datatype.py +3 -3
  13. gwaslab/qc_fix_sumstats.py +793 -682
  14. gwaslab/util_ex_calculate_ldmatrix.py +29 -11
  15. gwaslab/util_ex_gwascatalog.py +1 -1
  16. gwaslab/util_ex_ldproxyfinder.py +1 -1
  17. gwaslab/util_ex_process_ref.py +3 -3
  18. gwaslab/util_ex_run_coloc.py +26 -4
  19. gwaslab/util_in_convert_h2.py +1 -1
  20. gwaslab/util_in_fill_data.py +2 -2
  21. gwaslab/util_in_filter_value.py +122 -34
  22. gwaslab/util_in_get_density.py +2 -2
  23. gwaslab/util_in_get_sig.py +41 -9
  24. gwaslab/viz_aux_quickfix.py +24 -19
  25. gwaslab/viz_aux_reposition_text.py +7 -4
  26. gwaslab/viz_aux_save_figure.py +6 -5
  27. gwaslab/viz_plot_compare_af.py +5 -5
  28. gwaslab/viz_plot_miamiplot2.py +28 -20
  29. gwaslab/viz_plot_mqqplot.py +109 -72
  30. gwaslab/viz_plot_qqplot.py +11 -8
  31. gwaslab/viz_plot_regionalplot.py +3 -1
  32. gwaslab/viz_plot_trumpetplot.py +15 -6
  33. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/METADATA +2 -2
  34. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/RECORD +37 -37
  35. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/LICENSE +0 -0
  36. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/WHEEL +0 -0
  37. {gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/top_level.txt +0 -0
@@ -12,6 +12,7 @@ def tofinemapping(sumstats,
12
12
  study=None,
13
13
  bfile=None,
14
14
  vcf=None,
15
+ loci=None,
15
16
  out="./",
16
17
  windowsizekb=1000,
17
18
  n_cores=1,
@@ -27,8 +28,13 @@ def tofinemapping(sumstats,
27
28
  suffixes=[""]
28
29
  if getlead_args is None:
29
30
  getlead_args={"windowsizekb":1000}
30
- sig_df = getsig(sumstats,id="SNPID",chrom="CHR",pos="POS",p="P"+suffixes[0],**getlead_args)
31
-
31
+
32
+ if loci is None:
33
+ log.write(" -Loci were not provided. All significant loci will be automatically extracted...")
34
+ sig_df = getsig(sumstats,id="SNPID",chrom="CHR",pos="POS",p="P"+suffixes[0],**getlead_args)
35
+ else:
36
+ sig_df = sumstats.loc[sumstats["SNPID"].isin(loci),:]
37
+
32
38
  # Drop duplicate!!!!
33
39
  log.write(" -Dropping duplicated SNPIDs...")
34
40
  sumstats = sumstats.drop_duplicates(subset=["SNPID"]).copy()
@@ -170,6 +176,7 @@ def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, w
170
176
  def _align_sumstats_with_bim(row, locus_sumstats, ref_bim, log=Log(),suffixes=None):
171
177
  if suffixes is None:
172
178
  suffixes=[""]
179
+
173
180
  log.write(" -#variants in locus ({}): {}".format(row["SNPID"],len(locus_sumstats)))
174
181
  # convert category to string
175
182
  locus_sumstats["EA"] = locus_sumstats["EA"].astype("string")
@@ -180,28 +187,35 @@ def _align_sumstats_with_bim(row, locus_sumstats, ref_bim, log=Log(),suffixes=No
180
187
  combined_df = pd.merge(ref_bim, locus_sumstats, on="SNPID",how="inner")
181
188
 
182
189
  # match allele
183
- allele_match = ((combined_df["EA"] == combined_df["EA_bim"]) & (combined_df["NEA"] == combined_df["NEA_bim"]) ) | ((combined_df["EA"] == combined_df["NEA_bim"])& (combined_df["NEA"] == combined_df["EA_bim"]))
184
- log.write(" -#Variants with matched alleles:{}".format(sum(allele_match)))
190
+ perfect_match = ((combined_df["EA"] == combined_df["EA_bim"]) & (combined_df["NEA"] == combined_df["NEA_bim"]) )
191
+ log.write(" -#Variants with perfect matched alleles:{}".format(sum(perfect_match)))
185
192
 
186
193
  # flipped allele
187
- ea_mis_match = combined_df["EA"] != combined_df["EA_bim"]
188
- log.write(" -#Variants with flipped alleles:{}".format(sum(ea_mis_match)))
194
+ #ea_mis_match = combined_df["EA"] != combined_df["EA_bim"]
195
+ flipped_match = ((combined_df["EA"] == combined_df["NEA_bim"])& (combined_df["NEA"] == combined_df["EA_bim"]))
196
+ log.write(" -#Variants with flipped alleles:{}".format(sum(flipped_match)))
189
197
 
190
- if row["SNPID"] not in combined_df.loc[allele_match,"SNPID"].values:
198
+ allele_match = perfect_match | flipped_match
199
+ log.write(" -#Total Variants matched:{}".format(sum(allele_match)))
200
+
201
+ if row["SNPID"] not in combined_df.loc[perfect_match,"SNPID"].values:
191
202
  log.write(" -Warning: Lead variant was not available in reference!!!!!!!!!!!!!!!")
192
203
 
193
204
  # adjust statistics
194
205
  output_columns=["SNPID","CHR","POS","EA_bim","NEA_bim"]
195
206
  for suffix in suffixes:
196
207
  if ("BETA"+suffix in locus_sumstats.columns) and ("SE"+suffix in locus_sumstats.columns):
197
- combined_df.loc[ea_mis_match,"BETA"+suffix] = - combined_df.loc[ea_mis_match,"BETA"+suffix]
208
+ log.write(" -Flipping BETA{} for variants with flipped alleles...".format(suffix))
209
+ combined_df.loc[flipped_match,"BETA"+suffix] = - combined_df.loc[flipped_match,"BETA"+suffix]
198
210
  output_columns.append("BETA"+suffix)
199
211
  output_columns.append("SE"+suffix)
200
212
  if "Z" in locus_sumstats.columns:
201
- combined_df.loc[ea_mis_match,"Z"+suffix] = - combined_df.loc[ea_mis_match,"Z"+suffix]
213
+ log.write(" -Flipping Z{} for variants with flipped alleles...".format(suffix))
214
+ combined_df.loc[flipped_match,"Z"+suffix] = - combined_df.loc[flipped_match,"Z"+suffix]
202
215
  output_columns.append("Z"+suffix)
203
216
  if "EAF" in locus_sumstats.columns:
204
- combined_df.loc[ea_mis_match,"EAF"+suffix] = 1 - combined_df.loc[ea_mis_match,"EAF"+suffix]
217
+ log.write(" -Flipping EAF{} for variants with flipped alleles...".format(suffix))
218
+ combined_df.loc[flipped_match,"EAF"+suffix] = 1 - combined_df.loc[flipped_match,"EAF"+suffix]
205
219
  output_columns.append("EAF"+suffix)
206
220
  if "N" in locus_sumstats.columns:
207
221
  output_columns.append("N"+suffix)
@@ -215,6 +229,7 @@ def _export_snplist_and_locus_sumstats(matched_sumstats, out, study, row, window
215
229
  matched_snp_list_path = "{}/{}_{}_{}.snplist.raw".format(out.rstrip("/"), study, row["SNPID"] ,windowsizekb)
216
230
 
217
231
  matched_sumstats["SNPID"].to_csv(matched_snp_list_path, index=None, header=None)
232
+ log.write(" -Exporting SNP list of {} to: {}...".format(len(matched_sumstats) ,matched_snp_list_path))
218
233
 
219
234
  # create locus-sumstats EA, NEA, (BETA, SE), Z
220
235
  matched_sumstats_path = "{}/{}_{}_{}.sumstats.gz".format(out.rstrip("/"), study, row["SNPID"] ,windowsizekb)
@@ -230,7 +245,10 @@ def _export_snplist_and_locus_sumstats(matched_sumstats, out, study, row, window
230
245
  to_export_columns.append("EAF"+suffix)
231
246
  if "N"+suffix in matched_sumstats.columns:
232
247
  to_export_columns.append("N"+suffix)
233
- matched_sumstats.loc[:, ["SNPID"]+to_export_columns].to_csv(matched_sumstats_path, index=None)
248
+
249
+ log.write(" -Exporting locus sumstats to: {}...".format(matched_sumstats_path))
250
+ log.write(" -Exported columns: {}...".format(["SNPID"]+to_export_columns))
251
+ matched_sumstats[ ["SNPID"]+to_export_columns].to_csv(matched_sumstats_path, index=None)
234
252
  return matched_snp_list_path, matched_sumstats_path
235
253
 
236
254
  def _check_snpid_order(snplist_path, matched_sumstats_snpid,log):
@@ -127,7 +127,7 @@ def gwascatalog_trait(efo,source="NCBI",sig_level=5e-8,verbose=True,log=Log()):
127
127
  #rsid locations
128
128
  gwascatalog_lead_snps = pd.DataFrame(records,columns=["SNPID","CHR","POS","REPORT_GENENAME","CLOSEST_GENENAMES","FUNCTION_CLASS","OR","BETA","SE","P","TRAIT","STUDY","PUBMEDID","AUTHOR"])
129
129
  if verbose: log.write(" -Loading retrieved data into gwaslab Sumstats object ...")
130
- sigs = gl.Sumstats(gwascatalog_lead_snps,fmt="gwaslab",other=['REPORT_GENENAME', 'CLOSEST_GENENAMES','TRAIT', 'STUDY', 'PUBMEDID','AUTHOR'],verbose=False)
130
+ sigs = gl.Sumstats(gwascatalog_lead_snps.copy(),fmt="gwaslab",other=['REPORT_GENENAME', 'CLOSEST_GENENAMES','TRAIT', 'STUDY', 'PUBMEDID','AUTHOR'],verbose=False)
131
131
  sigs.fix_pos(verbose=False)
132
132
  sigs.fix_chr(verbose=False)
133
133
  sigs.sort_coordinate(verbose=False)
@@ -46,7 +46,7 @@ def _extract_with_ld_proxy( snplist=None,
46
46
  log=Log(),
47
47
  verbose=True,
48
48
  windowsizekb=100,
49
- ld_threshold=0.8,
49
+ ld_threshold=0.8
50
50
  ):
51
51
  ### Load vcf#######################################################################################
52
52
  if verbose: log.write("Start to load reference genotype...")
@@ -89,7 +89,7 @@ def _load_single_bim_to_ref_bims(bpfile_prefix, ref_bims, log):
89
89
  sep="\s+",
90
90
  usecols=[0,1,3,4,5],
91
91
  header=None,
92
- dtype={1:"string",0:"category", 3:"int", 4:"string", 5:"string"}).rename(columns={1:"SNPID",0:"CHR_bim",3:"POS_bim",4:"NEA_bim",5:"EA_bim"})
92
+ dtype={1:"string",0:"category", 3:"int", 4:"string", 5:"string"}).rename(columns={1:"SNPID",0:"CHR_bim",3:"POS_bim",4:"EA_bim",5:"NEA_bim"})
93
93
  log.write(" -#variants in ref file: {}".format(len(single_bim)))
94
94
  ref_bims.append(single_bim)
95
95
  return ref_bims
@@ -104,7 +104,7 @@ def _load_single_pvar_to_ref_bims(bpfile_prefix, ref_bims, log):
104
104
  usecols=[0,1,2,3,4],
105
105
  header=None,
106
106
  comment="#",
107
- dtype={2:"string",0:"category", 1:"int", 3:"string", 4:"string"}).rename(columns={2:"SNPID",0:"CHR_bim",1:"POS_bim",3:"NEA_bim",4:"EA_bim"})
107
+ dtype={2:"string",0:"category", 1:"int", 3:"string", 4:"string"}).rename(columns={2:"SNPID",0:"CHR_bim",1:"POS_bim",3:"EA_bim",4:"NEA_bim"})
108
108
  log.write(" -#variants in ref file: {}".format(len(single_bim)))
109
109
  ref_bims.append(single_bim)
110
110
  return ref_bims
@@ -265,7 +265,7 @@ def _process_vcf(ref_file_prefix,
265
265
  except subprocess.CalledProcessError as e:
266
266
  log.write(e.output)
267
267
  else:
268
- log.write(" -Plink {} for CHR {} exists. Skipping...".format(convert ,i))
268
+ log.write(" -Plink {} for CHR {} exists: {}. Skipping...".format(convert ,i, bpfile_prefix))
269
269
 
270
270
  if load_bim == True:
271
271
  if convert == "bfile":
@@ -68,12 +68,16 @@ def _run_coloc_susie(filepath, r="Rscript",
68
68
  D1 <- list( "LD"=R, "beta"=df[,"BETA_1"],"varbeta"=df[,"SE_1"]**2,"snp"=df[,"SNPID"],"position"=df[,"POS"],"type"="{type1}","N"={n1}{d1_args})
69
69
  D2 <- list( "LD"=R, "beta"=df[,"BETA_2"],"varbeta"=df[,"SE_2"]**2,"snp"=df[,"SNPID"],"position"=df[,"POS"],"type"="{type2}","N"={n2}{d2_args})
70
70
 
71
+ abf <- coloc.abf(dataset1=D1,dataset2=D2)
72
+ write.csv(t(data.frame(abf$summary)) , "{output_prefix}.coloc.abf", row.names = FALSE)
73
+
71
74
  S1=runsusie(D1{susie_args})
72
75
  S2=runsusie(D2{susie_args})
73
76
 
74
77
  susie.res=coloc.susie(S1,S2{coloc_args})
75
78
 
76
79
  write.csv(susie.res$summary, "{output_prefix}.coloc.susie", row.names = FALSE)
80
+
77
81
  '''.format(sumstats_path = sumstats,
78
82
  ld_r_matrix_path = ld_r_matrix,
79
83
  fillna_script = "R[is.na(R)] <- 0" if fillldna==True else "",
@@ -87,7 +91,9 @@ def _run_coloc_susie(filepath, r="Rscript",
87
91
  coloc_args = coloc_args,
88
92
  output_prefix = output_prefix)
89
93
 
90
- log.write(" -coloc script: {}".format("coloc.susie(S1,S2)"), verbose=verbose)
94
+ log.write(" -coloc abf script: {}".format("coloc.abf(dataset1=D1,dataset2=D2)"), verbose=verbose)
95
+ log.write(" -coloc susie script: {}".format("coloc.susie(S1,S2)"), verbose=verbose)
96
+
91
97
  with open("_{}_{}_gwaslab_coloc_susie_temp.R".format(study,row["SNPID"]),"w") as file:
92
98
  file.write(rscript)
93
99
 
@@ -101,21 +107,37 @@ def _run_coloc_susie(filepath, r="Rscript",
101
107
  #plink_process.kill()
102
108
  log.write(" Running coloc.SuSieR from command line...", verbose=verbose)
103
109
  r_log+= output + "\n"
110
+
111
+ pip_cs = pd.read_csv("{}.coloc.abf".format(output_prefix))
112
+ if len(pip_cs)==0:
113
+ log.write(" -SuSieR result for {} is empty. Please check parameters.".format(output_prefix), verbose=verbose)
114
+ else:
115
+ pip_cs["Locus"] = row["SNPID"]
116
+ pip_cs["STUDY"] = row["study"]
117
+ pip_cs["hit1"] = row["SNPID"]
118
+ pip_cs["METHOD"] = "abf"
119
+ locus_pip_cs = pd.concat([locus_pip_cs,pip_cs],ignore_index=True)
120
+
104
121
  pip_cs = pd.read_csv("{}.coloc.susie".format(output_prefix))
105
122
  if len(pip_cs)==0:
106
123
  log.write(" -SuSieR result for {} is empty. Please check parameters.".format(output_prefix), verbose=verbose)
107
124
  else:
108
125
  pip_cs["Locus"] = row["SNPID"]
109
126
  pip_cs["STUDY"] = row["study"]
127
+ pip_cs["METHOD"] = "susie"
110
128
  locus_pip_cs = pd.concat([locus_pip_cs,pip_cs],ignore_index=True)
129
+
111
130
  os.remove("_{}_{}_gwaslab_coloc_susie_temp.R".format(study,row["SNPID"]))
131
+
112
132
  if delete == True:
113
- os.remove("{}.pipcs".format(output_prefix))
133
+ os.remove("{}.coloc.susie".format(output_prefix))
134
+ os.remove("{}.coloc.abf".format(output_prefix))
114
135
  else:
115
- log.write(" -SuSieR result summary to: {}".format("{}.pipcs".format(output_prefix)), verbose=verbose)
136
+ log.write(" -coloc-abf result summary to: {}".format("{}.coloc.abf".format(output_prefix)), verbose=verbose)
137
+ log.write(" -coloc-susie result summary to: {}".format("{}.coloc.susie".format(output_prefix)), verbose=verbose)
116
138
 
117
139
  except subprocess.CalledProcessError as e:
118
140
  log.write(e.output)
119
141
  os.remove("_{}_{}_gwaslab_coloc_susie_temp.R".format(study,row["SNPID"]))
120
- log.write("Finished finemapping using SuSieR.", verbose=verbose)
142
+ log.write("Finished clocalization using coloc and SuSiE.", verbose=verbose)
121
143
  return locus_pip_cs
@@ -121,7 +121,7 @@ def _get_per_snp_r2(sumstats,
121
121
  if verbose: log.write(" -For r2, {} is used.".format(snpr2))
122
122
  sumstats["F"] = sumstats[snpr2]*(sumstats[n]-1 -k)/((1-sumstats[snpr2]) * k)
123
123
 
124
- if verbose: log.write("Finished calculating per-SNP heritibility!")
124
+ if verbose: log.write("Finished calculating per-SNP heritability!")
125
125
  return sumstats
126
126
  #
127
127
  def get_population_allele_frequency(af, prop, odds_ratio, prevalence,eps=1e-15):
@@ -9,7 +9,7 @@ from gwaslab.g_version import _get_version
9
9
  from gwaslab.qc_check_datatype import check_datatype
10
10
 
11
11
  def filldata(
12
- sumstats,
12
+ insumstats,
13
13
  to_fill=None,
14
14
  df=None,
15
15
  overwrite=False,
@@ -23,7 +23,7 @@ def filldata(
23
23
  # if a string is passed to to_fill, convert it to list
24
24
  if type(to_fill) is str:
25
25
  to_fill = [to_fill]
26
-
26
+ sumstats = insumstats.copy()
27
27
  if verbose: log.write("Start filling data using existing columns...{}".format(_get_version()))
28
28
 
29
29
  check_datatype(sumstats,verbose=verbose,log=log)
@@ -8,6 +8,8 @@ from gwaslab.bd_common_data import get_chr_to_number
8
8
  from gwaslab.g_Log import Log
9
9
  from gwaslab.g_vchange_status import vchange_status
10
10
  from gwaslab.qc_fix_sumstats import sortcoordinate
11
+ from gwaslab.qc_fix_sumstats import start_to
12
+ from gwaslab.qc_fix_sumstats import finished
11
13
 
12
14
  import gc
13
15
  def filtervalues(sumstats,expr,remove=False,verbose=True,log=Log()):
@@ -214,6 +216,24 @@ def filterregionout(sumstats, path=None, chrom="CHR",pos="POS", high_ld=False, b
214
216
  return sumstats
215
217
 
216
218
  def inferbuild(sumstats,status="STATUS",chrom="CHR", pos="POS", ea="EA", nea="NEA",build="19", verbose=True,log=Log()):
219
+ ##start function with col checking##########################################################
220
+ _start_line = "infer genome build version using hapmap3 SNPs"
221
+ _end_line = "inferring genome build version using hapmap3 SNPs"
222
+ _start_cols = [chrom,pos]
223
+ _start_function = ".infer_build()"
224
+ _must_args ={}
225
+
226
+ is_enough_info = start_to(sumstats=sumstats,
227
+ log=log,
228
+ verbose=verbose,
229
+ start_line=_start_line,
230
+ end_line=_end_line,
231
+ start_cols=_start_cols,
232
+ start_function=_start_function,
233
+ **_must_args)
234
+ if is_enough_info == False: return sumstats
235
+ ############################################################################################
236
+
217
237
  inferred_build="Unknown"
218
238
  if verbose:log.write("Start to infer genome build version using hapmap3 SNPs...")
219
239
  data_path_19 = path.dirname(__file__) + '/data/hapmap3_SNPs/hapmap3_db150_hg19.snplist.gz'
@@ -222,42 +242,39 @@ def inferbuild(sumstats,status="STATUS",chrom="CHR", pos="POS", ea="EA", nea="NE
222
242
  hapmap3_ref_19 = pd.read_csv(data_path_19,sep="\s+",usecols=["#CHROM","POS"],dtype={"#CHROM":"string","POS":"string"})
223
243
  hapmap3_ref_38 = pd.read_csv(data_path_38,sep="\s+",usecols=["#CHROM","POS"],dtype={"#CHROM":"string","POS":"string"})
224
244
 
225
- if chrom in sumstats.columns and pos in sumstats.columns:
226
- if verbose: log.write(" -CHR:POS will be used for matching...")
227
- raw_chrpos = sumstats[chrom].astype("string")+":"+sumstats[pos].astype("string")
228
-
229
- hapmap3_ref_19["chr:pos"] = hapmap3_ref_19["#CHROM"]+":"+hapmap3_ref_19["POS"]
230
- hapmap3_ref_38["chr:pos"] = hapmap3_ref_38["#CHROM"]+":"+hapmap3_ref_38["POS"]
231
-
232
- match_count_for_19 = sum(raw_chrpos.isin(hapmap3_ref_19["chr:pos"].values))
233
- match_count_for_38 = sum(raw_chrpos.isin(hapmap3_ref_38["chr:pos"].values))
234
-
235
- if verbose:log.write(" -Matching variants for hg19: num_hg19 = ",match_count_for_19)
236
- if verbose:log.write(" -Matching variants for hg38: num_hg38 = ",match_count_for_38)
237
-
238
- if max(match_count_for_19, match_count_for_38)<10000:
239
- if verbose:log.write(" -Warning: please be cautious due to the limited number of variants.")
240
-
241
- if match_count_for_19 > match_count_for_38:
242
- if verbose:log.write(" -Since num_hg19 >> num_hg38, assigning genome build hg19...")
243
- sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status],1,"9","1")
244
- sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status],2,"9","9")
245
- inferred_build="19"
246
- elif match_count_for_19 < match_count_for_38:
247
- if verbose:log.write(" -Since num_hg19 << num_hg38, assigning genome build hg38...")
248
- sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status],1,"9","3")
249
- sumstats.loc[:,status] = vchange_status(sumstats.loc[:,status],2,"9","8")
250
- inferred_build="38"
251
- else:
252
- if verbose:log.write(" -Since num_hg19 = num_hg38, unable to infer...")
253
- gc.collect()
254
- if verbose:log.write("Finished inferring genome build version using hapmap3 SNPs...")
255
- return sumstats, inferred_build
245
+ if verbose: log.write(" -CHR:POS will be used for matching...")
246
+ raw_chrpos = sumstats[chrom].astype("string")+":"+sumstats[pos].astype("string")
247
+
248
+ hapmap3_ref_19["chr:pos"] = hapmap3_ref_19["#CHROM"]+":"+hapmap3_ref_19["POS"]
249
+ hapmap3_ref_38["chr:pos"] = hapmap3_ref_38["#CHROM"]+":"+hapmap3_ref_38["POS"]
250
+
251
+ match_count_for_19 = sum(raw_chrpos.isin(hapmap3_ref_19["chr:pos"].values))
252
+ match_count_for_38 = sum(raw_chrpos.isin(hapmap3_ref_38["chr:pos"].values))
253
+
254
+ if verbose:log.write(" -Matching variants for hg19: num_hg19 = ",match_count_for_19)
255
+ if verbose:log.write(" -Matching variants for hg38: num_hg38 = ",match_count_for_38)
256
+
257
+ if max(match_count_for_19, match_count_for_38)<10000:
258
+ if verbose:log.write(" -Warning: please be cautious due to the limited number of variants.")
259
+
260
+ if match_count_for_19 > match_count_for_38:
261
+ if verbose:log.write(" -Since num_hg19 >> num_hg38, assigning genome build hg19...")
262
+ sumstats[status] = vchange_status(sumstats[status],1,"9","1")
263
+ sumstats[status] = vchange_status(sumstats[status],2,"9","9")
264
+ inferred_build="19"
265
+ elif match_count_for_19 < match_count_for_38:
266
+ if verbose:log.write(" -Since num_hg19 << num_hg38, assigning genome build hg38...")
267
+ sumstats[status] = vchange_status(sumstats[status],1,"9","3")
268
+ sumstats[status] = vchange_status(sumstats[status],2,"9","8")
269
+ inferred_build="38"
256
270
  else:
257
- gc.collect()
258
- raise ValueError("Not enough information to match SNPs. Please check if CHR and POS columns are in your sumstats...")
271
+ if verbose:log.write(" -Since num_hg19 = num_hg38, unable to infer...")
272
+
273
+ finished(log,verbose,_end_line)
274
+ return sumstats, inferred_build
259
275
 
260
276
  def sampling(sumstats,n=1, p=None, verbose=True,log=Log(),**args):
277
+
261
278
  if verbose:log.write("Start to randomly select variants from the sumstats...")
262
279
  if p is None:
263
280
  if verbose:log.write(" -Number of variants selected from the sumstats:",n)
@@ -301,4 +318,75 @@ def _get_flanking(sumstats, snpid, windowsizekb=500, verbose=True,log=Log(),**ar
301
318
  log.write("Finished extracting variants in the flanking regions.",verbose=verbose)
302
319
 
303
320
  return flanking
304
-
321
+
322
+ def _get_flanking_by_id(sumstats, snpid, windowsizekb=500, verbose=True,log=Log(),**args):
323
+
324
+ log.write("Start to extract variants in the flanking regions using rsID or SNPID...",verbose=verbose)
325
+ log.write(" - Central variants: {}".format(snpid))
326
+ log.write(" - Flanking windowsize in kb: {}".format(windowsizekb))
327
+
328
+ if type(snpid) == str:
329
+ snpid = [snpid]
330
+
331
+ if "rsID" in sumstats.columns and "SNPID" not in sumstats.columns:
332
+ is_specified = sumstats["rsID"].isin(snpid)
333
+ elif "rsID" not in sumstats.columns and "SNPID" in sumstats.columns:
334
+ is_specified = sumstats["SNPID"].isin(snpid)
335
+ else:
336
+ is_specified = sumstats["rsID"].isin(snpid) | sumstats["SNPID"].isin(snpid)
337
+
338
+ row = sumstats.loc[is_specified,:]
339
+
340
+ is_flanking = None
341
+ for index, row in row.iterrows():
342
+ chrom = row["CHR"]
343
+ left = row["POS"] - 1000 * windowsizekb
344
+ right = row["POS"] + 1000 * windowsizekb
345
+ is_flancking_in_this_region = (sumstats["CHR"] == chrom) & (sumstats["POS"] >= left) & (sumstats["POS"] <= right)
346
+
347
+ log.write(" - Variants in flanking region {}:{}-{} : {}".format(chrom, left, right, sum(is_flancking_in_this_region) ))
348
+
349
+ if is_flanking is None:
350
+ is_flanking = is_flancking_in_this_region
351
+ else:
352
+ is_flanking = is_flanking | is_flancking_in_this_region
353
+
354
+ flanking = sumstats.loc[is_flanking,:]
355
+
356
+ log.write(" - Extracted {} variants in the regions.".format(len(flanking)),verbose=verbose)
357
+ log.write("Finished extracting variants in the flanking regions.",verbose=verbose)
358
+
359
+ return flanking
360
+
361
+ def _get_flanking_by_chrpos(sumstats, chrpos, windowsizekb=500, verbose=True,log=Log(),**args):
362
+
363
+ log.write("Start to extract variants in the flanking regions using CHR and POS...",verbose=verbose)
364
+ log.write(" - Central positions: {}".format(chrpos))
365
+ log.write(" - Flanking windowsize in kb: {}".format(windowsizekb))
366
+
367
+ if type(chrpos) == tuple:
368
+ chrpos_to_check = [chrpos]
369
+ else:
370
+ chrpos_to_check = chrpos
371
+
372
+ is_flanking = None
373
+
374
+ for index, row in enumerate(chrpos_to_check):
375
+ chrom = row[0]
376
+ left = row[1] - 1000 * windowsizekb
377
+ right = row[1] + 1000 * windowsizekb
378
+ is_flancking_in_this_region = (sumstats["CHR"] == chrom) & (sumstats["POS"] >= left) & (sumstats["POS"] <= right)
379
+
380
+ log.write(" - Variants in flanking region {}:{}-{} : {}".format(chrom, left, right, sum(is_flancking_in_this_region) ))
381
+
382
+ if is_flanking is None:
383
+ is_flanking = is_flancking_in_this_region
384
+ else:
385
+ is_flanking = is_flanking | is_flancking_in_this_region
386
+
387
+ flanking = sumstats.loc[is_flanking,:]
388
+
389
+ log.write(" - Extracted {} variants in the regions.".format(len(flanking)),verbose=verbose)
390
+ log.write("Finished extracting variants in the flanking regions.",verbose=verbose)
391
+
392
+ return flanking
@@ -6,7 +6,7 @@ import gc
6
6
 
7
7
  def getsignaldensity(insumstats, id="SNPID", chrom="CHR",pos="POS", bwindowsizekb=100,log=Log(),verbose=True):
8
8
  if verbose:log.write("Start to calculate signal DENSITY...")
9
- sumstats = insumstats.loc[:,[id,chrom,pos]].copy()
9
+ sumstats = insumstats[[id,chrom,pos]].copy()
10
10
  if verbose:log.write(" -Calculating DENSITY with windowsize of ",bwindowsizekb ," kb")
11
11
  #stack=[]
12
12
 
@@ -81,7 +81,7 @@ def assigndensity(insumstats,
81
81
  large_number = int(large_number * 10)
82
82
  else:
83
83
  break
84
- sumstats = insumstats.loc[:,[id,chrom,pos]].copy()
84
+ sumstats = insumstats[[id,chrom,pos]].copy()
85
85
  sumstats["DENSITY"] = 0
86
86
  sumstats["TCHR+POS"] = sumstats[chrom]*large_number + sumstats[pos]
87
87
  sig_sumstats["TCHR+POS"] = sig_sumstats[chrom]*large_number + sig_sumstats[pos]
@@ -13,8 +13,9 @@ from gwaslab.bd_common_data import get_chr_to_NC
13
13
  from gwaslab.bd_common_data import gtf_to_protein_coding
14
14
  from gwaslab.bd_download import check_and_download
15
15
  from gwaslab.util_ex_gwascatalog import gwascatalog_trait
16
-
17
-
16
+ from gwaslab.qc_fix_sumstats import check_dataframe_shape
17
+ from gwaslab.qc_fix_sumstats import start_to
18
+ from gwaslab.qc_fix_sumstats import finished
18
19
  # getsig
19
20
  # closest_gene
20
21
  # annogene
@@ -39,8 +40,24 @@ def getsig(insumstats,
39
40
  """
40
41
  Extract the lead variants using a sliding window. P or MLOG10P will be used and converted to SCALEDP for sorting.
41
42
  """
43
+ ##start function with col checking##########################################################
44
+ _start_line = "extract lead variants"
45
+ _end_line = "extracting lead variants"
46
+ _start_cols = [chrom,pos]
47
+ _start_function = ".get_lead()"
48
+ _must_args ={}
49
+
50
+ is_enough_info = start_to(sumstats=insumstats,
51
+ log=log,
52
+ verbose=verbose,
53
+ start_line=_start_line,
54
+ end_line=_end_line,
55
+ start_cols=_start_cols,
56
+ start_function=_start_function,
57
+ **_must_args)
58
+ if is_enough_info == False: return None
59
+ ############################################################################################
42
60
 
43
- if verbose: log.write("Start to extract lead variants...")
44
61
  if verbose: log.write(" -Processing "+str(len(insumstats))+" variants...")
45
62
  if verbose: log.write(" -Significance threshold :", sig_level)
46
63
  if verbose: log.write(" -Sliding window size:", str(windowsizekb) ," kb")
@@ -155,11 +172,9 @@ def getsig(insumstats,
155
172
  source=source,
156
173
  verbose=verbose)
157
174
 
158
- # Finishing
159
- if verbose: log.write("Finished extracting lead variants successfully!")
160
175
  # drop internal id
161
176
  output = output.drop("__ID",axis=1)
162
- gc.collect()
177
+ finished(log,verbose,_end_line)
163
178
  return output.copy()
164
179
 
165
180
 
@@ -329,7 +344,24 @@ def getnovel(insumstats,
329
344
  gwascatalog_source="NCBI",
330
345
  output_known=False,
331
346
  verbose=True):
332
- if verbose: log.write("Start to check if lead variants are known...")
347
+ ##start function with col checking##########################################################
348
+ _start_line = "check if lead variants are known"
349
+ _end_line = "checking if lead variants are known"
350
+ _start_cols = [chrom,pos]
351
+ _start_function = ".get_novel()"
352
+ _must_args ={}
353
+
354
+ is_enough_info = start_to(sumstats=insumstats,
355
+ log=log,
356
+ verbose=verbose,
357
+ start_line=_start_line,
358
+ end_line=_end_line,
359
+ start_cols=_start_cols,
360
+ start_function=_start_function,
361
+ **_must_args)
362
+ if is_enough_info == False: return None
363
+ ############################################################################################
364
+
333
365
  allsig = getsig(insumstats=insumstats,
334
366
  id=id,chrom=chrom,pos=pos,p=p,use_p=use_p,windowsizekb=windowsizekb,sig_level=sig_level,log=log,
335
367
  xymt=xymt,anno=anno,build=build, source=source,verbose=verbose)
@@ -438,8 +470,8 @@ def getnovel(insumstats,
438
470
 
439
471
  if verbose: log.write(" -Identified ",len(allsig)-sum(allsig["NOVEL"])," known vairants in current sumstats...")
440
472
  if verbose: log.write(" -Identified ",sum(allsig["NOVEL"])," novel vairants in current sumstats...")
441
- if verbose: log.write("Finished checking known or novel successfully!")
442
- gc.collect()
473
+
474
+ finished(log,verbose,_end_line)
443
475
 
444
476
  # how to return
445
477
  if only_novel is True: