gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/bd_common_data.py +6 -3
- gwaslab/bd_download.py +9 -9
- gwaslab/bd_get_hapmap3.py +43 -9
- gwaslab/data/formatbook.json +722 -721
- gwaslab/g_Log.py +22 -5
- gwaslab/g_Sumstats.py +110 -163
- gwaslab/g_SumstatsPair.py +76 -25
- gwaslab/g_SumstatsT.py +2 -2
- gwaslab/g_Sumstats_summary.py +3 -3
- gwaslab/g_version.py +10 -10
- gwaslab/hm_casting.py +36 -17
- gwaslab/hm_harmonize_sumstats.py +354 -221
- gwaslab/hm_rsid_to_chrpos.py +1 -1
- gwaslab/io_preformat_input.py +49 -43
- gwaslab/io_read_ldsc.py +49 -1
- gwaslab/io_to_formats.py +428 -295
- gwaslab/ldsc_irwls.py +198 -0
- gwaslab/ldsc_jackknife.py +514 -0
- gwaslab/ldsc_ldscore.py +417 -0
- gwaslab/ldsc_parse.py +294 -0
- gwaslab/ldsc_regressions.py +747 -0
- gwaslab/ldsc_sumstats.py +629 -0
- gwaslab/qc_check_datatype.py +3 -3
- gwaslab/qc_fix_sumstats.py +891 -778
- gwaslab/util_ex_calculate_ldmatrix.py +31 -13
- gwaslab/util_ex_gwascatalog.py +25 -25
- gwaslab/util_ex_ldproxyfinder.py +10 -10
- gwaslab/util_ex_ldsc.py +189 -0
- gwaslab/util_ex_process_ref.py +3 -3
- gwaslab/util_ex_run_coloc.py +26 -4
- gwaslab/util_in_calculate_gc.py +6 -6
- gwaslab/util_in_calculate_power.py +42 -43
- gwaslab/util_in_convert_h2.py +8 -8
- gwaslab/util_in_fill_data.py +30 -30
- gwaslab/util_in_filter_value.py +201 -74
- gwaslab/util_in_get_density.py +10 -10
- gwaslab/util_in_get_sig.py +445 -71
- gwaslab/viz_aux_annotate_plot.py +12 -12
- gwaslab/viz_aux_quickfix.py +42 -37
- gwaslab/viz_aux_reposition_text.py +10 -7
- gwaslab/viz_aux_save_figure.py +18 -8
- gwaslab/viz_plot_compare_af.py +32 -33
- gwaslab/viz_plot_compare_effect.py +63 -71
- gwaslab/viz_plot_miamiplot2.py +34 -26
- gwaslab/viz_plot_mqqplot.py +126 -75
- gwaslab/viz_plot_qqplot.py +11 -8
- gwaslab/viz_plot_regionalplot.py +36 -33
- gwaslab/viz_plot_rg_heatmap.py +28 -26
- gwaslab/viz_plot_stackedregional.py +40 -21
- gwaslab/viz_plot_trumpetplot.py +65 -61
- gwaslab-3.4.39.dist-info/LICENSE +674 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
- gwaslab-3.4.39.dist-info/RECORD +80 -0
- gwaslab-3.4.37.dist-info/RECORD +0 -72
- /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
gwaslab/bd_common_data.py
CHANGED
|
@@ -280,17 +280,20 @@ def gtf_to_protein_coding(gtfpath,log=Log(),verbose=True):
|
|
|
280
280
|
protein_coding_path = gtfpath[:-6]+"protein_coding.gtf.gz"
|
|
281
281
|
# if not existing, extract protein coding records and output to a new file
|
|
282
282
|
if not path.isfile(protein_coding_path):
|
|
283
|
+
|
|
283
284
|
# get gene list
|
|
284
|
-
|
|
285
|
+
log.write(" - Extracting protein_coding genes from {}".format(gtfpath),verbose=verbose)
|
|
285
286
|
gtf = read_gtf(gtfpath,usecols=["feature","gene_biotype","gene_id","gene_name"])
|
|
286
287
|
gene_list = gtf.loc[(gtf["feature"]=="gene") & (gtf["gene_biotype"]=="protein_coding"),"gene_id"].values
|
|
287
|
-
|
|
288
|
+
log.write(" - Loaded {} protein_coding genes.".format(len(gene_list)),verbose=verbose)
|
|
289
|
+
|
|
288
290
|
# extract entry using csv
|
|
289
291
|
gtf_raw = pd.read_csv(gtfpath,sep="\t",header=None,comment="#",dtype="string")
|
|
290
292
|
gtf_raw["_gene_id"] = gtf_raw[8].str.extract(r'gene_id "([\w\.-]+)"')
|
|
291
293
|
gtf_raw = gtf_raw.loc[ gtf_raw["_gene_id"].isin(gene_list) ,:]
|
|
292
294
|
gtf_raw = gtf_raw.drop("_gene_id",axis=1)
|
|
293
|
-
|
|
295
|
+
|
|
296
|
+
log.write(" - Extracted records are saved to : {} ".format(protein_coding_path),verbose=verbose)
|
|
294
297
|
gtf_raw.to_csv(protein_coding_path, header=None, index=None, sep="\t")
|
|
295
298
|
|
|
296
299
|
return protein_coding_path
|
gwaslab/bd_download.py
CHANGED
|
@@ -106,7 +106,7 @@ def check_available_ref(log=Log(),verbose=True):
|
|
|
106
106
|
Check available reference files for gwaslab.
|
|
107
107
|
Return a dictionary of available reference files.
|
|
108
108
|
'''
|
|
109
|
-
|
|
109
|
+
log.write("Start to check available reference files...", verbose=verbose)
|
|
110
110
|
#ref_path = path.dirname(__file__) + '/data/reference.json'
|
|
111
111
|
ref_path = options.paths["reference"]
|
|
112
112
|
if not path.exists(ref_path):
|
|
@@ -115,11 +115,11 @@ def check_available_ref(log=Log(),verbose=True):
|
|
|
115
115
|
dicts = json.load(open(ref_path))
|
|
116
116
|
if dicts is not None:
|
|
117
117
|
for key,value in dicts.items():
|
|
118
|
-
|
|
118
|
+
log.write(" -",key," : ",value, verbose=verbose)
|
|
119
119
|
return dicts
|
|
120
120
|
else:
|
|
121
|
-
|
|
122
|
-
|
|
121
|
+
log.write(" -No available reference files.", verbose=verbose)
|
|
122
|
+
log.write("Finished checking available reference files...", verbose=verbose)
|
|
123
123
|
return {}
|
|
124
124
|
|
|
125
125
|
def update_available_ref(log=Log()):
|
|
@@ -167,8 +167,8 @@ def get_path(name,log=Log(),verbose=True):
|
|
|
167
167
|
#config_path = path.dirname(__file__) + '/data/config.json'
|
|
168
168
|
config_path = options.paths["config"]
|
|
169
169
|
if not path.exists(config_path):
|
|
170
|
-
|
|
171
|
-
|
|
170
|
+
log.write("Config file not exists...", verbose=verbose)
|
|
171
|
+
log.write("Created new config file...", verbose=verbose)
|
|
172
172
|
initiate_config()
|
|
173
173
|
else:
|
|
174
174
|
try:
|
|
@@ -176,9 +176,9 @@ def get_path(name,log=Log(),verbose=True):
|
|
|
176
176
|
if path.exists(dicts[name]):
|
|
177
177
|
return dicts[name]
|
|
178
178
|
else:
|
|
179
|
-
|
|
179
|
+
log.write("File not exist.", verbose=verbose)
|
|
180
180
|
except:
|
|
181
|
-
|
|
181
|
+
log.write("No records in config file. Please download first.", verbose=verbose)
|
|
182
182
|
return False
|
|
183
183
|
|
|
184
184
|
##################################################################################
|
|
@@ -277,7 +277,7 @@ def check_file_integrity(local_path, md5sum,log):
|
|
|
277
277
|
log.write(" -MD5 verified.")
|
|
278
278
|
return 1
|
|
279
279
|
else:
|
|
280
|
-
log.
|
|
280
|
+
log.warning("-MD5 VERIFICATION FAILED!")
|
|
281
281
|
return 0
|
|
282
282
|
|
|
283
283
|
def remove_file(name,log=Log()):
|
gwaslab/bd_get_hapmap3.py
CHANGED
|
@@ -1,6 +1,10 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
2
|
from os import path
|
|
3
3
|
from gwaslab.g_Log import Log
|
|
4
|
+
from gwaslab.qc_fix_sumstats import start_to
|
|
5
|
+
from gwaslab.qc_fix_sumstats import skipped
|
|
6
|
+
from gwaslab.qc_fix_sumstats import finished
|
|
7
|
+
|
|
4
8
|
#A unique identifier (e.g., the rs number)
|
|
5
9
|
#Allele 1 (effect allele)
|
|
6
10
|
#Allele 2 (non-effect allele)
|
|
@@ -8,30 +12,60 @@ from gwaslab.g_Log import Log
|
|
|
8
12
|
#A P-value
|
|
9
13
|
#A signed summary statistic (beta, OR, log odds, Z-score, etc)
|
|
10
14
|
|
|
11
|
-
def gethapmap3(sumstats,rsid="rsID",chrom="CHR", pos="POS", ea="EA", nea="NEA",build="19", verbose=True,log=Log()):
|
|
12
|
-
|
|
15
|
+
def gethapmap3(sumstats,rsid="rsID",chrom="CHR", pos="POS", ea="EA", nea="NEA",build="19", verbose=True, match_allele= True, log=Log()):
|
|
16
|
+
##start function with col checking##########################################################
|
|
17
|
+
_start_line = "extract HapMap3 SNPs"
|
|
18
|
+
_end_line = "extracting HapMap3 SNPs"
|
|
19
|
+
_start_cols =[]
|
|
20
|
+
_start_function = ".gethapmap3"
|
|
21
|
+
_must_args ={}
|
|
22
|
+
|
|
23
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
24
|
+
log=log,
|
|
25
|
+
verbose=verbose,
|
|
26
|
+
start_line=_start_line,
|
|
27
|
+
end_line=_end_line,
|
|
28
|
+
start_cols=_start_cols,
|
|
29
|
+
start_function=_start_function,
|
|
30
|
+
**_must_args)
|
|
31
|
+
if is_enough_info == False: return None
|
|
13
32
|
|
|
33
|
+
############################################################################################
|
|
14
34
|
if build=="19":
|
|
15
35
|
data_path = path.dirname(__file__) + '/data/hapmap3_SNPs/hapmap3_db150_hg19.snplist.gz'
|
|
16
36
|
elif build=="38":
|
|
17
37
|
data_path = path.dirname(__file__) + '/data/hapmap3_SNPs/hapmap3_db151_hg38.snplist.gz'
|
|
18
38
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
39
|
+
log.write(" -Loading Hapmap3 variants from built-in datasets...", verbose=verbose)
|
|
40
|
+
|
|
41
|
+
if match_allele:
|
|
42
|
+
additional_cols= ["A1","A2"]
|
|
43
|
+
else:
|
|
44
|
+
additional_cols=[]
|
|
45
|
+
hapmap3_ref = pd.read_csv(data_path,sep="\s+",usecols=["#CHROM","POS","rsid"]+additional_cols, dtype={"#CHROM":"string","POS":"string"})
|
|
46
|
+
|
|
22
47
|
#rsid A1 A2 #CHROM POS
|
|
23
48
|
#rs3094315 G A 1 752566
|
|
49
|
+
|
|
24
50
|
if rsid in sumstats.columns:
|
|
25
51
|
output = sumstats.loc[sumstats[rsid].isin(hapmap3_ref["rsid"].values),:].copy()
|
|
26
52
|
return output
|
|
53
|
+
|
|
27
54
|
elif chrom in sumstats.columns and pos in sumstats.columns:
|
|
28
|
-
|
|
55
|
+
log.write(" -Since rsID not in sumstats, CHR:POS( build "+build+") will be used for matching...", verbose=verbose)
|
|
29
56
|
sumstats ["chr:pos"] = sumstats[chrom].astype("string")+":"+sumstats[pos].astype("string")
|
|
30
57
|
hapmap3_ref["chr:pos"] = hapmap3_ref["#CHROM"]+":"+hapmap3_ref["POS"]
|
|
31
58
|
hapmap3_ref = hapmap3_ref.rename(columns={"rsid":"rsID"})
|
|
32
|
-
output = pd.merge(sumstats,hapmap3_ref.loc[:,["chr:pos","rsID"]],left_on="chr:pos",right_on="chr:pos",how="inner",suffixes=('', '_hapmap3')).copy()
|
|
33
|
-
|
|
34
|
-
|
|
59
|
+
output = pd.merge(sumstats,hapmap3_ref.loc[:,["chr:pos","rsID"]+additional_cols],left_on="chr:pos",right_on="chr:pos",how="inner",suffixes=('', '_hapmap3')).copy()
|
|
60
|
+
if match_allele:
|
|
61
|
+
log.write(" -Checking if alleles are same...")
|
|
62
|
+
is_matched = ((output[ea].astype("string") == output["A1"]) & (output[nea].astype("string") == output["A2"])) \
|
|
63
|
+
| ((output[ea].astype("string") == output["A2"]) & (output[nea].astype("string") == output["A1"]))
|
|
64
|
+
log.write(" -Variants with macthed alleles: {}".format(sum(is_matched)))
|
|
65
|
+
output = output.loc[is_matched,:]
|
|
66
|
+
output = output.drop(columns=["chr:pos"]+additional_cols)
|
|
67
|
+
log.write(" -Raw input contains "+str(len(output))+" Hapmap3 variants based on CHR:POS...", verbose=verbose)
|
|
68
|
+
finished(log=log,verbose=verbose,end_line=_end_line)
|
|
35
69
|
return output
|
|
36
70
|
else:
|
|
37
71
|
raise ValueError("Not enough information to match SNPs. Please check your sumstats...")
|