gwaslab 3.4.38__py3-none-any.whl → 3.4.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/bd_common_data.py +6 -3
- gwaslab/bd_download.py +9 -9
- gwaslab/bd_get_hapmap3.py +43 -9
- gwaslab/g_Log.py +14 -5
- gwaslab/g_Sumstats.py +86 -18
- gwaslab/g_SumstatsPair.py +70 -23
- gwaslab/g_SumstatsT.py +2 -2
- gwaslab/g_version.py +10 -10
- gwaslab/hm_casting.py +9 -4
- gwaslab/hm_harmonize_sumstats.py +88 -83
- gwaslab/io_preformat_input.py +14 -14
- gwaslab/io_read_ldsc.py +49 -1
- gwaslab/ldsc_irwls.py +198 -0
- gwaslab/ldsc_jackknife.py +514 -0
- gwaslab/ldsc_ldscore.py +417 -0
- gwaslab/ldsc_parse.py +294 -0
- gwaslab/ldsc_regressions.py +747 -0
- gwaslab/ldsc_sumstats.py +629 -0
- gwaslab/qc_check_datatype.py +1 -1
- gwaslab/qc_fix_sumstats.py +163 -161
- gwaslab/util_ex_calculate_ldmatrix.py +2 -2
- gwaslab/util_ex_gwascatalog.py +24 -24
- gwaslab/util_ex_ldproxyfinder.py +9 -9
- gwaslab/util_ex_ldsc.py +189 -0
- gwaslab/util_in_calculate_gc.py +6 -6
- gwaslab/util_in_calculate_power.py +42 -43
- gwaslab/util_in_convert_h2.py +8 -8
- gwaslab/util_in_fill_data.py +28 -28
- gwaslab/util_in_filter_value.py +91 -52
- gwaslab/util_in_get_density.py +8 -8
- gwaslab/util_in_get_sig.py +407 -65
- gwaslab/viz_aux_annotate_plot.py +12 -12
- gwaslab/viz_aux_quickfix.py +18 -18
- gwaslab/viz_aux_reposition_text.py +3 -3
- gwaslab/viz_aux_save_figure.py +14 -5
- gwaslab/viz_plot_compare_af.py +29 -30
- gwaslab/viz_plot_compare_effect.py +63 -71
- gwaslab/viz_plot_miamiplot2.py +6 -6
- gwaslab/viz_plot_mqqplot.py +17 -3
- gwaslab/viz_plot_qqplot.py +1 -1
- gwaslab/viz_plot_regionalplot.py +33 -32
- gwaslab/viz_plot_rg_heatmap.py +28 -26
- gwaslab/viz_plot_stackedregional.py +40 -21
- gwaslab/viz_plot_trumpetplot.py +50 -55
- gwaslab-3.4.39.dist-info/LICENSE +674 -0
- {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/METADATA +4 -3
- gwaslab-3.4.39.dist-info/RECORD +80 -0
- gwaslab-3.4.38.dist-info/RECORD +0 -72
- /gwaslab-3.4.38.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
gwaslab/bd_common_data.py
CHANGED
|
@@ -280,17 +280,20 @@ def gtf_to_protein_coding(gtfpath,log=Log(),verbose=True):
|
|
|
280
280
|
protein_coding_path = gtfpath[:-6]+"protein_coding.gtf.gz"
|
|
281
281
|
# if not existing, extract protein coding records and output to a new file
|
|
282
282
|
if not path.isfile(protein_coding_path):
|
|
283
|
+
|
|
283
284
|
# get gene list
|
|
284
|
-
|
|
285
|
+
log.write(" - Extracting protein_coding genes from {}".format(gtfpath),verbose=verbose)
|
|
285
286
|
gtf = read_gtf(gtfpath,usecols=["feature","gene_biotype","gene_id","gene_name"])
|
|
286
287
|
gene_list = gtf.loc[(gtf["feature"]=="gene") & (gtf["gene_biotype"]=="protein_coding"),"gene_id"].values
|
|
287
|
-
|
|
288
|
+
log.write(" - Loaded {} protein_coding genes.".format(len(gene_list)),verbose=verbose)
|
|
289
|
+
|
|
288
290
|
# extract entry using csv
|
|
289
291
|
gtf_raw = pd.read_csv(gtfpath,sep="\t",header=None,comment="#",dtype="string")
|
|
290
292
|
gtf_raw["_gene_id"] = gtf_raw[8].str.extract(r'gene_id "([\w\.-]+)"')
|
|
291
293
|
gtf_raw = gtf_raw.loc[ gtf_raw["_gene_id"].isin(gene_list) ,:]
|
|
292
294
|
gtf_raw = gtf_raw.drop("_gene_id",axis=1)
|
|
293
|
-
|
|
295
|
+
|
|
296
|
+
log.write(" - Extracted records are saved to : {} ".format(protein_coding_path),verbose=verbose)
|
|
294
297
|
gtf_raw.to_csv(protein_coding_path, header=None, index=None, sep="\t")
|
|
295
298
|
|
|
296
299
|
return protein_coding_path
|
gwaslab/bd_download.py
CHANGED
|
@@ -106,7 +106,7 @@ def check_available_ref(log=Log(),verbose=True):
|
|
|
106
106
|
Check available reference files for gwaslab.
|
|
107
107
|
Return a dictionary of available reference files.
|
|
108
108
|
'''
|
|
109
|
-
|
|
109
|
+
log.write("Start to check available reference files...", verbose=verbose)
|
|
110
110
|
#ref_path = path.dirname(__file__) + '/data/reference.json'
|
|
111
111
|
ref_path = options.paths["reference"]
|
|
112
112
|
if not path.exists(ref_path):
|
|
@@ -115,11 +115,11 @@ def check_available_ref(log=Log(),verbose=True):
|
|
|
115
115
|
dicts = json.load(open(ref_path))
|
|
116
116
|
if dicts is not None:
|
|
117
117
|
for key,value in dicts.items():
|
|
118
|
-
|
|
118
|
+
log.write(" -",key," : ",value, verbose=verbose)
|
|
119
119
|
return dicts
|
|
120
120
|
else:
|
|
121
|
-
|
|
122
|
-
|
|
121
|
+
log.write(" -No available reference files.", verbose=verbose)
|
|
122
|
+
log.write("Finished checking available reference files...", verbose=verbose)
|
|
123
123
|
return {}
|
|
124
124
|
|
|
125
125
|
def update_available_ref(log=Log()):
|
|
@@ -167,8 +167,8 @@ def get_path(name,log=Log(),verbose=True):
|
|
|
167
167
|
#config_path = path.dirname(__file__) + '/data/config.json'
|
|
168
168
|
config_path = options.paths["config"]
|
|
169
169
|
if not path.exists(config_path):
|
|
170
|
-
|
|
171
|
-
|
|
170
|
+
log.write("Config file not exists...", verbose=verbose)
|
|
171
|
+
log.write("Created new config file...", verbose=verbose)
|
|
172
172
|
initiate_config()
|
|
173
173
|
else:
|
|
174
174
|
try:
|
|
@@ -176,9 +176,9 @@ def get_path(name,log=Log(),verbose=True):
|
|
|
176
176
|
if path.exists(dicts[name]):
|
|
177
177
|
return dicts[name]
|
|
178
178
|
else:
|
|
179
|
-
|
|
179
|
+
log.write("File not exist.", verbose=verbose)
|
|
180
180
|
except:
|
|
181
|
-
|
|
181
|
+
log.write("No records in config file. Please download first.", verbose=verbose)
|
|
182
182
|
return False
|
|
183
183
|
|
|
184
184
|
##################################################################################
|
|
@@ -277,7 +277,7 @@ def check_file_integrity(local_path, md5sum,log):
|
|
|
277
277
|
log.write(" -MD5 verified.")
|
|
278
278
|
return 1
|
|
279
279
|
else:
|
|
280
|
-
log.
|
|
280
|
+
log.warning("-MD5 VERIFICATION FAILED!")
|
|
281
281
|
return 0
|
|
282
282
|
|
|
283
283
|
def remove_file(name,log=Log()):
|
gwaslab/bd_get_hapmap3.py
CHANGED
|
@@ -1,6 +1,10 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
2
|
from os import path
|
|
3
3
|
from gwaslab.g_Log import Log
|
|
4
|
+
from gwaslab.qc_fix_sumstats import start_to
|
|
5
|
+
from gwaslab.qc_fix_sumstats import skipped
|
|
6
|
+
from gwaslab.qc_fix_sumstats import finished
|
|
7
|
+
|
|
4
8
|
#A unique identifier (e.g., the rs number)
|
|
5
9
|
#Allele 1 (effect allele)
|
|
6
10
|
#Allele 2 (non-effect allele)
|
|
@@ -8,30 +12,60 @@ from gwaslab.g_Log import Log
|
|
|
8
12
|
#A P-value
|
|
9
13
|
#A signed summary statistic (beta, OR, log odds, Z-score, etc)
|
|
10
14
|
|
|
11
|
-
def gethapmap3(sumstats,rsid="rsID",chrom="CHR", pos="POS", ea="EA", nea="NEA",build="19", verbose=True,log=Log()):
|
|
12
|
-
|
|
15
|
+
def gethapmap3(sumstats,rsid="rsID",chrom="CHR", pos="POS", ea="EA", nea="NEA",build="19", verbose=True, match_allele= True, log=Log()):
|
|
16
|
+
##start function with col checking##########################################################
|
|
17
|
+
_start_line = "extract HapMap3 SNPs"
|
|
18
|
+
_end_line = "extracting HapMap3 SNPs"
|
|
19
|
+
_start_cols =[]
|
|
20
|
+
_start_function = ".gethapmap3"
|
|
21
|
+
_must_args ={}
|
|
22
|
+
|
|
23
|
+
is_enough_info = start_to(sumstats=sumstats,
|
|
24
|
+
log=log,
|
|
25
|
+
verbose=verbose,
|
|
26
|
+
start_line=_start_line,
|
|
27
|
+
end_line=_end_line,
|
|
28
|
+
start_cols=_start_cols,
|
|
29
|
+
start_function=_start_function,
|
|
30
|
+
**_must_args)
|
|
31
|
+
if is_enough_info == False: return None
|
|
13
32
|
|
|
33
|
+
############################################################################################
|
|
14
34
|
if build=="19":
|
|
15
35
|
data_path = path.dirname(__file__) + '/data/hapmap3_SNPs/hapmap3_db150_hg19.snplist.gz'
|
|
16
36
|
elif build=="38":
|
|
17
37
|
data_path = path.dirname(__file__) + '/data/hapmap3_SNPs/hapmap3_db151_hg38.snplist.gz'
|
|
18
38
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
39
|
+
log.write(" -Loading Hapmap3 variants from built-in datasets...", verbose=verbose)
|
|
40
|
+
|
|
41
|
+
if match_allele:
|
|
42
|
+
additional_cols= ["A1","A2"]
|
|
43
|
+
else:
|
|
44
|
+
additional_cols=[]
|
|
45
|
+
hapmap3_ref = pd.read_csv(data_path,sep="\s+",usecols=["#CHROM","POS","rsid"]+additional_cols, dtype={"#CHROM":"string","POS":"string"})
|
|
46
|
+
|
|
22
47
|
#rsid A1 A2 #CHROM POS
|
|
23
48
|
#rs3094315 G A 1 752566
|
|
49
|
+
|
|
24
50
|
if rsid in sumstats.columns:
|
|
25
51
|
output = sumstats.loc[sumstats[rsid].isin(hapmap3_ref["rsid"].values),:].copy()
|
|
26
52
|
return output
|
|
53
|
+
|
|
27
54
|
elif chrom in sumstats.columns and pos in sumstats.columns:
|
|
28
|
-
|
|
55
|
+
log.write(" -Since rsID not in sumstats, CHR:POS( build "+build+") will be used for matching...", verbose=verbose)
|
|
29
56
|
sumstats ["chr:pos"] = sumstats[chrom].astype("string")+":"+sumstats[pos].astype("string")
|
|
30
57
|
hapmap3_ref["chr:pos"] = hapmap3_ref["#CHROM"]+":"+hapmap3_ref["POS"]
|
|
31
58
|
hapmap3_ref = hapmap3_ref.rename(columns={"rsid":"rsID"})
|
|
32
|
-
output = pd.merge(sumstats,hapmap3_ref.loc[:,["chr:pos","rsID"]],left_on="chr:pos",right_on="chr:pos",how="inner",suffixes=('', '_hapmap3')).copy()
|
|
33
|
-
|
|
34
|
-
|
|
59
|
+
output = pd.merge(sumstats,hapmap3_ref.loc[:,["chr:pos","rsID"]+additional_cols],left_on="chr:pos",right_on="chr:pos",how="inner",suffixes=('', '_hapmap3')).copy()
|
|
60
|
+
if match_allele:
|
|
61
|
+
log.write(" -Checking if alleles are same...")
|
|
62
|
+
is_matched = ((output[ea].astype("string") == output["A1"]) & (output[nea].astype("string") == output["A2"])) \
|
|
63
|
+
| ((output[ea].astype("string") == output["A2"]) & (output[nea].astype("string") == output["A1"]))
|
|
64
|
+
log.write(" -Variants with macthed alleles: {}".format(sum(is_matched)))
|
|
65
|
+
output = output.loc[is_matched,:]
|
|
66
|
+
output = output.drop(columns=["chr:pos"]+additional_cols)
|
|
67
|
+
log.write(" -Raw input contains "+str(len(output))+" Hapmap3 variants based on CHR:POS...", verbose=verbose)
|
|
68
|
+
finished(log=log,verbose=verbose,end_line=_end_line)
|
|
35
69
|
return output
|
|
36
70
|
else:
|
|
37
71
|
raise ValueError("Not enough information to match SNPs. Please check your sumstats...")
|
gwaslab/g_Log.py
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
import time
|
|
2
2
|
class Log():
|
|
3
3
|
def __init__(self):
|
|
4
|
-
self.log_text=str(time.
|
|
4
|
+
self.log_text=str(time.strftime('%Y/%m/%d %H:%M:%S'))+ " " + "Sumstats Object created."+ "\n"
|
|
5
5
|
|
|
6
6
|
def write(self,*message,end="\n",show_time=True, verbose=True):
|
|
7
7
|
if show_time is True:
|
|
8
|
-
if verbose: print(str(time.
|
|
9
|
-
self.log_text = self.log_text + str(time.
|
|
8
|
+
if verbose: print(str(time.strftime('%Y/%m/%d %H:%M:%S')),*message,end=end)
|
|
9
|
+
self.log_text = self.log_text + str(time.strftime('%Y/%m/%d %H:%M:%S')) + " " + " ".join(map(str,message)) + end
|
|
10
10
|
else:
|
|
11
11
|
if verbose: print(*message,end=end)
|
|
12
12
|
self.log_text = self.log_text + " ".join(map(str,message)) + end
|
|
@@ -21,5 +21,14 @@ class Log():
|
|
|
21
21
|
print(self.log_text)
|
|
22
22
|
def save(self,path,verbose=True):
|
|
23
23
|
with open(path,"w") as f:
|
|
24
|
-
if verbose: print(str(time.
|
|
25
|
-
f.write(self.log_text)
|
|
24
|
+
if verbose: print(str(time.strftime('%Y/%m/%d %H:%M:%S')) + " " + " -Save log file to : ", path)
|
|
25
|
+
f.write(self.log_text)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def log(self,*message,end="\n",show_time=True, verbose=True):
|
|
29
|
+
if show_time is True:
|
|
30
|
+
if verbose: print(str(time.strftime('%Y/%m/%d %H:%M:%S')),*message,end=end)
|
|
31
|
+
self.log_text = self.log_text + str(time.strftime('%Y/%m/%d %H:%M:%S')) + " " + " ".join(map(str,message)) + end
|
|
32
|
+
else:
|
|
33
|
+
if verbose: print(*message,end=end)
|
|
34
|
+
self.log_text = self.log_text + " ".join(map(str,message)) + end
|
gwaslab/g_Sumstats.py
CHANGED
|
@@ -32,6 +32,9 @@ from gwaslab.util_in_filter_value import filterout
|
|
|
32
32
|
from gwaslab.util_in_filter_value import filterin
|
|
33
33
|
from gwaslab.util_in_filter_value import filterregionin
|
|
34
34
|
from gwaslab.util_in_filter_value import filterregionout
|
|
35
|
+
from gwaslab.util_in_filter_value import _filter_indel
|
|
36
|
+
from gwaslab.util_in_filter_value import _filter_palindromic
|
|
37
|
+
from gwaslab.util_in_filter_value import _filter_snp
|
|
35
38
|
from gwaslab.util_in_filter_value import inferbuild
|
|
36
39
|
from gwaslab.util_in_filter_value import sampling
|
|
37
40
|
from gwaslab.util_in_filter_value import _get_flanking
|
|
@@ -44,6 +47,8 @@ from gwaslab.util_in_get_density import getsignaldensity
|
|
|
44
47
|
from gwaslab.util_in_get_density import assigndensity
|
|
45
48
|
from gwaslab.util_in_get_sig import annogene
|
|
46
49
|
from gwaslab.util_in_get_sig import getnovel
|
|
50
|
+
from gwaslab.util_in_get_sig import _check_cis
|
|
51
|
+
from gwaslab.util_in_get_sig import _check_novel_set
|
|
47
52
|
from gwaslab.util_in_fill_data import filldata
|
|
48
53
|
from gwaslab.bd_get_hapmap3 import gethapmap3
|
|
49
54
|
from gwaslab.bd_common_data import get_chr_list
|
|
@@ -64,6 +69,9 @@ from gwaslab.viz_plot_trumpetplot import plottrumpet
|
|
|
64
69
|
from gwaslab.viz_plot_compare_af import plotdaf
|
|
65
70
|
from gwaslab.util_ex_run_susie import _run_susie_rss
|
|
66
71
|
from gwaslab.qc_fix_sumstats import _check_data_consistency
|
|
72
|
+
from gwaslab.util_ex_ldsc import _estimate_h2_by_ldsc
|
|
73
|
+
from gwaslab.util_ex_ldsc import _estimate_rg_by_ldsc
|
|
74
|
+
from gwaslab.bd_get_hapmap3 import gethapmap3
|
|
67
75
|
import gc
|
|
68
76
|
|
|
69
77
|
#20220309
|
|
@@ -121,7 +129,8 @@ class Sumstats():
|
|
|
121
129
|
# basic attributes
|
|
122
130
|
self.data = pd.DataFrame()
|
|
123
131
|
self.log = Log()
|
|
124
|
-
|
|
132
|
+
self.ldsc_h2 = None
|
|
133
|
+
self.ldsc_rg = None
|
|
125
134
|
# meta information
|
|
126
135
|
self.meta = _init_meta()
|
|
127
136
|
self.build = build
|
|
@@ -135,7 +144,7 @@ class Sumstats():
|
|
|
135
144
|
self.pipcs = pd.DataFrame()
|
|
136
145
|
|
|
137
146
|
# print gwaslab version information
|
|
138
|
-
|
|
147
|
+
_show_version(self.log, verbose=verbose)
|
|
139
148
|
|
|
140
149
|
#preformat the data
|
|
141
150
|
self.data = preformat(
|
|
@@ -405,19 +414,16 @@ class Sumstats():
|
|
|
405
414
|
_check_data_consistency(self.data,log=self.log,**args)
|
|
406
415
|
def check_id(self,**args):
|
|
407
416
|
pass
|
|
408
|
-
|
|
409
417
|
def check_ref(self,ref_seq,**args):
|
|
410
418
|
self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
|
|
411
419
|
self.data = checkref(self.data,ref_seq,log=self.log,**args)
|
|
412
420
|
def infer_strand(self,ref_infer,**args):
|
|
413
421
|
self.meta["gwaslab"]["references"]["ref_infer"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer"] , ref_infer)
|
|
414
422
|
self.data = parallelinferstrand(self.data,ref_infer=ref_infer,log=self.log,**args)
|
|
415
|
-
|
|
416
423
|
def flip_allele_stats(self,**args):
|
|
417
424
|
self.data = flipallelestats(self.data,log=self.log,**args)
|
|
418
425
|
def normalize_allele(self,**args):
|
|
419
426
|
self.data = parallelnormalizeallele(self.data,log=self.log,**args)
|
|
420
|
-
|
|
421
427
|
def assign_rsid(self,
|
|
422
428
|
ref_rsid_tsv=None,
|
|
423
429
|
ref_rsid_vcf=None,
|
|
@@ -428,14 +434,11 @@ class Sumstats():
|
|
|
428
434
|
if ref_rsid_vcf is not None:
|
|
429
435
|
self.data = parallelizeassignrsid(self.data,path=ref_rsid_vcf,ref_mode="vcf",log=self.log,**args)
|
|
430
436
|
self.meta["gwaslab"]["references"]["ref_rsid_vcf"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_rsid_vcf"] , ref_rsid_vcf)
|
|
431
|
-
|
|
432
437
|
def rsid_to_chrpos(self,**args):
|
|
433
438
|
self.data = rsidtochrpos(self.data,log=self.log,**args)
|
|
434
|
-
|
|
435
439
|
def rsid_to_chrpos2(self,**args):
|
|
436
440
|
self.data = parallelrsidtochrpos(self.data,log=self.log,**args)
|
|
437
441
|
|
|
438
|
-
|
|
439
442
|
############################################################################################################
|
|
440
443
|
|
|
441
444
|
def sort_coordinate(self,**sort_args):
|
|
@@ -458,7 +461,6 @@ class Sumstats():
|
|
|
458
461
|
return new_Sumstats_object
|
|
459
462
|
else:
|
|
460
463
|
self.data = _get_flanking(self.data, **args)
|
|
461
|
-
|
|
462
464
|
def filter_flanking_by_chrpos(self, chrpos, inplace=False,**args):
|
|
463
465
|
if inplace is False:
|
|
464
466
|
new_Sumstats_object = copy.deepcopy(self)
|
|
@@ -466,7 +468,6 @@ class Sumstats():
|
|
|
466
468
|
return new_Sumstats_object
|
|
467
469
|
else:
|
|
468
470
|
self.data = _get_flanking_by_chrpos(self.data, chrpos,**args)
|
|
469
|
-
|
|
470
471
|
def filter_flanking_by_id(self, snpid, inplace=False,**args):
|
|
471
472
|
if inplace is False:
|
|
472
473
|
new_Sumstats_object = copy.deepcopy(self)
|
|
@@ -474,7 +475,6 @@ class Sumstats():
|
|
|
474
475
|
return new_Sumstats_object
|
|
475
476
|
else:
|
|
476
477
|
self.data = _get_flanking_by_id(self.data, snpid, **args)
|
|
477
|
-
|
|
478
478
|
def filter_value(self, expr, inplace=False, **args):
|
|
479
479
|
if inplace is False:
|
|
480
480
|
new_Sumstats_object = copy.deepcopy(self)
|
|
@@ -482,7 +482,6 @@ class Sumstats():
|
|
|
482
482
|
return new_Sumstats_object
|
|
483
483
|
else:
|
|
484
484
|
self.data = filtervalues(self.data, expr,log=self.log,**args)
|
|
485
|
-
|
|
486
485
|
def filter_out(self, inplace=False, **args):
|
|
487
486
|
if inplace is False:
|
|
488
487
|
new_Sumstats_object = copy.deepcopy(self)
|
|
@@ -490,7 +489,6 @@ class Sumstats():
|
|
|
490
489
|
return new_Sumstats_object
|
|
491
490
|
else:
|
|
492
491
|
self.data = filterout(self.data,log=self.log,**args)
|
|
493
|
-
|
|
494
492
|
def filter_in(self, inplace=False, **args):
|
|
495
493
|
if inplace is False:
|
|
496
494
|
new_Sumstats_object = copy.deepcopy(self)
|
|
@@ -512,7 +510,28 @@ class Sumstats():
|
|
|
512
510
|
return new_Sumstats_object
|
|
513
511
|
else:
|
|
514
512
|
self.data = filterregionout(self.data,log=self.log,**args)
|
|
515
|
-
|
|
513
|
+
def filter_palindromic(self, inplace=False, **args):
|
|
514
|
+
if inplace is False:
|
|
515
|
+
new_Sumstats_object = copy.deepcopy(self)
|
|
516
|
+
new_Sumstats_object.data = _filter_palindromic(new_Sumstats_object.data,log=new_Sumstats_object.log,**args)
|
|
517
|
+
return new_Sumstats_object
|
|
518
|
+
else:
|
|
519
|
+
self.data = _filter_palindromic(self.data,log=self.log,**args)
|
|
520
|
+
def filter_snp(self, inplace=False, **args):
|
|
521
|
+
if inplace is False:
|
|
522
|
+
new_Sumstats_object = copy.deepcopy(self)
|
|
523
|
+
new_Sumstats_object.data = _filter_snp(new_Sumstats_object.data,log=new_Sumstats_object.log,**args)
|
|
524
|
+
return new_Sumstats_object
|
|
525
|
+
else:
|
|
526
|
+
self.data = _filter_snp(self.data,log=self.log,**args)
|
|
527
|
+
def filter_indel(self, inplace=False, **args):
|
|
528
|
+
if inplace is False:
|
|
529
|
+
new_Sumstats_object = copy.deepcopy(self)
|
|
530
|
+
new_Sumstats_object.data = _filter_indel(new_Sumstats_object.data,log=new_Sumstats_object.log,**args)
|
|
531
|
+
return new_Sumstats_object
|
|
532
|
+
else:
|
|
533
|
+
self.data = _filter_indel(self.data,log=self.log,**args)
|
|
534
|
+
|
|
516
535
|
def random_variants(self,inplace=False,n=1,p=None,**args):
|
|
517
536
|
if inplace is True:
|
|
518
537
|
self.data = sampling(self.data,n=n,p=p,log=self.log,**args)
|
|
@@ -520,18 +539,25 @@ class Sumstats():
|
|
|
520
539
|
new_Sumstats_object = copy.deepcopy(self)
|
|
521
540
|
new_Sumstats_object.data = sampling(new_Sumstats_object.data,n=n,p=p,log=new_Sumstats_object.log,**args)
|
|
522
541
|
return new_Sumstats_object
|
|
523
|
-
|
|
542
|
+
|
|
543
|
+
def filter_hapmap3(self, inplace=False, build=None, **args ):
|
|
544
|
+
if build is None:
|
|
545
|
+
build = self.meta["gwaslab"]["genome_build"]
|
|
546
|
+
if inplace is True:
|
|
547
|
+
self.data = gethapmap3(self.data, build=build,log=self.log, **args)
|
|
548
|
+
else:
|
|
549
|
+
new_Sumstats_object = copy.deepcopy(self)
|
|
550
|
+
new_Sumstats_object.data = gethapmap3(new_Sumstats_object.data, build=build,log=self.log, **args)
|
|
551
|
+
return new_Sumstats_object
|
|
524
552
|
######################################################################
|
|
525
553
|
|
|
526
554
|
def check_af(self,ref_infer,**args):
|
|
527
555
|
self.data = parallelecheckaf(self.data,ref_infer=ref_infer,log=self.log,**args)
|
|
528
556
|
self.meta["gwaslab"]["references"]["ref_infer_daf"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer_daf"] , ref_infer)
|
|
529
|
-
|
|
530
557
|
def infer_af(self,ref_infer,**args):
|
|
531
558
|
self.data = paralleleinferaf(self.data,ref_infer=ref_infer,log=self.log,**args)
|
|
532
559
|
self.meta["gwaslab"]["references"]["ref_infer_af"] = ref_infer
|
|
533
560
|
self.meta["gwaslab"]["references"]["ref_infer_af"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer_af"] , ref_infer)
|
|
534
|
-
|
|
535
561
|
def plot_daf(self, **args):
|
|
536
562
|
fig,outliers = plotdaf(self.data, **args)
|
|
537
563
|
return fig, outliers
|
|
@@ -637,7 +663,37 @@ class Sumstats():
|
|
|
637
663
|
**args)
|
|
638
664
|
# return sumstats object
|
|
639
665
|
return output
|
|
640
|
-
|
|
666
|
+
|
|
667
|
+
def check_cis(self, **args):
|
|
668
|
+
if "SNPID" in self.data.columns:
|
|
669
|
+
id_to_use = "SNPID"
|
|
670
|
+
else:
|
|
671
|
+
id_to_use = "rsID"
|
|
672
|
+
output = _check_cis(self.data,
|
|
673
|
+
id=id_to_use,
|
|
674
|
+
chrom="CHR",
|
|
675
|
+
pos="POS",
|
|
676
|
+
p="P",
|
|
677
|
+
log=self.log,
|
|
678
|
+
**args)
|
|
679
|
+
# return sumstats object
|
|
680
|
+
return output
|
|
681
|
+
|
|
682
|
+
def check_novel_set(self, **args):
|
|
683
|
+
if "SNPID" in self.data.columns:
|
|
684
|
+
id_to_use = "SNPID"
|
|
685
|
+
else:
|
|
686
|
+
id_to_use = "rsID"
|
|
687
|
+
output = _check_novel_set(self.data,
|
|
688
|
+
id=id_to_use,
|
|
689
|
+
chrom="CHR",
|
|
690
|
+
pos="POS",
|
|
691
|
+
p="P",
|
|
692
|
+
log=self.log,
|
|
693
|
+
**args)
|
|
694
|
+
# return sumstats object
|
|
695
|
+
return output
|
|
696
|
+
|
|
641
697
|
def anno_gene(self, **args):
|
|
642
698
|
if "SNPID" in self.data.columns:
|
|
643
699
|
id_to_use = "SNPID"
|
|
@@ -673,6 +729,18 @@ class Sumstats():
|
|
|
673
729
|
output = lambdaGC(self.data[["CHR",mode]],mode=mode,**args)
|
|
674
730
|
self.meta["Genomic inflation factor"] = output
|
|
675
731
|
return output
|
|
732
|
+
|
|
733
|
+
def estimate_h2_by_ldsc(self, build=None, verbose=True, match_allele=True, **args):
|
|
734
|
+
if build is None:
|
|
735
|
+
build = self.meta["gwaslab"]["genome_build"]
|
|
736
|
+
insumstats = gethapmap3(self.data.copy(), build=build, verbose=verbose , match_allele=True )
|
|
737
|
+
self.ldsc_h2 = _estimate_h2_by_ldsc(insumstats=insumstats, log=self.log, verbose=verbose, **args)
|
|
738
|
+
|
|
739
|
+
def estimate_rg_by_ldsc(self, build=None, verbose=True, match_allele=True, **args):
|
|
740
|
+
if build is None:
|
|
741
|
+
build = self.meta["gwaslab"]["genome_build"]
|
|
742
|
+
insumstats = gethapmap3(self.data.copy(), build=build, verbose=verbose , match_allele=True )
|
|
743
|
+
self.ldsc_rg = _estimate_rg_by_ldsc(insumstats=insumstats, log=self.log, verbose=verbose, **args)
|
|
676
744
|
# external ################################################################################################
|
|
677
745
|
|
|
678
746
|
def to_finemapping(self,**args):
|
gwaslab/g_SumstatsPair.py
CHANGED
|
@@ -6,23 +6,26 @@ from gwaslab.util_in_filter_value import filtervalues
|
|
|
6
6
|
from gwaslab.g_Log import Log
|
|
7
7
|
from math import floor
|
|
8
8
|
from gwaslab.g_Sumstats import Sumstats
|
|
9
|
-
from gwaslab.hm_casting import
|
|
9
|
+
from gwaslab.hm_casting import _merge_mold_with_sumstats_by_chrpos
|
|
10
10
|
from gwaslab.hm_casting import _align_with_mold
|
|
11
11
|
from gwaslab.hm_casting import _fill_missing_columns
|
|
12
12
|
from gwaslab.hm_casting import _check_daf
|
|
13
13
|
from gwaslab.hm_casting import _assign_warning_code
|
|
14
14
|
from gwaslab.qc_fix_sumstats import flipallelestats
|
|
15
|
+
from gwaslab.qc_check_datatype import check_datatype
|
|
16
|
+
from gwaslab.qc_check_datatype import check_dataframe_shape
|
|
15
17
|
from gwaslab.hm_casting import _renaming_cols
|
|
16
18
|
from gwaslab.hm_casting import _sort_pair_cols
|
|
17
19
|
from gwaslab.util_ex_calculate_ldmatrix import tofinemapping
|
|
18
20
|
from gwaslab.util_ex_run_coloc import _run_coloc_susie
|
|
19
21
|
from gwaslab.viz_plot_miamiplot2 import plot_miami2
|
|
22
|
+
from gwaslab.viz_plot_compare_af import plotdaf
|
|
20
23
|
from gwaslab.util_ex_run_2samplemr import _run_two_sample_mr
|
|
21
24
|
from gwaslab.util_ex_run_clumping import _clump
|
|
22
25
|
from gwaslab.util_ex_ldproxyfinder import _extract_with_ld_proxy
|
|
23
26
|
|
|
24
27
|
class SumstatsPair( ):
|
|
25
|
-
def __init__(self, sumstatsObject1, sumstatsObject2, study=None, suffixes = ("_1","_2") ):
|
|
28
|
+
def __init__(self, sumstatsObject1, sumstatsObject2, study=None, suffixes = ("_1","_2") ,verbose=True ):
|
|
26
29
|
|
|
27
30
|
if not isinstance(sumstatsObject1, Sumstats):
|
|
28
31
|
raise ValueError("Please provide GWASLab Sumstats Object #1.")
|
|
@@ -34,7 +37,9 @@ class SumstatsPair( ):
|
|
|
34
37
|
self.study_name = "{}_{}".format("STUDY1", "STUDY2")
|
|
35
38
|
self.snp_info_cols = []
|
|
36
39
|
self.stats_cols =[]
|
|
37
|
-
self.
|
|
40
|
+
self.stats_cols2 =[]
|
|
41
|
+
self.other_cols =[]
|
|
42
|
+
self.other_cols2 =[]
|
|
38
43
|
self.log = Log()
|
|
39
44
|
self.suffixes = suffixes
|
|
40
45
|
self.colocalization=pd.DataFrame()
|
|
@@ -43,28 +48,53 @@ class SumstatsPair( ):
|
|
|
43
48
|
self.mr = {}
|
|
44
49
|
self.clumps ={}
|
|
45
50
|
self.ns = None
|
|
51
|
+
self.to_finemapping_file_path = ""
|
|
52
|
+
self.plink_log = ""
|
|
46
53
|
|
|
47
54
|
self.log.write( "Start to create SumstatsPair object..." )
|
|
48
55
|
|
|
56
|
+
self.log.write( " -Checking sumstats 1..." , verbose=verbose)
|
|
57
|
+
check_datatype(sumstatsObject1.data, log=self.log, verbose=verbose)
|
|
58
|
+
check_dataframe_shape(sumstats=sumstatsObject1.data,
|
|
59
|
+
log=self.log,
|
|
60
|
+
verbose=verbose)
|
|
61
|
+
|
|
62
|
+
self.log.write( " -Checking sumstats 2..." , verbose=verbose)
|
|
63
|
+
check_datatype(sumstatsObject2.data, log=self.log, verbose=verbose)
|
|
64
|
+
check_dataframe_shape(sumstats=sumstatsObject2.data,
|
|
65
|
+
log=self.log,
|
|
66
|
+
verbose=verbose)
|
|
67
|
+
|
|
49
68
|
for i in sumstatsObject1.data.columns:
|
|
50
69
|
if i in ["SNPID","rsID","CHR","POS","EA","NEA","STATUS"]:
|
|
51
70
|
self.snp_info_cols.append(i)
|
|
52
|
-
elif i in ["BETA","SE","P","MLOG10P","N","Z","OR","
|
|
71
|
+
elif i in ["BETA","SE","P","MLOG10P","N","N_CASE","N_CONTROL","Z","T","F","OR","OR_95L","OR_95U","HR","HR_95L","HR_95U","MAF","EAF","BETA_95L","BETA_95U"]:
|
|
53
72
|
self.stats_cols.append(i)
|
|
54
73
|
else:
|
|
55
74
|
self.other_cols.append(i)
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
75
|
+
for i in sumstatsObject2.data.columns:
|
|
76
|
+
if i in ["SNPID","rsID","CHR","POS","EA","NEA","STATUS"]:
|
|
77
|
+
continue
|
|
78
|
+
elif i in ["BETA","SE","P","MLOG10P","N","N_CASE","N_CONTROL","Z","T","F","OR","OR_95L","OR_95U","HR","HR_95L","HR_95U","MAF","EAF","BETA_95L","BETA_95U"]:
|
|
79
|
+
self.stats_cols2.append(i)
|
|
80
|
+
else:
|
|
81
|
+
self.other_cols2.append(i)
|
|
82
|
+
|
|
83
|
+
self.log.write( " -Variant Info columns: {}".format(self.snp_info_cols) , verbose=verbose)
|
|
84
|
+
self.log.write( " -Variant statistics columns: {}".format(self.stats_cols) , verbose=verbose)
|
|
85
|
+
self.log.write( " -Sumstats1 other columns: {}".format(self.other_cols) , verbose=verbose)
|
|
86
|
+
self.log.write( " -Sumstats2 other columns: {}".format(self.other_cols2) , verbose=verbose)
|
|
87
|
+
|
|
88
|
+
# extract only info and stats cols
|
|
89
|
+
self.data = sumstatsObject1.data
|
|
90
|
+
|
|
91
|
+
#rename with _1
|
|
59
92
|
self.data = self.data.rename(columns={"EA":"EA_1","NEA":"NEA_1"})
|
|
60
|
-
|
|
61
93
|
self.data = self.data.rename(columns={i:i + suffixes[0] for i in self.stats_cols})
|
|
94
|
+
self.data = self.data.rename(columns={i:i + suffixes[0] for i in self.other_cols})
|
|
62
95
|
|
|
63
96
|
self.data, self.sumstats1 = self._merge_two_sumstats(sumstatsObject2, suffixes=suffixes)
|
|
64
97
|
|
|
65
|
-
self.to_finemapping_file_path = ""
|
|
66
|
-
self.plink_log = ""
|
|
67
|
-
|
|
68
98
|
if "N{}".format(self.suffixes[0]) in self.data.columns and "N{}".format(self.suffixes[1]) in self.data.columns:
|
|
69
99
|
n1 = int(floor(self.data["N{}".format(self.suffixes[0])].mean()))
|
|
70
100
|
n2 = int(floor(self.data["N{}".format(self.suffixes[1])].mean()))
|
|
@@ -74,8 +104,9 @@ class SumstatsPair( ):
|
|
|
74
104
|
|
|
75
105
|
def _merge_two_sumstats(self, sumstatsObject2, threshold=0.2, verbose=True,windowsizeb=10, ref_path=None,suffixes=("_1","_2")):
|
|
76
106
|
|
|
77
|
-
|
|
78
|
-
|
|
107
|
+
# sumstats1 with suffix _1, sumstats2 with no suffix
|
|
108
|
+
molded_sumstats, sumstats1 = _merge_mold_with_sumstats_by_chrpos(mold=self.data,
|
|
109
|
+
sumstats=sumstatsObject2.data,
|
|
79
110
|
log=self.log,
|
|
80
111
|
verbose=verbose,
|
|
81
112
|
suffixes=(suffixes[0],""),
|
|
@@ -83,16 +114,21 @@ class SumstatsPair( ):
|
|
|
83
114
|
|
|
84
115
|
molded_sumstats = _align_with_mold(molded_sumstats, log=self.log, verbose=verbose,suffixes=(suffixes[0],""))
|
|
85
116
|
|
|
117
|
+
# flip sumstats2 statistics
|
|
86
118
|
molded_sumstats = flipallelestats(molded_sumstats, log=self.log, verbose=verbose)
|
|
87
119
|
|
|
120
|
+
# drop sumstats2 EA NEA
|
|
88
121
|
molded_sumstats = molded_sumstats.drop(columns=["EA","NEA"])
|
|
122
|
+
|
|
123
|
+
# rename sumstats1 EA NEA
|
|
89
124
|
molded_sumstats = molded_sumstats.rename(columns={"EA_1":"EA","NEA_1":"NEA"})
|
|
90
125
|
|
|
91
|
-
if not
|
|
92
|
-
cols_to_fill = set(self.stats_cols).difference(set(
|
|
126
|
+
if not set(self.stats_cols2) == set(self.stats_cols):
|
|
127
|
+
cols_to_fill = set(self.stats_cols).difference(set(self.stats_cols2))
|
|
93
128
|
molded_sumstats = _fill_missing_columns(molded_sumstats, cols_to_fill, log=self.log, verbose=verbose)
|
|
94
129
|
|
|
95
|
-
|
|
130
|
+
# rename sumstast2 with _2
|
|
131
|
+
molded_sumstats = _renaming_cols(molded_sumstats, self.stats_cols + self.other_cols2, log=self.log, verbose=verbose, suffixes=suffixes)
|
|
96
132
|
|
|
97
133
|
molded_sumstats = _sort_pair_cols(molded_sumstats, verbose=verbose, log=self.log)
|
|
98
134
|
|
|
@@ -108,13 +144,7 @@ class SumstatsPair( ):
|
|
|
108
144
|
def run_coloc_susie(self,**args):
|
|
109
145
|
|
|
110
146
|
self.colocalization = _run_coloc_susie(self.to_finemapping_file_path,log=self.log,ncols=self.ns,**args)
|
|
111
|
-
|
|
112
|
-
def plot_miami(self,**args):
|
|
113
147
|
|
|
114
|
-
plot_miami2(merged_sumstats=self.data,
|
|
115
|
-
suffixes=self.suffixes,
|
|
116
|
-
**args)
|
|
117
|
-
|
|
118
148
|
def run_two_sample_mr(self, clump=False, **args):
|
|
119
149
|
exposure1 = self.study_name.split("_")[0]
|
|
120
150
|
outcome2 = self.study_name.split("_")[1]
|
|
@@ -130,4 +160,21 @@ class SumstatsPair( ):
|
|
|
130
160
|
return new_Sumstats_object
|
|
131
161
|
else:
|
|
132
162
|
self.data = filtervalues(self.data, expr,log=self.log,**args)
|
|
133
|
-
gc.collect()
|
|
163
|
+
gc.collect()
|
|
164
|
+
|
|
165
|
+
## Visualization #############################################################################################################################################
|
|
166
|
+
def plot_miami(self,**args):
|
|
167
|
+
|
|
168
|
+
plot_miami2(merged_sumstats=self.data,
|
|
169
|
+
suffixes=self.suffixes,
|
|
170
|
+
**args)
|
|
171
|
+
|
|
172
|
+
def compare_af(self, **args):
|
|
173
|
+
|
|
174
|
+
return plotdaf( self.data,
|
|
175
|
+
eaf="EAF_2",
|
|
176
|
+
raf="EAF_1",
|
|
177
|
+
xlabel="Effect Allele Frequency in Sumstats 1",
|
|
178
|
+
ylabel="Effect Allele Frequency in Sumstats 2",
|
|
179
|
+
**args)
|
|
180
|
+
|
gwaslab/g_SumstatsT.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
2
|
import numpy as np
|
|
3
3
|
from gwaslab.g_Sumstats import Sumstats
|
|
4
|
-
from gwaslab.hm_casting import
|
|
4
|
+
from gwaslab.hm_casting import _merge_mold_with_sumstats_by_chrpos
|
|
5
5
|
from gwaslab.hm_casting import _align_with_mold
|
|
6
6
|
from gwaslab.hm_casting import _fill_missing_columns
|
|
7
7
|
from gwaslab.hm_casting import _check_daf
|
|
@@ -34,7 +34,7 @@ class SumstatsT( ):
|
|
|
34
34
|
|
|
35
35
|
def cast(self, sumstatsObject, threshold=0.2, verbose=True,windowsizeb=10, ref_path=None):
|
|
36
36
|
|
|
37
|
-
molded_sumstats =
|
|
37
|
+
molded_sumstats = _merge_mold_with_sumstats_by_chrpos(self.snp_info, sumstatsObject.data, log=sumstatsObject.log, verbose=verbose, windowsizeb=windowsizeb,ref_path=ref_path)
|
|
38
38
|
|
|
39
39
|
molded_sumstats = _align_with_mold(molded_sumstats, log=sumstatsObject.log, verbose=verbose)
|
|
40
40
|
|