gwaslab 3.5.6__py3-none-any.whl → 3.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/__init__.py +2 -0
- gwaslab/bd_common_data.py +1 -0
- gwaslab/bd_get_hapmap3.py +0 -1
- gwaslab/data/formatbook.json +78 -0
- gwaslab/g_Sumstats.py +98 -24
- gwaslab/g_SumstatsMulti.py +287 -0
- gwaslab/g_SumstatsPair.py +101 -16
- gwaslab/g_Sumstats_polars.py +245 -0
- gwaslab/g_headers.py +12 -3
- gwaslab/g_meta.py +123 -47
- gwaslab/g_meta_update.py +48 -0
- gwaslab/g_vchange_status_polars.py +44 -0
- gwaslab/g_version.py +2 -2
- gwaslab/hm_casting.py +169 -110
- gwaslab/hm_casting_polars.py +202 -0
- gwaslab/hm_harmonize_sumstats.py +19 -8
- gwaslab/io_load_ld.py +529 -0
- gwaslab/io_preformat_input.py +11 -0
- gwaslab/io_preformat_input_polars.py +632 -0
- gwaslab/io_process_args.py +25 -1
- gwaslab/io_read_ldsc.py +34 -3
- gwaslab/io_read_pipcs.py +62 -6
- gwaslab/prscs_gigrnd.py +122 -0
- gwaslab/prscs_mcmc_gtb.py +136 -0
- gwaslab/prscs_parse_genet.py +98 -0
- gwaslab/qc_build.py +53 -0
- gwaslab/qc_check_datatype.py +10 -8
- gwaslab/qc_check_datatype_polars.py +128 -0
- gwaslab/qc_fix_sumstats.py +25 -23
- gwaslab/qc_fix_sumstats_polars.py +193 -0
- gwaslab/util_ex_calculate_ldmatrix.py +49 -19
- gwaslab/util_ex_gwascatalog.py +71 -28
- gwaslab/util_ex_ldsc.py +67 -21
- gwaslab/util_ex_match_ldmatrix.py +396 -0
- gwaslab/util_ex_run_2samplemr.py +0 -2
- gwaslab/util_ex_run_ccgwas.py +155 -0
- gwaslab/util_ex_run_coloc.py +1 -1
- gwaslab/util_ex_run_hyprcoloc.py +117 -0
- gwaslab/util_ex_run_mesusie.py +155 -0
- gwaslab/util_ex_run_mtag.py +92 -0
- gwaslab/util_ex_run_prscs.py +85 -0
- gwaslab/util_ex_run_susie.py +40 -9
- gwaslab/util_in_estimate_ess.py +18 -0
- gwaslab/util_in_fill_data.py +20 -1
- gwaslab/util_in_filter_value.py +10 -5
- gwaslab/util_in_get_sig.py +71 -13
- gwaslab/util_in_meta.py +168 -4
- gwaslab/util_in_meta_polars.py +174 -0
- gwaslab/viz_plot_compare_effect.py +87 -23
- gwaslab/viz_plot_credible_sets.py +55 -11
- gwaslab/viz_plot_effect.py +22 -12
- gwaslab/viz_plot_miamiplot2.py +3 -2
- gwaslab/viz_plot_mqqplot.py +165 -141
- gwaslab/viz_plot_qqplot.py +6 -6
- gwaslab/viz_plot_regional2.py +5 -13
- gwaslab/viz_plot_rg_heatmap.py +6 -1
- gwaslab/viz_plot_stackedregional.py +21 -6
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/METADATA +9 -7
- gwaslab-3.5.8.dist-info/RECORD +117 -0
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/WHEEL +1 -1
- gwaslab-3.5.6.dist-info/RECORD +0 -96
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE +0 -0
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import gc
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import polars as pl
|
|
4
|
+
import numpy as np
|
|
5
|
+
from gwaslab.g_Log import Log
|
|
6
|
+
# pandas.api.types.is_int64_dtype
|
|
7
|
+
# pandas.api.types.is_categorical_dtype
|
|
8
|
+
|
|
9
|
+
dtype_dict ={
|
|
10
|
+
"SNPID":[pl.String()],
|
|
11
|
+
"rsID": [pl.String()],
|
|
12
|
+
"CHR": [pl.Int64()],
|
|
13
|
+
"POS": [pl.Int64()],
|
|
14
|
+
"EA": [pl.String()],
|
|
15
|
+
"NEA":[pl.String()],
|
|
16
|
+
"REF":[pl.String()],
|
|
17
|
+
"ALT":[pl.String()],
|
|
18
|
+
"BETA":[pl.Float64()],
|
|
19
|
+
"BETA_95L":[pl.Float64()],
|
|
20
|
+
"BETA_95U":[pl.Float64()],
|
|
21
|
+
"SE":[pl.Float64()],
|
|
22
|
+
"N":[pl.Int64()],
|
|
23
|
+
"N_CASE":[pl.Int64()],
|
|
24
|
+
"N_CONTROL":[pl.Int64()],
|
|
25
|
+
"OR":[pl.Float64()],
|
|
26
|
+
"OR_95L":[pl.Float64()],
|
|
27
|
+
"OR_95U":[pl.Float64()],
|
|
28
|
+
"HR":[pl.Float64()],
|
|
29
|
+
"HR_95L":[pl.Float64()],
|
|
30
|
+
"HR_95U":[pl.Float64()],
|
|
31
|
+
"P":[pl.Float64()],
|
|
32
|
+
"MLOG10P":[pl.Float64()],
|
|
33
|
+
"Z":[pl.Float64()],
|
|
34
|
+
"F":[pl.Float64()],
|
|
35
|
+
"T":[pl.Float64()],
|
|
36
|
+
"TEST":[pl.String()],
|
|
37
|
+
"CHISQ":[pl.Float64()],
|
|
38
|
+
"I2":[pl.Float64()],
|
|
39
|
+
"P_HET":[pl.Float64()],
|
|
40
|
+
"SNPR2":[pl.Float64()],
|
|
41
|
+
"EAF":[pl.Float64()],
|
|
42
|
+
"NEAF":[pl.Float64()],
|
|
43
|
+
"MAF":[pl.Float64()],
|
|
44
|
+
"INFO":[pl.Float64()],
|
|
45
|
+
"DOF":[pl.Int64()],
|
|
46
|
+
"STATUS":[pl.String()],
|
|
47
|
+
"DIRECTION":[pl.String()],
|
|
48
|
+
'PIP' :[pl.Float64()],
|
|
49
|
+
'CREDIBLE_SET_INDEX':[pl.Int64()],
|
|
50
|
+
'N_SNP' :[pl.Int64()],
|
|
51
|
+
'LOCUS' :[pl.String()],
|
|
52
|
+
'STUDY' :[pl.String()],
|
|
53
|
+
'BETA_RANDOM' :[pl.Float64()],
|
|
54
|
+
'SE_RANDOM' :[pl.Float64()],
|
|
55
|
+
'Z_RANDOM' :[pl.Float64()],
|
|
56
|
+
'P_RANDOM' :[pl.Float64()]
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
def check_datatype(sumstats, verbose=True, log=Log()):
    """Log an aligned three-row table of column names, dtypes and check flags.

    Every ``(column, dtype)`` pair in ``sumstats.schema`` is checked with
    ``verify_datatype``. Each cell is right-padded to the wider of the column
    name and the dtype's string form so the three logged rows line up.
    Columns flagged ``"F"`` are additionally reported in one warning.

    Parameters
    ----------
    sumstats : object exposing a ``schema`` mapping of column name -> dtype
        (presumably a polars DataFrame -- confirm against callers).
    verbose : bool
        Forwarded to every ``log.write`` / ``log.warning`` call.
    log : Log
        Logger that receives the table and the optional warning.
    """
    name_cells = []
    dtype_cells = []
    flag_cells = []
    mismatched = []

    for column, column_dtype in sumstats.schema.items():
        dtype_text = str(column_dtype)
        flag = verify_datatype(column, column_dtype)

        # One shared width per column keeps the three rows aligned.
        cell_width = max(len(column), len(dtype_text))
        name_cells.append(column.ljust(cell_width))
        dtype_cells.append(dtype_text.ljust(cell_width))
        flag_cells.append(flag.ljust(cell_width))

        if flag == "F":
            mismatched.append(column)

    log.write(" -Column :", " ".join(name_cells), verbose=verbose)
    log.write(" -DType :", " ".join(dtype_cells), verbose=verbose)
    log.write(" -Verified:", " ".join(flag_cells), verbose=verbose)

    if len(mismatched) > 0:
        log.warning("Columns with possibly incompatible dtypes: {}".format(",".join(mismatched)), verbose=verbose)
|
|
90
|
+
|
|
91
|
+
def verify_datatype(header, dtype):
    """Classify ``dtype`` against the dtypes registered for ``header``.

    Returns
    -------
    str
        ``"NA"`` when ``header`` has no entry in ``dtype_dict``, ``"T"`` when
        ``dtype`` is one of the registered dtypes, otherwise ``"F"``.
    """
    accepted = dtype_dict.get(header)
    if accepted is None:
        # Unknown column: nothing to verify against.
        return "NA"
    return "T" if dtype in accepted else "F"
|
|
100
|
+
|
|
101
|
+
def quick_convert_datatype(sumstats, log, verbose):
    """Cast columns listed in ``dtype_dict`` to their first registered dtype.

    A column is converted only when it has an entry in ``dtype_dict`` and its
    current dtype is not among the registered ones. A failed cast is logged
    and the column is left unchanged (best effort).

    Parameters
    ----------
    sumstats : frame exposing ``columns``, ``sumstats[col].dtype`` and
        ``cast({col: dtype})`` (presumably a polars DataFrame -- confirm).
    log : logger exposing ``write``.
    verbose : bool
        Forwarded to every ``log.write`` call.

    Returns
    -------
    The frame after all successful casts have been applied.
    """
    for col in sumstats.columns:
        accepted = dtype_dict.get(col)
        if accepted is None or sumstats[col].dtype in accepted:
            continue

        datatype = accepted[0]
        log.write(" -Trying to convert datatype for {}: {} -> {}...".format(col, str(sumstats[col].dtype), datatype), end="" ,verbose=verbose)
        try:
            sumstats = sumstats.cast({col: datatype})
        except Exception:
            # Best effort: keep the original column, but do not swallow
            # KeyboardInterrupt/SystemExit as the former bare ``except`` did.
            log.write("Failed...",show_time=False,verbose=verbose)
        else:
            log.write("{}".format(datatype),show_time=False, verbose=verbose)
    return sumstats
|
|
114
|
+
|
|
115
|
+
def check_dataframe_shape(sumstats, log, verbose):
    """Log the row count, column count and estimated memory size of ``sumstats``.

    This is a best-effort diagnostic: if the size estimate or the shape
    lookup fails, a warning is logged instead of raising.

    Parameters
    ----------
    sumstats : frame exposing ``estimated_size(unit=...)``, ``__len__`` and
        ``columns``.
    log : logger exposing ``write`` and ``warning``.
    verbose : bool
        Forwarded to ``log.write``.
    """
    try:
        # The size estimate is inside the guard so a failure here is reported
        # as a warning rather than propagating to the caller.
        memory_in_mb = sumstats.estimated_size(unit="mb")
        log.write(" -Current Dataframe shape : {} x {} ; Memory usage: {:.2f} MB".format(len(sumstats),len(sumstats.columns),memory_in_mb), verbose=verbose)
    except Exception:
        log.warning("Error: cannot get Dataframe shape...")
|
|
121
|
+
|
|
122
|
+
def check_dataframe_memory_usage(sumstats, log, verbose):
    """Log the estimated memory size of ``sumstats`` in MB.

    This is a best-effort diagnostic: if the size estimate fails, a warning
    is logged instead of raising.

    Parameters
    ----------
    sumstats : frame exposing ``estimated_size(unit=...)``.
    log : logger exposing ``write`` and ``warning``.
    verbose : bool
        Forwarded to ``log.write``.
    """
    try:
        # The estimate itself is the call that can fail, so it is guarded too.
        memory_in_mb = sumstats.estimated_size(unit="mb")
        log.write(" -Current Dataframe memory usage: {:.2f} MB".format(memory_in_mb), verbose=verbose)
    except Exception:
        log.warning("Error: cannot get Memory usage...")
|
|
128
|
+
|
gwaslab/qc_fix_sumstats.py
CHANGED
|
@@ -16,6 +16,8 @@ from gwaslab.bd_common_data import get_number_to_chr
|
|
|
16
16
|
from gwaslab.bd_common_data import get_chr_list
|
|
17
17
|
from gwaslab.qc_check_datatype import check_datatype
|
|
18
18
|
from gwaslab.qc_check_datatype import check_dataframe_shape
|
|
19
|
+
from gwaslab.qc_build import _process_build
|
|
20
|
+
from gwaslab.qc_build import _set_build
|
|
19
21
|
from gwaslab.g_version import _get_version
|
|
20
22
|
from gwaslab.util_in_fill_data import _convert_betase_to_mlog10p
|
|
21
23
|
from gwaslab.util_in_fill_data import _convert_betase_to_p
|
|
@@ -41,29 +43,29 @@ from gwaslab.bd_common_data import get_chain
|
|
|
41
43
|
|
|
42
44
|
###############################################################################################################
|
|
43
45
|
# 20220514
|
|
44
|
-
def _process_build(build,log,verbose):
|
|
45
|
-
if str(build).lower() in ["hg19","19","37","b37","grch37"]:
|
|
46
|
-
log.write(" -Genomic coordinates are based on GRCh37/hg19...", verbose=verbose)
|
|
47
|
-
final_build = "19"
|
|
48
|
-
elif str(build).lower() in ["hg18","18","36","b36","grch36"]:
|
|
49
|
-
log.write(" -Genomic coordinates are based on GRCh36/hg18...", verbose=verbose)
|
|
50
|
-
final_build = "18"
|
|
51
|
-
elif str(build).lower() in ["hg38","38","b38","grch38"]:
|
|
52
|
-
log.write(" -Genomic coordinates are based on GRCh38/hg38...", verbose=verbose)
|
|
53
|
-
final_build = "38"
|
|
54
|
-
elif str(build).lower() in ["t2t","hs1","chm13","13"]:
|
|
55
|
-
log.write(" -Genomic coordinates are based on T2T-CHM13...", verbose=verbose)
|
|
56
|
-
final_build = "13"
|
|
57
|
-
else:
|
|
58
|
-
log.warning("Version of genomic coordinates is unknown...", verbose=verbose)
|
|
59
|
-
final_build = "99"
|
|
60
|
-
return final_build
|
|
61
|
-
|
|
62
|
-
def _set_build(sumstats, build="99", status="STATUS",verbose=True,log=Log()):
|
|
63
|
-
build = _process_build(build,log=log,verbose=verbose)
|
|
64
|
-
sumstats[status] = vchange_status(sumstats[status], 1, "139",build[0]*3)
|
|
65
|
-
sumstats[status] = vchange_status(sumstats[status], 2, "89",build[1]*3)
|
|
66
|
-
return sumstats, build
|
|
46
|
+
#def _process_build(build,log,verbose):
|
|
47
|
+
# if str(build).lower() in ["hg19","19","37","b37","grch37"]:
|
|
48
|
+
# log.write(" -Genomic coordinates are based on GRCh37/hg19...", verbose=verbose)
|
|
49
|
+
# final_build = "19"
|
|
50
|
+
# elif str(build).lower() in ["hg18","18","36","b36","grch36"]:
|
|
51
|
+
# log.write(" -Genomic coordinates are based on GRCh36/hg18...", verbose=verbose)
|
|
52
|
+
# final_build = "18"
|
|
53
|
+
# elif str(build).lower() in ["hg38","38","b38","grch38"]:
|
|
54
|
+
# log.write(" -Genomic coordinates are based on GRCh38/hg38...", verbose=verbose)
|
|
55
|
+
# final_build = "38"
|
|
56
|
+
# elif str(build).lower() in ["t2t","hs1","chm13","13"]:
|
|
57
|
+
# log.write(" -Genomic coordinates are based on T2T-CHM13...", verbose=verbose)
|
|
58
|
+
# final_build = "13"
|
|
59
|
+
# else:
|
|
60
|
+
# log.warning("Version of genomic coordinates is unknown...", verbose=verbose)
|
|
61
|
+
# final_build = "99"
|
|
62
|
+
# return final_build
|
|
63
|
+
#
|
|
64
|
+
#def _set_build(sumstats, build="99", status="STATUS",verbose=True,log=Log()):
|
|
65
|
+
# build = _process_build(build,log=log,verbose=verbose)
|
|
66
|
+
# sumstats[status] = vchange_status(sumstats[status], 1, "139",build[0]*3)
|
|
67
|
+
# sumstats[status] = vchange_status(sumstats[status], 2, "89",build[1]*3)
|
|
68
|
+
# return sumstats, build
|
|
67
69
|
|
|
68
70
|
def fixID(sumstats,
|
|
69
71
|
snpid="SNPID",rsid="rsID",chrom="CHR",pos="POS",nea="NEA",ea="EA",status="STATUS",fixprefix=False,
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import gc
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import numpy as np
|
|
5
|
+
from itertools import repeat
|
|
6
|
+
from multiprocessing import Pool
|
|
7
|
+
from liftover import get_lifter
|
|
8
|
+
from liftover import ChainFile
|
|
9
|
+
from functools import partial
|
|
10
|
+
from gwaslab.g_vchange_status_polars import vchange_statusp
|
|
11
|
+
from gwaslab.g_vchange_status import status_match
|
|
12
|
+
from gwaslab.g_vchange_status import change_status
|
|
13
|
+
from gwaslab.g_Log import Log
|
|
14
|
+
from gwaslab.bd_common_data import get_chr_to_number
|
|
15
|
+
from gwaslab.bd_common_data import get_number_to_chr
|
|
16
|
+
from gwaslab.bd_common_data import get_chr_list
|
|
17
|
+
from gwaslab.qc_check_datatype import check_datatype
|
|
18
|
+
from gwaslab.qc_check_datatype import check_dataframe_shape
|
|
19
|
+
from gwaslab.qc_build import _process_build
|
|
20
|
+
from gwaslab.qc_build import _set_build
|
|
21
|
+
from gwaslab.g_version import _get_version
|
|
22
|
+
from gwaslab.util_in_fill_data import _convert_betase_to_mlog10p
|
|
23
|
+
from gwaslab.util_in_fill_data import _convert_betase_to_p
|
|
24
|
+
from gwaslab.util_in_fill_data import _convert_mlog10p_to_p
|
|
25
|
+
from gwaslab.bd_common_data import get_chain
|
|
26
|
+
import polars as pl
|
|
27
|
+
###############################################################################################################
|
|
28
|
+
# 20220426
|
|
29
|
+
def get_reverse_complementary_allele(a):
    """Return the reverse complement of the allele string ``a``.

    The string is reversed and the upper-case bases A<->T and C<->G are
    exchanged; every other character is left as it is.
    """
    complement_table = str.maketrans("ATCG", "TAGC")
    reversed_allele = a[::-1]
    return reversed_allele.translate(complement_table)
|
|
36
|
+
|
|
37
|
+
def flip_direction(string):
    """Return ``string`` with every "+" and "-" exchanged.

    "?" and any other symbol (sometimes "0") are copied through unchanged.
    """
    opposite = {"+": "-", "-": "+"}
    return "".join(opposite.get(symbol, symbol) for symbol in string)
|
|
49
|
+
|
|
50
|
+
def flip_by_swap(sumstats, matched_index, log, verbose):
    """Exchange the EA and NEA values on rows selected by ``matched_index``.

    Nothing is changed unless both "NEA" and "EA" are present in
    ``sumstats.columns``. ``matched_index`` is used as the condition of
    ``pl.when``; unselected rows keep their original alleles.

    Returns the (possibly updated) frame.
    """
    if "NEA" not in sumstats.columns or "EA" not in sumstats.columns:
        return sumstats

    log.write(" -Swapping column: NEA <=> EA...", verbose=verbose)

    # Both expressions are handed to a single with_columns call; presumably
    # each reads the pre-swap columns, which is what makes this a true swap
    # -- confirm against the polars with_columns documentation.
    new_nea = pl.when(matched_index).then(pl.col("EA")).otherwise(pl.col("NEA")).alias("NEA")
    new_ea = pl.when(matched_index).then(pl.col("NEA")).otherwise(pl.col("EA")).alias("EA")
    return sumstats.with_columns(new_nea, new_ea)
|
|
67
|
+
|
|
68
|
+
def flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1):
    """Replace ratio statistics with their reciprocal on selected rows.

    Applies ``x -> 1 / x`` to each of OR, OR_95L, OR_95U, HR, HR_95L and
    HR_95U that exists in ``sumstats.columns``, on rows where
    ``matched_index`` holds. ``cols`` and ``factor`` are accepted but not
    used by this implementation.

    Returns the (possibly updated) frame.
    """
    ratio_columns = ("OR", "OR_95L", "OR_95U", "HR", "HR_95L", "HR_95U")
    for name in ratio_columns:
        if name not in sumstats.columns:
            continue
        log.write(" -Flipping column: {header} = 1 / {header}...".format(header = name), verbose=verbose)
        inverted = pl.when(matched_index).then(1 / pl.col(name)).otherwise(pl.col(name))
        sumstats = sumstats.with_columns(inverted.alias(name))
    return sumstats
|
|
79
|
+
|
|
80
|
+
def flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1):
    """Replace EAF with ``1 - EAF`` on rows selected by ``matched_index``.

    Does nothing when there is no "EAF" column. ``cols`` and ``factor`` are
    accepted but not used by this implementation.

    Returns the (possibly updated) frame.
    """
    freq_column = "EAF"
    if freq_column not in sumstats.columns:
        return sumstats

    log.write(" -Flipping column: EAF = 1 - EAF...", verbose=verbose)
    complemented = pl.when(matched_index).then(1 - pl.col(freq_column)).otherwise(pl.col(freq_column))
    return sumstats.with_columns(complemented.alias(freq_column))
|
|
91
|
+
|
|
92
|
+
def flip_by_sign(sumstats, matched_index, log, verbose, cols=None):
    """Negate signed statistics and flip DIRECTION on selected rows.

    For each of BETA, BETA_95L, BETA_95U, T and Z present in
    ``sumstats.columns``, the value is negated on rows where
    ``matched_index`` holds. If a "DIRECTION" column exists, its "+" and "-"
    symbols are exchanged on the same rows via ``flip_direction``.
    ``cols`` is accepted but not used by this implementation.

    Returns the (possibly updated) frame.
    """
    for header in ["BETA","BETA_95L","BETA_95U","T","Z"]:
        if header in sumstats.columns:
            log.write(" -Flipping column: {header} = - {header}...".format(header = header), verbose=verbose)
            sumstats = sumstats.with_columns(
                pl.when( matched_index )
                .then( - pl.col(header) )
                .otherwise( pl.col(header) )
                .alias(header)
            )

    if "DIRECTION" in sumstats.columns:
        # flip_direction works on ONE direction string. The previous
        # map_batches call handed it the whole column, so rows were
        # concatenated instead of flipped one by one; map_elements applies
        # the helper per row.
        sumstats = sumstats.with_columns(
            pl.when( matched_index )
            .then( pl.col("DIRECTION").map_elements(flip_direction, return_dtype=pl.String) )
            .otherwise( pl.col("DIRECTION") )
            .alias("DIRECTION")
        )
    return sumstats
|
|
111
|
+
|
|
112
|
+
def flipallelestatsp(sumstats,status="STATUS",verbose=True,log=Log()):
    """Flip alleles and allele-specific statistics according to status codes.

    Four passes are run in order. Each selects rows by a regex on the
    ``status`` column (cast to string) and acts only if at least one row
    matches:

    1. ``xxxxx[45]x``: EA and NEA are converted to their reverse complement
       (when both columns exist) and status digit 6 is changed 4 -> 2.
    2. ``xxxxx[35]x``: EA/NEA are swapped and signed, frequency and ratio
       statistics are flipped; status digit 6 is changed [35] -> [12].
    3. ``xxxx[123][67]6``: same flips as pass 2; status digit 7 is changed
       6 -> 4.
    4. ``xxxxx[012]5``: statistics are flipped without swapping alleles;
       status digit 7 is changed 5 -> 2.

    Parameters
    ----------
    sumstats : polars DataFrame holding the summary statistics.
    status : str
        Name of the status-code column.
    verbose : bool
        Forwarded to every ``log.write`` call.
    log : Log
        Logger that receives progress messages.

    Returns
    -------
    The updated frame. Rows that match no pattern are kept unchanged.
    """
    if_stats_flipped = False

    ###################get reverse complementary####################
    matched_index = pl.col(status).cast(pl.String).str.contains(r"^\w\w\w\w\w[45]\w")
    n_matched = len(sumstats.filter(matched_index))
    if n_matched>0:
        log.write("Start to convert alleles to reverse complement for SNPs with status xxxxx[45]x...{}".format(_get_version()), verbose=verbose)
        log.write(" -Flipping "+ str(n_matched) +" variants...", verbose=verbose)
        if ("NEA" in sumstats.columns) and ("EA" in sumstats.columns) :
            log.write(" -Converting to reverse complement : EA and NEA...", verbose=verbose)

            # Update matched rows in place. The previous
            # sumstats.filter(...).with_columns(...) discarded every
            # non-matching variant, and map_batches passed a whole Series to
            # a helper that expects a single allele string.
            sumstats = sumstats.with_columns(
                pl.when( matched_index )
                .then( pl.col("NEA").map_elements(get_reverse_complementary_allele, return_dtype=pl.String) )
                .otherwise( pl.col("NEA") )
                .alias("NEA"),

                pl.when( matched_index )
                .then( pl.col("EA").map_elements(get_reverse_complementary_allele, return_dtype=pl.String) )
                .otherwise( pl.col("EA") )
                .alias("EA"),
            )

            sumstats = vchange_statusp(sumstats, matched_index, status,6, "4","2")
            log.write(" -Changed the status for flipped variants : xxxxx4x -> xxxxx2x", verbose=verbose)
        if_stats_flipped = True

    ###################flip ref####################
    matched_index = pl.col(status).cast(pl.String).str.contains(r"^\w\w\w\w\w[35]\w")
    n_matched = len(sumstats.filter(matched_index))
    if n_matched>0:
        log.write("Start to flip allele-specific stats for SNPs with status xxxxx[35]x: ALT->EA , REF->NEA ...{}".format(_get_version()), verbose=verbose)
        log.write(" -Flipping "+ str(n_matched) +" variants...", verbose=verbose)

        sumstats = flip_by_swap(sumstats, matched_index, log, verbose)
        sumstats = flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
        sumstats = flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
        sumstats = flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)

        #change status
        log.write(" -Changed the status for flipped variants : xxxxx[35]x -> xxxxx[12]x", verbose=verbose)
        sumstats = vchange_statusp(sumstats, matched_index,status,6, "35","12")
        if_stats_flipped = True

    ###################flip ref for undistinguishable indels####################
    matched_index = pl.col(status).cast(pl.String).str.contains(r"^\w\w\w\w[123][67]6")
    n_matched = len(sumstats.filter(matched_index))
    if n_matched>0:
        log.write("Start to flip allele-specific stats for standardized indels with status xxxx[123][67][6]: ALT->EA , REF->NEA...{}".format(_get_version()), verbose=verbose)
        log.write(" -Flipping "+ str(n_matched) +" variants...", verbose=verbose)

        sumstats = flip_by_swap(sumstats, matched_index, log, verbose)
        sumstats = flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
        sumstats = flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
        sumstats = flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)

        #change status
        log.write(" -Changed the status for flipped variants xxxx[123][67]6 -> xxxx[123][67]4", verbose=verbose)
        sumstats = vchange_statusp(sumstats, matched_index,status, 7, "6","4")
        if_stats_flipped = True

    ###################flip statistics for reverse strand palindromic variants####################
    matched_index = pl.col(status).cast(pl.String).str.contains(r"^\w\w\w\w\w[012]5")
    n_matched = len(sumstats.filter(matched_index))
    if n_matched>0:
        log.write("Start to flip allele-specific stats for palindromic SNPs with status xxxxx[12]5: (-)strand <=> (+)strand...{}".format(_get_version()), verbose=verbose)
        log.write(" -Flipping "+ str(n_matched) +" variants...", verbose=verbose)

        # Alleles are not swapped in this pass; only the statistics flip.
        sumstats = flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
        sumstats = flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
        sumstats = flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)

        #change status
        log.write(" -Changed the status for flipped variants: xxxxx[012]5: -> xxxxx[012]2", verbose=verbose)
        sumstats = vchange_statusp(sumstats, matched_index,status,7, "5","2")
        if_stats_flipped = True

    if not if_stats_flipped:
        log.write(" -No statistics have been changed.", verbose=verbose)
    return sumstats
|
|
@@ -16,6 +16,7 @@ def tofinemapping(sumstats,
|
|
|
16
16
|
bfile=None,
|
|
17
17
|
vcf=None,
|
|
18
18
|
loci=None,
|
|
19
|
+
loci_chrpos=None,
|
|
19
20
|
out="./",
|
|
20
21
|
plink="plink",
|
|
21
22
|
plink2="plink2",
|
|
@@ -28,8 +29,10 @@ def tofinemapping(sumstats,
|
|
|
28
29
|
overwrite=False,
|
|
29
30
|
log=Log(),
|
|
30
31
|
suffixes=None,
|
|
32
|
+
extra_plink_option="",
|
|
31
33
|
verbose=True,
|
|
32
34
|
**kwargs):
|
|
35
|
+
|
|
33
36
|
##start function with col checking##########################################################
|
|
34
37
|
_start_line = "calculate LD matrix"
|
|
35
38
|
_end_line = "calculating LD matrix"
|
|
@@ -52,11 +55,21 @@ def tofinemapping(sumstats,
|
|
|
52
55
|
if getlead_args is None:
|
|
53
56
|
getlead_args={"windowsizekb":1000}
|
|
54
57
|
|
|
55
|
-
if
|
|
56
|
-
|
|
57
|
-
|
|
58
|
+
if loci_chrpos is None:
|
|
59
|
+
if loci is None:
|
|
60
|
+
log.write(" -Loci were not provided. All significant loci will be automatically extracted...",verbose=verbose)
|
|
61
|
+
sig_df = getsig(sumstats,id="SNPID",chrom="CHR",pos="POS",p="P"+suffixes[0],**getlead_args)
|
|
62
|
+
else:
|
|
63
|
+
sig_df = sumstats.loc[sumstats["SNPID"].isin(loci),:]
|
|
58
64
|
else:
|
|
59
|
-
sig_df =
|
|
65
|
+
sig_df = pd.DataFrame()
|
|
66
|
+
for chrpos in loci_chrpos:
|
|
67
|
+
chrpos_row_dict={}
|
|
68
|
+
chrpos_row_dict["SNPID"]="{}:{}".format(chrpos[0], chrpos[1])
|
|
69
|
+
chrpos_row_dict["CHR"] = chrpos[0]
|
|
70
|
+
chrpos_row_dict["POS"] = chrpos[1]
|
|
71
|
+
chrpos_row = pd.Series(chrpos_row_dict).to_frame().T
|
|
72
|
+
sig_df = pd.concat([sig_df, chrpos_row],ignore_index=True)
|
|
60
73
|
|
|
61
74
|
log.write(" -plink1.9 path: {}".format(plink),verbose=verbose)
|
|
62
75
|
log.write(" -plink2 path: {}".format(plink2),verbose=verbose)
|
|
@@ -128,6 +141,8 @@ def tofinemapping(sumstats,
|
|
|
128
141
|
filetype=filetype,
|
|
129
142
|
plink=plink,
|
|
130
143
|
plink2=plink2,
|
|
144
|
+
extra_plink_option=extra_plink_option,
|
|
145
|
+
ref_allele_path = matched_sumstats_path,
|
|
131
146
|
verbose=verbose)
|
|
132
147
|
|
|
133
148
|
|
|
@@ -136,7 +151,7 @@ def tofinemapping(sumstats,
|
|
|
136
151
|
row_dict["SNPID"]=row["SNPID"]
|
|
137
152
|
row_dict["SNPID_LIST"] = matched_snp_list_path
|
|
138
153
|
row_dict["LD_R_MATRIX"] = matched_ld_matrix_path
|
|
139
|
-
row_dict["LOCUS_SUMSTATS"] = matched_sumstats_path
|
|
154
|
+
row_dict["LOCUS_SUMSTATS"] = matched_sumstats_path+".gz"
|
|
140
155
|
file_row = pd.Series(row_dict).to_frame().T
|
|
141
156
|
output_file_list = pd.concat([output_file_list, file_row],ignore_index=True)
|
|
142
157
|
|
|
@@ -156,7 +171,7 @@ def tofinemapping(sumstats,
|
|
|
156
171
|
|
|
157
172
|
|
|
158
173
|
|
|
159
|
-
def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, windowsizekb,out,plink_log,log,memory,mode,filetype,plink,plink2,verbose=True):
|
|
174
|
+
def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, windowsizekb,out,plink_log,log,memory,mode,filetype,plink,plink2,ref_allele_path, extra_plink_option="",verbose=True):
|
|
160
175
|
'''
|
|
161
176
|
Calculate LD r matrix by calling PLINK; return file name and log
|
|
162
177
|
'''
|
|
@@ -177,18 +192,32 @@ def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, w
|
|
|
177
192
|
if filetype=="pfile":
|
|
178
193
|
raise ValueError("Please use bfile instead of pfile for PLINK1.")
|
|
179
194
|
|
|
195
|
+
#log.write(" -Flipping plink file ref allele to match...",verbose=verbose)
|
|
196
|
+
#script_vcf_to_bfile = """
|
|
197
|
+
#{} \
|
|
198
|
+
# --bfile {} \
|
|
199
|
+
# --extract {} \
|
|
200
|
+
# --chr {} \
|
|
201
|
+
# --ref-allele 'force' {} 4 1 \
|
|
202
|
+
# --threads {} {} \
|
|
203
|
+
# --make-bed \
|
|
204
|
+
# --out {}
|
|
205
|
+
|
|
206
|
+
#""".format(plink2, bfile_to_use, snplist_path, row["CHR"],ref_allele_path, n_cores, memory_flag if memory is not None else "", output_prefix+"_gwaslab_tmp")
|
|
207
|
+
|
|
208
|
+
log.write(" -Calculating r matrix...",verbose=verbose)
|
|
180
209
|
script_vcf_to_bfile = """
|
|
181
210
|
{} \
|
|
182
211
|
--bfile {} \
|
|
183
|
-
--
|
|
212
|
+
--a2-allele {} 4 1 \
|
|
184
213
|
--extract {} \
|
|
185
214
|
--chr {} \
|
|
186
215
|
--{} square gz \
|
|
187
216
|
--allow-no-sex \
|
|
188
217
|
--threads {} {}\
|
|
189
218
|
--write-snplist \
|
|
190
|
-
--out {}
|
|
191
|
-
""".format(plink, bfile_to_use, snplist_path , row["CHR"], mode, n_cores, memory_flag if memory is not None else "", output_prefix)
|
|
219
|
+
--out {} {}
|
|
220
|
+
""".format(plink, bfile_to_use, ref_allele_path, snplist_path , row["CHR"], mode, n_cores, memory_flag if memory is not None else "", output_prefix, extra_plink_option)
|
|
192
221
|
|
|
193
222
|
try:
|
|
194
223
|
output = subprocess.check_output(script_vcf_to_bfile, stderr=subprocess.STDOUT, shell=True,text=True)
|
|
@@ -236,20 +265,20 @@ def _align_sumstats_with_bim(row, locus_sumstats, ref_bim, log=Log(),suffixes=No
|
|
|
236
265
|
log.warning("Lead variant was not available in reference!")
|
|
237
266
|
|
|
238
267
|
# adjust statistics
|
|
239
|
-
output_columns=["SNPID","CHR","POS","
|
|
268
|
+
output_columns=["SNPID","CHR","POS","EA","NEA"]
|
|
240
269
|
for suffix in suffixes:
|
|
241
270
|
if ("BETA"+suffix in locus_sumstats.columns) and ("SE"+suffix in locus_sumstats.columns):
|
|
242
|
-
log.write(" -Flipping BETA{} for variants with flipped alleles...".format(suffix))
|
|
243
|
-
combined_df.loc[flipped_match,"BETA"+suffix] = - combined_df.loc[flipped_match,"BETA"+suffix]
|
|
271
|
+
#log.write(" -Flipping BETA{} for variants with flipped alleles...".format(suffix))
|
|
272
|
+
#combined_df.loc[flipped_match,"BETA"+suffix] = - combined_df.loc[flipped_match,"BETA"+suffix]
|
|
244
273
|
output_columns.append("BETA"+suffix)
|
|
245
274
|
output_columns.append("SE"+suffix)
|
|
246
275
|
if "Z" in locus_sumstats.columns:
|
|
247
|
-
log.write(" -Flipping Z{} for variants with flipped alleles...".format(suffix))
|
|
248
|
-
combined_df.loc[flipped_match,"Z"+suffix] = - combined_df.loc[flipped_match,"Z"+suffix]
|
|
276
|
+
#log.write(" -Flipping Z{} for variants with flipped alleles...".format(suffix))
|
|
277
|
+
#combined_df.loc[flipped_match,"Z"+suffix] = - combined_df.loc[flipped_match,"Z"+suffix]
|
|
249
278
|
output_columns.append("Z"+suffix)
|
|
250
279
|
if "EAF" in locus_sumstats.columns:
|
|
251
|
-
log.write(" -Flipping EAF{} for variants with flipped alleles...".format(suffix))
|
|
252
|
-
combined_df.loc[flipped_match,"EAF"+suffix] = 1 - combined_df.loc[flipped_match,"EAF"+suffix]
|
|
280
|
+
#log.write(" -Flipping EAF{} for variants with flipped alleles...".format(suffix))
|
|
281
|
+
#combined_df.loc[flipped_match,"EAF"+suffix] = 1 - combined_df.loc[flipped_match,"EAF"+suffix]
|
|
253
282
|
output_columns.append("EAF"+suffix)
|
|
254
283
|
if "N" in locus_sumstats.columns:
|
|
255
284
|
output_columns.append("N"+suffix)
|
|
@@ -266,9 +295,9 @@ def _export_snplist_and_locus_sumstats(matched_sumstats, out, study, row, window
|
|
|
266
295
|
log.write(" -Exporting SNP list of {} to: {}...".format(len(matched_sumstats) ,matched_snp_list_path))
|
|
267
296
|
|
|
268
297
|
# create locus-sumstats EA, NEA, (BETA, SE), Z
|
|
269
|
-
matched_sumstats_path = "{}/{}_{}_{}.sumstats
|
|
298
|
+
matched_sumstats_path = "{}/{}_{}_{}.sumstats".format(out.rstrip("/"), study, row["SNPID"] ,windowsizekb)
|
|
270
299
|
|
|
271
|
-
to_export_columns=["CHR","POS","
|
|
300
|
+
to_export_columns=["CHR","POS","EA","NEA"]
|
|
272
301
|
for suffix in suffixes:
|
|
273
302
|
if "Z"+suffix in matched_sumstats.columns :
|
|
274
303
|
to_export_columns.append("Z"+suffix)
|
|
@@ -282,7 +311,8 @@ def _export_snplist_and_locus_sumstats(matched_sumstats, out, study, row, window
|
|
|
282
311
|
|
|
283
312
|
log.write(" -Exporting locus sumstats to: {}...".format(matched_sumstats_path))
|
|
284
313
|
log.write(" -Exported columns: {}...".format(["SNPID"]+to_export_columns))
|
|
285
|
-
matched_sumstats[ ["SNPID"]+to_export_columns].to_csv(matched_sumstats_path, index=None)
|
|
314
|
+
matched_sumstats[ ["SNPID"]+to_export_columns].to_csv(matched_sumstats_path, sep="\t",index=None)
|
|
315
|
+
matched_sumstats[ ["SNPID"]+to_export_columns].to_csv(matched_sumstats_path+".gz", sep="\t",index=None)
|
|
286
316
|
return matched_snp_list_path, matched_sumstats_path
|
|
287
317
|
|
|
288
318
|
def _check_snpid_order(snplist_path, matched_sumstats_snpid,log):
|
gwaslab/util_ex_gwascatalog.py
CHANGED
|
@@ -3,43 +3,86 @@ import json
|
|
|
3
3
|
import pandas as pd
|
|
4
4
|
import gwaslab as gl
|
|
5
5
|
from gwaslab.g_Log import Log
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
import os
|
|
6
8
|
|
|
7
|
-
def
|
|
9
|
+
def find_efo_cache(efo, path):
    """Return the first file under ``path`` whose name contains ``efo``.

    The directory tree is walked with ``os.walk`` and the search stops at the
    first file name that has ``efo`` as a substring.

    Returns
    -------
    str or bool
        The joined path of the first match, or ``False`` when nothing matches.
    """
    candidates = (
        os.path.join(folder, filename)
        for folder, _subfolders, filenames in os.walk(path)
        for filename in filenames
        if efo in filename
    )
    # The generator is lazy, so walking stops as soon as one match is found.
    return next(candidates, False)
|
|
15
|
+
|
|
16
|
+
def gwascatalog_trait(efo,
|
|
17
|
+
source="NCBI",
|
|
18
|
+
sig_level=5e-8,
|
|
19
|
+
use_cache=True,
|
|
20
|
+
cache_dir="./",
|
|
21
|
+
verbose=True,
|
|
22
|
+
log=Log()):
|
|
8
23
|
|
|
9
24
|
#https://www.ebi.ac.uk/gwas/rest/docs/api
|
|
10
25
|
|
|
11
26
|
base_url = "https://www.ebi.ac.uk/gwas/rest/api/efoTraits/"+efo
|
|
12
27
|
log.write("Start to retrieve data from GWASCatalog...", verbose=verbose)
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
if use_cache==True:
|
|
31
|
+
log.write("searching cache in : {}".format(cache_dir))
|
|
32
|
+
cache = find_efo_cache(efo, cache_dir)
|
|
33
|
+
if cache==False:
|
|
34
|
+
log.write(" -Cache not found for {}... Downloading from GWASCatalog...".format(cache), verbose=verbose)
|
|
35
|
+
else:
|
|
36
|
+
cache = False
|
|
37
|
+
|
|
38
|
+
if cache==False:
|
|
39
|
+
#log.write(" -Please make sure your sumstats is based on GRCh38...", verbose=verbose)
|
|
40
|
+
log.write(" -Requesting (GET) trait information through the GWASCatalog API...", verbose=verbose)
|
|
41
|
+
log.write(" -EFO trait api: "+ base_url, verbose=verbose)
|
|
42
|
+
text = requests.get(base_url)
|
|
17
43
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
44
|
+
log.write(" -Status code: {}".format(text.status_code), verbose=verbose)
|
|
45
|
+
if text.status_code!=200:
|
|
46
|
+
log.write(" -Status code is not 200. Access failed. Please check your internet or the GWAS Catalog sever status.", verbose=verbose)
|
|
47
|
+
log.write(" -Message:{}".format(text.text), verbose=verbose)
|
|
48
|
+
return 0
|
|
23
49
|
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
50
|
+
api_response = json.loads(text.text)
|
|
51
|
+
log.write(" -Trait Name:",api_response["trait"], verbose=verbose)
|
|
52
|
+
log.write(" -Trait URL:",api_response["uri"], verbose=verbose)
|
|
53
|
+
|
|
54
|
+
base_url = "https://www.ebi.ac.uk/gwas/rest/api/efoTraits/"+efo+"/associations?projection=associationByEfoTrait"
|
|
55
|
+
log.write(" -Requesting (GET) GWAS associations through the GWASCatalog API...", verbose=verbose)
|
|
56
|
+
log.write(" -associationsByTraitSummary API: "+ base_url, verbose=verbose)
|
|
57
|
+
log.write(" -Note: this step might take a while...", verbose=verbose)
|
|
58
|
+
|
|
59
|
+
# get request and check status code of response
|
|
60
|
+
raw_data = requests.get(base_url)
|
|
61
|
+
|
|
62
|
+
# whether to proceed based on status code
|
|
63
|
+
is_proceed = check_request_status_code(raw_data.status_code,verbose=verbose,log=log)
|
|
64
|
+
if is_proceed is False: return False
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
log.write(" -Loading json ...", verbose=verbose)
|
|
68
|
+
# Transform API response from JSON into Python dictionary
|
|
69
|
+
api_response = json.loads(raw_data.text)
|
|
70
|
+
|
|
71
|
+
now = datetime.now() # current date and time
|
|
72
|
+
datestring = now.strftime("%Y%m%d")
|
|
73
|
+
json_path = cache_dir + "GWASCatalog_{}_associationsByTraitSummary_text_{}.json".format(efo, datestring)
|
|
74
|
+
|
|
75
|
+
try:
|
|
76
|
+
log.write(" -Saving json to: {} ...".format(json_path), verbose=verbose)
|
|
77
|
+
with open(json_path, 'w', encoding='utf-8') as f:
|
|
78
|
+
json.dump(api_response, f, ensure_ascii=False, indent=4)
|
|
79
|
+
except:
|
|
80
|
+
pass
|
|
81
|
+
else:
|
|
82
|
+
log.write(" -Loading cache for {}: {} ...".format(efo, cache), verbose=verbose)
|
|
83
|
+
with open(cache) as f:
|
|
84
|
+
api_response = json.load(f)
|
|
27
85
|
|
|
28
|
-
base_url = "https://www.ebi.ac.uk/gwas/rest/api/efoTraits/"+efo+"/associations?projection=associationByEfoTrait"
|
|
29
|
-
log.write(" -Requesting (GET) GWAS associations through the GWASCatalog API...", verbose=verbose)
|
|
30
|
-
log.write(" -associationsByTraitSummary API: "+ base_url, verbose=verbose)
|
|
31
|
-
log.write(" -Note: this step might take a while...", verbose=verbose)
|
|
32
|
-
|
|
33
|
-
# get request and check status code of response
|
|
34
|
-
raw_data = requests.get(base_url)
|
|
35
|
-
|
|
36
|
-
# whether to proceed based on status code
|
|
37
|
-
is_proceed = check_request_status_code(raw_data.status_code,verbose=verbose,log=log)
|
|
38
|
-
if is_proceed is False: return False
|
|
39
|
-
|
|
40
|
-
log.write(" -Loading json ...", verbose=verbose)
|
|
41
|
-
# Transform API response from JSON into Python dictionary
|
|
42
|
-
api_response = json.loads(raw_data.text)
|
|
43
86
|
log.write(" -Parsing json ...", verbose=verbose)
|
|
44
87
|
# An
|
|
45
88
|
records=list()
|