gwaslab 3.4.45__py3-none-any.whl → 3.4.47__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/__init__.py +2 -1
- gwaslab/bd_common_data.py +22 -0
- gwaslab/g_Sumstats.py +2 -0
- gwaslab/g_version.py +7 -7
- gwaslab/hm_harmonize_sumstats.py +3 -2
- gwaslab/io_preformat_input.py +22 -1
- gwaslab/io_to_formats.py +8 -3
- gwaslab/qc_fix_sumstats.py +8 -1
- gwaslab/util_ex_calculate_ldmatrix.py +20 -7
- gwaslab/util_ex_calculate_prs.py +13 -7
- gwaslab/util_ex_process_ref.py +22 -11
- gwaslab/util_in_filter_value.py +38 -2
- gwaslab/util_in_get_sig.py +32 -8
- gwaslab/util_in_meta.py +234 -0
- gwaslab/util_in_snphwe.py +58 -0
- gwaslab/viz_aux_chromatin.py +112 -0
- gwaslab/viz_plot_compare_effect.py +4 -1
- gwaslab/viz_plot_mqqplot.py +82 -42
- gwaslab/viz_plot_regional2.py +792 -0
- gwaslab/viz_plot_regionalplot.py +4 -0
- gwaslab/viz_plot_stackedregional.py +97 -22
- {gwaslab-3.4.45.dist-info → gwaslab-3.4.47.dist-info}/METADATA +5 -5
- {gwaslab-3.4.45.dist-info → gwaslab-3.4.47.dist-info}/RECORD +27 -23
- {gwaslab-3.4.45.dist-info → gwaslab-3.4.47.dist-info}/WHEEL +1 -1
- {gwaslab-3.4.45.dist-info → gwaslab-3.4.47.dist-info}/LICENSE +0 -0
- {gwaslab-3.4.45.dist-info → gwaslab-3.4.47.dist-info}/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.45.dist-info → gwaslab-3.4.47.dist-info}/top_level.txt +0 -0
gwaslab/__init__.py
CHANGED
|
@@ -44,4 +44,5 @@ from gwaslab.viz_plot_trumpetplot import plot_power
|
|
|
44
44
|
from gwaslab.viz_plot_trumpetplot import plot_power_x
|
|
45
45
|
from gwaslab.util_ex_process_h5 import process_vcf_to_hfd5
|
|
46
46
|
from gwaslab.util_ex_run_susie import _run_susie_rss as run_susie_rss
|
|
47
|
-
from gwaslab.io_read_tabular import _read_tabular as read_tabular
|
|
47
|
+
from gwaslab.io_read_tabular import _read_tabular as read_tabular
|
|
48
|
+
from gwaslab.util_in_meta import meta_analyze
|
gwaslab/bd_common_data.py
CHANGED
|
@@ -298,6 +298,28 @@ def gtf_to_protein_coding(gtfpath,log=Log(),verbose=True):
|
|
|
298
298
|
|
|
299
299
|
return protein_coding_path
|
|
300
300
|
|
|
301
|
+
def gtf_to_all_gene(gtfpath,log=Log(),verbose=True):
    """
    Build (once) a GTF that keeps only records attached to annotated genes.

    The gene list is every ``gene_id`` found on rows whose ``feature`` is
    ``"gene"``; no biotype filter is applied. All raw GTF rows whose
    ``gene_id`` attribute is in that list are then written, tab-separated and
    without a header, to ``<prefix>all_genes.gtf.gz`` next to the input.
    If that file already exists it is reused and nothing is recomputed.

    Parameters
    ----------
    gtfpath : str
        Path to the source GTF. A trailing ``gtf.gz`` or ``gtf`` is replaced
        by ``all_genes.gtf.gz``; for any other name ``.all_genes.gtf.gz`` is
        appended, so the input file name is never truncated.
    log : Log
        Logger used for progress messages.
    verbose : bool
        Forwarded to ``log.write``.

    Returns
    -------
    str
        Path of the extracted all-genes GTF.
    """
    # Derive the output name from a recognised suffix instead of blindly
    # dropping six characters: user-provided annotations are not guaranteed
    # to end in "gtf.gz". For "*.gtf.gz" inputs the result is identical to
    # the previous `gtfpath[:-6]` behaviour.
    for suffix in ("gtf.gz", "gtf"):
        if gtfpath.endswith(suffix):
            prefix = gtfpath[:-len(suffix)]
            break
    else:
        prefix = gtfpath + "."
    all_gene_path = prefix + "all_genes.gtf.gz"

    # if not existing, extract the records of all genes and output to a new file
    if not path.isfile(all_gene_path):

        # get gene list
        log.write(" - Extracting genes from {}".format(gtfpath),verbose=verbose)
        gtf = read_gtf(gtfpath,usecols=["feature","gene_biotype","gene_id","gene_name"])
        gene_list = gtf.loc[gtf["feature"]=="gene","gene_id"].values
        log.write(" - Loaded {} genes.".format(len(gene_list)),verbose=verbose)

        # extract entry using csv: re-read the raw file so that the original
        # nine GTF columns are written back untouched
        gtf_raw = pd.read_csv(gtfpath,sep="\t",header=None,comment="#",dtype="string")
        # column 8 is the attribute column; pull gene_id out of it for matching
        gtf_raw["_gene_id"] = gtf_raw[8].str.extract(r'gene_id "([\w\.-]+)"')
        gtf_raw = gtf_raw.loc[ gtf_raw["_gene_id"].isin(gene_list) ,:]
        # the helper column must not leak into the output file
        gtf_raw = gtf_raw.drop("_gene_id",axis=1)

        log.write(" - Extracted records are saved to : {} ".format(all_gene_path),verbose=verbose)
        gtf_raw.to_csv(all_gene_path, header=None, index=None, sep="\t")

    return all_gene_path
|
|
322
|
+
|
|
301
323
|
####################################################################################################################
|
|
302
324
|
# From BioPython: https://github.com/biopython/biopython/blob/c5a6b1374267d769b19c1022b4b45472316e78b4/Bio/Seq.py#L36
|
|
303
325
|
def _maketrans(complement_mapping):
|
gwaslab/g_Sumstats.py
CHANGED
|
@@ -121,6 +121,7 @@ class Sumstats():
|
|
|
121
121
|
snpr2=None,
|
|
122
122
|
status=None,
|
|
123
123
|
other=[],
|
|
124
|
+
usekeys=None,
|
|
124
125
|
direction=None,
|
|
125
126
|
verbose=True,
|
|
126
127
|
study="Study_1",
|
|
@@ -200,6 +201,7 @@ class Sumstats():
|
|
|
200
201
|
trait=trait,
|
|
201
202
|
status=status,
|
|
202
203
|
other=other,
|
|
204
|
+
usekeys=usekeys,
|
|
203
205
|
verbose=verbose,
|
|
204
206
|
readargs=readargs,
|
|
205
207
|
log=self.log)
|
gwaslab/g_version.py
CHANGED
|
@@ -15,16 +15,16 @@ def _get_version():
|
|
|
15
15
|
def gwaslab_info():
|
|
16
16
|
# version meta information
|
|
17
17
|
dic={
|
|
18
|
-
"version":"3.4.
|
|
19
|
-
"release_date":"
|
|
18
|
+
"version":"3.4.47",
|
|
19
|
+
"release_date":"20240703"
|
|
20
20
|
}
|
|
21
21
|
return dic
|
|
22
22
|
|
|
23
|
-
def _checking_plink_version(
|
|
24
|
-
if
|
|
25
|
-
which_plink_script = "
|
|
26
|
-
elif
|
|
27
|
-
which_plink_script = "
|
|
23
|
+
def _checking_plink_version(plink=None,plink2=None,log=Log(), verbose=True):
    """
    Log the version string reported by a PLINK executable.

    Runs ``<executable> --version`` through the shell and writes the stripped
    output (stdout and stderr combined) to ``log``.

    Parameters
    ----------
    plink : str or None
        Command/path of the PLINK 1.9 executable. Takes precedence when both
        ``plink`` and ``plink2`` are given.
    plink2 : str or None
        Command/path of the PLINK 2 executable; used when ``plink`` is None.
    log : Log
        Logger that receives the version line.
    verbose : bool
        Forwarded to ``log.write``.

    Returns
    -------
    Log
        The same ``log`` object that was passed in.

    Raises
    ------
    ValueError
        If neither ``plink`` nor ``plink2`` is provided.
    subprocess.CalledProcessError
        Propagated from ``subprocess.check_output`` when the command exits
        with a non-zero status.
    """
    if plink is not None:
        which_plink_script = "{} --version".format(plink)
    elif plink2 is not None:
        which_plink_script = "{} --version".format(plink2)
    else:
        # Previously this fell through and failed with an UnboundLocalError
        # on `which_plink_script`; fail early with an actionable message.
        raise ValueError("Please specify the path to plink or plink2 to check its version.")

    # NOTE(review): the executable string is interpolated into a shell command
    # (shell=True); callers must only pass trusted paths.
    output = subprocess.check_output(which_plink_script, stderr=subprocess.STDOUT, shell=True,text=True)
    log.write(" -PLINK version: {}".format(output.strip()),verbose=verbose)
    return log
|
gwaslab/hm_harmonize_sumstats.py
CHANGED
|
@@ -868,8 +868,9 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
|
|
|
868
868
|
if is_enough_info == False: return sumstats
|
|
869
869
|
############################################################################################
|
|
870
870
|
|
|
871
|
-
standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234]\w\w", case=False, flags=0, na=False)
|
|
872
|
-
|
|
871
|
+
#standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234]\w\w", case=False, flags=0, na=False)
|
|
872
|
+
standardized_normalized = sumstats["STATUS"] == sumstats["STATUS"]
|
|
873
|
+
|
|
873
874
|
if rsid not in sumstats.columns:
|
|
874
875
|
sumstats[rsid]=pd.Series(dtype="string")
|
|
875
876
|
|
gwaslab/io_preformat_input.py
CHANGED
|
@@ -55,6 +55,7 @@ def preformat(sumstats,
|
|
|
55
55
|
trait=None,
|
|
56
56
|
build=None,
|
|
57
57
|
other=[],
|
|
58
|
+
usekeys=None,
|
|
58
59
|
verbose=False,
|
|
59
60
|
readargs=None,
|
|
60
61
|
log=None):
|
|
@@ -65,6 +66,11 @@ def preformat(sumstats,
|
|
|
65
66
|
dtype_dictionary ={}
|
|
66
67
|
|
|
67
68
|
#######################################################################################################################################################
|
|
69
|
+
# workflow:
|
|
70
|
+
# 1. formatbook
|
|
71
|
+
# 2. user specified header
|
|
72
|
+
# 3. usekeys
|
|
73
|
+
|
|
68
74
|
if fmt is not None:
|
|
69
75
|
# loading format parameters
|
|
70
76
|
log.write("Start to load format from formatbook....",verbose=verbose)
|
|
@@ -129,6 +135,8 @@ def preformat(sumstats,
|
|
|
129
135
|
|
|
130
136
|
################################################
|
|
131
137
|
for key,value in rename_dictionary.items():
|
|
138
|
+
# check available keys key->raw header
|
|
139
|
+
# usecols : a list of raw headers to load from file/DataFrame
|
|
132
140
|
if key in raw_cols:
|
|
133
141
|
usecols.append(key)
|
|
134
142
|
if value in ["EA","NEA"]:
|
|
@@ -137,7 +145,7 @@ def preformat(sumstats,
|
|
|
137
145
|
dtype_dictionary[value]="string"
|
|
138
146
|
|
|
139
147
|
except ValueError:
|
|
140
|
-
raise ValueError("Please input a path or a pd.DataFrame, and make sure the columns you specified are in the file.")
|
|
148
|
+
raise ValueError("Please input a path or a pd.DataFrame, and make sure the separator is correct and the columns you specified are in the file.")
|
|
141
149
|
|
|
142
150
|
###################################################################################################################################################
|
|
143
151
|
## check columns/datatype to use
|
|
@@ -276,6 +284,19 @@ def preformat(sumstats,
|
|
|
276
284
|
else:
|
|
277
285
|
study = raw_cols[9]
|
|
278
286
|
usecols = usecols + [study]
|
|
287
|
+
|
|
288
|
+
if usekeys is not None:
|
|
289
|
+
# extract only specified keys
|
|
290
|
+
usecols_new =[]
|
|
291
|
+
for i in usekeys:
|
|
292
|
+
for k, v in rename_dictionary.items():
|
|
293
|
+
if i == v:
|
|
294
|
+
usecols_new.append(k)
|
|
295
|
+
usecols_valid =[]
|
|
296
|
+
for i in usecols_new:
|
|
297
|
+
if i in usecols:
|
|
298
|
+
usecols_valid.append(i)
|
|
299
|
+
usecols = usecols_valid
|
|
279
300
|
#loading data ##########################################################################################################
|
|
280
301
|
|
|
281
302
|
try:
|
gwaslab/io_to_formats.py
CHANGED
|
@@ -212,8 +212,10 @@ def tofmt(sumstats,
|
|
|
212
212
|
log.write(" -Start outputting sumstats in "+fmt+" format...")
|
|
213
213
|
|
|
214
214
|
if "CHR" in sumstats.columns:
|
|
215
|
+
# output X,Y,MT instead of 23,24,25
|
|
215
216
|
if xymt_number is False and pd.api.types.is_integer_dtype(sumstats["CHR"]):
|
|
216
217
|
sumstats["CHR"]= sumstats["CHR"].map(get_number_to_chr(xymt=xymt,prefix=chr_prefix))
|
|
218
|
+
# add prefix to CHR
|
|
217
219
|
elif chr_prefix is not None:
|
|
218
220
|
sumstats["CHR"]= chr_prefix + sumstats["CHR"].astype("string")
|
|
219
221
|
|
|
@@ -437,17 +439,20 @@ def _configure_output_cols_and_args(sumstats, rename_dictionary, cols, no_status
|
|
|
437
439
|
ouput_cols.append(i)
|
|
438
440
|
|
|
439
441
|
# + additional cols and remove duplicated
|
|
440
|
-
|
|
442
|
+
ouput_cols_final = []
|
|
443
|
+
for i in ouput_cols + cols:
|
|
444
|
+
if i not in ouput_cols_final:
|
|
445
|
+
ouput_cols_final.append(i)
|
|
441
446
|
|
|
442
447
|
# remove STATUS
|
|
443
448
|
try:
|
|
444
449
|
if no_status == True:
|
|
445
|
-
|
|
450
|
+
ouput_cols_final.remove("STATUS")
|
|
446
451
|
except:
|
|
447
452
|
pass
|
|
448
453
|
|
|
449
454
|
#filter and rename to target format headers
|
|
450
|
-
sumstats = sumstats[
|
|
455
|
+
sumstats = sumstats[ouput_cols_final]
|
|
451
456
|
sumstats = sumstats.rename(columns=rename_dictionary)
|
|
452
457
|
|
|
453
458
|
# configure target format args and reorder columns
|
gwaslab/qc_fix_sumstats.py
CHANGED
|
@@ -1061,6 +1061,13 @@ def check_range(sumstats, var_range, header, coltocheck, cols_to_check, log, ver
|
|
|
1061
1061
|
if sum(is_low_p) >0:
|
|
1062
1062
|
log.warning("Extremely low P detected (P=0 or P < minimum positive value of float64) : {}".format(sum(is_low_p)))
|
|
1063
1063
|
log.warning("Please consider using MLOG10P instead.")
|
|
1064
|
+
|
|
1065
|
+
if header=="INFO":
|
|
1066
|
+
is_high_info = sumstats["INFO"]>1
|
|
1067
|
+
if sum(is_high_info) >0:
|
|
1068
|
+
log.warning("High INFO detected (INFO>1) : {}".format(sum(is_high_info)))
|
|
1069
|
+
log.warning("max(INFO): {}".format(sumstats["INFO"].max()))
|
|
1070
|
+
log.warning("Please check if this is as expected.")
|
|
1064
1071
|
|
|
1065
1072
|
if sum(~is_valid)>0:
|
|
1066
1073
|
try:
|
|
@@ -1102,7 +1109,7 @@ def sanitycheckstats(sumstats,
|
|
|
1102
1109
|
HR=(-100,100),
|
|
1103
1110
|
HR_95L=(0,float("Inf")),
|
|
1104
1111
|
HR_95U=(0,float("Inf")),
|
|
1105
|
-
info=(0,
|
|
1112
|
+
info=(0,2),
|
|
1106
1113
|
float_tolerence = 1e-7,
|
|
1107
1114
|
verbose=True,
|
|
1108
1115
|
log=Log()):
|
|
@@ -17,6 +17,8 @@ def tofinemapping(sumstats,
|
|
|
17
17
|
vcf=None,
|
|
18
18
|
loci=None,
|
|
19
19
|
out="./",
|
|
20
|
+
plink="plink",
|
|
21
|
+
plink2="plink2",
|
|
20
22
|
windowsizekb=1000,
|
|
21
23
|
n_cores=1,
|
|
22
24
|
mode="r",
|
|
@@ -56,6 +58,9 @@ def tofinemapping(sumstats,
|
|
|
56
58
|
else:
|
|
57
59
|
sig_df = sumstats.loc[sumstats["SNPID"].isin(loci),:]
|
|
58
60
|
|
|
61
|
+
log.write(" -plink1.9 path: {}".format(plink),verbose=verbose)
|
|
62
|
+
log.write(" -plink2 path: {}".format(plink2),verbose=verbose)
|
|
63
|
+
|
|
59
64
|
# Drop duplicate!!!!
|
|
60
65
|
log.write(" -Dropping duplicated SNPIDs...",verbose=verbose)
|
|
61
66
|
sumstats = sumstats.drop_duplicates(subset=["SNPID"]).copy()
|
|
@@ -68,11 +73,13 @@ def tofinemapping(sumstats,
|
|
|
68
73
|
if exclude_hla==True:
|
|
69
74
|
sig_df = _exclude_hla(sig_df, log=log, verbose=verbose)
|
|
70
75
|
|
|
76
|
+
sig_df = sig_df.reset_index()
|
|
77
|
+
|
|
71
78
|
## for each lead variant
|
|
72
79
|
for index, row in sig_df.iterrows():
|
|
73
80
|
# extract snplist in each locus
|
|
74
81
|
gc.collect()
|
|
75
|
-
|
|
82
|
+
log.write(" -Locus #{}---------------------------------------------------------------".format(index+1))
|
|
76
83
|
log.write(" -Processing locus with lead variant {} at CHR {} POS {} ...".format(row["SNPID"],row["CHR"],row["POS"]))
|
|
77
84
|
locus_sumstats = _extract_variants_in_locus(sumstats, windowsizekb, locus = (row["CHR"],row["POS"]))
|
|
78
85
|
|
|
@@ -84,7 +91,10 @@ def tofinemapping(sumstats,
|
|
|
84
91
|
n_cores=n_cores,
|
|
85
92
|
log=log,
|
|
86
93
|
load_bim=True,
|
|
87
|
-
overwrite=overwrite
|
|
94
|
+
overwrite=overwrite,
|
|
95
|
+
plink=plink,
|
|
96
|
+
plink2=plink2,
|
|
97
|
+
**kwargs)
|
|
88
98
|
|
|
89
99
|
## check available snps with reference file
|
|
90
100
|
matched_sumstats = _align_sumstats_with_bim(row=row,
|
|
@@ -114,7 +124,10 @@ def tofinemapping(sumstats,
|
|
|
114
124
|
windowsizekb=windowsizekb,
|
|
115
125
|
out=out,
|
|
116
126
|
plink_log=plink_log,
|
|
117
|
-
log=log,
|
|
127
|
+
log=log,
|
|
128
|
+
filetype=filetype,
|
|
129
|
+
plink=plink,
|
|
130
|
+
plink2=plink2,
|
|
118
131
|
verbose=verbose)
|
|
119
132
|
|
|
120
133
|
|
|
@@ -143,12 +156,12 @@ def tofinemapping(sumstats,
|
|
|
143
156
|
|
|
144
157
|
|
|
145
158
|
|
|
146
|
-
def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, windowsizekb,out,plink_log,log,memory,mode,filetype,verbose=True):
|
|
159
|
+
def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, windowsizekb,out,plink_log,log,memory,mode,filetype,plink,plink2,verbose=True):
|
|
147
160
|
'''
|
|
148
161
|
Calculate LD r matrix by calling PLINK; return file name and log
|
|
149
162
|
'''
|
|
150
163
|
log.write(" -Start to calculate LD r matrix...",verbose=verbose)
|
|
151
|
-
log = _checking_plink_version(
|
|
164
|
+
log = _checking_plink_version(plink=plink, log=log)
|
|
152
165
|
if "@" in bfile_prefix:
|
|
153
166
|
bfile_to_use = bfile_prefix.replace("@",str(row["CHR"]))
|
|
154
167
|
else:
|
|
@@ -165,7 +178,7 @@ def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, w
|
|
|
165
178
|
raise ValueError("Please use bfile instead of pfile for PLINK1.")
|
|
166
179
|
|
|
167
180
|
script_vcf_to_bfile = """
|
|
168
|
-
|
|
181
|
+
{} \
|
|
169
182
|
--bfile {} \
|
|
170
183
|
--keep-allele-order \
|
|
171
184
|
--extract {} \
|
|
@@ -175,7 +188,7 @@ def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, w
|
|
|
175
188
|
--threads {} {}\
|
|
176
189
|
--write-snplist \
|
|
177
190
|
--out {}
|
|
178
|
-
""".format(bfile_to_use, snplist_path , row["CHR"], mode, n_cores, memory_flag if memory is not None else "", output_prefix)
|
|
191
|
+
""".format(plink, bfile_to_use, snplist_path , row["CHR"], mode, n_cores, memory_flag if memory is not None else "", output_prefix)
|
|
179
192
|
|
|
180
193
|
try:
|
|
181
194
|
output = subprocess.check_output(script_vcf_to_bfile, stderr=subprocess.STDOUT, shell=True,text=True)
|
gwaslab/util_ex_calculate_prs.py
CHANGED
|
@@ -18,6 +18,8 @@ def _calculate_prs(sumstats,
|
|
|
18
18
|
memory=None,
|
|
19
19
|
overwrite=False,
|
|
20
20
|
mode=None,delete=True,
|
|
21
|
+
plink="plink",
|
|
22
|
+
plink2="plink2",
|
|
21
23
|
log=Log(),**kwargs):
|
|
22
24
|
|
|
23
25
|
#matching_alleles
|
|
@@ -30,14 +32,18 @@ def _calculate_prs(sumstats,
|
|
|
30
32
|
chrlist.sort()
|
|
31
33
|
plink_log = ""
|
|
32
34
|
#process reference file
|
|
33
|
-
bfile_prefix, plink_log, ref_bim, filetype = _process_plink_input_files(
|
|
35
|
+
bfile_prefix, plink_log, ref_bim, filetype = _process_plink_input_files(
|
|
36
|
+
chrlist=chrlist,
|
|
34
37
|
bfile=bfile,
|
|
35
38
|
vcf=vcf,
|
|
36
39
|
plink_log=plink_log,
|
|
37
40
|
n_cores=n_cores,
|
|
38
41
|
log=log,
|
|
39
42
|
load_bim=False,
|
|
40
|
-
overwrite=overwrite
|
|
43
|
+
overwrite=overwrite,
|
|
44
|
+
plink=plink,
|
|
45
|
+
plink2=plink2,
|
|
46
|
+
**kwargs)
|
|
41
47
|
score_file_path_list =[]
|
|
42
48
|
for index, chrom in enumerate(chrlist):
|
|
43
49
|
chr_sumstats = sumstats.loc[sumstats["CHR"]==chrom,:].copy()
|
|
@@ -61,7 +67,7 @@ def _calculate_prs(sumstats,
|
|
|
61
67
|
plink_log=plink_log,
|
|
62
68
|
log=log,
|
|
63
69
|
memory=memory,
|
|
64
|
-
mode=mode,filetype=filetype)
|
|
70
|
+
mode=mode,filetype=filetype,plink2=plink2)
|
|
65
71
|
score_file_path_list.append(score_file_path)
|
|
66
72
|
if delete == True:
|
|
67
73
|
os.remove(model_path)
|
|
@@ -71,10 +77,10 @@ def _calculate_prs(sumstats,
|
|
|
71
77
|
|
|
72
78
|
|
|
73
79
|
|
|
74
|
-
def _run_calculate_prs(study, chrom , model_path, bfile_prefix, n_cores, out, plink_log, log, memory,filetype, mode=None):
|
|
80
|
+
def _run_calculate_prs(study, chrom , model_path, bfile_prefix, n_cores, out, plink_log, log, memory,filetype, plink2,mode=None):
|
|
75
81
|
|
|
76
82
|
log.write(" -Start to calculate PRS for Chr {}...".format(chrom))
|
|
77
|
-
_checking_plink_version(
|
|
83
|
+
_checking_plink_version(plink2=plink2, log=log)
|
|
78
84
|
|
|
79
85
|
if "@" in bfile_prefix:
|
|
80
86
|
bpfile_to_use = bfile_prefix.replace("@",str(chrom))
|
|
@@ -92,13 +98,13 @@ def _run_calculate_prs(study, chrom , model_path, bfile_prefix, n_cores, out, pl
|
|
|
92
98
|
memory_flag = "--memory {}".format(memory)
|
|
93
99
|
|
|
94
100
|
script_vcf_to_bfile = """
|
|
95
|
-
|
|
101
|
+
{} \
|
|
96
102
|
{} \
|
|
97
103
|
--score {} 1 2 3 header {} cols=+scoresums,+denom ignore-dup-ids \
|
|
98
104
|
--chr {} \
|
|
99
105
|
--threads {} {}\
|
|
100
106
|
--out {}
|
|
101
|
-
""".format(file_flag, model_path , mode if mode is not None else "", chrom, n_cores, memory_flag if memory is not None else "", output_prefix)
|
|
107
|
+
""".format(plink2, file_flag, model_path , mode if mode is not None else "", chrom, n_cores, memory_flag if memory is not None else "", output_prefix)
|
|
102
108
|
|
|
103
109
|
try:
|
|
104
110
|
output = subprocess.check_output(script_vcf_to_bfile, stderr=subprocess.STDOUT, shell=True,text=True)
|
gwaslab/util_ex_process_ref.py
CHANGED
|
@@ -20,7 +20,9 @@ def _process_plink_input_files(chrlist,
|
|
|
20
20
|
bgen_mode="ref-first",
|
|
21
21
|
convert="bfile",
|
|
22
22
|
memory=None,
|
|
23
|
-
load_bim=False
|
|
23
|
+
load_bim=False,
|
|
24
|
+
plink="plink",
|
|
25
|
+
plink2="plink2"):
|
|
24
26
|
"""
|
|
25
27
|
Process input files (bfile,pfile,vcf,bgen) to either PLINK1 bed/bim/fam or PLINK2 pgen/psam/pvar.
|
|
26
28
|
|
|
@@ -66,7 +68,9 @@ def _process_plink_input_files(chrlist,
|
|
|
66
68
|
convert=convert,
|
|
67
69
|
memory=memory,
|
|
68
70
|
overwrite=overwrite,
|
|
69
|
-
load_bim=load_bim
|
|
71
|
+
load_bim=load_bim,
|
|
72
|
+
plink=plink,
|
|
73
|
+
plink2=plink2)
|
|
70
74
|
filetype = convert
|
|
71
75
|
elif filetype == "bgen":
|
|
72
76
|
ref_file_prefix, plink_log, ref_bims = _process_bgen(ref_file_prefix=ref_file_prefix,
|
|
@@ -81,7 +85,9 @@ def _process_plink_input_files(chrlist,
|
|
|
81
85
|
convert=convert,
|
|
82
86
|
memory=memory,
|
|
83
87
|
overwrite=overwrite,
|
|
84
|
-
load_bim=load_bim
|
|
88
|
+
load_bim=load_bim,
|
|
89
|
+
plink=plink,
|
|
90
|
+
plink2=plink2)
|
|
85
91
|
filetype = convert
|
|
86
92
|
return ref_file_prefix, plink_log, ref_bims, filetype
|
|
87
93
|
|
|
@@ -199,11 +205,13 @@ def _process_vcf(ref_file_prefix,
|
|
|
199
205
|
convert="bfile",
|
|
200
206
|
memory=None,
|
|
201
207
|
overwrite=False,
|
|
202
|
-
load_bim=False
|
|
208
|
+
load_bim=False,
|
|
209
|
+
plink="plink",
|
|
210
|
+
plink2="plink2"):
|
|
203
211
|
log.write(" -Processing VCF : {}...".format(ref_file_prefix))
|
|
204
212
|
|
|
205
213
|
#check plink version
|
|
206
|
-
log = _checking_plink_version(
|
|
214
|
+
log = _checking_plink_version(plink2=plink2,log=log)
|
|
207
215
|
|
|
208
216
|
# file path prefix to return
|
|
209
217
|
if is_wild_card==True:
|
|
@@ -243,14 +251,15 @@ def _process_vcf(ref_file_prefix,
|
|
|
243
251
|
#if not existing or overwrite is True
|
|
244
252
|
if (not is_file_exist) or overwrite:
|
|
245
253
|
script_vcf_to_bfile = """
|
|
246
|
-
|
|
254
|
+
{} \
|
|
247
255
|
--vcf {} \
|
|
248
256
|
--chr {} \
|
|
249
257
|
{} \
|
|
250
258
|
--rm-dup force-first \
|
|
251
259
|
--threads {}{}\
|
|
252
260
|
--out {}
|
|
253
|
-
""".format(
|
|
261
|
+
""".format(plink2,
|
|
262
|
+
vcf_to_load,
|
|
254
263
|
i,
|
|
255
264
|
make_flag,
|
|
256
265
|
n_cores, memory_flag,
|
|
@@ -288,11 +297,13 @@ def _process_bgen(ref_file_prefix,
|
|
|
288
297
|
convert="bfile",
|
|
289
298
|
memory=None,
|
|
290
299
|
overwrite=False,
|
|
291
|
-
load_bim=False
|
|
300
|
+
load_bim=False,
|
|
301
|
+
plink="plink",
|
|
302
|
+
plink2="plink2"):
|
|
292
303
|
log.write(" -Processing BGEN files : {}...".format(ref_file_prefix))
|
|
293
304
|
|
|
294
305
|
#check plink version
|
|
295
|
-
log = _checking_plink_version(
|
|
306
|
+
log = _checking_plink_version(log=log,plink2=plink2)
|
|
296
307
|
|
|
297
308
|
# file path prefix to return
|
|
298
309
|
if is_wild_card==True:
|
|
@@ -338,14 +349,14 @@ def _process_bgen(ref_file_prefix,
|
|
|
338
349
|
#if not existing or overwrite is True
|
|
339
350
|
if (not is_file_exist) or overwrite:
|
|
340
351
|
script_vcf_to_bfile = """
|
|
341
|
-
|
|
352
|
+
{} \
|
|
342
353
|
--bgen {} {} {}\
|
|
343
354
|
--chr {} \
|
|
344
355
|
{} \
|
|
345
356
|
--rm-dup force-first \
|
|
346
357
|
--threads {}{}\
|
|
347
358
|
--out {}
|
|
348
|
-
""".format(bgen_to_load, bgen_mode, sample_flag,
|
|
359
|
+
""".format(plink2,bgen_to_load, bgen_mode, sample_flag,
|
|
349
360
|
i,
|
|
350
361
|
make_flag,
|
|
351
362
|
n_cores, memory_flag,
|
gwaslab/util_in_filter_value.py
CHANGED
|
@@ -10,6 +10,7 @@ from gwaslab.g_vchange_status import vchange_status
|
|
|
10
10
|
from gwaslab.qc_fix_sumstats import sortcoordinate
|
|
11
11
|
from gwaslab.qc_fix_sumstats import start_to
|
|
12
12
|
from gwaslab.qc_fix_sumstats import finished
|
|
13
|
+
from gwaslab.qc_fix_sumstats import _process_build
|
|
13
14
|
from gwaslab.hm_harmonize_sumstats import is_palindromic
|
|
14
15
|
|
|
15
16
|
import gc
|
|
@@ -430,8 +431,43 @@ def _filter_snp(sumstats, mode="in", ea="EA",nea="NEA", log=Log(),verbose=True):
|
|
|
430
431
|
log.write("Finished filtering SNPs.",verbose=verbose)
|
|
431
432
|
return snp
|
|
432
433
|
|
|
433
|
-
def _exclude_hla(sumstats, chrom="CHR", pos="POS", lower=
|
|
434
|
-
|
|
434
|
+
def _exclude_hla(sumstats, chrom="CHR", pos="POS", lower=None ,upper=None, build=None, mode="xmhc", log=Log(), verbose=True):
|
|
435
|
+
|
|
436
|
+
if build is not None:
|
|
437
|
+
build = _process_build(build = build,log = log,verbose = verbose)
|
|
438
|
+
# xMHC : HIST1H2AA ~ 7.6mb ~ RPL12P1
|
|
439
|
+
# reference: Horton, R., Wilming, L., Rand, V., Lovering, R. C., Bruford, E. A., Khodiyar, V. K., ... & Beck, S. (2004). Gene map of the extended human MHC. Nature Reviews Genetics, 5(12), 889-899.
|
|
440
|
+
# hg38: 25,726,063 ~ 33,400,644
|
|
441
|
+
# hg19 : 25,726,291 ~ 33,368,421
|
|
442
|
+
|
|
443
|
+
# HLA : GABBR1 ~ 3.78mb ~ KIFC1
|
|
444
|
+
# reference: Shiina, T., Hosomichi, K., Inoko, H., & Kulski, J. K. (2009). The HLA genomic loci map: expression, interaction, diversity and disease. Journal of human genetics, 54(1), 15-39.
|
|
445
|
+
# hg38: 29,602,238 ~ 33,409,896
|
|
446
|
+
# hg19: 29,570,015 ~ 33,377,673
|
|
447
|
+
|
|
448
|
+
if build == "19":
|
|
449
|
+
if mode =="xmhc":
|
|
450
|
+
lower=25000000
|
|
451
|
+
upper=34000000
|
|
452
|
+
if mode =="hla" or mode =="mhc":
|
|
453
|
+
lower=29500000
|
|
454
|
+
upper=33500000
|
|
455
|
+
if build == "38":
|
|
456
|
+
if mode =="xmhc":
|
|
457
|
+
lower=25000000
|
|
458
|
+
upper=34000000
|
|
459
|
+
if mode =="hla" or mode =="mhc":
|
|
460
|
+
lower=29500000
|
|
461
|
+
upper=33500000
|
|
462
|
+
else:
|
|
463
|
+
# -> 25,000,000 ~ 34,000,000
|
|
464
|
+
if mode =="xmhc":
|
|
465
|
+
lower=25000000
|
|
466
|
+
upper=34000000
|
|
467
|
+
if mode =="hla" or mode =="mhc":
|
|
468
|
+
lower=29500000
|
|
469
|
+
upper=33500000
|
|
470
|
+
|
|
435
471
|
raw_len = len(sumstats)
|
|
436
472
|
|
|
437
473
|
if str(sumstats[chrom].dtype) == "string":
|
gwaslab/util_in_get_sig.py
CHANGED
|
@@ -11,6 +11,7 @@ from gwaslab.bd_common_data import get_chr_to_number
|
|
|
11
11
|
from gwaslab.bd_common_data import get_number_to_chr
|
|
12
12
|
from gwaslab.bd_common_data import get_chr_to_NC
|
|
13
13
|
from gwaslab.bd_common_data import gtf_to_protein_coding
|
|
14
|
+
from gwaslab.bd_common_data import gtf_to_all_gene
|
|
14
15
|
from gwaslab.bd_download import check_and_download
|
|
15
16
|
from gwaslab.util_ex_gwascatalog import gwascatalog_trait
|
|
16
17
|
from gwaslab.qc_fix_sumstats import check_dataframe_shape
|
|
@@ -38,6 +39,7 @@ def getsig(insumstats,
|
|
|
38
39
|
wc_correction=False,
|
|
39
40
|
build="19",
|
|
40
41
|
source="ensembl",
|
|
42
|
+
gtf_path=None,
|
|
41
43
|
verbose=True):
|
|
42
44
|
"""
|
|
43
45
|
Extract the lead variants using a sliding window. P or MLOG10P will be used and converted to SCALEDP for sorting.
|
|
@@ -172,6 +174,7 @@ def getsig(insumstats,
|
|
|
172
174
|
xymt=xymt,
|
|
173
175
|
build=build,
|
|
174
176
|
source=source,
|
|
177
|
+
gtf_path=gtf_path,
|
|
175
178
|
verbose=verbose)
|
|
176
179
|
|
|
177
180
|
# drop internal id
|
|
@@ -253,6 +256,7 @@ def annogene(
|
|
|
253
256
|
xymt=["X","Y","MT"],
|
|
254
257
|
build="19",
|
|
255
258
|
source="ensembl",
|
|
259
|
+
gtf_path=None,
|
|
256
260
|
verbose=True):
|
|
257
261
|
|
|
258
262
|
log.write("Start to annotate variants with nearest gene name(s)...", verbose=verbose)
|
|
@@ -267,8 +271,13 @@ def annogene(
|
|
|
267
271
|
#| gzip >Homo_sapiens.GRCh37.75.processed.chr.gtf.gz
|
|
268
272
|
|
|
269
273
|
#gtf_path = check_and_download("ensembl_hg19_gtf_protein_coding")
|
|
270
|
-
gtf_path
|
|
271
|
-
|
|
274
|
+
if gtf_path is None:
|
|
275
|
+
gtf_path = check_and_download("ensembl_hg19_gtf")
|
|
276
|
+
gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
|
|
277
|
+
else:
|
|
278
|
+
log.write(" -Using user-provided gtf:{}".format(gtf_path))
|
|
279
|
+
gtf_path = gtf_to_all_gene(gtf_path,log=log,verbose=verbose)
|
|
280
|
+
|
|
272
281
|
gtf_db_path = gtf_path[:-2]+"db"
|
|
273
282
|
|
|
274
283
|
data = Genome(
|
|
@@ -283,8 +292,13 @@ def annogene(
|
|
|
283
292
|
elif build=="38":
|
|
284
293
|
log.write(" -Assigning Gene name using ensembl_hg38_gtf for protein coding genes", verbose=verbose)
|
|
285
294
|
#gtf_path = check_and_download("ensembl_hg38_gtf_protein_coding")
|
|
286
|
-
gtf_path
|
|
287
|
-
|
|
295
|
+
if gtf_path is None:
|
|
296
|
+
gtf_path = check_and_download("ensembl_hg38_gtf")
|
|
297
|
+
gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
|
|
298
|
+
else:
|
|
299
|
+
log.write(" -Using user-provided gtf:{}".format(gtf_path))
|
|
300
|
+
gtf_path = gtf_to_all_gene(gtf_path,log=log,verbose=verbose)
|
|
301
|
+
|
|
288
302
|
gtf_db_path = gtf_path[:-2]+"db"
|
|
289
303
|
data = Genome(
|
|
290
304
|
reference_name='GRCh38',
|
|
@@ -300,8 +314,13 @@ def annogene(
|
|
|
300
314
|
if build=="19":
|
|
301
315
|
log.write(" -Assigning Gene name using NCBI refseq latest GRCh37 for protein coding genes", verbose=verbose)
|
|
302
316
|
#gtf_path = check_and_download("refseq_hg19_gtf_protein_coding")
|
|
303
|
-
gtf_path
|
|
304
|
-
|
|
317
|
+
if gtf_path is None:
|
|
318
|
+
gtf_path = check_and_download("refseq_hg19_gtf")
|
|
319
|
+
gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
|
|
320
|
+
else:
|
|
321
|
+
log.write(" -Using user-provided gtf:{}".format(gtf_path))
|
|
322
|
+
gtf_path = gtf_to_all_gene(gtf_path,log=log,verbose=verbose)
|
|
323
|
+
|
|
305
324
|
gtf_db_path = gtf_path[:-2]+"db"
|
|
306
325
|
data = Genome(
|
|
307
326
|
reference_name='GRCh37',
|
|
@@ -315,8 +334,13 @@ def annogene(
|
|
|
315
334
|
elif build=="38":
|
|
316
335
|
log.write(" -Assigning Gene name using NCBI refseq latest GRCh38 for protein coding genes", verbose=verbose)
|
|
317
336
|
#gtf_path = check_and_download("refseq_hg38_gtf_protein_coding")
|
|
318
|
-
gtf_path
|
|
319
|
-
|
|
337
|
+
if gtf_path is None:
|
|
338
|
+
gtf_path = check_and_download("refseq_hg38_gtf")
|
|
339
|
+
gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
|
|
340
|
+
else:
|
|
341
|
+
log.write(" -Using user-provided gtf:{}".format(gtf_path))
|
|
342
|
+
gtf_path = gtf_to_all_gene(gtf_path,log=log,verbose=verbose)
|
|
343
|
+
|
|
320
344
|
gtf_db_path = gtf_path[:-2]+"db"
|
|
321
345
|
data = Genome(
|
|
322
346
|
reference_name='GRCh38',
|