gwaslab 3.4.38__py3-none-any.whl → 3.4.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/bd_common_data.py +6 -3
- gwaslab/bd_download.py +9 -9
- gwaslab/bd_get_hapmap3.py +43 -9
- gwaslab/g_Log.py +14 -5
- gwaslab/g_Sumstats.py +86 -18
- gwaslab/g_SumstatsPair.py +70 -23
- gwaslab/g_SumstatsT.py +2 -2
- gwaslab/g_version.py +10 -10
- gwaslab/hm_casting.py +9 -4
- gwaslab/hm_harmonize_sumstats.py +88 -83
- gwaslab/io_preformat_input.py +14 -14
- gwaslab/io_read_ldsc.py +49 -1
- gwaslab/ldsc_irwls.py +198 -0
- gwaslab/ldsc_jackknife.py +514 -0
- gwaslab/ldsc_ldscore.py +417 -0
- gwaslab/ldsc_parse.py +294 -0
- gwaslab/ldsc_regressions.py +747 -0
- gwaslab/ldsc_sumstats.py +629 -0
- gwaslab/qc_check_datatype.py +1 -1
- gwaslab/qc_fix_sumstats.py +163 -161
- gwaslab/util_ex_calculate_ldmatrix.py +2 -2
- gwaslab/util_ex_gwascatalog.py +24 -24
- gwaslab/util_ex_ldproxyfinder.py +9 -9
- gwaslab/util_ex_ldsc.py +189 -0
- gwaslab/util_in_calculate_gc.py +6 -6
- gwaslab/util_in_calculate_power.py +42 -43
- gwaslab/util_in_convert_h2.py +8 -8
- gwaslab/util_in_fill_data.py +28 -28
- gwaslab/util_in_filter_value.py +91 -52
- gwaslab/util_in_get_density.py +8 -8
- gwaslab/util_in_get_sig.py +407 -65
- gwaslab/viz_aux_annotate_plot.py +12 -12
- gwaslab/viz_aux_quickfix.py +18 -18
- gwaslab/viz_aux_reposition_text.py +3 -3
- gwaslab/viz_aux_save_figure.py +14 -5
- gwaslab/viz_plot_compare_af.py +29 -30
- gwaslab/viz_plot_compare_effect.py +63 -71
- gwaslab/viz_plot_miamiplot2.py +6 -6
- gwaslab/viz_plot_mqqplot.py +17 -3
- gwaslab/viz_plot_qqplot.py +1 -1
- gwaslab/viz_plot_regionalplot.py +33 -32
- gwaslab/viz_plot_rg_heatmap.py +28 -26
- gwaslab/viz_plot_stackedregional.py +40 -21
- gwaslab/viz_plot_trumpetplot.py +50 -55
- gwaslab-3.4.39.dist-info/LICENSE +674 -0
- {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/METADATA +4 -3
- gwaslab-3.4.39.dist-info/RECORD +80 -0
- gwaslab-3.4.38.dist-info/RECORD +0 -72
- /gwaslab-3.4.38.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
gwaslab/g_version.py
CHANGED
|
@@ -3,10 +3,10 @@ import subprocess
|
|
|
3
3
|
import os
|
|
4
4
|
import numpy as np
|
|
5
5
|
|
|
6
|
-
def _show_version(log=Log()):
|
|
6
|
+
def _show_version(log=Log(), verbose=True):
    """Write the GWASLab version banner and copyright notice to ``log``.

    Parameters
    ----------
    log : Log
        Logger whose ``write`` method receives each banner line.
    verbose : bool
        Forwarded unchanged to ``log.write`` for every line.
    """
    # Shown when sumstats are loaded: version/URL line first, then copyright.
    banner_lines = (
        "GWASLab v{} https://cloufield.github.io/gwaslab/".format(gwaslab_info()["version"]),
        "(C) 2022-2024, Yunye He, Kamatani Lab, MIT License, gwaslab@gmail.com",
    )
    for banner_line in banner_lines:
        log.write(banner_line, verbose=verbose)
|
|
10
10
|
|
|
11
11
|
def _get_version():
|
|
12
12
|
# return short version string like v3.4.33
|
|
@@ -15,12 +15,12 @@ def _get_version():
|
|
|
15
15
|
def gwaslab_info():
    """Return the version metadata of this release.

    Returns
    -------
    dict
        A freshly built dict with the string entries ``"version"`` and
        ``"release_date"``.
    """
    return {
        "version": "3.4.39",
        "release_date": "20240210",
    }
|
|
22
22
|
|
|
23
|
-
def _checking_plink_version(v=2,log=Log()):
|
|
23
|
+
def _checking_plink_version(v=2,log=Log(), verbose=True):
|
|
24
24
|
if v==1:
|
|
25
25
|
which_plink_script = "plink --version"
|
|
26
26
|
elif v==2:
|
|
@@ -29,19 +29,19 @@ def _checking_plink_version(v=2,log=Log()):
|
|
|
29
29
|
log.write(" -PLINK version: {}".format(output.strip()))
|
|
30
30
|
return log
|
|
31
31
|
|
|
32
|
-
def _checking_r_version(r, log):
|
|
32
|
+
def _checking_r_version(r, log=Log(), verbose=True):
    """Run ``<r> --version`` in a shell and log the captured output.

    Parameters
    ----------
    r : str
        Command used to invoke R; it is interpolated into the shell command.
    log : Log
        Logger that receives the version line.
    verbose : bool
        Forwarded unchanged to ``log.write``.

    Returns
    -------
    Log
        The same ``log`` object that was passed in.
    """
    command = "{} --version".format(r)
    # stderr is merged into stdout so the text is captured from either stream.
    version_text = subprocess.check_output(
        command,
        shell=True,
        text=True,
        stderr=subprocess.STDOUT,
    ).strip()
    log.write(" -R version: {}".format(version_text), verbose=verbose)
    return log
|
|
37
37
|
|
|
38
|
-
def _check_susie_version(r,log):
|
|
38
|
+
def _check_susie_version(r,log=Log(), verbose=True):
    """Query the installed susieR package version through R and log it.

    A one-line R script is written to a randomly named file in the current
    working directory, executed with ``<r> <script>`` in a shell, and then
    deleted.

    Parameters
    ----------
    r : str
        Command used to run an R script; it is interpolated into the shell
        command together with the script name.
    log : Log
        Logger that receives the version line.
    verbose : bool
        Forwarded unchanged to ``log.write``.

    Returns
    -------
    Log
        The same ``log`` object that was passed in.

    Raises
    ------
    subprocess.CalledProcessError
        If the R command exits with a non-zero status.
    """
    rscript = 'print(packageVersion("susieR"))'
    temp_r = "_gwaslab_susie_temp_check_version_{}.R".format(np.random.randint(1, 99999999))
    with open(temp_r,"w") as file:
        file.write(rscript)
    which_susie_script = "{} {}".format(r, temp_r)
    try:
        output = subprocess.check_output(which_susie_script, stderr=subprocess.STDOUT, shell=True,text=True)
    finally:
        # Remove the temporary script even when R fails, so a missing or
        # broken R/susieR install does not leave stray files behind.
        os.remove(temp_r)
    log.write(" -SuSieR version: {}".format(output.strip()),verbose=verbose)
    return log
|
gwaslab/hm_casting.py
CHANGED
|
@@ -5,11 +5,15 @@ from pandas.api.types import CategoricalDtype
|
|
|
5
5
|
from gwaslab.g_vchange_status import copy_status
|
|
6
6
|
from gwaslab.g_vchange_status import vchange_status
|
|
7
7
|
from gwaslab.qc_fix_sumstats import flipallelestats
|
|
8
|
+
from gwaslab.qc_check_datatype import check_datatype
|
|
9
|
+
from gwaslab.qc_fix_sumstats import start_to
|
|
8
10
|
from gwaslab.util_in_fill_data import filldata
|
|
9
11
|
from Bio import SeqIO
|
|
10
12
|
from itertools import combinations
|
|
11
13
|
|
|
12
|
-
def
|
|
14
|
+
def _merge_mold_with_sumstats_by_chrpos(mold, sumstats, ref_path=None, windowsizeb=10, log=Log(),suffixes=("_MOLD",""),verbose=True,return_not_matched_mold =False):
|
|
15
|
+
|
|
16
|
+
|
|
13
17
|
cols_to_drop = []
|
|
14
18
|
for i in sumstats.columns:
|
|
15
19
|
if i in ["SNPID","rsID"]:
|
|
@@ -31,6 +35,7 @@ def _merge_mold_with_sumstats(mold, sumstats, ref_path=None, windowsizeb=10, log
|
|
|
31
35
|
if return_not_matched_mold:
|
|
32
36
|
mold["_IDENTIFIER_FOR_VARIANT"] = range(len(mold))
|
|
33
37
|
|
|
38
|
+
# mold sumffix + mold
|
|
34
39
|
mold_sumstats = pd.merge(mold, sumstats, on=["CHR","POS"], how="inner",suffixes=suffixes)
|
|
35
40
|
log.write(" -After merging by CHR and POS:{}".format(len(mold_sumstats)), verbose=verbose)
|
|
36
41
|
|
|
@@ -133,9 +138,9 @@ def _sort_pair_cols(molded_sumstats, verbose=True, log=Log(), order=None, stats_
|
|
|
133
138
|
if i not in order:
|
|
134
139
|
output_columns.append(i)
|
|
135
140
|
|
|
136
|
-
|
|
141
|
+
log.write(" -Reordering columns to :", ",".join(output_columns), verbose=verbose)
|
|
137
142
|
molded_sumstats = molded_sumstats[ output_columns]
|
|
138
|
-
|
|
143
|
+
log.write("Finished sorting columns successfully!", verbose=verbose)
|
|
139
144
|
|
|
140
145
|
return molded_sumstats
|
|
141
146
|
|
|
@@ -168,7 +173,7 @@ def _match_two_sumstats(mold,sumstats,ref_path,windowsizeb=25,verbose=True,log=L
|
|
|
168
173
|
record_chr = int(str(record.id).strip("chrCHR").upper())
|
|
169
174
|
|
|
170
175
|
if record_chr in chromlist:
|
|
171
|
-
|
|
176
|
+
log.write(record_chr," ", end="",show_time=False,verbose=verbose)
|
|
172
177
|
chromlist.remove(record_chr)
|
|
173
178
|
else:
|
|
174
179
|
continue
|
gwaslab/hm_harmonize_sumstats.py
CHANGED
|
@@ -59,17 +59,17 @@ def rsidtochrpos(sumstats,
|
|
|
59
59
|
if is_enough_info == False: return sumstats
|
|
60
60
|
############################################################################################
|
|
61
61
|
|
|
62
|
-
|
|
62
|
+
log.write(" -rsID dictionary file: "+ path,verbose=verbose)
|
|
63
63
|
|
|
64
64
|
if ref_rsid_to_chrpos_tsv is not None:
|
|
65
65
|
path = ref_rsid_to_chrpos_tsv
|
|
66
66
|
|
|
67
67
|
if snpid in sumstats.columns and sum(sumstats[rsid].isna())>0:
|
|
68
|
-
|
|
68
|
+
log.write(" -Filling na in rsID columns with SNPID...",verbose=verbose)
|
|
69
69
|
sumstats.loc[sumstats[rsid].isna(),rsid] = sumstats.loc[sumstats[rsid].isna(),snpid]
|
|
70
70
|
|
|
71
71
|
if sum(sumstats[rsid].isna())>0:
|
|
72
|
-
|
|
72
|
+
log.write(" -Filling na in rsID columns with NA_xxx for {} variants...".format(sum(sumstats[rsid].isna())),verbose=verbose)
|
|
73
73
|
sumstats.loc[sumstats[rsid].isna(),rsid] = ["NA_" + str(x+1) for x in range(len(sumstats.loc[sumstats[rsid].isna(),rsid]))]
|
|
74
74
|
|
|
75
75
|
dic_chuncks = pd.read_csv(path,sep="\t",usecols=[ref_rsid,ref_chr,ref_pos],
|
|
@@ -84,8 +84,8 @@ def rsidtochrpos(sumstats,
|
|
|
84
84
|
if pos not in sumstats.columns:
|
|
85
85
|
sumstats[pos] =pd.Series(dtype="Int64")
|
|
86
86
|
|
|
87
|
-
|
|
88
|
-
|
|
87
|
+
log.write(" -Setting block size: ",chunksize,verbose=verbose)
|
|
88
|
+
log.write(" -Loading block: ",end="",verbose=verbose)
|
|
89
89
|
for i,dic in enumerate(dic_chuncks):
|
|
90
90
|
dic_to_update = dic[dic.index.notnull()]
|
|
91
91
|
log.write(i," ",end=" ",show_time=False)
|
|
@@ -95,10 +95,10 @@ def rsidtochrpos(sumstats,
|
|
|
95
95
|
sumstats.update(dic_to_update,overwrite="True")
|
|
96
96
|
gc.collect()
|
|
97
97
|
|
|
98
|
-
|
|
98
|
+
log.write("\n",end="",show_time=False,verbose=verbose)
|
|
99
99
|
sumstats = sumstats.reset_index()
|
|
100
100
|
sumstats = sumstats.rename(columns = {'index':rsid})
|
|
101
|
-
|
|
101
|
+
log.write(" -Updating CHR and POS finished.Start to re-fixing CHR and POS... ",verbose=verbose)
|
|
102
102
|
sumstats = fixchr(sumstats,verbose=verbose)
|
|
103
103
|
sumstats = fixpos(sumstats,verbose=verbose)
|
|
104
104
|
sumstats = sortcolumn(sumstats,verbose=verbose)
|
|
@@ -158,17 +158,17 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None
|
|
|
158
158
|
|
|
159
159
|
sumstats["rsn"] = pd.to_numeric(sumstats[rsid].str.strip("rs"),errors="coerce").astype("Int64")
|
|
160
160
|
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
161
|
+
log.write(" -Source hdf5 file: ",path,verbose=verbose)
|
|
162
|
+
log.write(" -Cores to use : ",n_cores,verbose=verbose)
|
|
163
|
+
log.write(" -Blocksize (make sure it is the same as hdf5 file ): ",block_size,verbose=verbose)
|
|
164
164
|
|
|
165
165
|
input_columns= sumstats.columns
|
|
166
166
|
sumstats_nonrs = sumstats.loc[sumstats["rsn"].isna()|sumstats["rsn"].duplicated(keep='first') ,:].copy()
|
|
167
167
|
sumstats_rs = sumstats.loc[sumstats["rsn"].notnull(),:].copy()
|
|
168
168
|
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
169
|
+
log.write(" -Non-Valid rsIDs: ",sum(sumstats["rsn"].isna()),verbose=verbose)
|
|
170
|
+
log.write(" -Duplicated rsIDs except for the first occurrence: ",sum(sumstats.loc[~sumstats["rsn"].isna(), "rsn"].duplicated(keep='first')),verbose=verbose)
|
|
171
|
+
log.write(" -Valid rsIDs: ", len(sumstats_rs),verbose=verbose)
|
|
172
172
|
|
|
173
173
|
del sumstats
|
|
174
174
|
gc.collect()
|
|
@@ -185,16 +185,16 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None
|
|
|
185
185
|
#
|
|
186
186
|
pool = Pool(n_cores)
|
|
187
187
|
if chrom not in input_columns:
|
|
188
|
-
|
|
188
|
+
log.write(" -Initiating CHR ... ",verbose=verbose)
|
|
189
189
|
sumstats_rs[chrom]=pd.Series(dtype="Int32")
|
|
190
190
|
|
|
191
191
|
if pos not in input_columns:
|
|
192
|
-
|
|
192
|
+
log.write(" -Initiating POS ... ",verbose=verbose)
|
|
193
193
|
sumstats_rs[pos]=pd.Series(dtype="Int64")
|
|
194
194
|
|
|
195
195
|
df_split=[y for x, y in sumstats_rs.groupby('group', as_index=False)]
|
|
196
|
-
|
|
197
|
-
|
|
196
|
+
log.write(" -Divided into groups: ",len(df_split),verbose=verbose)
|
|
197
|
+
log.write(" -",set(sumstats_rs.loc[:,"group"].unique()),verbose=verbose)
|
|
198
198
|
|
|
199
199
|
# check keys
|
|
200
200
|
store = pd.HDFStore(path, 'r')
|
|
@@ -202,21 +202,21 @@ def parallelrsidtochrpos(sumstats, rsid="rsID", chrom="CHR",pos="POS", path=None
|
|
|
202
202
|
all_groups_len = len(all_groups)
|
|
203
203
|
store.close()
|
|
204
204
|
all_groups_max = max(map(lambda x: int(x.split("_")[1]), all_groups))
|
|
205
|
-
|
|
206
|
-
|
|
205
|
+
log.write(" -Number of groups in HDF5: ",all_groups_len,verbose=verbose)
|
|
206
|
+
log.write(" -Max index of groups in HDF5: ",all_groups_max,verbose=verbose)
|
|
207
207
|
|
|
208
208
|
# update CHR and POS using rsID with multiple threads
|
|
209
209
|
sumstats_rs = pd.concat(pool.map(partial(merge_chrpos,all_groups_max=all_groups_max,path=path,build=build,status=status),df_split),ignore_index=True)
|
|
210
210
|
sumstats_rs.loc[:,["CHR","POS"]] = sumstats_rs.loc[:,["CHR","POS"]].astype("Int64")
|
|
211
211
|
del df_split
|
|
212
212
|
gc.collect()
|
|
213
|
-
|
|
213
|
+
log.write(" -Merging group data... ",verbose=verbose)
|
|
214
214
|
# drop group and rsn
|
|
215
215
|
sumstats_rs = sumstats_rs.drop(columns=["group"])
|
|
216
216
|
sumstats_nonrs = sumstats_nonrs.drop(columns=["rsn"])
|
|
217
217
|
|
|
218
218
|
# merge back
|
|
219
|
-
|
|
219
|
+
log.write(" -Append data... ",verbose=verbose)
|
|
220
220
|
sumstats = pd.concat([sumstats_rs,sumstats_nonrs],ignore_index=True)
|
|
221
221
|
|
|
222
222
|
del sumstats_rs
|
|
@@ -308,8 +308,8 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
|
|
|
308
308
|
**_must_args)
|
|
309
309
|
if is_enough_info == False: return sumstats
|
|
310
310
|
############################################################################################
|
|
311
|
-
|
|
312
|
-
|
|
311
|
+
log.write(" -Reference genome FASTA file: "+ ref_path,verbose=verbose)
|
|
312
|
+
log.write(" -Checking records: ", end="",verbose=verbose)
|
|
313
313
|
chromlist = get_chr_list(add_number=True)
|
|
314
314
|
records = SeqIO.parse(ref_path, "fasta")
|
|
315
315
|
for record in records:
|
|
@@ -321,11 +321,11 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
|
|
|
321
321
|
else:
|
|
322
322
|
i = record_chr
|
|
323
323
|
if i in chromlist:
|
|
324
|
-
|
|
324
|
+
log.write(record_chr," ", end="",show_time=False,verbose=verbose)
|
|
325
325
|
to_check_ref = (sumstats[chrom]==i) & (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna())
|
|
326
326
|
sumstats.loc[to_check_ref,status] = sumstats.loc[to_check_ref,[pos,ea,nea,status]].apply(lambda x:check_status(x,record),axis=1)
|
|
327
327
|
|
|
328
|
-
|
|
328
|
+
log.write("\n",end="",show_time=False,verbose=verbose)
|
|
329
329
|
|
|
330
330
|
sumstats[status] = sumstats[status].astype("string")
|
|
331
331
|
available_to_check =sum( (~sumstats[pos].isna()) & (~sumstats[nea].isna()) & (~sumstats[ea].isna()))
|
|
@@ -337,25 +337,25 @@ def checkref(sumstats,ref_path,chrom="CHR",pos="POS",ea="EA",nea="NEA",status="S
|
|
|
337
337
|
#status_7=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[7]\w", case=False, flags=0, na=False))
|
|
338
338
|
status_8=sum(sumstats["STATUS"].str.match("\w\w\w\w\w[8]\w", case=False, flags=0, na=False))
|
|
339
339
|
|
|
340
|
-
|
|
341
|
-
|
|
340
|
+
log.write(" -Variants allele on given reference sequence : ",status_0,verbose=verbose)
|
|
341
|
+
log.write(" -Variants flipped : ",status_3,verbose=verbose)
|
|
342
342
|
raw_matching_rate = (status_3+status_0)/available_to_check
|
|
343
343
|
flip_rate = status_3/available_to_check
|
|
344
|
-
|
|
344
|
+
log.write(" -Raw Matching rate : ","{:.2f}%".format(raw_matching_rate*100),verbose=verbose)
|
|
345
345
|
if raw_matching_rate <0.8:
|
|
346
|
-
|
|
346
|
+
log.warning("Matching rate is low, please check if the right reference genome is used.")
|
|
347
347
|
if flip_rate > 0.85 :
|
|
348
|
-
|
|
348
|
+
log.write(" -Flipping variants rate > 0.85, it is likely that the EA is aligned with REF in the original dataset.",verbose=verbose)
|
|
349
349
|
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
#
|
|
354
|
-
|
|
350
|
+
log.write(" -Variants inferred reverse_complement : ",status_4,verbose=verbose)
|
|
351
|
+
log.write(" -Variants inferred reverse_complement_flipped : ",status_5,verbose=verbose)
|
|
352
|
+
log.write(" -Both allele on genome + unable to distinguish : ",status_6,verbose=verbose)
|
|
353
|
+
#log.write(" -Reverse_complementary + both allele on genome + unable to distinguish: ",status_7)
|
|
354
|
+
log.write(" -Variants not on given reference sequence : ",status_8,verbose=verbose)
|
|
355
355
|
|
|
356
356
|
if remove is True:
|
|
357
357
|
sumstats = sumstats.loc[~sumstats["STATUS"].str.match("\w\w\w\w\w[8]\w"),:]
|
|
358
|
-
|
|
358
|
+
log.write(" -Variants not on given reference sequence were removed.",verbose=verbose)
|
|
359
359
|
|
|
360
360
|
finished(log, verbose, _end_line)
|
|
361
361
|
return sumstats
|
|
@@ -424,7 +424,7 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
|
|
|
424
424
|
if is_enough_info == False: return sumstats
|
|
425
425
|
############################################################################################
|
|
426
426
|
chr_dict = auto_check_vcf_chr_dict(path, chr_dict, verbose, log)
|
|
427
|
-
|
|
427
|
+
log.write(" -Assigning rsID based on CHR:POS and REF:ALT/ALT:REF...",verbose=verbose)
|
|
428
428
|
##############################################
|
|
429
429
|
if rsid not in sumstats.columns:
|
|
430
430
|
sumstats[rsid]=pd.Series(dtype="string")
|
|
@@ -458,8 +458,8 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
|
|
|
458
458
|
##################################################################################################################
|
|
459
459
|
|
|
460
460
|
after_number = sum(~sumstats[rsid].isna())
|
|
461
|
-
|
|
462
|
-
|
|
461
|
+
log.write(" -rsID Annotation for "+str(total_number - after_number) +" need to be fixed!",verbose=verbose)
|
|
462
|
+
log.write(" -Annotated "+str(after_number - pre_number) +" rsID successfully!",verbose=verbose)
|
|
463
463
|
|
|
464
464
|
##################################################################################################################
|
|
465
465
|
elif ref_mode=="tsv":
|
|
@@ -486,7 +486,7 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
|
|
|
486
486
|
if is_enough_info == False: return sumstats
|
|
487
487
|
############################################################################################
|
|
488
488
|
|
|
489
|
-
standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234]
|
|
489
|
+
standardized_normalized = sumstats["STATUS"].str.match("\w\w\w[0][01234]\w\w", case=False, flags=0, na=False)
|
|
490
490
|
|
|
491
491
|
if rsid not in sumstats.columns:
|
|
492
492
|
sumstats[rsid]=pd.Series(dtype="string")
|
|
@@ -500,15 +500,15 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
|
|
|
500
500
|
|
|
501
501
|
total_number= len(sumstats)
|
|
502
502
|
pre_number = sum(~sumstats[rsid].isna())
|
|
503
|
-
|
|
503
|
+
log.write(" -"+str(sum(to_assign)) +" rsID could be possibly fixed...",verbose=verbose)
|
|
504
504
|
if sum(to_assign)>0:
|
|
505
505
|
sumstats = sumstats.set_index(snpid)
|
|
506
506
|
dic_chuncks = pd.read_csv(path,sep="\t",usecols=[ref_snpid,ref_rsid],
|
|
507
507
|
chunksize=chunksize,index_col=ref_snpid,
|
|
508
508
|
dtype={ref_snpid:"string",ref_rsid:"string"})
|
|
509
509
|
|
|
510
|
-
|
|
511
|
-
|
|
510
|
+
log.write(" -Setting block size: ",chunksize,verbose=verbose)
|
|
511
|
+
log.write(" -Loading block: ",end="",verbose=verbose)
|
|
512
512
|
for i,dic in enumerate(dic_chuncks):
|
|
513
513
|
gc.collect()
|
|
514
514
|
log.write(i," ",end=" ",show_time=False)
|
|
@@ -517,15 +517,15 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
|
|
|
517
517
|
dic = dic.loc[~dic.index.duplicated(keep=False),:]
|
|
518
518
|
sumstats.update(dic,overwrite=True)
|
|
519
519
|
|
|
520
|
-
|
|
520
|
+
log.write("\n",end="",show_time=False,verbose=verbose)
|
|
521
521
|
sumstats = sumstats.reset_index()
|
|
522
522
|
sumstats = sumstats.rename(columns = {'index':snpid})
|
|
523
523
|
|
|
524
524
|
after_number = sum(~sumstats[rsid].isna())
|
|
525
|
-
|
|
526
|
-
|
|
525
|
+
log.write(" -rsID annotation for "+str(total_number - after_number) +" needed to be fixed!",verbose=verbose)
|
|
526
|
+
log.write(" -Annotated "+str(after_number - pre_number) +" rsID successfully!",verbose=verbose)
|
|
527
527
|
else:
|
|
528
|
-
|
|
528
|
+
log.write(" -No rsID can be fixed...skipping...",verbose=verbose)
|
|
529
529
|
################################################################################################################
|
|
530
530
|
|
|
531
531
|
finished(log,verbose,_end_line)
|
|
@@ -652,7 +652,7 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
|
|
|
652
652
|
|
|
653
653
|
##not palindromic : change status
|
|
654
654
|
sumstats.loc[not_palindromic_snp,status] = vchange_status(sumstats.loc[not_palindromic_snp,status], 7 ,"9","0")
|
|
655
|
-
|
|
655
|
+
log.write(" -Identified ", sum(palindromic)," palindromic SNPs...",verbose=verbose)
|
|
656
656
|
|
|
657
657
|
#palindromic but can not infer
|
|
658
658
|
maf_can_infer = (sumstats[eaf] < maf_threshold) | (sumstats[eaf] > 1 - maf_threshold)
|
|
@@ -664,7 +664,7 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
|
|
|
664
664
|
|
|
665
665
|
unknow_palindromic_to_check = palindromic & maf_can_infer & unknow_palindromic
|
|
666
666
|
|
|
667
|
-
|
|
667
|
+
log.write(" -After filtering by MAF< {} , {} palindromic SNPs with unknown strand will be inferred...".format(maf_threshold, sum(unknow_palindromic_to_check)),verbose=verbose)
|
|
668
668
|
|
|
669
669
|
#########################################################################################
|
|
670
670
|
if sum(unknow_palindromic_to_check)>0:
|
|
@@ -677,8 +677,10 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
|
|
|
677
677
|
map_func = partial(check_strand,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
|
|
678
678
|
status_inferred = pd.concat(pool.map(map_func,df_split))
|
|
679
679
|
sumstats.loc[unknow_palindromic_to_check,status] = status_inferred.values
|
|
680
|
-
|
|
681
|
-
|
|
680
|
+
pool.close()
|
|
681
|
+
pool.join()
|
|
682
|
+
else:
|
|
683
|
+
log.warning("No palindromic variants available for checking.")
|
|
682
684
|
#########################################################################################
|
|
683
685
|
#0 Not palindromic SNPs
|
|
684
686
|
#1 Palindromic +strand -> no need to flip
|
|
@@ -697,33 +699,33 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
|
|
|
697
699
|
status7 = sumstats[status].str.match(r'\w\w\w\w\w\w[7]', case=False, flags=0, na=False)
|
|
698
700
|
status8 = sumstats[status].str.match(r'\w\w\w\w\w[123][8]', case=False, flags=0, na=False)
|
|
699
701
|
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
702
|
+
log.write(" -Non-palindromic : ",sum(status0),verbose=verbose)
|
|
703
|
+
log.write(" -Palindromic SNPs on + strand: ",sum(status1),verbose=verbose)
|
|
704
|
+
log.write(" -Palindromic SNPs on - strand and needed to be flipped:",sum(status5),verbose=verbose)
|
|
705
|
+
log.write(" -Palindromic SNPs with MAF not available to infer : ",sum(status7),verbose=verbose)
|
|
706
|
+
log.write(" -Palindromic SNPs with no macthes or no information : ",sum(status8),verbose=verbose)
|
|
705
707
|
|
|
706
708
|
if ("7" in remove_snp) and ("8" in remove_snp) :
|
|
707
|
-
|
|
709
|
+
log.write(" -Palindromic SNPs with MAF not available to infer and with no macthes or no information will will be removed",verbose=verbose)
|
|
708
710
|
sumstats = sumstats.loc[~(status7 | status8),:].copy()
|
|
709
711
|
elif "8" in remove_snp:
|
|
710
|
-
|
|
712
|
+
log.write(" -Palindromic SNPs with no macthes or no information will be removed",verbose=verbose)
|
|
711
713
|
sumstats = sumstats.loc[~status8,:].copy()
|
|
712
714
|
elif "7" in remove_snp:
|
|
713
|
-
|
|
715
|
+
log.write(" -Palindromic SNPs with MAF not available to infer will be removed",verbose=verbose)
|
|
714
716
|
sumstats = sumstats.loc[~status7,:].copy()
|
|
715
717
|
|
|
716
718
|
### unknow_indel
|
|
717
719
|
if "i" in mode:
|
|
718
720
|
unknow_indel = sumstats[status].str.match(r'\w\w\w\w\w[6][89]', case=False, flags=0, na=False)
|
|
719
|
-
|
|
721
|
+
log.write(" -Identified ", sum(unknow_indel)," indistinguishable Indels...",verbose=verbose)
|
|
720
722
|
if sum(unknow_indel)>0:
|
|
721
|
-
|
|
723
|
+
log.write(" -Indistinguishable indels will be inferred from reference vcf REF and ALT...",verbose=verbose)
|
|
722
724
|
#########################################################################################
|
|
723
725
|
#with maf can not infer
|
|
724
726
|
#maf_can_infer = (sumstats[eaf] < maf_threshold) | (sumstats[eaf] > 1 - maf_threshold)
|
|
725
727
|
#sumstats.loc[unknow_indel&(~maf_can_infer),status] = vchange_status(sumstats.loc[unknow_indel&(~maf_can_infer),status],7,"9","8")
|
|
726
|
-
|
|
728
|
+
log.write(" -Difference in allele frequency (DAF) tolerance: {}".format(daf_tolerance),verbose=verbose)
|
|
727
729
|
|
|
728
730
|
if sum(unknow_indel)>0:
|
|
729
731
|
if sum(unknow_indel)<10000:
|
|
@@ -734,20 +736,23 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
|
|
|
734
736
|
map_func = partial(check_indel,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,status=status,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict,daf_tolerance=daf_tolerance)
|
|
735
737
|
status_inferred = pd.concat(pool.map(map_func,df_split))
|
|
736
738
|
sumstats.loc[unknow_indel,status] = status_inferred.values
|
|
737
|
-
|
|
738
|
-
|
|
739
|
+
pool.close()
|
|
740
|
+
pool.join()
|
|
741
|
+
|
|
739
742
|
#########################################################################################
|
|
740
743
|
|
|
741
744
|
status3 = sumstats[status].str.match(r'\w\w\w\w\w\w[3]', case=False, flags=0, na=False)
|
|
742
745
|
status6 = sumstats[status].str.match(r'\w\w\w\w\w\w[6]', case=False, flags=0, na=False)
|
|
743
746
|
status8 = sumstats[status].str.match(r'\w\w\w\w\w[6][8]', case=False, flags=0, na=False)
|
|
744
747
|
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
+
log.write(" -Indels ea/nea match reference : ",sum(status3),verbose=verbose)
|
|
749
|
+
log.write(" -Indels ea/nea need to be flipped : ",sum(status6),verbose=verbose)
|
|
750
|
+
log.write(" -Indels with no macthes or no information : ",sum(status8),verbose=verbose)
|
|
748
751
|
if "8" in remove_indel:
|
|
749
|
-
|
|
750
|
-
sumstats = sumstats.loc[~status8,:].copy()
|
|
752
|
+
log.write(" -Indels with no macthes or no information will be removed",verbose=verbose)
|
|
753
|
+
sumstats = sumstats.loc[~status8,:].copy()
|
|
754
|
+
else:
|
|
755
|
+
log.warning("No indistinguishable indels available for checking.")
|
|
751
756
|
|
|
752
757
|
finished(log,verbose,_end_line)
|
|
753
758
|
return sumstats
|
|
@@ -804,7 +809,7 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,colu
|
|
|
804
809
|
log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
|
|
805
810
|
if not force:
|
|
806
811
|
good_chrpos = sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
|
|
807
|
-
|
|
812
|
+
log.write(" -Checking variants:", sum(good_chrpos),verbose=verbose)
|
|
808
813
|
sumstats[column_name]=np.nan
|
|
809
814
|
|
|
810
815
|
########################
|
|
@@ -823,13 +828,13 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,colu
|
|
|
823
828
|
|
|
824
829
|
#sumstats.loc[good_chrpos,"DAF"] = status_inferred.values
|
|
825
830
|
#sumstats["DAF"]=sumstats["DAF"].astype("float")
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
831
|
+
log.write(" - {} max:".format(column_name), np.nanmax(sumstats[column_name]),verbose=verbose)
|
|
832
|
+
log.write(" - {} min:".format(column_name), np.nanmin(sumstats[column_name]),verbose=verbose)
|
|
833
|
+
log.write(" - {} sd:".format(column_name), np.nanstd(sumstats[column_name]),verbose=verbose)
|
|
834
|
+
log.write(" - abs({}) min:".format(column_name), np.nanmin(np.abs(sumstats[column_name])),verbose=verbose)
|
|
835
|
+
log.write(" - abs({}) max:".format(column_name), np.nanmax(np.abs(sumstats[column_name])),verbose=verbose)
|
|
836
|
+
log.write(" - abs({}) sd:".format(column_name), np.nanstd(np.abs(sumstats[column_name])),verbose=verbose)
|
|
837
|
+
log.write("Finished allele frequency checking!")
|
|
833
838
|
return sumstats
|
|
834
839
|
|
|
835
840
|
def checkaf(sumstats,ref_infer,ref_alt_freq=None,column_name="DAF",chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=None):
|
|
@@ -886,7 +891,7 @@ def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",p
|
|
|
886
891
|
log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
|
|
887
892
|
if not force:
|
|
888
893
|
good_chrpos = sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
|
|
889
|
-
|
|
894
|
+
log.write(" -Checking variants:", sum(good_chrpos),verbose=verbose)
|
|
890
895
|
|
|
891
896
|
########################
|
|
892
897
|
if sum(sumstats[eaf].isna())<10000:
|
|
@@ -901,8 +906,8 @@ def paralleleinferaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",p
|
|
|
901
906
|
###########################
|
|
902
907
|
|
|
903
908
|
afternumber = sum(sumstats[eaf].isna())
|
|
904
|
-
|
|
905
|
-
|
|
909
|
+
log.write(" -Inferred EAF for {} variants.".format(prenumber - afternumber),verbose=verbose)
|
|
910
|
+
log.write(" -EAF is still missing for {} variants.".format(afternumber),verbose=verbose)
|
|
906
911
|
|
|
907
912
|
finished(log,verbose,_end_line)
|
|
908
913
|
return sumstats
|
|
@@ -936,13 +941,13 @@ def infer_af(chr,start,end,ref,alt,vcf_reader,alt_freq,chr_dict=None):
|
|
|
936
941
|
def auto_check_vcf_chr_dict(vcf_path, vcf_chr_dict, verbose, log):
|
|
937
942
|
if vcf_path is not None:
|
|
938
943
|
if vcf_chr_dict is None:
|
|
939
|
-
|
|
944
|
+
log.write(" -Checking prefix for chromosomes in vcf files..." ,verbose=verbose)
|
|
940
945
|
prefix = check_vcf_chr_prefix(vcf_path)
|
|
941
946
|
if prefix is not None:
|
|
942
|
-
|
|
947
|
+
log.write(" -Prefix for chromosomes: ",prefix)
|
|
943
948
|
vcf_chr_dict = get_number_to_chr(prefix=prefix)
|
|
944
949
|
else:
|
|
945
|
-
|
|
950
|
+
log.write(" -No prefix for chromosomes in the VCF files." ,verbose=verbose)
|
|
946
951
|
vcf_chr_dict = get_number_to_chr()
|
|
947
952
|
return vcf_chr_dict
|
|
948
953
|
|
gwaslab/io_preformat_input.py
CHANGED
|
@@ -418,17 +418,17 @@ def print_format_info(fmt,meta_data, rename_dictionary, verbose, log,output=Fals
|
|
|
418
418
|
if type(value) is str:
|
|
419
419
|
if "\n" in value:
|
|
420
420
|
value_first_line=value.split("\n")[0]
|
|
421
|
-
|
|
421
|
+
log.write(" -",key," : "+value_first_line.strip()+"...",verbose=verbose)
|
|
422
422
|
elif value==" ":
|
|
423
|
-
|
|
423
|
+
log.write(' -',key,' : \\s ',verbose=verbose)
|
|
424
424
|
elif value=="\t":
|
|
425
|
-
|
|
425
|
+
log.write(' -',key,' : \\t',verbose=verbose)
|
|
426
426
|
else:
|
|
427
|
-
|
|
427
|
+
log.write(" -",key," : "+value.strip(),verbose=verbose)
|
|
428
428
|
elif type(value) is list:
|
|
429
|
-
|
|
429
|
+
log.write(" -",key," : "+','.join(value),verbose=verbose)
|
|
430
430
|
else:
|
|
431
|
-
|
|
431
|
+
log.write(" -",key," : ",value,verbose=verbose)
|
|
432
432
|
keys=[]
|
|
433
433
|
values=[]
|
|
434
434
|
for key,value in rename_dictionary.items():
|
|
@@ -437,21 +437,21 @@ def print_format_info(fmt,meta_data, rename_dictionary, verbose, log,output=Fals
|
|
|
437
437
|
if fmt!="gwaslab":
|
|
438
438
|
if output == False:
|
|
439
439
|
if fmt!="auto":
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
440
|
+
log.write(" -"+fmt+" to gwaslab format dictionary:",verbose=verbose)
|
|
441
|
+
log.write(" - "+fmt+" keys:",",".join(keys),verbose=verbose)
|
|
442
|
+
log.write(" - gwaslab values:",",".join(values),verbose=verbose)
|
|
443
443
|
else:
|
|
444
|
-
|
|
445
|
-
|
|
444
|
+
log.write(" - Auto-detection mode. Note: auto-detection assumes A1=EA; Alt=EA and Frq=EAF...",verbose=verbose)
|
|
445
|
+
log.write(" - Header conversion source: https://github.com/Cloufield/formatbook/blob/main/formats/auto.json",verbose=verbose)
|
|
446
446
|
else:
|
|
447
|
-
|
|
447
|
+
log.write(" -gwaslab to "+fmt+" format dictionary:",verbose=verbose)
|
|
448
448
|
keys=[]
|
|
449
449
|
values=[]
|
|
450
450
|
for key,value in rename_dictionary.items():
|
|
451
451
|
keys.append(key)
|
|
452
452
|
values.append(value)
|
|
453
|
-
|
|
454
|
-
|
|
453
|
+
log.write(" - gwaslab keys:", ','.join(keys),verbose=verbose)
|
|
454
|
+
log.write(" - "+fmt+" values:" , ','.join(values),verbose=verbose)
|
|
455
455
|
|
|
456
456
|
def process_neaf(sumstats,log,verbose):
|
|
457
457
|
log.write(" -NEAF is specified...",verbose=verbose)
|
gwaslab/io_read_ldsc.py
CHANGED
|
@@ -195,4 +195,52 @@ def read_greml(filelist=[]):
|
|
|
195
195
|
continue
|
|
196
196
|
row = pd.DataFrame([row], columns = summary.columns)
|
|
197
197
|
summary = pd.concat([summary, row], ignore_index=True)
|
|
198
|
-
return summary
|
|
198
|
+
return summary
|
|
199
|
+
|
|
200
|
+
def parse_ldsc_summary(ldsc_summary):
    """Parse a multi-line heritability summary string into a one-row DataFrame.

    The text is split on newlines and the first five lines are read by
    position:

    * line 0 -> ``h2_obs`` and ``h2_se``
    * line 1 -> ``Lambda_gc``
    * line 2 -> ``Mean_chi2``
    * line 3 -> ``Intercept`` and ``Intercept_se``
    * line 4 -> ``Ratio`` and ``Ratio_se`` (``"NA"`` if the line contains
      ``NA``; ``"Ratio < 0"`` with ``"NA"`` as its SE if it contains ``<``)

    Values are kept as the matched strings; no numeric conversion is done.
    If any expected line or token is missing, every field is set to ``"NA"``.

    Parameters
    ----------
    ldsc_summary : str
        Summary text, one statistic per line in the order listed above.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``h2_obs``, ``h2_se``, ``Lambda_gc``,
        ``Mean_chi2``, ``Intercept``, ``Intercept_se``, ``Ratio``, ``Ratio_se``.
    """
    columns = ['h2_obs', 'h2_se','Lambda_gc','Mean_chi2','Intercept','Intercept_se',"Ratio","Ratio_se"]

    # Tokens are either a "label:" prefix, a numeric literal, or a bare NA.
    # Compiled once as raw strings; the patterns are unchanged.
    token_pattern = re.compile(r'[a-zA-Z\s\d]+:|[-0-9.]+[e]?[-0-9.]+|NA')
    # On the ratio line NA is handled by its own branch, so it is not a token.
    ratio_pattern = re.compile(r'[a-zA-Z\s\d]+:|[-0-9.]+[e]?[-0-9.]+')

    lines = ldsc_summary.split("\n")
    row = {}
    try:
        # Token 0 is the label; the values follow it.
        objects = token_pattern.findall(lines[0])
        row["h2_obs"] = objects[1]
        row["h2_se"] = objects[2]

        objects = token_pattern.findall(lines[1])
        row["Lambda_gc"] = objects[1]

        objects = token_pattern.findall(lines[2])
        row["Mean_chi2"] = objects[1]

        objects = token_pattern.findall(lines[3])
        row["Intercept"] = objects[1]
        row["Intercept_se"] = objects[2]

        ratio_line = lines[4]
        if "NA" in ratio_line:
            row["Ratio"] = "NA"
            row["Ratio_se"] = "NA"
        elif "<" in ratio_line:
            row["Ratio"] = "Ratio < 0"
            row["Ratio_se"] = "NA"
        else:
            objects = ratio_pattern.findall(ratio_line)
            row["Ratio"] = objects[1]
            row["Ratio_se"] = objects[2]
    except IndexError:
        # A missing line or token means the text is not in the expected
        # layout; report every field as NA rather than a partial row.
        row = dict.fromkeys(columns, "NA")

    return pd.DataFrame([row], columns=columns)
|