gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/bd_common_data.py +6 -3
- gwaslab/bd_download.py +9 -9
- gwaslab/bd_get_hapmap3.py +43 -9
- gwaslab/data/formatbook.json +722 -721
- gwaslab/g_Log.py +22 -5
- gwaslab/g_Sumstats.py +110 -163
- gwaslab/g_SumstatsPair.py +76 -25
- gwaslab/g_SumstatsT.py +2 -2
- gwaslab/g_Sumstats_summary.py +3 -3
- gwaslab/g_version.py +10 -10
- gwaslab/hm_casting.py +36 -17
- gwaslab/hm_harmonize_sumstats.py +354 -221
- gwaslab/hm_rsid_to_chrpos.py +1 -1
- gwaslab/io_preformat_input.py +49 -43
- gwaslab/io_read_ldsc.py +49 -1
- gwaslab/io_to_formats.py +428 -295
- gwaslab/ldsc_irwls.py +198 -0
- gwaslab/ldsc_jackknife.py +514 -0
- gwaslab/ldsc_ldscore.py +417 -0
- gwaslab/ldsc_parse.py +294 -0
- gwaslab/ldsc_regressions.py +747 -0
- gwaslab/ldsc_sumstats.py +629 -0
- gwaslab/qc_check_datatype.py +3 -3
- gwaslab/qc_fix_sumstats.py +891 -778
- gwaslab/util_ex_calculate_ldmatrix.py +31 -13
- gwaslab/util_ex_gwascatalog.py +25 -25
- gwaslab/util_ex_ldproxyfinder.py +10 -10
- gwaslab/util_ex_ldsc.py +189 -0
- gwaslab/util_ex_process_ref.py +3 -3
- gwaslab/util_ex_run_coloc.py +26 -4
- gwaslab/util_in_calculate_gc.py +6 -6
- gwaslab/util_in_calculate_power.py +42 -43
- gwaslab/util_in_convert_h2.py +8 -8
- gwaslab/util_in_fill_data.py +30 -30
- gwaslab/util_in_filter_value.py +201 -74
- gwaslab/util_in_get_density.py +10 -10
- gwaslab/util_in_get_sig.py +445 -71
- gwaslab/viz_aux_annotate_plot.py +12 -12
- gwaslab/viz_aux_quickfix.py +42 -37
- gwaslab/viz_aux_reposition_text.py +10 -7
- gwaslab/viz_aux_save_figure.py +18 -8
- gwaslab/viz_plot_compare_af.py +32 -33
- gwaslab/viz_plot_compare_effect.py +63 -71
- gwaslab/viz_plot_miamiplot2.py +34 -26
- gwaslab/viz_plot_mqqplot.py +126 -75
- gwaslab/viz_plot_qqplot.py +11 -8
- gwaslab/viz_plot_regionalplot.py +36 -33
- gwaslab/viz_plot_rg_heatmap.py +28 -26
- gwaslab/viz_plot_stackedregional.py +40 -21
- gwaslab/viz_plot_trumpetplot.py +65 -61
- gwaslab-3.4.39.dist-info/LICENSE +674 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
- gwaslab-3.4.39.dist-info/RECORD +80 -0
- gwaslab-3.4.37.dist-info/RECORD +0 -72
- /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
gwaslab/g_Sumstats_summary.py
CHANGED
|
@@ -15,7 +15,7 @@ def summarize(insumstats,
|
|
|
15
15
|
for i in [snpid,rsid,eaf,p,n,status]:
|
|
16
16
|
if i in insumstats.columns:
|
|
17
17
|
cols.append(i)
|
|
18
|
-
sumstats= insumstats
|
|
18
|
+
sumstats= insumstats[cols].copy()
|
|
19
19
|
###############################################################################
|
|
20
20
|
numeric_cols=[]
|
|
21
21
|
output = {}
|
|
@@ -68,7 +68,7 @@ def summarize(insumstats,
|
|
|
68
68
|
sumstats.drop(columns='uniq_index',inplace=True)
|
|
69
69
|
status_dic = {}
|
|
70
70
|
for index,row in status_summary.iterrows():
|
|
71
|
-
status_dic[str(index)]=row[0]
|
|
71
|
+
status_dic[str(index)]=row.iloc[0]
|
|
72
72
|
output["STATUS"]=status_dic
|
|
73
73
|
numeric_cols.append("STATUS")
|
|
74
74
|
df = pd.DataFrame.from_dict({(i,j): output[i][j]
|
|
@@ -84,7 +84,7 @@ def summarize(insumstats,
|
|
|
84
84
|
return df
|
|
85
85
|
|
|
86
86
|
def sum_status(id_to_use, sumstats):
|
|
87
|
-
results = sumstats.groupby("STATUS").count()
|
|
87
|
+
results = sumstats.groupby("STATUS",observed=True).count()
|
|
88
88
|
results = results.loc[results[id_to_use]>0,:].sort_values(id_to_use,ascending=False)
|
|
89
89
|
return results
|
|
90
90
|
|
gwaslab/g_version.py
CHANGED
|
@@ -3,10 +3,10 @@ import subprocess
|
|
|
3
3
|
import os
|
|
4
4
|
import numpy as np
|
|
5
5
|
|
|
6
|
-
def _show_version(log=Log()):
|
|
6
|
+
def _show_version(log=Log(), verbose=True):
|
|
7
7
|
# show version when loading sumstats
|
|
8
|
-
log.write("GWASLab v{} https://cloufield.github.io/gwaslab/".format(gwaslab_info()["version"]))
|
|
9
|
-
log.write("(C) 2022-2024, Yunye He, Kamatani Lab, MIT License, gwaslab@gmail.com")
|
|
8
|
+
log.write("GWASLab v{} https://cloufield.github.io/gwaslab/".format(gwaslab_info()["version"]),verbose=verbose)
|
|
9
|
+
log.write("(C) 2022-2024, Yunye He, Kamatani Lab, MIT License, gwaslab@gmail.com",verbose=verbose)
|
|
10
10
|
|
|
11
11
|
def _get_version():
|
|
12
12
|
# return short version string like v3.4.33
|
|
@@ -15,12 +15,12 @@ def _get_version():
|
|
|
15
15
|
def gwaslab_info():
|
|
16
16
|
# version meta information
|
|
17
17
|
dic={
|
|
18
|
-
"version":"3.4.37",
|
|
19
|
-
"release_date":"
|
|
18
|
+
"version":"3.4.39",
|
|
19
|
+
"release_date":"20240210"
|
|
20
20
|
}
|
|
21
21
|
return dic
|
|
22
22
|
|
|
23
|
-
def _checking_plink_version(v=2,log=Log()):
|
|
23
|
+
def _checking_plink_version(v=2,log=Log(), verbose=True):
|
|
24
24
|
if v==1:
|
|
25
25
|
which_plink_script = "plink --version"
|
|
26
26
|
elif v==2:
|
|
@@ -29,19 +29,19 @@ def _checking_plink_version(v=2,log=Log()):
|
|
|
29
29
|
log.write(" -PLINK version: {}".format(output.strip()))
|
|
30
30
|
return log
|
|
31
31
|
|
|
32
|
-
def _checking_r_version(r, log):
|
|
32
|
+
def _checking_r_version(r, log=Log(), verbose=True):
|
|
33
33
|
which_r_script = "{} --version".format(r)
|
|
34
34
|
output = subprocess.check_output(which_r_script, stderr=subprocess.STDOUT, shell=True,text=True)
|
|
35
|
-
log.write(" -R version: {}".format(output.strip()))
|
|
35
|
+
log.write(" -R version: {}".format(output.strip()),verbose=verbose)
|
|
36
36
|
return log
|
|
37
37
|
|
|
38
|
-
def _check_susie_version(r,log):
|
|
38
|
+
def _check_susie_version(r,log=Log(), verbose=True):
|
|
39
39
|
rscript = 'print(packageVersion("susieR"))'
|
|
40
40
|
temp_r = "_gwaslab_susie_temp_check_version_{}.R".format(np.random.randint(1, 99999999))
|
|
41
41
|
with open(temp_r,"w") as file:
|
|
42
42
|
file.write(rscript)
|
|
43
43
|
which_susie_script = "{} {}".format(r, temp_r)
|
|
44
44
|
output = subprocess.check_output(which_susie_script, stderr=subprocess.STDOUT, shell=True,text=True)
|
|
45
|
-
log.write(" -SuSieR version: {}".format(output.strip()))
|
|
45
|
+
log.write(" -SuSieR version: {}".format(output.strip()),verbose=verbose)
|
|
46
46
|
os.remove(temp_r)
|
|
47
47
|
return log
|
gwaslab/hm_casting.py
CHANGED
|
@@ -5,18 +5,24 @@ from pandas.api.types import CategoricalDtype
|
|
|
5
5
|
from gwaslab.g_vchange_status import copy_status
|
|
6
6
|
from gwaslab.g_vchange_status import vchange_status
|
|
7
7
|
from gwaslab.qc_fix_sumstats import flipallelestats
|
|
8
|
+
from gwaslab.qc_check_datatype import check_datatype
|
|
9
|
+
from gwaslab.qc_fix_sumstats import start_to
|
|
8
10
|
from gwaslab.util_in_fill_data import filldata
|
|
9
11
|
from Bio import SeqIO
|
|
10
12
|
from itertools import combinations
|
|
11
13
|
|
|
12
|
-
def
|
|
14
|
+
def _merge_mold_with_sumstats_by_chrpos(mold, sumstats, ref_path=None, windowsizeb=10, log=Log(),suffixes=("_MOLD",""),verbose=True,return_not_matched_mold =False):
|
|
15
|
+
|
|
16
|
+
|
|
13
17
|
cols_to_drop = []
|
|
14
18
|
for i in sumstats.columns:
|
|
15
19
|
if i in ["SNPID","rsID"]:
|
|
16
20
|
cols_to_drop.append(i)
|
|
21
|
+
|
|
22
|
+
log.write("Start to merge sumstats...", verbose=verbose)
|
|
17
23
|
|
|
18
24
|
if len(cols_to_drop)>0:
|
|
19
|
-
log.write("Dropping old IDs:{}".format(cols_to_drop))
|
|
25
|
+
log.write(" -Dropping old IDs:{}".format(cols_to_drop), verbose=verbose)
|
|
20
26
|
sumstats = sumstats.drop(columns=cols_to_drop)
|
|
21
27
|
|
|
22
28
|
if ref_path is not None :
|
|
@@ -29,18 +35,20 @@ def _merge_mold_with_sumstats(mold, sumstats, ref_path=None, windowsizeb=10, log
|
|
|
29
35
|
if return_not_matched_mold:
|
|
30
36
|
mold["_IDENTIFIER_FOR_VARIANT"] = range(len(mold))
|
|
31
37
|
|
|
38
|
+
# mold sumffix + mold
|
|
32
39
|
mold_sumstats = pd.merge(mold, sumstats, on=["CHR","POS"], how="inner",suffixes=suffixes)
|
|
33
|
-
log.write("After merging by CHR and POS:{}".format(len(mold_sumstats)))
|
|
40
|
+
log.write(" -After merging by CHR and POS:{}".format(len(mold_sumstats)), verbose=verbose)
|
|
34
41
|
|
|
35
42
|
mold_sumstats = _keep_variants_with_same_allele_set(mold_sumstats,suffixes=suffixes)
|
|
36
|
-
|
|
43
|
+
|
|
44
|
+
log.write(" -Matched variants:{}".format(len(mold_sumstats)), verbose=verbose)
|
|
37
45
|
|
|
38
|
-
if ref_path is not None:
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
46
|
+
#if ref_path is not None:
|
|
47
|
+
# # match removed sumstats
|
|
48
|
+
# mold_removed = mold.loc[~mold[index1].isin(mold_sumstats[index1]),:]
|
|
49
|
+
# iron_removed = sumstats.loc[~sumstats[index2].isin(mold_sumstats[index2]),:]
|
|
50
|
+
# _match_two_sumstats(mold_removed,iron_removed,ref_path,windowsizeb=windowsizeb)
|
|
51
|
+
# mold_sumstats.drop(columns=["_INDEX",""])
|
|
44
52
|
|
|
45
53
|
if return_not_matched_mold == True:
|
|
46
54
|
sumstats1 = mold.loc[~mold["_IDENTIFIER_FOR_VARIANT"].isin(mold_sumstats["_IDENTIFIER_FOR_VARIANT"]),:]
|
|
@@ -59,14 +67,17 @@ def _keep_variants_with_same_allele_set(sumstats, log=Log(),verbose=True,suffixe
|
|
|
59
67
|
|
|
60
68
|
all_alleles = set(list(sumstats[ea1].unique())+list(sumstats[nea1].unique())+list(sumstats[ea2].unique())+list(sumstats[nea2].unique()))
|
|
61
69
|
allele_type = CategoricalDtype(categories=all_alleles, ordered=False)
|
|
62
|
-
sumstats
|
|
70
|
+
sumstats[[nea1,ea1,nea2,ea2]] = sumstats[[nea1,ea1,nea2,ea2]].astype(allele_type)
|
|
63
71
|
|
|
64
72
|
is_perfect_match = (sumstats[ea2] == sumstats[ea1]) & (sumstats[nea2] == sumstats[nea1])
|
|
65
73
|
is_flipped_match = (sumstats[ea2] == sumstats[nea1]) & (sumstats[nea2] == sumstats[ea1])
|
|
66
74
|
is_allele_set_match = is_flipped_match | is_perfect_match
|
|
67
75
|
|
|
68
|
-
|
|
69
|
-
|
|
76
|
+
log.write(" -Matching alleles and keeping only variants with same allele set: ", verbose=verbose)
|
|
77
|
+
log.write(" -Perfect match: {}".format(sum(is_perfect_match)), verbose=verbose)
|
|
78
|
+
log.write(" -Flipped match: {}".format(sum(is_flipped_match)), verbose=verbose)
|
|
79
|
+
log.write(" -Unmatched : {}".format(sum(~is_allele_set_match)), verbose=verbose)
|
|
80
|
+
|
|
70
81
|
return sumstats.loc[is_allele_set_match,:]
|
|
71
82
|
|
|
72
83
|
def _align_with_mold(sumstats, log=Log(),verbose=True, suffixes=("_MOLD","")):
|
|
@@ -77,10 +88,18 @@ def _align_with_mold(sumstats, log=Log(),verbose=True, suffixes=("_MOLD","")):
|
|
|
77
88
|
nea2="NEA"+suffixes[1]
|
|
78
89
|
status1="STATUS"+suffixes[0]
|
|
79
90
|
status2="STATUS"+suffixes[1]
|
|
91
|
+
|
|
80
92
|
is_perfect_match = (sumstats[ea2] == sumstats[ea1]) & (sumstats[nea2] == sumstats[nea1])
|
|
81
93
|
is_flipped_match = (sumstats[ea2] == sumstats[nea1]) & (sumstats[nea2] == sumstats[ea1])
|
|
82
94
|
|
|
95
|
+
log.write(" -Aligning alleles with reference: ", verbose=verbose)
|
|
96
|
+
log.write(" -Perfect match: {}".format(sum(is_perfect_match)), verbose=verbose)
|
|
97
|
+
log.write(" -Flipped match: {}".format(sum(is_flipped_match)), verbose=verbose)
|
|
98
|
+
|
|
99
|
+
log.write(" -For perfect match: copy STATUS from reference...", verbose=verbose)
|
|
83
100
|
sumstats.loc[is_perfect_match,status2] = copy_status(sumstats.loc[is_perfect_match,status1], sumstats.loc[is_perfect_match,status2],6)
|
|
101
|
+
|
|
102
|
+
log.write(" -For Flipped match: convert STATUS xxxxx[456789]x to xxxxx3x...", verbose=verbose)
|
|
84
103
|
sumstats.loc[is_flipped_match,status2] = vchange_status(sumstats.loc[is_flipped_match,status2],6,"456789","333333")
|
|
85
104
|
|
|
86
105
|
return sumstats
|
|
@@ -119,9 +138,9 @@ def _sort_pair_cols(molded_sumstats, verbose=True, log=Log(), order=None, stats_
|
|
|
119
138
|
if i not in order:
|
|
120
139
|
output_columns.append(i)
|
|
121
140
|
|
|
122
|
-
|
|
123
|
-
molded_sumstats = molded_sumstats
|
|
124
|
-
|
|
141
|
+
log.write(" -Reordering columns to :", ",".join(output_columns), verbose=verbose)
|
|
142
|
+
molded_sumstats = molded_sumstats[ output_columns]
|
|
143
|
+
log.write("Finished sorting columns successfully!", verbose=verbose)
|
|
125
144
|
|
|
126
145
|
return molded_sumstats
|
|
127
146
|
|
|
@@ -154,7 +173,7 @@ def _match_two_sumstats(mold,sumstats,ref_path,windowsizeb=25,verbose=True,log=L
|
|
|
154
173
|
record_chr = int(str(record.id).strip("chrCHR").upper())
|
|
155
174
|
|
|
156
175
|
if record_chr in chromlist:
|
|
157
|
-
|
|
176
|
+
log.write(record_chr," ", end="",show_time=False,verbose=verbose)
|
|
158
177
|
chromlist.remove(record_chr)
|
|
159
178
|
else:
|
|
160
179
|
continue
|