gwaslab 3.5.7__py3-none-any.whl → 3.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (63)
  1. gwaslab/__init__.py +2 -0
  2. gwaslab/bd_common_data.py +1 -0
  3. gwaslab/bd_get_hapmap3.py +0 -1
  4. gwaslab/data/formatbook.json +78 -0
  5. gwaslab/g_Sumstats.py +98 -24
  6. gwaslab/g_SumstatsMulti.py +287 -0
  7. gwaslab/g_SumstatsPair.py +101 -16
  8. gwaslab/g_Sumstats_polars.py +245 -0
  9. gwaslab/g_headers.py +12 -3
  10. gwaslab/g_meta.py +123 -47
  11. gwaslab/g_meta_update.py +48 -0
  12. gwaslab/g_vchange_status_polars.py +44 -0
  13. gwaslab/g_version.py +2 -2
  14. gwaslab/hm_casting.py +169 -110
  15. gwaslab/hm_casting_polars.py +202 -0
  16. gwaslab/hm_harmonize_sumstats.py +19 -8
  17. gwaslab/io_load_ld.py +529 -0
  18. gwaslab/io_preformat_input.py +11 -0
  19. gwaslab/io_preformat_input_polars.py +632 -0
  20. gwaslab/io_process_args.py +25 -1
  21. gwaslab/io_read_ldsc.py +34 -3
  22. gwaslab/io_read_pipcs.py +62 -6
  23. gwaslab/prscs_gigrnd.py +122 -0
  24. gwaslab/prscs_mcmc_gtb.py +136 -0
  25. gwaslab/prscs_parse_genet.py +98 -0
  26. gwaslab/qc_build.py +53 -0
  27. gwaslab/qc_check_datatype.py +10 -8
  28. gwaslab/qc_check_datatype_polars.py +128 -0
  29. gwaslab/qc_fix_sumstats.py +25 -23
  30. gwaslab/qc_fix_sumstats_polars.py +193 -0
  31. gwaslab/util_ex_calculate_ldmatrix.py +49 -19
  32. gwaslab/util_ex_gwascatalog.py +71 -28
  33. gwaslab/util_ex_ldsc.py +67 -21
  34. gwaslab/util_ex_match_ldmatrix.py +396 -0
  35. gwaslab/util_ex_run_2samplemr.py +0 -2
  36. gwaslab/util_ex_run_ccgwas.py +155 -0
  37. gwaslab/util_ex_run_coloc.py +1 -1
  38. gwaslab/util_ex_run_hyprcoloc.py +117 -0
  39. gwaslab/util_ex_run_mesusie.py +155 -0
  40. gwaslab/util_ex_run_mtag.py +92 -0
  41. gwaslab/util_ex_run_prscs.py +85 -0
  42. gwaslab/util_ex_run_susie.py +40 -9
  43. gwaslab/util_in_estimate_ess.py +18 -0
  44. gwaslab/util_in_fill_data.py +20 -1
  45. gwaslab/util_in_filter_value.py +10 -5
  46. gwaslab/util_in_get_sig.py +71 -13
  47. gwaslab/util_in_meta.py +168 -4
  48. gwaslab/util_in_meta_polars.py +174 -0
  49. gwaslab/viz_plot_compare_effect.py +87 -23
  50. gwaslab/viz_plot_credible_sets.py +55 -11
  51. gwaslab/viz_plot_effect.py +22 -12
  52. gwaslab/viz_plot_miamiplot2.py +3 -2
  53. gwaslab/viz_plot_mqqplot.py +84 -81
  54. gwaslab/viz_plot_qqplot.py +6 -6
  55. gwaslab/viz_plot_regional2.py +2 -1
  56. gwaslab/viz_plot_stackedregional.py +4 -1
  57. {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info}/METADATA +8 -6
  58. gwaslab-3.5.8.dist-info/RECORD +117 -0
  59. {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info}/WHEEL +1 -1
  60. gwaslab-3.5.7.dist-info/RECORD +0 -96
  61. {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE +0 -0
  62. {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE_before_v3.4.39 +0 -0
  63. {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,128 @@
1
+ import gc
2
+ import pandas as pd
3
+ import polars as pl
4
+ import numpy as np
5
+ from gwaslab.g_Log import Log
6
+ # pandas.api.types.is_int64_dtype
7
+ # pandas.api.types.is_categorical_dtype
8
+
9
+ dtype_dict ={
10
+ "SNPID":[pl.String()],
11
+ "rsID": [pl.String()],
12
+ "CHR": [pl.Int64()],
13
+ "POS": [pl.Int64()],
14
+ "EA": [pl.String()],
15
+ "NEA":[pl.String()],
16
+ "REF":[pl.String()],
17
+ "ALT":[pl.String()],
18
+ "BETA":[pl.Float64()],
19
+ "BETA_95L":[pl.Float64()],
20
+ "BETA_95U":[pl.Float64()],
21
+ "SE":[pl.Float64()],
22
+ "N":[pl.Int64()],
23
+ "N_CASE":[pl.Int64()],
24
+ "N_CONTROL":[pl.Int64()],
25
+ "OR":[pl.Float64()],
26
+ "OR_95L":[pl.Float64()],
27
+ "OR_95U":[pl.Float64()],
28
+ "HR":[pl.Float64()],
29
+ "HR_95L":[pl.Float64()],
30
+ "HR_95U":[pl.Float64()],
31
+ "P":[pl.Float64()],
32
+ "MLOG10P":[pl.Float64()],
33
+ "Z":[pl.Float64()],
34
+ "F":[pl.Float64()],
35
+ "T":[pl.Float64()],
36
+ "TEST":[pl.String()],
37
+ "CHISQ":[pl.Float64()],
38
+ "I2":[pl.Float64()],
39
+ "P_HET":[pl.Float64()],
40
+ "SNPR2":[pl.Float64()],
41
+ "EAF":[pl.Float64()],
42
+ "NEAF":[pl.Float64()],
43
+ "MAF":[pl.Float64()],
44
+ "INFO":[pl.Float64()],
45
+ "DOF":[pl.Int64()],
46
+ "STATUS":[pl.String()],
47
+ "DIRECTION":[pl.String()],
48
+ 'PIP' :[pl.Float64()],
49
+ 'CREDIBLE_SET_INDEX':[pl.Int64()],
50
+ 'N_SNP' :[pl.Int64()],
51
+ 'LOCUS' :[pl.String()],
52
+ 'STUDY' :[pl.String()],
53
+ 'BETA_RANDOM' :[pl.Float64()],
54
+ 'SE_RANDOM' :[pl.Float64()],
55
+ 'Z_RANDOM' :[pl.Float64()],
56
+ 'P_RANDOM' :[pl.Float64()]
57
+ }
58
+
59
def check_datatype(sumstats, verbose=True, log=Log()):
    """Log an aligned three-row summary of a polars sumstats frame's schema.

    For every column, print its name, its polars dtype, and a verification
    flag from verify_datatype() ("T" = expected dtype, "F" = known column
    with an incompatible dtype, "NA" = unrecognized column).  A warning is
    emitted listing all columns flagged "F".

    Parameters
    ----------
    sumstats : pl.DataFrame
        Polars dataframe whose .schema is inspected.
    verbose : bool
        Passed through to log.write()/log.warning().
    log : Log
        GWASLab logger.
    """
    headers = []
    dtypes = []
    verified = []
    raw_verified = []
    for header, dtype in sumstats.schema.items():
        # Pad all three fields to the same width so the log rows line up.
        width = max(len(header), len(str(dtype)))
        verified_str = verify_datatype(header, dtype)
        headers.append(header.ljust(width))
        dtypes.append(str(dtype).ljust(width))
        verified.append(verified_str.ljust(width))
        if verified_str == "F":
            raw_verified.append(header)

    log.write(" -Column :", " ".join(headers), verbose=verbose)
    log.write(" -DType :", " ".join(dtypes), verbose=verbose)
    log.write(" -Verified:", " ".join(verified), verbose=verbose)

    if len(raw_verified) > 0:
        log.warning("Columns with possibly incompatible dtypes: {}".format(",".join(raw_verified)), verbose=verbose)
90
+
91
def verify_datatype(header, dtype):
    """Check a column's polars dtype against the expected dtype table.

    Returns
    -------
    str
        "T"  — dtype is one of the expected dtypes for this column,
        "F"  — column is recognized but the dtype is incompatible,
        "NA" — column name is not a recognized GWASLab header.
    """
    # Guard clause: unknown headers cannot be verified.
    if header not in dtype_dict:
        return "NA"
    return "T" if dtype in dtype_dict[header] else "F"
100
+
101
def quick_convert_datatype(sumstats, log, verbose):
    """Best-effort cast of recognized columns to their expected polars dtype.

    Each recognized column whose current dtype is not in dtype_dict is cast
    to the first (preferred) expected dtype.  Failures are logged and the
    original column is kept — conversion is deliberately non-fatal.

    Returns
    -------
    pl.DataFrame
        The (possibly partially converted) dataframe.
    """
    for col in sumstats.columns:
        if col not in dtype_dict:
            continue
        if sumstats[col].dtype in dtype_dict[col]:
            continue
        datatype = dtype_dict[col][0]
        log.write(" -Trying to convert datatype for {}: {} -> {}...".format(col, str(sumstats[col].dtype), datatype), end="", verbose=verbose)
        try:
            sumstats = sumstats.cast({col: datatype})
            log.write("{}".format(datatype), show_time=False, verbose=verbose)
        except Exception:
            # Best-effort: keep the original column when the cast fails
            # (e.g. non-numeric strings in a column expected to be numeric).
            log.write("Failed...", show_time=False, verbose=verbose)
    return sumstats
114
+
115
def check_dataframe_shape(sumstats, log, verbose):
    """Log the dataframe's shape (rows x columns) and estimated memory usage.

    The estimated_size() call is inside the try block so that any failure
    (missing method, polars error) is caught and reported as a warning
    instead of propagating — previously it ran before the try and could
    raise uncaught.
    """
    try:
        memory_in_mb = sumstats.estimated_size(unit="mb")
        log.write(" -Current Dataframe shape : {} x {} ; Memory usage: {:.2f} MB".format(len(sumstats), len(sumstats.columns), memory_in_mb), verbose=verbose)
    except Exception:
        log.warning("Error: cannot get Dataframe shape...")
121
+
122
def check_dataframe_memory_usage(sumstats, log, verbose):
    """Log the dataframe's estimated memory usage in MB.

    As in check_dataframe_shape, estimated_size() is moved inside the try
    block so a failing call produces the warning instead of an uncaught
    exception.
    """
    try:
        memory_in_mb = sumstats.estimated_size(unit="mb")
        log.write(" -Current Dataframe memory usage: {:.2f} MB".format(memory_in_mb), verbose=verbose)
    except Exception:
        log.warning("Error: cannot get Memory usage...")
128
+
@@ -16,6 +16,8 @@ from gwaslab.bd_common_data import get_number_to_chr
16
16
  from gwaslab.bd_common_data import get_chr_list
17
17
  from gwaslab.qc_check_datatype import check_datatype
18
18
  from gwaslab.qc_check_datatype import check_dataframe_shape
19
+ from gwaslab.qc_build import _process_build
20
+ from gwaslab.qc_build import _set_build
19
21
  from gwaslab.g_version import _get_version
20
22
  from gwaslab.util_in_fill_data import _convert_betase_to_mlog10p
21
23
  from gwaslab.util_in_fill_data import _convert_betase_to_p
@@ -41,29 +43,29 @@ from gwaslab.bd_common_data import get_chain
41
43
 
42
44
  ###############################################################################################################
43
45
  # 20220514
44
- def _process_build(build,log,verbose):
45
- if str(build).lower() in ["hg19","19","37","b37","grch37"]:
46
- log.write(" -Genomic coordinates are based on GRCh37/hg19...", verbose=verbose)
47
- final_build = "19"
48
- elif str(build).lower() in ["hg18","18","36","b36","grch36"]:
49
- log.write(" -Genomic coordinates are based on GRCh36/hg18...", verbose=verbose)
50
- final_build = "18"
51
- elif str(build).lower() in ["hg38","38","b38","grch38"]:
52
- log.write(" -Genomic coordinates are based on GRCh38/hg38...", verbose=verbose)
53
- final_build = "38"
54
- elif str(build).lower() in ["t2t","hs1","chm13","13"]:
55
- log.write(" -Genomic coordinates are based on T2T-CHM13...", verbose=verbose)
56
- final_build = "13"
57
- else:
58
- log.warning("Version of genomic coordinates is unknown...", verbose=verbose)
59
- final_build = "99"
60
- return final_build
61
-
62
- def _set_build(sumstats, build="99", status="STATUS",verbose=True,log=Log()):
63
- build = _process_build(build,log=log,verbose=verbose)
64
- sumstats[status] = vchange_status(sumstats[status], 1, "139",build[0]*3)
65
- sumstats[status] = vchange_status(sumstats[status], 2, "89",build[1]*3)
66
- return sumstats, build
46
+ #def _process_build(build,log,verbose):
47
+ # if str(build).lower() in ["hg19","19","37","b37","grch37"]:
48
+ # log.write(" -Genomic coordinates are based on GRCh37/hg19...", verbose=verbose)
49
+ # final_build = "19"
50
+ # elif str(build).lower() in ["hg18","18","36","b36","grch36"]:
51
+ # log.write(" -Genomic coordinates are based on GRCh36/hg18...", verbose=verbose)
52
+ # final_build = "18"
53
+ # elif str(build).lower() in ["hg38","38","b38","grch38"]:
54
+ # log.write(" -Genomic coordinates are based on GRCh38/hg38...", verbose=verbose)
55
+ # final_build = "38"
56
+ # elif str(build).lower() in ["t2t","hs1","chm13","13"]:
57
+ # log.write(" -Genomic coordinates are based on T2T-CHM13...", verbose=verbose)
58
+ # final_build = "13"
59
+ # else:
60
+ # log.warning("Version of genomic coordinates is unknown...", verbose=verbose)
61
+ # final_build = "99"
62
+ # return final_build
63
+ #
64
+ #def _set_build(sumstats, build="99", status="STATUS",verbose=True,log=Log()):
65
+ # build = _process_build(build,log=log,verbose=verbose)
66
+ # sumstats[status] = vchange_status(sumstats[status], 1, "139",build[0]*3)
67
+ # sumstats[status] = vchange_status(sumstats[status], 2, "89",build[1]*3)
68
+ # return sumstats, build
67
69
 
68
70
  def fixID(sumstats,
69
71
  snpid="SNPID",rsid="rsID",chrom="CHR",pos="POS",nea="NEA",ea="EA",status="STATUS",fixprefix=False,
@@ -0,0 +1,193 @@
1
+ import re
2
+ import gc
3
+ import pandas as pd
4
+ import numpy as np
5
+ from itertools import repeat
6
+ from multiprocessing import Pool
7
+ from liftover import get_lifter
8
+ from liftover import ChainFile
9
+ from functools import partial
10
+ from gwaslab.g_vchange_status_polars import vchange_statusp
11
+ from gwaslab.g_vchange_status import status_match
12
+ from gwaslab.g_vchange_status import change_status
13
+ from gwaslab.g_Log import Log
14
+ from gwaslab.bd_common_data import get_chr_to_number
15
+ from gwaslab.bd_common_data import get_number_to_chr
16
+ from gwaslab.bd_common_data import get_chr_list
17
+ from gwaslab.qc_check_datatype import check_datatype
18
+ from gwaslab.qc_check_datatype import check_dataframe_shape
19
+ from gwaslab.qc_build import _process_build
20
+ from gwaslab.qc_build import _set_build
21
+ from gwaslab.g_version import _get_version
22
+ from gwaslab.util_in_fill_data import _convert_betase_to_mlog10p
23
+ from gwaslab.util_in_fill_data import _convert_betase_to_p
24
+ from gwaslab.util_in_fill_data import _convert_mlog10p_to_p
25
+ from gwaslab.bd_common_data import get_chain
26
+ import polars as pl
27
+ ###############################################################################################################
28
+ # 20220426
29
def get_reverse_complementary_allele(a):
    """Return the reverse complement of allele string *a*.

    Bases are swapped A<->T and C<->G; any other character is left
    unchanged by str.translate.
    """
    complement_table = str.maketrans("ATCG", "TAGC")
    return a[::-1].translate(complement_table)
36
+
37
def flip_direction(string):
    """Return *string* with every '+' and '-' swapped.

    '?' and any other character (e.g. '0' for a missing study) pass
    through unchanged.
    """
    swap = {"+": "-", "-": "+"}
    return "".join(swap.get(char, char) for char in string)
49
+
50
def flip_by_swap(sumstats, matched_index, log, verbose):
    """Swap the EA and NEA columns for rows selected by *matched_index*.

    *matched_index* is a polars boolean expression; rows where it is False
    keep their original alleles.  The dataframe is returned unchanged when
    either allele column is missing.
    """
    # Guard clause: both allele columns must exist to swap them.
    if "NEA" not in sumstats.columns or "EA" not in sumstats.columns:
        return sumstats

    log.write(" -Swapping column: NEA <=> EA...", verbose=verbose)

    new_nea = (
        pl.when(matched_index)
        .then(pl.col("EA"))
        .otherwise(pl.col("NEA"))
        .alias("NEA")
    )
    new_ea = (
        pl.when(matched_index)
        .then(pl.col("NEA"))
        .otherwise(pl.col("EA"))
        .alias("EA")
    )
    # Both expressions are evaluated against the original columns, so the
    # swap is atomic.
    return sumstats.with_columns(new_nea, new_ea)
67
+
68
def flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1):
    """Invert ratio statistics (x -> 1/x) for rows selected by *matched_index*.

    Applies to OR/HR point estimates and their 95% CI bounds when present.
    Rows not matching keep their original values.
    """
    ratio_headers = ("OR", "OR_95L", "OR_95U", "HR", "HR_95L", "HR_95U")
    for header in ratio_headers:
        if header not in sumstats.columns:
            continue
        log.write(" -Flipping column: {header} = 1 / {header}...".format(header=header), verbose=verbose)
        inverted = (
            pl.when(matched_index)
            .then(1 / pl.col(header))
            .otherwise(pl.col(header))
            .alias(header)
        )
        sumstats = sumstats.with_columns(inverted)
    return sumstats
79
+
80
def flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1):
    """Flip the effect-allele frequency (EAF -> 1 - EAF) for matched rows.

    Rows not selected by *matched_index* keep their original EAF.  No-op
    when the EAF column is absent.
    """
    if "EAF" not in sumstats.columns:
        return sumstats

    log.write(" -Flipping column: EAF = 1 - EAF...", verbose=verbose)
    flipped_eaf = (
        pl.when(matched_index)
        .then(1 - pl.col("EAF"))
        .otherwise(pl.col("EAF"))
        .alias("EAF")
    )
    return sumstats.with_columns(flipped_eaf)
91
+
92
def flip_by_sign(sumstats, matched_index, log, verbose, cols=None):
    """Negate signed statistics and flip DIRECTION for matched rows.

    For every signed column present (BETA, its CI bounds, T, Z) the value is
    negated where *matched_index* (a polars boolean expression) is True; other
    rows are left unchanged.  The DIRECTION string column, if present, has its
    '+'/'-' characters swapped via flip_direction().

    NOTE(review): *cols* is accepted but never used — presumably reserved for
    selecting a subset of columns; confirm intent.
    """
    for header in ["BETA","BETA_95L","BETA_95U","T","Z"]:
        if header in sumstats.columns:
            log.write(" -Flipping column: {header} = - {header}...".format(header = header), verbose=verbose)
            # Negate only the matched rows; others keep their value.
            sumstats = sumstats.with_columns(
                pl.when( matched_index )
                .then( - pl.col(header) )
                .otherwise( pl.col(header) )
                .alias(header)
            )

    if "DIRECTION" in sumstats.columns:
        # NOTE(review): map_batches hands flip_direction a whole pl.Series,
        # while flip_direction iterates its argument character-by-character
        # (it is written for a single str). Iterating a Series yields the
        # per-row strings, so multi-character direction strings would be
        # passed through swap logic whole rather than per character — verify
        # this behaves as intended on real data.
        sumstats = sumstats.with_columns(
            pl.when( matched_index )
            .then( pl.col("DIRECTION").map_batches(lambda x: pl.Series(flip_direction(x))) )
            .otherwise( pl.col("DIRECTION") )
            .alias("DIRECTION")
        )
    return sumstats
111
+
112
+ def flipallelestatsp(sumstats,status="STATUS",verbose=True,log=Log()):
113
+ ##start function with col checking#########################################################
114
+
115
+ if_stats_flipped = False
116
+ ###################get reverse complementary####################
117
+ pattern = r"\w\w\w\w\w[45]\w"
118
+ #matched_index = status_match(sumstats[status],6,[4,5]) #
119
+ #matched_index = sumstats[status].str[5].str.match(r"4|5")
120
+
121
+ matched_index = pl.col(status).cast(pl.String).str.contains("^\w\w\w\w\w[45]\w")
122
+
123
+ if len(sumstats.filter(matched_index))>0:
124
+ log.write("Start to convert alleles to reverse complement for SNPs with status xxxxx[45]x...{}".format(_get_version()), verbose=verbose)
125
+ log.write(" -Flipping "+ str(len(sumstats.filter(matched_index))) +" variants...", verbose=verbose)
126
+ if ("NEA" in sumstats.columns) and ("EA" in sumstats.columns) :
127
+ log.write(" -Converting to reverse complement : EA and NEA...", verbose=verbose)
128
+
129
+ sumstats = sumstats.filter(matched_index).with_columns(
130
+ NEA = pl.col("NEA").map_batches(lambda x: pl.Series(get_reverse_complementary_allele(x))),
131
+ EA = pl.col("EA").map_batches(lambda x: pl.Series(get_reverse_complementary_allele(x)))
132
+ )
133
+
134
+ sumstats = vchange_statusp(sumstats, matched_index, status,6, "4","2")
135
+ log.write(" -Changed the status for flipped variants : xxxxx4x -> xxxxx2x", verbose=verbose)
136
+ if_stats_flipped = True
137
+
138
+ ###################flip ref####################
139
+ pattern = r"\w\w\w\w\w[35]\w"
140
+ #matched_index = status_match(sumstats[status],6,[3,5]) #sumstats[status].str.match(pattern)
141
+ matched_index = pl.col(status).cast(pl.String).str.contains("^\w\w\w\w\w[35]\w")
142
+ if len(sumstats.filter(matched_index))>0:
143
+ log.write("Start to flip allele-specific stats for SNPs with status xxxxx[35]x: ALT->EA , REF->NEA ...{}".format(_get_version()), verbose=verbose)
144
+ log.write(" -Flipping "+ str(len(sumstats.filter(matched_index))) +" variants...", verbose=verbose)
145
+
146
+ sumstats = flip_by_swap(sumstats, matched_index, log, verbose)
147
+ sumstats = flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
148
+ sumstats = flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
149
+ sumstats = flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
150
+
151
+ #change status
152
+ log.write(" -Changed the status for flipped variants : xxxxx[35]x -> xxxxx[12]x", verbose=verbose)
153
+ sumstats = vchange_statusp(sumstats, matched_index,status,6, "35","12")
154
+ if_stats_flipped = True
155
+
156
+ ###################flip ref for undistingushable indels####################
157
+ pattern = r"\w\w\w\w[123][67]6"
158
+ #matched_index = status_match(sumstats[status],6,[1,2,3])|status_match(sumstats[status],6,[6,7])|status_match(sumstats[status],7,6) #sumstats[status].str.match(pattern)
159
+ matched_index = pl.col(status).cast(pl.String).str.contains("^\w\w\w\w[123][67]6")
160
+ if len(sumstats.filter(matched_index))>0:
161
+ log.write("Start to flip allele-specific stats for standardized indels with status xxxx[123][67][6]: ALT->EA , REF->NEA...{}".format(_get_version()), verbose=verbose)
162
+ log.write(" -Flipping "+ str(len(sumstats.filter(matched_index))) +" variants...", verbose=verbose)
163
+
164
+ sumstats = flip_by_swap(sumstats, matched_index, log, verbose)
165
+ sumstats = flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
166
+ sumstats = flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
167
+ sumstats = flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
168
+
169
+ #change status
170
+ log.write(" -Changed the status for flipped variants xxxx[123][67]6 -> xxxx[123][67]4", verbose=verbose)
171
+ sumstats = vchange_statusp(sumstats, matched_index,status, 7, "6","4")
172
+ if_stats_flipped = True
173
+ # flip ref
174
+ ###################flip statistics for reverse strand panlindromic variants####################
175
+ pattern = r"\w\w\w\w\w[012]5"
176
+ #matched_index = status_match(sumstats[status],6,[0,1,2]) | status_match(sumstats[status],7,[5])#sumstats[status].str.match(pattern)
177
+ matched_index = pl.col(status).cast(pl.String).str.contains("^\w\w\w\w\w[012]5")
178
+ if len(sumstats.filter(matched_index))>0:
179
+ log.write("Start to flip allele-specific stats for palindromic SNPs with status xxxxx[12]5: (-)strand <=> (+)strand...{}".format(_get_version()), verbose=verbose)
180
+ log.write(" -Flipping "+ str(len(sumstats.filter(matched_index))) +" variants...", verbose=verbose)
181
+
182
+ sumstats = flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
183
+ sumstats = flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
184
+ sumstats = flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
185
+
186
+ #change status
187
+ log.write(" -Changed the status for flipped variants: xxxxx[012]5: -> xxxxx[012]2", verbose=verbose)
188
+ sumstats = vchange_statusp(sumstats, matched_index,status,7, "5","2")
189
+ if_stats_flipped = True
190
+
191
+ if if_stats_flipped != True:
192
+ log.write(" -No statistics have been changed.")
193
+ return sumstats
@@ -16,6 +16,7 @@ def tofinemapping(sumstats,
16
16
  bfile=None,
17
17
  vcf=None,
18
18
  loci=None,
19
+ loci_chrpos=None,
19
20
  out="./",
20
21
  plink="plink",
21
22
  plink2="plink2",
@@ -28,8 +29,10 @@ def tofinemapping(sumstats,
28
29
  overwrite=False,
29
30
  log=Log(),
30
31
  suffixes=None,
32
+ extra_plink_option="",
31
33
  verbose=True,
32
34
  **kwargs):
35
+
33
36
  ##start function with col checking##########################################################
34
37
  _start_line = "calculate LD matrix"
35
38
  _end_line = "calculating LD matrix"
@@ -52,11 +55,21 @@ def tofinemapping(sumstats,
52
55
  if getlead_args is None:
53
56
  getlead_args={"windowsizekb":1000}
54
57
 
55
- if loci is None:
56
- log.write(" -Loci were not provided. All significant loci will be automatically extracted...",verbose=verbose)
57
- sig_df = getsig(sumstats,id="SNPID",chrom="CHR",pos="POS",p="P"+suffixes[0],**getlead_args)
58
+ if loci_chrpos is None:
59
+ if loci is None:
60
+ log.write(" -Loci were not provided. All significant loci will be automatically extracted...",verbose=verbose)
61
+ sig_df = getsig(sumstats,id="SNPID",chrom="CHR",pos="POS",p="P"+suffixes[0],**getlead_args)
62
+ else:
63
+ sig_df = sumstats.loc[sumstats["SNPID"].isin(loci),:]
58
64
  else:
59
- sig_df = sumstats.loc[sumstats["SNPID"].isin(loci),:]
65
+ sig_df = pd.DataFrame()
66
+ for chrpos in loci_chrpos:
67
+ chrpos_row_dict={}
68
+ chrpos_row_dict["SNPID"]="{}:{}".format(chrpos[0], chrpos[1])
69
+ chrpos_row_dict["CHR"] = chrpos[0]
70
+ chrpos_row_dict["POS"] = chrpos[1]
71
+ chrpos_row = pd.Series(chrpos_row_dict).to_frame().T
72
+ sig_df = pd.concat([sig_df, chrpos_row],ignore_index=True)
60
73
 
61
74
  log.write(" -plink1.9 path: {}".format(plink),verbose=verbose)
62
75
  log.write(" -plink2 path: {}".format(plink2),verbose=verbose)
@@ -128,6 +141,8 @@ def tofinemapping(sumstats,
128
141
  filetype=filetype,
129
142
  plink=plink,
130
143
  plink2=plink2,
144
+ extra_plink_option=extra_plink_option,
145
+ ref_allele_path = matched_sumstats_path,
131
146
  verbose=verbose)
132
147
 
133
148
 
@@ -136,7 +151,7 @@ def tofinemapping(sumstats,
136
151
  row_dict["SNPID"]=row["SNPID"]
137
152
  row_dict["SNPID_LIST"] = matched_snp_list_path
138
153
  row_dict["LD_R_MATRIX"] = matched_ld_matrix_path
139
- row_dict["LOCUS_SUMSTATS"] = matched_sumstats_path
154
+ row_dict["LOCUS_SUMSTATS"] = matched_sumstats_path+".gz"
140
155
  file_row = pd.Series(row_dict).to_frame().T
141
156
  output_file_list = pd.concat([output_file_list, file_row],ignore_index=True)
142
157
 
@@ -156,7 +171,7 @@ def tofinemapping(sumstats,
156
171
 
157
172
 
158
173
 
159
- def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, windowsizekb,out,plink_log,log,memory,mode,filetype,plink,plink2,verbose=True):
174
+ def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, windowsizekb,out,plink_log,log,memory,mode,filetype,plink,plink2,ref_allele_path, extra_plink_option="",verbose=True):
160
175
  '''
161
176
  Calculate LD r matrix by calling PLINK; return file name and log
162
177
  '''
@@ -177,18 +192,32 @@ def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, w
177
192
  if filetype=="pfile":
178
193
  raise ValueError("Please use bfile instead of pfile for PLINK1.")
179
194
 
195
+ #log.write(" -Flipping plink file ref allele to match...",verbose=verbose)
196
+ #script_vcf_to_bfile = """
197
+ #{} \
198
+ # --bfile {} \
199
+ # --extract {} \
200
+ # --chr {} \
201
+ # --ref-allele 'force' {} 4 1 \
202
+ # --threads {} {} \
203
+ # --make-bed \
204
+ # --out {}
205
+
206
+ #""".format(plink2, bfile_to_use, snplist_path, row["CHR"],ref_allele_path, n_cores, memory_flag if memory is not None else "", output_prefix+"_gwaslab_tmp")
207
+
208
+ log.write(" -Calculating r matrix...",verbose=verbose)
180
209
  script_vcf_to_bfile = """
181
210
  {} \
182
211
  --bfile {} \
183
- --keep-allele-order \
212
+ --a2-allele {} 4 1 \
184
213
  --extract {} \
185
214
  --chr {} \
186
215
  --{} square gz \
187
216
  --allow-no-sex \
188
217
  --threads {} {}\
189
218
  --write-snplist \
190
- --out {}
191
- """.format(plink, bfile_to_use, snplist_path , row["CHR"], mode, n_cores, memory_flag if memory is not None else "", output_prefix)
219
+ --out {} {}
220
+ """.format(plink, bfile_to_use, ref_allele_path, snplist_path , row["CHR"], mode, n_cores, memory_flag if memory is not None else "", output_prefix, extra_plink_option)
192
221
 
193
222
  try:
194
223
  output = subprocess.check_output(script_vcf_to_bfile, stderr=subprocess.STDOUT, shell=True,text=True)
@@ -236,20 +265,20 @@ def _align_sumstats_with_bim(row, locus_sumstats, ref_bim, log=Log(),suffixes=No
236
265
  log.warning("Lead variant was not available in reference!")
237
266
 
238
267
  # adjust statistics
239
- output_columns=["SNPID","CHR","POS","EA_bim","NEA_bim"]
268
+ output_columns=["SNPID","CHR","POS","EA","NEA"]
240
269
  for suffix in suffixes:
241
270
  if ("BETA"+suffix in locus_sumstats.columns) and ("SE"+suffix in locus_sumstats.columns):
242
- log.write(" -Flipping BETA{} for variants with flipped alleles...".format(suffix))
243
- combined_df.loc[flipped_match,"BETA"+suffix] = - combined_df.loc[flipped_match,"BETA"+suffix]
271
+ #log.write(" -Flipping BETA{} for variants with flipped alleles...".format(suffix))
272
+ #combined_df.loc[flipped_match,"BETA"+suffix] = - combined_df.loc[flipped_match,"BETA"+suffix]
244
273
  output_columns.append("BETA"+suffix)
245
274
  output_columns.append("SE"+suffix)
246
275
  if "Z" in locus_sumstats.columns:
247
- log.write(" -Flipping Z{} for variants with flipped alleles...".format(suffix))
248
- combined_df.loc[flipped_match,"Z"+suffix] = - combined_df.loc[flipped_match,"Z"+suffix]
276
+ #log.write(" -Flipping Z{} for variants with flipped alleles...".format(suffix))
277
+ #combined_df.loc[flipped_match,"Z"+suffix] = - combined_df.loc[flipped_match,"Z"+suffix]
249
278
  output_columns.append("Z"+suffix)
250
279
  if "EAF" in locus_sumstats.columns:
251
- log.write(" -Flipping EAF{} for variants with flipped alleles...".format(suffix))
252
- combined_df.loc[flipped_match,"EAF"+suffix] = 1 - combined_df.loc[flipped_match,"EAF"+suffix]
280
+ #log.write(" -Flipping EAF{} for variants with flipped alleles...".format(suffix))
281
+ #combined_df.loc[flipped_match,"EAF"+suffix] = 1 - combined_df.loc[flipped_match,"EAF"+suffix]
253
282
  output_columns.append("EAF"+suffix)
254
283
  if "N" in locus_sumstats.columns:
255
284
  output_columns.append("N"+suffix)
@@ -266,9 +295,9 @@ def _export_snplist_and_locus_sumstats(matched_sumstats, out, study, row, window
266
295
  log.write(" -Exporting SNP list of {} to: {}...".format(len(matched_sumstats) ,matched_snp_list_path))
267
296
 
268
297
  # create locus-sumstats EA, NEA, (BETA, SE), Z
269
- matched_sumstats_path = "{}/{}_{}_{}.sumstats.gz".format(out.rstrip("/"), study, row["SNPID"] ,windowsizekb)
298
+ matched_sumstats_path = "{}/{}_{}_{}.sumstats".format(out.rstrip("/"), study, row["SNPID"] ,windowsizekb)
270
299
 
271
- to_export_columns=["CHR","POS","EA_bim","NEA_bim"]
300
+ to_export_columns=["CHR","POS","EA","NEA"]
272
301
  for suffix in suffixes:
273
302
  if "Z"+suffix in matched_sumstats.columns :
274
303
  to_export_columns.append("Z"+suffix)
@@ -282,7 +311,8 @@ def _export_snplist_and_locus_sumstats(matched_sumstats, out, study, row, window
282
311
 
283
312
  log.write(" -Exporting locus sumstats to: {}...".format(matched_sumstats_path))
284
313
  log.write(" -Exported columns: {}...".format(["SNPID"]+to_export_columns))
285
- matched_sumstats[ ["SNPID"]+to_export_columns].to_csv(matched_sumstats_path, index=None)
314
+ matched_sumstats[ ["SNPID"]+to_export_columns].to_csv(matched_sumstats_path, sep="\t",index=None)
315
+ matched_sumstats[ ["SNPID"]+to_export_columns].to_csv(matched_sumstats_path+".gz", sep="\t",index=None)
286
316
  return matched_snp_list_path, matched_sumstats_path
287
317
 
288
318
  def _check_snpid_order(snplist_path, matched_sumstats_snpid,log):
@@ -3,43 +3,86 @@ import json
3
3
  import pandas as pd
4
4
  import gwaslab as gl
5
5
  from gwaslab.g_Log import Log
6
+ from datetime import datetime
7
+ import os
6
8
 
7
- def gwascatalog_trait(efo,source="NCBI",sig_level=5e-8,verbose=True,log=Log()):
9
def find_efo_cache(efo, path):
    """Search *path* recursively for a cached file whose name contains *efo*.

    Returns the full path of the first matching file (in os.walk order),
    or False when no cache file is found.
    """
    hit = next(
        (
            os.path.join(root, filename)
            for root, _dirs, filenames in os.walk(path)
            for filename in filenames
            if efo in filename
        ),
        None,
    )
    return hit if hit is not None else False
15
+
16
+ def gwascatalog_trait(efo,
17
+ source="NCBI",
18
+ sig_level=5e-8,
19
+ use_cache=True,
20
+ cache_dir="./",
21
+ verbose=True,
22
+ log=Log()):
8
23
 
9
24
  #https://www.ebi.ac.uk/gwas/rest/docs/api
10
25
 
11
26
  base_url = "https://www.ebi.ac.uk/gwas/rest/api/efoTraits/"+efo
12
27
  log.write("Start to retrieve data from GWASCatalog...", verbose=verbose)
13
- log.write(" -Please make sure your sumstats is based on GRCh38...", verbose=verbose)
14
- log.write(" -Requesting (GET) trait information through the GWASCatalog API...", verbose=verbose)
15
- log.write(" -EFO trait api: "+ base_url, verbose=verbose)
16
- text = requests.get(base_url)
28
+
29
+
30
+ if use_cache==True:
31
+ log.write("searching cache in : {}".format(cache_dir))
32
+ cache = find_efo_cache(efo, cache_dir)
33
+ if cache==False:
34
+ log.write(" -Cache not found for {}... Downloading from GWASCatalog...".format(cache), verbose=verbose)
35
+ else:
36
+ cache = False
37
+
38
+ if cache==False:
39
+ #log.write(" -Please make sure your sumstats is based on GRCh38...", verbose=verbose)
40
+ log.write(" -Requesting (GET) trait information through the GWASCatalog API...", verbose=verbose)
41
+ log.write(" -EFO trait api: "+ base_url, verbose=verbose)
42
+ text = requests.get(base_url)
17
43
 
18
- log.write(" -Status code: {}".format(text.status_code), verbose=verbose)
19
- if text.status_code!=200:
20
- log.write(" -Status code is not 200. Access failed. Please check your internet or the GWAS Catalog sever status.", verbose=verbose)
21
- log.write(" -Message:{}".format(text.text), verbose=verbose)
22
- return 0
44
+ log.write(" -Status code: {}".format(text.status_code), verbose=verbose)
45
+ if text.status_code!=200:
46
+ log.write(" -Status code is not 200. Access failed. Please check your internet or the GWAS Catalog sever status.", verbose=verbose)
47
+ log.write(" -Message:{}".format(text.text), verbose=verbose)
48
+ return 0
23
49
 
24
- api_response = json.loads(text.text)
25
- log.write(" -Trait Name:",api_response["trait"], verbose=verbose)
26
- log.write(" -Trait URL:",api_response["uri"], verbose=verbose)
50
+ api_response = json.loads(text.text)
51
+ log.write(" -Trait Name:",api_response["trait"], verbose=verbose)
52
+ log.write(" -Trait URL:",api_response["uri"], verbose=verbose)
53
+
54
+ base_url = "https://www.ebi.ac.uk/gwas/rest/api/efoTraits/"+efo+"/associations?projection=associationByEfoTrait"
55
+ log.write(" -Requesting (GET) GWAS associations through the GWASCatalog API...", verbose=verbose)
56
+ log.write(" -associationsByTraitSummary API: "+ base_url, verbose=verbose)
57
+ log.write(" -Note: this step might take a while...", verbose=verbose)
58
+
59
+ # get request and check status code of response
60
+ raw_data = requests.get(base_url)
61
+
62
+ # whether to proceed based on status code
63
+ is_proceed = check_request_status_code(raw_data.status_code,verbose=verbose,log=log)
64
+ if is_proceed is False: return False
65
+
66
+
67
+ log.write(" -Loading json ...", verbose=verbose)
68
+ # Transform API response from JSON into Python dictionary
69
+ api_response = json.loads(raw_data.text)
70
+
71
+ now = datetime.now() # current date and time
72
+ datestring = now.strftime("%Y%m%d")
73
+ json_path = cache_dir + "GWASCatalog_{}_associationsByTraitSummary_text_{}.json".format(efo, datestring)
74
+
75
+ try:
76
+ log.write(" -Saving json to: {} ...".format(json_path), verbose=verbose)
77
+ with open(json_path, 'w', encoding='utf-8') as f:
78
+ json.dump(api_response, f, ensure_ascii=False, indent=4)
79
+ except:
80
+ pass
81
+ else:
82
+ log.write(" -Loading cache for {}: {} ...".format(efo, cache), verbose=verbose)
83
+ with open(cache) as f:
84
+ api_response = json.load(f)
27
85
 
28
- base_url = "https://www.ebi.ac.uk/gwas/rest/api/efoTraits/"+efo+"/associations?projection=associationByEfoTrait"
29
- log.write(" -Requesting (GET) GWAS associations through the GWASCatalog API...", verbose=verbose)
30
- log.write(" -associationsByTraitSummary API: "+ base_url, verbose=verbose)
31
- log.write(" -Note: this step might take a while...", verbose=verbose)
32
-
33
- # get request and check status code of response
34
- raw_data = requests.get(base_url)
35
-
36
- # whether to proceed based on status code
37
- is_proceed = check_request_status_code(raw_data.status_code,verbose=verbose,log=log)
38
- if is_proceed is False: return False
39
-
40
- log.write(" -Loading json ...", verbose=verbose)
41
- # Transform API response from JSON into Python dictionary
42
- api_response = json.loads(raw_data.text)
43
86
  log.write(" -Parsing json ...", verbose=verbose)
44
87
  # An
45
88
  records=list()