gsMap 1.71.2__py3-none-any.whl → 1.72.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/GNN/adjacency_matrix.py +25 -27
- gsMap/GNN/model.py +9 -7
- gsMap/GNN/train.py +8 -11
- gsMap/__init__.py +3 -3
- gsMap/__main__.py +3 -2
- gsMap/cauchy_combination_test.py +75 -72
- gsMap/config.py +822 -316
- gsMap/create_slice_mean.py +154 -0
- gsMap/diagnosis.py +179 -101
- gsMap/find_latent_representation.py +28 -26
- gsMap/format_sumstats.py +233 -201
- gsMap/generate_ldscore.py +353 -209
- gsMap/latent_to_gene.py +92 -60
- gsMap/main.py +23 -14
- gsMap/report.py +39 -25
- gsMap/run_all_mode.py +86 -46
- gsMap/setup.py +1 -1
- gsMap/spatial_ldsc_multiple_sumstats.py +154 -80
- gsMap/utils/generate_r2_matrix.py +173 -140
- gsMap/utils/jackknife.py +84 -80
- gsMap/utils/manhattan_plot.py +180 -207
- gsMap/utils/regression_read.py +105 -122
- gsMap/visualize.py +82 -64
- {gsmap-1.71.2.dist-info → gsmap-1.72.3.dist-info}/METADATA +21 -6
- gsmap-1.72.3.dist-info/RECORD +31 -0
- {gsmap-1.71.2.dist-info → gsmap-1.72.3.dist-info}/WHEEL +1 -1
- gsMap/utils/make_annotations.py +0 -518
- gsmap-1.71.2.dist-info/RECORD +0 -31
- {gsmap-1.71.2.dist-info → gsmap-1.72.3.dist-info}/LICENSE +0 -0
- {gsmap-1.71.2.dist-info → gsmap-1.72.3.dist-info}/entry_points.txt +0 -0
gsMap/format_sumstats.py
CHANGED
@@ -1,100 +1,98 @@
|
|
1
|
-
import numpy as np
|
2
1
|
import logging
|
2
|
+
import math
|
3
3
|
import re
|
4
4
|
|
5
|
-
import math
|
6
5
|
import numpy as np
|
7
6
|
import pandas as pd
|
8
7
|
from scipy.stats import chi2
|
9
8
|
|
10
9
|
from gsMap.config import FormatSumstatsConfig
|
11
10
|
|
12
|
-
# Concatenated A1 + A2 allele pairs accepted by filter_alleles(). The
# complementary pairs AT, TA, CG and GC are deliberately absent.
VALID_SNPS = {"AC", "AG", "CA", "CT", "GA", "GT", "TC", "TG"}
logger = logging.getLogger(__name__)

# Known input column headers mapped to the standard column name used by the
# rest of this module. NOTE(review): the lookup itself lives in
# gwas_checkname(); the mixed-case keys below suggest headers are matched
# both as written and upper-cased - confirm against that function.
default_cnames = {
    # RS NUMBER
    "SNP": "SNP",
    "RS": "SNP",
    "RSID": "SNP",
    "RS_NUMBER": "SNP",
    "RS_NUMBERS": "SNP",
    # P-VALUE
    "P": "P",
    "PVALUE": "P",
    "P_VALUE": "P",
    "PVAL": "P",
    "P_VAL": "P",
    "GC_PVALUE": "P",
    "p": "P",
    # EFFECT_ALLELE (A1)
    "A1": "A1",
    "ALLELE1": "A1",
    "ALLELE_1": "A1",
    "EFFECT_ALLELE": "A1",
    "REFERENCE_ALLELE": "A1",
    "INC_ALLELE": "A1",
    "EA": "A1",
    # NON_EFFECT_ALLELE (A2)
    "A2": "A2",
    "ALLELE2": "A2",
    "ALLELE_2": "A2",
    "OTHER_ALLELE": "A2",
    "NON_EFFECT_ALLELE": "A2",
    "DEC_ALLELE": "A2",
    "NEA": "A2",
    # N (total sample size, plus separate case / control counts)
    "N": "N",
    "NCASE": "N_CAS",
    "CASES_N": "N_CAS",
    "N_CASE": "N_CAS",
    "N_CASES": "N_CAS",
    "N_CONTROLS": "N_CON",
    "N_CAS": "N_CAS",
    "N_CON": "N_CON",
    "NCONTROL": "N_CON",
    "CONTROLS_N": "N_CON",
    "N_CONTROL": "N_CON",
    "WEIGHT": "N",
    # SIGNED STATISTICS
    "ZSCORE": "Z",
    "Z-SCORE": "Z",
    "GC_ZSCORE": "Z",
    "Z": "Z",
    "OR": "OR",
    "B": "BETA",
    "BETA": "BETA",
    "LOG_ODDS": "LOG_ODDS",
    "EFFECTS": "BETA",
    "EFFECT": "BETA",
    "b": "BETA",
    "beta": "BETA",
    # SE
    "se": "SE",
    # INFO
    "INFO": "INFO",
    "Info": "INFO",
    # MAF (all allele-frequency headers are stored under FRQ)
    "EAF": "FRQ",
    "FRQ": "FRQ",
    "MAF": "FRQ",
    "FRQ_U": "FRQ",
    "F_U": "FRQ",
    "frq_A1": "FRQ",
    "frq": "FRQ",
    "freq": "FRQ",
}
|
88
86
|
|
89
87
|
|
90
88
|
def get_compression(fh):
|
91
|
-
|
89
|
+
"""
|
92
90
|
Read filename suffixes and figure out whether it is gzipped,bzip2'ed or not compressed
|
93
|
-
|
94
|
-
if fh.endswith(
|
95
|
-
compression =
|
96
|
-
elif fh.endswith(
|
97
|
-
compression =
|
91
|
+
"""
|
92
|
+
if fh.endswith("gz"):
|
93
|
+
compression = "gzip"
|
94
|
+
elif fh.endswith("bz2"):
|
95
|
+
compression = "bz2"
|
98
96
|
else:
|
99
97
|
compression = None
|
100
98
|
|
@@ -102,9 +100,9 @@ def get_compression(fh):
|
|
102
100
|
|
103
101
|
|
104
102
|
def gwas_checkname(gwas, config):
|
105
|
-
|
103
|
+
"""
|
106
104
|
Iterpret column names of gwas
|
107
|
-
|
105
|
+
"""
|
108
106
|
old_name = gwas.columns
|
109
107
|
mapped_cnames = {}
|
110
108
|
for col in gwas.columns:
|
@@ -112,9 +110,22 @@ def gwas_checkname(gwas, config):
|
|
112
110
|
gwas.columns = list(mapped_cnames.values())
|
113
111
|
|
114
112
|
# When column names are provided by users
|
115
|
-
name_updates = {
|
116
|
-
|
117
|
-
|
113
|
+
name_updates = {
|
114
|
+
"SNP": config.snp,
|
115
|
+
"A1": config.a1,
|
116
|
+
"A2": config.a2,
|
117
|
+
"INFO": config.info,
|
118
|
+
"BETA": config.beta,
|
119
|
+
"SE": config.se,
|
120
|
+
"P": config.p,
|
121
|
+
"FRQ": config.frq,
|
122
|
+
"N": config.n,
|
123
|
+
"Z": config.z,
|
124
|
+
"Chr": config.chr,
|
125
|
+
"Pos": config.pos,
|
126
|
+
"OR": config.OR,
|
127
|
+
"SE_OR": config.se_OR,
|
128
|
+
}
|
118
129
|
|
119
130
|
for key, value in name_updates.items():
|
120
131
|
if value is not None and value in gwas.columns:
|
@@ -124,96 +135,102 @@ def gwas_checkname(gwas, config):
|
|
124
135
|
for head in new_name:
|
125
136
|
numc = list(new_name).count(head)
|
126
137
|
if numc > 1:
|
127
|
-
raise ValueError(
|
138
|
+
raise ValueError(
|
139
|
+
f"Found {numc} different {head} columns, please check your {head} column."
|
140
|
+
)
|
128
141
|
|
129
142
|
name_dict = {new_name[i]: old_name[i] for i in range(len(new_name))}
|
130
143
|
|
131
144
|
# When at OR scale
|
132
|
-
if
|
133
|
-
gwas[
|
134
|
-
gwas[
|
145
|
+
if "OR" in new_name and "SE_OR" in new_name:
|
146
|
+
gwas["BETA"] = gwas.OR.apply(lambda x: math.log(x) if x > 0 else None)
|
147
|
+
gwas["SE"] = gwas.SE_OR.apply(lambda x: math.log(x) if x > 0 else None)
|
135
148
|
|
136
149
|
interpreting = {
|
137
|
-
"SNP":
|
138
|
-
"A1":
|
139
|
-
"A2":
|
140
|
-
"BETA":
|
141
|
-
"SE":
|
142
|
-
"OR":
|
143
|
-
"SE_OR":
|
144
|
-
"P":
|
145
|
-
"Z":
|
146
|
-
"N":
|
147
|
-
"INFO":
|
148
|
-
"FRQ":
|
149
|
-
"Chr":
|
150
|
-
|
150
|
+
"SNP": "Variant ID (e.g., rs number).",
|
151
|
+
"A1": "Allele 1, interpreted as the effect allele for signed sumstat.",
|
152
|
+
"A2": "Allele 2, interpreted as the non-effect allele for signed sumstat.",
|
153
|
+
"BETA": "[linear/logistic] regression coefficient (0 → no effect; above 0 → A1 is trait/risk increasing).",
|
154
|
+
"SE": "Standard error of the regression coefficient.",
|
155
|
+
"OR": "Odds ratio, will be transferred to linear scale.",
|
156
|
+
"SE_OR": "Standard error of the odds ratio, will be transferred to linear scale.",
|
157
|
+
"P": "P-Value.",
|
158
|
+
"Z": "Z-Value.",
|
159
|
+
"N": "Sample size.",
|
160
|
+
"INFO": "INFO score (imputation quality; higher → better imputation).",
|
161
|
+
"FRQ": "Allele frequency of A1.",
|
162
|
+
"Chr": "Chromsome.",
|
163
|
+
"Pos": "SNP positions.",
|
151
164
|
}
|
152
165
|
|
153
|
-
logger.info(
|
154
|
-
for key,
|
166
|
+
logger.info("\nIterpreting column names as follows:")
|
167
|
+
for key, _value in interpreting.items():
|
155
168
|
if key in new_name:
|
156
|
-
logger.info(f
|
169
|
+
logger.info(f"{name_dict[key]}: {interpreting[key]}")
|
157
170
|
|
158
171
|
return gwas
|
159
172
|
|
160
173
|
|
161
174
|
def _z_from_p(gwas):
    """Turn the P column into Z-scores, signed by BETA when BETA is available."""
    magnitude = np.sqrt(chi2.isf(gwas.P, 1))
    if "BETA" in gwas.columns:
        return magnitude * np.where(gwas["BETA"] < 0, -1, 1)
    # No effect direction is available: report the unsigned |Z| instead of failing.
    return magnitude


def gwas_checkformat(gwas, config):
    """
    Check the columns required by the requested output format and derive Z.

    Args:
        gwas: DataFrame of summary statistics using the standard column names
            (P, Z, BETA, SE, A1, A2, FRQ, N).
        config: Object exposing ``format``. "gsMap" and "COJO" are handled;
            any other value returns ``gwas`` untouched.

    Returns:
        The same DataFrame. A ``Z`` column is added in place when needed:
          * gsMap: an existing Z is kept; otherwise Z comes from P (signed by
            BETA if present, non-negative if not); otherwise Z = BETA / SE.
          * COJO: Z is always recomputed from P and the sign of BETA.

    Raises:
        ValueError: If the columns required by the format are missing.
    """
    columns = gwas.columns

    if config.format == "gsMap":
        has_p_or_z = "P" in columns or "Z" in columns
        has_beta_and_se = "BETA" in columns and "SE" in columns
        if not (has_p_or_z or has_beta_and_se):
            raise ValueError(
                "To munge GWAS data into gsMap format, either P or Z values, or both BETA and SE values, are required."
            )
        if "Z" not in columns:
            if "P" in columns:
                # P alone passes the validation above, so BETA may be absent here.
                gwas["Z"] = _z_from_p(gwas)
            else:
                gwas["Z"] = gwas.BETA / gwas.SE

    elif config.format == "COJO":
        required = ["A1", "A2", "FRQ", "BETA", "SE", "P", "N"]
        if not all(name in columns for name in required):
            raise ValueError(
                "To munge GWAS data into COJO format, either A1|A2|FRQ|BETA|SE|P|N, are required."
            )
        gwas["Z"] = _z_from_p(gwas)

    return gwas
|
187
203
|
|
188
204
|
|
189
205
|
def filter_info(info, config):
    """
    Build the keep-mask for the INFO filter and warn about out-of-bounds INFO.

    Args:
        info: One INFO column (pd.Series) or several (pd.DataFrame).
        config: Object exposing ``info_min``, the smallest INFO value kept.

    Returns:
        Boolean Series, True where the SNP passes. For a Series that means
        ``info >= config.info_min``; for a DataFrame the row sum must reach
        ``config.info_min`` times the number of columns. Rows with a missing
        INFO in a single column compare False and are therefore not kept.

    Raises:
        ValueError: If ``info`` is neither a Series nor a DataFrame.
    """
    # isinstance (not an exact type check) so pandas subclasses are accepted too.
    if isinstance(info, pd.Series):  # one INFO column
        out_of_bounds = ((info > 2.0) | (info < 0)) & info.notnull()
        keep = info >= config.info_min
    elif isinstance(info, pd.DataFrame):  # several INFO columns
        out_of_bounds = (((info > 2.0) | (info < 0)) & info.notnull()).any(axis=1)
        keep = info.sum(axis=1) >= config.info_min * len(info.columns)
    else:
        raise ValueError("Expected pd.DataFrame or pd.Series.")

    bad_info = out_of_bounds.sum()
    if bad_info > 0:
        # The reported interval matches the bounds actually tested above.
        msg = "WARNING: {N} SNPs had INFO outside of [0,2]. The INFO column may be mislabeled."
        logger.warning(msg.format(N=bad_info))

    return keep
|
207
224
|
|
208
225
|
|
209
226
|
def filter_frq(frq, config):
|
210
|
-
|
227
|
+
"""
|
211
228
|
Filter on MAF. Remove MAF < args.maf_min and out-of-bounds MAF.
|
212
|
-
|
229
|
+
"""
|
213
230
|
jj = (frq < 0) | (frq > 1)
|
214
231
|
bad_frq = jj.sum()
|
215
232
|
if bad_frq > 0:
|
216
|
-
msg =
|
233
|
+
msg = "WARNING: {N} SNPs had FRQ outside of [0,1]. The FRQ column may be mislabeled."
|
217
234
|
logger.warning(msg.format(N=bad_frq))
|
218
235
|
|
219
236
|
frq = np.minimum(frq, 1 - frq)
|
@@ -222,161 +239,177 @@ def filter_frq(frq, config):
|
|
222
239
|
|
223
240
|
|
224
241
|
def filter_pvals(P, config):
    """
    Build the keep-mask for P-values, warning when some are out of bounds.

    Returns a boolean mask that is True where 0 < P <= 1. ``config`` is
    accepted for signature parity with the other filters and is not read.
    """
    in_bounds = (P > 0) & (P <= 1)
    n_out_of_bounds = (~in_bounds).sum()
    if n_out_of_bounds <= 0:
        return in_bounds

    template = "WARNING: {N} SNPs had P outside of (0,1]. The P column may be mislabeled."
    logger.warning(template.format(N=n_out_of_bounds))
    return in_bounds
|
233
250
|
|
234
251
|
|
235
252
|
def filter_alleles(a):
    """
    Flag allele pairs that describe strand-unambiguous SNPs.

    ``a`` holds the concatenated A1 + A2 strings; the result is True where the
    pair is one of VALID_SNPS.
    """
    is_valid_pair = a.isin(VALID_SNPS)
    return is_valid_pair
|
238
255
|
|
239
256
|
|
240
257
|
def gwas_qc(gwas, config):
    """
    Filter out SNPs based on INFO, FRQ, MAF, N, and Genotypes.

    The filters run in this order, each logging how many SNPs it removed:
      1. rows with a missing value in any column other than INFO,
      2. INFO (only if an INFO column exists),
      3. allele frequency (only if a FRQ column exists),
      4. out-of-bounds P-values (only if a P column exists),
      5. allele pairs not in VALID_SNPS (only if A1 and A2 exist),
      6. duplicated SNP ids (only if a SNP column exists),
      7. sample size below 90th percentile of N divided by 1.5.

    Args:
        gwas: DataFrame of summary statistics; an ``N`` column is required.
        config: Passed through to the individual filters; ``info_min`` is
            also read here for the log message.

    Returns:
        The filtered DataFrame with a fresh 0..n-1 index.
    """
    logger.info("\nFiltering SNPs as follows:")
    drops = {"NA": 0, "P": 0, "INFO": 0, "FRQ": 0, "A": 0, "SNP": 0, "Dup": 0, "N": 0}

    # filter: SNPs with missing values
    old = len(gwas)
    # A concrete list (not a lazy filter object) is what dropna documents for subset.
    required_columns = [column for column in gwas.columns if column != "INFO"]
    gwas = gwas.dropna(axis=0, how="any", subset=required_columns).reset_index(drop=True)
    drops["NA"] = old - len(gwas)
    logger.info(f"Removed {drops['NA']} SNPs with missing values.")

    # filter: SNPs with INFO below the configured minimum
    if "INFO" in gwas.columns:
        old = len(gwas)
        gwas = gwas.loc[filter_info(gwas["INFO"], config)]
        drops["INFO"] = old - len(gwas)
        # Report the threshold actually applied by filter_info.
        logger.info(f"Removed {drops['INFO']} SNPs with INFO < {config.info_min}.")

    # filter: SNPs with MAF <= 0.01
    # NOTE(review): the threshold in this message is hard-coded; the value
    # really applied is decided inside filter_frq - confirm they agree.
    if "FRQ" in gwas.columns:
        old = len(gwas)
        gwas = gwas.loc[filter_frq(gwas["FRQ"], config)]
        drops["FRQ"] += old - len(gwas)
        logger.info(f"Removed {drops['FRQ']} SNPs with MAF <= 0.01.")

    # filter: P-value that out-of-bounds [0,1]
    if "P" in gwas.columns:
        old = len(gwas)
        gwas = gwas.loc[filter_pvals(gwas["P"], config)]
        drops["P"] += old - len(gwas)
        logger.info(f"Removed {drops['P']} SNPs with out-of-bounds p-values.")

    # filter: Variants that are strand-ambiguous
    if "A1" in gwas.columns and "A2" in gwas.columns:
        # Re-baseline so this count covers only SNPs removed by this step.
        old = len(gwas)
        # assign() builds a new frame instead of writing into a .loc slice.
        gwas = gwas.assign(A1=gwas.A1.str.upper(), A2=gwas.A2.str.upper())
        gwas = gwas.loc[filter_alleles(gwas.A1 + gwas.A2)]
        drops["A"] += old - len(gwas)
        logger.info(f"Removed {drops['A']} variants that were not SNPs or were strand-ambiguous.")

    # filter: Duplicated rs numbers
    if "SNP" in gwas.columns:
        old = len(gwas)
        gwas = gwas.drop_duplicates(subset="SNP").reset_index(drop=True)
        drops["Dup"] += old - len(gwas)
        logger.info(f"Removed {drops['Dup']} SNPs with duplicated rs numbers.")

    # filter:Sample size
    n_min = gwas.N.quantile(0.9) / 1.5
    old = len(gwas)
    gwas = gwas[gwas.N >= n_min].reset_index(drop=True)
    drops["N"] += old - len(gwas)
    logger.info(f"Removed {drops['N']} SNPs with N < {n_min}.")

    return gwas
|
299
317
|
|
300
318
|
|
301
319
|
def _chromosome_prefix(labels):
    """Return the non-digit part (e.g. "chr" or "") of the first chromosome label that has digits."""
    stripped = [re.sub(r"\d+", "", str(label)) for label in labels]
    for label, prefix in zip(labels, stripped):
        if prefix != str(label):  # the label contained digits
            return prefix
    # No numeric label at all: fall back to the first label as it is.
    return stripped[0] if stripped else ""


def variant_to_rsid(gwas, config):
    """
    Convert variant id (Chr, Pos) to rsid

    Args:
        gwas: DataFrame with an ``id`` column ("<Chr>_<Pos>") and a ``Chr``
            column.
        config: Object exposing ``dbsnp`` (path of a tab-separated table whose
            first row is skipped and whose columns are read as chr, pos, ref,
            alt, dbsnp) and ``chunksize`` (rows read per chunk).

    Returns:
        DataFrame with columns ``dbsnp`` and ``id``, indexed by ``id``, holding
        one row per matched variant (duplicates in either column are dropped,
        keeping the first occurrence).
    """
    logger.info("\nConverting the SNP position to rsid. This process may take some time.")
    unique_ids = set(gwas["id"])

    # The reference ids get the same chromosome prefix as the GWAS ids. A
    # digit-bearing label is used so a single-chromosome input, or one whose
    # labels include X/Y, still yields the right prefix.
    chr_format = _chromosome_prefix(gwas["Chr"].unique().astype(str))

    dtype = {"chr": str, "pos": str, "ref": str, "alt": str, "dbsnp": str}
    chunk_iter = pd.read_csv(
        config.dbsnp,
        chunksize=config.chunksize,
        sep="\t",
        skiprows=1,
        dtype=dtype,
        names=["chr", "pos", "ref", "alt", "dbsnp"],
    )

    # Collect the per-chunk matches and concatenate once at the end.
    matched_chunks = []
    for chunk in chunk_iter:
        chunk["id"] = chr_format + chunk["chr"] + "_" + chunk["pos"]
        matched_chunks.append(chunk.loc[chunk["id"].isin(unique_ids), ["dbsnp", "id"]])

    if matched_chunks:
        matching_id = pd.concat(matched_chunks)
    else:
        # Empty reference table: keep the expected columns so the steps below work.
        matching_id = pd.DataFrame(columns=["dbsnp", "id"])

    matching_id = matching_id.drop_duplicates(subset="dbsnp").reset_index(drop=True)
    matching_id = matching_id.drop_duplicates(subset="id").reset_index(drop=True)
    matching_id.index = matching_id.id
    return matching_id
|
324
350
|
|
325
351
|
|
326
352
|
def clean_SNP_id(gwas, config):
    """
    Clean SNP id

    Makes sure the summary statistics carry a ``SNP`` column. When the column
    already exists the input is returned as is. Otherwise ``Chr`` and ``Pos``
    are combined into "<Chr>_<Pos>" ids, de-duplicated and converted through
    variant_to_rsid(); variants without a match are dropped.

    Args:
        gwas: DataFrame of summary statistics. It is not modified; the
            conversion path works on a new frame.
        config: Object exposing ``dbsnp`` (must not be None when a conversion
            is needed) and whatever variant_to_rsid() reads.

    Returns:
        DataFrame with a ``SNP`` column. After a conversion it also carries
        the helper ``id`` column and is indexed by it.

    Raises:
        ValueError: If neither SNP nor both Chr and Pos are present, or if a
            conversion is needed but ``config.dbsnp`` is None.
    """
    if "SNP" in gwas.columns:
        return gwas

    if not ("Chr" in gwas.columns and "Pos" in gwas.columns):
        raise ValueError("Either SNP rsid, or both SNP chromosome and position, are required.")

    if config.dbsnp is None:
        raise ValueError("To Convert SNP positions to rsid, dbsnp reference is required.")

    old = len(gwas)
    # assign() returns a new frame, so the caller's DataFrame gets no "id" column.
    gwas = gwas.assign(id=gwas["Chr"].astype(str) + "_" + gwas["Pos"].astype(str))
    gwas = gwas.drop_duplicates(subset="id").reset_index(drop=True)
    gwas.index = gwas["id"]

    matching_id = variant_to_rsid(gwas, config)
    # Both frames are indexed by id, so the rsids align row by row.
    gwas = gwas.loc[matching_id["id"]].assign(SNP=matching_id["dbsnp"])
    num_fail = old - len(gwas)
    logger.info(f"Removed {num_fail} SNPs that did not convert to rsid.")

    return gwas
|
353
379
|
|
354
380
|
|
355
381
|
def gwas_metadata(gwas, config):
    """
    Log key features of the GWAS data.

    Everything is derived from Z squared: its mean (with a warning when below
    1.02), its median divided by 0.4549 (logged as Lambda GC), its maximum,
    and the number of SNPs whose value exceeds 29. ``config`` is not read.
    """
    logger.info("\nSummary of GWAS data:")

    chisq = gwas.Z**2
    chisq_mean = chisq.mean()
    logger.info(f"Mean chi^2 = {round(chisq_mean, 3)!s}")
    if mean_is_low := chisq_mean < 1.02:
        logger.warning("Mean chi^2 may be too small.")

    lambda_gc = round(chisq.median() / 0.4549, 3)
    chisq_max = round(chisq.max(), 3)
    n_significant = (chisq > 29).sum()

    logger.info(f"Lambda GC = {lambda_gc!s}")
    logger.info(f"Max chi^2 = {chisq_max!s}")
    logger.info(
        f"{n_significant} Genome-wide significant SNPs (some may have been removed by filtering)."
    )
|
369
397
|
|
370
398
|
|
371
399
|
def gwas_format(config: FormatSumstatsConfig):
|
372
|
-
|
400
|
+
"""
|
373
401
|
Format GWAS data
|
374
|
-
|
375
|
-
logger.info(f
|
402
|
+
"""
|
403
|
+
logger.info(f"------Formating gwas data for {config.sumstats}...")
|
376
404
|
compression_type = get_compression(config.sumstats)
|
377
|
-
gwas = pd.read_csv(
|
378
|
-
|
379
|
-
|
405
|
+
gwas = pd.read_csv(
|
406
|
+
config.sumstats,
|
407
|
+
delim_whitespace=True,
|
408
|
+
header=0,
|
409
|
+
compression=compression_type,
|
410
|
+
na_values=[".", "NA"],
|
411
|
+
)
|
412
|
+
logger.info(f"Read {len(gwas)} SNPs from {config.sumstats}.")
|
380
413
|
|
381
414
|
# Check name and format
|
382
415
|
gwas = gwas_checkname(gwas, config)
|
@@ -389,19 +422,18 @@ def gwas_format(config: FormatSumstatsConfig):
|
|
389
422
|
gwas_metadata(gwas, config)
|
390
423
|
|
391
424
|
# Saving the data
|
392
|
-
if config.format ==
|
393
|
-
keep = [
|
394
|
-
appendix =
|
395
|
-
elif config.format ==
|
425
|
+
if config.format == "COJO":
|
426
|
+
keep = ["SNP", "A1", "A2", "FRQ", "BETA", "SE", "P", "N"]
|
427
|
+
appendix = ".cojo"
|
428
|
+
elif config.format == "gsMap":
|
396
429
|
keep = ["SNP", "A1", "A2", "Z", "N"]
|
397
|
-
appendix =
|
430
|
+
appendix = ".sumstats"
|
398
431
|
|
399
|
-
if
|
400
|
-
keep = keep + [
|
432
|
+
if "Chr" in gwas.columns and "Pos" in gwas.columns and config.keep_chr_pos is True:
|
433
|
+
keep = keep + ["Chr", "Pos"]
|
401
434
|
|
402
435
|
gwas = gwas[keep]
|
403
|
-
out_name = config.out + appendix +
|
436
|
+
out_name = config.out + appendix + ".gz"
|
404
437
|
|
405
|
-
logger.info(f
|
406
|
-
gwas.to_csv(out_name, sep="\t", index=False,
|
407
|
-
float_format='%.3f', compression='gzip')
|
438
|
+
logger.info(f"\nWriting summary statistics for {len(gwas)} SNPs to {out_name}.")
|
439
|
+
gwas.to_csv(out_name, sep="\t", index=False, float_format="%.3f", compression="gzip")
|