gsMap 1.67__py3-none-any.whl → 1.70__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/{GNN_VAE → GNN}/__init__.py +0 -0
- gsMap/{GNN_VAE → GNN}/adjacency_matrix.py +75 -75
- gsMap/{GNN_VAE → GNN}/model.py +89 -89
- gsMap/{GNN_VAE → GNN}/train.py +88 -86
- gsMap/__init__.py +5 -5
- gsMap/__main__.py +2 -2
- gsMap/cauchy_combination_test.py +141 -141
- gsMap/config.py +805 -803
- gsMap/diagnosis.py +273 -273
- gsMap/find_latent_representation.py +133 -145
- gsMap/format_sumstats.py +407 -407
- gsMap/generate_ldscore.py +618 -618
- gsMap/latent_to_gene.py +234 -234
- gsMap/main.py +31 -31
- gsMap/report.py +160 -160
- gsMap/run_all_mode.py +194 -194
- gsMap/setup.py +0 -0
- gsMap/spatial_ldsc_multiple_sumstats.py +380 -380
- gsMap/templates/report_template.html +198 -198
- gsMap/utils/__init__.py +0 -0
- gsMap/utils/generate_r2_matrix.py +735 -735
- gsMap/utils/jackknife.py +514 -514
- gsMap/utils/make_annotations.py +518 -518
- gsMap/utils/manhattan_plot.py +639 -639
- gsMap/utils/regression_read.py +294 -294
- gsMap/visualize.py +198 -198
- {gsmap-1.67.dist-info → gsmap-1.70.dist-info}/LICENSE +21 -21
- {gsmap-1.67.dist-info → gsmap-1.70.dist-info}/METADATA +28 -22
- gsmap-1.70.dist-info/RECORD +31 -0
- gsmap-1.67.dist-info/RECORD +0 -31
- {gsmap-1.67.dist-info → gsmap-1.70.dist-info}/WHEEL +0 -0
- {gsmap-1.67.dist-info → gsmap-1.70.dist-info}/entry_points.txt +0 -0
gsMap/format_sumstats.py
CHANGED
@@ -1,407 +1,407 @@
|
|
1
|
-
import numpy as np
|
2
|
-
import logging
|
3
|
-
import re
|
4
|
-
|
5
|
-
import math
|
6
|
-
import numpy as np
|
7
|
-
import pandas as pd
|
8
|
-
from scipy.stats import chi2
|
9
|
-
|
10
|
-
from gsMap.config import FormatSumstatsConfig
|
11
|
-
|
12
|
-
VALID_SNPS = {'AC', 'AG', 'CA', 'CT', 'GA', 'GT', 'TC', 'TG'}
|
13
|
-
logger = logging.getLogger(__name__)
|
14
|
-
|
15
|
-
default_cnames = {
|
16
|
-
# RS NUMBER
|
17
|
-
'SNP': 'SNP',
|
18
|
-
'RS': 'SNP',
|
19
|
-
'RSID': 'SNP',
|
20
|
-
'RS_NUMBER': 'SNP',
|
21
|
-
'RS_NUMBERS': 'SNP',
|
22
|
-
# P-VALUE
|
23
|
-
'P': 'P',
|
24
|
-
'PVALUE': 'P',
|
25
|
-
'P_VALUE': 'P',
|
26
|
-
'PVAL': 'P',
|
27
|
-
'P_VAL': 'P',
|
28
|
-
'GC_PVALUE': 'P',
|
29
|
-
'p': 'P',
|
30
|
-
# EFFECT_ALLELE (A1)
|
31
|
-
'A1': 'A1',
|
32
|
-
'ALLELE1': 'A1',
|
33
|
-
'ALLELE_1': 'A1',
|
34
|
-
'EFFECT_ALLELE': 'A1',
|
35
|
-
'REFERENCE_ALLELE': 'A1',
|
36
|
-
'INC_ALLELE': 'A1',
|
37
|
-
'EA': 'A1',
|
38
|
-
# NON_EFFECT_ALLELE (A2)
|
39
|
-
'A2': 'A2',
|
40
|
-
'ALLELE2': 'A2',
|
41
|
-
'ALLELE_2': 'A2',
|
42
|
-
'OTHER_ALLELE': 'A2',
|
43
|
-
'NON_EFFECT_ALLELE': 'A2',
|
44
|
-
'DEC_ALLELE': 'A2',
|
45
|
-
'NEA': 'A2',
|
46
|
-
# N
|
47
|
-
'N': 'N',
|
48
|
-
'NCASE': 'N_CAS',
|
49
|
-
'CASES_N': 'N_CAS',
|
50
|
-
'N_CASE': 'N_CAS',
|
51
|
-
'N_CASES': 'N_CAS',
|
52
|
-
'N_CONTROLS': 'N_CON',
|
53
|
-
'N_CAS': 'N_CAS',
|
54
|
-
'N_CON': 'N_CON',
|
55
|
-
'N_CASE': 'N_CAS',
|
56
|
-
'NCONTROL': 'N_CON',
|
57
|
-
'CONTROLS_N': 'N_CON',
|
58
|
-
'N_CONTROL': 'N_CON',
|
59
|
-
'WEIGHT': 'N',
|
60
|
-
# SIGNED STATISTICS
|
61
|
-
'ZSCORE': 'Z',
|
62
|
-
'Z-SCORE': 'Z',
|
63
|
-
'GC_ZSCORE': 'Z',
|
64
|
-
'Z': 'Z',
|
65
|
-
'OR': 'OR',
|
66
|
-
'B': 'BETA',
|
67
|
-
'BETA': 'BETA',
|
68
|
-
'LOG_ODDS': 'LOG_ODDS',
|
69
|
-
'EFFECTS': 'BETA',
|
70
|
-
'EFFECT': 'BETA',
|
71
|
-
'b': 'BETA',
|
72
|
-
'beta': 'BETA',
|
73
|
-
# SE
|
74
|
-
'se': 'SE',
|
75
|
-
# INFO
|
76
|
-
'INFO': 'INFO',
|
77
|
-
'Info': 'INFO',
|
78
|
-
# MAF
|
79
|
-
'EAF': 'FRQ',
|
80
|
-
'FRQ': 'FRQ',
|
81
|
-
'MAF': 'FRQ',
|
82
|
-
'FRQ_U': 'FRQ',
|
83
|
-
'F_U': 'FRQ',
|
84
|
-
'frq_A1': 'FRQ',
|
85
|
-
'frq': 'FRQ',
|
86
|
-
'freq': 'FRQ'
|
87
|
-
}
|
88
|
-
|
89
|
-
|
90
|
-
def get_compression(fh):
|
91
|
-
'''
|
92
|
-
Read filename suffixes and figure out whether it is gzipped,bzip2'ed or not compressed
|
93
|
-
'''
|
94
|
-
if fh.endswith('gz'):
|
95
|
-
compression = 'gzip'
|
96
|
-
elif fh.endswith('bz2'):
|
97
|
-
compression = 'bz2'
|
98
|
-
else:
|
99
|
-
compression = None
|
100
|
-
|
101
|
-
return compression
|
102
|
-
|
103
|
-
|
104
|
-
def gwas_checkname(gwas, config):
|
105
|
-
'''
|
106
|
-
Iterpret column names of gwas
|
107
|
-
'''
|
108
|
-
old_name = gwas.columns
|
109
|
-
mapped_cnames = {}
|
110
|
-
for col in gwas.columns:
|
111
|
-
mapped_cnames[col] = default_cnames.get(col, col)
|
112
|
-
gwas.columns = list(mapped_cnames.values())
|
113
|
-
|
114
|
-
# When column names are provided by users
|
115
|
-
name_updates = {'SNP': config.snp, 'A1': config.a1, 'A2': config.a2, 'INFO': config.info,
|
116
|
-
'BETA': config.beta, 'SE': config.se, 'P': config.p, 'FRQ': config.frq, 'N': config.n,
|
117
|
-
'Z': config.z, 'Chr': config.chr, 'Pos': config.pos, 'OR': config.OR, 'SE_OR': config.se_OR}
|
118
|
-
|
119
|
-
for key, value in name_updates.items():
|
120
|
-
if value is not None and value in gwas.columns:
|
121
|
-
gwas.rename(columns={value: key}, inplace=True)
|
122
|
-
new_name = gwas.columns
|
123
|
-
# check the name duplication
|
124
|
-
for head in new_name:
|
125
|
-
numc = list(new_name).count(head)
|
126
|
-
if numc > 1:
|
127
|
-
raise ValueError(f"Found {numc} different {head} columns, please check your {head} column.")
|
128
|
-
|
129
|
-
name_dict = {new_name[i]: old_name[i] for i in range(len(new_name))}
|
130
|
-
|
131
|
-
# When at OR scale
|
132
|
-
if 'OR' in new_name and 'SE_OR' in new_name:
|
133
|
-
gwas['BETA'] = gwas.OR.apply(lambda x: math.log(x) if x > 0 else None)
|
134
|
-
gwas['SE'] = gwas.SE_OR.apply(lambda x: math.log(x) if x > 0 else None)
|
135
|
-
|
136
|
-
interpreting = {
|
137
|
-
"SNP": 'Variant ID (e.g., rs number).',
|
138
|
-
"A1": 'Allele 1, interpreted as the effect allele for signed sumstat.',
|
139
|
-
"A2": 'Allele 2, interpreted as the non-effect allele for signed sumstat.',
|
140
|
-
"BETA": '[linear/logistic] regression coefficient (0 → no effect; above 0 → A1 is trait/risk increasing).',
|
141
|
-
"SE": 'Standard error of the regression coefficient.',
|
142
|
-
"OR": 'Odds ratio, will be transferred to linear scale.',
|
143
|
-
"SE_OR": 'Standard error of the odds ratio, will be transferred to linear scale.',
|
144
|
-
"P": 'P-Value.',
|
145
|
-
"Z": 'Z-Value.',
|
146
|
-
"N": 'Sample size.',
|
147
|
-
"INFO": 'INFO score (imputation quality; higher → better imputation).',
|
148
|
-
"FRQ": 'Allele frequency of A1.',
|
149
|
-
"Chr": 'Chromsome.',
|
150
|
-
'Pos': 'SNP positions.'
|
151
|
-
}
|
152
|
-
|
153
|
-
logger.info(f'\nIterpreting column names as follows:')
|
154
|
-
for key, value in interpreting.items():
|
155
|
-
if key in new_name:
|
156
|
-
logger.info(f'{name_dict[key]}: {interpreting[key]}')
|
157
|
-
|
158
|
-
return gwas
|
159
|
-
|
160
|
-
|
161
|
-
def gwas_checkformat(gwas, config):
|
162
|
-
'''
|
163
|
-
Check column names required for different format
|
164
|
-
'''
|
165
|
-
if config.format == 'gsMap':
|
166
|
-
condition1 = np.any(np.isin(['P', 'Z'], gwas.columns))
|
167
|
-
condition2 = np.all(np.isin(['BETA', 'SE'], gwas.columns))
|
168
|
-
if not (condition1 or condition2):
|
169
|
-
raise ValueError(
|
170
|
-
'To munge GWAS data into gsMap format, either P or Z values, or both BETA and SE values, are required.')
|
171
|
-
else:
|
172
|
-
if 'Z' in gwas.columns:
|
173
|
-
pass
|
174
|
-
elif 'P' in gwas.columns:
|
175
|
-
gwas['Z'] = np.sqrt(chi2.isf(gwas.P, 1)) * np.where(gwas['BETA'] < 0, -1, 1)
|
176
|
-
else:
|
177
|
-
gwas['Z'] = gwas.BETA / gwas.SE
|
178
|
-
|
179
|
-
elif config.format == 'COJO':
|
180
|
-
condition = np.all(np.isin(['A1', 'A2', 'FRQ', 'BETA', 'SE', 'P', 'N'], gwas.columns))
|
181
|
-
if not condition:
|
182
|
-
raise ValueError('To munge GWAS data into COJO format, either A1|A2|FRQ|BETA|SE|P|N, are required.')
|
183
|
-
else:
|
184
|
-
gwas['Z'] = np.sqrt(chi2.isf(gwas.P, 1)) * np.where(gwas['BETA'] < 0, -1, 1)
|
185
|
-
|
186
|
-
return gwas
|
187
|
-
|
188
|
-
|
189
|
-
def filter_info(info, config):
|
190
|
-
'''Remove INFO < args.info_min (default 0.9) and complain about out-of-bounds INFO.'''
|
191
|
-
if type(info) is pd.Series: # one INFO column
|
192
|
-
jj = ((info > 2.0) | (info < 0)) & info.notnull()
|
193
|
-
ii = info >= config.info_min
|
194
|
-
elif type(info) is pd.DataFrame: # several INFO columns
|
195
|
-
jj = (((info > 2.0) & info.notnull()).any(axis=1) | (
|
196
|
-
(info < 0) & info.notnull()).any(axis=1))
|
197
|
-
ii = (info.sum(axis=1) >= config.info_min * (len(info.columns)))
|
198
|
-
else:
|
199
|
-
raise ValueError('Expected pd.DataFrame or pd.Series.')
|
200
|
-
|
201
|
-
bad_info = jj.sum()
|
202
|
-
if bad_info > 0:
|
203
|
-
msg = 'WARNING: {N} SNPs had INFO outside of [0,1.5]. The INFO column may be mislabeled.'
|
204
|
-
logger.warning(msg.format(N=bad_info))
|
205
|
-
|
206
|
-
return ii
|
207
|
-
|
208
|
-
|
209
|
-
def filter_frq(frq, config):
|
210
|
-
'''
|
211
|
-
Filter on MAF. Remove MAF < args.maf_min and out-of-bounds MAF.
|
212
|
-
'''
|
213
|
-
jj = (frq < 0) | (frq > 1)
|
214
|
-
bad_frq = jj.sum()
|
215
|
-
if bad_frq > 0:
|
216
|
-
msg = 'WARNING: {N} SNPs had FRQ outside of [0,1]. The FRQ column may be mislabeled.'
|
217
|
-
logger.warning(msg.format(N=bad_frq))
|
218
|
-
|
219
|
-
frq = np.minimum(frq, 1 - frq)
|
220
|
-
ii = frq > config.maf_min
|
221
|
-
return ii & ~jj
|
222
|
-
|
223
|
-
|
224
|
-
def filter_pvals(P, config):
|
225
|
-
'''Remove out-of-bounds P-values'''
|
226
|
-
ii = (P > 0) & (P <= 1)
|
227
|
-
bad_p = (~ii).sum()
|
228
|
-
if bad_p > 0:
|
229
|
-
msg = 'WARNING: {N} SNPs had P outside of (0,1]. The P column may be mislabeled.'
|
230
|
-
logger.warning(msg.format(N=bad_p))
|
231
|
-
|
232
|
-
return ii
|
233
|
-
|
234
|
-
|
235
|
-
def filter_alleles(a):
|
236
|
-
'''Remove alleles that do not describe strand-unambiguous SNPs'''
|
237
|
-
return a.isin(VALID_SNPS)
|
238
|
-
|
239
|
-
|
240
|
-
def gwas_qc(gwas, config):
|
241
|
-
'''
|
242
|
-
Filter out SNPs based on INFO, FRQ, MAF, N, and Genotypes.
|
243
|
-
'''
|
244
|
-
old = len(gwas)
|
245
|
-
logger.info(f'\nFiltering SNPs as follows:')
|
246
|
-
# filter: SNPs with missing values
|
247
|
-
drops = {'NA': 0, 'P': 0, 'INFO': 0, 'FRQ': 0, 'A': 0, 'SNP': 0, 'Dup': 0, 'N': 0}
|
248
|
-
|
249
|
-
gwas = gwas.dropna(axis=0, how="any", subset=filter(
|
250
|
-
lambda x: x != 'INFO', gwas.columns)).reset_index(drop=True)
|
251
|
-
|
252
|
-
drops['NA'] = old - len(gwas)
|
253
|
-
logger.info(f'Removed {drops["NA"]} SNPs with missing values.')
|
254
|
-
|
255
|
-
# filter: SNPs with Info < 0.9
|
256
|
-
if 'INFO' in gwas.columns:
|
257
|
-
old = len(gwas)
|
258
|
-
gwas = gwas.loc[filter_info(gwas['INFO'], config)]
|
259
|
-
drops['INFO'] = old - len(gwas)
|
260
|
-
logger.info(f'Removed {drops["INFO"]} SNPs with INFO <= 0.9.')
|
261
|
-
|
262
|
-
# filter: SNPs with MAF <= 0.01
|
263
|
-
if 'FRQ' in gwas.columns:
|
264
|
-
old = len(gwas)
|
265
|
-
gwas = gwas.loc[filter_frq(gwas['FRQ'], config)]
|
266
|
-
drops['FRQ'] += old - len(gwas)
|
267
|
-
logger.info(f'Removed {drops["FRQ"]} SNPs with MAF <= 0.01.')
|
268
|
-
|
269
|
-
# filter: P-value that out-of-bounds [0,1]
|
270
|
-
if 'P' in gwas.columns:
|
271
|
-
old = len(gwas)
|
272
|
-
gwas = gwas.loc[filter_pvals(gwas['P'], config)]
|
273
|
-
drops['P'] += old - len(gwas)
|
274
|
-
logger.info(f'Removed {drops["P"]} SNPs with out-of-bounds p-values.')
|
275
|
-
|
276
|
-
# filter: Variants that are strand-ambiguous
|
277
|
-
if 'A1' in gwas.columns and 'A2' in gwas.columns:
|
278
|
-
gwas.A1 = gwas.A1.str.upper()
|
279
|
-
gwas.A2 = gwas.A2.str.upper()
|
280
|
-
gwas = gwas.loc[filter_alleles(gwas.A1 + gwas.A2)]
|
281
|
-
drops['A'] += old - len(gwas)
|
282
|
-
logger.info(f'Removed {drops["A"]} variants that were not SNPs or were strand-ambiguous.')
|
283
|
-
|
284
|
-
# filter: Duplicated rs numbers
|
285
|
-
if 'SNP' in gwas.columns:
|
286
|
-
old = len(gwas)
|
287
|
-
gwas = gwas.drop_duplicates(subset='SNP').reset_index(drop=True)
|
288
|
-
drops['Dup'] += old - len(gwas)
|
289
|
-
logger.info(f'Removed {drops["Dup"]} SNPs with duplicated rs numbers.')
|
290
|
-
|
291
|
-
# filter:Sample size
|
292
|
-
n_min = gwas.N.quantile(0.9) / 1.5
|
293
|
-
old = len(gwas)
|
294
|
-
gwas = gwas[gwas.N >= n_min].reset_index(drop=True)
|
295
|
-
drops['N'] += old - len(gwas)
|
296
|
-
logger.info(f'Removed {drops["N"]} SNPs with N < {n_min}.')
|
297
|
-
|
298
|
-
return gwas
|
299
|
-
|
300
|
-
|
301
|
-
def variant_to_rsid(gwas, config):
|
302
|
-
'''
|
303
|
-
Convert variant id (Chr, Pos) to rsid
|
304
|
-
'''
|
305
|
-
logger.info("\nConverting the SNP position to rsid. This process may take some time.")
|
306
|
-
unique_ids = set(gwas['id'])
|
307
|
-
chr_format = gwas['Chr'].unique().astype(str)
|
308
|
-
chr_format = [re.sub(r'\d+', '', value) for value in chr_format][1]
|
309
|
-
|
310
|
-
dtype = {'chr': str, 'pos': str, 'ref': str, 'alt': str, 'dbsnp': str}
|
311
|
-
chunk_iter = pd.read_csv(config.dbsnp, chunksize=config.chunksize, sep="\t", skiprows=1,
|
312
|
-
dtype=dtype, names=['chr', 'pos', 'ref', 'alt', 'dbsnp'])
|
313
|
-
|
314
|
-
# Iterate over chunks
|
315
|
-
matching_id = pd.DataFrame()
|
316
|
-
for chunk in chunk_iter:
|
317
|
-
chunk['id'] = chr_format + chunk["chr"] + "_" + chunk["pos"]
|
318
|
-
matching_id = pd.concat([matching_id, chunk[chunk['id'].isin(unique_ids)][['dbsnp', 'id']]])
|
319
|
-
|
320
|
-
matching_id = matching_id.drop_duplicates(subset='dbsnp').reset_index(drop=True)
|
321
|
-
matching_id = matching_id.drop_duplicates(subset='id').reset_index(drop=True)
|
322
|
-
matching_id.index = matching_id.id
|
323
|
-
return matching_id
|
324
|
-
|
325
|
-
|
326
|
-
def clean_SNP_id(gwas, config):
|
327
|
-
'''
|
328
|
-
Clean SNP id
|
329
|
-
'''
|
330
|
-
old = len(gwas)
|
331
|
-
condition1 = 'SNP' in gwas.columns
|
332
|
-
condition2 = np.all(np.isin(['Chr', 'Pos'], gwas.columns))
|
333
|
-
|
334
|
-
if not (condition1 or condition2):
|
335
|
-
raise ValueError('Either SNP rsid, or both SNP chromosome and position, are required.')
|
336
|
-
elif condition1:
|
337
|
-
pass
|
338
|
-
elif condition2:
|
339
|
-
if config.dbsnp is None:
|
340
|
-
raise ValueError('To Convert SNP positions to rsid, dbsnp reference is required.')
|
341
|
-
else:
|
342
|
-
gwas['id'] = gwas["Chr"].astype(str) + "_" + gwas["Pos"].astype(str)
|
343
|
-
gwas = gwas.drop_duplicates(subset='id').reset_index(drop=True)
|
344
|
-
gwas.index = gwas.id
|
345
|
-
|
346
|
-
matching_id = variant_to_rsid(gwas, config)
|
347
|
-
gwas = gwas.loc[matching_id.id]
|
348
|
-
gwas['SNP'] = matching_id.dbsnp
|
349
|
-
num_fail = old - len(gwas)
|
350
|
-
logger.info(f'Removed {num_fail} SNPs that did not convert to rsid.')
|
351
|
-
|
352
|
-
return gwas
|
353
|
-
|
354
|
-
|
355
|
-
def gwas_metadata(gwas, config):
|
356
|
-
'''
|
357
|
-
Report key features of GWAS data
|
358
|
-
'''
|
359
|
-
logger.info('\nSummary of GWAS data:')
|
360
|
-
CHISQ = (gwas.Z ** 2)
|
361
|
-
mean_chisq = CHISQ.mean()
|
362
|
-
logger.info('Mean chi^2 = ' + str(round(mean_chisq, 3)))
|
363
|
-
if mean_chisq < 1.02:
|
364
|
-
logger.warning("Mean chi^2 may be too small.")
|
365
|
-
|
366
|
-
logger.info('Lambda GC = ' + str(round(CHISQ.median() / 0.4549, 3)))
|
367
|
-
logger.info('Max chi^2 = ' + str(round(CHISQ.max(), 3)))
|
368
|
-
logger.info('{N} Genome-wide significant SNPs (some may have been removed by filtering).'.format(N=(CHISQ > 29).sum()))
|
369
|
-
|
370
|
-
|
371
|
-
def gwas_format(config: FormatSumstatsConfig):
|
372
|
-
'''
|
373
|
-
Format GWAS data
|
374
|
-
'''
|
375
|
-
logger.info(f'------Formating gwas data for {config.sumstats}...')
|
376
|
-
compression_type = get_compression(config.sumstats)
|
377
|
-
gwas = pd.read_csv(config.sumstats, delim_whitespace=True, header=0, compression=compression_type,
|
378
|
-
na_values=['.', 'NA'])
|
379
|
-
logger.info(f'Read {len(gwas)} SNPs from {config.sumstats}.')
|
380
|
-
|
381
|
-
# Check name and format
|
382
|
-
gwas = gwas_checkname(gwas, config)
|
383
|
-
gwas = gwas_checkformat(gwas, config)
|
384
|
-
# Clean the snp id
|
385
|
-
gwas = clean_SNP_id(gwas, config)
|
386
|
-
# QC
|
387
|
-
gwas = gwas_qc(gwas, config)
|
388
|
-
# Meta
|
389
|
-
gwas_metadata(gwas, config)
|
390
|
-
|
391
|
-
# Saving the data
|
392
|
-
if config.format == 'COJO':
|
393
|
-
keep = ['SNP', 'A1', 'A2', 'FRQ', 'BETA', 'SE', 'P', 'N']
|
394
|
-
appendix = '.cojo'
|
395
|
-
elif config.format == 'gsMap':
|
396
|
-
keep = ["SNP", "A1", "A2", "Z", "N"]
|
397
|
-
appendix = '.sumstats'
|
398
|
-
|
399
|
-
if 'Chr' in gwas.columns and 'Pos' in gwas.columns and config.keep_chr_pos is True:
|
400
|
-
keep = keep + ['Chr', 'Pos']
|
401
|
-
|
402
|
-
gwas = gwas[keep]
|
403
|
-
out_name = config.out + appendix + '.gz'
|
404
|
-
|
405
|
-
logger.info(f'\nWriting summary statistics for {len(gwas)} SNPs to {out_name}.')
|
406
|
-
gwas.to_csv(out_name, sep="\t", index=False,
|
407
|
-
float_format='%.3f', compression='gzip')
|
1
|
+
import numpy as np
|
2
|
+
import logging
|
3
|
+
import re
|
4
|
+
|
5
|
+
import math
|
6
|
+
import numpy as np
|
7
|
+
import pandas as pd
|
8
|
+
from scipy.stats import chi2
|
9
|
+
|
10
|
+
from gsMap.config import FormatSumstatsConfig
|
11
|
+
|
12
|
+
# Allele pairs that describe strand-unambiguous SNPs (A/T and C/G pairs excluded).
VALID_SNPS = {'AC', 'AG', 'CA', 'CT', 'GA', 'GT', 'TC', 'TG'}
logger = logging.getLogger(__name__)

# Mapping from commonly seen GWAS summary-statistics column headers to the
# canonical column names used throughout this module.
# NOTE: the previous version listed 'N_CASE' twice; the duplicate has been removed.
default_cnames = {
    # RS NUMBER
    'SNP': 'SNP',
    'RS': 'SNP',
    'RSID': 'SNP',
    'RS_NUMBER': 'SNP',
    'RS_NUMBERS': 'SNP',
    # P-VALUE
    'P': 'P',
    'PVALUE': 'P',
    'P_VALUE': 'P',
    'PVAL': 'P',
    'P_VAL': 'P',
    'GC_PVALUE': 'P',
    'p': 'P',
    # EFFECT_ALLELE (A1)
    'A1': 'A1',
    'ALLELE1': 'A1',
    'ALLELE_1': 'A1',
    'EFFECT_ALLELE': 'A1',
    'REFERENCE_ALLELE': 'A1',
    'INC_ALLELE': 'A1',
    'EA': 'A1',
    # NON_EFFECT_ALLELE (A2)
    'A2': 'A2',
    'ALLELE2': 'A2',
    'ALLELE_2': 'A2',
    'OTHER_ALLELE': 'A2',
    'NON_EFFECT_ALLELE': 'A2',
    'DEC_ALLELE': 'A2',
    'NEA': 'A2',
    # N
    'N': 'N',
    'NCASE': 'N_CAS',
    'CASES_N': 'N_CAS',
    'N_CASE': 'N_CAS',
    'N_CASES': 'N_CAS',
    'N_CONTROLS': 'N_CON',
    'N_CAS': 'N_CAS',
    'N_CON': 'N_CON',
    'NCONTROL': 'N_CON',
    'CONTROLS_N': 'N_CON',
    'N_CONTROL': 'N_CON',
    'WEIGHT': 'N',
    # SIGNED STATISTICS
    'ZSCORE': 'Z',
    'Z-SCORE': 'Z',
    'GC_ZSCORE': 'Z',
    'Z': 'Z',
    'OR': 'OR',
    'B': 'BETA',
    'BETA': 'BETA',
    'LOG_ODDS': 'LOG_ODDS',
    'EFFECTS': 'BETA',
    'EFFECT': 'BETA',
    'b': 'BETA',
    'beta': 'BETA',
    # SE
    'se': 'SE',
    # INFO
    'INFO': 'INFO',
    'Info': 'INFO',
    # MAF
    'EAF': 'FRQ',
    'FRQ': 'FRQ',
    'MAF': 'FRQ',
    'FRQ_U': 'FRQ',
    'F_U': 'FRQ',
    'frq_A1': 'FRQ',
    'frq': 'FRQ',
    'freq': 'FRQ'
}
|
88
|
+
|
89
|
+
|
90
|
+
def get_compression(fh):
    '''
    Infer the compression type from a filename suffix.

    Returns 'gzip' for *.gz, 'bz2' for *.bz2, otherwise None
    (i.e. the file is assumed to be uncompressed).
    '''
    suffix_to_kind = {'gz': 'gzip', 'bz2': 'bz2'}
    for suffix, kind in suffix_to_kind.items():
        if fh.endswith(suffix):
            return kind
    return None
|
102
|
+
|
103
|
+
|
104
|
+
def gwas_checkname(gwas, config):
    '''
    Interpret the column names of the GWAS summary statistics.

    Headers are first mapped through ``default_cnames`` and then renamed
    according to explicit user-provided names from ``config``.  When the
    effect is reported as an odds ratio (OR together with SE_OR), it is
    converted to the log-odds (linear) scale.

    Parameters
    ----------
    gwas : pd.DataFrame
        Raw summary statistics (renamed in place).
    config : FormatSumstatsConfig
        Carries the user-supplied column-name overrides.

    Raises
    ------
    ValueError
        If two input columns map onto the same canonical name.
    '''
    old_name = gwas.columns
    mapped_cnames = {col: default_cnames.get(col, col) for col in gwas.columns}
    gwas.columns = list(mapped_cnames.values())

    # When column names are provided by users, they override the defaults.
    name_updates = {'SNP': config.snp, 'A1': config.a1, 'A2': config.a2, 'INFO': config.info,
                    'BETA': config.beta, 'SE': config.se, 'P': config.p, 'FRQ': config.frq, 'N': config.n,
                    'Z': config.z, 'Chr': config.chr, 'Pos': config.pos, 'OR': config.OR, 'SE_OR': config.se_OR}

    for key, value in name_updates.items():
        if value is not None and value in gwas.columns:
            gwas.rename(columns={value: key}, inplace=True)
    new_name = gwas.columns
    # Reject duplicated canonical names: two source columns mapping to the
    # same meaning would make the downstream processing ambiguous.
    for head in new_name:
        numc = list(new_name).count(head)
        if numc > 1:
            raise ValueError(f"Found {numc} different {head} columns, please check your {head} column.")

    name_dict = {new_name[i]: old_name[i] for i in range(len(new_name))}

    # When the effect is at OR scale, move it to the log-odds (linear) scale:
    # BETA = log(OR); by the delta method, SE(log OR) = SE(OR) / OR.
    # BUGFIX: the previous code used log(SE_OR), which is mathematically
    # wrong and yields negative standard errors whenever SE_OR < 1.
    if 'OR' in new_name and 'SE_OR' in new_name:
        gwas['BETA'] = gwas.OR.apply(lambda x: math.log(x) if x > 0 else None)
        gwas['SE'] = (gwas.SE_OR / gwas.OR).where(gwas.OR > 0)

    interpreting = {
        "SNP": 'Variant ID (e.g., rs number).',
        "A1": 'Allele 1, interpreted as the effect allele for signed sumstat.',
        "A2": 'Allele 2, interpreted as the non-effect allele for signed sumstat.',
        "BETA": '[linear/logistic] regression coefficient (0 → no effect; above 0 → A1 is trait/risk increasing).',
        "SE": 'Standard error of the regression coefficient.',
        "OR": 'Odds ratio, will be transferred to linear scale.',
        "SE_OR": 'Standard error of the odds ratio, will be transferred to linear scale.',
        "P": 'P-Value.',
        "Z": 'Z-Value.',
        "N": 'Sample size.',
        "INFO": 'INFO score (imputation quality; higher → better imputation).',
        "FRQ": 'Allele frequency of A1.',
        "Chr": 'Chromsome.',
        'Pos': 'SNP positions.'
    }

    # Report, for each recognised canonical column, which input header it came from.
    logger.info('\nInterpreting column names as follows:')
    for key, value in interpreting.items():
        if key in new_name:
            logger.info(f'{name_dict[key]}: {interpreting[key]}')

    return gwas
|
159
|
+
|
160
|
+
|
161
|
+
def gwas_checkformat(gwas, config):
    '''
    Validate that the columns required by the requested output format are
    present, and derive the Z score when it is missing.

    For 'gsMap': Z is kept if present, derived from P (signed by BETA),
    or computed as BETA / SE.  For 'COJO': all of A1|A2|FRQ|BETA|SE|P|N
    are required and Z is derived from P signed by BETA.

    Raises
    ------
    ValueError
        If the required columns for the chosen format are missing, or if Z
        would have to be derived from P without a BETA column to supply
        the sign.
    '''
    if config.format == 'gsMap':
        condition1 = np.any(np.isin(['P', 'Z'], gwas.columns))
        condition2 = np.all(np.isin(['BETA', 'SE'], gwas.columns))
        if not (condition1 or condition2):
            raise ValueError(
                'To munge GWAS data into gsMap format, either P or Z values, or both BETA and SE values, are required.')
        if 'Z' in gwas.columns:
            pass
        elif 'P' in gwas.columns and 'BETA' in gwas.columns:
            # |Z| from the chi-square quantile of P; sign taken from BETA.
            gwas['Z'] = np.sqrt(chi2.isf(gwas.P, 1)) * np.where(gwas['BETA'] < 0, -1, 1)
        elif condition2:
            gwas['Z'] = gwas.BETA / gwas.SE
        else:
            # BUGFIX: previously a P-only input crashed with AttributeError
            # when accessing the missing BETA column.
            raise ValueError('Deriving Z from P requires a BETA column to provide the sign.')

    elif config.format == 'COJO':
        condition = np.all(np.isin(['A1', 'A2', 'FRQ', 'BETA', 'SE', 'P', 'N'], gwas.columns))
        if not condition:
            raise ValueError('To munge GWAS data into COJO format, either A1|A2|FRQ|BETA|SE|P|N, are required.')
        else:
            gwas['Z'] = np.sqrt(chi2.isf(gwas.P, 1)) * np.where(gwas['BETA'] < 0, -1, 1)

    return gwas
|
187
|
+
|
188
|
+
|
189
|
+
def filter_info(info, config):
    '''
    Keep SNPs with INFO >= config.info_min (default 0.9) and warn about
    out-of-bounds INFO values.

    ``info`` may be a single Series or a DataFrame of several INFO columns
    (in which case the row-wise sum is compared against
    info_min * number_of_columns).

    Returns a boolean mask of SNPs to keep.
    '''
    if isinstance(info, pd.Series):  # one INFO column
        jj = ((info > 2.0) | (info < 0)) & info.notnull()
        ii = info >= config.info_min
    elif isinstance(info, pd.DataFrame):  # several INFO columns
        jj = (((info > 2.0) & info.notnull()).any(axis=1) | (
                (info < 0) & info.notnull()).any(axis=1))
        ii = (info.sum(axis=1) >= config.info_min * (len(info.columns)))
    else:
        raise ValueError('Expected pd.DataFrame or pd.Series.')

    bad_info = jj.sum()
    if bad_info > 0:
        # BUGFIX: the message now matches the bounds actually checked above
        # (previously it claimed [0,1.5] while the code tested (>2.0 | <0)).
        msg = 'WARNING: {N} SNPs had INFO outside of [0,2]. The INFO column may be mislabeled.'
        logger.warning(msg.format(N=bad_info))

    return ii
|
207
|
+
|
208
|
+
|
209
|
+
def filter_frq(frq, config):
    '''
    Filter on minor allele frequency: keep SNPs with MAF > config.maf_min
    and flag frequencies outside [0, 1].

    Returns a boolean mask of SNPs to keep.
    '''
    out_of_bounds = (frq < 0) | (frq > 1)
    n_bad = out_of_bounds.sum()
    if n_bad > 0:
        msg = 'WARNING: {N} SNPs had FRQ outside of [0,1]. The FRQ column may be mislabeled.'
        logger.warning(msg.format(N=n_bad))

    # Fold the allele frequency onto the minor-allele side before thresholding.
    maf = np.minimum(frq, 1 - frq)
    keep = (maf > config.maf_min) & ~out_of_bounds
    return keep
|
222
|
+
|
223
|
+
|
224
|
+
def filter_pvals(P, config):
    '''
    Keep P-values inside the valid interval (0, 1]; warn about the rest.

    Returns a boolean mask of SNPs to keep.
    '''
    valid = (P > 0) & (P <= 1)
    n_bad = (~valid).sum()
    if n_bad > 0:
        msg = 'WARNING: {N} SNPs had P outside of (0,1]. The P column may be mislabeled.'
        logger.warning(msg.format(N=n_bad))

    return valid
|
233
|
+
|
234
|
+
|
235
|
+
def filter_alleles(a):
    '''Return a boolean mask marking allele pairs that form strand-unambiguous SNPs.'''
    unambiguous = a.isin(VALID_SNPS)
    return unambiguous
|
238
|
+
|
239
|
+
|
240
|
+
def gwas_qc(gwas, config):
    '''
    Filter out SNPs based on missingness, INFO, FRQ/MAF, P, allele codes,
    duplicated rs numbers, and sample size.

    Parameters
    ----------
    gwas : pd.DataFrame
        Summary statistics with canonical column names (requires an N column).
    config : FormatSumstatsConfig
        Provides ``info_min`` and ``maf_min`` thresholds.

    Returns
    -------
    pd.DataFrame
        The filtered summary statistics.
    '''
    old = len(gwas)
    logger.info('\nFiltering SNPs as follows:')
    # Per-step counters of removed rows (for logging only).
    drops = {'NA': 0, 'P': 0, 'INFO': 0, 'FRQ': 0, 'A': 0, 'SNP': 0, 'Dup': 0, 'N': 0}

    # filter: SNPs with missing values (INFO may legitimately be missing,
    # so it is excluded from the completeness requirement).
    required_cols = [col for col in gwas.columns if col != 'INFO']
    gwas = gwas.dropna(axis=0, how="any", subset=required_cols).reset_index(drop=True)

    drops['NA'] += old - len(gwas)
    logger.info(f'Removed {drops["NA"]} SNPs with missing values.')

    # filter: low imputation quality
    if 'INFO' in gwas.columns:
        old = len(gwas)
        gwas = gwas.loc[filter_info(gwas['INFO'], config)]
        drops['INFO'] += old - len(gwas)
        # Log the configured threshold instead of a hard-coded 0.9.
        logger.info(f'Removed {drops["INFO"]} SNPs with INFO <= {config.info_min}.')

    # filter: low minor allele frequency
    if 'FRQ' in gwas.columns:
        old = len(gwas)
        gwas = gwas.loc[filter_frq(gwas['FRQ'], config)]
        drops['FRQ'] += old - len(gwas)
        # Log the configured threshold instead of a hard-coded 0.01.
        logger.info(f'Removed {drops["FRQ"]} SNPs with MAF <= {config.maf_min}.')

    # filter: p-values outside (0, 1]
    if 'P' in gwas.columns:
        old = len(gwas)
        gwas = gwas.loc[filter_pvals(gwas['P'], config)]
        drops['P'] += old - len(gwas)
        logger.info(f'Removed {drops["P"]} SNPs with out-of-bounds p-values.')

    # filter: variants that are not SNPs or are strand-ambiguous
    if 'A1' in gwas.columns and 'A2' in gwas.columns:
        # BUGFIX: refresh the pre-filter count here; previously the stale
        # value from the preceding filter inflated drops['A'].
        old = len(gwas)
        gwas.A1 = gwas.A1.str.upper()
        gwas.A2 = gwas.A2.str.upper()
        gwas = gwas.loc[filter_alleles(gwas.A1 + gwas.A2)]
        drops['A'] += old - len(gwas)
        logger.info(f'Removed {drops["A"]} variants that were not SNPs or were strand-ambiguous.')

    # filter: duplicated rs numbers
    if 'SNP' in gwas.columns:
        old = len(gwas)
        gwas = gwas.drop_duplicates(subset='SNP').reset_index(drop=True)
        drops['Dup'] += old - len(gwas)
        logger.info(f'Removed {drops["Dup"]} SNPs with duplicated rs numbers.')

    # filter: implausibly small sample size
    # (LDSC munge convention: keep N >= 0.9-quantile of N divided by 1.5).
    n_min = gwas.N.quantile(0.9) / 1.5
    old = len(gwas)
    gwas = gwas[gwas.N >= n_min].reset_index(drop=True)
    drops['N'] += old - len(gwas)
    logger.info(f'Removed {drops["N"]} SNPs with N < {n_min}.')

    return gwas
|
299
|
+
|
300
|
+
|
301
|
+
def variant_to_rsid(gwas, config):
    '''
    Convert variant ids ("Chr_Pos") to rsids using a dbSNP reference table.

    The dbSNP file (``config.dbsnp``) is a headered TSV with columns
    chr, pos, ref, alt, dbsnp; it is streamed in chunks of
    ``config.chunksize`` rows to bound memory usage.

    Returns
    -------
    pd.DataFrame
        Columns ['dbsnp', 'id'], indexed by 'id', with duplicates on either
        column removed.
    '''
    logger.info("\nConverting the SNP position to rsid. This process may take some time.")
    unique_ids = set(gwas['id'])
    # Detect the chromosome prefix style (e.g. 'chr1' -> 'chr', '1' -> '').
    chr_styles = gwas['Chr'].unique().astype(str)
    # BUGFIX: take the first element; indexing [1] raised IndexError for
    # GWAS data spanning a single chromosome.
    chr_format = [re.sub(r'\d+', '', value) for value in chr_styles][0]

    dtype = {'chr': str, 'pos': str, 'ref': str, 'alt': str, 'dbsnp': str}
    chunk_iter = pd.read_csv(config.dbsnp, chunksize=config.chunksize, sep="\t", skiprows=1,
                             dtype=dtype, names=['chr', 'pos', 'ref', 'alt', 'dbsnp'])

    # Iterate over chunks, collecting only rows whose id occurs in the GWAS.
    matching_id = pd.DataFrame()
    for chunk in chunk_iter:
        chunk['id'] = chr_format + chunk["chr"] + "_" + chunk["pos"]
        matching_id = pd.concat([matching_id, chunk[chunk['id'].isin(unique_ids)][['dbsnp', 'id']]])

    matching_id = matching_id.drop_duplicates(subset='dbsnp').reset_index(drop=True)
    matching_id = matching_id.drop_duplicates(subset='id').reset_index(drop=True)
    matching_id.index = matching_id.id
    return matching_id
|
324
|
+
|
325
|
+
|
326
|
+
def clean_SNP_id(gwas, config):
    '''
    Ensure every row carries an rs number: pass through when an SNP column
    already exists, otherwise map (Chr, Pos) to rsids via a dbSNP reference.

    Raises
    ------
    ValueError
        If neither an SNP column nor both Chr and Pos are available, or if
        position-based conversion is requested without a dbSNP reference.
    '''
    n_before = len(gwas)
    has_rsid = 'SNP' in gwas.columns
    has_chr_pos = np.all(np.isin(['Chr', 'Pos'], gwas.columns))

    if not (has_rsid or has_chr_pos):
        raise ValueError('Either SNP rsid, or both SNP chromosome and position, are required.')

    if not has_rsid:
        if config.dbsnp is None:
            raise ValueError('To Convert SNP positions to rsid, dbsnp reference is required.')
        # Build a "Chr_Pos" key, dedupe on it, and look up rsids in dbSNP.
        gwas['id'] = gwas["Chr"].astype(str) + "_" + gwas["Pos"].astype(str)
        gwas = gwas.drop_duplicates(subset='id').reset_index(drop=True)
        gwas.index = gwas.id

        matching_id = variant_to_rsid(gwas, config)
        gwas = gwas.loc[matching_id.id]
        gwas['SNP'] = matching_id.dbsnp
        num_fail = n_before - len(gwas)
        logger.info(f'Removed {num_fail} SNPs that did not convert to rsid.')

    return gwas
|
353
|
+
|
354
|
+
|
355
|
+
def gwas_metadata(gwas, config):
    '''
    Log key summary features of the GWAS data: mean and max chi-square,
    lambda GC, and the count of genome-wide significant SNPs.
    '''
    logger.info('\nSummary of GWAS data:')
    chisq = gwas.Z ** 2
    mean_chisq = chisq.mean()
    logger.info('Mean chi^2 = ' + str(round(mean_chisq, 3)))
    if mean_chisq < 1.02:
        logger.warning("Mean chi^2 may be too small.")

    # 0.4549 is the median of the chi^2(1) distribution.
    logger.info('Lambda GC = ' + str(round(chisq.median() / 0.4549, 3)))
    logger.info('Max chi^2 = ' + str(round(chisq.max(), 3)))
    n_sig = (chisq > 29).sum()
    logger.info('{N} Genome-wide significant SNPs (some may have been removed by filtering).'.format(N=n_sig))
|
369
|
+
|
370
|
+
|
371
|
+
def gwas_format(config: FormatSumstatsConfig):
    '''
    Format GWAS summary statistics end-to-end: read the raw file, harmonise
    column names, derive Z where needed, convert positions to rsids, run QC,
    log summary metadata, and write the result as a gzipped TSV.

    Raises
    ------
    ValueError
        If ``config.format`` is neither 'COJO' nor 'gsMap'.
    '''
    # BUGFIX: validate the requested format up front; previously an unknown
    # format fell through to a NameError on `keep` after all processing.
    if config.format not in ('COJO', 'gsMap'):
        raise ValueError(f"Unsupported output format: {config.format!r} (expected 'COJO' or 'gsMap').")

    logger.info(f'------Formating gwas data for {config.sumstats}...')
    compression_type = get_compression(config.sumstats)
    # sep=r'\s+' replaces the deprecated delim_whitespace=True (pandas >= 2.1).
    gwas = pd.read_csv(config.sumstats, sep=r'\s+', header=0, compression=compression_type,
                       na_values=['.', 'NA'])
    logger.info(f'Read {len(gwas)} SNPs from {config.sumstats}.')

    # Check name and format
    gwas = gwas_checkname(gwas, config)
    gwas = gwas_checkformat(gwas, config)
    # Clean the snp id
    gwas = clean_SNP_id(gwas, config)
    # QC
    gwas = gwas_qc(gwas, config)
    # Meta
    gwas_metadata(gwas, config)

    # Saving the data
    if config.format == 'COJO':
        keep = ['SNP', 'A1', 'A2', 'FRQ', 'BETA', 'SE', 'P', 'N']
        appendix = '.cojo'
    else:  # 'gsMap' (validated above)
        keep = ["SNP", "A1", "A2", "Z", "N"]
        appendix = '.sumstats'

    if 'Chr' in gwas.columns and 'Pos' in gwas.columns and config.keep_chr_pos is True:
        keep = keep + ['Chr', 'Pos']

    gwas = gwas[keep]
    out_name = config.out + appendix + '.gz'

    logger.info(f'\nWriting summary statistics for {len(gwas)} SNPs to {out_name}.')
    gwas.to_csv(out_name, sep="\t", index=False,
                float_format='%.3f', compression='gzip')
|