gsMap 1.62__py3-none-any.whl → 1.63__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/GNN_VAE/adjacency_matrix.py +1 -1
- gsMap/GNN_VAE/model.py +5 -5
- gsMap/GNN_VAE/train.py +1 -1
- gsMap/__init__.py +1 -1
- gsMap/cauchy_combination_test.py +14 -36
- gsMap/config.py +473 -404
- gsMap/diagnosis.py +273 -0
- gsMap/find_latent_representation.py +22 -86
- gsMap/format_sumstats.py +79 -82
- gsMap/generate_ldscore.py +145 -78
- gsMap/latent_to_gene.py +65 -104
- gsMap/main.py +1 -9
- gsMap/report.py +160 -0
- gsMap/run_all_mode.py +195 -0
- gsMap/spatial_ldsc_multiple_sumstats.py +187 -112
- gsMap/templates/report_template.html +198 -0
- gsMap/utils/__init__.py +0 -0
- gsMap/{generate_r2_matrix.py → utils/generate_r2_matrix.py} +1 -9
- gsMap/{make_annotations.py → utils/make_annotations.py} +1 -43
- gsMap/utils/manhattan_plot.py +639 -0
- gsMap/{regression_read.py → utils/regression_read.py} +1 -1
- gsMap/visualize.py +100 -55
- {gsmap-1.62.dist-info → gsmap-1.63.dist-info}/METADATA +16 -46
- gsmap-1.63.dist-info/RECORD +30 -0
- gsmap-1.62.dist-info/RECORD +0 -24
- /gsMap/{jackknife.py → utils/jackknife.py} +0 -0
- {gsmap-1.62.dist-info → gsmap-1.63.dist-info}/LICENSE +0 -0
- {gsmap-1.62.dist-info → gsmap-1.63.dist-info}/WHEEL +0 -0
- {gsmap-1.62.dist-info → gsmap-1.63.dist-info}/entry_points.txt +0 -0
gsMap/format_sumstats.py
CHANGED
@@ -1,17 +1,15 @@
|
|
1
|
-
import os
|
2
1
|
import numpy as np
|
3
|
-
import pandas as pd
|
4
|
-
import itertools as it
|
5
|
-
import math
|
6
|
-
import re
|
7
|
-
import argparse
|
8
2
|
import logging
|
9
|
-
|
3
|
+
import re
|
10
4
|
|
11
|
-
|
5
|
+
import math
|
6
|
+
import numpy as np
|
7
|
+
import pandas as pd
|
8
|
+
from scipy.stats import chi2
|
12
9
|
|
10
|
+
from gsMap.config import FormatSumstatsConfig
|
13
11
|
|
14
|
-
VALID_SNPS =
|
12
|
+
VALID_SNPS = {'AC', 'AG', 'CA', 'CT', 'GA', 'GT', 'TC', 'TG'}
|
15
13
|
logger = logging.getLogger(__name__)
|
16
14
|
|
17
15
|
default_cnames = {
|
@@ -24,7 +22,7 @@ default_cnames = {
|
|
24
22
|
# P-VALUE
|
25
23
|
'P': 'P',
|
26
24
|
'PVALUE': 'P',
|
27
|
-
'P_VALUE':
|
25
|
+
'P_VALUE': 'P',
|
28
26
|
'PVAL': 'P',
|
29
27
|
'P_VAL': 'P',
|
30
28
|
'GC_PVALUE': 'P',
|
@@ -72,7 +70,7 @@ default_cnames = {
|
|
72
70
|
'EFFECT': 'BETA',
|
73
71
|
'b': 'BETA',
|
74
72
|
'beta': 'BETA',
|
75
|
-
#SE
|
73
|
+
# SE
|
76
74
|
'se': 'SE',
|
77
75
|
# INFO
|
78
76
|
'INFO': 'INFO',
|
@@ -103,7 +101,7 @@ def get_compression(fh):
|
|
103
101
|
return compression
|
104
102
|
|
105
103
|
|
106
|
-
def gwas_checkname(gwas,config):
|
104
|
+
def gwas_checkname(gwas, config):
|
107
105
|
'''
|
108
106
|
Iterpret column names of gwas
|
109
107
|
'''
|
@@ -114,21 +112,27 @@ def gwas_checkname(gwas,config):
|
|
114
112
|
gwas.columns = list(mapped_cnames.values())
|
115
113
|
|
116
114
|
# When column names are provided by users
|
117
|
-
name_updates = {'SNP': config.snp,'A1': config.a1,'A2': config.a2,'INFO': config.info,
|
118
|
-
'BETA': config.beta,'SE': config.se,'P': config.p,'FRQ': config.frq,'N': config.n,
|
119
|
-
'Z': config.z,'Chr': config.chr, 'Pos': config.pos,'OR':config.OR, 'SE_OR':config.se_OR}
|
115
|
+
name_updates = {'SNP': config.snp, 'A1': config.a1, 'A2': config.a2, 'INFO': config.info,
|
116
|
+
'BETA': config.beta, 'SE': config.se, 'P': config.p, 'FRQ': config.frq, 'N': config.n,
|
117
|
+
'Z': config.z, 'Chr': config.chr, 'Pos': config.pos, 'OR': config.OR, 'SE_OR': config.se_OR}
|
120
118
|
|
121
119
|
for key, value in name_updates.items():
|
122
120
|
if value is not None and value in gwas.columns:
|
123
121
|
gwas.rename(columns={value: key}, inplace=True)
|
124
122
|
new_name = gwas.columns
|
123
|
+
# check the name duplication
|
124
|
+
for head in new_name:
|
125
|
+
numc = list(new_name).count(head)
|
126
|
+
if numc > 1:
|
127
|
+
raise ValueError(f"Found {numc} different {head} columns, please check your {head} column.")
|
128
|
+
|
125
129
|
name_dict = {new_name[i]: old_name[i] for i in range(len(new_name))}
|
126
130
|
|
127
131
|
# When at OR scale
|
128
132
|
if 'OR' in new_name and 'SE_OR' in new_name:
|
129
|
-
gwas['BETA'] = gwas.OR.apply(lambda x: math.log(x) if x > 0 else None)
|
130
|
-
gwas['SE'] = gwas.SE_OR.apply(lambda x: math.log(x) if x > 0 else None)
|
131
|
-
|
133
|
+
gwas['BETA'] = gwas.OR.apply(lambda x: math.log(x) if x > 0 else None)
|
134
|
+
gwas['SE'] = gwas.SE_OR.apply(lambda x: math.log(x) if x > 0 else None)
|
135
|
+
|
132
136
|
interpreting = {
|
133
137
|
"SNP": 'Variant ID (e.g., rs number).',
|
134
138
|
"A1": 'Allele 1, interpreted as the effect allele for signed sumstat.',
|
@@ -142,7 +146,7 @@ def gwas_checkname(gwas,config):
|
|
142
146
|
"N": 'Sample size.',
|
143
147
|
"INFO": 'INFO score (imputation quality; higher → better imputation).',
|
144
148
|
"FRQ": 'Allele frequency of A1.',
|
145
|
-
"Chr":'Chromsome.',
|
149
|
+
"Chr": 'Chromsome.',
|
146
150
|
'Pos': 'SNP positions.'
|
147
151
|
}
|
148
152
|
|
@@ -150,45 +154,46 @@ def gwas_checkname(gwas,config):
|
|
150
154
|
for key, value in interpreting.items():
|
151
155
|
if key in new_name:
|
152
156
|
print(f'{name_dict[key]}: {interpreting[key]}')
|
153
|
-
|
157
|
+
|
154
158
|
return gwas
|
155
159
|
|
156
160
|
|
157
|
-
def gwas_checkformat(gwas,config):
|
161
|
+
def gwas_checkformat(gwas, config):
|
158
162
|
'''
|
159
163
|
Check column names required for different format
|
160
164
|
'''
|
161
|
-
if config.format=='gsMap':
|
162
|
-
condition1 = np.any(np.isin(['P', 'Z'],gwas.columns))
|
163
|
-
condition2 = np.all(np.isin(['BETA', 'SE'],gwas.columns))
|
165
|
+
if config.format == 'gsMap':
|
166
|
+
condition1 = np.any(np.isin(['P', 'Z'], gwas.columns))
|
167
|
+
condition2 = np.all(np.isin(['BETA', 'SE'], gwas.columns))
|
164
168
|
if not (condition1 or condition2):
|
165
|
-
raise ValueError(
|
169
|
+
raise ValueError(
|
170
|
+
'To munge GWAS data into gsMap format, either P or Z values, or both BETA and SE values, are required.')
|
166
171
|
else:
|
167
172
|
if 'Z' in gwas.columns:
|
168
173
|
pass
|
169
174
|
elif 'P' in gwas.columns:
|
170
|
-
gwas['Z'] = np.sqrt(chi2.isf(gwas.P, 1)) * np.where(gwas['BETA'] < 0, -1, 1)
|
175
|
+
gwas['Z'] = np.sqrt(chi2.isf(gwas.P, 1)) * np.where(gwas['BETA'] < 0, -1, 1)
|
171
176
|
else:
|
172
177
|
gwas['Z'] = gwas.BETA / gwas.SE
|
173
178
|
|
174
|
-
elif config.format=='COJO':
|
175
|
-
condition = np.all(np.isin(['A1','A2','FRQ','BETA','SE','P','N'],gwas.columns))
|
176
|
-
if not condition:
|
179
|
+
elif config.format == 'COJO':
|
180
|
+
condition = np.all(np.isin(['A1', 'A2', 'FRQ', 'BETA', 'SE', 'P', 'N'], gwas.columns))
|
181
|
+
if not condition:
|
177
182
|
raise ValueError('To munge GWAS data into COJO format, either A1|A2|FRQ|BETA|SE|P|N, are required.')
|
178
183
|
else:
|
179
184
|
gwas['Z'] = np.sqrt(chi2.isf(gwas.P, 1)) * np.where(gwas['BETA'] < 0, -1, 1)
|
180
|
-
|
185
|
+
|
181
186
|
return gwas
|
182
187
|
|
183
188
|
|
184
|
-
def filter_info(info,config):
|
189
|
+
def filter_info(info, config):
|
185
190
|
'''Remove INFO < args.info_min (default 0.9) and complain about out-of-bounds INFO.'''
|
186
191
|
if type(info) is pd.Series: # one INFO column
|
187
192
|
jj = ((info > 2.0) | (info < 0)) & info.notnull()
|
188
193
|
ii = info >= config.info_min
|
189
194
|
elif type(info) is pd.DataFrame: # several INFO columns
|
190
195
|
jj = (((info > 2.0) & info.notnull()).any(axis=1) | (
|
191
|
-
|
196
|
+
(info < 0) & info.notnull()).any(axis=1))
|
192
197
|
ii = (info.sum(axis=1) >= config.info_min * (len(info.columns)))
|
193
198
|
else:
|
194
199
|
raise ValueError('Expected pd.DataFrame or pd.Series.')
|
@@ -201,7 +206,7 @@ def filter_info(info,config):
|
|
201
206
|
return ii
|
202
207
|
|
203
208
|
|
204
|
-
def filter_frq(frq,config):
|
209
|
+
def filter_frq(frq, config):
|
205
210
|
'''
|
206
211
|
Filter on MAF. Remove MAF < args.maf_min and out-of-bounds MAF.
|
207
212
|
'''
|
@@ -216,7 +221,7 @@ def filter_frq(frq,config):
|
|
216
221
|
return ii & ~jj
|
217
222
|
|
218
223
|
|
219
|
-
def filter_pvals(P,config):
|
224
|
+
def filter_pvals(P, config):
|
220
225
|
'''Remove out-of-bounds P-values'''
|
221
226
|
ii = (P > 0) & (P <= 1)
|
222
227
|
bad_p = (~ii).sum()
|
@@ -232,17 +237,17 @@ def filter_alleles(a):
|
|
232
237
|
return a.isin(VALID_SNPS)
|
233
238
|
|
234
239
|
|
235
|
-
def gwas_qc(gwas,config):
|
240
|
+
def gwas_qc(gwas, config):
|
236
241
|
'''
|
237
242
|
Filter out SNPs based on INFO, FRQ, MAF, N, and Genotypes.
|
238
243
|
'''
|
239
244
|
old = len(gwas)
|
240
245
|
print(f'\nFiltering SNPs as follows:')
|
241
246
|
# filter: SNPs with missing values
|
242
|
-
drops = {'NA': 0, 'P': 0, 'INFO': 0, 'FRQ': 0, 'A': 0, 'SNP': 0, 'Dup': 0, 'N':0}
|
247
|
+
drops = {'NA': 0, 'P': 0, 'INFO': 0, 'FRQ': 0, 'A': 0, 'SNP': 0, 'Dup': 0, 'N': 0}
|
243
248
|
|
244
249
|
gwas = gwas.dropna(axis=0, how="any", subset=filter(
|
245
|
-
|
250
|
+
lambda x: x != 'INFO', gwas.columns)).reset_index(drop=True)
|
246
251
|
|
247
252
|
drops['NA'] = old - len(gwas)
|
248
253
|
print(f'Removed {drops["NA"]} SNPs with missing values.')
|
@@ -250,21 +255,21 @@ def gwas_qc(gwas,config):
|
|
250
255
|
# filter: SNPs with Info < 0.9
|
251
256
|
if 'INFO' in gwas.columns:
|
252
257
|
old = len(gwas)
|
253
|
-
gwas = gwas.loc[filter_info(gwas['INFO'],config)]
|
258
|
+
gwas = gwas.loc[filter_info(gwas['INFO'], config)]
|
254
259
|
drops['INFO'] = old - len(gwas)
|
255
260
|
print(f'Removed {drops["INFO"]} SNPs with INFO <= 0.9.')
|
256
261
|
|
257
262
|
# filter: SNPs with MAF <= 0.01
|
258
263
|
if 'FRQ' in gwas.columns:
|
259
264
|
old = len(gwas)
|
260
|
-
gwas = gwas.loc[filter_frq(gwas['FRQ'],config)]
|
265
|
+
gwas = gwas.loc[filter_frq(gwas['FRQ'], config)]
|
261
266
|
drops['FRQ'] += old - len(gwas)
|
262
267
|
print(f'Removed {drops["FRQ"]} SNPs with MAF <= 0.01.')
|
263
268
|
|
264
269
|
# filter: P-value that out-of-bounds [0,1]
|
265
270
|
if 'P' in gwas.columns:
|
266
271
|
old = len(gwas)
|
267
|
-
gwas = gwas.loc[filter_pvals(gwas['P'],config)]
|
272
|
+
gwas = gwas.loc[filter_pvals(gwas['P'], config)]
|
268
273
|
drops['P'] += old - len(gwas)
|
269
274
|
print(f'Removed {drops["P"]} SNPs with out-of-bounds p-values.')
|
270
275
|
|
@@ -289,11 +294,11 @@ def gwas_qc(gwas,config):
|
|
289
294
|
gwas = gwas[gwas.N >= n_min].reset_index(drop=True)
|
290
295
|
drops['N'] += old - len(gwas)
|
291
296
|
print(f'Removed {drops["N"]} SNPs with N < {n_min}.')
|
292
|
-
|
297
|
+
|
293
298
|
return gwas
|
294
299
|
|
295
300
|
|
296
|
-
def variant_to_rsid(gwas,config):
|
301
|
+
def variant_to_rsid(gwas, config):
|
297
302
|
'''
|
298
303
|
Convert variant id (Chr, Pos) to rsid
|
299
304
|
'''
|
@@ -303,42 +308,42 @@ def variant_to_rsid(gwas,config):
|
|
303
308
|
chr_format = [re.sub(r'\d+', '', value) for value in chr_format][1]
|
304
309
|
|
305
310
|
dtype = {'chr': str, 'pos': str, 'ref': str, 'alt': str, 'dbsnp': str}
|
306
|
-
chunk_iter = pd.read_csv(config.dbsnp, chunksize=config.chunksize, sep="\t",
|
311
|
+
chunk_iter = pd.read_csv(config.dbsnp, chunksize=config.chunksize, sep="\t", skiprows=1,
|
307
312
|
dtype=dtype, names=['chr', 'pos', 'ref', 'alt', 'dbsnp'])
|
308
313
|
|
309
314
|
# Iterate over chunks
|
310
315
|
matching_id = pd.DataFrame()
|
311
316
|
for chunk in chunk_iter:
|
312
|
-
chunk['id'] = chr_format+chunk["chr"]+"_"+chunk["pos"]
|
313
|
-
matching_id = pd.concat([matching_id, chunk[chunk['id'].isin(unique_ids)][['dbsnp','id']]])
|
314
|
-
|
317
|
+
chunk['id'] = chr_format + chunk["chr"] + "_" + chunk["pos"]
|
318
|
+
matching_id = pd.concat([matching_id, chunk[chunk['id'].isin(unique_ids)][['dbsnp', 'id']]])
|
319
|
+
|
315
320
|
matching_id = matching_id.drop_duplicates(subset='dbsnp').reset_index(drop=True)
|
316
321
|
matching_id = matching_id.drop_duplicates(subset='id').reset_index(drop=True)
|
317
322
|
matching_id.index = matching_id.id
|
318
323
|
return matching_id
|
319
324
|
|
320
325
|
|
321
|
-
def clean_SNP_id(gwas,config):
|
326
|
+
def clean_SNP_id(gwas, config):
|
322
327
|
'''
|
323
328
|
Clean SNP id
|
324
329
|
'''
|
325
330
|
old = len(gwas)
|
326
331
|
condition1 = 'SNP' in gwas.columns
|
327
|
-
condition2 = np.all(np.isin(['Chr', 'Pos'],gwas.columns))
|
332
|
+
condition2 = np.all(np.isin(['Chr', 'Pos'], gwas.columns))
|
328
333
|
|
329
334
|
if not (condition1 or condition2):
|
330
|
-
|
335
|
+
raise ValueError('Either SNP rsid, or both SNP chromosome and position, are required.')
|
331
336
|
elif condition1:
|
332
337
|
pass
|
333
338
|
elif condition2:
|
334
339
|
if config.dbsnp is None:
|
335
|
-
|
340
|
+
raise ValueError('To Convert SNP positions to rsid, dbsnp reference is required.')
|
336
341
|
else:
|
337
|
-
gwas['id'] = gwas["Chr"].astype(str)+"_"+gwas["Pos"].astype(str)
|
342
|
+
gwas['id'] = gwas["Chr"].astype(str) + "_" + gwas["Pos"].astype(str)
|
338
343
|
gwas = gwas.drop_duplicates(subset='id').reset_index(drop=True)
|
339
344
|
gwas.index = gwas.id
|
340
|
-
|
341
|
-
matching_id = variant_to_rsid(gwas,config)
|
345
|
+
|
346
|
+
matching_id = variant_to_rsid(gwas, config)
|
342
347
|
gwas = gwas.loc[matching_id.id]
|
343
348
|
gwas['SNP'] = matching_id.dbsnp
|
344
349
|
num_fail = old - len(gwas)
|
@@ -347,7 +352,7 @@ def clean_SNP_id(gwas,config):
|
|
347
352
|
return gwas
|
348
353
|
|
349
354
|
|
350
|
-
def gwas_metadata(gwas,config):
|
355
|
+
def gwas_metadata(gwas, config):
|
351
356
|
'''
|
352
357
|
Report key features of GWAS data
|
353
358
|
'''
|
@@ -360,51 +365,43 @@ def gwas_metadata(gwas,config):
|
|
360
365
|
|
361
366
|
print('Lambda GC = ' + str(round(CHISQ.median() / 0.4549, 3)))
|
362
367
|
print('Max chi^2 = ' + str(round(CHISQ.max(), 3)))
|
363
|
-
print('{N} Genome-wide significant SNPs (some may have been removed by filtering).'.format(N=(CHISQ> 29).sum()))
|
368
|
+
print('{N} Genome-wide significant SNPs (some may have been removed by filtering).'.format(N=(CHISQ > 29).sum()))
|
364
369
|
|
365
370
|
|
366
|
-
def gwas_format(config:FormatSumstatsConfig):
|
371
|
+
def gwas_format(config: FormatSumstatsConfig):
|
367
372
|
'''
|
368
373
|
Format GWAS data
|
369
374
|
'''
|
370
375
|
print(f'------Formating gwas data for {config.sumstats}...')
|
371
|
-
|
372
|
-
gwas = pd.read_csv(config.sumstats,delim_whitespace=True,
|
373
|
-
|
376
|
+
compression_type = get_compression(config.sumstats)
|
377
|
+
gwas = pd.read_csv(config.sumstats, delim_whitespace=True, header=0, compression=compression_type,
|
378
|
+
na_values=['.', 'NA'])
|
374
379
|
print(f'Read {len(gwas)} SNPs from {config.sumstats}.')
|
375
|
-
|
380
|
+
|
376
381
|
# Check name and format
|
377
|
-
gwas = gwas_checkname(gwas,config)
|
378
|
-
gwas = gwas_checkformat(gwas,config)
|
382
|
+
gwas = gwas_checkname(gwas, config)
|
383
|
+
gwas = gwas_checkformat(gwas, config)
|
379
384
|
# Clean the snp id
|
380
|
-
gwas = clean_SNP_id(gwas,config)
|
385
|
+
gwas = clean_SNP_id(gwas, config)
|
381
386
|
# QC
|
382
|
-
gwas = gwas_qc(gwas,config)
|
387
|
+
gwas = gwas_qc(gwas, config)
|
383
388
|
# Meta
|
384
|
-
gwas_metadata(gwas,config)
|
385
|
-
|
389
|
+
gwas_metadata(gwas, config)
|
390
|
+
|
386
391
|
# Saving the data
|
387
|
-
if config.format=='COJO':
|
388
|
-
keep = ['SNP','A1','A2','FRQ','BETA','SE','P','N']
|
392
|
+
if config.format == 'COJO':
|
393
|
+
keep = ['SNP', 'A1', 'A2', 'FRQ', 'BETA', 'SE', 'P', 'N']
|
389
394
|
appendix = '.cojo'
|
390
|
-
elif config.format=='gsMap':
|
391
|
-
keep = ["A1","A2","Z","N"
|
395
|
+
elif config.format == 'gsMap':
|
396
|
+
keep = ["SNP", "A1", "A2", "Z", "N"]
|
392
397
|
appendix = '.sumstats'
|
393
398
|
|
394
399
|
if 'Chr' in gwas.columns and 'Pos' in gwas.columns and config.keep_chr_pos is True:
|
395
|
-
keep = keep + ['Chr','Pos']
|
400
|
+
keep = keep + ['Chr', 'Pos']
|
396
401
|
|
397
402
|
gwas = gwas[keep]
|
398
|
-
out_name = config.out + appendix +'.gz'
|
399
|
-
|
403
|
+
out_name = config.out + appendix + '.gz'
|
404
|
+
|
400
405
|
print(f'\nWriting summary statistics for {len(gwas)} SNPs to {out_name}.')
|
401
406
|
gwas.to_csv(out_name, sep="\t", index=False,
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
if __name__ == '__main__':
|
406
|
-
parser = argparse.ArgumentParser(description="Visualization the results")
|
407
|
-
parser = add_format_sumstats_args(parser)
|
408
|
-
args = parser.parse_args()
|
409
|
-
config = FormatSumstatsConfig(**vars(args))
|
410
|
-
gwas_format(config)
|
407
|
+
float_format='%.3f', compression='gzip')
|