gsMap 1.62__py3-none-any.whl → 1.63__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gsMap/format_sumstats.py CHANGED
@@ -1,17 +1,15 @@
1
- import os
2
1
  import numpy as np
3
- import pandas as pd
4
- import itertools as it
5
- import math
6
- import re
7
- import argparse
8
2
  import logging
9
- from scipy.stats import chi2
3
+ import re
10
4
 
11
- from gsMap.config import FormatSumstatsConfig, add_format_sumstats_args
5
+ import math
6
+ import numpy as np
7
+ import pandas as pd
8
+ from scipy.stats import chi2
12
9
 
10
+ from gsMap.config import FormatSumstatsConfig
13
11
 
14
- VALID_SNPS = set(['AC', 'AG', 'CA', 'CT', 'GA', 'GT', 'TC', 'TG'])
12
+ VALID_SNPS = {'AC', 'AG', 'CA', 'CT', 'GA', 'GT', 'TC', 'TG'}
15
13
  logger = logging.getLogger(__name__)
16
14
 
17
15
  default_cnames = {
@@ -24,7 +22,7 @@ default_cnames = {
24
22
  # P-VALUE
25
23
  'P': 'P',
26
24
  'PVALUE': 'P',
27
- 'P_VALUE': 'P',
25
+ 'P_VALUE': 'P',
28
26
  'PVAL': 'P',
29
27
  'P_VAL': 'P',
30
28
  'GC_PVALUE': 'P',
@@ -72,7 +70,7 @@ default_cnames = {
72
70
  'EFFECT': 'BETA',
73
71
  'b': 'BETA',
74
72
  'beta': 'BETA',
75
- #SE
73
+ # SE
76
74
  'se': 'SE',
77
75
  # INFO
78
76
  'INFO': 'INFO',
@@ -103,7 +101,7 @@ def get_compression(fh):
103
101
  return compression
104
102
 
105
103
 
106
- def gwas_checkname(gwas,config):
104
+ def gwas_checkname(gwas, config):
107
105
  '''
108
106
  Iterpret column names of gwas
109
107
  '''
@@ -114,21 +112,27 @@ def gwas_checkname(gwas,config):
114
112
  gwas.columns = list(mapped_cnames.values())
115
113
 
116
114
  # When column names are provided by users
117
- name_updates = {'SNP': config.snp,'A1': config.a1,'A2': config.a2,'INFO': config.info,
118
- 'BETA': config.beta,'SE': config.se,'P': config.p,'FRQ': config.frq,'N': config.n,
119
- 'Z': config.z,'Chr': config.chr, 'Pos': config.pos,'OR':config.OR, 'SE_OR':config.se_OR}
115
+ name_updates = {'SNP': config.snp, 'A1': config.a1, 'A2': config.a2, 'INFO': config.info,
116
+ 'BETA': config.beta, 'SE': config.se, 'P': config.p, 'FRQ': config.frq, 'N': config.n,
117
+ 'Z': config.z, 'Chr': config.chr, 'Pos': config.pos, 'OR': config.OR, 'SE_OR': config.se_OR}
120
118
 
121
119
  for key, value in name_updates.items():
122
120
  if value is not None and value in gwas.columns:
123
121
  gwas.rename(columns={value: key}, inplace=True)
124
122
  new_name = gwas.columns
123
+ # check the name duplication
124
+ for head in new_name:
125
+ numc = list(new_name).count(head)
126
+ if numc > 1:
127
+ raise ValueError(f"Found {numc} different {head} columns, please check your {head} column.")
128
+
125
129
  name_dict = {new_name[i]: old_name[i] for i in range(len(new_name))}
126
130
 
127
131
  # When at OR scale
128
132
  if 'OR' in new_name and 'SE_OR' in new_name:
129
- gwas['BETA'] = gwas.OR.apply(lambda x: math.log(x) if x > 0 else None)
130
- gwas['SE'] = gwas.SE_OR.apply(lambda x: math.log(x) if x > 0 else None)
131
-
133
+ gwas['BETA'] = gwas.OR.apply(lambda x: math.log(x) if x > 0 else None)
134
+ gwas['SE'] = gwas.SE_OR.apply(lambda x: math.log(x) if x > 0 else None)
135
+
132
136
  interpreting = {
133
137
  "SNP": 'Variant ID (e.g., rs number).',
134
138
  "A1": 'Allele 1, interpreted as the effect allele for signed sumstat.',
@@ -142,7 +146,7 @@ def gwas_checkname(gwas,config):
142
146
  "N": 'Sample size.',
143
147
  "INFO": 'INFO score (imputation quality; higher → better imputation).',
144
148
  "FRQ": 'Allele frequency of A1.',
145
- "Chr":'Chromsome.',
149
+ "Chr": 'Chromsome.',
146
150
  'Pos': 'SNP positions.'
147
151
  }
148
152
 
@@ -150,45 +154,46 @@ def gwas_checkname(gwas,config):
150
154
  for key, value in interpreting.items():
151
155
  if key in new_name:
152
156
  print(f'{name_dict[key]}: {interpreting[key]}')
153
-
157
+
154
158
  return gwas
155
159
 
156
160
 
def gwas_checkformat(gwas, config):
    '''
    Check the columns required by the requested output format and derive Z.

    For ``config.format == 'gsMap'`` the table must contain P or Z, or both
    BETA and SE. An existing Z column is left untouched; otherwise Z is
    derived from P (signed by BETA when BETA is available) or, failing that,
    from BETA / SE.

    For ``config.format == 'COJO'`` the columns A1, A2, FRQ, BETA, SE, P and N
    must all be present; Z is derived from P and signed by BETA.

    Any other ``config.format`` value returns the table unchanged.

    Parameters
    ----------
    gwas : pandas.DataFrame
        Summary statistics with standardised column names.
    config : object
        Only the ``format`` attribute is read here.

    Returns
    -------
    pandas.DataFrame
        The same DataFrame object, with a ``Z`` column added when it was derived.

    Raises
    ------
    ValueError
        If the columns required by the requested format are missing.
    '''
    # Snapshot of the column index taken before any column is added.
    columns = gwas.columns

    if config.format == 'gsMap':
        has_p_or_z = 'P' in columns or 'Z' in columns
        has_beta_and_se = 'BETA' in columns and 'SE' in columns
        if not (has_p_or_z or has_beta_and_se):
            raise ValueError(
                'To munge GWAS data into gsMap format, either P or Z values, or both BETA and SE values, are required.')

        if 'Z' in columns:
            # Z was supplied by the user; nothing to derive.
            pass
        elif 'P' in columns:
            if 'BETA' in columns:
                sign = np.where(gwas['BETA'] < 0, -1, 1)
            else:
                # The check above accepts P on its own, so BETA may be absent
                # here; indexing gwas['BETA'] unconditionally raised KeyError.
                # Without an effect direction only the magnitude of Z is known.
                print('BETA column not found; Z is derived from P without a sign.')
                sign = 1
            # Two-sided P -> chi-square (1 df) quantile -> |Z|.
            gwas['Z'] = np.sqrt(chi2.isf(gwas.P, 1)) * sign
        else:
            gwas['Z'] = gwas.BETA / gwas.SE

    elif config.format == 'COJO':
        required = ['A1', 'A2', 'FRQ', 'BETA', 'SE', 'P', 'N']
        if not all(name in columns for name in required):
            raise ValueError('To munge GWAS data into COJO format, either A1|A2|FRQ|BETA|SE|P|N, are required.')
        gwas['Z'] = np.sqrt(chi2.isf(gwas.P, 1)) * np.where(gwas['BETA'] < 0, -1, 1)

    return gwas
182
187
 
183
188
 
184
- def filter_info(info,config):
189
+ def filter_info(info, config):
185
190
  '''Remove INFO < args.info_min (default 0.9) and complain about out-of-bounds INFO.'''
186
191
  if type(info) is pd.Series: # one INFO column
187
192
  jj = ((info > 2.0) | (info < 0)) & info.notnull()
188
193
  ii = info >= config.info_min
189
194
  elif type(info) is pd.DataFrame: # several INFO columns
190
195
  jj = (((info > 2.0) & info.notnull()).any(axis=1) | (
191
- (info < 0) & info.notnull()).any(axis=1))
196
+ (info < 0) & info.notnull()).any(axis=1))
192
197
  ii = (info.sum(axis=1) >= config.info_min * (len(info.columns)))
193
198
  else:
194
199
  raise ValueError('Expected pd.DataFrame or pd.Series.')
@@ -201,7 +206,7 @@ def filter_info(info,config):
201
206
  return ii
202
207
 
203
208
 
204
- def filter_frq(frq,config):
209
+ def filter_frq(frq, config):
205
210
  '''
206
211
  Filter on MAF. Remove MAF < args.maf_min and out-of-bounds MAF.
207
212
  '''
@@ -216,7 +221,7 @@ def filter_frq(frq,config):
216
221
  return ii & ~jj
217
222
 
218
223
 
219
- def filter_pvals(P,config):
224
+ def filter_pvals(P, config):
220
225
  '''Remove out-of-bounds P-values'''
221
226
  ii = (P > 0) & (P <= 1)
222
227
  bad_p = (~ii).sum()
@@ -232,17 +237,17 @@ def filter_alleles(a):
232
237
  return a.isin(VALID_SNPS)
233
238
 
234
239
 
235
- def gwas_qc(gwas,config):
240
+ def gwas_qc(gwas, config):
236
241
  '''
237
242
  Filter out SNPs based on INFO, FRQ, MAF, N, and Genotypes.
238
243
  '''
239
244
  old = len(gwas)
240
245
  print(f'\nFiltering SNPs as follows:')
241
246
  # filter: SNPs with missing values
242
- drops = {'NA': 0, 'P': 0, 'INFO': 0, 'FRQ': 0, 'A': 0, 'SNP': 0, 'Dup': 0, 'N':0}
247
+ drops = {'NA': 0, 'P': 0, 'INFO': 0, 'FRQ': 0, 'A': 0, 'SNP': 0, 'Dup': 0, 'N': 0}
243
248
 
244
249
  gwas = gwas.dropna(axis=0, how="any", subset=filter(
245
- lambda x: x != 'INFO', gwas.columns)).reset_index(drop=True)
250
+ lambda x: x != 'INFO', gwas.columns)).reset_index(drop=True)
246
251
 
247
252
  drops['NA'] = old - len(gwas)
248
253
  print(f'Removed {drops["NA"]} SNPs with missing values.')
@@ -250,21 +255,21 @@ def gwas_qc(gwas,config):
250
255
  # filter: SNPs with Info < 0.9
251
256
  if 'INFO' in gwas.columns:
252
257
  old = len(gwas)
253
- gwas = gwas.loc[filter_info(gwas['INFO'],config)]
258
+ gwas = gwas.loc[filter_info(gwas['INFO'], config)]
254
259
  drops['INFO'] = old - len(gwas)
255
260
  print(f'Removed {drops["INFO"]} SNPs with INFO <= 0.9.')
256
261
 
257
262
  # filter: SNPs with MAF <= 0.01
258
263
  if 'FRQ' in gwas.columns:
259
264
  old = len(gwas)
260
- gwas = gwas.loc[filter_frq(gwas['FRQ'],config)]
265
+ gwas = gwas.loc[filter_frq(gwas['FRQ'], config)]
261
266
  drops['FRQ'] += old - len(gwas)
262
267
  print(f'Removed {drops["FRQ"]} SNPs with MAF <= 0.01.')
263
268
 
264
269
  # filter: P-value that out-of-bounds [0,1]
265
270
  if 'P' in gwas.columns:
266
271
  old = len(gwas)
267
- gwas = gwas.loc[filter_pvals(gwas['P'],config)]
272
+ gwas = gwas.loc[filter_pvals(gwas['P'], config)]
268
273
  drops['P'] += old - len(gwas)
269
274
  print(f'Removed {drops["P"]} SNPs with out-of-bounds p-values.')
270
275
 
@@ -289,11 +294,11 @@ def gwas_qc(gwas,config):
289
294
  gwas = gwas[gwas.N >= n_min].reset_index(drop=True)
290
295
  drops['N'] += old - len(gwas)
291
296
  print(f'Removed {drops["N"]} SNPs with N < {n_min}.')
292
-
297
+
293
298
  return gwas
294
299
 
295
300
 
296
- def variant_to_rsid(gwas,config):
301
+ def variant_to_rsid(gwas, config):
297
302
  '''
298
303
  Convert variant id (Chr, Pos) to rsid
299
304
  '''
@@ -303,42 +308,42 @@ def variant_to_rsid(gwas,config):
303
308
  chr_format = [re.sub(r'\d+', '', value) for value in chr_format][1]
304
309
 
305
310
  dtype = {'chr': str, 'pos': str, 'ref': str, 'alt': str, 'dbsnp': str}
306
- chunk_iter = pd.read_csv(config.dbsnp, chunksize=config.chunksize, sep="\t", skiprows=1,
311
+ chunk_iter = pd.read_csv(config.dbsnp, chunksize=config.chunksize, sep="\t", skiprows=1,
307
312
  dtype=dtype, names=['chr', 'pos', 'ref', 'alt', 'dbsnp'])
308
313
 
309
314
  # Iterate over chunks
310
315
  matching_id = pd.DataFrame()
311
316
  for chunk in chunk_iter:
312
- chunk['id'] = chr_format+chunk["chr"]+"_"+chunk["pos"]
313
- matching_id = pd.concat([matching_id, chunk[chunk['id'].isin(unique_ids)][['dbsnp','id']]])
314
-
317
+ chunk['id'] = chr_format + chunk["chr"] + "_" + chunk["pos"]
318
+ matching_id = pd.concat([matching_id, chunk[chunk['id'].isin(unique_ids)][['dbsnp', 'id']]])
319
+
315
320
  matching_id = matching_id.drop_duplicates(subset='dbsnp').reset_index(drop=True)
316
321
  matching_id = matching_id.drop_duplicates(subset='id').reset_index(drop=True)
317
322
  matching_id.index = matching_id.id
318
323
  return matching_id
319
324
 
320
325
 
321
- def clean_SNP_id(gwas,config):
326
+ def clean_SNP_id(gwas, config):
322
327
  '''
323
328
  Clean SNP id
324
329
  '''
325
330
  old = len(gwas)
326
331
  condition1 = 'SNP' in gwas.columns
327
- condition2 = np.all(np.isin(['Chr', 'Pos'],gwas.columns))
332
+ condition2 = np.all(np.isin(['Chr', 'Pos'], gwas.columns))
328
333
 
329
334
  if not (condition1 or condition2):
330
- raise ValueError('Either SNP rsid, or both SNP chromosome and position, are required.')
335
+ raise ValueError('Either SNP rsid, or both SNP chromosome and position, are required.')
331
336
  elif condition1:
332
337
  pass
333
338
  elif condition2:
334
339
  if config.dbsnp is None:
335
- raise ValueError('To Convert SNP positions to rsid, dbsnp reference is required.')
340
+ raise ValueError('To Convert SNP positions to rsid, dbsnp reference is required.')
336
341
  else:
337
- gwas['id'] = gwas["Chr"].astype(str)+"_"+gwas["Pos"].astype(str)
342
+ gwas['id'] = gwas["Chr"].astype(str) + "_" + gwas["Pos"].astype(str)
338
343
  gwas = gwas.drop_duplicates(subset='id').reset_index(drop=True)
339
344
  gwas.index = gwas.id
340
-
341
- matching_id = variant_to_rsid(gwas,config)
345
+
346
+ matching_id = variant_to_rsid(gwas, config)
342
347
  gwas = gwas.loc[matching_id.id]
343
348
  gwas['SNP'] = matching_id.dbsnp
344
349
  num_fail = old - len(gwas)
@@ -347,7 +352,7 @@ def clean_SNP_id(gwas,config):
347
352
  return gwas
348
353
 
349
354
 
350
- def gwas_metadata(gwas,config):
355
+ def gwas_metadata(gwas, config):
351
356
  '''
352
357
  Report key features of GWAS data
353
358
  '''
@@ -360,51 +365,43 @@ def gwas_metadata(gwas,config):
360
365
 
361
366
  print('Lambda GC = ' + str(round(CHISQ.median() / 0.4549, 3)))
362
367
  print('Max chi^2 = ' + str(round(CHISQ.max(), 3)))
363
- print('{N} Genome-wide significant SNPs (some may have been removed by filtering).'.format(N=(CHISQ> 29).sum()))
368
+ print('{N} Genome-wide significant SNPs (some may have been removed by filtering).'.format(N=(CHISQ > 29).sum()))
364
369
 
365
370
 
def gwas_format(config: FormatSumstatsConfig):
    '''
    Format GWAS summary statistics and write them to a gzipped, tab-separated file.

    The table at ``config.sumstats`` is read as whitespace-delimited text,
    passed through column-name checking, format checking, SNP-id cleaning and
    QC, summarised on stdout, reduced to the columns of the requested format
    and written to ``config.out + <suffix> + '.gz'``.

    Parameters
    ----------
    config : FormatSumstatsConfig
        Attributes read directly here: ``sumstats``, ``format``, ``out`` and
        ``keep_chr_pos``. The object is also handed to each helper.

    Raises
    ------
    ValueError
        If ``config.format`` is neither ``'COJO'`` nor ``'gsMap'``.
    '''
    # Columns written out and file suffix for each supported output format.
    output_layouts = {
        'COJO': (['SNP', 'A1', 'A2', 'FRQ', 'BETA', 'SE', 'P', 'N'], '.cojo'),
        'gsMap': (["SNP", "A1", "A2", "Z", "N"], '.sumstats'),
    }
    # Fail before reading anything: an unknown format used to leave `keep`
    # and `appendix` unbound and crash with NameError after the full pipeline.
    if config.format not in output_layouts:
        raise ValueError(
            f"Unsupported output format {config.format!r}; expected one of {sorted(output_layouts)}.")
    keep, appendix = output_layouts[config.format]

    print(f'------Formating gwas data for {config.sumstats}...')
    compression_type = get_compression(config.sumstats)
    # sep=r'\s+' is the documented replacement for the deprecated
    # delim_whitespace=True and splits on the same runs of whitespace.
    gwas = pd.read_csv(config.sumstats, sep=r'\s+', header=0, compression=compression_type,
                       na_values=['.', 'NA'])
    print(f'Read {len(gwas)} SNPs from {config.sumstats}.')

    # Check name and format
    gwas = gwas_checkname(gwas, config)
    gwas = gwas_checkformat(gwas, config)
    # Clean the snp id
    gwas = clean_SNP_id(gwas, config)
    # QC
    gwas = gwas_qc(gwas, config)
    # Meta
    gwas_metadata(gwas, config)

    # Optionally carry the genomic coordinates through to the output.
    if 'Chr' in gwas.columns and 'Pos' in gwas.columns and config.keep_chr_pos is True:
        keep = keep + ['Chr', 'Pos']

    # Saving the data
    gwas = gwas[keep]
    out_name = config.out + appendix + '.gz'

    print(f'\nWriting summary statistics for {len(gwas)} SNPs to {out_name}.')
    # NOTE(review): '%.3f' is applied to every float column, including P in the
    # COJO layout, so very small p-values are written as 0.000 - confirm that
    # this precision is intended.
    gwas.to_csv(out_name, sep="\t", index=False,
                float_format='%.3f', compression='gzip')