gsMap 1.71.2__py3-none-any.whl → 1.72.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gsMap/format_sumstats.py CHANGED
@@ -1,100 +1,98 @@
- import numpy as np
  import logging
+ import math
  import re

- import math
  import numpy as np
  import pandas as pd
  from scipy.stats import chi2

  from gsMap.config import FormatSumstatsConfig

- VALID_SNPS = {'AC', 'AG', 'CA', 'CT', 'GA', 'GT', 'TC', 'TG'}
+ VALID_SNPS = {"AC", "AG", "CA", "CT", "GA", "GT", "TC", "TG"}
  logger = logging.getLogger(__name__)

  default_cnames = {
      # RS NUMBER
-     'SNP': 'SNP',
-     'RS': 'SNP',
-     'RSID': 'SNP',
-     'RS_NUMBER': 'SNP',
-     'RS_NUMBERS': 'SNP',
+     "SNP": "SNP",
+     "RS": "SNP",
+     "RSID": "SNP",
+     "RS_NUMBER": "SNP",
+     "RS_NUMBERS": "SNP",
      # P-VALUE
-     'P': 'P',
-     'PVALUE': 'P',
-     'P_VALUE': 'P',
-     'PVAL': 'P',
-     'P_VAL': 'P',
-     'GC_PVALUE': 'P',
-     'p': 'P',
+     "P": "P",
+     "PVALUE": "P",
+     "P_VALUE": "P",
+     "PVAL": "P",
+     "P_VAL": "P",
+     "GC_PVALUE": "P",
+     "p": "P",
      # EFFECT_ALLELE (A1)
-     'A1': 'A1',
-     'ALLELE1': 'A1',
-     'ALLELE_1': 'A1',
-     'EFFECT_ALLELE': 'A1',
-     'REFERENCE_ALLELE': 'A1',
-     'INC_ALLELE': 'A1',
-     'EA': 'A1',
+     "A1": "A1",
+     "ALLELE1": "A1",
+     "ALLELE_1": "A1",
+     "EFFECT_ALLELE": "A1",
+     "REFERENCE_ALLELE": "A1",
+     "INC_ALLELE": "A1",
+     "EA": "A1",
      # NON_EFFECT_ALLELE (A2)
-     'A2': 'A2',
-     'ALLELE2': 'A2',
-     'ALLELE_2': 'A2',
-     'OTHER_ALLELE': 'A2',
-     'NON_EFFECT_ALLELE': 'A2',
-     'DEC_ALLELE': 'A2',
-     'NEA': 'A2',
+     "A2": "A2",
+     "ALLELE2": "A2",
+     "ALLELE_2": "A2",
+     "OTHER_ALLELE": "A2",
+     "NON_EFFECT_ALLELE": "A2",
+     "DEC_ALLELE": "A2",
+     "NEA": "A2",
      # N
-     'N': 'N',
-     'NCASE': 'N_CAS',
-     'CASES_N': 'N_CAS',
-     'N_CASE': 'N_CAS',
-     'N_CASES': 'N_CAS',
-     'N_CONTROLS': 'N_CON',
-     'N_CAS': 'N_CAS',
-     'N_CON': 'N_CON',
-     'N_CASE': 'N_CAS',
-     'NCONTROL': 'N_CON',
-     'CONTROLS_N': 'N_CON',
-     'N_CONTROL': 'N_CON',
-     'WEIGHT': 'N',
+     "N": "N",
+     "NCASE": "N_CAS",
+     "CASES_N": "N_CAS",
+     "N_CASE": "N_CAS",
+     "N_CASES": "N_CAS",
+     "N_CONTROLS": "N_CON",
+     "N_CAS": "N_CAS",
+     "N_CON": "N_CON",
+     "NCONTROL": "N_CON",
+     "CONTROLS_N": "N_CON",
+     "N_CONTROL": "N_CON",
+     "WEIGHT": "N",
      # SIGNED STATISTICS
-     'ZSCORE': 'Z',
-     'Z-SCORE': 'Z',
-     'GC_ZSCORE': 'Z',
-     'Z': 'Z',
-     'OR': 'OR',
-     'B': 'BETA',
-     'BETA': 'BETA',
-     'LOG_ODDS': 'LOG_ODDS',
-     'EFFECTS': 'BETA',
-     'EFFECT': 'BETA',
-     'b': 'BETA',
-     'beta': 'BETA',
+     "ZSCORE": "Z",
+     "Z-SCORE": "Z",
+     "GC_ZSCORE": "Z",
+     "Z": "Z",
+     "OR": "OR",
+     "B": "BETA",
+     "BETA": "BETA",
+     "LOG_ODDS": "LOG_ODDS",
+     "EFFECTS": "BETA",
+     "EFFECT": "BETA",
+     "b": "BETA",
+     "beta": "BETA",
      # SE
-     'se': 'SE',
+     "se": "SE",
      # INFO
-     'INFO': 'INFO',
-     'Info': 'INFO',
+     "INFO": "INFO",
+     "Info": "INFO",
      # MAF
-     'EAF': 'FRQ',
-     'FRQ': 'FRQ',
-     'MAF': 'FRQ',
-     'FRQ_U': 'FRQ',
-     'F_U': 'FRQ',
-     'frq_A1': 'FRQ',
-     'frq': 'FRQ',
-     'freq': 'FRQ'
+     "EAF": "FRQ",
+     "FRQ": "FRQ",
+     "MAF": "FRQ",
+     "FRQ_U": "FRQ",
+     "F_U": "FRQ",
+     "frq_A1": "FRQ",
+     "frq": "FRQ",
+     "freq": "FRQ",
  }


  def get_compression(fh):
-     '''
+     """
      Read filename suffixes and figure out whether it is gzipped, bzip2'ed or not compressed
-     '''
-     if fh.endswith('gz'):
-         compression = 'gzip'
-     elif fh.endswith('bz2'):
-         compression = 'bz2'
+     """
+     if fh.endswith("gz"):
+         compression = "gzip"
+     elif fh.endswith("bz2"):
+         compression = "bz2"
      else:
          compression = None

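
Note that VALID_SNPS lists only the strand-unambiguous allele pairs: AT, TA, CG, and GC are excluded because each allele is the complement of the other, so a strand flip cannot be detected. The default_cnames table is what lets gwas_checkname absorb the many header spellings found in published GWAS. A minimal standalone sketch of that normalization (the raw headers and the small dict below are hypothetical, and the per-column lookup mirrors what the loop in gwas_checkname presumably does):

    # Sketch: map messy GWAS headers onto the canonical column names.
    default_cnames = {"RSID": "SNP", "EA": "A1", "NEA": "A2", "EFFECT": "BETA", "PVAL": "P"}

    raw_header = ["RSID", "EA", "NEA", "EFFECT", "PVAL", "INFO"]
    # Headers absent from the mapping (here "INFO") fall through unchanged.
    canonical = [default_cnames.get(col, col) for col in raw_header]
    print(canonical)  # ['SNP', 'A1', 'A2', 'BETA', 'P', 'INFO']
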
@@ -102,9 +100,9 @@ def get_compression(fh):


  def gwas_checkname(gwas, config):
-     '''
+     """
      Interpret column names of gwas
-     '''
+     """
      old_name = gwas.columns
      mapped_cnames = {}
      for col in gwas.columns:
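
get_compression only inspects the filename suffix; its return value is passed straight through as the compression argument of pandas.read_csv. A quick standalone sanity check (the file names are arbitrary):

    from gsMap.format_sumstats import get_compression

    assert get_compression("height.sumstats.gz") == "gzip"
    assert get_compression("height.sumstats.bz2") == "bz2"
    assert get_compression("height.sumstats.txt") is None
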
@@ -112,9 +110,22 @@ def gwas_checkname(gwas, config):
      gwas.columns = list(mapped_cnames.values())

      # When column names are provided by users
-     name_updates = {'SNP': config.snp, 'A1': config.a1, 'A2': config.a2, 'INFO': config.info,
-                     'BETA': config.beta, 'SE': config.se, 'P': config.p, 'FRQ': config.frq, 'N': config.n,
-                     'Z': config.z, 'Chr': config.chr, 'Pos': config.pos, 'OR': config.OR, 'SE_OR': config.se_OR}
+     name_updates = {
+         "SNP": config.snp,
+         "A1": config.a1,
+         "A2": config.a2,
+         "INFO": config.info,
+         "BETA": config.beta,
+         "SE": config.se,
+         "P": config.p,
+         "FRQ": config.frq,
+         "N": config.n,
+         "Z": config.z,
+         "Chr": config.chr,
+         "Pos": config.pos,
+         "OR": config.OR,
+         "SE_OR": config.se_OR,
+     }

      for key, value in name_updates.items():
          if value is not None and value in gwas.columns:
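
The reformatted name_updates dict is behaviorally identical to the old one-liner: each config attribute, when supplied, names the user's own header for that canonical column, and the loop that follows renames it. A standalone sketch of that override step (the "MarkerName" header is hypothetical, and the rename is an assumption about the elided loop body):

    import pandas as pd

    gwas = pd.DataFrame({"MarkerName": ["rs123"], "PVAL": [5e-8]})
    name_updates = {"SNP": "MarkerName", "P": None}  # None means not supplied

    for key, value in name_updates.items():
        if value is not None and value in gwas.columns:
            gwas = gwas.rename(columns={value: key})
    print(list(gwas.columns))  # ['SNP', 'PVAL']
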
@@ -124,96 +135,102 @@ def gwas_checkname(gwas, config):
      for head in new_name:
          numc = list(new_name).count(head)
          if numc > 1:
-             raise ValueError(f"Found {numc} different {head} columns, please check your {head} column.")
+             raise ValueError(
+                 f"Found {numc} different {head} columns, please check your {head} column."
+             )

      name_dict = {new_name[i]: old_name[i] for i in range(len(new_name))}

      # When at OR scale
-     if 'OR' in new_name and 'SE_OR' in new_name:
-         gwas['BETA'] = gwas.OR.apply(lambda x: math.log(x) if x > 0 else None)
-         gwas['SE'] = gwas.SE_OR.apply(lambda x: math.log(x) if x > 0 else None)
+     if "OR" in new_name and "SE_OR" in new_name:
+         gwas["BETA"] = gwas.OR.apply(lambda x: math.log(x) if x > 0 else None)
+         gwas["SE"] = gwas.SE_OR.apply(lambda x: math.log(x) if x > 0 else None)

      interpreting = {
-         "SNP": 'Variant ID (e.g., rs number).',
-         "A1": 'Allele 1, interpreted as the effect allele for signed sumstat.',
-         "A2": 'Allele 2, interpreted as the non-effect allele for signed sumstat.',
-         "BETA": '[linear/logistic] regression coefficient (0 → no effect; above 0 → A1 is trait/risk increasing).',
-         "SE": 'Standard error of the regression coefficient.',
-         "OR": 'Odds ratio, will be transformed to linear scale.',
-         "SE_OR": 'Standard error of the odds ratio, will be transformed to linear scale.',
-         "P": 'P-Value.',
-         "Z": 'Z-Value.',
-         "N": 'Sample size.',
-         "INFO": 'INFO score (imputation quality; higher → better imputation).',
-         "FRQ": 'Allele frequency of A1.',
-         "Chr": 'Chromosome.',
-         'Pos': 'SNP positions.'
+         "SNP": "Variant ID (e.g., rs number).",
+         "A1": "Allele 1, interpreted as the effect allele for signed sumstat.",
+         "A2": "Allele 2, interpreted as the non-effect allele for signed sumstat.",
+         "BETA": "[linear/logistic] regression coefficient (0 → no effect; above 0 → A1 is trait/risk increasing).",
+         "SE": "Standard error of the regression coefficient.",
+         "OR": "Odds ratio, will be transformed to linear scale.",
+         "SE_OR": "Standard error of the odds ratio, will be transformed to linear scale.",
+         "P": "P-Value.",
+         "Z": "Z-Value.",
+         "N": "Sample size.",
+         "INFO": "INFO score (imputation quality; higher → better imputation).",
+         "FRQ": "Allele frequency of A1.",
+         "Chr": "Chromosome.",
+         "Pos": "SNP positions.",
      }

-     logger.info(f'\nInterpreting column names as follows:')
-     for key, value in interpreting.items():
+     logger.info("\nInterpreting column names as follows:")
+     for key, _value in interpreting.items():
          if key in new_name:
-             logger.info(f'{name_dict[key]}: {interpreting[key]}')
+             logger.info(f"{name_dict[key]}: {interpreting[key]}")

      return gwas


  def gwas_checkformat(gwas, config):
-     '''
+     """
      Check column names required for different formats
-     '''
-     if config.format == 'gsMap':
-         condition1 = np.any(np.isin(['P', 'Z'], gwas.columns))
-         condition2 = np.all(np.isin(['BETA', 'SE'], gwas.columns))
+     """
+     if config.format == "gsMap":
+         condition1 = np.any(np.isin(["P", "Z"], gwas.columns))
+         condition2 = np.all(np.isin(["BETA", "SE"], gwas.columns))
          if not (condition1 or condition2):
              raise ValueError(
-                 'To munge GWAS data into gsMap format, either P or Z values, or both BETA and SE values, are required.')
+                 "To munge GWAS data into gsMap format, either P or Z values, or both BETA and SE values, are required."
+             )
          else:
-             if 'Z' in gwas.columns:
+             if "Z" in gwas.columns:
                  pass
-             elif 'P' in gwas.columns:
-                 gwas['Z'] = np.sqrt(chi2.isf(gwas.P, 1)) * np.where(gwas['BETA'] < 0, -1, 1)
+             elif "P" in gwas.columns:
+                 gwas["Z"] = np.sqrt(chi2.isf(gwas.P, 1)) * np.where(gwas["BETA"] < 0, -1, 1)
              else:
-                 gwas['Z'] = gwas.BETA / gwas.SE
+                 gwas["Z"] = gwas.BETA / gwas.SE

-     elif config.format == 'COJO':
-         condition = np.all(np.isin(['A1', 'A2', 'FRQ', 'BETA', 'SE', 'P', 'N'], gwas.columns))
+     elif config.format == "COJO":
+         condition = np.all(np.isin(["A1", "A2", "FRQ", "BETA", "SE", "P", "N"], gwas.columns))
          if not condition:
-             raise ValueError('To munge GWAS data into COJO format, the columns A1|A2|FRQ|BETA|SE|P|N are all required.')
+             raise ValueError(
+                 "To munge GWAS data into COJO format, the columns A1|A2|FRQ|BETA|SE|P|N are all required."
+             )
          else:
-             gwas['Z'] = np.sqrt(chi2.isf(gwas.P, 1)) * np.where(gwas['BETA'] < 0, -1, 1)
+             gwas["Z"] = np.sqrt(chi2.isf(gwas.P, 1)) * np.where(gwas["BETA"] < 0, -1, 1)

      return gwas


  def filter_info(info, config):
-     '''Remove INFO < args.info_min (default 0.9) and complain about out-of-bounds INFO.'''
+     """Remove INFO < args.info_min (default 0.9) and complain about out-of-bounds INFO."""
      if type(info) is pd.Series: # one INFO column
          jj = ((info > 2.0) | (info < 0)) & info.notnull()
          ii = info >= config.info_min
      elif type(info) is pd.DataFrame: # several INFO columns
-         jj = (((info > 2.0) & info.notnull()).any(axis=1) | (
-             (info < 0) & info.notnull()).any(axis=1))
-         ii = (info.sum(axis=1) >= config.info_min * (len(info.columns)))
+         jj = ((info > 2.0) & info.notnull()).any(axis=1) | ((info < 0) & info.notnull()).any(
+             axis=1
+         )
+         ii = info.sum(axis=1) >= config.info_min * (len(info.columns))
      else:
-         raise ValueError('Expected pd.DataFrame or pd.Series.')
+         raise ValueError("Expected pd.DataFrame or pd.Series.")

      bad_info = jj.sum()
      if bad_info > 0:
-         msg = 'WARNING: {N} SNPs had INFO outside of [0,2]. The INFO column may be mislabeled.'
+         msg = "WARNING: {N} SNPs had INFO outside of [0,2]. The INFO column may be mislabeled."
          logger.warning(msg.format(N=bad_info))

      return ii


  def filter_frq(frq, config):
-     '''
+     """
      Filter on MAF. Remove MAF < args.maf_min and out-of-bounds MAF.
-     '''
+     """
      jj = (frq < 0) | (frq > 1)
      bad_frq = jj.sum()
      if bad_frq > 0:
-         msg = 'WARNING: {N} SNPs had FRQ outside of [0,1]. The FRQ column may be mislabeled.'
+         msg = "WARNING: {N} SNPs had FRQ outside of [0,1]. The FRQ column may be mislabeled."
          logger.warning(msg.format(N=bad_frq))

      frq = np.minimum(frq, 1 - frq)
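
gwas_checkformat reconstructs Z from whichever signed statistics are present: from P via the inverse chi-square survival function, with the sign taken from BETA, or directly as BETA / SE. For well-behaved input the two routes agree; a standalone check:

    import numpy as np
    from scipy.stats import chi2, norm

    beta, se = 0.12, 0.03
    z_direct = beta / se                           # Z from BETA and SE
    p = 2 * norm.sf(abs(z_direct))                 # the matching two-sided p-value
    z_from_p = np.sqrt(chi2.isf(p, 1)) * np.where(beta < 0, -1, 1)
    assert np.isclose(z_direct, z_from_p)          # both give Z = 4.0
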
@@ -222,161 +239,177 @@ def filter_frq(frq, config):


  def filter_pvals(P, config):
-     '''Remove out-of-bounds P-values'''
+     """Remove out-of-bounds P-values"""
      ii = (P > 0) & (P <= 1)
      bad_p = (~ii).sum()
      if bad_p > 0:
-         msg = 'WARNING: {N} SNPs had P outside of (0,1]. The P column may be mislabeled.'
+         msg = "WARNING: {N} SNPs had P outside of (0,1]. The P column may be mislabeled."
          logger.warning(msg.format(N=bad_p))

      return ii


  def filter_alleles(a):
-     '''Remove alleles that do not describe strand-unambiguous SNPs'''
+     """Remove alleles that do not describe strand-unambiguous SNPs"""
      return a.isin(VALID_SNPS)


  def gwas_qc(gwas, config):
-     '''
-     Filter out SNPs based on INFO, FRQ, MAF, N, and Genotypes.
-     '''
+     """
+     Filter out SNPs based on INFO, FRQ, MAF, N, and Genotypes.
+     """
      old = len(gwas)
-     logger.info(f'\nFiltering SNPs as follows:')
+     logger.info("\nFiltering SNPs as follows:")
      # filter: SNPs with missing values
-     drops = {'NA': 0, 'P': 0, 'INFO': 0, 'FRQ': 0, 'A': 0, 'SNP': 0, 'Dup': 0, 'N': 0}
+     drops = {"NA": 0, "P": 0, "INFO": 0, "FRQ": 0, "A": 0, "SNP": 0, "Dup": 0, "N": 0}

-     gwas = gwas.dropna(axis=0, how="any", subset=filter(
-         lambda x: x != 'INFO', gwas.columns)).reset_index(drop=True)
+     gwas = gwas.dropna(
+         axis=0, how="any", subset=filter(lambda x: x != "INFO", gwas.columns)
+     ).reset_index(drop=True)

-     drops['NA'] = old - len(gwas)
-     logger.info(f'Removed {drops["NA"]} SNPs with missing values.')
+     drops["NA"] = old - len(gwas)
+     logger.info(f"Removed {drops['NA']} SNPs with missing values.")

      # filter: SNPs with Info < 0.9
-     if 'INFO' in gwas.columns:
+     if "INFO" in gwas.columns:
          old = len(gwas)
-         gwas = gwas.loc[filter_info(gwas['INFO'], config)]
-         drops['INFO'] = old - len(gwas)
-         logger.info(f'Removed {drops["INFO"]} SNPs with INFO <= 0.9.')
+         gwas = gwas.loc[filter_info(gwas["INFO"], config)]
+         drops["INFO"] = old - len(gwas)
+         logger.info(f"Removed {drops['INFO']} SNPs with INFO <= 0.9.")

      # filter: SNPs with MAF <= 0.01
-     if 'FRQ' in gwas.columns:
+     if "FRQ" in gwas.columns:
          old = len(gwas)
-         gwas = gwas.loc[filter_frq(gwas['FRQ'], config)]
-         drops['FRQ'] += old - len(gwas)
-         logger.info(f'Removed {drops["FRQ"]} SNPs with MAF <= 0.01.')
+         gwas = gwas.loc[filter_frq(gwas["FRQ"], config)]
+         drops["FRQ"] += old - len(gwas)
+         logger.info(f"Removed {drops['FRQ']} SNPs with MAF <= 0.01.")

      # filter: P-value that out-of-bounds [0,1]
-     if 'P' in gwas.columns:
+     if "P" in gwas.columns:
          old = len(gwas)
-         gwas = gwas.loc[filter_pvals(gwas['P'], config)]
-         drops['P'] += old - len(gwas)
-         logger.info(f'Removed {drops["P"]} SNPs with out-of-bounds p-values.')
+         gwas = gwas.loc[filter_pvals(gwas["P"], config)]
+         drops["P"] += old - len(gwas)
+         logger.info(f"Removed {drops['P']} SNPs with out-of-bounds p-values.")

      # filter: Variants that are strand-ambiguous
-     if 'A1' in gwas.columns and 'A2' in gwas.columns:
+     if "A1" in gwas.columns and "A2" in gwas.columns:
          gwas.A1 = gwas.A1.str.upper()
          gwas.A2 = gwas.A2.str.upper()
          gwas = gwas.loc[filter_alleles(gwas.A1 + gwas.A2)]
-         drops['A'] += old - len(gwas)
-         logger.info(f'Removed {drops["A"]} variants that were not SNPs or were strand-ambiguous.')
+         drops["A"] += old - len(gwas)
+         logger.info(f"Removed {drops['A']} variants that were not SNPs or were strand-ambiguous.")

      # filter: Duplicated rs numbers
-     if 'SNP' in gwas.columns:
+     if "SNP" in gwas.columns:
          old = len(gwas)
-         gwas = gwas.drop_duplicates(subset='SNP').reset_index(drop=True)
-         drops['Dup'] += old - len(gwas)
-         logger.info(f'Removed {drops["Dup"]} SNPs with duplicated rs numbers.')
+         gwas = gwas.drop_duplicates(subset="SNP").reset_index(drop=True)
+         drops["Dup"] += old - len(gwas)
+         logger.info(f"Removed {drops['Dup']} SNPs with duplicated rs numbers.")

      # filter: Sample size
      n_min = gwas.N.quantile(0.9) / 1.5
      old = len(gwas)
      gwas = gwas[gwas.N >= n_min].reset_index(drop=True)
-     drops['N'] += old - len(gwas)
-     logger.info(f'Removed {drops["N"]} SNPs with N < {n_min}.')
+     drops["N"] += old - len(gwas)
+     logger.info(f"Removed {drops['N']} SNPs with N < {n_min}.")

      return gwas


  def variant_to_rsid(gwas, config):
-     '''
+     """
      Convert variant id (Chr, Pos) to rsid
-     '''
+     """
      logger.info("\nConverting the SNP position to rsid. This process may take some time.")
-     unique_ids = set(gwas['id'])
-     chr_format = gwas['Chr'].unique().astype(str)
-     chr_format = [re.sub(r'\d+', '', value) for value in chr_format][1]
-
-     dtype = {'chr': str, 'pos': str, 'ref': str, 'alt': str, 'dbsnp': str}
-     chunk_iter = pd.read_csv(config.dbsnp, chunksize=config.chunksize, sep="\t", skiprows=1,
-                              dtype=dtype, names=['chr', 'pos', 'ref', 'alt', 'dbsnp'])
+     unique_ids = set(gwas["id"])
+     chr_format = gwas["Chr"].unique().astype(str)
+     chr_format = [re.sub(r"\d+", "", value) for value in chr_format][1]
+
+     dtype = {"chr": str, "pos": str, "ref": str, "alt": str, "dbsnp": str}
+     chunk_iter = pd.read_csv(
+         config.dbsnp,
+         chunksize=config.chunksize,
+         sep="\t",
+         skiprows=1,
+         dtype=dtype,
+         names=["chr", "pos", "ref", "alt", "dbsnp"],
+     )

      # Iterate over chunks
      matching_id = pd.DataFrame()
      for chunk in chunk_iter:
-         chunk['id'] = chr_format + chunk["chr"] + "_" + chunk["pos"]
-         matching_id = pd.concat([matching_id, chunk[chunk['id'].isin(unique_ids)][['dbsnp', 'id']]])
+         chunk["id"] = chr_format + chunk["chr"] + "_" + chunk["pos"]
+         matching_id = pd.concat(
+             [matching_id, chunk[chunk["id"].isin(unique_ids)][["dbsnp", "id"]]]
+         )

-     matching_id = matching_id.drop_duplicates(subset='dbsnp').reset_index(drop=True)
-     matching_id = matching_id.drop_duplicates(subset='id').reset_index(drop=True)
+     matching_id = matching_id.drop_duplicates(subset="dbsnp").reset_index(drop=True)
+     matching_id = matching_id.drop_duplicates(subset="id").reset_index(drop=True)
      matching_id.index = matching_id.id
      return matching_id


  def clean_SNP_id(gwas, config):
-     '''
+     """
      Clean SNP id
-     '''
+     """
      old = len(gwas)
-     condition1 = 'SNP' in gwas.columns
-     condition2 = np.all(np.isin(['Chr', 'Pos'], gwas.columns))
+     condition1 = "SNP" in gwas.columns
+     condition2 = np.all(np.isin(["Chr", "Pos"], gwas.columns))

      if not (condition1 or condition2):
-         raise ValueError('Either SNP rsid, or both SNP chromosome and position, are required.')
+         raise ValueError("Either SNP rsid, or both SNP chromosome and position, are required.")
      elif condition1:
          pass
      elif condition2:
          if config.dbsnp is None:
-             raise ValueError('To convert SNP positions to rsid, a dbsnp reference is required.')
+             raise ValueError("To convert SNP positions to rsid, a dbsnp reference is required.")
          else:
-             gwas['id'] = gwas["Chr"].astype(str) + "_" + gwas["Pos"].astype(str)
-             gwas = gwas.drop_duplicates(subset='id').reset_index(drop=True)
+             gwas["id"] = gwas["Chr"].astype(str) + "_" + gwas["Pos"].astype(str)
+             gwas = gwas.drop_duplicates(subset="id").reset_index(drop=True)
              gwas.index = gwas.id

              matching_id = variant_to_rsid(gwas, config)
              gwas = gwas.loc[matching_id.id]
-             gwas['SNP'] = matching_id.dbsnp
+             gwas["SNP"] = matching_id.dbsnp
              num_fail = old - len(gwas)
-             logger.info(f'Removed {num_fail} SNPs that did not convert to rsid.')
+             logger.info(f"Removed {num_fail} SNPs that did not convert to rsid.")

      return gwas


  def gwas_metadata(gwas, config):
-     '''
+     """
      Report key features of GWAS data
-     '''
-     logger.info('\nSummary of GWAS data:')
-     CHISQ = (gwas.Z ** 2)
+     """
+     logger.info("\nSummary of GWAS data:")
+     CHISQ = gwas.Z**2
      mean_chisq = CHISQ.mean()
-     logger.info('Mean chi^2 = ' + str(round(mean_chisq, 3)))
+     logger.info("Mean chi^2 = " + str(round(mean_chisq, 3)))
      if mean_chisq < 1.02:
          logger.warning("Mean chi^2 may be too small.")

-     logger.info('Lambda GC = ' + str(round(CHISQ.median() / 0.4549, 3)))
-     logger.info('Max chi^2 = ' + str(round(CHISQ.max(), 3)))
-     logger.info('{N} Genome-wide significant SNPs (some may have been removed by filtering).'.format(N=(CHISQ > 29).sum()))
+     logger.info("Lambda GC = " + str(round(CHISQ.median() / 0.4549, 3)))
+     logger.info("Max chi^2 = " + str(round(CHISQ.max(), 3)))
+     logger.info(
+         f"{(CHISQ > 29).sum()} Genome-wide significant SNPs (some may have been removed by filtering)."
+     )


  def gwas_format(config: FormatSumstatsConfig):
-     '''
+     """
      Format GWAS data
-     '''
-     logger.info(f'------Formatting gwas data for {config.sumstats}...')
+     """
+     logger.info(f"------Formatting gwas data for {config.sumstats}...")
      compression_type = get_compression(config.sumstats)
-     gwas = pd.read_csv(config.sumstats, delim_whitespace=True, header=0, compression=compression_type,
-                        na_values=['.', 'NA'])
-     logger.info(f'Read {len(gwas)} SNPs from {config.sumstats}.')
+     gwas = pd.read_csv(
+         config.sumstats,
+         delim_whitespace=True,
+         header=0,
+         compression=compression_type,
+         na_values=[".", "NA"],
+     )
+     logger.info(f"Read {len(gwas)} SNPs from {config.sumstats}.")

      # Check name and format
      gwas = gwas_checkname(gwas, config)
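
Two constants in gwas_metadata are worth spelling out: 0.4549 is the median of a chi-square distribution with one degree of freedom, so median(Z^2) / 0.4549 is the usual genomic-inflation factor lambda GC, and the CHISQ > 29 cutoff approximates the chi-square value at genome-wide significance (P = 5e-8 corresponds to chi-square ≈ 29.7). A standalone check of both:

    from scipy.stats import chi2

    print(round(chi2.ppf(0.5, df=1), 4))   # 0.4549, the lambda GC denominator
    print(round(chi2.isf(5e-8, df=1), 1))  # 29.7, close to the hard-coded 29
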
@@ -389,19 +422,18 @@ def gwas_format(config: FormatSumstatsConfig):
      gwas_metadata(gwas, config)

      # Saving the data
-     if config.format == 'COJO':
-         keep = ['SNP', 'A1', 'A2', 'FRQ', 'BETA', 'SE', 'P', 'N']
-         appendix = '.cojo'
-     elif config.format == 'gsMap':
+     if config.format == "COJO":
+         keep = ["SNP", "A1", "A2", "FRQ", "BETA", "SE", "P", "N"]
+         appendix = ".cojo"
+     elif config.format == "gsMap":
          keep = ["SNP", "A1", "A2", "Z", "N"]
-         appendix = '.sumstats'
+         appendix = ".sumstats"

-     if 'Chr' in gwas.columns and 'Pos' in gwas.columns and config.keep_chr_pos is True:
-         keep = keep + ['Chr', 'Pos']
+     if "Chr" in gwas.columns and "Pos" in gwas.columns and config.keep_chr_pos is True:
+         keep = keep + ["Chr", "Pos"]

      gwas = gwas[keep]
-     out_name = config.out + appendix + '.gz'
+     out_name = config.out + appendix + ".gz"

-     logger.info(f'\nWriting summary statistics for {len(gwas)} SNPs to {out_name}.')
-     gwas.to_csv(out_name, sep="\t", index=False,
-                 float_format='%.3f', compression='gzip')
+     logger.info(f"\nWriting summary statistics for {len(gwas)} SNPs to {out_name}.")
+     gwas.to_csv(out_name, sep="\t", index=False, float_format="%.3f", compression="gzip")
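
End to end, gwas_format reads a whitespace-delimited (optionally gzip- or bz2-compressed) summary-statistics file, normalizes the columns, runs QC, logs the metadata summary, and writes a gzipped table named from config.out plus the format-specific suffix. A hedged invocation sketch — the field names follow those referenced in the diff, but the actual FormatSumstatsConfig signature may require further arguments:

    from gsMap.config import FormatSumstatsConfig
    from gsMap.format_sumstats import gwas_format

    # Hypothetical invocation; the paths and output prefix are placeholders.
    config = FormatSumstatsConfig(
        sumstats="height_gwas.txt.gz",  # input GWAS summary statistics
        out="height",                   # writes height.sumstats.gz for gsMap format
        format="gsMap",
    )
    gwas_format(config)
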