gwaslab 3.5.6__py3-none-any.whl → 3.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (64) hide show
  1. gwaslab/__init__.py +2 -0
  2. gwaslab/bd_common_data.py +1 -0
  3. gwaslab/bd_get_hapmap3.py +0 -1
  4. gwaslab/data/formatbook.json +78 -0
  5. gwaslab/g_Sumstats.py +98 -24
  6. gwaslab/g_SumstatsMulti.py +287 -0
  7. gwaslab/g_SumstatsPair.py +101 -16
  8. gwaslab/g_Sumstats_polars.py +245 -0
  9. gwaslab/g_headers.py +12 -3
  10. gwaslab/g_meta.py +123 -47
  11. gwaslab/g_meta_update.py +48 -0
  12. gwaslab/g_vchange_status_polars.py +44 -0
  13. gwaslab/g_version.py +2 -2
  14. gwaslab/hm_casting.py +169 -110
  15. gwaslab/hm_casting_polars.py +202 -0
  16. gwaslab/hm_harmonize_sumstats.py +19 -8
  17. gwaslab/io_load_ld.py +529 -0
  18. gwaslab/io_preformat_input.py +11 -0
  19. gwaslab/io_preformat_input_polars.py +632 -0
  20. gwaslab/io_process_args.py +25 -1
  21. gwaslab/io_read_ldsc.py +34 -3
  22. gwaslab/io_read_pipcs.py +62 -6
  23. gwaslab/prscs_gigrnd.py +122 -0
  24. gwaslab/prscs_mcmc_gtb.py +136 -0
  25. gwaslab/prscs_parse_genet.py +98 -0
  26. gwaslab/qc_build.py +53 -0
  27. gwaslab/qc_check_datatype.py +10 -8
  28. gwaslab/qc_check_datatype_polars.py +128 -0
  29. gwaslab/qc_fix_sumstats.py +25 -23
  30. gwaslab/qc_fix_sumstats_polars.py +193 -0
  31. gwaslab/util_ex_calculate_ldmatrix.py +49 -19
  32. gwaslab/util_ex_gwascatalog.py +71 -28
  33. gwaslab/util_ex_ldsc.py +67 -21
  34. gwaslab/util_ex_match_ldmatrix.py +396 -0
  35. gwaslab/util_ex_run_2samplemr.py +0 -2
  36. gwaslab/util_ex_run_ccgwas.py +155 -0
  37. gwaslab/util_ex_run_coloc.py +1 -1
  38. gwaslab/util_ex_run_hyprcoloc.py +117 -0
  39. gwaslab/util_ex_run_mesusie.py +155 -0
  40. gwaslab/util_ex_run_mtag.py +92 -0
  41. gwaslab/util_ex_run_prscs.py +85 -0
  42. gwaslab/util_ex_run_susie.py +40 -9
  43. gwaslab/util_in_estimate_ess.py +18 -0
  44. gwaslab/util_in_fill_data.py +20 -1
  45. gwaslab/util_in_filter_value.py +10 -5
  46. gwaslab/util_in_get_sig.py +71 -13
  47. gwaslab/util_in_meta.py +168 -4
  48. gwaslab/util_in_meta_polars.py +174 -0
  49. gwaslab/viz_plot_compare_effect.py +87 -23
  50. gwaslab/viz_plot_credible_sets.py +55 -11
  51. gwaslab/viz_plot_effect.py +22 -12
  52. gwaslab/viz_plot_miamiplot2.py +3 -2
  53. gwaslab/viz_plot_mqqplot.py +165 -141
  54. gwaslab/viz_plot_qqplot.py +6 -6
  55. gwaslab/viz_plot_regional2.py +5 -13
  56. gwaslab/viz_plot_rg_heatmap.py +6 -1
  57. gwaslab/viz_plot_stackedregional.py +21 -6
  58. {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/METADATA +9 -7
  59. gwaslab-3.5.8.dist-info/RECORD +117 -0
  60. {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/WHEEL +1 -1
  61. gwaslab-3.5.6.dist-info/RECORD +0 -96
  62. {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE +0 -0
  63. {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE_before_v3.4.39 +0 -0
  64. {gwaslab-3.5.6.dist-info → gwaslab-3.5.8.dist-info}/top_level.txt +0 -0
gwaslab/io_load_ld.py ADDED
@@ -0,0 +1,529 @@
1
+
2
+ import scipy.sparse as sparse
3
+ import numpy as np
4
+ import pandas as pd
5
+ from gwaslab.hm_casting import _merge_mold_with_sumstats_by_chrpos
6
+ import subprocess
7
+ import os
8
+ import re
9
+ import gc
10
+ import pandas as pd
11
+ import numpy as np
12
+ from gwaslab.g_Log import Log
13
+ from gwaslab.qc_fix_sumstats import start_to
14
+ from gwaslab.qc_fix_sumstats import finished
15
+ from gwaslab.util_in_get_sig import getsig
16
+ from gwaslab.util_ex_process_ref import _process_plink_input_files
17
+ from gwaslab.g_version import _checking_plink_version
18
+ from gwaslab.util_in_filter_value import _exclude_hla
19
+ from gwaslab.util_ex_calculate_ldmatrix import _extract_variants_in_locus
20
+ from gwaslab.util_ex_calculate_ldmatrix import _export_snplist_and_locus_sumstats
21
+ from gwaslab.viz_plot_regional2 import _get_lead_id
22
+ from gwaslab.util_ex_calculate_ldmatrix import _extract_variants_in_locus
23
+
24
def tofinemapping_using_ld(sumstats,
                           study=None,
                           ld_map_path=None,
                           ld_path=None,
                           ld_fmt = "npz",
                           ld_if_square = False,
                           ld_if_add_T = False,
                           ld_map_rename_dic = None,
                           ld_map_kwargs = None,
                           loci=None,
                           out="./",
                           windowsizekb=1000,
                           n_cores=1,
                           mode="r",
                           exclude_hla=False,
                           getlead_args=None,
                           memory=None,
                           overwrite=False,
                           log=Log(),
                           suffixes=None,
                           verbose=True,
                           **kwargs):
    """Prepare per-locus inputs for fine-mapping using a pre-computed LD matrix.

    For each significant locus (auto-detected via ``getsig`` or supplied via
    ``loci``), this extracts the locus sumstats, matches them against an
    external LD map (bim-like variant table), exports a SNP list and locus
    sumstats file, subsets/flips the pre-computed LD r matrix, and records
    the output paths in a ``.filelist`` table.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Must contain at least SNPID, CHR, POS, EA, NEA (checked by start_to).
    study : str
        Study label used in output file names.
    ld_map_path : str
        Path to the variant map describing the LD matrix row order.
    ld_path : str
        Path to the pre-computed LD matrix (npz or txt; see ld_fmt).
    ld_fmt, ld_if_square, ld_if_add_T :
        Passed through to _load_ld_matrix.
    ld_map_rename_dic, ld_map_kwargs :
        Passed through to _load_ld_map.
    loci : list or None
        SNPIDs of lead variants; if None, leads are detected with getsig.
    out : str
        Output directory.
    windowsizekb : int
        Half-window (kb) around each lead variant.
    exclude_hla : bool
        If True, drop leads in the HLA region before processing.
    suffixes : list of str or None
        Column-name suffixes for multi-sumstats objects; defaults to [""].

    Returns
    -------
    tuple
        (output_file_list_path or None, output_file_list DataFrame, plink_log)
        NOTE(review): plink_log is always "" here — kept for interface
        compatibility with the PLINK-based variant of this function.

    NOTE(review): n_cores, mode, memory, overwrite and **kwargs are accepted
    but not used in this body — presumably kept to mirror the PLINK-based
    tofinemapping interface; confirm against callers.
    """
    ##start function with col checking##########################################################
    _start_line = "calculate LD matrix"
    _end_line = "calculating LD matrix"
    _start_cols =["SNPID","CHR","POS","EA","NEA"]
    _start_function = ".calculate_ld_matrix()"
    _must_args ={}

    is_enough_info = start_to(sumstats=sumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              **_must_args)
    if is_enough_info == False: raise ValueError("Not enough columns for calculating LD matrix")
    ############################################################################################
    # normalize mutable/optional arguments
    if suffixes is None:
        suffixes=[""]
    if getlead_args is None:
        getlead_args={"windowsizekb":1000}
    if ld_map_kwargs is None:
        ld_map_kwargs={}

    if loci is None:
        # auto-detect lead variants from the sumstats using the first suffix's P column
        log.write(" -Loci were not provided. All significant loci will be automatically extracted...",verbose=verbose)
        sig_df = getsig(sumstats,id="SNPID",chrom="CHR",pos="POS",p="P"+suffixes[0],**getlead_args)
    else:
        # restrict to the user-specified lead variants
        sig_df = sumstats.loc[sumstats["SNPID"].isin(loci),:]

    # Drop duplicate SNPIDs so downstream merges are one-to-one
    log.write(" -Dropping duplicated SNPIDs...",verbose=verbose)
    sumstats = sumstats.drop_duplicates(subset=["SNPID"]).copy()

    # init Filelist DataFrame: one row of output paths per processed locus
    output_file_list = pd.DataFrame(columns=["SNPID","SNPID_LIST","LD_R_MATRIX","LOCUS_SUMSTATS"])

    plink_log=""

    if exclude_hla==True:
        sig_df = _exclude_hla(sig_df, log=log, verbose=verbose)

    sig_df = sig_df.reset_index()

    ## for each lead variant
    for index, row in sig_df.iterrows():
        # extract variants within windowsizekb of the lead variant
        gc.collect()
        log.write(" -Locus #{}---------------------------------------------------------------".format(index+1))
        log.write(" -Processing locus with lead variant {} at CHR {} POS {} ...".format(row["SNPID"],row["CHR"],row["POS"]))
        locus_sumstats = _extract_variants_in_locus(sumstats, windowsizekb, locus = (row["CHR"],row["POS"]))

        # NOTE(review): the LD map is re-read from disk on every loop iteration;
        # it could be loaded once before the loop if ld_map_path is constant.
        ld_map = _load_ld_map(ld_map_path, ld_map_rename_dic = ld_map_rename_dic, **ld_map_kwargs )

        ## match locus variants against the LD map by CHR/POS and alleles
        matched_sumstats = _merge_ld_map_with_sumstats(row=row,
                                                       locus_sumstats=locus_sumstats,
                                                       ld_map=ld_map,
                                                       log=log,suffixes=suffixes)
        if len(matched_sumstats)==0:
            log.write(" -No matching LD information... Skipping...")
            continue

        #########################################################################################################
        # export the matched SNP list and locus-level sumstats for fine-mapping tools
        matched_snp_list_path, matched_sumstats_path=_export_snplist_and_locus_sumstats(matched_sumstats=matched_sumstats,
                                                                                        out=out,
                                                                                        study=study,
                                                                                        row=row,
                                                                                        windowsizekb=windowsizekb,
                                                                                        log=log,
                                                                                        suffixes=suffixes)
        #########################################################################################################

        ## load the pre-computed LD matrix (NOT computed by PLINK here, despite
        ## the function's shared interface with the PLINK-based variant)
        r_matrix = _load_ld_matrix(ld_path, fmt=ld_fmt, if_square=ld_if_square, if_add_T=ld_if_add_T, log=log, verbose=verbose)

        # subset the matrix to matched variants and flip signs for allele-flipped ones
        matched_ld_matrix_path = _extract_variants(matched_sumstats, r_matrix, out, study, row, windowsizekb, log=log, verbose=verbose)

        # record the output file paths for this locus
        row_dict={}
        row_dict["SNPID"]=row["SNPID"]
        row_dict["SNPID_LIST"] = matched_snp_list_path
        row_dict["LD_R_MATRIX"] = matched_ld_matrix_path
        row_dict["LOCUS_SUMSTATS"] = matched_sumstats_path
        file_row = pd.Series(row_dict).to_frame().T
        output_file_list = pd.concat([output_file_list, file_row],ignore_index=True)

    if len(output_file_list)>0:
        # write the file list table used by downstream fine-mapping wrappers
        output_file_list["STUDY"] = study
        nloci = len(output_file_list)
        output_file_list_path = "{}/{}_{}loci_{}kb.filelist".format(out.rstrip("/"), study,nloci, windowsizekb)
        output_file_list.to_csv(output_file_list_path,index=None,sep="\t")
        log.write(" -File list is saved to: {}".format(output_file_list_path),verbose=verbose)
        log.write(" -Finished LD matrix calculation.",verbose=verbose)
    else:
        output_file_list_path=None
        log.write(" -No avaialable lead variants.",verbose=verbose)
        log.write(" -Stopped LD matrix calculation.",verbose=verbose)
    finished(log=log, verbose=verbose, end_line=_end_line)
    return output_file_list_path, output_file_list, plink_log
147
+
148
+
149
+
150
+
151
def process_ld(sumstats,
               ld_path,
               ld_map_path,
               region,
               region_ref,
               log,
               verbose,
               pos,
               nea,
               ea,
               region_ld_threshold,
               ld_fmt = "npz",
               ld_if_square = False,
               ld_if_add_T = False,
               ld_map_rename_dic = None,
               ld_map_kwargs = None):
    """Annotate regional sumstats with LD (r2) relative to one or more lead variants.

    Loads a pre-computed LD matrix plus its variant map, matches the regional
    sumstats to the map by CHR/POS and alleles, and for each reference variant
    in ``region_ref`` adds per-variant columns:

    - ``RSQ_i`` : LD value between each variant and the i-th lead variant
    - ``LD_i``  : binned LD level (0 = no data, 1..len(thresholds)+1 = bins,
                  len(thresholds)+2 = the lead variant itself)
    - ``LEAD_i``: 1 for the lead variant, else 0

    and finally collapses them into ``LD``, ``RSQ`` and ``SHAPE`` columns
    (taking, per variant, the reference with the highest RSQ; LD levels are
    offset by 100*(i+1) so the reference index is recoverable).

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Regional sumstats; must contain CHR, POS, EA, NEA and scaled_P.
    ld_path, ld_map_path : str
        Paths to the LD matrix and its variant map.
    region : tuple
        Region spec (not used in this body; kept for interface compatibility).
    region_ref : list
        Lead variant specs (SNPID strings or None for the top scaled_P hit).
    pos, nea, ea : str
        Column names used for the final dropna.
    region_ld_threshold : list of float
        Ascending r2 bin edges (e.g. [0.2, 0.4, 0.6, 0.8]).

    Returns
    -------
    pandas.DataFrame
        The annotated sumstats (rows unmatched on alleles are dropped).
    """
    log.write("Start to load reference genotype...", verbose=verbose)
    log.write(" -reference ld matrix path : "+ ld_path, verbose=verbose)

    log.write(" -Retrieving index...", verbose=verbose)

    # match sumstats pos and ref pos:
    # get ref index for its first appearance of sumstats pos
    #######################################################################################
    if ld_map_kwargs is None:
        ld_map_kwargs={}

    ld_map = _load_ld_map(ld_map_path,
                          ld_map_rename_dic = ld_map_rename_dic,
                          **ld_map_kwargs )

    log.write(" -Ref variants: {}".format( len(ld_map) ), verbose=verbose)

    ## left-join sumstats onto the LD map; unmatched variants keep NaN index
    sumstats = _merge_ld_map_with_sumstats_for_regional(
        locus_sumstats=sumstats,
        ld_map=ld_map,
        log=log,
        suffixes=None,verbose=verbose)
    # REFINDEX is each variant's row/column index in the LD matrix (NaN if absent)
    sumstats["REFINDEX"] = sumstats["_INDEX_BIM"]

    #############################################################################################

    r_matrix = _load_ld_matrix(ld_path,
                               fmt=ld_fmt,
                               if_square=ld_if_square,
                               if_add_T=ld_if_add_T,
                               log=log,
                               verbose=verbose)

    # for each reference (lead) variant, add RSQ_i / LD_i / LEAD_i columns
    #############################################################################################
    for ref_n, region_ref_single in enumerate(region_ref):

        rsq = "RSQ_{}".format(ref_n)
        ld_single = "LD_{}".format(ref_n)
        lead = "LEAD_{}".format(ref_n)
        sumstats[lead]= 0

        # get lead variant id and pos
        if region_ref_single is None:
            # if not specified, use the variant with the largest scaled_P
            lead_id = sumstats["scaled_P"].idxmax()
        else:
            # resolve the user-specified reference variant to a row index
            lead_id = _get_lead_id(sumstats, region_ref_single, log, verbose)

        lead_series = None
        if lead_id is None:
            # reference variant not in sumstats: try to parse it as chr:pos:ref:alt
            matched_snpid = re.match("(chr)?[0-9]+:[0-9]+:[ATCG]+:[ATCG]+",region_ref_single, re.IGNORECASE)

            if matched_snpid is None:
                # unparseable reference: no LD info for this reference
                sumstats[rsq] = None
                sumstats[rsq] = sumstats[rsq].astype("float")
                sumstats[ld_single] = 0
                continue
            else:
                # keep CHR/POS/alleles of the parsed variant
                lead_snpid = matched_snpid.group(0).split(":")[1:]
                lead_snpid[0]= int(lead_snpid[0])
                lead_series = pd.Series(lead_snpid)
                # NOTE(review): lead_id is still None on this path, so the
                # sumstats.loc[lead_id, ...] lookups below operate on a None
                # label — verify this path against the PLINK-based variant.

        # if the lead variant has an index in the LD matrix, fill RSQ from its row
        if sumstats.loc[lead_id, "REFINDEX"] is not None:
            lead_snp_ref_index = sumstats.loc[lead_id, "REFINDEX"]

            is_matched = ~sumstats["REFINDEX"].isna()

            ref_index = sumstats.loc[is_matched,"REFINDEX"].astype("Int64")

            # one row of the LD matrix gives r between the lead and all matched variants
            sumstats.loc[is_matched, rsq] = r_matrix[int(lead_snp_ref_index), list(ref_index.values)]

        else:
            log.write(" -Lead SNP not found in reference...", verbose=verbose)
            sumstats[rsq]=None
        # the lead variant's LD with itself is 1 by definition
        try:
            sumstats.loc[lead_id,rsq]=1
        except KeyError:
            pass

        sumstats[rsq] = sumstats[rsq].astype("float")
        sumstats[ld_single] = 0

        for index,ld_threshold in enumerate(region_ld_threshold):
            # No data,LD = 0
            # 0, 0.2 LD = 1
            # 1, 0.4 LD = 2
            # 2, 0.6 LD = 3
            # 3, 0.8 LD = 4
            # 4, 1.0 LD = 5
            # lead LD = 6

            if index==0:
                # any variant with LD data starts at level 1
                to_change_color = sumstats[rsq]>-1
                sumstats.loc[to_change_color,ld_single] = 1
            to_change_color = sumstats[rsq]>ld_threshold
            sumstats.loc[to_change_color,ld_single] = index+2

        if lead_series is None:
            # lead variant itself gets the top level and the LEAD flag
            sumstats.loc[lead_id,ld_single] = len(region_ld_threshold)+2
            sumstats.loc[lead_id,lead] = 1

    ####################################################################################################
    # collapse per-reference columns into final LD / RSQ / SHAPE columns
    final_shape_col = "SHAPE"
    final_ld_col = "LD"
    final_rsq_col = "RSQ"

    sumstats[final_ld_col] = 0
    sumstats[final_shape_col] = 1
    sumstats[final_rsq_col] = 0.0

    if len(region_ref)==1:
        if lead_id is not None:
            sumstats.loc[lead_id, final_shape_col] +=1

    for i in range(len(region_ref)):
        ld_single = "LD_{}".format(i)
        current_rsq = "RSQ_{}".format(i)
        # keep, per variant, the reference with the highest RSQ so far
        a_ngt_b = sumstats[final_rsq_col] < sumstats[current_rsq]
        # set levels with interval=100 so the winning reference index is recoverable
        sumstats.loc[a_ngt_b, final_ld_col] = 100 * (i+1) + sumstats.loc[a_ngt_b, ld_single]
        sumstats.loc[a_ngt_b, final_rsq_col] = sumstats.loc[a_ngt_b, current_rsq]
        sumstats.loc[a_ngt_b, final_shape_col] = i + 1

    # drop rows without position/allele information (unmatched fills)
    sumstats = sumstats.dropna(subset=[pos,nea,ea])

    ####################################################################################################
    log.write("Finished loading reference genotype successfully!", verbose=verbose)
    return sumstats
305
+ ####################################################################################################
306
+
307
+
308
+
309
+
310
+
311
+
312
+
313
+
314
+
315
+
316
+
317
+
318
+
319
+
320
+
321
+
322
+
323
+
324
+
325
+
326
+
327
+
328
+
329
+
330
+
331
+
332
+
333
+
334
+
335
+
336
+
337
+
338
+ ####################################################################################################
339
def _load_ld_matrix(path,
                    fmt="npz",
                    if_square=False,
                    if_add_T=False,
                    log=Log(),
                    verbose=True):
    """Load an LD (r) matrix from disk and optionally post-process it.

    Parameters
    ----------
    path : str
        Path to the matrix file.
    fmt : str
        "npz" (scipy sparse npz archive) or "txt" (whitespace-delimited text).
    if_square : bool
        If True, square every element (r -> r2).
    if_add_T : bool
        If True, add the matrix's transpose (for matrices stored as one
        triangle). NOTE(review): this also doubles the diagonal if the stored
        triangle includes it — confirm the expected storage convention.
    log : Log
        Logger object.
    verbose : bool
        Whether to emit log messages.

    Returns
    -------
    numpy.ndarray
        The dense LD matrix.

    Raises
    ------
    ValueError
        If ``fmt`` is not one of the supported formats. (The original fell
        through and raised a confusing UnboundLocalError on ``r_matrix``.)
    """
    if fmt == "npz":
        log.write(" -Loading LD matrix from npz file...",verbose=verbose)
        r_matrix = sparse.load_npz(path).toarray()
    elif fmt == "txt":
        log.write(" -Loading LD matrix from text file...",verbose=verbose)
        r_matrix = np.loadtxt(path)
    else:
        # fail fast with a clear message instead of UnboundLocalError below
        raise ValueError("Unsupported LD matrix format: {}. Supported formats: npz, txt.".format(fmt))

    if if_add_T==True:
        log.write(" -Transforming LD matrix by adding its transpose...",verbose=verbose)
        r_matrix += r_matrix.T
    if if_square==True:
        log.write(" -Transforming LD matrix by squaring all elements...",verbose=verbose)
        r_matrix = np.power(r_matrix,2)
    return r_matrix
360
+
361
def _load_ld_map(path,
                 snpid="rsid",
                 chrom="chromosome",
                 pos="position",
                 ref="allele1",
                 alt="allele2",
                 ld_map_rename_dic = None,
                 **ld_map_kwargs):
    """Load the variant map describing the row order of a pre-computed LD matrix.

    The map is read with ``pandas.read_csv`` and its columns are renamed to
    the internal names SNPID_bim, CHR, POS, NEA_bim, EA_bim.

    Parameters
    ----------
    path : str
        Path to the map file (default header: rsid chromosome position allele1 allele2).
    snpid, chrom, pos, ref, alt : str
        Column names in the file (used when ld_map_rename_dic is None).
    ld_map_rename_dic : dict or list or None
        Either a dict mapping internal names ("SNPID","CHR","POS","NEA","EA")
        to the file's column names, or a 5-element list of file column names
        in the order [SNPID, CHR, POS, NEA, EA].
    **ld_map_kwargs :
        Extra arguments for pandas.read_csv; ``sep`` defaults to whitespace.

    Returns
    -------
    pandas.DataFrame
        Map with columns SNPID_bim, CHR, POS, NEA_bim, EA_bim.
    """
    if ld_map_rename_dic is not None:
        if type(ld_map_rename_dic) is dict:
            # dict form: values are the file's actual column names
            ld_map_rename_dic_to_use={ld_map_rename_dic["EA"]:'EA_bim',
                                      ld_map_rename_dic["NEA"]:'NEA_bim',
                                      ld_map_rename_dic["POS"]:'POS',
                                      ld_map_rename_dic["CHR"]:'CHR',
                                      ld_map_rename_dic["SNPID"]:'SNPID_bim'
                                      }
            ld_map_kwargs["usecols"]=list(ld_map_rename_dic.values())
        else:
            # list form: [SNPID, CHR, POS, NEA, EA] column names
            ld_map_rename_dic_to_use={ld_map_rename_dic[4]:'EA_bim',
                                      ld_map_rename_dic[3]:'NEA_bim',
                                      ld_map_rename_dic[2]:'POS',
                                      ld_map_rename_dic[1]:'CHR',
                                      ld_map_rename_dic[0]:'SNPID_bim'
                                      }
            ld_map_kwargs["usecols"]=ld_map_rename_dic
    else:
        # default column names (rsid chromosome position allele1 allele2)
        ld_map_rename_dic_to_use={alt:'EA_bim',
                                  ref:'NEA_bim',
                                  pos:'POS',
                                  chrom:'CHR',
                                  snpid:"SNPID_bim"
                                  }
        ld_map_kwargs["usecols"]=[chrom, pos, ref, alt, snpid]

    if "sep" not in ld_map_kwargs:
        # raw string: "\s+" is an invalid escape sequence (SyntaxWarning on py3.12)
        ld_map_kwargs["sep"] = r"\s+"

    ld_map = pd.read_csv(path,**ld_map_kwargs)
    ld_map = ld_map.rename(columns=ld_map_rename_dic_to_use, errors='ignore')
    return ld_map
403
+
404
def _extract_variants(merged_sumstats, r_matrix, out, study, row, windowsizekb, log, verbose):
    """Subset the full LD matrix to the matched variants and write it to disk.

    Rows/columns are taken in the order of ``merged_sumstats["_INDEX_BIM"]``.
    For variants whose alleles were flipped relative to the LD map, the sign
    of r is negated along both the row and the column (a flipped-flipped pair
    is negated twice, i.e. unchanged — which is the correct result).

    Parameters
    ----------
    merged_sumstats : pandas.DataFrame
        Must contain _INDEX_BIM (matrix indices) and _FLIPPED (bool).
    r_matrix : numpy.ndarray
        Full dense LD matrix.
    out, study : str
        Output directory and study label (used in the file name).
    row : mapping
        Lead-variant record; only row["SNPID"] is used for naming.
    windowsizekb : int
        Window size, used in the file name only.
    log, verbose :
        Logger and verbosity flag.

    Returns
    -------
    str
        Path of the written gzipped, tab-separated LD matrix.
    """
    # fixed: misspelled local ("avaiable_index"); dead "snplist_path" removed
    available_index = merged_sumstats["_INDEX_BIM"].values

    flipped = merged_sumstats["_FLIPPED"].values

    # select the sub-matrix for the matched variants (np.ix_ gives a copy)
    reduced_r_matrix = r_matrix[np.ix_(available_index, available_index)]

    log.write(" -Flipping LD matrix for {} variants...".format(sum(flipped)),verbose=verbose)
    reduced_r_matrix[flipped,:] = -1 * reduced_r_matrix[flipped,:]
    reduced_r_matrix[:,flipped] = -1 * reduced_r_matrix[:,flipped]

    output_prefix = "{}/{}_{}_{}".format(out.rstrip("/"),study,row["SNPID"],windowsizekb)
    output_path = "{}.ld.gz".format(output_prefix)

    # pandas infers gzip compression from the .gz suffix
    pd.DataFrame(reduced_r_matrix).to_csv(output_path,sep="\t",index=None,header=None)
    return output_path
423
+
424
def _merge_ld_map_with_sumstats(row,
                                locus_sumstats,
                                ld_map,
                                log=Log(),
                                suffixes=None):
    '''
    Align locus summary statistics with an LD variant map (bim-like table).

    Variants are merged on CHR/POS (inner join) and kept only when the
    alleles either match perfectly or are flipped (EA/NEA swapped); flipped
    variants are flagged in the "_FLIPPED" column so the LD matrix signs can
    be corrected downstream.

    Note: mutates ``locus_sumstats`` and ``ld_map`` in place by adding index
    columns and casting EA/NEA to string dtype.

    Parameters
    ----------
    row : mapping
        Lead-variant record; row["SNPID"] is used for logging and a
        lead-availability check.
    locus_sumstats : pandas.DataFrame
        Sumstats for the locus (SNPID, CHR, POS, EA, NEA plus stat columns).
    ld_map : pandas.DataFrame
        LD map with CHR, POS, EA_bim, NEA_bim; its row index is the LD
        matrix index.
    suffixes : list of str or None
        Suffixes of the stat columns (BETA/SE/Z/EAF/N); defaults to [""].

    Returns
    -------
    pandas.DataFrame
        Matched variants with _INDEX_BIM, _FLIPPED and the available
        per-suffix statistic columns.
    '''
    index1= "_INDEX_SUMSTATS"
    index2= "_INDEX_BIM"
    locus_sumstats[index1] = locus_sumstats.index
    ld_map[index2] = ld_map.index
    locus_sumstats["_FLIPPED"] = False

    if suffixes is None:
        suffixes=[""]

    log.write(" -Variants in locus ({}): {}".format(row["SNPID"],len(locus_sumstats)))
    # convert category to string so allele comparison works element-wise
    locus_sumstats["EA"] = locus_sumstats["EA"].astype("string")
    locus_sumstats["NEA"] = locus_sumstats["NEA"].astype("string")

    # inner join on CHR/POS; bim row order (hence LD matrix index) is preserved
    combined_df = pd.merge(ld_map, locus_sumstats, on=["CHR","POS"],how="inner")

    # perfectly matched alleles
    perfect_match = ((combined_df["EA"] == combined_df["EA_bim"]) & (combined_df["NEA"] == combined_df["NEA_bim"]) )
    log.write(" -Variants with perfect matched alleles:{}".format(sum(perfect_match)))

    # flipped alleles (EA/NEA swapped relative to the map)
    flipped_match = ((combined_df["EA"] == combined_df["NEA_bim"])& (combined_df["NEA"] == combined_df["EA_bim"]))
    log.write(" -Variants with flipped alleles:{}".format(sum(flipped_match)))

    allele_match = perfect_match | flipped_match
    log.write(" -Total Variants matched:{}".format(sum(allele_match)))

    if row["SNPID"] not in combined_df.loc[allele_match,"SNPID"].values:
        log.warning("Lead variant was not available in reference!")

    # select the statistic columns to carry forward.
    # FIX: the Z/EAF/N checks now test the suffixed column name; the original
    # tested the bare name ("Z") while appending the suffixed one ("Z"+suffix),
    # which broke for non-empty suffixes (inconsistent with the BETA/SE branch).
    output_columns=["SNPID","CHR","POS","EA","NEA","_INDEX_BIM","_FLIPPED"]
    for suffix in suffixes:
        if ("BETA"+suffix in locus_sumstats.columns) and ("SE"+suffix in locus_sumstats.columns):
            output_columns.append("BETA"+suffix)
            output_columns.append("SE"+suffix)
        if "Z"+suffix in locus_sumstats.columns:
            output_columns.append("Z"+suffix)
        if "EAF"+suffix in locus_sumstats.columns:
            output_columns.append("EAF"+suffix)
        if "N"+suffix in locus_sumstats.columns:
            output_columns.append("N"+suffix)
    combined_df.loc[flipped_match,"_FLIPPED"] = True
    return combined_df.loc[allele_match,output_columns]
486
+
487
def _merge_ld_map_with_sumstats_for_regional(
        locus_sumstats,
        ld_map,
        log=Log(),
        suffixes=None,
        verbose=True):
    '''
    Align regional sumstats with an LD variant map (bim-like table) for plotting.

    Unlike _merge_ld_map_with_sumstats, this uses a LEFT join so that sumstats
    variants absent from the map are retained (with a missing _INDEX_BIM) —
    regional plots still show variants without LD information.

    Note: mutates ``locus_sumstats`` and ``ld_map`` in place by adding the
    _INDEX_SUMSTATS / _INDEX_BIM columns and casting EA/NEA to string dtype.
    The ``suffixes`` parameter is normalized but otherwise unused in this body.

    Returns the merged frame restricted to rows whose alleles match
    (perfectly or flipped) or which are absent from the map.
    '''

    index1= "_INDEX_SUMSTATS"
    index2= "_INDEX_BIM"
    locus_sumstats[index1] = locus_sumstats.index
    ld_map[index2] = ld_map.index

    if suffixes is None:
        suffixes=[""]

    # convert category to string so allele comparison works element-wise
    locus_sumstats["EA"] = locus_sumstats["EA"].astype("string")
    locus_sumstats["NEA"] = locus_sumstats["NEA"].astype("string")

    # left join on CHR/POS: keep all sumstats rows, even those not in the map
    combined_df = pd.merge(locus_sumstats, ld_map, on=["CHR","POS"],how="left")
    # fill missing map alleles with a sentinel so the comparisons below are defined
    combined_df[["EA_bim","NEA_bim"]] = combined_df[["EA_bim","NEA_bim"]].fillna("N")
    # perfectly matched alleles
    perfect_match = ((combined_df["EA"] == combined_df["EA_bim"]) & (combined_df["NEA"] == combined_df["NEA_bim"]) )

    # flipped alleles (EA/NEA swapped relative to the map)
    flipped_match = ((combined_df["EA"] == combined_df["NEA_bim"])& (combined_df["NEA"] == combined_df["EA_bim"]))

    # rows with no counterpart in the map (left join produced NaN index)
    not_matched = combined_df[index2].isna()

    allele_match = perfect_match | flipped_match

    log.write(" -Total Variants matched:{}".format( sum(allele_match) ),verbose=verbose)
    log.write(" -Total Variants not in reference:{}".format(sum(not_matched)),verbose=verbose)

    # keep matched variants plus those absent from the reference
    return combined_df.loc[allele_match | not_matched,:]
528
+
529
+ ############################################################################################################################################################################################################################################################
@@ -34,6 +34,7 @@ def preformat(sumstats,
34
34
  f=None,
35
35
  t=None,
36
36
  p=None,
37
+ q=None,
37
38
  mlog10p=None,
38
39
  test=None,
39
40
  info=None,
@@ -51,6 +52,7 @@ def preformat(sumstats,
51
52
  dof=None,
52
53
  ncase=None,
53
54
  ncontrol=None,
55
+ neff=None,
54
56
  direction=None,
55
57
  status=None,
56
58
  study=None,
@@ -107,6 +109,9 @@ def preformat(sumstats,
107
109
  if "format_comment" in meta_data.keys():
108
110
  readargs["comment"] = meta_data["format_comment"]
109
111
 
112
+ if "format_other_cols" in meta_data.keys():
113
+ other += meta_data["format_other_cols"]
114
+
110
115
  if "sep" not in readargs.keys():
111
116
  readargs["sep"] = "\t"
112
117
 
@@ -215,6 +220,9 @@ def preformat(sumstats,
215
220
  if ncontrol and (type(ncontrol) is str):
216
221
  usecols.append(ncontrol)
217
222
  rename_dictionary[ncontrol]= "N_CONTROL"
223
+ if neff and (type(neff) is str):
224
+ usecols.append(neff)
225
+ rename_dictionary[neff]= "N_EFF"
218
226
  if beta:
219
227
  usecols.append(beta)
220
228
  rename_dictionary[beta]= "BETA"
@@ -233,6 +241,9 @@ def preformat(sumstats,
233
241
  if z:
234
242
  usecols.append(z)
235
243
  rename_dictionary[z]= "Z"
244
+ if q:
245
+ usecols.append(q)
246
+ rename_dictionary[q]= "Q"
236
247
  if p:
237
248
  usecols.append(p)
238
249
  rename_dictionary[p]= "P"