gwaslab 3.5.7__py3-none-any.whl → 3.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (63)
  1. gwaslab/__init__.py +2 -0
  2. gwaslab/bd_common_data.py +1 -0
  3. gwaslab/bd_get_hapmap3.py +0 -1
  4. gwaslab/data/formatbook.json +78 -0
  5. gwaslab/g_Sumstats.py +98 -24
  6. gwaslab/g_SumstatsMulti.py +287 -0
  7. gwaslab/g_SumstatsPair.py +101 -16
  8. gwaslab/g_Sumstats_polars.py +245 -0
  9. gwaslab/g_headers.py +12 -3
  10. gwaslab/g_meta.py +123 -47
  11. gwaslab/g_meta_update.py +48 -0
  12. gwaslab/g_vchange_status_polars.py +44 -0
  13. gwaslab/g_version.py +2 -2
  14. gwaslab/hm_casting.py +169 -110
  15. gwaslab/hm_casting_polars.py +202 -0
  16. gwaslab/hm_harmonize_sumstats.py +19 -8
  17. gwaslab/io_load_ld.py +529 -0
  18. gwaslab/io_preformat_input.py +11 -0
  19. gwaslab/io_preformat_input_polars.py +632 -0
  20. gwaslab/io_process_args.py +25 -1
  21. gwaslab/io_read_ldsc.py +34 -3
  22. gwaslab/io_read_pipcs.py +62 -6
  23. gwaslab/prscs_gigrnd.py +122 -0
  24. gwaslab/prscs_mcmc_gtb.py +136 -0
  25. gwaslab/prscs_parse_genet.py +98 -0
  26. gwaslab/qc_build.py +53 -0
  27. gwaslab/qc_check_datatype.py +10 -8
  28. gwaslab/qc_check_datatype_polars.py +128 -0
  29. gwaslab/qc_fix_sumstats.py +25 -23
  30. gwaslab/qc_fix_sumstats_polars.py +193 -0
  31. gwaslab/util_ex_calculate_ldmatrix.py +49 -19
  32. gwaslab/util_ex_gwascatalog.py +71 -28
  33. gwaslab/util_ex_ldsc.py +67 -21
  34. gwaslab/util_ex_match_ldmatrix.py +396 -0
  35. gwaslab/util_ex_run_2samplemr.py +0 -2
  36. gwaslab/util_ex_run_ccgwas.py +155 -0
  37. gwaslab/util_ex_run_coloc.py +1 -1
  38. gwaslab/util_ex_run_hyprcoloc.py +117 -0
  39. gwaslab/util_ex_run_mesusie.py +155 -0
  40. gwaslab/util_ex_run_mtag.py +92 -0
  41. gwaslab/util_ex_run_prscs.py +85 -0
  42. gwaslab/util_ex_run_susie.py +40 -9
  43. gwaslab/util_in_estimate_ess.py +18 -0
  44. gwaslab/util_in_fill_data.py +20 -1
  45. gwaslab/util_in_filter_value.py +10 -5
  46. gwaslab/util_in_get_sig.py +71 -13
  47. gwaslab/util_in_meta.py +168 -4
  48. gwaslab/util_in_meta_polars.py +174 -0
  49. gwaslab/viz_plot_compare_effect.py +87 -23
  50. gwaslab/viz_plot_credible_sets.py +55 -11
  51. gwaslab/viz_plot_effect.py +22 -12
  52. gwaslab/viz_plot_miamiplot2.py +3 -2
  53. gwaslab/viz_plot_mqqplot.py +84 -81
  54. gwaslab/viz_plot_qqplot.py +6 -6
  55. gwaslab/viz_plot_regional2.py +2 -1
  56. gwaslab/viz_plot_stackedregional.py +4 -1
  57. {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info}/METADATA +8 -6
  58. gwaslab-3.5.8.dist-info/RECORD +117 -0
  59. {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info}/WHEEL +1 -1
  60. gwaslab-3.5.7.dist-info/RECORD +0 -96
  61. {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE +0 -0
  62. {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE_before_v3.4.39 +0 -0
  63. {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info}/top_level.txt +0 -0
gwaslab/util_ex_ldsc.py CHANGED
@@ -11,9 +11,10 @@ from gwaslab.util_in_filter_value import filtervalues
11
11
  from gwaslab.util_in_filter_value import _filter_palindromic
12
12
  from gwaslab.util_in_filter_value import _exclude_hla
13
13
  from gwaslab.util_in_filter_value import _exclude_sexchr
14
+ import copy
14
15
 
15
16
  class ARGS():
16
- def __init__(self, **kwargs):
17
+ def __init__(self, kwargs=None):
17
18
 
18
19
  self.out = "ldsc"
19
20
 
@@ -257,11 +258,12 @@ class ARGS():
257
258
  ####################################################################################################################
258
259
 
259
260
 
260
- def _estimate_h2_by_ldsc(insumstats, log, verbose=True, munge=False, munge_args=None, **kwargs):
261
- sumstats = insumstats.copy()
261
+ def _estimate_h2_by_ldsc(insumstats, log, meta=None,verbose=True, munge=False, munge_args=None, **raw_kwargs):
262
+ sumstats = insumstats
263
+ kwargs = copy.deepcopy(raw_kwargs)
262
264
 
263
265
  if "N" in sumstats.columns:
264
- sumstats["N"] = sumstats["N"].astype("int64")
266
+ sumstats["N"] = sumstats["N"].fillna(sumstats["N"].median()).apply("int64")
265
267
 
266
268
  if munge:
267
269
  if munge_args is None:
@@ -291,21 +293,25 @@ def _estimate_h2_by_ldsc(insumstats, log, verbose=True, munge=False, munge_args=
291
293
  log.write(" -Adopted from LDSC source code: https://github.com/bulik/ldsc", verbose=verbose)
292
294
  log.write(" -Please cite LDSC: Bulik-Sullivan, et al. LD Score Regression Distinguishes Confounding from Polygenicity in Genome-Wide Association Studies. Nature Genetics, 2015.", verbose=verbose)
293
295
 
294
-
295
-
296
+ if meta["gwaslab"]["sample_prevalence"]!="Unknown" and meta["gwaslab"]["population_prevalence"]!="Unknown" :
297
+ if "samp_prev" not in kwargs.keys():
298
+ kwargs["samp_prev"] = "{}".format(meta["gwaslab"]["sample_prevalence"])
299
+ if "pop_prev" not in kwargs.keys():
300
+ kwargs["pop_prev"] = "{}".format(meta["gwaslab"]["population_prevalence"])
296
301
 
297
302
  log.write(" -Arguments:", verbose=verbose)
298
303
  for key, value in kwargs.items():
299
304
  log.write(" -{}:{}".format(key, value), verbose=verbose)
300
- default_args = ARGS(**kwargs)
305
+
306
+ default_args = ARGS(kwargs = kwargs)
301
307
 
302
308
  if "Z" not in sumstats.columns:
303
309
  sumstats["Z"] = sumstats["BETA"]/sumstats["SE"]
304
310
 
305
311
  sumstats = sumstats.rename(columns={"EA":"A1","NEA":"A2","rsID":"SNP"})
306
-
312
+
307
313
  log.write(" -LDSC log:", verbose=verbose)
308
- summary = estimate_h2(sumstats, default_args, log)
314
+ summary = estimate_h2(sumstats, args = default_args, log = log)
309
315
 
310
316
  results_table = None
311
317
  if type(summary) is tuple:
@@ -321,10 +327,11 @@ def _estimate_h2_by_ldsc(insumstats, log, verbose=True, munge=False, munge_args=
321
327
 
322
328
  ####################################################################################################################
323
329
 
324
- def _estimate_partitioned_h2_by_ldsc(insumstats, log, verbose=True, **kwargs):
330
+ def _estimate_partitioned_h2_by_ldsc(insumstats, log, meta=None,verbose=True, **raw_kwargs):
325
331
  sumstats = insumstats.copy()
332
+ kwargs = copy.deepcopy(raw_kwargs)
326
333
  if "N" in sumstats.columns:
327
- sumstats["N"] = sumstats["N"].astype("int64")
334
+ sumstats["N"] = sumstats["N"].fillna(sumstats["N"].median()).apply("int64")
328
335
  ##start function with col checking##########################################################
329
336
  _start_line = "run LD score regression"
330
337
  _end_line = "running LD score regression"
@@ -347,10 +354,16 @@ def _estimate_partitioned_h2_by_ldsc(insumstats, log, verbose=True, **kwargs):
347
354
  log.write(" -Please cite LDSC: Bulik-Sullivan, et al. LD Score Regression Distinguishes Confounding from Polygenicity in Genome-Wide Association Studies. Nature Genetics, 2015.", verbose=verbose)
348
355
  log.write(" -Arguments:", verbose=verbose)
349
356
 
357
+ if meta["gwaslab"]["sample_prevalence"]!="Unknown" and meta["gwaslab"]["population_prevalence"]!="Unknown" :
358
+ if "samp_prev" not in kwargs.keys():
359
+ kwargs["samp_prev"] = "{}".format(meta["gwaslab"]["sample_prevalence"])
360
+ if "pop_prev" not in kwargs.keys():
361
+ kwargs["pop_prev"] = "{}".format(meta["gwaslab"]["population_prevalence"])
362
+
350
363
  for key, value in kwargs.items():
351
364
  log.write(" -{}:{}".format(key, value), verbose=verbose)
352
365
 
353
- default_args = ARGS(**kwargs)
366
+ default_args = ARGS(kwargs = kwargs)
354
367
 
355
368
  if "Z" not in sumstats.columns:
356
369
  sumstats["Z"] = sumstats["BETA"]/sumstats["SE"]
@@ -369,10 +382,11 @@ def _estimate_partitioned_h2_by_ldsc(insumstats, log, verbose=True, **kwargs):
369
382
 
370
383
 
371
384
 
372
- def _estimate_rg_by_ldsc(insumstats, other_traits ,log, verbose=True, **kwargs):
385
+ def _estimate_rg_by_ldsc(insumstats, other_traits ,log, meta=None, verbose=True, **raw_kwargs):
373
386
  sumstats = insumstats.copy()
387
+ kwargs = copy.deepcopy(raw_kwargs)
374
388
  if "N" in sumstats.columns:
375
- sumstats["N"] = sumstats["N"].astype("int64")
389
+ sumstats["N"] = sumstats["N"].fillna(sumstats["N"].median()).apply("int64")
376
390
  ##start function with col checking##########################################################
377
391
  _start_line = "run LD score regression for genetic correlation"
378
392
  _end_line = "running LD score regression for genetic correlation"
@@ -395,18 +409,37 @@ def _estimate_rg_by_ldsc(insumstats, other_traits ,log, verbose=True, **kwargs):
395
409
  log.write(" -Please cite LDSC: Bulik-Sullivan, B., et al. An Atlas of Genetic Correlations across Human Diseases and Traits. Nature Genetics, 2015.", verbose=verbose)
396
410
  log.write(" -Arguments:", verbose=verbose)
397
411
 
412
+
413
+
414
+ samp_prev_string=""
415
+ pop_prev_string=""
416
+
417
+ if meta["gwaslab"]["sample_prevalence"]!="Unknown" and meta["gwaslab"]["population_prevalence"]!="Unknown" :
418
+
419
+ if "samp_prev" not in kwargs.keys():
420
+ samp_prev_string = "{}".format(meta["gwaslab"]["sample_prevalence"])
421
+ if "pop_prev" not in kwargs.keys():
422
+ pop_prev_string = "{}".format(meta["gwaslab"]["population_prevalence"])
423
+
424
+ if "rg" in kwargs.keys():
425
+ alias = kwargs["rg"].split(",")[1:]
426
+ else:
427
+ alias=[]
428
+ for index, each_other_sumstats in enumerate(other_traits):
429
+ alias.append(each_other_sumstats.meta["gwaslab"]["study_name"])
430
+ kwargs["rg"]=",".join([meta["gwaslab"]["study_name"]]+alias)
431
+
398
432
  for key, value in kwargs.items():
399
433
  log.write(" -{}:{}".format(key, value), verbose=verbose)
400
-
401
- default_args = ARGS(**kwargs)
434
+
435
+ default_args = ARGS(kwargs = kwargs)
402
436
 
403
437
  if "Z" not in sumstats.columns:
404
438
  sumstats["Z"] = sumstats["BETA"]/sumstats["SE"]
405
439
 
406
440
  sumstats = sumstats.rename(columns={"EA":"A1","NEA":"A2","rsID":"SNP"})
407
-
441
+
408
442
  other_traits_to_use = []
409
- alias = default_args.rg.split(",")[1:]
410
443
 
411
444
  for index, each_other_sumstats in enumerate(other_traits):
412
445
  log.write(" -Processing sumstats with alias {} ({})".format(alias[index], each_other_sumstats.meta["gwaslab"]["study_name"]))
@@ -419,6 +452,18 @@ def _estimate_rg_by_ldsc(insumstats, other_traits ,log, verbose=True, **kwargs):
419
452
  to_append["Z"] = to_append["BETA"]/to_append["SE"]
420
453
 
421
454
  other_traits_to_use.append(to_append[["SNP","A1","A2","Z","N"]])
455
+
456
+ if each_other_sumstats.meta["gwaslab"]["sample_prevalence"]!="Unknown" and each_other_sumstats.meta["gwaslab"]["population_prevalence"]!="Unknown" :
457
+ samp_prev_string += ",{}".format(meta["gwaslab"]["sample_prevalence"])
458
+ pop_prev_string += ",{}".format(meta["gwaslab"]["population_prevalence"])
459
+
460
+ if len(pop_prev_string.split(",")) == len(other_traits)+1 and len(samp_prev_string.split(",")) == len(other_traits)+1:
461
+ if "samp_prev" not in kwargs.keys():
462
+ log.write(" -{}:{}".format("samp_prev", samp_prev_string), verbose=verbose)
463
+ default_args.samp_prev = samp_prev_string
464
+ if "pop_prev" not in kwargs.keys():
465
+ log.write(" -{}:{}".format("pop_prev", pop_prev_string), verbose=verbose)
466
+ default_args.pop_prev = pop_prev_string
422
467
 
423
468
  log.write(" -LDSC log:", verbose=verbose)
424
469
  summary = estimate_rg(sumstats[["SNP","A1","A2","Z","N"]], other_traits_to_use, default_args, log)[1]
@@ -431,10 +476,11 @@ def _estimate_rg_by_ldsc(insumstats, other_traits ,log, verbose=True, **kwargs):
431
476
  ####################################################################################################################
432
477
 
433
478
 
434
- def _estimate_h2_cts_by_ldsc(insumstats, log, verbose=True, **kwargs):
479
+ def _estimate_h2_cts_by_ldsc(insumstats, log, verbose=True, **raw_kwargs):
435
480
  sumstats = insumstats.copy()
481
+ kwargs = copy.deepcopy(raw_kwargs)
436
482
  if "N" in sumstats.columns:
437
- sumstats["N"] = sumstats["N"].astype("int64")
483
+ sumstats["N"] = sumstats["N"].fillna(sumstats["N"].median()).apply("int64")
438
484
  ##start function with col checking##########################################################
439
485
  _start_line = "run LD score regression"
440
486
  _end_line = "running LD score regression"
@@ -460,7 +506,7 @@ def _estimate_h2_cts_by_ldsc(insumstats, log, verbose=True, **kwargs):
460
506
  for key, value in kwargs.items():
461
507
  log.write(" -{}:{}".format(key, value), verbose=verbose)
462
508
 
463
- default_args = ARGS(**kwargs)
509
+ default_args = ARGS(kwargs = kwargs)
464
510
 
465
511
  if "Z" not in sumstats.columns:
466
512
  sumstats["Z"] = sumstats["BETA"]/sumstats["SE"]
@@ -0,0 +1,396 @@
1
+ import scipy.sparse as sparse
2
+ import numpy as np
3
+ import pandas as pd
4
+ from gwaslab.hm_casting import _merge_mold_with_sumstats_by_chrpos
5
+ import subprocess
6
+ import os
7
+ import re
8
+ import gc
9
+ import pandas as pd
10
+ import numpy as np
11
+ from gwaslab.g_Log import Log
12
+ from gwaslab.qc_fix_sumstats import start_to
13
+ from gwaslab.qc_fix_sumstats import finished
14
+ from gwaslab.util_in_get_sig import getsig
15
+ from gwaslab.util_ex_process_ref import _process_plink_input_files
16
+ from gwaslab.g_version import _checking_plink_version
17
+ from gwaslab.util_in_filter_value import _exclude_hla
18
+ from gwaslab.util_ex_calculate_ldmatrix import _extract_variants_in_locus
19
+
20
+
21
+
22
+
23
def tofinemapping_m(sumstats,
                    studies=None,
                    group=None,
                    ld_paths = None,
                    ld_types = None,
                    ld_maps = None,
                    ld_map_dics = None,
                    bfile=None,
                    vcf=None,
                    locus=None,
                    loci=None,
                    loci_chrpos=None,
                    out="./",
                    plink="plink",
                    plink2="plink2",
                    windowsizekb=1000,
                    n_cores=1,
                    mode="r",
                    exclude_hla=False,
                    getlead_args=None,
                    memory=None,
                    overwrite=False,
                    log=Log(),
                    suffixes=None,
                    ld_map_kwargs=None,
                    extra_plink_option="",
                    verbose=True,
                    **kwargs):
    """Prepare matched LD matrices and locus sumstats for multi-study fine-mapping.

    For the first significant locus, the sumstats are matched against each of
    two studies' LD reference maps, then the corresponding sub-matrix of each
    study's LD matrix is extracted and written to disk together with the SNP
    list and per-study locus sumstats.

    NOTE(review): only the first row of ``sig_df`` is processed (``iloc[0,:]``),
    and the study count is hard-coded to 2 — presumably intentional for the
    paired-study workflow; confirm against callers.

    Returns
    -------
    tuple
        (path of the file list, the file-list DataFrame, accumulated plink log
        text — always empty here since no plink call is made).
    """
    ##start function with col checking##########################################################
    _start_line = "calculate LD matrix"
    _end_line = "calculating LD matrix"
    _start_cols =["SNPID","CHR","POS","EA","NEA"]
    _start_function = ".calculate_ld_matrix()"
    _must_args ={}

    is_enough_info = start_to(sumstats=sumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              **_must_args)

    if is_enough_info == False: raise ValueError("Not enough columns for calculating LD matrix")

    ############################################################################################
    # Fill in mutable defaults lazily.
    if suffixes is None:
        suffixes=[""]
    if getlead_args is None:
        getlead_args={"windowsizekb":1000}
    if ld_map_kwargs is None:
        ld_map_kwargs={}

    if loci is None:
        # No explicit loci: extract significant lead variants automatically.
        log.write(" -Loci were not provided. All significant loci will be automatically extracted...",verbose=verbose)
        sig_df = getsig(sumstats,id="SNPID",chrom="CHR",pos="POS",p="P"+suffixes[0],**getlead_args)
    else:
        sig_df = sumstats.loc[sumstats["SNPID"].isin(loci),:]

    # Drop duplicated variants so SNPID can act as a unique key downstream.
    log.write(" -Dropping duplicated SNPIDs...",verbose=verbose)
    sumstats = sumstats.drop_duplicates(subset=["SNPID"]).copy()

    # Accumulator for one output-file record per study.
    output_file_list = pd.DataFrame(columns=["SNPID","SNPID_LIST","LD_R_MATRIX","LOCUS_SUMSTATS"])

    plink_log=""

    if exclude_hla==True:
        sig_df = _exclude_hla(sig_df, log=log, verbose=verbose)

    sig_df = sig_df.reset_index()
    # Only the first locus is processed here.
    row = sig_df.iloc[0,:]

    # Window of +/- windowsizekb around the lead variant.
    matched_sumstats = _extract_variants_in_locus(sumstats, windowsizekb, locus = (row["CHR"],row["POS"]))

    for i in range(2):
        # For each study: load its LD map and intersect with the locus sumstats.
        gc.collect()
        # get ld path and rename dictionary for this study
        ld_map_path = ld_maps[i]
        ld_map_rename_dic = ld_map_dics[i]

        log.write(" -Processing locus with lead variant {} at CHR {} POS {} ...".format(row["SNPID"],row["CHR"],row["POS"]))
        ld_map = _load_ld_map(ld_map_path,
                              ld_map_rename_dic = ld_map_rename_dic,
                              **ld_map_kwargs )

        ## keep only variants present (allele-matched) in this study's reference
        matched_sumstats = _merge_ld_map_with_sumstats_m(row=row,
                                                         locus_sumstats=matched_sumstats,
                                                         ld_map=ld_map,
                                                         log=log,
                                                         index=i)

        if len(matched_sumstats)==0:
            log.write(" -No matching LD information... Skipping...")
            continue

    # Drop variants with missing values after matching both studies.
    matched_sumstats = matched_sumstats.dropna()

    # Export the common variants list and per-study locus sumstats.
    matched_snp_list_path, matched_sumstats_paths=_export_snplist_and_locus_sumstats_m(matched_sumstats=matched_sumstats,
                                                                                      out=out,
                                                                                      group=group,
                                                                                      row=row,
                                                                                      windowsizekb=windowsizekb,
                                                                                      log=log)

    for i in range(2):
        # For each study: slice and export its matched LD sub-matrix.
        ld_path = ld_paths[i]

        r_matrix = _load_ld_matrix(ld_path,
                                   fmt="txt",
                                   if_square=False,
                                   if_add_T=False,
                                   log=log,
                                   verbose=verbose)

        matched_ld_matrix_path = _extract_variants_from_ld_matrix_m(merged_sumstats = matched_sumstats,
                                                                    r_matrix = r_matrix,
                                                                    out = out,
                                                                    group = group,
                                                                    row = row,
                                                                    windowsizekb = windowsizekb,
                                                                    index=i,
                                                                    log=log, verbose=verbose)
        # #########################################################################################################
        # Record one file-list row for this study.
        row_dict={}
        row_dict["SNPID"]=row["SNPID"]
        row_dict["SNPID_LIST"] = matched_snp_list_path
        row_dict["LD_R_MATRIX"] = matched_ld_matrix_path
        row_dict["LOCUS_SUMSTATS"] = matched_sumstats_paths[i] + ".gz"
        row_dict["LOCUS"] = row["SNPID"]
        row_dict["SUBSTUDY"]= i+1
        row_dict["STUDY"] = studies[i]
        file_row = pd.Series(row_dict).to_frame().T
        output_file_list = pd.concat([output_file_list, file_row],ignore_index=True)

    if len(output_file_list)>0:
        output_file_list["GROUP"] = group
        nloci = len(output_file_list)
        output_file_list_path = "{}/{}_{}study_{}_{}kb.filelist".format(out.rstrip("/"), group,nloci, row["SNPID"], windowsizekb)
        output_file_list.to_csv(output_file_list_path,index=None,sep="\t")
        log.write(" -File list is saved to: {}".format(output_file_list_path),verbose=verbose)
        log.write(" -Finished LD matrix calculation.",verbose=verbose)
    else:
        output_file_list_path=None
        log.write(" -No avaialable lead variants.",verbose=verbose)
        log.write(" -Stopped LD matrix calculation.",verbose=verbose)

    finished(log=log, verbose=verbose, end_line=_end_line)

    return output_file_list_path, output_file_list, plink_log
183
+
184
+
185
+ def _export_snplist_and_locus_sumstats_m(matched_sumstats, out, group, row, windowsizekb,log):
186
+ # study suffixes starting from 1
187
+ suffixes=["_{}".format(i+1) for i in range(2)]
188
+
189
+ matched_snp_list_path = "{}/{}_{}_{}.snplist.raw".format(out.rstrip("/"), group, row["SNPID"] ,windowsizekb)
190
+
191
+ matched_sumstats["SNPID"].to_csv(matched_snp_list_path, index=None, header=None)
192
+ log.write(" -Exporting SNP list of {} to: {}...".format(len(matched_sumstats) ,matched_snp_list_path))
193
+
194
+ # create locus-sumstats EA, NEA, (BETA, SE), Z
195
+ matched_sumstats_paths =[]
196
+
197
+
198
+ for i in range(2):
199
+ # export sumstats for each study
200
+ suffix = suffixes[i]
201
+
202
+ matched_sumstats_path = "{}/{}_{}_{}_{}.sumstats".format(out.rstrip("/"), group, row["SNPID"] ,windowsizekb, i + 1)
203
+ matched_sumstats_paths.append(matched_sumstats_path)
204
+ to_export_columns=["CHR","POS","EA","NEA"]
205
+
206
+ if "Z"+suffix in matched_sumstats.columns :
207
+ to_export_columns.append("Z"+suffix)
208
+ if ("BETA"+suffix in matched_sumstats.columns) and ("SE"+suffix in matched_sumstats.columns):
209
+ to_export_columns.append("BETA"+suffix)
210
+ to_export_columns.append("SE"+suffix)
211
+ if "EAF"+suffix in matched_sumstats.columns :
212
+ to_export_columns.append("EAF"+suffix)
213
+ if "N"+suffix in matched_sumstats.columns:
214
+ to_export_columns.append("N"+suffix)
215
+
216
+ log.write(" -Exporting locus sumstats to: {}...".format(matched_sumstats_path))
217
+ log.write(" -Exported columns: {}...".format(["SNPID"]+to_export_columns))
218
+ #matched_sumstats[ ["SNPID"]+to_export_columns].to_csv(matched_sumstats_path, sep="\t",index=None)
219
+ rename_dic={
220
+ "BETA"+suffix:"Beta",
221
+ "SE"+suffix:"Se",
222
+ "SNPID":"SNP"
223
+ }
224
+ matched_sumstats[ ["SNPID"]+to_export_columns].rename(columns=rename_dic).to_csv(matched_sumstats_path, sep="\t",index=None)
225
+ matched_sumstats[ ["SNPID"]+to_export_columns].rename(columns=rename_dic).to_csv(matched_sumstats_path+".gz", sep="\t",index=None)
226
+
227
+ return matched_snp_list_path, matched_sumstats_paths
228
+
229
+ ###################################################################################################################################################################
230
+ ####################################################################################################
231
+ def _load_ld_matrix(path,
232
+ fmt="npz",
233
+ if_square=False,
234
+ if_add_T=False,
235
+ log=Log(),
236
+ verbose=True):
237
+
238
+ if fmt == "npz":
239
+ log.write(" -Loading LD matrix from npz file...",verbose=verbose)
240
+ r_matrix = sparse.load_npz(path).toarray()
241
+ if fmt == "txt":
242
+ log.write(" -Loading LD matrix from text file...",verbose=verbose)
243
+ r_matrix = np.loadtxt(path,delimiter="\t")
244
+ log.write(" -LD matrix shape : {}".format(r_matrix.shape) ,verbose=verbose)
245
+
246
+ if if_add_T==True:
247
+ log.write(" -Transforming LD matrix by adding its transpose...",verbose=verbose)
248
+ r_matrix += r_matrix.T
249
+ if if_square==True:
250
+ log.write(" -Transforming LD matrix by squaring all elements...",verbose=verbose)
251
+ r_matrix = np.power(r_matrix,2)
252
+ return r_matrix
253
+
254
+ def _load_ld_map(path,
255
+ snpid="rsid",
256
+ chrom="chromosome",
257
+ pos="position",
258
+ ref="allele1",
259
+ alt="allele2",
260
+ ld_map_rename_dic = None,
261
+ **ld_map_kwargs):
262
+
263
+ if ld_map_rename_dic is not None:
264
+ # ld map format
265
+ # SNPID_bim,CHR,POS, NEA_bim, EA_bim
266
+ if type(ld_map_rename_dic) is dict:
267
+ ld_map_rename_dic_to_use={ld_map_rename_dic["EA"]:'EA_bim',
268
+ ld_map_rename_dic["NEA"]:'NEA_bim',
269
+ ld_map_rename_dic["POS"]:'POS',
270
+ ld_map_rename_dic["CHR"]:'CHR',
271
+ ld_map_rename_dic["SNPID"]:'SNPID_bim'
272
+ }
273
+ ld_map_kwargs["usecols"]=list(ld_map_rename_dic.values())
274
+ else:
275
+ ld_map_rename_dic_to_use={ld_map_rename_dic[4]:'EA_bim',
276
+ ld_map_rename_dic[3]:'NEA_bim',
277
+ ld_map_rename_dic[2]:'POS',
278
+ ld_map_rename_dic[1]:'CHR',
279
+ ld_map_rename_dic[0]:'SNPID_bim'
280
+ }
281
+ ld_map_kwargs["usecols"]=ld_map_rename_dic
282
+ else:
283
+ ld_map_rename_dic_to_use={alt:'EA_bim',
284
+ ref:'NEA_bim',
285
+ pos:'POS',
286
+ chrom:'CHR',
287
+ snpid:"SNPID_bim"
288
+ }
289
+ ld_map_kwargs["usecols"]=[chrom, pos, ref, alt, snpid]
290
+ #rsid chromosome position allele1 allele2
291
+ if "sep" not in ld_map_kwargs:
292
+ ld_map_kwargs["sep"] = "\s+"
293
+
294
+ ld_map = pd.read_csv(path,**ld_map_kwargs)
295
+ ld_map = ld_map.rename(columns=ld_map_rename_dic_to_use, errors='ignore')
296
+ # "SNPID",0:"CHR_bim",3:"POS_bim",4:"EA_bim",5:"NEA_bim"
297
+ return ld_map
298
+
299
+ def _extract_variants_from_ld_matrix_m(merged_sumstats, r_matrix, out, group, row, windowsizekb, log, verbose, index):
300
+ # study suffixes starting from 1
301
+ index_bim_header = "_INDEX_BIM_{}".format(index + 1)
302
+ flipped_header = "_FLIPPED_{}".format(index + 1)
303
+
304
+ # a series of int to indicate if the variant index in raw ld matrix
305
+ avaiable_index = merged_sumstats[index_bim_header].values
306
+
307
+ # a series of boolean values to indicate if the variants is flipped
308
+ flipped = merged_sumstats[flipped_header].values
309
+
310
+ # extract the sub-matrix
311
+ reduced_r_matrix = r_matrix[np.ix_(avaiable_index, avaiable_index)]
312
+
313
+ log.write(" -Flipping LD matrix for {} variants...".format(sum(flipped)),verbose=verbose)
314
+ reduced_r_matrix[flipped,:] = -1 * reduced_r_matrix[flipped,:]
315
+ reduced_r_matrix[:,flipped] = -1 * reduced_r_matrix[:,flipped]
316
+
317
+ output_prefix = "{}/{}_{}_{}_{}".format(out.rstrip("/"),group,row["SNPID"],windowsizekb, index + 1)
318
+ output_path = "{}.ld.gz".format(output_prefix)
319
+
320
+ pd.DataFrame(reduced_r_matrix).to_csv(output_path,sep="\t",index=None,header=None)
321
+ #reduced_r_matrix.to_csv("{}.ld.gz".format(output_prefix),se="\t")
322
+ return output_path
323
+
324
+ def _merge_ld_map_with_sumstats_m(row,
325
+ locus_sumstats,
326
+ ld_map,
327
+ log=Log(),
328
+ index=None):
329
+ '''
330
+ align sumstats with ld map
331
+ '''
332
+ # study suffixes starting from 1
333
+ index_suffix = "_{}".format(index+1)
334
+
335
+ index1= "_INDEX_SUMSTATS"
336
+ index2= "_INDEX_BIM" +index_suffix
337
+
338
+ # Sumstats index
339
+ locus_sumstats[index1] = locus_sumstats.index
340
+
341
+ # ld map index
342
+ ld_map[index2] = ld_map.index
343
+
344
+ # init a column to show if the variants in LD map are flipped or not
345
+ locus_sumstats["_FLIPPED"+index_suffix] = False
346
+
347
+
348
+ log.write(" -Variants in locus ({}): {}".format(row["SNPID"],len(locus_sumstats)))
349
+ # convert category to string
350
+ locus_sumstats["EA"] = locus_sumstats["EA"].astype("string")
351
+ locus_sumstats["NEA"] = locus_sumstats["NEA"].astype("string")
352
+
353
+ # matching by SNPID
354
+ # preserve bim keys (use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys.)
355
+ # vairants without a match were removed
356
+ combined_df = pd.merge(ld_map, locus_sumstats, on=["CHR","POS"],how="inner")
357
+
358
+ # match allele
359
+ perfect_match = ((combined_df["EA"] == combined_df["EA_bim"]) & (combined_df["NEA"] == combined_df["NEA_bim"]) )
360
+ log.write(" -Variants with perfect matched alleles:{}".format(sum(perfect_match)))
361
+
362
+ # fliipped allele
363
+ #ea_mis_match = combined_df["EA"] != combined_df["EA_bim"]
364
+ flipped_match = ((combined_df["EA"] == combined_df["NEA_bim"])& (combined_df["NEA"] == combined_df["EA_bim"]))
365
+ log.write(" -Variants with flipped alleles:{}".format(sum(flipped_match)))
366
+
367
+ allele_match = perfect_match | flipped_match
368
+ log.write(" -Total Variants matched:{}".format(sum(allele_match)))
369
+
370
+ combined_df.loc[flipped_match,"_FLIPPED"+index_suffix] = True
371
+
372
+ if row["SNPID"] not in combined_df.loc[allele_match,"SNPID"].values:
373
+ log.warning("Lead variant was not available in reference!")
374
+
375
+ # adjust output columns
376
+ output_columns=["SNPID","CHR","POS","EA","NEA"]
377
+ for i in combined_df.columns:
378
+ if "_INDEX_BIM" in i:
379
+ output_columns.append(i)
380
+ if "_FLIPPED" in i:
381
+ output_columns.append(i)
382
+
383
+ for i in range(2):
384
+ # study suffixes starting from 1
385
+ index_suffix = "_{}".format(i+1)
386
+ if ("BETA"+index_suffix in combined_df.columns) and ("SE"+index_suffix in combined_df.columns):
387
+ output_columns.append("BETA"+index_suffix)
388
+ output_columns.append("SE"+index_suffix)
389
+ if "Z"+index_suffix in combined_df.columns:
390
+ output_columns.append("Z"+index_suffix)
391
+ if "EAF"+index_suffix in combined_df.columns:
392
+ output_columns.append("EAF"+index_suffix)
393
+ if "N"+index_suffix in combined_df.columns:
394
+ output_columns.append("N"+index_suffix)
395
+
396
+ return combined_df.loc[allele_match,output_columns]
@@ -99,8 +99,6 @@ def _run_two_sample_mr(sumstatspair_object,
99
99
  write.csv(results_directionality, "{prefix}.directionality", row.names = FALSE)
100
100
  '''.format(prefix = prefix)
101
101
 
102
-
103
-
104
102
  # Two Sample MR
105
103
  # Tests
106
104
  ## Pleiotropy