gwaslab 3.5.7__py3-none-any.whl → 3.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (67)
  1. gwaslab/__init__.py +2 -0
  2. gwaslab/bd_common_data.py +1 -0
  3. gwaslab/bd_get_hapmap3.py +0 -1
  4. gwaslab/data/formatbook.json +78 -0
  5. gwaslab/data/reference.json +3 -1
  6. gwaslab/g_Sumstats.py +110 -25
  7. gwaslab/g_SumstatsMulti.py +287 -0
  8. gwaslab/g_SumstatsPair.py +101 -16
  9. gwaslab/g_Sumstats_polars.py +245 -0
  10. gwaslab/g_headers.py +12 -3
  11. gwaslab/g_meta.py +124 -47
  12. gwaslab/g_meta_update.py +48 -0
  13. gwaslab/g_vchange_status_polars.py +44 -0
  14. gwaslab/g_version.py +2 -2
  15. gwaslab/hm_casting.py +169 -110
  16. gwaslab/hm_casting_polars.py +202 -0
  17. gwaslab/hm_harmonize_sumstats.py +19 -8
  18. gwaslab/io_load_ld.py +529 -0
  19. gwaslab/io_preformat_input.py +11 -0
  20. gwaslab/io_preformat_input_polars.py +632 -0
  21. gwaslab/io_process_args.py +25 -1
  22. gwaslab/io_read_ldsc.py +34 -3
  23. gwaslab/io_read_pipcs.py +62 -6
  24. gwaslab/prscs_gigrnd.py +122 -0
  25. gwaslab/prscs_mcmc_gtb.py +136 -0
  26. gwaslab/prscs_parse_genet.py +98 -0
  27. gwaslab/qc_build.py +53 -0
  28. gwaslab/qc_check_datatype.py +10 -8
  29. gwaslab/qc_check_datatype_polars.py +128 -0
  30. gwaslab/qc_fix_sumstats.py +25 -23
  31. gwaslab/qc_fix_sumstats_polars.py +193 -0
  32. gwaslab/util_ex_calculate_ldmatrix.py +49 -19
  33. gwaslab/util_ex_gwascatalog.py +71 -28
  34. gwaslab/util_ex_infer_ancestry.py +65 -0
  35. gwaslab/util_ex_ldsc.py +67 -21
  36. gwaslab/util_ex_match_ldmatrix.py +396 -0
  37. gwaslab/util_ex_run_2samplemr.py +0 -2
  38. gwaslab/util_ex_run_ccgwas.py +155 -0
  39. gwaslab/util_ex_run_coloc.py +1 -1
  40. gwaslab/util_ex_run_hyprcoloc.py +117 -0
  41. gwaslab/util_ex_run_magma.py +74 -0
  42. gwaslab/util_ex_run_mesusie.py +155 -0
  43. gwaslab/util_ex_run_mtag.py +92 -0
  44. gwaslab/util_ex_run_prscs.py +85 -0
  45. gwaslab/util_ex_run_susie.py +40 -9
  46. gwaslab/util_in_estimate_ess.py +18 -0
  47. gwaslab/util_in_fill_data.py +20 -1
  48. gwaslab/util_in_filter_value.py +10 -5
  49. gwaslab/util_in_get_sig.py +71 -13
  50. gwaslab/util_in_meta.py +168 -4
  51. gwaslab/util_in_meta_polars.py +174 -0
  52. gwaslab/viz_aux_annotate_plot.py +13 -2
  53. gwaslab/viz_plot_compare_effect.py +87 -23
  54. gwaslab/viz_plot_credible_sets.py +55 -11
  55. gwaslab/viz_plot_effect.py +22 -12
  56. gwaslab/viz_plot_miamiplot2.py +3 -2
  57. gwaslab/viz_plot_mqqplot.py +94 -84
  58. gwaslab/viz_plot_qqplot.py +9 -7
  59. gwaslab/viz_plot_regional2.py +2 -1
  60. gwaslab/viz_plot_stackedregional.py +4 -1
  61. {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info}/METADATA +46 -68
  62. gwaslab-3.6.0.dist-info/RECORD +119 -0
  63. {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info}/WHEEL +1 -1
  64. gwaslab-3.5.7.dist-info/RECORD +0 -96
  65. {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info/licenses}/LICENSE +0 -0
  66. {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info/licenses}/LICENSE_before_v3.4.39 +0 -0
  67. {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info}/top_level.txt +0 -0
gwaslab/util_ex_run_susie.py CHANGED

@@ -9,7 +9,20 @@ from gwaslab.g_version import _check_susie_version
  from gwaslab.qc_fix_sumstats import start_to
  from gwaslab.qc_fix_sumstats import finished

- def _run_susie_rss(filepath, r="Rscript", mode="bs",max_iter=100000,min_abs_corr=0.1,refine="TRUE",L=10, fillldna=True, n=None, delete=False, susie_args="", log=Log(),verbose=True):
+ def _run_susie_rss(filepath,
+                    r="Rscript",
+                    mode="bs",
+                    max_iter=100000,
+                    min_abs_corr=0.1,
+                    refine="TRUE",
+                    L=10,
+                    fillldna=True,
+                    n=None,
+                    delete=False, # if True, delete the output file
+                    susie_args="",
+                    log=Log(),
+                    main_sumstats=None,
+                    verbose=True):
  ##start function with col checking##########################################################
  _start_line = "run finemapping using SuSieR from command line"
  _end_line = "running finemapping using SuSieR from command line"
@@ -43,8 +56,8 @@ def _run_susie_rss(filepath, r="Rscript", mode="bs",max_iter=100000,min_abs_corr
  for index, row in filelist.iterrows():
      gc.collect()
      study = row["STUDY"]
-     ld_r_matrix = row["LD_R_MATRIX"]
-     sumstats = row["LOCUS_SUMSTATS"]
+     ld_r_matrix = row["LD_R_MATRIX"] # LD matrix path
+     sumstats = row["LOCUS_SUMSTATS"] # sumstats path
      output_prefix = sumstats.replace(".sumstats.gz","")
      log.write(" -Running for: {} - {}".format(row["SNPID"],row["STUDY"] ))
      log.write(" -Locus sumstats:{}".format(sumstats))
@@ -54,7 +67,7 @@ def _run_susie_rss(filepath, r="Rscript", mode="bs",max_iter=100000,min_abs_corr
      rscript='''
      library(susieR)

-     sumstats <- read.csv("{}")
+     sumstats <- read.csv("{}",sep="\t")

      R <- as.matrix(read.csv("{}",sep="\t",header=FALSE))
      {}
@@ -67,6 +80,8 @@ def _run_susie_rss(filepath, r="Rscript", mode="bs",max_iter=100000,min_abs_corr
      output <- susie_fitted_summary$vars
      output$SNPID <- sumstats$SNPID[susie_fitted_summary$vars$variable]
+     output$LOCUS <- "{}"
+     output$STUDY <- "{}"

      write.csv(output, "{}.pipcs", row.names = FALSE)
      '''.format(sumstats,
@@ -79,6 +94,8 @@ def _run_susie_rss(filepath, r="Rscript", mode="bs",max_iter=100000,min_abs_corr
                 refine,
                 L,
                 susie_args,
+                row["SNPID"],
+                row["STUDY"],
                 output_prefix)
      susier_line = "susie_rss({}, n = {}, R = R, max_iter = {}, min_abs_corr={}, refine = {}, L = {}{})".format("z= sumstats$Z," if mode=="z" else "bhat = sumstats$BETA,shat = sumstats$SE,",
                 n if n is not None else "n",
@@ -88,34 +105,48 @@ def _run_susie_rss(filepath, r="Rscript", mode="bs",max_iter=100000,min_abs_corr
                 L,
                 susie_args)
      log.write(" -SuSieR script: {}".format(susier_line))
-     with open("_{}_{}_gwaslab_susie_temp.R".format(study,row["SNPID"]),"w") as file:
+
+     temp_r_path = "_{}_{}_{}_gwaslab_susie_temp.R".format(study,row["SNPID"],id(sumstats))
+     log.write(" -Creating temp R script: {}".format(temp_r_path))
+     with open(temp_r_path,"w") as file:
          file.write(rscript)

-     script_run_r = "{} _{}_{}_gwaslab_susie_temp.R".format(r, study,row["SNPID"])
+     script_run_r = "{} {}".format(r, temp_r_path)

      try:
+         log.write(" -Running SuSieR from command line...")
          output = subprocess.check_output(script_run_r, stderr=subprocess.STDOUT, shell=True,text=True)
          #plink_process = subprocess.Popen("exec "+script_run_r, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True,text=True)
          #output1,output2 = plink_process.communicate()
          #output= output1 + output2+ "\n"
          #plink_process.kill()
-         log.write(" -Running SuSieR from command line...")
+
          r_log+= output + "\n"
          pip_cs = pd.read_csv("{}.pipcs".format(output_prefix))
          pip_cs["LOCUS"] = row["SNPID"]
          pip_cs["STUDY"] = row["STUDY"]
          locus_pip_cs = pd.concat([locus_pip_cs,pip_cs],ignore_index=True)

-         os.remove("_{}_{}_gwaslab_susie_temp.R".format(study,row["SNPID"]))
+         os.remove(temp_r_path)
+         log.write(" -Removing temp R script: {}".format(temp_r_path))
+
          if delete == True:
              os.remove("{}.pipcs".format(output_prefix))
+             log.write(" -Removing output file: {}.pipcs".format(output_prefix))
          else:
              log.write(" -SuSieR result summary to: {}".format("{}.pipcs".format(output_prefix)))
      except subprocess.CalledProcessError as e:
          log.write(e.output)
-         os.remove("_{}_{}_gwaslab_susie_temp.R".format(study,row["SNPID"]))
+         os.remove(temp_r_path)
+         log.write(" -Removing temp R script: {}".format(temp_r_path))

  locus_pip_cs = locus_pip_cs.rename(columns={"variable":"N_SNP","variable_prob":"PIP","cs":"CREDIBLE_SET_INDEX"})
+ locus_pip_cs = pd.merge(locus_pip_cs, main_sumstats, on="SNPID",how="left")
+
  finished(log=log, verbose=verbose, end_line=_end_line)
  return locus_pip_cs

+ def _get_cs_lead(pipcs):
+     leads = pipcs.loc[pipcs["CREDIBLE_SET_INDEX"]>0,:]
+     leads = leads.sort_values(by="PIP",ascending=False).drop_duplicates(subset=["STUDY","LOCUS","CREDIBLE_SET_INDEX"])
+     return leads
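
The new _get_cs_lead helper keeps only the top-PIP variant per credible set (susieR marks variants outside any credible set with cs = -1, hence the > 0 filter). A minimal sketch of the same logic on a toy .pipcs table; all values below are made up for illustration:

import pandas as pd

# Toy .pipcs table (illustrative values only)
pipcs = pd.DataFrame({
    "SNPID": ["rs1", "rs2", "rs3", "rs4"],
    "STUDY": ["A"] * 4,
    "LOCUS": ["rs1"] * 4,
    "PIP":   [0.90, 0.05, 0.60, 0.30],
    "CREDIBLE_SET_INDEX": [1, 1, 2, -1],  # -1 = not in any credible set
})

leads = pipcs.loc[pipcs["CREDIBLE_SET_INDEX"] > 0, :]
leads = (leads.sort_values(by="PIP", ascending=False)
              .drop_duplicates(subset=["STUDY", "LOCUS", "CREDIBLE_SET_INDEX"]))
# keeps rs1 (lead of set 1) and rs3 (lead of set 2); rs4 is dropped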
gwaslab/util_in_estimate_ess.py ADDED

@@ -0,0 +1,18 @@
+ import numpy as np
+ from scipy.stats import norm
+ from gwaslab.g_Log import Log
+
+
+ def _get_ess(sumstats, method="metal",log=Log(),verbose=True):
+     log.write("Start to estimate effective sample size (N_EFF)...", verbose=verbose)
+     if type(method) is str:
+         if method =="metal":
+             log.write(" - Method: {} ".format(method), verbose=verbose)
+             log.write(" - Reference: {} ".format("Willer, C. J., Li, Y., & Abecasis, G. R. (2010)"), verbose=verbose)
+             log.write(" - Equation: {} ".format(" N_EFF = 4 * N_CASE * N_CONTROL / (N_CASE + N_CONTROL)"), verbose=verbose)
+             # Willer, C. J., Li, Y., & Abecasis, G. R. (2010). METAL: fast and efficient meta-analysis of genomewide association scans. Bioinformatics, 26(17), 2190-2191.
+             sumstats["N_EFF"] = 4 / (1/sumstats["N_CASE"] + 1/sumstats["N_CONTROL"])
+     else:
+         sumstats["N_EFF"] = method
+     log.write("Finished estimating effective sample size (N_EFF)...", verbose=verbose)
+     return sumstats
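
For a binary trait, N_EFF = 4 / (1/N_CASE + 1/N_CONTROL), which is algebraically identical to the METAL form 4 * N_CASE * N_CONTROL / (N_CASE + N_CONTROL). A quick worked check with illustrative counts:

n_case, n_control = 1000, 9000              # illustrative counts
n_eff = 4 / (1 / n_case + 1 / n_control)    # = 4 * 1000 * 9000 / 10000 = 3600.0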
gwaslab/util_in_fill_data.py CHANGED

@@ -355,4 +355,23 @@ def rank_based_int(series, c=3/8):
  #https://onlinelibrary.wiley.com/doi/10.1111/biom.13214
  n=sum(~series.isna())
  normalized_value = norm.ppf((series.rank()-c)/(n+1-2*c))
- return normalized_value
+ return normalized_value
+
+
+ ################################################################################################################################################################################
+
+ def _get_multi_min(sumstats_multi, col, nstudy,log=Log(), verbose=True):
+     cols =[]
+     for i in range(nstudy):
+         single_header = "{}_{}".format(col, i + 1)
+         if single_header in sumstats_multi.columns:
+             cols.append(single_header)
+
+     combined_header = "{}_{}".format(col, "MIN")
+     log.write(" -Filling {} using {}".format(combined_header,",".join(cols)), verbose=verbose)
+     sumstats_multi[combined_header] = sumstats_multi[cols].min(axis=1)
+
+     combined_header_index = "{}_{}_COL".format(col, "MIN")
+     sumstats_multi[combined_header_index] = sumstats_multi[cols].idxmin(axis=1)
+     return sumstats_multi
+
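
_get_multi_min collapses per-study columns named COL_1 ... COL_n into a row-wise minimum plus the column that supplied it. A toy sketch for col="P" across two studies (values are made up):

import pandas as pd

df = pd.DataFrame({"P_1": [1e-8, 0.5], "P_2": [1e-4, 1e-3]})  # toy p-values
cols = ["P_1", "P_2"]
df["P_MIN"] = df[cols].min(axis=1)          # [1e-8, 1e-3]
df["P_MIN_COL"] = df[cols].idxmin(axis=1)   # ["P_1", "P_2"]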
gwaslab/util_in_filter_value.py CHANGED

@@ -217,7 +217,10 @@ def filterregionout(sumstats, path=None, chrom="CHR",pos="POS", high_ld=False, b
  gc.collect()
  return sumstats

- def inferbuild(sumstats,status="STATUS",chrom="CHR", pos="POS", ea="EA", nea="NEA",build="19", verbose=True,log=Log()):
+ def inferbuild(sumstats,status="STATUS",chrom="CHR", pos="POS",
+                ea="EA", nea="NEA",build="19",
+                change_status=True,
+                verbose=True,log=Log()):
  ##start function with col checking##########################################################
  _start_line = "infer genome build version using hapmap3 SNPs"
  _end_line = "inferring genome build version using hapmap3 SNPs"
@@ -261,13 +264,15 @@ def inferbuild(sumstats,status="STATUS",chrom="CHR", pos="POS", ea="EA", nea="NE

  if match_count_for_19 > match_count_for_38:
      log.write(" -Since num_hg19 >> num_hg38, assigning genome build hg19...", verbose=verbose)
-     sumstats[status] = vchange_status(sumstats[status],1,"9","1")
-     sumstats[status] = vchange_status(sumstats[status],2,"9","9")
+     if change_status==True:
+         sumstats[status] = vchange_status(sumstats[status],1,"9","1")
+         sumstats[status] = vchange_status(sumstats[status],2,"9","9")
      inferred_build="19"
  elif match_count_for_19 < match_count_for_38:
      log.write(" -Since num_hg19 << num_hg38, assigning genome build hg38...", verbose=verbose)
-     sumstats[status] = vchange_status(sumstats[status],1,"9","3")
-     sumstats[status] = vchange_status(sumstats[status],2,"9","8")
+     if change_status==True:
+         sumstats[status] = vchange_status(sumstats[status],1,"9","3")
+         sumstats[status] = vchange_status(sumstats[status],2,"9","8")
      inferred_build="38"
  else:
      log.write(" -Since num_hg19 = num_hg38, unable to infer...", verbose=verbose)
gwaslab/util_in_get_sig.py CHANGED

@@ -17,6 +17,7 @@ from gwaslab.util_ex_gwascatalog import gwascatalog_trait
  from gwaslab.qc_fix_sumstats import check_dataframe_shape
  from gwaslab.qc_fix_sumstats import start_to
  from gwaslab.qc_fix_sumstats import finished
+ from gwaslab.qc_build import _check_build
  from gwaslab.util_in_correct_winnerscurse import wc_correct
  # getsig
  # closest_gene
@@ -372,6 +373,8 @@ def getnovel(insumstats,
                xymt=["X","Y","MT"],
                anno=False,
                wc_correction=False,
+               use_cache=True,
+               cache_dir="./",
                build="19",
                source="ensembl",
                gwascatalog_source="NCBI",
@@ -405,15 +408,26 @@ def getnovel(insumstats,
  ############################################################################################
  knownsig = pd.DataFrame()
  if efo != False:
+     # For the GWAS Catalog, check that the sumstats build is hg38
+     _check_build(target_build="38" ,build=build ,log=log,verbose=verbose)
+
      if type(efo) is not list:
          log.write("Start to retrieve data using EFO: {}...".format(efo), verbose=verbose)
-         known_Sumstats = gwascatalog_trait(efo,source=gwascatalog_source,sig_level=sig_level,verbose=verbose,log=log)
+         known_Sumstats = gwascatalog_trait(efo,source=gwascatalog_source,
+                                            sig_level=sig_level,
+                                            use_cache=use_cache,
+                                            cache_dir=cache_dir,
+                                            verbose=verbose,log=log)
          knownsig = known_Sumstats.data.copy()
-     else:
+     else:
          knownsig=pd.DataFrame()
          log.write("Start to retrieve data using {} EFOs: {}...".format(len(efo),efo), verbose=verbose)
+
          for single_efo in efo:
-             known_Sumstats = gwascatalog_trait(single_efo,source=gwascatalog_source,sig_level=sig_level,verbose=verbose,log=log)
+             known_Sumstats = gwascatalog_trait(single_efo,source=gwascatalog_source,
+                                                use_cache=use_cache,
+                                                cache_dir=cache_dir,
+                                                sig_level=sig_level,verbose=verbose,log=log)
              known_Sumstats.data["EFOID"] = single_efo
              knownsig = pd.concat([known_Sumstats.data, knownsig],ignore_index=True)
  knownsig["CHR"] = knownsig["CHR"].astype("Int64")
@@ -832,44 +846,88 @@ def _check_novel_set(insumstats,
      else:
          reference_dict[row[group_key]] = {row[snpset]:set([row[snpid]])}
  ############################################################################################
-
+ # match group/trait
  try:
      no_reference_available = allsig.loc[~allsig[group_key].isin(reference_dict.keys()),group_key]
      if len(no_reference_available)>0:
          log.write(" -Groups not in reference: {}".format( ",".join(no_reference_available)), verbose=verbose)
  except:
      pass
+ ############################################################################################

  log.write(" -Checking if variants are in reference variant sets...", verbose=verbose)
- known_list = allsig.apply(lambda x: check_overlap(x,snpid, group_key,reference_dict), axis=1)
+ #known_list = allsig.apply(lambda x: check_overlap(x,snpid, group_key,reference_dict), axis=1)
+ new_row_list = []
+ for index, row in allsig.iterrows():
+     row = check_overlap(row, snpset, snpid, group_key,reference_dict)
+     new_row_list = new_row_list+row
+ known_df = pd.DataFrame(new_row_list,
+                         columns=[snpid,group_key, snpset,"KNOWN_SET","OVERLAP_VARIANT","KNOWN_SET_VARIANT"])

- allsig["KNOWN_SET"] = known_list.str[0]
- allsig["KNOWN_VARIANT"] = known_list.str[1]
+ allsig = pd.merge(allsig,known_df, on=[snpid, group_key, snpset],how="left")
+
+ #allsig["KNOWN_SET"] = known_list.str[0]
+ #allsig["OVERLAP_VARIANT"] = known_list.str[1]
+ #allsig["KNOWN_SET_VARIANT"] = known_list.str[2]

+ ##
+ is_overlapped = ~allsig["KNOWN_SET"].isna()
+ allsig["KNOWN_SET_SIZE"] = 0
+ allsig.loc[is_overlapped, "KNOWN_SET_SIZE"] = allsig.loc[is_overlapped, "KNOWN_SET_VARIANT"].str.len()
+
+ # sumstats set dict
  back_dict={}
  for i in allsig[group_key].unique():
+     # for each trait in sumstats
      back_dict[i] ={}
      for j in allsig.loc[allsig[group_key]==i,snpset].unique():
+         # for each locus in each trait
          back_dict[i][j] =set()
-         for index, row in allsig.loc[(allsig[group_key]==i) & (allsig[snpset]==j) & (~allsig["KNOWN_SET"].isna()),:].iterrows():
-             back_dict[i][j].add("{}-{}-{}".format(row[group_key], row["KNOWN_SET"],row["KNOWN_VARIANT"]))
+         for index, row in allsig.loc[(allsig[group_key]==i) & (allsig[snpset]==j),:].iterrows():
+             # for each variant in each locus
+             back_dict[i][j].add("{}".format(row["SNPID"]))

- allsig["KNOWN_SET_VARIANT"] = allsig.apply(lambda x: assign_set_variant(x,group_key,snpset,back_dict), axis=1)
+ allsig["SUMSTATS_SET_VARIANT"] = allsig.apply(lambda x: assign_set_variant(x,group_key,snpset,back_dict), axis=1)
+ allsig["SUMSTATS_SET_SIZE"] = 0
+ allsig["SUMSTATS_SET_SIZE"] = allsig[ "SUMSTATS_SET_VARIANT"].str.len()

  finished(log,verbose,_end_line)

  return allsig

- def check_overlap(x,snpid, group_key,reference_dict):
+ def check_overlap(x,snpset, snpid, group_key,reference_dict):
+     matched=[]
      if x[group_key] in reference_dict.keys():
+         # if the trait matches
          for key, value in reference_dict[x[group_key]].items():
+             # key = locus, value = snplist
              if x[snpid] in value:
-                 return key, x[snpid]
-     return pd.NA, pd.NA,
+                 # if the sumstats SNP is in the reference snplist for this locus,
+                 # record the locus and the sumstats SNP ID
+                 matched.append( (x[snpid], x[group_key], x[snpset], key, x[snpid], value))
+     if len(matched)==0:
+         matched = [(x[snpid], x[group_key], x[snpset], pd.NA, pd.NA, pd.NA)]
+     return matched
+
+ #def check_overlap(x,snpid, group_key,reference_dict):
+ #    if x[group_key] in reference_dict.keys():
+ #        # if the trait matches
+ #        for key, value in reference_dict[x[group_key]].items():
+ #            # key = locus, value = snplist
+ #            if x[snpid] in value:
+ #                # if the sumstats SNP is in the reference snplist for this locus,
+ #                # return the locus and the sumstats SNP ID
+ #                return key, x[snpid], value
+ #    return pd.NA, pd.NA, pd.NA

  def assign_set_variant(x,group_key,snpset,back_dict):
      if x[group_key] in back_dict.keys():
+         # if the trait matches
          if x[snpset] in back_dict[x[group_key]].keys():
+             # if the locus matches
              if len(back_dict[x[group_key]][x[snpset]]) >0:
+                 # return the sumstats snplist for this locus
                  return back_dict[x[group_key]][x[snpset]]
      return pd.NA
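
check_overlap now returns long-format rows, one per (variant, matched reference set), instead of a single (set, variant) pair, so a variant overlapping several reference sets is no longer truncated to its first hit. A toy sketch of the reference_dict layout it walks and the rows it emits (names below are illustrative):

# reference_dict maps trait -> known set (locus) -> set of known variant IDs
reference_dict = {
    "TRAIT_A": {
        "locus_1": {"rs1", "rs2"},
        "locus_2": {"rs9"},
    }
}
# For a row with SNPID "rs1", trait "TRAIT_A" and sumstats locus "rs1_locus",
# check_overlap emits ("rs1", "TRAIT_A", "rs1_locus", "locus_1", "rs1", {"rs1", "rs2"});
# with no match it emits one row padded with pd.NA.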
gwaslab/util_in_meta.py CHANGED
@@ -7,8 +7,12 @@ from gwaslab.g_Log import Log
  from gwaslab.io_to_pickle import load_data_from_pickle
  from gwaslab.g_Sumstats import Sumstats
  import gc
+ import statsmodels.api as sm

- def meta_analyze(sumstats_list,random_effects=False, match_allele=True, log=Log()):
+ def meta_analyze(sumstats_list,
+                  random_effects=False,
+                  match_allele=True,
+                  log=Log()):

  ###########################################################################
  columns=["SNPID","CHR","POS","EA","NEA"]
@@ -16,6 +20,7 @@ def meta_analyze(sumstats_list,random_effects=False, match_allele=True, log=Log(
  log.write("Start to perform meta-analysis...")
  log.write(" -Datasets:")
+
  for index,sumstats_path in enumerate(sumstats_list):
      if isinstance(sumstats_path, pd.DataFrame):
          log.write(" -Sumstats #{}: {} ".format(index, sumstats_path))
@@ -42,8 +47,6 @@ def meta_analyze(sumstats_list,random_effects=False, match_allele=True, log=Log(
  del new_rows
  gc.collect()

-
-
  ###########################################################################
  log.write(" -Initiating result DataFrame...")
  columns=["SNPID","CHR","POS","EA","NEA","_BETAW_SUM","_EA_N","_NEA_N","_BETA2W_SUM","_W_SUM","EAF","N","DIRECTION","BETA","SE","DOF"]
@@ -231,4 +234,165 @@ def get_sumstats(input_path,usekeys=None):
      sumstats = sumstats[usekeys]
  else:
      sumstats = Sumstats(path,fmt="auto",verbose=False,usekeys=usekeys,**path_args).data
- return sumstats
+ return sumstats
+
+
+ ############################################################################################################################################################################
+
+ def meta_analyze_multi(sumstats_multi,
+                        random_effects=False,
+                        nstudy=1,
+                        match_allele=True,
+                        log=Log()):
+     log.write("Start to perform meta-analysis...")
+     ###########################################################################
+     log.write(" -Initiating result DataFrame...")
+     sumstats_multi["_INDEX"] = range(len(sumstats_multi))
+     results_df = _init_result_df(sumstats_multi)
+     ##########################################################################
+
+     log.write(" -Iterating through {} datasets to compute statistics for fixed-effect model...".format(nstudy))
+     for i in range(nstudy):
+         n="N_{}".format(i+1)
+         beta="BETA_{}".format(i+1)
+         se="SE_{}".format(i+1)
+         eaf="EAF_{}".format(i+1)
+         single_study_cols=[n,beta,se,eaf,"SNPID","_INDEX"]
+         to_use_sumstats = sumstats_multi.loc[~sumstats_multi["BETA_{}".format(i+1)].isna(),single_study_cols].drop_duplicates(subset="_INDEX").set_index("_INDEX")
+
+         sumstats_index = to_use_sumstats.index
+
+         results_df_not_in_sumstat_index = results_df.index[~results_df.index.isin(to_use_sumstats.index)]
+
+         # N and DOF
+         results_df.loc[sumstats_index, "N"] += to_use_sumstats[n].fillna(0)
+         results_df.loc[sumstats_index, "DOF"] += 1
+
+         # BETA and SE
+         results_df.loc[sumstats_index,"_BETA2W_SUM"] += to_use_sumstats[beta]**2 *(1/(to_use_sumstats[se]**2))
+         results_df.loc[sumstats_index,"_BETAW_SUM"] += to_use_sumstats[beta]*(1/(to_use_sumstats[se]**2))
+         results_df.loc[sumstats_index,"_W_SUM"] += 1/(to_use_sumstats[se]**2)
+         results_df.loc[sumstats_index,"_W2_SUM"] += (1/(to_use_sumstats[se]**2))**2
+
+         # EAF
+         results_df.loc[sumstats_index,"_EA_N"] += to_use_sumstats[n]*to_use_sumstats[eaf]
+         results_df.loc[sumstats_index,"_NEA_N"] += to_use_sumstats[n]*(1 - to_use_sumstats[eaf])
+
+         # DIRECTION
+         beta_index = to_use_sumstats[to_use_sumstats[beta]>0].index
+         results_df.loc[beta_index, "DIRECTION"] += "+"
+         beta_index = to_use_sumstats[to_use_sumstats[beta]==0].index
+         results_df.loc[beta_index, "DIRECTION"] += "0"
+         beta_index = to_use_sumstats[to_use_sumstats[beta]<0].index
+         results_df.loc[beta_index, "DIRECTION"] += "-"
+         results_df.loc[results_df_not_in_sumstat_index, "DIRECTION"] += "?"
+
+         del to_use_sumstats
+         gc.collect()
+
+     ##############################################################################
+     # fixed-effect statistics
+     results_df["BETA"] = results_df["_BETAW_SUM"] / results_df["_W_SUM"]
+     results_df["EAF"] = results_df["_EA_N"] / (results_df["_EA_N"] + results_df["_NEA_N"])
+     results_df["SE"] = np.sqrt(1/results_df["_W_SUM"])
+     results_df["Z"] = results_df["BETA"] / results_df["SE"]
+     results_df["P"] = norm.sf(abs(results_df["Z"]))*2
+     results_df["Q"] = results_df["_BETA2W_SUM"] - (results_df["_BETAW_SUM"]**2 / results_df["_W_SUM"])
+
+     for dof in results_df["DOF"].unique():
+         results_df_dof_index = results_df["DOF"] == dof
+         results_df.loc[results_df_dof_index,"P_HET"] = chi2.sf(results_df.loc[results_df_dof_index, "Q"].values,dof)
+         gc.collect()
+
+     results_df["I2"] = (results_df["Q"] - results_df["DOF"])/results_df["Q"]
+     results_df.loc[results_df["I2"]<0, "I2"] = 0
+
+     results_df=results_df.drop(columns=["_EA_N","_NEA_N"])
+     gc.collect()
+
+     ###########################################################################
+     if random_effects==True:
+         log.write(" -Iterating through {} datasets to compute statistics for random-effects model...".format(nstudy))
+         results_df["_R2"] = (results_df["Q"] - results_df["DOF"])/(results_df["_W_SUM"] - (results_df["_W2_SUM"]/results_df["_W_SUM"]))
+         results_df.loc[results_df["_R2"]<0, "_R2"] = 0
+         variant_index_random = results_df[results_df["_R2"]>0].index
+
+         results_df["_BETAW_SUM_R"] = 0.0
+         results_df["_W_SUM_R"] = 0.0
+         results_df["BETA_RANDOM"] = results_df["BETA"]
+         results_df["SE_RANDOM"] = results_df["SE"]
+
+         for i in range(nstudy):
+             n="N_{}".format(i+1)
+             beta="BETA_{}".format(i+1)
+             se="SE_{}".format(i+1)
+             eaf="EAF_{}".format(i+1)
+             single_study_cols=[n,beta,se,eaf,"SNPID","_INDEX"]
+             to_use_sumstats = sumstats_multi.loc[~sumstats_multi["BETA_{}".format(i+1)].isna(),single_study_cols].drop_duplicates(subset="_INDEX").set_index("_INDEX")
+             sumstats_index = to_use_sumstats.index
+
+             # BETA and SE
+             results_df.loc[sumstats_index,"_BETAW_SUM_R"] += to_use_sumstats[beta]*(1/(to_use_sumstats[se]**2 + results_df.loc[sumstats_index,"_R2"]))
+             results_df.loc[sumstats_index,"_W_SUM_R"] += 1/(to_use_sumstats[se]**2 + results_df.loc[sumstats_index,"_R2"])
+
+             del to_use_sumstats
+             del sumstats_index
+             gc.collect()
+
+         results_df.loc[variant_index_random,"BETA_RANDOM"] = results_df.loc[variant_index_random,"_BETAW_SUM_R"] / results_df.loc[variant_index_random,"_W_SUM_R"]
+         results_df.loc[variant_index_random,"SE_RANDOM"] = np.sqrt(1/results_df.loc[variant_index_random,"_W_SUM_R"])
+         results_df["Z_RANDOM"] = results_df["BETA_RANDOM"] / results_df["SE_RANDOM"]
+         results_df["P_RANDOM"] = norm.sf(abs(results_df["Z_RANDOM"]))*2
+         results_df = results_df.drop(columns=["_BETAW_SUM_R","_W_SUM_R"])
+
+     gc.collect()
+     ###########################################################################
+     results_df = results_df.drop(columns=["_BETAW_SUM","_BETA2W_SUM","_W_SUM","_R2","_W2_SUM"]).sort_values(by=["CHR","POS"]).reset_index()
+     gc.collect()
+     log.write("Finished meta-analysis successfully!")
+
+     if random_effects==True:
+         other_cols = ["BETA_RANDOM","SE_RANDOM","Z_RANDOM","P_RANDOM"]
+     else:
+         other_cols = []
+
+     results_df = results_df.drop(columns=["_INDEX"])
+
+     results_df = Sumstats(results_df, fmt="gwaslab", other = other_cols)
+
+     return results_df
+
+ def _init_result_df(sumstats):
+
+     results_df = sumstats[["_INDEX","SNPID","CHR","POS","EA","NEA"]]
+     results_df = results_df.drop_duplicates(subset="_INDEX").set_index("_INDEX")
+
+     results_df["N"] = 0
+     results_df["_BETAW_SUM"] = 0.0
+     results_df["_BETA2W_SUM"] = 0.0
+     results_df["_W_SUM"] = 0.0
+     results_df["_W2_SUM"] = 0.0
+     results_df["_EA_N"] = 0.0
+     results_df["_NEA_N"] = 0.0
+     results_df["DIRECTION"] = ""
+     results_df["BETA"] = 0.0
+     results_df["SE"] = 0.0
+     results_df["DOF"] = -1
+     results_df["_R2"] = 0
+
+     dtype_dict ={
+         "_BETAW_SUM":"float64",
+         "_EA_N":"float64",
+         "_NEA_N":"float64",
+         "_BETA2W_SUM":"float64",
+         "_W_SUM":"float64",
+         "BETA":"float64",
+         "SE":"float64",
+         "N":"Int64",
+         "DOF":"Int64"
+     }
+     results_df=results_df.astype(dtype_dict)
+     return results_df
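
For reference, the accumulators above correspond to the standard fixed-effect and DerSimonian-Laird random-effects estimators, with w_i = 1/se_i^2 and dof = k - 1 for k contributing studies (DOF starts at -1 and is incremented once per study). In LaTeX:

\hat{\beta}_{FE} = \frac{\sum_i w_i \beta_i}{\sum_i w_i}, \qquad
SE_{FE} = \sqrt{\frac{1}{\sum_i w_i}}, \qquad
Q = \sum_i w_i \beta_i^2 - \frac{\left(\sum_i w_i \beta_i\right)^2}{\sum_i w_i}

I^2 = \max\!\left(0,\ \frac{Q - \mathrm{dof}}{Q}\right), \qquad
\tau^2 = \max\!\left(0,\ \frac{Q - \mathrm{dof}}{\sum_i w_i - \sum_i w_i^2 / \sum_i w_i}\right)

w_i^{*} = \frac{1}{se_i^2 + \tau^2}, \qquad
\hat{\beta}_{RE} = \frac{\sum_i w_i^{*} \beta_i}{\sum_i w_i^{*}}, \qquad
SE_{RE} = \sqrt{\frac{1}{\sum_i w_i^{*}}}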