gwaslab 3.4.38__py3-none-any.whl → 3.4.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. See the package registry for more details.

Files changed (51)
  1. gwaslab/bd_common_data.py +6 -3
  2. gwaslab/bd_download.py +9 -9
  3. gwaslab/bd_get_hapmap3.py +43 -9
  4. gwaslab/g_Log.py +14 -5
  5. gwaslab/g_Sumstats.py +86 -18
  6. gwaslab/g_SumstatsPair.py +70 -23
  7. gwaslab/g_SumstatsT.py +2 -2
  8. gwaslab/g_version.py +10 -10
  9. gwaslab/hm_casting.py +9 -4
  10. gwaslab/hm_harmonize_sumstats.py +88 -83
  11. gwaslab/io_preformat_input.py +14 -14
  12. gwaslab/io_read_ldsc.py +49 -1
  13. gwaslab/ldsc_irwls.py +198 -0
  14. gwaslab/ldsc_jackknife.py +514 -0
  15. gwaslab/ldsc_ldscore.py +417 -0
  16. gwaslab/ldsc_parse.py +294 -0
  17. gwaslab/ldsc_regressions.py +747 -0
  18. gwaslab/ldsc_sumstats.py +629 -0
  19. gwaslab/qc_check_datatype.py +1 -1
  20. gwaslab/qc_fix_sumstats.py +163 -161
  21. gwaslab/util_ex_calculate_ldmatrix.py +2 -2
  22. gwaslab/util_ex_gwascatalog.py +24 -24
  23. gwaslab/util_ex_ldproxyfinder.py +9 -9
  24. gwaslab/util_ex_ldsc.py +189 -0
  25. gwaslab/util_in_calculate_gc.py +6 -6
  26. gwaslab/util_in_calculate_power.py +42 -43
  27. gwaslab/util_in_convert_h2.py +8 -8
  28. gwaslab/util_in_fill_data.py +28 -28
  29. gwaslab/util_in_filter_value.py +91 -52
  30. gwaslab/util_in_get_density.py +8 -8
  31. gwaslab/util_in_get_sig.py +407 -65
  32. gwaslab/viz_aux_annotate_plot.py +12 -12
  33. gwaslab/viz_aux_quickfix.py +18 -18
  34. gwaslab/viz_aux_reposition_text.py +3 -3
  35. gwaslab/viz_aux_save_figure.py +14 -5
  36. gwaslab/viz_plot_compare_af.py +29 -30
  37. gwaslab/viz_plot_compare_effect.py +63 -71
  38. gwaslab/viz_plot_miamiplot2.py +6 -6
  39. gwaslab/viz_plot_mqqplot.py +17 -3
  40. gwaslab/viz_plot_qqplot.py +1 -1
  41. gwaslab/viz_plot_regionalplot.py +33 -32
  42. gwaslab/viz_plot_rg_heatmap.py +28 -26
  43. gwaslab/viz_plot_stackedregional.py +40 -21
  44. gwaslab/viz_plot_trumpetplot.py +50 -55
  45. gwaslab-3.4.39.dist-info/LICENSE +674 -0
  46. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/METADATA +4 -3
  47. gwaslab-3.4.39.dist-info/RECORD +80 -0
  48. gwaslab-3.4.38.dist-info/RECORD +0 -72
  49. /gwaslab-3.4.38.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
  50. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
  51. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
gwaslab/bd_common_data.py CHANGED
@@ -280,17 +280,20 @@ def gtf_to_protein_coding(gtfpath,log=Log(),verbose=True):
280
280
  protein_coding_path = gtfpath[:-6]+"protein_coding.gtf.gz"
281
281
  # if not existing, extract protein coding records and output to a new file
282
282
  if not path.isfile(protein_coding_path):
283
+
283
284
  # get gene list
284
- if verbose: log.write(" - Extracting protein_coding genes from {}".format(gtfpath))
285
+ log.write(" - Extracting protein_coding genes from {}".format(gtfpath),verbose=verbose)
285
286
  gtf = read_gtf(gtfpath,usecols=["feature","gene_biotype","gene_id","gene_name"])
286
287
  gene_list = gtf.loc[(gtf["feature"]=="gene") & (gtf["gene_biotype"]=="protein_coding"),"gene_id"].values
287
- if verbose: log.write(" - Loaded {} protein_coding genes.".format(len(gene_list)))
288
+ log.write(" - Loaded {} protein_coding genes.".format(len(gene_list)),verbose=verbose)
289
+
288
290
  # extract entry using csv
289
291
  gtf_raw = pd.read_csv(gtfpath,sep="\t",header=None,comment="#",dtype="string")
290
292
  gtf_raw["_gene_id"] = gtf_raw[8].str.extract(r'gene_id "([\w\.-]+)"')
291
293
  gtf_raw = gtf_raw.loc[ gtf_raw["_gene_id"].isin(gene_list) ,:]
292
294
  gtf_raw = gtf_raw.drop("_gene_id",axis=1)
293
- if verbose: log.write(" - Extracted records are saved to : {} ".format(protein_coding_path))
295
+
296
+ log.write(" - Extracted records are saved to : {} ".format(protein_coding_path),verbose=verbose)
294
297
  gtf_raw.to_csv(protein_coding_path, header=None, index=None, sep="\t")
295
298
 
296
299
  return protein_coding_path
gwaslab/bd_download.py CHANGED
@@ -106,7 +106,7 @@ def check_available_ref(log=Log(),verbose=True):
106
106
  Check available reference files for gwaslab.
107
107
  Return a dictionary of available reference files.
108
108
  '''
109
- if verbose : log.write("Start to check available reference files...")
109
+ log.write("Start to check available reference files...", verbose=verbose)
110
110
  #ref_path = path.dirname(__file__) + '/data/reference.json'
111
111
  ref_path = options.paths["reference"]
112
112
  if not path.exists(ref_path):
@@ -115,11 +115,11 @@ def check_available_ref(log=Log(),verbose=True):
115
115
  dicts = json.load(open(ref_path))
116
116
  if dicts is not None:
117
117
  for key,value in dicts.items():
118
- if verbose :log.write(" -",key," : ",value)
118
+ log.write(" -",key," : ",value, verbose=verbose)
119
119
  return dicts
120
120
  else:
121
- if verbose :log.write(" -No available reference files.")
122
- if verbose :log.write("Finished checking available reference files...")
121
+ log.write(" -No available reference files.", verbose=verbose)
122
+ log.write("Finished checking available reference files...", verbose=verbose)
123
123
  return {}
124
124
 
125
125
  def update_available_ref(log=Log()):
@@ -167,8 +167,8 @@ def get_path(name,log=Log(),verbose=True):
167
167
  #config_path = path.dirname(__file__) + '/data/config.json'
168
168
  config_path = options.paths["config"]
169
169
  if not path.exists(config_path):
170
- if verbose : log.write("Config file not exists...")
171
- if verbose : log.write("Created new config file...")
170
+ log.write("Config file not exists...", verbose=verbose)
171
+ log.write("Created new config file...", verbose=verbose)
172
172
  initiate_config()
173
173
  else:
174
174
  try:
@@ -176,9 +176,9 @@ def get_path(name,log=Log(),verbose=True):
176
176
  if path.exists(dicts[name]):
177
177
  return dicts[name]
178
178
  else:
179
- if verbose : log.write("File not exist.")
179
+ log.write("File not exist.", verbose=verbose)
180
180
  except:
181
- if verbose : log.write("No records in config file. Please download first.")
181
+ log.write("No records in config file. Please download first.", verbose=verbose)
182
182
  return False
183
183
 
184
184
  ##################################################################################
@@ -277,7 +277,7 @@ def check_file_integrity(local_path, md5sum,log):
277
277
  log.write(" -MD5 verified.")
278
278
  return 1
279
279
  else:
280
- log.write("WARNING: -MD5 VERIFICATION FAILED !")
280
+ log.warning("-MD5 VERIFICATION FAILED!")
281
281
  return 0
282
282
 
283
283
  def remove_file(name,log=Log()):
gwaslab/bd_get_hapmap3.py CHANGED
@@ -1,6 +1,10 @@
1
1
  import pandas as pd
2
2
  from os import path
3
3
  from gwaslab.g_Log import Log
4
+ from gwaslab.qc_fix_sumstats import start_to
5
+ from gwaslab.qc_fix_sumstats import skipped
6
+ from gwaslab.qc_fix_sumstats import finished
7
+
4
8
  #A unique identifier (e.g., the rs number)
5
9
  #Allele 1 (effect allele)
6
10
  #Allele 2 (non-effect allele)
@@ -8,30 +12,60 @@ from gwaslab.g_Log import Log
8
12
  #A P-value
9
13
  #A signed summary statistic (beta, OR, log odds, Z-score, etc)
10
14
 
11
- def gethapmap3(sumstats,rsid="rsID",chrom="CHR", pos="POS", ea="EA", nea="NEA",build="19", verbose=True,log=Log()):
12
- if verbose:log.write(" -Processing "+str(len(sumstats))+" raw variants...")
15
+ def gethapmap3(sumstats,rsid="rsID",chrom="CHR", pos="POS", ea="EA", nea="NEA",build="19", verbose=True, match_allele= True, log=Log()):
16
+ ##start function with col checking##########################################################
17
+ _start_line = "extract HapMap3 SNPs"
18
+ _end_line = "extracting HapMap3 SNPs"
19
+ _start_cols =[]
20
+ _start_function = ".gethapmap3"
21
+ _must_args ={}
22
+
23
+ is_enough_info = start_to(sumstats=sumstats,
24
+ log=log,
25
+ verbose=verbose,
26
+ start_line=_start_line,
27
+ end_line=_end_line,
28
+ start_cols=_start_cols,
29
+ start_function=_start_function,
30
+ **_must_args)
31
+ if is_enough_info == False: return None
13
32
 
33
+ ############################################################################################
14
34
  if build=="19":
15
35
  data_path = path.dirname(__file__) + '/data/hapmap3_SNPs/hapmap3_db150_hg19.snplist.gz'
16
36
  elif build=="38":
17
37
  data_path = path.dirname(__file__) + '/data/hapmap3_SNPs/hapmap3_db151_hg38.snplist.gz'
18
38
 
19
- if verbose:log.write(" -Loading Hapmap3 variants data...")
20
-
21
- hapmap3_ref = pd.read_csv(data_path,sep="\s+",usecols=["#CHROM","POS","rsid"],dtype={"#CHROM":"string","POS":"string"})
39
+ log.write(" -Loading Hapmap3 variants from built-in datasets...", verbose=verbose)
40
+
41
+ if match_allele:
42
+ additional_cols= ["A1","A2"]
43
+ else:
44
+ additional_cols=[]
45
+ hapmap3_ref = pd.read_csv(data_path,sep="\s+",usecols=["#CHROM","POS","rsid"]+additional_cols, dtype={"#CHROM":"string","POS":"string"})
46
+
22
47
  #rsid A1 A2 #CHROM POS
23
48
  #rs3094315 G A 1 752566
49
+
24
50
  if rsid in sumstats.columns:
25
51
  output = sumstats.loc[sumstats[rsid].isin(hapmap3_ref["rsid"].values),:].copy()
26
52
  return output
53
+
27
54
  elif chrom in sumstats.columns and pos in sumstats.columns:
28
- if verbose: log.write(" -Since rsID not in sumstats, chr:pos( build "+build+") will be used for matching...")
55
+ log.write(" -Since rsID not in sumstats, CHR:POS( build "+build+") will be used for matching...", verbose=verbose)
29
56
  sumstats ["chr:pos"] = sumstats[chrom].astype("string")+":"+sumstats[pos].astype("string")
30
57
  hapmap3_ref["chr:pos"] = hapmap3_ref["#CHROM"]+":"+hapmap3_ref["POS"]
31
58
  hapmap3_ref = hapmap3_ref.rename(columns={"rsid":"rsID"})
32
- output = pd.merge(sumstats,hapmap3_ref.loc[:,["chr:pos","rsID"]],left_on="chr:pos",right_on="chr:pos",how="inner",suffixes=('', '_hapmap3')).copy()
33
- output = output.drop(columns="chr:pos")
34
- if verbose: log.write(" -Raw input contains "+str(len(output))+" hapmaps variants based on chr:pos...")
59
+ output = pd.merge(sumstats,hapmap3_ref.loc[:,["chr:pos","rsID"]+additional_cols],left_on="chr:pos",right_on="chr:pos",how="inner",suffixes=('', '_hapmap3')).copy()
60
+ if match_allele:
61
+ log.write(" -Checking if alleles are same...")
62
+ is_matched = ((output[ea].astype("string") == output["A1"]) & (output[nea].astype("string") == output["A2"])) \
63
+ | ((output[ea].astype("string") == output["A2"]) & (output[nea].astype("string") == output["A1"]))
64
+ log.write(" -Variants with macthed alleles: {}".format(sum(is_matched)))
65
+ output = output.loc[is_matched,:]
66
+ output = output.drop(columns=["chr:pos"]+additional_cols)
67
+ log.write(" -Raw input contains "+str(len(output))+" Hapmap3 variants based on CHR:POS...", verbose=verbose)
68
+ finished(log=log,verbose=verbose,end_line=_end_line)
35
69
  return output
36
70
  else:
37
71
  raise ValueError("Not enough information to match SNPs. Please check your sumstats...")
gwaslab/g_Log.py CHANGED
@@ -1,12 +1,12 @@
1
1
  import time
2
2
  class Log():
3
3
  def __init__(self):
4
- self.log_text=str(time.ctime(time.time()))+ " " + "Sumstats Object created."+ "\n"
4
+ self.log_text=str(time.strftime('%Y/%m/%d %H:%M:%S'))+ " " + "Sumstats Object created."+ "\n"
5
5
 
6
6
  def write(self,*message,end="\n",show_time=True, verbose=True):
7
7
  if show_time is True:
8
- if verbose: print(str(time.ctime(time.time())),*message,end=end)
9
- self.log_text = self.log_text + str(time.ctime(time.time())) + " " + " ".join(map(str,message)) + end
8
+ if verbose: print(str(time.strftime('%Y/%m/%d %H:%M:%S')),*message,end=end)
9
+ self.log_text = self.log_text + str(time.strftime('%Y/%m/%d %H:%M:%S')) + " " + " ".join(map(str,message)) + end
10
10
  else:
11
11
  if verbose: print(*message,end=end)
12
12
  self.log_text = self.log_text + " ".join(map(str,message)) + end
@@ -21,5 +21,14 @@ class Log():
21
21
  print(self.log_text)
22
22
  def save(self,path,verbose=True):
23
23
  with open(path,"w") as f:
24
- if verbose: print(str(time.ctime(time.time())) + " " + " -Save log file to : ", path)
25
- f.write(self.log_text)
24
+ if verbose: print(str(time.strftime('%Y/%m/%d %H:%M:%S')) + " " + " -Save log file to : ", path)
25
+ f.write(self.log_text)
26
+
27
+
28
+ def log(self,*message,end="\n",show_time=True, verbose=True):
29
+ if show_time is True:
30
+ if verbose: print(str(time.strftime('%Y/%m/%d %H:%M:%S')),*message,end=end)
31
+ self.log_text = self.log_text + str(time.strftime('%Y/%m/%d %H:%M:%S')) + " " + " ".join(map(str,message)) + end
32
+ else:
33
+ if verbose: print(*message,end=end)
34
+ self.log_text = self.log_text + " ".join(map(str,message)) + end
gwaslab/g_Sumstats.py CHANGED
@@ -32,6 +32,9 @@ from gwaslab.util_in_filter_value import filterout
32
32
  from gwaslab.util_in_filter_value import filterin
33
33
  from gwaslab.util_in_filter_value import filterregionin
34
34
  from gwaslab.util_in_filter_value import filterregionout
35
+ from gwaslab.util_in_filter_value import _filter_indel
36
+ from gwaslab.util_in_filter_value import _filter_palindromic
37
+ from gwaslab.util_in_filter_value import _filter_snp
35
38
  from gwaslab.util_in_filter_value import inferbuild
36
39
  from gwaslab.util_in_filter_value import sampling
37
40
  from gwaslab.util_in_filter_value import _get_flanking
@@ -44,6 +47,8 @@ from gwaslab.util_in_get_density import getsignaldensity
44
47
  from gwaslab.util_in_get_density import assigndensity
45
48
  from gwaslab.util_in_get_sig import annogene
46
49
  from gwaslab.util_in_get_sig import getnovel
50
+ from gwaslab.util_in_get_sig import _check_cis
51
+ from gwaslab.util_in_get_sig import _check_novel_set
47
52
  from gwaslab.util_in_fill_data import filldata
48
53
  from gwaslab.bd_get_hapmap3 import gethapmap3
49
54
  from gwaslab.bd_common_data import get_chr_list
@@ -64,6 +69,9 @@ from gwaslab.viz_plot_trumpetplot import plottrumpet
64
69
  from gwaslab.viz_plot_compare_af import plotdaf
65
70
  from gwaslab.util_ex_run_susie import _run_susie_rss
66
71
  from gwaslab.qc_fix_sumstats import _check_data_consistency
72
+ from gwaslab.util_ex_ldsc import _estimate_h2_by_ldsc
73
+ from gwaslab.util_ex_ldsc import _estimate_rg_by_ldsc
74
+ from gwaslab.bd_get_hapmap3 import gethapmap3
67
75
  import gc
68
76
 
69
77
  #20220309
@@ -121,7 +129,8 @@ class Sumstats():
121
129
  # basic attributes
122
130
  self.data = pd.DataFrame()
123
131
  self.log = Log()
124
-
132
+ self.ldsc_h2 = None
133
+ self.ldsc_rg = None
125
134
  # meta information
126
135
  self.meta = _init_meta()
127
136
  self.build = build
@@ -135,7 +144,7 @@ class Sumstats():
135
144
  self.pipcs = pd.DataFrame()
136
145
 
137
146
  # print gwaslab version information
138
- if verbose: _show_version(self.log)
147
+ _show_version(self.log, verbose=verbose)
139
148
 
140
149
  #preformat the data
141
150
  self.data = preformat(
@@ -405,19 +414,16 @@ class Sumstats():
405
414
  _check_data_consistency(self.data,log=self.log,**args)
406
415
  def check_id(self,**args):
407
416
  pass
408
-
409
417
  def check_ref(self,ref_seq,**args):
410
418
  self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
411
419
  self.data = checkref(self.data,ref_seq,log=self.log,**args)
412
420
  def infer_strand(self,ref_infer,**args):
413
421
  self.meta["gwaslab"]["references"]["ref_infer"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer"] , ref_infer)
414
422
  self.data = parallelinferstrand(self.data,ref_infer=ref_infer,log=self.log,**args)
415
-
416
423
  def flip_allele_stats(self,**args):
417
424
  self.data = flipallelestats(self.data,log=self.log,**args)
418
425
  def normalize_allele(self,**args):
419
426
  self.data = parallelnormalizeallele(self.data,log=self.log,**args)
420
-
421
427
  def assign_rsid(self,
422
428
  ref_rsid_tsv=None,
423
429
  ref_rsid_vcf=None,
@@ -428,14 +434,11 @@ class Sumstats():
428
434
  if ref_rsid_vcf is not None:
429
435
  self.data = parallelizeassignrsid(self.data,path=ref_rsid_vcf,ref_mode="vcf",log=self.log,**args)
430
436
  self.meta["gwaslab"]["references"]["ref_rsid_vcf"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_rsid_vcf"] , ref_rsid_vcf)
431
-
432
437
  def rsid_to_chrpos(self,**args):
433
438
  self.data = rsidtochrpos(self.data,log=self.log,**args)
434
-
435
439
  def rsid_to_chrpos2(self,**args):
436
440
  self.data = parallelrsidtochrpos(self.data,log=self.log,**args)
437
441
 
438
-
439
442
  ############################################################################################################
440
443
 
441
444
  def sort_coordinate(self,**sort_args):
@@ -458,7 +461,6 @@ class Sumstats():
458
461
  return new_Sumstats_object
459
462
  else:
460
463
  self.data = _get_flanking(self.data, **args)
461
-
462
464
  def filter_flanking_by_chrpos(self, chrpos, inplace=False,**args):
463
465
  if inplace is False:
464
466
  new_Sumstats_object = copy.deepcopy(self)
@@ -466,7 +468,6 @@ class Sumstats():
466
468
  return new_Sumstats_object
467
469
  else:
468
470
  self.data = _get_flanking_by_chrpos(self.data, chrpos,**args)
469
-
470
471
  def filter_flanking_by_id(self, snpid, inplace=False,**args):
471
472
  if inplace is False:
472
473
  new_Sumstats_object = copy.deepcopy(self)
@@ -474,7 +475,6 @@ class Sumstats():
474
475
  return new_Sumstats_object
475
476
  else:
476
477
  self.data = _get_flanking_by_id(self.data, snpid, **args)
477
-
478
478
  def filter_value(self, expr, inplace=False, **args):
479
479
  if inplace is False:
480
480
  new_Sumstats_object = copy.deepcopy(self)
@@ -482,7 +482,6 @@ class Sumstats():
482
482
  return new_Sumstats_object
483
483
  else:
484
484
  self.data = filtervalues(self.data, expr,log=self.log,**args)
485
-
486
485
  def filter_out(self, inplace=False, **args):
487
486
  if inplace is False:
488
487
  new_Sumstats_object = copy.deepcopy(self)
@@ -490,7 +489,6 @@ class Sumstats():
490
489
  return new_Sumstats_object
491
490
  else:
492
491
  self.data = filterout(self.data,log=self.log,**args)
493
-
494
492
  def filter_in(self, inplace=False, **args):
495
493
  if inplace is False:
496
494
  new_Sumstats_object = copy.deepcopy(self)
@@ -512,7 +510,28 @@ class Sumstats():
512
510
  return new_Sumstats_object
513
511
  else:
514
512
  self.data = filterregionout(self.data,log=self.log,**args)
515
-
513
+ def filter_palindromic(self, inplace=False, **args):
514
+ if inplace is False:
515
+ new_Sumstats_object = copy.deepcopy(self)
516
+ new_Sumstats_object.data = _filter_palindromic(new_Sumstats_object.data,log=new_Sumstats_object.log,**args)
517
+ return new_Sumstats_object
518
+ else:
519
+ self.data = _filter_palindromic(self.data,log=self.log,**args)
520
+ def filter_snp(self, inplace=False, **args):
521
+ if inplace is False:
522
+ new_Sumstats_object = copy.deepcopy(self)
523
+ new_Sumstats_object.data = _filter_snp(new_Sumstats_object.data,log=new_Sumstats_object.log,**args)
524
+ return new_Sumstats_object
525
+ else:
526
+ self.data = _filter_snp(self.data,log=self.log,**args)
527
+ def filter_indel(self, inplace=False, **args):
528
+ if inplace is False:
529
+ new_Sumstats_object = copy.deepcopy(self)
530
+ new_Sumstats_object.data = _filter_indel(new_Sumstats_object.data,log=new_Sumstats_object.log,**args)
531
+ return new_Sumstats_object
532
+ else:
533
+ self.data = _filter_indel(self.data,log=self.log,**args)
534
+
516
535
  def random_variants(self,inplace=False,n=1,p=None,**args):
517
536
  if inplace is True:
518
537
  self.data = sampling(self.data,n=n,p=p,log=self.log,**args)
@@ -520,18 +539,25 @@ class Sumstats():
520
539
  new_Sumstats_object = copy.deepcopy(self)
521
540
  new_Sumstats_object.data = sampling(new_Sumstats_object.data,n=n,p=p,log=new_Sumstats_object.log,**args)
522
541
  return new_Sumstats_object
523
-
542
+
543
+ def filter_hapmap3(self, inplace=False, build=None, **args ):
544
+ if build is None:
545
+ build = self.meta["gwaslab"]["genome_build"]
546
+ if inplace is True:
547
+ self.data = gethapmap3(self.data, build=build,log=self.log, **args)
548
+ else:
549
+ new_Sumstats_object = copy.deepcopy(self)
550
+ new_Sumstats_object.data = gethapmap3(new_Sumstats_object.data, build=build,log=self.log, **args)
551
+ return new_Sumstats_object
524
552
  ######################################################################
525
553
 
526
554
  def check_af(self,ref_infer,**args):
527
555
  self.data = parallelecheckaf(self.data,ref_infer=ref_infer,log=self.log,**args)
528
556
  self.meta["gwaslab"]["references"]["ref_infer_daf"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer_daf"] , ref_infer)
529
-
530
557
  def infer_af(self,ref_infer,**args):
531
558
  self.data = paralleleinferaf(self.data,ref_infer=ref_infer,log=self.log,**args)
532
559
  self.meta["gwaslab"]["references"]["ref_infer_af"] = ref_infer
533
560
  self.meta["gwaslab"]["references"]["ref_infer_af"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer_af"] , ref_infer)
534
-
535
561
  def plot_daf(self, **args):
536
562
  fig,outliers = plotdaf(self.data, **args)
537
563
  return fig, outliers
@@ -637,7 +663,37 @@ class Sumstats():
637
663
  **args)
638
664
  # return sumstats object
639
665
  return output
640
-
666
+
667
+ def check_cis(self, **args):
668
+ if "SNPID" in self.data.columns:
669
+ id_to_use = "SNPID"
670
+ else:
671
+ id_to_use = "rsID"
672
+ output = _check_cis(self.data,
673
+ id=id_to_use,
674
+ chrom="CHR",
675
+ pos="POS",
676
+ p="P",
677
+ log=self.log,
678
+ **args)
679
+ # return sumstats object
680
+ return output
681
+
682
+ def check_novel_set(self, **args):
683
+ if "SNPID" in self.data.columns:
684
+ id_to_use = "SNPID"
685
+ else:
686
+ id_to_use = "rsID"
687
+ output = _check_novel_set(self.data,
688
+ id=id_to_use,
689
+ chrom="CHR",
690
+ pos="POS",
691
+ p="P",
692
+ log=self.log,
693
+ **args)
694
+ # return sumstats object
695
+ return output
696
+
641
697
  def anno_gene(self, **args):
642
698
  if "SNPID" in self.data.columns:
643
699
  id_to_use = "SNPID"
@@ -673,6 +729,18 @@ class Sumstats():
673
729
  output = lambdaGC(self.data[["CHR",mode]],mode=mode,**args)
674
730
  self.meta["Genomic inflation factor"] = output
675
731
  return output
732
+
733
+ def estimate_h2_by_ldsc(self, build=None, verbose=True, match_allele=True, **args):
734
+ if build is None:
735
+ build = self.meta["gwaslab"]["genome_build"]
736
+ insumstats = gethapmap3(self.data.copy(), build=build, verbose=verbose , match_allele=True )
737
+ self.ldsc_h2 = _estimate_h2_by_ldsc(insumstats=insumstats, log=self.log, verbose=verbose, **args)
738
+
739
+ def estimate_rg_by_ldsc(self, build=None, verbose=True, match_allele=True, **args):
740
+ if build is None:
741
+ build = self.meta["gwaslab"]["genome_build"]
742
+ insumstats = gethapmap3(self.data.copy(), build=build, verbose=verbose , match_allele=True )
743
+ self.ldsc_rg = _estimate_rg_by_ldsc(insumstats=insumstats, log=self.log, verbose=verbose, **args)
676
744
  # external ################################################################################################
677
745
 
678
746
  def to_finemapping(self,**args):
gwaslab/g_SumstatsPair.py CHANGED
@@ -6,23 +6,26 @@ from gwaslab.util_in_filter_value import filtervalues
6
6
  from gwaslab.g_Log import Log
7
7
  from math import floor
8
8
  from gwaslab.g_Sumstats import Sumstats
9
- from gwaslab.hm_casting import _merge_mold_with_sumstats
9
+ from gwaslab.hm_casting import _merge_mold_with_sumstats_by_chrpos
10
10
  from gwaslab.hm_casting import _align_with_mold
11
11
  from gwaslab.hm_casting import _fill_missing_columns
12
12
  from gwaslab.hm_casting import _check_daf
13
13
  from gwaslab.hm_casting import _assign_warning_code
14
14
  from gwaslab.qc_fix_sumstats import flipallelestats
15
+ from gwaslab.qc_check_datatype import check_datatype
16
+ from gwaslab.qc_check_datatype import check_dataframe_shape
15
17
  from gwaslab.hm_casting import _renaming_cols
16
18
  from gwaslab.hm_casting import _sort_pair_cols
17
19
  from gwaslab.util_ex_calculate_ldmatrix import tofinemapping
18
20
  from gwaslab.util_ex_run_coloc import _run_coloc_susie
19
21
  from gwaslab.viz_plot_miamiplot2 import plot_miami2
22
+ from gwaslab.viz_plot_compare_af import plotdaf
20
23
  from gwaslab.util_ex_run_2samplemr import _run_two_sample_mr
21
24
  from gwaslab.util_ex_run_clumping import _clump
22
25
  from gwaslab.util_ex_ldproxyfinder import _extract_with_ld_proxy
23
26
 
24
27
  class SumstatsPair( ):
25
- def __init__(self, sumstatsObject1, sumstatsObject2, study=None, suffixes = ("_1","_2") ):
28
+ def __init__(self, sumstatsObject1, sumstatsObject2, study=None, suffixes = ("_1","_2") ,verbose=True ):
26
29
 
27
30
  if not isinstance(sumstatsObject1, Sumstats):
28
31
  raise ValueError("Please provide GWASLab Sumstats Object #1.")
@@ -34,7 +37,9 @@ class SumstatsPair( ):
34
37
  self.study_name = "{}_{}".format("STUDY1", "STUDY2")
35
38
  self.snp_info_cols = []
36
39
  self.stats_cols =[]
37
- self.other_cols=[]
40
+ self.stats_cols2 =[]
41
+ self.other_cols =[]
42
+ self.other_cols2 =[]
38
43
  self.log = Log()
39
44
  self.suffixes = suffixes
40
45
  self.colocalization=pd.DataFrame()
@@ -43,28 +48,53 @@ class SumstatsPair( ):
43
48
  self.mr = {}
44
49
  self.clumps ={}
45
50
  self.ns = None
51
+ self.to_finemapping_file_path = ""
52
+ self.plink_log = ""
46
53
 
47
54
  self.log.write( "Start to create SumstatsPair object..." )
48
55
 
56
+ self.log.write( " -Checking sumstats 1..." , verbose=verbose)
57
+ check_datatype(sumstatsObject1.data, log=self.log, verbose=verbose)
58
+ check_dataframe_shape(sumstats=sumstatsObject1.data,
59
+ log=self.log,
60
+ verbose=verbose)
61
+
62
+ self.log.write( " -Checking sumstats 2..." , verbose=verbose)
63
+ check_datatype(sumstatsObject2.data, log=self.log, verbose=verbose)
64
+ check_dataframe_shape(sumstats=sumstatsObject2.data,
65
+ log=self.log,
66
+ verbose=verbose)
67
+
49
68
  for i in sumstatsObject1.data.columns:
50
69
  if i in ["SNPID","rsID","CHR","POS","EA","NEA","STATUS"]:
51
70
  self.snp_info_cols.append(i)
52
- elif i in ["BETA","SE","P","MLOG10P","N","Z","OR","OR95L","OR95U","MAF","EAF"]:
71
+ elif i in ["BETA","SE","P","MLOG10P","N","N_CASE","N_CONTROL","Z","T","F","OR","OR_95L","OR_95U","HR","HR_95L","HR_95U","MAF","EAF","BETA_95L","BETA_95U"]:
53
72
  self.stats_cols.append(i)
54
73
  else:
55
74
  self.other_cols.append(i)
56
-
57
- self.data = sumstatsObject1.data.loc[:,self.snp_info_cols + self.stats_cols]
58
-
75
+ for i in sumstatsObject2.data.columns:
76
+ if i in ["SNPID","rsID","CHR","POS","EA","NEA","STATUS"]:
77
+ continue
78
+ elif i in ["BETA","SE","P","MLOG10P","N","N_CASE","N_CONTROL","Z","T","F","OR","OR_95L","OR_95U","HR","HR_95L","HR_95U","MAF","EAF","BETA_95L","BETA_95U"]:
79
+ self.stats_cols2.append(i)
80
+ else:
81
+ self.other_cols2.append(i)
82
+
83
+ self.log.write( " -Variant Info columns: {}".format(self.snp_info_cols) , verbose=verbose)
84
+ self.log.write( " -Variant statistics columns: {}".format(self.stats_cols) , verbose=verbose)
85
+ self.log.write( " -Sumstats1 other columns: {}".format(self.other_cols) , verbose=verbose)
86
+ self.log.write( " -Sumstats2 other columns: {}".format(self.other_cols2) , verbose=verbose)
87
+
88
+ # extract only info and stats cols
89
+ self.data = sumstatsObject1.data
90
+
91
+ #rename with _1
59
92
  self.data = self.data.rename(columns={"EA":"EA_1","NEA":"NEA_1"})
60
-
61
93
  self.data = self.data.rename(columns={i:i + suffixes[0] for i in self.stats_cols})
94
+ self.data = self.data.rename(columns={i:i + suffixes[0] for i in self.other_cols})
62
95
 
63
96
  self.data, self.sumstats1 = self._merge_two_sumstats(sumstatsObject2, suffixes=suffixes)
64
97
 
65
- self.to_finemapping_file_path = ""
66
- self.plink_log = ""
67
-
68
98
  if "N{}".format(self.suffixes[0]) in self.data.columns and "N{}".format(self.suffixes[1]) in self.data.columns:
69
99
  n1 = int(floor(self.data["N{}".format(self.suffixes[0])].mean()))
70
100
  n2 = int(floor(self.data["N{}".format(self.suffixes[1])].mean()))
@@ -74,8 +104,9 @@ class SumstatsPair( ):
74
104
 
75
105
  def _merge_two_sumstats(self, sumstatsObject2, threshold=0.2, verbose=True,windowsizeb=10, ref_path=None,suffixes=("_1","_2")):
76
106
 
77
- molded_sumstats, sumstats1 = _merge_mold_with_sumstats(self.data,
78
- sumstatsObject2.data,
107
+ # sumstats1 with suffix _1, sumstats2 with no suffix
108
+ molded_sumstats, sumstats1 = _merge_mold_with_sumstats_by_chrpos(mold=self.data,
109
+ sumstats=sumstatsObject2.data,
79
110
  log=self.log,
80
111
  verbose=verbose,
81
112
  suffixes=(suffixes[0],""),
@@ -83,16 +114,21 @@ class SumstatsPair( ):
83
114
 
84
115
  molded_sumstats = _align_with_mold(molded_sumstats, log=self.log, verbose=verbose,suffixes=(suffixes[0],""))
85
116
 
117
+ # flip sumstats2 statistics
86
118
  molded_sumstats = flipallelestats(molded_sumstats, log=self.log, verbose=verbose)
87
119
 
120
+ # drop sumstats2 EA NEA
88
121
  molded_sumstats = molded_sumstats.drop(columns=["EA","NEA"])
122
+
123
+ # rename sumstats1 EA NEA
89
124
  molded_sumstats = molded_sumstats.rename(columns={"EA_1":"EA","NEA_1":"NEA"})
90
125
 
91
- if not len(set(self.stats_cols) & set (sumstatsObject2.data.columns)) == len(self.stats_cols):
92
- cols_to_fill = set(self.stats_cols).difference(set(sumstatsObject2.data.columns))
126
+ if not set(self.stats_cols2) == set(self.stats_cols):
127
+ cols_to_fill = set(self.stats_cols).difference(set(self.stats_cols2))
93
128
  molded_sumstats = _fill_missing_columns(molded_sumstats, cols_to_fill, log=self.log, verbose=verbose)
94
129
 
95
- molded_sumstats = _renaming_cols(molded_sumstats, self.stats_cols, log=self.log, verbose=verbose, suffixes=suffixes)
130
+ # rename sumstast2 with _2
131
+ molded_sumstats = _renaming_cols(molded_sumstats, self.stats_cols + self.other_cols2, log=self.log, verbose=verbose, suffixes=suffixes)
96
132
 
97
133
  molded_sumstats = _sort_pair_cols(molded_sumstats, verbose=verbose, log=self.log)
98
134
 
@@ -108,13 +144,7 @@ class SumstatsPair( ):
108
144
  def run_coloc_susie(self,**args):
109
145
 
110
146
  self.colocalization = _run_coloc_susie(self.to_finemapping_file_path,log=self.log,ncols=self.ns,**args)
111
-
112
- def plot_miami(self,**args):
113
147
 
114
- plot_miami2(merged_sumstats=self.data,
115
- suffixes=self.suffixes,
116
- **args)
117
-
118
148
  def run_two_sample_mr(self, clump=False, **args):
119
149
  exposure1 = self.study_name.split("_")[0]
120
150
  outcome2 = self.study_name.split("_")[1]
@@ -130,4 +160,21 @@ class SumstatsPair( ):
130
160
  return new_Sumstats_object
131
161
  else:
132
162
  self.data = filtervalues(self.data, expr,log=self.log,**args)
133
- gc.collect()
163
+ gc.collect()
164
+
165
+ ## Visualization #############################################################################################################################################
166
+ def plot_miami(self,**args):
167
+
168
+ plot_miami2(merged_sumstats=self.data,
169
+ suffixes=self.suffixes,
170
+ **args)
171
+
172
+ def compare_af(self, **args):
173
+
174
+ return plotdaf( self.data,
175
+ eaf="EAF_2",
176
+ raf="EAF_1",
177
+ xlabel="Effect Allele Frequency in Sumstats 1",
178
+ ylabel="Effect Allele Frequency in Sumstats 2",
179
+ **args)
180
+
gwaslab/g_SumstatsT.py CHANGED
@@ -1,7 +1,7 @@
1
1
  import pandas as pd
2
2
  import numpy as np
3
3
  from gwaslab.g_Sumstats import Sumstats
4
- from gwaslab.hm_casting import _merge_mold_with_sumstats
4
+ from gwaslab.hm_casting import _merge_mold_with_sumstats_by_chrpos
5
5
  from gwaslab.hm_casting import _align_with_mold
6
6
  from gwaslab.hm_casting import _fill_missing_columns
7
7
  from gwaslab.hm_casting import _check_daf
@@ -34,7 +34,7 @@ class SumstatsT( ):
34
34
 
35
35
  def cast(self, sumstatsObject, threshold=0.2, verbose=True,windowsizeb=10, ref_path=None):
36
36
 
37
- molded_sumstats = _merge_mold_with_sumstats(self.snp_info, sumstatsObject.data, log=sumstatsObject.log, verbose=verbose, windowsizeb=windowsizeb,ref_path=ref_path)
37
+ molded_sumstats = _merge_mold_with_sumstats_by_chrpos(self.snp_info, sumstatsObject.data, log=sumstatsObject.log, verbose=verbose, windowsizeb=windowsizeb,ref_path=ref_path)
38
38
 
39
39
  molded_sumstats = _align_with_mold(molded_sumstats, log=sumstatsObject.log, verbose=verbose)
40
40