gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (57)
  1. gwaslab/bd_common_data.py +6 -3
  2. gwaslab/bd_download.py +9 -9
  3. gwaslab/bd_get_hapmap3.py +43 -9
  4. gwaslab/data/formatbook.json +722 -721
  5. gwaslab/g_Log.py +22 -5
  6. gwaslab/g_Sumstats.py +110 -163
  7. gwaslab/g_SumstatsPair.py +76 -25
  8. gwaslab/g_SumstatsT.py +2 -2
  9. gwaslab/g_Sumstats_summary.py +3 -3
  10. gwaslab/g_version.py +10 -10
  11. gwaslab/hm_casting.py +36 -17
  12. gwaslab/hm_harmonize_sumstats.py +354 -221
  13. gwaslab/hm_rsid_to_chrpos.py +1 -1
  14. gwaslab/io_preformat_input.py +49 -43
  15. gwaslab/io_read_ldsc.py +49 -1
  16. gwaslab/io_to_formats.py +428 -295
  17. gwaslab/ldsc_irwls.py +198 -0
  18. gwaslab/ldsc_jackknife.py +514 -0
  19. gwaslab/ldsc_ldscore.py +417 -0
  20. gwaslab/ldsc_parse.py +294 -0
  21. gwaslab/ldsc_regressions.py +747 -0
  22. gwaslab/ldsc_sumstats.py +629 -0
  23. gwaslab/qc_check_datatype.py +3 -3
  24. gwaslab/qc_fix_sumstats.py +891 -778
  25. gwaslab/util_ex_calculate_ldmatrix.py +31 -13
  26. gwaslab/util_ex_gwascatalog.py +25 -25
  27. gwaslab/util_ex_ldproxyfinder.py +10 -10
  28. gwaslab/util_ex_ldsc.py +189 -0
  29. gwaslab/util_ex_process_ref.py +3 -3
  30. gwaslab/util_ex_run_coloc.py +26 -4
  31. gwaslab/util_in_calculate_gc.py +6 -6
  32. gwaslab/util_in_calculate_power.py +42 -43
  33. gwaslab/util_in_convert_h2.py +8 -8
  34. gwaslab/util_in_fill_data.py +30 -30
  35. gwaslab/util_in_filter_value.py +201 -74
  36. gwaslab/util_in_get_density.py +10 -10
  37. gwaslab/util_in_get_sig.py +445 -71
  38. gwaslab/viz_aux_annotate_plot.py +12 -12
  39. gwaslab/viz_aux_quickfix.py +42 -37
  40. gwaslab/viz_aux_reposition_text.py +10 -7
  41. gwaslab/viz_aux_save_figure.py +18 -8
  42. gwaslab/viz_plot_compare_af.py +32 -33
  43. gwaslab/viz_plot_compare_effect.py +63 -71
  44. gwaslab/viz_plot_miamiplot2.py +34 -26
  45. gwaslab/viz_plot_mqqplot.py +126 -75
  46. gwaslab/viz_plot_qqplot.py +11 -8
  47. gwaslab/viz_plot_regionalplot.py +36 -33
  48. gwaslab/viz_plot_rg_heatmap.py +28 -26
  49. gwaslab/viz_plot_stackedregional.py +40 -21
  50. gwaslab/viz_plot_trumpetplot.py +65 -61
  51. gwaslab-3.4.39.dist-info/LICENSE +674 -0
  52. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
  53. gwaslab-3.4.39.dist-info/RECORD +80 -0
  54. gwaslab-3.4.37.dist-info/RECORD +0 -72
  55. /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
  56. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
  57. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
gwaslab/bd_common_data.py CHANGED
@@ -280,17 +280,20 @@ def gtf_to_protein_coding(gtfpath,log=Log(),verbose=True):
280
280
  protein_coding_path = gtfpath[:-6]+"protein_coding.gtf.gz"
281
281
  # if not existing, extract protein coding records and output to a new file
282
282
  if not path.isfile(protein_coding_path):
283
+
283
284
  # get gene list
284
- if verbose: log.write(" - Extracting protein_coding genes from {}".format(gtfpath))
285
+ log.write(" - Extracting protein_coding genes from {}".format(gtfpath),verbose=verbose)
285
286
  gtf = read_gtf(gtfpath,usecols=["feature","gene_biotype","gene_id","gene_name"])
286
287
  gene_list = gtf.loc[(gtf["feature"]=="gene") & (gtf["gene_biotype"]=="protein_coding"),"gene_id"].values
287
- if verbose: log.write(" - Loaded {} protein_coding genes.".format(len(gene_list)))
288
+ log.write(" - Loaded {} protein_coding genes.".format(len(gene_list)),verbose=verbose)
289
+
288
290
  # extract entry using csv
289
291
  gtf_raw = pd.read_csv(gtfpath,sep="\t",header=None,comment="#",dtype="string")
290
292
  gtf_raw["_gene_id"] = gtf_raw[8].str.extract(r'gene_id "([\w\.-]+)"')
291
293
  gtf_raw = gtf_raw.loc[ gtf_raw["_gene_id"].isin(gene_list) ,:]
292
294
  gtf_raw = gtf_raw.drop("_gene_id",axis=1)
293
- if verbose: log.write(" - Extracted records are saved to : {} ".format(protein_coding_path))
295
+
296
+ log.write(" - Extracted records are saved to : {} ".format(protein_coding_path),verbose=verbose)
294
297
  gtf_raw.to_csv(protein_coding_path, header=None, index=None, sep="\t")
295
298
 
296
299
  return protein_coding_path
gwaslab/bd_download.py CHANGED
@@ -106,7 +106,7 @@ def check_available_ref(log=Log(),verbose=True):
106
106
  Check available reference files for gwaslab.
107
107
  Return a dictionary of available reference files.
108
108
  '''
109
- if verbose : log.write("Start to check available reference files...")
109
+ log.write("Start to check available reference files...", verbose=verbose)
110
110
  #ref_path = path.dirname(__file__) + '/data/reference.json'
111
111
  ref_path = options.paths["reference"]
112
112
  if not path.exists(ref_path):
@@ -115,11 +115,11 @@ def check_available_ref(log=Log(),verbose=True):
115
115
  dicts = json.load(open(ref_path))
116
116
  if dicts is not None:
117
117
  for key,value in dicts.items():
118
- if verbose :log.write(" -",key," : ",value)
118
+ log.write(" -",key," : ",value, verbose=verbose)
119
119
  return dicts
120
120
  else:
121
- if verbose :log.write(" -No available reference files.")
122
- if verbose :log.write("Finished checking available reference files...")
121
+ log.write(" -No available reference files.", verbose=verbose)
122
+ log.write("Finished checking available reference files...", verbose=verbose)
123
123
  return {}
124
124
 
125
125
  def update_available_ref(log=Log()):
@@ -167,8 +167,8 @@ def get_path(name,log=Log(),verbose=True):
167
167
  #config_path = path.dirname(__file__) + '/data/config.json'
168
168
  config_path = options.paths["config"]
169
169
  if not path.exists(config_path):
170
- if verbose : log.write("Config file not exists...")
171
- if verbose : log.write("Created new config file...")
170
+ log.write("Config file not exists...", verbose=verbose)
171
+ log.write("Created new config file...", verbose=verbose)
172
172
  initiate_config()
173
173
  else:
174
174
  try:
@@ -176,9 +176,9 @@ def get_path(name,log=Log(),verbose=True):
176
176
  if path.exists(dicts[name]):
177
177
  return dicts[name]
178
178
  else:
179
- if verbose : log.write("File not exist.")
179
+ log.write("File not exist.", verbose=verbose)
180
180
  except:
181
- if verbose : log.write("No records in config file. Please download first.")
181
+ log.write("No records in config file. Please download first.", verbose=verbose)
182
182
  return False
183
183
 
184
184
  ##################################################################################
@@ -277,7 +277,7 @@ def check_file_integrity(local_path, md5sum,log):
277
277
  log.write(" -MD5 verified.")
278
278
  return 1
279
279
  else:
280
- log.write("WARNING: -MD5 VERIFICATION FAILED !")
280
+ log.warning("-MD5 VERIFICATION FAILED!")
281
281
  return 0
282
282
 
283
283
  def remove_file(name,log=Log()):
gwaslab/bd_get_hapmap3.py CHANGED
@@ -1,6 +1,10 @@
1
1
  import pandas as pd
2
2
  from os import path
3
3
  from gwaslab.g_Log import Log
4
+ from gwaslab.qc_fix_sumstats import start_to
5
+ from gwaslab.qc_fix_sumstats import skipped
6
+ from gwaslab.qc_fix_sumstats import finished
7
+
4
8
  #A unique identifier (e.g., the rs number)
5
9
  #Allele 1 (effect allele)
6
10
  #Allele 2 (non-effect allele)
@@ -8,30 +12,60 @@ from gwaslab.g_Log import Log
8
12
  #A P-value
9
13
  #A signed summary statistic (beta, OR, log odds, Z-score, etc)
10
14
 
11
- def gethapmap3(sumstats,rsid="rsID",chrom="CHR", pos="POS", ea="EA", nea="NEA",build="19", verbose=True,log=Log()):
12
- if verbose:log.write(" -Processing "+str(len(sumstats))+" raw variants...")
15
+ def gethapmap3(sumstats,rsid="rsID",chrom="CHR", pos="POS", ea="EA", nea="NEA",build="19", verbose=True, match_allele= True, log=Log()):
16
+ ##start function with col checking##########################################################
17
+ _start_line = "extract HapMap3 SNPs"
18
+ _end_line = "extracting HapMap3 SNPs"
19
+ _start_cols =[]
20
+ _start_function = ".gethapmap3"
21
+ _must_args ={}
22
+
23
+ is_enough_info = start_to(sumstats=sumstats,
24
+ log=log,
25
+ verbose=verbose,
26
+ start_line=_start_line,
27
+ end_line=_end_line,
28
+ start_cols=_start_cols,
29
+ start_function=_start_function,
30
+ **_must_args)
31
+ if is_enough_info == False: return None
13
32
 
33
+ ############################################################################################
14
34
  if build=="19":
15
35
  data_path = path.dirname(__file__) + '/data/hapmap3_SNPs/hapmap3_db150_hg19.snplist.gz'
16
36
  elif build=="38":
17
37
  data_path = path.dirname(__file__) + '/data/hapmap3_SNPs/hapmap3_db151_hg38.snplist.gz'
18
38
 
19
- if verbose:log.write(" -Loading Hapmap3 variants data...")
20
-
21
- hapmap3_ref = pd.read_csv(data_path,sep="\s+",usecols=["#CHROM","POS","rsid"],dtype={"#CHROM":"string","POS":"string"})
39
+ log.write(" -Loading Hapmap3 variants from built-in datasets...", verbose=verbose)
40
+
41
+ if match_allele:
42
+ additional_cols= ["A1","A2"]
43
+ else:
44
+ additional_cols=[]
45
+ hapmap3_ref = pd.read_csv(data_path,sep="\s+",usecols=["#CHROM","POS","rsid"]+additional_cols, dtype={"#CHROM":"string","POS":"string"})
46
+
22
47
  #rsid A1 A2 #CHROM POS
23
48
  #rs3094315 G A 1 752566
49
+
24
50
  if rsid in sumstats.columns:
25
51
  output = sumstats.loc[sumstats[rsid].isin(hapmap3_ref["rsid"].values),:].copy()
26
52
  return output
53
+
27
54
  elif chrom in sumstats.columns and pos in sumstats.columns:
28
- if verbose: log.write(" -Since rsID not in sumstats, chr:pos( build "+build+") will be used for matching...")
55
+ log.write(" -Since rsID not in sumstats, CHR:POS( build "+build+") will be used for matching...", verbose=verbose)
29
56
  sumstats ["chr:pos"] = sumstats[chrom].astype("string")+":"+sumstats[pos].astype("string")
30
57
  hapmap3_ref["chr:pos"] = hapmap3_ref["#CHROM"]+":"+hapmap3_ref["POS"]
31
58
  hapmap3_ref = hapmap3_ref.rename(columns={"rsid":"rsID"})
32
- output = pd.merge(sumstats,hapmap3_ref.loc[:,["chr:pos","rsID"]],left_on="chr:pos",right_on="chr:pos",how="inner",suffixes=('', '_hapmap3')).copy()
33
- output = output.drop(columns="chr:pos")
34
- if verbose: log.write(" -Raw input contains "+str(len(output))+" hapmaps variants based on chr:pos...")
59
+ output = pd.merge(sumstats,hapmap3_ref.loc[:,["chr:pos","rsID"]+additional_cols],left_on="chr:pos",right_on="chr:pos",how="inner",suffixes=('', '_hapmap3')).copy()
60
+ if match_allele:
61
+ log.write(" -Checking if alleles are same...")
62
+ is_matched = ((output[ea].astype("string") == output["A1"]) & (output[nea].astype("string") == output["A2"])) \
63
+ | ((output[ea].astype("string") == output["A2"]) & (output[nea].astype("string") == output["A1"]))
64
+ log.write(" -Variants with macthed alleles: {}".format(sum(is_matched)))
65
+ output = output.loc[is_matched,:]
66
+ output = output.drop(columns=["chr:pos"]+additional_cols)
67
+ log.write(" -Raw input contains "+str(len(output))+" Hapmap3 variants based on CHR:POS...", verbose=verbose)
68
+ finished(log=log,verbose=verbose,end_line=_end_line)
35
69
  return output
36
70
  else:
37
71
  raise ValueError("Not enough information to match SNPs. Please check your sumstats...")