gwaslab-3.4.37-py3-none-any.whl → gwaslab-3.4.39-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (57)
  1. gwaslab/bd_common_data.py +6 -3
  2. gwaslab/bd_download.py +9 -9
  3. gwaslab/bd_get_hapmap3.py +43 -9
  4. gwaslab/data/formatbook.json +722 -721
  5. gwaslab/g_Log.py +22 -5
  6. gwaslab/g_Sumstats.py +110 -163
  7. gwaslab/g_SumstatsPair.py +76 -25
  8. gwaslab/g_SumstatsT.py +2 -2
  9. gwaslab/g_Sumstats_summary.py +3 -3
  10. gwaslab/g_version.py +10 -10
  11. gwaslab/hm_casting.py +36 -17
  12. gwaslab/hm_harmonize_sumstats.py +354 -221
  13. gwaslab/hm_rsid_to_chrpos.py +1 -1
  14. gwaslab/io_preformat_input.py +49 -43
  15. gwaslab/io_read_ldsc.py +49 -1
  16. gwaslab/io_to_formats.py +428 -295
  17. gwaslab/ldsc_irwls.py +198 -0
  18. gwaslab/ldsc_jackknife.py +514 -0
  19. gwaslab/ldsc_ldscore.py +417 -0
  20. gwaslab/ldsc_parse.py +294 -0
  21. gwaslab/ldsc_regressions.py +747 -0
  22. gwaslab/ldsc_sumstats.py +629 -0
  23. gwaslab/qc_check_datatype.py +3 -3
  24. gwaslab/qc_fix_sumstats.py +891 -778
  25. gwaslab/util_ex_calculate_ldmatrix.py +31 -13
  26. gwaslab/util_ex_gwascatalog.py +25 -25
  27. gwaslab/util_ex_ldproxyfinder.py +10 -10
  28. gwaslab/util_ex_ldsc.py +189 -0
  29. gwaslab/util_ex_process_ref.py +3 -3
  30. gwaslab/util_ex_run_coloc.py +26 -4
  31. gwaslab/util_in_calculate_gc.py +6 -6
  32. gwaslab/util_in_calculate_power.py +42 -43
  33. gwaslab/util_in_convert_h2.py +8 -8
  34. gwaslab/util_in_fill_data.py +30 -30
  35. gwaslab/util_in_filter_value.py +201 -74
  36. gwaslab/util_in_get_density.py +10 -10
  37. gwaslab/util_in_get_sig.py +445 -71
  38. gwaslab/viz_aux_annotate_plot.py +12 -12
  39. gwaslab/viz_aux_quickfix.py +42 -37
  40. gwaslab/viz_aux_reposition_text.py +10 -7
  41. gwaslab/viz_aux_save_figure.py +18 -8
  42. gwaslab/viz_plot_compare_af.py +32 -33
  43. gwaslab/viz_plot_compare_effect.py +63 -71
  44. gwaslab/viz_plot_miamiplot2.py +34 -26
  45. gwaslab/viz_plot_mqqplot.py +126 -75
  46. gwaslab/viz_plot_qqplot.py +11 -8
  47. gwaslab/viz_plot_regionalplot.py +36 -33
  48. gwaslab/viz_plot_rg_heatmap.py +28 -26
  49. gwaslab/viz_plot_stackedregional.py +40 -21
  50. gwaslab/viz_plot_trumpetplot.py +65 -61
  51. gwaslab-3.4.39.dist-info/LICENSE +674 -0
  52. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
  53. gwaslab-3.4.39.dist-info/RECORD +80 -0
  54. gwaslab-3.4.37.dist-info/RECORD +0 -72
  55. /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
  56. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
  57. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
@@ -15,7 +15,7 @@ def summarize(insumstats,
15
15
  for i in [snpid,rsid,eaf,p,n,status]:
16
16
  if i in insumstats.columns:
17
17
  cols.append(i)
18
- sumstats= insumstats.loc[:,cols].copy()
18
+ sumstats= insumstats[cols].copy()
19
19
  ###############################################################################
20
20
  numeric_cols=[]
21
21
  output = {}
@@ -68,7 +68,7 @@ def summarize(insumstats,
68
68
  sumstats.drop(columns='uniq_index',inplace=True)
69
69
  status_dic = {}
70
70
  for index,row in status_summary.iterrows():
71
- status_dic[str(index)]=row[0]
71
+ status_dic[str(index)]=row.iloc[0]
72
72
  output["STATUS"]=status_dic
73
73
  numeric_cols.append("STATUS")
74
74
  df = pd.DataFrame.from_dict({(i,j): output[i][j]
@@ -84,7 +84,7 @@ def summarize(insumstats,
84
84
  return df
85
85
 
86
86
  def sum_status(id_to_use, sumstats):
87
- results = sumstats.groupby("STATUS").count()
87
+ results = sumstats.groupby("STATUS",observed=True).count()
88
88
  results = results.loc[results[id_to_use]>0,:].sort_values(id_to_use,ascending=False)
89
89
  return results
90
90
 
gwaslab/g_version.py CHANGED
@@ -3,10 +3,10 @@ import subprocess
3
3
  import os
4
4
  import numpy as np
5
5
 
6
- def _show_version(log=Log()):
6
+ def _show_version(log=Log(), verbose=True):
7
7
  # show version when loading sumstats
8
- log.write("GWASLab v{} https://cloufield.github.io/gwaslab/".format(gwaslab_info()["version"]))
9
- log.write("(C) 2022-2024, Yunye He, Kamatani Lab, MIT License, gwaslab@gmail.com")
8
+ log.write("GWASLab v{} https://cloufield.github.io/gwaslab/".format(gwaslab_info()["version"]),verbose=verbose)
9
+ log.write("(C) 2022-2024, Yunye He, Kamatani Lab, MIT License, gwaslab@gmail.com",verbose=verbose)
10
10
 
11
11
  def _get_version():
12
12
  # return short version string like v3.4.33
@@ -15,12 +15,12 @@ def _get_version():
15
15
  def gwaslab_info():
16
16
  # version meta information
17
17
  dic={
18
- "version":"3.4.37",
19
- "release_date":"20240129"
18
+ "version":"3.4.39",
19
+ "release_date":"20240210"
20
20
  }
21
21
  return dic
22
22
 
23
- def _checking_plink_version(v=2,log=Log()):
23
+ def _checking_plink_version(v=2,log=Log(), verbose=True):
24
24
  if v==1:
25
25
  which_plink_script = "plink --version"
26
26
  elif v==2:
@@ -29,19 +29,19 @@ def _checking_plink_version(v=2,log=Log()):
29
29
  log.write(" -PLINK version: {}".format(output.strip()))
30
30
  return log
31
31
 
32
- def _checking_r_version(r, log):
32
+ def _checking_r_version(r, log=Log(), verbose=True):
33
33
  which_r_script = "{} --version".format(r)
34
34
  output = subprocess.check_output(which_r_script, stderr=subprocess.STDOUT, shell=True,text=True)
35
- log.write(" -R version: {}".format(output.strip()))
35
+ log.write(" -R version: {}".format(output.strip()),verbose=verbose)
36
36
  return log
37
37
 
38
- def _check_susie_version(r,log):
38
+ def _check_susie_version(r,log=Log(), verbose=True):
39
39
  rscript = 'print(packageVersion("susieR"))'
40
40
  temp_r = "_gwaslab_susie_temp_check_version_{}.R".format(np.random.randint(1, 99999999))
41
41
  with open(temp_r,"w") as file:
42
42
  file.write(rscript)
43
43
  which_susie_script = "{} {}".format(r, temp_r)
44
44
  output = subprocess.check_output(which_susie_script, stderr=subprocess.STDOUT, shell=True,text=True)
45
- log.write(" -SuSieR version: {}".format(output.strip()))
45
+ log.write(" -SuSieR version: {}".format(output.strip()),verbose=verbose)
46
46
  os.remove(temp_r)
47
47
  return log
gwaslab/hm_casting.py CHANGED
@@ -5,18 +5,24 @@ from pandas.api.types import CategoricalDtype
5
5
  from gwaslab.g_vchange_status import copy_status
6
6
  from gwaslab.g_vchange_status import vchange_status
7
7
  from gwaslab.qc_fix_sumstats import flipallelestats
8
+ from gwaslab.qc_check_datatype import check_datatype
9
+ from gwaslab.qc_fix_sumstats import start_to
8
10
  from gwaslab.util_in_fill_data import filldata
9
11
  from Bio import SeqIO
10
12
  from itertools import combinations
11
13
 
12
- def _merge_mold_with_sumstats(mold, sumstats, ref_path=None, windowsizeb=10, log=Log(),suffixes=("_MOLD",""),verbose=True,return_not_matched_mold =False):
14
+ def _merge_mold_with_sumstats_by_chrpos(mold, sumstats, ref_path=None, windowsizeb=10, log=Log(),suffixes=("_MOLD",""),verbose=True,return_not_matched_mold =False):
15
+
16
+
13
17
  cols_to_drop = []
14
18
  for i in sumstats.columns:
15
19
  if i in ["SNPID","rsID"]:
16
20
  cols_to_drop.append(i)
21
+
22
+ log.write("Start to merge sumstats...", verbose=verbose)
17
23
 
18
24
  if len(cols_to_drop)>0:
19
- log.write("Dropping old IDs:{}".format(cols_to_drop))
25
+ log.write(" -Dropping old IDs:{}".format(cols_to_drop), verbose=verbose)
20
26
  sumstats = sumstats.drop(columns=cols_to_drop)
21
27
 
22
28
  if ref_path is not None :
@@ -29,18 +35,20 @@ def _merge_mold_with_sumstats(mold, sumstats, ref_path=None, windowsizeb=10, log
29
35
  if return_not_matched_mold:
30
36
  mold["_IDENTIFIER_FOR_VARIANT"] = range(len(mold))
31
37
 
38
+ # mold sumffix + mold
32
39
  mold_sumstats = pd.merge(mold, sumstats, on=["CHR","POS"], how="inner",suffixes=suffixes)
33
- log.write("After merging by CHR and POS:{}".format(len(mold_sumstats)))
40
+ log.write(" -After merging by CHR and POS:{}".format(len(mold_sumstats)), verbose=verbose)
34
41
 
35
42
  mold_sumstats = _keep_variants_with_same_allele_set(mold_sumstats,suffixes=suffixes)
36
- log.write("Matched variants:{}".format(len(mold_sumstats)))
43
+
44
+ log.write(" -Matched variants:{}".format(len(mold_sumstats)), verbose=verbose)
37
45
 
38
- if ref_path is not None:
39
- # match removed sumstats
40
- mold_removed = mold.loc[~mold[index1].isin(mold_sumstats[index1]),:]
41
- iron_removed = sumstats.loc[~sumstats[index2].isin(mold_sumstats[index2]),:]
42
- _match_two_sumstats(mold_removed,iron_removed,ref_path,windowsizeb=windowsizeb)
43
- mold_sumstats.drop(columns=["_INDEX",""])
46
+ #if ref_path is not None:
47
+ # # match removed sumstats
48
+ # mold_removed = mold.loc[~mold[index1].isin(mold_sumstats[index1]),:]
49
+ # iron_removed = sumstats.loc[~sumstats[index2].isin(mold_sumstats[index2]),:]
50
+ # _match_two_sumstats(mold_removed,iron_removed,ref_path,windowsizeb=windowsizeb)
51
+ # mold_sumstats.drop(columns=["_INDEX",""])
44
52
 
45
53
  if return_not_matched_mold == True:
46
54
  sumstats1 = mold.loc[~mold["_IDENTIFIER_FOR_VARIANT"].isin(mold_sumstats["_IDENTIFIER_FOR_VARIANT"]),:]
@@ -59,14 +67,17 @@ def _keep_variants_with_same_allele_set(sumstats, log=Log(),verbose=True,suffixe
59
67
 
60
68
  all_alleles = set(list(sumstats[ea1].unique())+list(sumstats[nea1].unique())+list(sumstats[ea2].unique())+list(sumstats[nea2].unique()))
61
69
  allele_type = CategoricalDtype(categories=all_alleles, ordered=False)
62
- sumstats.loc[:, [nea1,ea1,nea2,ea2]] = sumstats.loc[:, [nea1,ea1,nea2,ea2]].astype(allele_type)
70
+ sumstats[[nea1,ea1,nea2,ea2]] = sumstats[[nea1,ea1,nea2,ea2]].astype(allele_type)
63
71
 
64
72
  is_perfect_match = (sumstats[ea2] == sumstats[ea1]) & (sumstats[nea2] == sumstats[nea1])
65
73
  is_flipped_match = (sumstats[ea2] == sumstats[nea1]) & (sumstats[nea2] == sumstats[ea1])
66
74
  is_allele_set_match = is_flipped_match | is_perfect_match
67
75
 
68
- sumstats.loc[~is_allele_set_match,:]
69
-
76
+ log.write(" -Matching alleles and keeping only variants with same allele set: ", verbose=verbose)
77
+ log.write(" -Perfect match: {}".format(sum(is_perfect_match)), verbose=verbose)
78
+ log.write(" -Flipped match: {}".format(sum(is_flipped_match)), verbose=verbose)
79
+ log.write(" -Unmatched : {}".format(sum(~is_allele_set_match)), verbose=verbose)
80
+
70
81
  return sumstats.loc[is_allele_set_match,:]
71
82
 
72
83
  def _align_with_mold(sumstats, log=Log(),verbose=True, suffixes=("_MOLD","")):
@@ -77,10 +88,18 @@ def _align_with_mold(sumstats, log=Log(),verbose=True, suffixes=("_MOLD","")):
77
88
  nea2="NEA"+suffixes[1]
78
89
  status1="STATUS"+suffixes[0]
79
90
  status2="STATUS"+suffixes[1]
91
+
80
92
  is_perfect_match = (sumstats[ea2] == sumstats[ea1]) & (sumstats[nea2] == sumstats[nea1])
81
93
  is_flipped_match = (sumstats[ea2] == sumstats[nea1]) & (sumstats[nea2] == sumstats[ea1])
82
94
 
95
+ log.write(" -Aligning alleles with reference: ", verbose=verbose)
96
+ log.write(" -Perfect match: {}".format(sum(is_perfect_match)), verbose=verbose)
97
+ log.write(" -Flipped match: {}".format(sum(is_flipped_match)), verbose=verbose)
98
+
99
+ log.write(" -For perfect match: copy STATUS from reference...", verbose=verbose)
83
100
  sumstats.loc[is_perfect_match,status2] = copy_status(sumstats.loc[is_perfect_match,status1], sumstats.loc[is_perfect_match,status2],6)
101
+
102
+ log.write(" -For Flipped match: convert STATUS xxxxx[456789]x to xxxxx3x...", verbose=verbose)
84
103
  sumstats.loc[is_flipped_match,status2] = vchange_status(sumstats.loc[is_flipped_match,status2],6,"456789","333333")
85
104
 
86
105
  return sumstats
@@ -119,9 +138,9 @@ def _sort_pair_cols(molded_sumstats, verbose=True, log=Log(), order=None, stats_
119
138
  if i not in order:
120
139
  output_columns.append(i)
121
140
 
122
- if verbose: log.write(" -Reordering columns to :", ",".join(output_columns))
123
- molded_sumstats = molded_sumstats.loc[:, output_columns]
124
- if verbose: log.write("Finished sorting columns successfully!")
141
+ log.write(" -Reordering columns to :", ",".join(output_columns), verbose=verbose)
142
+ molded_sumstats = molded_sumstats[ output_columns]
143
+ log.write("Finished sorting columns successfully!", verbose=verbose)
125
144
 
126
145
  return molded_sumstats
127
146
 
@@ -154,7 +173,7 @@ def _match_two_sumstats(mold,sumstats,ref_path,windowsizeb=25,verbose=True,log=L
154
173
  record_chr = int(str(record.id).strip("chrCHR").upper())
155
174
 
156
175
  if record_chr in chromlist:
157
- if verbose: log.write(record_chr," ", end="",show_time=False)
176
+ log.write(record_chr," ", end="",show_time=False,verbose=verbose)
158
177
  chromlist.remove(record_chr)
159
178
  else:
160
179
  continue