gwaslab-3.4.38-py3-none-any.whl → gwaslab-3.4.39-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. (The source page linked to more details here; the link target was not preserved in this text.)

Files changed (51)
  1. gwaslab/bd_common_data.py +6 -3
  2. gwaslab/bd_download.py +9 -9
  3. gwaslab/bd_get_hapmap3.py +43 -9
  4. gwaslab/g_Log.py +14 -5
  5. gwaslab/g_Sumstats.py +86 -18
  6. gwaslab/g_SumstatsPair.py +70 -23
  7. gwaslab/g_SumstatsT.py +2 -2
  8. gwaslab/g_version.py +10 -10
  9. gwaslab/hm_casting.py +9 -4
  10. gwaslab/hm_harmonize_sumstats.py +88 -83
  11. gwaslab/io_preformat_input.py +14 -14
  12. gwaslab/io_read_ldsc.py +49 -1
  13. gwaslab/ldsc_irwls.py +198 -0
  14. gwaslab/ldsc_jackknife.py +514 -0
  15. gwaslab/ldsc_ldscore.py +417 -0
  16. gwaslab/ldsc_parse.py +294 -0
  17. gwaslab/ldsc_regressions.py +747 -0
  18. gwaslab/ldsc_sumstats.py +629 -0
  19. gwaslab/qc_check_datatype.py +1 -1
  20. gwaslab/qc_fix_sumstats.py +163 -161
  21. gwaslab/util_ex_calculate_ldmatrix.py +2 -2
  22. gwaslab/util_ex_gwascatalog.py +24 -24
  23. gwaslab/util_ex_ldproxyfinder.py +9 -9
  24. gwaslab/util_ex_ldsc.py +189 -0
  25. gwaslab/util_in_calculate_gc.py +6 -6
  26. gwaslab/util_in_calculate_power.py +42 -43
  27. gwaslab/util_in_convert_h2.py +8 -8
  28. gwaslab/util_in_fill_data.py +28 -28
  29. gwaslab/util_in_filter_value.py +91 -52
  30. gwaslab/util_in_get_density.py +8 -8
  31. gwaslab/util_in_get_sig.py +407 -65
  32. gwaslab/viz_aux_annotate_plot.py +12 -12
  33. gwaslab/viz_aux_quickfix.py +18 -18
  34. gwaslab/viz_aux_reposition_text.py +3 -3
  35. gwaslab/viz_aux_save_figure.py +14 -5
  36. gwaslab/viz_plot_compare_af.py +29 -30
  37. gwaslab/viz_plot_compare_effect.py +63 -71
  38. gwaslab/viz_plot_miamiplot2.py +6 -6
  39. gwaslab/viz_plot_mqqplot.py +17 -3
  40. gwaslab/viz_plot_qqplot.py +1 -1
  41. gwaslab/viz_plot_regionalplot.py +33 -32
  42. gwaslab/viz_plot_rg_heatmap.py +28 -26
  43. gwaslab/viz_plot_stackedregional.py +40 -21
  44. gwaslab/viz_plot_trumpetplot.py +50 -55
  45. gwaslab-3.4.39.dist-info/LICENSE +674 -0
  46. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/METADATA +4 -3
  47. gwaslab-3.4.39.dist-info/RECORD +80 -0
  48. gwaslab-3.4.38.dist-info/RECORD +0 -72
  49. /gwaslab-3.4.38.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
  50. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
  51. {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
File: gwaslab/util_ex_calculate_ldmatrix.py (+2 -2)
@@ -199,7 +199,7 @@ def _align_sumstats_with_bim(row, locus_sumstats, ref_bim, log=Log(),suffixes=No
199
199
  log.write(" -#Total Variants matched:{}".format(sum(allele_match)))
200
200
 
201
201
  if row["SNPID"] not in combined_df.loc[perfect_match,"SNPID"].values:
202
- log.write(" -Warning: Lead variant was not available in reference!!!!!!!!!!!!!!!")
202
+ log.warning("Lead variant was not available in reference!")
203
203
 
204
204
  # adjust statistics
205
205
  output_columns=["SNPID","CHR","POS","EA_bim","NEA_bim"]
@@ -256,4 +256,4 @@ def _check_snpid_order(snplist_path, matched_sumstats_snpid,log):
256
256
  if list(matched_sumstats_snpid) == list(snpid_list):
257
257
  log.write(" -Sumstats SNPID order and LD matrix SNPID order are matched.")
258
258
  else:
259
- log.write(" -Warning: Sumstats SNPID order and LD matrix SNPID order are not matched...")
259
+ log.warning("Sumstats SNPID order and LD matrix SNPID order are not matched!")
File: gwaslab/util_ex_gwascatalog.py (+24 -24)
@@ -9,26 +9,26 @@ def gwascatalog_trait(efo,source="NCBI",sig_level=5e-8,verbose=True,log=Log()):
9
9
  #https://www.ebi.ac.uk/gwas/rest/docs/api
10
10
 
11
11
  base_url = "https://www.ebi.ac.uk/gwas/rest/api/efoTraits/"+efo
12
- if verbose: log.write("Start to retrieve data from GWASCatalog...")
13
- if verbose: log.write(" -Please make sure your sumstats is based on GRCh38...")
14
- if verbose: log.write(" -Requesting (GET) trait information through the GWASCatalog API...")
15
- if verbose: log.write(" -EFO trait api: "+ base_url)
12
+ log.write("Start to retrieve data from GWASCatalog...", verbose=verbose)
13
+ log.write(" -Please make sure your sumstats is based on GRCh38...", verbose=verbose)
14
+ log.write(" -Requesting (GET) trait information through the GWASCatalog API...", verbose=verbose)
15
+ log.write(" -EFO trait api: "+ base_url, verbose=verbose)
16
16
  text = requests.get(base_url)
17
- if verbose:
18
- log.write(" -Status code: {}".format(text.status_code))
19
- if text.status_code!=200:
20
- log.write(" -Status code is not 200. Access failed. Please check your internet or the GWAS Catalog sever status.")
21
- log.write(" -Message:{}".format(text.text))
22
- return 0
17
+
18
+ log.write(" -Status code: {}".format(text.status_code), verbose=verbose)
19
+ if text.status_code!=200:
20
+ log.write(" -Status code is not 200. Access failed. Please check your internet or the GWAS Catalog sever status.", verbose=verbose)
21
+ log.write(" -Message:{}".format(text.text), verbose=verbose)
22
+ return 0
23
23
 
24
24
  api_response = json.loads(text.text)
25
- if verbose: log.write(" -Trait Name:",api_response["trait"])
26
- if verbose: log.write(" -Trait URL:",api_response["uri"])
25
+ log.write(" -Trait Name:",api_response["trait"], verbose=verbose)
26
+ log.write(" -Trait URL:",api_response["uri"], verbose=verbose)
27
27
 
28
28
  base_url = "https://www.ebi.ac.uk/gwas/rest/api/efoTraits/"+efo+"/associations?projection=associationByEfoTrait"
29
- if verbose: log.write(" -Requesting (GET) GWAS associations through the GWASCatalog API...")
30
- if verbose: log.write(" -associationsByTraitSummary API: "+ base_url)
31
- if verbose: log.write(" -Note: this step might take a while...")
29
+ log.write(" -Requesting (GET) GWAS associations through the GWASCatalog API...", verbose=verbose)
30
+ log.write(" -associationsByTraitSummary API: "+ base_url, verbose=verbose)
31
+ log.write(" -Note: this step might take a while...", verbose=verbose)
32
32
 
33
33
  # get request and check status code of response
34
34
  raw_data = requests.get(base_url)
@@ -37,13 +37,13 @@ def gwascatalog_trait(efo,source="NCBI",sig_level=5e-8,verbose=True,log=Log()):
37
37
  is_proceed = check_request_status_code(raw_data.status_code,verbose=verbose,log=log)
38
38
  if is_proceed is False: return False
39
39
 
40
- if verbose: log.write(" -Loading json ...")
40
+ log.write(" -Loading json ...", verbose=verbose)
41
41
  # Transform API response from JSON into Python dictionary
42
42
  api_response = json.loads(raw_data.text)
43
- if verbose: log.write(" -Parsing json ...")
43
+ log.write(" -Parsing json ...", verbose=verbose)
44
44
  # An
45
45
  records=list()
46
- if verbose: log.write(" -Number of reported associations for "+ efo +" in GWASCatalog:",len( api_response["_embedded"]["associations"]))
46
+ log.write(" -Number of reported associations for "+ efo +" in GWASCatalog:",len( api_response["_embedded"]["associations"]), verbose=verbose)
47
47
 
48
48
  for association in api_response["_embedded"]["associations"]:
49
49
  #association statistics:
@@ -126,12 +126,12 @@ def gwascatalog_trait(efo,source="NCBI",sig_level=5e-8,verbose=True,log=Log()):
126
126
  records.append(row)
127
127
  #rsid locations
128
128
  gwascatalog_lead_snps = pd.DataFrame(records,columns=["SNPID","CHR","POS","REPORT_GENENAME","CLOSEST_GENENAMES","FUNCTION_CLASS","OR","BETA","SE","P","TRAIT","STUDY","PUBMEDID","AUTHOR"])
129
- if verbose: log.write(" -Loading retrieved data into gwaslab Sumstats object ...")
129
+ log.write(" -Loading retrieved data into gwaslab Sumstats object ...", verbose=verbose)
130
130
  sigs = gl.Sumstats(gwascatalog_lead_snps.copy(),fmt="gwaslab",other=['REPORT_GENENAME', 'CLOSEST_GENENAMES','TRAIT', 'STUDY', 'PUBMEDID','AUTHOR'],verbose=False)
131
131
  sigs.fix_pos(verbose=False)
132
132
  sigs.fix_chr(verbose=False)
133
133
  sigs.sort_coordinate(verbose=False)
134
- if verbose: log.write("Finished retrieving data from GWASCatalog...")
134
+ log.write("Finished retrieving data from GWASCatalog...", verbose=verbose)
135
135
  #return gwaslab Sumstats object
136
136
  return sigs
137
137
 
@@ -142,14 +142,14 @@ def check_request_status_code(request_code,verbose=True,log=Log()):
142
142
  is_proceed=False
143
143
 
144
144
  if request_code == 200:
145
- if verbose: log.write(" -Status code 200 OK: Retrieved data from GWASCatalog successffully ...")
145
+ log.write(" -Status code 200 OK: Retrieved data from GWASCatalog successffully ...", verbose=verbose)
146
146
  is_proceed=True
147
147
  elif request_code == 404:
148
- if verbose: log.write(" -Status code 404 Not Found: The requested resource did not exist ...")
148
+ log.write(" -Status code 404 Not Found: The requested resource did not exist ...", verbose=verbose)
149
149
  elif request_code == 301:
150
- if verbose: log.write(" -Status code 301 Moved Permanently: The requested resource did not exist ...")
150
+ log.write(" -Status code 301 Moved Permanently: The requested resource did not exist ...", verbose=verbose)
151
151
  elif request_code == 400:
152
- if verbose: log.write(" -Status code 400 Bad Request: The requested resource did not exist ...")
152
+ log.write(" -Status code 400 Bad Request: The requested resource did not exist ...", verbose=verbose)
153
153
 
154
154
  return is_proceed
155
155
 
File: gwaslab/util_ex_ldproxyfinder.py (+9 -9)
@@ -49,8 +49,8 @@ def _extract_with_ld_proxy( snplist=None,
49
49
  ld_threshold=0.8
50
50
  ):
51
51
  ### Load vcf#######################################################################################
52
- if verbose: log.write("Start to load reference genotype...")
53
- if verbose: log.write(" -reference vcf path : "+ vcf_path)
52
+ log.write("Start to load reference genotype...", verbose=verbose)
53
+ log.write(" -reference vcf path : "+ vcf_path, verbose=verbose)
54
54
  if tabix is None:
55
55
  tabix = which("tabix")
56
56
  vcf_chr_dict = auto_check_vcf_chr_dict(vcf_path=vcf_path, vcf_chr_dict=vcf_chr_dict, verbose=verbose, log=log)
@@ -122,7 +122,7 @@ def _extract_with_ld_proxy( snplist=None,
122
122
 
123
123
  extracted_sumstats = pd.concat([extracted_sumstats, ld_proxies],ignore_index=True)
124
124
 
125
- if verbose: log.write("Finished loading reference genotype successfully!")
125
+ log.write("Finished loading reference genotype successfully!", verbose=verbose)
126
126
  return extracted_sumstats
127
127
 
128
128
 
@@ -139,13 +139,13 @@ def _get_rsq( row,
139
139
  ref_genotype = read_vcf(vcf_path,region=vcf_chr_dict[region[0]]+":"+str(region[1])+"-"+str(region[2]),tabix=tabix)
140
140
 
141
141
  if ref_genotype is None:
142
- if verbose: log.write(" -Warning: no data was retrieved. Skipping ...")
142
+ log.warning("No data was retrieved. Skipping ...", verbose=verbose)
143
143
  ref_genotype=dict()
144
144
  ref_genotype["variants/POS"]=np.array([],dtype="int64")
145
145
  return None
146
146
 
147
- if verbose: log.write(" -Retrieving index...")
148
- if verbose: log.write(" -Ref variants in the region: {}".format(len(ref_genotype["variants/POS"])))
147
+ log.write(" -Retrieving index...", verbose=verbose)
148
+ log.write(" -Ref variants in the region: {}".format(len(ref_genotype["variants/POS"])), verbose=verbose)
149
149
  # match sumstats pos and ref pos:
150
150
  # get ref index for its first appearance of sumstats pos
151
151
  #######################################################################################
@@ -170,7 +170,7 @@ def _get_rsq( row,
170
170
  else:
171
171
  # no position match
172
172
  return None
173
- if verbose: log.write(" -Matching variants using POS, NEA, EA ...")
173
+ log.write(" -Matching variants using POS, NEA, EA ...", verbose=verbose)
174
174
 
175
175
  sumstats["REFINDEX"] = sumstats.loc[:,["POS","NEA","EA"]].apply(lambda x: match_varaint(x), axis=1)
176
176
  log.write(" -Matched variants in sumstats and vcf:{} ".format(sum(~sumstats["REFINDEX"].isna())))
@@ -190,7 +190,7 @@ def _get_rsq( row,
190
190
  lead_snp_genotype = GenotypeArray([ref_genotype["calldata/GT"][lead_snp_ref_index]]).to_n_alt()
191
191
  other_snp_genotype = GenotypeArray(ref_genotype["calldata/GT"][other_snps_ref_index]).to_n_alt()
192
192
 
193
- if verbose: log.write(" -Calculating Rsq...")
193
+ log.write(" -Calculating Rsq...", verbose=verbose)
194
194
 
195
195
  if len(other_snp_genotype)>1:
196
196
  valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype)[0],2)
@@ -198,7 +198,7 @@ def _get_rsq( row,
198
198
  valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype),2)
199
199
  sumstats.loc[~sumstats["REFINDEX"].isna(),"RSQ"] = valid_r2
200
200
  else:
201
- if verbose: log.write(" -Lead SNP not found in reference...")
201
+ log.write(" -Lead SNP not found in reference...", verbose=verbose)
202
202
  sumstats["RSQ"]=None
203
203
 
204
204
  sumstats["RSQ"] = sumstats["RSQ"].astype("float")
File: gwaslab/util_ex_ldsc.py (+189 -0, new file)
@@ -0,0 +1,189 @@
1
+ from gwaslab.ldsc_sumstats import estimate_h2
2
+ from gwaslab.ldsc_sumstats import estimate_rg
3
+ from gwaslab.g_Log import Log
4
+ from gwaslab.qc_fix_sumstats import start_to
5
+ from gwaslab.qc_fix_sumstats import finished
6
+ from gwaslab.qc_fix_sumstats import skipped
7
+ from gwaslab.io_read_ldsc import parse_ldsc_summary
8
+
9
+ class ARGS():
10
+ def __init__(self, **args):
11
+
12
+ self.out = "ldsc"
13
+ self.bfile = None
14
+ self.l2 = False
15
+ self.extract = None
16
+ self.keep = None
17
+ self.ld_wind_snps = None
18
+ self.ld_wind_kb = None
19
+ self.ld_wind_cm = None
20
+ self.print_snps = None
21
+ self.annot =None
22
+ self.thin_annot = False
23
+ self.cts_bin = None
24
+ self.cts_breaks = None
25
+ self.cts_names = None
26
+ self.per_allele = False
27
+ self.pq_exp =None
28
+ self.no_print_annot = False
29
+
30
+ if "h2" in args.keys():
31
+ self.h2 = args["h2"]
32
+ else:
33
+ self.h2 = None
34
+
35
+ self.h2_cts = None
36
+
37
+ if "rg" in args.keys():
38
+ self.rg = args["rg"]
39
+ else:
40
+ self.rg = None
41
+
42
+ if "ref_ld" in args.keys():
43
+ self.ref_ld = args["ref_ld"]
44
+ else:
45
+ self.ref_ld = None
46
+
47
+ if "ref_ld_chr" in args.keys():
48
+ self.ref_ld_chr = args["ref_ld_chr"]
49
+ else:
50
+ self.ref_ld_chr = None
51
+
52
+ if "w_ld" in args.keys():
53
+ self.w_ld = args["w_ld"]
54
+ else:
55
+ self.w_ld = None
56
+
57
+ if "w_ld_chr" in args.keys():
58
+ self.w_ld_chr = args["w_ld_chr"]
59
+ else:
60
+ self.w_ld_chr = None
61
+
62
+ self.overlap_annot = False
63
+ self.print_coefficients = "ldsc"
64
+ self.frqfile = None
65
+ self.frqfile_chr = None
66
+ self.no_intercept = None
67
+ self.intercept_h2 = None
68
+ self.intercept_gencov = None
69
+ self.M = None
70
+ self.two_step = None
71
+ self.chisq_max = None
72
+ self.ref_ld_chr_cts = None
73
+ self.print_cov = None
74
+ self.print_delete_vals = False
75
+ self.chunk_size = 50
76
+ self.pickle = False
77
+ self.yes_really = False
78
+ self.invert_anyway = False
79
+ self.n_blocks = 200
80
+ self.not_M_5_50 = False
81
+ self.no_check_alleles = False
82
+ self.return_silly_things = False
83
+
84
+ if "samp_prev" in args.keys():
85
+ self.samp_prev = args["samp_prev"]
86
+ else:
87
+ self.samp_prev = None
88
+
89
+ if "pop_prev" in args.keys():
90
+ self.pop_prev = args["pop_prev"]
91
+ else:
92
+ self.pop_prev = None
93
+
94
+ def _estimate_h2_by_ldsc(insumstats, log, verbose=True, **args):
95
+ sumstats = insumstats.copy()
96
+ ##start function with col checking##########################################################
97
+ _start_line = "run LD score regression"
98
+ _end_line = "running LD score regression"
99
+ _start_cols =[]
100
+ _start_function = ".estimate_h2_by_ldsc()"
101
+ _must_args ={}
102
+
103
+ is_enough_info = start_to(sumstats=sumstats,
104
+ log=log,
105
+ verbose=verbose,
106
+ start_line=_start_line,
107
+ end_line=_end_line,
108
+ start_cols=_start_cols,
109
+ start_function=_start_function,
110
+ **_must_args)
111
+ if is_enough_info == False: return None
112
+ ############################################################################################
113
+ log.write(" -Run single variate LD score regression:", verbose=verbose)
114
+ log.write(" -Adopted from LDSC source code: https://github.com/bulik/ldsc", verbose=verbose)
115
+ log.write(" -Please cite LDSC: Bulik-Sullivan, et al. LD Score Regression Distinguishes Confounding from Polygenicity in Genome-Wide Association Studies. Nature Genetics, 2015.", verbose=verbose)
116
+ log.write(" -Arguments:", verbose=verbose)
117
+
118
+ for key, value in args.items():
119
+ log.write(" -{}:{}".format(key, value), verbose=verbose)
120
+
121
+ default_args = ARGS(**args)
122
+
123
+ if "Z" not in sumstats.columns:
124
+ sumstats["Z"] = sumstats["BETA"]/sumstats["SE"]
125
+
126
+ sumstats = sumstats.rename(columns={"EA":"A1","NEA":"A2","rsID":"SNP"})
127
+
128
+ log.write(" -LDSC log:", verbose=verbose)
129
+ summary = estimate_h2(sumstats, default_args, log)
130
+
131
+ log.write(" -Results have been stored in .ldsc_h2", verbose=verbose)
132
+ finished(log=log,verbose=verbose,end_line=_end_line)
133
+ return parse_ldsc_summary(summary)
134
+
135
+ def _estimate_rg_by_ldsc(insumstats, other_traits ,log, verbose=True, **args):
136
+ sumstats = insumstats.copy()
137
+ ##start function with col checking##########################################################
138
+ _start_line = "run LD score regression for genetic correlation"
139
+ _end_line = "running LD score regression for genetic correlation"
140
+ _start_cols =[]
141
+ _start_function = ".estimate_rg_by_ldsc()"
142
+ _must_args ={}
143
+
144
+ is_enough_info = start_to(sumstats=sumstats,
145
+ log=log,
146
+ verbose=verbose,
147
+ start_line=_start_line,
148
+ end_line=_end_line,
149
+ start_cols=_start_cols,
150
+ start_function=_start_function,
151
+ **_must_args)
152
+ if is_enough_info == False: return None
153
+ ############################################################################################
154
+ log.write(" -Run cross-trait LD score regression:", verbose=verbose)
155
+ log.write(" -Adopted from LDSC source code: https://github.com/bulik/ldsc", verbose=verbose)
156
+ log.write(" -Please cite LDSC: Bulik-Sullivan, B., et al. An Atlas of Genetic Correlations across Human Diseases and Traits. Nature Genetics, 2015.", verbose=verbose)
157
+ log.write(" -Arguments:", verbose=verbose)
158
+
159
+ for key, value in args.items():
160
+ log.write(" -{}:{}".format(key, value), verbose=verbose)
161
+
162
+ default_args = ARGS(**args)
163
+
164
+ if "Z" not in sumstats.columns:
165
+ sumstats["Z"] = sumstats["BETA"]/sumstats["SE"]
166
+
167
+ sumstats = sumstats.rename(columns={"EA":"A1","NEA":"A2","rsID":"SNP"})
168
+
169
+ other_traits_to_use = []
170
+ alias = default_args.rg.split(",")[1:]
171
+
172
+ for index, each_other_sumstats in enumerate(other_traits):
173
+ log.write(" -Processing sumstats with alias {} ({})".format(alias[index], each_other_sumstats.meta["gwaslab"]["study_name"]))
174
+ if "rsID" not in each_other_sumstats.data.columns:
175
+ to_append = each_other_sumstats.filter_hapmap3(verbose=False).data.rename(columns={"EA":"A1","NEA":"A2","rsID":"SNP"})
176
+ else:
177
+ to_append = each_other_sumstats.data.rename(columns={"EA":"A1","NEA":"A2","rsID":"SNP"})
178
+
179
+ if "Z" not in to_append.columns:
180
+ to_append["Z"] = to_append["BETA"]/to_append["SE"]
181
+
182
+ other_traits_to_use.append(to_append[["SNP","A1","A2","Z","N"]])
183
+
184
+ log.write(" -LDSC log:", verbose=verbose)
185
+ summary = estimate_rg(sumstats[["SNP","A1","A2","Z","N"]], other_traits_to_use, default_args, log)[1]
186
+
187
+ log.write(" -Results have been stored in .ldsc_rg", verbose=verbose)
188
+ finished(log=log,verbose=verbose,end_line=_end_line)
189
+ return summary
File: gwaslab/util_in_calculate_gc.py (+6 -6)
@@ -12,34 +12,34 @@ def lambdaGC(insumstats,include_chrXYMT=True, x=23 ,y=24, mt=25, mode="P",level=
12
12
  sumstats=insumstats.loc[:,["CHR",mode]]
13
13
 
14
14
  if include_chrXYMT is False:
15
- if verbose: log.write(" -Excluding chrX, chrY, chrMT from lambda GC calculation.")
15
+ log.write(" -Excluding chrX, chrY, chrMT from lambda GC calculation.", verbose=verbose)
16
16
  xymt= [x,y,mt,"chrx","chry","chrmt","chrX","chrY","chrMT","chrM","M","x","y","mt","X","Y","MT"]
17
17
  sumstats = sumstats.loc[~sumstats["CHR"].isin(xymt),:]
18
18
 
19
19
  indata = sumstats[mode].values
20
20
  if len(indata) == 0:
21
- if verbose: log.write(" -No available variants to use for calculation.")
21
+ log.write(" -No available variants to use for calculation.", verbose=verbose)
22
22
  return np.nan
23
23
  if mode=="p" or mode=="P":
24
24
  observedMedianChi2 = sp.stats.chi2.isf(np.nanmedian(indata),1)
25
25
  expectedMedianChi2 = sp.stats.chi2.ppf(level,1)
26
26
  lambdagc=observedMedianChi2/expectedMedianChi2
27
- if verbose: log.write(" -Lambda GC (P mode) at "+ str(1 - level)+ " is"," ","{:.5f}".format(lambdagc))
27
+ log.write(" -Lambda GC (P mode) at "+ str(1 - level)+ " is"," ","{:.5f}".format(lambdagc), verbose=verbose)
28
28
  elif mode=="mlog10p" or mode=="MLOG10P":
29
29
  observedMedianChi2 = sp.stats.chi2.isf( np.nanmedian(np.power(10,-indata)) ,1)
30
30
  expectedMedianChi2 = sp.stats.chi2.ppf(level,1)
31
31
  lambdagc=observedMedianChi2/expectedMedianChi2
32
- if verbose: log.write(" -Lambda GC (MLOG10P mode) at "+ str(1- level)+ " is"," ","{:.5f}".format(lambdagc))
32
+ log.write(" -Lambda GC (MLOG10P mode) at "+ str(1- level)+ " is"," ","{:.5f}".format(lambdagc), verbose=verbose)
33
33
  elif mode=="z" or mode=="Z":
34
34
  observedMedianChi2 = np.median((indata)**2)
35
35
  expectedMedianChi2 = sp.stats.chi2.ppf(level,1)
36
36
  lambdagc=observedMedianChi2/expectedMedianChi2
37
- if verbose:log.write(" -Lambda GC (Z mode) at "+ str(1- level)+ " is"," ","{:.5f}".format(lambdagc))
37
+ if verbose:log.write(" -Lambda GC (Z mode) at "+ str(1- level)+ " is"," ","{:.5f}".format(lambdagc), verbose=verbose)
38
38
  elif mode=="chi2" or mode=="CHISQ":
39
39
  observedMedianChi2 = np.median(indata)
40
40
  expectedMedianChi2 = sp.stats.chi2.ppf(level,1)
41
41
  lambdagc=observedMedianChi2/expectedMedianChi2
42
- if verbose:log.write(" -Lambda GC (CHISQ mode) at "+ str(1- level)+ " is"," ","{:.5f}".format(lambdagc))
42
+ log.write(" -Lambda GC (CHISQ mode) at "+ str(1- level)+ " is"," ","{:.5f}".format(lambdagc), verbose=verbose)
43
43
  else:
44
44
  return np.nan
45
45
  return lambdagc
File: gwaslab/util_in_calculate_power.py (+42 -43)
@@ -21,24 +21,23 @@ def get_power(
21
21
  log=Log(),
22
22
  verbose=True
23
23
  ):
24
- if verbose: log.write(" Start to calculate statistical power...")
24
+ log.write(" Start to calculate statistical power...", verbose=verbose)
25
25
  if mode=="b":
26
- if verbose:
27
- log.write(" -Input settings (b mode):")
28
- log.write(" -Number of cases:{}".format(ncase))
29
- log.write(" -Number of controls:{}".format(ncontrol))
30
- if genotype_rr is not None:
31
- log.write(" -Risk allele RR:{:.3f}".format(genotype_rr))
32
- elif genotype_or is not None:
33
- log.write(" -Risk allele OR:{:.3f}".format(genotype_or))
34
- elif beta is not None:
35
- log.write(" -Risk allele beta:{:.3f}".format(beta))
36
- else:
37
- genotype_rr = 0.1
38
- log.write(" -Risk allele RR:{:.3f}".format(genotype_rr))
39
- log.write(" -Disease prevalence:{:.3f}".format(prevalence))
40
- log.write(" -Risk allele frequency: {:.3f}".format(daf))
41
- log.write(" -Significance level: {:.3e}".format(sig_level))
26
+ log.write(" -Input settings (b mode):", verbose=verbose)
27
+ log.write(" -Number of cases:{}".format(ncase), verbose=verbose)
28
+ log.write(" -Number of controls:{}".format(ncontrol), verbose=verbose)
29
+ if genotype_rr is not None:
30
+ log.write(" -Risk allele RR:{:.3f}".format(genotype_rr), verbose=verbose)
31
+ elif genotype_or is not None:
32
+ log.write(" -Risk allele OR:{:.3f}".format(genotype_or), verbose=verbose)
33
+ elif beta is not None:
34
+ log.write(" -Risk allele beta:{:.3f}".format(beta), verbose=verbose)
35
+ else:
36
+ genotype_rr = 0.1
37
+ log.write(" -Risk allele RR:{:.3f}".format(genotype_rr), verbose=verbose)
38
+ log.write(" -Disease prevalence:{:.3f}".format(prevalence), verbose=verbose)
39
+ log.write(" -Risk allele frequency: {:.3f}".format(daf), verbose=verbose)
40
+ log.write(" -Significance level: {:.3e}".format(sig_level), verbose=verbose)
42
41
  # Skol, A. D., Scott, L. J., Abecasis, G. R., & Boehnke, M. (2006). Joint analysis is more efficient than replication-based analysis for two-stage genome-wide association studies. Nature genetics, 38(2), 209-213.
43
42
  aaf = daf**2
44
43
  abf = 2 * (daf) * (1 - daf)
@@ -56,11 +55,11 @@ def get_power(
56
55
  # https://jamanetwork.com/journals/jama/fullarticle/188182
57
56
 
58
57
  if or_to_rr ==False:
59
- if verbose: log.write(" -Alogorithm: Skol, Andrew D., et al. Nature genetics 38.2 (2006): 209-213....")
60
- if verbose: log.write(" -GRR is approximated using OR. For prevalence < 10%, GRR is very similar to OR....")
58
+ log.write(" -Alogorithm: Skol, Andrew D., et al. Nature genetics 38.2 (2006): 209-213....", verbose=verbose)
59
+ log.write(" -GRR is approximated using OR. For prevalence < 10%, GRR is very similar to OR....", verbose=verbose)
61
60
  else:
62
- if verbose: log.write(" -OR is converted to GRR using base prevalence: {}".format(prevalence))
63
- if verbose: log.write(" -Alogorithm: Zhang, J., & Kai, F. Y. (1998). What's the relative risk?: A method of correcting the odds ratio in cohort studies of common outcomes. Jama, 280(19), 1690-1691.....")
61
+ log.write(" -OR is converted to GRR using base prevalence: {}".format(prevalence), verbose=verbose)
62
+ log.write(" -Alogorithm: Zhang, J., & Kai, F. Y. (1998). What's the relative risk?: A method of correcting the odds ratio in cohort studies of common outcomes. Jama, 280(19), 1690-1691.....", verbose=verbose)
64
63
 
65
64
  # additive
66
65
  x = [ 2*genotype_rr-1, genotype_rr, 1 ]
@@ -68,19 +67,19 @@ def get_power(
68
67
  aap= x[0] * prevalence / (x[0]*aaf + x[1]*abf + x[2]*bbf)
69
68
  abp= x[1] * prevalence / (x[0]*aaf + x[1]*abf + x[2]*bbf)
70
69
  bbp= x[2] * prevalence / (x[0]*aaf + x[1]*abf + x[2]*bbf)
71
- if verbose: log.write("Probability of disease :")
72
- if verbose: log.write(" - Individuals with AA genotype: {:.3f}".format(aap))
73
- if verbose: log.write(" - Individuals with AB genotype: {:.3f}".format(abp))
74
- if verbose: log.write(" - Individuals with BB genotype: {:.3f}".format(bbp))
70
+ log.write("Probability of disease :", verbose=verbose)
71
+ log.write(" - Individuals with AA genotype: {:.3f}".format(aap), verbose=verbose)
72
+ log.write(" - Individuals with AB genotype: {:.3f}".format(abp), verbose=verbose)
73
+ log.write(" - Individuals with BB genotype: {:.3f}".format(bbp), verbose=verbose)
75
74
 
76
75
  pcase= (aap * aaf + abp * abf*0.5) / prevalence
77
76
  pcontrol=((1-aap )* aaf + (1-abp )* abf*0.5) / (1 - prevalence)
78
77
 
79
78
  vcase = pcase *(1-pcase)
80
79
  vcontrol =pcontrol *(1-pcontrol)
81
- if verbose: log.write("Expected risk allele frequency:")
82
- if verbose: log.write(" - In cases: {:.3f}".format(pcase))
83
- if verbose: log.write(" - In controls: {:.3f}".format(pcontrol))
80
+ log.write("Expected risk allele frequency:", verbose=verbose)
81
+ log.write(" - In cases: {:.3f}".format(pcase), verbose=verbose)
82
+ log.write(" - In controls: {:.3f}".format(pcontrol), verbose=verbose)
84
83
 
85
84
  num= (pcase - pcontrol)
86
85
  den= np.sqrt( (vcase/ncase + vcontrol/ncontrol)*0.5 )
@@ -88,22 +87,22 @@ def get_power(
88
87
 
89
88
  c = ss.norm.isf(sig_level/2)
90
89
  power = 1 - ss.norm.cdf(c-u) + ss.norm.cdf(-c-u)
91
- if verbose: log.write("Expected power: {:.3f}".format(power))
90
+ log.write("Expected power: {:.3f}".format(power), verbose=verbose)
92
91
 
93
92
  elif mode=="q":
94
93
  if beta is None:
95
94
  beta = 0.1
96
- if verbose:
97
- log.write(" -Input settings (q mode):")
98
- log.write(" -Significance level: {}".format(sig_level))
99
- log.write(" -EAF: {}".format(eaf))
100
- log.write(" -BETA: {}".format(beta))
101
- log.write(" -N: {}".format(n))
102
- log.write(" -SNPR2: {}".format(2*eaf*(1-eaf)*(beta**2)))
95
+
96
+ log.write(" -Input settings (q mode):", verbose=verbose)
97
+ log.write(" -Significance level: {}".format(sig_level), verbose=verbose)
98
+ log.write(" -EAF: {}".format(eaf), verbose=verbose)
99
+ log.write(" -BETA: {}".format(beta), verbose=verbose)
100
+ log.write(" -N: {}".format(n), verbose=verbose)
101
+ log.write(" -SNPR2: {}".format(2*eaf*(1-eaf)*(beta**2)), verbose=verbose)
103
102
  c = ss.chi2.isf(sig_level,df=1)
104
103
  NCP = n * 2*eaf*(1-eaf)*(beta**2)/vary
105
104
  power = 1 - ss.ncx2.cdf(c, df=1, nc=NCP)
106
- if verbose: log.write("Finished calculating statistical power.")
105
+ log.write("Finished calculating statistical power.", verbose=verbose)
107
106
  return power
108
107
 
109
108
  def get_beta(
@@ -137,11 +136,11 @@ def get_beta(
137
136
  eafs = np.linspace(eaf_range[1],eaf_range[0],n_matrix)
138
137
  betas = np.linspace(beta_range[0],beta_range[1],n_matrix)
139
138
 
140
- if verbose: log.write(" -Updating eaf-beta matrix...")
139
+ log.write(" -Updating eaf-beta matrix...", verbose=verbose)
141
140
  for i in range(n_matrix):
142
141
  eaf_beta_matrix[i,] = calculate_power_single(beta=betas,eaf=eafs[i],n=n,sig_level=sig_level,vary=vary)
143
142
 
144
- if verbose: log.write(" -Extracting eaf-beta combinations with power = {}...".format(t))
143
+ log.write(" -Extracting eaf-beta combinations with power = {}...".format(t), verbose=verbose)
145
144
  i,j=1,1
146
145
  eaf_beta = []
147
146
  while i<n_matrix-1 and j<n_matrix-1:
@@ -207,11 +206,11 @@ def get_beta_binary(
207
206
  eafs = np.linspace(eaf_range[1],eaf_range[0],n_matrix)
208
207
  betas = np.linspace(beta_range[0],beta_range[1],n_matrix)
209
208
 
210
- if verbose: log.write(" -Updating eaf-beta matrix...")
209
+ log.write(" -Updating eaf-beta matrix...", verbose=verbose)
211
210
  if or_to_rr ==False:
212
- if verbose: log.write(" -GRR is approximated using OR. For prevalence < 10%, GRR is very similar to OR....")
211
+ log.write(" -GRR is approximated using OR. For prevalence < 10%, GRR is very similar to OR....", verbose=verbose)
213
212
  else:
214
- if verbose: log.write(" -OR is converted to GRR using base prevalence: {}".format(prevalence))
213
+ log.write(" -OR is converted to GRR using base prevalence: {}".format(prevalence), verbose=verbose)
215
214
 
216
215
  for i in range(n_matrix):
217
216
  eaf_beta_matrix[i,] = calculate_power_single(beta=betas,
@@ -222,7 +221,7 @@ def get_beta_binary(
222
221
  sig_level=sig_level,
223
222
  or_to_rr=or_to_rr)
224
223
 
225
- if verbose: log.write(" -Extracting eaf-beta combinations with power = {}...".format(t))
224
+ log.write(" -Extracting eaf-beta combinations with power = {}...".format(t), verbose=verbose)
226
225
  i,j=1,1
227
226
  eaf_beta = []
228
227
  while i<n_matrix-1 and j<n_matrix-1:
File: gwaslab/util_in_convert_h2.py (+8 -8)
@@ -65,7 +65,7 @@ def _get_per_snp_r2(sumstats,
65
65
  adjuested=False,
66
66
  verbose=True):
67
67
  # Pierce, B. L., Ahsan, H., & VanderWeele, T. J. (2011). Power and instrument strength requirements for Mendelian randomization studies using multiple genetic variants. International journal of epidemiology, 40(3), 740-752.
68
- if verbose: log.write("Start to calculate per-SNP heritibility...")
68
+ log.write("Start to calculate per-SNP heritibility...", verbose=verbose)
69
69
  if type(k) is int or type(k) is float:
70
70
  pass
71
71
  elif k =="all":
@@ -81,18 +81,18 @@ def _get_per_snp_r2(sumstats,
81
81
  # Var(e) = betase**2 * 2 * N * MAF * (1-MAF)
82
82
  # r2 = Var(beta * X) / Var(y)
83
83
 
84
- if verbose: log.write(" -Calculating per-SNP rsq by 2 * (BETA**2) * AF * (1-AF) / Var(y)...")
84
+ log.write(" -Calculating per-SNP rsq by 2 * (BETA**2) * AF * (1-AF) / Var(y)...", verbose=verbose)
85
85
  sumstats["_VAR(BETAX)"] = 2*(sumstats[beta]**2)*sumstats[af]*(1-sumstats[af])
86
86
 
87
87
  if type(vary) is int or type(vary) is float:
88
- if verbose: log.write(" -Var(y) is provided: {}...".format(vary))
88
+ log.write(" -Var(y) is provided: {}...".format(vary), verbose=verbose)
89
89
  sumstats["SNPR2"] = sumstats["_VAR(BETAX)"] / vary
90
90
  elif vary=="se":
91
- if verbose: log.write(" -Var(y) is estimated from VAR(BETA * X), N, MAF, SE: {}...".format(vary))
91
+ log.write(" -Var(y) is estimated from VAR(BETA * X), N, MAF, SE: {}...".format(vary), verbose=verbose)
92
92
  sumstats["_SIGMA2"] = sumstats[se]**2 * 2*(sumstats[n])*sumstats[af]*(1-sumstats[af])
93
93
  sumstats["SNPR2"] = sumstats["_VAR(BETAX)"] / (sumstats["_SIGMA2"] + sumstats["_VAR(BETAX)"])
94
94
  else:
95
- if verbose: log.write(" -Warning: Not enough informationfor calculation.")
95
+ log.warning("Not enough information for calculation.")
96
96
 
97
97
  if mode=="b":
98
98
  if ncase not in sumstats.columns:
@@ -117,11 +117,11 @@ def _get_per_snp_r2(sumstats,
117
117
  else:
118
118
  snpr2 = "SNPR2"
119
119
  if n in sumstats.columns:
120
- if verbose: log.write(" -Calculating F-statistic: F = [(N-k-1)/k] * (r2/1-r2)... where k = {}".format(k))
121
- if verbose: log.write(" -For r2, {} is used.".format(snpr2))
120
+ log.write(" -Calculating F-statistic: F = [(N-k-1)/k] * (r2/1-r2)... where k = {}".format(k), verbose=verbose)
121
+ log.write(" -For r2, {} is used.".format(snpr2), verbose=verbose)
122
122
  sumstats["F"] = sumstats[snpr2]*(sumstats[n]-1 -k)/((1-sumstats[snpr2]) * k)
123
123
 
124
- if verbose: log.write("Finished calculating per-SNP heritability!")
124
+ log.write("Finished calculating per-SNP heritability!", verbose=verbose)
125
125
  return sumstats
126
126
  #
127
127
  def get_population_allele_frequency(af, prop, odds_ratio, prevalence,eps=1e-15):