gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (57) hide show
  1. gwaslab/bd_common_data.py +6 -3
  2. gwaslab/bd_download.py +9 -9
  3. gwaslab/bd_get_hapmap3.py +43 -9
  4. gwaslab/data/formatbook.json +722 -721
  5. gwaslab/g_Log.py +22 -5
  6. gwaslab/g_Sumstats.py +110 -163
  7. gwaslab/g_SumstatsPair.py +76 -25
  8. gwaslab/g_SumstatsT.py +2 -2
  9. gwaslab/g_Sumstats_summary.py +3 -3
  10. gwaslab/g_version.py +10 -10
  11. gwaslab/hm_casting.py +36 -17
  12. gwaslab/hm_harmonize_sumstats.py +354 -221
  13. gwaslab/hm_rsid_to_chrpos.py +1 -1
  14. gwaslab/io_preformat_input.py +49 -43
  15. gwaslab/io_read_ldsc.py +49 -1
  16. gwaslab/io_to_formats.py +428 -295
  17. gwaslab/ldsc_irwls.py +198 -0
  18. gwaslab/ldsc_jackknife.py +514 -0
  19. gwaslab/ldsc_ldscore.py +417 -0
  20. gwaslab/ldsc_parse.py +294 -0
  21. gwaslab/ldsc_regressions.py +747 -0
  22. gwaslab/ldsc_sumstats.py +629 -0
  23. gwaslab/qc_check_datatype.py +3 -3
  24. gwaslab/qc_fix_sumstats.py +891 -778
  25. gwaslab/util_ex_calculate_ldmatrix.py +31 -13
  26. gwaslab/util_ex_gwascatalog.py +25 -25
  27. gwaslab/util_ex_ldproxyfinder.py +10 -10
  28. gwaslab/util_ex_ldsc.py +189 -0
  29. gwaslab/util_ex_process_ref.py +3 -3
  30. gwaslab/util_ex_run_coloc.py +26 -4
  31. gwaslab/util_in_calculate_gc.py +6 -6
  32. gwaslab/util_in_calculate_power.py +42 -43
  33. gwaslab/util_in_convert_h2.py +8 -8
  34. gwaslab/util_in_fill_data.py +30 -30
  35. gwaslab/util_in_filter_value.py +201 -74
  36. gwaslab/util_in_get_density.py +10 -10
  37. gwaslab/util_in_get_sig.py +445 -71
  38. gwaslab/viz_aux_annotate_plot.py +12 -12
  39. gwaslab/viz_aux_quickfix.py +42 -37
  40. gwaslab/viz_aux_reposition_text.py +10 -7
  41. gwaslab/viz_aux_save_figure.py +18 -8
  42. gwaslab/viz_plot_compare_af.py +32 -33
  43. gwaslab/viz_plot_compare_effect.py +63 -71
  44. gwaslab/viz_plot_miamiplot2.py +34 -26
  45. gwaslab/viz_plot_mqqplot.py +126 -75
  46. gwaslab/viz_plot_qqplot.py +11 -8
  47. gwaslab/viz_plot_regionalplot.py +36 -33
  48. gwaslab/viz_plot_rg_heatmap.py +28 -26
  49. gwaslab/viz_plot_stackedregional.py +40 -21
  50. gwaslab/viz_plot_trumpetplot.py +65 -61
  51. gwaslab-3.4.39.dist-info/LICENSE +674 -0
  52. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
  53. gwaslab-3.4.39.dist-info/RECORD +80 -0
  54. gwaslab-3.4.37.dist-info/RECORD +0 -72
  55. /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
  56. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
  57. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
@@ -12,6 +12,7 @@ def tofinemapping(sumstats,
12
12
  study=None,
13
13
  bfile=None,
14
14
  vcf=None,
15
+ loci=None,
15
16
  out="./",
16
17
  windowsizekb=1000,
17
18
  n_cores=1,
@@ -27,8 +28,13 @@ def tofinemapping(sumstats,
27
28
  suffixes=[""]
28
29
  if getlead_args is None:
29
30
  getlead_args={"windowsizekb":1000}
30
- sig_df = getsig(sumstats,id="SNPID",chrom="CHR",pos="POS",p="P"+suffixes[0],**getlead_args)
31
-
31
+
32
+ if loci is None:
33
+ log.write(" -Loci were not provided. All significant loci will be automatically extracted...")
34
+ sig_df = getsig(sumstats,id="SNPID",chrom="CHR",pos="POS",p="P"+suffixes[0],**getlead_args)
35
+ else:
36
+ sig_df = sumstats.loc[sumstats["SNPID"].isin(loci),:]
37
+
32
38
  # Drop duplicate!!!!
33
39
  log.write(" -Dropping duplicated SNPIDs...")
34
40
  sumstats = sumstats.drop_duplicates(subset=["SNPID"]).copy()
@@ -170,6 +176,7 @@ def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, w
170
176
  def _align_sumstats_with_bim(row, locus_sumstats, ref_bim, log=Log(),suffixes=None):
171
177
  if suffixes is None:
172
178
  suffixes=[""]
179
+
173
180
  log.write(" -#variants in locus ({}): {}".format(row["SNPID"],len(locus_sumstats)))
174
181
  # convert category to string
175
182
  locus_sumstats["EA"] = locus_sumstats["EA"].astype("string")
@@ -180,28 +187,35 @@ def _align_sumstats_with_bim(row, locus_sumstats, ref_bim, log=Log(),suffixes=No
180
187
  combined_df = pd.merge(ref_bim, locus_sumstats, on="SNPID",how="inner")
181
188
 
182
189
  # match allele
183
- allele_match = ((combined_df["EA"] == combined_df["EA_bim"]) & (combined_df["NEA"] == combined_df["NEA_bim"]) ) | ((combined_df["EA"] == combined_df["NEA_bim"])& (combined_df["NEA"] == combined_df["EA_bim"]))
184
- log.write(" -#Variants with matched alleles:{}".format(sum(allele_match)))
190
+ perfect_match = ((combined_df["EA"] == combined_df["EA_bim"]) & (combined_df["NEA"] == combined_df["NEA_bim"]) )
191
+ log.write(" -#Variants with perfect matched alleles:{}".format(sum(perfect_match)))
185
192
 
186
193
  # fliipped allele
187
- ea_mis_match = combined_df["EA"] != combined_df["EA_bim"]
188
- log.write(" -#Variants with flipped alleles:{}".format(sum(ea_mis_match)))
194
+ #ea_mis_match = combined_df["EA"] != combined_df["EA_bim"]
195
+ flipped_match = ((combined_df["EA"] == combined_df["NEA_bim"])& (combined_df["NEA"] == combined_df["EA_bim"]))
196
+ log.write(" -#Variants with flipped alleles:{}".format(sum(flipped_match)))
189
197
 
190
- if row["SNPID"] not in combined_df.loc[allele_match,"SNPID"].values:
191
- log.write(" -Warning: Lead variant was not available in reference!!!!!!!!!!!!!!!")
198
+ allele_match = perfect_match | flipped_match
199
+ log.write(" -#Total Variants matched:{}".format(sum(allele_match)))
200
+
201
+ if row["SNPID"] not in combined_df.loc[perfect_match,"SNPID"].values:
202
+ log.warning("Lead variant was not available in reference!")
192
203
 
193
204
  # adjust statistics
194
205
  output_columns=["SNPID","CHR","POS","EA_bim","NEA_bim"]
195
206
  for suffix in suffixes:
196
207
  if ("BETA"+suffix in locus_sumstats.columns) and ("SE"+suffix in locus_sumstats.columns):
197
- combined_df.loc[ea_mis_match,"BETA"+suffix] = - combined_df.loc[ea_mis_match,"BETA"+suffix]
208
+ log.write(" -Flipping BETA{} for variants with flipped alleles...".format(suffix))
209
+ combined_df.loc[flipped_match,"BETA"+suffix] = - combined_df.loc[flipped_match,"BETA"+suffix]
198
210
  output_columns.append("BETA"+suffix)
199
211
  output_columns.append("SE"+suffix)
200
212
  if "Z" in locus_sumstats.columns:
201
- combined_df.loc[ea_mis_match,"Z"+suffix] = - combined_df.loc[ea_mis_match,"Z"+suffix]
213
+ log.write(" -Flipping Z{} for variants with flipped alleles...".format(suffix))
214
+ combined_df.loc[flipped_match,"Z"+suffix] = - combined_df.loc[flipped_match,"Z"+suffix]
202
215
  output_columns.append("Z"+suffix)
203
216
  if "EAF" in locus_sumstats.columns:
204
- combined_df.loc[ea_mis_match,"EAF"+suffix] = 1 - combined_df.loc[ea_mis_match,"EAF"+suffix]
217
+ log.write(" -Flipping EAF{} for variants with flipped alleles...".format(suffix))
218
+ combined_df.loc[flipped_match,"EAF"+suffix] = 1 - combined_df.loc[flipped_match,"EAF"+suffix]
205
219
  output_columns.append("EAF"+suffix)
206
220
  if "N" in locus_sumstats.columns:
207
221
  output_columns.append("N"+suffix)
@@ -215,6 +229,7 @@ def _export_snplist_and_locus_sumstats(matched_sumstats, out, study, row, window
215
229
  matched_snp_list_path = "{}/{}_{}_{}.snplist.raw".format(out.rstrip("/"), study, row["SNPID"] ,windowsizekb)
216
230
 
217
231
  matched_sumstats["SNPID"].to_csv(matched_snp_list_path, index=None, header=None)
232
+ log.write(" -Exporting SNP list of {} to: {}...".format(len(matched_sumstats) ,matched_snp_list_path))
218
233
 
219
234
  # create locus-sumstats EA, NEA, (BETA, SE), Z
220
235
  matched_sumstats_path = "{}/{}_{}_{}.sumstats.gz".format(out.rstrip("/"), study, row["SNPID"] ,windowsizekb)
@@ -230,7 +245,10 @@ def _export_snplist_and_locus_sumstats(matched_sumstats, out, study, row, window
230
245
  to_export_columns.append("EAF"+suffix)
231
246
  if "N"+suffix in matched_sumstats.columns:
232
247
  to_export_columns.append("N"+suffix)
233
- matched_sumstats.loc[:, ["SNPID"]+to_export_columns].to_csv(matched_sumstats_path, index=None)
248
+
249
+ log.write(" -Exporting locus sumstats to: {}...".format(matched_sumstats_path))
250
+ log.write(" -Exported columns: {}...".format(["SNPID"]+to_export_columns))
251
+ matched_sumstats[ ["SNPID"]+to_export_columns].to_csv(matched_sumstats_path, index=None)
234
252
  return matched_snp_list_path, matched_sumstats_path
235
253
 
236
254
  def _check_snpid_order(snplist_path, matched_sumstats_snpid,log):
@@ -238,4 +256,4 @@ def _check_snpid_order(snplist_path, matched_sumstats_snpid,log):
238
256
  if list(matched_sumstats_snpid) == list(snpid_list):
239
257
  log.write(" -Sumstats SNPID order and LD matrix SNPID order are matched.")
240
258
  else:
241
- log.write(" -Warning: Sumstats SNPID order and LD matrix SNPID order are not matched...")
259
+ log.warning("Sumstats SNPID order and LD matrix SNPID order are not matched!")
@@ -9,26 +9,26 @@ def gwascatalog_trait(efo,source="NCBI",sig_level=5e-8,verbose=True,log=Log()):
9
9
  #https://www.ebi.ac.uk/gwas/rest/docs/api
10
10
 
11
11
  base_url = "https://www.ebi.ac.uk/gwas/rest/api/efoTraits/"+efo
12
- if verbose: log.write("Start to retrieve data from GWASCatalog...")
13
- if verbose: log.write(" -Please make sure your sumstats is based on GRCh38...")
14
- if verbose: log.write(" -Requesting (GET) trait information through the GWASCatalog API...")
15
- if verbose: log.write(" -EFO trait api: "+ base_url)
12
+ log.write("Start to retrieve data from GWASCatalog...", verbose=verbose)
13
+ log.write(" -Please make sure your sumstats is based on GRCh38...", verbose=verbose)
14
+ log.write(" -Requesting (GET) trait information through the GWASCatalog API...", verbose=verbose)
15
+ log.write(" -EFO trait api: "+ base_url, verbose=verbose)
16
16
  text = requests.get(base_url)
17
- if verbose:
18
- log.write(" -Status code: {}".format(text.status_code))
19
- if text.status_code!=200:
20
- log.write(" -Status code is not 200. Access failed. Please check your internet or the GWAS Catalog sever status.")
21
- log.write(" -Message:{}".format(text.text))
22
- return 0
17
+
18
+ log.write(" -Status code: {}".format(text.status_code), verbose=verbose)
19
+ if text.status_code!=200:
20
+ log.write(" -Status code is not 200. Access failed. Please check your internet or the GWAS Catalog sever status.", verbose=verbose)
21
+ log.write(" -Message:{}".format(text.text), verbose=verbose)
22
+ return 0
23
23
 
24
24
  api_response = json.loads(text.text)
25
- if verbose: log.write(" -Trait Name:",api_response["trait"])
26
- if verbose: log.write(" -Trait URL:",api_response["uri"])
25
+ log.write(" -Trait Name:",api_response["trait"], verbose=verbose)
26
+ log.write(" -Trait URL:",api_response["uri"], verbose=verbose)
27
27
 
28
28
  base_url = "https://www.ebi.ac.uk/gwas/rest/api/efoTraits/"+efo+"/associations?projection=associationByEfoTrait"
29
- if verbose: log.write(" -Requesting (GET) GWAS associations through the GWASCatalog API...")
30
- if verbose: log.write(" -associationsByTraitSummary API: "+ base_url)
31
- if verbose: log.write(" -Note: this step might take a while...")
29
+ log.write(" -Requesting (GET) GWAS associations through the GWASCatalog API...", verbose=verbose)
30
+ log.write(" -associationsByTraitSummary API: "+ base_url, verbose=verbose)
31
+ log.write(" -Note: this step might take a while...", verbose=verbose)
32
32
 
33
33
  # get request and check status code of response
34
34
  raw_data = requests.get(base_url)
@@ -37,13 +37,13 @@ def gwascatalog_trait(efo,source="NCBI",sig_level=5e-8,verbose=True,log=Log()):
37
37
  is_proceed = check_request_status_code(raw_data.status_code,verbose=verbose,log=log)
38
38
  if is_proceed is False: return False
39
39
 
40
- if verbose: log.write(" -Loading json ...")
40
+ log.write(" -Loading json ...", verbose=verbose)
41
41
  # Transform API response from JSON into Python dictionary
42
42
  api_response = json.loads(raw_data.text)
43
- if verbose: log.write(" -Parsing json ...")
43
+ log.write(" -Parsing json ...", verbose=verbose)
44
44
  # An
45
45
  records=list()
46
- if verbose: log.write(" -Number of reported associations for "+ efo +" in GWASCatalog:",len( api_response["_embedded"]["associations"]))
46
+ log.write(" -Number of reported associations for "+ efo +" in GWASCatalog:",len( api_response["_embedded"]["associations"]), verbose=verbose)
47
47
 
48
48
  for association in api_response["_embedded"]["associations"]:
49
49
  #association statistics:
@@ -126,12 +126,12 @@ def gwascatalog_trait(efo,source="NCBI",sig_level=5e-8,verbose=True,log=Log()):
126
126
  records.append(row)
127
127
  #rsid locations
128
128
  gwascatalog_lead_snps = pd.DataFrame(records,columns=["SNPID","CHR","POS","REPORT_GENENAME","CLOSEST_GENENAMES","FUNCTION_CLASS","OR","BETA","SE","P","TRAIT","STUDY","PUBMEDID","AUTHOR"])
129
- if verbose: log.write(" -Loading retrieved data into gwaslab Sumstats object ...")
130
- sigs = gl.Sumstats(gwascatalog_lead_snps,fmt="gwaslab",other=['REPORT_GENENAME', 'CLOSEST_GENENAMES','TRAIT', 'STUDY', 'PUBMEDID','AUTHOR'],verbose=False)
129
+ log.write(" -Loading retrieved data into gwaslab Sumstats object ...", verbose=verbose)
130
+ sigs = gl.Sumstats(gwascatalog_lead_snps.copy(),fmt="gwaslab",other=['REPORT_GENENAME', 'CLOSEST_GENENAMES','TRAIT', 'STUDY', 'PUBMEDID','AUTHOR'],verbose=False)
131
131
  sigs.fix_pos(verbose=False)
132
132
  sigs.fix_chr(verbose=False)
133
133
  sigs.sort_coordinate(verbose=False)
134
- if verbose: log.write("Finished retrieving data from GWASCatalog...")
134
+ log.write("Finished retrieving data from GWASCatalog...", verbose=verbose)
135
135
  #return gwaslab Sumstats object
136
136
  return sigs
137
137
 
@@ -142,14 +142,14 @@ def check_request_status_code(request_code,verbose=True,log=Log()):
142
142
  is_proceed=False
143
143
 
144
144
  if request_code == 200:
145
- if verbose: log.write(" -Status code 200 OK: Retrieved data from GWASCatalog successffully ...")
145
+ log.write(" -Status code 200 OK: Retrieved data from GWASCatalog successffully ...", verbose=verbose)
146
146
  is_proceed=True
147
147
  elif request_code == 404:
148
- if verbose: log.write(" -Status code 404 Not Found: The requested resource did not exist ...")
148
+ log.write(" -Status code 404 Not Found: The requested resource did not exist ...", verbose=verbose)
149
149
  elif request_code == 301:
150
- if verbose: log.write(" -Status code 301 Moved Permanently: The requested resource did not exist ...")
150
+ log.write(" -Status code 301 Moved Permanently: The requested resource did not exist ...", verbose=verbose)
151
151
  elif request_code == 400:
152
- if verbose: log.write(" -Status code 400 Bad Request: The requested resource did not exist ...")
152
+ log.write(" -Status code 400 Bad Request: The requested resource did not exist ...", verbose=verbose)
153
153
 
154
154
  return is_proceed
155
155
 
@@ -46,11 +46,11 @@ def _extract_with_ld_proxy( snplist=None,
46
46
  log=Log(),
47
47
  verbose=True,
48
48
  windowsizekb=100,
49
- ld_threshold=0.8,
49
+ ld_threshold=0.8
50
50
  ):
51
51
  ### Load vcf#######################################################################################
52
- if verbose: log.write("Start to load reference genotype...")
53
- if verbose: log.write(" -reference vcf path : "+ vcf_path)
52
+ log.write("Start to load reference genotype...", verbose=verbose)
53
+ log.write(" -reference vcf path : "+ vcf_path, verbose=verbose)
54
54
  if tabix is None:
55
55
  tabix = which("tabix")
56
56
  vcf_chr_dict = auto_check_vcf_chr_dict(vcf_path=vcf_path, vcf_chr_dict=vcf_chr_dict, verbose=verbose, log=log)
@@ -122,7 +122,7 @@ def _extract_with_ld_proxy( snplist=None,
122
122
 
123
123
  extracted_sumstats = pd.concat([extracted_sumstats, ld_proxies],ignore_index=True)
124
124
 
125
- if verbose: log.write("Finished loading reference genotype successfully!")
125
+ log.write("Finished loading reference genotype successfully!", verbose=verbose)
126
126
  return extracted_sumstats
127
127
 
128
128
 
@@ -139,13 +139,13 @@ def _get_rsq( row,
139
139
  ref_genotype = read_vcf(vcf_path,region=vcf_chr_dict[region[0]]+":"+str(region[1])+"-"+str(region[2]),tabix=tabix)
140
140
 
141
141
  if ref_genotype is None:
142
- if verbose: log.write(" -Warning: no data was retrieved. Skipping ...")
142
+ log.warning("No data was retrieved. Skipping ...", verbose=verbose)
143
143
  ref_genotype=dict()
144
144
  ref_genotype["variants/POS"]=np.array([],dtype="int64")
145
145
  return None
146
146
 
147
- if verbose: log.write(" -Retrieving index...")
148
- if verbose: log.write(" -Ref variants in the region: {}".format(len(ref_genotype["variants/POS"])))
147
+ log.write(" -Retrieving index...", verbose=verbose)
148
+ log.write(" -Ref variants in the region: {}".format(len(ref_genotype["variants/POS"])), verbose=verbose)
149
149
  # match sumstats pos and ref pos:
150
150
  # get ref index for its first appearance of sumstats pos
151
151
  #######################################################################################
@@ -170,7 +170,7 @@ def _get_rsq( row,
170
170
  else:
171
171
  # no position match
172
172
  return None
173
- if verbose: log.write(" -Matching variants using POS, NEA, EA ...")
173
+ log.write(" -Matching variants using POS, NEA, EA ...", verbose=verbose)
174
174
 
175
175
  sumstats["REFINDEX"] = sumstats.loc[:,["POS","NEA","EA"]].apply(lambda x: match_varaint(x), axis=1)
176
176
  log.write(" -Matched variants in sumstats and vcf:{} ".format(sum(~sumstats["REFINDEX"].isna())))
@@ -190,7 +190,7 @@ def _get_rsq( row,
190
190
  lead_snp_genotype = GenotypeArray([ref_genotype["calldata/GT"][lead_snp_ref_index]]).to_n_alt()
191
191
  other_snp_genotype = GenotypeArray(ref_genotype["calldata/GT"][other_snps_ref_index]).to_n_alt()
192
192
 
193
- if verbose: log.write(" -Calculating Rsq...")
193
+ log.write(" -Calculating Rsq...", verbose=verbose)
194
194
 
195
195
  if len(other_snp_genotype)>1:
196
196
  valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype)[0],2)
@@ -198,7 +198,7 @@ def _get_rsq( row,
198
198
  valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype),2)
199
199
  sumstats.loc[~sumstats["REFINDEX"].isna(),"RSQ"] = valid_r2
200
200
  else:
201
- if verbose: log.write(" -Lead SNP not found in reference...")
201
+ log.write(" -Lead SNP not found in reference...", verbose=verbose)
202
202
  sumstats["RSQ"]=None
203
203
 
204
204
  sumstats["RSQ"] = sumstats["RSQ"].astype("float")
@@ -0,0 +1,189 @@
1
+ from gwaslab.ldsc_sumstats import estimate_h2
2
+ from gwaslab.ldsc_sumstats import estimate_rg
3
+ from gwaslab.g_Log import Log
4
+ from gwaslab.qc_fix_sumstats import start_to
5
+ from gwaslab.qc_fix_sumstats import finished
6
+ from gwaslab.qc_fix_sumstats import skipped
7
+ from gwaslab.io_read_ldsc import parse_ldsc_summary
8
+
9
class ARGS():
    """Argparse-namespace stand-in consumed by the bundled LDSC code.

    The vendored LDSC modules (gwaslab.ldsc_sumstats etc.) expect an
    argparse-style object whose attributes mirror the command-line flags of
    the original LDSC tool (https://github.com/bulik/ldsc).  This class
    builds such an object with LDSC's defaults, letting callers override a
    small whitelist of options via keyword arguments.

    Overridable keywords (anything else passed in ``**args`` is ignored):
        h2, rg, ref_ld, ref_ld_chr, w_ld, w_ld_chr, samp_prev, pop_prev
    """

    def __init__(self, **args):
        # --- output / LD-score-estimation options -------------------------
        # Not exercised by the h2/rg entry points, but the LDSC code reads
        # them, so the namespace must carry them with their defaults.
        self.out = "ldsc"
        self.bfile = None
        self.l2 = False
        self.extract = None
        self.keep = None
        self.ld_wind_snps = None
        self.ld_wind_kb = None
        self.ld_wind_cm = None
        self.print_snps = None
        self.annot = None
        self.thin_annot = False
        self.cts_bin = None
        self.cts_breaks = None
        self.cts_names = None
        self.per_allele = False
        self.pq_exp = None
        self.no_print_annot = False

        # --- caller-overridable regression inputs -------------------------
        # dict.get() returns None when the key is absent, which matches the
        # previous explicit if/else ladders exactly.
        self.h2 = args.get("h2")                  # sumstats path/frame for h2 estimation
        self.h2_cts = None
        self.rg = args.get("rg")                  # comma-separated trait list for rg estimation
        self.ref_ld = args.get("ref_ld")          # reference LD scores (single file)
        self.ref_ld_chr = args.get("ref_ld_chr")  # reference LD scores (per-chromosome prefix)
        self.w_ld = args.get("w_ld")              # regression-weight LD scores (single file)
        self.w_ld_chr = args.get("w_ld_chr")      # regression-weight LD scores (per-chromosome prefix)

        # --- regression behaviour, LDSC defaults --------------------------
        self.overlap_annot = False
        # NOTE(review): set to the string "ldsc" rather than a boolean —
        # kept as-is to preserve behavior; confirm against the vendored
        # LDSC code's truthiness expectations.
        self.print_coefficients = "ldsc"
        self.frqfile = None
        self.frqfile_chr = None
        self.no_intercept = None
        self.intercept_h2 = None
        self.intercept_gencov = None
        self.M = None
        self.two_step = None
        self.chisq_max = None
        self.ref_ld_chr_cts = None
        self.print_cov = None
        self.print_delete_vals = False
        self.chunk_size = 50
        self.pickle = False
        self.yes_really = False
        self.invert_anyway = False
        self.n_blocks = 200          # jackknife block count (LDSC default)
        self.not_M_5_50 = False
        self.no_check_alleles = False
        self.return_silly_things = False

        # --- liability-scale conversion (observed -> liability h2) --------
        self.samp_prev = args.get("samp_prev")  # sample prevalence for binary traits
        self.pop_prev = args.get("pop_prev")    # population prevalence for binary traits
94
def _estimate_h2_by_ldsc(insumstats, log, verbose=True, **args):
    """Estimate SNP heritability with single-trait LD score regression.

    Copies ``insumstats``, derives Z from BETA/SE when missing, renames
    gwaslab columns to LDSC's expected names, and delegates to the vendored
    ``estimate_h2``.  Returns the parsed LDSC summary, or ``None`` when the
    standard gwaslab pre-flight check fails.
    """
    working = insumstats.copy()

    # --- gwaslab pipeline pre-flight (column check + start banner) -------
    start_line = "run LD score regression"
    end_line = "running LD score regression"
    is_enough_info = start_to(
        sumstats=working,
        log=log,
        verbose=verbose,
        start_line=start_line,
        end_line=end_line,
        start_cols=[],
        start_function=".estimate_h2_by_ldsc()",
    )
    if is_enough_info == False:
        return None
    # ---------------------------------------------------------------------

    log.write(" -Run single variate LD score regression:", verbose=verbose)
    log.write(" -Adopted from LDSC source code: https://github.com/bulik/ldsc", verbose=verbose)
    log.write(" -Please cite LDSC: Bulik-Sullivan, et al. LD Score Regression Distinguishes Confounding from Polygenicity in Genome-Wide Association Studies. Nature Genetics, 2015.", verbose=verbose)
    log.write(" -Arguments:", verbose=verbose)
    for arg_name, arg_value in args.items():
        log.write(" -{}:{}".format(arg_name, arg_value), verbose=verbose)

    # Build the argparse-style namespace the vendored LDSC code expects.
    ldsc_args = ARGS(**args)

    # LDSC works on Z scores; derive them from BETA/SE when absent.
    if "Z" not in working.columns:
        working["Z"] = working["BETA"] / working["SE"]

    # Map gwaslab column names onto LDSC's conventions.
    working = working.rename(columns={"EA": "A1", "NEA": "A2", "rsID": "SNP"})

    log.write(" -LDSC log:", verbose=verbose)
    summary = estimate_h2(working, ldsc_args, log)

    log.write(" -Results have been stored in .ldsc_h2", verbose=verbose)
    finished(log=log, verbose=verbose, end_line=end_line)
    return parse_ldsc_summary(summary)
+
135
def _estimate_rg_by_ldsc(insumstats, other_traits ,log, verbose=True, **args):
    """Estimate genetic correlation (rg) with cross-trait LD score regression.

    Prepares ``insumstats`` and every Sumstats object in ``other_traits``
    (derive Z when missing, rename columns to LDSC conventions, restrict to
    the columns LDSC consumes) and delegates to the vendored ``estimate_rg``.
    Returns the rg summary table, or ``None`` when the standard gwaslab
    pre-flight check fails.
    """
    working = insumstats.copy()

    # --- gwaslab pipeline pre-flight (column check + start banner) -------
    start_line = "run LD score regression for genetic correlation"
    end_line = "running LD score regression for genetic correlation"
    is_enough_info = start_to(
        sumstats=working,
        log=log,
        verbose=verbose,
        start_line=start_line,
        end_line=end_line,
        start_cols=[],
        start_function=".estimate_rg_by_ldsc()",
    )
    if is_enough_info == False:
        return None
    # ---------------------------------------------------------------------

    log.write(" -Run cross-trait LD score regression:", verbose=verbose)
    log.write(" -Adopted from LDSC source code: https://github.com/bulik/ldsc", verbose=verbose)
    log.write(" -Please cite LDSC: Bulik-Sullivan, B., et al. An Atlas of Genetic Correlations across Human Diseases and Traits. Nature Genetics, 2015.", verbose=verbose)
    log.write(" -Arguments:", verbose=verbose)
    for arg_name, arg_value in args.items():
        log.write(" -{}:{}".format(arg_name, arg_value), verbose=verbose)

    # Build the argparse-style namespace the vendored LDSC code expects.
    ldsc_args = ARGS(**args)

    # LDSC works on Z scores; derive them from BETA/SE when absent.
    if "Z" not in working.columns:
        working["Z"] = working["BETA"] / working["SE"]
    working = working.rename(columns={"EA": "A1", "NEA": "A2", "rsID": "SNP"})

    rename_map = {"EA": "A1", "NEA": "A2", "rsID": "SNP"}
    ldsc_cols = ["SNP", "A1", "A2", "Z", "N"]
    # rg is "self,alias1,alias2,..."; drop the leading self entry so
    # alias[i] lines up with other_traits[i].
    alias = ldsc_args.rg.split(",")[1:]

    prepared_traits = []
    for trait_index, other in enumerate(other_traits):
        log.write(" -Processing sumstats with alias {} ({})".format(alias[trait_index], other.meta["gwaslab"]["study_name"]))
        if "rsID" not in other.data.columns:
            # No rsIDs yet: restrict to HapMap3 variants, which annotates rsID.
            prepared = other.filter_hapmap3(verbose=False).data.rename(columns=rename_map)
        else:
            prepared = other.data.rename(columns=rename_map)

        if "Z" not in prepared.columns:
            prepared["Z"] = prepared["BETA"] / prepared["SE"]

        prepared_traits.append(prepared[ldsc_cols])

    log.write(" -LDSC log:", verbose=verbose)
    # estimate_rg returns multiple objects; index 1 is the summary table.
    summary = estimate_rg(working[ldsc_cols], prepared_traits, ldsc_args, log)[1]

    log.write(" -Results have been stored in .ldsc_rg", verbose=verbose)
    finished(log=log, verbose=verbose, end_line=end_line)
    return summary
@@ -89,7 +89,7 @@ def _load_single_bim_to_ref_bims(bpfile_prefix, ref_bims, log):
89
89
  sep="\s+",
90
90
  usecols=[0,1,3,4,5],
91
91
  header=None,
92
- dtype={1:"string",0:"category", 3:"int", 4:"string", 5:"string"}).rename(columns={1:"SNPID",0:"CHR_bim",3:"POS_bim",4:"NEA_bim",5:"EA_bim"})
92
+ dtype={1:"string",0:"category", 3:"int", 4:"string", 5:"string"}).rename(columns={1:"SNPID",0:"CHR_bim",3:"POS_bim",4:"EA_bim",5:"NEA_bim"})
93
93
  log.write(" -#variants in ref file: {}".format(len(single_bim)))
94
94
  ref_bims.append(single_bim)
95
95
  return ref_bims
@@ -104,7 +104,7 @@ def _load_single_pvar_to_ref_bims(bpfile_prefix, ref_bims, log):
104
104
  usecols=[0,1,2,3,4],
105
105
  header=None,
106
106
  comment="#",
107
- dtype={2:"string",0:"category", 1:"int", 3:"string", 4:"string"}).rename(columns={2:"SNPID",0:"CHR_bim",1:"POS_bim",3:"NEA_bim",4:"EA_bim"})
107
+ dtype={2:"string",0:"category", 1:"int", 3:"string", 4:"string"}).rename(columns={2:"SNPID",0:"CHR_bim",1:"POS_bim",3:"EA_bim",4:"NEA_bim"})
108
108
  log.write(" -#variants in ref file: {}".format(len(single_bim)))
109
109
  ref_bims.append(single_bim)
110
110
  return ref_bims
@@ -265,7 +265,7 @@ def _process_vcf(ref_file_prefix,
265
265
  except subprocess.CalledProcessError as e:
266
266
  log.write(e.output)
267
267
  else:
268
- log.write(" -Plink {} for CHR {} exists. Skipping...".format(convert ,i))
268
+ log.write(" -Plink {} for CHR {} exists: {}. Skipping...".format(convert ,i, bpfile_prefix))
269
269
 
270
270
  if load_bim == True:
271
271
  if convert == "bfile":
@@ -68,12 +68,16 @@ def _run_coloc_susie(filepath, r="Rscript",
68
68
  D1 <- list( "LD"=R, "beta"=df[,"BETA_1"],"varbeta"=df[,"SE_1"]**2,"snp"=df[,"SNPID"],"position"=df[,"POS"],"type"="{type1}","N"={n1}{d1_args})
69
69
  D2 <- list( "LD"=R, "beta"=df[,"BETA_2"],"varbeta"=df[,"SE_2"]**2,"snp"=df[,"SNPID"],"position"=df[,"POS"],"type"="{type2}","N"={n2}{d2_args})
70
70
 
71
+ abf <- coloc.abf(dataset1=D1,dataset2=D2)
72
+ write.csv(t(data.frame(abf$summary)) , "{output_prefix}.coloc.abf", row.names = FALSE)
73
+
71
74
  S1=runsusie(D1{susie_args})
72
75
  S2=runsusie(D2{susie_args})
73
76
 
74
77
  susie.res=coloc.susie(S1,S2{coloc_args})
75
78
 
76
79
  write.csv(susie.res$summary, "{output_prefix}.coloc.susie", row.names = FALSE)
80
+
77
81
  '''.format(sumstats_path = sumstats,
78
82
  ld_r_matrix_path = ld_r_matrix,
79
83
  fillna_script = "R[is.na(R)] <- 0" if fillldna==True else "",
@@ -87,7 +91,9 @@ def _run_coloc_susie(filepath, r="Rscript",
87
91
  coloc_args = coloc_args,
88
92
  output_prefix = output_prefix)
89
93
 
90
- log.write(" -coloc script: {}".format("coloc.susie(S1,S2)"), verbose=verbose)
94
+ log.write(" -coloc abf script: {}".format("coloc.abf(dataset1=D1,dataset2=D2)"), verbose=verbose)
95
+ log.write(" -coloc susie script: {}".format("coloc.susie(S1,S2)"), verbose=verbose)
96
+
91
97
  with open("_{}_{}_gwaslab_coloc_susie_temp.R".format(study,row["SNPID"]),"w") as file:
92
98
  file.write(rscript)
93
99
 
@@ -101,21 +107,37 @@ def _run_coloc_susie(filepath, r="Rscript",
101
107
  #plink_process.kill()
102
108
  log.write(" Running coloc.SuSieR from command line...", verbose=verbose)
103
109
  r_log+= output + "\n"
110
+
111
+ pip_cs = pd.read_csv("{}.coloc.abf".format(output_prefix))
112
+ if len(pip_cs)==0:
113
+ log.write(" -SuSieR result for {} is empty. Please check parameters.".format(output_prefix), verbose=verbose)
114
+ else:
115
+ pip_cs["Locus"] = row["SNPID"]
116
+ pip_cs["STUDY"] = row["study"]
117
+ pip_cs["hit1"] = row["SNPID"]
118
+ pip_cs["METHOD"] = "abf"
119
+ locus_pip_cs = pd.concat([locus_pip_cs,pip_cs],ignore_index=True)
120
+
104
121
  pip_cs = pd.read_csv("{}.coloc.susie".format(output_prefix))
105
122
  if len(pip_cs)==0:
106
123
  log.write(" -SuSieR result for {} is empty. Please check parameters.".format(output_prefix), verbose=verbose)
107
124
  else:
108
125
  pip_cs["Locus"] = row["SNPID"]
109
126
  pip_cs["STUDY"] = row["study"]
127
+ pip_cs["METHOD"] = "susie"
110
128
  locus_pip_cs = pd.concat([locus_pip_cs,pip_cs],ignore_index=True)
129
+
111
130
  os.remove("_{}_{}_gwaslab_coloc_susie_temp.R".format(study,row["SNPID"]))
131
+
112
132
  if delete == True:
113
- os.remove("{}.pipcs".format(output_prefix))
133
+ os.remove("{}.coloc.susie".format(output_prefix))
134
+ os.remove("{}.coloc.abf".format(output_prefix))
114
135
  else:
115
- log.write(" -SuSieR result summary to: {}".format("{}.pipcs".format(output_prefix)), verbose=verbose)
136
+ log.write(" -coloc-abf result summary to: {}".format("{}.coloc.abf".format(output_prefix)), verbose=verbose)
137
+ log.write(" -coloc-susie result summary to: {}".format("{}.coloc.susie".format(output_prefix)), verbose=verbose)
116
138
 
117
139
  except subprocess.CalledProcessError as e:
118
140
  log.write(e.output)
119
141
  os.remove("_{}_{}_gwaslab_coloc_susie_temp.R".format(study,row["SNPID"]))
120
- log.write("Finished finemapping using SuSieR.", verbose=verbose)
142
+ log.write("Finished clocalization using coloc and SuSiE.", verbose=verbose)
121
143
  return locus_pip_cs
@@ -12,34 +12,34 @@ def lambdaGC(insumstats,include_chrXYMT=True, x=23 ,y=24, mt=25, mode="P",level=
12
12
  sumstats=insumstats.loc[:,["CHR",mode]]
13
13
 
14
14
  if include_chrXYMT is False:
15
- if verbose: log.write(" -Excluding chrX, chrY, chrMT from lambda GC calculation.")
15
+ log.write(" -Excluding chrX, chrY, chrMT from lambda GC calculation.", verbose=verbose)
16
16
  xymt= [x,y,mt,"chrx","chry","chrmt","chrX","chrY","chrMT","chrM","M","x","y","mt","X","Y","MT"]
17
17
  sumstats = sumstats.loc[~sumstats["CHR"].isin(xymt),:]
18
18
 
19
19
  indata = sumstats[mode].values
20
20
  if len(indata) == 0:
21
- if verbose: log.write(" -No available variants to use for calculation.")
21
+ log.write(" -No available variants to use for calculation.", verbose=verbose)
22
22
  return np.nan
23
23
  if mode=="p" or mode=="P":
24
24
  observedMedianChi2 = sp.stats.chi2.isf(np.nanmedian(indata),1)
25
25
  expectedMedianChi2 = sp.stats.chi2.ppf(level,1)
26
26
  lambdagc=observedMedianChi2/expectedMedianChi2
27
- if verbose: log.write(" -Lambda GC (P mode) at "+ str(1 - level)+ " is"," ","{:.5f}".format(lambdagc))
27
+ log.write(" -Lambda GC (P mode) at "+ str(1 - level)+ " is"," ","{:.5f}".format(lambdagc), verbose=verbose)
28
28
  elif mode=="mlog10p" or mode=="MLOG10P":
29
29
  observedMedianChi2 = sp.stats.chi2.isf( np.nanmedian(np.power(10,-indata)) ,1)
30
30
  expectedMedianChi2 = sp.stats.chi2.ppf(level,1)
31
31
  lambdagc=observedMedianChi2/expectedMedianChi2
32
- if verbose: log.write(" -Lambda GC (MLOG10P mode) at "+ str(1- level)+ " is"," ","{:.5f}".format(lambdagc))
32
+ log.write(" -Lambda GC (MLOG10P mode) at "+ str(1- level)+ " is"," ","{:.5f}".format(lambdagc), verbose=verbose)
33
33
  elif mode=="z" or mode=="Z":
34
34
  observedMedianChi2 = np.median((indata)**2)
35
35
  expectedMedianChi2 = sp.stats.chi2.ppf(level,1)
36
36
  lambdagc=observedMedianChi2/expectedMedianChi2
37
- if verbose:log.write(" -Lambda GC (Z mode) at "+ str(1- level)+ " is"," ","{:.5f}".format(lambdagc))
37
+ if verbose:log.write(" -Lambda GC (Z mode) at "+ str(1- level)+ " is"," ","{:.5f}".format(lambdagc), verbose=verbose)
38
38
  elif mode=="chi2" or mode=="CHISQ":
39
39
  observedMedianChi2 = np.median(indata)
40
40
  expectedMedianChi2 = sp.stats.chi2.ppf(level,1)
41
41
  lambdagc=observedMedianChi2/expectedMedianChi2
42
- if verbose:log.write(" -Lambda GC (CHISQ mode) at "+ str(1- level)+ " is"," ","{:.5f}".format(lambdagc))
42
+ log.write(" -Lambda GC (CHISQ mode) at "+ str(1- level)+ " is"," ","{:.5f}".format(lambdagc), verbose=verbose)
43
43
  else:
44
44
  return np.nan
45
45
  return lambdagc