gwaslab 3.5.4__py3-none-any.whl → 3.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic; review the release advisory for more details.

@@ -37,7 +37,7 @@ from gwaslab.hm_harmonize_sumstats import auto_check_vcf_chr_dict
37
37
  #check if in outcome and exposure snp list
38
38
  #replace
39
39
 
40
- def _extract_with_ld_proxy( snplist=None,
40
+ def _extract_with_ld_proxy( snplist=None,
41
41
  common_sumstats=None,
42
42
  sumstats1=None,
43
43
  vcf_path=None,
@@ -58,6 +58,7 @@ def _extract_with_ld_proxy( snplist=None,
58
58
  is_needed=[]
59
59
  no_need =[]
60
60
 
61
+ print(common_sumstats.head())
61
62
  for i in snplist:
62
63
  if i in common_sumstats["SNPID"].values:
63
64
  no_need.append(i)
@@ -72,7 +73,7 @@ def _extract_with_ld_proxy( snplist=None,
72
73
  if len(in_sumstats)==0:
73
74
  log.write(" -No available variants for LD proxy checking...Skipping... ", verbose=verbose)
74
75
  else:
75
- log.write(" -{}} available variants for LD proxy checking... ".format(len(in_sumstats)), verbose=verbose)
76
+ log.write(" -{} available variants for LD proxy checking... ".format(len(in_sumstats)), verbose=verbose)
76
77
 
77
78
  for index,row in in_sumstats.iterrows():
78
79
  # determine SNP and select region
@@ -93,6 +94,16 @@ def _extract_with_ld_proxy( snplist=None,
93
94
  if len(flanking_sumstats)==0:
94
95
  log.write(" -No availble variants in the region...Skipping!", verbose=verbose)
95
96
  continue
97
+
98
+ _get_rsq_single(in_sumstats.loc[index,["POS","NEA_1","EA_1"]],
99
+ row_pos=row["POS"],
100
+ vcf_path=vcf_path,
101
+ region=region,
102
+ log=log,
103
+ verbose=verbose,
104
+ vcf_chr_dict=vcf_chr_dict,
105
+ tabix=tabix)
106
+
96
107
 
97
108
  flanking_sumstats = _get_rsq(row =in_sumstats.loc[index,["POS","NEA_1","EA_1"]],
98
109
  sumstats = flanking_sumstats,
@@ -126,6 +137,81 @@ def _extract_with_ld_proxy( snplist=None,
126
137
  return extracted_sumstats
127
138
 
128
139
 
140
def _extract_ld_proxy( snplist=None,
                       common_sumstats=None,
                       vcf_path=None,
                       vcf_chr_dict=None,
                       tabix=None,
                       log=Log(),
                       verbose=True,
                       windowsizekb=100,
                       ld_threshold=0.8
                       ):
    """Search a reference VCF for LD proxies of the variants in ``snplist``.

    For each variant of ``snplist`` found in ``common_sumstats``, variants in a
    flanking window of +/- ``windowsizekb`` kb are extracted and their LD r^2
    with the index variant is computed via ``_get_rsq`` against the reference
    genotypes in ``vcf_path``; variants with RSQ > ``ld_threshold`` are kept.

    Parameters
    ----------
    snplist : list-like of SNPIDs to find proxies for.
    common_sumstats : pandas.DataFrame with at least SNPID, CHR, POS, NEA, EA.
    vcf_path : str, path to the reference genotype VCF.
    vcf_chr_dict : optional mapping of numeric chromosome to VCF contig name;
        auto-detected when None.
    tabix : optional path to the tabix executable; located on PATH when None.
    log, verbose : gwaslab logger and verbosity flag.
    windowsizekb : flanking window half-width in kilobases.
    ld_threshold : minimum RSQ for a variant to count as a proxy.

    Returns
    -------
    pandas.DataFrame of candidate proxies (with RSQ and LD_REF_VARIANT
    columns) sorted by RSQ descending; empty when nothing qualifies.
    """
    ### Load vcf#######################################################################################
    log.write("Start to load reference genotype...", verbose=verbose)
    log.write(" -reference vcf path : "+ vcf_path, verbose=verbose)
    if tabix is None:
        # fall back to the tabix binary on PATH
        tabix = which("tabix")
    vcf_chr_dict = auto_check_vcf_chr_dict(vcf_path=vcf_path, vcf_chr_dict=vcf_chr_dict, verbose=verbose, log=log)

    ld_proxies = pd.DataFrame()
    # restrict to the query variants that are present in the sumstats
    in_sumstats = common_sumstats.loc[common_sumstats["SNPID"].isin(snplist),:]

    if len(in_sumstats)==0:
        log.write(" -No available variants for LD proxy checking...Skipping... ", verbose=verbose)
    else:
        log.write(" -{} available variants for LD proxy checking... ".format(len(in_sumstats)), verbose=verbose)

    for index,row in in_sumstats.iterrows():
        # determine SNP and select region (+/- windowsizekb around the index variant)
        snpid = row["SNPID"]
        chrom= int(row["CHR"])
        start= int(row["POS"]-windowsizekb*1000)
        end= int(row["POS"]+windowsizekb*1000)

        region = (chrom, start, end)

        ### #######################################################################################
        #is_flanking = common_sumstats["CHR"] == chrom & common_sumstats["CHR"]>start & common_sumstats["CHR"]<end
        #flanking_sumstats = common_sumstats.loc[is_flanking,:]
        # NOTE: bounds are exclusive (start < POS < end)
        flanking_sumstats = common_sumstats.query('CHR == @chrom and @start < POS < @end',engine='python').copy()

        log.write(" -Extract {} variants in flanking region of {} for checking: {}:{}-{}".format(len(flanking_sumstats), snpid, chrom, start, end), verbose=verbose)

        if len(flanking_sumstats)==0:
            log.write(" -No availble variants in the region...Skipping!", verbose=verbose)
            continue

        # annotate the flanking variants with RSQ against the index variant
        flanking_sumstats = _get_rsq(row =in_sumstats.loc[index,["POS","NEA","EA"]],
                                     sumstats = flanking_sumstats,
                                     row_pos=row["POS"],
                                     vcf_path=vcf_path,
                                     region=region,
                                     log=log,
                                     verbose=verbose,
                                     vcf_chr_dict=vcf_chr_dict,
                                     tabix=tabix)
        if flanking_sumstats is None:
            # _get_rsq returns None when the index variant is absent from the vcf
            log.write(" -{} is not found in the vcf...Skipping!".format(snpid))
            continue
        # keep only variants above the LD threshold
        flanking_sumstats = flanking_sumstats.loc[flanking_sumstats["RSQ"]>ld_threshold,:]

        log.write(" -Variants in LD with {} (RSQ > {}): {}".format(snpid, ld_threshold,len(flanking_sumstats)), verbose=verbose)

        if len(flanking_sumstats)>0:
            # remember which index variant each proxy belongs to
            flanking_sumstats["LD_REF_VARIANT"]= snpid
            # rows are assumed to arrive sorted by RSQ from _get_rsq, so the
            # first hit is reported as the top proxy -- TODO confirm ordering
            for i,row_with_rsq in flanking_sumstats.iterrows():
                if row_with_rsq["SNPID"] in common_sumstats["SNPID"].values:
                    log.write(" -Top Proxy for {} is found: {} (LD RSQ= {})".format(snpid, row_with_rsq["SNPID"], row_with_rsq["RSQ"]))
                    break
            #row_with_rsq = pd.DataFrame(row_with_rsq)
            ld_proxies = pd.concat([ld_proxies, flanking_sumstats], ignore_index=True)

    log.write("Finished loading reference genotype successfully!", verbose=verbose)
    return ld_proxies.sort_values(by="RSQ",ascending=False)
213
+
214
+
129
215
  def _get_rsq( row,
130
216
  sumstats,
131
217
  row_pos,
@@ -205,4 +291,77 @@ def _get_rsq( row,
205
291
  return sumstats
206
292
 
207
293
def _check_if_in_sumstats2(row, sumstast):
    # Placeholder: checking a variant row against a second sumstats table is
    # not implemented yet.
    # NOTE(review): parameter name "sumstast" looks like a typo for "sumstats"
    # -- confirm no keyword callers exist before renaming.
    pass
295
+
296
def _get_rsq_single( row,
                     row_pos,
                     vcf_path,
                     region,
                     log,
                     verbose,
                     vcf_chr_dict,
                     tabix):
    """Compute LD r^2 between one lead variant and all other region variants.

    Parameters
    ----------
    row : pandas.Series-like of [POS, NEA, EA] for the lead variant.
    row_pos : base-pair position of the lead variant.
    vcf_path : str, path to the reference genotype VCF.
    region : (chrom, start, end) tuple used to query the VCF.
    log, verbose : gwaslab logger and verbosity flag.
    vcf_chr_dict : mapping of numeric chromosome to VCF contig name.
    tabix : path to the tabix executable used for the region query.

    Returns
    -------
    pandas.DataFrame with columns SNPID and RSQ, sorted by RSQ descending,
    or None when the region / lead variant cannot be resolved in the VCF.
    """
    # load genotype data of the targeted region
    ref_genotype = read_vcf(vcf_path,region=vcf_chr_dict[region[0]]+":"+str(region[1])+"-"+str(region[2]),tabix=tabix)

    if ref_genotype is None:
        # Fix: removed the dead dict re-assignments that previously preceded
        # this return (they were never observable by callers).
        log.warning("No data was retrieved. Skipping ...", verbose=verbose)
        return None

    log.write(" -Retrieving index...", verbose=verbose)
    log.write(" -Ref variants in the region: {}".format(len(ref_genotype["variants/POS"])), verbose=verbose)

    #######################################################################################
    def _match_variant(x):
        # x: [POS, NEA, EA]; return the reference index of the matching
        # variant, or None when position/alleles cannot be matched.
        pos_hits = np.where(ref_genotype["variants/POS"] == x.iloc[0])[0]
        if len(pos_hits) == 0:
            # no position match
            return None
        if len(pos_hits) == 1:
            # single position match (alleles are not re-checked, as before)
            return pos_hits[0]
        # Multiple variants share this position: compare REF/ALT in both
        # orientations. Fix: the original returned None as soon as the first
        # candidate had no allele overlap, never examining later candidates.
        for j in pos_hits:
            if x.iloc[1] == ref_genotype["variants/REF"][j]:
                if x.iloc[2] in ref_genotype["variants/ALT"][j]:
                    return j
            elif x.iloc[1] in ref_genotype["variants/ALT"][j]:
                if x.iloc[2] == ref_genotype["variants/REF"][j]:
                    return j
        return None

    #############################################################################################
    lead_pos = row_pos

    if lead_pos not in ref_genotype["variants/POS"]:
        # lead variant position absent from the reference panel
        return None

    # get ref index for lead snp
    lead_snp_ref_index = _match_variant(row)
    if lead_snp_ref_index is None:
        # Position present but alleles matched no candidate.
        # Fix: previously this fell through to list.remove(None) -> ValueError.
        return None

    # indices of every other variant in the region
    other_snps_ref_index = list(range(len(ref_genotype["calldata/GT"])))
    other_snps_ref_index.remove(lead_snp_ref_index)

    # convert genotypes to alt-allele dosage counts
    lead_snp_genotype = GenotypeArray([ref_genotype["calldata/GT"][lead_snp_ref_index]]).to_n_alt()
    other_snp_genotype = GenotypeArray(ref_genotype["calldata/GT"][other_snps_ref_index]).to_n_alt()

    log.write(" -Calculating Rsq...", verbose=verbose)

    # rogers_huff_r_between returns a row per lead variant when comparing
    # against >1 variants; square the correlations to obtain r^2
    if len(other_snp_genotype)>1:
        valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype)[0],2)
    else:
        valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype),2)

    ld_proxy = pd.DataFrame( {"SNPID":ref_genotype["variants/ID"][other_snps_ref_index],"RSQ":valid_r2 })

    return ld_proxy.sort_values(by="RSQ",ascending=False)
gwaslab/util_ex_ldsc.py CHANGED
@@ -260,6 +260,9 @@ class ARGS():
260
260
  def _estimate_h2_by_ldsc(insumstats, log, verbose=True, munge=False, munge_args=None, **kwargs):
261
261
  sumstats = insumstats.copy()
262
262
 
263
+ if "N" in sumstats.columns:
264
+ sumstats["N"] = sumstats["N"].astype("int64")
265
+
263
266
  if munge:
264
267
  if munge_args is None:
265
268
  munge_args={}
@@ -320,6 +323,8 @@ def _estimate_h2_by_ldsc(insumstats, log, verbose=True, munge=False, munge_args=
320
323
 
321
324
  def _estimate_partitioned_h2_by_ldsc(insumstats, log, verbose=True, **kwargs):
322
325
  sumstats = insumstats.copy()
326
+ if "N" in sumstats.columns:
327
+ sumstats["N"] = sumstats["N"].astype("int64")
323
328
  ##start function with col checking##########################################################
324
329
  _start_line = "run LD score regression"
325
330
  _end_line = "running LD score regression"
@@ -366,6 +371,8 @@ def _estimate_partitioned_h2_by_ldsc(insumstats, log, verbose=True, **kwargs):
366
371
 
367
372
  def _estimate_rg_by_ldsc(insumstats, other_traits ,log, verbose=True, **kwargs):
368
373
  sumstats = insumstats.copy()
374
+ if "N" in sumstats.columns:
375
+ sumstats["N"] = sumstats["N"].astype("int64")
369
376
  ##start function with col checking##########################################################
370
377
  _start_line = "run LD score regression for genetic correlation"
371
378
  _end_line = "running LD score regression for genetic correlation"
@@ -426,6 +433,8 @@ def _estimate_rg_by_ldsc(insumstats, other_traits ,log, verbose=True, **kwargs):
426
433
 
427
434
  def _estimate_h2_cts_by_ldsc(insumstats, log, verbose=True, **kwargs):
428
435
  sumstats = insumstats.copy()
436
+ if "N" in sumstats.columns:
437
+ sumstats["N"] = sumstats["N"].astype("int64")
429
438
  ##start function with col checking##########################################################
430
439
  _start_line = "run LD score regression"
431
440
  _end_line = "running LD score regression"
@@ -21,6 +21,8 @@ def _run_two_sample_mr(sumstatspair_object,
21
21
  n1=None,
22
22
  n2=None,
23
23
  binary1=False,
24
+ cck1=None,
25
+ cck2=None,
24
26
  ncase1=None,
25
27
  ncontrol1=None,
26
28
  prevalence1=None,
@@ -35,6 +37,22 @@ def _run_two_sample_mr(sumstatspair_object,
35
37
  if methods is None:
36
38
  methods = ["mr_ivw","mr_simple_mode","mr_weighted_median","mr_egger_regression","mr_ivw_mre", "mr_weighted_mode"]
37
39
  methods_string = '"{}"'.format('","'.join(methods))
40
+
41
+ if cck1 is not None:
42
+ log.write(" - ncase1, ncontrol1, prevalence1:{}".format(cck1))
43
+ binary1 = True
44
+ ncase1 = cck1[0]
45
+ ncontrol1 = cck1[1]
46
+ prevalence1 = cck1[2]
47
+ n1 = ncase1 + ncontrol1
48
+ if cck2 is not None:
49
+ log.write(" - ncase2, ncontrol2, prevalence2:{}".format(cck2))
50
+ binary2 = True
51
+ ncase2 = cck2[0]
52
+ ncontrol2 = cck2[1]
53
+ prevalence2 = cck2[2]
54
+ n2 = ncase2 + ncontrol2
55
+
38
56
  if clump==True:
39
57
  sumstatspair = sumstatspair_object.clumps["clumps"]
40
58
  else:
@@ -64,10 +82,16 @@ def _run_two_sample_mr(sumstatspair_object,
64
82
 
65
83
  ###
66
84
  calculate_r_script = ""
85
+
67
86
  if binary1==True:
68
87
  calculate_r_script+= _make_script_for_calculating_r("exposure", ncase1, ncontrol1, prevalence1)
88
+ else:
89
+ calculate_r_script+= _make_script_for_calculating_r_quant("exposure")
90
+
69
91
  if binary2==True:
70
92
  calculate_r_script+= _make_script_for_calculating_r("outcome", ncase2, ncontrol2, prevalence2)
93
+ else:
94
+ calculate_r_script+= _make_script_for_calculating_r_quant("outcome")
71
95
 
72
96
  # create scripts
73
97
  directionality_test_script='''
@@ -218,6 +242,16 @@ def _make_script_for_calculating_r(exposure_or_outcome, ncase, ncontrol, prevale
218
242
  return script
219
243
 
220
244
 
245
def _make_script_for_calculating_r_quant(exposure_or_outcome):
    """Build an R snippet that derives r.<exposure|outcome> for a quantitative
    trait from beta, se and sample size via TwoSampleMR's get_r_from_bsen.

    Parameters
    ----------
    exposure_or_outcome : str, either "exposure" or "outcome"; interpolated
        into the harmonized_data column names.

    Returns
    -------
    str : the R code fragment to append to the analysis script.
    """
    script = """
    harmonized_data$"r.{exposure_or_outcome}" <- get_r_from_bsen( harmonized_data$"beta.{exposure_or_outcome}",
    harmonized_data$"se.{exposure_or_outcome}",
    harmonized_data$"samplesize.{exposure_or_outcome}"
    )
    """.format(
        exposure_or_outcome = exposure_or_outcome
    )
    return script
221
255
 
222
256
 
223
257
  def _filter_by_f(sumstatspair, f_check, n1, binary1=None, ncase1=None, ncontrol1=None, prevalence1=None, log=Log() ):
@@ -162,7 +162,7 @@ def _clump(insumstats, vcf=None, scaled=False, out="clumping_plink2",
162
162
  log.write(e.output)
163
163
  #os.system(script)
164
164
 
165
- clumped = pd.read_csv("{}.clumps".format(out_single_chr),usecols=[2,0,1,3],sep="\s+")
165
+ clumped = pd.read_csv("{}.clumps".format(out_single_chr),sep="\s+")
166
166
  results = pd.concat([results,clumped],ignore_index=True)
167
167
 
168
168
  # remove temp SNPIDP file
@@ -172,7 +172,9 @@ def _clump(insumstats, vcf=None, scaled=False, out="clumping_plink2",
172
172
  log.write("Finished clumping.",verbose=verbose)
173
173
  results_sumstats = insumstats.loc[insumstats["SNPID"].isin(results["SNPID"]),:].copy()
174
174
  finished(log=log, verbose=verbose, end_line=_end_line)
175
- return results_sumstats, plink_log
175
+
176
+ return results_sumstats, results, plink_log
177
+
176
178
 
177
179
 
178
180
 
@@ -1,6 +1,7 @@
1
1
  import pandas as pd
2
2
  import numpy as np
3
3
  import scipy.stats as ss
4
+ from scipy.stats import norm
4
5
  from scipy import stats
5
6
  from gwaslab.g_Log import Log
6
7
  import gc
@@ -8,6 +9,7 @@ import gc
8
9
  from gwaslab.g_version import _get_version
9
10
  from gwaslab.qc_check_datatype import check_datatype
10
11
 
12
+
11
13
  def filldata(
12
14
  insumstats,
13
15
  to_fill=None,
@@ -38,7 +40,7 @@ def filldata(
38
40
  for i in skip_cols:
39
41
  to_fill.remove(i)
40
42
  log.write(" -Skipping columns: ",skip_cols, verbose=verbose)
41
- if len(set(to_fill) & set(["OR","OR_95L","OR_95U","BETA","SE","P","Z","CHISQ","MLOG10P","MAF"]))==0:
43
+ if len(set(to_fill) & set(["OR","OR_95L","OR_95U","BETA","SE","P","Z","CHISQ","MLOG10P","MAF","SIG"]))==0:
42
44
  log.write(" -No available columns to fill. Skipping.", verbose=verbose)
43
45
  log.write("Finished filling data using existing columns.", verbose=verbose)
44
46
  return sumstats
@@ -217,6 +219,20 @@ def fill_maf(sumstats,log,verbose=True,filled_count=0):
217
219
  return 0,filled_count
218
220
  return 1,filled_count
219
221
 
222
def fill_sig(sumstats,log,sig_level=5e-8, verbose=True,filled_count=0):
    """Add a boolean SIGNIFICANT column based on P or MLOG10P.

    Parameters
    ----------
    sumstats : pandas.DataFrame, modified in place; P or MLOG10P is required.
    log : gwaslab logger.
    sig_level : significance threshold on the P-value scale.
    verbose : pass-through verbosity flag for logging.
    filled_count : running count of filled columns, incremented on success.

    Returns
    -------
    (status, filled_count) : status is 1 when the column was filled, 0 when
    neither P nor MLOG10P is available.
    """
    if "P" in sumstats.columns or "MLOG10P" in sumstats.columns:
        log.write(" - Determining significant using P and MLOG10P with threshold:{}".format(sig_level), verbose=verbose)
        if "P" in sumstats.columns:
            is_sig = sumstats["P"]<sig_level
        elif "MLOG10P" in sumstats.columns:
            # MLOG10P is -log10(P), so P < sig_level <=> MLOG10P > -log10(sig_level).
            # Fix: the original compared against np.log10(sig_level) (a negative
            # number, e.g. -7.3 for 5e-8), which flagged almost every variant.
            is_sig = sumstats["MLOG10P"]>-np.log10(sig_level)
        sumstats["SIGNIFICANT"] = False
        sumstats.loc[is_sig, "SIGNIFICANT"] = True
        filled_count +=1
    else:
        return 0,filled_count
    return 1,filled_count
+
220
236
  ####################################################################################################################
221
237
  def fill_extreme_mlog10(sumstats, z):
222
238
  log_pvalue = np.log(2) + ss.norm.logsf(np.abs(sumstats[z])) #two-sided
@@ -287,7 +303,10 @@ def fill_iteratively(sumstats,raw_to_fill,log,only_sig,df,extreme,verbose,sig_le
287
303
  else:
288
304
  status,filled_count = fill_mlog10p(sumstats,log,verbose=verbose)
289
305
  if status == 1 : to_fill.remove("MLOG10P")
290
-
306
+
307
+ if "SIG" in to_fill:
308
+ status,filled_count = fill_sig(sumstats,sig_level=sig_level ,log=log,verbose=verbose,filled_count=filled_count)
309
+ if status == 1 : to_fill.remove("SIG")
291
310
  if filled_count == 0:
292
311
  break
293
312
 
@@ -330,4 +349,10 @@ def _convert_or_to_beta(OR):
330
349
  return np.log(OR)
331
350
 
332
351
  def _convert_beta_to_or(beta):
333
- return np.exp(beta)
352
+ return np.exp(beta)
353
+
354
def rank_based_int(series, c=3/8):
    """Rank-based inverse normal transformation.

    Maps the ranks of *series* onto standard-normal quantiles with a
    Blom-type offset: Phi^-1((rank - c) / (n + 1 - 2c)), where n counts the
    non-missing values. Missing entries propagate as NaN.
    Reference: https://onlinelibrary.wiley.com/doi/10.1111/biom.13214
    """
    n_valid = (~series.isna()).sum()
    fractional_rank = (series.rank() - c) / (n_valid + 1 - 2 * c)
    return norm.ppf(fractional_rank)
@@ -513,4 +513,69 @@ def _exclude(sumstats, exclude=None, id_use="SNPID", log=Log(), verbose=True ):
513
513
  log.write(" -Excluding {} variants from sumstats...".format(len(exclude)),verbose=verbose)
514
514
  sumstats = sumstats.loc[~sumstats[id_use].isin(exclude),:]
515
515
  log.write(" -Excluded {} variants from sumstats...".format(len(sumstats)),verbose=verbose)
516
- return sumstats
516
+ return sumstats
517
+
518
def _filter_region(sumstats, region, chrom="CHR",pos="POS",log=Log(),verbose=True):
    """Return a copy of sumstats restricted to variants strictly inside region.

    Parameters
    ----------
    sumstats : pandas.DataFrame with chromosome and position columns.
    region : (chr, start, end) tuple; both bounds are exclusive. When None,
        the input is returned unchanged (as a copy).
    chrom, pos : column names for chromosome and position.
    log, verbose : gwaslab logger and verbosity flag.
    """
    if region is None:
        return sumstats.copy()

    target_chr = region[0]
    target_start = region[1]
    target_end = region[2]

    log.write(" -Extract SNPs in region : chr{}:{}-{}...".format(target_chr, region[1], region[2]),verbose=verbose)

    # strictly inside the window on the matching chromosome
    inside = (sumstats[chrom]==target_chr) & (sumstats[pos]<target_end) & (sumstats[pos]>target_start)

    log.write(" -Extract SNPs in specified regions: "+str(sum(inside)),verbose=verbose)
    return sumstats.loc[inside,:].copy()
531
+
532
def _search_variants( sumstats, snplist=None,
                      snpid="SNPID" ,rsid="rsID",
                      chrom="CHR",pos="POS",ea="EA",nea="NEA",
                      log=Log(),verbose=True):
    """Search sumstats for variants given as tuples, rsIDs, or SNPID strings.

    Each entry of ``snplist`` may be a (chrom, pos) pair, an rsID, an exact
    SNPID, or a "chr:pos[:allele:allele]" style string; matches from all
    entries are OR-combined into one boolean mask.

    Returns a copy of the matching rows.
    """
    log.write("Start to search for variants...", verbose=verbose)
    # create a boolean col with FALSE (x != x is False for every non-NaN value)
    if snpid in sumstats.columns:
        is_extract = sumstats[snpid]!=sumstats[snpid]
    else:
        is_extract = sumstats[rsid]!=sumstats[rsid]

    # search each variant
    for variant in snplist:

        if pd.api.types.is_list_like(variant):
            # (1:1234) -- a (chrom, pos) tuple/list
            single_chrom=variant[0]
            single_pos=variant[1]
            is_extract = is_extract | ((sumstats[pos] == single_pos ) &(sumstats[chrom] == single_chrom))

        elif pd.api.types.is_string_dtype(type(variant)):
            # NOTE(review): is_string_dtype(type(variant)) is an unusual way to
            # test "is a string" -- confirm it behaves as intended vs isinstance.
            # rs123
            if "rsID" in sumstats.columns:
                is_extract = is_extract | (sumstats["rsID"] == variant)

            # 1:123:A:D
            if "SNPID" in sumstats.columns:
                is_extract = is_extract | (sumstats["SNPID"] == variant)

            # 1:123:A:D -> (1:1234); allele pair is optional (group 4)
            a= re.match(r'^(chr|Chr|CHR)?(\d+)[:_-](\d+)([:_-]([ATCG]+)[:_-]([ATCG]+))?$', variant, flags=0)

            if a is not None:
                if a[4] is None:
                    # chr:pos only -- match by position
                    single_chrom=int(a[2])
                    single_pos=int(a[3])
                    is_extract = is_extract | ((sumstats[pos] == single_pos ) &(sumstats[chrom] == single_chrom))
                else:
                    # chr:pos:allele:allele -- match position plus alleles in
                    # either orientation (EA/NEA or swapped)
                    single_chrom = int(a[2])
                    single_pos = int(a[3])
                    single_ea = a[5]
                    single_nea = a[6]
                    a_match = ((sumstats[nea] == single_nea) & (sumstats[ea] == single_ea)) | ((sumstats[nea] == single_ea) & (sumstats[ea] == single_nea))
                    is_extract = is_extract | ((sumstats[pos] == single_pos ) &(sumstats[chrom] == single_chrom) & a_match)

    to_search = sumstats.loc[is_extract,:].copy()
    log.write(" -Found {} variants...".format(len(to_search)),verbose=verbose)

    log.write("Finished searching variants.", verbose=verbose)
    return to_search
@@ -0,0 +1,51 @@
1
+ import pandas as pd
2
+ from gwaslab.g_Log import Log
3
+ import re
4
+
5
def _extract_variant(variant_set, sumstats_dic, log=Log(), verbose=True):
    """Extract a set of variants from several gwaslab sumstats objects.

    Parameters
    ----------
    variant_set : iterable of variants, each a (chrom, pos) pair or a string
        (exact SNPID or "chr:pos:allele:allele" style identifier).
    sumstats_dic : mapping of study label -> gwaslab Sumstats object; each
        value is expected to expose the underlying DataFrame as ``.data``
        -- TODO confirm against callers.
    log, verbose : gwaslab logger and verbosity flag.

    Returns
    -------
    pandas.DataFrame combining the matching rows of every study, with a
    STUDY column identifying the source.
    """
    combined = pd.DataFrame()
    log.write("Start to initialize gl.SumstatsSet...", verbose=verbose)
    # first pass: just report the inputs
    for key, sumstats_gls in sumstats_dic.items():
        log.write(" -{} : {}".format(key, sumstats_gls), verbose=verbose)

    for key, sumstats_gls in sumstats_dic.items():

        sumstats_single = sumstats_gls.data

        # create a boolean col with FALSE (x != x is False for non-NaN values)
        is_extract = sumstats_single["SNPID"]!=sumstats_single["SNPID"]

        for variant in variant_set:

            if pd.api.types.is_list_like(variant):
                # (chrom, pos) pair -- match by position
                chrom=variant[0]
                pos=variant[1]

                is_extract = is_extract | ((sumstats_single["POS"] == pos ) &(sumstats_single["CHR"] == chrom))
            elif pd.api.types.is_string_dtype(type(variant)):
                # exact SNPID match
                is_extract = is_extract | (sumstats_single["SNPID"] == variant)

                # "chr:pos:allele:allele" style string -> positional match
                a= re.search(r'^(chr|Chr|CHR)?(\d+)[:_-](\d+)[:_-][ATCG]+[:_-][ATCG]+$', variant, flags=0)
                if a is not None:
                    chrom=int(a[2])
                    pos=int(a[3])
                    is_extract = is_extract | ((sumstats_single["POS"] == pos ) &(sumstats_single["CHR"] == chrom))

        to_extract = sumstats_single.loc[is_extract,:].copy()
        log.write(" -Extracted {} variants from {}".format(len(to_extract), key),verbose=verbose)
        # tag rows with their source study
        to_extract["STUDY"] = key

        to_extract_cols=["STUDY"]

        # keep only the standard columns that this study actually has
        default_cols=["SNPID","EA","NEA","CHR","POS","BETA","SE","P","MLOG10P","EAF","MAF","STATUS"]

        for i in default_cols:
            if i in sumstats_single.columns:
                to_extract_cols.append(i)

        combined = pd.concat([combined, to_extract[to_extract_cols]], ignore_index=True)
    log.write("Finished initializing gl.SumstatsSet.", verbose=verbose)
    return combined
@@ -52,7 +52,8 @@ def get_default_path(keyword,fmt="png"):
52
52
  "esc":"effect_size_comparision",
53
53
  "afc":"allele_frequency_comparision",
54
54
  "gwheatmap":"genome_wide_heatmap",
55
- "scatter":"scatter"
55
+ "scatter":"scatter",
56
+ "forest":"forest"
56
57
  }
57
58
  prefix = path_dictionary[keyword]
58
59
  count = 1
@@ -0,0 +1,99 @@
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+ import pandas as pd
4
+ import seaborn as sns
5
+ from gwaslab.g_Log import Log
6
+ from gwaslab.viz_aux_quickfix import _quick_assign_i_with_rank
7
+ from gwaslab.viz_plot_mqqplot import _process_xtick
8
+ from gwaslab.viz_plot_mqqplot import _process_xlabel
9
+ from gwaslab.bd_common_data import get_number_to_chr
10
+ from gwaslab.util_in_filter_value import _filter_region
11
+ from gwaslab.io_process_args import _extract_kwargs
12
+
13
def _plot_cs(pipcs,
             region,
             figax=None,
             _posdiccul=None,
             xtick_chr_dict=None,
             pip="PIP",
             onlycs=False,
             cs="CREDIBLE_SET_INDEX",
             marker_size=(45,85),
             fontsize = 12,
             font_family = "Arial",
             legend_title="Credible sets",
             log=Log(),
             verbose=True,
             **kwargs):
    '''
    Scatter-plot finemapping PIPs across a genomic region, one marker
    shape/color per credible set.

    pipcs : a DataFrame of finemapping results
    region : (chr, start, end) tuple used to subset pipcs.
    figax : optional (fig, ax) pair to draw on; a new figure is created
        when None.
    pip, cs : column names for the posterior inclusion probability and
        the credible-set index.
    onlycs : when True, drop variants that are not in any credible set
        (cs index <= 0).
    marker_size : (small, large); only the second element is used as the
        scatter point size.
    fontsize, font_family : accepted but not referenced in this body --
        presumably consumed via **kwargs elsewhere; TODO confirm.

    Returns (fig, log).
    '''
    ## parameters #############################
    if xtick_chr_dict is None:
        xtick_chr_dict = get_number_to_chr()

    # collect scatter_* keyword overrides from the caller's kwargs
    scatter_kwargs = _extract_kwargs("scatter", dict(), locals())

    region_marker_shapes = ['o', '^','s','D','*','P','X','h','8']
    region_ld_colors_m = ["grey","#E51819","green","#F07818","#AD5691","yellow","purple"]


    ## filter data #############################
    pipcs = _filter_region(pipcs, region)
    if onlycs ==True:
        # keep only variants assigned to a credible set
        pipcs = pipcs.loc[pipcs[cs]>0,:]

    # string dtype so seaborn treats the credible-set index as categorical
    pipcs[cs] = pipcs[cs].astype("string")

    ## figure and ax #############################
    if figax is not None:
        ax=figax[1]
        fig=figax[0]
    else:
        fig, ax = plt.subplots()

    # assign i (x-axis coordinate across the genome)
    pipcs,chrom_df=_quick_assign_i_with_rank(pipcs, chrpad=0.00,
                                             use_rank=False,
                                             chrom="CHR",pos="POS",
                                             drop_chr_start=False,
                                             _posdiccul=_posdiccul)
    pipcs = pipcs.sort_values(by=cs,ascending=True)

    ## plot ##########################################
    # one marker shape per credible set, in order of first appearance
    scatter_kwargs["markers"]= {m:region_marker_shapes[i] for i,m in enumerate(pipcs[cs].unique())}
    palette = sns.color_palette(region_ld_colors_m,n_colors=pipcs[cs].nunique())
    edgecolor="none"

    plot = sns.scatterplot(data=pipcs,
                           x="i",
                           y=pip,
                           hue=cs,
                           edgecolor=edgecolor,
                           palette=palette,
                           style=cs,
                           s=marker_size[1],
                           ax=ax,
                           **scatter_kwargs)

    # process legend: keep only entries labelled "1".."9" (credible-set
    # indices), dropping seaborn's hue/style header entries
    handles, labels = ax.get_legend_handles_labels()
    new_labels = []
    new_handles = []
    ncol = len(labels)

    for i,label in enumerate(labels):
        if label in [str(j) for j in range(1,10)]:
            new_labels.append(labels[i])
            new_handles.append(handles[i])

    ax.legend(labels =new_labels,
              handles=new_handles,
              loc="upper right",
              bbox_to_anchor=(0.995, 0.995),
              ncol=1,
              scatterpoints=2,
              title=legend_title,
              frameon=True)

    return fig, log