gwaslab 3.5.5__py3-none-any.whl → 3.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/__init__.py +2 -1
- gwaslab/g_Sumstats.py +27 -1
- gwaslab/g_SumstatsSet.py +663 -0
- gwaslab/g_version.py +2 -2
- gwaslab/hm_harmonize_sumstats.py +91 -1
- gwaslab/qc_fix_sumstats.py +1 -1
- gwaslab/util_ex_ldproxyfinder.py +162 -3
- gwaslab/util_in_fill_data.py +19 -2
- gwaslab/util_in_filter_value.py +52 -1
- gwaslab/util_in_merge.py +51 -0
- gwaslab/viz_aux_save_figure.py +2 -1
- gwaslab/viz_plot_effect.py +283 -0
- gwaslab/viz_plot_miamiplot2.py +1 -1
- gwaslab/viz_plot_mqqplot.py +17 -0
- gwaslab/viz_plot_regional2.py +133 -32
- gwaslab/viz_plot_stackedregional.py +0 -1
- {gwaslab-3.5.5.dist-info → gwaslab-3.5.6.dist-info}/METADATA +2 -2
- {gwaslab-3.5.5.dist-info → gwaslab-3.5.6.dist-info}/RECORD +22 -19
- {gwaslab-3.5.5.dist-info → gwaslab-3.5.6.dist-info}/WHEEL +1 -1
- {gwaslab-3.5.5.dist-info → gwaslab-3.5.6.dist-info}/LICENSE +0 -0
- {gwaslab-3.5.5.dist-info → gwaslab-3.5.6.dist-info}/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.5.5.dist-info → gwaslab-3.5.6.dist-info}/top_level.txt +0 -0
gwaslab/hm_harmonize_sumstats.py
CHANGED
|
@@ -1490,10 +1490,100 @@ def infer_af(chr,start,end,ref,alt,vcf_reader,alt_freq,chr_dict=None):
|
|
|
1490
1490
|
elif record.ref==alt and (ref in record.alts):
|
|
1491
1491
|
return 1 - record.info[alt_freq][0]
|
|
1492
1492
|
return np.nan
|
|
1493
|
+
##############################################################################################################################################################################################
|
|
1493
1494
|
|
|
1495
|
+
################################################################################################################
|
|
1494
1496
|
|
|
1497
|
+
def _paralleleinferafwithmaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",
                             eaf="EAF",maf="MAF",ref_eaf="_REF_EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
    """Infer sumstats EAF from sumstats MAF using a reference VCF's ALT allele frequency.

    For each variant, the ALT frequency is looked up in ``ref_infer`` (INFO field
    ``ref_alt_freq``) and stored in a temporary column ``ref_eaf``; MAF is then
    flipped to EAF whenever the reference frequency indicates the effect allele
    is the major allele.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Summary statistics; must contain the chr/pos/ref/alt/status columns.
    ref_infer : str
        Path to the reference VCF.
    ref_alt_freq : str, optional
        Name of the INFO field holding the ALT allele frequency. Required for
        any inference to happen.
    n_cores : int
        Number of worker processes for the VCF lookup.
    force : bool
        If True, process all variants instead of only those whose STATUS
        matches the "good chr/pos" pattern.

    Returns
    -------
    pandas.DataFrame
        The input sumstats with ``eaf`` filled where possible.
    """
    ## start function with col checking ##########################################################
    _start_line = "infer sumstats EAF from sumstats MAF using reference VCF ALT frequency"
    _end_line = "inferring sumstats EAF from sumstats MAF using reference VCF ALT frequency"
    _start_cols = [chr,pos,ref,alt,status]
    _start_function = ".infer_af()"
    _must_args ={"ref_alt_freq":ref_alt_freq}

    is_enough_info = start_to(sumstats=sumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              n_cores=n_cores,
                              ref_vcf=ref_infer,
                              **_must_args)
    if is_enough_info == False: return sumstats
    ############################################################################################
    chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)

    if eaf not in sumstats.columns:
        sumstats[eaf]=np.nan
    if ref_eaf not in sumstats.columns:
        sumstats[ref_eaf]=np.nan

    prenumber = sum(sumstats[eaf].isna())

    # ref_alt_freq INFO in vcf was provided
    if ref_alt_freq is not None:
        log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)
        if not force:
            # assumes STATUS digit 4 == 0 marks variants with reliable CHR/POS
            # — TODO confirm against gwaslab STATUS code documentation
            good_chrpos = sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
            log.write(" -Checking variants:", sum(good_chrpos),verbose=verbose)
        else:
            # BUGFIX: good_chrpos was previously only assigned when force was
            # False, so force=True raised a NameError below. With force=True
            # every variant is checked.
            good_chrpos = pd.Series(True, index=sumstats.index)

        ########################
        # extract reference allele frequency
        if sum(sumstats[eaf].isna())<10000:
            # too few lookups to amortize process start-up cost
            n_cores=1
        df_split = _df_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
        pool = Pool(n_cores)
        map_func = partial(inferaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=ref_eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
        sumstats.loc[good_chrpos,[ref_eaf]] = pd.concat(pool.map(map_func,df_split))
        pool.close()
        pool.join()

        ###########################
        # infer sumstats EAF based on sumstats MAF and reference EAF:
        # flip MAF when the reference says the effect allele is the major allele
        is_flipped = ((sumstats[ref_eaf]>=0.5)&(sumstats[maf]<=0.5)) |((sumstats[ref_eaf]<0.5)&(sumstats[maf]>0.5))
        sumstats[eaf] = sumstats[maf]
        log.write(" -Flipping MAF to obtain EAF for {} variants".format(sum(is_flipped)),verbose=verbose)
        sumstats.loc[is_flipped,eaf] = 1 - sumstats.loc[is_flipped,maf]

        ###########################
        afternumber = sum(sumstats[eaf].isna())
        log.write(" -Inferred EAF for {} variants.".format(prenumber - afternumber),verbose=verbose)
        log.write(" -EAF is still missing for {} variants.".format(afternumber),verbose=verbose)
        sumstats = sumstats.drop(columns=[ref_eaf])

    finished(log,verbose,_end_line)
    return sumstats
|
1562
|
+
|
|
1563
|
+
def inferaf(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=None):
    """Annotate one sumstats chunk with allele frequencies from a reference VCF.

    Worker function used by ``_paralleleinferafwithmaf``: opens the VCF once
    per chunk and queries each row's single-base window.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Chunk with columns (chr, pos, ref, alt) in that order.
    ref_infer : str
        Path to the reference VCF (opened with pysam's VariantFile).
    ref_alt_freq : str, optional
        INFO field name holding the ALT allele frequency.
    eaf : str
        Name of the output column to write frequencies into.

    Returns
    -------
    pandas.DataFrame
        The chunk with the ``eaf`` column filled (float, NaN when not found).
    """
    vcf_reader = VariantFile(ref_infer)

    def afapply(x,vcf,alt_freq,chr_dict):
        # x holds (CHR, POS, NEA, EA); query the 1-bp window [POS-1, POS).
        # BUGFIX: previously this helper ignored its vcf/alt_freq parameters
        # and silently used the enclosing-scope names instead.
        return infer_af(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],vcf,alt_freq,chr_dict)

    map_func = partial(afapply,vcf=vcf_reader,alt_freq=ref_alt_freq,chr_dict=chr_dict)
    status_inferred = sumstats.apply(map_func,axis=1)
    sumstats[eaf] = status_inferred.values
    sumstats[eaf]=sumstats[eaf].astype("float")
    return sumstats
|
|
1573
|
+
|
|
1574
|
+
def infer_af(chr,start,end,ref,alt,vcf_reader,alt_freq,chr_dict=None):
    """Look up the effect-allele frequency of one variant in a reference VCF.

    Scans records in ``[start, end)`` on chromosome ``chr`` (optionally mapped
    through ``chr_dict``). A record at position ``end`` whose REF/ALT alleles
    match (ref, alt) yields the ALT frequency directly; a record with the
    alleles swapped yields ``1 - ALT frequency``.

    Returns
    -------
    float
        The inferred frequency, or ``np.nan`` when no matching record exists.
    """
    if chr_dict is not None:
        chr = chr_dict[chr]

    for record in vcf_reader.fetch(chr, start, end):
        if record.pos != end:
            continue
        # same orientation as sumstats: ALT frequency is the answer
        if record.ref == ref and alt in record.alts:
            return record.info[alt_freq][0]
        # swapped orientation: complement the ALT frequency
        if record.ref == alt and ref in record.alts:
            return 1 - record.info[alt_freq][0]
    return np.nan
|
|
1585
|
+
|
|
1586
|
+
##############################################################################################################################################################################################
|
|
1497
1587
|
def auto_check_vcf_chr_dict(vcf_path, vcf_chr_dict, verbose, log):
|
|
1498
1588
|
if vcf_path is not None:
|
|
1499
1589
|
if vcf_chr_dict is None:
|
gwaslab/qc_fix_sumstats.py
CHANGED
gwaslab/util_ex_ldproxyfinder.py
CHANGED
|
@@ -37,7 +37,7 @@ from gwaslab.hm_harmonize_sumstats import auto_check_vcf_chr_dict
|
|
|
37
37
|
#check if in outcome and exposure snp list
|
|
38
38
|
#replace
|
|
39
39
|
|
|
40
|
-
def _extract_with_ld_proxy(
|
|
40
|
+
def _extract_with_ld_proxy( snplist=None,
|
|
41
41
|
common_sumstats=None,
|
|
42
42
|
sumstats1=None,
|
|
43
43
|
vcf_path=None,
|
|
@@ -58,6 +58,7 @@ def _extract_with_ld_proxy( snplist=None,
|
|
|
58
58
|
is_needed=[]
|
|
59
59
|
no_need =[]
|
|
60
60
|
|
|
61
|
+
print(common_sumstats.head())
|
|
61
62
|
for i in snplist:
|
|
62
63
|
if i in common_sumstats["SNPID"].values:
|
|
63
64
|
no_need.append(i)
|
|
@@ -72,7 +73,7 @@ def _extract_with_ld_proxy( snplist=None,
|
|
|
72
73
|
if len(in_sumstats)==0:
|
|
73
74
|
log.write(" -No available variants for LD proxy checking...Skipping... ", verbose=verbose)
|
|
74
75
|
else:
|
|
75
|
-
log.write(" -{}
|
|
76
|
+
log.write(" -{} available variants for LD proxy checking... ".format(len(in_sumstats)), verbose=verbose)
|
|
76
77
|
|
|
77
78
|
for index,row in in_sumstats.iterrows():
|
|
78
79
|
# determine SNP and select region
|
|
@@ -93,6 +94,16 @@ def _extract_with_ld_proxy( snplist=None,
|
|
|
93
94
|
if len(flanking_sumstats)==0:
|
|
94
95
|
log.write(" -No availble variants in the region...Skipping!", verbose=verbose)
|
|
95
96
|
continue
|
|
97
|
+
|
|
98
|
+
_get_rsq_single(in_sumstats.loc[index,["POS","NEA_1","EA_1"]],
|
|
99
|
+
row_pos=row["POS"],
|
|
100
|
+
vcf_path=vcf_path,
|
|
101
|
+
region=region,
|
|
102
|
+
log=log,
|
|
103
|
+
verbose=verbose,
|
|
104
|
+
vcf_chr_dict=vcf_chr_dict,
|
|
105
|
+
tabix=tabix)
|
|
106
|
+
|
|
96
107
|
|
|
97
108
|
flanking_sumstats = _get_rsq(row =in_sumstats.loc[index,["POS","NEA_1","EA_1"]],
|
|
98
109
|
sumstats = flanking_sumstats,
|
|
@@ -126,6 +137,81 @@ def _extract_with_ld_proxy( snplist=None,
|
|
|
126
137
|
return extracted_sumstats
|
|
127
138
|
|
|
128
139
|
|
|
140
|
+
def _extract_ld_proxy( snplist=None,
                       common_sumstats=None,
                       vcf_path=None,
                       vcf_chr_dict=None,
                       tabix=None,
                       log=Log(),
                       verbose=True,
                       windowsizekb=100,
                       ld_threshold=0.8
                       ):
    """Find LD proxies for a list of variants using a reference VCF.

    For each variant in ``snplist`` that is present in ``common_sumstats``,
    extracts the flanking region (+/- ``windowsizekb`` kb), computes LD r^2
    against the reference genotypes via ``_get_rsq``, and collects all
    variants with RSQ above ``ld_threshold``.

    Returns
    -------
    pandas.DataFrame
        Proxy variants with an added ``LD_REF_VARIANT`` column, sorted by
        RSQ descending; empty when no proxy was found.
    """
    ### Load vcf #######################################################################################
    log.write("Start to load reference genotype...", verbose=verbose)
    log.write(" -reference vcf path : "+ vcf_path, verbose=verbose)
    if tabix is None:
        tabix = which("tabix")
    vcf_chr_dict = auto_check_vcf_chr_dict(vcf_path=vcf_path, vcf_chr_dict=vcf_chr_dict, verbose=verbose, log=log)

    ld_proxies = pd.DataFrame()
    in_sumstats = common_sumstats.loc[common_sumstats["SNPID"].isin(snplist),:]

    if len(in_sumstats)==0:
        log.write(" -No available variants for LD proxy checking...Skipping... ", verbose=verbose)
    else:
        log.write(" -{} available variants for LD proxy checking... ".format(len(in_sumstats)), verbose=verbose)

        for index,row in in_sumstats.iterrows():
            # determine SNP and select flanking region
            snpid = row["SNPID"]
            chrom= int(row["CHR"])
            start= int(row["POS"]-windowsizekb*1000)
            end= int(row["POS"]+windowsizekb*1000)

            region = (chrom, start, end)

            ### select sumstats rows inside the window ########################################################
            flanking_sumstats = common_sumstats.query('CHR == @chrom and @start < POS < @end',engine='python').copy()

            log.write(" -Extract {} variants in flanking region of {} for checking: {}:{}-{}".format(len(flanking_sumstats), snpid, chrom, start, end), verbose=verbose)

            if len(flanking_sumstats)==0:
                log.write(" -No availble variants in the region...Skipping!", verbose=verbose)
                continue

            flanking_sumstats = _get_rsq(row =in_sumstats.loc[index,["POS","NEA","EA"]],
                                         sumstats = flanking_sumstats,
                                         row_pos=row["POS"],
                                         vcf_path=vcf_path,
                                         region=region,
                                         log=log,
                                         verbose=verbose,
                                         vcf_chr_dict=vcf_chr_dict,
                                         tabix=tabix)
            if flanking_sumstats is None:
                # CONSISTENCY: pass verbose like every other log call here
                log.write(" -{} is not found in the vcf...Skipping!".format(snpid), verbose=verbose)
                continue
            flanking_sumstats = flanking_sumstats.loc[flanking_sumstats["RSQ"]>ld_threshold,:]

            log.write(" -Variants in LD with {} (RSQ > {}): {}".format(snpid, ld_threshold,len(flanking_sumstats)), verbose=verbose)

            if len(flanking_sumstats)>0:
                flanking_sumstats["LD_REF_VARIANT"]= snpid
                for i,row_with_rsq in flanking_sumstats.iterrows():
                    if row_with_rsq["SNPID"] in common_sumstats["SNPID"].values:
                        log.write(" -Top Proxy for {} is found: {} (LD RSQ= {})".format(snpid, row_with_rsq["SNPID"], row_with_rsq["RSQ"]), verbose=verbose)
                        break
                ld_proxies = pd.concat([ld_proxies, flanking_sumstats], ignore_index=True)

    log.write("Finished loading reference genotype successfully!", verbose=verbose)
    # BUGFIX: an empty result frame has no RSQ column, so sorting it raised
    # a KeyError; return it as-is instead.
    if len(ld_proxies) == 0:
        return ld_proxies
    return ld_proxies.sort_values(by="RSQ",ascending=False)
|
|
213
|
+
|
|
214
|
+
|
|
129
215
|
def _get_rsq( row,
|
|
130
216
|
sumstats,
|
|
131
217
|
row_pos,
|
|
@@ -205,4 +291,77 @@ def _get_rsq( row,
|
|
|
205
291
|
return sumstats
|
|
206
292
|
|
|
207
293
|
def _check_if_in_sumstats2(row, sumstast):
|
|
208
|
-
pass
|
|
294
|
+
pass
|
|
295
|
+
|
|
296
|
+
def _get_rsq_single( row,
                     row_pos,
                     vcf_path,
                     region,
                     log,
                     verbose,
                     vcf_chr_dict,
                     tabix):
    """Compute LD r^2 between one lead variant and all other reference variants in a region.

    Parameters
    ----------
    row : pandas.Series
        (POS, NEA, EA) of the lead variant.
    row_pos : int
        Position of the lead variant.
    region : tuple
        (chrom, start, end) region to load from the VCF.

    Returns
    -------
    pandas.DataFrame or None
        Columns SNPID / RSQ sorted by RSQ descending, or None when the region
        yields no data or the lead variant cannot be matched in the reference.
    """
    # load genotype data of the targeted region
    ref_genotype = read_vcf(vcf_path,region=vcf_chr_dict[region[0]]+":"+str(region[1])+"-"+str(region[2]),tabix=tabix)

    if ref_genotype is None:
        # BUGFIX: removed dead assignments that populated a placeholder dict
        # immediately before returning None.
        log.warning("No data was retrieved. Skipping ...", verbose=verbose)
        return None

    log.write(" -Retrieving index...", verbose=verbose)
    log.write(" -Ref variants in the region: {}".format(len(ref_genotype["variants/POS"])), verbose=verbose)

    #######################################################################################
    def match_variant(x):
        # x: (POS, NEA, EA) -> reference index of the matching variant, or None
        hits = np.where(ref_genotype["variants/POS"] == x.iloc[0])[0]
        if len(hits) == 0:
            # no position match
            return None
        if len(hits) == 1:
            # single position match: accept without allele verification
            # (NOTE(review): alleles are not checked here — confirm intended)
            return hits[0]
        # multiple position matches: compare ref/alt in both orientations.
        # BUGFIX: previously the search returned None as soon as the first
        # candidate matched neither orientation, skipping later candidates.
        for j in hits:
            if x.iloc[1] == ref_genotype["variants/REF"][j] and x.iloc[2] in ref_genotype["variants/ALT"][j]:
                return j
            if x.iloc[1] in ref_genotype["variants/ALT"][j] and x.iloc[2] == ref_genotype["variants/REF"][j]:
                return j
        return None

    #############################################################################################
    lead_pos = row_pos

    if lead_pos not in ref_genotype["variants/POS"]:
        # lead variant absent from the reference panel
        return None

    # get ref index for lead snp
    lead_snp_ref_index = match_variant(row)
    if lead_snp_ref_index is None:
        # BUGFIX: position present but alleles do not match; previously this
        # fell through to list.remove(None) and raised a ValueError.
        return None

    # indices of all other reference variants
    other_snps_ref_index = list(range(len(ref_genotype["calldata/GT"])))
    other_snps_ref_index.remove(lead_snp_ref_index)

    # get genotype matrices as alt-allele counts
    lead_snp_genotype = GenotypeArray([ref_genotype["calldata/GT"][lead_snp_ref_index]]).to_n_alt()
    other_snp_genotype = GenotypeArray(ref_genotype["calldata/GT"][other_snps_ref_index]).to_n_alt()

    log.write(" -Calculating Rsq...", verbose=verbose)

    if len(other_snp_genotype)>1:
        valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype)[0],2)
    else:
        valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype),2)

    ld_proxy = pd.DataFrame( {"SNPID":ref_genotype["variants/ID"][other_snps_ref_index],"RSQ":valid_r2 })

    return ld_proxy.sort_values(by="RSQ",ascending=False)
|
gwaslab/util_in_fill_data.py
CHANGED
|
@@ -40,7 +40,7 @@ def filldata(
|
|
|
40
40
|
for i in skip_cols:
|
|
41
41
|
to_fill.remove(i)
|
|
42
42
|
log.write(" -Skipping columns: ",skip_cols, verbose=verbose)
|
|
43
|
-
if len(set(to_fill) & set(["OR","OR_95L","OR_95U","BETA","SE","P","Z","CHISQ","MLOG10P","MAF"]))==0:
|
|
43
|
+
if len(set(to_fill) & set(["OR","OR_95L","OR_95U","BETA","SE","P","Z","CHISQ","MLOG10P","MAF","SIG"]))==0:
|
|
44
44
|
log.write(" -No available columns to fill. Skipping.", verbose=verbose)
|
|
45
45
|
log.write("Finished filling data using existing columns.", verbose=verbose)
|
|
46
46
|
return sumstats
|
|
@@ -219,6 +219,20 @@ def fill_maf(sumstats,log,verbose=True,filled_count=0):
|
|
|
219
219
|
return 0,filled_count
|
|
220
220
|
return 1,filled_count
|
|
221
221
|
|
|
222
|
+
def fill_sig(sumstats,log,sig_level=5e-8, verbose=True,filled_count=0):
    """Fill a boolean SIGNIFICANT column from P or MLOG10P.

    Parameters
    ----------
    sumstats : pandas.DataFrame
        Summary statistics; uses ``P`` if present, otherwise ``MLOG10P``.
    log : Log
        Logger with a ``write`` method.
    sig_level : float
        Genome-wide significance threshold on the P-value scale.
    filled_count : int
        Running counter of filled columns, incremented on success.

    Returns
    -------
    tuple(int, int)
        (1, filled_count) when the column was filled,
        (0, filled_count) when neither P nor MLOG10P is available.
    """
    if "P" in sumstats.columns or "MLOG10P" in sumstats.columns:
        log.write(" - Determining significant using P and MLOG10P with threshold:{}".format(sig_level), verbose=verbose)
        if "P" in sumstats.columns:
            is_sig = sumstats["P"]<sig_level
        elif "MLOG10P" in sumstats.columns:
            # BUGFIX: MLOG10P is -log10(P), so P < sig_level corresponds to
            # MLOG10P > -log10(sig_level). The previous comparison against
            # np.log10(sig_level) (a negative number) marked nearly every
            # variant as significant.
            is_sig = sumstats["MLOG10P"] > -np.log10(sig_level)
        sumstats["SIGNIFICANT"] = False
        sumstats.loc[is_sig, "SIGNIFICANT"] = True
        filled_count +=1
    else:
        return 0,filled_count
    return 1,filled_count
|
|
235
|
+
|
|
222
236
|
####################################################################################################################
|
|
223
237
|
def fill_extreme_mlog10(sumstats, z):
|
|
224
238
|
log_pvalue = np.log(2) + ss.norm.logsf(np.abs(sumstats[z])) #two-sided
|
|
@@ -289,7 +303,10 @@ def fill_iteratively(sumstats,raw_to_fill,log,only_sig,df,extreme,verbose,sig_le
|
|
|
289
303
|
else:
|
|
290
304
|
status,filled_count = fill_mlog10p(sumstats,log,verbose=verbose)
|
|
291
305
|
if status == 1 : to_fill.remove("MLOG10P")
|
|
292
|
-
|
|
306
|
+
|
|
307
|
+
if "SIG" in to_fill:
|
|
308
|
+
status,filled_count = fill_sig(sumstats,sig_level=sig_level ,log=log,verbose=verbose,filled_count=filled_count)
|
|
309
|
+
if status == 1 : to_fill.remove("SIG")
|
|
293
310
|
if filled_count == 0:
|
|
294
311
|
break
|
|
295
312
|
|
gwaslab/util_in_filter_value.py
CHANGED
|
@@ -527,4 +527,55 @@ def _filter_region(sumstats, region, chrom="CHR",pos="POS",log=Log(),verbose=Tru
|
|
|
527
527
|
|
|
528
528
|
log.write(" -Extract SNPs in specified regions: "+str(sum(in_region_snp)),verbose=verbose)
|
|
529
529
|
sumstats = sumstats.loc[in_region_snp,:]
|
|
530
|
-
return sumstats.copy()
|
|
530
|
+
return sumstats.copy()
|
|
531
|
+
|
|
532
|
+
def _search_variants( sumstats, snplist=None,
                      snpid="SNPID" ,rsid="rsID",
                      chrom="CHR",pos="POS",ea="EA",nea="NEA",
                      log=Log(),verbose=True):
    """Search sumstats for the variants listed in ``snplist``.

    Each entry may be a (chrom, pos) pair, an rsID, a SNPID string, or a
    ``chr:pos`` / ``chr:pos:allele:allele`` string (``:``, ``_`` or ``-``
    separated; optional ``chr`` prefix). Allele-qualified strings match in
    either orientation.

    Returns
    -------
    pandas.DataFrame
        A copy of the matching rows.
    """
    log.write("Start to search for variants...", verbose=verbose)

    # start from an all-False mask aligned with sumstats' index
    key_col = snpid if snpid in sumstats.columns else rsid
    is_extract = sumstats[key_col] != sumstats[key_col]

    for variant in snplist:

        if pd.api.types.is_list_like(variant):
            # tuple/list form, e.g. (1, 1234)
            single_chrom, single_pos = variant[0], variant[1]
            is_extract = is_extract | ((sumstats[pos] == single_pos ) &(sumstats[chrom] == single_chrom))

        elif pd.api.types.is_string_dtype(type(variant)):
            # rsID form, e.g. rs123
            if "rsID" in sumstats.columns:
                is_extract = is_extract | (sumstats["rsID"] == variant)

            # exact SNPID form, e.g. 1:123:A:D
            if "SNPID" in sumstats.columns:
                is_extract = is_extract | (sumstats["SNPID"] == variant)

            # positional string, optionally allele-qualified
            a= re.match(r'^(chr|Chr|CHR)?(\d+)[:_-](\d+)([:_-]([ATCG]+)[:_-]([ATCG]+))?$', variant, flags=0)
            if a is not None:
                single_chrom, single_pos = int(a[2]), int(a[3])
                positional = (sumstats[pos] == single_pos ) &(sumstats[chrom] == single_chrom)
                if a[4] is None:
                    # no alleles given: match on chrom/pos alone
                    is_extract = is_extract | positional
                else:
                    # alleles given: require a match in either orientation
                    single_ea, single_nea = a[5], a[6]
                    a_match = ((sumstats[nea] == single_nea) & (sumstats[ea] == single_ea)) | ((sumstats[nea] == single_ea) & (sumstats[ea] == single_nea))
                    is_extract = is_extract | (positional & a_match)

    to_search = sumstats.loc[is_extract,:].copy()
    log.write(" -Found {} variants...".format(len(to_search)),verbose=verbose)

    log.write("Finished searching variants.", verbose=verbose)
    return to_search
|
gwaslab/util_in_merge.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from gwaslab.g_Log import Log
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
def _extract_variant(variant_set, sumstats_dic, log=Log(), verbose=True):
    """Extract the variants in ``variant_set`` from each study in ``sumstats_dic``.

    ``sumstats_dic`` maps study keys to gwaslab Sumstats objects (their
    ``.data`` attribute holds the DataFrame). Variants may be (chrom, pos)
    pairs, SNPID strings, or ``chr:pos:allele:allele`` strings. Matching rows
    from every study are stacked into one long-format frame with a STUDY
    column.

    Returns
    -------
    pandas.DataFrame
        Combined rows, limited to STUDY plus the standard gwaslab columns
        that exist in each study.
    """

    combined = pd.DataFrame()
    log.write("Start to initialize gl.SumstatsSet...", verbose=verbose)

    # first pass: report the studies being combined
    for key, sumstats_gls in sumstats_dic.items():
        log.write(" -{} : {}".format(key, sumstats_gls), verbose=verbose)

    # second pass: extract the requested variants from each study
    for key, sumstats_gls in sumstats_dic.items():

        sumstats_single = sumstats_gls.data

        # all-False mask aligned with this study's rows
        is_extract = sumstats_single["SNPID"] != sumstats_single["SNPID"]

        for variant in variant_set:

            if pd.api.types.is_list_like(variant):
                # (chrom, pos) pair
                chrom, pos = variant[0], variant[1]
                is_extract = is_extract | ((sumstats_single["POS"] == pos ) &(sumstats_single["CHR"] == chrom))
            elif pd.api.types.is_string_dtype(type(variant)):
                # exact SNPID match
                is_extract = is_extract | (sumstats_single["SNPID"] == variant)

                # chr:pos:allele:allele string -> positional match
                a= re.search(r'^(chr|Chr|CHR)?(\d+)[:_-](\d+)[:_-][ATCG]+[:_-][ATCG]+$', variant, flags=0)
                if a is not None:
                    chrom, pos = int(a[2]), int(a[3])
                    is_extract = is_extract | ((sumstats_single["POS"] == pos ) &(sumstats_single["CHR"] == chrom))

        to_extract = sumstats_single.loc[is_extract,:].copy()
        log.write(" -Extracted {} variants from {}".format(len(to_extract), key),verbose=verbose)
        to_extract["STUDY"] = key

        # keep only the standard columns this study actually has
        default_cols=["SNPID","EA","NEA","CHR","POS","BETA","SE","P","MLOG10P","EAF","MAF","STATUS"]
        to_extract_cols = ["STUDY"] + [col for col in default_cols if col in sumstats_single.columns]

        combined = pd.concat([combined, to_extract[to_extract_cols]], ignore_index=True)
    log.write("Finished initializing gl.SumstatsSet.", verbose=verbose)
    return combined
|
gwaslab/viz_aux_save_figure.py
CHANGED
|
@@ -52,7 +52,8 @@ def get_default_path(keyword,fmt="png"):
|
|
|
52
52
|
"esc":"effect_size_comparision",
|
|
53
53
|
"afc":"allele_frequency_comparision",
|
|
54
54
|
"gwheatmap":"genome_wide_heatmap",
|
|
55
|
-
"scatter":"scatter"
|
|
55
|
+
"scatter":"scatter",
|
|
56
|
+
"forest":"forest"
|
|
56
57
|
}
|
|
57
58
|
prefix = path_dictionary[keyword]
|
|
58
59
|
count = 1
|