gwaslab 3.4.38__py3-none-any.whl → 3.4.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/bd_common_data.py +6 -3
- gwaslab/bd_download.py +9 -9
- gwaslab/bd_get_hapmap3.py +43 -9
- gwaslab/g_Log.py +14 -5
- gwaslab/g_Sumstats.py +86 -18
- gwaslab/g_SumstatsPair.py +70 -23
- gwaslab/g_SumstatsT.py +2 -2
- gwaslab/g_version.py +10 -10
- gwaslab/hm_casting.py +9 -4
- gwaslab/hm_harmonize_sumstats.py +88 -83
- gwaslab/io_preformat_input.py +14 -14
- gwaslab/io_read_ldsc.py +49 -1
- gwaslab/ldsc_irwls.py +198 -0
- gwaslab/ldsc_jackknife.py +514 -0
- gwaslab/ldsc_ldscore.py +417 -0
- gwaslab/ldsc_parse.py +294 -0
- gwaslab/ldsc_regressions.py +747 -0
- gwaslab/ldsc_sumstats.py +629 -0
- gwaslab/qc_check_datatype.py +1 -1
- gwaslab/qc_fix_sumstats.py +163 -161
- gwaslab/util_ex_calculate_ldmatrix.py +2 -2
- gwaslab/util_ex_gwascatalog.py +24 -24
- gwaslab/util_ex_ldproxyfinder.py +9 -9
- gwaslab/util_ex_ldsc.py +189 -0
- gwaslab/util_in_calculate_gc.py +6 -6
- gwaslab/util_in_calculate_power.py +42 -43
- gwaslab/util_in_convert_h2.py +8 -8
- gwaslab/util_in_fill_data.py +28 -28
- gwaslab/util_in_filter_value.py +91 -52
- gwaslab/util_in_get_density.py +8 -8
- gwaslab/util_in_get_sig.py +407 -65
- gwaslab/viz_aux_annotate_plot.py +12 -12
- gwaslab/viz_aux_quickfix.py +18 -18
- gwaslab/viz_aux_reposition_text.py +3 -3
- gwaslab/viz_aux_save_figure.py +14 -5
- gwaslab/viz_plot_compare_af.py +29 -30
- gwaslab/viz_plot_compare_effect.py +63 -71
- gwaslab/viz_plot_miamiplot2.py +6 -6
- gwaslab/viz_plot_mqqplot.py +17 -3
- gwaslab/viz_plot_qqplot.py +1 -1
- gwaslab/viz_plot_regionalplot.py +33 -32
- gwaslab/viz_plot_rg_heatmap.py +28 -26
- gwaslab/viz_plot_stackedregional.py +40 -21
- gwaslab/viz_plot_trumpetplot.py +50 -55
- gwaslab-3.4.39.dist-info/LICENSE +674 -0
- {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/METADATA +4 -3
- gwaslab-3.4.39.dist-info/RECORD +80 -0
- gwaslab-3.4.38.dist-info/RECORD +0 -72
- /gwaslab-3.4.38.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
gwaslab/util_in_get_sig.py
CHANGED
|
@@ -58,9 +58,9 @@ def getsig(insumstats,
|
|
|
58
58
|
if is_enough_info == False: return None
|
|
59
59
|
############################################################################################
|
|
60
60
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
61
|
+
log.write(" -Processing "+str(len(insumstats))+" variants...", verbose=verbose)
|
|
62
|
+
log.write(" -Significance threshold :", sig_level, verbose=verbose)
|
|
63
|
+
log.write(" -Sliding window size:", str(windowsizekb) ," kb", verbose=verbose)
|
|
64
64
|
|
|
65
65
|
#load data
|
|
66
66
|
sumstats=insumstats.loc[~insumstats[id].isna(),:].copy()
|
|
@@ -90,12 +90,12 @@ def getsig(insumstats,
|
|
|
90
90
|
sumstats[p] = pd.to_numeric(sumstats[p], errors='coerce')
|
|
91
91
|
sumstats_sig = sumstats.loc[sumstats[p]<sig_level,:].copy()
|
|
92
92
|
sumstats_sig.loc[:,"__SCALEDP"] = pd.to_numeric(sumstats_sig[p], errors='coerce')
|
|
93
|
-
|
|
93
|
+
log.write(" -Found "+str(len(sumstats_sig))+" significant variants in total...", verbose=verbose)
|
|
94
94
|
|
|
95
95
|
#sort the coordinates
|
|
96
96
|
sumstats_sig = sumstats_sig.sort_values([chrom,pos])
|
|
97
97
|
if sumstats_sig is None:
|
|
98
|
-
|
|
98
|
+
log.write(" -No lead snps at given significance threshold!", verbose=verbose)
|
|
99
99
|
return None
|
|
100
100
|
|
|
101
101
|
#init
|
|
@@ -148,7 +148,7 @@ def getsig(insumstats,
|
|
|
148
148
|
sig_index_list.append(current_sig_index)
|
|
149
149
|
continue
|
|
150
150
|
|
|
151
|
-
|
|
151
|
+
log.write(" -Identified "+str(len(sig_index_list))+" lead variants!", verbose=verbose)
|
|
152
152
|
|
|
153
153
|
# drop internal __SCALEDP
|
|
154
154
|
sumstats_sig = sumstats_sig.drop("__SCALEDP",axis=1)
|
|
@@ -158,8 +158,8 @@ def getsig(insumstats,
|
|
|
158
158
|
|
|
159
159
|
# annotate GENENAME
|
|
160
160
|
if anno is True and len(output)>0:
|
|
161
|
-
|
|
162
|
-
|
|
161
|
+
log.write(" -Annotating variants using references:{}".format(source), verbose=verbose)
|
|
162
|
+
log.write(" -Annotating variants using references based on genome build:{}".format(build), verbose=verbose)
|
|
163
163
|
|
|
164
164
|
output = annogene(
|
|
165
165
|
output,
|
|
@@ -249,13 +249,13 @@ def annogene(
|
|
|
249
249
|
source="ensembl",
|
|
250
250
|
verbose=True):
|
|
251
251
|
|
|
252
|
-
|
|
252
|
+
log.write("Start to annotate variants with nearest gene name(s)...", verbose=verbose)
|
|
253
253
|
output = insumstats.copy()
|
|
254
254
|
|
|
255
255
|
if source == "ensembl":
|
|
256
256
|
if build=="19":
|
|
257
257
|
#data = EnsemblRelease(75)
|
|
258
|
-
|
|
258
|
+
log.write(" -Assigning Gene name using ensembl_hg19_gtf for protein coding genes", verbose=verbose)
|
|
259
259
|
#zcat Homo_sapiens.GRCh37.75.gtf.gz|
|
|
260
260
|
#grep -E 'processed_transcript|protein_coding|_gene'
|
|
261
261
|
#| gzip >Homo_sapiens.GRCh37.75.processed.chr.gtf.gz
|
|
@@ -275,7 +275,7 @@ def annogene(
|
|
|
275
275
|
list(output.apply(lambda x:closest_gene(x,data=data,chrom=chrom,pos=pos,source=source), axis=1)),
|
|
276
276
|
index=output.index).values
|
|
277
277
|
elif build=="38":
|
|
278
|
-
|
|
278
|
+
log.write(" -Assigning Gene name using ensembl_hg38_gtf for protein coding genes", verbose=verbose)
|
|
279
279
|
#gtf_path = check_and_download("ensembl_hg38_gtf_protein_coding")
|
|
280
280
|
gtf_path = check_and_download("ensembl_hg38_gtf")
|
|
281
281
|
gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
|
|
@@ -292,7 +292,7 @@ def annogene(
|
|
|
292
292
|
|
|
293
293
|
if source == "refseq":
|
|
294
294
|
if build=="19":
|
|
295
|
-
|
|
295
|
+
log.write(" -Assigning Gene name using NCBI refseq latest GRCh37 for protein coding genes", verbose=verbose)
|
|
296
296
|
#gtf_path = check_and_download("refseq_hg19_gtf_protein_coding")
|
|
297
297
|
gtf_path = check_and_download("refseq_hg19_gtf")
|
|
298
298
|
gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
|
|
@@ -307,7 +307,7 @@ def annogene(
|
|
|
307
307
|
list(output.apply(lambda x:closest_gene(x,data=data,chrom=chrom,pos=pos,source=source,build=build), axis=1)),
|
|
308
308
|
index=output.index).values
|
|
309
309
|
elif build=="38":
|
|
310
|
-
|
|
310
|
+
log.write(" -Assigning Gene name using NCBI refseq latest GRCh38 for protein coding genes", verbose=verbose)
|
|
311
311
|
#gtf_path = check_and_download("refseq_hg38_gtf_protein_coding")
|
|
312
312
|
gtf_path = check_and_download("refseq_hg38_gtf")
|
|
313
313
|
gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
|
|
@@ -321,7 +321,7 @@ def annogene(
|
|
|
321
321
|
output.loc[:,["LOCATION","GENE"]] = pd.DataFrame(
|
|
322
322
|
list(output.apply(lambda x:closest_gene(x,data=data,chrom=chrom,pos=pos,source=source,build=build), axis=1)),
|
|
323
323
|
index=output.index).values
|
|
324
|
-
|
|
324
|
+
log.write("Finished annotating variants with nearest gene name(s) successfully!", verbose=verbose)
|
|
325
325
|
return output
|
|
326
326
|
|
|
327
327
|
def getnovel(insumstats,
|
|
@@ -333,6 +333,8 @@ def getnovel(insumstats,
|
|
|
333
333
|
known=False,
|
|
334
334
|
efo=False,
|
|
335
335
|
only_novel=False,
|
|
336
|
+
group_key=None,
|
|
337
|
+
if_get_lead = True,
|
|
336
338
|
windowsizekb_for_novel=1000,
|
|
337
339
|
windowsizekb=500,
|
|
338
340
|
sig_level=5e-8,
|
|
@@ -362,36 +364,30 @@ def getnovel(insumstats,
|
|
|
362
364
|
if is_enough_info == False: return None
|
|
363
365
|
############################################################################################
|
|
364
366
|
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
if insumstats["POS"].max()*10 > big_number:
|
|
372
|
-
big_number = int(big_number * 10)
|
|
373
|
-
else:
|
|
374
|
-
break
|
|
367
|
+
if if_get_lead == True:
|
|
368
|
+
allsig = getsig(insumstats=insumstats,
|
|
369
|
+
id=id,chrom=chrom,pos=pos,p=p,use_p=use_p,windowsizekb=windowsizekb,sig_level=sig_level,log=log,
|
|
370
|
+
xymt=xymt,anno=anno,build=build, source=source,verbose=verbose)
|
|
371
|
+
else:
|
|
372
|
+
allsig = insumstats.copy()
|
|
375
373
|
|
|
376
|
-
|
|
377
|
-
allsig["TCHR+POS"]=allsig[chrom]*big_number + allsig[pos]
|
|
378
|
-
|
|
374
|
+
############################################################################################
|
|
379
375
|
knownsig = pd.DataFrame()
|
|
380
376
|
if efo != False:
|
|
381
377
|
if type(efo) is not list:
|
|
382
|
-
|
|
378
|
+
log.write("Start to retrieve data using EFO: {}...".format(efo), verbose=verbose)
|
|
383
379
|
known_Sumstats = gwascatalog_trait(efo,source=gwascatalog_source,sig_level=sig_level,verbose=verbose,log=log)
|
|
384
380
|
knownsig = known_Sumstats.data.copy()
|
|
385
381
|
else:
|
|
386
382
|
knownsig=pd.DataFrame()
|
|
387
|
-
|
|
383
|
+
log.write("Start to retrieve data using {} EFOs: {}...".format(len(efo),efo), verbose=verbose)
|
|
388
384
|
for single_efo in efo:
|
|
389
385
|
known_Sumstats = gwascatalog_trait(single_efo,source=gwascatalog_source,sig_level=sig_level,verbose=verbose,log=log)
|
|
390
386
|
known_Sumstats.data["EFOID"] = single_efo
|
|
391
387
|
knownsig = pd.concat([known_Sumstats.data, knownsig],ignore_index=True)
|
|
392
388
|
knownsig["CHR"] = knownsig["CHR"].astype("Int64")
|
|
393
389
|
knownsig["POS"] = knownsig["POS"].astype("Int64")
|
|
394
|
-
|
|
390
|
+
log.write(" -Retrieved {} associations from GWAS catalog.".format(len(knownsig)), verbose=verbose)
|
|
395
391
|
if type(known) is pd.DataFrame:
|
|
396
392
|
knownsig_2 = known.copy()
|
|
397
393
|
knownsig = pd.concat([knownsig, knownsig_2],ignore_index=True)
|
|
@@ -406,19 +402,230 @@ def getnovel(insumstats,
|
|
|
406
402
|
knownsig["POS"] = knownsig["POS"].astype("Int64")
|
|
407
403
|
if "SNPID" not in knownsig.columns:
|
|
408
404
|
knownsig["SNPID"] =knownsig["CHR"].astype("string") + ":" + knownsig["POS"].astype("string")
|
|
405
|
+
|
|
409
406
|
if len(knownsig)<1:
|
|
410
407
|
raise ValueError("Please input a dataframe of known loci or valid efo code")
|
|
411
|
-
|
|
412
|
-
# create helper column TCHR+POS for knownsig
|
|
413
|
-
knownsig["TCHR+POS"]=knownsig[chrom]*big_number + knownsig[pos]
|
|
414
|
-
|
|
415
|
-
if verbose: log.write(" -Lead variants in known loci:",len(knownsig))
|
|
416
|
-
if verbose: log.write(" -Checking the minimum distance between identified lead variants and provided known variants...")
|
|
417
408
|
|
|
409
|
+
if group_key is not None:
|
|
410
|
+
if (group_key not in allsig.columns) or (group_key not in knownsig.columns):
|
|
411
|
+
raise ValueError("Please check if group_key is in both sumstats and list of known associations.")
|
|
412
|
+
|
|
413
|
+
# create helper column TCHR+POS for knownsig and all sig
|
|
414
|
+
############################################################################################
|
|
415
|
+
maxpos = insumstats["POS"].max()
|
|
416
|
+
big_number = determine_big_number(maxpos)
|
|
417
|
+
knownsig = add_tchr_pos(knownsig, chrom, pos, big_number)
|
|
418
|
+
allsig = add_tchr_pos(allsig, chrom, pos, big_number)
|
|
419
|
+
############################################################################################
|
|
418
420
|
#sorting
|
|
419
421
|
allsig = allsig.sort_values(by="TCHR+POS",ignore_index=True)
|
|
420
422
|
knownsig = knownsig.sort_values(by="TCHR+POS",ignore_index=True)
|
|
423
|
+
############################################################################################
|
|
424
|
+
if group_key is not None:
|
|
425
|
+
number_of_groups_allsig = allsig[group_key].nunique()
|
|
426
|
+
number_of_groups_known = knownsig[group_key].nunique()
|
|
427
|
+
log.write(" -Number of groups in sumstats:{}".format(number_of_groups_allsig), verbose=verbose)
|
|
428
|
+
log.write(" -Number of groups in reference:{}".format(number_of_groups_known), verbose=verbose)
|
|
429
|
+
|
|
430
|
+
log.write(" -Lead variants in known loci:",len(knownsig), verbose=verbose)
|
|
431
|
+
log.write(" -Checking the minimum distance between identified lead variants and provided known variants...", verbose=verbose)
|
|
432
|
+
|
|
433
|
+
############################################################################################
|
|
434
|
+
if group_key is None:
|
|
435
|
+
# get distance
|
|
436
|
+
allsig = determine_distance(allsig, knownsig)
|
|
437
|
+
# get other info
|
|
438
|
+
allsig = fill_meta_info_for_known(allsig, knownsig)
|
|
439
|
+
############################################################################################
|
|
440
|
+
# determine if novel
|
|
441
|
+
allsig = determine_novel(allsig, windowsizekb_for_novel)
|
|
442
|
+
# determine location
|
|
443
|
+
allsig = determine_location(allsig)
|
|
444
|
+
# if not on same chromosome, distance set to pd.NA
|
|
445
|
+
allsig = determine_if_same_chromosome(allsig, knownsig, maxpos)
|
|
446
|
+
############################################################################################
|
|
447
|
+
else:
|
|
448
|
+
#groups1 = set(allsig[group_key].unique())
|
|
449
|
+
#groups2 = set(knownsig[group_key].unique())
|
|
450
|
+
#common_group = groups1.intersection(groups2)
|
|
451
|
+
|
|
452
|
+
#allsig_no_group = allsig.loc[~allsig[group_key].isin(common_group),:].copy()
|
|
453
|
+
allsig_group = pd.DataFrame()
|
|
454
|
+
|
|
455
|
+
for key in allsig[group_key].unique():
|
|
456
|
+
allsig_single_group = allsig.loc[allsig[group_key]==key,:].copy()
|
|
457
|
+
knownsig_single_group = knownsig.loc[knownsig[group_key]==key,:].copy()
|
|
458
|
+
|
|
459
|
+
#if len(allsig_single_group) >0 and len(knownsig_single_group) >0:
|
|
460
|
+
allsig_single_group = determine_distance(allsig_single_group, knownsig_single_group)
|
|
461
|
+
# get other info
|
|
462
|
+
allsig_single_group = fill_meta_info_for_known(allsig_single_group, knownsig_single_group)
|
|
463
|
+
|
|
464
|
+
# determine if novel
|
|
465
|
+
allsig_single_group = determine_novel(allsig_single_group, windowsizekb_for_novel)
|
|
466
|
+
|
|
467
|
+
# determine location
|
|
468
|
+
allsig_single_group = determine_location(allsig_single_group)
|
|
469
|
+
|
|
470
|
+
# if not on same chromosome, distance set to pd.NA
|
|
471
|
+
allsig_single_group = determine_if_same_chromosome(allsig_single_group, knownsig_single_group, maxpos)
|
|
472
|
+
|
|
473
|
+
allsig_group = pd.concat([allsig_group, allsig_single_group], ignore_index=True)
|
|
474
|
+
|
|
475
|
+
allsig = allsig_group
|
|
476
|
+
#pd.concat([allsig_no_group, allsig_group], ignore_index=True)
|
|
477
|
+
|
|
478
|
+
# drop helper column TCHR+POS
|
|
479
|
+
allsig = allsig.drop(["TCHR+POS"], axis=1)
|
|
480
|
+
|
|
481
|
+
try:
|
|
482
|
+
allsig = allsig.where(~pd.isna(allsig), pd.NA)
|
|
483
|
+
except:
|
|
484
|
+
pass
|
|
485
|
+
|
|
486
|
+
log.write(" -Identified ",len(allsig)-sum(allsig["NOVEL"])," known vairants in current sumstats...", verbose=verbose)
|
|
487
|
+
log.write(" -Identified ",sum(allsig["NOVEL"])," novel vairants in current sumstats...", verbose=verbose)
|
|
488
|
+
|
|
489
|
+
finished(log,verbose,_end_line)
|
|
490
|
+
|
|
491
|
+
# how to return
|
|
492
|
+
if only_novel is True:
|
|
493
|
+
if output_known is True:
|
|
494
|
+
return allsig.loc[allsig["NOVEL"],:], knownsig
|
|
495
|
+
else:
|
|
496
|
+
return allsig.loc[allsig["NOVEL"],:]
|
|
497
|
+
else:
|
|
498
|
+
if output_known is True:
|
|
499
|
+
return allsig, knownsig
|
|
500
|
+
else:
|
|
501
|
+
return allsig
|
|
502
|
+
##################################################################################################################################################################################################
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
def _check_cis(insumstats,
           id,
           chrom,
           pos,
           p,
           use_p=False,
           known=False,
           group_key=None,
           if_get_lead = False,
           windowsizekb=500,
           sig_level=5e-8,
           log=Log(),
           xymt=["X","Y","MT"],
           anno=False,
           build="19",
           source="ensembl",
           verbose=True):
    """Label each variant as "Cis", "Trans" or "NoReference" relative to a reference region.

    Every variant is matched through ``group_key`` to one row of the reference
    table, which supplies a chromosome plus START/END coordinates. The label is
    computed by ``determine_if_cis`` and stored in a new ``CIS/TRANS`` column.

    Parameters
    ----------
    insumstats : pandas.DataFrame
        Summary statistics to annotate.
    id, chrom, pos, p, use_p, sig_level, xymt, anno, build, source
        Forwarded unchanged to ``getsig`` when ``if_get_lead`` is True.
        ``xymt`` is only passed through and never mutated here.
    known : pandas.DataFrame or str
        Reference table, or a path to a whitespace-delimited file, with
        ``CHR``, ``START``, ``END`` and the ``group_key`` column.
    group_key : str
        Column present in both the sumstats and the reference table that links
        a variant to its reference region. Required.
    if_get_lead : bool
        When True, lead variants are extracted with ``getsig`` first;
        otherwise a copy of ``insumstats`` is annotated as is.
    windowsizekb : int
        Flank, in kb, added on both sides of START/END (also used as the
        ``getsig`` window when ``if_get_lead`` is True).
    log : Log
        Logger receiving progress messages.
    verbose : bool
        Forwarded to every ``log.write`` call.

    Returns
    -------
    pandas.DataFrame or None
        Annotated copy of the variants, or None when ``start_to`` reports that
        the required information is not available.

    Raises
    ------
    ValueError
        If no usable reference table is given, or ``group_key`` is missing
        from the arguments, the sumstats, or the reference table.
    """
    ##start function with col checking##########################################################
    _start_line = "check if variants are in cis or trans regions"
    _end_line = "checking if variants are in cis or trans regions"
    _start_cols = [chrom,pos, group_key]
    _start_function = ".check_cis()"
    _must_args ={}

    is_enough_info = start_to(sumstats=insumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              **_must_args)
    if is_enough_info == False: return None
    ############################################################################################

    if if_get_lead == True:
        allsig = getsig(insumstats=insumstats,
                        id=id,chrom=chrom,pos=pos,p=p,use_p=use_p,windowsizekb=windowsizekb,sig_level=sig_level,log=log,
                        xymt=xymt,anno=anno,build=build, source=source,verbose=verbose)
    else:
        allsig = insumstats.copy()

    ############################################################################################
    # load the reference table from a DataFrame or from a whitespace-delimited file
    if isinstance(known, pd.DataFrame):
        knownsig = known.copy()
    elif isinstance(known, str):
        # raw string: "\s" is not a valid escape sequence in a normal string literal
        knownsig = pd.read_csv(known,sep=r"\s+",dtype={"CHR":"Int64","POS":"Int64"})
    else:
        knownsig = pd.DataFrame()

    if len(knownsig)<1:
        raise ValueError("Please input a dataframe of gene list with GENE, CHR, START, END.")

    for coordinate_col in ("CHR", "START", "END"):
        knownsig[coordinate_col] = knownsig[coordinate_col].astype("Int64")

    # group_key links variants to reference regions, so it must exist on both sides;
    # fail early with a clear message instead of a KeyError further down
    if group_key is None:
        raise ValueError("Please specify group_key to link variants to the reference regions.")
    if (group_key not in allsig.columns) or (group_key not in knownsig.columns):
        raise ValueError("Please check if group_key is in both sumstats and list of known associations.")

    ############################################################################################
    number_of_groups_allsig = allsig[group_key].nunique()
    number_of_groups_known = knownsig[group_key].nunique()
    log.write(" -Number of groups in sumstats:{}".format(number_of_groups_allsig), verbose=verbose)
    log.write(" -Number of groups in reference:{}".format(number_of_groups_known), verbose=verbose)

    log.write(" -Checking if variants in cis/trans regions grouped by {}...".format(group_key), verbose=verbose)
    log.write(" -Window size in kb adding to start and end: {}...".format(windowsizekb), verbose=verbose)
    ############################################################################################
    # group -> (CHR, START, END); a later row for the same group overwrites an earlier one
    reference_dict = {
        group: (ref_chr, ref_start, ref_end)
        for group, ref_chr, ref_start, ref_end in zip(knownsig[group_key],
                                                      knownsig["CHR"],
                                                      knownsig["START"],
                                                      knownsig["END"])
    }
    ############################################################################################
    no_reference_available = allsig.loc[~allsig[group_key].isin(list(reference_dict)), group_key]
    if len(no_reference_available)>0:
        # str() so that non-string group labels can be joined as well
        missing_groups = ",".join(str(group) for group in no_reference_available.unique())
        log.write(" -Groups not in reference: {}".format(missing_groups), verbose=verbose)

    # a list built row by row also works for an empty frame, where
    # DataFrame.apply(axis=1) would hand back a DataFrame instead of a Series
    allsig["CIS/TRANS"] = [determine_if_cis(row, group_key, windowsizekb, reference_dict)
                           for _, row in allsig.iterrows()]

    # best effort: represent every missing value as pd.NA
    try:
        allsig = allsig.where(~pd.isna(allsig), pd.NA)
    except Exception as e:
        log.write(" -Warning: could not convert missing values to pd.NA: {}".format(e), verbose=verbose)

    labels = allsig["CIS/TRANS"]
    number_of_cis = int((labels == "Cis").sum())
    number_of_trans = int((labels == "Trans").sum())
    number_of_noreference = int((labels == "NoReference").sum())
    log.write (" -Number of Cis variants: {}".format(number_of_cis),verbose=verbose)
    log.write (" -Number of Trans variants: {}".format(number_of_trans),verbose=verbose)
    log.write (" -Number of NoReference variants: {}".format(number_of_noreference),verbose=verbose)

    finished(log,verbose,_end_line)

    return allsig
|
|
611
|
+
|
|
612
|
+
###################################################################################################################################################################################################
|
|
613
|
+
|
|
614
|
+
|
|
615
|
+
def determine_big_number(maxpos, big_number = 1000000000):
    """Return a multiplier large enough to keep chromosome and position apart.

    ``big_number`` is multiplied by 10 while ``maxpos*10`` still exceeds it,
    at most seven times.

    Parameters
    ----------
    maxpos : int
        Largest base-pair position. A missing value (``None``, ``NaN``,
        ``pd.NA``) leaves ``big_number`` unchanged.
    big_number : int
        Starting multiplier.

    Returns
    -------
    int
        The (possibly enlarged) multiplier.
    """
    # a missing maximum cannot be compared; keep the starting multiplier
    if pd.isna(maxpos):
        return big_number

    for _ in range(7):
        if maxpos*10 <= big_number:
            break
        big_number = int(big_number * 10)
    return big_number
|
|
622
|
+
|
|
623
|
+
def add_tchr_pos(df, chrom, pos, big_number):
    """Add the helper column ``TCHR+POS`` (chromosome * big_number + position).

    The column is written into ``df`` itself, and the same object is returned.
    """
    chromosome_offset = df[chrom] * big_number
    df["TCHR+POS"] = chromosome_offset + df[pos]
    return df
|
|
626
|
+
|
|
627
|
+
def fill_meta_info_for_known(allsig, knownsig):
|
|
628
|
+
if len(allsig)==0 or len(knownsig)==0: return allsig
|
|
422
629
|
if "SNPID" in knownsig.columns:
|
|
423
630
|
knownids=knownsig["SNPID"].values
|
|
424
631
|
if "PUBMEDID" in knownsig.columns:
|
|
@@ -427,12 +634,7 @@ def getnovel(insumstats,
|
|
|
427
634
|
knownauthor=knownsig["AUTHOR"].values
|
|
428
635
|
if "EFOID" in knownsig.columns:
|
|
429
636
|
knownefo=knownsig["EFOID"].values
|
|
430
|
-
|
|
431
|
-
# get distance
|
|
432
|
-
lambda x:np.min(np.abs(knownsig["TCHR+POS"]-x))
|
|
433
|
-
allsig["DISTANCE_TO_KNOWN"] = allsig["TCHR+POS"].apply(lambda x:min(knownsig["TCHR+POS"]-x, key=abs))
|
|
434
|
-
|
|
435
|
-
# get other info
|
|
637
|
+
|
|
436
638
|
if "SNPID" in knownsig.columns:
|
|
437
639
|
allsig["KNOWN_ID"] = allsig["TCHR+POS"].apply(lambda x:knownids[np.argmin(np.abs(knownsig["TCHR+POS"]-x))])
|
|
438
640
|
if "PUBMEDID" in knownsig.columns:
|
|
@@ -440,20 +642,48 @@ def getnovel(insumstats,
|
|
|
440
642
|
if "AUTHOR" in knownsig.columns:
|
|
441
643
|
allsig["KNOWN_AUTHOR"] = allsig["TCHR+POS"].apply(lambda x:knownauthor[np.argmin(np.abs(knownsig["TCHR+POS"]-x))])
|
|
442
644
|
if "EFOID" in knownsig.columns:
|
|
443
|
-
allsig["KNOWN_EFOID"] = allsig["TCHR+POS"].apply(lambda x:knownefo[np.argmin(np.abs(knownsig["TCHR+POS"]-x))])
|
|
645
|
+
allsig["KNOWN_EFOID"] = allsig["TCHR+POS"].apply(lambda x:knownefo[np.argmin(np.abs(knownsig["TCHR+POS"]-x))])
|
|
646
|
+
return allsig
|
|
444
647
|
|
|
445
|
-
|
|
648
|
+
def determine_if_cis(x, group_key,windowsizekb, reference_dict, chrom="CHR", pos="POS"):
    """Classify one variant as "Cis", "Trans" or "NoReference".

    Parameters
    ----------
    x : mapping-like row
        Must provide ``x[group_key]``, ``x[chrom]`` and ``x[pos]``.
    group_key : hashable
        Key of the field that identifies the variant's group.
    windowsizekb : int
        Flank in kb added on both sides of the reference interval.
    reference_dict : dict
        Maps a group to a ``(chromosome, start, end)`` tuple.
    chrom, pos : hashable
        Keys of the chromosome and position fields of ``x``. They default to
        the previously hard-coded "CHR" and "POS".

    Returns
    -------
    str
        "NoReference" when the group is not in ``reference_dict``; "Cis" when
        the variant is on the reference chromosome and inside
        ``[start - window, end + window]`` (bounds inclusive); otherwise "Trans".
    """
    group = x[group_key]
    if group not in reference_dict:
        return "NoReference"

    ref_chr, ref_start, ref_end = reference_dict[group][:3]

    # chromosomes are compared as strings so that 1 and "1" match
    if str(ref_chr) != str(x[chrom]):
        return "Trans"

    flank = windowsizekb*1000
    position = x[pos]
    is_large_than_start = int(ref_start) - flank <= position
    is_smaller_than_end = int(ref_end) + flank >= position

    if is_large_than_start and is_smaller_than_end:
        return "Cis"
    return "Trans"
|
|
660
|
+
|
|
661
|
+
def determine_distance(allsig, knownsig):
    """Add ``DISTANCE_TO_KNOWN``: signed offset to the closest known variant.

    For every row of ``allsig`` the value is ``known - variant`` on the
    ``TCHR+POS`` helper column for the known variant with the smallest
    absolute offset (the first one on ties).

    The column is always created. It is filled with ``pd.NA`` when either
    table is empty, so that later steps can rely on it being present.

    Parameters
    ----------
    allsig, knownsig : pandas.DataFrame
        Both need a ``TCHR+POS`` column unless they are empty.

    Returns
    -------
    pandas.DataFrame
        ``allsig`` itself, modified in place.
    """
    if len(allsig)==0 or len(knownsig)==0:
        allsig["DISTANCE_TO_KNOWN"] = pd.NA
        return allsig

    known_positions = knownsig["TCHR+POS"]
    allsig["DISTANCE_TO_KNOWN"] = allsig["TCHR+POS"].apply(lambda x:min(known_positions-x, key=abs))
    return allsig
|
|
669
|
+
|
|
670
|
+
def determine_novel(allsig, windowsizekb_for_novel):
    """Add the boolean column ``NOVEL`` based on ``DISTANCE_TO_KNOWN``.

    A variant is novel when its absolute distance to the closest known
    variant exceeds ``windowsizekb_for_novel`` kb, or when that distance is
    missing.

    An empty table receives an empty ``NOVEL`` column, so that code reading
    the column afterwards does not fail. A non-empty table without
    ``DISTANCE_TO_KNOWN`` is returned unchanged.

    Returns
    -------
    pandas.DataFrame
        ``allsig`` itself, modified in place.
    """
    if len(allsig)==0:
        allsig["NOVEL"] = pd.Series(False, index=allsig.index, dtype=bool)
        return allsig
    if "DISTANCE_TO_KNOWN" not in allsig.columns:
        return allsig

    allsig["NOVEL"] = allsig["DISTANCE_TO_KNOWN"].abs() > windowsizekb_for_novel*1000
    # no known variant to compare with -> treated as novel
    allsig.loc[allsig["DISTANCE_TO_KNOWN"].isna(), "NOVEL"] = True
    return allsig
|
|
676
|
+
|
|
677
|
+
def determine_location(allsig):
    """Add ``LOCATION_OF_KNOWN`` derived from the sign of ``DISTANCE_TO_KNOWN``.

    Values: "Same" (distance 0), "Upstream" (distance > 0), "Downstream"
    (distance < 0) and "NoReference" for everything else, including rows with
    a missing distance and tables that have no ``DISTANCE_TO_KNOWN`` column.

    Returns
    -------
    pandas.DataFrame
        ``allsig`` itself, modified in place.
    """
    allsig["LOCATION_OF_KNOWN"]="NoReference"

    # without a distance column there is nothing to compare against;
    # every row keeps the "NoReference" default
    if "DISTANCE_TO_KNOWN" not in allsig.columns:
        return allsig

    distance = allsig["DISTANCE_TO_KNOWN"]
    allsig.loc[ distance == 0 ,"LOCATION_OF_KNOWN"] = "Same"
    allsig.loc[ distance > 0 ,"LOCATION_OF_KNOWN"] = "Upstream"
    allsig.loc[ distance < 0 ,"LOCATION_OF_KNOWN"] = "Downstream"
    return allsig
|
|
453
683
|
|
|
454
|
-
|
|
455
|
-
if sum(allsig["DISTANCE_TO_KNOWN"].abs() >
|
|
456
|
-
not_on_same_chromosome = allsig["DISTANCE_TO_KNOWN"].abs() >
|
|
684
|
+
def determine_if_same_chromosome(allsig, knownsig, maxpos):
|
|
685
|
+
if sum(allsig["DISTANCE_TO_KNOWN"].abs() > maxpos)>0:
|
|
686
|
+
not_on_same_chromosome = allsig["DISTANCE_TO_KNOWN"].abs() > maxpos
|
|
457
687
|
allsig.loc[ not_on_same_chromosome ,"DISTANCE_TO_KNOWN"] = pd.NA
|
|
458
688
|
allsig.loc[ not_on_same_chromosome ,"LOCATION_OF_KNOWN"] = "NoneOnThisChr"
|
|
459
689
|
if "SNPID" in knownsig.columns:
|
|
@@ -464,23 +694,135 @@ def getnovel(insumstats,
|
|
|
464
694
|
allsig.loc[ not_on_same_chromosome ,"KNOWN_AUTHOR"] = pd.NA
|
|
465
695
|
if "EFOID" in knownsig.columns:
|
|
466
696
|
allsig.loc[ not_on_same_chromosome ,"KNOWN_EFOID"] = pd.NA
|
|
697
|
+
return allsig
|
|
467
698
|
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
699
|
+
def _check_novel_set(insumstats,
           id,
           chrom,
           pos,
           p,
           use_p=False,
           known=False,
           group_key=None,
           snpset="SNPSET",
           snpid="SNPID",
           if_get_lead = False,
           windowsizekb=500,
           sig_level=5e-8,
           log=Log(),
           xymt=["X","Y","MT"],
           anno=False,
           build="19",
           source="ensembl",
           verbose=True):
    """Check whether variant sets overlap with variant sets in a reference table.

    Within each group (``group_key``), a variant is looked up by ``snpid`` in
    the reference variant sets of the same group. Three columns are added:

    - ``KNOWN_SET``: name of the reference set containing the variant, else NA.
    - ``KNOWN_VARIANT``: the matched variant ID, else NA.
    - ``KNOWN_SET_VARIANT``: for every variant, the set of
      "group-KNOWN_SET-KNOWN_VARIANT" strings collected over all matched
      variants sharing its group and ``snpset``; NA when none matched.

    Parameters
    ----------
    insumstats : pandas.DataFrame
        Variants to annotate; needs the ``group_key``, ``snpset`` and
        ``snpid`` columns.
    id, chrom, pos, p, use_p, windowsizekb, sig_level, xymt, anno, build, source
        Forwarded unchanged to ``getsig`` when ``if_get_lead`` is True.
        ``xymt`` is only passed through and never mutated here.
    known : pandas.DataFrame or str
        Reference table, or a path to a whitespace-delimited file, with the
        ``group_key``, ``snpset`` and ``snpid`` columns.
    group_key : str
        Column linking variants to reference records. Required.
    snpset, snpid : str
        Names of the variant-set and variant-ID columns.
    if_get_lead : bool
        When True, lead variants are extracted with ``getsig`` first.
    log : Log
        Logger receiving progress messages.
    verbose : bool
        Forwarded to every ``log.write`` call.

    Returns
    -------
    pandas.DataFrame or None
        Annotated copy of the variants, or None when ``start_to`` reports that
        the required information is not available.

    Raises
    ------
    ValueError
        If no usable reference table is given, or ``group_key`` is missing
        from the arguments, the sumstats, or the reference table.
    """
    ##start function with col checking##########################################################
    _start_line = "check if variant sets are overlapping with those in reference file"
    _end_line = "checking if variant sets are overlapping with those in reference file"
    _start_cols = [chrom,pos, group_key]
    # was ".check_cis()", copied over from _check_cis
    _start_function = ".check_novel_set()"
    _must_args ={}

    is_enough_info = start_to(sumstats=insumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              **_must_args)
    if is_enough_info == False: return None
    ############################################################################################

    if if_get_lead == True:
        allsig = getsig(insumstats=insumstats,
                        id=id,chrom=chrom,pos=pos,p=p,use_p=use_p,windowsizekb=windowsizekb,sig_level=sig_level,log=log,
                        xymt=xymt,anno=anno,build=build, source=source,verbose=verbose)
    else:
        allsig = insumstats.copy()

    ############################################################################################
    # load the reference table from a DataFrame or from a whitespace-delimited file
    if isinstance(known, pd.DataFrame):
        knownsig = known.copy()
    elif isinstance(known, str):
        # raw string: "\s" is not a valid escape sequence in a normal string literal
        knownsig = pd.read_csv(known,sep=r"\s+",dtype={"CHR":"Int64","POS":"Int64"})
    else:
        knownsig = pd.DataFrame()

    if len(knownsig)<1:
        raise ValueError("Please input a dataframe of known variant sets with group, variant set and variant ID columns.")

    # group_key is needed on both sides; fail early with a clear message
    # instead of a KeyError further down
    if group_key is None:
        raise ValueError("Please specify group_key to link variants to the reference variant sets.")
    if (group_key not in allsig.columns) or (group_key not in knownsig.columns):
        raise ValueError("Please check if group_key is in both sumstats and list of known associations.")

    for key_col in (snpid, snpset, group_key):
        knownsig[key_col] = knownsig[key_col].astype("string")

    ############################################################################################
    number_of_groups_allsig = allsig[group_key].nunique()
    number_of_groups_known = knownsig[group_key].nunique()
    log.write(" -Number of groups in sumstats:{}".format(number_of_groups_allsig), verbose=verbose)
    log.write(" -Number of groups in reference:{}".format(number_of_groups_known), verbose=verbose)

    log.write(" -Checking if variant sets overlap with reference variant sets grouped by {}...".format(group_key), verbose=verbose)

    ############################################################################################
    # reference_dict: group -> {set name -> set of variant IDs}
    reference_dict = {}
    for group, set_name, variant in zip(knownsig[group_key], knownsig[snpset], knownsig[snpid]):
        reference_dict.setdefault(group, {}).setdefault(set_name, set()).add(variant)
    ############################################################################################

    no_reference_available = allsig.loc[~allsig[group_key].isin(list(reference_dict)), group_key]
    if len(no_reference_available)>0:
        # report each group once; str() so that non-string labels can be joined as well
        missing_groups = ",".join(str(group) for group in no_reference_available.unique())
        log.write(" -Groups not in reference: {}".format(missing_groups), verbose=verbose)

    log.write(" -Checking if variants are in reference variant sets...", verbose=verbose)
    # lists built row by row also work for an empty frame, where
    # DataFrame.apply(axis=1) would hand back a DataFrame instead of a Series
    matches = [check_overlap(row, snpid, group_key, reference_dict) for _, row in allsig.iterrows()]
    allsig["KNOWN_SET"] = [match[0] for match in matches]
    allsig["KNOWN_VARIANT"] = [match[1] for match in matches]

    # back_dict: group -> {snpset -> set of "group-KNOWN_SET-KNOWN_VARIANT"},
    # collected in one pass over the matched variants. Rows with a missing group
    # or snpset are skipped: they can never be looked up by equality.
    back_dict = {}
    for group, set_name, known_set, known_variant in zip(allsig[group_key],
                                                         allsig[snpset],
                                                         allsig["KNOWN_SET"],
                                                         allsig["KNOWN_VARIANT"]):
        if pd.isna(group) or pd.isna(set_name) or pd.isna(known_set):
            continue
        back_dict.setdefault(group, {}).setdefault(set_name, set()).add(
            "{}-{}-{}".format(group, known_set, known_variant))

    allsig["KNOWN_SET_VARIANT"] = [assign_set_variant(row, group_key, snpset, back_dict)
                                   for _, row in allsig.iterrows()]

    finished(log,verbose,_end_line)

    return allsig
|
|
815
|
+
|
|
816
|
+
def check_overlap(x,snpid, group_key,reference_dict):
    """Find the reference variant set that contains the variant in row ``x``.

    Parameters
    ----------
    x : mapping-like row
        Must provide ``x[group_key]`` and ``x[snpid]``.
    snpid, group_key : hashable
        Keys of the variant-ID and group fields of ``x``.
    reference_dict : dict
        ``{group: {set name: set of variant IDs}}``.

    Returns
    -------
    tuple
        ``(set name, variant ID)`` for the first set of the row's group that
        contains the variant, otherwise ``(pd.NA, pd.NA)``.
    """
    group = x[group_key]
    if group in reference_dict:
        variant = x[snpid]
        for set_name, members in reference_dict[group].items():
            if variant in members:
                return set_name, variant
    return pd.NA, pd.NA
|
|
822
|
+
|
|
823
|
+
def assign_set_variant(x,group_key,snpset,back_dict):
    """Return the collected matches for the group and variant set of row ``x``.

    Parameters
    ----------
    x : mapping-like row
        Must provide ``x[group_key]`` and ``x[snpset]``.
    group_key, snpset : hashable
        Keys of the group and variant-set fields of ``x``.
    back_dict : dict
        ``{group: {variant set: set of match labels}}``.

    Returns
    -------
    set or pd.NA
        The stored set (the same object, not a copy) when it exists and is not
        empty, otherwise ``pd.NA``.
    """
    group = x[group_key]
    if group not in back_dict:
        return pd.NA

    matched = back_dict[group].get(x[snpset])
    # None (unknown variant set) and an empty set both mean "nothing matched"
    if matched:
        return matched
    return pd.NA
|