gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/bd_common_data.py +6 -3
- gwaslab/bd_download.py +9 -9
- gwaslab/bd_get_hapmap3.py +43 -9
- gwaslab/data/formatbook.json +722 -721
- gwaslab/g_Log.py +22 -5
- gwaslab/g_Sumstats.py +110 -163
- gwaslab/g_SumstatsPair.py +76 -25
- gwaslab/g_SumstatsT.py +2 -2
- gwaslab/g_Sumstats_summary.py +3 -3
- gwaslab/g_version.py +10 -10
- gwaslab/hm_casting.py +36 -17
- gwaslab/hm_harmonize_sumstats.py +354 -221
- gwaslab/hm_rsid_to_chrpos.py +1 -1
- gwaslab/io_preformat_input.py +49 -43
- gwaslab/io_read_ldsc.py +49 -1
- gwaslab/io_to_formats.py +428 -295
- gwaslab/ldsc_irwls.py +198 -0
- gwaslab/ldsc_jackknife.py +514 -0
- gwaslab/ldsc_ldscore.py +417 -0
- gwaslab/ldsc_parse.py +294 -0
- gwaslab/ldsc_regressions.py +747 -0
- gwaslab/ldsc_sumstats.py +629 -0
- gwaslab/qc_check_datatype.py +3 -3
- gwaslab/qc_fix_sumstats.py +891 -778
- gwaslab/util_ex_calculate_ldmatrix.py +31 -13
- gwaslab/util_ex_gwascatalog.py +25 -25
- gwaslab/util_ex_ldproxyfinder.py +10 -10
- gwaslab/util_ex_ldsc.py +189 -0
- gwaslab/util_ex_process_ref.py +3 -3
- gwaslab/util_ex_run_coloc.py +26 -4
- gwaslab/util_in_calculate_gc.py +6 -6
- gwaslab/util_in_calculate_power.py +42 -43
- gwaslab/util_in_convert_h2.py +8 -8
- gwaslab/util_in_fill_data.py +30 -30
- gwaslab/util_in_filter_value.py +201 -74
- gwaslab/util_in_get_density.py +10 -10
- gwaslab/util_in_get_sig.py +445 -71
- gwaslab/viz_aux_annotate_plot.py +12 -12
- gwaslab/viz_aux_quickfix.py +42 -37
- gwaslab/viz_aux_reposition_text.py +10 -7
- gwaslab/viz_aux_save_figure.py +18 -8
- gwaslab/viz_plot_compare_af.py +32 -33
- gwaslab/viz_plot_compare_effect.py +63 -71
- gwaslab/viz_plot_miamiplot2.py +34 -26
- gwaslab/viz_plot_mqqplot.py +126 -75
- gwaslab/viz_plot_qqplot.py +11 -8
- gwaslab/viz_plot_regionalplot.py +36 -33
- gwaslab/viz_plot_rg_heatmap.py +28 -26
- gwaslab/viz_plot_stackedregional.py +40 -21
- gwaslab/viz_plot_trumpetplot.py +65 -61
- gwaslab-3.4.39.dist-info/LICENSE +674 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
- gwaslab-3.4.39.dist-info/RECORD +80 -0
- gwaslab-3.4.37.dist-info/RECORD +0 -72
- /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
gwaslab/util_in_get_sig.py
CHANGED
|
@@ -13,8 +13,9 @@ from gwaslab.bd_common_data import get_chr_to_NC
|
|
|
13
13
|
from gwaslab.bd_common_data import gtf_to_protein_coding
|
|
14
14
|
from gwaslab.bd_download import check_and_download
|
|
15
15
|
from gwaslab.util_ex_gwascatalog import gwascatalog_trait
|
|
16
|
-
|
|
17
|
-
|
|
16
|
+
from gwaslab.qc_fix_sumstats import check_dataframe_shape
|
|
17
|
+
from gwaslab.qc_fix_sumstats import start_to
|
|
18
|
+
from gwaslab.qc_fix_sumstats import finished
|
|
18
19
|
# getsig
|
|
19
20
|
# closest_gene
|
|
20
21
|
# annogene
|
|
@@ -39,11 +40,27 @@ def getsig(insumstats,
|
|
|
39
40
|
"""
|
|
40
41
|
Extract the lead variants using a sliding window. P or MLOG10P will be used and converted to SCALEDP for sorting.
|
|
41
42
|
"""
|
|
43
|
+
##start function with col checking##########################################################
|
|
44
|
+
_start_line = "extract lead variants"
|
|
45
|
+
_end_line = "extracting lead variants"
|
|
46
|
+
_start_cols = [chrom,pos]
|
|
47
|
+
_start_function = ".get_lead()"
|
|
48
|
+
_must_args ={}
|
|
49
|
+
|
|
50
|
+
is_enough_info = start_to(sumstats=insumstats,
|
|
51
|
+
log=log,
|
|
52
|
+
verbose=verbose,
|
|
53
|
+
start_line=_start_line,
|
|
54
|
+
end_line=_end_line,
|
|
55
|
+
start_cols=_start_cols,
|
|
56
|
+
start_function=_start_function,
|
|
57
|
+
**_must_args)
|
|
58
|
+
if is_enough_info == False: return None
|
|
59
|
+
############################################################################################
|
|
42
60
|
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
if verbose: log.write(" -Sliding window size:", str(windowsizekb) ," kb")
|
|
61
|
+
log.write(" -Processing "+str(len(insumstats))+" variants...", verbose=verbose)
|
|
62
|
+
log.write(" -Significance threshold :", sig_level, verbose=verbose)
|
|
63
|
+
log.write(" -Sliding window size:", str(windowsizekb) ," kb", verbose=verbose)
|
|
47
64
|
|
|
48
65
|
#load data
|
|
49
66
|
sumstats=insumstats.loc[~insumstats[id].isna(),:].copy()
|
|
@@ -73,12 +90,12 @@ def getsig(insumstats,
|
|
|
73
90
|
sumstats[p] = pd.to_numeric(sumstats[p], errors='coerce')
|
|
74
91
|
sumstats_sig = sumstats.loc[sumstats[p]<sig_level,:].copy()
|
|
75
92
|
sumstats_sig.loc[:,"__SCALEDP"] = pd.to_numeric(sumstats_sig[p], errors='coerce')
|
|
76
|
-
|
|
93
|
+
log.write(" -Found "+str(len(sumstats_sig))+" significant variants in total...", verbose=verbose)
|
|
77
94
|
|
|
78
95
|
#sort the coordinates
|
|
79
96
|
sumstats_sig = sumstats_sig.sort_values([chrom,pos])
|
|
80
97
|
if sumstats_sig is None:
|
|
81
|
-
|
|
98
|
+
log.write(" -No lead snps at given significance threshold!", verbose=verbose)
|
|
82
99
|
return None
|
|
83
100
|
|
|
84
101
|
#init
|
|
@@ -131,7 +148,7 @@ def getsig(insumstats,
|
|
|
131
148
|
sig_index_list.append(current_sig_index)
|
|
132
149
|
continue
|
|
133
150
|
|
|
134
|
-
|
|
151
|
+
log.write(" -Identified "+str(len(sig_index_list))+" lead variants!", verbose=verbose)
|
|
135
152
|
|
|
136
153
|
# drop internal __SCALEDP
|
|
137
154
|
sumstats_sig = sumstats_sig.drop("__SCALEDP",axis=1)
|
|
@@ -141,8 +158,8 @@ def getsig(insumstats,
|
|
|
141
158
|
|
|
142
159
|
# annotate GENENAME
|
|
143
160
|
if anno is True and len(output)>0:
|
|
144
|
-
|
|
145
|
-
|
|
161
|
+
log.write(" -Annotating variants using references:{}".format(source), verbose=verbose)
|
|
162
|
+
log.write(" -Annotating variants using references based on genome build:{}".format(build), verbose=verbose)
|
|
146
163
|
|
|
147
164
|
output = annogene(
|
|
148
165
|
output,
|
|
@@ -155,11 +172,9 @@ def getsig(insumstats,
|
|
|
155
172
|
source=source,
|
|
156
173
|
verbose=verbose)
|
|
157
174
|
|
|
158
|
-
# Finishing
|
|
159
|
-
if verbose: log.write("Finished extracting lead variants successfully!")
|
|
160
175
|
# drop internal id
|
|
161
176
|
output = output.drop("__ID",axis=1)
|
|
162
|
-
|
|
177
|
+
finished(log,verbose,_end_line)
|
|
163
178
|
return output.copy()
|
|
164
179
|
|
|
165
180
|
|
|
@@ -234,13 +249,13 @@ def annogene(
|
|
|
234
249
|
source="ensembl",
|
|
235
250
|
verbose=True):
|
|
236
251
|
|
|
237
|
-
|
|
252
|
+
log.write("Start to annotate variants with nearest gene name(s)...", verbose=verbose)
|
|
238
253
|
output = insumstats.copy()
|
|
239
254
|
|
|
240
255
|
if source == "ensembl":
|
|
241
256
|
if build=="19":
|
|
242
257
|
#data = EnsemblRelease(75)
|
|
243
|
-
|
|
258
|
+
log.write(" -Assigning Gene name using ensembl_hg19_gtf for protein coding genes", verbose=verbose)
|
|
244
259
|
#zcat Homo_sapiens.GRCh37.75.gtf.gz|
|
|
245
260
|
#grep -E 'processed_transcript|protein_coding|_gene'
|
|
246
261
|
#| gzip >Homo_sapiens.GRCh37.75.processed.chr.gtf.gz
|
|
@@ -260,7 +275,7 @@ def annogene(
|
|
|
260
275
|
list(output.apply(lambda x:closest_gene(x,data=data,chrom=chrom,pos=pos,source=source), axis=1)),
|
|
261
276
|
index=output.index).values
|
|
262
277
|
elif build=="38":
|
|
263
|
-
|
|
278
|
+
log.write(" -Assigning Gene name using ensembl_hg38_gtf for protein coding genes", verbose=verbose)
|
|
264
279
|
#gtf_path = check_and_download("ensembl_hg38_gtf_protein_coding")
|
|
265
280
|
gtf_path = check_and_download("ensembl_hg38_gtf")
|
|
266
281
|
gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
|
|
@@ -277,7 +292,7 @@ def annogene(
|
|
|
277
292
|
|
|
278
293
|
if source == "refseq":
|
|
279
294
|
if build=="19":
|
|
280
|
-
|
|
295
|
+
log.write(" -Assigning Gene name using NCBI refseq latest GRCh37 for protein coding genes", verbose=verbose)
|
|
281
296
|
#gtf_path = check_and_download("refseq_hg19_gtf_protein_coding")
|
|
282
297
|
gtf_path = check_and_download("refseq_hg19_gtf")
|
|
283
298
|
gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
|
|
@@ -292,7 +307,7 @@ def annogene(
|
|
|
292
307
|
list(output.apply(lambda x:closest_gene(x,data=data,chrom=chrom,pos=pos,source=source,build=build), axis=1)),
|
|
293
308
|
index=output.index).values
|
|
294
309
|
elif build=="38":
|
|
295
|
-
|
|
310
|
+
log.write(" -Assigning Gene name using NCBI refseq latest GRCh38 for protein coding genes", verbose=verbose)
|
|
296
311
|
#gtf_path = check_and_download("refseq_hg38_gtf_protein_coding")
|
|
297
312
|
gtf_path = check_and_download("refseq_hg38_gtf")
|
|
298
313
|
gtf_path = gtf_to_protein_coding(gtf_path,log=log,verbose=verbose)
|
|
@@ -306,7 +321,7 @@ def annogene(
|
|
|
306
321
|
output.loc[:,["LOCATION","GENE"]] = pd.DataFrame(
|
|
307
322
|
list(output.apply(lambda x:closest_gene(x,data=data,chrom=chrom,pos=pos,source=source,build=build), axis=1)),
|
|
308
323
|
index=output.index).values
|
|
309
|
-
|
|
324
|
+
log.write("Finished annotating variants with nearest gene name(s) successfully!", verbose=verbose)
|
|
310
325
|
return output
|
|
311
326
|
|
|
312
327
|
def getnovel(insumstats,
|
|
@@ -318,6 +333,8 @@ def getnovel(insumstats,
|
|
|
318
333
|
known=False,
|
|
319
334
|
efo=False,
|
|
320
335
|
only_novel=False,
|
|
336
|
+
group_key=None,
|
|
337
|
+
if_get_lead = True,
|
|
321
338
|
windowsizekb_for_novel=1000,
|
|
322
339
|
windowsizekb=500,
|
|
323
340
|
sig_level=5e-8,
|
|
@@ -329,37 +346,48 @@ def getnovel(insumstats,
|
|
|
329
346
|
gwascatalog_source="NCBI",
|
|
330
347
|
output_known=False,
|
|
331
348
|
verbose=True):
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
for i in range(7):
|
|
339
|
-
if insumstats["POS"].max()*10 > big_number:
|
|
340
|
-
big_number = int(big_number * 10)
|
|
341
|
-
else:
|
|
342
|
-
break
|
|
349
|
+
##start function with col checking##########################################################
|
|
350
|
+
_start_line = "check if lead variants are known"
|
|
351
|
+
_end_line = "checking if lead variants are known"
|
|
352
|
+
_start_cols = [chrom,pos]
|
|
353
|
+
_start_function = ".get_novel()"
|
|
354
|
+
_must_args ={}
|
|
343
355
|
|
|
344
|
-
|
|
345
|
-
|
|
356
|
+
is_enough_info = start_to(sumstats=insumstats,
|
|
357
|
+
log=log,
|
|
358
|
+
verbose=verbose,
|
|
359
|
+
start_line=_start_line,
|
|
360
|
+
end_line=_end_line,
|
|
361
|
+
start_cols=_start_cols,
|
|
362
|
+
start_function=_start_function,
|
|
363
|
+
**_must_args)
|
|
364
|
+
if is_enough_info == False: return None
|
|
365
|
+
############################################################################################
|
|
346
366
|
|
|
367
|
+
if if_get_lead == True:
|
|
368
|
+
allsig = getsig(insumstats=insumstats,
|
|
369
|
+
id=id,chrom=chrom,pos=pos,p=p,use_p=use_p,windowsizekb=windowsizekb,sig_level=sig_level,log=log,
|
|
370
|
+
xymt=xymt,anno=anno,build=build, source=source,verbose=verbose)
|
|
371
|
+
else:
|
|
372
|
+
allsig = insumstats.copy()
|
|
373
|
+
|
|
374
|
+
############################################################################################
|
|
347
375
|
knownsig = pd.DataFrame()
|
|
348
376
|
if efo != False:
|
|
349
377
|
if type(efo) is not list:
|
|
350
|
-
|
|
378
|
+
log.write("Start to retrieve data using EFO: {}...".format(efo), verbose=verbose)
|
|
351
379
|
known_Sumstats = gwascatalog_trait(efo,source=gwascatalog_source,sig_level=sig_level,verbose=verbose,log=log)
|
|
352
380
|
knownsig = known_Sumstats.data.copy()
|
|
353
381
|
else:
|
|
354
382
|
knownsig=pd.DataFrame()
|
|
355
|
-
|
|
383
|
+
log.write("Start to retrieve data using {} EFOs: {}...".format(len(efo),efo), verbose=verbose)
|
|
356
384
|
for single_efo in efo:
|
|
357
385
|
known_Sumstats = gwascatalog_trait(single_efo,source=gwascatalog_source,sig_level=sig_level,verbose=verbose,log=log)
|
|
358
386
|
known_Sumstats.data["EFOID"] = single_efo
|
|
359
387
|
knownsig = pd.concat([known_Sumstats.data, knownsig],ignore_index=True)
|
|
360
388
|
knownsig["CHR"] = knownsig["CHR"].astype("Int64")
|
|
361
389
|
knownsig["POS"] = knownsig["POS"].astype("Int64")
|
|
362
|
-
|
|
390
|
+
log.write(" -Retrieved {} associations from GWAS catalog.".format(len(knownsig)), verbose=verbose)
|
|
363
391
|
if type(known) is pd.DataFrame:
|
|
364
392
|
knownsig_2 = known.copy()
|
|
365
393
|
knownsig = pd.concat([knownsig, knownsig_2],ignore_index=True)
|
|
@@ -374,19 +402,230 @@ def getnovel(insumstats,
|
|
|
374
402
|
knownsig["POS"] = knownsig["POS"].astype("Int64")
|
|
375
403
|
if "SNPID" not in knownsig.columns:
|
|
376
404
|
knownsig["SNPID"] =knownsig["CHR"].astype("string") + ":" + knownsig["POS"].astype("string")
|
|
405
|
+
|
|
377
406
|
if len(knownsig)<1:
|
|
378
407
|
raise ValueError("Please input a dataframe of known loci or valid efo code")
|
|
379
|
-
|
|
380
|
-
# create helper column TCHR+POS for knownsig
|
|
381
|
-
knownsig["TCHR+POS"]=knownsig[chrom]*big_number + knownsig[pos]
|
|
382
|
-
|
|
383
|
-
if verbose: log.write(" -Lead variants in known loci:",len(knownsig))
|
|
384
|
-
if verbose: log.write(" -Checking the minimum distance between identified lead variants and provided known variants...")
|
|
385
408
|
|
|
409
|
+
if group_key is not None:
|
|
410
|
+
if (group_key not in allsig.columns) or (group_key not in knownsig.columns):
|
|
411
|
+
raise ValueError("Please check if group_key is in both sumstats and list of known associations.")
|
|
412
|
+
|
|
413
|
+
# create helper column TCHR+POS for knownsig and all sig
|
|
414
|
+
############################################################################################
|
|
415
|
+
maxpos = insumstats["POS"].max()
|
|
416
|
+
big_number = determine_big_number(maxpos)
|
|
417
|
+
knownsig = add_tchr_pos(knownsig, chrom, pos, big_number)
|
|
418
|
+
allsig = add_tchr_pos(allsig, chrom, pos, big_number)
|
|
419
|
+
############################################################################################
|
|
386
420
|
#sorting
|
|
387
421
|
allsig = allsig.sort_values(by="TCHR+POS",ignore_index=True)
|
|
388
422
|
knownsig = knownsig.sort_values(by="TCHR+POS",ignore_index=True)
|
|
423
|
+
############################################################################################
|
|
424
|
+
if group_key is not None:
|
|
425
|
+
number_of_groups_allsig = allsig[group_key].nunique()
|
|
426
|
+
number_of_groups_known = knownsig[group_key].nunique()
|
|
427
|
+
log.write(" -Number of groups in sumstats:{}".format(number_of_groups_allsig), verbose=verbose)
|
|
428
|
+
log.write(" -Number of groups in reference:{}".format(number_of_groups_known), verbose=verbose)
|
|
429
|
+
|
|
430
|
+
log.write(" -Lead variants in known loci:",len(knownsig), verbose=verbose)
|
|
431
|
+
log.write(" -Checking the minimum distance between identified lead variants and provided known variants...", verbose=verbose)
|
|
432
|
+
|
|
433
|
+
############################################################################################
|
|
434
|
+
if group_key is None:
|
|
435
|
+
# get distance
|
|
436
|
+
allsig = determine_distance(allsig, knownsig)
|
|
437
|
+
# get other info
|
|
438
|
+
allsig = fill_meta_info_for_known(allsig, knownsig)
|
|
439
|
+
############################################################################################
|
|
440
|
+
# determine if novel
|
|
441
|
+
allsig = determine_novel(allsig, windowsizekb_for_novel)
|
|
442
|
+
# determine location
|
|
443
|
+
allsig = determine_location(allsig)
|
|
444
|
+
# if not on same chromosome, distance set to pd.NA
|
|
445
|
+
allsig = determine_if_same_chromosome(allsig, knownsig, maxpos)
|
|
446
|
+
############################################################################################
|
|
447
|
+
else:
|
|
448
|
+
#groups1 = set(allsig[group_key].unique())
|
|
449
|
+
#groups2 = set(knownsig[group_key].unique())
|
|
450
|
+
#common_group = groups1.intersection(groups2)
|
|
451
|
+
|
|
452
|
+
#allsig_no_group = allsig.loc[~allsig[group_key].isin(common_group),:].copy()
|
|
453
|
+
allsig_group = pd.DataFrame()
|
|
454
|
+
|
|
455
|
+
for key in allsig[group_key].unique():
|
|
456
|
+
allsig_single_group = allsig.loc[allsig[group_key]==key,:].copy()
|
|
457
|
+
knownsig_single_group = knownsig.loc[knownsig[group_key]==key,:].copy()
|
|
458
|
+
|
|
459
|
+
#if len(allsig_single_group) >0 and len(knownsig_single_group) >0:
|
|
460
|
+
allsig_single_group = determine_distance(allsig_single_group, knownsig_single_group)
|
|
461
|
+
# get other info
|
|
462
|
+
allsig_single_group = fill_meta_info_for_known(allsig_single_group, knownsig_single_group)
|
|
463
|
+
|
|
464
|
+
# determine if novel
|
|
465
|
+
allsig_single_group = determine_novel(allsig_single_group, windowsizekb_for_novel)
|
|
466
|
+
|
|
467
|
+
# determine location
|
|
468
|
+
allsig_single_group = determine_location(allsig_single_group)
|
|
469
|
+
|
|
470
|
+
# if not on same chromosome, distance set to pd.NA
|
|
471
|
+
allsig_single_group = determine_if_same_chromosome(allsig_single_group, knownsig_single_group, maxpos)
|
|
472
|
+
|
|
473
|
+
allsig_group = pd.concat([allsig_group, allsig_single_group], ignore_index=True)
|
|
474
|
+
|
|
475
|
+
allsig = allsig_group
|
|
476
|
+
#pd.concat([allsig_no_group, allsig_group], ignore_index=True)
|
|
477
|
+
|
|
478
|
+
# drop helper column TCHR+POS
|
|
479
|
+
allsig = allsig.drop(["TCHR+POS"], axis=1)
|
|
480
|
+
|
|
481
|
+
try:
|
|
482
|
+
allsig = allsig.where(~pd.isna(allsig), pd.NA)
|
|
483
|
+
except:
|
|
484
|
+
pass
|
|
485
|
+
|
|
486
|
+
log.write(" -Identified ",len(allsig)-sum(allsig["NOVEL"])," known vairants in current sumstats...", verbose=verbose)
|
|
487
|
+
log.write(" -Identified ",sum(allsig["NOVEL"])," novel vairants in current sumstats...", verbose=verbose)
|
|
488
|
+
|
|
489
|
+
finished(log,verbose,_end_line)
|
|
490
|
+
|
|
491
|
+
# how to return
|
|
492
|
+
if only_novel is True:
|
|
493
|
+
if output_known is True:
|
|
494
|
+
return allsig.loc[allsig["NOVEL"],:], knownsig
|
|
495
|
+
else:
|
|
496
|
+
return allsig.loc[allsig["NOVEL"],:]
|
|
497
|
+
else:
|
|
498
|
+
if output_known is True:
|
|
499
|
+
return allsig, knownsig
|
|
500
|
+
else:
|
|
501
|
+
return allsig
|
|
502
|
+
##################################################################################################################################################################################################
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
def _check_cis(insumstats,
               id,
               chrom,
               pos,
               p,
               use_p=False,
               known=False,
               group_key=None,
               if_get_lead = False,
               windowsizekb=500,
               sig_level=5e-8,
               log=Log(),
               xymt=["X","Y","MT"],
               anno=False,
               build="19",
               source="ensembl",
               verbose=True):
    """
    Label every variant as "Cis", "Trans" or "NoReference" relative to the reference region of its group.

    Parameters
    ----------
    insumstats : pandas.DataFrame
        Variants to classify. Must pass the `start_to` column check for `chrom`, `pos` and `group_key`.
    id, chrom, pos, p : str
        Column names in `insumstats`; forwarded to `getsig` when `if_get_lead` is True.
    use_p, sig_level, xymt, anno, build, source :
        Forwarded unchanged to `getsig`; only used when `if_get_lead` is True.
    known : pandas.DataFrame or str
        Reference regions, or a path to a whitespace-delimited file holding them.
        Must provide the columns "CHR", "START", "END" and `group_key`.
    group_key : str
        Column present in both the variants and the reference that links a variant to its region.
    if_get_lead : bool
        When True, lead variants are extracted with `getsig` first; otherwise a copy of `insumstats` is used.
    windowsizekb : int
        Flank (in kb) added to both START and END of a region; also the sliding window passed to `getsig`.
    log : Log
        Logger receiving progress messages.
    verbose : bool
        Forwarded to every `log.write` call.

    Returns
    -------
    pandas.DataFrame or None
        The variants with an added "CIS/TRANS" column. None when `start_to` reports missing
        information or when `getsig` returns None.

    Raises
    ------
    ValueError
        If no reference rows could be loaded, or `group_key` is not a column of the reference.
    """
    ##start function with col checking##########################################################
    _start_line = "check if variants are in cis or trans regions"
    _end_line = "checking if variants are in cis or trans regions"
    _start_cols = [chrom,pos, group_key]
    _start_function = ".check_cis()"
    _must_args ={}

    is_enough_info = start_to(sumstats=insumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              **_must_args)
    if is_enough_info == False:
        return None
    ############################################################################################

    if if_get_lead == True:
        allsig = getsig(insumstats=insumstats,
                        id=id,chrom=chrom,pos=pos,p=p,use_p=use_p,windowsizekb=windowsizekb,sig_level=sig_level,log=log,
                        xymt=xymt,anno=anno,build=build, source=source,verbose=verbose)
        # getsig has `return None` paths; stop here instead of failing later on None[group_key].
        if allsig is None:
            log.write(" -No lead variants available; skipping cis/trans check.", verbose=verbose)
            return None
    else:
        allsig = insumstats.copy()

    ############################################################################################
    # Load the reference regions from a dataframe or from a whitespace-delimited file.
    if isinstance(known, pd.DataFrame):
        knownsig = known.copy().reset_index(drop=True)
    elif isinstance(known, str):
        # raw string: "\s" is not a valid escape sequence in a normal string literal
        knownsig = pd.read_csv(known,sep=r"\s+",dtype={"CHR":"Int64","POS":"Int64"})
    else:
        knownsig = pd.DataFrame()

    if len(knownsig)<1:
        raise ValueError("Please input a dataframe of gene list with GENE, CHR, START, END.")

    # nullable integers so that missing coordinates do not turn the columns into floats
    for coordinate_col in ("CHR", "START", "END"):
        knownsig[coordinate_col] = knownsig[coordinate_col].astype("Int64")

    if group_key is not None:
        if group_key not in knownsig.columns:
            raise ValueError("Please check if group_key is in both sumstats and list of known associations.")

    ############################################################################################
    if group_key is not None:
        number_of_groups_allsig = allsig[group_key].nunique()
        number_of_groups_known = knownsig[group_key].nunique()
        log.write(" -Number of groups in sumstats:{}".format(number_of_groups_allsig), verbose=verbose)
        log.write(" -Number of groups in reference:{}".format(number_of_groups_known), verbose=verbose)

    log.write(" -Checking if variants in cis/trans regions grouped by {}...".format(group_key), verbose=verbose)
    log.write(" -Window size in kb adding to start and end: {}...".format(windowsizekb), verbose=verbose)
    ############################################################################################
    # group -> (CHR, START, END); when a group occurs more than once the last row wins
    reference_dict = dict(zip(knownsig[group_key],
                              zip(knownsig["CHR"], knownsig["START"], knownsig["END"])))
    ############################################################################################
    no_reference_available = allsig.loc[~allsig[group_key].isin(list(reference_dict)), group_key]
    if len(no_reference_available)>0:
        # str() so that non-string group labels can be joined as well
        log.write(" -Groups not in reference: {}".format( ",".join(map(str, no_reference_available.unique()))), verbose=verbose)

    # NOTE: determine_if_cis is called without column names, i.e. it is used with its default coordinate columns.
    allsig["CIS/TRANS"] = allsig.apply(lambda x: determine_if_cis(x, group_key,windowsizekb, reference_dict), axis=1)

    # Best effort: unify missing values to pd.NA; a failure here must not discard the result.
    try:
        allsig = allsig.where(~pd.isna(allsig), pd.NA)
    except Exception as err:
        log.write(" -Could not convert missing values to pd.NA: {}".format(err), verbose=verbose)

    labels = allsig["CIS/TRANS"]
    for label in ("Cis", "Trans", "NoReference"):
        log.write (" -Number of {} variants: {}".format(label, (labels == label).sum()),verbose=verbose)

    finished(log,verbose,_end_line)

    return allsig
|
|
611
|
+
|
|
612
|
+
###################################################################################################################################################################################################
|
|
613
|
+
|
|
614
|
+
|
|
615
|
+
def determine_big_number(maxpos, big_number = 1000000000):
    """
    Return a multiplier that is at least ten times the largest position.

    Starting from `big_number`, the value is multiplied by 10 (at most seven times)
    until `maxpos*10` no longer exceeds it.

    Parameters
    ----------
    maxpos : number
        Largest base-pair position. None, NaN or pd.NA (e.g. the max of an empty or
        all-missing column) leaves `big_number` unchanged.
    big_number : int
        Initial multiplier.

    Returns
    -------
    int
        The (possibly enlarged) multiplier.
    """
    # A missing maximum cannot be compared; pd.NA and None would raise a TypeError below.
    if maxpos is None or pd.isna(maxpos):
        return big_number

    for _ in range(7):
        if maxpos*10 <= big_number:
            break
        big_number = int(big_number * 10)
    return big_number
|
|
622
|
+
|
|
623
|
+
def add_tchr_pos(df, chrom, pos, big_number):
    """
    Add the helper column "TCHR+POS" (chromosome * big_number + position) to `df`.

    The dataframe is modified in place and also returned.
    """
    chromosome_offset = df[chrom]*big_number
    df["TCHR+POS"] = chromosome_offset + df[pos]
    return df
|
|
626
|
+
|
|
627
|
+
def fill_meta_info_for_known(allsig, knownsig):
|
|
628
|
+
if len(allsig)==0 or len(knownsig)==0: return allsig
|
|
390
629
|
if "SNPID" in knownsig.columns:
|
|
391
630
|
knownids=knownsig["SNPID"].values
|
|
392
631
|
if "PUBMEDID" in knownsig.columns:
|
|
@@ -395,12 +634,7 @@ def getnovel(insumstats,
|
|
|
395
634
|
knownauthor=knownsig["AUTHOR"].values
|
|
396
635
|
if "EFOID" in knownsig.columns:
|
|
397
636
|
knownefo=knownsig["EFOID"].values
|
|
398
|
-
|
|
399
|
-
# get distance
|
|
400
|
-
lambda x:np.min(np.abs(knownsig["TCHR+POS"]-x))
|
|
401
|
-
allsig["DISTANCE_TO_KNOWN"] = allsig["TCHR+POS"].apply(lambda x:min(knownsig["TCHR+POS"]-x, key=abs))
|
|
402
|
-
|
|
403
|
-
# get other info
|
|
637
|
+
|
|
404
638
|
if "SNPID" in knownsig.columns:
|
|
405
639
|
allsig["KNOWN_ID"] = allsig["TCHR+POS"].apply(lambda x:knownids[np.argmin(np.abs(knownsig["TCHR+POS"]-x))])
|
|
406
640
|
if "PUBMEDID" in knownsig.columns:
|
|
@@ -408,20 +642,48 @@ def getnovel(insumstats,
|
|
|
408
642
|
if "AUTHOR" in knownsig.columns:
|
|
409
643
|
allsig["KNOWN_AUTHOR"] = allsig["TCHR+POS"].apply(lambda x:knownauthor[np.argmin(np.abs(knownsig["TCHR+POS"]-x))])
|
|
410
644
|
if "EFOID" in knownsig.columns:
|
|
411
|
-
allsig["KNOWN_EFOID"] = allsig["TCHR+POS"].apply(lambda x:knownefo[np.argmin(np.abs(knownsig["TCHR+POS"]-x))])
|
|
645
|
+
allsig["KNOWN_EFOID"] = allsig["TCHR+POS"].apply(lambda x:knownefo[np.argmin(np.abs(knownsig["TCHR+POS"]-x))])
|
|
646
|
+
return allsig
|
|
647
|
+
|
|
648
|
+
def determine_if_cis(x, group_key,windowsizekb, reference_dict, chrom="CHR", pos="POS"):
    """
    Classify one variant row against the reference region of its group.

    Parameters
    ----------
    x : mapping (e.g. a dataframe row)
        Must provide `group_key`, `chrom` and `pos`.
    group_key : str
        Key in `x` holding the group label used to look up `reference_dict`.
    windowsizekb : int
        Flank in kb added on both sides of the reference region.
    reference_dict : dict
        Maps a group label to a (chromosome, start, end) tuple.
    chrom, pos : str
        Keys in `x` for chromosome and position; default to "CHR" and "POS".

    Returns
    -------
    str
        "NoReference" if the group has no reference region, "Cis" if the variant is on the
        same chromosome and inside [start - flank, end + flank], otherwise "Trans".
    """
    group = x[group_key]
    if group not in reference_dict:
        return "NoReference"

    ref_chr, ref_start, ref_end = reference_dict[group]
    flank = windowsizekb*1000

    # chromosomes are compared as strings so that 1 and "1" match
    is_same_chr = str(ref_chr) == str(x[chrom])
    is_large_than_start = int(ref_start) - flank <= x[pos]
    is_smaller_than_end = int(ref_end) + flank >= x[pos]

    if is_same_chr and is_large_than_start and is_smaller_than_end:
        return "Cis"
    return "Trans"
|
|
412
660
|
|
|
413
|
-
|
|
661
|
+
def determine_distance(allsig, knownsig):
    """
    Add "DISTANCE_TO_KNOWN": the signed offset (known - variant, on the "TCHR+POS" scale)
    to the closest known variant.

    An empty `allsig` is returned untouched; with an empty `knownsig` the column is filled
    with pd.NA. `allsig` is modified in place and returned.
    """
    if len(allsig) != 0:
        if len(knownsig) == 0:
            allsig["DISTANCE_TO_KNOWN"] = pd.NA
        else:
            def _closest_signed_offset(coordinate):
                # smallest absolute offset wins; the sign of that offset is kept
                return min(knownsig["TCHR+POS"]-coordinate, key=abs)

            allsig["DISTANCE_TO_KNOWN"] = allsig["TCHR+POS"].apply(_closest_signed_offset)
    return allsig
|
|
669
|
+
|
|
670
|
+
def determine_novel(allsig, windowsizekb_for_novel):
    """
    Add the boolean column "NOVEL" to `allsig`.

    A variant is novel when its absolute "DISTANCE_TO_KNOWN" exceeds
    `windowsizekb_for_novel` kb, or when that distance is missing. An empty dataframe,
    or one without "DISTANCE_TO_KNOWN", is returned unchanged.
    """
    has_rows = len(allsig) != 0
    if not (has_rows and "DISTANCE_TO_KNOWN" in allsig.columns):
        return allsig

    threshold_bp = windowsizekb_for_novel*1000
    allsig["NOVEL"] = allsig["DISTANCE_TO_KNOWN"].abs() > threshold_bp
    # no known variant to compare with -> treated as novel
    without_distance = allsig["DISTANCE_TO_KNOWN"].isna()
    allsig.loc[without_distance, "NOVEL"] = True
    return allsig
|
|
676
|
+
|
|
677
|
+
def determine_location(allsig):
    """
    Add "LOCATION_OF_KNOWN" describing the sign of "DISTANCE_TO_KNOWN".

    Values: "Same" (distance == 0), "Upstream" (distance > 0), "Downstream" (distance < 0)
    and "NoReference" for everything else (e.g. a missing distance).
    `allsig` is modified in place and returned.
    """
    allsig["LOCATION_OF_KNOWN"]="NoReference"

    # The distance column can be absent (e.g. for an empty dataframe);
    # every row then simply keeps "NoReference" instead of raising a KeyError.
    if "DISTANCE_TO_KNOWN" not in allsig.columns:
        return allsig

    distance = allsig["DISTANCE_TO_KNOWN"]
    allsig.loc[distance == 0, "LOCATION_OF_KNOWN"] = "Same"
    allsig.loc[distance > 0, "LOCATION_OF_KNOWN"] = "Upstream"
    allsig.loc[distance < 0, "LOCATION_OF_KNOWN"] = "Downstream"
    return allsig
|
|
421
683
|
|
|
422
|
-
|
|
423
|
-
if sum(allsig["DISTANCE_TO_KNOWN"].abs() >
|
|
424
|
-
not_on_same_chromosome = allsig["DISTANCE_TO_KNOWN"].abs() >
|
|
684
|
+
def determine_if_same_chromosome(allsig, knownsig, maxpos):
|
|
685
|
+
if sum(allsig["DISTANCE_TO_KNOWN"].abs() > maxpos)>0:
|
|
686
|
+
not_on_same_chromosome = allsig["DISTANCE_TO_KNOWN"].abs() > maxpos
|
|
425
687
|
allsig.loc[ not_on_same_chromosome ,"DISTANCE_TO_KNOWN"] = pd.NA
|
|
426
688
|
allsig.loc[ not_on_same_chromosome ,"LOCATION_OF_KNOWN"] = "NoneOnThisChr"
|
|
427
689
|
if "SNPID" in knownsig.columns:
|
|
@@ -432,23 +694,135 @@ def getnovel(insumstats,
|
|
|
432
694
|
allsig.loc[ not_on_same_chromosome ,"KNOWN_AUTHOR"] = pd.NA
|
|
433
695
|
if "EFOID" in knownsig.columns:
|
|
434
696
|
allsig.loc[ not_on_same_chromosome ,"KNOWN_EFOID"] = pd.NA
|
|
697
|
+
return allsig
|
|
435
698
|
|
|
436
|
-
|
|
437
|
-
|
|
699
|
+
def _check_novel_set(insumstats,
                     id,
                     chrom,
                     pos,
                     p,
                     use_p=False,
                     known=False,
                     group_key=None,
                     snpset="SNPSET",
                     snpid="SNPID",
                     if_get_lead = False,
                     windowsizekb=500,
                     sig_level=5e-8,
                     log=Log(),
                     xymt=["X","Y","MT"],
                     anno=False,
                     build="19",
                     source="ensembl",
                     verbose=True):
    """
    Check whether variant sets overlap with the variant sets of a reference.

    Parameters
    ----------
    insumstats : pandas.DataFrame
        Variants to check. Must pass the `start_to` column check for `chrom`, `pos` and
        `group_key`; the columns `snpset` and `snpid` are read as well.
    id, chrom, pos, p : str
        Column names in `insumstats`; forwarded to `getsig` when `if_get_lead` is True.
    use_p, windowsizekb, sig_level, xymt, anno, build, source :
        Forwarded unchanged to `getsig`; only used when `if_get_lead` is True.
    known : pandas.DataFrame or str
        Reference variant sets, or a path to a whitespace-delimited file holding them.
        Must provide the columns `group_key`, `snpset` and `snpid`.
    group_key : str
        Column linking variants and reference sets (sets are only compared within a group).
    snpset, snpid : str
        Columns holding the set label and the variant ID.
    if_get_lead : bool
        When True, lead variants are extracted with `getsig` first; otherwise a copy of `insumstats` is used.
    log : Log
        Logger receiving progress messages.
    verbose : bool
        Forwarded to every `log.write` call.

    Returns
    -------
    pandas.DataFrame or None
        The variants with three added columns: "KNOWN_SET" and "KNOWN_VARIANT" (the reference
        set containing the variant and the variant ID, pd.NA if none) and "KNOWN_SET_VARIANT"
        (all "group-KNOWN_SET-KNOWN_VARIANT" hits of the variant's own set, pd.NA if none).
        None when `start_to` reports missing information or `getsig` returns None.

    Raises
    ------
    ValueError
        If no reference rows could be loaded, or `group_key` is not a column of the reference.
    """
    ##start function with col checking##########################################################
    _start_line = "check if variant sets are overlapping with those in reference file"
    _end_line = "checking if variant sets are overlapping with those in reference file"
    _start_cols = [chrom,pos, group_key]
    # was ".check_cis()", copied over from _check_cis
    _start_function = ".check_novel_set()"
    _must_args ={}

    is_enough_info = start_to(sumstats=insumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              **_must_args)
    if is_enough_info == False:
        return None
    ############################################################################################

    if if_get_lead == True:
        allsig = getsig(insumstats=insumstats,
                        id=id,chrom=chrom,pos=pos,p=p,use_p=use_p,windowsizekb=windowsizekb,sig_level=sig_level,log=log,
                        xymt=xymt,anno=anno,build=build, source=source,verbose=verbose)
        # getsig has `return None` paths; stop here instead of failing later on None[group_key].
        if allsig is None:
            log.write(" -No lead variants available; skipping variant set check.", verbose=verbose)
            return None
    else:
        allsig = insumstats.copy()

    ############################################################################################
    # Load the reference variant sets from a dataframe or from a whitespace-delimited file.
    if isinstance(known, pd.DataFrame):
        knownsig = known.copy().reset_index(drop=True)
    elif isinstance(known, str):
        # raw string: "\s" is not a valid escape sequence in a normal string literal
        knownsig = pd.read_csv(known,sep=r"\s+",dtype={"CHR":"Int64","POS":"Int64"})
    else:
        knownsig = pd.DataFrame()

    if len(knownsig)<1:
        raise ValueError("Please input a dataframe or file of reference variant sets with columns: {}, {}, {}.".format(group_key, snpset, snpid))

    for label_col in (snpid, snpset, group_key):
        knownsig[label_col] = knownsig[label_col].astype("string")

    if group_key is not None:
        if group_key not in knownsig.columns:
            raise ValueError("Please check if group_key is in both sumstats and list of known associations.")

    ############################################################################################
    if group_key is not None:
        number_of_groups_allsig = allsig[group_key].nunique()
        number_of_groups_known = knownsig[group_key].nunique()
        log.write(" -Number of groups in sumstats:{}".format(number_of_groups_allsig), verbose=verbose)
        log.write(" -Number of groups in reference:{}".format(number_of_groups_known), verbose=verbose)

    log.write(" -Checking variant sets grouped by {}...".format(group_key), verbose=verbose)

    ############################################################################################
    # group -> set label -> set of variant IDs
    reference_dict = {}
    for group, set_label, variant in zip(knownsig[group_key], knownsig[snpset], knownsig[snpid]):
        reference_dict.setdefault(group, {}).setdefault(set_label, set()).add(variant)
    ############################################################################################

    no_reference_available = allsig.loc[~allsig[group_key].isin(list(reference_dict)), group_key]
    if len(no_reference_available)>0:
        # report each group once; str() so that non-string group labels can be joined as well
        log.write(" -Groups not in reference: {}".format( ",".join(map(str, no_reference_available.unique()))), verbose=verbose)

    log.write(" -Checking if variants are in reference variant sets...", verbose=verbose)
    known_list = allsig.apply(lambda x: check_overlap(x,snpid, group_key,reference_dict), axis=1)

    allsig["KNOWN_SET"] = known_list.str[0]
    allsig["KNOWN_VARIANT"] = known_list.str[1]

    # group -> own set label -> all "group-KNOWN_SET-KNOWN_VARIANT" hits of that set (single pass over the rows)
    back_dict={}
    for group, set_label, known_set, known_variant in zip(allsig[group_key], allsig[snpset],
                                                          allsig["KNOWN_SET"], allsig["KNOWN_VARIANT"]):
        hits = back_dict.setdefault(group, {}).setdefault(set_label, set())
        # rows with a missing group or set label never contribute hits
        if pd.isna(known_set) or pd.isna(group) or pd.isna(set_label):
            continue
        hits.add("{}-{}-{}".format(group, known_set, known_variant))

    allsig["KNOWN_SET_VARIANT"] = allsig.apply(lambda x: assign_set_variant(x,group_key,snpset,back_dict), axis=1)

    finished(log,verbose,_end_line)

    return allsig
|
|
815
|
+
|
|
816
|
+
def check_overlap(x,snpid, group_key,reference_dict):
    """
    Find the reference set of the row's group that contains the row's variant.

    `reference_dict` maps group -> set label -> collection of variant IDs.
    Returns (set label, variant ID) for the first set containing `x[snpid]`,
    or (pd.NA, pd.NA) when the group is unknown or no set contains the variant.
    """
    sets_of_group = reference_dict.get(x[group_key], {})
    for set_label in sets_of_group:
        if x[snpid] in sets_of_group[set_label]:
            return set_label, x[snpid]
    return pd.NA, pd.NA
|
|
822
|
+
|
|
823
|
+
def assign_set_variant(x,group_key,snpset,back_dict):
    """
    Look up the hits recorded for the row's (group, set) pair.

    `back_dict` maps group -> set label -> set of hits. Returns that set when it exists
    and is non-empty, otherwise pd.NA.
    """
    group = x[group_key]
    if group not in back_dict:
        return pd.NA

    sets_of_group = back_dict[group]
    set_label = x[snpset]
    if set_label not in sets_of_group:
        return pd.NA

    hits = sets_of_group[set_label]
    if len(hits) > 0:
        return hits
    return pd.NA
|