gwaslab 3.5.7__py3-none-any.whl → 3.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gwaslab/__init__.py +2 -0
- gwaslab/bd_common_data.py +1 -0
- gwaslab/bd_get_hapmap3.py +0 -1
- gwaslab/data/formatbook.json +78 -0
- gwaslab/data/reference.json +3 -1
- gwaslab/g_Sumstats.py +110 -25
- gwaslab/g_SumstatsMulti.py +287 -0
- gwaslab/g_SumstatsPair.py +101 -16
- gwaslab/g_Sumstats_polars.py +245 -0
- gwaslab/g_headers.py +12 -3
- gwaslab/g_meta.py +124 -47
- gwaslab/g_meta_update.py +48 -0
- gwaslab/g_vchange_status_polars.py +44 -0
- gwaslab/g_version.py +2 -2
- gwaslab/hm_casting.py +169 -110
- gwaslab/hm_casting_polars.py +202 -0
- gwaslab/hm_harmonize_sumstats.py +19 -8
- gwaslab/io_load_ld.py +529 -0
- gwaslab/io_preformat_input.py +11 -0
- gwaslab/io_preformat_input_polars.py +632 -0
- gwaslab/io_process_args.py +25 -1
- gwaslab/io_read_ldsc.py +34 -3
- gwaslab/io_read_pipcs.py +62 -6
- gwaslab/prscs_gigrnd.py +122 -0
- gwaslab/prscs_mcmc_gtb.py +136 -0
- gwaslab/prscs_parse_genet.py +98 -0
- gwaslab/qc_build.py +53 -0
- gwaslab/qc_check_datatype.py +10 -8
- gwaslab/qc_check_datatype_polars.py +128 -0
- gwaslab/qc_fix_sumstats.py +25 -23
- gwaslab/qc_fix_sumstats_polars.py +193 -0
- gwaslab/util_ex_calculate_ldmatrix.py +49 -19
- gwaslab/util_ex_gwascatalog.py +71 -28
- gwaslab/util_ex_infer_ancestry.py +65 -0
- gwaslab/util_ex_ldsc.py +67 -21
- gwaslab/util_ex_match_ldmatrix.py +396 -0
- gwaslab/util_ex_run_2samplemr.py +0 -2
- gwaslab/util_ex_run_ccgwas.py +155 -0
- gwaslab/util_ex_run_coloc.py +1 -1
- gwaslab/util_ex_run_hyprcoloc.py +117 -0
- gwaslab/util_ex_run_magma.py +74 -0
- gwaslab/util_ex_run_mesusie.py +155 -0
- gwaslab/util_ex_run_mtag.py +92 -0
- gwaslab/util_ex_run_prscs.py +85 -0
- gwaslab/util_ex_run_susie.py +40 -9
- gwaslab/util_in_estimate_ess.py +18 -0
- gwaslab/util_in_fill_data.py +20 -1
- gwaslab/util_in_filter_value.py +10 -5
- gwaslab/util_in_get_sig.py +71 -13
- gwaslab/util_in_meta.py +168 -4
- gwaslab/util_in_meta_polars.py +174 -0
- gwaslab/viz_aux_annotate_plot.py +13 -2
- gwaslab/viz_plot_compare_effect.py +87 -23
- gwaslab/viz_plot_credible_sets.py +55 -11
- gwaslab/viz_plot_effect.py +22 -12
- gwaslab/viz_plot_miamiplot2.py +3 -2
- gwaslab/viz_plot_mqqplot.py +94 -84
- gwaslab/viz_plot_qqplot.py +9 -7
- gwaslab/viz_plot_regional2.py +2 -1
- gwaslab/viz_plot_stackedregional.py +4 -1
- {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info}/METADATA +46 -68
- gwaslab-3.6.0.dist-info/RECORD +119 -0
- {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info}/WHEEL +1 -1
- gwaslab-3.5.7.dist-info/RECORD +0 -96
- {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info/licenses}/LICENSE +0 -0
- {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info/licenses}/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info}/top_level.txt +0 -0

gwaslab/util_ex_infer_ancestry.py
ADDED
@@ -0,0 +1,65 @@
+
+import pandas as pd
+from gwaslab.g_Log import Log
+
+def _infer_ancestry(sumstats,
+                    ancestry_af=None,
+                    build="19",
+                    log=Log(),
+                    verbose=True):
+    log.write("Start to infer ancestry based on Fst...", verbose=verbose)
+    ref_af = pd.read_csv(ancestry_af, sep="\t")
+
+    data_af = pd.merge(sumstats[["CHR","POS","EA","NEA","EAF"]] ,ref_af,on=["CHR","POS"],how="inner")
+
+    log.write(f" -Estimating Fst using {len(data_af)} variants...", verbose=verbose)
+
+    is_filp = data_af["EA"] == data_af["ALT"]
+    data_af.loc[is_filp, ["EA","NEA"]] = data_af.loc[is_filp, ["NEA","EA"]]
+    data_af.loc[is_filp, "EAF"] = 1 - data_af.loc[is_filp, "EAF"]
+
+    headers = []
+    for i in ['GBR', 'FIN', 'CHS', 'PUR', 'CDX',
+              'CLM', 'IBS', 'PEL', 'PJL', 'KHV', 'ACB', 'GWD', 'ESN', 'BEB', 'MSL',
+              'STU', 'ITU', 'CEU', 'YRI', 'CHB', 'JPT', 'LWK', 'ASW', 'MXL', 'TSI',
+              'GIH', 'EUR', 'EAS', 'AMR', 'SAS', 'AFR']:
+        headers.append(f"FST_{i}")
+        data_af[f"FST_{i}"] = data_af.apply(lambda x: calculate_fst(x["EAF"], x[i]), axis=1)
+
+    for i,value in data_af[headers].mean().sort_values().items():
+        log.write( f" -{i} : {value}", verbose=verbose)
+
+    closest_ancestry = data_af[headers].mean().sort_values().idxmin()
+
+    log.write(f" -Closest Ancestry: {closest_ancestry.split('_')[1]}", verbose=verbose)
+    log.write("Finished inferring ancestry.", verbose=verbose)
+    return closest_ancestry.split("_")[1]
+
+def calculate_fst(p_1, p_2):
+    # https://bios1140.github.io/understanding-fst-the-fixation-index.html
+    # calculate q1 and q2
+    q_1 = 1 - p_1
+    q_2 = 1 - p_2
+
+    # calculate total allele frequency
+    p_t = (p_1 + p_2)/2
+    q_t = 1 - p_t
+
+    # calculate expected heterozygosity
+    # first calculate expected heterozygosity for the two populations
+    # pop1
+    hs_1 = 2*p_1*q_1
+    # pop2
+    hs_2 = 2*p_2*q_2
+    # then take the mean of this
+    hs = (hs_1 + hs_2)/2
+
+    # next calculate expected heterozygosity for the metapopulations
+    ht = 2*p_t*q_t
+
+    # calculate fst
+    fst = (ht - hs)/ht
+
+    # return output
+    return fst
+
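For reference, calculate_fst above is the textbook fixation-index estimator F_ST = (H_T - H_S) / H_T, where H_S is the mean expected heterozygosity within the two populations and H_T is the expected heterozygosity of the pooled population. A minimal standalone check of the same arithmetic (the example frequencies are invented for illustration, not gwaslab output):

def calculate_fst(p_1, p_2):
    q_1, q_2 = 1 - p_1, 1 - p_2
    p_t = (p_1 + p_2) / 2                      # pooled allele frequency
    hs = (2*p_1*q_1 + 2*p_2*q_2) / 2           # mean within-population heterozygosity (H_S)
    ht = 2 * p_t * (1 - p_t)                   # pooled-population heterozygosity (H_T)
    return (ht - hs) / ht

print(calculate_fst(0.5, 0.5))   # 0.0  -> identical frequencies, no differentiation
print(calculate_fst(0.1, 0.9))   # 0.64 -> strongly differentiated populations

Identical frequencies give F_ST = 0 and the estimate grows toward 1 as frequencies diverge, which is why _infer_ancestry reports the reference population with the smallest mean Fst as the closest ancestry.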
gwaslab/util_ex_ldsc.py
CHANGED
@@ -11,9 +11,10 @@ from gwaslab.util_in_filter_value import filtervalues
 from gwaslab.util_in_filter_value import _filter_palindromic
 from gwaslab.util_in_filter_value import _exclude_hla
 from gwaslab.util_in_filter_value import _exclude_sexchr
+import copy
 
 class ARGS():
-    def __init__(self,
+    def __init__(self, kwargs=None):
 
         self.out = "ldsc"
 
@@ -257,11 +258,12 @@ class ARGS():
 ####################################################################################################################
 
 
-def _estimate_h2_by_ldsc(insumstats,
-    sumstats = insumstats
+def _estimate_h2_by_ldsc(insumstats, log, meta=None,verbose=True, munge=False, munge_args=None, **raw_kwargs):
+    sumstats = insumstats
+    kwargs = copy.deepcopy(raw_kwargs)
 
     if "N" in sumstats.columns:
-        sumstats["N"] = sumstats["N"].
+        sumstats["N"] = sumstats["N"].fillna(sumstats["N"].median()).apply("int64")
 
     if munge:
         if munge_args is None:
@@ -291,21 +293,25 @@ def _estimate_h2_by_ldsc(insumstats, log, verbose=True, munge=False, munge_args=
     log.write(" -Adopted from LDSC source code: https://github.com/bulik/ldsc", verbose=verbose)
     log.write(" -Please cite LDSC: Bulik-Sullivan, et al. LD Score Regression Distinguishes Confounding from Polygenicity in Genome-Wide Association Studies. Nature Genetics, 2015.", verbose=verbose)
 
-
-
+    if meta["gwaslab"]["sample_prevalence"]!="Unknown" and meta["gwaslab"]["population_prevalence"]!="Unknown" :
+        if "samp_prev" not in kwargs.keys():
+            kwargs["samp_prev"] = "{}".format(meta["gwaslab"]["sample_prevalence"])
+        if "pop_prev" not in kwargs.keys():
+            kwargs["pop_prev"] = "{}".format(meta["gwaslab"]["population_prevalence"])
 
     log.write(" -Arguments:", verbose=verbose)
     for key, value in kwargs.items():
         log.write(" -{}:{}".format(key, value), verbose=verbose)
-
+
+    default_args = ARGS(kwargs = kwargs)
 
     if "Z" not in sumstats.columns:
         sumstats["Z"] = sumstats["BETA"]/sumstats["SE"]
 
     sumstats = sumstats.rename(columns={"EA":"A1","NEA":"A2","rsID":"SNP"})
-
+
     log.write(" -LDSC log:", verbose=verbose)
-    summary = estimate_h2(sumstats, default_args, log)
+    summary = estimate_h2(sumstats, args = default_args, log = log)
 
     results_table = None
     if type(summary) is tuple:
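
The pattern repeated through these hunks: deep-copy the caller's **raw_kwargs, fill samp_prev/pop_prev from the sumstats meta only when the caller did not pass them, then hand the merged dict to ARGS. Isolated in a short sketch (the meta and raw_kwargs values are made up for illustration):

import copy

meta = {"gwaslab": {"sample_prevalence": 0.4, "population_prevalence": 0.01}}
raw_kwargs = {"samp_prev": "0.5"}              # a user-supplied value should win

kwargs = copy.deepcopy(raw_kwargs)             # copy first, so the caller's dict is never mutated
if meta["gwaslab"]["sample_prevalence"] != "Unknown" and meta["gwaslab"]["population_prevalence"] != "Unknown":
    if "samp_prev" not in kwargs:
        kwargs["samp_prev"] = "{}".format(meta["gwaslab"]["sample_prevalence"])
    if "pop_prev" not in kwargs:
        kwargs["pop_prev"] = "{}".format(meta["gwaslab"]["population_prevalence"])

print(kwargs)   # {'samp_prev': '0.5', 'pop_prev': '0.01'}
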
@@ -321,10 +327,11 @@ def _estimate_h2_by_ldsc(insumstats, log, verbose=True, munge=False, munge_args=
 
 ####################################################################################################################
 
-def _estimate_partitioned_h2_by_ldsc(insumstats,
+def _estimate_partitioned_h2_by_ldsc(insumstats, log, meta=None,verbose=True, **raw_kwargs):
     sumstats = insumstats.copy()
+    kwargs = copy.deepcopy(raw_kwargs)
     if "N" in sumstats.columns:
-        sumstats["N"] = sumstats["N"].
+        sumstats["N"] = sumstats["N"].fillna(sumstats["N"].median()).apply("int64")
     ##start function with col checking##########################################################
     _start_line = "run LD score regression"
     _end_line = "running LD score regression"
@@ -347,10 +354,16 @@ def _estimate_partitioned_h2_by_ldsc(insumstats, log, verbose=True, **kwargs):
     log.write(" -Please cite LDSC: Bulik-Sullivan, et al. LD Score Regression Distinguishes Confounding from Polygenicity in Genome-Wide Association Studies. Nature Genetics, 2015.", verbose=verbose)
     log.write(" -Arguments:", verbose=verbose)
 
+    if meta["gwaslab"]["sample_prevalence"]!="Unknown" and meta["gwaslab"]["population_prevalence"]!="Unknown" :
+        if "samp_prev" not in kwargs.keys():
+            kwargs["samp_prev"] = "{}".format(meta["gwaslab"]["sample_prevalence"])
+        if "pop_prev" not in kwargs.keys():
+            kwargs["pop_prev"] = "{}".format(meta["gwaslab"]["population_prevalence"])
+
     for key, value in kwargs.items():
         log.write(" -{}:{}".format(key, value), verbose=verbose)
 
-    default_args = ARGS(
+    default_args = ARGS(kwargs = kwargs)
 
     if "Z" not in sumstats.columns:
         sumstats["Z"] = sumstats["BETA"]/sumstats["SE"]
@@ -369,10 +382,11 @@ def _estimate_partitioned_h2_by_ldsc(insumstats, log, verbose=True, **kwargs):
 
 
 
-def _estimate_rg_by_ldsc(insumstats,
+def _estimate_rg_by_ldsc(insumstats, other_traits ,log, meta=None, verbose=True, **raw_kwargs):
     sumstats = insumstats.copy()
+    kwargs = copy.deepcopy(raw_kwargs)
     if "N" in sumstats.columns:
-        sumstats["N"] = sumstats["N"].
+        sumstats["N"] = sumstats["N"].fillna(sumstats["N"].median()).apply("int64")
     ##start function with col checking##########################################################
     _start_line = "run LD score regression for genetic correlation"
     _end_line = "running LD score regression for genetic correlation"
@@ -395,18 +409,37 @@ def _estimate_rg_by_ldsc(insumstats, other_traits ,log, verbose=True, **kwargs):
     log.write(" -Please cite LDSC: Bulik-Sullivan, B., et al. An Atlas of Genetic Correlations across Human Diseases and Traits. Nature Genetics, 2015.", verbose=verbose)
     log.write(" -Arguments:", verbose=verbose)
 
+
+
+    samp_prev_string=""
+    pop_prev_string=""
+
+    if meta["gwaslab"]["sample_prevalence"]!="Unknown" and meta["gwaslab"]["population_prevalence"]!="Unknown" :
+
+        if "samp_prev" not in kwargs.keys():
+            samp_prev_string = "{}".format(meta["gwaslab"]["sample_prevalence"])
+        if "pop_prev" not in kwargs.keys():
+            pop_prev_string = "{}".format(meta["gwaslab"]["population_prevalence"])
+
+    if "rg" in kwargs.keys():
+        alias = kwargs["rg"].split(",")[1:]
+    else:
+        alias=[]
+        for index, each_other_sumstats in enumerate(other_traits):
+            alias.append(each_other_sumstats.meta["gwaslab"]["study_name"])
+        kwargs["rg"]=",".join([meta["gwaslab"]["study_name"]]+alias)
+
     for key, value in kwargs.items():
         log.write(" -{}:{}".format(key, value), verbose=verbose)
-
-    default_args = ARGS(
+
+    default_args = ARGS(kwargs = kwargs)
 
     if "Z" not in sumstats.columns:
         sumstats["Z"] = sumstats["BETA"]/sumstats["SE"]
 
     sumstats = sumstats.rename(columns={"EA":"A1","NEA":"A2","rsID":"SNP"})
-
+
     other_traits_to_use = []
-    alias = default_args.rg.split(",")[1:]
 
     for index, each_other_sumstats in enumerate(other_traits):
         log.write(" -Processing sumstats with alias {} ({})".format(alias[index], each_other_sumstats.meta["gwaslab"]["study_name"]))
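
When the caller does not supply rg, the new code assembles LDSC's comma-separated rg string from the study_name stored in each sumstats object's meta, keeping the base study first. A self-contained sketch of that assembly (the study names are hypothetical):

meta = {"gwaslab": {"study_name": "Trait_A"}}
other_study_names = ["Trait_B", "Trait_C"]     # stand-ins for other_traits[i].meta["gwaslab"]["study_name"]

kwargs = {}
if "rg" in kwargs:
    alias = kwargs["rg"].split(",")[1:]        # caller-provided: entries after the base study are the aliases
else:
    alias = list(other_study_names)
    kwargs["rg"] = ",".join([meta["gwaslab"]["study_name"]] + alias)

print(kwargs["rg"])   # Trait_A,Trait_B,Trait_C
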
@@ -419,6 +452,18 @@ def _estimate_rg_by_ldsc(insumstats, other_traits ,log, verbose=True, **kwargs):
         to_append["Z"] = to_append["BETA"]/to_append["SE"]
 
         other_traits_to_use.append(to_append[["SNP","A1","A2","Z","N"]])
+
+        if each_other_sumstats.meta["gwaslab"]["sample_prevalence"]!="Unknown" and each_other_sumstats.meta["gwaslab"]["population_prevalence"]!="Unknown" :
+            samp_prev_string += ",{}".format(meta["gwaslab"]["sample_prevalence"])
+            pop_prev_string += ",{}".format(meta["gwaslab"]["population_prevalence"])
+
+    if len(pop_prev_string.split(",")) == len(other_traits)+1 and len(samp_prev_string.split(",")) == len(other_traits)+1:
+        if "samp_prev" not in kwargs.keys():
+            log.write(" -{}:{}".format("samp_prev", samp_prev_string), verbose=verbose)
+            default_args.samp_prev = samp_prev_string
+        if "pop_prev" not in kwargs.keys():
+            log.write(" -{}:{}".format("pop_prev", pop_prev_string), verbose=verbose)
+            default_args.pop_prev = pop_prev_string
 
     log.write(" -LDSC log:", verbose=verbose)
     summary = estimate_rg(sumstats[["SNP","A1","A2","Z","N"]], other_traits_to_use, default_args, log)[1]
@@ -431,10 +476,11 @@ def _estimate_rg_by_ldsc(insumstats, other_traits ,log, verbose=True, **kwargs):
 ####################################################################################################################
 
 
-def _estimate_h2_cts_by_ldsc(insumstats, log, verbose=True, **
+def _estimate_h2_cts_by_ldsc(insumstats, log, verbose=True, **raw_kwargs):
     sumstats = insumstats.copy()
+    kwargs = copy.deepcopy(raw_kwargs)
     if "N" in sumstats.columns:
-        sumstats["N"] = sumstats["N"].
+        sumstats["N"] = sumstats["N"].fillna(sumstats["N"].median()).apply("int64")
     ##start function with col checking##########################################################
     _start_line = "run LD score regression"
     _end_line = "running LD score regression"
@@ -460,7 +506,7 @@ def _estimate_h2_cts_by_ldsc(insumstats, log, verbose=True, **kwargs):
     for key, value in kwargs.items():
         log.write(" -{}:{}".format(key, value), verbose=verbose)
 
-    default_args = ARGS(
+    default_args = ARGS(kwargs = kwargs)
 
     if "Z" not in sumstats.columns:
         sumstats["Z"] = sumstats["BETA"]/sumstats["SE"]

gwaslab/util_ex_match_ldmatrix.py
ADDED
@@ -0,0 +1,396 @@
+import scipy.sparse as sparse
+import numpy as np
+import pandas as pd
+from gwaslab.hm_casting import _merge_mold_with_sumstats_by_chrpos
+import subprocess
+import os
+import re
+import gc
+import pandas as pd
+import numpy as np
+from gwaslab.g_Log import Log
+from gwaslab.qc_fix_sumstats import start_to
+from gwaslab.qc_fix_sumstats import finished
+from gwaslab.util_in_get_sig import getsig
+from gwaslab.util_ex_process_ref import _process_plink_input_files
+from gwaslab.g_version import _checking_plink_version
+from gwaslab.util_in_filter_value import _exclude_hla
+from gwaslab.util_ex_calculate_ldmatrix import _extract_variants_in_locus
+
+
+
+
+def tofinemapping_m(sumstats,
+                    studies=None,
+                    group=None,
+                    ld_paths = None,
+                    ld_types = None,
+                    ld_maps = None,
+                    ld_map_dics = None,
+                    bfile=None,
+                    vcf=None,
+                    locus=None,
+                    loci=None,
+                    loci_chrpos=None,
+                    out="./",
+                    plink="plink",
+                    plink2="plink2",
+                    windowsizekb=1000,
+                    n_cores=1,
+                    mode="r",
+                    exclude_hla=False,
+                    getlead_args=None,
+                    memory=None,
+                    overwrite=False,
+                    log=Log(),
+                    suffixes=None,
+                    ld_map_kwargs=None,
+                    extra_plink_option="",
+                    verbose=True,
+                    **kwargs):
+
+    ##start function with col checking##########################################################
+    _start_line = "calculate LD matrix"
+    _end_line = "calculating LD matrix"
+    _start_cols =["SNPID","CHR","POS","EA","NEA"]
+    _start_function = ".calculate_ld_matrix()"
+    _must_args ={}
+
+    is_enough_info = start_to(sumstats=sumstats,
+                              log=log,
+                              verbose=verbose,
+                              start_line=_start_line,
+                              end_line=_end_line,
+                              start_cols=_start_cols,
+                              start_function=_start_function,
+                              **_must_args)
+
+    if is_enough_info == False: raise ValueError("Not enough columns for calculating LD matrix")
+
+    ############################################################################################
+    if suffixes is None:
+        suffixes=[""]
+    if getlead_args is None:
+        getlead_args={"windowsizekb":1000}
+    if ld_map_kwargs is None:
+        ld_map_kwargs={}
+
+    if loci is None:
+        log.write(" -Loci were not provided. All significant loci will be automatically extracted...",verbose=verbose)
+        sig_df = getsig(sumstats,id="SNPID",chrom="CHR",pos="POS",p="P"+suffixes[0],**getlead_args)
+    else:
+        sig_df = sumstats.loc[sumstats["SNPID"].isin(loci),:]
+
+    # Drop duplicate!!!!
+    log.write(" -Dropping duplicated SNPIDs...",verbose=verbose)
+    sumstats = sumstats.drop_duplicates(subset=["SNPID"]).copy()
+
+    # init Filelist DataFrame
+    output_file_list = pd.DataFrame(columns=["SNPID","SNPID_LIST","LD_R_MATRIX","LOCUS_SUMSTATS"])
+
+    plink_log=""
+
+    if exclude_hla==True:
+        sig_df = _exclude_hla(sig_df, log=log, verbose=verbose)
+
+    sig_df = sig_df.reset_index()
+    row = sig_df.iloc[0,:]
+
+    matched_sumstats = _extract_variants_in_locus(sumstats, windowsizekb, locus = (row["CHR"],row["POS"]))
+
+    for i in range(2):
+        # for each study
+        gc.collect()
+        # get ld path and dic
+        ld_map_path = ld_maps[i]
+        ld_map_rename_dic = ld_map_dics[i]
+
+
+        log.write(" -Processing locus with lead variant {} at CHR {} POS {} ...".format(row["SNPID"],row["CHR"],row["POS"]))
+        ld_map = _load_ld_map(ld_map_path,
+                              ld_map_rename_dic = ld_map_rename_dic,
+                              **ld_map_kwargs )
+
+        ## check available snps with reference file
+        matched_sumstats = _merge_ld_map_with_sumstats_m(row=row,
+                                                         locus_sumstats=matched_sumstats,
+                                                         ld_map=ld_map,
+                                                         log=log,
+                                                         index=i)
+
+        if len(matched_sumstats)==0:
+            log.write(" -No matching LD information... Skipping...")
+            continue
+
+    # drop na
+    matched_sumstats = matched_sumstats.dropna()
+
+    # export common variants list
+    matched_snp_list_path, matched_sumstats_paths=_export_snplist_and_locus_sumstats_m(matched_sumstats=matched_sumstats,
+                                                                                       out=out,
+                                                                                       group=group,
+                                                                                       row=row,
+                                                                                       windowsizekb=windowsizekb,
+                                                                                       log=log)
+
+    for i in range(2):
+        ld_path = ld_paths[i]
+
+        r_matrix = _load_ld_matrix(ld_path,
+                                   fmt="txt",
+                                   if_square=False,
+                                   if_add_T=False,
+                                   log=log,
+                                   verbose=verbose)
+
+        matched_ld_matrix_path = _extract_variants_from_ld_matrix_m(merged_sumstats = matched_sumstats,
+                                                                    r_matrix = r_matrix,
+                                                                    out = out,
+                                                                    group = group,
+                                                                    row = row,
+                                                                    windowsizekb = windowsizekb,
+                                                                    index=i,
+                                                                    log=log, verbose=verbose)
+        # #########################################################################################################
+
+        row_dict={}
+        row_dict["SNPID"]=row["SNPID"]
+        row_dict["SNPID_LIST"] = matched_snp_list_path
+        row_dict["LD_R_MATRIX"] = matched_ld_matrix_path
+        row_dict["LOCUS_SUMSTATS"] = matched_sumstats_paths[i] + ".gz"
+        row_dict["LOCUS"] = row["SNPID"]
+        row_dict["SUBSTUDY"]= i+1
+        row_dict["STUDY"] = studies[i]
+        file_row = pd.Series(row_dict).to_frame().T
+        output_file_list = pd.concat([output_file_list, file_row],ignore_index=True)
+
+    if len(output_file_list)>0:
+
+        output_file_list["GROUP"] = group
+        nloci = len(output_file_list)
+        output_file_list_path = "{}/{}_{}study_{}_{}kb.filelist".format(out.rstrip("/"), group,nloci, row["SNPID"], windowsizekb)
+        output_file_list.to_csv(output_file_list_path,index=None,sep="\t")
+        log.write(" -File list is saved to: {}".format(output_file_list_path),verbose=verbose)
+        log.write(" -Finished LD matrix calculation.",verbose=verbose)
+    else:
+        output_file_list_path=None
+        log.write(" -No avaialable lead variants.",verbose=verbose)
+        log.write(" -Stopped LD matrix calculation.",verbose=verbose)
+
+    finished(log=log, verbose=verbose, end_line=_end_line)
+
+    return output_file_list_path, output_file_list, plink_log
+
+
+def _export_snplist_and_locus_sumstats_m(matched_sumstats, out, group, row, windowsizekb,log):
+    # study suffixes starting from 1
+    suffixes=["_{}".format(i+1) for i in range(2)]
+
+    matched_snp_list_path = "{}/{}_{}_{}.snplist.raw".format(out.rstrip("/"), group, row["SNPID"] ,windowsizekb)
+
+    matched_sumstats["SNPID"].to_csv(matched_snp_list_path, index=None, header=None)
+    log.write(" -Exporting SNP list of {} to: {}...".format(len(matched_sumstats) ,matched_snp_list_path))
+
+    # create locus-sumstats EA, NEA, (BETA, SE), Z
+    matched_sumstats_paths =[]
+
+
+    for i in range(2):
+        # export sumstats for each study
+        suffix = suffixes[i]
+
+        matched_sumstats_path = "{}/{}_{}_{}_{}.sumstats".format(out.rstrip("/"), group, row["SNPID"] ,windowsizekb, i + 1)
+        matched_sumstats_paths.append(matched_sumstats_path)
+        to_export_columns=["CHR","POS","EA","NEA"]
+
+        if "Z"+suffix in matched_sumstats.columns :
+            to_export_columns.append("Z"+suffix)
+        if ("BETA"+suffix in matched_sumstats.columns) and ("SE"+suffix in matched_sumstats.columns):
+            to_export_columns.append("BETA"+suffix)
+            to_export_columns.append("SE"+suffix)
+        if "EAF"+suffix in matched_sumstats.columns :
+            to_export_columns.append("EAF"+suffix)
+        if "N"+suffix in matched_sumstats.columns:
+            to_export_columns.append("N"+suffix)
+
+        log.write(" -Exporting locus sumstats to: {}...".format(matched_sumstats_path))
+        log.write(" -Exported columns: {}...".format(["SNPID"]+to_export_columns))
+        #matched_sumstats[ ["SNPID"]+to_export_columns].to_csv(matched_sumstats_path, sep="\t",index=None)
+        rename_dic={
+            "BETA"+suffix:"Beta",
+            "SE"+suffix:"Se",
+            "SNPID":"SNP"
+        }
+        matched_sumstats[ ["SNPID"]+to_export_columns].rename(columns=rename_dic).to_csv(matched_sumstats_path, sep="\t",index=None)
+        matched_sumstats[ ["SNPID"]+to_export_columns].rename(columns=rename_dic).to_csv(matched_sumstats_path+".gz", sep="\t",index=None)
+
+    return matched_snp_list_path, matched_sumstats_paths
+
+###################################################################################################################################################################
+####################################################################################################
+def _load_ld_matrix(path,
+                    fmt="npz",
+                    if_square=False,
+                    if_add_T=False,
+                    log=Log(),
+                    verbose=True):
+
+    if fmt == "npz":
+        log.write(" -Loading LD matrix from npz file...",verbose=verbose)
+        r_matrix = sparse.load_npz(path).toarray()
+    if fmt == "txt":
+        log.write(" -Loading LD matrix from text file...",verbose=verbose)
+        r_matrix = np.loadtxt(path,delimiter="\t")
+    log.write(" -LD matrix shape : {}".format(r_matrix.shape) ,verbose=verbose)
+
+    if if_add_T==True:
+        log.write(" -Transforming LD matrix by adding its transpose...",verbose=verbose)
+        r_matrix += r_matrix.T
+    if if_square==True:
+        log.write(" -Transforming LD matrix by squaring all elements...",verbose=verbose)
+        r_matrix = np.power(r_matrix,2)
+    return r_matrix
+
+def _load_ld_map(path,
+                 snpid="rsid",
+                 chrom="chromosome",
+                 pos="position",
+                 ref="allele1",
+                 alt="allele2",
+                 ld_map_rename_dic = None,
+                 **ld_map_kwargs):
+
+    if ld_map_rename_dic is not None:
+        # ld map format
+        # SNPID_bim,CHR,POS, NEA_bim, EA_bim
+        if type(ld_map_rename_dic) is dict:
+            ld_map_rename_dic_to_use={ld_map_rename_dic["EA"]:'EA_bim',
+                                      ld_map_rename_dic["NEA"]:'NEA_bim',
+                                      ld_map_rename_dic["POS"]:'POS',
+                                      ld_map_rename_dic["CHR"]:'CHR',
+                                      ld_map_rename_dic["SNPID"]:'SNPID_bim'
+                                      }
+            ld_map_kwargs["usecols"]=list(ld_map_rename_dic.values())
+        else:
+            ld_map_rename_dic_to_use={ld_map_rename_dic[4]:'EA_bim',
+                                      ld_map_rename_dic[3]:'NEA_bim',
+                                      ld_map_rename_dic[2]:'POS',
+                                      ld_map_rename_dic[1]:'CHR',
+                                      ld_map_rename_dic[0]:'SNPID_bim'
+                                      }
+            ld_map_kwargs["usecols"]=ld_map_rename_dic
+    else:
+        ld_map_rename_dic_to_use={alt:'EA_bim',
+                                  ref:'NEA_bim',
+                                  pos:'POS',
+                                  chrom:'CHR',
+                                  snpid:"SNPID_bim"
+                                  }
+        ld_map_kwargs["usecols"]=[chrom, pos, ref, alt, snpid]
+    #rsid chromosome position allele1 allele2
+    if "sep" not in ld_map_kwargs:
+        ld_map_kwargs["sep"] = "\s+"
+
+    ld_map = pd.read_csv(path,**ld_map_kwargs)
+    ld_map = ld_map.rename(columns=ld_map_rename_dic_to_use, errors='ignore')
+    # "SNPID",0:"CHR_bim",3:"POS_bim",4:"EA_bim",5:"NEA_bim"
+    return ld_map
+
+def _extract_variants_from_ld_matrix_m(merged_sumstats, r_matrix, out, group, row, windowsizekb, log, verbose, index):
+    # study suffixes starting from 1
+    index_bim_header = "_INDEX_BIM_{}".format(index + 1)
+    flipped_header = "_FLIPPED_{}".format(index + 1)
+
+    # a series of int to indicate if the variant index in raw ld matrix
+    avaiable_index = merged_sumstats[index_bim_header].values
+
+    # a series of boolean values to indicate if the variants is flipped
+    flipped = merged_sumstats[flipped_header].values
+
+    # extract the sub-matrix
+    reduced_r_matrix = r_matrix[np.ix_(avaiable_index, avaiable_index)]
+
+    log.write(" -Flipping LD matrix for {} variants...".format(sum(flipped)),verbose=verbose)
+    reduced_r_matrix[flipped,:] = -1 * reduced_r_matrix[flipped,:]
+    reduced_r_matrix[:,flipped] = -1 * reduced_r_matrix[:,flipped]
+
+    output_prefix = "{}/{}_{}_{}_{}".format(out.rstrip("/"),group,row["SNPID"],windowsizekb, index + 1)
+    output_path = "{}.ld.gz".format(output_prefix)
+
+    pd.DataFrame(reduced_r_matrix).to_csv(output_path,sep="\t",index=None,header=None)
+    #reduced_r_matrix.to_csv("{}.ld.gz".format(output_prefix),se="\t")
+    return output_path
+
+def _merge_ld_map_with_sumstats_m(row,
+                                  locus_sumstats,
+                                  ld_map,
+                                  log=Log(),
+                                  index=None):
+    '''
+    align sumstats with ld map
+    '''
+    # study suffixes starting from 1
+    index_suffix = "_{}".format(index+1)
+
+    index1= "_INDEX_SUMSTATS"
+    index2= "_INDEX_BIM" +index_suffix
+
+    # Sumstats index
+    locus_sumstats[index1] = locus_sumstats.index
+
+    # ld map index
+    ld_map[index2] = ld_map.index
+
+    # init a column to show if the variants in LD map are flipped or not
+    locus_sumstats["_FLIPPED"+index_suffix] = False
+
+
+    log.write(" -Variants in locus ({}): {}".format(row["SNPID"],len(locus_sumstats)))
+    # convert category to string
+    locus_sumstats["EA"] = locus_sumstats["EA"].astype("string")
+    locus_sumstats["NEA"] = locus_sumstats["NEA"].astype("string")
+
+    # matching by SNPID
+    # preserve bim keys (use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys.)
+    # vairants without a match were removed
+    combined_df = pd.merge(ld_map, locus_sumstats, on=["CHR","POS"],how="inner")
+
+    # match allele
+    perfect_match = ((combined_df["EA"] == combined_df["EA_bim"]) & (combined_df["NEA"] == combined_df["NEA_bim"]) )
+    log.write(" -Variants with perfect matched alleles:{}".format(sum(perfect_match)))
+
+    # fliipped allele
+    #ea_mis_match = combined_df["EA"] != combined_df["EA_bim"]
+    flipped_match = ((combined_df["EA"] == combined_df["NEA_bim"])& (combined_df["NEA"] == combined_df["EA_bim"]))
+    log.write(" -Variants with flipped alleles:{}".format(sum(flipped_match)))
+
+    allele_match = perfect_match | flipped_match
+    log.write(" -Total Variants matched:{}".format(sum(allele_match)))
+
+    combined_df.loc[flipped_match,"_FLIPPED"+index_suffix] = True
+
+    if row["SNPID"] not in combined_df.loc[allele_match,"SNPID"].values:
+        log.warning("Lead variant was not available in reference!")
+
+    # adjust output columns
+    output_columns=["SNPID","CHR","POS","EA","NEA"]
+    for i in combined_df.columns:
+        if "_INDEX_BIM" in i:
+            output_columns.append(i)
+        if "_FLIPPED" in i:
+            output_columns.append(i)
+
+    for i in range(2):
+        # study suffixes starting from 1
+        index_suffix = "_{}".format(i+1)
+        if ("BETA"+index_suffix in combined_df.columns) and ("SE"+index_suffix in combined_df.columns):
+            output_columns.append("BETA"+index_suffix)
+            output_columns.append("SE"+index_suffix)
+        if "Z"+index_suffix in combined_df.columns:
+            output_columns.append("Z"+index_suffix)
+        if "EAF"+index_suffix in combined_df.columns:
+            output_columns.append("EAF"+index_suffix)
+        if "N"+index_suffix in combined_df.columns:
+            output_columns.append("N"+index_suffix)
+
+    return combined_df.loc[allele_match,output_columns]
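
_extract_variants_from_ld_matrix_m above subsets the full LD matrix with np.ix_ and then negates both the row and the column of every allele-flipped variant; a diagonal entry is negated twice, so the result remains a valid correlation matrix. A toy demonstration (the 3x3 matrix and flip mask are invented):

import numpy as np

r_matrix = np.array([[1.0, 0.5, 0.2],
                     [0.5, 1.0, 0.3],
                     [0.2, 0.3, 1.0]])

avail = np.array([0, 2])                  # row/column indices of the variants kept after matching
flipped = np.array([False, True])         # the second kept variant has swapped alleles

reduced = r_matrix[np.ix_(avail, avail)]  # fancy indexing returns a copy, safe to modify in place
reduced[flipped, :] *= -1                 # negate the flipped variant's row...
reduced[:, flipped] *= -1                 # ...and its column; its diagonal entry flips back to +1

print(reduced)
# [[ 1.  -0.2]
#  [-0.2  1. ]]
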
gwaslab/util_ex_run_2samplemr.py
CHANGED