biopipen 0.28.1__py3-none-any.whl → 0.29.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +8 -0
- biopipen/ns/bam.py +0 -2
- biopipen/ns/bed.py +35 -0
- biopipen/ns/cellranger_pipeline.py +5 -5
- biopipen/ns/cnv.py +18 -2
- biopipen/ns/cnvkit_pipeline.py +16 -11
- biopipen/ns/gene.py +68 -23
- biopipen/ns/misc.py +2 -15
- biopipen/ns/plot.py +204 -0
- biopipen/ns/regulatory.py +214 -0
- biopipen/ns/scrna.py +31 -5
- biopipen/ns/snp.py +516 -8
- biopipen/ns/stats.py +167 -3
- biopipen/ns/vcf.py +196 -0
- biopipen/reports/snp/PlinkCallRate.svelte +24 -0
- biopipen/reports/snp/PlinkFreq.svelte +18 -0
- biopipen/reports/snp/PlinkHWE.svelte +18 -0
- biopipen/reports/snp/PlinkHet.svelte +18 -0
- biopipen/reports/snp/PlinkIBD.svelte +18 -0
- biopipen/scripts/bam/CNVpytor.py +144 -46
- biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
- biopipen/scripts/bed/BedtoolsMerge.py +1 -1
- biopipen/scripts/cnv/AneuploidyScore.R +30 -7
- biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
- biopipen/scripts/cnv/TMADScore.R +21 -5
- biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
- biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
- biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
- biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
- biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
- biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
- biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
- biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
- biopipen/scripts/delim/SampleInfo.R +10 -5
- biopipen/scripts/gene/GeneNameConversion.R +65 -0
- biopipen/scripts/gene/GenePromoters.R +61 -0
- biopipen/scripts/misc/Shell.sh +15 -0
- biopipen/scripts/plot/Manhattan.R +146 -0
- biopipen/scripts/plot/QQPlot.R +146 -0
- biopipen/scripts/regulatory/MotifAffinityTest.R +226 -0
- biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +126 -0
- biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +96 -0
- biopipen/scripts/regulatory/MotifScan.py +159 -0
- biopipen/scripts/regulatory/atSNP.R +33 -0
- biopipen/scripts/regulatory/motifBreakR.R +1594 -0
- biopipen/scripts/scrna/MarkersFinder.R +69 -67
- biopipen/scripts/scrna/SeuratClustering.R +71 -29
- biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
- biopipen/scripts/scrna/SeuratPreparing.R +252 -122
- biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
- biopipen/scripts/snp/MatrixEQTL.R +85 -44
- biopipen/scripts/snp/Plink2GTMat.py +133 -0
- biopipen/scripts/snp/PlinkCallRate.R +190 -0
- biopipen/scripts/snp/PlinkFilter.py +100 -0
- biopipen/scripts/snp/PlinkFreq.R +298 -0
- biopipen/scripts/snp/PlinkFromVcf.py +78 -0
- biopipen/scripts/snp/PlinkHWE.R +80 -0
- biopipen/scripts/snp/PlinkHet.R +92 -0
- biopipen/scripts/snp/PlinkIBD.R +200 -0
- biopipen/scripts/snp/PlinkUpdateName.py +124 -0
- biopipen/scripts/stats/Mediation.R +94 -0
- biopipen/scripts/stats/MetaPvalue.R +2 -1
- biopipen/scripts/stats/MetaPvalue1.R +70 -0
- biopipen/scripts/tcr/TCRClusterStats.R +12 -7
- biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
- biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
- biopipen/scripts/vcf/BcftoolsSort.py +113 -0
- biopipen/scripts/vcf/BcftoolsView.py +73 -0
- biopipen/scripts/vcf/VcfFix_utils.py +1 -1
- biopipen/scripts/vcf/bcftools_utils.py +52 -0
- biopipen/utils/gene.R +83 -37
- biopipen/utils/gene.py +108 -60
- biopipen/utils/misc.R +56 -0
- biopipen/utils/misc.py +5 -2
- biopipen/utils/reference.py +54 -10
- {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/METADATA +2 -2
- {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/RECORD +80 -51
- {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/entry_points.txt +1 -1
- biopipen/ns/bcftools.py +0 -111
- biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
- biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
- biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
- biopipen/scripts/gene/GeneNameConversion.py +0 -66
- {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
from pathlib import Path, PosixPath # noqa: F401
|
|
2
|
+
|
|
3
|
+
from biopipen.utils.misc import logger
|
|
4
|
+
from biopipen.scripts.vcf.bcftools_utils import run_bcftools
|
|
5
|
+
|
|
6
|
+
infile = {{in.infile | repr}} # pyright: ignore # noqa: E999
|
|
7
|
+
outfile = {{out.outfile | repr}} # pyright: ignore
|
|
8
|
+
outdir = Path(outfile).parent
|
|
9
|
+
|
|
10
|
+
envs = {{envs | dict | repr}} # pyright: ignore
|
|
11
|
+
bcftools = envs.pop("bcftools")
|
|
12
|
+
tabix = envs.pop("tabix")
|
|
13
|
+
keep = envs.pop("keep")
|
|
14
|
+
ncores = envs.pop("ncores")
|
|
15
|
+
includes = envs.pop("includes")
|
|
16
|
+
excludes = envs.pop("excludes")
|
|
17
|
+
gz = envs.pop("gz")
|
|
18
|
+
index = envs.pop("index")
|
|
19
|
+
|
|
20
|
+
# a.vcf.gz -> a
|
|
21
|
+
# a.vcf -> a
|
|
22
|
+
stem = Path(infile).stem
|
|
23
|
+
if stem.endswith(".vcf"):
|
|
24
|
+
stem = stem[:-4]
|
|
25
|
+
# .vcf.gz
|
|
26
|
+
# .gz
|
|
27
|
+
ext = ".vcf.gz" if index or gz else '.vcf'
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def normalize_expr(expr, flag, prev_n_filters=0):
    """Normalize filter expressions into a ``{name: (expression, flag)}`` dict.

    Args:
        expr: The expression(s) to normalize. One of:
            - a falsy value (``None``, ``""``, ``[]``, ``{}``): no filters;
            - a single expression: named automatically;
            - a list or tuple of expressions: each named automatically;
            - a dict mapping filter names to expressions: names are kept.
        flag: The flag stored alongside each expression. It is also
            upper-cased into the automatically generated names.
        prev_n_filters: Number of filters that were already named; the
            automatic numbering continues after it.

    Returns:
        A dict mapping each filter name to a ``(expression, flag)`` tuple.
        Automatic names look like ``FILTER_<FLAG>_<n>``.
    """
    if not expr:
        return {}

    if isinstance(expr, dict):
        return {name: (ex, flag) for name, ex in expr.items()}

    # A tuple is a sequence of expressions too, not one single expression
    exprs = expr if isinstance(expr, (list, tuple)) else [expr]
    prefix = f"FILTER_{flag.upper()}_"
    return {
        f"{prefix}{i}": (ex, flag)
        for i, ex in enumerate(exprs, start=prev_n_filters + 1)
    }
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def handle_filter(vcf, fname, filt, flag, final):
    """Run one bcftools pass for a single filter expression.

    Args:
        vcf: The VCF file to read for this pass.
        fname: The name of the filter; used in the per-filter file name and,
            when the module-level `keep` is true, as the `s` option.
        filt: The filter expression.
        flag: The option name under which `filt` is passed.
        final: Whether this is the last filter. The last pass writes to the
            module-level `outfile`; earlier passes write to a per-filter
            file in `outdir`.

    Returns:
        The path written by this pass, to be used as input of the next one.
    """
    logger.info("- Handling filter %s: %s ...", fname, filt)

    # Per-filter file: the real output of a non-final pass, or a symlink to
    # `outfile` for the final pass.
    flagfile = outdir / f"{stem}.{fname}{ext}"

    arguments = envs.copy()
    arguments[flag] = filt
    arguments["_"] = vcf
    arguments["o"] = outfile if final else flagfile
    if keep:
        arguments["s"] = fname

    # Only the final output is indexed
    run_bcftools(arguments, bcftools=bcftools, index=index and final, tabix=tabix)

    # Never link the output onto itself
    if final and flagfile != Path(outfile):
        # Remove leftovers of any kind (a stale symlink, or a regular file
        # written when this filter was not the last one); otherwise
        # symlink_to() raises FileExistsError.
        if flagfile.is_symlink() or flagfile.exists():
            flagfile.unlink()
        flagfile.symlink_to(outfile)

    return arguments["o"]
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
includes = normalize_expr(includes, "include")
|
|
67
|
+
excludes = normalize_expr(excludes, "exclude", len(includes))
|
|
68
|
+
includes.update(excludes)
|
|
69
|
+
|
|
70
|
+
if index and not gz:
|
|
71
|
+
logger.warning("Forcing envs.gz to True because envs.index is True.")
|
|
72
|
+
gz = True
|
|
73
|
+
|
|
74
|
+
envs[""] = [bcftools, "filter"]
|
|
75
|
+
envs["_"] = infile
|
|
76
|
+
envs["o"] = outfile
|
|
77
|
+
envs["threads"] = ncores
|
|
78
|
+
|
|
79
|
+
if "O" not in envs and "output-type" not in envs and "output_type" not in envs:
|
|
80
|
+
envs["O"] = "z" if gz else "v"
|
|
81
|
+
|
|
82
|
+
if keep:
|
|
83
|
+
envs["soft_filter"] = "+"
|
|
84
|
+
|
|
85
|
+
if "m" not in envs and "mode" not in envs:
|
|
86
|
+
envs["m"] = "+"
|
|
87
|
+
|
|
88
|
+
# bcftools can be only done once at one filter
|
|
89
|
+
for i, (fname, (filt, flag)) in enumerate(includes.items()):
|
|
90
|
+
infile = handle_filter(infile, fname, filt, flag, i == len(includes) - 1)
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
from typing import Literal
|
|
2
|
+
from pathlib import Path, PosixPath # noqa: F401
|
|
3
|
+
|
|
4
|
+
from biopipen.utils.misc import run_command, logger
|
|
5
|
+
from biopipen.scripts.vcf.bcftools_utils import run_bcftools
|
|
6
|
+
|
|
7
|
+
infile = {{in.infile | quote}} # pyright: ignore # noqa: E999
|
|
8
|
+
outfile = {{out.outfile | quote}} # pyright: ignore
|
|
9
|
+
envs = {{envs | dict | repr}} # pyright: ignore
|
|
10
|
+
|
|
11
|
+
outdir = Path(outfile).parent
|
|
12
|
+
bcftools = envs.pop("bcftools")
|
|
13
|
+
tabix = envs.pop("tabix")
|
|
14
|
+
ncores = envs.pop("ncores")
|
|
15
|
+
gz = envs.pop("gz")
|
|
16
|
+
index = envs.pop("index")
|
|
17
|
+
chrsize = envs.pop("chrsize")
|
|
18
|
+
notfound = envs.pop("notfound")
|
|
19
|
+
|
|
20
|
+
if chrsize:
|
|
21
|
+
class Contig:
    """A contig record of a VCF header: its name and its length."""

    def __init__(self, name: str, length: str):
        self.name, self.length = name, length

    def __str__(self) -> str:
        """Render the record as a ``##contig`` header line."""
        name, length = self.name, self.length
        return f"##contig=<ID={name},length={length}>"
|
|
28
|
+
|
|
29
|
+
def parse_header(header_file: "Path") -> "tuple[list[str], dict[str, Contig]]":
    """Split a VCF header file into contig records and all other lines.

    Args:
        header_file: Path to a text file holding the VCF header.

    Returns:
        A tuple ``(hlines, ctgs)``: `hlines` are the stripped non-contig
        lines in file order; `ctgs` maps each contig ID to its `Contig`.
        Attributes other than ``ID`` and ``length`` are kept, appended after
        the length, so they survive when the contig line is written back.

    Raises:
        ValueError: If a ``##contig`` line lacks an ``ID`` or a ``length``.
    """
    import re  # local import: `re` is not imported at the top of this script

    # key=value pairs inside <...>; a value may be double-quoted
    field_re = re.compile(r'([^=,]+)=("[^"]*"|[^,]*)')

    hlines = []
    ctgs = {}
    with open(header_file) as fh:
        for raw in fh:
            line = raw.strip()
            if not line.startswith("##contig"):
                hlines.append(line)
                continue

            body = line.partition("<")[2]
            if body.endswith(">"):
                body = body[:-1]
            fields = {key.strip(): val for key, val in field_re.findall(body)}
            name = fields.pop("ID", None)
            length = fields.pop("length", None)
            if name is None or length is None:
                raise ValueError(
                    f"Cannot parse ID and length from contig line: {line}"
                )
            # Look the attributes up by key rather than by position, so the
            # order of ID/length/other attributes does not matter.
            rest = "".join(f",{key}={val}" for key, val in fields.items())
            ctgs[name] = Contig(name, f"{length}{rest}")

    return hlines, ctgs
|
|
40
|
+
|
|
41
|
+
def match_contigs(
    ctgs: "dict[str, Contig]",
    chroms: list[str],
    notfound: Literal["error", "remove", "start", "end"],
) -> list[str]:
    """Order the contig header lines following a list of chromosomes.

    Args:
        ctgs: The contigs of the VCF header, keyed by name. Not modified.
        chroms: The chromosome names in the desired order. If all of them
            start with ``chr`` while none of the contigs does, the prefix is
            removed from them before matching.
        notfound: What to do with contigs that are not listed in `chroms`:
            ``error`` raises, ``start`` puts them before the matched ones,
            ``end`` puts them after; any other value (``remove``) drops
            them.

    Returns:
        The contig header lines (``str()`` of each contig) in the new order.

    Raises:
        ValueError: If `notfound` is ``error`` and some contigs are not
            listed in `chroms`.
    """
    if (
        ctgs
        and chroms
        and all(chrom.startswith("chr") for chrom in chroms)
        and not any(ctg.startswith("chr") for ctg in ctgs)
    ):
        logger.warning(
            "Removing 'chr' prefix from chromosomes in envs.chrsize file, "
            "because the input VCF file does not have 'chr' prefix."
        )
        chroms = [chrom[3:] for chrom in chroms]

    # Work on a copy so the caller's dict is left untouched
    remaining = dict(ctgs)
    new_ctgs = []
    for chrom in chroms:
        ctg = remaining.pop(chrom, None)
        if ctg is not None:
            new_ctgs.append(str(ctg))

    if remaining:
        if notfound == "error":
            raise ValueError(
                "Chromosomes not found in envs.chrsize file: "
                f"{', '.join(remaining.keys())}"
            )
        leftovers = [str(ctg) for ctg in remaining.values()]
        if notfound == "start":
            new_ctgs = leftovers + new_ctgs
        elif notfound == "end":
            new_ctgs = new_ctgs + leftovers

    return new_ctgs
|
|
76
|
+
|
|
77
|
+
# Chromosome order: the first column of the envs.chrsize file
chroms = []
with Path(chrsize).expanduser().open() as fh:
    for line in fh:
        fields = line.split()
        if fields:  # tolerate blank lines
            chroms.append(fields[0])

header_file = outdir / "header.txt"
# Write the header with -o instead of a shell redirection, so that paths
# with spaces or shell metacharacters are passed through safely.
run_command([bcftools, "view", "-h", "-o", header_file, infile], fg=True)
header_lines, contigs = parse_header(header_file)
new_contigs = match_contigs(contigs, chroms, notfound=notfound)
# Put the re-ordered contig lines right after the first header line
header_lines = [*header_lines[:1], *new_contigs, *header_lines[1:]]
reheader_file = outdir / "reheader.txt"
with open(reheader_file, "w") as fh:
    fh.writelines(f"{line}\n" for line in header_lines)

reheader_vcf = outdir / f"{Path(infile).stem}_reheader.vcf"
run_command(
    [
        bcftools, "reheader",
        "--header", reheader_file,
        "-o", reheader_vcf,
        infile,
    ],
    fg=True,
)

# Sort the re-headered file instead of the original input
infile = reheader_vcf
|
|
101
|
+
|
|
102
|
+
envs[""] = [bcftools, "sort"]
|
|
103
|
+
envs["_"] = infile
|
|
104
|
+
envs["o"] = outfile
|
|
105
|
+
|
|
106
|
+
if index and not gz:
|
|
107
|
+
logger.warning("Forcing envs.gz to True because envs.index is True.")
|
|
108
|
+
gz = True
|
|
109
|
+
|
|
110
|
+
if "O" not in envs and "output-type" not in envs and "output_type" not in envs:
|
|
111
|
+
envs["O"] = "z" if gz else "v"
|
|
112
|
+
|
|
113
|
+
run_bcftools(envs, bcftools=bcftools, index=index, tabix=tabix)
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from contextlib import suppress
|
|
2
|
+
# In case there are paths passed to envs
|
|
3
|
+
from pathlib import PosixPath # noqa: F401
|
|
4
|
+
|
|
5
|
+
from biopipen.utils.misc import logger
|
|
6
|
+
from biopipen.utils.reference import tabix_index
|
|
7
|
+
from biopipen.scripts.vcf.bcftools_utils import run_bcftools
|
|
8
|
+
|
|
9
|
+
infile = {{in.infile | repr}} # pyright: ignore # noqa: E999
|
|
10
|
+
regions_file = {{in.regions_file | repr}} # pyright: ignore
|
|
11
|
+
samples_file = {{in.samples_file | repr}} # pyright: ignore
|
|
12
|
+
outfile = {{out.outfile | repr}} # pyright: ignore
|
|
13
|
+
envs: dict = {{envs | dict | repr}} # pyright: ignore
|
|
14
|
+
|
|
15
|
+
bcftools = envs.pop("bcftools")
|
|
16
|
+
tabix = envs.pop("tabix")
|
|
17
|
+
ncores = envs.pop("ncores")
|
|
18
|
+
gz = envs.pop("gz")
|
|
19
|
+
index = envs.pop("index")
|
|
20
|
+
|
|
21
|
+
# Pull every spelling of the regions-file option out of envs, so that a
# leftover alias cannot be passed along with the value resolved here.
regions_opts = {
    key: envs.pop(key)
    for key in ("regions_file", "regions-file", "R")
    if key in envs
}
if regions_file:
    # in.regions_file takes precedence over envs
    if regions_opts:
        logger.warning(
            "Ignoring envs\\[regions_file/regions-file/R] "
            "because in.regions_file is provided."
        )
elif regions_opts:
    # First non-empty value, in the order regions_file, regions-file, R
    regions_file = next((val for val in regions_opts.values() if val), None)

# Same resolution for the samples-file option
samples_opts = {
    key: envs.pop(key)
    for key in ("samples_file", "samples-file", "S")
    if key in envs
}
if samples_file:
    # in.samples_file takes precedence over envs
    if samples_opts:
        logger.warning(
            "Ignoring envs[samples_file/samples-file/S] "
            "because in.samples_file is provided."
        )
elif samples_opts:
    # First non-empty value, in the order samples_file, samples-file, S
    samples_file = next((val for val in samples_opts.values() if val), None)
|
|
58
|
+
|
|
59
|
+
if index and not gz:
|
|
60
|
+
logger.warning("Forcing envs.gz to True because envs.index is True.")
|
|
61
|
+
gz = True
|
|
62
|
+
|
|
63
|
+
if "O" not in envs and "output-type" not in envs and "output_type" not in envs:
|
|
64
|
+
envs["O"] = "z" if gz else "v"
|
|
65
|
+
|
|
66
|
+
envs[""] = [bcftools, "view"]
|
|
67
|
+
envs["_"] = tabix_index(infile, "vcf", tabix=tabix)
|
|
68
|
+
envs["o"] = outfile
|
|
69
|
+
envs["threads"] = ncores
|
|
70
|
+
envs["regions_file"] = regions_file
|
|
71
|
+
envs["samples_file"] = samples_file
|
|
72
|
+
|
|
73
|
+
run_bcftools(envs, bcftools=bcftools, index=index, tabix=tabix)
|
|
@@ -63,7 +63,7 @@ def fix_vcffile(vcffile, outfile, fixes):
|
|
|
63
63
|
else:
|
|
64
64
|
modify_fixes.append(fix)
|
|
65
65
|
|
|
66
|
-
inopen = gzip.open if vcffile.endswith(".gz") else open
|
|
66
|
+
inopen = gzip.open if str(vcffile).endswith(".gz") else open
|
|
67
67
|
with inopen(vcffile, "rt") as fin, open(outfile, "w") as fout:
|
|
68
68
|
for line in fin:
|
|
69
69
|
obj = line_to_obj(line)
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Utilities for bcftools"""
|
|
2
|
+
|
|
3
|
+
from biopipen.utils.misc import run_command, dict_to_cli_args
|
|
4
|
+
from biopipen.utils.reference import tabix_index
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def bcftools_version(bcftools: str) -> tuple[int, ...]:
    """Get the version of bcftools

    Args:
        bcftools (str): Path to bcftools

    Returns:
        tuple[int, ...]: The version of bcftools, e.g. ``(1, 20)``. Only the
            leading dotted-number part is used, so a suffix such as
            ``-12-gabc`` does not break the parsing.

    Raises:
        ValueError: If no version number can be found in the first line of
            the output of ``bcftools version``.
    """
    import re  # local import: `re` is not imported at the top of this module

    lines = run_command([bcftools, "version"], stdout="return").splitlines()
    # First line looks like: bcftools 1.20
    first_line = lines[0] if lines else ""
    version = first_line.replace("bcftools", "").strip()
    matched = re.match(r"\d+(?:\.\d+)*", version)
    if matched is None:
        raise ValueError(
            f"Cannot parse the version of bcftools from: {first_line!r}"
        )
    return tuple(int(part) for part in matched.group().split("."))
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def run_bcftools(
    args: dict[str, object],
    bcftools: str,
    index: bool,
    tabix: str
) -> None:
    """Run bcftools with the given arguments

    Args:
        args: Arguments to pass to bcftools. The dict is not modified.
        bcftools (str): Path to bcftools
        index (bool): Whether to index the output. With bcftools 1.20+ the
            index is written by bcftools itself; otherwise the output
            (``args["o"]``) is indexed with tabix afterwards.
        tabix (str): Path to tabix
    """
    # Copy, so the index flag added below does not leak into the caller's dict
    args = dict(args)

    # Only query the version when an index is requested
    native_index = bool(index) and bcftools_version(bcftools) >= (1, 20)
    if native_index:
        # requires bcftools 1.20+
        # '--write-index tbi' not working
        # it has to be '--write-index=tbi'
        args["write_index=tbi"] = True

    run_command(dict_to_cli_args(args, dashify=True), fg=True)

    if index and not native_index:
        tabix_index(args["o"], "vcf", tmpdir=False, tabix=tabix)
|
biopipen/utils/gene.R
CHANGED
|
@@ -1,49 +1,95 @@
|
|
|
1
|
-
|
|
2
|
-
library(
|
|
1
|
+
suppressPackageStartupMessages({
|
|
2
|
+
library(rlang)
|
|
3
|
+
library(dplyr)
|
|
4
|
+
library(mygene)
|
|
5
|
+
})
|
|
3
6
|
|
|
4
|
-
|
|
7
|
+
|
|
8
|
+
#@' Convert gene names between different formats
|
|
9
|
+
#@'
|
|
10
|
+
#@' @param genes A character/integer vector of gene names/ids
|
|
11
|
+
#@' @param species A character vector of species names
|
|
12
|
+
#@' @param infmt A character vector of input gene name formats
|
|
13
|
+
#@' See the available scopes at
|
|
14
|
+
#@' https://docs.mygene.info/en/latest/doc/data.html#available-fields
|
|
15
|
+
#@' You can use ensg as a shortcut for ensembl.gene
|
|
16
|
+
#@' @param outfmt A character vector of output gene name formats
|
|
17
|
+
#@' @param dup How to deal with duplicate gene names found.
|
|
18
|
+
#@' "first": keep the first one (default), sorted by score descendingly
|
|
19
|
+
#@' "last": keep the last one, sorted by score descendingly
|
|
20
|
+
#@' "all": keep all of them, each will be a separate row
|
|
21
|
+
#@' "<X>": combine them into a single string, separated by X
|
|
22
|
+
#@' @param notfound How to deal with gene names that are not found
|
|
23
|
+
#@' "error": stop with an error message
|
|
24
|
+
#@' "use-query": use the query gene name as the converted gene name
|
|
25
|
+
#@' "skip": skip the gene names that are not found
|
|
26
|
+
#@' "ignore": Same as "skip"
|
|
27
|
+
#@' "na": use NA as the converted gene name (default)
|
|
28
|
+
#@' @param suppress_messages Whether to suppress the warning messages
|
|
29
|
+
#@' @return A tibble with the query gene names and the converted gene names
|
|
30
|
+
#@' When a gene name is not found, the converted name will be NA
|
|
31
|
+
#@' When duplicate gene names are found, the one with the highest score will be kept
|
|
32
|
+
#@' @export
|
|
33
|
+
gene_name_conversion <- function(
    genes,
    infmt,
    outfmt,
    dup = "first",
    species = "human",
    notfound = "na",
    suppress_messages = FALSE
) {
    notfound <- arg_match(notfound, c("error", "use-query", "skip", "ignore", "na"))

    # "ensg"/"ensmusg" are shortcuts for "ensembl.gene".
    # Guard on the length so that a multi-scope `infmt` vector does not reach
    # the scalar `if` condition, which is an error for length > 1.
    if (length(infmt) == 1 && infmt %in% c("ensg", "ensmusg")) { infmt <- "ensembl.gene" }
    if (length(outfmt) == 1 && outfmt %in% c("ensg", "ensmusg")) { outfmt <- "ensembl.gene" }

    orig_genes <- genes
    if (identical(infmt, "ensembl.gene")) {
        # Remove version numbers from ensembl gene ids
        genes <- gsub("\\..*", "", genes)
    }
    # Keep the original ids so they can be reported back as the query
    query_df <- tibble(query = genes, orig = orig_genes)

    if (suppress_messages) {
        capture.output(suppressWarnings(suppressMessages({
            out <- queryMany(genes, scopes=infmt, fields=outfmt, species=species) %>%
                as_tibble()
        })))
    } else {
        out <- queryMany(genes, scopes=infmt, fields=outfmt, species=species) %>%
            as_tibble()
    }

    if (nrow(out) == 0) {
        return(tibble(query = orig_genes, converted = NA_character_))
    }

    if (dup == "first") {
        # Highest score first, keep one row per query
        out = out %>% group_by(query) %>% arrange(desc(X_score)) %>%
            slice_head(n=1) %>% ungroup() %>% dplyr::select(all_of(c("query", outfmt)))
    } else if (dup == "last") {
        # Lowest score first, keep one row per query
        out = out %>% group_by(query) %>% arrange(X_score) %>%
            slice_head(n=1) %>% ungroup() %>% dplyr::select(all_of(c("query", outfmt)))
    } else if (dup != "all") {
        # Any other value of `dup` is the separator to combine duplicates with
        out = out %>% group_by(query) %>% arrange(desc(X_score)) %>%
            summarise(!!sym(outfmt) := paste(unique(!!sym(outfmt)), collapse=dup))
    }
    # Report the original (e.g. versioned) ids as the query
    out <- query_df %>%
        left_join(out, by="query") %>%
        dplyr::select(-"query") %>%
        dplyr::select(query = orig, everything())

    if (notfound == "error") {
        if (any(is.na(out[[outfmt]]))) {
            nagenes = out %>% filter(is.na(.[[outfmt]])) %>% pull("query")
            stop(paste("Query genes not found:", paste(nagenes, collapse=",")))
        }
    } else if (notfound == "use-query") {
        out = out %>% mutate(!!sym(outfmt) := coalesce(!!sym(outfmt), query))
    } else if (notfound == "skip" || notfound == "ignore") {
        out = out %>% filter(!is.na(!!sym(outfmt)))
    }

    return(out)
}
|
biopipen/utils/gene.py
CHANGED
|
@@ -1,86 +1,134 @@
|
|
|
1
1
|
"""Do gene name conversion"""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import re
|
|
5
|
+
import contextlib
|
|
6
|
+
import pandas as pd
|
|
2
7
|
from mygene import MyGeneInfo
|
|
3
|
-
from datar.all import (
|
|
4
|
-
c,
|
|
5
|
-
f,
|
|
6
|
-
group_by,
|
|
7
|
-
desc,
|
|
8
|
-
arrange,
|
|
9
|
-
slice_head,
|
|
10
|
-
tibble,
|
|
11
|
-
left_join,
|
|
12
|
-
mutate,
|
|
13
|
-
is_na,
|
|
14
|
-
across,
|
|
15
|
-
if_else,
|
|
16
|
-
filter_,
|
|
17
|
-
pull,
|
|
18
|
-
select,
|
|
19
|
-
)
|
|
20
8
|
|
|
21
9
|
mygene = MyGeneInfo()
|
|
22
10
|
|
|
23
11
|
|
|
24
|
-
class QueryGenesNotFound(
|
|
12
|
+
class QueryGenesNotFound(ValueError):
|
|
25
13
|
"""When genes cannot be found"""
|
|
26
14
|
|
|
27
15
|
|
|
28
16
|
def gene_name_conversion(
    genes: list[str],
    infmt: str | list[str],
    outfmt: str,
    dup: str = "first",
    species: str = "human",
    notfound: str = "na",
    suppress_messages: bool = False,
):
    """Convert gene names using MyGeneInfo

    Args:
        genes: A character/integer vector of gene names/ids
        species: A character vector of species names
        infmt: A character vector of input gene name formats
            See the available scopes at
            https://docs.mygene.info/en/latest/doc/data.html#available-fields
            You can use ensg as a shortcut for ensembl.gene
        outfmt: A character vector of output gene name formats
        dup: How to deal with duplicate gene names found.
            first: keep the first one (default), sorted by score descendingly
            last: keep the last one, sorted by score descendingly
            all: keep all of them, each will be a separate row
            <X>: combine them into a single string, separated by X
        notfound: How to deal with gene names that are not found
            error: stop with an error message
            use-query: use the query gene name as the converted gene name
            skip: skip the gene names that are not found
            ignore: Same as "skip"
            na: use NA as the converted gene name (default)
        suppress_messages: Suppress the messages while querying

    Returns:
        A dataframe with the query gene names and the converted gene names
        When a gene name is not found, the converted name will be "NA"
        When duplicate gene names are found, the one with the highest score will be kept
        When the query returns nothing at all, the dataframe has the columns
        `query` and `converted`, with "NA" for every gene.

    Raises:
        ValueError: When `notfound` is not one of the supported values
        QueryGenesNotFound: When `notfound` is "error" and some genes cannot
            be converted
    """
    notfound = notfound.lower()
    if notfound not in ("error", "use-query", "skip", "ignore", "na"):
        raise ValueError(
            "`notfound` of `gene_name_conversion` must be one of "
            "'error', 'use-query', 'skip', 'ignore', 'na'"
        )

    if infmt in ["ensg", "ensmusg"]:
        infmt = "ensembl.gene"
    if outfmt in ["ensg", "ensmusg"]:
        outfmt = "ensembl.gene"

    orig_genes = genes[:]
    if infmt == "ensembl.gene":
        # Remove version numbers from ensembl gene ids
        genes = [re.sub("\\..*", "", gene) for gene in genes]

    # Keep the original ids so they can be reported back as the query
    query_df = pd.DataFrame({"query": genes, "orig": orig_genes})

    silencer = (
        contextlib.redirect_stdout(None)
        if suppress_messages
        else contextlib.nullcontext()
    )
    with silencer:
        out = mygene.querymany(
            genes,
            scopes=infmt,
            fields=outfmt,
            species=species,
            as_dataframe=True,
            df_index=False,
        )

    if out.shape[0] == 0:
        # Report the original ids, as the non-empty path does
        return pd.DataFrame(
            {"query": orig_genes, "converted": ["NA"] * len(orig_genes)}
        )

    if dup == "first":
        out = (
            out
            .sort_values("_score", ascending=False)
            .groupby("query")
            .head(1)
            .reset_index(drop=True)
        )
    elif dup == "last":
        out = (
            out
            .sort_values("_score", ascending=False)
            .groupby("query")
            .tail(1)
            .reset_index(drop=True)
        )
    elif dup != "all":
        # Any other value of `dup` is the separator to combine duplicates with
        out = (
            out
            .sort_values("_score", ascending=False)
            .groupby("query")
            .agg({outfmt: lambda x: f"{dup}".join([str(x) for x in x.unique()])})
            .reset_index()
        )

    out = pd.merge(query_df, out, on="query", how="left")
    out = out.drop(columns=["query"]).rename(columns={"orig": "query"})

    if notfound == "error":
        if out[outfmt].isnull().any():
            nagenes = out[out[outfmt].isnull()]["query"].tolist()
            # ids may be integers, so stringify them before joining
            raise QueryGenesNotFound(
                f"Query genes not found: {','.join(map(str, nagenes))}"
            )
    elif notfound == "use-query":
        out[outfmt] = out[outfmt].combine_first(out["query"])
    elif notfound in ["skip", "ignore"]:
        out = out.dropna(subset=[outfmt])
    else:  # notfound == "na"
        out[outfmt] = out[outfmt].fillna("NA")

    return out
|