biopipen 0.28.0__py3-none-any.whl → 0.29.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +8 -0
- biopipen/ns/bam.py +0 -2
- biopipen/ns/bed.py +35 -0
- biopipen/ns/cellranger_pipeline.py +5 -5
- biopipen/ns/cnv.py +18 -2
- biopipen/ns/cnvkit_pipeline.py +16 -11
- biopipen/ns/gene.py +68 -23
- biopipen/ns/misc.py +2 -15
- biopipen/ns/plot.py +146 -0
- biopipen/ns/regulation.py +214 -0
- biopipen/ns/scrna.py +15 -3
- biopipen/ns/snp.py +516 -8
- biopipen/ns/stats.py +74 -2
- biopipen/ns/vcf.py +196 -0
- biopipen/reports/snp/PlinkCallRate.svelte +24 -0
- biopipen/reports/snp/PlinkFreq.svelte +18 -0
- biopipen/reports/snp/PlinkHWE.svelte +18 -0
- biopipen/reports/snp/PlinkHet.svelte +18 -0
- biopipen/reports/snp/PlinkIBD.svelte +18 -0
- biopipen/scripts/bam/CNVpytor.py +144 -46
- biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
- biopipen/scripts/bed/BedtoolsMerge.py +1 -1
- biopipen/scripts/cnv/AneuploidyScore.R +30 -7
- biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
- biopipen/scripts/cnv/TMADScore.R +21 -5
- biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
- biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
- biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
- biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
- biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
- biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
- biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
- biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
- biopipen/scripts/gene/GeneNameConversion.R +65 -0
- biopipen/scripts/gene/GenePromoters.R +61 -0
- biopipen/scripts/misc/Shell.sh +15 -0
- biopipen/scripts/plot/Manhattan.R +140 -0
- biopipen/scripts/plot/QQPlot.R +62 -0
- biopipen/scripts/regulation/MotifAffinityTest.R +226 -0
- biopipen/scripts/regulation/MotifAffinityTest_AtSNP.R +126 -0
- biopipen/scripts/regulation/MotifAffinityTest_MotifBreakR.R +96 -0
- biopipen/scripts/regulation/MotifScan.py +159 -0
- biopipen/scripts/regulation/atSNP.R +33 -0
- biopipen/scripts/regulation/motifBreakR.R +1594 -0
- biopipen/scripts/scrna/CellsDistribution.R +2 -0
- biopipen/scripts/scrna/MarkersFinder.R +59 -67
- biopipen/scripts/scrna/SeuratClustering.R +63 -29
- biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
- biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
- biopipen/scripts/snp/MatrixEQTL.R +84 -43
- biopipen/scripts/snp/Plink2GTMat.py +133 -0
- biopipen/scripts/snp/PlinkCallRate.R +190 -0
- biopipen/scripts/snp/PlinkFilter.py +100 -0
- biopipen/scripts/snp/PlinkFreq.R +298 -0
- biopipen/scripts/snp/PlinkFromVcf.py +78 -0
- biopipen/scripts/snp/PlinkHWE.R +80 -0
- biopipen/scripts/snp/PlinkHet.R +92 -0
- biopipen/scripts/snp/PlinkIBD.R +197 -0
- biopipen/scripts/snp/PlinkUpdateName.py +124 -0
- biopipen/scripts/stats/MetaPvalue.R +2 -1
- biopipen/scripts/stats/MetaPvalue1.R +70 -0
- biopipen/scripts/tcr/TCRClusterStats.R +12 -7
- biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
- biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
- biopipen/scripts/vcf/BcftoolsSort.py +113 -0
- biopipen/scripts/vcf/BcftoolsView.py +73 -0
- biopipen/scripts/vcf/VcfFix_utils.py +1 -1
- biopipen/scripts/vcf/bcftools_utils.py +52 -0
- biopipen/utils/gene.R +83 -37
- biopipen/utils/gene.py +108 -60
- biopipen/utils/misc.R +56 -0
- biopipen/utils/misc.py +5 -2
- biopipen/utils/reference.py +54 -10
- {biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/METADATA +2 -2
- {biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/RECORD +78 -50
- {biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/entry_points.txt +1 -1
- biopipen/ns/bcftools.py +0 -111
- biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
- biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
- biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
- biopipen/scripts/gene/GeneNameConversion.py +0 -66
- {biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
source("{{biopipen_dir}}/utils/misc.R")
|
|
2
|
+
source("{{biopipen_dir}}/utils/plot.R")
|
|
3
|
+
suppressPackageStartupMessages({
|
|
4
|
+
library(dplyr)
|
|
5
|
+
library(tidyr)
|
|
6
|
+
library(tibble)
|
|
7
|
+
})
|
|
8
|
+
|
|
9
|
+
indir <- {{in.indir | r}}
|
|
10
|
+
outdir <- {{out.outdir | r}}
|
|
11
|
+
plink <- {{envs.plink | r}}
|
|
12
|
+
indep <- {{envs.indep | r}}
|
|
13
|
+
highld <- {{envs.highld | r}}
|
|
14
|
+
devpars <- {{envs.devpars | r}}
|
|
15
|
+
pihat <- {{envs.pihat | r}}
|
|
16
|
+
samid <- {{envs.samid | r}}
|
|
17
|
+
annofile <- {{envs.anno | r}}
|
|
18
|
+
doplot <- {{envs.plot | r}}
|
|
19
|
+
seed <- {{envs.seed | r}}
|
|
20
|
+
ncores <- {{envs.ncores | r}}
|
|
21
|
+
|
|
22
|
+
bedfile <- Sys.glob(file.path(indir, '*.bed'))
|
|
23
|
+
if (length(bedfile) == 0)
|
|
24
|
+
stop("No bed files found in the input directory.")
|
|
25
|
+
if (length(bedfile) > 1) {
|
|
26
|
+
log_warn("Multiple bed files found in the input directory. Using the first one.")
|
|
27
|
+
bedfile <- bedfile[1]
|
|
28
|
+
}
|
|
29
|
+
input <- tools::file_path_sans_ext(bedfile)
|
|
30
|
+
output <- file.path(outdir, basename(input))
|
|
31
|
+
|
|
32
|
+
cmd <- c(
|
|
33
|
+
plink,
|
|
34
|
+
"--threads", ncores,
|
|
35
|
+
"--bfile", input,
|
|
36
|
+
"--indep-pairwise", indep,
|
|
37
|
+
# One should be mindful of running this with < 50 samples
|
|
38
|
+
# "--bad-ld",
|
|
39
|
+
"--out", output
|
|
40
|
+
)
|
|
41
|
+
if (!is.null(highld) && !isFALSE(highld)) {
|
|
42
|
+
cmd <- c(cmd, "--range", "--exclude", highld)
|
|
43
|
+
}
|
|
44
|
+
run_command(cmd, fg = TRUE)
|
|
45
|
+
|
|
46
|
+
prunein <- paste0(output, '.prune.in')
|
|
47
|
+
cmd <- c(
|
|
48
|
+
plink,
|
|
49
|
+
"--threads", ncores,
|
|
50
|
+
"--bfile", input,
|
|
51
|
+
"--extract", prunein,
|
|
52
|
+
"--genome",
|
|
53
|
+
"--out", output
|
|
54
|
+
)
|
|
55
|
+
run_command(cmd, fg = TRUE)
|
|
56
|
+
|
|
57
|
+
genome <- read.table(
|
|
58
|
+
paste0(output, '.genome'),
|
|
59
|
+
row.names = NULL,
|
|
60
|
+
header = TRUE,
|
|
61
|
+
check.names = FALSE
|
|
62
|
+
)
|
|
63
|
+
# "unmelt" it
|
|
64
|
+
# FID1 IID1 FID2 IID2 RT EZ Z0 Z1 Z2 PI_HAT PHE DST PPC RATIO
|
|
65
|
+
# s1 s1 s2 s2 UN NA 1.0000 0.0000 0.0000 0.0000 -1 0.866584 0.0000 0.9194
|
|
66
|
+
# s1 s1 s2 s2 UN NA 0.4846 0.3724 0.1431 0.3293 -1 0.913945 0.7236 2.0375
|
|
67
|
+
# s1 s1 s3 s3 UN NA 1.0000 0.0000 0.0000 0.0000 -1 0.867186 0.0000 1.0791
|
|
68
|
+
genome$SAMPLE1 <- paste(genome$FID1, genome$IID1, sep = "\t")
|
|
69
|
+
genome$SAMPLE2 <- paste(genome$FID2, genome$IID2, sep = "\t")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# get all samples
|
|
73
|
+
samples <- unique(c(genome$SAMPLE1, genome$SAMPLE2))
|
|
74
|
+
# make paired into a distance-like matrix
|
|
75
|
+
similarity <- genome %>%
|
|
76
|
+
select(SAMPLE1, SAMPLE2, PI_HAT) %>%
|
|
77
|
+
pivot_wider(names_from = SAMPLE2, values_from = PI_HAT, values_fill = NA) %>%
|
|
78
|
+
as.data.frame() %>%
|
|
79
|
+
column_to_rownames("SAMPLE1")
|
|
80
|
+
rm(genome)
|
|
81
|
+
# get the rownames back
|
|
82
|
+
samids <- rownames(similarity)
|
|
83
|
+
# get samples that were not involved as a row or as a column
|
|
84
|
+
missedrow <- setdiff(samples, rownames(similarity))
|
|
85
|
+
missedcol <- setdiff(samples, colnames(similarity))
|
|
86
|
+
similarity[missedrow, ] <- NA
|
|
87
|
+
similarity[, missedcol] <- NA
|
|
88
|
+
# order the matrix
|
|
89
|
+
similarity <- similarity[samples, samples, drop = FALSE]
|
|
90
|
+
# transpose the matrix to get the symmetric values
|
|
91
|
+
sim2 <- t(similarity)
|
|
92
|
+
isna <- is.na(similarity)
|
|
93
|
+
# fill the na's with their symmetric values
|
|
94
|
+
similarity[isna] <- sim2[isna]
|
|
95
|
+
rm(sim2)
|
|
96
|
+
# still missing after the symmetric fill: set them to 0
|
|
97
|
+
similarity[is.na(similarity)] <- 0
|
|
98
|
+
# get the marks (samples that fail the pihat cutoff)
|
|
99
|
+
nsams <- length(samples)
|
|
100
|
+
fails <- which(similarity > pihat)
|
|
101
|
+
marks <- data.frame(x = (fails - 1)%%nsams + 1, y = ceiling(fails/nsams))
|
|
102
|
+
diag(similarity) <- 1
|
|
103
|
+
|
|
104
|
+
# Greedily choose the samples to remove: repeatedly take the sample that
# appears in the most failing pairs (PI_HAT > pihat) until every failing
# pair contains at least one chosen sample.
# `marks` holds one row per failing cell of `similarity`; x/y are indices
# into `samples`.
failflags <- rep(FALSE, nrow(marks))
freqs <- as.data.frame(table(factor(as.matrix(marks))))
# Var1 is a factor whose *labels* are the sample indices. Indexing with a
# factor uses its integer codes, not its labels, so convert through
# character to recover the real indices before using them on `samples`.
freqs <- as.integer(as.character(
    freqs[order(freqs$Freq, decreasing = TRUE), "Var1", drop = TRUE]
))
# character(0) rather than c() (NULL): writeLines() only accepts character
# vectors, so an empty result must still be a character vector.
ibd.fail <- character(0)
while (sum(failflags) < nrow(marks)) {
    samidx <- freqs[1]
    ibd.fail <- c(ibd.fail, samples[samidx])
    freqs <- freqs[-1]
    # flag every failing pair that involves the chosen sample
    failflags <- failflags | marks$x == samidx | marks$y == samidx
}
|
|
117
|
+
|
|
118
|
+
ibd_fail_file <- paste0(output, '.ibd.fail')
|
|
119
|
+
writeLines(ibd.fail, ibd_fail_file)
|
|
120
|
+
cmd <- c(
|
|
121
|
+
plink,
|
|
122
|
+
"--threads", ncores,
|
|
123
|
+
"--bfile", input,
|
|
124
|
+
"--remove", ibd_fail_file,
|
|
125
|
+
"--make-bed",
|
|
126
|
+
"--out", output
|
|
127
|
+
)
|
|
128
|
+
run_command(cmd, fg = TRUE)
|
|
129
|
+
|
|
130
|
+
if (doplot) {
|
|
131
|
+
set.seed(seed)
|
|
132
|
+
library(ComplexHeatmap)
|
|
133
|
+
fontsize8 <- gpar(fontsize = 8)
|
|
134
|
+
fontsize9 <- gpar(fontsize = 9)
|
|
135
|
+
ht_opt$heatmap_row_names_gp <- fontsize8
|
|
136
|
+
ht_opt$heatmap_column_names_gp <- fontsize8
|
|
137
|
+
ht_opt$legend_title_gp <- fontsize9
|
|
138
|
+
ht_opt$legend_labels_gp <- fontsize8
|
|
139
|
+
ht_opt$simple_anno_size <- unit(3, "mm")
|
|
140
|
+
|
|
141
|
+
samids <- sapply(samples, function(sid) {
|
|
142
|
+
fidiid <- unlist(strsplit(sid, "\t", fixed = TRUE))
|
|
143
|
+
gsub(
|
|
144
|
+
"{fid}",
|
|
145
|
+
fidiid[1],
|
|
146
|
+
gsub("{iid}", fidiid[2], samid, fixed = TRUE),
|
|
147
|
+
fixed = TRUE
|
|
148
|
+
)
|
|
149
|
+
})
|
|
150
|
+
rownames(similarity) <- samids
|
|
151
|
+
colnames(similarity) <- samids
|
|
152
|
+
|
|
153
|
+
annos <- list()
|
|
154
|
+
if (!is.null(annofile) && !isFALSE(annofile)) {
|
|
155
|
+
options(stringsAsFactors = TRUE)
|
|
156
|
+
andata <- read.table(annofile, header = TRUE, row.names = 1, sep = "\t", check.names = FALSE)
|
|
157
|
+
andata <- andata[samids, , drop = FALSE]
|
|
158
|
+
for (anname in colnames(andata)) {
|
|
159
|
+
annos[[anname]] <- as.matrix(andata[, anname])
|
|
160
|
+
}
|
|
161
|
+
annos$annotation_name_gp <- fontsize8
|
|
162
|
+
annos <- do.call(HeatmapAnnotation, annos)
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
args <- list(
|
|
166
|
+
name = "PI_HAT",
|
|
167
|
+
cell_fun = function(j, i, x, y, width, height, fill) {
|
|
168
|
+
if (similarity[i, j] > pihat && i != j)
|
|
169
|
+
grid.points(x, y, pch = 4, size = unit(.5, "char"))
|
|
170
|
+
},
|
|
171
|
+
#heatmap_legend_param = list(
|
|
172
|
+
# title_gp = fontsize9,
|
|
173
|
+
# labels_gp = fontsize8
|
|
174
|
+
#),
|
|
175
|
+
clustering_distance_rows = function(m) as.dist(1-m),
|
|
176
|
+
clustering_distance_columns = function(m) as.dist(1-m),
|
|
177
|
+
top_annotation = if (length(annos) == 0) NULL else annos
|
|
178
|
+
)
|
|
179
|
+
|
|
180
|
+
plotHeatmap(
|
|
181
|
+
similarity,
|
|
182
|
+
outfile = paste0(output, '.ibd.png'),
|
|
183
|
+
args = args,
|
|
184
|
+
draw = list(
|
|
185
|
+
annotation_legend_list = list(
|
|
186
|
+
Legend(
|
|
187
|
+
labels = paste(">", pihat),
|
|
188
|
+
title = "",
|
|
189
|
+
type = "points",
|
|
190
|
+
pch = 4,
|
|
191
|
+
title_gp = fontsize9,
|
|
192
|
+
labels_gp = fontsize8)),
|
|
193
|
+
merge_legend = TRUE
|
|
194
|
+
),
|
|
195
|
+
devpars = devpars
|
|
196
|
+
)
|
|
197
|
+
}
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from biopipen.utils.misc import run_command, dict_to_cli_args, logger
|
|
3
|
+
|
|
4
|
+
indir = {{in.indir | repr}} # pyright: ignore # noqa: #999
|
|
5
|
+
namefile = {{in.namefile | repr}} # pyright: ignore
|
|
6
|
+
outdir = {{out.outdir | repr}} # pyright: ignore
|
|
7
|
+
plink = {{envs.plink | repr}} # pyright: ignore
|
|
8
|
+
bcftools = {{envs.bcftools | repr}} # pyright: ignore
|
|
9
|
+
ncores = {{envs.ncores | repr}} # pyright: ignore
|
|
10
|
+
match_alt = {{envs.match_alt | repr}} # pyright: ignore
|
|
11
|
+
|
|
12
|
+
bedfile = list(Path(indir).glob("*.bed"))
|
|
13
|
+
if len(bedfile) == 0:
|
|
14
|
+
raise FileNotFoundError(f"No .bed file found in `in.indir`")
|
|
15
|
+
elif len(bedfile) > 1:
|
|
16
|
+
logger.warning(f"Multiple .bed files found in `in.indir`, using the first one.")
|
|
17
|
+
|
|
18
|
+
bedfile = bedfile[0]
|
|
19
|
+
input = bedfile.with_suffix("")
|
|
20
|
+
output = Path(outdir) / bedfile.stem
|
|
21
|
+
|
|
22
|
+
if namefile.endswith(".vcf") or namefile.endswith(".vcf.gz"):
|
|
23
|
+
logger.info("VCF file received, extracting names")
|
|
24
|
+
def alt_matched(bim_alt, vcf_alt, match_alt):
    """Tell whether the ALT field of a .bim record matches that of a VCF record.

    Args:
        bim_alt: ALT string from the .bim record (comma-separated alleles).
        vcf_alt: ALT string from the VCF record (comma-separated alleles).
        match_alt: Matching mode, one of
            "none" (always match), "exact" (identical strings),
            "all" (same set of alleles), "any" (at least one shared allele),
            "first_included" (first bim allele is among the VCF alleles),
            "first" (first alleles are equal).

    Returns:
        True if the two ALT fields match under the given mode.

    Raises:
        ValueError: If `match_alt` is not one of the modes above.
    """
    # Modes that work on the raw strings, no splitting needed
    if match_alt == "none":
        return True
    if match_alt == "exact":
        return bim_alt == vcf_alt

    ours = bim_alt.split(",")
    theirs = vcf_alt.split(",")
    if match_alt == "all":
        verdict = set(ours) == set(theirs)
    elif match_alt == "any":
        verdict = not set(ours).isdisjoint(theirs)
    elif match_alt == "first_included":
        verdict = ours[0] in theirs
    elif match_alt == "first":
        verdict = ours[0] == theirs[0]
    else:
        raise ValueError(f"Unknown match_alt: {match_alt}")
    return verdict
|
|
42
|
+
|
|
43
|
+
def readline(f):
    """Read the next line from `f` and split it into tab-separated fields.

    The line is stripped of surrounding whitespace first. Returns None when
    the stripped line is empty (end of file or a blank line).
    """
    text = f.readline().strip()
    if not text:
        return None
    return text.split("\t")
|
|
46
|
+
|
|
47
|
+
namefile_tmp = Path(outdir) / "_namefile_from_vcf.txt"
|
|
48
|
+
infofile = Path(outdir) / "_information_from_vcf_unsorted.txt"
|
|
49
|
+
sorted_infofile = Path(outdir) / "_information_from_vcf_sorted.txt"
|
|
50
|
+
sorted_bim = Path(outdir) / "_sorted_bim.txt"
|
|
51
|
+
bt_cmd = [
|
|
52
|
+
bcftools, "query",
|
|
53
|
+
"-f", "%CHROM\\t%ID\\t0\\t%POS\\t%ALT\\t%REF\\n",
|
|
54
|
+
"-o", infofile,
|
|
55
|
+
namefile,
|
|
56
|
+
]
|
|
57
|
+
## infofile
|
|
58
|
+
# 1 rs10492 0 10492 T C
|
|
59
|
+
logger.info("- Extracting information from VCF file ...")
|
|
60
|
+
run_command(bt_cmd, fg=True)
|
|
61
|
+
# sort infofile
|
|
62
|
+
logger.info("- Sorting the information from VCF file ...")
|
|
63
|
+
run_command(
|
|
64
|
+
[
|
|
65
|
+
"sort",
|
|
66
|
+
"-k1,1", "-k4,4n", "-k6,6",
|
|
67
|
+
infofile,
|
|
68
|
+
"--parallel", ncores,
|
|
69
|
+
"-o", sorted_infofile
|
|
70
|
+
],
|
|
71
|
+
env={"LC_ALL": "C"},
|
|
72
|
+
fg=True,
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
## .bim file
|
|
76
|
+
# 1 1_10492 0 10492 T C
|
|
77
|
+
# sort .bim file
|
|
78
|
+
logger.info("- Sorting the .bim file ...")
|
|
79
|
+
run_command(
|
|
80
|
+
[
|
|
81
|
+
"sort",
|
|
82
|
+
"-k1,1", "-k4,4n", "-k6,6",
|
|
83
|
+
input.with_suffix(".bim"),
|
|
84
|
+
"--parallel", ncores,
|
|
85
|
+
"-o", sorted_bim
|
|
86
|
+
],
|
|
87
|
+
env={"LC_ALL": "C"},
|
|
88
|
+
fg=True,
|
|
89
|
+
)
|
|
90
|
+
# query namefile for records in sorted bim file
|
|
91
|
+
logger.info("- Matching and generating the name file ...")
|
|
92
|
+
# Merge-join the two sorted files on (chromosome, position, REF) and write
# "<bim variant id>\t<vcf variant id>" for every record pair whose ALT
# alleles also match under `match_alt`.
with sorted_bim.open() as fbim, sorted_infofile.open() as finfo, namefile_tmp.open("w") as fout:  # noqa: E501
    bim = readline(fbim)
    info = readline(finfo)
    while bim and info:
        # Both files were sorted with `-k1,1 -k4,4n -k6,6`, i.e. the position
        # is ordered numerically. Compare it as an int here as well:
        # comparing it as a string ("99" > "100") advances the wrong file
        # and silently drops matches.
        bim_key = (bim[0], int(bim[3]), bim[5])
        info_key = (info[0], int(info[3]), info[5])
        if bim_key == info_key and alt_matched(bim[4], info[4], match_alt):
            fout.write(f"{bim[1]}\t{info[1]}\n")
            bim = readline(fbim)
            info = readline(finfo)
        elif bim_key < info_key:
            # the .bim record is behind: advance it
            bim = readline(fbim)
        else:
            # the VCF record is behind, or same key but ALT mismatch
            info = readline(finfo)
|
|
113
|
+
|
|
114
|
+
namefile = namefile_tmp
|
|
115
|
+
|
|
116
|
+
args = {
|
|
117
|
+
"": plink,
|
|
118
|
+
"bfile": input,
|
|
119
|
+
"out": output,
|
|
120
|
+
"make_bed": True,
|
|
121
|
+
"update_name": namefile,
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
run_command(dict_to_cli_args(args, dashify=True), fg=True)
|
|
@@ -11,6 +11,7 @@ id_exprs <- {{envs.id_exprs | r}}
|
|
|
11
11
|
pval_cols <- {{envs.pval_cols | r}}
|
|
12
12
|
method <- {{envs.method | r}}
|
|
13
13
|
na <- {{envs.na | r}}
|
|
14
|
+
keep_single <- {{envs.keep_single | r}}
|
|
14
15
|
padj <- {{envs.padj | r}}
|
|
15
16
|
|
|
16
17
|
if (method == "fisher") { method = "sumlog" }
|
|
@@ -102,7 +103,7 @@ if (length(infiles) == 1 && padj == "none") {
|
|
|
102
103
|
if (length(ps) == 0) {
|
|
103
104
|
metaps <- c(metaps, NA)
|
|
104
105
|
ns <- c(ns, NA)
|
|
105
|
-
} else if (length(ps) == 1) {
|
|
106
|
+
} else if (length(ps) == 1 && keep_single) {
|
|
106
107
|
metaps <- c(metaps, ps)
|
|
107
108
|
ns <- c(ns, 1)
|
|
108
109
|
} else {
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
source("{{biopipen_dir}}/utils/misc.R")
|
|
2
|
+
|
|
3
|
+
library(metap)
|
|
4
|
+
library(rlang)
|
|
5
|
+
library(dplyr)
|
|
6
|
+
|
|
7
|
+
infile <- {{in.infile | r}}
|
|
8
|
+
outfile <- {{out.outfile | r}}
|
|
9
|
+
id_cols <- {{envs.id_cols | r}}
|
|
10
|
+
pval_col <- {{envs.pval_col | r}}
|
|
11
|
+
method <- {{envs.method | r}}
|
|
12
|
+
na <- {{envs.na | r}}
|
|
13
|
+
keep_single <- {{envs.keep_single | r}}
|
|
14
|
+
padj <- {{envs.padj | r}}
|
|
15
|
+
|
|
16
|
+
if (method == "fisher") { method = "sumlog" }
|
|
17
|
+
|
|
18
|
+
# Check pval_col
|
|
19
|
+
if (is.null(pval_col)) { stop("Must provide envs.pval_col") }
|
|
20
|
+
|
|
21
|
+
# Check id_cols
|
|
22
|
+
if (is.null(id_cols)) { stop("Must provide envs.id_cols") }
|
|
23
|
+
if (length(id_cols) == 1) {
|
|
24
|
+
id_cols <- trimws(strsplit(id_cols, ",")[[1]])
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
log_info("Reading input and performing meta-analysis ...")
|
|
28
|
+
outdata <- read.table(
|
|
29
|
+
infile, header = TRUE, sep = "\t", row.names = NULL, check.names = FALSE
|
|
30
|
+
) %>%
|
|
31
|
+
group_by(!!!syms(id_cols)) %>%
|
|
32
|
+
summarise(
|
|
33
|
+
N = n(),
|
|
34
|
+
.pvals = list(!!sym(pval_col)),
|
|
35
|
+
.groups = "drop"
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
metaps <- c()
|
|
39
|
+
ns <- c()
|
|
40
|
+
for (ps in outdata$.pvals) {
|
|
41
|
+
if (na == -1) {
|
|
42
|
+
ps <- ps[!is.na(ps)]
|
|
43
|
+
} else {
|
|
44
|
+
ps[is.na(ps)] <- na
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
if (length(ps) == 0) {
|
|
48
|
+
metaps <- c(metaps, NA)
|
|
49
|
+
ns <- c(ns, NA)
|
|
50
|
+
} else if (length(ps) == 1 && keep_single) {
|
|
51
|
+
metaps <- c(metaps, ps)
|
|
52
|
+
ns <- c(ns, 1)
|
|
53
|
+
} else {
|
|
54
|
+
metaps <- c(metaps, do.call(method, list(ps))$p)
|
|
55
|
+
ns <- c(ns, length(ps))
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
outdata$MetaPval <- metaps
|
|
59
|
+
outdata$N <- ns
|
|
60
|
+
outdata$.pvals <- NULL
|
|
61
|
+
outdata <- outdata %>% arrange(MetaPval)
|
|
62
|
+
|
|
63
|
+
if (padj != "none") {
|
|
64
|
+
log_info("Calculating adjusted p-values ...")
|
|
65
|
+
outdata$MetaPadj <- p.adjust(outdata$MetaPval, method = padj)
|
|
66
|
+
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
log_info("Writing output ...")
|
|
70
|
+
write.table(outdata, outfile, quote = FALSE, sep = "\t", row.names = FALSE)
|
|
@@ -130,13 +130,6 @@ shared_clusters = function(name) {
|
|
|
130
130
|
row.names=TRUE, col.names=TRUE, quote=FALSE, sep="\t"
|
|
131
131
|
)
|
|
132
132
|
|
|
133
|
-
if (is.null(case$heatmap_meta) || length(case$heatmap_meta) == 0) {
|
|
134
|
-
anno = NULL
|
|
135
|
-
} else {
|
|
136
|
-
anno = as.list(immdata$meta[, case$heatmap_meta, drop=FALSE])
|
|
137
|
-
anno = do_call(ComplexHeatmap::HeatmapAnnotation, anno)
|
|
138
|
-
}
|
|
139
|
-
|
|
140
133
|
if (!is.null(case$sample_order) && length(case$sample_order) > 0) {
|
|
141
134
|
if (length(case$sample_order) == 1) {
|
|
142
135
|
case$sample_order = trimws(strsplit(case$sample_order, ",")[[1]])
|
|
@@ -148,6 +141,18 @@ shared_clusters = function(name) {
|
|
|
148
141
|
plotdata = plotdata[, case$sample_order, drop=FALSE]
|
|
149
142
|
}
|
|
150
143
|
|
|
144
|
+
if (is.null(case$heatmap_meta) || length(case$heatmap_meta) == 0) {
|
|
145
|
+
anno = NULL
|
|
146
|
+
} else {
|
|
147
|
+
anno = as.list(
|
|
148
|
+
immdata$meta[
|
|
149
|
+
match(colnames(plotdata), immdata$meta$Sample),
|
|
150
|
+
case$heatmap_meta,
|
|
151
|
+
drop=FALSE
|
|
152
|
+
])
|
|
153
|
+
anno = do_call(ComplexHeatmap::HeatmapAnnotation, anno)
|
|
154
|
+
}
|
|
155
|
+
|
|
151
156
|
cluster_rows = case$cluster_rows && nrow(plotdata) > 2
|
|
152
157
|
col_samples = colnames(plotdata)
|
|
153
158
|
if (!cluster_rows) {
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
from os import path
|
|
2
|
+
from contextlib import suppress
|
|
3
|
+
from pathlib import PosixPath # noqa: F401
|
|
4
|
+
|
|
5
|
+
from biopipen.utils.reference import tabix_index
|
|
6
|
+
from biopipen.utils.misc import logger
|
|
7
|
+
from biopipen.scripts.vcf.bcftools_utils import run_bcftools
|
|
8
|
+
|
|
9
|
+
infile = {{in.infile | repr}} # pyright: ignore # noqa: E999
|
|
10
|
+
annfile = {{in.annfile | repr}} # pyright: ignore
|
|
11
|
+
outfile = {{out.outfile | repr}} # pyright: ignore
|
|
12
|
+
joboutdir = {{job.outdir | repr}} # pyright: ignore
|
|
13
|
+
envs = {{envs | dict | repr}} # pyright: ignore
|
|
14
|
+
|
|
15
|
+
bcftools = envs.pop("bcftools")
|
|
16
|
+
tabix = envs.pop("tabix")
|
|
17
|
+
ncores = envs.pop("ncores")
|
|
18
|
+
columns = envs.pop("columns")
|
|
19
|
+
remove = envs.pop("remove")
|
|
20
|
+
header = envs.pop("header")
|
|
21
|
+
gz = envs.pop("gz")
|
|
22
|
+
index = envs.pop("index")
|
|
23
|
+
|
|
24
|
+
if isinstance(columns, list):
|
|
25
|
+
columns = ",".join(columns)
|
|
26
|
+
|
|
27
|
+
if "c" in envs:
|
|
28
|
+
logger.warning("Ignoring envs\[c], use envs\[columns] instead.")
|
|
29
|
+
del envs["c"]
|
|
30
|
+
|
|
31
|
+
if isinstance(remove, list):
|
|
32
|
+
remove = ",".join(remove)
|
|
33
|
+
|
|
34
|
+
if "x" in envs:
|
|
35
|
+
logger.warning("Ignoring envs\[x], use envs\[remove] instead.")
|
|
36
|
+
del envs["x"]
|
|
37
|
+
|
|
38
|
+
envs_has_annfile = "a" in envs or "annotations" in envs
|
|
39
|
+
headerfile = path.join(joboutdir, "header.txt")
|
|
40
|
+
if header:
|
|
41
|
+
with open(headerfile, "w") as fh:
|
|
42
|
+
fh.writelines(header)
|
|
43
|
+
|
|
44
|
+
if annfile and envs_has_annfile:
|
|
45
|
+
logger.warning(
|
|
46
|
+
"Ignoring envs\[a/annotations] because in.annfile is provided."
|
|
47
|
+
)
|
|
48
|
+
with suppress(KeyError):
|
|
49
|
+
del envs["a"]
|
|
50
|
+
with suppress(KeyError):
|
|
51
|
+
del envs["annotations"]
|
|
52
|
+
elif not annfile and envs_has_annfile:
|
|
53
|
+
annfile = envs.pop("annotations", None) or envs.pop("a", None)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
if index and not gz:
|
|
57
|
+
logger.warning("Forcing envs.gz to True because envs.index is True.")
|
|
58
|
+
gz = True
|
|
59
|
+
|
|
60
|
+
envs[""] = [bcftools, "annotate"]
|
|
61
|
+
envs["o"] = outfile
|
|
62
|
+
envs["threads"] = ncores
|
|
63
|
+
|
|
64
|
+
if "O" not in envs and "output-type" not in envs and "output_type" not in envs:
|
|
65
|
+
envs["O"] = "z" if gz else "v"
|
|
66
|
+
|
|
67
|
+
if columns:
|
|
68
|
+
envs["columns"] = columns
|
|
69
|
+
if not annfile:
|
|
70
|
+
raise ValueError(
|
|
71
|
+
"envs.columns specified but no in.annfile/envs.annfile provided."
|
|
72
|
+
)
|
|
73
|
+
envs["_"] = tabix_index(infile, "vcf", tabix=tabix)
|
|
74
|
+
|
|
75
|
+
if remove:
|
|
76
|
+
envs["remove"] = remove
|
|
77
|
+
# no need to index it
|
|
78
|
+
envs["_"] = infile
|
|
79
|
+
|
|
80
|
+
if "columns" not in envs and "remove" not in envs:
|
|
81
|
+
logger.warning(
|
|
82
|
+
"No columns/remove specified, no columns will be carried over or removed."
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
if annfile:
|
|
86
|
+
envs["annotations"] = tabix_index(annfile, "vcf", tabix=tabix)
|
|
87
|
+
|
|
88
|
+
if header:
|
|
89
|
+
envs["header_lines"] = headerfile
|
|
90
|
+
|
|
91
|
+
run_bcftools(envs, bcftools=bcftools, index=index, tabix=tabix)
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
from pathlib import Path, PosixPath # noqa: F401
|
|
2
|
+
|
|
3
|
+
from biopipen.utils.misc import logger
|
|
4
|
+
from biopipen.scripts.vcf.bcftools_utils import run_bcftools
|
|
5
|
+
|
|
6
|
+
infile = {{in.infile | repr}} # pyright: ignore # noqa: #999
|
|
7
|
+
outfile = {{out.outfile | repr}} # pyright: ignore
|
|
8
|
+
outdir = Path(outfile).parent
|
|
9
|
+
|
|
10
|
+
envs = {{envs | dict | repr}} # pyright: ignore
|
|
11
|
+
bcftools = envs.pop("bcftools")
|
|
12
|
+
tabix = envs.pop("tabix")
|
|
13
|
+
keep = envs.pop("keep")
|
|
14
|
+
ncores = envs.pop("ncores")
|
|
15
|
+
includes = envs.pop("includes")
|
|
16
|
+
excludes = envs.pop("excludes")
|
|
17
|
+
gz = envs.pop("gz")
|
|
18
|
+
index = envs.pop("index")
|
|
19
|
+
|
|
20
|
+
# a.vcf.gz -> a
|
|
21
|
+
# a.vcf -> a
|
|
22
|
+
stem = Path(infile).stem
|
|
23
|
+
if stem.endswith(".vcf"):
|
|
24
|
+
stem = stem[:-4]
|
|
25
|
+
# .vcf.gz
|
|
26
|
+
# .gz
|
|
27
|
+
ext = ".vcf.gz" if index or gz else '.vcf'
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def normalize_expr(expr, flag, prev_n_filters=0):
    """Normalize filter expressions into a `{name: (expression, flag)}` dict.

    Args:
        expr: A single expression, a list of expressions, or a dict mapping
            filter names to expressions. Falsy values yield an empty dict.
        flag: The flag stored with each expression (e.g. "include"); it is
            also upper-cased to build auto-generated filter names.
        prev_n_filters: Offset added to the running number used in
            auto-generated names (`FILTER_<FLAG>_<n>`).

    Returns:
        A dict whose values are `(expression, flag)` tuples. Dict input keeps
        its own names; list/scalar input gets auto-generated names.
    """
    if not expr:
        return {}

    if isinstance(expr, dict):
        return {name: (ex, flag) for name, ex in expr.items()}

    # A scalar expression is handled like a one-element list
    expressions = expr if isinstance(expr, list) else [expr]
    prefix = f"FILTER_{flag.upper()}_"
    return {
        f"{prefix}{position + 1 + prev_n_filters}": (ex, flag)
        for position, ex in enumerate(expressions)
    }
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def handle_filter(vcf, fname, filt, flag, final):
    """Run a single filter through `run_bcftools` and return its output path.

    Args:
        vcf: Input VCF passed as the positional argument.
        fname: Name of the filter; used in the per-filter file name and,
            when `keep` is set, as the value of the `s` argument.
        filt: The filter expression.
        flag: Argument name under which `filt` is passed (the callers in
            this script use "include" or "exclude").
        final: Whether this is the last filter. The last filter writes to
            `outfile` and a `<stem>.<fname><ext>` symlink pointing to it is
            created next to it; other filters write to that path directly.

    Returns:
        The value passed as the `o` argument.
    """
    logger.info("- Handling filter %s: %s ...", fname, filt)

    # Per-filter path: the real output for intermediate filters,
    # a symlink to `outfile` for the final one.
    flagfile = outdir / f"{stem}.{fname}{ext}"
    target = outfile if final else flagfile

    arguments = envs.copy()
    arguments[flag] = filt
    arguments["_"] = vcf
    arguments["o"] = target
    if keep:
        arguments["s"] = fname

    # Only the final output is indexed
    run_bcftools(arguments, bcftools=bcftools, index=index and final, tabix=tabix)

    if not final:
        return arguments["o"]

    # Replace a symlink left at the per-filter path, if any
    if flagfile.is_symlink():
        flagfile.unlink()
    flagfile.symlink_to(outfile)

    return arguments["o"]
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
includes = normalize_expr(includes, "include")
|
|
67
|
+
excludes = normalize_expr(excludes, "exclude", len(includes))
|
|
68
|
+
includes.update(excludes)
|
|
69
|
+
|
|
70
|
+
if index and not gz:
|
|
71
|
+
logger.warning("Forcing envs.gz to True because envs.index is True.")
|
|
72
|
+
gz = True
|
|
73
|
+
|
|
74
|
+
envs[""] = [bcftools, "filter"]
|
|
75
|
+
envs["_"] = infile
|
|
76
|
+
envs["o"] = outfile
|
|
77
|
+
envs["threads"] = ncores
|
|
78
|
+
|
|
79
|
+
if "O" not in envs and "output-type" not in envs and "output_type" not in envs:
|
|
80
|
+
envs["O"] = "z" if gz else "v"
|
|
81
|
+
|
|
82
|
+
if keep:
|
|
83
|
+
envs["soft_filter"] = "+"
|
|
84
|
+
|
|
85
|
+
if "m" not in envs and "mode" not in envs:
|
|
86
|
+
envs["m"] = "+"
|
|
87
|
+
|
|
88
|
+
# bcftools can only be run with one filter at a time
|
|
89
|
+
for i, (fname, (filt, flag)) in enumerate(includes.items()):
|
|
90
|
+
infile = handle_filter(infile, fname, filt, flag, i == len(includes) - 1)
|