biopipen 0.28.1__py3-none-any.whl → 0.29.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen might be problematic. Click here for more details.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +8 -0
- biopipen/ns/bam.py +0 -2
- biopipen/ns/bed.py +35 -0
- biopipen/ns/cellranger_pipeline.py +5 -5
- biopipen/ns/cnv.py +18 -2
- biopipen/ns/cnvkit_pipeline.py +16 -11
- biopipen/ns/gene.py +68 -23
- biopipen/ns/misc.py +2 -15
- biopipen/ns/plot.py +204 -0
- biopipen/ns/regulatory.py +214 -0
- biopipen/ns/scrna.py +31 -5
- biopipen/ns/snp.py +516 -8
- biopipen/ns/stats.py +167 -3
- biopipen/ns/vcf.py +196 -0
- biopipen/reports/snp/PlinkCallRate.svelte +24 -0
- biopipen/reports/snp/PlinkFreq.svelte +18 -0
- biopipen/reports/snp/PlinkHWE.svelte +18 -0
- biopipen/reports/snp/PlinkHet.svelte +18 -0
- biopipen/reports/snp/PlinkIBD.svelte +18 -0
- biopipen/scripts/bam/CNVpytor.py +144 -46
- biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
- biopipen/scripts/bed/BedtoolsMerge.py +1 -1
- biopipen/scripts/cnv/AneuploidyScore.R +30 -7
- biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
- biopipen/scripts/cnv/TMADScore.R +21 -5
- biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
- biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
- biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
- biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
- biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
- biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
- biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
- biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
- biopipen/scripts/delim/SampleInfo.R +10 -5
- biopipen/scripts/gene/GeneNameConversion.R +65 -0
- biopipen/scripts/gene/GenePromoters.R +61 -0
- biopipen/scripts/misc/Shell.sh +15 -0
- biopipen/scripts/plot/Manhattan.R +146 -0
- biopipen/scripts/plot/QQPlot.R +146 -0
- biopipen/scripts/regulatory/MotifAffinityTest.R +226 -0
- biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +126 -0
- biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +96 -0
- biopipen/scripts/regulatory/MotifScan.py +159 -0
- biopipen/scripts/regulatory/atSNP.R +33 -0
- biopipen/scripts/regulatory/motifBreakR.R +1594 -0
- biopipen/scripts/scrna/MarkersFinder.R +69 -67
- biopipen/scripts/scrna/SeuratClustering.R +71 -29
- biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
- biopipen/scripts/scrna/SeuratPreparing.R +252 -122
- biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
- biopipen/scripts/snp/MatrixEQTL.R +85 -44
- biopipen/scripts/snp/Plink2GTMat.py +133 -0
- biopipen/scripts/snp/PlinkCallRate.R +190 -0
- biopipen/scripts/snp/PlinkFilter.py +100 -0
- biopipen/scripts/snp/PlinkFreq.R +298 -0
- biopipen/scripts/snp/PlinkFromVcf.py +78 -0
- biopipen/scripts/snp/PlinkHWE.R +80 -0
- biopipen/scripts/snp/PlinkHet.R +92 -0
- biopipen/scripts/snp/PlinkIBD.R +200 -0
- biopipen/scripts/snp/PlinkUpdateName.py +124 -0
- biopipen/scripts/stats/Mediation.R +94 -0
- biopipen/scripts/stats/MetaPvalue.R +2 -1
- biopipen/scripts/stats/MetaPvalue1.R +70 -0
- biopipen/scripts/tcr/TCRClusterStats.R +12 -7
- biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
- biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
- biopipen/scripts/vcf/BcftoolsSort.py +113 -0
- biopipen/scripts/vcf/BcftoolsView.py +73 -0
- biopipen/scripts/vcf/VcfFix_utils.py +1 -1
- biopipen/scripts/vcf/bcftools_utils.py +52 -0
- biopipen/utils/gene.R +83 -37
- biopipen/utils/gene.py +108 -60
- biopipen/utils/misc.R +56 -0
- biopipen/utils/misc.py +5 -2
- biopipen/utils/reference.py +54 -10
- {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/METADATA +2 -2
- {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/RECORD +80 -51
- {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/entry_points.txt +1 -1
- biopipen/ns/bcftools.py +0 -111
- biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
- biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
- biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
- biopipen/scripts/gene/GeneNameConversion.py +0 -66
- {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
source("{{biopipen_dir}}/utils/misc.R")
|
|
2
|
+
source("{{biopipen_dir}}/utils/plot.R")
|
|
3
|
+
suppressPackageStartupMessages({
|
|
4
|
+
library(dplyr)
|
|
5
|
+
library(tidyr)
|
|
6
|
+
library(tibble)
|
|
7
|
+
})
|
|
8
|
+
|
|
9
|
+
indir <- {{in.indir | r}}
|
|
10
|
+
outdir <- {{out.outdir | r}}
|
|
11
|
+
plink <- {{envs.plink | r}}
|
|
12
|
+
indep <- {{envs.indep | r}}
|
|
13
|
+
highld <- {{envs.highld | r}}
|
|
14
|
+
devpars <- {{envs.devpars | r}}
|
|
15
|
+
pihat <- {{envs.pihat | r}}
|
|
16
|
+
samid <- {{envs.samid | r}}
|
|
17
|
+
annofile <- {{envs.anno | r}}
|
|
18
|
+
doplot <- {{envs.plot | r}}
|
|
19
|
+
seed <- {{envs.seed | r}}
|
|
20
|
+
ncores <- {{envs.ncores | r}}
|
|
21
|
+
|
|
22
|
+
bedfile <- Sys.glob(file.path(indir, '*.bed'))
|
|
23
|
+
if (length(bedfile) == 0)
|
|
24
|
+
stop("No bed files found in the input directory.")
|
|
25
|
+
if (length(bedfile) > 1) {
|
|
26
|
+
log_warn("Multiple bed files found in the input directory. Using the first one.")
|
|
27
|
+
bedfile <- bedfile[1]
|
|
28
|
+
}
|
|
29
|
+
input <- tools::file_path_sans_ext(bedfile)
|
|
30
|
+
output <- file.path(outdir, basename(input))
|
|
31
|
+
|
|
32
|
+
cmd <- c(
|
|
33
|
+
plink,
|
|
34
|
+
"--threads", ncores,
|
|
35
|
+
"--bfile", input,
|
|
36
|
+
"--indep-pairwise", indep,
|
|
37
|
+
"--keep-allele-order",
|
|
38
|
+
# One should be mindful of running this with < 50 samples
|
|
39
|
+
# "--bad-ld",
|
|
40
|
+
"--out", output
|
|
41
|
+
)
|
|
42
|
+
if (!is.null(highld) && !isFALSE(highld)) {
|
|
43
|
+
cmd <- c(cmd, "--range", "--exclude", highld)
|
|
44
|
+
}
|
|
45
|
+
run_command(cmd, fg = TRUE)
|
|
46
|
+
|
|
47
|
+
prunein <- paste0(output, '.prune.in')
|
|
48
|
+
cmd <- c(
|
|
49
|
+
plink,
|
|
50
|
+
"--threads", ncores,
|
|
51
|
+
"--bfile", input,
|
|
52
|
+
"--extract", prunein,
|
|
53
|
+
"--keep-allele-order",
|
|
54
|
+
"--genome",
|
|
55
|
+
"--out", output
|
|
56
|
+
)
|
|
57
|
+
run_command(cmd, fg = TRUE)
|
|
58
|
+
|
|
59
|
+
genome <- read.table(
|
|
60
|
+
paste0(output, '.genome'),
|
|
61
|
+
row.names = NULL,
|
|
62
|
+
header = TRUE,
|
|
63
|
+
check.names = FALSE
|
|
64
|
+
)
|
|
65
|
+
# "unmelt" it
|
|
66
|
+
# FID1 IID1 FID2 IID2 RT EZ Z0 Z1 Z2 PI_HAT PHE DST PPC RATIO
|
|
67
|
+
# s1 s1 s2 s2 UN NA 1.0000 0.0000 0.0000 0.0000 -1 0.866584 0.0000 0.9194
|
|
68
|
+
# s1 s1 s2 s2 UN NA 0.4846 0.3724 0.1431 0.3293 -1 0.913945 0.7236 2.0375
|
|
69
|
+
# s1 s1 s3 s3 UN NA 1.0000 0.0000 0.0000 0.0000 -1 0.867186 0.0000 1.0791
|
|
70
|
+
genome$SAMPLE1 <- paste(genome$FID1, genome$IID1, sep = "\t")
|
|
71
|
+
genome$SAMPLE2 <- paste(genome$FID2, genome$IID2, sep = "\t")
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# get all samples
|
|
75
|
+
samples <- unique(c(genome$SAMPLE1, genome$SAMPLE2))
|
|
76
|
+
# make paired into a distance-like matrix
|
|
77
|
+
similarity <- genome %>%
|
|
78
|
+
select(SAMPLE1, SAMPLE2, PI_HAT) %>%
|
|
79
|
+
pivot_wider(names_from = SAMPLE2, values_from = PI_HAT, values_fill = NA) %>%
|
|
80
|
+
as.data.frame() %>%
|
|
81
|
+
column_to_rownames("SAMPLE1")
|
|
82
|
+
rm(genome)
|
|
83
|
+
# get the rownames back
|
|
84
|
+
samids <- rownames(similarity)
|
|
85
|
+
# get the samples that are missing from the rows or the columns
|
|
86
|
+
missedrow <- setdiff(samples, rownames(similarity))
|
|
87
|
+
missedcol <- setdiff(samples, colnames(similarity))
|
|
88
|
+
similarity[missedrow, ] <- NA
|
|
89
|
+
similarity[, missedcol] <- NA
|
|
90
|
+
# order the matrix
|
|
91
|
+
similarity <- similarity[samples, samples, drop = FALSE]
|
|
92
|
+
# transpose the matrix to get the symmetric values
|
|
93
|
+
sim2 <- t(similarity)
|
|
94
|
+
isna <- is.na(similarity)
|
|
95
|
+
# fill the na's with their symmetric values
|
|
96
|
+
similarity[isna] <- sim2[isna]
|
|
97
|
+
rm(sim2)
|
|
98
|
+
# still missing after filling from the transpose: set them to 0
|
|
99
|
+
similarity[is.na(similarity)] <- 0
|
|
100
|
+
# get the marks (samples that fail the pihat cutoff)
|
|
101
|
+
nsams <- length(samples)
|
|
102
|
+
fails <- which(similarity > pihat)
|
|
103
|
+
marks <- data.frame(x = (fails - 1)%%nsams + 1, y = ceiling(fails/nsams))
|
|
104
|
+
diag(similarity) <- 1
|
|
105
|
+
|
|
106
|
+
# Choose the samples to remove so that every failing pair in `marks` loses at
# least one member. Candidates are visited from the most to the least
# frequently failing sample (frequencies are computed once, up front).
failflags <- rep(FALSE, nrow(marks))
freqs <- as.data.frame(table(factor(as.matrix(marks))))
freqs <- freqs[order(freqs$Freq, decreasing = TRUE), "Var1", drop = TRUE]
# `Var1` is a factor whose labels are the sample indices. Subsetting `samples`
# with the factor itself would use its integer codes instead of those labels,
# so convert the labels back to integers first.
freqs <- as.integer(as.character(freqs))
# character(0) instead of c() (NULL), so the list can still be written out
# with writeLines() when no pair fails the cutoff
ibd.fail <- character(0)
while (sum(failflags) < nrow(marks)) {
    samidx <- freqs[1]
    ibd.fail <- c(ibd.fail, samples[samidx])
    freqs <- freqs[-1]
    # flag every failing pair that involves this sample
    failflags <- failflags | marks$x == samidx | marks$y == samidx
}
|
|
119
|
+
|
|
120
|
+
ibd_fail_file <- paste0(output, '.ibd.fail')
|
|
121
|
+
writeLines(ibd.fail, ibd_fail_file)
|
|
122
|
+
cmd <- c(
|
|
123
|
+
plink,
|
|
124
|
+
"--threads", ncores,
|
|
125
|
+
"--bfile", input,
|
|
126
|
+
"--remove", ibd_fail_file,
|
|
127
|
+
"--keep-allele-order",
|
|
128
|
+
"--make-bed",
|
|
129
|
+
"--out", output
|
|
130
|
+
)
|
|
131
|
+
run_command(cmd, fg = TRUE)
|
|
132
|
+
|
|
133
|
+
if (doplot) {
|
|
134
|
+
set.seed(seed)
|
|
135
|
+
library(ComplexHeatmap)
|
|
136
|
+
fontsize8 <- gpar(fontsize = 8)
|
|
137
|
+
fontsize9 <- gpar(fontsize = 9)
|
|
138
|
+
ht_opt$heatmap_row_names_gp <- fontsize8
|
|
139
|
+
ht_opt$heatmap_column_names_gp <- fontsize8
|
|
140
|
+
ht_opt$legend_title_gp <- fontsize9
|
|
141
|
+
ht_opt$legend_labels_gp <- fontsize8
|
|
142
|
+
ht_opt$simple_anno_size <- unit(3, "mm")
|
|
143
|
+
|
|
144
|
+
samids <- sapply(samples, function(sid) {
|
|
145
|
+
fidiid <- unlist(strsplit(sid, "\t", fixed = TRUE))
|
|
146
|
+
gsub(
|
|
147
|
+
"{fid}",
|
|
148
|
+
fidiid[1],
|
|
149
|
+
gsub("{iid}", fidiid[2], samid, fixed = TRUE),
|
|
150
|
+
fixed = TRUE
|
|
151
|
+
)
|
|
152
|
+
})
|
|
153
|
+
rownames(similarity) <- samids
|
|
154
|
+
colnames(similarity) <- samids
|
|
155
|
+
|
|
156
|
+
annos <- list()
|
|
157
|
+
if (!is.null(annofile) && !isFALSE(annofile)) {
|
|
158
|
+
options(stringsAsFactors = TRUE)
|
|
159
|
+
andata <- read.table(annofile, header = TRUE, row.names = 1, sep = "\t", check.names = FALSE)
|
|
160
|
+
andata <- andata[samids, , drop = FALSE]
|
|
161
|
+
for (anname in colnames(andata)) {
|
|
162
|
+
annos[[anname]] <- as.matrix(andata[, anname])
|
|
163
|
+
}
|
|
164
|
+
annos$annotation_name_gp <- fontsize8
|
|
165
|
+
annos <- do.call(HeatmapAnnotation, annos)
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
args <- list(
|
|
169
|
+
name = "PI_HAT",
|
|
170
|
+
cell_fun = function(j, i, x, y, width, height, fill) {
|
|
171
|
+
if (similarity[i, j] > pihat && i != j)
|
|
172
|
+
grid.points(x, y, pch = 4, size = unit(.5, "char"))
|
|
173
|
+
},
|
|
174
|
+
#heatmap_legend_param = list(
|
|
175
|
+
# title_gp = fontsize9,
|
|
176
|
+
# labels_gp = fontsize8
|
|
177
|
+
#),
|
|
178
|
+
clustering_distance_rows = function(m) as.dist(1-m),
|
|
179
|
+
clustering_distance_columns = function(m) as.dist(1-m),
|
|
180
|
+
top_annotation = if (length(annos) == 0) NULL else annos
|
|
181
|
+
)
|
|
182
|
+
|
|
183
|
+
plotHeatmap(
|
|
184
|
+
similarity,
|
|
185
|
+
outfile = paste0(output, '.ibd.png'),
|
|
186
|
+
args = args,
|
|
187
|
+
draw = list(
|
|
188
|
+
annotation_legend_list = list(
|
|
189
|
+
Legend(
|
|
190
|
+
labels = paste(">", pihat),
|
|
191
|
+
title = "",
|
|
192
|
+
type = "points",
|
|
193
|
+
pch = 4,
|
|
194
|
+
title_gp = fontsize9,
|
|
195
|
+
labels_gp = fontsize8)),
|
|
196
|
+
merge_legend = TRUE
|
|
197
|
+
),
|
|
198
|
+
devpars = devpars
|
|
199
|
+
)
|
|
200
|
+
}
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from biopipen.utils.misc import run_command, dict_to_cli_args, logger
|
|
3
|
+
|
|
4
|
+
indir = {{in.indir | repr}} # pyright: ignore # noqa: #999
|
|
5
|
+
namefile = {{in.namefile | repr}} # pyright: ignore
|
|
6
|
+
outdir = {{out.outdir | repr}} # pyright: ignore
|
|
7
|
+
plink = {{envs.plink | repr}} # pyright: ignore
|
|
8
|
+
bcftools = {{envs.bcftools | repr}} # pyright: ignore
|
|
9
|
+
ncores = {{envs.ncores | repr}} # pyright: ignore
|
|
10
|
+
match_alt = {{envs.match_alt | repr}} # pyright: ignore
|
|
11
|
+
|
|
12
|
+
bedfile = list(Path(indir).glob("*.bed"))
|
|
13
|
+
if len(bedfile) == 0:
|
|
14
|
+
raise FileNotFoundError(f"No .bed file found in `in.indir`")
|
|
15
|
+
elif len(bedfile) > 1:
|
|
16
|
+
logger.warning(f"Multiple .bed files found in `in.indir`, using the first one.")
|
|
17
|
+
|
|
18
|
+
bedfile = bedfile[0]
|
|
19
|
+
input = bedfile.with_suffix("")
|
|
20
|
+
output = Path(outdir) / bedfile.stem
|
|
21
|
+
|
|
22
|
+
if namefile.endswith(".vcf") or namefile.endswith(".vcf.gz"):
|
|
23
|
+
logger.info("VCF file received, extracting names")
|
|
24
|
+
def alt_matched(bim_alt, vcf_alt, match_alt):
    """Tell whether the ALT field of a .bim record matches the one from the VCF.

    Args:
        bim_alt: ALT allele(s) from the .bim file, comma-separated.
        vcf_alt: ALT allele(s) from the VCF file, comma-separated.
        match_alt: The matching mode:
            "none" - always match;
            "exact" - the two strings are identical;
            "all" - the same set of alleles, in any order;
            "any" - at least one allele is shared;
            "first_included" - the first .bim allele is among the VCF alleles;
            "first" - the first alleles are identical.

    Returns:
        True if the alleles match under the given mode, False otherwise.

    Raises:
        ValueError: If `match_alt` is not one of the modes above.
    """
    # These two modes do not need the individual alleles
    if match_alt in ("none", "exact"):
        return match_alt == "none" or bim_alt == vcf_alt

    ours = bim_alt.split(",")
    theirs = vcf_alt.split(",")

    if match_alt == "all":
        verdict = set(ours) == set(theirs)
    elif match_alt == "any":
        verdict = len(set(ours) & set(theirs)) > 0
    elif match_alt == "first_included":
        verdict = ours[0] in theirs
    elif match_alt == "first":
        verdict = ours[0] == theirs[0]
    else:
        raise ValueError(f"Unknown match_alt: {match_alt}")

    return verdict
|
|
42
|
+
|
|
43
|
+
def readline(f):
    """Read the next line from `f` and split it into tab-separated fields.

    Args:
        f: An open text file handle.

    Returns:
        The list of fields, or None when the line is empty after stripping
        surrounding whitespace (which includes the end of the file).
    """
    stripped = f.readline().strip()
    if not stripped:
        return None
    return stripped.split("\t")
|
|
46
|
+
|
|
47
|
+
namefile_tmp = Path(outdir) / "_namefile_from_vcf.txt"
|
|
48
|
+
infofile = Path(outdir) / "_information_from_vcf_unsorted.txt"
|
|
49
|
+
sorted_infofile = Path(outdir) / "_information_from_vcf_sorted.txt"
|
|
50
|
+
sorted_bim = Path(outdir) / "_sorted_bim.txt"
|
|
51
|
+
bt_cmd = [
|
|
52
|
+
bcftools, "query",
|
|
53
|
+
"-f", "%CHROM\\t%ID\\t0\\t%POS\\t%ALT\\t%REF\\n",
|
|
54
|
+
"-o", infofile,
|
|
55
|
+
namefile,
|
|
56
|
+
]
|
|
57
|
+
## infofile
|
|
58
|
+
# 1 rs10492 0 10492 T C
|
|
59
|
+
logger.info("- Extracting information from VCF file ...")
|
|
60
|
+
run_command(bt_cmd, fg=True)
|
|
61
|
+
# sort infofile
|
|
62
|
+
logger.info("- Sorting the information from VCF file ...")
|
|
63
|
+
run_command(
|
|
64
|
+
[
|
|
65
|
+
"sort",
|
|
66
|
+
"-k1,1", "-k4,4n", "-k6,6",
|
|
67
|
+
infofile,
|
|
68
|
+
"--parallel", ncores,
|
|
69
|
+
"-o", sorted_infofile
|
|
70
|
+
],
|
|
71
|
+
env={"LC_ALL": "C"},
|
|
72
|
+
fg=True,
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
## .bim file
|
|
76
|
+
# 1 1_10492 0 10492 T C
|
|
77
|
+
# sort .bim file
|
|
78
|
+
logger.info("- Sorting the .bim file ...")
|
|
79
|
+
run_command(
|
|
80
|
+
[
|
|
81
|
+
"sort",
|
|
82
|
+
"-k1,1", "-k4,4n", "-k6,6",
|
|
83
|
+
input.with_suffix(".bim"),
|
|
84
|
+
"--parallel", ncores,
|
|
85
|
+
"-o", sorted_bim
|
|
86
|
+
],
|
|
87
|
+
env={"LC_ALL": "C"},
|
|
88
|
+
fg=True,
|
|
89
|
+
)
|
|
90
|
+
# query namefile for records in sorted bim file
|
|
91
|
+
logger.info("- Matching and generating the name file ...")
|
|
92
|
+
# Merge-join the sorted .bim records with the sorted VCF records on
# (chromosome, position, ref) and write "<bim id>\t<vcf id>" for each match.
with sorted_bim.open() as fbim, sorted_infofile.open() as finfo, namefile_tmp.open("w") as fout:  # noqa: E501
    bim = readline(fbim)
    info = readline(finfo)
    while bim and info:
        # Both files were sorted with `-k1,1 -k4,4n -k6,6`, i.e. the position
        # is ordered numerically. Comparing it as a string ("10492" < "9999")
        # would advance the wrong file and skip matching records.
        bim_key = (bim[0], int(bim[3]), bim[5])
        info_key = (info[0], int(info[3]), info[5])
        if bim_key == info_key and alt_matched(bim[4], info[4], match_alt):
            fout.write(f"{bim[1]}\t{info[1]}\n")
            bim = readline(fbim)
            info = readline(finfo)
        elif bim_key < info_key:
            # the .bim record is behind: move it forward
            bim = readline(fbim)
        else:
            # the VCF record is behind, or the keys are equal but the
            # ALT alleles do not match: try the next VCF record
            info = readline(finfo)
|
|
113
|
+
|
|
114
|
+
namefile = namefile_tmp
|
|
115
|
+
|
|
116
|
+
args = {
|
|
117
|
+
"": plink,
|
|
118
|
+
"bfile": input,
|
|
119
|
+
"out": output,
|
|
120
|
+
"make_bed": True,
|
|
121
|
+
"update_name": namefile,
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
run_command(dict_to_cli_args(args, dashify=True), fg=True)
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
source("{{biopipen_dir}}/utils/misc.R")
|
|
2
|
+
|
|
3
|
+
library(rlang)
|
|
4
|
+
library(parallel)
|
|
5
|
+
library(mediation)
|
|
6
|
+
|
|
7
|
+
infile <- {{in.infile | r}}
|
|
8
|
+
fmlfile <- {{in.fmlfile | r}}
|
|
9
|
+
outfile <- {{out.outfile | r}}
|
|
10
|
+
|
|
11
|
+
ncores <- {{envs.ncores | r}}
|
|
12
|
+
sims <- {{envs.sims | r}}
|
|
13
|
+
args <- {{envs.args | r}}
|
|
14
|
+
padj <- {{envs.padj | r}}
|
|
15
|
+
cases <- {{envs.cases | r}}
|
|
16
|
+
transpose_input <- {{envs.transpose_input | r}}
|
|
17
|
+
|
|
18
|
+
set.seed(123)
|
|
19
|
+
|
|
20
|
+
log_info("Reading input file ...")
|
|
21
|
+
indata <- read.table(infile, header = TRUE, sep = "\t", row.names = NULL, check.names = FALSE)
|
|
22
|
+
if (transpose_input) { indata <- t(indata) }
|
|
23
|
+
|
|
24
|
+
log_info("Reading formula file/cases ...")
|
|
25
|
+
if (!is.null(fmlfile)) {
|
|
26
|
+
if (!is.null(cases) && length(cases) > 0) {
|
|
27
|
+
log_warn("envs.cases ignored as in.fmlfile is provided")
|
|
28
|
+
}
|
|
29
|
+
fmldata <- read.table(fmlfile, header = TRUE, sep = "\t", row.names = NULL)
|
|
30
|
+
# Case M Y X Cov Model_M Model_Y
|
|
31
|
+
cases <- split(fmldata, fmldata$Case)
|
|
32
|
+
} else if (is.null(cases) || length(cases) == 0) {
|
|
33
|
+
stop("Either envs.cases or in.fmlfile must be provided")
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
args <- args %||% list()
|
|
37
|
+
|
|
38
|
+
# Run the mediation analysis for one case.
#
# casename: the name of the case in `cases`. The case provides `M`, `Y`, `X`,
#   `Cov` (optional; a vector, or a single comma-separated string) and the
#   model function names `Model_M` and `Model_Y`.
#
# Returns a one-row data frame with the estimates taken from the `mediate()`
# result, or NULL when `d1.p` or `n1` of that result is NA.
medanalysis = function(casename) {
    case <- cases[[casename]]
    log_info("- Case:", casename)
    M <- case$M
    Y <- case$Y
    X <- case$X
    covs <- case$Cov
    modelm <- match.fun(case$Model_M)
    modely <- match.fun(case$Model_Y)
    fmlm <- as.formula(sprintf("%s ~ %s", bQuote(M), bQuote(X)))
    fmly <- as.formula(sprintf("%s ~ %s + %s", bQuote(Y), bQuote(M), bQuote(X)))
    if (!is.null(covs) && length(covs) == 1) {
        covs <- trimws(strsplit(as.character(covs), ",")[[1]])
    }
    if (!is.null(covs)) {
        # An empty or NA `Cov` field means "no covariates"; without this it
        # would produce an incomplete formula ("~ . + ")
        covs <- as.character(unlist(covs))
        covs <- covs[!is.na(covs) & nzchar(covs)]
        if (length(covs) == 0) { covs <- NULL }
    }
    if (!is.null(covs)) {
        cov_fml <- as.formula(sprintf("~ . + %s", paste(bQuote(covs), collapse = " + ")))
        fmlm <- update.formula(fmlm, cov_fml)
        fmly <- update.formula(fmly, cov_fml)
    }

    # `args` is modified on a function-local copy, so each case starts from
    # the arguments given in envs.args
    args$sims <- sims
    args$model.m <- modelm(fmlm, data = indata)
    args$model.y <- modely(fmly, data = indata)
    args$treat <- X
    args$mediator <- M
    args$outcome <- Y
    if (!is.null(covs)) {
        args$covariates <- indata[, covs, drop = FALSE]
    }
    med <- do_call(mediate, args)
    if (is.na(med$d1.p) || is.na(med$n1)) {
        NULL
    } else {
        data.frame(
            Case = casename,
            M = M,
            X = X,
            Y = Y,
            ACME = med$d1,
            ACME95CI1 = med$d1.ci[1],
            ACME95CI2 = med$d1.ci[2],
            TotalEffect = med$tau.coef,
            ADE = med$z1,
            PropMediated = med$n1,
            Pval = med$d1.p
        )
    }
}
|
|
87
|
+
|
|
88
|
+
out <- do_call(rbind, mclapply(names(cases), medanalysis, mc.cores = ncores))
|
|
89
|
+
|
|
90
|
+
if (padj != "none") {
|
|
91
|
+
out$Padj <- p.adjust(out$Pval, method = padj)
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
write.table(out, file = outfile, sep = "\t", quote = FALSE, row.names = FALSE)
|
|
@@ -11,6 +11,7 @@ id_exprs <- {{envs.id_exprs | r}}
|
|
|
11
11
|
pval_cols <- {{envs.pval_cols | r}}
|
|
12
12
|
method <- {{envs.method | r}}
|
|
13
13
|
na <- {{envs.na | r}}
|
|
14
|
+
keep_single <- {{envs.keep_single | r}}
|
|
14
15
|
padj <- {{envs.padj | r}}
|
|
15
16
|
|
|
16
17
|
if (method == "fisher") { method = "sumlog" }
|
|
@@ -102,7 +103,7 @@ if (length(infiles) == 1 && padj == "none") {
|
|
|
102
103
|
if (length(ps) == 0) {
|
|
103
104
|
metaps <- c(metaps, NA)
|
|
104
105
|
ns <- c(ns, NA)
|
|
105
|
-
} else if (length(ps) == 1) {
|
|
106
|
+
} else if (length(ps) == 1 && keep_single) {
|
|
106
107
|
metaps <- c(metaps, ps)
|
|
107
108
|
ns <- c(ns, 1)
|
|
108
109
|
} else {
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
source("{{biopipen_dir}}/utils/misc.R")
|
|
2
|
+
|
|
3
|
+
library(metap)
|
|
4
|
+
library(rlang)
|
|
5
|
+
library(dplyr)
|
|
6
|
+
|
|
7
|
+
infile <- {{in.infile | r}}
|
|
8
|
+
outfile <- {{out.outfile | r}}
|
|
9
|
+
id_cols <- {{envs.id_cols | r}}
|
|
10
|
+
pval_col <- {{envs.pval_col | r}}
|
|
11
|
+
method <- {{envs.method | r}}
|
|
12
|
+
na <- {{envs.na | r}}
|
|
13
|
+
keep_single <- {{envs.keep_single | r}}
|
|
14
|
+
padj <- {{envs.padj | r}}
|
|
15
|
+
|
|
16
|
+
if (method == "fisher") { method = "sumlog" }
|
|
17
|
+
|
|
18
|
+
# Check pval_col
|
|
19
|
+
if (is.null(pval_col)) { stop("Must provide envs.pval_col") }
|
|
20
|
+
|
|
21
|
+
# Check id_cols
|
|
22
|
+
if (is.null(id_cols)) { stop("Must provide envs.id_cols") }
|
|
23
|
+
if (length(id_cols) == 1) {
|
|
24
|
+
id_cols <- trimws(strsplit(id_cols, ",")[[1]])
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
log_info("Reading input and performing meta-analysis ...")
|
|
28
|
+
outdata <- read.table(
|
|
29
|
+
infile, header = TRUE, sep = "\t", row.names = NULL, check.names = FALSE
|
|
30
|
+
) %>%
|
|
31
|
+
group_by(!!!syms(id_cols)) %>%
|
|
32
|
+
summarise(
|
|
33
|
+
N = n(),
|
|
34
|
+
.pvals = list(!!sym(pval_col)),
|
|
35
|
+
.groups = "drop"
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
metaps <- c()
|
|
39
|
+
ns <- c()
|
|
40
|
+
for (ps in outdata$.pvals) {
|
|
41
|
+
if (na == -1) {
|
|
42
|
+
ps <- ps[!is.na(ps)]
|
|
43
|
+
} else {
|
|
44
|
+
ps[is.na(ps)] <- na
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
if (length(ps) == 0) {
|
|
48
|
+
metaps <- c(metaps, NA)
|
|
49
|
+
ns <- c(ns, NA)
|
|
50
|
+
} else if (length(ps) == 1 && keep_single) {
|
|
51
|
+
metaps <- c(metaps, ps)
|
|
52
|
+
ns <- c(ns, 1)
|
|
53
|
+
} else {
|
|
54
|
+
metaps <- c(metaps, do.call(method, list(ps))$p)
|
|
55
|
+
ns <- c(ns, length(ps))
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
outdata$MetaPval <- metaps
|
|
59
|
+
outdata$N <- ns
|
|
60
|
+
outdata$.pvals <- NULL
|
|
61
|
+
outdata <- outdata %>% arrange(MetaPval)
|
|
62
|
+
|
|
63
|
+
if (padj != "none") {
|
|
64
|
+
log_info("Calculating adjusted p-values ...")
|
|
65
|
+
outdata$MetaPadj <- p.adjust(outdata$MetaPval, method = padj)
|
|
66
|
+
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
log_info("Writing output ...")
|
|
70
|
+
write.table(outdata, outfile, quote = FALSE, sep = "\t", row.names = FALSE)
|
|
@@ -130,13 +130,6 @@ shared_clusters = function(name) {
|
|
|
130
130
|
row.names=TRUE, col.names=TRUE, quote=FALSE, sep="\t"
|
|
131
131
|
)
|
|
132
132
|
|
|
133
|
-
if (is.null(case$heatmap_meta) || length(case$heatmap_meta) == 0) {
|
|
134
|
-
anno = NULL
|
|
135
|
-
} else {
|
|
136
|
-
anno = as.list(immdata$meta[, case$heatmap_meta, drop=FALSE])
|
|
137
|
-
anno = do_call(ComplexHeatmap::HeatmapAnnotation, anno)
|
|
138
|
-
}
|
|
139
|
-
|
|
140
133
|
if (!is.null(case$sample_order) && length(case$sample_order) > 0) {
|
|
141
134
|
if (length(case$sample_order) == 1) {
|
|
142
135
|
case$sample_order = trimws(strsplit(case$sample_order, ",")[[1]])
|
|
@@ -148,6 +141,18 @@ shared_clusters = function(name) {
|
|
|
148
141
|
plotdata = plotdata[, case$sample_order, drop=FALSE]
|
|
149
142
|
}
|
|
150
143
|
|
|
144
|
+
if (is.null(case$heatmap_meta) || length(case$heatmap_meta) == 0) {
|
|
145
|
+
anno = NULL
|
|
146
|
+
} else {
|
|
147
|
+
anno = as.list(
|
|
148
|
+
immdata$meta[
|
|
149
|
+
match(colnames(plotdata), immdata$meta$Sample),
|
|
150
|
+
case$heatmap_meta,
|
|
151
|
+
drop=FALSE
|
|
152
|
+
])
|
|
153
|
+
anno = do_call(ComplexHeatmap::HeatmapAnnotation, anno)
|
|
154
|
+
}
|
|
155
|
+
|
|
151
156
|
cluster_rows = case$cluster_rows && nrow(plotdata) > 2
|
|
152
157
|
col_samples = colnames(plotdata)
|
|
153
158
|
if (!cluster_rows) {
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
from os import path
|
|
2
|
+
from contextlib import suppress
|
|
3
|
+
from pathlib import PosixPath # noqa: F401
|
|
4
|
+
|
|
5
|
+
from biopipen.utils.reference import tabix_index
|
|
6
|
+
from biopipen.utils.misc import logger
|
|
7
|
+
from biopipen.scripts.vcf.bcftools_utils import run_bcftools
|
|
8
|
+
|
|
9
|
+
infile = {{in.infile | repr}} # pyright: ignore # noqa: E999
|
|
10
|
+
annfile = {{in.annfile | repr}} # pyright: ignore
|
|
11
|
+
outfile = {{out.outfile | repr}} # pyright: ignore
|
|
12
|
+
joboutdir = {{job.outdir | repr}} # pyright: ignore
|
|
13
|
+
envs = {{envs | dict | repr}} # pyright: ignore
|
|
14
|
+
|
|
15
|
+
bcftools = envs.pop("bcftools")
|
|
16
|
+
tabix = envs.pop("tabix")
|
|
17
|
+
ncores = envs.pop("ncores")
|
|
18
|
+
columns = envs.pop("columns")
|
|
19
|
+
remove = envs.pop("remove")
|
|
20
|
+
header = envs.pop("header")
|
|
21
|
+
gz = envs.pop("gz")
|
|
22
|
+
index = envs.pop("index")
|
|
23
|
+
|
|
24
|
+
if isinstance(columns, list):
|
|
25
|
+
columns = ",".join(columns)
|
|
26
|
+
|
|
27
|
+
# A raw `c` key is discarded in favour of envs.columns.
if "c" in envs:
    # "\\[" yields the same text as the former "\[" (a backslash followed by
    # "["), without relying on an invalid escape sequence
    logger.warning("Ignoring envs\\[c], use envs\\[columns] instead.")
    del envs["c"]
|
|
30
|
+
|
|
31
|
+
if isinstance(remove, list):
|
|
32
|
+
remove = ",".join(remove)
|
|
33
|
+
|
|
34
|
+
# A raw `x` key is discarded in favour of envs.remove.
if "x" in envs:
    # "\\[" yields the same text as the former "\[" (a backslash followed by
    # "["), without relying on an invalid escape sequence
    logger.warning("Ignoring envs\\[x], use envs\\[remove] instead.")
    del envs["x"]
|
|
37
|
+
|
|
38
|
+
envs_has_annfile = "a" in envs or "annotations" in envs
|
|
39
|
+
headerfile = path.join(joboutdir, "header.txt")
|
|
40
|
+
if header:
|
|
41
|
+
with open(headerfile, "w") as fh:
|
|
42
|
+
fh.writelines(header)
|
|
43
|
+
|
|
44
|
+
# Resolve the annotation file: in.annfile takes precedence over
# envs.a / envs.annotations, which are removed from envs in either case.
if annfile and envs_has_annfile:
    # "\\[" yields the same text as the former "\[" (a backslash followed by
    # "["), without relying on an invalid escape sequence
    logger.warning(
        "Ignoring envs\\[a/annotations] because in.annfile is provided."
    )
    envs.pop("a", None)
    envs.pop("annotations", None)
elif not annfile and envs_has_annfile:
    # Pop both spellings before choosing one: `pop(...) or pop(...)` would
    # short-circuit and leave `a` in envs whenever `annotations` is set
    ann_long = envs.pop("annotations", None)
    ann_short = envs.pop("a", None)
    annfile = ann_long or ann_short
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
if index and not gz:
|
|
57
|
+
logger.warning("Forcing envs.gz to True because envs.index is True.")
|
|
58
|
+
gz = True
|
|
59
|
+
|
|
60
|
+
envs[""] = [bcftools, "annotate"]
|
|
61
|
+
envs["o"] = outfile
|
|
62
|
+
envs["threads"] = ncores
|
|
63
|
+
|
|
64
|
+
if "O" not in envs and "output-type" not in envs and "output_type" not in envs:
|
|
65
|
+
envs["O"] = "z" if gz else "v"
|
|
66
|
+
|
|
67
|
+
if columns:
|
|
68
|
+
envs["columns"] = columns
|
|
69
|
+
if not annfile:
|
|
70
|
+
raise ValueError(
|
|
71
|
+
"envs.columns specified but no in.annfile/envs.annfile provided."
|
|
72
|
+
)
|
|
73
|
+
envs["_"] = tabix_index(infile, "vcf", tabix=tabix)
|
|
74
|
+
|
|
75
|
+
if remove:
|
|
76
|
+
envs["remove"] = remove
|
|
77
|
+
# no need to index it
|
|
78
|
+
envs["_"] = infile
|
|
79
|
+
|
|
80
|
+
if "columns" not in envs and "remove" not in envs:
|
|
81
|
+
logger.warning(
|
|
82
|
+
"No columns/remove specified, no columns will be carried over or removed."
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
if annfile:
|
|
86
|
+
envs["annotations"] = tabix_index(annfile, "vcf", tabix=tabix)
|
|
87
|
+
|
|
88
|
+
if header:
|
|
89
|
+
envs["header_lines"] = headerfile
|
|
90
|
+
|
|
91
|
+
run_bcftools(envs, bcftools=bcftools, index=index, tabix=tabix)
|