biopipen: biopipen-0.28.1-py3-none-any.whl → biopipen-0.29.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (85)
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +8 -0
  3. biopipen/ns/bam.py +0 -2
  4. biopipen/ns/bed.py +35 -0
  5. biopipen/ns/cellranger_pipeline.py +5 -5
  6. biopipen/ns/cnv.py +18 -2
  7. biopipen/ns/cnvkit_pipeline.py +16 -11
  8. biopipen/ns/gene.py +68 -23
  9. biopipen/ns/misc.py +2 -15
  10. biopipen/ns/plot.py +204 -0
  11. biopipen/ns/regulatory.py +214 -0
  12. biopipen/ns/scrna.py +31 -5
  13. biopipen/ns/snp.py +516 -8
  14. biopipen/ns/stats.py +167 -3
  15. biopipen/ns/vcf.py +196 -0
  16. biopipen/reports/snp/PlinkCallRate.svelte +24 -0
  17. biopipen/reports/snp/PlinkFreq.svelte +18 -0
  18. biopipen/reports/snp/PlinkHWE.svelte +18 -0
  19. biopipen/reports/snp/PlinkHet.svelte +18 -0
  20. biopipen/reports/snp/PlinkIBD.svelte +18 -0
  21. biopipen/scripts/bam/CNVpytor.py +144 -46
  22. biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
  23. biopipen/scripts/bed/BedtoolsMerge.py +1 -1
  24. biopipen/scripts/cnv/AneuploidyScore.R +30 -7
  25. biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
  26. biopipen/scripts/cnv/TMADScore.R +21 -5
  27. biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
  28. biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
  29. biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
  30. biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
  31. biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
  32. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
  33. biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
  34. biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
  35. biopipen/scripts/delim/SampleInfo.R +10 -5
  36. biopipen/scripts/gene/GeneNameConversion.R +65 -0
  37. biopipen/scripts/gene/GenePromoters.R +61 -0
  38. biopipen/scripts/misc/Shell.sh +15 -0
  39. biopipen/scripts/plot/Manhattan.R +146 -0
  40. biopipen/scripts/plot/QQPlot.R +146 -0
  41. biopipen/scripts/regulatory/MotifAffinityTest.R +226 -0
  42. biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +126 -0
  43. biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +96 -0
  44. biopipen/scripts/regulatory/MotifScan.py +159 -0
  45. biopipen/scripts/regulatory/atSNP.R +33 -0
  46. biopipen/scripts/regulatory/motifBreakR.R +1594 -0
  47. biopipen/scripts/scrna/MarkersFinder.R +69 -67
  48. biopipen/scripts/scrna/SeuratClustering.R +71 -29
  49. biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
  50. biopipen/scripts/scrna/SeuratPreparing.R +252 -122
  51. biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
  52. biopipen/scripts/snp/MatrixEQTL.R +85 -44
  53. biopipen/scripts/snp/Plink2GTMat.py +133 -0
  54. biopipen/scripts/snp/PlinkCallRate.R +190 -0
  55. biopipen/scripts/snp/PlinkFilter.py +100 -0
  56. biopipen/scripts/snp/PlinkFreq.R +298 -0
  57. biopipen/scripts/snp/PlinkFromVcf.py +78 -0
  58. biopipen/scripts/snp/PlinkHWE.R +80 -0
  59. biopipen/scripts/snp/PlinkHet.R +92 -0
  60. biopipen/scripts/snp/PlinkIBD.R +200 -0
  61. biopipen/scripts/snp/PlinkUpdateName.py +124 -0
  62. biopipen/scripts/stats/Mediation.R +94 -0
  63. biopipen/scripts/stats/MetaPvalue.R +2 -1
  64. biopipen/scripts/stats/MetaPvalue1.R +70 -0
  65. biopipen/scripts/tcr/TCRClusterStats.R +12 -7
  66. biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
  67. biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
  68. biopipen/scripts/vcf/BcftoolsSort.py +113 -0
  69. biopipen/scripts/vcf/BcftoolsView.py +73 -0
  70. biopipen/scripts/vcf/VcfFix_utils.py +1 -1
  71. biopipen/scripts/vcf/bcftools_utils.py +52 -0
  72. biopipen/utils/gene.R +83 -37
  73. biopipen/utils/gene.py +108 -60
  74. biopipen/utils/misc.R +56 -0
  75. biopipen/utils/misc.py +5 -2
  76. biopipen/utils/reference.py +54 -10
  77. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/METADATA +2 -2
  78. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/RECORD +80 -51
  79. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/entry_points.txt +1 -1
  80. biopipen/ns/bcftools.py +0 -111
  81. biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
  82. biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
  83. biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
  84. biopipen/scripts/gene/GeneNameConversion.py +0 -66
  85. {biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/WHEEL +0 -0
@@ -0,0 +1,200 @@
1
+ source("{{biopipen_dir}}/utils/misc.R")
2
+ source("{{biopipen_dir}}/utils/plot.R")
3
+ suppressPackageStartupMessages({
4
+ library(dplyr)
5
+ library(tidyr)
6
+ library(tibble)
7
+ })
8
+
9
+ indir <- {{in.indir | r}}
10
+ outdir <- {{out.outdir | r}}
11
+ plink <- {{envs.plink | r}}
12
+ indep <- {{envs.indep | r}}
13
+ highld <- {{envs.highld | r}}
14
+ devpars <- {{envs.devpars | r}}
15
+ pihat <- {{envs.pihat | r}}
16
+ samid <- {{envs.samid | r}}
17
+ annofile <- {{envs.anno | r}}
18
+ doplot <- {{envs.plot | r}}
19
+ seed <- {{envs.seed | r}}
20
+ ncores <- {{envs.ncores | r}}
21
+
22
# Locate the PLINK binary fileset inside the input directory. Only the .bed
# file is globbed; its extension-less path is what is handed to --bfile.
bedfile <- Sys.glob(file.path(indir, "*.bed"))
n_bedfiles <- length(bedfile)
if (n_bedfiles == 0) {
    stop("No bed files found in the input directory.")
} else if (n_bedfiles > 1) {
    log_warn("Multiple bed files found in the input directory. Using the first one.")
    bedfile <- bedfile[1]
}
input <- tools::file_path_sans_ext(bedfile)
output <- file.path(outdir, basename(input))

# Every plink call below shares the same leading arguments; the remaining
# arguments are appended in the order given.
plink_cmd <- function(...) {
    c(plink, "--threads", ncores, "--bfile", input, ...)
}

# Step 1: LD pruning (writes <output>.prune.in / <output>.prune.out).
# One should be mindful of running this with < 50 samples
# (plink's "--bad-ld" flag is deliberately not passed).
cmd <- plink_cmd(
    "--indep-pairwise", indep,
    "--keep-allele-order",
    "--out", output
)
exclude_highld <- !(is.null(highld) || isFALSE(highld))
if (exclude_highld) {
    # Optionally exclude the regions listed in the envs.highld range file
    cmd <- c(cmd, "--range", "--exclude", highld)
}
run_command(cmd, fg = TRUE)

# Step 2: pairwise IBD estimation (writes <output>.genome), restricted to
# the variants kept by the pruning step.
prunein <- paste0(output, ".prune.in")
cmd <- plink_cmd(
    "--extract", prunein,
    "--keep-allele-order",
    "--genome",
    "--out", output
)
run_command(cmd, fg = TRUE)
58
+
59
# Read the pairwise table written by `plink --genome`. Layout (one row per pair):
# FID1 IID1 FID2 IID2 RT EZ Z0     Z1     Z2     PI_HAT PHE DST      PPC    RATIO
# s1   s1   s2   s2   UN NA 1.0000 0.0000 0.0000 0.0000 -1  0.866584 0.0000 0.9194
genome <- read.table(
    paste0(output, ".genome"),
    header = TRUE,
    row.names = NULL,
    check.names = FALSE
)
# A sample is keyed by "FID<TAB>IID", which is also the line format written
# to the .ibd.fail file later on.
genome[["SAMPLE1"]] <- paste(genome$FID1, genome$IID1, sep = "\t")
genome[["SAMPLE2"]] <- paste(genome$FID2, genome$IID2, sep = "\t")

# All samples that appear on either side of a pair
samples <- unique(c(genome$SAMPLE1, genome$SAMPLE2))

# "Unmelt" the long pair list into a SAMPLE1 x SAMPLE2 table of PI_HAT values
pair_values <- select(genome, SAMPLE1, SAMPLE2, PI_HAT)
wide_values <- pivot_wider(
    pair_values,
    names_from = SAMPLE2,
    values_from = PI_HAT,
    values_fill = NA
)
similarity <- column_to_rownames(as.data.frame(wide_values), "SAMPLE1")
rm(genome, pair_values, wide_values)

# Keep the row names of the pivoted table
samids <- rownames(similarity)

# Samples that only ever occur as SAMPLE2 have no row, and samples that only
# occur as SAMPLE1 have no column: add them, filled with NA.
missedrow <- setdiff(samples, rownames(similarity))
missedcol <- setdiff(samples, colnames(similarity))
similarity[missedrow, ] <- NA
similarity[, missedcol] <- NA

# Bring rows and columns into the same sample order
similarity <- similarity[samples, samples, drop = FALSE]

# Each pair is listed once, so mirror the table: every missing cell takes the
# value of its transposed counterpart.
mirrored <- t(similarity)
unfilled <- is.na(similarity)
similarity[unfilled] <- mirrored[unfilled]
rm(mirrored)

# Cells with no value in either direction are set to 0
similarity[is.na(similarity)] <- 0

# Mark the cells whose PI_HAT exceeds the cutoff. `which()` returns
# column-major linear indices, converted here to (row, column) positions.
nsams <- length(samples)
fails <- which(similarity > pihat)
row_idx <- (fails - 1) %% nsams + 1
col_idx <- ceiling(fails / nsams)
marks <- data.frame(x = row_idx, y = col_idx)

# Set the diagonal only after marking, so self-pairs are never marked
diag(similarity) <- 1
105
+
106
# Greedily choose the samples to drop: walk through the samples in order of
# how many over-cutoff cells they take part in (most first) and keep adding
# them to `ibd.fail` until every marked pair contains at least one dropped
# sample.
failflags <- rep(FALSE, nrow(marks))
freqs <- as.data.frame(table(factor(as.matrix(marks))))
freqs <- freqs[order(freqs$Freq, decreasing = TRUE), "Var1", drop = TRUE]
# `Var1` is a factor whose *labels* are the sample indices. Indexing `samples`
# with the factor itself would use its internal codes (1..nlevels) and pick
# the wrong samples whenever some sample is not part of any marked pair, so
# convert the labels back to integer indices first.
freqs <- as.integer(as.character(freqs))
# character(0) rather than NULL: writeLines() refuses non-character input,
# which would abort the job when no pair exceeds the cutoff.
ibd.fail <- character(0)
while (sum(failflags) < nrow(marks)) {
    samidx <- freqs[1]
    ibd.fail <- c(ibd.fail, samples[samidx])
    freqs <- freqs[-1]
    # A marked pair is resolved as soon as either of its members is dropped
    failflags <- failflags | marks$x == samidx | marks$y == samidx
}

# One "FID<TAB>IID" line per dropped sample, handed to plink --remove
ibd_fail_file <- paste0(output, ".ibd.fail")
writeLines(ibd.fail, ibd_fail_file)
cmd <- c(
    plink,
    "--threads", ncores,
    "--bfile", input,
    "--remove", ibd_fail_file,
    "--keep-allele-order",
    "--make-bed",
    "--out", output
)
run_command(cmd, fg = TRUE)
132
+
133
# Optional heatmap of the PI_HAT matrix; cells above the cutoff are marked
# with a cross.
if (doplot) {
    set.seed(seed)
    library(ComplexHeatmap)

    # Text styles shared by the heatmap, its legends and the annotations
    gp_small <- gpar(fontsize = 8)
    gp_title <- gpar(fontsize = 9)
    ht_opt$heatmap_row_names_gp <- gp_small
    ht_opt$heatmap_column_names_gp <- gp_small
    ht_opt$legend_title_gp <- gp_title
    ht_opt$legend_labels_gp <- gp_small
    ht_opt$simple_anno_size <- unit(3, "mm")

    # Turn a "FID<TAB>IID" key into a display label by filling the {fid} and
    # {iid} placeholders of the envs.samid template.
    format_sample_id <- function(sid) {
        fidiid <- unlist(strsplit(sid, "\t", fixed = TRUE))
        with_iid <- gsub("{iid}", fidiid[2], samid, fixed = TRUE)
        gsub("{fid}", fidiid[1], with_iid, fixed = TRUE)
    }
    samids <- sapply(samples, format_sample_id)
    rownames(similarity) <- samids
    colnames(similarity) <- samids

    # Optional column annotations: a tab-delimited file whose first column
    # holds the sample labels; each remaining column becomes one annotation.
    annos <- list()
    has_annofile <- !(is.null(annofile) || isFALSE(annofile))
    if (has_annofile) {
        options(stringsAsFactors = TRUE)
        andata <- read.table(
            annofile,
            header = TRUE,
            row.names = 1,
            sep = "\t",
            check.names = FALSE
        )
        # Reorder the annotation rows to follow the matrix order
        andata <- andata[samids, , drop = FALSE]
        for (anno_name in colnames(andata)) {
            annos[[anno_name]] <- as.matrix(andata[, anno_name])
        }
        annos$annotation_name_gp <- gp_small
        annos <- do.call(HeatmapAnnotation, annos)
    }

    # Draw a cross on every off-diagonal cell above the cutoff
    mark_related <- function(j, i, x, y, width, height, fill) {
        if (similarity[i, j] > pihat && i != j)
            grid.points(x, y, pch = 4, size = unit(.5, "char"))
    }
    # Cluster on 1 - PI_HAT, i.e. the similarity turned into a distance
    pihat_distance <- function(m) as.dist(1 - m)

    args <- list(
        name = "PI_HAT",
        cell_fun = mark_related,
        clustering_distance_rows = pihat_distance,
        clustering_distance_columns = pihat_distance,
        top_annotation = if (length(annos) == 0) NULL else annos
    )

    # Extra legend entry explaining the cross marker
    cutoff_legend <- Legend(
        labels = paste(">", pihat),
        title = "",
        type = "points",
        pch = 4,
        title_gp = gp_title,
        labels_gp = gp_small
    )

    plotHeatmap(
        similarity,
        outfile = paste0(output, ".ibd.png"),
        args = args,
        draw = list(
            annotation_legend_list = list(cutoff_legend),
            merge_legend = TRUE
        ),
        devpars = devpars
    )
}
@@ -0,0 +1,124 @@
1
+ from pathlib import Path
2
+ from biopipen.utils.misc import run_command, dict_to_cli_args, logger
3
+
4
+ indir = {{in.indir | repr}} # pyright: ignore # noqa: #999
5
+ namefile = {{in.namefile | repr}} # pyright: ignore
6
+ outdir = {{out.outdir | repr}} # pyright: ignore
7
+ plink = {{envs.plink | repr}} # pyright: ignore
8
+ bcftools = {{envs.bcftools | repr}} # pyright: ignore
9
+ ncores = {{envs.ncores | repr}} # pyright: ignore
10
+ match_alt = {{envs.match_alt | repr}} # pyright: ignore
11
+
12
# Locate the PLINK binary fileset: glob for the .bed file and derive the
# extension-less --bfile prefix and the output prefix from it.
bedfiles = list(Path(indir).glob("*.bed"))
if not bedfiles:
    raise FileNotFoundError("No .bed file found in `in.indir`")
if len(bedfiles) > 1:
    logger.warning(
        "Multiple .bed files found in `in.indir`, using the first one."
    )

bedfile = bedfiles[0]
input = bedfile.with_suffix("")
output = Path(outdir) / bedfile.stem
21
+
22
+ if namefile.endswith(".vcf") or namefile.endswith(".vcf.gz"):
23
+ logger.info("VCF file received, extracting names")
24
def alt_matched(bim_alt, vcf_alt, match_alt):
    """Tell whether the ALT field of a .bim record matches that of a VCF record.

    Args:
        bim_alt: ALT allele(s) from the .bim file, comma-separated.
        vcf_alt: ALT allele(s) from the VCF file, comma-separated.
        match_alt: Matching mode:
            "none" - always match;
            "exact" - the two raw strings are equal;
            "all" - same set of alleles, in any order;
            "any" - at least one allele in common;
            "first_included" - first .bim allele is among the VCF alleles;
            "first" - first alleles are equal.

    Returns:
        True if the ALT alleles match under the given mode, else False.

    Raises:
        ValueError: If `match_alt` is not one of the modes above.
    """
    if match_alt == "none":
        return True
    if match_alt == "exact":
        return bim_alt == vcf_alt

    bim_alleles = bim_alt.split(",")
    vcf_alleles = vcf_alt.split(",")
    # Lazily evaluated so only the selected comparison runs
    comparisons = {
        "all": lambda: set(bim_alleles) == set(vcf_alleles),
        "any": lambda: not set(bim_alleles).isdisjoint(vcf_alleles),
        "first_included": lambda: bim_alleles[0] in vcf_alleles,
        "first": lambda: bim_alleles[0] == vcf_alleles[0],
    }
    compare = (
        comparisons.get(match_alt) if isinstance(match_alt, str) else None
    )
    if compare is None:
        raise ValueError(f"Unknown match_alt: {match_alt}")
    return compare()
42
+
43
def readline(f):
    """Read the next line from `f` and split it on tabs.

    Args:
        f: An open text file object.

    Returns:
        The list of tab-separated fields, or None when the line is empty
        after stripping whitespace (which includes end of file).
    """
    stripped = f.readline().strip()
    if not stripped:
        return None
    return stripped.split("\t")
46
+
47
# Intermediate files, all written to the output directory
namefile_tmp = Path(outdir) / "_namefile_from_vcf.txt"
infofile = Path(outdir) / "_information_from_vcf_unsorted.txt"
sorted_infofile = Path(outdir) / "_information_from_vcf_sorted.txt"
sorted_bim = Path(outdir) / "_sorted_bim.txt"


def _sort_by_chrom_pos_ref(src, dest):
    """Sort a .bim-like file by column 1, then column 4 (numeric), then column 6."""
    run_command(
        [
            "sort",
            "-k1,1", "-k4,4n", "-k6,6",
            src,
            "--parallel", ncores,
            "-o", dest,
        ],
        env={"LC_ALL": "C"},
        fg=True,
    )


# Dump the VCF records in the column layout of a .bim file, e.g.
# 1 rs10492 0 10492 T C
bt_cmd = [
    bcftools, "query",
    "-f", "%CHROM\\t%ID\\t0\\t%POS\\t%ALT\\t%REF\\n",
    "-o", infofile,
    namefile,
]
logger.info("- Extracting information from VCF file ...")
run_command(bt_cmd, fg=True)

logger.info("- Sorting the information from VCF file ...")
_sort_by_chrom_pos_ref(infofile, sorted_infofile)

# The .bim file has the same layout, but carries the names to be replaced:
# 1 1_10492 0 10492 T C
logger.info("- Sorting the .bim file ...")
_sort_by_chrom_pos_ref(input.with_suffix(".bim"), sorted_bim)
90
+ # query namefile for records in sorted bim file
91
+ logger.info("- Matching and generating the name file ...")
92
# Merge-join the two sorted files on (chromosome, position, REF) and write
# one "<old name>\t<new name>" line for every .bim record whose ALT also
# matches the VCF record under `match_alt`.
with sorted_bim.open() as fbim, sorted_infofile.open() as finfo, namefile_tmp.open("w") as fout:  # noqa: E501
    bim = readline(fbim)
    info = readline(finfo)
    while bim and info:
        # Both files were sorted with `-k1,1 -k4,4n -k6,6`, i.e. the position
        # is ordered numerically. It must be compared as an integer here as
        # well: compared as strings, "10492" sorts before "9999" and the
        # wrong cursor would be advanced, silently dropping matches.
        bim_key = (bim[0], int(bim[3]), bim[5])
        info_key = (info[0], int(info[3]), info[5])
        if bim_key == info_key and alt_matched(bim[4], info[4], match_alt):
            fout.write(f"{bim[1]}\t{info[1]}\n")
            bim = readline(fbim)
            info = readline(finfo)
        elif bim_key < info_key:
            # The .bim record is behind: no VCF record can match it anymore
            bim = readline(fbim)
        else:
            # The VCF record is behind, or same key but ALT does not match
            info = readline(finfo)
113
+
114
+ namefile = namefile_tmp
115
+
116
# Rewrite the fileset with the variant names taken from `namefile`.
# The "" key holds the executable itself.
args = {
    "": plink,
    "bfile": input,
    "out": output,
    "make_bed": True,
    "update_name": namefile,
}
plink_cli = dict_to_cli_args(args, dashify=True)

run_command(plink_cli, fg=True)
@@ -0,0 +1,94 @@
1
+ source("{{biopipen_dir}}/utils/misc.R")
2
+
3
+ library(rlang)
4
+ library(parallel)
5
+ library(mediation)
6
+
7
+ infile <- {{in.infile | r}}
8
+ fmlfile <- {{in.fmlfile | r}}
9
+ outfile <- {{out.outfile | r}}
10
+
11
+ ncores <- {{envs.ncores | r}}
12
+ sims <- {{envs.sims | r}}
13
+ args <- {{envs.args | r}}
14
+ padj <- {{envs.padj | r}}
15
+ cases <- {{envs.cases | r}}
16
+ transpose_input <- {{envs.transpose_input | r}}
17
+
18
+ set.seed(123)
19
+
20
log_info("Reading input file ...")
indata <- read.table(
    infile,
    header = TRUE,
    sep = "\t",
    row.names = NULL,
    check.names = FALSE
)
# NOTE(review): t() on a data frame returns a matrix, while `indata` is later
# passed as `data =` to the model functions - confirm that the transposed
# input is usable there.
if (transpose_input) {
    indata <- t(indata)
}

log_info("Reading formula file/cases ...")
has_cases <- !is.null(cases) && length(cases) > 0
if (is.null(fmlfile)) {
    if (!has_cases) {
        stop("Either envs.cases or in.fmlfile must be provided")
    }
} else {
    # A formula file takes precedence over envs.cases
    if (has_cases) {
        log_warn("envs.cases ignored as in.fmlfile is provided")
    }
    # Columns: Case M Y X Cov Model_M Model_Y
    fmldata <- read.table(fmlfile, header = TRUE, sep = "\t", row.names = NULL)
    cases <- split(fmldata, fmldata$Case)
}

# Default to an empty argument list when envs.args is not given
args <- args %||% list()
37
+
38
# Run the mediation analysis for a single case.
#
# The case (looked up in `cases` by name) provides the mediator `M`, outcome
# `Y`, treatment `X`, optional covariates `Cov` (a vector or one
# comma-separated string) and the names of the model functions `Model_M` and
# `Model_Y`. Two models are fitted on `indata` (M ~ X [+ covs] and
# Y ~ M + X [+ covs]) and passed to mediation::mediate() together with the
# extra arguments from `envs.args`.
#
# Args:
#   casename: Name of the case in `cases`.
#
# Returns:
#   A one-row data frame with the estimates read from the mediate() result,
#   or NULL when `d1.p` or `n1` of the result is NA.
medanalysis <- function(casename) {
    case <- cases[[casename]]
    log_info("- Case:", casename)
    M <- case$M
    Y <- case$Y
    X <- case$X
    covs <- case$Cov
    modelm <- match.fun(case$Model_M)
    modely <- match.fun(case$Model_Y)
    fmlm <- as.formula(sprintf("%s ~ %s", bQuote(M), bQuote(X)))
    fmly <- as.formula(sprintf("%s ~ %s + %s", bQuote(Y), bQuote(M), bQuote(X)))

    # Normalize the covariates. An empty or missing `Cov` cell in the formula
    # file arrives as "" or NA; treat both as "no covariates" instead of
    # building a broken formula ("~ . + ") or failing in strsplit().
    if (!is.null(covs)) {
        covs <- as.character(covs)
        covs <- covs[!is.na(covs)]
        if (length(covs) == 1) {
            covs <- trimws(strsplit(covs, ",")[[1]])
        }
        covs <- covs[nzchar(covs)]
        if (length(covs) == 0) {
            covs <- NULL
        }
    }
    if (!is.null(covs)) {
        cov_fml <- as.formula(sprintf("~ . + %s", paste(bQuote(covs), collapse = " + ")))
        fmlm <- update.formula(fmlm, cov_fml)
        fmly <- update.formula(fmly, cov_fml)
    }

    # Per-case copy of the shared arguments
    margs <- args
    margs$sims <- sims
    margs$model.m <- modelm(fmlm, data = indata)
    margs$model.y <- modely(fmly, data = indata)
    margs$treat <- X
    margs$mediator <- M
    margs$outcome <- Y
    if (!is.null(covs)) {
        margs$covariates <- indata[, covs, drop = FALSE]
    }
    med <- do_call(mediate, margs)
    if (is.na(med$d1.p) || is.na(med$n1)) {
        NULL
    } else {
        data.frame(
            Case = casename,
            M = M,
            X = X,
            Y = Y,
            ACME = med$d1,
            ACME95CI1 = med$d1.ci[1],
            ACME95CI2 = med$d1.ci[2],
            TotalEffect = med$tau.coef,
            ADE = med$z1,
            PropMediated = med$n1,
            Pval = med$d1.p
        )
    }
}
87
+
88
# Analyse all cases in parallel and stack the per-case rows
case_results <- mclapply(names(cases), medanalysis, mc.cores = ncores)
out <- do_call(rbind, case_results)

# Multiple-testing correction is skipped when envs.padj is "none"
if (padj != "none") {
    adjusted <- p.adjust(out$Pval, method = padj)
    out$Padj <- adjusted
}

write.table(out, file = outfile, sep = "\t", quote = FALSE, row.names = FALSE)
@@ -11,6 +11,7 @@ id_exprs <- {{envs.id_exprs | r}}
11
11
  pval_cols <- {{envs.pval_cols | r}}
12
12
  method <- {{envs.method | r}}
13
13
  na <- {{envs.na | r}}
14
+ keep_single <- {{envs.keep_single | r}}
14
15
  padj <- {{envs.padj | r}}
15
16
 
16
17
  if (method == "fisher") { method = "sumlog" }
@@ -102,7 +103,7 @@ if (length(infiles) == 1 && padj == "none") {
102
103
  if (length(ps) == 0) {
103
104
  metaps <- c(metaps, NA)
104
105
  ns <- c(ns, NA)
105
- } else if (length(ps) == 1) {
106
+ } else if (length(ps) == 1 && keep_single) {
106
107
  metaps <- c(metaps, ps)
107
108
  ns <- c(ns, 1)
108
109
  } else {
@@ -0,0 +1,70 @@
1
+ source("{{biopipen_dir}}/utils/misc.R")
2
+
3
+ library(metap)
4
+ library(rlang)
5
+ library(dplyr)
6
+
7
+ infile <- {{in.infile | r}}
8
+ outfile <- {{out.outfile | r}}
9
+ id_cols <- {{envs.id_cols | r}}
10
+ pval_col <- {{envs.pval_col | r}}
11
+ method <- {{envs.method | r}}
12
+ na <- {{envs.na | r}}
13
+ keep_single <- {{envs.keep_single | r}}
14
+ padj <- {{envs.padj | r}}
15
+
16
# "fisher" is accepted as an alias of the function name "sumlog"
if (method == "fisher") {
    method <- "sumlog"
}

# Both the p-value column and the grouping columns are required
if (is.null(pval_col)) {
    stop("Must provide envs.pval_col")
}
if (is.null(id_cols)) {
    stop("Must provide envs.id_cols")
}
# A single string may hold several comma-separated column names
if (length(id_cols) == 1) {
    id_cols <- trimws(strsplit(id_cols, ",")[[1]])
}

log_info("Reading input and performing meta-analysis ...")
indata <- read.table(
    infile,
    header = TRUE,
    sep = "\t",
    row.names = NULL,
    check.names = FALSE
)
# One row per group: the group size and the list of its p-values
grouped <- group_by(indata, !!!syms(id_cols))
outdata <- summarise(
    grouped,
    N = n(),
    .pvals = list(!!sym(pval_col)),
    .groups = "drop"
)
37
+
38
# Combine the p-values of one group.
#
# Args:
#   ps: Vector of p-values of the group (may contain NAs).
#
# Returns:
#   c(meta p-value, number of p-values used). Both are NA when no p-value is
#   left; a single p-value is returned as is when `keep_single` is TRUE.
combine_pvals <- function(ps) {
    if (na == -1) {
        # -1 means: drop missing p-values
        ps <- ps[!is.na(ps)]
    } else {
        # otherwise replace them with the given value
        ps[is.na(ps)] <- na
    }

    n <- length(ps)
    if (n == 0) {
        return(c(NA_real_, NA_real_))
    }
    if (n == 1 && keep_single) {
        return(c(as.numeric(ps), 1))
    }
    c(do.call(method, list(ps))$p, n)
}

# vapply() always yields a (possibly zero-length) numeric vector, so the
# MetaPval/N columns exist even when the input has no rows. Growing vectors
# from c() left them NULL in that case and arrange(MetaPval) failed.
combined <- lapply(outdata$.pvals, combine_pvals)
outdata$MetaPval <- vapply(combined, function(res) res[[1]], numeric(1))
outdata$N <- vapply(combined, function(res) res[[2]], numeric(1))
outdata$.pvals <- NULL
outdata <- outdata %>% arrange(MetaPval)

if (padj != "none") {
    log_info("Calculating adjusted p-values ...")
    outdata$MetaPadj <- p.adjust(outdata$MetaPval, method = padj)
}

log_info("Writing output ...")
write.table(outdata, outfile, quote = FALSE, sep = "\t", row.names = FALSE)
@@ -130,13 +130,6 @@ shared_clusters = function(name) {
130
130
  row.names=TRUE, col.names=TRUE, quote=FALSE, sep="\t"
131
131
  )
132
132
 
133
- if (is.null(case$heatmap_meta) || length(case$heatmap_meta) == 0) {
134
- anno = NULL
135
- } else {
136
- anno = as.list(immdata$meta[, case$heatmap_meta, drop=FALSE])
137
- anno = do_call(ComplexHeatmap::HeatmapAnnotation, anno)
138
- }
139
-
140
133
  if (!is.null(case$sample_order) && length(case$sample_order) > 0) {
141
134
  if (length(case$sample_order) == 1) {
142
135
  case$sample_order = trimws(strsplit(case$sample_order, ",")[[1]])
@@ -148,6 +141,18 @@ shared_clusters = function(name) {
148
141
  plotdata = plotdata[, case$sample_order, drop=FALSE]
149
142
  }
150
143
 
144
+ if (is.null(case$heatmap_meta) || length(case$heatmap_meta) == 0) {
145
+ anno = NULL
146
+ } else {
147
+ anno = as.list(
148
+ immdata$meta[
149
+ match(colnames(plotdata), immdata$meta$Sample),
150
+ case$heatmap_meta,
151
+ drop=FALSE
152
+ ])
153
+ anno = do_call(ComplexHeatmap::HeatmapAnnotation, anno)
154
+ }
155
+
151
156
  cluster_rows = case$cluster_rows && nrow(plotdata) > 2
152
157
  col_samples = colnames(plotdata)
153
158
  if (!cluster_rows) {
@@ -0,0 +1,91 @@
1
+ from os import path
2
+ from contextlib import suppress
3
+ from pathlib import PosixPath # noqa: F401
4
+
5
+ from biopipen.utils.reference import tabix_index
6
+ from biopipen.utils.misc import logger
7
+ from biopipen.scripts.vcf.bcftools_utils import run_bcftools
8
+
9
+ infile = {{in.infile | repr}} # pyright: ignore # noqa: E999
10
+ annfile = {{in.annfile | repr}} # pyright: ignore
11
+ outfile = {{out.outfile | repr}} # pyright: ignore
12
+ joboutdir = {{job.outdir | repr}} # pyright: ignore
13
+ envs = {{envs | dict | repr}} # pyright: ignore
14
+
15
# Pull the options handled by this script out of `envs`; whatever is left in
# `envs` afterwards is passed on to run_bcftools().
bcftools = envs.pop("bcftools")
tabix = envs.pop("tabix")
ncores = envs.pop("ncores")
columns = envs.pop("columns")
remove = envs.pop("remove")
header = envs.pop("header")
gz = envs.pop("gz")
index = envs.pop("index")

# Lists are joined into the comma-separated form
if isinstance(columns, list):
    columns = ",".join(columns)

if isinstance(remove, list):
    remove = ",".join(remove)

# The short options are not accepted; only the long names are honored.
# The backslash before "[" is part of the message and is written as "\\["
# because "\[" is an invalid escape sequence in a normal string literal
# (a SyntaxWarning on recent Python versions). The emitted text is unchanged.
if "c" in envs:
    logger.warning("Ignoring envs\\[c], use envs\\[columns] instead.")
    del envs["c"]

if "x" in envs:
    logger.warning("Ignoring envs\\[x], use envs\\[remove] instead.")
    del envs["x"]
37
+
38
envs_has_annfile = "a" in envs or "annotations" in envs
headerfile = path.join(joboutdir, "header.txt")
if header:
    # Accept a single string or a list of lines, and make sure every entry
    # ends with exactly one newline: writelines() adds no separators, so list
    # entries without a trailing newline would be glued into one line.
    header_lines = [header] if isinstance(header, str) else header
    with open(headerfile, "w") as fh:
        fh.writelines(line.rstrip("\r\n") + "\n" for line in header_lines)

# Always remove both spellings from envs so that neither is passed through
# as an extra option; in.annfile takes precedence over them.
envs_annotations = envs.pop("annotations", None)
envs_a = envs.pop("a", None)
if annfile and envs_has_annfile:
    logger.warning(
        "Ignoring envs\\[a/annotations] because in.annfile is provided."
    )
elif not annfile:
    annfile = envs_annotations or envs_a
54
+
55
+
56
if index and not gz:
    logger.warning("Forcing envs.gz to True because envs.index is True.")
    gz = True

# The "" key holds the command itself
envs[""] = [bcftools, "annotate"]
envs["o"] = outfile
envs["threads"] = ncores

# Respect an explicitly given output type, in any of its spellings
if not any(key in envs for key in ("O", "output-type", "output_type")):
    envs["O"] = "z" if gz else "v"

if columns:
    if not annfile:
        raise ValueError(
            "envs.columns specified but no in.annfile/envs.annfile provided."
        )
    envs["columns"] = columns

if remove:
    envs["remove"] = remove

if not columns and not remove:
    logger.warning(
        "No columns/remove specified, no columns will be carried over or removed."
    )

# Always pass the input file. It is indexed when columns are carried over
# from the annotation file; otherwise it does not need to be indexed.
# Previously the input was not set at all when neither columns nor remove
# was given, and the un-indexed path won when both were given.
if columns:
    envs["_"] = tabix_index(infile, "vcf", tabix=tabix)
else:
    envs["_"] = infile

if annfile:
    envs["annotations"] = tabix_index(annfile, "vcf", tabix=tabix)

if header:
    envs["header_lines"] = headerfile

run_bcftools(envs, bcftools=bcftools, index=index, tabix=tabix)