PyPI - biopipen - Versions diffs - 0.28.0__py3-none-any.whl → 0.29.0__py3-none-any.whl - Mend

biopipen 0.28.0py3-none-any.whl → 0.29.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biopipen might be problematic. Click here for more details.

Files changed (83) hide show

biopipen/__init__.py +1 -1
biopipen/core/config.toml +8 -0
biopipen/ns/bam.py +0 -2
biopipen/ns/bed.py +35 -0
biopipen/ns/cellranger_pipeline.py +5 -5
biopipen/ns/cnv.py +18 -2
biopipen/ns/cnvkit_pipeline.py +16 -11
biopipen/ns/gene.py +68 -23
biopipen/ns/misc.py +2 -15
biopipen/ns/plot.py +146 -0
biopipen/ns/regulation.py +214 -0
biopipen/ns/scrna.py +15 -3
biopipen/ns/snp.py +516 -8
biopipen/ns/stats.py +74 -2
biopipen/ns/vcf.py +196 -0
biopipen/reports/snp/PlinkCallRate.svelte +24 -0
biopipen/reports/snp/PlinkFreq.svelte +18 -0
biopipen/reports/snp/PlinkHWE.svelte +18 -0
biopipen/reports/snp/PlinkHet.svelte +18 -0
biopipen/reports/snp/PlinkIBD.svelte +18 -0
biopipen/scripts/bam/CNVpytor.py +144 -46
biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
biopipen/scripts/bed/BedtoolsMerge.py +1 -1
biopipen/scripts/cnv/AneuploidyScore.R +30 -7
biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
biopipen/scripts/cnv/TMADScore.R +21 -5
biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
biopipen/scripts/gene/GeneNameConversion.R +65 -0
biopipen/scripts/gene/GenePromoters.R +61 -0
biopipen/scripts/misc/Shell.sh +15 -0
biopipen/scripts/plot/Manhattan.R +140 -0
biopipen/scripts/plot/QQPlot.R +62 -0
biopipen/scripts/regulation/MotifAffinityTest.R +226 -0
biopipen/scripts/regulation/MotifAffinityTest_AtSNP.R +126 -0
biopipen/scripts/regulation/MotifAffinityTest_MotifBreakR.R +96 -0
biopipen/scripts/regulation/MotifScan.py +159 -0
biopipen/scripts/regulation/atSNP.R +33 -0
biopipen/scripts/regulation/motifBreakR.R +1594 -0
biopipen/scripts/scrna/CellsDistribution.R +2 -0
biopipen/scripts/scrna/MarkersFinder.R +59 -67
biopipen/scripts/scrna/SeuratClustering.R +63 -29
biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
biopipen/scripts/snp/MatrixEQTL.R +84 -43
biopipen/scripts/snp/Plink2GTMat.py +133 -0
biopipen/scripts/snp/PlinkCallRate.R +190 -0
biopipen/scripts/snp/PlinkFilter.py +100 -0
biopipen/scripts/snp/PlinkFreq.R +298 -0
biopipen/scripts/snp/PlinkFromVcf.py +78 -0
biopipen/scripts/snp/PlinkHWE.R +80 -0
biopipen/scripts/snp/PlinkHet.R +92 -0
biopipen/scripts/snp/PlinkIBD.R +197 -0
biopipen/scripts/snp/PlinkUpdateName.py +124 -0
biopipen/scripts/stats/MetaPvalue.R +2 -1
biopipen/scripts/stats/MetaPvalue1.R +70 -0
biopipen/scripts/tcr/TCRClusterStats.R +12 -7
biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
biopipen/scripts/vcf/BcftoolsSort.py +113 -0
biopipen/scripts/vcf/BcftoolsView.py +73 -0
biopipen/scripts/vcf/VcfFix_utils.py +1 -1
biopipen/scripts/vcf/bcftools_utils.py +52 -0
biopipen/utils/gene.R +83 -37
biopipen/utils/gene.py +108 -60
biopipen/utils/misc.R +56 -0
biopipen/utils/misc.py +5 -2
biopipen/utils/reference.py +54 -10
{biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/METADATA +2 -2
{biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/RECORD +78 -50
{biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/entry_points.txt +1 -1
biopipen/ns/bcftools.py +0 -111
biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
biopipen/scripts/gene/GeneNameConversion.py +0 -66
{biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/WHEEL +0 -0

biopipen/scripts/snp/PlinkIBD.R ADDED Viewed

@@ -0,0 +1,197 @@
+source("{{biopipen_dir}}/utils/misc.R")
+source("{{biopipen_dir}}/utils/plot.R")
+suppressPackageStartupMessages({
+    library(dplyr)
+    library(tidyr)
+    library(tibble)
+})
+indir    <- {{in.indir | r}}
+outdir   <- {{out.outdir | r}}
+plink    <- {{envs.plink | r}}
+indep    <- {{envs.indep | r}}
+highld   <- {{envs.highld | r}}
+devpars  <- {{envs.devpars | r}}
+pihat    <- {{envs.pihat | r}}
+samid    <- {{envs.samid | r}}
+annofile <- {{envs.anno | r}}
+doplot   <- {{envs.plot | r}}
+seed     <- {{envs.seed | r}}
+ncores   <- {{envs.ncores | r}}
+bedfile <- Sys.glob(file.path(indir, '*.bed'))
+if (length(bedfile) == 0)
+    stop("No bed files found in the input directory.")
+if (length(bedfile) > 1) {
+    log_warn("Multiple bed files found in the input directory. Using the first one.")
+    bedfile <- bedfile[1]
+}
+input  <- tools::file_path_sans_ext(bedfile)
+output <- file.path(outdir, basename(input))
+cmd <- c(
+    plink,
+    "--threads", ncores,
+    "--bfile", input,
+    "--indep-pairwise", indep,
+	# One should be mindful of running this with < 50 samples
+	# "--bad-ld",
+    "--out", output
+)
+if (!is.null(highld) && !isFALSE(highld)) {
+    cmd <- c(cmd, "--range", "--exclude", highld)
+}
+run_command(cmd, fg = TRUE)
+prunein <- paste0(output, '.prune.in')
+cmd <- c(
+    plink,
+    "--threads", ncores,
+    "--bfile", input,
+    "--extract", prunein,
+    "--genome",
+    "--out", output
+)
+run_command(cmd, fg = TRUE)
+genome <- read.table(
+    paste0(output, '.genome'),
+    row.names = NULL,
+    header = TRUE,
+    check.names = FALSE
+)
+# "unmelt" it
+# FID1 IID1 FID2 IID2 RT EZ Z0     Z1     Z2     PI_HAT PHE DST      PPC    RATIO
+# s1   s1   s2   s2   UN NA 1.0000 0.0000 0.0000 0.0000 -1  0.866584 0.0000 0.9194
+# s1   s1   s2   s2   UN NA 0.4846 0.3724 0.1431 0.3293 -1  0.913945 0.7236 2.0375
+# s1   s1   s3   s3   UN NA 1.0000 0.0000 0.0000 0.0000 -1  0.867186 0.0000 1.0791
+genome$SAMPLE1 <- paste(genome$FID1, genome$IID1, sep = "\t")
+genome$SAMPLE2 <- paste(genome$FID2, genome$IID2, sep = "\t")
+# get all samples
+samples <- unique(c(genome$SAMPLE1, genome$SAMPLE2))
+# make paired into a distance-like matrix
+similarity <- genome %>%
+    select(SAMPLE1, SAMPLE2, PI_HAT) %>%
+    pivot_wider(names_from = SAMPLE2, values_from = PI_HAT, values_fill = NA) %>%
+    as.data.frame() %>%
+    column_to_rownames("SAMPLE1")
+rm(genome)
+# get the rownames back
+samids <- rownames(similarity)
+# get samples that are missing from the rows or the columns of the matrix
+missedrow <- setdiff(samples, rownames(similarity))
+missedcol <- setdiff(samples, colnames(similarity))
+similarity[missedrow, ] <- NA
+similarity[, missedcol] <- NA
+# order the matrix
+similarity <- similarity[samples, samples, drop = FALSE]
+# transpose the matrix to get the symmetric values
+sim2 <- t(similarity)
+isna <- is.na(similarity)
+# fill the na's with their symmetric values
+similarity[isna] <- sim2[isna]
+rm(sim2)
+# pairs still missing after mirroring: set their PI_HAT to 0
+similarity[is.na(similarity)] <- 0
+# get the marks (sample pairs that fail the pihat cutoff)
+# `fails` holds linear indices into the nsams x nsams similarity matrix;
+# x / y are the row / column (i.e. sample) indices recovered from them.
+nsams <- length(samples)
+fails <- which(similarity > pihat)
+marks <- data.frame(x = (fails - 1)%%nsams + 1, y = ceiling(fails/nsams))
+diag(similarity) <- 1
+# one flag per failing pair: TRUE once one of its two samples was removed
+failflags <- rep(F, nrow(marks))
+# rank the involved sample indices by the number of failing pairs they are in
+freqs <- as.data.frame(table(factor(as.matrix(marks))))
+freqs <- freqs[order(freqs$Freq, decreasing = T), 'Var1', drop = T]
+# `freqs` is a factor whose labels are the sample indices. Convert the labels
+# back to integers: indexing `samples` with a factor would use the internal
+# level codes (1..k) rather than the sample indices the labels stand for,
+# and so pick the wrong samples whenever only some samples are involved.
+freqs <- as.integer(as.character(freqs))
+ibd.fail <- c()
+# greedily remove the most frequently involved sample until every failing
+# pair has at least one of its members removed
+while (sum(failflags) < nrow(marks)) {
+    samidx <- freqs[1]
+    ibd.fail <- c(ibd.fail, samples[samidx])
+    freqs <- freqs[-1]
+    failflags <- failflags | marks$x == samidx | marks$y == samidx
+}
+ibd_fail_file <- paste0(output, '.ibd.fail')
+writeLines(ibd.fail, ibd_fail_file)
+cmd <- c(
+    plink,
+    "--threads", ncores,
+    "--bfile", input,
+    "--remove", ibd_fail_file,
+	"--make-bed",
+    "--out", output
+)
+run_command(cmd, fg = TRUE)
+if (doplot) {
+	set.seed(seed)
+	library(ComplexHeatmap)
+	fontsize8 <- gpar(fontsize = 8)
+	fontsize9 <- gpar(fontsize = 9)
+	ht_opt$heatmap_row_names_gp <- fontsize8
+	ht_opt$heatmap_column_names_gp <- fontsize8
+	ht_opt$legend_title_gp <- fontsize9
+	ht_opt$legend_labels_gp <- fontsize8
+	ht_opt$simple_anno_size <- unit(3, "mm")
+	samids <- sapply(samples, function(sid) {
+		fidiid <- unlist(strsplit(sid, "\t", fixed = TRUE))
+		gsub(
+            "{fid}",
+            fidiid[1],
+            gsub("{iid}", fidiid[2], samid, fixed = TRUE),
+            fixed = TRUE
+        )
+	})
+	rownames(similarity) <- samids
+	colnames(similarity) <- samids
+	annos <- list()
+	if (!is.null(annofile) && !isFALSE(annofile)) {
+		options(stringsAsFactors = TRUE)
+		andata <- read.table(annofile, header = TRUE, row.names = 1, sep = "\t", check.names = FALSE)
+		andata <- andata[samids, , drop = FALSE]
+		for (anname in colnames(andata)) {
+			annos[[anname]] <- as.matrix(andata[, anname])
+		}
+		annos$annotation_name_gp <- fontsize8
+		annos <- do.call(HeatmapAnnotation, annos)
+	}
+	args <- list(
+		name = "PI_HAT",
+		cell_fun = function(j, i, x, y, width, height, fill) {
+			if (similarity[i, j] > pihat && i != j)
+				grid.points(x, y, pch = 4, size = unit(.5, "char"))
+		},
+		#heatmap_legend_param = list(
+		#	title_gp  = fontsize9,
+		#	labels_gp = fontsize8
+		#),
+		clustering_distance_rows = function(m) as.dist(1-m),
+		clustering_distance_columns = function(m) as.dist(1-m),
+		top_annotation = if (length(annos) == 0) NULL else annos
+	)
+	plotHeatmap(
+		similarity,
+		outfile = paste0(output, '.ibd.png'),
+		args = args,
+		draw = list(
+			annotation_legend_list = list(
+				Legend(
+					labels = paste(">", pihat),
+					title = "",
+					type = "points",
+					pch = 4,
+					title_gp = fontsize9,
+					labels_gp = fontsize8)),
+			merge_legend = TRUE
+        ),
+		devpars = devpars
+    )
+}

biopipen/scripts/snp/PlinkUpdateName.py ADDED Viewed

@@ -0,0 +1,124 @@
+from pathlib import Path
+from biopipen.utils.misc import run_command, dict_to_cli_args, logger
+indir = {{in.indir | repr}}  # pyright: ignore # noqa: E999
+namefile = {{in.namefile | repr}}  # pyright: ignore
+outdir = {{out.outdir | repr}}  # pyright: ignore
+plink = {{envs.plink | repr}}  # pyright: ignore
+bcftools = {{envs.bcftools | repr}}  # pyright: ignore
+ncores = {{envs.ncores | repr}}  # pyright: ignore
+match_alt = {{envs.match_alt | repr}}  # pyright: ignore
+bedfile = list(Path(indir).glob("*.bed"))
+if len(bedfile) == 0:
+    raise FileNotFoundError(f"No .bed file found in `in.indir`")
+elif len(bedfile) > 1:
+    logger.warning(f"Multiple .bed files found in `in.indir`, using the first one.")
+bedfile = bedfile[0]
+input = bedfile.with_suffix("")
+output = Path(outdir) / bedfile.stem
+if namefile.endswith(".vcf") or namefile.endswith(".vcf.gz"):
+    logger.info("VCF file received, extracting names")
+    # Decide whether the ALT column of a .bim record matches the ALT column
+    # of a VCF record under the strategy named by `match_alt`:
+    #   "none"           - always matches (ALT is not checked)
+    #   "exact"          - the two raw ALT strings are identical
+    #   "all"            - same set of comma-separated alleles (order ignored)
+    #   "any"            - at least one comma-separated allele in common
+    #   "first_included" - first bim allele is among the VCF alleles
+    #   "first"          - first bim allele equals first VCF allele
+    # Raises ValueError for any other `match_alt` value.
+    def alt_matched(bim_alt, vcf_alt, match_alt):
+        if match_alt == "none":
+            return True
+        if match_alt == "exact":
+            return bim_alt == vcf_alt
+        # The remaining strategies work on the individual alleles
+        bim_alts = bim_alt.split(",")
+        vcf_alts = vcf_alt.split(",")
+        if match_alt == "all":
+            return set(bim_alts) == set(vcf_alts)
+        if match_alt == "any":
+            return bool(set(bim_alts) & set(vcf_alts))
+        if match_alt == "first_included":
+            return bim_alts[0] in vcf_alts
+        if match_alt == "first":
+            return bim_alts[0] == vcf_alts[0]
+        raise ValueError(f"Unknown match_alt: {match_alt}")
+    # Read the next line from the open file `f` and return its tab-separated
+    # fields as a list. Returns None when the stripped line is empty, which
+    # covers end-of-file as well as a whitespace-only line.
+    def readline(f):
+        line = f.readline().strip()
+        return line.split("\t") if line else None
+    namefile_tmp = Path(outdir) / "_namefile_from_vcf.txt"
+    infofile = Path(outdir) / "_information_from_vcf_unsorted.txt"
+    sorted_infofile = Path(outdir) / "_information_from_vcf_sorted.txt"
+    sorted_bim = Path(outdir) / "_sorted_bim.txt"
+    bt_cmd = [
+        bcftools, "query",
+        "-f", "%CHROM\\t%ID\\t0\\t%POS\\t%ALT\\t%REF\\n",
+        "-o", infofile,
+        namefile,
+    ]
+    ## infofile
+    # 1	rs10492	0	10492	T	C
+    logger.info("- Extracting information from VCF file ...")
+    run_command(bt_cmd, fg=True)
+    # sort infofile
+    logger.info("- Sorting the information from VCF file ...")
+    run_command(
+        [
+            "sort",
+            "-k1,1", "-k4,4n", "-k6,6",
+            infofile,
+            "--parallel", ncores,
+            "-o", sorted_infofile
+        ],
+        env={"LC_ALL": "C"},
+        fg=True,
+    )
+    ## .bim file
+    # 1	1_10492	0	10492	T	C
+    # sort .bim file
+    logger.info("- Sorting the .bim file ...")
+    run_command(
+        [
+            "sort",
+            "-k1,1", "-k4,4n", "-k6,6",
+            input.with_suffix(".bim"),
+            "--parallel", ncores,
+            "-o", sorted_bim
+        ],
+        env={"LC_ALL": "C"},
+        fg=True,
+    )
+    # query namefile for records in sorted bim file
+    logger.info("- Matching and generating the name file ...")
+    with sorted_bim.open() as fbim, sorted_infofile.open() as finfo, namefile_tmp.open("w") as fout:  # noqa: E501
+        bim = readline(fbim)
+        info = readline(finfo)
+        while bim and info:
+            if (
+                bim[0] == info[0]
+                and bim[3] == info[3]
+                and bim[5] == info[5]
+                and alt_matched(bim[4], info[4], match_alt)
+            ):
+                fout.write(f"{bim[1]}\t{info[1]}\n")
+                bim = readline(fbim)
+                info = readline(finfo)
+            elif (
+                bim[0] < info[0]
+                or (bim[0] == info[0] and bim[3] < info[3])
+                or (bim[0] == info[0] and bim[3] == info[3] and bim[5] < info[5])
+            ):
+                bim = readline(fbim)
+            else:
+                info = readline(finfo)
+    namefile = namefile_tmp
+args = {
+    "": plink,
+    "bfile": input,
+    "out": output,
+    "make_bed": True,
+    "update_name": namefile,
+}
+run_command(dict_to_cli_args(args, dashify=True), fg=True)

biopipen/scripts/stats/MetaPvalue.R CHANGED Viewed

@@ -11,6 +11,7 @@ id_exprs <- {{envs.id_exprs | r}}
 pval_cols <- {{envs.pval_cols | r}}
 method <- {{envs.method | r}}
 na <- {{envs.na | r}}
+keep_single <- {{envs.keep_single | r}}
 padj <- {{envs.padj | r}}
 if (method == "fisher") { method = "sumlog" }
@@ -102,7 +103,7 @@ if (length(infiles) == 1 && padj == "none") {
         if (length(ps) == 0) {
             metaps <- c(metaps, NA)
             ns <- c(ns, NA)
-        } else if (length(ps) == 1) {
+        } else if (length(ps) == 1 && keep_single) {
             metaps <- c(metaps, ps)
             ns <- c(ns, 1)
         } else {

biopipen/scripts/stats/MetaPvalue1.R ADDED Viewed

@@ -0,0 +1,70 @@
+source("{{biopipen_dir}}/utils/misc.R")
+library(metap)
+library(rlang)
+library(dplyr)
+infile <- {{in.infile | r}}
+outfile <- {{out.outfile | r}}
+id_cols <- {{envs.id_cols | r}}
+pval_col <- {{envs.pval_col | r}}
+method <- {{envs.method | r}}
+na <- {{envs.na | r}}
+keep_single <- {{envs.keep_single | r}}
+padj <- {{envs.padj | r}}
+if (method == "fisher") { method = "sumlog" }
+# Check pval_col
+if (is.null(pval_col)) { stop("Must provide envs.pval_col") }
+# Check id_cols
+if (is.null(id_cols)) { stop("Must provide envs.id_cols") }
+if (length(id_cols) == 1) {
+    id_cols <- trimws(strsplit(id_cols, ",")[[1]])
+}
+log_info("Reading input and performing meta-analysis ...")
+outdata <- read.table(
+        infile, header = TRUE, sep = "\t", row.names = NULL, check.names = FALSE
+    ) %>%
+    group_by(!!!syms(id_cols)) %>%
+    summarise(
+        N = n(),
+        .pvals = list(!!sym(pval_col)),
+        .groups = "drop"
+    )
+metaps <- c()
+ns <- c()
+for (ps in outdata$.pvals) {
+    if (na == -1) {
+        ps <- ps[!is.na(ps)]
+    } else {
+        ps[is.na(ps)] <- na
+    }
+    if (length(ps) == 0) {
+        metaps <- c(metaps, NA)
+        ns <- c(ns, NA)
+    } else if (length(ps) == 1 && keep_single) {
+        metaps <- c(metaps, ps)
+        ns <- c(ns, 1)
+    } else {
+        metaps <- c(metaps, do.call(method, list(ps))$p)
+        ns <- c(ns, length(ps))
+    }
+}
+outdata$MetaPval <- metaps
+outdata$N <- ns
+outdata$.pvals <- NULL
+outdata <- outdata %>% arrange(MetaPval)
+if (padj != "none") {
+    log_info("Calculating adjusted p-values ...")
+    outdata$MetaPadj <- p.adjust(outdata$MetaPval, method = padj)
+}
+log_info("Writing output ...")
+write.table(outdata, outfile, quote = FALSE, sep = "\t", row.names = FALSE)

biopipen/scripts/tcr/TCRClusterStats.R CHANGED Viewed

@@ -130,13 +130,6 @@ shared_clusters = function(name) {
         row.names=TRUE, col.names=TRUE, quote=FALSE, sep="\t"
     )
-    if (is.null(case$heatmap_meta) || length(case$heatmap_meta) == 0) {
-        anno = NULL
-    } else {
-        anno = as.list(immdata$meta[, case$heatmap_meta, drop=FALSE])
-        anno = do_call(ComplexHeatmap::HeatmapAnnotation, anno)
-    }
     if (!is.null(case$sample_order) && length(case$sample_order) > 0) {
         if (length(case$sample_order) == 1) {
             case$sample_order = trimws(strsplit(case$sample_order, ",")[[1]])
@@ -148,6 +141,18 @@ shared_clusters = function(name) {
         plotdata = plotdata[, case$sample_order, drop=FALSE]
     }
+    if (is.null(case$heatmap_meta) || length(case$heatmap_meta) == 0) {
+        anno = NULL
+    } else {
+        anno = as.list(
+            immdata$meta[
+                match(colnames(plotdata), immdata$meta$Sample),
+                case$heatmap_meta,
+                drop=FALSE
+            ])
+        anno = do_call(ComplexHeatmap::HeatmapAnnotation, anno)
+    }
     cluster_rows = case$cluster_rows && nrow(plotdata) > 2
     col_samples = colnames(plotdata)
     if (!cluster_rows) {

biopipen/scripts/vcf/BcftoolsAnnotate.py ADDED Viewed

@@ -0,0 +1,91 @@
+from os import path
+from contextlib import suppress
+from pathlib import PosixPath  # noqa: F401
+from biopipen.utils.reference import tabix_index
+from biopipen.utils.misc import logger
+from biopipen.scripts.vcf.bcftools_utils import run_bcftools
+infile = {{in.infile | repr}}  # pyright: ignore # noqa: E999
+annfile = {{in.annfile | repr}}  # pyright: ignore
+outfile = {{out.outfile | repr}}  # pyright: ignore
+joboutdir = {{job.outdir | repr}}  # pyright: ignore
+envs = {{envs | dict | repr}}  # pyright: ignore
+bcftools = envs.pop("bcftools")
+tabix = envs.pop("tabix")
+ncores = envs.pop("ncores")
+columns = envs.pop("columns")
+remove = envs.pop("remove")
+header = envs.pop("header")
+gz = envs.pop("gz")
+index = envs.pop("index")
+if isinstance(columns, list):
+    columns = ",".join(columns)
+if "c" in envs:
+    logger.warning("Ignoring envs\[c], use envs\[columns] instead.")
+    del envs["c"]
+if isinstance(remove, list):
+    remove = ",".join(remove)
+if "x" in envs:
+    logger.warning("Ignoring envs\[x], use envs\[remove] instead.")
+    del envs["x"]
+envs_has_annfile = "a" in envs or "annotations" in envs
+headerfile = path.join(joboutdir, "header.txt")
+if header:
+    with open(headerfile, "w") as fh:
+        fh.writelines(header)
+if annfile and envs_has_annfile:
+    logger.warning(
+        "Ignoring envs\[a/annotations] because in.annfile is provided."
+    )
+    with suppress(KeyError):
+        del envs["a"]
+    with suppress(KeyError):
+        del envs["annotations"]
+elif not annfile and envs_has_annfile:
+    annfile = envs.pop("annotations", None) or envs.pop("a", None)
+if index and not gz:
+    logger.warning("Forcing envs.gz to True because envs.index is True.")
+    gz = True
+envs[""] = [bcftools, "annotate"]
+envs["o"] = outfile
+envs["threads"] = ncores
+if "O" not in envs and "output-type" not in envs and "output_type" not in envs:
+    envs["O"] = "z" if gz else "v"
+if columns:
+    envs["columns"] = columns
+    if not annfile:
+        raise ValueError(
+            "envs.columns specified but no in.annfile/envs.annfile provided."
+        )
+    envs["_"] = tabix_index(infile, "vcf", tabix=tabix)
+if remove:
+    envs["remove"] = remove
+    # no need to index it
+    envs["_"] = infile
+if "columns" not in envs and "remove" not in envs:
+    logger.warning(
+        "No columns/remove specified, no columns will be carried over or removed."
+    )
+if annfile:
+    envs["annotations"] = tabix_index(annfile, "vcf", tabix=tabix)
+if header:
+    envs["header_lines"] = headerfile
+run_bcftools(envs, bcftools=bcftools, index=index, tabix=tabix)

biopipen/scripts/vcf/BcftoolsFilter.py ADDED Viewed

@@ -0,0 +1,90 @@
+from pathlib import Path, PosixPath  # noqa: F401
+from biopipen.utils.misc import logger
+from biopipen.scripts.vcf.bcftools_utils import run_bcftools
+infile = {{in.infile | repr}}  # pyright: ignore # noqa: E999
+outfile = {{out.outfile | repr}}  # pyright: ignore
+outdir = Path(outfile).parent
+envs = {{envs | dict | repr}}  # pyright: ignore
+bcftools = envs.pop("bcftools")
+tabix = envs.pop("tabix")
+keep = envs.pop("keep")
+ncores = envs.pop("ncores")
+includes = envs.pop("includes")
+excludes = envs.pop("excludes")
+gz = envs.pop("gz")
+index = envs.pop("index")
+# a.vcf.gz -> a
+# a.vcf -> a
+stem = Path(infile).stem
+if stem.endswith(".vcf"):
+    stem = stem[:-4]
+# .vcf.gz
+# .gz
+ext = ".vcf.gz" if index or gz else '.vcf'
+# Normalize a filter specification into a dict {filter_name: (expression, flag)}.
+#   expr: falsy (-> empty dict), a list of expressions, a dict mapping
+#         filter names to expressions, or a single expression.
+#   flag: stored alongside each expression and used, upper-cased, in
+#         generated names.
+#   prev_n_filters: offset added to the running number in generated names,
+#         so numbering can continue after an earlier group of filters.
+# Unnamed expressions are named FILTER_<FLAG>_<n>.
+def normalize_expr(expr, flag, prev_n_filters=0):
+    out = {}
+    if not expr:
+        return out
+    if isinstance(expr, list):
+        for ex in expr:
+            out[f"FILTER_{flag.upper()}_{len(out) + 1 + prev_n_filters}"] = (ex, flag)
+    elif isinstance(expr, dict):
+        # Names supplied by the caller are kept as they are
+        for name, ex in expr.items():
+            out[name] = (ex, flag)
+    else: # str
+        out[f"FILTER_{flag.upper()}_{len(out) + 1 + prev_n_filters}"] = (expr, flag)
+    return out
+# Run bcftools once for a single filter and return the path it wrote to.
+#   vcf:   input file for this run
+#   fname: filter name (also used in the intermediate file name)
+#   filt:  filter expression
+#   flag:  key under which the expression is passed to bcftools
+#   final: whether this is the last filter; the last run writes to `outfile`,
+#          earlier runs write to `<outdir>/<stem>.<fname><ext>`
+# Starts from a copy of the module-level `envs`, so the shared options are
+# not modified between runs.
+def handle_filter(vcf, fname, filt, flag, final):
+    logger.info("- Handling filter %s: %s ...", fname, filt)
+    arguments = envs.copy()
+    arguments[flag] = filt
+    arguments["_"] = vcf
+    arguments["o"] = outfile if final else outdir / f"{stem}.{fname}{ext}"
+    if keep:
+        arguments["s"] = fname
+    # Indexing is requested only for the final output
+    run_bcftools(arguments, bcftools=bcftools, index=index and final, tabix=tabix)
+    if final:
+        # Also expose the final output under the per-filter file name by
+        # symlinking it to `outfile`, replacing a stale symlink if present
+        flagfile = outdir.joinpath(f"{stem}.{fname}{ext}")
+        if flagfile.is_symlink():
+            flagfile.unlink()
+        outdir.joinpath(f"{stem}.{fname}{ext}").symlink_to(outfile)
+    return arguments["o"]
+includes = normalize_expr(includes, "include")
+excludes = normalize_expr(excludes, "exclude", len(includes))
+includes.update(excludes)
+if index and not gz:
+    logger.warning("Forcing envs.gz to True because envs.index is True.")
+    gz = True
+envs[""] = [bcftools, "filter"]
+envs["_"] = infile
+envs["o"] = outfile
+envs["threads"] = ncores
+if "O" not in envs and "output-type" not in envs and "output_type" not in envs:
+    envs["O"] = "z" if gz else "v"
+if keep:
+    envs["soft_filter"] = "+"
+if "m" not in envs and "mode" not in envs:
+    envs["m"] = "+"
+# bcftools is run once per filter, each run taking the previous run's output as its input
+for i, (fname, (filt, flag)) in enumerate(includes.items()):
+    infile = handle_filter(infile, fname, filt, flag, i == len(includes) - 1)

biopipen 0.28.0__py3-none-any.whl → 0.29.0__py3-none-any.whl

Potentially problematic release.

biopipen 0.28.0py3-none-any.whl → 0.29.0py3-none-any.whl