PyPI - biopipen - Versions diffs - 0.28.1__py3-none-any.whl → 0.29.1__py3-none-any.whl - Mend

biopipen 0.28.1__py3-none-any.whl → 0.29.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biopipen might be problematic. Click here for more details.

Files changed (85)

biopipen/__init__.py +1 -1
biopipen/core/config.toml +8 -0
biopipen/ns/bam.py +0 -2
biopipen/ns/bed.py +35 -0
biopipen/ns/cellranger_pipeline.py +5 -5
biopipen/ns/cnv.py +18 -2
biopipen/ns/cnvkit_pipeline.py +16 -11
biopipen/ns/gene.py +68 -23
biopipen/ns/misc.py +2 -15
biopipen/ns/plot.py +204 -0
biopipen/ns/regulatory.py +214 -0
biopipen/ns/scrna.py +31 -5
biopipen/ns/snp.py +516 -8
biopipen/ns/stats.py +167 -3
biopipen/ns/vcf.py +196 -0
biopipen/reports/snp/PlinkCallRate.svelte +24 -0
biopipen/reports/snp/PlinkFreq.svelte +18 -0
biopipen/reports/snp/PlinkHWE.svelte +18 -0
biopipen/reports/snp/PlinkHet.svelte +18 -0
biopipen/reports/snp/PlinkIBD.svelte +18 -0
biopipen/scripts/bam/CNVpytor.py +144 -46
biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
biopipen/scripts/bed/BedtoolsMerge.py +1 -1
biopipen/scripts/cnv/AneuploidyScore.R +30 -7
biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
biopipen/scripts/cnv/TMADScore.R +21 -5
biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
biopipen/scripts/delim/SampleInfo.R +10 -5
biopipen/scripts/gene/GeneNameConversion.R +65 -0
biopipen/scripts/gene/GenePromoters.R +61 -0
biopipen/scripts/misc/Shell.sh +15 -0
biopipen/scripts/plot/Manhattan.R +146 -0
biopipen/scripts/plot/QQPlot.R +146 -0
biopipen/scripts/regulatory/MotifAffinityTest.R +226 -0
biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +126 -0
biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +96 -0
biopipen/scripts/regulatory/MotifScan.py +159 -0
biopipen/scripts/regulatory/atSNP.R +33 -0
biopipen/scripts/regulatory/motifBreakR.R +1594 -0
biopipen/scripts/scrna/MarkersFinder.R +69 -67
biopipen/scripts/scrna/SeuratClustering.R +71 -29
biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
biopipen/scripts/scrna/SeuratPreparing.R +252 -122
biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
biopipen/scripts/snp/MatrixEQTL.R +85 -44
biopipen/scripts/snp/Plink2GTMat.py +133 -0
biopipen/scripts/snp/PlinkCallRate.R +190 -0
biopipen/scripts/snp/PlinkFilter.py +100 -0
biopipen/scripts/snp/PlinkFreq.R +298 -0
biopipen/scripts/snp/PlinkFromVcf.py +78 -0
biopipen/scripts/snp/PlinkHWE.R +80 -0
biopipen/scripts/snp/PlinkHet.R +92 -0
biopipen/scripts/snp/PlinkIBD.R +200 -0
biopipen/scripts/snp/PlinkUpdateName.py +124 -0
biopipen/scripts/stats/Mediation.R +94 -0
biopipen/scripts/stats/MetaPvalue.R +2 -1
biopipen/scripts/stats/MetaPvalue1.R +70 -0
biopipen/scripts/tcr/TCRClusterStats.R +12 -7
biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
biopipen/scripts/vcf/BcftoolsSort.py +113 -0
biopipen/scripts/vcf/BcftoolsView.py +73 -0
biopipen/scripts/vcf/VcfFix_utils.py +1 -1
biopipen/scripts/vcf/bcftools_utils.py +52 -0
biopipen/utils/gene.R +83 -37
biopipen/utils/gene.py +108 -60
biopipen/utils/misc.R +56 -0
biopipen/utils/misc.py +5 -2
biopipen/utils/reference.py +54 -10
{biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/METADATA +2 -2
{biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/RECORD +80 -51
{biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/entry_points.txt +1 -1
biopipen/ns/bcftools.py +0 -111
biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
biopipen/scripts/gene/GeneNameConversion.py +0 -66
{biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/WHEEL +0 -0

biopipen/scripts/snp/MatrixEQTL.R CHANGED Viewed

@@ -1,5 +1,6 @@
 source("{{biopipen_dir}}/utils/misc.R")
 library(rlang)
+library(rtracklayer)
 library(MatrixEQTL)
 snpfile = {{in.geno | r}}
@@ -11,6 +12,7 @@ outfile = {{out.cisqtls | r}}
 model = {{envs.model | r}}
 pval = {{envs.pval | r}}
+match_samples = {{envs.match_samples | r}}
 transp = {{envs.transp | r}}
 fdr = {{envs.fdr | r}}
 snppos = {{envs.snppos | r}}
@@ -36,7 +38,9 @@ if (!trans_enabled && !cis_enabled) {
     transp <- 1e-5
 }
-transpose_file <- function(file) {
+transpose_file <- function(file, what) {
+    if (is.null(file)) return(NULL)
+    log_info("Transposing {what} file ...")
     out <- file.path(joboutdir, paste0(
         tools::file_path_sans_ext(basename(file)),
         ".transposed.",
@@ -47,10 +51,11 @@ transpose_file <- function(file) {
     out
 }
-if (transpose_geno) snpfile = transpose_file(snpfile)
-if (transpose_expr) expfile = transpose_file(expfile)
-if (transpose_cov) covfile = transpose_file(covfile)
+if (transpose_geno) snpfile = transpose_file(snpfile, "geno")
+if (transpose_expr) expfile = transpose_file(expfile, "expr")
+if (transpose_cov) covfile = transpose_file(covfile, "cov")
+log_info("Loading SNP data ...")
 snps = SlicedData$new();
 snps$fileDelimiter = "\t";       # the TAB character
 snps$fileOmitCharacters = "NA";  # denote missing values;
@@ -59,6 +64,7 @@ snps$fileSkipColumns = 1;        # one column of row labels
 snps$fileSliceSize = 10000;      # read file in pieces of 2,000 rows
 snps$LoadFile( snpfile );
+log_info("Loading gene expression data ...")
 gene = SlicedData$new();
 gene$fileDelimiter = "\t";       # the TAB character
 gene$fileOmitCharacters = "NA";  # denote missing values;
@@ -69,16 +75,39 @@ gene$LoadFile( expfile );
 cvrt = SlicedData$new();
 if (!is.null(covfile) && file.exists(covfile)) {
-    covmatrix = t(read.table.inopts(covfile, list(cnames=TRUE, rnames=TRUE)))
+    log_info("Loading covariate data ...")
+    covmatrix = read.table(covfile, header=TRUE, stringsAsFactors=FALSE, row.names=1, sep="\t", quote="", check.names=FALSE)
     cvrt$CreateFromMatrix( as.matrix(covmatrix) )
 }
+log_info("Matching samples ...")
+if (match_samples) {
+    # let matrixEQTL raise an error if samples do not match
+} else {
+    n_sample_snps = snps$nCols()
+    n_sample_gene = gene$nCols()
+    common_samples = intersect(snps$columnNames, gene$columnNames)
+    if (!is.null(covfile)) {
+        common_samples = intersect(common_samples, cvrt$columnNames)
+        n_sample_cov = cvrt$nCols()
+        cvrt = cvrt$ColumnSubsample(match(common_samples, cvrt$columnNames))
+    }
+    snps = snps$ColumnSubsample(match(common_samples, snps$columnNames))
+    gene = gene$ColumnSubsample(match(common_samples, gene$columnNames))
+    log_info("- Samples used in SNP data: {n_sample_snps} -> {snps$nCols()}")
+    log_info("- Samples used in gene expression data: {n_sample_gene} -> {gene$nCols()}")
+    if (!is.null(covfile)) {
+        log_info("- Samples used in covariate data: {n_sample_cov} -> {cvrt$nCols()}")
+    }
+}
+log_info("Composing engine parameters ...")
 engine_params = list()
 engine_params$snps = snps
 engine_params$gene = gene
 engine_params$cvrt = cvrt
-engine_params$output_file_name = ifelse(trans_enabled, alleqtl, NULL)
-engine_params$pvOutputThreshold = ifelse(trans_enabled, transp, 0)
+engine_params$output_file_name = if(trans_enabled) alleqtl else NULL
+engine_params$pvOutputThreshold = if(trans_enabled) min(transp, 1) else 0
 engine_params$useModel = model
 engine_params$errorCovariance = numeric()
 engine_params$verbose = TRUE
@@ -89,66 +118,78 @@ noq = function(s) {
 }
 if (cis_enabled) {
+    log_info("Loading SNP positions ...")
     if (endsWith(snppos, ".bed")) {
-        snppos_data = read.table.inopts(snppos,
-                                        list(cnames=FALSE, rnames=FALSE))
-        snppos_data = snppos_data[, c(4, 1, 2)]
-        colnames(snppos_data) = c("snp", "chr", "pos")
+        snppos_data = read.table(snppos, header = FALSE, stringsAsFactors = FALSE, sep = "\t")
+        snppos_data = data.frame(
+            snp = snppos_data$V4,
+            chr = snppos_data$V1,
+            pos = snppos_data$V3
+        )
     } else if (endsWith(snppos, ".gff") || endsWith(snppos, ".gtf")) {
-        snppos_data = read.table.inopts(snppos,
-                                        list(cnames=FALSE, rnames=FALSE));
-        snppos_data = snppos_data[, c(9, 1, 4)]
-        colnames(snppos_data) = c("snp", "chr", "pos")
-        snppos_data$snp = unlist(lapply(snppos_data$snp, function(x) {
-            for (s in unlist(strsplit(x, '; ', fixed=T))) {
-                if (startsWith(s, "snp_id "))
-                    return(noq(substring(s, 8)))
-                else if (startsWith(s, "rs_id "))
-                    return(noq(substring(s, 7)))
-                else if (startsWith(s, "rs "))
-                    return(noq(substring(s, 4)))
-            }
-        }))
+        snppos_data = import(snppos)
+        elem_meta = elementMetadata(snppos_data)
+        snppos_data = data.frame(
+            snp = elem_meta$snp_id %||% elem_meta$rs_id %||% elem_meta$rs,
+            chr = as.character(seqnames(snppos_data)),
+            pos = start(snppos_data)
+        )
     } else if (endsWith(snppos, ".vcf") || endsWith(snppos, ".vcf.gz")) {
-        snppos_data = read.table.inopts(snppos,
-                                        list(cnames=FALSE, rnames=FALSE))
+        snppos_data = read.table(
+            snppos,
+            header=FALSE,
+            row.names=NULL,
+            stringsAsFactors=FALSE,
+            check.names=FALSE
+        )
         snppos_data = snppos_data[, c(3, 1, 2)]
         colnames(snppos_data) = c("snp", "chr", "pos")
     } else {
-        snppos_data = read.table.inopts(snppos, list(cnames=TRUE))
+        snppos_data = read.table(
+            snppos,
+            header=FALSE,
+            row.names=NULL,
+            stringsAsFactors=FALSE,
+            check.names=FALSE
+        )
         colnames(snppos_data) = c("snp", "chr", "pos")
     }
+    log_info("Loading gene positions ...")
     if (endsWith(genepos, ".bed")) {
-        genepos_data = read.table.inopts(genepos,
-                                         list(cnames=FALSE, rnames=FALSE))
-        genepos_data = genepos_data[, c(4, 1:3)]
-        colnames(genepos_data) = c("geneid", "chr", "s1", "s2")
+        genepos_data = read.table(genepos, header = FALSE, stringsAsFactors = FALSE, sep = "\t")
+        genepos_data = data.frame(
+            geneid = genepos_data$V4,
+            chr = genepos_data$V1,
+            s1 = genepos_data$V2,
+            s2 = genepos_data$V3
+        )
     } else if (endsWith(genepos, ".gff") || endsWith(genepos, ".gtf")) {
-        genepos_data = read.table.inopts(genepos,
-                                         list(cnames=FALSE, rnames=FALSE))
-        genepos_data = genepos_data[, c(9, 1, 4, 5)]
-        colnames(genepos_data) = c("geneid", "chr", "s1", "s2")
-        genepos_data$geneid = noquote(unlist(lapply(genepos_data$geneid, function(x) {
-            for (s in unlist(strsplit(x, '; ', fixed=T))) {
-                if (startsWith(s, "gene_id "))
-                    return(noq(substring(s, 9)))
-            }
-        })))
+        genepos_data = import(genepos)
+        elem_meta = elementMetadata(genepos_data)
+        genepos_data = data.frame(
+            geneid = elem_meta$gene_id %||% elem_meta$gene_name,
+            chr = as.character(seqnames(genepos_data)),
+            s1 = start(genepos_data),
+            s2 = end(genepos_data)
+        )
     } else {
         genepos_data = read.table(genepos, header = TRUE, stringsAsFactors = FALSE);
         colnames(genepos_data) = c("geneid", "chr", "s1", "s2")
     }
+    log_info("Running MatrixEQTL with cis-eQTLs enabled ...")
     engine_params$output_file_name.cis = outfile
-    engine_params$pvOutputThreshold.cis = pval
+    engine_params$pvOutputThreshold.cis = min(pval, 1)
     engine_params$cisDist = dist
     engine_params$snpspos = snppos_data
     engine_params$genepos = genepos_data
     do_call(Matrix_eQTL_main, engine_params)
+    if (!file.exists(alleqtl)) file.create(alleqtl)
 } else {
+    log_info("Running MatrixEQTL without cis-eQTLs ...")
     do_call(Matrix_eQTL_engine, engine_params)
-    file.create(outfile)
+    if (!file.exists(outfile)) file.create(outfile)
 }
 if (pval == 0) {

biopipen/scripts/snp/Plink2GTMat.py ADDED Viewed

@@ -0,0 +1,133 @@
+from os import path
+from glob import glob
+from biopipen.utils.misc import run_command, logger
+indir = {{in.indir | repr}}  # noqa: E999 # pyright: ignore
+outfile = {{out.outfile | repr}}  # pyright: ignore
+plink = {{envs.plink | repr}}  # pyright: ignore
+ncores = {{envs.ncores | repr}}  # pyright: ignore
+transpose = {{envs.transpose | repr}}  # pyright: ignore
+samid = {{envs.samid | repr}}  # pyright: ignore
+varid = {{envs.varid | repr}}  # pyright: ignore
+trans_chr = {{envs.trans_chr | repr}}  # pyright: ignore
+missing_id = {{envs.missing_id | repr}}  # pyright: ignore
+trans_chr = trans_chr or {}
+bedfile = glob(path.join(indir, '*.bed'))
+if len(bedfile) == 0:
+    raise FileNotFoundError(f"No .bed file found in `in.indir`")
+elif len(bedfile) > 1:
+    logger.warning(f"Multiple .bed files found in `in.indir`, using the first one.")
+bedfile = bedfile[0]
+input   = path.splitext(bedfile)[0]
+output  = path.splitext(outfile)[0]
+cmd = [
+    plink,
+    "--bfile", input,
+    "--out", output,
+    "--threads", ncores,
+    "--keep-allele-order",
+    "--recode", "A-transpose" if not transpose else "A",
+]
+# if transpose:
+#     cmd += ["tabx"]
+run_command(cmd, fg=True, env={"cwd": path.dirname(outfile)})
+if not transpose:  # rows are variants, columns are samples
+    # .traw file is created, tab-separated, with the following columns:
+    trawfile = output + ".traw"
+    # CHR     Chromosome code
+    # SNP     Variant identifier
+    # (C)M	  Position in morgans or centimorgans
+    # POS     Base-pair coordinate
+    # COUNTED Counted allele (defaults to A1), the actual alternative allele
+    #           with --keep-allele-order
+    # ALT     Other allele(s), comma-separated, the actual reference allele
+    # <FID>_<IID>... Allelic dosages
+    #   (0/1/2/'NA' for diploid variants, 0/2/'NA' for haploid)
+    with open(trawfile, 'r') as fin:
+        with open(outfile, 'w') as fout:
+            samples = fin.readline().strip().split('\t')[6:]
+            header = ["Variant"]
+            for sam in samples:
+                try:
+                    fid, iid = sam.split('_')
+                except ValueError:
+                    raise ValueError(
+                        f"Can't determine FID and IID from sample ID: {sam}, "
+                        f"extra underscore (_) detected."
+                    ) from None
+                sam = samid.replace('{fid}', fid).replace('{iid}', iid)
+                header.append(sam)
+            fout.write('\t'.join(header) + '\n')
+            for line in fin:
+                line = line.strip().split('\t')
+                chrom = trans_chr.get(line[0], line[0])
+                var = line[1]
+                if var == "." or var == "":
+                    var = missing_id
+                pos = line[3]
+                ref = line[5]
+                alt = line[4]
+                variant = (
+                    varid
+                    .replace('{chr}', chrom)
+                    .replace('{varid}', var)
+                    .replace('{pos}', pos)
+                    .replace('{ref}', ref)
+                    .replace('{alt}', alt)
+                )
+                record = [variant] + line[6:]
+                fout.write('\t'.join(record) + '\n')
+else:
+    # .raw file is created, tab-separated, with the following columns:
+    rawfile = output + ".raw"
+    # FID       Family ID
+    # IID       Individual ID
+    # PAT       Paternal ID
+    # MAT       Maternal ID
+    # SEX       Sex (1 = male, 2 = female, 0 = unknown)
+    # PHENOTYPE Main phenotype value
+    # <VariantID>... Allelic dosage (0/1/2/NA for diploid variants, 0/2/NA for haploid)
+    #
+    # Variant information may not be included in <VariantID>
+    # We use the .bim file to get the variant information
+    bimfile = input + ".bim"
+    with open(rawfile, 'r') as fin:
+        with open(outfile, 'w') as fout:
+            header = ["Sample"]
+            with open(bimfile, 'r') as fbim:
+                for line in fbim:
+                    line = line.strip().split('\t')
+                    chrom = trans_chr.get(line[0], line[0])
+                    var = line[1]
+                    if var == "." or var == "":
+                        var = missing_id
+                    pos = line[3]
+                    ref = line[5]
+                    alt = line[4]
+                    variant = (
+                        varid
+                        .replace('{chr}', chrom)
+                        .replace('{varid}', var)
+                        .replace('{pos}', pos)
+                        .replace('{ref}', ref)
+                        .replace('{alt}', alt)
+                    )
+                    header.append(variant)
+            fout.write('\t'.join(header) + '\n')
+            next(fin)  # skip header
+            for line in fin:
+                line = line.strip().split('\t')
+                fid = line[0]
+                iid = line[1]
+                sam = samid.replace('{fid}', fid).replace('{iid}', iid)
+                record = [sam] + line[6:]
+                fout.write('\t'.join(record) + '\n')

biopipen/scripts/snp/PlinkCallRate.R ADDED Viewed

@@ -0,0 +1,190 @@
+source("{{biopipen_dir}}/utils/misc.R")
+source("{{biopipen_dir}}/utils/plot.R")
+library(ggprism)
+theme_set(theme_prism())
+indir <- {{in.indir | r}}
+outdir <- {{out.outdir | r}}
+plink <- {{envs.plink | r}}
+ncores <- {{envs.ncores | r}}
+doplot <- {{envs.plot | r}}
+devpars <- {{envs.devpars | r}}
+samplecr <- {{envs.samplecr | r}}
+varcr <- {{envs.varcr | r}}
+max_iter <- {{envs.max_iter | r}}
+bedfile = Sys.glob(file.path(indir, '*.bed'))
+if (length(bedfile) == 0)
+    stop("No bed files found in the input directory.")
+if (length(bedfile) > 1) {
+    log_warn("Multiple bed files found in the input directory. Using the first one.")
+    bedfile <- bedfile[1]
+}
+input <- tools::file_path_sans_ext(bedfile)
+output <- file.path(outdir, basename(input))
+all_smiss_file = paste0(output, '.smiss')
+all_vmiss_file = paste0(output, '.vmiss')
+all_samplecr_fail_file = paste0(output, '.samplecr.fail')
+all_varcr_fail_file = paste0(output, '.varcr.fail')
+if (file.exists(all_smiss_file)) invisible(file.remove(all_smiss_file))
+if (file.exists(all_vmiss_file)) invisible(file.remove(all_vmiss_file))
+for (i in 1:max_iter) {
+    log_info("Iteration {i} ...")
+    # iter_out <- paste0(output, "-", i)
+    iter_dir <- file.path(outdir, paste0("iter", i))
+    dir.create(iter_dir, showWarnings = FALSE)
+    iter_out <- file.path(iter_dir, basename(output))
+    cmd <- c(
+        plink,
+        "--threads", ncores,
+        "--bfile", input,
+        "--missing",
+        "--out", iter_out
+    )
+    run_command(cmd, fg = TRUE)
+    smissfile <- paste0(iter_out, '.smiss')
+    smiss <- read.table(
+        smissfile,
+        header = TRUE,
+        row.names = NULL,
+        check.names = FALSE,
+        comment.char = ""
+    )
+    smiss$Iteration <- i
+    # append it to all_smiss_file
+    write.table(
+        smiss,
+        all_smiss_file,
+        append = i > 1,
+        col.names = !file.exists(all_smiss_file),
+        row.names = FALSE,
+        sep = "\t",
+        quote = FALSE
+    )
+    callrate.sample <- data.frame(Callrate = 1 - smiss$F_MISS)
+    rownames(callrate.sample) <- paste(smiss$FID, smiss$IID, sep = "\t")
+    callrate.sample.fail = rownames(callrate.sample[
+        callrate.sample$Callrate < samplecr, , drop = FALSE
+    ])
+    writeLines(callrate.sample.fail, con = file(paste0(iter_out, '.samplecr.fail')))
+    # append it to all_samplecr_fail_file
+    write(
+        paste0(sapply(
+            callrate.sample.fail,
+            function(x){ paste0(x, "\n") }
+        ), collapse = ""),
+        file = file(all_samplecr_fail_file),
+        append = i > 1
+    )
+    vmiss <- read.table(
+        paste0(iter_out, '.vmiss'),
+        header = TRUE,
+        row.names = NULL,
+        check.names = FALSE,
+        comment.char = ""
+    )
+    vmiss$Iteration <- i
+    # append it to all_vmiss_file
+    write.table(
+        vmiss,
+        all_vmiss_file,
+        append = i > 1,
+        col.names = !file.exists(all_vmiss_file),
+        row.names = FALSE,
+        sep = "\t",
+        quote = FALSE
+    )
+    vmiss$Callrate <- 1 - vmiss$F_MISS
+    callrate.var.fail <- vmiss[which(vmiss$Callrate < varcr), 'ID', drop = TRUE]
+    writeLines(callrate.var.fail, con = file(paste0(iter_out, '.varcr.fail')))
+    # append it to all_varcr_fail_file
+    write(
+        paste0(sapply(
+            callrate.var.fail,
+            function(x){ paste0(x, "\n") }
+        ), collapse = ""),
+        file = file(all_varcr_fail_file),
+        append = i > 1
+    )
+    if (length(callrate.sample.fail) == 0 && length(callrate.var.fail) == 0) {
+        # make symbolic links to output from input .bed, .bim and .fam files
+        file.symlink(paste0(input, '.bed'), paste0(output, '.bed'))
+        file.symlink(paste0(input, '.bim'), paste0(output, '.bim'))
+        file.symlink(paste0(input, '.fam'), paste0(output, '.fam'))
+        break
+    }
+    # remove samples in iter_out.samplecr.fail and variants in iter_out.varcr.fail
+    cmd <- c(
+        plink,
+        "--threads", ncores,
+        "--bfile", input,
+        "--remove", paste0(iter_out, '.samplecr.fail'),
+        "--exclude", paste0(iter_out, '.varcr.fail'),
+        "--make-bed",
+        "--out", iter_out
+    )
+    run_command(cmd, fg = TRUE)
+    input <- iter_out
+}
+smiss <- read.table(
+    smissfile,
+    header = TRUE,
+    row.names = NULL,
+    check.names = FALSE,
+    comment.char = ""
+)
+callrate.sample <- data.frame(Callrate = 1 - smiss$F_MISS)
+rownames(callrate.sample) <- paste(smiss$FID, smiss$IID, sep = "\t")
+vmiss <- read.table(
+    paste0(iter_out, '.vmiss'),
+    header = TRUE,
+    row.names = NULL,
+    check.names = FALSE,
+    comment.char = ""
+)
+vmiss$Callrate <- 1 - vmiss$F_MISS
+if (doplot) {
+    log_info("Plotting ...")
+    callrate.sample$Status <- "Pass"
+    callrate.sample[callrate.sample.fail, "Status"] <- "Fail"
+    plotGG(
+        data = callrate.sample,
+        geom = "histogram",
+        outfile = paste0(output, '.samplecr.png'),
+        args = list(aes(fill = Status, x = Callrate), alpha = 0.8, bins = 50),
+        ggs = c(
+            'xlab("Sample Call Rate")',
+            'ylab("Count")',
+            'geom_vline(xintercept = samplecr, color = "red", linetype="dashed")',
+            'theme(legend.position = "none")',
+            'geom_text(aes(x = samplecr, y = Inf, label = samplecr), colour="red", angle=90, vjust = 1.2, hjust = 1.2)',
+            'scale_fill_manual(values = c("Pass" = "blue3", "Fail" = "red3"))'
+        )
+    )
+    vmiss$Status <- "Pass"
+    vmiss[which(vmiss$Callrate < varcr), "Status"] <- "Fail"
+    plotGG(
+        data = vmiss,
+        geom = "histogram",
+        outfile = paste0(output, '.varcr.png'),
+        args = list(aes(fill = Status, x = Callrate), alpha = 0.8, bins = 50),
+        ggs = c(
+            'xlab("Variant Call Rate")',
+            'ylab("Count")',
+            'geom_vline(xintercept = varcr, color = "red", linetype="dashed")',
+            'theme(legend.position = "none")',
+            'geom_text(aes(x = varcr, y = Inf, label = varcr), colour="red", angle=90, vjust = 1.2, hjust = 1.2)',
+            'scale_fill_manual(values = c("Pass" = "blue3", "Fail" = "red3"))'
+        ),
+        devpars = devpars
+    )
+}

biopipen/scripts/snp/PlinkFilter.py ADDED Viewed

@@ -0,0 +1,100 @@
+"""Script for snp.PlinkFilter"""
+from pathlib import Path
+from biopipen.utils.misc import run_command, dict_to_cli_args, logger
+indir = {{in.indir | repr}}  # pyright: ignore # noqa: #999
+samples_file = {{in.samples_file | repr}}  # pyright: ignore
+variants_file = {{in.variants_file | repr}}  # pyright: ignore
+outdir = {{out.outdir | repr}}  # pyright: ignore
+plink = {{envs.plink | repr}}  # pyright: ignore
+ncores = {{envs.ncores | repr}}  # pyright: ignore
+samples = {{envs.samples | repr}}  # pyright: ignore
+variants = {{envs.variants | repr}}  # pyright: ignore
+e_samples_file = {{envs.samples_file | repr}}  # pyright: ignore
+e_variants_file = {{envs.variants_file | repr}}  # pyright: ignore
+keep = {{envs.keep | repr}}  # pyright: ignore
+vfile_type = {{envs.vfile_type | repr}}  # pyright: ignore
+chr = {{envs.chr | repr}}  # pyright: ignore
+not_chr = {{envs.not_chr | repr}}  # pyright: ignore
+autosome = {{envs.autosome | repr}}  # pyright: ignore
+autosome_xy = {{envs.autosome_xy | repr}}  # pyright: ignore
+snps_only = {{envs.snps_only | repr}}  # pyright: ignore
+samples_file = samples_file or e_samples_file
+if not samples_file and samples:
+    samples_file = Path(outdir) / "_samples.txt"
+    if isinstance(samples, str):
+        samples = [s.strip() for s in samples.split(",")]
+    with open(samples_file, "w") as fh:
+        fh.writelines(
+            [
+                line.replace("/", "\t") + "\n"
+                if "/" in line
+                else line + "\t" + line + "\n"
+                for line in samples
+            ]
+        )
+variants_file = variants_file or e_variants_file
+if not variants_file and variants:
+    if vfile_type != "id":
+        logger.warning(
+            "envs.vfile_type should be 'id' if only envs.variants is provided."
+        )
+        vfile_type = "id"
+    variants_file = Path(outdir) / "_variants.txt"
+    if isinstance(variants, str):
+        variants = [v.strip() for v in variants.split(",")]
+    with open(variants_file, "w") as fh:
+        fh.writelines([line + "\n" for line in variants])
+bedfile = list(Path(indir).glob("*.bed"))
+if len(bedfile) == 0:
+    raise FileNotFoundError(f"No .bed file found in `in.indir`")
+elif len(bedfile) > 1:
+    logger.warning(f"Multiple .bed files found in `in.indir`, using the first one.")
+bedfile = bedfile[0]
+input = bedfile.with_suffix("")
+output = Path(outdir) / bedfile.stem
+args = {
+    "": [plink],
+    "bfile": input,
+    "out": output,
+    "threads": ncores,
+    "make-bed": True,
+}
+if keep:
+    if samples_file:
+        args["keep"] = samples_file
+    if variants_file:
+        args["extract"] = (
+            variants_file if vfile_type == "id" else [vfile_type, variants_file]
+        )
+else:
+    if samples_file:
+        args["remove"] = samples_file
+    if variants_file:
+        args["exclude"] = (
+            variants_file if vfile_type == "id" else [vfile_type, variants_file]
+        )
+if chr:
+    args["chr"] = chr
+if not_chr:
+    args["not_chr"] = not_chr
+if autosome:
+    args["autosome"] = True
+if autosome_xy:
+    args["autosome"] = True
+if snps_only:
+    args["snps_only"] = snps_only
+run_command(dict_to_cli_args(args, dashify=True, dup_key=False), fg=True)

biopipen 0.28.1__py3-none-any.whl → 0.29.1__py3-none-any.whl

Potentially problematic release.

biopipen 0.28.1__py3-none-any.whl → 0.29.1__py3-none-any.whl