PyPI - biopipen - Versions diffs - 0.28.1__py3-none-any.whl → 0.29.1__py3-none-any.whl - Mend

biopipen 0.28.1__py3-none-any.whl → 0.29.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biopipen might be problematic. Click here for more details.

Files changed (85) hide show

biopipen/__init__.py +1 -1
biopipen/core/config.toml +8 -0
biopipen/ns/bam.py +0 -2
biopipen/ns/bed.py +35 -0
biopipen/ns/cellranger_pipeline.py +5 -5
biopipen/ns/cnv.py +18 -2
biopipen/ns/cnvkit_pipeline.py +16 -11
biopipen/ns/gene.py +68 -23
biopipen/ns/misc.py +2 -15
biopipen/ns/plot.py +204 -0
biopipen/ns/regulatory.py +214 -0
biopipen/ns/scrna.py +31 -5
biopipen/ns/snp.py +516 -8
biopipen/ns/stats.py +167 -3
biopipen/ns/vcf.py +196 -0
biopipen/reports/snp/PlinkCallRate.svelte +24 -0
biopipen/reports/snp/PlinkFreq.svelte +18 -0
biopipen/reports/snp/PlinkHWE.svelte +18 -0
biopipen/reports/snp/PlinkHet.svelte +18 -0
biopipen/reports/snp/PlinkIBD.svelte +18 -0
biopipen/scripts/bam/CNVpytor.py +144 -46
biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
biopipen/scripts/bed/BedtoolsMerge.py +1 -1
biopipen/scripts/cnv/AneuploidyScore.R +30 -7
biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
biopipen/scripts/cnv/TMADScore.R +21 -5
biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
biopipen/scripts/delim/SampleInfo.R +10 -5
biopipen/scripts/gene/GeneNameConversion.R +65 -0
biopipen/scripts/gene/GenePromoters.R +61 -0
biopipen/scripts/misc/Shell.sh +15 -0
biopipen/scripts/plot/Manhattan.R +146 -0
biopipen/scripts/plot/QQPlot.R +146 -0
biopipen/scripts/regulatory/MotifAffinityTest.R +226 -0
biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +126 -0
biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +96 -0
biopipen/scripts/regulatory/MotifScan.py +159 -0
biopipen/scripts/regulatory/atSNP.R +33 -0
biopipen/scripts/regulatory/motifBreakR.R +1594 -0
biopipen/scripts/scrna/MarkersFinder.R +69 -67
biopipen/scripts/scrna/SeuratClustering.R +71 -29
biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
biopipen/scripts/scrna/SeuratPreparing.R +252 -122
biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
biopipen/scripts/snp/MatrixEQTL.R +85 -44
biopipen/scripts/snp/Plink2GTMat.py +133 -0
biopipen/scripts/snp/PlinkCallRate.R +190 -0
biopipen/scripts/snp/PlinkFilter.py +100 -0
biopipen/scripts/snp/PlinkFreq.R +298 -0
biopipen/scripts/snp/PlinkFromVcf.py +78 -0
biopipen/scripts/snp/PlinkHWE.R +80 -0
biopipen/scripts/snp/PlinkHet.R +92 -0
biopipen/scripts/snp/PlinkIBD.R +200 -0
biopipen/scripts/snp/PlinkUpdateName.py +124 -0
biopipen/scripts/stats/Mediation.R +94 -0
biopipen/scripts/stats/MetaPvalue.R +2 -1
biopipen/scripts/stats/MetaPvalue1.R +70 -0
biopipen/scripts/tcr/TCRClusterStats.R +12 -7
biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
biopipen/scripts/vcf/BcftoolsSort.py +113 -0
biopipen/scripts/vcf/BcftoolsView.py +73 -0
biopipen/scripts/vcf/VcfFix_utils.py +1 -1
biopipen/scripts/vcf/bcftools_utils.py +52 -0
biopipen/utils/gene.R +83 -37
biopipen/utils/gene.py +108 -60
biopipen/utils/misc.R +56 -0
biopipen/utils/misc.py +5 -2
biopipen/utils/reference.py +54 -10
{biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/METADATA +2 -2
{biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/RECORD +80 -51
{biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/entry_points.txt +1 -1
biopipen/ns/bcftools.py +0 -111
biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
biopipen/scripts/gene/GeneNameConversion.py +0 -66
{biopipen-0.28.1.dist-info → biopipen-0.29.1.dist-info}/WHEEL +0 -0

biopipen/scripts/bam/CNVpytor.py CHANGED Viewed

@@ -1,15 +1,15 @@
 from pathlib import Path
+import warnings
 import pandas
-from biopipen.scripts.vcf.VcfFix_utils import HeaderContig, fix_vcffile
+from datetime import datetime
 from biopipen.utils.reference import bam_index
-from biopipen.utils.misc import run_command, dict_to_cli_args
+from biopipen.utils.misc import run_command, dict_to_cli_args, logger
-bamfile = {{in.bamfile | quote}}  # pyright: ignore
+bamfile = {{in.bamfile | quote}}  # pyright: ignore # noqa
 snpfile = {{in.snpfile | repr}}  # pyright: ignore
 outdir = Path({{out.outdir | quote}})  # pyright: ignore
 cnvpytor = {{envs.cnvpytor | quote}}  # pyright: ignore
-cnvnator2vcf = {{envs.cnvnator2vcf | quote}}  # pyright: ignore
 samtools = {{envs.samtools | quote}}  # pyright: ignore
 ncores = {{envs.ncores | int}}  # pyright: ignore
 refdir = {{envs.refdir | quote}}  # pyright: ignore
@@ -20,7 +20,6 @@ args = {{envs | repr}}  # pyright: ignore
 del args['cnvpytor']
 del args['ncores']
-del args['cnvnator2vcf']
 del args['samtools']
 del args['refdir']
 del args['genome']
@@ -236,47 +235,138 @@ def load_chrsize():
                 yield chrom, int(size)
-def cnvpytor2vcf(infile, snp, fix=True):
-    unfixedfile = Path(infile).with_suffix(f".unfixed.vcf")
-    outfile = Path(infile).with_suffix(f".vcf")
-    stdout = run_command(
-        dict_to_cli_args(
-            {
-                "": cnvpytor2vcf,
-                "reference": genome,
-                "_": [infile, refdir],
-            },
-            prefix="-",
-        ),
-        stdout="return",
-    )
-    if fix:
-        unfixedfile.write_text(stdout)
+def parse_chrom(chrom, chromdir):
+    file = Path(chromdir) / f"{chrom}.fa"
+    if not file.exists():
+        warnings.warn(f"Chromosome file not found in refdir: {chrom}")
+        return ""
-        fixes = [
-            {
-                "kind": "format",
-                "id": "PE",
-                "fix": lambda obj: setattr(obj, 'Type', 'String')
-            },
-            {
-                "kind": "fields",
-                "fix": lambda items: items.__setitem__(-1, Path(bamfile).stem)
-            }
-        ]
+    seq = ""
+    with open(file) as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            if line.startswith(">"):
+                seq = ""
+            else:
+                seq += line
+    return seq
+def cnvpytor2vcf(infile, snp):
+    # snp: in case to be used in the future
+    outfile = Path(infile).with_suffix(f".vcf")
+    # stdout = run_command(
+    #     dict_to_cli_args(
+    #         {
+    #             "": cnvnator2vcf,
+    #             "reference": genome,
+    #             "_": [infile, refdir],
+    #         },
+    #         prefix="-",
+    #     ),
+    #     stdout="return",
+    # )
+    ## command hangs
+    with open(infile) as fin, open(outfile, "w") as fout:
+        fout.write("##fileformat=VCFv4.2\n")
+        fout.write(f"##fileDate={datetime.now().strftime('%Y%m%d')}\n")
+        fout.write(f"##reference={genome}\n")
+        fout.write(f"##source=CNVpytor\n")
         for chrom, size in load_chrsize():
-            fixes.append({
-                "kind": "contig",
-                "append": True,
-                "fix": (
-                    lambda obj, chrom=chrom, size=size:
-                    HeaderContig(ID=chrom, length=size)
-                )
-            })
+            fout.write(f"##contig=<ID={chrom},length={size}>\n")
+        fout.write('##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">\n')
+        fout.write('##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">\n')
+        fout.write('##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Difference in length between REF and ALT alleles">\n')
+        fout.write('##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">\n')
+        fout.write('##INFO=<ID=natorRD,Number=1,Type=Float,Description="Normalized RD">\n')
+        fout.write('##INFO=<ID=natorP1,Number=1,Type=Float,Description="e-val by t-test">\n')
+        fout.write('##INFO=<ID=natorP2,Number=1,Type=Float,Description="e-val by Gaussian tail">\n')
+        fout.write('##INFO=<ID=natorP3,Number=1,Type=Float,Description="e-val by t-test (middle)">\n')
+        fout.write('##INFO=<ID=natorP4,Number=1,Type=Float,Description="e-val by Gaussian tail (middle)">\n')
+        fout.write('##INFO=<ID=natorQ0,Number=1,Type=Float,Description="Fraction of reads with 0 mapping quality">\n')
+        fout.write('##INFO=<ID=natorPE,Number=1,Type=Integer,Description="Number of paired-ends support the event">\n')
+        fout.write('##INFO=<ID=SAMPLES,Number=.,Type=String,Description="Sample genotyped to have the variant">\n')
+        fout.write('##ALT=<ID=DEL,Description="Deletion">\n')
+        fout.write('##ALT=<ID=DUP,Description="Duplication">\n')
+        fout.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
+        fout.write('##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number genotype for imprecise events">\n')
+        fout.write('##FORMAT=<ID=PE,Number=1,Type=String,Description="Number of paired-ends that support the event">\n')
+        fout.write(f"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{Path(bamfile).stem}\n")
+        prev_chrom, chrom_seq, count = "", "", 0
+        for line in fin:
+            # type, coor, length, rd, p1, p2, p3, p4, q0, pe = line.strip("\n").split()
+            items = line.strip("\n").split()
+            type, coor, length = items[:3]
+            rd = float(items[3]) if len(items) > 3 else False
+            p1 = items[4] if len(items) > 4 else ""
+            p2 = items[5] if len(items) > 5 else ""
+            p3 = items[6] if len(items) > 6 else ""
+            p4 = items[7] if len(items) > 7 else ""
+            q0 = items[8] if len(items) > 8 else ""
+            pe = items[9] if len(items) > 9 else ""
+            chrom, pos = coor.split(":")
+            start, end = pos.split("-")
+            start, end = int(start), int(end)
+            is_del = type == "deletion"
+            is_dup = type == "duplication"
+            if not is_del and not is_dup:
+                warnings.warn(f"Skipping unrecognized CNV type: {type}")
+                continue
-        fix_vcffile(unfixedfile, outfile, fixes)
-    else:
-        outfile.write_text(stdout)
+            if chrom != prev_chrom:
+                chrom_seq = parse_chrom(chrom, refdir)
+                prev_chrom = chrom
+            count += 1
+            info = f"END={end}"
+            info += f";SVTYPE=DEL;SVLEN=-{length}" if is_del else f";SVTYPE=DUP;SVLEN={length}"
+            info += ";IMPRECISE"
+            info += f";natorRD={rd}" if rd is not False else ""
+            info += f";natorP1={p1}" if p1 else ""
+            info += f";natorP2={p2}" if p2 else ""
+            info += f";natorP3={p3}" if p3 else ""
+            info += f";natorP4={p4}" if p4 else ""
+            info += f";natorQ0={q0}" if q0 else ""
+            info += f";natorPE={pe}" if pe else ""
+            gt = "GT"
+            if rd is not False:
+                gt += ":CN"
+                gt += ":PE" if pe else ""
+                gt += "\t"
+                if is_del and rd < 0.25:
+                    gt += "1/1:0"
+                elif is_del and rd >= 0.25:
+                    gt += "0/1:1"
+                elif rd <= 1.75:
+                    gt += "0/1:2"
+                elif rd > 1.75 and rd <= 2.25:
+                    gt += "1/1:2"
+                elif rd > 2.25:
+                    gt += f"./2:{rd:.0f}"
+                else:
+                    gt = "GT:PE\t./." if pe else "GT\t./."
+                gt += f":{pe}" if pe else ""
+            else:
+                gt += "\t./."
+            fout.write("\t".join(
+                [
+                    chrom,
+                    str(start),
+                    f"CNVpytor_{'del_' if is_del else 'dup_'}{count}",
+                    chrom_seq[start - 1] if start < len(chrom_seq) else "N",
+                    "<DEL>" if is_del else "<DUP>",
+                    ".",
+                    "PASS",
+                    info,
+                    gt,
+                ]
+            ) + "\n")
 def do_case():
@@ -290,7 +380,7 @@ def do_case():
     rootfile = outdir / "file.pytor"
     case["j"] = case.get("j", ncores)
-    # read depth signal
+    logger.info("Reading depth signals ...")
     run_command(
         dict_to_cli_args(
             {
@@ -305,7 +395,7 @@ def do_case():
         fg=True,
     )
-    # predicting cnv
+    logger.info("Predicting CNVs ...")
     run_command(
         dict_to_cli_args(
             {
@@ -314,6 +404,7 @@ def do_case():
                 "his": binsizes,
             },
             prefix="-",
+            dup_key=False,
         ),
         fg=True,
     )
@@ -326,6 +417,7 @@ def do_case():
                 "partition": binsizes,
             },
             prefix="-",
+            dup_key=False,
         ),
         fg=True,
     )
@@ -336,6 +428,7 @@ def do_case():
         mask_snps = snp.pop("mask_snps", True)
         baf_nomask = snp.pop("baf_nomask", False)
+        logger.info("Importing SNP data ...")
         run_command(
             dict_to_cli_args(
                 {
@@ -350,6 +443,7 @@ def do_case():
         )
         if mask_snps:
+            logger.info("Masking 1000 Genome SNPs ...")
             run_command(
                 dict_to_cli_args(
                     {
@@ -362,6 +456,7 @@ def do_case():
                 fg=True,
             )
+        logger.info("Calculating BAF histograms ...")
         run_command(
             dict_to_cli_args(
                 {
@@ -375,8 +470,9 @@ def do_case():
             fg=True,
         )
-    # call
+    logger.info("Predicting CNV regions using joint caller ...")
     for binsize in binsizes:
+        logger.info(f"- binsize: {binsize}")
         outfile = outdir / f"calls{'.combined' if snp is not False else ''}.{binsize}.tsv"
         outfile_filtered = outdir / f"calls{'.combined' if snp is not False else ''}.{binsize}.filtered.tsv"
         run_command(
@@ -392,6 +488,7 @@ def do_case():
             stdout=outfile,
         )
+        logger.info("  Converting to other formats ...")
         cnvpytor2other(outfile, bool(snp), "gff")
         cnvpytor2other(outfile, bool(snp), "bed")
         cnvpytor2vcf(outfile, bool(snp))
@@ -424,6 +521,7 @@ def do_case():
         cnvpytor2vcf(outfile_filtered, bool(snp))
         # plots
+        logger.info("  Plotting ...")
         manplot = outdir / f"manhattan.{binsize}.png"
         run_command(
             dict_to_cli_args(

biopipen/scripts/bed/BedtoolsIntersect.py ADDED Viewed

@@ -0,0 +1,54 @@
+from pathlib import Path
+from biopipen.utils.misc import run_command, dict_to_cli_args, logger
+afile = Path({{in.afile | repr}})  # pyright: ignore # noqa: #999
+bfile = Path({{in.bfile | repr}})  # pyright: ignore
+outfile = {{out.outfile | repr}}  # pyright: ignore
+envs = {{envs | repr}}  # pyright: ignore
+bedtools = envs.pop("bedtools")
+sort = envs.pop("sort")
+chrsize = envs.pop("chrsize")
+postcmd = envs.pop("postcmd", None)
+outdir = Path(outfile).parent
+if chrsize and "g" in envs:
+    logger.warning("Ignoring envs.g because envs.chrsize is provided.")
+    envs["g"] = Path(chrsize).expanduser()
+elif chrsize:
+    envs["g"] = Path(chrsize).expanduser()
+if sort:
+    afile_sorted = outdir / f"{afile.stem}_sorted{afile.suffix}"
+    bfile_sorted = outdir / f"{bfile.stem}_sorted{bfile.suffix}"
+    run_command(
+        [bedtools, "sort", "-g", envs["g"], "-i", afile],
+        stdout=afile_sorted,
+    )
+    run_command(
+        [bedtools, "sort", "-g", envs["g"], "-i", bfile],
+        stdout=bfile_sorted,
+    )
+    afile = afile_sorted
+    bfile = bfile_sorted
+envs[""] = [bedtools, "intersect"]
+envs["a"] = afile
+envs["b"] = bfile
+envs.setdefault("sorted", True)
+if envs["sorted"] and not "g" in envs:
+    raise ValueError("envs.g is required or manullay set envs.sorted to False.")
+if postcmd:
+    ofile = Path(outfile).with_suffix(".prior.bt")
+    run_command(dict_to_cli_args(envs, prefix="-"), stdout=ofile)
+    postcmd_file = outdir / "_postcmd.sh"
+    postcmd_file.write_text(postcmd)
+    run_command(
+        ["bash", postcmd_file],
+        env={"infile": ofile, "outfile": outfile, "outdir": outdir},
+        fg=True,
+    )
+else:
+    run_command(dict_to_cli_args(envs, prefix="-"), stdout=outfile)

biopipen/scripts/bed/BedtoolsMerge.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from biopipen.utils import run_command, dict_to_cli_args
-inbed = {{in.inbed | repr}}  # pyright: ignore
+inbed = {{in.inbed | repr}}  # pyright: ignore # noqa: #999
 outbed = {{out.outbed | repr}}  # pyright: ignore
 envs = {{envs | repr}}  # pyright: ignore
 bedtools = envs.pop("bedtools", "bedtools")

biopipen/scripts/cnv/AneuploidyScore.R CHANGED Viewed

@@ -127,13 +127,32 @@ getCAA <- function(segf, cytoarm, tcn_col,
   return(as(seg_cyto_chr, "GRangesList"))
 }
-segments = read.table(segfile, header=T, row.names=NULL, sep="\t", stringsAsFactors=F)
-seg = data.frame(
-    seqnames = segments[, chrom_col],
-    start = segments[, start_col],
-    end = segments[, end_col],
-    seg.mean = segments[, seg_col]
-)
+if (endsWith(segfile, ".vcf") || endsWith(segfile, ".vcf.gz")) {
+  library(VariantAnnotation)
+  vcf = readVcf(segfile)
+  seg = data.frame(
+      seqnames = as.character(seqnames(vcf)),
+      start = start(vcf),
+      end = vcf@info[[end_col]],
+      seg.mean = vcf@info[[seg_col]]
+  )
+} else if (endsWith(segfile, ".bed")) {
+  segments = read.table(segfile, header=F, row.names=NULL, sep="\t", stringsAsFactors=F)
+  seg = data.frame(
+      seqnames = segments[, 1],
+      start = segments[, 2],
+      end = segments[, 3],
+      seg.mean = segments[, 5]
+  )
+} else {
+  segments = read.table(segfile, header=T, row.names=NULL, sep="\t", stringsAsFactors=F)
+  seg = data.frame(
+      seqnames = segments[, chrom_col],
+      start = segments[, start_col],
+      end = segments[, end_col],
+      seg.mean = segments[, seg_col]
+  )
+}
 {% if envs.segmean_transform %}
 segmean_transform = {{envs.segmean_transform}}
@@ -168,6 +187,10 @@ if (is.character(cn_transform)) {
 }
 {% endif %}
+seg <- seg[
+  !is.na(seg$seg.mean) & !is.na(seg$TCN) & !is.infinite(seg$seg.mean) & !is.infinite(seg$TCN),,
+  drop=FALSE]
 write.table(seg, file.path(outdir, "seg.txt"), sep="\t", quote=F, row.names=F, col.names=T)
 wgd_ploidy = checkIfWGD(

biopipen/scripts/cnv/AneuploidyScoreSummary.R CHANGED Viewed

@@ -52,8 +52,11 @@ if (!is.null(group_cols)) {
 if (!is.null(metafile)) {
     metadf = read.table(metafile, header=T, row.names=NULL, sep="\t", stringsAsFactors=F)
-    sample_col = colnames(metadf)[1]
-    colnames(metadf)[1] = "Sample"
+    if (!is.null(metadf$Sample)) {
+        metadf$Sample = as.character(metadf$Sample)
+    } else {
+        colnames(metadf)[1] = "Sample"
+    }
     metadf = metadf[metadf$Sample %in% sams, c("Sample", meta_cols), drop=FALSE]
     if (nrow(metadf) != length(sams)) {
         stop(paste("Not all samples in metafile:", paste(setdiff(sams, metadf$Sample), collapse=", ")))

biopipen/scripts/cnv/TMADScore.R CHANGED Viewed

@@ -11,11 +11,27 @@ if (is.character(segmean_transform)) {
     segmean_transform = eval(parse(text=segmean_transform))
 } # otherwise NULL
-segments = read.table(segfile, header=T, row.names=NULL, sep="\t", stringsAsFactors=F)
-seg = data.frame(
-    chrom = segments[, chrom_col],
-    log2 = segments[, seg_col]
-)
+if (endsWith(segfile, ".vcf") || endsWith(segfile, ".vcf.gz")) {
+  library(VariantAnnotation)
+  segments = readVcf(segfile)
+  seg = data.frame(
+      chrom = as.character(seqnames(segments)),
+      log2 = segments@info[[seg_col]]
+  )
+} else if (endsWith(segfile, ".bed")) {
+  segments = read.table(segfile, header=F, row.names=NULL, sep="\t", stringsAsFactors=F)
+  seg = data.frame(
+      chrom = segments[, 1],
+      log2 = segments[, 5]
+  )
+} else {
+  segments = read.table(segfile, header=T, row.names=NULL, sep="\t", stringsAsFactors=F)
+  seg = data.frame(
+      chrom = segments[, chrom_col],
+      log2 = segments[, seg_col]
+  )
+}
 rm(segments)
 if (!is.null(excl_chroms) && length(excl_chroms) > 0) {

biopipen/scripts/cnv/TMADScoreSummary.R CHANGED Viewed

@@ -49,8 +49,12 @@ if (!is.null(group_cols)) {
 data = data.frame(Sample = sams, tMAD = tmads)
 if (file.exists(metafile) && length(meta_cols) > 0) {
     metadf = read.table(metafile, header=T, row.names=NULL, sep="\t", stringsAsFactors=F)
-    sample_col = colnames(metadf)[1]
-    meta = metadf[, c(sample_col, meta_cols), drop=FALSE]
+    if (!is.null(metadf$Sample)) {
+        metadf$Sample = as.character(metadf$Sample)
+    } else {
+        colnames(metadf)[1] = "Sample"
+    }
+    meta = metadf[, c("Sample", meta_cols), drop=FALSE]
     colnames(meta) = c("Sample", meta_cols)
     data = data %>% left_join(meta, by="Sample")
 }

biopipen/scripts/cnvkit/CNVkitAccess.py CHANGED Viewed

@@ -1,3 +1,4 @@
+from pathlib import Path
 from biopipen.utils.misc import run_command, dict_to_cli_args
 excfiles = {{in.excfiles | repr}}  # pyright: ignore
@@ -12,7 +13,7 @@ def main():
         "": [cnvkit, "access"],
         "s": min_gap_size,
         "o": outfile,
-        "_": reffile,
+        "_": Path(reffile).expanduser(),
     }
     if excfiles:
         other_args["exclude"] = excfiles

biopipen/scripts/cnvkit/CNVkitAutobin.py CHANGED Viewed

@@ -1,3 +1,4 @@
+from pathlib import Path
 from biopipen.utils.misc import run_command, dict_to_cli_args
 bamfiles = {{in.bamfiles | repr}}  # pyright: ignore
@@ -20,7 +21,7 @@ short_names = {{envs.short_names | repr}}  # pyright: ignore
 def main():
     args = dict(
-        f=reffile,
+        f=Path(reffile).expanduser(),
         m=method,
         g=accfile,
         t=baitfile,
@@ -29,7 +30,7 @@ def main():
         target_min_size=target_min_size,
         antitarget_max_size=antitarget_max_size,
         antitarget_min_size=antitarget_min_size,
-        annotate=annotate,
+        annotate=Path(annotate).expanduser(),
         short_names=short_names,
         target_output_bed=target_file,
         antitarget_output_bed=antitarget_file,

biopipen/scripts/cnvkit/CNVkitBatch.py CHANGED Viewed

@@ -42,7 +42,7 @@ def gen_access():
         exclude=access_excludes or False,
         s=access_min_gap_size or False,
         o=accessfile,
-        _=ref,
+        _=Path(ref).expanduser(),
     )
     args[""] = [cnvkit, "access"]
     run_command(dict_to_cli_args(args, dashify=True), fg=True)

biopipen/scripts/cnvkit/CNVkitCoverage.py CHANGED Viewed

@@ -1,3 +1,4 @@
+from pathlib import Path
 from biopipen.utils.misc import run_command, dict_to_cli_args
 bamfile = {{in.bamfile | quote}}  # pyright: ignore
@@ -13,7 +14,7 @@ ncores = {{envs.ncores | repr}}  # pyright: ignore
 def main():
     args = dict(
-        f=reffile,
+        f=Path(reffile).expanduser(),
         c=count,
         q=min_mapq,
         p=ncores,

biopipen/scripts/cnvkit/CNVkitGuessBaits.py CHANGED Viewed

@@ -60,7 +60,7 @@ params.update({
     "o": targetfile,
     "c": covfile,
     "p": ncores,
-    "f": ref,
+    "f": Path(ref).expanduser(),
     "s": samtools,
     "_": bamfiles,
 })

biopipen/scripts/cnvkit/CNVkitHeatmap.py CHANGED Viewed

@@ -4,7 +4,7 @@ from diot import Diot
 from biopipen.utils.misc import run_command, dict_to_cli_args
-segfiles = {{in.segfiles | repr}}  # pyright: ignore
+segfiles = {{in.segfiles | repr}}  # pyright: ignore # noqa
 sample_sex = {{in.sample_sex | repr}}  # pyright: ignore
 outdir = {{out.outdir | repr}}  # pyright: ignore
 cnvkit = {{envs.cnvkit | quote}}  # pyright: ignore

biopipen/scripts/cnvkit/CNVkitReference.py CHANGED Viewed

@@ -1,3 +1,4 @@
+from pathlib import Path
 from biopipen.utils.misc import run_command, dict_to_cli_args
 covfiles = {{in.covfiles | repr}}  # pyright: ignore
@@ -18,7 +19,7 @@ no_rmask = {{envs.no_rmask | repr}}  # pyright: ignore
 def main():
     args = dict(
-        f=reffile,
+        f=Path(reffile).expanduser(),
         o=outfile,
         c=cluster,
         min_cluster_size=min_cluster_size,

biopipen/scripts/delim/SampleInfo.R CHANGED Viewed

@@ -88,7 +88,11 @@ for (name in names(stats)) {
     group <- if (is.null(stat$group)) sym("..group") else sym(stat$group)
     count_on <- paste0("..count.", stat$on)
     if (!is_continuous) {
-        data <- data %>% add_count(!!group, name = count_on)
+        if (!is.null(stat$each)) {
+            data <- data %>% add_count(!!group, !!sym(stat$each), name = count_on)
+        } else {
+            data <- data %>% add_count(!!group, name = count_on)
+        }
     }
     if (is.null(stat$devpars)) {
@@ -141,18 +145,19 @@ for (name in names(stats)) {
         } else {
             data <- data %>%
                 distinct(!!group, !!sym(stat$each), .keep_all = TRUE) %>%
+                mutate(!!group := factor(!!group, levels = unique(!!group))) %>%
                 group_by(!!sym(stat$each))
         }
         p <- ggplot(
-            data %>% arrange(!!group),
-            aes(x = "", y = !!sym(count_on), fill = !!group, label = !!sym(count_on))
+            data %>% mutate(.size = sum(!!sym(count_on))),
+            aes(x = sqrt(.size) / 2, width = sqrt(.size), y = !!sym(count_on), fill = !!group, label = !!sym(count_on))
         ) +
-            geom_bar(stat="identity", width=1, color="white", position = position_stack(reverse = TRUE)) +
+            geom_bar(stat="identity", color="white", position = position_fill(reverse = TRUE)) +
             coord_polar("y", start = 0) +
             theme_void() +
             theme(plot.title = element_text(hjust = 0.5)) +
             geom_label_repel(
-                position = position_stack(vjust = 0.5),
+                position = position_fill(reverse = TRUE,vjust = .5),
                 color="#333333",
                 fill="#EEEEEE",
                 size=4

biopipen/scripts/gene/GeneNameConversion.R ADDED Viewed

@@ -0,0 +1,65 @@
+source("{{biopipen_dir}}/utils/misc.R")
+source("{{biopipen_dir}}/utils/gene.R")
+infile <- {{in.infile | quote}}
+outfile <- {{out.outfile | quote}}
+notfound <- {{envs.notfound | r}}
+genecol <- {{envs.genecol | r}}
+output <- {{envs.output | r}}
+dup <- {{envs.dup | r}}
+infmt <- {{envs.infmt | r}}
+outfmt <- {{envs.outfmt | r}}
+species <- {{envs.species | r}}
+if (is.na(notfound)) {
+    notfound = "na"
+}
+df <- read.table(infile, header=TRUE, sep="\t", check.names=FALSE)
+if (genecol == 0) {
+    log_warn("envs.genecol should be 1-based, but 0 was given. Using 1 instead.")
+    genecol <- 1
+}
+if (is.numeric(genecol)) { genecol <- colnames(df)[genecol] }
+if (dup == "combine") { dup <- ";" }
+genes <- df[[genecol]]
+converted <- gene_name_conversion(
+    genes=genes,
+    species=species,
+    infmt=infmt,
+    outfmt=outfmt,
+    notfound=notfound,
+    dup=dup
+)
+#    <genecol> <outfmt>
+# 1  1255_g_at   GUCA1A
+# 2    1316_at     THRA
+# 3    1320_at   PTPN21
+# 4    1294_at  MIR5193
+# order the converted dataframe by the original gene column
+converted <- converted[order(match(converted$query, genes)), , drop=FALSE]
+outcol <- outfmt
+if (notfound == "skip" || notfound == "ignore") {
+    df <- df[df[[genecol]] %in% converted$query, , drop=FALSE]
+}
+if (output == "append") {
+    if (outfmt %in% colnames(df)) {
+        log_warn("The output column name already exists in the input dataframe. Appending with a suffix `_1`.")
+        outcol <- paste(outfmt, "_1", sep="")
+    }
+    df[[outcol]] <- converted[[outfmt]]
+} else if (output == "replace") {
+    df[[genecol]] <- converted[[outfmt]]
+} else if (output == "with-query") {
+    df <- converted
+} else {
+    df <- converted[, outfmt, drop=FALSE]
+}
+write.table(df, file=outfile, sep="\t", quote=FALSE, row.names=FALSE)

biopipen 0.28.1__py3-none-any.whl → 0.29.1__py3-none-any.whl

Potentially problematic release.

biopipen 0.28.1__py3-none-any.whl → 0.29.1__py3-none-any.whl