PyPI - biopipen - Versions diffs - 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl - Mend

biopipen 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biopipen might be problematic. Click here for more details.

Files changed (134) hide show

biopipen/__init__.py +1 -1
biopipen/core/config.toml +6 -0
biopipen/core/filters.py +77 -26
biopipen/core/testing.py +6 -1
biopipen/ns/bam.py +39 -0
biopipen/ns/cellranger.py +5 -0
biopipen/ns/cellranger_pipeline.py +2 -2
biopipen/ns/cnvkit_pipeline.py +4 -1
biopipen/ns/delim.py +33 -27
biopipen/ns/protein.py +99 -0
biopipen/ns/scrna.py +411 -250
biopipen/ns/snp.py +16 -3
biopipen/ns/tcr.py +125 -1
biopipen/ns/vcf.py +34 -0
biopipen/ns/web.py +5 -1
biopipen/reports/scrna/SeuratClusterStats.svelte +1 -1
biopipen/reports/scrna/SeuratMap2Ref.svelte +15 -2
biopipen/reports/tcr/ClonalStats.svelte +15 -0
biopipen/reports/utils/misc.liq +22 -7
biopipen/scripts/bam/BamMerge.py +2 -2
biopipen/scripts/bam/BamSampling.py +4 -4
biopipen/scripts/bam/BamSort.py +141 -0
biopipen/scripts/bam/BamSplitChroms.py +10 -10
biopipen/scripts/bam/BamSubsetByBed.py +3 -3
biopipen/scripts/bam/CNVpytor.py +10 -10
biopipen/scripts/bam/ControlFREEC.py +11 -11
biopipen/scripts/bed/Bed2Vcf.py +5 -5
biopipen/scripts/bed/BedConsensus.py +5 -5
biopipen/scripts/bed/BedLiftOver.sh +6 -4
biopipen/scripts/bed/BedtoolsIntersect.py +4 -4
biopipen/scripts/bed/BedtoolsMakeWindows.py +3 -3
biopipen/scripts/bed/BedtoolsMerge.py +4 -4
biopipen/scripts/cellranger/CellRangerCount.py +20 -9
biopipen/scripts/cellranger/CellRangerSummary.R +20 -29
biopipen/scripts/cellranger/CellRangerVdj.py +8 -8
biopipen/scripts/cnvkit/CNVkitAccess.py +6 -6
biopipen/scripts/cnvkit/CNVkitAutobin.py +25 -18
biopipen/scripts/cnvkit/CNVkitBatch.py +5 -5
biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -2
biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
biopipen/scripts/cnvkit/CNVkitGuessBaits.py +9 -5
biopipen/scripts/cnvkit/CNVkitHeatmap.py +4 -4
biopipen/scripts/cnvkit/CNVkitReference.py +2 -2
biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
biopipen/scripts/cnvkit/guess_baits.py +166 -93
biopipen/scripts/delim/SampleInfo.R +85 -139
biopipen/scripts/misc/Config2File.py +2 -2
biopipen/scripts/misc/Str2File.py +2 -2
biopipen/scripts/protein/MMCIF2PDB.py +33 -0
biopipen/scripts/protein/PDB2Fasta.py +60 -0
biopipen/scripts/protein/Prodigy.py +4 -4
biopipen/scripts/protein/RMSD.py +178 -0
biopipen/scripts/regulatory/MotifScan.py +8 -8
biopipen/scripts/scrna/CellCellCommunication.py +59 -22
biopipen/scripts/scrna/CellsDistribution.R +31 -6
biopipen/scripts/scrna/MarkersFinder.R +272 -602
biopipen/scripts/scrna/MetaMarkers.R +16 -7
biopipen/scripts/scrna/RadarPlots.R +75 -35
biopipen/scripts/scrna/SCP-plot.R +15202 -0
biopipen/scripts/scrna/ScVelo.py +0 -0
biopipen/scripts/scrna/SeuratClusterStats-clustree.R +23 -25
biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +26 -47
biopipen/scripts/scrna/SeuratClusterStats-features.R +85 -385
biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +33 -13
biopipen/scripts/scrna/SeuratClusterStats-stats.R +45 -228
biopipen/scripts/scrna/SeuratClusterStats.R +13 -19
biopipen/scripts/scrna/SeuratMap2Ref.R +16 -6
biopipen/scripts/scrna/SeuratPreparing.R +138 -81
biopipen/scripts/scrna/SlingShot.R +71 -0
biopipen/scripts/scrna/TopExpressingGenes.R +9 -7
biopipen/scripts/scrna/celltypist-wrapper.py +7 -6
biopipen/scripts/snp/Plink2GTMat.py +26 -11
biopipen/scripts/snp/PlinkFilter.py +7 -7
biopipen/scripts/snp/PlinkFromVcf.py +8 -5
biopipen/scripts/snp/PlinkSimulation.py +4 -4
biopipen/scripts/snp/PlinkUpdateName.py +4 -4
biopipen/scripts/stats/ChowTest.R +48 -22
biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
biopipen/scripts/tcr/CDR3AAPhyschem.R +12 -2
biopipen/scripts/tcr/ClonalStats.R +484 -0
biopipen/scripts/tcr/CloneResidency.R +23 -5
biopipen/scripts/tcr/Immunarch-basic.R +8 -1
biopipen/scripts/tcr/Immunarch-clonality.R +5 -0
biopipen/scripts/tcr/Immunarch-diversity.R +25 -4
biopipen/scripts/tcr/Immunarch-geneusage.R +15 -1
biopipen/scripts/tcr/Immunarch-kmer.R +14 -1
biopipen/scripts/tcr/Immunarch-overlap.R +15 -1
biopipen/scripts/tcr/Immunarch-spectratyping.R +10 -1
biopipen/scripts/tcr/Immunarch-tracking.R +6 -0
biopipen/scripts/tcr/Immunarch-vjjunc.R +33 -0
biopipen/scripts/tcr/ScRepLoading.R +127 -0
biopipen/scripts/tcr/TCRClusterStats.R +24 -7
biopipen/scripts/tcr/TCRDock.py +10 -6
biopipen/scripts/tcr/TESSA.R +6 -1
biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
biopipen/scripts/vcf/BcftoolsAnnotate.py +8 -8
biopipen/scripts/vcf/BcftoolsFilter.py +3 -3
biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
biopipen/scripts/vcf/BcftoolsSort.py +4 -4
biopipen/scripts/vcf/BcftoolsView.py +5 -5
biopipen/scripts/vcf/Vcf2Bed.py +2 -2
biopipen/scripts/vcf/VcfAnno.py +11 -11
biopipen/scripts/vcf/VcfDownSample.sh +22 -10
biopipen/scripts/vcf/VcfFilter.py +5 -5
biopipen/scripts/vcf/VcfFix.py +7 -7
biopipen/scripts/vcf/VcfFix_utils.py +12 -3
biopipen/scripts/vcf/VcfIndex.py +3 -3
biopipen/scripts/vcf/VcfIntersect.py +3 -3
biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
biopipen/scripts/vcf/bcftools_utils.py +3 -3
biopipen/scripts/web/Download.py +8 -4
biopipen/scripts/web/DownloadList.py +5 -5
biopipen/scripts/web/GCloudStorageDownloadBucket.py +5 -5
biopipen/scripts/web/GCloudStorageDownloadFile.py +3 -3
biopipen/scripts/web/gcloud_common.py +1 -1
biopipen/utils/gsea.R +96 -42
biopipen/utils/misc.R +205 -7
biopipen/utils/misc.py +17 -8
biopipen/utils/plot.R +53 -17
biopipen/utils/reference.py +11 -11
biopipen/utils/repr.R +146 -0
biopipen/utils/vcf.py +1 -1
{biopipen-0.32.1.dist-info → biopipen-0.33.0.dist-info}/METADATA +9 -9
{biopipen-0.32.1.dist-info → biopipen-0.33.0.dist-info}/RECORD +131 -122
{biopipen-0.32.1.dist-info → biopipen-0.33.0.dist-info}/WHEEL +1 -1
biopipen/scripts/scrna/SeuratClusterStats-hists.R +0 -139
biopipen/scripts/scrna/SeuratPreparing-common.R +0 -452
biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +0 -201
{biopipen-0.32.1.dist-info → biopipen-0.33.0.dist-info}/entry_points.txt +0 -0

biopipen/scripts/delim/SampleInfo.R CHANGED Viewed

@@ -1,14 +1,11 @@
-{{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
-{{ biopipen_dir | joinpaths: "utils", "mutate_helpers.R" | source_r }}
 library(rlang)
 library(dplyr)
-library(ggplot2)
-library(ggprism)
-library(ggrepel)
+library(biopipen.utils)
+library(plotthis)
 infile <- {{in.infile | r}}
 outfile <- {{out.outfile | r}}
+joboutdir <- {{job.outdir | r}}
 sep <- {{envs.sep | r}}
 mutaters <- {{envs.mutaters | r}}
 save_mutated <- {{envs.save_mutated | r}}
@@ -16,6 +13,9 @@ defaults <- {{envs.defaults | r}}
 stats <- {{envs.stats | r}}
 exclude_cols <- {{envs.exclude_cols | r}}
+log <- get_logger()
+reporter <- get_reporter()
 if (is.null(exclude_cols)) {
     exclude_cols <- c()
 } else {
@@ -29,6 +29,50 @@ if (colnames(indata)[1] == "row.names") {
     stop("Wrong number of column names. Do you have the right `sep`?")
 }
+#' Get plotthis function from plot_type
+#'
+#' @param plot_type The plot type
+#' @param gglogger_register Register the plotthis function to gglogger
+#' @param return_name Return the name of the function instead of the function
+#' @return The plotthis function
+#' @export
+get_plotthis_fn <- function(plot_type, gglogger_register = TRUE, return_name = FALSE) {
+    fn_name <- switch(plot_type,
+        hist = "Histogram",
+        histo = "Histogram",
+        histogram = "Histogram",
+        featuredim = "FeatureDimPlot",
+        splitbar = "SplitBarPlot",
+        enrichmap = "EnrichMap",
+        enrichnet = "EnrichNetwork",
+        enrichnetwork = "EnrichNetwork",
+        gsea = "GSEAPlot",
+        gseasummary = "GSEASummaryPlot",
+        gseasum = "GSEASummaryPlot",
+        heatmap = "Heatmap",
+        network = "Network",
+        pie = "PieChart",
+        wordcloud = "WordCloudPlot",
+        venn = "VennDiagram",
+        paste0(tools::toTitleCase(plot_type), "Plot")
+    )
+    if (return_name) {
+        return(fn_name)
+    }
+    fn <- tryCatch({
+        utils::getFromNamespace(fn_name, "plotthis")
+    }, error = function(e) {
+        stop("Unknown plot type: ", plot_type)
+    })
+    if (gglogger_register) {
+        gglogger::register(fn, fn_name)
+    } else {
+        fn
+    }
+}
+log$info("Applying mutaters to the data if any ...")
 if (!is.null(mutaters) && length(mutaters) > 0) {
     mutdata <- indata %>%
         mutate(!!!lapply(mutaters, parse_expr))
@@ -44,7 +88,9 @@ write.table(
     col.names = TRUE,
     quote = FALSE
 )
-add_report(
+reporter$add(
     list(
         kind = "descr",
         content = "The samples used in the analysis. Each row is a sample, and columns are the meta information about the sample. This is literally the input sample information file, but the paths to the scRNA-seq and scTCR-seq data are hidden.",
@@ -59,144 +105,44 @@ add_report(
     h1 = "Sample Information"
 )
-theme_set(theme_prism())
-for (name in names(stats)) {
-    stat <- list_update(defaults, stats[[name]])
-    plotfile <- file.path(outdir, paste0(name, ".png"))
-    is_continuous <- FALSE
-    if (!is.null(stat$subset)) {
-        data <- mutdata %>% filter(!!parse_expr(stat$subset))
-    } else {
-        data <- mutdata
-    }
-    if (!is.null(stat$group) && !stat$na_group) {
-        data <- data %>% filter(!is.na(!!sym(stat$group)))
-    }
-    if (!is.null(stat$each) && !stat$na_each) {
-        data <- data %>% filter(!is.na(!!sym(stat$each)))
-    }
+if (length(stats) > 0) {
+    cases <- expand_cases(stats, defaults)
+    for (name in names(cases)) {
+        log$info("- Statistic: {name}")
-    if (is.numeric(data[[stat$on]])) {
-        is_continuous <- TRUE
-    }
+        case <- cases[[name]]
+        info <- case_info(name, outdir, is_dir = FALSE, create = TRUE)
+        case <- extract_vars(case, "plot_type", "more_formats", "save_code", "section", "subset", "devpars", "descr")
-    if (is.null(stat$plot)) {
-        stat$plot <- if (is_continuous) "boxplot" else "pie"
-    }
+        plot_fn <- get_plotthis_fn(plot_type)
+        more_formats <- unique(c("png", more_formats))
-    data$..group <- "All"
-    group <- if (is.null(stat$group)) sym("..group") else sym(stat$group)
-    count_on <- paste0("..count.", stat$on)
-    if (!is_continuous) {
-        if (!is.null(stat$each)) {
-            data <- data %>% add_count(!!group, !!sym(stat$each), name = count_on)
+        if (!is.null(subset)) {
+            case$data <- mutdata %>% dplyr::filter(!!parse_expr(subset))
         } else {
-            data <- data %>% add_count(!!group, name = count_on)
+            case$data <- mutdata
         }
-    }
-    if (is.null(stat$devpars)) {
-        stat$devpars <- list()
-    }
-    if (is.null(stat$devpars$width)) {
-        stat$devpars$width <- 800
-    }
-    if (is.null(stat$devpars$height)) {
-        stat$devpars$height <- 600
-    }
-    if (is.null(stat$devpars$res)) {
-        stat$devpars$res <- 100
-    }
-    png(
-        plotfile,
-        width = stat$devpars$width,
-        height = stat$devpars$height,
-        res = stat$devpars$res
-    )
-    if (stat$plot == "boxplot" || stat$plot == "box") {
-        p <- ggplot(data, aes(x=!!group, y=!!sym(stat$on), fill=!!group)) +
-            geom_boxplot(position = "dodge") +
-            scale_fill_biopipen(alpha = .6) +
-            xlab("")
-    } else if (stat$plot == "violin" ||
-               stat$plot == "violinplot" ||
-               stat$plot == "vlnplot") {
-        p <- ggplot(data, aes(x = !!group, y = !!sym(stat$on), fill=!!group)) +
-            geom_violin(position = "dodge") +
-            scale_fill_biopipen(alpha = .6) +
-            xlab("")
-    } else if (
-        (grepl("violin", stat$plot) || grepl("vln", stat$plot)) &&
-        grepl("box", stat$plot)
-    ) {
-        p <- ggplot(data, aes(x = !!group, y = !!sym(stat$on), fill = !!group)) +
-            geom_violin(position = "dodge") +
-            geom_boxplot(width = 0.1, position = position_dodge(0.9), fill="white") +
-            scale_fill_biopipen(alpha = .6) +
-            xlab("")
-    } else if (stat$plot == "histogram" || stat$plot == "hist") {
-        p <- ggplot(data, aes(x = !!sym(stat$on), fill = !!group)) +
-            geom_histogram(bins = 10, position = "dodge", alpha = 0.8, color = "white") +
-            scale_fill_biopipen(alpha = .6)
-    } else if (stat$plot == "pie" || stat$plot == "piechart") {
-        if (is.null(stat$each)) {
-            data <- data %>% distinct(!!group, .keep_all = TRUE)
-        } else {
-            data <- data %>%
-                distinct(!!group, !!sym(stat$each), .keep_all = TRUE) %>%
-                mutate(!!group := factor(!!group, levels = unique(!!group))) %>%
-                group_by(!!sym(stat$each))
+        p <- do_call(gglogger::register(plot_fn, name = plot_type), case)
+        save_plot(p, info$prefix, devpars, formats = more_formats)
+        if (save_code) {
+            save_plotcode(
+                p,
+                setup = c('library(plotthis)', '', 'load("data.RData")', 'list2env(case, envir = .GlobalEnv)'),
+                prefix = info$caseprefix,
+                "case",
+                auto_data_setup = FALSE
+            )
         }
-        p <- ggplot(
-            data %>% mutate(.size = sum(!!sym(count_on))),
-            aes(x = sqrt(.size) / 2, width = sqrt(.size), y = !!sym(count_on), fill = !!group, label = !!sym(count_on))
-        ) +
-            geom_bar(stat="identity", color="white", position = position_fill(reverse = TRUE)) +
-            coord_polar("y", start = 0) +
-            theme_void() +
-            theme(plot.title = element_text(hjust = 0.5)) +
-            geom_label_repel(
-                position = position_fill(reverse = TRUE,vjust = .5),
-                color="#333333",
-                fill="#EEEEEE",
-                size=4
-            ) +
-            scale_fill_biopipen(alpha = .6, name = group) +
-            ggtitle(paste0("# ", stat$on))
-    } else if (stat$plot == "bar" || stat$plot == "barplot") {
-        if (is.null(stat$each)) {
-            data <- data %>% distinct(!!group, .keep_all = TRUE)
-        } else {
-            data <- data %>% distinct(!!group, !!sym(stat$each), .keep_all = TRUE)
-        }
-        p <- ggplot(
-            data,
-            aes(x = !!group, y = !!sym(count_on), fill = !!group)) +
-            geom_bar(stat = "identity") +
-            scale_fill_biopipen(alpha = .6) +
-            ylab(paste0("# ", stat$on))
-    } else {
-        stop("Unknown plot type: ", stat$plot)
-    }
-    if (!is.null(stat$each)) {
-        p <- p + facet_wrap(vars(!!sym(stat$each)), ncol = stat$ncol)
+        reporter$add(
+            reporter$image(
+                info$prefix,
+                c("png", more_formats),
+                save_code,
+                kind = "table_image"
+            ),
+            h1 = "Statistics", ui = "table_of_images:2"
+        )
     }
-    print(p)
-    dev.off()
-    by_desc <- ifelse(is.null(stat$by), "", paste0(" by ", stat$by))
-    descr <- ifelse(
-        is_continuous,
-        paste0("The distribution of ", stat$on, by_desc),
-        paste0("The number of ", stat$on, by_desc)
-    )
-    add_report(
-        list(kind = "table_image", src = plotfile, name = name, descr = descr),
-        h1 = "Statistics",
-        ui = "table_of_images:2"
-    )
 }
-save_report(outdir)

biopipen/scripts/misc/Config2File.py CHANGED Viewed

@@ -1,8 +1,8 @@
 import json
 import rtoml
-configstr = {{in.config | repr}}  # pyright: ignore
-outfile = {{out.outfile | quote}}  # pyright: ignore
+configstr: str = {{in.config | quote}}  # pyright: ignore  # noqa
+outfile: str = {{out.outfile | quote}}  # pyright: ignore
 infmt = {{envs.infmt | quote}}  # pyright: ignore
 outfmt = {{envs.outfmt | quote}}  # pyright: ignore

biopipen/scripts/misc/Str2File.py CHANGED Viewed

@@ -1,6 +1,6 @@
-instr = {{in.str | repr}}  # pyright: ignore
+instr: str = {{in.str | quote}}  # pyright: ignore  # noqa
 name = {{repr(in.name or envs.name)}}  # pyright: ignore
-outfile = {{out.outfile | quote}}  # pyright: ignore
+outfile: str = {{out.outfile | quote}}  # pyright: ignore
 with open(outfile, "wt") as fout:
     fout.write(instr)

biopipen/scripts/protein/MMCIF2PDB.py ADDED Viewed

@@ -0,0 +1,33 @@
+from pathlib import Path
+from shutil import which
+from diot import Diot  # noqa: F401
+from biopipen.utils.misc import run_command, dict_to_cli_args
+infile: str = {{in.infile | quote}}  # pyright: ignore # noqa
+outfile: str = {{out.outfile | quote}}  # pyright: ignore
+envs: dict = {{envs | repr}}  # pyright: ignore
+tool: str = envs.pop("tool", "maxit")
+maxit: str = envs.pop("maxit", "maxit")
+beem = envs.pop("beem", "BeEM")
+if tool == "maxit":
+    maxit_found = which(maxit)
+    if not maxit_found:
+        raise ValueError(f"maxit executable not found: {maxit}")
+    maxit_exe = Path(maxit_found).expanduser().resolve()
+    rcsbroot = maxit_exe.parent.parent
+    envs["input"] = infile
+    envs["output"] = outfile
+    envs["o"] = 2
+    envs["log"] = Path(outfile).with_suffix(".log")
+    run_command([maxit, *dict_to_cli_args(envs, prefix="-")], fg=True, env={"RCSBROOT": rcsbroot})
+else:
+    outfile: Path = Path(outfile)  # type: ignore
+    envs["_"] = infile
+    envs["p"] = outfile.parent.joinpath(outfile.stem)
+    envs["outfmt"] = 3
+    args = dict_to_cli_args(envs, prefix="-", sep="=")
+    run_command([beem, *args], fg=True)

biopipen/scripts/protein/PDB2Fasta.py ADDED Viewed

@@ -0,0 +1,60 @@
+# """
+# LICENSE
+# GNU General Public License v2.0
+# The code is based on the script from:
+# https://github.com/kad-ecoli/pdb2fasta/blob/master/pdb2fasta.py
+# The original code is licensed under GNU General Public License v2.0.
+# The original code is modified by biopipen developers to fit the biopipen.
+# """
+from __future__ import annotations
+import re
+from collections import defaultdict
+from pathlib import Path
+infile: str = {{in.infile | quote}}  # pyright: ignore # noqa: E999
+outfile: str = {{out.outfile | quote}}  # pyright: ignore
+chains: str | list | None = {{envs.chains | repr}}  # pyright: ignore
+wrap: int = {{envs.wrap | repr}}  # pyright: ignore
+if isinstance(chains, str):
+    chains = [chain.strip() for chain in chains.split(",")]
+aa3to1 = {
+   'ALA':'A', 'VAL':'V', 'PHE':'F', 'PRO':'P', 'MET':'M',
+   'ILE':'I', 'LEU':'L', 'ASP':'D', 'GLU':'E', 'LYS':'K',
+   'ARG':'R', 'SER':'S', 'THR':'T', 'TYR':'Y', 'HIS':'H',
+   'CYS':'C', 'ASN':'N', 'GLN':'Q', 'TRP':'W', 'GLY':'G',
+   'MSE':'M',
+}
+ca_pattern = re.compile(
+    r"^ATOM\s{2,6}\d{1,5}\s{2}CA\s[\sA]([A-Z]{3})\s([\s\w])|^HETATM\s{0,4}\d{1,5}\s{2}CA\s[\sA](MSE)\s([\s\w])"  # noqa: W605
+)
+filename = Path(infile).stem
+chain_dict = defaultdict(str)
+with open(infile, 'r') as fp:
+    for line in fp:
+        if line.startswith("ENDMDL"):
+            break
+        match_list = ca_pattern.findall(line)
+        if match_list:
+            resn = match_list[0][0] + match_list[0][2]
+            chain = match_list[0][1] + match_list[0][3]
+            if chains is None or chain in chains:
+                chain_dict[chain] += aa3to1[resn]
+with open(outfile, 'w') as fp:
+    for chain in chain_dict:
+        fp.write(f">{filename}:{chain}\n")
+        sequence = chain_dict[chain]
+        if wrap > 0:
+            for i in range(0, len(sequence), 80):
+                fp.write(sequence[i:i+80] + "\n")
+        else:
+            fp.write(sequence + "\n")

biopipen/scripts/protein/Prodigy.py CHANGED Viewed

@@ -2,15 +2,15 @@ import json
 import logging
 import sys
 from pathlib import Path
-from prodigy_prot.predict_IC import (
+from prodigy_prot.predict_IC import (  # type: ignore
     Prodigy,
     check_path,
     parse_structure,
 )
-infile = {{in.infile | repr}}  # pyright: ignore # noqa
-outfile = {{out.outfile | repr}}  # pyright: ignore
-outdir = {{out.outdir | repr}}  # pyright: ignore
+infile: str = {{in.infile | quote}}  # pyright: ignore # noqa
+outfile: str = {{out.outfile | quote}}  # pyright: ignore
+outdir: str = {{out.outdir | quote}}  # pyright: ignore
 distance_cutoff = {{envs.distance_cutoff | float}}  # pyright: ignore
 acc_threshold = {{envs.acc_threshold | float}}  # pyright: ignore
 temperature = {{envs.temperature | float}}  # pyright: ignore

biopipen/scripts/protein/RMSD.py ADDED Viewed

@@ -0,0 +1,178 @@
+from pathlib import Path
+from shutil import which
+from diot import Diot  # noqa: F401
+from biopipen.utils.misc import run_command, dict_to_cli_args
+infile1: str = {{in.infile1 | quote}}  # pyright: ignore # noqa
+infile2: str = {{in.infile2 | quote}}  # pyright: ignore # noqa
+outfile: str = {{out.outfile | quote}}  # pyright: ignore # noqa
+outdir: str = {{job.outdir | quote}}  # pyright: ignore # noqa
+envs: dict = {{envs | repr}}  # pyright: ignore # noqa
+conv_tool = envs.pop("conv_tool", "maxit")
+maxit = envs.pop("maxit", "maxit")
+beem = envs.pop("beem", "BeEM")
+ca_only = envs.pop("ca_only", False)
+# aa20_only = envs.pop("aa20_only", False)
+duel = envs.pop("duel", "keep")
+calculate_rmsd = envs.pop("calculate_rmsd", "calculate_rmsd")
+def cif_to_pdb(cif_file, pdb_file:Path):
+    if conv_tool == "maxit":
+        maxit_bin = Path(which(maxit)).resolve()
+        rcsbroot = Path(maxit_bin).parent.parent
+        args = {"input": cif_file, "output": pdb_file, "o": 2, "log": pdb_file.with_suffix(".log")}
+        run_command([maxit, *dict_to_cli_args(args, prefix="-")], fg=True, env={"RCSBROOT": rcsbroot})
+    else:
+        args = {"_": cif_file, "p": pdb_file.parent.joinpath(pdb_file.stem)}
+        args = dict_to_cli_args(args, prefix="-", sep="=")
+        run_command([beem, *args], fg=True)
+def pdb_to_ca_pdb(pdb_file: Path, ca_pdb_file: Path):
+    """Extract C-alpha atoms from a PDB file and still keep the original order and metadata."""
+    with open(pdb_file, "r") as f, open(ca_pdb_file, "w") as fw:
+        for line in f:
+            if line.startswith("ATOM") and line[12:16].strip() == "CA":
+                fw.write(line)
+# def pdb_to_aa20_pdb(pdb_file: Path, aa20_pdb_file: Path):
+#     """Extract the 20 amino acids from a PDB file and still keep the original order and metadata."""
+#     with open(pdb_file, "r") as f, open(aa20_pdb_file, "w") as fw:
+#         for line in f:
+#             if line.startswith("ATOM") and line[17:20].strip() in (
+#                 "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY",
+#                 "HIS", "ILE", "LEU", "LYS", "MET", "PHE", "PRO", "SER",
+#                 "THR", "TRP", "TYR", "VAL",
+#             ):
+#                 fw.write(line)
+def deduel_pdb(pdb_file: Path, deduel_pdb_file: Path):
+    """Remove/Handle the duel atoms in a PDB file."""
+    def is_duel(atom1, atom2):
+        #           1         2
+        # 01234567890123456789012345
+        # ATOM    913  CA ATYR A 113
+        # ATOM    914  CA BTYR A 113
+        # The key should be "ATOM|CA |TYR| A| 113"
+        return (
+            atom1[:4] == atom2[:4] and
+            atom1[12:16] == atom2[12:16] and
+            atom1[17:20] == atom2[17:20] and
+            atom1[21] == atom2[21] and
+            atom1[22:26] == atom2[22:26] and
+            atom1[16] != atom2[16]
+        )
+    def clean_atom(atom):
+        return atom[:16] + " " + atom[17:]
+    last_atom = ""
+    with open(pdb_file, "r") as f, open(deduel_pdb_file, "w") as fw:
+        for line in f:
+            if not line.startswith("ATOM"):
+                fw.write(line)
+                continue
+            if not is_duel(last_atom, line):
+                if last_atom:
+                    fw.write(clean_atom(last_atom))
+                last_atom = line
+            # is duel
+            elif duel == "keep":
+                fw.write(clean_atom(last_atom))
+                fw.write(clean_atom(line))
+                last_atom = ""
+            elif duel == "keep_first":
+                fw.write(clean_atom(last_atom))
+                last_atom = ""
+            elif duel == "keep_last":
+                fw.write(clean_atom(line))
+                last_atom = ""
+            elif duel == "average":
+                # Average the coordinates
+                x1 = float(last_atom[30:38])
+                y1 = float(last_atom[38:46])
+                z1 = float(last_atom[46:54])
+                x2 = float(line[30:38])
+                y2 = float(line[38:46])
+                z2 = float(line[46:54])
+                x = (x1 + x2) / 2.0
+                y = (y1 + y2) / 2.0
+                z = (z1 + z2) / 2.0
+                fw.write(clean_atom(last_atom[:30] + f"{x:8.3f}{y:8.3f}{z:8.3f}" + last_atom[54:]))
+                last_atom = ""
+        if last_atom:
+            fw.write(last_atom)
+def index_of(lst, item) -> int:
+    try:
+        return lst.index(item)
+    except ValueError:
+        return -1
+if infile1.endswith(".cif"):
+    pdb1 = Path(outdir) / f"{Path(infile1).stem}.pdb"
+    cif_to_pdb(infile1, pdb1)
+    infile1 = pdb1  # type: ignore
+if infile2.endswith(".cif"):
+    pdb2 = Path(outdir) / f"{Path(infile2).stem}.pdb"
+    cif_to_pdb(infile2, pdb2)
+    infile2 = pdb2  # type: ignore
+if ca_only:
+    ca_pdb1 = Path(outdir) / f"{Path(infile1).stem}.ca.pdb"
+    pdb_to_ca_pdb(infile1, ca_pdb1) # type: ignore
+    infile1 = ca_pdb1  # type: ignore
+    ca_pdb2 = Path(outdir) / f"{Path(infile2).stem}.ca.pdb"
+    pdb_to_ca_pdb(infile2, ca_pdb2) # type: ignore
+    infile2 = ca_pdb2  # type: ignore
+# if aa20_only:
+#     aa20_pdb1 = Path(outdir) / f"{Path(infile1).stem}.aa20.pdb"
+#     pdb_to_aa20_pdb(infile1, aa20_pdb1) # type: ignore
+#     infile1 = aa20_pdb1  # type: ignore
+#     aa20_pdb2 = Path(outdir) / f"{Path(infile2).stem}.aa20.pdb"
+#     pdb_to_aa20_pdb(infile2, aa20_pdb2) # type: ignore
+#     infile2 = aa20_pdb2  # type: ignore
+if duel != "keep":
+    deduel_pdb1 = Path(outdir) / f"{Path(infile1).stem}.deduel.pdb"
+    deduel_pdb(infile1, deduel_pdb1) # type: ignore
+    infile1 = deduel_pdb1  # type: ignore
+    deduel_pdb2 = Path(outdir) / f"{Path(infile2).stem}.deduel.pdb"
+    deduel_pdb(infile2, deduel_pdb2) # type: ignore
+    infile2 = deduel_pdb2  # type: ignore
+envs["_"] = [infile1, infile2]
+envs = dict_to_cli_args(envs, dashify=True)
+idx_ur = index_of(envs, "--ur")
+if idx_ur != -1:
+    envs[idx_ur] = "-ur"
+idx_urks = index_of(envs, "--urks")
+if idx_urks != -1:
+    envs[idx_urks] = "-urks"
+idx_nh = index_of(envs, "--nh")
+if idx_nh != -1:
+    envs[idx_nh] = "-nh"
+out: str = run_command([calculate_rmsd, *envs], stdout="return")  # type: ignore
+out = out.strip()
+try:
+    float(out)
+except (ValueError, TypeError):
+    raise ValueError(out)
+Path(outfile).write_text(out)

biopipen/scripts/regulatory/MotifScan.py CHANGED Viewed

@@ -5,20 +5,20 @@ import re
 from pathlib import PosixPath  # noqa: F401
 from biopipen.utils.misc import run_command, dict_to_cli_args, logger
-motiffile = {{in.motiffile | repr}}  # pyright: ignore # noqa: #999
-seqfile = {{in.seqfile | repr}}  # pyright: ignore
-outdir = {{out.outdir | repr}}  # pyright: ignore
+motiffile: str = {{in.motiffile | quote}}  # pyright: ignore # noqa: #999
+seqfile: str = {{in.seqfile | quote}}  # pyright: ignore
+outdir: str = {{out.outdir | quote}}  # pyright: ignore
 tool = {{envs.tool | repr}}  # pyright: ignore
 fimo = {{envs.fimo | repr}}  # pyright: ignore
-motif_col = {{envs.motif_col | repr}}  # pyright: ignore
-regulator_col = {{envs.regulator_col | repr}}  # pyright: ignore
+motif_col: str | int = {{envs.motif_col | repr}}  # pyright: ignore
+regulator_col: str | int = {{envs.regulator_col | repr}}  # pyright: ignore
 notfound = {{envs.notfound | repr}}  # pyright: ignore
-motifdb = {{envs.motifdb | repr}}  # pyright: ignore
+motifdb: str | None = {{envs.motifdb | repr}}  # pyright: ignore
 cutoff = {{envs.cutoff | repr}}  # pyright: ignore
 q = {{envs.q | repr}}  # pyright: ignore
 q_cutoff = {{envs.q_cutoff | repr}}  # pyright: ignore
-args = {{envs.args | dict | repr}}  # pyright: ignore
+args: dict = {{envs.args | dict | repr}}  # pyright: ignore
 # Check if the tool is supported
 if tool != "fimo":
@@ -41,7 +41,7 @@ if isinstance(motif_col, str) or isinstance(regulator_col, str):
     with open(motiffile, "r") as f:
         header = f.readline().strip().split("\t")
         if isinstance(motif_col, str):
-            motif_col = header.index(motif_col) + 1
+            motif_col: int = header.index(motif_col) + 1
         if isinstance(regulator_col, str):
             regulator_col = header.index(regulator_col) + 1
 if isinstance(motif_col, int):

biopipen 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl

Potentially problematic release.

biopipen 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl