PyPI - biopipen - Versions diffs - 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl - Mend

biopipen 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biopipen might be problematic. Click here for more details.

Files changed (134) hide show

biopipen/__init__.py +1 -1
biopipen/core/config.toml +6 -0
biopipen/core/filters.py +77 -26
biopipen/core/testing.py +6 -1
biopipen/ns/bam.py +39 -0
biopipen/ns/cellranger.py +5 -0
biopipen/ns/cellranger_pipeline.py +2 -2
biopipen/ns/cnvkit_pipeline.py +4 -1
biopipen/ns/delim.py +33 -27
biopipen/ns/protein.py +99 -0
biopipen/ns/scrna.py +411 -250
biopipen/ns/snp.py +16 -3
biopipen/ns/tcr.py +125 -1
biopipen/ns/vcf.py +34 -0
biopipen/ns/web.py +5 -1
biopipen/reports/scrna/SeuratClusterStats.svelte +1 -1
biopipen/reports/scrna/SeuratMap2Ref.svelte +15 -2
biopipen/reports/tcr/ClonalStats.svelte +15 -0
biopipen/reports/utils/misc.liq +22 -7
biopipen/scripts/bam/BamMerge.py +2 -2
biopipen/scripts/bam/BamSampling.py +4 -4
biopipen/scripts/bam/BamSort.py +141 -0
biopipen/scripts/bam/BamSplitChroms.py +10 -10
biopipen/scripts/bam/BamSubsetByBed.py +3 -3
biopipen/scripts/bam/CNVpytor.py +10 -10
biopipen/scripts/bam/ControlFREEC.py +11 -11
biopipen/scripts/bed/Bed2Vcf.py +5 -5
biopipen/scripts/bed/BedConsensus.py +5 -5
biopipen/scripts/bed/BedLiftOver.sh +6 -4
biopipen/scripts/bed/BedtoolsIntersect.py +4 -4
biopipen/scripts/bed/BedtoolsMakeWindows.py +3 -3
biopipen/scripts/bed/BedtoolsMerge.py +4 -4
biopipen/scripts/cellranger/CellRangerCount.py +20 -9
biopipen/scripts/cellranger/CellRangerSummary.R +20 -29
biopipen/scripts/cellranger/CellRangerVdj.py +8 -8
biopipen/scripts/cnvkit/CNVkitAccess.py +6 -6
biopipen/scripts/cnvkit/CNVkitAutobin.py +25 -18
biopipen/scripts/cnvkit/CNVkitBatch.py +5 -5
biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -2
biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
biopipen/scripts/cnvkit/CNVkitGuessBaits.py +9 -5
biopipen/scripts/cnvkit/CNVkitHeatmap.py +4 -4
biopipen/scripts/cnvkit/CNVkitReference.py +2 -2
biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
biopipen/scripts/cnvkit/guess_baits.py +166 -93
biopipen/scripts/delim/SampleInfo.R +85 -139
biopipen/scripts/misc/Config2File.py +2 -2
biopipen/scripts/misc/Str2File.py +2 -2
biopipen/scripts/protein/MMCIF2PDB.py +33 -0
biopipen/scripts/protein/PDB2Fasta.py +60 -0
biopipen/scripts/protein/Prodigy.py +4 -4
biopipen/scripts/protein/RMSD.py +178 -0
biopipen/scripts/regulatory/MotifScan.py +8 -8
biopipen/scripts/scrna/CellCellCommunication.py +59 -22
biopipen/scripts/scrna/CellsDistribution.R +31 -6
biopipen/scripts/scrna/MarkersFinder.R +272 -602
biopipen/scripts/scrna/MetaMarkers.R +16 -7
biopipen/scripts/scrna/RadarPlots.R +75 -35
biopipen/scripts/scrna/SCP-plot.R +15202 -0
biopipen/scripts/scrna/ScVelo.py +0 -0
biopipen/scripts/scrna/SeuratClusterStats-clustree.R +23 -25
biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +26 -47
biopipen/scripts/scrna/SeuratClusterStats-features.R +85 -385
biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +33 -13
biopipen/scripts/scrna/SeuratClusterStats-stats.R +45 -228
biopipen/scripts/scrna/SeuratClusterStats.R +13 -19
biopipen/scripts/scrna/SeuratMap2Ref.R +16 -6
biopipen/scripts/scrna/SeuratPreparing.R +138 -81
biopipen/scripts/scrna/SlingShot.R +71 -0
biopipen/scripts/scrna/TopExpressingGenes.R +9 -7
biopipen/scripts/scrna/celltypist-wrapper.py +7 -6
biopipen/scripts/snp/Plink2GTMat.py +26 -11
biopipen/scripts/snp/PlinkFilter.py +7 -7
biopipen/scripts/snp/PlinkFromVcf.py +8 -5
biopipen/scripts/snp/PlinkSimulation.py +4 -4
biopipen/scripts/snp/PlinkUpdateName.py +4 -4
biopipen/scripts/stats/ChowTest.R +48 -22
biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
biopipen/scripts/tcr/CDR3AAPhyschem.R +12 -2
biopipen/scripts/tcr/ClonalStats.R +484 -0
biopipen/scripts/tcr/CloneResidency.R +23 -5
biopipen/scripts/tcr/Immunarch-basic.R +8 -1
biopipen/scripts/tcr/Immunarch-clonality.R +5 -0
biopipen/scripts/tcr/Immunarch-diversity.R +25 -4
biopipen/scripts/tcr/Immunarch-geneusage.R +15 -1
biopipen/scripts/tcr/Immunarch-kmer.R +14 -1
biopipen/scripts/tcr/Immunarch-overlap.R +15 -1
biopipen/scripts/tcr/Immunarch-spectratyping.R +10 -1
biopipen/scripts/tcr/Immunarch-tracking.R +6 -0
biopipen/scripts/tcr/Immunarch-vjjunc.R +33 -0
biopipen/scripts/tcr/ScRepLoading.R +127 -0
biopipen/scripts/tcr/TCRClusterStats.R +24 -7
biopipen/scripts/tcr/TCRDock.py +10 -6
biopipen/scripts/tcr/TESSA.R +6 -1
biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
biopipen/scripts/vcf/BcftoolsAnnotate.py +8 -8
biopipen/scripts/vcf/BcftoolsFilter.py +3 -3
biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
biopipen/scripts/vcf/BcftoolsSort.py +4 -4
biopipen/scripts/vcf/BcftoolsView.py +5 -5
biopipen/scripts/vcf/Vcf2Bed.py +2 -2
biopipen/scripts/vcf/VcfAnno.py +11 -11
biopipen/scripts/vcf/VcfDownSample.sh +22 -10
biopipen/scripts/vcf/VcfFilter.py +5 -5
biopipen/scripts/vcf/VcfFix.py +7 -7
biopipen/scripts/vcf/VcfFix_utils.py +12 -3
biopipen/scripts/vcf/VcfIndex.py +3 -3
biopipen/scripts/vcf/VcfIntersect.py +3 -3
biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
biopipen/scripts/vcf/bcftools_utils.py +3 -3
biopipen/scripts/web/Download.py +8 -4
biopipen/scripts/web/DownloadList.py +5 -5
biopipen/scripts/web/GCloudStorageDownloadBucket.py +5 -5
biopipen/scripts/web/GCloudStorageDownloadFile.py +3 -3
biopipen/scripts/web/gcloud_common.py +1 -1
biopipen/utils/gsea.R +96 -42
biopipen/utils/misc.R +205 -7
biopipen/utils/misc.py +17 -8
biopipen/utils/plot.R +53 -17
biopipen/utils/reference.py +11 -11
biopipen/utils/repr.R +146 -0
biopipen/utils/vcf.py +1 -1
{biopipen-0.32.1.dist-info → biopipen-0.33.0.dist-info}/METADATA +9 -9
{biopipen-0.32.1.dist-info → biopipen-0.33.0.dist-info}/RECORD +131 -122
{biopipen-0.32.1.dist-info → biopipen-0.33.0.dist-info}/WHEEL +1 -1
biopipen/scripts/scrna/SeuratClusterStats-hists.R +0 -139
biopipen/scripts/scrna/SeuratPreparing-common.R +0 -452
biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +0 -201
{biopipen-0.32.1.dist-info → biopipen-0.33.0.dist-info}/entry_points.txt +0 -0

biopipen/scripts/scrna/SeuratPreparing.R CHANGED Viewed

@@ -1,12 +1,9 @@
-{{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
-{{ biopipen_dir | joinpaths: "utils", "caching.R" | source_r }}
 library(Seurat)
 library(future)
 library(bracer)
-library(ggplot2)
 library(dplyr)
-# library(tidyseurat)
+library(glue)
+library(biopipen.utils)
 metafile <- {{in.metafile | quote}}
 rdsfile <- {{out.rdsfile | quote}}
@@ -14,10 +11,9 @@ joboutdir <- {{job.outdir | quote}}
 envs <- {{envs | r: todot = "-", skip = 1}}
 if (isTRUE(envs$cache)) { envs$cache <- joboutdir }
-if (length(envs$cache) > 1) {
-    log_warn("Multiple cache directories (envs.cache) detected, using the first one.")
-    envs$cache <- envs$cache[1]
-}
+log <- get_logger()
+reporter <- get_reporter()
 set.seed(8525)
 # 8TB
@@ -26,15 +22,15 @@ options(future.rng.onMisuse="ignore")
 options(Seurat.object.assay.version = "v5")
 plan(strategy = "multicore", workers = envs$ncores)
-{{ biopipen_dir | joinpaths: "scripts", "scrna", "SeuratPreparing-common.R" | source_r }}
-add_report(
+reporter$add(
     list(
         kind = "descr",
         name = "Filters applied",
         content = paste0(
             "<p>Cell filters: ", html_escape(envs$cell_qc), "</p>",
-            "<p>Gene filters: ", html_escape(stringify_list(envs$gene_qc)), "</p>"
+            "<p>Gene filters: </p>",
+            "<p>- Min Cells: ", envs$gene_qc$min_cells, "</p>",
+            "<p>- Excludes: ", html_escape(envs$gene_qc$excludes %||% "Not set"), "</p>"
         )
     ),
     h1 = "Filters and QC"
@@ -48,16 +44,6 @@ metadata <- read.table(
     check.names = FALSE
 )
-cache_sig <- capture.output(str(metadata))
-dig_sig <- digest::digest(cache_sig, algo = "md5")
-dig_sig <- substr(dig_sig, 1, 8)
-cache_dir <- NULL
-if (is.character(envs$cache)) {
-    cache_dir <- file.path(envs$cache, paste0(dig_sig, ".seuratpreparing_cache"))
-    dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE)
-    writeLines(cache_sig, file.path(cache_dir, "signature.txt"))
-}
 meta_cols = colnames(metadata)
 if (!"Sample" %in% meta_cols) {
     stop("Error: Column `Sample` is not found in metafile.")
@@ -66,77 +52,148 @@ if (!"RNAData" %in% meta_cols) {
     stop("Error: Column `RNAData` is not found in metafile.")
 }
-samples = as.character(metadata$Sample)
-# used for plotting
-cell_qc_df = NULL
-plotsdir = file.path(joboutdir, "plots")
-dir.create(plotsdir, showWarnings = FALSE, recursive = TRUE)
-# features for cell QC
-feats = c(
-    "nFeature_RNA", "nCount_RNA",
-    "percent.mt", "percent.ribo", "percent.hb", "percent.plat"
-)
-sobj <- run_cell_qc(sobj)
-# plot and report the QC
-log_info("Plotting and reporting QC ...")
-dim_df = report_cell_qc(nrow(sobj))
-if (is.list(envs$gene_qc)) {
-    sobj <- run_gene_qc(sobj)
-}
-dim_df = rbind(
-    dim_df,
-    data.frame(
-        when = "After_Gene_QC",
-        nCells = ncol(sobj),
-        nGenes = nrow(sobj)
-    )
+qcdir = file.path(joboutdir, "qc")
+dir.create(qcdir, showWarnings = FALSE, recursive = TRUE)
+sobj <- LoadSeuratAndPerformQC(
+    metadata,
+    per_sample_qc = envs$cell_qc_per_sample,
+    cell_qc = envs$cell_qc,
+    gene_qc = envs$gene_qc,
+    tmpdir = joboutdir,
+    log = log,
+    cache = envs$cache)
+log$info("Saving dimension table ...")
+dim_df <- data.frame(
+    when = c("Before QC", "After QC"),
+    nCells = c(nrow(sobj@misc$cell_qc_df), sum(sobj@misc$cell_qc_df$.QC)),
+    nGenes = c(sobj@misc$gene_qc$before, sobj@misc$gene_qc$after)
 )
-log_info("Saving dimension table ...")
-write.table(dim_df, file = file.path(plotsdir, "dim.txt"),
+write.table(dim_df, file = file.path(qcdir, "dim.txt"),
             row.names = FALSE, quote = FALSE, sep = "\t")
-add_report(
+reporter$add(
     list(
         kind = "descr",
-        content = paste(
-            "The dimension table for the Seurat object. The table contains the number of cells and genes before and after QC."
-        )
+        content = "The dimension table for the Seurat object. The table contains the number of cells and genes before and after QC. Note that the cell QC is performed before gene QC."
     ),
     list(
         kind = "table",
-        data = list(path = file.path(plotsdir, "dim.txt"))
+        data = list(path = file.path(qcdir, "dim.txt"))
     ),
-    h1 = "Filters and QC"
+    h1 = "Filters and QC",
+    h2 = "Dimension table"
 )
-sobj <- run_transformation(sobj)
-sobj <- run_integration(sobj)
+log$info("Visualizing QC metrics ...")
+for (pname in names(envs$qc_plots)) {
+    args <- envs$qc_plots[[pname]]
+    args$kind <- args$kind %||% "cell"
+    args$devpars <- args$devpars %||% list()
+    args$more_formats <- args$more_formats %||% character()
+    args$save_code <- args$save_code %||% FALSE
+    extract_vars(args, "kind", "devpars", "more_formats", "save_code")
+    if (kind == "gene") kind <- "gene_qc"
+    if (kind == "cell") kind <- "cell_qc"
+    args$object <- sobj
+    plot_fn <- if (kind == "cell_qc") {
+        gglogger::register(VizSeuratCellQC)
+    } else {
+        gglogger::register(VizSeuratGeneQC)
+    }
+    p <- do_call(plot_fn, args)
+    prefix <- file.path(qcdir, paste0(slugify(pname), "_", kind))
+    save_plot(p, prefix, devpars, formats = c("png", more_formats))
+    if (save_code) {
+        save_plotcode(p, prefix,
+            setup = c("library(biopipen.utils)", "load('data.RData')", "invisible(list2env('args'))"),
+            "args",
+            auto_data_setup = FALSE)
+    }
+    reporter$add(
+        reporter$image(prefix, more_formats, save_code, kind = "image"),
+        h1 = "Filters and QC",
+        h2 = html_escape(pname)
+    )
+}
+sobj <- RunSeuratTransformation(
+    sobj,
+    use_sct = envs$use_sct,
+    SCTransformArgs = envs$SCTransform,
+    NormalizeDataArgs = envs$NormalizeData,
+    FindVariableFeaturesArgs = envs$FindVariableFeatures,
+    ScaleDataArgs = envs$ScaleData,
+    RunPCAArgs = envs$RunPCA,
+    log = log,
+    cache = envs$cache
+)
+sobj <- RunSeuratIntegration(
+    sobj,
+    no_integration = envs$no_integration,
+    IntegrateLayersArgs = envs$IntegrateLayers,
+    log = log,
+    cache = envs$cache
+)
 # This is the last step, doesn't need to be cached
-if (!is.null(envs$doublet_detector) && envs$doublet_detector != "none") {
-    {{* biopipen_dir | joinpaths: "scripts", "scrna", "SeuratPreparing-doublet_detection.R" | source_r }}
-    detector <- tolower(envs$doublet_detector)
-    if (detector == "doubletfinder") detector <- "DoubletFinder"
-    if (detector == "scdblfinder") detector <- "scDblFinder"
-    dd <- run_dd(detector)
-    save_dd(dd, detector)
-    sobj <- add_dd_to_seurat(sobj, dd)
-    plot_dd(sobj, dd, detector)
-    sobj <- filter_dd(sobj, dd, detector)
-    report_dd(detector)
-}
+if (!identical(envs$doublet_detector, "none")) {
+    dbldir <- file.path(joboutdir, "doublets")
+    dir.create(dbldir, showWarnings = FALSE, recursive = TRUE)
+    sobj <- RunSeuratDoubletDetection(
+        sobj,
+        tool = envs$doublet_detector,
+        DoubletFinderArgs = envs$DoubletFinder,
+        scDblFinderArgs = envs$scDblFinder,
+        filter = FALSE,
+        log = log,
+        cache = envs$cache
+    )
+    log$info("Visualizing doublet detection results ...")
+    if (identical(tolower(envs$doublet_detector), "doubletfinder")) {
+        p <- VizSeuratDoublets(sobj, plot_type = "pK", x_text_angle = 90)
+        save_plot(
+            p, file.path(dbldir, "doubletfinder_pk"),
+            devpars = list(res = 100, width = 800, height = 600),
+            formats = "png")
+        reporter$add(
+            list(
+                kind = "descr",
+                content = paste(
+                    "The pK plot from DoubletFinder to select the optimal pK value.",
+                    "See more at https://github.com/chris-mcginnis-ucsf/DoubletFinder"
+                )
+            ),
+            list(
+                kind = "image",
+                src = file.path(dbldir, "doubletfinder_pk.png")
+            ),
+            h1 = glue("Doublet detection using {envs$doublet_detector}"),
+            h2 = "BC metric vs pK"
+        )
+    }
+    for (pt in c("dim", "pie")) {
+        p <- VizSeuratDoublets(sobj, plot_type = pt)
+        save_plot(p, file.path(dbldir, paste0("doublets_", pt)), formats = "png")
+        reporter$add(
+            list(
+                src = file.path(dbldir, paste0("doublets_", pt, ".png")),
+                descr = ifelse(pt == "dim", "Dimention Reduction Plot", "Pie Chart")
+            ),
+            h1 = glue("Doublet detection using {envs$doublet_detector}"),
+            h2 = "Doublets distribution",
+            ui = "table_of_images"
+        )
+    }
-log_info("Saving QC'ed seurat object ...")
-saveRDS(sobj, rdsfile)
+    sobj <- subset(sobj, subset = !!sym(paste0(sobj@misc$doublets$tool, "_DropletType")) != "doublet")
+}
-save_report(joboutdir)
+log$info("Saving QC'ed seurat object ...")
+reporter$save(joboutdir)
+saveRDS(sobj, rdsfile)

biopipen/scripts/scrna/SlingShot.R ADDED Viewed

@@ -0,0 +1,71 @@
+{{ biopipen_dir | joinpaths: "utils", "misc.R" | source_r }}
+library(rlang)
+library(Seurat)
+library(slingshot)
+sobjfile <- {{in.sobjfile | r}}
+outfile <- {{out.outfile | r}}
+group_by <- {{envs.group_by | r}}
+reduction <- {{envs.reduction | r}}
+dims <- {{envs.dims | r}}
+start <- {{envs.start | r}}
+end <- {{envs.end | r}}
+prefix <- {{envs.prefix | r}}
+reverse <- {{envs.reverse | r}}
+align_start <- {{envs.align_start | r}}
+seed <- {{envs.seed | r}}
+set.seed(seed)
+if (is.null(group_by)) {
+    stop("envs.group_by is required")
+}
+log_info("Reading Seurat object ...")
+srt <- readRDS(sobjfile)
+if (!group_by %in% colnames(srt@meta.data)) {
+    stop(paste("Grouping column", group_by, "not found in the Seurat object"))
+}
+reduction <- reduction %||% DefaultDimReduc(srt)
+dims <- expand_dims(dims)
+if (is.null(prefix)) {
+    prefix <- ""
+} else {
+    prefix <- paste0(prefix, "_")
+}
+log_info("Filtering cells in NA group_by ...")
+srt_sub <- srt[, !is.na(srt[[group_by, drop = TRUE]])]
+log_info("Running Slingshot ...")
+sl <- slingshot(
+    data = as.data.frame(srt_sub[[reduction]]@cell.embeddings[, dims]),
+    clusterLabels = as.character(srt_sub[[group_by, drop = TRUE]]),
+    start.clus = start, end.clus = end
+)
+command <- pbmc_small@commands[[1]]
+attr(command, "name") <- "SlingShot"
+attr(command, "call.string") <- "slingshot(...)"
+attr(command, "params") <- list()
+srt@commands <- srt@commands %||% list()
+srt@commands$Slingshot <- command
+df <- as.data.frame(slingPseudotime(sl))
+colnames(df) <- paste0(prefix, colnames(df))
+if (isTRUE(reverse)) {
+    if (isTRUE(align_start)) {
+        df <- apply(df, 2, function(x) max(x, na.rm = TRUE) - x)
+    } else {
+        df <- max(df, na.rm = TRUE) - df
+    }
+}
+srt <- AddMetaData(srt, metadata = df)
+srt <- AddMetaData(srt, metadata = slingBranchID(sl), col.name = paste0(prefix, "BranchID"))
+log_info("Saving Seurat object ...")
+saveRDS(srt, outfile)

biopipen/scripts/scrna/TopExpressingGenes.R CHANGED Viewed

@@ -161,14 +161,16 @@ do_enrich <- function(expr, odir) {
             next
         }
-        png(
-            file.path(odir, paste0("Enrichr-", db, ".png")),
-            res = 100, height = 1000, width = 1000
-        )
-        print(
-            plotEnrich(enriched[[db]], showTerms = 20, title = db) +
+        enrich_p <- plotEnrich(enriched[[db]], showTerms = 20, title = db) +
             theme_prism()
-        )
+        enrich_plot <- file.path(odir, paste0("Enrichr-", db, ".png"))
+        png(enrich_plot, res = 100, height = 1000, width = 1000)
+        print(enrich_p)
+        dev.off()
+        enrich_plot_pdf <- file.path(odir, paste0("Enrichr-", db, ".pdf"))
+        pdf(enrich_plot_pdf, height = 10, width = 10)
+        print(enrich_p)
         dev.off()
     }
 }

biopipen/scripts/scrna/celltypist-wrapper.py CHANGED Viewed

@@ -7,14 +7,13 @@ parser.add_argument(
 parser.add_argument("-o", "--output", required=True, help="Output file")
 parser.add_argument("-m", "--model", required=True, help="Model file")
 parser.add_argument(
-    "-v", "--majority_voting",
-    action="store_true",
-    help="Majority voting"
+    "-v", "--majority_voting", action="store_true", help="Majority voting"
 )
 parser.add_argument(
-    "-c", "--over_clustering",
+    "-c",
+    "--over_clustering",
     default="seurat_clusters",
-    help="Over clustering. Ignored if the column does not exist."
+    help="Over clustering. Ignored if the column does not exist.",
 )
@@ -44,7 +43,9 @@ if __name__ == "__main__":
     if args.output.endswith(".h5ad"):
         try:
-            out_adata._raw._var.rename(columns={"_index": "features"}, inplace=True)
+            out_adata._raw._var.rename(  # type: ignore
+                columns={"_index": "features"}, inplace=True
+            )
             del out_adata.raw
         except (KeyError, AttributeError):
             pass

biopipen/scripts/snp/Plink2GTMat.py CHANGED Viewed

@@ -3,15 +3,16 @@ from os import path
 from glob import glob
 from biopipen.utils.misc import run_command, logger
-indir = {{in.indir | repr}}  # noqa: E999 # pyright: ignore
-outfile = {{out.outfile | repr}}  # pyright: ignore
-plink = {{envs.plink | repr}}  # pyright: ignore
-ncores = {{envs.ncores | repr}}  # pyright: ignore
-transpose = {{envs.transpose | repr}}  # pyright: ignore
-samid = {{envs.samid | repr}}  # pyright: ignore
-varid = {{envs.varid | repr}}  # pyright: ignore
-trans_chr = {{envs.trans_chr | repr}}  # pyright: ignore
-missing_id = {{envs.missing_id | repr}}  # pyright: ignore
+indir: str = {{in.indir | quote}}  # noqa: E999 # pyright: ignore
+outfile: str = {{out.outfile | quote}}  # pyright: ignore
+plink: str = {{envs.plink | quote}}  # pyright: ignore
+ncores: int = {{envs.ncores | repr}}  # pyright: ignore
+transpose: bool = {{envs.transpose | repr}}  # pyright: ignore
+samid: str = {{envs.samid | repr}}  # pyright: ignore
+varid: str = {{envs.varid | repr}}  # pyright: ignore
+trans_chr: dict = {{envs.trans_chr | repr}}  # pyright: ignore
+missing_id: str = {{envs.missing_id | repr}}  # pyright: ignore
+gtcoding: str = {{envs.gtcoding | repr}}  # pyright: ignore
 trans_chr = trans_chr or {}
 bedfile = glob(path.join(indir, '*.bed'))
@@ -37,6 +38,14 @@ cmd = [
 run_command(cmd, fg=True, env={"cwd": path.dirname(outfile)})
+def _vcf_gtcoding(gt):
+    try:
+        return str(2 - int(gt))
+    except (ValueError, TypeError):
+        return "NA"
 if not transpose:  # rows are variants, columns are samples
     # .traw file is created, tab-separated, with the following columns:
     trawfile = output + ".traw"
@@ -82,7 +91,10 @@ if not transpose:  # rows are variants, columns are samples
                     .replace('{ref}', ref)
                     .replace('{alt}', alt)
                 )
-                record = [variant] + line[6:]
+                if gtcoding == "plink":
+                    record = [variant] + line[6:]
+                else:  # vcf
+                    record = [variant] + [_vcf_gtcoding(x) for x in line[6:]]
                 fout.write('\t'.join(record) + '\n')
 else:
@@ -129,5 +141,8 @@ else:
                 fid = line[0]
                 iid = line[1]
                 sam = samid.replace('{fid}', fid).replace('{iid}', iid)
-                record = [sam] + line[6:]
+                if gtcoding == "plink":
+                    record = [sam] + line[6:]
+                else:  # vcf
+                    record = [sam] + [_vcf_gtcoding(x) for x in line[6:]]
                 fout.write('\t'.join(record) + '\n')

biopipen/scripts/snp/PlinkFilter.py CHANGED Viewed

@@ -1,17 +1,17 @@
-"""Script for snp.PlinkFilter"""
+from __future__ import annotations
 from pathlib import Path
 from biopipen.utils.misc import run_command, dict_to_cli_args, logger
-indir = {{in.indir | repr}}  # pyright: ignore # noqa: #999
-samples_file = {{in.samples_file | repr}}  # pyright: ignore
-variants_file = {{in.variants_file | repr}}  # pyright: ignore
-outdir = {{out.outdir | repr}}  # pyright: ignore
+indir: str = {{in.indir | quote}}  # pyright: ignore # noqa: #999
+samples_file = {{in.samples_file | quote}}  # pyright: ignore
+variants_file = {{in.variants_file | quote}}  # pyright: ignore
+outdir: str = {{out.outdir | quote}}  # pyright: ignore
 plink = {{envs.plink | repr}}  # pyright: ignore
 ncores = {{envs.ncores | repr}}  # pyright: ignore
-samples = {{envs.samples | repr}}  # pyright: ignore
-variants = {{envs.variants | repr}}  # pyright: ignore
+samples: list[str] | str = {{envs.samples | repr}}  # pyright: ignore
+variants: list[str] | str = {{envs.variants | repr}}  # pyright: ignore
 e_samples_file = {{envs.samples_file | repr}}  # pyright: ignore
 e_variants_file = {{envs.variants_file | repr}}  # pyright: ignore
 keep = {{envs.keep | repr}}  # pyright: ignore

biopipen/scripts/snp/PlinkFromVcf.py CHANGED Viewed

@@ -1,12 +1,14 @@
-from os import path
+from __future__ import annotations
+from os import path, PathLike
 from biopipen.core.filters import dict_to_cli_args
 from biopipen.utils.reference import tabix_index
 from biopipen.utils.misc import run_command
-invcf = {{in.invcf | repr}}  # noqa: E999 # pyright: ignore
-outprefix = {{in.invcf | stem0 | repr}} # pyright: ignore
-outdir = {{out.outdir | repr}}  # pyright: ignore
-args = {{envs | dict | repr}}  # pyright: ignore
+invcf: str | PathLike = {{in.invcf | quote}}  # noqa: E999 # pyright: ignore
+outprefix: str = {{in.invcf | stem0 | quote}} # pyright: ignore
+outdir: str = {{out.outdir | quote}}  # pyright: ignore
+args: dict = {{envs | dict}}  # pyright: ignore
 plink = args.pop("plink")
 tabix = args.pop("tabix")
@@ -23,6 +25,7 @@ args.setdefault("max_alleles", 2)
 # This makes it possible to keep the allele order in the output
 # no need for plink2
 # args["keep_allele_order"] = True
+args.setdefault("keep_allele_order", True)
 # resolve plink 1.x --set-missing-var-ids doesn't distinguish $1, $2,...
 # for ref and alts

biopipen/scripts/snp/PlinkSimulation.py CHANGED Viewed

@@ -4,9 +4,9 @@ from slugify import slugify
 from simpleconf import Config
 from biopipen.utils.misc import logger, run_command, dict_to_cli_args
-configfile = {{in.configfile | repr}}  # pyright: ignore # noqa: E999
-outdir = {{out.outdir | repr}}  # pyright: ignore
-gtmatfile = {{out.gtmat | repr}}  # pyright: ignore
+configfile: str = {{in.configfile | quote}}  # pyright: ignore # noqa: E999
+outdir: str = {{out.outdir | quote}}  # pyright: ignore
+gtmatfile: str = {{out.gtmat | quote}}  # pyright: ignore
 config = Config.load(configfile)
 default_nsnps = {{envs.nsnps | repr}}  # pyright: ignore
@@ -21,7 +21,7 @@ default_maxfreq = {{envs.maxfreq | repr}}  # pyright: ignore
 default_hetodds = {{envs.hetodds | repr}}  # pyright: ignore
 default_homodds = {{envs.homodds | repr}}  # pyright: ignore
 default_missing = {{envs.missing | repr}}  # pyright: ignore
-default_args = {{envs.args | repr}}  # pyright: ignore
+default_args: dict = {{envs.args | repr}}  # pyright: ignore
 default_transpose_gtmat = {{envs.transpose_gtmat | repr}}  # pyright: ignore
 default_sample_prefix = {{envs.sample_prefix | repr}}  # pyright: ignore

biopipen/scripts/snp/PlinkUpdateName.py CHANGED Viewed

@@ -1,9 +1,9 @@
 from pathlib import Path
 from biopipen.utils.misc import run_command, dict_to_cli_args, logger
-indir = {{in.indir | repr}}  # pyright: ignore # noqa: #999
-namefile = {{in.namefile | repr}}  # pyright: ignore
-outdir = {{out.outdir | repr}}  # pyright: ignore
+indir: str = {{in.indir | quote}}  # pyright: ignore # noqa: #999
+namefile: str = {{in.namefile | quote}}  # pyright: ignore
+outdir: str = {{out.outdir | quote}}  # pyright: ignore
 plink = {{envs.plink | repr}}  # pyright: ignore
 bcftools = {{envs.bcftools | repr}}  # pyright: ignore
 ncores = {{envs.ncores | repr}}  # pyright: ignore
@@ -111,7 +111,7 @@ if namefile.endswith(".vcf") or namefile.endswith(".vcf.gz"):
             else:
                 info = readline(finfo)
-    namefile = namefile_tmp
+    namefile = str(namefile_tmp)
 args = {
     "": plink,

biopipen/scripts/stats/ChowTest.R CHANGED Viewed

@@ -12,15 +12,17 @@ transpose_input <- {{envs.transpose_input | r}}
 transpose_group <- {{envs.transpose_group | r}}
 log_info("Reading input files ...")
-indata <- read.table(infile, header = TRUE, sep = "\t", row.names = 1)
+indata <- read.table(infile, header = TRUE, sep = "\t", row.names = 1, check.names = FALSE)
 if (transpose_input) {
 	indata <- t(indata)
 }
-groupdata <- read.table(groupfile, header = TRUE, sep = "\t", row.names = 1)
+groupdata <- read.table(groupfile, header = TRUE, sep = "\t", row.names = 1, check.names = FALSE)
 if (transpose_group) {
 	groupdata <- t(groupdata)
 }
-fmldata <- read.table(fmlfile, header = TRUE, sep = "\t", row.names = NULL)
+allgroups = na.omit(unique(unlist(groupdata)))
+fmldata <- read.table(fmlfile, header = TRUE, sep = "\t", row.names = NULL, check.names = FALSE)
 colnames(fmldata)[1:2] <- c("Group", "Formula")
 chow.test <- function(fml, grouping) {
@@ -63,26 +65,43 @@ chow.test <- function(fml, grouping) {
 	)
 }
-formatlm <- function(m) {
-	if (class(m) == 'lm') {
-		coeff <- as.list(m$coefficients)
+formatlm <- function(m, g = NULL, type = "coeff") {
+	if (is.null(g)) {
 		vars <- all.vars(m$terms)
-		terms <- unlist(sapply(na.omit(c(vars[2:length(vars)], '(Intercept)', 'N')), function(x) {
-			ce <- coeff[[x]] %||% coeff[[bQuote(x)]]
-			if (x == 'N') {
-				paste0('N=', nrow(m$model))
-			} else if (is.null(ce)) {
-				NULL
-			} else {
-				l <- ifelse(x == '(Intercept)', '_', x)
-				paste0(l, '=', round(ce, 3))
-			}
-		}))
+		if (type == "pval") {
+			df <- as.data.frame(summary(m)$coefficients)
+			terms <- unlist(sapply(na.omit(c(vars[2:length(vars)], '(Intercept)', 'N')), function(x) {
+				pv <- df[x, 4] %||% df[bQuote(x), 4]
+				if (x == 'N') {
+					paste0('N=', nrow(m$model))
+				} else if (is.null(pv)) {
+					NULL
+				} else {
+					l <- ifelse(x == '(Intercept)', '_', x)
+					paste0(l, '=', signif(pv, digits = 4))
+				}
+			}))
+		} else {
+			coeff <- as.list(m$coefficients)
+			terms <- unlist(sapply(na.omit(c(vars[2:length(vars)], '(Intercept)', 'N')), function(x) {
+				ce <- coeff[[x]] %||% coeff[[bQuote(x)]]
+				if (x == 'N') {
+					paste0('N=', nrow(m$model))
+				} else if (is.null(ce)) {
+					NULL
+				} else {
+					l <- ifelse(x == '(Intercept)', '_', x)
+					paste0(l, '=', round(ce, 3))
+				}
+			}))
+		}
 		paste(terms[!is.null(terms)], collapse = ', ')
 	} else {
-		paste(sapply(names(m), function(x) {
-			paste0(x, ': ', formatlm(m[[x]]))
-		}), collapse = ' // ')
+		gm <- m[[as.character(g)]]
+		if (is.null(gm)) {
+			return(NA)
+		}
+		formatlm(gm, type = type)
 	}
 }
@@ -98,8 +117,15 @@ results <- do_call(rbind, lapply(
         log_debug("  Running Chow test for formula: {fmlrow$Formula} (grouping = {fmlrow$Group})")
         res <- chow.test(fmlrow$Formula, fmlrow$Group)
-		fmlrow$Pooled <- formatlm(res$pooled.lm)
-		fmlrow$Groups <- formatlm(res$group.lms)
+		fmlrow$Pooled_Coef <- formatlm(res$pooled.lm)
+		for (g in allgroups) {
+			fmlrow[[paste0("Group_", g, "_Coef")]] <- formatlm(res$group.lms, g)
+		}
+		# fmlrow$Groups <- formatlm(res$group.lms)
+		fmlrow$Pooled_Pval <- formatlm(res$pooled.lm, type="pval")
+		for (g in allgroups) {
+			fmlrow[[paste0("Group_", g, "_Pval")]] <- formatlm(res$group.lms, g, type="pval")
+		}
 		fmlrow$SSR <- res$group.ssr
 		fmlrow$SumSSR <- res$pooled.ssr
 		fmlrow$Fstat <- res$Fstat

biopipen 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl

Potentially problematic release.

biopipen 0.32.1__py3-none-any.whl → 0.33.0__py3-none-any.whl