PyPI - biopipen - Versions diffs - 0.28.0__py3-none-any.whl → 0.29.0__py3-none-any.whl - Mend

biopipen 0.28.0__py3-none-any.whl → 0.29.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biopipen might be problematic. Click here for more details.

Files changed (83)

biopipen/__init__.py +1 -1
biopipen/core/config.toml +8 -0
biopipen/ns/bam.py +0 -2
biopipen/ns/bed.py +35 -0
biopipen/ns/cellranger_pipeline.py +5 -5
biopipen/ns/cnv.py +18 -2
biopipen/ns/cnvkit_pipeline.py +16 -11
biopipen/ns/gene.py +68 -23
biopipen/ns/misc.py +2 -15
biopipen/ns/plot.py +146 -0
biopipen/ns/regulation.py +214 -0
biopipen/ns/scrna.py +15 -3
biopipen/ns/snp.py +516 -8
biopipen/ns/stats.py +74 -2
biopipen/ns/vcf.py +196 -0
biopipen/reports/snp/PlinkCallRate.svelte +24 -0
biopipen/reports/snp/PlinkFreq.svelte +18 -0
biopipen/reports/snp/PlinkHWE.svelte +18 -0
biopipen/reports/snp/PlinkHet.svelte +18 -0
biopipen/reports/snp/PlinkIBD.svelte +18 -0
biopipen/scripts/bam/CNVpytor.py +144 -46
biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
biopipen/scripts/bed/BedtoolsMerge.py +1 -1
biopipen/scripts/cnv/AneuploidyScore.R +30 -7
biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
biopipen/scripts/cnv/TMADScore.R +21 -5
biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
biopipen/scripts/gene/GeneNameConversion.R +65 -0
biopipen/scripts/gene/GenePromoters.R +61 -0
biopipen/scripts/misc/Shell.sh +15 -0
biopipen/scripts/plot/Manhattan.R +140 -0
biopipen/scripts/plot/QQPlot.R +62 -0
biopipen/scripts/regulation/MotifAffinityTest.R +226 -0
biopipen/scripts/regulation/MotifAffinityTest_AtSNP.R +126 -0
biopipen/scripts/regulation/MotifAffinityTest_MotifBreakR.R +96 -0
biopipen/scripts/regulation/MotifScan.py +159 -0
biopipen/scripts/regulation/atSNP.R +33 -0
biopipen/scripts/regulation/motifBreakR.R +1594 -0
biopipen/scripts/scrna/CellsDistribution.R +2 -0
biopipen/scripts/scrna/MarkersFinder.R +59 -67
biopipen/scripts/scrna/SeuratClustering.R +63 -29
biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
biopipen/scripts/snp/MatrixEQTL.R +84 -43
biopipen/scripts/snp/Plink2GTMat.py +133 -0
biopipen/scripts/snp/PlinkCallRate.R +190 -0
biopipen/scripts/snp/PlinkFilter.py +100 -0
biopipen/scripts/snp/PlinkFreq.R +298 -0
biopipen/scripts/snp/PlinkFromVcf.py +78 -0
biopipen/scripts/snp/PlinkHWE.R +80 -0
biopipen/scripts/snp/PlinkHet.R +92 -0
biopipen/scripts/snp/PlinkIBD.R +197 -0
biopipen/scripts/snp/PlinkUpdateName.py +124 -0
biopipen/scripts/stats/MetaPvalue.R +2 -1
biopipen/scripts/stats/MetaPvalue1.R +70 -0
biopipen/scripts/tcr/TCRClusterStats.R +12 -7
biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
biopipen/scripts/vcf/BcftoolsSort.py +113 -0
biopipen/scripts/vcf/BcftoolsView.py +73 -0
biopipen/scripts/vcf/VcfFix_utils.py +1 -1
biopipen/scripts/vcf/bcftools_utils.py +52 -0
biopipen/utils/gene.R +83 -37
biopipen/utils/gene.py +108 -60
biopipen/utils/misc.R +56 -0
biopipen/utils/misc.py +5 -2
biopipen/utils/reference.py +54 -10
{biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/METADATA +2 -2
{biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/RECORD +78 -50
{biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/entry_points.txt +1 -1
biopipen/ns/bcftools.py +0 -111
biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
biopipen/scripts/gene/GeneNameConversion.py +0 -66
{biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/WHEEL +0 -0

biopipen/scripts/scrna/CellsDistribution.R CHANGED Viewed

@@ -360,6 +360,7 @@ do_case <- function(name, case) {
     }
     log_info("  Merging and saving pie charts ...")
+    devpars = case$devpars
     # assemble and save pie chart plots
     res <- devpars$res %||% 100
     #                         legend, cells_by names
@@ -405,6 +406,7 @@ do_case <- function(name, case) {
     }
     col_fun <- colorRamp2(c(0, max(hmdata, na.rm = T)), c("lightyellow", "purple"))
+    hm_devpars <- case$hm_devpars
     hm_res <- hm_devpars$res %||% 100
     hm_width <- hm_devpars$width %||% (600 + 15 * length(unique(meta$seurat_clusters)) + extra_width)
     hm_height <- hm_devpars$height %||% (450 + 15 * cells_rows + extra_height)

biopipen/scripts/scrna/MarkersFinder.R CHANGED Viewed

@@ -411,45 +411,11 @@ do_case_findall <- function(casename) {
         log_info("  Using cached markers ...")
         markers <- cached$data
     } else {
-        markers <- tryCatch({
-            do_call(FindAllMarkers, args)
-                # gene, p_val, avg_log2FC, pct.1, pct.2, p_val_adj, cluster
-            }, error = function(e) {
-                log_warn(e$message)
-                data.frame(
-                    gene = character(),
-                    p_val = numeric(),
-                    avg_log2FC = numeric(),
-                    pct.1 = numeric(),
-                    pct.2 = numeric(),
-                    p_val_adj=numeric(),
-                    cluster = character()
-                )
-            })
+        markers <- find_markers(args, find_all = TRUE)
         cached$data <- markers
         save_to_cache(cached, "FindAllMarkers", cache)
     }
-    if (nrow(markers) == 0 && defassay == "SCT") {
-        log_warn("  No markers found from SCT assay, try recorrect_umi = FALSE")
-        args$recorrect_umi <- FALSE
-        markers <- tryCatch({
-            do_call(FindAllMarkers, args)
-        }, error = function(e) {
-            log_warn(e$message)
-            data.frame(
-                gene = character(),
-                p_val = numeric(),
-                avg_log2FC = numeric(),
-                pct.1 = numeric(),
-                pct.2 = numeric(),
-                p_val_adj=numeric(),
-                cluster = character()
-            )
-        })
-    }
     if (is.null(case$dotplot$assay)) {
         case$dotplot$assay <- case$assay
     }
@@ -483,6 +449,63 @@ do_case_findall <- function(casename) {
     }
 }
+find_markers <- function(findmarkers_args, find_all = FALSE) {
+    if (find_all) {
+        fun <- FindAllMarkers
+        empty <- data.frame(
+            gene = character(),
+            p_val = numeric(),
+            avg_log2FC = numeric(),
+            pct.1 = numeric(),
+            pct.2 = numeric(),
+            p_val_adj = numeric(),
+            cluster = character()
+        )
+    } else {
+        fun <- FindMarkers
+        empty <- data.frame(
+            gene = character(),
+            p_val = numeric(),
+            avg_log2FC = numeric(),
+            pct.1 = numeric(),
+            pct.2 = numeric(),
+            p_val_adj = numeric()
+        )
+    }
+    markers <- tryCatch({
+        do_call(fun, findmarkers_args) %>% rownames_to_column("gene")
+    }, error = function(e) {
+        # Object contains multiple models with unequal library sizes.
+        # Run `PrepSCTFindMarkers()` before running `FindMarkers()`.
+        if (grepl("PrepSCTFindMarkers", e$message)) {
+            log_warn("  Running PrepSCTFindMarkers ...")
+            findmarkers_args$object <<- PrepSCTFindMarkers(findmarkers_args$object)
+            tryCatch({
+                do_call(fun, findmarkers_args) %>% rownames_to_column("gene")
+            }, error = function(err) {
+                log_warn(paste0("  ", err$message))
+                empty
+            })
+        } else {
+            log_warn(paste0("  ", e$message))
+            empty
+        }
+    })
+    if (nrow(markers) == 0 && defassay == "SCT") {
+        log_warn("  No markers found from SCT assay, trying recorrect_umi = FALSE")
+        findmarkers_args$recorrect_umi <- FALSE
+        markers <- tryCatch({
+            do_call(fun, findmarkers_args) %>% rownames_to_column("gene")
+        }, error = function(e) {
+            log_warn(paste0("  ", e$message))
+            empty
+        })
+    }
+    markers
+}
 sections <- c()
 do_case <- function(casename) {
     if (isTRUE(cases[[casename]]$findall)) {
@@ -538,38 +561,7 @@ do_case <- function(casename) {
     # args$min.cells.feature <- args$min.cells.feature %||% 1
     # args$min.pct <- args$min.pct %||% 0
-    markers <- tryCatch({
-        do_call(FindMarkers, args) %>% rownames_to_column("gene")
-    }, error = function(e) {
-        log_warn(paste0("  ", e$message))
-        data.frame(
-            gene = character(),
-            p_val = numeric(),
-            avg_log2FC = numeric(),
-            pct.1 = numeric(),
-            pct.2 = numeric(),
-            p_val_adj = numeric()
-        )
-    })
-    if (nrow(markers) == 0 && defassay == "SCT") {
-        log_warn("  No markers found from SCT assay, trying recorrect_umi = FALSE")
-        args$recorrect_umi <- FALSE
-        markers <- tryCatch({
-            do_call(FindMarkers, args) %>% rownames_to_column("gene")
-        }, error = function(e) {
-            log_warn(paste0("  ", e$message))
-            data.frame(
-                gene = character(),
-                p_val = numeric(),
-                avg_log2FC = numeric(),
-                pct.1 = numeric(),
-                pct.2 = numeric(),
-                p_val_adj=numeric()
-            )
-        })
-    }
+    markers <- find_markers(args)
     siggenes <- do_enrich(info, markers, case$sigmarkers, case$volcano_genes)
     if (length(siggenes) > 0) {

biopipen/scripts/scrna/SeuratClustering.R CHANGED Viewed

@@ -3,9 +3,11 @@ source("{{biopipen_dir}}/utils/caching.R")
 library(Seurat)
 library(future)
+library(rlang)
 library(tidyr)
 library(dplyr)
 library(digest)
+library(clustree)
 set.seed(8525)
@@ -129,39 +131,71 @@ if (is.null(cached$data)) {
 }
 envs$FindClusters$random.seed <- envs$FindClusters$random.seed %||% 8525
-resolution <- envs$FindClusters$resolution %||% 0.8
-if (is.character(resolution)) {
-    if (grepl(",", resolution)) {
-        resolution <- as.numeric(trimws(unlist(strsplit(resolution, ","))))
-    } else {
-        resolution <- as.numeric(resolution)
+expand_resolution <- function(resolution) {
+    expanded_res <- c()
+    for (res in resolution) {
+        if (is.numeric(res)) {
+            expanded_res <- c(expanded_res, res)
+        } else {
+            # is.character
+            parts <- trimws(unlist(strsplit(res, ",")))
+            for (part in parts) {
+                if (grepl(":", part)) {
+                    parts <- trimws(unlist(strsplit(part, ":")))
+                    if (length(parts) == 2) { parts <- c(parts, 0.1) }
+                    if (length(parts) != 3) {
+                        stop("Invalid resolution format: {part}. Expected 2 or 3 parts separated by ':' for a range.")
+                    }
+                    parts <- as.numeric(parts)
+                    expanded_res <- c(expanded_res, seq(parts[1], parts[2], by = parts[3]))
+                } else {
+                    expanded_res <- c(expanded_res, as.numeric(part))
+                }
+            }
+        }
     }
+    # keep the last resolution at last
+    rev(unique(rev(expanded_res)))
 }
+resolution <- envs$FindClusters$resolution <- expand_resolution(envs$FindClusters$resolution %||% 0.8)
+log_info("Running FindClusters at resolution: {paste(resolution, collapse=',')} ...")
+envs$FindClusters$object <- sobj
+sobj <- do_call(FindClusters, envs$FindClusters)
+# recode clusters from 0, 1, 2, ... to c1, c2, c3, ...
+recode_clusters <- function(clusters) {
+    recode <- function(x) paste0("c", as.integer(as.character(x)) + 1)
+    clusters <- factor(recode(clusters), levels = recode(levels(clusters)))
+    clusters
+}
+graph_name <- envs$FindClusters$graph.name %||% paste0(DefaultAssay(sobj), "_snn_res.")
 for (res in resolution) {
-    envs$FindClusters$resolution <- res
-    cached <- get_cached(envs$FindClusters, paste0("FindClusters_", res), cache_dir)
-    res_key <- paste0("seurat_clusters_", res)
-    if (is.null(cached$data)) {
-        log_info("Running FindClusters at resolution: {res} ...")
-        envs$FindClusters$object <- sobj
-        sobj <- do_call(FindClusters, envs$FindClusters)
-        levels(sobj$seurat_clusters) <- paste0("c", as.numeric(levels(sobj$seurat_clusters)) + 1)
-        sobj[[res_key]] <- sobj$seurat_clusters
-        Idents(sobj) <- "seurat_clusters"
-        cached$data <- list(clusters = sobj$seurat_clusters, commands = sobj@commands)
-        save_to_cache(cached, paste0("FindClusters_", res), cache_dir)
-    } else {
-        log_info("Loading cached FindClusters at resolution: {res} ...")
-        sobj@commands <- cached$data$commands
-        sobj[[res_key]] <- cached$data$clusters
-        sobj$seurat_clusters <- cached$data$clusters
-        Idents(sobj) <- "seurat_clusters"
-    }
-    ident_table <- table(Idents(sobj))
-    log_info("- Found {length(ident_table)} clusters")
-    print(ident_table)
-    cat("\n")
+    cluster_name <- paste0(graph_name, res)
+    new_cluster_name <- paste0("seurat_clusters.", res)
+    sobj@meta.data[[new_cluster_name]] <- recode_clusters(sobj@meta.data[[cluster_name]])
+}
+sobj@meta.data$seurat_clusters <- recode_clusters(sobj@meta.data$seurat_clusters)
+Idents(sobj) <- "seurat_clusters"
+ident_table <- table(Idents(sobj))
+log_info("- Found {length(ident_table)} clusters at resolution {resolution[length(resolution)]}")
+print(ident_table)
+cat("\n")
+# plot the tree
+if (length(resolution) > 1) {
+    log_info("Plotting clustree ...")
+    png(
+        file.path(joboutdir, "clustree.png"),
+        res = envs$clustree_devpars$res,
+        width = envs$clustree_devpars$width,
+        height = envs$clustree_devpars$height
+    )
+    p <- clustree(sobj, prefix = "seurat_clusters.")
+    print(p)
+    dev.off()
 }
 if (DefaultAssay(sobj) == "SCT") {

biopipen/scripts/scrna/SeuratMap2Ref.R CHANGED Viewed

@@ -63,6 +63,26 @@ if (endsWith(ref, ".rds") || endsWith(ref, ".RDS")) {
     reference = LoadH5Seurat(ref)
 }
+# check if refdata exists in the reference
+for (rname in names(mapquery_args$refdata)) {
+    use_name <- mapquery_args$refdata[[rname]]
+    # transferring an assay
+    if (use_name %in% names(reference)) { next }
+    # transferring a metadata column
+    if (!use_name %in% colnames(reference@meta.data)) {
+        stop(paste0(
+            "The reference does not have the column '",
+            use_name,
+            "' in either assays or metadata. "
+        ))
+        if (startsWith(use_name, "predicted.")) {
+            stop(paste0(
+                "Do you mean: ", substring(use_name, 11),
+            ))
+        }
+    }
+}
 if (refnorm == "auto" && DefaultAssay(reference) == "SCT") {
     refnorm = "SCTransform"
 }

biopipen/scripts/scrna/SeuratSubClustering.R CHANGED Viewed

@@ -8,6 +8,7 @@ library(tidyr)
 library(dplyr)
 library(tidyseurat)
 library(digest)
+library(clustree)
 set.seed(8525)
@@ -28,6 +29,40 @@ plan(strategy = "multicore", workers = envs$ncores)
     args
 }
+.expand_resolution <- function(resolution) {
+    expanded_res <- c()
+    for (res in resolution) {
+        if (is.numeric(res)) {
+            expanded_res <- c(expanded_res, res)
+        } else {
+            # is.character
+            parts <- trimws(unlist(strsplit(res, ",")))
+            for (part in parts) {
+                if (grepl(":", part)) {
+                    parts <- trimws(unlist(strsplit(part, ":")))
+                    if (length(parts) == 2) { parts <- c(parts, 0.1) }
+                    if (length(parts) != 3) {
+                        stop("Invalid resolution format: {part}. Expected 2 or 3 parts separated by ':' for a range.")
+                    }
+                    parts <- as.numeric(parts)
+                    expanded_res <- c(expanded_res, seq(parts[1], parts[2], by = parts[3]))
+                } else {
+                    expanded_res <- c(expanded_res, as.numeric(part))
+                }
+            }
+        }
+    }
+    # keep the last resolution at last
+    rev(unique(rev(expanded_res)))
+}
+# recode clusters from 0, 1, 2, ... to s1, s2, s3, ...
+.recode_clusters <- function(clusters) {
+    recode <- function(x) paste0("s", as.integer(as.character(x)) + 1)
+    clusters <- factor(recode(clusters), levels = recode(levels(clusters)))
+    clusters
+}
 envs$RunUMAP <- .expand_dims(envs$RunUMAP)
 envs$FindNeighbors <- .expand_dims(envs$FindNeighbors)
@@ -63,7 +98,8 @@ for (key in names(envs$cases)) {
             subset = envs$subset,
             RunUMAP = envs$RunUMAP,
             FindNeighbors = envs$FindNeighbors,
-            FindClusters = envs$FindClusters
+            FindClusters = envs$FindClusters,
+            clustree_devpars = envs$clustree_devpars
         ),
         case
     )
@@ -132,36 +168,49 @@ for (key in names(envs$cases)) {
     }
     case$FindClusters$random.seed <- case$FindClusters$random.seed %||% 8525
-    resolution <- case$FindClusters$resolution %||% 0.8
-    if (is.character(resolution)) {
-        if (grepl(",", resolution)) {
-            resolution <- as.numeric(trimws(unlist(strsplit(resolution, ","))))
-        } else {
-            resolution <- as.numeric(resolution)
+    resolution <- case$FindClusters$resolution <- .expand_resolution(case$FindClusters$resolution %||% 0.8)
+    cached <- get_cached(case$FindClusters, "FindClusters", cache_dir)
+    if (is.null(cached$data)) {
+        log_info("- Running FindClusters at resolution: {paste(resolution, collapse = ',')} ...")
+        case$FindClusters$object <- sobj
+        # avoid overwriting the previous clustering results (as they have the same graph name
+        sobj1 <- do_call(FindClusters, case$FindClusters)
+        graph_name <- case$FindClusters$graph.name %||% paste0(DefaultAssay(sobj), "_snn_res.")
+        for (res in resolution) {
+            cluster_name <- paste0(graph_name, res)
+            new_cluster_name <- paste0(key, ".", res)
+            sobj1@meta.data[[new_cluster_name]] <- .recode_clusters(sobj1@meta.data[[cluster_name]])
         }
+        sobj1@meta.data[[key]] <- .recode_clusters(sobj1@meta.data$seurat_clusters)
+        keys <- sapply(resolution, function(res) paste0(key, ".", res))
+        keys <- c(keys, key)
+        cached$data <- sobj1@meta.data[, keys, drop = FALSE]
+        save_to_cache(cached, "FindClusters", cache_dir)
+        rm(sobj1)
+    } else {
+        log_info("- Using cached FindClusters at resolution: {paste(resolution, collapse = ',')} ...")
     }
-    for (res in resolution) {
-        case$FindClusters$resolution <- res
-        cached <- get_cached(case$FindClusters, paste0("FindClusters_", res), cache_dir)
-        res_key <- paste0("seurat_clusters_", res)
-        if (is.null(cached$data)) {
-            log_info("- Running FindClusters at resolution: {res} ...")
-            case$FindClusters$object <- sobj
-            sobj1 <- do_call(FindClusters, case$FindClusters)
-            levels(sobj1$seurat_clusters) <- paste0("s", as.numeric(levels(sobj1$seurat_clusters)) + 1)
-            sobj1[[res_key]] <- sobj1$seurat_clusters
-            cached$data <- sobj1@meta.data[, res_key, drop = FALSE]
-            save_to_cache(cached, paste0("FindClusters_", res), cache_dir)
-        } else {
-            log_info("- Using cached FindClusters at resolution: {res} ...")
-        }
-        ident_table <- table(cached$data[[res_key]])
-        log_info("  Found {length(ident_table)} clusters")
-        print(ident_table)
-        cat("\n")
+    ident_table <- table(cached$data[[key]])
+    log_info("  Found {length(ident_table)} clusters")
+    print(ident_table)
+    cat("\n")
+    if (length(resolution) > 1) {
+        log_info("- Plotting clustree ...")
+        png(
+            file.path(joboutdir, paste0(key, ".clustree.png")),
+            res = case$clustree_devpars$res,
+            width = case$clustree_devpars$width,
+            height = case$clustree_devpars$height
+        )
+        p <- clustree(cached$data, prefix = paste0(key, "."))
+        print(p)
+        dev.off()
     }
     log_info("- Updating meta.data with subclusters...")
-    srtobj <- AddMetaData(srtobj, metadata = cached$data, col.name = key)
+    srtobj <- AddMetaData(srtobj, metadata = cached$data)
     srtobj[[paste0("sub_umap_", key)]] <- reduc
 }

biopipen/scripts/snp/MatrixEQTL.R CHANGED Viewed

@@ -1,5 +1,6 @@
 source("{{biopipen_dir}}/utils/misc.R")
 library(rlang)
+library(rtracklayer)
 library(MatrixEQTL)
 snpfile = {{in.geno | r}}
@@ -11,6 +12,7 @@ outfile = {{out.cisqtls | r}}
 model = {{envs.model | r}}
 pval = {{envs.pval | r}}
+match_samples = {{envs.match_samples | r}}
 transp = {{envs.transp | r}}
 fdr = {{envs.fdr | r}}
 snppos = {{envs.snppos | r}}
@@ -36,7 +38,9 @@ if (!trans_enabled && !cis_enabled) {
     transp <- 1e-5
 }
-transpose_file <- function(file) {
+transpose_file <- function(file, what) {
+    if (is.null(file)) return(NULL)
+    log_info("Transposing {what} file ...")
     out <- file.path(joboutdir, paste0(
         tools::file_path_sans_ext(basename(file)),
         ".transposed.",
@@ -47,10 +51,11 @@ transpose_file <- function(file) {
     out
 }
-if (transpose_geno) snpfile = transpose_file(snpfile)
-if (transpose_expr) expfile = transpose_file(expfile)
-if (transpose_cov) covfile = transpose_file(covfile)
+if (transpose_geno) snpfile = transpose_file(snpfile, "geno")
+if (transpose_expr) expfile = transpose_file(expfile, "expr")
+if (transpose_cov) covfile = transpose_file(covfile, "cov")
+log_info("Loading SNP data ...")
 snps = SlicedData$new();
 snps$fileDelimiter = "\t";       # the TAB character
 snps$fileOmitCharacters = "NA";  # denote missing values;
@@ -59,6 +64,7 @@ snps$fileSkipColumns = 1;        # one column of row labels
 snps$fileSliceSize = 10000;      # read file in pieces of 2,000 rows
 snps$LoadFile( snpfile );
+log_info("Loading gene expression data ...")
 gene = SlicedData$new();
 gene$fileDelimiter = "\t";       # the TAB character
 gene$fileOmitCharacters = "NA";  # denote missing values;
@@ -69,16 +75,39 @@ gene$LoadFile( expfile );
 cvrt = SlicedData$new();
 if (!is.null(covfile) && file.exists(covfile)) {
-    covmatrix = t(read.table.inopts(covfile, list(cnames=TRUE, rnames=TRUE)))
+    log_info("Loading covariate data ...")
+    covmatrix = read.table(covfile, header=TRUE, stringsAsFactors=FALSE, row.names=1, sep="\t", quote="", check.names=FALSE)
     cvrt$CreateFromMatrix( as.matrix(covmatrix) )
 }
+log_info("Matching samples ...")
+if (match_samples) {
+    # let matrixEQTL raise an error if samples do not match
+} else {
+    n_sample_snps = snps$nCols()
+    n_sample_gene = gene$nCols()
+    common_samples = intersect(snps$columnNames, gene$columnNames)
+    if (!is.null(covfile)) {
+        common_samples = intersect(common_samples, cvrt$columnNames)
+        n_sample_cov = cvrt$nCols()
+        cvrt = cvrt$ColumnSubsample(match(common_samples, cvrt$columnNames))
+    }
+    snps = snps$ColumnSubsample(match(common_samples, snps$columnNames))
+    gene = gene$ColumnSubsample(match(common_samples, gene$columnNames))
+    log_info("- Samples used in SNP data: {n_sample_snps} -> {snps$nCols()}")
+    log_info("- Samples used in gene expression data: {n_sample_gene} -> {gene$nCols()}")
+    if (!is.null(covfile)) {
+        log_info("- Samples used in covariate data: {n_sample_cov} -> {cvrt$nCols()}")
+    }
+}
+log_info("Composing engine parameters ...")
 engine_params = list()
 engine_params$snps = snps
 engine_params$gene = gene
 engine_params$cvrt = cvrt
-engine_params$output_file_name = ifelse(trans_enabled, alleqtl, NULL)
-engine_params$pvOutputThreshold = ifelse(trans_enabled, transp, 0)
+engine_params$output_file_name = if(trans_enabled) alleqtl else NULL
+engine_params$pvOutputThreshold = if(trans_enabled) transp else 0
 engine_params$useModel = model
 engine_params$errorCovariance = numeric()
 engine_params$verbose = TRUE
@@ -89,66 +118,78 @@ noq = function(s) {
 }
 if (cis_enabled) {
+    log_info("Loading SNP positions ...")
     if (endsWith(snppos, ".bed")) {
-        snppos_data = read.table.inopts(snppos,
-                                        list(cnames=FALSE, rnames=FALSE))
-        snppos_data = snppos_data[, c(4, 1, 2)]
-        colnames(snppos_data) = c("snp", "chr", "pos")
+        snppos_data = read.table(snppos, header = FALSE, stringsAsFactors = FALSE, sep = "\t")
+        snppos_data = data.frame(
+            snp = snppos_data$V4,
+            chr = snppos_data$V1,
+            pos = snppos_data$V3
+        )
     } else if (endsWith(snppos, ".gff") || endsWith(snppos, ".gtf")) {
-        snppos_data = read.table.inopts(snppos,
-                                        list(cnames=FALSE, rnames=FALSE));
-        snppos_data = snppos_data[, c(9, 1, 4)]
-        colnames(snppos_data) = c("snp", "chr", "pos")
-        snppos_data$snp = unlist(lapply(snppos_data$snp, function(x) {
-            for (s in unlist(strsplit(x, '; ', fixed=T))) {
-                if (startsWith(s, "snp_id "))
-                    return(noq(substring(s, 8)))
-                else if (startsWith(s, "rs_id "))
-                    return(noq(substring(s, 7)))
-                else if (startsWith(s, "rs "))
-                    return(noq(substring(s, 4)))
-            }
-        }))
+        snppos_data = import(snppos)
+        elem_meta = elementMetadata(snppos_data)
+        snppos_data = data.frame(
+            snp = elem_meta$snp_id %||% elem_meta$rs_id %||% elem_meta$rs,
+            chr = as.character(seqnames(snppos_data)),
+            pos = start(snppos_data)
+        )
     } else if (endsWith(snppos, ".vcf") || endsWith(snppos, ".vcf.gz")) {
-        snppos_data = read.table.inopts(snppos,
-                                        list(cnames=FALSE, rnames=FALSE))
+        snppos_data = read.table(
+            snppos,
+            header=FALSE,
+            row.names=NULL,
+            stringsAsFactors=FALSE,
+            check.names=FALSE
+        )
         snppos_data = snppos_data[, c(3, 1, 2)]
         colnames(snppos_data) = c("snp", "chr", "pos")
     } else {
-        snppos_data = read.table.inopts(snppos, list(cnames=TRUE))
+        snppos_data = read.table(
+            snppos,
+            header=FALSE,
+            row.names=NULL,
+            stringsAsFactors=FALSE,
+            check.names=FALSE
+        )
         colnames(snppos_data) = c("snp", "chr", "pos")
     }
+    log_info("Loading gene positions ...")
     if (endsWith(genepos, ".bed")) {
-        genepos_data = read.table.inopts(genepos,
-                                         list(cnames=FALSE, rnames=FALSE))
-        genepos_data = genepos_data[, c(4, 1:3)]
-        colnames(genepos_data) = c("geneid", "chr", "s1", "s2")
+        genepos_data = read.table(genepos, header = FALSE, stringsAsFactors = FALSE, sep = "\t")
+        genepos_data = data.frame(
+            geneid = genepos_data$V4,
+            chr = genepos_data$V1,
+            s1 = genepos_data$V2,
+            s2 = genepos_data$V3
+        )
     } else if (endsWith(genepos, ".gff") || endsWith(genepos, ".gtf")) {
-        genepos_data = read.table.inopts(genepos,
-                                         list(cnames=FALSE, rnames=FALSE))
-        genepos_data = genepos_data[, c(9, 1, 4, 5)]
-        colnames(genepos_data) = c("geneid", "chr", "s1", "s2")
-        genepos_data$geneid = noquote(unlist(lapply(genepos_data$geneid, function(x) {
-            for (s in unlist(strsplit(x, '; ', fixed=T))) {
-                if (startsWith(s, "gene_id "))
-                    return(noq(substring(s, 9)))
-            }
-        })))
+        genepos_data = import(genepos)
+        elem_meta = elementMetadata(genepos_data)
+        genepos_data = data.frame(
+            geneid = elem_meta$gene_id %||% elem_meta$gene_name,
+            chr = as.character(seqnames(genepos_data)),
+            s1 = start(genepos_data),
+            s2 = end(genepos_data)
+        )
     } else {
         genepos_data = read.table(genepos, header = TRUE, stringsAsFactors = FALSE);
         colnames(genepos_data) = c("geneid", "chr", "s1", "s2")
     }
+    log_info("Running MatrixEQTL with cis-eQTLs enabled ...")
     engine_params$output_file_name.cis = outfile
     engine_params$pvOutputThreshold.cis = pval
     engine_params$cisDist = dist
     engine_params$snpspos = snppos_data
     engine_params$genepos = genepos_data
     do_call(Matrix_eQTL_main, engine_params)
+    if (!file.exists(alleqtl)) file.create(alleqtl)
 } else {
+    log_info("Running MatrixEQTL without cis-eQTLs ...")
     do_call(Matrix_eQTL_engine, engine_params)
-    file.create(outfile)
+    if (!file.exists(outfile)) file.create(outfile)
 }
 if (pval == 0) {

biopipen 0.28.0__py3-none-any.whl → 0.29.0__py3-none-any.whl

Potentially problematic release.

biopipen 0.28.0__py3-none-any.whl → 0.29.0__py3-none-any.whl