PyPI - biopipen - Versions diffs - 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl - Mend

biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (290) hide show

biopipen/__init__.py +1 -1
biopipen/core/config.toml +28 -0
biopipen/core/filters.py +79 -4
biopipen/core/proc.py +12 -3
biopipen/core/testing.py +75 -3
biopipen/ns/bam.py +148 -6
biopipen/ns/bed.py +75 -0
biopipen/ns/cellranger.py +186 -0
biopipen/ns/cellranger_pipeline.py +126 -0
biopipen/ns/cnv.py +19 -3
biopipen/ns/cnvkit.py +1 -1
biopipen/ns/cnvkit_pipeline.py +20 -12
biopipen/ns/delim.py +34 -35
biopipen/ns/gene.py +68 -23
biopipen/ns/gsea.py +63 -37
biopipen/ns/misc.py +39 -14
biopipen/ns/plot.py +304 -1
biopipen/ns/protein.py +183 -0
biopipen/ns/regulatory.py +290 -0
biopipen/ns/rnaseq.py +142 -5
biopipen/ns/scrna.py +2053 -473
biopipen/ns/scrna_metabolic_landscape.py +228 -382
biopipen/ns/snp.py +659 -0
biopipen/ns/stats.py +484 -0
biopipen/ns/tcr.py +683 -98
biopipen/ns/vcf.py +236 -2
biopipen/ns/web.py +97 -6
biopipen/reports/bam/CNVpytor.svelte +4 -9
biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
biopipen/reports/common.svelte +15 -0
biopipen/reports/protein/ProdigySummary.svelte +16 -0
biopipen/reports/scrna/CellsDistribution.svelte +4 -39
biopipen/reports/scrna/DimPlots.svelte +1 -1
biopipen/reports/scrna/MarkersFinder.svelte +6 -126
biopipen/reports/scrna/MetaMarkers.svelte +3 -75
biopipen/reports/scrna/RadarPlots.svelte +4 -20
biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
biopipen/reports/snp/PlinkCallRate.svelte +24 -0
biopipen/reports/snp/PlinkFreq.svelte +18 -0
biopipen/reports/snp/PlinkHWE.svelte +18 -0
biopipen/reports/snp/PlinkHet.svelte +18 -0
biopipen/reports/snp/PlinkIBD.svelte +18 -0
biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
biopipen/reports/tcr/ClonalStats.svelte +16 -0
biopipen/reports/tcr/CloneResidency.svelte +3 -93
biopipen/reports/tcr/Immunarch.svelte +4 -155
biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
biopipen/reports/tcr/TESSA.svelte +11 -28
biopipen/reports/utils/misc.liq +22 -7
biopipen/scripts/bam/BamMerge.py +11 -15
biopipen/scripts/bam/BamSampling.py +90 -0
biopipen/scripts/bam/BamSort.py +141 -0
biopipen/scripts/bam/BamSplitChroms.py +10 -10
biopipen/scripts/bam/BamSubsetByBed.py +38 -0
biopipen/scripts/bam/CNAClinic.R +41 -5
biopipen/scripts/bam/CNVpytor.py +153 -54
biopipen/scripts/bam/ControlFREEC.py +13 -14
biopipen/scripts/bam/SamtoolsView.py +33 -0
biopipen/scripts/bed/Bed2Vcf.py +5 -5
biopipen/scripts/bed/BedConsensus.py +5 -5
biopipen/scripts/bed/BedLiftOver.sh +6 -4
biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
biopipen/scripts/bed/BedtoolsMerge.py +4 -4
biopipen/scripts/cellranger/CellRangerCount.py +138 -0
biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
biopipen/scripts/cnv/AneuploidyScore.R +55 -20
biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
biopipen/scripts/cnv/TMADScore.R +25 -9
biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
biopipen/scripts/cnvkit/guess_baits.py +166 -93
biopipen/scripts/delim/RowsBinder.R +1 -1
biopipen/scripts/delim/SampleInfo.R +116 -118
biopipen/scripts/gene/GeneNameConversion.R +67 -0
biopipen/scripts/gene/GenePromoters.R +61 -0
biopipen/scripts/gsea/Enrichr.R +5 -5
biopipen/scripts/gsea/FGSEA.R +184 -50
biopipen/scripts/gsea/GSEA.R +2 -2
biopipen/scripts/gsea/PreRank.R +5 -5
biopipen/scripts/misc/Config2File.py +2 -2
biopipen/scripts/misc/Plot.R +80 -0
biopipen/scripts/misc/Shell.sh +15 -0
biopipen/scripts/misc/Str2File.py +2 -2
biopipen/scripts/plot/Heatmap.R +3 -3
biopipen/scripts/plot/Manhattan.R +147 -0
biopipen/scripts/plot/QQPlot.R +146 -0
biopipen/scripts/plot/ROC.R +88 -0
biopipen/scripts/plot/Scatter.R +112 -0
biopipen/scripts/plot/VennDiagram.R +5 -9
biopipen/scripts/protein/MMCIF2PDB.py +33 -0
biopipen/scripts/protein/PDB2Fasta.py +60 -0
biopipen/scripts/protein/Prodigy.py +119 -0
biopipen/scripts/protein/ProdigySummary.R +140 -0
biopipen/scripts/protein/RMSD.py +178 -0
biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
biopipen/scripts/regulatory/MotifScan.py +159 -0
biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
biopipen/scripts/regulatory/motifs-common.R +324 -0
biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
biopipen/scripts/rnaseq/Simulation.R +21 -0
biopipen/scripts/rnaseq/UnitConversion.R +325 -54
biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
biopipen/scripts/scrna/CellCellCommunication.py +150 -0
biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
biopipen/scripts/scrna/CellSNPLite.py +30 -0
biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
biopipen/scripts/scrna/CellsDistribution.R +456 -167
biopipen/scripts/scrna/DimPlots.R +1 -1
biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
biopipen/scripts/scrna/ExprImputation.R +7 -0
biopipen/scripts/scrna/LoomTo10X.R +51 -0
biopipen/scripts/scrna/MQuad.py +25 -0
biopipen/scripts/scrna/MarkersFinder.R +679 -400
biopipen/scripts/scrna/MetaMarkers.R +265 -161
biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
biopipen/scripts/scrna/RadarPlots.R +355 -134
biopipen/scripts/scrna/ScFGSEA.R +298 -100
biopipen/scripts/scrna/ScSimulation.R +65 -0
biopipen/scripts/scrna/ScVelo.py +617 -0
biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
biopipen/scripts/scrna/SeuratClustering.R +36 -233
biopipen/scripts/scrna/SeuratLoading.R +2 -2
biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
biopipen/scripts/scrna/SeuratPreparing.R +223 -173
biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
biopipen/scripts/scrna/SeuratTo10X.R +27 -0
biopipen/scripts/scrna/Slingshot.R +65 -0
biopipen/scripts/scrna/Subset10X.R +2 -2
biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
biopipen/scripts/scrna/scvelo_paga.py +313 -0
biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
biopipen/scripts/snp/MatrixEQTL.R +217 -0
biopipen/scripts/snp/Plink2GTMat.py +148 -0
biopipen/scripts/snp/PlinkCallRate.R +199 -0
biopipen/scripts/snp/PlinkFilter.py +100 -0
biopipen/scripts/snp/PlinkFreq.R +291 -0
biopipen/scripts/snp/PlinkFromVcf.py +81 -0
biopipen/scripts/snp/PlinkHWE.R +85 -0
biopipen/scripts/snp/PlinkHet.R +96 -0
biopipen/scripts/snp/PlinkIBD.R +196 -0
biopipen/scripts/snp/PlinkSimulation.py +124 -0
biopipen/scripts/snp/PlinkUpdateName.py +124 -0
biopipen/scripts/stats/ChowTest.R +146 -0
biopipen/scripts/stats/DiffCoexpr.R +152 -0
biopipen/scripts/stats/LiquidAssoc.R +135 -0
biopipen/scripts/stats/Mediation.R +108 -0
biopipen/scripts/stats/MetaPvalue.R +130 -0
biopipen/scripts/stats/MetaPvalue1.R +74 -0
biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
biopipen/scripts/tcr/Attach2Seurat.R +3 -2
biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
biopipen/scripts/tcr/CDR3Clustering.R +343 -0
biopipen/scripts/tcr/ClonalStats.R +526 -0
biopipen/scripts/tcr/CloneResidency.R +255 -131
biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
biopipen/scripts/tcr/GIANA/query.py +164 -162
biopipen/scripts/tcr/Immunarch-basic.R +31 -9
biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
biopipen/scripts/tcr/Immunarch.R +63 -11
biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
biopipen/scripts/tcr/SampleDiversity.R +1 -1
biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
biopipen/scripts/tcr/ScRepLoading.R +166 -0
biopipen/scripts/tcr/TCRClusterStats.R +176 -22
biopipen/scripts/tcr/TCRDock.py +110 -0
biopipen/scripts/tcr/TESSA.R +102 -118
biopipen/scripts/tcr/VJUsage.R +5 -5
biopipen/scripts/tcr/immunarch-patched.R +142 -0
biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
biopipen/scripts/vcf/BcftoolsSort.py +113 -0
biopipen/scripts/vcf/BcftoolsView.py +73 -0
biopipen/scripts/vcf/TruvariBench.sh +14 -7
biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
biopipen/scripts/vcf/TruvariConsistency.R +1 -1
biopipen/scripts/vcf/Vcf2Bed.py +2 -2
biopipen/scripts/vcf/VcfAnno.py +11 -11
biopipen/scripts/vcf/VcfDownSample.sh +22 -10
biopipen/scripts/vcf/VcfFilter.py +5 -5
biopipen/scripts/vcf/VcfFix.py +7 -7
biopipen/scripts/vcf/VcfFix_utils.py +13 -4
biopipen/scripts/vcf/VcfIndex.py +3 -3
biopipen/scripts/vcf/VcfIntersect.py +3 -3
biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
biopipen/scripts/vcf/bcftools_utils.py +52 -0
biopipen/scripts/web/Download.py +8 -4
biopipen/scripts/web/DownloadList.py +5 -5
biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
biopipen/scripts/web/gcloud_common.py +49 -0
biopipen/utils/gene.py +108 -60
biopipen/utils/misc.py +146 -20
biopipen/utils/reference.py +64 -20
biopipen/utils/reporter.py +177 -0
biopipen/utils/vcf.py +1 -1
biopipen-0.34.26.dist-info/METADATA +27 -0
biopipen-0.34.26.dist-info/RECORD +292 -0
{biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
{biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
biopipen/ns/bcftools.py +0 -111
biopipen/ns/scrna_basic.py +0 -255
biopipen/reports/delim/SampleInfo.svelte +0 -36
biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
biopipen/reports/scrna/ScFGSEA.svelte +0 -35
biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
biopipen/reports/utils/gsea.liq +0 -110
biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
biopipen/scripts/gene/GeneNameConversion.py +0 -66
biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
biopipen/scripts/scrna/ExprImpution.R +0 -7
biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
biopipen/scripts/scrna/Write10X.R +0 -11
biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
biopipen/scripts/tcr/TCRClustering.R +0 -280
biopipen/utils/common_docstrs.py +0 -61
biopipen/utils/gene.R +0 -49
biopipen/utils/gsea.R +0 -193
biopipen/utils/io.R +0 -20
biopipen/utils/misc.R +0 -114
biopipen/utils/mutate_helpers.R +0 -433
biopipen/utils/plot.R +0 -173
biopipen/utils/rnaseq.R +0 -48
biopipen/utils/single_cell.R +0 -115
biopipen-0.21.0.dist-info/METADATA +0 -22
biopipen-0.21.0.dist-info/RECORD +0 -218

biopipen/scripts/scrna/SeuratPreparing.R CHANGED Viewed

@@ -1,209 +1,259 @@
-source("{{biopipen_dir}}/utils/misc.R")
 library(Seurat)
 library(future)
 library(bracer)
-library(ggplot2)
-library(tidyseurat)
+library(dplyr)
+library(glue)
+library(biopipen.utils)
+metafile <- {{in.metafile | r}}
+outfile <- {{out.outfile | r}}
+joboutdir <- {{job.outdir | r}}
+envs <- {{envs | r: todot = "-", skip = 1}}
+if (isTRUE(envs$cache)) { envs$cache <- joboutdir }
-metafile = {{in.metafile | quote}}
-rdsfile = {{out.rdsfile | quote}}
-joboutdir = {{job.outdir | quote}}
-envs = {{envs | r}}
+log <- get_logger()
+reporter <- get_reporter()
 set.seed(8525)
-options(future.globals.maxSize = 80000 * 1024^2)
+# 8TB
+options(future.globals.maxSize = Inf)
+options(future.rng.onMisuse="ignore")
+options(Seurat.object.assay.version = "v5")
 plan(strategy = "multicore", workers = envs$ncores)
-metadata = read.table(
-    metafile,
-    header = TRUE,
-    row.names = NULL,
-    sep = "\t",
-    check.names = FALSE
+reporter$add(
+    list(
+        kind = "descr",
+        name = "Filters applied",
+        content = paste0(
+            "<p>Cell filters: ", html_escape(envs$cell_qc), "</p>",
+            "<p>Gene filters: </p>",
+            "<p>- Min Cells: ", envs$gene_qc$min_cells, "</p>",
+            "<p>- Excludes: ",
+            ifelse(is.null(envs$gene_qc$excludes), "Not set", paste(envs$gene_qc$excludes, collapse = ", ")),
+            "</p>"
+        )
+    ),
+    h1 = "Filters and QC"
 )
-meta_cols = colnames(metadata)
+metadata <- tryCatch({
+    log$debug("Trying to read Seurat object from metafile ...")
+    read_obj(metafile)
+}, error = function(e) {
+    log$debug("Failed to read Seurat object from metafile: {e$message}")
+    log$debug("Reading metafile as a table (sample info) ...")
+    read.table(
+        metafile,
+        header = TRUE,
+        row.names = NULL,
+        sep = "\t",
+        check.names = FALSE
+    )
+})
+is_seurat <- inherits(metadata, "Seurat")
+meta_cols <- if (is_seurat) colnames(metadata@meta.data) else colnames(metadata)
 if (!"Sample" %in% meta_cols) {
-    stop("Error: Column `Sample` is not found in metafile.")
+    stop("Error: Column `Sample` is not found in ", ifelse(is_seurat, "Seurat object's meta.data.", "metafile."))
 }
-if (!"RNAData" %in% meta_cols) {
+if (!"RNAData" %in% meta_cols && !is_seurat) {
     stop("Error: Column `RNAData` is not found in metafile.")
 }
+qcdir = file.path(joboutdir, "qc")
+dir.create(qcdir, showWarnings = FALSE, recursive = TRUE)
-rename_files = function(e, sample, path) {
-    tmpdatadir = file.path(joboutdir, "renamed", sample)
-    if (dir.exists(tmpdatadir)) {
-        unlink(tmpdatadir, recursive = TRUE)
-    }
-    dir.create(tmpdatadir, recursive = TRUE, showWarnings = FALSE)
-    barcodefile = Sys.glob(file.path(path, "*barcodes.tsv.gz"))[1]
-    file.symlink(
-        normalizePath(barcodefile),
-        file.path(tmpdatadir, "barcodes.tsv.gz")
-    )
-    genefile = glob(file.path(path, "*{genes,features}.tsv.gz"))[1]
-    file.symlink(
-        normalizePath(genefile),
-        file.path(tmpdatadir, "features.tsv.gz")
-    )
-    matrixfile = Sys.glob(file.path(path, "*matrix.mtx.gz"))[1]
-    file.symlink(
-        normalizePath(matrixfile),
-        file.path(tmpdatadir, "matrix.mtx.gz")
-    )
-    Read10X(data.dir = tmpdatadir)
-}
+sobj <- LoadSeuratAndPerformQC(
+    metadata,
+    min_cells = envs$min_cells,
+    min_features = envs$min_features,
+    cell_qc = envs$cell_qc,
+    gene_qc = envs$gene_qc,
+    tmpdir = joboutdir,
+    log = log,
+    cache = envs$cache)
-load_sample = function(sample) {
-    print(paste("  Loading sample:", sample, "..."))
-    mdata = as.data.frame(metadata)[metadata$Sample == sample, , drop=TRUE]
-    path = as.character(mdata$RNAData)
-    if (is.na(path) || !is.character(path) || nchar(path) == 0) {
-        warning(paste0("No path found for sample: ", sample))
-        return (NULL)
-    }
+log$info("Saving and visualizing QC results ...")
+cell_qc_df <- VizSeuratCellQC(sobj, plot_type = "table")
+write.table(cell_qc_df, file = file.path(qcdir, "cell_qc.txt"),
+            row.names = FALSE, quote = FALSE, sep = "\t")
-    # obj_list = list()
-    if (dir.exists(path)) {
-        exprs = tryCatch(
-            # Read10X requires
-            # - barcodes.tsv.gz
-            # - genes.tsv.gz
-            # - matrix.mtx.gz
-            # But sometimes, they are prefixed with sample name
-            # e.g.GSM4143656_SAM24345863-ln1.barcodes.tsv.gz
-            { Read10X(data.dir = path) },
-            error = function(e) rename_files(e, sample, path)
+reporter$add(
+    list(
+        name = "Cell QC metrics",
+        contents = list(
+            list(
+                kind = "descr",
+                content = paste0(
+                    "The table below show the number of cells in each sample that failed and passed the QC filters. ",
+                    "The last row shows the total number of cells that failed and passed the QC filters across all samples. "
+                )
+            ),
+            list(kind = "table", src = file.path(qcdir, "cell_qc.txt"))
         )
-    } else {
-        exprs = Read10X_h5(path)
-    }
-    if ("Gene Expression" %in% names(exprs)) {
-        exprs = exprs[["Gene Expression"]]
-    }
-    obj = CreateSeuratObject(counts=exprs, project=sample)
-    # filter the cells that don't have any gene expressions
-    cell_exprs = colSums(obj@assays$RNA)
-    obj = subset(obj, cells = names(cell_exprs[cell_exprs > 0]))
-    # obj = SCTransform(object=obj, return.only.var.genes=FALSE, verbose=FALSE)
-    obj = RenameCells(obj, add.cell.id = sample)
-    # Attach meta data
-    for (mname in names(mdata)) {
-        if (mname %in% c("RNAData", "TCRData")) { next }
-        mdt = mdata[[mname]]
-        if (is.factor(mdt)) { mdt = levels(mdt)[mdt] }
-        obj[[mname]] = mdt
-    }
-    # obj_list[[sample]] = obj
-    # obj_list
-    obj
-}
+    ),
+    h1 = "Filters and QC",
+    h2 = "Cell-level Quality Control",
+    ui = "tabs"
+)
-# Load data
-samples = as.character(metadata$Sample)
+gene_qc_df <- VizSeuratGeneQC(sobj, plot_type = "table")
+write.table(gene_qc_df, file = file.path(qcdir, "gene_qc.txt"),
+            row.names = FALSE, quote = FALSE, sep = "\t")
-print("- Reading samples individually ...")
-obj_list = lapply(samples, load_sample)
+reporter$add(
+    list(
+        name = "Gene QC metrics",
+        contents = list(
+            list(
+                kind = "descr",
+                content = paste0(
+                    "The table below show the number of genes in each sample that failed and passed the QC filters. ",
+                    "The last row shows the final number of genes that failed and passed the QC filters across all samples. ",
+                    "Any gene that failed the QC filters will be excluded in the merged Seurat object."
+                )
+            ),
+            list(kind = "table", src = file.path(qcdir, "gene_qc.txt")),
+            list(kind = "list", items = list(paste0(
+                "We may still end up with features slightly less than the final passed ones. ",
+                "For example, when SCTransform is used, the number of features may be less than the number of genes that passed the QC filters. ",
+                "This is because SCTransform selects the top N features based on variance. "
+            )))
+        )
+    ),
+    h1 = "Filters and QC",
+    h2 = "Gene-level Quality Control",
+    ui = "tabs"
+)
-print("- Merging samples ...")
-if (length(obj_list) >= 2) {
-    y = c()
-    for (i in 2:length(obj_list)) y = c(y, obj_list[[i]])
-    sobj = merge(obj_list[[1]], y)
-} else {
-    sobj = obj_list[[1]]
+for (pname in names(envs$qc_plots)) {
+    if (is.null(envs$qc_plots[[pname]])) next
+    log$info("- {pname} ...")
+    args <- envs$qc_plots[[pname]]
+    args$kind <- args$kind %||% "cell"
+    args$devpars <- args$devpars %||% list()
+    args$more_formats <- args$more_formats %||% character()
+    args$save_code <- args$save_code %||% FALSE
+    args$descr <- args$descr %||% pname
+    extract_vars(args, "kind", "devpars", "more_formats", "save_code", "descr")
+    if (kind == "gene") kind <- "gene_qc"
+    if (kind == "cell") kind <- "cell_qc"
+    args$object <- sobj
+    plot_fn <- if (kind == "cell_qc") {
+        gglogger::register(VizSeuratCellQC)
+    } else {
+        gglogger::register(VizSeuratGeneQC)
+    }
+    p <- do_call(plot_fn, args)
+    prefix <- file.path(qcdir, paste0(slugify(pname), ".", kind))
+    save_plot(p, prefix, devpars, formats = c("png", more_formats))
+    if (save_code) {
+        save_plotcode(p, prefix,
+            setup = c("library(biopipen.utils)", "load('data.RData')", "invisible(list2env(args, envir = .GlobalEnv))"),
+            "args",
+            auto_data_setup = FALSE)
+    }
+    reporter$add(
+        list(
+            name = pname,
+            contents = list(
+                list(kind = "descr", content = descr),
+                reporter$image(prefix, more_formats, save_code, kind = "image")
+            )
+        ),
+        h1 = "Filters and QC",
+        h2 = ifelse(kind == "cell_qc", "Cell-level Quality Control", "Gene-level Quality Control"),
+        ui = "tabs"
+    )
 }
-print("- Adding metadata for QC ...")
-sobj$percent.mt = PercentageFeatureSet(sobj, pattern = "^MT-")
-sobj$percent.ribo = PercentageFeatureSet(sobj, pattern = "^RP[SL]")
-sobj$percent.hb = PercentageFeatureSet(sobj, pattern = "^HB[^(P)]")
-sobj$percent.plat = PercentageFeatureSet(sobj, pattern = "PECAM1|PF4")
-dim_df = data.frame(When = "Before_QC", nCells = ncol(sobj), nGenes = nrow(sobj))
+log$info("Filtering with QC criteria ...")
+sobj <- FinishSeuratQC(sobj)
-if (is.null(envs$cell_qc) || length(envs$cell_qc) == 0) {
-    warning("No cell QC criteria is provided. All cells will be kept.", immediate. = TRUE)
-    envs$cell_qc = "TRUE"
-}
+sobj <- RunSeuratTransformation(
+    sobj,
+    use_sct = envs$use_sct,
+    SCTransformArgs = envs$SCTransform,
+    NormalizeDataArgs = envs$NormalizeData,
+    FindVariableFeaturesArgs = envs$FindVariableFeatures,
+    ScaleDataArgs = envs$ScaleData,
+    RunPCAArgs = envs$RunPCA,
+    log = log,
+    cache = envs$cache
+)
+sobj <- RunSeuratIntegration(
+    sobj,
+    no_integration = envs$no_integration,
+    IntegrateLayersArgs = envs$IntegrateLayers,
+    log = log,
+    cache = envs$cache
+)
-sobj = sobj %>% mutate(.QC = !!rlang::parse_expr(envs$cell_qc))
-feats = c("nFeature_RNA", "nCount_RNA", "percent.mt", "percent.ribo", "percent.hb", "percent.plat")
-plotsdir = file.path(joboutdir, "plots")
-dir.create(plotsdir, showWarnings = FALSE)
+# This is the last step, doesn't need to be cached
+if (!identical(envs$doublet_detector, "none")) {
+    dbldir <- file.path(joboutdir, "doublets")
+    dir.create(dbldir, showWarnings = FALSE, recursive = TRUE)
-# Violin plots
-print("- Plotting violin plots ...")
-for (feat in feats) {
-    print(paste0("  ", feat, "..."))
-    vln_p = VlnPlot(
+    sobj <- RunSeuratDoubletDetection(
         sobj,
-        cols = rep("white", length(samples)),
-        group.by = "Sample",
-        features = feat,
-        pt.size = 0) + NoLegend()
-    vln_p$data$.QC = sobj@meta.data$.QC
-    vln_p = vln_p + geom_jitter(
-            aes(color = .QC),
-            data = vln_p$data,
-            position = position_jitterdodge(jitter.width = 0.4, dodge.width = 0.9)
-        ) + scale_color_manual(values = c("black", "red"), breaks = c(TRUE, FALSE))
-    png(
-        file.path(plotsdir, paste0(feat, ".vln.png")),
-        width = 800 + length(samples) * 15, height = 600, res = 100
+        tool = envs$doublet_detector,
+        DoubletFinderArgs = envs$DoubletFinder,
+        scDblFinderArgs = envs$scDblFinder,
+        filter = FALSE,
+        log = log,
+        cache = envs$cache
     )
-    print(vln_p)
-    dev.off()
-}
-# Scatter plots against nCount_RNA
-print("- Plotting scatter plots ...")
-for (feat in setdiff(feats, "nCount_RNA")) {
-    print(paste0("  ", feat, "..."))
-    scat_p = FeatureScatter(
-        sobj,
-        feature1 = "nCount_RNA",
-        feature2 = feat,
-        group.by = ".QC"
-    ) +
-    NoLegend() +
-    scale_color_manual(values = c("black", "red"), breaks = c(TRUE, FALSE))
-    png(
-        file.path(plotsdir, paste0(feat, "-nCount_RNA.scatter.png")),
-        width = 800, height = 600, res = 100
-    )
-    print(scat_p)
-    dev.off()
-}
+    log$info("Visualizing doublet detection results ...")
+    if (identical(tolower(envs$doublet_detector), "doubletfinder")) {
+        p <- VizSeuratDoublets(sobj, plot_type = "pK", x_text_angle = 90)
+        save_plot(
+            p, file.path(dbldir, "doubletfinder_pk"),
+            devpars = list(res = 100, width = 800, height = 600),
+            formats = "png")
+        reporter$add(
+            list(
+                kind = "descr",
+                content = paste(
+                    "The pK plot from DoubletFinder to select the optimal pK value.",
+                    "See more at https://github.com/chris-mcginnis-ucsf/DoubletFinder"
+                )
+            ),
+            list(
+                kind = "image",
+                src = file.path(dbldir, "doubletfinder_pk.png")
+            ),
+            h1 = glue("Doublet detection using {envs$doublet_detector}"),
+            h2 = "BC metric vs pK"
+        )
+    }
-# Do the filtering
-print("- Filtering cells ...")
-sobj = sobj %>% filter(.QC)
-sobj$.QC = NULL
+    for (pt in c("dim", "pie")) {
+        p <- VizSeuratDoublets(sobj, plot_type = pt)
+        save_plot(p, file.path(dbldir, paste0("doublets_", pt)), formats = "png")
-print("- Filtering genes ...")
-if (is.list(envs$gene_qc)) {
-    if ("min_cells" %in% names(envs$gene_qc)) {
-        genes = rownames(sobj)[Matrix::rowSums(sobj) >= envs$gene_qc$min_cells]
-        sobj = subset(sobj, features = genes)
+        reporter$add(
+            list(
+                src = file.path(dbldir, paste0("doublets_", pt, ".png")),
+                descr = ifelse(pt == "dim", "Dimention Reduction Plot", "Pie Chart")
+            ),
+            h1 = glue("Doublet detection using {envs$doublet_detector}"),
+            h2 = "Doublets distribution",
+            ui = "table_of_images"
+        )
     }
+    sobj <- subset(sobj, subset = !!sym(paste0(sobj@misc$doublets$tool, "_DropletType")) != "doublet")
 }
-dim_df = rbind(
-    dim_df,
-    data.frame(
-        When = "After_Gene_QC",
-        nCells = ncol(sobj),
-        nGenes = nrow(sobj)
-    )
-)
-write.table(dim_df, file = file.path(plotsdir, "dim.txt"),
-            row.names = FALSE, quote = FALSE, sep = "\t")
+if (!is.null(envs$mutaters) && length(envs$mutaters) > 0) {
+    log$info("Mutating metadata ...")
+    sobj@meta.data <- sobj@meta.data %>%
+        mutate(!!!lapply(envs$mutaters, rlang::parse_expr))
+}
-print("- Saving results ...")
-saveRDS(sobj, rdsfile)
+log$info("Saving QC'ed seurat object ...")
+reporter$save(joboutdir)
+save_obj(sobj, outfile)

biopipen/scripts/scrna/SeuratSubClustering.R ADDED Viewed

@@ -0,0 +1,64 @@
+library(Seurat)
+library(future)
+library(biopipen.utils)
+set.seed(8525)
+srtfile <- {{in.srtobj | r}}
+outfile <- {{out.outfile | r}}
+ncores <- {{envs.ncores | r}}
+mutaters <- {{envs.mutaters | r}}
+subset <- {{envs.subset | r}}
+cache <- {{envs.cache | r}}
+RunPCAArgs <- {{envs.RunPCA | r: todot = "-"}}
+RunUMAPArgs <- {{envs.RunUMAP | r: todot = "-"}}
+FindNeighborsArgs <- {{envs.FindNeighbors | r: todot = "-"}}
+FindClustersArgs <- {{envs.FindClusters | r: todot = "-"}}
+cases <- {{envs.cases | r}}
+options(future.globals.maxSize = Inf)
+plan(strategy = "multicore", workers = ncores)
+log <- get_logger()
+cases <- expand_cases(cases, defaults = list(
+    RunPCA = RunPCAArgs,
+    RunUMAP = RunUMAPArgs,
+    FindNeighbors = FindNeighborsArgs,
+    FindClusters = FindClustersArgs,
+    subset = subset
+))
+if (isTRUE(cache)) {}
+log$info("Reading Seurat object ...")
+object <- read_obj(srtfile)
+if (!is.null(mutaters) && length(mutaters) > 0) {
+    log$info("Mutating meta data ...")
+    object@meta.data <- mutate(
+        object@meta.data,
+        !!!lapply(mutaters, parse_expr)
+    )
+}
+for (name in names(cases)) {
+    case <- cases[[name]]
+    log$info("Processing case '{name}' ...")
+    object <- RunSeuratSubClustering(
+        object = object,
+        subset = case$subset,
+        name = name,
+        RunPCAArgs = case$RunPCAArgs,
+        RunUMAPArgs = case$RunUMAPArgs,
+        FindNeighborsArgs = case$FindNeighborsArgs,
+        FindClustersArgs = case$FindClustersArgs,
+        log = log,
+        cache = cache
+    )
+}
+log$info("Saving results ...")
+biopipen.utils::save_obj(object, file = outfile)

biopipen/scripts/scrna/SeuratTo10X.R ADDED Viewed

@@ -0,0 +1,27 @@
+library(DropletUtils)
+library(Seurat)
+srtobjfile = {{in.srtobj | r}}
+outdir = {{out.outdir | r}}
+version = {{envs.version | r}}
+split_by = {{envs.split_by | r}}
+srtobj = readRDS(srtobjfile)
+if (!is.null(split_by)) {
+    # check if split_by is a valid column
+    if (is.null(srtobj[[split_by]])) {
+        stop(paste0("Column ", split_by, " not found in Seurat object"))
+    }
+    # split Seurat object by split_by column
+    objs <- SplitObject(srtobj, split.by = split_by)
+    for (s in names(objs)) {
+        counts <- GetAssayData(object = objs[[s]], layer = "counts")
+        odir <- file.path(outdir, s)
+        dir.create(odir, recursive = TRUE, showWarnings = FALSE)
+        write10xCounts(odir, counts, version = version, overwrite = TRUE)
+    }
+} else {
+    counts = GetAssayData(object = srtobj, layer = "counts")
+    write10xCounts(outdir, counts, version = version, overwrite = TRUE)
+}

biopipen/scripts/scrna/Slingshot.R ADDED Viewed

@@ -0,0 +1,65 @@
+library(rlang)
+library(Seurat)
+library(slingshot)
+library(biopipen.utils)
+sobjfile <- {{in.sobjfile | r}}
+outfile <- {{out.outfile | r}}
+group_by <- {{envs.group_by | r}}
+reduction <- {{envs.reduction | r}}
+dims <- {{envs.dims | r}}
+start <- {{envs.start | r}}
+end <- {{envs.end | r}}
+prefix <- {{envs.prefix | r}}
+reverse <- {{envs.reverse | r}}
+align_start <- {{envs.align_start | r}}
+seed <- {{envs.seed | r}}
+set.seed(seed)
+log <- get_logger()
+log$info("Reading Seurat object ...")
+srt <- read_obj(sobjfile)
+group_by <- group_by %||% biopipen.utils::GetIdentityColumn(srt)
+if (is.null(group_by) || !group_by %in% colnames(srt@meta.data)) {
+    stop(paste("Grouping column", group_by, "not found in the Seurat object"))
+}
+reduction <- reduction %||% DefaultDimReduc(srt)
+dims <- biopipen.utils:::.expand_number(dims)
+if (is.null(prefix)) {
+    prefix <- ""
+} else {
+    prefix <- paste0(prefix, "_")
+}
+log$info("Filtering cells in NA group_by ...")
+srt_sub <- srt[, !is.na(srt[[group_by, drop = TRUE]])]
+log$info("Running Slingshot ...")
+sl <- slingshot(
+    data = as.data.frame(srt_sub[[reduction]]@cell.embeddings[, dims]),
+    clusterLabels = as.character(srt_sub[[group_by, drop = TRUE]]),
+    start.clus = start, end.clus = end
+)
+df <- as.data.frame(slingPseudotime(sl))
+colnames(df) <- paste0(prefix, colnames(df))
+if (isTRUE(reverse)) {
+    if (isTRUE(align_start)) {
+        df <- apply(df, 2, function(x) max(x, na.rm = TRUE) - x)
+    } else {
+        df <- max(df, na.rm = TRUE) - df
+    }
+}
+srt <- AddMetaData(srt, metadata = df)
+srt <- AddMetaData(srt, metadata = slingBranchID(sl), col.name = paste0(prefix, "BranchID"))
+srt <- AddSeuratCommand(srt, "Slingshot", "slingshot(...)")
+log$info("Saving Seurat object ...")
+save_obj(srt, outfile)

biopipen/scripts/scrna/Subset10X.R CHANGED Viewed

@@ -1,7 +1,7 @@
 library(Matrix)
-indir = {{in.indir | quote}}
-outdir = {{out.outdir | quote}}
+indir = {{in.indir | r}}
+outdir = {{out.outdir | r}}
 envs = {{envs | r}}
 set.seed(envs$seed)

biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl

biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl