PyPI - biopipen - Versions diffs - 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl - Mend

biopipen 0.21.0py3-none-any.whl → 0.34.26py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (290) hide show

biopipen/__init__.py +1 -1
biopipen/core/config.toml +28 -0
biopipen/core/filters.py +79 -4
biopipen/core/proc.py +12 -3
biopipen/core/testing.py +75 -3
biopipen/ns/bam.py +148 -6
biopipen/ns/bed.py +75 -0
biopipen/ns/cellranger.py +186 -0
biopipen/ns/cellranger_pipeline.py +126 -0
biopipen/ns/cnv.py +19 -3
biopipen/ns/cnvkit.py +1 -1
biopipen/ns/cnvkit_pipeline.py +20 -12
biopipen/ns/delim.py +34 -35
biopipen/ns/gene.py +68 -23
biopipen/ns/gsea.py +63 -37
biopipen/ns/misc.py +39 -14
biopipen/ns/plot.py +304 -1
biopipen/ns/protein.py +183 -0
biopipen/ns/regulatory.py +290 -0
biopipen/ns/rnaseq.py +142 -5
biopipen/ns/scrna.py +2053 -473
biopipen/ns/scrna_metabolic_landscape.py +228 -382
biopipen/ns/snp.py +659 -0
biopipen/ns/stats.py +484 -0
biopipen/ns/tcr.py +683 -98
biopipen/ns/vcf.py +236 -2
biopipen/ns/web.py +97 -6
biopipen/reports/bam/CNVpytor.svelte +4 -9
biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
biopipen/reports/common.svelte +15 -0
biopipen/reports/protein/ProdigySummary.svelte +16 -0
biopipen/reports/scrna/CellsDistribution.svelte +4 -39
biopipen/reports/scrna/DimPlots.svelte +1 -1
biopipen/reports/scrna/MarkersFinder.svelte +6 -126
biopipen/reports/scrna/MetaMarkers.svelte +3 -75
biopipen/reports/scrna/RadarPlots.svelte +4 -20
biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
biopipen/reports/snp/PlinkCallRate.svelte +24 -0
biopipen/reports/snp/PlinkFreq.svelte +18 -0
biopipen/reports/snp/PlinkHWE.svelte +18 -0
biopipen/reports/snp/PlinkHet.svelte +18 -0
biopipen/reports/snp/PlinkIBD.svelte +18 -0
biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
biopipen/reports/tcr/ClonalStats.svelte +16 -0
biopipen/reports/tcr/CloneResidency.svelte +3 -93
biopipen/reports/tcr/Immunarch.svelte +4 -155
biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
biopipen/reports/tcr/TESSA.svelte +11 -28
biopipen/reports/utils/misc.liq +22 -7
biopipen/scripts/bam/BamMerge.py +11 -15
biopipen/scripts/bam/BamSampling.py +90 -0
biopipen/scripts/bam/BamSort.py +141 -0
biopipen/scripts/bam/BamSplitChroms.py +10 -10
biopipen/scripts/bam/BamSubsetByBed.py +38 -0
biopipen/scripts/bam/CNAClinic.R +41 -5
biopipen/scripts/bam/CNVpytor.py +153 -54
biopipen/scripts/bam/ControlFREEC.py +13 -14
biopipen/scripts/bam/SamtoolsView.py +33 -0
biopipen/scripts/bed/Bed2Vcf.py +5 -5
biopipen/scripts/bed/BedConsensus.py +5 -5
biopipen/scripts/bed/BedLiftOver.sh +6 -4
biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
biopipen/scripts/bed/BedtoolsMerge.py +4 -4
biopipen/scripts/cellranger/CellRangerCount.py +138 -0
biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
biopipen/scripts/cnv/AneuploidyScore.R +55 -20
biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
biopipen/scripts/cnv/TMADScore.R +25 -9
biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
biopipen/scripts/cnvkit/guess_baits.py +166 -93
biopipen/scripts/delim/RowsBinder.R +1 -1
biopipen/scripts/delim/SampleInfo.R +116 -118
biopipen/scripts/gene/GeneNameConversion.R +67 -0
biopipen/scripts/gene/GenePromoters.R +61 -0
biopipen/scripts/gsea/Enrichr.R +5 -5
biopipen/scripts/gsea/FGSEA.R +184 -50
biopipen/scripts/gsea/GSEA.R +2 -2
biopipen/scripts/gsea/PreRank.R +5 -5
biopipen/scripts/misc/Config2File.py +2 -2
biopipen/scripts/misc/Plot.R +80 -0
biopipen/scripts/misc/Shell.sh +15 -0
biopipen/scripts/misc/Str2File.py +2 -2
biopipen/scripts/plot/Heatmap.R +3 -3
biopipen/scripts/plot/Manhattan.R +147 -0
biopipen/scripts/plot/QQPlot.R +146 -0
biopipen/scripts/plot/ROC.R +88 -0
biopipen/scripts/plot/Scatter.R +112 -0
biopipen/scripts/plot/VennDiagram.R +5 -9
biopipen/scripts/protein/MMCIF2PDB.py +33 -0
biopipen/scripts/protein/PDB2Fasta.py +60 -0
biopipen/scripts/protein/Prodigy.py +119 -0
biopipen/scripts/protein/ProdigySummary.R +140 -0
biopipen/scripts/protein/RMSD.py +178 -0
biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
biopipen/scripts/regulatory/MotifScan.py +159 -0
biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
biopipen/scripts/regulatory/motifs-common.R +324 -0
biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
biopipen/scripts/rnaseq/Simulation.R +21 -0
biopipen/scripts/rnaseq/UnitConversion.R +325 -54
biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
biopipen/scripts/scrna/CellCellCommunication.py +150 -0
biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
biopipen/scripts/scrna/CellSNPLite.py +30 -0
biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
biopipen/scripts/scrna/CellsDistribution.R +456 -167
biopipen/scripts/scrna/DimPlots.R +1 -1
biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
biopipen/scripts/scrna/ExprImputation.R +7 -0
biopipen/scripts/scrna/LoomTo10X.R +51 -0
biopipen/scripts/scrna/MQuad.py +25 -0
biopipen/scripts/scrna/MarkersFinder.R +679 -400
biopipen/scripts/scrna/MetaMarkers.R +265 -161
biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
biopipen/scripts/scrna/RadarPlots.R +355 -134
biopipen/scripts/scrna/ScFGSEA.R +298 -100
biopipen/scripts/scrna/ScSimulation.R +65 -0
biopipen/scripts/scrna/ScVelo.py +617 -0
biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
biopipen/scripts/scrna/SeuratClustering.R +36 -233
biopipen/scripts/scrna/SeuratLoading.R +2 -2
biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
biopipen/scripts/scrna/SeuratPreparing.R +223 -173
biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
biopipen/scripts/scrna/SeuratTo10X.R +27 -0
biopipen/scripts/scrna/Slingshot.R +65 -0
biopipen/scripts/scrna/Subset10X.R +2 -2
biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
biopipen/scripts/scrna/scvelo_paga.py +313 -0
biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
biopipen/scripts/snp/MatrixEQTL.R +217 -0
biopipen/scripts/snp/Plink2GTMat.py +148 -0
biopipen/scripts/snp/PlinkCallRate.R +199 -0
biopipen/scripts/snp/PlinkFilter.py +100 -0
biopipen/scripts/snp/PlinkFreq.R +291 -0
biopipen/scripts/snp/PlinkFromVcf.py +81 -0
biopipen/scripts/snp/PlinkHWE.R +85 -0
biopipen/scripts/snp/PlinkHet.R +96 -0
biopipen/scripts/snp/PlinkIBD.R +196 -0
biopipen/scripts/snp/PlinkSimulation.py +124 -0
biopipen/scripts/snp/PlinkUpdateName.py +124 -0
biopipen/scripts/stats/ChowTest.R +146 -0
biopipen/scripts/stats/DiffCoexpr.R +152 -0
biopipen/scripts/stats/LiquidAssoc.R +135 -0
biopipen/scripts/stats/Mediation.R +108 -0
biopipen/scripts/stats/MetaPvalue.R +130 -0
biopipen/scripts/stats/MetaPvalue1.R +74 -0
biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
biopipen/scripts/tcr/Attach2Seurat.R +3 -2
biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
biopipen/scripts/tcr/CDR3Clustering.R +343 -0
biopipen/scripts/tcr/ClonalStats.R +526 -0
biopipen/scripts/tcr/CloneResidency.R +255 -131
biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
biopipen/scripts/tcr/GIANA/query.py +164 -162
biopipen/scripts/tcr/Immunarch-basic.R +31 -9
biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
biopipen/scripts/tcr/Immunarch.R +63 -11
biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
biopipen/scripts/tcr/SampleDiversity.R +1 -1
biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
biopipen/scripts/tcr/ScRepLoading.R +166 -0
biopipen/scripts/tcr/TCRClusterStats.R +176 -22
biopipen/scripts/tcr/TCRDock.py +110 -0
biopipen/scripts/tcr/TESSA.R +102 -118
biopipen/scripts/tcr/VJUsage.R +5 -5
biopipen/scripts/tcr/immunarch-patched.R +142 -0
biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
biopipen/scripts/vcf/BcftoolsSort.py +113 -0
biopipen/scripts/vcf/BcftoolsView.py +73 -0
biopipen/scripts/vcf/TruvariBench.sh +14 -7
biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
biopipen/scripts/vcf/TruvariConsistency.R +1 -1
biopipen/scripts/vcf/Vcf2Bed.py +2 -2
biopipen/scripts/vcf/VcfAnno.py +11 -11
biopipen/scripts/vcf/VcfDownSample.sh +22 -10
biopipen/scripts/vcf/VcfFilter.py +5 -5
biopipen/scripts/vcf/VcfFix.py +7 -7
biopipen/scripts/vcf/VcfFix_utils.py +13 -4
biopipen/scripts/vcf/VcfIndex.py +3 -3
biopipen/scripts/vcf/VcfIntersect.py +3 -3
biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
biopipen/scripts/vcf/bcftools_utils.py +52 -0
biopipen/scripts/web/Download.py +8 -4
biopipen/scripts/web/DownloadList.py +5 -5
biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
biopipen/scripts/web/gcloud_common.py +49 -0
biopipen/utils/gene.py +108 -60
biopipen/utils/misc.py +146 -20
biopipen/utils/reference.py +64 -20
biopipen/utils/reporter.py +177 -0
biopipen/utils/vcf.py +1 -1
biopipen-0.34.26.dist-info/METADATA +27 -0
biopipen-0.34.26.dist-info/RECORD +292 -0
{biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
{biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
biopipen/ns/bcftools.py +0 -111
biopipen/ns/scrna_basic.py +0 -255
biopipen/reports/delim/SampleInfo.svelte +0 -36
biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
biopipen/reports/scrna/ScFGSEA.svelte +0 -35
biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
biopipen/reports/utils/gsea.liq +0 -110
biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
biopipen/scripts/gene/GeneNameConversion.py +0 -66
biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
biopipen/scripts/scrna/ExprImpution.R +0 -7
biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
biopipen/scripts/scrna/Write10X.R +0 -11
biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
biopipen/scripts/tcr/TCRClustering.R +0 -280
biopipen/utils/common_docstrs.py +0 -61
biopipen/utils/gene.R +0 -49
biopipen/utils/gsea.R +0 -193
biopipen/utils/io.R +0 -20
biopipen/utils/misc.R +0 -114
biopipen/utils/mutate_helpers.R +0 -433
biopipen/utils/plot.R +0 -173
biopipen/utils/rnaseq.R +0 -48
biopipen/utils/single_cell.R +0 -115
biopipen-0.21.0.dist-info/METADATA +0 -22
biopipen-0.21.0.dist-info/RECORD +0 -218

biopipen/scripts/tcr/TESSA.R CHANGED Viewed

@@ -1,23 +1,28 @@
-source("{{biopipen_dir}}/utils/misc.R")
 library(glue)
 library(dplyr)
 library(tidyr)
-library(immunarch)
+library(tibble)
 library(Seurat)
-library(ggplot2)
-library(ggprism)
+library(biopipen.utils)
-immfile <- {{in.immdata | r}}
-exprfile <- {{in.srtobj | r}}
+screpdata <- {{in.screpdata | r}}
 outfile <- {{out.outfile | r}}
+joboutdir <- {{job.outdir | r}}
 python <- {{envs.python | r}}
 within_sample <- {{envs.within_sample | r}}
 assay <- {{envs.assay | r}}
 predefined_b <- {{envs.predefined_b | r}}
 max_iter <- {{envs.max_iter | int}}
 save_tessa <- {{envs.save_tessa | r}}
-tessa_srcdir <- "{{biopipen_dir}}/scripts/tcr/TESSA_source"
+log <- get_logger()
+reporter <- get_reporter()
+# In case this script is running in the cloud, where <biopipen_dir> cannot be found,
+# we instead use the python command, which is associated with the cloud environment,
+# to get the biopipen directory
+biopipen_dir <- get_biopipen_dir(python)
+tessa_srcdir <- file.path(biopipen_dir, "scripts", "tcr", "TESSA_source")
 outdir <- dirname(outfile)
 result_dir <- file.path(outdir, "result")
@@ -27,98 +32,51 @@ if (!dir.exists(tessa_dir)) dir.create(tessa_dir)
 ### Start preparing input files for TESSA
 # Prepare input files
-print("Preparing TCR input file ...")
-immdata <- readRDS(immfile)
-has_VJ <- "V.name" %in% colnames(immdata$data[[1]]) && "J.name" %in% colnames(immdata$data[[1]])
-# Merge all samples
-tcrdata <- do_call(rbind, lapply(seq_len(nrow(immdata$meta)), function(i) {
-    # Clones  Proportion   CDR3.aa                       Barcode
-    # 5      4 0.008583691 CAVRDTGNTPLVF;CASSEYSNQPQHF   GTTCGGGCACTTACGA-1;TCTCTAAGTACCAGTT-1
-    # 6      4 0.008583691 CALTQAAGNKLTF;CASRPEDLRGQPQHF GCTTGAAGTCGGCACT-1;TACTCGCTCCTAAGTG-1
-    if (has_VJ) {
-        cldata = immdata$data[[i]][, c("Barcode", "CDR3.aa", "V.name", "J.name")]
-    } else {
-        cldata = immdata$data[[i]][, c("Barcode", "CDR3.aa")]
-    }
-    # # A tibble: 4 × 5
-    # Sample                  Patient     Timepoint Tissue
-    # <chr>                   <chr>       <chr>     <chr>
-    # 1 MC1685Pt011-Baseline-PB MC1685Pt011 Baseline  PB
-    mdata = as.list(immdata$meta[i, , drop=FALSE])
-    for (mname in names(mdata)) {
-        assign(mname, mdata[[mname]])
-    }
-    cldata %>%
-        separate_rows(Barcode, sep=";") %>%
-        # Just in case there are duplicated barcodes
-        distinct(Barcode, .keep_all = TRUE) %>%
-        mutate(Barcode = glue("{{envs.prefix}}{Barcode}"), sample = Sample)
-}))
-if (has_VJ) {
-    tcrdata <- tcrdata %>% dplyr::mutate(
-        v_gene = sub("-\\d+$", "", V.name),
-        j_gene = sub("-\\d+$", "", J.name)
-    ) %>% dplyr::select(
-        contig_id = Barcode,
-        cdr3 = CDR3.aa,
-        v_gene,
-        j_gene,
-        sample
-    )
-} else {
-    tcrdata <- tcrdata %>% dplyr::select(
-        contig_id = Barcode,
-        cdr3 = CDR3.aa,
-        sample
-    )
-}
-print("Preparing expression input file ...")
-is_seurat <- endsWith(tolower(exprfile), ".rds")
-is_gz <- endsWith(tolower(exprfile), ".gz")
-if (is_seurat) {
-    sobj <- readRDS(exprfile)
-    expr <- GetAssayData(sobj, slot = "data", assay = assay)
-} else if (is_gz) {
-    expr <- read.table(gzfile(exprfile), sep="\t", header=TRUE, row.names=1)
-} else {
-    expr <- read.table(exprfile, sep="\t", header=TRUE, row.names=1)
-}
+log$info("Reading input file ...")
+sobj <- read_obj(screpdata)
+log$info("Preparing TCR input file ...")
+# The TCR annotations (CTaa, CTgene) are taken from the metadata of the object read above
+tcrdata <- sobj@meta.data %>%
+    rownames_to_column("contig_id") %>%
+    select(contig_id, CTaa, CTgene, sample = Sample) %>%
+    filter(!is.na(CTaa) & !is.na(CTgene)) %>%
+    separate(CTaa, into = c(NA, "cdr3"), sep = "_", remove = TRUE) %>%
+    filter(!is.na(cdr3) & cdr3 != "NA" & cdr3 != "nan") %>%
+    separate(CTgene, into = c(NA, "vjgene"), sep = "_", remove = TRUE) %>%
+    separate(vjgene, into = c("v_gene", NA, "j_gene", NA), sep = "\\.", remove = TRUE) %>%
+    mutate(v_gene = sub("-\\d+$", "", v_gene), j_gene = sub("-\\d+$", "", j_gene))
+log$info("Preparing expression input file ...")
+expr <- GetAssayData(sobj, layer = "data")
 cell_ids <- intersect(tcrdata$contig_id, colnames(expr))
 # Warning about unused cells
-unused_tcr_cells <- setdiff(tcrdata$contig_id, cell_ids)
 unused_expr_cells <- setdiff(colnames(expr), cell_ids)
-if (length(unused_tcr_cells) > 0) {
-    warning(glue("{length(unused_tcr_cells)}/{nrow(tcrdata)} TCR cells are not used."), immediate. = TRUE)
-}
 if (length(unused_expr_cells) > 0) {
-    warning(glue("{length(unused_expr_cells)}/{ncol(expr)} expression cells are not used."), immediate. = TRUE)
+    log$warn(glue("{length(unused_expr_cells)}/{ncol(expr)} cells without TCR data are not used."))
 }
 if (length(cell_ids) == 0) {
-    stop("No common cells between TCR and expression data. Are you using the correct prefix?")
+    stop(
+        "No TCR data found in the Seurat object. ",
+        "Please use scRepertiore::combineExpression() to generate the Seurat object with TCR data."
+    )
 }
-tcrdata <- tcrdata[tcrdata$contig_id %in% cell_ids, , drop=FALSE]
 expr <- as.matrix(expr)[, tcrdata$contig_id, drop=FALSE]
 # Write input files
-print("Writing input files ...")
+log$info("Writing input files ...")
 write.table(tcrdata, file.path(tessa_dir, "tcrdata.txt"), sep=",", quote=FALSE, row.names=FALSE)
 write.table(expr, file.path(tessa_dir, "exprdata.txt"), sep=",", quote=FALSE, row.names=TRUE, col.names=TRUE)
 ### End preparing input files for TESSA
 ### Start running TESSA
-print("Running TESSA ...")
+log$info("Running TESSA ...")
 # The original TESSA uses a python wrapper to run the encoder and tessa model
 # here we run those two steps directly
-print("- Running encoder ...")
+log$info("- Running encoder ...")
 cmd_encoder <- paste(
     python,
     file.path(tessa_srcdir, "BriseisEncoder.py"),
@@ -133,21 +91,22 @@ cmd_encoder <- paste(
     "-output_log",
     file.path(tessa_dir, "tcr_encoder.log")
 )
-if (has_VJ) {
-    cmd_encoder <- paste(
-        cmd_encoder,
-        "-output_VJ",
-        file.path(tessa_dir, "tcr_vj.txt")
-    )
-}
-print(paste("- ", cmd_encoder))
+cmd_encoder <- paste(
+    cmd_encoder,
+    "-output_VJ",
+    file.path(tessa_dir, "tcr_vj.txt")
+)
+print("Running:")
+print(cmd_encoder)
+log$debug(paste("- ", cmd_encoder))
 rc <- system(cmd_encoder)
 if (rc != 0) {
     stop("Error: Failed to run encoder.")
 }
-print("- Running TESSA model ...")
+log$info("- Running TESSA model ...")
 source(file.path(tessa_srcdir, "real_data.R"))
 tessa <- run_tessa(
@@ -162,42 +121,67 @@ tessa <- run_tessa(
 )
 # Save TESSA results
-print("Saving TESSA results ...")
-if (is_seurat) {
-    cells <- rownames(sobj@meta.data)
-    sobj@meta.data <- sobj@meta.data %>%
-        mutate(
-            TESSA_Cluster = tessa$meta[
-                match(cells, tessa$meta$barcode),
-                "cluster_number"
-            ]
-        ) %>%
-        add_count(TESSA_Cluster, name = "TESSA_Cluster_Size")
-    rownames(sobj@meta.data) <- cells
-    if (save_tessa) {
-        sobj@misc$tessa <- tessa
-    }
-    saveRDS(sobj, outfile)
-} else {
-    out <- tessa$meta %>%
-        dplyr::select(barcode, TESSA_Cluster = cluster_number) %>%
-        add_count(TESSA_Cluster, name = "TESSA_Cluster_Size")
-    write.table(out, outfile, sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE)
+log$info("Saving TESSA results ...")
+cells <- rownames(sobj@meta.data)
+sobj@meta.data <- sobj@meta.data %>%
+    mutate(
+        TESSA_Cluster = tessa$meta[
+            match(cells, tessa$meta$barcode),
+            "cluster_number"
+        ]
+    ) %>%
+    add_count(TESSA_Cluster, name = "TESSA_Cluster_Size")
+rownames(sobj@meta.data) <- cells
+if (save_tessa) {
+    sobj@misc$tessa <- tessa
 }
+save_obj(sobj, outfile)
 # Post analysis
-print("Post analysis ...")
+log$info("Post analysis ...")
 plot_tessa(tessa, result_dir)
 plot_Tessa_clusters(tessa, result_dir)
 p <- tessa$meta %>%
     dplyr::select(barcode, TESSA_Cluster = cluster_number) %>%
     add_count(TESSA_Cluster, name = "TESSA_Cluster_Size") %>%
-    ggplot(aes(x = TESSA_Cluster_Size)) +
-    geom_histogram(binwidth = 1) +
-    theme_prism()
+    plotthis::Histogram(x = "TESSA_Cluster_Size")
+res <- 100
+height <- attr(p, "height") * res
+width <- attr(p, "width") * res
+prefix <- file.path(result_dir, "Cluster_size_dist")
+save_plot(p, prefix, devpars = list(width = width, height = height, res = res))
+reporter$add(
+    list(
+        src = file.path(result_dir, "Cluster_size_dist.png"),
+        descr = "Histogram of cluster size distribution",
+        download = file.path(result_dir, "Cluster_size_dist.pdf")
+    ),
+    list(
+        src = file.path(result_dir, "clone_size.png"),
+        descr = "Center cluster size vs. non-center cluster size"
+    ),
+    list(
+        src = file.path(result_dir, "exp_TCR_pair_plot.png"),
+        descr = "Expression-TCR distance plot"
+    ),
+    list(
+        src = file.path(result_dir, "TCR_dist_density.png"),
+        descr = "TCR distance density plot"
+    ),
+    list(
+        src = file.path(result_dir, "TCR_explore.png"),
+        descr = "Exploratory plot at the TCR level"
+    ),
+    list(
+        src = file.path(result_dir, "TCR_explore_clusters.png"),
+        descr = "TESSA clusters"
+    ),
+    h1 = "TESSA Results",
+    ui = "table_of_images"
+)
-png(file.path(result_dir, "Cluster_size_dist.png"), width=8, height=8, units="in", res=100)
-print(p)
-dev.off()
+reporter$save(joboutdir)

biopipen/scripts/tcr/VJUsage.R CHANGED Viewed

@@ -1,9 +1,9 @@
-infile = {{in.infile | quote}}
-outprefix = {{out.outfile | prefix | replace: ".fancyvj.wt", "" | quote}}
-vdjtools = {{ envs.vdjtools | quote }}
-vdjtools_patch = {{ envs.vdjtools_patch | quote }}
-joboutdir = {{job.outdir | quote}}
+infile = {{in.infile | r}}
+outprefix = {{out.outfile | prefix | replace: ".fancyvj.wt", "" | r}}
+vdjtools = {{ envs.vdjtools | r }}
+vdjtools_patch = {{ envs.vdjtools_patch | r }}
+joboutdir = {{job.outdir | r}}
 command = sprintf(
     "cd %s && bash %s %s PlotFancyVJUsage --plot-type png %s %s",

biopipen/scripts/tcr/immunarch-patched.R ADDED Viewed

@@ -0,0 +1,142 @@
+library(immunarch)
+vis.immunr_gini <- function(.data, .by = NA, .meta = NA,
+                            .errorbars = c(0.025, 0.975), .errorbars.off = FALSE,
+                            .points = TRUE, .test = TRUE, .signif.label.size = 3.5,
+                            .legend = NA, .plot.type = "bar", ...) {
+  # repDiversity(..., .method = "gini") generates a matrix
+  .data = data.frame(Sample = rownames(.data), Value = .data[, 1])
+  if (.plot.type == "bar") {
+    vis_bar(
+        .data = .data, .by = .by, .meta = .meta,
+        .errorbars = .errorbars, .errorbars.off = .errorbars.off, .stack = FALSE,
+        .points = .points, .test = .test, .signif.label.size = .signif.label.size,
+        .defgroupby = "Sample", .grouping.var = "Group",
+        .labs = c(NA, "Gini coefficient"),
+        .title = "Gini coefficient", .subtitle = "Sample diversity estimation using the Gini coefficient",
+        .legend = .legend, .leg.title = NA
+    )
+  } else {
+    vis_box(
+        .data = .data, .by = .by, .meta = .meta, .test = .test,
+        .points = .points, .signif.label.size = .signif.label.size,
+        .defgroupby = "Sample", .grouping.var = "Group",
+        .labs = c(NA, "Gini coefficient"),
+        .title = "Gini coefficient", .subtitle = "Sample diversity estimation using the Gini coefficient",
+        .legend = .legend, .leg.title = NA, .melt = FALSE
+    )
+  }
+}
+vis.immunr_div <- function(.data, .by = NA, .meta = NA,
+                            .errorbars = c(0.025, 0.975), .errorbars.off = FALSE,
+                            .points = TRUE, .test = TRUE, .signif.label.size = 3.5,
+                            .legend = NA, .plot.type = "bar", ...) {
+  # Bar plots are delegated to immunarch's own vis.immunr_div; any other plot type is drawn with vis_box
+  if (.plot.type == "bar") {
+    immunarch:::vis.immunr_div(.data = .data,.by = .by, .meta = .meta,
+        .errorbars = .errorbars, .errorbars.off = .errorbars.off, .stack = FALSE,
+        .points = .points, .test = .test, .signif.label.size = .signif.label.size,
+        .legend = .legend)
+  } else {
+    vis_box(
+        .data = .data, .by = .by, .meta = .meta, .test = .test,
+        .points = .points, .signif.label.size = .signif.label.size,
+        .defgroupby = "Sample", .grouping.var = "Group",
+        .labs = c(NA, "Effective number of clonoypes"),
+        .title = "True diversity", .subtitle = "Sample diversity estimation using the true diversity index",
+        .legend = NA, .leg.title = NA, .melt = FALSE
+    )
+  }
+}
+vis.immunr_chao1 <- function(.data, .by = NA, .meta = NA,
+                            .errorbars = c(0.025, 0.975), .errorbars.off = FALSE,
+                            .points = TRUE, .test = TRUE, .signif.label.size = 3.5,
+                            .legend = NA, .plot.type = "bar", ...) {
+  # Bar plots are delegated to immunarch's own vis.immunr_chao1; any other plot type is drawn with vis_box
+  if (.plot.type == "bar") {
+    immunarch:::vis.immunr_chao1(.data = .data,.by = .by, .meta = .meta,
+        .errorbars = .errorbars, .errorbars.off = .errorbars.off, .stack = FALSE,
+        .points = .points, .test = .test, .signif.label.size = .signif.label.size,
+        .legend = .legend)
+  } else {
+    .data <- data.frame(Sample = row.names(.data), Value = .data[, 1])
+    vis_box(
+        .data = .data, .by = .by, .meta = .meta, .test = .test,
+        .points = .points, .signif.label.size = .signif.label.size,
+        .defgroupby = "Sample", .grouping.var = "Group",
+        .labs = c(NA, "Chao1"),
+        .title = "Chao1", .subtitle = "Sample diversity estimation using Chao1",
+        .legend = NA, .leg.title = NA, .melt = FALSE
+    )
+  }
+}
+vis.immunr_ginisimp <- function(.data, .by = NA, .meta = NA,
+                            .errorbars = c(0.025, 0.975), .errorbars.off = FALSE,
+                            .points = TRUE, .test = TRUE, .signif.label.size = 3.5,
+                            .legend = NA, .plot.type = "bar", ...) {
+  # Bar plots are delegated to immunarch's own vis.immunr_ginisimp; any other plot type is drawn with vis_box
+  if (.plot.type == "bar") {
+    immunarch:::vis.immunr_ginisimp(.data = .data,.by = .by, .meta = .meta,
+        .errorbars = .errorbars, .errorbars.off = .errorbars.off, .stack = FALSE,
+        .points = .points, .test = .test, .signif.label.size = .signif.label.size,
+        .legend = .legend)
+  } else {
+    vis_box(
+        .data = .data, .by = .by, .meta = .meta, .test = .test,
+        .points = .points, .signif.label.size = .signif.label.size,
+        .defgroupby = "Sample", .grouping.var = "Group",
+        .labs = c(NA, "Gini-Simpson index"),
+        .title = "Gini-Simpson index", .subtitle = "Sample diversity estimation using the Gini-Simpson index",
+        .legend = .legend, .leg.title = NA, .melt = FALSE
+    )
+  }
+}
+vis.immunr_invsimp <- function(.data, .by = NA, .meta = NA,
+                            .errorbars = c(0.025, 0.975), .errorbars.off = FALSE,
+                            .points = TRUE, .test = TRUE, .signif.label.size = 3.5,
+                            .legend = NA, .plot.type = "bar", ...) {
+  # Bar plots are delegated to immunarch's own vis.immunr_invsimp; any other plot type is drawn with vis_box
+  if (.plot.type == "bar") {
+    immunarch:::vis.immunr_invsimp(.data = .data,.by = .by, .meta = .meta,
+        .errorbars = .errorbars, .errorbars.off = .errorbars.off, .stack = FALSE,
+        .points = .points, .test = .test, .signif.label.size = .signif.label.size,
+        .legend = .legend)
+  } else {
+    vis_box(
+        .data = .data, .by = .by, .meta = .meta, .test = .test,
+        .points = .points, .signif.label.size = .signif.label.size,
+        .defgroupby = "Sample", .grouping.var = "Group",
+        .labs = c(NA, "Inverse Simpson index"),
+        .title = "Inverse Simpson index", .subtitle = "Sample diversity estimation using the inverse Simpson index",
+        .legend = .legend, .leg.title = NA, .melt = FALSE
+    )
+  }
+}
+vis.immunr_dxx <- function(.data, .by = NA, .meta = NA,
+                            .errorbars = c(0.025, 0.975), .errorbars.off = FALSE,
+                            .points = TRUE, .test = TRUE, .signif.label.size = 3.5,
+                            .legend = NA, .plot.type = "bar", ...) {
+  # Bar plots are delegated to immunarch's own vis.immunr_dxx; any other plot type is drawn with vis_box
+  if (.plot.type == "bar") {
+    immunarch:::vis.immunr_dxx(.data = .data,.by = .by, .meta = .meta,
+        .errorbars = .errorbars, .errorbars.off = .errorbars.off, .stack = FALSE,
+        .points = .points, .test = .test, .signif.label.size = .signif.label.size,
+        .legend = .legend)
+  } else {
+    perc_value <- round(.data[1, 2][1])
+    .data <- data.frame(Sample = row.names(.data), Value = .data[, 1])
+    vis_box(
+        .data = .data, .by = .by, .meta = .meta, .test = .test,
+        .points = .points, .signif.label.size = .signif.label.size,
+        .defgroupby = "Sample", .grouping.var = "Group",
+        .labs = c(NA, paste0("D", perc_value)),
+        .title = paste0("D", perc_value, " diversity index"), .subtitle = paste0("Number of clonotypes occupying the ", perc_value, "% of repertoires"),
+        .legend = .legend, .leg.title = NA, .melt = FALSE
+    )
+  }
+}

biopipen/scripts/tcr/vdjtools-patch.sh CHANGED Viewed

@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 # run the command and capture the stdout
-out=$(command $@)
+out=$(command "$@")
 echo "$out"

biopipen/scripts/vcf/BcftoolsAnnotate.py ADDED Viewed

@@ -0,0 +1,91 @@
+from os import path
+from contextlib import suppress
+from pathlib import PosixPath  # noqa: F401
+from biopipen.utils.reference import tabix_index
+from biopipen.utils.misc import logger
+from biopipen.scripts.vcf.bcftools_utils import run_bcftools
+infile: str = {{in.infile | quote}}  # pyright: ignore # noqa: E999
+annfile: str = {{in.annfile | quote}}  # pyright: ignore
+outfile: str = {{out.outfile | quote}}  # pyright: ignore
+joboutdir: str = {{job.outdir | quote}}  # pyright: ignore
+envs: dict = {{envs | dict | repr}}  # pyright: ignore
+bcftools = envs.pop("bcftools")
+tabix = envs.pop("tabix")
+ncores = envs.pop("ncores")
+columns = envs.pop("columns")
+remove = envs.pop("remove")
+header = envs.pop("header")
+gz = envs.pop("gz")
+index = envs.pop("index")
+if isinstance(columns, list):
+    columns = ",".join(columns)
+if "c" in envs:
+    logger.warning(r"Ignoring envs\[c], use envs\[columns] instead.")
+    del envs["c"]
+if isinstance(remove, list):
+    remove = ",".join(remove)
+if "x" in envs:
+    logger.warning(r"Ignoring envs\[x], use envs\[remove] instead.")
+    del envs["x"]
+envs_has_annfile = "a" in envs or "annotations" in envs
+headerfile = path.join(joboutdir, "header.txt")
+if header:
+    with open(headerfile, "w") as fh:
+        fh.writelines(header)
+if annfile and envs_has_annfile:
+    logger.warning(
+        r"Ignoring envs\[a/annotations] because in.annfile is provided."
+    )
+    with suppress(KeyError):
+        del envs["a"]
+    with suppress(KeyError):
+        del envs["annotations"]
+elif not annfile and envs_has_annfile:
+    annfile = envs.pop("annotations", None) or envs.pop("a", None)
+if index and not gz:
+    logger.warning("Forcing envs.gz to True because envs.index is True.")
+    gz = True
+envs[""] = [bcftools, "annotate"]
+envs["o"] = outfile
+envs["threads"] = ncores
+if "O" not in envs and "output-type" not in envs and "output_type" not in envs:
+    envs["O"] = "z" if gz else "v"
+if columns:
+    envs["columns"] = columns
+    if not annfile:
+        raise ValueError(
+            "envs.columns specified but no in.annfile/envs.annfile provided."
+        )
+    envs["_"] = tabix_index(infile, "vcf", tabix=tabix)
+if remove:
+    envs["remove"] = remove
+    # no need to index it
+    envs["_"] = infile
+if "columns" not in envs and "remove" not in envs:
+    logger.warning(
+        "No columns/remove specified, no columns will be carried over or removed."
+    )
+if annfile:
+    envs["annotations"] = tabix_index(annfile, "vcf", tabix=tabix)
+if header:
+    envs["header_lines"] = headerfile
+run_bcftools(envs, bcftools=bcftools, index=index, tabix=tabix)

biopipen/scripts/vcf/BcftoolsFilter.py ADDED Viewed

@@ -0,0 +1,90 @@
+from pathlib import Path, PosixPath  # noqa: F401
+from biopipen.utils.misc import logger
+from biopipen.scripts.vcf.bcftools_utils import run_bcftools
+infile: str | Path = {{in.infile | quote}}  # pyright: ignore # noqa: #999
+outfile: str = {{out.outfile | quote}}  # pyright: ignore
+outdir = Path(outfile).parent
+envs: dict = {{envs | dict | repr}}  # pyright: ignore
+bcftools = envs.pop("bcftools")
+tabix = envs.pop("tabix")
+keep = envs.pop("keep")
+ncores = envs.pop("ncores")
+includes = envs.pop("includes")
+excludes = envs.pop("excludes")
+gz = envs.pop("gz")
+index = envs.pop("index")
+# a.vcf.gz -> a
+# a.vcf -> a
+stem = Path(infile).stem
+if stem.endswith(".vcf"):
+    stem = stem[:-4]
+# .vcf.gz
+# .gz
+ext = ".vcf.gz" if index or gz else '.vcf'
+def normalize_expr(expr, flag, prev_n_filters=0):
+    out = {}
+    if not expr:
+        return out
+    if isinstance(expr, list):
+        for ex in expr:
+            out[f"FILTER_{flag.upper()}_{len(out) + 1 + prev_n_filters}"] = (ex, flag)
+    elif isinstance(expr, dict):
+        for name, ex in expr.items():
+            out[name] = (ex, flag)
+    else: # str
+        out[f"FILTER_{flag.upper()}_{len(out) + 1 + prev_n_filters}"] = (expr, flag)
+    return out
+def handle_filter(vcf, fname, filt, flag, final):
+    logger.info("- Handling filter %s: %s ...", fname, filt)
+    arguments = envs.copy()
+    arguments[flag] = filt
+    arguments["_"] = vcf
+    arguments["o"] = outfile if final else outdir / f"{stem}.{fname}{ext}"
+    if keep:
+        arguments["s"] = fname
+    run_bcftools(arguments, bcftools=bcftools, index=index and final, tabix=tabix)
+    if final:
+        flagfile = outdir.joinpath(f"{stem}.{fname}{ext}")
+        if flagfile.is_symlink():
+            flagfile.unlink()
+        outdir.joinpath(f"{stem}.{fname}{ext}").symlink_to(outfile)
+    return arguments["o"]
+includes = normalize_expr(includes, "include")
+excludes = normalize_expr(excludes, "exclude", len(includes))
+includes.update(excludes)
+if index and not gz:
+    logger.warning("Forcing envs.gz to True because envs.index is True.")
+    gz = True
+envs[""] = [bcftools, "filter"]
+envs["_"] = infile
+envs["o"] = outfile
+envs["threads"] = ncores
+if "O" not in envs and "output-type" not in envs and "output_type" not in envs:
+    envs["O"] = "z" if gz else "v"
+if keep:
+    envs["soft_filter"] = "+"
+if "m" not in envs and "mode" not in envs:
+    envs["m"] = "+"
+# bcftools can be only done once at one filter
+for i, (fname, (filt, flag)) in enumerate(includes.items()):
+    infile = handle_filter(infile, fname, filt, flag, i == len(includes) - 1)

biopipen/scripts/vcf/BcftoolsMerge.py ADDED Viewed

@@ -0,0 +1,31 @@
+from biopipen.utils.reference import tabix_index
+from biopipen.utils.misc import logger
+from biopipen.scripts.vcf.bcftools_utils import run_bcftools
+infiles: list = {{in.infiles | each: as_path}}  # pyright: ignore # noqa: E999
+outfile = {{out.outfile | repr}}  # pyright: ignore
+joboutdir = {{job.outdir | repr}}  # pyright: ignore
+envs: dict = {{envs | dict | repr}}  # pyright: ignore
+bcftools = envs.pop("bcftools")
+tabix = envs.pop("tabix")
+ncores = envs.pop("ncores")
+gz = envs.pop("gz")
+index = envs.pop("index")
+envs.setdefault("force-single", True)
+envs.setdefault("missing-to-ref", True)
+if index and not gz:
+    logger.warning("Forcing envs.gz to True because envs.index is True.")
+    gz = True
+if "O" not in envs and "output-type" not in envs and "output_type" not in envs:
+    envs["O"] = "z" if gz else "v"
+envs[""] = [bcftools, "merge"]
+envs["o"] = outfile
+envs["threads"] = ncores
+envs["_"] = infiles
+run_bcftools(envs, bcftools=bcftools, index=index, tabix=tabix)

biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl

biopipen 0.21.0py3-none-any.whl → 0.34.26py3-none-any.whl