PyPI - biopipen - Versions diffs - 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl - Mend

biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (290) hide show

biopipen/__init__.py +1 -1
biopipen/core/config.toml +28 -0
biopipen/core/filters.py +79 -4
biopipen/core/proc.py +12 -3
biopipen/core/testing.py +75 -3
biopipen/ns/bam.py +148 -6
biopipen/ns/bed.py +75 -0
biopipen/ns/cellranger.py +186 -0
biopipen/ns/cellranger_pipeline.py +126 -0
biopipen/ns/cnv.py +19 -3
biopipen/ns/cnvkit.py +1 -1
biopipen/ns/cnvkit_pipeline.py +20 -12
biopipen/ns/delim.py +34 -35
biopipen/ns/gene.py +68 -23
biopipen/ns/gsea.py +63 -37
biopipen/ns/misc.py +39 -14
biopipen/ns/plot.py +304 -1
biopipen/ns/protein.py +183 -0
biopipen/ns/regulatory.py +290 -0
biopipen/ns/rnaseq.py +142 -5
biopipen/ns/scrna.py +2053 -473
biopipen/ns/scrna_metabolic_landscape.py +228 -382
biopipen/ns/snp.py +659 -0
biopipen/ns/stats.py +484 -0
biopipen/ns/tcr.py +683 -98
biopipen/ns/vcf.py +236 -2
biopipen/ns/web.py +97 -6
biopipen/reports/bam/CNVpytor.svelte +4 -9
biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
biopipen/reports/common.svelte +15 -0
biopipen/reports/protein/ProdigySummary.svelte +16 -0
biopipen/reports/scrna/CellsDistribution.svelte +4 -39
biopipen/reports/scrna/DimPlots.svelte +1 -1
biopipen/reports/scrna/MarkersFinder.svelte +6 -126
biopipen/reports/scrna/MetaMarkers.svelte +3 -75
biopipen/reports/scrna/RadarPlots.svelte +4 -20
biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
biopipen/reports/snp/PlinkCallRate.svelte +24 -0
biopipen/reports/snp/PlinkFreq.svelte +18 -0
biopipen/reports/snp/PlinkHWE.svelte +18 -0
biopipen/reports/snp/PlinkHet.svelte +18 -0
biopipen/reports/snp/PlinkIBD.svelte +18 -0
biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
biopipen/reports/tcr/ClonalStats.svelte +16 -0
biopipen/reports/tcr/CloneResidency.svelte +3 -93
biopipen/reports/tcr/Immunarch.svelte +4 -155
biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
biopipen/reports/tcr/TESSA.svelte +11 -28
biopipen/reports/utils/misc.liq +22 -7
biopipen/scripts/bam/BamMerge.py +11 -15
biopipen/scripts/bam/BamSampling.py +90 -0
biopipen/scripts/bam/BamSort.py +141 -0
biopipen/scripts/bam/BamSplitChroms.py +10 -10
biopipen/scripts/bam/BamSubsetByBed.py +38 -0
biopipen/scripts/bam/CNAClinic.R +41 -5
biopipen/scripts/bam/CNVpytor.py +153 -54
biopipen/scripts/bam/ControlFREEC.py +13 -14
biopipen/scripts/bam/SamtoolsView.py +33 -0
biopipen/scripts/bed/Bed2Vcf.py +5 -5
biopipen/scripts/bed/BedConsensus.py +5 -5
biopipen/scripts/bed/BedLiftOver.sh +6 -4
biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
biopipen/scripts/bed/BedtoolsMerge.py +4 -4
biopipen/scripts/cellranger/CellRangerCount.py +138 -0
biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
biopipen/scripts/cnv/AneuploidyScore.R +55 -20
biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
biopipen/scripts/cnv/TMADScore.R +25 -9
biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
biopipen/scripts/cnvkit/guess_baits.py +166 -93
biopipen/scripts/delim/RowsBinder.R +1 -1
biopipen/scripts/delim/SampleInfo.R +116 -118
biopipen/scripts/gene/GeneNameConversion.R +67 -0
biopipen/scripts/gene/GenePromoters.R +61 -0
biopipen/scripts/gsea/Enrichr.R +5 -5
biopipen/scripts/gsea/FGSEA.R +184 -50
biopipen/scripts/gsea/GSEA.R +2 -2
biopipen/scripts/gsea/PreRank.R +5 -5
biopipen/scripts/misc/Config2File.py +2 -2
biopipen/scripts/misc/Plot.R +80 -0
biopipen/scripts/misc/Shell.sh +15 -0
biopipen/scripts/misc/Str2File.py +2 -2
biopipen/scripts/plot/Heatmap.R +3 -3
biopipen/scripts/plot/Manhattan.R +147 -0
biopipen/scripts/plot/QQPlot.R +146 -0
biopipen/scripts/plot/ROC.R +88 -0
biopipen/scripts/plot/Scatter.R +112 -0
biopipen/scripts/plot/VennDiagram.R +5 -9
biopipen/scripts/protein/MMCIF2PDB.py +33 -0
biopipen/scripts/protein/PDB2Fasta.py +60 -0
biopipen/scripts/protein/Prodigy.py +119 -0
biopipen/scripts/protein/ProdigySummary.R +140 -0
biopipen/scripts/protein/RMSD.py +178 -0
biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
biopipen/scripts/regulatory/MotifScan.py +159 -0
biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
biopipen/scripts/regulatory/motifs-common.R +324 -0
biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
biopipen/scripts/rnaseq/Simulation.R +21 -0
biopipen/scripts/rnaseq/UnitConversion.R +325 -54
biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
biopipen/scripts/scrna/CellCellCommunication.py +150 -0
biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
biopipen/scripts/scrna/CellSNPLite.py +30 -0
biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
biopipen/scripts/scrna/CellsDistribution.R +456 -167
biopipen/scripts/scrna/DimPlots.R +1 -1
biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
biopipen/scripts/scrna/ExprImputation.R +7 -0
biopipen/scripts/scrna/LoomTo10X.R +51 -0
biopipen/scripts/scrna/MQuad.py +25 -0
biopipen/scripts/scrna/MarkersFinder.R +679 -400
biopipen/scripts/scrna/MetaMarkers.R +265 -161
biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
biopipen/scripts/scrna/RadarPlots.R +355 -134
biopipen/scripts/scrna/ScFGSEA.R +298 -100
biopipen/scripts/scrna/ScSimulation.R +65 -0
biopipen/scripts/scrna/ScVelo.py +617 -0
biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
biopipen/scripts/scrna/SeuratClustering.R +36 -233
biopipen/scripts/scrna/SeuratLoading.R +2 -2
biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
biopipen/scripts/scrna/SeuratPreparing.R +223 -173
biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
biopipen/scripts/scrna/SeuratTo10X.R +27 -0
biopipen/scripts/scrna/Slingshot.R +65 -0
biopipen/scripts/scrna/Subset10X.R +2 -2
biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
biopipen/scripts/scrna/scvelo_paga.py +313 -0
biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
biopipen/scripts/snp/MatrixEQTL.R +217 -0
biopipen/scripts/snp/Plink2GTMat.py +148 -0
biopipen/scripts/snp/PlinkCallRate.R +199 -0
biopipen/scripts/snp/PlinkFilter.py +100 -0
biopipen/scripts/snp/PlinkFreq.R +291 -0
biopipen/scripts/snp/PlinkFromVcf.py +81 -0
biopipen/scripts/snp/PlinkHWE.R +85 -0
biopipen/scripts/snp/PlinkHet.R +96 -0
biopipen/scripts/snp/PlinkIBD.R +196 -0
biopipen/scripts/snp/PlinkSimulation.py +124 -0
biopipen/scripts/snp/PlinkUpdateName.py +124 -0
biopipen/scripts/stats/ChowTest.R +146 -0
biopipen/scripts/stats/DiffCoexpr.R +152 -0
biopipen/scripts/stats/LiquidAssoc.R +135 -0
biopipen/scripts/stats/Mediation.R +108 -0
biopipen/scripts/stats/MetaPvalue.R +130 -0
biopipen/scripts/stats/MetaPvalue1.R +74 -0
biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
biopipen/scripts/tcr/Attach2Seurat.R +3 -2
biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
biopipen/scripts/tcr/CDR3Clustering.R +343 -0
biopipen/scripts/tcr/ClonalStats.R +526 -0
biopipen/scripts/tcr/CloneResidency.R +255 -131
biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
biopipen/scripts/tcr/GIANA/query.py +164 -162
biopipen/scripts/tcr/Immunarch-basic.R +31 -9
biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
biopipen/scripts/tcr/Immunarch.R +63 -11
biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
biopipen/scripts/tcr/SampleDiversity.R +1 -1
biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
biopipen/scripts/tcr/ScRepLoading.R +166 -0
biopipen/scripts/tcr/TCRClusterStats.R +176 -22
biopipen/scripts/tcr/TCRDock.py +110 -0
biopipen/scripts/tcr/TESSA.R +102 -118
biopipen/scripts/tcr/VJUsage.R +5 -5
biopipen/scripts/tcr/immunarch-patched.R +142 -0
biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
biopipen/scripts/vcf/BcftoolsSort.py +113 -0
biopipen/scripts/vcf/BcftoolsView.py +73 -0
biopipen/scripts/vcf/TruvariBench.sh +14 -7
biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
biopipen/scripts/vcf/TruvariConsistency.R +1 -1
biopipen/scripts/vcf/Vcf2Bed.py +2 -2
biopipen/scripts/vcf/VcfAnno.py +11 -11
biopipen/scripts/vcf/VcfDownSample.sh +22 -10
biopipen/scripts/vcf/VcfFilter.py +5 -5
biopipen/scripts/vcf/VcfFix.py +7 -7
biopipen/scripts/vcf/VcfFix_utils.py +13 -4
biopipen/scripts/vcf/VcfIndex.py +3 -3
biopipen/scripts/vcf/VcfIntersect.py +3 -3
biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
biopipen/scripts/vcf/bcftools_utils.py +52 -0
biopipen/scripts/web/Download.py +8 -4
biopipen/scripts/web/DownloadList.py +5 -5
biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
biopipen/scripts/web/gcloud_common.py +49 -0
biopipen/utils/gene.py +108 -60
biopipen/utils/misc.py +146 -20
biopipen/utils/reference.py +64 -20
biopipen/utils/reporter.py +177 -0
biopipen/utils/vcf.py +1 -1
biopipen-0.34.26.dist-info/METADATA +27 -0
biopipen-0.34.26.dist-info/RECORD +292 -0
{biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
{biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
biopipen/ns/bcftools.py +0 -111
biopipen/ns/scrna_basic.py +0 -255
biopipen/reports/delim/SampleInfo.svelte +0 -36
biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
biopipen/reports/scrna/ScFGSEA.svelte +0 -35
biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
biopipen/reports/utils/gsea.liq +0 -110
biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
biopipen/scripts/gene/GeneNameConversion.py +0 -66
biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
biopipen/scripts/scrna/ExprImpution.R +0 -7
biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
biopipen/scripts/scrna/Write10X.R +0 -11
biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
biopipen/scripts/tcr/TCRClustering.R +0 -280
biopipen/utils/common_docstrs.py +0 -61
biopipen/utils/gene.R +0 -49
biopipen/utils/gsea.R +0 -193
biopipen/utils/io.R +0 -20
biopipen/utils/misc.R +0 -114
biopipen/utils/mutate_helpers.R +0 -433
biopipen/utils/plot.R +0 -173
biopipen/utils/rnaseq.R +0 -48
biopipen/utils/single_cell.R +0 -115
biopipen-0.21.0.dist-info/METADATA +0 -22
biopipen-0.21.0.dist-info/RECORD +0 -218

biopipen/scripts/tcr/CDR3AAPhyschem.R CHANGED Viewed

@@ -1,33 +1,45 @@
+library(rlang)
 library(dplyr)
 library(tidyr)
 library(tibble)
-library(ggplot2)
-library(ggridges)
 library(glue)
 library(hash)
 library(glmnet)
 library(broom.mixed)
 library(stringr)
+library(plotthis)
+library(biopipen.utils)
-immdatafile = {{in.immdata | quote}}
-srtobjfile = {{in.srtobj | r}}
-outdir = {{out.outdir | quote}}
-group_name = {{envs.group | r}}
-comparison = {{envs.comparison | r}}
-prefix = {{envs.prefix | r}}
-target = {{envs.target | r}}
-subset_cols = {{envs.subset | r}}
+scrfile <- {{in.scrfile | r}}
+outdir <- {{out.outdir | r}}
+joboutdir <- {{job.outdir | r}}
+group_name <- {{envs.group | r}}
+comparison <- {{envs.comparison | r}}
+target <- {{envs.target | r}}
+each_cols <- {{envs.each | r}}
+log <- get_logger()
+reporter <- get_reporter()
 if (is.null(group_name) || is.null(comparison)) {
     stop("envs.group and envs.comparison must be specified")
 }
-if (is.null(target)) {
-    stop("envs.target must be specified, which should be one of the keys in `envs.comparison`")
+if (length(comparison) != 2) {
+    stop("envs.comparison must have exactly two elements or keys, representing the two groups to compare")
+}
+if (!is.list(comparison)) {
+    comparison <- stats::setNames(as.list(comparison), comparison)
+}
+target <- target %||% names(comparison)[1]
+if (!(target %in% names(comparison))) {
+    stop(paste0("Target group '", target, "' not found in the comparison groups."))
 }
-if (is.character(subset_cols) && length(subset_cols) == 1) {
-    subset_cols = trimws(strsplit(subset_cols, ",")[[1]])
+if (is.character(each_cols) && length(each_cols) == 1) {
+    each_cols = trimws(strsplit(each_cols, ",")[[1]])
 }
 ### Helpers
@@ -140,98 +152,43 @@ for (i in 1:3){
   AA_MAPS[[i]] <- create_hashmap(as.character(RF$AA), as.vector(RF[,(i+1),drop=TRUE]))
 }
-# Loading metadata from srtobjfile
-print("Loading metadata from srtobjfile")
-if (is.null(srtobjfile)) {
-    metadata = NULL
-} else {
-    # Get the extension (lowercase) of srtobjfile, see if it is .rds file
-    srtobjfile_ext = tolower(tools::file_ext(srtobjfile))
-    if (srtobjfile_ext != "rds") {
-        metadata = read.table(
-            srtobjfile,
-            sep = "\t",
-            header = TRUE,
-            row.names = 1,
-            stringsAsFactors = FALSE,
-            check.names = FALSE,
-        )
-    } else {
-        metadata = readRDS(srtobjfile)@meta.data
-    }
+log$info("Loading data from input file")
+mdata <- read_obj(scrfile)@meta.data
+if (!group_name %in% colnames(mdata)) {
+    stop(paste0("Group name '", group_name, "' not found in the data."))
 }
-print("Loading immdata from immdatafile")
-immdata = readRDS(immdatafile)
-merge_data = function(sam) {
-    # Merge the data for one sample from immdata and metadata
-    out = immdata$data[[sam]] %>%
-        mutate(
-            Sample = sam,
-            locus = "TCRB",
-            sequence = CDR3.aa,
-            length = nchar(sequence),
-            vgene = V.name,
-            jgene = J.name,
-        ) %>%
-        select(Sample, Barcode, locus, sequence, length, vgene, jgene) %>%
-        separate_longer_delim(Barcode, delim = ";") %>%
-        left_join(immdata$meta, by = "Sample")
-    if (is.null(metadata)) {
-        # No metadata, just return
-        return (out)
-    }
+# check if values of comparison are in the group_name column
+if (!all(unlist(comparison) %in% as.character(mdata[[group_name]]))) {
+    stop(paste0("Some values in comparison are not found in the group_name column: ",
+                paste(setdiff(unlist(comparison), mdata[[group_name]]), collapse = ", ")))
+}
-    # Merge with metadata
-    sdata = metadata %>% filter(Sample == sam)
-    if (!is.null(prefix) && nchar(prefix) > 0) {
-        # Replace the placeholder like {Sample} with the data in other columns
-        # in the same row
-        sdata = sdata %>% mutate(.prefix_len = nchar(glue("{{envs.prefix}}")))
-        # Remove the prefix in the rownames of sdata
-        rownames(sdata) = substring(rownames(sdata), sdata$.prefix_len + 1)
-        sdata = sdata %>% select(-.prefix_len)
-    }
-    sdata = rownames_to_column(sdata, "Barcode")
-    out = out %>% left_join(sdata, by = "Barcode", suffix = c("", "_seurat"))
-    out$.Group = NA_character_
-    for (k in names(comparison)) {
-        group_mask = out[[group_name]] %in% comparison[[k]]
-        if (sum(group_mask) == 0) {
-            stop(
-                glue("No cells in comparison group {k}. Please check if the group items {comparison[[k]]} exist.")
-            )
+# add a new column with the keys of comparison, when their values are in the group_name column
+mdata$.Group <- sapply(as.character(mdata[[group_name]]), function(x) {
+    for (key in names(comparison)) {
+        if (x %in% comparison[[key]]) {
+            return(key)
         }
-        out$.Group[out[[group_name]] %in% comparison[[k]]] = k
-    }
-    if (!is.null(subset_cols)) {
-        out = out %>% unite(".Subset", all_of(subset_cols), sep = "_", remove = FALSE)
     }
-    return (out)
-}
-# Expanded and merged with metadata
-# Now we are able to select the cells using group and comparison
-print("Merging data with metadata for each sample")
-merged = NULL
-for (sam in immdata$meta$Sample) {
-    print(glue("- For sample {sam}"))
-    md = merge_data(sam)
-    merged = if (is.null(merged)) md else rbind(merged, md)
-}
+    return(NA)
+})
+mdata <- mdata %>%
+    separate(CTaa, into = c(NA, "sequence"), sep = "_", remove = FALSE) %>%
+    separate(CTgene, into = c(NA, "vjgene"), sep = "_", remove = FALSE) %>%
+    separate(vjgene, into = c("vgene", NA, "jgene", NA), sep = "\\.", remove = FALSE) %>%
+    mutate(length = nchar(sequence))
 # Statistics about the cell numbers with groups available in metadata
 # !!group_name, TotalCells, AvailCells, AvailCellsPct
-print("Calculating statistics")
-if (is.null(subset_cols)) {
-    stats = merged %>%
+log$info("Calculating statistics")
+if (is.null(each_cols)) {
+    stats = mdata %>%
         # group by group_name
         group_by(.Group) %>%
         summarise(
-            TotalCells = nrow(merged),
+            TotalCells = nrow(mdata),
             CellsPerGroup = n(),
             AvailCellsPerGroup = sum(length >= CDR3_MINLEN & length <= CDR3_MAXLEN),
             # Percentage with % in character
@@ -239,14 +196,15 @@ if (is.null(subset_cols)) {
             .groups = "drop"
         )
 } else {
-    stats = merged %>%
+    stats = mdata %>%
+        unite(".Subset", all_of(each_cols), sep = "_", remove = FALSE) %>%
         group_by(.Subset) %>%
         group_map(function(df, .y) {
             df %>%
                 group_by(.Group) %>%
                 summarise(
                     .Subset = .y$.Subset[1],
-                    AllCells = nrow(merged),
+                    AllCells = nrow(mdata),
                     TotalCells = nrow(df),
                     CellsPerGroup = n(),
                     AvailCellsPerGroup = sum(length >= CDR3_MINLEN & length <= CDR3_MAXLEN),
@@ -259,23 +217,61 @@ if (is.null(subset_cols)) {
 }
 # save the stats
-write.table(stats, file = file.path(outdir, "stats.txt"), sep = "\t", quote = FALSE, row.names = FALSE)
+write.table(
+    stats,
+    file = file.path(outdir, "stats.txt"),
+    sep = "\t",
+    quote = FALSE,
+    row.names = FALSE,
+)
-print("Add amino acid features")
-merged = merged %>%
+reporter$add(
+    list(
+        kind = "descr",
+        content = "Statistics about the cells mapped to the comparison groups. Columns:"
+    ),
+    list(
+        kind = "list",
+        items = c(
+            "_Group: The group name in the comparison, or null, if cells are not mapped to any group",
+            "TotalCells: The total number of cells. This number should be the same for all groups",
+            "CellsPerGroup: The number of cells in the mapped group",
+            paste0(
+                "AvailCellsPerGroup: The number of cells with CDR3 length between ",
+                CDR3_MINLEN,
+                " and ",
+                CDR3_MAXLEN,
+                " for each group. These cells are used for the analysis"
+            ),
+            "AvailCellsPct: The percentage of AvailCellsPerGroup over CellsPerGroup"
+        )
+    ),
+    list(
+        kind = "table",
+        src = file.path(outdir, "stats.txt")
+    ),
+    h1 = "Available Cells"
+)
+log$info("Add amino acid features")
+mdata = mdata %>%
     filter(!is.na(.Group) & length >= CDR3_MINLEN & length <= CDR3_MAXLEN) %>%
     add_percentAA() %>%
     add_positionalAA()
 do_one_subset = function(s) {
-    print(paste("Processing subset", s))
+    if (!is.null(s)) {
+        log$info(paste("Processing subset", s))
+    }
     if (is.null(s)) {
-        data = merged
+        data = mdata
         odir = file.path(outdir, "ALL")
     } else {
-        data = merged %>% filter(.Subset == s)
-        odir = file.path(outdir, s)
+        data = mdata %>% filter(.Subset == s)
+        odir = file.path(outdir, slugify(s))
     }
     dir.create(odir, recursive = TRUE, showWarnings = FALSE)
@@ -299,6 +295,13 @@ do_one_subset = function(s) {
             }
         }
         y = ifelse(data_fit$.Group == target, 1, 0)
+        if (any(table(y) <= 3) || length(table(y)) < 2) {
+            if (is.null(s)) {
+                log$warn(paste0("Not enough observations for target group '", target, "' with CDR3 length ", len, ". At least 4 observations are required."))
+            } else {
+                log$warn(paste0("Not enough observations for target group '", target, "' in subset '", s, "' with CDR3 length ", len, ". At least 4 observations are required."))
+            }
+        }
         # one multinomial or binomial class has 1 or 0 observations; not allowed
         if (any(table(y) <= 1)) { next }
         fit = glmnet(x, y, data=data_fit, alpha=0, lambda=0.01, family="binomial")
@@ -327,56 +330,121 @@ do_one_subset = function(s) {
     write.table(alldf, file = file.path(odir, "estimates.txt"), sep = "\t", quote = FALSE, row.names = FALSE)
     # save the plots
-    gr = alldf %>%
-        group_by(imgt_pos, feature) |>
+    gr <- alldf %>%
+        group_by(imgt_pos, feature) %>%
         summarise(coef = mean(estimate))
     # Avoid too large values
-    gr$coef[gr$coef > 1.5] = 1.5
-    g = ggplot(gr, aes(imgt_pos, exp(coef), color=feature))
-    g = g + geom_point() + geom_line(aes(group=feature)) + theme_classic() + geom_hline(yintercept=1)
-    g = g + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + scale_color_manual(values=c("#eead0c", "#ed6a51", "#02868a"))
-    g = g + xlab("TCR position") + ylab(paste("Coefficient for", target, "prediction")) + ggtitle(s)
-    png(file.path(odir, "estimated_coefficients.png"), width=1000, height=1000, res=100)
-    print(g)
-    dev.off()
+    gr$coef[gr$coef > 1.5] <- 1.5
+    gr$coef <- exp(gr$coef)  # Exponentiate the coefficients
+    g <- LinePlot(gr, x = "imgt_pos", y = "coef", group_by = "feature",
+        add_line = 1, x_text_angle = 90, xlab = "TCR position",
+        ylab = paste("Coefficient for", target, "prediction"), title = s)
+    save_plot(g, file.path(odir, "estimated_coefficients"),
+        devpars = list(width = 1000, height = 1000, res = 100),
+        formats = c("png", "pdf"))
+    reporter$add(
+        list(
+            kind = "descr",
+            content = "Estimated coefficients for each feature and position in the CDR3"
+        ),
+        h1 = ifelse(
+            is.null(s),
+            "Estimated OR (per s.d.)",
+            paste0(paste(each_cols, collapse = ", "), " - ", s)
+        ),
+        h2 = ifelse(
+            is.null(s),
+            "#",
+            "Estimated OR (per s.d.)"
+        )
+    )
+    reporter$add(
+        list(
+            name = "Plot",
+            contents = list(
+                list(
+                    kind = "image",
+                    src = file.path(odir, "estimated_coefficients.png"),
+                    download = file.path(odir, "estimated_coefficients.pdf")
+                )
+            )
+        ),
+        list(
+            name = "Estimates",
+            contents = list(
+                list(
+                    kind = "table",
+                    src = file.path(odir, "estimates.txt")
+                )
+            )
+        ),
+        h1 = ifelse(
+            is.null(s),
+            "Estimated OR (per s.d.)",
+            paste0(paste(each_cols, collapse = ", "), " - ", s)
+        ),
+        h2 = ifelse(
+            is.null(s),
+            "#",
+            "Estimated OR (per s.d.)"
+        ),
+        ui = "tabs"
+    )
     # distributions
     data$mid_hydro = sapply(data$midseq, function(x) get_feat_score(x, AA_MAPS[[2]]))
     data$smid_hydro = scale(data$mid_hydro)[,1]
-    g = ggplot()
-    # Give colors for different groups
-    cols = c("turquoise3", "darkmagenta", "darkorange", "darkgreen", "darkblue", "darkred")
-    groups = unique(data$.Group)
-    if (length(groups) > length(cols)) {
-        cols = c(cols, c("darkcyan", "darkviolet", "darkgoldenrod", "darkolivegreen", "darkslategray", "darkkhaki"))
-    }
-    cols = cols[1:length(groups)]
-    for (i in seq_along(groups)) {
-        g = g + geom_vline(
-          xintercept = mean(data$smid_hydro[data$.Group==groups[i]]),
-          color=cols[i]
+    g <- RidgePlot(
+        data = data,
+        x = "smid_hydro",
+        group_by = ".Group",
+        xlab = "CDR3bmr hydrophobicity",
+        ylab = "",
+        add_vline = TRUE,
+        alpha = 0.5,
+        title = s,
+        flip = TRUE
+    )
+    save_plot(g, file.path(odir, "distribution"),
+        devpars = list(width = 1000, height = 1000, res = 100),
+        formats = c("png", "pdf"))
+    reporter$add(
+        list(
+            kind = "table_image",
+            descr = paste0(
+                "Hydrophobicity values are averaged over the CDR3 for each TCR and ",
+                "then scaled to have a mean of 0 and a variance of 1. ",
+                "Horizontal lines depict the mean for each population"
+            ),
+            src = file.path(odir, "distribution.png"),
+            download = file.path(odir, "distribution.pdf")
+        ),
+        h1 = ifelse(
+            is.null(s),
+            "Hydrophobicity Distribution",
+            paste0(paste(each_cols, collapse = ", "), " - ", s)
+        ),
+        h2 = ifelse(
+            is.null(s),
+            "#",
+            "Hydrophobicity Distribution"
         )
-    }
-    g = g + geom_density_ridges(
-      aes(x=data$smid_hydro, y=data$.Group, color=data$.Group, fill=data$.Group),
-      bandwidth=0.5,
-      alpha=0.4,
-      show.legend = FALSE
-    ) + scale_color_manual(values=cols)
-    g = g + scale_fill_manual(values=cols) + theme_bw(base_size=12)
-    g = g + xlim(c(-4,4)) + xlab("CDR3bmr hydrophobicity") + ylab("") + coord_flip() + ggtitle(s)
-    png(file.path(odir, "distribution.png"), width=1000, height=1000, res=100)
-    print(g)
-    dev.off()
+    )
 }
-if (is.null(subset_cols)) {
+if (is.null(each_cols)) {
     do_one_subset(NULL)
 } else {
-    subsets = na.omit(unique(merged$.Subset))
+    subsets = na.omit(unique(obj$.Subset))
     sapply(subsets, do_one_subset)
 }
+reporter$save(joboutdir)

biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl

biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl