biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +28 -0
- biopipen/core/filters.py +79 -4
- biopipen/core/proc.py +12 -3
- biopipen/core/testing.py +75 -3
- biopipen/ns/bam.py +148 -6
- biopipen/ns/bed.py +75 -0
- biopipen/ns/cellranger.py +186 -0
- biopipen/ns/cellranger_pipeline.py +126 -0
- biopipen/ns/cnv.py +19 -3
- biopipen/ns/cnvkit.py +1 -1
- biopipen/ns/cnvkit_pipeline.py +20 -12
- biopipen/ns/delim.py +34 -35
- biopipen/ns/gene.py +68 -23
- biopipen/ns/gsea.py +63 -37
- biopipen/ns/misc.py +39 -14
- biopipen/ns/plot.py +304 -1
- biopipen/ns/protein.py +183 -0
- biopipen/ns/regulatory.py +290 -0
- biopipen/ns/rnaseq.py +142 -5
- biopipen/ns/scrna.py +2053 -473
- biopipen/ns/scrna_metabolic_landscape.py +228 -382
- biopipen/ns/snp.py +659 -0
- biopipen/ns/stats.py +484 -0
- biopipen/ns/tcr.py +683 -98
- biopipen/ns/vcf.py +236 -2
- biopipen/ns/web.py +97 -6
- biopipen/reports/bam/CNVpytor.svelte +4 -9
- biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
- biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
- biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
- biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
- biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
- biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
- biopipen/reports/common.svelte +15 -0
- biopipen/reports/protein/ProdigySummary.svelte +16 -0
- biopipen/reports/scrna/CellsDistribution.svelte +4 -39
- biopipen/reports/scrna/DimPlots.svelte +1 -1
- biopipen/reports/scrna/MarkersFinder.svelte +6 -126
- biopipen/reports/scrna/MetaMarkers.svelte +3 -75
- biopipen/reports/scrna/RadarPlots.svelte +4 -20
- biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
- biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
- biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
- biopipen/reports/snp/PlinkCallRate.svelte +24 -0
- biopipen/reports/snp/PlinkFreq.svelte +18 -0
- biopipen/reports/snp/PlinkHWE.svelte +18 -0
- biopipen/reports/snp/PlinkHet.svelte +18 -0
- biopipen/reports/snp/PlinkIBD.svelte +18 -0
- biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
- biopipen/reports/tcr/ClonalStats.svelte +16 -0
- biopipen/reports/tcr/CloneResidency.svelte +3 -93
- biopipen/reports/tcr/Immunarch.svelte +4 -155
- biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
- biopipen/reports/tcr/TESSA.svelte +11 -28
- biopipen/reports/utils/misc.liq +22 -7
- biopipen/scripts/bam/BamMerge.py +11 -15
- biopipen/scripts/bam/BamSampling.py +90 -0
- biopipen/scripts/bam/BamSort.py +141 -0
- biopipen/scripts/bam/BamSplitChroms.py +10 -10
- biopipen/scripts/bam/BamSubsetByBed.py +38 -0
- biopipen/scripts/bam/CNAClinic.R +41 -5
- biopipen/scripts/bam/CNVpytor.py +153 -54
- biopipen/scripts/bam/ControlFREEC.py +13 -14
- biopipen/scripts/bam/SamtoolsView.py +33 -0
- biopipen/scripts/bed/Bed2Vcf.py +5 -5
- biopipen/scripts/bed/BedConsensus.py +5 -5
- biopipen/scripts/bed/BedLiftOver.sh +6 -4
- biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
- biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
- biopipen/scripts/bed/BedtoolsMerge.py +4 -4
- biopipen/scripts/cellranger/CellRangerCount.py +138 -0
- biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
- biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
- biopipen/scripts/cnv/AneuploidyScore.R +55 -20
- biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
- biopipen/scripts/cnv/TMADScore.R +25 -9
- biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
- biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
- biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
- biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
- biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
- biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
- biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
- biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
- biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
- biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
- biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
- biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
- biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
- biopipen/scripts/cnvkit/guess_baits.py +166 -93
- biopipen/scripts/delim/RowsBinder.R +1 -1
- biopipen/scripts/delim/SampleInfo.R +116 -118
- biopipen/scripts/gene/GeneNameConversion.R +67 -0
- biopipen/scripts/gene/GenePromoters.R +61 -0
- biopipen/scripts/gsea/Enrichr.R +5 -5
- biopipen/scripts/gsea/FGSEA.R +184 -50
- biopipen/scripts/gsea/GSEA.R +2 -2
- biopipen/scripts/gsea/PreRank.R +5 -5
- biopipen/scripts/misc/Config2File.py +2 -2
- biopipen/scripts/misc/Plot.R +80 -0
- biopipen/scripts/misc/Shell.sh +15 -0
- biopipen/scripts/misc/Str2File.py +2 -2
- biopipen/scripts/plot/Heatmap.R +3 -3
- biopipen/scripts/plot/Manhattan.R +147 -0
- biopipen/scripts/plot/QQPlot.R +146 -0
- biopipen/scripts/plot/ROC.R +88 -0
- biopipen/scripts/plot/Scatter.R +112 -0
- biopipen/scripts/plot/VennDiagram.R +5 -9
- biopipen/scripts/protein/MMCIF2PDB.py +33 -0
- biopipen/scripts/protein/PDB2Fasta.py +60 -0
- biopipen/scripts/protein/Prodigy.py +119 -0
- biopipen/scripts/protein/ProdigySummary.R +140 -0
- biopipen/scripts/protein/RMSD.py +178 -0
- biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
- biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
- biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
- biopipen/scripts/regulatory/MotifScan.py +159 -0
- biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
- biopipen/scripts/regulatory/motifs-common.R +324 -0
- biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
- biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
- biopipen/scripts/rnaseq/Simulation.R +21 -0
- biopipen/scripts/rnaseq/UnitConversion.R +325 -54
- biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
- biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
- biopipen/scripts/scrna/CellCellCommunication.py +150 -0
- biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
- biopipen/scripts/scrna/CellSNPLite.py +30 -0
- biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
- biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
- biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
- biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
- biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
- biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
- biopipen/scripts/scrna/CellsDistribution.R +456 -167
- biopipen/scripts/scrna/DimPlots.R +1 -1
- biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
- biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
- biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
- biopipen/scripts/scrna/ExprImputation.R +7 -0
- biopipen/scripts/scrna/LoomTo10X.R +51 -0
- biopipen/scripts/scrna/MQuad.py +25 -0
- biopipen/scripts/scrna/MarkersFinder.R +679 -400
- biopipen/scripts/scrna/MetaMarkers.R +265 -161
- biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
- biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
- biopipen/scripts/scrna/RadarPlots.R +355 -134
- biopipen/scripts/scrna/ScFGSEA.R +298 -100
- biopipen/scripts/scrna/ScSimulation.R +65 -0
- biopipen/scripts/scrna/ScVelo.py +617 -0
- biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
- biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
- biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
- biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
- biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
- biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
- biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
- biopipen/scripts/scrna/SeuratClustering.R +36 -233
- biopipen/scripts/scrna/SeuratLoading.R +2 -2
- biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
- biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
- biopipen/scripts/scrna/SeuratPreparing.R +223 -173
- biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
- biopipen/scripts/scrna/SeuratTo10X.R +27 -0
- biopipen/scripts/scrna/Slingshot.R +65 -0
- biopipen/scripts/scrna/Subset10X.R +2 -2
- biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
- biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
- biopipen/scripts/scrna/scvelo_paga.py +313 -0
- biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
- biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
- biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
- biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
- biopipen/scripts/snp/MatrixEQTL.R +217 -0
- biopipen/scripts/snp/Plink2GTMat.py +148 -0
- biopipen/scripts/snp/PlinkCallRate.R +199 -0
- biopipen/scripts/snp/PlinkFilter.py +100 -0
- biopipen/scripts/snp/PlinkFreq.R +291 -0
- biopipen/scripts/snp/PlinkFromVcf.py +81 -0
- biopipen/scripts/snp/PlinkHWE.R +85 -0
- biopipen/scripts/snp/PlinkHet.R +96 -0
- biopipen/scripts/snp/PlinkIBD.R +196 -0
- biopipen/scripts/snp/PlinkSimulation.py +124 -0
- biopipen/scripts/snp/PlinkUpdateName.py +124 -0
- biopipen/scripts/stats/ChowTest.R +146 -0
- biopipen/scripts/stats/DiffCoexpr.R +152 -0
- biopipen/scripts/stats/LiquidAssoc.R +135 -0
- biopipen/scripts/stats/Mediation.R +108 -0
- biopipen/scripts/stats/MetaPvalue.R +130 -0
- biopipen/scripts/stats/MetaPvalue1.R +74 -0
- biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
- biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
- biopipen/scripts/tcr/Attach2Seurat.R +3 -2
- biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
- biopipen/scripts/tcr/CDR3Clustering.R +343 -0
- biopipen/scripts/tcr/ClonalStats.R +526 -0
- biopipen/scripts/tcr/CloneResidency.R +255 -131
- biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
- biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
- biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
- biopipen/scripts/tcr/GIANA/query.py +164 -162
- biopipen/scripts/tcr/Immunarch-basic.R +31 -9
- biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
- biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
- biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
- biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
- biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
- biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
- biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
- biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
- biopipen/scripts/tcr/Immunarch.R +63 -11
- biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
- biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
- biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
- biopipen/scripts/tcr/SampleDiversity.R +1 -1
- biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
- biopipen/scripts/tcr/ScRepLoading.R +166 -0
- biopipen/scripts/tcr/TCRClusterStats.R +176 -22
- biopipen/scripts/tcr/TCRDock.py +110 -0
- biopipen/scripts/tcr/TESSA.R +102 -118
- biopipen/scripts/tcr/VJUsage.R +5 -5
- biopipen/scripts/tcr/immunarch-patched.R +142 -0
- biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
- biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
- biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
- biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
- biopipen/scripts/vcf/BcftoolsSort.py +113 -0
- biopipen/scripts/vcf/BcftoolsView.py +73 -0
- biopipen/scripts/vcf/TruvariBench.sh +14 -7
- biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
- biopipen/scripts/vcf/TruvariConsistency.R +1 -1
- biopipen/scripts/vcf/Vcf2Bed.py +2 -2
- biopipen/scripts/vcf/VcfAnno.py +11 -11
- biopipen/scripts/vcf/VcfDownSample.sh +22 -10
- biopipen/scripts/vcf/VcfFilter.py +5 -5
- biopipen/scripts/vcf/VcfFix.py +7 -7
- biopipen/scripts/vcf/VcfFix_utils.py +13 -4
- biopipen/scripts/vcf/VcfIndex.py +3 -3
- biopipen/scripts/vcf/VcfIntersect.py +3 -3
- biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
- biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
- biopipen/scripts/vcf/bcftools_utils.py +52 -0
- biopipen/scripts/web/Download.py +8 -4
- biopipen/scripts/web/DownloadList.py +5 -5
- biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
- biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
- biopipen/scripts/web/gcloud_common.py +49 -0
- biopipen/utils/gene.py +108 -60
- biopipen/utils/misc.py +146 -20
- biopipen/utils/reference.py +64 -20
- biopipen/utils/reporter.py +177 -0
- biopipen/utils/vcf.py +1 -1
- biopipen-0.34.26.dist-info/METADATA +27 -0
- biopipen-0.34.26.dist-info/RECORD +292 -0
- {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
- {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
- biopipen/ns/bcftools.py +0 -111
- biopipen/ns/scrna_basic.py +0 -255
- biopipen/reports/delim/SampleInfo.svelte +0 -36
- biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
- biopipen/reports/scrna/ScFGSEA.svelte +0 -35
- biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
- biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
- biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
- biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
- biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
- biopipen/reports/utils/gsea.liq +0 -110
- biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
- biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
- biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
- biopipen/scripts/gene/GeneNameConversion.py +0 -66
- biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
- biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
- biopipen/scripts/scrna/ExprImpution.R +0 -7
- biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
- biopipen/scripts/scrna/Write10X.R +0 -11
- biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
- biopipen/scripts/tcr/TCRClustering.R +0 -280
- biopipen/utils/common_docstrs.py +0 -61
- biopipen/utils/gene.R +0 -49
- biopipen/utils/gsea.R +0 -193
- biopipen/utils/io.R +0 -20
- biopipen/utils/misc.R +0 -114
- biopipen/utils/mutate_helpers.R +0 -433
- biopipen/utils/plot.R +0 -173
- biopipen/utils/rnaseq.R +0 -48
- biopipen/utils/single_cell.R +0 -115
- biopipen-0.21.0.dist-info/METADATA +0 -22
- biopipen-0.21.0.dist-info/RECORD +0 -218
|
@@ -1,33 +1,45 @@
|
|
|
1
|
+
library(rlang)
|
|
1
2
|
library(dplyr)
|
|
2
3
|
library(tidyr)
|
|
3
4
|
library(tibble)
|
|
4
|
-
library(ggplot2)
|
|
5
|
-
library(ggridges)
|
|
6
5
|
library(glue)
|
|
7
6
|
library(hash)
|
|
8
7
|
library(glmnet)
|
|
9
8
|
library(broom.mixed)
|
|
10
9
|
library(stringr)
|
|
10
|
+
library(plotthis)
|
|
11
|
+
library(biopipen.utils)
|
|
11
12
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
group_name
|
|
16
|
-
comparison
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
13
|
+
scrfile <- {{in.scrfile | r}}
|
|
14
|
+
outdir <- {{out.outdir | r}}
|
|
15
|
+
joboutdir <- {{job.outdir | r}}
|
|
16
|
+
group_name <- {{envs.group | r}}
|
|
17
|
+
comparison <- {{envs.comparison | r}}
|
|
18
|
+
target <- {{envs.target | r}}
|
|
19
|
+
each_cols <- {{envs.each | r}}
|
|
20
|
+
|
|
21
|
+
log <- get_logger()
|
|
22
|
+
reporter <- get_reporter()
|
|
20
23
|
|
|
21
24
|
if (is.null(group_name) || is.null(comparison)) {
|
|
22
25
|
stop("envs.group and envs.comparison must be specified")
|
|
23
26
|
}
|
|
24
27
|
|
|
25
|
-
if (
|
|
26
|
-
stop("envs.
|
|
28
|
+
if (length(comparison) != 2) {
|
|
29
|
+
stop("envs.comparison must have exactly two elements or keys, representing the two groups to compare")
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
if (!is.list(comparison)) {
|
|
33
|
+
comparison <- stats::setNames(as.list(comparison), comparison)
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
target <- target %||% names(comparison)[1]
|
|
37
|
+
if (!(target %in% names(comparison))) {
|
|
38
|
+
stop(paste0("Target group '", target, "' not found in the comparison groups."))
|
|
27
39
|
}
|
|
28
40
|
|
|
29
|
-
if (is.character(
|
|
30
|
-
|
|
41
|
+
if (is.character(each_cols) && length(each_cols) == 1) {
|
|
42
|
+
each_cols = trimws(strsplit(each_cols, ",")[[1]])
|
|
31
43
|
}
|
|
32
44
|
|
|
33
45
|
### Helpers
|
|
@@ -140,98 +152,43 @@ for (i in 1:3){
|
|
|
140
152
|
AA_MAPS[[i]] <- create_hashmap(as.character(RF$AA), as.vector(RF[,(i+1),drop=TRUE]))
|
|
141
153
|
}
|
|
142
154
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
# Get the extension (lowercase) of srtobjfile, see if it is .rds file
|
|
149
|
-
srtobjfile_ext = tolower(tools::file_ext(srtobjfile))
|
|
150
|
-
if (srtobjfile_ext != "rds") {
|
|
151
|
-
metadata = read.table(
|
|
152
|
-
srtobjfile,
|
|
153
|
-
sep = "\t",
|
|
154
|
-
header = TRUE,
|
|
155
|
-
row.names = 1,
|
|
156
|
-
stringsAsFactors = FALSE,
|
|
157
|
-
check.names = FALSE,
|
|
158
|
-
)
|
|
159
|
-
} else {
|
|
160
|
-
metadata = readRDS(srtobjfile)@meta.data
|
|
161
|
-
}
|
|
155
|
+
log$info("Loading data from input file")
|
|
156
|
+
mdata <- read_obj(scrfile)@meta.data
|
|
157
|
+
|
|
158
|
+
if (!group_name %in% colnames(mdata)) {
|
|
159
|
+
stop(paste0("Group name '", group_name, "' not found in the data."))
|
|
162
160
|
}
|
|
163
161
|
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
# Merge the data for one sample from immdata and metadata
|
|
170
|
-
out = immdata$data[[sam]] %>%
|
|
171
|
-
mutate(
|
|
172
|
-
Sample = sam,
|
|
173
|
-
locus = "TCRB",
|
|
174
|
-
sequence = CDR3.aa,
|
|
175
|
-
length = nchar(sequence),
|
|
176
|
-
vgene = V.name,
|
|
177
|
-
jgene = J.name,
|
|
178
|
-
) %>%
|
|
179
|
-
select(Sample, Barcode, locus, sequence, length, vgene, jgene) %>%
|
|
180
|
-
separate_longer_delim(Barcode, delim = ";") %>%
|
|
181
|
-
left_join(immdata$meta, by = "Sample")
|
|
182
|
-
|
|
183
|
-
if (is.null(metadata)) {
|
|
184
|
-
# No metadata, just return
|
|
185
|
-
return (out)
|
|
186
|
-
}
|
|
162
|
+
# check if valuess of comparison is in the group_name column
|
|
163
|
+
if (!all(unlist(comparison) %in% as.character(mdata[[group_name]]))) {
|
|
164
|
+
stop(paste0("Some values in comparison are not found in the group_name column: ",
|
|
165
|
+
paste(setdiff(unlist(comparison), mdata[[group_name]]), collapse = ", ")))
|
|
166
|
+
}
|
|
187
167
|
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
sdata = sdata %>% mutate(.prefix_len = nchar(glue("{{envs.prefix}}")))
|
|
194
|
-
# Remove the prefix in the rownames of sdata
|
|
195
|
-
rownames(sdata) = substring(rownames(sdata), sdata$.prefix_len + 1)
|
|
196
|
-
sdata = sdata %>% select(-.prefix_len)
|
|
197
|
-
}
|
|
198
|
-
sdata = rownames_to_column(sdata, "Barcode")
|
|
199
|
-
out = out %>% left_join(sdata, by = "Barcode", suffix = c("", "_seurat"))
|
|
200
|
-
out$.Group = NA_character_
|
|
201
|
-
for (k in names(comparison)) {
|
|
202
|
-
group_mask = out[[group_name]] %in% comparison[[k]]
|
|
203
|
-
if (sum(group_mask) == 0) {
|
|
204
|
-
stop(
|
|
205
|
-
glue("No cells in comparison group {k}. Please check if the group items {comparison[[k]]} exist.")
|
|
206
|
-
)
|
|
168
|
+
# add a new column with the keys of comparison, when their values are in the group_name column
|
|
169
|
+
mdata$.Group <- sapply(as.character(mdata[[group_name]]), function(x) {
|
|
170
|
+
for (key in names(comparison)) {
|
|
171
|
+
if (x %in% comparison[[key]]) {
|
|
172
|
+
return(key)
|
|
207
173
|
}
|
|
208
|
-
out$.Group[out[[group_name]] %in% comparison[[k]]] = k
|
|
209
|
-
}
|
|
210
|
-
if (!is.null(subset_cols)) {
|
|
211
|
-
out = out %>% unite(".Subset", all_of(subset_cols), sep = "_", remove = FALSE)
|
|
212
174
|
}
|
|
213
|
-
return
|
|
214
|
-
}
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
for (sam in immdata$meta$Sample) {
|
|
221
|
-
print(glue("- For sample {sam}"))
|
|
222
|
-
md = merge_data(sam)
|
|
223
|
-
merged = if (is.null(merged)) md else rbind(merged, md)
|
|
224
|
-
}
|
|
175
|
+
return(NA)
|
|
176
|
+
})
|
|
177
|
+
mdata <- mdata %>%
|
|
178
|
+
separate(CTaa, into = c(NA, "sequence"), sep = "_", remove = FALSE) %>%
|
|
179
|
+
separate(CTgene, into = c(NA, "vjgene"), sep = "_", remove = FALSE) %>%
|
|
180
|
+
separate(vjgene, into = c("vgene", NA, "jgene", NA), sep = "\\.", remove = FALSE) %>%
|
|
181
|
+
mutate(length = nchar(sequence))
|
|
225
182
|
|
|
226
183
|
# Statistics about the cell numbers with groups avaiable in metadata
|
|
227
184
|
# !!group_name, TotalCells, AvailCells, AvailCellsPct
|
|
228
|
-
|
|
229
|
-
if (is.null(
|
|
230
|
-
stats =
|
|
185
|
+
log$info("Calculating statistics")
|
|
186
|
+
if (is.null(each_cols)) {
|
|
187
|
+
stats = mdata %>%
|
|
231
188
|
# group by group_name
|
|
232
189
|
group_by(.Group) %>%
|
|
233
190
|
summarise(
|
|
234
|
-
TotalCells = nrow(
|
|
191
|
+
TotalCells = nrow(mdata),
|
|
235
192
|
CellsPerGroup = n(),
|
|
236
193
|
AvailCellsPerGroup = sum(length >= CDR3_MINLEN & length <= CDR3_MAXLEN),
|
|
237
194
|
# Percentage with % in character
|
|
@@ -239,14 +196,15 @@ if (is.null(subset_cols)) {
|
|
|
239
196
|
.groups = "drop"
|
|
240
197
|
)
|
|
241
198
|
} else {
|
|
242
|
-
stats =
|
|
199
|
+
stats = mdata %>%
|
|
200
|
+
unite(".Subset", all_of(each_cols), sep = "_", remove = FALSE) %>%
|
|
243
201
|
group_by(.Subset) %>%
|
|
244
202
|
group_map(function(df, .y) {
|
|
245
203
|
df %>%
|
|
246
204
|
group_by(.Group) %>%
|
|
247
205
|
summarise(
|
|
248
206
|
.Subset = .y$.Subset[1],
|
|
249
|
-
AllCells = nrow(
|
|
207
|
+
AllCells = nrow(mdata),
|
|
250
208
|
TotalCells = nrow(df),
|
|
251
209
|
CellsPerGroup = n(),
|
|
252
210
|
AvailCellsPerGroup = sum(length >= CDR3_MINLEN & length <= CDR3_MAXLEN),
|
|
@@ -259,23 +217,61 @@ if (is.null(subset_cols)) {
|
|
|
259
217
|
}
|
|
260
218
|
|
|
261
219
|
# save the stats
|
|
262
|
-
write.table(
|
|
220
|
+
write.table(
|
|
221
|
+
stats,
|
|
222
|
+
file = file.path(outdir, "stats.txt"),
|
|
223
|
+
sep = "\t",
|
|
224
|
+
quote = FALSE,
|
|
225
|
+
row.names = FALSE,
|
|
226
|
+
)
|
|
263
227
|
|
|
264
|
-
|
|
265
|
-
|
|
228
|
+
reporter$add(
|
|
229
|
+
list(
|
|
230
|
+
kind = "descr",
|
|
231
|
+
content = "Statistics about the cells mapped to the comparison groups. Columns:"
|
|
232
|
+
),
|
|
233
|
+
list(
|
|
234
|
+
kind = "list",
|
|
235
|
+
items = c(
|
|
236
|
+
"_Group: The group name in the comparison, or null, if cells are not mapped to any group",
|
|
237
|
+
"TotalCells: The total number of cells. This number should be the same for all groups",
|
|
238
|
+
"CellsPerGroup: The number of cells in the mapped group",
|
|
239
|
+
paste0(
|
|
240
|
+
"AvailCellsPerGroup: The number of cells with CDR3 length between ",
|
|
241
|
+
CDR3_MINLEN,
|
|
242
|
+
" and ",
|
|
243
|
+
CDR3_MAXLEN,
|
|
244
|
+
" for each group. These cells are used for the analysis"
|
|
245
|
+
),
|
|
246
|
+
"AvailCellsPct: The percentage of AvailCellsPerGroup over CellsPerGroup"
|
|
247
|
+
)
|
|
248
|
+
),
|
|
249
|
+
list(
|
|
250
|
+
kind = "table",
|
|
251
|
+
src = file.path(outdir, "stats.txt")
|
|
252
|
+
),
|
|
253
|
+
h1 = "Available Cells"
|
|
254
|
+
)
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
log$info("Add amino acid features")
|
|
259
|
+
mdata = mdata %>%
|
|
266
260
|
filter(!is.na(.Group) & length >= CDR3_MINLEN & length <= CDR3_MAXLEN) %>%
|
|
267
261
|
add_percentAA() %>%
|
|
268
262
|
add_positionalAA()
|
|
269
263
|
|
|
270
264
|
|
|
271
265
|
do_one_subset = function(s) {
|
|
272
|
-
|
|
266
|
+
if (!is.null(s)) {
|
|
267
|
+
log$info(paste("Processing subset", s))
|
|
268
|
+
}
|
|
273
269
|
if (is.null(s)) {
|
|
274
|
-
data =
|
|
270
|
+
data = mdata
|
|
275
271
|
odir = file.path(outdir, "ALL")
|
|
276
272
|
} else {
|
|
277
|
-
data =
|
|
278
|
-
odir = file.path(outdir, s)
|
|
273
|
+
data = mdata %>% filter(.Subset == s)
|
|
274
|
+
odir = file.path(outdir, slugify(s))
|
|
279
275
|
}
|
|
280
276
|
dir.create(odir, recursive = TRUE, showWarnings = FALSE)
|
|
281
277
|
|
|
@@ -299,6 +295,13 @@ do_one_subset = function(s) {
|
|
|
299
295
|
}
|
|
300
296
|
}
|
|
301
297
|
y = ifelse(data_fit$.Group == target, 1, 0)
|
|
298
|
+
if (any(table(y) <= 3) || length(table(y)) < 2) {
|
|
299
|
+
if (is.null(s)) {
|
|
300
|
+
log$warn(paste0("Not enough observations for target group '", target, "' with CDR3 length ", len, ". At least 4 observations are required."))
|
|
301
|
+
} else {
|
|
302
|
+
log$warn(paste0("Not enough observations for target group '", target, "' in subset '", s, "' with CDR3 length ", len, ". At least 4 observations are required."))
|
|
303
|
+
}
|
|
304
|
+
}
|
|
302
305
|
# one multinomial or binomial class has 1 or 0 observations; not allowed
|
|
303
306
|
if (any(table(y) <= 1)) { next }
|
|
304
307
|
fit = glmnet(x, y, data=data_fit, alpha=0, lambda=0.01, family="binomial")
|
|
@@ -327,56 +330,121 @@ do_one_subset = function(s) {
|
|
|
327
330
|
write.table(alldf, file = file.path(odir, "estimates.txt"), sep = "\t", quote = FALSE, row.names = FALSE)
|
|
328
331
|
|
|
329
332
|
# save the plots
|
|
330
|
-
gr
|
|
331
|
-
group_by(imgt_pos, feature)
|
|
333
|
+
gr <- alldf %>%
|
|
334
|
+
group_by(imgt_pos, feature) %>%
|
|
332
335
|
summarise(coef = mean(estimate))
|
|
333
336
|
# Avoid too large values
|
|
334
|
-
gr$coef[gr$coef > 1.5]
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
g
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
337
|
+
gr$coef[gr$coef > 1.5] <- 1.5
|
|
338
|
+
gr$coef <- exp(gr$coef) # Exponentiate the coefficients
|
|
339
|
+
|
|
340
|
+
g <- LinePlot(gr, x = "imgt_pos", y = "coef", group_by = "feature",
|
|
341
|
+
add_line = 1, x_text_angle = 90, xlab = "TCR position",
|
|
342
|
+
ylab = paste("Coefficient for", target, "prediction"), title = s)
|
|
343
|
+
|
|
344
|
+
save_plot(g, file.path(odir, "estimated_coefficients"),
|
|
345
|
+
devpars = list(width = 1000, height = 1000, res = 100),
|
|
346
|
+
formats = c("png", "pdf"))
|
|
347
|
+
|
|
348
|
+
reporter$add(
|
|
349
|
+
list(
|
|
350
|
+
kind = "descr",
|
|
351
|
+
content = "Estimated coefficients for each feature and position in the CDR3"
|
|
352
|
+
),
|
|
353
|
+
h1 = ifelse(
|
|
354
|
+
is.null(s),
|
|
355
|
+
"Estimated OR (per s.d.)",
|
|
356
|
+
paste0(paste(each_cols, collapse = ", "), " - ", s)
|
|
357
|
+
),
|
|
358
|
+
h2 = ifelse(
|
|
359
|
+
is.null(s),
|
|
360
|
+
"#",
|
|
361
|
+
"Estimated OR (per s.d.)"
|
|
362
|
+
)
|
|
363
|
+
)
|
|
364
|
+
|
|
365
|
+
reporter$add(
|
|
366
|
+
list(
|
|
367
|
+
name = "Plot",
|
|
368
|
+
contents = list(
|
|
369
|
+
list(
|
|
370
|
+
kind = "image",
|
|
371
|
+
src = file.path(odir, "estimated_coefficients.png"),
|
|
372
|
+
download = file.path(odir, "estimated_coefficients.pdf")
|
|
373
|
+
)
|
|
374
|
+
)
|
|
375
|
+
),
|
|
376
|
+
list(
|
|
377
|
+
name = "Estimates",
|
|
378
|
+
contents = list(
|
|
379
|
+
list(
|
|
380
|
+
kind = "table",
|
|
381
|
+
src = file.path(odir, "estimates.txt")
|
|
382
|
+
)
|
|
383
|
+
)
|
|
384
|
+
),
|
|
385
|
+
h1 = ifelse(
|
|
386
|
+
is.null(s),
|
|
387
|
+
"Estimated OR (per s.d.)",
|
|
388
|
+
paste0(paste(each_cols, collapse = ", "), " - ", s)
|
|
389
|
+
),
|
|
390
|
+
h2 = ifelse(
|
|
391
|
+
is.null(s),
|
|
392
|
+
"#",
|
|
393
|
+
"Estimated OR (per s.d.)"
|
|
394
|
+
),
|
|
395
|
+
ui = "tabs"
|
|
396
|
+
)
|
|
344
397
|
|
|
345
398
|
# distributions
|
|
346
399
|
data$mid_hydro = sapply(data$midseq, function(x) get_feat_score(x, AA_MAPS[[2]]))
|
|
347
400
|
data$smid_hydro = scale(data$mid_hydro)[,1]
|
|
348
401
|
|
|
349
|
-
g
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
402
|
+
g <- RidgePlot(
|
|
403
|
+
data = data,
|
|
404
|
+
x = "smid_hydro",
|
|
405
|
+
group_by = ".Group",
|
|
406
|
+
xlab = "CDR3bmr hydrophobicity",
|
|
407
|
+
ylab = "",
|
|
408
|
+
add_vline = TRUE,
|
|
409
|
+
alpha = 0.5,
|
|
410
|
+
title = s,
|
|
411
|
+
flip = TRUE
|
|
412
|
+
)
|
|
413
|
+
|
|
414
|
+
save_plot(g, file.path(odir, "distribution"),
|
|
415
|
+
devpars = list(width = 1000, height = 1000, res = 100),
|
|
416
|
+
formats = c("png", "pdf"))
|
|
417
|
+
|
|
418
|
+
reporter$add(
|
|
419
|
+
list(
|
|
420
|
+
kind = "table_image",
|
|
421
|
+
descr = paste0(
|
|
422
|
+
"Hydrophobicity values are averaged over the CDR3 for each TCR and ",
|
|
423
|
+
"then scaled to have a mean of 0 and a variance of 1. ",
|
|
424
|
+
"Horizontal lines depict the mean for each population"
|
|
425
|
+
),
|
|
426
|
+
src = file.path(odir, "distribution.png"),
|
|
427
|
+
download = file.path(odir, "distribution.pdf")
|
|
428
|
+
),
|
|
429
|
+
h1 = ifelse(
|
|
430
|
+
is.null(s),
|
|
431
|
+
"Hydrophobicity Distribution",
|
|
432
|
+
paste0(paste(each_cols, collapse = ", "), " - ", s)
|
|
433
|
+
),
|
|
434
|
+
h2 = ifelse(
|
|
435
|
+
is.null(s),
|
|
436
|
+
"#",
|
|
437
|
+
"Hydrophobicity Distribution"
|
|
361
438
|
)
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
aes(x=data$smid_hydro, y=data$.Group, color=data$.Group, fill=data$.Group),
|
|
365
|
-
bandwidth=0.5,
|
|
366
|
-
alpha=0.4,
|
|
367
|
-
show.legend = FALSE
|
|
368
|
-
) + scale_color_manual(values=cols)
|
|
369
|
-
g = g + scale_fill_manual(values=cols) + theme_bw(base_size=12)
|
|
370
|
-
g = g + xlim(c(-4,4)) + xlab("CDR3bmr hydrophobicity") + ylab("") + coord_flip() + ggtitle(s)
|
|
371
|
-
|
|
372
|
-
png(file.path(odir, "distribution.png"), width=1000, height=1000, res=100)
|
|
373
|
-
print(g)
|
|
374
|
-
dev.off()
|
|
439
|
+
)
|
|
440
|
+
|
|
375
441
|
}
|
|
376
442
|
|
|
377
|
-
if (is.null(
|
|
443
|
+
if (is.null(each_cols)) {
|
|
378
444
|
do_one_subset(NULL)
|
|
379
445
|
} else {
|
|
380
|
-
subsets = na.omit(unique(
|
|
446
|
+
subsets = na.omit(unique(obj$.Subset))
|
|
381
447
|
sapply(subsets, do_one_subset)
|
|
382
448
|
}
|
|
449
|
+
|
|
450
|
+
reporter$save(joboutdir)
|