biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290)
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +28 -0
  3. biopipen/core/filters.py +79 -4
  4. biopipen/core/proc.py +12 -3
  5. biopipen/core/testing.py +75 -3
  6. biopipen/ns/bam.py +148 -6
  7. biopipen/ns/bed.py +75 -0
  8. biopipen/ns/cellranger.py +186 -0
  9. biopipen/ns/cellranger_pipeline.py +126 -0
  10. biopipen/ns/cnv.py +19 -3
  11. biopipen/ns/cnvkit.py +1 -1
  12. biopipen/ns/cnvkit_pipeline.py +20 -12
  13. biopipen/ns/delim.py +34 -35
  14. biopipen/ns/gene.py +68 -23
  15. biopipen/ns/gsea.py +63 -37
  16. biopipen/ns/misc.py +39 -14
  17. biopipen/ns/plot.py +304 -1
  18. biopipen/ns/protein.py +183 -0
  19. biopipen/ns/regulatory.py +290 -0
  20. biopipen/ns/rnaseq.py +142 -5
  21. biopipen/ns/scrna.py +2053 -473
  22. biopipen/ns/scrna_metabolic_landscape.py +228 -382
  23. biopipen/ns/snp.py +659 -0
  24. biopipen/ns/stats.py +484 -0
  25. biopipen/ns/tcr.py +683 -98
  26. biopipen/ns/vcf.py +236 -2
  27. biopipen/ns/web.py +97 -6
  28. biopipen/reports/bam/CNVpytor.svelte +4 -9
  29. biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
  30. biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
  31. biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
  32. biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
  33. biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
  34. biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
  35. biopipen/reports/common.svelte +15 -0
  36. biopipen/reports/protein/ProdigySummary.svelte +16 -0
  37. biopipen/reports/scrna/CellsDistribution.svelte +4 -39
  38. biopipen/reports/scrna/DimPlots.svelte +1 -1
  39. biopipen/reports/scrna/MarkersFinder.svelte +6 -126
  40. biopipen/reports/scrna/MetaMarkers.svelte +3 -75
  41. biopipen/reports/scrna/RadarPlots.svelte +4 -20
  42. biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
  43. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
  44. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
  45. biopipen/reports/snp/PlinkCallRate.svelte +24 -0
  46. biopipen/reports/snp/PlinkFreq.svelte +18 -0
  47. biopipen/reports/snp/PlinkHWE.svelte +18 -0
  48. biopipen/reports/snp/PlinkHet.svelte +18 -0
  49. biopipen/reports/snp/PlinkIBD.svelte +18 -0
  50. biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
  51. biopipen/reports/tcr/ClonalStats.svelte +16 -0
  52. biopipen/reports/tcr/CloneResidency.svelte +3 -93
  53. biopipen/reports/tcr/Immunarch.svelte +4 -155
  54. biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
  55. biopipen/reports/tcr/TESSA.svelte +11 -28
  56. biopipen/reports/utils/misc.liq +22 -7
  57. biopipen/scripts/bam/BamMerge.py +11 -15
  58. biopipen/scripts/bam/BamSampling.py +90 -0
  59. biopipen/scripts/bam/BamSort.py +141 -0
  60. biopipen/scripts/bam/BamSplitChroms.py +10 -10
  61. biopipen/scripts/bam/BamSubsetByBed.py +38 -0
  62. biopipen/scripts/bam/CNAClinic.R +41 -5
  63. biopipen/scripts/bam/CNVpytor.py +153 -54
  64. biopipen/scripts/bam/ControlFREEC.py +13 -14
  65. biopipen/scripts/bam/SamtoolsView.py +33 -0
  66. biopipen/scripts/bed/Bed2Vcf.py +5 -5
  67. biopipen/scripts/bed/BedConsensus.py +5 -5
  68. biopipen/scripts/bed/BedLiftOver.sh +6 -4
  69. biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
  70. biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
  71. biopipen/scripts/bed/BedtoolsMerge.py +4 -4
  72. biopipen/scripts/cellranger/CellRangerCount.py +138 -0
  73. biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
  74. biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
  75. biopipen/scripts/cnv/AneuploidyScore.R +55 -20
  76. biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
  77. biopipen/scripts/cnv/TMADScore.R +25 -9
  78. biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
  79. biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
  80. biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
  81. biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
  82. biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
  83. biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
  84. biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
  85. biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
  86. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
  87. biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
  88. biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
  89. biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
  90. biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
  91. biopipen/scripts/cnvkit/guess_baits.py +166 -93
  92. biopipen/scripts/delim/RowsBinder.R +1 -1
  93. biopipen/scripts/delim/SampleInfo.R +116 -118
  94. biopipen/scripts/gene/GeneNameConversion.R +67 -0
  95. biopipen/scripts/gene/GenePromoters.R +61 -0
  96. biopipen/scripts/gsea/Enrichr.R +5 -5
  97. biopipen/scripts/gsea/FGSEA.R +184 -50
  98. biopipen/scripts/gsea/GSEA.R +2 -2
  99. biopipen/scripts/gsea/PreRank.R +5 -5
  100. biopipen/scripts/misc/Config2File.py +2 -2
  101. biopipen/scripts/misc/Plot.R +80 -0
  102. biopipen/scripts/misc/Shell.sh +15 -0
  103. biopipen/scripts/misc/Str2File.py +2 -2
  104. biopipen/scripts/plot/Heatmap.R +3 -3
  105. biopipen/scripts/plot/Manhattan.R +147 -0
  106. biopipen/scripts/plot/QQPlot.R +146 -0
  107. biopipen/scripts/plot/ROC.R +88 -0
  108. biopipen/scripts/plot/Scatter.R +112 -0
  109. biopipen/scripts/plot/VennDiagram.R +5 -9
  110. biopipen/scripts/protein/MMCIF2PDB.py +33 -0
  111. biopipen/scripts/protein/PDB2Fasta.py +60 -0
  112. biopipen/scripts/protein/Prodigy.py +119 -0
  113. biopipen/scripts/protein/ProdigySummary.R +140 -0
  114. biopipen/scripts/protein/RMSD.py +178 -0
  115. biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
  116. biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
  117. biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
  118. biopipen/scripts/regulatory/MotifScan.py +159 -0
  119. biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
  120. biopipen/scripts/regulatory/motifs-common.R +324 -0
  121. biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
  122. biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
  123. biopipen/scripts/rnaseq/Simulation.R +21 -0
  124. biopipen/scripts/rnaseq/UnitConversion.R +325 -54
  125. biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
  126. biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
  127. biopipen/scripts/scrna/CellCellCommunication.py +150 -0
  128. biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
  129. biopipen/scripts/scrna/CellSNPLite.py +30 -0
  130. biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
  131. biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
  132. biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
  133. biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
  134. biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
  135. biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
  136. biopipen/scripts/scrna/CellsDistribution.R +456 -167
  137. biopipen/scripts/scrna/DimPlots.R +1 -1
  138. biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
  139. biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
  140. biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
  141. biopipen/scripts/scrna/ExprImputation.R +7 -0
  142. biopipen/scripts/scrna/LoomTo10X.R +51 -0
  143. biopipen/scripts/scrna/MQuad.py +25 -0
  144. biopipen/scripts/scrna/MarkersFinder.R +679 -400
  145. biopipen/scripts/scrna/MetaMarkers.R +265 -161
  146. biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
  147. biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
  148. biopipen/scripts/scrna/RadarPlots.R +355 -134
  149. biopipen/scripts/scrna/ScFGSEA.R +298 -100
  150. biopipen/scripts/scrna/ScSimulation.R +65 -0
  151. biopipen/scripts/scrna/ScVelo.py +617 -0
  152. biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
  153. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
  154. biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
  155. biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
  156. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
  157. biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
  158. biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
  159. biopipen/scripts/scrna/SeuratClustering.R +36 -233
  160. biopipen/scripts/scrna/SeuratLoading.R +2 -2
  161. biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
  162. biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
  163. biopipen/scripts/scrna/SeuratPreparing.R +223 -173
  164. biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
  165. biopipen/scripts/scrna/SeuratTo10X.R +27 -0
  166. biopipen/scripts/scrna/Slingshot.R +65 -0
  167. biopipen/scripts/scrna/Subset10X.R +2 -2
  168. biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
  169. biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
  170. biopipen/scripts/scrna/scvelo_paga.py +313 -0
  171. biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
  172. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
  173. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
  174. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
  175. biopipen/scripts/snp/MatrixEQTL.R +217 -0
  176. biopipen/scripts/snp/Plink2GTMat.py +148 -0
  177. biopipen/scripts/snp/PlinkCallRate.R +199 -0
  178. biopipen/scripts/snp/PlinkFilter.py +100 -0
  179. biopipen/scripts/snp/PlinkFreq.R +291 -0
  180. biopipen/scripts/snp/PlinkFromVcf.py +81 -0
  181. biopipen/scripts/snp/PlinkHWE.R +85 -0
  182. biopipen/scripts/snp/PlinkHet.R +96 -0
  183. biopipen/scripts/snp/PlinkIBD.R +196 -0
  184. biopipen/scripts/snp/PlinkSimulation.py +124 -0
  185. biopipen/scripts/snp/PlinkUpdateName.py +124 -0
  186. biopipen/scripts/stats/ChowTest.R +146 -0
  187. biopipen/scripts/stats/DiffCoexpr.R +152 -0
  188. biopipen/scripts/stats/LiquidAssoc.R +135 -0
  189. biopipen/scripts/stats/Mediation.R +108 -0
  190. biopipen/scripts/stats/MetaPvalue.R +130 -0
  191. biopipen/scripts/stats/MetaPvalue1.R +74 -0
  192. biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
  193. biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
  194. biopipen/scripts/tcr/Attach2Seurat.R +3 -2
  195. biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
  196. biopipen/scripts/tcr/CDR3Clustering.R +343 -0
  197. biopipen/scripts/tcr/ClonalStats.R +526 -0
  198. biopipen/scripts/tcr/CloneResidency.R +255 -131
  199. biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
  200. biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
  201. biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
  202. biopipen/scripts/tcr/GIANA/query.py +164 -162
  203. biopipen/scripts/tcr/Immunarch-basic.R +31 -9
  204. biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
  205. biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
  206. biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
  207. biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
  208. biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
  209. biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
  210. biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
  211. biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
  212. biopipen/scripts/tcr/Immunarch.R +63 -11
  213. biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
  214. biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
  215. biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
  216. biopipen/scripts/tcr/SampleDiversity.R +1 -1
  217. biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
  218. biopipen/scripts/tcr/ScRepLoading.R +166 -0
  219. biopipen/scripts/tcr/TCRClusterStats.R +176 -22
  220. biopipen/scripts/tcr/TCRDock.py +110 -0
  221. biopipen/scripts/tcr/TESSA.R +102 -118
  222. biopipen/scripts/tcr/VJUsage.R +5 -5
  223. biopipen/scripts/tcr/immunarch-patched.R +142 -0
  224. biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
  225. biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
  226. biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
  227. biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
  228. biopipen/scripts/vcf/BcftoolsSort.py +113 -0
  229. biopipen/scripts/vcf/BcftoolsView.py +73 -0
  230. biopipen/scripts/vcf/TruvariBench.sh +14 -7
  231. biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
  232. biopipen/scripts/vcf/TruvariConsistency.R +1 -1
  233. biopipen/scripts/vcf/Vcf2Bed.py +2 -2
  234. biopipen/scripts/vcf/VcfAnno.py +11 -11
  235. biopipen/scripts/vcf/VcfDownSample.sh +22 -10
  236. biopipen/scripts/vcf/VcfFilter.py +5 -5
  237. biopipen/scripts/vcf/VcfFix.py +7 -7
  238. biopipen/scripts/vcf/VcfFix_utils.py +13 -4
  239. biopipen/scripts/vcf/VcfIndex.py +3 -3
  240. biopipen/scripts/vcf/VcfIntersect.py +3 -3
  241. biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
  242. biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
  243. biopipen/scripts/vcf/bcftools_utils.py +52 -0
  244. biopipen/scripts/web/Download.py +8 -4
  245. biopipen/scripts/web/DownloadList.py +5 -5
  246. biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
  247. biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
  248. biopipen/scripts/web/gcloud_common.py +49 -0
  249. biopipen/utils/gene.py +108 -60
  250. biopipen/utils/misc.py +146 -20
  251. biopipen/utils/reference.py +64 -20
  252. biopipen/utils/reporter.py +177 -0
  253. biopipen/utils/vcf.py +1 -1
  254. biopipen-0.34.26.dist-info/METADATA +27 -0
  255. biopipen-0.34.26.dist-info/RECORD +292 -0
  256. {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
  257. {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
  258. biopipen/ns/bcftools.py +0 -111
  259. biopipen/ns/scrna_basic.py +0 -255
  260. biopipen/reports/delim/SampleInfo.svelte +0 -36
  261. biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
  262. biopipen/reports/scrna/ScFGSEA.svelte +0 -35
  263. biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
  264. biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
  265. biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
  266. biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
  267. biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
  268. biopipen/reports/utils/gsea.liq +0 -110
  269. biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
  270. biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
  271. biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
  272. biopipen/scripts/gene/GeneNameConversion.py +0 -66
  273. biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
  274. biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
  275. biopipen/scripts/scrna/ExprImpution.R +0 -7
  276. biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
  277. biopipen/scripts/scrna/Write10X.R +0 -11
  278. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
  279. biopipen/scripts/tcr/TCRClustering.R +0 -280
  280. biopipen/utils/common_docstrs.py +0 -61
  281. biopipen/utils/gene.R +0 -49
  282. biopipen/utils/gsea.R +0 -193
  283. biopipen/utils/io.R +0 -20
  284. biopipen/utils/misc.R +0 -114
  285. biopipen/utils/mutate_helpers.R +0 -433
  286. biopipen/utils/plot.R +0 -173
  287. biopipen/utils/rnaseq.R +0 -48
  288. biopipen/utils/single_cell.R +0 -115
  289. biopipen-0.21.0.dist-info/METADATA +0 -22
  290. biopipen-0.21.0.dist-info/RECORD +0 -218
@@ -1,33 +1,45 @@
1
+ library(rlang)
1
2
  library(dplyr)
2
3
  library(tidyr)
3
4
  library(tibble)
4
- library(ggplot2)
5
- library(ggridges)
6
5
  library(glue)
7
6
  library(hash)
8
7
  library(glmnet)
9
8
  library(broom.mixed)
10
9
  library(stringr)
10
+ library(plotthis)
11
+ library(biopipen.utils)
11
12
 
12
- immdatafile = {{in.immdata | quote}}
13
- srtobjfile = {{in.srtobj | r}}
14
- outdir = {{out.outdir | quote}}
15
- group_name = {{envs.group | r}}
16
- comparison = {{envs.comparison | r}}
17
- prefix = {{envs.prefix | r}}
18
- target = {{envs.target | r}}
19
- subset_cols = {{envs.subset | r}}
13
+ scrfile <- {{in.scrfile | r}}
14
+ outdir <- {{out.outdir | r}}
15
+ joboutdir <- {{job.outdir | r}}
16
+ group_name <- {{envs.group | r}}
17
+ comparison <- {{envs.comparison | r}}
18
+ target <- {{envs.target | r}}
19
+ each_cols <- {{envs.each | r}}
20
+
21
+ log <- get_logger()
22
+ reporter <- get_reporter()
20
23
 
21
24
  if (is.null(group_name) || is.null(comparison)) {
22
25
  stop("envs.group and envs.comparison must be specified")
23
26
  }
24
27
 
25
- if (is.null(target)) {
26
- stop("envs.target must be specified, which should be one of the keys in `envs.comparison`")
28
+ if (length(comparison) != 2) {
29
+ stop("envs.comparison must have exactly two elements or keys, representing the two groups to compare")
30
+ }
31
+
32
+ if (!is.list(comparison)) {
33
+ comparison <- stats::setNames(as.list(comparison), comparison)
34
+ }
35
+
36
+ target <- target %||% names(comparison)[1]
37
+ if (!(target %in% names(comparison))) {
38
+ stop(paste0("Target group '", target, "' not found in the comparison groups."))
27
39
  }
28
40
 
29
- if (is.character(subset_cols) && length(subset_cols) == 1) {
30
- subset_cols = trimws(strsplit(subset_cols, ",")[[1]])
41
+ if (is.character(each_cols) && length(each_cols) == 1) {
42
+ each_cols = trimws(strsplit(each_cols, ",")[[1]])
31
43
  }
32
44
 
33
45
  ### Helpers
@@ -140,98 +152,43 @@ for (i in 1:3){
140
152
  AA_MAPS[[i]] <- create_hashmap(as.character(RF$AA), as.vector(RF[,(i+1),drop=TRUE]))
141
153
  }
142
154
 
143
- # Loading metadata from srtobjfile
144
- print("Loading metadata from srtobjfile")
145
- if (is.null(srtobjfile)) {
146
- metadata = NULL
147
- } else {
148
- # Get the extension (lowercase) of srtobjfile, see if it is .rds file
149
- srtobjfile_ext = tolower(tools::file_ext(srtobjfile))
150
- if (srtobjfile_ext != "rds") {
151
- metadata = read.table(
152
- srtobjfile,
153
- sep = "\t",
154
- header = TRUE,
155
- row.names = 1,
156
- stringsAsFactors = FALSE,
157
- check.names = FALSE,
158
- )
159
- } else {
160
- metadata = readRDS(srtobjfile)@meta.data
161
- }
155
+ log$info("Loading data from input file")
156
+ mdata <- read_obj(scrfile)@meta.data
157
+
158
+ if (!group_name %in% colnames(mdata)) {
159
+ stop(paste0("Group name '", group_name, "' not found in the data."))
162
160
  }
163
161
 
164
- print("Loading immdata from immdatafile")
165
- immdata = readRDS(immdatafile)
166
-
167
-
168
- merge_data = function(sam) {
169
- # Merge the data for one sample from immdata and metadata
170
- out = immdata$data[[sam]] %>%
171
- mutate(
172
- Sample = sam,
173
- locus = "TCRB",
174
- sequence = CDR3.aa,
175
- length = nchar(sequence),
176
- vgene = V.name,
177
- jgene = J.name,
178
- ) %>%
179
- select(Sample, Barcode, locus, sequence, length, vgene, jgene) %>%
180
- separate_longer_delim(Barcode, delim = ";") %>%
181
- left_join(immdata$meta, by = "Sample")
182
-
183
- if (is.null(metadata)) {
184
- # No metadata, just return
185
- return (out)
186
- }
162
+ # check if valuess of comparison is in the group_name column
163
+ if (!all(unlist(comparison) %in% as.character(mdata[[group_name]]))) {
164
+ stop(paste0("Some values in comparison are not found in the group_name column: ",
165
+ paste(setdiff(unlist(comparison), mdata[[group_name]]), collapse = ", ")))
166
+ }
187
167
 
188
- # Merge with metadata
189
- sdata = metadata %>% filter(Sample == sam)
190
- if (!is.null(prefix) && nchar(prefix) > 0) {
191
- # Replace the placeholder like {Sample} with the data in other columns
192
- # in the same row
193
- sdata = sdata %>% mutate(.prefix_len = nchar(glue("{{envs.prefix}}")))
194
- # Remove the prefix in the rownames of sdata
195
- rownames(sdata) = substring(rownames(sdata), sdata$.prefix_len + 1)
196
- sdata = sdata %>% select(-.prefix_len)
197
- }
198
- sdata = rownames_to_column(sdata, "Barcode")
199
- out = out %>% left_join(sdata, by = "Barcode", suffix = c("", "_seurat"))
200
- out$.Group = NA_character_
201
- for (k in names(comparison)) {
202
- group_mask = out[[group_name]] %in% comparison[[k]]
203
- if (sum(group_mask) == 0) {
204
- stop(
205
- glue("No cells in comparison group {k}. Please check if the group items {comparison[[k]]} exist.")
206
- )
168
+ # add a new column with the keys of comparison, when their values are in the group_name column
169
+ mdata$.Group <- sapply(as.character(mdata[[group_name]]), function(x) {
170
+ for (key in names(comparison)) {
171
+ if (x %in% comparison[[key]]) {
172
+ return(key)
207
173
  }
208
- out$.Group[out[[group_name]] %in% comparison[[k]]] = k
209
- }
210
- if (!is.null(subset_cols)) {
211
- out = out %>% unite(".Subset", all_of(subset_cols), sep = "_", remove = FALSE)
212
174
  }
213
- return (out)
214
- }
215
-
216
- # Expanded and merged with metadata
217
- # Now we are able to select the cells using group and comparison
218
- print("Merging data with metadata for each sample")
219
- merged = NULL
220
- for (sam in immdata$meta$Sample) {
221
- print(glue("- For sample {sam}"))
222
- md = merge_data(sam)
223
- merged = if (is.null(merged)) md else rbind(merged, md)
224
- }
175
+ return(NA)
176
+ })
177
+ mdata <- mdata %>%
178
+ separate(CTaa, into = c(NA, "sequence"), sep = "_", remove = FALSE) %>%
179
+ separate(CTgene, into = c(NA, "vjgene"), sep = "_", remove = FALSE) %>%
180
+ separate(vjgene, into = c("vgene", NA, "jgene", NA), sep = "\\.", remove = FALSE) %>%
181
+ mutate(length = nchar(sequence))
225
182
 
226
183
  # Statistics about the cell numbers with groups avaiable in metadata
227
184
  # !!group_name, TotalCells, AvailCells, AvailCellsPct
228
- print("Calculating statistics")
229
- if (is.null(subset_cols)) {
230
- stats = merged %>%
185
+ log$info("Calculating statistics")
186
+ if (is.null(each_cols)) {
187
+ stats = mdata %>%
231
188
  # group by group_name
232
189
  group_by(.Group) %>%
233
190
  summarise(
234
- TotalCells = nrow(merged),
191
+ TotalCells = nrow(mdata),
235
192
  CellsPerGroup = n(),
236
193
  AvailCellsPerGroup = sum(length >= CDR3_MINLEN & length <= CDR3_MAXLEN),
237
194
  # Percentage with % in character
@@ -239,14 +196,15 @@ if (is.null(subset_cols)) {
239
196
  .groups = "drop"
240
197
  )
241
198
  } else {
242
- stats = merged %>%
199
+ stats = mdata %>%
200
+ unite(".Subset", all_of(each_cols), sep = "_", remove = FALSE) %>%
243
201
  group_by(.Subset) %>%
244
202
  group_map(function(df, .y) {
245
203
  df %>%
246
204
  group_by(.Group) %>%
247
205
  summarise(
248
206
  .Subset = .y$.Subset[1],
249
- AllCells = nrow(merged),
207
+ AllCells = nrow(mdata),
250
208
  TotalCells = nrow(df),
251
209
  CellsPerGroup = n(),
252
210
  AvailCellsPerGroup = sum(length >= CDR3_MINLEN & length <= CDR3_MAXLEN),
@@ -259,23 +217,61 @@ if (is.null(subset_cols)) {
259
217
  }
260
218
 
261
219
  # save the stats
262
- write.table(stats, file = file.path(outdir, "stats.txt"), sep = "\t", quote = FALSE, row.names = FALSE)
220
+ write.table(
221
+ stats,
222
+ file = file.path(outdir, "stats.txt"),
223
+ sep = "\t",
224
+ quote = FALSE,
225
+ row.names = FALSE,
226
+ )
263
227
 
264
- print("Add amino acid features")
265
- merged = merged %>%
228
+ reporter$add(
229
+ list(
230
+ kind = "descr",
231
+ content = "Statistics about the cells mapped to the comparison groups. Columns:"
232
+ ),
233
+ list(
234
+ kind = "list",
235
+ items = c(
236
+ "_Group: The group name in the comparison, or null, if cells are not mapped to any group",
237
+ "TotalCells: The total number of cells. This number should be the same for all groups",
238
+ "CellsPerGroup: The number of cells in the mapped group",
239
+ paste0(
240
+ "AvailCellsPerGroup: The number of cells with CDR3 length between ",
241
+ CDR3_MINLEN,
242
+ " and ",
243
+ CDR3_MAXLEN,
244
+ " for each group. These cells are used for the analysis"
245
+ ),
246
+ "AvailCellsPct: The percentage of AvailCellsPerGroup over CellsPerGroup"
247
+ )
248
+ ),
249
+ list(
250
+ kind = "table",
251
+ src = file.path(outdir, "stats.txt")
252
+ ),
253
+ h1 = "Available Cells"
254
+ )
255
+
256
+
257
+
258
+ log$info("Add amino acid features")
259
+ mdata = mdata %>%
266
260
  filter(!is.na(.Group) & length >= CDR3_MINLEN & length <= CDR3_MAXLEN) %>%
267
261
  add_percentAA() %>%
268
262
  add_positionalAA()
269
263
 
270
264
 
271
265
  do_one_subset = function(s) {
272
- print(paste("Processing subset", s))
266
+ if (!is.null(s)) {
267
+ log$info(paste("Processing subset", s))
268
+ }
273
269
  if (is.null(s)) {
274
- data = merged
270
+ data = mdata
275
271
  odir = file.path(outdir, "ALL")
276
272
  } else {
277
- data = merged %>% filter(.Subset == s)
278
- odir = file.path(outdir, s)
273
+ data = mdata %>% filter(.Subset == s)
274
+ odir = file.path(outdir, slugify(s))
279
275
  }
280
276
  dir.create(odir, recursive = TRUE, showWarnings = FALSE)
281
277
 
@@ -299,6 +295,13 @@ do_one_subset = function(s) {
299
295
  }
300
296
  }
301
297
  y = ifelse(data_fit$.Group == target, 1, 0)
298
+ if (any(table(y) <= 3) || length(table(y)) < 2) {
299
+ if (is.null(s)) {
300
+ log$warn(paste0("Not enough observations for target group '", target, "' with CDR3 length ", len, ". At least 4 observations are required."))
301
+ } else {
302
+ log$warn(paste0("Not enough observations for target group '", target, "' in subset '", s, "' with CDR3 length ", len, ". At least 4 observations are required."))
303
+ }
304
+ }
302
305
  # one multinomial or binomial class has 1 or 0 observations; not allowed
303
306
  if (any(table(y) <= 1)) { next }
304
307
  fit = glmnet(x, y, data=data_fit, alpha=0, lambda=0.01, family="binomial")
@@ -327,56 +330,121 @@ do_one_subset = function(s) {
327
330
  write.table(alldf, file = file.path(odir, "estimates.txt"), sep = "\t", quote = FALSE, row.names = FALSE)
328
331
 
329
332
  # save the plots
330
- gr = alldf %>%
331
- group_by(imgt_pos, feature) |>
333
+ gr <- alldf %>%
334
+ group_by(imgt_pos, feature) %>%
332
335
  summarise(coef = mean(estimate))
333
336
  # Avoid too large values
334
- gr$coef[gr$coef > 1.5] = 1.5
335
-
336
- g = ggplot(gr, aes(imgt_pos, exp(coef), color=feature))
337
- g = g + geom_point() + geom_line(aes(group=feature)) + theme_classic() + geom_hline(yintercept=1)
338
- g = g + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + scale_color_manual(values=c("#eead0c", "#ed6a51", "#02868a"))
339
- g = g + xlab("TCR position") + ylab(paste("Coefficient for", target, "prediction")) + ggtitle(s)
340
-
341
- png(file.path(odir, "estimated_coefficients.png"), width=1000, height=1000, res=100)
342
- print(g)
343
- dev.off()
337
+ gr$coef[gr$coef > 1.5] <- 1.5
338
+ gr$coef <- exp(gr$coef) # Exponentiate the coefficients
339
+
340
+ g <- LinePlot(gr, x = "imgt_pos", y = "coef", group_by = "feature",
341
+ add_line = 1, x_text_angle = 90, xlab = "TCR position",
342
+ ylab = paste("Coefficient for", target, "prediction"), title = s)
343
+
344
+ save_plot(g, file.path(odir, "estimated_coefficients"),
345
+ devpars = list(width = 1000, height = 1000, res = 100),
346
+ formats = c("png", "pdf"))
347
+
348
+ reporter$add(
349
+ list(
350
+ kind = "descr",
351
+ content = "Estimated coefficients for each feature and position in the CDR3"
352
+ ),
353
+ h1 = ifelse(
354
+ is.null(s),
355
+ "Estimated OR (per s.d.)",
356
+ paste0(paste(each_cols, collapse = ", "), " - ", s)
357
+ ),
358
+ h2 = ifelse(
359
+ is.null(s),
360
+ "#",
361
+ "Estimated OR (per s.d.)"
362
+ )
363
+ )
364
+
365
+ reporter$add(
366
+ list(
367
+ name = "Plot",
368
+ contents = list(
369
+ list(
370
+ kind = "image",
371
+ src = file.path(odir, "estimated_coefficients.png"),
372
+ download = file.path(odir, "estimated_coefficients.pdf")
373
+ )
374
+ )
375
+ ),
376
+ list(
377
+ name = "Estimates",
378
+ contents = list(
379
+ list(
380
+ kind = "table",
381
+ src = file.path(odir, "estimates.txt")
382
+ )
383
+ )
384
+ ),
385
+ h1 = ifelse(
386
+ is.null(s),
387
+ "Estimated OR (per s.d.)",
388
+ paste0(paste(each_cols, collapse = ", "), " - ", s)
389
+ ),
390
+ h2 = ifelse(
391
+ is.null(s),
392
+ "#",
393
+ "Estimated OR (per s.d.)"
394
+ ),
395
+ ui = "tabs"
396
+ )
344
397
 
345
398
  # distributions
346
399
  data$mid_hydro = sapply(data$midseq, function(x) get_feat_score(x, AA_MAPS[[2]]))
347
400
  data$smid_hydro = scale(data$mid_hydro)[,1]
348
401
 
349
- g = ggplot()
350
- # Give colors for different groups
351
- cols = c("turquoise3", "darkmagenta", "darkorange", "darkgreen", "darkblue", "darkred")
352
- groups = unique(data$.Group)
353
- if (length(groups) > length(cols)) {
354
- cols = c(cols, c("darkcyan", "darkviolet", "darkgoldenrod", "darkolivegreen", "darkslategray", "darkkhaki"))
355
- }
356
- cols = cols[1:length(groups)]
357
- for (i in seq_along(groups)) {
358
- g = g + geom_vline(
359
- xintercept = mean(data$smid_hydro[data$.Group==groups[i]]),
360
- color=cols[i]
402
+ g <- RidgePlot(
403
+ data = data,
404
+ x = "smid_hydro",
405
+ group_by = ".Group",
406
+ xlab = "CDR3bmr hydrophobicity",
407
+ ylab = "",
408
+ add_vline = TRUE,
409
+ alpha = 0.5,
410
+ title = s,
411
+ flip = TRUE
412
+ )
413
+
414
+ save_plot(g, file.path(odir, "distribution"),
415
+ devpars = list(width = 1000, height = 1000, res = 100),
416
+ formats = c("png", "pdf"))
417
+
418
+ reporter$add(
419
+ list(
420
+ kind = "table_image",
421
+ descr = paste0(
422
+ "Hydrophobicity values are averaged over the CDR3 for each TCR and ",
423
+ "then scaled to have a mean of 0 and a variance of 1. ",
424
+ "Horizontal lines depict the mean for each population"
425
+ ),
426
+ src = file.path(odir, "distribution.png"),
427
+ download = file.path(odir, "distribution.pdf")
428
+ ),
429
+ h1 = ifelse(
430
+ is.null(s),
431
+ "Hydrophobicity Distribution",
432
+ paste0(paste(each_cols, collapse = ", "), " - ", s)
433
+ ),
434
+ h2 = ifelse(
435
+ is.null(s),
436
+ "#",
437
+ "Hydrophobicity Distribution"
361
438
  )
362
- }
363
- g = g + geom_density_ridges(
364
- aes(x=data$smid_hydro, y=data$.Group, color=data$.Group, fill=data$.Group),
365
- bandwidth=0.5,
366
- alpha=0.4,
367
- show.legend = FALSE
368
- ) + scale_color_manual(values=cols)
369
- g = g + scale_fill_manual(values=cols) + theme_bw(base_size=12)
370
- g = g + xlim(c(-4,4)) + xlab("CDR3bmr hydrophobicity") + ylab("") + coord_flip() + ggtitle(s)
371
-
372
- png(file.path(odir, "distribution.png"), width=1000, height=1000, res=100)
373
- print(g)
374
- dev.off()
439
+ )
440
+
375
441
  }
376
442
 
377
- if (is.null(subset_cols)) {
443
+ if (is.null(each_cols)) {
378
444
  do_one_subset(NULL)
379
445
  } else {
380
- subsets = na.omit(unique(merged$.Subset))
446
+ subsets = na.omit(unique(obj$.Subset))
381
447
  sapply(subsets, do_one_subset)
382
448
  }
449
+
450
+ reporter$save(joboutdir)