biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290) hide show
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +28 -0
  3. biopipen/core/filters.py +79 -4
  4. biopipen/core/proc.py +12 -3
  5. biopipen/core/testing.py +75 -3
  6. biopipen/ns/bam.py +148 -6
  7. biopipen/ns/bed.py +75 -0
  8. biopipen/ns/cellranger.py +186 -0
  9. biopipen/ns/cellranger_pipeline.py +126 -0
  10. biopipen/ns/cnv.py +19 -3
  11. biopipen/ns/cnvkit.py +1 -1
  12. biopipen/ns/cnvkit_pipeline.py +20 -12
  13. biopipen/ns/delim.py +34 -35
  14. biopipen/ns/gene.py +68 -23
  15. biopipen/ns/gsea.py +63 -37
  16. biopipen/ns/misc.py +39 -14
  17. biopipen/ns/plot.py +304 -1
  18. biopipen/ns/protein.py +183 -0
  19. biopipen/ns/regulatory.py +290 -0
  20. biopipen/ns/rnaseq.py +142 -5
  21. biopipen/ns/scrna.py +2053 -473
  22. biopipen/ns/scrna_metabolic_landscape.py +228 -382
  23. biopipen/ns/snp.py +659 -0
  24. biopipen/ns/stats.py +484 -0
  25. biopipen/ns/tcr.py +683 -98
  26. biopipen/ns/vcf.py +236 -2
  27. biopipen/ns/web.py +97 -6
  28. biopipen/reports/bam/CNVpytor.svelte +4 -9
  29. biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
  30. biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
  31. biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
  32. biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
  33. biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
  34. biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
  35. biopipen/reports/common.svelte +15 -0
  36. biopipen/reports/protein/ProdigySummary.svelte +16 -0
  37. biopipen/reports/scrna/CellsDistribution.svelte +4 -39
  38. biopipen/reports/scrna/DimPlots.svelte +1 -1
  39. biopipen/reports/scrna/MarkersFinder.svelte +6 -126
  40. biopipen/reports/scrna/MetaMarkers.svelte +3 -75
  41. biopipen/reports/scrna/RadarPlots.svelte +4 -20
  42. biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
  43. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
  44. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
  45. biopipen/reports/snp/PlinkCallRate.svelte +24 -0
  46. biopipen/reports/snp/PlinkFreq.svelte +18 -0
  47. biopipen/reports/snp/PlinkHWE.svelte +18 -0
  48. biopipen/reports/snp/PlinkHet.svelte +18 -0
  49. biopipen/reports/snp/PlinkIBD.svelte +18 -0
  50. biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
  51. biopipen/reports/tcr/ClonalStats.svelte +16 -0
  52. biopipen/reports/tcr/CloneResidency.svelte +3 -93
  53. biopipen/reports/tcr/Immunarch.svelte +4 -155
  54. biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
  55. biopipen/reports/tcr/TESSA.svelte +11 -28
  56. biopipen/reports/utils/misc.liq +22 -7
  57. biopipen/scripts/bam/BamMerge.py +11 -15
  58. biopipen/scripts/bam/BamSampling.py +90 -0
  59. biopipen/scripts/bam/BamSort.py +141 -0
  60. biopipen/scripts/bam/BamSplitChroms.py +10 -10
  61. biopipen/scripts/bam/BamSubsetByBed.py +38 -0
  62. biopipen/scripts/bam/CNAClinic.R +41 -5
  63. biopipen/scripts/bam/CNVpytor.py +153 -54
  64. biopipen/scripts/bam/ControlFREEC.py +13 -14
  65. biopipen/scripts/bam/SamtoolsView.py +33 -0
  66. biopipen/scripts/bed/Bed2Vcf.py +5 -5
  67. biopipen/scripts/bed/BedConsensus.py +5 -5
  68. biopipen/scripts/bed/BedLiftOver.sh +6 -4
  69. biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
  70. biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
  71. biopipen/scripts/bed/BedtoolsMerge.py +4 -4
  72. biopipen/scripts/cellranger/CellRangerCount.py +138 -0
  73. biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
  74. biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
  75. biopipen/scripts/cnv/AneuploidyScore.R +55 -20
  76. biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
  77. biopipen/scripts/cnv/TMADScore.R +25 -9
  78. biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
  79. biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
  80. biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
  81. biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
  82. biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
  83. biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
  84. biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
  85. biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
  86. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
  87. biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
  88. biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
  89. biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
  90. biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
  91. biopipen/scripts/cnvkit/guess_baits.py +166 -93
  92. biopipen/scripts/delim/RowsBinder.R +1 -1
  93. biopipen/scripts/delim/SampleInfo.R +116 -118
  94. biopipen/scripts/gene/GeneNameConversion.R +67 -0
  95. biopipen/scripts/gene/GenePromoters.R +61 -0
  96. biopipen/scripts/gsea/Enrichr.R +5 -5
  97. biopipen/scripts/gsea/FGSEA.R +184 -50
  98. biopipen/scripts/gsea/GSEA.R +2 -2
  99. biopipen/scripts/gsea/PreRank.R +5 -5
  100. biopipen/scripts/misc/Config2File.py +2 -2
  101. biopipen/scripts/misc/Plot.R +80 -0
  102. biopipen/scripts/misc/Shell.sh +15 -0
  103. biopipen/scripts/misc/Str2File.py +2 -2
  104. biopipen/scripts/plot/Heatmap.R +3 -3
  105. biopipen/scripts/plot/Manhattan.R +147 -0
  106. biopipen/scripts/plot/QQPlot.R +146 -0
  107. biopipen/scripts/plot/ROC.R +88 -0
  108. biopipen/scripts/plot/Scatter.R +112 -0
  109. biopipen/scripts/plot/VennDiagram.R +5 -9
  110. biopipen/scripts/protein/MMCIF2PDB.py +33 -0
  111. biopipen/scripts/protein/PDB2Fasta.py +60 -0
  112. biopipen/scripts/protein/Prodigy.py +119 -0
  113. biopipen/scripts/protein/ProdigySummary.R +140 -0
  114. biopipen/scripts/protein/RMSD.py +178 -0
  115. biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
  116. biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
  117. biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
  118. biopipen/scripts/regulatory/MotifScan.py +159 -0
  119. biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
  120. biopipen/scripts/regulatory/motifs-common.R +324 -0
  121. biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
  122. biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
  123. biopipen/scripts/rnaseq/Simulation.R +21 -0
  124. biopipen/scripts/rnaseq/UnitConversion.R +325 -54
  125. biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
  126. biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
  127. biopipen/scripts/scrna/CellCellCommunication.py +150 -0
  128. biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
  129. biopipen/scripts/scrna/CellSNPLite.py +30 -0
  130. biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
  131. biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
  132. biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
  133. biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
  134. biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
  135. biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
  136. biopipen/scripts/scrna/CellsDistribution.R +456 -167
  137. biopipen/scripts/scrna/DimPlots.R +1 -1
  138. biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
  139. biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
  140. biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
  141. biopipen/scripts/scrna/ExprImputation.R +7 -0
  142. biopipen/scripts/scrna/LoomTo10X.R +51 -0
  143. biopipen/scripts/scrna/MQuad.py +25 -0
  144. biopipen/scripts/scrna/MarkersFinder.R +679 -400
  145. biopipen/scripts/scrna/MetaMarkers.R +265 -161
  146. biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
  147. biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
  148. biopipen/scripts/scrna/RadarPlots.R +355 -134
  149. biopipen/scripts/scrna/ScFGSEA.R +298 -100
  150. biopipen/scripts/scrna/ScSimulation.R +65 -0
  151. biopipen/scripts/scrna/ScVelo.py +617 -0
  152. biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
  153. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
  154. biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
  155. biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
  156. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
  157. biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
  158. biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
  159. biopipen/scripts/scrna/SeuratClustering.R +36 -233
  160. biopipen/scripts/scrna/SeuratLoading.R +2 -2
  161. biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
  162. biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
  163. biopipen/scripts/scrna/SeuratPreparing.R +223 -173
  164. biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
  165. biopipen/scripts/scrna/SeuratTo10X.R +27 -0
  166. biopipen/scripts/scrna/Slingshot.R +65 -0
  167. biopipen/scripts/scrna/Subset10X.R +2 -2
  168. biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
  169. biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
  170. biopipen/scripts/scrna/scvelo_paga.py +313 -0
  171. biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
  172. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
  173. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
  174. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
  175. biopipen/scripts/snp/MatrixEQTL.R +217 -0
  176. biopipen/scripts/snp/Plink2GTMat.py +148 -0
  177. biopipen/scripts/snp/PlinkCallRate.R +199 -0
  178. biopipen/scripts/snp/PlinkFilter.py +100 -0
  179. biopipen/scripts/snp/PlinkFreq.R +291 -0
  180. biopipen/scripts/snp/PlinkFromVcf.py +81 -0
  181. biopipen/scripts/snp/PlinkHWE.R +85 -0
  182. biopipen/scripts/snp/PlinkHet.R +96 -0
  183. biopipen/scripts/snp/PlinkIBD.R +196 -0
  184. biopipen/scripts/snp/PlinkSimulation.py +124 -0
  185. biopipen/scripts/snp/PlinkUpdateName.py +124 -0
  186. biopipen/scripts/stats/ChowTest.R +146 -0
  187. biopipen/scripts/stats/DiffCoexpr.R +152 -0
  188. biopipen/scripts/stats/LiquidAssoc.R +135 -0
  189. biopipen/scripts/stats/Mediation.R +108 -0
  190. biopipen/scripts/stats/MetaPvalue.R +130 -0
  191. biopipen/scripts/stats/MetaPvalue1.R +74 -0
  192. biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
  193. biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
  194. biopipen/scripts/tcr/Attach2Seurat.R +3 -2
  195. biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
  196. biopipen/scripts/tcr/CDR3Clustering.R +343 -0
  197. biopipen/scripts/tcr/ClonalStats.R +526 -0
  198. biopipen/scripts/tcr/CloneResidency.R +255 -131
  199. biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
  200. biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
  201. biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
  202. biopipen/scripts/tcr/GIANA/query.py +164 -162
  203. biopipen/scripts/tcr/Immunarch-basic.R +31 -9
  204. biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
  205. biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
  206. biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
  207. biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
  208. biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
  209. biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
  210. biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
  211. biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
  212. biopipen/scripts/tcr/Immunarch.R +63 -11
  213. biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
  214. biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
  215. biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
  216. biopipen/scripts/tcr/SampleDiversity.R +1 -1
  217. biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
  218. biopipen/scripts/tcr/ScRepLoading.R +166 -0
  219. biopipen/scripts/tcr/TCRClusterStats.R +176 -22
  220. biopipen/scripts/tcr/TCRDock.py +110 -0
  221. biopipen/scripts/tcr/TESSA.R +102 -118
  222. biopipen/scripts/tcr/VJUsage.R +5 -5
  223. biopipen/scripts/tcr/immunarch-patched.R +142 -0
  224. biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
  225. biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
  226. biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
  227. biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
  228. biopipen/scripts/vcf/BcftoolsSort.py +113 -0
  229. biopipen/scripts/vcf/BcftoolsView.py +73 -0
  230. biopipen/scripts/vcf/TruvariBench.sh +14 -7
  231. biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
  232. biopipen/scripts/vcf/TruvariConsistency.R +1 -1
  233. biopipen/scripts/vcf/Vcf2Bed.py +2 -2
  234. biopipen/scripts/vcf/VcfAnno.py +11 -11
  235. biopipen/scripts/vcf/VcfDownSample.sh +22 -10
  236. biopipen/scripts/vcf/VcfFilter.py +5 -5
  237. biopipen/scripts/vcf/VcfFix.py +7 -7
  238. biopipen/scripts/vcf/VcfFix_utils.py +13 -4
  239. biopipen/scripts/vcf/VcfIndex.py +3 -3
  240. biopipen/scripts/vcf/VcfIntersect.py +3 -3
  241. biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
  242. biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
  243. biopipen/scripts/vcf/bcftools_utils.py +52 -0
  244. biopipen/scripts/web/Download.py +8 -4
  245. biopipen/scripts/web/DownloadList.py +5 -5
  246. biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
  247. biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
  248. biopipen/scripts/web/gcloud_common.py +49 -0
  249. biopipen/utils/gene.py +108 -60
  250. biopipen/utils/misc.py +146 -20
  251. biopipen/utils/reference.py +64 -20
  252. biopipen/utils/reporter.py +177 -0
  253. biopipen/utils/vcf.py +1 -1
  254. biopipen-0.34.26.dist-info/METADATA +27 -0
  255. biopipen-0.34.26.dist-info/RECORD +292 -0
  256. {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
  257. {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
  258. biopipen/ns/bcftools.py +0 -111
  259. biopipen/ns/scrna_basic.py +0 -255
  260. biopipen/reports/delim/SampleInfo.svelte +0 -36
  261. biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
  262. biopipen/reports/scrna/ScFGSEA.svelte +0 -35
  263. biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
  264. biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
  265. biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
  266. biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
  267. biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
  268. biopipen/reports/utils/gsea.liq +0 -110
  269. biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
  270. biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
  271. biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
  272. biopipen/scripts/gene/GeneNameConversion.py +0 -66
  273. biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
  274. biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
  275. biopipen/scripts/scrna/ExprImpution.R +0 -7
  276. biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
  277. biopipen/scripts/scrna/Write10X.R +0 -11
  278. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
  279. biopipen/scripts/tcr/TCRClustering.R +0 -280
  280. biopipen/utils/common_docstrs.py +0 -61
  281. biopipen/utils/gene.R +0 -49
  282. biopipen/utils/gsea.R +0 -193
  283. biopipen/utils/io.R +0 -20
  284. biopipen/utils/misc.R +0 -114
  285. biopipen/utils/mutate_helpers.R +0 -433
  286. biopipen/utils/plot.R +0 -173
  287. biopipen/utils/rnaseq.R +0 -48
  288. biopipen/utils/single_cell.R +0 -115
  289. biopipen-0.21.0.dist-info/METADATA +0 -22
  290. biopipen-0.21.0.dist-info/RECORD +0 -218
@@ -1,175 +1,209 @@
1
- source("{{biopipen_dir}}/utils/misc.R")
2
-
3
1
  library(Seurat)
4
- library(tibble)
5
- library(enrichR)
6
2
  library(rlang)
7
3
  library(dplyr)
8
-
9
- setEnrichrSite("Enrichr")
4
+ library(tidyselect)
5
+ library(biopipen.utils)
10
6
 
11
7
  srtfile <- {{in.srtobj | r}}
12
8
  outdir <- {{out.outdir | r}}
9
+ joboutdir <- {{job.outdir | r}}
13
10
  mutaters <- {{ envs.mutaters | r }}
14
11
  ident <- {{ envs.ident | r }}
15
- group.by <- {{ envs["group-by"] | r }} # nolint
12
+ group_by <- {{ envs.group_by | default: envs["group-by"] | default: None | r }} # nolint
16
13
  each <- {{ envs.each | r }}
17
- prefix_each <- {{ envs.prefix_each | r }}
18
- section <- {{ envs.section | r }}
19
14
  dbs <- {{ envs.dbs | r }}
20
15
  n <- {{ envs.n | r }}
16
+ enrich_style <- {{ envs.enrich_style | r }}
17
+ sset <- {{ envs.subset | r }}
18
+ enrich_plots_defaults <- {{ envs.enrich_plots_defaults | r }}
19
+ enrich_plots <- {{ envs.enrich_plots | r }}
21
20
  cases <- {{ envs.cases | r: todot = "-" }} # nolint
22
21
 
23
22
  set.seed(8525)
23
+ log <- get_logger()
24
+ reporter <- get_reporter()
24
25
 
25
- print("- Loading Seurat object ...")
26
- srtobj <- readRDS(srtfile)
26
+ log$info("Reading Seurat object ...")
27
+ srtobj <- read_obj(srtfile)
28
+ assay <- DefaultAssay(srtobj)
27
29
 
28
- print("- Mutate meta data if needed ...")
29
- if (!is.null(mutaters) && length(mutaters)) {
30
+ if (!is.null(mutaters) && length(mutaters) > 0) {
31
+ log$info("Mutating meta data ...")
30
32
  srtobj@meta.data <- srtobj@meta.data %>%
31
33
  mutate(!!!lapply(mutaters, parse_expr))
32
34
  }
33
35
 
34
- print("- Expanding cases ...")
35
- if (is.null(cases) || length(cases) == 0) {
36
- cases <- list(
37
- DEFAULT = list(
38
- ident = ident,
39
- group.by = group.by,
40
- each = each,
41
- prefix_each = prefix_each,
42
- section = section,
43
- dbs = dbs,
44
- n = n
45
- )
46
- )
47
- } else {
48
- cases <- lapply(cases, function(cs) {
49
- list_setdefault(
50
- cs,
51
- ident = ident,
52
- group.by = group.by,
53
- each = each,
54
- prefix_each = prefix_each,
55
- section = section,
56
- dbs = dbs,
57
- n = n
36
+ enrich_plots <- lapply(enrich_plots, function(x) {
37
+ list_update(enrich_plots_defaults, x)
38
+ })
39
+ defaults <- list(
40
+ ident = ident,
41
+ group_by = group_by,
42
+ each = each,
43
+ dbs = dbs,
44
+ n = n,
45
+ enrich_style = enrich_style,
46
+ enrich_plots = enrich_plots,
47
+ enrich_plots_defaults = enrich_plots_defaults,
48
+ subset = sset
49
+ )
50
+
51
+ cases <- expand_cases(cases, defaults, default_case = "Top Expressing Genes", post = function(name, case) {
52
+ outcases <- list()
53
+ if (is.null(case$each) || is.na(case$each) || nchar(case$each) == 0 || isFALSE(each)) {
54
+ case$enrich_plots <- lapply(
55
+ case$enrich_plots,
56
+ function(x) { list_update(case$enrich_plots_defaults, x) }
58
57
  )
59
- })
60
- }
58
+ case$enrich_plots_defaults <- NULL
61
59
 
62
- # Expand each and ident
63
- newcases <- list()
64
- for (name in names(cases)) { # nolint
65
- case <- cases[[name]]
66
- if (is.null(case$each) && !is.null(case$ident)) {
67
- newcases[[paste0(case$section, ":", name)]] <- case
68
- } else if (is.null(case$each)) {
69
- idents <- srtobj@meta.data %>%
70
- pull(case$group.by) %>%
71
- unique() %>%
72
- na.omit()
73
- for (ident in idents) {
74
- key <- paste0(name, ":", ident)
75
- newcases[[key]] <- case
76
- newcases[[key]]$ident <- ident
77
- }
60
+ outcases[[name]] <- case
78
61
  } else {
79
- eachs <- srtobj@meta.data %>% pull(case$each) %>% unique() %>% na.omit()
62
+ eachs <- if (!is.null(case$subset)) {
63
+ srtobj@meta.data %>%
64
+ filter(!!parse_expr(case$subset)) %>%
65
+ pull(case$each) %>% na.omit() %>% unique() %>% as.vector()
66
+ } else {
67
+ srtobj@meta.data %>%
68
+ pull(case$each) %>% na.omit() %>% unique() %>% as.vector()
69
+ }
70
+
71
+ if (length(cases) == 0 && name == "Top Expressing Genes") {
72
+ name <- case$each
73
+ }
74
+
80
75
  for (each in eachs) {
81
- by <- make.names(paste0(".", name, "_", each))
82
- srtobj@meta.data <- srtobj@meta.data %>% mutate(
83
- !!sym(by) := if_else(
84
- !!sym(case$each) == each,
85
- !!sym(case$group.by),
86
- NA
87
- )
88
- )
89
- if (is.null(case$ident)) {
90
- idents <- srtobj@meta.data %>%
91
- pull(case$group.by) %>%
92
- unique() %>%
93
- na.omit()
94
- for (ident in idents) {
95
- kname <- if (name == "DEFAULT") "" else paste0("-", name)
96
- key <- paste0(each, kname, ":", ident)
97
- if (case$prefix_each) {
98
- key <- paste0(case$each, "-", key)
99
- }
100
- newcases[[key]] <- case
101
- newcases[[key]]$ident <- ident
102
- newcases[[key]]$group.by <- by # nolint
103
- }
76
+ newname <- paste0(name, " - ", each)
77
+ newcase <- case
78
+ newcase$each_name <- case$each
79
+ newcase$each <- each
80
+
81
+ if (!is.null(case$subset)) {
82
+ newcase$subset <- paste0(case$subset, " & ", bQuote(case$each), " == '", each, "'")
104
83
  } else {
105
- key <- paste0(case$each, ":", each)
106
- if (name != "DEFAULT") {
107
- key <- paste0(key, " - ", name)
108
- }
109
- newcases[[key]] <- case
84
+ newcase$subset <- paste0(bQuote(case$each), " == '", each, "'")
110
85
  }
86
+
87
+ newcase$enrich_plots <- lapply(
88
+ case$enrich_plots,
89
+ function(x) { list_update(case$enrich_plots_defaults, x) }
90
+ )
91
+ newcase$enrich_plots_defaults <- NULL
92
+
93
+ outcases[[newname]] <- newcase
111
94
  }
112
95
  }
113
- }
114
- cases <- newcases
115
-
116
- do_enrich <- function(expr, odir) {
117
- print(" Saving expressions ...")
118
- write.table(
119
- expr %>% as.data.frame() %>% rownames_to_column("Gene"),
120
- file.path(odir, "expr.txt"),
121
- sep = "\t",
122
- row.names = TRUE,
123
- col.names = TRUE,
124
- quote = FALSE
96
+
97
+ outcases
98
+ })
99
+
100
+ log$info("Running cases ...")
101
+
102
+ process_markers <- function(markers, info, case) {
103
+ # Save markers
104
+ write.table(markers, file.path(info$prefix, "top_genes.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
105
+ reporter$add2(
106
+ list(
107
+ name = "Table",
108
+ contents = list(
109
+ list(kind = "descr", content = "Showing top expressing genes ordered by their expression descendingly."),
110
+ list(kind = "table", src = file.path(info$prefix, "top_genes.tsv"), data = list(nrows = 100))
111
+ )
112
+ ),
113
+ hs = c(info$section, info$name),
114
+ hs2 = paste0("Top Genes"),
115
+ ui = "tabs"
125
116
  )
126
- write.table(
127
- expr %>% as.data.frame() %>% rownames_to_column("Gene") %>% head(n),
128
- file.path(odir, "exprn.txt"),
129
- sep = "\t",
130
- row.names = TRUE,
131
- col.names = TRUE,
132
- quote = FALSE
117
+
118
+ enrich <- RunEnrichment(
119
+ markers$gene,
120
+ dbs = case$dbs, style = case$enrich_style)
121
+
122
+ write.table(enrich, file.path(info$prefix, "enrich.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
123
+ reporter$add2(
124
+ list(
125
+ name = "Table",
126
+ contents = list(list(kind = "table", src = file.path(info$prefix, "enrich.tsv"), data = list(nrows = 100)))
127
+ ),
128
+ hs = c(info$section, info$name),
129
+ hs2 = "Enrichment Analysis",
130
+ ui = "tabs"
133
131
  )
134
132
 
135
- print(" Running enrichment ...")
136
- enriched <- enrichr(rownames(head(expr, n)), dbs) # nolint
137
- for (db in dbs) {
138
- write.table(
139
- enriched[[db]],
140
- file.path(odir, paste0("Enrichr-", db, ".txt")),
141
- sep = "\t",
142
- row.names = FALSE,
143
- col.names = TRUE,
144
- quote = FALSE
145
- )
146
- png(
147
- file.path(odir, paste0("Enrichr-", db, ".png")),
148
- res = 100, height = 1000, width = 1000
149
- )
150
- print(plotEnrich(enriched[[db]], showTerms = 20, title = db)) # nolint
151
- dev.off()
133
+ # Visualize enriched terms
134
+ if (length(case$enrich_plots) > 0) {
135
+ for (db in case$dbs) {
136
+ plots <- list()
137
+ for (plotname in names(case$enrich_plots)) {
138
+ plotargs <- case$enrich_plots[[plotname]]
139
+ plotargs$data <- enrich[enrich$Database == db, , drop = FALSE]
140
+
141
+ p <- do_call(VizEnrichment, plotargs)
142
+
143
+ outprefix <- file.path(info$prefix, paste0("enrich.", slugify(db), ".", slugify(plotname)))
144
+ if (plotargs$plot_type == "bar") {
145
+ attr(p, "height") <- attr(p, "height") / 1.5
146
+ }
147
+ save_plot(p, outprefix, plotargs$devpars, formats = "png")
148
+ plots[[length(plots) + 1]] <- reporter$image(outprefix, c(), FALSE)
149
+ }
150
+ reporter$add2(
151
+ list(name = db, contents = plots),
152
+ hs = c(info$section, info$name),
153
+ hs2 = "Enrichment Analysis",
154
+ ui = "tabs"
155
+ )
156
+ }
152
157
  }
153
158
  }
154
159
 
155
- do_case <- function(casename) {
156
- print(paste("- Running for case:", casename))
157
- case <- cases[[casename]]
158
- parts <- unlist(strsplit(casename, ":"))
159
- section <- parts[1]
160
- casename <- paste(parts[-1], collapse = ":")
161
160
 
162
- print(" Calculating average expression ...")
161
+ run_case <- function(name) {
162
+ log$info("Case: {name} ...")
163
+ case <- cases[[name]]
164
+
165
+ log$info("- Subsetting cells and calculating average expression ...")
166
+ if (!is.null(case$subset)) {
167
+ subobj <- filter(srtobj, !!parse_expr(case$subset))
168
+ } else {
169
+ subobj <- srtobj
170
+ }
171
+ case$group_by <- case$group_by %||% GetIdentityColumn(srtobj)
172
+ if (is.null(case$ident)) {
173
+ case$ident <- as.character(unique(subobj@meta.data[[case$group_by]]))
174
+ }
163
175
  avgexpr <- AverageExpression(
164
- srtobj,
165
- group.by = case$group.by
166
- )$RNA[, case$ident, drop = FALSE]
167
- avgexpr <- avgexpr[order(-avgexpr), , drop = FALSE]
176
+ subobj,
177
+ group_by = case$group_by,
178
+ assays = assay
179
+ )[[assay]]
180
+ # https://github.com/satijalab/seurat/issues/7893
181
+ colnames(avgexpr) <- as.character(unique(subobj@meta.data[[case$group_by]]))
182
+ avgexpr <- avgexpr[, case$ident, drop = FALSE]
168
183
 
169
- odir <- file.path(outdir, section, casename)
170
- dir.create(odir, recursive = TRUE, showWarnings = FALSE)
184
+ for (idt in case$ident) {
185
+ log$info("- Processing {idt} ...")
186
+ info <- case_info(paste0(name, "::", idt), outdir, create = TRUE)
187
+ expr <- avgexpr[, idt, drop = FALSE]
188
+ expr <- expr[order(expr, decreasing = TRUE), , drop = FALSE]
189
+ expr <- expr[1:min(case$n, nrow(expr)), , drop = FALSE]
190
+ expr <- as.data.frame(expr)
191
+ expr$gene <- rownames(expr)
192
+ colnames(expr) <- c("avg_expr", "gene")
193
+ expr <- expr[, c("gene", "avg_expr"), drop = FALSE]
171
194
 
172
- do_enrich(avgexpr, odir)
195
+ log$info(" Performing enrichment analysis ...")
196
+ process_markers(expr, info, case = list(
197
+ ident = idt,
198
+ dbs = case$dbs,
199
+ enrich_style = case$enrich_style,
200
+ enrich_plots = case$enrich_plots
201
+ ))
202
+ }
203
+
204
+ invisible()
173
205
  }
174
206
 
175
- sapply(sort(names(cases)), do_case)
207
+ sapply(names(cases), run_case)
208
+
209
+ reporter$save(joboutdir)
@@ -0,0 +1,195 @@
1
+ from argparse import ArgumentParser
2
+ from typing import Union
3
+ import numpy as np
4
+ import pandas as pd
5
+ import scanpy as sc
6
+ import celltypist
7
+ from celltypist.classifier import logger, AnnData, Model, Classifier
8
+
9
# Command-line interface of the CellTypist wrapper script.
parser = ArgumentParser(description="Run CellTypist")

# Mandatory single-value options: (short flag, long flag, help text).
for _short, _long, _help in (
    ("-i", "--input", "Input H5AD file with AnnData object"),
    ("-o", "--output", "Output file"),
    ("-m", "--model", "Model file"),
):
    parser.add_argument(_short, _long, required=True, help=_help)
# Do not leave the loop variables behind in the module namespace.
del _short, _long, _help

# Optional switch: turns majority voting on when present.
parser.add_argument(
    "-v", "--majority_voting", help="Majority voting", action="store_true"
)
# Optional over-clustering column name; stays None when not given.
parser.add_argument(
    "-c",
    "--over_clustering",
    help="Over clustering. Error if the column does not exist.",
    default=None,
    required=False,
)
25
+
26
+
27
def classifier_init(
    self,
    filename: "Union[AnnData, str]" = "",
    model: "Union[Model, str]" = "",
    transpose: bool = False,
    gene_file: "Union[str, None]" = None,
    cell_file: "Union[str, None]" = None,
):
    """Initialise a CellTypist classifier, choosing `.X` or `.raw.X` carefully.

    CellTypist expects expression values that are log1p-normalised to 10000
    counts per cell. When `.X` of an AnnData input is outside that range,
    `.raw.X` is used instead -- but only if the raw layer exists AND its
    feature names overlap the model features. In some datasets the raw layer
    has invalid feature names (var_names), which would otherwise cause errors;
    in that case `.X` is kept and a warning is logged.

    Args:
        filename: Path to a .csv/.txt/.tsv/.tab/.mtx/.mtx.gz/.h5ad file, or an
            AnnData object already loaded in memory. A falsy value only stores
            the model, logs a warning and returns.
        model: A loaded model, or a string that is passed to `Model.load`.
        transpose: Transpose a text/mtx input right after reading it.
        gene_file: File with one gene name per row; required for mtx input.
        cell_file: File with one cell name per row; required for mtx input.

    Raises:
        FileNotFoundError: mtx input without both `gene_file` and `cell_file`.
        ValueError: gene/cell counts do not match the mtx matrix, both `.X`
            and the selected `.raw.X` are out of range, or the input type is
            not supported.
    """
    if isinstance(model, str):
        model = Model.load(model)
    self.model = model

    if not filename:
        logger.warning("📭 No input file provided to the classifier")
        return

    if isinstance(filename, str):
        self.filename = filename
        logger.info(f"📁 Input file is '{self.filename}'")
        logger.info("⏳ Loading data")

    if isinstance(filename, str) and filename.endswith(
        (".csv", ".txt", ".tsv", ".tab", ".mtx", ".mtx.gz")
    ):
        # Text / MatrixMarket input: treated as raw counts and normalised here.
        self.adata = sc.read(self.filename)
        if transpose:
            self.adata = self.adata.transpose()
        if self.filename.endswith((".mtx", ".mtx.gz")):
            # mtx files carry no names, so both name files are mandatory.
            if (gene_file is None) or (cell_file is None):
                raise FileNotFoundError(
                    "🛑 Missing `gene_file` and/or `cell_file`. Please provide both "
                    "arguments together with the input mtx file"
                )
            genes_mtx = pd.read_csv(gene_file, header=None)[0].values
            cells_mtx = pd.read_csv(cell_file, header=None)[0].values
            if len(genes_mtx) != self.adata.n_vars:
                raise ValueError(
                    f"🛑 The number of genes in {gene_file} does not match the number "
                    f"of genes in {self.filename}"
                )
            if len(cells_mtx) != self.adata.n_obs:
                raise ValueError(
                    f"🛑 The number of cells in {cell_file} does not match the number "
                    f"of cells in {self.filename}"
                )
            self.adata.var_names = genes_mtx
            self.adata.obs_names = cells_mtx
        # A non-integer maximum suggests the matrix is not raw counts.
        if not float(self.adata.X[:1000].max()).is_integer():
            logger.warning(
                "⚠️ Warning: the input file seems not a raw count matrix. The "
                "prediction result may not be accurate"
            )
        # Heuristics for a gene-by-cell layout: implausibly many "genes", very
        # long first feature name, or well-known gene symbols on the cell axis.
        if (
            (self.adata.n_vars >= 100000)
            or (len(self.adata.var_names[0]) >= 30)
            or (
                len(
                    self.adata.obs_names.intersection(
                        ["GAPDH", "ACTB", "CALM1", "PTPRC", "MALAT1"]
                    )
                )
                >= 1
            )
        ):
            logger.warning(
                "⚠️ The input matrix is detected to be a gene-by-cell matrix, will "
                "transpose it"
            )
            self.adata = self.adata.transpose()
        self.adata.var_names_make_unique()
        sc.pp.normalize_total(self.adata, target_sum=1e4)
        sc.pp.log1p(self.adata)
        self.indata = self.adata.X
        self.indata_genes = self.adata.var_names
        self.indata_names = self.adata.obs_names
    elif isinstance(filename, AnnData) or (
        isinstance(filename, str) and filename.endswith(".h5ad")
    ):
        self.adata = sc.read(filename) if isinstance(filename, str) else filename
        self.adata.var_names_make_unique()

        # Use raw.X only when ALL of the following hold:
        # 1. adata.raw exists
        # 2. adata.X is not in the expected range (log1p(10000) is about 9.21)
        # 3. adata.raw.var_names has intersection with model genes
        # The checks are evaluated lazily, in this order.
        raw = self.adata.raw
        use_raw = False
        if raw is not None:
            x_head = self.adata.X[:1000]
            if x_head.min() < 0 or x_head.max() > 9.22:
                use_raw = bool(
                    np.isin(raw.var_names, self.model.classifier.features).any()
                )

        if use_raw:
            raw_head = raw.X[:1000]
            if (raw_head.min() < 0) or (raw_head.max() > 9.22):
                raise ValueError(
                    "🛑 Invalid expression matrix in both `.X` and `.raw.X`, expect "
                    "log1p normalized expression to 10000 counts per cell"
                )
            logger.info(
                "👀 Invalid expression matrix in `.X`, expect log1p normalized "
                "expression to 10000 counts per cell; will use `.raw.X` instead"
            )
            self.indata = raw.X
            self.indata_genes = raw.var_names
            self.indata_names = raw.obs_names
        else:
            # Either `.X` is in range, or no usable raw layer exists; an
            # out-of-range `.X` is reported by the warning below, not an error.
            self.indata = self.adata.X
            self.indata_genes = self.adata.var_names
            self.indata_names = self.adata.obs_names

        # Sanity check on the first cell: undoing log1p should give ~10000.
        if np.abs(np.expm1(self.indata[0]).sum() - 10000) > 1:
            logger.warning(
                "⚠️ Warning: invalid expression matrix, expect ALL genes and log1p "
                "normalized expression to 10000 counts per cell. The prediction result "
                "may not be accurate"
            )
    else:
        raise ValueError(
            "🛑 Invalid input. Supported types: .csv, .txt, .tsv, .tab, .mtx, .mtx.gz "
            "and .h5ad, or AnnData loaded in memory"
        )

    logger.info(
        f"🔬 Input data has {self.indata.shape[0]} cells and {len(self.indata_genes)} "
        "genes"
    )
156
+
157
+
158
if __name__ == "__main__":
    # Swap in the patched constructor before celltypist creates a Classifier.
    Classifier.__init__ = classifier_init  # type: ignore

    opts = parser.parse_args()
    data = sc.read_h5ad(opts.input)

    cluster_col = opts.over_clustering
    if cluster_col and cluster_col not in data.obs.columns:
        raise ValueError(
            f"Over clustering column '{cluster_col}' not found in AnnData object."
        )

    # Fill in `n_neighbors` when stored neighbor params lack it.
    if "neighbors" in data.uns:
        if "params" in data.uns["neighbors"]:
            data.uns["neighbors"]["params"].setdefault("n_neighbors", 15)

    result = celltypist.annotate(
        data,
        model=opts.model,
        majority_voting=opts.majority_voting,
        over_clustering=cluster_col,
    )
    out_adata = result.to_adata()
    # The `majority_voting` column is intentionally left as is; it is not
    # copied over the over-clustering column.

    if not opts.output.endswith(".h5ad"):
        # Any other extension: write only the per-cell table, tab-separated.
        out_adata.obs.to_csv(opts.output, sep="\t", index=True)
    else:
        # Best effort: rename the `_index` column of the raw var table and
        # drop the raw layer before writing; skipped when that is not possible.
        try:
            out_adata._raw._var.rename(  # type: ignore
                columns={"_index": "features"}, inplace=True
            )
            del out_adata.raw
        except (KeyError, AttributeError):
            pass

        out_adata.write(opts.output)