biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290)
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +28 -0
  3. biopipen/core/filters.py +79 -4
  4. biopipen/core/proc.py +12 -3
  5. biopipen/core/testing.py +75 -3
  6. biopipen/ns/bam.py +148 -6
  7. biopipen/ns/bed.py +75 -0
  8. biopipen/ns/cellranger.py +186 -0
  9. biopipen/ns/cellranger_pipeline.py +126 -0
  10. biopipen/ns/cnv.py +19 -3
  11. biopipen/ns/cnvkit.py +1 -1
  12. biopipen/ns/cnvkit_pipeline.py +20 -12
  13. biopipen/ns/delim.py +34 -35
  14. biopipen/ns/gene.py +68 -23
  15. biopipen/ns/gsea.py +63 -37
  16. biopipen/ns/misc.py +39 -14
  17. biopipen/ns/plot.py +304 -1
  18. biopipen/ns/protein.py +183 -0
  19. biopipen/ns/regulatory.py +290 -0
  20. biopipen/ns/rnaseq.py +142 -5
  21. biopipen/ns/scrna.py +2053 -473
  22. biopipen/ns/scrna_metabolic_landscape.py +228 -382
  23. biopipen/ns/snp.py +659 -0
  24. biopipen/ns/stats.py +484 -0
  25. biopipen/ns/tcr.py +683 -98
  26. biopipen/ns/vcf.py +236 -2
  27. biopipen/ns/web.py +97 -6
  28. biopipen/reports/bam/CNVpytor.svelte +4 -9
  29. biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
  30. biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
  31. biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
  32. biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
  33. biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
  34. biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
  35. biopipen/reports/common.svelte +15 -0
  36. biopipen/reports/protein/ProdigySummary.svelte +16 -0
  37. biopipen/reports/scrna/CellsDistribution.svelte +4 -39
  38. biopipen/reports/scrna/DimPlots.svelte +1 -1
  39. biopipen/reports/scrna/MarkersFinder.svelte +6 -126
  40. biopipen/reports/scrna/MetaMarkers.svelte +3 -75
  41. biopipen/reports/scrna/RadarPlots.svelte +4 -20
  42. biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
  43. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
  44. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
  45. biopipen/reports/snp/PlinkCallRate.svelte +24 -0
  46. biopipen/reports/snp/PlinkFreq.svelte +18 -0
  47. biopipen/reports/snp/PlinkHWE.svelte +18 -0
  48. biopipen/reports/snp/PlinkHet.svelte +18 -0
  49. biopipen/reports/snp/PlinkIBD.svelte +18 -0
  50. biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
  51. biopipen/reports/tcr/ClonalStats.svelte +16 -0
  52. biopipen/reports/tcr/CloneResidency.svelte +3 -93
  53. biopipen/reports/tcr/Immunarch.svelte +4 -155
  54. biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
  55. biopipen/reports/tcr/TESSA.svelte +11 -28
  56. biopipen/reports/utils/misc.liq +22 -7
  57. biopipen/scripts/bam/BamMerge.py +11 -15
  58. biopipen/scripts/bam/BamSampling.py +90 -0
  59. biopipen/scripts/bam/BamSort.py +141 -0
  60. biopipen/scripts/bam/BamSplitChroms.py +10 -10
  61. biopipen/scripts/bam/BamSubsetByBed.py +38 -0
  62. biopipen/scripts/bam/CNAClinic.R +41 -5
  63. biopipen/scripts/bam/CNVpytor.py +153 -54
  64. biopipen/scripts/bam/ControlFREEC.py +13 -14
  65. biopipen/scripts/bam/SamtoolsView.py +33 -0
  66. biopipen/scripts/bed/Bed2Vcf.py +5 -5
  67. biopipen/scripts/bed/BedConsensus.py +5 -5
  68. biopipen/scripts/bed/BedLiftOver.sh +6 -4
  69. biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
  70. biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
  71. biopipen/scripts/bed/BedtoolsMerge.py +4 -4
  72. biopipen/scripts/cellranger/CellRangerCount.py +138 -0
  73. biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
  74. biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
  75. biopipen/scripts/cnv/AneuploidyScore.R +55 -20
  76. biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
  77. biopipen/scripts/cnv/TMADScore.R +25 -9
  78. biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
  79. biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
  80. biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
  81. biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
  82. biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
  83. biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
  84. biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
  85. biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
  86. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
  87. biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
  88. biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
  89. biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
  90. biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
  91. biopipen/scripts/cnvkit/guess_baits.py +166 -93
  92. biopipen/scripts/delim/RowsBinder.R +1 -1
  93. biopipen/scripts/delim/SampleInfo.R +116 -118
  94. biopipen/scripts/gene/GeneNameConversion.R +67 -0
  95. biopipen/scripts/gene/GenePromoters.R +61 -0
  96. biopipen/scripts/gsea/Enrichr.R +5 -5
  97. biopipen/scripts/gsea/FGSEA.R +184 -50
  98. biopipen/scripts/gsea/GSEA.R +2 -2
  99. biopipen/scripts/gsea/PreRank.R +5 -5
  100. biopipen/scripts/misc/Config2File.py +2 -2
  101. biopipen/scripts/misc/Plot.R +80 -0
  102. biopipen/scripts/misc/Shell.sh +15 -0
  103. biopipen/scripts/misc/Str2File.py +2 -2
  104. biopipen/scripts/plot/Heatmap.R +3 -3
  105. biopipen/scripts/plot/Manhattan.R +147 -0
  106. biopipen/scripts/plot/QQPlot.R +146 -0
  107. biopipen/scripts/plot/ROC.R +88 -0
  108. biopipen/scripts/plot/Scatter.R +112 -0
  109. biopipen/scripts/plot/VennDiagram.R +5 -9
  110. biopipen/scripts/protein/MMCIF2PDB.py +33 -0
  111. biopipen/scripts/protein/PDB2Fasta.py +60 -0
  112. biopipen/scripts/protein/Prodigy.py +119 -0
  113. biopipen/scripts/protein/ProdigySummary.R +140 -0
  114. biopipen/scripts/protein/RMSD.py +178 -0
  115. biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
  116. biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
  117. biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
  118. biopipen/scripts/regulatory/MotifScan.py +159 -0
  119. biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
  120. biopipen/scripts/regulatory/motifs-common.R +324 -0
  121. biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
  122. biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
  123. biopipen/scripts/rnaseq/Simulation.R +21 -0
  124. biopipen/scripts/rnaseq/UnitConversion.R +325 -54
  125. biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
  126. biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
  127. biopipen/scripts/scrna/CellCellCommunication.py +150 -0
  128. biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
  129. biopipen/scripts/scrna/CellSNPLite.py +30 -0
  130. biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
  131. biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
  132. biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
  133. biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
  134. biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
  135. biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
  136. biopipen/scripts/scrna/CellsDistribution.R +456 -167
  137. biopipen/scripts/scrna/DimPlots.R +1 -1
  138. biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
  139. biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
  140. biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
  141. biopipen/scripts/scrna/ExprImputation.R +7 -0
  142. biopipen/scripts/scrna/LoomTo10X.R +51 -0
  143. biopipen/scripts/scrna/MQuad.py +25 -0
  144. biopipen/scripts/scrna/MarkersFinder.R +679 -400
  145. biopipen/scripts/scrna/MetaMarkers.R +265 -161
  146. biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
  147. biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
  148. biopipen/scripts/scrna/RadarPlots.R +355 -134
  149. biopipen/scripts/scrna/ScFGSEA.R +298 -100
  150. biopipen/scripts/scrna/ScSimulation.R +65 -0
  151. biopipen/scripts/scrna/ScVelo.py +617 -0
  152. biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
  153. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
  154. biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
  155. biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
  156. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
  157. biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
  158. biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
  159. biopipen/scripts/scrna/SeuratClustering.R +36 -233
  160. biopipen/scripts/scrna/SeuratLoading.R +2 -2
  161. biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
  162. biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
  163. biopipen/scripts/scrna/SeuratPreparing.R +223 -173
  164. biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
  165. biopipen/scripts/scrna/SeuratTo10X.R +27 -0
  166. biopipen/scripts/scrna/Slingshot.R +65 -0
  167. biopipen/scripts/scrna/Subset10X.R +2 -2
  168. biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
  169. biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
  170. biopipen/scripts/scrna/scvelo_paga.py +313 -0
  171. biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
  172. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
  173. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
  174. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
  175. biopipen/scripts/snp/MatrixEQTL.R +217 -0
  176. biopipen/scripts/snp/Plink2GTMat.py +148 -0
  177. biopipen/scripts/snp/PlinkCallRate.R +199 -0
  178. biopipen/scripts/snp/PlinkFilter.py +100 -0
  179. biopipen/scripts/snp/PlinkFreq.R +291 -0
  180. biopipen/scripts/snp/PlinkFromVcf.py +81 -0
  181. biopipen/scripts/snp/PlinkHWE.R +85 -0
  182. biopipen/scripts/snp/PlinkHet.R +96 -0
  183. biopipen/scripts/snp/PlinkIBD.R +196 -0
  184. biopipen/scripts/snp/PlinkSimulation.py +124 -0
  185. biopipen/scripts/snp/PlinkUpdateName.py +124 -0
  186. biopipen/scripts/stats/ChowTest.R +146 -0
  187. biopipen/scripts/stats/DiffCoexpr.R +152 -0
  188. biopipen/scripts/stats/LiquidAssoc.R +135 -0
  189. biopipen/scripts/stats/Mediation.R +108 -0
  190. biopipen/scripts/stats/MetaPvalue.R +130 -0
  191. biopipen/scripts/stats/MetaPvalue1.R +74 -0
  192. biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
  193. biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
  194. biopipen/scripts/tcr/Attach2Seurat.R +3 -2
  195. biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
  196. biopipen/scripts/tcr/CDR3Clustering.R +343 -0
  197. biopipen/scripts/tcr/ClonalStats.R +526 -0
  198. biopipen/scripts/tcr/CloneResidency.R +255 -131
  199. biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
  200. biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
  201. biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
  202. biopipen/scripts/tcr/GIANA/query.py +164 -162
  203. biopipen/scripts/tcr/Immunarch-basic.R +31 -9
  204. biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
  205. biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
  206. biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
  207. biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
  208. biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
  209. biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
  210. biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
  211. biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
  212. biopipen/scripts/tcr/Immunarch.R +63 -11
  213. biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
  214. biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
  215. biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
  216. biopipen/scripts/tcr/SampleDiversity.R +1 -1
  217. biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
  218. biopipen/scripts/tcr/ScRepLoading.R +166 -0
  219. biopipen/scripts/tcr/TCRClusterStats.R +176 -22
  220. biopipen/scripts/tcr/TCRDock.py +110 -0
  221. biopipen/scripts/tcr/TESSA.R +102 -118
  222. biopipen/scripts/tcr/VJUsage.R +5 -5
  223. biopipen/scripts/tcr/immunarch-patched.R +142 -0
  224. biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
  225. biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
  226. biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
  227. biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
  228. biopipen/scripts/vcf/BcftoolsSort.py +113 -0
  229. biopipen/scripts/vcf/BcftoolsView.py +73 -0
  230. biopipen/scripts/vcf/TruvariBench.sh +14 -7
  231. biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
  232. biopipen/scripts/vcf/TruvariConsistency.R +1 -1
  233. biopipen/scripts/vcf/Vcf2Bed.py +2 -2
  234. biopipen/scripts/vcf/VcfAnno.py +11 -11
  235. biopipen/scripts/vcf/VcfDownSample.sh +22 -10
  236. biopipen/scripts/vcf/VcfFilter.py +5 -5
  237. biopipen/scripts/vcf/VcfFix.py +7 -7
  238. biopipen/scripts/vcf/VcfFix_utils.py +13 -4
  239. biopipen/scripts/vcf/VcfIndex.py +3 -3
  240. biopipen/scripts/vcf/VcfIntersect.py +3 -3
  241. biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
  242. biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
  243. biopipen/scripts/vcf/bcftools_utils.py +52 -0
  244. biopipen/scripts/web/Download.py +8 -4
  245. biopipen/scripts/web/DownloadList.py +5 -5
  246. biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
  247. biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
  248. biopipen/scripts/web/gcloud_common.py +49 -0
  249. biopipen/utils/gene.py +108 -60
  250. biopipen/utils/misc.py +146 -20
  251. biopipen/utils/reference.py +64 -20
  252. biopipen/utils/reporter.py +177 -0
  253. biopipen/utils/vcf.py +1 -1
  254. biopipen-0.34.26.dist-info/METADATA +27 -0
  255. biopipen-0.34.26.dist-info/RECORD +292 -0
  256. {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
  257. {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
  258. biopipen/ns/bcftools.py +0 -111
  259. biopipen/ns/scrna_basic.py +0 -255
  260. biopipen/reports/delim/SampleInfo.svelte +0 -36
  261. biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
  262. biopipen/reports/scrna/ScFGSEA.svelte +0 -35
  263. biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
  264. biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
  265. biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
  266. biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
  267. biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
  268. biopipen/reports/utils/gsea.liq +0 -110
  269. biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
  270. biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
  271. biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
  272. biopipen/scripts/gene/GeneNameConversion.py +0 -66
  273. biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
  274. biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
  275. biopipen/scripts/scrna/ExprImpution.R +0 -7
  276. biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
  277. biopipen/scripts/scrna/Write10X.R +0 -11
  278. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
  279. biopipen/scripts/tcr/TCRClustering.R +0 -280
  280. biopipen/utils/common_docstrs.py +0 -61
  281. biopipen/utils/gene.R +0 -49
  282. biopipen/utils/gsea.R +0 -193
  283. biopipen/utils/io.R +0 -20
  284. biopipen/utils/misc.R +0 -114
  285. biopipen/utils/mutate_helpers.R +0 -433
  286. biopipen/utils/plot.R +0 -173
  287. biopipen/utils/rnaseq.R +0 -48
  288. biopipen/utils/single_cell.R +0 -115
  289. biopipen-0.21.0.dist-info/METADATA +0 -22
  290. biopipen-0.21.0.dist-info/RECORD +0 -218
@@ -1,23 +1,28 @@
1
- source("{{biopipen_dir}}/utils/misc.R")
2
-
3
1
  library(glue)
4
2
  library(dplyr)
5
3
  library(tidyr)
6
- library(immunarch)
4
+ library(tibble)
7
5
  library(Seurat)
8
- library(ggplot2)
9
- library(ggprism)
6
+ library(biopipen.utils)
10
7
 
11
- immfile <- {{in.immdata | r}}
12
- exprfile <- {{in.srtobj | r}}
8
+ screpdata <- {{in.screpdata | r}}
13
9
  outfile <- {{out.outfile | r}}
10
+ joboutdir <- {{job.outdir | r}}
14
11
  python <- {{envs.python | r}}
15
12
  within_sample <- {{envs.within_sample | r}}
16
13
  assay <- {{envs.assay | r}}
17
14
  predefined_b <- {{envs.predefined_b | r}}
18
15
  max_iter <- {{envs.max_iter | int}}
19
16
  save_tessa <- {{envs.save_tessa | r}}
20
- tessa_srcdir <- "{{biopipen_dir}}/scripts/tcr/TESSA_source"
17
+
18
+ log <- get_logger()
19
+ reporter <- get_reporter()
20
+
21
+ # This script may be running in the cloud, where <biopipen_dir> cannot be found.
22
+ # Instead, we use the python command, which is associated with the cloud environment,
23
+ # to get the biopipen directory
24
+ biopipen_dir <- get_biopipen_dir(python)
25
+ tessa_srcdir <- file.path(biopipen_dir, "scripts", "tcr", "TESSA_source")
21
26
 
22
27
  outdir <- dirname(outfile)
23
28
  result_dir <- file.path(outdir, "result")
@@ -27,98 +32,51 @@ if (!dir.exists(tessa_dir)) dir.create(tessa_dir)
27
32
 
28
33
  ### Start preparing input files for TESSA
29
34
  # Prepare input files
30
- print("Preparing TCR input file ...")
31
- immdata <- readRDS(immfile)
32
-
33
- has_VJ <- "V.name" %in% colnames(immdata$data[[1]]) && "J.name" %in% colnames(immdata$data[[1]])
34
- # Merge all samples
35
- tcrdata <- do_call(rbind, lapply(seq_len(nrow(immdata$meta)), function(i) {
36
- # Clones Proportion CDR3.aa Barcode
37
- # 5 4 0.008583691 CAVRDTGNTPLVF;CASSEYSNQPQHF GTTCGGGCACTTACGA-1;TCTCTAAGTACCAGTT-1
38
- # 6 4 0.008583691 CALTQAAGNKLTF;CASRPEDLRGQPQHF GCTTGAAGTCGGCACT-1;TACTCGCTCCTAAGTG-1
39
- if (has_VJ) {
40
- cldata = immdata$data[[i]][, c("Barcode", "CDR3.aa", "V.name", "J.name")]
41
- } else {
42
- cldata = immdata$data[[i]][, c("Barcode", "CDR3.aa")]
43
- }
44
- # # A tibble: 4 × 5
45
- # Sample Patient Timepoint Tissue
46
- # <chr> <chr> <chr> <chr>
47
- # 1 MC1685Pt011-Baseline-PB MC1685Pt011 Baseline PB
48
- mdata = as.list(immdata$meta[i, , drop=FALSE])
49
- for (mname in names(mdata)) {
50
- assign(mname, mdata[[mname]])
51
- }
52
-
53
- cldata %>%
54
- separate_rows(Barcode, sep=";") %>%
55
- # Just in case there are duplicated barcodes
56
- distinct(Barcode, .keep_all = TRUE) %>%
57
- mutate(Barcode = glue("{{envs.prefix}}{Barcode}"), sample = Sample)
58
- }))
59
- if (has_VJ) {
60
- tcrdata <- tcrdata %>% dplyr::mutate(
61
- v_gene = sub("-\\d+$", "", V.name),
62
- j_gene = sub("-\\d+$", "", J.name)
63
- ) %>% dplyr::select(
64
- contig_id = Barcode,
65
- cdr3 = CDR3.aa,
66
- v_gene,
67
- j_gene,
68
- sample
69
- )
70
- } else {
71
- tcrdata <- tcrdata %>% dplyr::select(
72
- contig_id = Barcode,
73
- cdr3 = CDR3.aa,
74
- sample
75
- )
76
- }
77
-
78
-
79
- print("Preparing expression input file ...")
80
- is_seurat <- endsWith(tolower(exprfile), ".rds")
81
- is_gz <- endsWith(tolower(exprfile), ".gz")
82
-
83
- if (is_seurat) {
84
- sobj <- readRDS(exprfile)
85
- expr <- GetAssayData(sobj, slot = "data", assay = assay)
86
- } else if (is_gz) {
87
- expr <- read.table(gzfile(exprfile), sep="\t", header=TRUE, row.names=1)
88
- } else {
89
- expr <- read.table(exprfile, sep="\t", header=TRUE, row.names=1)
90
- }
91
-
35
+ log$info("Reading input file ...")
36
+ sobj <- read_obj(screpdata)
37
+
38
+ log$info("Preparing TCR input file ...")
39
+ # Build the TCR table from the CTaa/CTgene columns of the Seurat object's metadata
40
+ tcrdata <- sobj@meta.data %>%
41
+ rownames_to_column("contig_id") %>%
42
+ select(contig_id, CTaa, CTgene, sample = Sample) %>%
43
+ filter(!is.na(CTaa) & !is.na(CTgene)) %>%
44
+ separate(CTaa, into = c(NA, "cdr3"), sep = "_", remove = TRUE) %>%
45
+ filter(!is.na(cdr3) & cdr3 != "NA" & cdr3 != "nan") %>%
46
+ separate(CTgene, into = c(NA, "vjgene"), sep = "_", remove = TRUE) %>%
47
+ separate(vjgene, into = c("v_gene", NA, "j_gene", NA), sep = "\\.", remove = TRUE) %>%
48
+ mutate(v_gene = sub("-\\d+$", "", v_gene), j_gene = sub("-\\d+$", "", j_gene))
49
+
50
+ log$info("Preparing expression input file ...")
51
+ expr <- GetAssayData(sobj, layer = "data")
92
52
  cell_ids <- intersect(tcrdata$contig_id, colnames(expr))
93
53
  # Warning about unused cells
94
- unused_tcr_cells <- setdiff(tcrdata$contig_id, cell_ids)
95
54
  unused_expr_cells <- setdiff(colnames(expr), cell_ids)
96
- if (length(unused_tcr_cells) > 0) {
97
- warning(glue("{length(unused_tcr_cells)}/{nrow(tcrdata)} TCR cells are not used."), immediate. = TRUE)
98
- }
99
55
  if (length(unused_expr_cells) > 0) {
100
- warning(glue("{length(unused_expr_cells)}/{ncol(expr)} expression cells are not used."), immediate. = TRUE)
56
+ log$warn(glue("{length(unused_expr_cells)}/{ncol(expr)} cells without TCR data are not used."))
101
57
  }
102
58
  if (length(cell_ids) == 0) {
103
- stop("No common cells between TCR and expression data. Are you using the correct prefix?")
59
+ stop(
60
+ "No TCR data found in the Seurat object. ",
61
+ "Please use scRepertiore::combineExpression() to generate the Seurat object with TCR data."
62
+ )
104
63
  }
105
- tcrdata <- tcrdata[tcrdata$contig_id %in% cell_ids, , drop=FALSE]
106
64
  expr <- as.matrix(expr)[, tcrdata$contig_id, drop=FALSE]
107
65
 
108
66
  # Write input files
109
- print("Writing input files ...")
67
+ log$info("Writing input files ...")
110
68
  write.table(tcrdata, file.path(tessa_dir, "tcrdata.txt"), sep=",", quote=FALSE, row.names=FALSE)
111
69
  write.table(expr, file.path(tessa_dir, "exprdata.txt"), sep=",", quote=FALSE, row.names=TRUE, col.names=TRUE)
112
70
 
113
71
  ### End preparing input files for TESSA
114
72
 
115
73
  ### Start running TESSA
116
- print("Running TESSA ...")
74
+ log$info("Running TESSA ...")
117
75
 
118
76
  # The original TESSA uses a python wrapper to run the encoder and tessa model
119
77
  # here we run those two steps directly here
120
78
 
121
- print("- Running encoder ...")
79
+ log$info("- Running encoder ...")
122
80
  cmd_encoder <- paste(
123
81
  python,
124
82
  file.path(tessa_srcdir, "BriseisEncoder.py"),
@@ -133,21 +91,22 @@ cmd_encoder <- paste(
133
91
  "-output_log",
134
92
  file.path(tessa_dir, "tcr_encoder.log")
135
93
  )
136
- if (has_VJ) {
137
- cmd_encoder <- paste(
138
- cmd_encoder,
139
- "-output_VJ",
140
- file.path(tessa_dir, "tcr_vj.txt")
141
- )
142
- }
143
- print(paste("- ", cmd_encoder))
94
+ cmd_encoder <- paste(
95
+ cmd_encoder,
96
+ "-output_VJ",
97
+ file.path(tessa_dir, "tcr_vj.txt")
98
+ )
99
+
100
+ print("Running:")
101
+ print(cmd_encoder)
102
+ log$debug(paste("- ", cmd_encoder))
144
103
 
145
104
  rc <- system(cmd_encoder)
146
105
  if (rc != 0) {
147
106
  stop("Error: Failed to run encoder.")
148
107
  }
149
108
 
150
- print("- Running TESSA model ...")
109
+ log$info("- Running TESSA model ...")
151
110
  source(file.path(tessa_srcdir, "real_data.R"))
152
111
 
153
112
  tessa <- run_tessa(
@@ -162,42 +121,67 @@ tessa <- run_tessa(
162
121
  )
163
122
 
164
123
  # Save TESSA results
165
- print("Saving TESSA results ...")
166
- if (is_seurat) {
167
- cells <- rownames(sobj@meta.data)
168
- sobj@meta.data <- sobj@meta.data %>%
169
- mutate(
170
- TESSA_Cluster = tessa$meta[
171
- match(cells, tessa$meta$barcode),
172
- "cluster_number"
173
- ]
174
- ) %>%
175
- add_count(TESSA_Cluster, name = "TESSA_Cluster_Size")
176
- rownames(sobj@meta.data) <- cells
177
-
178
- if (save_tessa) {
179
- sobj@misc$tessa <- tessa
180
- }
181
- saveRDS(sobj, outfile)
182
- } else {
183
- out <- tessa$meta %>%
184
- dplyr::select(barcode, TESSA_Cluster = cluster_number) %>%
185
- add_count(TESSA_Cluster, name = "TESSA_Cluster_Size")
186
- write.table(out, outfile, sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE)
124
+ log$info("Saving TESSA results ...")
125
+ cells <- rownames(sobj@meta.data)
126
+ sobj@meta.data <- sobj@meta.data %>%
127
+ mutate(
128
+ TESSA_Cluster = tessa$meta[
129
+ match(cells, tessa$meta$barcode),
130
+ "cluster_number"
131
+ ]
132
+ ) %>%
133
+ add_count(TESSA_Cluster, name = "TESSA_Cluster_Size")
134
+ rownames(sobj@meta.data) <- cells
135
+
136
+ if (save_tessa) {
137
+ sobj@misc$tessa <- tessa
187
138
  }
139
+ save_obj(sobj, outfile)
188
140
 
189
141
  # Post analysis
190
- print("Post analysis ...")
142
+ log$info("Post analysis ...")
191
143
  plot_tessa(tessa, result_dir)
192
144
  plot_Tessa_clusters(tessa, result_dir)
193
145
 
194
146
  p <- tessa$meta %>%
195
147
  dplyr::select(barcode, TESSA_Cluster = cluster_number) %>%
196
148
  add_count(TESSA_Cluster, name = "TESSA_Cluster_Size") %>%
197
- ggplot(aes(x = TESSA_Cluster_Size)) +
198
- geom_histogram(binwidth = 1) +
199
- theme_prism()
149
+ plotthis::Histogram(x = "TESSA_Cluster_Size")
150
+
151
+ res <- 100
152
+ height <- attr(p, "height") * res
153
+ width <- attr(p, "width") * res
154
+ prefix <- file.path(result_dir, "Cluster_size_dist")
155
+ save_plot(p, prefix, devpars = list(width = width, height = height, res = res))
156
+
157
+ reporter$add(
158
+ list(
159
+ src = file.path(result_dir, "Cluster_size_dist.png"),
160
+ descr = "Histogram of cluster size distribution",
161
+ download = file.path(result_dir, "Cluster_size_dist.pdf")
162
+ ),
163
+ list(
164
+ src = file.path(result_dir, "clone_size.png"),
165
+ descr = "Center cluster size vs. non-center cluster size"
166
+ ),
167
+ list(
168
+ src = file.path(result_dir, "exp_TCR_pair_plot.png"),
169
+ descr = "Expression-TCR distance plot"
170
+ ),
171
+ list(
172
+ src = file.path(result_dir, "TCR_dist_density.png"),
173
+ descr = "TCR distance density plot"
174
+ ),
175
+ list(
176
+ src = file.path(result_dir, "TCR_explore.png"),
177
+ descr = "Exploratory plot at the TCR level"
178
+ ),
179
+ list(
180
+ src = file.path(result_dir, "TCR_explore_clusters.png"),
181
+ descr = "TESSA clusters"
182
+ ),
183
+ h1 = "TESSA Results",
184
+ ui = "table_of_images"
185
+ )
200
186
 
201
- png(file.path(result_dir, "Cluster_size_dist.png"), width=8, height=8, units="in", res=100)
202
- print(p)
203
- dev.off()
187
+ reporter$save(joboutdir)
@@ -1,9 +1,9 @@
1
1
 
2
- infile = {{in.infile | quote}}
3
- outprefix = {{out.outfile | prefix | replace: ".fancyvj.wt", "" | quote}}
4
- vdjtools = {{ envs.vdjtools | quote }}
5
- vdjtools_patch = {{ envs.vdjtools_patch | quote }}
6
- joboutdir = {{job.outdir | quote}}
2
+ infile = {{in.infile | r}}
3
+ outprefix = {{out.outfile | prefix | replace: ".fancyvj.wt", "" | r}}
4
+ vdjtools = {{ envs.vdjtools | r }}
5
+ vdjtools_patch = {{ envs.vdjtools_patch | r }}
6
+ joboutdir = {{job.outdir | r}}
7
7
 
8
8
  command = sprintf(
9
9
  "cd %s && bash %s %s PlotFancyVJUsage --plot-type png %s %s",
@@ -0,0 +1,142 @@
1
+ library(immunarch)
2
+
3
+ vis.immunr_gini <- function(.data, .by = NA, .meta = NA,
4
+ .errorbars = c(0.025, 0.975), .errorbars.off = FALSE,
5
+ .points = TRUE, .test = TRUE, .signif.label.size = 3.5,
6
+ .legend = NA, .plot.type = "bar", ...) {
7
+ # repDiversity(..., .method = "gini") generates a matrix
8
+ .data = data.frame(Sample = rownames(.data), Value = .data[, 1])
9
+ if (.plot.type == "bar") {
10
+ vis_bar(
11
+ .data = .data, .by = .by, .meta = .meta,
12
+ .errorbars = .errorbars, .errorbars.off = .errorbars.off, .stack = FALSE,
13
+ .points = .points, .test = .test, .signif.label.size = .signif.label.size,
14
+ .defgroupby = "Sample", .grouping.var = "Group",
15
+ .labs = c(NA, "Gini coefficient"),
16
+ .title = "Gini coefficient", .subtitle = "Sample diversity estimation using the Gini coefficient",
17
+ .legend = .legend, .leg.title = NA
18
+ )
19
+ } else {
20
+ vis_box(
21
+ .data = .data, .by = .by, .meta = .meta, .test = .test,
22
+ .points = .points, .signif.label.size = .signif.label.size,
23
+ .defgroupby = "Sample", .grouping.var = "Group",
24
+ .labs = c(NA, "Gini coefficient"),
25
+ .title = "Gini coefficient", .subtitle = "Sample diversity estimation using the Gini coefficient",
26
+ .legend = .legend, .leg.title = NA, .melt = FALSE
27
+ )
28
+ }
29
+ }
30
+
31
+ vis.immunr_div <- function(.data, .by = NA, .meta = NA,
32
+ .errorbars = c(0.025, 0.975), .errorbars.off = FALSE,
33
+ .points = TRUE, .test = TRUE, .signif.label.size = 3.5,
34
+ .legend = NA, .plot.type = "bar", ...) {
35
+ # "bar" delegates to immunarch's own vis.immunr_div; any other .plot.type draws a box plot
36
+ if (.plot.type == "bar") {
37
+ immunarch:::vis.immunr_div(.data = .data,.by = .by, .meta = .meta,
38
+ .errorbars = .errorbars, .errorbars.off = .errorbars.off, .stack = FALSE,
39
+ .points = .points, .test = .test, .signif.label.size = .signif.label.size,
40
+ .legend = .legend)
41
+ } else {
42
+ vis_box(
43
+ .data = .data, .by = .by, .meta = .meta, .test = .test,
44
+ .points = .points, .signif.label.size = .signif.label.size,
45
+ .defgroupby = "Sample", .grouping.var = "Group",
46
+ .labs = c(NA, "Effective number of clonoypes"),
47
+ .title = "True diversity", .subtitle = "Sample diversity estimation using the true diversity index",
48
+ .legend = NA, .leg.title = NA, .melt = FALSE
49
+ )
50
+ }
51
+ }
52
+
53
+ vis.immunr_chao1 <- function(.data, .by = NA, .meta = NA,
54
+ .errorbars = c(0.025, 0.975), .errorbars.off = FALSE,
55
+ .points = TRUE, .test = TRUE, .signif.label.size = 3.5,
56
+ .legend = NA, .plot.type = "bar", ...) {
57
+ # "bar" delegates to immunarch's own vis.immunr_chao1; any other .plot.type draws a box plot
58
+ if (.plot.type == "bar") {
59
+ immunarch:::vis.immunr_chao1(.data = .data,.by = .by, .meta = .meta,
60
+ .errorbars = .errorbars, .errorbars.off = .errorbars.off, .stack = FALSE,
61
+ .points = .points, .test = .test, .signif.label.size = .signif.label.size,
62
+ .legend = .legend)
63
+ } else {
64
+ .data <- data.frame(Sample = row.names(.data), Value = .data[, 1])
65
+ vis_box(
66
+ .data = .data, .by = .by, .meta = .meta, .test = .test,
67
+ .points = .points, .signif.label.size = .signif.label.size,
68
+ .defgroupby = "Sample", .grouping.var = "Group",
69
+ .labs = c(NA, "Chao1"),
70
+ .title = "Chao1", .subtitle = "Sample diversity estimation using Chao1",
71
+ .legend = NA, .leg.title = NA, .melt = FALSE
72
+ )
73
+ }
74
+ }
75
+
76
+ vis.immunr_ginisimp <- function(.data, .by = NA, .meta = NA,
77
+ .errorbars = c(0.025, 0.975), .errorbars.off = FALSE,
78
+ .points = TRUE, .test = TRUE, .signif.label.size = 3.5,
79
+ .legend = NA, .plot.type = "bar", ...) {
80
+ # "bar" delegates to immunarch's own vis.immunr_ginisimp; any other .plot.type draws a box plot
81
+ if (.plot.type == "bar") {
82
+ immunarch:::vis.immunr_ginisimp(.data = .data,.by = .by, .meta = .meta,
83
+ .errorbars = .errorbars, .errorbars.off = .errorbars.off, .stack = FALSE,
84
+ .points = .points, .test = .test, .signif.label.size = .signif.label.size,
85
+ .legend = .legend)
86
+ } else {
87
+ vis_box(
88
+ .data = .data, .by = .by, .meta = .meta, .test = .test,
89
+ .points = .points, .signif.label.size = .signif.label.size,
90
+ .defgroupby = "Sample", .grouping.var = "Group",
91
+ .labs = c(NA, "Gini-Simpson index"),
92
+ .title = "Gini-Simpson index", .subtitle = "Sample diversity estimation using the Gini-Simpson index",
93
+ .legend = .legend, .leg.title = NA, .melt = FALSE
94
+ )
95
+ }
96
+ }
97
+
98
+ vis.immunr_invsimp <- function(.data, .by = NA, .meta = NA,
99
+ .errorbars = c(0.025, 0.975), .errorbars.off = FALSE,
100
+ .points = TRUE, .test = TRUE, .signif.label.size = 3.5,
101
+ .legend = NA, .plot.type = "bar", ...) {
102
+ # "bar" delegates to immunarch's own vis.immunr_invsimp; any other .plot.type draws a box plot
103
+ if (.plot.type == "bar") {
104
+ immunarch:::vis.immunr_invsimp(.data = .data,.by = .by, .meta = .meta,
105
+ .errorbars = .errorbars, .errorbars.off = .errorbars.off, .stack = FALSE,
106
+ .points = .points, .test = .test, .signif.label.size = .signif.label.size,
107
+ .legend = .legend)
108
+ } else {
109
+ vis_box(
110
+ .data = .data, .by = .by, .meta = .meta, .test = .test,
111
+ .points = .points, .signif.label.size = .signif.label.size,
112
+ .defgroupby = "Sample", .grouping.var = "Group",
113
+ .labs = c(NA, "Inverse Simpson index"),
114
+ .title = "Inverse Simpson index", .subtitle = "Sample diversity estimation using the inverse Simpson index",
115
+ .legend = .legend, .leg.title = NA, .melt = FALSE
116
+ )
117
+ }
118
+ }
119
+
120
+ vis.immunr_dxx <- function(.data, .by = NA, .meta = NA,
121
+ .errorbars = c(0.025, 0.975), .errorbars.off = FALSE,
122
+ .points = TRUE, .test = TRUE, .signif.label.size = 3.5,
123
+ .legend = NA, .plot.type = "bar", ...) {
124
+ # "bar" delegates to immunarch's own vis.immunr_dxx; any other .plot.type draws a box plot
125
+ if (.plot.type == "bar") {
126
+ immunarch:::vis.immunr_dxx(.data = .data,.by = .by, .meta = .meta,
127
+ .errorbars = .errorbars, .errorbars.off = .errorbars.off, .stack = FALSE,
128
+ .points = .points, .test = .test, .signif.label.size = .signif.label.size,
129
+ .legend = .legend)
130
+ } else {
131
+ perc_value <- round(.data[1, 2][1])
132
+ .data <- data.frame(Sample = row.names(.data), Value = .data[, 1])
133
+ vis_box(
134
+ .data = .data, .by = .by, .meta = .meta, .test = .test,
135
+ .points = .points, .signif.label.size = .signif.label.size,
136
+ .defgroupby = "Sample", .grouping.var = "Group",
137
+ .labs = c(NA, paste0("D", perc_value)),
138
+ .title = paste0("D", perc_value, " diversity index"), .subtitle = paste0("Number of clonotypes occupying the ", perc_value, "% of repertoires"),
139
+ .legend = .legend, .leg.title = NA, .melt = FALSE
140
+ )
141
+ }
142
+ }
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env bash
2
2
 
3
3
  # run the command and capture the stdout
4
- out=$(command $@)
4
+ out=$(command "$@")
5
5
 
6
6
  echo "$out"
7
7
 
@@ -0,0 +1,91 @@
1
+ from os import path
2
+ from contextlib import suppress
3
+ from pathlib import PosixPath # noqa: F401
4
+
5
+ from biopipen.utils.reference import tabix_index
6
+ from biopipen.utils.misc import logger
7
+ from biopipen.scripts.vcf.bcftools_utils import run_bcftools
8
+
9
+ infile: str = {{in.infile | quote}} # pyright: ignore # noqa: E999
10
+ annfile: str = {{in.annfile | quote}} # pyright: ignore
11
+ outfile: str = {{out.outfile | quote}} # pyright: ignore
12
+ joboutdir: str = {{job.outdir | quote}} # pyright: ignore
13
+ envs: dict = {{envs | dict | repr}} # pyright: ignore
14
+
15
+ bcftools = envs.pop("bcftools")
16
+ tabix = envs.pop("tabix")
17
+ ncores = envs.pop("ncores")
18
+ columns = envs.pop("columns")
19
+ remove = envs.pop("remove")
20
+ header = envs.pop("header")
21
+ gz = envs.pop("gz")
22
+ index = envs.pop("index")
23
+
24
+ if isinstance(columns, list):
25
+ columns = ",".join(columns)
26
+
27
+ if "c" in envs:
28
+ logger.warning(r"Ignoring envs\[c], use envs\[columns] instead.")
29
+ del envs["c"]
30
+
31
+ if isinstance(remove, list):
32
+ remove = ",".join(remove)
33
+
34
+ if "x" in envs:
35
+ logger.warning(r"Ignoring envs\[x], use envs\[remove] instead.")
36
+ del envs["x"]
37
+
38
+ envs_has_annfile = "a" in envs or "annotations" in envs
39
+ headerfile = path.join(joboutdir, "header.txt")
40
+ if header:
41
+ with open(headerfile, "w") as fh:
42
+ fh.writelines(header)
43
+
44
+ if annfile and envs_has_annfile:
45
+ logger.warning(
46
+ r"Ignoring envs\[a/annotations] because in.annfile is provided."
47
+ )
48
+ with suppress(KeyError):
49
+ del envs["a"]
50
+ with suppress(KeyError):
51
+ del envs["annotations"]
52
+ elif not annfile and envs_has_annfile:
53
+ annfile = envs.pop("annotations", None) or envs.pop("a", None)
54
+
55
+
56
+ if index and not gz:
57
+ logger.warning("Forcing envs.gz to True because envs.index is True.")
58
+ gz = True
59
+
60
+ envs[""] = [bcftools, "annotate"]
61
+ envs["o"] = outfile
62
+ envs["threads"] = ncores
63
+
64
+ if "O" not in envs and "output-type" not in envs and "output_type" not in envs:
65
+ envs["O"] = "z" if gz else "v"
66
+
67
+ if columns:
68
+ envs["columns"] = columns
69
+ if not annfile:
70
+ raise ValueError(
71
+ "envs.columns specified but no in.annfile/envs.annfile provided."
72
+ )
73
+ envs["_"] = tabix_index(infile, "vcf", tabix=tabix)
74
+
75
+ if remove:
76
+ envs["remove"] = remove
77
+ # no need to index it
78
+ envs["_"] = infile
79
+
80
+ if "columns" not in envs and "remove" not in envs:
81
+ logger.warning(
82
+ "No columns/remove specified, no columns will be carried over or removed."
83
+ )
84
+
85
+ if annfile:
86
+ envs["annotations"] = tabix_index(annfile, "vcf", tabix=tabix)
87
+
88
+ if header:
89
+ envs["header_lines"] = headerfile
90
+
91
+ run_bcftools(envs, bcftools=bcftools, index=index, tabix=tabix)
@@ -0,0 +1,90 @@
1
+ from pathlib import Path, PosixPath # noqa: F401
2
+
3
+ from biopipen.utils.misc import logger
4
+ from biopipen.scripts.vcf.bcftools_utils import run_bcftools
5
+
6
+ infile: str | Path = {{in.infile | quote}} # pyright: ignore # noqa: #999
7
+ outfile: str = {{out.outfile | quote}} # pyright: ignore
8
+ outdir = Path(outfile).parent
9
+
10
+ envs: dict = {{envs | dict | repr}} # pyright: ignore
11
+ bcftools = envs.pop("bcftools")
12
+ tabix = envs.pop("tabix")
13
+ keep = envs.pop("keep")
14
+ ncores = envs.pop("ncores")
15
+ includes = envs.pop("includes")
16
+ excludes = envs.pop("excludes")
17
+ gz = envs.pop("gz")
18
+ index = envs.pop("index")
19
+
20
+ # a.vcf.gz -> a
21
+ # a.vcf -> a
22
+ stem = Path(infile).stem
23
+ if stem.endswith(".vcf"):
24
+ stem = stem[:-4]
25
+ # .vcf.gz
26
+ # .gz
27
+ ext = ".vcf.gz" if index or gz else '.vcf'
28
+
29
+
30
def normalize_expr(expr, flag, prev_n_filters=0):
    """Turn a filter specification into a ``name -> (expression, flag)`` map.

    Args:
        expr: Nothing (any falsy value), a single expression, a list of
            expressions, or a dict mapping filter names to expressions.
        flag: The option the expressions belong to (e.g. ``"include"``);
            stored with every expression and used, upper-cased, in
            generated names.
        prev_n_filters: Number of filters already named, so that generated
            names continue the numbering instead of restarting at 1.

    Returns:
        A dict in input order. Dict input keeps its own names; other input
        gets names of the form ``FILTER_<FLAG>_<n>``.
    """
    if not expr:
        return {}

    if isinstance(expr, dict):
        return {name: (ex, flag) for name, ex in expr.items()}

    # A lone expression is handled as a one-element list
    expressions = expr if isinstance(expr, list) else [expr]
    prefix = f"FILTER_{flag.upper()}_"
    return {
        f"{prefix}{number}": (ex, flag)
        for number, ex in enumerate(expressions, start=prev_n_filters + 1)
    }
43
+
44
+
45
def handle_filter(vcf, fname, filt, flag, final):
    """Run one ``bcftools filter`` pass for a single named expression.

    Args:
        vcf: The VCF file to filter in this pass.
        fname: Name of the filter; used in the per-filter file name and,
            when ``keep`` is set, passed as the ``s`` option.
        filt: The filter expression.
        flag: The option key the expression is passed under.
        final: Whether this is the last pass. The last pass writes to
            ``outfile`` (and is the only one that may be indexed); earlier
            passes write to a per-filter file in ``outdir``.

    Returns:
        The path this pass wrote to, to be used as input of the next pass.
    """
    logger.info("- Handling filter %s: %s ...", fname, filt)

    # Per-filter path: the real output of a non-final pass, or the location
    # of a symlink to `outfile` for the final pass.
    flagfile = outdir / f"{stem}.{fname}{ext}"

    arguments = envs.copy()
    arguments[flag] = filt
    arguments["_"] = vcf
    arguments["o"] = outfile if final else flagfile
    if keep:
        arguments["s"] = fname

    run_bcftools(arguments, bcftools=bcftools, index=index and final, tabix=tabix)

    # `outdir` is the parent of `outfile`, so equal names mean flagfile IS
    # the output itself: nothing to link, and it must not be removed.
    if final and flagfile.name != Path(outfile).name:
        # Clear whatever an earlier run left at this path (a symlink, possibly
        # dangling, or a regular intermediate file) before linking; otherwise
        # symlink_to() raises FileExistsError.
        if flagfile.is_symlink() or flagfile.exists():
            flagfile.unlink()
        flagfile.symlink_to(outfile)

    return arguments["o"]
64
+
65
+
66
# Gather every filter into one ordered mapping: includes first, then
# excludes, with generated names numbered continuously across both.
includes = normalize_expr(includes, "include")
excludes = normalize_expr(excludes, "exclude", len(includes))
for filter_name, filter_spec in excludes.items():
    includes[filter_name] = filter_spec

if index and not gz:
    logger.warning("Forcing envs.gz to True because envs.index is True.")
    gz = True

# Arguments shared by all passes; handle_filter() overrides the input,
# the output and the expression for each individual pass.
envs.update(
    {
        "": [bcftools, "filter"],
        "_": infile,
        "o": outfile,
        "threads": ncores,
    }
)

# Only choose an output type when none of its spellings was given
if not any(opt in envs for opt in ("O", "output-type", "output_type")):
    envs["O"] = "z" if gz else "v"

if keep:
    envs["soft_filter"] = "+"

if not ("m" in envs or "mode" in envs):
    envs["m"] = "+"

# bcftools applies a single expression per invocation, so the filters are
# chained: each pass takes the output of the previous one as its input.
last_position = len(includes) - 1
for position, (fname, (filt, flag)) in enumerate(includes.items()):
    infile = handle_filter(infile, fname, filt, flag, position == last_position)
@@ -0,0 +1,31 @@
1
+ from biopipen.utils.reference import tabix_index
2
+ from biopipen.utils.misc import logger
3
+ from biopipen.scripts.vcf.bcftools_utils import run_bcftools
4
+
5
+ infiles: list = {{in.infiles | each: as_path}} # pyright: ignore # noqa: E999
6
+ outfile = {{out.outfile | repr}} # pyright: ignore
7
+ joboutdir = {{job.outdir | repr}} # pyright: ignore
8
+ envs: dict = {{envs | dict | repr}} # pyright: ignore
9
+
10
+ bcftools = envs.pop("bcftools")
11
+ tabix = envs.pop("tabix")
12
+ ncores = envs.pop("ncores")
13
+ gz = envs.pop("gz")
14
+ index = envs.pop("index")
15
+
16
+ envs.setdefault("force-single", True)
17
+ envs.setdefault("missing-to-ref", True)
18
+
19
+ if index and not gz:
20
+ logger.warning("Forcing envs.gz to True because envs.index is True.")
21
+ gz = True
22
+
23
+ if "O" not in envs and "output-type" not in envs and "output_type" not in envs:
24
+ envs["O"] = "z" if gz else "v"
25
+
26
+ envs[""] = [bcftools, "merge"]
27
+ envs["o"] = outfile
28
+ envs["threads"] = ncores
29
+ envs["_"] = infiles
30
+
31
+ run_bcftools(envs, bcftools=bcftools, index=index, tabix=tabix)