biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290) hide show
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +28 -0
  3. biopipen/core/filters.py +79 -4
  4. biopipen/core/proc.py +12 -3
  5. biopipen/core/testing.py +75 -3
  6. biopipen/ns/bam.py +148 -6
  7. biopipen/ns/bed.py +75 -0
  8. biopipen/ns/cellranger.py +186 -0
  9. biopipen/ns/cellranger_pipeline.py +126 -0
  10. biopipen/ns/cnv.py +19 -3
  11. biopipen/ns/cnvkit.py +1 -1
  12. biopipen/ns/cnvkit_pipeline.py +20 -12
  13. biopipen/ns/delim.py +34 -35
  14. biopipen/ns/gene.py +68 -23
  15. biopipen/ns/gsea.py +63 -37
  16. biopipen/ns/misc.py +39 -14
  17. biopipen/ns/plot.py +304 -1
  18. biopipen/ns/protein.py +183 -0
  19. biopipen/ns/regulatory.py +290 -0
  20. biopipen/ns/rnaseq.py +142 -5
  21. biopipen/ns/scrna.py +2053 -473
  22. biopipen/ns/scrna_metabolic_landscape.py +228 -382
  23. biopipen/ns/snp.py +659 -0
  24. biopipen/ns/stats.py +484 -0
  25. biopipen/ns/tcr.py +683 -98
  26. biopipen/ns/vcf.py +236 -2
  27. biopipen/ns/web.py +97 -6
  28. biopipen/reports/bam/CNVpytor.svelte +4 -9
  29. biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
  30. biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
  31. biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
  32. biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
  33. biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
  34. biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
  35. biopipen/reports/common.svelte +15 -0
  36. biopipen/reports/protein/ProdigySummary.svelte +16 -0
  37. biopipen/reports/scrna/CellsDistribution.svelte +4 -39
  38. biopipen/reports/scrna/DimPlots.svelte +1 -1
  39. biopipen/reports/scrna/MarkersFinder.svelte +6 -126
  40. biopipen/reports/scrna/MetaMarkers.svelte +3 -75
  41. biopipen/reports/scrna/RadarPlots.svelte +4 -20
  42. biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
  43. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
  44. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
  45. biopipen/reports/snp/PlinkCallRate.svelte +24 -0
  46. biopipen/reports/snp/PlinkFreq.svelte +18 -0
  47. biopipen/reports/snp/PlinkHWE.svelte +18 -0
  48. biopipen/reports/snp/PlinkHet.svelte +18 -0
  49. biopipen/reports/snp/PlinkIBD.svelte +18 -0
  50. biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
  51. biopipen/reports/tcr/ClonalStats.svelte +16 -0
  52. biopipen/reports/tcr/CloneResidency.svelte +3 -93
  53. biopipen/reports/tcr/Immunarch.svelte +4 -155
  54. biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
  55. biopipen/reports/tcr/TESSA.svelte +11 -28
  56. biopipen/reports/utils/misc.liq +22 -7
  57. biopipen/scripts/bam/BamMerge.py +11 -15
  58. biopipen/scripts/bam/BamSampling.py +90 -0
  59. biopipen/scripts/bam/BamSort.py +141 -0
  60. biopipen/scripts/bam/BamSplitChroms.py +10 -10
  61. biopipen/scripts/bam/BamSubsetByBed.py +38 -0
  62. biopipen/scripts/bam/CNAClinic.R +41 -5
  63. biopipen/scripts/bam/CNVpytor.py +153 -54
  64. biopipen/scripts/bam/ControlFREEC.py +13 -14
  65. biopipen/scripts/bam/SamtoolsView.py +33 -0
  66. biopipen/scripts/bed/Bed2Vcf.py +5 -5
  67. biopipen/scripts/bed/BedConsensus.py +5 -5
  68. biopipen/scripts/bed/BedLiftOver.sh +6 -4
  69. biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
  70. biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
  71. biopipen/scripts/bed/BedtoolsMerge.py +4 -4
  72. biopipen/scripts/cellranger/CellRangerCount.py +138 -0
  73. biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
  74. biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
  75. biopipen/scripts/cnv/AneuploidyScore.R +55 -20
  76. biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
  77. biopipen/scripts/cnv/TMADScore.R +25 -9
  78. biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
  79. biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
  80. biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
  81. biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
  82. biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
  83. biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
  84. biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
  85. biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
  86. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
  87. biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
  88. biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
  89. biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
  90. biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
  91. biopipen/scripts/cnvkit/guess_baits.py +166 -93
  92. biopipen/scripts/delim/RowsBinder.R +1 -1
  93. biopipen/scripts/delim/SampleInfo.R +116 -118
  94. biopipen/scripts/gene/GeneNameConversion.R +67 -0
  95. biopipen/scripts/gene/GenePromoters.R +61 -0
  96. biopipen/scripts/gsea/Enrichr.R +5 -5
  97. biopipen/scripts/gsea/FGSEA.R +184 -50
  98. biopipen/scripts/gsea/GSEA.R +2 -2
  99. biopipen/scripts/gsea/PreRank.R +5 -5
  100. biopipen/scripts/misc/Config2File.py +2 -2
  101. biopipen/scripts/misc/Plot.R +80 -0
  102. biopipen/scripts/misc/Shell.sh +15 -0
  103. biopipen/scripts/misc/Str2File.py +2 -2
  104. biopipen/scripts/plot/Heatmap.R +3 -3
  105. biopipen/scripts/plot/Manhattan.R +147 -0
  106. biopipen/scripts/plot/QQPlot.R +146 -0
  107. biopipen/scripts/plot/ROC.R +88 -0
  108. biopipen/scripts/plot/Scatter.R +112 -0
  109. biopipen/scripts/plot/VennDiagram.R +5 -9
  110. biopipen/scripts/protein/MMCIF2PDB.py +33 -0
  111. biopipen/scripts/protein/PDB2Fasta.py +60 -0
  112. biopipen/scripts/protein/Prodigy.py +119 -0
  113. biopipen/scripts/protein/ProdigySummary.R +140 -0
  114. biopipen/scripts/protein/RMSD.py +178 -0
  115. biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
  116. biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
  117. biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
  118. biopipen/scripts/regulatory/MotifScan.py +159 -0
  119. biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
  120. biopipen/scripts/regulatory/motifs-common.R +324 -0
  121. biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
  122. biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
  123. biopipen/scripts/rnaseq/Simulation.R +21 -0
  124. biopipen/scripts/rnaseq/UnitConversion.R +325 -54
  125. biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
  126. biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
  127. biopipen/scripts/scrna/CellCellCommunication.py +150 -0
  128. biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
  129. biopipen/scripts/scrna/CellSNPLite.py +30 -0
  130. biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
  131. biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
  132. biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
  133. biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
  134. biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
  135. biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
  136. biopipen/scripts/scrna/CellsDistribution.R +456 -167
  137. biopipen/scripts/scrna/DimPlots.R +1 -1
  138. biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
  139. biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
  140. biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
  141. biopipen/scripts/scrna/ExprImputation.R +7 -0
  142. biopipen/scripts/scrna/LoomTo10X.R +51 -0
  143. biopipen/scripts/scrna/MQuad.py +25 -0
  144. biopipen/scripts/scrna/MarkersFinder.R +679 -400
  145. biopipen/scripts/scrna/MetaMarkers.R +265 -161
  146. biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
  147. biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
  148. biopipen/scripts/scrna/RadarPlots.R +355 -134
  149. biopipen/scripts/scrna/ScFGSEA.R +298 -100
  150. biopipen/scripts/scrna/ScSimulation.R +65 -0
  151. biopipen/scripts/scrna/ScVelo.py +617 -0
  152. biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
  153. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
  154. biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
  155. biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
  156. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
  157. biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
  158. biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
  159. biopipen/scripts/scrna/SeuratClustering.R +36 -233
  160. biopipen/scripts/scrna/SeuratLoading.R +2 -2
  161. biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
  162. biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
  163. biopipen/scripts/scrna/SeuratPreparing.R +223 -173
  164. biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
  165. biopipen/scripts/scrna/SeuratTo10X.R +27 -0
  166. biopipen/scripts/scrna/Slingshot.R +65 -0
  167. biopipen/scripts/scrna/Subset10X.R +2 -2
  168. biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
  169. biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
  170. biopipen/scripts/scrna/scvelo_paga.py +313 -0
  171. biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
  172. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
  173. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
  174. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
  175. biopipen/scripts/snp/MatrixEQTL.R +217 -0
  176. biopipen/scripts/snp/Plink2GTMat.py +148 -0
  177. biopipen/scripts/snp/PlinkCallRate.R +199 -0
  178. biopipen/scripts/snp/PlinkFilter.py +100 -0
  179. biopipen/scripts/snp/PlinkFreq.R +291 -0
  180. biopipen/scripts/snp/PlinkFromVcf.py +81 -0
  181. biopipen/scripts/snp/PlinkHWE.R +85 -0
  182. biopipen/scripts/snp/PlinkHet.R +96 -0
  183. biopipen/scripts/snp/PlinkIBD.R +196 -0
  184. biopipen/scripts/snp/PlinkSimulation.py +124 -0
  185. biopipen/scripts/snp/PlinkUpdateName.py +124 -0
  186. biopipen/scripts/stats/ChowTest.R +146 -0
  187. biopipen/scripts/stats/DiffCoexpr.R +152 -0
  188. biopipen/scripts/stats/LiquidAssoc.R +135 -0
  189. biopipen/scripts/stats/Mediation.R +108 -0
  190. biopipen/scripts/stats/MetaPvalue.R +130 -0
  191. biopipen/scripts/stats/MetaPvalue1.R +74 -0
  192. biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
  193. biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
  194. biopipen/scripts/tcr/Attach2Seurat.R +3 -2
  195. biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
  196. biopipen/scripts/tcr/CDR3Clustering.R +343 -0
  197. biopipen/scripts/tcr/ClonalStats.R +526 -0
  198. biopipen/scripts/tcr/CloneResidency.R +255 -131
  199. biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
  200. biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
  201. biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
  202. biopipen/scripts/tcr/GIANA/query.py +164 -162
  203. biopipen/scripts/tcr/Immunarch-basic.R +31 -9
  204. biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
  205. biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
  206. biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
  207. biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
  208. biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
  209. biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
  210. biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
  211. biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
  212. biopipen/scripts/tcr/Immunarch.R +63 -11
  213. biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
  214. biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
  215. biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
  216. biopipen/scripts/tcr/SampleDiversity.R +1 -1
  217. biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
  218. biopipen/scripts/tcr/ScRepLoading.R +166 -0
  219. biopipen/scripts/tcr/TCRClusterStats.R +176 -22
  220. biopipen/scripts/tcr/TCRDock.py +110 -0
  221. biopipen/scripts/tcr/TESSA.R +102 -118
  222. biopipen/scripts/tcr/VJUsage.R +5 -5
  223. biopipen/scripts/tcr/immunarch-patched.R +142 -0
  224. biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
  225. biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
  226. biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
  227. biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
  228. biopipen/scripts/vcf/BcftoolsSort.py +113 -0
  229. biopipen/scripts/vcf/BcftoolsView.py +73 -0
  230. biopipen/scripts/vcf/TruvariBench.sh +14 -7
  231. biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
  232. biopipen/scripts/vcf/TruvariConsistency.R +1 -1
  233. biopipen/scripts/vcf/Vcf2Bed.py +2 -2
  234. biopipen/scripts/vcf/VcfAnno.py +11 -11
  235. biopipen/scripts/vcf/VcfDownSample.sh +22 -10
  236. biopipen/scripts/vcf/VcfFilter.py +5 -5
  237. biopipen/scripts/vcf/VcfFix.py +7 -7
  238. biopipen/scripts/vcf/VcfFix_utils.py +13 -4
  239. biopipen/scripts/vcf/VcfIndex.py +3 -3
  240. biopipen/scripts/vcf/VcfIntersect.py +3 -3
  241. biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
  242. biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
  243. biopipen/scripts/vcf/bcftools_utils.py +52 -0
  244. biopipen/scripts/web/Download.py +8 -4
  245. biopipen/scripts/web/DownloadList.py +5 -5
  246. biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
  247. biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
  248. biopipen/scripts/web/gcloud_common.py +49 -0
  249. biopipen/utils/gene.py +108 -60
  250. biopipen/utils/misc.py +146 -20
  251. biopipen/utils/reference.py +64 -20
  252. biopipen/utils/reporter.py +177 -0
  253. biopipen/utils/vcf.py +1 -1
  254. biopipen-0.34.26.dist-info/METADATA +27 -0
  255. biopipen-0.34.26.dist-info/RECORD +292 -0
  256. {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
  257. {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
  258. biopipen/ns/bcftools.py +0 -111
  259. biopipen/ns/scrna_basic.py +0 -255
  260. biopipen/reports/delim/SampleInfo.svelte +0 -36
  261. biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
  262. biopipen/reports/scrna/ScFGSEA.svelte +0 -35
  263. biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
  264. biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
  265. biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
  266. biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
  267. biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
  268. biopipen/reports/utils/gsea.liq +0 -110
  269. biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
  270. biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
  271. biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
  272. biopipen/scripts/gene/GeneNameConversion.py +0 -66
  273. biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
  274. biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
  275. biopipen/scripts/scrna/ExprImpution.R +0 -7
  276. biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
  277. biopipen/scripts/scrna/Write10X.R +0 -11
  278. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
  279. biopipen/scripts/tcr/TCRClustering.R +0 -280
  280. biopipen/utils/common_docstrs.py +0 -61
  281. biopipen/utils/gene.R +0 -49
  282. biopipen/utils/gsea.R +0 -193
  283. biopipen/utils/io.R +0 -20
  284. biopipen/utils/misc.R +0 -114
  285. biopipen/utils/mutate_helpers.R +0 -433
  286. biopipen/utils/plot.R +0 -173
  287. biopipen/utils/rnaseq.R +0 -48
  288. biopipen/utils/single_cell.R +0 -115
  289. biopipen-0.21.0.dist-info/METADATA +0 -22
  290. biopipen-0.21.0.dist-info/RECORD +0 -218
@@ -0,0 +1,186 @@
1
+ """Cellranger pipeline module for BioPipen"""
2
+ from ..core.proc import Proc
3
+ from ..core.config import config
4
+
5
+
6
+ class CellRangerCount(Proc):
7
+ """Run cellranger count
8
+
9
+ to count gene expression and/or feature barcode reads
10
+ requires cellranger v7+.
11
+
12
+ Input:
13
+ fastqs: The input fastq files
14
+ Either a list of fastq files or a directory containing fastq files
15
+ If a directory is provided, it should be passed as a list with one
16
+ element.
17
+ id: The id defining output directory. If not provided, it is inferred
18
+ from the fastq files.
19
+ Note that, unlike the `--id` argument of cellranger, this will not select
20
+ the samples from `in.fastqs`. Instead, it will symlink the fastq files
21
+ to a temporary directory with this `id` as prefix and pass that to
22
+ cellranger.
23
+
24
+ Output:
25
+ outdir: The output directory
26
+
27
+ Envs:
28
+ ncores: Number of cores to use
29
+ cellranger: Path to cellranger
30
+ ref: Path of folder containing 10x-compatible transcriptome reference
31
+ tmpdir: Path to temporary directory, used to save the soft-linked fastq files
32
+ to pass to cellranger
33
+ outdir_is_mounted (flag): A flag indicating whether the output directory is
34
+ on a mounted filesystem. As of `cellranger` v9.0.1, `cellranger vdj` will
35
+ fail when trying to copy/operate files to a mounted filesystem.
36
+ See <https://github.com/10XGenomics/cellranger/issues/210> and
37
+ <https://github.com/10XGenomics/cellranger/issues/250> for similar issues.
38
+ If that is the case, set this flag to `True` to use `envs.tmpdir` as
39
+ the output directory for `cellranger count`, and then move the results
40
+ to the final output directory after `cellranger count` finishes.
41
+ In this case, make sure that `envs.tmpdir` has enough space and
42
+ is on a local filesystem.
43
+ copy_outs_only (flag): If `outdir_is_mounted` is `True`, set this flag to `True`
44
+ to only copy the `outs` folder from the temporary output directory
45
+ to the final output directory, instead of the whole output directory.
46
+ include_introns (flag): Set to false to exclude intronic reads in count.
47
+ create_bam (flag): Enable or disable BAM file generation.
48
+ This is required by cellranger v8+. When using cellranger v8-, it will be
49
+ transformed to `--no-bam`.
50
+ <more>: Other environment variables required by `cellranger count`
51
+ See `cellranger count --help` for more details or
52
+ <https://www.10xgenomics.com/support/software/cell-ranger/advanced/cr-command-line-arguments#count>
53
+ """ # noqa: E501
54
+ input = "fastqs:files, id"
55
+ output = """outdir:dir:
56
+ {%- set fastqs = in.fastqs -%}
57
+ {%- if len(fastqs) == 1 and isdir(fastqs[0]) -%}
58
+ {%- set fastqs = fastqs[0] | glob: "*.fastq.gz" -%}
59
+ {%- endif -%}
60
+ {%- if in.id -%}
61
+ {{in.id}}
62
+ {%- else -%}
63
+ {%- set id = commonprefix(*fastqs) |
64
+ regex_replace: "_L\\d+(:?_.*)?$", "" |
65
+ regex_replace: "_S\\d+$", "" -%}
66
+ {{- id -}}
67
+ {%- endif -%}
68
+ """
69
+ lang = config.lang.python
70
+ envs = {
71
+ "ncores": config.misc.ncores,
72
+ "cellranger": config.exe.cellranger,
73
+ "ref": config.ref.ref_cellranger_gex,
74
+ "tmpdir": config.path.tmpdir,
75
+ "outdir_is_mounted": False,
76
+ "copy_outs_only": True,
77
+ "include_introns": True,
78
+ "create_bam": False,
79
+ }
80
+ script = "file://../scripts/cellranger/CellRangerCount.py"
81
+ plugin_opts = {
82
+ "report": "file://../reports/cellranger/CellRangerCount.svelte",
83
+ "report_paging": 5,
84
+ }
85
+
86
+
87
+ class CellRangerVdj(Proc):
88
+ """Run cellranger vdj
89
+
90
+ to perform sequence assembly and paired clonotype calling.
91
+ requires cellranger v7+.
92
+
93
+ Input:
94
+ fastqs: The input fastq files
95
+ Either a list of fastq files or a directory containing fastq files
96
+ If a directory is provided, it should be passed as a list with one
97
+ element.
98
+ id: The id determining the output directory. If not provided, it is inferred
99
+ from the fastq files.
100
+
101
+ Output:
102
+ outdir: The output directory
103
+
104
+ Envs:
105
+ ncores: Number of cores to use
106
+ cellranger: Path to cellranger
107
+ ref: Path of folder containing 10x-compatible transcriptome reference
108
+ tmpdir: Path to temporary directory, used to save the soft-linked fastq files
109
+ to pass to cellranger.
110
+ outdir_is_mounted (flag): A flag indicating whether the output directory is
111
+ on a mounted filesystem. As of `cellranger` v9.0.1, `cellranger vdj` will
112
+ fail when trying to copy the VDJ reference files to a mounted filesystem.
113
+ See <https://github.com/10XGenomics/cellranger/issues/210> and
114
+ <https://github.com/10XGenomics/cellranger/issues/250> for similar issues.
115
+ If that is the case, set this flag to `True` to use `envs.tmpdir` as
116
+ the output directory for `cellranger vdj`, and then move the results
117
+ to the final output directory after `cellranger vdj` finishes.
118
+ In this case, make sure that `envs.tmpdir` has enough space and
119
+ is on a local filesystem.
120
+ copy_outs_only (flag): If `outdir_is_mounted` is `True`, set this flag to `True`
121
+ to only copy the `outs` folder from the temporary output directory
122
+ to the final output directory, instead of the whole output directory.
123
+ <more>: Other environment variables required by `cellranger vdj`
124
+ See `cellranger vdj --help` for more details or
125
+ <https://www.10xgenomics.com/support/software/cell-ranger/advanced/cr-command-line-arguments#vdj>
126
+ """ # noqa: E501
127
+ input = "fastqs:files, id"
128
+ output = """outdir:dir:
129
+ {%- set fastqs = in.fastqs -%}
130
+ {%- if len(fastqs) == 1 and isdir(fastqs[0]) -%}
131
+ {%- set fastqs = fastqs[0] | glob: "*.fastq.gz" -%}
132
+ {%- endif -%}
133
+ {%- if in.id -%}
134
+ {{in.id}}
135
+ {%- else -%}
136
+ {%- set id = commonprefix(*fastqs) |
137
+ regex_replace: "_L\\d+(:?_.*)?$", "" |
138
+ regex_replace: "_S\\d+$", "" -%}
139
+ {{- id -}}
140
+ {%- endif -%}
141
+ """
142
+ lang = config.lang.python
143
+ envs = {
144
+ "ncores": config.misc.ncores,
145
+ "cellranger": config.exe.cellranger,
146
+ "ref": config.ref.ref_cellranger_vdj,
147
+ "outdir_is_mounted": False,
148
+ "copy_outs_only": True,
149
+ "tmpdir": config.path.tmpdir,
150
+ }
151
+ script = "file://../scripts/cellranger/CellRangerVdj.py"
152
+ plugin_opts = {
153
+ "report": "file://../reports/cellranger/CellRangerVdj.svelte",
154
+ "report_paging": 5,
155
+ }
156
+
157
+
158
+ class CellRangerSummary(Proc):
159
+ """Summarize cellranger metrics
160
+
161
+ Input:
162
+ indirs: The directories containing cellranger results
163
+ from `CellRangerCount`/`CellRangerVdj`.
164
+
165
+ Output:
166
+ outdir: The output directory
167
+
168
+ Envs:
169
+ group (type=auto): The group of the samples for boxplots.
170
+ If `None`, don't do boxplots.
171
+ It can be a dict of group names and sample names, e.g.
172
+ `{"group1": ["sample1", "sample2"], "group2": ["sample3"]}`
173
+ or a file containing the group information, with the first column
174
+ being the sample names and the second column being the group names.
175
+ The file should be tab-delimited with no header.
176
+ """
177
+ input = "indirs:dirs"
178
+ input_data = lambda ch: [list(ch.iloc[:, 0])]
179
+ output = "outdir:dir:{{in.indirs | first | stem | append: '-etc.summary'}}"
180
+ lang = config.lang.rscript
181
+ script = "file://../scripts/cellranger/CellRangerSummary.R"
182
+ envs = {"group": None}
183
+ plugin_opts = {
184
+ "report": "file://../reports/cellranger/CellRangerSummary.svelte",
185
+ "report_paging": 8,
186
+ }
@@ -0,0 +1,126 @@
1
+ """The cellranger pipelines
2
+
3
+ Primarily cellranger process plus summary for summarizing the metrics for
4
+ multiple samples.
5
+ """
6
+ from __future__ import annotations
7
+ from typing import TYPE_CHECKING
8
+
9
+ from diot import Diot
10
+ from pipen.utils import is_loading_pipeline
11
+ from pipen_args.procgroup import ProcGroup
12
+
13
+ if TYPE_CHECKING:
14
+ from pipen import Proc
15
+
16
+
17
+ class CellRangerCountPipeline(ProcGroup):
18
+ """The cellranger count pipeline
19
+
20
+ Run cellranger count for multiple samples and summarize the metrics.
21
+
22
+ Args:
23
+ input (list): The list of lists of fastq files,
24
+ or a list of comma-separated strings of fastq files.
25
+ ids (list): The list of ids for the samples.
26
+ """
27
+ DEFAULTS = Diot(input=None, ids=None)
28
+
29
+ def post_init(self):
30
+ """Check if the input is a list of fastq files"""
31
+ if not is_loading_pipeline("-h", "-h+", "--help", "--help+") and (
32
+ not isinstance(self.opts.input, (list, tuple))
33
+ or len(self.opts.input) == 0
34
+ ):
35
+ raise TypeError(
36
+ "The input of `CellRangerCountPipeline` should be a list of lists of "
37
+ "fastq files."
38
+ )
39
+
40
+ if isinstance(self.opts.input, (list, tuple)):
41
+ self.opts.input = [
42
+ [y.strip() for y in x.split(",")]
43
+ if isinstance(x, str)
44
+ else x
45
+ for x in self.opts.input
46
+ ]
47
+
48
+ @ProcGroup.add_proc
49
+ def p_cellranger_count(self) -> Proc:
50
+ """Build CellRangerCount process"""
51
+ from .cellranger import CellRangerCount as _CellRangerCount
52
+
53
+ class CellRangerCount(_CellRangerCount):
54
+ if self.opts.ids:
55
+ input_data = list(zip(self.opts.input, self.opts.ids))
56
+ else:
57
+ input_data = self.opts.input
58
+
59
+ return CellRangerCount
60
+
61
+ @ProcGroup.add_proc
62
+ def p_cellranger_count_summary(self) -> Proc:
63
+ """Build CellRangerCountSummary process"""
64
+ from .cellranger import CellRangerSummary
65
+
66
+ class CellRangerCountSummary(CellRangerSummary):
67
+ requires = self.p_cellranger_count
68
+ input_data = lambda ch: [list(ch.iloc[:, 0])]
69
+
70
+ return CellRangerCountSummary
71
+
72
+
73
+ class CellRangerVdjPipeline(ProcGroup):
74
+ """The cellranger vdj pipeline
75
+
76
+ Run cellranger vdj for multiple samples and summarize the metrics.
77
+
78
+ Args:
79
+ input (list): The list of lists of fastq files,
80
+ or a list of comma-separated strings of fastq files.
81
+ ids (list): The list of ids for the samples.
82
+ """
83
+ DEFAULTS = Diot(input=None, ids=None)
84
+
85
+ def post_init(self):
86
+ """Check if the input is a list of fastq files"""
87
+ if not is_loading_pipeline("-h", "-h+", "--help", "--help+") and (
88
+ not isinstance(self.opts.input, (list, tuple))
89
+ or len(self.opts.input) == 0
90
+ ):
91
+ raise TypeError(
92
+ "The input of `CellRangerVdjPipeline` should be a list of lists of "
93
+ "fastq files."
94
+ )
95
+
96
+ if isinstance(self.opts.input, (list, tuple)):
97
+ self.opts.input = [
98
+ [y.strip() for y in x.split(",")]
99
+ if isinstance(x, str)
100
+ else x
101
+ for x in self.opts.input
102
+ ]
103
+
104
+ @ProcGroup.add_proc
105
+ def p_cellranger_vdj(self) -> Proc:
106
+ """Build CellRangerVdj process"""
107
+ from .cellranger import CellRangerVdj as _CellRangerVdj
108
+
109
+ class CellRangerVdj(_CellRangerVdj):
110
+ if self.opts.ids:
111
+ input_data = list(zip(self.opts.input, self.opts.ids))
112
+ else:
113
+ input_data = self.opts.input
114
+
115
+ return CellRangerVdj
116
+
117
+ @ProcGroup.add_proc
118
+ def p_cellranger_vdj_summary(self) -> Proc:
119
+ """Build CellRangerVdjSummary process"""
120
+ from .cellranger import CellRangerSummary
121
+
122
+ class CellRangerVdjSummary(CellRangerSummary):
123
+ requires = self.p_cellranger_vdj
124
+ input_data = lambda ch: [list(ch.iloc[:, 0])]
125
+
126
+ return CellRangerVdjSummary
biopipen/ns/cnv.py CHANGED
@@ -12,7 +12,15 @@ class AneuploidyScore(Proc):
12
12
 
13
13
  Input:
14
14
  segfile: The seg file, generally including chrom, start, end and
15
- seg.mean (the log2 ratio)
15
+ seg.mean (the log2 ratio).
16
+ It is typically a tab-delimited file or a BED file.
17
+ If so, envs.chrom_col, envs.start_col, envs.end_col and envs.seg_col
18
+ are the 1st, 2nd, 3rd and 5th columns, respectively.
19
+ It can also be a VCF file. If so, envs.chrom_col and envs.start_col
20
+ are not required.
21
+ `end_col` and `envs.seg_col` will be a field in the INFO column.
22
+ [`VariantAnnotation`](https://rdrr.io/bioc/VariantAnnotation/)
23
+ is required to extract the INFO field.
16
24
 
17
25
  Output:
18
26
  outdir: The output directory containing the CAAs, AS and a histogram
@@ -122,7 +130,15 @@ class TMADScore(Proc):
122
130
  Input:
123
131
  segfile: The seg file, two columns are required:
124
132
  * chrom: The chromosome name, used for filtering
125
- * seg.mean: The log2 ratio
133
+ * seg.mean: The log2 ratio.
134
+ It is typically a tab-delimited file or a BED file.
135
+ If so, envs.chrom_col and envs.seg_col
136
+ are the 1st and 5th columns, respectively.
137
+ It can also be a VCF file. If so, envs.chrom_col and envs.start_col
138
+ are not required.
139
+ `end_col` and `envs.seg_col` will be a field in the INFO column.
140
+ [`VariantAnnotation`](https://rdrr.io/bioc/VariantAnnotation/)
141
+ is required to extract the INFO field.
126
142
 
127
143
  Output:
128
144
  outfile: The output file containing the TMAD score
@@ -134,7 +150,7 @@ class TMADScore(Proc):
134
150
  excl_chroms (list): The chromosomes to be excluded
135
151
  """
136
152
  input = "segfile:file"
137
- output = "outfile:file:{{in.segfile | stem0}}.tmad.txt"
153
+ output = "outfile:file:{{in.segfile | stem}}.tmad.txt"
138
154
  lang = config.lang.rscript
139
155
  envs = {
140
156
  "chrom_col": "chrom",
biopipen/ns/cnvkit.py CHANGED
@@ -482,7 +482,7 @@ class CNVkitDiagram(Proc):
482
482
  }
483
483
  script = "file://../scripts/cnvkit/CNVkitDiagram.py"
484
484
  plugin_opts = {
485
- "report": "file://../reports/cnvkit/CNVkitScatter.svelte",
485
+ "report": "file://../reports/cnvkit/CNVkitDiagram.svelte",
486
486
  "report_paging": 10,
487
487
  }
488
488
 
@@ -276,7 +276,10 @@ class CNVkitPipeline(ProcGroup):
276
276
  """Build CNVkitGuessBaits process"""
277
277
  from .cnvkit import CNVkitGuessBaits
278
278
 
279
- if not self.opts.guessbaits and not is_loading_pipeline():
279
+ if (
280
+ not self.opts.guessbaits and
281
+ not is_loading_pipeline("-h", "-h+", "--help", "--help+")
282
+ ):
280
283
  return None
281
284
 
282
285
  def _guess_baits_bams(ch):
@@ -487,7 +490,8 @@ class CNVkitPipeline(ProcGroup):
487
490
  target_file = None
488
491
  antitarget_file = None
489
492
  if self.col.sex in metadf:
490
- sample_sex = ",".join(metadf[self.col.sex][control_masks])
493
+ all_sex = metadf[self.col.sex][control_masks].unique()
494
+ sample_sex = [None] if len(all_sex) > 1 else all_sex[0]
491
495
  else:
492
496
  sample_sex = [None]
493
497
  else:
@@ -774,13 +778,15 @@ class CNVkitPipeline(ProcGroup):
774
778
  else:
775
779
  tumor_masks = metadf[self.col.group] == self.opts.case
776
780
 
781
+ if self.col.sex in metadf:
782
+ all_sex = metadf[self.col.sex][tumor_masks].unique()
783
+ sample_sex = [None] if len(all_sex) > 1 else all_sex[0]
784
+ else:
785
+ sample_sex = [None]
786
+
777
787
  return tibble(
778
788
  segfiles=[ch2.outfile.tolist()],
779
- sample_sex=(
780
- ",".join(metadf[self.col.sex][tumor_masks])
781
- if self.col.sex in metadf
782
- else [None]
783
- ),
789
+ sample_sex=sample_sex,
784
790
  )
785
791
 
786
792
  @annotate.format_doc(indent=3)
@@ -823,13 +829,15 @@ class CNVkitPipeline(ProcGroup):
823
829
  else:
824
830
  tumor_masks = metadf[self.col.group] == self.opts.case
825
831
 
832
+ if self.col.sex in metadf:
833
+ all_sex = metadf[self.col.sex][tumor_masks].unique()
834
+ sample_sex = [None] if len(all_sex) > 1 else all_sex[0]
835
+ else:
836
+ sample_sex = [None]
837
+
826
838
  return tibble(
827
839
  segfiles=[ch2.outfile.tolist()],
828
- sample_sex=(
829
- ",".join(metadf[self.col.sex][tumor_masks])
830
- if self.col.sex in metadf
831
- else [None]
832
- ),
840
+ sample_sex=sample_sex,
833
841
  )
834
842
 
835
843
  @annotate.format_doc(indent=3)
biopipen/ns/delim.py CHANGED
@@ -51,6 +51,10 @@ class SampleInfo(Proc):
51
51
  Output:
52
52
  outfile: The output file with sample information, with mutated columns
53
53
  if `envs.save_mutated` is True.
54
+ The basename of the output file will be the same as the input file.
55
+ The file name of each plot will be slugified from the case name.
56
+ Each plot has 3 formats: pdf, png and code.zip, which contains the
57
+ data and R code to reproduce the plot.
54
58
 
55
59
  Envs:
56
60
  sep: The separator of the input file.
@@ -76,37 +80,34 @@ class SampleInfo(Proc):
76
80
  If `FALSE`, you can mutate the meta data frame with the
77
81
  returned ids. Non-paired ids will be `NA`.
78
82
  save_mutated (flag): Whether to save the mutated columns.
79
- exclude_cols: The columns to exclude in the table in the report.
83
+ exclude_cols (auto): The columns to exclude in the table in the report.
80
84
  Could be a list or a string separated by comma.
81
85
  defaults (ns): The default parameters for `envs.stats`.
82
- - on: The column name in the data for the stats.
83
- Default is `Sample`. The column could be either continuous or not.
84
- - distinct: The column name in the data for the distinct records.
85
- For example, you may have multiple `Sample`s for each patient.
86
- In this case, you can set `distinct` to `Patient` to get the
87
- stats for each patient, instead of each sample with duplicated
88
- values. Default is `None`, which means all records are distinct.
89
- Note that when `distinct` is provided, your `group` and `each` should
90
- be the same for each distinct record. For example, it doesn't make
91
- sense if you are doing statistics for each patient (`on = "Sample"`),
92
- but your `group` is `SampleSource`, defining the source of each
93
- sample.
94
- - group: The column name in the data for the group ids.
95
- If not provided, all records will be regarded as one group.
96
- - na_group (flag): Whether to include `NA`s in the group.
97
- - each: The column in the data to split the analysis in different
98
- plots.
99
- - ncol (type=int): The number of columns in the plot when `each`
100
- is not `NULL`. Default is 2.
101
- - na_each (flag): Whether to include `NA`s in the `each` column.
102
- - plot: Type of plot. If `on` is continuous, it could be
103
- `boxplot` (default), `violin`, `violin+boxplot` or `histogram`.
104
- If `on` is not continuous, it could be `barplot` or
105
- `pie` (default).
86
+ - plot_type: The type of the plot.
87
+ See the supported plot types here:
88
+ <https://pwwang.github.io/plotthis/reference/index.html>
89
+ The plot_type should be lower case and the plot function used in
90
+ `plotthis` should be used. The mapping from plot_type to the
91
+ plot function is like `bar -> BarPlot`, `box -> BoxPlot`, etc.
92
+ - more_formats (list): The additional formats to save the plot.
93
+ By default, the plot will be saved in png, which is also used to
94
+ display in the report. You can add more formats to save the plot.
95
+ For example, `more_formats = ["pdf", "svg"]`.
96
+ - save_code (flag): Whether to save the R code to reproduce the plot.
97
+ The data used to plot will also be saved.
98
+ - subset: An expression to subset the data frame before plotting.
99
+ The expression should be a string of R expression that will be passed
100
+ to `dplyr::filter`. For example, `subset = "Sample == 'A'"`.
101
+ - section: The section name in the report.
102
+ In case you want to group the plots in the report.
106
103
  - devpars (ns): The device parameters for the plot.
107
104
  - width (type=int): The width of the plot.
108
105
  - height (type=int): The height of the plot.
109
106
  - res (type=int): The resolution of the plot.
107
+ - descr: The description of the plot, shown in the report.
108
+ - <more>: You can add more parameters to the defaults.
109
+ These parameters will be expanded to the `envs.stats` for each case,
110
+ and passed to individual plot functions.
110
111
  stats (type=json): The statistics to perform.
111
112
  The keys are the case names and the values are the parameters
112
113
  inherited from `envs.defaults`.
@@ -119,18 +120,16 @@ class SampleInfo(Proc):
119
120
  "save_mutated": False,
120
121
  "exclude_cols": None,
121
122
  "defaults": {
122
- "on": "Sample",
123
- "distinct": None,
124
- "group": None,
125
- "na_group": False,
126
- "each": None,
127
- "ncol": 2,
128
- "na_each": False,
129
- "plot": None,
130
- "devpars": {"width": 800, "height": 600, "res": 100},
123
+ "plot_type": "bar",
124
+ "more_formats": [],
125
+ "save_code": False,
126
+ "subset": None,
127
+ "section": None,
128
+ "descr": None,
129
+ "devpars": {"width": None, "height": None, "res": 100},
131
130
  },
132
131
  "stats": {},
133
132
  }
134
133
  lang = config.lang.rscript
135
134
  script = "file://../scripts/delim/SampleInfo.R"
136
- plugin_opts = {"report": "file://../reports/delim/SampleInfo.svelte"}
135
+ plugin_opts = {"report": "file://../reports/common.svelte"}
biopipen/ns/gene.py CHANGED
@@ -9,46 +9,91 @@ class GeneNameConversion(Proc):
9
9
 
10
10
  Input:
11
11
  infile: The input file with original gene names
12
+ It should be a tab-separated file with header
12
13
 
13
14
  Output:
14
15
  outfile: The output file with converted gene names
15
16
 
16
17
  Envs:
17
- inopts: Options to read `in.infile` for `pandas.read_csv()`
18
- See https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
19
- outopts: Options to write `out.outfile` for `pandas.to_csv()`
20
- See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
21
- notfound: What to do if a conversion cannot be done.
22
- use-query: Ignore the conversion and use the original name
23
- skip: Ignore the conversion and skip the entire row in input file
24
- error: Report error
25
- genecol: The index (0-based) or name of the column where
26
- genes are present
27
- output: How to output
28
- keep: Keep the original name column and add new converted columns
29
- drop: Drop the original name column, and add the converted names
30
- replace: Drop the original name column, and insert
31
- the converted names at the original position
32
- only: Only keep the query and the converted name columns
18
+ notfound (choice): What to do if a conversion cannot be done.
19
+ - use-query: Ignore the conversion and use the original name
20
+ - skip: Ignore the conversion and skip the entire row in input file
21
+ - ignore: Same as skip
22
+ - error: Report error
23
+ - na: Use NA
24
+ dup (choice): What to do if a conversion results in multiple names.
25
+ - first: Use the first name, sorted by matching score in descending order (default)
26
+ - last: Use the last name, sorted by matching score in descending order
27
+ - combine: Combine all names using `;` as separator
28
+ genecol: The index (1-based) or name of the column where genes are present
29
+ output (choice): How to output.
30
+ - append: Add the converted names as new columns at the end using `envs.outfmt`
31
+ as the column name.
32
+ - replace: Drop the original name column, and insert
33
+ the converted names at the original position.
34
+ - converted: Only keep the converted names.
35
+ - with-query: Output 2 columns with original and converted names.
33
36
  infmt: What's the original gene name format
34
37
  Available fields
35
38
  https://docs.mygene.info/en/latest/doc/query_service.html#available-fields
36
- outfmt: What's the target gene name format
39
+ outfmt: What's the target gene name format. Currently only a single format
40
+ is supported.
37
41
  species: Limit gene query to certain species.
38
42
  Supported: human, mouse, rat, fruitfly, nematode, zebrafish,
39
43
  thale-cress, frog and pig
40
44
  """ # noqa: E501
41
45
  input = "infile:file"
42
46
  output = "outfile:file:{{in.infile | basename}}"
43
- lang = config.lang.python
47
+ lang = config.lang.rscript
44
48
  envs = {
45
- "inopts": {"sep": "\t", "index_col": False},
46
- "outopts": {"sep": "\t", "index": False},
47
49
  "notfound": "error",
48
- "genecol": 0,
49
- "output": "keep",
50
+ "genecol": 1,
51
+ "dup": "first",
52
+ "output": "append",
50
53
  "infmt": ["symbol", "alias"],
51
54
  "outfmt": "symbol",
52
55
  "species": "human",
53
56
  }
54
- script = "file://../scripts/gene/GeneNameConversion.py"
57
+ script = "file://../scripts/gene/GeneNameConversion.R"
58
+
59
+
60
+ class GenePromoters(Proc):
61
+ """Get gene promoter regions by specifying the flanking regions of TSS
62
+
63
+ Input:
64
+ infile: The input file with gene ids/names
65
+
66
+ Output:
67
+ outfile: The output file with promoter regions in BED format
68
+
69
+ Envs:
70
+ up (type=int): The upstream distance from TSS
71
+ down (type=int): The downstream distance from TSS
72
+ If not specified, the default is `envs.up`
73
+ notfound (choice): What to do if a gene is not found.
74
+ - skip: Skip the gene
75
+ - error: Report error
76
+ refgene: The reference gene annotation file in GTF format
77
+ header (flag): Whether the input file has a header
78
+ genecol (type=int): The index (1-based) of the gene column
79
+ match_id (flag): Whether to match the genes in `in.infile` by `gene_id`
80
+ instead of `gene_name` in `envs.refgene`
81
+ sort (flag): Sort the output by chromosome and start position
82
+ chrsize: The chromosome size file, from which the chromosome order is
83
+ used to sort the output
84
+ """
85
+ input = "infile:file"
86
+ output = "outfile:file:{{in.infile | stem}}-promoters.bed"
87
+ lang = config.lang.rscript
88
+ envs = {
89
+ "up": 2000,
90
+ "down": None,
91
+ "notfound": "error",
92
+ "refgene": config.ref.refgene,
93
+ "header": True,
94
+ "genecol": 1,
95
+ "match_id": False,
96
+ "sort": False,
97
+ "chrsize": config.ref.chrsize,
98
+ }
99
+ script = "file://../scripts/gene/GenePromoters.R"