biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290) hide show
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +28 -0
  3. biopipen/core/filters.py +79 -4
  4. biopipen/core/proc.py +12 -3
  5. biopipen/core/testing.py +75 -3
  6. biopipen/ns/bam.py +148 -6
  7. biopipen/ns/bed.py +75 -0
  8. biopipen/ns/cellranger.py +186 -0
  9. biopipen/ns/cellranger_pipeline.py +126 -0
  10. biopipen/ns/cnv.py +19 -3
  11. biopipen/ns/cnvkit.py +1 -1
  12. biopipen/ns/cnvkit_pipeline.py +20 -12
  13. biopipen/ns/delim.py +34 -35
  14. biopipen/ns/gene.py +68 -23
  15. biopipen/ns/gsea.py +63 -37
  16. biopipen/ns/misc.py +39 -14
  17. biopipen/ns/plot.py +304 -1
  18. biopipen/ns/protein.py +183 -0
  19. biopipen/ns/regulatory.py +290 -0
  20. biopipen/ns/rnaseq.py +142 -5
  21. biopipen/ns/scrna.py +2053 -473
  22. biopipen/ns/scrna_metabolic_landscape.py +228 -382
  23. biopipen/ns/snp.py +659 -0
  24. biopipen/ns/stats.py +484 -0
  25. biopipen/ns/tcr.py +683 -98
  26. biopipen/ns/vcf.py +236 -2
  27. biopipen/ns/web.py +97 -6
  28. biopipen/reports/bam/CNVpytor.svelte +4 -9
  29. biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
  30. biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
  31. biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
  32. biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
  33. biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
  34. biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
  35. biopipen/reports/common.svelte +15 -0
  36. biopipen/reports/protein/ProdigySummary.svelte +16 -0
  37. biopipen/reports/scrna/CellsDistribution.svelte +4 -39
  38. biopipen/reports/scrna/DimPlots.svelte +1 -1
  39. biopipen/reports/scrna/MarkersFinder.svelte +6 -126
  40. biopipen/reports/scrna/MetaMarkers.svelte +3 -75
  41. biopipen/reports/scrna/RadarPlots.svelte +4 -20
  42. biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
  43. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
  44. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
  45. biopipen/reports/snp/PlinkCallRate.svelte +24 -0
  46. biopipen/reports/snp/PlinkFreq.svelte +18 -0
  47. biopipen/reports/snp/PlinkHWE.svelte +18 -0
  48. biopipen/reports/snp/PlinkHet.svelte +18 -0
  49. biopipen/reports/snp/PlinkIBD.svelte +18 -0
  50. biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
  51. biopipen/reports/tcr/ClonalStats.svelte +16 -0
  52. biopipen/reports/tcr/CloneResidency.svelte +3 -93
  53. biopipen/reports/tcr/Immunarch.svelte +4 -155
  54. biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
  55. biopipen/reports/tcr/TESSA.svelte +11 -28
  56. biopipen/reports/utils/misc.liq +22 -7
  57. biopipen/scripts/bam/BamMerge.py +11 -15
  58. biopipen/scripts/bam/BamSampling.py +90 -0
  59. biopipen/scripts/bam/BamSort.py +141 -0
  60. biopipen/scripts/bam/BamSplitChroms.py +10 -10
  61. biopipen/scripts/bam/BamSubsetByBed.py +38 -0
  62. biopipen/scripts/bam/CNAClinic.R +41 -5
  63. biopipen/scripts/bam/CNVpytor.py +153 -54
  64. biopipen/scripts/bam/ControlFREEC.py +13 -14
  65. biopipen/scripts/bam/SamtoolsView.py +33 -0
  66. biopipen/scripts/bed/Bed2Vcf.py +5 -5
  67. biopipen/scripts/bed/BedConsensus.py +5 -5
  68. biopipen/scripts/bed/BedLiftOver.sh +6 -4
  69. biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
  70. biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
  71. biopipen/scripts/bed/BedtoolsMerge.py +4 -4
  72. biopipen/scripts/cellranger/CellRangerCount.py +138 -0
  73. biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
  74. biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
  75. biopipen/scripts/cnv/AneuploidyScore.R +55 -20
  76. biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
  77. biopipen/scripts/cnv/TMADScore.R +25 -9
  78. biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
  79. biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
  80. biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
  81. biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
  82. biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
  83. biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
  84. biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
  85. biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
  86. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
  87. biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
  88. biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
  89. biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
  90. biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
  91. biopipen/scripts/cnvkit/guess_baits.py +166 -93
  92. biopipen/scripts/delim/RowsBinder.R +1 -1
  93. biopipen/scripts/delim/SampleInfo.R +116 -118
  94. biopipen/scripts/gene/GeneNameConversion.R +67 -0
  95. biopipen/scripts/gene/GenePromoters.R +61 -0
  96. biopipen/scripts/gsea/Enrichr.R +5 -5
  97. biopipen/scripts/gsea/FGSEA.R +184 -50
  98. biopipen/scripts/gsea/GSEA.R +2 -2
  99. biopipen/scripts/gsea/PreRank.R +5 -5
  100. biopipen/scripts/misc/Config2File.py +2 -2
  101. biopipen/scripts/misc/Plot.R +80 -0
  102. biopipen/scripts/misc/Shell.sh +15 -0
  103. biopipen/scripts/misc/Str2File.py +2 -2
  104. biopipen/scripts/plot/Heatmap.R +3 -3
  105. biopipen/scripts/plot/Manhattan.R +147 -0
  106. biopipen/scripts/plot/QQPlot.R +146 -0
  107. biopipen/scripts/plot/ROC.R +88 -0
  108. biopipen/scripts/plot/Scatter.R +112 -0
  109. biopipen/scripts/plot/VennDiagram.R +5 -9
  110. biopipen/scripts/protein/MMCIF2PDB.py +33 -0
  111. biopipen/scripts/protein/PDB2Fasta.py +60 -0
  112. biopipen/scripts/protein/Prodigy.py +119 -0
  113. biopipen/scripts/protein/ProdigySummary.R +140 -0
  114. biopipen/scripts/protein/RMSD.py +178 -0
  115. biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
  116. biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
  117. biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
  118. biopipen/scripts/regulatory/MotifScan.py +159 -0
  119. biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
  120. biopipen/scripts/regulatory/motifs-common.R +324 -0
  121. biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
  122. biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
  123. biopipen/scripts/rnaseq/Simulation.R +21 -0
  124. biopipen/scripts/rnaseq/UnitConversion.R +325 -54
  125. biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
  126. biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
  127. biopipen/scripts/scrna/CellCellCommunication.py +150 -0
  128. biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
  129. biopipen/scripts/scrna/CellSNPLite.py +30 -0
  130. biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
  131. biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
  132. biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
  133. biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
  134. biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
  135. biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
  136. biopipen/scripts/scrna/CellsDistribution.R +456 -167
  137. biopipen/scripts/scrna/DimPlots.R +1 -1
  138. biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
  139. biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
  140. biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
  141. biopipen/scripts/scrna/ExprImputation.R +7 -0
  142. biopipen/scripts/scrna/LoomTo10X.R +51 -0
  143. biopipen/scripts/scrna/MQuad.py +25 -0
  144. biopipen/scripts/scrna/MarkersFinder.R +679 -400
  145. biopipen/scripts/scrna/MetaMarkers.R +265 -161
  146. biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
  147. biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
  148. biopipen/scripts/scrna/RadarPlots.R +355 -134
  149. biopipen/scripts/scrna/ScFGSEA.R +298 -100
  150. biopipen/scripts/scrna/ScSimulation.R +65 -0
  151. biopipen/scripts/scrna/ScVelo.py +617 -0
  152. biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
  153. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
  154. biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
  155. biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
  156. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
  157. biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
  158. biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
  159. biopipen/scripts/scrna/SeuratClustering.R +36 -233
  160. biopipen/scripts/scrna/SeuratLoading.R +2 -2
  161. biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
  162. biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
  163. biopipen/scripts/scrna/SeuratPreparing.R +223 -173
  164. biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
  165. biopipen/scripts/scrna/SeuratTo10X.R +27 -0
  166. biopipen/scripts/scrna/Slingshot.R +65 -0
  167. biopipen/scripts/scrna/Subset10X.R +2 -2
  168. biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
  169. biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
  170. biopipen/scripts/scrna/scvelo_paga.py +313 -0
  171. biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
  172. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
  173. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
  174. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
  175. biopipen/scripts/snp/MatrixEQTL.R +217 -0
  176. biopipen/scripts/snp/Plink2GTMat.py +148 -0
  177. biopipen/scripts/snp/PlinkCallRate.R +199 -0
  178. biopipen/scripts/snp/PlinkFilter.py +100 -0
  179. biopipen/scripts/snp/PlinkFreq.R +291 -0
  180. biopipen/scripts/snp/PlinkFromVcf.py +81 -0
  181. biopipen/scripts/snp/PlinkHWE.R +85 -0
  182. biopipen/scripts/snp/PlinkHet.R +96 -0
  183. biopipen/scripts/snp/PlinkIBD.R +196 -0
  184. biopipen/scripts/snp/PlinkSimulation.py +124 -0
  185. biopipen/scripts/snp/PlinkUpdateName.py +124 -0
  186. biopipen/scripts/stats/ChowTest.R +146 -0
  187. biopipen/scripts/stats/DiffCoexpr.R +152 -0
  188. biopipen/scripts/stats/LiquidAssoc.R +135 -0
  189. biopipen/scripts/stats/Mediation.R +108 -0
  190. biopipen/scripts/stats/MetaPvalue.R +130 -0
  191. biopipen/scripts/stats/MetaPvalue1.R +74 -0
  192. biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
  193. biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
  194. biopipen/scripts/tcr/Attach2Seurat.R +3 -2
  195. biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
  196. biopipen/scripts/tcr/CDR3Clustering.R +343 -0
  197. biopipen/scripts/tcr/ClonalStats.R +526 -0
  198. biopipen/scripts/tcr/CloneResidency.R +255 -131
  199. biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
  200. biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
  201. biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
  202. biopipen/scripts/tcr/GIANA/query.py +164 -162
  203. biopipen/scripts/tcr/Immunarch-basic.R +31 -9
  204. biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
  205. biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
  206. biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
  207. biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
  208. biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
  209. biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
  210. biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
  211. biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
  212. biopipen/scripts/tcr/Immunarch.R +63 -11
  213. biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
  214. biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
  215. biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
  216. biopipen/scripts/tcr/SampleDiversity.R +1 -1
  217. biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
  218. biopipen/scripts/tcr/ScRepLoading.R +166 -0
  219. biopipen/scripts/tcr/TCRClusterStats.R +176 -22
  220. biopipen/scripts/tcr/TCRDock.py +110 -0
  221. biopipen/scripts/tcr/TESSA.R +102 -118
  222. biopipen/scripts/tcr/VJUsage.R +5 -5
  223. biopipen/scripts/tcr/immunarch-patched.R +142 -0
  224. biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
  225. biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
  226. biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
  227. biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
  228. biopipen/scripts/vcf/BcftoolsSort.py +113 -0
  229. biopipen/scripts/vcf/BcftoolsView.py +73 -0
  230. biopipen/scripts/vcf/TruvariBench.sh +14 -7
  231. biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
  232. biopipen/scripts/vcf/TruvariConsistency.R +1 -1
  233. biopipen/scripts/vcf/Vcf2Bed.py +2 -2
  234. biopipen/scripts/vcf/VcfAnno.py +11 -11
  235. biopipen/scripts/vcf/VcfDownSample.sh +22 -10
  236. biopipen/scripts/vcf/VcfFilter.py +5 -5
  237. biopipen/scripts/vcf/VcfFix.py +7 -7
  238. biopipen/scripts/vcf/VcfFix_utils.py +13 -4
  239. biopipen/scripts/vcf/VcfIndex.py +3 -3
  240. biopipen/scripts/vcf/VcfIntersect.py +3 -3
  241. biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
  242. biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
  243. biopipen/scripts/vcf/bcftools_utils.py +52 -0
  244. biopipen/scripts/web/Download.py +8 -4
  245. biopipen/scripts/web/DownloadList.py +5 -5
  246. biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
  247. biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
  248. biopipen/scripts/web/gcloud_common.py +49 -0
  249. biopipen/utils/gene.py +108 -60
  250. biopipen/utils/misc.py +146 -20
  251. biopipen/utils/reference.py +64 -20
  252. biopipen/utils/reporter.py +177 -0
  253. biopipen/utils/vcf.py +1 -1
  254. biopipen-0.34.26.dist-info/METADATA +27 -0
  255. biopipen-0.34.26.dist-info/RECORD +292 -0
  256. {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
  257. {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
  258. biopipen/ns/bcftools.py +0 -111
  259. biopipen/ns/scrna_basic.py +0 -255
  260. biopipen/reports/delim/SampleInfo.svelte +0 -36
  261. biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
  262. biopipen/reports/scrna/ScFGSEA.svelte +0 -35
  263. biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
  264. biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
  265. biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
  266. biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
  267. biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
  268. biopipen/reports/utils/gsea.liq +0 -110
  269. biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
  270. biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
  271. biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
  272. biopipen/scripts/gene/GeneNameConversion.py +0 -66
  273. biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
  274. biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
  275. biopipen/scripts/scrna/ExprImpution.R +0 -7
  276. biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
  277. biopipen/scripts/scrna/Write10X.R +0 -11
  278. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
  279. biopipen/scripts/tcr/TCRClustering.R +0 -280
  280. biopipen/utils/common_docstrs.py +0 -61
  281. biopipen/utils/gene.R +0 -49
  282. biopipen/utils/gsea.R +0 -193
  283. biopipen/utils/io.R +0 -20
  284. biopipen/utils/misc.R +0 -114
  285. biopipen/utils/mutate_helpers.R +0 -433
  286. biopipen/utils/plot.R +0 -173
  287. biopipen/utils/rnaseq.R +0 -48
  288. biopipen/utils/single_cell.R +0 -115
  289. biopipen-0.21.0.dist-info/METADATA +0 -22
  290. biopipen-0.21.0.dist-info/RECORD +0 -218
biopipen/ns/protein.py ADDED
@@ -0,0 +1,183 @@
1
+ """Protein-related processes."""
2
+ from ..core.proc import Proc
3
+ from ..core.config import config
4
+
5
+
6
+ class Prodigy(Proc):
7
+ """Prediction of binding affinity of protein-protein complexes based on
8
+ intermolecular contacts using Prodigy.
9
+
10
+ See <https://rascar.science.uu.nl/prodigy/> and
11
+ <https://github.com/haddocking/prodigy>.
12
+
13
+ `prodigy-prot` must be installed under the given python of `proc.lang`.
14
+
15
+ Input:
16
+ infile: The structure file in PDB or mmCIF format.
17
+
18
+ Output:
19
+ outfile: The output file generated by Prodigy.
20
+ outdir: The output directory containing all output files.
21
+
22
+ Envs:
23
+ distance_cutoff (type=float): The distance cutoff to calculate intermolecular
24
+ contacts.
25
+ acc_threshold (type=float): The accessibility threshold for BSA analysis.
26
+ temperature (type=float): The temperature (C) for Kd prediction.
27
+ contact_list (flag): Whether to generate contact list.
28
+ pymol_selection (flag): Whether to output a script to highlight the interface
29
+ residues in PyMOL.
30
+ selection (list): The selection of the chains to analyze.
31
+ `['A', 'B']` will analyze chains A and B.
32
+ `['A,B', 'C']` will analyze chain A and C; and B and C.
33
+ `['A', 'B', 'C']` will analyze all combinations of A, B, and C.
34
+ outtype (choice): Set the format of the output file (`out.outfile`).
35
+ All three files will be generated. This option only determines which
36
+ is assigned to `out.outfile`.
37
+ - raw: The raw output file from prodigy.
38
+ - json: The output file in JSON format.
39
+ - tsv: The output file in TSV format.
40
+ """
41
+ input = "infile:file"
42
+ output = [
43
+ "outfile:file:{{in.infile | stem}}_prodigy/"
44
+ "{{in.infile | stem}}.{{envs.outtype if envs.outtype != 'raw' else 'out'}}",
45
+ "outdir:dir:{{in.infile | stem}}_prodigy",
46
+ ]
47
+ lang = config.lang.python
48
+ envs = {
49
+ "distance_cutoff": 5.5,
50
+ "acc_threshold": 0.05,
51
+ "temperature": 25.0,
52
+ "contact_list": True,
53
+ "pymol_selection": True,
54
+ "selection": None,
55
+ "outtype": "json",
56
+ }
57
+ script = "file://../scripts/protein/Prodigy.py"
58
+
59
+
60
+ class ProdigySummary(Proc):
61
+ """Summary of the output from `Prodigy`.
62
+
63
+ Input:
64
+ infiles: The output json file generated by `Prodigy`.
65
+
66
+ Output:
67
+ outdir: The directory of summary files generated by `ProdigySummary`.
68
+
69
+ Envs:
70
+ group (type=auto): The group of the samples for boxplots.
71
+ If `None`, don't do boxplots.
72
+ It can be a dict of group names and sample names, e.g.
73
+ `{"group1": ["sample1", "sample2"], "group2": ["sample3"]}`
74
+ or a file containing the group information, with the first column
75
+ being the sample names and the second column being the group names.
76
+ The file should be tab-delimited with no header.
77
+ """
78
+ input = "infiles:files"
79
+ input_data = lambda ch: [[f"{odir}/_prodigy.tsv" for odir in ch.outdir]]
80
+ output = "outdir:dir:prodigy_summary"
81
+ lang = config.lang.rscript
82
+ envs = {"group": None}
83
+ script = "file://../scripts/protein/ProdigySummary.R"
84
+ plugin_opts = {"report": "file://../reports/protein/ProdigySummary.svelte"}
85
+
86
+
87
+ class MMCIF2PDB(Proc):
88
+ """Convert mmCIF or PDBx file to PDB file.
89
+
90
+ Using MAXIT or [BeEM](https://github.com/kad-ecoli/BeEM), depending on `envs.tool`.
91
+
92
+ Input:
93
+ infile: The input mmCIF or PDBx file.
94
+
95
+ Output:
96
+ outfile: The output PDB file.
97
+ For BeEM, "outfmt" is set to 3 to always output a single PDB file.
98
+
99
+ Envs:
100
+ tool (choice): The tool to use for conversion.
101
+ - maxit: Use MAXIT.
102
+ - beem: Use BeEM.
103
+ maxit: The path to the MAXIT executable.
104
+ beem: The path to the BeEM executable.
105
+ <more>: Other options for MAXIT/BeEM.
106
+ For BeEM, "outfmt" will not be used as it is set to 3.
107
+ """
108
+ input = "infile:file"
109
+ output = "outfile:file:{{in.infile | stem}}.pdb"
110
+ lang = config.lang.python
111
+ envs = {
112
+ "tool": "maxit",
113
+ "maxit": config.exe.maxit,
114
+ "beem": config.exe.beem,
115
+ }
116
+ script = "file://../scripts/protein/MMCIF2PDB.py"
117
+
118
+
119
+ class RMSD(Proc):
120
+ """Calculate the RMSD between two structures.
121
+
122
+ See also https://github.com/charnley/rmsd.
123
+
124
+ If the input is in mmCIF format, convert it to PDB first.
125
+
126
+ Input:
127
+ infile1: The first structure file.
128
+ infile2: The second structure file.
129
+
130
+ Output:
131
+ outfile: The output file containing the RMSD value.
132
+
133
+ Envs:
134
+ beem: The path to the BeEM executable.
135
+ calculate_rmsd: The path to the calculate_rmsd executable.
136
+ conv_tool (choice): The tool to use for conversion.
137
+ - maxit: Use MAXIT.
138
+ - beem: Use BeEM.
139
+ ca_only (flag): Whether to calculate RMSD using only C-alpha atoms.
140
+ duel (choice): How to handle the duel atoms. Default is "keep".
141
+ - keep: Keep both atoms.
142
+ - keep_first: Keep the first atom.
143
+ - keep_last: Keep the last atom.
144
+ - average: Average the coordinates.
145
+ reorder (flag): Whether to reorder the atoms in the structures.
146
+ <more>: Other options for calculate_rmsd.
147
+ """
148
+ input = "infile1:file, infile2:file"
149
+ output = "outfile:file:{{in.infile1 | stem}}-{{in.infile2 | stem}}.rmsd.txt"
150
+ lang = config.lang.python
151
+ envs = {
152
+ "maxit": config.exe.maxit,
153
+ "beem": config.exe.beem,
154
+ "calculate_rmsd": config.exe.calculate_rmsd,
155
+ "conv_tool": "maxit",
156
+ "ca_only": False,
157
+ "duel": "keep",
158
+ "reorder": True,
159
+ }
160
+ script = "file://../scripts/protein/RMSD.py"
161
+
162
+
163
+ class PDB2Fasta(Proc):
164
+ """Convert PDB file to FASTA file.
165
+
166
+ Input:
167
+ infile: The input PDB file.
168
+
169
+ Output:
170
+ outfile: The output FASTA file.
171
+
172
+ Envs:
173
+ chains (auto): The chains to extract. Either a list of chain IDs or a string
174
+ of chain IDs separated by commas.
175
+ If None, extract all chains.
176
+ wrap (type=int): The number of residues per line in the output FASTA
177
+ file. Set to 0 to disable wrapping.
178
+ """
179
+ input = "infile:file"
180
+ output = "outfile:file:{{in.infile | stem}}.fasta"
181
+ lang = config.lang.python
182
+ envs = {"chains": None, "wrap": 80}
183
+ script = "file://../scripts/protein/PDB2Fasta.py"
@@ -0,0 +1,290 @@
1
+ """Provides processes for regulatory-related analyses"""
2
+
3
+ from ..core.proc import Proc
4
+ from ..core.config import config
5
+
6
+
7
+ class MotifScan(Proc):
8
+ """Scan the input sequences for binding sites using motifs.
9
+
10
+ Currently only [fimo](https://meme-suite.org/meme/tools/fimo) from MEME suite
11
+ is supported, based on the research/comparisons done by the following reference.
12
+
13
+ Reference:
14
+ - [Evaluating tools for transcription factor binding site prediction](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6889335/)
15
+
16
+ Input:
17
+ motiffile: File containing motif names.
18
+ The file contains the motif and regulator names.
19
+ The motif names should match the names in the motif database.
20
+ This file must have a header.
21
+ If multiple columns are present, it should be delimited by tab.
22
+ seqfile: File containing sequences in FASTA format.
23
+
24
+ Output:
25
+ outdir: Directory containing the results.
26
+ Especially `fimo_output.txt` extending from `fimo.tsv`, which contains:
27
+ 1. the results with the regulator information if `envs.regulator_col`
28
+ is provided, otherwise, the `regulator` columns will be filled with
29
+ the motif names.
30
+ 2. the original sequence from the fasta file (in.seqfile)
31
+ 3. corrected genomic coordinates if the genomic coordinates are included
32
+ in the sequence names.
33
+
34
+ See also the `Output` section of
35
+ <https://meme-suite.org/meme/doc/fimo.html>.
36
+ Note that `--no-pgc` is passed to fimo to not parse the genomic coordinates
37
+ from the sequence names by fimo. When fimo parses the genomic coordinates,
38
+ `DDX11L1` in `>DDX11L1::chr1:11869-14412` will be lost.
39
+ The purpose of this is to keep the sequence names as they are in the output.
40
+ If the sequence names are in the format of `>NAME::chr1:START-END`, we will
41
+ correct the coordinates in the output.
42
+ Also note that it requires meme/fimo v5.5.5+ to do this
43
+ (where the --no-pgc option is available).
44
+
45
+ Envs:
46
+ tool (choice): The tool to use for scanning.
47
+ Currently only fimo is supported.
48
+ - fimo: Use fimo from MEME suite.
49
+ fimo: The path to fimo binary.
50
+ motif_col: The column name in the motif file containing the motif names.
51
+ regulator_col: The column name in the motif file containing the regulator names.
52
+ Both `motif_col` and `regulator_col` should be the direct column names or
53
+ the index (1-based) of the columns.
54
+ If no `regulator_col` is provided, no regulator information is written in
55
+ the output.
56
+ notfound (choice): What to do if a motif is not found in the database.
57
+ - error: Report error and stop the process.
58
+ - ignore: Ignore the motif and continue.
59
+ motifdb: The path to the motif database. This is required.
60
+ It should be in the format of MEME motif database.
61
+ Databases can be downloaded here: <https://meme-suite.org/meme/doc/download.html>.
62
+ See also introduction to the databases: <https://meme-suite.org/meme/db/motifs>.
63
+ cutoff (type=float): The cutoff for p-value to write the results.
64
+ When `envs.q_cutoff` is set, this is applied to the q-value.
65
+ This is passed to `--thresh` in fimo.
66
+ q (flag): Calculate q-value.
67
+ When `False`, `--no-qvalue` is passed to fimo.
68
+ The q-value calculation is that of Benjamini and Hochberg (BH) (1995).
69
+ q_cutoff (flag): Apply `envs.cutoff` to q-value.
70
+ args (ns): Additional arguments to pass to the tool.
71
+ - <more>: Additional arguments for fimo.
72
+ See: <https://meme-suite.org/meme/doc/fimo.html>
73
+ """ # noqa: E501
74
+ input = "motiffile:file, seqfile:file"
75
+ output = "outdir:dir:{{in.motiffile | stem}}.fimo"
76
+ lang = config.lang.python
77
+ envs = {
78
+ "tool": "fimo",
79
+ "fimo": config.exe.fimo,
80
+ "motif_col": 1,
81
+ "regulator_col": None,
82
+ "notfound": "error",
83
+ "motifdb": config.tf_motifdb,
84
+ "cutoff": 1e-4,
85
+ "q": False,
86
+ "q_cutoff": False,
87
+ "args": {},
88
+ }
89
+ script = "file://../scripts/regulatory/MotifScan.py"
90
+
91
+
92
+ class MotifAffinityTest(Proc):
93
+ """Test the affinity of motifs to the sequences and the affinity change
94
+ due to the mutations.
95
+
96
+ See also <https://simon-coetzee.github.io/motifBreakR> and
97
+ <https://www.bioconductor.org/packages/release/bioc/vignettes/atSNP/inst/doc/atsnp-vignette.html>
98
+
99
+ When using atSNP, motifBreakR is also required to plot the variants and motifs.
100
+
101
+ Input:
102
+ motiffile: File containing motif names.
103
+ The file contains the motif and regulator names.
104
+ The motif names should match the names in the motif database.
105
+ This file must have a header.
106
+ If multiple columns are present, it should be delimited by tab.
107
+ varfile: File containing the variants.
108
+ It could be a VCF file or a BED-like file.
109
+ If it is a VCF file, it does not need to be indexed. Only records with `PASS` in the `FILTER` column are used.
110
+ If it is a BED-like file, it should contain the following columns, `chrom`, `start`, `end`, `name`, `score`, `strand`, `ref`, `alt`.
111
+
112
+ Output:
113
+ outdir: Directory containing the results.
114
+ For motifBreakR, `motifbreakr.txt` will be created. Records with effect `strong`/`weak` are written (`neutral` is not).
115
+ For atSNP, `atsnp.txt` will be created. Records with p-value (`envs.atsnp_args.p`) < `envs.cutoff` are written.
116
+
117
+ Envs:
118
+ ncores (type=int): The number of cores to use.
119
+ tool (choice): The tool to use for the test.
120
+ - motifbreakr: Use motifBreakR.
121
+ - motifBreakR: Use motifBreakR.
122
+ - atsnp: Use atSNP.
123
+ - atSNP: Use atSNP.
124
+ bcftools: The path to bcftools binary.
125
+ Used to convert the VCF file to the BED file when the input is a VCF file.
126
+ motif_col: The column name in the motif file containing the motif names.
127
+ If this is not provided, `envs.regulator_col` and `envs.regmotifs` are required,
128
+ which are used to infer the motif names from the regulator names.
129
+ regulator_col: The column name in the motif file containing the regulator names.
130
+ Both `motif_col` and `regulator_col` should be the direct column names or
131
+ the index (1-based) of the columns.
132
+ If no `regulator_col` is provided, no regulator information is written in
133
+ the output. Otherwise, the regulator information is written in the output in
134
+ the `Regulator` column.
135
+ var_col: The column names in the `in.motiffile` containing the variant information.
136
+ It has to be matching the names in the `in.varfile`. This is helpful when
137
+ we only need to test the pairs of variants and motifs in the `in.motiffile`.
138
+ notfound (choice): What to do if a motif is not found in the database,
139
+ or a regulator is not found in the regulator-motif mapping (envs.regmotifs)
140
+ file.
141
+ - error: Report error and stop the process.
142
+ - ignore: Ignore the motif and continue.
143
+ motifdb: The path to the motif database. This is required.
144
+ It should be in the format of MEME motif database.
145
+ Databases can be downloaded here: <https://meme-suite.org/meme/doc/download.html>.
146
+ See also introduction to the databases: <https://meme-suite.org/meme/db/motifs>.
147
+ [universalmotif](https://github.com/bjmt/universalmotif) is required to read the motif database.
148
+ genome: The genome assembly.
149
+ Used to fetch the sequences around the variants from the corresponding BSgenome package. For example, `BSgenome.Hsapiens.UCSC.hg19` is required if
150
+ the genome is `hg19`. If it is an organism other than human, please specify the full name of the package, for example, `BSgenome.Mmusculus.UCSC.mm10`.
151
+ cutoff (type=float): The cutoff for p-value to write the results.
152
+ devpars (ns): The default device parameters for the plot.
153
+ - width (type=int): The width of the plot.
154
+ - height (type=int): The height of the plot.
155
+ - res (type=int): The resolution of the plot.
156
+ plot_nvars (type=int): Number of variants to plot.
157
+ Plot top `<plot_nvars>` variants with the largest `abs(alleleDiff)` (motifBreakR) or smallest p-values (atSNP).
158
+ plots (type=json): Specify the details for the plots.
159
+ When specified, `plot_nvars` is ignored.
160
+ The keys are the variant names and the values are the details for the plots, including:
161
+ devpars: The device parameters for the plot to override the default (envs.devpars).
162
+ which: An expression passed to `subset(results, subset = ...)` to get the motifs for the variant to plot.
163
+ Or an integer to get the top `which` motifs.
164
+ For example, `effect == "strong"` to get the motifs with strong effect in motifBreakR result.
165
+ regmotifs: The path to the regulator-motif mapping file.
166
+ It must have header and the columns `Motif` or `Model` for motif names and
167
+ `TF`, `Regulator` or `Transcription factor` for regulator names.
168
+ motifbreakr_args (ns): Additional arguments to pass to motifBreakR.
169
+ - method (choice): The method to use.
170
+ See details of <https://rdrr.io/bioc/motifbreakR/man/motifbreakR.html>
171
+ and <https://simon-coetzee.github.io/motifBreakR/#methods>.
172
+ - default: Use the default method.
173
+ - log: Use the standard summation of log probabilities
174
+ - ic: Use information content
175
+ - notrans: Use the default method without transformation
176
+ atsnp_args (ns): Additional arguments to pass to atSNP.
177
+ - padj_cutoff (flag): The `envs.cutoff` will be applied to the adjusted p-value.
178
+ Only works for `atSNP`.
179
+ - padj (choice): The method to adjust the p-values.
180
+ Only works for `atSNP`
181
+ - holm: Holm's method
182
+ - hochberg: Hochberg's method
183
+ - hommel: Hommel's method
184
+ - bonferroni: Bonferroni method
185
+ - BH: Benjamini & Hochberg's method
186
+ - BY: Benjamini & Yekutieli's method
187
+ - fdr: False discovery rate
188
+ - none: No adjustment
189
+ - p (choice): Which p-value to use for adjustment and cutoff.
190
+ - pval_ref: p-value for the reference allele affinity score.
191
+ - pval_snp: p-value for the SNP allele affinity score.
192
+ - pval_cond_ref: and
193
+ - pval_cond_snp: conditional p-values for the affinity scores of the reference and SNP alleles.
194
+ - pval_diff: p-value for the affinity score change between the two alleles.
195
+ - pval_rank: p-value for the rank test between the two alleles.
196
+ """ # noqa: E501
197
+ input = "motiffile:file, varfile:file"
198
+ output = "outdir:dir:{{in.motiffile | stem}}.{{envs.tool | lower}}"
199
+ lang = config.lang.rscript
200
+ envs = {
201
+ "ncores": config.misc.ncores,
202
+ "tool": "atsnp",
203
+ "bcftools": config.exe.bcftools,
204
+ "motif_col": None,
205
+ "regulator_col": None,
206
+ "var_col": None,
207
+ "notfound": "error",
208
+ "motifdb": config.ref.tf_motifdb,
209
+ "regmotifs": config.ref.tf_motifs,
210
+ "genome": config.ref.genome,
211
+ "cutoff": 0.05,
212
+ "devpars": {"width": None, "height": None, "res": 100},
213
+ "plot_nvars": 10,
214
+ "plots": {},
215
+ "motifbreakr_args": {"method": "default"},
216
+ "atsnp_args": {"padj_cutoff": True, "padj": "BH", "p": "pval_diff"},
217
+ }
218
+ script = "file://../scripts/regulatory/MotifAffinityTest.R"
219
+
220
+
221
+ class VariantMotifPlot(Proc):
222
+ """A plot with a genomic region surrounding a genomic variant, and
223
+ potentially disrupted motifs.
224
+
225
+ Currently only SNVs are supported.
226
+
227
+ Input:
228
+ infile: File containing the variants and motifs.
229
+ It is a TAB-delimited file with the following columns:
230
+ - chrom: The chromosome of the SNV. Alias: chr, seqnames.
231
+ - start: The start position of the SNV, no matter 0- or 1-based.
232
+ - end: The end position of the SNV, which will be used as the position of the SNV.
233
+ - strand: Indicating the direction of the surrounding sequence matching the motif.
234
+ - SNP_id: The name of the SNV.
235
+ - REF: The reference allele of the SNV.
236
+ - ALT: The alternative allele of the SNV.
237
+ - providerId: The motif id. It can be specified by `envs.motif_col`.
238
+ - providerName: The name of the motif provider. Optional.
239
+ - Regulator: The regulator name. Optional, can be specified by `envs.regulator_col`.
240
+ - motifPos: The position of the motif, relative to the position of the SNV.
241
+ For example, '-8, 4' means the motif is 8 bp upstream and 4 bp downstream of the SNV.
242
+
243
+ Envs:
244
+ genome: The genome assembly.
245
+ Used to fetch the sequences around the variants from the matching BSgenome package; for example, `BSgenome.Hsapiens.UCSC.hg19` is required if
246
+ the genome is `hg19`. If it is an organism other than human, please specify the full name of the package, for example, `BSgenome.Mmusculus.UCSC.mm10`.
247
+ motifdb: The path to the motif database. This is required.
248
+ It should be in the format of MEME motif database.
249
+ Databases can be downloaded here: <https://meme-suite.org/meme/doc/download.html>.
250
+ See also introduction to the databases: <https://meme-suite.org/meme/db/motifs>.
251
+ [universalmotif](https://github.com/bjmt/universalmotif) is required to read the motif database.
252
+ motif_col: The column name in the motif file containing the motif names.
253
+ If this is not provided, `envs.regulator_col` and `envs.regmotifs` are required,
254
+ which are used to infer the motif names from the regulator names.
255
+ regulator_col: The column name in the motif file containing the regulator names.
256
+ Both `motif_col` and `regulator_col` should be the direct column names or
257
+ the index (1-based) of the columns.
258
+ If no `regulator_col` is provided, no regulator information is written in
259
+ the output. Otherwise, the regulator information is written in the output in
260
+ the `Regulator` column.
261
+ regmotifs: The path to the regulator-motif mapping file.
262
+ It must have header and the columns `Motif` or `Model` for motif names and
263
+ `TF`, `Regulator` or `Transcription factor` for regulator names.
264
+ notfound (choice): What to do if a motif is not found in the database,
265
+ or a regulator is not found in the regulator-motif mapping (envs.regmotifs)
266
+ file.
267
+ - error: Report error and stop the process.
268
+ - ignore: Ignore the motif and continue.
269
+ devpars (ns): The default device parameters for the plot.
270
+ - width (type=int): The width of the plot.
271
+ - height (type=int): The height of the plot.
272
+ - res (type=int): The resolution of the plot.
273
+ plot_vars (type=auto): The variants (SNP_id) to plot.
274
+ A list of variant names to plot or a string with the variant names separated by comma.
275
+ When not specified, all variants are plotted.
276
+ """ # noqa: E501
277
+ input = "infile:file"
278
+ output = "outdir:dir:{{in.infile | stem}}.vmplots"
279
+ lang = config.lang.rscript
280
+ envs = {
281
+ "genome": config.ref.genome,
282
+ "motifdb": config.ref.tf_motifdb,
283
+ "motif_col": "providerId",
284
+ "regulator_col": None,
285
+ "regmotifs": config.ref.tf_motifs,
286
+ "notfound": "error",
287
+ "devpars": {"width": 800, "height": None, "res": 100},
288
+ "plot_vars": None,
289
+ }
290
+ script = "file://../scripts/regulatory/VariantMotifPlot.R"
biopipen/ns/rnaseq.py CHANGED
@@ -5,17 +5,154 @@ from ..core.config import config
5
5
 
6
6
 
7
7
  class UnitConversion(Proc):
8
- """Convert expression value units back and forth"""
8
+ """Convert expression value units back and forth
9
+
10
+ See <https://haroldpimentel.wordpress.com/2014/05/08/what-the-fpkm-a-review-rna-seq-expression-units/>
11
+ and <https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/#fpkm>.
12
+
13
+ The following conversions are supported -
14
+ * `count -> cpm, fpkm/rpkm, fpkmuq/rpkmuq, tpm, tmm`
15
+ * `fpkm/rpkm -> count, tpm, cpm`
16
+ * `tpm -> count, fpkm/rpkm, cpm`
17
+ * `cpm -> count, fpkm/rpkm, tpm`
18
+ NOTE that during some conversions, `sum(counts/effLen)` is approximated to
19
+ `sum(counts)/sum(effLen) * length(effLen)`
20
+
21
+ You can also use this process to just transform the expression values, e.g., take
22
+ log2 of the expression values. In this case, you can set `inunit` and `outunit` to
23
+ `count` and `log2(count + 1)` respectively.
24
+
25
+ Input:
26
+ infile: Input file containing expression values
27
+ The file should be a matrix with rows representing genes and columns
28
+ representing samples.
29
+ It could be an RDS file containing a data frame or a matrix, or a
30
+ text file containing a matrix with tab as the delimiter. The text
31
+ file can be gzipped.
32
+
33
+ Output:
34
+ outfile: Output file containing the converted expression values
35
+ The file will be a matrix with rows representing genes and columns
36
+ representing samples.
37
+
38
+ Envs:
39
+ inunit: The input unit of the expression values.
40
+ You can also use an expression to indicate the input unit, e.g.,
41
+ `log2(counts + 1)`. The expression should be like `A * fn(B*X + C) + D`,
42
+ where `A`, `B`, `C` and `D` are constants, `fn` is a function, and X is
43
+ the input unit.
44
+ Currently only `expr`, `sqrt`, `log2`, `log10` and `log` are supported as
45
+ functions.
46
+ Supported input units are:
47
+ * counts/count/rawcounts/rawcount: raw counts.
48
+ * cpm: counts per million.
49
+ * fpkm/rpkm: fragments per kilobase of transcript per million.
50
+ * fpkmuq/rpkmuq: upper quartile normalized FPKM/RPKM.
51
+ * tpm: transcripts per million.
52
+ * tmm: trimmed mean of M-values.
53
+ outunit: The output unit of the expression values. An expression can also be
54
+ used for transformation (e.g. `log2(tpm + 1)`). If `inunit` is `count`,
55
+ then this means we are converting raw counts to tpm, and transforming it
56
+ to `log2(tpm + 1)` as the output. Any expression supported by `R` can be
57
+ used. Same units as `inunit` are supported.
58
+ refexon: Path to the reference exon gff file.
59
+ meanfl (type=auto): A file containing the mean fragment length for each sample
60
+ by rows (samples as rowname), without header.
61
+ Or a fixed universal estimated number (1 used by TCGA).
62
+ nreads (type=auto): The estimated total number of reads for each sample.
63
+ Or you can pass a file with the number for each sample by rows
64
+ (samples as rowname), without header.
65
+ When converting `fpkm/rpkm -> count`, it should be total reads of that sample.
66
+ When converting `cpm -> count`: it should be total reads of that sample.
67
+ When converting `tpm -> count`: it should be total reads of that sample.
68
+ When converting `tpm -> cpm`: it should be total reads of that sample.
69
+ When converting `tpm -> fpkm/rpkm`: it should be `sum(fpkm)` of that sample.
70
+ It is not used when converting `count -> cpm, fpkm/rpkm, tpm`.
71
+ """ # noqa: E501
9
72
  input = "infile:file"
10
73
  output = "outfile:file:{{in.infile | basename}}"
11
74
  lang = config.lang.rscript
12
75
  envs = {
13
- "infmt": "matrix", # or rds
14
76
  "inunit": None,
15
77
  "outunit": None,
16
78
  "refexon": config.ref.refexon,
17
- "meanfl": None,
18
- "inlog2p": False,
19
- "outlog2p": False,
79
+ "meanfl": 1,
80
+ "nreads": 1_000_000,
20
81
  }
21
82
  script = "file://../scripts/rnaseq/UnitConversion.R"
83
+
84
+
85
+ class Simulation(Proc):
86
+ """Simulate RNA-seq data using ESCO/RUVcorr package
87
+
88
+ Input:
89
+ ngenes: Number of genes to simulate
90
+ nsamples: Number of samples to simulate
91
+ If you want to force the process to re-simulate for the same
92
+ `ngenes` and `nsamples`, you can set a different value for `envs.seed`.
93
+ Note that the samples will be shown as cells in the output (since
94
+ the simulation is designed for single-cell RNA-seq data).
95
+
96
+ Output:
97
+ outfile: Output file containing the simulated data with rows representing
98
+ genes and columns representing samples.
99
+ outdir: Output directory containing the simulated data
100
+ `sim.rds` and `True.rds` will be generated.
101
+ For `ESCO`, `sim.rds` contains the simulated data in a
102
+ `SingleCellExperiment` object, and `True.rds` contains the matrix of true
103
+ counts.
104
+ For `RUVcorr`, `sim.rds` contains the simulated data in a list with
105
+ `Truth`, a matrix containing the values of Xβ; `Y`, a matrix containing the
106
+ values in `Y`; `Noise`, a matrix containing the values in `Wα`; `Sigma`,
107
+ a matrix containing the true gene-gene correlations, as defined by Xβ; and
108
+ `Info`, a matrix containing some of the general information about the
109
+ simulation.
110
+ For all matrices, rows represent genes and columns represent samples.
111
+
112
+ Envs:
113
+ tool (choice): Which tool to use for simulation.
114
+ - ESCO: uses the [ESCO](https://github.com/JINJINT/ESCO) package.
115
+ - RUVcorr: uses the [RUVcorr](https://rdrr.io/bioc/RUVcorr/) package.
116
+ ncores (type=int): Number of cores to use.
117
+ seed (type=int): Random seed.
118
+ If not set, seed will not be set.
119
+ esco_args (ns): Additional arguments to pass to the simulation function.
120
+ - save (choice): Which type of data to save to `out.outfile`.
121
+ - `simulated-truth`: saves the simulated true counts.
122
+ - `zero-inflated`: saves the zero-inflated counts.
123
+ - `down-sampled`: saves the down-sampled counts.
124
+ - type (choice): Which type of heterogeneity to use.
125
+ - single: produces a single population.
126
+ - group: produces distinct groups.
127
+ - tree: produces distinct groups but admits a tree structure.
128
+ - traj: produces distinct groups but admits a smooth trajectory
129
+ structure.
130
+ - <more>: See <https://rdrr.io/github/JINJINT/ESCO/man/escoParams.html>.
131
+ ruvcorr_args (ns): Additional arguments to pass to the simulation
132
+ function.
133
+ - <more>: See <https://rdrr.io/bioc/RUVcorr/man/simulateGEdata.html>.
134
+ transpose_output (flag): If set, the output will be transposed.
135
+ index_start (type=int): The index to start from when naming the samples.
136
+ Affects the sample names in `out.outfile` only.
137
+ """
138
+ input = "ngenes:var, nsamples:var"
139
+ output = [
140
+ "outfile:file:{{in.ngenes}}x{{in.nsamples}}.sim/simulated.txt",
141
+ "outdir:dir:{{in.ngenes}}x{{in.nsamples}}.sim",
142
+ ]
143
+ lang = config.lang.rscript
144
+ envs = {
145
+ "tool": "RUVcorr",
146
+ "ncores": config.misc.ncores,
147
+ "type": "single",
148
+ "esco_args": {
149
+ "dropout-type": "none",
150
+ "save": "simulated-truth",
151
+ "type": "single",
152
+ },
153
+ "ruvcorr_args": {},
154
+ "seed": None,
155
+ "transpose_output": False,
156
+ "index_start": 1,
157
+ }
158
+ script = "file://../scripts/rnaseq/Simulation.R"