biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290) hide show
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +28 -0
  3. biopipen/core/filters.py +79 -4
  4. biopipen/core/proc.py +12 -3
  5. biopipen/core/testing.py +75 -3
  6. biopipen/ns/bam.py +148 -6
  7. biopipen/ns/bed.py +75 -0
  8. biopipen/ns/cellranger.py +186 -0
  9. biopipen/ns/cellranger_pipeline.py +126 -0
  10. biopipen/ns/cnv.py +19 -3
  11. biopipen/ns/cnvkit.py +1 -1
  12. biopipen/ns/cnvkit_pipeline.py +20 -12
  13. biopipen/ns/delim.py +34 -35
  14. biopipen/ns/gene.py +68 -23
  15. biopipen/ns/gsea.py +63 -37
  16. biopipen/ns/misc.py +39 -14
  17. biopipen/ns/plot.py +304 -1
  18. biopipen/ns/protein.py +183 -0
  19. biopipen/ns/regulatory.py +290 -0
  20. biopipen/ns/rnaseq.py +142 -5
  21. biopipen/ns/scrna.py +2053 -473
  22. biopipen/ns/scrna_metabolic_landscape.py +228 -382
  23. biopipen/ns/snp.py +659 -0
  24. biopipen/ns/stats.py +484 -0
  25. biopipen/ns/tcr.py +683 -98
  26. biopipen/ns/vcf.py +236 -2
  27. biopipen/ns/web.py +97 -6
  28. biopipen/reports/bam/CNVpytor.svelte +4 -9
  29. biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
  30. biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
  31. biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
  32. biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
  33. biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
  34. biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
  35. biopipen/reports/common.svelte +15 -0
  36. biopipen/reports/protein/ProdigySummary.svelte +16 -0
  37. biopipen/reports/scrna/CellsDistribution.svelte +4 -39
  38. biopipen/reports/scrna/DimPlots.svelte +1 -1
  39. biopipen/reports/scrna/MarkersFinder.svelte +6 -126
  40. biopipen/reports/scrna/MetaMarkers.svelte +3 -75
  41. biopipen/reports/scrna/RadarPlots.svelte +4 -20
  42. biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
  43. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
  44. biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
  45. biopipen/reports/snp/PlinkCallRate.svelte +24 -0
  46. biopipen/reports/snp/PlinkFreq.svelte +18 -0
  47. biopipen/reports/snp/PlinkHWE.svelte +18 -0
  48. biopipen/reports/snp/PlinkHet.svelte +18 -0
  49. biopipen/reports/snp/PlinkIBD.svelte +18 -0
  50. biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
  51. biopipen/reports/tcr/ClonalStats.svelte +16 -0
  52. biopipen/reports/tcr/CloneResidency.svelte +3 -93
  53. biopipen/reports/tcr/Immunarch.svelte +4 -155
  54. biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
  55. biopipen/reports/tcr/TESSA.svelte +11 -28
  56. biopipen/reports/utils/misc.liq +22 -7
  57. biopipen/scripts/bam/BamMerge.py +11 -15
  58. biopipen/scripts/bam/BamSampling.py +90 -0
  59. biopipen/scripts/bam/BamSort.py +141 -0
  60. biopipen/scripts/bam/BamSplitChroms.py +10 -10
  61. biopipen/scripts/bam/BamSubsetByBed.py +38 -0
  62. biopipen/scripts/bam/CNAClinic.R +41 -5
  63. biopipen/scripts/bam/CNVpytor.py +153 -54
  64. biopipen/scripts/bam/ControlFREEC.py +13 -14
  65. biopipen/scripts/bam/SamtoolsView.py +33 -0
  66. biopipen/scripts/bed/Bed2Vcf.py +5 -5
  67. biopipen/scripts/bed/BedConsensus.py +5 -5
  68. biopipen/scripts/bed/BedLiftOver.sh +6 -4
  69. biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
  70. biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
  71. biopipen/scripts/bed/BedtoolsMerge.py +4 -4
  72. biopipen/scripts/cellranger/CellRangerCount.py +138 -0
  73. biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
  74. biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
  75. biopipen/scripts/cnv/AneuploidyScore.R +55 -20
  76. biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
  77. biopipen/scripts/cnv/TMADScore.R +25 -9
  78. biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
  79. biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
  80. biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
  81. biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
  82. biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
  83. biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
  84. biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
  85. biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
  86. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
  87. biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
  88. biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
  89. biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
  90. biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
  91. biopipen/scripts/cnvkit/guess_baits.py +166 -93
  92. biopipen/scripts/delim/RowsBinder.R +1 -1
  93. biopipen/scripts/delim/SampleInfo.R +116 -118
  94. biopipen/scripts/gene/GeneNameConversion.R +67 -0
  95. biopipen/scripts/gene/GenePromoters.R +61 -0
  96. biopipen/scripts/gsea/Enrichr.R +5 -5
  97. biopipen/scripts/gsea/FGSEA.R +184 -50
  98. biopipen/scripts/gsea/GSEA.R +2 -2
  99. biopipen/scripts/gsea/PreRank.R +5 -5
  100. biopipen/scripts/misc/Config2File.py +2 -2
  101. biopipen/scripts/misc/Plot.R +80 -0
  102. biopipen/scripts/misc/Shell.sh +15 -0
  103. biopipen/scripts/misc/Str2File.py +2 -2
  104. biopipen/scripts/plot/Heatmap.R +3 -3
  105. biopipen/scripts/plot/Manhattan.R +147 -0
  106. biopipen/scripts/plot/QQPlot.R +146 -0
  107. biopipen/scripts/plot/ROC.R +88 -0
  108. biopipen/scripts/plot/Scatter.R +112 -0
  109. biopipen/scripts/plot/VennDiagram.R +5 -9
  110. biopipen/scripts/protein/MMCIF2PDB.py +33 -0
  111. biopipen/scripts/protein/PDB2Fasta.py +60 -0
  112. biopipen/scripts/protein/Prodigy.py +119 -0
  113. biopipen/scripts/protein/ProdigySummary.R +140 -0
  114. biopipen/scripts/protein/RMSD.py +178 -0
  115. biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
  116. biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
  117. biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
  118. biopipen/scripts/regulatory/MotifScan.py +159 -0
  119. biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
  120. biopipen/scripts/regulatory/motifs-common.R +324 -0
  121. biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
  122. biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
  123. biopipen/scripts/rnaseq/Simulation.R +21 -0
  124. biopipen/scripts/rnaseq/UnitConversion.R +325 -54
  125. biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
  126. biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
  127. biopipen/scripts/scrna/CellCellCommunication.py +150 -0
  128. biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
  129. biopipen/scripts/scrna/CellSNPLite.py +30 -0
  130. biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
  131. biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
  132. biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
  133. biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
  134. biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
  135. biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
  136. biopipen/scripts/scrna/CellsDistribution.R +456 -167
  137. biopipen/scripts/scrna/DimPlots.R +1 -1
  138. biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
  139. biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
  140. biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
  141. biopipen/scripts/scrna/ExprImputation.R +7 -0
  142. biopipen/scripts/scrna/LoomTo10X.R +51 -0
  143. biopipen/scripts/scrna/MQuad.py +25 -0
  144. biopipen/scripts/scrna/MarkersFinder.R +679 -400
  145. biopipen/scripts/scrna/MetaMarkers.R +265 -161
  146. biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
  147. biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
  148. biopipen/scripts/scrna/RadarPlots.R +355 -134
  149. biopipen/scripts/scrna/ScFGSEA.R +298 -100
  150. biopipen/scripts/scrna/ScSimulation.R +65 -0
  151. biopipen/scripts/scrna/ScVelo.py +617 -0
  152. biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
  153. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
  154. biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
  155. biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
  156. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
  157. biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
  158. biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
  159. biopipen/scripts/scrna/SeuratClustering.R +36 -233
  160. biopipen/scripts/scrna/SeuratLoading.R +2 -2
  161. biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
  162. biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
  163. biopipen/scripts/scrna/SeuratPreparing.R +223 -173
  164. biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
  165. biopipen/scripts/scrna/SeuratTo10X.R +27 -0
  166. biopipen/scripts/scrna/Slingshot.R +65 -0
  167. biopipen/scripts/scrna/Subset10X.R +2 -2
  168. biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
  169. biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
  170. biopipen/scripts/scrna/scvelo_paga.py +313 -0
  171. biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
  172. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
  173. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
  174. biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
  175. biopipen/scripts/snp/MatrixEQTL.R +217 -0
  176. biopipen/scripts/snp/Plink2GTMat.py +148 -0
  177. biopipen/scripts/snp/PlinkCallRate.R +199 -0
  178. biopipen/scripts/snp/PlinkFilter.py +100 -0
  179. biopipen/scripts/snp/PlinkFreq.R +291 -0
  180. biopipen/scripts/snp/PlinkFromVcf.py +81 -0
  181. biopipen/scripts/snp/PlinkHWE.R +85 -0
  182. biopipen/scripts/snp/PlinkHet.R +96 -0
  183. biopipen/scripts/snp/PlinkIBD.R +196 -0
  184. biopipen/scripts/snp/PlinkSimulation.py +124 -0
  185. biopipen/scripts/snp/PlinkUpdateName.py +124 -0
  186. biopipen/scripts/stats/ChowTest.R +146 -0
  187. biopipen/scripts/stats/DiffCoexpr.R +152 -0
  188. biopipen/scripts/stats/LiquidAssoc.R +135 -0
  189. biopipen/scripts/stats/Mediation.R +108 -0
  190. biopipen/scripts/stats/MetaPvalue.R +130 -0
  191. biopipen/scripts/stats/MetaPvalue1.R +74 -0
  192. biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
  193. biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
  194. biopipen/scripts/tcr/Attach2Seurat.R +3 -2
  195. biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
  196. biopipen/scripts/tcr/CDR3Clustering.R +343 -0
  197. biopipen/scripts/tcr/ClonalStats.R +526 -0
  198. biopipen/scripts/tcr/CloneResidency.R +255 -131
  199. biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
  200. biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
  201. biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
  202. biopipen/scripts/tcr/GIANA/query.py +164 -162
  203. biopipen/scripts/tcr/Immunarch-basic.R +31 -9
  204. biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
  205. biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
  206. biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
  207. biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
  208. biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
  209. biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
  210. biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
  211. biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
  212. biopipen/scripts/tcr/Immunarch.R +63 -11
  213. biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
  214. biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
  215. biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
  216. biopipen/scripts/tcr/SampleDiversity.R +1 -1
  217. biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
  218. biopipen/scripts/tcr/ScRepLoading.R +166 -0
  219. biopipen/scripts/tcr/TCRClusterStats.R +176 -22
  220. biopipen/scripts/tcr/TCRDock.py +110 -0
  221. biopipen/scripts/tcr/TESSA.R +102 -118
  222. biopipen/scripts/tcr/VJUsage.R +5 -5
  223. biopipen/scripts/tcr/immunarch-patched.R +142 -0
  224. biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
  225. biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
  226. biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
  227. biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
  228. biopipen/scripts/vcf/BcftoolsSort.py +113 -0
  229. biopipen/scripts/vcf/BcftoolsView.py +73 -0
  230. biopipen/scripts/vcf/TruvariBench.sh +14 -7
  231. biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
  232. biopipen/scripts/vcf/TruvariConsistency.R +1 -1
  233. biopipen/scripts/vcf/Vcf2Bed.py +2 -2
  234. biopipen/scripts/vcf/VcfAnno.py +11 -11
  235. biopipen/scripts/vcf/VcfDownSample.sh +22 -10
  236. biopipen/scripts/vcf/VcfFilter.py +5 -5
  237. biopipen/scripts/vcf/VcfFix.py +7 -7
  238. biopipen/scripts/vcf/VcfFix_utils.py +13 -4
  239. biopipen/scripts/vcf/VcfIndex.py +3 -3
  240. biopipen/scripts/vcf/VcfIntersect.py +3 -3
  241. biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
  242. biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
  243. biopipen/scripts/vcf/bcftools_utils.py +52 -0
  244. biopipen/scripts/web/Download.py +8 -4
  245. biopipen/scripts/web/DownloadList.py +5 -5
  246. biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
  247. biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
  248. biopipen/scripts/web/gcloud_common.py +49 -0
  249. biopipen/utils/gene.py +108 -60
  250. biopipen/utils/misc.py +146 -20
  251. biopipen/utils/reference.py +64 -20
  252. biopipen/utils/reporter.py +177 -0
  253. biopipen/utils/vcf.py +1 -1
  254. biopipen-0.34.26.dist-info/METADATA +27 -0
  255. biopipen-0.34.26.dist-info/RECORD +292 -0
  256. {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
  257. {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
  258. biopipen/ns/bcftools.py +0 -111
  259. biopipen/ns/scrna_basic.py +0 -255
  260. biopipen/reports/delim/SampleInfo.svelte +0 -36
  261. biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
  262. biopipen/reports/scrna/ScFGSEA.svelte +0 -35
  263. biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
  264. biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
  265. biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
  266. biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
  267. biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
  268. biopipen/reports/utils/gsea.liq +0 -110
  269. biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
  270. biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
  271. biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
  272. biopipen/scripts/gene/GeneNameConversion.py +0 -66
  273. biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
  274. biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
  275. biopipen/scripts/scrna/ExprImpution.R +0 -7
  276. biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
  277. biopipen/scripts/scrna/Write10X.R +0 -11
  278. biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
  279. biopipen/scripts/tcr/TCRClustering.R +0 -280
  280. biopipen/utils/common_docstrs.py +0 -61
  281. biopipen/utils/gene.R +0 -49
  282. biopipen/utils/gsea.R +0 -193
  283. biopipen/utils/io.R +0 -20
  284. biopipen/utils/misc.R +0 -114
  285. biopipen/utils/mutate_helpers.R +0 -433
  286. biopipen/utils/plot.R +0 -173
  287. biopipen/utils/rnaseq.R +0 -48
  288. biopipen/utils/single_cell.R +0 -115
  289. biopipen-0.21.0.dist-info/METADATA +0 -22
  290. biopipen-0.21.0.dist-info/RECORD +0 -218
@@ -0,0 +1,104 @@
1
+ library(motifbreakR)
2
+
3
# Genome object handed to snps.from.file() as `search.genome`.
bsgenome <- getBSgenome(genome_pkg)

# `snpinfo` is expected to provide the columns
# `chrom`, `start`, `end`, `name`, `score`, `strand`, `ref`, `alt`.
# Anything whose ref or alt allele is not exactly one character is an indel.
is_indel <- nchar(snpinfo$ref) != 1 | nchar(snpinfo$alt) != 1

# Coordinate-style identifier written to the BED name column:
#   indels: chrom:(start + 1)-end:ref:alt
#   others: chrom:end:ref:alt
snpinfo$coordname <- ifelse(
    is_indel,
    sprintf("%s:%s-%s:%s:%s", snpinfo$chrom, snpinfo$start + 1, snpinfo$end, snpinfo$ref, snpinfo$alt),
    sprintf("%s:%s:%s:%s", snpinfo$chrom, snpinfo$end, snpinfo$ref, snpinfo$alt)
)

# BED file consumed by motifbreakR, named after the input variant file.
motifbreakr_bed <- file.path(outdir, gsub("\\.vcf(\\.gz)?$|\\.bed$", ".motifbreakr.bed", basename(varfile)))
write.table(
    snpinfo[, c("chrom", "start", "end", "coordname", "score", "strand")],
    file = motifbreakr_bed,
    sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE
)
snps <- snps.from.file(motifbreakr_bed, search.genome = bsgenome, format = "bed", indels = any(is_indel))

# Re-align `snpinfo` to the variants that were actually loaded.
# An element-wise `==` is only correct when both vectors have the same length
# and order; otherwise it recycles silently. match() looks each loaded
# variant up by its identifier instead.
loaded_ids <- snps$SNP_id
snpinfo <- snpinfo[match(loaded_ids, snpinfo$coordname), , drop = FALSE]

# Prefer the user-supplied variant name; fall back to the identifier already
# on the loaded variant when the name is missing, "." or empty (or when the
# variant could not be matched back to `snpinfo`).
has_name <- !is.na(snpinfo$name) & snpinfo$name != "." & nzchar(snpinfo$name)
snps@elementMetadata$SNP_id <- ifelse(has_name, snpinfo$name, loaded_ids)
25
+
26
+ # prepare PWMs
27
#' Background frequency of one nucleotide
#'
#' Reads the `bkg.<base>` column from the element metadata of `mdb` and
#' uses its first entry. Falls back to a uniform 0.25 when the column is
#' absent, empty, or its first entry is NA.
#'
#' @param base Nucleotide letter, e.g. "A"
#' @return A single numeric frequency
get_bkg <- function(base) {
    freqs <- mdb@elementMetadata[[paste0("bkg.", base)]]
    unavailable <- is.null(freqs) || length(freqs) == 0 || is.na(freqs[1])
    if (unavailable) 0.25 else as.numeric(freqs[1])
}
# Named vector (A, C, G, T) of background frequencies
bkg <- vapply(c(A = "A", C = "C", G = "G", T = "T"), get_bkg, numeric(1))
37
+
38
+ # run motifbreakR
39
log$info("Running motifbreakR ...")
results <- motifbreakR(
    snpList = snps,
    pwmList = mdb,
    threshold = cutoff,
    method = motifbreakr_args$method,
    bkg = bkg,
    filterp = TRUE,
    show.neutral = FALSE,
    BPPARAM = MulticoreParam(ncores)
)

log$info("Calculating p values ...")
results <- calculatePvalue(results)
# Running index used later to link table rows back to `results`.
# seq_along() stays valid for zero hits, where 1:length() would yield c(1, 0).
results$.id <- seq_along(results)

results_to_save <- as.data.frame(unname(results))
# Flatten the per-hit position vectors into comma-separated strings so the
# table can be written as plain text.
collapse_positions <- function(x) paste(x, collapse = ",")
results_to_save$motifPos <- vapply(
    results_to_save$motifPos, collapse_positions, character(1), USE.NAMES = FALSE
)
results_to_save$altPos <- vapply(
    results_to_save$altPos, collapse_positions, character(1), USE.NAMES = FALSE
)
if (!is.null(regulator_col)) {
    # Look up the regulator of each hit through its motif identifier
    results_to_save$Regulator <- in_motifs[
        match(results_to_save$providerId, in_motifs[[motif_col]]),
        regulator_col,
        drop = TRUE
    ]
}
# Convert every column to character while keeping the data frame shape.
# apply(df, 2, as.character) returns a plain vector for a one-row table,
# which as.data.frame() then turns into a single-column frame.
results_to_save[] <- lapply(results_to_save, as.character)

if (!is.null(motif_var_pairs)) {
    log$info("Filtering motif-variant pairs ...")
    # Pairs are identified as "<motif> // <variant>"
    results_to_save$motifs_vars <- paste0(results_to_save$providerId, " // ", results_to_save$SNP_id)
    results_to_save <- results_to_save[results_to_save$motifs_vars %in% motif_var_pairs, , drop = FALSE]
    results_to_save$motifs_vars <- NULL
}

write.table(
    results_to_save,
    file = file.path(outdir, "motifbreakr.txt"),
    sep = "\t", quote = FALSE, row.names = FALSE
)
78
+ # rm(results_to_save)
79
+
80
log$info("Plotting variants ...")
if (is.null(plots) || length(plots) == 0) {
    # No plot specification given: plot the variants behind the hits with the
    # largest absolute allele difference.
    results_to_save$alleleDiff <- as.numeric(results_to_save$alleleDiff)
    results_to_save <- results_to_save[order(-abs(results_to_save$alleleDiff)), , drop = FALSE]
    # seq_len() selects nothing for an empty table; 1:min(...) would give
    # c(1, 0) there and pick a phantom all-NA row.
    n_top <- min(plot_nvars, nrow(results_to_save))
    results_to_save <- results_to_save[seq_len(n_top), , drop = FALSE]
    variants <- unique(results_to_save$SNP_id)
} else {
    variants <- names(plots)
}
for (variant in variants) {
    log$info("- Variant: {variant}")
    # Fill in defaults for variants without a (complete) plot specification
    if (is.null(plots[[variant]])) {
        plots[[variant]] <- list(devpars = devpars, which = "TRUE")
    }
    if (is.null(plots[[variant]]$which)) {
        plots[[variant]]$which <- "TRUE"
    }
    if (is.null(plots[[variant]]$devpars)) {
        plots[[variant]]$devpars <- devpars
    }
    # Hits of this variant that are still present in the saved table
    res <- results[results$SNP_id == variant & results$.id %in% results_to_save$.id, , drop = FALSE]
    # NOTE(review): `which` is a filter expression given as text and is
    # evaluated as R code; it must only ever come from trusted configuration.
    res <- subset(res, subset = eval(parse(text = plots[[variant]]$which)))

    plot_variant_motifs(res, variant, plots[[variant]]$devpars, outdir)
}
@@ -0,0 +1,159 @@
1
+ """Script for regulatory.MotifScan"""
2
+ import re
3
+
4
+ # Paths may be passed in args or to motifdb
5
+ from pathlib import PosixPath # noqa: F401
6
+ from biopipen.utils.misc import run_command, dict_to_cli_args, logger
7
+
8
+ motiffile: str = {{in.motiffile | quote}} # pyright: ignore # noqa: #999
9
+ seqfile: str = {{in.seqfile | quote}} # pyright: ignore
10
+ outdir: str = {{out.outdir | quote}} # pyright: ignore
11
+
12
+ tool = {{envs.tool | repr}} # pyright: ignore
13
+ fimo = {{envs.fimo | repr}} # pyright: ignore
14
+ motif_col: str | int = {{envs.motif_col | repr}} # pyright: ignore
15
+ regulator_col: str | int = {{envs.regulator_col | repr}} # pyright: ignore
16
+ notfound = {{envs.notfound | repr}} # pyright: ignore
17
+ motifdb: str | None = {{envs.motifdb | repr}} # pyright: ignore
18
+ cutoff = {{envs.cutoff | repr}} # pyright: ignore
19
+ q = {{envs.q | repr}} # pyright: ignore
20
+ q_cutoff = {{envs.q_cutoff | repr}} # pyright: ignore
21
+ args: dict = {{envs.args | dict | repr}} # pyright: ignore
22
+
23
# Only fimo is handled by this script; reject any other tool up front
if tool != "fimo":
    raise ValueError(f"Unsupported tool: {tool}, currently only fimo is supported")

# A motif database has to be configured
if motifdb is None:
    raise ValueError("The motif database is required")

# Both input files must be given (only emptiness is checked here,
# not whether the paths exist on disk)
for _given, _label in (
    (motiffile, "Motif file in.motiffile"),
    (seqfile, "Sequence file in.seqfile"),
):
    if not _given:
        raise FileNotFoundError(f"{_label} must be provided")
38
+
39
# Turn motif_col and regulator_col into 0-based indexes:
# - a column name is looked up in the header line of the motif file
# - an integer is taken as a 1-based position
# - any other value (e.g. None) is left as it is
if any(isinstance(col, str) for col in (motif_col, regulator_col)):
    with open(motiffile, "r") as header_fh:
        column_names = header_fh.readline().strip().split("\t")

if isinstance(motif_col, str):
    motif_col = column_names.index(motif_col)
elif isinstance(motif_col, int):
    motif_col -= 1

if isinstance(regulator_col, str):
    regulator_col = column_names.index(regulator_col)
elif isinstance(regulator_col, int):
    regulator_col -= 1
51
+
52
# Collect the motif names requested in the motif file (header line skipped)
motif_names = set()
with open(motiffile, "r") as motif_fh:
    next(motif_fh, None)
    for row in motif_fh:
        motif_names.add(row.strip().split("\t")[motif_col])

# Collect the names available in the database: everything that follows
# the "MOTIF " prefix on a record line
with open(motifdb, "r") as db_fh:
    motif_db_names = {
        db_line[6:].strip() for db_line in db_fh if db_line.startswith("MOTIF")
    }

# With notfound == "error", any requested motif missing from the database is fatal
if notfound == "error":
    notfound_motifs = motif_names.difference(motif_db_names)
    if len(notfound_motifs) > 0:
        raise ValueError(f"Motifs not found in the database: {notfound_motifs}")
71
+
72
# Write a reduced motif database holding only the requested motifs
motif_names = motif_names.intersection(motif_db_names)
motifdb_filtered = f"{outdir}/motif_db.txt"
with open(motifdb, "r") as db_in, open(motifdb_filtered, "w") as db_out:
    # Lines ahead of the first MOTIF record are always copied;
    # afterwards each MOTIF line decides whether its record is kept
    keep_record = True
    for db_line in db_in:
        if db_line.startswith("MOTIF"):
            keep_record = db_line[6:].strip() in motif_names
        if keep_record:
            db_out.write(db_line)
89
+
90
# Layer the fimo settings over the user-supplied args and run the command.
# NOTE(review): the "" and "_" keys presumably carry the executable and the
# positional arguments for dict_to_cli_args -- confirm against that helper.
args.update(
    {
        "": fimo,
        "oc": f"{outdir}",
        "thresh": cutoff,
        "qv_thresh": q_cutoff,
        "no_qvalue": not q,
        "no-pgc": True,
        "_": [motifdb_filtered, seqfile],
    }
)

logger.info("Running fimo ...")
fimo_command = dict_to_cli_args(args, dashify=True)
run_command(fimo_command, fg=True)
101
+
102
logger.info("Adding additional information to the output ...")
# Map each motif name to its regulator, using the motif file.
# The map stays empty when no regulator column is configured.
motif_regulator_map = {}
if regulator_col is not None:
    with open(motiffile, "r") as motif_fh:
        next(motif_fh)  # skip header
        for row in motif_fh:
            fields = row.strip().split("\t")
            motif_regulator_map[fields[motif_col]] = fields[regulator_col]
113
+
114
# Get the sequence name information from the FASTA headers.
# Headers shaped like "<name>::<chrom>:<start>-<end>..." are split into the
# short name and the region coordinates; any other header is kept whole and
# gets no coordinates.
# The chromosome part accepts any run of characters without ":" or
# whitespace, so that chrX, chrY, chrM and named contigs are recognized as
# well (a digits-only pattern would leave them without coordinates).
seqname_pattern = re.compile(r"^(.+)::([^:\s]+):(\d+)-(\d+).*$")
seqnames = {}
seqcoords = {}
with open(seqfile, "r") as f:
    for line in f:
        if not line.startswith(">"):
            continue

        seqname = line[1:].strip()
        match = seqname_pattern.match(seqname)
        if not match:
            seqnames[seqname] = seqname
            seqcoords[seqname] = None
        else:
            sname, chrom, start, end = match.groups()
            seqnames[seqname] = sname
            seqcoords[seqname] = (chrom, int(start), int(end))
131
+
132
# Add additional information to the output: copy fimo.tsv to fimo_output.txt
# with four extra columns (regulator, seqname, seqstart, seqstop)
with open(f"{outdir}/fimo.tsv", "r") as f, open(f"{outdir}/fimo_output.txt", "w") as f_out:
    header = f.readline().strip().split("\t")
    f_out.write(
        "\t".join(header + ["regulator", "seqname", "seqstart", "seqstop"]) + "\n"
    )
    for line in f:
        line = line.strip()
        # Skip blank lines and the "#" comment lines in the file
        if not line or line.startswith("#"):
            continue
        line = line.split("\t")
        motif_name = line[0]
        sequence_name = line[2]
        start = int(line[3])
        stop = int(line[4])
        # Fall back to the motif name when no regulator is known for it
        regulator = motif_regulator_map.get(motif_name, motif_name)
        seqname = seqnames.get(sequence_name, "NA")
        seqcoord = seqcoords.get(sequence_name)
        if not seqcoord:
            seqstart = "NA"
            seqstop = "NA"
        else:
            # start and stop are both positions within the sequence, so both
            # are shifted by the region START (seqcoord[1]). Shifting stop by
            # the region end would place it outside the region.
            # NOTE(review): the "- 1" assumes the region start in the header
            # is 1-based; confirm against the tool that wrote the FASTA.
            offset = seqcoord[1] - 1
            seqstart = start + offset
            seqstop = stop + offset

        f_out.write(
            "\t".join(line + [regulator, seqname, str(seqstart), str(seqstop)]) + "\n"
        )
@@ -0,0 +1,78 @@
1
+ {% include biopipen_dir + "/scripts/regulatory/motifs-common.R" %}
2
+
3
+ library(BSgenome)
4
+ library(GenomicRanges)
5
+ library(biopipen.utils)
6
+
7
+ infile <- {{in.infile | r}}
8
+ outdir <- {{out.outdir | r}}
9
+ genome <- {{envs.genome | r}}
10
+ motifdb <- {{envs.motifdb | r}}
11
+ motif_col <- {{envs.motif_col | r}}
12
+ regulator_col <- {{envs.regulator_col | r}}
13
+ regmotifs <- {{envs.regmotifs | r}}
14
+ notfound <- {{envs.notfound | r}}
15
+ devpars <- {{envs.devpars | r}}
16
+ plot_vars <- {{envs.plot_vars | r}}
17
+
18
# The motif database must be configured and present on disk
if (is.null(motifdb) || !file.exists(motifdb)) {
    stop("Motif database (envs.motifdb) is required and must exist")
}

# The genome comes from envs.genome; the message names that option so the
# user knows which setting to fix
if (is.null(genome)) {
    stop("Reference genome (envs.genome) is required")
}

# At least one of the two columns is needed to identify the motifs
if (is.null(motif_col) && is.null(regulator_col)) {
    stop("Either motif (envs.motif_col) or regulator (envs.regulator_col) column must be provided")
}
29
+
30
log <- get_logger()

log$info("Reading input data ...")
indata <- read.table(infile, header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE)

log$info("Ensuring regulators in the input data ...")
indata <- ensure_regulator_motifs(indata, outdir, motif_col, regulator_col, "SNP_id", regmotifs, notfound = notfound)
genome_pkg <- get_genome_pkg(genome)

log$info("Reading motif database ...")
meme <- read_meme_to_motifdb(motifdb, indata, motif_col, regulator_col, notfound, outdir)

log$info("Composing motifbreakR results from input data ...")
# Columns are looked up with `[[`, which matches names exactly; `$` would
# partially match (e.g. `motif` -> `motifPos`) and silently pick up the
# wrong column.
chrom_values <- indata[["chrom"]] %||% indata[["chr"]] %||% indata[["seqnames"]]
if (is.null(chrom_values)) {
    stop("Column 'chrom', 'chr' or 'seqnames' is required in the input data")
}
# Keep exactly one chromosome column (`chr`) so that the seqnames column is
# unambiguous when the GRanges object is built below.
indata$chrom <- NULL
indata$seqnames <- NULL
indata$chr <- chrom_values

# Fill in defaults for the columns that are optional in the input
indata$strand <- indata[["strand"]] %||% "+"
indata$varType <- indata[["varType"]] %||% "SNV"
indata$geneSymbol <- indata[["geneSymbol"]] %||% indata[["Regulator"]]
indata$providerId <- indata[["providerId"]] %||% indata[["motif"]]
indata$providerName <- indata[["providerName"]] %||% indata[["providerId"]]
# Default data source: file name of the motif database up to its first dot
indata$dataSource <- indata[["dataSource"]] %||% strsplit(basename(motifdb), "\\.")[[1]][1]
indata$effect <- indata[["effect"]] %||% "strong"
indata$altPos <- indata[["altPos"]] %||% 1
indata$alleleDiff <- indata[["alleleDiff"]] %||% indata[["score"]] %||% 0

# check other required columns
for (col in c("start", "end", "SNP_id", "REF", "ALT", "motifPos")) {
    if (!(col %in% colnames(indata))) {
        stop("Column '", col, "' is required in the input data")
    }
}
# motifPos is stored as comma-separated integers. as.character() keeps the
# split working when the column was read as a non-character type.
indata$motifPos <- lapply(
    indata$motifPos,
    function(x) as.integer(unlist(strsplit(as.character(x), ",")))
)
indata <- makeGRangesFromDataFrame(indata, keep.extra.columns = TRUE, starts.in.df.are.0based = TRUE)
genome(indata) <- genome
# Attach the genome package and the motifs as attributes of the GRanges object
attributes(indata)$genome.package <- genome_pkg
attributes(indata)$motifs <- meme
66
+
67
log$info("Plotting variants ...")
# Decide which variants to plot:
# - nothing given:   every variant found in the input
# - several values:  those values, de-duplicated
# - a single string: split on commas
plot_vars <- if (is.null(plot_vars)) {
    unique(indata$SNP_id)
} else if (length(plot_vars) > 1) {
    unique(plot_vars)
} else {
    strsplit(plot_vars, ",")[[1]]
}

# One plot per selected variant
for (pvar in plot_vars) {
    log$info("- Variant: {pvar}")
    plot_variant_motifs(indata, pvar, devpars, outdir)
}
@@ -0,0 +1,324 @@
1
+ library(rlang)
2
+ library(universalmotif)
3
+ library(MotifDb)
4
+ library(biopipen.utils)
5
+
6
+ #' @title Common functions for regulatory analysis
7
+ #' @name regulatory-common
8
+ #' @author Panwen Wang
9
+
10
#' Read a regulator-motif mapping file
#'
#' Reads a tab-delimited file with a header line and keeps only the motif
#' column and the regulator column. Exactly one column of each kind must be
#' present in the file, otherwise an error is raised.
#'
#' @param rmfile Path to the regulator-motif mapping file
#' @param motif_cols_allowed Column names recognized as the motif column
#' @param reg_cols_allowed Column names recognized as the regulator column
#' @return Data frame with the motif column first and the regulator column
#'   second. The column names found in the file are kept as they are.
.read_regmotifs <- function(
    rmfile,
    motif_cols_allowed = c("Motif", "motif", "MOTIF", "Model", "model", "MODEL"),
    reg_cols_allowed = c("Regulator", "regulator", "REGULATOR", "TF", "tf")
) {
    if (!file.exists(rmfile)) {
        stop("Regulator-motif mapping file does not exist.")
    }
    # Duplicated candidates would only add noise to the error messages below
    motif_cols_allowed <- unique(motif_cols_allowed)
    reg_cols_allowed <- unique(reg_cols_allowed)

    regmotifs <- read.table(
        rmfile,
        header = TRUE,
        sep = "\t",
        stringsAsFactors = FALSE,
        check.names = FALSE
    )
    rm_motif_col <- intersect(motif_cols_allowed, colnames(regmotifs))
    rm_reg_col <- intersect(reg_cols_allowed, colnames(regmotifs))
    if (length(rm_motif_col) == 0) {
        stop(paste0("No motif column found in the regulator-motif mapping file, provide one of: ", paste(motif_cols_allowed, collapse = ", ")))
    }
    if (length(rm_reg_col) == 0) {
        stop(paste0("No regulator column found in the regulator-motif mapping file, provide one of: ", paste(reg_cols_allowed, collapse = ", ")))
    }
    if (length(rm_motif_col) > 1) {
        stop(paste0("Multiple motif columns found (", paste(rm_motif_col, collapse = ", "), ") in the regulator-motif mapping file, provide only one"))
    }
    if (length(rm_reg_col) > 1) {
        stop(paste0("Multiple regulator columns found (", paste(rm_reg_col, collapse = ", "), ") in the regulator-motif mapping file, provide only one"))
    }

    # Both selections hold exactly one name at this point.
    # Motif first, regulator second: callers read the columns by position.
    regmotifs[, c(rm_motif_col, rm_reg_col), drop = FALSE]
}
45
+
46
#' Report items that could not be matched
#'
#' Does nothing when `notfound_items` is empty. Otherwise a message listing
#' up to three of the items is built. When there are more than three items,
#' the complete list is written to `notfound_file` and a second message
#' pointing to that file is added.
#'
#' @param notfound_items Items that were not found
#' @param log_warn Function called with each message when `notfound` is "ignore"
#' @param msg Message prefix; the items are appended after ": "
#' @param notfound Action to take: "error" raises an error with the message(s),
#'   "ignore" passes them to `log_warn`; any other value reports nothing
#' @param notfound_file File to save the full list of not found items
#' @param log_indent Indentation prepended to each message
.handle_notfound_items <- function (notfound_items, log_warn, msg, notfound, notfound_file, log_indent = "") {
    if (length(notfound_items) == 0) {
        return(invisible(NULL))
    }

    overflow <- length(notfound_items) > 3
    preview <- head(notfound_items, 3)
    if (overflow) {
        preview <- c(preview, "...")
        # Only long lists are dumped to disk; short ones fit in the message
        writeLines(notfound_items, notfound_file)
    }

    headline <- paste0(log_indent, msg, ": ", paste(preview, collapse = ", "))
    if (!overflow) {
        if (notfound == "error") {
            stop(headline)
        } else if (notfound == "ignore") {
            log_warn(headline)
        }
    } else {
        pointer <- paste0(log_indent, "Check the full list in ", notfound_file)
        if (notfound == "error") {
            stop(headline, "\n", pointer)
        } else if (notfound == "ignore") {
            log_warn(headline)
            log_warn(pointer)
        }
    }
}
78
+
79
#' Build a MotifDb motif list from a MEME file
#'
#' Reads the motifs in `motifdb`, keeps those whose names appear in
#' `indata[[motif_col]]` (names missing from the file are handled by
#' `check_motifs()` according to `notfound`), and wraps the remaining
#' matrices together with per-motif metadata in a MotifDb motif list.
#'
#' @param motifdb MEME file
#' @param indata Input data frame
#' @param motif_col Column name for the motif
#' @param regulator_col Column name for the regulator. When not NULL, the
#'   regulator(s) of the rows matching each motif are stored as `geneSymbol`
#' @param notfound Action to take if motifs are not found
#' @param outdir Output directory, used to save un-matched motifs
#' @return MotifDb object
#' @export
read_meme_to_motifdb <- function(motifdb, indata, motif_col, regulator_col, notfound, outdir) {
    motif_name <- function(m) m@name

    all_motifs <- read_meme(motifdb)
    wanted <- check_motifs(
        indata[[motif_col]],
        sapply(all_motifs, motif_name),
        notfound,
        outdir
    )
    kept <- filter_motifs(all_motifs, name = wanted)

    # Take the names from the filtered set so they follow its order
    kept_names <- sapply(kept, motif_name)

    pwms <- lapply(kept, function(m) m@motif)
    names(pwms) <- kept_names

    # The file name up to its first dot labels the data source of every motif
    source_label <- strsplit(basename(motifdb), "\\.")[[1]][1]

    describe_motif <- function(m) {
        fields <- attributes(m)
        fields$dataSource <- source_label
        # Drop the entries that are not wanted in the metadata row
        fields[c("class", "motif", "gapinfo")] <- NULL
        fields$sequenceCount <- fields$nsites
        fields$providerId <- fields$name
        fields$providerName <- fields$name
        if (is.null(fields$organism) || length(fields$organism) == 0) {
            fields$organism <- "Unknown"
        } else {
            fields$organism <- fields$organism
        }
        if (!is.null(regulator_col)) {
            is_this_motif <- indata[[motif_col]] == fields$name
            fields$geneSymbol <- indata[is_this_motif, regulator_col, drop = TRUE]
        }
        unlist(fields)
    }

    meta <- do.call(rbind, lapply(kept, describe_motif))
    rownames(meta) <- kept_names
    MotifDb:::MotifList(pwms, tbl.metadata = meta)
}
124
+
125
#' Convert a MotifDb object to a motif library
#'
#' Every matrix in `motifdb` is transposed; the result is a list with one
#' transposed matrix per motif.
#'
#' @param motifdb MotifDb object
#' @return Motif library: a list of transposed motif matrices
#' @export
motifdb_to_motiflib <- function(motifdb) {
    transpose_pwm <- function(pwm) t(pwm)
    lapply(motifdb, transpose_pwm)
}
134
+
135
#' Make sure the input data has both regulators and motifs
#'
#' Three situations are handled:
#' - `motif_col` is NULL: motifs are looked up from the regulators through the
#'   regulator-motif mapping file, which is then required.
#' - `regulator_col` is NULL: without a mapping file, the rows are made unique
#'   by motif; with one, regulators are looked up from the motifs.
#' - both are given: duplicated (regulator, motif, variant) rows are dropped.
#'
#' Rows whose regulator (or motif) is missing from the mapping file are
#' reported according to `notfound` and removed before the lookup.
#'
#' Side effect: when a column is added from the mapping file, its name is
#' assigned with `<<-` to `motif_col` (or `regulator_col`) in the enclosing
#' environment of this function, so that the calling script sees the new
#' column name. The argument inside this function is not modified.
#'
#' @param indata Input data frame
#' @param outdir Output directory, used to save un-matched regulators/motifs
#' @param motif_col Column name for the motif, or NULL
#' @param regulator_col Column name for the regulator, or NULL
#' @param var_col Column name for the variant
#' @param regmotifs Regulator-motif mapping file
#' @param log_indent Indentation for log messages
#' @param notfound Action to take if regulators/motifs are not found in the
#'   mapping file
#' @param log Logger with a `warn` function. When NULL, `get_logger()` is used
#'   in the branches that need to log
#' @return Data frame with regulators and motifs
#' @export
ensure_regulator_motifs <- function (indata, outdir, motif_col, regulator_col, var_col, regmotifs, log_indent = "", notfound = "error", log = NULL) {
    if (is.null(motif_col)) {
        if (is.null(regmotifs)) {
            stop("Regulator-motif mapping file (envs.regmotifs) is required when no motif column (envs.motif_col) is provided")
        }
        log <- log %||% get_logger()
        regmotifs <- .read_regmotifs(regmotifs)
        rm_motif_col <- colnames(regmotifs)[1]
        rm_reg_col <- colnames(regmotifs)[2]
        # check regulators
        rm_regs <- regmotifs[[rm_reg_col]]
        regulators <- indata[[regulator_col]]
        notfound_regs <- setdiff(regulators, rm_regs)
        .handle_notfound_items(
            notfound_regs,
            log$warn,
            "The following regulators were not found in the regulator-motif mapping file",
            notfound,
            file.path(outdir, "notfound_regulators.txt"),
            log_indent
        )
        indata <- indata[indata[[regulator_col]] %in% rm_regs, , drop = FALSE]
        # add motif column
        indata <- merge(indata, regmotifs, by.x = regulator_col, by.y = rm_reg_col, all.x = TRUE, suffixes = c("", "_db"))
        # publish the new motif column name to the enclosing environment
        motif_col <<- rm_motif_col
    } else if (is.null(regulator_col)) {
        if (is.null(regmotifs) || (is.character(regmotifs) && nchar(regmotifs) == 0)) {
            # make motifs unique
            indata <- indata[!duplicated(indata[[motif_col]]), , drop = FALSE]
        } else if (!file.exists(regmotifs)) {
            stop("Regulator-motif mapping file (envs.regmotifs) does not exist.")
        } else {
            # The logger must be resolved here as well: with the default
            # `log = NULL`, `log$warn` would be NULL and reporting missing
            # motifs with notfound = "ignore" would fail.
            log <- log %||% get_logger()
            # map the regulators
            regmotifs <- .read_regmotifs(regmotifs)
            rm_motif_col <- colnames(regmotifs)[1]
            rm_reg_col <- colnames(regmotifs)[2]
            rm_motifs <- regmotifs[[rm_motif_col]]
            motifs <- indata[[motif_col]]
            notfound_motifs <- setdiff(motifs, rm_motifs)
            .handle_notfound_items(
                notfound_motifs,
                log$warn,
                "The following motifs were not found in the regulator-motif mapping file",
                notfound,
                file.path(outdir, "notfound_motifs.txt"),
                log_indent
            )
            indata <- indata[indata[[motif_col]] %in% rm_motifs, , drop = FALSE]
            # add regulator column
            indata <- merge(indata, regmotifs, by.x = motif_col, by.y = rm_motif_col, all.x = TRUE, suffixes = c("", "_db"))
            # publish the new regulator column name to the enclosing environment
            regulator_col <<- rm_reg_col
        }
    } else {
        indata <- indata[!duplicated(indata[, c(regulator_col, motif_col, var_col), drop = FALSE]), , drop = FALSE]
    }

    return(indata)
}
207
+
208
#' Resolve, check and attach the genome package for a genome
#'
#' A `genome` containing a dot is taken as the package name itself; otherwise
#' it is expanded to "BSgenome.Hsapiens.UCSC.<genome>". The package is
#' attached with `library()` before its name is returned.
#'
#' @param genome Genome name, or a full genome package name
#' @return Genome package name. An error is raised if the package is not
#'   installed
#' @export
get_genome_pkg <- function(genome) {
    is_pkg_name <- grepl(".", genome, fixed = TRUE)
    genome_pkg <- if (is_pkg_name) {
        genome
    } else {
        sprintf("BSgenome.Hsapiens.UCSC.%s", genome)
    }

    installed <- requireNamespace(genome_pkg, quietly = TRUE)
    if (!installed) {
        stop(sprintf("Genome package %s is not installed", genome_pkg))
    }

    library(package = genome_pkg, character.only = TRUE)
    genome_pkg
}
226
+
227
#' Keep the motifs that exist in the motif database
#'
#' Motifs missing from `all_motifs` are reported: up to three of them are
#' named in the message, and when there are more than three the full list is
#' written to "<outdir>/notfound_motifs.txt".
#'
#' @param motifs Motifs to check
#' @param all_motifs All motifs in the motif database
#' @param notfound Action to take if motifs are not found: "error" raises an
#'   error, "ignore" logs warnings; any other value reports nothing
#' @param outdir Output directory, used to save un-matched motifs
#' @param log Logger with a `warn` function. When NULL, `get_logger()` is used
#' @return `motifs` unchanged when all of them are found; otherwise the
#'   distinct motifs that are found
#' @export
check_motifs <- function(motifs, all_motifs, notfound, outdir, log = NULL) {
    if (is.null(log)) {
        log <- get_logger()
    }

    missing_motifs <- setdiff(motifs, all_motifs)
    if (length(missing_motifs) == 0) {
        return(motifs)
    }

    overflow <- length(missing_motifs) > 3
    preview <- head(missing_motifs, 3)
    if (overflow) {
        preview <- c(preview, "...")
        listing <- file.path(outdir, "notfound_motifs.txt")
        writeLines(missing_motifs, listing)
    }

    report <- paste0(
        "The following motifs were not found in the motif database: ",
        paste(preview, collapse = ", ")
    )
    if (overflow) {
        report <- c(report, paste0("Check the full list in ", listing))
    }

    if (notfound == "error") {
        stop(paste(report, collapse = "\n"))
    } else if (notfound == "ignore") {
        for (line in report) {
            log$warn(line)
        }
    }

    setdiff(motifs, missing_motifs)
}
267
+
268
#' Plot a genomic region surrounding a genomic variant, and
#' potentially disrupted motifs.
#'
#' The plot is written as a PNG named after the slugified variant ID.
#'
#' @param results The motifbreakR results.
#'  A GRanges object with the following columns:
#'  - seqnames: Chromosome
#'  - ranges: Start and end positions
#'  - strand: Strand
#'  -------------------
#'  - SNP_id: Variant ID
#'  - REF: Reference allele
#'  - ALT: Alternative allele
#'  - varType: Variant type. By default, "SNV"
#'  - motifPos: Motif positions
#'  - geneSymbol: Gene symbol, if not provided, try to get from the Regulator column
#'  - dataSource: Motif database source
#'  - providerName: Motif name
#'  - providerId: Motif ID
#'  - effect: Effect of the variant. By default, "strong"
#'  - altPos: Alternative allele position. By default, 1
#'  - alleleDiff: Allele difference, default 0, does not affect the plot for SNVs
#'
#'  Attributes:
#'  - genome.package: Genome package name
#'  - motifs: Motif database, in MotifDb::MotifList format
#' @param variant Variant ID to be plotted
#' @param devpars List of device parameters
#'  - res: Resolution, default 100
#'  - width: Width of the plot, default NULL, calculated based on sequence length
#'  - height: Height of the plot, default NULL, calculated based on the number of motifs
#'  A given width or height is passed to the device as it is.
#' @param outdir Output directory. Plots will be saved in the sub-directory "<outdir>/plots/"
#' @export
plot_variant_motifs <- function(results, variant, devpars, outdir) {
    plotdir <- file.path(outdir, "plots")
    dir.create(plotdir, showWarnings = FALSE)

    res <- results[results$SNP_id == variant, , drop = FALSE]
    if (length(res) == 0) {
        stop(sprintf("Variant %s not found in results", variant))
    }

    devpars <- devpars %||% list(res = 100, width = NULL, height = NULL)
    devpars$res <- devpars$res %||% 100
    # `%||%` binds tighter than `*` and `+`, so the default height must be
    # computed in its own branch; otherwise a given height gets multiplied by
    # the resolution and inflated by the per-motif term.
    if (is.null(devpars$height)) {
        devpars$height <- 2.4 * devpars$res + length(res) * 1.2 * devpars$res
    }
    if (is.null(devpars$width)) {
        # Span from the leftmost motif start to the rightmost motif end
        left <- min(sapply(res$motifPos, `[`, 1))
        right <- max(sapply(res$motifPos, `[`, 2))
        devpars$width <- 1.5 * devpars$res + (right - left) * 0.3 * devpars$res
        devpars$width <- max(devpars$width, 5 * devpars$res)
    }

    plotfile <- file.path(plotdir, sprintf("%s.png", slugify(variant)))
    # fix motifBreakR 2.12 using names to filter in plotMB
    names(res) <- res$SNP_id

    png(plotfile, width = devpars$width, height = devpars$height, res = devpars$res)
    # Close the device even when plotting fails, so that no device is leaked
    device_open <- TRUE
    on.exit(if (device_open) dev.off(), add = TRUE)
    motifbreakR::plotMB(res, variant)
    device_open <- FALSE
    dev.off()
}