biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +28 -0
- biopipen/core/filters.py +79 -4
- biopipen/core/proc.py +12 -3
- biopipen/core/testing.py +75 -3
- biopipen/ns/bam.py +148 -6
- biopipen/ns/bed.py +75 -0
- biopipen/ns/cellranger.py +186 -0
- biopipen/ns/cellranger_pipeline.py +126 -0
- biopipen/ns/cnv.py +19 -3
- biopipen/ns/cnvkit.py +1 -1
- biopipen/ns/cnvkit_pipeline.py +20 -12
- biopipen/ns/delim.py +34 -35
- biopipen/ns/gene.py +68 -23
- biopipen/ns/gsea.py +63 -37
- biopipen/ns/misc.py +39 -14
- biopipen/ns/plot.py +304 -1
- biopipen/ns/protein.py +183 -0
- biopipen/ns/regulatory.py +290 -0
- biopipen/ns/rnaseq.py +142 -5
- biopipen/ns/scrna.py +2053 -473
- biopipen/ns/scrna_metabolic_landscape.py +228 -382
- biopipen/ns/snp.py +659 -0
- biopipen/ns/stats.py +484 -0
- biopipen/ns/tcr.py +683 -98
- biopipen/ns/vcf.py +236 -2
- biopipen/ns/web.py +97 -6
- biopipen/reports/bam/CNVpytor.svelte +4 -9
- biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
- biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
- biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
- biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
- biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
- biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
- biopipen/reports/common.svelte +15 -0
- biopipen/reports/protein/ProdigySummary.svelte +16 -0
- biopipen/reports/scrna/CellsDistribution.svelte +4 -39
- biopipen/reports/scrna/DimPlots.svelte +1 -1
- biopipen/reports/scrna/MarkersFinder.svelte +6 -126
- biopipen/reports/scrna/MetaMarkers.svelte +3 -75
- biopipen/reports/scrna/RadarPlots.svelte +4 -20
- biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
- biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
- biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
- biopipen/reports/snp/PlinkCallRate.svelte +24 -0
- biopipen/reports/snp/PlinkFreq.svelte +18 -0
- biopipen/reports/snp/PlinkHWE.svelte +18 -0
- biopipen/reports/snp/PlinkHet.svelte +18 -0
- biopipen/reports/snp/PlinkIBD.svelte +18 -0
- biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
- biopipen/reports/tcr/ClonalStats.svelte +16 -0
- biopipen/reports/tcr/CloneResidency.svelte +3 -93
- biopipen/reports/tcr/Immunarch.svelte +4 -155
- biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
- biopipen/reports/tcr/TESSA.svelte +11 -28
- biopipen/reports/utils/misc.liq +22 -7
- biopipen/scripts/bam/BamMerge.py +11 -15
- biopipen/scripts/bam/BamSampling.py +90 -0
- biopipen/scripts/bam/BamSort.py +141 -0
- biopipen/scripts/bam/BamSplitChroms.py +10 -10
- biopipen/scripts/bam/BamSubsetByBed.py +38 -0
- biopipen/scripts/bam/CNAClinic.R +41 -5
- biopipen/scripts/bam/CNVpytor.py +153 -54
- biopipen/scripts/bam/ControlFREEC.py +13 -14
- biopipen/scripts/bam/SamtoolsView.py +33 -0
- biopipen/scripts/bed/Bed2Vcf.py +5 -5
- biopipen/scripts/bed/BedConsensus.py +5 -5
- biopipen/scripts/bed/BedLiftOver.sh +6 -4
- biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
- biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
- biopipen/scripts/bed/BedtoolsMerge.py +4 -4
- biopipen/scripts/cellranger/CellRangerCount.py +138 -0
- biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
- biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
- biopipen/scripts/cnv/AneuploidyScore.R +55 -20
- biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
- biopipen/scripts/cnv/TMADScore.R +25 -9
- biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
- biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
- biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
- biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
- biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
- biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
- biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
- biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
- biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
- biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
- biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
- biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
- biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
- biopipen/scripts/cnvkit/guess_baits.py +166 -93
- biopipen/scripts/delim/RowsBinder.R +1 -1
- biopipen/scripts/delim/SampleInfo.R +116 -118
- biopipen/scripts/gene/GeneNameConversion.R +67 -0
- biopipen/scripts/gene/GenePromoters.R +61 -0
- biopipen/scripts/gsea/Enrichr.R +5 -5
- biopipen/scripts/gsea/FGSEA.R +184 -50
- biopipen/scripts/gsea/GSEA.R +2 -2
- biopipen/scripts/gsea/PreRank.R +5 -5
- biopipen/scripts/misc/Config2File.py +2 -2
- biopipen/scripts/misc/Plot.R +80 -0
- biopipen/scripts/misc/Shell.sh +15 -0
- biopipen/scripts/misc/Str2File.py +2 -2
- biopipen/scripts/plot/Heatmap.R +3 -3
- biopipen/scripts/plot/Manhattan.R +147 -0
- biopipen/scripts/plot/QQPlot.R +146 -0
- biopipen/scripts/plot/ROC.R +88 -0
- biopipen/scripts/plot/Scatter.R +112 -0
- biopipen/scripts/plot/VennDiagram.R +5 -9
- biopipen/scripts/protein/MMCIF2PDB.py +33 -0
- biopipen/scripts/protein/PDB2Fasta.py +60 -0
- biopipen/scripts/protein/Prodigy.py +119 -0
- biopipen/scripts/protein/ProdigySummary.R +140 -0
- biopipen/scripts/protein/RMSD.py +178 -0
- biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
- biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
- biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
- biopipen/scripts/regulatory/MotifScan.py +159 -0
- biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
- biopipen/scripts/regulatory/motifs-common.R +324 -0
- biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
- biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
- biopipen/scripts/rnaseq/Simulation.R +21 -0
- biopipen/scripts/rnaseq/UnitConversion.R +325 -54
- biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
- biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
- biopipen/scripts/scrna/CellCellCommunication.py +150 -0
- biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
- biopipen/scripts/scrna/CellSNPLite.py +30 -0
- biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
- biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
- biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
- biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
- biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
- biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
- biopipen/scripts/scrna/CellsDistribution.R +456 -167
- biopipen/scripts/scrna/DimPlots.R +1 -1
- biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
- biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
- biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
- biopipen/scripts/scrna/ExprImputation.R +7 -0
- biopipen/scripts/scrna/LoomTo10X.R +51 -0
- biopipen/scripts/scrna/MQuad.py +25 -0
- biopipen/scripts/scrna/MarkersFinder.R +679 -400
- biopipen/scripts/scrna/MetaMarkers.R +265 -161
- biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
- biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
- biopipen/scripts/scrna/RadarPlots.R +355 -134
- biopipen/scripts/scrna/ScFGSEA.R +298 -100
- biopipen/scripts/scrna/ScSimulation.R +65 -0
- biopipen/scripts/scrna/ScVelo.py +617 -0
- biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
- biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
- biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
- biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
- biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
- biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
- biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
- biopipen/scripts/scrna/SeuratClustering.R +36 -233
- biopipen/scripts/scrna/SeuratLoading.R +2 -2
- biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
- biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
- biopipen/scripts/scrna/SeuratPreparing.R +223 -173
- biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
- biopipen/scripts/scrna/SeuratTo10X.R +27 -0
- biopipen/scripts/scrna/Slingshot.R +65 -0
- biopipen/scripts/scrna/Subset10X.R +2 -2
- biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
- biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
- biopipen/scripts/scrna/scvelo_paga.py +313 -0
- biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
- biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
- biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
- biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
- biopipen/scripts/snp/MatrixEQTL.R +217 -0
- biopipen/scripts/snp/Plink2GTMat.py +148 -0
- biopipen/scripts/snp/PlinkCallRate.R +199 -0
- biopipen/scripts/snp/PlinkFilter.py +100 -0
- biopipen/scripts/snp/PlinkFreq.R +291 -0
- biopipen/scripts/snp/PlinkFromVcf.py +81 -0
- biopipen/scripts/snp/PlinkHWE.R +85 -0
- biopipen/scripts/snp/PlinkHet.R +96 -0
- biopipen/scripts/snp/PlinkIBD.R +196 -0
- biopipen/scripts/snp/PlinkSimulation.py +124 -0
- biopipen/scripts/snp/PlinkUpdateName.py +124 -0
- biopipen/scripts/stats/ChowTest.R +146 -0
- biopipen/scripts/stats/DiffCoexpr.R +152 -0
- biopipen/scripts/stats/LiquidAssoc.R +135 -0
- biopipen/scripts/stats/Mediation.R +108 -0
- biopipen/scripts/stats/MetaPvalue.R +130 -0
- biopipen/scripts/stats/MetaPvalue1.R +74 -0
- biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
- biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
- biopipen/scripts/tcr/Attach2Seurat.R +3 -2
- biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
- biopipen/scripts/tcr/CDR3Clustering.R +343 -0
- biopipen/scripts/tcr/ClonalStats.R +526 -0
- biopipen/scripts/tcr/CloneResidency.R +255 -131
- biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
- biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
- biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
- biopipen/scripts/tcr/GIANA/query.py +164 -162
- biopipen/scripts/tcr/Immunarch-basic.R +31 -9
- biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
- biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
- biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
- biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
- biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
- biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
- biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
- biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
- biopipen/scripts/tcr/Immunarch.R +63 -11
- biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
- biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
- biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
- biopipen/scripts/tcr/SampleDiversity.R +1 -1
- biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
- biopipen/scripts/tcr/ScRepLoading.R +166 -0
- biopipen/scripts/tcr/TCRClusterStats.R +176 -22
- biopipen/scripts/tcr/TCRDock.py +110 -0
- biopipen/scripts/tcr/TESSA.R +102 -118
- biopipen/scripts/tcr/VJUsage.R +5 -5
- biopipen/scripts/tcr/immunarch-patched.R +142 -0
- biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
- biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
- biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
- biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
- biopipen/scripts/vcf/BcftoolsSort.py +113 -0
- biopipen/scripts/vcf/BcftoolsView.py +73 -0
- biopipen/scripts/vcf/TruvariBench.sh +14 -7
- biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
- biopipen/scripts/vcf/TruvariConsistency.R +1 -1
- biopipen/scripts/vcf/Vcf2Bed.py +2 -2
- biopipen/scripts/vcf/VcfAnno.py +11 -11
- biopipen/scripts/vcf/VcfDownSample.sh +22 -10
- biopipen/scripts/vcf/VcfFilter.py +5 -5
- biopipen/scripts/vcf/VcfFix.py +7 -7
- biopipen/scripts/vcf/VcfFix_utils.py +13 -4
- biopipen/scripts/vcf/VcfIndex.py +3 -3
- biopipen/scripts/vcf/VcfIntersect.py +3 -3
- biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
- biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
- biopipen/scripts/vcf/bcftools_utils.py +52 -0
- biopipen/scripts/web/Download.py +8 -4
- biopipen/scripts/web/DownloadList.py +5 -5
- biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
- biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
- biopipen/scripts/web/gcloud_common.py +49 -0
- biopipen/utils/gene.py +108 -60
- biopipen/utils/misc.py +146 -20
- biopipen/utils/reference.py +64 -20
- biopipen/utils/reporter.py +177 -0
- biopipen/utils/vcf.py +1 -1
- biopipen-0.34.26.dist-info/METADATA +27 -0
- biopipen-0.34.26.dist-info/RECORD +292 -0
- {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
- {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
- biopipen/ns/bcftools.py +0 -111
- biopipen/ns/scrna_basic.py +0 -255
- biopipen/reports/delim/SampleInfo.svelte +0 -36
- biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
- biopipen/reports/scrna/ScFGSEA.svelte +0 -35
- biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
- biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
- biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
- biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
- biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
- biopipen/reports/utils/gsea.liq +0 -110
- biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
- biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
- biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
- biopipen/scripts/gene/GeneNameConversion.py +0 -66
- biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
- biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
- biopipen/scripts/scrna/ExprImpution.R +0 -7
- biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
- biopipen/scripts/scrna/Write10X.R +0 -11
- biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
- biopipen/scripts/tcr/TCRClustering.R +0 -280
- biopipen/utils/common_docstrs.py +0 -61
- biopipen/utils/gene.R +0 -49
- biopipen/utils/gsea.R +0 -193
- biopipen/utils/io.R +0 -20
- biopipen/utils/misc.R +0 -114
- biopipen/utils/mutate_helpers.R +0 -433
- biopipen/utils/plot.R +0 -173
- biopipen/utils/rnaseq.R +0 -48
- biopipen/utils/single_cell.R +0 -115
- biopipen-0.21.0.dist-info/METADATA +0 -22
- biopipen-0.21.0.dist-info/RECORD +0 -218
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
library(motifbreakR)
|
|
2
|
+
|
|
3
|
+
# Reference genome handed to motifbreakR when loading the variants.
bsgenome <- getBSgenome(genome_pkg)

# `snpinfo` is used below with the columns
# `chrom`, `start`, `end`, `name`, `score`, `strand`, `ref`, `alt`.
# A variant is treated as an indel when either allele is not exactly one
# character long.
is_indel <- nchar(snpinfo$ref) != 1 | nchar(snpinfo$alt) != 1

# Coordinate-based identifier written to the BED name column:
#   indels: <chrom>:<start + 1>-<end>:<ref>:<alt>
#   others: <chrom>:<end>:<ref>:<alt>
snpinfo$coordname <- ifelse(
  is_indel,
  sprintf("%s:%s-%s:%s:%s", snpinfo$chrom, snpinfo$start + 1, snpinfo$end, snpinfo$ref, snpinfo$alt),
  sprintf("%s:%s:%s:%s", snpinfo$chrom, snpinfo$end, snpinfo$ref, snpinfo$alt)
)

# Write a 6-column BED next to the other outputs and let motifbreakR load it.
motifbreakr_bed <- file.path(
  outdir,
  gsub("\\.vcf(\\.gz)?$|\\.bed$", ".motifbreakr.bed", basename(varfile))
)
write.table(
  snpinfo[, c("chrom", "start", "end", "coordname", "score", "strand")],
  file = motifbreakr_bed,
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE
)
snps <- snps.from.file(
  motifbreakr_bed,
  search.genome = bsgenome,
  format = "bed",
  indels = any(is_indel)
)

# Align `snpinfo` to the loaded variants by identifier. An elementwise `==`
# between the two vectors is only correct when both have the same length and
# order, which does not hold once the loader drops or reorders a variant.
matched <- match(snps$SNP_id, snpinfo$coordname)
variant_names <- snpinfo$name[matched]
use_coordname <- is.na(variant_names) | variant_names == "." | nchar(variant_names) == 0

# Prefer the user-supplied name; fall back to the coordinate-based identifier
# (which is what `snps$SNP_id` holds at this point) when no usable name exists.
snps@elementMetadata$SNP_id <- ifelse(use_coordname, snps$SNP_id, variant_names)

# Keep only the rows that motifbreakR actually loaded, in the same order.
snpinfo <- snpinfo[matched[!is.na(matched)], , drop = FALSE]
|
|
25
|
+
|
|
26
|
+
# prepare PWMs
|
|
27
|
+
#' Background frequency of a single nucleotide.
#'
#' Reads the `bkg.<base>` column from `mdb@elementMetadata` and uses its first
#' entry. Falls back to 0.25 when the column is missing, empty, or its first
#' entry is NA.
#'
#' @param base One of "A", "C", "G", "T".
#' @return A single numeric frequency.
get_bkg <- function(base) {
  freq <- mdb@elementMetadata[[paste0("bkg.", base)]]
  has_freq <- !is.null(freq) && length(freq) > 0 && !is.na(freq[1])
  if (has_freq) as.numeric(freq[1]) else 0.25
}

# Named vector of background frequencies in A, C, G, T order.
bkg <- vapply(c(A = "A", C = "C", G = "G", T = "T"), get_bkg, numeric(1))
|
|
37
|
+
|
|
38
|
+
# run motifbreakR
|
|
39
|
+
log$info("Running motifbreakR ...")

# Resolve the option-derived arguments first so the call itself stays short.
scoring_method <- motifbreakr_args$method
parallel_backend <- MulticoreParam(ncores)

# `cutoff` is passed as the threshold with `filterp = TRUE`;
# neutral hits are not reported (`show.neutral = FALSE`).
results <- motifbreakR(
  snpList = snps,
  pwmList = mdb,
  threshold = cutoff,
  method = scoring_method,
  bkg = bkg,
  filterp = TRUE,
  show.neutral = FALSE,
  BPPARAM = parallel_backend
)
|
|
50
|
+
|
|
51
|
+
log$info("Calculating p values ...")
results <- calculatePvalue(results)

# Running identifier used later to link table rows back to `results`.
# seq_along() stays empty for an empty result, whereas 1:length() would
# produce c(1, 0).
results$.id <- seq_along(results)

results_to_save <- as.data.frame(unname(results))

# `motifPos` and `altPos` can hold several values per hit; flatten each entry
# into one comma-separated string so the table can be written as plain text.
collapse_positions <- function(x) paste(x, collapse = ",")
results_to_save$motifPos <- vapply(
  results_to_save$motifPos, collapse_positions, character(1),
  USE.NAMES = FALSE
)
results_to_save$altPos <- vapply(
  results_to_save$altPos, collapse_positions, character(1),
  USE.NAMES = FALSE
)

# Attach the regulator of each motif, looked up by motif id in `in_motifs`.
if (!is.null(regulator_col)) {
  results_to_save$Regulator <- in_motifs[
    match(results_to_save$providerId, in_motifs[[motif_col]]),
    regulator_col,
    drop = TRUE
  ]
}

# Convert every column to character while keeping the data-frame shape.
# apply(df, 2, as.character) simplifies to a plain vector when there is exactly
# one row, which as.data.frame() then turns into a single-column table.
results_to_save[] <- lapply(results_to_save, as.character)
|
|
65
|
+
|
|
66
|
+
# Optionally keep only the requested motif-variant pairs. Pairs are encoded
# as "<providerId> // <SNP_id>".
if (!is.null(motif_var_pairs)) {
  log$info("Filtering motif-variant pairs ...")
  # Build the keys as a standalone vector instead of a temporary column:
  # paste0() returns a length-1 value for zero-length input, which cannot be
  # assigned as a column of an empty table.
  pair_keys <- paste0(results_to_save$providerId, " // ", results_to_save$SNP_id)
  keep_rows <- if (nrow(results_to_save) == 0) {
    logical(0)
  } else {
    pair_keys %in% motif_var_pairs
  }
  results_to_save <- results_to_save[keep_rows, , drop = FALSE]
}

# Tab-separated table with a header row and no row names.
write.table(
  results_to_save,
  file = file.path(outdir, "motifbreakr.txt"),
  sep = "\t", quote = FALSE, row.names = FALSE
)
|
|
78
|
+
# rm(results_to_save)
|
|
79
|
+
|
|
80
|
+
log$info("Plotting variants ...")

# Decide which variants to plot: either the ones named in `plots`, or, when
# `plots` is empty, the variants behind the top `plot_nvars` rows ranked by
# absolute allele difference.
if (is.null(plots) || length(plots) == 0) {
  results_to_save$alleleDiff <- as.numeric(results_to_save$alleleDiff)
  results_to_save <- results_to_save[order(-abs(results_to_save$alleleDiff)), , drop = FALSE]
  # seq_len() selects nothing for an empty table; 1:min(...) would yield
  # c(1, 0) and select a row of NAs.
  n_top <- min(plot_nvars, nrow(results_to_save))
  results_to_save <- results_to_save[seq_len(n_top), , drop = FALSE]
  variants <- unique(results_to_save$SNP_id)
} else {
  variants <- names(plots)
}

for (variant in variants) {
  log$info("- Variant: {variant}")

  # Fill in per-variant defaults: plot every hit, with the global device
  # parameters.
  if (is.null(plots[[variant]])) {
    plots[[variant]] <- list(devpars = devpars, which = "TRUE")
  }
  if (is.null(plots[[variant]]$which)) {
    plots[[variant]]$which <- "TRUE"
  }
  if (is.null(plots[[variant]]$devpars)) {
    plots[[variant]]$devpars <- devpars
  }

  # Hits of this variant that are still present in the saved table.
  res <- results[results$SNP_id == variant & results$.id %in% results_to_save$.id, , drop = FALSE]

  # NOTE(review): `which` is a user-supplied string that is parsed and
  # evaluated as R code to filter the hits. Only use it with trusted
  # configuration.
  res <- subset(res, subset = eval(parse(text = plots[[variant]]$which)))

  plot_variant_motifs(res, variant, plots[[variant]]$devpars, outdir)
}
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""Script for regulatory.MotifScan"""
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
# Paths may be passed in args or to motifdb
|
|
5
|
+
from pathlib import PosixPath # noqa: F401
|
|
6
|
+
from biopipen.utils.misc import run_command, dict_to_cli_args, logger
|
|
7
|
+
|
|
8
|
+
motiffile: str = {{in.motiffile | quote}} # pyright: ignore # noqa: #999
|
|
9
|
+
seqfile: str = {{in.seqfile | quote}} # pyright: ignore
|
|
10
|
+
outdir: str = {{out.outdir | quote}} # pyright: ignore
|
|
11
|
+
|
|
12
|
+
tool = {{envs.tool | repr}} # pyright: ignore
|
|
13
|
+
fimo = {{envs.fimo | repr}} # pyright: ignore
|
|
14
|
+
motif_col: str | int = {{envs.motif_col | repr}} # pyright: ignore
|
|
15
|
+
regulator_col: str | int = {{envs.regulator_col | repr}} # pyright: ignore
|
|
16
|
+
notfound = {{envs.notfound | repr}} # pyright: ignore
|
|
17
|
+
motifdb: str | None = {{envs.motifdb | repr}} # pyright: ignore
|
|
18
|
+
cutoff = {{envs.cutoff | repr}} # pyright: ignore
|
|
19
|
+
q = {{envs.q | repr}} # pyright: ignore
|
|
20
|
+
q_cutoff = {{envs.q_cutoff | repr}} # pyright: ignore
|
|
21
|
+
args: dict = {{envs.args | dict | repr}} # pyright: ignore
|
|
22
|
+
|
|
23
|
+
# Check if the tool is supported
|
|
24
|
+
# Each entry is (condition that must hold, exception type, message).
# The checks run in order and the first failing one is raised.
_input_checks = [
    (
        tool == "fimo",
        ValueError,
        f"Unsupported tool: {tool}, currently only fimo is supported",
    ),
    # The motif database must be configured
    (motifdb is not None, ValueError, "The motif database is required"),
    # The motif file must be given (non-empty path)
    (bool(motiffile), FileNotFoundError, "Motif file in.motiffile must be provided"),
    # The sequence file must be given (non-empty path)
    (bool(seqfile), FileNotFoundError, "Sequence file in.seqfile must be provided"),
]
for _ok, _exc_type, _message in _input_checks:
    if not _ok:
        raise _exc_type(_message)
|
|
38
|
+
|
|
39
|
+
# Normalize motif_col and regulator_col into 0-based indexes
|
|
40
|
+
# The header of the motif file is only needed when a column is given by name.
if any(isinstance(_col, str) for _col in (motif_col, regulator_col)):
    with open(motiffile, "r") as _motif_fh:
        header = _motif_fh.readline().strip().split("\t")


def _zero_based(col):
    """Turn a column spec into a 0-based index.

    A column name is looked up in the header (raises ValueError when absent),
    a 1-based integer is shifted down by one, and anything else (e.g. None)
    is returned unchanged.
    """
    if isinstance(col, str):
        return header.index(col)
    if isinstance(col, int):
        return col - 1
    return col


motif_col = _zero_based(motif_col)
regulator_col = _zero_based(regulator_col)
|
|
51
|
+
|
|
52
|
+
# Check if motif names exist in the database
|
|
53
|
+
# Motif names requested in the motif file (first line is the header).
with open(motiffile, "r") as _motif_fh:
    _motif_fh.readline()  # discard the header line
    motif_names = {
        _row.strip().split("\t")[motif_col]
        for _row in _motif_fh
    }

# Motif names available in the database: the text following "MOTIF " on each
# motif line.
with open(motifdb, "r") as _db_fh:
    motif_db_names = {
        _row[6:].strip()
        for _row in _db_fh
        if _row.startswith("MOTIF")
    }

# With notfound == "error", any requested motif missing from the database
# aborts the run.
if notfound == "error":
    notfound_motifs = motif_names.difference(motif_db_names)
    if notfound_motifs:
        raise ValueError(f"Motifs not found in the database: {notfound_motifs}")
|
|
71
|
+
|
|
72
|
+
# Make a new motif database with only the motifs in the motiffile
|
|
73
|
+
# Restrict to the motifs that exist in the database, then copy the database
# keeping only those motif records.
motif_names = motif_names.intersection(motif_db_names)
motifdb_filtered = f"{outdir}/motif_db.txt"
with open(motifdb, "r") as _db_in, open(motifdb_filtered, "w") as _db_out:
    # Everything before the first MOTIF line is copied unconditionally.
    _keep_record = True
    for _row in _db_in:
        if _row.startswith("MOTIF"):
            # A MOTIF line switches copying on or off for the whole record.
            _keep_record = _row[6:].strip() in motif_names
        if _keep_record:
            _db_out.write(_row)
|
|
89
|
+
|
|
90
|
+
# Now run fimo
|
|
91
|
+
# Merge the fixed options into the user-supplied ones. Keys already present
# in `args` keep their position; new keys are appended in this order.
args.update(
    {
        "": fimo,  # the executable
        "oc": f"{outdir}",
        "thresh": cutoff,
        "qv_thresh": q_cutoff,
        "no_qvalue": not q,
        "no-pgc": True,
        "_": [motifdb_filtered, seqfile],  # positional arguments
    }
)

logger.info("Running fimo ...")
_fimo_cmd = dict_to_cli_args(args, dashify=True)
run_command(_fimo_cmd, fg=True)
|
|
101
|
+
|
|
102
|
+
logger.info("Adding additional information to the output ...")

# Map each motif name to its regulator, when a regulator column is configured.
# A motif listed more than once ends up with the regulator of its last row.
motif_regulator_map = {}
if regulator_col is not None:
    with open(motiffile, "r") as _motif_fh:
        next(_motif_fh)  # skip the header line
        motif_regulator_map = {
            _fields[motif_col]: _fields[regulator_col]
            for _fields in (_row.strip().split("\t") for _row in _motif_fh)
        }
|
|
113
|
+
|
|
114
|
+
# Get the sequence name information
|
|
115
|
+
# Parse the FASTA headers of the sequence file.
#   seqnames:  full header -> short name (the part before "::" when the header
#              has the form "<name>::<chrom>:<start>-<end>...", else the header)
#   seqcoords: full header -> (chrom, start, end), or None when the header
#              carries no coordinates
seqnames = {}
seqcoords = {}

# The chromosome is any run of characters without ":" or whitespace, so
# chrX/chrY/chrM and named contigs are recognised as well as numeric
# chromosomes (with or without the "chr" prefix).
_header_pattern = re.compile(r"^(.+)::([^:\s]+):(\d+)-(\d+).*$")

with open(seqfile, "r") as f:
    for line in f:
        if not line.startswith(">"):
            continue

        seqname = line[1:].strip()
        match = _header_pattern.match(seqname)
        if not match:
            seqnames[seqname] = seqname
            seqcoords[seqname] = None
        else:
            sname, chrom, start, end = match.groups()
            seqnames[seqname] = sname
            seqcoords[seqname] = (chrom, int(start), int(end))
|
|
131
|
+
|
|
132
|
+
# Add additional information to the output
|
|
133
|
+
# Copy fimo.tsv to fimo_output.txt, appending four columns per hit:
#   regulator - regulator of the motif (the motif name itself when unmapped)
#   seqname   - short sequence name ("NA" when the sequence is unknown)
#   seqstart / seqstop - hit position shifted by the start of the sequence's
#                        region ("NA" when the header had no coordinates)
with open(f"{outdir}/fimo.tsv", "r") as f, open(f"{outdir}/fimo_output.txt", "w") as f_out:
    header = f.readline().strip().split("\t")
    f_out.write(
        "\t".join(header + ["regulator", "seqname", "seqstart", "seqstop"]) + "\n"
    )
    for line in f:
        line = line.strip()
        # Skip blank lines and "#" comment lines.
        if not line or line.startswith("#"):
            continue
        line = line.split("\t")
        motif_name = line[0]
        sequence_name = line[2]
        start = int(line[3])
        stop = int(line[4])
        regulator = motif_regulator_map.get(motif_name, motif_name)
        seqname = seqnames.get(sequence_name, "NA")
        seqcoord = seqcoords.get(sequence_name)
        if not seqcoord:
            seqstart = "NA"
            seqstop = "NA"
        else:
            # Both ends of the hit are relative to the same sequence, so both
            # are offset by the region START (seqcoord[1]). Offsetting the stop
            # by the region end would place it beyond the region.
            region_start = seqcoord[1]
            seqstart = start + region_start - 1
            seqstop = stop + region_start - 1

        f_out.write(
            "\t".join(line + [regulator, seqname, str(seqstart), str(seqstop)]) + "\n"
        )
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
{% include biopipen_dir + "/scripts/regulatory/motifs-common.R" %}
|
|
2
|
+
|
|
3
|
+
library(BSgenome)
|
|
4
|
+
library(GenomicRanges)
|
|
5
|
+
library(biopipen.utils)
|
|
6
|
+
|
|
7
|
+
infile <- {{in.infile | r}}
|
|
8
|
+
outdir <- {{out.outdir | r}}
|
|
9
|
+
genome <- {{envs.genome | r}}
|
|
10
|
+
motifdb <- {{envs.motifdb | r}}
|
|
11
|
+
motif_col <- {{envs.motif_col | r}}
|
|
12
|
+
regulator_col <- {{envs.regulator_col | r}}
|
|
13
|
+
regmotifs <- {{envs.regmotifs | r}}
|
|
14
|
+
notfound <- {{envs.notfound | r}}
|
|
15
|
+
devpars <- {{envs.devpars | r}}
|
|
16
|
+
plot_vars <- {{envs.plot_vars | r}}
|
|
17
|
+
|
|
18
|
+
# The motif database must be configured and present on disk.
if (is.null(motifdb) || !file.exists(motifdb)) {
  stop("Motif database (envs.motifdb) is required and must exist")
}

# The genome is read from envs.genome, so the message names that option.
if (is.null(genome)) {
  stop("Reference genome (envs.genome) is required")
}

# At least one of the two columns is needed to identify the motifs.
if (is.null(motif_col) && is.null(regulator_col)) {
  stop("Either motif (envs.motif_col) or regulator (envs.regulator_col) column must be provided")
}
|
|
29
|
+
|
|
30
|
+
log <- get_logger()

# Input table: tab-separated with a header; column names are kept verbatim.
log$info("Reading input data ...")
indata <- read.table(
  infile,
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE,
  check.names = FALSE
)

# ensure_regulator_motifs() and get_genome_pkg() are not defined in this
# script; they come from the code loaded at its top.
log$info("Ensuring regulators in the input data ...")
indata <- ensure_regulator_motifs(
  indata,
  outdir,
  motif_col,
  regulator_col,
  "SNP_id",
  regmotifs,
  notfound = notfound
)
genome_pkg <- get_genome_pkg(genome)

log$info("Reading motif database ...")
meme <- read_meme_to_motifdb(
  motifdb,
  indata,
  motif_col,
  regulator_col,
  notfound,
  outdir
)
|
|
41
|
+
|
|
42
|
+
log$info("Composing motifbreakR results from input data ...")

# First non-NULL value among the arguments, or NULL when all are NULL.
first_non_null <- function(...) {
  Find(Negate(is.null), list(...))
}

# Chromosome column: taken from `chrom`, `chr` or `seqnames`, in that order.
indata$chr <- first_non_null(indata$chrom, indata$chr, indata$seqnames)
indata$seqnames <- NULL

# Fill in columns that the input does not provide. Assigning NULL (no
# fallback available either) leaves the table unchanged.
indata$strand <- first_non_null(indata$strand, "+")
indata$varType <- first_non_null(indata$varType, "SNV")
indata$geneSymbol <- first_non_null(indata$geneSymbol, indata$Regulator)
indata$providerId <- first_non_null(indata$providerId, indata$motif)
indata$providerName <- first_non_null(indata$providerName, indata$providerId)
# Default data source: file name of the motif database up to its first dot.
indata$dataSource <- first_non_null(
  indata$dataSource,
  strsplit(basename(motifdb), "\\.")[[1]][1]
)
indata$effect <- first_non_null(indata$effect, "strong")
indata$altPos <- first_non_null(indata$altPos, 1)
indata$alleleDiff <- first_non_null(indata$alleleDiff, indata$score, 0)
|
|
54
|
+
|
|
55
|
+
# check other required columns
|
|
56
|
+
# These columns have no default and must be present in the input.
for (col in c("start", "end", "SNP_id", "REF", "ALT", "motifPos")) {
  if (!(col %in% colnames(indata))) {
    stop("Column '", col, "' is required in the input data")
  }
}

# `motifPos` holds comma-separated integers. It is converted to character
# first because read.table() yields a numeric column when no value contains
# a comma, and strsplit() only accepts character input.
indata$motifPos <- lapply(
  indata$motifPos,
  function(x) as.integer(unlist(strsplit(as.character(x), ",")))
)

# Convert to GRanges (input starts are 0-based) and attach the genome, the
# genome package and the motifs as attributes.
indata <- makeGRangesFromDataFrame(indata, keep.extra.columns = TRUE, starts.in.df.are.0based = TRUE)
genome(indata) <- genome
attributes(indata)$genome.package <- genome_pkg
attributes(indata)$motifs <- meme
|
|
66
|
+
|
|
67
|
+
log$info("Plotting variants ...")

# Variants to plot:
#   not configured  -> every variant in the data
#   several values  -> those values, de-duplicated
#   a single string -> split on commas
plot_vars <- if (is.null(plot_vars)) {
  unique(indata$SNP_id)
} else if (length(plot_vars) > 1) {
  unique(plot_vars)
} else {
  strsplit(plot_vars, ",")[[1]]
}

for (pvar in plot_vars) {
  log$info("- Variant: {pvar}")
  plot_variant_motifs(indata, pvar, devpars, outdir)
}
|
|
@@ -0,0 +1,324 @@
|
|
|
1
|
+
library(rlang)
|
|
2
|
+
library(universalmotif)
|
|
3
|
+
library(MotifDb)
|
|
4
|
+
library(biopipen.utils)
|
|
5
|
+
|
|
6
|
+
#' @title Common functions for regulatory analysis
|
|
7
|
+
#' @name regulatory-common
|
|
8
|
+
#' @author Panwen Wang
|
|
9
|
+
|
|
10
|
+
#' Read a regulator-motif mapping file
#'
#' The file is read as a tab-delimited table with a header line. Exactly one
#' motif column and exactly one regulator column (matched by name against the
#' allowed names) must be present; all other columns are dropped.
#'
#' @param rmfile Regulator-motif mapping file
#' @param motif_cols_allowed Allowed motif column names
#' @param reg_cols_allowed Allowed regulator column names
#' @return Data frame with two columns, keeping their names from the file:
#'  the motif column first and the regulator column second
.read_regmotifs <- function(
    rmfile,
    motif_cols_allowed = c("Motif", "motif", "MOTIF", "Model", "model", "MODEL"),
    reg_cols_allowed = c("Regulator", "regulator", "REGULATOR", "TF", "tf")
) {
    if (!file.exists(rmfile)) {
        stop("Regulator-motif mapping file does not exist.")
    }
    regmotifs <- read.table(rmfile, header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE)
    rm_motif_col <- intersect(motif_cols_allowed, colnames(regmotifs))
    rm_reg_col <- intersect(reg_cols_allowed, colnames(regmotifs))
    if (length(rm_motif_col) == 0) {
        stop(paste0("No motif column found in the regulator-motif mapping file, provide one of: ", paste(motif_cols_allowed, collapse = ", ")))
    }
    if (length(rm_reg_col) == 0) {
        stop(paste0("No regulator column found in the regulator-motif mapping file, provide one of: ", paste(reg_cols_allowed, collapse = ", ")))
    }
    if (length(rm_motif_col) > 1) {
        stop(paste0("Multiple motif columns found (", paste(rm_motif_col, collapse = ", "), ") in the regulator-motif mapping file, provide only one"))
    }
    if (length(rm_reg_col) > 1) {
        stop(paste0("Multiple regulator columns found (", paste(rm_reg_col, collapse = ", "), ") in the regulator-motif mapping file, provide only one"))
    }

    # Both selections are length one here. Motif goes first because callers
    # read the motif/regulator names from column positions 1 and 2.
    regmotifs[, c(rm_motif_col, rm_reg_col), drop = FALSE]
}
|
|
45
|
+
|
|
46
|
+
#' Report items that could not be found
#'
#' Does nothing when `notfound_items` is empty. Otherwise at most the first
#' three items are named in the message. With more than three items the
#' complete list is written to `notfound_file` (whatever `notfound` is) and a
#' second message points to that file.
#'
#' @param notfound_items Items that were not found
#' @param log_warn Function called with each message when `notfound` is "ignore"
#' @param msg Message prefix, followed by ": " and the listed items
#' @param notfound Action to take: "error" stops with the message(s), "ignore"
#'  passes them to `log_warn`; any other value reports nothing
#' @param notfound_file File to save the full list of not found items
#' @param log_indent Indentation prepended to each message
.handle_notfound_items <- function (notfound_items, log_warn, msg, notfound, notfound_file, log_indent = "") {
    n_missing <- length(notfound_items)
    if (n_missing == 0) {
        return(invisible(NULL))
    }

    overflow <- n_missing > 3
    shown <- head(notfound_items, 3)
    if (overflow) {
        shown <- c(shown, "...")
        writeLines(notfound_items, notfound_file)
    }
    headline <- paste0(log_indent, msg, ": ", paste(shown, collapse = ", "))
    pointer <- paste0(log_indent, "Check the full list in ", notfound_file)

    if (notfound == "error") {
        if (overflow) {
            stop(headline, "\n", pointer)
        }
        stop(headline)
    } else if (notfound == "ignore") {
        if (overflow) {
            log_warn(headline)
            log_warn(pointer)
        } else {
            log_warn(headline)
        }
    }
}
|
|
78
|
+
|
|
79
|
+
#' Read a MEME file to a MotifDb object
#' and filter the motifs based on the input data
#' and return the filtered MotifDb object
#' with metadata
#'
#' The metadata row of each motif is built from the attributes of the motif
#' object, plus `dataSource` (the MEME file name up to its first dot),
#' `sequenceCount`, `providerId`, `providerName` and, when `regulator_col` is
#' given, `geneSymbol`.
#'
#' @param motifdb MEME file
#' @param indata Input data frame
#' @param motif_col Column name for the motif
#' @param regulator_col Column name for the regulator, or NULL to leave
#'  `geneSymbol` out of the metadata
#' @param notfound Action to take if motifs are not found
#' @param outdir Output directory, used to save un-matched motifs
#' @return MotifDb object
#' @export
read_meme_to_motifdb <- function(motifdb, indata, motif_col, regulator_col, notfound, outdir) {
    meme <- read_meme(motifdb)
    # NOTE(review): read_meme() is documented to return a single motif object
    # rather than a list when the file holds one motif - confirm; wrapping
    # keeps the per-motif iteration below valid in that case.
    if (!is.list(meme)) {
        meme <- list(meme)
    }
    motifdb_names <- vapply(meme, function(m) m@name, character(1))
    motifs <- check_motifs(indata[[motif_col]], motifdb_names, notfound, outdir)
    meme <- filter_motifs(meme, name = motifs)
    if (!is.list(meme)) {
        meme <- list(meme)
    }
    # Get the right order of motif names
    motifs <- vapply(meme, function(m) m@name, character(1))
    motifdb_matrices <- lapply(meme, function(m) m@motif)
    names(motifdb_matrices) <- motifs

    # Same for every motif, so compute it once
    data_source <- strsplit(basename(motifdb), "\\.")[[1]][1]
    motifdb_meta <- do.call(rbind, lapply(meme, function(m) {
        ats <- attributes(m)
        ats$dataSource <- data_source
        ats$class <- NULL
        ats$motif <- NULL
        ats$gapinfo <- NULL
        ats$sequenceCount <- ats$nsites
        ats$providerId <- ats$name
        ats$providerName <- ats$name
        ats$organism <- if (is.null(ats$organism) || length(ats$organism) == 0) "Unknown" else ats$organism
        if (!is.null(regulator_col)) {
            # A motif may sit on several input rows (e.g. one per variant).
            # geneSymbol must be exactly one value per motif, otherwise
            # unlist() yields geneSymbol1..k and the rows no longer line up
            # in rbind(). as.character() also keeps factor labels rather
            # than their integer codes.
            regs <- indata[[regulator_col]][indata[[motif_col]] %in% ats$name]
            regs <- unique(as.character(regs[!is.na(regs)]))
            ats$geneSymbol <- if (length(regs) > 0) paste(regs, collapse = ",") else NA_character_
        }
        unlist(ats)
    })
    )
    rownames(motifdb_meta) <- motifs
    MotifDb:::MotifList(motifdb_matrices, tbl.metadata = motifdb_meta)
}
|
|
124
|
+
|
|
125
|
+
#' Turn a MotifDb object into a motif library
#'
#' Every matrix of the input is transposed; names of the input are kept as
#' the names of the result.
#'
#' @param motifdb MotifDb object
#' @return Motif library: a list with one transposed matrix per motif
#' @export
motifdb_to_motiflib <- function(motifdb) {
    transpose_one <- function(pwm) t(pwm)
    lapply(motifdb, transpose_one)
}
|
|
134
|
+
|
|
135
|
+
#' Make sure the input data has both the regulators and the motifs,
#' filling in the missing one from a regulator-motif mapping file
#'
#' Three cases are handled:
#' - `motif_col` is NULL: regulators are looked up in the mapping file, rows
#'   with unknown regulators are dropped and the motif column is merged in.
#' - `regulator_col` is NULL: without a mapping file the rows are made unique
#'   by motif; with one, rows with unknown motifs are dropped and the
#'   regulator column is merged in.
#' - both are given: rows are made unique by regulator, motif and variant.
#'
#' Side effect: when a column is merged in from the mapping file, its name is
#' assigned with `<<-` to `motif_col` / `regulator_col` in the environment
#' enclosing this function, so that the calling script sees the new column name.
#'
#' @param indata Input data frame
#' @param outdir Output directory, used to save un-matched regulators/motifs
#' @param motif_col Column name for the motif, or NULL
#' @param regulator_col Column name for the regulator, or NULL
#' @param var_col Column name for the variant
#' @param regmotifs Regulator-motif mapping file
#' @param log_indent Indentation for log messages
#' @param notfound Action to take if regulators/motifs are not found in the mapping file
#' @param log Logger with a `warn` function; defaults to `get_logger()` when
#'  the mapping file is consulted
#' @return Data frame with regulators and motifs
#' @export
ensure_regulator_motifs <- function (indata, outdir, motif_col, regulator_col, var_col, regmotifs, log_indent = "", notfound = "error", log = NULL) {
    if (is.null(motif_col)) {
        if (is.null(regmotifs)) {
            stop("Regulator-motif mapping file (envs.regmotifs) is required when no motif column (envs.motif_col) is provided")
        }
        log <- log %||% get_logger()
        regmotifs <- .read_regmotifs(regmotifs)
        rm_motif_col <- colnames(regmotifs)[1]
        rm_reg_col <- colnames(regmotifs)[2]
        # check regulators
        rm_regs <- regmotifs[[rm_reg_col]]
        regulators <- indata[[regulator_col]]
        notfound_regs <- setdiff(regulators, rm_regs)
        .handle_notfound_items(
            notfound_regs,
            log$warn,
            "The following regulators were not found in the regulator-motif mapping file",
            notfound,
            file.path(outdir, "notfound_regulators.txt"),
            log_indent
        )
        indata <- indata[indata[[regulator_col]] %in% rm_regs, , drop = FALSE]
        # add motif column
        indata <- merge(indata, regmotifs, by.x = regulator_col, by.y = rm_reg_col, all.x = TRUE, suffixes = c("", "_db"))
        # update motif column (in the enclosing environment, for the caller)
        motif_col <<- rm_motif_col
    } else if (is.null(regulator_col)) {
        if (is.null(regmotifs) || (is.character(regmotifs) && nchar(regmotifs) == 0)) {
            # make motifs unique
            indata <- indata[!duplicated(indata[[motif_col]]), , drop = FALSE]
        } else if (!file.exists(regmotifs)) {
            stop("Regulator-motif mapping file (envs.regmotifs) does not exist.")
        } else {
            # The logger must be resolved in this branch too: with the default
            # log = NULL, log$warn would be NULL and reporting missing motifs
            # would fail with a non-function call.
            log <- log %||% get_logger()
            # map the regulators
            regmotifs <- .read_regmotifs(regmotifs)
            rm_motif_col <- colnames(regmotifs)[1]
            rm_reg_col <- colnames(regmotifs)[2]
            rm_motifs <- regmotifs[[rm_motif_col]]
            motifs <- indata[[motif_col]]
            notfound_motifs <- setdiff(motifs, rm_motifs)
            .handle_notfound_items(
                notfound_motifs,
                log$warn,
                "The following motifs were not found in the regulator-motif mapping file",
                notfound,
                file.path(outdir, "notfound_motifs.txt"),
                log_indent
            )
            indata <- indata[indata[[motif_col]] %in% rm_motifs, , drop = FALSE]
            # add regulator column
            indata <- merge(indata, regmotifs, by.x = motif_col, by.y = rm_motif_col, all.x = TRUE, suffixes = c("", "_db"))
            # update regulator column (in the enclosing environment, for the caller)
            regulator_col <<- rm_reg_col
        }
    } else {
        indata <- indata[!duplicated(indata[, c(regulator_col, motif_col, var_col), drop = FALSE]), , drop = FALSE]
    }

    return(indata)
}
|
|
207
|
+
|
|
208
|
+
#' Resolve and attach the genome package for a genome
#'
#' A genome name without a dot (e.g. "hg19") is expanded to
#' "BSgenome.Hsapiens.UCSC.<genome>"; a name containing a dot is taken to be
#' the package name itself. The package is then attached with `library()`.
#'
#' @param genome Genome name, or full genome package name
#' @return Genome package name; an error is raised if the package is not installed
#' @export
get_genome_pkg <- function(genome) {
    is_pkg_name <- grepl(".", genome, fixed = TRUE)
    genome_pkg <- if (is_pkg_name) genome else sprintf("BSgenome.Hsapiens.UCSC.%s", genome)

    installed <- requireNamespace(genome_pkg, quietly = TRUE)
    if (!installed) {
        stop(sprintf("Genome package %s is not installed", genome_pkg))
    }

    library(package = genome_pkg, character.only = TRUE)
    genome_pkg
}
|
|
226
|
+
|
|
227
|
+
#' Keep only the motifs that exist in the motif database
#'
#' When every motif is found, `motifs` is returned untouched. Otherwise the
#' missing motifs are reported (at most the first three are named; with more
#' than three the full list is written to "<outdir>/notfound_motifs.txt") and
#' the found motifs are returned as a set, i.e. without duplicates.
#'
#' @param motifs Motifs to check
#' @param all_motifs All motifs in the motif database
#' @param notfound Action to take if motifs are not found: "error" stops,
#'  "ignore" logs warnings; any other value reports nothing
#' @param outdir Output directory, used to save un-matched motifs
#' @param log Logger with a `warn` function; defaults to `get_logger()`
#' @return Motifs that are found
#' @export
check_motifs <- function(motifs, all_motifs, notfound, outdir, log = NULL) {
    log <- log %||% get_logger()
    missing_motifs <- setdiff(motifs, all_motifs)
    n_missing <- length(missing_motifs)
    if (n_missing == 0) {
        return(motifs)
    }

    overflow <- n_missing > 3
    shown <- head(missing_motifs, 3)
    if (overflow) {
        shown <- c(shown, "...")
    }
    report <- paste0("The following motifs were not found in the motif database: ", paste(shown, collapse = ", "))
    if (overflow) {
        # Too many to list inline: dump them all to a file and point to it
        listing <- file.path(outdir, "notfound_motifs.txt")
        writeLines(missing_motifs, listing)
        report <- c(report, paste0("Check the full list in ", listing))
    }

    if (notfound == "error") {
        stop(paste(report, collapse = "\n"))
    } else if (notfound == "ignore") {
        for (line in report) {
            log$warn(line)
        }
    }

    setdiff(motifs, missing_motifs)
}
|
|
267
|
+
|
|
268
|
+
#' Plot a genomic region surrounding a genomic variant, and
#' potentially disrupted motifs.
#'
#' @param results The motifbreakR results.
#' A GRanges object with the following columns:
#' - seqnames: Chromosome
#' - ranges: Start and end positions
#' - strand: Strand
#' -------------------
#' - SNP_id: Variant ID
#' - REF: Reference allele
#' - ALT: Alternative allele
#' - varType: Variant type. By default, "SNV"
#' - motifPos: Motif positions
#' - geneSymbol: Gene symbol, if not provided, try to get from the Regulator column
#' - dataSource: Motif database source
#' - providerName: Motif name
#' - providerId: Motif ID
#' - effect: Effect of the variant. By default, "strong"
#' - altPos: Alternative allele position. By default, 1
#' - alleleDiff: Allele difference, default 0, does not affect the plot for SNVs
#'
#' Attributes:
#' - genome.package: Genome package name
#' - motifs: Motif database, in MotifDb::MotifList format
#' @param variant Variant ID to be plotted
#' @param devpars List of device parameters, passed to `png()`
#' - res: Resolution, default 100
#' - width: Width of the plot, default NULL, calculated based on sequence length
#' - height: Height of the plot, default NULL, calculated based on the number of motifs
#' @param outdir Output directory. Plots will be saved in the sub-directory "<outdir>/plots/"
#' @export
plot_variant_motifs <- function(results, variant, devpars, outdir) {
    plotdir <- file.path(outdir, "plots")
    dir.create(plotdir, showWarnings = FALSE)

    res <- results[results$SNP_id == variant, , drop = FALSE]
    devpars <- devpars %||% list(res = 100, width = NULL, height = NULL)
    if (length(res) == 0) {
        stop(sprintf("Variant %s not found in results", variant))
    }
    devpars$res <- devpars$res %||% 100
    # The parentheses matter: %||% binds tighter than * and +, so without them
    # a user-supplied height would be multiplied by the resolution and then
    # have the automatic height added on top.
    devpars$height <- devpars$height %||% (2.4 * devpars$res + length(res) * 1.2 * devpars$res)
    if (is.null(devpars$width)) {
        # Span from the left-most motif start to the right-most motif end
        left <- min(sapply(res$motifPos, `[`, 1))
        right <- max(sapply(res$motifPos, `[`, 2))
        devpars$width <- 1.5 * devpars$res + (right - left) * 0.3 * devpars$res
        devpars$width <- max(devpars$width, 5 * devpars$res)
    }

    plotfile <- file.path(plotdir, sprintf("%s.png", slugify(variant)))
    # fix motifBreakR 2.12 using names to filter in plotMB
    names(res) <- res$SNP_id
    png(plotfile, width = devpars$width, height = devpars$height, res = devpars$res)
    # Close the device before re-raising, so a failed plot does not leave
    # an open graphics device behind
    tryCatch(
        motifbreakR::plotMB(res, variant),
        error = function(e) {
            dev.off()
            stop(e)
        }
    )
    dev.off()
}
|