biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +28 -0
- biopipen/core/filters.py +79 -4
- biopipen/core/proc.py +12 -3
- biopipen/core/testing.py +75 -3
- biopipen/ns/bam.py +148 -6
- biopipen/ns/bed.py +75 -0
- biopipen/ns/cellranger.py +186 -0
- biopipen/ns/cellranger_pipeline.py +126 -0
- biopipen/ns/cnv.py +19 -3
- biopipen/ns/cnvkit.py +1 -1
- biopipen/ns/cnvkit_pipeline.py +20 -12
- biopipen/ns/delim.py +34 -35
- biopipen/ns/gene.py +68 -23
- biopipen/ns/gsea.py +63 -37
- biopipen/ns/misc.py +39 -14
- biopipen/ns/plot.py +304 -1
- biopipen/ns/protein.py +183 -0
- biopipen/ns/regulatory.py +290 -0
- biopipen/ns/rnaseq.py +142 -5
- biopipen/ns/scrna.py +2053 -473
- biopipen/ns/scrna_metabolic_landscape.py +228 -382
- biopipen/ns/snp.py +659 -0
- biopipen/ns/stats.py +484 -0
- biopipen/ns/tcr.py +683 -98
- biopipen/ns/vcf.py +236 -2
- biopipen/ns/web.py +97 -6
- biopipen/reports/bam/CNVpytor.svelte +4 -9
- biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
- biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
- biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
- biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
- biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
- biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
- biopipen/reports/common.svelte +15 -0
- biopipen/reports/protein/ProdigySummary.svelte +16 -0
- biopipen/reports/scrna/CellsDistribution.svelte +4 -39
- biopipen/reports/scrna/DimPlots.svelte +1 -1
- biopipen/reports/scrna/MarkersFinder.svelte +6 -126
- biopipen/reports/scrna/MetaMarkers.svelte +3 -75
- biopipen/reports/scrna/RadarPlots.svelte +4 -20
- biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
- biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
- biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
- biopipen/reports/snp/PlinkCallRate.svelte +24 -0
- biopipen/reports/snp/PlinkFreq.svelte +18 -0
- biopipen/reports/snp/PlinkHWE.svelte +18 -0
- biopipen/reports/snp/PlinkHet.svelte +18 -0
- biopipen/reports/snp/PlinkIBD.svelte +18 -0
- biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
- biopipen/reports/tcr/ClonalStats.svelte +16 -0
- biopipen/reports/tcr/CloneResidency.svelte +3 -93
- biopipen/reports/tcr/Immunarch.svelte +4 -155
- biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
- biopipen/reports/tcr/TESSA.svelte +11 -28
- biopipen/reports/utils/misc.liq +22 -7
- biopipen/scripts/bam/BamMerge.py +11 -15
- biopipen/scripts/bam/BamSampling.py +90 -0
- biopipen/scripts/bam/BamSort.py +141 -0
- biopipen/scripts/bam/BamSplitChroms.py +10 -10
- biopipen/scripts/bam/BamSubsetByBed.py +38 -0
- biopipen/scripts/bam/CNAClinic.R +41 -5
- biopipen/scripts/bam/CNVpytor.py +153 -54
- biopipen/scripts/bam/ControlFREEC.py +13 -14
- biopipen/scripts/bam/SamtoolsView.py +33 -0
- biopipen/scripts/bed/Bed2Vcf.py +5 -5
- biopipen/scripts/bed/BedConsensus.py +5 -5
- biopipen/scripts/bed/BedLiftOver.sh +6 -4
- biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
- biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
- biopipen/scripts/bed/BedtoolsMerge.py +4 -4
- biopipen/scripts/cellranger/CellRangerCount.py +138 -0
- biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
- biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
- biopipen/scripts/cnv/AneuploidyScore.R +55 -20
- biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
- biopipen/scripts/cnv/TMADScore.R +25 -9
- biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
- biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
- biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
- biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
- biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
- biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
- biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
- biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
- biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
- biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
- biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
- biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
- biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
- biopipen/scripts/cnvkit/guess_baits.py +166 -93
- biopipen/scripts/delim/RowsBinder.R +1 -1
- biopipen/scripts/delim/SampleInfo.R +116 -118
- biopipen/scripts/gene/GeneNameConversion.R +67 -0
- biopipen/scripts/gene/GenePromoters.R +61 -0
- biopipen/scripts/gsea/Enrichr.R +5 -5
- biopipen/scripts/gsea/FGSEA.R +184 -50
- biopipen/scripts/gsea/GSEA.R +2 -2
- biopipen/scripts/gsea/PreRank.R +5 -5
- biopipen/scripts/misc/Config2File.py +2 -2
- biopipen/scripts/misc/Plot.R +80 -0
- biopipen/scripts/misc/Shell.sh +15 -0
- biopipen/scripts/misc/Str2File.py +2 -2
- biopipen/scripts/plot/Heatmap.R +3 -3
- biopipen/scripts/plot/Manhattan.R +147 -0
- biopipen/scripts/plot/QQPlot.R +146 -0
- biopipen/scripts/plot/ROC.R +88 -0
- biopipen/scripts/plot/Scatter.R +112 -0
- biopipen/scripts/plot/VennDiagram.R +5 -9
- biopipen/scripts/protein/MMCIF2PDB.py +33 -0
- biopipen/scripts/protein/PDB2Fasta.py +60 -0
- biopipen/scripts/protein/Prodigy.py +119 -0
- biopipen/scripts/protein/ProdigySummary.R +140 -0
- biopipen/scripts/protein/RMSD.py +178 -0
- biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
- biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
- biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
- biopipen/scripts/regulatory/MotifScan.py +159 -0
- biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
- biopipen/scripts/regulatory/motifs-common.R +324 -0
- biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
- biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
- biopipen/scripts/rnaseq/Simulation.R +21 -0
- biopipen/scripts/rnaseq/UnitConversion.R +325 -54
- biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
- biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
- biopipen/scripts/scrna/CellCellCommunication.py +150 -0
- biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
- biopipen/scripts/scrna/CellSNPLite.py +30 -0
- biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
- biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
- biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
- biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
- biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
- biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
- biopipen/scripts/scrna/CellsDistribution.R +456 -167
- biopipen/scripts/scrna/DimPlots.R +1 -1
- biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
- biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
- biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
- biopipen/scripts/scrna/ExprImputation.R +7 -0
- biopipen/scripts/scrna/LoomTo10X.R +51 -0
- biopipen/scripts/scrna/MQuad.py +25 -0
- biopipen/scripts/scrna/MarkersFinder.R +679 -400
- biopipen/scripts/scrna/MetaMarkers.R +265 -161
- biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
- biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
- biopipen/scripts/scrna/RadarPlots.R +355 -134
- biopipen/scripts/scrna/ScFGSEA.R +298 -100
- biopipen/scripts/scrna/ScSimulation.R +65 -0
- biopipen/scripts/scrna/ScVelo.py +617 -0
- biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
- biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
- biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
- biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
- biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
- biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
- biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
- biopipen/scripts/scrna/SeuratClustering.R +36 -233
- biopipen/scripts/scrna/SeuratLoading.R +2 -2
- biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
- biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
- biopipen/scripts/scrna/SeuratPreparing.R +223 -173
- biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
- biopipen/scripts/scrna/SeuratTo10X.R +27 -0
- biopipen/scripts/scrna/Slingshot.R +65 -0
- biopipen/scripts/scrna/Subset10X.R +2 -2
- biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
- biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
- biopipen/scripts/scrna/scvelo_paga.py +313 -0
- biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
- biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
- biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
- biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
- biopipen/scripts/snp/MatrixEQTL.R +217 -0
- biopipen/scripts/snp/Plink2GTMat.py +148 -0
- biopipen/scripts/snp/PlinkCallRate.R +199 -0
- biopipen/scripts/snp/PlinkFilter.py +100 -0
- biopipen/scripts/snp/PlinkFreq.R +291 -0
- biopipen/scripts/snp/PlinkFromVcf.py +81 -0
- biopipen/scripts/snp/PlinkHWE.R +85 -0
- biopipen/scripts/snp/PlinkHet.R +96 -0
- biopipen/scripts/snp/PlinkIBD.R +196 -0
- biopipen/scripts/snp/PlinkSimulation.py +124 -0
- biopipen/scripts/snp/PlinkUpdateName.py +124 -0
- biopipen/scripts/stats/ChowTest.R +146 -0
- biopipen/scripts/stats/DiffCoexpr.R +152 -0
- biopipen/scripts/stats/LiquidAssoc.R +135 -0
- biopipen/scripts/stats/Mediation.R +108 -0
- biopipen/scripts/stats/MetaPvalue.R +130 -0
- biopipen/scripts/stats/MetaPvalue1.R +74 -0
- biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
- biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
- biopipen/scripts/tcr/Attach2Seurat.R +3 -2
- biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
- biopipen/scripts/tcr/CDR3Clustering.R +343 -0
- biopipen/scripts/tcr/ClonalStats.R +526 -0
- biopipen/scripts/tcr/CloneResidency.R +255 -131
- biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
- biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
- biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
- biopipen/scripts/tcr/GIANA/query.py +164 -162
- biopipen/scripts/tcr/Immunarch-basic.R +31 -9
- biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
- biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
- biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
- biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
- biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
- biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
- biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
- biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
- biopipen/scripts/tcr/Immunarch.R +63 -11
- biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
- biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
- biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
- biopipen/scripts/tcr/SampleDiversity.R +1 -1
- biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
- biopipen/scripts/tcr/ScRepLoading.R +166 -0
- biopipen/scripts/tcr/TCRClusterStats.R +176 -22
- biopipen/scripts/tcr/TCRDock.py +110 -0
- biopipen/scripts/tcr/TESSA.R +102 -118
- biopipen/scripts/tcr/VJUsage.R +5 -5
- biopipen/scripts/tcr/immunarch-patched.R +142 -0
- biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
- biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
- biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
- biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
- biopipen/scripts/vcf/BcftoolsSort.py +113 -0
- biopipen/scripts/vcf/BcftoolsView.py +73 -0
- biopipen/scripts/vcf/TruvariBench.sh +14 -7
- biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
- biopipen/scripts/vcf/TruvariConsistency.R +1 -1
- biopipen/scripts/vcf/Vcf2Bed.py +2 -2
- biopipen/scripts/vcf/VcfAnno.py +11 -11
- biopipen/scripts/vcf/VcfDownSample.sh +22 -10
- biopipen/scripts/vcf/VcfFilter.py +5 -5
- biopipen/scripts/vcf/VcfFix.py +7 -7
- biopipen/scripts/vcf/VcfFix_utils.py +13 -4
- biopipen/scripts/vcf/VcfIndex.py +3 -3
- biopipen/scripts/vcf/VcfIntersect.py +3 -3
- biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
- biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
- biopipen/scripts/vcf/bcftools_utils.py +52 -0
- biopipen/scripts/web/Download.py +8 -4
- biopipen/scripts/web/DownloadList.py +5 -5
- biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
- biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
- biopipen/scripts/web/gcloud_common.py +49 -0
- biopipen/utils/gene.py +108 -60
- biopipen/utils/misc.py +146 -20
- biopipen/utils/reference.py +64 -20
- biopipen/utils/reporter.py +177 -0
- biopipen/utils/vcf.py +1 -1
- biopipen-0.34.26.dist-info/METADATA +27 -0
- biopipen-0.34.26.dist-info/RECORD +292 -0
- {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
- {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
- biopipen/ns/bcftools.py +0 -111
- biopipen/ns/scrna_basic.py +0 -255
- biopipen/reports/delim/SampleInfo.svelte +0 -36
- biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
- biopipen/reports/scrna/ScFGSEA.svelte +0 -35
- biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
- biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
- biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
- biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
- biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
- biopipen/reports/utils/gsea.liq +0 -110
- biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
- biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
- biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
- biopipen/scripts/gene/GeneNameConversion.py +0 -66
- biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
- biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
- biopipen/scripts/scrna/ExprImpution.R +0 -7
- biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
- biopipen/scripts/scrna/Write10X.R +0 -11
- biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
- biopipen/scripts/tcr/TCRClustering.R +0 -280
- biopipen/utils/common_docstrs.py +0 -61
- biopipen/utils/gene.R +0 -49
- biopipen/utils/gsea.R +0 -193
- biopipen/utils/io.R +0 -20
- biopipen/utils/misc.R +0 -114
- biopipen/utils/mutate_helpers.R +0 -433
- biopipen/utils/plot.R +0 -173
- biopipen/utils/rnaseq.R +0 -48
- biopipen/utils/single_cell.R +0 -115
- biopipen-0.21.0.dist-info/METADATA +0 -22
- biopipen-0.21.0.dist-info/RECORD +0 -218
biopipen/ns/protein.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""Protein-related processes."""
|
|
2
|
+
from ..core.proc import Proc
|
|
3
|
+
from ..core.config import config
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Prodigy(Proc):
|
|
7
|
+
"""Prediction of binding affinity of protein-protein complexes based on
|
|
8
|
+
intermolecular contacts using Prodigy.
|
|
9
|
+
|
|
10
|
+
See <https://rascar.science.uu.nl/prodigy/> and
|
|
11
|
+
<https://github.com/haddocking/prodigy>.
|
|
12
|
+
|
|
13
|
+
`prodigy-prot` must be installed under the given python of `proc.lang`.
|
|
14
|
+
|
|
15
|
+
Input:
|
|
16
|
+
infile: The structure file in PDB or mmCIF format.
|
|
17
|
+
|
|
18
|
+
Output:
|
|
19
|
+
outfile: The output file generated by Prodigy.
|
|
20
|
+
outdir: The output directory containing all output files.
|
|
21
|
+
|
|
22
|
+
Envs:
|
|
23
|
+
distance_cutoff (type=float): The distance cutoff to calculate intermolecular
|
|
24
|
+
contacts.
|
|
25
|
+
acc_threshold (type=float): The accessibility threshold for BSA analysis.
|
|
26
|
+
temperature (type=float): The temperature (C) for Kd prediction.
|
|
27
|
+
contact_list (flag): Whether to generate contact list.
|
|
28
|
+
pymol_selection (flag): Whether to output a script to highlight the interface
|
|
29
|
+
residues in PyMOL.
|
|
30
|
+
selection (list): The selection of the chains to analyze.
|
|
31
|
+
`['A', 'B']` will analyze chains A and B.
|
|
32
|
+
`['A,B', 'C']` will analyze chains A and C, and chains B and C.
|
|
33
|
+
`['A', 'B', 'C']` will analyze all combinations of A, B, and C.
|
|
34
|
+
outtype (choice): Set the format of the output file (`out.outfile`).
|
|
35
|
+
All three files will be generated. This option only determines which
|
|
36
|
+
is assigned to `out.outfile`.
|
|
37
|
+
- raw: The raw output file from prodigy.
|
|
38
|
+
- json: The output file in JSON format.
|
|
39
|
+
- tsv: The output file in TSV format.
|
|
40
|
+
"""
|
|
41
|
+
input = "infile:file"
|
|
42
|
+
output = [
|
|
43
|
+
"outfile:file:{{in.infile | stem}}_prodigy/"
|
|
44
|
+
"{{in.infile | stem}}.{{envs.outtype if envs.outtype != 'raw' else 'out'}}",
|
|
45
|
+
"outdir:dir:{{in.infile | stem}}_prodigy",
|
|
46
|
+
]
|
|
47
|
+
lang = config.lang.python
|
|
48
|
+
envs = {
|
|
49
|
+
"distance_cutoff": 5.5,
|
|
50
|
+
"acc_threshold": 0.05,
|
|
51
|
+
"temperature": 25.0,
|
|
52
|
+
"contact_list": True,
|
|
53
|
+
"pymol_selection": True,
|
|
54
|
+
"selection": None,
|
|
55
|
+
"outtype": "json",
|
|
56
|
+
}
|
|
57
|
+
script = "file://../scripts/protein/Prodigy.py"
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class ProdigySummary(Proc):
|
|
61
|
+
"""Summary of the output from `Prodigy`.
|
|
62
|
+
|
|
63
|
+
Input:
|
|
64
|
+
infiles: The `_prodigy.tsv` files in the output directories generated by `Prodigy`.
|
|
65
|
+
|
|
66
|
+
Output:
|
|
67
|
+
outdir: The directory of summary files generated by `ProdigySummary`.
|
|
68
|
+
|
|
69
|
+
Envs:
|
|
70
|
+
group (type=auto): The group of the samples for boxplots.
|
|
71
|
+
If `None`, don't do boxplots.
|
|
72
|
+
It can be a dict of group names and sample names, e.g.
|
|
73
|
+
`{"group1": ["sample1", "sample2"], "group2": ["sample3"]}`
|
|
74
|
+
or a file containing the group information, with the first column
|
|
75
|
+
being the sample names and the second column being the group names.
|
|
76
|
+
The file should be tab-delimited with no header.
|
|
77
|
+
"""
|
|
78
|
+
input = "infiles:files"
|
|
79
|
+
input_data = lambda ch: [[f"{odir}/_prodigy.tsv" for odir in ch.outdir]]
|
|
80
|
+
output = "outdir:dir:prodigy_summary"
|
|
81
|
+
lang = config.lang.rscript
|
|
82
|
+
envs = {"group": None}
|
|
83
|
+
script = "file://../scripts/protein/ProdigySummary.R"
|
|
84
|
+
plugin_opts = {"report": "file://../reports/protein/ProdigySummary.svelte"}
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class MMCIF2PDB(Proc):
|
|
88
|
+
"""Convert mmCIF or PDBx file to PDB file.
|
|
89
|
+
|
|
90
|
+
Using MAXIT or [BeEM](https://github.com/kad-ecoli/BeEM).
|
|
91
|
+
|
|
92
|
+
Input:
|
|
93
|
+
infile: The input mmCIF or PDBx file.
|
|
94
|
+
|
|
95
|
+
Output:
|
|
96
|
+
outfile: The output PDB file.
|
|
97
|
+
For BeEM, "outfmt" is set to 3 to always output a single PDB file.
|
|
98
|
+
|
|
99
|
+
Envs:
|
|
100
|
+
tool (choice): The tool to use for conversion.
|
|
101
|
+
- maxit: Use MAXIT.
|
|
102
|
+
- beem: Use BeEM.
|
|
103
|
+
maxit: The path to the MAXIT executable.
|
|
104
|
+
beem: The path to the BeEM executable.
|
|
105
|
+
<more>: Other options for MAXIT/BeEM.
|
|
106
|
+
For BeEM, "outfmt" will not be used as it is set to 3.
|
|
107
|
+
"""
|
|
108
|
+
input = "infile:file"
|
|
109
|
+
output = "outfile:file:{{in.infile | stem}}.pdb"
|
|
110
|
+
lang = config.lang.python
|
|
111
|
+
envs = {
|
|
112
|
+
"tool": "maxit",
|
|
113
|
+
"maxit": config.exe.maxit,
|
|
114
|
+
"beem": config.exe.beem,
|
|
115
|
+
}
|
|
116
|
+
script = "file://../scripts/protein/MMCIF2PDB.py"
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class RMSD(Proc):
|
|
120
|
+
"""Calculate the RMSD between two structures.
|
|
121
|
+
|
|
122
|
+
See also https://github.com/charnley/rmsd.
|
|
123
|
+
|
|
124
|
+
If an input is in mmCIF format, it is converted to PDB first.
|
|
125
|
+
|
|
126
|
+
Input:
|
|
127
|
+
infile1: The first structure file.
|
|
128
|
+
infile2: The second structure file.
|
|
129
|
+
|
|
130
|
+
Output:
|
|
131
|
+
outfile: The output file containing the RMSD value.
|
|
132
|
+
|
|
133
|
+
Envs:
|
|
134
|
+
beem: The path to the BeEM executable.
|
|
135
|
+
calculate_rmsd: The path to the calculate_rmsd executable.
|
|
136
|
+
conv_tool (choice): The tool to use for conversion.
|
|
137
|
+
- maxit: Use MAXIT.
|
|
138
|
+
- beem: Use BeEM.
|
|
139
|
+
ca_only (flag): Whether to calculate RMSD using only C-alpha atoms.
|
|
140
|
+
duel (choice): How to handle the duel atoms. Default is "keep".
|
|
141
|
+
- keep: Keep both atoms.
|
|
142
|
+
- keep_first: Keep the first atom.
|
|
143
|
+
- keep_last: Keep the last atom.
|
|
144
|
+
- average: Average the coordinates.
|
|
145
|
+
reorder (flag): Whether to reorder the atoms in the structures.
|
|
146
|
+
<more>: Other options for calculate_rmsd.
|
|
147
|
+
"""
|
|
148
|
+
input = "infile1:file, infile2:file"
|
|
149
|
+
output = "outfile:file:{{in.infile1 | stem}}-{{in.infile2 | stem}}.rmsd.txt"
|
|
150
|
+
lang = config.lang.python
|
|
151
|
+
envs = {
|
|
152
|
+
"maxit": config.exe.maxit,
|
|
153
|
+
"beem": config.exe.beem,
|
|
154
|
+
"calculate_rmsd": config.exe.calculate_rmsd,
|
|
155
|
+
"conv_tool": "maxit",
|
|
156
|
+
"ca_only": False,
|
|
157
|
+
"duel": "keep",
|
|
158
|
+
"reorder": True,
|
|
159
|
+
}
|
|
160
|
+
script = "file://../scripts/protein/RMSD.py"
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
class PDB2Fasta(Proc):
|
|
164
|
+
"""Convert PDB file to FASTA file.
|
|
165
|
+
|
|
166
|
+
Input:
|
|
167
|
+
infile: The input PDB file.
|
|
168
|
+
|
|
169
|
+
Output:
|
|
170
|
+
outfile: The output FASTA file.
|
|
171
|
+
|
|
172
|
+
Envs:
|
|
173
|
+
chains (auto): The chains to extract. A list of chain IDs, or a string of IDs separated by
|
|
174
|
+
commas.
|
|
175
|
+
If None, extract all chains.
|
|
176
|
+
wrap (type=int): The number of residues per line in the output FASTA
|
|
177
|
+
file. Set to 0 to disable wrapping.
|
|
178
|
+
"""
|
|
179
|
+
input = "infile:file"
|
|
180
|
+
output = "outfile:file:{{in.infile | stem}}.fasta"
|
|
181
|
+
lang = config.lang.python
|
|
182
|
+
envs = {"chains": None, "wrap": 80}
|
|
183
|
+
script = "file://../scripts/protein/PDB2Fasta.py"
|
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
"""Provides processes for the regulatory related"""
|
|
2
|
+
|
|
3
|
+
from ..core.proc import Proc
|
|
4
|
+
from ..core.config import config
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class MotifScan(Proc):
|
|
8
|
+
"""Scan the input sequences for binding sites using motifs.
|
|
9
|
+
|
|
10
|
+
Currently only [fimo](https://meme-suite.org/meme/tools/fimo) from MEME suite
|
|
11
|
+
is supported, based on the research/comparisons done by the following reference.
|
|
12
|
+
|
|
13
|
+
Reference:
|
|
14
|
+
- [Evaluating tools for transcription factor binding site prediction](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6889335/)
|
|
15
|
+
|
|
16
|
+
Input:
|
|
17
|
+
motiffile: File containing motif names.
|
|
18
|
+
The file contains the motif and regulator names.
|
|
19
|
+
The motif names should match the names in the motif database.
|
|
20
|
+
This file must have a header.
|
|
21
|
+
If multiple columns are present, it should be delimited by tab.
|
|
22
|
+
seqfile: File containing sequences in FASTA format.
|
|
23
|
+
|
|
24
|
+
Output:
|
|
25
|
+
outdir: Directory containing the results.
|
|
26
|
+
Especially `fimo_output.txt` extending from `fimo.tsv`, which contains:
|
|
27
|
+
1. the results with the regulator information if `envs.regulator_col`
|
|
28
|
+
is provided, otherwise, the `regulator` columns will be filled with
|
|
29
|
+
the motif names.
|
|
30
|
+
2. the original sequence from the fasta file (in.seqfile)
|
|
31
|
+
3. corrected genomic coordinates if the genomic coordinates are included
|
|
32
|
+
in the sequence names.
|
|
33
|
+
|
|
34
|
+
See also the `Output` section of
|
|
35
|
+
<https://meme-suite.org/meme/doc/fimo.html>.
|
|
36
|
+
Note that `--no-pgc` is passed to fimo to not parse the genomic coordinates
|
|
37
|
+
from the sequence names by fimo. When fimo parses the genomic coordinates,
|
|
38
|
+
`DDX11L1` in `>DDX11L1::chr1:11869-14412` will be lost.
|
|
39
|
+
The purpose of this is to keep the sequence names as they are in the output.
|
|
40
|
+
If the sequence names are in the format of `>NAME::chr1:START-END`, we will
|
|
41
|
+
correct the coordinates in the output.
|
|
42
|
+
Also note that it requires meme/fimo v5.5.5+ to do this
|
|
43
|
+
(where the --no-pgc option is available).
|
|
44
|
+
|
|
45
|
+
Envs:
|
|
46
|
+
tool (choice): The tool to use for scanning.
|
|
47
|
+
Currently only fimo is supported.
|
|
48
|
+
- fimo: Use fimo from MEME suite.
|
|
49
|
+
fimo: The path to fimo binary.
|
|
50
|
+
motif_col: The column name in the motif file containing the motif names.
|
|
51
|
+
regulator_col: The column name in the motif file containing the regulator names.
|
|
52
|
+
Both `motif_col` and `regulator_col` should be the direct column names or
|
|
53
|
+
the index (1-based) of the columns.
|
|
54
|
+
If no `regulator_col` is provided, no regulator information is written in
|
|
55
|
+
the output.
|
|
56
|
+
notfound (choice): What to do if a motif is not found in the database.
|
|
57
|
+
- error: Report error and stop the process.
|
|
58
|
+
- ignore: Ignore the motif and continue.
|
|
59
|
+
motifdb: The path to the motif database. This is required.
|
|
60
|
+
It should be in the format of MEME motif database.
|
|
61
|
+
Databases can be downloaded here: <https://meme-suite.org/meme/doc/download.html>.
|
|
62
|
+
See also introduction to the databases: <https://meme-suite.org/meme/db/motifs>.
|
|
63
|
+
cutoff (type=float): The cutoff for p-value to write the results.
|
|
64
|
+
When `envs.q_cutoff` is set, this is applied to the q-value.
|
|
65
|
+
This is passed to `--thresh` in fimo.
|
|
66
|
+
q (flag): Calculate q-value.
|
|
67
|
+
When `False`, `--no-qvalue` is passed to fimo.
|
|
68
|
+
The q-value calculation is that of Benjamini and Hochberg (BH) (1995).
|
|
69
|
+
q_cutoff (flag): Apply `envs.cutoff` to q-value.
|
|
70
|
+
args (ns): Additional arguments to pass to the tool.
|
|
71
|
+
- <more>: Additional arguments for fimo.
|
|
72
|
+
See: <https://meme-suite.org/meme/doc/fimo.html>
|
|
73
|
+
""" # noqa: E501
|
|
74
|
+
input = "motiffile:file, seqfile:file"
|
|
75
|
+
output = "outdir:dir:{{in.motiffile | stem}}.fimo"
|
|
76
|
+
lang = config.lang.python
|
|
77
|
+
envs = {
|
|
78
|
+
"tool": "fimo",
|
|
79
|
+
"fimo": config.exe.fimo,
|
|
80
|
+
"motif_col": 1,
|
|
81
|
+
"regulator_col": None,
|
|
82
|
+
"notfound": "error",
|
|
83
|
+
"motifdb": config.tf_motifdb,
|
|
84
|
+
"cutoff": 1e-4,
|
|
85
|
+
"q": False,
|
|
86
|
+
"q_cutoff": False,
|
|
87
|
+
"args": {},
|
|
88
|
+
}
|
|
89
|
+
script = "file://../scripts/regulatory/MotifScan.py"
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class MotifAffinityTest(Proc):
|
|
93
|
+
"""Test the affinity of motifs to the sequences and the affinity change
|
|
94
|
+
due to the mutations.
|
|
95
|
+
|
|
96
|
+
See also <https://simon-coetzee.github.io/motifBreakR> and
|
|
97
|
+
<https://www.bioconductor.org/packages/release/bioc/vignettes/atSNP/inst/doc/atsnp-vignette.html>
|
|
98
|
+
|
|
99
|
+
When using atSNP, motifBreakR is also required to plot the variants and motifs.
|
|
100
|
+
|
|
101
|
+
Input:
|
|
102
|
+
motiffile: File containing motif names.
|
|
103
|
+
The file contains the motif and regulator names.
|
|
104
|
+
The motif names should match the names in the motif database.
|
|
105
|
+
This file must have a header.
|
|
106
|
+
If multiple columns are present, it should be delimited by tab.
|
|
107
|
+
varfile: File containing the variants.
|
|
108
|
+
It could be a VCF file or a BED-like file.
|
|
109
|
+
If it is a VCF file, it does not need to be indexed. Only records with `PASS` in the `FILTER` column are used.
|
|
110
|
+
If it is a BED-like file, it should contain the following columns, `chrom`, `start`, `end`, `name`, `score`, `strand`, `ref`, `alt`.
|
|
111
|
+
|
|
112
|
+
Output:
|
|
113
|
+
outdir: Directory containing the results.
|
|
114
|
+
For motifBreakR, `motifbreakr.txt` will be created. Records with effect `strong`/`weak` are written (`neutral` is not).
|
|
115
|
+
For atSNP, `atsnp.txt` will be created. Records with p-value (`envs.atsnp_args.p`) < `envs.cutoff` are written.
|
|
116
|
+
|
|
117
|
+
Envs:
|
|
118
|
+
ncores (type=int): The number of cores to use.
|
|
119
|
+
tool (choice): The tool to use for the test.
|
|
120
|
+
- motifbreakr: Use motifBreakR.
|
|
121
|
+
- motifBreakR: Use motifBreakR.
|
|
122
|
+
- atsnp: Use atSNP.
|
|
123
|
+
- atSNP: Use atSNP.
|
|
124
|
+
bcftools: The path to bcftools binary.
|
|
125
|
+
Used to convert the VCF file to the BED file when the input is a VCF file.
|
|
126
|
+
motif_col: The column name in the motif file containing the motif names.
|
|
127
|
+
If this is not provided, `envs.regulator_col` and `envs.regmotifs` are required,
|
|
128
|
+
which are used to infer the motif names from the regulator names.
|
|
129
|
+
regulator_col: The column name in the motif file containing the regulator names.
|
|
130
|
+
Both `motif_col` and `regulator_col` should be the direct column names or
|
|
131
|
+
the index (1-based) of the columns.
|
|
132
|
+
If no `regulator_col` is provided, no regulator information is written in
|
|
133
|
+
the output. Otherwise, the regulator information is written in the output in
|
|
134
|
+
the `Regulator` column.
|
|
135
|
+
var_col: The column names in the `in.motiffile` containing the variant information.
|
|
136
|
+
It has to match the names in the `in.varfile`. This is helpful when
|
|
137
|
+
we only need to test the pairs of variants and motifs in the `in.motiffile`.
|
|
138
|
+
notfound (choice): What to do if a motif is not found in the database,
|
|
139
|
+
or a regulator is not found in the regulator-motif mapping (envs.regmotifs)
|
|
140
|
+
file.
|
|
141
|
+
- error: Report error and stop the process.
|
|
142
|
+
- ignore: Ignore the motif and continue.
|
|
143
|
+
motifdb: The path to the motif database. This is required.
|
|
144
|
+
It should be in the format of MEME motif database.
|
|
145
|
+
Databases can be downloaded here: <https://meme-suite.org/meme/doc/download.html>.
|
|
146
|
+
See also introduction to the databases: <https://meme-suite.org/meme/db/motifs>.
|
|
147
|
+
[universalmotif](https://github.com/bjmt/universalmotif) is required to read the motif database.
|
|
148
|
+
genome: The genome assembly.
|
|
149
|
+
Used to fetch the sequences around the variants via a BSgenome package; for example, `BSgenome.Hsapiens.UCSC.hg19` is required if the genome is
|
|
150
|
+
`hg19`. If it is an organism other than human, please specify the full name of the package, for example, `BSgenome.Mmusculus.UCSC.mm10`.
|
|
151
|
+
cutoff (type=float): The cutoff for p-value to write the results.
|
|
152
|
+
devpars (ns): The default device parameters for the plot.
|
|
153
|
+
- width (type=int): The width of the plot.
|
|
154
|
+
- height (type=int): The height of the plot.
|
|
155
|
+
- res (type=int): The resolution of the plot.
|
|
156
|
+
plot_nvars (type=int): Number of variants to plot.
|
|
157
|
+
Plot top `<plot_nvars>` variants with the largest `abs(alleleDiff)` (motifBreakR) or smallest p-values (atSNP).
|
|
158
|
+
plots (type=json): Specify the details for the plots.
|
|
159
|
+
When specified, `plot_nvars` is ignored.
|
|
160
|
+
The keys are the variant names and the values are the details for the plots, including:
|
|
161
|
+
devpars: The device parameters for the plot to override the default (envs.devpars).
|
|
162
|
+
which: An expression passed to `subset(results, subset = ...)` to get the motifs for the variant to plot.
|
|
163
|
+
Or an integer to get the top `which` motifs.
|
|
164
|
+
For example, `effect == "strong"` to get the motifs with strong effect in motifBreakR result.
|
|
165
|
+
regmotifs: The path to the regulator-motif mapping file.
|
|
166
|
+
It must have a header and the columns `Motif` or `Model` for motif names and
|
|
167
|
+
`TF`, `Regulator` or `Transcription factor` for regulator names.
|
|
168
|
+
motifbreakr_args (ns): Additional arguments to pass to motifBreakR.
|
|
169
|
+
- method (choice): The method to use.
|
|
170
|
+
See details of <https://rdrr.io/bioc/motifbreakR/man/motifbreakR.html>
|
|
171
|
+
and <https://simon-coetzee.github.io/motifBreakR/#methods>.
|
|
172
|
+
- default: Use the default method.
|
|
173
|
+
- log: Use the standard summation of log probabilities
|
|
174
|
+
- ic: Use information content
|
|
175
|
+
- notrans: Use the default method without transformation
|
|
176
|
+
atsnp_args (ns): Additional arguments to pass to atSNP.
|
|
177
|
+
- padj_cutoff (flag): The `envs.cutoff` will be applied to the adjusted p-value.
|
|
178
|
+
Only works for `atSNP`.
|
|
179
|
+
- padj (choice): The method to adjust the p-values.
|
|
180
|
+
Only works for `atSNP`
|
|
181
|
+
- holm: Holm's method
|
|
182
|
+
- hochberg: Hochberg's method
|
|
183
|
+
- hommel: Hommel's method
|
|
184
|
+
- bonferroni: Bonferroni method
|
|
185
|
+
- BH: Benjamini & Hochberg's method
|
|
186
|
+
- BY: Benjamini & Yekutieli's method
|
|
187
|
+
- fdr: False discovery rate
|
|
188
|
+
- none: No adjustment
|
|
189
|
+
- p (choice): Which p-value to use for adjustment and cutoff.
|
|
190
|
+
- pval_ref: p-value for the reference allele affinity score.
|
|
191
|
+
- pval_snp: p-value for the SNP allele affinity score.
|
|
192
|
+
- pval_cond_ref: conditional p-value for the affinity score of the reference allele.
|
|
193
|
+
- pval_cond_snp: conditional p-values for the affinity scores of the reference and SNP alleles.
|
|
194
|
+
- pval_diff: p-value for the affinity score change between the two alleles.
|
|
195
|
+
- pval_rank: p-value for the rank test between the two alleles.
|
|
196
|
+
""" # noqa: E501
|
|
197
|
+
input = "motiffile:file, varfile:file"
|
|
198
|
+
output = "outdir:dir:{{in.motiffile | stem}}.{{envs.tool | lower}}"
|
|
199
|
+
lang = config.lang.rscript
|
|
200
|
+
envs = {
|
|
201
|
+
"ncores": config.misc.ncores,
|
|
202
|
+
"tool": "atsnp",
|
|
203
|
+
"bcftools": config.exe.bcftools,
|
|
204
|
+
"motif_col": None,
|
|
205
|
+
"regulator_col": None,
|
|
206
|
+
"var_col": None,
|
|
207
|
+
"notfound": "error",
|
|
208
|
+
"motifdb": config.ref.tf_motifdb,
|
|
209
|
+
"regmotifs": config.ref.tf_motifs,
|
|
210
|
+
"genome": config.ref.genome,
|
|
211
|
+
"cutoff": 0.05,
|
|
212
|
+
"devpars": {"width": None, "height": None, "res": 100},
|
|
213
|
+
"plot_nvars": 10,
|
|
214
|
+
"plots": {},
|
|
215
|
+
"motifbreakr_args": {"method": "default"},
|
|
216
|
+
"atsnp_args": {"padj_cutoff": True, "padj": "BH", "p": "pval_diff"},
|
|
217
|
+
}
|
|
218
|
+
script = "file://../scripts/regulatory/MotifAffinityTest.R"
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
class VariantMotifPlot(Proc):
|
|
222
|
+
"""A plot with a genomic region surrounding a genomic variant, and
|
|
223
|
+
potentially disrupted motifs.
|
|
224
|
+
|
|
225
|
+
Currently only SNVs are supported.
|
|
226
|
+
|
|
227
|
+
Input:
|
|
228
|
+
infile: File containing the variants and motifs.
|
|
229
|
+
It is a TAB-delimited file with the following columns:
|
|
230
|
+
- chrom: The chromosome of the SNV. Alias: chr, seqnames.
|
|
231
|
+
- start: The start position of the SNV, no matter 0- or 1-based.
|
|
232
|
+
- end: The end position of the SNV, which will be used as the position of the SNV.
|
|
233
|
+
- strand: Indicating the direction of the surrounding sequence matching the motif.
|
|
234
|
+
- SNP_id: The name of the SNV.
|
|
235
|
+
- REF: The reference allele of the SNV.
|
|
236
|
+
- ALT: The alternative allele of the SNV.
|
|
237
|
+
- providerId: The motif id. It can be specified by `envs.motif_col`.
|
|
238
|
+
- providerName: The name of the motif provider. Optional.
|
|
239
|
+
- Regulator: The regulator name. Optional, can be specified by `envs.regulator_col`.
|
|
240
|
+
- motifPos: The position of the motif, relative to the position of the SNV.
|
|
241
|
+
For example, '-8, 4' means the motif is 8 bp upstream and 4 bp downstream of the SNV.
|
|
242
|
+
|
|
243
|
+
Envs:
|
|
244
|
+
genome: The genome assembly.
|
|
245
|
+
Used to fetch the sequences around the variants by package, for example, `BSgenome.Hsapiens.UCSC.hg19` is required if
|
|
246
|
+
`hg19` is specified. If it is an organism other than human, please specify the full name of the package, for example, `BSgenome.Mmusculus.UCSC.mm10`.
|
|
247
|
+
motifdb: The path to the motif database. This is required.
|
|
248
|
+
It should be in the format of MEME motif database.
|
|
249
|
+
Databases can be downloaded here: <https://meme-suite.org/meme/doc/download.html>.
|
|
250
|
+
See also introduction to the databases: <https://meme-suite.org/meme/db/motifs>.
|
|
251
|
+
[universalmotif](https://github.com/bjmt/universalmotif) is required to read the motif database.
|
|
252
|
+
motif_col: The column name in the motif file containing the motif names.
|
|
253
|
+
If this is not provided, `envs.regulator_col` and `envs.regmotifs` are required,
|
|
254
|
+
which are used to infer the motif names from the regulator names.
|
|
255
|
+
regulator_col: The column name in the motif file containing the regulator names.
|
|
256
|
+
Both `motif_col` and `regulator_col` should be the direct column names or
|
|
257
|
+
the index (1-based) of the columns.
|
|
258
|
+
If no `regulator_col` is provided, no regulator information is written in
|
|
259
|
+
the output. Otherwise, the regulator information is written in the output in
|
|
260
|
+
the `Regulator` column.
|
|
261
|
+
regmotifs: The path to the regulator-motif mapping file.
|
|
262
|
+
It must have header and the columns `Motif` or `Model` for motif names and
|
|
263
|
+
`TF`, `Regulator` or `Transcription factor` for regulator names.
|
|
264
|
+
notfound (choice): What to do if a motif is not found in the database,
|
|
265
|
+
or a regulator is not found in the regulator-motif mapping (envs.regmotifs)
|
|
266
|
+
file.
|
|
267
|
+
- error: Report error and stop the process.
|
|
268
|
+
- ignore: Ignore the motif and continue.
|
|
269
|
+
devpars (ns): The default device parameters for the plot.
|
|
270
|
+
- width (type=int): The width of the plot.
|
|
271
|
+
- height (type=int): The height of the plot.
|
|
272
|
+
- res (type=int): The resolution of the plot.
|
|
273
|
+
plot_vars (type=auto): The variants (SNP_id) to plot.
|
|
274
|
+
A list of variant names to plot or a string with the variant names separated by comma.
|
|
275
|
+
When not specified, all variants are plotted.
|
|
276
|
+
""" # noqa: E501
|
|
277
|
+
input = "infile:file"
|
|
278
|
+
output = "outdir:dir:{{in.infile | stem}}.vmplots"
|
|
279
|
+
lang = config.lang.rscript
|
|
280
|
+
envs = {
|
|
281
|
+
"genome": config.ref.genome,
|
|
282
|
+
"motifdb": config.ref.tf_motifdb,
|
|
283
|
+
"motif_col": "providerId",
|
|
284
|
+
"regulator_col": None,
|
|
285
|
+
"regmotifs": config.ref.tf_motifs,
|
|
286
|
+
"notfound": "error",
|
|
287
|
+
"devpars": {"width": 800, "height": None, "res": 100},
|
|
288
|
+
"plot_vars": None,
|
|
289
|
+
}
|
|
290
|
+
script = "file://../scripts/regulatory/VariantMotifPlot.R"
|
biopipen/ns/rnaseq.py
CHANGED
|
@@ -5,17 +5,154 @@ from ..core.config import config
|
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
class UnitConversion(Proc):
|
|
8
|
-
"""Convert expression value units back and forth
|
|
8
|
+
"""Convert expression value units back and forth
|
|
9
|
+
|
|
10
|
+
See <https://haroldpimentel.wordpress.com/2014/05/08/what-the-fpkm-a-review-rna-seq-expression-units/>
|
|
11
|
+
and <https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/#fpkm>.
|
|
12
|
+
|
|
13
|
+
The following conversions are supported -
|
|
14
|
+
* `count -> cpm, fpkm/rpkm, fpkmuq/rpkmuq, tpm, tmm`
|
|
15
|
+
* `fpkm/rpkm -> count, tpm, cpm`
|
|
16
|
+
* `tpm -> count, fpkm/rpkm, cpm`
|
|
17
|
+
* `cpm -> count, fpkm/rpkm, tpm`
|
|
18
|
+
NOTE that during some conversions, `sum(counts/effLen)` is approximated to
|
|
19
|
+
`sum(counts)/sum(effLen) * length(effLen)`
|
|
20
|
+
|
|
21
|
+
You can also use this process to just transform the expression values, e.g., take
|
|
22
|
+
log2 of the expression values. In this case, you can set `inunit` and `outunit` to
|
|
23
|
+
`count` and `log2(count + 1)` respectively.
|
|
24
|
+
|
|
25
|
+
Input:
|
|
26
|
+
infile: Input file containing expression values
|
|
27
|
+
The file should be a matrix with rows representing genes and columns
|
|
28
|
+
representing samples.
|
|
29
|
+
It could be an RDS file containing a data frame or a matrix, or a
|
|
30
|
+
text file containing a matrix with tab as the delimiter. The text
|
|
31
|
+
file can be gzipped.
|
|
32
|
+
|
|
33
|
+
Output:
|
|
34
|
+
outfile: Output file containing the converted expression values
|
|
35
|
+
The file will be a matrix with rows representing genes and columns
|
|
36
|
+
representing samples.
|
|
37
|
+
|
|
38
|
+
Envs:
|
|
39
|
+
inunit: The input unit of the expression values.
|
|
40
|
+
You can also use an expression to indicate the input unit, e.g.,
|
|
41
|
+
`log2(counts + 1)`. The expression should be like `A * fn(B*X + C) + D`,
|
|
42
|
+
where `A`, `B`, `C` and `D` are constants, `fn` is a function, and X is
|
|
43
|
+
the input unit.
|
|
44
|
+
Currently only `expr`, `sqrt`, `log2`, `log10` and `log` are supported as
|
|
45
|
+
functions.
|
|
46
|
+
Supported input units are:
|
|
47
|
+
* counts/count/rawcounts/rawcount: raw counts.
|
|
48
|
+
* cpm: counts per million.
|
|
49
|
+
* fpkm/rpkm: fragments per kilobase of transcript per million.
|
|
50
|
+
* fpkmuq/rpkmuq: upper quartile normalized FPKM/RPKM.
|
|
51
|
+
* tpm: transcripts per million.
|
|
52
|
+
* tmm: trimmed mean of M-values.
|
|
53
|
+
outunit: The output unit of the expression values. An expression can also be
|
|
54
|
+
used for transformation (e.g. `log2(tpm + 1)`). If `inunit` is `count`,
|
|
55
|
+
then this means we are converting raw counts to tpm, and transforming it
|
|
56
|
+
to `log2(tpm + 1)` as the output. Any expression supported by `R` can be
|
|
57
|
+
used. Same units as `inunit` are supported.
|
|
58
|
+
refexon: Path to the reference exon gff file.
|
|
59
|
+
meanfl (type=auto): A file containing the mean fragment length for each sample
|
|
60
|
+
by rows (samples as rowname), without header.
|
|
61
|
+
Or a fixed universal estimated number (1 used by TCGA).
|
|
62
|
+
nreads (type=auto): The estimated total number of reads for each sample.
|
|
63
|
+
or you can pass a file with the number for each sample by rows
|
|
64
|
+
(samples as rowname), without header.
|
|
65
|
+
When converting `fpkm/rpkm -> count`, it should be total reads of that sample.
|
|
66
|
+
When converting `cpm -> count`: it should be total reads of that sample.
|
|
67
|
+
When converting `tpm -> count`: it should be total reads of that sample.
|
|
68
|
+
When converting `tpm -> cpm`: it should be total reads of that sample.
|
|
69
|
+
When converting `tpm -> fpkm/rpkm`: it should be `sum(fpkm)` of that sample.
|
|
70
|
+
It is not used when converting `count -> cpm, fpkm/rpkm, tpm`.
|
|
71
|
+
""" # noqa: E501
|
|
9
72
|
input = "infile:file"
|
|
10
73
|
output = "outfile:file:{{in.infile | basename}}"
|
|
11
74
|
lang = config.lang.rscript
|
|
12
75
|
envs = {
|
|
13
|
-
"infmt": "matrix", # or rds
|
|
14
76
|
"inunit": None,
|
|
15
77
|
"outunit": None,
|
|
16
78
|
"refexon": config.ref.refexon,
|
|
17
|
-
"meanfl":
|
|
18
|
-
"
|
|
19
|
-
"outlog2p": False,
|
|
79
|
+
"meanfl": 1,
|
|
80
|
+
"nreads": 1_000_000,
|
|
20
81
|
}
|
|
21
82
|
script = "file://../scripts/rnaseq/UnitConversion.R"
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class Simulation(Proc):
|
|
86
|
+
"""Simulate RNA-seq data using ESCO/RUVcorr package
|
|
87
|
+
|
|
88
|
+
Input:
|
|
89
|
+
ngenes: Number of genes to simulate
|
|
90
|
+
nsamples: Number of samples to simulate
|
|
91
|
+
If you want to force the process to re-simulate for the same
|
|
92
|
+
`ngenes` and `nsamples`, you can set a different value for `envs.seed`.
|
|
93
|
+
Note that the samples will be shown as cells in the output (since
|
|
94
|
+
the simulation is designed for single-cell RNA-seq data).
|
|
95
|
+
|
|
96
|
+
Output:
|
|
97
|
+
outfile: Output file containing the simulated data with rows representing
|
|
98
|
+
genes and columns representing samples.
|
|
99
|
+
outdir: Output directory containing the simulated data
|
|
100
|
+
`sim.rds` and `True.rds` will be generated.
|
|
101
|
+
For `ESCO`, `sim.rds` contains the simulated data in a
|
|
102
|
+
`SingleCellExperiment` object, and `True.rds` contains the matrix of true
|
|
103
|
+
counts.
|
|
104
|
+
For `RUVcorr`, `sim.rds` contains the simulated data in list with
|
|
105
|
+
`Truth`, A matrix containing the values of Xβ; `Y` A matrix containing the
|
|
106
|
+
values in `Y`; `Noise` A matrix containing the values in `Wα`; `Sigma`
|
|
107
|
+
A matrix containing the true gene-gene correlations, as defined by Xβ; and
|
|
108
|
+
`Info` A matrix containing some of the general information about the
|
|
109
|
+
simulation.
|
|
110
|
+
For all matrices, rows represent genes and columns represent samples.
|
|
111
|
+
|
|
112
|
+
Envs:
|
|
113
|
+
tool (choice): Which tool to use for simulation.
|
|
114
|
+
- ESCO: uses the [ESCO](https://github.com/JINJINT/ESCO) package.
|
|
115
|
+
- RUVcorr: uses the [RUVcorr](https://rdrr.io/bioc/RUVcorr/) package.
|
|
116
|
+
ncores (type=int): Number of cores to use.
|
|
117
|
+
seed (type=int): Random seed.
|
|
118
|
+
If not set, seed will not be set.
|
|
119
|
+
esco_args (ns): Additional arguments to pass to the simulation function.
|
|
120
|
+
- save (choice): Which type of data to save to `out.outfile`.
|
|
121
|
+
- `simulated-truth`: saves the simulated true counts.
|
|
122
|
+
- `zero-inflated`: saves the zero-inflated counts.
|
|
123
|
+
- `down-sampled`: saves the down-sampled counts.
|
|
124
|
+
- type (choice): Which type of heterogeneity to use.
|
|
125
|
+
- single: produces a single population.
|
|
126
|
+
- group: produces distinct groups.
|
|
127
|
+
- tree: produces distinct groups but admits a tree structure.
|
|
128
|
+
- traj: produces distinct groups but admits a smooth trajectory
|
|
129
|
+
structure.
|
|
130
|
+
- <more>: See <https://rdrr.io/github/JINJINT/ESCO/man/escoParams.html>.
|
|
131
|
+
ruvcorr_args (ns): Additional arguments to pass to the simulation
|
|
132
|
+
function.
|
|
133
|
+
- <more>: See <https://rdrr.io/bioc/RUVcorr/man/simulateGEdata.html>.
|
|
134
|
+
transpose_output (flag): If set, the output will be transposed.
|
|
135
|
+
index_start (type=int): The index to start from when naming the samples.
|
|
136
|
+
Affects the sample names in `out.outfile` only.
|
|
137
|
+
"""
|
|
138
|
+
input = "ngenes:var, nsamples:var"
|
|
139
|
+
output = [
|
|
140
|
+
"outfile:file:{{in.ngenes}}x{{in.nsamples}}.sim/simulated.txt",
|
|
141
|
+
"outdir:dir:{{in.ngenes}}x{{in.nsamples}}.sim",
|
|
142
|
+
]
|
|
143
|
+
lang = config.lang.rscript
|
|
144
|
+
envs = {
|
|
145
|
+
"tool": "RUVcorr",
|
|
146
|
+
"ncores": config.misc.ncores,
|
|
147
|
+
"type": "single",
|
|
148
|
+
"esco_args": {
|
|
149
|
+
"dropout-type": "none",
|
|
150
|
+
"save": "simulated-truth",
|
|
151
|
+
"type": "single",
|
|
152
|
+
},
|
|
153
|
+
"ruvcorr_args": {},
|
|
154
|
+
"seed": None,
|
|
155
|
+
"transpose_output": False,
|
|
156
|
+
"index_start": 1,
|
|
157
|
+
}
|
|
158
|
+
script = "file://../scripts/rnaseq/Simulation.R"
|