biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +28 -0
- biopipen/core/filters.py +79 -4
- biopipen/core/proc.py +12 -3
- biopipen/core/testing.py +75 -3
- biopipen/ns/bam.py +148 -6
- biopipen/ns/bed.py +75 -0
- biopipen/ns/cellranger.py +186 -0
- biopipen/ns/cellranger_pipeline.py +126 -0
- biopipen/ns/cnv.py +19 -3
- biopipen/ns/cnvkit.py +1 -1
- biopipen/ns/cnvkit_pipeline.py +20 -12
- biopipen/ns/delim.py +34 -35
- biopipen/ns/gene.py +68 -23
- biopipen/ns/gsea.py +63 -37
- biopipen/ns/misc.py +39 -14
- biopipen/ns/plot.py +304 -1
- biopipen/ns/protein.py +183 -0
- biopipen/ns/regulatory.py +290 -0
- biopipen/ns/rnaseq.py +142 -5
- biopipen/ns/scrna.py +2053 -473
- biopipen/ns/scrna_metabolic_landscape.py +228 -382
- biopipen/ns/snp.py +659 -0
- biopipen/ns/stats.py +484 -0
- biopipen/ns/tcr.py +683 -98
- biopipen/ns/vcf.py +236 -2
- biopipen/ns/web.py +97 -6
- biopipen/reports/bam/CNVpytor.svelte +4 -9
- biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
- biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
- biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
- biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
- biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
- biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
- biopipen/reports/common.svelte +15 -0
- biopipen/reports/protein/ProdigySummary.svelte +16 -0
- biopipen/reports/scrna/CellsDistribution.svelte +4 -39
- biopipen/reports/scrna/DimPlots.svelte +1 -1
- biopipen/reports/scrna/MarkersFinder.svelte +6 -126
- biopipen/reports/scrna/MetaMarkers.svelte +3 -75
- biopipen/reports/scrna/RadarPlots.svelte +4 -20
- biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
- biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
- biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
- biopipen/reports/snp/PlinkCallRate.svelte +24 -0
- biopipen/reports/snp/PlinkFreq.svelte +18 -0
- biopipen/reports/snp/PlinkHWE.svelte +18 -0
- biopipen/reports/snp/PlinkHet.svelte +18 -0
- biopipen/reports/snp/PlinkIBD.svelte +18 -0
- biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
- biopipen/reports/tcr/ClonalStats.svelte +16 -0
- biopipen/reports/tcr/CloneResidency.svelte +3 -93
- biopipen/reports/tcr/Immunarch.svelte +4 -155
- biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
- biopipen/reports/tcr/TESSA.svelte +11 -28
- biopipen/reports/utils/misc.liq +22 -7
- biopipen/scripts/bam/BamMerge.py +11 -15
- biopipen/scripts/bam/BamSampling.py +90 -0
- biopipen/scripts/bam/BamSort.py +141 -0
- biopipen/scripts/bam/BamSplitChroms.py +10 -10
- biopipen/scripts/bam/BamSubsetByBed.py +38 -0
- biopipen/scripts/bam/CNAClinic.R +41 -5
- biopipen/scripts/bam/CNVpytor.py +153 -54
- biopipen/scripts/bam/ControlFREEC.py +13 -14
- biopipen/scripts/bam/SamtoolsView.py +33 -0
- biopipen/scripts/bed/Bed2Vcf.py +5 -5
- biopipen/scripts/bed/BedConsensus.py +5 -5
- biopipen/scripts/bed/BedLiftOver.sh +6 -4
- biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
- biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
- biopipen/scripts/bed/BedtoolsMerge.py +4 -4
- biopipen/scripts/cellranger/CellRangerCount.py +138 -0
- biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
- biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
- biopipen/scripts/cnv/AneuploidyScore.R +55 -20
- biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
- biopipen/scripts/cnv/TMADScore.R +25 -9
- biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
- biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
- biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
- biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
- biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
- biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
- biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
- biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
- biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
- biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
- biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
- biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
- biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
- biopipen/scripts/cnvkit/guess_baits.py +166 -93
- biopipen/scripts/delim/RowsBinder.R +1 -1
- biopipen/scripts/delim/SampleInfo.R +116 -118
- biopipen/scripts/gene/GeneNameConversion.R +67 -0
- biopipen/scripts/gene/GenePromoters.R +61 -0
- biopipen/scripts/gsea/Enrichr.R +5 -5
- biopipen/scripts/gsea/FGSEA.R +184 -50
- biopipen/scripts/gsea/GSEA.R +2 -2
- biopipen/scripts/gsea/PreRank.R +5 -5
- biopipen/scripts/misc/Config2File.py +2 -2
- biopipen/scripts/misc/Plot.R +80 -0
- biopipen/scripts/misc/Shell.sh +15 -0
- biopipen/scripts/misc/Str2File.py +2 -2
- biopipen/scripts/plot/Heatmap.R +3 -3
- biopipen/scripts/plot/Manhattan.R +147 -0
- biopipen/scripts/plot/QQPlot.R +146 -0
- biopipen/scripts/plot/ROC.R +88 -0
- biopipen/scripts/plot/Scatter.R +112 -0
- biopipen/scripts/plot/VennDiagram.R +5 -9
- biopipen/scripts/protein/MMCIF2PDB.py +33 -0
- biopipen/scripts/protein/PDB2Fasta.py +60 -0
- biopipen/scripts/protein/Prodigy.py +119 -0
- biopipen/scripts/protein/ProdigySummary.R +140 -0
- biopipen/scripts/protein/RMSD.py +178 -0
- biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
- biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
- biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
- biopipen/scripts/regulatory/MotifScan.py +159 -0
- biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
- biopipen/scripts/regulatory/motifs-common.R +324 -0
- biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
- biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
- biopipen/scripts/rnaseq/Simulation.R +21 -0
- biopipen/scripts/rnaseq/UnitConversion.R +325 -54
- biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
- biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
- biopipen/scripts/scrna/CellCellCommunication.py +150 -0
- biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
- biopipen/scripts/scrna/CellSNPLite.py +30 -0
- biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
- biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
- biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
- biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
- biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
- biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
- biopipen/scripts/scrna/CellsDistribution.R +456 -167
- biopipen/scripts/scrna/DimPlots.R +1 -1
- biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
- biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
- biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
- biopipen/scripts/scrna/ExprImputation.R +7 -0
- biopipen/scripts/scrna/LoomTo10X.R +51 -0
- biopipen/scripts/scrna/MQuad.py +25 -0
- biopipen/scripts/scrna/MarkersFinder.R +679 -400
- biopipen/scripts/scrna/MetaMarkers.R +265 -161
- biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
- biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
- biopipen/scripts/scrna/RadarPlots.R +355 -134
- biopipen/scripts/scrna/ScFGSEA.R +298 -100
- biopipen/scripts/scrna/ScSimulation.R +65 -0
- biopipen/scripts/scrna/ScVelo.py +617 -0
- biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
- biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
- biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
- biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
- biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
- biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
- biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
- biopipen/scripts/scrna/SeuratClustering.R +36 -233
- biopipen/scripts/scrna/SeuratLoading.R +2 -2
- biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
- biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
- biopipen/scripts/scrna/SeuratPreparing.R +223 -173
- biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
- biopipen/scripts/scrna/SeuratTo10X.R +27 -0
- biopipen/scripts/scrna/Slingshot.R +65 -0
- biopipen/scripts/scrna/Subset10X.R +2 -2
- biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
- biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
- biopipen/scripts/scrna/scvelo_paga.py +313 -0
- biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
- biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
- biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
- biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
- biopipen/scripts/snp/MatrixEQTL.R +217 -0
- biopipen/scripts/snp/Plink2GTMat.py +148 -0
- biopipen/scripts/snp/PlinkCallRate.R +199 -0
- biopipen/scripts/snp/PlinkFilter.py +100 -0
- biopipen/scripts/snp/PlinkFreq.R +291 -0
- biopipen/scripts/snp/PlinkFromVcf.py +81 -0
- biopipen/scripts/snp/PlinkHWE.R +85 -0
- biopipen/scripts/snp/PlinkHet.R +96 -0
- biopipen/scripts/snp/PlinkIBD.R +196 -0
- biopipen/scripts/snp/PlinkSimulation.py +124 -0
- biopipen/scripts/snp/PlinkUpdateName.py +124 -0
- biopipen/scripts/stats/ChowTest.R +146 -0
- biopipen/scripts/stats/DiffCoexpr.R +152 -0
- biopipen/scripts/stats/LiquidAssoc.R +135 -0
- biopipen/scripts/stats/Mediation.R +108 -0
- biopipen/scripts/stats/MetaPvalue.R +130 -0
- biopipen/scripts/stats/MetaPvalue1.R +74 -0
- biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
- biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
- biopipen/scripts/tcr/Attach2Seurat.R +3 -2
- biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
- biopipen/scripts/tcr/CDR3Clustering.R +343 -0
- biopipen/scripts/tcr/ClonalStats.R +526 -0
- biopipen/scripts/tcr/CloneResidency.R +255 -131
- biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
- biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
- biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
- biopipen/scripts/tcr/GIANA/query.py +164 -162
- biopipen/scripts/tcr/Immunarch-basic.R +31 -9
- biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
- biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
- biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
- biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
- biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
- biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
- biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
- biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
- biopipen/scripts/tcr/Immunarch.R +63 -11
- biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
- biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
- biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
- biopipen/scripts/tcr/SampleDiversity.R +1 -1
- biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
- biopipen/scripts/tcr/ScRepLoading.R +166 -0
- biopipen/scripts/tcr/TCRClusterStats.R +176 -22
- biopipen/scripts/tcr/TCRDock.py +110 -0
- biopipen/scripts/tcr/TESSA.R +102 -118
- biopipen/scripts/tcr/VJUsage.R +5 -5
- biopipen/scripts/tcr/immunarch-patched.R +142 -0
- biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
- biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
- biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
- biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
- biopipen/scripts/vcf/BcftoolsSort.py +113 -0
- biopipen/scripts/vcf/BcftoolsView.py +73 -0
- biopipen/scripts/vcf/TruvariBench.sh +14 -7
- biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
- biopipen/scripts/vcf/TruvariConsistency.R +1 -1
- biopipen/scripts/vcf/Vcf2Bed.py +2 -2
- biopipen/scripts/vcf/VcfAnno.py +11 -11
- biopipen/scripts/vcf/VcfDownSample.sh +22 -10
- biopipen/scripts/vcf/VcfFilter.py +5 -5
- biopipen/scripts/vcf/VcfFix.py +7 -7
- biopipen/scripts/vcf/VcfFix_utils.py +13 -4
- biopipen/scripts/vcf/VcfIndex.py +3 -3
- biopipen/scripts/vcf/VcfIntersect.py +3 -3
- biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
- biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
- biopipen/scripts/vcf/bcftools_utils.py +52 -0
- biopipen/scripts/web/Download.py +8 -4
- biopipen/scripts/web/DownloadList.py +5 -5
- biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
- biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
- biopipen/scripts/web/gcloud_common.py +49 -0
- biopipen/utils/gene.py +108 -60
- biopipen/utils/misc.py +146 -20
- biopipen/utils/reference.py +64 -20
- biopipen/utils/reporter.py +177 -0
- biopipen/utils/vcf.py +1 -1
- biopipen-0.34.26.dist-info/METADATA +27 -0
- biopipen-0.34.26.dist-info/RECORD +292 -0
- {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
- {biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
- biopipen/ns/bcftools.py +0 -111
- biopipen/ns/scrna_basic.py +0 -255
- biopipen/reports/delim/SampleInfo.svelte +0 -36
- biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
- biopipen/reports/scrna/ScFGSEA.svelte +0 -35
- biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
- biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
- biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
- biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
- biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
- biopipen/reports/utils/gsea.liq +0 -110
- biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
- biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
- biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
- biopipen/scripts/gene/GeneNameConversion.py +0 -66
- biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
- biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
- biopipen/scripts/scrna/ExprImpution.R +0 -7
- biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
- biopipen/scripts/scrna/Write10X.R +0 -11
- biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
- biopipen/scripts/tcr/TCRClustering.R +0 -280
- biopipen/utils/common_docstrs.py +0 -61
- biopipen/utils/gene.R +0 -49
- biopipen/utils/gsea.R +0 -193
- biopipen/utils/io.R +0 -20
- biopipen/utils/misc.R +0 -114
- biopipen/utils/mutate_helpers.R +0 -433
- biopipen/utils/plot.R +0 -173
- biopipen/utils/rnaseq.R +0 -48
- biopipen/utils/single_cell.R +0 -115
- biopipen-0.21.0.dist-info/METADATA +0 -22
- biopipen-0.21.0.dist-info/RECORD +0 -218
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
"""Cellranger pipeline module for BioPipen"""
|
|
2
|
+
from ..core.proc import Proc
|
|
3
|
+
from ..core.config import config
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class CellRangerCount(Proc):
|
|
7
|
+
"""Run cellranger count
|
|
8
|
+
|
|
9
|
+
to count gene expression and/or feature barcode reads
|
|
10
|
+
requires cellranger v7+.
|
|
11
|
+
|
|
12
|
+
Input:
|
|
13
|
+
fastqs: The input fastq files
|
|
14
|
+
Either a list of fastq files or a directory containing fastq files
|
|
15
|
+
If a directory is provided, it should be passed as a list with one
|
|
16
|
+
element.
|
|
17
|
+
id: The id defining output directory. If not provided, it is inferred
|
|
18
|
+
from the fastq files.
|
|
19
|
+
Note that, unlike the `--id` argument of cellranger, this will not select
|
|
20
|
+
the samples from `in.fastqs`. Instead, it will symlink the fastq files
|
|
21
|
+
to a temporary directory with this `id` as prefix and pass that to
|
|
22
|
+
cellranger.
|
|
23
|
+
|
|
24
|
+
Output:
|
|
25
|
+
outdir: The output directory
|
|
26
|
+
|
|
27
|
+
Envs:
|
|
28
|
+
ncores: Number of cores to use
|
|
29
|
+
cellranger: Path to cellranger
|
|
30
|
+
ref: Path of folder containing 10x-compatible transcriptome reference
|
|
31
|
+
tmpdir: Path to temporary directory, used to save the soft-linked fastq files
|
|
32
|
+
to pass to cellranger
|
|
33
|
+
outdir_is_mounted (flag): A flag indicating whether the output directory is
|
|
34
|
+
on a mounted filesystem. As of `cellranger` v9.0.1, `cellranger vdj` will
|
|
35
|
+
fail when trying to copy/operate files to a mounted filesystem.
|
|
36
|
+
See <https://github.com/10XGenomics/cellranger/issues/210> and
|
|
37
|
+
<https://github.com/10XGenomics/cellranger/issues/250> for similar issues.
|
|
38
|
+
If that is the case, set this flag to `True` to use `envs.tmpdir` as
|
|
39
|
+
the output directory for `cellranger count`, and then move the results
|
|
40
|
+
to the final output directory after `cellranger count` finishes.
|
|
41
|
+
In this case, `envs.tmpdir` must have enough space and
|
|
42
|
+
it must be a local filesystem.
|
|
43
|
+
copy_outs_only (flag): If `outdir_is_mounted` is `True`, set this flag to `True`
|
|
44
|
+
to only copy the `outs` folder from the temporary output directory
|
|
45
|
+
to the final output directory, instead of the whole output directory.
|
|
46
|
+
include_introns (flag): Set to false to exclude intronic reads in count.
|
|
47
|
+
create_bam (flag): Enable or disable BAM file generation.
|
|
48
|
+
This is required by cellranger v8+. When using cellranger older than v8, it will be
|
|
49
|
+
transformed to `--no-bam`.
|
|
50
|
+
<more>: Other environment variables required by `cellranger count`
|
|
51
|
+
See `cellranger count --help` for more details or
|
|
52
|
+
<https://www.10xgenomics.com/support/software/cell-ranger/advanced/cr-command-line-arguments#count>
|
|
53
|
+
""" # noqa: E501
|
|
54
|
+
input = "fastqs:files, id"
|
|
55
|
+
output = """outdir:dir:
|
|
56
|
+
{%- set fastqs = in.fastqs -%}
|
|
57
|
+
{%- if len(fastqs) == 1 and isdir(fastqs[0]) -%}
|
|
58
|
+
{%- set fastqs = fastqs[0] | glob: "*.fastq.gz" -%}
|
|
59
|
+
{%- endif -%}
|
|
60
|
+
{%- if in.id -%}
|
|
61
|
+
{{in.id}}
|
|
62
|
+
{%- else -%}
|
|
63
|
+
{%- set id = commonprefix(*fastqs) |
|
|
64
|
+
regex_replace: "_L\\d+(:?_.*)?$", "" |
|
|
65
|
+
regex_replace: "_S\\d+$", "" -%}
|
|
66
|
+
{{- id -}}
|
|
67
|
+
{%- endif -%}
|
|
68
|
+
"""
|
|
69
|
+
lang = config.lang.python
|
|
70
|
+
envs = {
|
|
71
|
+
"ncores": config.misc.ncores,
|
|
72
|
+
"cellranger": config.exe.cellranger,
|
|
73
|
+
"ref": config.ref.ref_cellranger_gex,
|
|
74
|
+
"tmpdir": config.path.tmpdir,
|
|
75
|
+
"outdir_is_mounted": False,
|
|
76
|
+
"copy_outs_only": True,
|
|
77
|
+
"include_introns": True,
|
|
78
|
+
"create_bam": False,
|
|
79
|
+
}
|
|
80
|
+
script = "file://../scripts/cellranger/CellRangerCount.py"
|
|
81
|
+
plugin_opts = {
|
|
82
|
+
"report": "file://../reports/cellranger/CellRangerCount.svelte",
|
|
83
|
+
"report_paging": 5,
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class CellRangerVdj(Proc):
|
|
88
|
+
"""Run cellranger vdj
|
|
89
|
+
|
|
90
|
+
to perform sequence assembly and paired clonotype calling.
|
|
91
|
+
requires cellranger v7+.
|
|
92
|
+
|
|
93
|
+
Input:
|
|
94
|
+
fastqs: The input fastq files
|
|
95
|
+
Either a list of fastq files or a directory containing fastq files
|
|
96
|
+
If a directory is provided, it should be passed as a list with one
|
|
97
|
+
element.
|
|
98
|
+
id: The id determining the output directory. If not provided, it is inferred
|
|
99
|
+
from the fastq files.
|
|
100
|
+
|
|
101
|
+
Output:
|
|
102
|
+
outdir: The output directory
|
|
103
|
+
|
|
104
|
+
Envs:
|
|
105
|
+
ncores: Number of cores to use
|
|
106
|
+
cellranger: Path to cellranger
|
|
107
|
+
ref: Path of folder containing 10x-compatible transcriptome reference
|
|
108
|
+
tmpdir: Path to temporary directory, used to save the soft-linked fastq files
|
|
109
|
+
to pass to cellranger.
|
|
110
|
+
outdir_is_mounted (flag): A flag indicating whether the output directory is
|
|
111
|
+
on a mounted filesystem. As of `cellranger` v9.0.1, `cellranger vdj` will
|
|
112
|
+
fail when trying to copy the VDJ reference files to a mounted filesystem.
|
|
113
|
+
See <https://github.com/10XGenomics/cellranger/issues/210> and
|
|
114
|
+
<https://github.com/10XGenomics/cellranger/issues/250> for similar issues.
|
|
115
|
+
If that is the case, set this flag to `True` to use `envs.tmpdir` as
|
|
116
|
+
the output directory for `cellranger vdj`, and then move the results
|
|
117
|
+
to the final output directory after `cellranger vdj` finishes.
|
|
118
|
+
In this case, `envs.tmpdir` must have enough space and
|
|
119
|
+
it must be a local filesystem.
|
|
120
|
+
copy_outs_only (flag): If `outdir_is_mounted` is `True`, set this flag to `True`
|
|
121
|
+
to only copy the `outs` folder from the temporary output directory
|
|
122
|
+
to the final output directory, instead of the whole output directory.
|
|
123
|
+
<more>: Other environment variables required by `cellranger vdj`
|
|
124
|
+
See `cellranger vdj --help` for more details or
|
|
125
|
+
<https://www.10xgenomics.com/support/software/cell-ranger/advanced/cr-command-line-arguments#vdj>
|
|
126
|
+
""" # noqa: E501
|
|
127
|
+
input = "fastqs:files, id"
|
|
128
|
+
output = """outdir:dir:
|
|
129
|
+
{%- set fastqs = in.fastqs -%}
|
|
130
|
+
{%- if len(fastqs) == 1 and isdir(fastqs[0]) -%}
|
|
131
|
+
{%- set fastqs = fastqs[0] | glob: "*.fastq.gz" -%}
|
|
132
|
+
{%- endif -%}
|
|
133
|
+
{%- if in.id -%}
|
|
134
|
+
{{in.id}}
|
|
135
|
+
{%- else -%}
|
|
136
|
+
{%- set id = commonprefix(*fastqs) |
|
|
137
|
+
regex_replace: "_L\\d+(:?_.*)?$", "" |
|
|
138
|
+
regex_replace: "_S\\d+$", "" -%}
|
|
139
|
+
{{- id -}}
|
|
140
|
+
{%- endif -%}
|
|
141
|
+
"""
|
|
142
|
+
lang = config.lang.python
|
|
143
|
+
envs = {
|
|
144
|
+
"ncores": config.misc.ncores,
|
|
145
|
+
"cellranger": config.exe.cellranger,
|
|
146
|
+
"ref": config.ref.ref_cellranger_vdj,
|
|
147
|
+
"outdir_is_mounted": False,
|
|
148
|
+
"copy_outs_only": True,
|
|
149
|
+
"tmpdir": config.path.tmpdir,
|
|
150
|
+
}
|
|
151
|
+
script = "file://../scripts/cellranger/CellRangerVdj.py"
|
|
152
|
+
plugin_opts = {
|
|
153
|
+
"report": "file://../reports/cellranger/CellRangerVdj.svelte",
|
|
154
|
+
"report_paging": 5,
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
class CellRangerSummary(Proc):
|
|
159
|
+
"""Summarize cellranger metrics
|
|
160
|
+
|
|
161
|
+
Input:
|
|
162
|
+
indirs: The directories containing cellranger results
|
|
163
|
+
from `CellRangerCount`/`CellRangerVdj`.
|
|
164
|
+
|
|
165
|
+
Output:
|
|
166
|
+
outdir: The output directory
|
|
167
|
+
|
|
168
|
+
Envs:
|
|
169
|
+
group (type=auto): The group of the samples for boxplots.
|
|
170
|
+
If `None`, don't do boxplots.
|
|
171
|
+
It can be a dict of group names and sample names, e.g.
|
|
172
|
+
`{"group1": ["sample1", "sample2"], "group2": ["sample3"]}`
|
|
173
|
+
or a file containing the group information, with the first column
|
|
174
|
+
being the sample names and the second column being the group names.
|
|
175
|
+
The file should be tab-delimited with no header.
|
|
176
|
+
"""
|
|
177
|
+
input = "indirs:dirs"
|
|
178
|
+
input_data = lambda ch: [list(ch.iloc[:, 0])]
|
|
179
|
+
output = "outdir:dir:{{in.indirs | first | stem | append: '-etc.summary'}}"
|
|
180
|
+
lang = config.lang.rscript
|
|
181
|
+
script = "file://../scripts/cellranger/CellRangerSummary.R"
|
|
182
|
+
envs = {"group": None}
|
|
183
|
+
plugin_opts = {
|
|
184
|
+
"report": "file://../reports/cellranger/CellRangerSummary.svelte",
|
|
185
|
+
"report_paging": 8,
|
|
186
|
+
}
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""The cellranger pipelines
|
|
2
|
+
|
|
3
|
+
Primarily cellranger process plus summary for summarizing the metrics for
|
|
4
|
+
multiple samples.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
from diot import Diot
|
|
10
|
+
from pipen.utils import is_loading_pipeline
|
|
11
|
+
from pipen_args.procgroup import ProcGroup
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from pipen import Proc
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class CellRangerCountPipeline(ProcGroup):
|
|
18
|
+
"""The cellranger count pipeline
|
|
19
|
+
|
|
20
|
+
Run cellranger count for multiple samples and summarize the metrics.
|
|
21
|
+
|
|
22
|
+
Args:
|
|
23
|
+
input (list): The list of lists of fastq files.
|
|
24
|
+
Alternatively, a list of comma-separated strings of fastq files.
|
|
25
|
+
ids (list): The list of ids for the samples.
|
|
26
|
+
"""
|
|
27
|
+
DEFAULTS = Diot(input=None, ids=None)
|
|
28
|
+
|
|
29
|
+
def post_init(self):
|
|
30
|
+
"""Check if the input is a list of fastq files"""
|
|
31
|
+
if not is_loading_pipeline("-h", "-h+", "--help", "--help+") and (
|
|
32
|
+
not isinstance(self.opts.input, (list, tuple))
|
|
33
|
+
or len(self.opts.input) == 0
|
|
34
|
+
):
|
|
35
|
+
raise TypeError(
|
|
36
|
+
"The input of `CellRangerCountPipeline` should be a list of lists of "
|
|
37
|
+
"fastq files."
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
if isinstance(self.opts.input, (list, tuple)):
|
|
41
|
+
self.opts.input = [
|
|
42
|
+
[y.strip() for y in x.split(",")]
|
|
43
|
+
if isinstance(x, str)
|
|
44
|
+
else x
|
|
45
|
+
for x in self.opts.input
|
|
46
|
+
]
|
|
47
|
+
|
|
48
|
+
@ProcGroup.add_proc
|
|
49
|
+
def p_cellranger_count(self) -> Proc:
|
|
50
|
+
"""Build CellRangerCount process"""
|
|
51
|
+
from .cellranger import CellRangerCount as _CellRangerCount
|
|
52
|
+
|
|
53
|
+
class CellRangerCount(_CellRangerCount):
|
|
54
|
+
if self.opts.ids:
|
|
55
|
+
input_data = list(zip(self.opts.input, self.opts.ids))
|
|
56
|
+
else:
|
|
57
|
+
input_data = self.opts.input
|
|
58
|
+
|
|
59
|
+
return CellRangerCount
|
|
60
|
+
|
|
61
|
+
@ProcGroup.add_proc
|
|
62
|
+
def p_cellranger_count_summary(self) -> Proc:
|
|
63
|
+
"""Build CellRangerCountSummary process"""
|
|
64
|
+
from .cellranger import CellRangerSummary
|
|
65
|
+
|
|
66
|
+
class CellRangerCountSummary(CellRangerSummary):
|
|
67
|
+
requires = self.p_cellranger_count
|
|
68
|
+
input_data = lambda ch: [list(ch.iloc[:, 0])]
|
|
69
|
+
|
|
70
|
+
return CellRangerCountSummary
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class CellRangerVdjPipeline(ProcGroup):
|
|
74
|
+
"""The cellranger vdj pipeline
|
|
75
|
+
|
|
76
|
+
Run cellranger vdj for multiple samples and summarize the metrics.
|
|
77
|
+
|
|
78
|
+
Args:
|
|
79
|
+
input (list): The list of lists of fastq files.
|
|
80
|
+
Alternatively, a list of comma-separated strings of fastq files.
|
|
81
|
+
ids (list): The list of ids for the samples.
|
|
82
|
+
"""
|
|
83
|
+
DEFAULTS = Diot(input=None, ids=None)
|
|
84
|
+
|
|
85
|
+
def post_init(self):
|
|
86
|
+
"""Check if the input is a list of fastq files"""
|
|
87
|
+
if not is_loading_pipeline("-h", "-h+", "--help", "--help+") and (
|
|
88
|
+
not isinstance(self.opts.input, (list, tuple))
|
|
89
|
+
or len(self.opts.input) == 0
|
|
90
|
+
):
|
|
91
|
+
raise TypeError(
|
|
92
|
+
"The input of `CellRangerVdjPipeline` should be a list of lists of "
|
|
93
|
+
"fastq files."
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
if isinstance(self.opts.input, (list, tuple)):
|
|
97
|
+
self.opts.input = [
|
|
98
|
+
[y.strip() for y in x.split(",")]
|
|
99
|
+
if isinstance(x, str)
|
|
100
|
+
else x
|
|
101
|
+
for x in self.opts.input
|
|
102
|
+
]
|
|
103
|
+
|
|
104
|
+
@ProcGroup.add_proc
|
|
105
|
+
def p_cellranger_vdj(self) -> Proc:
|
|
106
|
+
"""Build CellRangerVdj process"""
|
|
107
|
+
from .cellranger import CellRangerVdj as _CellRangerVdj
|
|
108
|
+
|
|
109
|
+
class CellRangerVdj(_CellRangerVdj):
|
|
110
|
+
if self.opts.ids:
|
|
111
|
+
input_data = list(zip(self.opts.input, self.opts.ids))
|
|
112
|
+
else:
|
|
113
|
+
input_data = self.opts.input
|
|
114
|
+
|
|
115
|
+
return CellRangerVdj
|
|
116
|
+
|
|
117
|
+
@ProcGroup.add_proc
|
|
118
|
+
def p_cellranger_vdj_summary(self) -> Proc:
|
|
119
|
+
"""Build CellRangerVdjSummary process"""
|
|
120
|
+
from .cellranger import CellRangerSummary
|
|
121
|
+
|
|
122
|
+
class CellRangerVdjSummary(CellRangerSummary):
|
|
123
|
+
requires = self.p_cellranger_vdj
|
|
124
|
+
input_data = lambda ch: [list(ch.iloc[:, 0])]
|
|
125
|
+
|
|
126
|
+
return CellRangerVdjSummary
|
biopipen/ns/cnv.py
CHANGED
|
@@ -12,7 +12,15 @@ class AneuploidyScore(Proc):
|
|
|
12
12
|
|
|
13
13
|
Input:
|
|
14
14
|
segfile: The seg file, generally including chrom, start, end and
|
|
15
|
-
seg.mean (the log2 ratio)
|
|
15
|
+
seg.mean (the log2 ratio).
|
|
16
|
+
It is typically a tab-delimited file or a BED file.
|
|
17
|
+
If so, envs.chrom_col, envs.start_col, envs.end_col and envs.seg_col
|
|
18
|
+
are the 1st, 2nd, 3rd and 5th columns, respectively.
|
|
19
|
+
It can also be a VCF file. If so, envs.chrom_col and envs.start_col
|
|
20
|
+
are not required.
|
|
21
|
+
`envs.end_col` and `envs.seg_col` will be fields in the INFO column.
|
|
22
|
+
[`VariantAnnotation`](https://rdrr.io/bioc/VariantAnnotation/)
|
|
23
|
+
is required to extract the INFO field.
|
|
16
24
|
|
|
17
25
|
Output:
|
|
18
26
|
outdir: The output directory containing the CAAs, AS and a histogram
|
|
@@ -122,7 +130,15 @@ class TMADScore(Proc):
|
|
|
122
130
|
Input:
|
|
123
131
|
segfile: The seg file, two columns are required:
|
|
124
132
|
* chrom: The chromosome name, used for filtering
|
|
125
|
-
* seg.mean: The log2 ratio
|
|
133
|
+
* seg.mean: The log2 ratio.
|
|
134
|
+
It is typically a tab-delimited file or a BED file.
|
|
135
|
+
If so, envs.chrom_col and envs.seg_col
|
|
136
|
+
are the 1st and 5th columns, respectively.
|
|
137
|
+
It can also be a VCF file. If so, envs.chrom_col and envs.start_col
|
|
138
|
+
are not required.
|
|
139
|
+
`envs.end_col` and `envs.seg_col` will be fields in the INFO column.
|
|
140
|
+
[`VariantAnnotation`](https://rdrr.io/bioc/VariantAnnotation/)
|
|
141
|
+
is required to extract the INFO field.
|
|
126
142
|
|
|
127
143
|
Output:
|
|
128
144
|
outfile: The output file containing the TMAD score
|
|
@@ -134,7 +150,7 @@ class TMADScore(Proc):
|
|
|
134
150
|
excl_chroms (list): The chromosomes to be excluded
|
|
135
151
|
"""
|
|
136
152
|
input = "segfile:file"
|
|
137
|
-
output = "outfile:file:{{in.segfile |
|
|
153
|
+
output = "outfile:file:{{in.segfile | stem}}.tmad.txt"
|
|
138
154
|
lang = config.lang.rscript
|
|
139
155
|
envs = {
|
|
140
156
|
"chrom_col": "chrom",
|
biopipen/ns/cnvkit.py
CHANGED
|
@@ -482,7 +482,7 @@ class CNVkitDiagram(Proc):
|
|
|
482
482
|
}
|
|
483
483
|
script = "file://../scripts/cnvkit/CNVkitDiagram.py"
|
|
484
484
|
plugin_opts = {
|
|
485
|
-
"report": "file://../reports/cnvkit/
|
|
485
|
+
"report": "file://../reports/cnvkit/CNVkitDiagram.svelte",
|
|
486
486
|
"report_paging": 10,
|
|
487
487
|
}
|
|
488
488
|
|
biopipen/ns/cnvkit_pipeline.py
CHANGED
|
@@ -276,7 +276,10 @@ class CNVkitPipeline(ProcGroup):
|
|
|
276
276
|
"""Build CNVkitGuessBaits process"""
|
|
277
277
|
from .cnvkit import CNVkitGuessBaits
|
|
278
278
|
|
|
279
|
-
if
|
|
279
|
+
if (
|
|
280
|
+
not self.opts.guessbaits and
|
|
281
|
+
not is_loading_pipeline("-h", "-h+", "--help", "--help+")
|
|
282
|
+
):
|
|
280
283
|
return None
|
|
281
284
|
|
|
282
285
|
def _guess_baits_bams(ch):
|
|
@@ -487,7 +490,8 @@ class CNVkitPipeline(ProcGroup):
|
|
|
487
490
|
target_file = None
|
|
488
491
|
antitarget_file = None
|
|
489
492
|
if self.col.sex in metadf:
|
|
490
|
-
|
|
493
|
+
all_sex = metadf[self.col.sex][control_masks].unique()
|
|
494
|
+
sample_sex = [None] if len(all_sex) > 1 else all_sex[0]
|
|
491
495
|
else:
|
|
492
496
|
sample_sex = [None]
|
|
493
497
|
else:
|
|
@@ -774,13 +778,15 @@ class CNVkitPipeline(ProcGroup):
|
|
|
774
778
|
else:
|
|
775
779
|
tumor_masks = metadf[self.col.group] == self.opts.case
|
|
776
780
|
|
|
781
|
+
if self.col.sex in metadf:
|
|
782
|
+
all_sex = metadf[self.col.sex][tumor_masks].unique()
|
|
783
|
+
sample_sex = [None] if len(all_sex) > 1 else all_sex[0]
|
|
784
|
+
else:
|
|
785
|
+
sample_sex = [None]
|
|
786
|
+
|
|
777
787
|
return tibble(
|
|
778
788
|
segfiles=[ch2.outfile.tolist()],
|
|
779
|
-
sample_sex=
|
|
780
|
-
",".join(metadf[self.col.sex][tumor_masks])
|
|
781
|
-
if self.col.sex in metadf
|
|
782
|
-
else [None]
|
|
783
|
-
),
|
|
789
|
+
sample_sex=sample_sex,
|
|
784
790
|
)
|
|
785
791
|
|
|
786
792
|
@annotate.format_doc(indent=3)
|
|
@@ -823,13 +829,15 @@ class CNVkitPipeline(ProcGroup):
|
|
|
823
829
|
else:
|
|
824
830
|
tumor_masks = metadf[self.col.group] == self.opts.case
|
|
825
831
|
|
|
832
|
+
if self.col.sex in metadf:
|
|
833
|
+
all_sex = metadf[self.col.sex][tumor_masks].unique()
|
|
834
|
+
sample_sex = [None] if len(all_sex) > 1 else all_sex[0]
|
|
835
|
+
else:
|
|
836
|
+
sample_sex = [None]
|
|
837
|
+
|
|
826
838
|
return tibble(
|
|
827
839
|
segfiles=[ch2.outfile.tolist()],
|
|
828
|
-
sample_sex=
|
|
829
|
-
",".join(metadf[self.col.sex][tumor_masks])
|
|
830
|
-
if self.col.sex in metadf
|
|
831
|
-
else [None]
|
|
832
|
-
),
|
|
840
|
+
sample_sex=sample_sex,
|
|
833
841
|
)
|
|
834
842
|
|
|
835
843
|
@annotate.format_doc(indent=3)
|
biopipen/ns/delim.py
CHANGED
|
@@ -51,6 +51,10 @@ class SampleInfo(Proc):
|
|
|
51
51
|
Output:
|
|
52
52
|
outfile: The output file with sample information, with mutated columns
|
|
53
53
|
if `envs.save_mutated` is True.
|
|
54
|
+
The basename of the output file will be the same as the input file.
|
|
55
|
+
The file name of each plot will be slugified from the case name.
|
|
56
|
+
Each plot has 3 formats: pdf, png and code.zip, which contains the
|
|
57
|
+
data and R code to reproduce the plot.
|
|
54
58
|
|
|
55
59
|
Envs:
|
|
56
60
|
sep: The separator of the input file.
|
|
@@ -76,37 +80,34 @@ class SampleInfo(Proc):
|
|
|
76
80
|
If `FALSE`, you can mutate the meta data frame with the
|
|
77
81
|
returned ids. Non-paired ids will be `NA`.
|
|
78
82
|
save_mutated (flag): Whether to save the mutated columns.
|
|
79
|
-
exclude_cols: The columns to exclude in the table in the report.
|
|
83
|
+
exclude_cols (auto): The columns to exclude in the table in the report.
|
|
80
84
|
Could be a list or a string separated by comma.
|
|
81
85
|
defaults (ns): The default parameters for `envs.stats`.
|
|
82
|
-
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
-
|
|
98
|
-
plots.
|
|
99
|
-
- ncol (type=int): The number of columns in the plot when `each`
|
|
100
|
-
is not `NULL`. Default is 2.
|
|
101
|
-
- na_each (flag): Whether to include `NA`s in the `each` column.
|
|
102
|
-
- plot: Type of plot. If `on` is continuous, it could be
|
|
103
|
-
`boxplot` (default), `violin`, `violin+boxplot` or `histogram`.
|
|
104
|
-
If `on` is not continuous, it could be `barplot` or
|
|
105
|
-
`pie` (default).
|
|
86
|
+
- plot_type: The type of the plot.
|
|
87
|
+
See the supported plot types here:
|
|
88
|
+
<https://pwwang.github.io/plotthis/reference/index.html>
|
|
89
|
+
The plot_type should be lower case and the plot function used in
|
|
90
|
+
`plotthis` should be used. The mapping from plot_type to the
|
|
91
|
+
plot function is like `bar -> BarPlot`, `box -> BoxPlot`, etc.
|
|
92
|
+
- more_formats (list): The additional formats to save the plot.
|
|
93
|
+
By default, the plot will be saved in png, which is also used to
|
|
94
|
+
display in the report. You can add more formats to save the plot.
|
|
95
|
+
For example, `more_formats = ["pdf", "svg"]`.
|
|
96
|
+
- save_code (flag): Whether to save the R code to reproduce the plot.
|
|
97
|
+
The data used to plot will also be saved.
|
|
98
|
+
- subset: An expression to subset the data frame before plotting.
|
|
99
|
+
The expression should be a string of R expression that will be passed
|
|
100
|
+
to `dplyr::filter`. For example, `subset = "Sample == 'A'"`.
|
|
101
|
+
- section: The section name in the report.
|
|
102
|
+
In case you want to group the plots in the report.
|
|
106
103
|
- devpars (ns): The device parameters for the plot.
|
|
107
104
|
- width (type=int): The width of the plot.
|
|
108
105
|
- height (type=int): The height of the plot.
|
|
109
106
|
- res (type=int): The resolution of the plot.
|
|
107
|
+
- descr: The description of the plot, shown in the report.
|
|
108
|
+
- <more>: You can add more parameters to the defaults.
|
|
109
|
+
These parameters will be expanded to the `envs.stats` for each case,
|
|
110
|
+
and passed to individual plot functions.
|
|
110
111
|
stats (type=json): The statistics to perform.
|
|
111
112
|
The keys are the case names and the values are the parameters
|
|
112
113
|
inherited from `envs.defaults`.
|
|
@@ -119,18 +120,16 @@ class SampleInfo(Proc):
|
|
|
119
120
|
"save_mutated": False,
|
|
120
121
|
"exclude_cols": None,
|
|
121
122
|
"defaults": {
|
|
122
|
-
"
|
|
123
|
-
"
|
|
124
|
-
"
|
|
125
|
-
"
|
|
126
|
-
"
|
|
127
|
-
"
|
|
128
|
-
"
|
|
129
|
-
"plot": None,
|
|
130
|
-
"devpars": {"width": 800, "height": 600, "res": 100},
|
|
123
|
+
"plot_type": "bar",
|
|
124
|
+
"more_formats": [],
|
|
125
|
+
"save_code": False,
|
|
126
|
+
"subset": None,
|
|
127
|
+
"section": None,
|
|
128
|
+
"descr": None,
|
|
129
|
+
"devpars": {"width": None, "height": None, "res": 100},
|
|
131
130
|
},
|
|
132
131
|
"stats": {},
|
|
133
132
|
}
|
|
134
133
|
lang = config.lang.rscript
|
|
135
134
|
script = "file://../scripts/delim/SampleInfo.R"
|
|
136
|
-
plugin_opts = {"report": "file://../reports/
|
|
135
|
+
plugin_opts = {"report": "file://../reports/common.svelte"}
|
biopipen/ns/gene.py
CHANGED
|
@@ -9,46 +9,91 @@ class GeneNameConversion(Proc):
|
|
|
9
9
|
|
|
10
10
|
Input:
|
|
11
11
|
infile: The input file with original gene names
|
|
12
|
+
It should be a tab-separated file with header
|
|
12
13
|
|
|
13
14
|
Output:
|
|
14
15
|
outfile: The output file with converted gene names
|
|
15
16
|
|
|
16
17
|
Envs:
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
18
|
+
notfound (choice): What to do if a conversion cannot be done.
|
|
19
|
+
- use-query: Ignore the conversion and use the original name
|
|
20
|
+
- skip: Ignore the conversion and skip the entire row in input file
|
|
21
|
+
- ignore: Same as skip
|
|
22
|
+
- error: Report error
|
|
23
|
+
- na: Use NA
|
|
24
|
+
dup (choice): What to do if a conversion results in multiple names.
|
|
25
|
+
- first: Use the first name, sorted by matching score descendingly (default)
|
|
26
|
+
- last: Use the last name, sorted by matching score descendingly
|
|
27
|
+
- combine: Combine all names using `;` as separator
|
|
28
|
+
genecol: The index (1-based) or name of the column where genes are present
|
|
29
|
+
output (choice): How to output.
|
|
30
|
+
- append: Add the converted names as new columns at the end using `envs.outfmt`
|
|
31
|
+
as the column name.
|
|
32
|
+
- replace: Drop the original name column, and insert
|
|
33
|
+
the converted names at the original position.
|
|
34
|
+
- converted: Only keep the converted names.
|
|
35
|
+
- with-query: Output 2 columns with original and converted names.
|
|
33
36
|
infmt: What's the original gene name format
|
|
34
37
|
Available fields
|
|
35
38
|
https://docs.mygene.info/en/latest/doc/query_service.html#available-fields
|
|
36
|
-
outfmt: What's the target gene name format
|
|
39
|
+
outfmt: What's the target gene name format. Currently only a single format
|
|
40
|
+
is supported.
|
|
37
41
|
species: Limit gene query to certain species.
|
|
38
42
|
Supported: human, mouse, rat, fruitfly, nematode, zebrafish,
|
|
39
43
|
thale-cress, frog and pig
|
|
40
44
|
""" # noqa: E501
|
|
41
45
|
input = "infile:file"
|
|
42
46
|
output = "outfile:file:{{in.infile | basename}}"
|
|
43
|
-
lang = config.lang.
|
|
47
|
+
lang = config.lang.rscript
|
|
44
48
|
envs = {
|
|
45
|
-
"inopts": {"sep": "\t", "index_col": False},
|
|
46
|
-
"outopts": {"sep": "\t", "index": False},
|
|
47
49
|
"notfound": "error",
|
|
48
|
-
"genecol":
|
|
49
|
-
"
|
|
50
|
+
"genecol": 1,
|
|
51
|
+
"dup": "first",
|
|
52
|
+
"output": "append",
|
|
50
53
|
"infmt": ["symbol", "alias"],
|
|
51
54
|
"outfmt": "symbol",
|
|
52
55
|
"species": "human",
|
|
53
56
|
}
|
|
54
|
-
script = "file://../scripts/gene/GeneNameConversion.
|
|
57
|
+
script = "file://../scripts/gene/GeneNameConversion.R"
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class GenePromoters(Proc):
|
|
61
|
+
"""Get gene promoter regions by specifying the flanking regions of TSS
|
|
62
|
+
|
|
63
|
+
Input:
|
|
64
|
+
infile: The input file with gene ids/names
|
|
65
|
+
|
|
66
|
+
Output:
|
|
67
|
+
outfile: The output file with promoter regions in BED format
|
|
68
|
+
|
|
69
|
+
Envs:
|
|
70
|
+
up (type=int): The upstream distance from TSS
|
|
71
|
+
down (type=int): The downstream distance from TSS
|
|
72
|
+
If not specified, the default is `envs.up`
|
|
73
|
+
notfound (choice): What to do if a gene is not found.
|
|
74
|
+
- skip: Skip the gene
|
|
75
|
+
- error: Report error
|
|
76
|
+
refgene: The reference gene annotation file in GTF format
|
|
77
|
+
header (flag): Whether the input file has a header
|
|
78
|
+
genecol (type=int): The index (1-based) of the gene column
|
|
79
|
+
match_id (flag): Should we match the genes in `in.infile` by `gene_id`
|
|
80
|
+
instead of `gene_name` in `envs.refgene`
|
|
81
|
+
sort (flag): Sort the output by chromosome and start position
|
|
82
|
+
chrsize: The chromosome size file, from which the chromosome order is
|
|
83
|
+
used to sort the output
|
|
84
|
+
"""
|
|
85
|
+
input = "infile:file"
|
|
86
|
+
output = "outfile:file:{{in.infile | stem}}-promoters.bed"
|
|
87
|
+
lang = config.lang.rscript
|
|
88
|
+
envs = {
|
|
89
|
+
"up": 2000,
|
|
90
|
+
"down": None,
|
|
91
|
+
"notfound": "error",
|
|
92
|
+
"refgene": config.ref.refgene,
|
|
93
|
+
"header": True,
|
|
94
|
+
"genecol": 1,
|
|
95
|
+
"match_id": False,
|
|
96
|
+
"sort": False,
|
|
97
|
+
"chrsize": config.ref.chrsize,
|
|
98
|
+
}
|
|
99
|
+
script = "file://../scripts/gene/GenePromoters.R"
|