PyPI - biopipen - Versions diffs - 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl - Mend

biopipen 0.21.0py3-none-any.whl → 0.34.26py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (290) hide show

biopipen/__init__.py +1 -1
biopipen/core/config.toml +28 -0
biopipen/core/filters.py +79 -4
biopipen/core/proc.py +12 -3
biopipen/core/testing.py +75 -3
biopipen/ns/bam.py +148 -6
biopipen/ns/bed.py +75 -0
biopipen/ns/cellranger.py +186 -0
biopipen/ns/cellranger_pipeline.py +126 -0
biopipen/ns/cnv.py +19 -3
biopipen/ns/cnvkit.py +1 -1
biopipen/ns/cnvkit_pipeline.py +20 -12
biopipen/ns/delim.py +34 -35
biopipen/ns/gene.py +68 -23
biopipen/ns/gsea.py +63 -37
biopipen/ns/misc.py +39 -14
biopipen/ns/plot.py +304 -1
biopipen/ns/protein.py +183 -0
biopipen/ns/regulatory.py +290 -0
biopipen/ns/rnaseq.py +142 -5
biopipen/ns/scrna.py +2053 -473
biopipen/ns/scrna_metabolic_landscape.py +228 -382
biopipen/ns/snp.py +659 -0
biopipen/ns/stats.py +484 -0
biopipen/ns/tcr.py +683 -98
biopipen/ns/vcf.py +236 -2
biopipen/ns/web.py +97 -6
biopipen/reports/bam/CNVpytor.svelte +4 -9
biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
biopipen/reports/common.svelte +15 -0
biopipen/reports/protein/ProdigySummary.svelte +16 -0
biopipen/reports/scrna/CellsDistribution.svelte +4 -39
biopipen/reports/scrna/DimPlots.svelte +1 -1
biopipen/reports/scrna/MarkersFinder.svelte +6 -126
biopipen/reports/scrna/MetaMarkers.svelte +3 -75
biopipen/reports/scrna/RadarPlots.svelte +4 -20
biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
biopipen/reports/snp/PlinkCallRate.svelte +24 -0
biopipen/reports/snp/PlinkFreq.svelte +18 -0
biopipen/reports/snp/PlinkHWE.svelte +18 -0
biopipen/reports/snp/PlinkHet.svelte +18 -0
biopipen/reports/snp/PlinkIBD.svelte +18 -0
biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
biopipen/reports/tcr/ClonalStats.svelte +16 -0
biopipen/reports/tcr/CloneResidency.svelte +3 -93
biopipen/reports/tcr/Immunarch.svelte +4 -155
biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
biopipen/reports/tcr/TESSA.svelte +11 -28
biopipen/reports/utils/misc.liq +22 -7
biopipen/scripts/bam/BamMerge.py +11 -15
biopipen/scripts/bam/BamSampling.py +90 -0
biopipen/scripts/bam/BamSort.py +141 -0
biopipen/scripts/bam/BamSplitChroms.py +10 -10
biopipen/scripts/bam/BamSubsetByBed.py +38 -0
biopipen/scripts/bam/CNAClinic.R +41 -5
biopipen/scripts/bam/CNVpytor.py +153 -54
biopipen/scripts/bam/ControlFREEC.py +13 -14
biopipen/scripts/bam/SamtoolsView.py +33 -0
biopipen/scripts/bed/Bed2Vcf.py +5 -5
biopipen/scripts/bed/BedConsensus.py +5 -5
biopipen/scripts/bed/BedLiftOver.sh +6 -4
biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
biopipen/scripts/bed/BedtoolsMerge.py +4 -4
biopipen/scripts/cellranger/CellRangerCount.py +138 -0
biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
biopipen/scripts/cnv/AneuploidyScore.R +55 -20
biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
biopipen/scripts/cnv/TMADScore.R +25 -9
biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
biopipen/scripts/cnvkit/guess_baits.py +166 -93
biopipen/scripts/delim/RowsBinder.R +1 -1
biopipen/scripts/delim/SampleInfo.R +116 -118
biopipen/scripts/gene/GeneNameConversion.R +67 -0
biopipen/scripts/gene/GenePromoters.R +61 -0
biopipen/scripts/gsea/Enrichr.R +5 -5
biopipen/scripts/gsea/FGSEA.R +184 -50
biopipen/scripts/gsea/GSEA.R +2 -2
biopipen/scripts/gsea/PreRank.R +5 -5
biopipen/scripts/misc/Config2File.py +2 -2
biopipen/scripts/misc/Plot.R +80 -0
biopipen/scripts/misc/Shell.sh +15 -0
biopipen/scripts/misc/Str2File.py +2 -2
biopipen/scripts/plot/Heatmap.R +3 -3
biopipen/scripts/plot/Manhattan.R +147 -0
biopipen/scripts/plot/QQPlot.R +146 -0
biopipen/scripts/plot/ROC.R +88 -0
biopipen/scripts/plot/Scatter.R +112 -0
biopipen/scripts/plot/VennDiagram.R +5 -9
biopipen/scripts/protein/MMCIF2PDB.py +33 -0
biopipen/scripts/protein/PDB2Fasta.py +60 -0
biopipen/scripts/protein/Prodigy.py +119 -0
biopipen/scripts/protein/ProdigySummary.R +140 -0
biopipen/scripts/protein/RMSD.py +178 -0
biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
biopipen/scripts/regulatory/MotifScan.py +159 -0
biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
biopipen/scripts/regulatory/motifs-common.R +324 -0
biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
biopipen/scripts/rnaseq/Simulation.R +21 -0
biopipen/scripts/rnaseq/UnitConversion.R +325 -54
biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
biopipen/scripts/scrna/CellCellCommunication.py +150 -0
biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
biopipen/scripts/scrna/CellSNPLite.py +30 -0
biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
biopipen/scripts/scrna/CellsDistribution.R +456 -167
biopipen/scripts/scrna/DimPlots.R +1 -1
biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
biopipen/scripts/scrna/ExprImputation.R +7 -0
biopipen/scripts/scrna/LoomTo10X.R +51 -0
biopipen/scripts/scrna/MQuad.py +25 -0
biopipen/scripts/scrna/MarkersFinder.R +679 -400
biopipen/scripts/scrna/MetaMarkers.R +265 -161
biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
biopipen/scripts/scrna/RadarPlots.R +355 -134
biopipen/scripts/scrna/ScFGSEA.R +298 -100
biopipen/scripts/scrna/ScSimulation.R +65 -0
biopipen/scripts/scrna/ScVelo.py +617 -0
biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
biopipen/scripts/scrna/SeuratClustering.R +36 -233
biopipen/scripts/scrna/SeuratLoading.R +2 -2
biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
biopipen/scripts/scrna/SeuratPreparing.R +223 -173
biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
biopipen/scripts/scrna/SeuratTo10X.R +27 -0
biopipen/scripts/scrna/Slingshot.R +65 -0
biopipen/scripts/scrna/Subset10X.R +2 -2
biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
biopipen/scripts/scrna/scvelo_paga.py +313 -0
biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
biopipen/scripts/snp/MatrixEQTL.R +217 -0
biopipen/scripts/snp/Plink2GTMat.py +148 -0
biopipen/scripts/snp/PlinkCallRate.R +199 -0
biopipen/scripts/snp/PlinkFilter.py +100 -0
biopipen/scripts/snp/PlinkFreq.R +291 -0
biopipen/scripts/snp/PlinkFromVcf.py +81 -0
biopipen/scripts/snp/PlinkHWE.R +85 -0
biopipen/scripts/snp/PlinkHet.R +96 -0
biopipen/scripts/snp/PlinkIBD.R +196 -0
biopipen/scripts/snp/PlinkSimulation.py +124 -0
biopipen/scripts/snp/PlinkUpdateName.py +124 -0
biopipen/scripts/stats/ChowTest.R +146 -0
biopipen/scripts/stats/DiffCoexpr.R +152 -0
biopipen/scripts/stats/LiquidAssoc.R +135 -0
biopipen/scripts/stats/Mediation.R +108 -0
biopipen/scripts/stats/MetaPvalue.R +130 -0
biopipen/scripts/stats/MetaPvalue1.R +74 -0
biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
biopipen/scripts/tcr/Attach2Seurat.R +3 -2
biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
biopipen/scripts/tcr/CDR3Clustering.R +343 -0
biopipen/scripts/tcr/ClonalStats.R +526 -0
biopipen/scripts/tcr/CloneResidency.R +255 -131
biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
biopipen/scripts/tcr/GIANA/query.py +164 -162
biopipen/scripts/tcr/Immunarch-basic.R +31 -9
biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
biopipen/scripts/tcr/Immunarch.R +63 -11
biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
biopipen/scripts/tcr/SampleDiversity.R +1 -1
biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
biopipen/scripts/tcr/ScRepLoading.R +166 -0
biopipen/scripts/tcr/TCRClusterStats.R +176 -22
biopipen/scripts/tcr/TCRDock.py +110 -0
biopipen/scripts/tcr/TESSA.R +102 -118
biopipen/scripts/tcr/VJUsage.R +5 -5
biopipen/scripts/tcr/immunarch-patched.R +142 -0
biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
biopipen/scripts/vcf/BcftoolsSort.py +113 -0
biopipen/scripts/vcf/BcftoolsView.py +73 -0
biopipen/scripts/vcf/TruvariBench.sh +14 -7
biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
biopipen/scripts/vcf/TruvariConsistency.R +1 -1
biopipen/scripts/vcf/Vcf2Bed.py +2 -2
biopipen/scripts/vcf/VcfAnno.py +11 -11
biopipen/scripts/vcf/VcfDownSample.sh +22 -10
biopipen/scripts/vcf/VcfFilter.py +5 -5
biopipen/scripts/vcf/VcfFix.py +7 -7
biopipen/scripts/vcf/VcfFix_utils.py +13 -4
biopipen/scripts/vcf/VcfIndex.py +3 -3
biopipen/scripts/vcf/VcfIntersect.py +3 -3
biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
biopipen/scripts/vcf/bcftools_utils.py +52 -0
biopipen/scripts/web/Download.py +8 -4
biopipen/scripts/web/DownloadList.py +5 -5
biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
biopipen/scripts/web/gcloud_common.py +49 -0
biopipen/utils/gene.py +108 -60
biopipen/utils/misc.py +146 -20
biopipen/utils/reference.py +64 -20
biopipen/utils/reporter.py +177 -0
biopipen/utils/vcf.py +1 -1
biopipen-0.34.26.dist-info/METADATA +27 -0
biopipen-0.34.26.dist-info/RECORD +292 -0
{biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
{biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
biopipen/ns/bcftools.py +0 -111
biopipen/ns/scrna_basic.py +0 -255
biopipen/reports/delim/SampleInfo.svelte +0 -36
biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
biopipen/reports/scrna/ScFGSEA.svelte +0 -35
biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
biopipen/reports/utils/gsea.liq +0 -110
biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
biopipen/scripts/gene/GeneNameConversion.py +0 -66
biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
biopipen/scripts/scrna/ExprImpution.R +0 -7
biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
biopipen/scripts/scrna/Write10X.R +0 -11
biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
biopipen/scripts/tcr/TCRClustering.R +0 -280
biopipen/utils/common_docstrs.py +0 -61
biopipen/utils/gene.R +0 -49
biopipen/utils/gsea.R +0 -193
biopipen/utils/io.R +0 -20
biopipen/utils/misc.R +0 -114
biopipen/utils/mutate_helpers.R +0 -433
biopipen/utils/plot.R +0 -173
biopipen/utils/rnaseq.R +0 -48
biopipen/utils/single_cell.R +0 -115
biopipen-0.21.0.dist-info/METADATA +0 -22
biopipen-0.21.0.dist-info/RECORD +0 -218

biopipen/ns/cellranger.py ADDED Viewed

@@ -0,0 +1,186 @@
+"""Cellranger pipeline module for BioPipen"""
+from ..core.proc import Proc
+from ..core.config import config
+class CellRangerCount(Proc):
+    """Run cellranger count
+    to count gene expression and/or feature barcode reads
+    requires cellranger v7+.
+    Input:
+        fastqs: The input fastq files
+            Either a list of fastq files or a directory containing fastq files
+            If a directory is provided, it should be passed as a list with one
+            element.
+        id: The id defining output directory. If not provided, it is inferred
+            from the fastq files.
+            Note that, unlike the `--id` argument of cellranger, this will not select
+            the samples from `in.fastqs`. Instead, it will symlink the fastq files
+            to a temporary directory with this `id` as prefix and pass that to
+            cellranger.
+    Output:
+        outdir: The output directory
+    Envs:
+        ncores: Number of cores to use
+        cellranger: Path to cellranger
+        ref: Path of folder containing 10x-compatible transcriptome reference
+        tmpdir: Path to temporary directory, used to save the soft-linked fastq files
+            to pass to cellranger
+        outdir_is_mounted (flag): A flag indicating whether the output directory is
+            on a mounted filesystem. As of `cellranger` v9.0.1, `cellranger` commands
+            (e.g. `cellranger vdj`) can fail when trying to copy/operate on files
+            on a mounted filesystem.
+            See <https://github.com/10XGenomics/cellranger/issues/210> and
+            <https://github.com/10XGenomics/cellranger/issues/250> for similar issues.
+            If that is the case, set this flag to `True` to use `envs.tmpdir` as
+            the output directory for `cellranger count`, and then move the results
+            to the final output directory after `cellranger count` finishes.
+            In this case, make sure that `envs.tmpdir` has enough space and
+            is on a local filesystem.
+        copy_outs_only (flag): If `outdir_is_mounted` is `True`, set this flag to `True`
+            to only copy the `outs` folder from the temporary output directory
+            to the final output directory, instead of the whole output directory.
+        include_introns (flag): Set to false to exclude intronic reads in count.
+        create_bam (flag): Enable or disable BAM file generation.
+            This is required by cellranger v8+. When using cellranger v8-, it will be
+            transformed to `--no-bam`.
+        <more>: Other environment variables required by `cellranger count`
+            See `cellranger count --help` for more details or
+            <https://www.10xgenomics.com/support/software/cell-ranger/advanced/cr-command-line-arguments#count>
+    """  # noqa: E501
+    input = "fastqs:files, id"
+    # When `in.id` is empty, the output directory name is derived from the common
+    # prefix of the fastq file names, with a trailing `_L<digits>[_...]` suffix
+    # and then a trailing `_S<digits>` suffix stripped.
+    # `(?:...)` is a non-capturing group; the previous `(:?...)` spelling was a
+    # capturing group that merely allowed an optional literal colon.
+    output = """outdir:dir:
+        {%- set fastqs = in.fastqs -%}
+        {%- if len(fastqs) == 1 and isdir(fastqs[0]) -%}
+            {%- set fastqs = fastqs[0] | glob: "*.fastq.gz" -%}
+        {%- endif -%}
+        {%- if in.id -%}
+            {{in.id}}
+        {%- else -%}
+            {%- set id = commonprefix(*fastqs) |
+                regex_replace: "_L\\d+(?:_.*)?$", "" |
+                regex_replace: "_S\\d+$", "" -%}
+            {{- id -}}
+        {%- endif -%}
+    """
+    lang = config.lang.python
+    envs = {
+        "ncores": config.misc.ncores,
+        "cellranger": config.exe.cellranger,
+        "ref": config.ref.ref_cellranger_gex,
+        "tmpdir": config.path.tmpdir,
+        "outdir_is_mounted": False,
+        "copy_outs_only": True,
+        "include_introns": True,
+        "create_bam": False,
+    }
+    script = "file://../scripts/cellranger/CellRangerCount.py"
+    plugin_opts = {
+        "report": "file://../reports/cellranger/CellRangerCount.svelte",
+        "report_paging": 5,
+    }
+class CellRangerVdj(Proc):
+    """Run cellranger vdj
+    to perform sequence assembly and paired clonotype calling.
+    requires cellranger v7+.
+    Input:
+        fastqs: The input fastq files
+            Either a list of fastq files or a directory containing fastq files
+            If a directory is provided, it should be passed as a list with one
+            element.
+        id: The id determining the output directory. If not provided, it is inferred
+            from the fastq files.
+    Output:
+        outdir: The output directory
+    Envs:
+        ncores: Number of cores to use
+        cellranger: Path to cellranger
+        ref: Path of folder containing 10x-compatible VDJ reference
+        tmpdir: Path to temporary directory, used to save the soft-linked fastq files
+            to pass to cellranger.
+        outdir_is_mounted (flag): A flag indicating whether the output directory is
+            on a mounted filesystem. As of `cellranger` v9.0.1, `cellranger vdj` will
+            fail when trying to copy the VDJ reference files to a mounted filesystem.
+            See <https://github.com/10XGenomics/cellranger/issues/210> and
+            <https://github.com/10XGenomics/cellranger/issues/250> for similar issues.
+            If that is the case, set this flag to `True` to use `envs.tmpdir` as
+            the output directory for `cellranger vdj`, and then move the results
+            to the final output directory after `cellranger vdj` finishes.
+            In this case, make sure that `envs.tmpdir` has enough space and
+            is on a local filesystem.
+        copy_outs_only (flag): If `outdir_is_mounted` is `True`, set this flag to `True`
+            to only copy the `outs` folder from the temporary output directory
+            to the final output directory, instead of the whole output directory.
+        <more>: Other environment variables required by `cellranger vdj`
+            See `cellranger vdj --help` for more details or
+            <https://www.10xgenomics.com/support/software/cell-ranger/advanced/cr-command-line-arguments#vdj>
+    """  # noqa: E501
+    input = "fastqs:files, id"
+    # When `in.id` is empty, the output directory name is derived from the common
+    # prefix of the fastq file names, with a trailing `_L<digits>[_...]` suffix
+    # and then a trailing `_S<digits>` suffix stripped.
+    # `(?:...)` is a non-capturing group; the previous `(:?...)` spelling was a
+    # capturing group that merely allowed an optional literal colon.
+    output = """outdir:dir:
+        {%- set fastqs = in.fastqs -%}
+        {%- if len(fastqs) == 1 and isdir(fastqs[0]) -%}
+            {%- set fastqs = fastqs[0] | glob: "*.fastq.gz" -%}
+        {%- endif -%}
+        {%- if in.id -%}
+            {{in.id}}
+        {%- else -%}
+            {%- set id = commonprefix(*fastqs) |
+                regex_replace: "_L\\d+(?:_.*)?$", "" |
+                regex_replace: "_S\\d+$", "" -%}
+            {{- id -}}
+        {%- endif -%}
+    """
+    lang = config.lang.python
+    envs = {
+        "ncores": config.misc.ncores,
+        "cellranger": config.exe.cellranger,
+        "ref": config.ref.ref_cellranger_vdj,
+        "outdir_is_mounted": False,
+        "copy_outs_only": True,
+        "tmpdir": config.path.tmpdir,
+    }
+    script = "file://../scripts/cellranger/CellRangerVdj.py"
+    plugin_opts = {
+        "report": "file://../reports/cellranger/CellRangerVdj.svelte",
+        "report_paging": 5,
+    }
+class CellRangerSummary(Proc):
+    """Summarize cellranger metrics
+    Input:
+        indirs: The directories containing cellranger results
+            from `CellRangerCount`/`CellRangerVdj`.
+    Output:
+        outdir: The output directory
+    Envs:
+        group (type=auto): The group of the samples for boxplots.
+            If `None`, don't do boxplots.
+            It can be a dict of group names and sample names, e.g.
+            `{"group1": ["sample1", "sample2"], "group2": ["sample3"]}`
+            or a file containing the group information, with the first column
+            being the sample names and the second column being the group names.
+            The file should be tab-delimited with no header.
+    """
+    input = "indirs:dirs"
+    # Collapse the first column of the incoming data into one list and wrap it
+    # in a single-element list, presumably so that one job receives all the
+    # result directories at once.
+    # NOTE(review): assumes `ch` is DataFrame-like (it is indexed with `.iloc`).
+    input_data = lambda ch: [list(ch.iloc[:, 0])]
+    # The output directory is named after the stem of the first input directory,
+    # with '-etc.summary' appended.
+    output = "outdir:dir:{{in.indirs | first | stem | append: '-etc.summary'}}"
+    lang = config.lang.rscript
+    script = "file://../scripts/cellranger/CellRangerSummary.R"
+    # No grouping by default.
+    envs = {"group": None}
+    plugin_opts = {
+        "report": "file://../reports/cellranger/CellRangerSummary.svelte",
+        "report_paging": 8,
+    }

biopipen/ns/cellranger_pipeline.py ADDED Viewed

@@ -0,0 +1,126 @@
+"""The cellranger pipelines
+Primarily cellranger process plus summary for summarizing the metrics for
+multiple samples.
+"""
+from __future__ import annotations
+from typing import TYPE_CHECKING
+from diot import Diot
+from pipen.utils import is_loading_pipeline
+from pipen_args.procgroup import ProcGroup
+if TYPE_CHECKING:
+    from pipen import Proc
+class CellRangerCountPipeline(ProcGroup):
+    """The cellranger count process group
+    Runs `CellRangerCount` on every sample, then gathers the metrics of all
+    samples with a `CellRangerSummary` process.
+    Args:
+        input (list): The list of lists of fastq files.
+            or the list of comma-separated string of fastq files.
+        ids (list): The list of ids for the samples.
+    """
+    DEFAULTS = Diot(input=None, ids=None)
+    def post_init(self):
+        """Validate `opts.input` and split comma-separated entries into lists"""
+        given = self.opts.input
+        is_sequence = isinstance(given, (list, tuple))
+        # A non-empty list/tuple is mandatory unless `is_loading_pipeline`
+        # returns a true value for the help flags.
+        loading = is_loading_pipeline("-h", "-h+", "--help", "--help+")
+        if not loading and not (is_sequence and len(given) > 0):
+            raise TypeError(
+                "The input of `CellRangerCountPipeline` should be a list of lists of "
+                "fastq files."
+            )
+        if not is_sequence:
+            return
+        # Turn "a.fq, b.fq" strings into ["a.fq", "b.fq"]; keep other items as is
+        normalized = []
+        for entry in given:
+            if isinstance(entry, str):
+                entry = [part.strip() for part in entry.split(",")]
+            normalized.append(entry)
+        self.opts.input = normalized
+    @ProcGroup.add_proc
+    def p_cellranger_count(self) -> Proc:
+        """Create the CellRangerCount process fed with the group's input"""
+        from .cellranger import CellRangerCount as _CellRangerCount
+        opts = self.opts
+        # Pair each sample's fastq files with its id when ids are supplied
+        samples = list(zip(opts.input, opts.ids)) if opts.ids else opts.input
+        class CellRangerCount(_CellRangerCount):
+            input_data = samples
+        return CellRangerCount
+    @ProcGroup.add_proc
+    def p_cellranger_count_summary(self) -> Proc:
+        """Create the summary process that depends on the count process"""
+        from .cellranger import CellRangerSummary
+        upstream = self.p_cellranger_count
+        class CellRangerCountSummary(CellRangerSummary):
+            requires = upstream
+            # First column of the upstream data, wrapped as a single list
+            input_data = lambda ch: [list(ch.iloc[:, 0])]
+        return CellRangerCountSummary
+class CellRangerVdjPipeline(ProcGroup):
+    """The cellranger vdj process group
+    Runs `CellRangerVdj` on every sample, then gathers the metrics of all
+    samples with a `CellRangerSummary` process.
+    Args:
+        input (list): The list of lists of fastq files.
+            or the list of comma-separated string of fastq files.
+        ids (list): The list of ids for the samples.
+    """
+    DEFAULTS = Diot(input=None, ids=None)
+    def post_init(self):
+        """Validate `opts.input` and split comma-separated entries into lists"""
+        given = self.opts.input
+        is_sequence = isinstance(given, (list, tuple))
+        # A non-empty list/tuple is mandatory unless `is_loading_pipeline`
+        # returns a true value for the help flags.
+        loading = is_loading_pipeline("-h", "-h+", "--help", "--help+")
+        if not loading and not (is_sequence and len(given) > 0):
+            raise TypeError(
+                "The input of `CellRangerVdjPipeline` should be a list of lists of "
+                "fastq files."
+            )
+        if not is_sequence:
+            return
+        # Turn "a.fq, b.fq" strings into ["a.fq", "b.fq"]; keep other items as is
+        normalized = []
+        for entry in given:
+            if isinstance(entry, str):
+                entry = [part.strip() for part in entry.split(",")]
+            normalized.append(entry)
+        self.opts.input = normalized
+    @ProcGroup.add_proc
+    def p_cellranger_vdj(self) -> Proc:
+        """Create the CellRangerVdj process fed with the group's input"""
+        from .cellranger import CellRangerVdj as _CellRangerVdj
+        opts = self.opts
+        # Pair each sample's fastq files with its id when ids are supplied
+        samples = list(zip(opts.input, opts.ids)) if opts.ids else opts.input
+        class CellRangerVdj(_CellRangerVdj):
+            input_data = samples
+        return CellRangerVdj
+    @ProcGroup.add_proc
+    def p_cellranger_vdj_summary(self) -> Proc:
+        """Create the summary process that depends on the vdj process"""
+        from .cellranger import CellRangerSummary
+        upstream = self.p_cellranger_vdj
+        class CellRangerVdjSummary(CellRangerSummary):
+            requires = upstream
+            # First column of the upstream data, wrapped as a single list
+            input_data = lambda ch: [list(ch.iloc[:, 0])]
+        return CellRangerVdjSummary

biopipen/ns/cnv.py CHANGED Viewed

@@ -12,7 +12,15 @@ class AneuploidyScore(Proc):
     Input:
         segfile: The seg file, generally including chrom, start, end and
-            seg.mean (the log2 ratio)
+            seg.mean (the log2 ratio).
+            It is typically a tab-delimited file or a BED file.
+            If so, envs.chrom_col, envs.start_col, envs.end_col and envs.seg_col
+            are the 1st, 2nd, 3rd and 5th columns, respectively.
+            It can also be a VCF file. If so, envs.chrom_col and envs.start_col
+            are not required.
+            `envs.end_col` and `envs.seg_col` will each be a field in the INFO column.
+            [`VariantAnnotation`](https://rdrr.io/bioc/VariantAnnotation/)
+            is required to extract the INFO field.
     Output:
         outdir: The output directory containing the CAAs, AS and a histogram
@@ -122,7 +130,15 @@ class TMADScore(Proc):
     Input:
         segfile: The seg file, two columns are required:
             * chrom: The chromosome name, used for filtering
-            * seg.mean: The log2 ratio
+            * seg.mean: The log2 ratio.
+            It is typically a tab-delimited file or a BED file.
+            If so, envs.chrom_col and envs.seg_col
+            are the 1st and 5th columns, respectively.
+            It can also be a VCF file. If so, envs.chrom_col and envs.start_col
+            are not required.
+            `end_col` and `envs.seg_col` will be a field in the INFO column.
+            [`VariantAnnotation`](https://rdrr.io/bioc/VariantAnnotation/)
+            is required to extract the INFO field.
     Output:
         outfile: The output file containing the TMAD score
@@ -134,7 +150,7 @@ class TMADScore(Proc):
         excl_chroms (list): The chromosomes to be excluded
     """
     input = "segfile:file"
-    output = "outfile:file:{{in.segfile | stem0}}.tmad.txt"
+    output = "outfile:file:{{in.segfile | stem}}.tmad.txt"
     lang = config.lang.rscript
     envs = {
         "chrom_col": "chrom",

biopipen/ns/cnvkit.py CHANGED Viewed

@@ -482,7 +482,7 @@ class CNVkitDiagram(Proc):
     }
     script = "file://../scripts/cnvkit/CNVkitDiagram.py"
     plugin_opts = {
-        "report": "file://../reports/cnvkit/CNVkitScatter.svelte",
+        "report": "file://../reports/cnvkit/CNVkitDiagram.svelte",
         "report_paging": 10,
     }

biopipen/ns/cnvkit_pipeline.py CHANGED Viewed

@@ -276,7 +276,10 @@ class CNVkitPipeline(ProcGroup):
         """Build CNVkitGuessBaits process"""
         from .cnvkit import CNVkitGuessBaits
-        if not self.opts.guessbaits and not is_loading_pipeline():
+        if (
+            not self.opts.guessbaits and
+            not is_loading_pipeline("-h", "-h+", "--help", "--help+")
+        ):
             return None
         def _guess_baits_bams(ch):
@@ -487,7 +490,8 @@ class CNVkitPipeline(ProcGroup):
                 target_file = None
                 antitarget_file = None
                 if self.col.sex in metadf:
-                    sample_sex = ",".join(metadf[self.col.sex][control_masks])
+                    all_sex = metadf[self.col.sex][control_masks].unique()
+                    sample_sex = [None] if len(all_sex) > 1 else all_sex[0]
                 else:
                     sample_sex = [None]
             else:
@@ -774,13 +778,15 @@ class CNVkitPipeline(ProcGroup):
             else:
                 tumor_masks = metadf[self.col.group] == self.opts.case
+            if self.col.sex in metadf:
+                all_sex = metadf[self.col.sex][tumor_masks].unique()
+                sample_sex = [None] if len(all_sex) > 1 else all_sex[0]
+            else:
+                sample_sex = [None]
             return tibble(
                 segfiles=[ch2.outfile.tolist()],
-                sample_sex=(
-                    ",".join(metadf[self.col.sex][tumor_masks])
-                    if self.col.sex in metadf
-                    else [None]
-                ),
+                sample_sex=sample_sex,
             )
         @annotate.format_doc(indent=3)
@@ -823,13 +829,15 @@ class CNVkitPipeline(ProcGroup):
             else:
                 tumor_masks = metadf[self.col.group] == self.opts.case
+            if self.col.sex in metadf:
+                all_sex = metadf[self.col.sex][tumor_masks].unique()
+                sample_sex = [None] if len(all_sex) > 1 else all_sex[0]
+            else:
+                sample_sex = [None]
             return tibble(
                 segfiles=[ch2.outfile.tolist()],
-                sample_sex=(
-                    ",".join(metadf[self.col.sex][tumor_masks])
-                    if self.col.sex in metadf
-                    else [None]
-                ),
+                sample_sex=sample_sex,
             )
         @annotate.format_doc(indent=3)

biopipen/ns/delim.py CHANGED Viewed

@@ -51,6 +51,10 @@ class SampleInfo(Proc):
     Output:
         outfile: The output file with sample information, with mutated columns
             if `envs.save_mutated` is True.
+            The basename of the output file will be the same as the input file.
+            The file name of each plot will be slugified from the case name.
+            Each plot has 3 formats: pdf, png and code.zip, which contains the
+            data and R code to reproduce the plot.
     Envs:
         sep: The separator of the input file.
@@ -76,37 +80,34 @@ class SampleInfo(Proc):
                 If `FALSE`, you can mutate the meta data frame with the
                 returned ids. Non-paired ids will be `NA`.
         save_mutated (flag): Whether to save the mutated columns.
-        exclude_cols: The columns to exclude in the table in the report.
+        exclude_cols (auto): The columns to exclude in the table in the report.
             Could be a list or a string separated by comma.
         defaults (ns): The default parameters for `envs.stats`.
-            - on: The column name in the data for the stats.
-                Default is `Sample`. The column could be either continuous or not.
-            - distinct: The column name in the data for the distinct records.
-                For example, you may have multiple `Sample`s for each patient.
-                In this case, you can set `distinct` to `Patient` to get the
-                stats for each patient, instead of each sample with duplicated
-                values. Default is `None`, which means all records are distinct.
-                Note that when `distinct` is provided, your `group` and `each` should
-                be the same for each distinct record. For example, it doesn't make
-                sense if you are doing statistics for each patient (`on = "Sample"`),
-                but your `group` is `SampleSource`, defining the source of each
-                sample.
-            - group: The column name in the data for the group ids.
-                If not provided, all records will be regarded as one group.
-            - na_group (flag): Whether to include `NA`s in the group.
-            - each: The column in the data to split the analysis in different
-                plots.
-            - ncol (type=int): The number of columns in the plot when `each`
-                is not `NULL`. Default is 2.
-            - na_each (flag): Whether to include `NA`s in the `each` column.
-            - plot: Type of plot. If `on` is continuous, it could be
-                `boxplot` (default), `violin`, `violin+boxplot` or `histogram`.
-                If `on` is not continuous, it could be `barplot` or
-                `pie` (default).
+            - plot_type: The type of the plot.
+                See the supported plot types here:
+                <https://pwwang.github.io/plotthis/reference/index.html>
+                The plot_type should be lower case; it is mapped to the corresponding
+                plot function in `plotthis`. The mapping from plot_type to the
+                plot function is like `bar -> BarPlot`, `box -> BoxPlot`, etc.
+            - more_formats (list): The additional formats to save the plot.
+                By default, the plot will be saved in png, which is also used to
+                display in the report. You can add more formats to save the plot.
+                For example, `more_formats = ["pdf", "svg"]`.
+            - save_code (flag): Whether to save the R code to reproduce the plot.
+                The data used to plot will also be saved.
+            - subset: An expression to subset the data frame before plotting.
+                The expression should be a string of R expression that will be passed
+                to `dplyr::filter`. For example, `subset = "Sample == 'A'"`.
+            - section: The section name in the report.
+                In case you want to group the plots in the report.
             - devpars (ns): The device parameters for the plot.
                 - width (type=int): The width of the plot.
                 - height (type=int): The height of the plot.
                 - res (type=int): The resolution of the plot.
+            - descr: The description of the plot, shown in the report.
+            - <more>: You can add more parameters to the defaults.
+                These parameters will be expanded to the `envs.stats` for each case,
+                and passed to individual plot functions.
         stats (type=json): The statistics to perform.
             The keys are the case names and the values are the parameters
             inherited from `envs.defaults`.
@@ -119,18 +120,16 @@ class SampleInfo(Proc):
         "save_mutated": False,
         "exclude_cols": None,
         "defaults": {
-            "on": "Sample",
-            "distinct": None,
-            "group": None,
-            "na_group": False,
-            "each": None,
-            "ncol": 2,
-            "na_each": False,
-            "plot": None,
-            "devpars": {"width": 800, "height": 600, "res": 100},
+            "plot_type": "bar",
+            "more_formats": [],
+            "save_code": False,
+            "subset": None,
+            "section": None,
+            "descr": None,
+            "devpars": {"width": None, "height": None, "res": 100},
         },
         "stats": {},
     }
     lang = config.lang.rscript
     script = "file://../scripts/delim/SampleInfo.R"
-    plugin_opts = {"report": "file://../reports/delim/SampleInfo.svelte"}
+    plugin_opts = {"report": "file://../reports/common.svelte"}

biopipen/ns/gene.py CHANGED Viewed

@@ -9,46 +9,91 @@ class GeneNameConversion(Proc):
     Input:
         infile: The input file with original gene names
+            It should be a tab-separated file with header
     Output:
         outfile: The output file with converted gene names
     Envs:
-        inopts: Options to read `in.infile` for `pandas.read_csv()`
-            See https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
-        outopts: Options to write `out.outfile` for `pandas.to_csv()`
-            See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
-        notfound: What to do if a conversion cannot be done.
-            use-query: Ignore the conversion and use the original name
-            skip: Ignore the conversion and skip the entire row in input file
-            error: Report error
-        genecol: The index (0-based) or name of the column where
-            genes are present
-        output: How to output
-            keep: Keep the original name column and add new converted columns
-            drop: Drop the original name column, and add the converted names
-            replace: Drop the original name column, and insert
-                the converted names at the original position
-            only: Only keep the query and the converted name columns
+        notfound (choice): What to do if a conversion cannot be done.
+            - use-query: Ignore the conversion and use the original name
+            - skip: Ignore the conversion and skip the entire row in input file
+            - ignore: Same as skip
+            - error: Report error
+            - na: Use NA
+        dup (choice): What to do if a conversion results in multiple names.
+            - first: Use the first name, sorted by matching score in descending order (default)
+            - last: Use the last name, sorted by matching score in descending order
+            - combine: Combine all names using `;` as separator
+        genecol: The index (1-based) or name of the column where genes are present
+        output (choice): How to output.
+            - append: Add the converted names as new columns at the end using `envs.outfmt`
+                as the column name.
+            - replace: Drop the original name column, and insert
+                the converted names at the original position.
+            - converted: Only keep the converted names.
+            - with-query: Output 2 columns with original and converted names.
         infmt: What's the original gene name format
             Available fields
             https://docs.mygene.info/en/latest/doc/query_service.html#available-fields
-        outfmt: What's the target gene name format
+        outfmt: What's the target gene name format. Currently only a single format
+            is supported.
         species: Limit gene query to certain species.
             Supported: human, mouse, rat, fruitfly, nematode, zebrafish,
             thale-cress, frog and pig
     """  # noqa: E501
     input = "infile:file"
     output = "outfile:file:{{in.infile | basename}}"
-    lang = config.lang.python
+    lang = config.lang.rscript
     envs = {
-        "inopts": {"sep": "\t", "index_col": False},
-        "outopts": {"sep": "\t", "index": False},
         "notfound": "error",
-        "genecol": 0,
-        "output": "keep",
+        "genecol": 1,
+        "dup": "first",
+        "output": "append",
         "infmt": ["symbol", "alias"],
         "outfmt": "symbol",
         "species": "human",
     }
-    script = "file://../scripts/gene/GeneNameConversion.py"
+    script = "file://../scripts/gene/GeneNameConversion.R"
+class GenePromoters(Proc):
+    """Get gene promoter regions by specifying the flanking regions of TSS
+    Input:
+        infile: The input file with gene ids/names
+    Output:
+        outfile: The output file with promoter regions in BED format
+    Envs:
+        up (type=int): The upstream distance from TSS
+        down (type=int): The downstream distance from TSS
+            If not specified, the default is `envs.up`
+        notfound (choice): What to do if a gene is not found.
+            - skip: Skip the gene
+            - error: Report error
+        refgene: The reference gene annotation file in GTF format
+        header (flag): Whether the input file has a header
+        genecol (type=int): The index (1-based) of the gene column
+        match_id (flag): Should we match the genes in `in.infile` by `gene_id`
+            instead of `gene_name` in `envs.refgene`
+        sort (flag): Sort the output by chromosome and start position
+        chrsize: The chromosome size file, from which the chromosome order is
+            used to sort the output
+    """
+    input = "infile:file"
+    output = "outfile:file:{{in.infile | stem}}-promoters.bed"
+    lang = config.lang.rscript
+    envs = {
+        "up": 2000,
+        "down": None,
+        "notfound": "error",
+        "refgene": config.ref.refgene,
+        "header": True,
+        "genecol": 1,
+        "match_id": False,
+        "sort": False,
+        "chrsize": config.ref.chrsize,
+    }
+    script = "file://../scripts/gene/GenePromoters.R"

biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl

biopipen 0.21.0py3-none-any.whl → 0.34.26py3-none-any.whl