PyPI - biopipen - Versions diffs - 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl - Mend

biopipen-0.21.0-py3-none-any.whl → biopipen-0.34.26-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (290) hide show

biopipen/__init__.py +1 -1
biopipen/core/config.toml +28 -0
biopipen/core/filters.py +79 -4
biopipen/core/proc.py +12 -3
biopipen/core/testing.py +75 -3
biopipen/ns/bam.py +148 -6
biopipen/ns/bed.py +75 -0
biopipen/ns/cellranger.py +186 -0
biopipen/ns/cellranger_pipeline.py +126 -0
biopipen/ns/cnv.py +19 -3
biopipen/ns/cnvkit.py +1 -1
biopipen/ns/cnvkit_pipeline.py +20 -12
biopipen/ns/delim.py +34 -35
biopipen/ns/gene.py +68 -23
biopipen/ns/gsea.py +63 -37
biopipen/ns/misc.py +39 -14
biopipen/ns/plot.py +304 -1
biopipen/ns/protein.py +183 -0
biopipen/ns/regulatory.py +290 -0
biopipen/ns/rnaseq.py +142 -5
biopipen/ns/scrna.py +2053 -473
biopipen/ns/scrna_metabolic_landscape.py +228 -382
biopipen/ns/snp.py +659 -0
biopipen/ns/stats.py +484 -0
biopipen/ns/tcr.py +683 -98
biopipen/ns/vcf.py +236 -2
biopipen/ns/web.py +97 -6
biopipen/reports/bam/CNVpytor.svelte +4 -9
biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
biopipen/reports/common.svelte +15 -0
biopipen/reports/protein/ProdigySummary.svelte +16 -0
biopipen/reports/scrna/CellsDistribution.svelte +4 -39
biopipen/reports/scrna/DimPlots.svelte +1 -1
biopipen/reports/scrna/MarkersFinder.svelte +6 -126
biopipen/reports/scrna/MetaMarkers.svelte +3 -75
biopipen/reports/scrna/RadarPlots.svelte +4 -20
biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
biopipen/reports/snp/PlinkCallRate.svelte +24 -0
biopipen/reports/snp/PlinkFreq.svelte +18 -0
biopipen/reports/snp/PlinkHWE.svelte +18 -0
biopipen/reports/snp/PlinkHet.svelte +18 -0
biopipen/reports/snp/PlinkIBD.svelte +18 -0
biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
biopipen/reports/tcr/ClonalStats.svelte +16 -0
biopipen/reports/tcr/CloneResidency.svelte +3 -93
biopipen/reports/tcr/Immunarch.svelte +4 -155
biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
biopipen/reports/tcr/TESSA.svelte +11 -28
biopipen/reports/utils/misc.liq +22 -7
biopipen/scripts/bam/BamMerge.py +11 -15
biopipen/scripts/bam/BamSampling.py +90 -0
biopipen/scripts/bam/BamSort.py +141 -0
biopipen/scripts/bam/BamSplitChroms.py +10 -10
biopipen/scripts/bam/BamSubsetByBed.py +38 -0
biopipen/scripts/bam/CNAClinic.R +41 -5
biopipen/scripts/bam/CNVpytor.py +153 -54
biopipen/scripts/bam/ControlFREEC.py +13 -14
biopipen/scripts/bam/SamtoolsView.py +33 -0
biopipen/scripts/bed/Bed2Vcf.py +5 -5
biopipen/scripts/bed/BedConsensus.py +5 -5
biopipen/scripts/bed/BedLiftOver.sh +6 -4
biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
biopipen/scripts/bed/BedtoolsMerge.py +4 -4
biopipen/scripts/cellranger/CellRangerCount.py +138 -0
biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
biopipen/scripts/cnv/AneuploidyScore.R +55 -20
biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
biopipen/scripts/cnv/TMADScore.R +25 -9
biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
biopipen/scripts/cnvkit/guess_baits.py +166 -93
biopipen/scripts/delim/RowsBinder.R +1 -1
biopipen/scripts/delim/SampleInfo.R +116 -118
biopipen/scripts/gene/GeneNameConversion.R +67 -0
biopipen/scripts/gene/GenePromoters.R +61 -0
biopipen/scripts/gsea/Enrichr.R +5 -5
biopipen/scripts/gsea/FGSEA.R +184 -50
biopipen/scripts/gsea/GSEA.R +2 -2
biopipen/scripts/gsea/PreRank.R +5 -5
biopipen/scripts/misc/Config2File.py +2 -2
biopipen/scripts/misc/Plot.R +80 -0
biopipen/scripts/misc/Shell.sh +15 -0
biopipen/scripts/misc/Str2File.py +2 -2
biopipen/scripts/plot/Heatmap.R +3 -3
biopipen/scripts/plot/Manhattan.R +147 -0
biopipen/scripts/plot/QQPlot.R +146 -0
biopipen/scripts/plot/ROC.R +88 -0
biopipen/scripts/plot/Scatter.R +112 -0
biopipen/scripts/plot/VennDiagram.R +5 -9
biopipen/scripts/protein/MMCIF2PDB.py +33 -0
biopipen/scripts/protein/PDB2Fasta.py +60 -0
biopipen/scripts/protein/Prodigy.py +119 -0
biopipen/scripts/protein/ProdigySummary.R +140 -0
biopipen/scripts/protein/RMSD.py +178 -0
biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
biopipen/scripts/regulatory/MotifScan.py +159 -0
biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
biopipen/scripts/regulatory/motifs-common.R +324 -0
biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
biopipen/scripts/rnaseq/Simulation.R +21 -0
biopipen/scripts/rnaseq/UnitConversion.R +325 -54
biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
biopipen/scripts/scrna/CellCellCommunication.py +150 -0
biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
biopipen/scripts/scrna/CellSNPLite.py +30 -0
biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
biopipen/scripts/scrna/CellsDistribution.R +456 -167
biopipen/scripts/scrna/DimPlots.R +1 -1
biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
biopipen/scripts/scrna/ExprImputation.R +7 -0
biopipen/scripts/scrna/LoomTo10X.R +51 -0
biopipen/scripts/scrna/MQuad.py +25 -0
biopipen/scripts/scrna/MarkersFinder.R +679 -400
biopipen/scripts/scrna/MetaMarkers.R +265 -161
biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
biopipen/scripts/scrna/RadarPlots.R +355 -134
biopipen/scripts/scrna/ScFGSEA.R +298 -100
biopipen/scripts/scrna/ScSimulation.R +65 -0
biopipen/scripts/scrna/ScVelo.py +617 -0
biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
biopipen/scripts/scrna/SeuratClustering.R +36 -233
biopipen/scripts/scrna/SeuratLoading.R +2 -2
biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
biopipen/scripts/scrna/SeuratPreparing.R +223 -173
biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
biopipen/scripts/scrna/SeuratTo10X.R +27 -0
biopipen/scripts/scrna/Slingshot.R +65 -0
biopipen/scripts/scrna/Subset10X.R +2 -2
biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
biopipen/scripts/scrna/scvelo_paga.py +313 -0
biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
biopipen/scripts/snp/MatrixEQTL.R +217 -0
biopipen/scripts/snp/Plink2GTMat.py +148 -0
biopipen/scripts/snp/PlinkCallRate.R +199 -0
biopipen/scripts/snp/PlinkFilter.py +100 -0
biopipen/scripts/snp/PlinkFreq.R +291 -0
biopipen/scripts/snp/PlinkFromVcf.py +81 -0
biopipen/scripts/snp/PlinkHWE.R +85 -0
biopipen/scripts/snp/PlinkHet.R +96 -0
biopipen/scripts/snp/PlinkIBD.R +196 -0
biopipen/scripts/snp/PlinkSimulation.py +124 -0
biopipen/scripts/snp/PlinkUpdateName.py +124 -0
biopipen/scripts/stats/ChowTest.R +146 -0
biopipen/scripts/stats/DiffCoexpr.R +152 -0
biopipen/scripts/stats/LiquidAssoc.R +135 -0
biopipen/scripts/stats/Mediation.R +108 -0
biopipen/scripts/stats/MetaPvalue.R +130 -0
biopipen/scripts/stats/MetaPvalue1.R +74 -0
biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
biopipen/scripts/tcr/Attach2Seurat.R +3 -2
biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
biopipen/scripts/tcr/CDR3Clustering.R +343 -0
biopipen/scripts/tcr/ClonalStats.R +526 -0
biopipen/scripts/tcr/CloneResidency.R +255 -131
biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
biopipen/scripts/tcr/GIANA/query.py +164 -162
biopipen/scripts/tcr/Immunarch-basic.R +31 -9
biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
biopipen/scripts/tcr/Immunarch.R +63 -11
biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
biopipen/scripts/tcr/SampleDiversity.R +1 -1
biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
biopipen/scripts/tcr/ScRepLoading.R +166 -0
biopipen/scripts/tcr/TCRClusterStats.R +176 -22
biopipen/scripts/tcr/TCRDock.py +110 -0
biopipen/scripts/tcr/TESSA.R +102 -118
biopipen/scripts/tcr/VJUsage.R +5 -5
biopipen/scripts/tcr/immunarch-patched.R +142 -0
biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
biopipen/scripts/vcf/BcftoolsSort.py +113 -0
biopipen/scripts/vcf/BcftoolsView.py +73 -0
biopipen/scripts/vcf/TruvariBench.sh +14 -7
biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
biopipen/scripts/vcf/TruvariConsistency.R +1 -1
biopipen/scripts/vcf/Vcf2Bed.py +2 -2
biopipen/scripts/vcf/VcfAnno.py +11 -11
biopipen/scripts/vcf/VcfDownSample.sh +22 -10
biopipen/scripts/vcf/VcfFilter.py +5 -5
biopipen/scripts/vcf/VcfFix.py +7 -7
biopipen/scripts/vcf/VcfFix_utils.py +13 -4
biopipen/scripts/vcf/VcfIndex.py +3 -3
biopipen/scripts/vcf/VcfIntersect.py +3 -3
biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
biopipen/scripts/vcf/bcftools_utils.py +52 -0
biopipen/scripts/web/Download.py +8 -4
biopipen/scripts/web/DownloadList.py +5 -5
biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
biopipen/scripts/web/gcloud_common.py +49 -0
biopipen/utils/gene.py +108 -60
biopipen/utils/misc.py +146 -20
biopipen/utils/reference.py +64 -20
biopipen/utils/reporter.py +177 -0
biopipen/utils/vcf.py +1 -1
biopipen-0.34.26.dist-info/METADATA +27 -0
biopipen-0.34.26.dist-info/RECORD +292 -0
{biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
{biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
biopipen/ns/bcftools.py +0 -111
biopipen/ns/scrna_basic.py +0 -255
biopipen/reports/delim/SampleInfo.svelte +0 -36
biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
biopipen/reports/scrna/ScFGSEA.svelte +0 -35
biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
biopipen/reports/utils/gsea.liq +0 -110
biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
biopipen/scripts/gene/GeneNameConversion.py +0 -66
biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
biopipen/scripts/scrna/ExprImpution.R +0 -7
biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
biopipen/scripts/scrna/Write10X.R +0 -11
biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
biopipen/scripts/tcr/TCRClustering.R +0 -280
biopipen/utils/common_docstrs.py +0 -61
biopipen/utils/gene.R +0 -49
biopipen/utils/gsea.R +0 -193
biopipen/utils/io.R +0 -20
biopipen/utils/misc.R +0 -114
biopipen/utils/mutate_helpers.R +0 -433
biopipen/utils/plot.R +0 -173
biopipen/utils/rnaseq.R +0 -48
biopipen/utils/single_cell.R +0 -115
biopipen-0.21.0.dist-info/METADATA +0 -22
biopipen-0.21.0.dist-info/RECORD +0 -218

biopipen/scripts/cellranger/CellRangerCount.py ADDED Viewed

@@ -0,0 +1,138 @@
+from contextlib import suppress
+import hashlib
+import shutil
+import re
+from pathlib import Path, PosixPath  # noqa: F401
+from biopipen.utils.misc import run_command
+fastqs: list[Path] = {{in.fastqs | each: as_path}}  # pyright: ignore  # noqa
+outdir: Path = Path({{out.outdir | quote}})  # pyright: ignore
+id: str = {{out.outdir | basename | quote}}  # pyright: ignore
+cellranger = {{envs.cellranger | quote}}  # pyright: ignore
+tmpdir = Path({{envs.tmpdir | quote}})  # pyright: ignore
+ref: str = {{envs.ref | quote}}  # pyright: ignore
+ncores = {{envs.ncores | int}}  # pyright: ignore
+include_introns = {{envs.include_introns | repr}}  # pyright: ignore
+create_bam = {{envs.create_bam | repr}}  # pyright: ignore
+outdir_is_mounted: bool = {{envs.outdir_is_mounted | repr}}  # pyright: ignore
+copy_outs_only: bool = {{envs.copy_outs_only | repr}}  # pyright: ignore
+ref: Path = Path(ref).resolve()  # pyright: ignore
+if not ref.exists():
+    raise FileNotFoundError(f"Reference path does not exist: {ref}")
+include_introns = str(include_introns).lower()
+create_bam = str(create_bam).lower()
+# create a temporary unique directory to store the soft-linked fastq files
+uid = hashlib.md5(str(fastqs).encode()).hexdigest()[:8]
+fastqdir = tmpdir / f"cellranger_count_{uid}"
+fastqdir.mkdir(parents=True, exist_ok=True)
+if len(fastqs) == 1 and fastqs[0].is_dir():
+    fastqs = list(fastqs[0].glob("*.fastq.gz"))
+# soft-link the fastq files to the temporary directory
+for fastq in fastqs:
+    fastq = Path(fastq)
+    fqnames = re.split(r"(_S\d+_)", fastq.name)
+    if len(fqnames) != 3:
+        raise ValueError(
+            fr"Expect one and only one '_S\d+_' in fastq file name: {fastq.name}"
+        )
+    linked = fastqdir / f"{id}{fqnames[1]}{fqnames[2]}"
+    if linked.exists():
+        linked.unlink()
+    linked.symlink_to(fastq)
+other_args = {{envs | dict_to_cli_args: dashify=True, exclude=['no_bam', 'create_bam', 'include_introns', 'cellranger', 'transcriptome', 'ref', 'tmpdir', 'id', 'ncores', 'outdir_is_mounted', 'copy_outs_only']}}  # pyright: ignore
+command = [
+    cellranger,
+    "count",
+    "--id",
+    id,
+    "--fastqs",
+    fastqdir,
+    "--transcriptome",
+    str(ref),
+    "--localcores",
+    ncores,
+    "--disable-ui",
+    "--include-introns",
+    include_introns,
+    *other_args,
+]
+# check cellranger version
+#   cellranger cellranger-7.2.0
+version: str = run_command([cellranger, "--version"], stdout = "RETURN")  # type: ignore
+version = version.replace("cellranger", "").replace("-", "").strip()  # type: ignore
+print(f"# Detected cellranger version: {version}")
+version: list[int] = list(map(int, version.split(".")))  # type: ignore
+if version[0] >= 8:
+    command += ["--create-bam", create_bam]
+elif create_bam != "true":
+    command += ["--no-bam"]
+if outdir_is_mounted:
+    print("# Using mounted outdir, redirecting cellranger output to a local tmpdir")
+    local_outdir = tmpdir / f"{outdir.name}-{uid}" / id
+    if local_outdir.parent.exists():
+        shutil.rmtree(local_outdir.parent)
+    local_outdir.parent.mkdir(parents=True, exist_ok=True)
+    odir = local_outdir
+else:
+    odir = outdir
+run_command(command, fg=True, cwd=str(odir.parent))
+web_summary_html = odir / "outs" / "web_summary.html"
+if not web_summary_html.exists():
+    raise RuntimeError(
+        f"web_summary.html does not exist in {odir}/outs. "
+        "cellranger count failed."
+    )
+# Modify web_summary.html to move javascript to a separate file
+# to void vscode live server breaking the page by injecting some code
+print("# Modify web_summary.html to move javascript to a separate file")
+try:
+    web_summary_js = odir / "outs" / "web_summary.js"
+    web_summary_content = web_summary_html.read_text()
+    regex = re.compile(r"<script>(.+)</script>", re.DOTALL)
+    web_summary_html.write_text(regex.sub(
+        '<script src="web_summary.js"></script>',
+        web_summary_content,
+    ))
+    web_summary_js.write_text(regex.search(web_summary_content).group(1))  # type: ignore
+except Exception as e:
+    print(f"Error modifying web_summary.html: {e}")
+    raise e
+# If using local tmpdir for output, move results to the final outdir
+if outdir_is_mounted:
+    print("# Copy results back to outdir")
+    if outdir.exists():
+        shutil.rmtree(outdir)
+    if copy_outs_only:
+        outdir.mkdir(parents=True, exist_ok=True)
+        with suppress(Exception):
+            # Some files may be failed to copy due to permission issues
+            # But the contents are actually copied
+            shutil.copytree(odir / "outs", outdir / "outs")
+    else:
+        with suppress(Exception):
+            shutil.copytree(local_outdir, outdir)  # type: ignore
+    # Make sure essential files exist
+    web_summary_html = outdir / "outs" / "web_summary.html"
+    web_summary_js = outdir / "outs" / "web_summary.js"
+    for f in [web_summary_html, web_summary_js]:
+        if not f.exists():
+            raise RuntimeError(
+                f"{f} does not exist in {outdir}/outs. "
+                "Copying results back from tmpdir failed."
+            )

biopipen/scripts/cellranger/CellRangerSummary.R ADDED Viewed

@@ -0,0 +1,181 @@
+# Templated job script: merges the metrics_summary.csv of several CellRanger
+# runs (count or vdj) into one table, plots each metric by sample (and by
+# group, when groups are given) and registers the results in a report.
+library(rlang)
+library(dplyr)
+library(plotthis)
+library(biopipen.utils)
+indirs <- {{in.indirs | r}}
+outdir <- {{out.outdir | r}}
+joboutdir <- {{job.outdir | r}}
+group <- {{envs.group | r}}
+logger <- get_logger()
+reporter <- get_reporter()
+# Normalize `group` into a data frame with columns Sample and Group:
+#   - a string is read as a header-less CSV file
+#   - a list maps each group name to its samples
+#   - NULL means no grouping
+if (is.character(group)) {
+    group <- read.csv(group, header = FALSE, row.names = NULL)
+    colnames(group) <- c("Sample", "Group")
+} else if (is.list(group)) {
+    group <- do_call(
+        rbind,
+        lapply(names(group), function(n) data.frame(Sample = group[[n]], Group = n))
+    )
+} else if (!is.null(group)) {
+    stop(paste0("Invalid group: ", paste0(group, collapse = ", ")))
+}
+cellranger_type <- NULL
+logger$info("Reading and merging metrics for each sample ...")
+metrics <- NULL
+for (indir in indirs) {
+    # The sample name is the basename of the CellRanger output directory
+    sample <- basename(indir)
+    logger$debug("- Reading metrics for sample: ", sample)
+    metric <- read.csv(
+        file.path(indir, "outs", "metrics_summary.csv"),
+        header = TRUE, row.names = NULL, check.names = FALSE)
+    metric$Sample <- sample
+    # A clonotypes.csv in the output marks the run as a vdj run
+    sample_cellranger_type <- ifelse(
+        file.exists(file.path(indir, "outs", "clonotypes.csv")),
+        "vdj",
+        "count"  # support more types in the future
+    )
+    # All samples must be of the same type as the first one
+    cellranger_type <- cellranger_type %||% sample_cellranger_type
+    if (cellranger_type != sample_cellranger_type) {
+        stop("Multiple types of CellRanger output detected. Should be either count or vdj.")
+    }
+    # Align the columns on both sides with NAs so that rbind() works
+    if (!is.null(metrics)) {
+        missing_cols <- setdiff(colnames(metrics), colnames(metric))
+        if (length(missing_cols) > 0) {
+            logger$warn('- Missing columns: {paste0(missing_cols, collapse = ", ")} in sample: {sample}')
+            metric[missing_cols] <- NA
+        }
+        missing_cols <- setdiff(colnames(metric), colnames(metrics))
+        if (length(missing_cols) > 0) {
+            logger$warn('- Missing columns: {paste0(missing_cols, collapse = ", ")} in samples before {sample}')
+            metrics[missing_cols] <- NA
+        }
+    }
+    metrics <- rbind(metrics, metric)
+}
+if (is.null(metrics)) {
+    stop("No samples found, check the input directories.")
+}
+# A column is a percentage column if any of its values carries a "%".
+# Checking only the first row would miss columns that are NA for the first
+# sample (filled in above), and their values would then be turned into NA
+# by the numeric conversion below.
+percent_columns <- sapply(colnames(metrics), function(x) {
+    x != "Sample" && is.character(metrics[[x]]) && any(grepl("%", metrics[[x]], fixed = TRUE))
+})
+percent_columns <- colnames(metrics)[percent_columns]
+# Remove %, mark the percentage columns in their names, then drop the
+# thousands separators and convert everything but Sample to numeric
+metrics <- metrics %>%
+    mutate(across(all_of(percent_columns), ~as.numeric(gsub("%", "", .x)))) %>%
+    rename_with(.fn = function(x) { paste0(x, " (%)") }, .cols = all_of(percent_columns)) %>%
+    mutate(across(-Sample, ~as.numeric(gsub(",", "", .x))))
+# Save metrics
+write.table(
+    metrics,
+    file.path(outdir, "metrics.txt"),
+    sep = "\t",
+    quote = FALSE,
+    row.names = FALSE
+)
+reporter$add(
+    list(kind = "descr", content = "Metrics for all samples"),
+    list(kind = "table", src = file.path(outdir, "metrics.txt")),
+    h1 = "Metrics of all samples"
+)
+# Descriptions shown in the report, keyed by the metric name
+# (without the " (%)" suffix added above)
+if (cellranger_type == "vdj") {
+    METRIC_DESCR <- list(
+        `Estimated Number of Cells` = "The number of barcodes estimated to correspond to GEMs containing cells. See VDJ Cell Calling Algorithm.",
+        `Mean Read Pairs per Cell` = "Number of input read pairs divided by the estimated number of cells.",
+        `Number of Cells With Productive V-J Spanning Pair` = "Number of cell barcodes for which at least one productive sequence was found for each of TRA and TRB (or heavy and light chains, for Ig).",
+        `Number of Read Pairs` = "Total number of read pairs that were assigned to this library in demultiplexing.",
+        `Valid Barcodes` = "Fraction of reads with barcodes that match the whitelist after barcode correction.",
+        `Q30 Bases in Barcode` = "Fraction of cell barcode bases with Q-score greater than or equal to 30.",
+        `Q30 Bases in RNA Read 1` = "Fraction of read 1 bases with Q-score greater than or equal to 30. (Likewise for read 2.)",
+        `Q30 Bases in Sample Index` = "Fraction of sample index bases with Q-score greater than or equal to 30.",
+        `Q30 Bases in UMI` = "Fraction of UMI bases with Q-score ≥ 30.",
+        `Reads Mapped to Any V(D)J Gene` = "Fraction of reads that partially or wholly map to a V(D)J gene segment.",
+        `Reads Mapped to TRA` = "Fraction of reads that map partially or wholly to a TRA gene segment.",
+        `Mean Used Read Pairs per Cell` = "Mean number of read pairs used in assembly per cell barcode. These reads must have a cell barcode, map to a V(D)J gene, and have a UMI with sufficient read support, counted after subsampling.",
+        `Fraction Reads in Cells` = "Number of reads with cell barcodes divided by the number of reads with valid barcodes.",
+        `Median TRA UMIs per Cell` = "Median number of UMIs assigned to a TRA contig per cell.",
+        `Paired Clonotype Diversity` = "Effective diversity of the paired clonotypes, computed as the Inverse Simpson Index of the clonotype frequencies. A value of 1 indicates a minimally diverse sample - only one distinct clonotype was detected. A value equal to the estimated number of cells indicates a maximally diverse sample.",
+        `Cells With TRA Contig` = "Fraction of cell barcodes with at least one TRA contig annotated as a full or partial V(D)J gene.",
+        `Cells With CDR3-annotated TRA Contig` = "Fraction of cell barcodes with at least one TRA contig where a CDR3 was detected.",
+        `Cells With V-J Spanning Contig` = "Fraction of cell barcodes with at least one full-length contig.",
+        `Cells With V-J Spanning TRA Contig` = "Fraction of cell barcodes with at least one full-length TRA contig.",
+        `Cells With Productive TRA Contig` = "Fraction of cell barcodes with at least one full-length TRA contig that is productive.",
+        `Cells With Productive V-J Spanning Pair` = "Fraction of cell barcodes with at least one contig for each chain of the receptor pair that is productive."
+    )
+} else {
+    METRIC_DESCR <- list(
+        `Estimated Number of Cells` = "The number of barcodes associated with cell-containing partitions, estimated from the barcode UMI count distribution.",
+        `Mean Reads per Cell` = "The total number of reads divided by the estimated number of cells.",
+        `Median Genes per Cell` = "The median number of genes detected per cell-associated barcode. Detection is defined as the presence of at least 1 UMI count.",
+        `Number of Reads` = "Total number of sequencing reads.",
+        `Valid Barcodes` = "Fraction of reads with cell-barcodes that match the whitelist.",
+        `Sequencing Saturation` = 'Fraction of reads originating from an already-observed UMI. This is a function of library complexity and sequencing depth. More specifically, this is a ratio where: the denominator is the number of confidently-mapped reads with a valid cell-barcode and valid UMI, and the numerator is the subset of those reads that had a non-unique combination of (cell-barcode, UMI, gene). This metric was called "cDNA PCR Duplication" in versions of Cell Ranger prior to 1.2.',
+        `Q30 Bases in Barcode` = "Fraction of bases with Q-score at least 30 in the cell barcode sequences. This is the i7 index (I1) read for the Single Cell 3' v1 chemistry and the R1 read for the Single Cell 3' v2 chemistry.",
+        `Q30 Bases in RNA` = "Fraction of bases with Q-score at least 30 in the RNA read sequences. This is Illumina R1 for the Single Cell 3' v1 chemistry and Illumina R2 for the Single Cell 3' v2 chemistry.",
+        `Q30 Bases in UMI` = "Fraction of bases with Q-score at least 30 in the UMI sequences. This is the R2 read for the Single Cell 3' v1 chemistry and the R1 read for the Single Cell 3' v2 chemistry.",
+        `Reads Mapped to Genome` = "Fraction of reads that mapped to the genome.",
+        `Reads Mapped Confidently to Genome` = "Fraction of reads that mapped uniquely to the genome. If a read mapped to exonic loci from a single gene and also to non-exonic loci, it is considered uniquely mapped to one of the exonic loci.",
+        `Reads Mapped Confidently to Intergenic Regions` = "Fraction of reads that mapped to the intergenic regions of the genome with a high mapping quality score as reported by the aligner.",
+        `Reads Mapped Confidently to Intronic Regions` = "Fraction of reads that mapped to the intronic regions of the genome with a high mapping quality score as reported by the aligner.",
+        `Reads Mapped Confidently to Exonic Regions` = "Fraction of reads that mapped to the exonic regions of the genome with a high mapping quality score as reported by the aligner.",
+        `Reads Mapped Confidently to Transcriptome` = "Fraction of reads that mapped to a unique gene in the transcriptome with a high mapping quality score as reported by the aligner. The read must be consistent with annotated splice junctions when include-introns=false. These reads are considered for UMI counting.",
+        `Reads Confidently Mapped Antisense` = "Fraction of reads confidently mapped to the transcriptome, but on the opposite strand of their annotated gene. A read is counted as antisense if it has any alignments that are consistent with an exon of a transcript but antisense to it, and has no sense alignments.",
+        `Total Genes Detected` = "The number of genes with at least one UMI count in any cell.",
+        `Median UMI Counts per Cell` = "The median number of UMI counts per cell-associated barcode."
+    )
+}
+logger$info("Plotting metrics ...")
+for (metric in colnames(metrics)) {
+    if (metric == "Sample") { next }
+    # Look the description up by the original metric name
+    metric_name <- sub(" \\(%\\)$", "", metric)
+    logger$info("- {metric_name}")
+    reporter$add(
+        list(
+            kind = "descr",
+            content = METRIC_DESCR[[metric_name]] %||% paste0("Metric: ", metric)
+        ),
+        h1 = metric
+    )
+    # barplot
+    p <- BarPlot(metrics, x = "Sample", y = metric, x_text_angle = 90)
+    figfile <- file.path(outdir, paste0(slugify(metric), ".barplot.png"))
+    # The figure gets wider with the number of samples
+    png(figfile, height = 600, res = 100, width = max(nrow(metrics) * 30 + 200, 400))
+    print(p)
+    dev.off()
+    reporter$add(
+        list(src = figfile, name = "By Sample"),
+        ui = "table_of_images",
+        h1 = metric
+    )
+    if (is.null(group)) { next }
+    # boxplot, if group is provided
+    # group: Sample, Group
+    pdata <- group %>%
+        left_join(metrics, by = "Sample") %>%
+        mutate(Group = factor(Group, levels = unique(Group)))
+    p <- BoxPlot(pdata, x = "Group", y = metric, x_text_angle = 90)
+    figfile <- file.path(outdir, paste0(slugify(metric), ".boxplot.png"))
+    png(figfile, height = 600, res = 100, width = max(length(unique(pdata$Group)) * 30 + 200, 400))
+    print(p)
+    dev.off()
+    reporter$add(
+        list(src = figfile, name = "By Group"),
+        ui = "table_of_images",
+        h1 = metric
+    )
+}
+reporter$save(joboutdir)

biopipen/scripts/cellranger/CellRangerVdj.py ADDED Viewed

@@ -0,0 +1,112 @@
+import hashlib
+import shutil
+import re
+from contextlib import suppress
+from pathlib import Path, PosixPath  # noqa: F401
+from biopipen.utils.misc import run_command
+fastqs: list[Path] = {{in.fastqs | each: as_path}}  # pyright: ignore  # noqa
+outdir: Path = Path({{out.outdir | quote}})  # pyright: ignore
+id: str = {{out.outdir | basename | quote}}  # pyright: ignore
+cellranger: str = {{envs.cellranger | quote}}  # pyright: ignore
+tmpdir = Path({{envs.tmpdir | quote}})  # pyright: ignore
+ref: str = {{envs.ref | quote}}  # pyright: ignore
+ncores: int = {{envs.ncores | int}}  # pyright: ignore
+outdir_is_mounted: bool = {{envs.outdir_is_mounted | repr}}  # pyright: ignore
+copy_outs_only: bool = {{envs.copy_outs_only | repr}}  # pyright: ignore
+# create a temporary unique directory to store the soft-linked fastq files
+uid = hashlib.md5(str(fastqs).encode()).hexdigest()[:8]
+fastqdir = tmpdir / f"cellranger_count_{uid}"
+fastqdir.mkdir(parents=True, exist_ok=True)
+if len(fastqs) == 1 and fastqs[0].is_dir():
+    fastqs = list(fastqs[0].glob("*.fastq.gz"))
+# soft-link the fastq files to the temporary directory
+for fastq in fastqs:
+    fastq = Path(fastq)
+    (fastqdir / fastq.name).symlink_to(fastq)
+other_args = {{envs | dict_to_cli_args: dashify=True, exclude=['cellranger', 'reference', 'ref', 'tmpdir', 'id', 'ncores', 'outdir_is_mounted', 'copy_outs_only']}}  # pyright: ignore
+command = [
+    cellranger,
+    "vdj",
+    "--id",
+    id,
+    "--fastqs",
+    fastqdir,
+    "--reference",
+    Path(ref).resolve(),
+    "--localcores",
+    ncores,
+    "--disable-ui",
+    *other_args,
+]
+version: str = run_command([cellranger, "--version"], stdout = "RETURN")  # type: ignore
+version = version.replace("cellranger", "").replace("-", "").strip()  # type: ignore
+print(f"# Detected cellranger version: {version}")
+if outdir_is_mounted:
+    print("# Using mounted outdir, redirecting cellranger output to a local tmpdir")
+    local_outdir = tmpdir / f"{outdir.name}-{uid}" / id
+    if local_outdir.parent.exists():
+        shutil.rmtree(local_outdir.parent)
+    local_outdir.parent.mkdir(parents=True, exist_ok=True)
+    odir = local_outdir
+else:
+    odir = outdir
+run_command(command, fg=True, cwd=str(odir.parent))
+web_summary_html = odir / "outs" / "web_summary.html"
+if not web_summary_html.exists():
+    raise RuntimeError(
+        f"web_summary.html does not exist in {odir}/outs. "
+        "cellranger vdj failed."
+    )
+# Modify web_summary.html to move javascript to a separate file
+# to void vscode live server breaking the page by injecting some code
+print("# Modify web_summary.html to move javascript to a separate file")
+try:
+    web_summary_js = odir / "outs" / "web_summary.js"
+    web_summary_content = web_summary_html.read_text()
+    regex = re.compile(r"<script>(.+)</script>", re.DOTALL)
+    web_summary_html.write_text(regex.sub(
+        '<script src="web_summary.js"></script>',
+        web_summary_content,
+    ))
+    web_summary_js.write_text(regex.search(web_summary_content).group(1))  # type: ignore
+except Exception as e:
+    print(f"Error modifying web_summary.html: {e}")
+    raise e
+# If using local tmpdir for output, move results to the final outdir
+if outdir_is_mounted:
+    print("# Copy results back to outdir")
+    if outdir.exists():
+        shutil.rmtree(outdir)
+    if copy_outs_only:
+        outdir.mkdir(parents=True, exist_ok=True)
+        with suppress(Exception):
+            # Some files may be failed to copy due to permission issues
+            # But the contents are actually copied
+            shutil.copytree(odir / "outs", outdir / "outs")
+    else:
+        with suppress(Exception):
+            shutil.copytree(local_outdir, outdir)  # type: ignore
+    # Make sure essential files exist
+    web_summary_html = outdir / "outs" / "web_summary.html"
+    web_summary_js = outdir / "outs" / "web_summary.js"
+    filtered_annotations_csv = outdir / "outs" / "filtered_contig_annotations.csv"
+    for f in [web_summary_html, web_summary_js, filtered_annotations_csv]:
+        if not f.exists():
+            raise RuntimeError(
+                f"{f} does not exist in {outdir}/outs. "
+                "Copying results back from tmpdir failed."
+            )

biopipen/scripts/cnv/AneuploidyScore.R CHANGED Viewed

@@ -1,11 +1,9 @@
-source("{{biopipen_dir}}/utils/misc.R")
 library(AneuploidyScore)
 library(dplyr)
 library(tidyr)
 library(tibble)
-library(ggplot2)
-library(ggprism)
+library(plotthis)
+library(biopipen.utils)
 segfile = {{in.segfile | r}}
 outdir = {{out.outdir | r}}
@@ -59,7 +57,15 @@ getCAA <- function(segf, cytoarm, tcn_col,
     }
     ## Create a GRanges object with all unique intervals between segc and cytoc
-    starts <- sort(c(GenomicRanges::start(segc), GenomicRanges::start(cytoc)))
+    starts <- tryCatch({
+      sort(c(GenomicRanges::start(segc), GenomicRanges::start(cytoc)))
+    }, error=function(e) {
+      warning("Error to detect start on chromosome: ", chr_id, immediate. = TRUE)
+      NULL
+    })
+    if (is.null(starts)) {
+      return(NULL)
+    }
     ends <- sort(c(GenomicRanges::end(segc), GenomicRanges::end(cytoc)))
     combc <- GRanges(seqnames=chr_id,
                      IRanges(start=unique(sort(c(starts, ends[-length(ends)]+1))),
@@ -123,17 +129,36 @@ getCAA <- function(segf, cytoarm, tcn_col,
     return(combc_arms)
   })
   names(seg_cyto_chr) <- names(seg_chr)
+  seg_cyto_chr <- seg_cyto_chr[!sapply(seg_cyto_chr, is.null)]
   return(as(seg_cyto_chr, "GRangesList"))
 }
-segments = read.table(segfile, header=T, row.names=NULL, sep="\t", stringsAsFactors=F)
-seg = data.frame(
-    seqnames = segments[, chrom_col],
-    start = segments[, start_col],
-    end = segments[, end_col],
-    seg.mean = segments[, seg_col]
-)
+# Build `seg` (columns: seqnames, start, end, seg.mean) from the segment file;
+# the reader is chosen from the file extension.
+if (endsWith(segfile, ".vcf") || endsWith(segfile, ".vcf.gz")) {
+  # VCF input: end and segment mean are read from the INFO entries named by
+  # end_col / seg_col; chromosome and start come from the records themselves
+  library(VariantAnnotation)
+  vcf = readVcf(segfile)
+  seg = data.frame(
+      seqnames = as.character(seqnames(vcf)),
+      start = start(vcf),
+      end = vcf@info[[end_col]],
+      seg.mean = vcf@info[[seg_col]]
+  )
+} else if (endsWith(segfile, ".bed")) {
+  # BED input: read without a header; columns 1-3 are used as
+  # chrom/start/end and column 5 as the segment mean (the *_col settings are not used)
+  segments = read.table(segfile, header=F, row.names=NULL, sep="\t", stringsAsFactors=F)
+  seg = data.frame(
+      seqnames = segments[, 1],
+      start = segments[, 2],
+      end = segments[, 3],
+      seg.mean = segments[, 5]
+  )
+} else {
+  # any other extension: tab-separated table with a header row, columns
+  # selected through chrom_col / start_col / end_col / seg_col
+  segments = read.table(segfile, header=T, row.names=NULL, sep="\t", stringsAsFactors=F)
+  seg = data.frame(
+      seqnames = segments[, chrom_col],
+      start = segments[, start_col],
+      end = segments[, end_col],
+      seg.mean = segments[, seg_col]
+  )
+}
 {% if envs.segmean_transform %}
 segmean_transform = {{envs.segmean_transform}}
@@ -168,6 +193,10 @@ if (is.character(cn_transform)) {
 }
 {% endif %}
+# keep only the segments whose seg.mean and TCN are neither NA nor infinite;
+# drop=FALSE keeps the result a data frame
+seg <- seg[
+  !is.na(seg$seg.mean) & !is.na(seg$TCN) & !is.infinite(seg$seg.mean) & !is.infinite(seg$TCN),,
+  drop=FALSE]
 write.table(seg, file.path(outdir, "seg.txt"), sep="\t", quote=F, row.names=F, col.names=T)
 wgd_ploidy = checkIfWGD(
@@ -227,11 +256,17 @@ sig_min = min(-1, plotdata$Signal, na.rm=TRUE)
 sig_max = max(1, plotdata$Signal, na.rm=TRUE)
 png(file.path(outdir, "AneuploidyScore.png"), width=1000, height=600, res=100)
-ggplot(plotdata) +
-    geom_bar(aes(x=Arms, y=Signal, fill=Type), stat="identity") +
-    geom_hline(yintercept=0, color="black", size=0.1) +
-    ylim(c(sig_min, sig_max)) +
-    theme_prism() +
-    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
-    facet_wrap(~SignalType, scales="free_y", nrow=2)
+# Bar plot of Signal per arm, coloured by Type and faceted by SignalType,
+# drawn into the png device opened above.
+# NOTE(review): y_min / y_max presumably fix the y range to [sig_min, sig_max]
+# for every facet - confirm against the BarPlot documentation
+p <- BarPlot(
+    plotdata,
+    x = "Arms",
+    y = "Signal",
+    fill = "Type",
+    facet_by = "SignalType",
+    facet_nrow = 2,
+    y_min = sig_min,
+    y_max = sig_max,
+    x_text_angle = 90,
+    aspect.ratio = 0.2
+)
 dev.off()

biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl

biopipen 0.21.0py3-none-any.whl → 0.34.26py3-none-any.whl