PyPI - biopipen - Versions diffs - 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl - Mend

biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (290) hide show

biopipen/__init__.py +1 -1
biopipen/core/config.toml +28 -0
biopipen/core/filters.py +79 -4
biopipen/core/proc.py +12 -3
biopipen/core/testing.py +75 -3
biopipen/ns/bam.py +148 -6
biopipen/ns/bed.py +75 -0
biopipen/ns/cellranger.py +186 -0
biopipen/ns/cellranger_pipeline.py +126 -0
biopipen/ns/cnv.py +19 -3
biopipen/ns/cnvkit.py +1 -1
biopipen/ns/cnvkit_pipeline.py +20 -12
biopipen/ns/delim.py +34 -35
biopipen/ns/gene.py +68 -23
biopipen/ns/gsea.py +63 -37
biopipen/ns/misc.py +39 -14
biopipen/ns/plot.py +304 -1
biopipen/ns/protein.py +183 -0
biopipen/ns/regulatory.py +290 -0
biopipen/ns/rnaseq.py +142 -5
biopipen/ns/scrna.py +2053 -473
biopipen/ns/scrna_metabolic_landscape.py +228 -382
biopipen/ns/snp.py +659 -0
biopipen/ns/stats.py +484 -0
biopipen/ns/tcr.py +683 -98
biopipen/ns/vcf.py +236 -2
biopipen/ns/web.py +97 -6
biopipen/reports/bam/CNVpytor.svelte +4 -9
biopipen/reports/cellranger/CellRangerCount.svelte +18 -0
biopipen/reports/cellranger/CellRangerSummary.svelte +16 -0
biopipen/reports/cellranger/CellRangerVdj.svelte +18 -0
biopipen/reports/cnvkit/CNVkitDiagram.svelte +1 -1
biopipen/reports/cnvkit/CNVkitHeatmap.svelte +1 -1
biopipen/reports/cnvkit/CNVkitScatter.svelte +1 -1
biopipen/reports/common.svelte +15 -0
biopipen/reports/protein/ProdigySummary.svelte +16 -0
biopipen/reports/scrna/CellsDistribution.svelte +4 -39
biopipen/reports/scrna/DimPlots.svelte +1 -1
biopipen/reports/scrna/MarkersFinder.svelte +6 -126
biopipen/reports/scrna/MetaMarkers.svelte +3 -75
biopipen/reports/scrna/RadarPlots.svelte +4 -20
biopipen/reports/scrna_metabolic_landscape/MetabolicFeatures.svelte +61 -22
biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayActivity.svelte +88 -82
biopipen/reports/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.svelte +70 -10
biopipen/reports/snp/PlinkCallRate.svelte +24 -0
biopipen/reports/snp/PlinkFreq.svelte +18 -0
biopipen/reports/snp/PlinkHWE.svelte +18 -0
biopipen/reports/snp/PlinkHet.svelte +18 -0
biopipen/reports/snp/PlinkIBD.svelte +18 -0
biopipen/reports/tcr/CDR3AAPhyschem.svelte +19 -66
biopipen/reports/tcr/ClonalStats.svelte +16 -0
biopipen/reports/tcr/CloneResidency.svelte +3 -93
biopipen/reports/tcr/Immunarch.svelte +4 -155
biopipen/reports/tcr/TCRClusterStats.svelte +3 -45
biopipen/reports/tcr/TESSA.svelte +11 -28
biopipen/reports/utils/misc.liq +22 -7
biopipen/scripts/bam/BamMerge.py +11 -15
biopipen/scripts/bam/BamSampling.py +90 -0
biopipen/scripts/bam/BamSort.py +141 -0
biopipen/scripts/bam/BamSplitChroms.py +10 -10
biopipen/scripts/bam/BamSubsetByBed.py +38 -0
biopipen/scripts/bam/CNAClinic.R +41 -5
biopipen/scripts/bam/CNVpytor.py +153 -54
biopipen/scripts/bam/ControlFREEC.py +13 -14
biopipen/scripts/bam/SamtoolsView.py +33 -0
biopipen/scripts/bed/Bed2Vcf.py +5 -5
biopipen/scripts/bed/BedConsensus.py +5 -5
biopipen/scripts/bed/BedLiftOver.sh +6 -4
biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
biopipen/scripts/bed/BedtoolsMakeWindows.py +47 -0
biopipen/scripts/bed/BedtoolsMerge.py +4 -4
biopipen/scripts/cellranger/CellRangerCount.py +138 -0
biopipen/scripts/cellranger/CellRangerSummary.R +181 -0
biopipen/scripts/cellranger/CellRangerVdj.py +112 -0
biopipen/scripts/cnv/AneuploidyScore.R +55 -20
biopipen/scripts/cnv/AneuploidyScoreSummary.R +221 -163
biopipen/scripts/cnv/TMADScore.R +25 -9
biopipen/scripts/cnv/TMADScoreSummary.R +57 -86
biopipen/scripts/cnvkit/CNVkitAccess.py +7 -6
biopipen/scripts/cnvkit/CNVkitAutobin.py +26 -18
biopipen/scripts/cnvkit/CNVkitBatch.py +6 -6
biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
biopipen/scripts/cnvkit/CNVkitCoverage.py +4 -3
biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
biopipen/scripts/cnvkit/CNVkitGuessBaits.py +12 -8
biopipen/scripts/cnvkit/CNVkitHeatmap.py +5 -5
biopipen/scripts/cnvkit/CNVkitReference.py +6 -5
biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
biopipen/scripts/cnvkit/guess_baits.py +166 -93
biopipen/scripts/delim/RowsBinder.R +1 -1
biopipen/scripts/delim/SampleInfo.R +116 -118
biopipen/scripts/gene/GeneNameConversion.R +67 -0
biopipen/scripts/gene/GenePromoters.R +61 -0
biopipen/scripts/gsea/Enrichr.R +5 -5
biopipen/scripts/gsea/FGSEA.R +184 -50
biopipen/scripts/gsea/GSEA.R +2 -2
biopipen/scripts/gsea/PreRank.R +5 -5
biopipen/scripts/misc/Config2File.py +2 -2
biopipen/scripts/misc/Plot.R +80 -0
biopipen/scripts/misc/Shell.sh +15 -0
biopipen/scripts/misc/Str2File.py +2 -2
biopipen/scripts/plot/Heatmap.R +3 -3
biopipen/scripts/plot/Manhattan.R +147 -0
biopipen/scripts/plot/QQPlot.R +146 -0
biopipen/scripts/plot/ROC.R +88 -0
biopipen/scripts/plot/Scatter.R +112 -0
biopipen/scripts/plot/VennDiagram.R +5 -9
biopipen/scripts/protein/MMCIF2PDB.py +33 -0
biopipen/scripts/protein/PDB2Fasta.py +60 -0
biopipen/scripts/protein/Prodigy.py +119 -0
biopipen/scripts/protein/ProdigySummary.R +140 -0
biopipen/scripts/protein/RMSD.py +178 -0
biopipen/scripts/regulatory/MotifAffinityTest.R +102 -0
biopipen/scripts/regulatory/MotifAffinityTest_AtSNP.R +127 -0
biopipen/scripts/regulatory/MotifAffinityTest_MotifBreakR.R +104 -0
biopipen/scripts/regulatory/MotifScan.py +159 -0
biopipen/scripts/regulatory/VariantMotifPlot.R +78 -0
biopipen/scripts/regulatory/motifs-common.R +324 -0
biopipen/scripts/rnaseq/Simulation-ESCO.R +180 -0
biopipen/scripts/rnaseq/Simulation-RUVcorr.R +45 -0
biopipen/scripts/rnaseq/Simulation.R +21 -0
biopipen/scripts/rnaseq/UnitConversion.R +325 -54
biopipen/scripts/scrna/AnnData2Seurat.R +40 -0
biopipen/scripts/scrna/CCPlotR-patch.R +161 -0
biopipen/scripts/scrna/CellCellCommunication.py +150 -0
biopipen/scripts/scrna/CellCellCommunicationPlots.R +93 -0
biopipen/scripts/scrna/CellSNPLite.py +30 -0
biopipen/scripts/scrna/CellTypeAnnotation-celltypist.R +185 -0
biopipen/scripts/scrna/CellTypeAnnotation-direct.R +68 -31
biopipen/scripts/scrna/CellTypeAnnotation-hitype.R +27 -22
biopipen/scripts/scrna/CellTypeAnnotation-sccatch.R +28 -20
biopipen/scripts/scrna/CellTypeAnnotation-sctype.R +48 -25
biopipen/scripts/scrna/CellTypeAnnotation.R +37 -1
biopipen/scripts/scrna/CellsDistribution.R +456 -167
biopipen/scripts/scrna/DimPlots.R +1 -1
biopipen/scripts/scrna/ExprImputation-alra.R +109 -0
biopipen/scripts/scrna/ExprImputation-rmagic.R +256 -0
biopipen/scripts/scrna/{ExprImpution-scimpute.R → ExprImputation-scimpute.R} +8 -5
biopipen/scripts/scrna/ExprImputation.R +7 -0
biopipen/scripts/scrna/LoomTo10X.R +51 -0
biopipen/scripts/scrna/MQuad.py +25 -0
biopipen/scripts/scrna/MarkersFinder.R +679 -400
biopipen/scripts/scrna/MetaMarkers.R +265 -161
biopipen/scripts/scrna/ModuleScoreCalculator.R +66 -11
biopipen/scripts/scrna/PseudoBulkDEG.R +678 -0
biopipen/scripts/scrna/RadarPlots.R +355 -134
biopipen/scripts/scrna/ScFGSEA.R +298 -100
biopipen/scripts/scrna/ScSimulation.R +65 -0
biopipen/scripts/scrna/ScVelo.py +617 -0
biopipen/scripts/scrna/Seurat2AnnData.R +7 -0
biopipen/scripts/scrna/SeuratClusterStats-clustree.R +87 -0
biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +36 -30
biopipen/scripts/scrna/SeuratClusterStats-features.R +138 -187
biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +81 -0
biopipen/scripts/scrna/SeuratClusterStats-stats.R +78 -89
biopipen/scripts/scrna/SeuratClusterStats.R +47 -10
biopipen/scripts/scrna/SeuratClustering.R +36 -233
biopipen/scripts/scrna/SeuratLoading.R +2 -2
biopipen/scripts/scrna/SeuratMap2Ref.R +84 -113
biopipen/scripts/scrna/SeuratMetadataMutater.R +16 -6
biopipen/scripts/scrna/SeuratPreparing.R +223 -173
biopipen/scripts/scrna/SeuratSubClustering.R +64 -0
biopipen/scripts/scrna/SeuratTo10X.R +27 -0
biopipen/scripts/scrna/Slingshot.R +65 -0
biopipen/scripts/scrna/Subset10X.R +2 -2
biopipen/scripts/scrna/TopExpressingGenes.R +169 -135
biopipen/scripts/scrna/celltypist-wrapper.py +195 -0
biopipen/scripts/scrna/scvelo_paga.py +313 -0
biopipen/scripts/scrna/seurat_anndata_conversion.py +98 -0
biopipen/scripts/scrna_metabolic_landscape/MetabolicFeatures.R +447 -82
biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayActivity.R +348 -241
biopipen/scripts/scrna_metabolic_landscape/MetabolicPathwayHeterogeneity.R +188 -166
biopipen/scripts/snp/MatrixEQTL.R +217 -0
biopipen/scripts/snp/Plink2GTMat.py +148 -0
biopipen/scripts/snp/PlinkCallRate.R +199 -0
biopipen/scripts/snp/PlinkFilter.py +100 -0
biopipen/scripts/snp/PlinkFreq.R +291 -0
biopipen/scripts/snp/PlinkFromVcf.py +81 -0
biopipen/scripts/snp/PlinkHWE.R +85 -0
biopipen/scripts/snp/PlinkHet.R +96 -0
biopipen/scripts/snp/PlinkIBD.R +196 -0
biopipen/scripts/snp/PlinkSimulation.py +124 -0
biopipen/scripts/snp/PlinkUpdateName.py +124 -0
biopipen/scripts/stats/ChowTest.R +146 -0
biopipen/scripts/stats/DiffCoexpr.R +152 -0
biopipen/scripts/stats/LiquidAssoc.R +135 -0
biopipen/scripts/stats/Mediation.R +108 -0
biopipen/scripts/stats/MetaPvalue.R +130 -0
biopipen/scripts/stats/MetaPvalue1.R +74 -0
biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
biopipen/scripts/tcr/Attach2Seurat.R +3 -2
biopipen/scripts/tcr/CDR3AAPhyschem.R +211 -143
biopipen/scripts/tcr/CDR3Clustering.R +343 -0
biopipen/scripts/tcr/ClonalStats.R +526 -0
biopipen/scripts/tcr/CloneResidency.R +255 -131
biopipen/scripts/tcr/CloneSizeQQPlot.R +4 -4
biopipen/scripts/tcr/GIANA/GIANA.py +1356 -797
biopipen/scripts/tcr/GIANA/GIANA4.py +1362 -789
biopipen/scripts/tcr/GIANA/query.py +164 -162
biopipen/scripts/tcr/Immunarch-basic.R +31 -9
biopipen/scripts/tcr/Immunarch-clonality.R +25 -5
biopipen/scripts/tcr/Immunarch-diversity.R +352 -134
biopipen/scripts/tcr/Immunarch-geneusage.R +45 -5
biopipen/scripts/tcr/Immunarch-kmer.R +68 -8
biopipen/scripts/tcr/Immunarch-overlap.R +84 -4
biopipen/scripts/tcr/Immunarch-spectratyping.R +35 -6
biopipen/scripts/tcr/Immunarch-tracking.R +38 -6
biopipen/scripts/tcr/Immunarch-vjjunc.R +165 -0
biopipen/scripts/tcr/Immunarch.R +63 -11
biopipen/scripts/tcr/Immunarch2VDJtools.R +2 -2
biopipen/scripts/tcr/ImmunarchFilter.R +4 -4
biopipen/scripts/tcr/ImmunarchLoading.R +38 -29
biopipen/scripts/tcr/SampleDiversity.R +1 -1
biopipen/scripts/tcr/ScRepCombiningExpression.R +40 -0
biopipen/scripts/tcr/ScRepLoading.R +166 -0
biopipen/scripts/tcr/TCRClusterStats.R +176 -22
biopipen/scripts/tcr/TCRDock.py +110 -0
biopipen/scripts/tcr/TESSA.R +102 -118
biopipen/scripts/tcr/VJUsage.R +5 -5
biopipen/scripts/tcr/immunarch-patched.R +142 -0
biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
biopipen/scripts/vcf/BcftoolsSort.py +113 -0
biopipen/scripts/vcf/BcftoolsView.py +73 -0
biopipen/scripts/vcf/TruvariBench.sh +14 -7
biopipen/scripts/vcf/TruvariBenchSummary.R +16 -13
biopipen/scripts/vcf/TruvariConsistency.R +1 -1
biopipen/scripts/vcf/Vcf2Bed.py +2 -2
biopipen/scripts/vcf/VcfAnno.py +11 -11
biopipen/scripts/vcf/VcfDownSample.sh +22 -10
biopipen/scripts/vcf/VcfFilter.py +5 -5
biopipen/scripts/vcf/VcfFix.py +7 -7
biopipen/scripts/vcf/VcfFix_utils.py +13 -4
biopipen/scripts/vcf/VcfIndex.py +3 -3
biopipen/scripts/vcf/VcfIntersect.py +3 -3
biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
biopipen/scripts/vcf/bcftools_utils.py +52 -0
biopipen/scripts/web/Download.py +8 -4
biopipen/scripts/web/DownloadList.py +5 -5
biopipen/scripts/web/GCloudStorageDownloadBucket.py +82 -0
biopipen/scripts/web/GCloudStorageDownloadFile.py +23 -0
biopipen/scripts/web/gcloud_common.py +49 -0
biopipen/utils/gene.py +108 -60
biopipen/utils/misc.py +146 -20
biopipen/utils/reference.py +64 -20
biopipen/utils/reporter.py +177 -0
biopipen/utils/vcf.py +1 -1
biopipen-0.34.26.dist-info/METADATA +27 -0
biopipen-0.34.26.dist-info/RECORD +292 -0
{biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/WHEEL +1 -1
{biopipen-0.21.0.dist-info → biopipen-0.34.26.dist-info}/entry_points.txt +6 -2
biopipen/ns/bcftools.py +0 -111
biopipen/ns/scrna_basic.py +0 -255
biopipen/reports/delim/SampleInfo.svelte +0 -36
biopipen/reports/scrna/GeneExpressionInvistigation.svelte +0 -32
biopipen/reports/scrna/ScFGSEA.svelte +0 -35
biopipen/reports/scrna/SeuratClusterStats.svelte +0 -82
biopipen/reports/scrna/SeuratMap2Ref.svelte +0 -20
biopipen/reports/scrna/SeuratPreparing.svelte +0 -38
biopipen/reports/scrna/TopExpressingGenes.svelte +0 -55
biopipen/reports/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.svelte +0 -31
biopipen/reports/utils/gsea.liq +0 -110
biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
biopipen/scripts/gene/GeneNameConversion.py +0 -66
biopipen/scripts/scrna/ExprImpution-alra.R +0 -32
biopipen/scripts/scrna/ExprImpution-rmagic.R +0 -29
biopipen/scripts/scrna/ExprImpution.R +0 -7
biopipen/scripts/scrna/GeneExpressionInvistigation.R +0 -132
biopipen/scripts/scrna/Write10X.R +0 -11
biopipen/scripts/scrna_metabolic_landscape/MetabolicFeaturesIntraSubset.R +0 -150
biopipen/scripts/tcr/TCRClustering.R +0 -280
biopipen/utils/common_docstrs.py +0 -61
biopipen/utils/gene.R +0 -49
biopipen/utils/gsea.R +0 -193
biopipen/utils/io.R +0 -20
biopipen/utils/misc.R +0 -114
biopipen/utils/mutate_helpers.R +0 -433
biopipen/utils/plot.R +0 -173
biopipen/utils/rnaseq.R +0 -48
biopipen/utils/single_cell.R +0 -115
biopipen-0.21.0.dist-info/METADATA +0 -22
biopipen-0.21.0.dist-info/RECORD +0 -218

biopipen/scripts/web/GCloudStorageDownloadBucket.py ADDED Viewed

@@ -0,0 +1,82 @@
+import concurrent.futures
+from pathlib import Path
+from biopipen.utils.misc import run_command, dict_to_cli_args, logger
+from biopipen.scripts.web.gcloud_common import (
+    is_logged_in,
+    is_valid_gs_bucket_url,
+    get_file_path,
+)
+# Job parameters. The {{...}} placeholders are template expressions that are
+# presumably rendered by the pipeline before this script is executed.
+# url: the bucket URL to download (validated below as `gs://bucket`).
+url: str = {{in.url | quote}}  # pyright: ignore  # noqa: E999
+# outdir: local directory that receives the downloaded files.
+outdir = Path({{out.outdir | repr}})  # pyright: ignore
+# gcloud: the `gcloud` executable used for every command in this script.
+gcloud: str = {{envs.gcloud | quote}}  # pyright: ignore
+# keep_structure: when truthy, files are written under `outdir` at their
+# bucket-relative path; otherwise they are flattened into `outdir`.
+keep_structure = {{envs.keep_structure | repr}}  # pyright: ignore
+# ncores: number of worker processes used for the parallel download.
+ncores: int = {{envs.ncores | repr}}  # pyright: ignore
+# args: extra options converted to CLI arguments for `gcloud storage cp`.
+args: dict = {{envs.args | repr}}  # pyright: ignore
+# Fail early when the input is not a bare bucket URL.
+if not is_valid_gs_bucket_url(url):
+    raise Exception(
+        f"Invalid Google Cloud Storage URL for a bucket: {url}. "
+        "URL should be in the format gs://bucket"
+    )
+# Fail early when `is_logged_in` does not report an active gcloud login.
+if not is_logged_in(gcloud):
+    raise Exception(
+        "You need to be logged in to gcloud to download files. "
+        "Please run `gcloud auth login` first."
+    )
+def create_folders(folder_lines):
+    """Create one local directory under `outdir` per listed bucket folder.
+    Args:
+        folder_lines: Folder URLs (e.g. `gs://bucket/path/to/folder`); the
+            bucket-relative part of each becomes a directory below `outdir`.
+    """
+    relative_dirs = (get_file_path(entry) for entry in folder_lines)
+    for relative_dir in relative_dirs:
+        # Existing directories are fine; missing parents are created as well.
+        (outdir / relative_dir).mkdir(parents=True, exist_ok=True)
+def download_file(i: int, line: str, total: int):
+    """Download a single bucket object with `gcloud storage cp`.
+    Args:
+        i: Zero-based index of this file in the listing (used for progress
+            logging and for de-duplicating flattened file names).
+        line: The object URL as printed by the bucket listing.
+        total: Total number of files being downloaded.
+    """
+    rel_path = get_file_path(line)
+    # Log every file for small listings, otherwise only every 10th/100th one.
+    if total <= 50:
+        logger.info(f"Downloading {rel_path}")
+    else:
+        step = 10 if total <= 500 else 100
+        if i % step == 0:
+            logger.info(f"Downloading {i}/{total} ...")
+    if keep_structure:
+        dest = outdir / rel_path
+    else:
+        # Flattened layout: only the base name is kept, so names may collide.
+        basename = Path(rel_path).name
+        dest = outdir / basename
+        if dest.exists():
+            renamed = f"g{i}-{basename}"
+            logger.warning(f"{basename} already exists. Renaming to {renamed}.")
+            dest = outdir / renamed
+    # Work on a copy so the shared `args` dict is never modified.
+    cli_opts = args.copy()
+    cli_opts[""] = [gcloud, "storage", "cp", line, dest]
+    run_command(dict_to_cli_args(cli_opts, dashify=True), fg=True)
+def download_bucket():
+    """List the bucket recursively and download every object in parallel.
+    The listing comes from `gcloud storage ls --recursive`. Lines ending in
+    `/:` are treated as folder headers; all other non-empty lines are treated
+    as object URLs and handed to `download_file` in a process pool.
+    Raises:
+        Exception: Whatever a worker's `download_file` call raised; it is
+            re-raised here so a failed download fails the whole job.
+    """
+    listing = run_command(
+        [gcloud, "storage", "ls", "--recursive", url],
+        stdout="RETURN",
+    )
+    # remove empty lines and skip the root
+    lines = list(filter(None, listing.splitlines()[1:]))  # type: ignore
+    if keep_structure:
+        # create folders first
+        logger.info("Creating folders to keep structure.")
+        folder_lines = [line[:-2] for line in lines if line.endswith("/:")]
+        create_folders(folder_lines)
+    file_lines = [line for line in lines if not line.endswith("/:")]
+    total = len(file_lines)
+    with concurrent.futures.ProcessPoolExecutor(max_workers=ncores) as executor:
+        results = executor.map(
+            download_file, range(total), file_lines, [total] * total
+        )
+        # Exceptions raised in workers only surface when the results are
+        # retrieved; without draining the iterator they are silently lost.
+        list(results)
+if __name__ == "__main__":
+    download_bucket()

biopipen/scripts/web/GCloudStorageDownloadFile.py ADDED Viewed

@@ -0,0 +1,23 @@
+from biopipen.utils.misc import run_command, dict_to_cli_args
+from biopipen.scripts.web.gcloud_common import is_logged_in, is_valid_gs_file_url
+# Job parameters. The {{...}} placeholders are template expressions that are
+# presumably rendered by the pipeline before this script is executed.
+# url: the object URL to download.
+url: str = {{in.url | repr}}  # pyright: ignore  # noqa: E999
+# outfile: local destination passed to `gcloud storage cp`.
+outfile = {{out.outfile | repr}}  # pyright: ignore
+# gcloud: the `gcloud` executable used for every command in this script.
+gcloud: str = {{envs.gcloud | repr}}  # pyright: ignore
+# args: extra options converted to CLI arguments for `gcloud storage cp`.
+args: dict = {{envs.args | repr}}  # pyright: ignore
+# Fail early when the input does not pass the file-URL check.
+if not is_valid_gs_file_url(url):
+    raise Exception(
+        f"Invalid Google Cloud Storage URL for a file: {url}. "
+        "URL should be in the format gs://bucket/path/to/file"
+    )
+# Fail early when `is_logged_in` does not report an active gcloud login.
+if not is_logged_in(gcloud):
+    raise Exception(
+        "You need to be logged in to gcloud to download files. "
+        "Please run `gcloud auth login` first."
+    )
+# The "" key holds the base command and its positional arguments; the other
+# keys of `args` are turned into options by `dict_to_cli_args`.
+args[""] = [gcloud, "storage", "cp", url, outfile]
+run_command(dict_to_cli_args(args, dashify=True), fg=True)

biopipen/scripts/web/gcloud_common.py ADDED Viewed

@@ -0,0 +1,49 @@
+"""Provides common functions for interacting with Google Cloud Storage."""
+from biopipen.utils.misc import run_command
+def is_logged_in(gcloud: str) -> bool:
+    """Tell whether `gcloud auth list` reports an active account.
+    Args:
+        gcloud: Path to the `gcloud` executable.
+    Returns:
+        bool: True if the command output contains "ACTIVE", False otherwise.
+    """
+    auth_listing = run_command([gcloud, "auth", "list"], stdout="RETURN")
+    has_active_account = "ACTIVE" in auth_listing  # type: ignore
+    return has_active_account
+def is_valid_gs_bucket_url(url: str) -> bool:
+    """Check if a URL is a valid Google Cloud Storage bucket URL.
+    Such as `gs://bucket`. Trailing slashes after the bucket name are allowed.
+    Args:
+        url: The URL to check.
+    Returns:
+        bool: True if the URL is `gs://` followed by a non-empty bucket name
+            and no object path, False otherwise.
+    """
+    if not url.startswith("gs://"):
+        return False
+    # Strip trailing slashes from the part after the scheme only; stripping
+    # the whole URL would turn "gs://" into "gs:" and accept an empty bucket.
+    bucket = url[5:].rstrip("/")
+    return bool(bucket) and "/" not in bucket
+def get_file_path(url: str) -> str:
+    """Get the file path from a Google Cloud Storage file URL, without bucket.
+    For example: gs://bucket/path/to/file -> path/to/file
+    The URL is assumed to start with `gs://`; the scheme is not checked here.
+    Args:
+        url: The Google Cloud Storage file URL.
+    Returns:
+        str: The file path, or an empty string when the URL has no object
+            path (`gs://bucket` or `gs://bucket/`).
+    """
+    # partition() yields an empty remainder when there is no "/" after the
+    # bucket name, instead of failing with IndexError on a bucket-only URL.
+    _bucket, _sep, path = url[5:].partition("/")
+    return path
+def is_valid_gs_file_url(url: str) -> bool:
+    """Check if a URL is a valid Google Cloud Storage file URL.
+    Such as `gs://bucket/path/to/file`.
+    Args:
+        url: The URL to check.
+    Returns:
+        bool: True if the URL is `gs://` followed by a non-empty bucket name,
+            a slash and a non-empty object path, False otherwise.
+    """
+    if not url.startswith("gs://"):
+        return False
+    # A file URL needs both parts; the scheme alone or a bare bucket URL does
+    # not identify an object.
+    bucket, _sep, path = url[5:].partition("/")
+    return bool(bucket) and bool(path)

biopipen/utils/gene.py CHANGED Viewed

@@ -1,86 +1,134 @@
 """Do gene name conversion"""
+from __future__ import annotations
+import re
+import contextlib
+import pandas as pd
 from mygene import MyGeneInfo
-from datar.all import (
-    c,
-    f,
-    group_by,
-    desc,
-    arrange,
-    slice_head,
-    tibble,
-    left_join,
-    mutate,
-    is_na,
-    across,
-    if_else,
-    filter_,
-    pull,
-    select,
-)
 mygene = MyGeneInfo()
-class QueryGenesNotFound(Exception):
+class QueryGenesNotFound(ValueError):
     """When genes cannot be found"""
 def gene_name_conversion(
-    genes,
-    species,
-    infmt,
-    outfmt,
-    notfound,
+    genes: list[str],
+    infmt: str | list[str],
+    outfmt: str,
+    dup: str = "first",
+    species: str = "human",
+    notfound: str = "na",
+    suppress_messages: bool = False,
 ):
     """Convert gene names using MyGeneInfo
     Args:
-        genes: A sequence of genes
-        species: The species to limit the query
-            Supported: human, mouse, rat, fruitfly, nematode, zebrafish,
-            thale-cress, frog and pig
-        infmt: What's the original gene name format
-            Available fields
-            https://docs.mygene.info/en/latest/doc/query_service.html#available-fields
-        outfmt: What's the target gene name format
-        notfound: What to do if a conversion cannot be done.
-            use-query: Ignore the conversion and use the original name
-            skip: Ignore the conversion and skip the entire row in input file
-            error: Report error
+        genes: A character/integer vector of gene names/ids
+        species: A character vector of species names
+        infmt: A character vector of input gene name formats
+            See the available scopes at
+            https://docs.mygene.info/en/latest/doc/data.html#available-fields
+            You can use ensg as a shortcut for ensembl.gene
+        outfmt: A character vector of output gene name formats
+        dup: How to deal with duplicate gene names found.
+            first: keep the first one (default), sorted by score descendingly
+            last: keep the last one, sorted by score descendingly
+            all: keep all of them, each will be a separate row
+            <X>: combine them into a single string, separated by X
+        notfound: How to deal with gene names that are not found
+            error: stop with an error message
+            use-query: use the query gene name as the converted gene name
+            skip: skip the gene names that are not found
+            ignore: Same as "skip"
+            na: use NA as the converted gene name (default)
+        suppress_messages: Suppress the messages while querying
     Returns:
-        A dataframe with two columns, query and `outfmt`.
+        A dataframe with the query gene names and the converted gene names
+        When a gene name is not found, the converted name will be "NA"
+        When duplicate gene names are found, the one with the highest score will be kept
     """
-    out = (
-        mygene.querymany(
+    notfound = notfound.lower()
+    if notfound not in ("error", "use-query", "skip", "ignore", "na"):
+        raise ValueError(
+            "`notfound` of `gene_name_conversion` must be one of "
+            "'error', 'use-query', 'skip', 'ignore', 'na'"
+        )
+    if infmt in ["ensg", "ensmusg"]:
+        infmt = "ensembl.gene"
+    if outfmt in ["ensg", "ensmusg"]:
+        outfmt = "ensembl.gene"
+    orig_genes = genes[:]
+    if infmt == "ensembl.gene":
+        # Remove version numbers from ensembl gene ids
+        genes = [re.sub("\\..*", "", gene) for gene in genes]
+    query_df = pd.DataFrame({"query": genes, "orig": orig_genes})
+    if suppress_messages:
+        with contextlib.redirect_stdout(None):
+            out = mygene.querymany(
+                genes,
+                scopes=infmt,
+                fields=outfmt,
+                species=species,
+                as_dataframe=True,
+                df_index=False,
+            )
+    else:
+        out = mygene.querymany(
             genes,
             scopes=infmt,
             fields=outfmt,
+            species=species,
             as_dataframe=True,
             df_index=False,
-            species=species,
         )
-        >> group_by(f.query)
-        >> arrange(desc(f._score))
-        >> slice_head(1)
-        >> select(~c(f._id, f._score, f.notfound))
-    )
-    if isinstance(outfmt, str):
-        outfmt = [of.strip() for of in outfmt.split(",")]
-    out = tibble(query=genes) >> left_join(out, by=f.query)
-    if notfound == "use-query":
-        out = out >> mutate(
-            across(
-                outfmt,
-                lambda col, query: if_else(is_na(col), query, col),
-                query=f.query,
-            )
+    if out.shape[0] == 0:
+        return pd.DataFrame({"query": genes, "converted": ["NA"] * len(genes)})
+    if dup == "first":
+        out = (
+            out
+            .sort_values("_score", ascending=False)
+            .groupby("query")
+            .head(1)
+            .reset_index(drop=True)
         )
-    elif notfound == "error" and any(is_na(out[outfmt[0]])):
-        nagenes = out >> filter_(is_na(f[outfmt[0]])) >> pull(f.query)
-        raise QueryGenesNotFound(nagenes)
-    elif notfound == "skip":
-        out = out >> filter_(~is_na(f[outfmt[0]]))
+    elif dup == "last":
+        out = (
+            out
+            .sort_values("_score", ascending=False)
+            .groupby("query")
+            .tail(1)
+            .reset_index(drop=True)
+        )
+    elif dup != "all":
+        out = (
+            out
+            .sort_values("_score", ascending=False)
+            .groupby("query")
+            .agg({outfmt: lambda x: f"{dup}".join([str(x) for x in x.unique()])})
+            .reset_index()
+        )
+    out = pd.merge(query_df, out, on="query", how="left")
+    out = out.drop(columns=["query"]).rename(columns={"orig": "query"})
+    if notfound == "error":
+        if out[outfmt].isnull().any():
+            nagenes = out[out[outfmt].isnull()]["query"].tolist()
+            raise QueryGenesNotFound(f"Query genes not found: {','.join(nagenes)}")
+    elif notfound == "use-query":
+        out[outfmt] = out[outfmt].combine_first(out["query"])
+    elif notfound in ["skip", "ignore"]:
+        out = out.dropna(subset=[outfmt])
+    else:  # notfound == "na"
+        out[outfmt] = out[outfmt].fillna("NA")
     return out

biopipen/utils/misc.py CHANGED Viewed

@@ -1,30 +1,126 @@
 from __future__ import annotations
 from pathlib import Path
+import os
 import sys
-from typing import List
+import logging
+from subprocess import Popen
+from typing import List, Callable, Any
 from biopipen.core.filters import dict_to_cli_args  # noqa: F401
+logger = logging.getLogger("biopipen_job")
+logger.setLevel(logging.DEBUG)
+_handler = logging.StreamHandler(sys.stdout)
+# Use same log format as in R
+# {sprintf("%-7s", level)} [{format(time, "%Y-%m-%d %H:%M:%S")}] {msg}
+# so the logs can be populated by pipen-poplog
+_handler.setFormatter(
+    logging.Formatter(
+        "%(levelname)-7s [%(asctime)s] %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+)
+logger.addHandler(_handler)
+def require_package(
+    package: str,
+    version: str | None = None,
+    python: str | None = None,
+) -> None:
+    """Require a Python package to be installed with optional version check.
+    The version specifier should follow the format used by pip, e.g., '>=1.2.3'.
+    Multiple version specifiers can be separated by commas, e.g., '>=1.2.3,<2.0.0'.
-def exec_code(code, global_vars=None, local_vars=None, return_var=None):
-    global_vars = global_vars or {}
-    local_vars = local_vars or {}
-    exec(code, global_vars, local_vars)
-    if return_var is not None:
-        return local_vars[return_var]
-    return None
+    Args:
+        package (str): The name of the package to check.
+        version (str | None): The version specifier string.
+        python (str | None): The Python interpreter to use.
+    """
+    if not python:
+        import importlib
+        from importlib.metadata import version as get_version
+        from packaging.specifiers import SpecifierSet
+        try:
+            importlib.import_module(package)
+        except ImportError:
+            raise ImportError(f"Package '{package}' is required but not installed.")
+        if version:
+            installed_version = get_version(package)
+            specifier = SpecifierSet(version)
+            if installed_version not in specifier:
+                raise ImportError(
+                    f"Package '{package}' version '{installed_version}' does not "
+                    f"satisfy the requirement '{package}{version}'."
+                )
+    else:
+        import subprocess
+        from packaging.specifiers import SpecifierSet
+        # Check if package is installed using the specified Python interpreter
+        try:
+            result = subprocess.run(
+                [python, "-c", f"import {package}"],
+                capture_output=True,
+                text=True,
+                timeout=10,
+            )
+            if result.returncode != 0:
+                raise ImportError(
+                    f"Package '{package}' is required but not installed in {python}."
+                )
+        except subprocess.TimeoutExpired:
+            raise ImportError(
+                f"Timeout while checking if package '{package}' is "
+                f"installed in {python}."
+            )
+        except FileNotFoundError:
+            raise ImportError(f"Python interpreter '{python}' not found.")
+        if version:
+            # Get the installed version
+            try:
+                version_cmd = (
+                    f"from importlib.metadata import version; "
+                    f"print(version('{package}'))"
+                )
+                result = subprocess.run(
+                    [python, "-c", version_cmd],
+                    capture_output=True,
+                    text=True,
+                    timeout=10,
+                )
+                if result.returncode != 0:
+                    raise ImportError(
+                        f"Failed to get version of package '{package}' "
+                        f"in {python}."
+                    )
+                installed_version = result.stdout.strip()
+                specifier = SpecifierSet(version)
+                if installed_version not in specifier:
+                    raise ImportError(
+                        f"Package '{package}' version '{installed_version}' "
+                        f"in {python} does not satisfy the requirement "
+                        f"'{package}{version}'."
+                    )
+            except subprocess.TimeoutExpired:
+                raise ImportError(
+                    f"Timeout while checking version of package '{package}' "
+                    f"in {python}."
+                )
 def run_command(
-    cmd: str | List[str],
+    cmd: str | List[Any],
     fg: bool = False,
     wait: bool = True,
     print_command: bool = True,
-    print_command_handler: callable = print,
+    print_command_handler: Callable = print,
     **kwargs,
-):
+) -> Popen | str:
     """Run a command.
     Args:
@@ -41,7 +137,7 @@ def run_command(
         The `Popen` object, or str when `stdout` is `RETURN` or `return`.
     """
     import shlex
-    from subprocess import Popen, PIPE, STDOUT
+    from subprocess import PIPE, STDOUT
     if isinstance(cmd, list):
         cmd = [str(c) for c in cmd]
@@ -49,9 +145,12 @@ def run_command(
     if print_command:
         print_command_handler("RUNNING COMMAND:")
         if isinstance(cmd, str):
-            print_command_handler(f"  {cmd}")
+            print_command_handler(f"  {cmd}\n")
         else:
-            print_command_handler(f"  {shlex.join(cmd)}")
+            print_command_handler(f"  {shlex.join(cmd)}\n")
+        # flush the output if print_command_handler is print
+        if print_command_handler is print:
+            sys.stdout.flush()
     if isinstance(cmd, str):
         kwargs["shell"] = True
@@ -60,6 +159,7 @@ def run_command(
         kwargs["stdin"] = PIPE
     return_stdout = False
+    stdout_file = None
     if kwargs.get("stdout") is True:
         kwargs["stdout"] = PIPE
     elif kwargs.get("stdout") in ("RETURN", "return"):
@@ -68,7 +168,8 @@ def run_command(
     elif isinstance(kwargs.get("stdout"), (str, Path)):
         if isinstance(kwargs["stdout"], str):
             kwargs["stdout"] = Path(kwargs["stdout"])
-        kwargs["stdout"] = kwargs["stdout"].open("w")
+        stdout_file = kwargs["stdout"].open("w")
+        kwargs["stdout"] = stdout_file
         kwargs["close_fds"] = True
     if kwargs.get("stderr") is True:
@@ -76,6 +177,10 @@ def run_command(
     elif kwargs.get("stderr") in ("STDOUT", "stdout"):
         kwargs["stderr"] = STDOUT
+    # Enable line buffering for stdout/stderr when redirecting to files or pipes
+    if kwargs.get("bufsize") == 1:
+        kwargs.setdefault("universal_newlines", True)
     if fg:
         if kwargs.get("stdout") or kwargs.get("stderr"):
             raise ValueError(
@@ -85,18 +190,39 @@ def run_command(
         kwargs["stderr"] = sys.stderr
         kwargs["universal_newlines"] = True
+    if "env" in kwargs:
+        kwargs["env"] = {**os.environ, **kwargs["env"]}
     try:
         p = Popen(cmd, **kwargs)
     except Exception as e:
-        raise RuntimeError(f"Failed to run command: {e}")
+        raise RuntimeError(
+            f"Failed to run command: {e}\n"
+            f"Command (list): {cmd}\n"
+            f"Command (str): {shlex.join(cmd)}"
+        )
     if fg or wait or return_stdout:
         rc = p.wait()
         if rc != 0:
-            raise RuntimeError(f"Failed to run command: {cmd}")
+            if stdout_file:
+                stdout_file.close()
+            if return_stdout and p.stdout:
+                p.stdout.close()
+            raise RuntimeError(
+                f"Failed to run command: rc={rc}\n"
+                f"Command (list): {cmd}\n"
+                f"Command (str): {shlex.join(cmd)}"
+            )
         if return_stdout:
-            return p.stdout.read().decode()
+            try:
+                return p.stdout.read().decode()  # type: ignore
+            finally:
+                p.stdout.close()  # type: ignore
+        if stdout_file:
+            stdout_file.close()
         return p

biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl

biopipen 0.21.0__py3-none-any.whl → 0.34.26__py3-none-any.whl