PyPI - biopipen - Versions diffs - 0.32.3__py3-none-any.whl → 0.33.0__py3-none-any.whl - Mend

biopipen 0.32.3__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biopipen might be problematic. Click here for more details.

Files changed (117)

biopipen/__init__.py +1 -1
biopipen/core/config.toml +6 -0
biopipen/core/filters.py +35 -23
biopipen/core/testing.py +6 -1
biopipen/ns/bam.py +39 -0
biopipen/ns/cellranger.py +5 -0
biopipen/ns/cellranger_pipeline.py +2 -2
biopipen/ns/cnvkit_pipeline.py +4 -1
biopipen/ns/delim.py +33 -27
biopipen/ns/protein.py +99 -0
biopipen/ns/scrna.py +411 -250
biopipen/ns/snp.py +16 -3
biopipen/ns/tcr.py +125 -1
biopipen/ns/vcf.py +34 -0
biopipen/ns/web.py +5 -1
biopipen/reports/scrna/SeuratClusterStats.svelte +1 -1
biopipen/reports/scrna/SeuratMap2Ref.svelte +15 -2
biopipen/reports/tcr/ClonalStats.svelte +15 -0
biopipen/reports/utils/misc.liq +20 -7
biopipen/scripts/bam/BamMerge.py +2 -2
biopipen/scripts/bam/BamSampling.py +4 -4
biopipen/scripts/bam/BamSort.py +141 -0
biopipen/scripts/bam/BamSplitChroms.py +10 -10
biopipen/scripts/bam/BamSubsetByBed.py +3 -3
biopipen/scripts/bam/CNVpytor.py +10 -10
biopipen/scripts/bam/ControlFREEC.py +11 -11
biopipen/scripts/bed/Bed2Vcf.py +5 -5
biopipen/scripts/bed/BedConsensus.py +5 -5
biopipen/scripts/bed/BedLiftOver.sh +6 -4
biopipen/scripts/bed/BedtoolsIntersect.py +4 -4
biopipen/scripts/bed/BedtoolsMakeWindows.py +3 -3
biopipen/scripts/bed/BedtoolsMerge.py +4 -4
biopipen/scripts/cellranger/CellRangerCount.py +20 -9
biopipen/scripts/cellranger/CellRangerSummary.R +20 -29
biopipen/scripts/cellranger/CellRangerVdj.py +8 -8
biopipen/scripts/cnvkit/CNVkitAccess.py +6 -6
biopipen/scripts/cnvkit/CNVkitAutobin.py +25 -18
biopipen/scripts/cnvkit/CNVkitBatch.py +5 -5
biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -2
biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
biopipen/scripts/cnvkit/CNVkitGuessBaits.py +9 -5
biopipen/scripts/cnvkit/CNVkitHeatmap.py +4 -4
biopipen/scripts/cnvkit/CNVkitReference.py +2 -2
biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
biopipen/scripts/cnvkit/guess_baits.py +166 -93
biopipen/scripts/delim/SampleInfo.R +85 -148
biopipen/scripts/misc/Config2File.py +2 -2
biopipen/scripts/misc/Str2File.py +2 -2
biopipen/scripts/protein/MMCIF2PDB.py +33 -0
biopipen/scripts/protein/PDB2Fasta.py +60 -0
biopipen/scripts/protein/Prodigy.py +4 -4
biopipen/scripts/protein/RMSD.py +178 -0
biopipen/scripts/regulatory/MotifScan.py +8 -8
biopipen/scripts/scrna/CellCellCommunication.py +59 -22
biopipen/scripts/scrna/MarkersFinder.R +273 -654
biopipen/scripts/scrna/RadarPlots.R +73 -53
biopipen/scripts/scrna/SCP-plot.R +15202 -0
biopipen/scripts/scrna/ScVelo.py +0 -0
biopipen/scripts/scrna/SeuratClusterStats-clustree.R +23 -31
biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +26 -54
biopipen/scripts/scrna/SeuratClusterStats-features.R +85 -403
biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +32 -17
biopipen/scripts/scrna/SeuratClusterStats-stats.R +45 -239
biopipen/scripts/scrna/SeuratClusterStats.R +13 -19
biopipen/scripts/scrna/SeuratMap2Ref.R +16 -12
biopipen/scripts/scrna/SeuratPreparing.R +138 -81
biopipen/scripts/scrna/SlingShot.R +71 -0
biopipen/scripts/scrna/celltypist-wrapper.py +7 -6
biopipen/scripts/snp/Plink2GTMat.py +26 -11
biopipen/scripts/snp/PlinkFilter.py +7 -7
biopipen/scripts/snp/PlinkFromVcf.py +8 -5
biopipen/scripts/snp/PlinkSimulation.py +4 -4
biopipen/scripts/snp/PlinkUpdateName.py +4 -4
biopipen/scripts/stats/ChowTest.R +48 -22
biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
biopipen/scripts/tcr/ClonalStats.R +484 -0
biopipen/scripts/tcr/ScRepLoading.R +127 -0
biopipen/scripts/tcr/TCRDock.py +10 -6
biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
biopipen/scripts/vcf/BcftoolsAnnotate.py +8 -8
biopipen/scripts/vcf/BcftoolsFilter.py +3 -3
biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
biopipen/scripts/vcf/BcftoolsSort.py +4 -4
biopipen/scripts/vcf/BcftoolsView.py +5 -5
biopipen/scripts/vcf/Vcf2Bed.py +2 -2
biopipen/scripts/vcf/VcfAnno.py +11 -11
biopipen/scripts/vcf/VcfDownSample.sh +22 -10
biopipen/scripts/vcf/VcfFilter.py +5 -5
biopipen/scripts/vcf/VcfFix.py +7 -7
biopipen/scripts/vcf/VcfFix_utils.py +12 -3
biopipen/scripts/vcf/VcfIndex.py +3 -3
biopipen/scripts/vcf/VcfIntersect.py +3 -3
biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
biopipen/scripts/vcf/bcftools_utils.py +3 -3
biopipen/scripts/web/Download.py +8 -4
biopipen/scripts/web/DownloadList.py +5 -5
biopipen/scripts/web/GCloudStorageDownloadBucket.py +5 -5
biopipen/scripts/web/GCloudStorageDownloadFile.py +3 -3
biopipen/scripts/web/gcloud_common.py +1 -1
biopipen/utils/gsea.R +75 -35
biopipen/utils/misc.R +205 -7
biopipen/utils/misc.py +17 -8
biopipen/utils/reference.py +11 -11
biopipen/utils/repr.R +146 -0
biopipen/utils/vcf.py +1 -1
{biopipen-0.32.3.dist-info → biopipen-0.33.0.dist-info}/METADATA +8 -8
{biopipen-0.32.3.dist-info → biopipen-0.33.0.dist-info}/RECORD +114 -105
{biopipen-0.32.3.dist-info → biopipen-0.33.0.dist-info}/WHEEL +1 -1
biopipen/scripts/scrna/SeuratClusterStats-hists.R +0 -144
biopipen/scripts/scrna/SeuratPreparing-common.R +0 -467
biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +0 -204
{biopipen-0.32.3.dist-info → biopipen-0.33.0.dist-info}/entry_points.txt +0 -0

biopipen/scripts/vcf/BcftoolsAnnotate.py CHANGED Viewed

@@ -6,11 +6,11 @@ from biopipen.utils.reference import tabix_index
 from biopipen.utils.misc import logger
 from biopipen.scripts.vcf.bcftools_utils import run_bcftools
-infile = {{in.infile | repr}}  # pyright: ignore # noqa: E999
-annfile = {{in.annfile | repr}}  # pyright: ignore
-outfile = {{out.outfile | repr}}  # pyright: ignore
-joboutdir = {{job.outdir | repr}}  # pyright: ignore
-envs = {{envs | dict | repr}}  # pyright: ignore
+infile: str = {{in.infile | quote}}  # pyright: ignore # noqa: E999
+annfile: str = {{in.annfile | quote}}  # pyright: ignore
+outfile: str = {{out.outfile | quote}}  # pyright: ignore
+joboutdir: str = {{job.outdir | quote}}  # pyright: ignore
+envs: dict = {{envs | dict | repr}}  # pyright: ignore
 bcftools = envs.pop("bcftools")
 tabix = envs.pop("tabix")
@@ -25,14 +25,14 @@ if isinstance(columns, list):
     columns = ",".join(columns)
 if "c" in envs:
-    logger.warning("Ignoring envs\[c], use envs\[columns] instead.")
+    logger.warning(r"Ignoring envs\[c], use envs\[columns] instead.")
     del envs["c"]
 if isinstance(remove, list):
     remove = ",".join(remove)
 if "x" in envs:
-    logger.warning("Ignoring envs\[x], use envs\[remove] instead.")
+    logger.warning(r"Ignoring envs\[x], use envs\[remove] instead.")
     del envs["x"]
 envs_has_annfile = "a" in envs or "annotations" in envs
@@ -43,7 +43,7 @@ if header:
 if annfile and envs_has_annfile:
     logger.warning(
-        "Ignoring envs\[a/annotations] because in.annfile is provided."
+        r"Ignoring envs\[a/annotations] because in.annfile is provided."
     )
     with suppress(KeyError):
         del envs["a"]

biopipen/scripts/vcf/BcftoolsFilter.py CHANGED Viewed

@@ -3,11 +3,11 @@ from pathlib import Path, PosixPath  # noqa: F401
 from biopipen.utils.misc import logger
 from biopipen.scripts.vcf.bcftools_utils import run_bcftools
-infile = {{in.infile | repr}}  # pyright: ignore # noqa: #999
-outfile = {{out.outfile | repr}}  # pyright: ignore
+infile: str | Path = {{in.infile | quote}}  # pyright: ignore # noqa: #999
+outfile: str = {{out.outfile | quote}}  # pyright: ignore
 outdir = Path(outfile).parent
-envs = {{envs | dict | repr}}  # pyright: ignore
+envs: dict = {{envs | dict | repr}}  # pyright: ignore
 bcftools = envs.pop("bcftools")
 tabix = envs.pop("tabix")
 keep = envs.pop("keep")

biopipen/scripts/vcf/BcftoolsMerge.py ADDED Viewed

@@ -0,0 +1,31 @@
+from biopipen.utils.reference import tabix_index
+from biopipen.utils.misc import logger
+from biopipen.scripts.vcf.bcftools_utils import run_bcftools
+infiles: list = {{in.infiles | each: as_path}}  # pyright: ignore # noqa: E999
+outfile = {{out.outfile | repr}}  # pyright: ignore
+joboutdir = {{job.outdir | repr}}  # pyright: ignore
+envs: dict = {{envs | dict | repr}}  # pyright: ignore
+bcftools = envs.pop("bcftools")
+tabix = envs.pop("tabix")
+ncores = envs.pop("ncores")
+gz = envs.pop("gz")
+index = envs.pop("index")
+envs.setdefault("force-single", True)
+envs.setdefault("missing-to-ref", True)
+if index and not gz:
+    logger.warning("Forcing envs.gz to True because envs.index is True.")
+    gz = True
+if "O" not in envs and "output-type" not in envs and "output_type" not in envs:
+    envs["O"] = "z" if gz else "v"
+envs[""] = [bcftools, "merge"]
+envs["o"] = outfile
+envs["threads"] = ncores
+envs["_"] = infiles
+run_bcftools(envs, bcftools=bcftools, index=index, tabix=tabix)

biopipen/scripts/vcf/BcftoolsSort.py CHANGED Viewed

@@ -4,9 +4,9 @@ from pathlib import Path, PosixPath  # noqa: F401
 from biopipen.utils.misc import run_command, logger
 from biopipen.scripts.vcf.bcftools_utils import run_bcftools
-infile = {{in.infile | quote}}  # pyright: ignore # noqa: E999
-outfile = {{out.outfile | quote}}  # pyright: ignore
-envs = {{envs | dict | repr}}  # pyright: ignore
+infile: str = {{in.infile | quote}}  # pyright: ignore # noqa: E999
+outfile: str = {{out.outfile | quote}}  # pyright: ignore
+envs: dict = {{envs | dict | repr}}  # pyright: ignore
 outdir = Path(outfile).parent
 bcftools = envs.pop("bcftools")
@@ -97,7 +97,7 @@ if chrsize:
         infile
     ], fg=True)
-    infile = reheader_vcf
+    infile = str(reheader_vcf)
 envs[""] = [bcftools, "sort"]
 envs["_"] = infile

biopipen/scripts/vcf/BcftoolsView.py CHANGED Viewed

@@ -6,10 +6,10 @@ from biopipen.utils.misc import logger
 from biopipen.utils.reference import tabix_index
 from biopipen.scripts.vcf.bcftools_utils import run_bcftools
-infile = {{in.infile | repr}}  # pyright: ignore # noqa: #999
-regions_file = {{in.regions_file | repr}}  # pyright: ignore
-samples_file = {{in.samples_file | repr}}  # pyright: ignore
-outfile = {{out.outfile | repr}}  # pyright: ignore
+infile: str = {{in.infile | quote}}  # pyright: ignore # noqa: #999
+regions_file: str = {{in.regions_file | quote}}  # pyright: ignore
+samples_file: str = {{in.samples_file | quote}}  # pyright: ignore
+outfile: str = {{out.outfile | quote}}  # pyright: ignore
 envs: dict = {{envs | dict | repr}}  # pyright: ignore
 bcftools = envs.pop("bcftools")
@@ -21,7 +21,7 @@ index = envs.pop("index")
 if regions_file:
     if "R" in envs or "regions_file" in envs or "regions-file" in envs:
         logger.warning(
-            "Ignoring envs\[regions_file/regions-file/R] "
+            r"Ignoring envs\[regions_file/regions-file/R] "
             "because in.regionsfile is provided."
         )
         with suppress(KeyError):

biopipen/scripts/vcf/Vcf2Bed.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from cyvcf2 import VCF, Variant
-infile = {{in.infile | quote}}  # pyright: ignore
-outfile = {{out.outfile | quote}}  # pyright: ignore
+infile: str = {{in.infile | quote}}  # pyright: ignore  # noqa: E999
+outfile: str = {{out.outfile | quote}}  # pyright: ignore
 # vcf, default 1
 inbase = {{envs.inbase | int}}  # pyright: ignore
 # bed, default 0

biopipen/scripts/vcf/VcfAnno.py CHANGED Viewed

@@ -2,22 +2,22 @@ from os import path
 from biopipen.utils.misc import run_command, dict_to_cli_args
-infile = {{in.infile | quote}}  # pyright: ignore
-outfile = {{out.outfile | quote}}  # pyright: ignore
-joboutdir = {{job.outdir | quote}}  # pyright: ignore
+infile: str = {{in.infile | quote}}  # pyright: ignore  # noqa
+outfile: str = {{out.outfile | quote}}  # pyright: ignore
+joboutdir: str = {{job.outdir | quote}}  # pyright: ignore
 vcfanno = {{envs.vcfanno | quote}}  # pyright: ignore
-ncores = {{envs.ncores | repr}}  # pyright: ignore
-args = {{envs.args | repr}}  # pyright: ignore
+ncores: int = {{envs.ncores | repr}}  # pyright: ignore
+args: dict = {{envs.args | dict}}  # pyright: ignore
-{% set conf = envs.conffile or in.conffile %}
-{% if conf | isinstance: dict %}
+{% set conf = envs.conffile or in.conffile %}  # pyright: ignore  # noqa
+{% if conf | isinstance: dict %}  # pyright: ignore  # noqa
 conffile = path.join(joboutdir, "config.toml")
-conf = {{ conf | toml | quote }}
+conf: str = {{ conf | toml | quote }}  # pyright: ignore  # noqa
 with open(conffile, "w") as f:
     f.write(conf)
-{% else %}
-conffile = {{conf | quote}}
-{% endif %}
+{% else %}  # pyright: ignore  # noqa
+conffile = {{conf | quote}}  # pyright: ignore  # noqa
+{% endif %}  # pyright: ignore  # noqa
 args["p"] = ncores
 args["_"] = [conffile, infile]

biopipen/scripts/vcf/VcfDownSample.sh CHANGED Viewed

@@ -1,25 +1,37 @@
+# shellcheck disable=SC2148
+# shellcheck disable=SC2036
+# shellcheck disable=SC2030
+# shellcheck disable=SC1083
+# shellcheck disable=SC2288
 infile={{in.infile | quote}}
 outfile={{out.outfile | quote}}
 n={{envs.n}}
+# shellcheck disable=SC2031
 if [[ $infile == *.gz ]]; then
-    outfile=$(echo $outfile | sed -r "s/\.gz$//")
-    nheader=$(zcat $infile | head -n 9999 | grep "^#" | wc -l | cut -d' ' -f1)
+    outfile=$(echo "$outfile" | sed -r "s/\.gz$//")
+    # shellcheck disable=SC2126
+    nheader=$(zcat "$infile" | head -n 9999 | grep "^#" | wc -l | cut -d' ' -f1)
     if [[ ! $n -gt 1 ]]; then
-        nrows=$(zcat $infile | wc -l | cut -d' ' -f1)
+        nrows=$(zcat "$infile" | wc -l | cut -d' ' -f1)
+        # shellcheck disable=SC2004
         nvars=$(($nrows - $nheader))
         n=$(echo "$nvars * $n" | bc)
     fi
-    zcat $infile | head -n $nheader > $outfile
-    zcat $infile | tail -n +$(($nheader + 1)) | shuf -n $n | LC_ALL=C sort -k1,1V -k2,2n >> $outfile
-    bgzip $outfile
+    zcat "$infile" | head -n "$nheader" > "$outfile"
+    # shellcheck disable=SC2004
+    zcat "$infile" | tail -n +$(($nheader + 1)) | shuf -n "$n" | LC_ALL=C sort -k1,1V -k2,2n >> "$outfile"
+    bgzip "$outfile"
 else
-    nheader=$(head -n 9999 $infile | grep "^#" | wc -l | cut -d' ' -f1)
+    # shellcheck disable=SC2126
+    nheader=$(head -n 9999 "$infile" | grep "^#" | wc -l | cut -d' ' -f1)
     if [[ ! $n -gt 1 ]]; then
-        nrows=$(wc -l $infile | cut -d' ' -f1)
+        nrows=$(wc -l "$infile" | cut -d' ' -f1)
+        # shellcheck disable=SC2004
         nvars=$(($nrows - $nheader))
         n=$(echo "$nvars * $n" | bc)
     fi
-    head -n $nheader $infile > $outfile
-    tail -n +$(($nheader + 1)) $infile | shuf -n $n | LC_ALL=C sort -k1,1V -k2,2n >> $outfile
+    head -n "$nheader" "$infile" > "$outfile"
+    # shellcheck disable=SC2004
+    tail -n +$(($nheader + 1)) "$infile" | shuf -n "$n" | LC_ALL=C sort -k1,1V -k2,2n >> "$outfile"
 fi

biopipen/scripts/vcf/VcfFilter.py CHANGED Viewed

@@ -1,13 +1,13 @@
 from cyvcf2 import VCF, Writer, Variant
-infile = {{in.invcf | repr}}  # pyright: ignore
-outfile = {{out.outfile | repr}}  # pyright: ignore
+infile: str = {{in.invcf | quote}}  # pyright: ignore  # noqa: E999
+outfile: str = {{out.outfile | quote}}  # pyright: ignore
-{{envs.helper}}
+{{envs.helper}}  # pyright: ignore  # noqa: E999
 keep = {{envs.keep | repr}}  # pyright: ignore
-filters = {{envs.filters | repr}}  # pyright: ignore
-filter_descs = {{envs.filter_descs | repr}}  # pyright: ignore
+filters: dict = {{envs.filters | repr}}  # pyright: ignore
+filter_descs: dict = {{envs.filter_descs | repr}}  # pyright: ignore
 # builtin filters
 BUILTIN_FILTERS = {}

biopipen/scripts/vcf/VcfFix.py CHANGED Viewed

@@ -7,17 +7,17 @@ from biopipen.scripts.vcf.VcfFix_utils import (  # noqa: F401
     HeaderContig,
     HeaderGeneral,
     Fields,
-    Info,
-    Format,
-    Alt,
-    Filter,
-    Sample,
-    Samples,
+    # Info,
+    # Format,
+    # Alt,
+    # Filter,
+    # Sample,
+    # Samples,
     Variant,
 )
 from biopipen.scripts.vcf.VcfFix_utils import fix_vcffile
-infile = {{in.infile | quote}}  # pyright: ignore
+infile = {{in.infile | quote}}  # pyright: ignore  # noqa: E999
 instem = {{in.infile | stem | quote}}  # pyright: ignore
 outfile = {{out.outfile | quote}}  # pyright: ignore

biopipen/scripts/vcf/VcfFix_utils.py CHANGED Viewed

@@ -1,6 +1,15 @@
 import re
 import gzip
-from biopipen.utils.vcf import *  # noqa: F401, F403
+from biopipen.utils.vcf import (
+    HeaderInfo,
+    HeaderFormat,
+    HeaderFilter,
+    HeaderContig,
+    HeaderGeneral,
+    Fields,
+    Variant,
+    HeaderItem,
+)
 def line_to_obj(line: str):
@@ -41,7 +50,7 @@ def handle_obj(obj, fixes: dict):
         regex = fix.get("regex")
         if regex:
-            if not re.search(regex, obj.raw):
+            if not re.search(regex, obj.raw):  # type: ignore
                 continue
             return fix["fix"](obj.raw if kind is None else obj)
@@ -67,7 +76,7 @@ def fix_vcffile(vcffile, outfile, fixes):
     with inopen(vcffile, "rt") as fin, open(outfile, "w") as fout:
         for line in fin:
             obj = line_to_obj(line)
-            out = handle_obj(obj, modify_fixes)
+            out = handle_obj(obj, modify_fixes)  # type: ignore
             if obj.kind == "fields":
                 for fix in header_append_fixes:
                     fout.write(str(fix["fix"](None)).rstrip("\n") + "\n")

biopipen/scripts/vcf/VcfIndex.py CHANGED Viewed

@@ -4,10 +4,10 @@ from os import path
 from biopipen.utils.reference import tabix_index
 from biopipen.utils.misc import run_command
-infile = {{in.infile | repr}}  # pyright: ignore
-outfile = Path({{out.outfile | repr}})  # pyright: ignore
+infile: str = {{in.infile | quote}}  # pyright: ignore  # noqa
+outfile = Path({{out.outfile | quote}})  # pyright: ignore
 outidx = {{out.outidx | repr}}  # pyright: ignore
-tabix = {{envs.tabix | repr}}  # pyright: ignore
+tabix: str = {{envs.tabix | repr}}  # pyright: ignore
 ncores = {{envs.ncores | repr}}  # pyright: ignore
 outfile_with_index = tabix_index(infile, "vcf", outfile.parent, tabix)

biopipen/scripts/vcf/VcfIntersect.py CHANGED Viewed

@@ -1,8 +1,8 @@
 from biopipen.utils.misc import run_command, dict_to_cli_args
-infile1 = {{in.infile1 | repr}}  # pyright: ignore
-infile2 = {{in.infile2 | repr}}  # pyright: ignore
-outfile = {{out.outfile | repr}}  # pyright: ignore
+infile1: str = {{in.infile1 | quote}}  # pyright: ignore  # noqa
+infile2 = {{in.infile2 | quote}}  # pyright: ignore
+outfile = {{out.outfile | quote}}  # pyright: ignore
 bcftools = {{envs.bcftools | repr}}  # pyright: ignore
 gz = {{envs.gz | repr}}  # pyright: ignore
 index = {{envs.index | repr}}  # pyright: ignore

biopipen/scripts/vcf/VcfLiftOver.sh CHANGED Viewed

@@ -1,3 +1,5 @@
+# shellcheck disable=SC2148
+# shellcheck disable=SC1083
 invcf={{ in.invcf | quote }}
 outvcf={{ out.outvcf | quote }}
 rejfile={{ job.outdir | joinpaths: "rejected.vcf" | quote }}
@@ -6,12 +8,15 @@ chain={{ envs.chain | quote }}
 reffa={{ envs.reffa | quote }}
 args={{ envs.args | dict_to_cli_args: join=True }}
+# shellcheck disable=SC2154
 refdict="${reffa%.fa}.dict"
 if [[ ! -e "$refdict" ]]; then
     echo "Sequence dictionary does not exist: $refdict" 1>&2
     exit 1
 fi
+# shellcheck disable=SC2154
+# shellcheck disable=SC2086
 $gatk LiftoverVcf \
     $args \
     --INPUT "$invcf" \

biopipen/scripts/vcf/VcfSplitSamples.py CHANGED Viewed

@@ -3,12 +3,12 @@ import shlex
 import concurrent.futures
 from subprocess import Popen, check_output
-infile = {{in.infile | repr}}  # pyright: ignore
-outdir = {{out.outdir | repr}}  # pyright: ignore
-bcftools = {{envs.bcftools | repr}}  # pyright: ignore
+infile: str = {{in.infile | quote}}  # pyright: ignore  # noqa
+outdir: str = {{out.outdir | quote}}  # pyright: ignore
+bcftools: str = {{envs.bcftools | repr}}  # pyright: ignore
 gz = {{envs.gz | repr}}  # pyright: ignore
 index = {{envs.index | repr}}  # pyright: ignore
-ncores = {{envs.ncores | int}}  # pyright: ignore
+ncores: int = {{envs.ncores | int}}  # pyright: ignore
 private = {{envs.private | repr}}  # pyright: ignore
 if index:

biopipen/scripts/vcf/bcftools_utils.py CHANGED Viewed

@@ -15,7 +15,7 @@ def bcftools_version(bcftools: str) -> tuple[int, ...]:
     """
     bversion = (
         run_command([bcftools, "version"], stdout="return")
-        .splitlines()[0]  # bcftools 1.20
+        .splitlines()[0]  # bcftools 1.20  # type: ignore
         .replace("bcftools", "")
         .strip()  # 1.20
         .split(".")
@@ -24,8 +24,8 @@ def bcftools_version(bcftools: str) -> tuple[int, ...]:
 def run_bcftools(
-    args: dict[str, object],
-    bcftools: str,
+    args: dict,
+    bcftools: str,  # TODO: get from the first argument of args
     index: bool,
     tabix: str
 ) -> None:

biopipen/scripts/web/Download.py CHANGED Viewed

@@ -2,13 +2,13 @@ from pathlib import Path
 from biopipen.utils.misc import run_command, dict_to_cli_args
-url = {{in.url | repr}}  # pyright: ignore
-outfile = Path({{out.outfile | repr}})  # pyright: ignore
+url = {{in.url | quote}}  # pyright: ignore # noqa
+outfile = Path({{out.outfile | quote}})  # pyright: ignore
 tool = {{envs.tool | repr}}  # pyright: ignore
 wget = {{envs.wget | repr}}  # pyright: ignore
 aria2c = {{envs.aria2c | repr}}  # pyright: ignore
 ncores = {{envs.ncores | repr}}  # pyright: ignore
-args = {{envs.args | dict}}  # pyright: ignore
+args: dict = {{envs.args | dict}}  # pyright: ignore
 if tool == "wget":
     args["_"] = url
@@ -28,4 +28,8 @@ elif tool == "aria2c":
 else: # use python
     import urllib
-    urllib.urlretrieve(url, outfile)
+    try:
+        urllib.urlretrieve(url, outfile)  # type: ignore
+    except AttributeError:
+        urllib.request.urlretrieve(url, outfile)  # type: ignore

biopipen/scripts/web/DownloadList.py CHANGED Viewed

@@ -2,13 +2,13 @@ from pathlib import Path
 from biopipen.utils.misc import run_command, dict_to_cli_args
-urlfile = {{in.urlfile | repr}}  # pyright: ignore
-outdir = Path({{out.outdir | repr}})  # pyright: ignore
+urlfile: str = {{in.urlfile | quote}}  # pyright: ignore  # noqa
+outdir = Path({{out.outdir | quote}})  # pyright: ignore
 tool = {{envs.tool | repr}}  # pyright: ignore
 wget = {{envs.wget | repr}}  # pyright: ignore
 aria2c = {{envs.aria2c | repr}}  # pyright: ignore
 ncores = {{envs.ncores | repr}}  # pyright: ignore
-args = {{envs.args | repr}}  # pyright: ignore
+args: dict = {{envs.args | repr}}  # pyright: ignore
 if tool == "wget":
     args["i"] = urlfile
@@ -26,10 +26,10 @@ elif tool == "aria2c":
     run_command(dict_to_cli_args(args, dashify=True), fg=True)
 else: # use python
-    import urllib
+    from urllib.request import urlretrieve
     from urllib.parse import urlparse
     with open(urlfile, "r") as furl:
         for i, url in enumerate(furl.readlines()):
             parsed = urlparse(url)
             path = Path(parsed.path)
-            urllib.urlretrieve(url, f"{path.stem}-{i}{path.suffix}")
+            urlretrieve(url, f"{path.stem}-{i}{path.suffix}")

biopipen/scripts/web/GCloudStorageDownloadBucket.py CHANGED Viewed

@@ -8,12 +8,12 @@ from biopipen.scripts.web.gcloud_common import (
     get_file_path,
 )
-url = {{in.url | repr}}  # pyright: ignore  # noqa: E999
+url: str = {{in.url | quote}}  # pyright: ignore  # noqa: E999
 outdir = Path({{out.outdir | repr}})  # pyright: ignore
-gcloud = {{envs.gcloud | repr}}  # pyright: ignore
+gcloud: str = {{envs.gcloud | quote}}  # pyright: ignore
 keep_structure = {{envs.keep_structure | repr}}  # pyright: ignore
-ncores = {{envs.ncores | repr}}  # pyright: ignore
-args = {{envs.args | repr}}  # pyright: ignore
+ncores: int = {{envs.ncores | repr}}  # pyright: ignore
+args: dict = {{envs.args | repr}}  # pyright: ignore
 if not is_valid_gs_bucket_url(url):
     raise Exception(
@@ -65,7 +65,7 @@ def download_file(i: int, line: str, total: int):
 def download_bucket():
     out = run_command([gcloud, "storage", "ls", "--recursive", url], stdout="RETURN")
     # remove empty lines and skip the root
-    out = list(filter(None, out.splitlines()[1:]))
+    out = list(filter(None, out.splitlines()[1:]))  # type: ignore
     if keep_structure:
         # create folders first
         logger.info(f"Creating folders to keep structure.")

biopipen/scripts/web/GCloudStorageDownloadFile.py CHANGED Viewed

@@ -1,10 +1,10 @@
 from biopipen.utils.misc import run_command, dict_to_cli_args
 from biopipen.scripts.web.gcloud_common import is_logged_in, is_valid_gs_file_url
-url = {{in.url | repr}}  # pyright: ignore  # noqa: E999
+url: str = {{in.url | repr}}  # pyright: ignore  # noqa: E999
 outfile = {{out.outfile | repr}}  # pyright: ignore
-gcloud = {{envs.gcloud | repr}}  # pyright: ignore
-args = {{envs.args | repr}}  # pyright: ignore
+gcloud: str = {{envs.gcloud | repr}}  # pyright: ignore
+args: dict = {{envs.args | repr}}  # pyright: ignore
 if not is_valid_gs_file_url(url):
     raise Exception(

biopipen/scripts/web/gcloud_common.py CHANGED Viewed

@@ -12,7 +12,7 @@ def is_logged_in(gcloud: str) -> bool:
         bool: True if the user is logged in, False otherwise.
     """
     out = run_command([gcloud, "auth", "list"], stdout="RETURN")
-    return "ACTIVE" in out
+    return "ACTIVE" in out  # type: ignore
 def is_valid_gs_bucket_url(url: str) -> bool:

biopipen/utils/gsea.R CHANGED Viewed

@@ -34,46 +34,85 @@ if (!exists("slugify")) {
     }
 }
+#' Download the GMT file and save it to cachedir
+#' Return the path to the GMT file
+#' We also check if the second column is shorter than the first column.
+#' If so, we switch the first and second columns.
+#' In case some providers provide the GMT file with the first and second columns switched.
+#' We also replace the "/" in the gene names with "-" if any. This is because the "/" is
+#' not allowed in a path, but GSEA uses the gene names as the file name.
+#'
+#' @param gmturl The URL or path of the GMT file
+#' @param cachedir The directory to save the GMT file
+#' @return The path to the GMT file
 localizeGmtfile <- function(gmturl, cachedir = tempdir()) {
     # Download the GMT file and save it to cachedir
     # Return the path to the GMT file
-    if (!startsWith(gmturl, "http") && !startsWith(gmturl, "ftp")) {
-        return(gmturl)
+    in_gmtfile <- out_gmtfile <- file.path(cachedir, basename(gmturl))
+    if (startsWith(gmturl, "http") || startsWith(gmturl, "ftp")) {
+        download.file(gmturl, in_gmtfile)
+        remote <- TRUE
+    } else {
+        in_gmtfile <- gmturl
+        remote <- FALSE
     }
-    gmtfile = file.path(cachedir, basename(gmturl))
-    if (!file.exists(gmtfile)) {
-        download.file(gmturl, gmtfile)
-        items <- read.delim(gmtfile, header = FALSE, stringsAsFactors = FALSE, sep = "\t")
-        if (ncol(items) < 3) {
-            stop(paste0("Invalid GMT file: ", gmtfile, ", from ", gmturl))
-        }
-        if (nrow(items) == 0) {
-            stop(paste0("Empty GMT file: ", gmtfile, ", from ", gmturl))
-        }
-        if (
-            is.character(items$V2[1]) &&
-            nchar(items$V2[1]) < nchar(items$V1[1]) &&
-            nchar(items$V2[1]) > 0 &&
-            is.na(suppressWarnings(as.numeric(items$V2[1])))
-        ) {
-            warning(paste0(
-                "The second column is shorter, switching the first and second columns in GMT file ",
-                gmtfile,
-                " from ",
-                gmturl
-            ))
-            items <- items[, c(2, 1, 3:ncol(items))]
-            write.table(
-                items,
-                gmtfile,
-                row.names = F,
-                col.names = F,
-                sep = "\t",
-                quote = F
-            )
-        }
+    items <- readLines(in_gmtfile)
+    items <- items[!grepl("^#", items) & nchar(items) > 0]
+    items <- lapply(strsplit(items, "\t"), function(x) c(x[1:2], paste0(x[3:length(x)], collapse = "\t")))
+    items <- as.data.frame(t(as.data.frame(items)))
+    rownames(items) <- NULL
+    colnames(items) <- c("V1", "V2", "V3")
+    if (ncol(items) < 3) {
+        stop(paste0("Invalid GMT file: ", gmturl))
+    }
+    if (nrow(items) == 0) {
+        stop(paste0("Empty GMT file: ", gmturl))
     }
-    return(gmtfile)
+    # Check if the second column is shorter than the first column
+    nchars1 <- sum(nchar(items$V1))
+    nchars2 <- sum(nchar(items$V2))
+    prefix <- gsub("[0-9]+$", "", items$V2[1])
+    if (is.character(items$V2) &&     # Only when V2 is character, as pathway names
+        nchars2 < nchars1 &&          # Only when V2 is shorter than V1
+        all(nchar(items$V2) > 0) &&   # Only when V2 is not empty
+        !all(grepl("^[0-9]+$", items$V2)) && # Only when V2 is not all numbers
+        (nchar(prefix) == 0 || !all(startsWith(items$V2, prefix))) # Only when they are not like hsa00001, hsa00002, etc.
+    ) {
+        warning(paste0(
+            "The second column is shorter, switching the first and second columns in ",
+            "GMT file ", gmturl
+        ))
+        items <- items[, c(2, 1, 3:ncol(items))]
+        switched <- TRUE
+    } else {
+        switched <- FALSE
+    }
+    if (any(grepl("/", items$V1))) {
+        items$V1 <- gsub("/", "-", items$V1)
+        replaced <- TRUE
+    } else {
+        replaced <- FALSE
+    }
+    if (remote || switched || replaced) {
+        write.table(
+            items,
+            out_gmtfile,
+            row.names = FALSE,
+            col.names = FALSE,
+            sep = "\t",
+            quote = FALSE
+        )
+    } else {
+        out_gmtfile <- in_gmtfile
+    }
+    return(out_gmtfile)
 }
@@ -261,6 +300,7 @@ runGSEA = function(
         mutate(Description = "na") %>%
         rownames_to_column("NAME") %>%
         select(NAME, Description, everything())
     write.table(
         indata,
         gctfile,

biopipen 0.32.3__py3-none-any.whl → 0.33.0__py3-none-any.whl

Potentially problematic release.

biopipen 0.32.3__py3-none-any.whl → 0.33.0__py3-none-any.whl