biopipen 0.32.3__py3-none-any.whl → 0.33.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biopipen has been flagged as potentially problematic; see the package's registry page for more details.
- biopipen/__init__.py +1 -1
- biopipen/core/config.toml +6 -0
- biopipen/core/filters.py +35 -23
- biopipen/core/testing.py +6 -1
- biopipen/ns/bam.py +39 -0
- biopipen/ns/cellranger.py +5 -0
- biopipen/ns/cellranger_pipeline.py +2 -2
- biopipen/ns/cnvkit_pipeline.py +4 -1
- biopipen/ns/delim.py +33 -27
- biopipen/ns/protein.py +99 -0
- biopipen/ns/scrna.py +411 -250
- biopipen/ns/snp.py +16 -3
- biopipen/ns/tcr.py +125 -1
- biopipen/ns/vcf.py +34 -0
- biopipen/ns/web.py +5 -1
- biopipen/reports/scrna/SeuratClusterStats.svelte +1 -1
- biopipen/reports/scrna/SeuratMap2Ref.svelte +15 -2
- biopipen/reports/tcr/ClonalStats.svelte +15 -0
- biopipen/reports/utils/misc.liq +20 -7
- biopipen/scripts/bam/BamMerge.py +2 -2
- biopipen/scripts/bam/BamSampling.py +4 -4
- biopipen/scripts/bam/BamSort.py +141 -0
- biopipen/scripts/bam/BamSplitChroms.py +10 -10
- biopipen/scripts/bam/BamSubsetByBed.py +3 -3
- biopipen/scripts/bam/CNVpytor.py +10 -10
- biopipen/scripts/bam/ControlFREEC.py +11 -11
- biopipen/scripts/bed/Bed2Vcf.py +5 -5
- biopipen/scripts/bed/BedConsensus.py +5 -5
- biopipen/scripts/bed/BedLiftOver.sh +6 -4
- biopipen/scripts/bed/BedtoolsIntersect.py +4 -4
- biopipen/scripts/bed/BedtoolsMakeWindows.py +3 -3
- biopipen/scripts/bed/BedtoolsMerge.py +4 -4
- biopipen/scripts/cellranger/CellRangerCount.py +20 -9
- biopipen/scripts/cellranger/CellRangerSummary.R +20 -29
- biopipen/scripts/cellranger/CellRangerVdj.py +8 -8
- biopipen/scripts/cnvkit/CNVkitAccess.py +6 -6
- biopipen/scripts/cnvkit/CNVkitAutobin.py +25 -18
- biopipen/scripts/cnvkit/CNVkitBatch.py +5 -5
- biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
- biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -2
- biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
- biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
- biopipen/scripts/cnvkit/CNVkitGuessBaits.py +9 -5
- biopipen/scripts/cnvkit/CNVkitHeatmap.py +4 -4
- biopipen/scripts/cnvkit/CNVkitReference.py +2 -2
- biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
- biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
- biopipen/scripts/cnvkit/guess_baits.py +166 -93
- biopipen/scripts/delim/SampleInfo.R +85 -148
- biopipen/scripts/misc/Config2File.py +2 -2
- biopipen/scripts/misc/Str2File.py +2 -2
- biopipen/scripts/protein/MMCIF2PDB.py +33 -0
- biopipen/scripts/protein/PDB2Fasta.py +60 -0
- biopipen/scripts/protein/Prodigy.py +4 -4
- biopipen/scripts/protein/RMSD.py +178 -0
- biopipen/scripts/regulatory/MotifScan.py +8 -8
- biopipen/scripts/scrna/CellCellCommunication.py +59 -22
- biopipen/scripts/scrna/MarkersFinder.R +273 -654
- biopipen/scripts/scrna/RadarPlots.R +73 -53
- biopipen/scripts/scrna/SCP-plot.R +15202 -0
- biopipen/scripts/scrna/ScVelo.py +0 -0
- biopipen/scripts/scrna/SeuratClusterStats-clustree.R +23 -31
- biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +26 -54
- biopipen/scripts/scrna/SeuratClusterStats-features.R +85 -403
- biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +32 -17
- biopipen/scripts/scrna/SeuratClusterStats-stats.R +45 -239
- biopipen/scripts/scrna/SeuratClusterStats.R +13 -19
- biopipen/scripts/scrna/SeuratMap2Ref.R +16 -12
- biopipen/scripts/scrna/SeuratPreparing.R +138 -81
- biopipen/scripts/scrna/SlingShot.R +71 -0
- biopipen/scripts/scrna/celltypist-wrapper.py +7 -6
- biopipen/scripts/snp/Plink2GTMat.py +26 -11
- biopipen/scripts/snp/PlinkFilter.py +7 -7
- biopipen/scripts/snp/PlinkFromVcf.py +8 -5
- biopipen/scripts/snp/PlinkSimulation.py +4 -4
- biopipen/scripts/snp/PlinkUpdateName.py +4 -4
- biopipen/scripts/stats/ChowTest.R +48 -22
- biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
- biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
- biopipen/scripts/tcr/ClonalStats.R +484 -0
- biopipen/scripts/tcr/ScRepLoading.R +127 -0
- biopipen/scripts/tcr/TCRDock.py +10 -6
- biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
- biopipen/scripts/vcf/BcftoolsAnnotate.py +8 -8
- biopipen/scripts/vcf/BcftoolsFilter.py +3 -3
- biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
- biopipen/scripts/vcf/BcftoolsSort.py +4 -4
- biopipen/scripts/vcf/BcftoolsView.py +5 -5
- biopipen/scripts/vcf/Vcf2Bed.py +2 -2
- biopipen/scripts/vcf/VcfAnno.py +11 -11
- biopipen/scripts/vcf/VcfDownSample.sh +22 -10
- biopipen/scripts/vcf/VcfFilter.py +5 -5
- biopipen/scripts/vcf/VcfFix.py +7 -7
- biopipen/scripts/vcf/VcfFix_utils.py +12 -3
- biopipen/scripts/vcf/VcfIndex.py +3 -3
- biopipen/scripts/vcf/VcfIntersect.py +3 -3
- biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
- biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
- biopipen/scripts/vcf/bcftools_utils.py +3 -3
- biopipen/scripts/web/Download.py +8 -4
- biopipen/scripts/web/DownloadList.py +5 -5
- biopipen/scripts/web/GCloudStorageDownloadBucket.py +5 -5
- biopipen/scripts/web/GCloudStorageDownloadFile.py +3 -3
- biopipen/scripts/web/gcloud_common.py +1 -1
- biopipen/utils/gsea.R +75 -35
- biopipen/utils/misc.R +205 -7
- biopipen/utils/misc.py +17 -8
- biopipen/utils/reference.py +11 -11
- biopipen/utils/repr.R +146 -0
- biopipen/utils/vcf.py +1 -1
- {biopipen-0.32.3.dist-info → biopipen-0.33.0.dist-info}/METADATA +8 -8
- {biopipen-0.32.3.dist-info → biopipen-0.33.0.dist-info}/RECORD +114 -105
- {biopipen-0.32.3.dist-info → biopipen-0.33.0.dist-info}/WHEEL +1 -1
- biopipen/scripts/scrna/SeuratClusterStats-hists.R +0 -144
- biopipen/scripts/scrna/SeuratPreparing-common.R +0 -467
- biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +0 -204
- {biopipen-0.32.3.dist-info → biopipen-0.33.0.dist-info}/entry_points.txt +0 -0
biopipen/scripts/vcf/BcftoolsAnnotate.py
CHANGED
|
@@ -6,11 +6,11 @@ from biopipen.utils.reference import tabix_index
|
|
|
6
6
|
from biopipen.utils.misc import logger
|
|
7
7
|
from biopipen.scripts.vcf.bcftools_utils import run_bcftools
|
|
8
8
|
|
|
9
|
-
infile = {{in.infile |
|
|
10
|
-
annfile = {{in.annfile |
|
|
11
|
-
outfile = {{out.outfile |
|
|
12
|
-
joboutdir = {{job.outdir |
|
|
13
|
-
envs = {{envs | dict | repr}} # pyright: ignore
|
|
9
|
+
infile: str = {{in.infile | quote}} # pyright: ignore # noqa: E999
|
|
10
|
+
annfile: str = {{in.annfile | quote}} # pyright: ignore
|
|
11
|
+
outfile: str = {{out.outfile | quote}} # pyright: ignore
|
|
12
|
+
joboutdir: str = {{job.outdir | quote}} # pyright: ignore
|
|
13
|
+
envs: dict = {{envs | dict | repr}} # pyright: ignore
|
|
14
14
|
|
|
15
15
|
bcftools = envs.pop("bcftools")
|
|
16
16
|
tabix = envs.pop("tabix")
|
|
@@ -25,14 +25,14 @@ if isinstance(columns, list):
|
|
|
25
25
|
columns = ",".join(columns)
|
|
26
26
|
|
|
27
27
|
if "c" in envs:
|
|
28
|
-
logger.warning("Ignoring envs\[c], use envs\[columns] instead.")
|
|
28
|
+
logger.warning(r"Ignoring envs\[c], use envs\[columns] instead.")
|
|
29
29
|
del envs["c"]
|
|
30
30
|
|
|
31
31
|
if isinstance(remove, list):
|
|
32
32
|
remove = ",".join(remove)
|
|
33
33
|
|
|
34
34
|
if "x" in envs:
|
|
35
|
-
logger.warning("Ignoring envs\[x], use envs\[remove] instead.")
|
|
35
|
+
logger.warning(r"Ignoring envs\[x], use envs\[remove] instead.")
|
|
36
36
|
del envs["x"]
|
|
37
37
|
|
|
38
38
|
envs_has_annfile = "a" in envs or "annotations" in envs
|
|
@@ -43,7 +43,7 @@ if header:
|
|
|
43
43
|
|
|
44
44
|
if annfile and envs_has_annfile:
|
|
45
45
|
logger.warning(
|
|
46
|
-
"Ignoring envs\[a/annotations] because in.annfile is provided."
|
|
46
|
+
r"Ignoring envs\[a/annotations] because in.annfile is provided."
|
|
47
47
|
)
|
|
48
48
|
with suppress(KeyError):
|
|
49
49
|
del envs["a"]
|
biopipen/scripts/vcf/BcftoolsFilter.py
CHANGED
|
@@ -3,11 +3,11 @@ from pathlib import Path, PosixPath # noqa: F401
|
|
|
3
3
|
from biopipen.utils.misc import logger
|
|
4
4
|
from biopipen.scripts.vcf.bcftools_utils import run_bcftools
|
|
5
5
|
|
|
6
|
-
infile = {{in.infile |
|
|
7
|
-
outfile = {{out.outfile |
|
|
6
|
+
infile: str | Path = {{in.infile | quote}} # pyright: ignore # noqa: #999
|
|
7
|
+
outfile: str = {{out.outfile | quote}} # pyright: ignore
|
|
8
8
|
outdir = Path(outfile).parent
|
|
9
9
|
|
|
10
|
-
envs = {{envs | dict | repr}} # pyright: ignore
|
|
10
|
+
envs: dict = {{envs | dict | repr}} # pyright: ignore
|
|
11
11
|
bcftools = envs.pop("bcftools")
|
|
12
12
|
tabix = envs.pop("tabix")
|
|
13
13
|
keep = envs.pop("keep")
|
biopipen/scripts/vcf/BcftoolsMerge.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from biopipen.utils.reference import tabix_index
|
|
2
|
+
from biopipen.utils.misc import logger
|
|
3
|
+
from biopipen.scripts.vcf.bcftools_utils import run_bcftools
|
|
4
|
+
|
|
5
|
+
infiles: list = {{in.infiles | each: as_path}} # pyright: ignore # noqa: E999
|
|
6
|
+
outfile = {{out.outfile | repr}} # pyright: ignore
|
|
7
|
+
joboutdir = {{job.outdir | repr}} # pyright: ignore
|
|
8
|
+
envs: dict = {{envs | dict | repr}} # pyright: ignore
|
|
9
|
+
|
|
10
|
+
bcftools = envs.pop("bcftools")
|
|
11
|
+
tabix = envs.pop("tabix")
|
|
12
|
+
ncores = envs.pop("ncores")
|
|
13
|
+
gz = envs.pop("gz")
|
|
14
|
+
index = envs.pop("index")
|
|
15
|
+
|
|
16
|
+
envs.setdefault("force-single", True)
|
|
17
|
+
envs.setdefault("missing-to-ref", True)
|
|
18
|
+
|
|
19
|
+
if index and not gz:
|
|
20
|
+
logger.warning("Forcing envs.gz to True because envs.index is True.")
|
|
21
|
+
gz = True
|
|
22
|
+
|
|
23
|
+
if "O" not in envs and "output-type" not in envs and "output_type" not in envs:
|
|
24
|
+
envs["O"] = "z" if gz else "v"
|
|
25
|
+
|
|
26
|
+
envs[""] = [bcftools, "merge"]
|
|
27
|
+
envs["o"] = outfile
|
|
28
|
+
envs["threads"] = ncores
|
|
29
|
+
envs["_"] = infiles
|
|
30
|
+
|
|
31
|
+
run_bcftools(envs, bcftools=bcftools, index=index, tabix=tabix)
|
biopipen/scripts/vcf/BcftoolsSort.py
CHANGED
|
@@ -4,9 +4,9 @@ from pathlib import Path, PosixPath # noqa: F401
|
|
|
4
4
|
from biopipen.utils.misc import run_command, logger
|
|
5
5
|
from biopipen.scripts.vcf.bcftools_utils import run_bcftools
|
|
6
6
|
|
|
7
|
-
infile = {{in.infile | quote}} # pyright: ignore # noqa: E999
|
|
8
|
-
outfile = {{out.outfile | quote}} # pyright: ignore
|
|
9
|
-
envs = {{envs | dict | repr}} # pyright: ignore
|
|
7
|
+
infile: str = {{in.infile | quote}} # pyright: ignore # noqa: E999
|
|
8
|
+
outfile: str = {{out.outfile | quote}} # pyright: ignore
|
|
9
|
+
envs: dict = {{envs | dict | repr}} # pyright: ignore
|
|
10
10
|
|
|
11
11
|
outdir = Path(outfile).parent
|
|
12
12
|
bcftools = envs.pop("bcftools")
|
|
@@ -97,7 +97,7 @@ if chrsize:
|
|
|
97
97
|
infile
|
|
98
98
|
], fg=True)
|
|
99
99
|
|
|
100
|
-
infile = reheader_vcf
|
|
100
|
+
infile = str(reheader_vcf)
|
|
101
101
|
|
|
102
102
|
envs[""] = [bcftools, "sort"]
|
|
103
103
|
envs["_"] = infile
|
biopipen/scripts/vcf/BcftoolsView.py
CHANGED
|
@@ -6,10 +6,10 @@ from biopipen.utils.misc import logger
|
|
|
6
6
|
from biopipen.utils.reference import tabix_index
|
|
7
7
|
from biopipen.scripts.vcf.bcftools_utils import run_bcftools
|
|
8
8
|
|
|
9
|
-
infile = {{in.infile |
|
|
10
|
-
regions_file = {{in.regions_file |
|
|
11
|
-
samples_file = {{in.samples_file |
|
|
12
|
-
outfile = {{out.outfile |
|
|
9
|
+
infile: str = {{in.infile | quote}} # pyright: ignore # noqa: #999
|
|
10
|
+
regions_file: str = {{in.regions_file | quote}} # pyright: ignore
|
|
11
|
+
samples_file: str = {{in.samples_file | quote}} # pyright: ignore
|
|
12
|
+
outfile: str = {{out.outfile | quote}} # pyright: ignore
|
|
13
13
|
envs: dict = {{envs | dict | repr}} # pyright: ignore
|
|
14
14
|
|
|
15
15
|
bcftools = envs.pop("bcftools")
|
|
@@ -21,7 +21,7 @@ index = envs.pop("index")
|
|
|
21
21
|
if regions_file:
|
|
22
22
|
if "R" in envs or "regions_file" in envs or "regions-file" in envs:
|
|
23
23
|
logger.warning(
|
|
24
|
-
"Ignoring envs\[regions_file/regions-file/R] "
|
|
24
|
+
r"Ignoring envs\[regions_file/regions-file/R] "
|
|
25
25
|
"because in.regionsfile is provided."
|
|
26
26
|
)
|
|
27
27
|
with suppress(KeyError):
|
biopipen/scripts/vcf/Vcf2Bed.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from cyvcf2 import VCF, Variant
|
|
2
2
|
|
|
3
|
-
infile = {{in.infile | quote}} # pyright: ignore
|
|
4
|
-
outfile = {{out.outfile | quote}} # pyright: ignore
|
|
3
|
+
infile: str = {{in.infile | quote}} # pyright: ignore # noqa: E999
|
|
4
|
+
outfile: str = {{out.outfile | quote}} # pyright: ignore
|
|
5
5
|
# vcf, default 1
|
|
6
6
|
inbase = {{envs.inbase | int}} # pyright: ignore
|
|
7
7
|
# bed, default 0
|
biopipen/scripts/vcf/VcfAnno.py
CHANGED
|
@@ -2,22 +2,22 @@ from os import path
|
|
|
2
2
|
|
|
3
3
|
from biopipen.utils.misc import run_command, dict_to_cli_args
|
|
4
4
|
|
|
5
|
-
infile = {{in.infile | quote}} # pyright: ignore
|
|
6
|
-
outfile = {{out.outfile | quote}} # pyright: ignore
|
|
7
|
-
joboutdir = {{job.outdir | quote}} # pyright: ignore
|
|
5
|
+
infile: str = {{in.infile | quote}} # pyright: ignore # noqa
|
|
6
|
+
outfile: str = {{out.outfile | quote}} # pyright: ignore
|
|
7
|
+
joboutdir: str = {{job.outdir | quote}} # pyright: ignore
|
|
8
8
|
vcfanno = {{envs.vcfanno | quote}} # pyright: ignore
|
|
9
|
-
ncores = {{envs.ncores | repr}} # pyright: ignore
|
|
10
|
-
args = {{envs.args |
|
|
9
|
+
ncores: int = {{envs.ncores | repr}} # pyright: ignore
|
|
10
|
+
args: dict = {{envs.args | dict}} # pyright: ignore
|
|
11
11
|
|
|
12
|
-
{% set conf = envs.conffile or in.conffile %}
|
|
13
|
-
{% if conf | isinstance: dict %}
|
|
12
|
+
{% set conf = envs.conffile or in.conffile %} # pyright: ignore # noqa
|
|
13
|
+
{% if conf | isinstance: dict %} # pyright: ignore # noqa
|
|
14
14
|
conffile = path.join(joboutdir, "config.toml")
|
|
15
|
-
conf = {{ conf | toml | quote }}
|
|
15
|
+
conf: str = {{ conf | toml | quote }} # pyright: ignore # noqa
|
|
16
16
|
with open(conffile, "w") as f:
|
|
17
17
|
f.write(conf)
|
|
18
|
-
{% else %}
|
|
19
|
-
conffile = {{conf | quote}}
|
|
20
|
-
{% endif %}
|
|
18
|
+
{% else %} # pyright: ignore # noqa
|
|
19
|
+
conffile = {{conf | quote}} # pyright: ignore # noqa
|
|
20
|
+
{% endif %} # pyright: ignore # noqa
|
|
21
21
|
|
|
22
22
|
args["p"] = ncores
|
|
23
23
|
args["_"] = [conffile, infile]
|
biopipen/scripts/vcf/VcfDownSample.sh
CHANGED
|
@@ -1,25 +1,37 @@
|
|
|
1
|
+
# shellcheck disable=SC2148
|
|
2
|
+
# shellcheck disable=SC2036
|
|
3
|
+
# shellcheck disable=SC2030
|
|
4
|
+
# shellcheck disable=SC1083
|
|
5
|
+
# shellcheck disable=SC2288
|
|
1
6
|
infile={{in.infile | quote}}
|
|
2
7
|
outfile={{out.outfile | quote}}
|
|
3
8
|
n={{envs.n}}
|
|
4
9
|
|
|
10
|
+
# shellcheck disable=SC2031
|
|
5
11
|
if [[ $infile == *.gz ]]; then
|
|
6
|
-
outfile=$(echo $outfile | sed -r "s/\.gz$//")
|
|
7
|
-
|
|
12
|
+
outfile=$(echo "$outfile" | sed -r "s/\.gz$//")
|
|
13
|
+
# shellcheck disable=SC2126
|
|
14
|
+
nheader=$(zcat "$infile" | head -n 9999 | grep "^#" | wc -l | cut -d' ' -f1)
|
|
8
15
|
if [[ ! $n -gt 1 ]]; then
|
|
9
|
-
nrows=$(zcat $infile | wc -l | cut -d' ' -f1)
|
|
16
|
+
nrows=$(zcat "$infile" | wc -l | cut -d' ' -f1)
|
|
17
|
+
# shellcheck disable=SC2004
|
|
10
18
|
nvars=$(($nrows - $nheader))
|
|
11
19
|
n=$(echo "$nvars * $n" | bc)
|
|
12
20
|
fi
|
|
13
|
-
zcat $infile | head -n $nheader > $outfile
|
|
14
|
-
|
|
15
|
-
|
|
21
|
+
zcat "$infile" | head -n "$nheader" > "$outfile"
|
|
22
|
+
# shellcheck disable=SC2004
|
|
23
|
+
zcat "$infile" | tail -n +$(($nheader + 1)) | shuf -n "$n" | LC_ALL=C sort -k1,1V -k2,2n >> "$outfile"
|
|
24
|
+
bgzip "$outfile"
|
|
16
25
|
else
|
|
17
|
-
|
|
26
|
+
# shellcheck disable=SC2126
|
|
27
|
+
nheader=$(head -n 9999 "$infile" | grep "^#" | wc -l | cut -d' ' -f1)
|
|
18
28
|
if [[ ! $n -gt 1 ]]; then
|
|
19
|
-
nrows=$(wc -l $infile | cut -d' ' -f1)
|
|
29
|
+
nrows=$(wc -l "$infile" | cut -d' ' -f1)
|
|
30
|
+
# shellcheck disable=SC2004
|
|
20
31
|
nvars=$(($nrows - $nheader))
|
|
21
32
|
n=$(echo "$nvars * $n" | bc)
|
|
22
33
|
fi
|
|
23
|
-
head -n $nheader $infile > $outfile
|
|
24
|
-
|
|
34
|
+
head -n "$nheader" "$infile" > "$outfile"
|
|
35
|
+
# shellcheck disable=SC2004
|
|
36
|
+
tail -n +$(($nheader + 1)) "$infile" | shuf -n "$n" | LC_ALL=C sort -k1,1V -k2,2n >> "$outfile"
|
|
25
37
|
fi
|
biopipen/scripts/vcf/VcfFilter.py
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
from cyvcf2 import VCF, Writer, Variant
|
|
2
2
|
|
|
3
|
-
infile = {{in.invcf |
|
|
4
|
-
outfile = {{out.outfile |
|
|
3
|
+
infile: str = {{in.invcf | quote}} # pyright: ignore # noqa: E999
|
|
4
|
+
outfile: str = {{out.outfile | quote}} # pyright: ignore
|
|
5
5
|
|
|
6
|
-
{{envs.helper}}
|
|
6
|
+
{{envs.helper}} # pyright: ignore # noqa: E999
|
|
7
7
|
|
|
8
8
|
keep = {{envs.keep | repr}} # pyright: ignore
|
|
9
|
-
filters = {{envs.filters | repr}} # pyright: ignore
|
|
10
|
-
filter_descs = {{envs.filter_descs | repr}} # pyright: ignore
|
|
9
|
+
filters: dict = {{envs.filters | repr}} # pyright: ignore
|
|
10
|
+
filter_descs: dict = {{envs.filter_descs | repr}} # pyright: ignore
|
|
11
11
|
|
|
12
12
|
# builtin filters
|
|
13
13
|
BUILTIN_FILTERS = {}
|
biopipen/scripts/vcf/VcfFix.py
CHANGED
|
@@ -7,17 +7,17 @@ from biopipen.scripts.vcf.VcfFix_utils import ( # noqa: F401
|
|
|
7
7
|
HeaderContig,
|
|
8
8
|
HeaderGeneral,
|
|
9
9
|
Fields,
|
|
10
|
-
Info,
|
|
11
|
-
Format,
|
|
12
|
-
Alt,
|
|
13
|
-
Filter,
|
|
14
|
-
Sample,
|
|
15
|
-
Samples,
|
|
10
|
+
# Info,
|
|
11
|
+
# Format,
|
|
12
|
+
# Alt,
|
|
13
|
+
# Filter,
|
|
14
|
+
# Sample,
|
|
15
|
+
# Samples,
|
|
16
16
|
Variant,
|
|
17
17
|
)
|
|
18
18
|
from biopipen.scripts.vcf.VcfFix_utils import fix_vcffile
|
|
19
19
|
|
|
20
|
-
infile = {{in.infile | quote}} # pyright: ignore
|
|
20
|
+
infile = {{in.infile | quote}} # pyright: ignore # noqa: E999
|
|
21
21
|
instem = {{in.infile | stem | quote}} # pyright: ignore
|
|
22
22
|
outfile = {{out.outfile | quote}} # pyright: ignore
|
|
23
23
|
|
biopipen/scripts/vcf/VcfFix_utils.py
CHANGED
|
@@ -1,6 +1,15 @@
|
|
|
1
1
|
import re
|
|
2
2
|
import gzip
|
|
3
|
-
from biopipen.utils.vcf import
|
|
3
|
+
from biopipen.utils.vcf import (
|
|
4
|
+
HeaderInfo,
|
|
5
|
+
HeaderFormat,
|
|
6
|
+
HeaderFilter,
|
|
7
|
+
HeaderContig,
|
|
8
|
+
HeaderGeneral,
|
|
9
|
+
Fields,
|
|
10
|
+
Variant,
|
|
11
|
+
HeaderItem,
|
|
12
|
+
)
|
|
4
13
|
|
|
5
14
|
|
|
6
15
|
def line_to_obj(line: str):
|
|
@@ -41,7 +50,7 @@ def handle_obj(obj, fixes: dict):
|
|
|
41
50
|
|
|
42
51
|
regex = fix.get("regex")
|
|
43
52
|
if regex:
|
|
44
|
-
if not re.search(regex, obj.raw):
|
|
53
|
+
if not re.search(regex, obj.raw): # type: ignore
|
|
45
54
|
continue
|
|
46
55
|
|
|
47
56
|
return fix["fix"](obj.raw if kind is None else obj)
|
|
@@ -67,7 +76,7 @@ def fix_vcffile(vcffile, outfile, fixes):
|
|
|
67
76
|
with inopen(vcffile, "rt") as fin, open(outfile, "w") as fout:
|
|
68
77
|
for line in fin:
|
|
69
78
|
obj = line_to_obj(line)
|
|
70
|
-
out = handle_obj(obj, modify_fixes)
|
|
79
|
+
out = handle_obj(obj, modify_fixes) # type: ignore
|
|
71
80
|
if obj.kind == "fields":
|
|
72
81
|
for fix in header_append_fixes:
|
|
73
82
|
fout.write(str(fix["fix"](None)).rstrip("\n") + "\n")
|
biopipen/scripts/vcf/VcfIndex.py
CHANGED
|
@@ -4,10 +4,10 @@ from os import path
|
|
|
4
4
|
from biopipen.utils.reference import tabix_index
|
|
5
5
|
from biopipen.utils.misc import run_command
|
|
6
6
|
|
|
7
|
-
infile = {{in.infile |
|
|
8
|
-
outfile = Path({{out.outfile |
|
|
7
|
+
infile: str = {{in.infile | quote}} # pyright: ignore # noqa
|
|
8
|
+
outfile = Path({{out.outfile | quote}}) # pyright: ignore
|
|
9
9
|
outidx = {{out.outidx | repr}} # pyright: ignore
|
|
10
|
-
tabix = {{envs.tabix | repr}} # pyright: ignore
|
|
10
|
+
tabix: str = {{envs.tabix | repr}} # pyright: ignore
|
|
11
11
|
ncores = {{envs.ncores | repr}} # pyright: ignore
|
|
12
12
|
|
|
13
13
|
outfile_with_index = tabix_index(infile, "vcf", outfile.parent, tabix)
|
biopipen/scripts/vcf/VcfIntersect.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
from biopipen.utils.misc import run_command, dict_to_cli_args
|
|
2
2
|
|
|
3
|
-
infile1 = {{in.infile1 |
|
|
4
|
-
infile2 = {{in.infile2 |
|
|
5
|
-
outfile = {{out.outfile |
|
|
3
|
+
infile1: str = {{in.infile1 | quote}} # pyright: ignore # noqa
|
|
4
|
+
infile2 = {{in.infile2 | quote}} # pyright: ignore
|
|
5
|
+
outfile = {{out.outfile | quote}} # pyright: ignore
|
|
6
6
|
bcftools = {{envs.bcftools | repr}} # pyright: ignore
|
|
7
7
|
gz = {{envs.gz | repr}} # pyright: ignore
|
|
8
8
|
index = {{envs.index | repr}} # pyright: ignore
|
biopipen/scripts/vcf/VcfLiftOver.sh
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# shellcheck disable=SC2148
|
|
2
|
+
# shellcheck disable=SC1083
|
|
1
3
|
invcf={{ in.invcf | quote }}
|
|
2
4
|
outvcf={{ out.outvcf | quote }}
|
|
3
5
|
rejfile={{ job.outdir | joinpaths: "rejected.vcf" | quote }}
|
|
@@ -6,12 +8,15 @@ chain={{ envs.chain | quote }}
|
|
|
6
8
|
reffa={{ envs.reffa | quote }}
|
|
7
9
|
args={{ envs.args | dict_to_cli_args: join=True }}
|
|
8
10
|
|
|
11
|
+
# shellcheck disable=SC2154
|
|
9
12
|
refdict="${reffa%.fa}.dict"
|
|
10
13
|
if [[ ! -e "$refdict" ]]; then
|
|
11
14
|
echo "Sequence dictionary does not exist: $refdict" 1>&2
|
|
12
15
|
exit 1
|
|
13
16
|
fi
|
|
14
17
|
|
|
18
|
+
# shellcheck disable=SC2154
|
|
19
|
+
# shellcheck disable=SC2086
|
|
15
20
|
$gatk LiftoverVcf \
|
|
16
21
|
$args \
|
|
17
22
|
--INPUT "$invcf" \
|
biopipen/scripts/vcf/VcfSplitSamples.py
CHANGED
|
@@ -3,12 +3,12 @@ import shlex
|
|
|
3
3
|
import concurrent.futures
|
|
4
4
|
from subprocess import Popen, check_output
|
|
5
5
|
|
|
6
|
-
infile = {{in.infile |
|
|
7
|
-
outdir = {{out.outdir |
|
|
8
|
-
bcftools = {{envs.bcftools | repr}} # pyright: ignore
|
|
6
|
+
infile: str = {{in.infile | quote}} # pyright: ignore # noqa
|
|
7
|
+
outdir: str = {{out.outdir | quote}} # pyright: ignore
|
|
8
|
+
bcftools: str = {{envs.bcftools | repr}} # pyright: ignore
|
|
9
9
|
gz = {{envs.gz | repr}} # pyright: ignore
|
|
10
10
|
index = {{envs.index | repr}} # pyright: ignore
|
|
11
|
-
ncores = {{envs.ncores | int}} # pyright: ignore
|
|
11
|
+
ncores: int = {{envs.ncores | int}} # pyright: ignore
|
|
12
12
|
private = {{envs.private | repr}} # pyright: ignore
|
|
13
13
|
|
|
14
14
|
if index:
|
biopipen/scripts/vcf/bcftools_utils.py
CHANGED
|
@@ -15,7 +15,7 @@ def bcftools_version(bcftools: str) -> tuple[int, ...]:
|
|
|
15
15
|
"""
|
|
16
16
|
bversion = (
|
|
17
17
|
run_command([bcftools, "version"], stdout="return")
|
|
18
|
-
.splitlines()[0] # bcftools 1.20
|
|
18
|
+
.splitlines()[0] # bcftools 1.20 # type: ignore
|
|
19
19
|
.replace("bcftools", "")
|
|
20
20
|
.strip() # 1.20
|
|
21
21
|
.split(".")
|
|
@@ -24,8 +24,8 @@ def bcftools_version(bcftools: str) -> tuple[int, ...]:
|
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
def run_bcftools(
|
|
27
|
-
args: dict
|
|
28
|
-
bcftools: str,
|
|
27
|
+
args: dict,
|
|
28
|
+
bcftools: str, # TODO: get from the first argument of args
|
|
29
29
|
index: bool,
|
|
30
30
|
tabix: str
|
|
31
31
|
) -> None:
|
biopipen/scripts/web/Download.py
CHANGED
|
@@ -2,13 +2,13 @@ from pathlib import Path
|
|
|
2
2
|
|
|
3
3
|
from biopipen.utils.misc import run_command, dict_to_cli_args
|
|
4
4
|
|
|
5
|
-
url = {{in.url |
|
|
6
|
-
outfile = Path({{out.outfile |
|
|
5
|
+
url = {{in.url | quote}} # pyright: ignore # noqa
|
|
6
|
+
outfile = Path({{out.outfile | quote}}) # pyright: ignore
|
|
7
7
|
tool = {{envs.tool | repr}} # pyright: ignore
|
|
8
8
|
wget = {{envs.wget | repr}} # pyright: ignore
|
|
9
9
|
aria2c = {{envs.aria2c | repr}} # pyright: ignore
|
|
10
10
|
ncores = {{envs.ncores | repr}} # pyright: ignore
|
|
11
|
-
args = {{envs.args | dict}} # pyright: ignore
|
|
11
|
+
args: dict = {{envs.args | dict}} # pyright: ignore
|
|
12
12
|
|
|
13
13
|
if tool == "wget":
|
|
14
14
|
args["_"] = url
|
|
@@ -28,4 +28,8 @@ elif tool == "aria2c":
|
|
|
28
28
|
|
|
29
29
|
else: # use python
|
|
30
30
|
import urllib
|
|
31
|
-
|
|
31
|
+
|
|
32
|
+
try:
|
|
33
|
+
urllib.urlretrieve(url, outfile) # type: ignore
|
|
34
|
+
except AttributeError:
|
|
35
|
+
urllib.request.urlretrieve(url, outfile) # type: ignore
|
biopipen/scripts/web/DownloadList.py
CHANGED
|
@@ -2,13 +2,13 @@ from pathlib import Path
|
|
|
2
2
|
|
|
3
3
|
from biopipen.utils.misc import run_command, dict_to_cli_args
|
|
4
4
|
|
|
5
|
-
urlfile = {{in.urlfile |
|
|
6
|
-
outdir = Path({{out.outdir |
|
|
5
|
+
urlfile: str = {{in.urlfile | quote}} # pyright: ignore # noqa
|
|
6
|
+
outdir = Path({{out.outdir | quote}}) # pyright: ignore
|
|
7
7
|
tool = {{envs.tool | repr}} # pyright: ignore
|
|
8
8
|
wget = {{envs.wget | repr}} # pyright: ignore
|
|
9
9
|
aria2c = {{envs.aria2c | repr}} # pyright: ignore
|
|
10
10
|
ncores = {{envs.ncores | repr}} # pyright: ignore
|
|
11
|
-
args = {{envs.args | repr}} # pyright: ignore
|
|
11
|
+
args: dict = {{envs.args | repr}} # pyright: ignore
|
|
12
12
|
|
|
13
13
|
if tool == "wget":
|
|
14
14
|
args["i"] = urlfile
|
|
@@ -26,10 +26,10 @@ elif tool == "aria2c":
|
|
|
26
26
|
run_command(dict_to_cli_args(args, dashify=True), fg=True)
|
|
27
27
|
|
|
28
28
|
else: # use python
|
|
29
|
-
import
|
|
29
|
+
from urllib.request import urlretrieve
|
|
30
30
|
from urllib.parse import urlparse
|
|
31
31
|
with open(urlfile, "r") as furl:
|
|
32
32
|
for i, url in enumerate(furl.readlines()):
|
|
33
33
|
parsed = urlparse(url)
|
|
34
34
|
path = Path(parsed.path)
|
|
35
|
-
|
|
35
|
+
urlretrieve(url, f"{path.stem}-{i}{path.suffix}")
|
biopipen/scripts/web/GCloudStorageDownloadBucket.py
CHANGED
|
@@ -8,12 +8,12 @@ from biopipen.scripts.web.gcloud_common import (
|
|
|
8
8
|
get_file_path,
|
|
9
9
|
)
|
|
10
10
|
|
|
11
|
-
url = {{in.url |
|
|
11
|
+
url: str = {{in.url | quote}} # pyright: ignore # noqa: E999
|
|
12
12
|
outdir = Path({{out.outdir | repr}}) # pyright: ignore
|
|
13
|
-
gcloud = {{envs.gcloud |
|
|
13
|
+
gcloud: str = {{envs.gcloud | quote}} # pyright: ignore
|
|
14
14
|
keep_structure = {{envs.keep_structure | repr}} # pyright: ignore
|
|
15
|
-
ncores = {{envs.ncores | repr}} # pyright: ignore
|
|
16
|
-
args = {{envs.args | repr}} # pyright: ignore
|
|
15
|
+
ncores: int = {{envs.ncores | repr}} # pyright: ignore
|
|
16
|
+
args: dict = {{envs.args | repr}} # pyright: ignore
|
|
17
17
|
|
|
18
18
|
if not is_valid_gs_bucket_url(url):
|
|
19
19
|
raise Exception(
|
|
@@ -65,7 +65,7 @@ def download_file(i: int, line: str, total: int):
|
|
|
65
65
|
def download_bucket():
|
|
66
66
|
out = run_command([gcloud, "storage", "ls", "--recursive", url], stdout="RETURN")
|
|
67
67
|
# remove empty lines and skip the root
|
|
68
|
-
out = list(filter(None, out.splitlines()[1:]))
|
|
68
|
+
out = list(filter(None, out.splitlines()[1:])) # type: ignore
|
|
69
69
|
if keep_structure:
|
|
70
70
|
# create folders first
|
|
71
71
|
logger.info(f"Creating folders to keep structure.")
|
biopipen/scripts/web/GCloudStorageDownloadFile.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
from biopipen.utils.misc import run_command, dict_to_cli_args
|
|
2
2
|
from biopipen.scripts.web.gcloud_common import is_logged_in, is_valid_gs_file_url
|
|
3
3
|
|
|
4
|
-
url = {{in.url | repr}} # pyright: ignore # noqa: E999
|
|
4
|
+
url: str = {{in.url | repr}} # pyright: ignore # noqa: E999
|
|
5
5
|
outfile = {{out.outfile | repr}} # pyright: ignore
|
|
6
|
-
gcloud = {{envs.gcloud | repr}} # pyright: ignore
|
|
7
|
-
args = {{envs.args | repr}} # pyright: ignore
|
|
6
|
+
gcloud: str = {{envs.gcloud | repr}} # pyright: ignore
|
|
7
|
+
args: dict = {{envs.args | repr}} # pyright: ignore
|
|
8
8
|
|
|
9
9
|
if not is_valid_gs_file_url(url):
|
|
10
10
|
raise Exception(
|
biopipen/scripts/web/gcloud_common.py
CHANGED
|
@@ -12,7 +12,7 @@ def is_logged_in(gcloud: str) -> bool:
|
|
|
12
12
|
bool: True if the user is logged in, False otherwise.
|
|
13
13
|
"""
|
|
14
14
|
out = run_command([gcloud, "auth", "list"], stdout="RETURN")
|
|
15
|
-
return "ACTIVE" in out
|
|
15
|
+
return "ACTIVE" in out # type: ignore
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
def is_valid_gs_bucket_url(url: str) -> bool:
|
biopipen/utils/gsea.R
CHANGED
|
@@ -34,46 +34,85 @@ if (!exists("slugify")) {
|
|
|
34
34
|
}
|
|
35
35
|
}
|
|
36
36
|
|
|
37
|
+
#' Download the GMT file and save it to cachedir
|
|
38
|
+
#' Return the path to the GMT file
|
|
39
|
+
#' We also check if the second column is shorter than the first column.
|
|
40
|
+
#' If so, we switch the first and second columns.
|
|
41
|
+
#' In case some providers provide the GMT file with the first and second columns switched.
|
|
42
|
+
#' We also replace the "/" in the gene names with "-" if any. This is because the "/" is
|
|
43
|
+
#' not allowed in a path, but GSEA uses the gene names as the file name.
|
|
44
|
+
#'
|
|
45
|
+
#' @param gmturl The URL or path of the GMT file
|
|
46
|
+
#' @param cachedir The directory to save the GMT file
|
|
47
|
+
#' @return The path to the GMT file
|
|
37
48
|
localizeGmtfile <- function(gmturl, cachedir = tempdir()) {
|
|
38
49
|
# Download the GMT file and save it to cachedir
|
|
39
50
|
# Return the path to the GMT file
|
|
40
|
-
|
|
41
|
-
|
|
51
|
+
in_gmtfile <- out_gmtfile <- file.path(cachedir, basename(gmturl))
|
|
52
|
+
if (startsWith(gmturl, "http") || startsWith(gmturl, "ftp")) {
|
|
53
|
+
download.file(gmturl, in_gmtfile)
|
|
54
|
+
remote <- TRUE
|
|
55
|
+
} else {
|
|
56
|
+
in_gmtfile <- gmturl
|
|
57
|
+
remote <- FALSE
|
|
42
58
|
}
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
nchar(items$V2[1]) > 0 &&
|
|
57
|
-
is.na(suppressWarnings(as.numeric(items$V2[1])))
|
|
58
|
-
) {
|
|
59
|
-
warning(paste0(
|
|
60
|
-
"The second column is shorter, switching the first and second columns in GMT file ",
|
|
61
|
-
gmtfile,
|
|
62
|
-
" from ",
|
|
63
|
-
gmturl
|
|
64
|
-
))
|
|
65
|
-
items <- items[, c(2, 1, 3:ncol(items))]
|
|
66
|
-
write.table(
|
|
67
|
-
items,
|
|
68
|
-
gmtfile,
|
|
69
|
-
row.names = F,
|
|
70
|
-
col.names = F,
|
|
71
|
-
sep = "\t",
|
|
72
|
-
quote = F
|
|
73
|
-
)
|
|
74
|
-
}
|
|
59
|
+
|
|
60
|
+
items <- readLines(in_gmtfile)
|
|
61
|
+
items <- items[!grepl("^#", items) & nchar(items) > 0]
|
|
62
|
+
items <- lapply(strsplit(items, "\t"), function(x) c(x[1:2], paste0(x[3:length(x)], collapse = "\t")))
|
|
63
|
+
items <- as.data.frame(t(as.data.frame(items)))
|
|
64
|
+
rownames(items) <- NULL
|
|
65
|
+
colnames(items) <- c("V1", "V2", "V3")
|
|
66
|
+
|
|
67
|
+
if (ncol(items) < 3) {
|
|
68
|
+
stop(paste0("Invalid GMT file: ", gmturl))
|
|
69
|
+
}
|
|
70
|
+
if (nrow(items) == 0) {
|
|
71
|
+
stop(paste0("Empty GMT file: ", gmturl))
|
|
75
72
|
}
|
|
76
|
-
|
|
73
|
+
|
|
74
|
+
# Check if the second column is shorter than the first column
|
|
75
|
+
nchars1 <- sum(nchar(items$V1))
|
|
76
|
+
nchars2 <- sum(nchar(items$V2))
|
|
77
|
+
prefix <- gsub("[0-9]+$", "", items$V2[1])
|
|
78
|
+
|
|
79
|
+
if (is.character(items$V2) && # Only when V2 is character, as pathway names
|
|
80
|
+
nchars2 < nchars1 && # Only when V2 is shorter than V1
|
|
81
|
+
all(nchar(items$V2) > 0) && # Only when V2 is not empty
|
|
82
|
+
!all(grepl("^[0-9]+$", items$V2)) && # Only when V2 is not all numbers
|
|
83
|
+
(nchar(prefix) == 0 || !all(startsWith(items$V2, prefix))) # Only when they are not like hsa00001, hsa00002, etc.
|
|
84
|
+
) {
|
|
85
|
+
warning(paste0(
|
|
86
|
+
"The second column is shorter, switching the first and second columns in ",
|
|
87
|
+
"GMT file ", gmturl
|
|
88
|
+
))
|
|
89
|
+
items <- items[, c(2, 1, 3:ncol(items))]
|
|
90
|
+
switched <- TRUE
|
|
91
|
+
} else {
|
|
92
|
+
switched <- FALSE
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
if (any(grepl("/", items$V1))) {
|
|
96
|
+
items$V1 <- gsub("/", "-", items$V1)
|
|
97
|
+
replaced <- TRUE
|
|
98
|
+
} else {
|
|
99
|
+
replaced <- FALSE
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
if (remote || switched || replaced) {
|
|
103
|
+
write.table(
|
|
104
|
+
items,
|
|
105
|
+
out_gmtfile,
|
|
106
|
+
row.names = FALSE,
|
|
107
|
+
col.names = FALSE,
|
|
108
|
+
sep = "\t",
|
|
109
|
+
quote = FALSE
|
|
110
|
+
)
|
|
111
|
+
} else {
|
|
112
|
+
out_gmtfile <- in_gmtfile
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
return(out_gmtfile)
|
|
77
116
|
}
|
|
78
117
|
|
|
79
118
|
|
|
@@ -261,6 +300,7 @@ runGSEA = function(
|
|
|
261
300
|
mutate(Description = "na") %>%
|
|
262
301
|
rownames_to_column("NAME") %>%
|
|
263
302
|
select(NAME, Description, everything())
|
|
303
|
+
|
|
264
304
|
write.table(
|
|
265
305
|
indata,
|
|
266
306
|
gctfile,
|