biopipen 0.32.3__py3-none-any.whl → 0.33.1__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (118)
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +6 -0
  3. biopipen/core/filters.py +35 -23
  4. biopipen/core/testing.py +6 -1
  5. biopipen/ns/bam.py +39 -0
  6. biopipen/ns/cellranger.py +5 -0
  7. biopipen/ns/cellranger_pipeline.py +2 -2
  8. biopipen/ns/cnvkit_pipeline.py +4 -1
  9. biopipen/ns/delim.py +33 -27
  10. biopipen/ns/protein.py +99 -0
  11. biopipen/ns/scrna.py +428 -250
  12. biopipen/ns/snp.py +16 -3
  13. biopipen/ns/tcr.py +125 -1
  14. biopipen/ns/vcf.py +34 -0
  15. biopipen/ns/web.py +5 -1
  16. biopipen/reports/scrna/SeuratClusterStats.svelte +1 -1
  17. biopipen/reports/scrna/SeuratMap2Ref.svelte +15 -2
  18. biopipen/reports/tcr/ClonalStats.svelte +15 -0
  19. biopipen/reports/utils/misc.liq +20 -7
  20. biopipen/scripts/bam/BamMerge.py +2 -2
  21. biopipen/scripts/bam/BamSampling.py +4 -4
  22. biopipen/scripts/bam/BamSort.py +141 -0
  23. biopipen/scripts/bam/BamSplitChroms.py +10 -10
  24. biopipen/scripts/bam/BamSubsetByBed.py +3 -3
  25. biopipen/scripts/bam/CNVpytor.py +10 -10
  26. biopipen/scripts/bam/ControlFREEC.py +11 -11
  27. biopipen/scripts/bed/Bed2Vcf.py +5 -5
  28. biopipen/scripts/bed/BedConsensus.py +5 -5
  29. biopipen/scripts/bed/BedLiftOver.sh +6 -4
  30. biopipen/scripts/bed/BedtoolsIntersect.py +4 -4
  31. biopipen/scripts/bed/BedtoolsMakeWindows.py +3 -3
  32. biopipen/scripts/bed/BedtoolsMerge.py +4 -4
  33. biopipen/scripts/cellranger/CellRangerCount.py +20 -9
  34. biopipen/scripts/cellranger/CellRangerSummary.R +20 -29
  35. biopipen/scripts/cellranger/CellRangerVdj.py +8 -8
  36. biopipen/scripts/cnvkit/CNVkitAccess.py +6 -6
  37. biopipen/scripts/cnvkit/CNVkitAutobin.py +25 -18
  38. biopipen/scripts/cnvkit/CNVkitBatch.py +5 -5
  39. biopipen/scripts/cnvkit/CNVkitCall.py +3 -3
  40. biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -2
  41. biopipen/scripts/cnvkit/CNVkitDiagram.py +5 -5
  42. biopipen/scripts/cnvkit/CNVkitFix.py +3 -3
  43. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +9 -5
  44. biopipen/scripts/cnvkit/CNVkitHeatmap.py +4 -4
  45. biopipen/scripts/cnvkit/CNVkitReference.py +2 -2
  46. biopipen/scripts/cnvkit/CNVkitScatter.py +5 -5
  47. biopipen/scripts/cnvkit/CNVkitSegment.py +5 -5
  48. biopipen/scripts/cnvkit/guess_baits.py +166 -93
  49. biopipen/scripts/delim/SampleInfo.R +94 -148
  50. biopipen/scripts/misc/Config2File.py +2 -2
  51. biopipen/scripts/misc/Str2File.py +2 -2
  52. biopipen/scripts/protein/MMCIF2PDB.py +33 -0
  53. biopipen/scripts/protein/PDB2Fasta.py +60 -0
  54. biopipen/scripts/protein/Prodigy.py +4 -4
  55. biopipen/scripts/protein/RMSD.py +178 -0
  56. biopipen/scripts/regulatory/MotifScan.py +8 -8
  57. biopipen/scripts/scrna/CellCellCommunication.py +59 -22
  58. biopipen/scripts/scrna/LoomTo10X.R +51 -0
  59. biopipen/scripts/scrna/MarkersFinder.R +273 -654
  60. biopipen/scripts/scrna/RadarPlots.R +73 -53
  61. biopipen/scripts/scrna/SCP-plot.R +15202 -0
  62. biopipen/scripts/scrna/ScVelo.py +0 -0
  63. biopipen/scripts/scrna/SeuratClusterStats-clustree.R +23 -31
  64. biopipen/scripts/scrna/SeuratClusterStats-dimplots.R +26 -54
  65. biopipen/scripts/scrna/SeuratClusterStats-features.R +85 -403
  66. biopipen/scripts/scrna/SeuratClusterStats-ngenes.R +32 -17
  67. biopipen/scripts/scrna/SeuratClusterStats-stats.R +45 -239
  68. biopipen/scripts/scrna/SeuratClusterStats.R +13 -19
  69. biopipen/scripts/scrna/SeuratMap2Ref.R +16 -12
  70. biopipen/scripts/scrna/SeuratPreparing.R +138 -81
  71. biopipen/scripts/scrna/SlingShot.R +71 -0
  72. biopipen/scripts/scrna/celltypist-wrapper.py +7 -6
  73. biopipen/scripts/snp/Plink2GTMat.py +26 -11
  74. biopipen/scripts/snp/PlinkFilter.py +7 -7
  75. biopipen/scripts/snp/PlinkFromVcf.py +8 -5
  76. biopipen/scripts/snp/PlinkSimulation.py +4 -4
  77. biopipen/scripts/snp/PlinkUpdateName.py +4 -4
  78. biopipen/scripts/stats/ChowTest.R +48 -22
  79. biopipen/scripts/tcgamaf/Maf2Vcf.py +2 -2
  80. biopipen/scripts/tcgamaf/MafAddChr.py +2 -2
  81. biopipen/scripts/tcr/ClonalStats.R +484 -0
  82. biopipen/scripts/tcr/ScRepLoading.R +127 -0
  83. biopipen/scripts/tcr/TCRDock.py +10 -6
  84. biopipen/scripts/tcr/vdjtools-patch.sh +1 -1
  85. biopipen/scripts/vcf/BcftoolsAnnotate.py +8 -8
  86. biopipen/scripts/vcf/BcftoolsFilter.py +3 -3
  87. biopipen/scripts/vcf/BcftoolsMerge.py +31 -0
  88. biopipen/scripts/vcf/BcftoolsSort.py +4 -4
  89. biopipen/scripts/vcf/BcftoolsView.py +5 -5
  90. biopipen/scripts/vcf/Vcf2Bed.py +2 -2
  91. biopipen/scripts/vcf/VcfAnno.py +11 -11
  92. biopipen/scripts/vcf/VcfDownSample.sh +22 -10
  93. biopipen/scripts/vcf/VcfFilter.py +5 -5
  94. biopipen/scripts/vcf/VcfFix.py +7 -7
  95. biopipen/scripts/vcf/VcfFix_utils.py +12 -3
  96. biopipen/scripts/vcf/VcfIndex.py +3 -3
  97. biopipen/scripts/vcf/VcfIntersect.py +3 -3
  98. biopipen/scripts/vcf/VcfLiftOver.sh +5 -0
  99. biopipen/scripts/vcf/VcfSplitSamples.py +4 -4
  100. biopipen/scripts/vcf/bcftools_utils.py +3 -3
  101. biopipen/scripts/web/Download.py +8 -4
  102. biopipen/scripts/web/DownloadList.py +5 -5
  103. biopipen/scripts/web/GCloudStorageDownloadBucket.py +5 -5
  104. biopipen/scripts/web/GCloudStorageDownloadFile.py +3 -3
  105. biopipen/scripts/web/gcloud_common.py +1 -1
  106. biopipen/utils/gsea.R +75 -35
  107. biopipen/utils/misc.R +205 -7
  108. biopipen/utils/misc.py +17 -8
  109. biopipen/utils/reference.py +11 -11
  110. biopipen/utils/repr.R +146 -0
  111. biopipen/utils/vcf.py +1 -1
  112. {biopipen-0.32.3.dist-info → biopipen-0.33.1.dist-info}/METADATA +8 -8
  113. {biopipen-0.32.3.dist-info → biopipen-0.33.1.dist-info}/RECORD +115 -105
  114. {biopipen-0.32.3.dist-info → biopipen-0.33.1.dist-info}/WHEEL +1 -1
  115. biopipen/scripts/scrna/SeuratClusterStats-hists.R +0 -144
  116. biopipen/scripts/scrna/SeuratPreparing-common.R +0 -467
  117. biopipen/scripts/scrna/SeuratPreparing-doublet_detection.R +0 -204
  118. {biopipen-0.32.3.dist-info → biopipen-0.33.1.dist-info}/entry_points.txt +0 -0
@@ -6,11 +6,11 @@ from biopipen.utils.reference import tabix_index
6
6
  from biopipen.utils.misc import logger
7
7
  from biopipen.scripts.vcf.bcftools_utils import run_bcftools
8
8
 
9
- infile = {{in.infile | repr}} # pyright: ignore # noqa: E999
10
- annfile = {{in.annfile | repr}} # pyright: ignore
11
- outfile = {{out.outfile | repr}} # pyright: ignore
12
- joboutdir = {{job.outdir | repr}} # pyright: ignore
13
- envs = {{envs | dict | repr}} # pyright: ignore
9
+ infile: str = {{in.infile | quote}} # pyright: ignore # noqa: E999
10
+ annfile: str = {{in.annfile | quote}} # pyright: ignore
11
+ outfile: str = {{out.outfile | quote}} # pyright: ignore
12
+ joboutdir: str = {{job.outdir | quote}} # pyright: ignore
13
+ envs: dict = {{envs | dict | repr}} # pyright: ignore
14
14
 
15
15
  bcftools = envs.pop("bcftools")
16
16
  tabix = envs.pop("tabix")
@@ -25,14 +25,14 @@ if isinstance(columns, list):
25
25
  columns = ",".join(columns)
26
26
 
27
27
  if "c" in envs:
28
- logger.warning("Ignoring envs\[c], use envs\[columns] instead.")
28
+ logger.warning(r"Ignoring envs\[c], use envs\[columns] instead.")
29
29
  del envs["c"]
30
30
 
31
31
  if isinstance(remove, list):
32
32
  remove = ",".join(remove)
33
33
 
34
34
  if "x" in envs:
35
- logger.warning("Ignoring envs\[x], use envs\[remove] instead.")
35
+ logger.warning(r"Ignoring envs\[x], use envs\[remove] instead.")
36
36
  del envs["x"]
37
37
 
38
38
  envs_has_annfile = "a" in envs or "annotations" in envs
@@ -43,7 +43,7 @@ if header:
43
43
 
44
44
  if annfile and envs_has_annfile:
45
45
  logger.warning(
46
- "Ignoring envs\[a/annotations] because in.annfile is provided."
46
+ r"Ignoring envs\[a/annotations] because in.annfile is provided."
47
47
  )
48
48
  with suppress(KeyError):
49
49
  del envs["a"]
@@ -3,11 +3,11 @@ from pathlib import Path, PosixPath # noqa: F401
3
3
  from biopipen.utils.misc import logger
4
4
  from biopipen.scripts.vcf.bcftools_utils import run_bcftools
5
5
 
6
- infile = {{in.infile | repr}} # pyright: ignore # noqa: #999
7
- outfile = {{out.outfile | repr}} # pyright: ignore
6
+ infile: str | Path = {{in.infile | quote}} # pyright: ignore # noqa: #999
7
+ outfile: str = {{out.outfile | quote}} # pyright: ignore
8
8
  outdir = Path(outfile).parent
9
9
 
10
- envs = {{envs | dict | repr}} # pyright: ignore
10
+ envs: dict = {{envs | dict | repr}} # pyright: ignore
11
11
  bcftools = envs.pop("bcftools")
12
12
  tabix = envs.pop("tabix")
13
13
  keep = envs.pop("keep")
@@ -0,0 +1,31 @@
1
+ from biopipen.utils.reference import tabix_index
2
+ from biopipen.utils.misc import logger
3
+ from biopipen.scripts.vcf.bcftools_utils import run_bcftools
4
+
5
+ infiles: list = {{in.infiles | each: as_path}} # pyright: ignore # noqa: E999
6
+ outfile = {{out.outfile | repr}} # pyright: ignore
7
+ joboutdir = {{job.outdir | repr}} # pyright: ignore
8
+ envs: dict = {{envs | dict | repr}} # pyright: ignore
9
+
10
+ bcftools = envs.pop("bcftools")
11
+ tabix = envs.pop("tabix")
12
+ ncores = envs.pop("ncores")
13
+ gz = envs.pop("gz")
14
+ index = envs.pop("index")
15
+
16
+ envs.setdefault("force-single", True)
17
+ envs.setdefault("missing-to-ref", True)
18
+
19
+ if index and not gz:
20
+ logger.warning("Forcing envs.gz to True because envs.index is True.")
21
+ gz = True
22
+
23
+ if "O" not in envs and "output-type" not in envs and "output_type" not in envs:
24
+ envs["O"] = "z" if gz else "v"
25
+
26
+ envs[""] = [bcftools, "merge"]
27
+ envs["o"] = outfile
28
+ envs["threads"] = ncores
29
+ envs["_"] = infiles
30
+
31
+ run_bcftools(envs, bcftools=bcftools, index=index, tabix=tabix)
@@ -4,9 +4,9 @@ from pathlib import Path, PosixPath # noqa: F401
4
4
  from biopipen.utils.misc import run_command, logger
5
5
  from biopipen.scripts.vcf.bcftools_utils import run_bcftools
6
6
 
7
- infile = {{in.infile | quote}} # pyright: ignore # noqa: E999
8
- outfile = {{out.outfile | quote}} # pyright: ignore
9
- envs = {{envs | dict | repr}} # pyright: ignore
7
+ infile: str = {{in.infile | quote}} # pyright: ignore # noqa: E999
8
+ outfile: str = {{out.outfile | quote}} # pyright: ignore
9
+ envs: dict = {{envs | dict | repr}} # pyright: ignore
10
10
 
11
11
  outdir = Path(outfile).parent
12
12
  bcftools = envs.pop("bcftools")
@@ -97,7 +97,7 @@ if chrsize:
97
97
  infile
98
98
  ], fg=True)
99
99
 
100
- infile = reheader_vcf
100
+ infile = str(reheader_vcf)
101
101
 
102
102
  envs[""] = [bcftools, "sort"]
103
103
  envs["_"] = infile
@@ -6,10 +6,10 @@ from biopipen.utils.misc import logger
6
6
  from biopipen.utils.reference import tabix_index
7
7
  from biopipen.scripts.vcf.bcftools_utils import run_bcftools
8
8
 
9
- infile = {{in.infile | repr}} # pyright: ignore # noqa: #999
10
- regions_file = {{in.regions_file | repr}} # pyright: ignore
11
- samples_file = {{in.samples_file | repr}} # pyright: ignore
12
- outfile = {{out.outfile | repr}} # pyright: ignore
9
+ infile: str = {{in.infile | quote}} # pyright: ignore # noqa: #999
10
+ regions_file: str = {{in.regions_file | quote}} # pyright: ignore
11
+ samples_file: str = {{in.samples_file | quote}} # pyright: ignore
12
+ outfile: str = {{out.outfile | quote}} # pyright: ignore
13
13
  envs: dict = {{envs | dict | repr}} # pyright: ignore
14
14
 
15
15
  bcftools = envs.pop("bcftools")
@@ -21,7 +21,7 @@ index = envs.pop("index")
21
21
  if regions_file:
22
22
  if "R" in envs or "regions_file" in envs or "regions-file" in envs:
23
23
  logger.warning(
24
- "Ignoring envs\[regions_file/regions-file/R] "
24
+ r"Ignoring envs\[regions_file/regions-file/R] "
25
25
  "because in.regionsfile is provided."
26
26
  )
27
27
  with suppress(KeyError):
@@ -1,7 +1,7 @@
1
1
  from cyvcf2 import VCF, Variant
2
2
 
3
- infile = {{in.infile | quote}} # pyright: ignore
4
- outfile = {{out.outfile | quote}} # pyright: ignore
3
+ infile: str = {{in.infile | quote}} # pyright: ignore # noqa: E999
4
+ outfile: str = {{out.outfile | quote}} # pyright: ignore
5
5
  # vcf, default 1
6
6
  inbase = {{envs.inbase | int}} # pyright: ignore
7
7
  # bed, default 0
@@ -2,22 +2,22 @@ from os import path
2
2
 
3
3
  from biopipen.utils.misc import run_command, dict_to_cli_args
4
4
 
5
- infile = {{in.infile | quote}} # pyright: ignore
6
- outfile = {{out.outfile | quote}} # pyright: ignore
7
- joboutdir = {{job.outdir | quote}} # pyright: ignore
5
+ infile: str = {{in.infile | quote}} # pyright: ignore # noqa
6
+ outfile: str = {{out.outfile | quote}} # pyright: ignore
7
+ joboutdir: str = {{job.outdir | quote}} # pyright: ignore
8
8
  vcfanno = {{envs.vcfanno | quote}} # pyright: ignore
9
- ncores = {{envs.ncores | repr}} # pyright: ignore
10
- args = {{envs.args | repr}} # pyright: ignore
9
+ ncores: int = {{envs.ncores | repr}} # pyright: ignore
10
+ args: dict = {{envs.args | dict}} # pyright: ignore
11
11
 
12
- {% set conf = envs.conffile or in.conffile %}
13
- {% if conf | isinstance: dict %}
12
+ {% set conf = envs.conffile or in.conffile %} # pyright: ignore # noqa
13
+ {% if conf | isinstance: dict %} # pyright: ignore # noqa
14
14
  conffile = path.join(joboutdir, "config.toml")
15
- conf = {{ conf | toml | quote }}
15
+ conf: str = {{ conf | toml | quote }} # pyright: ignore # noqa
16
16
  with open(conffile, "w") as f:
17
17
  f.write(conf)
18
- {% else %}
19
- conffile = {{conf | quote}}
20
- {% endif %}
18
+ {% else %} # pyright: ignore # noqa
19
+ conffile = {{conf | quote}} # pyright: ignore # noqa
20
+ {% endif %} # pyright: ignore # noqa
21
21
 
22
22
  args["p"] = ncores
23
23
  args["_"] = [conffile, infile]
@@ -1,25 +1,37 @@
1
+ # shellcheck disable=SC2148
2
+ # shellcheck disable=SC2036
3
+ # shellcheck disable=SC2030
4
+ # shellcheck disable=SC1083
5
+ # shellcheck disable=SC2288
1
6
  infile={{in.infile | quote}}
2
7
  outfile={{out.outfile | quote}}
3
8
  n={{envs.n}}
4
9
 
10
+ # shellcheck disable=SC2031
5
11
  if [[ $infile == *.gz ]]; then
6
- outfile=$(echo $outfile | sed -r "s/\.gz$//")
7
- nheader=$(zcat $infile | head -n 9999 | grep "^#" | wc -l | cut -d' ' -f1)
12
+ outfile=$(echo "$outfile" | sed -r "s/\.gz$//")
13
+ # shellcheck disable=SC2126
14
+ nheader=$(zcat "$infile" | head -n 9999 | grep "^#" | wc -l | cut -d' ' -f1)
8
15
  if [[ ! $n -gt 1 ]]; then
9
- nrows=$(zcat $infile | wc -l | cut -d' ' -f1)
16
+ nrows=$(zcat "$infile" | wc -l | cut -d' ' -f1)
17
+ # shellcheck disable=SC2004
10
18
  nvars=$(($nrows - $nheader))
11
19
  n=$(echo "$nvars * $n" | bc)
12
20
  fi
13
- zcat $infile | head -n $nheader > $outfile
14
- zcat $infile | tail -n +$(($nheader + 1)) | shuf -n $n | LC_ALL=C sort -k1,1V -k2,2n >> $outfile
15
- bgzip $outfile
21
+ zcat "$infile" | head -n "$nheader" > "$outfile"
22
+ # shellcheck disable=SC2004
23
+ zcat "$infile" | tail -n +$(($nheader + 1)) | shuf -n "$n" | LC_ALL=C sort -k1,1V -k2,2n >> "$outfile"
24
+ bgzip "$outfile"
16
25
  else
17
- nheader=$(head -n 9999 $infile | grep "^#" | wc -l | cut -d' ' -f1)
26
+ # shellcheck disable=SC2126
27
+ nheader=$(head -n 9999 "$infile" | grep "^#" | wc -l | cut -d' ' -f1)
18
28
  if [[ ! $n -gt 1 ]]; then
19
- nrows=$(wc -l $infile | cut -d' ' -f1)
29
+ nrows=$(wc -l "$infile" | cut -d' ' -f1)
30
+ # shellcheck disable=SC2004
20
31
  nvars=$(($nrows - $nheader))
21
32
  n=$(echo "$nvars * $n" | bc)
22
33
  fi
23
- head -n $nheader $infile > $outfile
24
- tail -n +$(($nheader + 1)) $infile | shuf -n $n | LC_ALL=C sort -k1,1V -k2,2n >> $outfile
34
+ head -n "$nheader" "$infile" > "$outfile"
35
+ # shellcheck disable=SC2004
36
+ tail -n +$(($nheader + 1)) "$infile" | shuf -n "$n" | LC_ALL=C sort -k1,1V -k2,2n >> "$outfile"
25
37
  fi
@@ -1,13 +1,13 @@
1
1
  from cyvcf2 import VCF, Writer, Variant
2
2
 
3
- infile = {{in.invcf | repr}} # pyright: ignore
4
- outfile = {{out.outfile | repr}} # pyright: ignore
3
+ infile: str = {{in.invcf | quote}} # pyright: ignore # noqa: E999
4
+ outfile: str = {{out.outfile | quote}} # pyright: ignore
5
5
 
6
- {{envs.helper}}
6
+ {{envs.helper}} # pyright: ignore # noqa: E999
7
7
 
8
8
  keep = {{envs.keep | repr}} # pyright: ignore
9
- filters = {{envs.filters | repr}} # pyright: ignore
10
- filter_descs = {{envs.filter_descs | repr}} # pyright: ignore
9
+ filters: dict = {{envs.filters | repr}} # pyright: ignore
10
+ filter_descs: dict = {{envs.filter_descs | repr}} # pyright: ignore
11
11
 
12
12
  # builtin filters
13
13
  BUILTIN_FILTERS = {}
@@ -7,17 +7,17 @@ from biopipen.scripts.vcf.VcfFix_utils import ( # noqa: F401
7
7
  HeaderContig,
8
8
  HeaderGeneral,
9
9
  Fields,
10
- Info,
11
- Format,
12
- Alt,
13
- Filter,
14
- Sample,
15
- Samples,
10
+ # Info,
11
+ # Format,
12
+ # Alt,
13
+ # Filter,
14
+ # Sample,
15
+ # Samples,
16
16
  Variant,
17
17
  )
18
18
  from biopipen.scripts.vcf.VcfFix_utils import fix_vcffile
19
19
 
20
- infile = {{in.infile | quote}} # pyright: ignore
20
+ infile = {{in.infile | quote}} # pyright: ignore # noqa: E999
21
21
  instem = {{in.infile | stem | quote}} # pyright: ignore
22
22
  outfile = {{out.outfile | quote}} # pyright: ignore
23
23
 
@@ -1,6 +1,15 @@
1
1
  import re
2
2
  import gzip
3
- from biopipen.utils.vcf import * # noqa: F401, F403
3
+ from biopipen.utils.vcf import (
4
+ HeaderInfo,
5
+ HeaderFormat,
6
+ HeaderFilter,
7
+ HeaderContig,
8
+ HeaderGeneral,
9
+ Fields,
10
+ Variant,
11
+ HeaderItem,
12
+ )
4
13
 
5
14
 
6
15
  def line_to_obj(line: str):
@@ -41,7 +50,7 @@ def handle_obj(obj, fixes: dict):
41
50
 
42
51
  regex = fix.get("regex")
43
52
  if regex:
44
- if not re.search(regex, obj.raw):
53
+ if not re.search(regex, obj.raw): # type: ignore
45
54
  continue
46
55
 
47
56
  return fix["fix"](obj.raw if kind is None else obj)
@@ -67,7 +76,7 @@ def fix_vcffile(vcffile, outfile, fixes):
67
76
  with inopen(vcffile, "rt") as fin, open(outfile, "w") as fout:
68
77
  for line in fin:
69
78
  obj = line_to_obj(line)
70
- out = handle_obj(obj, modify_fixes)
79
+ out = handle_obj(obj, modify_fixes) # type: ignore
71
80
  if obj.kind == "fields":
72
81
  for fix in header_append_fixes:
73
82
  fout.write(str(fix["fix"](None)).rstrip("\n") + "\n")
@@ -4,10 +4,10 @@ from os import path
4
4
  from biopipen.utils.reference import tabix_index
5
5
  from biopipen.utils.misc import run_command
6
6
 
7
- infile = {{in.infile | repr}} # pyright: ignore
8
- outfile = Path({{out.outfile | repr}}) # pyright: ignore
7
+ infile: str = {{in.infile | quote}} # pyright: ignore # noqa
8
+ outfile = Path({{out.outfile | quote}}) # pyright: ignore
9
9
  outidx = {{out.outidx | repr}} # pyright: ignore
10
- tabix = {{envs.tabix | repr}} # pyright: ignore
10
+ tabix: str = {{envs.tabix | repr}} # pyright: ignore
11
11
  ncores = {{envs.ncores | repr}} # pyright: ignore
12
12
 
13
13
  outfile_with_index = tabix_index(infile, "vcf", outfile.parent, tabix)
@@ -1,8 +1,8 @@
1
1
  from biopipen.utils.misc import run_command, dict_to_cli_args
2
2
 
3
- infile1 = {{in.infile1 | repr}} # pyright: ignore
4
- infile2 = {{in.infile2 | repr}} # pyright: ignore
5
- outfile = {{out.outfile | repr}} # pyright: ignore
3
+ infile1: str = {{in.infile1 | quote}} # pyright: ignore # noqa
4
+ infile2 = {{in.infile2 | quote}} # pyright: ignore
5
+ outfile = {{out.outfile | quote}} # pyright: ignore
6
6
  bcftools = {{envs.bcftools | repr}} # pyright: ignore
7
7
  gz = {{envs.gz | repr}} # pyright: ignore
8
8
  index = {{envs.index | repr}} # pyright: ignore
@@ -1,3 +1,5 @@
1
+ # shellcheck disable=SC2148
2
+ # shellcheck disable=SC1083
1
3
  invcf={{ in.invcf | quote }}
2
4
  outvcf={{ out.outvcf | quote }}
3
5
  rejfile={{ job.outdir | joinpaths: "rejected.vcf" | quote }}
@@ -6,12 +8,15 @@ chain={{ envs.chain | quote }}
6
8
  reffa={{ envs.reffa | quote }}
7
9
  args={{ envs.args | dict_to_cli_args: join=True }}
8
10
 
11
+ # shellcheck disable=SC2154
9
12
  refdict="${reffa%.fa}.dict"
10
13
  if [[ ! -e "$refdict" ]]; then
11
14
  echo "Sequence dictionary does not exist: $refdict" 1>&2
12
15
  exit 1
13
16
  fi
14
17
 
18
+ # shellcheck disable=SC2154
19
+ # shellcheck disable=SC2086
15
20
  $gatk LiftoverVcf \
16
21
  $args \
17
22
  --INPUT "$invcf" \
@@ -3,12 +3,12 @@ import shlex
3
3
  import concurrent.futures
4
4
  from subprocess import Popen, check_output
5
5
 
6
- infile = {{in.infile | repr}} # pyright: ignore
7
- outdir = {{out.outdir | repr}} # pyright: ignore
8
- bcftools = {{envs.bcftools | repr}} # pyright: ignore
6
+ infile: str = {{in.infile | quote}} # pyright: ignore # noqa
7
+ outdir: str = {{out.outdir | quote}} # pyright: ignore
8
+ bcftools: str = {{envs.bcftools | repr}} # pyright: ignore
9
9
  gz = {{envs.gz | repr}} # pyright: ignore
10
10
  index = {{envs.index | repr}} # pyright: ignore
11
- ncores = {{envs.ncores | int}} # pyright: ignore
11
+ ncores: int = {{envs.ncores | int}} # pyright: ignore
12
12
  private = {{envs.private | repr}} # pyright: ignore
13
13
 
14
14
  if index:
@@ -15,7 +15,7 @@ def bcftools_version(bcftools: str) -> tuple[int, ...]:
15
15
  """
16
16
  bversion = (
17
17
  run_command([bcftools, "version"], stdout="return")
18
- .splitlines()[0] # bcftools 1.20
18
+ .splitlines()[0] # bcftools 1.20 # type: ignore
19
19
  .replace("bcftools", "")
20
20
  .strip() # 1.20
21
21
  .split(".")
@@ -24,8 +24,8 @@ def bcftools_version(bcftools: str) -> tuple[int, ...]:
24
24
 
25
25
 
26
26
  def run_bcftools(
27
- args: dict[str, object],
28
- bcftools: str,
27
+ args: dict,
28
+ bcftools: str, # TODO: get from the first argument of args
29
29
  index: bool,
30
30
  tabix: str
31
31
  ) -> None:
@@ -2,13 +2,13 @@ from pathlib import Path
2
2
 
3
3
  from biopipen.utils.misc import run_command, dict_to_cli_args
4
4
 
5
- url = {{in.url | repr}} # pyright: ignore
6
- outfile = Path({{out.outfile | repr}}) # pyright: ignore
5
+ url = {{in.url | quote}} # pyright: ignore # noqa
6
+ outfile = Path({{out.outfile | quote}}) # pyright: ignore
7
7
  tool = {{envs.tool | repr}} # pyright: ignore
8
8
  wget = {{envs.wget | repr}} # pyright: ignore
9
9
  aria2c = {{envs.aria2c | repr}} # pyright: ignore
10
10
  ncores = {{envs.ncores | repr}} # pyright: ignore
11
- args = {{envs.args | dict}} # pyright: ignore
11
+ args: dict = {{envs.args | dict}} # pyright: ignore
12
12
 
13
13
  if tool == "wget":
14
14
  args["_"] = url
@@ -28,4 +28,8 @@ elif tool == "aria2c":
28
28
 
29
29
  else: # use python
30
30
  import urllib
31
- urllib.urlretrieve(url, outfile)
31
+
32
+ try:
33
+ urllib.urlretrieve(url, outfile) # type: ignore
34
+ except AttributeError:
35
+ urllib.request.urlretrieve(url, outfile) # type: ignore
@@ -2,13 +2,13 @@ from pathlib import Path
2
2
 
3
3
  from biopipen.utils.misc import run_command, dict_to_cli_args
4
4
 
5
- urlfile = {{in.urlfile | repr}} # pyright: ignore
6
- outdir = Path({{out.outdir | repr}}) # pyright: ignore
5
+ urlfile: str = {{in.urlfile | quote}} # pyright: ignore # noqa
6
+ outdir = Path({{out.outdir | quote}}) # pyright: ignore
7
7
  tool = {{envs.tool | repr}} # pyright: ignore
8
8
  wget = {{envs.wget | repr}} # pyright: ignore
9
9
  aria2c = {{envs.aria2c | repr}} # pyright: ignore
10
10
  ncores = {{envs.ncores | repr}} # pyright: ignore
11
- args = {{envs.args | repr}} # pyright: ignore
11
+ args: dict = {{envs.args | repr}} # pyright: ignore
12
12
 
13
13
  if tool == "wget":
14
14
  args["i"] = urlfile
@@ -26,10 +26,10 @@ elif tool == "aria2c":
26
26
  run_command(dict_to_cli_args(args, dashify=True), fg=True)
27
27
 
28
28
  else: # use python
29
- import urllib
29
+ from urllib.request import urlretrieve
30
30
  from urllib.parse import urlparse
31
31
  with open(urlfile, "r") as furl:
32
32
  for i, url in enumerate(furl.readlines()):
33
33
  parsed = urlparse(url)
34
34
  path = Path(parsed.path)
35
- urllib.urlretrieve(url, f"{path.stem}-{i}{path.suffix}")
35
+ urlretrieve(url, f"{path.stem}-{i}{path.suffix}")
@@ -8,12 +8,12 @@ from biopipen.scripts.web.gcloud_common import (
8
8
  get_file_path,
9
9
  )
10
10
 
11
- url = {{in.url | repr}} # pyright: ignore # noqa: E999
11
+ url: str = {{in.url | quote}} # pyright: ignore # noqa: E999
12
12
  outdir = Path({{out.outdir | repr}}) # pyright: ignore
13
- gcloud = {{envs.gcloud | repr}} # pyright: ignore
13
+ gcloud: str = {{envs.gcloud | quote}} # pyright: ignore
14
14
  keep_structure = {{envs.keep_structure | repr}} # pyright: ignore
15
- ncores = {{envs.ncores | repr}} # pyright: ignore
16
- args = {{envs.args | repr}} # pyright: ignore
15
+ ncores: int = {{envs.ncores | repr}} # pyright: ignore
16
+ args: dict = {{envs.args | repr}} # pyright: ignore
17
17
 
18
18
  if not is_valid_gs_bucket_url(url):
19
19
  raise Exception(
@@ -65,7 +65,7 @@ def download_file(i: int, line: str, total: int):
65
65
  def download_bucket():
66
66
  out = run_command([gcloud, "storage", "ls", "--recursive", url], stdout="RETURN")
67
67
  # remove empty lines and skip the root
68
- out = list(filter(None, out.splitlines()[1:]))
68
+ out = list(filter(None, out.splitlines()[1:])) # type: ignore
69
69
  if keep_structure:
70
70
  # create folders first
71
71
  logger.info(f"Creating folders to keep structure.")
@@ -1,10 +1,10 @@
1
1
  from biopipen.utils.misc import run_command, dict_to_cli_args
2
2
  from biopipen.scripts.web.gcloud_common import is_logged_in, is_valid_gs_file_url
3
3
 
4
- url = {{in.url | repr}} # pyright: ignore # noqa: E999
4
+ url: str = {{in.url | repr}} # pyright: ignore # noqa: E999
5
5
  outfile = {{out.outfile | repr}} # pyright: ignore
6
- gcloud = {{envs.gcloud | repr}} # pyright: ignore
7
- args = {{envs.args | repr}} # pyright: ignore
6
+ gcloud: str = {{envs.gcloud | repr}} # pyright: ignore
7
+ args: dict = {{envs.args | repr}} # pyright: ignore
8
8
 
9
9
  if not is_valid_gs_file_url(url):
10
10
  raise Exception(
@@ -12,7 +12,7 @@ def is_logged_in(gcloud: str) -> bool:
12
12
  bool: True if the user is logged in, False otherwise.
13
13
  """
14
14
  out = run_command([gcloud, "auth", "list"], stdout="RETURN")
15
- return "ACTIVE" in out
15
+ return "ACTIVE" in out # type: ignore
16
16
 
17
17
 
18
18
  def is_valid_gs_bucket_url(url: str) -> bool:
biopipen/utils/gsea.R CHANGED
@@ -34,46 +34,85 @@ if (!exists("slugify")) {
34
34
  }
35
35
  }
36
36
 
37
+ #' Download the GMT file and save it to cachedir
38
+ #' Return the path to the GMT file
39
+ #' We also check if the second column is shorter than the first column.
40
+ #' If so, we switch the first and second columns.
41
+ #' In case some providers provide the GMT file with the first and second columns switched.
42
+ #' We also replace the "/" in the gene names with "-" if any. This is because the "/" is
43
+ #' not allowed in a path, but GSEA uses the gene names as the file name.
44
+ #'
45
+ #' @param gmturl The URL or path of the GMT file
46
+ #' @param cachedir The directory to save the GMT file
47
+ #' @return The path to the GMT file
37
48
  localizeGmtfile <- function(gmturl, cachedir = tempdir()) {
38
49
  # Download the GMT file and save it to cachedir
39
50
  # Return the path to the GMT file
40
- if (!startsWith(gmturl, "http") && !startsWith(gmturl, "ftp")) {
41
- return(gmturl)
51
+ in_gmtfile <- out_gmtfile <- file.path(cachedir, basename(gmturl))
52
+ if (startsWith(gmturl, "http") || startsWith(gmturl, "ftp")) {
53
+ download.file(gmturl, in_gmtfile)
54
+ remote <- TRUE
55
+ } else {
56
+ in_gmtfile <- gmturl
57
+ remote <- FALSE
42
58
  }
43
- gmtfile = file.path(cachedir, basename(gmturl))
44
- if (!file.exists(gmtfile)) {
45
- download.file(gmturl, gmtfile)
46
- items <- read.delim(gmtfile, header = FALSE, stringsAsFactors = FALSE, sep = "\t")
47
- if (ncol(items) < 3) {
48
- stop(paste0("Invalid GMT file: ", gmtfile, ", from ", gmturl))
49
- }
50
- if (nrow(items) == 0) {
51
- stop(paste0("Empty GMT file: ", gmtfile, ", from ", gmturl))
52
- }
53
- if (
54
- is.character(items$V2[1]) &&
55
- nchar(items$V2[1]) < nchar(items$V1[1]) &&
56
- nchar(items$V2[1]) > 0 &&
57
- is.na(suppressWarnings(as.numeric(items$V2[1])))
58
- ) {
59
- warning(paste0(
60
- "The second column is shorter, switching the first and second columns in GMT file ",
61
- gmtfile,
62
- " from ",
63
- gmturl
64
- ))
65
- items <- items[, c(2, 1, 3:ncol(items))]
66
- write.table(
67
- items,
68
- gmtfile,
69
- row.names = F,
70
- col.names = F,
71
- sep = "\t",
72
- quote = F
73
- )
74
- }
59
+
60
+ items <- readLines(in_gmtfile)
61
+ items <- items[!grepl("^#", items) & nchar(items) > 0]
62
+ items <- lapply(strsplit(items, "\t"), function(x) c(x[1:2], paste0(x[3:length(x)], collapse = "\t")))
63
+ items <- as.data.frame(t(as.data.frame(items)))
64
+ rownames(items) <- NULL
65
+ colnames(items) <- c("V1", "V2", "V3")
66
+
67
+ if (ncol(items) < 3) {
68
+ stop(paste0("Invalid GMT file: ", gmturl))
69
+ }
70
+ if (nrow(items) == 0) {
71
+ stop(paste0("Empty GMT file: ", gmturl))
75
72
  }
76
- return(gmtfile)
73
+
74
+ # Check if the second column is shorter than the first column
75
+ nchars1 <- sum(nchar(items$V1))
76
+ nchars2 <- sum(nchar(items$V2))
77
+ prefix <- gsub("[0-9]+$", "", items$V2[1])
78
+
79
+ if (is.character(items$V2) && # Only when V2 is character, as pathway names
80
+ nchars2 < nchars1 && # Only when V2 is shorter than V1
81
+ all(nchar(items$V2) > 0) && # Only when V2 is not empty
82
+ !all(grepl("^[0-9]+$", items$V2)) && # Only when V2 is not all numbers
83
+ (nchar(prefix) == 0 || !all(startsWith(items$V2, prefix))) # Only when they are not like hsa00001, hsa00002, etc.
84
+ ) {
85
+ warning(paste0(
86
+ "The second column is shorter, switching the first and second columns in ",
87
+ "GMT file ", gmturl
88
+ ))
89
+ items <- items[, c(2, 1, 3:ncol(items))]
90
+ switched <- TRUE
91
+ } else {
92
+ switched <- FALSE
93
+ }
94
+
95
+ if (any(grepl("/", items$V1))) {
96
+ items$V1 <- gsub("/", "-", items$V1)
97
+ replaced <- TRUE
98
+ } else {
99
+ replaced <- FALSE
100
+ }
101
+
102
+ if (remote || switched || replaced) {
103
+ write.table(
104
+ items,
105
+ out_gmtfile,
106
+ row.names = FALSE,
107
+ col.names = FALSE,
108
+ sep = "\t",
109
+ quote = FALSE
110
+ )
111
+ } else {
112
+ out_gmtfile <- in_gmtfile
113
+ }
114
+
115
+ return(out_gmtfile)
77
116
  }
78
117
 
79
118
 
@@ -261,6 +300,7 @@ runGSEA = function(
261
300
  mutate(Description = "na") %>%
262
301
  rownames_to_column("NAME") %>%
263
302
  select(NAME, Description, everything())
303
+
264
304
  write.table(
265
305
  indata,
266
306
  gctfile,