biopipen 0.28.0__py3-none-any.whl → 0.29.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biopipen might be problematic. Click here for more details.

Files changed (83) hide show
  1. biopipen/__init__.py +1 -1
  2. biopipen/core/config.toml +8 -0
  3. biopipen/ns/bam.py +0 -2
  4. biopipen/ns/bed.py +35 -0
  5. biopipen/ns/cellranger_pipeline.py +5 -5
  6. biopipen/ns/cnv.py +18 -2
  7. biopipen/ns/cnvkit_pipeline.py +16 -11
  8. biopipen/ns/gene.py +68 -23
  9. biopipen/ns/misc.py +2 -15
  10. biopipen/ns/plot.py +146 -0
  11. biopipen/ns/regulation.py +214 -0
  12. biopipen/ns/scrna.py +15 -3
  13. biopipen/ns/snp.py +516 -8
  14. biopipen/ns/stats.py +74 -2
  15. biopipen/ns/vcf.py +196 -0
  16. biopipen/reports/snp/PlinkCallRate.svelte +24 -0
  17. biopipen/reports/snp/PlinkFreq.svelte +18 -0
  18. biopipen/reports/snp/PlinkHWE.svelte +18 -0
  19. biopipen/reports/snp/PlinkHet.svelte +18 -0
  20. biopipen/reports/snp/PlinkIBD.svelte +18 -0
  21. biopipen/scripts/bam/CNVpytor.py +144 -46
  22. biopipen/scripts/bed/BedtoolsIntersect.py +54 -0
  23. biopipen/scripts/bed/BedtoolsMerge.py +1 -1
  24. biopipen/scripts/cnv/AneuploidyScore.R +30 -7
  25. biopipen/scripts/cnv/AneuploidyScoreSummary.R +5 -2
  26. biopipen/scripts/cnv/TMADScore.R +21 -5
  27. biopipen/scripts/cnv/TMADScoreSummary.R +6 -2
  28. biopipen/scripts/cnvkit/CNVkitAccess.py +2 -1
  29. biopipen/scripts/cnvkit/CNVkitAutobin.py +3 -2
  30. biopipen/scripts/cnvkit/CNVkitBatch.py +1 -1
  31. biopipen/scripts/cnvkit/CNVkitCoverage.py +2 -1
  32. biopipen/scripts/cnvkit/CNVkitGuessBaits.py +1 -1
  33. biopipen/scripts/cnvkit/CNVkitHeatmap.py +1 -1
  34. biopipen/scripts/cnvkit/CNVkitReference.py +2 -1
  35. biopipen/scripts/gene/GeneNameConversion.R +65 -0
  36. biopipen/scripts/gene/GenePromoters.R +61 -0
  37. biopipen/scripts/misc/Shell.sh +15 -0
  38. biopipen/scripts/plot/Manhattan.R +140 -0
  39. biopipen/scripts/plot/QQPlot.R +62 -0
  40. biopipen/scripts/regulation/MotifAffinityTest.R +226 -0
  41. biopipen/scripts/regulation/MotifAffinityTest_AtSNP.R +126 -0
  42. biopipen/scripts/regulation/MotifAffinityTest_MotifBreakR.R +96 -0
  43. biopipen/scripts/regulation/MotifScan.py +159 -0
  44. biopipen/scripts/regulation/atSNP.R +33 -0
  45. biopipen/scripts/regulation/motifBreakR.R +1594 -0
  46. biopipen/scripts/scrna/CellsDistribution.R +2 -0
  47. biopipen/scripts/scrna/MarkersFinder.R +59 -67
  48. biopipen/scripts/scrna/SeuratClustering.R +63 -29
  49. biopipen/scripts/scrna/SeuratMap2Ref.R +20 -0
  50. biopipen/scripts/scrna/SeuratSubClustering.R +76 -27
  51. biopipen/scripts/snp/MatrixEQTL.R +84 -43
  52. biopipen/scripts/snp/Plink2GTMat.py +133 -0
  53. biopipen/scripts/snp/PlinkCallRate.R +190 -0
  54. biopipen/scripts/snp/PlinkFilter.py +100 -0
  55. biopipen/scripts/snp/PlinkFreq.R +298 -0
  56. biopipen/scripts/snp/PlinkFromVcf.py +78 -0
  57. biopipen/scripts/snp/PlinkHWE.R +80 -0
  58. biopipen/scripts/snp/PlinkHet.R +92 -0
  59. biopipen/scripts/snp/PlinkIBD.R +197 -0
  60. biopipen/scripts/snp/PlinkUpdateName.py +124 -0
  61. biopipen/scripts/stats/MetaPvalue.R +2 -1
  62. biopipen/scripts/stats/MetaPvalue1.R +70 -0
  63. biopipen/scripts/tcr/TCRClusterStats.R +12 -7
  64. biopipen/scripts/vcf/BcftoolsAnnotate.py +91 -0
  65. biopipen/scripts/vcf/BcftoolsFilter.py +90 -0
  66. biopipen/scripts/vcf/BcftoolsSort.py +113 -0
  67. biopipen/scripts/vcf/BcftoolsView.py +73 -0
  68. biopipen/scripts/vcf/VcfFix_utils.py +1 -1
  69. biopipen/scripts/vcf/bcftools_utils.py +52 -0
  70. biopipen/utils/gene.R +83 -37
  71. biopipen/utils/gene.py +108 -60
  72. biopipen/utils/misc.R +56 -0
  73. biopipen/utils/misc.py +5 -2
  74. biopipen/utils/reference.py +54 -10
  75. {biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/METADATA +2 -2
  76. {biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/RECORD +78 -50
  77. {biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/entry_points.txt +1 -1
  78. biopipen/ns/bcftools.py +0 -111
  79. biopipen/scripts/bcftools/BcftoolsAnnotate.py +0 -42
  80. biopipen/scripts/bcftools/BcftoolsFilter.py +0 -79
  81. biopipen/scripts/bcftools/BcftoolsSort.py +0 -19
  82. biopipen/scripts/gene/GeneNameConversion.py +0 -66
  83. {biopipen-0.28.0.dist-info → biopipen-0.29.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,113 @@
1
+ from typing import Literal
2
+ from pathlib import Path, PosixPath # noqa: F401
3
+
4
+ from biopipen.utils.misc import run_command, logger
5
+ from biopipen.scripts.vcf.bcftools_utils import run_bcftools
6
+
7
+ infile = {{in.infile | quote}} # pyright: ignore # noqa: E999
8
+ outfile = {{out.outfile | quote}} # pyright: ignore
9
+ envs = {{envs | dict | repr}} # pyright: ignore
10
+
11
+ outdir = Path(outfile).parent
12
+ bcftools = envs.pop("bcftools")
13
+ tabix = envs.pop("tabix")
14
+ ncores = envs.pop("ncores")
15
+ gz = envs.pop("gz")
16
+ index = envs.pop("index")
17
+ chrsize = envs.pop("chrsize")
18
+ notfound = envs.pop("notfound")
19
+
20
+ if chrsize:
21
class Contig:
    """A ``##contig`` header entry made of a contig name and its length.

    ``str(contig)`` renders the entry back as a VCF header line.
    """

    def __init__(self, name: str, length: str):
        self.name, self.length = name, length

    def __str__(self) -> str:
        rendered = f"##contig=<ID={self.name},length={self.length}>"
        return rendered
28
+
29
def parse_header(header_file: Path) -> tuple[list[str], dict[str, Contig]]:
    """Split a VCF header file into contig entries and all other lines.

    Args:
        header_file: Path to a text file holding the VCF header lines.

    Returns:
        A pair ``(other_lines, contigs)``: the stripped non-contig lines in
        file order, and a dict mapping contig name to its ``Contig``.
    """
    other_lines = []
    contigs = {}
    with open(header_file) as handle:
        for raw in handle:
            if not raw.startswith("##contig"):
                other_lines.append(raw.strip())
                continue
            # "##contig=<ID=NAME,length=LEN>" -> ["NAME", "LEN>"]
            fields = raw.strip().split("##contig=<ID=")[1].split(",length=")
            name = fields[0]
            contigs[name] = Contig(name, fields[1].replace(">", ""))
    return other_lines, contigs
40
+
41
def match_contigs(
    ctgs: "dict[str, Contig]",
    chroms: list[str],
    notfound: Literal["error", "remove", "start", "end"],
) -> list[str]:
    """Order contig header lines by the chromosome order in ``chroms``.

    The input mapping is not modified.

    Args:
        ctgs: Contigs found in the VCF header, keyed by contig name.
        chroms: Chromosome names in the desired order (from envs.chrsize).
        notfound: What to do with header contigs that are absent from
            ``chroms``: ``"error"`` raises, ``"start"``/``"end"`` put them
            before/after the matched contigs, anything else (``"remove"``)
            drops them.

    Returns:
        The rendered ``##contig`` header lines in their new order.

    Raises:
        ValueError: If ``notfound`` is ``"error"`` and some header contigs
            are not listed in ``chroms``.
    """
    if (
        ctgs
        and chroms
        and all(chrom.startswith("chr") for chrom in chroms)
        and not any(name.startswith("chr") for name in ctgs)
    ):
        logger.warning(
            "Removing 'chr' prefix from chromosomes in envs.chrsize file, "
            "because the input VCF file does not have 'chr' prefix."
        )
        chroms = [chrom[3:] for chrom in chroms]

    # Work on a shallow copy so the caller's mapping keeps all its entries.
    remaining = dict(ctgs)
    ordered = []
    for chrom in chroms:
        # A chromosome listed twice is only emitted once.
        if chrom in remaining:
            ordered.append(str(remaining.pop(chrom)))

    if not remaining:
        return ordered

    if notfound == "error":
        raise ValueError(
            "Chromosomes not found in envs.chrsize file: "
            f"{', '.join(remaining.keys())}"
        )

    leftovers = [str(ctg) for ctg in remaining.values()]
    if notfound == "start":
        return leftovers + ordered
    if notfound == "end":
        return ordered + leftovers
    # "remove": unmatched contigs are dropped from the header.
    return ordered
76
+
77
# NOTE(review): these statements appear to belong to the `if chrsize:` body
# (they read envs.chrsize and define reheader_vcf) -- confirm indentation
# against the packaged script.

# Desired chromosome order: first column of the envs.chrsize file.
chroms = []
with Path(chrsize).expanduser().open() as fh:
    for line in fh:
        fields = line.split()
        # Blank lines (e.g. a trailing newline) carry no chromosome name;
        # skip them instead of failing with IndexError.
        if fields:
            chroms.append(fields[0])

# Dump the current header, reorder its contig lines and write it back out.
header_file = outdir / "header.txt"
run_command(f'{bcftools} view -h {infile} > {header_file}', fg=True)
header_lines, contigs = parse_header(header_file)
new_contigs = match_contigs(contigs, chroms, notfound=notfound)
# The reordered contigs go right after the first header line
# (presumably the ##fileformat line -- not checked here).
header_lines = [header_lines[0], *new_contigs, *header_lines[1:]]
reheader_file = outdir / "reheader.txt"
with open(reheader_file, "w") as fh:
    fh.writelines(f"{line}\n" for line in header_lines)

reheader_vcf = outdir / f"{Path(infile).stem}_reheader.vcf"
run_command([
    bcftools, "reheader",
    "--header", reheader_file,
    "-o", reheader_vcf,
    infile
], fg=True)

# From here on the reheadered file is the one being processed.
infile = reheader_vcf
101
+
102
# Describe the bcftools call inside envs: the "" entry carries the executable
# plus subcommand, "_" the input file and "o" the output file.
envs.update({"": [bcftools, "sort"], "_": infile, "o": outfile})

# An index can only be requested together with compressed output.
if index and not gz:
    logger.warning("Forcing envs.gz to True because envs.index is True.")
    gz = True

# Only choose an output type when none of its spellings was supplied.
output_type_keys = ("O", "output-type", "output_type")
if not any(key in envs for key in output_type_keys):
    envs["O"] = "z" if gz else "v"

run_bcftools(envs, bcftools=bcftools, index=index, tabix=tabix)
@@ -0,0 +1,73 @@
1
+ from contextlib import suppress
2
+ # In case there are paths passed to envs
3
+ from pathlib import PosixPath # noqa: F401
4
+
5
+ from biopipen.utils.misc import logger
6
+ from biopipen.utils.reference import tabix_index
7
+ from biopipen.scripts.vcf.bcftools_utils import run_bcftools
8
+
9
+ infile = {{in.infile | repr}} # pyright: ignore # noqa: #999
10
+ regions_file = {{in.regions_file | repr}} # pyright: ignore
11
+ samples_file = {{in.samples_file | repr}} # pyright: ignore
12
+ outfile = {{out.outfile | repr}} # pyright: ignore
13
+ envs: dict = {{envs | dict | repr}} # pyright: ignore
14
+
15
+ bcftools = envs.pop("bcftools")
16
+ tabix = envs.pop("tabix")
17
+ ncores = envs.pop("ncores")
18
+ gz = envs.pop("gz")
19
+ index = envs.pop("index")
20
+
21
+ if regions_file:
22
+ if "R" in envs or "regions_file" in envs or "regions-file" in envs:
23
+ logger.warning(
24
+ "Ignoring envs\[regions_file/regions-file/R] "
25
+ "because in.regionsfile is provided."
26
+ )
27
+ with suppress(KeyError):
28
+ del envs["regions_file"]
29
+ with suppress(KeyError):
30
+ del envs["regions-file"]
31
+ with suppress(KeyError):
32
+ del envs["R"]
33
+ elif "R" in envs or "regions_file" in envs or "regions-file" in envs:
34
+ regions_file = (
35
+ envs.pop("regions_file", None)
36
+ or envs.pop("regions-file", None)
37
+ or envs.pop("R", None)
38
+ )
39
+
40
+ if samples_file:
41
+ if "S" in envs or "samples_file" in envs or "samples-file" in envs:
42
+ logger.warning(
43
+ "Ignoring envs[samples_file/samples-file/S] "
44
+ "because in.samples_file is provided."
45
+ )
46
+ with suppress(KeyError):
47
+ del envs["samples_file"]
48
+ with suppress(KeyError):
49
+ del envs["samples-file"]
50
+ with suppress(KeyError):
51
+ del envs["S"]
52
+ elif "S" in envs or "samples_file" in envs or "samples-file" in envs:
53
+ samples_file = (
54
+ envs.pop("samples_file", None)
55
+ or envs.pop("samples-file", None)
56
+ or envs.pop("S", None)
57
+ )
58
+
59
+ if index and not gz:
60
+ logger.warning("Forcing envs.gz to True because envs.index is True.")
61
+ gz = True
62
+
63
+ if "O" not in envs and "output-type" not in envs and "output_type" not in envs:
64
+ envs["O"] = "z" if gz else "v"
65
+
66
+ envs[""] = [bcftools, "view"]
67
+ envs["_"] = tabix_index(infile, "vcf", tabix=tabix)
68
+ envs["o"] = outfile
69
+ envs["threads"] = ncores
70
+ envs["regions_file"] = regions_file
71
+ envs["samples_file"] = samples_file
72
+
73
+ run_bcftools(envs, bcftools=bcftools, index=index, tabix=tabix)
@@ -63,7 +63,7 @@ def fix_vcffile(vcffile, outfile, fixes):
63
63
  else:
64
64
  modify_fixes.append(fix)
65
65
 
66
- inopen = gzip.open if vcffile.endswith(".gz") else open
66
+ inopen = gzip.open if str(vcffile).endswith(".gz") else open
67
67
  with inopen(vcffile, "rt") as fin, open(outfile, "w") as fout:
68
68
  for line in fin:
69
69
  obj = line_to_obj(line)
@@ -0,0 +1,52 @@
1
+ """Utilities for bcftools"""
2
+
3
+ from biopipen.utils.misc import run_command, dict_to_cli_args
4
+ from biopipen.utils.reference import tabix_index
5
+
6
+
7
def parse_bcftools_version(version_line: str) -> tuple[int, ...]:
    """Extract the numeric version from a line such as ``bcftools 1.20``.

    Only the leading dotted-number part is used, so suffixes such as
    ``-13-g1a2b3c4`` do not break the parsing.

    Args:
        version_line: The first line printed by ``bcftools version``.

    Returns:
        The version as a tuple of integers, e.g. ``(1, 20)``.

    Raises:
        ValueError: If the line contains no version number.
    """
    import re

    match = re.search(r"\d+(?:\.\d+)*", version_line.replace("bcftools", "", 1))
    if match is None:
        raise ValueError(
            f"Cannot parse bcftools version from: {version_line!r}"
        )
    return tuple(int(part) for part in match.group().split("."))


def bcftools_version(bcftools: str) -> tuple[int, ...]:
    """Get the version of bcftools

    Args:
        bcftools (str): Path to bcftools

    Returns:
        tuple[int, ...]: The version of bcftools

    Raises:
        ValueError: If the version cannot be parsed from the output.
    """
    first_line = (
        run_command([bcftools, "version"], stdout="return")
        .splitlines()[0]  # bcftools 1.20
    )
    return parse_bcftools_version(first_line)
24
+
25
+
26
def run_bcftools(
    args: dict[str, object],
    bcftools: str,
    index: bool,
    tabix: str
) -> None:
    """Run bcftools with the given arguments

    The ``args`` mapping is not modified; the indexing flag, when needed, is
    added to a copy.

    Args:
        args: Arguments to pass to bcftools
        bcftools (str): Path to bcftools
        index (bool): Whether to index the output
        tabix (str): Path to tabix
    """
    cli_args = dict(args)

    # The version is only queried when an index is requested.
    native_index = bool(index) and bcftools_version(bcftools) >= (1, 20)
    if native_index:
        # requires bcftools 1.20+
        # '--write-index tbi' not working
        # it has to be '--write-index=tbi'
        cli_args["write_index=tbi"] = True

    run_command(dict_to_cli_args(cli_args, dashify=True), fg=True)

    if index and not native_index:
        # Older bcftools: index the written file in a separate step.
        tabix_index(cli_args["o"], "vcf", tmpdir=False, tabix=tabix)
biopipen/utils/gene.R CHANGED
@@ -1,49 +1,95 @@
1
- library(mygene)
2
- library(dplyr)
1
+ suppressPackageStartupMessages({
2
+ library(rlang)
3
+ library(dplyr)
4
+ library(mygene)
5
+ })
3
6
 
4
- gene_name_conversion = function(
7
+
8
+ #@' Convert gene names between different formats
9
+ #@'
10
+ #@' @param genes A character/integer vector of gene names/ids
11
+ #@' @param species A character vector of species names
12
+ #@' @param infmt A character vector of input gene name formats
13
+ #@' See the available scopes at
14
+ #@' https://docs.mygene.info/en/latest/doc/data.html#available-fields
15
+ #@' You can use ensg as a shortcut for ensembl.gene
16
+ #@' @param outfmt A character vector of output gene name formats
17
+ #@' @param dup How to deal with duplicate gene names found.
18
+ #@' "first": keep the first one (default), sorted by score descendingly
19
+ #@' "last": keep the last one, sorted by score descendingly
20
+ #@' "all": keep all of them, each will be a separate row
21
+ #@' "<X>": combine them into a single string, separated by X
22
+ #@' @param notfound How to deal with gene names that are not found
23
+ #@' "error": stop with an error message
24
+ #@' "use-query": use the query gene name as the converted gene name
25
+ #@' "skip": skip the gene names that are not found
26
+ #@' "ignore": Same as "skip"
27
+ #@' "na": use NA as the converted gene name (default)
28
+ #@' @param suppress_messages Whether to suppress the warning messages
29
+ #@' @return A tibble with the query gene names and the converted gene names
30
+ #@' When a gene name is not found, the converted name will be NA
31
+ #@' When duplicate gene names are found, the one with the highest score will be kept
32
+ #@' @export
33
+ gene_name_conversion <- function(
5
34
  genes,
6
- species,
7
35
  infmt,
8
36
  outfmt,
9
- notfound
37
+ dup = "first",
38
+ species = "human",
39
+ notfound = "na",
40
+ suppress_messages = FALSE
10
41
  ) {
11
- out = queryMany(
12
- genes,
13
- scopes=infmt,
14
- fields=outfmt,
15
- species=species
16
- ) %>% as.data.frame() %>% group_by(
17
- query
18
- ) %>% arrange(
19
- desc(X_score)
20
- ) %>% slice_head(n=1) %>% select(
21
- -c(X_id, X_score)
22
- )
23
-
24
- if ("notfound" %in% colnames(out)) {
25
- out = out %>% select(-c("notfound"))
42
+ notfound <- arg_match(notfound, c("error", "use-query", "skip", "ignore", "na"))
43
+
44
+ if (infmt %in% c("ensg", "ensmusg")) { infmt = "ensembl.gene" }
45
+ if (outfmt %in% c("ensg", "ensmusg")) { outfmt = "ensembl.gene" }
46
+
47
+ orig_genes <- genes
48
+ if (infmt == "ensembl.gene") {
49
+ # Remove version numbers from ensembl gene ids
50
+ genes <- gsub("\\..*", "", genes)
51
+ }
52
+ query_df <- tibble(query = genes, orig = orig_genes)
53
+
54
+ if (suppress_messages) {
55
+ capture.output(suppressWarnings(suppressMessages({
56
+ out <- queryMany(genes, scopes=infmt, fields=outfmt, species=species) %>%
57
+ as_tibble()
58
+ })))
59
+ } else {
60
+ out <- queryMany(genes, scopes=infmt, fields=outfmt, species=species) %>%
61
+ as_tibble()
62
+ }
63
+
64
+ if (nrow(out) == 0) {
65
+ return(tibble(query = orig_genes, converted = NA_character_))
26
66
  }
27
67
 
28
- if (length(outfmt) == 1 && "," %in% outfmt) {
29
- outfmt = trimws(unlist(strsplit(outfmt, ",", fixed=TRUE)))
68
+ if (dup == "first") {
69
+ out = out %>% group_by(query) %>% arrange(desc(X_score)) %>%
70
+ slice_head(n=1) %>% ungroup() %>% dplyr::select(all_of(c("query", outfmt)))
71
+ } else if (dup == "last") {
72
+ out = out %>% group_by(query) %>% arrange(X_score) %>%
73
+ slice_head(n=1) %>% ungroup() %>% dplyr::select(all_of(c("query", outfmt)))
74
+ } else if (dup != "all") {
75
+ out = out %>% group_by(query) %>% arrange(desc(X_score)) %>%
76
+ summarise(!!sym(outfmt) := paste(unique(!!sym(outfmt)), collapse=dup))
30
77
  }
78
+ out <- query_df %>%
79
+ left_join(out, by="query") %>%
80
+ dplyr::select(-"query") %>%
81
+ dplyr::select(query = orig, everything())
31
82
 
32
- out = tibble(query=genes) %>% left_join(out, by="query")
33
- if (notfound == "use-query") {
34
- out = out %>% mutate(
35
- across(
36
- outfmt,
37
- function(col, query) if_else(is.na(col), query, col),
38
- query=query
39
- )
40
- )
41
- } else if (notfound == "error" && any(is.na(out[[outfmt[1]]]))) {
42
- nagenes = out %>% filter(is.na(.[[outfmt[1]]])) %>% pull("query")
43
- stop(paste("Query genes not found:", paste(nagenes, collapse=",")))
44
- } else if (notfound == "skip") {
45
- out = out %>% filter(!is.na(.[[outfmt[1]]]))
83
+ if (notfound == "error") {
84
+ if (any(is.na(out[[outfmt]]))) {
85
+ nagenes = out %>% filter(is.na(.[[outfmt]])) %>% pull("query")
86
+ stop(paste("Query genes not found:", paste(nagenes, collapse=",")))
87
+ }
88
+ } else if (notfound == "use-query") {
89
+ out = out %>% mutate(!!sym(outfmt) := coalesce(!!sym(outfmt), query))
90
+ } else if (notfound == "skip" || notfound == "ignore") {
91
+ out = out %>% filter(!is.na(!!sym(outfmt)))
46
92
  }
47
93
 
48
- return out
94
+ return(out)
49
95
  }
biopipen/utils/gene.py CHANGED
@@ -1,86 +1,134 @@
1
1
  """Do gene name conversion"""
2
+ from __future__ import annotations
3
+
4
+ import re
5
+ import contextlib
6
+ import pandas as pd
2
7
  from mygene import MyGeneInfo
3
- from datar.all import (
4
- c,
5
- f,
6
- group_by,
7
- desc,
8
- arrange,
9
- slice_head,
10
- tibble,
11
- left_join,
12
- mutate,
13
- is_na,
14
- across,
15
- if_else,
16
- filter_,
17
- pull,
18
- select,
19
- )
20
8
 
21
9
  mygene = MyGeneInfo()
22
10
 
23
11
 
24
- class QueryGenesNotFound(Exception):
12
+ class QueryGenesNotFound(ValueError):
25
13
  """When genes cannot be found"""
26
14
 
27
15
 
28
16
  def gene_name_conversion(
29
- genes,
30
- species,
31
- infmt,
32
- outfmt,
33
- notfound,
17
+ genes: list[str],
18
+ infmt: str | list[str],
19
+ outfmt: str,
20
+ dup: str = "first",
21
+ species: str = "human",
22
+ notfound: str = "na",
23
+ suppress_messages: bool = False,
34
24
  ):
35
25
  """Convert gene names using MyGeneInfo
36
26
 
37
27
  Args:
38
- genes: A sequence of genes
39
- species: The species to limit the query
40
- Supported: human, mouse, rat, fruitfly, nematode, zebrafish,
41
- thale-cress, frog and pig
42
-
43
- infmt: What's the original gene name format
44
- Available fields
45
- https://docs.mygene.info/en/latest/doc/query_service.html#available-fields
46
- outfmt: What's the target gene name format
47
- notfound: What to do if a conversion cannot be done.
48
- use-query: Ignore the conversion and use the original name
49
- skip: Ignore the conversion and skip the entire row in input file
50
- error: Report error
28
+ genes: A character/integer vector of gene names/ids
29
+ species: A character vector of species names
30
+ infmt: A character vector of input gene name formats
31
+ See the available scopes at
32
+ https://docs.mygene.info/en/latest/doc/data.html#available-fields
33
+ You can use ensg as a shortcut for ensembl.gene
34
+ outfmt: A character vector of output gene name formats
35
+ dup: How to deal with duplicate gene names found.
36
+ first: keep the first one (default), sorted by score descendingly
37
+ last: keep the last one, sorted by score descendingly
38
+ all: keep all of them, each will be a separate row
39
+ <X>: combine them into a single string, separated by X
40
+ notfound: How to deal with gene names that are not found
41
+ error: stop with an error message
42
+ use-query: use the query gene name as the converted gene name
43
+ skip: skip the gene names that are not found
44
+ ignore: Same as "skip"
45
+ na: use NA as the converted gene name (default)
46
+ suppress_messages: Suppress the messages while querying
51
47
 
52
48
  Returns:
53
- A dataframe with two columns, query and `outfmt`.
49
+ A dataframe with the query gene names and the converted gene names
50
+ When a gene name is not found, the converted name will be "NA"
51
+ When duplicate gene names are found, the one with the highest score will be kept
54
52
  """
55
- out = (
56
- mygene.querymany(
53
+ notfound = notfound.lower()
54
+ if notfound not in ("error", "use-query", "skip", "ignore", "na"):
55
+ raise ValueError(
56
+ "`notfound` of `gene_name_conversion` must be one of "
57
+ "'error', 'use-query', 'skip', 'ignore', 'na'"
58
+ )
59
+
60
+ if infmt in ["ensg", "ensmusg"]:
61
+ infmt = "ensembl.gene"
62
+ if outfmt in ["ensg", "ensmusg"]:
63
+ outfmt = "ensembl.gene"
64
+
65
+ orig_genes = genes[:]
66
+ if infmt == "ensembl.gene":
67
+ # Remove version numbers from ensembl gene ids
68
+ genes = [re.sub("\\..*", "", gene) for gene in genes]
69
+
70
+ query_df = pd.DataFrame({"query": genes, "orig": orig_genes})
71
+
72
+ if suppress_messages:
73
+ with contextlib.redirect_stdout(None):
74
+ out = mygene.querymany(
75
+ genes,
76
+ scopes=infmt,
77
+ fields=outfmt,
78
+ species=species,
79
+ as_dataframe=True,
80
+ df_index=False,
81
+ )
82
+ else:
83
+ out = mygene.querymany(
57
84
  genes,
58
85
  scopes=infmt,
59
86
  fields=outfmt,
87
+ species=species,
60
88
  as_dataframe=True,
61
89
  df_index=False,
62
- species=species,
63
90
  )
64
- >> group_by(f.query)
65
- >> arrange(desc(f._score))
66
- >> slice_head(1)
67
- >> select(~c(f._id, f._score, f.notfound))
68
- )
69
- if isinstance(outfmt, str):
70
- outfmt = [of.strip() for of in outfmt.split(",")]
71
- out = tibble(query=genes) >> left_join(out, by=f.query)
72
- if notfound == "use-query":
73
- out = out >> mutate(
74
- across(
75
- outfmt,
76
- lambda col, query: if_else(is_na(col), query, col),
77
- query=f.query,
78
- )
91
+
92
+ if out.shape[0] == 0:
93
+ return pd.DataFrame({"query": genes, "converted": ["NA"] * len(genes)})
94
+
95
+ if dup == "first":
96
+ out = (
97
+ out
98
+ .sort_values("_score", ascending=False)
99
+ .groupby("query")
100
+ .head(1)
101
+ .reset_index(drop=True)
79
102
  )
80
- elif notfound == "error" and any(is_na(out[outfmt[0]])):
81
- nagenes = out >> filter_(is_na(f[outfmt[0]])) >> pull(f.query)
82
- raise QueryGenesNotFound(nagenes)
83
- elif notfound == "skip":
84
- out = out >> filter_(~is_na(f[outfmt[0]]))
103
+ elif dup == "last":
104
+ out = (
105
+ out
106
+ .sort_values("_score", ascending=False)
107
+ .groupby("query")
108
+ .tail(1)
109
+ .reset_index(drop=True)
110
+ )
111
+ elif dup != "all":
112
+ out = (
113
+ out
114
+ .sort_values("_score", ascending=False)
115
+ .groupby("query")
116
+ .agg({outfmt: lambda x: f"{dup}".join([str(x) for x in x.unique()])})
117
+ .reset_index()
118
+ )
119
+
120
+ out = pd.merge(query_df, out, on="query", how="left")
121
+ out = out.drop(columns=["query"]).rename(columns={"orig": "query"})
122
+
123
+ if notfound == "error":
124
+ if out[outfmt].isnull().any():
125
+ nagenes = out[out[outfmt].isnull()]["query"].tolist()
126
+ raise QueryGenesNotFound(f"Query genes not found: {','.join(nagenes)}")
127
+ elif notfound == "use-query":
128
+ out[outfmt] = out[outfmt].combine_first(out["query"])
129
+ elif notfound in ["skip", "ignore"]:
130
+ out = out.dropna(subset=[outfmt])
131
+ else: # notfound == "na"
132
+ out[outfmt] = out[outfmt].fillna("NA")
85
133
 
86
134
  return out
biopipen/utils/misc.R CHANGED
@@ -346,3 +346,59 @@ casename_info <- function(
346
346
  }
347
347
  out
348
348
  }
349
+
350
#' Run an external command through system2()
#'
#' @param cmd Character vector; the first element is the executable and the
#'   remaining elements are its arguments
#' @param fg Whether to run in the foreground: stdout and stderr are then
#'   passed through to the console and nothing is returned
#' @param wait Passed to system2()
#' @param print_command Whether to print the command before running it
#' @param print_command_handler Function used to print the command
#' @param ... Optional `stdin`, `stdout`, `stderr`, `input` (passed to
#'   system2()) and `env`, either a named list of environment variables or a
#'   value passed to system2() as is
#' @return When neither `stdout` nor `stderr` is TRUE: the exit status if
#'   `fg` is FALSE, nothing otherwise. When output is captured: the value
#'   returned by system2(). A non-zero exit status raises an error.
run_command <- function(
    cmd,
    fg = FALSE,
    wait = TRUE,
    print_command = TRUE,
    print_command_handler = cat,
    ...
) {
    if (print_command) {
        print_command_handler("RUNNING COMMAND:\n")
        print_command_handler(paste0(" ", paste(cmd, collapse = " "), "\n\n"))
    }

    kwargs <- list(...)
    stdin <- kwargs$stdin %||% ""
    stdout <- kwargs$stdout %||% ""
    stderr <- kwargs$stderr %||% ""
    input <- kwargs$input %||% NULL
    k_env <- kwargs$env %||% list()
    if (is.list(k_env)) {
        # system2() expects a character vector of "NAME=value" entries;
        # iterate over the names of the supplied list (k_env).
        env <- vapply(
            names(k_env),
            function(k) paste0(k, "=", k_env[[k]]),
            character(1),
            USE.NAMES = FALSE
        )
    } else {
        env <- k_env
    }
    if (fg) {
        stdout <- ""
        stderr <- ""
    } else {
        # Not in foreground and no destination given: discard stdout
        if (identical(stdout, "")) { stdout <- FALSE }
    }

    command <- cmd[1]
    args <- cmd[-1]
    out <- system2(
        command,
        args = args,
        stdout = stdout,
        stderr = stderr,
        stdin = stdin,
        env = env,
        wait = wait,
        input = input
    )
    if (!isTRUE(stdout) && !isTRUE(stderr)) {
        # Nothing captured: system2() returned the exit status
        if (out != 0) stop(sprintf("Command failed with exit code %s", out))
        if (!fg) { return(out) }
        return(invisible(NULL))
    }

    # Output captured: a non-zero exit status is attached as an attribute
    status <- attr(out, "status")
    if (is.integer(status) && status != 0) {
        stop(sprintf(
            "Command failed: code (%s): %s",
            status,
            paste(out, collapse = "\n")
        ))
    }
    return(out)
}
biopipen/utils/misc.py CHANGED
@@ -65,9 +65,12 @@ def run_command(
65
65
  if print_command:
66
66
  print_command_handler("RUNNING COMMAND:")
67
67
  if isinstance(cmd, str):
68
- print_command_handler(f" {cmd}")
68
+ print_command_handler(f" {cmd}\n")
69
69
  else:
70
- print_command_handler(f" {shlex.join(cmd)}")
70
+ print_command_handler(f" {shlex.join(cmd)}\n")
71
+ # flush the output if print_command_handler is print
72
+ if print_command_handler is print:
73
+ sys.stdout.flush()
71
74
 
72
75
  if isinstance(cmd, str):
73
76
  kwargs["shell"] = True