cool-seq-tool 0.4.0.dev3__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. cool_seq_tool/__init__.py +7 -11
  2. cool_seq_tool/app.py +44 -24
  3. cool_seq_tool/handlers/__init__.py +1 -0
  4. cool_seq_tool/handlers/seqrepo_access.py +27 -25
  5. cool_seq_tool/mappers/__init__.py +3 -1
  6. cool_seq_tool/mappers/alignment.py +5 -6
  7. cool_seq_tool/mappers/exon_genomic_coords.py +139 -124
  8. cool_seq_tool/mappers/liftover.py +90 -0
  9. cool_seq_tool/mappers/mane_transcript.py +208 -113
  10. cool_seq_tool/resources/__init__.py +1 -0
  11. cool_seq_tool/resources/data_files.py +93 -0
  12. cool_seq_tool/resources/status.py +153 -0
  13. cool_seq_tool/schemas.py +92 -54
  14. cool_seq_tool/sources/__init__.py +1 -0
  15. cool_seq_tool/sources/mane_transcript_mappings.py +16 -9
  16. cool_seq_tool/sources/transcript_mappings.py +41 -32
  17. cool_seq_tool/sources/uta_database.py +96 -249
  18. cool_seq_tool/utils.py +44 -4
  19. {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.5.0.dist-info}/LICENSE +1 -1
  20. {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.5.0.dist-info}/METADATA +16 -11
  21. cool_seq_tool-0.5.0.dist-info/RECORD +24 -0
  22. {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.5.0.dist-info}/WHEEL +1 -1
  23. cool_seq_tool/api.py +0 -42
  24. cool_seq_tool/data/__init__.py +0 -2
  25. cool_seq_tool/data/data_downloads.py +0 -89
  26. cool_seq_tool/paths.py +0 -28
  27. cool_seq_tool/routers/__init__.py +0 -16
  28. cool_seq_tool/routers/default.py +0 -125
  29. cool_seq_tool/routers/mane.py +0 -98
  30. cool_seq_tool/routers/mappings.py +0 -155
  31. cool_seq_tool/version.py +0 -2
  32. cool_seq_tool-0.4.0.dev3.dist-info/RECORD +0 -29
  33. /cool_seq_tool/{data → resources}/transcript_mapping.tsv +0 -0
  34. {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.5.0.dist-info}/top_level.txt +0 -0
cool_seq_tool/__init__.py CHANGED
@@ -1,14 +1,10 @@
1
1
  """The cool_seq_tool package"""
2
- import logging
3
- from pathlib import Path
4
2
 
5
- APP_ROOT = Path(__file__).resolve().parents[0]
3
+ from importlib.metadata import PackageNotFoundError, version
6
4
 
7
- logging.basicConfig(
8
- filename="cool_seq_tool.log",
9
- format="[%(asctime)s] - %(name)s - %(levelname)s : %(message)s",
10
- )
11
- logger = logging.getLogger("cool_seq_tool")
12
- logger.setLevel(logging.DEBUG)
13
-
14
- LOG_FN = "cool_seq_tool.log"
5
+ try:
6
+ __version__ = version("cool_seq_tool")
7
+ except PackageNotFoundError:
8
+ __version__ = "unknown"
9
+ finally:
10
+ del version, PackageNotFoundError
cool_seq_tool/app.py CHANGED
@@ -1,29 +1,24 @@
1
1
  """Provides core CoolSeqTool class, which non-redundantly initializes all Cool-Seq-Tool
2
2
  data handler and mapping resources for straightforward access.
3
3
  """
4
+
4
5
  import logging
5
6
  from pathlib import Path
6
- from typing import Optional
7
7
 
8
8
  from biocommons.seqrepo import SeqRepo
9
9
 
10
- from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess
10
+ from cool_seq_tool.handlers.seqrepo_access import SEQREPO_ROOT_DIR, SeqRepoAccess
11
11
  from cool_seq_tool.mappers import (
12
12
  AlignmentMapper,
13
13
  ExonGenomicCoordsMapper,
14
+ LiftOver,
14
15
  ManeTranscript,
15
16
  )
16
- from cool_seq_tool.paths import (
17
- LRG_REFSEQGENE_PATH,
18
- MANE_SUMMARY_PATH,
19
- SEQREPO_ROOT_DIR,
20
- TRANSCRIPT_MAPPINGS_PATH,
21
- )
22
17
  from cool_seq_tool.sources.mane_transcript_mappings import ManeTranscriptMappings
23
18
  from cool_seq_tool.sources.transcript_mappings import TranscriptMappings
24
19
  from cool_seq_tool.sources.uta_database import UTA_DB_URL, UtaDatabase
25
20
 
26
- logger = logging.getLogger(__name__)
21
+ _logger = logging.getLogger(__name__)
27
22
 
28
23
 
29
24
  class CoolSeqTool:
@@ -35,28 +30,47 @@ class CoolSeqTool:
35
30
  * ``self.mane_transcript_mappings``: :py:class:`ManeTranscriptMappings <cool_seq_tool.sources.mane_transcript_mappings.ManeTranscriptMappings>`
36
31
  * ``self.uta_db``: :py:class:`UtaDatabase <cool_seq_tool.sources.uta_database.UtaDatabase>`
37
32
  * ``self.alignment_mapper``: :py:class:`AlignmentMapper <cool_seq_tool.mappers.alignment.AlignmentMapper>`
33
+ * ``self.liftover``: :py:class:`LiftOver <cool_seq_tool.mappers.liftover.LiftOver>`
38
34
  * ``self.mane_transcript``: :py:class:`ManeTranscript <cool_seq_tool.mappers.mane_transcript.ManeTranscript>`
39
35
  * ``self.ex_g_coords_mapper``: :py:class:`ExonGenomicCoordsMapper <cool_seq_tool.mappers.exon_genomic_coords.ExonGenomicCoordsMapper>`
40
-
41
- Initialization with default resource locations is straightforward:
42
-
43
- .. code-block:: pycon
44
-
45
- >>> from cool_seq_tool.app import CoolSeqTool
46
- >>> cst = CoolSeqTool()
47
-
48
- See the :ref:`configuration <configuration>` section for more information.
49
36
  """
50
37
 
51
38
  def __init__(
52
39
  self,
53
- transcript_file_path: Path = TRANSCRIPT_MAPPINGS_PATH,
54
- lrg_refseqgene_path: Path = LRG_REFSEQGENE_PATH,
55
- mane_data_path: Path = MANE_SUMMARY_PATH,
40
+ transcript_file_path: Path | None = None,
41
+ lrg_refseqgene_path: Path | None = None,
42
+ mane_data_path: Path | None = None,
56
43
  db_url: str = UTA_DB_URL,
57
- sr: Optional[SeqRepo] = None,
44
+ sr: SeqRepo | None = None,
45
+ force_local_files: bool = False,
58
46
  ) -> None:
59
- """Initialize CoolSeqTool class
47
+ """Initialize CoolSeqTool class.
48
+
49
+ Initialization with default resource locations is straightforward:
50
+
51
+ >>> from cool_seq_tool.app import CoolSeqTool
52
+ >>> cst = CoolSeqTool()
53
+
54
+ By default, this will attempt to fetch the latest versions of static resources,
55
+ which means brief FTP and HTTPS requests to NCBI servers upon initialization.
56
+ To suppress this check and simply rely on the most recent locally-available
57
+ data:
58
+
59
+ >>> cst = CoolSeqTool(force_local_files=True)
60
+
61
+ Note that this will raise a FileNotFoundError if no locally-available data exists.
62
+
63
+ Paths to those files can also be explicitly passed to avoid checks as well:
64
+
65
+ >>> from pathlib import Path
66
+ >>> cst = CoolSeqTool(
67
+ ... lrg_refseqgene_path=Path("lrg_refseqgene_20240625.tsv"),
68
+ ... mane_data_path=Path("ncbi_mane_summary_1.3.txt"),
69
+ ... )
70
+
71
+ If not passed explicit arguments, these locations can also be set via
72
+ environment variables. See the :ref:`configuration <configuration>` section of
73
+ the docs for more information.
60
74
 
61
75
  :param transcript_file_path: The path to ``transcript_mapping.tsv``
62
76
  :param lrg_refseqgene_path: The path to the LRG_RefSeqGene file
@@ -64,6 +78,8 @@ class CoolSeqTool:
64
78
  :param db_url: PostgreSQL connection URL
65
79
  Format: ``driver://user:password@host/database/schema``
66
80
  :param sr: SeqRepo instance. If this is not provided, will create a new instance
81
+ :param force_local_files: if ``True``, don't check for or try to acquire latest
82
+ versions of static data files -- just use most recently available, if any
67
83
  """
68
84
  if not sr:
69
85
  sr = SeqRepo(root_dir=SEQREPO_ROOT_DIR)
@@ -71,23 +87,27 @@ class CoolSeqTool:
71
87
  self.transcript_mappings = TranscriptMappings(
72
88
  transcript_file_path=transcript_file_path,
73
89
  lrg_refseqgene_path=lrg_refseqgene_path,
90
+ from_local=force_local_files,
74
91
  )
75
92
  self.mane_transcript_mappings = ManeTranscriptMappings(
76
- mane_data_path=mane_data_path
93
+ mane_data_path=mane_data_path, from_local=force_local_files
77
94
  )
78
95
  self.uta_db = UtaDatabase(db_url=db_url)
79
96
  self.alignment_mapper = AlignmentMapper(
80
97
  self.seqrepo_access, self.transcript_mappings, self.uta_db
81
98
  )
99
+ self.liftover = LiftOver()
82
100
  self.mane_transcript = ManeTranscript(
83
101
  self.seqrepo_access,
84
102
  self.transcript_mappings,
85
103
  self.mane_transcript_mappings,
86
104
  self.uta_db,
105
+ self.liftover,
87
106
  )
88
107
  self.ex_g_coords_mapper = ExonGenomicCoordsMapper(
89
108
  self.seqrepo_access,
90
109
  self.uta_db,
91
110
  self.mane_transcript,
92
111
  self.mane_transcript_mappings,
112
+ self.liftover,
93
113
  )
cool_seq_tool/handlers/__init__.py CHANGED
@@ -1,2 +1,3 @@
1
1
  """Module for extending clients"""
2
+
2
3
  from .seqrepo_access import SeqRepoAccess
cool_seq_tool/handlers/seqrepo_access.py CHANGED
@@ -1,17 +1,20 @@
1
1
  """Wrap SeqRepo to provide additional lookup and identification methods on top of basic
2
2
  dereferencing functions.
3
3
  """
4
+
4
5
  import logging
5
6
  from os import environ
6
7
  from pathlib import Path
7
- from typing import List, Optional, Tuple, Union
8
8
 
9
9
  from ga4gh.vrs.dataproxy import SeqRepoDataProxy
10
10
 
11
- from cool_seq_tool.schemas import ResidueMode
12
- from cool_seq_tool.utils import get_inter_residue_pos
11
+ from cool_seq_tool.schemas import Assembly, ResidueMode
12
+ from cool_seq_tool.utils import get_inter_residue_pos, process_chromosome_input
13
+
14
+ _logger = logging.getLogger(__name__)
15
+
13
16
 
14
- logger = logging.getLogger(__name__)
17
+ SEQREPO_ROOT_DIR = environ.get("SEQREPO_ROOT_DIR", "/usr/local/share/seqrepo/latest")
15
18
 
16
19
 
17
20
  class SeqRepoAccess(SeqRepoDataProxy):
@@ -24,10 +27,10 @@ class SeqRepoAccess(SeqRepoDataProxy):
24
27
  def get_reference_sequence(
25
28
  self,
26
29
  ac: str,
27
- start: Optional[int] = None,
28
- end: Optional[int] = None,
30
+ start: int | None = None,
31
+ end: int | None = None,
29
32
  residue_mode: ResidueMode = ResidueMode.RESIDUE,
30
- ) -> Tuple[str, Optional[str]]:
33
+ ) -> tuple[str, str | None]:
31
34
  """Get reference sequence for an accession given a start and end position. If
32
35
  ``start`` and ``end`` are not given, returns the entire reference sequence.
33
36
 
@@ -63,7 +66,7 @@ class SeqRepoAccess(SeqRepoDataProxy):
63
66
  sequence = self.sr.fetch(ac, start=start, end=end)
64
67
  except KeyError:
65
68
  msg = f"Accession, {ac}, not found in SeqRepo"
66
- logger.warning(msg)
69
+ _logger.warning(msg)
67
70
  return "", msg
68
71
  except ValueError as e:
69
72
  error = str(e)
@@ -77,7 +80,7 @@ class SeqRepoAccess(SeqRepoDataProxy):
77
80
  )
78
81
  else:
79
82
  msg = f"{e}"
80
- logger.warning(msg)
83
+ _logger.warning(msg)
81
84
  return "", msg
82
85
  else:
83
86
  # If start is valid, but end is invalid, SeqRepo still returns
@@ -93,8 +96,8 @@ class SeqRepoAccess(SeqRepoDataProxy):
93
96
  return sequence, None
94
97
 
95
98
  def translate_identifier(
96
- self, ac: str, target_namespaces: Optional[Union[str, List[str]]] = None
97
- ) -> Tuple[List[str], Optional[str]]:
99
+ self, ac: str, target_namespaces: str | list[str] | None = None
100
+ ) -> tuple[list[str], str | None]:
98
101
  """Return list of identifiers for accession.
99
102
 
100
103
  >>> from cool_seq_tool.handlers import SeqRepoAccess
@@ -115,14 +118,12 @@ class SeqRepoAccess(SeqRepoDataProxy):
115
118
  )
116
119
  except KeyError:
117
120
  msg = f"SeqRepo unable to get translated identifiers for {ac}"
118
- logger.warning(msg)
121
+ _logger.warning(msg)
119
122
  return [], msg
120
123
  else:
121
124
  return ga4gh_identifiers, None
122
125
 
123
- def translate_alias(
124
- self, input_str: str
125
- ) -> Tuple[List[Optional[str]], Optional[str]]:
126
+ def translate_alias(self, input_str: str) -> tuple[list[str | None], str | None]:
126
127
  """Get aliases for a given input.
127
128
 
128
129
  :param str input_str: Input to get aliases for
@@ -132,29 +133,30 @@ class SeqRepoAccess(SeqRepoDataProxy):
132
133
  return self.sr.translate_alias(input_str), None
133
134
  except KeyError:
134
135
  msg = f"SeqRepo could not translate alias {input_str}"
135
- logger.warning(msg)
136
+ _logger.warning(msg)
136
137
  return [], msg
137
138
 
138
- def chromosome_to_acs(
139
- self, chromosome: str
140
- ) -> Tuple[Optional[List[str]], Optional[str]]:
139
+ def chromosome_to_acs(self, chromosome: str) -> tuple[list[str] | None, str | None]:
141
140
  """Get accessions for a chromosome
142
141
 
143
142
  :param chromosome: Chromosome number. Must be either 1-22, X, or Y
144
143
  :return: Accessions for chromosome (ordered by latest assembly)
145
144
  """
146
145
  acs = []
147
- for assembly in ["GRCh38", "GRCh37"]:
146
+ for assembly in reversed(Assembly.values()):
148
147
  tmp_acs, _ = self.translate_identifier(
149
- f"{assembly}:chr{chromosome}", target_namespaces="refseq"
148
+ f"{assembly}:{process_chromosome_input(chromosome)}",
149
+ target_namespaces="refseq",
150
150
  )
151
- for ac in tmp_acs:
152
- acs.append(ac.split("refseq:")[-1])
151
+ acs += [ac.split("refseq:")[-1] for ac in tmp_acs]
153
152
  if acs:
154
153
  return acs, None
155
- return None, f"{chromosome} is not a valid chromosome"
154
+ return (
155
+ None,
156
+ f'Unable to find matching accessions for "{chromosome}" in SeqRepo.',
157
+ )
156
158
 
157
- def ac_to_chromosome(self, ac: str) -> Tuple[Optional[str], Optional[str]]:
159
+ def ac_to_chromosome(self, ac: str) -> tuple[str | None, str | None]:
158
160
  """Get chromosome for accession.
159
161
 
160
162
  :param str ac: Accession
cool_seq_tool/mappers/__init__.py CHANGED
@@ -1,7 +1,9 @@
1
1
  """Module for mapping data"""
2
+
2
3
  from .alignment import AlignmentMapper # noqa: I001
4
+ from .liftover import LiftOver
3
5
  from .mane_transcript import ManeTranscript
4
6
  from .exon_genomic_coords import ExonGenomicCoordsMapper
5
7
 
6
8
 
7
- __all__ = ["AlignmentMapper", "ManeTranscript", "ExonGenomicCoordsMapper"]
9
+ __all__ = ["AlignmentMapper", "LiftOver", "ManeTranscript", "ExonGenomicCoordsMapper"]
cool_seq_tool/mappers/alignment.py CHANGED
@@ -1,7 +1,6 @@
1
1
  """Module containing alignment methods for translating to and from different
2
2
  reference sequences.
3
3
  """
4
- from typing import Dict, Optional, Tuple
5
4
 
6
5
  from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess
7
6
  from cool_seq_tool.schemas import AnnotationLayer, Assembly, ResidueMode
@@ -34,7 +33,7 @@ class AlignmentMapper:
34
33
  p_start_pos: int,
35
34
  p_end_pos: int,
36
35
  residue_mode: ResidueMode = ResidueMode.RESIDUE,
37
- ) -> Tuple[Optional[Dict], Optional[str]]:
36
+ ) -> tuple[dict | None, str | None]:
38
37
  """Translate protein representation to cDNA representation.
39
38
 
40
39
  :param p_ac: Protein RefSeq accession
@@ -83,7 +82,7 @@ class AlignmentMapper:
83
82
  "residue_mode": ResidueMode.INTER_RESIDUE.value,
84
83
  }, None
85
84
 
86
- async def _get_cds_start(self, c_ac: str) -> Tuple[Optional[int], Optional[str]]:
85
+ async def _get_cds_start(self, c_ac: str) -> tuple[int | None, str | None]:
87
86
  """Get CDS start for a given cDNA RefSeq accession
88
87
 
89
88
  :param c_ac: cDNA RefSeq accession
@@ -105,10 +104,10 @@ class AlignmentMapper:
105
104
  c_ac: str,
106
105
  c_start_pos: int,
107
106
  c_end_pos: int,
108
- cds_start: Optional[int] = None,
107
+ cds_start: int | None = None,
109
108
  residue_mode: ResidueMode = ResidueMode.RESIDUE,
110
109
  target_genome_assembly: bool = Assembly.GRCH38,
111
- ) -> Tuple[Optional[Dict], Optional[str]]:
110
+ ) -> tuple[dict | None, str | None]:
112
111
  """Translate cDNA representation to genomic representation
113
112
 
114
113
  :param c_ac: cDNA RefSeq accession
@@ -212,7 +211,7 @@ class AlignmentMapper:
212
211
  p_end_pos: int,
213
212
  residue_mode: ResidueMode = ResidueMode.INTER_RESIDUE,
214
213
  target_genome_assembly: Assembly = Assembly.GRCH38,
215
- ) -> Tuple[Optional[Dict], Optional[str]]:
214
+ ) -> tuple[dict | None, str | None]:
216
215
  """Translate protein representation to genomic representation, by way of
217
216
  intermediary conversion into cDNA coordinates.
218
217