cool-seq-tool 0.3.0.dev1__py3-none-any.whl → 0.4.0.dev0__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
cool_seq_tool/api.py CHANGED
@@ -24,16 +24,16 @@ def custom_openapi() -> Dict:
24
24
  if app.openapi_schema:
25
25
  return app.openapi_schema
26
26
  openapi_schema = get_openapi(
27
- title="The GenomicMedLab Cool Seq Tool",
27
+ title="The GenomicMedLab Cool-Seq-Tool",
28
28
  version=__version__,
29
- description="Common Operations On Lots-of Sequences Tool.",
29
+ description="Common Operations On Lots of Sequences Tool.",
30
30
  routes=app.routes,
31
31
  )
32
32
 
33
33
  openapi_schema["info"]["contact"] = {
34
34
  "name": "Alex H. Wagner",
35
35
  "email": "Alex.Wagner@nationwidechildrens.org",
36
- "url": "https://www.nationwidechildrens.org/specialties/institute-for-genomic-medicine/research-labs/wagner-lab", # noqa: E501
36
+ "url": "https://www.nationwidechildrens.org/specialties/institute-for-genomic-medicine/research-labs/wagner-lab",
37
37
  }
38
38
  app.openapi_schema = openapi_schema
39
39
  return app.openapi_schema
cool_seq_tool/app.py CHANGED
@@ -1,4 +1,6 @@
1
- """Module for initializing data sources."""
1
+ """Provides core CoolSeqTool class, which non-redundantly initializes all Cool-Seq-Tool
2
+ data handler and mapping resources for straightforward access.
3
+ """
2
4
  import logging
3
5
  from pathlib import Path
4
6
  from typing import Optional
@@ -9,7 +11,7 @@ from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess
9
11
  from cool_seq_tool.mappers import (
10
12
  AlignmentMapper,
11
13
  ExonGenomicCoordsMapper,
12
- MANETranscript,
14
+ ManeTranscript,
13
15
  )
14
16
  from cool_seq_tool.paths import (
15
17
  LRG_REFSEQGENE_PATH,
@@ -17,15 +19,34 @@ from cool_seq_tool.paths import (
17
19
  SEQREPO_ROOT_DIR,
18
20
  TRANSCRIPT_MAPPINGS_PATH,
19
21
  )
20
- from cool_seq_tool.sources.mane_transcript_mappings import MANETranscriptMappings
22
+ from cool_seq_tool.sources.mane_transcript_mappings import ManeTranscriptMappings
21
23
  from cool_seq_tool.sources.transcript_mappings import TranscriptMappings
22
- from cool_seq_tool.sources.uta_database import UTA_DB_URL, UTADatabase
24
+ from cool_seq_tool.sources.uta_database import UTA_DB_URL, UtaDatabase
23
25
 
24
26
  logger = logging.getLogger(__name__)
25
27
 
26
28
 
27
29
  class CoolSeqTool:
28
- """Class to initialize data sources."""
30
+ """Non-redundantly initialize all Cool-Seq-Tool data resources, available under the
31
+ following attribute names:
32
+
33
+ * ``self.seqrepo_access``: :py:class:`SeqRepoAccess <cool_seq_tool.handlers.seqrepo_access.SeqRepoAccess>`
34
+ * ``self.transcript_mappings``: :py:class:`TranscriptMappings <cool_seq_tool.sources.transcript_mappings.TranscriptMappings>`
35
+ * ``self.mane_transcript_mappings``: :py:class:`ManeTranscriptMappings <cool_seq_tool.sources.mane_transcript_mappings.ManeTranscriptMappings>`
36
+ * ``self.uta_db``: :py:class:`UtaDatabase <cool_seq_tool.sources.uta_database.UtaDatabase>`
37
+ * ``self.alignment_mapper``: :py:class:`AlignmentMapper <cool_seq_tool.mappers.alignment.AlignmentMapper>`
38
+ * ``self.mane_transcript``: :py:class:`ManeTranscript <cool_seq_tool.mappers.mane_transcript.ManeTranscript>`
39
+ * ``self.ex_g_coords_mapper``: :py:class:`ExonGenomicCoordsMapper <cool_seq_tool.mappers.exon_genomic_coords.ExonGenomicCoordsMapper>`
40
+
41
+ Initialization with default resource locations is straightforward:
42
+
43
+ .. code-block:: pycon
44
+
45
+ >>> from cool_seq_tool.app import CoolSeqTool
46
+ >>> cst = CoolSeqTool()
47
+
48
+ See the :ref:`configuration <configuration>` section for more information.
49
+ """
29
50
 
30
51
  def __init__(
31
52
  self,
@@ -37,11 +58,11 @@ class CoolSeqTool:
37
58
  ) -> None:
38
59
  """Initialize CoolSeqTool class
39
60
 
40
- :param transcript_file_path: The path to transcript_mapping.tsv
41
- :param lrg_refseqgene_path: The path to LRG_RefSeqGene
61
+ :param transcript_file_path: The path to ``transcript_mapping.tsv``
62
+ :param lrg_refseqgene_path: The path to the LRG_RefSeqGene file
42
63
  :param mane_data_path: Path to RefSeq MANE summary data
43
64
  :param db_url: PostgreSQL connection URL
44
- Format: `driver://user:password@host/database/schema`
65
+ Format: ``driver://user:password@host/database/schema``
45
66
  :param sr: SeqRepo instance. If this is not provided, will create a new instance
46
67
  """
47
68
  if not sr:
@@ -51,14 +72,14 @@ class CoolSeqTool:
51
72
  transcript_file_path=transcript_file_path,
52
73
  lrg_refseqgene_path=lrg_refseqgene_path,
53
74
  )
54
- self.mane_transcript_mappings = MANETranscriptMappings(
75
+ self.mane_transcript_mappings = ManeTranscriptMappings(
55
76
  mane_data_path=mane_data_path
56
77
  )
57
- self.uta_db = UTADatabase(db_url=db_url)
78
+ self.uta_db = UtaDatabase(db_url=db_url)
58
79
  self.alignment_mapper = AlignmentMapper(
59
80
  self.seqrepo_access, self.transcript_mappings, self.uta_db
60
81
  )
61
- self.mane_transcript = MANETranscript(
82
+ self.mane_transcript = ManeTranscript(
62
83
  self.seqrepo_access,
63
84
  self.transcript_mappings,
64
85
  self.mane_transcript_mappings,
@@ -1,4 +1,4 @@
1
- """Module for handling downloadable data files."""
1
+ """Handle acquisition of external data."""
2
2
  import datetime
3
3
  import gzip
4
4
  import logging
@@ -15,8 +15,11 @@ logger = logging.getLogger("cool_seq_tool")
15
15
 
16
16
 
17
17
  class DataDownload:
18
- """Class for managing downloadable data files. Responsible for checking if files
19
- are available under default locations, and fetching them if not.
18
+ """Manage downloadable data files. Responsible for checking if files are available
19
+ under expected locations, and fetching them if not.
20
+
21
+ Relevant methods are called automatically by data classes; users should not have
22
+ to interact with this class under normal circumstances.
20
23
  """
21
24
 
22
25
  def __init__(self) -> None:
@@ -25,7 +28,7 @@ class DataDownload:
25
28
 
26
29
  def get_mane_summary(self) -> Path:
27
30
  """Identify latest MANE summary data. If unavailable locally, download from
28
- source.
31
+ `NCBI FTP server <https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/current/>`_.
29
32
 
30
33
  :return: path to MANE summary file
31
34
  """
@@ -52,7 +55,7 @@ class DataDownload:
52
55
 
53
56
  def get_lrg_refseq_gene_data(self) -> Path:
54
57
  """Identify latest LRG RefSeq Gene file. If unavailable locally, download from
55
- source.
58
+ `NCBI FTP server <https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/>`_.
56
59
 
57
60
  :return: path to acquired LRG RefSeq Gene data file
58
61
  """
@@ -1,4 +1,6 @@
1
- """A module for accessing SeqRepo."""
1
+ """Wrap SeqRepo to provide additional lookup and identification methods on top of basic
2
+ dereferencing functions.
3
+ """
2
4
  import logging
3
5
  from os import environ
4
6
  from pathlib import Path
@@ -13,7 +15,9 @@ logger = logging.getLogger(__name__)
13
15
 
14
16
 
15
17
  class SeqRepoAccess(SeqRepoDataProxy):
16
- """The SeqRepoAccess class."""
18
+ """Provide a wrapper around the base SeqRepoDataProxy class from ``VRS-Python`` to
19
+ provide additional lookup and identification methods.
20
+ """
17
21
 
18
22
  environ["SEQREPO_LRU_CACHE_MAXSIZE"] = "none"
19
23
 
@@ -24,25 +28,37 @@ class SeqRepoAccess(SeqRepoDataProxy):
24
28
  end: Optional[int] = None,
25
29
  residue_mode: ResidueMode = ResidueMode.RESIDUE,
26
30
  ) -> Tuple[str, Optional[str]]:
27
- """Get reference sequence for an accession given a start and end position.
28
- If `start` and `end` are not given, it will return the entire reference sequence
31
+ """Get reference sequence for an accession given a start and end position. If
32
+ ``start`` and ``end`` are not given, returns the entire reference sequence.
33
+
34
+ >>> from cool_seq_tool.handlers import SeqRepoAccess
35
+ >>> from biocommons.seqrepo import SeqRepo
36
+ >>> sr = SeqRepoAccess(SeqRepo("/usr/local/share/seqrepo/latest"))
37
+ >>> sr.get_reference_sequence("NM_002529.3", 1, 10)[0]
38
+ 'TGCAGCTGG'
39
+ >>> sr.get_reference_sequence("NP_001341538.1", 1, 10)[0]
40
+ 'MAALSGGGG'
29
41
 
30
42
  :param ac: Accession
31
43
  :param start: Start pos change
32
- :param end: End pos change. If `None` assumes both `start` and `end` have same
33
- values, if `start` exists.
34
- :param residue_mode: Residue mode for `start` and `end`
44
+ :param end: End pos change. If ``None`` assumes both ``start`` and ``end`` have
45
+ same values, if ``start`` exists.
46
+ :param residue_mode: Residue mode for ``start`` and ``end``
35
47
  :return: Sequence at position (if accession and positions actually
36
48
  exist, else return empty string), warning if any
37
49
  """
38
- if start or end:
39
- pos, warning = get_inter_residue_pos(start, residue_mode, end_pos=end)
40
- if pos is None:
41
- return "", warning
42
- else:
43
- start, end = pos
44
- if start == end:
45
- end += 1
50
+ if start and end:
51
+ if start > end:
52
+ msg = f"start ({start}) cannot be greater than end ({end})"
53
+ return "", msg
54
+
55
+ start, end = get_inter_residue_pos(start, end, residue_mode)
56
+ if start == end:
57
+ end += 1
58
+ else:
59
+ if start is not None and residue_mode == ResidueMode.RESIDUE:
60
+ start -= 1
61
+
46
62
  try:
47
63
  sequence = self.sr.fetch(ac, start=start, end=end)
48
64
  except KeyError:
@@ -53,18 +69,12 @@ class SeqRepoAccess(SeqRepoDataProxy):
53
69
  error = str(e)
54
70
  if error.startswith("start out of range"):
55
71
  msg = (
56
- f"Start inter-residue coordinate ({start}) is out of "
57
- f"index on {ac}"
72
+ f"Start inter-residue coordinate ({start}) is out of index on {ac}"
58
73
  )
59
74
  elif error.startswith("stop out of range"):
60
75
  msg = (
61
76
  f"End inter-residue coordinate ({end}) is out of " f"index on {ac}"
62
77
  )
63
- elif error.startswith("invalid coordinates") and ">" in error:
64
- msg = (
65
- f"Invalid inter-residue coordinates: start ({start}) "
66
- f"cannot be greater than end ({end})"
67
- )
68
78
  else:
69
79
  msg = f"{e}"
70
80
  logger.warning(msg)
@@ -78,8 +88,7 @@ class SeqRepoAccess(SeqRepoDataProxy):
78
88
  if len(sequence) != expected_len_of_seq:
79
89
  return (
80
90
  "",
81
- f"End inter-residue coordinate ({end})"
82
- f" is out of index on {ac}",
91
+ f"End inter-residue coordinate ({end}) is out of index on {ac}",
83
92
  )
84
93
  return sequence, None
85
94
 
@@ -88,6 +97,14 @@ class SeqRepoAccess(SeqRepoDataProxy):
88
97
  ) -> Tuple[List[str], Optional[str]]:
89
98
  """Return list of identifiers for accession.
90
99
 
100
+ >>> from cool_seq_tool.handlers import SeqRepoAccess
101
+ >>> from biocommons.seqrepo import SeqRepo
102
+ >>> sr = SeqRepoAccess(SeqRepo("/usr/local/share/seqrepo/latest"))
103
+ >>> sr.translate_identifier("NM_002529.3")[0]
104
+ ['MD5:18f0a6e3af9e1bbd8fef1948c7156012', 'NCBI:NM_002529.3', 'refseq:NM_002529.3', 'SEGUID:dEJQBkga9d9VeBHTyTbg6JEtTGQ', 'SHA1:74425006481af5df557811d3c936e0e8912d4c64', 'VMC:GS_RSkww1aYmsMiWbNdNnOTnVDAM3ZWp1uA', 'sha512t24u:RSkww1aYmsMiWbNdNnOTnVDAM3ZWp1uA', 'ga4gh:SQ.RSkww1aYmsMiWbNdNnOTnVDAM3ZWp1uA']
105
+ >>> sr.translate_identifier("NM_002529.3", "ga4gh")[0]
106
+ ['ga4gh:SQ.RSkww1aYmsMiWbNdNnOTnVDAM3ZWp1uA']
107
+
91
108
  :param ac: Identifier accession
92
109
  :param target_namespace: The namespace(s) of identifier to return
93
110
  :return: List of identifiers, warning
@@ -123,7 +140,7 @@ class SeqRepoAccess(SeqRepoDataProxy):
123
140
  ) -> Tuple[Optional[List[str]], Optional[str]]:
124
141
  """Get accessions for a chromosome
125
142
 
126
- :param str chromosome: Chromosome number. Must be either 1-22, X, or Y
143
+ :param chromosome: Chromosome number. Must be either 1-22, X, or Y
127
144
  :return: Accessions for chromosome (ordered by latest assembly)
128
145
  """
129
146
  acs = []
@@ -160,9 +177,20 @@ class SeqRepoAccess(SeqRepoDataProxy):
160
177
 
161
178
  def get_fasta_file(self, sequence_id: str, outfile_path: Path) -> None:
162
179
  """Retrieve FASTA file containing sequence for requested sequence ID.
163
- :param sequence_id: accession ID, sans namespace, eg `NM_152263.3`
180
+
181
+ >>> from pathlib import Path
182
+ >>> from cool_seq_tool.handlers import SeqRepoAccess
183
+ >>> from biocommons.seqrepo import SeqRepo
184
+ >>> sr = SeqRepoAccess(SeqRepo("/usr/local/share/seqrepo/latest"))
185
+ >>> # write to local file tpm3.fasta:
186
+ >>> sr.get_fasta_file("NM_002529.3", Path("tpm3.fasta"))
187
+
188
+ FASTA file headers will include GA4GH sequence digest, Ensembl accession ID,
189
+ and RefSeq accession ID.
190
+
191
+ :param sequence_id: accession ID, sans namespace, eg ``NM_152263.3``
164
192
  :param outfile_path: path to save file to
165
- :return: None, but saves sequence data to `outfile_path` if successful
193
+ :return: None, but saves sequence data to ``outfile_path`` if successful
166
194
  :raise: KeyError if SeqRepo doesn't have sequence data for the given ID
167
195
  """
168
196
  sequence = self.get_reference_sequence(sequence_id)[0]
@@ -1,4 +1,7 @@
1
1
  """Module for mapping data"""
2
2
  from .alignment import AlignmentMapper # noqa: I001
3
- from .mane_transcript import MANETranscript
3
+ from .mane_transcript import ManeTranscript
4
4
  from .exon_genomic_coords import ExonGenomicCoordsMapper
5
+
6
+
7
+ __all__ = ["AlignmentMapper", "ManeTranscript", "ExonGenomicCoordsMapper"]
@@ -5,7 +5,7 @@ from typing import Dict, Optional, Tuple
5
5
 
6
6
  from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess
7
7
  from cool_seq_tool.schemas import AnnotationLayer, Assembly, ResidueMode
8
- from cool_seq_tool.sources import TranscriptMappings, UTADatabase
8
+ from cool_seq_tool.sources import TranscriptMappings, UtaDatabase
9
9
 
10
10
 
11
11
  class AlignmentMapper:
@@ -15,15 +15,14 @@ class AlignmentMapper:
15
15
  self,
16
16
  seqrepo_access: SeqRepoAccess,
17
17
  transcript_mappings: TranscriptMappings,
18
- uta_db: UTADatabase,
18
+ uta_db: UtaDatabase,
19
19
  ) -> None:
20
20
  """Initialize the AlignmentMapper class.
21
21
 
22
- :param SeqRepoAccess seqrepo_access: Access to seqrepo queries
23
- :param TranscriptMappings transcript_mappings: Access to transcript
24
- accession mappings and conversions
25
- :param UTADatabase uta_db: UTADatabase instance to give access to query
26
- UTA database
22
+ :param seqrepo_access: Access to seqrepo queries
23
+ :param transcript_mappings: Access to transcript accession mappings and
24
+ conversions
25
+ :param uta_db: UtaDatabase instance to give access to query UTA database
27
26
  """
28
27
  self.seqrepo_access = seqrepo_access
29
28
  self.transcript_mappings = transcript_mappings
@@ -38,15 +37,16 @@ class AlignmentMapper:
38
37
  ) -> Tuple[Optional[Dict], Optional[str]]:
39
38
  """Translate protein representation to cDNA representation.
40
39
 
41
- :param str p_ac: Protein RefSeq accession
42
- :param int p_start_pos: Protein start position
43
- :param int p_end_pos: Protein end position
44
- :param ResidueMode residue_mode: Residue mode for `p_start_pos` and `p_end_pos`
40
+ :param p_ac: Protein RefSeq accession
41
+ :param p_start_pos: Protein start position
42
+ :param p_end_pos: Protein end position
43
+ :param residue_mode: Residue mode for ``p_start_pos`` and ``p_end_pos``
45
44
  :return: Tuple containing:
46
- - cDNA representation (accession, codon range positions for corresponding
47
- change, cds start site) if able to translate. Will return positions as
48
- inter-residue coordinates. If unable to translate, returns `None`.
49
- - Warning, if unable to translate to cDNA representation. Else `None`
45
+
46
+ * cDNA representation (accession, codon range positions for corresponding
47
+ change, cds start site) if able to translate. Will return positions as
48
+ inter-residue coordinates. If unable to translate, returns ``None``.
49
+ * Warning, if unable to translate to cDNA representation. Else ``None``
50
50
  """
51
51
  # Get cDNA accession
52
52
  temp_c_ac = await self.uta_db.p_to_c_ac(p_ac)
@@ -86,10 +86,10 @@ class AlignmentMapper:
86
86
  async def _get_cds_start(self, c_ac: str) -> Tuple[Optional[int], Optional[str]]:
87
87
  """Get CDS start for a given cDNA RefSeq accession
88
88
 
89
- :param str c_ac: cDNA RefSeq accession
89
+ :param c_ac: cDNA RefSeq accession
90
90
  :return: Tuple containing:
91
- - CDS start site if found. Else `None`
92
- - Warning, if unable to get CDS start. Else `None`
91
+ - CDS start site if found. Else ``None``
92
+ - Warning, if unable to get CDS start. Else ``None``
93
93
  """
94
94
  cds_start_end = await self.uta_db.get_cds_start_end(c_ac)
95
95
  if not cds_start_end:
@@ -111,16 +111,17 @@ class AlignmentMapper:
111
111
  ) -> Tuple[Optional[Dict], Optional[str]]:
112
112
  """Translate cDNA representation to genomic representation
113
113
 
114
- :param str c_ac: cDNA RefSeq accession
115
- :param int c_start_pos: cDNA start position for codon
116
- :param int c_end_pos: cDNA end position for codon
117
- :param Optional[int] coding_start_site: Coding start site. If not provided,
118
- this will be computed.
119
- :param Assembly target_genome_assembly: Genome assembly to get genomic data for
114
+ :param c_ac: cDNA RefSeq accession
115
+ :param c_start_pos: cDNA start position for codon
116
+ :param c_end_pos: cDNA end position for codon
117
+ :param coding_start_site: Coding start site. If not provided, this will be
118
+ computed.
119
+ :param target_genome_assembly: Genome assembly to get genomic data for
120
120
  :return: Tuple containing:
121
- - Genomic representation (ac, positions) if able to translate. Will return
122
- positions as inter-residue coordinates. Else `None`.
123
- - Warning, if unable to translate to genomic representation. Else `None`
121
+
122
+ * Genomic representation (ac, positions) if able to translate. Will return
123
+ positions as inter-residue coordinates. Else ``None``.
124
+ * Warning, if unable to translate to genomic representation. Else ``None``
124
125
  """
125
126
  if any(
126
127
  (
@@ -212,17 +213,19 @@ class AlignmentMapper:
212
213
  residue_mode: ResidueMode = ResidueMode.INTER_RESIDUE,
213
214
  target_genome_assembly: Assembly = Assembly.GRCH38,
214
215
  ) -> Tuple[Optional[Dict], Optional[str]]:
215
- """Translate protein representation to genomic representation
216
-
217
- :param str p_ac: Protein RefSeq accession
218
- :param int p_start_pos: Protein start position
219
- :param int p_end_pos: Protein end position
220
- :param ResidueMode residue_mode: Residue mode for `p_start_pos` and `p_end_pos`.
221
- :param Assembly target_genome_assembly: Genome assembly to get genomic data for
216
+ """Translate protein representation to genomic representation, by way of
217
+ intermediary conversion into cDNA coordinates.
218
+
219
+ :param p_ac: Protein RefSeq accession
220
+ :param p_start_pos: Protein start position
221
+ :param p_end_pos: Protein end position
222
+ :param residue_mode: Residue mode for ``p_start_pos`` and ``p_end_pos``.
223
+ :param target_genome_assembly: Genome assembly to get genomic data for
222
224
  :return: Tuple containing:
223
- - Genomic representation (ac, positions) if able to translate. Will return
224
- positions as inter-residue coordinates. Else `None`.
225
- and warnings. The genomic data will always return inter-residue coordinates
225
+
226
+ * Genomic representation (ac, positions) if able to translate. Will return
227
+ positions as inter-residue coordinates. Else ``None``.
228
+ * Warnings, if conversion to cDNA or genomic coordinates fails.
226
229
  """
227
230
  c_data, warning = await self.p_to_c(
228
231
  p_ac, p_start_pos, p_end_pos, residue_mode=residue_mode