cool-seq-tool 0.3.0.dev1__py3-none-any.whl → 0.4.0.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -79,8 +79,8 @@ async def get_mane_data(
79
79
  mane_data = await cool_seq_tool.mane_transcript.get_mane_transcript(
80
80
  ac=ac,
81
81
  start_pos=start_pos,
82
- start_annotation_layer=start_annotation_layer,
83
82
  end_pos=end_pos,
83
+ start_annotation_layer=start_annotation_layer,
84
84
  gene=gene,
85
85
  ref=ref,
86
86
  try_longest_compatible=try_longest_compatible,
@@ -57,7 +57,7 @@ async def p_to_c(
57
57
  "/c_to_g",
58
58
  summary="Translate cDNA representation to genomic representation",
59
59
  response_description=RESP_DESCR,
60
- description="Given cDNA accession and positions for codon(s), return associated genomic" # noqa: E501
60
+ description="Given cDNA accession and positions for codon(s), return associated genomic"
61
61
  " accession and positions for a given target genome assembly",
62
62
  response_model=ToGenomicService,
63
63
  tags=[Tags.ALIGNMENT_MAPPER],
cool_seq_tool/schemas.py CHANGED
@@ -1,7 +1,7 @@
1
- """Module for data models."""
1
+ """Defines attribute constants, useful object structures, and API response schemas."""
2
2
  import re
3
3
  from datetime import datetime
4
- from enum import Enum
4
+ from enum import Enum, IntEnum
5
5
  from typing import List, Literal, Optional, Tuple, Union
6
6
 
7
7
  from pydantic import (
@@ -24,11 +24,11 @@ class AnnotationLayer(str, Enum):
24
24
  GENOMIC: Literal["g"] = "g"
25
25
 
26
26
 
27
- class Strand(str, Enum):
27
+ class Strand(IntEnum):
28
28
  """Create enum for positive and negative strand"""
29
29
 
30
- POSITIVE = "+"
31
- NEGATIVE = "-"
30
+ POSITIVE = 1
31
+ NEGATIVE = -1
32
32
 
33
33
 
34
34
  class Assembly(str, Enum):
@@ -48,8 +48,15 @@ class TranscriptPriority(str, Enum):
48
48
 
49
49
 
50
50
  class ResidueMode(str, Enum):
51
- """Create Enum for residue modes."""
51
+ """Create Enum for residue modes.
52
52
 
53
+ | | C | | T | | G | |
54
+ ZERO | | 0 | | 1 | | 2 | |
55
+ RESIDUE | | 1 | | 2 | | 3 | |
56
+ INTER_RESIDUE | 0 | | 1 | | 2 | | 3 |
57
+ """
58
+
59
+ ZERO = "zero"
53
60
  RESIDUE = "residue"
54
61
  INTER_RESIDUE = "inter-residue"
55
62
 
@@ -64,14 +71,14 @@ class GenomicRequestBody(BaseModelForbidExtra):
64
71
  chromosome: Union[StrictStr, StrictInt]
65
72
  start: Optional[StrictInt] = None
66
73
  end: Optional[StrictInt] = None
67
- strand: Optional[StrictInt] = None
74
+ strand: Optional[Strand] = None
68
75
  transcript: Optional[StrictStr] = None
69
76
  gene: Optional[StrictStr] = None
70
77
  residue_mode: ResidueMode = ResidueMode.RESIDUE
71
78
 
72
79
  @model_validator(mode="after")
73
80
  def check_start_and_end(cls, values):
74
- """Check that at least one of {`start`, `end`} is set"""
81
+ """Check that at least one of {``start``, ``end``} is set"""
75
82
  msg = "Must provide either `start` or `end`"
76
83
  start, end = values.start, values.end
77
84
  assert start or end, msg
@@ -83,7 +90,7 @@ class GenomicRequestBody(BaseModelForbidExtra):
83
90
  "chromosome": "NC_000001.11",
84
91
  "start": 154192135,
85
92
  "end": None,
86
- "strand": -1,
93
+ "strand": Strand.NEGATIVE,
87
94
  "transcript": "NM_152263.3",
88
95
  "gene": "TPM3",
89
96
  "residue_mode": "residue",
@@ -95,8 +102,8 @@ class GenomicRequestBody(BaseModelForbidExtra):
95
102
  class TranscriptRequestBody(BaseModelForbidExtra):
96
103
  """Define constraints for transcript exon to genomic coordinates request body"""
97
104
 
105
+ transcript: StrictStr
98
106
  gene: Optional[StrictStr] = None
99
- transcript: Optional[StrictStr] = None
100
107
  exon_start: Optional[StrictInt] = None
101
108
  exon_start_offset: Optional[StrictInt] = 0
102
109
  exon_end: Optional[StrictInt] = None
@@ -104,7 +111,7 @@ class TranscriptRequestBody(BaseModelForbidExtra):
104
111
 
105
112
  @model_validator(mode="after")
106
113
  def check_exon_start_and_exon_end(cls, values):
107
- """Check that at least one of {`exon_start`, `exon_end`} is set"""
114
+ """Check that at least one of {``exon_start``, ``exon_end``} is set"""
108
115
  msg = "Must provide either `exon_start` or `exon_end`"
109
116
  exon_start, exon_end = values.exon_start, values.exon_end
110
117
  assert exon_start or exon_end, msg
@@ -133,7 +140,7 @@ class TranscriptExonData(BaseModelForbidExtra):
133
140
  exon_offset: StrictInt = 0
134
141
  gene: StrictStr
135
142
  chr: StrictStr
136
- strand: StrictInt
143
+ strand: Strand
137
144
 
138
145
  model_config = ConfigDict(
139
146
  json_schema_extra={
@@ -144,7 +151,7 @@ class TranscriptExonData(BaseModelForbidExtra):
144
151
  "exon": 1,
145
152
  "exon_offset": 0,
146
153
  "transcript": "NM_152263.3",
147
- "strand": -1,
154
+ "strand": Strand.NEGATIVE,
148
155
  }
149
156
  }
150
157
  )
@@ -162,13 +169,13 @@ class GenomicData(BaseModelForbidExtra):
162
169
  exon_end: Optional[StrictInt] = None
163
170
  exon_end_offset: Optional[StrictInt] = 0
164
171
  transcript: StrictStr
165
- strand: StrictInt
172
+ strand: Strand
166
173
 
167
174
  @model_validator(mode="after")
168
175
  def check_start_end(cls, values):
169
- """Check that at least one of {`start`, `end`} is set.
170
- Check that at least one of {`exon_start`, `exon_end`} is set.
171
- If not set, set corresponding offset to `None`
176
+ """Check that at least one of {``start``, ``end``} is set.
177
+ Check that at least one of {``exon_start``, ``exon_end``} is set.
178
+ If not set, set corresponding offset to ``None``
172
179
  """
173
180
  msg = "Missing values for `start` or `end`"
174
181
  start = values.start
@@ -200,7 +207,7 @@ class GenomicData(BaseModelForbidExtra):
200
207
  "exon_start_offset": 0,
201
208
  "exon_end_offset": None,
202
209
  "transcript": "NM_152263.3",
203
- "strand": -1,
210
+ "strand": Strand.NEGATIVE,
204
211
  }
205
212
  }
206
213
  )
@@ -254,7 +261,7 @@ class TranscriptExonDataResponse(BaseModelForbidExtra):
254
261
  "exon": 1,
255
262
  "exon_offset": 0,
256
263
  "transcript": "NM_152263.3",
257
- "strand": -1,
264
+ "strand": Strand.NEGATIVE,
258
265
  },
259
266
  "warnings": [],
260
267
  "service_meta": {
@@ -288,7 +295,7 @@ class GenomicDataResponse(BaseModelForbidExtra):
288
295
  "exon_start_offset": 0,
289
296
  "exon_end_offset": None,
290
297
  "transcript": "NM_152263.3",
291
- "strand": -1,
298
+ "strand": Strand.NEGATIVE,
292
299
  },
293
300
  "warnings": [],
294
301
  "service_meta": {
@@ -319,7 +326,7 @@ class MappedManeData(BaseModel):
319
326
  "gene": "BRAF",
320
327
  "refseq": "NM_001374258.1",
321
328
  "ensembl": "ENST00000644969.2",
322
- "strand": "-",
329
+ "strand": Strand.NEGATIVE,
323
330
  "status": TranscriptPriority.MANE_PLUS_CLINICAL,
324
331
  "alt_ac": "NC_000007.13",
325
332
  "assembly": "GRCh37",
@@ -342,7 +349,7 @@ class MappedManeDataService(BaseModelForbidExtra):
342
349
  "gene": "BRAF",
343
350
  "refseq": "NM_001374258.1",
344
351
  "ensembl": "ENST00000644969.2",
345
- "strand": "-",
352
+ "strand": Strand.NEGATIVE,
346
353
  "status": TranscriptPriority.MANE_PLUS_CLINICAL,
347
354
  "alt_ac": "NC_000007.13",
348
355
  "assembly": "GRCh37",
@@ -376,7 +383,7 @@ class ManeData(BaseModel):
376
383
  "refseq": "NP_004324.2",
377
384
  "ensembl": "ENSP00000493543.1",
378
385
  "pos": (598, 598),
379
- "strand": "-",
386
+ "strand": Strand.NEGATIVE,
380
387
  "status": TranscriptPriority.MANE_SELECT,
381
388
  }
382
389
  }
@@ -398,7 +405,7 @@ class ManeDataService(BaseModelForbidExtra):
398
405
  "refseq": "NP_004324.2",
399
406
  "ensembl": "ENSP00000493543.1",
400
407
  "pos": (598, 598),
401
- "strand": "-",
408
+ "strand": Strand.NEGATIVE,
402
409
  "status": TranscriptPriority.MANE_SELECT,
403
410
  },
404
411
  "warnings": [],
@@ -1,4 +1,6 @@
1
1
  """Module for providing basic acquisition/setup for the various resources"""
2
- from .mane_transcript_mappings import MANETranscriptMappings
2
+ from .mane_transcript_mappings import ManeTranscriptMappings
3
3
  from .transcript_mappings import TranscriptMappings
4
- from .uta_database import UTADatabase
4
+ from .uta_database import UtaDatabase
5
+
6
+ __all__ = ["ManeTranscriptMappings", "TranscriptMappings", "UtaDatabase"]
@@ -1,4 +1,6 @@
1
- """The module for loading MANE Transcript mappings to genes."""
1
+ """Provide fast tabular access to MANE summary file. Enables retrieval of associated
2
+ MANE transcripts for gene symbols, genomic positions, or transcript accessions.
3
+ """
2
4
  import logging
3
5
  from pathlib import Path
4
6
  from typing import Dict, List
@@ -10,11 +12,19 @@ from cool_seq_tool.paths import MANE_SUMMARY_PATH
10
12
  logger = logging.getLogger(__name__)
11
13
 
12
14
 
13
- class MANETranscriptMappings:
14
- """The MANE Transcript mappings class."""
15
+ class ManeTranscriptMappings:
16
+ """Provide fast tabular access to MANE summary file.
17
+
18
+ By default, acquires data from `NCBI FTP server <ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/current/>`_
19
+ if unavailable locally. The local data location can be passed as an argument or
20
+ given under the environment variable ``MANE_SUMMARY_PATH``.
21
+
22
+ See the `NCBI MANE page <https://www.ncbi.nlm.nih.gov/refseq/MANE/>`_ for more information.
23
+ """
15
24
 
16
25
  def __init__(self, mane_data_path: Path = MANE_SUMMARY_PATH) -> None:
17
26
  """Initialize the MANE Transcript mappings class.
27
+
18
28
  :param Path mane_data_path: Path to RefSeq MANE summary data
19
29
  """
20
30
  self.mane_data_path = mane_data_path
@@ -22,16 +32,26 @@ class MANETranscriptMappings:
22
32
 
23
33
  def _load_mane_transcript_data(self) -> pl.DataFrame:
24
34
  """Load RefSeq MANE data file into DataFrame.
35
+
25
36
  :return: DataFrame containing RefSeq MANE Transcript data
26
37
  """
27
38
  return pl.read_csv(self.mane_data_path, separator="\t")
28
39
 
29
40
  def get_gene_mane_data(self, gene_symbol: str) -> List[Dict]:
30
41
  """Return MANE Transcript data for a gene.
42
+
43
+ >>> from cool_seq_tool.sources import ManeTranscriptMappings
44
+ >>> m = ManeTranscriptMappings()
45
+ >>> braf_mane = m.get_gene_mane_data("BRAF")
46
+ >>> braf_mane[0]["RefSeq_nuc"], braf_mane[0]["MANE_status"]
47
+ ('NM_004333.6', 'MANE Select')
48
+ >>> braf_mane[1]["RefSeq_nuc"], braf_mane[1]["MANE_status"]
49
+ ('NM_001374258.1', 'MANE Plus Clinical')
50
+
31
51
  :param str gene_symbol: HGNC Gene Symbol
32
- :return: List of MANE Transcript data (Transcript accessions,
33
- gene, and location information). Sorted list: MANE Select and then MANE Plus
34
- Clinical
52
+ :return: List of MANE Transcript data (Transcript accessions, gene, and
53
+ location information). The list is sorted so that a MANE Select entry comes
54
+ first, followed by a MANE Plus Clinical entry, if available.
35
55
  """
36
56
  data = self.df.filter(pl.col("symbol") == gene_symbol.upper())
37
57
 
@@ -58,7 +78,8 @@ class MANETranscriptMappings:
58
78
  def get_mane_data_from_chr_pos(
59
79
  self, alt_ac: str, start: int, end: int
60
80
  ) -> List[Dict]:
61
- """Get MANE data given chromosome, start pos, end end pos. Assumes GRCh38.
81
+ """Get MANE data given a GRCh38 genomic position.
82
+
62
83
  :param str alt_ac: NC Accession
63
84
  :param int start: Start genomic position. Assumes residue coordinates.
64
85
  :param int end: End genomic position. Assumes residue coordinates.
@@ -1,4 +1,4 @@
1
- """The module for Transcript Mappings."""
1
+ """Provide mappings between gene symbols and RefSeq + Ensembl transcript accessions."""
2
2
  import csv
3
3
  from pathlib import Path
4
4
  from typing import Dict, List, Optional
@@ -7,7 +7,17 @@ from cool_seq_tool.paths import LRG_REFSEQGENE_PATH, TRANSCRIPT_MAPPINGS_PATH
7
7
 
8
8
 
9
9
  class TranscriptMappings:
10
- """The transcript mappings class."""
10
+ """Provide mappings between gene symbols and RefSeq + Ensembl transcript accessions.
11
+
12
+ Uses ``LRG_RefSeqGene`` and ``transcript_mappings.csv``, which will automatically
13
+ be acquired if they aren't already available. See the
14
+ :ref:`configuration <configuration>` section in the documentation for information
15
+ about manual acquisition of data.
16
+
17
+ In general, this class's methods expect to receive NCBI gene symbols, so users
18
+ should be careful about the sourcing of their input in cases where terms are
19
+ conflicted or ambiguous (which, to be fair, should be relatively rare).
20
+ """
11
21
 
12
22
  def __init__(
13
23
  self,
@@ -16,8 +26,8 @@ class TranscriptMappings:
16
26
  ) -> None:
17
27
  """Initialize the transcript mappings class.
18
28
 
19
- :param Path transcript_file_path: Path to transcript mappings file
20
- :param Path lrg_refseqgene_path: Path to LRG RefSeqGene file
29
+ :param transcript_file_path: Path to transcript mappings file
30
+ :param lrg_refseqgene_path: Path to LRG RefSeqGene file
21
31
  """
22
32
  # ENSP <-> Gene Symbol
23
33
  self.ensembl_protein_version_for_gene_symbol: Dict[str, List[str]] = {}
@@ -53,7 +63,7 @@ class TranscriptMappings:
53
63
  def _load_transcript_mappings_data(self, transcript_file_path: Path) -> None:
54
64
  """Load transcript mappings file to dictionaries.
55
65
 
56
- :param Path transcript_file_path: Path to transcript mappings file
66
+ :param transcript_file_path: Path to transcript mappings file
57
67
  """
58
68
  with open(transcript_file_path) as file:
59
69
  reader = csv.DictReader(file, delimiter="\t")
@@ -127,7 +137,13 @@ class TranscriptMappings:
127
137
  def protein_transcripts(self, identifier: str) -> List[str]:
128
138
  """Return a list of protein transcripts for a gene symbol.
129
139
 
130
- :param str identifier: Gene identifier to get protein transcripts for
140
+ >>> from cool_seq_tool.sources import TranscriptMappings
141
+ >>> braf_txs = TranscriptMappings().protein_transcripts("BRAF")
142
+ >>> braf_txs.sort()
143
+ >>> braf_txs[-1]
144
+ 'NP_004324.2'
145
+
146
+ :param identifier: Gene identifier to get protein transcripts for
131
147
  :return: Protein transcripts for a gene symbol
132
148
  """
133
149
  protein_transcripts = list()
@@ -141,7 +157,7 @@ class TranscriptMappings:
141
157
  def coding_dna_transcripts(self, identifier: str) -> List[str]:
142
158
  """Return transcripts from a coding dna refseq for a gene symbol.
143
159
 
144
- :param str identifier: Gene identifier to find transcripts for
160
+ :param identifier: Gene identifier to find transcripts for
145
161
  :return: cDNA transcripts for a gene symbol
146
162
  """
147
163
  genomic_transcripts = list()
@@ -159,7 +175,7 @@ class TranscriptMappings:
159
175
  def get_gene_symbol_from_ensembl_protein(self, q: str) -> Optional[str]:
160
176
  """Return the gene symbol for a Ensembl Protein.
161
177
 
162
- :param str q: ensembl protein accession
178
+ :param q: ensembl protein accession
163
179
  :return: Gene symbol
164
180
  """
165
181
  gene_symbol = self.ensembl_protein_version_to_gene_symbol.get(q)
@@ -172,7 +188,7 @@ class TranscriptMappings:
172
188
  def get_gene_symbol_from_refeq_protein(self, q: str) -> Optional[str]:
173
189
  """Return the gene symbol for a Refseq Protein.
174
190
 
175
- :param str q: RefSeq protein accession
191
+ :param q: RefSeq protein accession
176
192
  :return: Gene symbol
177
193
  """
178
194
  return self.refseq_protein_to_gene_symbol.get(q)
@@ -180,7 +196,7 @@ class TranscriptMappings:
180
196
  def get_gene_symbol_from_refseq_rna(self, q: str) -> Optional[str]:
181
197
  """Return gene symbol for a Refseq RNA Transcript.
182
198
 
183
- :param str q: RefSeq RNA transcript accession
199
+ :param q: RefSeq RNA transcript accession
184
200
  :return: Gene symbol
185
201
  """
186
202
  gene_symbol = self.refseq_rna_version_to_gene_symbol.get(q)
@@ -193,7 +209,7 @@ class TranscriptMappings:
193
209
  def get_gene_symbol_from_ensembl_transcript(self, q: str) -> Optional[str]:
194
210
  """Return gene symbol for an Ensembl Transcript.
195
211
 
196
- :param str q: Ensembl transcript accession
212
+ :param q: Ensembl transcript accession
197
213
  :return: Gene symbol
198
214
  """
199
215
  gene_symbol = self.ensembl_transcript_version_to_gene_symbol.get(q)