cool-seq-tool 0.3.0.dev0__py3-none-any.whl → 0.4.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,53 +1,121 @@
1
- """Module for retrieving MANE Transcript from variation on p/c/g coordinate.
1
+ """Retrieve MANE transcript from a location on p./c./g. coordinates.
2
+
2
3
  Steps:
3
- 1. Map annotation layer to genome
4
- 2. Liftover to preferred genome
5
- We want to liftover to GRCh38. We do not support getting MANE transcripts
6
- for GRCh36 and earlier assemblies.
7
- 3. Select preferred compatible annotation
8
- 4. Map back to correct annotation layer
4
+
5
+ #. Map annotation layer to genome
6
+ #. Liftover to preferred genome (GRCh38). GRCh36 and earlier assemblies are not supported
7
+ for fetching MANE transcripts.
8
+ #. Select preferred compatible annotation (see :ref:`transcript compatibility <transcript_compatibility>`)
9
+ #. Map back to correct annotation layer
10
+
11
+ In addition to a mapper utility class, this module also defines several vocabulary
12
+ constraints and data models for coordinate representation.
9
13
  """
10
14
  import logging
11
15
  import math
16
+ from enum import StrEnum
12
17
  from typing import Dict, List, Optional, Set, Tuple, Union
13
18
 
14
19
  import polars as pl
20
+ from pydantic import BaseModel
15
21
 
16
22
  from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess
17
23
  from cool_seq_tool.schemas import (
18
24
  AnnotationLayer,
19
25
  Assembly,
20
26
  ResidueMode,
27
+ Strand,
21
28
  TranscriptPriority,
22
29
  )
23
30
  from cool_seq_tool.sources import (
24
- MANETranscriptMappings,
31
+ ManeTranscriptMappings,
25
32
  TranscriptMappings,
26
- UTADatabase,
33
+ UtaDatabase,
27
34
  )
28
35
  from cool_seq_tool.utils import get_inter_residue_pos
29
36
 
30
37
  logger = logging.getLogger(__name__)
31
38
 
32
39
 
33
- class MANETranscript:
40
+ class EndAnnotationLayer(StrEnum):
41
+ """Define constraints for end annotation layer. This is used for determining the
42
+ end annotation layer when getting the longest compatible remaining representation.
43
+ """
44
+
45
+ PROTEIN = AnnotationLayer.PROTEIN
46
+ CDNA = AnnotationLayer.CDNA
47
+ PROTEIN_AND_CDNA = "p_and_c"
48
+
49
+
50
+ class DataRepresentation(BaseModel):
51
+ """Define object model for final output representation"""
52
+
53
+ gene: Optional[str] = None
54
+ refseq: str
55
+ ensembl: Optional[str] = None
56
+ pos: Tuple[int, int]
57
+ strand: Strand
58
+ status: TranscriptPriority
59
+
60
+
61
+ class CdnaRepresentation(DataRepresentation):
62
+ """Define object model for coding DNA representation"""
63
+
64
+ coding_start_site: int
65
+ coding_end_site: int
66
+ alt_ac: Optional[str] = None
67
+
68
+
69
+ class GenomicRepresentation(BaseModel):
70
+ """Define object model for genomic representation"""
71
+
72
+ refseq: str
73
+ pos: Tuple[int, int]
74
+ status: TranscriptPriority
75
+ alt_ac: str
76
+
77
+
78
+ class ProteinAndCdnaRepresentation(BaseModel):
79
+ """Define object model for protein and cDNA representation"""
80
+
81
+ protein: DataRepresentation
82
+ cdna: CdnaRepresentation
83
+
84
+
85
+ class ManeTranscript:
34
86
  """Class for retrieving MANE transcripts."""
35
87
 
36
88
  def __init__(
37
89
  self,
38
90
  seqrepo_access: SeqRepoAccess,
39
91
  transcript_mappings: TranscriptMappings,
40
- mane_transcript_mappings: MANETranscriptMappings,
41
- uta_db: UTADatabase,
92
+ mane_transcript_mappings: ManeTranscriptMappings,
93
+ uta_db: UtaDatabase,
42
94
  ) -> None:
43
- """Initialize the MANETranscript class.
95
+ """Initialize the ManeTranscript class.
96
+
97
+ A handful of resources are required for initialization, so when defaults are
98
+ enough, it's easiest to let the core CoolSeqTool class handle it for you:
99
+
100
+ >>> from cool_seq_tool.app import CoolSeqTool
101
+ >>> mane_mapper = CoolSeqTool().mane_transcript
102
+
103
+ Note that most methods are defined as Python coroutines, so they must be called
104
+ with ``await`` or run from an ``async`` event loop:
105
+
106
+ >>> import asyncio
107
+ >>> result = asyncio.run(mane_mapper.g_to_grch38("NC_000001.11", 100, 200))
108
+ >>> result['ac']
109
+ 'NC_000001.11'
110
+
111
+ See the :ref:`Usage section <async_note>` for more information.
44
112
 
45
113
  :param seqrepo_access: Access to seqrepo queries
46
114
  :param transcript_mappings: Access to transcript accession mappings and
47
115
  conversions
48
116
  :param mane_transcript_mappings: Access to MANE Transcript accession mapping
49
117
  data
50
- :param uta_db: UTADatabase instance to give access to query UTA database
118
+ :param uta_db: UtaDatabase instance to give access to query UTA database
51
119
  """
52
120
  self.seqrepo_access = seqrepo_access
53
121
  self.transcript_mappings = transcript_mappings
@@ -56,10 +124,9 @@ class MANETranscript:
56
124
 
57
125
  @staticmethod
58
126
  def _get_reading_frame(pos: int) -> int:
59
- """Return reading frame number.
60
- Only used on c. coordinate
127
+ """Return reading frame number. Only used on c. coordinate.
61
128
 
62
- :param int pos: cDNA position
129
+ :param pos: cDNA position
63
130
  :return: Reading frame
64
131
  """
65
132
  pos_mod_3 = pos % 3
@@ -71,26 +138,25 @@ class MANETranscript:
71
138
  def _p_to_c_pos(start: int, end: int) -> Tuple[int, int]:
72
139
  """Return cDNA position given a protein position.
73
140
 
74
- :param int start: Start protein position
75
- :param int end: End protein position
141
+ :param start: Start protein position. Inter-residue coordinates
142
+ :param end: End protein position. Inter-residue coordinates
76
143
  :return: cDNA position start, cDNA position end
77
144
  """
78
- start_pos = start * 3 - 1
145
+ start_pos = start * 3
79
146
  if end != start:
80
- end_pos = end * 3 - 1
147
+ end_pos = end * 3
81
148
  else:
82
149
  end_pos = start_pos
83
-
84
- return start_pos - 1, end_pos + 1
150
+ return start_pos, end_pos - 1
85
151
 
86
152
  async def _p_to_c(
87
153
  self, ac: str, start_pos: int, end_pos: int
88
154
  ) -> Optional[Tuple[str, Tuple[int, int]]]:
89
155
  """Convert protein (p.) annotation to cDNA (c.) annotation.
90
156
 
91
- :param str ac: Protein accession
92
- :param int start_pos: Protein start position
93
- :param int end_pos: Protein end position
157
+ :param ac: Protein accession
158
+ :param start_pos: Protein start position. Inter-residue coordinates
159
+ :param end_pos: Protein end position. Inter-residue coordinates
94
160
  :return: [cDNA transcript accession, [cDNA pos start, cDNA pos end]]
95
161
  """
96
162
  # TODO: Check version mappings 1 to 1 relationship
@@ -116,8 +182,8 @@ class MANETranscript:
116
182
  async def _c_to_g(self, ac: str, pos: Tuple[int, int]) -> Optional[Dict]:
117
183
  """Get g. annotation from c. annotation.
118
184
 
119
- :param str ac: cDNA accession
120
- :param Tuple[int, int] pos: [cDNA pos start, cDNA pos end]
185
+ :param ac: cDNA accession
186
+ :param pos: [cDNA pos start, cDNA pos end]
121
187
  :return: Gene, Transcript accession and position change,
122
188
  Altered transcript accession and position change, Strand
123
189
  """
@@ -127,7 +193,9 @@ class MANETranscript:
127
193
  if not self.transcript_mappings.ensembl_transcript_version_to_gene_symbol.get(
128
194
  ac
129
195
  ):
130
- if not self.seqrepo_access.get_reference_sequence(ac, 1)[0]:
196
+ if not self.seqrepo_access.get_reference_sequence(ac, start=1, end=1)[
197
+ 0
198
+ ]:
131
199
  logger.warning(f"Ensembl transcript not found: {ac}")
132
200
  return None
133
201
 
@@ -160,12 +228,11 @@ class MANETranscript:
160
228
  ) -> Optional[Dict]:
161
229
  """Get and validate genomic_tx_data
162
230
 
163
- :param str tx_ac: Accession on c. coordinate
164
- :param Tuple[int, int] pos: (start pos, end pos)
165
- :param Union[AnnotationLayer.CDNA, AnnotationLayer.GENOMIC] annotation_layer:
166
- Annotation layer for `ac` and `pos`
167
- :param Optional[int] coding_start_site: Coding start site
168
- :param Optional[str] alt_ac: Accession on g. coordinate
231
+ :param tx_ac: Accession on c. coordinate
232
+ :param pos: (start pos, end pos)
233
+ :param annotation_layer: Annotation layer for ``tx_ac`` and ``pos``
234
+ :param coding_start_site: Coding start site
235
+ :param alt_ac: Accession on g. coordinate
169
236
  :return: genomic_tx_data if found and validated, else None
170
237
  """
171
238
  genomic_tx_data = await self.uta_db.get_genomic_tx_data(
@@ -199,27 +266,25 @@ class MANETranscript:
199
266
 
200
267
  @staticmethod
201
268
  def _get_c_data(
202
- gene: str,
203
269
  cds_start_end: Tuple[int, int],
204
270
  c_pos_change: Tuple[int, int],
205
- strand: str,
271
+ strand: Strand,
206
272
  status: TranscriptPriority,
207
273
  refseq_c_ac: str,
274
+ gene: Optional[str] = None,
208
275
  ensembl_c_ac: Optional[str] = None,
209
276
  alt_ac: Optional[str] = None,
210
- ) -> Dict:
277
+ ) -> CdnaRepresentation:
211
278
  """Return transcript data on c. coordinate.
212
279
 
213
- :param str gene: Gene symbol
214
- :param Tuple[int, int] cds_start_end: Coding start and end site
215
- for transcript
216
- :param Tuple[int, int] c_pos_change: Start and end positions
217
- for change on c. coordinate
218
- :param str strand: Strand
219
- :param TranscriptPriority status: Status of transcript
220
- :param str refseq_c_ac: Refseq transcript
221
- :param Optional[str] ensembl_c_ac: Ensembl transcript
222
- :param Optional[str] alt_ac: Genomic accession
280
+ :param gene: Gene symbol
281
+ :param cds_start_end: Coding start and end site for transcript
282
+ :param c_pos_change: Start and end positions for change on c. coordinate
283
+ :param strand: Strand
284
+ :param status: Status of transcript
285
+ :param refseq_c_ac: Refseq transcript
286
+ :param ensembl_c_ac: Ensembl transcript
287
+ :param alt_ac: Genomic accession
223
288
  :return: Transcript data on c. coord
224
289
  """
225
290
  cds_start = cds_start_end[0]
@@ -229,10 +294,10 @@ class MANETranscript:
229
294
 
230
295
  if lt_cds_start or gt_cds_end:
231
296
  logger.info(
232
- f"{refseq_c_ac} with position"
233
- f" {c_pos_change} is not within CDS start/end"
297
+ f"{refseq_c_ac} with position {c_pos_change} is not within CDS start/end"
234
298
  )
235
- return dict(
299
+
300
+ return CdnaRepresentation(
236
301
  gene=gene,
237
302
  refseq=refseq_c_ac,
238
303
  ensembl=ensembl_c_ac,
@@ -244,27 +309,37 @@ class MANETranscript:
244
309
  alt_ac=alt_ac,
245
310
  )
246
311
 
247
- @staticmethod
248
- def _get_mane_p(mane_data: Dict, mane_c_pos_range: Tuple[int, int]) -> Dict:
249
- """Translate MANE Transcript c. annotation to p. annotation
312
+ def _c_to_p_pos(self, c_pos: Tuple[int, int]) -> Tuple[int, int]:
313
+ """Get protein position from cDNA position
250
314
 
251
- :param Dict mane_data: MANE Transcript data
252
- :param Tuple[int, int] mane_c_pos_range: Position change range
253
- on MANE Transcript c. coordinate
254
- :return: MANE transcripts accessions and position change on
255
- p. coordinate
315
+ :param c_pos: cDNA position. Inter-residue coordinates
316
+ :return: Protein position. Inter-residue coordinates
256
317
  """
257
- start = mane_c_pos_range[0] / 3
258
- end = mane_c_pos_range[1] / 3
259
- start = math.floor(start) if start == end else math.ceil(start)
260
- end = math.floor(end)
318
+ end = math.ceil(c_pos[1] / 3)
319
+ if c_pos[1] - c_pos[0] == 1:
320
+ start = end - 1
321
+ else:
322
+ start = math.ceil((c_pos[0] + 1) / 3) - 1
323
+ return start, end
324
+
325
+ def _get_mane_p(
326
+ self, mane_data: Dict, mane_c_pos_range: Tuple[int, int]
327
+ ) -> DataRepresentation:
328
+ """Translate MANE Transcript c. annotation to p. annotation
261
329
 
262
- return dict(
330
+ :param mane_data: MANE Transcript data
331
+ :param mane_c_pos_range: Position change range on MANE Transcript c. coordinate
332
+ using inter-residue coordinates
333
+ :return: Protein representation
334
+ """
335
+ return DataRepresentation(
263
336
  gene=mane_data["symbol"],
264
337
  refseq=mane_data["RefSeq_prot"],
265
338
  ensembl=mane_data["Ensembl_prot"],
266
- pos=(start, end),
267
- strand=mane_data["chr_strand"],
339
+ pos=self._c_to_p_pos(mane_c_pos_range),
340
+ strand=Strand.NEGATIVE
341
+ if mane_data["chr_strand"] == "-"
342
+ else Strand.POSITIVE,
268
343
  status=TranscriptPriority(
269
344
  "_".join(mane_data["MANE_status"].split()).lower()
270
345
  ),
@@ -278,17 +353,17 @@ class MANETranscript:
278
353
  ensembl_c_ac: Optional[str] = None,
279
354
  alt_ac: Optional[str] = None,
280
355
  found_result: bool = False,
281
- ) -> Optional[Dict]:
356
+ ) -> Optional[CdnaRepresentation]:
282
357
  """Get transcript c. annotation data from g. annotation.
283
358
 
284
- :param Dict g: Genomic data
285
- :param str refseq_c_ac: Refseq transcript accession
286
- :param TranscriptPriority status: Status of transcript
287
- :param Optional[str] ensembl_c_ac: Ensembl transcript accession
288
- :param Optional[str] alt_ac: Genomic accession
289
- :param bool found_result: `True` if found result, so do not need to query
359
+ :param g: Genomic data
360
+ :param refseq_c_ac: Refseq transcript accession
361
+ :param status: Status of transcript
362
+ :param ensembl_c_ac: Ensembl transcript accession
363
+ :param alt_ac: Genomic accession
364
+ :param found_result: ``True`` if found result, so do not need to query
290
365
  tx_exon_aln_v table. This is because the user did not need to liftover.
291
- `False` if need to get result from tx_exon_aln_v table.
366
+ ``False`` if need to get result from tx_exon_aln_v table.
292
367
  :return: Transcript data
293
368
  """
294
369
  if found_result:
@@ -321,7 +396,7 @@ class MANETranscript:
321
396
  g_pos = g["alt_pos_change_range"] # start/end genomic change
322
397
  g_pos_change = g_pos[0] - tx_g_pos[0], tx_g_pos[1] - g_pos[1]
323
398
 
324
- if g["strand"] == "-":
399
+ if g["strand"] == Strand.NEGATIVE:
325
400
  g_pos_change = (tx_g_pos[1] - g_pos[0], g_pos[1] - tx_g_pos[0])
326
401
 
327
402
  c_pos_change = (
@@ -344,29 +419,27 @@ class MANETranscript:
344
419
  )
345
420
 
346
421
  def _validate_reading_frames(
347
- self, ac: str, start_pos: int, end_pos: int, transcript_data: Dict
422
+ self, ac: str, start_pos: int, end_pos: int, transcript_data: CdnaRepresentation
348
423
  ) -> bool:
349
424
  """Return whether reading frames are the same after translation.
350
425
 
351
- :param str ac: Query accession
352
- :param int start_pos: Original start cDNA position change
353
- :param int end_pos: Original end cDNA position change
354
- :param Dict transcript_data: Ensembl and RefSeq transcripts with
355
- corresponding position change
356
- :return: `True` if reading frames are the same after translation.
357
- `False` otherwise
426
+ :param ac: Query accession
427
+ :param start_pos: Original start cDNA position change
428
+ :param end_pos: Original end cDNA position change
429
+ :param transcript_data: Ensembl and RefSeq transcripts with corresponding
430
+ position change
431
+ :return: ``True`` if reading frames are the same after translation.
432
+ ``False`` otherwise
358
433
  """
359
434
  for pos, pos_index in [(start_pos, 0), (end_pos, 1)]:
360
435
  if pos is not None:
361
436
  og_rf = self._get_reading_frame(pos)
362
- new_rf = self._get_reading_frame(transcript_data["pos"][pos_index])
437
+ new_rf = self._get_reading_frame(transcript_data.pos[pos_index])
363
438
 
364
439
  if og_rf != new_rf:
365
440
  logger.warning(
366
- f"{ac} original reading frame ({og_rf}) "
367
- f"does not match new "
368
- f"{transcript_data['ensembl']}, "
369
- f"{transcript_data['refseq']} reading "
441
+ f"{ac} original reading frame ({og_rf}) does not match new "
442
+ f"{transcript_data.ensembl}, {transcript_data.refseq} reading "
370
443
  f"frame ({new_rf})"
371
444
  )
372
445
  return False
@@ -382,7 +455,9 @@ class MANETranscript:
382
455
  coding_start_site: int,
383
456
  start_pos: int,
384
457
  end_pos: int,
385
- mane_transcript: Dict,
458
+ mane_transcript: Union[
459
+ DataRepresentation, CdnaRepresentation, GenomicRepresentation
460
+ ],
386
461
  expected_ref: str,
387
462
  anno: AnnotationLayer,
388
463
  residue_mode: ResidueMode,
@@ -397,29 +472,29 @@ class MANETranscript:
397
472
  position change
398
473
  :param expected_ref: Reference at position given during input
399
474
  :param anno: Annotation layer we are starting from
400
- :param residue_mode: Residue mode for `start_pos` and `end_pos`
401
- :return: `True` if reference check passes. `False` otherwise.
475
+ :param residue_mode: Residue mode for ``start_pos`` and ``end_pos``
476
+ :return: ``True`` if reference check passes. ``False`` otherwise.
402
477
  """
403
478
  if anno == AnnotationLayer.CDNA:
404
479
  start_pos += coding_start_site
405
480
  end_pos += coding_start_site
406
481
 
407
- ref, warnings = self.seqrepo_access.get_reference_sequence(
408
- ac, start_pos, end=end_pos, residue_mode=residue_mode
482
+ ref, _ = self.seqrepo_access.get_reference_sequence(
483
+ ac, start=start_pos, end=end_pos, residue_mode=residue_mode
409
484
  )
410
485
  if ref is None:
411
486
  return False
412
487
 
413
488
  if mane_transcript:
414
- mane_start_pos = mane_transcript["pos"][0]
415
- mane_end_pos = mane_transcript["pos"][1]
489
+ mane_start_pos = mane_transcript.pos[0]
490
+ mane_end_pos = mane_transcript.pos[1]
416
491
  if anno == AnnotationLayer.CDNA:
417
- mane_cds = mane_transcript["coding_start_site"]
492
+ mane_cds = mane_transcript.coding_start_site
418
493
  mane_start_pos += mane_cds
419
494
  mane_end_pos += mane_cds
420
- mane_ref, warnings = self.seqrepo_access.get_reference_sequence(
421
- mane_transcript["refseq"],
422
- mane_start_pos,
495
+ mane_ref, _ = self.seqrepo_access.get_reference_sequence(
496
+ mane_transcript.refseq,
497
+ start=mane_start_pos,
423
498
  end=mane_end_pos if mane_start_pos != mane_end_pos else None,
424
499
  residue_mode=residue_mode,
425
500
  )
@@ -429,12 +504,12 @@ class MANETranscript:
429
504
  if expected_ref != mane_ref:
430
505
  logger.info(
431
506
  f"Expected ref, {expected_ref}, but got {mane_ref}"
432
- f" on MANE accession, {mane_transcript['refseq']}"
507
+ f" on MANE accession, {mane_transcript.refseq}"
433
508
  )
434
509
 
435
510
  if expected_ref != ref:
436
511
  logger.warning(
437
- f"Expected ref, {expected_ref}, but got {ref} " f"on accession, {ac}"
512
+ f"Expected ref, {expected_ref}, but got {ref} on accession, {ac}"
438
513
  )
439
514
  return False
440
515
 
@@ -445,18 +520,16 @@ class MANETranscript:
445
520
  ) -> bool:
446
521
  """Validate that positions actually exist on accession
447
522
 
448
- :param str ac: Accession
449
- :param Tuple[int, int] pos: Start position change, End position change
450
- :param int coding_start_site: coding start site for accession
451
- :return: `True` if positions exist on accession. `False` otherwise
523
+ :param ac: Accession
524
+ :param pos: Start position change, End position change
525
+ :param coding_start_site: coding start site for accession
526
+ :return: ``True`` if positions exist on accession. ``False`` otherwise
452
527
  """
453
528
  start_pos = pos[0] + coding_start_site
454
529
  end_pos = pos[1] + coding_start_site
455
530
  if self.seqrepo_access.get_reference_sequence(
456
- ac, start_pos, end_pos, residue_mode=ResidueMode.INTER_RESIDUE
457
- )[
458
- 0
459
- ]: # noqa E501
531
+ ac, start=start_pos, end=end_pos, residue_mode=ResidueMode.INTER_RESIDUE
532
+ )[0]:
460
533
  return True
461
534
  else:
462
535
  return False
@@ -517,36 +590,83 @@ class MANETranscript:
517
590
  residue_mode: ResidueMode = ResidueMode.RESIDUE,
518
591
  mane_transcripts: Optional[Set] = None,
519
592
  alt_ac: Optional[str] = None,
520
- end_annotation_layer: Optional[
521
- Union[AnnotationLayer.PROTEIN, AnnotationLayer.CDNA]
522
- ] = None,
523
- ) -> Optional[Dict]:
524
- """Get longest compatible transcript from a gene.
525
- Try GRCh38 first, then GRCh37.
526
- Transcript is compatible if it passes validation checks.
593
+ end_annotation_layer: Optional[EndAnnotationLayer] = None,
594
+ ) -> Optional[
595
+ Union[DataRepresentation, CdnaRepresentation, ProteinAndCdnaRepresentation]
596
+ ]:
597
+ """Get longest compatible transcript from a gene. See the documentation for
598
+ the :ref:`transcript compatibility policy <transcript_compatibility>` for more
599
+ information.
600
+
601
+ >>> import asyncio
602
+ >>> from cool_seq_tool.app import CoolSeqTool
603
+ >>> from cool_seq_tool.schemas import AnnotationLayer, ResidueMode
604
+ >>> mane_mapper = CoolSeqTool().mane_transcript
605
+ >>> mane_transcripts = {
606
+ ... "ENST00000646891.2",
607
+ ... "NM_001374258.1",
608
+ ... "NM_004333.6",
609
+ ... "ENST00000644969.2",
610
+ ... }
611
+ >>> result = asyncio.run(mane_mapper.get_longest_compatible_transcript(
612
+ ... 599,
613
+ ... 599,
614
+ ... gene="BRAF",
615
+ ... start_annotation_layer=AnnotationLayer.PROTEIN,
616
+ ... residue_mode=ResidueMode.INTER_RESIDUE,
617
+ ... mane_transcripts=mane_transcripts,
618
+ ... ))
619
+ >>> result.refseq
620
+ 'NP_001365396.1'
621
+
622
+ If unable to find a match on GRCh38, this method will then attempt to drop down
623
+ to GRCh37.
624
+
625
+ # TODO: add an example with inputs that demonstrate the fallback to GRCh37
527
626
 
528
627
  :param start_pos: Start position change
529
628
  :param end_pos: End position change
530
629
  :param start_annotation_layer: Starting annotation layer
531
630
  :param gene: HGNC gene symbol
532
631
  :param ref: Reference at position given during input
533
- :param residue_mode: Residue mode for `start_pos` and `end_pos`
632
+ :param residue_mode: Residue mode for ``start_pos`` and ``end_pos``
534
633
  :param mane_transcripts: Attempted mane transcripts that were not compatible
535
634
  :param alt_ac: Genomic accession
536
635
  :param end_annotation_layer: The end annotation layer. If not provided, will be
537
- set to the following
538
- `AnnotationLayer.PROTEIN` if
539
- `start_annotation_layer == AnnotationLayer.PROTEIN`
540
- `AnnotationLayer.CDNA` otherwise
636
+ set to ``EndAnnotationLayer.PROTEIN`` if
637
+ ``start_annotation_layer == AnnotationLayer.PROTEIN``,
638
+ ``EndAnnotationLayer.CDNA`` otherwise
541
639
  :return: Data for longest compatible transcript
542
640
  """
543
- inter_residue_pos, _ = get_inter_residue_pos(
544
- start_pos, residue_mode, end_pos=end_pos
545
- )
546
- if not inter_residue_pos:
547
- return None
641
+
642
+ def _get_protein_rep(
643
+ gene: Optional[str],
644
+ pro_ac: str,
645
+ lcr_c_data_pos: Tuple[int, int],
646
+ strand: Strand,
647
+ status: TranscriptPriority,
648
+ ) -> DataRepresentation:
649
+ """Get longest compatible remaining protein representation
650
+
651
+ :param gene: HGNC gene symbol
652
+ :param pro_ac: Protein accession
653
+ :param lcr_c_data_pos: Longest compatible remaining position
654
+ :param strand: Strand
655
+ :param status: Status for ``pro_ac``
656
+ :return: Protein representation for longest compatible remaining result
657
+ """
658
+ return DataRepresentation(
659
+ gene=gene,
660
+ refseq=pro_ac if pro_ac.startswith("N") else None,
661
+ ensembl=pro_ac if pro_ac.startswith("E") else None,
662
+ pos=self._c_to_p_pos(lcr_c_data_pos),
663
+ strand=strand,
664
+ status=status,
665
+ )
666
+
667
+ lcr_result = None
668
+ start_pos, end_pos = get_inter_residue_pos(start_pos, end_pos, residue_mode)
548
669
  residue_mode = ResidueMode.INTER_RESIDUE
549
- start_pos, end_pos = inter_residue_pos
550
670
 
551
671
  is_p_or_c_start_anno = True
552
672
  if start_annotation_layer == AnnotationLayer.PROTEIN:
@@ -568,7 +688,7 @@ class MANETranscript:
568
688
 
569
689
  if df.is_empty():
570
690
  logger.warning(f"Unable to get transcripts from gene {gene}")
571
- return None
691
+ return lcr_result
572
692
 
573
693
  prioritized_tx_acs = self._get_prioritized_transcripts_from_gene(df)
574
694
 
@@ -606,7 +726,7 @@ class MANETranscript:
606
726
 
607
727
  # Get prioritized transcript data for gene
608
728
  # grch38 -> c
609
- lcr_c_data = await self._g_to_c(
729
+ lcr_c_data: Optional[CdnaRepresentation] = await self._g_to_c(
610
730
  g=g,
611
731
  refseq_c_ac=tx_ac,
612
732
  status=TranscriptPriority.LONGEST_COMPATIBLE_REMAINING,
@@ -664,74 +784,108 @@ class MANETranscript:
664
784
 
665
785
  if not end_annotation_layer:
666
786
  if start_annotation_layer == AnnotationLayer.PROTEIN:
667
- end_annotation_layer = AnnotationLayer.PROTEIN
787
+ end_annotation_layer = EndAnnotationLayer.PROTEIN
668
788
  else:
669
- end_annotation_layer = AnnotationLayer.CDNA
789
+ end_annotation_layer = EndAnnotationLayer.CDNA
790
+
791
+ if end_annotation_layer in {
792
+ EndAnnotationLayer.CDNA,
793
+ EndAnnotationLayer.PROTEIN,
794
+ }:
795
+ if end_annotation_layer == EndAnnotationLayer.CDNA:
796
+ lcr_result = lcr_c_data
797
+ coding_start_site = lcr_result.coding_start_site
798
+ else:
799
+ lcr_result = _get_protein_rep(
800
+ gene,
801
+ row["pro_ac"],
802
+ lcr_c_data.pos,
803
+ g["strand"],
804
+ lcr_c_data.status,
805
+ )
806
+ coding_start_site = 0
670
807
 
671
- if end_annotation_layer == AnnotationLayer.PROTEIN:
672
- pos = (
673
- math.ceil(lcr_c_data["pos"][0] / 3),
674
- math.floor(lcr_c_data["pos"][1] / 3),
675
- )
676
- ac = row["pro_ac"]
677
- coding_start_site = 0
678
- else:
679
- # cDNA and Genomic annotations will return c. data
680
- pos = lcr_c_data["pos"]
681
- ac = tx_ac
682
- coding_start_site = lcr_c_data["coding_start_site"]
808
+ ac = lcr_result.refseq or lcr_result.ensembl
809
+ pos = lcr_result.pos
683
810
 
684
- if not self._validate_index(ac, pos, coding_start_site):
685
- logger.warning(
686
- f"{pos} are not valid positions on {ac}"
687
- f"with coding start site "
688
- f"{coding_start_site}"
811
+ if not self._validate_index(ac, pos, coding_start_site):
812
+ logger.warning(
813
+ f"{pos} are not valid positions on {ac} with coding start site "
814
+ f"{coding_start_site}"
815
+ )
816
+ continue
817
+ return lcr_result
818
+ else:
819
+ lcr_result = ProteinAndCdnaRepresentation(
820
+ protein=_get_protein_rep(
821
+ gene,
822
+ row["pro_ac"],
823
+ lcr_c_data.pos,
824
+ g["strand"],
825
+ lcr_c_data.status,
826
+ ),
827
+ cdna=lcr_c_data,
689
828
  )
690
- continue
691
-
692
- return dict(
693
- refseq=ac if ac.startswith("N") else None,
694
- ensembl=ac if ac.startswith("E") else None, # TODO: issues 87, 4
695
- pos=pos,
696
- strand=g["strand"],
697
- status=lcr_c_data["status"],
698
- )
699
- return None
829
+ lcr_result_dict = lcr_result.model_dump()
830
+
831
+ valid = True
832
+ for k in lcr_result_dict.keys():
833
+ cds = lcr_result_dict[k].get("coding_start_site", 0)
834
+ ac = lcr_result_dict[k]["refseq"] or lcr_result_dict[k]["ensembl"]
835
+ pos = lcr_result_dict[k]["pos"]
836
+ if not self._validate_index(ac, pos, cds):
837
+ valid = False
838
+ logger.warning(
839
+ f"{pos} are not valid positions on {ac} with coding start site {cds}"
840
+ )
841
+ break
842
+
843
+ if valid:
844
+ return lcr_result
845
+ return lcr_result
700
846
 
701
847
  async def get_mane_transcript(
702
848
  self,
703
849
  ac: str,
704
850
  start_pos: int,
851
+ end_pos: int,
705
852
  start_annotation_layer: AnnotationLayer,
706
- end_pos: Optional[int] = None,
707
853
  gene: Optional[str] = None,
708
854
  ref: Optional[str] = None,
709
855
  try_longest_compatible: bool = False,
710
- residue_mode: ResidueMode = ResidueMode.RESIDUE,
711
- ) -> Optional[Dict]:
712
- """Return mane transcript.
856
+ residue_mode: Union[
857
+ ResidueMode.RESIDUE, ResidueMode.INTER_RESIDUE
858
+ ] = ResidueMode.RESIDUE,
859
+ ) -> Optional[Union[DataRepresentation, CdnaRepresentation]]:
860
+ """Return MANE transcript.
861
+
862
+ >>> from cool_seq_tool.app import CoolSeqTool
863
+ >>> from cool_seq_tool.schemas import AnnotationLayer, ResidueMode
864
+ >>> import asyncio
865
+ >>> mane_mapper = CoolSeqTool().mane_transcript
866
+ >>> result = asyncio.run(mane_mapper.get_mane_transcript(
867
+ ... "NP_004324.2",
868
+ ... 599,
869
+ ... AnnotationLayer.PROTEIN,
870
+ ... residue_mode=ResidueMode.INTER_RESIDUE,
871
+ ... ))
872
+ >>> result.gene, result.refseq, result.status
873
+ ('BRAF', 'NP_004324.2', <TranscriptPriority.MANE_SELECT: 'mane_select'>)
713
874
 
714
875
  :param ac: Accession
715
876
  :param start_pos: Start position change
877
+ :param end_pos: End position change
716
878
  :param start_annotation_layer: Starting annotation layer.
717
- :param end_pos: End position change. If `None` assumes both `start_pos` and
718
- `end_pos` have same values.
719
879
  :param gene: HGNC gene symbol
720
880
  :param ref: Reference at position given during input
721
- :param try_longest_compatible: `True` if should try longest compatible remaining
722
- if mane transcript was not compatible. `False` otherwise.
723
- :param ResidueMode residue_mode: Starting residue mode for `start_pos`
724
- and `end_pos`. Will always return coordinates in inter-residue
881
+ :param try_longest_compatible: ``True`` if should try longest compatible remaining
882
+ if mane transcript was not compatible. ``False`` otherwise.
883
+ :param residue_mode: Starting residue mode for ``start_pos`` and
884
+ ``end_pos``. Will always return coordinates in inter-residue
725
885
  :return: MANE data or longest compatible transcript data if validation
726
- checks are correct. Will return inter-residue coordinates.
727
- Else, `None`
886
+ checks are correct. Will return inter-residue coordinates. Else, ``None``.
728
887
  """
729
- inter_residue_pos, warning = get_inter_residue_pos(
730
- start_pos, residue_mode, end_pos=end_pos
731
- )
732
- if not inter_residue_pos:
733
- return None
734
- start_pos, end_pos = inter_residue_pos
888
+ start_pos, end_pos = get_inter_residue_pos(start_pos, end_pos, residue_mode)
735
889
  residue_mode = ResidueMode.INTER_RESIDUE
736
890
  if ref:
737
891
  ref = ref[: end_pos - start_pos]
@@ -766,7 +920,7 @@ class MANETranscript:
766
920
  mane_transcripts |= set(
767
921
  (current_mane_data["RefSeq_nuc"], current_mane_data["Ensembl_nuc"])
768
922
  )
769
- mane = await self._g_to_c(
923
+ mane: Optional[CdnaRepresentation] = await self._g_to_c(
770
924
  g=g,
771
925
  refseq_c_ac=current_mane_data["RefSeq_nuc"],
772
926
  status=TranscriptPriority(
@@ -777,10 +931,10 @@ class MANETranscript:
777
931
  if not mane:
778
932
  continue
779
933
 
780
- if not mane["alt_ac"]:
934
+ if not mane.alt_ac:
781
935
  g_alt_ac = g.get("alt_ac")
782
936
  if g_alt_ac:
783
- mane["alt_ac"] = g_alt_ac
937
+ mane.alt_ac = g_alt_ac
784
938
 
785
939
  valid_reading_frame = self._validate_reading_frames(
786
940
  c_ac, c_pos[0], c_pos[1], mane
@@ -789,7 +943,9 @@ class MANETranscript:
789
943
  continue
790
944
 
791
945
  if start_annotation_layer == AnnotationLayer.PROTEIN:
792
- mane = self._get_mane_p(current_mane_data, mane["pos"])
946
+ mane: DataRepresentation = self._get_mane_p(
947
+ current_mane_data, mane.pos
948
+ )
793
949
 
794
950
  if ref:
795
951
  valid_references = self._validate_references(
@@ -842,9 +998,9 @@ class MANETranscript:
842
998
  ) -> Optional[Dict]:
843
999
  """Return genomic coordinate on GRCh38 when not given gene context.
844
1000
 
845
- :param str ac: Genomic accession
846
- :param int start_pos: Genomic start position change
847
- :param int end_pos: Genomic end position change
1001
+ :param ac: Genomic accession
1002
+ :param start_pos: Genomic start position
1003
+ :param end_pos: Genomic end position
848
1004
  :return: NC accession, start and end pos on GRCh38 assembly
849
1005
  """
850
1006
  if end_pos is None:
@@ -899,8 +1055,8 @@ class MANETranscript:
899
1055
  ) -> Tuple[int, int]:
900
1056
  """Get mane c position change
901
1057
 
902
- :param Dict mane_tx_genomic_data: MANE transcript and genomic data
903
- :param int coding_start_site: Coding start site
1058
+ :param mane_tx_genomic_data: MANE transcript and genomic data
1059
+ :param coding_start_site: Coding start site
904
1060
  :return: cDNA pos start, cDNA pos end
905
1061
  """
906
1062
  tx_pos_range = mane_tx_genomic_data["tx_pos_range"]
@@ -922,28 +1078,40 @@ class MANETranscript:
922
1078
  end_pos: int,
923
1079
  gene: Optional[str] = None,
924
1080
  residue_mode: ResidueMode = ResidueMode.RESIDUE,
925
- ) -> Optional[Dict]:
1081
+ ) -> Optional[Union[GenomicRepresentation, CdnaRepresentation]]:
926
1082
  """Return MANE Transcript on the c. coordinate.
927
- If gene is provided, g->GRCh38->MANE c.
928
- If MANE c. cannot be found, we return the genomic coordinate on
929
- GRCh38
930
- If gene is not provided, g -> GRCh38
931
-
932
- :param str ac: Transcript accession on g. coordinate
933
- :param int start_pos: genomic change start position
934
- :param int end_pos: genomic change end position
935
- :param str gene: Gene symbol
936
- :param ResidueMode residue_mode: Starting residue mode for `start_pos`
937
- and `end_pos`. Will always return coordinates in inter-residue
1083
+
1084
+ If an arg for ``gene`` is provided, lifts to GRCh38, then gets MANE cDNA
1085
+ representation.
1086
+
1087
+ >>> import asyncio
1088
+ >>> from cool_seq_tool.app import CoolSeqTool
1089
+ >>> cst = CoolSeqTool()
1090
+ >>> result = asyncio.run(cst.mane_transcript.g_to_mane_c(
1091
+ ... "NC_000007.13",
1092
+ ... 55259515,
1093
+ ... None,
1094
+ ... gene="EGFR"
1095
+ ... ))
1096
+ >>> type(result)
1097
+ <class 'cool_seq_tool.mappers.mane_transcript.CdnaRepresentation'>
1098
+ >>> result.status
1099
+ <TranscriptPriority.MANE_SELECT: 'mane_select'>
1100
+ >>> del cst
1101
+
1102
+ Locating a MANE transcript requires a ``gene`` symbol argument -- if none is
1103
+ given, this method will only lift over to genomic coordinates on GRCh38.
1104
+
1105
+ :param ac: Genomic accession (g. coordinate)
1106
+ :param start_pos: genomic start position
1107
+ :param end_pos: genomic end position
1108
+ :param gene: HGNC gene symbol
1109
+ :param residue_mode: Starting residue mode for ``start_pos`` and ``end_pos``.
1110
+ Will always return coordinates in inter-residue.
938
1111
  :return: MANE Transcripts with cDNA change on c. coordinate if gene
939
1112
  is provided. Else, GRCh38 data
940
1113
  """
941
- inter_residue_pos, _ = get_inter_residue_pos(
942
- start_pos, residue_mode, end_pos=end_pos
943
- )
944
- if not inter_residue_pos:
945
- return None
946
- start_pos, end_pos = inter_residue_pos
1114
+ start_pos, end_pos = get_inter_residue_pos(start_pos, end_pos, residue_mode)
947
1115
  residue_mode = ResidueMode.INTER_RESIDUE
948
1116
 
949
1117
  # If gene not provided, return GRCh38
@@ -952,14 +1120,9 @@ class MANETranscript:
952
1120
  if not grch38:
953
1121
  return None
954
1122
 
955
- return dict(
956
- gene=None,
1123
+ return GenomicRepresentation(
957
1124
  refseq=grch38["ac"],
958
- ensembl=None,
959
- coding_start_site=None,
960
- coding_end_site=None,
961
1125
  pos=grch38["pos"],
962
- strand=None,
963
1126
  status=TranscriptPriority.GRCH38,
964
1127
  alt_ac=grch38["ac"],
965
1128
  )
@@ -981,7 +1144,7 @@ class MANETranscript:
981
1144
  if grch38:
982
1145
  # GRCh38 -> MANE C
983
1146
  mane_tx_genomic_data = await self.uta_db.get_mane_c_genomic_data(
984
- mane_c_ac, None, grch38["pos"][0], grch38["pos"][1]
1147
+ mane_c_ac, grch38["ac"], grch38["pos"][0], grch38["pos"][1]
985
1148
  )
986
1149
 
987
1150
  if not grch38 or not mane_tx_genomic_data:
@@ -1014,7 +1177,9 @@ class MANETranscript:
1014
1177
  gene=current_mane_data["symbol"],
1015
1178
  cds_start_end=(coding_start_site, coding_end_site),
1016
1179
  c_pos_change=mane_c_pos_change,
1017
- strand=current_mane_data["chr_strand"],
1180
+ strand=Strand.NEGATIVE
1181
+ if current_mane_data["chr_strand"] == "-"
1182
+ else Strand.POSITIVE,
1018
1183
  status=TranscriptPriority(
1019
1184
  "_".join(current_mane_data["MANE_status"].split()).lower()
1020
1185
  ),
@@ -1023,7 +1188,7 @@ class MANETranscript:
1023
1188
  alt_ac=grch38["ac"] if grch38 else None,
1024
1189
  )
1025
1190
 
1026
- async def grch38_to_mane_p(
1191
+ async def grch38_to_mane_c_p(
1027
1192
  self,
1028
1193
  alt_ac: str,
1029
1194
  start_pos: int,
@@ -1033,21 +1198,22 @@ class MANETranscript:
1033
1198
  try_longest_compatible: bool = False,
1034
1199
  ) -> Optional[Dict]:
1035
1200
  """Given GRCh38 genomic representation, return protein representation.
1201
+
1036
1202
  Will try MANE Select and then MANE Plus Clinical. If neither is found and
1037
- `try_longest_compatible` is set to `true`, will also try to find the longest
1203
+ ``try_longest_compatible`` is set to ``True``, will also try to find the longest
1038
1204
  compatible remaining representation.
1039
1205
 
1040
1206
  :param alt_ac: Genomic RefSeq accession on GRCh38
1041
1207
  :param start_pos: Start position
1042
1208
  :param end_pos: End position
1043
1209
  :param gene: HGNC gene symbol
1044
- :param residue_mode: Starting residue mode for `start_pos` and `end_pos`. Will
1210
+ :param residue_mode: Starting residue mode for ``start_pos`` and ``end_pos``. Will
1045
1211
  always return coordinates as inter-residue.
1046
- :param try_longest_compatible: `True` if should try longest compatible remaining
1047
- if mane transcript(s) not compatible. `False` otherwise.
1212
+ :param try_longest_compatible: ``True`` if should try longest compatible remaining
1213
+ if mane transcript(s) not compatible. ``False`` otherwise.
1048
1214
  :return: If successful, return MANE data or longest compatible remaining (if
1049
- `try_longest_compatible` set to `True`) protein representation. Will return
1050
- inter-residue coordinates.
1215
+ ``try_longest_compatible`` set to ``True``) cDNA and protein representation.
1216
+ Will return inter-residue coordinates.
1051
1217
  """
1052
1218
  # Step 1: Get MANE data to map to
1053
1219
  if gene:
@@ -1061,12 +1227,7 @@ class MANETranscript:
1061
1227
  return None
1062
1228
 
1063
1229
  # Step 2: Get inter-residue position
1064
- inter_residue_pos, _ = get_inter_residue_pos(
1065
- start_pos, residue_mode, end_pos=end_pos
1066
- )
1067
- if not inter_residue_pos:
1068
- return None
1069
- start_pos, end_pos = inter_residue_pos
1230
+ start_pos, end_pos = get_inter_residue_pos(start_pos, end_pos, residue_mode)
1070
1231
  residue_mode = ResidueMode.INTER_RESIDUE
1071
1232
 
1072
1233
  # Step 3: Try getting MANE protein representation
@@ -1084,6 +1245,7 @@ class MANETranscript:
1084
1245
 
1085
1246
  # Get MANE C positions
1086
1247
  coding_start_site = mane_tx_genomic_data["coding_start_site"]
1248
+ coding_end_site = mane_tx_genomic_data["coding_end_site"]
1087
1249
  mane_c_pos_change = self.get_mane_c_pos_change(
1088
1250
  mane_tx_genomic_data, coding_start_site
1089
1251
  )
@@ -1098,8 +1260,21 @@ class MANETranscript:
1098
1260
  )
1099
1261
  continue
1100
1262
 
1101
- # MANE C -> MANE P
1102
- return self._get_mane_p(current_mane_data, mane_c_pos_change)
1263
+ return ProteinAndCdnaRepresentation(
1264
+ protein=self._get_mane_p(current_mane_data, mane_c_pos_change),
1265
+ cdna=self._get_c_data(
1266
+ (coding_start_site, coding_end_site),
1267
+ mane_c_pos_change,
1268
+ mane_tx_genomic_data["strand"],
1269
+ TranscriptPriority(
1270
+ "_".join(current_mane_data["MANE_status"].split()).lower()
1271
+ ),
1272
+ mane_c_ac,
1273
+ alt_ac=alt_ac,
1274
+ ensembl_c_ac=current_mane_data["Ensembl_nuc"],
1275
+ gene=current_mane_data["symbol"],
1276
+ ),
1277
+ )
1103
1278
 
1104
1279
  if try_longest_compatible:
1105
1280
  return await self.get_longest_compatible_transcript(
@@ -1108,7 +1283,7 @@ class MANETranscript:
1108
1283
  AnnotationLayer.GENOMIC,
1109
1284
  residue_mode=residue_mode,
1110
1285
  alt_ac=alt_ac,
1111
- end_annotation_layer=AnnotationLayer.PROTEIN,
1286
+ end_annotation_layer=EndAnnotationLayer.PROTEIN_AND_CDNA,
1112
1287
  mane_transcripts=mane_transcripts,
1113
1288
  )
1114
1289
  else: