cool-seq-tool 0.3.0.dev1__py3-none-any.whl → 0.4.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cool_seq_tool/api.py +3 -3
- cool_seq_tool/app.py +32 -11
- cool_seq_tool/data/data_downloads.py +8 -5
- cool_seq_tool/handlers/seqrepo_access.py +55 -27
- cool_seq_tool/mappers/__init__.py +4 -1
- cool_seq_tool/mappers/alignment.py +40 -37
- cool_seq_tool/mappers/exon_genomic_coords.py +329 -138
- cool_seq_tool/mappers/mane_transcript.py +402 -227
- cool_seq_tool/routers/mane.py +1 -1
- cool_seq_tool/routers/mappings.py +1 -1
- cool_seq_tool/schemas.py +31 -24
- cool_seq_tool/sources/__init__.py +4 -2
- cool_seq_tool/sources/mane_transcript_mappings.py +28 -7
- cool_seq_tool/sources/transcript_mappings.py +27 -11
- cool_seq_tool/sources/uta_database.py +179 -232
- cool_seq_tool/utils.py +22 -24
- cool_seq_tool/version.py +1 -1
- {cool_seq_tool-0.3.0.dev1.dist-info → cool_seq_tool-0.4.0.dev0.dist-info}/LICENSE +1 -1
- cool_seq_tool-0.4.0.dev0.dist-info/METADATA +130 -0
- cool_seq_tool-0.4.0.dev0.dist-info/RECORD +28 -0
- {cool_seq_tool-0.3.0.dev1.dist-info → cool_seq_tool-0.4.0.dev0.dist-info}/WHEEL +1 -1
- cool_seq_tool/data/transcript_mapping.tsv +0 -256226
- cool_seq_tool-0.3.0.dev1.dist-info/METADATA +0 -187
- cool_seq_tool-0.3.0.dev1.dist-info/RECORD +0 -29
- {cool_seq_tool-0.3.0.dev1.dist-info → cool_seq_tool-0.4.0.dev0.dist-info}/top_level.txt +0 -0
@@ -1,53 +1,121 @@
|
|
1
|
-
"""
|
1
|
+
"""Retrieve MANE transcript from a location on p./c./g. coordinates.
|
2
|
+
|
2
3
|
Steps:
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
4
|
+
|
5
|
+
#. Map annotation layer to genome
|
6
|
+
#. Liftover to preferred genome (GRCh38). GRCh36 and earlier assemblies are not supported
|
7
|
+
for fetching MANE transcripts.
|
8
|
+
#. Select preferred compatible annotation (see :ref:`transcript compatibility <transcript_compatibility>`)
|
9
|
+
#. Map back to correct annotation layer
|
10
|
+
|
11
|
+
In addition to a mapper utility class, this module also defines several vocabulary
|
12
|
+
constraints and data models for coordinate representation.
|
9
13
|
"""
|
10
14
|
import logging
|
11
15
|
import math
|
16
|
+
from enum import StrEnum
|
12
17
|
from typing import Dict, List, Optional, Set, Tuple, Union
|
13
18
|
|
14
19
|
import polars as pl
|
20
|
+
from pydantic import BaseModel
|
15
21
|
|
16
22
|
from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess
|
17
23
|
from cool_seq_tool.schemas import (
|
18
24
|
AnnotationLayer,
|
19
25
|
Assembly,
|
20
26
|
ResidueMode,
|
27
|
+
Strand,
|
21
28
|
TranscriptPriority,
|
22
29
|
)
|
23
30
|
from cool_seq_tool.sources import (
|
24
|
-
|
31
|
+
ManeTranscriptMappings,
|
25
32
|
TranscriptMappings,
|
26
|
-
|
33
|
+
UtaDatabase,
|
27
34
|
)
|
28
35
|
from cool_seq_tool.utils import get_inter_residue_pos
|
29
36
|
|
30
37
|
logger = logging.getLogger(__name__)
|
31
38
|
|
32
39
|
|
33
|
-
class
|
40
|
+
class EndAnnotationLayer(StrEnum):
|
41
|
+
"""Define constraints for end annotation layer. This is used for determining the
|
42
|
+
end annotation layer when getting the longest compatible remaining representation.
|
43
|
+
"""
|
44
|
+
|
45
|
+
PROTEIN = AnnotationLayer.PROTEIN
|
46
|
+
CDNA = AnnotationLayer.CDNA
|
47
|
+
PROTEIN_AND_CDNA = "p_and_c"
|
48
|
+
|
49
|
+
|
50
|
+
class DataRepresentation(BaseModel):
|
51
|
+
"""Define object model for final output representation"""
|
52
|
+
|
53
|
+
gene: Optional[str] = None
|
54
|
+
refseq: str
|
55
|
+
ensembl: Optional[str] = None
|
56
|
+
pos: Tuple[int, int]
|
57
|
+
strand: Strand
|
58
|
+
status: TranscriptPriority
|
59
|
+
|
60
|
+
|
61
|
+
class CdnaRepresentation(DataRepresentation):
|
62
|
+
"""Define object model for coding DNA representation"""
|
63
|
+
|
64
|
+
coding_start_site: int
|
65
|
+
coding_end_site: int
|
66
|
+
alt_ac: Optional[str] = None
|
67
|
+
|
68
|
+
|
69
|
+
class GenomicRepresentation(BaseModel):
|
70
|
+
"""Define object model for genomic representation"""
|
71
|
+
|
72
|
+
refseq: str
|
73
|
+
pos: Tuple[int, int]
|
74
|
+
status: TranscriptPriority
|
75
|
+
alt_ac: str
|
76
|
+
|
77
|
+
|
78
|
+
class ProteinAndCdnaRepresentation(BaseModel):
|
79
|
+
"""Define object model for protein and cDNA representation"""
|
80
|
+
|
81
|
+
protein: DataRepresentation
|
82
|
+
cdna: CdnaRepresentation
|
83
|
+
|
84
|
+
|
85
|
+
class ManeTranscript:
|
34
86
|
"""Class for retrieving MANE transcripts."""
|
35
87
|
|
36
88
|
def __init__(
|
37
89
|
self,
|
38
90
|
seqrepo_access: SeqRepoAccess,
|
39
91
|
transcript_mappings: TranscriptMappings,
|
40
|
-
mane_transcript_mappings:
|
41
|
-
uta_db:
|
92
|
+
mane_transcript_mappings: ManeTranscriptMappings,
|
93
|
+
uta_db: UtaDatabase,
|
42
94
|
) -> None:
|
43
|
-
"""Initialize the
|
95
|
+
"""Initialize the ManeTranscript class.
|
96
|
+
|
97
|
+
A handful of resources are required for initialization, so when defaults are
|
98
|
+
enough, it's easiest to let the core CoolSeqTool class handle it for you:
|
99
|
+
|
100
|
+
>>> from cool_seq_tool.app import CoolSeqTool
|
101
|
+
>>> mane_mapper = CoolSeqTool().mane_transcript
|
102
|
+
|
103
|
+
Note that most methods are defined as Python coroutines, so they must be called
|
104
|
+
with ``await`` or run from an ``async`` event loop:
|
105
|
+
|
106
|
+
>>> import asyncio
|
107
|
+
>>> result = asyncio.run(mane_mapper.g_to_grch38("NC_000001.11", 100, 200))
|
108
|
+
>>> result['ac']
|
109
|
+
'NC_000001.11'
|
110
|
+
|
111
|
+
See the :ref:`Usage section <async_note>` for more information.
|
44
112
|
|
45
113
|
:param seqrepo_access: Access to seqrepo queries
|
46
114
|
:param transcript_mappings: Access to transcript accession mappings and
|
47
115
|
conversions
|
48
116
|
:param mane_transcript_mappings: Access to MANE Transcript accession mapping
|
49
117
|
data
|
50
|
-
:param uta_db:
|
118
|
+
:param uta_db: UtaDatabase instance to give access to query UTA database
|
51
119
|
"""
|
52
120
|
self.seqrepo_access = seqrepo_access
|
53
121
|
self.transcript_mappings = transcript_mappings
|
@@ -56,10 +124,9 @@ class MANETranscript:
|
|
56
124
|
|
57
125
|
@staticmethod
|
58
126
|
def _get_reading_frame(pos: int) -> int:
|
59
|
-
"""Return reading frame number.
|
60
|
-
Only used on c. coordinate
|
127
|
+
"""Return reading frame number. Only used on c. coordinate.
|
61
128
|
|
62
|
-
:param
|
129
|
+
:param pos: cDNA position
|
63
130
|
:return: Reading frame
|
64
131
|
"""
|
65
132
|
pos_mod_3 = pos % 3
|
@@ -71,26 +138,25 @@ class MANETranscript:
|
|
71
138
|
def _p_to_c_pos(start: int, end: int) -> Tuple[int, int]:
|
72
139
|
"""Return cDNA position given a protein position.
|
73
140
|
|
74
|
-
:param
|
75
|
-
:param
|
141
|
+
:param start: Start protein position. Inter-residue coordinates
|
142
|
+
:param end: End protein position. Inter-residue coordinates
|
76
143
|
:return: cDNA position start, cDNA position end
|
77
144
|
"""
|
78
|
-
start_pos = start * 3
|
145
|
+
start_pos = start * 3
|
79
146
|
if end != start:
|
80
|
-
end_pos = end * 3
|
147
|
+
end_pos = end * 3
|
81
148
|
else:
|
82
149
|
end_pos = start_pos
|
83
|
-
|
84
|
-
return start_pos - 1, end_pos + 1
|
150
|
+
return start_pos, end_pos - 1
|
85
151
|
|
86
152
|
async def _p_to_c(
|
87
153
|
self, ac: str, start_pos: int, end_pos: int
|
88
154
|
) -> Optional[Tuple[str, Tuple[int, int]]]:
|
89
155
|
"""Convert protein (p.) annotation to cDNA (c.) annotation.
|
90
156
|
|
91
|
-
:param
|
92
|
-
:param
|
93
|
-
:param
|
157
|
+
:param ac: Protein accession
|
158
|
+
:param start_pos: Protein start position. Inter-residue coordinates
|
159
|
+
:param end_pos: Protein end position. Inter-residue coordinates
|
94
160
|
:return: [cDNA transcript accession, [cDNA pos start, cDNA pos end]]
|
95
161
|
"""
|
96
162
|
# TODO: Check version mappings 1 to 1 relationship
|
@@ -116,8 +182,8 @@ class MANETranscript:
|
|
116
182
|
async def _c_to_g(self, ac: str, pos: Tuple[int, int]) -> Optional[Dict]:
|
117
183
|
"""Get g. annotation from c. annotation.
|
118
184
|
|
119
|
-
:param
|
120
|
-
:param
|
185
|
+
:param ac: cDNA accession
|
186
|
+
:param pos: [cDNA pos start, cDNA pos end]
|
121
187
|
:return: Gene, Transcript accession and position change,
|
122
188
|
Altered transcript accession and position change, Strand
|
123
189
|
"""
|
@@ -127,7 +193,9 @@ class MANETranscript:
|
|
127
193
|
if not self.transcript_mappings.ensembl_transcript_version_to_gene_symbol.get(
|
128
194
|
ac
|
129
195
|
):
|
130
|
-
if not self.seqrepo_access.get_reference_sequence(ac, 1)[
|
196
|
+
if not self.seqrepo_access.get_reference_sequence(ac, start=1, end=1)[
|
197
|
+
0
|
198
|
+
]:
|
131
199
|
logger.warning(f"Ensembl transcript not found: {ac}")
|
132
200
|
return None
|
133
201
|
|
@@ -160,12 +228,11 @@ class MANETranscript:
|
|
160
228
|
) -> Optional[Dict]:
|
161
229
|
"""Get and validate genomic_tx_data
|
162
230
|
|
163
|
-
:param
|
164
|
-
:param
|
165
|
-
:param
|
166
|
-
|
167
|
-
:param
|
168
|
-
:param Optional[str] alt_ac: Accession on g. coordinate
|
231
|
+
:param tx_ac: Accession on c. coordinate
|
232
|
+
:param pos: (start pos, end pos)
|
233
|
+
:param annotation_layer: Annotation layer for ``tx_ac`` and ``pos``
|
234
|
+
:param coding_start_site: Coding start site
|
235
|
+
:param alt_ac: Accession on g. coordinate
|
169
236
|
:return: genomic_tx_data if found and validated, else None
|
170
237
|
"""
|
171
238
|
genomic_tx_data = await self.uta_db.get_genomic_tx_data(
|
@@ -199,27 +266,25 @@ class MANETranscript:
|
|
199
266
|
|
200
267
|
@staticmethod
|
201
268
|
def _get_c_data(
|
202
|
-
gene: str,
|
203
269
|
cds_start_end: Tuple[int, int],
|
204
270
|
c_pos_change: Tuple[int, int],
|
205
|
-
strand:
|
271
|
+
strand: Strand,
|
206
272
|
status: TranscriptPriority,
|
207
273
|
refseq_c_ac: str,
|
274
|
+
gene: Optional[str] = None,
|
208
275
|
ensembl_c_ac: Optional[str] = None,
|
209
276
|
alt_ac: Optional[str] = None,
|
210
|
-
) ->
|
277
|
+
) -> CdnaRepresentation:
|
211
278
|
"""Return transcript data on c. coordinate.
|
212
279
|
|
213
|
-
:param
|
214
|
-
:param
|
215
|
-
|
216
|
-
:param
|
217
|
-
|
218
|
-
:param
|
219
|
-
:param
|
220
|
-
:param
|
221
|
-
:param Optional[str] ensembl_c_ac: Ensembl transcript
|
222
|
-
:param Optional[str] alt_ac: Genomic accession
|
280
|
+
:param gene: Gene symbol
|
281
|
+
:param cds_start_end: Coding start and end site for transcript
|
282
|
+
:param c_pos_change: Start and end positions for change on c. coordinate
|
283
|
+
:param strand: Strand
|
284
|
+
:param status: Status of transcript
|
285
|
+
:param refseq_c_ac: Refseq transcript
|
286
|
+
:param ensembl_c_ac: Ensembl transcript
|
287
|
+
:param alt_ac: Genomic accession
|
223
288
|
:return: Transcript data on c. coord
|
224
289
|
"""
|
225
290
|
cds_start = cds_start_end[0]
|
@@ -229,10 +294,10 @@ class MANETranscript:
|
|
229
294
|
|
230
295
|
if lt_cds_start or gt_cds_end:
|
231
296
|
logger.info(
|
232
|
-
f"{refseq_c_ac} with position"
|
233
|
-
f" {c_pos_change} is not within CDS start/end"
|
297
|
+
f"{refseq_c_ac} with position {c_pos_change} is not within CDS start/end"
|
234
298
|
)
|
235
|
-
|
299
|
+
|
300
|
+
return CdnaRepresentation(
|
236
301
|
gene=gene,
|
237
302
|
refseq=refseq_c_ac,
|
238
303
|
ensembl=ensembl_c_ac,
|
@@ -244,27 +309,37 @@ class MANETranscript:
|
|
244
309
|
alt_ac=alt_ac,
|
245
310
|
)
|
246
311
|
|
247
|
-
|
248
|
-
|
249
|
-
"""Translate MANE Transcript c. annotation to p. annotation
|
312
|
+
def _c_to_p_pos(self, c_pos: Tuple[int, int]) -> Tuple[int, int]:
|
313
|
+
"""Get protein position from cdna position
|
250
314
|
|
251
|
-
:param
|
252
|
-
:
|
253
|
-
on MANE Transcript c. coordinate
|
254
|
-
:return: MANE transcripts accessions and position change on
|
255
|
-
p. coordinate
|
315
|
+
:param c_pos: cDNA position. Inter-residue coordinates
|
316
|
+
:return: Protein position. Inter-residue coordinates
|
256
317
|
"""
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
318
|
+
end = math.ceil(c_pos[1] / 3)
|
319
|
+
if c_pos[1] - c_pos[0] == 1:
|
320
|
+
start = end - 1
|
321
|
+
else:
|
322
|
+
start = math.ceil((c_pos[0] + 1) / 3) - 1
|
323
|
+
return start, end
|
324
|
+
|
325
|
+
def _get_mane_p(
|
326
|
+
self, mane_data: Dict, mane_c_pos_range: Tuple[int, int]
|
327
|
+
) -> DataRepresentation:
|
328
|
+
"""Translate MANE Transcript c. annotation to p. annotation
|
261
329
|
|
262
|
-
|
330
|
+
:param mane_data: MANE Transcript data
|
331
|
+
:param mane_c_pos_range: Position change range on MANE Transcript c. coordinate
|
332
|
+
using inter-residue coordinates
|
333
|
+
:return: Protein representation
|
334
|
+
"""
|
335
|
+
return DataRepresentation(
|
263
336
|
gene=mane_data["symbol"],
|
264
337
|
refseq=mane_data["RefSeq_prot"],
|
265
338
|
ensembl=mane_data["Ensembl_prot"],
|
266
|
-
pos=(
|
267
|
-
strand=
|
339
|
+
pos=self._c_to_p_pos(mane_c_pos_range),
|
340
|
+
strand=Strand.NEGATIVE
|
341
|
+
if mane_data["chr_strand"] == "-"
|
342
|
+
else Strand.POSITIVE,
|
268
343
|
status=TranscriptPriority(
|
269
344
|
"_".join(mane_data["MANE_status"].split()).lower()
|
270
345
|
),
|
@@ -278,17 +353,17 @@ class MANETranscript:
|
|
278
353
|
ensembl_c_ac: Optional[str] = None,
|
279
354
|
alt_ac: Optional[str] = None,
|
280
355
|
found_result: bool = False,
|
281
|
-
) -> Optional[
|
356
|
+
) -> Optional[CdnaRepresentation]:
|
282
357
|
"""Get transcript c. annotation data from g. annotation.
|
283
358
|
|
284
|
-
:param
|
285
|
-
:param
|
286
|
-
:param
|
287
|
-
:param
|
288
|
-
:param
|
289
|
-
:param
|
359
|
+
:param g: Genomic data
|
360
|
+
:param refseq_c_ac: Refseq transcript accession
|
361
|
+
:param status: Status of transcript
|
362
|
+
:param ensembl_c_ac: Ensembl transcript accession
|
363
|
+
:param alt_ac: Genomic accession
|
364
|
+
:param found_result: ``True`` if found result, so do not need to query
|
290
365
|
tx_exon_aln_v table. This is because the user did not need to liftover.
|
291
|
-
|
366
|
+
``False`` if need to get result from tx_exon_aln_v table.
|
292
367
|
:return: Transcript data
|
293
368
|
"""
|
294
369
|
if found_result:
|
@@ -321,7 +396,7 @@ class MANETranscript:
|
|
321
396
|
g_pos = g["alt_pos_change_range"] # start/end genomic change
|
322
397
|
g_pos_change = g_pos[0] - tx_g_pos[0], tx_g_pos[1] - g_pos[1]
|
323
398
|
|
324
|
-
if g["strand"] ==
|
399
|
+
if g["strand"] == Strand.NEGATIVE:
|
325
400
|
g_pos_change = (tx_g_pos[1] - g_pos[0], g_pos[1] - tx_g_pos[0])
|
326
401
|
|
327
402
|
c_pos_change = (
|
@@ -344,29 +419,27 @@ class MANETranscript:
|
|
344
419
|
)
|
345
420
|
|
346
421
|
def _validate_reading_frames(
|
347
|
-
self, ac: str, start_pos: int, end_pos: int, transcript_data:
|
422
|
+
self, ac: str, start_pos: int, end_pos: int, transcript_data: CdnaRepresentation
|
348
423
|
) -> bool:
|
349
424
|
"""Return whether reading frames are the same after translation.
|
350
425
|
|
351
|
-
:param
|
352
|
-
:param
|
353
|
-
:param
|
354
|
-
:param
|
355
|
-
|
356
|
-
:return:
|
357
|
-
|
426
|
+
:param ac: Query accession
|
427
|
+
:param start_pos: Original start cDNA position change
|
428
|
+
:param end_pos: Original end cDNA position change
|
429
|
+
:param transcript_data: Ensembl and RefSeq transcripts with corresponding
|
430
|
+
position change
|
431
|
+
:return: ``True`` if reading frames are the same after translation.
|
432
|
+
``False`` otherwise
|
358
433
|
"""
|
359
434
|
for pos, pos_index in [(start_pos, 0), (end_pos, 1)]:
|
360
435
|
if pos is not None:
|
361
436
|
og_rf = self._get_reading_frame(pos)
|
362
|
-
new_rf = self._get_reading_frame(transcript_data
|
437
|
+
new_rf = self._get_reading_frame(transcript_data.pos[pos_index])
|
363
438
|
|
364
439
|
if og_rf != new_rf:
|
365
440
|
logger.warning(
|
366
|
-
f"{ac} original reading frame ({og_rf}) "
|
367
|
-
f"
|
368
|
-
f"{transcript_data['ensembl']}, "
|
369
|
-
f"{transcript_data['refseq']} reading "
|
441
|
+
f"{ac} original reading frame ({og_rf}) does not match new "
|
442
|
+
f"{transcript_data.ensembl}, {transcript_data.refseq} reading "
|
370
443
|
f"frame ({new_rf})"
|
371
444
|
)
|
372
445
|
return False
|
@@ -382,7 +455,9 @@ class MANETranscript:
|
|
382
455
|
coding_start_site: int,
|
383
456
|
start_pos: int,
|
384
457
|
end_pos: int,
|
385
|
-
mane_transcript:
|
458
|
+
mane_transcript: Union[
|
459
|
+
DataRepresentation, CdnaRepresentation, GenomicRepresentation
|
460
|
+
],
|
386
461
|
expected_ref: str,
|
387
462
|
anno: AnnotationLayer,
|
388
463
|
residue_mode: ResidueMode,
|
@@ -397,29 +472,29 @@ class MANETranscript:
|
|
397
472
|
position change
|
398
473
|
:param expected_ref: Reference at position given during input
|
399
474
|
:param anno: Annotation layer we are starting from
|
400
|
-
:param residue_mode: Residue mode for
|
401
|
-
:return:
|
475
|
+
:param residue_mode: Residue mode for ``start_pos`` and ``end_pos``
|
476
|
+
:return: ``True`` if reference check passes. ``False`` otherwise.
|
402
477
|
"""
|
403
478
|
if anno == AnnotationLayer.CDNA:
|
404
479
|
start_pos += coding_start_site
|
405
480
|
end_pos += coding_start_site
|
406
481
|
|
407
|
-
ref,
|
408
|
-
ac, start_pos, end=end_pos, residue_mode=residue_mode
|
482
|
+
ref, _ = self.seqrepo_access.get_reference_sequence(
|
483
|
+
ac, start=start_pos, end=end_pos, residue_mode=residue_mode
|
409
484
|
)
|
410
485
|
if ref is None:
|
411
486
|
return False
|
412
487
|
|
413
488
|
if mane_transcript:
|
414
|
-
mane_start_pos = mane_transcript
|
415
|
-
mane_end_pos = mane_transcript
|
489
|
+
mane_start_pos = mane_transcript.pos[0]
|
490
|
+
mane_end_pos = mane_transcript.pos[1]
|
416
491
|
if anno == AnnotationLayer.CDNA:
|
417
|
-
mane_cds = mane_transcript
|
492
|
+
mane_cds = mane_transcript.coding_start_site
|
418
493
|
mane_start_pos += mane_cds
|
419
494
|
mane_end_pos += mane_cds
|
420
|
-
mane_ref,
|
421
|
-
mane_transcript
|
422
|
-
mane_start_pos,
|
495
|
+
mane_ref, _ = self.seqrepo_access.get_reference_sequence(
|
496
|
+
mane_transcript.refseq,
|
497
|
+
start=mane_start_pos,
|
423
498
|
end=mane_end_pos if mane_start_pos != mane_end_pos else None,
|
424
499
|
residue_mode=residue_mode,
|
425
500
|
)
|
@@ -429,12 +504,12 @@ class MANETranscript:
|
|
429
504
|
if expected_ref != mane_ref:
|
430
505
|
logger.info(
|
431
506
|
f"Expected ref, {expected_ref}, but got {mane_ref}"
|
432
|
-
f" on MANE accession, {mane_transcript
|
507
|
+
f" on MANE accession, {mane_transcript.refseq}"
|
433
508
|
)
|
434
509
|
|
435
510
|
if expected_ref != ref:
|
436
511
|
logger.warning(
|
437
|
-
f"Expected ref, {expected_ref}, but got {ref}
|
512
|
+
f"Expected ref, {expected_ref}, but got {ref} on accession, {ac}"
|
438
513
|
)
|
439
514
|
return False
|
440
515
|
|
@@ -445,18 +520,16 @@ class MANETranscript:
|
|
445
520
|
) -> bool:
|
446
521
|
"""Validate that positions actually exist on accession
|
447
522
|
|
448
|
-
:param
|
449
|
-
:param
|
450
|
-
:param
|
451
|
-
:return:
|
523
|
+
:param ac: Accession
|
524
|
+
:param pos: Start position change, End position change
|
525
|
+
:param coding_start_site: coding start site for accession
|
526
|
+
:return: ``True`` if positions exist on accession. ``False`` otherwise
|
452
527
|
"""
|
453
528
|
start_pos = pos[0] + coding_start_site
|
454
529
|
end_pos = pos[1] + coding_start_site
|
455
530
|
if self.seqrepo_access.get_reference_sequence(
|
456
|
-
ac, start_pos, end_pos, residue_mode=ResidueMode.INTER_RESIDUE
|
457
|
-
)[
|
458
|
-
0
|
459
|
-
]: # noqa E501
|
531
|
+
ac, start=start_pos, end=end_pos, residue_mode=ResidueMode.INTER_RESIDUE
|
532
|
+
)[0]:
|
460
533
|
return True
|
461
534
|
else:
|
462
535
|
return False
|
@@ -517,36 +590,83 @@ class MANETranscript:
|
|
517
590
|
residue_mode: ResidueMode = ResidueMode.RESIDUE,
|
518
591
|
mane_transcripts: Optional[Set] = None,
|
519
592
|
alt_ac: Optional[str] = None,
|
520
|
-
end_annotation_layer: Optional[
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
"""Get longest compatible transcript from a gene.
|
525
|
-
|
526
|
-
|
593
|
+
end_annotation_layer: Optional[EndAnnotationLayer] = None,
|
594
|
+
) -> Optional[
|
595
|
+
Union[DataRepresentation, CdnaRepresentation, ProteinAndCdnaRepresentation]
|
596
|
+
]:
|
597
|
+
"""Get longest compatible transcript from a gene. See the documentation for
|
598
|
+
the :ref:`transcript compatibility policy <transcript_compatibility>` for more
|
599
|
+
information.
|
600
|
+
|
601
|
+
>>> import asyncio
|
602
|
+
>>> from cool_seq_tool.app import CoolSeqTool
|
603
|
+
>>> from cool_seq_tool.schemas import AnnotationLayer, ResidueMode
|
604
|
+
>>> mane_mapper = CoolSeqTool().mane_transcript
|
605
|
+
>>> mane_transcripts = {
|
606
|
+
... "ENST00000646891.2",
|
607
|
+
... "NM_001374258.1",
|
608
|
+
... "NM_004333.6",
|
609
|
+
... "ENST00000644969.2",
|
610
|
+
... }
|
611
|
+
>>> result = asyncio.run(mane_mapper.get_longest_compatible_transcript(
|
612
|
+
... 599,
|
613
|
+
... 599,
|
614
|
+
... gene="BRAF",
|
615
|
+
... start_annotation_layer=AnnotationLayer.PROTEIN,
|
616
|
+
... residue_mode=ResidueMode.INTER_RESIDUE,
|
617
|
+
... mane_transcripts=mane_transcripts,
|
618
|
+
... ))
|
619
|
+
>>> result.refseq
|
620
|
+
'NP_001365396.1'
|
621
|
+
|
622
|
+
If unable to find a match on GRCh38, this method will then attempt to drop down
|
623
|
+
to GRCh37.
|
624
|
+
|
625
|
+
.. TODO: add an example with inputs that demonstrate the GRCh37 fallback
|
527
626
|
|
528
627
|
:param start_pos: Start position change
|
529
628
|
:param end_pos: End position change
|
530
629
|
:param start_annotation_layer: Starting annotation layer
|
531
630
|
:param gene: HGNC gene symbol
|
532
631
|
:param ref: Reference at position given during input
|
533
|
-
:param residue_mode: Residue mode for
|
632
|
+
:param residue_mode: Residue mode for ``start_pos`` and ``end_pos``
|
534
633
|
:param mane_transcripts: Attempted mane transcripts that were not compatible
|
535
634
|
:param alt_ac: Genomic accession
|
536
635
|
:param end_annotation_layer: The end annotation layer. If not provided, will be
|
537
|
-
set to
|
538
|
-
|
539
|
-
|
540
|
-
`AnnotationLayer.CDNA` otherwise
|
636
|
+
set to ``EndAnnotationLayer.PROTEIN`` if
|
637
|
+
``start_annotation_layer == AnnotationLayer.PROTEIN``,
|
638
|
+
``EndAnnotationLayer.CDNA`` otherwise
|
541
639
|
:return: Data for longest compatible transcript
|
542
640
|
"""
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
641
|
+
|
642
|
+
def _get_protein_rep(
|
643
|
+
gene: Optional[str],
|
644
|
+
pro_ac: str,
|
645
|
+
lcr_c_data_pos: Tuple[int, int],
|
646
|
+
strand: Strand,
|
647
|
+
status: TranscriptPriority,
|
648
|
+
) -> DataRepresentation:
|
649
|
+
"""Get longest compatible remaining protein representation
|
650
|
+
|
651
|
+
:param gene: HGNC gene symbol
|
652
|
+
:param pro_ac: Protein accession
|
653
|
+
:param lcr_c_data_pos: Longest compatible remaining position
|
654
|
+
:param strand: Strand
|
655
|
+
:param status: Status for ``pro_ac``
|
656
|
+
:return: Protein representation for longest compatible remaining result
|
657
|
+
"""
|
658
|
+
return DataRepresentation(
|
659
|
+
gene=gene,
|
660
|
+
refseq=pro_ac if pro_ac.startswith("N") else None,
|
661
|
+
ensembl=pro_ac if pro_ac.startswith("E") else None,
|
662
|
+
pos=self._c_to_p_pos(lcr_c_data_pos),
|
663
|
+
strand=strand,
|
664
|
+
status=status,
|
665
|
+
)
|
666
|
+
|
667
|
+
lcr_result = None
|
668
|
+
start_pos, end_pos = get_inter_residue_pos(start_pos, end_pos, residue_mode)
|
548
669
|
residue_mode = ResidueMode.INTER_RESIDUE
|
549
|
-
start_pos, end_pos = inter_residue_pos
|
550
670
|
|
551
671
|
is_p_or_c_start_anno = True
|
552
672
|
if start_annotation_layer == AnnotationLayer.PROTEIN:
|
@@ -568,7 +688,7 @@ class MANETranscript:
|
|
568
688
|
|
569
689
|
if df.is_empty():
|
570
690
|
logger.warning(f"Unable to get transcripts from gene {gene}")
|
571
|
-
return
|
691
|
+
return lcr_result
|
572
692
|
|
573
693
|
prioritized_tx_acs = self._get_prioritized_transcripts_from_gene(df)
|
574
694
|
|
@@ -606,7 +726,7 @@ class MANETranscript:
|
|
606
726
|
|
607
727
|
# Get prioritized transcript data for gene
|
608
728
|
# grch38 -> c
|
609
|
-
lcr_c_data = await self._g_to_c(
|
729
|
+
lcr_c_data: Optional[CdnaRepresentation] = await self._g_to_c(
|
610
730
|
g=g,
|
611
731
|
refseq_c_ac=tx_ac,
|
612
732
|
status=TranscriptPriority.LONGEST_COMPATIBLE_REMAINING,
|
@@ -664,74 +784,108 @@ class MANETranscript:
|
|
664
784
|
|
665
785
|
if not end_annotation_layer:
|
666
786
|
if start_annotation_layer == AnnotationLayer.PROTEIN:
|
667
|
-
end_annotation_layer =
|
787
|
+
end_annotation_layer = EndAnnotationLayer.PROTEIN
|
668
788
|
else:
|
669
|
-
end_annotation_layer =
|
789
|
+
end_annotation_layer = EndAnnotationLayer.CDNA
|
790
|
+
|
791
|
+
if end_annotation_layer in {
|
792
|
+
EndAnnotationLayer.CDNA,
|
793
|
+
EndAnnotationLayer.PROTEIN,
|
794
|
+
}:
|
795
|
+
if end_annotation_layer == EndAnnotationLayer.CDNA:
|
796
|
+
lcr_result = lcr_c_data
|
797
|
+
coding_start_site = lcr_result.coding_start_site
|
798
|
+
else:
|
799
|
+
lcr_result = _get_protein_rep(
|
800
|
+
gene,
|
801
|
+
row["pro_ac"],
|
802
|
+
lcr_c_data.pos,
|
803
|
+
g["strand"],
|
804
|
+
lcr_c_data.status,
|
805
|
+
)
|
806
|
+
coding_start_site = 0
|
670
807
|
|
671
|
-
|
672
|
-
pos =
|
673
|
-
math.ceil(lcr_c_data["pos"][0] / 3),
|
674
|
-
math.floor(lcr_c_data["pos"][1] / 3),
|
675
|
-
)
|
676
|
-
ac = row["pro_ac"]
|
677
|
-
coding_start_site = 0
|
678
|
-
else:
|
679
|
-
# cDNA and Genomic annotations will return c. data
|
680
|
-
pos = lcr_c_data["pos"]
|
681
|
-
ac = tx_ac
|
682
|
-
coding_start_site = lcr_c_data["coding_start_site"]
|
808
|
+
ac = lcr_result.refseq or lcr_result.ensembl
|
809
|
+
pos = lcr_result.pos
|
683
810
|
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
811
|
+
if not self._validate_index(ac, pos, coding_start_site):
|
812
|
+
logger.warning(
|
813
|
+
f"{pos} are not valid positions on {ac} with coding start site "
|
814
|
+
f"{coding_start_site}"
|
815
|
+
)
|
816
|
+
continue
|
817
|
+
return lcr_result
|
818
|
+
else:
|
819
|
+
lcr_result = ProteinAndCdnaRepresentation(
|
820
|
+
protein=_get_protein_rep(
|
821
|
+
gene,
|
822
|
+
row["pro_ac"],
|
823
|
+
lcr_c_data.pos,
|
824
|
+
g["strand"],
|
825
|
+
lcr_c_data.status,
|
826
|
+
),
|
827
|
+
cdna=lcr_c_data,
|
689
828
|
)
|
690
|
-
|
691
|
-
|
692
|
-
|
693
|
-
|
694
|
-
|
695
|
-
|
696
|
-
|
697
|
-
|
698
|
-
|
699
|
-
|
829
|
+
lcr_result_dict = lcr_result.model_dump()
|
830
|
+
|
831
|
+
valid = True
|
832
|
+
for k in lcr_result_dict.keys():
|
833
|
+
cds = lcr_result_dict[k].get("coding_start_site", 0)
|
834
|
+
ac = lcr_result_dict[k]["refseq"] or lcr_result_dict[k]["ensembl"]
|
835
|
+
pos = lcr_result_dict[k]["pos"]
|
836
|
+
if not self._validate_index(ac, pos, cds):
|
837
|
+
valid = False
|
838
|
+
logger.warning(
|
839
|
+
f"{pos} are not valid positions on {ac} with coding start site {cds}"
|
840
|
+
)
|
841
|
+
break
|
842
|
+
|
843
|
+
if valid:
|
844
|
+
return lcr_result
|
845
|
+
return lcr_result
|
700
846
|
|
701
847
|
async def get_mane_transcript(
|
702
848
|
self,
|
703
849
|
ac: str,
|
704
850
|
start_pos: int,
|
851
|
+
end_pos: int,
|
705
852
|
start_annotation_layer: AnnotationLayer,
|
706
|
-
end_pos: Optional[int] = None,
|
707
853
|
gene: Optional[str] = None,
|
708
854
|
ref: Optional[str] = None,
|
709
855
|
try_longest_compatible: bool = False,
|
710
|
-
residue_mode:
|
711
|
-
|
712
|
-
|
856
|
+
residue_mode: Union[
|
857
|
+
ResidueMode.RESIDUE, ResidueMode.INTER_RESIDUE
|
858
|
+
] = ResidueMode.RESIDUE,
|
859
|
+
) -> Optional[Union[DataRepresentation, CdnaRepresentation]]:
|
860
|
+
"""Return MANE transcript.
|
861
|
+
|
862
|
+
>>> from cool_seq_tool.app import CoolSeqTool
|
863
|
+
>>> from cool_seq_tool.schemas import AnnotationLayer, ResidueMode
|
864
|
+
>>> import asyncio
|
865
|
+
>>> mane_mapper = CoolSeqTool().mane_transcript
|
866
|
+
>>> result = asyncio.run(mane_mapper.get_mane_transcript(
|
867
|
+
... "NP_004324.2",
|
868
|
+
... 599,
|
869
|
+
... AnnotationLayer.PROTEIN,
|
870
|
+
... residue_mode=ResidueMode.INTER_RESIDUE,
|
871
|
+
... ))
|
872
|
+
>>> result.gene, result.refseq, result.status
|
873
|
+
('BRAF', 'NP_004324.2', <TranscriptPriority.MANE_SELECT: 'mane_select'>)
|
713
874
|
|
714
875
|
:param ac: Accession
|
715
876
|
:param start_pos: Start position change
|
877
|
+
:param end_pos: End position change
|
716
878
|
:param start_annotation_layer: Starting annotation layer.
|
717
|
-
:param end_pos: End position change. If `None` assumes both `start_pos` and
|
718
|
-
`end_pos` have same values.
|
719
879
|
:param gene: HGNC gene symbol
|
720
880
|
:param ref: Reference at position given during input
|
721
|
-
:param try_longest_compatible:
|
722
|
-
if mane transcript was not compatible.
|
723
|
-
:param ResidueMode residue_mode: Starting residue mode for
|
724
|
-
|
881
|
+
:param try_longest_compatible: ``True`` if should try longest compatible remaining
|
882
|
+
if mane transcript was not compatible. ``False`` otherwise.
|
883
|
+
:param residue_mode: Starting residue mode for ``start_pos`` and
|
884
|
+
``end_pos``. Will always return coordinates in inter-residue.
|
725
885
|
:return: MANE data or longest transcript compatible data if validation
|
726
|
-
checks are correct. Will return inter-residue coordinates.
|
727
|
-
Else, `None`
|
886
|
+
checks are correct. Will return inter-residue coordinates. Else, ``None``.
|
728
887
|
"""
|
729
|
-
|
730
|
-
start_pos, residue_mode, end_pos=end_pos
|
731
|
-
)
|
732
|
-
if not inter_residue_pos:
|
733
|
-
return None
|
734
|
-
start_pos, end_pos = inter_residue_pos
|
888
|
+
start_pos, end_pos = get_inter_residue_pos(start_pos, end_pos, residue_mode)
|
735
889
|
residue_mode = ResidueMode.INTER_RESIDUE
|
736
890
|
if ref:
|
737
891
|
ref = ref[: end_pos - start_pos]
|
@@ -766,7 +920,7 @@ class MANETranscript:
|
|
766
920
|
mane_transcripts |= set(
|
767
921
|
(current_mane_data["RefSeq_nuc"], current_mane_data["Ensembl_nuc"])
|
768
922
|
)
|
769
|
-
mane = await self._g_to_c(
|
923
|
+
mane: Optional[CdnaRepresentation] = await self._g_to_c(
|
770
924
|
g=g,
|
771
925
|
refseq_c_ac=current_mane_data["RefSeq_nuc"],
|
772
926
|
status=TranscriptPriority(
|
@@ -777,10 +931,10 @@ class MANETranscript:
|
|
777
931
|
if not mane:
|
778
932
|
continue
|
779
933
|
|
780
|
-
if not mane
|
934
|
+
if not mane.alt_ac:
|
781
935
|
g_alt_ac = g.get("alt_ac")
|
782
936
|
if g_alt_ac:
|
783
|
-
mane
|
937
|
+
mane.alt_ac = g_alt_ac
|
784
938
|
|
785
939
|
valid_reading_frame = self._validate_reading_frames(
|
786
940
|
c_ac, c_pos[0], c_pos[1], mane
|
@@ -789,7 +943,9 @@ class MANETranscript:
|
|
789
943
|
continue
|
790
944
|
|
791
945
|
if start_annotation_layer == AnnotationLayer.PROTEIN:
|
792
|
-
mane = self._get_mane_p(
|
946
|
+
mane: DataRepresentation = self._get_mane_p(
|
947
|
+
current_mane_data, mane.pos
|
948
|
+
)
|
793
949
|
|
794
950
|
if ref:
|
795
951
|
valid_references = self._validate_references(
|
@@ -842,9 +998,9 @@ class MANETranscript:
|
|
842
998
|
) -> Optional[Dict]:
|
843
999
|
"""Return genomic coordinate on GRCh38 when not given gene context.
|
844
1000
|
|
845
|
-
:param
|
846
|
-
:param
|
847
|
-
:param
|
1001
|
+
:param ac: Genomic accession
|
1002
|
+
:param start_pos: Genomic start position
|
1003
|
+
:param end_pos: Genomic end position
|
848
1004
|
:return: NC accession, start and end pos on GRCh38 assembly
|
849
1005
|
"""
|
850
1006
|
if end_pos is None:
|
@@ -899,8 +1055,8 @@ class MANETranscript:
|
|
899
1055
|
) -> Tuple[int, int]:
|
900
1056
|
"""Get mane c position change
|
901
1057
|
|
902
|
-
:param
|
903
|
-
:param
|
1058
|
+
:param mane_tx_genomic_data: MANE transcript and genomic data
|
1059
|
+
:param coding_start_site: Coding start site
|
904
1060
|
:return: cDNA pos start, cDNA pos end
|
905
1061
|
"""
|
906
1062
|
tx_pos_range = mane_tx_genomic_data["tx_pos_range"]
|
@@ -922,28 +1078,40 @@ class MANETranscript:
|
|
922
1078
|
end_pos: int,
|
923
1079
|
gene: Optional[str] = None,
|
924
1080
|
residue_mode: ResidueMode = ResidueMode.RESIDUE,
|
925
|
-
) -> Optional[
|
1081
|
+
) -> Optional[Union[GenomicRepresentation, CdnaRepresentation]]:
|
926
1082
|
"""Return MANE Transcript on the c. coordinate.
|
927
|
-
|
928
|
-
|
929
|
-
|
930
|
-
|
931
|
-
|
932
|
-
|
933
|
-
|
934
|
-
|
935
|
-
|
936
|
-
|
937
|
-
|
1083
|
+
|
1084
|
+
If an arg for ``gene`` is provided, lifts to GRCh38, then gets MANE cDNA
|
1085
|
+
representation.
|
1086
|
+
|
1087
|
+
>>> import asyncio
|
1088
|
+
>>> from cool_seq_tool.app import CoolSeqTool
|
1089
|
+
>>> cst = CoolSeqTool()
|
1090
|
+
>>> result = asyncio.run(cst.mane_transcript.g_to_mane_c(
|
1091
|
+
... "NC_000007.13",
|
1092
|
+
... 55259515,
|
1093
|
+
... None,
|
1094
|
+
... gene="EGFR"
|
1095
|
+
... ))
|
1096
|
+
>>> type(result)
|
1097
|
+
<class 'cool_seq_tool.mappers.mane_transcript.CdnaRepresentation'>
|
1098
|
+
>>> result.status
|
1099
|
+
<TranscriptPriority.MANE_SELECT: 'mane_select'>
|
1100
|
+
>>> del cst
|
1101
|
+
|
1102
|
+
Locating a MANE transcript requires a ``gene`` symbol argument -- if none is
|
1103
|
+
given, this method will only lift over to genomic coordinates on GRCh38.
|
1104
|
+
|
1105
|
+
:param ac: Genomic accession on g. coordinate
|
1106
|
+
:param start_pos: genomic start position
|
1107
|
+
:param end_pos: genomic end position
|
1108
|
+
:param gene: HGNC gene symbol
|
1109
|
+
:param residue_mode: Starting residue mode for ``start_pos`` and ``end_pos``.
|
1110
|
+
Will always return coordinates in inter-residue.
|
938
1111
|
:return: MANE Transcripts with cDNA change on c. coordinate if gene
|
939
1112
|
is provided. Else, GRCh38 data
|
940
1113
|
"""
|
941
|
-
|
942
|
-
start_pos, residue_mode, end_pos=end_pos
|
943
|
-
)
|
944
|
-
if not inter_residue_pos:
|
945
|
-
return None
|
946
|
-
start_pos, end_pos = inter_residue_pos
|
1114
|
+
start_pos, end_pos = get_inter_residue_pos(start_pos, end_pos, residue_mode)
|
947
1115
|
residue_mode = ResidueMode.INTER_RESIDUE
|
948
1116
|
|
949
1117
|
# If gene not provided, return GRCh38
|
@@ -952,14 +1120,9 @@ class MANETranscript:
|
|
952
1120
|
if not grch38:
|
953
1121
|
return None
|
954
1122
|
|
955
|
-
return
|
956
|
-
gene=None,
|
1123
|
+
return GenomicRepresentation(
|
957
1124
|
refseq=grch38["ac"],
|
958
|
-
ensembl=None,
|
959
|
-
coding_start_site=None,
|
960
|
-
coding_end_site=None,
|
961
1125
|
pos=grch38["pos"],
|
962
|
-
strand=None,
|
963
1126
|
status=TranscriptPriority.GRCH38,
|
964
1127
|
alt_ac=grch38["ac"],
|
965
1128
|
)
|
@@ -981,7 +1144,7 @@ class MANETranscript:
|
|
981
1144
|
if grch38:
|
982
1145
|
# GRCh38 -> MANE C
|
983
1146
|
mane_tx_genomic_data = await self.uta_db.get_mane_c_genomic_data(
|
984
|
-
mane_c_ac,
|
1147
|
+
mane_c_ac, grch38["ac"], grch38["pos"][0], grch38["pos"][1]
|
985
1148
|
)
|
986
1149
|
|
987
1150
|
if not grch38 or not mane_tx_genomic_data:
|
@@ -1014,7 +1177,9 @@ class MANETranscript:
|
|
1014
1177
|
gene=current_mane_data["symbol"],
|
1015
1178
|
cds_start_end=(coding_start_site, coding_end_site),
|
1016
1179
|
c_pos_change=mane_c_pos_change,
|
1017
|
-
strand=
|
1180
|
+
strand=Strand.NEGATIVE
|
1181
|
+
if current_mane_data["chr_strand"] == "-"
|
1182
|
+
else Strand.POSITIVE,
|
1018
1183
|
status=TranscriptPriority(
|
1019
1184
|
"_".join(current_mane_data["MANE_status"].split()).lower()
|
1020
1185
|
),
|
@@ -1023,7 +1188,7 @@ class MANETranscript:
|
|
1023
1188
|
alt_ac=grch38["ac"] if grch38 else None,
|
1024
1189
|
)
|
1025
1190
|
|
1026
|
-
async def
|
1191
|
+
async def grch38_to_mane_c_p(
|
1027
1192
|
self,
|
1028
1193
|
alt_ac: str,
|
1029
1194
|
start_pos: int,
|
@@ -1033,21 +1198,22 @@ class MANETranscript:
|
|
1033
1198
|
try_longest_compatible: bool = False,
|
1034
1199
|
) -> Optional[Dict]:
|
1035
1200
|
"""Given GRCh38 genomic representation, return protein representation.
|
1201
|
+
|
1036
1202
|
Will try MANE Select and then MANE Plus Clinical. If neither is found and
|
1037
|
-
|
1203
|
+
``try_longest_compatible`` is set to ``True``, will also try to find the longest
|
1038
1204
|
compatible remaining representation.
|
1039
1205
|
|
1040
1206
|
:param alt_ac: Genomic RefSeq accession on GRCh38
|
1041
1207
|
:param start_pos: Start position
|
1042
1208
|
:param end_pos: End position
|
1043
1209
|
:param gene: HGNC gene symbol
|
1044
|
-
:param residue_mode: Starting residue mode for
|
1210
|
+
:param residue_mode: Starting residue mode for ``start_pos`` and ``end_pos``. Will
|
1045
1211
|
always return coordinates as inter-residue.
|
1046
|
-
:param try_longest_compatible:
|
1047
|
-
if mane transcript(s) not compatible.
|
1212
|
+
:param try_longest_compatible: ``True`` to try the longest compatible remaining
|
1213
|
+
if mane transcript(s) not compatible. ``False`` otherwise.
|
1048
1214
|
:return: If successful, return MANE data or longest compatible remaining (if
|
1049
|
-
|
1050
|
-
inter-residue coordinates.
|
1215
|
+
``try_longest_compatible`` set to ``True``) cDNA and protein representation.
|
1216
|
+
Will return inter-residue coordinates.
|
1051
1217
|
"""
|
1052
1218
|
# Step 1: Get MANE data to map to
|
1053
1219
|
if gene:
|
@@ -1061,12 +1227,7 @@ class MANETranscript:
|
|
1061
1227
|
return None
|
1062
1228
|
|
1063
1229
|
# Step 2: Get inter-residue position
|
1064
|
-
|
1065
|
-
start_pos, residue_mode, end_pos=end_pos
|
1066
|
-
)
|
1067
|
-
if not inter_residue_pos:
|
1068
|
-
return None
|
1069
|
-
start_pos, end_pos = inter_residue_pos
|
1230
|
+
start_pos, end_pos = get_inter_residue_pos(start_pos, end_pos, residue_mode)
|
1070
1231
|
residue_mode = ResidueMode.INTER_RESIDUE
|
1071
1232
|
|
1072
1233
|
# Step 3: Try getting MANE protein representation
|
@@ -1084,6 +1245,7 @@ class MANETranscript:
|
|
1084
1245
|
|
1085
1246
|
# Get MANE C positions
|
1086
1247
|
coding_start_site = mane_tx_genomic_data["coding_start_site"]
|
1248
|
+
coding_end_site = mane_tx_genomic_data["coding_end_site"]
|
1087
1249
|
mane_c_pos_change = self.get_mane_c_pos_change(
|
1088
1250
|
mane_tx_genomic_data, coding_start_site
|
1089
1251
|
)
|
@@ -1098,8 +1260,21 @@ class MANETranscript:
|
|
1098
1260
|
)
|
1099
1261
|
continue
|
1100
1262
|
|
1101
|
-
|
1102
|
-
|
1263
|
+
return ProteinAndCdnaRepresentation(
|
1264
|
+
protein=self._get_mane_p(current_mane_data, mane_c_pos_change),
|
1265
|
+
cdna=self._get_c_data(
|
1266
|
+
(coding_start_site, coding_end_site),
|
1267
|
+
mane_c_pos_change,
|
1268
|
+
mane_tx_genomic_data["strand"],
|
1269
|
+
TranscriptPriority(
|
1270
|
+
"_".join(current_mane_data["MANE_status"].split()).lower()
|
1271
|
+
),
|
1272
|
+
mane_c_ac,
|
1273
|
+
alt_ac=alt_ac,
|
1274
|
+
ensembl_c_ac=current_mane_data["Ensembl_nuc"],
|
1275
|
+
gene=current_mane_data["symbol"],
|
1276
|
+
),
|
1277
|
+
)
|
1103
1278
|
|
1104
1279
|
if try_longest_compatible:
|
1105
1280
|
return await self.get_longest_compatible_transcript(
|
@@ -1108,7 +1283,7 @@ class MANETranscript:
|
|
1108
1283
|
AnnotationLayer.GENOMIC,
|
1109
1284
|
residue_mode=residue_mode,
|
1110
1285
|
alt_ac=alt_ac,
|
1111
|
-
end_annotation_layer=
|
1286
|
+
end_annotation_layer=EndAnnotationLayer.PROTEIN_AND_CDNA,
|
1112
1287
|
mane_transcripts=mane_transcripts,
|
1113
1288
|
)
|
1114
1289
|
else:
|