cool-seq-tool 0.4.0.dev3__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cool_seq_tool/__init__.py +7 -11
- cool_seq_tool/app.py +44 -24
- cool_seq_tool/handlers/__init__.py +1 -0
- cool_seq_tool/handlers/seqrepo_access.py +27 -25
- cool_seq_tool/mappers/__init__.py +3 -1
- cool_seq_tool/mappers/alignment.py +5 -6
- cool_seq_tool/mappers/exon_genomic_coords.py +139 -124
- cool_seq_tool/mappers/liftover.py +90 -0
- cool_seq_tool/mappers/mane_transcript.py +208 -113
- cool_seq_tool/resources/__init__.py +1 -0
- cool_seq_tool/resources/data_files.py +93 -0
- cool_seq_tool/resources/status.py +153 -0
- cool_seq_tool/schemas.py +92 -54
- cool_seq_tool/sources/__init__.py +1 -0
- cool_seq_tool/sources/mane_transcript_mappings.py +16 -9
- cool_seq_tool/sources/transcript_mappings.py +41 -32
- cool_seq_tool/sources/uta_database.py +96 -249
- cool_seq_tool/utils.py +44 -4
- {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.5.0.dist-info}/LICENSE +1 -1
- {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.5.0.dist-info}/METADATA +16 -11
- cool_seq_tool-0.5.0.dist-info/RECORD +24 -0
- {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.5.0.dist-info}/WHEEL +1 -1
- cool_seq_tool/api.py +0 -42
- cool_seq_tool/data/__init__.py +0 -2
- cool_seq_tool/data/data_downloads.py +0 -89
- cool_seq_tool/paths.py +0 -28
- cool_seq_tool/routers/__init__.py +0 -16
- cool_seq_tool/routers/default.py +0 -125
- cool_seq_tool/routers/mane.py +0 -98
- cool_seq_tool/routers/mappings.py +0 -155
- cool_seq_tool/version.py +0 -2
- cool_seq_tool-0.4.0.dev3.dist-info/RECORD +0 -29
- /cool_seq_tool/{data → resources}/transcript_mapping.tsv +0 -0
- {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.5.0.dist-info}/top_level.txt +0 -0
@@ -11,15 +11,17 @@ Steps:
|
|
11
11
|
In addition to a mapper utility class, this module also defines several vocabulary
|
12
12
|
constraints and data models for coordinate representation.
|
13
13
|
"""
|
14
|
+
|
14
15
|
import logging
|
15
16
|
import math
|
16
17
|
from enum import Enum
|
17
|
-
from typing import
|
18
|
+
from typing import Literal
|
18
19
|
|
19
20
|
import polars as pl
|
20
21
|
from pydantic import BaseModel
|
21
22
|
|
22
23
|
from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess
|
24
|
+
from cool_seq_tool.mappers.liftover import LiftOver
|
23
25
|
from cool_seq_tool.schemas import (
|
24
26
|
AnnotationLayer,
|
25
27
|
Assembly,
|
@@ -34,7 +36,7 @@ from cool_seq_tool.sources import (
|
|
34
36
|
)
|
35
37
|
from cool_seq_tool.utils import get_inter_residue_pos
|
36
38
|
|
37
|
-
|
39
|
+
_logger = logging.getLogger(__name__)
|
38
40
|
|
39
41
|
|
40
42
|
class EndAnnotationLayer(str, Enum):
|
@@ -50,10 +52,10 @@ class EndAnnotationLayer(str, Enum):
|
|
50
52
|
class DataRepresentation(BaseModel):
|
51
53
|
"""Define object model for final output representation"""
|
52
54
|
|
53
|
-
gene:
|
55
|
+
gene: str | None = None
|
54
56
|
refseq: str
|
55
|
-
ensembl:
|
56
|
-
pos:
|
57
|
+
ensembl: str | None = None
|
58
|
+
pos: tuple[int, int]
|
57
59
|
strand: Strand
|
58
60
|
status: TranscriptPriority
|
59
61
|
|
@@ -63,14 +65,14 @@ class CdnaRepresentation(DataRepresentation):
|
|
63
65
|
|
64
66
|
coding_start_site: int
|
65
67
|
coding_end_site: int
|
66
|
-
alt_ac:
|
68
|
+
alt_ac: str | None = None
|
67
69
|
|
68
70
|
|
69
71
|
class GenomicRepresentation(BaseModel):
|
70
72
|
"""Define object model for genomic representation"""
|
71
73
|
|
72
74
|
refseq: str
|
73
|
-
pos:
|
75
|
+
pos: tuple[int, int]
|
74
76
|
status: TranscriptPriority
|
75
77
|
alt_ac: str
|
76
78
|
|
@@ -91,6 +93,7 @@ class ManeTranscript:
|
|
91
93
|
transcript_mappings: TranscriptMappings,
|
92
94
|
mane_transcript_mappings: ManeTranscriptMappings,
|
93
95
|
uta_db: UtaDatabase,
|
96
|
+
liftover: LiftOver,
|
94
97
|
) -> None:
|
95
98
|
"""Initialize the ManeTranscript class.
|
96
99
|
|
@@ -105,7 +108,7 @@ class ManeTranscript:
|
|
105
108
|
|
106
109
|
>>> import asyncio
|
107
110
|
>>> result = asyncio.run(mane_mapper.g_to_grch38("NC_000001.11", 100, 200))
|
108
|
-
>>> result[
|
111
|
+
>>> result["ac"]
|
109
112
|
'NC_000001.11'
|
110
113
|
|
111
114
|
See the :ref:`Usage section <async_note>` for more information.
|
@@ -116,11 +119,13 @@ class ManeTranscript:
|
|
116
119
|
:param mane_transcript_mappings: Access to MANE Transcript accession mapping
|
117
120
|
data
|
118
121
|
:param uta_db: UtaDatabase instance to give access to query UTA database
|
122
|
+
:param liftover: Instance to provide mapping between human genome assemblies
|
119
123
|
"""
|
120
124
|
self.seqrepo_access = seqrepo_access
|
121
125
|
self.transcript_mappings = transcript_mappings
|
122
126
|
self.mane_transcript_mappings = mane_transcript_mappings
|
123
127
|
self.uta_db = uta_db
|
128
|
+
self.liftover = liftover
|
124
129
|
|
125
130
|
@staticmethod
|
126
131
|
def _get_reading_frame(pos: int) -> int:
|
@@ -135,7 +140,7 @@ class ManeTranscript:
|
|
135
140
|
return pos_mod_3
|
136
141
|
|
137
142
|
@staticmethod
|
138
|
-
def _p_to_c_pos(start: int, end: int) ->
|
143
|
+
def _p_to_c_pos(start: int, end: int) -> tuple[int, int]:
|
139
144
|
"""Return cDNA position given a protein position.
|
140
145
|
|
141
146
|
:param start: Start protein position. Inter-residue coordinates
|
@@ -148,7 +153,7 @@ class ManeTranscript:
|
|
148
153
|
|
149
154
|
async def _p_to_c(
|
150
155
|
self, ac: str, start_pos: int, end_pos: int
|
151
|
-
) ->
|
156
|
+
) -> tuple[str, tuple[int, int]] | None:
|
152
157
|
"""Convert protein (p.) annotation to cDNA (c.) annotation.
|
153
158
|
|
154
159
|
:param ac: Protein accession
|
@@ -167,16 +172,16 @@ class ManeTranscript:
|
|
167
172
|
elif ac.startswith("ENSP"):
|
168
173
|
ac = self.transcript_mappings.ensp_to_enst[ac]
|
169
174
|
else:
|
170
|
-
|
175
|
+
_logger.warning("Unable to find accession: %s", ac)
|
171
176
|
return None
|
172
177
|
except KeyError:
|
173
|
-
|
178
|
+
_logger.warning("%s not found in transcript_mappings", ac)
|
174
179
|
return None
|
175
180
|
|
176
181
|
pos = self._p_to_c_pos(start_pos, end_pos)
|
177
182
|
return ac, pos
|
178
183
|
|
179
|
-
async def _c_to_g(self, ac: str, pos:
|
184
|
+
async def _c_to_g(self, ac: str, pos: tuple[int, int]) -> dict | None:
|
180
185
|
"""Get g. annotation from c. annotation.
|
181
186
|
|
182
187
|
:param ac: cDNA accession
|
@@ -195,7 +200,7 @@ class ManeTranscript:
|
|
195
200
|
0
|
196
201
|
]
|
197
202
|
):
|
198
|
-
|
203
|
+
_logger.warning("Ensembl transcript not found: %s", ac)
|
199
204
|
return None
|
200
205
|
|
201
206
|
temp_ac = ac.split(".")[0]
|
@@ -205,7 +210,7 @@ class ManeTranscript:
|
|
205
210
|
# c. coordinate does not contain cds start, so we need to add it
|
206
211
|
cds_start_end = await self.uta_db.get_cds_start_end(temp_ac)
|
207
212
|
if not cds_start_end:
|
208
|
-
|
213
|
+
_logger.warning("Accession %s not found in UTA", temp_ac)
|
209
214
|
return None
|
210
215
|
coding_start_site = cds_start_end[0]
|
211
216
|
pos = pos[0] + coding_start_site, pos[1] + coding_start_site
|
@@ -214,16 +219,108 @@ class ManeTranscript:
|
|
214
219
|
ac, pos, AnnotationLayer.CDNA, coding_start_site=coding_start_site
|
215
220
|
)
|
216
221
|
|
222
|
+
async def _liftover_to_38(self, genomic_tx_data: dict) -> None:
|
223
|
+
"""Liftover genomic_tx_data to hg38 assembly.
|
224
|
+
|
225
|
+
:param genomic_tx_data: Dictionary containing gene, nc_accession, alt_pos, and
|
226
|
+
strand. This will be mutated in-place if not GRCh38 assembly.
|
227
|
+
"""
|
228
|
+
descr = await self.uta_db.get_chr_assembly(genomic_tx_data["alt_ac"])
|
229
|
+
if descr is None:
|
230
|
+
# already grch38
|
231
|
+
return
|
232
|
+
chromosome, _ = descr
|
233
|
+
|
234
|
+
query = f"""
|
235
|
+
SELECT DISTINCT alt_ac
|
236
|
+
FROM {self.uta_db.schema}.tx_exon_aln_v
|
237
|
+
WHERE tx_ac = '{genomic_tx_data['tx_ac']}';
|
238
|
+
""" # noqa: S608
|
239
|
+
nc_acs = await self.uta_db.execute_query(query)
|
240
|
+
nc_acs = [nc_ac[0] for nc_ac in nc_acs]
|
241
|
+
if nc_acs == [genomic_tx_data["alt_ac"]]:
|
242
|
+
_logger.warning(
|
243
|
+
"UTA does not have GRCh38 assembly for %s",
|
244
|
+
genomic_tx_data["alt_ac"].split(".")[0],
|
245
|
+
)
|
246
|
+
return
|
247
|
+
|
248
|
+
# Get most recent assembly version position
|
249
|
+
# Liftover range
|
250
|
+
self._set_liftover(
|
251
|
+
genomic_tx_data, "alt_pos_range", chromosome, Assembly.GRCH38
|
252
|
+
)
|
253
|
+
|
254
|
+
# Liftover changes range
|
255
|
+
self._set_liftover(
|
256
|
+
genomic_tx_data, "alt_pos_change_range", chromosome, Assembly.GRCH38
|
257
|
+
)
|
258
|
+
|
259
|
+
# Change alt_ac to most recent
|
260
|
+
if genomic_tx_data["alt_ac"].startswith("EN"):
|
261
|
+
order_by_cond = "ORDER BY alt_ac DESC;"
|
262
|
+
else:
|
263
|
+
order_by_cond = """
|
264
|
+
ORDER BY CAST(SUBSTR(alt_ac, position('.' in alt_ac) + 1,
|
265
|
+
LENGTH(alt_ac)) AS INT) DESC;
|
266
|
+
"""
|
267
|
+
query = f"""
|
268
|
+
SELECT alt_ac
|
269
|
+
FROM {self.uta_db.schema}.genomic
|
270
|
+
WHERE alt_ac LIKE '{genomic_tx_data['alt_ac'].split('.')[0]}%'
|
271
|
+
{order_by_cond}
|
272
|
+
""" # noqa: S608
|
273
|
+
nc_acs = await self.uta_db.execute_query(query)
|
274
|
+
genomic_tx_data["alt_ac"] = nc_acs[0][0]
|
275
|
+
|
276
|
+
def _set_liftover(
|
277
|
+
self,
|
278
|
+
genomic_tx_data: dict,
|
279
|
+
key: str,
|
280
|
+
chromosome: str,
|
281
|
+
liftover_to_assembly: Assembly,
|
282
|
+
) -> None:
|
283
|
+
"""Update genomic_tx_data to have coordinates for given assembly.
|
284
|
+
|
285
|
+
:param genomic_tx_data: Dictionary containing gene, nc_accession, alt_pos, and
|
286
|
+
strand
|
287
|
+
:param key: Key to access coordinate positions
|
288
|
+
:param chromosome: Chromosome, must be prefixed with ``chr``
|
289
|
+
:param liftover_to_assembly: Assembly to liftover to
|
290
|
+
"""
|
291
|
+
liftover_start_i = self.liftover.get_liftover(
|
292
|
+
chromosome, genomic_tx_data[key][0], liftover_to_assembly
|
293
|
+
)
|
294
|
+
if liftover_start_i is None:
|
295
|
+
_logger.warning(
|
296
|
+
"Unable to liftover position %s on %s",
|
297
|
+
genomic_tx_data[key][0],
|
298
|
+
chromosome,
|
299
|
+
)
|
300
|
+
return
|
301
|
+
|
302
|
+
liftover_end_i = self.liftover.get_liftover(
|
303
|
+
chromosome, genomic_tx_data[key][1], liftover_to_assembly
|
304
|
+
)
|
305
|
+
if liftover_end_i is None:
|
306
|
+
_logger.warning(
|
307
|
+
"Unable to liftover position %s on %s",
|
308
|
+
genomic_tx_data[key][1],
|
309
|
+
chromosome,
|
310
|
+
)
|
311
|
+
return
|
312
|
+
|
313
|
+
genomic_tx_data[key] = liftover_start_i[1], liftover_end_i[1]
|
314
|
+
|
217
315
|
async def _get_and_validate_genomic_tx_data(
|
218
316
|
self,
|
219
317
|
tx_ac: str,
|
220
|
-
pos:
|
221
|
-
annotation_layer:
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
) -> Optional[Dict]:
|
318
|
+
pos: tuple[int, int],
|
319
|
+
annotation_layer: Literal[AnnotationLayer.CDNA]
|
320
|
+
| Literal[AnnotationLayer.GENOMIC] = AnnotationLayer.CDNA,
|
321
|
+
coding_start_site: int | None = None,
|
322
|
+
alt_ac: str | None = None,
|
323
|
+
) -> dict | None:
|
227
324
|
"""Get and validate genomic_tx_data
|
228
325
|
|
229
326
|
:param tx_ac: Accession on c. coordinate
|
@@ -237,7 +334,7 @@ class ManeTranscript:
|
|
237
334
|
tx_ac, pos, annotation_layer, alt_ac=alt_ac
|
238
335
|
)
|
239
336
|
if not genomic_tx_data:
|
240
|
-
|
337
|
+
_logger.warning(
|
241
338
|
"Unable to find genomic_tx_data for %s at position %s on annotation layer %s",
|
242
339
|
alt_ac,
|
243
340
|
pos,
|
@@ -250,12 +347,12 @@ class ManeTranscript:
|
|
250
347
|
# Only want to liftover if alt_ac not provided. If alt_ac is provided,
|
251
348
|
# it means user wants to stick with the queried assembly
|
252
349
|
og_alt_exon_id = genomic_tx_data["alt_exon_id"]
|
253
|
-
await self.
|
350
|
+
await self._liftover_to_38(genomic_tx_data)
|
254
351
|
liftover_alt_exon_id = genomic_tx_data["alt_exon_id"]
|
255
352
|
|
256
353
|
# Validation check: Exon structure
|
257
354
|
if og_alt_exon_id != liftover_alt_exon_id:
|
258
|
-
|
355
|
+
_logger.warning(
|
259
356
|
"Original alt_exon_id %s does not match liftover alt_exon_id %s",
|
260
357
|
og_alt_exon_id,
|
261
358
|
liftover_alt_exon_id,
|
@@ -266,14 +363,14 @@ class ManeTranscript:
|
|
266
363
|
|
267
364
|
@staticmethod
|
268
365
|
def _get_c_data(
|
269
|
-
cds_start_end:
|
270
|
-
c_pos_change:
|
366
|
+
cds_start_end: tuple[int, int],
|
367
|
+
c_pos_change: tuple[int, int],
|
271
368
|
strand: Strand,
|
272
369
|
status: TranscriptPriority,
|
273
370
|
refseq_c_ac: str,
|
274
|
-
gene:
|
275
|
-
ensembl_c_ac:
|
276
|
-
alt_ac:
|
371
|
+
gene: str | None = None,
|
372
|
+
ensembl_c_ac: str | None = None,
|
373
|
+
alt_ac: str | None = None,
|
277
374
|
) -> CdnaRepresentation:
|
278
375
|
"""Return transcript data on c. coordinate.
|
279
376
|
|
@@ -293,7 +390,7 @@ class ManeTranscript:
|
|
293
390
|
gt_cds_end = c_pos_change[1] > cds_end and c_pos_change[1] > cds_end
|
294
391
|
|
295
392
|
if lt_cds_start or gt_cds_end:
|
296
|
-
|
393
|
+
_logger.info(
|
297
394
|
"%s with position %s is not within CDS start/end",
|
298
395
|
refseq_c_ac,
|
299
396
|
c_pos_change,
|
@@ -311,7 +408,7 @@ class ManeTranscript:
|
|
311
408
|
alt_ac=alt_ac,
|
312
409
|
)
|
313
410
|
|
314
|
-
def _c_to_p_pos(self, c_pos:
|
411
|
+
def _c_to_p_pos(self, c_pos: tuple[int, int]) -> tuple[int, int]:
|
315
412
|
"""Get protein position from cdna position
|
316
413
|
|
317
414
|
:param c_pos: cdna position. inter-residue coordinates
|
@@ -325,7 +422,7 @@ class ManeTranscript:
|
|
325
422
|
return start, end
|
326
423
|
|
327
424
|
def _get_mane_p(
|
328
|
-
self, mane_data:
|
425
|
+
self, mane_data: dict, mane_c_pos_range: tuple[int, int]
|
329
426
|
) -> DataRepresentation:
|
330
427
|
"""Translate MANE Transcript c. annotation to p. annotation
|
331
428
|
|
@@ -349,13 +446,13 @@ class ManeTranscript:
|
|
349
446
|
|
350
447
|
async def _g_to_c(
|
351
448
|
self,
|
352
|
-
g:
|
449
|
+
g: dict,
|
353
450
|
refseq_c_ac: str,
|
354
451
|
status: TranscriptPriority,
|
355
|
-
ensembl_c_ac:
|
356
|
-
alt_ac:
|
452
|
+
ensembl_c_ac: str | None = None,
|
453
|
+
alt_ac: str | None = None,
|
357
454
|
found_result: bool = False,
|
358
|
-
) ->
|
455
|
+
) -> CdnaRepresentation | None:
|
359
456
|
"""Get transcript c. annotation data from g. annotation.
|
360
457
|
|
361
458
|
:param g: Genomic data
|
@@ -381,7 +478,7 @@ class ManeTranscript:
|
|
381
478
|
)
|
382
479
|
|
383
480
|
if not result:
|
384
|
-
|
481
|
+
_logger.warning(
|
385
482
|
"Unable to find transcript, %s, position change", refseq_c_ac
|
386
483
|
)
|
387
484
|
return None
|
@@ -438,7 +535,7 @@ class ManeTranscript:
|
|
438
535
|
new_rf = self._get_reading_frame(transcript_data.pos[pos_index])
|
439
536
|
|
440
537
|
if og_rf != new_rf:
|
441
|
-
|
538
|
+
_logger.warning(
|
442
539
|
"%s original reading frame (%s) does not match new %s, %s reading frame (%s)",
|
443
540
|
ac,
|
444
541
|
og_rf,
|
@@ -449,7 +546,7 @@ class ManeTranscript:
|
|
449
546
|
return False
|
450
547
|
else:
|
451
548
|
if pos_index == 0:
|
452
|
-
|
549
|
+
_logger.warning("%s must having start position", ac)
|
453
550
|
return False
|
454
551
|
return True
|
455
552
|
|
@@ -459,9 +556,9 @@ class ManeTranscript:
|
|
459
556
|
coding_start_site: int,
|
460
557
|
start_pos: int,
|
461
558
|
end_pos: int,
|
462
|
-
mane_transcript:
|
463
|
-
|
464
|
-
|
559
|
+
mane_transcript: DataRepresentation
|
560
|
+
| CdnaRepresentation
|
561
|
+
| GenomicRepresentation,
|
465
562
|
expected_ref: str,
|
466
563
|
anno: AnnotationLayer,
|
467
564
|
residue_mode: ResidueMode,
|
@@ -503,10 +600,10 @@ class ManeTranscript:
|
|
503
600
|
residue_mode=residue_mode,
|
504
601
|
)
|
505
602
|
if not mane_ref:
|
506
|
-
|
603
|
+
_logger.info("Unable to validate reference for MANE Transcript")
|
507
604
|
|
508
605
|
if expected_ref != mane_ref:
|
509
|
-
|
606
|
+
_logger.info(
|
510
607
|
"Expected ref, %s, but got %s on MANE accession, %s",
|
511
608
|
expected_ref,
|
512
609
|
mane_ref,
|
@@ -514,7 +611,7 @@ class ManeTranscript:
|
|
514
611
|
)
|
515
612
|
|
516
613
|
if expected_ref != ref:
|
517
|
-
|
614
|
+
_logger.warning(
|
518
615
|
"Expected ref, %s, but got %s on accession, %s", expected_ref, ref, ac
|
519
616
|
)
|
520
617
|
return False
|
@@ -522,7 +619,7 @@ class ManeTranscript:
|
|
522
619
|
return True
|
523
620
|
|
524
621
|
def _validate_index(
|
525
|
-
self, ac: str, pos:
|
622
|
+
self, ac: str, pos: tuple[int, int], coding_start_site: int
|
526
623
|
) -> bool:
|
527
624
|
"""Validate that positions actually exist on accession
|
528
625
|
|
@@ -533,13 +630,13 @@ class ManeTranscript:
|
|
533
630
|
"""
|
534
631
|
start_pos = pos[0] + coding_start_site
|
535
632
|
end_pos = pos[1] + coding_start_site
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
633
|
+
return bool(
|
634
|
+
self.seqrepo_access.get_reference_sequence(
|
635
|
+
ac, start=start_pos, end=end_pos, residue_mode=ResidueMode.INTER_RESIDUE
|
636
|
+
)[0]
|
637
|
+
)
|
541
638
|
|
542
|
-
def _get_prioritized_transcripts_from_gene(self, df: pl.DataFrame) ->
|
639
|
+
def _get_prioritized_transcripts_from_gene(self, df: pl.DataFrame) -> list:
|
543
640
|
"""Sort and filter transcripts from gene to get priority list
|
544
641
|
|
545
642
|
:param df: Data frame containing transcripts from gene
|
@@ -550,7 +647,7 @@ class ManeTranscript:
|
|
550
647
|
most recent version of a transcript associated with an assembly will be kept
|
551
648
|
"""
|
552
649
|
copy_df = df.clone()
|
553
|
-
copy_df = copy_df.drop(
|
650
|
+
copy_df = copy_df.drop("alt_ac").unique()
|
554
651
|
copy_df = copy_df.with_columns(
|
555
652
|
[
|
556
653
|
pl.col("tx_ac")
|
@@ -590,15 +687,13 @@ class ManeTranscript:
|
|
590
687
|
start_pos: int,
|
591
688
|
end_pos: int,
|
592
689
|
start_annotation_layer: AnnotationLayer,
|
593
|
-
gene:
|
594
|
-
ref:
|
690
|
+
gene: str | None = None,
|
691
|
+
ref: str | None = None,
|
595
692
|
residue_mode: ResidueMode = ResidueMode.RESIDUE,
|
596
|
-
mane_transcripts:
|
597
|
-
alt_ac:
|
598
|
-
end_annotation_layer:
|
599
|
-
) ->
|
600
|
-
Union[DataRepresentation, CdnaRepresentation, ProteinAndCdnaRepresentation]
|
601
|
-
]:
|
693
|
+
mane_transcripts: set | None = None,
|
694
|
+
alt_ac: str | None = None,
|
695
|
+
end_annotation_layer: EndAnnotationLayer | None = None,
|
696
|
+
) -> DataRepresentation | CdnaRepresentation | ProteinAndCdnaRepresentation | None:
|
602
697
|
"""Get longest compatible transcript from a gene. See the documentation for
|
603
698
|
the :ref:`transcript compatibility policy <transcript_compatibility>` for more
|
604
699
|
information.
|
@@ -613,14 +708,16 @@ class ManeTranscript:
|
|
613
708
|
... "NM_004333.6",
|
614
709
|
... "ENST00000644969.2",
|
615
710
|
... }
|
616
|
-
>>> result = asyncio.run(
|
617
|
-
...
|
618
|
-
...
|
619
|
-
...
|
620
|
-
...
|
621
|
-
...
|
622
|
-
...
|
623
|
-
...
|
711
|
+
>>> result = asyncio.run(
|
712
|
+
... mane_mapper.get_longest_compatible_transcript(
|
713
|
+
... 599,
|
714
|
+
... 599,
|
715
|
+
... gene="BRAF",
|
716
|
+
... start_annotation_layer=AnnotationLayer.PROTEIN,
|
717
|
+
... residue_mode=ResidueMode.INTER_RESIDUE,
|
718
|
+
... mane_transcripts=mane_transcripts,
|
719
|
+
... )
|
720
|
+
... )
|
624
721
|
>>> result.refseq
|
625
722
|
'NP_001365396.1'
|
626
723
|
|
@@ -645,9 +742,9 @@ class ManeTranscript:
|
|
645
742
|
"""
|
646
743
|
|
647
744
|
def _get_protein_rep(
|
648
|
-
gene:
|
745
|
+
gene: str | None,
|
649
746
|
pro_ac: str,
|
650
|
-
lcr_c_data_pos:
|
747
|
+
lcr_c_data_pos: tuple[int, int],
|
651
748
|
strand: Strand,
|
652
749
|
status: TranscriptPriority,
|
653
750
|
) -> DataRepresentation:
|
@@ -692,7 +789,7 @@ class ManeTranscript:
|
|
692
789
|
)
|
693
790
|
|
694
791
|
if df.is_empty():
|
695
|
-
|
792
|
+
_logger.warning("Unable to get transcripts from gene %s", gene)
|
696
793
|
return lcr_result
|
697
794
|
|
698
795
|
prioritized_tx_acs = self._get_prioritized_transcripts_from_gene(df)
|
@@ -731,7 +828,7 @@ class ManeTranscript:
|
|
731
828
|
|
732
829
|
# Get prioritized transcript data for gene
|
733
830
|
# grch38 -> c
|
734
|
-
lcr_c_data:
|
831
|
+
lcr_c_data: CdnaRepresentation | None = await self._g_to_c(
|
735
832
|
g=g,
|
736
833
|
refseq_c_ac=tx_ac,
|
737
834
|
status=TranscriptPriority.LONGEST_COMPATIBLE_REMAINING,
|
@@ -814,7 +911,7 @@ class ManeTranscript:
|
|
814
911
|
pos = lcr_result.pos
|
815
912
|
|
816
913
|
if not self._validate_index(ac, pos, coding_start_site):
|
817
|
-
|
914
|
+
_logger.warning(
|
818
915
|
"%s are not valid positions on %s with coding start site %s",
|
819
916
|
pos,
|
820
917
|
ac,
|
@@ -841,7 +938,7 @@ class ManeTranscript:
|
|
841
938
|
pos = lcr_result_dict[k]["pos"]
|
842
939
|
if not self._validate_index(ac, pos, cds):
|
843
940
|
valid = False
|
844
|
-
|
941
|
+
_logger.warning(
|
845
942
|
"%s are not valid positions on %s with coding start site %s",
|
846
943
|
pos,
|
847
944
|
ac,
|
@@ -859,25 +956,26 @@ class ManeTranscript:
|
|
859
956
|
start_pos: int,
|
860
957
|
end_pos: int,
|
861
958
|
start_annotation_layer: AnnotationLayer,
|
862
|
-
gene:
|
863
|
-
ref:
|
959
|
+
gene: str | None = None,
|
960
|
+
ref: str | None = None,
|
864
961
|
try_longest_compatible: bool = False,
|
865
|
-
residue_mode:
|
866
|
-
|
867
|
-
|
868
|
-
) -> Optional[Union[DataRepresentation, CdnaRepresentation]]:
|
962
|
+
residue_mode: Literal[ResidueMode.RESIDUE]
|
963
|
+
| Literal[ResidueMode.INTER_RESIDUE] = ResidueMode.RESIDUE,
|
964
|
+
) -> DataRepresentation | CdnaRepresentation | None:
|
869
965
|
"""Return MANE transcript.
|
870
966
|
|
871
967
|
>>> from cool_seq_tool.app import CoolSeqTool
|
872
968
|
>>> from cool_seq_tool.schemas import AnnotationLayer, ResidueMode
|
873
969
|
>>> import asyncio
|
874
970
|
>>> mane_mapper = CoolSeqTool().mane_transcript
|
875
|
-
>>> result = asyncio.run(
|
876
|
-
...
|
877
|
-
...
|
878
|
-
...
|
879
|
-
...
|
880
|
-
...
|
971
|
+
>>> result = asyncio.run(
|
972
|
+
... mane_mapper.get_mane_transcript(
|
973
|
+
... "NP_004324.2",
|
974
|
+
... 599,
|
975
|
+
... AnnotationLayer.PROTEIN,
|
976
|
+
... residue_mode=ResidueMode.INTER_RESIDUE,
|
977
|
+
... )
|
978
|
+
... )
|
881
979
|
>>> result.gene, result.refseq, result.status
|
882
980
|
('BRAF', 'NP_004324.2', <TranscriptPriority.MANE_SELECT: 'mane_select'>)
|
883
981
|
|
@@ -930,7 +1028,7 @@ class ManeTranscript:
|
|
930
1028
|
current_mane_data["RefSeq_nuc"],
|
931
1029
|
current_mane_data["Ensembl_nuc"],
|
932
1030
|
}
|
933
|
-
mane:
|
1031
|
+
mane: CdnaRepresentation | None = await self._g_to_c(
|
934
1032
|
g=g,
|
935
1033
|
refseq_c_ac=current_mane_data["RefSeq_nuc"],
|
936
1034
|
status=TranscriptPriority(
|
@@ -998,12 +1096,10 @@ class ManeTranscript:
|
|
998
1096
|
return await self.g_to_mane_c(
|
999
1097
|
ac, start_pos, end_pos, gene=gene, residue_mode=residue_mode
|
1000
1098
|
)
|
1001
|
-
|
1099
|
+
_logger.warning("Annotation layer not supported: %s", start_annotation_layer)
|
1002
1100
|
return None
|
1003
1101
|
|
1004
|
-
async def g_to_grch38(
|
1005
|
-
self, ac: str, start_pos: int, end_pos: int
|
1006
|
-
) -> Optional[Dict]:
|
1102
|
+
async def g_to_grch38(self, ac: str, start_pos: int, end_pos: int) -> dict | None:
|
1007
1103
|
"""Return genomic coordinate on GRCh38 when not given gene context.
|
1008
1104
|
|
1009
1105
|
:param ac: Genomic accession
|
@@ -1025,11 +1121,11 @@ class ManeTranscript:
|
|
1025
1121
|
is_same_pos = start_pos == end_pos
|
1026
1122
|
|
1027
1123
|
# Coordinate liftover
|
1028
|
-
if assembly <
|
1029
|
-
|
1124
|
+
if assembly < Assembly.GRCH37:
|
1125
|
+
_logger.warning("Liftover only supported for GRCh37")
|
1030
1126
|
return None
|
1031
1127
|
|
1032
|
-
liftover_start_i = self.
|
1128
|
+
liftover_start_i = self.liftover.get_liftover(
|
1033
1129
|
chromosome, start_pos, Assembly.GRCH38
|
1034
1130
|
)
|
1035
1131
|
if liftover_start_i is None:
|
@@ -1037,7 +1133,7 @@ class ManeTranscript:
|
|
1037
1133
|
start_pos = liftover_start_i[1]
|
1038
1134
|
|
1039
1135
|
if not is_same_pos:
|
1040
|
-
liftover_end_i = self.
|
1136
|
+
liftover_end_i = self.liftover.get_liftover(
|
1041
1137
|
chromosome, end_pos, Assembly.GRCH38
|
1042
1138
|
)
|
1043
1139
|
if liftover_end_i is None:
|
@@ -1055,8 +1151,8 @@ class ManeTranscript:
|
|
1055
1151
|
|
1056
1152
|
@staticmethod
|
1057
1153
|
def get_mane_c_pos_change(
|
1058
|
-
mane_tx_genomic_data:
|
1059
|
-
) ->
|
1154
|
+
mane_tx_genomic_data: dict, coding_start_site: int
|
1155
|
+
) -> tuple[int, int]:
|
1060
1156
|
"""Get mane c position change
|
1061
1157
|
|
1062
1158
|
:param mane_tx_genomic_data: MANE transcript and genomic data
|
@@ -1080,9 +1176,9 @@ class ManeTranscript:
|
|
1080
1176
|
ac: str,
|
1081
1177
|
start_pos: int,
|
1082
1178
|
end_pos: int,
|
1083
|
-
gene:
|
1179
|
+
gene: str | None = None,
|
1084
1180
|
residue_mode: ResidueMode = ResidueMode.RESIDUE,
|
1085
|
-
) ->
|
1181
|
+
) -> GenomicRepresentation | CdnaRepresentation | None:
|
1086
1182
|
"""Return MANE Transcript on the c. coordinate.
|
1087
1183
|
|
1088
1184
|
If an arg for ``gene`` is provided, lifts to GRCh38, then gets MANE cDNA
|
@@ -1091,12 +1187,11 @@ class ManeTranscript:
|
|
1091
1187
|
>>> import asyncio
|
1092
1188
|
>>> from cool_seq_tool.app import CoolSeqTool
|
1093
1189
|
>>> cst = CoolSeqTool()
|
1094
|
-
>>> result = asyncio.run(
|
1095
|
-
...
|
1096
|
-
...
|
1097
|
-
...
|
1098
|
-
...
|
1099
|
-
... ))
|
1190
|
+
>>> result = asyncio.run(
|
1191
|
+
... cst.mane_transcript.g_to_mane_c(
|
1192
|
+
... "NC_000007.13", 55259515, None, gene="EGFR"
|
1193
|
+
... )
|
1194
|
+
... )
|
1100
1195
|
>>> type(result)
|
1101
1196
|
<class 'cool_seq_tool.mappers.mane_transcript.CdnaRepresentation'>
|
1102
1197
|
>>> result.status
|
@@ -1132,7 +1227,7 @@ class ManeTranscript:
|
|
1132
1227
|
)
|
1133
1228
|
|
1134
1229
|
if not await self.uta_db.validate_genomic_ac(ac):
|
1135
|
-
|
1230
|
+
_logger.warning("Genomic accession does not exist: %s", ac)
|
1136
1231
|
return None
|
1137
1232
|
|
1138
1233
|
mane_data = self.mane_transcript_mappings.get_gene_mane_data(gene)
|
@@ -1158,7 +1253,7 @@ class ManeTranscript:
|
|
1158
1253
|
)
|
1159
1254
|
if not mane_tx_genomic_data:
|
1160
1255
|
continue
|
1161
|
-
|
1256
|
+
_logger.info("Not using most recent assembly")
|
1162
1257
|
|
1163
1258
|
coding_start_site = mane_tx_genomic_data["coding_start_site"]
|
1164
1259
|
coding_end_site = mane_tx_genomic_data["coding_end_site"]
|
@@ -1169,7 +1264,7 @@ class ManeTranscript:
|
|
1169
1264
|
if not self._validate_index(
|
1170
1265
|
mane_c_ac, mane_c_pos_change, coding_start_site
|
1171
1266
|
):
|
1172
|
-
|
1267
|
+
_logger.warning(
|
1173
1268
|
"%s are not valid positions on %s with coding start site %s",
|
1174
1269
|
mane_c_pos_change,
|
1175
1270
|
mane_c_ac,
|
@@ -1198,10 +1293,10 @@ class ManeTranscript:
|
|
1198
1293
|
alt_ac: str,
|
1199
1294
|
start_pos: int,
|
1200
1295
|
end_pos: int,
|
1201
|
-
gene:
|
1296
|
+
gene: str | None = None,
|
1202
1297
|
residue_mode: ResidueMode = ResidueMode.RESIDUE,
|
1203
1298
|
try_longest_compatible: bool = False,
|
1204
|
-
) ->
|
1299
|
+
) -> dict | None:
|
1205
1300
|
"""Given GRCh38 genomic representation, return protein representation.
|
1206
1301
|
|
1207
1302
|
Will try MANE Select and then MANE Plus Clinical. If neither is found and
|
@@ -1259,7 +1354,7 @@ class ManeTranscript:
|
|
1259
1354
|
if not self._validate_index(
|
1260
1355
|
mane_c_ac, mane_c_pos_change, coding_start_site
|
1261
1356
|
):
|
1262
|
-
|
1357
|
+
_logger.warning(
|
1263
1358
|
"%s are not valid positions on %s with coding start site %s",
|
1264
1359
|
mane_c_pos_change,
|
1265
1360
|
mane_c_ac,
|