cool-seq-tool 0.5.1__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cool_seq_tool/__init__.py +6 -0
- cool_seq_tool/app.py +1 -2
- cool_seq_tool/handlers/seqrepo_access.py +5 -5
- cool_seq_tool/mappers/alignment.py +16 -16
- cool_seq_tool/mappers/exon_genomic_coords.py +845 -628
- cool_seq_tool/mappers/mane_transcript.py +184 -152
- cool_seq_tool/schemas.py +30 -438
- cool_seq_tool/sources/mane_transcript_mappings.py +35 -0
- cool_seq_tool/sources/uta_database.py +149 -229
- cool_seq_tool/utils.py +9 -9
- {cool_seq_tool-0.5.1.dist-info → cool_seq_tool-0.7.0.dist-info}/METADATA +8 -8
- cool_seq_tool-0.7.0.dist-info/RECORD +24 -0
- {cool_seq_tool-0.5.1.dist-info → cool_seq_tool-0.7.0.dist-info}/WHEEL +1 -1
- cool_seq_tool-0.5.1.dist-info/RECORD +0 -24
- {cool_seq_tool-0.5.1.dist-info → cool_seq_tool-0.7.0.dist-info}/LICENSE +0 -0
- {cool_seq_tool-0.5.1.dist-info → cool_seq_tool-0.7.0.dist-info}/top_level.txt +0 -0
@@ -25,7 +25,9 @@ from cool_seq_tool.mappers.liftover import LiftOver
|
|
25
25
|
from cool_seq_tool.schemas import (
|
26
26
|
AnnotationLayer,
|
27
27
|
Assembly,
|
28
|
-
|
28
|
+
CoordinateType,
|
29
|
+
GenomicTxMetadata,
|
30
|
+
ManeGeneData,
|
29
31
|
Strand,
|
30
32
|
TranscriptPriority,
|
31
33
|
)
|
@@ -71,10 +73,10 @@ class CdnaRepresentation(DataRepresentation):
|
|
71
73
|
class GenomicRepresentation(BaseModel):
|
72
74
|
"""Define object model for genomic representation"""
|
73
75
|
|
74
|
-
refseq: str
|
75
76
|
pos: tuple[int, int]
|
76
|
-
|
77
|
-
|
77
|
+
mane_genes: list[ManeGeneData] = []
|
78
|
+
status: Literal["grch38"] = TranscriptPriority.GRCH38.value
|
79
|
+
ac: str
|
78
80
|
|
79
81
|
|
80
82
|
class ProteinAndCdnaRepresentation(BaseModel):
|
@@ -100,7 +102,7 @@ class ManeTranscript:
|
|
100
102
|
A handful of resources are required for initialization, so when defaults are
|
101
103
|
enough, it's easiest to let the core CoolSeqTool class handle it for you:
|
102
104
|
|
103
|
-
>>> from cool_seq_tool
|
105
|
+
>>> from cool_seq_tool import CoolSeqTool
|
104
106
|
>>> mane_mapper = CoolSeqTool().mane_transcript
|
105
107
|
|
106
108
|
Note that most methods are defined as Python coroutines, so they must be called
|
@@ -108,7 +110,7 @@ class ManeTranscript:
|
|
108
110
|
|
109
111
|
>>> import asyncio
|
110
112
|
>>> result = asyncio.run(mane_mapper.g_to_grch38("NC_000001.11", 100, 200))
|
111
|
-
>>> result
|
113
|
+
>>> result.ac
|
112
114
|
'NC_000001.11'
|
113
115
|
|
114
116
|
See the :ref:`Usage section <async_note>` for more information.
|
@@ -128,7 +130,7 @@ class ManeTranscript:
|
|
128
130
|
self.liftover = liftover
|
129
131
|
|
130
132
|
@staticmethod
|
131
|
-
def
|
133
|
+
def get_reading_frame(pos: int) -> int:
|
132
134
|
"""Return reading frame number. Only used on c. coordinate.
|
133
135
|
|
134
136
|
:param pos: cDNA position
|
@@ -181,13 +183,12 @@ class ManeTranscript:
|
|
181
183
|
pos = self._p_to_c_pos(start_pos, end_pos)
|
182
184
|
return ac, pos
|
183
185
|
|
184
|
-
async def _c_to_g(self, ac: str, pos: tuple[int, int]) ->
|
186
|
+
async def _c_to_g(self, ac: str, pos: tuple[int, int]) -> GenomicTxMetadata | None:
|
185
187
|
"""Get g. annotation from c. annotation.
|
186
188
|
|
187
189
|
:param ac: cDNA accession
|
188
190
|
:param pos: [cDNA pos start, cDNA pos end]
|
189
|
-
:return:
|
190
|
-
Altered transcript accession and position change, Strand
|
191
|
+
:return: Metadata for genomic and transcript accessions
|
191
192
|
"""
|
192
193
|
# UTA does not store ENST versions
|
193
194
|
# So we want to make sure version is valid
|
@@ -219,13 +220,13 @@ class ManeTranscript:
|
|
219
220
|
ac, pos, AnnotationLayer.CDNA, coding_start_site=coding_start_site
|
220
221
|
)
|
221
222
|
|
222
|
-
async def _liftover_to_38(self, genomic_tx_data:
|
223
|
+
async def _liftover_to_38(self, genomic_tx_data: GenomicTxMetadata) -> None:
|
223
224
|
"""Liftover genomic_tx_data to hg38 assembly.
|
224
225
|
|
225
|
-
:param genomic_tx_data:
|
226
|
-
|
226
|
+
:param genomic_tx_data: Metadata for genomic and transcript accessions. This
|
227
|
+
will be mutated in-place if not GRCh38 assembly.
|
227
228
|
"""
|
228
|
-
descr = await self.uta_db.get_chr_assembly(genomic_tx_data
|
229
|
+
descr = await self.uta_db.get_chr_assembly(genomic_tx_data.alt_ac)
|
229
230
|
if descr is None:
|
230
231
|
# already grch38
|
231
232
|
return
|
@@ -234,14 +235,14 @@ class ManeTranscript:
|
|
234
235
|
query = f"""
|
235
236
|
SELECT DISTINCT alt_ac
|
236
237
|
FROM {self.uta_db.schema}.tx_exon_aln_v
|
237
|
-
WHERE tx_ac = '{genomic_tx_data
|
238
|
+
WHERE tx_ac = '{genomic_tx_data.tx_ac}';
|
238
239
|
""" # noqa: S608
|
239
240
|
nc_acs = await self.uta_db.execute_query(query)
|
240
241
|
nc_acs = [nc_ac[0] for nc_ac in nc_acs]
|
241
|
-
if nc_acs == [genomic_tx_data
|
242
|
+
if nc_acs == [genomic_tx_data.alt_ac]:
|
242
243
|
_logger.warning(
|
243
244
|
"UTA does not have GRCh38 assembly for %s",
|
244
|
-
genomic_tx_data
|
245
|
+
genomic_tx_data.alt_ac.split(".")[0],
|
245
246
|
)
|
246
247
|
return
|
247
248
|
|
@@ -257,7 +258,7 @@ class ManeTranscript:
|
|
257
258
|
)
|
258
259
|
|
259
260
|
# Change alt_ac to most recent
|
260
|
-
if genomic_tx_data
|
261
|
+
if genomic_tx_data.alt_ac.startswith("EN"):
|
261
262
|
order_by_cond = "ORDER BY alt_ac DESC;"
|
262
263
|
else:
|
263
264
|
order_by_cond = """
|
@@ -267,50 +268,49 @@ class ManeTranscript:
|
|
267
268
|
query = f"""
|
268
269
|
SELECT alt_ac
|
269
270
|
FROM {self.uta_db.schema}.genomic
|
270
|
-
WHERE alt_ac LIKE '{genomic_tx_data
|
271
|
+
WHERE alt_ac LIKE '{genomic_tx_data.alt_ac.split('.')[0]}%'
|
271
272
|
{order_by_cond}
|
272
273
|
""" # noqa: S608
|
273
274
|
nc_acs = await self.uta_db.execute_query(query)
|
274
|
-
genomic_tx_data
|
275
|
+
genomic_tx_data.alt_ac = nc_acs[0][0]
|
275
276
|
|
276
277
|
def _set_liftover(
|
277
278
|
self,
|
278
|
-
genomic_tx_data:
|
279
|
+
genomic_tx_data: GenomicTxMetadata,
|
279
280
|
key: str,
|
280
281
|
chromosome: str,
|
281
282
|
liftover_to_assembly: Assembly,
|
282
283
|
) -> None:
|
283
284
|
"""Update genomic_tx_data to have coordinates for given assembly.
|
284
285
|
|
285
|
-
:param genomic_tx_data:
|
286
|
-
strand
|
286
|
+
:param genomic_tx_data: Metadata for genomic and transcript accessions
|
287
287
|
:param key: Key to access coordinate positions
|
288
288
|
:param chromosome: Chromosome, must be prefixed with ``chr``
|
289
289
|
:param liftover_to_assembly: Assembly to liftover to
|
290
290
|
"""
|
291
|
+
coords = getattr(genomic_tx_data, key)
|
291
292
|
liftover_start_i = self.liftover.get_liftover(
|
292
|
-
chromosome,
|
293
|
+
chromosome, coords[0], liftover_to_assembly
|
293
294
|
)
|
294
295
|
if liftover_start_i is None:
|
295
296
|
_logger.warning(
|
296
297
|
"Unable to liftover position %s on %s",
|
297
|
-
|
298
|
+
coords[0],
|
298
299
|
chromosome,
|
299
300
|
)
|
300
301
|
return
|
301
302
|
|
302
303
|
liftover_end_i = self.liftover.get_liftover(
|
303
|
-
chromosome,
|
304
|
+
chromosome, coords[1], liftover_to_assembly
|
304
305
|
)
|
305
306
|
if liftover_end_i is None:
|
306
307
|
_logger.warning(
|
307
308
|
"Unable to liftover position %s on %s",
|
308
|
-
|
309
|
+
coords[1],
|
309
310
|
chromosome,
|
310
311
|
)
|
311
312
|
return
|
312
|
-
|
313
|
-
genomic_tx_data[key] = liftover_start_i[1], liftover_end_i[1]
|
313
|
+
setattr(genomic_tx_data, key, (liftover_start_i[1], liftover_end_i[1]))
|
314
314
|
|
315
315
|
async def _get_and_validate_genomic_tx_data(
|
316
316
|
self,
|
@@ -320,7 +320,7 @@ class ManeTranscript:
|
|
320
320
|
| Literal[AnnotationLayer.GENOMIC] = AnnotationLayer.CDNA,
|
321
321
|
coding_start_site: int | None = None,
|
322
322
|
alt_ac: str | None = None,
|
323
|
-
) ->
|
323
|
+
) -> GenomicTxMetadata | None:
|
324
324
|
"""Get and validate genomic_tx_data
|
325
325
|
|
326
326
|
:param tx_ac: Accession on c. coordinate
|
@@ -328,7 +328,8 @@ class ManeTranscript:
|
|
328
328
|
:param annotation_layer: Annotation layer for ``ac`` and ``pos``
|
329
329
|
:param coding_start_site: Coding start site
|
330
330
|
:param alt_ac: Accession on g. coordinate
|
331
|
-
:return:
|
331
|
+
:return: Metadata for genomic and transcript accessions if found and validated,
|
332
|
+
else None
|
332
333
|
"""
|
333
334
|
genomic_tx_data = await self.uta_db.get_genomic_tx_data(
|
334
335
|
tx_ac, pos, annotation_layer, alt_ac=alt_ac
|
@@ -341,14 +342,14 @@ class ManeTranscript:
|
|
341
342
|
annotation_layer,
|
342
343
|
)
|
343
344
|
return None
|
344
|
-
genomic_tx_data
|
345
|
+
genomic_tx_data.coding_start_site = coding_start_site
|
345
346
|
|
346
347
|
if not alt_ac:
|
347
348
|
# Only want to liftover if alt_ac not provided. If alt_ac is provided,
|
348
349
|
# it means user wants to stick with the queried assembly
|
349
|
-
og_alt_exon_id = genomic_tx_data
|
350
|
+
og_alt_exon_id = genomic_tx_data.alt_exon_id
|
350
351
|
await self._liftover_to_38(genomic_tx_data)
|
351
|
-
liftover_alt_exon_id = genomic_tx_data
|
352
|
+
liftover_alt_exon_id = genomic_tx_data.alt_exon_id
|
352
353
|
|
353
354
|
# Validation check: Exon structure
|
354
355
|
if og_alt_exon_id != liftover_alt_exon_id:
|
@@ -466,14 +467,14 @@ class ManeTranscript:
|
|
466
467
|
:return: Transcript data
|
467
468
|
"""
|
468
469
|
if found_result:
|
469
|
-
tx_g_pos = g
|
470
|
-
tx_pos_range = g
|
470
|
+
tx_g_pos = g.alt_pos_range
|
471
|
+
tx_pos_range = g.tx_pos_range
|
471
472
|
else:
|
472
473
|
result = await self.uta_db.get_tx_exon_aln_v_data(
|
473
474
|
refseq_c_ac,
|
474
|
-
g
|
475
|
-
g
|
476
|
-
alt_ac=alt_ac if alt_ac else g
|
475
|
+
g.alt_pos_change_range[0],
|
476
|
+
g.alt_pos_change_range[1],
|
477
|
+
alt_ac=alt_ac if alt_ac else g.alt_ac,
|
477
478
|
use_tx_pos=False,
|
478
479
|
)
|
479
480
|
|
@@ -483,18 +484,18 @@ class ManeTranscript:
|
|
483
484
|
)
|
484
485
|
return None
|
485
486
|
result = result[-1]
|
486
|
-
tx_g_pos = result
|
487
|
-
tx_pos_range = result
|
487
|
+
tx_g_pos = result.alt_start_i, result.alt_end_i
|
488
|
+
tx_pos_range = result.tx_start_i, result.tx_end_i
|
488
489
|
|
489
490
|
cds_start_end = await self.uta_db.get_cds_start_end(refseq_c_ac)
|
490
491
|
if not cds_start_end:
|
491
492
|
return None
|
492
493
|
coding_start_site = cds_start_end[0]
|
493
494
|
|
494
|
-
g_pos = g
|
495
|
+
g_pos = g.alt_pos_change_range # start/end genomic change
|
495
496
|
g_pos_change = g_pos[0] - tx_g_pos[0], tx_g_pos[1] - g_pos[1]
|
496
497
|
|
497
|
-
if g
|
498
|
+
if g.strand == Strand.NEGATIVE:
|
498
499
|
g_pos_change = (tx_g_pos[1] - g_pos[0], g_pos[1] - tx_g_pos[0])
|
499
500
|
|
500
501
|
c_pos_change = (
|
@@ -506,10 +507,10 @@ class ManeTranscript:
|
|
506
507
|
c_pos_change = c_pos_change[1], c_pos_change[0]
|
507
508
|
|
508
509
|
return self._get_c_data(
|
509
|
-
gene=g
|
510
|
+
gene=g.gene,
|
510
511
|
cds_start_end=cds_start_end,
|
511
512
|
c_pos_change=c_pos_change,
|
512
|
-
strand=g
|
513
|
+
strand=g.strand,
|
513
514
|
alt_ac=alt_ac,
|
514
515
|
status=status,
|
515
516
|
refseq_c_ac=refseq_c_ac,
|
@@ -531,8 +532,8 @@ class ManeTranscript:
|
|
531
532
|
"""
|
532
533
|
for pos, pos_index in [(start_pos, 0), (end_pos, 1)]:
|
533
534
|
if pos is not None:
|
534
|
-
og_rf = self.
|
535
|
-
new_rf = self.
|
535
|
+
og_rf = self.get_reading_frame(pos)
|
536
|
+
new_rf = self.get_reading_frame(transcript_data.pos[pos_index])
|
536
537
|
|
537
538
|
if og_rf != new_rf:
|
538
539
|
_logger.warning(
|
@@ -561,7 +562,7 @@ class ManeTranscript:
|
|
561
562
|
| GenomicRepresentation,
|
562
563
|
expected_ref: str,
|
563
564
|
anno: AnnotationLayer,
|
564
|
-
|
565
|
+
coordinate_type: CoordinateType,
|
565
566
|
) -> bool:
|
566
567
|
"""Return whether or not reference changes are the same.
|
567
568
|
|
@@ -573,7 +574,7 @@ class ManeTranscript:
|
|
573
574
|
position change
|
574
575
|
:param expected_ref: Reference at position given during input
|
575
576
|
:param anno: Annotation layer we are starting from
|
576
|
-
:param
|
577
|
+
:param coordinate_type: Coordinate type for ``start_pos`` and ``end_pos``
|
577
578
|
:return: ``True`` if reference check passes. ``False`` otherwise.
|
578
579
|
"""
|
579
580
|
if anno == AnnotationLayer.CDNA:
|
@@ -581,7 +582,7 @@ class ManeTranscript:
|
|
581
582
|
end_pos += coding_start_site
|
582
583
|
|
583
584
|
ref, _ = self.seqrepo_access.get_reference_sequence(
|
584
|
-
ac, start=start_pos, end=end_pos,
|
585
|
+
ac, start=start_pos, end=end_pos, coordinate_type=coordinate_type
|
585
586
|
)
|
586
587
|
if ref is None:
|
587
588
|
return False
|
@@ -597,7 +598,7 @@ class ManeTranscript:
|
|
597
598
|
mane_transcript.refseq,
|
598
599
|
start=mane_start_pos,
|
599
600
|
end=mane_end_pos if mane_start_pos != mane_end_pos else None,
|
600
|
-
|
601
|
+
coordinate_type=coordinate_type,
|
601
602
|
)
|
602
603
|
if not mane_ref:
|
603
604
|
_logger.info("Unable to validate reference for MANE Transcript")
|
@@ -618,7 +619,7 @@ class ManeTranscript:
|
|
618
619
|
|
619
620
|
return True
|
620
621
|
|
621
|
-
def
|
622
|
+
def validate_index(
|
622
623
|
self, ac: str, pos: tuple[int, int], coding_start_site: int
|
623
624
|
) -> bool:
|
624
625
|
"""Validate that positions actually exist on accession
|
@@ -632,7 +633,10 @@ class ManeTranscript:
|
|
632
633
|
end_pos = pos[1] + coding_start_site
|
633
634
|
return bool(
|
634
635
|
self.seqrepo_access.get_reference_sequence(
|
635
|
-
ac,
|
636
|
+
ac,
|
637
|
+
start=start_pos,
|
638
|
+
end=end_pos,
|
639
|
+
coordinate_type=CoordinateType.INTER_RESIDUE,
|
636
640
|
)[0]
|
637
641
|
)
|
638
642
|
|
@@ -689,7 +693,7 @@ class ManeTranscript:
|
|
689
693
|
start_annotation_layer: AnnotationLayer,
|
690
694
|
gene: str | None = None,
|
691
695
|
ref: str | None = None,
|
692
|
-
|
696
|
+
coordinate_type: CoordinateType = CoordinateType.RESIDUE,
|
693
697
|
mane_transcripts: set | None = None,
|
694
698
|
alt_ac: str | None = None,
|
695
699
|
end_annotation_layer: EndAnnotationLayer | None = None,
|
@@ -699,8 +703,8 @@ class ManeTranscript:
|
|
699
703
|
information.
|
700
704
|
|
701
705
|
>>> import asyncio
|
702
|
-
>>> from cool_seq_tool
|
703
|
-
>>> from cool_seq_tool.schemas import AnnotationLayer,
|
706
|
+
>>> from cool_seq_tool import CoolSeqTool
|
707
|
+
>>> from cool_seq_tool.schemas import AnnotationLayer, CoordinateType
|
704
708
|
>>> mane_mapper = CoolSeqTool().mane_transcript
|
705
709
|
>>> mane_transcripts = {
|
706
710
|
... "ENST00000646891.2",
|
@@ -714,7 +718,7 @@ class ManeTranscript:
|
|
714
718
|
... 599,
|
715
719
|
... gene="BRAF",
|
716
720
|
... start_annotation_layer=AnnotationLayer.PROTEIN,
|
717
|
-
...
|
721
|
+
... coordinate_type=CoordinateType.INTER_RESIDUE,
|
718
722
|
... mane_transcripts=mane_transcripts,
|
719
723
|
... )
|
720
724
|
... )
|
@@ -731,7 +735,7 @@ class ManeTranscript:
|
|
731
735
|
:param start_annotation_layer: Starting annotation layer
|
732
736
|
:param gene: HGNC gene symbol
|
733
737
|
:param ref: Reference at position given during input
|
734
|
-
:param
|
738
|
+
:param coordinate_type: Coordinate type for ``start_pos`` and ``end_pos``
|
735
739
|
:param mane_transcripts: Attempted mane transcripts that were not compatible
|
736
740
|
:param alt_ac: Genomic accession
|
737
741
|
:param end_annotation_layer: The end annotation layer. If not provided, will be
|
@@ -767,8 +771,8 @@ class ManeTranscript:
|
|
767
771
|
)
|
768
772
|
|
769
773
|
lcr_result = None
|
770
|
-
start_pos, end_pos = get_inter_residue_pos(start_pos, end_pos,
|
771
|
-
|
774
|
+
start_pos, end_pos = get_inter_residue_pos(start_pos, end_pos, coordinate_type)
|
775
|
+
coordinate_type = CoordinateType.INTER_RESIDUE
|
772
776
|
|
773
777
|
is_p_or_c_start_anno = True
|
774
778
|
if start_annotation_layer == AnnotationLayer.PROTEIN:
|
@@ -856,7 +860,7 @@ class ManeTranscript:
|
|
856
860
|
{},
|
857
861
|
ref,
|
858
862
|
AnnotationLayer.PROTEIN,
|
859
|
-
|
863
|
+
coordinate_type,
|
860
864
|
)
|
861
865
|
elif start_annotation_layer == AnnotationLayer.CDNA:
|
862
866
|
valid_references = self._validate_references(
|
@@ -867,7 +871,7 @@ class ManeTranscript:
|
|
867
871
|
{},
|
868
872
|
ref,
|
869
873
|
AnnotationLayer.CDNA,
|
870
|
-
|
874
|
+
coordinate_type,
|
871
875
|
)
|
872
876
|
else:
|
873
877
|
valid_references = self._validate_references(
|
@@ -878,7 +882,7 @@ class ManeTranscript:
|
|
878
882
|
{},
|
879
883
|
ref,
|
880
884
|
AnnotationLayer.GENOMIC,
|
881
|
-
|
885
|
+
coordinate_type,
|
882
886
|
)
|
883
887
|
|
884
888
|
if not valid_references:
|
@@ -902,7 +906,7 @@ class ManeTranscript:
|
|
902
906
|
gene,
|
903
907
|
row["pro_ac"],
|
904
908
|
lcr_c_data.pos,
|
905
|
-
g
|
909
|
+
g.strand,
|
906
910
|
lcr_c_data.status,
|
907
911
|
)
|
908
912
|
coding_start_site = 0
|
@@ -910,7 +914,7 @@ class ManeTranscript:
|
|
910
914
|
ac = lcr_result.refseq or lcr_result.ensembl
|
911
915
|
pos = lcr_result.pos
|
912
916
|
|
913
|
-
if not self.
|
917
|
+
if not self.validate_index(ac, pos, coding_start_site):
|
914
918
|
_logger.warning(
|
915
919
|
"%s are not valid positions on %s with coding start site %s",
|
916
920
|
pos,
|
@@ -924,7 +928,7 @@ class ManeTranscript:
|
|
924
928
|
gene,
|
925
929
|
row["pro_ac"],
|
926
930
|
lcr_c_data.pos,
|
927
|
-
g
|
931
|
+
g.strand,
|
928
932
|
lcr_c_data.status,
|
929
933
|
),
|
930
934
|
cdna=lcr_c_data,
|
@@ -936,7 +940,7 @@ class ManeTranscript:
|
|
936
940
|
cds = lcr_result_dict[k].get("coding_start_site", 0)
|
937
941
|
ac = lcr_result_dict[k]["refseq"] or lcr_result_dict[k]["ensembl"]
|
938
942
|
pos = lcr_result_dict[k]["pos"]
|
939
|
-
if not self.
|
943
|
+
if not self.validate_index(ac, pos, cds):
|
940
944
|
valid = False
|
941
945
|
_logger.warning(
|
942
946
|
"%s are not valid positions on %s with coding start site %s",
|
@@ -959,13 +963,22 @@ class ManeTranscript:
|
|
959
963
|
gene: str | None = None,
|
960
964
|
ref: str | None = None,
|
961
965
|
try_longest_compatible: bool = False,
|
962
|
-
|
963
|
-
| Literal[
|
966
|
+
coordinate_type: Literal[CoordinateType.RESIDUE]
|
967
|
+
| Literal[CoordinateType.INTER_RESIDUE] = CoordinateType.RESIDUE,
|
964
968
|
) -> DataRepresentation | CdnaRepresentation | None:
|
965
|
-
"""Return MANE
|
966
|
-
|
967
|
-
|
968
|
-
|
969
|
+
"""Return MANE representation
|
970
|
+
|
971
|
+
If ``start_annotation_layer`` is ``AnnotationLayer.PROTEIN``, will return
|
972
|
+
``AnnotationLayer.PROTEIN`` representation.
|
973
|
+
If ``start_annotation_layer`` is ``AnnotationLayer.CDNA``, will return
|
974
|
+
``AnnotationLayer.CDNA`` representation.
|
975
|
+
If ``start_annotation_layer`` is ``AnnotationLayer.GENOMIC`` will return
|
976
|
+
``AnnotationLayer.CDNA`` representation if ``gene`` is provided and
|
977
|
+
``AnnotationLayer.GENOMIC`` GRCh38 representation if ``gene`` is NOT
|
978
|
+
provided.
|
979
|
+
|
980
|
+
>>> from cool_seq_tool import CoolSeqTool
|
981
|
+
>>> from cool_seq_tool.schemas import AnnotationLayer, CoordinateType
|
969
982
|
>>> import asyncio
|
970
983
|
>>> mane_mapper = CoolSeqTool().mane_transcript
|
971
984
|
>>> result = asyncio.run(
|
@@ -973,7 +986,7 @@ class ManeTranscript:
|
|
973
986
|
... "NP_004324.2",
|
974
987
|
... 599,
|
975
988
|
... AnnotationLayer.PROTEIN,
|
976
|
-
...
|
989
|
+
... coordinate_type=CoordinateType.INTER_RESIDUE,
|
977
990
|
... )
|
978
991
|
... )
|
979
992
|
>>> result.gene, result.refseq, result.status
|
@@ -983,17 +996,21 @@ class ManeTranscript:
|
|
983
996
|
:param start_pos: Start position change
|
984
997
|
:param end_pos: End position change
|
985
998
|
:param start_annotation_layer: Starting annotation layer.
|
986
|
-
:param gene: HGNC gene symbol
|
999
|
+
:param gene: HGNC gene symbol.
|
1000
|
+
If ``gene`` is not provided and ``start_annotation_layer`` is
|
1001
|
+
``AnnotationLayer.GENOMIC``, will return GRCh38 representation.
|
1002
|
+
If ``gene`` is provided and ``start_annotation_layer`` is
|
1003
|
+
``AnnotationLayer.GENOMIC``, will return cDNA representation.
|
987
1004
|
:param ref: Reference at position given during input
|
988
1005
|
:param try_longest_compatible: ``True`` if should try longest compatible remaining
|
989
1006
|
if mane transcript was not compatible. ``False`` otherwise.
|
990
|
-
:param
|
991
|
-
``end_pos``. Will always return
|
1007
|
+
:param CoordinateType coordinate_type: Starting Coordinate type for
|
1008
|
+
``start_pos`` and ``end_pos``. Will always return inter-residue coordinates
|
992
1009
|
:return: MANE data or longest transcript compatible data if validation
|
993
1010
|
checks are correct. Will return inter-residue coordinates. Else, ``None``.
|
994
1011
|
"""
|
995
|
-
start_pos, end_pos = get_inter_residue_pos(start_pos, end_pos,
|
996
|
-
|
1012
|
+
start_pos, end_pos = get_inter_residue_pos(start_pos, end_pos, coordinate_type)
|
1013
|
+
coordinate_type = CoordinateType.INTER_RESIDUE
|
997
1014
|
if ref:
|
998
1015
|
ref = ref[: end_pos - start_pos]
|
999
1016
|
|
@@ -1012,7 +1029,7 @@ class ManeTranscript:
|
|
1012
1029
|
if g is None:
|
1013
1030
|
return None
|
1014
1031
|
# Get mane data for gene
|
1015
|
-
mane_data = self.mane_transcript_mappings.get_gene_mane_data(g
|
1032
|
+
mane_data = self.mane_transcript_mappings.get_gene_mane_data(g.gene)
|
1016
1033
|
if not mane_data:
|
1017
1034
|
return None
|
1018
1035
|
|
@@ -1039,10 +1056,8 @@ class ManeTranscript:
|
|
1039
1056
|
if not mane:
|
1040
1057
|
continue
|
1041
1058
|
|
1042
|
-
if not mane.alt_ac:
|
1043
|
-
|
1044
|
-
if g_alt_ac:
|
1045
|
-
mane.alt_ac = g_alt_ac
|
1059
|
+
if not mane.alt_ac and g.alt_ac:
|
1060
|
+
mane.alt_ac = g.alt_ac
|
1046
1061
|
|
1047
1062
|
valid_reading_frame = self._validate_reading_frames(
|
1048
1063
|
c_ac, c_pos[0], c_pos[1], mane
|
@@ -1058,13 +1073,13 @@ class ManeTranscript:
|
|
1058
1073
|
if ref:
|
1059
1074
|
valid_references = self._validate_references(
|
1060
1075
|
ac,
|
1061
|
-
g
|
1076
|
+
g.coding_start_site,
|
1062
1077
|
start_pos,
|
1063
1078
|
end_pos,
|
1064
1079
|
mane,
|
1065
1080
|
ref,
|
1066
1081
|
start_annotation_layer,
|
1067
|
-
|
1082
|
+
coordinate_type,
|
1068
1083
|
)
|
1069
1084
|
if not valid_references:
|
1070
1085
|
continue
|
@@ -1078,8 +1093,8 @@ class ManeTranscript:
|
|
1078
1093
|
end_pos,
|
1079
1094
|
AnnotationLayer.PROTEIN,
|
1080
1095
|
ref=ref,
|
1081
|
-
gene=g
|
1082
|
-
|
1096
|
+
gene=g.gene,
|
1097
|
+
coordinate_type=coordinate_type,
|
1083
1098
|
mane_transcripts=mane_transcripts,
|
1084
1099
|
)
|
1085
1100
|
return await self.get_longest_compatible_transcript(
|
@@ -1088,34 +1103,61 @@ class ManeTranscript:
|
|
1088
1103
|
AnnotationLayer.CDNA,
|
1089
1104
|
ref=ref,
|
1090
1105
|
gene=g["gene"],
|
1091
|
-
|
1106
|
+
coordinate_type=coordinate_type,
|
1092
1107
|
mane_transcripts=mane_transcripts,
|
1093
1108
|
)
|
1094
1109
|
return None
|
1095
1110
|
if start_annotation_layer == AnnotationLayer.GENOMIC:
|
1111
|
+
if not gene:
|
1112
|
+
return await self.g_to_grch38(
|
1113
|
+
ac,
|
1114
|
+
start_pos,
|
1115
|
+
end_pos,
|
1116
|
+
get_mane_genes=True,
|
1117
|
+
coordinate_type=coordinate_type,
|
1118
|
+
)
|
1119
|
+
|
1096
1120
|
return await self.g_to_mane_c(
|
1097
|
-
ac, start_pos, end_pos, gene
|
1121
|
+
ac, start_pos, end_pos, gene, coordinate_type=coordinate_type
|
1098
1122
|
)
|
1099
1123
|
_logger.warning("Annotation layer not supported: %s", start_annotation_layer)
|
1100
1124
|
return None
|
1101
1125
|
|
1102
|
-
async def g_to_grch38(
|
1126
|
+
async def g_to_grch38(
|
1127
|
+
self,
|
1128
|
+
ac: str,
|
1129
|
+
start_pos: int,
|
1130
|
+
end_pos: int,
|
1131
|
+
get_mane_genes: bool = False,
|
1132
|
+
coordinate_type: CoordinateType = CoordinateType.RESIDUE,
|
1133
|
+
) -> GenomicRepresentation | None:
|
1103
1134
|
"""Return genomic coordinate on GRCh38 when not given gene context.
|
1104
1135
|
|
1105
1136
|
:param ac: Genomic accession
|
1106
1137
|
:param start_pos: Genomic start position
|
1107
1138
|
:param end_pos: Genomic end position
|
1108
|
-
:
|
1139
|
+
:param get_mane_genes: ``True`` if mane genes for genomic position should be
|
1140
|
+
included in response. ``False``, otherwise.
|
1141
|
+
:param coordinate_type: Coordinate type for ``start_pos`` and ``end_pos``
|
1142
|
+
:return: GRCh38 genomic representation (accession and start/end inter-residue
|
1143
|
+
position)
|
1109
1144
|
"""
|
1110
|
-
|
1111
|
-
end_pos = start_pos
|
1145
|
+
start_pos, end_pos = get_inter_residue_pos(start_pos, end_pos, coordinate_type)
|
1112
1146
|
|
1113
1147
|
# Checking to see what chromosome and assembly we're on
|
1114
1148
|
descr = await self.uta_db.get_chr_assembly(ac)
|
1115
1149
|
if not descr:
|
1116
1150
|
# Already GRCh38 assembly
|
1117
|
-
if self.
|
1118
|
-
return
|
1151
|
+
if self.validate_index(ac, (start_pos, end_pos), 0):
|
1152
|
+
return GenomicRepresentation(
|
1153
|
+
ac=ac,
|
1154
|
+
pos=(start_pos, end_pos),
|
1155
|
+
mane_genes=self.mane_transcript_mappings.get_genomic_mane_genes(
|
1156
|
+
ac, start_pos + 1, end_pos
|
1157
|
+
)
|
1158
|
+
if get_mane_genes
|
1159
|
+
else [],
|
1160
|
+
)
|
1119
1161
|
return None
|
1120
1162
|
chromosome, assembly = descr
|
1121
1163
|
is_same_pos = start_pos == end_pos
|
@@ -1145,13 +1187,21 @@ class ManeTranscript:
|
|
1145
1187
|
newest_ac = await self.uta_db.get_newest_assembly_ac(ac)
|
1146
1188
|
if newest_ac:
|
1147
1189
|
ac = newest_ac[0]
|
1148
|
-
if self.
|
1149
|
-
return
|
1190
|
+
if self.validate_index(ac, (start_pos, end_pos), 0):
|
1191
|
+
return GenomicRepresentation(
|
1192
|
+
ac=ac,
|
1193
|
+
pos=(start_pos, end_pos),
|
1194
|
+
mane_genes=self.mane_transcript_mappings.get_genomic_mane_genes(
|
1195
|
+
ac, start_pos + 1, end_pos
|
1196
|
+
)
|
1197
|
+
if get_mane_genes
|
1198
|
+
else [],
|
1199
|
+
)
|
1150
1200
|
return None
|
1151
1201
|
|
1152
1202
|
@staticmethod
|
1153
1203
|
def get_mane_c_pos_change(
|
1154
|
-
mane_tx_genomic_data:
|
1204
|
+
mane_tx_genomic_data: GenomicTxMetadata, coding_start_site: int
|
1155
1205
|
) -> tuple[int, int]:
|
1156
1206
|
"""Get mane c position change
|
1157
1207
|
|
@@ -1159,12 +1209,12 @@ class ManeTranscript:
|
|
1159
1209
|
:param coding_start_site: Coding start site
|
1160
1210
|
:return: cDNA pos start, cDNA pos end
|
1161
1211
|
"""
|
1162
|
-
tx_pos_range = mane_tx_genomic_data
|
1163
|
-
|
1212
|
+
tx_pos_range = mane_tx_genomic_data.tx_pos_range
|
1213
|
+
pos_change = mane_tx_genomic_data.pos_change
|
1164
1214
|
|
1165
1215
|
mane_c_pos_change = (
|
1166
|
-
tx_pos_range[0] +
|
1167
|
-
tx_pos_range[1] -
|
1216
|
+
tx_pos_range[0] + pos_change[0] - coding_start_site,
|
1217
|
+
tx_pos_range[1] - pos_change[1] - coding_start_site,
|
1168
1218
|
)
|
1169
1219
|
|
1170
1220
|
if mane_c_pos_change[0] > mane_c_pos_change[1]:
|
@@ -1176,16 +1226,13 @@ class ManeTranscript:
|
|
1176
1226
|
ac: str,
|
1177
1227
|
start_pos: int,
|
1178
1228
|
end_pos: int,
|
1179
|
-
gene: str
|
1180
|
-
|
1181
|
-
) ->
|
1229
|
+
gene: str,
|
1230
|
+
coordinate_type: CoordinateType = CoordinateType.RESIDUE,
|
1231
|
+
) -> CdnaRepresentation | None:
|
1182
1232
|
"""Return MANE Transcript on the c. coordinate.
|
1183
1233
|
|
1184
|
-
If an arg for ``gene`` is provided, lifts to GRCh38, then gets MANE cDNA
|
1185
|
-
representation.
|
1186
|
-
|
1187
1234
|
>>> import asyncio
|
1188
|
-
>>> from cool_seq_tool
|
1235
|
+
>>> from cool_seq_tool import CoolSeqTool
|
1189
1236
|
>>> cst = CoolSeqTool()
|
1190
1237
|
>>> result = asyncio.run(
|
1191
1238
|
... cst.mane_transcript.g_to_mane_c(
|
@@ -1198,33 +1245,16 @@ class ManeTranscript:
|
|
1198
1245
|
<TranscriptPriority.MANE_SELECT: 'mane_select'>
|
1199
1246
|
>>> del cst
|
1200
1247
|
|
1201
|
-
Locating a MANE transcript requires a ``gene`` symbol argument -- if none is
|
1202
|
-
given, this method will only lift over to genomic coordinates on GRCh38.
|
1203
|
-
|
1204
1248
|
:param ac: Transcript accession on g. coordinate
|
1205
1249
|
:param start_pos: genomic start position
|
1206
1250
|
:param end_pos: genomic end position
|
1207
1251
|
:param gene: HGNC gene symbol
|
1208
|
-
:param
|
1209
|
-
Will always return
|
1210
|
-
:return: MANE Transcripts with cDNA change on c. coordinate
|
1211
|
-
is provided. Else, GRCh38 data
|
1252
|
+
:param coordinate_type: Starting Coordinate type for ``start_pos`` and
|
1253
|
+
``end_pos``. Will always return inter-residue coordinates.
|
1254
|
+
:return: MANE Transcripts with cDNA change on c. coordinate
|
1212
1255
|
"""
|
1213
|
-
start_pos, end_pos = get_inter_residue_pos(start_pos, end_pos,
|
1214
|
-
|
1215
|
-
|
1216
|
-
# If gene not provided, return GRCh38
|
1217
|
-
if not gene:
|
1218
|
-
grch38 = await self.g_to_grch38(ac, start_pos, end_pos)
|
1219
|
-
if not grch38:
|
1220
|
-
return None
|
1221
|
-
|
1222
|
-
return GenomicRepresentation(
|
1223
|
-
refseq=grch38["ac"],
|
1224
|
-
pos=grch38["pos"],
|
1225
|
-
status=TranscriptPriority.GRCH38,
|
1226
|
-
alt_ac=grch38["ac"],
|
1227
|
-
)
|
1256
|
+
start_pos, end_pos = get_inter_residue_pos(start_pos, end_pos, coordinate_type)
|
1257
|
+
coordinate_type = CoordinateType.INTER_RESIDUE
|
1228
1258
|
|
1229
1259
|
if not await self.uta_db.validate_genomic_ac(ac):
|
1230
1260
|
_logger.warning("Genomic accession does not exist: %s", ac)
|
@@ -1238,12 +1268,18 @@ class ManeTranscript:
|
|
1238
1268
|
mane_c_ac = current_mane_data["RefSeq_nuc"]
|
1239
1269
|
|
1240
1270
|
# Liftover to GRCh38
|
1241
|
-
grch38 = await self.g_to_grch38(
|
1271
|
+
grch38 = await self.g_to_grch38(
|
1272
|
+
ac,
|
1273
|
+
start_pos,
|
1274
|
+
end_pos,
|
1275
|
+
get_mane_genes=False,
|
1276
|
+
coordinate_type=coordinate_type,
|
1277
|
+
)
|
1242
1278
|
mane_tx_genomic_data = None
|
1243
1279
|
if grch38:
|
1244
1280
|
# GRCh38 -> MANE C
|
1245
1281
|
mane_tx_genomic_data = await self.uta_db.get_mane_c_genomic_data(
|
1246
|
-
mane_c_ac, grch38
|
1282
|
+
mane_c_ac, grch38.ac, grch38.pos[0], grch38.pos[1]
|
1247
1283
|
)
|
1248
1284
|
|
1249
1285
|
if not grch38 or not mane_tx_genomic_data:
|
@@ -1255,15 +1291,13 @@ class ManeTranscript:
|
|
1255
1291
|
continue
|
1256
1292
|
_logger.info("Not using most recent assembly")
|
1257
1293
|
|
1258
|
-
coding_start_site = mane_tx_genomic_data
|
1259
|
-
coding_end_site = mane_tx_genomic_data
|
1294
|
+
coding_start_site = mane_tx_genomic_data.coding_start_site
|
1295
|
+
coding_end_site = mane_tx_genomic_data.coding_end_site
|
1260
1296
|
mane_c_pos_change = self.get_mane_c_pos_change(
|
1261
1297
|
mane_tx_genomic_data, coding_start_site
|
1262
1298
|
)
|
1263
1299
|
|
1264
|
-
if not self.
|
1265
|
-
mane_c_ac, mane_c_pos_change, coding_start_site
|
1266
|
-
):
|
1300
|
+
if not self.validate_index(mane_c_ac, mane_c_pos_change, coding_start_site):
|
1267
1301
|
_logger.warning(
|
1268
1302
|
"%s are not valid positions on %s with coding start site %s",
|
1269
1303
|
mane_c_pos_change,
|
@@ -1284,7 +1318,7 @@ class ManeTranscript:
|
|
1284
1318
|
),
|
1285
1319
|
refseq_c_ac=current_mane_data["RefSeq_nuc"],
|
1286
1320
|
ensembl_c_ac=current_mane_data["Ensembl_nuc"],
|
1287
|
-
alt_ac=grch38
|
1321
|
+
alt_ac=grch38.ac if grch38 else None,
|
1288
1322
|
)
|
1289
1323
|
return None
|
1290
1324
|
|
@@ -1294,7 +1328,7 @@ class ManeTranscript:
|
|
1294
1328
|
start_pos: int,
|
1295
1329
|
end_pos: int,
|
1296
1330
|
gene: str | None = None,
|
1297
|
-
|
1331
|
+
coordinate_type: CoordinateType = CoordinateType.RESIDUE,
|
1298
1332
|
try_longest_compatible: bool = False,
|
1299
1333
|
) -> dict | None:
|
1300
1334
|
"""Given GRCh38 genomic representation, return protein representation.
|
@@ -1307,8 +1341,8 @@ class ManeTranscript:
|
|
1307
1341
|
:param start_pos: Start position
|
1308
1342
|
:param end_pos: End position
|
1309
1343
|
:param gene: HGNC gene symbol
|
1310
|
-
:param
|
1311
|
-
always return
|
1344
|
+
:param coordinate_type: Starting Coordinate type for ``start_pos`` and
|
1345
|
+
``end_pos``. Will always return inter-residue coordinates.
|
1312
1346
|
:param try_longest_compatible: ``True`` if should try longest compatible remaining
|
1313
1347
|
if mane transcript(s) not compatible. ``False`` otherwise.
|
1314
1348
|
:return: If successful, return MANE data or longest compatible remaining (if
|
@@ -1327,8 +1361,8 @@ class ManeTranscript:
|
|
1327
1361
|
return None
|
1328
1362
|
|
1329
1363
|
# Step 2: Get inter-residue position
|
1330
|
-
start_pos, end_pos = get_inter_residue_pos(start_pos, end_pos,
|
1331
|
-
|
1364
|
+
start_pos, end_pos = get_inter_residue_pos(start_pos, end_pos, coordinate_type)
|
1365
|
+
coordinate_type = CoordinateType.INTER_RESIDUE
|
1332
1366
|
|
1333
1367
|
# Step 3: Try getting MANE protein representation
|
1334
1368
|
mane_transcripts = set() # Used if getting longest compatible remaining
|
@@ -1344,16 +1378,14 @@ class ManeTranscript:
|
|
1344
1378
|
continue
|
1345
1379
|
|
1346
1380
|
# Get MANE C positions
|
1347
|
-
coding_start_site = mane_tx_genomic_data
|
1348
|
-
coding_end_site = mane_tx_genomic_data
|
1381
|
+
coding_start_site = mane_tx_genomic_data.coding_start_site
|
1382
|
+
coding_end_site = mane_tx_genomic_data.coding_end_site
|
1349
1383
|
mane_c_pos_change = self.get_mane_c_pos_change(
|
1350
1384
|
mane_tx_genomic_data, coding_start_site
|
1351
1385
|
)
|
1352
1386
|
|
1353
1387
|
# Validate MANE C positions
|
1354
|
-
if not self.
|
1355
|
-
mane_c_ac, mane_c_pos_change, coding_start_site
|
1356
|
-
):
|
1388
|
+
if not self.validate_index(mane_c_ac, mane_c_pos_change, coding_start_site):
|
1357
1389
|
_logger.warning(
|
1358
1390
|
"%s are not valid positions on %s with coding start site %s",
|
1359
1391
|
mane_c_pos_change,
|
@@ -1367,7 +1399,7 @@ class ManeTranscript:
|
|
1367
1399
|
cdna=self._get_c_data(
|
1368
1400
|
(coding_start_site, coding_end_site),
|
1369
1401
|
mane_c_pos_change,
|
1370
|
-
mane_tx_genomic_data
|
1402
|
+
mane_tx_genomic_data.strand,
|
1371
1403
|
TranscriptPriority(
|
1372
1404
|
"_".join(current_mane_data["MANE_status"].split()).lower()
|
1373
1405
|
),
|
@@ -1383,7 +1415,7 @@ class ManeTranscript:
|
|
1383
1415
|
start_pos,
|
1384
1416
|
end_pos,
|
1385
1417
|
AnnotationLayer.GENOMIC,
|
1386
|
-
|
1418
|
+
coordinate_type=coordinate_type,
|
1387
1419
|
alt_ac=alt_ac,
|
1388
1420
|
end_annotation_layer=EndAnnotationLayer.PROTEIN_AND_CDNA,
|
1389
1421
|
mane_transcripts=mane_transcripts,
|