cool-seq-tool 0.7.0__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cool_seq_tool/mappers/exon_genomic_coords.py +267 -242
- cool_seq_tool/sources/mane_transcript_mappings.py +3 -1
- {cool_seq_tool-0.7.0.dist-info → cool_seq_tool-0.8.0.dist-info}/METADATA +1 -1
- {cool_seq_tool-0.7.0.dist-info → cool_seq_tool-0.8.0.dist-info}/RECORD +7 -7
- {cool_seq_tool-0.7.0.dist-info → cool_seq_tool-0.8.0.dist-info}/WHEEL +1 -1
- {cool_seq_tool-0.7.0.dist-info → cool_seq_tool-0.8.0.dist-info}/LICENSE +0 -0
- {cool_seq_tool-0.7.0.dist-info → cool_seq_tool-0.8.0.dist-info}/top_level.txt +0 -0
@@ -14,13 +14,13 @@ from cool_seq_tool.schemas import (
|
|
14
14
|
Strand,
|
15
15
|
)
|
16
16
|
from cool_seq_tool.sources.mane_transcript_mappings import ManeTranscriptMappings
|
17
|
-
from cool_seq_tool.sources.uta_database import UtaDatabase
|
17
|
+
from cool_seq_tool.sources.uta_database import GenomicAlnData, UtaDatabase
|
18
18
|
from cool_seq_tool.utils import service_meta
|
19
19
|
|
20
20
|
_logger = logging.getLogger(__name__)
|
21
21
|
|
22
22
|
|
23
|
-
class
|
23
|
+
class _ExonCoord(BaseModelForbidExtra):
|
24
24
|
"""Model for representing exon coordinate data"""
|
25
25
|
|
26
26
|
ord: StrictInt = Field(..., description="Exon number. 0-based.")
|
@@ -87,7 +87,9 @@ class GenomicTxSeg(BaseModelForbidExtra):
|
|
87
87
|
"""Model for representing a boundary for a transcript segment."""
|
88
88
|
|
89
89
|
seg: TxSegment | None = Field(None, description="Transcript segment.")
|
90
|
-
gene: StrictStr | None = Field(
|
90
|
+
gene: StrictStr | None = Field(
|
91
|
+
None, description="Valid, case-sensitive HGNC gene symbol."
|
92
|
+
)
|
91
93
|
genomic_ac: StrictStr | None = Field(None, description="RefSeq genomic accession.")
|
92
94
|
tx_ac: StrictStr | None = Field(None, description="RefSeq transcript accession.")
|
93
95
|
errors: list[StrictStr] = Field([], description="Error messages.")
|
@@ -97,19 +99,18 @@ class GenomicTxSeg(BaseModelForbidExtra):
|
|
97
99
|
"""Ensure that fields are (un)set depending on errors
|
98
100
|
|
99
101
|
:param values: Values in model
|
100
|
-
:raises ValueError: If `seg`, `
|
102
|
+
:raises ValueError: If `seg`, `genomic_ac` and `tx_ac` are not
|
101
103
|
provided when there are no errors
|
102
104
|
:return: Values in model
|
103
105
|
"""
|
104
106
|
if not values.get("errors") and not all(
|
105
107
|
(
|
106
108
|
values.get("seg"),
|
107
|
-
values.get("gene"),
|
108
109
|
values.get("genomic_ac"),
|
109
110
|
values.get("tx_ac"),
|
110
111
|
)
|
111
112
|
):
|
112
|
-
err_msg = "`seg`, `
|
113
|
+
err_msg = "`seg`, `genomic_ac` and `tx_ac` must be provided"
|
113
114
|
raise ValueError(err_msg)
|
114
115
|
return values
|
115
116
|
|
@@ -140,7 +141,9 @@ class GenomicTxSeg(BaseModelForbidExtra):
|
|
140
141
|
class GenomicTxSegService(BaseModelForbidExtra):
|
141
142
|
"""Service model for genomic and transcript data."""
|
142
143
|
|
143
|
-
gene: StrictStr | None = Field(
|
144
|
+
gene: StrictStr | None = Field(
|
145
|
+
None, description="Valid, case-sensitive HGNC gene symbol."
|
146
|
+
)
|
144
147
|
genomic_ac: StrictStr | None = Field(None, description="RefSeq genomic accession.")
|
145
148
|
tx_ac: StrictStr | None = Field(None, description="RefSeq transcript accession.")
|
146
149
|
seg_start: TxSegment | None = Field(None, description="Start transcript segment.")
|
@@ -154,20 +157,21 @@ class GenomicTxSegService(BaseModelForbidExtra):
|
|
154
157
|
on errors
|
155
158
|
|
156
159
|
:param values: Values in model
|
157
|
-
:raises ValueError: If `
|
160
|
+
:raises ValueError: If `genomic_ac`, `tx_ac` and `seg_start` or `seg_end`
|
158
161
|
not provided when there are no errors
|
159
162
|
:return: Values in model, including service metadata
|
160
163
|
"""
|
161
164
|
values["service_meta"] = service_meta()
|
162
165
|
if not values.get("errors") and not all(
|
163
166
|
(
|
164
|
-
values.get("gene"),
|
165
167
|
values.get("genomic_ac"),
|
166
168
|
values.get("tx_ac"),
|
167
169
|
values.get("seg_start") or values.get("seg_end"),
|
168
170
|
)
|
169
171
|
):
|
170
|
-
err_msg =
|
172
|
+
err_msg = (
|
173
|
+
"`genomic_ac`, `tx_ac` and `seg_start` or `seg_end` must be provided"
|
174
|
+
)
|
171
175
|
raise ValueError(err_msg)
|
172
176
|
|
173
177
|
return values
|
@@ -292,7 +296,7 @@ class ExonGenomicCoordsMapper:
|
|
292
296
|
('NC_000001.11', 154192135, 154170399)
|
293
297
|
|
294
298
|
:param transcript: RefSeq transcript accession
|
295
|
-
:param gene: HGNC gene symbol
|
299
|
+
:param gene: Valid, case-sensitive HGNC gene symbol
|
296
300
|
:param exon_start: Starting transcript exon number (1-based). If not provided,
|
297
301
|
must provide ``exon_end``
|
298
302
|
:param exon_start_offset: Starting exon offset
|
@@ -335,22 +339,22 @@ class ExonGenomicCoordsMapper:
|
|
335
339
|
if errors:
|
336
340
|
return _return_service_errors(errors)
|
337
341
|
|
338
|
-
if gene:
|
339
|
-
gene = gene.upper()
|
340
|
-
|
341
342
|
# Get aligned genomic data (hgnc gene, alt_ac, alt_start_i, alt_end_i, strand)
|
342
343
|
# for exon(s)
|
343
|
-
|
344
|
+
(
|
345
|
+
genomic_aln_start,
|
346
|
+
genomic_aln_end,
|
347
|
+
err_msg,
|
348
|
+
) = await self._get_genomic_aln_coords(
|
344
349
|
transcript, tx_exon_start_coords, tx_exon_end_coords, gene=gene
|
345
350
|
)
|
346
|
-
if
|
347
|
-
return _return_service_errors([err_msg]
|
348
|
-
alt_ac_start_data, alt_ac_end_data = alt_ac_start_end
|
351
|
+
if err_msg:
|
352
|
+
return _return_service_errors([err_msg])
|
349
353
|
|
350
354
|
# Get gene and chromosome data, check that at least one was retrieved
|
351
|
-
gene =
|
355
|
+
gene = genomic_aln_start.hgnc if genomic_aln_start else genomic_aln_end.hgnc
|
352
356
|
genomic_ac = (
|
353
|
-
|
357
|
+
genomic_aln_start.alt_ac if genomic_aln_start else genomic_aln_end.alt_ac
|
354
358
|
)
|
355
359
|
if gene is None or genomic_ac is None:
|
356
360
|
return _return_service_errors(
|
@@ -360,9 +364,9 @@ class ExonGenomicCoordsMapper:
|
|
360
364
|
)
|
361
365
|
|
362
366
|
strand = (
|
363
|
-
Strand(
|
364
|
-
if
|
365
|
-
else Strand(
|
367
|
+
Strand(genomic_aln_start.alt_strand)
|
368
|
+
if genomic_aln_start
|
369
|
+
else Strand(genomic_aln_end.alt_strand)
|
366
370
|
)
|
367
371
|
|
368
372
|
if exon_start_exists:
|
@@ -370,7 +374,7 @@ class ExonGenomicCoordsMapper:
|
|
370
374
|
genomic_ac,
|
371
375
|
strand,
|
372
376
|
exon_start_offset,
|
373
|
-
|
377
|
+
genomic_aln_start,
|
374
378
|
is_seg_start=True,
|
375
379
|
)
|
376
380
|
if err_msg:
|
@@ -380,7 +384,11 @@ class ExonGenomicCoordsMapper:
|
|
380
384
|
|
381
385
|
if exon_end_exists:
|
382
386
|
seg_end, err_msg = self._get_tx_segment(
|
383
|
-
genomic_ac,
|
387
|
+
genomic_ac,
|
388
|
+
strand,
|
389
|
+
exon_end_offset,
|
390
|
+
genomic_aln_end,
|
391
|
+
is_seg_start=False,
|
384
392
|
)
|
385
393
|
if err_msg:
|
386
394
|
return _return_service_errors([err_msg])
|
@@ -448,7 +456,7 @@ class ExonGenomicCoordsMapper:
|
|
448
456
|
following the breakpoint for the 3' end. For the negative strand, adjacent
|
449
457
|
is defined as the exon following the breakpoint for the 5' end and the exon
|
450
458
|
preceding the breakpoint for the 3' end.
|
451
|
-
:param gene:
|
459
|
+
:param gene: A valid, case-sensitive HGNC symbol. Must be given if no ``transcript``
|
452
460
|
value is provided.
|
453
461
|
:param coordinate_type: Coordinate type for ``seg_start_genomic`` and
|
454
462
|
``seg_end_genomic``
|
@@ -466,9 +474,6 @@ class ExonGenomicCoordsMapper:
|
|
466
474
|
if errors:
|
467
475
|
return _return_service_errors(errors)
|
468
476
|
|
469
|
-
if gene is not None:
|
470
|
-
gene = gene.upper()
|
471
|
-
|
472
477
|
params = {}
|
473
478
|
|
474
479
|
if seg_start_genomic:
|
@@ -479,7 +484,7 @@ class ExonGenomicCoordsMapper:
|
|
479
484
|
transcript=transcript,
|
480
485
|
gene=gene,
|
481
486
|
get_nearest_transcript_junction=get_nearest_transcript_junction,
|
482
|
-
|
487
|
+
is_seg_start=True,
|
483
488
|
)
|
484
489
|
if start_tx_seg_data.errors:
|
485
490
|
return _return_service_errors(start_tx_seg_data.errors)
|
@@ -499,7 +504,7 @@ class ExonGenomicCoordsMapper:
|
|
499
504
|
transcript=transcript,
|
500
505
|
gene=gene,
|
501
506
|
get_nearest_transcript_junction=get_nearest_transcript_junction,
|
502
|
-
|
507
|
+
is_seg_start=False,
|
503
508
|
)
|
504
509
|
if end_tx_seg_data.errors:
|
505
510
|
return _return_service_errors(end_tx_seg_data.errors)
|
@@ -525,53 +530,13 @@ class ExonGenomicCoordsMapper:
|
|
525
530
|
|
526
531
|
return GenomicTxSegService(**params)
|
527
532
|
|
528
|
-
async def _get_all_exon_coords(
|
529
|
-
self, tx_ac: str, genomic_ac: str | None = None
|
530
|
-
) -> list[ExonCoord]:
|
531
|
-
"""Get all exon coordinate data for a transcript.
|
532
|
-
|
533
|
-
If ``genomic_ac`` is NOT provided, this method will use the GRCh38 accession
|
534
|
-
associated to ``tx_ac``.
|
535
|
-
|
536
|
-
:param tx_ac: The RefSeq transcript accession to get exon data for.
|
537
|
-
:param genomic_ac: The RefSeq genomic accession to get exon data for.
|
538
|
-
:return: List of all exon coordinate data for ``tx_ac`` and ``genomic_ac``.
|
539
|
-
The exon coordinate data will include the exon number, transcript and
|
540
|
-
genomic positions for the start and end of the exon, and strand.
|
541
|
-
The list will be ordered by ascending exon number.
|
542
|
-
"""
|
543
|
-
if genomic_ac:
|
544
|
-
query = f"""
|
545
|
-
SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
|
546
|
-
FROM {self.uta_db.schema}.tx_exon_aln_v
|
547
|
-
WHERE tx_ac = '{tx_ac}'
|
548
|
-
AND alt_aln_method = 'splign'
|
549
|
-
AND alt_ac = '{genomic_ac}'
|
550
|
-
ORDER BY ord ASC
|
551
|
-
""" # noqa: S608
|
552
|
-
else:
|
553
|
-
query = f"""
|
554
|
-
SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
|
555
|
-
FROM {self.uta_db.schema}.tx_exon_aln_v as t
|
556
|
-
INNER JOIN {self.uta_db.schema}._seq_anno_most_recent as s
|
557
|
-
ON t.alt_ac = s.ac
|
558
|
-
WHERE s.descr = ''
|
559
|
-
AND t.tx_ac = '{tx_ac}'
|
560
|
-
AND t.alt_aln_method = 'splign'
|
561
|
-
AND t.alt_ac like 'NC_000%'
|
562
|
-
ORDER BY ord ASC
|
563
|
-
""" # noqa: S608
|
564
|
-
|
565
|
-
results = await self.uta_db.execute_query(query)
|
566
|
-
return [ExonCoord(**r) for r in results]
|
567
|
-
|
568
533
|
async def _get_start_end_exon_coords(
|
569
534
|
self,
|
570
535
|
tx_ac: str,
|
571
536
|
exon_start: int | None = None,
|
572
537
|
exon_end: int | None = None,
|
573
538
|
genomic_ac: str | None = None,
|
574
|
-
) -> tuple[
|
539
|
+
) -> tuple[_ExonCoord | None, _ExonCoord | None, list[str]]:
|
575
540
|
"""Get exon coordinates for a transcript given exon start and exon end.
|
576
541
|
|
577
542
|
If ``genomic_ac`` is NOT provided, this method will use the GRCh38 accession
|
@@ -606,56 +571,160 @@ class ExonGenomicCoordsMapper:
|
|
606
571
|
|
607
572
|
return *start_end_exons, errors
|
608
573
|
|
609
|
-
async def
|
574
|
+
async def _get_all_exon_coords(
|
575
|
+
self, tx_ac: str, genomic_ac: str | None = None
|
576
|
+
) -> list[_ExonCoord]:
|
577
|
+
"""Get all exon coordinate data for a transcript.
|
578
|
+
|
579
|
+
If ``genomic_ac`` is NOT provided, this method will use the GRCh38 accession
|
580
|
+
associated to ``tx_ac``.
|
581
|
+
|
582
|
+
:param tx_ac: The RefSeq transcript accession to get exon data for.
|
583
|
+
:param genomic_ac: The RefSeq genomic accession to get exon data for.
|
584
|
+
:return: List of all exon coordinate data for ``tx_ac`` and ``genomic_ac``.
|
585
|
+
The exon coordinate data will include the exon number, transcript and
|
586
|
+
genomic positions for the start and end of the exon, and strand.
|
587
|
+
The list will be ordered by ascending exon number.
|
588
|
+
"""
|
589
|
+
if genomic_ac:
|
590
|
+
query = f"""
|
591
|
+
SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
|
592
|
+
FROM {self.uta_db.schema}.tx_exon_aln_v
|
593
|
+
WHERE tx_ac = '{tx_ac}'
|
594
|
+
AND alt_aln_method = 'splign'
|
595
|
+
AND alt_ac = '{genomic_ac}'
|
596
|
+
ORDER BY ord ASC
|
597
|
+
""" # noqa: S608
|
598
|
+
else:
|
599
|
+
query = f"""
|
600
|
+
SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
|
601
|
+
FROM {self.uta_db.schema}.tx_exon_aln_v as t
|
602
|
+
INNER JOIN {self.uta_db.schema}._seq_anno_most_recent as s
|
603
|
+
ON t.alt_ac = s.ac
|
604
|
+
WHERE s.descr = ''
|
605
|
+
AND t.tx_ac = '{tx_ac}'
|
606
|
+
AND t.alt_aln_method = 'splign'
|
607
|
+
AND t.alt_ac like 'NC_000%'
|
608
|
+
ORDER BY ord ASC
|
609
|
+
""" # noqa: S608
|
610
|
+
|
611
|
+
results = await self.uta_db.execute_query(query)
|
612
|
+
return [_ExonCoord(**r) for r in results]
|
613
|
+
|
614
|
+
async def _get_genomic_aln_coords(
|
610
615
|
self,
|
611
616
|
tx_ac: str,
|
612
|
-
tx_exon_start:
|
613
|
-
tx_exon_end:
|
617
|
+
tx_exon_start: _ExonCoord | None = None,
|
618
|
+
tx_exon_end: _ExonCoord | None = None,
|
614
619
|
gene: str | None = None,
|
615
|
-
) -> tuple[
|
620
|
+
) -> tuple[GenomicAlnData | None, GenomicAlnData | None, str | None]:
|
616
621
|
"""Get aligned genomic coordinates for transcript exon start and end.
|
617
622
|
|
623
|
+
``tx_exon_start`` and ``tx_exon_end`` is expected to reference the same
|
624
|
+
transcript and genomic accession.
|
625
|
+
|
618
626
|
:param tx_ac: Transcript accession
|
619
627
|
:param tx_exon_start: Transcript's exon start coordinates. If not provided,
|
620
628
|
must provide ``tx_exon_end``
|
621
629
|
:param tx_exon_end: Transcript's exon end coordinates. If not provided, must
|
622
630
|
provide ``tx_exon_start``
|
623
|
-
:param gene: HGNC gene symbol
|
624
|
-
:return:
|
631
|
+
:param gene: A valid, case-sensitive HGNC gene symbol
|
632
|
+
:return: Tuple containing aligned genomic data for start and end exon and
|
633
|
+
warnings if found
|
625
634
|
"""
|
626
635
|
if tx_exon_start is None and tx_exon_end is None:
|
627
636
|
msg = "Must provide either `tx_exon_start` or `tx_exon_end` or both"
|
628
637
|
_logger.warning(msg)
|
629
|
-
return None, msg
|
638
|
+
return None, None, msg
|
630
639
|
|
631
|
-
|
640
|
+
aligned_coords = {"start": None, "end": None}
|
632
641
|
for exon, key in [(tx_exon_start, "start"), (tx_exon_end, "end")]:
|
633
642
|
if exon:
|
634
|
-
|
643
|
+
aligned_coord, warning = await self.uta_db.get_alt_ac_start_or_end(
|
635
644
|
tx_ac, exon.tx_start_i, exon.tx_end_i, gene=gene
|
636
645
|
)
|
637
|
-
if
|
638
|
-
|
646
|
+
if aligned_coord:
|
647
|
+
aligned_coords[key] = aligned_coord
|
639
648
|
else:
|
640
|
-
return None, warning
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
649
|
+
return None, None, warning
|
650
|
+
|
651
|
+
return *aligned_coords.values(), None
|
652
|
+
|
653
|
+
def _get_tx_segment(
|
654
|
+
self,
|
655
|
+
genomic_ac: str,
|
656
|
+
strand: Strand,
|
657
|
+
offset: int,
|
658
|
+
genomic_ac_data: _ExonCoord,
|
659
|
+
is_seg_start: bool = False,
|
660
|
+
) -> tuple[TxSegment | None, str | None]:
|
661
|
+
"""Get transcript segment data given ``genomic_ac`` and offset data
|
662
|
+
|
663
|
+
:param genomic_ac: Genomic RefSeq accession
|
664
|
+
:param strand: Strand
|
665
|
+
:param offset: Exon offset
|
666
|
+
:param genomic_ac_data: Exon coordinate data for ``genomic_ac``
|
667
|
+
:param is_seg_start: ``True`` if retrieving genomic data where the transcript
|
668
|
+
segment starts, defaults to ``False``
|
669
|
+
:return: Transcript segment data
|
670
|
+
"""
|
671
|
+
if is_seg_start:
|
672
|
+
if strand == Strand.POSITIVE:
|
673
|
+
seg_genomic_pos = offset + genomic_ac_data.alt_start_i
|
674
|
+
else:
|
675
|
+
seg_genomic_pos = genomic_ac_data.alt_end_i - offset
|
676
|
+
else:
|
677
|
+
if strand == Strand.POSITIVE:
|
678
|
+
seg_genomic_pos = offset + genomic_ac_data.alt_end_i
|
679
|
+
else:
|
680
|
+
seg_genomic_pos = genomic_ac_data.alt_start_i - offset
|
681
|
+
|
682
|
+
genomic_loc, err_msg = self._get_vrs_seq_loc(
|
683
|
+
genomic_ac,
|
684
|
+
seg_genomic_pos,
|
685
|
+
is_seg_start=is_seg_start,
|
686
|
+
strand=strand,
|
687
|
+
)
|
688
|
+
if err_msg:
|
689
|
+
return None, err_msg
|
690
|
+
|
691
|
+
return TxSegment(
|
692
|
+
exon_ord=genomic_ac_data.ord,
|
693
|
+
genomic_location=genomic_loc,
|
694
|
+
offset=offset,
|
695
|
+
), None
|
696
|
+
|
697
|
+
def _get_vrs_seq_loc(
|
698
|
+
self, genomic_ac: str, genomic_pos: int, is_seg_start: bool, strand: Strand
|
699
|
+
) -> tuple[SequenceLocation | None, str | None]:
|
700
|
+
"""Create VRS Sequence Location for genomic position where transcript segment
|
701
|
+
occurs
|
702
|
+
|
703
|
+
:param genomic_ac: RefSeq genomic accession
|
704
|
+
:param genomic_pos: Genomic position where the transcript segment occurs
|
705
|
+
:param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment
|
706
|
+
starts. ``False`` if ``genomic_pos`` is where the transcript segment ends.
|
707
|
+
:param strand: Strand
|
708
|
+
:return: Tuple containing VRS location (if successful) and error message (if
|
709
|
+
unable to get GA4GH identifier for ``genomic_ac``).
|
710
|
+
"""
|
711
|
+
ga4gh_seq_id, err_msg = self.seqrepo_access.translate_identifier(
|
712
|
+
genomic_ac, "ga4gh"
|
713
|
+
)
|
714
|
+
if err_msg:
|
715
|
+
return None, err_msg
|
716
|
+
|
717
|
+
use_start = (
|
718
|
+
strand == Strand.POSITIVE if is_seg_start else strand != Strand.POSITIVE
|
719
|
+
)
|
720
|
+
|
721
|
+
return SequenceLocation(
|
722
|
+
sequenceReference=SequenceReference(
|
723
|
+
refgetAccession=ga4gh_seq_id[0].split("ga4gh:")[-1]
|
724
|
+
),
|
725
|
+
start=genomic_pos if use_start else None,
|
726
|
+
end=genomic_pos if not use_start else None,
|
727
|
+
), None
|
659
728
|
|
660
729
|
async def _genomic_to_tx_segment(
|
661
730
|
self,
|
@@ -665,7 +734,7 @@ class ExonGenomicCoordsMapper:
|
|
665
734
|
transcript: str | None = None,
|
666
735
|
gene: str | None = None,
|
667
736
|
get_nearest_transcript_junction: bool = False,
|
668
|
-
|
737
|
+
is_seg_start: bool = True,
|
669
738
|
) -> GenomicTxSeg:
|
670
739
|
"""Given genomic data, generate a boundary for a transcript segment.
|
671
740
|
|
@@ -684,7 +753,7 @@ class ExonGenomicCoordsMapper:
|
|
684
753
|
:param transcript: The transcript to use. If this is not given, we will try the
|
685
754
|
following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining
|
686
755
|
Compatible Transcript
|
687
|
-
:param gene: HGNC gene symbol
|
756
|
+
:param gene: Valid, case-sensitive HGNC gene symbol
|
688
757
|
:param get_nearest_transcript_junction: If ``True``, this will return the
|
689
758
|
adjacent exon if the position specified by``seg_start_genomic`` or
|
690
759
|
``seg_end_genomic`` does not occur on an exon. For the positive strand, adjacent
|
@@ -692,17 +761,17 @@ class ExonGenomicCoordsMapper:
|
|
692
761
|
following the breakpoint for the 3' end. For the negative strand, adjacent
|
693
762
|
is defined as the exon following the breakpoint for the 5' end and the exon
|
694
763
|
preceding the breakpoint for the 3' end.
|
695
|
-
:param
|
764
|
+
:param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment starts.
|
696
765
|
``False`` if ``genomic_pos`` is where the transcript segment ends.
|
697
766
|
:return: Data for a transcript segment boundary (inter-residue coordinates)
|
698
767
|
"""
|
699
768
|
params = {key: None for key in GenomicTxSeg.model_fields}
|
700
769
|
|
701
770
|
if get_nearest_transcript_junction:
|
702
|
-
if not gene:
|
771
|
+
if not gene and not transcript:
|
703
772
|
return GenomicTxSeg(
|
704
773
|
errors=[
|
705
|
-
"`gene` must be provided to select the adjacent transcript junction"
|
774
|
+
"`gene` or `transcript` must be provided to select the adjacent transcript junction"
|
706
775
|
]
|
707
776
|
)
|
708
777
|
|
@@ -773,8 +842,8 @@ class ExonGenomicCoordsMapper:
|
|
773
842
|
exon_num = self._get_adjacent_exon(
|
774
843
|
tx_exons_genomic_coords=tx_exons,
|
775
844
|
strand=strand,
|
776
|
-
start=genomic_pos if
|
777
|
-
end=genomic_pos if not
|
845
|
+
start=genomic_pos if is_seg_start else None,
|
846
|
+
end=genomic_pos if not is_seg_start else None,
|
778
847
|
)
|
779
848
|
|
780
849
|
offset = self._get_exon_offset(
|
@@ -782,19 +851,26 @@ class ExonGenomicCoordsMapper:
|
|
782
851
|
end_i=tx_exons[exon_num].alt_end_i,
|
783
852
|
strand=strand,
|
784
853
|
use_start_i=strand == Strand.POSITIVE
|
785
|
-
if
|
854
|
+
if is_seg_start
|
786
855
|
else strand != Strand.POSITIVE,
|
787
856
|
is_in_exon=False,
|
788
|
-
start=genomic_pos if
|
789
|
-
end=genomic_pos if not
|
857
|
+
start=genomic_pos if is_seg_start else None,
|
858
|
+
end=genomic_pos if not is_seg_start else None,
|
790
859
|
)
|
791
860
|
|
792
861
|
genomic_location, err_msg = self._get_vrs_seq_loc(
|
793
|
-
genomic_ac, genomic_pos,
|
862
|
+
genomic_ac, genomic_pos, is_seg_start, strand
|
794
863
|
)
|
795
864
|
if err_msg:
|
796
865
|
return GenomicTxSeg(errors=[err_msg])
|
797
866
|
|
867
|
+
# gene is not required to liftover coordinates if tx_ac and genomic_ac are given, but we should set the associated gene
|
868
|
+
if not gene:
|
869
|
+
_gene, err_msg = await self._get_tx_ac_gene(transcript)
|
870
|
+
if err_msg:
|
871
|
+
return GenomicTxSeg(errors=[err_msg])
|
872
|
+
gene = _gene
|
873
|
+
|
798
874
|
return GenomicTxSeg(
|
799
875
|
gene=gene,
|
800
876
|
genomic_ac=genomic_ac,
|
@@ -807,20 +883,17 @@ class ExonGenomicCoordsMapper:
|
|
807
883
|
)
|
808
884
|
|
809
885
|
if genomic_ac:
|
810
|
-
# Check if valid accession is given
|
811
|
-
if not await self.uta_db.validate_genomic_ac(genomic_ac):
|
812
|
-
return GenomicTxSeg(errors=[f"Invalid genomic accession: {genomic_ac}"])
|
813
|
-
|
814
886
|
_gene, err_msg = await self._get_genomic_ac_gene(genomic_pos, genomic_ac)
|
815
|
-
if _gene:
|
816
|
-
if gene and _gene != gene:
|
817
|
-
return GenomicTxSeg(
|
818
|
-
errors=[f"Expected gene, {gene}, but found {_gene}"]
|
819
|
-
)
|
820
887
|
|
821
|
-
|
822
|
-
else:
|
888
|
+
if err_msg:
|
823
889
|
return GenomicTxSeg(errors=[err_msg])
|
890
|
+
|
891
|
+
if gene and _gene != gene:
|
892
|
+
return GenomicTxSeg(
|
893
|
+
errors=[f"Expected gene, {gene}, but found {_gene}"]
|
894
|
+
)
|
895
|
+
|
896
|
+
gene = _gene
|
824
897
|
elif chromosome:
|
825
898
|
# Try GRCh38 first
|
826
899
|
for assembly in [Assembly.GRCH38.value, Assembly.GRCH37.value]:
|
@@ -858,7 +931,7 @@ class ExonGenomicCoordsMapper:
|
|
858
931
|
)
|
859
932
|
|
860
933
|
return await self._get_tx_seg_genomic_metadata(
|
861
|
-
genomic_ac, genomic_pos,
|
934
|
+
genomic_ac, genomic_pos, is_seg_start, gene, tx_ac=transcript
|
862
935
|
)
|
863
936
|
|
864
937
|
async def _get_grch38_ac_pos(
|
@@ -943,85 +1016,37 @@ class ExonGenomicCoordsMapper:
|
|
943
1016
|
|
944
1017
|
return results[0]["hgnc"], None
|
945
1018
|
|
946
|
-
def
|
1019
|
+
async def _get_tx_ac_gene(
|
947
1020
|
self,
|
948
|
-
|
949
|
-
|
950
|
-
|
951
|
-
genomic_ac_data: ExonCoord,
|
952
|
-
is_seg_start: bool = False,
|
953
|
-
) -> tuple[TxSegment | None, str | None]:
|
954
|
-
"""Get transcript segment data given ``genomic_ac`` and offset data
|
955
|
-
|
956
|
-
:param genomic_ac: Genomic RefSeq accession
|
957
|
-
:param strand: Strand
|
958
|
-
:param offset: Exon offset
|
959
|
-
:param genomic_ac_data: Exon coordinate data for ``genomic_ac``
|
960
|
-
:param is_seg_start: ``True`` if retrieving genomic data where the transcript
|
961
|
-
segment starts, defaults to ``False``
|
962
|
-
:return: Transcript segment data
|
963
|
-
"""
|
964
|
-
if is_seg_start:
|
965
|
-
if strand == Strand.POSITIVE:
|
966
|
-
seg_genomic_pos = offset + genomic_ac_data.alt_start_i
|
967
|
-
else:
|
968
|
-
seg_genomic_pos = genomic_ac_data.alt_end_i - offset
|
969
|
-
else:
|
970
|
-
if strand == Strand.POSITIVE:
|
971
|
-
seg_genomic_pos = offset + genomic_ac_data.alt_end_i
|
972
|
-
else:
|
973
|
-
seg_genomic_pos = genomic_ac_data.alt_start_i - offset
|
974
|
-
|
975
|
-
genomic_loc, err_msg = self._get_vrs_seq_loc(
|
976
|
-
genomic_ac,
|
977
|
-
seg_genomic_pos,
|
978
|
-
is_start=is_seg_start,
|
979
|
-
strand=strand,
|
980
|
-
)
|
981
|
-
if err_msg:
|
982
|
-
return None, err_msg
|
983
|
-
|
984
|
-
return TxSegment(
|
985
|
-
exon_ord=genomic_ac_data.ord,
|
986
|
-
genomic_location=genomic_loc,
|
987
|
-
offset=offset,
|
988
|
-
), None
|
1021
|
+
tx_ac: str,
|
1022
|
+
) -> tuple[str | None, str | None]:
|
1023
|
+
"""Get gene given a transcript.
|
989
1024
|
|
990
|
-
|
991
|
-
|
992
|
-
) -> tuple[SequenceLocation | None, str | None]:
|
993
|
-
"""Create VRS Sequence Location for genomic position where transcript segment
|
994
|
-
occurs
|
1025
|
+
If multiple genes are found for a given ``tx_ac``, only one
|
1026
|
+
gene will be returned.
|
995
1027
|
|
996
|
-
:param
|
997
|
-
:
|
998
|
-
|
999
|
-
starts. ``False`` if ``genomic_pos`` is where the transcript segment ends.
|
1000
|
-
:param strand: Strand
|
1001
|
-
:return: Tuple containing VRS location (if successful) and error message (if
|
1002
|
-
unable to get GA4GH identifier for ``genomic_ac``).
|
1028
|
+
:param tx_ac: RefSeq transcript, e.g. ``"NM_004333.6"``
|
1029
|
+
:return: HGNC gene symbol associated to transcript and
|
1030
|
+
warning
|
1003
1031
|
"""
|
1004
|
-
|
1005
|
-
|
1006
|
-
|
1007
|
-
|
1008
|
-
|
1009
|
-
|
1010
|
-
|
1032
|
+
query = f"""
|
1033
|
+
SELECT DISTINCT hgnc
|
1034
|
+
FROM {self.uta_db.schema}.tx_exon_aln_v
|
1035
|
+
WHERE tx_ac = '{tx_ac}'
|
1036
|
+
ORDER BY hgnc
|
1037
|
+
LIMIT 1;
|
1038
|
+
""" # noqa: S608
|
1039
|
+
results = await self.uta_db.execute_query(query)
|
1040
|
+
if not results:
|
1041
|
+
return None, f"No gene(s) found given {tx_ac}"
|
1011
1042
|
|
1012
|
-
return
|
1013
|
-
sequenceReference=SequenceReference(
|
1014
|
-
refgetAccession=ga4gh_seq_id[0].split("ga4gh:")[-1]
|
1015
|
-
),
|
1016
|
-
start=genomic_pos if use_start else None,
|
1017
|
-
end=genomic_pos if not use_start else None,
|
1018
|
-
), None
|
1043
|
+
return results[0]["hgnc"], None
|
1019
1044
|
|
1020
1045
|
async def _get_tx_seg_genomic_metadata(
|
1021
1046
|
self,
|
1022
1047
|
genomic_ac: str,
|
1023
1048
|
genomic_pos: int,
|
1024
|
-
|
1049
|
+
is_seg_start: bool,
|
1025
1050
|
gene: str,
|
1026
1051
|
tx_ac: str | None,
|
1027
1052
|
) -> GenomicTxSeg:
|
@@ -1034,8 +1059,8 @@ class ExonGenomicCoordsMapper:
|
|
1034
1059
|
|
1035
1060
|
:param genomic_ac: Genomic RefSeq accession
|
1036
1061
|
:param genomic_pos: Genomic position where the transcript segment occurs
|
1037
|
-
:param
|
1038
|
-
:param gene: HGNC gene symbol
|
1062
|
+
:param is_seg_start: Whether or not ``genomic_pos`` represents the start position.
|
1063
|
+
:param gene: Valid, case-sensitive HGNC gene symbol
|
1039
1064
|
:param tx_ac: Transcript RefSeq accession. If not provided, will use MANE
|
1040
1065
|
transcript
|
1041
1066
|
:return: Transcript segment data and associated genomic metadata
|
@@ -1092,12 +1117,12 @@ class ExonGenomicCoordsMapper:
|
|
1092
1117
|
strand=Strand(tx_exon_aln_data.alt_strand),
|
1093
1118
|
use_start_i=False, # This doesn't impact anything since we're on the exon
|
1094
1119
|
is_in_exon=True,
|
1095
|
-
start=genomic_pos if
|
1096
|
-
end=genomic_pos if not
|
1120
|
+
start=genomic_pos if is_seg_start else None,
|
1121
|
+
end=genomic_pos if not is_seg_start else None,
|
1097
1122
|
)
|
1098
1123
|
|
1099
1124
|
genomic_location, err_msg = self._get_vrs_seq_loc(
|
1100
|
-
genomic_ac, genomic_pos,
|
1125
|
+
genomic_ac, genomic_pos, is_seg_start, tx_exon_aln_data.alt_strand
|
1101
1126
|
)
|
1102
1127
|
if err_msg:
|
1103
1128
|
return GenomicTxSeg(errors=[err_msg])
|
@@ -1114,46 +1139,20 @@ class ExonGenomicCoordsMapper:
|
|
1114
1139
|
)
|
1115
1140
|
|
1116
1141
|
@staticmethod
|
1117
|
-
def
|
1118
|
-
|
1119
|
-
end_i: int,
|
1120
|
-
strand: Strand,
|
1121
|
-
use_start_i: bool = True,
|
1122
|
-
is_in_exon: bool = True,
|
1123
|
-
start: int | None = None,
|
1124
|
-
end: int | None = None,
|
1125
|
-
) -> int:
|
1126
|
-
"""Compute offset from exon start or end index
|
1142
|
+
def _is_exonic_breakpoint(pos: int, tx_genomic_coords: list[_ExonCoord]) -> bool:
|
1143
|
+
"""Check if a breakpoint occurs on an exon
|
1127
1144
|
|
1128
|
-
:param
|
1129
|
-
:param
|
1130
|
-
:
|
1131
|
-
:param use_start_i: Whether or not ``start_i`` should be used to compute the
|
1132
|
-
offset, defaults to ``True``. This is only used when ``is_in_exon`` is
|
1133
|
-
``False``.
|
1134
|
-
:param is_in_exon: Whether or not the position occurs in an exon, defaults to
|
1135
|
-
``True``
|
1136
|
-
:param start: Provided start position, defaults to ``None``. Must provide
|
1137
|
-
``start`` or ``end``, not both.
|
1138
|
-
:param end: Provided end position, defaults to ``None``. Must provide ``start``
|
1139
|
-
or ``end``, not both
|
1140
|
-
:return: Offset from exon start or end index
|
1145
|
+
:param pos: Genomic breakpoint
|
1146
|
+
:param tx_genomic_coords: A list of transcript exon coordinate data
|
1147
|
+
:return: ``True`` if the breakpoint occurs on an exon
|
1141
1148
|
"""
|
1142
|
-
|
1143
|
-
|
1144
|
-
|
1145
|
-
else:
|
1146
|
-
offset = end - end_i if strand == Strand.POSITIVE else start_i - end
|
1147
|
-
else:
|
1148
|
-
if strand == Strand.POSITIVE:
|
1149
|
-
offset = start - start_i if use_start_i else end - end_i
|
1150
|
-
else:
|
1151
|
-
offset = start_i - end if use_start_i else end_i - start
|
1152
|
-
return offset
|
1149
|
+
return any(
|
1150
|
+
exon.alt_start_i <= pos <= exon.alt_end_i for exon in tx_genomic_coords
|
1151
|
+
)
|
1153
1152
|
|
1154
1153
|
@staticmethod
|
1155
1154
|
def _get_adjacent_exon(
|
1156
|
-
tx_exons_genomic_coords: list[
|
1155
|
+
tx_exons_genomic_coords: list[_ExonCoord],
|
1157
1156
|
strand: Strand,
|
1158
1157
|
start: int | None = None,
|
1159
1158
|
end: int | None = None,
|
@@ -1191,13 +1190,39 @@ class ExonGenomicCoordsMapper:
|
|
1191
1190
|
return exon.ord if end else exon.ord + 1
|
1192
1191
|
|
1193
1192
|
@staticmethod
|
1194
|
-
def
|
1195
|
-
|
1193
|
+
def _get_exon_offset(
|
1194
|
+
start_i: int,
|
1195
|
+
end_i: int,
|
1196
|
+
strand: Strand,
|
1197
|
+
use_start_i: bool = True,
|
1198
|
+
is_in_exon: bool = True,
|
1199
|
+
start: int | None = None,
|
1200
|
+
end: int | None = None,
|
1201
|
+
) -> int:
|
1202
|
+
"""Compute offset from exon start or end index
|
1196
1203
|
|
1197
|
-
:param
|
1198
|
-
:param
|
1199
|
-
:
|
1204
|
+
:param start_i: Exon start index (inter-residue)
|
1205
|
+
:param end_i: Exon end index (inter-residue)
|
1206
|
+
:param strand: Strand
|
1207
|
+
:param use_start_i: Whether or not ``start_i`` should be used to compute the
|
1208
|
+
offset, defaults to ``True``. This is only used when ``is_in_exon`` is
|
1209
|
+
``False``.
|
1210
|
+
:param is_in_exon: Whether or not the position occurs in an exon, defaults to
|
1211
|
+
``True``
|
1212
|
+
:param start: Provided start position, defaults to ``None``. Must provide
|
1213
|
+
``start`` or ``end``, not both.
|
1214
|
+
:param end: Provided end position, defaults to ``None``. Must provide ``start``
|
1215
|
+
or ``end``, not both
|
1216
|
+
:return: Offset from exon start or end index
|
1200
1217
|
"""
|
1201
|
-
|
1202
|
-
|
1203
|
-
|
1218
|
+
if is_in_exon:
|
1219
|
+
if start is not None:
|
1220
|
+
offset = start - start_i if strand == Strand.POSITIVE else end_i - start
|
1221
|
+
else:
|
1222
|
+
offset = end - end_i if strand == Strand.POSITIVE else start_i - end
|
1223
|
+
else:
|
1224
|
+
if strand == Strand.POSITIVE:
|
1225
|
+
offset = start - start_i if use_start_i else end - end_i
|
1226
|
+
else:
|
1227
|
+
offset = start_i - end if use_start_i else end_i - start
|
1228
|
+
return offset
|
@@ -61,7 +61,9 @@ class ManeTranscriptMappings:
|
|
61
61
|
location information). The list is sorted so that a MANE Select entry comes
|
62
62
|
first, followed by a MANE Plus Clinical entry, if available.
|
63
63
|
"""
|
64
|
-
data = self.df.filter(
|
64
|
+
data = self.df.filter(
|
65
|
+
pl.col("symbol").str.to_uppercase() == gene_symbol.upper()
|
66
|
+
)
|
65
67
|
|
66
68
|
if len(data) == 0:
|
67
69
|
_logger.warning(
|
@@ -6,7 +6,7 @@ cool_seq_tool/handlers/__init__.py,sha256=KalQ46vX1MO4SJz2SlspKoIRy1n3c3Vp1t4Y2p
|
|
6
6
|
cool_seq_tool/handlers/seqrepo_access.py,sha256=Jd19jbdUvPRPn_XWozL67ph-nSIxpb4_UUimapDrsm4,9162
|
7
7
|
cool_seq_tool/mappers/__init__.py,sha256=O0JRxNFk8nWxD4v5ij47xelhvfVLdEXS43l2tzRuiUE,305
|
8
8
|
cool_seq_tool/mappers/alignment.py,sha256=nV6PS3mhkQ2MD1GcpNBujBOqd3AKxYSYA9BCusFOa1o,9636
|
9
|
-
cool_seq_tool/mappers/exon_genomic_coords.py,sha256=
|
9
|
+
cool_seq_tool/mappers/exon_genomic_coords.py,sha256=XoG60ha1JDoAI-vlc-0rh3tFSyKx61u49hVyeBpjPME,48836
|
10
10
|
cool_seq_tool/mappers/liftover.py,sha256=lltx9zxfkrb5PHtJlKp3a39JCwPP4e0Zft-mQc1jXL8,3367
|
11
11
|
cool_seq_tool/mappers/mane_transcript.py,sha256=nirxlf3EGVInFYG4fsAqiEmDdTc_h1XuPyX2ul-a7Rk,54368
|
12
12
|
cool_seq_tool/resources/__init__.py,sha256=VwUC8YaucTS6SmRirToulZTF6CuvuLQRSxFfSfAovCc,77
|
@@ -14,11 +14,11 @@ cool_seq_tool/resources/data_files.py,sha256=3lhu28tzlSoTs4vHZNu-hhoAWRrPGuZj_oI
|
|
14
14
|
cool_seq_tool/resources/status.py,sha256=L0KM-VG3N4Yuaqh3AKZd_2KPDLR0Y7rvW_OD6x8mF7A,5717
|
15
15
|
cool_seq_tool/resources/transcript_mapping.tsv,sha256=AO3luYQAbFiCoRgiiPXotakb5pAwx1jDCeXpvGdIuac,24138769
|
16
16
|
cool_seq_tool/sources/__init__.py,sha256=51QiymeptF7AeVGgV-tW_9f4pIUr0xtYbyzpvHOCneM,304
|
17
|
-
cool_seq_tool/sources/mane_transcript_mappings.py,sha256=
|
17
|
+
cool_seq_tool/sources/mane_transcript_mappings.py,sha256=Q6J57O2lLWXlgKT0zq3BIwkwFawySnORHOX-UxzfyDE,5399
|
18
18
|
cool_seq_tool/sources/transcript_mappings.py,sha256=903RKTMBO2rbKh6iTQ1BEWnY4C7saBFMPw2_4ATuudg,10054
|
19
19
|
cool_seq_tool/sources/uta_database.py,sha256=gc5wsKOIhvzhwFmPmqOY0hhaVfRkRSzYNa9tpBt81_U,35017
|
20
|
-
cool_seq_tool-0.
|
21
|
-
cool_seq_tool-0.
|
22
|
-
cool_seq_tool-0.
|
23
|
-
cool_seq_tool-0.
|
24
|
-
cool_seq_tool-0.
|
20
|
+
cool_seq_tool-0.8.0.dist-info/LICENSE,sha256=IpqC9A-tZW7XXXvCS8c4AVINqkmpxiVA-34Qe3CZSjo,1072
|
21
|
+
cool_seq_tool-0.8.0.dist-info/METADATA,sha256=moL8cCRR-wPQh3t9kJviAVRuvAapZJ40-8Ea8pjIRic,6226
|
22
|
+
cool_seq_tool-0.8.0.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
|
23
|
+
cool_seq_tool-0.8.0.dist-info/top_level.txt,sha256=cGuxdN6p3y16jQf6hCwWhE4OptwUeZPm_PNJlPb3b0k,14
|
24
|
+
cool_seq_tool-0.8.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|