cool-seq-tool 0.7.0__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cool_seq_tool/mappers/exon_genomic_coords.py +256 -229
- {cool_seq_tool-0.7.0.dist-info → cool_seq_tool-0.7.1.dist-info}/METADATA +1 -1
- {cool_seq_tool-0.7.0.dist-info → cool_seq_tool-0.7.1.dist-info}/RECORD +6 -6
- {cool_seq_tool-0.7.0.dist-info → cool_seq_tool-0.7.1.dist-info}/WHEEL +1 -1
- {cool_seq_tool-0.7.0.dist-info → cool_seq_tool-0.7.1.dist-info}/LICENSE +0 -0
- {cool_seq_tool-0.7.0.dist-info → cool_seq_tool-0.7.1.dist-info}/top_level.txt +0 -0
@@ -14,13 +14,13 @@ from cool_seq_tool.schemas import (
|
|
14
14
|
Strand,
|
15
15
|
)
|
16
16
|
from cool_seq_tool.sources.mane_transcript_mappings import ManeTranscriptMappings
|
17
|
-
from cool_seq_tool.sources.uta_database import UtaDatabase
|
17
|
+
from cool_seq_tool.sources.uta_database import GenomicAlnData, UtaDatabase
|
18
18
|
from cool_seq_tool.utils import service_meta
|
19
19
|
|
20
20
|
_logger = logging.getLogger(__name__)
|
21
21
|
|
22
22
|
|
23
|
-
class
|
23
|
+
class _ExonCoord(BaseModelForbidExtra):
|
24
24
|
"""Model for representing exon coordinate data"""
|
25
25
|
|
26
26
|
ord: StrictInt = Field(..., description="Exon number. 0-based.")
|
@@ -97,19 +97,18 @@ class GenomicTxSeg(BaseModelForbidExtra):
|
|
97
97
|
"""Ensure that fields are (un)set depending on errors
|
98
98
|
|
99
99
|
:param values: Values in model
|
100
|
-
:raises ValueError: If `seg`, `
|
100
|
+
:raises ValueError: If `seg`, `genomic_ac` and `tx_ac` are not
|
101
101
|
provided when there are no errors
|
102
102
|
:return: Values in model
|
103
103
|
"""
|
104
104
|
if not values.get("errors") and not all(
|
105
105
|
(
|
106
106
|
values.get("seg"),
|
107
|
-
values.get("gene"),
|
108
107
|
values.get("genomic_ac"),
|
109
108
|
values.get("tx_ac"),
|
110
109
|
)
|
111
110
|
):
|
112
|
-
err_msg = "`seg`, `
|
111
|
+
err_msg = "`seg`, `genomic_ac` and `tx_ac` must be provided"
|
113
112
|
raise ValueError(err_msg)
|
114
113
|
return values
|
115
114
|
|
@@ -154,20 +153,21 @@ class GenomicTxSegService(BaseModelForbidExtra):
|
|
154
153
|
on errors
|
155
154
|
|
156
155
|
:param values: Values in model
|
157
|
-
:raises ValueError: If `
|
156
|
+
:raises ValueError: If `genomic_ac`, `tx_ac` and `seg_start` or `seg_end`
|
158
157
|
not provided when there are no errors
|
159
158
|
:return: Values in model, including service metadata
|
160
159
|
"""
|
161
160
|
values["service_meta"] = service_meta()
|
162
161
|
if not values.get("errors") and not all(
|
163
162
|
(
|
164
|
-
values.get("gene"),
|
165
163
|
values.get("genomic_ac"),
|
166
164
|
values.get("tx_ac"),
|
167
165
|
values.get("seg_start") or values.get("seg_end"),
|
168
166
|
)
|
169
167
|
):
|
170
|
-
err_msg =
|
168
|
+
err_msg = (
|
169
|
+
"`genomic_ac`, `tx_ac` and `seg_start` or `seg_end` must be provided"
|
170
|
+
)
|
171
171
|
raise ValueError(err_msg)
|
172
172
|
|
173
173
|
return values
|
@@ -340,17 +340,20 @@ class ExonGenomicCoordsMapper:
|
|
340
340
|
|
341
341
|
# Get aligned genomic data (hgnc gene, alt_ac, alt_start_i, alt_end_i, strand)
|
342
342
|
# for exon(s)
|
343
|
-
|
343
|
+
(
|
344
|
+
genomic_aln_start,
|
345
|
+
genomic_aln_end,
|
346
|
+
err_msg,
|
347
|
+
) = await self._get_genomic_aln_coords(
|
344
348
|
transcript, tx_exon_start_coords, tx_exon_end_coords, gene=gene
|
345
349
|
)
|
346
|
-
if
|
347
|
-
return _return_service_errors([err_msg]
|
348
|
-
alt_ac_start_data, alt_ac_end_data = alt_ac_start_end
|
350
|
+
if err_msg:
|
351
|
+
return _return_service_errors([err_msg])
|
349
352
|
|
350
353
|
# Get gene and chromosome data, check that at least one was retrieved
|
351
|
-
gene =
|
354
|
+
gene = genomic_aln_start.hgnc if genomic_aln_start else genomic_aln_end.hgnc
|
352
355
|
genomic_ac = (
|
353
|
-
|
356
|
+
genomic_aln_start.alt_ac if genomic_aln_start else genomic_aln_end.alt_ac
|
354
357
|
)
|
355
358
|
if gene is None or genomic_ac is None:
|
356
359
|
return _return_service_errors(
|
@@ -360,9 +363,9 @@ class ExonGenomicCoordsMapper:
|
|
360
363
|
)
|
361
364
|
|
362
365
|
strand = (
|
363
|
-
Strand(
|
364
|
-
if
|
365
|
-
else Strand(
|
366
|
+
Strand(genomic_aln_start.alt_strand)
|
367
|
+
if genomic_aln_start
|
368
|
+
else Strand(genomic_aln_end.alt_strand)
|
366
369
|
)
|
367
370
|
|
368
371
|
if exon_start_exists:
|
@@ -370,7 +373,7 @@ class ExonGenomicCoordsMapper:
|
|
370
373
|
genomic_ac,
|
371
374
|
strand,
|
372
375
|
exon_start_offset,
|
373
|
-
|
376
|
+
genomic_aln_start,
|
374
377
|
is_seg_start=True,
|
375
378
|
)
|
376
379
|
if err_msg:
|
@@ -380,7 +383,11 @@ class ExonGenomicCoordsMapper:
|
|
380
383
|
|
381
384
|
if exon_end_exists:
|
382
385
|
seg_end, err_msg = self._get_tx_segment(
|
383
|
-
genomic_ac,
|
386
|
+
genomic_ac,
|
387
|
+
strand,
|
388
|
+
exon_end_offset,
|
389
|
+
genomic_aln_end,
|
390
|
+
is_seg_start=False,
|
384
391
|
)
|
385
392
|
if err_msg:
|
386
393
|
return _return_service_errors([err_msg])
|
@@ -479,7 +486,7 @@ class ExonGenomicCoordsMapper:
|
|
479
486
|
transcript=transcript,
|
480
487
|
gene=gene,
|
481
488
|
get_nearest_transcript_junction=get_nearest_transcript_junction,
|
482
|
-
|
489
|
+
is_seg_start=True,
|
483
490
|
)
|
484
491
|
if start_tx_seg_data.errors:
|
485
492
|
return _return_service_errors(start_tx_seg_data.errors)
|
@@ -499,7 +506,7 @@ class ExonGenomicCoordsMapper:
|
|
499
506
|
transcript=transcript,
|
500
507
|
gene=gene,
|
501
508
|
get_nearest_transcript_junction=get_nearest_transcript_junction,
|
502
|
-
|
509
|
+
is_seg_start=False,
|
503
510
|
)
|
504
511
|
if end_tx_seg_data.errors:
|
505
512
|
return _return_service_errors(end_tx_seg_data.errors)
|
@@ -525,53 +532,13 @@ class ExonGenomicCoordsMapper:
|
|
525
532
|
|
526
533
|
return GenomicTxSegService(**params)
|
527
534
|
|
528
|
-
async def _get_all_exon_coords(
|
529
|
-
self, tx_ac: str, genomic_ac: str | None = None
|
530
|
-
) -> list[ExonCoord]:
|
531
|
-
"""Get all exon coordinate data for a transcript.
|
532
|
-
|
533
|
-
If ``genomic_ac`` is NOT provided, this method will use the GRCh38 accession
|
534
|
-
associated to ``tx_ac``.
|
535
|
-
|
536
|
-
:param tx_ac: The RefSeq transcript accession to get exon data for.
|
537
|
-
:param genomic_ac: The RefSeq genomic accession to get exon data for.
|
538
|
-
:return: List of all exon coordinate data for ``tx_ac`` and ``genomic_ac``.
|
539
|
-
The exon coordinate data will include the exon number, transcript and
|
540
|
-
genomic positions for the start and end of the exon, and strand.
|
541
|
-
The list will be ordered by ascending exon number.
|
542
|
-
"""
|
543
|
-
if genomic_ac:
|
544
|
-
query = f"""
|
545
|
-
SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
|
546
|
-
FROM {self.uta_db.schema}.tx_exon_aln_v
|
547
|
-
WHERE tx_ac = '{tx_ac}'
|
548
|
-
AND alt_aln_method = 'splign'
|
549
|
-
AND alt_ac = '{genomic_ac}'
|
550
|
-
ORDER BY ord ASC
|
551
|
-
""" # noqa: S608
|
552
|
-
else:
|
553
|
-
query = f"""
|
554
|
-
SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
|
555
|
-
FROM {self.uta_db.schema}.tx_exon_aln_v as t
|
556
|
-
INNER JOIN {self.uta_db.schema}._seq_anno_most_recent as s
|
557
|
-
ON t.alt_ac = s.ac
|
558
|
-
WHERE s.descr = ''
|
559
|
-
AND t.tx_ac = '{tx_ac}'
|
560
|
-
AND t.alt_aln_method = 'splign'
|
561
|
-
AND t.alt_ac like 'NC_000%'
|
562
|
-
ORDER BY ord ASC
|
563
|
-
""" # noqa: S608
|
564
|
-
|
565
|
-
results = await self.uta_db.execute_query(query)
|
566
|
-
return [ExonCoord(**r) for r in results]
|
567
|
-
|
568
535
|
async def _get_start_end_exon_coords(
|
569
536
|
self,
|
570
537
|
tx_ac: str,
|
571
538
|
exon_start: int | None = None,
|
572
539
|
exon_end: int | None = None,
|
573
540
|
genomic_ac: str | None = None,
|
574
|
-
) -> tuple[
|
541
|
+
) -> tuple[_ExonCoord | None, _ExonCoord | None, list[str]]:
|
575
542
|
"""Get exon coordinates for a transcript given exon start and exon end.
|
576
543
|
|
577
544
|
If ``genomic_ac`` is NOT provided, this method will use the GRCh38 accession
|
@@ -606,56 +573,160 @@ class ExonGenomicCoordsMapper:
|
|
606
573
|
|
607
574
|
return *start_end_exons, errors
|
608
575
|
|
609
|
-
async def
|
576
|
+
async def _get_all_exon_coords(
|
577
|
+
self, tx_ac: str, genomic_ac: str | None = None
|
578
|
+
) -> list[_ExonCoord]:
|
579
|
+
"""Get all exon coordinate data for a transcript.
|
580
|
+
|
581
|
+
If ``genomic_ac`` is NOT provided, this method will use the GRCh38 accession
|
582
|
+
associated to ``tx_ac``.
|
583
|
+
|
584
|
+
:param tx_ac: The RefSeq transcript accession to get exon data for.
|
585
|
+
:param genomic_ac: The RefSeq genomic accession to get exon data for.
|
586
|
+
:return: List of all exon coordinate data for ``tx_ac`` and ``genomic_ac``.
|
587
|
+
The exon coordinate data will include the exon number, transcript and
|
588
|
+
genomic positions for the start and end of the exon, and strand.
|
589
|
+
The list will be ordered by ascending exon number.
|
590
|
+
"""
|
591
|
+
if genomic_ac:
|
592
|
+
query = f"""
|
593
|
+
SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
|
594
|
+
FROM {self.uta_db.schema}.tx_exon_aln_v
|
595
|
+
WHERE tx_ac = '{tx_ac}'
|
596
|
+
AND alt_aln_method = 'splign'
|
597
|
+
AND alt_ac = '{genomic_ac}'
|
598
|
+
ORDER BY ord ASC
|
599
|
+
""" # noqa: S608
|
600
|
+
else:
|
601
|
+
query = f"""
|
602
|
+
SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
|
603
|
+
FROM {self.uta_db.schema}.tx_exon_aln_v as t
|
604
|
+
INNER JOIN {self.uta_db.schema}._seq_anno_most_recent as s
|
605
|
+
ON t.alt_ac = s.ac
|
606
|
+
WHERE s.descr = ''
|
607
|
+
AND t.tx_ac = '{tx_ac}'
|
608
|
+
AND t.alt_aln_method = 'splign'
|
609
|
+
AND t.alt_ac like 'NC_000%'
|
610
|
+
ORDER BY ord ASC
|
611
|
+
""" # noqa: S608
|
612
|
+
|
613
|
+
results = await self.uta_db.execute_query(query)
|
614
|
+
return [_ExonCoord(**r) for r in results]
|
615
|
+
|
616
|
+
async def _get_genomic_aln_coords(
|
610
617
|
self,
|
611
618
|
tx_ac: str,
|
612
|
-
tx_exon_start:
|
613
|
-
tx_exon_end:
|
619
|
+
tx_exon_start: _ExonCoord | None = None,
|
620
|
+
tx_exon_end: _ExonCoord | None = None,
|
614
621
|
gene: str | None = None,
|
615
|
-
) -> tuple[
|
622
|
+
) -> tuple[GenomicAlnData | None, GenomicAlnData | None, str | None]:
|
616
623
|
"""Get aligned genomic coordinates for transcript exon start and end.
|
617
624
|
|
625
|
+
``tx_exon_start`` and ``tx_exon_end`` is expected to reference the same
|
626
|
+
transcript and genomic accession.
|
627
|
+
|
618
628
|
:param tx_ac: Transcript accession
|
619
629
|
:param tx_exon_start: Transcript's exon start coordinates. If not provided,
|
620
630
|
must provide ``tx_exon_end``
|
621
631
|
:param tx_exon_end: Transcript's exon end coordinates. If not provided, must
|
622
632
|
provide ``tx_exon_start``
|
623
633
|
:param gene: HGNC gene symbol
|
624
|
-
:return:
|
634
|
+
:return: Tuple containing aligned genomic data for start and end exon and
|
635
|
+
warnings if found
|
625
636
|
"""
|
626
637
|
if tx_exon_start is None and tx_exon_end is None:
|
627
638
|
msg = "Must provide either `tx_exon_start` or `tx_exon_end` or both"
|
628
639
|
_logger.warning(msg)
|
629
|
-
return None, msg
|
640
|
+
return None, None, msg
|
630
641
|
|
631
|
-
|
642
|
+
aligned_coords = {"start": None, "end": None}
|
632
643
|
for exon, key in [(tx_exon_start, "start"), (tx_exon_end, "end")]:
|
633
644
|
if exon:
|
634
|
-
|
645
|
+
aligned_coord, warning = await self.uta_db.get_alt_ac_start_or_end(
|
635
646
|
tx_ac, exon.tx_start_i, exon.tx_end_i, gene=gene
|
636
647
|
)
|
637
|
-
if
|
638
|
-
|
648
|
+
if aligned_coord:
|
649
|
+
aligned_coords[key] = aligned_coord
|
639
650
|
else:
|
640
|
-
return None, warning
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
651
|
+
return None, None, warning
|
652
|
+
|
653
|
+
return *aligned_coords.values(), None
|
654
|
+
|
655
|
+
def _get_tx_segment(
|
656
|
+
self,
|
657
|
+
genomic_ac: str,
|
658
|
+
strand: Strand,
|
659
|
+
offset: int,
|
660
|
+
genomic_ac_data: _ExonCoord,
|
661
|
+
is_seg_start: bool = False,
|
662
|
+
) -> tuple[TxSegment | None, str | None]:
|
663
|
+
"""Get transcript segment data given ``genomic_ac`` and offset data
|
664
|
+
|
665
|
+
:param genomic_ac: Genomic RefSeq accession
|
666
|
+
:param strand: Strand
|
667
|
+
:param offset: Exon offset
|
668
|
+
:param genomic_ac_data: Exon coordinate data for ``genomic_ac``
|
669
|
+
:param is_seg_start: ``True`` if retrieving genomic data where the transcript
|
670
|
+
segment starts, defaults to ``False``
|
671
|
+
:return: Transcript segment data
|
672
|
+
"""
|
673
|
+
if is_seg_start:
|
674
|
+
if strand == Strand.POSITIVE:
|
675
|
+
seg_genomic_pos = offset + genomic_ac_data.alt_start_i
|
676
|
+
else:
|
677
|
+
seg_genomic_pos = genomic_ac_data.alt_end_i - offset
|
678
|
+
else:
|
679
|
+
if strand == Strand.POSITIVE:
|
680
|
+
seg_genomic_pos = offset + genomic_ac_data.alt_end_i
|
681
|
+
else:
|
682
|
+
seg_genomic_pos = genomic_ac_data.alt_start_i - offset
|
683
|
+
|
684
|
+
genomic_loc, err_msg = self._get_vrs_seq_loc(
|
685
|
+
genomic_ac,
|
686
|
+
seg_genomic_pos,
|
687
|
+
is_seg_start=is_seg_start,
|
688
|
+
strand=strand,
|
689
|
+
)
|
690
|
+
if err_msg:
|
691
|
+
return None, err_msg
|
692
|
+
|
693
|
+
return TxSegment(
|
694
|
+
exon_ord=genomic_ac_data.ord,
|
695
|
+
genomic_location=genomic_loc,
|
696
|
+
offset=offset,
|
697
|
+
), None
|
698
|
+
|
699
|
+
def _get_vrs_seq_loc(
|
700
|
+
self, genomic_ac: str, genomic_pos: int, is_seg_start: bool, strand: Strand
|
701
|
+
) -> tuple[SequenceLocation | None, str | None]:
|
702
|
+
"""Create VRS Sequence Location for genomic position where transcript segment
|
703
|
+
occurs
|
704
|
+
|
705
|
+
:param genomic_ac: RefSeq genomic accession
|
706
|
+
:param genomic_pos: Genomic position where the transcript segment occurs
|
707
|
+
:param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment
|
708
|
+
starts. ``False`` if ``genomic_pos`` is where the transcript segment ends.
|
709
|
+
:param strand: Strand
|
710
|
+
:return: Tuple containing VRS location (if successful) and error message (if
|
711
|
+
unable to get GA4GH identifier for ``genomic_ac``).
|
712
|
+
"""
|
713
|
+
ga4gh_seq_id, err_msg = self.seqrepo_access.translate_identifier(
|
714
|
+
genomic_ac, "ga4gh"
|
715
|
+
)
|
716
|
+
if err_msg:
|
717
|
+
return None, err_msg
|
718
|
+
|
719
|
+
use_start = (
|
720
|
+
strand == Strand.POSITIVE if is_seg_start else strand != Strand.POSITIVE
|
721
|
+
)
|
722
|
+
|
723
|
+
return SequenceLocation(
|
724
|
+
sequenceReference=SequenceReference(
|
725
|
+
refgetAccession=ga4gh_seq_id[0].split("ga4gh:")[-1]
|
726
|
+
),
|
727
|
+
start=genomic_pos if use_start else None,
|
728
|
+
end=genomic_pos if not use_start else None,
|
729
|
+
), None
|
659
730
|
|
660
731
|
async def _genomic_to_tx_segment(
|
661
732
|
self,
|
@@ -665,7 +736,7 @@ class ExonGenomicCoordsMapper:
|
|
665
736
|
transcript: str | None = None,
|
666
737
|
gene: str | None = None,
|
667
738
|
get_nearest_transcript_junction: bool = False,
|
668
|
-
|
739
|
+
is_seg_start: bool = True,
|
669
740
|
) -> GenomicTxSeg:
|
670
741
|
"""Given genomic data, generate a boundary for a transcript segment.
|
671
742
|
|
@@ -692,17 +763,17 @@ class ExonGenomicCoordsMapper:
|
|
692
763
|
following the breakpoint for the 3' end. For the negative strand, adjacent
|
693
764
|
is defined as the exon following the breakpoint for the 5' end and the exon
|
694
765
|
preceding the breakpoint for the 3' end.
|
695
|
-
:param
|
766
|
+
:param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment starts.
|
696
767
|
``False`` if ``genomic_pos`` is where the transcript segment ends.
|
697
768
|
:return: Data for a transcript segment boundary (inter-residue coordinates)
|
698
769
|
"""
|
699
770
|
params = {key: None for key in GenomicTxSeg.model_fields}
|
700
771
|
|
701
772
|
if get_nearest_transcript_junction:
|
702
|
-
if not gene:
|
773
|
+
if not gene and not transcript:
|
703
774
|
return GenomicTxSeg(
|
704
775
|
errors=[
|
705
|
-
"`gene` must be provided to select the adjacent transcript junction"
|
776
|
+
"`gene` or `transcript` must be provided to select the adjacent transcript junction"
|
706
777
|
]
|
707
778
|
)
|
708
779
|
|
@@ -773,8 +844,8 @@ class ExonGenomicCoordsMapper:
|
|
773
844
|
exon_num = self._get_adjacent_exon(
|
774
845
|
tx_exons_genomic_coords=tx_exons,
|
775
846
|
strand=strand,
|
776
|
-
start=genomic_pos if
|
777
|
-
end=genomic_pos if not
|
847
|
+
start=genomic_pos if is_seg_start else None,
|
848
|
+
end=genomic_pos if not is_seg_start else None,
|
778
849
|
)
|
779
850
|
|
780
851
|
offset = self._get_exon_offset(
|
@@ -782,19 +853,26 @@ class ExonGenomicCoordsMapper:
|
|
782
853
|
end_i=tx_exons[exon_num].alt_end_i,
|
783
854
|
strand=strand,
|
784
855
|
use_start_i=strand == Strand.POSITIVE
|
785
|
-
if
|
856
|
+
if is_seg_start
|
786
857
|
else strand != Strand.POSITIVE,
|
787
858
|
is_in_exon=False,
|
788
|
-
start=genomic_pos if
|
789
|
-
end=genomic_pos if not
|
859
|
+
start=genomic_pos if is_seg_start else None,
|
860
|
+
end=genomic_pos if not is_seg_start else None,
|
790
861
|
)
|
791
862
|
|
792
863
|
genomic_location, err_msg = self._get_vrs_seq_loc(
|
793
|
-
genomic_ac, genomic_pos,
|
864
|
+
genomic_ac, genomic_pos, is_seg_start, strand
|
794
865
|
)
|
795
866
|
if err_msg:
|
796
867
|
return GenomicTxSeg(errors=[err_msg])
|
797
868
|
|
869
|
+
# gene is not required to liftover coordinates if tx_ac and genomic_ac are given, but we should set the associated gene
|
870
|
+
if not gene:
|
871
|
+
_gene, err_msg = await self._get_tx_ac_gene(transcript)
|
872
|
+
if err_msg:
|
873
|
+
return GenomicTxSeg(errors=[err_msg])
|
874
|
+
gene = _gene
|
875
|
+
|
798
876
|
return GenomicTxSeg(
|
799
877
|
gene=gene,
|
800
878
|
genomic_ac=genomic_ac,
|
@@ -807,20 +885,17 @@ class ExonGenomicCoordsMapper:
|
|
807
885
|
)
|
808
886
|
|
809
887
|
if genomic_ac:
|
810
|
-
# Check if valid accession is given
|
811
|
-
if not await self.uta_db.validate_genomic_ac(genomic_ac):
|
812
|
-
return GenomicTxSeg(errors=[f"Invalid genomic accession: {genomic_ac}"])
|
813
|
-
|
814
888
|
_gene, err_msg = await self._get_genomic_ac_gene(genomic_pos, genomic_ac)
|
815
|
-
if _gene:
|
816
|
-
if gene and _gene != gene:
|
817
|
-
return GenomicTxSeg(
|
818
|
-
errors=[f"Expected gene, {gene}, but found {_gene}"]
|
819
|
-
)
|
820
889
|
|
821
|
-
|
822
|
-
else:
|
890
|
+
if err_msg:
|
823
891
|
return GenomicTxSeg(errors=[err_msg])
|
892
|
+
|
893
|
+
if gene and _gene != gene:
|
894
|
+
return GenomicTxSeg(
|
895
|
+
errors=[f"Expected gene, {gene}, but found {_gene}"]
|
896
|
+
)
|
897
|
+
|
898
|
+
gene = _gene
|
824
899
|
elif chromosome:
|
825
900
|
# Try GRCh38 first
|
826
901
|
for assembly in [Assembly.GRCH38.value, Assembly.GRCH37.value]:
|
@@ -858,7 +933,7 @@ class ExonGenomicCoordsMapper:
|
|
858
933
|
)
|
859
934
|
|
860
935
|
return await self._get_tx_seg_genomic_metadata(
|
861
|
-
genomic_ac, genomic_pos,
|
936
|
+
genomic_ac, genomic_pos, is_seg_start, gene, tx_ac=transcript
|
862
937
|
)
|
863
938
|
|
864
939
|
async def _get_grch38_ac_pos(
|
@@ -943,85 +1018,37 @@ class ExonGenomicCoordsMapper:
|
|
943
1018
|
|
944
1019
|
return results[0]["hgnc"], None
|
945
1020
|
|
946
|
-
def
|
1021
|
+
async def _get_tx_ac_gene(
|
947
1022
|
self,
|
948
|
-
|
949
|
-
|
950
|
-
|
951
|
-
genomic_ac_data: ExonCoord,
|
952
|
-
is_seg_start: bool = False,
|
953
|
-
) -> tuple[TxSegment | None, str | None]:
|
954
|
-
"""Get transcript segment data given ``genomic_ac`` and offset data
|
955
|
-
|
956
|
-
:param genomic_ac: Genomic RefSeq accession
|
957
|
-
:param strand: Strand
|
958
|
-
:param offset: Exon offset
|
959
|
-
:param genomic_ac_data: Exon coordinate data for ``genomic_ac``
|
960
|
-
:param is_seg_start: ``True`` if retrieving genomic data where the transcript
|
961
|
-
segment starts, defaults to ``False``
|
962
|
-
:return: Transcript segment data
|
963
|
-
"""
|
964
|
-
if is_seg_start:
|
965
|
-
if strand == Strand.POSITIVE:
|
966
|
-
seg_genomic_pos = offset + genomic_ac_data.alt_start_i
|
967
|
-
else:
|
968
|
-
seg_genomic_pos = genomic_ac_data.alt_end_i - offset
|
969
|
-
else:
|
970
|
-
if strand == Strand.POSITIVE:
|
971
|
-
seg_genomic_pos = offset + genomic_ac_data.alt_end_i
|
972
|
-
else:
|
973
|
-
seg_genomic_pos = genomic_ac_data.alt_start_i - offset
|
974
|
-
|
975
|
-
genomic_loc, err_msg = self._get_vrs_seq_loc(
|
976
|
-
genomic_ac,
|
977
|
-
seg_genomic_pos,
|
978
|
-
is_start=is_seg_start,
|
979
|
-
strand=strand,
|
980
|
-
)
|
981
|
-
if err_msg:
|
982
|
-
return None, err_msg
|
983
|
-
|
984
|
-
return TxSegment(
|
985
|
-
exon_ord=genomic_ac_data.ord,
|
986
|
-
genomic_location=genomic_loc,
|
987
|
-
offset=offset,
|
988
|
-
), None
|
1023
|
+
tx_ac: str,
|
1024
|
+
) -> tuple[str | None, str | None]:
|
1025
|
+
"""Get gene given a transcript.
|
989
1026
|
|
990
|
-
|
991
|
-
|
992
|
-
) -> tuple[SequenceLocation | None, str | None]:
|
993
|
-
"""Create VRS Sequence Location for genomic position where transcript segment
|
994
|
-
occurs
|
1027
|
+
If multiple genes are found for a given ``tx_ac``, only one
|
1028
|
+
gene will be returned.
|
995
1029
|
|
996
|
-
:param
|
997
|
-
:
|
998
|
-
|
999
|
-
starts. ``False`` if ``genomic_pos`` is where the transcript segment ends.
|
1000
|
-
:param strand: Strand
|
1001
|
-
:return: Tuple containing VRS location (if successful) and error message (if
|
1002
|
-
unable to get GA4GH identifier for ``genomic_ac``).
|
1030
|
+
:param tx_ac: RefSeq transcript, e.g. ``"NM_004333.6"``
|
1031
|
+
:return: HGNC gene symbol associated to transcript and
|
1032
|
+
warning
|
1003
1033
|
"""
|
1004
|
-
|
1005
|
-
|
1006
|
-
|
1007
|
-
|
1008
|
-
|
1009
|
-
|
1010
|
-
|
1034
|
+
query = f"""
|
1035
|
+
SELECT DISTINCT hgnc
|
1036
|
+
FROM {self.uta_db.schema}.tx_exon_aln_v
|
1037
|
+
WHERE tx_ac = '{tx_ac}'
|
1038
|
+
ORDER BY hgnc
|
1039
|
+
LIMIT 1;
|
1040
|
+
""" # noqa: S608
|
1041
|
+
results = await self.uta_db.execute_query(query)
|
1042
|
+
if not results:
|
1043
|
+
return None, f"No gene(s) found given {tx_ac}"
|
1011
1044
|
|
1012
|
-
return
|
1013
|
-
sequenceReference=SequenceReference(
|
1014
|
-
refgetAccession=ga4gh_seq_id[0].split("ga4gh:")[-1]
|
1015
|
-
),
|
1016
|
-
start=genomic_pos if use_start else None,
|
1017
|
-
end=genomic_pos if not use_start else None,
|
1018
|
-
), None
|
1045
|
+
return results[0]["hgnc"], None
|
1019
1046
|
|
1020
1047
|
async def _get_tx_seg_genomic_metadata(
|
1021
1048
|
self,
|
1022
1049
|
genomic_ac: str,
|
1023
1050
|
genomic_pos: int,
|
1024
|
-
|
1051
|
+
is_seg_start: bool,
|
1025
1052
|
gene: str,
|
1026
1053
|
tx_ac: str | None,
|
1027
1054
|
) -> GenomicTxSeg:
|
@@ -1034,7 +1061,7 @@ class ExonGenomicCoordsMapper:
|
|
1034
1061
|
|
1035
1062
|
:param genomic_ac: Genomic RefSeq accession
|
1036
1063
|
:param genomic_pos: Genomic position where the transcript segment occurs
|
1037
|
-
:param
|
1064
|
+
:param is_seg_start: Whether or not ``genomic_pos`` represents the start position.
|
1038
1065
|
:param gene: HGNC gene symbol
|
1039
1066
|
:param tx_ac: Transcript RefSeq accession. If not provided, will use MANE
|
1040
1067
|
transcript
|
@@ -1092,12 +1119,12 @@ class ExonGenomicCoordsMapper:
|
|
1092
1119
|
strand=Strand(tx_exon_aln_data.alt_strand),
|
1093
1120
|
use_start_i=False, # This doesn't impact anything since we're on the exon
|
1094
1121
|
is_in_exon=True,
|
1095
|
-
start=genomic_pos if
|
1096
|
-
end=genomic_pos if not
|
1122
|
+
start=genomic_pos if is_seg_start else None,
|
1123
|
+
end=genomic_pos if not is_seg_start else None,
|
1097
1124
|
)
|
1098
1125
|
|
1099
1126
|
genomic_location, err_msg = self._get_vrs_seq_loc(
|
1100
|
-
genomic_ac, genomic_pos,
|
1127
|
+
genomic_ac, genomic_pos, is_seg_start, tx_exon_aln_data.alt_strand
|
1101
1128
|
)
|
1102
1129
|
if err_msg:
|
1103
1130
|
return GenomicTxSeg(errors=[err_msg])
|
@@ -1114,46 +1141,20 @@ class ExonGenomicCoordsMapper:
|
|
1114
1141
|
)
|
1115
1142
|
|
1116
1143
|
@staticmethod
|
1117
|
-
def
|
1118
|
-
|
1119
|
-
end_i: int,
|
1120
|
-
strand: Strand,
|
1121
|
-
use_start_i: bool = True,
|
1122
|
-
is_in_exon: bool = True,
|
1123
|
-
start: int | None = None,
|
1124
|
-
end: int | None = None,
|
1125
|
-
) -> int:
|
1126
|
-
"""Compute offset from exon start or end index
|
1144
|
+
def _is_exonic_breakpoint(pos: int, tx_genomic_coords: list[_ExonCoord]) -> bool:
|
1145
|
+
"""Check if a breakpoint occurs on an exon
|
1127
1146
|
|
1128
|
-
:param
|
1129
|
-
:param
|
1130
|
-
:
|
1131
|
-
:param use_start_i: Whether or not ``start_i`` should be used to compute the
|
1132
|
-
offset, defaults to ``True``. This is only used when ``is_in_exon`` is
|
1133
|
-
``False``.
|
1134
|
-
:param is_in_exon: Whether or not the position occurs in an exon, defaults to
|
1135
|
-
``True``
|
1136
|
-
:param start: Provided start position, defaults to ``None``. Must provide
|
1137
|
-
``start`` or ``end``, not both.
|
1138
|
-
:param end: Provided end position, defaults to ``None``. Must provide ``start``
|
1139
|
-
or ``end``, not both
|
1140
|
-
:return: Offset from exon start or end index
|
1147
|
+
:param pos: Genomic breakpoint
|
1148
|
+
:param tx_genomic_coords: A list of transcript exon coordinate data
|
1149
|
+
:return: ``True`` if the breakpoint occurs on an exon
|
1141
1150
|
"""
|
1142
|
-
|
1143
|
-
|
1144
|
-
|
1145
|
-
else:
|
1146
|
-
offset = end - end_i if strand == Strand.POSITIVE else start_i - end
|
1147
|
-
else:
|
1148
|
-
if strand == Strand.POSITIVE:
|
1149
|
-
offset = start - start_i if use_start_i else end - end_i
|
1150
|
-
else:
|
1151
|
-
offset = start_i - end if use_start_i else end_i - start
|
1152
|
-
return offset
|
1151
|
+
return any(
|
1152
|
+
exon.alt_start_i <= pos <= exon.alt_end_i for exon in tx_genomic_coords
|
1153
|
+
)
|
1153
1154
|
|
1154
1155
|
@staticmethod
|
1155
1156
|
def _get_adjacent_exon(
|
1156
|
-
tx_exons_genomic_coords: list[
|
1157
|
+
tx_exons_genomic_coords: list[_ExonCoord],
|
1157
1158
|
strand: Strand,
|
1158
1159
|
start: int | None = None,
|
1159
1160
|
end: int | None = None,
|
@@ -1191,13 +1192,39 @@ class ExonGenomicCoordsMapper:
|
|
1191
1192
|
return exon.ord if end else exon.ord + 1
|
1192
1193
|
|
1193
1194
|
@staticmethod
|
1194
|
-
def
|
1195
|
-
|
1195
|
+
def _get_exon_offset(
|
1196
|
+
start_i: int,
|
1197
|
+
end_i: int,
|
1198
|
+
strand: Strand,
|
1199
|
+
use_start_i: bool = True,
|
1200
|
+
is_in_exon: bool = True,
|
1201
|
+
start: int | None = None,
|
1202
|
+
end: int | None = None,
|
1203
|
+
) -> int:
|
1204
|
+
"""Compute offset from exon start or end index
|
1196
1205
|
|
1197
|
-
:param
|
1198
|
-
:param
|
1199
|
-
:
|
1206
|
+
:param start_i: Exon start index (inter-residue)
|
1207
|
+
:param end_i: Exon end index (inter-residue)
|
1208
|
+
:param strand: Strand
|
1209
|
+
:param use_start_i: Whether or not ``start_i`` should be used to compute the
|
1210
|
+
offset, defaults to ``True``. This is only used when ``is_in_exon`` is
|
1211
|
+
``False``.
|
1212
|
+
:param is_in_exon: Whether or not the position occurs in an exon, defaults to
|
1213
|
+
``True``
|
1214
|
+
:param start: Provided start position, defaults to ``None``. Must provide
|
1215
|
+
``start`` or ``end``, not both.
|
1216
|
+
:param end: Provided end position, defaults to ``None``. Must provide ``start``
|
1217
|
+
or ``end``, not both
|
1218
|
+
:return: Offset from exon start or end index
|
1200
1219
|
"""
|
1201
|
-
|
1202
|
-
|
1203
|
-
|
1220
|
+
if is_in_exon:
|
1221
|
+
if start is not None:
|
1222
|
+
offset = start - start_i if strand == Strand.POSITIVE else end_i - start
|
1223
|
+
else:
|
1224
|
+
offset = end - end_i if strand == Strand.POSITIVE else start_i - end
|
1225
|
+
else:
|
1226
|
+
if strand == Strand.POSITIVE:
|
1227
|
+
offset = start - start_i if use_start_i else end - end_i
|
1228
|
+
else:
|
1229
|
+
offset = start_i - end if use_start_i else end_i - start
|
1230
|
+
return offset
|
@@ -6,7 +6,7 @@ cool_seq_tool/handlers/__init__.py,sha256=KalQ46vX1MO4SJz2SlspKoIRy1n3c3Vp1t4Y2p
|
|
6
6
|
cool_seq_tool/handlers/seqrepo_access.py,sha256=Jd19jbdUvPRPn_XWozL67ph-nSIxpb4_UUimapDrsm4,9162
|
7
7
|
cool_seq_tool/mappers/__init__.py,sha256=O0JRxNFk8nWxD4v5ij47xelhvfVLdEXS43l2tzRuiUE,305
|
8
8
|
cool_seq_tool/mappers/alignment.py,sha256=nV6PS3mhkQ2MD1GcpNBujBOqd3AKxYSYA9BCusFOa1o,9636
|
9
|
-
cool_seq_tool/mappers/exon_genomic_coords.py,sha256=
|
9
|
+
cool_seq_tool/mappers/exon_genomic_coords.py,sha256=lfmzuVXaYT7w2FBDS3xhJNgETusllomFy5Utzhfhlpc,48782
|
10
10
|
cool_seq_tool/mappers/liftover.py,sha256=lltx9zxfkrb5PHtJlKp3a39JCwPP4e0Zft-mQc1jXL8,3367
|
11
11
|
cool_seq_tool/mappers/mane_transcript.py,sha256=nirxlf3EGVInFYG4fsAqiEmDdTc_h1XuPyX2ul-a7Rk,54368
|
12
12
|
cool_seq_tool/resources/__init__.py,sha256=VwUC8YaucTS6SmRirToulZTF6CuvuLQRSxFfSfAovCc,77
|
@@ -17,8 +17,8 @@ cool_seq_tool/sources/__init__.py,sha256=51QiymeptF7AeVGgV-tW_9f4pIUr0xtYbyzpvHO
|
|
17
17
|
cool_seq_tool/sources/mane_transcript_mappings.py,sha256=E_pj7FEBcB6HUR8yhSVibB0beMMlKJ62pK0qvl4y5nw,5358
|
18
18
|
cool_seq_tool/sources/transcript_mappings.py,sha256=903RKTMBO2rbKh6iTQ1BEWnY4C7saBFMPw2_4ATuudg,10054
|
19
19
|
cool_seq_tool/sources/uta_database.py,sha256=gc5wsKOIhvzhwFmPmqOY0hhaVfRkRSzYNa9tpBt81_U,35017
|
20
|
-
cool_seq_tool-0.7.
|
21
|
-
cool_seq_tool-0.7.
|
22
|
-
cool_seq_tool-0.7.
|
23
|
-
cool_seq_tool-0.7.
|
24
|
-
cool_seq_tool-0.7.
|
20
|
+
cool_seq_tool-0.7.1.dist-info/LICENSE,sha256=IpqC9A-tZW7XXXvCS8c4AVINqkmpxiVA-34Qe3CZSjo,1072
|
21
|
+
cool_seq_tool-0.7.1.dist-info/METADATA,sha256=Y9_RZI2iHpmNOFwXoFCCKyHs6aXmNrzKQfyHkmqUVmQ,6226
|
22
|
+
cool_seq_tool-0.7.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
23
|
+
cool_seq_tool-0.7.1.dist-info/top_level.txt,sha256=cGuxdN6p3y16jQf6hCwWhE4OptwUeZPm_PNJlPb3b0k,14
|
24
|
+
cool_seq_tool-0.7.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|