cool-seq-tool 0.7.0__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,13 +14,13 @@ from cool_seq_tool.schemas import (
14
14
  Strand,
15
15
  )
16
16
  from cool_seq_tool.sources.mane_transcript_mappings import ManeTranscriptMappings
17
- from cool_seq_tool.sources.uta_database import UtaDatabase
17
+ from cool_seq_tool.sources.uta_database import GenomicAlnData, UtaDatabase
18
18
  from cool_seq_tool.utils import service_meta
19
19
 
20
20
  _logger = logging.getLogger(__name__)
21
21
 
22
22
 
23
- class ExonCoord(BaseModelForbidExtra):
23
+ class _ExonCoord(BaseModelForbidExtra):
24
24
  """Model for representing exon coordinate data"""
25
25
 
26
26
  ord: StrictInt = Field(..., description="Exon number. 0-based.")
@@ -97,19 +97,18 @@ class GenomicTxSeg(BaseModelForbidExtra):
97
97
  """Ensure that fields are (un)set depending on errors
98
98
 
99
99
  :param values: Values in model
100
- :raises ValueError: If `seg`, `gene`, `genomic_ac` and `tx_ac` are not
100
+ :raises ValueError: If `seg`, `genomic_ac` and `tx_ac` are not
101
101
  provided when there are no errors
102
102
  :return: Values in model
103
103
  """
104
104
  if not values.get("errors") and not all(
105
105
  (
106
106
  values.get("seg"),
107
- values.get("gene"),
108
107
  values.get("genomic_ac"),
109
108
  values.get("tx_ac"),
110
109
  )
111
110
  ):
112
- err_msg = "`seg`, `gene`, `genomic_ac` and `tx_ac` must be provided"
111
+ err_msg = "`seg`, `genomic_ac` and `tx_ac` must be provided"
113
112
  raise ValueError(err_msg)
114
113
  return values
115
114
 
@@ -154,20 +153,21 @@ class GenomicTxSegService(BaseModelForbidExtra):
154
153
  on errors
155
154
 
156
155
  :param values: Values in model
157
- :raises ValueError: If `gene`, `genomic_ac`, `tx_ac` and `seg_start` or `seg_end`
156
+ :raises ValueError: If `genomic_ac`, `tx_ac` and `seg_start` or `seg_end`
158
157
  not provided when there are no errors
159
158
  :return: Values in model, including service metadata
160
159
  """
161
160
  values["service_meta"] = service_meta()
162
161
  if not values.get("errors") and not all(
163
162
  (
164
- values.get("gene"),
165
163
  values.get("genomic_ac"),
166
164
  values.get("tx_ac"),
167
165
  values.get("seg_start") or values.get("seg_end"),
168
166
  )
169
167
  ):
170
- err_msg = "`gene`, `genomic_ac`, `tx_ac` and `seg_start` or `seg_end` must be provided"
168
+ err_msg = (
169
+ "`genomic_ac`, `tx_ac` and `seg_start` or `seg_end` must be provided"
170
+ )
171
171
  raise ValueError(err_msg)
172
172
 
173
173
  return values
@@ -340,17 +340,20 @@ class ExonGenomicCoordsMapper:
340
340
 
341
341
  # Get aligned genomic data (hgnc gene, alt_ac, alt_start_i, alt_end_i, strand)
342
342
  # for exon(s)
343
- alt_ac_start_end, err_msg = await self._get_alt_ac_start_and_end(
343
+ (
344
+ genomic_aln_start,
345
+ genomic_aln_end,
346
+ err_msg,
347
+ ) = await self._get_genomic_aln_coords(
344
348
  transcript, tx_exon_start_coords, tx_exon_end_coords, gene=gene
345
349
  )
346
- if not alt_ac_start_end:
347
- return _return_service_errors([err_msg] if err_msg else [])
348
- alt_ac_start_data, alt_ac_end_data = alt_ac_start_end
350
+ if err_msg:
351
+ return _return_service_errors([err_msg])
349
352
 
350
353
  # Get gene and chromosome data, check that at least one was retrieved
351
- gene = alt_ac_start_data.hgnc if alt_ac_start_data else alt_ac_end_data.hgnc
354
+ gene = genomic_aln_start.hgnc if genomic_aln_start else genomic_aln_end.hgnc
352
355
  genomic_ac = (
353
- alt_ac_start_data.alt_ac if alt_ac_start_data else alt_ac_end_data.alt_ac
356
+ genomic_aln_start.alt_ac if genomic_aln_start else genomic_aln_end.alt_ac
354
357
  )
355
358
  if gene is None or genomic_ac is None:
356
359
  return _return_service_errors(
@@ -360,9 +363,9 @@ class ExonGenomicCoordsMapper:
360
363
  )
361
364
 
362
365
  strand = (
363
- Strand(alt_ac_start_data.alt_strand)
364
- if alt_ac_start_data
365
- else Strand(alt_ac_end_data.alt_strand)
366
+ Strand(genomic_aln_start.alt_strand)
367
+ if genomic_aln_start
368
+ else Strand(genomic_aln_end.alt_strand)
366
369
  )
367
370
 
368
371
  if exon_start_exists:
@@ -370,7 +373,7 @@ class ExonGenomicCoordsMapper:
370
373
  genomic_ac,
371
374
  strand,
372
375
  exon_start_offset,
373
- alt_ac_start_data,
376
+ genomic_aln_start,
374
377
  is_seg_start=True,
375
378
  )
376
379
  if err_msg:
@@ -380,7 +383,11 @@ class ExonGenomicCoordsMapper:
380
383
 
381
384
  if exon_end_exists:
382
385
  seg_end, err_msg = self._get_tx_segment(
383
- genomic_ac, strand, exon_end_offset, alt_ac_end_data, is_seg_start=False
386
+ genomic_ac,
387
+ strand,
388
+ exon_end_offset,
389
+ genomic_aln_end,
390
+ is_seg_start=False,
384
391
  )
385
392
  if err_msg:
386
393
  return _return_service_errors([err_msg])
@@ -479,7 +486,7 @@ class ExonGenomicCoordsMapper:
479
486
  transcript=transcript,
480
487
  gene=gene,
481
488
  get_nearest_transcript_junction=get_nearest_transcript_junction,
482
- is_start=True,
489
+ is_seg_start=True,
483
490
  )
484
491
  if start_tx_seg_data.errors:
485
492
  return _return_service_errors(start_tx_seg_data.errors)
@@ -499,7 +506,7 @@ class ExonGenomicCoordsMapper:
499
506
  transcript=transcript,
500
507
  gene=gene,
501
508
  get_nearest_transcript_junction=get_nearest_transcript_junction,
502
- is_start=False,
509
+ is_seg_start=False,
503
510
  )
504
511
  if end_tx_seg_data.errors:
505
512
  return _return_service_errors(end_tx_seg_data.errors)
@@ -525,53 +532,13 @@ class ExonGenomicCoordsMapper:
525
532
 
526
533
  return GenomicTxSegService(**params)
527
534
 
528
- async def _get_all_exon_coords(
529
- self, tx_ac: str, genomic_ac: str | None = None
530
- ) -> list[ExonCoord]:
531
- """Get all exon coordinate data for a transcript.
532
-
533
- If ``genomic_ac`` is NOT provided, this method will use the GRCh38 accession
534
- associated to ``tx_ac``.
535
-
536
- :param tx_ac: The RefSeq transcript accession to get exon data for.
537
- :param genomic_ac: The RefSeq genomic accession to get exon data for.
538
- :return: List of all exon coordinate data for ``tx_ac`` and ``genomic_ac``.
539
- The exon coordinate data will include the exon number, transcript and
540
- genomic positions for the start and end of the exon, and strand.
541
- The list will be ordered by ascending exon number.
542
- """
543
- if genomic_ac:
544
- query = f"""
545
- SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
546
- FROM {self.uta_db.schema}.tx_exon_aln_v
547
- WHERE tx_ac = '{tx_ac}'
548
- AND alt_aln_method = 'splign'
549
- AND alt_ac = '{genomic_ac}'
550
- ORDER BY ord ASC
551
- """ # noqa: S608
552
- else:
553
- query = f"""
554
- SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
555
- FROM {self.uta_db.schema}.tx_exon_aln_v as t
556
- INNER JOIN {self.uta_db.schema}._seq_anno_most_recent as s
557
- ON t.alt_ac = s.ac
558
- WHERE s.descr = ''
559
- AND t.tx_ac = '{tx_ac}'
560
- AND t.alt_aln_method = 'splign'
561
- AND t.alt_ac like 'NC_000%'
562
- ORDER BY ord ASC
563
- """ # noqa: S608
564
-
565
- results = await self.uta_db.execute_query(query)
566
- return [ExonCoord(**r) for r in results]
567
-
568
535
  async def _get_start_end_exon_coords(
569
536
  self,
570
537
  tx_ac: str,
571
538
  exon_start: int | None = None,
572
539
  exon_end: int | None = None,
573
540
  genomic_ac: str | None = None,
574
- ) -> tuple[ExonCoord | None, ExonCoord | None, list[str]]:
541
+ ) -> tuple[_ExonCoord | None, _ExonCoord | None, list[str]]:
575
542
  """Get exon coordinates for a transcript given exon start and exon end.
576
543
 
577
544
  If ``genomic_ac`` is NOT provided, this method will use the GRCh38 accession
@@ -606,56 +573,160 @@ class ExonGenomicCoordsMapper:
606
573
 
607
574
  return *start_end_exons, errors
608
575
 
609
- async def _get_alt_ac_start_and_end(
576
+ async def _get_all_exon_coords(
577
+ self, tx_ac: str, genomic_ac: str | None = None
578
+ ) -> list[_ExonCoord]:
579
+ """Get all exon coordinate data for a transcript.
580
+
581
+ If ``genomic_ac`` is NOT provided, this method will use the GRCh38 accession
582
+ associated to ``tx_ac``.
583
+
584
+ :param tx_ac: The RefSeq transcript accession to get exon data for.
585
+ :param genomic_ac: The RefSeq genomic accession to get exon data for.
586
+ :return: List of all exon coordinate data for ``tx_ac`` and ``genomic_ac``.
587
+ The exon coordinate data will include the exon number, transcript and
588
+ genomic positions for the start and end of the exon, and strand.
589
+ The list will be ordered by ascending exon number.
590
+ """
591
+ if genomic_ac:
592
+ query = f"""
593
+ SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
594
+ FROM {self.uta_db.schema}.tx_exon_aln_v
595
+ WHERE tx_ac = '{tx_ac}'
596
+ AND alt_aln_method = 'splign'
597
+ AND alt_ac = '{genomic_ac}'
598
+ ORDER BY ord ASC
599
+ """ # noqa: S608
600
+ else:
601
+ query = f"""
602
+ SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
603
+ FROM {self.uta_db.schema}.tx_exon_aln_v as t
604
+ INNER JOIN {self.uta_db.schema}._seq_anno_most_recent as s
605
+ ON t.alt_ac = s.ac
606
+ WHERE s.descr = ''
607
+ AND t.tx_ac = '{tx_ac}'
608
+ AND t.alt_aln_method = 'splign'
609
+ AND t.alt_ac like 'NC_000%'
610
+ ORDER BY ord ASC
611
+ """ # noqa: S608
612
+
613
+ results = await self.uta_db.execute_query(query)
614
+ return [_ExonCoord(**r) for r in results]
615
+
616
+ async def _get_genomic_aln_coords(
610
617
  self,
611
618
  tx_ac: str,
612
- tx_exon_start: ExonCoord | None = None,
613
- tx_exon_end: ExonCoord | None = None,
619
+ tx_exon_start: _ExonCoord | None = None,
620
+ tx_exon_end: _ExonCoord | None = None,
614
621
  gene: str | None = None,
615
- ) -> tuple[tuple[tuple[int, int], tuple[int, int]] | None, str | None]:
622
+ ) -> tuple[GenomicAlnData | None, GenomicAlnData | None, str | None]:
616
623
  """Get aligned genomic coordinates for transcript exon start and end.
617
624
 
625
+ ``tx_exon_start`` and ``tx_exon_end`` is expected to reference the same
626
+ transcript and genomic accession.
627
+
618
628
  :param tx_ac: Transcript accession
619
629
  :param tx_exon_start: Transcript's exon start coordinates. If not provided,
620
630
  must provide ``tx_exon_end``
621
631
  :param tx_exon_end: Transcript's exon end coordinates. If not provided, must
622
632
  provide ``tx_exon_start``
623
633
  :param gene: HGNC gene symbol
624
- :return: Aligned genomic data, and warnings if found
634
+ :return: Tuple containing aligned genomic data for start and end exon and
635
+ warnings if found
625
636
  """
626
637
  if tx_exon_start is None and tx_exon_end is None:
627
638
  msg = "Must provide either `tx_exon_start` or `tx_exon_end` or both"
628
639
  _logger.warning(msg)
629
- return None, msg
640
+ return None, None, msg
630
641
 
631
- alt_ac_data = {"start": None, "end": None}
642
+ aligned_coords = {"start": None, "end": None}
632
643
  for exon, key in [(tx_exon_start, "start"), (tx_exon_end, "end")]:
633
644
  if exon:
634
- alt_ac_val, warning = await self.uta_db.get_alt_ac_start_or_end(
645
+ aligned_coord, warning = await self.uta_db.get_alt_ac_start_or_end(
635
646
  tx_ac, exon.tx_start_i, exon.tx_end_i, gene=gene
636
647
  )
637
- if alt_ac_val:
638
- alt_ac_data[key] = alt_ac_val
648
+ if aligned_coord:
649
+ aligned_coords[key] = aligned_coord
639
650
  else:
640
- return None, warning
641
-
642
- alt_ac_data_values = alt_ac_data.values()
643
- # Validate that start and end alignments have matching gene, genomic accession,
644
- # and strand
645
- if all(alt_ac_data_values):
646
- for attr in ["hgnc", "alt_ac", "alt_strand"]:
647
- start_attr = getattr(alt_ac_data["start"], attr)
648
- end_attr = getattr(alt_ac_data["end"], attr)
649
- if start_attr != end_attr:
650
- error = f"{attr} mismatch. {start_attr} != {end_attr}."
651
- _logger.warning(
652
- "%s: %s != %s",
653
- error,
654
- start_attr,
655
- end_attr,
656
- )
657
- return None, error
658
- return tuple(alt_ac_data_values), None
651
+ return None, None, warning
652
+
653
+ return *aligned_coords.values(), None
654
+
655
+ def _get_tx_segment(
656
+ self,
657
+ genomic_ac: str,
658
+ strand: Strand,
659
+ offset: int,
660
+ genomic_ac_data: _ExonCoord,
661
+ is_seg_start: bool = False,
662
+ ) -> tuple[TxSegment | None, str | None]:
663
+ """Get transcript segment data given ``genomic_ac`` and offset data
664
+
665
+ :param genomic_ac: Genomic RefSeq accession
666
+ :param strand: Strand
667
+ :param offset: Exon offset
668
+ :param genomic_ac_data: Exon coordinate data for ``genomic_ac``
669
+ :param is_seg_start: ``True`` if retrieving genomic data where the transcript
670
+ segment starts, defaults to ``False``
671
+ :return: Transcript segment data
672
+ """
673
+ if is_seg_start:
674
+ if strand == Strand.POSITIVE:
675
+ seg_genomic_pos = offset + genomic_ac_data.alt_start_i
676
+ else:
677
+ seg_genomic_pos = genomic_ac_data.alt_end_i - offset
678
+ else:
679
+ if strand == Strand.POSITIVE:
680
+ seg_genomic_pos = offset + genomic_ac_data.alt_end_i
681
+ else:
682
+ seg_genomic_pos = genomic_ac_data.alt_start_i - offset
683
+
684
+ genomic_loc, err_msg = self._get_vrs_seq_loc(
685
+ genomic_ac,
686
+ seg_genomic_pos,
687
+ is_seg_start=is_seg_start,
688
+ strand=strand,
689
+ )
690
+ if err_msg:
691
+ return None, err_msg
692
+
693
+ return TxSegment(
694
+ exon_ord=genomic_ac_data.ord,
695
+ genomic_location=genomic_loc,
696
+ offset=offset,
697
+ ), None
698
+
699
+ def _get_vrs_seq_loc(
700
+ self, genomic_ac: str, genomic_pos: int, is_seg_start: bool, strand: Strand
701
+ ) -> tuple[SequenceLocation | None, str | None]:
702
+ """Create VRS Sequence Location for genomic position where transcript segment
703
+ occurs
704
+
705
+ :param genomic_ac: RefSeq genomic accession
706
+ :param genomic_pos: Genomic position where the transcript segment occurs
707
+ :param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment
708
+ starts. ``False`` if ``genomic_pos`` is where the transcript segment ends.
709
+ :param strand: Strand
710
+ :return: Tuple containing VRS location (if successful) and error message (if
711
+ unable to get GA4GH identifier for ``genomic_ac``).
712
+ """
713
+ ga4gh_seq_id, err_msg = self.seqrepo_access.translate_identifier(
714
+ genomic_ac, "ga4gh"
715
+ )
716
+ if err_msg:
717
+ return None, err_msg
718
+
719
+ use_start = (
720
+ strand == Strand.POSITIVE if is_seg_start else strand != Strand.POSITIVE
721
+ )
722
+
723
+ return SequenceLocation(
724
+ sequenceReference=SequenceReference(
725
+ refgetAccession=ga4gh_seq_id[0].split("ga4gh:")[-1]
726
+ ),
727
+ start=genomic_pos if use_start else None,
728
+ end=genomic_pos if not use_start else None,
729
+ ), None
659
730
 
660
731
  async def _genomic_to_tx_segment(
661
732
  self,
@@ -665,7 +736,7 @@ class ExonGenomicCoordsMapper:
665
736
  transcript: str | None = None,
666
737
  gene: str | None = None,
667
738
  get_nearest_transcript_junction: bool = False,
668
- is_start: bool = True,
739
+ is_seg_start: bool = True,
669
740
  ) -> GenomicTxSeg:
670
741
  """Given genomic data, generate a boundary for a transcript segment.
671
742
 
@@ -692,17 +763,17 @@ class ExonGenomicCoordsMapper:
692
763
  following the breakpoint for the 3' end. For the negative strand, adjacent
693
764
  is defined as the exon following the breakpoint for the 5' end and the exon
694
765
  preceding the breakpoint for the 3' end.
695
- :param is_start: ``True`` if ``genomic_pos`` is where the transcript segment starts.
766
+ :param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment starts.
696
767
  ``False`` if ``genomic_pos`` is where the transcript segment ends.
697
768
  :return: Data for a transcript segment boundary (inter-residue coordinates)
698
769
  """
699
770
  params = {key: None for key in GenomicTxSeg.model_fields}
700
771
 
701
772
  if get_nearest_transcript_junction:
702
- if not gene:
773
+ if not gene and not transcript:
703
774
  return GenomicTxSeg(
704
775
  errors=[
705
- "`gene` must be provided to select the adjacent transcript junction"
776
+ "`gene` or `transcript` must be provided to select the adjacent transcript junction"
706
777
  ]
707
778
  )
708
779
 
@@ -773,8 +844,8 @@ class ExonGenomicCoordsMapper:
773
844
  exon_num = self._get_adjacent_exon(
774
845
  tx_exons_genomic_coords=tx_exons,
775
846
  strand=strand,
776
- start=genomic_pos if is_start else None,
777
- end=genomic_pos if not is_start else None,
847
+ start=genomic_pos if is_seg_start else None,
848
+ end=genomic_pos if not is_seg_start else None,
778
849
  )
779
850
 
780
851
  offset = self._get_exon_offset(
@@ -782,19 +853,26 @@ class ExonGenomicCoordsMapper:
782
853
  end_i=tx_exons[exon_num].alt_end_i,
783
854
  strand=strand,
784
855
  use_start_i=strand == Strand.POSITIVE
785
- if is_start
856
+ if is_seg_start
786
857
  else strand != Strand.POSITIVE,
787
858
  is_in_exon=False,
788
- start=genomic_pos if is_start else None,
789
- end=genomic_pos if not is_start else None,
859
+ start=genomic_pos if is_seg_start else None,
860
+ end=genomic_pos if not is_seg_start else None,
790
861
  )
791
862
 
792
863
  genomic_location, err_msg = self._get_vrs_seq_loc(
793
- genomic_ac, genomic_pos, is_start, strand
864
+ genomic_ac, genomic_pos, is_seg_start, strand
794
865
  )
795
866
  if err_msg:
796
867
  return GenomicTxSeg(errors=[err_msg])
797
868
 
869
+ # gene is not required to liftover coordinates if tx_ac and genomic_ac are given, but we should set the associated gene
870
+ if not gene:
871
+ _gene, err_msg = await self._get_tx_ac_gene(transcript)
872
+ if err_msg:
873
+ return GenomicTxSeg(errors=[err_msg])
874
+ gene = _gene
875
+
798
876
  return GenomicTxSeg(
799
877
  gene=gene,
800
878
  genomic_ac=genomic_ac,
@@ -807,20 +885,17 @@ class ExonGenomicCoordsMapper:
807
885
  )
808
886
 
809
887
  if genomic_ac:
810
- # Check if valid accession is given
811
- if not await self.uta_db.validate_genomic_ac(genomic_ac):
812
- return GenomicTxSeg(errors=[f"Invalid genomic accession: {genomic_ac}"])
813
-
814
888
  _gene, err_msg = await self._get_genomic_ac_gene(genomic_pos, genomic_ac)
815
- if _gene:
816
- if gene and _gene != gene:
817
- return GenomicTxSeg(
818
- errors=[f"Expected gene, {gene}, but found {_gene}"]
819
- )
820
889
 
821
- gene = _gene
822
- else:
890
+ if err_msg:
823
891
  return GenomicTxSeg(errors=[err_msg])
892
+
893
+ if gene and _gene != gene:
894
+ return GenomicTxSeg(
895
+ errors=[f"Expected gene, {gene}, but found {_gene}"]
896
+ )
897
+
898
+ gene = _gene
824
899
  elif chromosome:
825
900
  # Try GRCh38 first
826
901
  for assembly in [Assembly.GRCH38.value, Assembly.GRCH37.value]:
@@ -858,7 +933,7 @@ class ExonGenomicCoordsMapper:
858
933
  )
859
934
 
860
935
  return await self._get_tx_seg_genomic_metadata(
861
- genomic_ac, genomic_pos, is_start, gene, tx_ac=transcript
936
+ genomic_ac, genomic_pos, is_seg_start, gene, tx_ac=transcript
862
937
  )
863
938
 
864
939
  async def _get_grch38_ac_pos(
@@ -943,85 +1018,37 @@ class ExonGenomicCoordsMapper:
943
1018
 
944
1019
  return results[0]["hgnc"], None
945
1020
 
946
- def _get_tx_segment(
1021
+ async def _get_tx_ac_gene(
947
1022
  self,
948
- genomic_ac: str,
949
- strand: Strand,
950
- offset: int,
951
- genomic_ac_data: ExonCoord,
952
- is_seg_start: bool = False,
953
- ) -> tuple[TxSegment | None, str | None]:
954
- """Get transcript segment data given ``genomic_ac`` and offset data
955
-
956
- :param genomic_ac: Genomic RefSeq accession
957
- :param strand: Strand
958
- :param offset: Exon offset
959
- :param genomic_ac_data: Exon coordinate data for ``genomic_ac``
960
- :param is_seg_start: ``True`` if retrieving genomic data where the transcript
961
- segment starts, defaults to ``False``
962
- :return: Transcript segment data
963
- """
964
- if is_seg_start:
965
- if strand == Strand.POSITIVE:
966
- seg_genomic_pos = offset + genomic_ac_data.alt_start_i
967
- else:
968
- seg_genomic_pos = genomic_ac_data.alt_end_i - offset
969
- else:
970
- if strand == Strand.POSITIVE:
971
- seg_genomic_pos = offset + genomic_ac_data.alt_end_i
972
- else:
973
- seg_genomic_pos = genomic_ac_data.alt_start_i - offset
974
-
975
- genomic_loc, err_msg = self._get_vrs_seq_loc(
976
- genomic_ac,
977
- seg_genomic_pos,
978
- is_start=is_seg_start,
979
- strand=strand,
980
- )
981
- if err_msg:
982
- return None, err_msg
983
-
984
- return TxSegment(
985
- exon_ord=genomic_ac_data.ord,
986
- genomic_location=genomic_loc,
987
- offset=offset,
988
- ), None
1023
+ tx_ac: str,
1024
+ ) -> tuple[str | None, str | None]:
1025
+ """Get gene given a transcript.
989
1026
 
990
- def _get_vrs_seq_loc(
991
- self, genomic_ac: str, genomic_pos: int, is_start: bool, strand: Strand
992
- ) -> tuple[SequenceLocation | None, str | None]:
993
- """Create VRS Sequence Location for genomic position where transcript segment
994
- occurs
1027
+ If multiple genes are found for a given ``tx_ac``, only one
1028
+ gene will be returned.
995
1029
 
996
- :param genomic_ac: RefSeq genomic accession
997
- :param genomic_pos: Genomic position where the transcript segment occurs
998
- :param is_start: ``True`` if ``genomic_pos`` is where the transcript segment
999
- starts. ``False`` if ``genomic_pos`` is where the transcript segment ends.
1000
- :param strand: Strand
1001
- :return: Tuple containing VRS location (if successful) and error message (if
1002
- unable to get GA4GH identifier for ``genomic_ac``).
1030
+ :param tx_ac: RefSeq transcript, e.g. ``"NM_004333.6"``
1031
+ :return: HGNC gene symbol associated to transcript and
1032
+ warning
1003
1033
  """
1004
- ga4gh_seq_id, err_msg = self.seqrepo_access.translate_identifier(
1005
- genomic_ac, "ga4gh"
1006
- )
1007
- if err_msg:
1008
- return None, err_msg
1009
-
1010
- use_start = strand == Strand.POSITIVE if is_start else strand != Strand.POSITIVE
1034
+ query = f"""
1035
+ SELECT DISTINCT hgnc
1036
+ FROM {self.uta_db.schema}.tx_exon_aln_v
1037
+ WHERE tx_ac = '{tx_ac}'
1038
+ ORDER BY hgnc
1039
+ LIMIT 1;
1040
+ """ # noqa: S608
1041
+ results = await self.uta_db.execute_query(query)
1042
+ if not results:
1043
+ return None, f"No gene(s) found given {tx_ac}"
1011
1044
 
1012
- return SequenceLocation(
1013
- sequenceReference=SequenceReference(
1014
- refgetAccession=ga4gh_seq_id[0].split("ga4gh:")[-1]
1015
- ),
1016
- start=genomic_pos if use_start else None,
1017
- end=genomic_pos if not use_start else None,
1018
- ), None
1045
+ return results[0]["hgnc"], None
1019
1046
 
1020
1047
  async def _get_tx_seg_genomic_metadata(
1021
1048
  self,
1022
1049
  genomic_ac: str,
1023
1050
  genomic_pos: int,
1024
- is_start: bool,
1051
+ is_seg_start: bool,
1025
1052
  gene: str,
1026
1053
  tx_ac: str | None,
1027
1054
  ) -> GenomicTxSeg:
@@ -1034,7 +1061,7 @@ class ExonGenomicCoordsMapper:
1034
1061
 
1035
1062
  :param genomic_ac: Genomic RefSeq accession
1036
1063
  :param genomic_pos: Genomic position where the transcript segment occurs
1037
- :param is_start: Whether or not ``genomic_pos`` represents the start position.
1064
+ :param is_seg_start: Whether or not ``genomic_pos`` represents the start position.
1038
1065
  :param gene: HGNC gene symbol
1039
1066
  :param tx_ac: Transcript RefSeq accession. If not provided, will use MANE
1040
1067
  transcript
@@ -1092,12 +1119,12 @@ class ExonGenomicCoordsMapper:
1092
1119
  strand=Strand(tx_exon_aln_data.alt_strand),
1093
1120
  use_start_i=False, # This doesn't impact anything since we're on the exon
1094
1121
  is_in_exon=True,
1095
- start=genomic_pos if is_start else None,
1096
- end=genomic_pos if not is_start else None,
1122
+ start=genomic_pos if is_seg_start else None,
1123
+ end=genomic_pos if not is_seg_start else None,
1097
1124
  )
1098
1125
 
1099
1126
  genomic_location, err_msg = self._get_vrs_seq_loc(
1100
- genomic_ac, genomic_pos, is_start, tx_exon_aln_data.alt_strand
1127
+ genomic_ac, genomic_pos, is_seg_start, tx_exon_aln_data.alt_strand
1101
1128
  )
1102
1129
  if err_msg:
1103
1130
  return GenomicTxSeg(errors=[err_msg])
@@ -1114,46 +1141,20 @@ class ExonGenomicCoordsMapper:
1114
1141
  )
1115
1142
 
1116
1143
  @staticmethod
1117
- def _get_exon_offset(
1118
- start_i: int,
1119
- end_i: int,
1120
- strand: Strand,
1121
- use_start_i: bool = True,
1122
- is_in_exon: bool = True,
1123
- start: int | None = None,
1124
- end: int | None = None,
1125
- ) -> int:
1126
- """Compute offset from exon start or end index
1144
+ def _is_exonic_breakpoint(pos: int, tx_genomic_coords: list[_ExonCoord]) -> bool:
1145
+ """Check if a breakpoint occurs on an exon
1127
1146
 
1128
- :param start_i: Exon start index (inter-residue)
1129
- :param end_i: Exon end index (inter-residue)
1130
- :param strand: Strand
1131
- :param use_start_i: Whether or not ``start_i`` should be used to compute the
1132
- offset, defaults to ``True``. This is only used when ``is_in_exon`` is
1133
- ``False``.
1134
- :param is_in_exon: Whether or not the position occurs in an exon, defaults to
1135
- ``True``
1136
- :param start: Provided start position, defaults to ``None``. Must provide
1137
- ``start`` or ``end``, not both.
1138
- :param end: Provided end position, defaults to ``None``. Must provide ``start``
1139
- or ``end``, not both
1140
- :return: Offset from exon start or end index
1147
+ :param pos: Genomic breakpoint
1148
+ :param tx_genomic_coords: A list of transcript exon coordinate data
1149
+ :return: ``True`` if the breakpoint occurs on an exon
1141
1150
  """
1142
- if is_in_exon:
1143
- if start is not None:
1144
- offset = start - start_i if strand == Strand.POSITIVE else end_i - start
1145
- else:
1146
- offset = end - end_i if strand == Strand.POSITIVE else start_i - end
1147
- else:
1148
- if strand == Strand.POSITIVE:
1149
- offset = start - start_i if use_start_i else end - end_i
1150
- else:
1151
- offset = start_i - end if use_start_i else end_i - start
1152
- return offset
1151
+ return any(
1152
+ exon.alt_start_i <= pos <= exon.alt_end_i for exon in tx_genomic_coords
1153
+ )
1153
1154
 
1154
1155
  @staticmethod
1155
1156
  def _get_adjacent_exon(
1156
- tx_exons_genomic_coords: list[ExonCoord],
1157
+ tx_exons_genomic_coords: list[_ExonCoord],
1157
1158
  strand: Strand,
1158
1159
  start: int | None = None,
1159
1160
  end: int | None = None,
@@ -1191,13 +1192,39 @@ class ExonGenomicCoordsMapper:
1191
1192
  return exon.ord if end else exon.ord + 1
1192
1193
 
1193
1194
  @staticmethod
1194
- def _is_exonic_breakpoint(pos: int, tx_genomic_coords: list[ExonCoord]) -> bool:
1195
- """Check if a breakpoint occurs on an exon
1195
+ def _get_exon_offset(
1196
+ start_i: int,
1197
+ end_i: int,
1198
+ strand: Strand,
1199
+ use_start_i: bool = True,
1200
+ is_in_exon: bool = True,
1201
+ start: int | None = None,
1202
+ end: int | None = None,
1203
+ ) -> int:
1204
+ """Compute offset from exon start or end index
1196
1205
 
1197
- :param pos: Genomic breakpoint
1198
- :param tx_genomic_coords: A list of transcript exon coordinate data
1199
- :return: ``True`` if the breakpoint occurs on an exon
1206
+ :param start_i: Exon start index (inter-residue)
1207
+ :param end_i: Exon end index (inter-residue)
1208
+ :param strand: Strand
1209
+ :param use_start_i: Whether or not ``start_i`` should be used to compute the
1210
+ offset, defaults to ``True``. This is only used when ``is_in_exon`` is
1211
+ ``False``.
1212
+ :param is_in_exon: Whether or not the position occurs in an exon, defaults to
1213
+ ``True``
1214
+ :param start: Provided start position, defaults to ``None``. Must provide
1215
+ ``start`` or ``end``, not both.
1216
+ :param end: Provided end position, defaults to ``None``. Must provide ``start``
1217
+ or ``end``, not both
1218
+ :return: Offset from exon start or end index
1200
1219
  """
1201
- return any(
1202
- exon.alt_start_i <= pos <= exon.alt_end_i for exon in tx_genomic_coords
1203
- )
1220
+ if is_in_exon:
1221
+ if start is not None:
1222
+ offset = start - start_i if strand == Strand.POSITIVE else end_i - start
1223
+ else:
1224
+ offset = end - end_i if strand == Strand.POSITIVE else start_i - end
1225
+ else:
1226
+ if strand == Strand.POSITIVE:
1227
+ offset = start - start_i if use_start_i else end - end_i
1228
+ else:
1229
+ offset = start_i - end if use_start_i else end_i - start
1230
+ return offset
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cool_seq_tool
3
- Version: 0.7.0
3
+ Version: 0.7.1
4
4
  Summary: Common Operation on Lots of Sequences Tool
5
5
  Author: Kori Kuzma, James Stevenson, Katie Stahl, Alex Wagner
6
6
  License: MIT License
@@ -6,7 +6,7 @@ cool_seq_tool/handlers/__init__.py,sha256=KalQ46vX1MO4SJz2SlspKoIRy1n3c3Vp1t4Y2p
6
6
  cool_seq_tool/handlers/seqrepo_access.py,sha256=Jd19jbdUvPRPn_XWozL67ph-nSIxpb4_UUimapDrsm4,9162
7
7
  cool_seq_tool/mappers/__init__.py,sha256=O0JRxNFk8nWxD4v5ij47xelhvfVLdEXS43l2tzRuiUE,305
8
8
  cool_seq_tool/mappers/alignment.py,sha256=nV6PS3mhkQ2MD1GcpNBujBOqd3AKxYSYA9BCusFOa1o,9636
9
- cool_seq_tool/mappers/exon_genomic_coords.py,sha256=hfzfuxsNwMvj6y9thwWCj4WcOXamdnqvvd29gmX19Bo,48261
9
+ cool_seq_tool/mappers/exon_genomic_coords.py,sha256=lfmzuVXaYT7w2FBDS3xhJNgETusllomFy5Utzhfhlpc,48782
10
10
  cool_seq_tool/mappers/liftover.py,sha256=lltx9zxfkrb5PHtJlKp3a39JCwPP4e0Zft-mQc1jXL8,3367
11
11
  cool_seq_tool/mappers/mane_transcript.py,sha256=nirxlf3EGVInFYG4fsAqiEmDdTc_h1XuPyX2ul-a7Rk,54368
12
12
  cool_seq_tool/resources/__init__.py,sha256=VwUC8YaucTS6SmRirToulZTF6CuvuLQRSxFfSfAovCc,77
@@ -17,8 +17,8 @@ cool_seq_tool/sources/__init__.py,sha256=51QiymeptF7AeVGgV-tW_9f4pIUr0xtYbyzpvHO
17
17
  cool_seq_tool/sources/mane_transcript_mappings.py,sha256=E_pj7FEBcB6HUR8yhSVibB0beMMlKJ62pK0qvl4y5nw,5358
18
18
  cool_seq_tool/sources/transcript_mappings.py,sha256=903RKTMBO2rbKh6iTQ1BEWnY4C7saBFMPw2_4ATuudg,10054
19
19
  cool_seq_tool/sources/uta_database.py,sha256=gc5wsKOIhvzhwFmPmqOY0hhaVfRkRSzYNa9tpBt81_U,35017
20
- cool_seq_tool-0.7.0.dist-info/LICENSE,sha256=IpqC9A-tZW7XXXvCS8c4AVINqkmpxiVA-34Qe3CZSjo,1072
21
- cool_seq_tool-0.7.0.dist-info/METADATA,sha256=UrSjQTJOgl4sqFvMG_p_TpeZW2R0GE6lMGus9NQhUew,6226
22
- cool_seq_tool-0.7.0.dist-info/WHEEL,sha256=Mdi9PDNwEZptOjTlUcAth7XJDFtKrHYaQMPulZeBCiQ,91
23
- cool_seq_tool-0.7.0.dist-info/top_level.txt,sha256=cGuxdN6p3y16jQf6hCwWhE4OptwUeZPm_PNJlPb3b0k,14
24
- cool_seq_tool-0.7.0.dist-info/RECORD,,
20
+ cool_seq_tool-0.7.1.dist-info/LICENSE,sha256=IpqC9A-tZW7XXXvCS8c4AVINqkmpxiVA-34Qe3CZSjo,1072
21
+ cool_seq_tool-0.7.1.dist-info/METADATA,sha256=Y9_RZI2iHpmNOFwXoFCCKyHs6aXmNrzKQfyHkmqUVmQ,6226
22
+ cool_seq_tool-0.7.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
23
+ cool_seq_tool-0.7.1.dist-info/top_level.txt,sha256=cGuxdN6p3y16jQf6hCwWhE4OptwUeZPm_PNJlPb3b0k,14
24
+ cool_seq_tool-0.7.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (73.0.1)
2
+ Generator: setuptools (75.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5