cool-seq-tool 0.7.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,13 +14,13 @@ from cool_seq_tool.schemas import (
14
14
  Strand,
15
15
  )
16
16
  from cool_seq_tool.sources.mane_transcript_mappings import ManeTranscriptMappings
17
- from cool_seq_tool.sources.uta_database import UtaDatabase
17
+ from cool_seq_tool.sources.uta_database import GenomicAlnData, UtaDatabase
18
18
  from cool_seq_tool.utils import service_meta
19
19
 
20
20
  _logger = logging.getLogger(__name__)
21
21
 
22
22
 
23
- class ExonCoord(BaseModelForbidExtra):
23
+ class _ExonCoord(BaseModelForbidExtra):
24
24
  """Model for representing exon coordinate data"""
25
25
 
26
26
  ord: StrictInt = Field(..., description="Exon number. 0-based.")
@@ -87,7 +87,9 @@ class GenomicTxSeg(BaseModelForbidExtra):
87
87
  """Model for representing a boundary for a transcript segment."""
88
88
 
89
89
  seg: TxSegment | None = Field(None, description="Transcript segment.")
90
- gene: StrictStr | None = Field(None, description="HGNC gene symbol.")
90
+ gene: StrictStr | None = Field(
91
+ None, description="Valid, case-sensitive HGNC gene symbol."
92
+ )
91
93
  genomic_ac: StrictStr | None = Field(None, description="RefSeq genomic accession.")
92
94
  tx_ac: StrictStr | None = Field(None, description="RefSeq transcript accession.")
93
95
  errors: list[StrictStr] = Field([], description="Error messages.")
@@ -97,19 +99,18 @@ class GenomicTxSeg(BaseModelForbidExtra):
97
99
  """Ensure that fields are (un)set depending on errors
98
100
 
99
101
  :param values: Values in model
100
- :raises ValueError: If `seg`, `gene`, `genomic_ac` and `tx_ac` are not
102
+ :raises ValueError: If `seg`, `genomic_ac` and `tx_ac` are not
101
103
  provided when there are no errors
102
104
  :return: Values in model
103
105
  """
104
106
  if not values.get("errors") and not all(
105
107
  (
106
108
  values.get("seg"),
107
- values.get("gene"),
108
109
  values.get("genomic_ac"),
109
110
  values.get("tx_ac"),
110
111
  )
111
112
  ):
112
- err_msg = "`seg`, `gene`, `genomic_ac` and `tx_ac` must be provided"
113
+ err_msg = "`seg`, `genomic_ac` and `tx_ac` must be provided"
113
114
  raise ValueError(err_msg)
114
115
  return values
115
116
 
@@ -140,7 +141,9 @@ class GenomicTxSeg(BaseModelForbidExtra):
140
141
  class GenomicTxSegService(BaseModelForbidExtra):
141
142
  """Service model for genomic and transcript data."""
142
143
 
143
- gene: StrictStr | None = Field(None, description="HGNC gene symbol.")
144
+ gene: StrictStr | None = Field(
145
+ None, description="Valid, case-sensitive HGNC gene symbol."
146
+ )
144
147
  genomic_ac: StrictStr | None = Field(None, description="RefSeq genomic accession.")
145
148
  tx_ac: StrictStr | None = Field(None, description="RefSeq transcript accession.")
146
149
  seg_start: TxSegment | None = Field(None, description="Start transcript segment.")
@@ -154,20 +157,21 @@ class GenomicTxSegService(BaseModelForbidExtra):
154
157
  on errors
155
158
 
156
159
  :param values: Values in model
157
- :raises ValueError: If `gene`, `genomic_ac`, `tx_ac` and `seg_start` or `seg_end`
160
+ :raises ValueError: If `genomic_ac`, `tx_ac` and `seg_start` or `seg_end`
158
161
  not provided when there are no errors
159
162
  :return: Values in model, including service metadata
160
163
  """
161
164
  values["service_meta"] = service_meta()
162
165
  if not values.get("errors") and not all(
163
166
  (
164
- values.get("gene"),
165
167
  values.get("genomic_ac"),
166
168
  values.get("tx_ac"),
167
169
  values.get("seg_start") or values.get("seg_end"),
168
170
  )
169
171
  ):
170
- err_msg = "`gene`, `genomic_ac`, `tx_ac` and `seg_start` or `seg_end` must be provided"
172
+ err_msg = (
173
+ "`genomic_ac`, `tx_ac` and `seg_start` or `seg_end` must be provided"
174
+ )
171
175
  raise ValueError(err_msg)
172
176
 
173
177
  return values
@@ -292,7 +296,7 @@ class ExonGenomicCoordsMapper:
292
296
  ('NC_000001.11', 154192135, 154170399)
293
297
 
294
298
  :param transcript: RefSeq transcript accession
295
- :param gene: HGNC gene symbol
299
+ :param gene: Valid, case-sensitive HGNC gene symbol
296
300
  :param exon_start: Starting transcript exon number (1-based). If not provided,
297
301
  must provide ``exon_end``
298
302
  :param exon_start_offset: Starting exon offset
@@ -335,22 +339,22 @@ class ExonGenomicCoordsMapper:
335
339
  if errors:
336
340
  return _return_service_errors(errors)
337
341
 
338
- if gene:
339
- gene = gene.upper()
340
-
341
342
  # Get aligned genomic data (hgnc gene, alt_ac, alt_start_i, alt_end_i, strand)
342
343
  # for exon(s)
343
- alt_ac_start_end, err_msg = await self._get_alt_ac_start_and_end(
344
+ (
345
+ genomic_aln_start,
346
+ genomic_aln_end,
347
+ err_msg,
348
+ ) = await self._get_genomic_aln_coords(
344
349
  transcript, tx_exon_start_coords, tx_exon_end_coords, gene=gene
345
350
  )
346
- if not alt_ac_start_end:
347
- return _return_service_errors([err_msg] if err_msg else [])
348
- alt_ac_start_data, alt_ac_end_data = alt_ac_start_end
351
+ if err_msg:
352
+ return _return_service_errors([err_msg])
349
353
 
350
354
  # Get gene and chromosome data, check that at least one was retrieved
351
- gene = alt_ac_start_data.hgnc if alt_ac_start_data else alt_ac_end_data.hgnc
355
+ gene = genomic_aln_start.hgnc if genomic_aln_start else genomic_aln_end.hgnc
352
356
  genomic_ac = (
353
- alt_ac_start_data.alt_ac if alt_ac_start_data else alt_ac_end_data.alt_ac
357
+ genomic_aln_start.alt_ac if genomic_aln_start else genomic_aln_end.alt_ac
354
358
  )
355
359
  if gene is None or genomic_ac is None:
356
360
  return _return_service_errors(
@@ -360,9 +364,9 @@ class ExonGenomicCoordsMapper:
360
364
  )
361
365
 
362
366
  strand = (
363
- Strand(alt_ac_start_data.alt_strand)
364
- if alt_ac_start_data
365
- else Strand(alt_ac_end_data.alt_strand)
367
+ Strand(genomic_aln_start.alt_strand)
368
+ if genomic_aln_start
369
+ else Strand(genomic_aln_end.alt_strand)
366
370
  )
367
371
 
368
372
  if exon_start_exists:
@@ -370,7 +374,7 @@ class ExonGenomicCoordsMapper:
370
374
  genomic_ac,
371
375
  strand,
372
376
  exon_start_offset,
373
- alt_ac_start_data,
377
+ genomic_aln_start,
374
378
  is_seg_start=True,
375
379
  )
376
380
  if err_msg:
@@ -380,7 +384,11 @@ class ExonGenomicCoordsMapper:
380
384
 
381
385
  if exon_end_exists:
382
386
  seg_end, err_msg = self._get_tx_segment(
383
- genomic_ac, strand, exon_end_offset, alt_ac_end_data, is_seg_start=False
387
+ genomic_ac,
388
+ strand,
389
+ exon_end_offset,
390
+ genomic_aln_end,
391
+ is_seg_start=False,
384
392
  )
385
393
  if err_msg:
386
394
  return _return_service_errors([err_msg])
@@ -448,7 +456,7 @@ class ExonGenomicCoordsMapper:
448
456
  following the breakpoint for the 3' end. For the negative strand, adjacent
449
457
  is defined as the exon following the breakpoint for the 5' end and the exon
450
458
  preceding the breakpoint for the 3' end.
451
- :param gene: gene name. Ideally, HGNC symbol. Must be given if no ``transcript``
459
+ :param gene: A valid, case-sensitive HGNC symbol. Must be given if no ``transcript``
452
460
  value is provided.
453
461
  :param coordinate_type: Coordinate type for ``seg_start_genomic`` and
454
462
  ``seg_end_genomic``
@@ -466,9 +474,6 @@ class ExonGenomicCoordsMapper:
466
474
  if errors:
467
475
  return _return_service_errors(errors)
468
476
 
469
- if gene is not None:
470
- gene = gene.upper()
471
-
472
477
  params = {}
473
478
 
474
479
  if seg_start_genomic:
@@ -479,7 +484,7 @@ class ExonGenomicCoordsMapper:
479
484
  transcript=transcript,
480
485
  gene=gene,
481
486
  get_nearest_transcript_junction=get_nearest_transcript_junction,
482
- is_start=True,
487
+ is_seg_start=True,
483
488
  )
484
489
  if start_tx_seg_data.errors:
485
490
  return _return_service_errors(start_tx_seg_data.errors)
@@ -499,7 +504,7 @@ class ExonGenomicCoordsMapper:
499
504
  transcript=transcript,
500
505
  gene=gene,
501
506
  get_nearest_transcript_junction=get_nearest_transcript_junction,
502
- is_start=False,
507
+ is_seg_start=False,
503
508
  )
504
509
  if end_tx_seg_data.errors:
505
510
  return _return_service_errors(end_tx_seg_data.errors)
@@ -525,53 +530,13 @@ class ExonGenomicCoordsMapper:
525
530
 
526
531
  return GenomicTxSegService(**params)
527
532
 
528
- async def _get_all_exon_coords(
529
- self, tx_ac: str, genomic_ac: str | None = None
530
- ) -> list[ExonCoord]:
531
- """Get all exon coordinate data for a transcript.
532
-
533
- If ``genomic_ac`` is NOT provided, this method will use the GRCh38 accession
534
- associated to ``tx_ac``.
535
-
536
- :param tx_ac: The RefSeq transcript accession to get exon data for.
537
- :param genomic_ac: The RefSeq genomic accession to get exon data for.
538
- :return: List of all exon coordinate data for ``tx_ac`` and ``genomic_ac``.
539
- The exon coordinate data will include the exon number, transcript and
540
- genomic positions for the start and end of the exon, and strand.
541
- The list will be ordered by ascending exon number.
542
- """
543
- if genomic_ac:
544
- query = f"""
545
- SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
546
- FROM {self.uta_db.schema}.tx_exon_aln_v
547
- WHERE tx_ac = '{tx_ac}'
548
- AND alt_aln_method = 'splign'
549
- AND alt_ac = '{genomic_ac}'
550
- ORDER BY ord ASC
551
- """ # noqa: S608
552
- else:
553
- query = f"""
554
- SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
555
- FROM {self.uta_db.schema}.tx_exon_aln_v as t
556
- INNER JOIN {self.uta_db.schema}._seq_anno_most_recent as s
557
- ON t.alt_ac = s.ac
558
- WHERE s.descr = ''
559
- AND t.tx_ac = '{tx_ac}'
560
- AND t.alt_aln_method = 'splign'
561
- AND t.alt_ac like 'NC_000%'
562
- ORDER BY ord ASC
563
- """ # noqa: S608
564
-
565
- results = await self.uta_db.execute_query(query)
566
- return [ExonCoord(**r) for r in results]
567
-
568
533
  async def _get_start_end_exon_coords(
569
534
  self,
570
535
  tx_ac: str,
571
536
  exon_start: int | None = None,
572
537
  exon_end: int | None = None,
573
538
  genomic_ac: str | None = None,
574
- ) -> tuple[ExonCoord | None, ExonCoord | None, list[str]]:
539
+ ) -> tuple[_ExonCoord | None, _ExonCoord | None, list[str]]:
575
540
  """Get exon coordinates for a transcript given exon start and exon end.
576
541
 
577
542
  If ``genomic_ac`` is NOT provided, this method will use the GRCh38 accession
@@ -606,56 +571,160 @@ class ExonGenomicCoordsMapper:
606
571
 
607
572
  return *start_end_exons, errors
608
573
 
609
- async def _get_alt_ac_start_and_end(
574
+ async def _get_all_exon_coords(
575
+ self, tx_ac: str, genomic_ac: str | None = None
576
+ ) -> list[_ExonCoord]:
577
+ """Get all exon coordinate data for a transcript.
578
+
579
+ If ``genomic_ac`` is NOT provided, this method will use the GRCh38 accession
580
+ associated to ``tx_ac``.
581
+
582
+ :param tx_ac: The RefSeq transcript accession to get exon data for.
583
+ :param genomic_ac: The RefSeq genomic accession to get exon data for.
584
+ :return: List of all exon coordinate data for ``tx_ac`` and ``genomic_ac``.
585
+ The exon coordinate data will include the exon number, transcript and
586
+ genomic positions for the start and end of the exon, and strand.
587
+ The list will be ordered by ascending exon number.
588
+ """
589
+ if genomic_ac:
590
+ query = f"""
591
+ SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
592
+ FROM {self.uta_db.schema}.tx_exon_aln_v
593
+ WHERE tx_ac = '{tx_ac}'
594
+ AND alt_aln_method = 'splign'
595
+ AND alt_ac = '{genomic_ac}'
596
+ ORDER BY ord ASC
597
+ """ # noqa: S608
598
+ else:
599
+ query = f"""
600
+ SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
601
+ FROM {self.uta_db.schema}.tx_exon_aln_v as t
602
+ INNER JOIN {self.uta_db.schema}._seq_anno_most_recent as s
603
+ ON t.alt_ac = s.ac
604
+ WHERE s.descr = ''
605
+ AND t.tx_ac = '{tx_ac}'
606
+ AND t.alt_aln_method = 'splign'
607
+ AND t.alt_ac like 'NC_000%'
608
+ ORDER BY ord ASC
609
+ """ # noqa: S608
610
+
611
+ results = await self.uta_db.execute_query(query)
612
+ return [_ExonCoord(**r) for r in results]
613
+
614
+ async def _get_genomic_aln_coords(
610
615
  self,
611
616
  tx_ac: str,
612
- tx_exon_start: ExonCoord | None = None,
613
- tx_exon_end: ExonCoord | None = None,
617
+ tx_exon_start: _ExonCoord | None = None,
618
+ tx_exon_end: _ExonCoord | None = None,
614
619
  gene: str | None = None,
615
- ) -> tuple[tuple[tuple[int, int], tuple[int, int]] | None, str | None]:
620
+ ) -> tuple[GenomicAlnData | None, GenomicAlnData | None, str | None]:
616
621
  """Get aligned genomic coordinates for transcript exon start and end.
617
622
 
623
+ ``tx_exon_start`` and ``tx_exon_end`` is expected to reference the same
624
+ transcript and genomic accession.
625
+
618
626
  :param tx_ac: Transcript accession
619
627
  :param tx_exon_start: Transcript's exon start coordinates. If not provided,
620
628
  must provide ``tx_exon_end``
621
629
  :param tx_exon_end: Transcript's exon end coordinates. If not provided, must
622
630
  provide ``tx_exon_start``
623
- :param gene: HGNC gene symbol
624
- :return: Aligned genomic data, and warnings if found
631
+ :param gene: A valid, case-sensitive HGNC gene symbol
632
+ :return: Tuple containing aligned genomic data for start and end exon and
633
+ warnings if found
625
634
  """
626
635
  if tx_exon_start is None and tx_exon_end is None:
627
636
  msg = "Must provide either `tx_exon_start` or `tx_exon_end` or both"
628
637
  _logger.warning(msg)
629
- return None, msg
638
+ return None, None, msg
630
639
 
631
- alt_ac_data = {"start": None, "end": None}
640
+ aligned_coords = {"start": None, "end": None}
632
641
  for exon, key in [(tx_exon_start, "start"), (tx_exon_end, "end")]:
633
642
  if exon:
634
- alt_ac_val, warning = await self.uta_db.get_alt_ac_start_or_end(
643
+ aligned_coord, warning = await self.uta_db.get_alt_ac_start_or_end(
635
644
  tx_ac, exon.tx_start_i, exon.tx_end_i, gene=gene
636
645
  )
637
- if alt_ac_val:
638
- alt_ac_data[key] = alt_ac_val
646
+ if aligned_coord:
647
+ aligned_coords[key] = aligned_coord
639
648
  else:
640
- return None, warning
641
-
642
- alt_ac_data_values = alt_ac_data.values()
643
- # Validate that start and end alignments have matching gene, genomic accession,
644
- # and strand
645
- if all(alt_ac_data_values):
646
- for attr in ["hgnc", "alt_ac", "alt_strand"]:
647
- start_attr = getattr(alt_ac_data["start"], attr)
648
- end_attr = getattr(alt_ac_data["end"], attr)
649
- if start_attr != end_attr:
650
- error = f"{attr} mismatch. {start_attr} != {end_attr}."
651
- _logger.warning(
652
- "%s: %s != %s",
653
- error,
654
- start_attr,
655
- end_attr,
656
- )
657
- return None, error
658
- return tuple(alt_ac_data_values), None
649
+ return None, None, warning
650
+
651
+ return *aligned_coords.values(), None
652
+
653
+ def _get_tx_segment(
654
+ self,
655
+ genomic_ac: str,
656
+ strand: Strand,
657
+ offset: int,
658
+ genomic_ac_data: _ExonCoord,
659
+ is_seg_start: bool = False,
660
+ ) -> tuple[TxSegment | None, str | None]:
661
+ """Get transcript segment data given ``genomic_ac`` and offset data
662
+
663
+ :param genomic_ac: Genomic RefSeq accession
664
+ :param strand: Strand
665
+ :param offset: Exon offset
666
+ :param genomic_ac_data: Exon coordinate data for ``genomic_ac``
667
+ :param is_seg_start: ``True`` if retrieving genomic data where the transcript
668
+ segment starts, defaults to ``False``
669
+ :return: Transcript segment data
670
+ """
671
+ if is_seg_start:
672
+ if strand == Strand.POSITIVE:
673
+ seg_genomic_pos = offset + genomic_ac_data.alt_start_i
674
+ else:
675
+ seg_genomic_pos = genomic_ac_data.alt_end_i - offset
676
+ else:
677
+ if strand == Strand.POSITIVE:
678
+ seg_genomic_pos = offset + genomic_ac_data.alt_end_i
679
+ else:
680
+ seg_genomic_pos = genomic_ac_data.alt_start_i - offset
681
+
682
+ genomic_loc, err_msg = self._get_vrs_seq_loc(
683
+ genomic_ac,
684
+ seg_genomic_pos,
685
+ is_seg_start=is_seg_start,
686
+ strand=strand,
687
+ )
688
+ if err_msg:
689
+ return None, err_msg
690
+
691
+ return TxSegment(
692
+ exon_ord=genomic_ac_data.ord,
693
+ genomic_location=genomic_loc,
694
+ offset=offset,
695
+ ), None
696
+
697
+ def _get_vrs_seq_loc(
698
+ self, genomic_ac: str, genomic_pos: int, is_seg_start: bool, strand: Strand
699
+ ) -> tuple[SequenceLocation | None, str | None]:
700
+ """Create VRS Sequence Location for genomic position where transcript segment
701
+ occurs
702
+
703
+ :param genomic_ac: RefSeq genomic accession
704
+ :param genomic_pos: Genomic position where the transcript segment occurs
705
+ :param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment
706
+ starts. ``False`` if ``genomic_pos`` is where the transcript segment ends.
707
+ :param strand: Strand
708
+ :return: Tuple containing VRS location (if successful) and error message (if
709
+ unable to get GA4GH identifier for ``genomic_ac``).
710
+ """
711
+ ga4gh_seq_id, err_msg = self.seqrepo_access.translate_identifier(
712
+ genomic_ac, "ga4gh"
713
+ )
714
+ if err_msg:
715
+ return None, err_msg
716
+
717
+ use_start = (
718
+ strand == Strand.POSITIVE if is_seg_start else strand != Strand.POSITIVE
719
+ )
720
+
721
+ return SequenceLocation(
722
+ sequenceReference=SequenceReference(
723
+ refgetAccession=ga4gh_seq_id[0].split("ga4gh:")[-1]
724
+ ),
725
+ start=genomic_pos if use_start else None,
726
+ end=genomic_pos if not use_start else None,
727
+ ), None
659
728
 
660
729
  async def _genomic_to_tx_segment(
661
730
  self,
@@ -665,7 +734,7 @@ class ExonGenomicCoordsMapper:
665
734
  transcript: str | None = None,
666
735
  gene: str | None = None,
667
736
  get_nearest_transcript_junction: bool = False,
668
- is_start: bool = True,
737
+ is_seg_start: bool = True,
669
738
  ) -> GenomicTxSeg:
670
739
  """Given genomic data, generate a boundary for a transcript segment.
671
740
 
@@ -684,7 +753,7 @@ class ExonGenomicCoordsMapper:
684
753
  :param transcript: The transcript to use. If this is not given, we will try the
685
754
  following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining
686
755
  Compatible Transcript
687
- :param gene: HGNC gene symbol
756
+ :param gene: Valid, case-sensitive HGNC gene symbol
688
757
  :param get_nearest_transcript_junction: If ``True``, this will return the
689
758
  adjacent exon if the position specified by``seg_start_genomic`` or
690
759
  ``seg_end_genomic`` does not occur on an exon. For the positive strand, adjacent
@@ -692,17 +761,17 @@ class ExonGenomicCoordsMapper:
692
761
  following the breakpoint for the 3' end. For the negative strand, adjacent
693
762
  is defined as the exon following the breakpoint for the 5' end and the exon
694
763
  preceding the breakpoint for the 3' end.
695
- :param is_start: ``True`` if ``genomic_pos`` is where the transcript segment starts.
764
+ :param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment starts.
696
765
  ``False`` if ``genomic_pos`` is where the transcript segment ends.
697
766
  :return: Data for a transcript segment boundary (inter-residue coordinates)
698
767
  """
699
768
  params = {key: None for key in GenomicTxSeg.model_fields}
700
769
 
701
770
  if get_nearest_transcript_junction:
702
- if not gene:
771
+ if not gene and not transcript:
703
772
  return GenomicTxSeg(
704
773
  errors=[
705
- "`gene` must be provided to select the adjacent transcript junction"
774
+ "`gene` or `transcript` must be provided to select the adjacent transcript junction"
706
775
  ]
707
776
  )
708
777
 
@@ -773,8 +842,8 @@ class ExonGenomicCoordsMapper:
773
842
  exon_num = self._get_adjacent_exon(
774
843
  tx_exons_genomic_coords=tx_exons,
775
844
  strand=strand,
776
- start=genomic_pos if is_start else None,
777
- end=genomic_pos if not is_start else None,
845
+ start=genomic_pos if is_seg_start else None,
846
+ end=genomic_pos if not is_seg_start else None,
778
847
  )
779
848
 
780
849
  offset = self._get_exon_offset(
@@ -782,19 +851,26 @@ class ExonGenomicCoordsMapper:
782
851
  end_i=tx_exons[exon_num].alt_end_i,
783
852
  strand=strand,
784
853
  use_start_i=strand == Strand.POSITIVE
785
- if is_start
854
+ if is_seg_start
786
855
  else strand != Strand.POSITIVE,
787
856
  is_in_exon=False,
788
- start=genomic_pos if is_start else None,
789
- end=genomic_pos if not is_start else None,
857
+ start=genomic_pos if is_seg_start else None,
858
+ end=genomic_pos if not is_seg_start else None,
790
859
  )
791
860
 
792
861
  genomic_location, err_msg = self._get_vrs_seq_loc(
793
- genomic_ac, genomic_pos, is_start, strand
862
+ genomic_ac, genomic_pos, is_seg_start, strand
794
863
  )
795
864
  if err_msg:
796
865
  return GenomicTxSeg(errors=[err_msg])
797
866
 
867
+ # gene is not required to liftover coordinates if tx_ac and genomic_ac are given, but we should set the associated gene
868
+ if not gene:
869
+ _gene, err_msg = await self._get_tx_ac_gene(transcript)
870
+ if err_msg:
871
+ return GenomicTxSeg(errors=[err_msg])
872
+ gene = _gene
873
+
798
874
  return GenomicTxSeg(
799
875
  gene=gene,
800
876
  genomic_ac=genomic_ac,
@@ -807,20 +883,17 @@ class ExonGenomicCoordsMapper:
807
883
  )
808
884
 
809
885
  if genomic_ac:
810
- # Check if valid accession is given
811
- if not await self.uta_db.validate_genomic_ac(genomic_ac):
812
- return GenomicTxSeg(errors=[f"Invalid genomic accession: {genomic_ac}"])
813
-
814
886
  _gene, err_msg = await self._get_genomic_ac_gene(genomic_pos, genomic_ac)
815
- if _gene:
816
- if gene and _gene != gene:
817
- return GenomicTxSeg(
818
- errors=[f"Expected gene, {gene}, but found {_gene}"]
819
- )
820
887
 
821
- gene = _gene
822
- else:
888
+ if err_msg:
823
889
  return GenomicTxSeg(errors=[err_msg])
890
+
891
+ if gene and _gene != gene:
892
+ return GenomicTxSeg(
893
+ errors=[f"Expected gene, {gene}, but found {_gene}"]
894
+ )
895
+
896
+ gene = _gene
824
897
  elif chromosome:
825
898
  # Try GRCh38 first
826
899
  for assembly in [Assembly.GRCH38.value, Assembly.GRCH37.value]:
@@ -858,7 +931,7 @@ class ExonGenomicCoordsMapper:
858
931
  )
859
932
 
860
933
  return await self._get_tx_seg_genomic_metadata(
861
- genomic_ac, genomic_pos, is_start, gene, tx_ac=transcript
934
+ genomic_ac, genomic_pos, is_seg_start, gene, tx_ac=transcript
862
935
  )
863
936
 
864
937
  async def _get_grch38_ac_pos(
@@ -943,85 +1016,37 @@ class ExonGenomicCoordsMapper:
943
1016
 
944
1017
  return results[0]["hgnc"], None
945
1018
 
946
- def _get_tx_segment(
1019
+ async def _get_tx_ac_gene(
947
1020
  self,
948
- genomic_ac: str,
949
- strand: Strand,
950
- offset: int,
951
- genomic_ac_data: ExonCoord,
952
- is_seg_start: bool = False,
953
- ) -> tuple[TxSegment | None, str | None]:
954
- """Get transcript segment data given ``genomic_ac`` and offset data
955
-
956
- :param genomic_ac: Genomic RefSeq accession
957
- :param strand: Strand
958
- :param offset: Exon offset
959
- :param genomic_ac_data: Exon coordinate data for ``genomic_ac``
960
- :param is_seg_start: ``True`` if retrieving genomic data where the transcript
961
- segment starts, defaults to ``False``
962
- :return: Transcript segment data
963
- """
964
- if is_seg_start:
965
- if strand == Strand.POSITIVE:
966
- seg_genomic_pos = offset + genomic_ac_data.alt_start_i
967
- else:
968
- seg_genomic_pos = genomic_ac_data.alt_end_i - offset
969
- else:
970
- if strand == Strand.POSITIVE:
971
- seg_genomic_pos = offset + genomic_ac_data.alt_end_i
972
- else:
973
- seg_genomic_pos = genomic_ac_data.alt_start_i - offset
974
-
975
- genomic_loc, err_msg = self._get_vrs_seq_loc(
976
- genomic_ac,
977
- seg_genomic_pos,
978
- is_start=is_seg_start,
979
- strand=strand,
980
- )
981
- if err_msg:
982
- return None, err_msg
983
-
984
- return TxSegment(
985
- exon_ord=genomic_ac_data.ord,
986
- genomic_location=genomic_loc,
987
- offset=offset,
988
- ), None
1021
+ tx_ac: str,
1022
+ ) -> tuple[str | None, str | None]:
1023
+ """Get gene given a transcript.
989
1024
 
990
- def _get_vrs_seq_loc(
991
- self, genomic_ac: str, genomic_pos: int, is_start: bool, strand: Strand
992
- ) -> tuple[SequenceLocation | None, str | None]:
993
- """Create VRS Sequence Location for genomic position where transcript segment
994
- occurs
1025
+ If multiple genes are found for a given ``tx_ac``, only one
1026
+ gene will be returned.
995
1027
 
996
- :param genomic_ac: RefSeq genomic accession
997
- :param genomic_pos: Genomic position where the transcript segment occurs
998
- :param is_start: ``True`` if ``genomic_pos`` is where the transcript segment
999
- starts. ``False`` if ``genomic_pos`` is where the transcript segment ends.
1000
- :param strand: Strand
1001
- :return: Tuple containing VRS location (if successful) and error message (if
1002
- unable to get GA4GH identifier for ``genomic_ac``).
1028
+ :param tx_ac: RefSeq transcript, e.g. ``"NM_004333.6"``
1029
+ :return: HGNC gene symbol associated to transcript and
1030
+ warning
1003
1031
  """
1004
- ga4gh_seq_id, err_msg = self.seqrepo_access.translate_identifier(
1005
- genomic_ac, "ga4gh"
1006
- )
1007
- if err_msg:
1008
- return None, err_msg
1009
-
1010
- use_start = strand == Strand.POSITIVE if is_start else strand != Strand.POSITIVE
1032
+ query = f"""
1033
+ SELECT DISTINCT hgnc
1034
+ FROM {self.uta_db.schema}.tx_exon_aln_v
1035
+ WHERE tx_ac = '{tx_ac}'
1036
+ ORDER BY hgnc
1037
+ LIMIT 1;
1038
+ """ # noqa: S608
1039
+ results = await self.uta_db.execute_query(query)
1040
+ if not results:
1041
+ return None, f"No gene(s) found given {tx_ac}"
1011
1042
 
1012
- return SequenceLocation(
1013
- sequenceReference=SequenceReference(
1014
- refgetAccession=ga4gh_seq_id[0].split("ga4gh:")[-1]
1015
- ),
1016
- start=genomic_pos if use_start else None,
1017
- end=genomic_pos if not use_start else None,
1018
- ), None
1043
+ return results[0]["hgnc"], None
1019
1044
 
1020
1045
  async def _get_tx_seg_genomic_metadata(
1021
1046
  self,
1022
1047
  genomic_ac: str,
1023
1048
  genomic_pos: int,
1024
- is_start: bool,
1049
+ is_seg_start: bool,
1025
1050
  gene: str,
1026
1051
  tx_ac: str | None,
1027
1052
  ) -> GenomicTxSeg:
@@ -1034,8 +1059,8 @@ class ExonGenomicCoordsMapper:
1034
1059
 
1035
1060
  :param genomic_ac: Genomic RefSeq accession
1036
1061
  :param genomic_pos: Genomic position where the transcript segment occurs
1037
- :param is_start: Whether or not ``genomic_pos`` represents the start position.
1038
- :param gene: HGNC gene symbol
1062
+ :param is_seg_start: Whether or not ``genomic_pos`` represents the start position.
1063
+ :param gene: Valid, case-sensitive HGNC gene symbol
1039
1064
  :param tx_ac: Transcript RefSeq accession. If not provided, will use MANE
1040
1065
  transcript
1041
1066
  :return: Transcript segment data and associated genomic metadata
@@ -1092,12 +1117,12 @@ class ExonGenomicCoordsMapper:
1092
1117
  strand=Strand(tx_exon_aln_data.alt_strand),
1093
1118
  use_start_i=False, # This doesn't impact anything since we're on the exon
1094
1119
  is_in_exon=True,
1095
- start=genomic_pos if is_start else None,
1096
- end=genomic_pos if not is_start else None,
1120
+ start=genomic_pos if is_seg_start else None,
1121
+ end=genomic_pos if not is_seg_start else None,
1097
1122
  )
1098
1123
 
1099
1124
  genomic_location, err_msg = self._get_vrs_seq_loc(
1100
- genomic_ac, genomic_pos, is_start, tx_exon_aln_data.alt_strand
1125
+ genomic_ac, genomic_pos, is_seg_start, tx_exon_aln_data.alt_strand
1101
1126
  )
1102
1127
  if err_msg:
1103
1128
  return GenomicTxSeg(errors=[err_msg])
@@ -1114,46 +1139,20 @@ class ExonGenomicCoordsMapper:
1114
1139
  )
1115
1140
 
1116
1141
  @staticmethod
1117
- def _get_exon_offset(
1118
- start_i: int,
1119
- end_i: int,
1120
- strand: Strand,
1121
- use_start_i: bool = True,
1122
- is_in_exon: bool = True,
1123
- start: int | None = None,
1124
- end: int | None = None,
1125
- ) -> int:
1126
- """Compute offset from exon start or end index
1142
+ def _is_exonic_breakpoint(pos: int, tx_genomic_coords: list[_ExonCoord]) -> bool:
1143
+ """Check if a breakpoint occurs on an exon
1127
1144
 
1128
- :param start_i: Exon start index (inter-residue)
1129
- :param end_i: Exon end index (inter-residue)
1130
- :param strand: Strand
1131
- :param use_start_i: Whether or not ``start_i`` should be used to compute the
1132
- offset, defaults to ``True``. This is only used when ``is_in_exon`` is
1133
- ``False``.
1134
- :param is_in_exon: Whether or not the position occurs in an exon, defaults to
1135
- ``True``
1136
- :param start: Provided start position, defaults to ``None``. Must provide
1137
- ``start`` or ``end``, not both.
1138
- :param end: Provided end position, defaults to ``None``. Must provide ``start``
1139
- or ``end``, not both
1140
- :return: Offset from exon start or end index
1145
+ :param pos: Genomic breakpoint
1146
+ :param tx_genomic_coords: A list of transcript exon coordinate data
1147
+ :return: ``True`` if the breakpoint occurs on an exon
1141
1148
  """
1142
- if is_in_exon:
1143
- if start is not None:
1144
- offset = start - start_i if strand == Strand.POSITIVE else end_i - start
1145
- else:
1146
- offset = end - end_i if strand == Strand.POSITIVE else start_i - end
1147
- else:
1148
- if strand == Strand.POSITIVE:
1149
- offset = start - start_i if use_start_i else end - end_i
1150
- else:
1151
- offset = start_i - end if use_start_i else end_i - start
1152
- return offset
1149
+ return any(
1150
+ exon.alt_start_i <= pos <= exon.alt_end_i for exon in tx_genomic_coords
1151
+ )
1153
1152
 
1154
1153
  @staticmethod
1155
1154
  def _get_adjacent_exon(
1156
- tx_exons_genomic_coords: list[ExonCoord],
1155
+ tx_exons_genomic_coords: list[_ExonCoord],
1157
1156
  strand: Strand,
1158
1157
  start: int | None = None,
1159
1158
  end: int | None = None,
@@ -1191,13 +1190,39 @@ class ExonGenomicCoordsMapper:
1191
1190
  return exon.ord if end else exon.ord + 1
1192
1191
 
1193
1192
  @staticmethod
1194
- def _is_exonic_breakpoint(pos: int, tx_genomic_coords: list[ExonCoord]) -> bool:
1195
- """Check if a breakpoint occurs on an exon
1193
+ def _get_exon_offset(
1194
+ start_i: int,
1195
+ end_i: int,
1196
+ strand: Strand,
1197
+ use_start_i: bool = True,
1198
+ is_in_exon: bool = True,
1199
+ start: int | None = None,
1200
+ end: int | None = None,
1201
+ ) -> int:
1202
+ """Compute offset from exon start or end index
1196
1203
 
1197
- :param pos: Genomic breakpoint
1198
- :param tx_genomic_coords: A list of transcript exon coordinate data
1199
- :return: ``True`` if the breakpoint occurs on an exon
1204
+ :param start_i: Exon start index (inter-residue)
1205
+ :param end_i: Exon end index (inter-residue)
1206
+ :param strand: Strand
1207
+ :param use_start_i: Whether or not ``start_i`` should be used to compute the
1208
+ offset, defaults to ``True``. This is only used when ``is_in_exon`` is
1209
+ ``False``.
1210
+ :param is_in_exon: Whether or not the position occurs in an exon, defaults to
1211
+ ``True``
1212
+ :param start: Provided start position, defaults to ``None``. Must provide
1213
+ ``start`` or ``end``, not both.
1214
+ :param end: Provided end position, defaults to ``None``. Must provide ``start``
1215
+ or ``end``, not both
1216
+ :return: Offset from exon start or end index
1200
1217
  """
1201
- return any(
1202
- exon.alt_start_i <= pos <= exon.alt_end_i for exon in tx_genomic_coords
1203
- )
1218
+ if is_in_exon:
1219
+ if start is not None:
1220
+ offset = start - start_i if strand == Strand.POSITIVE else end_i - start
1221
+ else:
1222
+ offset = end - end_i if strand == Strand.POSITIVE else start_i - end
1223
+ else:
1224
+ if strand == Strand.POSITIVE:
1225
+ offset = start - start_i if use_start_i else end - end_i
1226
+ else:
1227
+ offset = start_i - end if use_start_i else end_i - start
1228
+ return offset
@@ -61,7 +61,9 @@ class ManeTranscriptMappings:
61
61
  location information). The list is sorted so that a MANE Select entry comes
62
62
  first, followed by a MANE Plus Clinical entry, if available.
63
63
  """
64
- data = self.df.filter(pl.col("symbol") == gene_symbol.upper())
64
+ data = self.df.filter(
65
+ pl.col("symbol").str.to_uppercase() == gene_symbol.upper()
66
+ )
65
67
 
66
68
  if len(data) == 0:
67
69
  _logger.warning(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cool_seq_tool
3
- Version: 0.7.0
3
+ Version: 0.8.0
4
4
  Summary: Common Operation on Lots of Sequences Tool
5
5
  Author: Kori Kuzma, James Stevenson, Katie Stahl, Alex Wagner
6
6
  License: MIT License
@@ -6,7 +6,7 @@ cool_seq_tool/handlers/__init__.py,sha256=KalQ46vX1MO4SJz2SlspKoIRy1n3c3Vp1t4Y2p
6
6
  cool_seq_tool/handlers/seqrepo_access.py,sha256=Jd19jbdUvPRPn_XWozL67ph-nSIxpb4_UUimapDrsm4,9162
7
7
  cool_seq_tool/mappers/__init__.py,sha256=O0JRxNFk8nWxD4v5ij47xelhvfVLdEXS43l2tzRuiUE,305
8
8
  cool_seq_tool/mappers/alignment.py,sha256=nV6PS3mhkQ2MD1GcpNBujBOqd3AKxYSYA9BCusFOa1o,9636
9
- cool_seq_tool/mappers/exon_genomic_coords.py,sha256=hfzfuxsNwMvj6y9thwWCj4WcOXamdnqvvd29gmX19Bo,48261
9
+ cool_seq_tool/mappers/exon_genomic_coords.py,sha256=XoG60ha1JDoAI-vlc-0rh3tFSyKx61u49hVyeBpjPME,48836
10
10
  cool_seq_tool/mappers/liftover.py,sha256=lltx9zxfkrb5PHtJlKp3a39JCwPP4e0Zft-mQc1jXL8,3367
11
11
  cool_seq_tool/mappers/mane_transcript.py,sha256=nirxlf3EGVInFYG4fsAqiEmDdTc_h1XuPyX2ul-a7Rk,54368
12
12
  cool_seq_tool/resources/__init__.py,sha256=VwUC8YaucTS6SmRirToulZTF6CuvuLQRSxFfSfAovCc,77
@@ -14,11 +14,11 @@ cool_seq_tool/resources/data_files.py,sha256=3lhu28tzlSoTs4vHZNu-hhoAWRrPGuZj_oI
14
14
  cool_seq_tool/resources/status.py,sha256=L0KM-VG3N4Yuaqh3AKZd_2KPDLR0Y7rvW_OD6x8mF7A,5717
15
15
  cool_seq_tool/resources/transcript_mapping.tsv,sha256=AO3luYQAbFiCoRgiiPXotakb5pAwx1jDCeXpvGdIuac,24138769
16
16
  cool_seq_tool/sources/__init__.py,sha256=51QiymeptF7AeVGgV-tW_9f4pIUr0xtYbyzpvHOCneM,304
17
- cool_seq_tool/sources/mane_transcript_mappings.py,sha256=E_pj7FEBcB6HUR8yhSVibB0beMMlKJ62pK0qvl4y5nw,5358
17
+ cool_seq_tool/sources/mane_transcript_mappings.py,sha256=Q6J57O2lLWXlgKT0zq3BIwkwFawySnORHOX-UxzfyDE,5399
18
18
  cool_seq_tool/sources/transcript_mappings.py,sha256=903RKTMBO2rbKh6iTQ1BEWnY4C7saBFMPw2_4ATuudg,10054
19
19
  cool_seq_tool/sources/uta_database.py,sha256=gc5wsKOIhvzhwFmPmqOY0hhaVfRkRSzYNa9tpBt81_U,35017
20
- cool_seq_tool-0.7.0.dist-info/LICENSE,sha256=IpqC9A-tZW7XXXvCS8c4AVINqkmpxiVA-34Qe3CZSjo,1072
21
- cool_seq_tool-0.7.0.dist-info/METADATA,sha256=UrSjQTJOgl4sqFvMG_p_TpeZW2R0GE6lMGus9NQhUew,6226
22
- cool_seq_tool-0.7.0.dist-info/WHEEL,sha256=Mdi9PDNwEZptOjTlUcAth7XJDFtKrHYaQMPulZeBCiQ,91
23
- cool_seq_tool-0.7.0.dist-info/top_level.txt,sha256=cGuxdN6p3y16jQf6hCwWhE4OptwUeZPm_PNJlPb3b0k,14
24
- cool_seq_tool-0.7.0.dist-info/RECORD,,
20
+ cool_seq_tool-0.8.0.dist-info/LICENSE,sha256=IpqC9A-tZW7XXXvCS8c4AVINqkmpxiVA-34Qe3CZSjo,1072
21
+ cool_seq_tool-0.8.0.dist-info/METADATA,sha256=moL8cCRR-wPQh3t9kJviAVRuvAapZJ40-8Ea8pjIRic,6226
22
+ cool_seq_tool-0.8.0.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
23
+ cool_seq_tool-0.8.0.dist-info/top_level.txt,sha256=cGuxdN6p3y16jQf6hCwWhE4OptwUeZPm_PNJlPb3b0k,14
24
+ cool_seq_tool-0.8.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (73.0.1)
2
+ Generator: setuptools (75.3.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5