cool-seq-tool 0.9.1__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,6 +10,7 @@ from cool_seq_tool.mappers.liftover import LiftOver
10
10
  from cool_seq_tool.schemas import (
11
11
  Assembly,
12
12
  BaseModelForbidExtra,
13
+ CoordinateType,
13
14
  ServiceMeta,
14
15
  Strand,
15
16
  )
@@ -410,15 +411,14 @@ class ExonGenomicCoordsMapper:
410
411
  seg_start_genomic: int | None = None,
411
412
  seg_end_genomic: int | None = None,
412
413
  transcript: str | None = None,
413
- get_nearest_transcript_junction: bool = False,
414
414
  gene: str | None = None,
415
+ coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE,
416
+ starting_assembly: Assembly = Assembly.GRCH38,
415
417
  ) -> GenomicTxSegService:
416
418
  """Get transcript segment data for genomic data, lifted over to GRCh38.
417
419
 
418
420
  If liftover to GRCh38 is unsuccessful, will return errors.
419
421
 
420
- Must provide inter-residue coordinates.
421
-
422
422
  MANE Transcript data will be returned if and only if ``transcript`` is not
423
423
  supplied. ``gene`` must be given in order to retrieve MANE Transcript data.
424
424
 
@@ -442,24 +442,21 @@ class ExonGenomicCoordsMapper:
442
442
  used.
443
443
  :param genomic_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided,
444
444
  must provide ``chromosome``. If ``chromosome`` is also provided,
445
- ``genomic_ac`` will be used.
445
+ ``genomic_ac`` will be used. If the genomic accession is from GRCh37, it
446
+ will be lifted over to GRCh38 and the original accession version will be
447
+ ignored
446
448
  :param seg_start_genomic: Genomic position where the transcript segment starts
447
449
  :param seg_end_genomic: Genomic position where the transcript segment ends
448
450
  :param transcript: The transcript to use. If this is not given, we will try the
449
451
  following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining
450
452
  Compatible Transcript. See the :ref:`Transcript Selection policy <transcript_selection_policy>`
451
453
  page.
452
- :param get_nearest_transcript_junction: If ``True``, this will return the
453
- adjacent exon if the position specified by``seg_start_genomic`` or
454
- ``seg_end_genomic`` does not occur on an exon. For the positive strand, adjacent
455
- is defined as the exon preceding the breakpoint for the 5' end and the exon
456
- following the breakpoint for the 3' end. For the negative strand, adjacent
457
- is defined as the exon following the breakpoint for the 5' end and the exon
458
- preceding the breakpoint for the 3' end.
459
454
  :param gene: A valid, case-sensitive HGNC symbol. Must be given if no ``transcript``
460
455
  value is provided.
461
456
  :param coordinate_type: Coordinate type for ``seg_start_genomic`` and
462
- ``seg_end_genomic``
457
+ ``seg_end_genomic``. Expects inter-residue coordinates by default
458
+ :param starting_assembly: The assembly that the supplied coordinate comes from. Set to
459
+ GRCh38 by default. Will attempt to liftover if starting assembly is GRCh37
463
460
  :return: Genomic data (inter-residue coordinates)
464
461
  """
465
462
  errors = []
@@ -483,8 +480,9 @@ class ExonGenomicCoordsMapper:
483
480
  genomic_ac=genomic_ac,
484
481
  transcript=transcript,
485
482
  gene=gene,
486
- get_nearest_transcript_junction=get_nearest_transcript_junction,
487
483
  is_seg_start=True,
484
+ coordinate_type=coordinate_type,
485
+ starting_assembly=starting_assembly,
488
486
  )
489
487
  if start_tx_seg_data.errors:
490
488
  return _return_service_errors(start_tx_seg_data.errors)
@@ -503,8 +501,9 @@ class ExonGenomicCoordsMapper:
503
501
  genomic_ac=genomic_ac,
504
502
  transcript=transcript,
505
503
  gene=gene,
506
- get_nearest_transcript_junction=get_nearest_transcript_junction,
507
504
  is_seg_start=False,
505
+ coordinate_type=coordinate_type,
506
+ starting_assembly=starting_assembly,
508
507
  )
509
508
  if end_tx_seg_data.errors:
510
509
  return _return_service_errors(end_tx_seg_data.errors)
@@ -553,7 +552,7 @@ class ExonGenomicCoordsMapper:
553
552
  """
554
553
  tx_exons = await self._get_all_exon_coords(tx_ac, genomic_ac=genomic_ac)
555
554
  if not tx_exons:
556
- return None, None, [f"No exons found given {tx_ac}"]
555
+ return None, None, [f"Transcript does not exist in UTA: {tx_ac}"]
557
556
 
558
557
  errors = []
559
558
  start_end_exons = []
@@ -733,288 +732,246 @@ class ExonGenomicCoordsMapper:
733
732
  genomic_ac: str | None = None,
734
733
  transcript: str | None = None,
735
734
  gene: str | None = None,
736
- get_nearest_transcript_junction: bool = False,
737
735
  is_seg_start: bool = True,
736
+ coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE,
737
+ starting_assembly: Assembly = Assembly.GRCH38,
738
738
  ) -> GenomicTxSeg:
739
739
  """Given genomic data, generate a boundary for a transcript segment.
740
740
 
741
741
  Will liftover to GRCh38 assembly. If liftover is unsuccessful, will return
742
742
  errors.
743
743
 
744
+ Either an HGNC gene symbol or transcript accession must be provided to this
745
+ method
746
+
744
747
  :param genomic_pos: Genomic position where the transcript segment starts or ends
745
- (inter-residue based)
746
748
  :param chromosome: Chromosome. Must give chromosome without a prefix
747
749
  (i.e. ``1`` or ``X``). If not provided, must provide ``genomic_ac``. If
748
750
  position maps to both GRCh37 and GRCh38, GRCh38 assembly will be used.
749
751
  If ``genomic_ac`` is also provided, ``genomic_ac`` will be used.
750
752
  :param genomic_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided,
751
753
  must provide ``chromosome``. If ``chromosome`` is also provided, ``genomic_ac``
752
- will be used.
754
+ will be used. If the genomic accession is from GRCh37, it will be lifted
755
+ over to GRCh38 and the original accession version will be ignored
753
756
  :param transcript: The transcript to use. If this is not given, we will try the
754
757
  following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining
755
758
  Compatible Transcript
756
759
  :param gene: Valid, case-sensitive HGNC gene symbol
757
- :param get_nearest_transcript_junction: If ``True``, this will return the
758
- adjacent exon if the position specified by``seg_start_genomic`` or
759
- ``seg_end_genomic`` does not occur on an exon. For the positive strand, adjacent
760
- is defined as the exon preceding the breakpoint for the 5' end and the exon
761
- following the breakpoint for the 3' end. For the negative strand, adjacent
762
- is defined as the exon following the breakpoint for the 5' end and the exon
763
- preceding the breakpoint for the 3' end.
764
760
  :param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment starts.
765
761
  ``False`` if ``genomic_pos`` is where the transcript segment ends.
762
+ :param coordinate_type: Coordinate type for ``genomic_pos``. Expects
763
+ inter-residue coordinates by default
764
+ :param starting_assembly: The assembly that the supplied coordinate comes from. Set to
765
+ GRCh38 by default. Will attempt to liftover if starting assembly is GRCh37
766
766
  :return: Data for a transcript segment boundary (inter-residue coordinates)
767
767
  """
768
768
  params = {key: None for key in GenomicTxSeg.model_fields}
769
769
 
770
- if get_nearest_transcript_junction:
771
- if not gene and not transcript:
772
- return GenomicTxSeg(
773
- errors=[
774
- "`gene` or `transcript` must be provided to select the adjacent transcript junction"
775
- ]
776
- )
777
-
778
- if not genomic_ac:
779
- genomic_acs, err_msg = self.seqrepo_access.chromosome_to_acs(chromosome)
780
-
781
- if not genomic_acs:
782
- return GenomicTxSeg(
783
- errors=[err_msg],
784
- )
785
- genomic_ac = genomic_acs[0]
786
-
787
- # Always liftover to GRCh38
788
- genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos(
789
- genomic_ac, genomic_pos
790
- )
791
- if err_msg:
792
- return GenomicTxSeg(errors=[err_msg])
770
+ # Validate inputs exist in UTA
771
+ if gene:
772
+ gene_validation = await self.uta_db.gene_exists(gene)
773
+ if not gene_validation:
774
+ return GenomicTxSeg(errors=[f"Gene does not exist in UTA: {gene}"])
793
775
 
794
- if not transcript:
795
- # Select a transcript if not provided
796
- mane_transcripts = self.mane_transcript_mappings.get_gene_mane_data(
797
- gene
776
+ if transcript:
777
+ transcript_validation = await self.uta_db.transcript_exists(transcript)
778
+ if not transcript_validation:
779
+ return GenomicTxSeg(
780
+ errors=[f"Transcript does not exist in UTA: {transcript}"]
798
781
  )
799
782
 
800
- if mane_transcripts:
801
- transcript = mane_transcripts[0]["RefSeq_nuc"]
802
- else:
803
- # Attempt to find a coding transcript if a MANE transcript
804
- # cannot be found
805
- results = await self.uta_db.get_transcripts(
806
- gene=gene, alt_ac=genomic_ac
807
- )
808
-
809
- if not results.is_empty():
810
- transcript = results[0]["tx_ac"][0]
811
- else:
812
- # Run if gene is for a noncoding transcript
813
- query = f"""
814
- SELECT DISTINCT tx_ac
815
- FROM {self.uta_db.schema}.tx_exon_aln_v
816
- WHERE hgnc = '{gene}'
817
- AND alt_ac = '{genomic_ac}'
818
- """ # noqa: S608
819
- result = await self.uta_db.execute_query(query)
820
-
821
- if result:
822
- transcript = result[0]["tx_ac"]
823
- else:
824
- return GenomicTxSeg(
825
- errors=[
826
- f"Could not find a transcript for {gene} on {genomic_ac}"
827
- ]
828
- )
829
-
830
- tx_exons = await self._get_all_exon_coords(
831
- tx_ac=transcript, genomic_ac=genomic_ac
832
- )
833
- if not tx_exons:
834
- return GenomicTxSeg(errors=[f"No exons found given {transcript}"])
835
-
836
- strand = Strand(tx_exons[0].alt_strand)
837
- params["strand"] = strand
838
-
839
- # Check if breakpoint occurs on an exon.
840
- # If not, determine the adjacent exon given the selected transcript
841
- if not self._is_exonic_breakpoint(genomic_pos, tx_exons):
842
- exon_num = self._get_adjacent_exon(
843
- tx_exons_genomic_coords=tx_exons,
844
- strand=strand,
845
- start=genomic_pos if is_seg_start else None,
846
- end=genomic_pos if not is_seg_start else None,
783
+ if genomic_ac:
784
+ grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
785
+ if grch38_ac:
786
+ genomic_ac = grch38_ac[0]
787
+ else:
788
+ return GenomicTxSeg(
789
+ errors=[f"Genomic accession does not exist in UTA: {genomic_ac}"]
847
790
  )
791
+ else:
792
+ genomic_acs, err_msg = self.seqrepo_access.chromosome_to_acs(chromosome)
848
793
 
849
- offset = self._get_exon_offset(
850
- start_i=tx_exons[exon_num].alt_start_i,
851
- end_i=tx_exons[exon_num].alt_end_i,
852
- strand=strand,
853
- use_start_i=strand == Strand.POSITIVE
854
- if is_seg_start
855
- else strand != Strand.POSITIVE,
856
- is_in_exon=False,
857
- start=genomic_pos if is_seg_start else None,
858
- end=genomic_pos if not is_seg_start else None,
794
+ if not genomic_acs:
795
+ return GenomicTxSeg(
796
+ errors=[err_msg],
859
797
  )
798
+ genomic_ac = genomic_acs[0]
860
799
 
861
- genomic_location, err_msg = self._get_vrs_seq_loc(
862
- genomic_ac, genomic_pos, is_seg_start, strand
800
+ # Liftover to GRCh38 if the provided assembly is GRCh37
801
+ if starting_assembly == Assembly.GRCH37:
802
+ genomic_pos = await self._get_grch38_pos(
803
+ genomic_ac, genomic_pos, chromosome=chromosome if chromosome else None
804
+ )
805
+ if not genomic_pos:
806
+ return GenomicTxSeg(
807
+ errors=[
808
+ f"Lifting over {genomic_pos} on {genomic_ac} from {Assembly.GRCH37.value} to {Assembly.GRCH38.value} was unsuccessful."
809
+ ]
863
810
  )
864
- if err_msg:
865
- return GenomicTxSeg(errors=[err_msg])
866
811
 
867
- # gene is not required to liftover coordinates if tx_ac and genomic_ac are given, but we should set the associated gene
868
- if not gene:
869
- _gene, err_msg = await self._get_tx_ac_gene(transcript)
870
- if err_msg:
871
- return GenomicTxSeg(errors=[err_msg])
872
- gene = _gene
812
+ # Select a transcript if not provided
813
+ if not transcript:
814
+ mane_transcripts = self.mane_transcript_mappings.get_gene_mane_data(gene)
873
815
 
874
- return GenomicTxSeg(
875
- gene=gene,
876
- genomic_ac=genomic_ac,
877
- tx_ac=transcript,
878
- seg=TxSegment(
879
- exon_ord=exon_num,
880
- offset=offset,
881
- genomic_location=genomic_location,
882
- ),
816
+ if mane_transcripts:
817
+ transcript = mane_transcripts[0]["RefSeq_nuc"]
818
+ else:
819
+ # Attempt to find a coding transcript if a MANE transcript
820
+ # cannot be found
821
+ results = await self.uta_db.get_transcripts(
822
+ gene=gene, alt_ac=genomic_ac
883
823
  )
884
824
 
885
- if genomic_ac:
886
- _gene, err_msg = await self._get_genomic_ac_gene(genomic_pos, genomic_ac)
887
-
825
+ if not results.is_empty():
826
+ transcript = results[0]["tx_ac"][0]
827
+ else:
828
+ # Run if gene is for a noncoding transcript
829
+ query = f"""
830
+ SELECT DISTINCT tx_ac
831
+ FROM {self.uta_db.schema}.tx_exon_aln_v
832
+ WHERE hgnc = '{gene}'
833
+ AND alt_ac = '{genomic_ac}'
834
+ """ # noqa: S608
835
+ result = await self.uta_db.execute_query(query)
836
+
837
+ if result:
838
+ transcript = result[0]["tx_ac"]
839
+ else:
840
+ return GenomicTxSeg(
841
+ errors=[
842
+ f"Could not find a transcript for {gene} on {genomic_ac}"
843
+ ]
844
+ )
845
+ # gene is not required to liftover coordinates if tx_ac and genomic_ac are given, but we should set the associated gene
846
+ if not gene:
847
+ _gene, err_msg = await self._get_tx_ac_gene(transcript)
888
848
  if err_msg:
889
849
  return GenomicTxSeg(errors=[err_msg])
850
+ gene = _gene
890
851
 
891
- if gene and _gene != gene:
892
- return GenomicTxSeg(
893
- errors=[f"Expected gene, {gene}, but found {_gene}"]
894
- )
852
+ tx_exons = await self._get_all_exon_coords(
853
+ tx_ac=transcript, genomic_ac=genomic_ac
854
+ )
855
+ if not tx_exons:
856
+ return GenomicTxSeg(
857
+ errors=[f"No exons found given transcript: {transcript}"]
858
+ )
895
859
 
896
- gene = _gene
897
- elif chromosome:
898
- # Try GRCh38 first
899
- for assembly in [Assembly.GRCH38.value, Assembly.GRCH37.value]:
900
- _genomic_acs, err_msg = self.seqrepo_access.translate_identifier(
901
- f"{assembly}:chr{chromosome}", "refseq"
902
- )
903
- if err_msg:
904
- return GenomicTxSeg(errors=[err_msg])
905
- _genomic_ac = _genomic_acs[0].split(":")[-1]
860
+ strand = Strand(tx_exons[0].alt_strand)
861
+ params["strand"] = strand
862
+ use_alt_start_i = self._use_alt_start_i(
863
+ is_seg_start=is_seg_start, strand=strand
864
+ )
865
+ if use_alt_start_i and coordinate_type == CoordinateType.RESIDUE:
866
+ genomic_pos = genomic_pos - 1 # Convert residue coordinate to inter-residue
906
867
 
907
- _gene, err_msg = await self._get_genomic_ac_gene(
908
- genomic_pos, _genomic_ac
909
- )
910
- if _gene:
911
- if gene and _gene != gene:
912
- return GenomicTxSeg(
913
- errors=[f"Expected gene, {gene}, but found {_gene}"]
914
- )
915
- gene = _gene
916
- genomic_ac = _genomic_ac
917
- break
868
+ # Validate that the breakpoint occurs between the first and last exon for the selected transcript
869
+ coordinate_check = await self._validate_genomic_breakpoint(
870
+ pos=genomic_pos, genomic_ac=genomic_ac, tx_ac=transcript
871
+ )
872
+ if not coordinate_check:
873
+ return GenomicTxSeg(
874
+ errors=[
875
+ f"{genomic_pos} on {genomic_ac} does not occur within the exons for {transcript}"
876
+ ]
877
+ )
918
878
 
919
- if not genomic_ac:
920
- return GenomicTxSeg(
921
- errors=[
922
- f"Unable to get genomic RefSeq accession for chromosome {chromosome} on position {genomic_pos}"
923
- ]
924
- )
879
+ # Check if breakpoint occurs on an exon.
880
+ # If not, determine the adjacent exon given the selected transcript
881
+ if not self._is_exonic_breakpoint(genomic_pos, tx_exons):
882
+ exon_num = self._get_adjacent_exon(
883
+ tx_exons_genomic_coords=tx_exons,
884
+ strand=strand,
885
+ start=genomic_pos if is_seg_start else None,
886
+ end=genomic_pos if not is_seg_start else None,
887
+ )
888
+ else:
889
+ exon_data = await self.uta_db.get_tx_exon_aln_v_data(
890
+ transcript,
891
+ genomic_pos,
892
+ genomic_pos,
893
+ alt_ac=genomic_ac,
894
+ use_tx_pos=False,
895
+ )
896
+ exon_num = exon_data[0].ord
925
897
 
926
- if not gene:
927
- return GenomicTxSeg(
928
- errors=[
929
- f"Unable to get gene given {genomic_ac} on position {genomic_pos}"
930
- ]
931
- )
898
+ offset = self._get_exon_offset(
899
+ genomic_pos=genomic_pos,
900
+ exon_boundary=tx_exons[exon_num].alt_start_i
901
+ if use_alt_start_i
902
+ else tx_exons[exon_num].alt_end_i,
903
+ strand=strand,
904
+ )
932
905
 
933
- return await self._get_tx_seg_genomic_metadata(
934
- genomic_ac, genomic_pos, is_seg_start, gene, tx_ac=transcript
906
+ genomic_location, err_msg = self._get_vrs_seq_loc(
907
+ genomic_ac, genomic_pos, is_seg_start, strand
935
908
  )
909
+ if err_msg:
910
+ return GenomicTxSeg(errors=[err_msg])
936
911
 
937
- async def _get_grch38_ac_pos(
938
- self, genomic_ac: str, genomic_pos: int, grch38_ac: str | None = None
939
- ) -> tuple[str | None, int | None, str | None]:
912
+ return GenomicTxSeg(
913
+ gene=gene,
914
+ genomic_ac=genomic_ac,
915
+ tx_ac=transcript,
916
+ seg=TxSegment(
917
+ exon_ord=exon_num,
918
+ offset=offset,
919
+ genomic_location=genomic_location,
920
+ ),
921
+ )
922
+
923
+ async def _get_grch38_pos(
924
+ self,
925
+ genomic_ac: str,
926
+ genomic_pos: int,
927
+ chromosome: str | None = None,
928
+ ) -> int | None:
940
929
  """Get GRCh38 genomic representation for accession and position
941
930
 
942
- :param genomic_ac: RefSeq genomic accession (GRCh37 or GRCh38 assembly)
943
- :param genomic_pos: Genomic position on ``genomic_ac``
944
- :param grch38_ac: A valid GRCh38 genomic accession for ``genomic_ac``. If not
945
- provided, will attempt to retrieve associated GRCh38 accession from UTA.
946
- :return: Tuple containing GRCh38 accession, GRCh38 position, and error message
947
- if unable to get GRCh38 representation
931
+ :param genomic_pos: A genomic coordinate in GRCh37
932
+ :param genomic_ac: The genomic accession in GRCh38
933
+ :param chromosome: The chromosome that genomic_pos occurs on
934
+ :return: The genomic coordinate in GRCh38
948
935
  """
949
- if not grch38_ac:
950
- grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
951
- if not grch38_ac:
952
- return None, None, f"Unrecognized genomic accession: {genomic_ac}."
953
-
954
- grch38_ac = grch38_ac[0]
955
-
956
- if grch38_ac != genomic_ac:
957
- # Ensure genomic_ac is GRCh37
936
+ if not chromosome:
958
937
  chromosome, _ = self.seqrepo_access.translate_identifier(
959
- genomic_ac, Assembly.GRCH37.value
938
+ genomic_ac, target_namespaces=Assembly.GRCH38.value
960
939
  )
961
- if not chromosome:
962
- _logger.warning(
963
- "SeqRepo could not find associated %s assembly for genomic accession %s.",
964
- Assembly.GRCH37.value,
965
- genomic_ac,
966
- )
967
- return (
968
- None,
969
- None,
970
- f"`genomic_ac` must use {Assembly.GRCH37.value} or {Assembly.GRCH38.value} assembly.",
971
- )
972
-
973
940
  chromosome = chromosome[-1].split(":")[-1]
974
- liftover_data = self.liftover.get_liftover(
975
- chromosome, genomic_pos, Assembly.GRCH38
976
- )
977
- if liftover_data is None:
978
- return (
979
- None,
980
- None,
981
- f"Lifting over {genomic_pos} on {genomic_ac} from {Assembly.GRCH37.value} to {Assembly.GRCH38.value} was unsuccessful.",
982
- )
983
-
984
- genomic_pos = liftover_data[1]
985
- genomic_ac = grch38_ac
986
-
987
- return genomic_ac, genomic_pos, None
941
+ liftover_data = self.liftover.get_liftover(
942
+ chromosome, genomic_pos, Assembly.GRCH38
943
+ )
944
+ return liftover_data[1] if liftover_data else None
988
945
 
989
- async def _get_genomic_ac_gene(
946
+ async def _validate_genomic_breakpoint(
990
947
  self,
991
948
  pos: int,
992
949
  genomic_ac: str,
993
- ) -> tuple[str | None, str | None]:
994
- """Get gene given a genomic accession and position.
995
-
996
- If multiple genes are found for a given ``pos`` and ``genomic_ac``, only one
997
- gene will be returned.
950
+ tx_ac: str,
951
+ ) -> bool:
952
+ """Validate that a genomic coordinate falls within the first and last exon
953
+ for a transcript on a given accession
998
954
 
999
955
  :param pos: Genomic position on ``genomic_ac``
1000
956
  :param genomic_ac: RefSeq genomic accession, e.g. ``"NC_000007.14"``
1001
- :return: HGNC gene symbol associated to genomic accession and position and
1002
- warning
957
+ :param tx_ac: A transcript accession
958
+ :return: ``True`` if the coordinate falls within the first and last exon
959
+ for the transcript, ``False`` if not
1003
960
  """
1004
961
  query = f"""
1005
- SELECT DISTINCT hgnc
1006
- FROM {self.uta_db.schema}.tx_exon_aln_v
1007
- WHERE alt_ac = '{genomic_ac}'
1008
- AND alt_aln_method = 'splign'
1009
- AND {pos} BETWEEN alt_start_i AND alt_end_i
1010
- ORDER BY hgnc
1011
- LIMIT 1;
962
+ WITH tx_boundaries AS (
963
+ SELECT
964
+ MIN(alt_start_i) AS min_start,
965
+ MAX(alt_end_i) AS max_end
966
+ FROM {self.uta_db.schema}.tx_exon_aln_v
967
+ WHERE tx_ac = '{tx_ac}'
968
+ AND alt_ac = '{genomic_ac}'
969
+ )
970
+ SELECT * FROM tx_boundaries
971
+ WHERE {pos} between tx_boundaries.min_start and tx_boundaries.max_end
1012
972
  """ # noqa: S608
1013
973
  results = await self.uta_db.execute_query(query)
1014
- if not results:
1015
- return None, f"No gene(s) found given {genomic_ac} on position {pos}"
1016
-
1017
- return results[0]["hgnc"], None
974
+ return bool(results)
1018
975
 
1019
976
  async def _get_tx_ac_gene(
1020
977
  self,
@@ -1042,102 +999,6 @@ class ExonGenomicCoordsMapper:
1042
999
 
1043
1000
  return results[0]["hgnc"], None
1044
1001
 
1045
- async def _get_tx_seg_genomic_metadata(
1046
- self,
1047
- genomic_ac: str,
1048
- genomic_pos: int,
1049
- is_seg_start: bool,
1050
- gene: str,
1051
- tx_ac: str | None,
1052
- ) -> GenomicTxSeg:
1053
- """Get transcript segment data and associated genomic metadata.
1054
-
1055
- Will liftover to GRCh38 assembly. If liftover is unsuccessful, will return
1056
- errors.
1057
-
1058
- If ``tx_ac`` is not provided, will attempt to retrieve MANE transcript.
1059
-
1060
- :param genomic_ac: Genomic RefSeq accession
1061
- :param genomic_pos: Genomic position where the transcript segment occurs
1062
- :param is_seg_start: Whether or not ``genomic_pos`` represents the start position.
1063
- :param gene: Valid, case-sensitive HGNC gene symbol
1064
- :param tx_ac: Transcript RefSeq accession. If not provided, will use MANE
1065
- transcript
1066
- :return: Transcript segment data and associated genomic metadata
1067
- """
1068
- if tx_ac:
1069
- # We should always try to liftover
1070
- grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
1071
- if not grch38_ac:
1072
- return GenomicTxSeg(errors=[f"Invalid genomic accession: {genomic_ac}"])
1073
- grch38_ac = grch38_ac[0]
1074
- else:
1075
- mane_data = self.mane_transcript_mappings.get_gene_mane_data(gene)
1076
- if not mane_data:
1077
- err_msg = f"Unable to find mane data for {genomic_ac} with position {genomic_pos}"
1078
- if gene:
1079
- err_msg += f" on gene {gene}"
1080
- _logger.warning(err_msg)
1081
- return GenomicTxSeg(errors=[err_msg])
1082
-
1083
- mane_data = mane_data[0]
1084
- tx_ac = mane_data["RefSeq_nuc"]
1085
- grch38_ac = mane_data["GRCh38_chr"]
1086
-
1087
- # Always liftover to GRCh38
1088
- genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos(
1089
- genomic_ac, genomic_pos, grch38_ac=grch38_ac
1090
- )
1091
- if err_msg:
1092
- return GenomicTxSeg(errors=[err_msg])
1093
-
1094
- tx_exons = await self._get_all_exon_coords(tx_ac, genomic_ac=grch38_ac)
1095
- if not tx_exons:
1096
- return GenomicTxSeg(errors=[f"No exons found given {tx_ac}"])
1097
-
1098
- tx_exon_aln_data = await self.uta_db.get_tx_exon_aln_v_data(
1099
- tx_ac,
1100
- genomic_pos,
1101
- genomic_pos,
1102
- alt_ac=genomic_ac,
1103
- use_tx_pos=False,
1104
- )
1105
- if len(tx_exon_aln_data) != 1:
1106
- return GenomicTxSeg(
1107
- errors=[
1108
- f"Must find exactly one row for genomic data, but found: {len(tx_exon_aln_data)}"
1109
- ]
1110
- )
1111
-
1112
- tx_exon_aln_data = tx_exon_aln_data[0]
1113
-
1114
- offset = self._get_exon_offset(
1115
- start_i=tx_exon_aln_data.alt_start_i,
1116
- end_i=tx_exon_aln_data.alt_end_i,
1117
- strand=Strand(tx_exon_aln_data.alt_strand),
1118
- use_start_i=False, # This doesn't impact anything since we're on the exon
1119
- is_in_exon=True,
1120
- start=genomic_pos if is_seg_start else None,
1121
- end=genomic_pos if not is_seg_start else None,
1122
- )
1123
-
1124
- genomic_location, err_msg = self._get_vrs_seq_loc(
1125
- genomic_ac, genomic_pos, is_seg_start, tx_exon_aln_data.alt_strand
1126
- )
1127
- if err_msg:
1128
- return GenomicTxSeg(errors=[err_msg])
1129
-
1130
- return GenomicTxSeg(
1131
- gene=tx_exon_aln_data.hgnc,
1132
- genomic_ac=genomic_ac,
1133
- tx_ac=tx_exon_aln_data.tx_ac,
1134
- seg=TxSegment(
1135
- exon_ord=tx_exon_aln_data.ord,
1136
- offset=offset,
1137
- genomic_location=genomic_location,
1138
- ),
1139
- )
1140
-
1141
1002
  @staticmethod
1142
1003
  def _is_exonic_breakpoint(pos: int, tx_genomic_coords: list[_ExonCoord]) -> bool:
1143
1004
  """Check if a breakpoint occurs on an exon
@@ -1150,6 +1011,24 @@ class ExonGenomicCoordsMapper:
1150
1011
  exon.alt_start_i <= pos <= exon.alt_end_i for exon in tx_genomic_coords
1151
1012
  )
1152
1013
 
1014
+ @staticmethod
1015
+ def _use_alt_start_i(is_seg_start: bool, strand: Strand) -> bool:
1016
+ """Determine whether to use alt_start_i or alt_end_i from UTA when computing
1017
+ exon offset
1018
+
1019
+ :param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment starts.
1020
+ ``False`` if ``genomic_pos`` is where the transcript segment ends.
1021
+ :param strand: The transcribed strand
1022
+ :return: ``True`` if alt_start_i should be used, ``False`` if alt_end_i should
1023
+ be used
1024
+ """
1025
+ return (
1026
+ is_seg_start
1027
+ and strand == Strand.POSITIVE
1028
+ or not is_seg_start
1029
+ and strand == Strand.NEGATIVE
1030
+ )
1031
+
1153
1032
  @staticmethod
1154
1033
  def _get_adjacent_exon(
1155
1034
  tx_exons_genomic_coords: list[_ExonCoord],
@@ -1210,38 +1089,22 @@ class ExonGenomicCoordsMapper:
1210
1089
 
1211
1090
  @staticmethod
1212
1091
  def _get_exon_offset(
1213
- start_i: int,
1214
- end_i: int,
1092
+ genomic_pos: int,
1093
+ exon_boundary: int,
1215
1094
  strand: Strand,
1216
- use_start_i: bool = True,
1217
- is_in_exon: bool = True,
1218
- start: int | None = None,
1219
- end: int | None = None,
1220
1095
  ) -> int:
1221
1096
  """Compute offset from exon start or end index
1222
1097
 
1223
- :param start_i: Exon start index (inter-residue)
1224
- :param end_i: Exon end index (inter-residue)
1225
- :param strand: Strand
1226
- :param use_start_i: Whether or not ``start_i`` should be used to compute the
1227
- offset, defaults to ``True``. This is only used when ``is_in_exon`` is
1228
- ``False``.
1229
- :param is_in_exon: Whether or not the position occurs in an exon, defaults to
1230
- ``True``
1231
- :param start: Provided start position, defaults to ``None``. Must provide
1232
- ``start`` or ``end``, not both.
1233
- :param end: Provided end position, defaults to ``None``. Must provide ``start``
1234
- or ``end``, not both
1098
+ :param genomic_pos: The supplied genomic position. This can represent, for
1099
+ example, a fusion junction breakpoint. This position is represented using
1100
+ inter-residue coordinates
1101
+ :param exon_boundary: The genomic position for the exon boundary that the offset
1102
+ is being computed against
1103
+ :param strand: The transcribed strand
1235
1104
  :return: Offset from exon start or end index
1236
1105
  """
1237
- if is_in_exon:
1238
- if start is not None:
1239
- offset = start - start_i if strand == Strand.POSITIVE else end_i - start
1240
- else:
1241
- offset = end - end_i if strand == Strand.POSITIVE else start_i - end
1242
- else:
1243
- if strand == Strand.POSITIVE:
1244
- offset = start - start_i if use_start_i else end - end_i
1245
- else:
1246
- offset = start_i - end if use_start_i else end_i - start
1247
- return offset
1106
+ return (
1107
+ genomic_pos - exon_boundary
1108
+ if strand == Strand.POSITIVE
1109
+ else (genomic_pos - exon_boundary) * -1
1110
+ )
@@ -378,6 +378,38 @@ class UtaDatabase:
378
378
  result = await self.execute_query(query)
379
379
  return result[0][0]
380
380
 
381
+ async def gene_exists(self, gene: str) -> bool:
382
+ """Return whether or not a gene symbol exists in UTA gene table
383
+
384
+ :param gene: Gene symbol
385
+ :return: ``True`` if gene symbol exists in UTA, ``False`` if not
386
+ """
387
+ query = f"""
388
+ SELECT EXISTS(
389
+ SELECT hgnc
390
+ FROM {self.schema}.gene
391
+ WHERE hgnc = '{gene}'
392
+ );
393
+ """ # noqa: S608
394
+ result = await self.execute_query(query)
395
+ return result[0][0]
396
+
397
+ async def transcript_exists(self, transcript: str) -> bool:
398
+ """Return whether or not a transcript exists in the UTA tx_exon_aln_v table
399
+
400
+ :param transcript: A transcript accession
401
+ :return: ``True`` if transcript exists in UTA, ``False`` if not
402
+ """
403
+ query = f"""
404
+ SELECT EXISTS(
405
+ SELECT tx_ac
406
+ FROM {self.schema}.tx_exon_aln_v
407
+ WHERE tx_ac = '{transcript}'
408
+ );
409
+ """ # noqa: S608
410
+ result = await self.execute_query(query)
411
+ return result[0][0]
412
+
381
413
  async def get_ac_descr(self, ac: str) -> str | None:
382
414
  """Return accession description. This is typically available only for accessions
383
415
  from older (pre-GRCh38) builds.
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: cool_seq_tool
3
- Version: 0.9.1
3
+ Version: 0.11.0
4
4
  Summary: Common Operation on Lots of Sequences Tool
5
5
  Author: Kori Kuzma, James Stevenson, Katie Stahl, Alex Wagner
6
6
  License: MIT License
@@ -6,7 +6,7 @@ cool_seq_tool/handlers/__init__.py,sha256=KalQ46vX1MO4SJz2SlspKoIRy1n3c3Vp1t4Y2p
6
6
  cool_seq_tool/handlers/seqrepo_access.py,sha256=Jd19jbdUvPRPn_XWozL67ph-nSIxpb4_UUimapDrsm4,9162
7
7
  cool_seq_tool/mappers/__init__.py,sha256=O0JRxNFk8nWxD4v5ij47xelhvfVLdEXS43l2tzRuiUE,305
8
8
  cool_seq_tool/mappers/alignment.py,sha256=nV6PS3mhkQ2MD1GcpNBujBOqd3AKxYSYA9BCusFOa1o,9636
9
- cool_seq_tool/mappers/exon_genomic_coords.py,sha256=XYHWYHL9PcBIKHB_EsN1YKwmhP-KLrGyZv8yH_7huuo,49533
9
+ cool_seq_tool/mappers/exon_genomic_coords.py,sha256=ORYjBVaX1HO6ln0gRJyRKxUCjZrBDi4JfYQEYebxIAc,43824
10
10
  cool_seq_tool/mappers/liftover.py,sha256=lltx9zxfkrb5PHtJlKp3a39JCwPP4e0Zft-mQc1jXL8,3367
11
11
  cool_seq_tool/mappers/mane_transcript.py,sha256=C9eKEj8qhVg878oUhBKPYAZS7gpLM5aaQ0HhSkUg-2g,54365
12
12
  cool_seq_tool/resources/__init__.py,sha256=VwUC8YaucTS6SmRirToulZTF6CuvuLQRSxFfSfAovCc,77
@@ -16,9 +16,9 @@ cool_seq_tool/resources/transcript_mapping.tsv,sha256=AO3luYQAbFiCoRgiiPXotakb5p
16
16
  cool_seq_tool/sources/__init__.py,sha256=51QiymeptF7AeVGgV-tW_9f4pIUr0xtYbyzpvHOCneM,304
17
17
  cool_seq_tool/sources/mane_transcript_mappings.py,sha256=Q6J57O2lLWXlgKT0zq3BIwkwFawySnORHOX-UxzfyDE,5399
18
18
  cool_seq_tool/sources/transcript_mappings.py,sha256=903RKTMBO2rbKh6iTQ1BEWnY4C7saBFMPw2_4ATuudg,10054
19
- cool_seq_tool/sources/uta_database.py,sha256=gc5wsKOIhvzhwFmPmqOY0hhaVfRkRSzYNa9tpBt81_U,35017
20
- cool_seq_tool-0.9.1.dist-info/LICENSE,sha256=IpqC9A-tZW7XXXvCS8c4AVINqkmpxiVA-34Qe3CZSjo,1072
21
- cool_seq_tool-0.9.1.dist-info/METADATA,sha256=R9iVaov_Ktbpg3Qq4ey2UqZI0CSGEskXOXJlyhcKI5c,6556
22
- cool_seq_tool-0.9.1.dist-info/WHEEL,sha256=A3WOREP4zgxI0fKrHUG8DC8013e3dK3n7a6HDbcEIwE,91
23
- cool_seq_tool-0.9.1.dist-info/top_level.txt,sha256=cGuxdN6p3y16jQf6hCwWhE4OptwUeZPm_PNJlPb3b0k,14
24
- cool_seq_tool-0.9.1.dist-info/RECORD,,
19
+ cool_seq_tool/sources/uta_database.py,sha256=s7BkFplD_b2AmvXq8vZSCiBuZLy8RlxAqNyf-6QtR8w,36112
20
+ cool_seq_tool-0.11.0.dist-info/LICENSE,sha256=IpqC9A-tZW7XXXvCS8c4AVINqkmpxiVA-34Qe3CZSjo,1072
21
+ cool_seq_tool-0.11.0.dist-info/METADATA,sha256=VcP6BvVyQ1YVB2u2XsZbEVd9DYYr-ZKcHadIt3ACsBY,6557
22
+ cool_seq_tool-0.11.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
23
+ cool_seq_tool-0.11.0.dist-info/top_level.txt,sha256=cGuxdN6p3y16jQf6hCwWhE4OptwUeZPm_PNJlPb3b0k,14
24
+ cool_seq_tool-0.11.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.7.0)
2
+ Generator: setuptools (75.8.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5