cool-seq-tool 0.9.1__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cool_seq_tool/mappers/exon_genomic_coords.py +222 -359
- cool_seq_tool/sources/uta_database.py +32 -0
- {cool_seq_tool-0.9.1.dist-info → cool_seq_tool-0.11.0.dist-info}/METADATA +2 -2
- {cool_seq_tool-0.9.1.dist-info → cool_seq_tool-0.11.0.dist-info}/RECORD +7 -7
- {cool_seq_tool-0.9.1.dist-info → cool_seq_tool-0.11.0.dist-info}/WHEEL +1 -1
- {cool_seq_tool-0.9.1.dist-info → cool_seq_tool-0.11.0.dist-info}/LICENSE +0 -0
- {cool_seq_tool-0.9.1.dist-info → cool_seq_tool-0.11.0.dist-info}/top_level.txt +0 -0
@@ -10,6 +10,7 @@ from cool_seq_tool.mappers.liftover import LiftOver
|
|
10
10
|
from cool_seq_tool.schemas import (
|
11
11
|
Assembly,
|
12
12
|
BaseModelForbidExtra,
|
13
|
+
CoordinateType,
|
13
14
|
ServiceMeta,
|
14
15
|
Strand,
|
15
16
|
)
|
@@ -410,15 +411,14 @@ class ExonGenomicCoordsMapper:
|
|
410
411
|
seg_start_genomic: int | None = None,
|
411
412
|
seg_end_genomic: int | None = None,
|
412
413
|
transcript: str | None = None,
|
413
|
-
get_nearest_transcript_junction: bool = False,
|
414
414
|
gene: str | None = None,
|
415
|
+
coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE,
|
416
|
+
starting_assembly: Assembly = Assembly.GRCH38,
|
415
417
|
) -> GenomicTxSegService:
|
416
418
|
"""Get transcript segment data for genomic data, lifted over to GRCh38.
|
417
419
|
|
418
420
|
If liftover to GRCh38 is unsuccessful, will return errors.
|
419
421
|
|
420
|
-
Must provide inter-residue coordinates.
|
421
|
-
|
422
422
|
MANE Transcript data will be returned if and only if ``transcript`` is not
|
423
423
|
supplied. ``gene`` must be given in order to retrieve MANE Transcript data.
|
424
424
|
|
@@ -442,24 +442,21 @@ class ExonGenomicCoordsMapper:
|
|
442
442
|
used.
|
443
443
|
:param genomic_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided,
|
444
444
|
must provide ``chromosome. If ``chromosome`` is also provided,
|
445
|
-
``genomic_ac`` will be used.
|
445
|
+
``genomic_ac`` will be used. If the genomic accession is from GRCh37, it
|
446
|
+
will be lifted over to GRCh38 and the original accession version will be
|
447
|
+
ignored
|
446
448
|
:param seg_start_genomic: Genomic position where the transcript segment starts
|
447
449
|
:param seg_end_genomic: Genomic position where the transcript segment ends
|
448
450
|
:param transcript: The transcript to use. If this is not given, we will try the
|
449
451
|
following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining
|
450
452
|
Compatible Transcript. See the :ref:`Transcript Selection policy <transcript_selection_policy>`
|
451
453
|
page.
|
452
|
-
:param get_nearest_transcript_junction: If ``True``, this will return the
|
453
|
-
adjacent exon if the position specified by``seg_start_genomic`` or
|
454
|
-
``seg_end_genomic`` does not occur on an exon. For the positive strand, adjacent
|
455
|
-
is defined as the exon preceding the breakpoint for the 5' end and the exon
|
456
|
-
following the breakpoint for the 3' end. For the negative strand, adjacent
|
457
|
-
is defined as the exon following the breakpoint for the 5' end and the exon
|
458
|
-
preceding the breakpoint for the 3' end.
|
459
454
|
:param gene: A valid, case-sensitive HGNC symbol. Must be given if no ``transcript``
|
460
455
|
value is provided.
|
461
456
|
:param coordinate_type: Coordinate type for ``seg_start_genomic`` and
|
462
|
-
``seg_end_genomic
|
457
|
+
``seg_end_genomic``. Expects inter-residue coordinates by default
|
458
|
+
:param starting_assembly: The assembly that the supplied coordinate comes from. Set to
|
459
|
+
GRCh38 by default. Will attempt to liftover if starting assembly is GRCh37
|
463
460
|
:return: Genomic data (inter-residue coordinates)
|
464
461
|
"""
|
465
462
|
errors = []
|
@@ -483,8 +480,9 @@ class ExonGenomicCoordsMapper:
|
|
483
480
|
genomic_ac=genomic_ac,
|
484
481
|
transcript=transcript,
|
485
482
|
gene=gene,
|
486
|
-
get_nearest_transcript_junction=get_nearest_transcript_junction,
|
487
483
|
is_seg_start=True,
|
484
|
+
coordinate_type=coordinate_type,
|
485
|
+
starting_assembly=starting_assembly,
|
488
486
|
)
|
489
487
|
if start_tx_seg_data.errors:
|
490
488
|
return _return_service_errors(start_tx_seg_data.errors)
|
@@ -503,8 +501,9 @@ class ExonGenomicCoordsMapper:
|
|
503
501
|
genomic_ac=genomic_ac,
|
504
502
|
transcript=transcript,
|
505
503
|
gene=gene,
|
506
|
-
get_nearest_transcript_junction=get_nearest_transcript_junction,
|
507
504
|
is_seg_start=False,
|
505
|
+
coordinate_type=coordinate_type,
|
506
|
+
starting_assembly=starting_assembly,
|
508
507
|
)
|
509
508
|
if end_tx_seg_data.errors:
|
510
509
|
return _return_service_errors(end_tx_seg_data.errors)
|
@@ -553,7 +552,7 @@ class ExonGenomicCoordsMapper:
|
|
553
552
|
"""
|
554
553
|
tx_exons = await self._get_all_exon_coords(tx_ac, genomic_ac=genomic_ac)
|
555
554
|
if not tx_exons:
|
556
|
-
return None, None, [f"
|
555
|
+
return None, None, [f"Transcript does not exist in UTA: {tx_ac}"]
|
557
556
|
|
558
557
|
errors = []
|
559
558
|
start_end_exons = []
|
@@ -733,288 +732,246 @@ class ExonGenomicCoordsMapper:
|
|
733
732
|
genomic_ac: str | None = None,
|
734
733
|
transcript: str | None = None,
|
735
734
|
gene: str | None = None,
|
736
|
-
get_nearest_transcript_junction: bool = False,
|
737
735
|
is_seg_start: bool = True,
|
736
|
+
coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE,
|
737
|
+
starting_assembly: Assembly = Assembly.GRCH38,
|
738
738
|
) -> GenomicTxSeg:
|
739
739
|
"""Given genomic data, generate a boundary for a transcript segment.
|
740
740
|
|
741
741
|
Will liftover to GRCh38 assembly. If liftover is unsuccessful, will return
|
742
742
|
errors.
|
743
743
|
|
744
|
+
Either an HGNC gene symbol or transcript accession must be provided to this
|
745
|
+
method
|
746
|
+
|
744
747
|
:param genomic_pos: Genomic position where the transcript segment starts or ends
|
745
|
-
(inter-residue based)
|
746
748
|
:param chromosome: Chromosome. Must give chromosome without a prefix
|
747
749
|
(i.e. ``1`` or ``X``). If not provided, must provide ``genomic_ac``. If
|
748
750
|
position maps to both GRCh37 and GRCh38, GRCh38 assembly will be used.
|
749
751
|
If ``genomic_ac`` is also provided, ``genomic_ac`` will be used.
|
750
752
|
:param genomic_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided,
|
751
753
|
must provide ``chromosome. If ``chromosome`` is also provided, ``genomic_ac``
|
752
|
-
will be used.
|
754
|
+
will be used. If the genomic accession is from GRCh37, it will be lifted
|
755
|
+
over to GRCh38 and the original accession version will be ignored
|
753
756
|
:param transcript: The transcript to use. If this is not given, we will try the
|
754
757
|
following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining
|
755
758
|
Compatible Transcript
|
756
759
|
:param gene: Valid, case-sensitive HGNC gene symbol
|
757
|
-
:param get_nearest_transcript_junction: If ``True``, this will return the
|
758
|
-
adjacent exon if the position specified by``seg_start_genomic`` or
|
759
|
-
``seg_end_genomic`` does not occur on an exon. For the positive strand, adjacent
|
760
|
-
is defined as the exon preceding the breakpoint for the 5' end and the exon
|
761
|
-
following the breakpoint for the 3' end. For the negative strand, adjacent
|
762
|
-
is defined as the exon following the breakpoint for the 5' end and the exon
|
763
|
-
preceding the breakpoint for the 3' end.
|
764
760
|
:param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment starts.
|
765
761
|
``False`` if ``genomic_pos`` is where the transcript segment ends.
|
762
|
+
:param coordinate_type: Coordinate type for ``seg_start_genomic`` and
|
763
|
+
``seg_end_genomic``. Expects inter-residue coordinates by default
|
764
|
+
:param starting_assembly: The assembly that the supplied coordinate comes from. Set to
|
765
|
+
GRCh38 by default. Will attempt to liftover if starting assembly is GRCh37
|
766
766
|
:return: Data for a transcript segment boundary (inter-residue coordinates)
|
767
767
|
"""
|
768
768
|
params = {key: None for key in GenomicTxSeg.model_fields}
|
769
769
|
|
770
|
-
|
771
|
-
|
772
|
-
|
773
|
-
|
774
|
-
|
775
|
-
]
|
776
|
-
)
|
777
|
-
|
778
|
-
if not genomic_ac:
|
779
|
-
genomic_acs, err_msg = self.seqrepo_access.chromosome_to_acs(chromosome)
|
780
|
-
|
781
|
-
if not genomic_acs:
|
782
|
-
return GenomicTxSeg(
|
783
|
-
errors=[err_msg],
|
784
|
-
)
|
785
|
-
genomic_ac = genomic_acs[0]
|
786
|
-
|
787
|
-
# Always liftover to GRCh38
|
788
|
-
genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos(
|
789
|
-
genomic_ac, genomic_pos
|
790
|
-
)
|
791
|
-
if err_msg:
|
792
|
-
return GenomicTxSeg(errors=[err_msg])
|
770
|
+
# Validate inputs exist in UTA
|
771
|
+
if gene:
|
772
|
+
gene_validation = await self.uta_db.gene_exists(gene)
|
773
|
+
if not gene_validation:
|
774
|
+
return GenomicTxSeg(errors=[f"Gene does not exist in UTA: {gene}"])
|
793
775
|
|
794
|
-
|
795
|
-
|
796
|
-
|
797
|
-
|
776
|
+
if transcript:
|
777
|
+
transcript_validation = await self.uta_db.transcript_exists(transcript)
|
778
|
+
if not transcript_validation:
|
779
|
+
return GenomicTxSeg(
|
780
|
+
errors=[f"Transcript does not exist in UTA: {transcript}"]
|
798
781
|
)
|
799
782
|
|
800
|
-
|
801
|
-
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
806
|
-
|
807
|
-
)
|
808
|
-
|
809
|
-
if not results.is_empty():
|
810
|
-
transcript = results[0]["tx_ac"][0]
|
811
|
-
else:
|
812
|
-
# Run if gene is for a noncoding transcript
|
813
|
-
query = f"""
|
814
|
-
SELECT DISTINCT tx_ac
|
815
|
-
FROM {self.uta_db.schema}.tx_exon_aln_v
|
816
|
-
WHERE hgnc = '{gene}'
|
817
|
-
AND alt_ac = '{genomic_ac}'
|
818
|
-
""" # noqa: S608
|
819
|
-
result = await self.uta_db.execute_query(query)
|
820
|
-
|
821
|
-
if result:
|
822
|
-
transcript = result[0]["tx_ac"]
|
823
|
-
else:
|
824
|
-
return GenomicTxSeg(
|
825
|
-
errors=[
|
826
|
-
f"Could not find a transcript for {gene} on {genomic_ac}"
|
827
|
-
]
|
828
|
-
)
|
829
|
-
|
830
|
-
tx_exons = await self._get_all_exon_coords(
|
831
|
-
tx_ac=transcript, genomic_ac=genomic_ac
|
832
|
-
)
|
833
|
-
if not tx_exons:
|
834
|
-
return GenomicTxSeg(errors=[f"No exons found given {transcript}"])
|
835
|
-
|
836
|
-
strand = Strand(tx_exons[0].alt_strand)
|
837
|
-
params["strand"] = strand
|
838
|
-
|
839
|
-
# Check if breakpoint occurs on an exon.
|
840
|
-
# If not, determine the adjacent exon given the selected transcript
|
841
|
-
if not self._is_exonic_breakpoint(genomic_pos, tx_exons):
|
842
|
-
exon_num = self._get_adjacent_exon(
|
843
|
-
tx_exons_genomic_coords=tx_exons,
|
844
|
-
strand=strand,
|
845
|
-
start=genomic_pos if is_seg_start else None,
|
846
|
-
end=genomic_pos if not is_seg_start else None,
|
783
|
+
if genomic_ac:
|
784
|
+
grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
|
785
|
+
if grch38_ac:
|
786
|
+
genomic_ac = grch38_ac[0]
|
787
|
+
else:
|
788
|
+
return GenomicTxSeg(
|
789
|
+
errors=[f"Genomic accession does not exist in UTA: {genomic_ac}"]
|
847
790
|
)
|
791
|
+
else:
|
792
|
+
genomic_acs, err_msg = self.seqrepo_access.chromosome_to_acs(chromosome)
|
848
793
|
|
849
|
-
|
850
|
-
|
851
|
-
|
852
|
-
strand=strand,
|
853
|
-
use_start_i=strand == Strand.POSITIVE
|
854
|
-
if is_seg_start
|
855
|
-
else strand != Strand.POSITIVE,
|
856
|
-
is_in_exon=False,
|
857
|
-
start=genomic_pos if is_seg_start else None,
|
858
|
-
end=genomic_pos if not is_seg_start else None,
|
794
|
+
if not genomic_acs:
|
795
|
+
return GenomicTxSeg(
|
796
|
+
errors=[err_msg],
|
859
797
|
)
|
798
|
+
genomic_ac = genomic_acs[0]
|
860
799
|
|
861
|
-
|
862
|
-
|
800
|
+
# Liftover to GRCh38 if the provided assembly is GRCh37
|
801
|
+
if starting_assembly == Assembly.GRCH37:
|
802
|
+
genomic_pos = await self._get_grch38_pos(
|
803
|
+
genomic_ac, genomic_pos, chromosome=chromosome if chromosome else None
|
804
|
+
)
|
805
|
+
if not genomic_pos:
|
806
|
+
return GenomicTxSeg(
|
807
|
+
errors=[
|
808
|
+
f"Lifting over {genomic_pos} on {genomic_ac} from {Assembly.GRCH37.value} to {Assembly.GRCH38.value} was unsuccessful."
|
809
|
+
]
|
863
810
|
)
|
864
|
-
if err_msg:
|
865
|
-
return GenomicTxSeg(errors=[err_msg])
|
866
811
|
|
867
|
-
|
868
|
-
|
869
|
-
|
870
|
-
if err_msg:
|
871
|
-
return GenomicTxSeg(errors=[err_msg])
|
872
|
-
gene = _gene
|
812
|
+
# Select a transcript if not provided
|
813
|
+
if not transcript:
|
814
|
+
mane_transcripts = self.mane_transcript_mappings.get_gene_mane_data(gene)
|
873
815
|
|
874
|
-
|
875
|
-
|
876
|
-
|
877
|
-
|
878
|
-
|
879
|
-
|
880
|
-
|
881
|
-
genomic_location=genomic_location,
|
882
|
-
),
|
816
|
+
if mane_transcripts:
|
817
|
+
transcript = mane_transcripts[0]["RefSeq_nuc"]
|
818
|
+
else:
|
819
|
+
# Attempt to find a coding transcript if a MANE transcript
|
820
|
+
# cannot be found
|
821
|
+
results = await self.uta_db.get_transcripts(
|
822
|
+
gene=gene, alt_ac=genomic_ac
|
883
823
|
)
|
884
824
|
|
885
|
-
|
886
|
-
|
887
|
-
|
825
|
+
if not results.is_empty():
|
826
|
+
transcript = results[0]["tx_ac"][0]
|
827
|
+
else:
|
828
|
+
# Run if gene is for a noncoding transcript
|
829
|
+
query = f"""
|
830
|
+
SELECT DISTINCT tx_ac
|
831
|
+
FROM {self.uta_db.schema}.tx_exon_aln_v
|
832
|
+
WHERE hgnc = '{gene}'
|
833
|
+
AND alt_ac = '{genomic_ac}'
|
834
|
+
""" # noqa: S608
|
835
|
+
result = await self.uta_db.execute_query(query)
|
836
|
+
|
837
|
+
if result:
|
838
|
+
transcript = result[0]["tx_ac"]
|
839
|
+
else:
|
840
|
+
return GenomicTxSeg(
|
841
|
+
errors=[
|
842
|
+
f"Could not find a transcript for {gene} on {genomic_ac}"
|
843
|
+
]
|
844
|
+
)
|
845
|
+
# gene is not required to liftover coordinates if tx_ac and genomic_ac are given, but we should set the associated gene
|
846
|
+
if not gene:
|
847
|
+
_gene, err_msg = await self._get_tx_ac_gene(transcript)
|
888
848
|
if err_msg:
|
889
849
|
return GenomicTxSeg(errors=[err_msg])
|
850
|
+
gene = _gene
|
890
851
|
|
891
|
-
|
892
|
-
|
893
|
-
|
894
|
-
|
852
|
+
tx_exons = await self._get_all_exon_coords(
|
853
|
+
tx_ac=transcript, genomic_ac=genomic_ac
|
854
|
+
)
|
855
|
+
if not tx_exons:
|
856
|
+
return GenomicTxSeg(
|
857
|
+
errors=[f"No exons found given transcript: {transcript}"]
|
858
|
+
)
|
895
859
|
|
896
|
-
|
897
|
-
|
898
|
-
|
899
|
-
|
900
|
-
|
901
|
-
|
902
|
-
|
903
|
-
if err_msg:
|
904
|
-
return GenomicTxSeg(errors=[err_msg])
|
905
|
-
_genomic_ac = _genomic_acs[0].split(":")[-1]
|
860
|
+
strand = Strand(tx_exons[0].alt_strand)
|
861
|
+
params["strand"] = strand
|
862
|
+
use_alt_start_i = self._use_alt_start_i(
|
863
|
+
is_seg_start=is_seg_start, strand=strand
|
864
|
+
)
|
865
|
+
if use_alt_start_i and coordinate_type == CoordinateType.RESIDUE:
|
866
|
+
genomic_pos = genomic_pos - 1 # Convert residue coordinate to inter-residue
|
906
867
|
|
907
|
-
|
908
|
-
|
909
|
-
|
910
|
-
|
911
|
-
|
912
|
-
|
913
|
-
|
914
|
-
|
915
|
-
|
916
|
-
|
917
|
-
break
|
868
|
+
# Validate that the breakpoint between the first and last exon for the selected transcript
|
869
|
+
coordinate_check = await self._validate_genomic_breakpoint(
|
870
|
+
pos=genomic_pos, genomic_ac=genomic_ac, tx_ac=transcript
|
871
|
+
)
|
872
|
+
if not coordinate_check:
|
873
|
+
return GenomicTxSeg(
|
874
|
+
errors=[
|
875
|
+
f"{genomic_pos} on {genomic_ac} does not occur within the exons for {transcript}"
|
876
|
+
]
|
877
|
+
)
|
918
878
|
|
919
|
-
|
920
|
-
|
921
|
-
|
922
|
-
|
923
|
-
|
924
|
-
|
879
|
+
# Check if breakpoint occurs on an exon.
|
880
|
+
# If not, determine the adjacent exon given the selected transcript
|
881
|
+
if not self._is_exonic_breakpoint(genomic_pos, tx_exons):
|
882
|
+
exon_num = self._get_adjacent_exon(
|
883
|
+
tx_exons_genomic_coords=tx_exons,
|
884
|
+
strand=strand,
|
885
|
+
start=genomic_pos if is_seg_start else None,
|
886
|
+
end=genomic_pos if not is_seg_start else None,
|
887
|
+
)
|
888
|
+
else:
|
889
|
+
exon_data = await self.uta_db.get_tx_exon_aln_v_data(
|
890
|
+
transcript,
|
891
|
+
genomic_pos,
|
892
|
+
genomic_pos,
|
893
|
+
alt_ac=genomic_ac,
|
894
|
+
use_tx_pos=False,
|
895
|
+
)
|
896
|
+
exon_num = exon_data[0].ord
|
925
897
|
|
926
|
-
|
927
|
-
|
928
|
-
|
929
|
-
|
930
|
-
|
931
|
-
|
898
|
+
offset = self._get_exon_offset(
|
899
|
+
genomic_pos=genomic_pos,
|
900
|
+
exon_boundary=tx_exons[exon_num].alt_start_i
|
901
|
+
if use_alt_start_i
|
902
|
+
else tx_exons[exon_num].alt_end_i,
|
903
|
+
strand=strand,
|
904
|
+
)
|
932
905
|
|
933
|
-
|
934
|
-
genomic_ac, genomic_pos, is_seg_start,
|
906
|
+
genomic_location, err_msg = self._get_vrs_seq_loc(
|
907
|
+
genomic_ac, genomic_pos, is_seg_start, strand
|
935
908
|
)
|
909
|
+
if err_msg:
|
910
|
+
return GenomicTxSeg(errors=[err_msg])
|
936
911
|
|
937
|
-
|
938
|
-
|
939
|
-
|
912
|
+
return GenomicTxSeg(
|
913
|
+
gene=gene,
|
914
|
+
genomic_ac=genomic_ac,
|
915
|
+
tx_ac=transcript,
|
916
|
+
seg=TxSegment(
|
917
|
+
exon_ord=exon_num,
|
918
|
+
offset=offset,
|
919
|
+
genomic_location=genomic_location,
|
920
|
+
),
|
921
|
+
)
|
922
|
+
|
923
|
+
async def _get_grch38_pos(
|
924
|
+
self,
|
925
|
+
genomic_ac: str,
|
926
|
+
genomic_pos: int,
|
927
|
+
chromosome: str | None = None,
|
928
|
+
) -> int | None:
|
940
929
|
"""Get GRCh38 genomic representation for accession and position
|
941
930
|
|
942
|
-
:param
|
943
|
-
:param
|
944
|
-
:param
|
945
|
-
|
946
|
-
:return: Tuple containing GRCh38 accession, GRCh38 position, and error message
|
947
|
-
if unable to get GRCh38 representation
|
931
|
+
:param genomic_pos: A genomic coordinate in GRCh37
|
932
|
+
:param genomic_ac: The genomic accession in GRCh38
|
933
|
+
:param chromosome: The chromosome that genomic_pos occurs on
|
934
|
+
:return The genomic coordinate in GRCh38
|
948
935
|
"""
|
949
|
-
if not
|
950
|
-
grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
|
951
|
-
if not grch38_ac:
|
952
|
-
return None, None, f"Unrecognized genomic accession: {genomic_ac}."
|
953
|
-
|
954
|
-
grch38_ac = grch38_ac[0]
|
955
|
-
|
956
|
-
if grch38_ac != genomic_ac:
|
957
|
-
# Ensure genomic_ac is GRCh37
|
936
|
+
if not chromosome:
|
958
937
|
chromosome, _ = self.seqrepo_access.translate_identifier(
|
959
|
-
genomic_ac, Assembly.
|
938
|
+
genomic_ac, target_namespaces=Assembly.GRCH38.value
|
960
939
|
)
|
961
|
-
if not chromosome:
|
962
|
-
_logger.warning(
|
963
|
-
"SeqRepo could not find associated %s assembly for genomic accession %s.",
|
964
|
-
Assembly.GRCH37.value,
|
965
|
-
genomic_ac,
|
966
|
-
)
|
967
|
-
return (
|
968
|
-
None,
|
969
|
-
None,
|
970
|
-
f"`genomic_ac` must use {Assembly.GRCH37.value} or {Assembly.GRCH38.value} assembly.",
|
971
|
-
)
|
972
|
-
|
973
940
|
chromosome = chromosome[-1].split(":")[-1]
|
974
|
-
|
975
|
-
|
976
|
-
|
977
|
-
|
978
|
-
return (
|
979
|
-
None,
|
980
|
-
None,
|
981
|
-
f"Lifting over {genomic_pos} on {genomic_ac} from {Assembly.GRCH37.value} to {Assembly.GRCH38.value} was unsuccessful.",
|
982
|
-
)
|
983
|
-
|
984
|
-
genomic_pos = liftover_data[1]
|
985
|
-
genomic_ac = grch38_ac
|
986
|
-
|
987
|
-
return genomic_ac, genomic_pos, None
|
941
|
+
liftover_data = self.liftover.get_liftover(
|
942
|
+
chromosome, genomic_pos, Assembly.GRCH38
|
943
|
+
)
|
944
|
+
return liftover_data[1] if liftover_data else None
|
988
945
|
|
989
|
-
async def
|
946
|
+
async def _validate_genomic_breakpoint(
|
990
947
|
self,
|
991
948
|
pos: int,
|
992
949
|
genomic_ac: str,
|
993
|
-
|
994
|
-
|
995
|
-
|
996
|
-
|
997
|
-
gene will be returned.
|
950
|
+
tx_ac: str,
|
951
|
+
) -> bool:
|
952
|
+
"""Validate that a genomic coordinate falls within the first and last exon
|
953
|
+
for a transcript on a given accession
|
998
954
|
|
999
955
|
:param pos: Genomic position on ``genomic_ac``
|
1000
956
|
:param genomic_ac: RefSeq genomic accession, e.g. ``"NC_000007.14"``
|
1001
|
-
:
|
1002
|
-
|
957
|
+
:param transcript: A transcript accession
|
958
|
+
:return: ``True`` if the coordinate falls within the first and last exon
|
959
|
+
for the transcript, ``False`` if not
|
1003
960
|
"""
|
1004
961
|
query = f"""
|
1005
|
-
|
1006
|
-
|
1007
|
-
|
1008
|
-
|
1009
|
-
|
1010
|
-
|
1011
|
-
|
962
|
+
WITH tx_boundaries AS (
|
963
|
+
SELECT
|
964
|
+
MIN(alt_start_i) AS min_start,
|
965
|
+
MAX(alt_end_i) AS max_end
|
966
|
+
FROM {self.uta_db.schema}.tx_exon_aln_v
|
967
|
+
WHERE tx_ac = '{tx_ac}'
|
968
|
+
AND alt_ac = '{genomic_ac}'
|
969
|
+
)
|
970
|
+
SELECT * FROM tx_boundaries
|
971
|
+
WHERE {pos} between tx_boundaries.min_start and tx_boundaries.max_end
|
1012
972
|
""" # noqa: S608
|
1013
973
|
results = await self.uta_db.execute_query(query)
|
1014
|
-
|
1015
|
-
return None, f"No gene(s) found given {genomic_ac} on position {pos}"
|
1016
|
-
|
1017
|
-
return results[0]["hgnc"], None
|
974
|
+
return bool(results)
|
1018
975
|
|
1019
976
|
async def _get_tx_ac_gene(
|
1020
977
|
self,
|
@@ -1042,102 +999,6 @@ class ExonGenomicCoordsMapper:
|
|
1042
999
|
|
1043
1000
|
return results[0]["hgnc"], None
|
1044
1001
|
|
1045
|
-
async def _get_tx_seg_genomic_metadata(
|
1046
|
-
self,
|
1047
|
-
genomic_ac: str,
|
1048
|
-
genomic_pos: int,
|
1049
|
-
is_seg_start: bool,
|
1050
|
-
gene: str,
|
1051
|
-
tx_ac: str | None,
|
1052
|
-
) -> GenomicTxSeg:
|
1053
|
-
"""Get transcript segment data and associated genomic metadata.
|
1054
|
-
|
1055
|
-
Will liftover to GRCh38 assembly. If liftover is unsuccessful, will return
|
1056
|
-
errors.
|
1057
|
-
|
1058
|
-
If ``tx_ac`` is not provided, will attempt to retrieve MANE transcript.
|
1059
|
-
|
1060
|
-
:param genomic_ac: Genomic RefSeq accession
|
1061
|
-
:param genomic_pos: Genomic position where the transcript segment occurs
|
1062
|
-
:param is_seg_start: Whether or not ``genomic_pos`` represents the start position.
|
1063
|
-
:param gene: Valid, case-sensitive HGNC gene symbol
|
1064
|
-
:param tx_ac: Transcript RefSeq accession. If not provided, will use MANE
|
1065
|
-
transcript
|
1066
|
-
:return: Transcript segment data and associated genomic metadata
|
1067
|
-
"""
|
1068
|
-
if tx_ac:
|
1069
|
-
# We should always try to liftover
|
1070
|
-
grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
|
1071
|
-
if not grch38_ac:
|
1072
|
-
return GenomicTxSeg(errors=[f"Invalid genomic accession: {genomic_ac}"])
|
1073
|
-
grch38_ac = grch38_ac[0]
|
1074
|
-
else:
|
1075
|
-
mane_data = self.mane_transcript_mappings.get_gene_mane_data(gene)
|
1076
|
-
if not mane_data:
|
1077
|
-
err_msg = f"Unable to find mane data for {genomic_ac} with position {genomic_pos}"
|
1078
|
-
if gene:
|
1079
|
-
err_msg += f" on gene {gene}"
|
1080
|
-
_logger.warning(err_msg)
|
1081
|
-
return GenomicTxSeg(errors=[err_msg])
|
1082
|
-
|
1083
|
-
mane_data = mane_data[0]
|
1084
|
-
tx_ac = mane_data["RefSeq_nuc"]
|
1085
|
-
grch38_ac = mane_data["GRCh38_chr"]
|
1086
|
-
|
1087
|
-
# Always liftover to GRCh38
|
1088
|
-
genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos(
|
1089
|
-
genomic_ac, genomic_pos, grch38_ac=grch38_ac
|
1090
|
-
)
|
1091
|
-
if err_msg:
|
1092
|
-
return GenomicTxSeg(errors=[err_msg])
|
1093
|
-
|
1094
|
-
tx_exons = await self._get_all_exon_coords(tx_ac, genomic_ac=grch38_ac)
|
1095
|
-
if not tx_exons:
|
1096
|
-
return GenomicTxSeg(errors=[f"No exons found given {tx_ac}"])
|
1097
|
-
|
1098
|
-
tx_exon_aln_data = await self.uta_db.get_tx_exon_aln_v_data(
|
1099
|
-
tx_ac,
|
1100
|
-
genomic_pos,
|
1101
|
-
genomic_pos,
|
1102
|
-
alt_ac=genomic_ac,
|
1103
|
-
use_tx_pos=False,
|
1104
|
-
)
|
1105
|
-
if len(tx_exon_aln_data) != 1:
|
1106
|
-
return GenomicTxSeg(
|
1107
|
-
errors=[
|
1108
|
-
f"Must find exactly one row for genomic data, but found: {len(tx_exon_aln_data)}"
|
1109
|
-
]
|
1110
|
-
)
|
1111
|
-
|
1112
|
-
tx_exon_aln_data = tx_exon_aln_data[0]
|
1113
|
-
|
1114
|
-
offset = self._get_exon_offset(
|
1115
|
-
start_i=tx_exon_aln_data.alt_start_i,
|
1116
|
-
end_i=tx_exon_aln_data.alt_end_i,
|
1117
|
-
strand=Strand(tx_exon_aln_data.alt_strand),
|
1118
|
-
use_start_i=False, # This doesn't impact anything since we're on the exon
|
1119
|
-
is_in_exon=True,
|
1120
|
-
start=genomic_pos if is_seg_start else None,
|
1121
|
-
end=genomic_pos if not is_seg_start else None,
|
1122
|
-
)
|
1123
|
-
|
1124
|
-
genomic_location, err_msg = self._get_vrs_seq_loc(
|
1125
|
-
genomic_ac, genomic_pos, is_seg_start, tx_exon_aln_data.alt_strand
|
1126
|
-
)
|
1127
|
-
if err_msg:
|
1128
|
-
return GenomicTxSeg(errors=[err_msg])
|
1129
|
-
|
1130
|
-
return GenomicTxSeg(
|
1131
|
-
gene=tx_exon_aln_data.hgnc,
|
1132
|
-
genomic_ac=genomic_ac,
|
1133
|
-
tx_ac=tx_exon_aln_data.tx_ac,
|
1134
|
-
seg=TxSegment(
|
1135
|
-
exon_ord=tx_exon_aln_data.ord,
|
1136
|
-
offset=offset,
|
1137
|
-
genomic_location=genomic_location,
|
1138
|
-
),
|
1139
|
-
)
|
1140
|
-
|
1141
1002
|
@staticmethod
|
1142
1003
|
def _is_exonic_breakpoint(pos: int, tx_genomic_coords: list[_ExonCoord]) -> bool:
|
1143
1004
|
"""Check if a breakpoint occurs on an exon
|
@@ -1150,6 +1011,24 @@ class ExonGenomicCoordsMapper:
|
|
1150
1011
|
exon.alt_start_i <= pos <= exon.alt_end_i for exon in tx_genomic_coords
|
1151
1012
|
)
|
1152
1013
|
|
1014
|
+
@staticmethod
|
1015
|
+
def _use_alt_start_i(is_seg_start: bool, strand: Strand) -> bool:
|
1016
|
+
"""Determine whether to use alt_start_i or alt_end_i from UTA when computing
|
1017
|
+
exon offset
|
1018
|
+
|
1019
|
+
:param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment starts.
|
1020
|
+
``False`` if ``genomic_pos`` is where the transcript segment ends.
|
1021
|
+
:param strand: The transcribed strand
|
1022
|
+
:return ``True`` if alt_start_i should be used, ``False`` if alt_end_i should
|
1023
|
+
be used
|
1024
|
+
"""
|
1025
|
+
return (
|
1026
|
+
is_seg_start
|
1027
|
+
and strand == Strand.POSITIVE
|
1028
|
+
or not is_seg_start
|
1029
|
+
and strand == Strand.NEGATIVE
|
1030
|
+
)
|
1031
|
+
|
1153
1032
|
@staticmethod
|
1154
1033
|
def _get_adjacent_exon(
|
1155
1034
|
tx_exons_genomic_coords: list[_ExonCoord],
|
@@ -1210,38 +1089,22 @@ class ExonGenomicCoordsMapper:
|
|
1210
1089
|
|
1211
1090
|
@staticmethod
|
1212
1091
|
def _get_exon_offset(
|
1213
|
-
|
1214
|
-
|
1092
|
+
genomic_pos: int,
|
1093
|
+
exon_boundary: int,
|
1215
1094
|
strand: Strand,
|
1216
|
-
use_start_i: bool = True,
|
1217
|
-
is_in_exon: bool = True,
|
1218
|
-
start: int | None = None,
|
1219
|
-
end: int | None = None,
|
1220
1095
|
) -> int:
|
1221
1096
|
"""Compute offset from exon start or end index
|
1222
1097
|
|
1223
|
-
:param
|
1224
|
-
|
1225
|
-
|
1226
|
-
:param
|
1227
|
-
|
1228
|
-
|
1229
|
-
:param is_in_exon: Whether or not the position occurs in an exon, defaults to
|
1230
|
-
``True``
|
1231
|
-
:param start: Provided start position, defaults to ``None``. Must provide
|
1232
|
-
``start`` or ``end``, not both.
|
1233
|
-
:param end: Provided end position, defaults to ``None``. Must provide ``start``
|
1234
|
-
or ``end``, not both
|
1098
|
+
:param genomic_pos: The supplied genomic position. This can represent, for
|
1099
|
+
example, a fusion junction breakpoint. This position is represented using
|
1100
|
+
inter-residue coordinates
|
1101
|
+
:param exon_boundary: The genomic position for the exon boundary that the offset
|
1102
|
+
is being computed against
|
1103
|
+
:paran strand: The transcribed strand
|
1235
1104
|
:return: Offset from exon start or end index
|
1236
1105
|
"""
|
1237
|
-
|
1238
|
-
|
1239
|
-
|
1240
|
-
else
|
1241
|
-
|
1242
|
-
else:
|
1243
|
-
if strand == Strand.POSITIVE:
|
1244
|
-
offset = start - start_i if use_start_i else end - end_i
|
1245
|
-
else:
|
1246
|
-
offset = start_i - end if use_start_i else end_i - start
|
1247
|
-
return offset
|
1106
|
+
return (
|
1107
|
+
genomic_pos - exon_boundary
|
1108
|
+
if strand == Strand.POSITIVE
|
1109
|
+
else (genomic_pos - exon_boundary) * -1
|
1110
|
+
)
|
@@ -378,6 +378,38 @@ class UtaDatabase:
|
|
378
378
|
result = await self.execute_query(query)
|
379
379
|
return result[0][0]
|
380
380
|
|
381
|
+
async def gene_exists(self, gene: str) -> bool:
|
382
|
+
"""Return whether or not a gene symbol exists in UTA gene table
|
383
|
+
|
384
|
+
:param gene: Gene symbol
|
385
|
+
:return ``True`` if gene symbol exists in UTA, ``False`` if not
|
386
|
+
"""
|
387
|
+
query = f"""
|
388
|
+
SELECT EXISTS(
|
389
|
+
SELECT hgnc
|
390
|
+
FROM {self.schema}.gene
|
391
|
+
WHERE hgnc = '{gene}'
|
392
|
+
);
|
393
|
+
""" # noqa: S608
|
394
|
+
result = await self.execute_query(query)
|
395
|
+
return result[0][0]
|
396
|
+
|
397
|
+
async def transcript_exists(self, transcript: str) -> bool:
|
398
|
+
"""Return whether or not a transcript exists in the UTA tx_exon_aln_v table
|
399
|
+
|
400
|
+
:param transcript: A transcript accession
|
401
|
+
:return ``True`` if transcript exists in UTA, ``False`` if not
|
402
|
+
"""
|
403
|
+
query = f"""
|
404
|
+
SELECT EXISTS(
|
405
|
+
SELECT tx_ac
|
406
|
+
FROM {self.schema}.tx_exon_aln_v
|
407
|
+
WHERE tx_ac = '{transcript}'
|
408
|
+
);
|
409
|
+
""" # noqa: S608
|
410
|
+
result = await self.execute_query(query)
|
411
|
+
return result[0][0]
|
412
|
+
|
381
413
|
async def get_ac_descr(self, ac: str) -> str | None:
|
382
414
|
"""Return accession description. This is typically available only for accessions
|
383
415
|
from older (pre-GRCh38) builds.
|
@@ -6,7 +6,7 @@ cool_seq_tool/handlers/__init__.py,sha256=KalQ46vX1MO4SJz2SlspKoIRy1n3c3Vp1t4Y2p
|
|
6
6
|
cool_seq_tool/handlers/seqrepo_access.py,sha256=Jd19jbdUvPRPn_XWozL67ph-nSIxpb4_UUimapDrsm4,9162
|
7
7
|
cool_seq_tool/mappers/__init__.py,sha256=O0JRxNFk8nWxD4v5ij47xelhvfVLdEXS43l2tzRuiUE,305
|
8
8
|
cool_seq_tool/mappers/alignment.py,sha256=nV6PS3mhkQ2MD1GcpNBujBOqd3AKxYSYA9BCusFOa1o,9636
|
9
|
-
cool_seq_tool/mappers/exon_genomic_coords.py,sha256=
|
9
|
+
cool_seq_tool/mappers/exon_genomic_coords.py,sha256=ORYjBVaX1HO6ln0gRJyRKxUCjZrBDi4JfYQEYebxIAc,43824
|
10
10
|
cool_seq_tool/mappers/liftover.py,sha256=lltx9zxfkrb5PHtJlKp3a39JCwPP4e0Zft-mQc1jXL8,3367
|
11
11
|
cool_seq_tool/mappers/mane_transcript.py,sha256=C9eKEj8qhVg878oUhBKPYAZS7gpLM5aaQ0HhSkUg-2g,54365
|
12
12
|
cool_seq_tool/resources/__init__.py,sha256=VwUC8YaucTS6SmRirToulZTF6CuvuLQRSxFfSfAovCc,77
|
@@ -16,9 +16,9 @@ cool_seq_tool/resources/transcript_mapping.tsv,sha256=AO3luYQAbFiCoRgiiPXotakb5p
|
|
16
16
|
cool_seq_tool/sources/__init__.py,sha256=51QiymeptF7AeVGgV-tW_9f4pIUr0xtYbyzpvHOCneM,304
|
17
17
|
cool_seq_tool/sources/mane_transcript_mappings.py,sha256=Q6J57O2lLWXlgKT0zq3BIwkwFawySnORHOX-UxzfyDE,5399
|
18
18
|
cool_seq_tool/sources/transcript_mappings.py,sha256=903RKTMBO2rbKh6iTQ1BEWnY4C7saBFMPw2_4ATuudg,10054
|
19
|
-
cool_seq_tool/sources/uta_database.py,sha256=
|
20
|
-
cool_seq_tool-0.
|
21
|
-
cool_seq_tool-0.
|
22
|
-
cool_seq_tool-0.
|
23
|
-
cool_seq_tool-0.
|
24
|
-
cool_seq_tool-0.
|
19
|
+
cool_seq_tool/sources/uta_database.py,sha256=s7BkFplD_b2AmvXq8vZSCiBuZLy8RlxAqNyf-6QtR8w,36112
|
20
|
+
cool_seq_tool-0.11.0.dist-info/LICENSE,sha256=IpqC9A-tZW7XXXvCS8c4AVINqkmpxiVA-34Qe3CZSjo,1072
|
21
|
+
cool_seq_tool-0.11.0.dist-info/METADATA,sha256=VcP6BvVyQ1YVB2u2XsZbEVd9DYYr-ZKcHadIt3ACsBY,6557
|
22
|
+
cool_seq_tool-0.11.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
23
|
+
cool_seq_tool-0.11.0.dist-info/top_level.txt,sha256=cGuxdN6p3y16jQf6hCwWhE4OptwUeZPm_PNJlPb3b0k,14
|
24
|
+
cool_seq_tool-0.11.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|