cool-seq-tool 0.14.4__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cool_seq_tool/app.py CHANGED
@@ -107,6 +107,7 @@ class CoolSeqTool:
107
107
  self.ex_g_coords_mapper = ExonGenomicCoordsMapper(
108
108
  self.seqrepo_access,
109
109
  self.uta_db,
110
+ self.mane_transcript,
110
111
  self.mane_transcript_mappings,
111
112
  self.liftover,
112
113
  )
@@ -2,17 +2,21 @@
2
2
 
3
3
  import logging
4
4
 
5
+ from ga4gh.core.models import Extension
5
6
  from ga4gh.vrs.models import SequenceLocation, SequenceReference
6
7
  from pydantic import ConfigDict, Field, StrictInt, StrictStr, model_validator
7
8
 
8
9
  from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess
9
10
  from cool_seq_tool.mappers.liftover import LiftOver
11
+ from cool_seq_tool.mappers.mane_transcript import ManeTranscript
10
12
  from cool_seq_tool.schemas import (
13
+ AnnotationLayer,
11
14
  Assembly,
12
15
  BaseModelForbidExtra,
13
16
  CoordinateType,
14
17
  ServiceMeta,
15
18
  Strand,
19
+ TranscriptPriority,
16
20
  )
17
21
  from cool_seq_tool.sources.mane_transcript_mappings import ManeTranscriptMappings
18
22
  from cool_seq_tool.sources.uta_database import GenomicAlnData, UtaDatabase
@@ -65,9 +69,6 @@ class TxSegment(BaseModelForbidExtra):
65
69
  genomic_location: SequenceLocation = Field(
66
70
  ..., description="The genomic position of a transcript segment."
67
71
  )
68
- is_exonic: bool = Field(
69
- default=True, description="If the position occurs on an exon"
70
- )
71
72
 
72
73
  @model_validator(mode="before")
73
74
  def check_seg_pos(cls, values: dict) -> dict: # noqa: N805
@@ -99,8 +100,8 @@ class TxSegment(BaseModelForbidExtra):
99
100
  "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
100
101
  },
101
102
  "end": 154192135,
103
+ "extensions": [{"name": "is_exonic", "value": True}],
102
104
  },
103
- "is_exonic": True,
104
105
  }
105
106
  }
106
107
  )
@@ -115,6 +116,9 @@ class GenomicTxSeg(BaseModelForbidExtra):
115
116
  )
116
117
  genomic_ac: StrictStr | None = Field(None, description="RefSeq genomic accession.")
117
118
  tx_ac: StrictStr | None = Field(None, description="RefSeq transcript accession.")
119
+ tx_status: TranscriptPriority | None = Field(
120
+ None, description="Transcript priority for RefSeq transcript accession"
121
+ )
118
122
  strand: Strand | None = Field(
119
123
  None, description="The strand that the transcript accession exists on."
120
124
  )
@@ -146,6 +150,7 @@ class GenomicTxSeg(BaseModelForbidExtra):
146
150
  "gene": "TPM3",
147
151
  "genomic_ac": "NC_000001.11",
148
152
  "tx_ac": "NM_152263.3",
153
+ "tx_status": "longest_compatible_remaining",
149
154
  "strand": -1,
150
155
  "seg": {
151
156
  "exon_ord": 0,
@@ -157,8 +162,8 @@ class GenomicTxSeg(BaseModelForbidExtra):
157
162
  "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
158
163
  },
159
164
  "end": 154192135,
165
+ "extensions": [{"name": "is_exonic", "value": True}],
160
166
  },
161
- "is_exonic": True,
162
167
  },
163
168
  "errors": [],
164
169
  }
@@ -174,6 +179,9 @@ class GenomicTxSegService(BaseModelForbidExtra):
174
179
  )
175
180
  genomic_ac: StrictStr | None = Field(None, description="RefSeq genomic accession.")
176
181
  tx_ac: StrictStr | None = Field(None, description="RefSeq transcript accession.")
182
+ tx_status: TranscriptPriority | None = Field(
183
+ None, description="Transcript priority for RefSeq transcript accession"
184
+ )
177
185
  strand: Strand | None = Field(
178
186
  None, description="The strand that the transcript exists on."
179
187
  )
@@ -213,6 +221,7 @@ class GenomicTxSegService(BaseModelForbidExtra):
213
221
  "gene": "TPM3",
214
222
  "genomic_ac": "NC_000001.11",
215
223
  "tx_ac": "NM_152263.3",
224
+ "tx_status": "longest_compatible_remaining",
216
225
  "strand": -1,
217
226
  "seg_start": {
218
227
  "exon_ord": 0,
@@ -224,8 +233,8 @@ class GenomicTxSegService(BaseModelForbidExtra):
224
233
  "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
225
234
  },
226
235
  "end": 154192135,
236
+ "extensions": [{"name": "is_exonic", "value": True}],
227
237
  },
228
- "is_exonic": True,
229
238
  },
230
239
  "seg_end": {
231
240
  "exon_ord": 7,
@@ -237,8 +246,8 @@ class GenomicTxSegService(BaseModelForbidExtra):
237
246
  "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
238
247
  },
239
248
  "start": 154170399,
249
+ "extensions": [{"name": "is_exonic", "value": True}],
240
250
  },
241
- "is_exonic": True,
242
251
  },
243
252
  }
244
253
  }
@@ -266,6 +275,7 @@ class ExonGenomicCoordsMapper:
266
275
  self,
267
276
  seqrepo_access: SeqRepoAccess,
268
277
  uta_db: UtaDatabase,
278
+ mane_transcript: ManeTranscript,
269
279
  mane_transcript_mappings: ManeTranscriptMappings,
270
280
  liftover: LiftOver,
271
281
  ) -> None:
@@ -290,11 +300,13 @@ class ExonGenomicCoordsMapper:
290
300
 
291
301
  :param seqrepo_access: SeqRepo instance to give access to query SeqRepo database
292
302
  :param uta_db: UtaDatabase instance to give access to query UTA database
303
+ :param mane_transcript: ManeTranscript instance to give access to ManeTranscript class
293
304
  :param mane_transcript_mappings: Instance to provide access to ManeTranscriptMappings class
294
305
  :param liftover: Instance to provide mapping between human genome assemblies
295
306
  """
296
307
  self.seqrepo_access = seqrepo_access
297
308
  self.uta_db = uta_db
309
+ self.mane_transcript = mane_transcript
298
310
  self.mane_transcript_mappings = mane_transcript_mappings
299
311
  self.liftover = liftover
300
312
 
@@ -433,6 +445,7 @@ class ExonGenomicCoordsMapper:
433
445
  gene=gene,
434
446
  genomic_ac=genomic_ac,
435
447
  tx_ac=transcript,
448
+ tx_status=self.mane_transcript_mappings.get_transcript_status(transcript),
436
449
  strand=strand,
437
450
  seg_start=seg_start,
438
451
  seg_end=seg_end,
@@ -524,6 +537,7 @@ class ExonGenomicCoordsMapper:
524
537
  params["gene"] = start_tx_seg_data.gene
525
538
  params["genomic_ac"] = start_tx_seg_data.genomic_ac
526
539
  params["tx_ac"] = start_tx_seg_data.tx_ac
540
+ params["tx_status"] = start_tx_seg_data.tx_status
527
541
  params["strand"] = start_tx_seg_data.strand
528
542
  params["seg_start"] = start_tx_seg_data.seg
529
543
  else:
@@ -559,6 +573,7 @@ class ExonGenomicCoordsMapper:
559
573
  params["gene"] = end_tx_seg_data.gene
560
574
  params["genomic_ac"] = end_tx_seg_data.genomic_ac
561
575
  params["tx_ac"] = end_tx_seg_data.tx_ac
576
+ params["tx_status"] = end_tx_seg_data.tx_status
562
577
  params["strand"] = end_tx_seg_data.strand
563
578
 
564
579
  params["seg_end"] = end_tx_seg_data.seg
@@ -730,7 +745,12 @@ class ExonGenomicCoordsMapper:
730
745
  ), None
731
746
 
732
747
  def _get_vrs_seq_loc(
733
- self, genomic_ac: str, genomic_pos: int, is_seg_start: bool, strand: Strand
748
+ self,
749
+ genomic_ac: str,
750
+ genomic_pos: int,
751
+ is_seg_start: bool,
752
+ strand: Strand,
753
+ is_exonic: bool = True,
734
754
  ) -> tuple[SequenceLocation | None, str | None]:
735
755
  """Create VRS Sequence Location for genomic position where transcript segment
736
756
  occurs
@@ -740,6 +760,8 @@ class ExonGenomicCoordsMapper:
740
760
  :param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment
741
761
  starts. ``False`` if ``genomic_pos`` is where the transcript segment ends.
742
762
  :param strand: Strand
763
+ :param is_exonic: A boolean indicating if the genomic breakpoint occurs
764
+ on an exon. By default, this is set to ``True``.
743
765
  :return: Tuple containing VRS location (if successful) and error message (if
744
766
  unable to get GA4GH identifier for ``genomic_ac``).
745
767
  """
@@ -759,6 +781,7 @@ class ExonGenomicCoordsMapper:
759
781
  ),
760
782
  start=genomic_pos if use_start else None,
761
783
  end=genomic_pos if not use_start else None,
784
+ extensions=[Extension(name="is_exonic", value=is_exonic)],
762
785
  ), None
763
786
 
764
787
  async def _genomic_to_tx_segment(
@@ -852,14 +875,18 @@ class ExonGenomicCoordsMapper:
852
875
  if mane_transcripts:
853
876
  transcript = mane_transcripts[0]["RefSeq_nuc"]
854
877
  else:
855
- # Attempt to find a coding transcript if a MANE transcript
878
+ # Attempt to find longest compatible transcript if a MANE transcript
856
879
  # cannot be found
857
- results = await self.uta_db.get_transcripts(
858
- gene=gene, alt_ac=genomic_ac
880
+ results = await self.mane_transcript.get_longest_compatible_transcript(
881
+ start_pos=genomic_pos,
882
+ end_pos=genomic_pos,
883
+ gene=gene,
884
+ alt_ac=genomic_ac,
885
+ start_annotation_layer=AnnotationLayer.GENOMIC,
859
886
  )
860
887
 
861
- if not results.is_empty():
862
- transcript = results[0]["tx_ac"][0]
888
+ if results:
889
+ transcript = results.refseq
863
890
  else:
864
891
  # Run if gene is for a noncoding transcript
865
892
  query = f"""
@@ -947,7 +974,7 @@ class ExonGenomicCoordsMapper:
947
974
  )
948
975
 
949
976
  genomic_location, err_msg = self._get_vrs_seq_loc(
950
- genomic_ac, genomic_pos, is_seg_start, strand
977
+ genomic_ac, genomic_pos, is_seg_start, strand, is_exonic
951
978
  )
952
979
  if err_msg:
953
980
  return GenomicTxSeg(errors=[err_msg])
@@ -956,12 +983,12 @@ class ExonGenomicCoordsMapper:
956
983
  gene=gene,
957
984
  genomic_ac=genomic_ac,
958
985
  tx_ac=transcript,
986
+ tx_status=self.mane_transcript_mappings.get_transcript_status(transcript),
959
987
  strand=strand,
960
988
  seg=TxSegment(
961
989
  exon_ord=exon_num,
962
990
  offset=offset,
963
991
  genomic_location=genomic_location,
964
- is_exonic=is_exonic,
965
992
  ),
966
993
  )
967
994
 
@@ -3,6 +3,7 @@
3
3
  import logging
4
4
  from collections import namedtuple
5
5
  from pathlib import Path
6
+ from urllib.parse import urlparse
6
7
 
7
8
  from agct._core import ChainfileError
8
9
  from asyncpg import InvalidCatalogNameError, UndefinedTableError
@@ -11,7 +12,7 @@ from biocommons.seqrepo import SeqRepo
11
12
  from cool_seq_tool.handlers.seqrepo_access import SEQREPO_ROOT_DIR, SeqRepoAccess
12
13
  from cool_seq_tool.mappers.liftover import LiftOver
13
14
  from cool_seq_tool.resources.data_files import DataFile, get_data_file
14
- from cool_seq_tool.sources.uta_database import UTA_DB_URL, UtaDatabase
15
+ from cool_seq_tool.sources.uta_database import UTA_DB_URL, ParseResult, UtaDatabase
15
16
 
16
17
  _logger = logging.getLogger(__name__)
17
18
 
@@ -119,14 +120,20 @@ async def check_status(
119
120
  else:
120
121
  status["liftover"] = True
121
122
 
123
+ parsed_result = ParseResult(urlparse(db_url))
124
+ sanitized_url = parsed_result.sanitized_url
122
125
  try:
123
126
  await UtaDatabase.create(db_url)
127
+ except ValueError:
128
+ _logger.exception("Database URL is not valid")
124
129
  except (OSError, InvalidCatalogNameError, UndefinedTableError):
125
- _logger.exception("Encountered error instantiating UTA at URI %s", UTA_DB_URL)
130
+ _logger.exception(
131
+ "Encountered error instantiating UTA at URI %s", sanitized_url
132
+ )
126
133
  except Exception as e:
127
134
  _logger.critical(
128
135
  "Encountered unexpected error instantiating UTA from URI %s: %s",
129
- UTA_DB_URL,
136
+ sanitized_url,
130
137
  e,
131
138
  )
132
139
  else:
@@ -8,7 +8,7 @@ from pathlib import Path
8
8
  import polars as pl
9
9
 
10
10
  from cool_seq_tool.resources.data_files import DataFile, get_data_file
11
- from cool_seq_tool.schemas import ManeGeneData
11
+ from cool_seq_tool.schemas import ManeGeneData, TranscriptPriority
12
12
 
13
13
  _logger = logging.getLogger(__name__)
14
14
 
@@ -85,6 +85,22 @@ class ManeTranscriptMappings:
85
85
  return []
86
86
  return mane_rows.to_dicts()
87
87
 
88
+ def get_transcript_status(self, tx_ac: str) -> TranscriptPriority:
89
+ """Get MANE status for a transcript
90
+
91
+ :param tx_ac: A RefSeq transcript accession
92
+ :return: A TranscriptPriority object
93
+ """
94
+ mane_info = self.get_mane_from_transcripts([tx_ac])
95
+ if not mane_info:
96
+ return TranscriptPriority.LONGEST_COMPATIBLE_REMAINING
97
+ mane_info = mane_info[0]["MANE_status"]
98
+ return (
99
+ TranscriptPriority.MANE_SELECT
100
+ if mane_info == "MANE Select"
101
+ else TranscriptPriority.MANE_PLUS_CLINICAL
102
+ )
103
+
88
104
  def get_mane_data_from_chr_pos(
89
105
  self, alt_ac: str, start: int, end: int
90
106
  ) -> list[dict]:
@@ -5,7 +5,7 @@ import logging
5
5
  from os import environ
6
6
  from typing import Any, Literal, TypeVar
7
7
  from urllib.parse import ParseResult as UrlLibParseResult
8
- from urllib.parse import quote, unquote, urlparse
8
+ from urllib.parse import unquote, urlparse, urlunparse
9
9
 
10
10
  import asyncpg
11
11
  import boto3
@@ -101,8 +101,7 @@ class UtaDatabase:
101
101
  """
102
102
  self.schema = None
103
103
  self._connection_pool = None
104
- original_pwd = db_url.split("//")[-1].split("@")[0].split(":")[-1]
105
- self.db_url = db_url.replace(original_pwd, quote(original_pwd))
104
+ self.db_url = db_url
106
105
  self.args = self._get_conn_args()
107
106
 
108
107
  def _get_conn_args(self) -> DbConnectionArgs:
@@ -954,3 +953,28 @@ class ParseResult(UrlLibParseResult):
954
953
  """Create schema property."""
955
954
  path_elems = self.path.split("/")
956
955
  return path_elems[2] if len(path_elems) > 2 else None
956
+
957
+ @property
958
+ def sanitized_url(self) -> str:
959
+ """Sanitized DB URL with the password masked"""
960
+ netloc = ""
961
+ if self.username:
962
+ netloc += self.username
963
+ if self.password is not None and self.password != "":
964
+ netloc += ":***"
965
+ netloc += "@"
966
+ if self.hostname:
967
+ netloc += f"{self.hostname}"
968
+ if self.port:
969
+ netloc += f":{self.port}"
970
+
971
+ return urlunparse(
972
+ (
973
+ self.scheme,
974
+ netloc,
975
+ self.path,
976
+ self.params,
977
+ self.query,
978
+ self.fragment,
979
+ )
980
+ )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cool_seq_tool
3
- Version: 0.14.4
3
+ Version: 0.15.0
4
4
  Summary: Common Operation on Lots of Sequences Tool
5
5
  Author: Kori Kuzma, James Stevenson, Katie Stahl, Alex Wagner
6
6
  License: MIT License
@@ -50,7 +50,7 @@ Requires-Dist: agct>=0.1.0-dev1
50
50
  Requires-Dist: polars~=1.0
51
51
  Requires-Dist: biocommons.seqrepo
52
52
  Requires-Dist: pydantic<3.0,>=2.0
53
- Requires-Dist: ga4gh.vrs<3.0,>=2.1.3
53
+ Requires-Dist: ga4gh.vrs<3.0,>=2.1.4
54
54
  Requires-Dist: wags-tails~=0.4.0
55
55
  Requires-Dist: bioutils
56
56
  Provides-Extra: dev
@@ -1,25 +1,25 @@
1
1
  cool_seq_tool/__init__.py,sha256=pJyVj7Z275BBAwpeFMm-WEn_tp-y1_ihRl1sLc4FFZY,400
2
- cool_seq_tool/app.py,sha256=vyqlQRffC8sWZXMm-f_f-8WuTTWo3oRNfPUa_qdPV2M,4944
2
+ cool_seq_tool/app.py,sha256=ThdumeXtLNrrGkQW9wwLT3Zm_Fc1wzg88ZxLIwMzcJU,4978
3
3
  cool_seq_tool/schemas.py,sha256=6c87iuA6v7BX7a8nkWEqFbJTksFysuuIeuYxkNCrAsI,5356
4
4
  cool_seq_tool/utils.py,sha256=jra2ZHS7HUqXqabSvyqd5imf6kkhYL8nQd20BWNLpb8,2950
5
5
  cool_seq_tool/handlers/__init__.py,sha256=KalQ46vX1MO4SJz2SlspKoIRy1n3c3Vp1t4Y2pIfqow,78
6
6
  cool_seq_tool/handlers/seqrepo_access.py,sha256=lRzPc8V0eZJTlefbHuVKeZTEC8-KcyPzpqX7vx3amu8,9118
7
7
  cool_seq_tool/mappers/__init__.py,sha256=tavpwkNogg_nF1J_kb6Q9jk7ezqdRz063v7BMZ4koLM,390
8
8
  cool_seq_tool/mappers/alignment.py,sha256=kWgYssM8YL-Z13H9GdpL77P7simNcbxltAs9YDXHE54,9640
9
- cool_seq_tool/mappers/exon_genomic_coords.py,sha256=fV4LyrpHPLRrx6AtV15g93q5XCH3i-y3Wj9tl-Cg8mM,45845
9
+ cool_seq_tool/mappers/exon_genomic_coords.py,sha256=I59vvswLbXk1fOWLNyNd8NTVT39f5yxTCv20utlTCpo,47361
10
10
  cool_seq_tool/mappers/feature_overlap.py,sha256=X5UFClaH6ixRsO2fDLxqjywp-Z0bvNx4uzgBICy394U,9758
11
11
  cool_seq_tool/mappers/liftover.py,sha256=lltx9zxfkrb5PHtJlKp3a39JCwPP4e0Zft-mQc1jXL8,3367
12
12
  cool_seq_tool/mappers/mane_transcript.py,sha256=IluiLBxPQoY-CxkpqpjEBcMlHvrNLa34wdKdQxtKgDY,54613
13
13
  cool_seq_tool/resources/__init__.py,sha256=VwUC8YaucTS6SmRirToulZTF6CuvuLQRSxFfSfAovCc,77
14
14
  cool_seq_tool/resources/data_files.py,sha256=6d1M5WjeFHdTQpzxqjQ78auQRZvIBVqH8QNCrmRRDXw,4205
15
- cool_seq_tool/resources/status.py,sha256=5UKx5FIQuyIY7FU4kSinDIM4MhLpr9_MiQDDBNt9kRo,5990
15
+ cool_seq_tool/resources/status.py,sha256=iP-4NiSmqV-D--gypZyrSqVbOWQvyBZICKQb-VinTik,6241
16
16
  cool_seq_tool/resources/transcript_mapping.tsv,sha256=AO3luYQAbFiCoRgiiPXotakb5pAwx1jDCeXpvGdIuac,24138769
17
17
  cool_seq_tool/sources/__init__.py,sha256=51QiymeptF7AeVGgV-tW_9f4pIUr0xtYbyzpvHOCneM,304
18
- cool_seq_tool/sources/mane_transcript_mappings.py,sha256=C5puIA1xuEzBaSvs8VtSxVb2OIDGUg5no8v6Ma2QSdw,6597
18
+ cool_seq_tool/sources/mane_transcript_mappings.py,sha256=PLJymduwvG1pt9mravE58BfJsGXdAYXcZYZdHPy12z0,7211
19
19
  cool_seq_tool/sources/transcript_mappings.py,sha256=903RKTMBO2rbKh6iTQ1BEWnY4C7saBFMPw2_4ATuudg,10054
20
- cool_seq_tool/sources/uta_database.py,sha256=zzRzmYuybqzEg7zeuQjhK46SPK5GfbiWWNRGNJju8AI,36197
21
- cool_seq_tool-0.14.4.dist-info/licenses/LICENSE,sha256=IpqC9A-tZW7XXXvCS8c4AVINqkmpxiVA-34Qe3CZSjo,1072
22
- cool_seq_tool-0.14.4.dist-info/METADATA,sha256=gPz48irXCwNUecTcgpKrdrHiBhi8R_Is55S2UYs9Qtk,6535
23
- cool_seq_tool-0.14.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
24
- cool_seq_tool-0.14.4.dist-info/top_level.txt,sha256=cGuxdN6p3y16jQf6hCwWhE4OptwUeZPm_PNJlPb3b0k,14
25
- cool_seq_tool-0.14.4.dist-info/RECORD,,
20
+ cool_seq_tool/sources/uta_database.py,sha256=38CQ0QHHh0kA87tdgsJHJiHdJHQc06ylBYfemGFUlZc,36759
21
+ cool_seq_tool-0.15.0.dist-info/licenses/LICENSE,sha256=IpqC9A-tZW7XXXvCS8c4AVINqkmpxiVA-34Qe3CZSjo,1072
22
+ cool_seq_tool-0.15.0.dist-info/METADATA,sha256=MEcG0vc2k8F8lQljcFgkIrabFHJec_pE7Ib6DzLZR2M,6535
23
+ cool_seq_tool-0.15.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
24
+ cool_seq_tool-0.15.0.dist-info/top_level.txt,sha256=cGuxdN6p3y16jQf6hCwWhE4OptwUeZPm_PNJlPb3b0k,14
25
+ cool_seq_tool-0.15.0.dist-info/RECORD,,