cool-seq-tool 0.4.0.dev3__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. cool_seq_tool/__init__.py +7 -11
  2. cool_seq_tool/app.py +44 -24
  3. cool_seq_tool/handlers/__init__.py +1 -0
  4. cool_seq_tool/handlers/seqrepo_access.py +27 -25
  5. cool_seq_tool/mappers/__init__.py +3 -1
  6. cool_seq_tool/mappers/alignment.py +5 -6
  7. cool_seq_tool/mappers/exon_genomic_coords.py +139 -124
  8. cool_seq_tool/mappers/liftover.py +90 -0
  9. cool_seq_tool/mappers/mane_transcript.py +208 -113
  10. cool_seq_tool/resources/__init__.py +1 -0
  11. cool_seq_tool/resources/data_files.py +93 -0
  12. cool_seq_tool/resources/status.py +153 -0
  13. cool_seq_tool/schemas.py +92 -54
  14. cool_seq_tool/sources/__init__.py +1 -0
  15. cool_seq_tool/sources/mane_transcript_mappings.py +16 -9
  16. cool_seq_tool/sources/transcript_mappings.py +41 -32
  17. cool_seq_tool/sources/uta_database.py +96 -249
  18. cool_seq_tool/utils.py +44 -4
  19. {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.5.0.dist-info}/LICENSE +1 -1
  20. {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.5.0.dist-info}/METADATA +16 -11
  21. cool_seq_tool-0.5.0.dist-info/RECORD +24 -0
  22. {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.5.0.dist-info}/WHEEL +1 -1
  23. cool_seq_tool/api.py +0 -42
  24. cool_seq_tool/data/__init__.py +0 -2
  25. cool_seq_tool/data/data_downloads.py +0 -89
  26. cool_seq_tool/paths.py +0 -28
  27. cool_seq_tool/routers/__init__.py +0 -16
  28. cool_seq_tool/routers/default.py +0 -125
  29. cool_seq_tool/routers/mane.py +0 -98
  30. cool_seq_tool/routers/mappings.py +0 -155
  31. cool_seq_tool/version.py +0 -2
  32. cool_seq_tool-0.4.0.dev3.dist-info/RECORD +0 -29
  33. /cool_seq_tool/{data → resources}/transcript_mapping.tsv +0 -0
  34. {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.5.0.dist-info}/top_level.txt +0 -0
@@ -1,16 +1,15 @@
1
1
  """Provide transcript lookup and metadata tools via the UTA database."""
2
+
2
3
  import ast
3
- import base64
4
4
  import logging
5
5
  from os import environ
6
- from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union
6
+ from typing import Any, Literal, TypeVar
7
7
  from urllib.parse import ParseResult as UrlLibParseResult
8
8
  from urllib.parse import quote, unquote, urlparse
9
9
 
10
10
  import asyncpg
11
11
  import boto3
12
12
  import polars as pl
13
- from agct import Converter, Genome
14
13
  from asyncpg.exceptions import InterfaceError, InvalidAuthorizationSpecificationError
15
14
  from botocore.exceptions import ClientError
16
15
 
@@ -19,15 +18,11 @@ from cool_seq_tool.schemas import AnnotationLayer, Assembly, Strand
19
18
  # use `bound` to upper-bound UtaDatabase or child classes
20
19
  UTADatabaseType = TypeVar("UTADatabaseType", bound="UtaDatabase")
21
20
 
22
- # Environment variables for paths to chain files for agct
23
- LIFTOVER_CHAIN_37_TO_38 = environ.get("LIFTOVER_CHAIN_37_TO_38")
24
- LIFTOVER_CHAIN_38_TO_37 = environ.get("LIFTOVER_CHAIN_38_TO_37")
25
-
26
21
  UTA_DB_URL = environ.get(
27
- "UTA_DB_URL", "postgresql://uta_admin:uta@localhost:5433/uta/uta_20210129b"
22
+ "UTA_DB_URL", "postgresql://uta_admin:uta@localhost:5432/uta/uta_20210129b"
28
23
  )
29
24
 
30
- logger = logging.getLogger(__name__)
25
+ _logger = logging.getLogger(__name__)
31
26
 
32
27
 
33
28
  class UtaDatabase:
@@ -43,25 +38,12 @@ class UtaDatabase:
43
38
  >>> uta_db = asyncio.run(UtaDatabase.create())
44
39
  """
45
40
 
46
- def __init__(
47
- self,
48
- db_url: str = UTA_DB_URL,
49
- chain_file_37_to_38: Optional[str] = None,
50
- chain_file_38_to_37: Optional[str] = None,
51
- ) -> None:
41
+ def __init__(self, db_url: str = UTA_DB_URL) -> None:
52
42
  """Initialize DB class. Should only be used by ``create()`` method, and not
53
43
  be called directly by a user.
54
44
 
55
45
  :param db_url: PostgreSQL connection URL
56
46
  Format: ``driver://user:password@host/database/schema``
57
- :param chain_file_37_to_38: Optional path to chain file for 37 to 38 assembly.
58
- This is used for ``agct``. If this is not provided, will check to see
59
- if ``LIFTOVER_CHAIN_37_TO_38`` env var is set. If neither is provided, will
60
- allow ``agct`` to download a chain file from UCSC
61
- :param chain_file_38_to_37: Optional path to chain file for 38 to 37 assembly.
62
- This is used for ``agct``. If this is not provided, will check to see
63
- if ``LIFTOVER_CHAIN_38_TO_37`` env var is set. If neither is provided, will
64
- allow ``agct`` to download a chain file from UCSC
65
47
  """
66
48
  self.schema = None
67
49
  self._connection_pool = None
@@ -69,19 +51,7 @@ class UtaDatabase:
69
51
  self.db_url = db_url.replace(original_pwd, quote(original_pwd))
70
52
  self.args = self._get_conn_args()
71
53
 
72
- chain_file_37_to_38 = chain_file_37_to_38 or LIFTOVER_CHAIN_37_TO_38
73
- if chain_file_37_to_38:
74
- self.liftover_37_to_38 = Converter(chainfile=chain_file_37_to_38)
75
- else:
76
- self.liftover_37_to_38 = Converter(from_db=Genome.HG19, to_db=Genome.HG38)
77
-
78
- chain_file_38_to_37 = chain_file_38_to_37 or LIFTOVER_CHAIN_38_TO_37
79
- if chain_file_38_to_37:
80
- self.liftover_38_to_37 = Converter(chainfile=chain_file_38_to_37)
81
- else:
82
- self.liftover_38_to_37 = Converter(from_db=Genome.HG38, to_db=Genome.HG19)
83
-
84
- def _get_conn_args(self) -> Dict:
54
+ def _get_conn_args(self) -> dict:
85
55
  """Return connection arguments.
86
56
 
87
57
  :param db_url: raw connection URL
@@ -99,9 +69,9 @@ class UtaDatabase:
99
69
  self.schema = schema
100
70
 
101
71
  environ["PGPASSWORD"] = password
102
- environ[
103
- "UTA_DB_URL"
104
- ] = f"postgresql://{username}@{host}:{port}/{database}/{schema}"
72
+ environ["UTA_DB_URL"] = (
73
+ f"postgresql://{username}@{host}:{port}/{database}/{schema}"
74
+ )
105
75
  return {
106
76
  "host": host,
107
77
  "port": int(port),
@@ -137,7 +107,7 @@ class UtaDatabase:
137
107
  database=self.args["database"],
138
108
  )
139
109
  except InterfaceError as e:
140
- logger.error(
110
+ _logger.error(
141
111
  "While creating connection pool, encountered exception %s", e
142
112
  )
143
113
  msg = "Could not create connection pool"
@@ -145,7 +115,7 @@ class UtaDatabase:
145
115
 
146
116
  @classmethod
147
117
  async def create(
148
- cls: Type[UTADatabaseType], db_url: str = UTA_DB_URL
118
+ cls: type[UTADatabaseType], db_url: str = UTA_DB_URL
149
119
  ) -> UTADatabaseType:
150
120
  """Manufacture a fully-initialized class instance (a la factory pattern). This
151
121
  method should be used instead of calling the class directly to create a new
@@ -173,7 +143,10 @@ class UtaDatabase:
173
143
  """
174
144
 
175
145
  async def _execute_query(q: str) -> Any: # noqa: ANN401
176
- async with self._connection_pool.acquire() as connection, connection.transaction():
146
+ async with (
147
+ self._connection_pool.acquire() as connection,
148
+ connection.transaction(),
149
+ ):
177
150
  return await connection.fetch(q)
178
151
 
179
152
  if not self._connection_pool:
@@ -197,7 +170,7 @@ class UtaDatabase:
197
170
  genomic_table_exists = await self.execute_query(check_table_exists)
198
171
  genomic_table_exists = genomic_table_exists[0].get("exists")
199
172
  if genomic_table_exists is None:
200
- logger.critical(
173
+ _logger.critical(
201
174
  "SELECT EXISTS query in UtaDatabase._create_genomic_table "
202
175
  "returned invalid response"
203
176
  )
@@ -234,25 +207,22 @@ class UtaDatabase:
234
207
  await self.execute_query(create_index)
235
208
 
236
209
  @staticmethod
237
- def _transform_list(li: List) -> List[List[Any]]:
210
+ def _transform_list(li: list) -> list[list[Any]]:
238
211
  """Transform list to only contain field values
239
212
 
240
213
  :param li: List of asyncpg.Record objects
241
214
  :return: List of list of objects
242
215
  """
243
- results = []
244
- for item in li:
245
- results.append(list(item))
246
- return results
216
+ return [list(i) for i in li]
247
217
 
248
218
  async def get_genes_and_alt_acs(
249
219
  self,
250
220
  pos: int,
251
- strand: Optional[Strand] = None,
252
- chromosome: Optional[int] = None,
253
- alt_ac: Optional[str] = None,
254
- gene: Optional[str] = None,
255
- ) -> Tuple[Optional[Dict], Optional[str]]:
221
+ strand: Strand | None = None,
222
+ chromosome: int | None = None,
223
+ alt_ac: str | None = None,
224
+ gene: str | None = None,
225
+ ) -> tuple[dict | None, str | None]:
256
226
  """Return genes and genomic accessions for a position on a chromosome or alt_ac
257
227
 
258
228
  :param pos: Genomic position
@@ -261,7 +231,7 @@ class UtaDatabase:
261
231
  (i.e. ``1`` or ``X``). If not provided, must provide ``alt_ac``.
262
232
  If ``alt_ac`` is also provided, ``alt_ac`` will be used.
263
233
  :param alt_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided,
264
- must provide ``chromosome. If ``chromosome`` is also provided, ``alt_ac``
234
+ must provide ``chromosome``. If ``chromosome`` is also provided, ``alt_ac``
265
235
  will be used.
266
236
  :param gene: Gene symbol
267
237
  :return: Dictionary containing genes and genomic accessions and warnings if found
@@ -309,8 +279,8 @@ class UtaDatabase:
309
279
  return {"genes": genes, "alt_acs": alt_acs}, None
310
280
 
311
281
  async def get_tx_exons(
312
- self, tx_ac: str, alt_ac: Optional[str] = None
313
- ) -> Tuple[Optional[List[Tuple[int, int]]], Optional[str]]:
282
+ self, tx_ac: str, alt_ac: str | None = None
283
+ ) -> tuple[list[tuple[int, int]] | None, str | None]:
314
284
  """Get list of transcript exons start/end coordinates.
315
285
 
316
286
  :param tx_ac: Transcript accession
@@ -343,7 +313,7 @@ class UtaDatabase:
343
313
 
344
314
  if not result:
345
315
  msg = f"Unable to get exons for {tx_ac}"
346
- logger.warning(msg)
316
+ _logger.warning(msg)
347
317
  return None, msg
348
318
  tx_exons = [(r["tx_start_i"], r["tx_end_i"]) for r in result]
349
319
  return tx_exons, None
@@ -352,7 +322,7 @@ class UtaDatabase:
352
322
  self,
353
323
  tx_ac: str,
354
324
  alt_ac: str,
355
- ) -> Tuple[Optional[Tuple[int, int, int, int, int]], Optional[str]]:
325
+ ) -> tuple[tuple[int, int, int, int, int] | None, str | None]:
356
326
  """Get exon number, transcript coordinates, and genomic coordinates
357
327
 
358
328
  :param tx_ac: Transcript accession
@@ -370,7 +340,7 @@ class UtaDatabase:
370
340
 
371
341
  if not result:
372
342
  msg = f"Unable to get exons and genomic coordinates for {tx_ac} on {alt_ac}"
373
- logger.warning(msg)
343
+ _logger.warning(msg)
374
344
  return None, msg
375
345
  tx_exons_genomic_coords = [
376
346
  (r["ord"], r["tx_start_i"], r["tx_end_i"], r["alt_start_i"], r["alt_end_i"])
@@ -379,8 +349,8 @@ class UtaDatabase:
379
349
  return tx_exons_genomic_coords, None
380
350
 
381
351
  async def get_alt_ac_start_or_end(
382
- self, tx_ac: str, tx_exon_start: int, tx_exon_end: int, gene: Optional[str]
383
- ) -> Tuple[Optional[Tuple[str, str, int, int, int]], Optional[str]]:
352
+ self, tx_ac: str, tx_exon_start: int, tx_exon_end: int, gene: str | None
353
+ ) -> tuple[tuple[str, str, int, int, int] | None, str | None]:
384
354
  """Get genomic data for related transcript exon start or end.
385
355
 
386
356
  :param tx_ac: Transcript accession
@@ -415,12 +385,12 @@ class UtaDatabase:
415
385
  )
416
386
  if gene_query:
417
387
  msg += f" on gene {gene}"
418
- logger.warning(msg)
388
+ _logger.warning(msg)
419
389
  return None, msg
420
390
  result = result[0]
421
391
  return (result[0], result[1], result[2], result[3], result[4]), None
422
392
 
423
- async def get_cds_start_end(self, tx_ac: str) -> Optional[Tuple[int, int]]:
393
+ async def get_cds_start_end(self, tx_ac: str) -> tuple[int, int] | None:
424
394
  """Get coding start and end site
425
395
 
426
396
  :param tx_ac: Transcript accession
@@ -439,12 +409,12 @@ class UtaDatabase:
439
409
  if cds_start_end[0] is not None and cds_start_end[1] is not None: # noqa: RET503
440
410
  return cds_start_end[0], cds_start_end[1]
441
411
  else:
442
- logger.warning(
412
+ _logger.warning(
443
413
  "Unable to get coding start/end site for accession: %s", tx_ac
444
414
  )
445
415
  return None
446
416
 
447
- async def get_newest_assembly_ac(self, ac: str) -> List[str]:
417
+ async def get_newest_assembly_ac(self, ac: str) -> list[str]:
448
418
  """Find accession associated to latest genomic assembly
449
419
 
450
420
  :param ac: Accession
@@ -489,7 +459,7 @@ class UtaDatabase:
489
459
  result = await self.execute_query(query)
490
460
  return result[0][0]
491
461
 
492
- async def get_ac_descr(self, ac: str) -> Optional[str]:
462
+ async def get_ac_descr(self, ac: str) -> str | None:
493
463
  """Return accession description. This is typically available only for accessions
494
464
  from older (pre-GRCh38) builds.
495
465
 
@@ -512,7 +482,7 @@ class UtaDatabase:
512
482
  """ # noqa: S608
513
483
  result = await self.execute_query(query)
514
484
  if not result:
515
- logger.warning("Accession %s does not have a description", ac)
485
+ _logger.warning("Accession %s does not have a description", ac)
516
486
  return None
517
487
  result = result[0][0]
518
488
  if result == "":
@@ -524,10 +494,10 @@ class UtaDatabase:
524
494
  tx_ac: str,
525
495
  start_pos: int,
526
496
  end_pos: int,
527
- alt_ac: Optional[str] = None,
497
+ alt_ac: str | None = None,
528
498
  use_tx_pos: bool = True,
529
499
  like_tx_ac: bool = False,
530
- ) -> List:
500
+ ) -> list:
531
501
  """Return queried data from tx_exon_aln_v table.
532
502
 
533
503
  :param tx_ac: accession on c. coordinate
@@ -584,21 +554,18 @@ class UtaDatabase:
584
554
  """ # noqa: S608
585
555
  result = await self.execute_query(query)
586
556
  if not result:
587
- logger.warning("Unable to find transcript alignment for query: %s", query)
557
+ _logger.warning("Unable to find transcript alignment for query: %s", query)
588
558
  return []
589
559
  if alt_ac and not use_tx_pos and len(result) > 1:
590
- logger.debug(
560
+ _logger.debug(
591
561
  "Found more than one match for tx_ac %s and alt_ac = %s",
592
562
  temp_ac,
593
563
  alt_ac,
594
564
  )
595
- results = []
596
- for r in result:
597
- results.append(list(r))
598
- return results
565
+ return [list(r) for r in result]
599
566
 
600
567
  @staticmethod
601
- def data_from_result(result: List) -> Optional[Dict]:
568
+ def data_from_result(result: list) -> dict | None:
602
569
  """Return data found from result.
603
570
 
604
571
  :param result: Data from tx_exon_aln_v table
@@ -613,7 +580,7 @@ class UtaDatabase:
613
580
  alt_exon_id = result[10]
614
581
 
615
582
  if (tx_pos_range[1] - tx_pos_range[0]) != (alt_pos_range[1] - alt_pos_range[0]):
616
- logger.warning(
583
+ _logger.warning(
617
584
  "tx_pos_range %s is not the same length as alt_pos_range %s.",
618
585
  tx_pos_range,
619
586
  alt_pos_range,
@@ -631,8 +598,8 @@ class UtaDatabase:
631
598
  }
632
599
 
633
600
  async def get_mane_c_genomic_data(
634
- self, ac: str, alt_ac: Optional[str], start_pos: int, end_pos: int
635
- ) -> Optional[Dict]:
601
+ self, ac: str, alt_ac: str | None, start_pos: int, end_pos: int
602
+ ) -> dict | None:
636
603
  """Get MANE transcript and genomic data. Used when going from g. to MANE c.
637
604
  representation.
638
605
 
@@ -671,7 +638,7 @@ class UtaDatabase:
671
638
 
672
639
  coding_start_site = await self.get_cds_start_end(ac)
673
640
  if coding_start_site is None:
674
- logger.warning("Accession %s not found in UTA", ac)
641
+ _logger.warning("Accession %s not found in UTA", ac)
675
642
  return None
676
643
 
677
644
  data["tx_ac"] = result[1]
@@ -697,13 +664,12 @@ class UtaDatabase:
697
664
  async def get_genomic_tx_data(
698
665
  self,
699
666
  tx_ac: str,
700
- pos: Tuple[int, int],
701
- annotation_layer: Union[
702
- AnnotationLayer.CDNA, AnnotationLayer.GENOMIC
703
- ] = AnnotationLayer.CDNA,
704
- alt_ac: Optional[str] = None,
667
+ pos: tuple[int, int],
668
+ annotation_layer: Literal[AnnotationLayer.CDNA]
669
+ | Literal[AnnotationLayer.GENOMIC] = AnnotationLayer.CDNA,
670
+ alt_ac: str | None = None,
705
671
  target_genome_assembly: Assembly = Assembly.GRCH38,
706
- ) -> Optional[Dict]:
672
+ ) -> dict | None:
707
673
  """Get transcript mapping to genomic data.
708
674
 
709
675
  :param tx_ac: Accession on c. coordinate
@@ -760,7 +726,7 @@ class UtaDatabase:
760
726
 
761
727
  return data
762
728
 
763
- async def get_ac_from_gene(self, gene: str) -> List[str]:
729
+ async def get_ac_from_gene(self, gene: str) -> list[str]:
764
730
  """Return genomic accession(s) associated to a gene.
765
731
 
766
732
  :param gene: Gene symbol
@@ -784,14 +750,16 @@ class UtaDatabase:
784
750
 
785
751
  async def get_gene_from_ac(
786
752
  self, ac: str, start_pos: int, end_pos: int
787
- ) -> Optional[List[str]]:
753
+ ) -> list[str] | None:
788
754
  """Get gene(s) within the provided coordinate range
789
755
 
790
756
  >>> import asyncio
791
757
  >>> from cool_seq_tool.sources import UtaDatabase
792
758
  >>> async def get_gene():
793
759
  ... uta_db = await UtaDatabase.create()
794
- ... result = await uta_db.get_gene_from_ac("NC_000017.11", 43044296, 43045802)
760
+ ... result = await uta_db.get_gene_from_ac(
761
+ ... "NC_000017.11", 43044296, 43045802
762
+ ... )
795
763
  ... return result
796
764
  >>> asyncio.run(get_gene())
797
765
  ['BRCA1']
@@ -812,12 +780,12 @@ class UtaDatabase:
812
780
  """ # noqa: S608
813
781
  results = await self.execute_query(query)
814
782
  if not results:
815
- logger.warning(
783
+ _logger.warning(
816
784
  "Unable to find gene between %s and %s on %s", start_pos, end_pos, ac
817
785
  )
818
786
  return None
819
787
  if len(results) > 1:
820
- logger.info(
788
+ _logger.info(
821
789
  "Found more than one gene between %s and %s on %s",
822
790
  start_pos,
823
791
  end_pos,
@@ -828,11 +796,11 @@ class UtaDatabase:
828
796
 
829
797
  async def get_transcripts(
830
798
  self,
831
- start_pos: Optional[int] = None,
832
- end_pos: Optional[int] = None,
833
- gene: Optional[str] = None,
799
+ start_pos: int | None = None,
800
+ end_pos: int | None = None,
801
+ gene: str | None = None,
834
802
  use_tx_pos: bool = True,
835
- alt_ac: Optional[str] = None,
803
+ alt_ac: str | None = None,
836
804
  ) -> pl.DataFrame:
837
805
  """Get transcripts for a given ``gene`` or ``alt_ac`` related to optional positions.
838
806
 
@@ -901,16 +869,26 @@ class UtaDatabase:
901
869
  results = [
902
870
  (r["pro_ac"], r["tx_ac"], r["alt_ac"], r["cds_start_i"]) for r in results
903
871
  ]
904
- results_df = pl.DataFrame(results, schema=schema)
872
+ results_df = pl.DataFrame(results, schema=schema, orient="row")
905
873
  if results:
906
874
  results_df = results_df.unique()
907
875
  return results_df
908
876
 
909
- async def get_chr_assembly(self, ac: str) -> Optional[Tuple[str, str]]:
877
+ async def get_chr_assembly(self, ac: str) -> tuple[str, Assembly] | None:
910
878
  """Get chromosome and assembly for NC accession if not in GRCh38.
911
879
 
912
- :param ac: NC accession
913
- :return: Chromosome and Assembly accession is on
880
+ >>> import asyncio
881
+ >>> from cool_seq_tool.sources.uta_database import UtaDatabase
882
+ >>> uta_db = asyncio.run(UtaDatabase.create())
883
+ >>> result = asyncio.run(uta_db.get_chr_assembly("NC_000007.13"))
884
+ >>> result
885
+ ('chr7', <Assembly.GRCH37: 'GRCh37'>)
886
+
887
+ Returns ``None`` if unable to find (either unrecognized/invalid, or
888
+ a GRCh38 accession).
889
+
890
+ :param ac: RefSeq NC accession, eg ``"NC_000007.13"``
891
+ :return: Chromosome and assembly that accession is on, if available.
914
892
  """
915
893
  descr = await self.get_ac_descr(ac)
916
894
  if not descr:
@@ -920,136 +898,15 @@ class UtaDatabase:
920
898
  chromosome = f"chr{descr[0].split()[-1]}"
921
899
  assembly = f"GRCh{descr[1].split('.')[0].split('GRCh')[-1]}"
922
900
 
923
- if assembly not in ["GRCh37", "GRCh38"]:
924
- logger.warning(
925
- "Assembly not supported: %s. Only GRCh37 and GRCh38 are supported.",
926
- assembly,
927
- )
901
+ try:
902
+ assembly = Assembly(assembly)
903
+ except ValueError as e:
904
+ _logger.error(e)
928
905
  return None
929
906
 
930
907
  return chromosome, assembly
931
908
 
932
- async def liftover_to_38(self, genomic_tx_data: Dict) -> None:
933
- """Liftover genomic_tx_data to hg38 assembly.
934
-
935
- :param genomic_tx_data: Dictionary containing gene, nc_accession, alt_pos, and
936
- strand
937
- """
938
- descr = await self.get_chr_assembly(genomic_tx_data["alt_ac"])
939
- if descr is None:
940
- # already grch38
941
- return
942
- chromosome, _ = descr
943
-
944
- query = f"""
945
- SELECT DISTINCT alt_ac
946
- FROM {self.schema}.tx_exon_aln_v
947
- WHERE tx_ac = '{genomic_tx_data['tx_ac']}';
948
- """ # noqa: S608
949
- nc_acs = await self.execute_query(query)
950
- nc_acs = [nc_ac[0] for nc_ac in nc_acs]
951
- if nc_acs == [genomic_tx_data["alt_ac"]]:
952
- logger.warning(
953
- "UTA does not have GRCh38 assembly for %s",
954
- genomic_tx_data["alt_ac"].split(".")[0],
955
- )
956
- return
957
-
958
- # Get most recent assembly version position
959
- # Liftover range
960
- self._set_liftover(
961
- genomic_tx_data, "alt_pos_range", chromosome, Assembly.GRCH38
962
- )
963
-
964
- # Liftover changes range
965
- self._set_liftover(
966
- genomic_tx_data, "alt_pos_change_range", chromosome, Assembly.GRCH38
967
- )
968
-
969
- # Change alt_ac to most recent
970
- if genomic_tx_data["alt_ac"].startswith("EN"):
971
- order_by_cond = "ORDER BY alt_ac DESC;"
972
- else:
973
- order_by_cond = """
974
- ORDER BY CAST(SUBSTR(alt_ac, position('.' in alt_ac) + 1,
975
- LENGTH(alt_ac)) AS INT) DESC;
976
- """
977
- query = f"""
978
- SELECT alt_ac
979
- FROM {self.schema}.genomic
980
- WHERE alt_ac LIKE '{genomic_tx_data['alt_ac'].split('.')[0]}%'
981
- {order_by_cond}
982
- """ # noqa: S608
983
- nc_acs = await self.execute_query(query)
984
- genomic_tx_data["alt_ac"] = nc_acs[0][0]
985
-
986
- def get_liftover(
987
- self, chromosome: str, pos: int, liftover_to_assembly: Assembly
988
- ) -> Optional[Tuple[str, int]]:
989
- """Get new genome assembly data for a position on a chromosome.
990
-
991
- :param chromosome: The chromosome number. Must be prefixed with ``chr``
992
- :param pos: Position on the chromosome
993
- :param liftover_to_assembly: Assembly to liftover to
994
- :return: Target chromosome and target position for assembly
995
- """
996
- if not chromosome.startswith("chr"):
997
- logger.warning("`chromosome` must be prefixed with chr")
998
- return None
999
-
1000
- if liftover_to_assembly == Assembly.GRCH38:
1001
- liftover = self.liftover_37_to_38.convert_coordinate(chromosome, pos)
1002
- elif liftover_to_assembly == Assembly.GRCH37:
1003
- liftover = self.liftover_38_to_37.convert_coordinate(chromosome, pos)
1004
- else:
1005
- logger.warning("%s assembly not supported", liftover_to_assembly)
1006
- liftover = None
1007
-
1008
- if not liftover:
1009
- logger.warning("%s does not exist on %s", pos, chromosome)
1010
- return None
1011
- return liftover[0][:2]
1012
-
1013
- def _set_liftover(
1014
- self,
1015
- genomic_tx_data: Dict,
1016
- key: str,
1017
- chromosome: str,
1018
- liftover_to_assembly: Assembly,
1019
- ) -> None:
1020
- """Update genomic_tx_data to have coordinates for given assembly.
1021
-
1022
- :param genomic_tx_data: Dictionary containing gene, nc_accession, alt_pos, and
1023
- strand
1024
- :param key: Key to access coordinate positions
1025
- :param chromosome: Chromosome, must be prefixed with ``chr``
1026
- :param liftover_to_assembly: Assembly to liftover to
1027
- """
1028
- liftover_start_i = self.get_liftover(
1029
- chromosome, genomic_tx_data[key][0], liftover_to_assembly
1030
- )
1031
- if liftover_start_i is None:
1032
- logger.warning(
1033
- "Unable to liftover position %s on %s",
1034
- genomic_tx_data[key][0],
1035
- chromosome,
1036
- )
1037
- return
1038
-
1039
- liftover_end_i = self.get_liftover(
1040
- chromosome, genomic_tx_data[key][1], liftover_to_assembly
1041
- )
1042
- if liftover_end_i is None:
1043
- logger.warning(
1044
- "Unable to liftover position %s on %s",
1045
- genomic_tx_data[key][1],
1046
- chromosome,
1047
- )
1048
- return
1049
-
1050
- genomic_tx_data[key] = liftover_start_i[1], liftover_end_i[1]
1051
-
1052
- async def p_to_c_ac(self, p_ac: str) -> List[str]:
909
+ async def p_to_c_ac(self, p_ac: str) -> list[str]:
1053
910
  """Return cDNA reference sequence accession from protein reference sequence
1054
911
  accession (i.e. ``p.`` to ``c.`` in HGVS syntax)
1055
912
 
@@ -1079,7 +936,7 @@ class UtaDatabase:
1079
936
 
1080
937
  async def get_transcripts_from_genomic_pos(
1081
938
  self, alt_ac: str, g_pos: int
1082
- ) -> List[str]:
939
+ ) -> list[str]:
1083
940
  """Get transcripts associated to a genomic ac and position.
1084
941
 
1085
942
  :param alt_ac: Genomic accession
@@ -1100,7 +957,12 @@ class UtaDatabase:
1100
957
 
1101
958
  @staticmethod
1102
959
  def get_secret() -> str:
1103
- """Get secrets for UTA DB instances. Used for deployment on AWS."""
960
+ """Get secrets for UTA DB instances. Used for deployment on AWS.
961
+
962
+ :raises ClientError: If unable to retrieve secret value due to decryption
963
+ decryption failure, internal service error, invalid parameter, invalid
964
+ request, or resource not found.
965
+ """
1104
966
  secret_name = environ["UTA_DB_SECRET"]
1105
967
  region_name = "us-east-2"
1106
968
 
@@ -1111,27 +973,12 @@ class UtaDatabase:
1111
973
  try:
1112
974
  get_secret_value_response = client.get_secret_value(SecretId=secret_name)
1113
975
  except ClientError as e:
1114
- logger.warning(e)
1115
- if e.response["Error"]["Code"] in {
1116
- # Secrets Manager can"t decrypt the protected secret text using the provided KMS key.
1117
- "DecryptionFailureException",
1118
- # An error occurred on the server side.
1119
- "InternalServiceErrorException",
1120
- # You provided an invalid value for a parameter.
1121
- "InvalidParameterException",
1122
- # You provided a parameter value that is not valid for the current state of the resource.
1123
- "InvalidRequestException",
1124
- # We can"t find the resource that you asked for.
1125
- "ResourceNotFoundException",
1126
- }:
1127
- raise e
976
+ # For a list of exceptions thrown, see
977
+ # https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html
978
+ _logger.error(e)
979
+ raise e
1128
980
  else:
1129
- # Decrypts secret using the associated KMS CMK.
1130
- # Depending on whether the secret is a string or binary,
1131
- # one of these fields will be populated.
1132
- if "SecretString" in get_secret_value_response:
1133
- return get_secret_value_response["SecretString"]
1134
- return base64.b64decode(get_secret_value_response["SecretBinary"])
981
+ return get_secret_value_response["SecretString"]
1135
982
 
1136
983
 
1137
984
  class ParseResult(UrlLibParseResult):
@@ -1145,13 +992,13 @@ class ParseResult(UrlLibParseResult):
1145
992
  return super(ParseResult, cls).__new__(cls, *pr) # noqa: UP008
1146
993
 
1147
994
  @property
1148
- def database(self) -> Optional[str]:
995
+ def database(self) -> str | None:
1149
996
  """Create database property."""
1150
997
  path_elems = self.path.split("/")
1151
998
  return path_elems[1] if len(path_elems) > 1 else None
1152
999
 
1153
1000
  @property
1154
- def schema(self) -> Optional[str]:
1001
+ def schema(self) -> str | None:
1155
1002
  """Create schema property."""
1156
1003
  path_elems = self.path.split("/")
1157
1004
  return path_elems[2] if len(path_elems) > 2 else None