cool-seq-tool 0.4.0.dev3__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cool_seq_tool/__init__.py +7 -11
- cool_seq_tool/app.py +44 -24
- cool_seq_tool/handlers/__init__.py +1 -0
- cool_seq_tool/handlers/seqrepo_access.py +27 -25
- cool_seq_tool/mappers/__init__.py +3 -1
- cool_seq_tool/mappers/alignment.py +5 -6
- cool_seq_tool/mappers/exon_genomic_coords.py +139 -124
- cool_seq_tool/mappers/liftover.py +90 -0
- cool_seq_tool/mappers/mane_transcript.py +208 -113
- cool_seq_tool/resources/__init__.py +1 -0
- cool_seq_tool/resources/data_files.py +93 -0
- cool_seq_tool/resources/status.py +153 -0
- cool_seq_tool/schemas.py +92 -54
- cool_seq_tool/sources/__init__.py +1 -0
- cool_seq_tool/sources/mane_transcript_mappings.py +16 -9
- cool_seq_tool/sources/transcript_mappings.py +41 -32
- cool_seq_tool/sources/uta_database.py +96 -249
- cool_seq_tool/utils.py +44 -4
- {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.5.0.dist-info}/LICENSE +1 -1
- {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.5.0.dist-info}/METADATA +16 -11
- cool_seq_tool-0.5.0.dist-info/RECORD +24 -0
- {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.5.0.dist-info}/WHEEL +1 -1
- cool_seq_tool/api.py +0 -42
- cool_seq_tool/data/__init__.py +0 -2
- cool_seq_tool/data/data_downloads.py +0 -89
- cool_seq_tool/paths.py +0 -28
- cool_seq_tool/routers/__init__.py +0 -16
- cool_seq_tool/routers/default.py +0 -125
- cool_seq_tool/routers/mane.py +0 -98
- cool_seq_tool/routers/mappings.py +0 -155
- cool_seq_tool/version.py +0 -2
- cool_seq_tool-0.4.0.dev3.dist-info/RECORD +0 -29
- /cool_seq_tool/{data → resources}/transcript_mapping.tsv +0 -0
- {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.5.0.dist-info}/top_level.txt +0 -0
@@ -1,16 +1,15 @@
|
|
1
1
|
"""Provide transcript lookup and metadata tools via the UTA database."""
|
2
|
+
|
2
3
|
import ast
|
3
|
-
import base64
|
4
4
|
import logging
|
5
5
|
from os import environ
|
6
|
-
from typing import Any,
|
6
|
+
from typing import Any, Literal, TypeVar
|
7
7
|
from urllib.parse import ParseResult as UrlLibParseResult
|
8
8
|
from urllib.parse import quote, unquote, urlparse
|
9
9
|
|
10
10
|
import asyncpg
|
11
11
|
import boto3
|
12
12
|
import polars as pl
|
13
|
-
from agct import Converter, Genome
|
14
13
|
from asyncpg.exceptions import InterfaceError, InvalidAuthorizationSpecificationError
|
15
14
|
from botocore.exceptions import ClientError
|
16
15
|
|
@@ -19,15 +18,11 @@ from cool_seq_tool.schemas import AnnotationLayer, Assembly, Strand
|
|
19
18
|
# use `bound` to upper-bound UtaDatabase or child classes
|
20
19
|
UTADatabaseType = TypeVar("UTADatabaseType", bound="UtaDatabase")
|
21
20
|
|
22
|
-
# Environment variables for paths to chain files for agct
|
23
|
-
LIFTOVER_CHAIN_37_TO_38 = environ.get("LIFTOVER_CHAIN_37_TO_38")
|
24
|
-
LIFTOVER_CHAIN_38_TO_37 = environ.get("LIFTOVER_CHAIN_38_TO_37")
|
25
|
-
|
26
21
|
UTA_DB_URL = environ.get(
|
27
|
-
"UTA_DB_URL", "postgresql://uta_admin:uta@localhost:
|
22
|
+
"UTA_DB_URL", "postgresql://uta_admin:uta@localhost:5432/uta/uta_20210129b"
|
28
23
|
)
|
29
24
|
|
30
|
-
|
25
|
+
_logger = logging.getLogger(__name__)
|
31
26
|
|
32
27
|
|
33
28
|
class UtaDatabase:
|
@@ -43,25 +38,12 @@ class UtaDatabase:
|
|
43
38
|
>>> uta_db = asyncio.run(UtaDatabase.create())
|
44
39
|
"""
|
45
40
|
|
46
|
-
def __init__(
|
47
|
-
self,
|
48
|
-
db_url: str = UTA_DB_URL,
|
49
|
-
chain_file_37_to_38: Optional[str] = None,
|
50
|
-
chain_file_38_to_37: Optional[str] = None,
|
51
|
-
) -> None:
|
41
|
+
def __init__(self, db_url: str = UTA_DB_URL) -> None:
|
52
42
|
"""Initialize DB class. Should only be used by ``create()`` method, and not
|
53
43
|
be called directly by a user.
|
54
44
|
|
55
45
|
:param db_url: PostgreSQL connection URL
|
56
46
|
Format: ``driver://user:password@host/database/schema``
|
57
|
-
:param chain_file_37_to_38: Optional path to chain file for 37 to 38 assembly.
|
58
|
-
This is used for ``agct``. If this is not provided, will check to see
|
59
|
-
if ``LIFTOVER_CHAIN_37_TO_38`` env var is set. If neither is provided, will
|
60
|
-
allow ``agct`` to download a chain file from UCSC
|
61
|
-
:param chain_file_38_to_37: Optional path to chain file for 38 to 37 assembly.
|
62
|
-
This is used for ``agct``. If this is not provided, will check to see
|
63
|
-
if ``LIFTOVER_CHAIN_38_TO_37`` env var is set. If neither is provided, will
|
64
|
-
allow ``agct`` to download a chain file from UCSC
|
65
47
|
"""
|
66
48
|
self.schema = None
|
67
49
|
self._connection_pool = None
|
@@ -69,19 +51,7 @@ class UtaDatabase:
|
|
69
51
|
self.db_url = db_url.replace(original_pwd, quote(original_pwd))
|
70
52
|
self.args = self._get_conn_args()
|
71
53
|
|
72
|
-
|
73
|
-
if chain_file_37_to_38:
|
74
|
-
self.liftover_37_to_38 = Converter(chainfile=chain_file_37_to_38)
|
75
|
-
else:
|
76
|
-
self.liftover_37_to_38 = Converter(from_db=Genome.HG19, to_db=Genome.HG38)
|
77
|
-
|
78
|
-
chain_file_38_to_37 = chain_file_38_to_37 or LIFTOVER_CHAIN_38_TO_37
|
79
|
-
if chain_file_38_to_37:
|
80
|
-
self.liftover_38_to_37 = Converter(chainfile=chain_file_38_to_37)
|
81
|
-
else:
|
82
|
-
self.liftover_38_to_37 = Converter(from_db=Genome.HG38, to_db=Genome.HG19)
|
83
|
-
|
84
|
-
def _get_conn_args(self) -> Dict:
|
54
|
+
def _get_conn_args(self) -> dict:
|
85
55
|
"""Return connection arguments.
|
86
56
|
|
87
57
|
:param db_url: raw connection URL
|
@@ -99,9 +69,9 @@ class UtaDatabase:
|
|
99
69
|
self.schema = schema
|
100
70
|
|
101
71
|
environ["PGPASSWORD"] = password
|
102
|
-
environ[
|
103
|
-
"
|
104
|
-
|
72
|
+
environ["UTA_DB_URL"] = (
|
73
|
+
f"postgresql://{username}@{host}:{port}/{database}/{schema}"
|
74
|
+
)
|
105
75
|
return {
|
106
76
|
"host": host,
|
107
77
|
"port": int(port),
|
@@ -137,7 +107,7 @@ class UtaDatabase:
|
|
137
107
|
database=self.args["database"],
|
138
108
|
)
|
139
109
|
except InterfaceError as e:
|
140
|
-
|
110
|
+
_logger.error(
|
141
111
|
"While creating connection pool, encountered exception %s", e
|
142
112
|
)
|
143
113
|
msg = "Could not create connection pool"
|
@@ -145,7 +115,7 @@ class UtaDatabase:
|
|
145
115
|
|
146
116
|
@classmethod
|
147
117
|
async def create(
|
148
|
-
cls:
|
118
|
+
cls: type[UTADatabaseType], db_url: str = UTA_DB_URL
|
149
119
|
) -> UTADatabaseType:
|
150
120
|
"""Manufacture a fully-initialized class instance (a la factory pattern). This
|
151
121
|
method should be used instead of calling the class directly to create a new
|
@@ -173,7 +143,10 @@ class UtaDatabase:
|
|
173
143
|
"""
|
174
144
|
|
175
145
|
async def _execute_query(q: str) -> Any: # noqa: ANN401
|
176
|
-
async with
|
146
|
+
async with (
|
147
|
+
self._connection_pool.acquire() as connection,
|
148
|
+
connection.transaction(),
|
149
|
+
):
|
177
150
|
return await connection.fetch(q)
|
178
151
|
|
179
152
|
if not self._connection_pool:
|
@@ -197,7 +170,7 @@ class UtaDatabase:
|
|
197
170
|
genomic_table_exists = await self.execute_query(check_table_exists)
|
198
171
|
genomic_table_exists = genomic_table_exists[0].get("exists")
|
199
172
|
if genomic_table_exists is None:
|
200
|
-
|
173
|
+
_logger.critical(
|
201
174
|
"SELECT EXISTS query in UtaDatabase._create_genomic_table "
|
202
175
|
"returned invalid response"
|
203
176
|
)
|
@@ -234,25 +207,22 @@ class UtaDatabase:
|
|
234
207
|
await self.execute_query(create_index)
|
235
208
|
|
236
209
|
@staticmethod
|
237
|
-
def _transform_list(li:
|
210
|
+
def _transform_list(li: list) -> list[list[Any]]:
|
238
211
|
"""Transform list to only contain field values
|
239
212
|
|
240
213
|
:param li: List of asyncpg.Record objects
|
241
214
|
:return: List of list of objects
|
242
215
|
"""
|
243
|
-
|
244
|
-
for item in li:
|
245
|
-
results.append(list(item))
|
246
|
-
return results
|
216
|
+
return [list(i) for i in li]
|
247
217
|
|
248
218
|
async def get_genes_and_alt_acs(
|
249
219
|
self,
|
250
220
|
pos: int,
|
251
|
-
strand:
|
252
|
-
chromosome:
|
253
|
-
alt_ac:
|
254
|
-
gene:
|
255
|
-
) ->
|
221
|
+
strand: Strand | None = None,
|
222
|
+
chromosome: int | None = None,
|
223
|
+
alt_ac: str | None = None,
|
224
|
+
gene: str | None = None,
|
225
|
+
) -> tuple[dict | None, str | None]:
|
256
226
|
"""Return genes and genomic accessions for a position on a chromosome or alt_ac
|
257
227
|
|
258
228
|
:param pos: Genomic position
|
@@ -261,7 +231,7 @@ class UtaDatabase:
|
|
261
231
|
(e.g. ``1`` or ``X``). If not provided, must provide ``alt_ac``.
|
262
232
|
If ``alt_ac`` is also provided, ``alt_ac`` will be used.
|
263
233
|
:param alt_ac: Genomic accession (e.g. ``NC_000001.11``). If not provided,
|
264
|
-
must provide ``chromosome
|
234
|
+
must provide ``chromosome``. If ``chromosome`` is also provided, ``alt_ac``
|
265
235
|
will be used.
|
266
236
|
:param gene: Gene symbol
|
267
237
|
:return: Dictionary containing genes and genomic accessions and warnings if found
|
@@ -309,8 +279,8 @@ class UtaDatabase:
|
|
309
279
|
return {"genes": genes, "alt_acs": alt_acs}, None
|
310
280
|
|
311
281
|
async def get_tx_exons(
|
312
|
-
self, tx_ac: str, alt_ac:
|
313
|
-
) ->
|
282
|
+
self, tx_ac: str, alt_ac: str | None = None
|
283
|
+
) -> tuple[list[tuple[int, int]] | None, str | None]:
|
314
284
|
"""Get list of transcript exons start/end coordinates.
|
315
285
|
|
316
286
|
:param tx_ac: Transcript accession
|
@@ -343,7 +313,7 @@ class UtaDatabase:
|
|
343
313
|
|
344
314
|
if not result:
|
345
315
|
msg = f"Unable to get exons for {tx_ac}"
|
346
|
-
|
316
|
+
_logger.warning(msg)
|
347
317
|
return None, msg
|
348
318
|
tx_exons = [(r["tx_start_i"], r["tx_end_i"]) for r in result]
|
349
319
|
return tx_exons, None
|
@@ -352,7 +322,7 @@ class UtaDatabase:
|
|
352
322
|
self,
|
353
323
|
tx_ac: str,
|
354
324
|
alt_ac: str,
|
355
|
-
) ->
|
325
|
+
) -> tuple[tuple[int, int, int, int, int] | None, str | None]:
|
356
326
|
"""Get exon number, transcript coordinates, and genomic coordinates
|
357
327
|
|
358
328
|
:param tx_ac: Transcript accession
|
@@ -370,7 +340,7 @@ class UtaDatabase:
|
|
370
340
|
|
371
341
|
if not result:
|
372
342
|
msg = f"Unable to get exons and genomic coordinates for {tx_ac} on {alt_ac}"
|
373
|
-
|
343
|
+
_logger.warning(msg)
|
374
344
|
return None, msg
|
375
345
|
tx_exons_genomic_coords = [
|
376
346
|
(r["ord"], r["tx_start_i"], r["tx_end_i"], r["alt_start_i"], r["alt_end_i"])
|
@@ -379,8 +349,8 @@ class UtaDatabase:
|
|
379
349
|
return tx_exons_genomic_coords, None
|
380
350
|
|
381
351
|
async def get_alt_ac_start_or_end(
|
382
|
-
self, tx_ac: str, tx_exon_start: int, tx_exon_end: int, gene:
|
383
|
-
) ->
|
352
|
+
self, tx_ac: str, tx_exon_start: int, tx_exon_end: int, gene: str | None
|
353
|
+
) -> tuple[tuple[str, str, int, int, int] | None, str | None]:
|
384
354
|
"""Get genomic data for related transcript exon start or end.
|
385
355
|
|
386
356
|
:param tx_ac: Transcript accession
|
@@ -415,12 +385,12 @@ class UtaDatabase:
|
|
415
385
|
)
|
416
386
|
if gene_query:
|
417
387
|
msg += f" on gene {gene}"
|
418
|
-
|
388
|
+
_logger.warning(msg)
|
419
389
|
return None, msg
|
420
390
|
result = result[0]
|
421
391
|
return (result[0], result[1], result[2], result[3], result[4]), None
|
422
392
|
|
423
|
-
async def get_cds_start_end(self, tx_ac: str) ->
|
393
|
+
async def get_cds_start_end(self, tx_ac: str) -> tuple[int, int] | None:
|
424
394
|
"""Get coding start and end site
|
425
395
|
|
426
396
|
:param tx_ac: Transcript accession
|
@@ -439,12 +409,12 @@ class UtaDatabase:
|
|
439
409
|
if cds_start_end[0] is not None and cds_start_end[1] is not None: # noqa: RET503
|
440
410
|
return cds_start_end[0], cds_start_end[1]
|
441
411
|
else:
|
442
|
-
|
412
|
+
_logger.warning(
|
443
413
|
"Unable to get coding start/end site for accession: %s", tx_ac
|
444
414
|
)
|
445
415
|
return None
|
446
416
|
|
447
|
-
async def get_newest_assembly_ac(self, ac: str) ->
|
417
|
+
async def get_newest_assembly_ac(self, ac: str) -> list[str]:
|
448
418
|
"""Find accession associated to latest genomic assembly
|
449
419
|
|
450
420
|
:param ac: Accession
|
@@ -489,7 +459,7 @@ class UtaDatabase:
|
|
489
459
|
result = await self.execute_query(query)
|
490
460
|
return result[0][0]
|
491
461
|
|
492
|
-
async def get_ac_descr(self, ac: str) ->
|
462
|
+
async def get_ac_descr(self, ac: str) -> str | None:
|
493
463
|
"""Return accession description. This is typically available only for accessions
|
494
464
|
from older (pre-GRCh38) builds.
|
495
465
|
|
@@ -512,7 +482,7 @@ class UtaDatabase:
|
|
512
482
|
""" # noqa: S608
|
513
483
|
result = await self.execute_query(query)
|
514
484
|
if not result:
|
515
|
-
|
485
|
+
_logger.warning("Accession %s does not have a description", ac)
|
516
486
|
return None
|
517
487
|
result = result[0][0]
|
518
488
|
if result == "":
|
@@ -524,10 +494,10 @@ class UtaDatabase:
|
|
524
494
|
tx_ac: str,
|
525
495
|
start_pos: int,
|
526
496
|
end_pos: int,
|
527
|
-
alt_ac:
|
497
|
+
alt_ac: str | None = None,
|
528
498
|
use_tx_pos: bool = True,
|
529
499
|
like_tx_ac: bool = False,
|
530
|
-
) ->
|
500
|
+
) -> list:
|
531
501
|
"""Return queried data from tx_exon_aln_v table.
|
532
502
|
|
533
503
|
:param tx_ac: accession on c. coordinate
|
@@ -584,21 +554,18 @@ class UtaDatabase:
|
|
584
554
|
""" # noqa: S608
|
585
555
|
result = await self.execute_query(query)
|
586
556
|
if not result:
|
587
|
-
|
557
|
+
_logger.warning("Unable to find transcript alignment for query: %s", query)
|
588
558
|
return []
|
589
559
|
if alt_ac and not use_tx_pos and len(result) > 1:
|
590
|
-
|
560
|
+
_logger.debug(
|
591
561
|
"Found more than one match for tx_ac %s and alt_ac = %s",
|
592
562
|
temp_ac,
|
593
563
|
alt_ac,
|
594
564
|
)
|
595
|
-
|
596
|
-
for r in result:
|
597
|
-
results.append(list(r))
|
598
|
-
return results
|
565
|
+
return [list(r) for r in result]
|
599
566
|
|
600
567
|
@staticmethod
|
601
|
-
def data_from_result(result:
|
568
|
+
def data_from_result(result: list) -> dict | None:
|
602
569
|
"""Return data found from result.
|
603
570
|
|
604
571
|
:param result: Data from tx_exon_aln_v table
|
@@ -613,7 +580,7 @@ class UtaDatabase:
|
|
613
580
|
alt_exon_id = result[10]
|
614
581
|
|
615
582
|
if (tx_pos_range[1] - tx_pos_range[0]) != (alt_pos_range[1] - alt_pos_range[0]):
|
616
|
-
|
583
|
+
_logger.warning(
|
617
584
|
"tx_pos_range %s is not the same length as alt_pos_range %s.",
|
618
585
|
tx_pos_range,
|
619
586
|
alt_pos_range,
|
@@ -631,8 +598,8 @@ class UtaDatabase:
|
|
631
598
|
}
|
632
599
|
|
633
600
|
async def get_mane_c_genomic_data(
|
634
|
-
self, ac: str, alt_ac:
|
635
|
-
) ->
|
601
|
+
self, ac: str, alt_ac: str | None, start_pos: int, end_pos: int
|
602
|
+
) -> dict | None:
|
636
603
|
"""Get MANE transcript and genomic data. Used when going from g. to MANE c.
|
637
604
|
representation.
|
638
605
|
|
@@ -671,7 +638,7 @@ class UtaDatabase:
|
|
671
638
|
|
672
639
|
coding_start_site = await self.get_cds_start_end(ac)
|
673
640
|
if coding_start_site is None:
|
674
|
-
|
641
|
+
_logger.warning("Accession %s not found in UTA", ac)
|
675
642
|
return None
|
676
643
|
|
677
644
|
data["tx_ac"] = result[1]
|
@@ -697,13 +664,12 @@ class UtaDatabase:
|
|
697
664
|
async def get_genomic_tx_data(
|
698
665
|
self,
|
699
666
|
tx_ac: str,
|
700
|
-
pos:
|
701
|
-
annotation_layer:
|
702
|
-
|
703
|
-
|
704
|
-
alt_ac: Optional[str] = None,
|
667
|
+
pos: tuple[int, int],
|
668
|
+
annotation_layer: Literal[AnnotationLayer.CDNA]
|
669
|
+
| Literal[AnnotationLayer.GENOMIC] = AnnotationLayer.CDNA,
|
670
|
+
alt_ac: str | None = None,
|
705
671
|
target_genome_assembly: Assembly = Assembly.GRCH38,
|
706
|
-
) ->
|
672
|
+
) -> dict | None:
|
707
673
|
"""Get transcript mapping to genomic data.
|
708
674
|
|
709
675
|
:param tx_ac: Accession on c. coordinate
|
@@ -760,7 +726,7 @@ class UtaDatabase:
|
|
760
726
|
|
761
727
|
return data
|
762
728
|
|
763
|
-
async def get_ac_from_gene(self, gene: str) ->
|
729
|
+
async def get_ac_from_gene(self, gene: str) -> list[str]:
|
764
730
|
"""Return genomic accession(s) associated to a gene.
|
765
731
|
|
766
732
|
:param gene: Gene symbol
|
@@ -784,14 +750,16 @@ class UtaDatabase:
|
|
784
750
|
|
785
751
|
async def get_gene_from_ac(
|
786
752
|
self, ac: str, start_pos: int, end_pos: int
|
787
|
-
) ->
|
753
|
+
) -> list[str] | None:
|
788
754
|
"""Get gene(s) within the provided coordinate range
|
789
755
|
|
790
756
|
>>> import asyncio
|
791
757
|
>>> from cool_seq_tool.sources import UtaDatabase
|
792
758
|
>>> async def get_gene():
|
793
759
|
... uta_db = await UtaDatabase.create()
|
794
|
-
... result = await uta_db.get_gene_from_ac(
|
760
|
+
... result = await uta_db.get_gene_from_ac(
|
761
|
+
... "NC_000017.11", 43044296, 43045802
|
762
|
+
... )
|
795
763
|
... return result
|
796
764
|
>>> asyncio.run(get_gene())
|
797
765
|
['BRCA1']
|
@@ -812,12 +780,12 @@ class UtaDatabase:
|
|
812
780
|
""" # noqa: S608
|
813
781
|
results = await self.execute_query(query)
|
814
782
|
if not results:
|
815
|
-
|
783
|
+
_logger.warning(
|
816
784
|
"Unable to find gene between %s and %s on %s", start_pos, end_pos, ac
|
817
785
|
)
|
818
786
|
return None
|
819
787
|
if len(results) > 1:
|
820
|
-
|
788
|
+
_logger.info(
|
821
789
|
"Found more than one gene between %s and %s on %s",
|
822
790
|
start_pos,
|
823
791
|
end_pos,
|
@@ -828,11 +796,11 @@ class UtaDatabase:
|
|
828
796
|
|
829
797
|
async def get_transcripts(
|
830
798
|
self,
|
831
|
-
start_pos:
|
832
|
-
end_pos:
|
833
|
-
gene:
|
799
|
+
start_pos: int | None = None,
|
800
|
+
end_pos: int | None = None,
|
801
|
+
gene: str | None = None,
|
834
802
|
use_tx_pos: bool = True,
|
835
|
-
alt_ac:
|
803
|
+
alt_ac: str | None = None,
|
836
804
|
) -> pl.DataFrame:
|
837
805
|
"""Get transcripts for a given ``gene`` or ``alt_ac`` related to optional positions.
|
838
806
|
|
@@ -901,16 +869,26 @@ class UtaDatabase:
|
|
901
869
|
results = [
|
902
870
|
(r["pro_ac"], r["tx_ac"], r["alt_ac"], r["cds_start_i"]) for r in results
|
903
871
|
]
|
904
|
-
results_df = pl.DataFrame(results, schema=schema)
|
872
|
+
results_df = pl.DataFrame(results, schema=schema, orient="row")
|
905
873
|
if results:
|
906
874
|
results_df = results_df.unique()
|
907
875
|
return results_df
|
908
876
|
|
909
|
-
async def get_chr_assembly(self, ac: str) ->
|
877
|
+
async def get_chr_assembly(self, ac: str) -> tuple[str, Assembly] | None:
|
910
878
|
"""Get chromosome and assembly for NC accession if not in GRCh38.
|
911
879
|
|
912
|
-
|
913
|
-
|
880
|
+
>>> import asyncio
|
881
|
+
>>> from cool_seq_tool.sources.uta_database import UtaDatabase
|
882
|
+
>>> uta_db = asyncio.run(UtaDatabase.create())
|
883
|
+
>>> result = asyncio.run(uta_db.get_chr_assembly("NC_000007.13"))
|
884
|
+
>>> result
|
885
|
+
('chr7', <Assembly.GRCH37: 'GRCh37'>)
|
886
|
+
|
887
|
+
Returns ``None`` if unable to find (either unrecognized/invalid, or
|
888
|
+
a GRCh38 accession).
|
889
|
+
|
890
|
+
:param ac: RefSeq NC accession, eg ``"NC_000007.13"``
|
891
|
+
:return: Chromosome and assembly that accession is on, if available.
|
914
892
|
"""
|
915
893
|
descr = await self.get_ac_descr(ac)
|
916
894
|
if not descr:
|
@@ -920,136 +898,15 @@ class UtaDatabase:
|
|
920
898
|
chromosome = f"chr{descr[0].split()[-1]}"
|
921
899
|
assembly = f"GRCh{descr[1].split('.')[0].split('GRCh')[-1]}"
|
922
900
|
|
923
|
-
|
924
|
-
|
925
|
-
|
926
|
-
|
927
|
-
)
|
901
|
+
try:
|
902
|
+
assembly = Assembly(assembly)
|
903
|
+
except ValueError as e:
|
904
|
+
_logger.error(e)
|
928
905
|
return None
|
929
906
|
|
930
907
|
return chromosome, assembly
|
931
908
|
|
932
|
-
async def
|
933
|
-
"""Liftover genomic_tx_data to hg38 assembly.
|
934
|
-
|
935
|
-
:param genomic_tx_data: Dictionary containing gene, nc_accession, alt_pos, and
|
936
|
-
strand
|
937
|
-
"""
|
938
|
-
descr = await self.get_chr_assembly(genomic_tx_data["alt_ac"])
|
939
|
-
if descr is None:
|
940
|
-
# already grch38
|
941
|
-
return
|
942
|
-
chromosome, _ = descr
|
943
|
-
|
944
|
-
query = f"""
|
945
|
-
SELECT DISTINCT alt_ac
|
946
|
-
FROM {self.schema}.tx_exon_aln_v
|
947
|
-
WHERE tx_ac = '{genomic_tx_data['tx_ac']}';
|
948
|
-
""" # noqa: S608
|
949
|
-
nc_acs = await self.execute_query(query)
|
950
|
-
nc_acs = [nc_ac[0] for nc_ac in nc_acs]
|
951
|
-
if nc_acs == [genomic_tx_data["alt_ac"]]:
|
952
|
-
logger.warning(
|
953
|
-
"UTA does not have GRCh38 assembly for %s",
|
954
|
-
genomic_tx_data["alt_ac"].split(".")[0],
|
955
|
-
)
|
956
|
-
return
|
957
|
-
|
958
|
-
# Get most recent assembly version position
|
959
|
-
# Liftover range
|
960
|
-
self._set_liftover(
|
961
|
-
genomic_tx_data, "alt_pos_range", chromosome, Assembly.GRCH38
|
962
|
-
)
|
963
|
-
|
964
|
-
# Liftover changes range
|
965
|
-
self._set_liftover(
|
966
|
-
genomic_tx_data, "alt_pos_change_range", chromosome, Assembly.GRCH38
|
967
|
-
)
|
968
|
-
|
969
|
-
# Change alt_ac to most recent
|
970
|
-
if genomic_tx_data["alt_ac"].startswith("EN"):
|
971
|
-
order_by_cond = "ORDER BY alt_ac DESC;"
|
972
|
-
else:
|
973
|
-
order_by_cond = """
|
974
|
-
ORDER BY CAST(SUBSTR(alt_ac, position('.' in alt_ac) + 1,
|
975
|
-
LENGTH(alt_ac)) AS INT) DESC;
|
976
|
-
"""
|
977
|
-
query = f"""
|
978
|
-
SELECT alt_ac
|
979
|
-
FROM {self.schema}.genomic
|
980
|
-
WHERE alt_ac LIKE '{genomic_tx_data['alt_ac'].split('.')[0]}%'
|
981
|
-
{order_by_cond}
|
982
|
-
""" # noqa: S608
|
983
|
-
nc_acs = await self.execute_query(query)
|
984
|
-
genomic_tx_data["alt_ac"] = nc_acs[0][0]
|
985
|
-
|
986
|
-
def get_liftover(
|
987
|
-
self, chromosome: str, pos: int, liftover_to_assembly: Assembly
|
988
|
-
) -> Optional[Tuple[str, int]]:
|
989
|
-
"""Get new genome assembly data for a position on a chromosome.
|
990
|
-
|
991
|
-
:param chromosome: The chromosome number. Must be prefixed with ``chr``
|
992
|
-
:param pos: Position on the chromosome
|
993
|
-
:param liftover_to_assembly: Assembly to liftover to
|
994
|
-
:return: Target chromosome and target position for assembly
|
995
|
-
"""
|
996
|
-
if not chromosome.startswith("chr"):
|
997
|
-
logger.warning("`chromosome` must be prefixed with chr")
|
998
|
-
return None
|
999
|
-
|
1000
|
-
if liftover_to_assembly == Assembly.GRCH38:
|
1001
|
-
liftover = self.liftover_37_to_38.convert_coordinate(chromosome, pos)
|
1002
|
-
elif liftover_to_assembly == Assembly.GRCH37:
|
1003
|
-
liftover = self.liftover_38_to_37.convert_coordinate(chromosome, pos)
|
1004
|
-
else:
|
1005
|
-
logger.warning("%s assembly not supported", liftover_to_assembly)
|
1006
|
-
liftover = None
|
1007
|
-
|
1008
|
-
if not liftover:
|
1009
|
-
logger.warning("%s does not exist on %s", pos, chromosome)
|
1010
|
-
return None
|
1011
|
-
return liftover[0][:2]
|
1012
|
-
|
1013
|
-
def _set_liftover(
|
1014
|
-
self,
|
1015
|
-
genomic_tx_data: Dict,
|
1016
|
-
key: str,
|
1017
|
-
chromosome: str,
|
1018
|
-
liftover_to_assembly: Assembly,
|
1019
|
-
) -> None:
|
1020
|
-
"""Update genomic_tx_data to have coordinates for given assembly.
|
1021
|
-
|
1022
|
-
:param genomic_tx_data: Dictionary containing gene, nc_accession, alt_pos, and
|
1023
|
-
strand
|
1024
|
-
:param key: Key to access coordinate positions
|
1025
|
-
:param chromosome: Chromosome, must be prefixed with ``chr``
|
1026
|
-
:param liftover_to_assembly: Assembly to liftover to
|
1027
|
-
"""
|
1028
|
-
liftover_start_i = self.get_liftover(
|
1029
|
-
chromosome, genomic_tx_data[key][0], liftover_to_assembly
|
1030
|
-
)
|
1031
|
-
if liftover_start_i is None:
|
1032
|
-
logger.warning(
|
1033
|
-
"Unable to liftover position %s on %s",
|
1034
|
-
genomic_tx_data[key][0],
|
1035
|
-
chromosome,
|
1036
|
-
)
|
1037
|
-
return
|
1038
|
-
|
1039
|
-
liftover_end_i = self.get_liftover(
|
1040
|
-
chromosome, genomic_tx_data[key][1], liftover_to_assembly
|
1041
|
-
)
|
1042
|
-
if liftover_end_i is None:
|
1043
|
-
logger.warning(
|
1044
|
-
"Unable to liftover position %s on %s",
|
1045
|
-
genomic_tx_data[key][1],
|
1046
|
-
chromosome,
|
1047
|
-
)
|
1048
|
-
return
|
1049
|
-
|
1050
|
-
genomic_tx_data[key] = liftover_start_i[1], liftover_end_i[1]
|
1051
|
-
|
1052
|
-
async def p_to_c_ac(self, p_ac: str) -> List[str]:
|
909
|
+
async def p_to_c_ac(self, p_ac: str) -> list[str]:
|
1053
910
|
"""Return cDNA reference sequence accession from protein reference sequence
|
1054
911
|
accession (i.e. ``p.`` to ``c.`` in HGVS syntax)
|
1055
912
|
|
@@ -1079,7 +936,7 @@ class UtaDatabase:
|
|
1079
936
|
|
1080
937
|
async def get_transcripts_from_genomic_pos(
|
1081
938
|
self, alt_ac: str, g_pos: int
|
1082
|
-
) ->
|
939
|
+
) -> list[str]:
|
1083
940
|
"""Get transcripts associated to a genomic ac and position.
|
1084
941
|
|
1085
942
|
:param alt_ac: Genomic accession
|
@@ -1100,7 +957,12 @@ class UtaDatabase:
|
|
1100
957
|
|
1101
958
|
@staticmethod
|
1102
959
|
def get_secret() -> str:
|
1103
|
-
"""Get secrets for UTA DB instances. Used for deployment on AWS.
|
960
|
+
"""Get secrets for UTA DB instances. Used for deployment on AWS.
|
961
|
+
|
962
|
+
:raises ClientError: If unable to retrieve secret value due to decryption
|
963
|
+
failure, internal service error, invalid parameter, invalid
|
964
|
+
request, or resource not found.
|
965
|
+
"""
|
1104
966
|
secret_name = environ["UTA_DB_SECRET"]
|
1105
967
|
region_name = "us-east-2"
|
1106
968
|
|
@@ -1111,27 +973,12 @@ class UtaDatabase:
|
|
1111
973
|
try:
|
1112
974
|
get_secret_value_response = client.get_secret_value(SecretId=secret_name)
|
1113
975
|
except ClientError as e:
|
1114
|
-
|
1115
|
-
|
1116
|
-
|
1117
|
-
|
1118
|
-
# An error occurred on the server side.
|
1119
|
-
"InternalServiceErrorException",
|
1120
|
-
# You provided an invalid value for a parameter.
|
1121
|
-
"InvalidParameterException",
|
1122
|
-
# You provided a parameter value that is not valid for the current state of the resource.
|
1123
|
-
"InvalidRequestException",
|
1124
|
-
# We can"t find the resource that you asked for.
|
1125
|
-
"ResourceNotFoundException",
|
1126
|
-
}:
|
1127
|
-
raise e
|
976
|
+
# For a list of exceptions thrown, see
|
977
|
+
# https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html
|
978
|
+
_logger.error(e)
|
979
|
+
raise e
|
1128
980
|
else:
|
1129
|
-
|
1130
|
-
# Depending on whether the secret is a string or binary,
|
1131
|
-
# one of these fields will be populated.
|
1132
|
-
if "SecretString" in get_secret_value_response:
|
1133
|
-
return get_secret_value_response["SecretString"]
|
1134
|
-
return base64.b64decode(get_secret_value_response["SecretBinary"])
|
981
|
+
return get_secret_value_response["SecretString"]
|
1135
982
|
|
1136
983
|
|
1137
984
|
class ParseResult(UrlLibParseResult):
|
@@ -1145,13 +992,13 @@ class ParseResult(UrlLibParseResult):
|
|
1145
992
|
return super(ParseResult, cls).__new__(cls, *pr) # noqa: UP008
|
1146
993
|
|
1147
994
|
@property
|
1148
|
-
def database(self) ->
|
995
|
+
def database(self) -> str | None:
|
1149
996
|
"""Create database property."""
|
1150
997
|
path_elems = self.path.split("/")
|
1151
998
|
return path_elems[1] if len(path_elems) > 1 else None
|
1152
999
|
|
1153
1000
|
@property
|
1154
|
-
def schema(self) ->
|
1001
|
+
def schema(self) -> str | None:
|
1155
1002
|
"""Create schema property."""
|
1156
1003
|
path_elems = self.path.split("/")
|
1157
1004
|
return path_elems[2] if len(path_elems) > 2 else None
|