cool-seq-tool 0.3.0.dev1__py3-none-any.whl → 0.4.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cool_seq_tool/api.py +3 -3
- cool_seq_tool/app.py +32 -11
- cool_seq_tool/data/data_downloads.py +8 -5
- cool_seq_tool/handlers/seqrepo_access.py +55 -27
- cool_seq_tool/mappers/__init__.py +4 -1
- cool_seq_tool/mappers/alignment.py +40 -37
- cool_seq_tool/mappers/exon_genomic_coords.py +329 -138
- cool_seq_tool/mappers/mane_transcript.py +402 -227
- cool_seq_tool/routers/mane.py +1 -1
- cool_seq_tool/routers/mappings.py +1 -1
- cool_seq_tool/schemas.py +31 -24
- cool_seq_tool/sources/__init__.py +4 -2
- cool_seq_tool/sources/mane_transcript_mappings.py +28 -7
- cool_seq_tool/sources/transcript_mappings.py +27 -11
- cool_seq_tool/sources/uta_database.py +179 -232
- cool_seq_tool/utils.py +22 -24
- cool_seq_tool/version.py +1 -1
- {cool_seq_tool-0.3.0.dev1.dist-info → cool_seq_tool-0.4.0.dev0.dist-info}/LICENSE +1 -1
- cool_seq_tool-0.4.0.dev0.dist-info/METADATA +130 -0
- cool_seq_tool-0.4.0.dev0.dist-info/RECORD +28 -0
- {cool_seq_tool-0.3.0.dev1.dist-info → cool_seq_tool-0.4.0.dev0.dist-info}/WHEEL +1 -1
- cool_seq_tool/data/transcript_mapping.tsv +0 -256226
- cool_seq_tool-0.3.0.dev1.dist-info/METADATA +0 -187
- cool_seq_tool-0.3.0.dev1.dist-info/RECORD +0 -29
- {cool_seq_tool-0.3.0.dev1.dist-info → cool_seq_tool-0.4.0.dev0.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,4 @@
|
|
1
|
-
"""
|
1
|
+
"""Provide transcript lookup and metadata tools via the UTA database."""
|
2
2
|
import ast
|
3
3
|
import base64
|
4
4
|
import logging
|
@@ -14,24 +14,34 @@ from asyncpg.exceptions import InterfaceError, InvalidAuthorizationSpecification
|
|
14
14
|
from botocore.exceptions import ClientError
|
15
15
|
from pyliftover import LiftOver
|
16
16
|
|
17
|
-
from cool_seq_tool.schemas import AnnotationLayer, Assembly
|
17
|
+
from cool_seq_tool.schemas import AnnotationLayer, Assembly, Strand
|
18
18
|
|
19
|
-
# use `bound` to upper-bound
|
20
|
-
UTADatabaseType = TypeVar("UTADatabaseType", bound="
|
19
|
+
# use `bound` to upper-bound UtaDatabase or child classes
|
20
|
+
UTADatabaseType = TypeVar("UTADatabaseType", bound="UtaDatabase")
|
21
21
|
|
22
22
|
# Environment variables for paths to chain files for pyliftover
|
23
23
|
LIFTOVER_CHAIN_37_TO_38 = environ.get("LIFTOVER_CHAIN_37_TO_38")
|
24
24
|
LIFTOVER_CHAIN_38_TO_37 = environ.get("LIFTOVER_CHAIN_38_TO_37")
|
25
25
|
|
26
26
|
UTA_DB_URL = environ.get(
|
27
|
-
"UTA_DB_URL", "postgresql://uta_admin:uta@localhost:5433/uta/
|
27
|
+
"UTA_DB_URL", "postgresql://uta_admin:uta@localhost:5433/uta/uta_20210129b"
|
28
28
|
)
|
29
29
|
|
30
30
|
logger = logging.getLogger(__name__)
|
31
31
|
|
32
32
|
|
33
|
-
class
|
34
|
-
"""
|
33
|
+
class UtaDatabase:
|
34
|
+
"""Provide transcript lookup and metadata tools via the Universal Transcript Archive
|
35
|
+
(UTA) database.
|
36
|
+
|
37
|
+
Users should use the ``create()`` method to construct a new instance. Note that
|
38
|
+
almost all public methods are defined as ``async`` -- see the :ref:`Usage section <async_note>`
|
39
|
+
for more information.
|
40
|
+
|
41
|
+
>>> import asyncio
|
42
|
+
>>> from cool_seq_tool.sources.uta_database import UtaDatabase
|
43
|
+
>>> uta_db = asyncio.run(UtaDatabase.create())
|
44
|
+
"""
|
35
45
|
|
36
46
|
def __init__(
|
37
47
|
self,
|
@@ -39,19 +49,19 @@ class UTADatabase:
|
|
39
49
|
chain_file_37_to_38: Optional[str] = None,
|
40
50
|
chain_file_38_to_37: Optional[str] = None,
|
41
51
|
) -> None:
|
42
|
-
"""Initialize DB class.
|
43
|
-
|
52
|
+
"""Initialize DB class. Should only be used by ``create()`` method, and not
|
53
|
+
be called directly by a user.
|
44
54
|
|
45
55
|
:param db_url: PostgreSQL connection URL
|
46
|
-
Format:
|
56
|
+
Format: ``driver://user:password@host/database/schema``
|
47
57
|
:param chain_file_37_to_38: Optional path to chain file for 37 to 38 assembly.
|
48
|
-
This is used for pyliftover
|
49
|
-
LIFTOVER_CHAIN_37_TO_38 env var is set. If neither is provided, will
|
50
|
-
pyliftover to download a chain file from UCSC
|
58
|
+
This is used for ``pyliftover``. If this is not provided, will check to see
|
59
|
+
if ``LIFTOVER_CHAIN_37_TO_38`` env var is set. If neither is provided, will
|
60
|
+
allow ``pyliftover`` to download a chain file from UCSC
|
51
61
|
:param chain_file_38_to_37: Optional path to chain file for 38 to 37 assembly.
|
52
|
-
This is used for pyliftover
|
53
|
-
LIFTOVER_CHAIN_38_TO_37 env var is set. If neither is provided, will
|
54
|
-
pyliftover to download a chain file from UCSC
|
62
|
+
This is used for ``pyliftover``. If this is not provided, will check to see
|
63
|
+
if ``LIFTOVER_CHAIN_38_TO_37`` env var is set. If neither is provided, will
|
64
|
+
allow ``pyliftover`` to download a chain file from UCSC
|
55
65
|
"""
|
56
66
|
self.schema = None
|
57
67
|
self._connection_pool = None
|
@@ -137,10 +147,17 @@ class UTADatabase:
|
|
137
147
|
async def create(
|
138
148
|
cls: Type[UTADatabaseType], db_url: str = UTA_DB_URL
|
139
149
|
) -> UTADatabaseType:
|
140
|
-
"""
|
141
|
-
|
142
|
-
|
143
|
-
|
150
|
+
"""Manufacture a fully-initialized class instance (a la factory pattern). This
|
151
|
+
method should be used instead of calling the class directly to create a new
|
152
|
+
instance.
|
153
|
+
|
154
|
+
>>> import asyncio
|
155
|
+
>>> from cool_seq_tool.sources.uta_database import UtaDatabase
|
156
|
+
>>> uta_db = asyncio.run(UtaDatabase.create())
|
157
|
+
|
158
|
+
:param cls: supplied implicitly
|
159
|
+
:param db_url: PostgreSQL connection URL
|
160
|
+
Format: ``driver://user:password@host/database/schema``
|
144
161
|
:return: UTA DB access class instance
|
145
162
|
"""
|
146
163
|
self = cls(db_url)
|
@@ -151,7 +168,7 @@ class UTADatabase:
|
|
151
168
|
async def execute_query(self, query: str) -> Any: # noqa: ANN401
|
152
169
|
"""Execute a query and return its result.
|
153
170
|
|
154
|
-
:param
|
171
|
+
:param query: Query to make on database
|
155
172
|
:return: Query's result
|
156
173
|
"""
|
157
174
|
|
@@ -185,7 +202,7 @@ class UTADatabase:
|
|
185
202
|
genomic_table_exists = genomic_table_exists[0].get("exists")
|
186
203
|
if genomic_table_exists is None:
|
187
204
|
logger.critical(
|
188
|
-
"SELECT EXISTS query in
|
205
|
+
"SELECT EXISTS query in UtaDatabase._create_genomic_table "
|
189
206
|
"returned invalid response"
|
190
207
|
)
|
191
208
|
raise ValueError("SELECT EXISTS query returned invalid response")
|
@@ -212,8 +229,8 @@ class UTADatabase:
|
|
212
229
|
await self.execute_query(create_genomic_table)
|
213
230
|
|
214
231
|
indexes = [
|
215
|
-
f"""CREATE INDEX alt_pos_index ON {self.schema}.genomic (alt_ac, alt_start_i, alt_end_i);""",
|
216
|
-
f"""CREATE INDEX gene_alt_index ON {self.schema}.genomic (hgnc, alt_ac);""",
|
232
|
+
f"""CREATE INDEX alt_pos_index ON {self.schema}.genomic (alt_ac, alt_start_i, alt_end_i);""",
|
233
|
+
f"""CREATE INDEX gene_alt_index ON {self.schema}.genomic (hgnc, alt_ac);""",
|
217
234
|
f"""CREATE INDEX alt_ac_index ON {self.schema}.genomic (alt_ac);""",
|
218
235
|
]
|
219
236
|
for create_index in indexes:
|
@@ -223,7 +240,7 @@ class UTADatabase:
|
|
223
240
|
def _transform_list(li: List) -> List[List[Any]]:
|
224
241
|
"""Transform list to only contain field values
|
225
242
|
|
226
|
-
:param
|
243
|
+
:param li: List of asyncpg.Record objects
|
227
244
|
:return: List of list of objects
|
228
245
|
"""
|
229
246
|
results = list()
|
@@ -231,30 +248,33 @@ class UTADatabase:
|
|
231
248
|
results.append([field for field in item])
|
232
249
|
return results
|
233
250
|
|
234
|
-
async def
|
251
|
+
async def get_genes_and_alt_acs(
|
235
252
|
self,
|
236
|
-
chromosome: int,
|
237
253
|
pos: int,
|
238
|
-
strand: Optional[
|
254
|
+
strand: Optional[Strand] = None,
|
255
|
+
chromosome: Optional[int] = None,
|
239
256
|
alt_ac: Optional[str] = None,
|
240
257
|
gene: Optional[str] = None,
|
241
258
|
) -> Tuple[Optional[Dict], Optional[str]]:
|
242
|
-
"""Return genes and genomic accessions
|
243
|
-
|
244
|
-
:param
|
245
|
-
:param
|
246
|
-
:param
|
247
|
-
|
248
|
-
|
249
|
-
:
|
250
|
-
|
259
|
+
"""Return genes and genomic accessions for a position on a chromosome or alt_ac
|
260
|
+
|
261
|
+
:param pos: Genomic position
|
262
|
+
:param strand: Strand
|
263
|
+
:param chromosome: Chromosome. Must give chromosome without a prefix
|
264
|
+
(i.e. ``1`` or ``X``). If not provided, must provide ``alt_ac``.
|
265
|
+
If ``alt_ac`` is also provided, ``alt_ac`` will be used.
|
266
|
+
:param alt_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided,
|
267
|
+
must provide ``chromosome``. If ``chromosome`` is also provided, ``alt_ac``
|
268
|
+
will be used.
|
269
|
+
:param gene: Gene symbol
|
270
|
+
:return: Dictionary containing genes and genomic accessions and warnings if found
|
251
271
|
"""
|
252
272
|
alt_ac_cond = (
|
253
273
|
f"WHERE alt_ac = '{alt_ac}'"
|
254
274
|
if alt_ac
|
255
275
|
else f"WHERE alt_ac ~ '^NC_[0-9]+0{chromosome}.[0-9]+$'"
|
256
276
|
)
|
257
|
-
strand_cond = f"AND alt_strand = '{strand}'" if strand else ""
|
277
|
+
strand_cond = f"AND alt_strand = '{strand.value}'" if strand else ""
|
258
278
|
gene_cond = f"AND hgnc = '{gene}'" if gene else ""
|
259
279
|
|
260
280
|
query = f"""
|
@@ -275,7 +295,10 @@ class UTADatabase:
|
|
275
295
|
f" is mapped between an exon's start and end coordinates"
|
276
296
|
)
|
277
297
|
if strand:
|
278
|
-
msg +=
|
298
|
+
msg += (
|
299
|
+
f" on the "
|
300
|
+
f"{'positive' if strand == Strand.POSITIVE else 'negative'} strand"
|
301
|
+
)
|
279
302
|
if gene:
|
280
303
|
msg += f" and on gene {gene}"
|
281
304
|
return None, msg
|
@@ -293,12 +316,12 @@ class UTADatabase:
|
|
293
316
|
) -> Tuple[Optional[List[Tuple[int, int]]], Optional[str]]:
|
294
317
|
"""Get list of transcript exons start/end coordinates.
|
295
318
|
|
296
|
-
:param
|
297
|
-
:param
|
319
|
+
:param tx_ac: Transcript accession
|
320
|
+
:param alt_ac: Genomic accession
|
298
321
|
:return: List of a transcript's exon start/end coordinates and warnings if found
|
299
322
|
"""
|
300
323
|
if alt_ac:
|
301
|
-
# We know what
|
324
|
+
# We know what assembly we're looking for since we have the
|
302
325
|
# genomic accession
|
303
326
|
query = f"""
|
304
327
|
SELECT DISTINCT tx_start_i, tx_end_i
|
@@ -329,126 +352,17 @@ class UTADatabase:
|
|
329
352
|
tx_exons = [(r["tx_start_i"], r["tx_end_i"]) for r in result]
|
330
353
|
return tx_exons, None
|
331
354
|
|
332
|
-
@staticmethod
|
333
|
-
def _validate_exon(
|
334
|
-
transcript: str, tx_exons: List[Tuple[int, int]], exon_number: int
|
335
|
-
) -> Tuple[Optional[Tuple[int, int]], Optional[str]]:
|
336
|
-
"""Validate that exon number is valid
|
337
|
-
|
338
|
-
:param str transcript: Transcript accession
|
339
|
-
:param List tx_exons: List of transcript's exons
|
340
|
-
:param Optional[int] exon_number: Exon number to validate
|
341
|
-
:return: Transcript coordinates and warnings if found
|
342
|
-
"""
|
343
|
-
msg = f"Exon {exon_number} does not exist on {transcript}"
|
344
|
-
try:
|
345
|
-
if exon_number < 1:
|
346
|
-
return None, msg
|
347
|
-
exon = tx_exons[exon_number - 1]
|
348
|
-
except IndexError:
|
349
|
-
return None, msg
|
350
|
-
return exon, None
|
351
|
-
|
352
|
-
def get_tx_exon_coords(
|
353
|
-
self,
|
354
|
-
transcript: str,
|
355
|
-
tx_exons: List[Tuple[int, int]],
|
356
|
-
exon_start: Optional[int] = None,
|
357
|
-
exon_end: Optional[int] = None,
|
358
|
-
) -> Tuple[
|
359
|
-
Optional[Tuple[Optional[Tuple[int, int]], Optional[Tuple[int, int]]]],
|
360
|
-
Optional[str],
|
361
|
-
]:
|
362
|
-
"""Get transcript exon coordinates
|
363
|
-
|
364
|
-
:param transcript: Transcript accession
|
365
|
-
:param tx_exons: List of transcript exons
|
366
|
-
:param exon_start: Start exon number
|
367
|
-
:param exon_end: End exon number
|
368
|
-
:return: [Transcript start exon coords, Transcript end exon coords],
|
369
|
-
and warnings if found
|
370
|
-
"""
|
371
|
-
if exon_start is not None:
|
372
|
-
tx_exon_start, warning = self._validate_exon(
|
373
|
-
transcript, tx_exons, exon_start
|
374
|
-
)
|
375
|
-
if not tx_exon_start:
|
376
|
-
return None, warning
|
377
|
-
else:
|
378
|
-
tx_exon_start = None
|
379
|
-
|
380
|
-
if exon_end is not None:
|
381
|
-
tx_exon_end, warning = self._validate_exon(transcript, tx_exons, exon_end)
|
382
|
-
if not tx_exon_end:
|
383
|
-
return None, warning
|
384
|
-
else:
|
385
|
-
tx_exon_end = None
|
386
|
-
return (tx_exon_start, tx_exon_end), None
|
387
|
-
|
388
|
-
async def get_alt_ac_start_and_end(
|
389
|
-
self,
|
390
|
-
tx_ac: str,
|
391
|
-
tx_exon_start: Optional[List[str]] = None,
|
392
|
-
tx_exon_end: Optional[List[str]] = None,
|
393
|
-
gene: Optional[str] = None,
|
394
|
-
) -> Tuple[Optional[Tuple[Tuple, Tuple]], Optional[str]]:
|
395
|
-
"""Get genomic coordinates for related transcript exon start and end.
|
396
|
-
|
397
|
-
:param str tx_ac: Transcript accession
|
398
|
-
:param Optional[List[str]] tx_exon_start: Transcript's exon start
|
399
|
-
coordinates
|
400
|
-
:param Optional[List[str]] tx_exon_end: Transcript's exon end
|
401
|
-
coordinates
|
402
|
-
:param str gene: Gene symbol
|
403
|
-
:return: Alt ac start and end data, and warnings if found
|
404
|
-
"""
|
405
|
-
if tx_exon_start:
|
406
|
-
alt_ac_start, warning = await self.get_alt_ac_start_or_end(
|
407
|
-
tx_ac, int(tx_exon_start[0]), int(tx_exon_start[1]), gene=gene
|
408
|
-
)
|
409
|
-
if not alt_ac_start:
|
410
|
-
return None, warning
|
411
|
-
else:
|
412
|
-
alt_ac_start = None
|
413
|
-
|
414
|
-
if tx_exon_end:
|
415
|
-
alt_ac_end, warning = await self.get_alt_ac_start_or_end(
|
416
|
-
tx_ac, int(tx_exon_end[0]), int(tx_exon_end[1]), gene=gene
|
417
|
-
)
|
418
|
-
if not alt_ac_end:
|
419
|
-
return None, warning
|
420
|
-
else:
|
421
|
-
alt_ac_end = None
|
422
|
-
|
423
|
-
if alt_ac_start is None and alt_ac_end is None:
|
424
|
-
msg = "Unable to find `alt_ac_start` or `alt_ac_end`"
|
425
|
-
logger.warning(msg)
|
426
|
-
return None, msg
|
427
|
-
|
428
|
-
# validate
|
429
|
-
if alt_ac_start and alt_ac_end:
|
430
|
-
for i in (0, 1, 4):
|
431
|
-
if alt_ac_start[i] != alt_ac_end[i]:
|
432
|
-
if i == 0:
|
433
|
-
error = "Gene symbol does not match"
|
434
|
-
elif i == 1:
|
435
|
-
error = "Chromosome does not match"
|
436
|
-
else:
|
437
|
-
error = "Strand does not match"
|
438
|
-
logger.warning(f"{error}: " f"{alt_ac_start[i]} != {alt_ac_end[i]}")
|
439
|
-
return (alt_ac_start, alt_ac_end), None
|
440
|
-
|
441
355
|
async def get_alt_ac_start_or_end(
|
442
356
|
self, tx_ac: str, tx_exon_start: int, tx_exon_end: int, gene: Optional[str]
|
443
357
|
) -> Tuple[Optional[Tuple[str, str, int, int, int]], Optional[str]]:
|
444
358
|
"""Get genomic data for related transcript exon start or end.
|
445
359
|
|
446
|
-
:param
|
447
|
-
:param
|
448
|
-
:param
|
449
|
-
:param
|
360
|
+
:param tx_ac: Transcript accession
|
361
|
+
:param tx_exon_start: Transcript's exon start coordinate
|
362
|
+
:param tx_exon_end: Transcript's exon end coordinate
|
363
|
+
:param gene: HGNC gene symbol
|
450
364
|
:return: [hgnc symbol, genomic accession for chromosome,
|
451
|
-
|
365
|
+
aligned genomic start coordinate, aligned genomic end coordinate, strand],
|
452
366
|
and warnings if found
|
453
367
|
"""
|
454
368
|
if gene:
|
@@ -487,7 +401,7 @@ class UTADatabase:
|
|
487
401
|
async def get_cds_start_end(self, tx_ac: str) -> Optional[Tuple[int, int]]:
|
488
402
|
"""Get coding start and end site
|
489
403
|
|
490
|
-
:param
|
404
|
+
:param tx_ac: Transcript accession
|
491
405
|
:return: [Coding start site, Coding end site]
|
492
406
|
"""
|
493
407
|
if tx_ac.startswith("ENS"):
|
@@ -511,7 +425,7 @@ class UTADatabase:
|
|
511
425
|
async def get_newest_assembly_ac(self, ac: str) -> List[str]:
|
512
426
|
"""Find accession associated to latest genomic assembly
|
513
427
|
|
514
|
-
:param
|
428
|
+
:param ac: Accession
|
515
429
|
:return: List of accessions associated to latest genomic assembly. Order by
|
516
430
|
desc
|
517
431
|
"""
|
@@ -540,8 +454,8 @@ class UTADatabase:
|
|
540
454
|
async def validate_genomic_ac(self, ac: str) -> bool:
|
541
455
|
"""Return whether or not genomic accession exists.
|
542
456
|
|
543
|
-
:param
|
544
|
-
:return:
|
457
|
+
:param ac: Genomic accession
|
458
|
+
:return: ``True`` if genomic accession exists. ``False`` otherwise.
|
545
459
|
"""
|
546
460
|
query = f"""
|
547
461
|
SELECT EXISTS(
|
@@ -554,10 +468,19 @@ class UTADatabase:
|
|
554
468
|
return result[0][0]
|
555
469
|
|
556
470
|
async def get_ac_descr(self, ac: str) -> Optional[str]:
|
557
|
-
"""Return accession description.
|
558
|
-
|
559
|
-
|
560
|
-
|
471
|
+
"""Return accession description. This is typically available only for accessions
|
472
|
+
from older (pre-GRCh38) builds.
|
473
|
+
|
474
|
+
>>> import asyncio
|
475
|
+
>>> from cool_seq_tool.sources.uta_database import UtaDatabase
|
476
|
+
>>> async def describe():
|
477
|
+
... uta_db = await UtaDatabase.create()
|
478
|
+
... result = await uta_db.get_ac_descr("NC_000001.10")
|
479
|
+
... return result
|
480
|
+
>>> asyncio.run(describe())
|
481
|
+
'Homo sapiens chromosome 1, GRCh37.p13 Primary Assembly'
|
482
|
+
|
483
|
+
:param ac: chromosome accession, e.g. ``"NC_000001.10"``
|
561
484
|
:return: Description containing assembly and chromosome
|
562
485
|
"""
|
563
486
|
query = f"""
|
@@ -580,23 +503,23 @@ class UTADatabase:
|
|
580
503
|
tx_ac: str,
|
581
504
|
start_pos: int,
|
582
505
|
end_pos: int,
|
583
|
-
alt_ac: str = None,
|
506
|
+
alt_ac: Optional[str] = None,
|
584
507
|
use_tx_pos: bool = True,
|
585
508
|
like_tx_ac: bool = False,
|
586
509
|
) -> List:
|
587
510
|
"""Return queried data from tx_exon_aln_v table.
|
588
511
|
|
589
|
-
:param
|
590
|
-
:param
|
591
|
-
:param
|
592
|
-
:param
|
593
|
-
:param
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
:param
|
512
|
+
:param tx_ac: accession on c. coordinate
|
513
|
+
:param start_pos: Start position change
|
514
|
+
:param end_pos: End position change
|
515
|
+
:param alt_ac: accession on g. coordinate
|
516
|
+
:param use_tx_pos: ``True`` if querying on transcript position. This means
|
517
|
+
``start_pos`` and ``end_pos`` are on the c. coordinate
|
518
|
+
``False`` if querying on genomic position. This means ``start_pos`` and
|
519
|
+
``end_pos`` are on the g. coordinate
|
520
|
+
:param like_tx_ac: ``True`` if tx_ac condition should be a like statement.
|
598
521
|
This is used when you want to query an accession regardless of its version
|
599
|
-
|
522
|
+
``False`` if tx_condition will be exact match
|
600
523
|
:return: List of tx_exon_aln_v data
|
601
524
|
"""
|
602
525
|
if end_pos is None:
|
@@ -659,16 +582,13 @@ class UTADatabase:
|
|
659
582
|
def data_from_result(result: List) -> Optional[Dict]:
|
660
583
|
"""Return data found from result.
|
661
584
|
|
662
|
-
:param
|
585
|
+
:param result: Data from tx_exon_aln_v table
|
663
586
|
:return: Gene, strand, and position ranges for tx and alt_ac
|
664
587
|
"""
|
665
588
|
gene = result[0]
|
666
|
-
if result[7] == -1:
|
667
|
-
strand = "-"
|
668
|
-
else:
|
669
|
-
strand = "+"
|
670
589
|
tx_pos_range = result[2], result[3]
|
671
590
|
alt_pos_range = result[5], result[6]
|
591
|
+
strand = Strand(result[7])
|
672
592
|
alt_aln_method = result[8]
|
673
593
|
tx_exon_id = result[9]
|
674
594
|
alt_exon_id = result[10]
|
@@ -694,13 +614,30 @@ class UTADatabase:
|
|
694
614
|
async def get_mane_c_genomic_data(
|
695
615
|
self, ac: str, alt_ac: Optional[str], start_pos: int, end_pos: int
|
696
616
|
) -> Optional[Dict]:
|
697
|
-
"""Get MANE
|
698
|
-
|
699
|
-
|
700
|
-
|
701
|
-
|
702
|
-
|
703
|
-
|
617
|
+
"""Get MANE transcript and genomic data. Used when going from g. to MANE c.
|
618
|
+
representation.
|
619
|
+
|
620
|
+
>>> import asyncio
|
621
|
+
>>> from cool_seq_tool.sources import UtaDatabase
|
622
|
+
>>> async def get_braf_mane():
|
623
|
+
... uta_db = await UtaDatabase.create()
|
624
|
+
... result = await uta_db.get_mane_c_genomic_data(
|
625
|
+
... "NM_004333.6",
|
626
|
+
... None,
|
627
|
+
... 140753335,
|
628
|
+
... 140753335,
|
629
|
+
... )
|
630
|
+
... return result
|
631
|
+
>>> braf = asyncio.run(get_braf_mane())
|
632
|
+
>>> braf["alt_ac"]
|
633
|
+
'NC_000007.14'
|
634
|
+
|
635
|
+
:param ac: MANE transcript accession
|
636
|
+
:param alt_ac: NC accession. Used to triangulate on correct genomic data. Can
|
637
|
+
be set to ``None`` if unavailable.
|
638
|
+
:param start_pos: Genomic start position
|
639
|
+
:param end_pos: Genomic end position
|
640
|
+
:return: MANE transcript results if successful
|
704
641
|
"""
|
705
642
|
results = await self.get_tx_exon_aln_v_data(
|
706
643
|
ac, start_pos, end_pos, alt_ac=alt_ac, use_tx_pos=False
|
@@ -723,9 +660,7 @@ class UTADatabase:
|
|
723
660
|
data["coding_start_site"] = coding_start_site[0]
|
724
661
|
data["coding_end_site"] = coding_start_site[1]
|
725
662
|
|
726
|
-
if data["strand"] ==
|
727
|
-
end_pos += 1
|
728
|
-
start_pos += 1
|
663
|
+
if data["strand"] == Strand.NEGATIVE:
|
729
664
|
data["alt_pos_change_range"] = (end_pos, start_pos)
|
730
665
|
data["alt_pos_change"] = (
|
731
666
|
data["alt_pos_range"][1] - data["alt_pos_change_range"][0],
|
@@ -752,13 +687,12 @@ class UTADatabase:
|
|
752
687
|
) -> Optional[Dict]:
|
753
688
|
"""Get transcript mapping to genomic data.
|
754
689
|
|
755
|
-
:param
|
756
|
-
:param
|
757
|
-
:param
|
758
|
-
|
759
|
-
:param
|
760
|
-
|
761
|
-
If `alt_ac` is provided, it will return the associated assembly.
|
690
|
+
:param tx_ac: Accession on c. coordinate
|
691
|
+
:param pos: (start pos, end pos)
|
692
|
+
:param annotation_layer: Annotation layer for ``ac`` and ``pos``
|
693
|
+
:param alt_ac: Accession on g. coordinate
|
694
|
+
:param target_genome_assembly: Genome assembly to get genomic data for.
|
695
|
+
If ``alt_ac`` is provided, it will return the associated assembly.
|
762
696
|
:return: Gene, Transcript accession and position change,
|
763
697
|
Altered transcript accession and position change, Strand
|
764
698
|
"""
|
@@ -789,7 +723,7 @@ class UTADatabase:
|
|
789
723
|
)
|
790
724
|
|
791
725
|
if annotation_layer == AnnotationLayer.CDNA:
|
792
|
-
if data["strand"] ==
|
726
|
+
if data["strand"] == Strand.NEGATIVE:
|
793
727
|
data["alt_pos_change_range"] = (
|
794
728
|
data["alt_pos_range"][1] - data["pos_change"][0],
|
795
729
|
data["alt_pos_range"][0] + data["pos_change"][1],
|
@@ -800,8 +734,8 @@ class UTADatabase:
|
|
800
734
|
data["alt_pos_range"][1] - data["pos_change"][1],
|
801
735
|
)
|
802
736
|
else:
|
803
|
-
if data["strand"] ==
|
804
|
-
data["alt_pos_change_range"] = (pos[1]
|
737
|
+
if data["strand"] == Strand.NEGATIVE:
|
738
|
+
data["alt_pos_change_range"] = (pos[1], pos[0])
|
805
739
|
else:
|
806
740
|
data["alt_pos_change_range"] = pos
|
807
741
|
|
@@ -810,7 +744,7 @@ class UTADatabase:
|
|
810
744
|
async def get_ac_from_gene(self, gene: str) -> List[str]:
|
811
745
|
"""Return genomic accession(s) associated to a gene.
|
812
746
|
|
813
|
-
:param
|
747
|
+
:param gene: Gene symbol
|
814
748
|
:return: List of genomic accessions, sorted in desc order
|
815
749
|
"""
|
816
750
|
query = f"""
|
@@ -832,11 +766,20 @@ class UTADatabase:
|
|
832
766
|
async def get_gene_from_ac(
|
833
767
|
self, ac: str, start_pos: int, end_pos: int
|
834
768
|
) -> Optional[List[str]]:
|
835
|
-
"""Get
|
836
|
-
|
837
|
-
|
838
|
-
|
839
|
-
|
769
|
+
"""Get gene(s) within the provided coordinate range
|
770
|
+
|
771
|
+
>>> import asyncio
|
772
|
+
>>> from cool_seq_tool.sources import UtaDatabase
|
773
|
+
>>> async def get_gene():
|
774
|
+
... uta_db = await UtaDatabase.create()
|
775
|
+
... result = await uta_db.get_gene_from_ac("NC_000017.11", 43044296, 43045802)
|
776
|
+
... return result
|
777
|
+
>>> asyncio.run(get_gene())
|
778
|
+
['BRCA1']
|
779
|
+
|
780
|
+
:param ac: NC accession, e.g. ``"NC_000001.11"``
|
781
|
+
:param start_pos: Start position change
|
782
|
+
:param end_pos: End position change
|
840
783
|
:return: List of HGNC gene symbols
|
841
784
|
"""
|
842
785
|
if end_pos is None:
|
@@ -871,20 +814,20 @@ class UTADatabase:
|
|
871
814
|
use_tx_pos: bool = True,
|
872
815
|
alt_ac: Optional[str] = None,
|
873
816
|
) -> pl.DataFrame:
|
874
|
-
"""Get transcripts for a given
|
817
|
+
"""Get transcripts for a given ``gene`` or ``alt_ac`` related to optional positions.
|
875
818
|
|
876
819
|
:param start_pos: Start position change
|
877
|
-
If not provided and
|
820
|
+
If not provided and ``end_pos`` not provided, all transcripts associated with
|
878
821
|
the gene and/or accession will be returned
|
879
822
|
:param end_pos: End position change
|
880
|
-
If not provided and
|
823
|
+
If not provided and ``start_pos`` not provided, all transcripts associated
|
881
824
|
with the gene and/or accession will be returned
|
882
825
|
:param gene: HGNC gene symbol
|
883
|
-
:param use_tx_pos:
|
884
|
-
|
885
|
-
on genomic position. This means
|
826
|
+
:param use_tx_pos: ``True`` if querying on transcript position. This means
|
827
|
+
``start_pos`` and ``end_pos`` are c. coordinate positions. ``False`` if querying
|
828
|
+
on genomic position. This means ``start_pos`` and ``end_pos`` are g. coordinate
|
886
829
|
positions
|
887
|
-
:param alt_ac: Genomic accession. If not provided, must provide
|
830
|
+
:param alt_ac: Genomic accession. If not provided, must provide ``gene``
|
888
831
|
:return: Data Frame containing transcripts associated with a gene.
|
889
832
|
Transcripts are ordered by most recent NC accession, then by
|
890
833
|
descending transcript length
|
@@ -938,12 +881,15 @@ class UTADatabase:
|
|
938
881
|
results = [
|
939
882
|
(r["pro_ac"], r["tx_ac"], r["alt_ac"], r["cds_start_i"]) for r in results
|
940
883
|
]
|
941
|
-
|
884
|
+
results_df = pl.DataFrame(results, schema=schema)
|
885
|
+
if results:
|
886
|
+
results_df = results_df.unique()
|
887
|
+
return results_df
|
942
888
|
|
943
889
|
async def get_chr_assembly(self, ac: str) -> Optional[Tuple[str, str]]:
|
944
890
|
"""Get chromosome and assembly for NC accession if not in GRCh38.
|
945
891
|
|
946
|
-
:param
|
892
|
+
:param ac: NC accession
|
947
893
|
:return: Chromosome and Assembly accession is on
|
948
894
|
"""
|
949
895
|
descr = await self.get_ac_descr(ac)
|
@@ -966,8 +912,8 @@ class UTADatabase:
|
|
966
912
|
async def liftover_to_38(self, genomic_tx_data: Dict) -> None:
|
967
913
|
"""Liftover genomic_tx_data to hg38 assembly.
|
968
914
|
|
969
|
-
:param
|
970
|
-
|
915
|
+
:param genomic_tx_data: Dictionary containing gene, nc_accession, alt_pos, and
|
916
|
+
strand
|
971
917
|
"""
|
972
918
|
descr = await self.get_chr_assembly(genomic_tx_data["alt_ac"])
|
973
919
|
if descr is None:
|
@@ -1022,9 +968,9 @@ class UTADatabase:
|
|
1022
968
|
) -> Optional[Tuple]:
|
1023
969
|
"""Get new genome assembly data for a position on a chromosome.
|
1024
970
|
|
1025
|
-
:param
|
1026
|
-
:param
|
1027
|
-
:param
|
971
|
+
:param chromosome: The chromosome number. Must be prefixed with ``chr``
|
972
|
+
:param pos: Position on the chromosome
|
973
|
+
:param liftover_to_assembly: Assembly to liftover to
|
1028
974
|
:return: [Target chromosome, target position, target strand,
|
1029
975
|
conversion_chain_score] for assembly
|
1030
976
|
"""
|
@@ -1055,11 +1001,11 @@ class UTADatabase:
|
|
1055
1001
|
) -> None:
|
1056
1002
|
"""Update genomic_tx_data to have coordinates for given assembly.
|
1057
1003
|
|
1058
|
-
:param
|
1059
|
-
|
1060
|
-
:param
|
1061
|
-
:param
|
1062
|
-
:param
|
1004
|
+
:param genomic_tx_data: Dictionary containing gene, nc_accession, alt_pos, and
|
1005
|
+
strand
|
1006
|
+
:param key: Key to access coordinate positions
|
1007
|
+
:param chromosome: Chromosome, must be prefixed with ``chr``
|
1008
|
+
:param liftover_to_assembly: Assembly to liftover to
|
1063
1009
|
"""
|
1064
1010
|
liftover_start_i = self.get_liftover(
|
1065
1011
|
chromosome, genomic_tx_data[key][0], liftover_to_assembly
|
@@ -1084,11 +1030,12 @@ class UTADatabase:
|
|
1084
1030
|
genomic_tx_data[key] = liftover_start_i[1], liftover_end_i[1]
|
1085
1031
|
|
1086
1032
|
async def p_to_c_ac(self, p_ac: str) -> List[str]:
|
1087
|
-
"""Return
|
1033
|
+
"""Return cDNA reference sequence accession from protein reference sequence
|
1034
|
+
accession (i.e. ``p.`` to ``c.`` in HGVS syntax)
|
1088
1035
|
|
1089
|
-
:param
|
1090
|
-
:return: List of rows containing c. accessions that are associated
|
1091
|
-
|
1036
|
+
:param p_ac: Protein accession
|
1037
|
+
:return: List of rows containing c. accessions that are associated with the
|
1038
|
+
given p. accession. In ascending order.
|
1092
1039
|
"""
|
1093
1040
|
# Ensembl accessions do not have versions
|
1094
1041
|
if p_ac.startswith("EN"):
|
@@ -1115,8 +1062,8 @@ class UTADatabase:
|
|
1115
1062
|
) -> List[str]:
|
1116
1063
|
"""Get transcripts associated to a genomic ac and position.
|
1117
1064
|
|
1118
|
-
:param
|
1119
|
-
:param
|
1065
|
+
:param alt_ac: Genomic accession
|
1066
|
+
:param g_pos: Genomic position
|
1120
1067
|
:return: RefSeq transcripts on c. coordinate
|
1121
1068
|
"""
|
1122
1069
|
query = f"""
|
@@ -1133,7 +1080,7 @@ class UTADatabase:
|
|
1133
1080
|
|
1134
1081
|
@staticmethod
|
1135
1082
|
def get_secret() -> str:
|
1136
|
-
"""Get secrets for UTA DB instances."""
|
1083
|
+
"""Get secrets for UTA DB instances. Used for deployment on AWS."""
|
1137
1084
|
secret_name = environ["UTA_DB_SECRET"]
|
1138
1085
|
region_name = "us-east-2"
|
1139
1086
|
|