cool-seq-tool 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cool_seq_tool/__init__.py +6 -0
- cool_seq_tool/app.py +1 -2
- cool_seq_tool/handlers/seqrepo_access.py +5 -5
- cool_seq_tool/mappers/alignment.py +16 -16
- cool_seq_tool/mappers/exon_genomic_coords.py +911 -667
- cool_seq_tool/mappers/mane_transcript.py +109 -104
- cool_seq_tool/schemas.py +30 -165
- cool_seq_tool/sources/uta_database.py +149 -229
- cool_seq_tool/utils.py +9 -9
- {cool_seq_tool-0.6.0.dist-info → cool_seq_tool-0.7.1.dist-info}/METADATA +8 -8
- cool_seq_tool-0.7.1.dist-info/RECORD +24 -0
- {cool_seq_tool-0.6.0.dist-info → cool_seq_tool-0.7.1.dist-info}/WHEEL +1 -1
- cool_seq_tool-0.6.0.dist-info/RECORD +0 -24
- {cool_seq_tool-0.6.0.dist-info → cool_seq_tool-0.7.1.dist-info}/LICENSE +0 -0
- {cool_seq_tool-0.6.0.dist-info → cool_seq_tool-0.7.1.dist-info}/top_level.txt +0 -0
@@ -12,8 +12,16 @@ import boto3
|
|
12
12
|
import polars as pl
|
13
13
|
from asyncpg.exceptions import InterfaceError, InvalidAuthorizationSpecificationError
|
14
14
|
from botocore.exceptions import ClientError
|
15
|
-
|
16
|
-
|
15
|
+
from pydantic import Field, StrictInt, StrictStr
|
16
|
+
|
17
|
+
from cool_seq_tool.schemas import (
|
18
|
+
AnnotationLayer,
|
19
|
+
Assembly,
|
20
|
+
BaseModelForbidExtra,
|
21
|
+
GenomicTxData,
|
22
|
+
GenomicTxMetadata,
|
23
|
+
Strand,
|
24
|
+
)
|
17
25
|
|
18
26
|
# use `bound` to upper-bound UtaDatabase or child classes
|
19
27
|
UTADatabaseType = TypeVar("UTADatabaseType", bound="UtaDatabase")
|
@@ -25,6 +33,52 @@ UTA_DB_URL = environ.get(
|
|
25
33
|
_logger = logging.getLogger(__name__)
|
26
34
|
|
27
35
|
|
36
|
+
class DbConnectionArgs(BaseModelForbidExtra):
|
37
|
+
"""Represent database connection arguments"""
|
38
|
+
|
39
|
+
host: str
|
40
|
+
port: int
|
41
|
+
user: str
|
42
|
+
password: str
|
43
|
+
database: str
|
44
|
+
|
45
|
+
|
46
|
+
class GenomicAlnData(BaseModelForbidExtra):
|
47
|
+
"""Represent genomic alignment data from UTA tx_exon_aln_v view"""
|
48
|
+
|
49
|
+
hgnc: StrictStr = Field(..., description="HGNC gene symbol.")
|
50
|
+
ord: StrictInt = Field(..., description="Exon number. 0-based.")
|
51
|
+
alt_ac: StrictStr = Field(..., description="RefSeq genomic accession.")
|
52
|
+
alt_start_i: StrictInt = Field(
|
53
|
+
...,
|
54
|
+
description="`alt_ac`'s start index of the exon using inter-residue coordinates.",
|
55
|
+
)
|
56
|
+
alt_end_i: StrictInt = Field(
|
57
|
+
...,
|
58
|
+
description="`alt_ac`'s end index of the exon using inter-residue coordinates.",
|
59
|
+
)
|
60
|
+
alt_strand: Strand = Field(..., description="Strand.")
|
61
|
+
|
62
|
+
|
63
|
+
class TxExonAlnData(GenomicAlnData):
|
64
|
+
"""Represent data from UTA tx_exon_aln_v view"""
|
65
|
+
|
66
|
+
tx_ac: StrictStr = Field(..., description="Transcript accession.")
|
67
|
+
tx_start_i: StrictInt = Field(
|
68
|
+
...,
|
69
|
+
description="`tx_ac`'s start index of the exon using inter-residue coordinates.",
|
70
|
+
)
|
71
|
+
tx_end_i: StrictInt = Field(
|
72
|
+
...,
|
73
|
+
description="`tx_ac`'s end index of the exon using inter-residue coordinates.",
|
74
|
+
)
|
75
|
+
alt_aln_method: StrictStr = Field(
|
76
|
+
..., description="The alignment method used to compare sequences."
|
77
|
+
)
|
78
|
+
tx_exon_id: StrictInt = Field(..., description="`tx_ac` exon identifier.")
|
79
|
+
alt_exon_id: StrictInt = Field(..., description="`alt_ac` exon identifier.")
|
80
|
+
|
81
|
+
|
28
82
|
class UtaDatabase:
|
29
83
|
"""Provide transcript lookup and metadata tools via the Universal Transcript Archive
|
30
84
|
(UTA) database.
|
@@ -51,11 +105,11 @@ class UtaDatabase:
|
|
51
105
|
self.db_url = db_url.replace(original_pwd, quote(original_pwd))
|
52
106
|
self.args = self._get_conn_args()
|
53
107
|
|
54
|
-
def _get_conn_args(self) ->
|
108
|
+
def _get_conn_args(self) -> DbConnectionArgs:
|
55
109
|
"""Return connection arguments.
|
56
110
|
|
57
111
|
Takes no arguments: uses ``self.db_url`` (the raw connection URL), or the stored secret when ``UTA_DB_PROD`` is set.
|
58
|
-
:return: Database
|
112
|
+
:return: Database connection arguments
|
59
113
|
"""
|
60
114
|
if "UTA_DB_PROD" in environ:
|
61
115
|
secret = ast.literal_eval(self.get_secret())
|
@@ -72,23 +126,24 @@ class UtaDatabase:
|
|
72
126
|
environ["UTA_DB_URL"] = (
|
73
127
|
f"postgresql://{username}@{host}:{port}/{database}/{schema}"
|
74
128
|
)
|
75
|
-
return
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
129
|
+
return DbConnectionArgs(
|
130
|
+
host=host,
|
131
|
+
port=int(port),
|
132
|
+
database=database,
|
133
|
+
user=username,
|
134
|
+
password=password,
|
135
|
+
)
|
136
|
+
|
82
137
|
url = ParseResult(urlparse(self.db_url))
|
83
138
|
self.schema = url.schema
|
84
139
|
password = unquote(url.password) if url.password else ""
|
85
|
-
return
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
140
|
+
return DbConnectionArgs(
|
141
|
+
host=url.hostname,
|
142
|
+
port=url.port,
|
143
|
+
database=url.database,
|
144
|
+
user=url.username,
|
145
|
+
password=password,
|
146
|
+
)
|
92
147
|
|
93
148
|
async def create_pool(self) -> None:
|
94
149
|
"""Create connection pool if not already created."""
|
@@ -100,11 +155,11 @@ class UtaDatabase:
|
|
100
155
|
max_size=10,
|
101
156
|
max_inactive_connection_lifetime=3,
|
102
157
|
command_timeout=60,
|
103
|
-
host=self.args
|
104
|
-
port=self.args
|
105
|
-
user=self.args
|
106
|
-
password=self.args
|
107
|
-
database=self.args
|
158
|
+
host=self.args.host,
|
159
|
+
port=self.args.port,
|
160
|
+
user=self.args.user,
|
161
|
+
password=self.args.password,
|
162
|
+
database=self.args.database,
|
108
163
|
)
|
109
164
|
except InterfaceError as e:
|
110
165
|
_logger.error(
|
@@ -215,156 +270,21 @@ class UtaDatabase:
|
|
215
270
|
"""
|
216
271
|
return [list(i) for i in li]
|
217
272
|
|
218
|
-
async def get_genes_and_alt_acs(
|
219
|
-
self,
|
220
|
-
pos: int,
|
221
|
-
strand: Strand | None = None,
|
222
|
-
chromosome: int | None = None,
|
223
|
-
alt_ac: str | None = None,
|
224
|
-
gene: str | None = None,
|
225
|
-
) -> tuple[dict | None, str | None]:
|
226
|
-
"""Return genes and genomic accessions for a position on a chromosome or alt_ac
|
227
|
-
|
228
|
-
:param pos: Genomic position
|
229
|
-
:param strand: Strand
|
230
|
-
:param chromosome: Chromosome. Must give chromosome without a prefix
|
231
|
-
(i.e. ``1`` or ``X``). If not provided, must provide ``alt_ac``.
|
232
|
-
If ``alt_ac`` is also provided, ``alt_ac`` will be used.
|
233
|
-
:param alt_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided,
|
234
|
-
must provide ``chromosome``. If ``chromosome`` is also provided, ``alt_ac``
|
235
|
-
will be used.
|
236
|
-
:param gene: Gene symbol
|
237
|
-
:return: Dictionary containing genes and genomic accessions and warnings if found
|
238
|
-
"""
|
239
|
-
alt_ac_cond = (
|
240
|
-
f"WHERE alt_ac = '{alt_ac}'"
|
241
|
-
if alt_ac
|
242
|
-
else f"WHERE alt_ac ~ '^NC_[0-9]+0{chromosome}.[0-9]+$'"
|
243
|
-
)
|
244
|
-
strand_cond = f"AND alt_strand = '{strand.value}'" if strand else ""
|
245
|
-
gene_cond = f"AND hgnc = '{gene}'" if gene else ""
|
246
|
-
|
247
|
-
query = f"""
|
248
|
-
SELECT hgnc, alt_ac
|
249
|
-
FROM {self.schema}.tx_exon_aln_v
|
250
|
-
{alt_ac_cond}
|
251
|
-
AND alt_aln_method = 'splign'
|
252
|
-
AND {pos} BETWEEN alt_start_i AND alt_end_i
|
253
|
-
{strand_cond}
|
254
|
-
{gene_cond};
|
255
|
-
""" # noqa: S608
|
256
|
-
|
257
|
-
results = await self.execute_query(query)
|
258
|
-
if not results:
|
259
|
-
msg = (
|
260
|
-
f"Unable to find a result for chromosome "
|
261
|
-
f"{alt_ac or chromosome} where genomic coordinate {pos}"
|
262
|
-
f" is mapped between an exon's start and end coordinates"
|
263
|
-
)
|
264
|
-
if strand:
|
265
|
-
msg += (
|
266
|
-
f" on the "
|
267
|
-
f"{'positive' if strand == Strand.POSITIVE else 'negative'} strand"
|
268
|
-
)
|
269
|
-
if gene:
|
270
|
-
msg += f" and on gene {gene}"
|
271
|
-
return None, msg
|
272
|
-
|
273
|
-
results = self._transform_list(results)
|
274
|
-
genes = set()
|
275
|
-
alt_acs = set()
|
276
|
-
for r in results:
|
277
|
-
genes.add(r[0])
|
278
|
-
alt_acs.add(r[1])
|
279
|
-
return {"genes": genes, "alt_acs": alt_acs}, None
|
280
|
-
|
281
|
-
async def get_tx_exons(
|
282
|
-
self, tx_ac: str, alt_ac: str | None = None
|
283
|
-
) -> tuple[list[tuple[int, int]] | None, str | None]:
|
284
|
-
"""Get list of transcript exons start/end coordinates.
|
285
|
-
|
286
|
-
:param tx_ac: Transcript accession
|
287
|
-
:param alt_ac: Genomic accession
|
288
|
-
:return: List of a transcript's accessions and warnings if found
|
289
|
-
"""
|
290
|
-
if alt_ac:
|
291
|
-
# We know what assembly we're looking for since we have the
|
292
|
-
# genomic accession
|
293
|
-
query = f"""
|
294
|
-
SELECT DISTINCT tx_start_i, tx_end_i
|
295
|
-
FROM {self.schema}.tx_exon_aln_v
|
296
|
-
WHERE tx_ac = '{tx_ac}'
|
297
|
-
AND alt_aln_method = 'splign'
|
298
|
-
AND alt_ac = '{alt_ac}'
|
299
|
-
""" # noqa: S608
|
300
|
-
else:
|
301
|
-
# Use GRCh38 by default if no genomic accession is provided
|
302
|
-
query = f"""
|
303
|
-
SELECT DISTINCT tx_start_i, tx_end_i
|
304
|
-
FROM {self.schema}.tx_exon_aln_v as t
|
305
|
-
INNER JOIN {self.schema}._seq_anno_most_recent as s
|
306
|
-
ON t.alt_ac = s.ac
|
307
|
-
WHERE s.descr = ''
|
308
|
-
AND t.tx_ac = '{tx_ac}'
|
309
|
-
AND t.alt_aln_method = 'splign'
|
310
|
-
AND t.alt_ac like 'NC_000%'
|
311
|
-
""" # noqa: S608
|
312
|
-
result = await self.execute_query(query)
|
313
|
-
|
314
|
-
if not result:
|
315
|
-
msg = f"Unable to get exons for {tx_ac}"
|
316
|
-
_logger.warning(msg)
|
317
|
-
return None, msg
|
318
|
-
tx_exons = [(r["tx_start_i"], r["tx_end_i"]) for r in result]
|
319
|
-
return tx_exons, None
|
320
|
-
|
321
|
-
async def get_tx_exons_genomic_coords(
|
322
|
-
self,
|
323
|
-
tx_ac: str,
|
324
|
-
alt_ac: str,
|
325
|
-
) -> tuple[tuple[int, int, int, int, int] | None, str | None]:
|
326
|
-
"""Get exon number, transcript coordinates, and genomic coordinates
|
327
|
-
|
328
|
-
:param tx_ac: Transcript accession
|
329
|
-
:param alt_ac: RefSeq genomic accession
|
330
|
-
:return: Tuple of exon numbers, transcript and genomic coordinates,
|
331
|
-
and warnings if found
|
332
|
-
"""
|
333
|
-
query = f"""
|
334
|
-
SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i
|
335
|
-
FROM {self.schema}.tx_exon_aln_v
|
336
|
-
WHERE tx_ac = '{tx_ac}'
|
337
|
-
AND alt_ac = '{alt_ac}'
|
338
|
-
""" # noqa: S608
|
339
|
-
result = await self.execute_query(query)
|
340
|
-
|
341
|
-
if not result:
|
342
|
-
msg = f"Unable to get exons and genomic coordinates for {tx_ac} on {alt_ac}"
|
343
|
-
_logger.warning(msg)
|
344
|
-
return None, msg
|
345
|
-
tx_exons_genomic_coords = [
|
346
|
-
(r["ord"], r["tx_start_i"], r["tx_end_i"], r["alt_start_i"], r["alt_end_i"])
|
347
|
-
for r in result
|
348
|
-
]
|
349
|
-
return tx_exons_genomic_coords, None
|
350
|
-
|
351
273
|
async def get_alt_ac_start_or_end(
|
352
274
|
self, tx_ac: str, tx_exon_start: int, tx_exon_end: int, gene: str | None
|
353
|
-
) -> tuple[
|
275
|
+
) -> tuple[GenomicAlnData | None, str | None]:
|
354
276
|
"""Get genomic data for related transcript exon start or end.
|
355
277
|
|
356
278
|
:param tx_ac: Transcript accession
|
357
279
|
:param tx_exon_start: Transcript's exon start coordinate
|
358
280
|
:param tx_exon_end: Transcript's exon end coordinate
|
359
281
|
:param gene: HGNC gene symbol
|
360
|
-
:return:
|
361
|
-
aligned genomic start coordinate, aligned genomic end coordinate, strand],
|
362
|
-
and warnings if found
|
282
|
+
:return: Genomic alignment data and warnings if found
|
363
283
|
"""
|
364
284
|
gene_query = f"AND T.hgnc = '{gene}'" if gene else ""
|
365
285
|
|
366
286
|
query = f"""
|
367
|
-
SELECT T.hgnc, T.alt_ac, T.alt_start_i, T.alt_end_i, T.alt_strand
|
287
|
+
SELECT T.hgnc, T.alt_ac, T.alt_start_i, T.alt_end_i, T.alt_strand, T.ord
|
368
288
|
FROM {self.schema}._cds_exons_fp_v as C
|
369
289
|
JOIN {self.schema}.tx_exon_aln_v as T ON T.tx_ac = C.tx_ac
|
370
290
|
WHERE T.tx_ac = '{tx_ac}'
|
@@ -387,8 +307,7 @@ class UtaDatabase:
|
|
387
307
|
msg += f" on gene {gene}"
|
388
308
|
_logger.warning(msg)
|
389
309
|
return None, msg
|
390
|
-
|
391
|
-
return (result[0], result[1], result[2], result[3], result[4]), None
|
310
|
+
return GenomicAlnData(**result[0]), None
|
392
311
|
|
393
312
|
async def get_cds_start_end(self, tx_ac: str) -> tuple[int, int] | None:
|
394
313
|
"""Get coding start and end site
|
@@ -497,7 +416,7 @@ class UtaDatabase:
|
|
497
416
|
alt_ac: str | None = None,
|
498
417
|
use_tx_pos: bool = True,
|
499
418
|
like_tx_ac: bool = False,
|
500
|
-
) -> list:
|
419
|
+
) -> list[TxExonAlnData]:
|
501
420
|
"""Return queried data from tx_exon_aln_v table.
|
502
421
|
|
503
422
|
:param tx_ac: accession on c. coordinate
|
@@ -511,11 +430,8 @@ class UtaDatabase:
|
|
511
430
|
:param like_tx_ac: ``True`` if tx_ac condition should be a like statement.
|
512
431
|
This is used when you want to query an accession regardless of its version
|
513
432
|
``False`` if tx_condition will be exact match
|
514
|
-
:return: List of
|
433
|
+
:return: List of transcript exon alignment data
|
515
434
|
"""
|
516
|
-
if end_pos is None:
|
517
|
-
end_pos = start_pos
|
518
|
-
|
519
435
|
if tx_ac.startswith("EN"):
|
520
436
|
temp_ac = tx_ac.split(".")[0]
|
521
437
|
aln_method = f"AND alt_aln_method='genebuild'" # noqa: F541
|
@@ -543,7 +459,7 @@ class UtaDatabase:
|
|
543
459
|
|
544
460
|
query = f"""
|
545
461
|
SELECT hgnc, tx_ac, tx_start_i, tx_end_i, alt_ac, alt_start_i,
|
546
|
-
alt_end_i, alt_strand, alt_aln_method, tx_exon_id, alt_exon_id
|
462
|
+
alt_end_i, alt_strand, alt_aln_method, ord, tx_exon_id, alt_exon_id
|
547
463
|
FROM {self.schema}.tx_exon_aln_v
|
548
464
|
{tx_q}
|
549
465
|
{alt_ac_q}
|
@@ -562,22 +478,17 @@ class UtaDatabase:
|
|
562
478
|
temp_ac,
|
563
479
|
alt_ac,
|
564
480
|
)
|
565
|
-
return [
|
481
|
+
return [TxExonAlnData(**r) for r in result]
|
566
482
|
|
567
483
|
@staticmethod
|
568
|
-
def data_from_result(result:
|
484
|
+
def data_from_result(result: TxExonAlnData) -> GenomicTxData | None:
|
569
485
|
"""Return data found from result.
|
570
486
|
|
571
|
-
:param result:
|
572
|
-
:return:
|
487
|
+
:param result: Transcript exon alignment data
|
488
|
+
:return: Aligned genomic / transcript exon data
|
573
489
|
"""
|
574
|
-
|
575
|
-
|
576
|
-
alt_pos_range = result[5], result[6]
|
577
|
-
strand = Strand(result[7])
|
578
|
-
alt_aln_method = result[8]
|
579
|
-
tx_exon_id = result[9]
|
580
|
-
alt_exon_id = result[10]
|
490
|
+
tx_pos_range = result.tx_start_i, result.tx_end_i
|
491
|
+
alt_pos_range = result.alt_start_i, result.alt_end_i
|
581
492
|
|
582
493
|
if (tx_pos_range[1] - tx_pos_range[0]) != (alt_pos_range[1] - alt_pos_range[0]):
|
583
494
|
_logger.warning(
|
@@ -587,19 +498,19 @@ class UtaDatabase:
|
|
587
498
|
)
|
588
499
|
return None
|
589
500
|
|
590
|
-
return
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
501
|
+
return GenomicTxData(
|
502
|
+
gene=result.hgnc,
|
503
|
+
strand=Strand(result.alt_strand),
|
504
|
+
tx_pos_range=tx_pos_range,
|
505
|
+
alt_pos_range=alt_pos_range,
|
506
|
+
alt_aln_method=result.alt_aln_method,
|
507
|
+
tx_exon_id=result.tx_exon_id,
|
508
|
+
alt_exon_id=result.alt_exon_id,
|
509
|
+
)
|
599
510
|
|
600
511
|
async def get_mane_c_genomic_data(
|
601
512
|
self, ac: str, alt_ac: str | None, start_pos: int, end_pos: int
|
602
|
-
) ->
|
513
|
+
) -> GenomicTxMetadata | None:
|
603
514
|
"""Get MANE transcript and genomic data. Used when going from g. to MANE c.
|
604
515
|
representation.
|
605
516
|
|
@@ -623,7 +534,8 @@ class UtaDatabase:
|
|
623
534
|
be set to ``None`` if unavailable.
|
624
535
|
:param start_pos: Genomic start position
|
625
536
|
:param end_pos: Genomic end position change
|
626
|
-
:return: MANE transcript results if
|
537
|
+
:return: Metadata for MANE genomic and transcript accessions results if
|
538
|
+
successful
|
627
539
|
"""
|
628
540
|
results = await self.get_tx_exon_aln_v_data(
|
629
541
|
ac, start_pos, end_pos, alt_ac=alt_ac, use_tx_pos=False
|
@@ -632,8 +544,8 @@ class UtaDatabase:
|
|
632
544
|
return None
|
633
545
|
result = results[0]
|
634
546
|
|
635
|
-
|
636
|
-
if not
|
547
|
+
genomic_tx_data = self.data_from_result(result)
|
548
|
+
if not genomic_tx_data:
|
637
549
|
return None
|
638
550
|
|
639
551
|
coding_start_site = await self.get_cds_start_end(ac)
|
@@ -641,25 +553,30 @@ class UtaDatabase:
|
|
641
553
|
_logger.warning("Accession %s not found in UTA", ac)
|
642
554
|
return None
|
643
555
|
|
644
|
-
|
645
|
-
data["alt_ac"] = result[4]
|
646
|
-
data["coding_start_site"] = coding_start_site[0]
|
647
|
-
data["coding_end_site"] = coding_start_site[1]
|
556
|
+
coding_start_site, coding_end_site = coding_start_site
|
648
557
|
|
649
|
-
if
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
558
|
+
if genomic_tx_data.strand == Strand.NEGATIVE:
|
559
|
+
alt_pos_change_range = (end_pos, start_pos)
|
560
|
+
pos_change = (
|
561
|
+
genomic_tx_data.alt_pos_range[1] - alt_pos_change_range[0],
|
562
|
+
alt_pos_change_range[1] - genomic_tx_data.alt_pos_range[0],
|
654
563
|
)
|
655
564
|
else:
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
565
|
+
alt_pos_change_range = (start_pos, end_pos)
|
566
|
+
pos_change = (
|
567
|
+
alt_pos_change_range[0] - genomic_tx_data.alt_pos_range[0],
|
568
|
+
genomic_tx_data.alt_pos_range[1] - alt_pos_change_range[1],
|
660
569
|
)
|
661
570
|
|
662
|
-
return
|
571
|
+
return GenomicTxMetadata(
|
572
|
+
**genomic_tx_data.model_dump(),
|
573
|
+
pos_change=pos_change,
|
574
|
+
tx_ac=result.tx_ac,
|
575
|
+
alt_ac=result.alt_ac,
|
576
|
+
coding_start_site=coding_start_site,
|
577
|
+
coding_end_site=coding_end_site,
|
578
|
+
alt_pos_change_range=alt_pos_change_range,
|
579
|
+
)
|
663
580
|
|
664
581
|
async def get_genomic_tx_data(
|
665
582
|
self,
|
@@ -669,7 +586,7 @@ class UtaDatabase:
|
|
669
586
|
| Literal[AnnotationLayer.GENOMIC] = AnnotationLayer.CDNA,
|
670
587
|
alt_ac: str | None = None,
|
671
588
|
target_genome_assembly: Assembly = Assembly.GRCH38,
|
672
|
-
) ->
|
589
|
+
) -> GenomicTxMetadata | None:
|
673
590
|
"""Get transcript mapping to genomic data.
|
674
591
|
|
675
592
|
:param tx_ac: Accession on c. coordinate
|
@@ -678,8 +595,7 @@ class UtaDatabase:
|
|
678
595
|
:param alt_ac: Accession on g. coordinate
|
679
596
|
:param target_genome_assembly: Genome assembly to get genomic data for.
|
680
597
|
If ``alt_ac`` is provided, it will return the associated assembly.
|
681
|
-
:return:
|
682
|
-
Altered transcript accession and position change, Strand
|
598
|
+
:return: Metadata for genomic and transcript accessions
|
683
599
|
"""
|
684
600
|
results = await self.get_tx_exon_aln_v_data(
|
685
601
|
tx_ac,
|
@@ -696,35 +612,39 @@ class UtaDatabase:
|
|
696
612
|
else:
|
697
613
|
result = results[0]
|
698
614
|
|
699
|
-
|
700
|
-
if not
|
615
|
+
genomic_tx_data = self.data_from_result(result)
|
616
|
+
if not genomic_tx_data:
|
701
617
|
return None
|
702
|
-
data["tx_ac"] = result[1]
|
703
|
-
data["alt_ac"] = result[4]
|
704
618
|
|
705
|
-
|
706
|
-
pos[0] -
|
707
|
-
|
619
|
+
pos_change = (
|
620
|
+
pos[0] - genomic_tx_data.tx_pos_range[0],
|
621
|
+
genomic_tx_data.tx_pos_range[1] - pos[1],
|
708
622
|
)
|
709
623
|
|
710
624
|
if annotation_layer == AnnotationLayer.CDNA:
|
711
|
-
if
|
712
|
-
|
713
|
-
|
714
|
-
|
625
|
+
if genomic_tx_data.strand == Strand.NEGATIVE:
|
626
|
+
alt_pos_change_range = (
|
627
|
+
genomic_tx_data.alt_pos_range[1] - pos_change[0],
|
628
|
+
genomic_tx_data.alt_pos_range[0] + pos_change[1],
|
715
629
|
)
|
716
630
|
else:
|
717
|
-
|
718
|
-
|
719
|
-
|
631
|
+
alt_pos_change_range = (
|
632
|
+
genomic_tx_data.alt_pos_range[0] + pos_change[0],
|
633
|
+
genomic_tx_data.alt_pos_range[1] - pos_change[1],
|
720
634
|
)
|
721
635
|
else:
|
722
|
-
if
|
723
|
-
|
636
|
+
if genomic_tx_data.strand == Strand.NEGATIVE:
|
637
|
+
alt_pos_change_range = (pos[1], pos[0])
|
724
638
|
else:
|
725
|
-
|
726
|
-
|
727
|
-
return
|
639
|
+
alt_pos_change_range = pos
|
640
|
+
|
641
|
+
return GenomicTxMetadata(
|
642
|
+
**genomic_tx_data.model_dump(),
|
643
|
+
tx_ac=result.tx_ac,
|
644
|
+
alt_ac=result.alt_ac,
|
645
|
+
pos_change=pos_change,
|
646
|
+
alt_pos_change_range=alt_pos_change_range,
|
647
|
+
)
|
728
648
|
|
729
649
|
async def get_ac_from_gene(self, gene: str) -> list[str]:
|
730
650
|
"""Return genomic accession(s) associated to a gene.
|
cool_seq_tool/utils.py
CHANGED
@@ -6,35 +6,35 @@ import logging
|
|
6
6
|
from bioutils.accessions import chr22XY
|
7
7
|
|
8
8
|
from cool_seq_tool import __version__
|
9
|
-
from cool_seq_tool.schemas import
|
9
|
+
from cool_seq_tool.schemas import CoordinateType, ServiceMeta
|
10
10
|
|
11
11
|
_logger = logging.getLogger(__name__)
|
12
12
|
|
13
13
|
|
14
14
|
def get_inter_residue_pos(
|
15
|
-
start_pos: int, end_pos: int,
|
15
|
+
start_pos: int, end_pos: int, coordinate_type: CoordinateType
|
16
16
|
) -> tuple[int, int]:
|
17
17
|
"""Return equivalent inter-residue position.
|
18
18
|
|
19
|
-
|
19
|
+
Residue coordinates start with 1, whereas inter-residue coordinates start with 0.
|
20
|
+
|
21
|
+
It is preferred to work with inter-residue coordinates where possible. Our
|
20
22
|
rationale is detailed in an appendix to the
|
21
23
|
`VRS docs <https://vrs.ga4gh.org/en/stable/appendices/design_decisions.html#inter-residue-coordinates>`_.
|
22
24
|
This function is used internally to shift user-provided coordinates accordingly.
|
23
25
|
|
24
26
|
>>> from cool_seq_tool.utils import get_inter_residue_pos
|
25
|
-
>>> from cool_seq_tool.schemas import
|
26
|
-
>>> get_inter_residue_pos(10,
|
27
|
+
>>> from cool_seq_tool.schemas import CoordinateType
|
28
|
+
>>> get_inter_residue_pos(10, 10, CoordinateType.RESIDUE)
|
27
29
|
(9, 10)
|
28
30
|
|
29
31
|
:param start_pos: Start position
|
30
32
|
:param end_pos: End position
|
31
|
-
:param
|
33
|
+
:param coordinate_type: Coordinate type for `start_pos` and `end_pos`
|
32
34
|
:return: Inter-residue coordinates
|
33
35
|
"""
|
34
|
-
if
|
36
|
+
if coordinate_type == CoordinateType.RESIDUE:
|
35
37
|
start_pos -= 1
|
36
|
-
elif residue_mode == ResidueMode.ZERO:
|
37
|
-
end_pos += 1
|
38
38
|
return start_pos, end_pos
|
39
39
|
|
40
40
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: cool_seq_tool
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.7.1
|
4
4
|
Summary: Common Operation on Lots of Sequences Tool
|
5
5
|
Author: Kori Kuzma, James Stevenson, Katie Stahl, Alex Wagner
|
6
6
|
License: MIT License
|
@@ -52,7 +52,7 @@ Requires-Dist: polars ~=1.0
|
|
52
52
|
Requires-Dist: hgvs
|
53
53
|
Requires-Dist: biocommons.seqrepo
|
54
54
|
Requires-Dist: pydantic ==2.*
|
55
|
-
Requires-Dist: ga4gh.vrs
|
55
|
+
Requires-Dist: ga4gh.vrs ~=2.0.0a10
|
56
56
|
Requires-Dist: wags-tails ~=0.1.3
|
57
57
|
Requires-Dist: bioutils
|
58
58
|
Provides-Extra: dev
|
@@ -83,7 +83,7 @@ CoolSeqTool
|
|
83
83
|
|
84
84
|
---
|
85
85
|
|
86
|
-
**[Documentation](https://coolseqtool.readthedocs.io/
|
86
|
+
**[Documentation](https://coolseqtool.readthedocs.io/stable/)** · [Installation](https://coolseqtool.readthedocs.io/stable/install.html) · [Usage](https://coolseqtool.readthedocs.io/stable/usage.html) · [API reference](https://coolseqtool.readthedocs.io/stable/reference/index.html)
|
87
87
|
|
88
88
|
---
|
89
89
|
|
@@ -107,7 +107,7 @@ CoolSeqTool is available on [PyPI](https://pypi.org/project/cool-seq-tool)
|
|
107
107
|
python3 -m pip install cool-seq-tool
|
108
108
|
```
|
109
109
|
|
110
|
-
See the [installation instructions](https://coolseqtool.readthedocs.io/
|
110
|
+
See the [installation instructions](https://coolseqtool.readthedocs.io/stable/install.html) in the documentation for a description of dependency setup requirements.
|
111
111
|
|
112
112
|
---
|
113
113
|
|
@@ -116,14 +116,14 @@ See the [installation instructions](https://coolseqtool.readthedocs.io/latest/in
|
|
116
116
|
All CoolSeqTool resources can be initialized by way of a top-level class instance:
|
117
117
|
|
118
118
|
```pycon
|
119
|
-
>>> from cool_seq_tool
|
120
|
-
>>> from cool_seq_tool.schemas import AnnotationLayer,
|
119
|
+
>>> from cool_seq_tool import CoolSeqTool
|
120
|
+
>>> from cool_seq_tool.schemas import AnnotationLayer, CoordinateType
|
121
121
|
>>> cst = CoolSeqTool()
|
122
122
|
>>> result = await cst.mane_transcript.get_mane_transcript(
|
123
123
|
... "NP_004324.2",
|
124
124
|
... 599,
|
125
125
|
... AnnotationLayer.PROTEIN,
|
126
|
-
...
|
126
|
+
... coordinate_type=CoordinateType.INTER_RESIDUE,
|
127
127
|
... )
|
128
128
|
>>> result.gene, result.refseq, result.status
|
129
129
|
('EGFR', 'NM_005228.5', <TranscriptPriority.MANE_SELECT: 'mane_select'>)
|
@@ -133,4 +133,4 @@ All CoolSeqTool resources can be initialized by way of a top-level class instanc
|
|
133
133
|
|
134
134
|
## Feedback and contributing
|
135
135
|
|
136
|
-
We welcome bug reports, feature requests, and code contributions from users and interested collaborators. The [documentation](https://coolseqtool.readthedocs.io/
|
136
|
+
We welcome bug reports, feature requests, and code contributions from users and interested collaborators. The [documentation](https://coolseqtool.readthedocs.io/stable/contributing.html) contains guidance for submitting feedback and contributing new code.
|
@@ -0,0 +1,24 @@
|
|
1
|
+
cool_seq_tool/__init__.py,sha256=pJyVj7Z275BBAwpeFMm-WEn_tp-y1_ihRl1sLc4FFZY,400
|
2
|
+
cool_seq_tool/app.py,sha256=vyqlQRffC8sWZXMm-f_f-8WuTTWo3oRNfPUa_qdPV2M,4944
|
3
|
+
cool_seq_tool/schemas.py,sha256=HInmKpsujybVR6pRmkKNOIzPCBqk9Ni5q1ZKNFtip50,3945
|
4
|
+
cool_seq_tool/utils.py,sha256=kesu7UnOplDzvNBg_G-_m1xMM22979nmsi4yWtweetU,2959
|
5
|
+
cool_seq_tool/handlers/__init__.py,sha256=KalQ46vX1MO4SJz2SlspKoIRy1n3c3Vp1t4Y2pIfqow,78
|
6
|
+
cool_seq_tool/handlers/seqrepo_access.py,sha256=Jd19jbdUvPRPn_XWozL67ph-nSIxpb4_UUimapDrsm4,9162
|
7
|
+
cool_seq_tool/mappers/__init__.py,sha256=O0JRxNFk8nWxD4v5ij47xelhvfVLdEXS43l2tzRuiUE,305
|
8
|
+
cool_seq_tool/mappers/alignment.py,sha256=nV6PS3mhkQ2MD1GcpNBujBOqd3AKxYSYA9BCusFOa1o,9636
|
9
|
+
cool_seq_tool/mappers/exon_genomic_coords.py,sha256=lfmzuVXaYT7w2FBDS3xhJNgETusllomFy5Utzhfhlpc,48782
|
10
|
+
cool_seq_tool/mappers/liftover.py,sha256=lltx9zxfkrb5PHtJlKp3a39JCwPP4e0Zft-mQc1jXL8,3367
|
11
|
+
cool_seq_tool/mappers/mane_transcript.py,sha256=nirxlf3EGVInFYG4fsAqiEmDdTc_h1XuPyX2ul-a7Rk,54368
|
12
|
+
cool_seq_tool/resources/__init__.py,sha256=VwUC8YaucTS6SmRirToulZTF6CuvuLQRSxFfSfAovCc,77
|
13
|
+
cool_seq_tool/resources/data_files.py,sha256=3lhu28tzlSoTs4vHZNu-hhoAWRrPGuZj_oIjqk2sYQM,3837
|
14
|
+
cool_seq_tool/resources/status.py,sha256=L0KM-VG3N4Yuaqh3AKZd_2KPDLR0Y7rvW_OD6x8mF7A,5717
|
15
|
+
cool_seq_tool/resources/transcript_mapping.tsv,sha256=AO3luYQAbFiCoRgiiPXotakb5pAwx1jDCeXpvGdIuac,24138769
|
16
|
+
cool_seq_tool/sources/__init__.py,sha256=51QiymeptF7AeVGgV-tW_9f4pIUr0xtYbyzpvHOCneM,304
|
17
|
+
cool_seq_tool/sources/mane_transcript_mappings.py,sha256=E_pj7FEBcB6HUR8yhSVibB0beMMlKJ62pK0qvl4y5nw,5358
|
18
|
+
cool_seq_tool/sources/transcript_mappings.py,sha256=903RKTMBO2rbKh6iTQ1BEWnY4C7saBFMPw2_4ATuudg,10054
|
19
|
+
cool_seq_tool/sources/uta_database.py,sha256=gc5wsKOIhvzhwFmPmqOY0hhaVfRkRSzYNa9tpBt81_U,35017
|
20
|
+
cool_seq_tool-0.7.1.dist-info/LICENSE,sha256=IpqC9A-tZW7XXXvCS8c4AVINqkmpxiVA-34Qe3CZSjo,1072
|
21
|
+
cool_seq_tool-0.7.1.dist-info/METADATA,sha256=Y9_RZI2iHpmNOFwXoFCCKyHs6aXmNrzKQfyHkmqUVmQ,6226
|
22
|
+
cool_seq_tool-0.7.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
23
|
+
cool_seq_tool-0.7.1.dist-info/top_level.txt,sha256=cGuxdN6p3y16jQf6hCwWhE4OptwUeZPm_PNJlPb3b0k,14
|
24
|
+
cool_seq_tool-0.7.1.dist-info/RECORD,,
|