cool-seq-tool 0.4.0.dev2__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cool_seq_tool/__init__.py +1 -3
- cool_seq_tool/api.py +1 -2
- cool_seq_tool/app.py +42 -24
- cool_seq_tool/handlers/__init__.py +1 -0
- cool_seq_tool/handlers/seqrepo_access.py +13 -15
- cool_seq_tool/mappers/__init__.py +1 -0
- cool_seq_tool/mappers/alignment.py +5 -6
- cool_seq_tool/mappers/exon_genomic_coords.py +232 -68
- cool_seq_tool/mappers/mane_transcript.py +84 -86
- cool_seq_tool/resources/__init__.py +1 -0
- cool_seq_tool/resources/data_files.py +93 -0
- cool_seq_tool/resources/status.py +151 -0
- cool_seq_tool/routers/__init__.py +1 -0
- cool_seq_tool/routers/default.py +1 -0
- cool_seq_tool/routers/mane.py +4 -4
- cool_seq_tool/routers/mappings.py +2 -2
- cool_seq_tool/schemas.py +83 -37
- cool_seq_tool/sources/__init__.py +1 -0
- cool_seq_tool/sources/mane_transcript_mappings.py +14 -7
- cool_seq_tool/sources/transcript_mappings.py +41 -32
- cool_seq_tool/sources/uta_database.py +120 -69
- cool_seq_tool/utils.py +2 -2
- cool_seq_tool/version.py +2 -1
- {cool_seq_tool-0.4.0.dev2.dist-info → cool_seq_tool-0.4.1.dist-info}/LICENSE +1 -1
- {cool_seq_tool-0.4.0.dev2.dist-info → cool_seq_tool-0.4.1.dist-info}/METADATA +15 -8
- cool_seq_tool-0.4.1.dist-info/RECORD +29 -0
- {cool_seq_tool-0.4.0.dev2.dist-info → cool_seq_tool-0.4.1.dist-info}/WHEEL +1 -1
- cool_seq_tool/data/__init__.py +0 -2
- cool_seq_tool/data/data_downloads.py +0 -89
- cool_seq_tool/paths.py +0 -28
- cool_seq_tool-0.4.0.dev2.dist-info/RECORD +0 -29
- /cool_seq_tool/{data → resources}/transcript_mapping.tsv +0 -0
- {cool_seq_tool-0.4.0.dev2.dist-info → cool_seq_tool-0.4.1.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,10 @@
|
|
1
1
|
"""Provide transcript lookup and metadata tools via the UTA database."""
|
2
|
+
|
2
3
|
import ast
|
3
4
|
import base64
|
4
5
|
import logging
|
5
6
|
from os import environ
|
6
|
-
from typing import Any,
|
7
|
+
from typing import Any, Literal, TypeVar
|
7
8
|
from urllib.parse import ParseResult as UrlLibParseResult
|
8
9
|
from urllib.parse import quote, unquote, urlparse
|
9
10
|
|
@@ -24,12 +25,43 @@ LIFTOVER_CHAIN_37_TO_38 = environ.get("LIFTOVER_CHAIN_37_TO_38")
|
|
24
25
|
LIFTOVER_CHAIN_38_TO_37 = environ.get("LIFTOVER_CHAIN_38_TO_37")
|
25
26
|
|
26
27
|
UTA_DB_URL = environ.get(
|
27
|
-
"UTA_DB_URL", "postgresql://uta_admin:uta@localhost:
|
28
|
+
"UTA_DB_URL", "postgresql://uta_admin:uta@localhost:5432/uta/uta_20210129b"
|
28
29
|
)
|
29
30
|
|
30
31
|
logger = logging.getLogger(__name__)
|
31
32
|
|
32
33
|
|
34
|
+
def get_liftover(
    chain_file_37_to_38: str | None = None, chain_file_38_to_37: str | None = None
) -> tuple[Converter, Converter]:
    """Build the pair of liftover converters between GRCh37 and GRCh38.

    Kept as a module-level function (rather than inline in the UTA database
    initializer) so that lightweight status checks can construct converters
    without creating a database object.

    :param chain_file_37_to_38: Optional path to a 37 -> 38 chain file for ``agct``.
        When omitted, the ``LIFTOVER_CHAIN_37_TO_38`` env var is used if set;
        otherwise ``agct`` is left to obtain a chain file itself (from UCSC).
    :param chain_file_38_to_37: Optional path to a 38 -> 37 chain file for ``agct``.
        When omitted, the ``LIFTOVER_CHAIN_38_TO_37`` env var is used if set;
        otherwise ``agct`` is left to obtain a chain file itself (from UCSC).
    :return: converters (37->38, 38->37)
    """
    # An explicit argument wins over the environment-provided default.
    path_37_to_38 = chain_file_37_to_38 or LIFTOVER_CHAIN_37_TO_38
    forward = (
        Converter(chainfile=path_37_to_38)
        if path_37_to_38
        else Converter(from_db=Genome.HG19, to_db=Genome.HG38)
    )

    path_38_to_37 = chain_file_38_to_37 or LIFTOVER_CHAIN_38_TO_37
    backward = (
        Converter(chainfile=path_38_to_37)
        if path_38_to_37
        else Converter(from_db=Genome.HG38, to_db=Genome.HG19)
    )
    return forward, backward
|
63
|
+
|
64
|
+
|
33
65
|
class UtaDatabase:
|
34
66
|
"""Provide transcript lookup and metadata tools via the Universal Transcript Archive
|
35
67
|
(UTA) database.
|
@@ -46,8 +78,8 @@ class UtaDatabase:
|
|
46
78
|
def __init__(
|
47
79
|
self,
|
48
80
|
db_url: str = UTA_DB_URL,
|
49
|
-
chain_file_37_to_38:
|
50
|
-
chain_file_38_to_37:
|
81
|
+
chain_file_37_to_38: str | None = None,
|
82
|
+
chain_file_38_to_37: str | None = None,
|
51
83
|
) -> None:
|
52
84
|
"""Initialize DB class. Should only be used by ``create()`` method, and not
|
53
85
|
be called directly by a user.
|
@@ -68,20 +100,11 @@ class UtaDatabase:
|
|
68
100
|
original_pwd = db_url.split("//")[-1].split("@")[0].split(":")[-1]
|
69
101
|
self.db_url = db_url.replace(original_pwd, quote(original_pwd))
|
70
102
|
self.args = self._get_conn_args()
|
103
|
+
self.liftover_37_to_38, self.liftover_38_to_37 = get_liftover(
|
104
|
+
chain_file_37_to_38, chain_file_38_to_37
|
105
|
+
)
|
71
106
|
|
72
|
-
|
73
|
-
if chain_file_37_to_38:
|
74
|
-
self.liftover_37_to_38 = Converter(chainfile=chain_file_37_to_38)
|
75
|
-
else:
|
76
|
-
self.liftover_37_to_38 = Converter(from_db=Genome.HG19, to_db=Genome.HG38)
|
77
|
-
|
78
|
-
chain_file_38_to_37 = chain_file_38_to_37 or LIFTOVER_CHAIN_38_TO_37
|
79
|
-
if chain_file_38_to_37:
|
80
|
-
self.liftover_38_to_37 = Converter(chainfile=chain_file_38_to_37)
|
81
|
-
else:
|
82
|
-
self.liftover_38_to_37 = Converter(from_db=Genome.HG38, to_db=Genome.HG19)
|
83
|
-
|
84
|
-
def _get_conn_args(self) -> Dict:
|
107
|
+
def _get_conn_args(self) -> dict:
|
85
108
|
"""Return connection arguments.
|
86
109
|
|
87
110
|
:param db_url: raw connection URL
|
@@ -99,9 +122,9 @@ class UtaDatabase:
|
|
99
122
|
self.schema = schema
|
100
123
|
|
101
124
|
environ["PGPASSWORD"] = password
|
102
|
-
environ[
|
103
|
-
"
|
104
|
-
|
125
|
+
environ["UTA_DB_URL"] = (
|
126
|
+
f"postgresql://{username}@{host}:{port}/{database}/{schema}"
|
127
|
+
)
|
105
128
|
return {
|
106
129
|
"host": host,
|
107
130
|
"port": int(port),
|
@@ -145,7 +168,7 @@ class UtaDatabase:
|
|
145
168
|
|
146
169
|
@classmethod
|
147
170
|
async def create(
|
148
|
-
cls:
|
171
|
+
cls: type[UTADatabaseType], db_url: str = UTA_DB_URL
|
149
172
|
) -> UTADatabaseType:
|
150
173
|
"""Manufacture a fully-initialized class instance (a la factory pattern). This
|
151
174
|
method should be used instead of calling the class directly to create a new
|
@@ -173,7 +196,10 @@ class UtaDatabase:
|
|
173
196
|
"""
|
174
197
|
|
175
198
|
async def _execute_query(q: str) -> Any: # noqa: ANN401
|
176
|
-
async with
|
199
|
+
async with (
|
200
|
+
self._connection_pool.acquire() as connection,
|
201
|
+
connection.transaction(),
|
202
|
+
):
|
177
203
|
return await connection.fetch(q)
|
178
204
|
|
179
205
|
if not self._connection_pool:
|
@@ -234,25 +260,22 @@ class UtaDatabase:
|
|
234
260
|
await self.execute_query(create_index)
|
235
261
|
|
236
262
|
@staticmethod
|
237
|
-
def _transform_list(li:
|
263
|
+
def _transform_list(li: list) -> list[list[Any]]:
|
238
264
|
"""Transform list to only contain field values
|
239
265
|
|
240
266
|
:param li: List of asyncpg.Record objects
|
241
267
|
:return: List of list of objects
|
242
268
|
"""
|
243
|
-
|
244
|
-
for item in li:
|
245
|
-
results.append(list(item))
|
246
|
-
return results
|
269
|
+
return [list(i) for i in li]
|
247
270
|
|
248
271
|
async def get_genes_and_alt_acs(
|
249
272
|
self,
|
250
273
|
pos: int,
|
251
|
-
strand:
|
252
|
-
chromosome:
|
253
|
-
alt_ac:
|
254
|
-
gene:
|
255
|
-
) ->
|
274
|
+
strand: Strand | None = None,
|
275
|
+
chromosome: int | None = None,
|
276
|
+
alt_ac: str | None = None,
|
277
|
+
gene: str | None = None,
|
278
|
+
) -> tuple[dict | None, str | None]:
|
256
279
|
"""Return genes and genomic accessions for a position on a chromosome or alt_ac
|
257
280
|
|
258
281
|
:param pos: Genomic position
|
@@ -309,8 +332,8 @@ class UtaDatabase:
|
|
309
332
|
return {"genes": genes, "alt_acs": alt_acs}, None
|
310
333
|
|
311
334
|
async def get_tx_exons(
|
312
|
-
self, tx_ac: str, alt_ac:
|
313
|
-
) ->
|
335
|
+
self, tx_ac: str, alt_ac: str | None = None
|
336
|
+
) -> tuple[list[tuple[int, int]] | None, str | None]:
|
314
337
|
"""Get list of transcript exons start/end coordinates.
|
315
338
|
|
316
339
|
:param tx_ac: Transcript accession
|
@@ -348,9 +371,39 @@ class UtaDatabase:
|
|
348
371
|
tx_exons = [(r["tx_start_i"], r["tx_end_i"]) for r in result]
|
349
372
|
return tx_exons, None
|
350
373
|
|
374
|
+
async def get_tx_exons_genomic_coords(
|
375
|
+
self,
|
376
|
+
tx_ac: str,
|
377
|
+
alt_ac: str,
|
378
|
+
) -> tuple[tuple[int, int, int, int, int] | None, str | None]:
|
379
|
+
"""Get exon number, transcript coordinates, and genomic coordinates
|
380
|
+
|
381
|
+
:param tx_ac: Transcript accession
|
382
|
+
:param alt_ac: RefSeq genomic accession
|
383
|
+
:return: Tuple of exon numbers, transcript and genomic coordinates,
|
384
|
+
and warnings if found
|
385
|
+
"""
|
386
|
+
query = f"""
|
387
|
+
SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i
|
388
|
+
FROM {self.schema}.tx_exon_aln_v
|
389
|
+
WHERE tx_ac = '{tx_ac}'
|
390
|
+
AND alt_ac = '{alt_ac}'
|
391
|
+
""" # noqa: S608
|
392
|
+
result = await self.execute_query(query)
|
393
|
+
|
394
|
+
if not result:
|
395
|
+
msg = f"Unable to get exons and genomic coordinates for {tx_ac} on {alt_ac}"
|
396
|
+
logger.warning(msg)
|
397
|
+
return None, msg
|
398
|
+
tx_exons_genomic_coords = [
|
399
|
+
(r["ord"], r["tx_start_i"], r["tx_end_i"], r["alt_start_i"], r["alt_end_i"])
|
400
|
+
for r in result
|
401
|
+
]
|
402
|
+
return tx_exons_genomic_coords, None
|
403
|
+
|
351
404
|
async def get_alt_ac_start_or_end(
|
352
|
-
self, tx_ac: str, tx_exon_start: int, tx_exon_end: int, gene:
|
353
|
-
) ->
|
405
|
+
self, tx_ac: str, tx_exon_start: int, tx_exon_end: int, gene: str | None
|
406
|
+
) -> tuple[tuple[str, str, int, int, int] | None, str | None]:
|
354
407
|
"""Get genomic data for related transcript exon start or end.
|
355
408
|
|
356
409
|
:param tx_ac: Transcript accession
|
@@ -390,7 +443,7 @@ class UtaDatabase:
|
|
390
443
|
result = result[0]
|
391
444
|
return (result[0], result[1], result[2], result[3], result[4]), None
|
392
445
|
|
393
|
-
async def get_cds_start_end(self, tx_ac: str) ->
|
446
|
+
async def get_cds_start_end(self, tx_ac: str) -> tuple[int, int] | None:
|
394
447
|
"""Get coding start and end site
|
395
448
|
|
396
449
|
:param tx_ac: Transcript accession
|
@@ -414,7 +467,7 @@ class UtaDatabase:
|
|
414
467
|
)
|
415
468
|
return None
|
416
469
|
|
417
|
-
async def get_newest_assembly_ac(self, ac: str) ->
|
470
|
+
async def get_newest_assembly_ac(self, ac: str) -> list[str]:
|
418
471
|
"""Find accession associated to latest genomic assembly
|
419
472
|
|
420
473
|
:param ac: Accession
|
@@ -459,7 +512,7 @@ class UtaDatabase:
|
|
459
512
|
result = await self.execute_query(query)
|
460
513
|
return result[0][0]
|
461
514
|
|
462
|
-
async def get_ac_descr(self, ac: str) ->
|
515
|
+
async def get_ac_descr(self, ac: str) -> str | None:
|
463
516
|
"""Return accession description. This is typically available only for accessions
|
464
517
|
from older (pre-GRCh38) builds.
|
465
518
|
|
@@ -494,10 +547,10 @@ class UtaDatabase:
|
|
494
547
|
tx_ac: str,
|
495
548
|
start_pos: int,
|
496
549
|
end_pos: int,
|
497
|
-
alt_ac:
|
550
|
+
alt_ac: str | None = None,
|
498
551
|
use_tx_pos: bool = True,
|
499
552
|
like_tx_ac: bool = False,
|
500
|
-
) ->
|
553
|
+
) -> list:
|
501
554
|
"""Return queried data from tx_exon_aln_v table.
|
502
555
|
|
503
556
|
:param tx_ac: accession on c. coordinate
|
@@ -562,13 +615,10 @@ class UtaDatabase:
|
|
562
615
|
temp_ac,
|
563
616
|
alt_ac,
|
564
617
|
)
|
565
|
-
|
566
|
-
for r in result:
|
567
|
-
results.append(list(r))
|
568
|
-
return results
|
618
|
+
return [list(r) for r in result]
|
569
619
|
|
570
620
|
@staticmethod
|
571
|
-
def data_from_result(result:
|
621
|
+
def data_from_result(result: list) -> dict | None:
|
572
622
|
"""Return data found from result.
|
573
623
|
|
574
624
|
:param result: Data from tx_exon_aln_v table
|
@@ -601,8 +651,8 @@ class UtaDatabase:
|
|
601
651
|
}
|
602
652
|
|
603
653
|
async def get_mane_c_genomic_data(
|
604
|
-
self, ac: str, alt_ac:
|
605
|
-
) ->
|
654
|
+
self, ac: str, alt_ac: str | None, start_pos: int, end_pos: int
|
655
|
+
) -> dict | None:
|
606
656
|
"""Get MANE transcript and genomic data. Used when going from g. to MANE c.
|
607
657
|
representation.
|
608
658
|
|
@@ -667,13 +717,12 @@ class UtaDatabase:
|
|
667
717
|
async def get_genomic_tx_data(
|
668
718
|
self,
|
669
719
|
tx_ac: str,
|
670
|
-
pos:
|
671
|
-
annotation_layer:
|
672
|
-
|
673
|
-
|
674
|
-
alt_ac: Optional[str] = None,
|
720
|
+
pos: tuple[int, int],
|
721
|
+
annotation_layer: Literal[AnnotationLayer.CDNA]
|
722
|
+
| Literal[AnnotationLayer.GENOMIC] = AnnotationLayer.CDNA,
|
723
|
+
alt_ac: str | None = None,
|
675
724
|
target_genome_assembly: Assembly = Assembly.GRCH38,
|
676
|
-
) ->
|
725
|
+
) -> dict | None:
|
677
726
|
"""Get transcript mapping to genomic data.
|
678
727
|
|
679
728
|
:param tx_ac: Accession on c. coordinate
|
@@ -730,7 +779,7 @@ class UtaDatabase:
|
|
730
779
|
|
731
780
|
return data
|
732
781
|
|
733
|
-
async def get_ac_from_gene(self, gene: str) ->
|
782
|
+
async def get_ac_from_gene(self, gene: str) -> list[str]:
|
734
783
|
"""Return genomic accession(s) associated to a gene.
|
735
784
|
|
736
785
|
:param gene: Gene symbol
|
@@ -754,14 +803,16 @@ class UtaDatabase:
|
|
754
803
|
|
755
804
|
async def get_gene_from_ac(
|
756
805
|
self, ac: str, start_pos: int, end_pos: int
|
757
|
-
) ->
|
806
|
+
) -> list[str] | None:
|
758
807
|
"""Get gene(s) within the provided coordinate range
|
759
808
|
|
760
809
|
>>> import asyncio
|
761
810
|
>>> from cool_seq_tool.sources import UtaDatabase
|
762
811
|
>>> async def get_gene():
|
763
812
|
... uta_db = await UtaDatabase.create()
|
764
|
-
... result = await uta_db.get_gene_from_ac(
|
813
|
+
... result = await uta_db.get_gene_from_ac(
|
814
|
+
... "NC_000017.11", 43044296, 43045802
|
815
|
+
... )
|
765
816
|
... return result
|
766
817
|
>>> asyncio.run(get_gene())
|
767
818
|
['BRCA1']
|
@@ -798,11 +849,11 @@ class UtaDatabase:
|
|
798
849
|
|
799
850
|
async def get_transcripts(
|
800
851
|
self,
|
801
|
-
start_pos:
|
802
|
-
end_pos:
|
803
|
-
gene:
|
852
|
+
start_pos: int | None = None,
|
853
|
+
end_pos: int | None = None,
|
854
|
+
gene: str | None = None,
|
804
855
|
use_tx_pos: bool = True,
|
805
|
-
alt_ac:
|
856
|
+
alt_ac: str | None = None,
|
806
857
|
) -> pl.DataFrame:
|
807
858
|
"""Get transcripts for a given ``gene`` or ``alt_ac`` related to optional positions.
|
808
859
|
|
@@ -876,7 +927,7 @@ class UtaDatabase:
|
|
876
927
|
results_df = results_df.unique()
|
877
928
|
return results_df
|
878
929
|
|
879
|
-
async def get_chr_assembly(self, ac: str) ->
|
930
|
+
async def get_chr_assembly(self, ac: str) -> tuple[str, str] | None:
|
880
931
|
"""Get chromosome and assembly for NC accession if not in GRCh38.
|
881
932
|
|
882
933
|
:param ac: NC accession
|
@@ -899,7 +950,7 @@ class UtaDatabase:
|
|
899
950
|
|
900
951
|
return chromosome, assembly
|
901
952
|
|
902
|
-
async def liftover_to_38(self, genomic_tx_data:
|
953
|
+
async def liftover_to_38(self, genomic_tx_data: dict) -> None:
|
903
954
|
"""Liftover genomic_tx_data to hg38 assembly.
|
904
955
|
|
905
956
|
:param genomic_tx_data: Dictionary containing gene, nc_accession, alt_pos, and
|
@@ -955,7 +1006,7 @@ class UtaDatabase:
|
|
955
1006
|
|
956
1007
|
def get_liftover(
|
957
1008
|
self, chromosome: str, pos: int, liftover_to_assembly: Assembly
|
958
|
-
) ->
|
1009
|
+
) -> tuple[str, int] | None:
|
959
1010
|
"""Get new genome assembly data for a position on a chromosome.
|
960
1011
|
|
961
1012
|
:param chromosome: The chromosome number. Must be prefixed with ``chr``
|
@@ -982,7 +1033,7 @@ class UtaDatabase:
|
|
982
1033
|
|
983
1034
|
def _set_liftover(
|
984
1035
|
self,
|
985
|
-
genomic_tx_data:
|
1036
|
+
genomic_tx_data: dict,
|
986
1037
|
key: str,
|
987
1038
|
chromosome: str,
|
988
1039
|
liftover_to_assembly: Assembly,
|
@@ -1019,7 +1070,7 @@ class UtaDatabase:
|
|
1019
1070
|
|
1020
1071
|
genomic_tx_data[key] = liftover_start_i[1], liftover_end_i[1]
|
1021
1072
|
|
1022
|
-
async def p_to_c_ac(self, p_ac: str) ->
|
1073
|
+
async def p_to_c_ac(self, p_ac: str) -> list[str]:
|
1023
1074
|
"""Return cDNA reference sequence accession from protein reference sequence
|
1024
1075
|
accession (i.e. ``p.`` to ``c.`` in HGVS syntax)
|
1025
1076
|
|
@@ -1049,7 +1100,7 @@ class UtaDatabase:
|
|
1049
1100
|
|
1050
1101
|
async def get_transcripts_from_genomic_pos(
|
1051
1102
|
self, alt_ac: str, g_pos: int
|
1052
|
-
) ->
|
1103
|
+
) -> list[str]:
|
1053
1104
|
"""Get transcripts associated to a genomic ac and position.
|
1054
1105
|
|
1055
1106
|
:param alt_ac: Genomic accession
|
@@ -1115,13 +1166,13 @@ class ParseResult(UrlLibParseResult):
|
|
1115
1166
|
return super(ParseResult, cls).__new__(cls, *pr) # noqa: UP008
|
1116
1167
|
|
1117
1168
|
@property
|
1118
|
-
def database(self) ->
|
1169
|
+
def database(self) -> str | None:
|
1119
1170
|
"""Create database property."""
|
1120
1171
|
path_elems = self.path.split("/")
|
1121
1172
|
return path_elems[1] if len(path_elems) > 1 else None
|
1122
1173
|
|
1123
1174
|
@property
|
1124
|
-
def schema(self) ->
|
1175
|
+
def schema(self) -> str | None:
|
1125
1176
|
"""Create schema property."""
|
1126
1177
|
path_elems = self.path.split("/")
|
1127
1178
|
return path_elems[2] if len(path_elems) > 2 else None
|
cool_seq_tool/utils.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
"""Provide a small set of general helper functions."""
|
2
|
+
|
2
3
|
import datetime
|
3
4
|
import logging
|
4
|
-
from typing import Tuple
|
5
5
|
|
6
6
|
from cool_seq_tool.schemas import ResidueMode, ServiceMeta
|
7
7
|
from cool_seq_tool.version import __version__
|
@@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)
|
|
11
11
|
|
12
12
|
def get_inter_residue_pos(
|
13
13
|
start_pos: int, end_pos: int, residue_mode: ResidueMode
|
14
|
-
) ->
|
14
|
+
) -> tuple[int, int]:
|
15
15
|
"""Return equivalent inter-residue position.
|
16
16
|
|
17
17
|
Generally, we prefer to work with inter-residue coordinates where possible. Our
|
cool_seq_tool/version.py
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: cool_seq_tool
|
3
|
-
Version: 0.4.
|
3
|
+
Version: 0.4.1
|
4
4
|
Summary: Common Operation on Lots of Sequences Tool
|
5
5
|
Author: Kori Kuzma, James Stevenson, Katie Stahl, Alex Wagner
|
6
6
|
License: MIT License
|
7
7
|
|
8
|
-
Copyright (c) 2021-
|
8
|
+
Copyright (c) 2021-2024 Wagner Lab
|
9
9
|
|
10
10
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
11
11
|
of this software and associated documentation files (the "Software"), to deal
|
@@ -26,7 +26,7 @@ License: MIT License
|
|
26
26
|
SOFTWARE.
|
27
27
|
|
28
28
|
Project-URL: Homepage, https://github.com/genomicmedlab/cool-seq-tool
|
29
|
-
Project-URL: Documentation, https://coolseqtool.readthedocs.io/
|
29
|
+
Project-URL: Documentation, https://coolseqtool.readthedocs.io/
|
30
30
|
Project-URL: Changelog, https://github.com/genomicmedlab/cool-seq-tool/releases
|
31
31
|
Project-URL: Source, https://github.com/genomicmedlab/cool-seq-tool
|
32
32
|
Project-URL: Bug Tracker, https://github.com/genomicmedlab/cool-seq-tool/issues
|
@@ -39,30 +39,30 @@ Classifier: Intended Audience :: Developers
|
|
39
39
|
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
40
40
|
Classifier: License :: OSI Approved :: MIT License
|
41
41
|
Classifier: Programming Language :: Python :: 3
|
42
|
-
Classifier: Programming Language :: Python :: 3.8
|
43
|
-
Classifier: Programming Language :: Python :: 3.9
|
44
42
|
Classifier: Programming Language :: Python :: 3.10
|
45
43
|
Classifier: Programming Language :: Python :: 3.11
|
46
|
-
|
44
|
+
Classifier: Programming Language :: Python :: 3.12
|
45
|
+
Requires-Python: >=3.10
|
47
46
|
Description-Content-Type: text/markdown
|
48
47
|
License-File: LICENSE
|
49
48
|
Requires-Dist: asyncpg
|
50
49
|
Requires-Dist: aiofiles
|
51
50
|
Requires-Dist: boto3
|
52
51
|
Requires-Dist: agct >=0.1.0-dev1
|
53
|
-
Requires-Dist: polars
|
52
|
+
Requires-Dist: polars ~=1.0
|
54
53
|
Requires-Dist: hgvs
|
55
54
|
Requires-Dist: biocommons.seqrepo
|
56
55
|
Requires-Dist: pydantic ==2.*
|
57
56
|
Requires-Dist: uvicorn
|
58
57
|
Requires-Dist: fastapi
|
59
58
|
Requires-Dist: ga4gh.vrs
|
59
|
+
Requires-Dist: wags-tails ~=0.1.3
|
60
60
|
Provides-Extra: dev
|
61
61
|
Requires-Dist: pre-commit ; extra == 'dev'
|
62
62
|
Requires-Dist: ipython ; extra == 'dev'
|
63
63
|
Requires-Dist: ipykernel ; extra == 'dev'
|
64
64
|
Requires-Dist: psycopg2-binary ; extra == 'dev'
|
65
|
-
Requires-Dist: ruff
|
65
|
+
Requires-Dist: ruff ==0.5.0 ; extra == 'dev'
|
66
66
|
Provides-Extra: docs
|
67
67
|
Requires-Dist: sphinx ==6.1.3 ; extra == 'docs'
|
68
68
|
Requires-Dist: sphinx-autodoc-typehints ==1.22.0 ; extra == 'docs'
|
@@ -81,8 +81,14 @@ Requires-Dist: mock ; extra == 'tests'
|
|
81
81
|
CoolSeqTool
|
82
82
|
</h1>
|
83
83
|
|
84
|
+
[PyPI version](https://pypi.python.org/pypi/cool-seq-tool) [License](https://pypi.python.org/pypi/cool-seq-tool) [Python versions](https://pypi.python.org/pypi/cool-seq-tool) [CI status](https://github.com/genomicmedlab/cool-seq-tool/actions/checks.yaml)
|
85
|
+
|
86
|
+
---
|
87
|
+
|
84
88
|
**[Documentation](https://coolseqtool.readthedocs.io/latest/)** · [Installation](https://coolseqtool.readthedocs.io/latest/install.html) · [Usage](https://coolseqtool.readthedocs.io/latest/usage.html) · [API reference](https://coolseqtool.readthedocs.io/latest/reference/index.html)
|
85
89
|
|
90
|
+
---
|
91
|
+
|
86
92
|
## Overview
|
87
93
|
|
88
94
|
<!-- description -->
|
@@ -113,6 +119,7 @@ All CoolSeqTool resources can be initialized by way of a top-level class instanc
|
|
113
119
|
|
114
120
|
```pycon
|
115
121
|
>>> from cool_seq_tool.app import CoolSeqTool
|
122
|
+
>>> from cool_seq_tool.schemas import AnnotationLayer, ResidueMode
|
116
123
|
>>> cst = CoolSeqTool()
|
117
124
|
>>> result = await cst.mane_transcript.get_mane_transcript(
|
118
125
|
... "NP_004324.2",
|
@@ -0,0 +1,29 @@
|
|
1
|
+
cool_seq_tool/__init__.py,sha256=BTfkS0bkMtxBL4yGHc4Z7ubmNhdhY2WALfadnk8N1lw,280
|
2
|
+
cool_seq_tool/api.py,sha256=AbCmdUVH8ltwqH8k7DiVsHpujMzb6c5pyAKY12iIC0U,1210
|
3
|
+
cool_seq_tool/app.py,sha256=5dBmzTf5SeIF90y_ZyI0K6AMSKgchC33eW_ABN6D8_s,4790
|
4
|
+
cool_seq_tool/schemas.py,sha256=8xGrP0rAcKLXtZYEe_DJcNp4zapjhN0StRq8uCjoobE,16720
|
5
|
+
cool_seq_tool/utils.py,sha256=lckkyFKxMAqG79SYO3p28q6BWgEjlQP7CumE2TDP1zc,1601
|
6
|
+
cool_seq_tool/version.py,sha256=hs3N9Wl67casrrQa2sGIAcpcaUySVk4oLE7JffoQuCI,53
|
7
|
+
cool_seq_tool/handlers/__init__.py,sha256=KalQ46vX1MO4SJz2SlspKoIRy1n3c3Vp1t4Y2pIfqow,78
|
8
|
+
cool_seq_tool/handlers/seqrepo_access.py,sha256=JB3cg7YiV2JKa7ImJXz4WtP9XWShk9qYvhCCrZnBQ6M,8983
|
9
|
+
cool_seq_tool/mappers/__init__.py,sha256=SMSf6sPcu7mdQNuJ4Cj1mbOwFUPuMdFSf0noY4XvTxE,262
|
10
|
+
cool_seq_tool/mappers/alignment.py,sha256=6Vk4XEar54ivuH8N7oBqa9gUa8E5GjWCI9hC1HCkM18,9552
|
11
|
+
cool_seq_tool/mappers/exon_genomic_coords.py,sha256=tOmo6kFGcFIRmLBQwSsIZUSiratiyACf946YKV_IU78,38544
|
12
|
+
cool_seq_tool/mappers/mane_transcript.py,sha256=RrVRUS4IqxxX-HyamNLqpQ_WVWABgiLqwmmIh92uny8,49264
|
13
|
+
cool_seq_tool/resources/__init__.py,sha256=VwUC8YaucTS6SmRirToulZTF6CuvuLQRSxFfSfAovCc,77
|
14
|
+
cool_seq_tool/resources/data_files.py,sha256=3lhu28tzlSoTs4vHZNu-hhoAWRrPGuZj_oIjqk2sYQM,3837
|
15
|
+
cool_seq_tool/resources/status.py,sha256=ENsLiwSxzJOLOsY5IKDM805UWbQAOV3w9s7Rv_FLAUs,5761
|
16
|
+
cool_seq_tool/resources/transcript_mapping.tsv,sha256=AO3luYQAbFiCoRgiiPXotakb5pAwx1jDCeXpvGdIuac,24138769
|
17
|
+
cool_seq_tool/routers/__init__.py,sha256=7SqhLv6_mDPpK1Q0L9aykmjhCmsymFqgbSWZH8LuCW0,437
|
18
|
+
cool_seq_tool/routers/default.py,sha256=zqeQmHmfGUvV32xLbN-fUfYnK_UI1gpqIL8Eu5Y8KzY,3928
|
19
|
+
cool_seq_tool/routers/mane.py,sha256=boZKP5PH0BAcqEeTBBr9Z3EMY4lhvLLX-pJxUqjBZQ0,3508
|
20
|
+
cool_seq_tool/routers/mappings.py,sha256=UJaip0QvRfK3Lk3eVuwofUwg2XJqMV5OVY9OLcpnWS4,6061
|
21
|
+
cool_seq_tool/sources/__init__.py,sha256=51QiymeptF7AeVGgV-tW_9f4pIUr0xtYbyzpvHOCneM,304
|
22
|
+
cool_seq_tool/sources/mane_transcript_mappings.py,sha256=9Rd_tRCrTk9i9Urp-pMMttq4cCbIJaEJ0n8rM9y9-7I,4077
|
23
|
+
cool_seq_tool/sources/transcript_mappings.py,sha256=903RKTMBO2rbKh6iTQ1BEWnY4C7saBFMPw2_4ATuudg,10054
|
24
|
+
cool_seq_tool/sources/uta_database.py,sha256=GJHhYbH130YJo9FIRroR8eavlbaziMwI0JVNP8IPGPM,45636
|
25
|
+
cool_seq_tool-0.4.1.dist-info/LICENSE,sha256=IpqC9A-tZW7XXXvCS8c4AVINqkmpxiVA-34Qe3CZSjo,1072
|
26
|
+
cool_seq_tool-0.4.1.dist-info/METADATA,sha256=CnZwl-rVLfY6kcVkQKYsYziT19q48qHRkYFQ96-OCx0,6262
|
27
|
+
cool_seq_tool-0.4.1.dist-info/WHEEL,sha256=y4mX-SOX4fYIkonsAGA5N0Oy-8_gI4FXw5HNI1xqvWg,91
|
28
|
+
cool_seq_tool-0.4.1.dist-info/top_level.txt,sha256=cGuxdN6p3y16jQf6hCwWhE4OptwUeZPm_PNJlPb3b0k,14
|
29
|
+
cool_seq_tool-0.4.1.dist-info/RECORD,,
|
cool_seq_tool/data/__init__.py
DELETED
@@ -1,89 +0,0 @@
|
|
1
|
-
"""Handle acquisition of external data."""
|
2
|
-
import datetime
|
3
|
-
import gzip
|
4
|
-
import logging
|
5
|
-
import shutil
|
6
|
-
from ftplib import FTP
|
7
|
-
from pathlib import Path
|
8
|
-
|
9
|
-
from dateutil import parser
|
10
|
-
|
11
|
-
from cool_seq_tool import APP_ROOT
|
12
|
-
|
13
|
-
logger = logging.getLogger("cool_seq_tool")
|
14
|
-
|
15
|
-
|
16
|
-
class DataDownload:
    """Manage downloadable data files. Responsible for checking if files are available
    under expected locations, and fetching them if not.

    Relevant methods are called automatically by data classes; users should not have
    to interact with this class under normal circumstances.
    """

    def __init__(self) -> None:
        """Initialize downloadable data locations."""
        self._data_dir = APP_ROOT / "data"

    @staticmethod
    def _version_from_mdtm(response: str) -> str:
        """Turn an FTP ``MDTM`` reply into a ``YYYYMMDD`` version string.

        RFC 3659 specifies ``MDTM`` timestamps as ``YYYYMMDDHHMMSS[.sss]`` in UTC,
        so the value is parsed as UTC directly. Routing it through a naive
        datetime and ``astimezone`` would reinterpret it in the host's local
        zone and could move the date by a day.

        :param response: Full reply line, e.g. ``"213 20240105123456"``
        :return: Date portion of the timestamp formatted as ``YYYYMMDD``
        :raise ValueError: if the reply does not carry a valid timestamp
        """
        # Drop the "213 " status prefix and any trailing whitespace/CRLF.
        stamp = response[4:].strip()
        modified = datetime.datetime.strptime(stamp[:14], "%Y%m%d%H%M%S").replace(
            tzinfo=datetime.timezone.utc
        )
        return modified.strftime("%Y%m%d")

    def get_mane_summary(self) -> Path:
        """Identify latest MANE summary data. If unavailable locally, download from
        `NCBI FTP server <https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/current/>`_.

        :return: path to MANE summary file
        :raise Exception: if no ``*.summary.txt.gz`` file is listed on the server
        """
        with FTP("ftp.ncbi.nlm.nih.gov") as ftp:
            ftp.login()
            ftp.cwd("/refseq/MANE/MANE_human/current")
            candidates = [f for f in ftp.nlst() if f.endswith(".summary.txt.gz")]
            if not candidates:
                msg = "Unable to download MANE summary data"
                raise Exception(msg)
            mane_summary_file = candidates[0]
            # Local copy is stored decompressed: same name minus the ".gz" suffix.
            self._mane_summary_path = self._data_dir / mane_summary_file[:-3]
            mane_data_path = self._data_dir / mane_summary_file
            if not self._mane_summary_path.exists():
                logger.info("Downloading MANE summary file from NCBI.")
                with mane_data_path.open("wb") as fp:
                    ftp.retrbinary(f"RETR {mane_summary_file}", fp.write)
                with gzip.open(
                    mane_data_path, "rb"
                ) as f_in, self._mane_summary_path.open("wb") as f_out:
                    shutil.copyfileobj(f_in, f_out)
                # The compressed download is only an intermediate artifact.
                mane_data_path.unlink()
                logger.info("MANE summary file download complete.")
        return self._mane_summary_path

    def get_lrg_refseq_gene_data(self) -> Path:
        """Identify latest LRG RefSeq Gene file. If unavailable locally, download from
        `NCBI FTP server <https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/>`_.

        The local file name carries the server-side modification date as a
        ``_YYYYMMDD`` suffix, so a newer upstream file triggers a fresh download.

        :return: path to acquired LRG RefSeq Gene data file
        """
        with FTP("ftp.ncbi.nlm.nih.gov") as ftp:
            ftp.login()
            lrg_refseqgene_file = "LRG_RefSeqGene"
            ftp_dir_path = "/refseq/H_sapiens/RefSeqGene/"
            ftp_file_path = f"{ftp_dir_path}{lrg_refseqgene_file}"
            version = self._version_from_mdtm(ftp.voidcmd(f"MDTM {ftp_file_path}"))
            fn_versioned = f"{lrg_refseqgene_file}_{version}"
            lrg_refseqgene_path = self._data_dir / lrg_refseqgene_file
            self._lrg_refseqgene_path = self._data_dir / fn_versioned
            if not self._lrg_refseqgene_path.exists():
                logger.info("Downloading LRG RefSeq data from NCBI.")
                ftp.cwd(ftp_dir_path)
                with lrg_refseqgene_path.open("wb") as fp:
                    ftp.retrbinary(f"RETR {lrg_refseqgene_file}", fp.write)
                # Only a fully downloaded file is promoted to the versioned name,
                # so an interrupted transfer never looks like a valid cached copy.
                lrg_refseqgene_path.replace(self._lrg_refseqgene_path)
                logger.info("LRG RefSeq data download complete.")
        return self._lrg_refseqgene_path
|