cool-seq-tool 0.4.0.dev3__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cool_seq_tool/__init__.py +1 -3
- cool_seq_tool/api.py +1 -2
- cool_seq_tool/app.py +38 -23
- cool_seq_tool/handlers/__init__.py +1 -0
- cool_seq_tool/handlers/seqrepo_access.py +13 -15
- cool_seq_tool/mappers/__init__.py +1 -0
- cool_seq_tool/mappers/alignment.py +5 -6
- cool_seq_tool/mappers/exon_genomic_coords.py +75 -73
- cool_seq_tool/mappers/mane_transcript.py +84 -86
- cool_seq_tool/resources/__init__.py +1 -0
- cool_seq_tool/resources/data_files.py +93 -0
- cool_seq_tool/resources/status.py +151 -0
- cool_seq_tool/routers/__init__.py +1 -0
- cool_seq_tool/routers/default.py +1 -0
- cool_seq_tool/routers/mane.py +4 -4
- cool_seq_tool/routers/mappings.py +2 -2
- cool_seq_tool/schemas.py +83 -37
- cool_seq_tool/sources/__init__.py +1 -0
- cool_seq_tool/sources/mane_transcript_mappings.py +14 -7
- cool_seq_tool/sources/transcript_mappings.py +41 -32
- cool_seq_tool/sources/uta_database.py +91 -70
- cool_seq_tool/utils.py +2 -2
- cool_seq_tool/version.py +2 -1
- {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.4.1.dist-info}/LICENSE +1 -1
- {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.4.1.dist-info}/METADATA +15 -8
- cool_seq_tool-0.4.1.dist-info/RECORD +29 -0
- {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.4.1.dist-info}/WHEEL +1 -1
- cool_seq_tool/data/__init__.py +0 -2
- cool_seq_tool/data/data_downloads.py +0 -89
- cool_seq_tool/paths.py +0 -28
- cool_seq_tool-0.4.0.dev3.dist-info/RECORD +0 -29
- /cool_seq_tool/{data → resources}/transcript_mapping.tsv +0 -0
- {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.4.1.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,10 @@
|
|
1
1
|
"""Provide transcript lookup and metadata tools via the UTA database."""
|
2
|
+
|
2
3
|
import ast
|
3
4
|
import base64
|
4
5
|
import logging
|
5
6
|
from os import environ
|
6
|
-
from typing import Any,
|
7
|
+
from typing import Any, Literal, TypeVar
|
7
8
|
from urllib.parse import ParseResult as UrlLibParseResult
|
8
9
|
from urllib.parse import quote, unquote, urlparse
|
9
10
|
|
@@ -24,12 +25,43 @@ LIFTOVER_CHAIN_37_TO_38 = environ.get("LIFTOVER_CHAIN_37_TO_38")
|
|
24
25
|
LIFTOVER_CHAIN_38_TO_37 = environ.get("LIFTOVER_CHAIN_38_TO_37")
|
25
26
|
|
26
27
|
UTA_DB_URL = environ.get(
|
27
|
-
"UTA_DB_URL", "postgresql://uta_admin:uta@localhost:
|
28
|
+
"UTA_DB_URL", "postgresql://uta_admin:uta@localhost:5432/uta/uta_20210129b"
|
28
29
|
)
|
29
30
|
|
30
31
|
logger = logging.getLogger(__name__)
|
31
32
|
|
32
33
|
|
34
|
+
def get_liftover(
|
35
|
+
chain_file_37_to_38: str | None = None, chain_file_38_to_37: str | None = None
|
36
|
+
) -> tuple[Converter, Converter]:
|
37
|
+
"""Fetch Converter instances between GRCh37 and 38.
|
38
|
+
|
39
|
+
Factored out of the UTA Database initialization method to support less expensive
|
40
|
+
status check-type operations.
|
41
|
+
|
42
|
+
:param chain_file_37_to_38: Optional path to chain file for 37 to 38 assembly.
|
43
|
+
This is used for ``agct``. If this is not provided, will check to see
|
44
|
+
if ``LIFTOVER_CHAIN_37_TO_38`` env var is set. If neither is provided, will
|
45
|
+
allow ``agct`` to download a chain file from UCSC
|
46
|
+
:param chain_file_38_to_37: Optional path to chain file for 38 to 37 assembly.
|
47
|
+
This is used for ``agct``. If this is not provided, will check to see
|
48
|
+
if ``LIFTOVER_CHAIN_38_TO_37`` env var is set. If neither is provided, will
|
49
|
+
allow ``agct`` to download a chain file from UCSC
|
50
|
+
:return: converters (37->38, 38->37)
|
51
|
+
"""
|
52
|
+
chain_file_37_to_38 = chain_file_37_to_38 or LIFTOVER_CHAIN_37_TO_38
|
53
|
+
if chain_file_37_to_38:
|
54
|
+
converter_37_to_38 = Converter(chainfile=chain_file_37_to_38)
|
55
|
+
else:
|
56
|
+
converter_37_to_38 = Converter(from_db=Genome.HG19, to_db=Genome.HG38)
|
57
|
+
chain_file_38_to_37 = chain_file_38_to_37 or LIFTOVER_CHAIN_38_TO_37
|
58
|
+
if chain_file_38_to_37:
|
59
|
+
converter_38_to_37 = Converter(chainfile=chain_file_38_to_37)
|
60
|
+
else:
|
61
|
+
converter_38_to_37 = Converter(from_db=Genome.HG38, to_db=Genome.HG19)
|
62
|
+
return (converter_37_to_38, converter_38_to_37)
|
63
|
+
|
64
|
+
|
33
65
|
class UtaDatabase:
|
34
66
|
"""Provide transcript lookup and metadata tools via the Universal Transcript Archive
|
35
67
|
(UTA) database.
|
@@ -46,8 +78,8 @@ class UtaDatabase:
|
|
46
78
|
def __init__(
|
47
79
|
self,
|
48
80
|
db_url: str = UTA_DB_URL,
|
49
|
-
chain_file_37_to_38:
|
50
|
-
chain_file_38_to_37:
|
81
|
+
chain_file_37_to_38: str | None = None,
|
82
|
+
chain_file_38_to_37: str | None = None,
|
51
83
|
) -> None:
|
52
84
|
"""Initialize DB class. Should only be used by ``create()`` method, and not
|
53
85
|
be called directly by a user.
|
@@ -68,20 +100,11 @@ class UtaDatabase:
|
|
68
100
|
original_pwd = db_url.split("//")[-1].split("@")[0].split(":")[-1]
|
69
101
|
self.db_url = db_url.replace(original_pwd, quote(original_pwd))
|
70
102
|
self.args = self._get_conn_args()
|
103
|
+
self.liftover_37_to_38, self.liftover_38_to_37 = get_liftover(
|
104
|
+
chain_file_37_to_38, chain_file_38_to_37
|
105
|
+
)
|
71
106
|
|
72
|
-
|
73
|
-
if chain_file_37_to_38:
|
74
|
-
self.liftover_37_to_38 = Converter(chainfile=chain_file_37_to_38)
|
75
|
-
else:
|
76
|
-
self.liftover_37_to_38 = Converter(from_db=Genome.HG19, to_db=Genome.HG38)
|
77
|
-
|
78
|
-
chain_file_38_to_37 = chain_file_38_to_37 or LIFTOVER_CHAIN_38_TO_37
|
79
|
-
if chain_file_38_to_37:
|
80
|
-
self.liftover_38_to_37 = Converter(chainfile=chain_file_38_to_37)
|
81
|
-
else:
|
82
|
-
self.liftover_38_to_37 = Converter(from_db=Genome.HG38, to_db=Genome.HG19)
|
83
|
-
|
84
|
-
def _get_conn_args(self) -> Dict:
|
107
|
+
def _get_conn_args(self) -> dict:
|
85
108
|
"""Return connection arguments.
|
86
109
|
|
87
110
|
:param db_url: raw connection URL
|
@@ -99,9 +122,9 @@ class UtaDatabase:
|
|
99
122
|
self.schema = schema
|
100
123
|
|
101
124
|
environ["PGPASSWORD"] = password
|
102
|
-
environ[
|
103
|
-
"
|
104
|
-
|
125
|
+
environ["UTA_DB_URL"] = (
|
126
|
+
f"postgresql://{username}@{host}:{port}/{database}/{schema}"
|
127
|
+
)
|
105
128
|
return {
|
106
129
|
"host": host,
|
107
130
|
"port": int(port),
|
@@ -145,7 +168,7 @@ class UtaDatabase:
|
|
145
168
|
|
146
169
|
@classmethod
|
147
170
|
async def create(
|
148
|
-
cls:
|
171
|
+
cls: type[UTADatabaseType], db_url: str = UTA_DB_URL
|
149
172
|
) -> UTADatabaseType:
|
150
173
|
"""Manufacture a fully-initialized class instance (a la factory pattern). This
|
151
174
|
method should be used instead of calling the class directly to create a new
|
@@ -173,7 +196,10 @@ class UtaDatabase:
|
|
173
196
|
"""
|
174
197
|
|
175
198
|
async def _execute_query(q: str) -> Any: # noqa: ANN401
|
176
|
-
async with
|
199
|
+
async with (
|
200
|
+
self._connection_pool.acquire() as connection,
|
201
|
+
connection.transaction(),
|
202
|
+
):
|
177
203
|
return await connection.fetch(q)
|
178
204
|
|
179
205
|
if not self._connection_pool:
|
@@ -234,25 +260,22 @@ class UtaDatabase:
|
|
234
260
|
await self.execute_query(create_index)
|
235
261
|
|
236
262
|
@staticmethod
|
237
|
-
def _transform_list(li:
|
263
|
+
def _transform_list(li: list) -> list[list[Any]]:
|
238
264
|
"""Transform list to only contain field values
|
239
265
|
|
240
266
|
:param li: List of asyncpg.Record objects
|
241
267
|
:return: List of list of objects
|
242
268
|
"""
|
243
|
-
|
244
|
-
for item in li:
|
245
|
-
results.append(list(item))
|
246
|
-
return results
|
269
|
+
return [list(i) for i in li]
|
247
270
|
|
248
271
|
async def get_genes_and_alt_acs(
|
249
272
|
self,
|
250
273
|
pos: int,
|
251
|
-
strand:
|
252
|
-
chromosome:
|
253
|
-
alt_ac:
|
254
|
-
gene:
|
255
|
-
) ->
|
274
|
+
strand: Strand | None = None,
|
275
|
+
chromosome: int | None = None,
|
276
|
+
alt_ac: str | None = None,
|
277
|
+
gene: str | None = None,
|
278
|
+
) -> tuple[dict | None, str | None]:
|
256
279
|
"""Return genes and genomic accessions for a position on a chromosome or alt_ac
|
257
280
|
|
258
281
|
:param pos: Genomic position
|
@@ -309,8 +332,8 @@ class UtaDatabase:
|
|
309
332
|
return {"genes": genes, "alt_acs": alt_acs}, None
|
310
333
|
|
311
334
|
async def get_tx_exons(
|
312
|
-
self, tx_ac: str, alt_ac:
|
313
|
-
) ->
|
335
|
+
self, tx_ac: str, alt_ac: str | None = None
|
336
|
+
) -> tuple[list[tuple[int, int]] | None, str | None]:
|
314
337
|
"""Get list of transcript exons start/end coordinates.
|
315
338
|
|
316
339
|
:param tx_ac: Transcript accession
|
@@ -352,7 +375,7 @@ class UtaDatabase:
|
|
352
375
|
self,
|
353
376
|
tx_ac: str,
|
354
377
|
alt_ac: str,
|
355
|
-
) ->
|
378
|
+
) -> tuple[tuple[int, int, int, int, int] | None, str | None]:
|
356
379
|
"""Get exon number, transcript coordinates, and genomic coordinates
|
357
380
|
|
358
381
|
:param tx_ac: Transcript accession
|
@@ -379,8 +402,8 @@ class UtaDatabase:
|
|
379
402
|
return tx_exons_genomic_coords, None
|
380
403
|
|
381
404
|
async def get_alt_ac_start_or_end(
|
382
|
-
self, tx_ac: str, tx_exon_start: int, tx_exon_end: int, gene:
|
383
|
-
) ->
|
405
|
+
self, tx_ac: str, tx_exon_start: int, tx_exon_end: int, gene: str | None
|
406
|
+
) -> tuple[tuple[str, str, int, int, int] | None, str | None]:
|
384
407
|
"""Get genomic data for related transcript exon start or end.
|
385
408
|
|
386
409
|
:param tx_ac: Transcript accession
|
@@ -420,7 +443,7 @@ class UtaDatabase:
|
|
420
443
|
result = result[0]
|
421
444
|
return (result[0], result[1], result[2], result[3], result[4]), None
|
422
445
|
|
423
|
-
async def get_cds_start_end(self, tx_ac: str) ->
|
446
|
+
async def get_cds_start_end(self, tx_ac: str) -> tuple[int, int] | None:
|
424
447
|
"""Get coding start and end site
|
425
448
|
|
426
449
|
:param tx_ac: Transcript accession
|
@@ -444,7 +467,7 @@ class UtaDatabase:
|
|
444
467
|
)
|
445
468
|
return None
|
446
469
|
|
447
|
-
async def get_newest_assembly_ac(self, ac: str) ->
|
470
|
+
async def get_newest_assembly_ac(self, ac: str) -> list[str]:
|
448
471
|
"""Find accession associated to latest genomic assembly
|
449
472
|
|
450
473
|
:param ac: Accession
|
@@ -489,7 +512,7 @@ class UtaDatabase:
|
|
489
512
|
result = await self.execute_query(query)
|
490
513
|
return result[0][0]
|
491
514
|
|
492
|
-
async def get_ac_descr(self, ac: str) ->
|
515
|
+
async def get_ac_descr(self, ac: str) -> str | None:
|
493
516
|
"""Return accession description. This is typically available only for accessions
|
494
517
|
from older (pre-GRCh38) builds.
|
495
518
|
|
@@ -524,10 +547,10 @@ class UtaDatabase:
|
|
524
547
|
tx_ac: str,
|
525
548
|
start_pos: int,
|
526
549
|
end_pos: int,
|
527
|
-
alt_ac:
|
550
|
+
alt_ac: str | None = None,
|
528
551
|
use_tx_pos: bool = True,
|
529
552
|
like_tx_ac: bool = False,
|
530
|
-
) ->
|
553
|
+
) -> list:
|
531
554
|
"""Return queried data from tx_exon_aln_v table.
|
532
555
|
|
533
556
|
:param tx_ac: accession on c. coordinate
|
@@ -592,13 +615,10 @@ class UtaDatabase:
|
|
592
615
|
temp_ac,
|
593
616
|
alt_ac,
|
594
617
|
)
|
595
|
-
|
596
|
-
for r in result:
|
597
|
-
results.append(list(r))
|
598
|
-
return results
|
618
|
+
return [list(r) for r in result]
|
599
619
|
|
600
620
|
@staticmethod
|
601
|
-
def data_from_result(result:
|
621
|
+
def data_from_result(result: list) -> dict | None:
|
602
622
|
"""Return data found from result.
|
603
623
|
|
604
624
|
:param result: Data from tx_exon_aln_v table
|
@@ -631,8 +651,8 @@ class UtaDatabase:
|
|
631
651
|
}
|
632
652
|
|
633
653
|
async def get_mane_c_genomic_data(
|
634
|
-
self, ac: str, alt_ac:
|
635
|
-
) ->
|
654
|
+
self, ac: str, alt_ac: str | None, start_pos: int, end_pos: int
|
655
|
+
) -> dict | None:
|
636
656
|
"""Get MANE transcript and genomic data. Used when going from g. to MANE c.
|
637
657
|
representation.
|
638
658
|
|
@@ -697,13 +717,12 @@ class UtaDatabase:
|
|
697
717
|
async def get_genomic_tx_data(
|
698
718
|
self,
|
699
719
|
tx_ac: str,
|
700
|
-
pos:
|
701
|
-
annotation_layer:
|
702
|
-
|
703
|
-
|
704
|
-
alt_ac: Optional[str] = None,
|
720
|
+
pos: tuple[int, int],
|
721
|
+
annotation_layer: Literal[AnnotationLayer.CDNA]
|
722
|
+
| Literal[AnnotationLayer.GENOMIC] = AnnotationLayer.CDNA,
|
723
|
+
alt_ac: str | None = None,
|
705
724
|
target_genome_assembly: Assembly = Assembly.GRCH38,
|
706
|
-
) ->
|
725
|
+
) -> dict | None:
|
707
726
|
"""Get transcript mapping to genomic data.
|
708
727
|
|
709
728
|
:param tx_ac: Accession on c. coordinate
|
@@ -760,7 +779,7 @@ class UtaDatabase:
|
|
760
779
|
|
761
780
|
return data
|
762
781
|
|
763
|
-
async def get_ac_from_gene(self, gene: str) ->
|
782
|
+
async def get_ac_from_gene(self, gene: str) -> list[str]:
|
764
783
|
"""Return genomic accession(s) associated to a gene.
|
765
784
|
|
766
785
|
:param gene: Gene symbol
|
@@ -784,14 +803,16 @@ class UtaDatabase:
|
|
784
803
|
|
785
804
|
async def get_gene_from_ac(
|
786
805
|
self, ac: str, start_pos: int, end_pos: int
|
787
|
-
) ->
|
806
|
+
) -> list[str] | None:
|
788
807
|
"""Get gene(s) within the provided coordinate range
|
789
808
|
|
790
809
|
>>> import asyncio
|
791
810
|
>>> from cool_seq_tool.sources import UtaDatabase
|
792
811
|
>>> async def get_gene():
|
793
812
|
... uta_db = await UtaDatabase.create()
|
794
|
-
... result = await uta_db.get_gene_from_ac(
|
813
|
+
... result = await uta_db.get_gene_from_ac(
|
814
|
+
... "NC_000017.11", 43044296, 43045802
|
815
|
+
... )
|
795
816
|
... return result
|
796
817
|
>>> asyncio.run(get_gene())
|
797
818
|
['BRCA1']
|
@@ -828,11 +849,11 @@ class UtaDatabase:
|
|
828
849
|
|
829
850
|
async def get_transcripts(
|
830
851
|
self,
|
831
|
-
start_pos:
|
832
|
-
end_pos:
|
833
|
-
gene:
|
852
|
+
start_pos: int | None = None,
|
853
|
+
end_pos: int | None = None,
|
854
|
+
gene: str | None = None,
|
834
855
|
use_tx_pos: bool = True,
|
835
|
-
alt_ac:
|
856
|
+
alt_ac: str | None = None,
|
836
857
|
) -> pl.DataFrame:
|
837
858
|
"""Get transcripts for a given ``gene`` or ``alt_ac`` related to optional positions.
|
838
859
|
|
@@ -906,7 +927,7 @@ class UtaDatabase:
|
|
906
927
|
results_df = results_df.unique()
|
907
928
|
return results_df
|
908
929
|
|
909
|
-
async def get_chr_assembly(self, ac: str) ->
|
930
|
+
async def get_chr_assembly(self, ac: str) -> tuple[str, str] | None:
|
910
931
|
"""Get chromosome and assembly for NC accession if not in GRCh38.
|
911
932
|
|
912
933
|
:param ac: NC accession
|
@@ -929,7 +950,7 @@ class UtaDatabase:
|
|
929
950
|
|
930
951
|
return chromosome, assembly
|
931
952
|
|
932
|
-
async def liftover_to_38(self, genomic_tx_data:
|
953
|
+
async def liftover_to_38(self, genomic_tx_data: dict) -> None:
|
933
954
|
"""Liftover genomic_tx_data to hg38 assembly.
|
934
955
|
|
935
956
|
:param genomic_tx_data: Dictionary containing gene, nc_accession, alt_pos, and
|
@@ -985,7 +1006,7 @@ class UtaDatabase:
|
|
985
1006
|
|
986
1007
|
def get_liftover(
|
987
1008
|
self, chromosome: str, pos: int, liftover_to_assembly: Assembly
|
988
|
-
) ->
|
1009
|
+
) -> tuple[str, int] | None:
|
989
1010
|
"""Get new genome assembly data for a position on a chromosome.
|
990
1011
|
|
991
1012
|
:param chromosome: The chromosome number. Must be prefixed with ``chr``
|
@@ -1012,7 +1033,7 @@ class UtaDatabase:
|
|
1012
1033
|
|
1013
1034
|
def _set_liftover(
|
1014
1035
|
self,
|
1015
|
-
genomic_tx_data:
|
1036
|
+
genomic_tx_data: dict,
|
1016
1037
|
key: str,
|
1017
1038
|
chromosome: str,
|
1018
1039
|
liftover_to_assembly: Assembly,
|
@@ -1049,7 +1070,7 @@ class UtaDatabase:
|
|
1049
1070
|
|
1050
1071
|
genomic_tx_data[key] = liftover_start_i[1], liftover_end_i[1]
|
1051
1072
|
|
1052
|
-
async def p_to_c_ac(self, p_ac: str) ->
|
1073
|
+
async def p_to_c_ac(self, p_ac: str) -> list[str]:
|
1053
1074
|
"""Return cDNA reference sequence accession from protein reference sequence
|
1054
1075
|
accession (i.e. ``p.`` to ``c.`` in HGVS syntax)
|
1055
1076
|
|
@@ -1079,7 +1100,7 @@ class UtaDatabase:
|
|
1079
1100
|
|
1080
1101
|
async def get_transcripts_from_genomic_pos(
|
1081
1102
|
self, alt_ac: str, g_pos: int
|
1082
|
-
) ->
|
1103
|
+
) -> list[str]:
|
1083
1104
|
"""Get transcripts associated to a genomic ac and position.
|
1084
1105
|
|
1085
1106
|
:param alt_ac: Genomic accession
|
@@ -1145,13 +1166,13 @@ class ParseResult(UrlLibParseResult):
|
|
1145
1166
|
return super(ParseResult, cls).__new__(cls, *pr) # noqa: UP008
|
1146
1167
|
|
1147
1168
|
@property
|
1148
|
-
def database(self) ->
|
1169
|
+
def database(self) -> str | None:
|
1149
1170
|
"""Create database property."""
|
1150
1171
|
path_elems = self.path.split("/")
|
1151
1172
|
return path_elems[1] if len(path_elems) > 1 else None
|
1152
1173
|
|
1153
1174
|
@property
|
1154
|
-
def schema(self) ->
|
1175
|
+
def schema(self) -> str | None:
|
1155
1176
|
"""Create schema property."""
|
1156
1177
|
path_elems = self.path.split("/")
|
1157
1178
|
return path_elems[2] if len(path_elems) > 2 else None
|
cool_seq_tool/utils.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
"""Provide a small set of general helper functions."""
|
2
|
+
|
2
3
|
import datetime
|
3
4
|
import logging
|
4
|
-
from typing import Tuple
|
5
5
|
|
6
6
|
from cool_seq_tool.schemas import ResidueMode, ServiceMeta
|
7
7
|
from cool_seq_tool.version import __version__
|
@@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)
|
|
11
11
|
|
12
12
|
def get_inter_residue_pos(
|
13
13
|
start_pos: int, end_pos: int, residue_mode: ResidueMode
|
14
|
-
) ->
|
14
|
+
) -> tuple[int, int]:
|
15
15
|
"""Return equivalent inter-residue position.
|
16
16
|
|
17
17
|
Generally, we prefer to work with inter-residue coordinates where possible. Our
|
cool_seq_tool/version.py
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: cool_seq_tool
|
3
|
-
Version: 0.4.
|
3
|
+
Version: 0.4.1
|
4
4
|
Summary: Common Operation on Lots of Sequences Tool
|
5
5
|
Author: Kori Kuzma, James Stevenson, Katie Stahl, Alex Wagner
|
6
6
|
License: MIT License
|
7
7
|
|
8
|
-
Copyright (c) 2021-
|
8
|
+
Copyright (c) 2021-2024 Wagner Lab
|
9
9
|
|
10
10
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
11
11
|
of this software and associated documentation files (the "Software"), to deal
|
@@ -26,7 +26,7 @@ License: MIT License
|
|
26
26
|
SOFTWARE.
|
27
27
|
|
28
28
|
Project-URL: Homepage, https://github.com/genomicmedlab/cool-seq-tool
|
29
|
-
Project-URL: Documentation, https://coolseqtool.readthedocs.io/
|
29
|
+
Project-URL: Documentation, https://coolseqtool.readthedocs.io/
|
30
30
|
Project-URL: Changelog, https://github.com/genomicmedlab/cool-seq-tool/releases
|
31
31
|
Project-URL: Source, https://github.com/genomicmedlab/cool-seq-tool
|
32
32
|
Project-URL: Bug Tracker, https://github.com/genomicmedlab/cool-seq-tool/issues
|
@@ -39,30 +39,30 @@ Classifier: Intended Audience :: Developers
|
|
39
39
|
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
40
40
|
Classifier: License :: OSI Approved :: MIT License
|
41
41
|
Classifier: Programming Language :: Python :: 3
|
42
|
-
Classifier: Programming Language :: Python :: 3.8
|
43
|
-
Classifier: Programming Language :: Python :: 3.9
|
44
42
|
Classifier: Programming Language :: Python :: 3.10
|
45
43
|
Classifier: Programming Language :: Python :: 3.11
|
46
|
-
|
44
|
+
Classifier: Programming Language :: Python :: 3.12
|
45
|
+
Requires-Python: >=3.10
|
47
46
|
Description-Content-Type: text/markdown
|
48
47
|
License-File: LICENSE
|
49
48
|
Requires-Dist: asyncpg
|
50
49
|
Requires-Dist: aiofiles
|
51
50
|
Requires-Dist: boto3
|
52
51
|
Requires-Dist: agct >=0.1.0-dev1
|
53
|
-
Requires-Dist: polars
|
52
|
+
Requires-Dist: polars ~=1.0
|
54
53
|
Requires-Dist: hgvs
|
55
54
|
Requires-Dist: biocommons.seqrepo
|
56
55
|
Requires-Dist: pydantic ==2.*
|
57
56
|
Requires-Dist: uvicorn
|
58
57
|
Requires-Dist: fastapi
|
59
58
|
Requires-Dist: ga4gh.vrs
|
59
|
+
Requires-Dist: wags-tails ~=0.1.3
|
60
60
|
Provides-Extra: dev
|
61
61
|
Requires-Dist: pre-commit ; extra == 'dev'
|
62
62
|
Requires-Dist: ipython ; extra == 'dev'
|
63
63
|
Requires-Dist: ipykernel ; extra == 'dev'
|
64
64
|
Requires-Dist: psycopg2-binary ; extra == 'dev'
|
65
|
-
Requires-Dist: ruff ==0.
|
65
|
+
Requires-Dist: ruff ==0.5.0 ; extra == 'dev'
|
66
66
|
Provides-Extra: docs
|
67
67
|
Requires-Dist: sphinx ==6.1.3 ; extra == 'docs'
|
68
68
|
Requires-Dist: sphinx-autodoc-typehints ==1.22.0 ; extra == 'docs'
|
@@ -81,8 +81,14 @@ Requires-Dist: mock ; extra == 'tests'
|
|
81
81
|
CoolSeqTool
|
82
82
|
</h1>
|
83
83
|
|
84
|
+
[](https://pypi.python.org/pypi/cool-seq-tool) [](https://pypi.python.org/pypi/cool-seq-tool) [](https://pypi.python.org/pypi/cool-seq-tool) [](https://github.com/genomicmedlab/cool-seq-tool/actions/checks.yaml)
|
85
|
+
|
86
|
+
---
|
87
|
+
|
84
88
|
**[Documentation](https://coolseqtool.readthedocs.io/latest/)** · [Installation](https://coolseqtool.readthedocs.io/latest/install.html) · [Usage](https://coolseqtool.readthedocs.io/latest/usage.html) · [API reference](https://coolseqtool.readthedocs.io/latest/reference/index.html)
|
85
89
|
|
90
|
+
---
|
91
|
+
|
86
92
|
## Overview
|
87
93
|
|
88
94
|
<!-- description -->
|
@@ -113,6 +119,7 @@ All CoolSeqTool resources can be initialized by way of a top-level class instanc
|
|
113
119
|
|
114
120
|
```pycon
|
115
121
|
>>> from cool_seq_tool.app import CoolSeqTool
|
122
|
+
>>> from cool_seq_tool.schemas import AnnotationLayer, ResidueMode
|
116
123
|
>>> cst = CoolSeqTool()
|
117
124
|
>>> result = await cst.mane_transcript.get_mane_transcript(
|
118
125
|
... "NP_004324.2",
|
@@ -0,0 +1,29 @@
|
|
1
|
+
cool_seq_tool/__init__.py,sha256=BTfkS0bkMtxBL4yGHc4Z7ubmNhdhY2WALfadnk8N1lw,280
|
2
|
+
cool_seq_tool/api.py,sha256=AbCmdUVH8ltwqH8k7DiVsHpujMzb6c5pyAKY12iIC0U,1210
|
3
|
+
cool_seq_tool/app.py,sha256=5dBmzTf5SeIF90y_ZyI0K6AMSKgchC33eW_ABN6D8_s,4790
|
4
|
+
cool_seq_tool/schemas.py,sha256=8xGrP0rAcKLXtZYEe_DJcNp4zapjhN0StRq8uCjoobE,16720
|
5
|
+
cool_seq_tool/utils.py,sha256=lckkyFKxMAqG79SYO3p28q6BWgEjlQP7CumE2TDP1zc,1601
|
6
|
+
cool_seq_tool/version.py,sha256=hs3N9Wl67casrrQa2sGIAcpcaUySVk4oLE7JffoQuCI,53
|
7
|
+
cool_seq_tool/handlers/__init__.py,sha256=KalQ46vX1MO4SJz2SlspKoIRy1n3c3Vp1t4Y2pIfqow,78
|
8
|
+
cool_seq_tool/handlers/seqrepo_access.py,sha256=JB3cg7YiV2JKa7ImJXz4WtP9XWShk9qYvhCCrZnBQ6M,8983
|
9
|
+
cool_seq_tool/mappers/__init__.py,sha256=SMSf6sPcu7mdQNuJ4Cj1mbOwFUPuMdFSf0noY4XvTxE,262
|
10
|
+
cool_seq_tool/mappers/alignment.py,sha256=6Vk4XEar54ivuH8N7oBqa9gUa8E5GjWCI9hC1HCkM18,9552
|
11
|
+
cool_seq_tool/mappers/exon_genomic_coords.py,sha256=tOmo6kFGcFIRmLBQwSsIZUSiratiyACf946YKV_IU78,38544
|
12
|
+
cool_seq_tool/mappers/mane_transcript.py,sha256=RrVRUS4IqxxX-HyamNLqpQ_WVWABgiLqwmmIh92uny8,49264
|
13
|
+
cool_seq_tool/resources/__init__.py,sha256=VwUC8YaucTS6SmRirToulZTF6CuvuLQRSxFfSfAovCc,77
|
14
|
+
cool_seq_tool/resources/data_files.py,sha256=3lhu28tzlSoTs4vHZNu-hhoAWRrPGuZj_oIjqk2sYQM,3837
|
15
|
+
cool_seq_tool/resources/status.py,sha256=ENsLiwSxzJOLOsY5IKDM805UWbQAOV3w9s7Rv_FLAUs,5761
|
16
|
+
cool_seq_tool/resources/transcript_mapping.tsv,sha256=AO3luYQAbFiCoRgiiPXotakb5pAwx1jDCeXpvGdIuac,24138769
|
17
|
+
cool_seq_tool/routers/__init__.py,sha256=7SqhLv6_mDPpK1Q0L9aykmjhCmsymFqgbSWZH8LuCW0,437
|
18
|
+
cool_seq_tool/routers/default.py,sha256=zqeQmHmfGUvV32xLbN-fUfYnK_UI1gpqIL8Eu5Y8KzY,3928
|
19
|
+
cool_seq_tool/routers/mane.py,sha256=boZKP5PH0BAcqEeTBBr9Z3EMY4lhvLLX-pJxUqjBZQ0,3508
|
20
|
+
cool_seq_tool/routers/mappings.py,sha256=UJaip0QvRfK3Lk3eVuwofUwg2XJqMV5OVY9OLcpnWS4,6061
|
21
|
+
cool_seq_tool/sources/__init__.py,sha256=51QiymeptF7AeVGgV-tW_9f4pIUr0xtYbyzpvHOCneM,304
|
22
|
+
cool_seq_tool/sources/mane_transcript_mappings.py,sha256=9Rd_tRCrTk9i9Urp-pMMttq4cCbIJaEJ0n8rM9y9-7I,4077
|
23
|
+
cool_seq_tool/sources/transcript_mappings.py,sha256=903RKTMBO2rbKh6iTQ1BEWnY4C7saBFMPw2_4ATuudg,10054
|
24
|
+
cool_seq_tool/sources/uta_database.py,sha256=GJHhYbH130YJo9FIRroR8eavlbaziMwI0JVNP8IPGPM,45636
|
25
|
+
cool_seq_tool-0.4.1.dist-info/LICENSE,sha256=IpqC9A-tZW7XXXvCS8c4AVINqkmpxiVA-34Qe3CZSjo,1072
|
26
|
+
cool_seq_tool-0.4.1.dist-info/METADATA,sha256=CnZwl-rVLfY6kcVkQKYsYziT19q48qHRkYFQ96-OCx0,6262
|
27
|
+
cool_seq_tool-0.4.1.dist-info/WHEEL,sha256=y4mX-SOX4fYIkonsAGA5N0Oy-8_gI4FXw5HNI1xqvWg,91
|
28
|
+
cool_seq_tool-0.4.1.dist-info/top_level.txt,sha256=cGuxdN6p3y16jQf6hCwWhE4OptwUeZPm_PNJlPb3b0k,14
|
29
|
+
cool_seq_tool-0.4.1.dist-info/RECORD,,
|
cool_seq_tool/data/__init__.py
DELETED
@@ -1,89 +0,0 @@
|
|
1
|
-
"""Handle acquisition of external data."""
|
2
|
-
import datetime
|
3
|
-
import gzip
|
4
|
-
import logging
|
5
|
-
import shutil
|
6
|
-
from ftplib import FTP
|
7
|
-
from pathlib import Path
|
8
|
-
|
9
|
-
from dateutil import parser
|
10
|
-
|
11
|
-
from cool_seq_tool import APP_ROOT
|
12
|
-
|
13
|
-
logger = logging.getLogger("cool_seq_tool")
|
14
|
-
|
15
|
-
|
16
|
-
class DataDownload:
|
17
|
-
"""Manage downloadable data files. Responsible for checking if files are available
|
18
|
-
under expected locations, and fetching them if not.
|
19
|
-
|
20
|
-
Relevant methods are called automatically by data classes; users should not have
|
21
|
-
to interact with this class under normal circumstances.
|
22
|
-
"""
|
23
|
-
|
24
|
-
def __init__(self) -> None:
|
25
|
-
"""Initialize downloadable data locations."""
|
26
|
-
self._data_dir = APP_ROOT / "data"
|
27
|
-
|
28
|
-
def get_mane_summary(self) -> Path:
|
29
|
-
"""Identify latest MANE summary data. If unavailable locally, download from
|
30
|
-
`NCBI FTP server <https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/current/>`_.
|
31
|
-
|
32
|
-
:return: path to MANE summary file
|
33
|
-
"""
|
34
|
-
with FTP("ftp.ncbi.nlm.nih.gov") as ftp:
|
35
|
-
ftp.login()
|
36
|
-
ftp.cwd("/refseq/MANE/MANE_human/current")
|
37
|
-
files = ftp.nlst()
|
38
|
-
mane_summary_file = [f for f in files if f.endswith(".summary.txt.gz")]
|
39
|
-
if not mane_summary_file:
|
40
|
-
msg = "Unable to download MANE summary data"
|
41
|
-
raise Exception(msg)
|
42
|
-
mane_summary_file = mane_summary_file[0]
|
43
|
-
self._mane_summary_path = self._data_dir / mane_summary_file[:-3]
|
44
|
-
mane_data_path = self._data_dir / mane_summary_file
|
45
|
-
if not self._mane_summary_path.exists():
|
46
|
-
logger.info("Downloading MANE summary file from NCBI.")
|
47
|
-
with mane_data_path.open("wb") as fp:
|
48
|
-
ftp.retrbinary(f"RETR {mane_summary_file}", fp.write)
|
49
|
-
with gzip.open(
|
50
|
-
mane_data_path, "rb"
|
51
|
-
) as f_in, self._mane_summary_path.open("wb") as f_out:
|
52
|
-
shutil.copyfileobj(f_in, f_out)
|
53
|
-
mane_data_path.unlink()
|
54
|
-
logger.info("MANE summary file download complete.")
|
55
|
-
return self._mane_summary_path
|
56
|
-
|
57
|
-
def get_lrg_refseq_gene_data(self) -> Path:
|
58
|
-
"""Identify latest LRG RefSeq Gene file. If unavailable locally, download from
|
59
|
-
`NCBI FTP server <https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/>`_.
|
60
|
-
|
61
|
-
:return: path to acquired LRG RefSeq Gene data file
|
62
|
-
"""
|
63
|
-
with FTP("ftp.ncbi.nlm.nih.gov") as ftp:
|
64
|
-
ftp.login()
|
65
|
-
lrg_refseqgene_file = "LRG_RefSeqGene"
|
66
|
-
ftp_dir_path = "/refseq/H_sapiens/RefSeqGene/"
|
67
|
-
ftp_file_path = f"{ftp_dir_path}{lrg_refseqgene_file}"
|
68
|
-
timestamp = ftp.voidcmd(f"MDTM {ftp_file_path}")[4:].strip()
|
69
|
-
date = str(parser.parse(timestamp)).split()[0]
|
70
|
-
version = (
|
71
|
-
datetime.datetime.strptime(date, "%Y-%m-%d")
|
72
|
-
.astimezone(tz=datetime.timezone.utc)
|
73
|
-
.strftime("%Y%m%d")
|
74
|
-
)
|
75
|
-
fn_versioned = f"{lrg_refseqgene_file}_{version}"
|
76
|
-
lrg_refseqgene_path = self._data_dir / lrg_refseqgene_file
|
77
|
-
self._lrg_refseqgene_path = self._data_dir / fn_versioned
|
78
|
-
if not self._lrg_refseqgene_path.exists():
|
79
|
-
logger.info("Downloading LRG RefSeq data from NCBI.")
|
80
|
-
ftp.cwd(ftp_dir_path)
|
81
|
-
with lrg_refseqgene_path.open("wb") as fp:
|
82
|
-
ftp.retrbinary(f"RETR {lrg_refseqgene_file}", fp.write)
|
83
|
-
with lrg_refseqgene_path.open(
|
84
|
-
"rb"
|
85
|
-
) as f_in, self._lrg_refseqgene_path.open("wb") as f_out:
|
86
|
-
shutil.copyfileobj(f_in, f_out)
|
87
|
-
lrg_refseqgene_path.unlink()
|
88
|
-
logger.info("LRG RefSeq data download complete.")
|
89
|
-
return self._lrg_refseqgene_path
|
cool_seq_tool/paths.py
DELETED
@@ -1,28 +0,0 @@
|
|
1
|
-
"""Provide paths to shared files, and trigger data acquisition if unavailable."""
|
2
|
-
from os import environ
|
3
|
-
from pathlib import Path
|
4
|
-
|
5
|
-
from cool_seq_tool.data.data_downloads import DataDownload
|
6
|
-
|
7
|
-
APP_ROOT = Path(__file__).resolve().parents[0]
|
8
|
-
|
9
|
-
TRANSCRIPT_MAPPINGS_PATH = Path(
|
10
|
-
environ.get("TRANSCRIPT_MAPPINGS_PATH", f"{APP_ROOT}/data/transcript_mapping.tsv")
|
11
|
-
)
|
12
|
-
|
13
|
-
d = DataDownload()
|
14
|
-
|
15
|
-
provided_mane_summary_path = environ.get("MANE_SUMMARY_PATH", "")
|
16
|
-
if provided_mane_summary_path:
|
17
|
-
MANE_SUMMARY_PATH = Path(provided_mane_summary_path)
|
18
|
-
else:
|
19
|
-
MANE_SUMMARY_PATH = d.get_mane_summary()
|
20
|
-
|
21
|
-
provided_lrg_refseq_path = environ.get("LRG_REFSEQGENE_PATH", "")
|
22
|
-
if provided_lrg_refseq_path:
|
23
|
-
LRG_REFSEQGENE_PATH = Path(provided_lrg_refseq_path)
|
24
|
-
else:
|
25
|
-
LRG_REFSEQGENE_PATH = d.get_lrg_refseq_gene_data()
|
26
|
-
|
27
|
-
|
28
|
-
SEQREPO_ROOT_DIR = environ.get("SEQREPO_ROOT_DIR", "/usr/local/share/seqrepo/latest")
|