cool-seq-tool 0.4.0.dev2__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cool_seq_tool/__init__.py +1 -3
- cool_seq_tool/api.py +1 -2
- cool_seq_tool/app.py +42 -24
- cool_seq_tool/handlers/__init__.py +1 -0
- cool_seq_tool/handlers/seqrepo_access.py +13 -15
- cool_seq_tool/mappers/__init__.py +1 -0
- cool_seq_tool/mappers/alignment.py +5 -6
- cool_seq_tool/mappers/exon_genomic_coords.py +232 -68
- cool_seq_tool/mappers/mane_transcript.py +84 -86
- cool_seq_tool/resources/__init__.py +1 -0
- cool_seq_tool/resources/data_files.py +93 -0
- cool_seq_tool/resources/status.py +151 -0
- cool_seq_tool/routers/__init__.py +1 -0
- cool_seq_tool/routers/default.py +1 -0
- cool_seq_tool/routers/mane.py +4 -4
- cool_seq_tool/routers/mappings.py +2 -2
- cool_seq_tool/schemas.py +83 -37
- cool_seq_tool/sources/__init__.py +1 -0
- cool_seq_tool/sources/mane_transcript_mappings.py +14 -7
- cool_seq_tool/sources/transcript_mappings.py +41 -32
- cool_seq_tool/sources/uta_database.py +120 -69
- cool_seq_tool/utils.py +2 -2
- cool_seq_tool/version.py +2 -1
- {cool_seq_tool-0.4.0.dev2.dist-info → cool_seq_tool-0.4.1.dist-info}/LICENSE +1 -1
- {cool_seq_tool-0.4.0.dev2.dist-info → cool_seq_tool-0.4.1.dist-info}/METADATA +15 -8
- cool_seq_tool-0.4.1.dist-info/RECORD +29 -0
- {cool_seq_tool-0.4.0.dev2.dist-info → cool_seq_tool-0.4.1.dist-info}/WHEEL +1 -1
- cool_seq_tool/data/__init__.py +0 -2
- cool_seq_tool/data/data_downloads.py +0 -89
- cool_seq_tool/paths.py +0 -28
- cool_seq_tool-0.4.0.dev2.dist-info/RECORD +0 -29
- /cool_seq_tool/{data → resources}/transcript_mapping.tsv +0 -0
- {cool_seq_tool-0.4.0.dev2.dist-info → cool_seq_tool-0.4.1.dist-info}/top_level.txt +0 -0
@@ -11,10 +11,11 @@ Steps:
|
|
11
11
|
In addition to a mapper utility class, this module also defines several vocabulary
|
12
12
|
constraints and data models for coordinate representation.
|
13
13
|
"""
|
14
|
+
|
14
15
|
import logging
|
15
16
|
import math
|
16
17
|
from enum import Enum
|
17
|
-
from typing import
|
18
|
+
from typing import Literal
|
18
19
|
|
19
20
|
import polars as pl
|
20
21
|
from pydantic import BaseModel
|
@@ -50,10 +51,10 @@ class EndAnnotationLayer(str, Enum):
|
|
50
51
|
class DataRepresentation(BaseModel):
|
51
52
|
"""Define object model for final output representation"""
|
52
53
|
|
53
|
-
gene:
|
54
|
+
gene: str | None = None
|
54
55
|
refseq: str
|
55
|
-
ensembl:
|
56
|
-
pos:
|
56
|
+
ensembl: str | None = None
|
57
|
+
pos: tuple[int, int]
|
57
58
|
strand: Strand
|
58
59
|
status: TranscriptPriority
|
59
60
|
|
@@ -63,14 +64,14 @@ class CdnaRepresentation(DataRepresentation):
|
|
63
64
|
|
64
65
|
coding_start_site: int
|
65
66
|
coding_end_site: int
|
66
|
-
alt_ac:
|
67
|
+
alt_ac: str | None = None
|
67
68
|
|
68
69
|
|
69
70
|
class GenomicRepresentation(BaseModel):
|
70
71
|
"""Define object model for genomic representation"""
|
71
72
|
|
72
73
|
refseq: str
|
73
|
-
pos:
|
74
|
+
pos: tuple[int, int]
|
74
75
|
status: TranscriptPriority
|
75
76
|
alt_ac: str
|
76
77
|
|
@@ -105,7 +106,7 @@ class ManeTranscript:
|
|
105
106
|
|
106
107
|
>>> import asyncio
|
107
108
|
>>> result = asyncio.run(mane_mapper.g_to_grch38("NC_000001.11", 100, 200))
|
108
|
-
>>> result[
|
109
|
+
>>> result["ac"]
|
109
110
|
'NC_000001.11'
|
110
111
|
|
111
112
|
See the :ref:`Usage section <async_note>` for more information.
|
@@ -135,7 +136,7 @@ class ManeTranscript:
|
|
135
136
|
return pos_mod_3
|
136
137
|
|
137
138
|
@staticmethod
|
138
|
-
def _p_to_c_pos(start: int, end: int) ->
|
139
|
+
def _p_to_c_pos(start: int, end: int) -> tuple[int, int]:
|
139
140
|
"""Return cDNA position given a protein position.
|
140
141
|
|
141
142
|
:param start: Start protein position. Inter-residue coordinates
|
@@ -148,7 +149,7 @@ class ManeTranscript:
|
|
148
149
|
|
149
150
|
async def _p_to_c(
|
150
151
|
self, ac: str, start_pos: int, end_pos: int
|
151
|
-
) ->
|
152
|
+
) -> tuple[str, tuple[int, int]] | None:
|
152
153
|
"""Convert protein (p.) annotation to cDNA (c.) annotation.
|
153
154
|
|
154
155
|
:param ac: Protein accession
|
@@ -176,7 +177,7 @@ class ManeTranscript:
|
|
176
177
|
pos = self._p_to_c_pos(start_pos, end_pos)
|
177
178
|
return ac, pos
|
178
179
|
|
179
|
-
async def _c_to_g(self, ac: str, pos:
|
180
|
+
async def _c_to_g(self, ac: str, pos: tuple[int, int]) -> dict | None:
|
180
181
|
"""Get g. annotation from c. annotation.
|
181
182
|
|
182
183
|
:param ac: cDNA accession
|
@@ -217,13 +218,12 @@ class ManeTranscript:
|
|
217
218
|
async def _get_and_validate_genomic_tx_data(
|
218
219
|
self,
|
219
220
|
tx_ac: str,
|
220
|
-
pos:
|
221
|
-
annotation_layer:
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
) -> Optional[Dict]:
|
221
|
+
pos: tuple[int, int],
|
222
|
+
annotation_layer: Literal[AnnotationLayer.CDNA]
|
223
|
+
| Literal[AnnotationLayer.GENOMIC] = AnnotationLayer.CDNA,
|
224
|
+
coding_start_site: int | None = None,
|
225
|
+
alt_ac: str | None = None,
|
226
|
+
) -> dict | None:
|
227
227
|
"""Get and validate genomic_tx_data
|
228
228
|
|
229
229
|
:param tx_ac: Accession on c. coordinate
|
@@ -266,14 +266,14 @@ class ManeTranscript:
|
|
266
266
|
|
267
267
|
@staticmethod
|
268
268
|
def _get_c_data(
|
269
|
-
cds_start_end:
|
270
|
-
c_pos_change:
|
269
|
+
cds_start_end: tuple[int, int],
|
270
|
+
c_pos_change: tuple[int, int],
|
271
271
|
strand: Strand,
|
272
272
|
status: TranscriptPriority,
|
273
273
|
refseq_c_ac: str,
|
274
|
-
gene:
|
275
|
-
ensembl_c_ac:
|
276
|
-
alt_ac:
|
274
|
+
gene: str | None = None,
|
275
|
+
ensembl_c_ac: str | None = None,
|
276
|
+
alt_ac: str | None = None,
|
277
277
|
) -> CdnaRepresentation:
|
278
278
|
"""Return transcript data on c. coordinate.
|
279
279
|
|
@@ -311,7 +311,7 @@ class ManeTranscript:
|
|
311
311
|
alt_ac=alt_ac,
|
312
312
|
)
|
313
313
|
|
314
|
-
def _c_to_p_pos(self, c_pos:
|
314
|
+
def _c_to_p_pos(self, c_pos: tuple[int, int]) -> tuple[int, int]:
|
315
315
|
"""Get protein position from cdna position
|
316
316
|
|
317
317
|
:param c_pos: cdna position. inter-residue coordinates
|
@@ -325,7 +325,7 @@ class ManeTranscript:
|
|
325
325
|
return start, end
|
326
326
|
|
327
327
|
def _get_mane_p(
|
328
|
-
self, mane_data:
|
328
|
+
self, mane_data: dict, mane_c_pos_range: tuple[int, int]
|
329
329
|
) -> DataRepresentation:
|
330
330
|
"""Translate MANE Transcript c. annotation to p. annotation
|
331
331
|
|
@@ -349,13 +349,13 @@ class ManeTranscript:
|
|
349
349
|
|
350
350
|
async def _g_to_c(
|
351
351
|
self,
|
352
|
-
g:
|
352
|
+
g: dict,
|
353
353
|
refseq_c_ac: str,
|
354
354
|
status: TranscriptPriority,
|
355
|
-
ensembl_c_ac:
|
356
|
-
alt_ac:
|
355
|
+
ensembl_c_ac: str | None = None,
|
356
|
+
alt_ac: str | None = None,
|
357
357
|
found_result: bool = False,
|
358
|
-
) ->
|
358
|
+
) -> CdnaRepresentation | None:
|
359
359
|
"""Get transcript c. annotation data from g. annotation.
|
360
360
|
|
361
361
|
:param g: Genomic data
|
@@ -459,9 +459,9 @@ class ManeTranscript:
|
|
459
459
|
coding_start_site: int,
|
460
460
|
start_pos: int,
|
461
461
|
end_pos: int,
|
462
|
-
mane_transcript:
|
463
|
-
|
464
|
-
|
462
|
+
mane_transcript: DataRepresentation
|
463
|
+
| CdnaRepresentation
|
464
|
+
| GenomicRepresentation,
|
465
465
|
expected_ref: str,
|
466
466
|
anno: AnnotationLayer,
|
467
467
|
residue_mode: ResidueMode,
|
@@ -522,7 +522,7 @@ class ManeTranscript:
|
|
522
522
|
return True
|
523
523
|
|
524
524
|
def _validate_index(
|
525
|
-
self, ac: str, pos:
|
525
|
+
self, ac: str, pos: tuple[int, int], coding_start_site: int
|
526
526
|
) -> bool:
|
527
527
|
"""Validate that positions actually exist on accession
|
528
528
|
|
@@ -533,13 +533,13 @@ class ManeTranscript:
|
|
533
533
|
"""
|
534
534
|
start_pos = pos[0] + coding_start_site
|
535
535
|
end_pos = pos[1] + coding_start_site
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
536
|
+
return bool(
|
537
|
+
self.seqrepo_access.get_reference_sequence(
|
538
|
+
ac, start=start_pos, end=end_pos, residue_mode=ResidueMode.INTER_RESIDUE
|
539
|
+
)[0]
|
540
|
+
)
|
541
541
|
|
542
|
-
def _get_prioritized_transcripts_from_gene(self, df: pl.DataFrame) ->
|
542
|
+
def _get_prioritized_transcripts_from_gene(self, df: pl.DataFrame) -> list:
|
543
543
|
"""Sort and filter transcripts from gene to get priority list
|
544
544
|
|
545
545
|
:param df: Data frame containing transcripts from gene
|
@@ -550,7 +550,7 @@ class ManeTranscript:
|
|
550
550
|
most recent version of a transcript associated with an assembly will be kept
|
551
551
|
"""
|
552
552
|
copy_df = df.clone()
|
553
|
-
copy_df = copy_df.drop(
|
553
|
+
copy_df = copy_df.drop("alt_ac").unique()
|
554
554
|
copy_df = copy_df.with_columns(
|
555
555
|
[
|
556
556
|
pl.col("tx_ac")
|
@@ -590,15 +590,13 @@ class ManeTranscript:
|
|
590
590
|
start_pos: int,
|
591
591
|
end_pos: int,
|
592
592
|
start_annotation_layer: AnnotationLayer,
|
593
|
-
gene:
|
594
|
-
ref:
|
593
|
+
gene: str | None = None,
|
594
|
+
ref: str | None = None,
|
595
595
|
residue_mode: ResidueMode = ResidueMode.RESIDUE,
|
596
|
-
mane_transcripts:
|
597
|
-
alt_ac:
|
598
|
-
end_annotation_layer:
|
599
|
-
) ->
|
600
|
-
Union[DataRepresentation, CdnaRepresentation, ProteinAndCdnaRepresentation]
|
601
|
-
]:
|
596
|
+
mane_transcripts: set | None = None,
|
597
|
+
alt_ac: str | None = None,
|
598
|
+
end_annotation_layer: EndAnnotationLayer | None = None,
|
599
|
+
) -> DataRepresentation | CdnaRepresentation | ProteinAndCdnaRepresentation | None:
|
602
600
|
"""Get longest compatible transcript from a gene. See the documentation for
|
603
601
|
the :ref:`transcript compatibility policy <transcript_compatibility>` for more
|
604
602
|
information.
|
@@ -613,14 +611,16 @@ class ManeTranscript:
|
|
613
611
|
... "NM_004333.6",
|
614
612
|
... "ENST00000644969.2",
|
615
613
|
... }
|
616
|
-
>>> result = asyncio.run(
|
617
|
-
...
|
618
|
-
...
|
619
|
-
...
|
620
|
-
...
|
621
|
-
...
|
622
|
-
...
|
623
|
-
...
|
614
|
+
>>> result = asyncio.run(
|
615
|
+
... mane_mapper.get_longest_compatible_transcript(
|
616
|
+
... 599,
|
617
|
+
... 599,
|
618
|
+
... gene="BRAF",
|
619
|
+
... start_annotation_layer=AnnotationLayer.PROTEIN,
|
620
|
+
... residue_mode=ResidueMode.INTER_RESIDUE,
|
621
|
+
... mane_transcripts=mane_transcripts,
|
622
|
+
... )
|
623
|
+
... )
|
624
624
|
>>> result.refseq
|
625
625
|
'NP_001365396.1'
|
626
626
|
|
@@ -645,9 +645,9 @@ class ManeTranscript:
|
|
645
645
|
"""
|
646
646
|
|
647
647
|
def _get_protein_rep(
|
648
|
-
gene:
|
648
|
+
gene: str | None,
|
649
649
|
pro_ac: str,
|
650
|
-
lcr_c_data_pos:
|
650
|
+
lcr_c_data_pos: tuple[int, int],
|
651
651
|
strand: Strand,
|
652
652
|
status: TranscriptPriority,
|
653
653
|
) -> DataRepresentation:
|
@@ -731,7 +731,7 @@ class ManeTranscript:
|
|
731
731
|
|
732
732
|
# Get prioritized transcript data for gene
|
733
733
|
# grch38 -> c
|
734
|
-
lcr_c_data:
|
734
|
+
lcr_c_data: CdnaRepresentation | None = await self._g_to_c(
|
735
735
|
g=g,
|
736
736
|
refseq_c_ac=tx_ac,
|
737
737
|
status=TranscriptPriority.LONGEST_COMPATIBLE_REMAINING,
|
@@ -859,25 +859,26 @@ class ManeTranscript:
|
|
859
859
|
start_pos: int,
|
860
860
|
end_pos: int,
|
861
861
|
start_annotation_layer: AnnotationLayer,
|
862
|
-
gene:
|
863
|
-
ref:
|
862
|
+
gene: str | None = None,
|
863
|
+
ref: str | None = None,
|
864
864
|
try_longest_compatible: bool = False,
|
865
|
-
residue_mode:
|
866
|
-
|
867
|
-
|
868
|
-
) -> Optional[Union[DataRepresentation, CdnaRepresentation]]:
|
865
|
+
residue_mode: Literal[ResidueMode.RESIDUE]
|
866
|
+
| Literal[ResidueMode.INTER_RESIDUE] = ResidueMode.RESIDUE,
|
867
|
+
) -> DataRepresentation | CdnaRepresentation | None:
|
869
868
|
"""Return MANE transcript.
|
870
869
|
|
871
870
|
>>> from cool_seq_tool.app import CoolSeqTool
|
872
871
|
>>> from cool_seq_tool.schemas import AnnotationLayer, ResidueMode
|
873
872
|
>>> import asyncio
|
874
873
|
>>> mane_mapper = CoolSeqTool().mane_transcript
|
875
|
-
>>> result = asyncio.run(
|
876
|
-
...
|
877
|
-
...
|
878
|
-
...
|
879
|
-
...
|
880
|
-
...
|
874
|
+
>>> result = asyncio.run(
|
875
|
+
... mane_mapper.get_mane_transcript(
|
876
|
+
... "NP_004324.2",
|
877
|
+
... 599,
|
878
|
+
... AnnotationLayer.PROTEIN,
|
879
|
+
... residue_mode=ResidueMode.INTER_RESIDUE,
|
880
|
+
... )
|
881
|
+
... )
|
881
882
|
>>> result.gene, result.refseq, result.status
|
882
883
|
('BRAF', 'NP_004324.2', <TranscriptPriority.MANE_SELECT: 'mane_select'>)
|
883
884
|
|
@@ -930,7 +931,7 @@ class ManeTranscript:
|
|
930
931
|
current_mane_data["RefSeq_nuc"],
|
931
932
|
current_mane_data["Ensembl_nuc"],
|
932
933
|
}
|
933
|
-
mane:
|
934
|
+
mane: CdnaRepresentation | None = await self._g_to_c(
|
934
935
|
g=g,
|
935
936
|
refseq_c_ac=current_mane_data["RefSeq_nuc"],
|
936
937
|
status=TranscriptPriority(
|
@@ -1001,9 +1002,7 @@ class ManeTranscript:
|
|
1001
1002
|
logger.warning("Annotation layer not supported: %s", start_annotation_layer)
|
1002
1003
|
return None
|
1003
1004
|
|
1004
|
-
async def g_to_grch38(
|
1005
|
-
self, ac: str, start_pos: int, end_pos: int
|
1006
|
-
) -> Optional[Dict]:
|
1005
|
+
async def g_to_grch38(self, ac: str, start_pos: int, end_pos: int) -> dict | None:
|
1007
1006
|
"""Return genomic coordinate on GRCh38 when not given gene context.
|
1008
1007
|
|
1009
1008
|
:param ac: Genomic accession
|
@@ -1055,8 +1054,8 @@ class ManeTranscript:
|
|
1055
1054
|
|
1056
1055
|
@staticmethod
|
1057
1056
|
def get_mane_c_pos_change(
|
1058
|
-
mane_tx_genomic_data:
|
1059
|
-
) ->
|
1057
|
+
mane_tx_genomic_data: dict, coding_start_site: int
|
1058
|
+
) -> tuple[int, int]:
|
1060
1059
|
"""Get mane c position change
|
1061
1060
|
|
1062
1061
|
:param mane_tx_genomic_data: MANE transcript and genomic data
|
@@ -1080,9 +1079,9 @@ class ManeTranscript:
|
|
1080
1079
|
ac: str,
|
1081
1080
|
start_pos: int,
|
1082
1081
|
end_pos: int,
|
1083
|
-
gene:
|
1082
|
+
gene: str | None = None,
|
1084
1083
|
residue_mode: ResidueMode = ResidueMode.RESIDUE,
|
1085
|
-
) ->
|
1084
|
+
) -> GenomicRepresentation | CdnaRepresentation | None:
|
1086
1085
|
"""Return MANE Transcript on the c. coordinate.
|
1087
1086
|
|
1088
1087
|
If an arg for ``gene`` is provided, lifts to GRCh38, then gets MANE cDNA
|
@@ -1091,12 +1090,11 @@ class ManeTranscript:
|
|
1091
1090
|
>>> import asyncio
|
1092
1091
|
>>> from cool_seq_tool.app import CoolSeqTool
|
1093
1092
|
>>> cst = CoolSeqTool()
|
1094
|
-
>>> result = asyncio.run(
|
1095
|
-
...
|
1096
|
-
...
|
1097
|
-
...
|
1098
|
-
...
|
1099
|
-
... ))
|
1093
|
+
>>> result = asyncio.run(
|
1094
|
+
... cst.mane_transcript.g_to_mane_c(
|
1095
|
+
... "NC_000007.13", 55259515, None, gene="EGFR"
|
1096
|
+
... )
|
1097
|
+
... )
|
1100
1098
|
>>> type(result)
|
1101
1099
|
<class 'cool_seq_tool.mappers.mane_transcript.CdnaRepresentation'>
|
1102
1100
|
>>> result.status
|
@@ -1198,10 +1196,10 @@ class ManeTranscript:
|
|
1198
1196
|
alt_ac: str,
|
1199
1197
|
start_pos: int,
|
1200
1198
|
end_pos: int,
|
1201
|
-
gene:
|
1199
|
+
gene: str | None = None,
|
1202
1200
|
residue_mode: ResidueMode = ResidueMode.RESIDUE,
|
1203
1201
|
try_longest_compatible: bool = False,
|
1204
|
-
) ->
|
1202
|
+
) -> dict | None:
|
1205
1203
|
"""Given GRCh38 genomic representation, return protein representation.
|
1206
1204
|
|
1207
1205
|
Will try MANE Select and then MANE Plus Clinical. If neither is found and
|
@@ -0,0 +1 @@
|
|
1
|
+
"""Provide tools for acquiring and managing Cool-Seq-Tool data resources."""
|
@@ -0,0 +1,93 @@
|
|
1
|
+
"""Fetch data files regarding transcript mapping and annotation."""
|
2
|
+
|
3
|
+
import logging
|
4
|
+
from enum import Enum
|
5
|
+
from importlib import resources
|
6
|
+
from os import environ
|
7
|
+
from pathlib import Path
|
8
|
+
|
9
|
+
from wags_tails import NcbiLrgRefSeqGeneData, NcbiManeSummaryData
|
10
|
+
|
11
|
+
_logger = logging.getLogger(__name__)
|
12
|
+
|
13
|
+
|
14
|
+
class DataFile(str, Enum):
    """Enumerate the file resources accepted by :py:meth:`get_data_file() <cool_seq_tool.resources.data_files.get_data_file>`.

    Members are ``str`` instances; each value is the resource's lower-case name.
    """

    TRANSCRIPT_MAPPINGS = "transcript_mappings"
    MANE_SUMMARY = "mane_summary"
    LRG_REFSEQGENE = "lrg_refseqgene"

    def lower(self) -> str:
        """Give the member's value converted to lower case.

        :return: lower-cased copy of the member's string value
        """
        lowered = str.lower(self.value)
        return lowered
|
27
|
+
|
28
|
+
|
29
|
+
def _bundled_transcript_mappings(_from_local):
    """Locate the transcript mapping TSV shipped inside this package.

    The argument is accepted only so that all fetchers share one call signature.
    """
    return resources.files(__package__) / "transcript_mapping.tsv"


def _latest_mane_summary(from_local):
    """Fetch the latest MANE summary file via ``wags-tails`` and return its path."""
    fetched = NcbiManeSummaryData(silent=True).get_latest(from_local=from_local)
    # get_latest() returns a sequence; only its first item is used here
    return fetched[0]


def _latest_lrg_refseqgene(from_local):
    """Fetch the latest LRG RefSeqGene file via ``wags-tails`` and return its path."""
    fetched = NcbiLrgRefSeqGeneData(silent=True).get_latest(from_local=from_local)
    # get_latest() returns a sequence; only its first item is used here
    return fetched[0]


# Per-resource acquisition settings: (name of the env var that may override the
# file location, fetcher called with ``from_local`` when the env var is unset)
_resource_acquisition_params = {
    DataFile.TRANSCRIPT_MAPPINGS: (
        "TRANSCRIPT_MAPPINGS_PATH",
        _bundled_transcript_mappings,
    ),
    DataFile.MANE_SUMMARY: ("MANE_SUMMARY_PATH", _latest_mane_summary),
    DataFile.LRG_REFSEQGENE: ("LRG_REFSEQGENE_PATH", _latest_lrg_refseqgene),
}
|
47
|
+
|
48
|
+
|
49
|
+
def get_data_file(resource: DataFile, from_local: bool = False) -> Path:
    """Acquire Cool-Seq-Tool file dependency.

    Each resource can be defined using an environment variable:

    * ``DataFile.TRANSCRIPT_MAPPINGS`` -> ``TRANSCRIPT_MAPPINGS_PATH``
    * ``DataFile.MANE_SUMMARY`` -> ``MANE_SUMMARY_PATH``
    * ``DataFile.LRG_REFSEQGENE`` -> ``LRG_REFSEQGENE_PATH``

    Otherwise, this function falls back on default expected locations:

    * ``transcript_mapping.tsv`` is bundled with this library.
    * LRG RefseqGene and MANE summary files are acquired from NCBI using the
      `wags-tails <https://wags-tails.readthedocs.io/stable/>`_ package if
      unavailable locally, or out of date.

    :param resource: resource to fetch
    :param from_local: if ``True``, don't check for or acquire latest version -- just
        provide most recent locally available file and raise FileNotFoundError
        otherwise. Only consulted when the resource's env var is unset or empty.
    :return: path to file. Consuming functions can assume that it exists and is a file.
    :raise FileNotFoundError: if file location configured by env var doesn't exist
    :raise ValueError: if file location configured by env var isn't a file
    """
    env_var_name, fetch_default = _resource_acquisition_params[resource]
    configured_path = environ.get(env_var_name)

    if configured_path:
        _logger.debug(
            "Acquiring %s via env var %s:%s", resource, env_var_name, configured_path
        )
        path = Path(configured_path)
        path_exists = path.exists()
        if not path_exists or not path.is_file():
            # Build the (long) message only when something is actually wrong
            loc_descr = (
                "the default file bundled with Cool-Seq-Tool"
                if resource == DataFile.TRANSCRIPT_MAPPINGS
                else "the default file pattern and possibly acquire from source via the `wags-tails` package"
            )
            resource_label = env_var_name.replace("_", " ").title()
            msg = f'No {resource_label} file exists at path {configured_path} defined under env var {env_var_name}. Either unset to use {loc_descr}, or ensure that it is available at this location. See the "Environment configuration" section under the Usage page within the documentation for more: https://coolseqtool.readthedocs.io/stable/usage.html#environment-configuration'
            if not path_exists:
                raise FileNotFoundError(msg)
            # Path exists but is something other than a regular file
            raise ValueError(msg)
    else:
        _logger.debug("Acquiring %s from default location/method.", resource)
        # use `from_local` param to optionally avoid unnecessary fetches
        path = fetch_default(from_local)

    _logger.debug("Acquired %s at %s", resource, path)
    return path
|
@@ -0,0 +1,151 @@
|
|
1
|
+
"""Enable quick status check of Cool-Seq-Tool resources."""
|
2
|
+
|
3
|
+
import logging
|
4
|
+
from collections import namedtuple
|
5
|
+
from pathlib import Path
|
6
|
+
|
7
|
+
from agct._core import ChainfileError
|
8
|
+
from asyncpg import InvalidCatalogNameError, UndefinedTableError
|
9
|
+
from biocommons.seqrepo import SeqRepo
|
10
|
+
|
11
|
+
from cool_seq_tool.handlers.seqrepo_access import SEQREPO_ROOT_DIR, SeqRepoAccess
|
12
|
+
from cool_seq_tool.resources.data_files import DataFile, get_data_file
|
13
|
+
from cool_seq_tool.sources.uta_database import UTA_DB_URL, UtaDatabase, get_liftover
|
14
|
+
|
15
|
+
_logger = logging.getLogger(__name__)
|
16
|
+
|
17
|
+
|
18
|
+
# Result record for ``check_status()``: one field per checked resource. Field order
# fixes the positional layout, so the data-file fields stay between "seqrepo" and
# "liftover".
ResourceStatus = namedtuple(
    typename="ResourceStatus",
    field_names=[
        "uta",
        "seqrepo",
        *(
            data_file.lower()
            for data_file in (
                DataFile.TRANSCRIPT_MAPPINGS,
                DataFile.MANE_SUMMARY,
                DataFile.LRG_REFSEQGENE,
            )
        ),
        "liftover",
    ],
)
|
29
|
+
|
30
|
+
|
31
|
+
async def check_status(
    transcript_file_path: Path | None = None,
    lrg_refseqgene_path: Path | None = None,
    mane_data_path: Path | None = None,
    db_url: str = UTA_DB_URL,
    sr: SeqRepo | None = None,
    chain_file_37_to_38: str | None = None,
    chain_file_38_to_37: str | None = None,
) -> ResourceStatus:
    """Perform basic status checks on availability of required data resources.

    Arguments are intended to mirror arguments to :py:meth:`cool_seq_tool.app.CoolSeqTool.__init__`.

    Additional arguments are available for testing paths to specific chainfiles (same
    signature as :py:meth:`cool_seq_tool.sources.uta_database.UtaDatabase.__init__`).
    Note that chainfile failures also entail UTA initialization failure; this status is
    reported separately to enable more precise debugging.

    Every check is independent: a failure is logged and recorded as ``False`` for that
    resource, and the remaining checks still run.

    >>> from cool_seq_tool.resources.status import check_status
    >>> await check_status()
    ResourceStatus(uta=True, seqrepo=True, transcript_mappings=True, mane_summary=True, lrg_refseqgene=True, liftover=True)

    :param transcript_file_path: The path to ``transcript_mapping.tsv``
    :param lrg_refseqgene_path: The path to the LRG_RefSeqGene file
    :param mane_data_path: Path to RefSeq MANE summary data
    :param db_url: PostgreSQL connection URL
        Format: ``driver://user:password@host/database/schema``
    :param sr: SeqRepo instance to test. If not provided, one is created from
        ``SEQREPO_ROOT_DIR``
    :param chain_file_37_to_38: Optional path to chain file for 37 to 38 assembly. This
        is used for ``agct``. If this is not provided, will check to see if
        ``LIFTOVER_CHAIN_37_TO_38`` env var is set. If neither is provided, will allow
        ``agct`` to download a chain file from UCSC
    :param chain_file_38_to_37: Optional path to chain file for 38 to 37 assembly. This
        is used for ``agct``. If this is not provided, will check to see if
        ``LIFTOVER_CHAIN_38_TO_37`` env var is set. If neither is provided, will allow
        ``agct`` to download a chain file from UCSC
    :return: boolean description of availability of each resource, given current
        environment configurations
    """
    file_path_params = {
        DataFile.TRANSCRIPT_MAPPINGS.lower(): transcript_file_path,
        DataFile.LRG_REFSEQGENE.lower(): lrg_refseqgene_path,
        DataFile.MANE_SUMMARY.lower(): mane_data_path,
    }

    # Start pessimistic; each check flips its own entry on success
    status = {
        DataFile.TRANSCRIPT_MAPPINGS.lower(): False,
        DataFile.LRG_REFSEQGENE.lower(): False,
        DataFile.MANE_SUMMARY.lower(): False,
        "liftover": False,
        "uta": False,
        "seqrepo": False,
    }

    # --- data files ---
    for r in DataFile:
        name_lower = r.lower()
        declared_path = file_path_params[name_lower]
        if declared_path:
            if declared_path.exists() and declared_path.is_file():
                status[name_lower] = True
                continue
            # NOTE(review): a bad explicit path does not fail the check by itself;
            # the default lookup below is still attempted. Confirm this matches how
            # CoolSeqTool treats an explicitly supplied path.
            _logger.warning(
                "%s not usable at declared path %s; trying default acquisition",
                name_lower,
                declared_path,
            )
        try:
            get_data_file(r)
        except FileNotFoundError as e:
            # The failing location comes from get_data_file (env var or default
            # lookup), not from `declared_path`, so report the exception itself.
            _logger.error(
                "%s does not exist at configured location: %s", name_lower, e
            )
        except ValueError as e:
            _logger.error("%s is not configured as a valid file: %s", name_lower, e)
        except Exception as e:
            _logger.critical(
                "Encountered unexpected error fetching %s: %s", name_lower, e
            )
        else:
            status[name_lower] = True

    # --- liftover chainfiles ---
    try:
        get_liftover(chain_file_37_to_38, chain_file_38_to_37)
    except (FileNotFoundError, ChainfileError) as e:
        _logger.error("agct converter setup failed: %s", e)
    except Exception as e:
        _logger.critical("Encountered unexpected error setting up agct: %s", e)
    else:
        status["liftover"] = True

    # --- UTA database ---
    # Report the URL that was actually used (`db_url`), not the module default.
    # NOTE(review): the documented URL format includes a password, so these log
    # lines may expose credentials -- consider redacting.
    try:
        await UtaDatabase.create(db_url)
    except (OSError, InvalidCatalogNameError, UndefinedTableError) as e:
        _logger.error("Encountered error instantiating UTA at URI %s: %s", db_url, e)
    except Exception as e:
        _logger.critical(
            "Encountered unexpected error instantiating UTA from URI %s: %s",
            db_url,
            e,
        )
    else:
        status["uta"] = True

    # --- SeqRepo ---
    try:
        if not sr:
            sr = SeqRepo(root_dir=SEQREPO_ROOT_DIR)
        sra = SeqRepoAccess(sr)
        # Smoke test: slice one position from a known accession; the value is
        # discarded, only the absence of an exception matters
        sra.sr["NC_000001.11"][1000:1001]
    except OSError as e:
        _logger.error("Encountered error while instantiating SeqRepo: %s", e)
    except KeyError:
        _logger.error("SeqRepo data fetch test failed -- is it populated?")
    except Exception as e:
        _logger.critical("Encountered unexpected error setting up SeqRepo: %s", e)
    else:
        status["seqrepo"] = True

    structured_status = ResourceStatus(**status)
    if all(status.values()):
        _logger.info("Cool-Seq-Tool resource status passed")
    else:
        _logger.error(
            "Cool-Seq-Tool resource check failed. Result: %s", structured_status
        )
    return structured_status
|
cool_seq_tool/routers/default.py
CHANGED
cool_seq_tool/routers/mane.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
"""Module containing routes related to MANE data"""
|
2
|
+
|
2
3
|
import logging
|
3
|
-
from typing import Optional
|
4
4
|
|
5
5
|
from fastapi import APIRouter, Query
|
6
6
|
|
@@ -45,11 +45,11 @@ async def get_mane_data(
|
|
45
45
|
start_annotation_layer: AnnotationLayer = Query(
|
46
46
|
..., description="Starting annotation layer for query"
|
47
47
|
),
|
48
|
-
end_pos:
|
48
|
+
end_pos: int | None = Query(
|
49
49
|
None, description="End position. If not set, will set to `start_pos`."
|
50
50
|
),
|
51
|
-
gene:
|
52
|
-
ref:
|
51
|
+
gene: str | None = Query(None, description="HGNC gene symbol"),
|
52
|
+
ref: str | None = Query(None, description=ref_descr),
|
53
53
|
try_longest_compatible: bool = Query(
|
54
54
|
True, description=try_longest_compatible_descr
|
55
55
|
),
|