cool-seq-tool 0.4.0.dev3__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cool_seq_tool/__init__.py +7 -11
- cool_seq_tool/app.py +44 -24
- cool_seq_tool/handlers/__init__.py +1 -0
- cool_seq_tool/handlers/seqrepo_access.py +27 -25
- cool_seq_tool/mappers/__init__.py +3 -1
- cool_seq_tool/mappers/alignment.py +5 -6
- cool_seq_tool/mappers/exon_genomic_coords.py +139 -124
- cool_seq_tool/mappers/liftover.py +90 -0
- cool_seq_tool/mappers/mane_transcript.py +208 -113
- cool_seq_tool/resources/__init__.py +1 -0
- cool_seq_tool/resources/data_files.py +93 -0
- cool_seq_tool/resources/status.py +153 -0
- cool_seq_tool/schemas.py +92 -54
- cool_seq_tool/sources/__init__.py +1 -0
- cool_seq_tool/sources/mane_transcript_mappings.py +16 -9
- cool_seq_tool/sources/transcript_mappings.py +41 -32
- cool_seq_tool/sources/uta_database.py +96 -249
- cool_seq_tool/utils.py +44 -4
- {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.5.0.dist-info}/LICENSE +1 -1
- {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.5.0.dist-info}/METADATA +16 -11
- cool_seq_tool-0.5.0.dist-info/RECORD +24 -0
- {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.5.0.dist-info}/WHEEL +1 -1
- cool_seq_tool/api.py +0 -42
- cool_seq_tool/data/__init__.py +0 -2
- cool_seq_tool/data/data_downloads.py +0 -89
- cool_seq_tool/paths.py +0 -28
- cool_seq_tool/routers/__init__.py +0 -16
- cool_seq_tool/routers/default.py +0 -125
- cool_seq_tool/routers/mane.py +0 -98
- cool_seq_tool/routers/mappings.py +0 -155
- cool_seq_tool/version.py +0 -2
- cool_seq_tool-0.4.0.dev3.dist-info/RECORD +0 -29
- /cool_seq_tool/{data → resources}/transcript_mapping.tsv +0 -0
- {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.5.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1 @@
|
|
1
|
+
"""Provide tools for acquiring and managing Cool-Seq-Tool data resources."""
|
@@ -0,0 +1,93 @@
|
|
1
|
+
"""Fetch data files regarding transcript mapping and annotation."""
|
2
|
+
|
3
|
+
import logging
|
4
|
+
from enum import Enum
|
5
|
+
from importlib import resources
|
6
|
+
from os import environ
|
7
|
+
from pathlib import Path
|
8
|
+
|
9
|
+
from wags_tails import NcbiLrgRefSeqGeneData, NcbiManeSummaryData
|
10
|
+
|
11
|
+
_logger = logging.getLogger(__name__)
|
12
|
+
|
13
|
+
|
14
|
+
class DataFile(str, Enum):
    """Enumerate the file resources that :py:meth:`get_data_file() <cool_seq_tool.resources.data_files.get_data_file>` is able to fetch."""

    TRANSCRIPT_MAPPINGS = "transcript_mappings"
    MANE_SUMMARY = "mane_summary"
    LRG_REFSEQGENE = "lrg_refseqgene"

    def lower(self) -> str:
        """Provide this member's value in lower case.

        :return: lower-cased string value of the member
        """
        lowered: str = self.value.lower()
        return lowered
|
27
|
+
|
28
|
+
|
29
|
+
def _bundled_transcript_mappings(_from_local):
    """Locate the ``transcript_mapping.tsv`` file inside this package.

    The argument is accepted only so that every fetcher shares one call shape; it is
    ignored.
    """
    return resources.files(__package__) / "transcript_mapping.tsv"


def _latest_mane_summary(from_local):
    """Fetch the MANE summary file through ``wags-tails``.

    Only the first element of the ``get_latest()`` result is returned (presumably the
    file path -- confirm against the wags-tails documentation).
    """
    latest = NcbiManeSummaryData(silent=True).get_latest(from_local=from_local)
    return latest[0]


def _latest_lrg_refseqgene(from_local):
    """Fetch the LRG RefSeqGene file through ``wags-tails``.

    Only the first element of the ``get_latest()`` result is returned (presumably the
    file path -- confirm against the wags-tails documentation).
    """
    latest = NcbiLrgRefSeqGeneData(silent=True).get_latest(from_local=from_local)
    return latest[0]


# Maps each resource to a pair of:
#   [0] name of the environment variable that may override the file location
#   [1] fetcher called with a single ``from_local`` argument when no override is set
_resource_acquisition_params = {
    DataFile.TRANSCRIPT_MAPPINGS: (
        "TRANSCRIPT_MAPPINGS_PATH",
        _bundled_transcript_mappings,
    ),
    DataFile.MANE_SUMMARY: ("MANE_SUMMARY_PATH", _latest_mane_summary),
    DataFile.LRG_REFSEQGENE: ("LRG_REFSEQGENE_PATH", _latest_lrg_refseqgene),
}
|
47
|
+
|
48
|
+
|
49
|
+
def get_data_file(resource: DataFile, from_local: bool = False) -> Path:
    """Acquire Cool-Seq-Tool file dependency.

    Each resource can be defined using an environment variable:

    * ``DataFile.TRANSCRIPT_MAPPINGS`` -> ``TRANSCRIPT_MAPPINGS_PATH``
    * ``DataFile.MANE_SUMMARY`` -> ``MANE_SUMMARY_PATH``
    * ``DataFile.LRG_REFSEQGENE`` -> ``LRG_REFSEQGENE_PATH``

    Otherwise, this function falls back on default expected locations:

    * ``transcript_mapping.tsv`` is bundled with this library.
    * LRG RefseqGene and MANE summary files are acquired from NCBI using the
      `wags-tails <https://wags-tails.readthedocs.io/stable/>`_ package if
      unavailable locally, or out of date.

    :param resource: resource to fetch
    :param from_local: if ``True``, don't check for or acquire latest version -- just
        provide most recent locally available file and raise FileNotFoundError
        otherwise. Only consulted when no environment variable override is set.
    :return: path to file. Consuming functions can assume that it exists and is a file.
    :raise FileNotFoundError: if file location configured by env var doesn't exist
    :raise ValueError: if file location configured by env var isn't a file
    """
    env_var_name, fetch_default = _resource_acquisition_params[resource]
    configured_path = environ.get(env_var_name)
    if configured_path:
        _logger.debug(
            "Acquiring %s via env var %s:%s", resource, env_var_name, configured_path
        )
        path = Path(configured_path)
        # Only build the (long) error text when the configured path is unusable.
        if not path.is_file():
            loc_descr = (
                "the default file bundled with Cool-Seq-Tool"
                if resource == DataFile.TRANSCRIPT_MAPPINGS
                else "the default file pattern and possibly acquire from source via the `wags-tails` package"
            )
            label = env_var_name.replace("_", " ").title()
            remedy = (
                f"Either unset to use {loc_descr}, or ensure that it is available at "
                'this location. See the "Environment configuration" section under the '
                "Usage page within the documentation for more: "
                "https://coolseqtool.readthedocs.io/stable/usage.html#environment-configuration"
            )
            if not path.exists():
                msg = f"No {label} file exists at path {configured_path} defined under env var {env_var_name}. {remedy}"
                raise FileNotFoundError(msg)
            # The path exists but is e.g. a directory: say so instead of claiming
            # that nothing exists there.
            msg = f"{label} {configured_path} defined under env var {env_var_name} exists but is not a file. {remedy}"
            raise ValueError(msg)
    else:
        _logger.debug("Acquiring %s from default location/method.", resource)
        # `from_local` lets the fetcher optionally avoid unnecessary fetches
        path = fetch_default(from_local)
    _logger.debug("Acquired %s at %s", resource, path)
    return path
|
@@ -0,0 +1,153 @@
|
|
1
|
+
"""Enable quick status check of Cool-Seq-Tool resources."""
|
2
|
+
|
3
|
+
import logging
|
4
|
+
from collections import namedtuple
|
5
|
+
from pathlib import Path
|
6
|
+
|
7
|
+
from agct._core import ChainfileError
|
8
|
+
from asyncpg import InvalidCatalogNameError, UndefinedTableError
|
9
|
+
from biocommons.seqrepo import SeqRepo
|
10
|
+
|
11
|
+
from cool_seq_tool.handlers.seqrepo_access import SEQREPO_ROOT_DIR, SeqRepoAccess
|
12
|
+
from cool_seq_tool.mappers.liftover import LiftOver
|
13
|
+
from cool_seq_tool.resources.data_files import DataFile, get_data_file
|
14
|
+
from cool_seq_tool.sources.uta_database import UTA_DB_URL, UtaDatabase
|
15
|
+
|
16
|
+
_logger = logging.getLogger(__name__)
|
17
|
+
|
18
|
+
|
19
|
+
# One boolean slot per resource checked by ``check_status``. The field order below is
# the tuple's positional order; the data-file field names are the lower-cased
# ``DataFile`` values.
ResourceStatus = namedtuple(
    "ResourceStatus",
    [
        "uta",
        "seqrepo",
        *(
            data_file.lower()
            for data_file in (
                DataFile.TRANSCRIPT_MAPPINGS,
                DataFile.MANE_SUMMARY,
                DataFile.LRG_REFSEQGENE,
            )
        ),
        "liftover",
    ],
)
|
30
|
+
|
31
|
+
|
32
|
+
async def check_status(
    transcript_file_path: Path | None = None,
    lrg_refseqgene_path: Path | None = None,
    mane_data_path: Path | None = None,
    db_url: str = UTA_DB_URL,
    sr: SeqRepo | None = None,
    chain_file_37_to_38: str | None = None,
    chain_file_38_to_37: str | None = None,
) -> ResourceStatus:
    """Perform basic status checks on availability of required data resources.

    Arguments are intended to mirror arguments to :py:meth:`cool_seq_tool.app.CoolSeqTool.__init__`.

    Additional arguments are available for testing paths to specific chainfiles (same
    signature as :py:meth:`cool_seq_tool.mappers.liftover.LiftOver.__init__`).

    >>> from cool_seq_tool.resources.status import check_status
    >>> await check_status()
    ResourceStatus(uta=True, seqrepo=True, transcript_mappings=True, mane_summary=True, lrg_refseqgene=True, liftover=True)

    Every failure is logged and reflected as ``False`` in the result; this function
    does not propagate exceptions raised by the individual checks.

    :param transcript_file_path: The path to ``transcript_mapping.tsv``
    :param lrg_refseqgene_path: The path to the LRG_RefSeqGene file
    :param mane_data_path: Path to RefSeq MANE summary data
    :param db_url: PostgreSQL connection URL
        Format: ``driver://user:password@host/database/schema``
    :param sr: SeqRepo instance. If not provided, one is created from
        ``SEQREPO_ROOT_DIR``
    :param chain_file_37_to_38: Optional path to chain file for 37 to 38 assembly. This
        is used for ``agct``. If this is not provided, will check to see if
        ``LIFTOVER_CHAIN_37_TO_38`` env var is set. If neither is provided, will allow
        ``agct`` to download a chain file from UCSC
    :param chain_file_38_to_37: Optional path to chain file for 38 to 37 assembly. This
        is used for ``agct``. If this is not provided, will check to see if
        ``LIFTOVER_CHAIN_38_TO_37`` env var is set. If neither is provided, will allow
        ``agct`` to download a chain file from UCSC
    :return: boolean description of availability of each resource, given current
        environment configurations
    """
    file_path_params = {
        DataFile.TRANSCRIPT_MAPPINGS.lower(): transcript_file_path,
        DataFile.LRG_REFSEQGENE.lower(): lrg_refseqgene_path,
        DataFile.MANE_SUMMARY.lower(): mane_data_path,
    }

    status = {
        DataFile.TRANSCRIPT_MAPPINGS.lower(): False,
        DataFile.LRG_REFSEQGENE.lower(): False,
        DataFile.MANE_SUMMARY.lower(): False,
        "liftover": False,
        "uta": False,
        "seqrepo": False,
    }
    for r in DataFile:
        name_lower = r.lower()
        declared_path = file_path_params[name_lower]
        # An explicitly declared, usable file short-circuits the lookup.
        if declared_path and declared_path.is_file():
            status[name_lower] = True
            continue
        # Otherwise fall back on env var / default acquisition. Failures here stem
        # from that lookup rather than from `declared_path`, so log the exception,
        # which names the location that was actually tried.
        try:
            get_data_file(r)
        except FileNotFoundError as e:
            _logger.error(
                "%s does not exist at configured location: %s", name_lower, e
            )
        except ValueError as e:
            _logger.error(
                "%s is not configured as a valid file: %s", name_lower, e
            )
        except Exception as e:
            _logger.critical(
                "Encountered unexpected error fetching %s: %s", name_lower, e
            )
        else:
            status[name_lower] = True

    try:
        LiftOver(
            chain_file_37_to_38=chain_file_37_to_38,
            chain_file_38_to_37=chain_file_38_to_37,
        )
    except (FileNotFoundError, ChainfileError) as e:
        _logger.error("agct converter setup failed: %s", e)
    except Exception as e:
        _logger.critical("Encountered unexpected error setting up agct: %s", e)
    else:
        status["liftover"] = True

    # Report the URL that was actually used (`db_url`), not the module default.
    # NOTE(review): the URL may embed a password -- consider masking before logging.
    try:
        await UtaDatabase.create(db_url)
    except (OSError, InvalidCatalogNameError, UndefinedTableError) as e:
        _logger.error("Encountered error instantiating UTA at URI %s: %s", db_url, e)
    except Exception as e:
        _logger.critical(
            "Encountered unexpected error instantiating UTA from URI %s: %s",
            db_url,
            e,
        )
    else:
        status["uta"] = True

    try:
        if not sr:
            sr = SeqRepo(root_dir=SEQREPO_ROOT_DIR)
        sra = SeqRepoAccess(sr)
        # Fetch a single base to confirm the repository is populated.
        sra.sr["NC_000001.11"][1000:1001]
    except OSError as e:
        _logger.error("Encountered error while instantiating SeqRepo: %s", e)
    except KeyError:
        _logger.error("SeqRepo data fetch test failed -- is it populated?")
    except Exception as e:
        _logger.critical("Encountered unexpected error setting up SeqRepo: %s", e)
    else:
        status["seqrepo"] = True

    structured_status = ResourceStatus(**status)
    if all(status.values()):
        _logger.info("Cool-Seq-Tool resource status passed")
    else:
        _logger.error(
            "Cool-Seq-Tool resource check failed. Result: %s", structured_status
        )
    return structured_status
|
cool_seq_tool/schemas.py
CHANGED
@@ -1,19 +1,18 @@
|
|
1
1
|
"""Defines attribute constants, useful object structures, and API response schemas."""
|
2
|
+
|
2
3
|
import datetime
|
3
|
-
import re
|
4
4
|
from enum import Enum, IntEnum
|
5
|
-
from typing import
|
5
|
+
from typing import Literal
|
6
6
|
|
7
7
|
from pydantic import (
|
8
8
|
BaseModel,
|
9
9
|
ConfigDict,
|
10
10
|
StrictInt,
|
11
11
|
StrictStr,
|
12
|
-
field_validator,
|
13
12
|
model_validator,
|
14
13
|
)
|
15
14
|
|
16
|
-
from cool_seq_tool
|
15
|
+
from cool_seq_tool import __version__
|
17
16
|
|
18
17
|
_now = str(datetime.datetime.now(tz=datetime.timezone.utc))
|
19
18
|
|
@@ -34,11 +33,16 @@ class Strand(IntEnum):
|
|
34
33
|
|
35
34
|
|
36
35
|
class Assembly(str, Enum):
|
37
|
-
"""
|
36
|
+
"""Define supported genomic assemblies. Must be defined in ascending order"""
|
38
37
|
|
39
38
|
GRCH37 = "GRCh37"
|
40
39
|
GRCH38 = "GRCh38"
|
41
40
|
|
41
|
+
@classmethod
|
42
|
+
def values(cls) -> list[str]:
|
43
|
+
"""Return list of values in enum (ascending assembly order)"""
|
44
|
+
return [item.value for item in cls]
|
45
|
+
|
42
46
|
|
43
47
|
class TranscriptPriority(str, Enum):
|
44
48
|
"""Create Enum for Transcript Priority labels"""
|
@@ -52,10 +56,55 @@ class TranscriptPriority(str, Enum):
|
|
52
56
|
class ResidueMode(str, Enum):
|
53
57
|
"""Create Enum for residue modes.
|
54
58
|
|
59
|
+
We typically prefer to operate in inter-residue coordinates, but users should be
|
60
|
+
careful to define the coordinate mode of their data when calling ``cool-seq-tool``
|
61
|
+
functions.
|
62
|
+
|
55
63
|
| | C | | T | | G | |
|
56
64
|
ZERO | | 0 | | 1 | | 2 | |
|
57
65
|
RESIDUE | | 1 | | 2 | | 3 | |
|
58
66
|
INTER_RESIDUE | 0 | | 1 | | 2 | | 3 |
|
67
|
+
|
68
|
+
.. tabularcolumns:: |L|C|C|C|C|C|C|C|
|
69
|
+
.. list-table::
|
70
|
+
:header-rows: 1
|
71
|
+
|
72
|
+
* -
|
73
|
+
-
|
74
|
+
- C
|
75
|
+
-
|
76
|
+
- T
|
77
|
+
-
|
78
|
+
- G
|
79
|
+
-
|
80
|
+
* - ``ZERO``
|
81
|
+
-
|
82
|
+
- 0
|
83
|
+
-
|
84
|
+
- 1
|
85
|
+
-
|
86
|
+
- 2
|
87
|
+
-
|
88
|
+
* - ``RESIDUE``
|
89
|
+
-
|
90
|
+
- 1
|
91
|
+
-
|
92
|
+
- 2
|
93
|
+
-
|
94
|
+
- 3
|
95
|
+
-
|
96
|
+
* - ``INTER_RESIDUE``
|
97
|
+
- 0
|
98
|
+
-
|
99
|
+
- 1
|
100
|
+
-
|
101
|
+
- 2
|
102
|
+
-
|
103
|
+
- 3
|
104
|
+
|
105
|
+
|
106
|
+
See "Conventions that promote reliable data sharing" and figure 3 within the
|
107
|
+
`Variation Representation Schema (VRS) paper <https://www.ncbi.nlm.nih.gov/pmc/articles/pmid/35311178/>`_ for further discussion.
|
59
108
|
"""
|
60
109
|
|
61
110
|
ZERO = "zero"
|
@@ -70,12 +119,12 @@ class BaseModelForbidExtra(BaseModel, extra="forbid"):
|
|
70
119
|
class GenomicRequestBody(BaseModelForbidExtra):
|
71
120
|
"""Define constraints for genomic to transcript exon coordinates request body"""
|
72
121
|
|
73
|
-
chromosome:
|
74
|
-
start:
|
75
|
-
end:
|
76
|
-
strand:
|
77
|
-
transcript:
|
78
|
-
gene:
|
122
|
+
chromosome: StrictStr | StrictInt
|
123
|
+
start: StrictInt | None = None
|
124
|
+
end: StrictInt | None = None
|
125
|
+
strand: Strand | None = None
|
126
|
+
transcript: StrictStr | None = None
|
127
|
+
gene: StrictStr | None = None
|
79
128
|
residue_mode: ResidueMode = ResidueMode.RESIDUE
|
80
129
|
|
81
130
|
@model_validator(mode="after")
|
@@ -106,11 +155,11 @@ class TranscriptRequestBody(BaseModelForbidExtra):
|
|
106
155
|
"""Define constraints for transcript exon to genomic coordinates request body"""
|
107
156
|
|
108
157
|
transcript: StrictStr
|
109
|
-
gene:
|
110
|
-
exon_start:
|
111
|
-
exon_start_offset:
|
112
|
-
exon_end:
|
113
|
-
exon_end_offset:
|
158
|
+
gene: StrictStr | None = None
|
159
|
+
exon_start: StrictInt | None = None
|
160
|
+
exon_start_offset: StrictInt | None = 0
|
161
|
+
exon_end: StrictInt | None = None
|
162
|
+
exon_end_offset: StrictInt | None = 0
|
114
163
|
|
115
164
|
@model_validator(mode="after")
|
116
165
|
def check_exon_start_and_exon_end(cls, values):
|
@@ -166,12 +215,12 @@ class GenomicData(BaseModelForbidExtra):
|
|
166
215
|
|
167
216
|
gene: StrictStr
|
168
217
|
chr: StrictStr
|
169
|
-
start:
|
170
|
-
end:
|
171
|
-
exon_start:
|
172
|
-
exon_start_offset:
|
173
|
-
exon_end:
|
174
|
-
exon_end_offset:
|
218
|
+
start: StrictInt | None = None # Genomic start position
|
219
|
+
end: StrictInt | None = None # Genomic end position
|
220
|
+
exon_start: StrictInt | None = None
|
221
|
+
exon_start_offset: StrictInt | None = 0
|
222
|
+
exon_end: StrictInt | None = None
|
223
|
+
exon_end_offset: StrictInt | None = 0
|
175
224
|
transcript: StrictStr
|
176
225
|
strand: Strand
|
177
226
|
|
@@ -226,20 +275,9 @@ class ServiceMeta(BaseModelForbidExtra):
|
|
226
275
|
name: Literal["cool_seq_tool"] = "cool_seq_tool"
|
227
276
|
version: StrictStr
|
228
277
|
response_datetime: datetime.datetime
|
229
|
-
url: Literal[
|
278
|
+
url: Literal["https://github.com/GenomicMedLab/cool-seq-tool"] = (
|
230
279
|
"https://github.com/GenomicMedLab/cool-seq-tool"
|
231
|
-
|
232
|
-
|
233
|
-
@field_validator("version")
|
234
|
-
def validate_version(cls, v):
|
235
|
-
"""Check version matches semantic versioning regex pattern.
|
236
|
-
https://semver.org/#is-there-a-suggested-regular-expression-regex-to-check-a-semver-string
|
237
|
-
"""
|
238
|
-
version_regex = r"^(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)(?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$"
|
239
|
-
if not re.match(version_regex, v):
|
240
|
-
msg = f"Invalid version {v}"
|
241
|
-
raise ValueError(msg)
|
242
|
-
return v
|
280
|
+
)
|
243
281
|
|
244
282
|
model_config = ConfigDict(
|
245
283
|
json_schema_extra={
|
@@ -256,8 +294,8 @@ class ServiceMeta(BaseModelForbidExtra):
|
|
256
294
|
class TranscriptExonDataResponse(BaseModelForbidExtra):
|
257
295
|
"""Response model for Transcript Exon Data"""
|
258
296
|
|
259
|
-
transcript_exon_data:
|
260
|
-
warnings:
|
297
|
+
transcript_exon_data: TranscriptExonData | None = None
|
298
|
+
warnings: list[StrictStr] = []
|
261
299
|
service_meta: ServiceMeta
|
262
300
|
|
263
301
|
model_config = ConfigDict(
|
@@ -287,8 +325,8 @@ class TranscriptExonDataResponse(BaseModelForbidExtra):
|
|
287
325
|
class GenomicDataResponse(BaseModelForbidExtra):
|
288
326
|
"""Response model for Genomic Data"""
|
289
327
|
|
290
|
-
genomic_data:
|
291
|
-
warnings:
|
328
|
+
genomic_data: GenomicData | None = None
|
329
|
+
warnings: list[StrictStr] = []
|
292
330
|
service_meta: ServiceMeta
|
293
331
|
|
294
332
|
model_config = ConfigDict(
|
@@ -323,7 +361,7 @@ class MappedManeData(BaseModel):
|
|
323
361
|
|
324
362
|
gene: StrictStr
|
325
363
|
refseq: StrictStr
|
326
|
-
ensembl:
|
364
|
+
ensembl: StrictStr | None = None
|
327
365
|
strand: Strand
|
328
366
|
status: TranscriptPriority
|
329
367
|
alt_ac: StrictStr
|
@@ -338,7 +376,7 @@ class MappedManeData(BaseModel):
|
|
338
376
|
"strand": Strand.NEGATIVE,
|
339
377
|
"status": TranscriptPriority.MANE_PLUS_CLINICAL,
|
340
378
|
"alt_ac": "NC_000007.13",
|
341
|
-
"assembly":
|
379
|
+
"assembly": Assembly.GRCH37,
|
342
380
|
}
|
343
381
|
}
|
344
382
|
)
|
@@ -347,8 +385,8 @@ class MappedManeData(BaseModel):
|
|
347
385
|
class MappedManeDataService(BaseModelForbidExtra):
|
348
386
|
"""Service model response for mapped mane data"""
|
349
387
|
|
350
|
-
mapped_mane_data:
|
351
|
-
warnings:
|
388
|
+
mapped_mane_data: MappedManeData | None = None
|
389
|
+
warnings: list[StrictStr] = []
|
352
390
|
service_meta: ServiceMeta
|
353
391
|
|
354
392
|
model_config = ConfigDict(
|
@@ -361,7 +399,7 @@ class MappedManeDataService(BaseModelForbidExtra):
|
|
361
399
|
"strand": Strand.NEGATIVE,
|
362
400
|
"status": TranscriptPriority.MANE_PLUS_CLINICAL,
|
363
401
|
"alt_ac": "NC_000007.13",
|
364
|
-
"assembly":
|
402
|
+
"assembly": Assembly.GRCH37,
|
365
403
|
},
|
366
404
|
"warnings": [],
|
367
405
|
"service_meta": {
|
@@ -378,10 +416,10 @@ class MappedManeDataService(BaseModelForbidExtra):
|
|
378
416
|
class ManeData(BaseModel):
|
379
417
|
"""Define mane data fields"""
|
380
418
|
|
381
|
-
gene:
|
382
|
-
refseq:
|
383
|
-
ensembl:
|
384
|
-
pos:
|
419
|
+
gene: StrictStr | None = None
|
420
|
+
refseq: StrictStr | None = None
|
421
|
+
ensembl: StrictStr | None = None
|
422
|
+
pos: tuple[int, int]
|
385
423
|
strand: Strand
|
386
424
|
status: TranscriptPriority
|
387
425
|
|
@@ -402,8 +440,8 @@ class ManeData(BaseModel):
|
|
402
440
|
class ManeDataService(BaseModelForbidExtra):
|
403
441
|
"""Service model response for getting mane data"""
|
404
442
|
|
405
|
-
mane_data:
|
406
|
-
warnings:
|
443
|
+
mane_data: ManeData | None = None
|
444
|
+
warnings: list[StrictStr] = []
|
407
445
|
service_meta: ServiceMeta
|
408
446
|
|
409
447
|
model_config = ConfigDict(
|
@@ -457,8 +495,8 @@ class CdnaRepresentation(BaseModelForbidExtra):
|
|
457
495
|
class ToCdnaService(BaseModelForbidExtra):
|
458
496
|
"""Service model response for protein -> cDNA"""
|
459
497
|
|
460
|
-
c_data:
|
461
|
-
warnings:
|
498
|
+
c_data: CdnaRepresentation | None = None
|
499
|
+
warnings: list[StrictStr] = []
|
462
500
|
service_meta: ServiceMeta
|
463
501
|
|
464
502
|
model_config = ConfigDict(
|
@@ -506,8 +544,8 @@ class GenomicRepresentation(BaseModelForbidExtra):
|
|
506
544
|
class ToGenomicService(BaseModelForbidExtra):
|
507
545
|
"""Service model response for cDNA -> genomic"""
|
508
546
|
|
509
|
-
g_data:
|
510
|
-
warnings:
|
547
|
+
g_data: GenomicRepresentation | None = None
|
548
|
+
warnings: list[StrictStr] = []
|
511
549
|
service_meta: ServiceMeta
|
512
550
|
|
513
551
|
model_config = ConfigDict(
|
@@ -1,15 +1,15 @@
|
|
1
1
|
"""Provide fast tabular access to MANE summary file. Enables retrieval of associated
|
2
2
|
MANE transcripts for gene symbols, genomic positions, or transcript accessions.
|
3
3
|
"""
|
4
|
+
|
4
5
|
import logging
|
5
6
|
from pathlib import Path
|
6
|
-
from typing import Dict, List
|
7
7
|
|
8
8
|
import polars as pl
|
9
9
|
|
10
|
-
from cool_seq_tool.
|
10
|
+
from cool_seq_tool.resources.data_files import DataFile, get_data_file
|
11
11
|
|
12
|
-
|
12
|
+
_logger = logging.getLogger(__name__)
|
13
13
|
|
14
14
|
|
15
15
|
class ManeTranscriptMappings:
|
@@ -22,11 +22,18 @@ class ManeTranscriptMappings:
|
|
22
22
|
See the `NCBI MANE page <https://www.ncbi.nlm.nih.gov/refseq/MANE/>`_ for more information.
|
23
23
|
"""
|
24
24
|
|
25
|
-
def __init__(
|
25
|
+
def __init__(
|
26
|
+
self, mane_data_path: Path | None = None, from_local: bool = False
|
27
|
+
) -> None:
|
26
28
|
"""Initialize the MANE Transcript mappings class.
|
27
29
|
|
28
|
-
:param
|
30
|
+
:param mane_data_path: Path to RefSeq MANE summary data
|
31
|
+
:param from_local: if ``True``, don't check for or acquire latest version --
|
32
|
+
just provide most recent locally available file, if possible, and raise
|
33
|
+
error otherwise
|
29
34
|
"""
|
35
|
+
if not mane_data_path:
|
36
|
+
mane_data_path = get_data_file(DataFile.MANE_SUMMARY, from_local)
|
30
37
|
self.mane_data_path = mane_data_path
|
31
38
|
self.df = self._load_mane_transcript_data()
|
32
39
|
|
@@ -37,7 +44,7 @@ class ManeTranscriptMappings:
|
|
37
44
|
"""
|
38
45
|
return pl.read_csv(self.mane_data_path, separator="\t")
|
39
46
|
|
40
|
-
def get_gene_mane_data(self, gene_symbol: str) ->
|
47
|
+
def get_gene_mane_data(self, gene_symbol: str) -> list[dict]:
|
41
48
|
"""Return MANE Transcript data for a gene.
|
42
49
|
|
43
50
|
>>> from cool_seq_tool.sources import ManeTranscriptMappings
|
@@ -56,7 +63,7 @@ class ManeTranscriptMappings:
|
|
56
63
|
data = self.df.filter(pl.col("symbol") == gene_symbol.upper())
|
57
64
|
|
58
65
|
if len(data) == 0:
|
59
|
-
|
66
|
+
_logger.warning(
|
60
67
|
"Unable to get MANE Transcript data for gene: %s", gene_symbol
|
61
68
|
)
|
62
69
|
return []
|
@@ -64,7 +71,7 @@ class ManeTranscriptMappings:
|
|
64
71
|
data = data.sort(by="MANE_status", descending=True)
|
65
72
|
return data.to_dicts()
|
66
73
|
|
67
|
-
def get_mane_from_transcripts(self, transcripts:
|
74
|
+
def get_mane_from_transcripts(self, transcripts: list[str]) -> list[dict]:
|
68
75
|
"""Get mane transcripts from a list of transcripts
|
69
76
|
|
70
77
|
:param List[str] transcripts: RefSeq transcripts on c. coordinate
|
@@ -77,7 +84,7 @@ class ManeTranscriptMappings:
|
|
77
84
|
|
78
85
|
def get_mane_data_from_chr_pos(
|
79
86
|
self, alt_ac: str, start: int, end: int
|
80
|
-
) ->
|
87
|
+
) -> list[dict]:
|
81
88
|
"""Get MANE data given a GRCh38 genomic position.
|
82
89
|
|
83
90
|
:param str alt_ac: NC Accession
|