cool-seq-tool 0.3.0.dev1__py3-none-any.whl → 0.4.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cool_seq_tool/api.py +3 -3
- cool_seq_tool/app.py +32 -11
- cool_seq_tool/data/data_downloads.py +8 -5
- cool_seq_tool/handlers/seqrepo_access.py +55 -27
- cool_seq_tool/mappers/__init__.py +4 -1
- cool_seq_tool/mappers/alignment.py +40 -37
- cool_seq_tool/mappers/exon_genomic_coords.py +329 -138
- cool_seq_tool/mappers/mane_transcript.py +402 -227
- cool_seq_tool/routers/mane.py +1 -1
- cool_seq_tool/routers/mappings.py +1 -1
- cool_seq_tool/schemas.py +31 -24
- cool_seq_tool/sources/__init__.py +4 -2
- cool_seq_tool/sources/mane_transcript_mappings.py +28 -7
- cool_seq_tool/sources/transcript_mappings.py +27 -11
- cool_seq_tool/sources/uta_database.py +179 -232
- cool_seq_tool/utils.py +22 -24
- cool_seq_tool/version.py +1 -1
- {cool_seq_tool-0.3.0.dev1.dist-info → cool_seq_tool-0.4.0.dev0.dist-info}/LICENSE +1 -1
- cool_seq_tool-0.4.0.dev0.dist-info/METADATA +130 -0
- cool_seq_tool-0.4.0.dev0.dist-info/RECORD +28 -0
- {cool_seq_tool-0.3.0.dev1.dist-info → cool_seq_tool-0.4.0.dev0.dist-info}/WHEEL +1 -1
- cool_seq_tool/data/transcript_mapping.tsv +0 -256226
- cool_seq_tool-0.3.0.dev1.dist-info/METADATA +0 -187
- cool_seq_tool-0.3.0.dev1.dist-info/RECORD +0 -29
- {cool_seq_tool-0.3.0.dev1.dist-info → cool_seq_tool-0.4.0.dev0.dist-info}/top_level.txt +0 -0
@@ -1,19 +1,20 @@
|
|
1
|
-
"""
|
1
|
+
"""Provide mapping capabilities between transcript exon and genomic coordinates."""
|
2
2
|
import logging
|
3
3
|
from typing import Dict, List, Optional, Tuple, TypeVar, Union
|
4
4
|
|
5
|
-
from cool_seq_tool.mappers import
|
5
|
+
from cool_seq_tool.mappers.mane_transcript import CdnaRepresentation, ManeTranscript
|
6
6
|
from cool_seq_tool.schemas import (
|
7
7
|
AnnotationLayer,
|
8
8
|
Assembly,
|
9
9
|
GenomicData,
|
10
10
|
GenomicDataResponse,
|
11
11
|
ResidueMode,
|
12
|
+
Strand,
|
12
13
|
TranscriptExonData,
|
13
14
|
TranscriptExonDataResponse,
|
14
15
|
)
|
15
|
-
from cool_seq_tool.sources.uta_database import
|
16
|
-
from cool_seq_tool.utils import service_meta
|
16
|
+
from cool_seq_tool.sources.uta_database import UtaDatabase
|
17
|
+
from cool_seq_tool.utils import get_inter_residue_pos, service_meta
|
17
18
|
|
18
19
|
CoordinatesResponseType = TypeVar(
|
19
20
|
"CoordinatesResponseType", GenomicDataResponse, TranscriptExonDataResponse
|
@@ -23,14 +24,33 @@ logger = logging.getLogger(__name__)
|
|
23
24
|
|
24
25
|
|
25
26
|
class ExonGenomicCoordsMapper:
|
26
|
-
"""
|
27
|
-
representation
|
27
|
+
"""Provide capabilities for mapping transcript exon representation to/from genomic
|
28
|
+
coordinate representation.
|
28
29
|
"""
|
29
30
|
|
30
|
-
def __init__(self, uta_db:
|
31
|
-
"""Initialize ExonGenomicCoordsMapper class
|
31
|
+
def __init__(self, uta_db: UtaDatabase, mane_transcript: ManeTranscript) -> None:
|
32
|
+
"""Initialize ExonGenomicCoordsMapper class.
|
32
33
|
|
33
|
-
|
34
|
+
A lot of resources are required for initialization, so when defaults are enough,
|
35
|
+
it's easiest to let the core CoolSeqTool class handle it for you:
|
36
|
+
|
37
|
+
>>> from cool_seq_tool.app import CoolSeqTool
|
38
|
+
>>> egc = CoolSeqTool().ex_g_coords_mapper
|
39
|
+
|
40
|
+
Note that this class's public methods are all defined as ``async``, so they will
|
41
|
+
need to be called with ``await`` when called from a function, or run from an
|
42
|
+
event loop. See the :ref:`Usage section <async_note>` for more information.
|
43
|
+
|
44
|
+
>>> import asyncio
|
45
|
+
>>> result = asyncio.run(egc.transcript_to_genomic_coordinates(
|
46
|
+
... "NM_002529.3",
|
47
|
+
... exon_start=2,
|
48
|
+
... exon_end=17
|
49
|
+
... ))
|
50
|
+
>>> result.genomic_data.start, result.genomic_data.end
|
51
|
+
(156864428, 156881456)
|
52
|
+
|
53
|
+
:param uta_db: UtaDatabase instance to give access to query UTA database
|
34
54
|
:param mane_transcript: Instance to align to MANE or compatible representation
|
35
55
|
"""
|
36
56
|
self.uta_db = uta_db
|
@@ -43,8 +63,8 @@ class ExonGenomicCoordsMapper:
|
|
43
63
|
"""Add warnings to response object
|
44
64
|
|
45
65
|
:param resp: Response object
|
46
|
-
:param warning_msg: Warning message on why
|
47
|
-
|
66
|
+
:param warning_msg: Warning message on why ``transcript_exon_data`` or
|
67
|
+
``genomic_data`` field is ``None``
|
48
68
|
:return: Response object with warning message
|
49
69
|
"""
|
50
70
|
logger.warning(warning_msg)
|
@@ -53,22 +73,35 @@ class ExonGenomicCoordsMapper:
|
|
53
73
|
|
54
74
|
async def transcript_to_genomic_coordinates(
|
55
75
|
self,
|
76
|
+
transcript: str,
|
56
77
|
gene: Optional[str] = None,
|
57
|
-
transcript: Optional[str] = None,
|
58
78
|
exon_start: Optional[int] = None,
|
59
79
|
exon_start_offset: int = 0,
|
60
80
|
exon_end: Optional[int] = None,
|
61
81
|
exon_end_offset: int = 0,
|
62
|
-
**kwargs,
|
63
82
|
) -> GenomicDataResponse:
|
64
83
|
"""Get genomic data given transcript data.
|
65
|
-
Will use GRCh38 coordinates if possible
|
66
84
|
|
67
|
-
|
85
|
+
By default, transcript data is aligned to the GRCh38 assembly.
|
86
|
+
|
87
|
+
>>> import asyncio
|
88
|
+
>>> from cool_seq_tool.app import CoolSeqTool
|
89
|
+
>>> egc = CoolSeqTool().ex_g_coords_mapper
|
90
|
+
>>> tpm3 = asyncio.run(egc.transcript_to_genomic_coordinates(
|
91
|
+
...     "NM_152263.3",
|
92
|
+
...     gene="TPM3",
|
93
|
+
... exon_start=1, exon_end=8,
|
94
|
+
... ))
|
95
|
+
>>> tpm3.genomic_data.chr, tpm3.genomic_data.start, tpm3.genomic_data.end
|
96
|
+
('NC_000001.11', 154192135, 154170399)
|
97
|
+
|
68
98
|
:param transcript: Transcript accession
|
69
|
-
:param
|
70
|
-
:param
|
99
|
+
:param gene: HGNC gene symbol
|
100
|
+
:param exon_start: Starting transcript exon number (1-based). If not provided,
|
101
|
+
must provide ``exon_end``
|
71
102
|
:param exon_start_offset: Starting exon offset
|
103
|
+
:param exon_end: Ending transcript exon number (1-based). If not provided, must
|
104
|
+
provide ``exon_start``
|
72
105
|
:param exon_end_offset: Ending exon offset
|
73
106
|
:return: GRCh38 genomic data (inter-residue coordinates)
|
74
107
|
"""
|
@@ -76,80 +109,102 @@ class ExonGenomicCoordsMapper:
|
|
76
109
|
genomic_data=None, warnings=[], service_meta=service_meta()
|
77
110
|
)
|
78
111
|
|
112
|
+
# Ensure valid inputs
|
79
113
|
if not transcript:
|
80
114
|
return self._return_warnings(resp, "Must provide `transcript`")
|
81
115
|
else:
|
82
116
|
transcript = transcript.strip()
|
83
117
|
|
84
|
-
|
118
|
+
exon_start_exists, exon_end_exists = False, False
|
119
|
+
if exon_start is not None:
|
120
|
+
if exon_start < 1:
|
121
|
+
return self._return_warnings(resp, "`exon_start` cannot be less than 1")
|
122
|
+
exon_start_exists = True
|
123
|
+
|
124
|
+
if exon_end is not None:
|
125
|
+
if exon_end < 1:
|
126
|
+
return self._return_warnings(resp, "`exon_end` cannot be less than 1")
|
127
|
+
exon_end_exists = True
|
128
|
+
|
129
|
+
if not exon_start_exists and not exon_end_exists:
|
85
130
|
return self._return_warnings(
|
86
131
|
resp, "Must provide either `exon_start` or `exon_end`"
|
87
132
|
)
|
88
|
-
|
89
|
-
if gene:
|
90
|
-
gene = gene.upper().strip()
|
91
|
-
|
92
|
-
if exon_start and exon_end:
|
133
|
+
elif exon_start_exists and exon_end_exists:
|
93
134
|
if exon_start > exon_end:
|
94
135
|
return self._return_warnings(
|
95
136
|
resp,
|
96
137
|
f"Start exon {exon_start} is greater than end exon {exon_end}",
|
97
138
|
)
|
98
139
|
|
140
|
+
# Get all exons and associated start/end coordinates for transcript
|
99
141
|
tx_exons, warning = await self.uta_db.get_tx_exons(transcript)
|
100
142
|
if not tx_exons:
|
101
143
|
return self._return_warnings(resp, warning or "")
|
102
144
|
|
103
|
-
|
145
|
+
# Get exon start and exon end coordinates
|
146
|
+
tx_exon_coords, warning = self.get_tx_exon_coords(
|
104
147
|
transcript, tx_exons, exon_start, exon_end
|
105
148
|
)
|
106
149
|
if not tx_exon_coords:
|
107
150
|
return self._return_warnings(resp, warning or "")
|
108
|
-
|
151
|
+
tx_exon_start_coords, tx_exon_end_coords = tx_exon_coords
|
109
152
|
|
110
|
-
|
111
|
-
|
153
|
+
if gene:
|
154
|
+
gene = gene.upper().strip()
|
155
|
+
|
156
|
+
# Get aligned genomic data (hgnc gene, alt_ac, alt_start_i, alt_end_i, strand)
|
157
|
+
# for exon(s)
|
158
|
+
alt_ac_start_end, warning = await self._get_alt_ac_start_and_end(
|
159
|
+
transcript, tx_exon_start_coords, tx_exon_end_coords, gene=gene
|
112
160
|
)
|
113
161
|
if not alt_ac_start_end:
|
114
162
|
return self._return_warnings(resp, warning or "")
|
115
|
-
|
163
|
+
alt_ac_start_data, alt_ac_end_data = alt_ac_start_end
|
116
164
|
|
117
|
-
gene
|
118
|
-
|
165
|
+
# Get gene and chromosome data, check that at least one was retrieved
|
166
|
+
gene = alt_ac_start_data[0] if alt_ac_start_data else alt_ac_end_data[0]
|
167
|
+
chromosome = alt_ac_start_data[1] if alt_ac_start_data else alt_ac_end_data[1]
|
119
168
|
if gene is None or chromosome is None:
|
120
169
|
return self._return_warnings(
|
121
170
|
resp,
|
122
|
-
"Unable to retrieve `gene` or `chromosome` from "
|
123
|
-
"genomic
|
171
|
+
"Unable to retrieve `gene` or `chromosome` from genomic start and "
|
172
|
+
"genomic end data",
|
124
173
|
)
|
125
174
|
|
126
|
-
|
127
|
-
|
128
|
-
strand =
|
175
|
+
g_start = alt_ac_start_data[3] - 1 if alt_ac_start_data else None
|
176
|
+
g_end = alt_ac_end_data[2] + 1 if alt_ac_end_data else None
|
177
|
+
strand = (
|
178
|
+
Strand(alt_ac_start_data[4])
|
179
|
+
if alt_ac_start_data
|
180
|
+
else Strand(alt_ac_end_data[4])
|
181
|
+
)
|
129
182
|
|
130
183
|
# Compare against None rather than truthiness, since a coordinate could be 0
|
131
|
-
start_exits =
|
132
|
-
end_exists =
|
184
|
+
start_exits = g_start is not None
|
185
|
+
end_exists = g_end is not None
|
133
186
|
|
134
|
-
|
187
|
+
# Calculate offsets
|
188
|
+
if strand == Strand.NEGATIVE:
|
135
189
|
start_offset = exon_start_offset * -1 if start_exits else None
|
136
|
-
end_offset = exon_end_offset * -1 if end_exists else
|
190
|
+
end_offset = exon_end_offset * -1 if end_exists else 0
|
137
191
|
else:
|
138
|
-
start_offset = exon_start_offset if start_exits else
|
139
|
-
end_offset = exon_end_offset if end_exists else
|
192
|
+
start_offset = exon_start_offset if start_exits else 0
|
193
|
+
end_offset = exon_end_offset if end_exists else 0
|
140
194
|
|
141
|
-
|
142
|
-
|
195
|
+
# Get genomic coordinates with offsets included
|
196
|
+
g_start = g_start + start_offset if start_exits else None
|
197
|
+
g_end = g_end + end_offset if end_exists else None
|
143
198
|
|
144
199
|
resp.genomic_data = GenomicData(
|
145
200
|
gene=gene,
|
146
201
|
chr=chromosome,
|
147
|
-
start=
|
148
|
-
end=
|
202
|
+
start=g_start,
|
203
|
+
end=g_end,
|
149
204
|
exon_start=exon_start if start_exits else None,
|
150
|
-
exon_start_offset=exon_start_offset
|
205
|
+
exon_start_offset=exon_start_offset,
|
151
206
|
exon_end=exon_end if end_exists else None,
|
152
|
-
exon_end_offset=exon_end_offset
|
207
|
+
exon_end_offset=exon_end_offset,
|
153
208
|
transcript=transcript,
|
154
209
|
strand=strand,
|
155
210
|
)
|
@@ -158,31 +213,51 @@ class ExonGenomicCoordsMapper:
|
|
158
213
|
|
159
214
|
async def genomic_to_transcript_exon_coordinates(
|
160
215
|
self,
|
161
|
-
chromosome:
|
216
|
+
chromosome: Optional[str] = None,
|
217
|
+
alt_ac: Optional[str] = None,
|
162
218
|
start: Optional[int] = None,
|
163
219
|
end: Optional[int] = None,
|
164
|
-
strand: Optional[
|
220
|
+
strand: Optional[Strand] = None,
|
165
221
|
transcript: Optional[str] = None,
|
166
222
|
gene: Optional[str] = None,
|
167
|
-
residue_mode:
|
168
|
-
|
223
|
+
residue_mode: Union[
|
224
|
+
ResidueMode.INTER_RESIDUE, ResidueMode.RESIDUE
|
225
|
+
] = ResidueMode.RESIDUE,
|
169
226
|
) -> GenomicDataResponse:
|
170
|
-
"""Get transcript data for genomic data.
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
227
|
+
"""Get transcript data for genomic data, lifted over to GRCh38.
|
228
|
+
|
229
|
+
MANE Transcript data will be returned if and only if ``transcript`` is not
|
230
|
+
supplied. ``gene`` must be given in order to retrieve MANE Transcript data.
|
231
|
+
|
232
|
+
>>> import asyncio
|
233
|
+
>>> from cool_seq_tool.app import CoolSeqTool
|
234
|
+
>>> from cool_seq_tool.schemas import Strand
|
235
|
+
>>> egc = CoolSeqTool().ex_g_coords_mapper
|
236
|
+
>>> result = asyncio.run(egc.genomic_to_transcript_exon_coordinates(
|
237
|
+
...     alt_ac="NC_000001.11",
|
238
|
+
... start=154192136,
|
239
|
+
... end=154170400,
|
240
|
+
... strand=Strand.NEGATIVE,
|
241
|
+
... transcript="NM_152263.3"
|
242
|
+
... ))
|
243
|
+
>>> result.genomic_data.exon_start, result.genomic_data.exon_end
|
244
|
+
(1, 8)
|
245
|
+
|
246
|
+
:param chromosome: Chromosome. Must give chromosome without a prefix
|
247
|
+
(e.g. ``1`` or ``X``). If not provided, must provide ``alt_ac``.
|
248
|
+
If ``alt_ac`` is also provided, ``alt_ac`` will be used.
|
249
|
+
:param alt_ac: Genomic accession (e.g. ``NC_000001.11``). If not provided,
|
250
|
+
must provide ``chromosome``. If ``chromosome`` is also provided, ``alt_ac``
|
251
|
+
will be used.
|
177
252
|
:param start: Start genomic position
|
178
253
|
:param end: End genomic position
|
179
|
-
:param strand: Strand
|
254
|
+
:param strand: Strand
|
180
255
|
:param transcript: The transcript to use. If this is not given, we will try the
|
181
256
|
following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining
|
182
|
-
Compatible Transcript
|
183
|
-
|
184
|
-
:param
|
185
|
-
|
257
|
+
Compatible Transcript. See the :ref:`Transcript Selection policy <transcript_selection_policy>`
|
258
|
+
page.
|
259
|
+
:param gene: HGNC gene symbol
|
260
|
+
:param residue_mode: Residue mode for ``start`` and ``end``
|
186
261
|
:return: Genomic data (inter-residue coordinates)
|
187
262
|
"""
|
188
263
|
resp = GenomicDataResponse(
|
@@ -191,21 +266,23 @@ class ExonGenomicCoordsMapper:
|
|
191
266
|
if start is None and end is None:
|
192
267
|
return self._return_warnings(resp, "Must provide either `start` or `end`")
|
193
268
|
|
194
|
-
params = {key: None for key in GenomicData.
|
269
|
+
params = {key: None for key in GenomicData.model_fields.keys()}
|
195
270
|
if gene is not None:
|
196
271
|
gene = gene.upper().strip()
|
197
272
|
|
198
273
|
if start:
|
199
274
|
if residue_mode == ResidueMode.RESIDUE:
|
275
|
+
# zero-based for UTA
|
200
276
|
start -= 1
|
277
|
+
residue_mode = ResidueMode.ZERO
|
201
278
|
start_data = await self._genomic_to_transcript_exon_coordinate(
|
202
|
-
chromosome,
|
203
279
|
start,
|
280
|
+
chromosome=chromosome,
|
281
|
+
alt_ac=alt_ac,
|
204
282
|
strand=strand,
|
205
283
|
transcript=transcript,
|
206
284
|
gene=gene,
|
207
285
|
is_start=True,
|
208
|
-
residue_mode=ResidueMode.INTER_RESIDUE,
|
209
286
|
)
|
210
287
|
if start_data.transcript_exon_data:
|
211
288
|
start_data = start_data.transcript_exon_data.model_dump()
|
@@ -215,16 +292,16 @@ class ExonGenomicCoordsMapper:
|
|
215
292
|
start_data = None
|
216
293
|
|
217
294
|
if end:
|
218
|
-
|
219
|
-
|
295
|
+
end -= 1
|
296
|
+
residue_mode = ResidueMode.ZERO
|
220
297
|
end_data = await self._genomic_to_transcript_exon_coordinate(
|
221
|
-
chromosome,
|
222
298
|
end,
|
299
|
+
chromosome=chromosome,
|
300
|
+
alt_ac=alt_ac,
|
223
301
|
strand=strand,
|
224
302
|
transcript=transcript,
|
225
303
|
gene=gene,
|
226
304
|
is_start=False,
|
227
|
-
residue_mode=ResidueMode.INTER_RESIDUE,
|
228
305
|
)
|
229
306
|
if end_data.transcript_exon_data:
|
230
307
|
end_data = end_data.transcript_exon_data.model_dump()
|
@@ -261,30 +338,139 @@ class ExonGenomicCoordsMapper:
|
|
261
338
|
resp.genomic_data = GenomicData(**params)
|
262
339
|
return resp
|
263
340
|
|
341
|
+
@staticmethod
|
342
|
+
def _validate_exon(
|
343
|
+
transcript: str, tx_exons: List[Tuple[int, int]], exon_number: int
|
344
|
+
) -> Tuple[Optional[Tuple[int, int]], Optional[str]]:
|
345
|
+
"""Validate that exon number exists on a given transcript
|
346
|
+
|
347
|
+
:param transcript: Transcript accession
|
348
|
+
:param tx_exons: List of transcript's exons and associated coordinates
|
349
|
+
:param exon_number: Exon number to validate
|
350
|
+
:return: Exon coordinates for a given exon number and warnings if found
|
351
|
+
"""
|
352
|
+
msg = f"Exon {exon_number} does not exist on {transcript}"
|
353
|
+
try:
|
354
|
+
if exon_number < 1:
|
355
|
+
return None, msg
|
356
|
+
exon = tx_exons[exon_number - 1]
|
357
|
+
except IndexError:
|
358
|
+
return None, msg
|
359
|
+
return exon, None
|
360
|
+
|
361
|
+
def get_tx_exon_coords(
|
362
|
+
self,
|
363
|
+
transcript: str,
|
364
|
+
tx_exons: List[Tuple[int, int]],
|
365
|
+
exon_start: Optional[int] = None,
|
366
|
+
exon_end: Optional[int] = None,
|
367
|
+
) -> Tuple[
|
368
|
+
Optional[Tuple[Optional[Tuple[int, int]], Optional[Tuple[int, int]]]],
|
369
|
+
Optional[str],
|
370
|
+
]:
|
371
|
+
"""Get exon coordinates for ``exon_start`` and ``exon_end``
|
372
|
+
|
373
|
+
:param transcript: Transcript accession
|
374
|
+
:param tx_exons: List of all transcript exons and coordinates
|
375
|
+
:param exon_start: Start exon number
|
376
|
+
:param exon_end: End exon number
|
377
|
+
:return: [Transcript start exon coords, Transcript end exon coords],
|
378
|
+
and warnings if found
|
379
|
+
"""
|
380
|
+
if exon_start is not None:
|
381
|
+
tx_exon_start, warning = self._validate_exon(
|
382
|
+
transcript, tx_exons, exon_start
|
383
|
+
)
|
384
|
+
if not tx_exon_start:
|
385
|
+
return None, warning
|
386
|
+
else:
|
387
|
+
tx_exon_start = None
|
388
|
+
|
389
|
+
if exon_end is not None:
|
390
|
+
tx_exon_end, warning = self._validate_exon(transcript, tx_exons, exon_end)
|
391
|
+
if not tx_exon_end:
|
392
|
+
return None, warning
|
393
|
+
else:
|
394
|
+
tx_exon_end = None
|
395
|
+
return (tx_exon_start, tx_exon_end), None
|
396
|
+
|
397
|
+
async def _get_alt_ac_start_and_end(
|
398
|
+
self,
|
399
|
+
tx_ac: str,
|
400
|
+
tx_exon_start: Optional[Tuple[int, int]] = None,
|
401
|
+
tx_exon_end: Optional[Tuple[int, int]] = None,
|
402
|
+
gene: Optional[str] = None,
|
403
|
+
) -> Tuple[Optional[Tuple[Tuple[int, int], Tuple[int, int]]], Optional[str]]:
|
404
|
+
"""Get aligned genomic coordinates for transcript exon start and end.
|
405
|
+
|
406
|
+
:param tx_ac: Transcript accession
|
407
|
+
:param tx_exon_start: Transcript's exon start coordinates. If not provided,
|
408
|
+
must provide ``tx_exon_end``
|
409
|
+
:param tx_exon_end: Transcript's exon end coordinates. If not provided, must
|
410
|
+
provide ``tx_exon_start``
|
411
|
+
:param gene: HGNC gene symbol
|
412
|
+
:return: Aligned genomic data, and warnings if found
|
413
|
+
"""
|
414
|
+
if tx_exon_start is None and tx_exon_end is None:
|
415
|
+
msg = "Must provide either `tx_exon_start` or `tx_exon_end` or both"
|
416
|
+
logger.warning(msg)
|
417
|
+
return None, msg
|
418
|
+
|
419
|
+
alt_ac_data = {"start": None, "end": None}
|
420
|
+
for exon, key in [(tx_exon_start, "start"), (tx_exon_end, "end")]:
|
421
|
+
if exon:
|
422
|
+
alt_ac_val, warning = await self.uta_db.get_alt_ac_start_or_end(
|
423
|
+
tx_ac, exon[0], exon[1], gene=gene
|
424
|
+
)
|
425
|
+
if alt_ac_val:
|
426
|
+
alt_ac_data[key] = alt_ac_val
|
427
|
+
else:
|
428
|
+
return None, warning
|
429
|
+
|
430
|
+
alt_ac_data_values = alt_ac_data.values()
|
431
|
+
# Validate that start and end alignments have matching gene, genomic accession,
|
432
|
+
# and strand
|
433
|
+
if all(alt_ac_data_values):
|
434
|
+
for i in (0, 1, 4):
|
435
|
+
if alt_ac_data["start"][i] != alt_ac_data["end"][i]:
|
436
|
+
if i == 0:
|
437
|
+
error = "HGNC gene symbol does not match"
|
438
|
+
elif i == 1:
|
439
|
+
error = "Genomic accession does not match"
|
440
|
+
else:
|
441
|
+
error = "Strand does not match"
|
442
|
+
logger.warning(
|
443
|
+
f"{error}: {alt_ac_data['start'][i]} != {alt_ac_data['end'][i]}"
|
444
|
+
)
|
445
|
+
return None, error
|
446
|
+
return tuple(alt_ac_data_values), None
|
447
|
+
|
264
448
|
async def _genomic_to_transcript_exon_coordinate(
|
265
449
|
self,
|
266
|
-
chromosome: Union[str, int],
|
267
450
|
pos: int,
|
268
|
-
|
269
|
-
|
270
|
-
|
451
|
+
chromosome: Optional[str] = None,
|
452
|
+
alt_ac: Optional[str] = None,
|
453
|
+
strand: Optional[Strand] = None,
|
454
|
+
transcript: Optional[str] = None,
|
455
|
+
gene: Optional[str] = None,
|
271
456
|
is_start: bool = True,
|
272
|
-
residue_mode: ResidueMode = ResidueMode.RESIDUE,
|
273
457
|
) -> TranscriptExonDataResponse:
|
274
458
|
"""Convert individual genomic data to transcript data
|
275
459
|
|
276
|
-
:param
|
277
|
-
|
278
|
-
|
279
|
-
|
460
|
+
:param pos: Genomic position (zero-based)
|
461
|
+
:param chromosome: Chromosome. Must give chromosome without a prefix
|
462
|
+
(e.g. ``1`` or ``X``). If not provided, must provide ``alt_ac``.
|
463
|
+
If ``alt_ac`` is also provided, ``alt_ac`` will be used.
|
464
|
+
:param alt_ac: Genomic accession (e.g. ``NC_000001.11``). If not provided,
|
465
|
+
must provide ``chromosome``. If ``chromosome`` is also provided, ``alt_ac``
|
466
|
+
will be used.
|
467
|
+
:param strand: Strand
|
280
468
|
:param transcript: The transcript to use. If this is not given, we will try the
|
281
469
|
following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining
|
282
470
|
Compatible Transcript
|
283
|
-
:param gene:
|
284
|
-
:param is_start:
|
285
|
-
position.
|
286
|
-
:param residue_mode: Default is `resiude` (1-based). Must be either `residue`
|
287
|
-
or `inter-residue` (0-based).
|
471
|
+
:param gene: HGNC gene symbol
|
472
|
+
:param is_start: ``True`` if ``pos`` is start position. ``False`` if ``pos`` is
|
473
|
+
end position.
|
288
474
|
:return: Transcript data (inter-residue coordinates)
|
289
475
|
"""
|
290
476
|
resp = TranscriptExonDataResponse(
|
@@ -296,9 +482,19 @@ class ExonGenomicCoordsMapper:
|
|
296
482
|
resp, "Must provide either `gene` or `transcript`"
|
297
483
|
)
|
298
484
|
|
299
|
-
params = {key: None for key in TranscriptExonData.
|
485
|
+
params = {key: None for key in TranscriptExonData.model_fields.keys()}
|
300
486
|
|
301
|
-
|
487
|
+
if alt_ac:
|
488
|
+
# Check if valid accession is given
|
489
|
+
if not await self.uta_db.validate_genomic_ac(alt_ac):
|
490
|
+
return self._return_warnings(
|
491
|
+
resp, f"Invalid genomic accession: {alt_ac}"
|
492
|
+
)
|
493
|
+
|
494
|
+
genes_alt_acs, warning = await self.uta_db.get_genes_and_alt_acs(
|
495
|
+
pos, strand=strand, alt_ac=alt_ac, gene=gene
|
496
|
+
)
|
497
|
+
elif chromosome:
|
302
498
|
# Check if just chromosome is given. If it is, we should
|
303
499
|
# convert this to the correct accession version
|
304
500
|
if chromosome == "X":
|
@@ -307,21 +503,13 @@ class ExonGenomicCoordsMapper:
|
|
307
503
|
chromosome = 24
|
308
504
|
else:
|
309
505
|
chromosome = int(chromosome)
|
310
|
-
except ValueError:
|
311
|
-
# Check if valid accession is given
|
312
|
-
if not await self.uta_db.validate_genomic_ac(chromosome):
|
313
|
-
return self._return_warnings(resp, f"Invalid chromosome: {chromosome}")
|
314
506
|
|
315
|
-
|
316
|
-
|
317
|
-
genes_alt_acs, warning = await self.uta_db.chr_to_gene_and_accessions(
|
318
|
-
chromosome, pos, strand=strand, alt_ac=chromosome, gene=gene
|
507
|
+
genes_alt_acs, warning = await self.uta_db.get_genes_and_alt_acs(
|
508
|
+
pos, strand=strand, chromosome=chromosome, gene=gene
|
319
509
|
)
|
320
510
|
else:
|
321
|
-
|
322
|
-
|
323
|
-
chromosome, pos, strand=strand, alt_ac=None, gene=gene
|
324
|
-
)
|
511
|
+
genes_alt_acs = None
|
512
|
+
|
325
513
|
if not genes_alt_acs:
|
326
514
|
return self._return_warnings(resp, warning)
|
327
515
|
|
@@ -332,7 +520,7 @@ class ExonGenomicCoordsMapper:
|
|
332
520
|
|
333
521
|
if transcript is None:
|
334
522
|
warnings = await self._set_mane_genomic_data(
|
335
|
-
params, gene, alt_ac, pos, strand, is_start
|
523
|
+
params, gene, alt_ac, pos, strand, is_start
|
336
524
|
)
|
337
525
|
if warnings:
|
338
526
|
return self._return_warnings(resp, warnings)
|
@@ -394,9 +582,8 @@ class ExonGenomicCoordsMapper:
|
|
394
582
|
gene: str,
|
395
583
|
alt_ac: str,
|
396
584
|
pos: int,
|
397
|
-
strand:
|
585
|
+
strand: Strand,
|
398
586
|
is_start: bool,
|
399
|
-
residue_mode: ResidueMode,
|
400
587
|
) -> Optional[str]:
|
401
588
|
"""Set genomic data in `params` found from MANE.
|
402
589
|
|
@@ -407,16 +594,19 @@ class ExonGenomicCoordsMapper:
|
|
407
594
|
:param strand: Strand
|
408
595
|
:param is_start: `True` if `pos` is start position. `False` if `pos` is end
|
409
596
|
position.
|
410
|
-
:param residue_mode: Residue mode for `pos`
|
411
597
|
:return: Warnings if found
|
412
598
|
"""
|
413
|
-
|
599
|
+
start, end = get_inter_residue_pos(pos, pos, residue_mode=ResidueMode.ZERO)
|
600
|
+
mane_data: Optional[
|
601
|
+
CdnaRepresentation
|
602
|
+
] = await self.mane_transcript.get_mane_transcript(
|
414
603
|
alt_ac,
|
415
|
-
|
604
|
+
start,
|
605
|
+
end,
|
416
606
|
AnnotationLayer.GENOMIC,
|
417
607
|
gene=gene,
|
418
608
|
try_longest_compatible=True,
|
419
|
-
residue_mode=
|
609
|
+
residue_mode=ResidueMode.INTER_RESIDUE,
|
420
610
|
)
|
421
611
|
if not mane_data:
|
422
612
|
msg = f"Unable to find mane data for {alt_ac} with position {pos}"
|
@@ -425,23 +615,18 @@ class ExonGenomicCoordsMapper:
|
|
425
615
|
logger.warning(msg)
|
426
616
|
return msg
|
427
617
|
|
428
|
-
|
429
|
-
mane_data["strand"] = -1
|
430
|
-
elif mane_data["strand"] == "+":
|
431
|
-
mane_data["strand"] = 1
|
432
|
-
|
433
|
-
params["gene"] = mane_data["gene"]
|
618
|
+
params["gene"] = mane_data.gene
|
434
619
|
params["transcript"] = (
|
435
|
-
mane_data
|
436
|
-
if mane_data
|
437
|
-
else mane_data
|
438
|
-
if mane_data
|
620
|
+
mane_data.refseq
|
621
|
+
if mane_data.refseq
|
622
|
+
else mane_data.ensembl
|
623
|
+
if mane_data.ensembl
|
439
624
|
else None
|
440
625
|
)
|
441
626
|
tx_exons = await self._structure_exons(params["transcript"], alt_ac=alt_ac)
|
442
627
|
if not tx_exons:
|
443
628
|
return f"Unable to get exons for {params['transcript']}"
|
444
|
-
tx_pos = mane_data
|
629
|
+
tx_pos = mane_data.pos[0] + mane_data.coding_start_site
|
445
630
|
params["exon"] = self._get_exon_number(tx_exons, tx_pos)
|
446
631
|
|
447
632
|
try:
|
@@ -454,7 +639,7 @@ class ExonGenomicCoordsMapper:
|
|
454
639
|
logger.warning(msg)
|
455
640
|
return msg
|
456
641
|
|
457
|
-
strand_to_use = strand if strand is not None else mane_data
|
642
|
+
strand_to_use = strand if strand is not None else mane_data.strand
|
458
643
|
params["strand"] = strand_to_use
|
459
644
|
self._set_exon_offset(
|
460
645
|
params,
|
@@ -474,7 +659,7 @@ class ExonGenomicCoordsMapper:
|
|
474
659
|
|
475
660
|
params["chr"] = genomic_data[1]
|
476
661
|
genomic_coords = genomic_data[2], genomic_data[3]
|
477
|
-
genomic_pos = genomic_coords[1] if is_start else genomic_coords[0]
|
662
|
+
genomic_pos = genomic_coords[1] - 1 if is_start else genomic_coords[0] + 1
|
478
663
|
params["pos"] = (
|
479
664
|
genomic_pos - params["exon_offset"]
|
480
665
|
if strand_to_use == -1
|
@@ -483,14 +668,14 @@ class ExonGenomicCoordsMapper:
|
|
483
668
|
return None
|
484
669
|
|
485
670
|
async def _set_genomic_data(
|
486
|
-
self, params: Dict, strand:
|
671
|
+
self, params: Dict, strand: Strand, is_start: bool
|
487
672
|
) -> Optional[str]:
|
488
|
-
"""Set genomic data in
|
673
|
+
"""Set genomic data in ``params``
|
489
674
|
|
490
675
|
:param params: Parameters for response
|
491
676
|
:param strand: Strand
|
492
|
-
:param is_start:
|
493
|
-
position.
|
677
|
+
:param is_start: ``True`` if ``pos`` is start position. ``False`` if ``pos`` is
|
678
|
+
end position.
|
494
679
|
:return: Warnings if found
|
495
680
|
"""
|
496
681
|
# We should always try to liftover
|
@@ -521,6 +706,7 @@ class ExonGenomicCoordsMapper:
|
|
521
706
|
tx_exons = await self._structure_exons(params["transcript"], alt_ac=grch38_ac)
|
522
707
|
if not tx_exons:
|
523
708
|
return f"Unable to get exons for {params['transcript']}"
|
709
|
+
|
524
710
|
data = await self.uta_db.get_tx_exon_aln_v_data(
|
525
711
|
params["transcript"],
|
526
712
|
params["pos"],
|
@@ -549,12 +735,15 @@ class ExonGenomicCoordsMapper:
|
|
549
735
|
i = 1 if data_exons == (0, tx_exons[0][1]) else i - 1
|
550
736
|
params["exon"] = i
|
551
737
|
|
552
|
-
strand_to_use = strand if strand is not None else data[7]
|
738
|
+
strand_to_use = strand if strand is not None else Strand(data[7])
|
553
739
|
params["strand"] = strand_to_use
|
740
|
+
if not is_start:
|
741
|
+
# convert back to inter-residue for end position
|
742
|
+
params["pos"] += 1
|
554
743
|
self._set_exon_offset(
|
555
744
|
params,
|
556
|
-
data[5],
|
557
|
-
data[6],
|
745
|
+
data[5] if is_start else data[5] + 1, # need to convert to inter-residue
|
746
|
+
data[6] - 1 if is_start else data[6], # need to convert to inter-residue
|
558
747
|
params["pos"],
|
559
748
|
is_start=is_start,
|
560
749
|
strand=strand_to_use,
|
@@ -563,25 +752,25 @@ class ExonGenomicCoordsMapper:
|
|
563
752
|
|
564
753
|
@staticmethod
|
565
754
|
def _set_exon_offset(
|
566
|
-
params: Dict, start: int, end: int, pos: int, is_start: bool, strand:
|
755
|
+
params: Dict, start: int, end: int, pos: int, is_start: bool, strand: Strand
|
567
756
|
) -> None:
|
568
|
-
"""Set
|
757
|
+
"""Set value for ``exon_offset`` in ``params``.
|
569
758
|
|
570
759
|
:param params: Parameters for response
|
571
|
-
:param start: Start exon coord (can be transcript or genomic)
|
572
|
-
:param end: End exon coord (can be transcript or genomic)
|
760
|
+
:param start: Start exon coord (can be transcript or aligned genomic)
|
761
|
+
:param end: End exon coord (can be transcript or aligned genomic)
|
573
762
|
:param pos: Position change (can be transcript or genomic)
|
574
|
-
:param is_start:
|
575
|
-
position
|
576
|
-
:param
|
763
|
+
:param is_start: ``True`` if ``pos`` is start position. ``False`` if ``pos`` is
|
764
|
+
end position
|
765
|
+
:param strand: Strand
|
577
766
|
"""
|
578
767
|
if is_start:
|
579
|
-
if strand ==
|
768
|
+
if strand == Strand.NEGATIVE:
|
580
769
|
params["exon_offset"] = end - pos
|
581
770
|
else:
|
582
771
|
params["exon_offset"] = pos - end
|
583
772
|
else:
|
584
|
-
if strand ==
|
773
|
+
if strand == Strand.NEGATIVE:
|
585
774
|
params["exon_offset"] = start - pos
|
586
775
|
else:
|
587
776
|
params["exon_offset"] = pos - start
|
@@ -595,21 +784,23 @@ class ExonGenomicCoordsMapper:
|
|
595
784
|
:param alt_ac: Genomic accession
|
596
785
|
:return: List of tuples containing transcript exon coordinates
|
597
786
|
"""
|
598
|
-
result =
|
787
|
+
result = []
|
599
788
|
tx_exons, _ = await self.uta_db.get_tx_exons(transcript, alt_ac=alt_ac)
|
789
|
+
|
600
790
|
if not tx_exons:
|
601
791
|
return result
|
792
|
+
|
602
793
|
for coords in tx_exons:
|
603
794
|
result.append((coords[0], coords[1]))
|
604
795
|
return result
|
605
796
|
|
606
797
|
@staticmethod
|
607
798
|
def _get_exon_number(tx_exons: List, tx_pos: int) -> int:
|
608
|
-
"""Find exon number
|
799
|
+
"""Find related exon number for a position
|
609
800
|
|
610
|
-
:param tx_exons: List of exon coordinates
|
801
|
+
:param tx_exons: List of exon coordinates for a transcript
|
611
802
|
:param tx_pos: Transcript position change
|
612
|
-
:return: Exon number associated to transcript position change
|
803
|
+
:return: Exon number associated to transcript position change. Will be 1-based
|
613
804
|
"""
|
614
805
|
i = 1
|
615
806
|
for coords in tx_exons:
|