cool-seq-tool 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cool_seq_tool/__init__.py +6 -0
- cool_seq_tool/app.py +1 -2
- cool_seq_tool/handlers/seqrepo_access.py +5 -5
- cool_seq_tool/mappers/alignment.py +16 -16
- cool_seq_tool/mappers/exon_genomic_coords.py +911 -667
- cool_seq_tool/mappers/mane_transcript.py +109 -104
- cool_seq_tool/schemas.py +30 -165
- cool_seq_tool/sources/uta_database.py +149 -229
- cool_seq_tool/utils.py +9 -9
- {cool_seq_tool-0.6.0.dist-info → cool_seq_tool-0.7.1.dist-info}/METADATA +8 -8
- cool_seq_tool-0.7.1.dist-info/RECORD +24 -0
- {cool_seq_tool-0.6.0.dist-info → cool_seq_tool-0.7.1.dist-info}/WHEEL +1 -1
- cool_seq_tool-0.6.0.dist-info/RECORD +0 -24
- {cool_seq_tool-0.6.0.dist-info → cool_seq_tool-0.7.1.dist-info}/LICENSE +0 -0
- {cool_seq_tool-0.6.0.dist-info → cool_seq_tool-0.7.1.dist-info}/top_level.txt +0 -0
@@ -1,32 +1,224 @@
|
|
1
1
|
"""Provide mapping capabilities between transcript exon and genomic coordinates."""
|
2
2
|
|
3
3
|
import logging
|
4
|
-
|
4
|
+
|
5
|
+
from ga4gh.vrs.models import SequenceLocation, SequenceReference
|
6
|
+
from pydantic import ConfigDict, Field, StrictInt, StrictStr, model_validator
|
5
7
|
|
6
8
|
from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess
|
7
9
|
from cool_seq_tool.mappers.liftover import LiftOver
|
8
|
-
from cool_seq_tool.mappers.mane_transcript import CdnaRepresentation, ManeTranscript
|
9
10
|
from cool_seq_tool.schemas import (
|
10
|
-
AnnotationLayer,
|
11
11
|
Assembly,
|
12
|
-
|
13
|
-
|
14
|
-
ResidueMode,
|
12
|
+
BaseModelForbidExtra,
|
13
|
+
ServiceMeta,
|
15
14
|
Strand,
|
16
|
-
TranscriptExonData,
|
17
|
-
TranscriptExonDataResponse,
|
18
15
|
)
|
19
16
|
from cool_seq_tool.sources.mane_transcript_mappings import ManeTranscriptMappings
|
20
|
-
from cool_seq_tool.sources.uta_database import UtaDatabase
|
21
|
-
from cool_seq_tool.utils import
|
22
|
-
|
23
|
-
CoordinatesResponseType = TypeVar(
|
24
|
-
"CoordinatesResponseType", GenomicDataResponse, TranscriptExonDataResponse
|
25
|
-
)
|
17
|
+
from cool_seq_tool.sources.uta_database import GenomicAlnData, UtaDatabase
|
18
|
+
from cool_seq_tool.utils import service_meta
|
26
19
|
|
27
20
|
_logger = logging.getLogger(__name__)
|
28
21
|
|
29
22
|
|
23
|
+
class _ExonCoord(BaseModelForbidExtra):
|
24
|
+
"""Model for representing exon coordinate data"""
|
25
|
+
|
26
|
+
ord: StrictInt = Field(..., description="Exon number. 0-based.")
|
27
|
+
tx_start_i: StrictInt = Field(
|
28
|
+
...,
|
29
|
+
description="Transcript start index of the exon. Inter-residue coordinates.",
|
30
|
+
)
|
31
|
+
tx_end_i: StrictInt = Field(
|
32
|
+
..., description="Transcript end index of the exon. Inter-residue coordinates."
|
33
|
+
)
|
34
|
+
alt_start_i: StrictInt = Field(
|
35
|
+
..., description="Genomic start index of the exon. Inter-residue coordinates."
|
36
|
+
)
|
37
|
+
alt_end_i: StrictInt = Field(
|
38
|
+
..., description="Genomic end index of the exon. Inter-residue coordinates."
|
39
|
+
)
|
40
|
+
alt_strand: Strand = Field(..., description="Strand.")
|
41
|
+
|
42
|
+
model_config = ConfigDict(
|
43
|
+
json_schema_extra={
|
44
|
+
"example": {
|
45
|
+
"ord": 0,
|
46
|
+
"tx_start_i": 0,
|
47
|
+
"tx_end_i": 234,
|
48
|
+
"alt_start_i": 154191901,
|
49
|
+
"alt_end_i": 154192135,
|
50
|
+
"alt_strand": Strand.NEGATIVE,
|
51
|
+
}
|
52
|
+
}
|
53
|
+
)
|
54
|
+
|
55
|
+
|
56
|
+
class TxSegment(BaseModelForbidExtra):
|
57
|
+
"""Model for representing transcript segment data."""
|
58
|
+
|
59
|
+
exon_ord: StrictInt = Field(..., description="Exon number. 0-based.")
|
60
|
+
offset: StrictInt = Field(
|
61
|
+
0,
|
62
|
+
description="The value added to or subtracted from the `genomic_location` to find the start or end of an exon.",
|
63
|
+
)
|
64
|
+
genomic_location: SequenceLocation = Field(
|
65
|
+
..., description="The genomic position of a transcript segment."
|
66
|
+
)
|
67
|
+
|
68
|
+
model_config = ConfigDict(
|
69
|
+
json_schema_extra={
|
70
|
+
"example": {
|
71
|
+
"exon_ord": 0,
|
72
|
+
"offset": 0,
|
73
|
+
"genomic_location": {
|
74
|
+
"type": "SequenceLocation",
|
75
|
+
"sequenceReference": {
|
76
|
+
"type": "SequenceReference",
|
77
|
+
"refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
|
78
|
+
},
|
79
|
+
"end": 154192135,
|
80
|
+
},
|
81
|
+
}
|
82
|
+
}
|
83
|
+
)
|
84
|
+
|
85
|
+
|
86
|
+
class GenomicTxSeg(BaseModelForbidExtra):
|
87
|
+
"""Model for representing a boundary for a transcript segment."""
|
88
|
+
|
89
|
+
seg: TxSegment | None = Field(None, description="Transcript segment.")
|
90
|
+
gene: StrictStr | None = Field(None, description="HGNC gene symbol.")
|
91
|
+
genomic_ac: StrictStr | None = Field(None, description="RefSeq genomic accession.")
|
92
|
+
tx_ac: StrictStr | None = Field(None, description="RefSeq transcript accession.")
|
93
|
+
errors: list[StrictStr] = Field([], description="Error messages.")
|
94
|
+
|
95
|
+
@model_validator(mode="before")
|
96
|
+
def check_errors(cls, values: dict) -> dict: # noqa: N805
|
97
|
+
"""Ensure that fields are (un)set depending on errors
|
98
|
+
|
99
|
+
:param values: Values in model
|
100
|
+
:raises ValueError: If `seg`, `genomic_ac` and `tx_ac` are not
|
101
|
+
provided when there are no errors
|
102
|
+
:return: Values in model
|
103
|
+
"""
|
104
|
+
if not values.get("errors") and not all(
|
105
|
+
(
|
106
|
+
values.get("seg"),
|
107
|
+
values.get("genomic_ac"),
|
108
|
+
values.get("tx_ac"),
|
109
|
+
)
|
110
|
+
):
|
111
|
+
err_msg = "`seg`, `genomic_ac` and `tx_ac` must be provided"
|
112
|
+
raise ValueError(err_msg)
|
113
|
+
return values
|
114
|
+
|
115
|
+
model_config = ConfigDict(
|
116
|
+
json_schema_extra={
|
117
|
+
"example": {
|
118
|
+
"gene": "TPM3",
|
119
|
+
"genomic_ac": "NC_000001.11",
|
120
|
+
"tx_ac": "NM_152263.3",
|
121
|
+
"seg": {
|
122
|
+
"exon_ord": 0,
|
123
|
+
"offset": 0,
|
124
|
+
"genomic_location": {
|
125
|
+
"type": "SequenceLocation",
|
126
|
+
"sequenceReference": {
|
127
|
+
"type": "SequenceReference",
|
128
|
+
"refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
|
129
|
+
},
|
130
|
+
"end": 154192135,
|
131
|
+
},
|
132
|
+
},
|
133
|
+
"errors": [],
|
134
|
+
}
|
135
|
+
}
|
136
|
+
)
|
137
|
+
|
138
|
+
|
139
|
+
class GenomicTxSegService(BaseModelForbidExtra):
|
140
|
+
"""Service model for genomic and transcript data."""
|
141
|
+
|
142
|
+
gene: StrictStr | None = Field(None, description="HGNC gene symbol.")
|
143
|
+
genomic_ac: StrictStr | None = Field(None, description="RefSeq genomic accession.")
|
144
|
+
tx_ac: StrictStr | None = Field(None, description="RefSeq transcript accession.")
|
145
|
+
seg_start: TxSegment | None = Field(None, description="Start transcript segment.")
|
146
|
+
seg_end: TxSegment | None = Field(None, description="End transcript segment.")
|
147
|
+
errors: list[StrictStr] = Field([], description="Error messages.")
|
148
|
+
service_meta: ServiceMeta = Field(..., description="Service metadata.")
|
149
|
+
|
150
|
+
@model_validator(mode="before")
|
151
|
+
def add_meta_check_errors(cls, values: dict) -> dict: # noqa: N805
|
152
|
+
"""Add service metadata to model and ensure that fields are (un)set depending
|
153
|
+
on errors
|
154
|
+
|
155
|
+
:param values: Values in model
|
156
|
+
:raises ValueError: If `genomic_ac`, `tx_ac` and `seg_start` or `seg_end`
|
157
|
+
not provided when there are no errors
|
158
|
+
:return: Values in model, including service metadata
|
159
|
+
"""
|
160
|
+
values["service_meta"] = service_meta()
|
161
|
+
if not values.get("errors") and not all(
|
162
|
+
(
|
163
|
+
values.get("genomic_ac"),
|
164
|
+
values.get("tx_ac"),
|
165
|
+
values.get("seg_start") or values.get("seg_end"),
|
166
|
+
)
|
167
|
+
):
|
168
|
+
err_msg = (
|
169
|
+
"`genomic_ac`, `tx_ac` and `seg_start` or `seg_end` must be provided"
|
170
|
+
)
|
171
|
+
raise ValueError(err_msg)
|
172
|
+
|
173
|
+
return values
|
174
|
+
|
175
|
+
model_config = ConfigDict(
|
176
|
+
json_schema_extra={
|
177
|
+
"example": {
|
178
|
+
"gene": "TPM3",
|
179
|
+
"genomic_ac": "NC_000001.11",
|
180
|
+
"tx_ac": "NM_152263.3",
|
181
|
+
"seg_start": {
|
182
|
+
"exon_ord": 0,
|
183
|
+
"offset": 0,
|
184
|
+
"genomic_location": {
|
185
|
+
"type": "SequenceLocation",
|
186
|
+
"sequenceReference": {
|
187
|
+
"type": "SequenceReference",
|
188
|
+
"refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
|
189
|
+
},
|
190
|
+
"end": 154192135,
|
191
|
+
},
|
192
|
+
},
|
193
|
+
"seg_end": {
|
194
|
+
"exon_ord": 7,
|
195
|
+
"offset": 0,
|
196
|
+
"genomic_location": {
|
197
|
+
"type": "SequenceLocation",
|
198
|
+
"sequenceReference": {
|
199
|
+
"type": "SequenceReference",
|
200
|
+
"refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
|
201
|
+
},
|
202
|
+
"start": 154170399,
|
203
|
+
},
|
204
|
+
},
|
205
|
+
}
|
206
|
+
}
|
207
|
+
)
|
208
|
+
|
209
|
+
|
210
|
+
def _return_service_errors(errors: list[str]) -> GenomicTxSegService:
|
211
|
+
"""Log errors and return service object with errors.
|
212
|
+
|
213
|
+
:param errors: Error message(s)
|
214
|
+
:return: Service object with error messages.
|
215
|
+
"""
|
216
|
+
for error in errors:
|
217
|
+
_logger.warning(error)
|
218
|
+
|
219
|
+
return GenomicTxSegService(errors=errors)
|
220
|
+
|
221
|
+
|
30
222
|
class ExonGenomicCoordsMapper:
|
31
223
|
"""Provide capabilities for mapping transcript exon representation to/from genomic
|
32
224
|
coordinate representation.
|
@@ -36,7 +228,6 @@ class ExonGenomicCoordsMapper:
|
|
36
228
|
self,
|
37
229
|
seqrepo_access: SeqRepoAccess,
|
38
230
|
uta_db: UtaDatabase,
|
39
|
-
mane_transcript: ManeTranscript,
|
40
231
|
mane_transcript_mappings: ManeTranscriptMappings,
|
41
232
|
liftover: LiftOver,
|
42
233
|
) -> None:
|
@@ -45,7 +236,7 @@ class ExonGenomicCoordsMapper:
|
|
45
236
|
A lot of resources are required for initialization, so when defaults are enough,
|
46
237
|
it's easiest to let the core CoolSeqTool class handle it for you:
|
47
238
|
|
48
|
-
>>> from cool_seq_tool
|
239
|
+
>>> from cool_seq_tool import CoolSeqTool
|
49
240
|
>>> egc = CoolSeqTool().ex_g_coords_mapper
|
50
241
|
|
51
242
|
Note that this class's public methods are all defined as ``async``, so they will
|
@@ -54,42 +245,22 @@ class ExonGenomicCoordsMapper:
|
|
54
245
|
|
55
246
|
>>> import asyncio
|
56
247
|
>>> result = asyncio.run(
|
57
|
-
... egc.
|
58
|
-
... "NM_002529.3", exon_start=2, exon_end=17
|
59
|
-
... )
|
248
|
+
... egc.tx_segment_to_genomic("NM_002529.3", exon_start=2, exon_end=17)
|
60
249
|
... )
|
61
250
|
>>> result.genomic_data.start, result.genomic_data.end
|
62
251
|
(156864428, 156881456)
|
63
252
|
|
64
253
|
:param seqrepo_access: SeqRepo instance to give access to query SeqRepo database
|
65
254
|
:param uta_db: UtaDatabase instance to give access to query UTA database
|
66
|
-
:param mane_transcript: Instance to align to MANE or compatible representation
|
67
255
|
:param mane_transcript_mappings: Instance to provide access to ManeTranscriptMappings class
|
68
256
|
:param liftover: Instance to provide mapping between human genome assemblies
|
69
257
|
"""
|
70
258
|
self.seqrepo_access = seqrepo_access
|
71
259
|
self.uta_db = uta_db
|
72
|
-
self.mane_transcript = mane_transcript
|
73
260
|
self.mane_transcript_mappings = mane_transcript_mappings
|
74
261
|
self.liftover = liftover
|
75
262
|
|
76
|
-
|
77
|
-
def _return_warnings(
|
78
|
-
resp: CoordinatesResponseType, warning_msg: list[str]
|
79
|
-
) -> CoordinatesResponseType:
|
80
|
-
"""Add warnings to response object
|
81
|
-
|
82
|
-
:param resp: Response object
|
83
|
-
:param warning_msg: Warning message(s) on why ``transcript_exon_data`` or
|
84
|
-
``genomic_data`` field is ``None``
|
85
|
-
:return: Response object with warning message
|
86
|
-
"""
|
87
|
-
for msg in warning_msg:
|
88
|
-
_logger.warning(msg)
|
89
|
-
resp.warnings.append(msg)
|
90
|
-
return resp
|
91
|
-
|
92
|
-
async def transcript_to_genomic_coordinates(
|
263
|
+
async def tx_segment_to_genomic(
|
93
264
|
self,
|
94
265
|
transcript: str,
|
95
266
|
gene: str | None = None,
|
@@ -97,26 +268,30 @@ class ExonGenomicCoordsMapper:
|
|
97
268
|
exon_start_offset: int = 0,
|
98
269
|
exon_end: int | None = None,
|
99
270
|
exon_end_offset: int = 0,
|
100
|
-
) ->
|
101
|
-
"""Get genomic data given transcript data.
|
271
|
+
) -> GenomicTxSegService:
|
272
|
+
"""Get aligned genomic data given transcript segment data.
|
102
273
|
|
103
274
|
By default, transcript data is aligned to the GRCh38 assembly.
|
104
275
|
|
105
276
|
>>> import asyncio
|
106
|
-
>>> from cool_seq_tool
|
277
|
+
>>> from cool_seq_tool import CoolSeqTool
|
107
278
|
>>> egc = CoolSeqTool().ex_g_coords_mapper
|
108
279
|
>>> tpm3 = asyncio.run(
|
109
|
-
... egc.
|
280
|
+
... egc.tx_segment_to_genomic(
|
110
281
|
... "NM_152263.3",
|
111
282
|
... gene="TPM3",
|
112
283
|
... exon_start=1,
|
113
284
|
... exon_end=8,
|
114
285
|
... )
|
115
286
|
... )
|
116
|
-
>>>
|
287
|
+
>>> (
|
288
|
+
... tpm3.genomic_ac,
|
289
|
+
... tpm3.seg_start.genomic_location.end,
|
290
|
+
... tpm3.seg_end.genomic_location.start,
|
291
|
+
... )
|
117
292
|
('NC_000001.11', 154192135, 154170399)
|
118
293
|
|
119
|
-
:param transcript:
|
294
|
+
:param transcript: RefSeq transcript accession
|
120
295
|
:param gene: HGNC gene symbol
|
121
296
|
:param exon_start: Starting transcript exon number (1-based). If not provided,
|
122
297
|
must provide ``exon_end``
|
@@ -126,422 +301,497 @@ class ExonGenomicCoordsMapper:
|
|
126
301
|
:param exon_end_offset: Ending exon offset
|
127
302
|
:return: GRCh38 genomic data (inter-residue coordinates)
|
128
303
|
"""
|
129
|
-
resp = GenomicDataResponse(
|
130
|
-
genomic_data=None, warnings=[], service_meta=service_meta()
|
131
|
-
)
|
132
|
-
|
133
304
|
# Ensure valid inputs
|
134
|
-
|
135
|
-
if not transcript:
|
136
|
-
warnings.append("Must provide `transcript`")
|
137
|
-
else:
|
138
|
-
transcript = transcript.strip()
|
139
|
-
|
305
|
+
errors = []
|
140
306
|
exon_start_exists, exon_end_exists = False, False
|
141
307
|
if exon_start is not None:
|
142
308
|
if exon_start < 1:
|
143
|
-
|
309
|
+
errors.append("`exon_start` cannot be less than 1")
|
144
310
|
exon_start_exists = True
|
145
311
|
|
146
312
|
if exon_end is not None:
|
147
313
|
if exon_end < 1:
|
148
|
-
|
314
|
+
errors.append("`exon_end` cannot be less than 1")
|
149
315
|
exon_end_exists = True
|
150
316
|
|
151
317
|
if not exon_start_exists and not exon_end_exists:
|
152
|
-
|
318
|
+
errors.append("Must provide either `exon_start` or `exon_end`")
|
153
319
|
if exon_start_exists and exon_end_exists and (exon_start > exon_end):
|
154
|
-
|
320
|
+
errors.append(
|
155
321
|
f"Start exon {exon_start} is greater than end exon {exon_end}"
|
156
322
|
)
|
157
323
|
|
158
|
-
if
|
159
|
-
return
|
160
|
-
|
161
|
-
# Get all exons and associated start/end coordinates for transcript
|
162
|
-
tx_exons, warning = await self.uta_db.get_tx_exons(transcript)
|
163
|
-
if not tx_exons:
|
164
|
-
return self._return_warnings(resp, [warning] if warning else [])
|
324
|
+
if errors:
|
325
|
+
return _return_service_errors(errors)
|
165
326
|
|
166
327
|
# Get exon start and exon end coordinates
|
167
|
-
|
168
|
-
|
328
|
+
(
|
329
|
+
tx_exon_start_coords,
|
330
|
+
tx_exon_end_coords,
|
331
|
+
errors,
|
332
|
+
) = await self._get_start_end_exon_coords(
|
333
|
+
transcript, exon_start=exon_start, exon_end=exon_end
|
169
334
|
)
|
170
|
-
if
|
171
|
-
return
|
172
|
-
tx_exon_start_coords, tx_exon_end_coords = tx_exon_coords
|
335
|
+
if errors:
|
336
|
+
return _return_service_errors(errors)
|
173
337
|
|
174
338
|
if gene:
|
175
|
-
gene = gene.upper()
|
339
|
+
gene = gene.upper()
|
176
340
|
|
177
341
|
# Get aligned genomic data (hgnc gene, alt_ac, alt_start_i, alt_end_i, strand)
|
178
342
|
# for exon(s)
|
179
|
-
|
343
|
+
(
|
344
|
+
genomic_aln_start,
|
345
|
+
genomic_aln_end,
|
346
|
+
err_msg,
|
347
|
+
) = await self._get_genomic_aln_coords(
|
180
348
|
transcript, tx_exon_start_coords, tx_exon_end_coords, gene=gene
|
181
349
|
)
|
182
|
-
if
|
183
|
-
return
|
184
|
-
alt_ac_start_data, alt_ac_end_data = alt_ac_start_end
|
350
|
+
if err_msg:
|
351
|
+
return _return_service_errors([err_msg])
|
185
352
|
|
186
353
|
# Get gene and chromosome data, check that at least one was retrieved
|
187
|
-
gene =
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
354
|
+
gene = genomic_aln_start.hgnc if genomic_aln_start else genomic_aln_end.hgnc
|
355
|
+
genomic_ac = (
|
356
|
+
genomic_aln_start.alt_ac if genomic_aln_start else genomic_aln_end.alt_ac
|
357
|
+
)
|
358
|
+
if gene is None or genomic_ac is None:
|
359
|
+
return _return_service_errors(
|
192
360
|
[
|
193
|
-
"Unable to retrieve `gene` or `
|
361
|
+
"Unable to retrieve `gene` or `genomic_ac` from genomic start and genomic end data"
|
194
362
|
],
|
195
363
|
)
|
196
364
|
|
197
|
-
g_start = alt_ac_start_data[3] - 1 if alt_ac_start_data else None
|
198
|
-
g_end = alt_ac_end_data[2] + 1 if alt_ac_end_data else None
|
199
365
|
strand = (
|
200
|
-
Strand(
|
201
|
-
if
|
202
|
-
else Strand(
|
366
|
+
Strand(genomic_aln_start.alt_strand)
|
367
|
+
if genomic_aln_start
|
368
|
+
else Strand(genomic_aln_end.alt_strand)
|
203
369
|
)
|
204
370
|
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
371
|
+
if exon_start_exists:
|
372
|
+
seg_start, err_msg = self._get_tx_segment(
|
373
|
+
genomic_ac,
|
374
|
+
strand,
|
375
|
+
exon_start_offset,
|
376
|
+
genomic_aln_start,
|
377
|
+
is_seg_start=True,
|
378
|
+
)
|
379
|
+
if err_msg:
|
380
|
+
return _return_service_errors([err_msg])
|
213
381
|
else:
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
382
|
+
seg_start = None
|
383
|
+
|
384
|
+
if exon_end_exists:
|
385
|
+
seg_end, err_msg = self._get_tx_segment(
|
386
|
+
genomic_ac,
|
387
|
+
strand,
|
388
|
+
exon_end_offset,
|
389
|
+
genomic_aln_end,
|
390
|
+
is_seg_start=False,
|
391
|
+
)
|
392
|
+
if err_msg:
|
393
|
+
return _return_service_errors([err_msg])
|
394
|
+
else:
|
395
|
+
seg_end = None
|
220
396
|
|
221
|
-
|
397
|
+
return GenomicTxSegService(
|
222
398
|
gene=gene,
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
exon_start_offset=exon_start_offset,
|
228
|
-
exon_end=exon_end if end_exists else None,
|
229
|
-
exon_end_offset=exon_end_offset,
|
230
|
-
transcript=transcript,
|
231
|
-
strand=strand,
|
399
|
+
genomic_ac=genomic_ac,
|
400
|
+
tx_ac=transcript,
|
401
|
+
seg_start=seg_start,
|
402
|
+
seg_end=seg_end,
|
232
403
|
)
|
233
404
|
|
234
|
-
|
235
|
-
|
236
|
-
async def genomic_to_transcript_exon_coordinates(
|
405
|
+
async def genomic_to_tx_segment(
|
237
406
|
self,
|
238
407
|
chromosome: str | None = None,
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
strand: Strand | None = None,
|
408
|
+
genomic_ac: str | None = None,
|
409
|
+
seg_start_genomic: int | None = None,
|
410
|
+
seg_end_genomic: int | None = None,
|
243
411
|
transcript: str | None = None,
|
244
412
|
get_nearest_transcript_junction: bool = False,
|
245
413
|
gene: str | None = None,
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
414
|
+
) -> GenomicTxSegService:
|
415
|
+
"""Get transcript segment data for genomic data, lifted over to GRCh38.
|
416
|
+
|
417
|
+
If liftover to GRCh38 is unsuccessful, will return errors.
|
418
|
+
|
419
|
+
Must provide inter-residue coordinates.
|
250
420
|
|
251
421
|
MANE Transcript data will be returned if and only if ``transcript`` is not
|
252
422
|
supplied. ``gene`` must be given in order to retrieve MANE Transcript data.
|
253
423
|
|
254
424
|
>>> import asyncio
|
255
|
-
>>> from cool_seq_tool
|
425
|
+
>>> from cool_seq_tool import CoolSeqTool
|
256
426
|
>>> from cool_seq_tool.schemas import Strand
|
257
427
|
>>> egc = CoolSeqTool().ex_g_coords_mapper
|
258
428
|
>>> result = asyncio.run(
|
259
|
-
... egc.
|
260
|
-
...
|
261
|
-
...
|
262
|
-
...
|
263
|
-
... strand=Strand.NEGATIVE,
|
429
|
+
... egc.genomic_to_tx_segment(
|
430
|
+
... genomic_ac="NC_000001.11",
|
431
|
+
... seg_start_genomic=154192135,
|
432
|
+
... seg_end_genomic=154170399,
|
264
433
|
... transcript="NM_152263.3",
|
265
434
|
... )
|
266
435
|
... )
|
267
|
-
>>> result.
|
268
|
-
(
|
436
|
+
>>> result.seg_start.exon_ord, result.seg_end.exon_ord
|
437
|
+
(0, 7)
|
269
438
|
|
270
439
|
:param chromosome: e.g. ``"1"`` or ``"chr1"``. If not provided, must provide
|
271
|
-
``
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
:param
|
277
|
-
:param
|
440
|
+
``genomic_ac``. If ``genomic_ac`` is also provided, ``genomic_ac`` will be
|
441
|
+
used.
|
442
|
+
:param genomic_ac: Genomic accession (e.g. ``NC_000001.11``). If not provided,
|
443
|
+
must provide ``chromosome``. If ``chromosome`` is also provided,
|
444
|
+
``genomic_ac`` will be used.
|
445
|
+
:param seg_start_genomic: Genomic position where the transcript segment starts
|
446
|
+
:param seg_end_genomic: Genomic position where the transcript segment ends
|
278
447
|
:param transcript: The transcript to use. If this is not given, we will try the
|
279
448
|
following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining
|
280
449
|
Compatible Transcript. See the :ref:`Transcript Selection policy <transcript_selection_policy>`
|
281
450
|
page.
|
282
451
|
:param get_nearest_transcript_junction: If ``True``, this will return the
|
283
|
-
adjacent exon if the position specified by``
|
284
|
-
occur on an exon. For the positive strand, adjacent
|
285
|
-
preceding the breakpoint for the 5' end and the exon
|
286
|
-
breakpoint for the 3' end. For the negative strand, adjacent
|
287
|
-
the exon following the breakpoint for the 5' end and the exon
|
288
|
-
breakpoint for the 3' end.
|
452
|
+
adjacent exon if the position specified by ``seg_start_genomic`` or
|
453
|
+
``seg_end_genomic`` does not occur on an exon. For the positive strand, adjacent
|
454
|
+
is defined as the exon preceding the breakpoint for the 5' end and the exon
|
455
|
+
following the breakpoint for the 3' end. For the negative strand, adjacent
|
456
|
+
is defined as the exon following the breakpoint for the 5' end and the exon
|
457
|
+
preceding the breakpoint for the 3' end.
|
289
458
|
:param gene: gene name. Ideally, HGNC symbol. Must be given if no ``transcript``
|
290
459
|
value is provided.
|
291
|
-
:param
|
460
|
+
:param coordinate_type: Coordinate type for ``seg_start_genomic`` and
|
461
|
+
``seg_end_genomic``
|
292
462
|
:return: Genomic data (inter-residue coordinates)
|
293
463
|
"""
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
warnings.append("Must provide either `chromosome` or `alt_ac`")
|
464
|
+
errors = []
|
465
|
+
if seg_start_genomic is None and seg_end_genomic is None:
|
466
|
+
errors.append(
|
467
|
+
"Must provide either `seg_start_genomic` or `seg_end_genomic`"
|
468
|
+
)
|
469
|
+
if chromosome is None and genomic_ac is None:
|
470
|
+
errors.append("Must provide either `chromosome` or `alt_ac`")
|
302
471
|
if transcript is None and gene is None:
|
303
|
-
|
304
|
-
if
|
305
|
-
return
|
472
|
+
errors.append("Must provide either `gene` or `transcript`")
|
473
|
+
if errors:
|
474
|
+
return _return_service_errors(errors)
|
306
475
|
|
307
|
-
params = {key: None for key in GenomicData.model_fields}
|
308
476
|
if gene is not None:
|
309
|
-
gene = gene.upper()
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
start_data = await self._genomic_to_transcript_exon_coordinate(
|
317
|
-
start,
|
477
|
+
gene = gene.upper()
|
478
|
+
|
479
|
+
params = {}
|
480
|
+
|
481
|
+
if seg_start_genomic:
|
482
|
+
start_tx_seg_data = await self._genomic_to_tx_segment(
|
483
|
+
seg_start_genomic,
|
318
484
|
chromosome=chromosome,
|
319
|
-
|
320
|
-
strand=strand,
|
485
|
+
genomic_ac=genomic_ac,
|
321
486
|
transcript=transcript,
|
322
487
|
gene=gene,
|
323
488
|
get_nearest_transcript_junction=get_nearest_transcript_junction,
|
324
|
-
|
489
|
+
is_seg_start=True,
|
325
490
|
)
|
326
|
-
if
|
327
|
-
|
328
|
-
|
329
|
-
|
491
|
+
if start_tx_seg_data.errors:
|
492
|
+
return _return_service_errors(start_tx_seg_data.errors)
|
493
|
+
|
494
|
+
params["gene"] = start_tx_seg_data.gene
|
495
|
+
params["genomic_ac"] = start_tx_seg_data.genomic_ac
|
496
|
+
params["tx_ac"] = start_tx_seg_data.tx_ac
|
497
|
+
params["seg_start"] = start_tx_seg_data.seg
|
330
498
|
else:
|
331
|
-
|
499
|
+
start_tx_seg_data = None
|
332
500
|
|
333
|
-
if
|
334
|
-
|
335
|
-
|
336
|
-
end_data = await self._genomic_to_transcript_exon_coordinate(
|
337
|
-
end,
|
501
|
+
if seg_end_genomic:
|
502
|
+
end_tx_seg_data = await self._genomic_to_tx_segment(
|
503
|
+
seg_end_genomic,
|
338
504
|
chromosome=chromosome,
|
339
|
-
|
340
|
-
strand=strand,
|
505
|
+
genomic_ac=genomic_ac,
|
341
506
|
transcript=transcript,
|
342
507
|
gene=gene,
|
343
508
|
get_nearest_transcript_junction=get_nearest_transcript_junction,
|
344
|
-
|
509
|
+
is_seg_start=False,
|
345
510
|
)
|
346
|
-
if
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
return
|
361
|
-
params[field] = start_data[field]
|
511
|
+
if end_tx_seg_data.errors:
|
512
|
+
return _return_service_errors(end_tx_seg_data.errors)
|
513
|
+
|
514
|
+
if start_tx_seg_data:
|
515
|
+
# Need to check that gene, genomic_ac, tx_ac all match
|
516
|
+
errors = []
|
517
|
+
for attr in ["gene", "genomic_ac", "tx_ac"]:
|
518
|
+
start_seg_attr = params[attr]
|
519
|
+
end_seg_attr = getattr(end_tx_seg_data, attr)
|
520
|
+
if start_seg_attr != end_seg_attr:
|
521
|
+
errors.append(
|
522
|
+
f"Start end end segment mismatch for `{attr}`. {start_seg_attr} != {end_seg_attr}."
|
523
|
+
)
|
524
|
+
if errors:
|
525
|
+
return _return_service_errors(errors)
|
362
526
|
else:
|
363
|
-
params[
|
527
|
+
params["gene"] = end_tx_seg_data.gene
|
528
|
+
params["genomic_ac"] = end_tx_seg_data.genomic_ac
|
529
|
+
params["tx_ac"] = end_tx_seg_data.tx_ac
|
364
530
|
|
365
|
-
|
366
|
-
msg = (
|
367
|
-
f"Input gene, {gene}, does not match expected output"
|
368
|
-
f"gene, {params['gene']}"
|
369
|
-
)
|
370
|
-
return self._return_warnings(resp, [msg])
|
531
|
+
params["seg_end"] = end_tx_seg_data.seg
|
371
532
|
|
372
|
-
|
373
|
-
if data:
|
374
|
-
params[label] = data["pos"]
|
375
|
-
params[f"exon_{label}"] = data["exon"]
|
376
|
-
params[f"exon_{label}_offset"] = data["exon_offset"]
|
377
|
-
resp.genomic_data = GenomicData(**params)
|
378
|
-
return resp
|
533
|
+
return GenomicTxSegService(**params)
|
379
534
|
|
380
|
-
|
381
|
-
def _validate_exon(
|
382
|
-
transcript: str, tx_exons: list[tuple[int, int]], exon_number: int
|
383
|
-
) -> tuple[tuple[int, int] | None, str | None]:
|
384
|
-
"""Validate that exon number exists on a given transcript
|
385
|
-
|
386
|
-
:param transcript: Transcript accession
|
387
|
-
:param tx_exons: List of transcript's exons and associated coordinates
|
388
|
-
:param exon_number: Exon number to validate
|
389
|
-
:return: Exon coordinates for a given exon number and warnings if found
|
390
|
-
"""
|
391
|
-
msg = f"Exon {exon_number} does not exist on {transcript}"
|
392
|
-
try:
|
393
|
-
if exon_number < 1:
|
394
|
-
return None, msg
|
395
|
-
exon = tx_exons[exon_number - 1]
|
396
|
-
except IndexError:
|
397
|
-
return None, msg
|
398
|
-
return exon, None
|
399
|
-
|
400
|
-
def get_tx_exon_coords(
|
535
|
+
async def _get_start_end_exon_coords(
|
401
536
|
self,
|
402
|
-
|
403
|
-
tx_exons: list[tuple[int, int]],
|
537
|
+
tx_ac: str,
|
404
538
|
exon_start: int | None = None,
|
405
539
|
exon_end: int | None = None,
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
:param
|
414
|
-
:param exon_start: Start exon number
|
415
|
-
:param exon_end: End exon number
|
416
|
-
:
|
417
|
-
|
540
|
+
genomic_ac: str | None = None,
|
541
|
+
) -> tuple[_ExonCoord | None, _ExonCoord | None, list[str]]:
|
542
|
+
"""Get exon coordinates for a transcript given exon start and exon end.
|
543
|
+
|
544
|
+
If ``genomic_ac`` is NOT provided, this method will use the GRCh38 accession
|
545
|
+
associated to ``tx_ac``.
|
546
|
+
|
547
|
+
:param tx_ac: The RefSeq transcript accession to get exon data for.
|
548
|
+
:param exon_start: Start exon number to get coordinate data for. 1-based.
|
549
|
+
:param exon_end: End exon number to get coordinate data for. 1-based.
|
550
|
+
:param genomic_ac: The RefSeq genomic accession to get exon data for.
|
551
|
+
:return: Tuple containing start exon coordinate data, end exon coordinate data,
|
552
|
+
and list of errors. The exon coordinate data will include the exon number,
|
553
|
+
transcript and genomic positions for the start and end of the exon, and
|
554
|
+
strand.
|
418
555
|
"""
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
556
|
+
tx_exons = await self._get_all_exon_coords(tx_ac, genomic_ac=genomic_ac)
|
557
|
+
if not tx_exons:
|
558
|
+
return None, None, [f"No exons found given {tx_ac}"]
|
559
|
+
|
560
|
+
errors = []
|
561
|
+
start_end_exons = []
|
562
|
+
for exon_num in [exon_start, exon_end]:
|
563
|
+
if exon_num is not None:
|
564
|
+
try:
|
565
|
+
start_end_exons.append(tx_exons[exon_num - 1])
|
566
|
+
continue
|
567
|
+
except IndexError:
|
568
|
+
errors.append(f"Exon {exon_num} does not exist on {tx_ac}")
|
569
|
+
start_end_exons.append(None)
|
570
|
+
|
571
|
+
if errors:
|
572
|
+
start_end_exons = [None, None]
|
573
|
+
|
574
|
+
return *start_end_exons, errors
|
575
|
+
|
576
|
+
async def _get_all_exon_coords(
|
577
|
+
self, tx_ac: str, genomic_ac: str | None = None
|
578
|
+
) -> list[_ExonCoord]:
|
579
|
+
"""Get all exon coordinate data for a transcript.
|
580
|
+
|
581
|
+
If ``genomic_ac`` is NOT provided, this method will use the GRCh38 accession
|
582
|
+
associated to ``tx_ac``.
|
583
|
+
|
584
|
+
:param tx_ac: The RefSeq transcript accession to get exon data for.
|
585
|
+
:param genomic_ac: The RefSeq genomic accession to get exon data for.
|
586
|
+
:return: List of all exon coordinate data for ``tx_ac`` and ``genomic_ac``.
|
587
|
+
The exon coordinate data will include the exon number, transcript and
|
588
|
+
genomic positions for the start and end of the exon, and strand.
|
589
|
+
The list will be ordered by ascending exon number.
|
590
|
+
"""
|
591
|
+
if genomic_ac:
|
592
|
+
query = f"""
|
593
|
+
SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
|
594
|
+
FROM {self.uta_db.schema}.tx_exon_aln_v
|
595
|
+
WHERE tx_ac = '{tx_ac}'
|
596
|
+
AND alt_aln_method = 'splign'
|
597
|
+
AND alt_ac = '{genomic_ac}'
|
598
|
+
ORDER BY ord ASC
|
599
|
+
""" # noqa: S608
|
432
600
|
else:
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
601
|
+
query = f"""
|
602
|
+
SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
|
603
|
+
FROM {self.uta_db.schema}.tx_exon_aln_v as t
|
604
|
+
INNER JOIN {self.uta_db.schema}._seq_anno_most_recent as s
|
605
|
+
ON t.alt_ac = s.ac
|
606
|
+
WHERE s.descr = ''
|
607
|
+
AND t.tx_ac = '{tx_ac}'
|
608
|
+
AND t.alt_aln_method = 'splign'
|
609
|
+
AND t.alt_ac like 'NC_000%'
|
610
|
+
ORDER BY ord ASC
|
611
|
+
""" # noqa: S608
|
612
|
+
|
613
|
+
results = await self.uta_db.execute_query(query)
|
614
|
+
return [_ExonCoord(**r) for r in results]
|
615
|
+
|
616
|
+
async def _get_genomic_aln_coords(
    self,
    tx_ac: str,
    tx_exon_start: _ExonCoord | None = None,
    tx_exon_end: _ExonCoord | None = None,
    gene: str | None = None,
) -> tuple[GenomicAlnData | None, GenomicAlnData | None, str | None]:
    """Look up the aligned genomic coordinates for the start and/or end exon.

    ``tx_exon_start`` and ``tx_exon_end`` are expected to reference the same
    transcript and genomic accession.

    :param tx_ac: Transcript accession
    :param tx_exon_start: Transcript's exon start coordinates. If not provided,
        must provide ``tx_exon_end``
    :param tx_exon_end: Transcript's exon end coordinates. If not provided, must
        provide ``tx_exon_start``
    :param gene: HGNC gene symbol
    :return: Tuple containing aligned genomic data for the start exon, aligned
        genomic data for the end exon, and a warning. When a warning is
        returned, both alignment slots are ``None``.
    """
    if tx_exon_start is None and tx_exon_end is None:
        msg = "Must provide either `tx_exon_start` or `tx_exon_end` or both"
        _logger.warning(msg)
        return None, None, msg

    # Collected in (start, end) order; an exon that was not supplied keeps a
    # ``None`` placeholder in its slot.
    alignments = []
    for exon in (tx_exon_start, tx_exon_end):
        if not exon:
            alignments.append(None)
            continue

        aligned_coord, warning = await self.uta_db.get_alt_ac_start_or_end(
            tx_ac, exon.tx_start_i, exon.tx_end_i, gene=gene
        )
        if not aligned_coord:
            # A single failed lookup invalidates the whole result.
            return None, None, warning
        alignments.append(aligned_coord)

    start_aln, end_aln = alignments
    return start_aln, end_aln, None
|
489
654
|
|
490
|
-
|
655
|
+
def _get_tx_segment(
    self,
    genomic_ac: str,
    strand: Strand,
    offset: int,
    genomic_ac_data: _ExonCoord,
    is_seg_start: bool = False,
) -> tuple[TxSegment | None, str | None]:
    """Build transcript segment data from ``genomic_ac`` and offset data

    :param genomic_ac: Genomic RefSeq accession
    :param strand: Strand
    :param offset: Exon offset
    :param genomic_ac_data: Exon coordinate data for ``genomic_ac``
    :param is_seg_start: ``True`` if retrieving genomic data where the transcript
        segment starts, defaults to ``False``
    :return: Tuple containing the transcript segment data (if successful) and an
        error message (if the genomic location could not be created)
    """
    on_positive_strand = strand == Strand.POSITIVE

    # The segment start is anchored on the exon's genomic start for the positive
    # strand and on its genomic end otherwise; the segment end is the mirror
    # image. The offset is added on the positive strand and subtracted otherwise.
    if on_positive_strand:
        anchor = (
            genomic_ac_data.alt_start_i if is_seg_start else genomic_ac_data.alt_end_i
        )
        seg_genomic_pos = offset + anchor
    else:
        anchor = (
            genomic_ac_data.alt_end_i if is_seg_start else genomic_ac_data.alt_start_i
        )
        seg_genomic_pos = anchor - offset

    genomic_loc, err_msg = self._get_vrs_seq_loc(
        genomic_ac,
        seg_genomic_pos,
        is_seg_start=is_seg_start,
        strand=strand,
    )
    if err_msg:
        return None, err_msg

    segment = TxSegment(
        exon_ord=genomic_ac_data.ord,
        genomic_location=genomic_loc,
        offset=offset,
    )
    return segment, None
|
698
|
+
|
699
|
+
def _get_vrs_seq_loc(
    self, genomic_ac: str, genomic_pos: int, is_seg_start: bool, strand: Strand
) -> tuple[SequenceLocation | None, str | None]:
    """Build the VRS Sequence Location for the genomic position where a
    transcript segment occurs

    :param genomic_ac: RefSeq genomic accession
    :param genomic_pos: Genomic position where the transcript segment occurs
    :param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment
        starts. ``False`` if ``genomic_pos`` is where the transcript segment ends.
    :param strand: Strand
    :return: Tuple containing VRS location (if successful) and error message (if
        unable to get GA4GH identifier for ``genomic_ac``).
    """
    ga4gh_seq_id, err_msg = self.seqrepo_access.translate_identifier(
        genomic_ac, "ga4gh"
    )
    if err_msg:
        return None, err_msg

    # Only one side of the location is populated: ``start`` for a segment start
    # on the positive strand or a segment end on any other strand, ``end`` for
    # the remaining two combinations.
    if is_seg_start:
        use_start = strand == Strand.POSITIVE
    else:
        use_start = strand != Strand.POSITIVE

    # Keep only the part of the first identifier that follows the "ga4gh:" prefix.
    refget_accession = ga4gh_seq_id[0].split("ga4gh:")[-1]
    location = SequenceLocation(
        sequenceReference=SequenceReference(refgetAccession=refget_accession),
        start=genomic_pos if use_start else None,
        end=None if use_start else genomic_pos,
    )
    return location, None
|
730
|
+
|
731
|
+
async def _genomic_to_tx_segment(
|
732
|
+
self,
|
733
|
+
genomic_pos: int,
|
493
734
|
chromosome: str | None = None,
|
494
|
-
|
495
|
-
strand: Strand | None = None,
|
735
|
+
genomic_ac: str | None = None,
|
496
736
|
transcript: str | None = None,
|
497
737
|
gene: str | None = None,
|
498
738
|
get_nearest_transcript_junction: bool = False,
|
499
|
-
|
500
|
-
) ->
|
501
|
-
"""
|
739
|
+
is_seg_start: bool = True,
|
740
|
+
) -> GenomicTxSeg:
|
741
|
+
"""Given genomic data, generate a boundary for a transcript segment.
|
742
|
+
|
743
|
+
Will liftover to GRCh38 assembly. If liftover is unsuccessful, will return
|
744
|
+
errors.
|
502
745
|
|
503
|
-
:param
|
746
|
+
:param genomic_pos: Genomic position where the transcript segment starts or ends
|
747
|
+
(inter-residue based)
|
504
748
|
:param chromosome: Chromosome. Must give chromosome without a prefix
|
505
|
-
(i.e. ``1`` or ``X``). If not provided, must provide ``
|
506
|
-
|
507
|
-
|
508
|
-
|
749
|
+
(i.e. ``1`` or ``X``). If not provided, must provide ``genomic_ac``. If
|
750
|
+
position maps to both GRCh37 and GRCh38, GRCh38 assembly will be used.
|
751
|
+
If ``genomic_ac`` is also provided, ``genomic_ac`` will be used.
|
752
|
+
:param genomic_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided,
|
753
|
+
must provide ``chromosome``. If ``chromosome`` is also provided, ``genomic_ac``
|
509
754
|
will be used.
|
510
|
-
:param strand: Strand
|
511
755
|
:param transcript: The transcript to use. If this is not given, we will try the
|
512
756
|
following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining
|
513
757
|
Compatible Transcript
|
514
758
|
:param gene: HGNC gene symbol
|
515
759
|
:param get_nearest_transcript_junction: If ``True``, this will return the
|
516
|
-
adjacent exon if the position specified by``
|
517
|
-
occur on an exon. For the positive strand, adjacent
|
518
|
-
preceding the breakpoint for the 5' end and the exon
|
519
|
-
breakpoint for the 3' end. For the negative strand, adjacent
|
520
|
-
the exon following the breakpoint for the 5' end and the exon
|
521
|
-
breakpoint for the 3' end.
|
522
|
-
:param
|
523
|
-
|
524
|
-
:return:
|
760
|
+
adjacent exon if the position specified by ``seg_start_genomic`` or
|
761
|
+
``seg_end_genomic`` does not occur on an exon. For the positive strand, adjacent
|
762
|
+
is defined as the exon preceding the breakpoint for the 5' end and the exon
|
763
|
+
following the breakpoint for the 3' end. For the negative strand, adjacent
|
764
|
+
is defined as the exon following the breakpoint for the 5' end and the exon
|
765
|
+
preceding the breakpoint for the 3' end.
|
766
|
+
:param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment starts.
|
767
|
+
``False`` if ``genomic_pos`` is where the transcript segment ends.
|
768
|
+
:return: Data for a transcript segment boundary (inter-residue coordinates)
|
525
769
|
"""
|
526
|
-
|
527
|
-
transcript_exon_data=None, warnings=[], service_meta=service_meta()
|
528
|
-
)
|
529
|
-
params = {key: None for key in TranscriptExonData.model_fields}
|
770
|
+
params = {key: None for key in GenomicTxSeg.model_fields}
|
530
771
|
|
531
772
|
if get_nearest_transcript_junction:
|
532
|
-
if not gene
|
533
|
-
return
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
],
|
773
|
+
if not gene and not transcript:
|
774
|
+
return GenomicTxSeg(
|
775
|
+
errors=[
|
776
|
+
"`gene` or `transcript` must be provided to select the adjacent transcript junction"
|
777
|
+
]
|
538
778
|
)
|
539
|
-
if not alt_ac:
|
540
|
-
alt_acs, w = self.seqrepo_access.chromosome_to_acs(chromosome)
|
541
779
|
|
542
|
-
|
543
|
-
|
544
|
-
|
780
|
+
if not genomic_ac:
|
781
|
+
genomic_acs, err_msg = self.seqrepo_access.chromosome_to_acs(chromosome)
|
782
|
+
|
783
|
+
if not genomic_acs:
|
784
|
+
return GenomicTxSeg(
|
785
|
+
errors=[err_msg],
|
786
|
+
)
|
787
|
+
genomic_ac = genomic_acs[0]
|
788
|
+
|
789
|
+
# Always liftover to GRCh38
|
790
|
+
genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos(
|
791
|
+
genomic_ac, genomic_pos
|
792
|
+
)
|
793
|
+
if err_msg:
|
794
|
+
return GenomicTxSeg(errors=[err_msg])
|
545
795
|
|
546
796
|
if not transcript:
|
547
797
|
# Select a transcript if not provided
|
@@ -555,7 +805,7 @@ class ExonGenomicCoordsMapper:
|
|
555
805
|
# Attempt to find a coding transcript if a MANE transcript
|
556
806
|
# cannot be found
|
557
807
|
results = await self.uta_db.get_transcripts(
|
558
|
-
gene=gene, alt_ac=
|
808
|
+
gene=gene, alt_ac=genomic_ac
|
559
809
|
)
|
560
810
|
|
561
811
|
if not results.is_empty():
|
@@ -566,376 +816,345 @@ class ExonGenomicCoordsMapper:
|
|
566
816
|
SELECT DISTINCT tx_ac
|
567
817
|
FROM {self.uta_db.schema}.tx_exon_aln_v
|
568
818
|
WHERE hgnc = '{gene}'
|
569
|
-
AND alt_ac = '{
|
819
|
+
AND alt_ac = '{genomic_ac}'
|
570
820
|
""" # noqa: S608
|
571
821
|
result = await self.uta_db.execute_query(query)
|
572
822
|
|
573
823
|
if result:
|
574
824
|
transcript = result[0]["tx_ac"]
|
575
825
|
else:
|
576
|
-
return
|
577
|
-
|
578
|
-
|
826
|
+
return GenomicTxSeg(
|
827
|
+
errors=[
|
828
|
+
f"Could not find a transcript for {gene} on {genomic_ac}"
|
829
|
+
]
|
579
830
|
)
|
580
831
|
|
581
|
-
|
582
|
-
tx_ac=transcript,
|
832
|
+
tx_exons = await self._get_all_exon_coords(
|
833
|
+
tx_ac=transcript, genomic_ac=genomic_ac
|
583
834
|
)
|
584
|
-
if not
|
585
|
-
return
|
835
|
+
if not tx_exons:
|
836
|
+
return GenomicTxSeg(errors=[f"No exons found given {transcript}"])
|
837
|
+
|
838
|
+
strand = Strand(tx_exons[0].alt_strand)
|
839
|
+
params["strand"] = strand
|
586
840
|
|
587
841
|
# Check if breakpoint occurs on an exon.
|
588
842
|
# If not, determine the adjacent exon given the selected transcript
|
589
|
-
if not self._is_exonic_breakpoint(
|
590
|
-
|
591
|
-
tx_exons_genomic_coords=
|
843
|
+
if not self._is_exonic_breakpoint(genomic_pos, tx_exons):
|
844
|
+
exon_num = self._get_adjacent_exon(
|
845
|
+
tx_exons_genomic_coords=tx_exons,
|
592
846
|
strand=strand,
|
593
|
-
start=
|
594
|
-
end=
|
847
|
+
start=genomic_pos if is_seg_start else None,
|
848
|
+
end=genomic_pos if not is_seg_start else None,
|
595
849
|
)
|
596
850
|
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
params["pos"] = pos
|
601
|
-
params["chr"] = alt_ac
|
602
|
-
|
603
|
-
self._set_exon_offset(
|
604
|
-
params=params,
|
605
|
-
start=tx_genomic_coords[exon - 1][3], # Start exon coordinate
|
606
|
-
end=tx_genomic_coords[exon - 1][4], # End exon coordinate
|
607
|
-
pos=pos,
|
608
|
-
is_start=is_start,
|
851
|
+
offset = self._get_exon_offset(
|
852
|
+
start_i=tx_exons[exon_num].alt_start_i,
|
853
|
+
end_i=tx_exons[exon_num].alt_end_i,
|
609
854
|
strand=strand,
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
# Check if valid accession is given
|
617
|
-
if not await self.uta_db.validate_genomic_ac(alt_ac):
|
618
|
-
return self._return_warnings(
|
619
|
-
resp, [f"Invalid genomic accession: {alt_ac}"]
|
855
|
+
use_start_i=strand == Strand.POSITIVE
|
856
|
+
if is_seg_start
|
857
|
+
else strand != Strand.POSITIVE,
|
858
|
+
is_in_exon=False,
|
859
|
+
start=genomic_pos if is_seg_start else None,
|
860
|
+
end=genomic_pos if not is_seg_start else None,
|
620
861
|
)
|
621
862
|
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
863
|
+
genomic_location, err_msg = self._get_vrs_seq_loc(
|
864
|
+
genomic_ac, genomic_pos, is_seg_start, strand
|
865
|
+
)
|
866
|
+
if err_msg:
|
867
|
+
return GenomicTxSeg(errors=[err_msg])
|
868
|
+
|
869
|
+
# gene is not required to liftover coordinates if tx_ac and genomic_ac are given, but we should set the associated gene
|
870
|
+
if not gene:
|
871
|
+
_gene, err_msg = await self._get_tx_ac_gene(transcript)
|
872
|
+
if err_msg:
|
873
|
+
return GenomicTxSeg(errors=[err_msg])
|
874
|
+
gene = _gene
|
875
|
+
|
876
|
+
return GenomicTxSeg(
|
877
|
+
gene=gene,
|
878
|
+
genomic_ac=genomic_ac,
|
879
|
+
tx_ac=transcript,
|
880
|
+
seg=TxSegment(
|
881
|
+
exon_ord=exon_num,
|
882
|
+
offset=offset,
|
883
|
+
genomic_location=genomic_location,
|
884
|
+
),
|
885
|
+
)
|
643
886
|
|
644
|
-
|
645
|
-
|
646
|
-
return self._return_warnings(resp, [warning])
|
647
|
-
gene, alt_ac = gene_alt_ac
|
887
|
+
if genomic_ac:
|
888
|
+
_gene, err_msg = await self._get_genomic_ac_gene(genomic_pos, genomic_ac)
|
648
889
|
|
649
|
-
|
650
|
-
|
651
|
-
params, gene, alt_ac, pos, strand, is_start
|
652
|
-
)
|
653
|
-
if warnings:
|
654
|
-
return self._return_warnings(resp, [warnings])
|
655
|
-
else:
|
656
|
-
params["transcript"] = transcript
|
657
|
-
params["gene"] = gene
|
658
|
-
params["pos"] = pos
|
659
|
-
params["chr"] = alt_ac
|
660
|
-
warning = await self._set_genomic_data(params, strand, is_start)
|
661
|
-
if warning:
|
662
|
-
return self._return_warnings(resp, [warning])
|
890
|
+
if err_msg:
|
891
|
+
return GenomicTxSeg(errors=[err_msg])
|
663
892
|
|
664
|
-
|
665
|
-
|
893
|
+
if gene and _gene != gene:
|
894
|
+
return GenomicTxSeg(
|
895
|
+
errors=[f"Expected gene, {gene}, but found {_gene}"]
|
896
|
+
)
|
666
897
|
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
677
|
-
alt_acs = genes_alt_acs["alt_acs"]
|
678
|
-
len_alt_acs = len(alt_acs)
|
679
|
-
if len_alt_acs > 1:
|
680
|
-
return None, f"Found more than one accessions: {alt_acs}"
|
681
|
-
if len_alt_acs == 0:
|
682
|
-
return None, "No genomic accessions found"
|
683
|
-
alt_ac = next(iter(alt_acs))
|
684
|
-
|
685
|
-
genes = genes_alt_acs["genes"]
|
686
|
-
len_genes = len(genes)
|
687
|
-
input_gene = gene
|
688
|
-
output_gene = None
|
689
|
-
if len_genes == 1:
|
690
|
-
output_gene = next(iter(genes))
|
691
|
-
elif len_genes > 1:
|
692
|
-
return None, f"Found more than one gene: {genes}"
|
693
|
-
elif len_genes == 0:
|
694
|
-
return None, "No genes found"
|
695
|
-
|
696
|
-
if input_gene is not None and output_gene != input_gene.upper():
|
697
|
-
return (
|
698
|
-
None,
|
699
|
-
f"Input gene, {input_gene}, does not match "
|
700
|
-
f"expected output gene, {output_gene}",
|
701
|
-
)
|
898
|
+
gene = _gene
|
899
|
+
elif chromosome:
|
900
|
+
# Try GRCh38 first
|
901
|
+
for assembly in [Assembly.GRCH38.value, Assembly.GRCH37.value]:
|
902
|
+
_genomic_acs, err_msg = self.seqrepo_access.translate_identifier(
|
903
|
+
f"{assembly}:chr{chromosome}", "refseq"
|
904
|
+
)
|
905
|
+
if err_msg:
|
906
|
+
return GenomicTxSeg(errors=[err_msg])
|
907
|
+
_genomic_ac = _genomic_acs[0].split(":")[-1]
|
702
908
|
|
703
|
-
|
704
|
-
|
909
|
+
_gene, err_msg = await self._get_genomic_ac_gene(
|
910
|
+
genomic_pos, _genomic_ac
|
911
|
+
)
|
912
|
+
if _gene:
|
913
|
+
if gene and _gene != gene:
|
914
|
+
return GenomicTxSeg(
|
915
|
+
errors=[f"Expected gene, {gene}, but found {_gene}"]
|
916
|
+
)
|
917
|
+
gene = _gene
|
918
|
+
genomic_ac = _genomic_ac
|
919
|
+
break
|
920
|
+
|
921
|
+
if not genomic_ac:
|
922
|
+
return GenomicTxSeg(
|
923
|
+
errors=[
|
924
|
+
f"Unable to get genomic RefSeq accession for chromosome {chromosome} on position {genomic_pos}"
|
925
|
+
]
|
926
|
+
)
|
705
927
|
|
706
|
-
|
707
|
-
|
708
|
-
|
709
|
-
|
710
|
-
|
711
|
-
|
712
|
-
strand: Strand,
|
713
|
-
is_start: bool,
|
714
|
-
) -> str | None:
|
715
|
-
"""Set genomic data in `params` found from MANE.
|
716
|
-
|
717
|
-
:param params: Parameters for response
|
718
|
-
:param gene: Gene symbol
|
719
|
-
:param alt_ac: Genomic accession
|
720
|
-
:param pos: Genomic position
|
721
|
-
:param strand: Strand
|
722
|
-
:param is_start: `True` if `pos` is start position. `False` if `pos` is end
|
723
|
-
position.
|
724
|
-
:return: Warnings if found
|
725
|
-
"""
|
726
|
-
start, end = get_inter_residue_pos(pos, pos, residue_mode=ResidueMode.ZERO)
|
727
|
-
mane_data: (
|
728
|
-
CdnaRepresentation | None
|
729
|
-
) = await self.mane_transcript.get_mane_transcript(
|
730
|
-
alt_ac,
|
731
|
-
start,
|
732
|
-
end,
|
733
|
-
AnnotationLayer.GENOMIC,
|
734
|
-
gene=gene,
|
735
|
-
try_longest_compatible=True,
|
736
|
-
residue_mode=ResidueMode.INTER_RESIDUE,
|
737
|
-
)
|
738
|
-
if not mane_data:
|
739
|
-
msg = f"Unable to find mane data for {alt_ac} with position {pos}"
|
740
|
-
if gene:
|
741
|
-
msg += f" on gene {gene}"
|
742
|
-
_logger.warning(msg)
|
743
|
-
return msg
|
744
|
-
|
745
|
-
params["gene"] = mane_data.gene
|
746
|
-
params["transcript"] = (
|
747
|
-
mane_data.refseq
|
748
|
-
if mane_data.refseq
|
749
|
-
else mane_data.ensembl
|
750
|
-
if mane_data.ensembl
|
751
|
-
else None
|
752
|
-
)
|
753
|
-
tx_exons = await self._structure_exons(params["transcript"], alt_ac=alt_ac)
|
754
|
-
if not tx_exons:
|
755
|
-
return f"Unable to get exons for {params['transcript']}"
|
756
|
-
tx_pos = mane_data.pos[0] + mane_data.coding_start_site
|
757
|
-
params["exon"] = self._get_exon_number(tx_exons, tx_pos)
|
758
|
-
|
759
|
-
try:
|
760
|
-
tx_exon = tx_exons[params["exon"] - 1]
|
761
|
-
except IndexError:
|
762
|
-
msg = (
|
763
|
-
f"{params['transcript']} with position {tx_pos} "
|
764
|
-
f"does not exist on exons: {tx_exons}"
|
765
|
-
)
|
766
|
-
_logger.warning(msg)
|
767
|
-
return msg
|
768
|
-
|
769
|
-
strand_to_use = strand if strand is not None else mane_data.strand
|
770
|
-
params["strand"] = strand_to_use
|
771
|
-
self._set_exon_offset(
|
772
|
-
params,
|
773
|
-
tx_exon[0],
|
774
|
-
tx_exon[1],
|
775
|
-
tx_pos,
|
776
|
-
is_start=is_start,
|
777
|
-
strand=strand_to_use,
|
778
|
-
)
|
928
|
+
if not gene:
|
929
|
+
return GenomicTxSeg(
|
930
|
+
errors=[
|
931
|
+
f"Unable to get gene given {genomic_ac} on position {genomic_pos}"
|
932
|
+
]
|
933
|
+
)
|
779
934
|
|
780
|
-
|
781
|
-
|
782
|
-
params["transcript"], tx_pos, tx_pos, gene
|
783
|
-
)
|
784
|
-
if genomic_data is None:
|
785
|
-
return warnings
|
786
|
-
|
787
|
-
params["chr"] = genomic_data[1]
|
788
|
-
genomic_coords = genomic_data[2], genomic_data[3]
|
789
|
-
genomic_pos = genomic_coords[1] - 1 if is_start else genomic_coords[0] + 1
|
790
|
-
params["pos"] = (
|
791
|
-
genomic_pos - params["exon_offset"]
|
792
|
-
if strand_to_use == -1
|
793
|
-
else genomic_pos + params["exon_offset"]
|
935
|
+
return await self._get_tx_seg_genomic_metadata(
|
936
|
+
genomic_ac, genomic_pos, is_seg_start, gene, tx_ac=transcript
|
794
937
|
)
|
795
|
-
return None
|
796
|
-
|
797
|
-
async def _set_genomic_data(
|
798
|
-
self, params: dict, strand: Strand, is_start: bool
|
799
|
-
) -> str | None:
|
800
|
-
"""Set genomic data in ``params``
|
801
938
|
|
802
|
-
|
803
|
-
:
|
804
|
-
|
805
|
-
|
806
|
-
|
939
|
+
async def _get_grch38_ac_pos(
    self, genomic_ac: str, genomic_pos: int, grch38_ac: str | None = None
) -> tuple[str | None, int | None, str | None]:
    """Get GRCh38 genomic representation for accession and position

    If ``genomic_ac`` already equals the GRCh38 accession, the inputs are returned
    unchanged. Otherwise ``genomic_ac`` must resolve to a GRCh37 identifier in
    SeqRepo, and ``genomic_pos`` is lifted over to GRCh38.

    :param genomic_ac: RefSeq genomic accession (GRCh37 or GRCh38 assembly)
    :param genomic_pos: Genomic position on ``genomic_ac``
    :param grch38_ac: A valid GRCh38 genomic accession for ``genomic_ac``. If not
        provided, will attempt to retrieve associated GRCh38 accession from UTA.
    :return: Tuple containing GRCh38 accession, GRCh38 position, and error message
        if unable to get GRCh38 representation. On error, the accession and
        position are both ``None``.
    """
    if not grch38_ac:
        grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
        if not grch38_ac:
            return None, None, f"Unrecognized genomic accession: {genomic_ac}."

        # The lookup result is a sequence; its first entry is used as the accession.
        grch38_ac = grch38_ac[0]

    if grch38_ac != genomic_ac:
        # Ensure genomic_ac is GRCh37
        chromosome, _ = self.seqrepo_access.translate_identifier(
            genomic_ac, Assembly.GRCH37.value
        )
        if not chromosome:
            _logger.warning(
                "SeqRepo could not find associated %s assembly for genomic accession %s.",
                Assembly.GRCH37.value,
                genomic_ac,
            )
            return (
                None,
                None,
                f"`genomic_ac` must use {Assembly.GRCH37.value} or {Assembly.GRCH38.value} assembly.",
            )

        # Keep only the text after the last ":" of the last returned identifier
        # (presumably the bare chromosome name -- TODO confirm the alias format).
        chromosome = chromosome[-1].split(":")[-1]
        liftover_data = self.liftover.get_liftover(
            chromosome, genomic_pos, Assembly.GRCH38
        )
        if liftover_data is None:
            return (
                None,
                None,
                f"Lifting over {genomic_pos} on {genomic_ac} from {Assembly.GRCH37.value} to {Assembly.GRCH38.value} was unsuccessful.",
            )

        # Element 1 of the liftover result is taken as the lifted-over position.
        genomic_pos = liftover_data[1]
        genomic_ac = grch38_ac

    return genomic_ac, genomic_pos, None
|
836
990
|
|
837
|
-
|
838
|
-
|
839
|
-
|
840
|
-
|
841
|
-
|
842
|
-
|
843
|
-
)
|
844
|
-
if len(data) != 1:
|
845
|
-
return (
|
846
|
-
f"Must find exactly one row for genomic data, "
|
847
|
-
f"but found: {len(data)}"
|
848
|
-
)
|
991
|
+
async def _get_genomic_ac_gene(
    self,
    pos: int,
    genomic_ac: str,
) -> tuple[str | None, str | None]:
    """Look up the gene at a position on a genomic accession.

    If multiple genes are found for a given ``pos`` and ``genomic_ac``, only one
    gene will be returned (the first when ordered by symbol).

    :param pos: Genomic position on ``genomic_ac``
    :param genomic_ac: RefSeq genomic accession, e.g. ``"NC_000007.14"``
    :return: Tuple containing the HGNC gene symbol associated to the genomic
        accession and position (if found) and a warning (if not found)
    """
    # NOTE: ``genomic_ac`` and ``pos`` are interpolated directly into the SQL text.
    gene_query = f"""
        SELECT DISTINCT hgnc
        FROM {self.uta_db.schema}.tx_exon_aln_v
        WHERE alt_ac = '{genomic_ac}'
        AND alt_aln_method = 'splign'
        AND {pos} BETWEEN alt_start_i AND alt_end_i
        ORDER BY hgnc
        LIMIT 1;
        """  # noqa: S608
    rows = await self.uta_db.execute_query(gene_query)
    if rows:
        return rows[0]["hgnc"], None

    return None, f"No gene(s) found given {genomic_ac} on position {pos}"
|
1020
|
+
|
1021
|
+
async def _get_tx_ac_gene(
    self,
    tx_ac: str,
) -> tuple[str | None, str | None]:
    """Look up the gene associated with a transcript.

    If multiple genes are found for a given ``tx_ac``, only one gene will be
    returned (the first when ordered by symbol).

    :param tx_ac: RefSeq transcript, e.g. ``"NM_004333.6"``
    :return: Tuple containing the HGNC gene symbol associated to the transcript
        (if found) and a warning (if not found)
    """
    # NOTE: ``tx_ac`` is interpolated directly into the SQL text.
    gene_query = f"""
        SELECT DISTINCT hgnc
        FROM {self.uta_db.schema}.tx_exon_aln_v
        WHERE tx_ac = '{tx_ac}'
        ORDER BY hgnc
        LIMIT 1;
        """  # noqa: S608
    rows = await self.uta_db.execute_query(gene_query)
    if rows:
        return rows[0]["hgnc"], None

    return None, f"No gene(s) found given {tx_ac}"
|
1046
|
+
|
1047
|
+
async def _get_tx_seg_genomic_metadata(
    self,
    genomic_ac: str,
    genomic_pos: int,
    is_seg_start: bool,
    gene: str,
    tx_ac: str | None,
) -> GenomicTxSeg:
    """Get transcript segment data and associated genomic metadata.

    Will liftover to GRCh38 assembly. If liftover is unsuccessful, will return
    errors.

    If ``tx_ac`` is not provided, will attempt to retrieve MANE transcript.

    :param genomic_ac: Genomic RefSeq accession
    :param genomic_pos: Genomic position where the transcript segment occurs
    :param is_seg_start: Whether or not ``genomic_pos`` represents the start position.
    :param gene: HGNC gene symbol
    :param tx_ac: Transcript RefSeq accession. If not provided, will use MANE
        transcript
    :return: Transcript segment data and associated genomic metadata. When any
        lookup fails, a ``GenomicTxSeg`` carrying only ``errors`` is returned
        instead.
    """
    if tx_ac:
        # We should always try to liftover
        grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
        if not grch38_ac:
            return GenomicTxSeg(errors=[f"Invalid genomic accession: {genomic_ac}"])
        grch38_ac = grch38_ac[0]
    else:
        mane_data = self.mane_transcript_mappings.get_gene_mane_data(gene)
        if not mane_data:
            err_msg = f"Unable to find mane data for {genomic_ac} with position {genomic_pos}"
            if gene:
                err_msg += f" on gene {gene}"
            _logger.warning(err_msg)
            return GenomicTxSeg(errors=[err_msg])

        # Use the first MANE record for both the transcript and its GRCh38
        # accession.
        mane_data = mane_data[0]
        tx_ac = mane_data["RefSeq_nuc"]
        grch38_ac = mane_data["GRCh38_chr"]

    # Always liftover to GRCh38
    genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos(
        genomic_ac, genomic_pos, grch38_ac=grch38_ac
    )
    if err_msg:
        return GenomicTxSeg(errors=[err_msg])

    # Only used to confirm the transcript has exons aligned to the accession.
    tx_exons = await self._get_all_exon_coords(tx_ac, genomic_ac=grch38_ac)
    if not tx_exons:
        return GenomicTxSeg(errors=[f"No exons found given {tx_ac}"])

    tx_exon_aln_data = await self.uta_db.get_tx_exon_aln_v_data(
        tx_ac,
        genomic_pos,
        genomic_pos,
        alt_ac=genomic_ac,
        use_tx_pos=False,
    )
    if len(tx_exon_aln_data) != 1:
        return GenomicTxSeg(
            errors=[
                f"Must find exactly one row for genomic data, but found: {len(tx_exon_aln_data)}"
            ]
        )

    tx_exon_aln_data = tx_exon_aln_data[0]

    # Normalise the strand once so that both helpers below receive a ``Strand``
    # member, matching the ``strand: Strand`` parameter of ``_get_vrs_seq_loc``,
    # instead of one of them getting the raw alignment value.
    strand = Strand(tx_exon_aln_data.alt_strand)

    offset = self._get_exon_offset(
        start_i=tx_exon_aln_data.alt_start_i,
        end_i=tx_exon_aln_data.alt_end_i,
        strand=strand,
        use_start_i=False,  # This doesn't impact anything since we're on the exon
        is_in_exon=True,
        start=genomic_pos if is_seg_start else None,
        end=genomic_pos if not is_seg_start else None,
    )

    genomic_location, err_msg = self._get_vrs_seq_loc(
        genomic_ac, genomic_pos, is_seg_start, strand
    )
    if err_msg:
        return GenomicTxSeg(errors=[err_msg])

    return GenomicTxSeg(
        gene=tx_exon_aln_data.hgnc,
        genomic_ac=genomic_ac,
        tx_ac=tx_exon_aln_data.tx_ac,
        seg=TxSegment(
            exon_ord=tx_exon_aln_data.ord,
            offset=offset,
            genomic_location=genomic_location,
        ),
    )
|
920
1142
|
|
921
1143
|
@staticmethod
def _is_exonic_breakpoint(pos: int, tx_genomic_coords: list[_ExonCoord]) -> bool:
    """Report whether a breakpoint falls within any exon of a transcript

    Both genomic boundaries of an exon count as being on the exon.

    :param pos: Genomic breakpoint
    :param tx_genomic_coords: A list of transcript exon coordinate data
    :return: ``True`` if the breakpoint occurs on an exon
    """
    for exon_coord in tx_genomic_coords:
        if exon_coord.alt_start_i <= pos <= exon_coord.alt_end_i:
            return True
    return False
|
935
1154
|
|
936
1155
|
@staticmethod
|
937
1156
|
def _get_adjacent_exon(
|
938
|
-
tx_exons_genomic_coords: list[
|
1157
|
+
tx_exons_genomic_coords: list[_ExonCoord],
|
939
1158
|
strand: Strand,
|
940
1159
|
start: int | None = None,
|
941
1160
|
end: int | None = None,
|
@@ -946,20 +1165,18 @@ class ExonGenomicCoordsMapper:
|
|
946
1165
|
adjacent is defined as the exon following the breakpoint for the 5' end and the
|
947
1166
|
exon preceding the breakpoint for the 3' end.
|
948
1167
|
|
949
|
-
:param
|
950
|
-
coordinates for a transcript. Each tuple contains the transcript number
|
951
|
-
(0-indexed), the transcript coordinates for the exon, and the genomic
|
952
|
-
coordinates for the exon. Pos 0 in the tuple corresponds to the exon
|
953
|
-
number, pos 1 and pos 2 refer to the start and end transcript coordinates,
|
954
|
-
respectively, and pos 3 and 4 refer to the start and end genomic
|
955
|
-
coordinates, respectively.
|
1168
|
+
:param tx_exons_genomic_coords: Transcript exon coordinate data
|
956
1169
|
:param strand: Strand
|
957
|
-
:param
|
958
|
-
:param
|
959
|
-
:return: Exon number corresponding to adjacent exon. Will be
|
1170
|
+
:param start: Genomic coordinate of breakpoint
|
1171
|
+
:param end: Genomic coordinate of breakpoint
|
1172
|
+
:return: Exon number corresponding to adjacent exon. Will be 0-based
|
960
1173
|
"""
|
961
1174
|
for i in range(len(tx_exons_genomic_coords) - 1):
|
962
1175
|
exon = tx_exons_genomic_coords[i]
|
1176
|
+
if start == exon.alt_start_i:
|
1177
|
+
break
|
1178
|
+
if end == exon.alt_end_i:
|
1179
|
+
break
|
963
1180
|
next_exon = tx_exons_genomic_coords[i + 1]
|
964
1181
|
bp = start if start else end
|
965
1182
|
if strand == Strand.POSITIVE:
|
@@ -968,19 +1185,46 @@ class ExonGenomicCoordsMapper:
|
|
968
1185
|
else:
|
969
1186
|
lte_exon = next_exon
|
970
1187
|
gte_exon = exon
|
971
|
-
if bp >= lte_exon
|
1188
|
+
if bp >= lte_exon.alt_end_i and bp <= gte_exon.alt_start_i:
|
972
1189
|
break
|
973
1190
|
# Return current exon if end position is provided, next exon if start position
|
974
|
-
# is provided.
|
975
|
-
|
976
|
-
return exon[0] + 1 if end else exon[0] + 2
|
1191
|
+
# is provided.
|
1192
|
+
return exon.ord if end else exon.ord + 1
|
977
1193
|
|
978
1194
|
@staticmethod
def _get_exon_offset(
    start_i: int,
    end_i: int,
    strand: Strand,
    use_start_i: bool = True,
    is_in_exon: bool = True,
    start: int | None = None,
    end: int | None = None,
) -> int:
    """Calculate how far a position lies from an exon's start or end index

    :param start_i: Exon start index (inter-residue)
    :param end_i: Exon end index (inter-residue)
    :param strand: Strand
    :param use_start_i: Whether ``start_i`` (rather than ``end_i``) is the reference
        index for the offset, defaults to ``True``. Only consulted when
        ``is_in_exon`` is ``False``.
    :param is_in_exon: Whether or not the position occurs in an exon, defaults to
        ``True``
    :param start: Provided start position, defaults to ``None``. Must provide
        ``start`` or ``end``, not both.
    :param end: Provided end position, defaults to ``None``. Must provide ``start``
        or ``end``, not both
    :return: Offset from exon start or end index
    """
    on_positive_strand = strand == Strand.POSITIVE

    if is_in_exon:
        # On an exon, a start position is measured from the exon's 5' boundary and
        # an end position from its 3' boundary; which index that is flips with
        # the strand.
        if start is not None:
            return start - start_i if on_positive_strand else end_i - start
        return end - end_i if on_positive_strand else start_i - end

    # Off an exon, the caller picks the reference index via ``use_start_i``
    if on_positive_strand:
        return start - start_i if use_start_i else end - end_i
    return start_i - end if use_start_i else end_i - start
|