cool-seq-tool 0.5.1__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cool_seq_tool/__init__.py +6 -0
- cool_seq_tool/app.py +1 -2
- cool_seq_tool/handlers/seqrepo_access.py +5 -5
- cool_seq_tool/mappers/alignment.py +16 -16
- cool_seq_tool/mappers/exon_genomic_coords.py +845 -628
- cool_seq_tool/mappers/mane_transcript.py +184 -152
- cool_seq_tool/schemas.py +30 -438
- cool_seq_tool/sources/mane_transcript_mappings.py +35 -0
- cool_seq_tool/sources/uta_database.py +149 -229
- cool_seq_tool/utils.py +9 -9
- {cool_seq_tool-0.5.1.dist-info → cool_seq_tool-0.7.0.dist-info}/METADATA +8 -8
- cool_seq_tool-0.7.0.dist-info/RECORD +24 -0
- {cool_seq_tool-0.5.1.dist-info → cool_seq_tool-0.7.0.dist-info}/WHEEL +1 -1
- cool_seq_tool-0.5.1.dist-info/RECORD +0 -24
- {cool_seq_tool-0.5.1.dist-info → cool_seq_tool-0.7.0.dist-info}/LICENSE +0 -0
- {cool_seq_tool-0.5.1.dist-info → cool_seq_tool-0.7.0.dist-info}/top_level.txt +0 -0
@@ -1,32 +1,224 @@
|
|
1
1
|
"""Provide mapping capabilities between transcript exon and genomic coordinates."""
|
2
2
|
|
3
3
|
import logging
|
4
|
-
|
4
|
+
|
5
|
+
from ga4gh.vrs.models import SequenceLocation, SequenceReference
|
6
|
+
from pydantic import ConfigDict, Field, StrictInt, StrictStr, model_validator
|
5
7
|
|
6
8
|
from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess
|
7
9
|
from cool_seq_tool.mappers.liftover import LiftOver
|
8
|
-
from cool_seq_tool.mappers.mane_transcript import CdnaRepresentation, ManeTranscript
|
9
10
|
from cool_seq_tool.schemas import (
|
10
|
-
AnnotationLayer,
|
11
11
|
Assembly,
|
12
|
-
|
13
|
-
|
14
|
-
ResidueMode,
|
12
|
+
BaseModelForbidExtra,
|
13
|
+
ServiceMeta,
|
15
14
|
Strand,
|
16
|
-
TranscriptExonData,
|
17
|
-
TranscriptExonDataResponse,
|
18
15
|
)
|
19
16
|
from cool_seq_tool.sources.mane_transcript_mappings import ManeTranscriptMappings
|
20
17
|
from cool_seq_tool.sources.uta_database import UtaDatabase
|
21
|
-
from cool_seq_tool.utils import
|
22
|
-
|
23
|
-
CoordinatesResponseType = TypeVar(
|
24
|
-
"CoordinatesResponseType", GenomicDataResponse, TranscriptExonDataResponse
|
25
|
-
)
|
18
|
+
from cool_seq_tool.utils import service_meta
|
26
19
|
|
27
20
|
_logger = logging.getLogger(__name__)
|
28
21
|
|
29
22
|
|
23
|
+
class ExonCoord(BaseModelForbidExtra):
|
24
|
+
"""Model for representing exon coordinate data"""
|
25
|
+
|
26
|
+
ord: StrictInt = Field(..., description="Exon number. 0-based.")
|
27
|
+
tx_start_i: StrictInt = Field(
|
28
|
+
...,
|
29
|
+
description="Transcript start index of the exon. Inter-residue coordinates.",
|
30
|
+
)
|
31
|
+
tx_end_i: StrictInt = Field(
|
32
|
+
..., description="Transcript end index of the exon. Inter-residue coordinates."
|
33
|
+
)
|
34
|
+
alt_start_i: StrictInt = Field(
|
35
|
+
..., description="Genomic start index of the exon. Inter-residue coordinates."
|
36
|
+
)
|
37
|
+
alt_end_i: StrictInt = Field(
|
38
|
+
..., description="Genomic end index of the exon. Inter-residue coordinates."
|
39
|
+
)
|
40
|
+
alt_strand: Strand = Field(..., description="Strand.")
|
41
|
+
|
42
|
+
model_config = ConfigDict(
|
43
|
+
json_schema_extra={
|
44
|
+
"example": {
|
45
|
+
"ord": 0,
|
46
|
+
"tx_start_i": 0,
|
47
|
+
"tx_end_i": 234,
|
48
|
+
"alt_start_i": 154191901,
|
49
|
+
"alt_end_i": 154192135,
|
50
|
+
"alt_strand": Strand.NEGATIVE,
|
51
|
+
}
|
52
|
+
}
|
53
|
+
)
|
54
|
+
|
55
|
+
|
56
|
+
class TxSegment(BaseModelForbidExtra):
|
57
|
+
"""Model for representing transcript segment data."""
|
58
|
+
|
59
|
+
exon_ord: StrictInt = Field(..., description="Exon number. 0-based.")
|
60
|
+
offset: StrictInt = Field(
|
61
|
+
0,
|
62
|
+
description="The value added to or subtracted from the `genomic_location` to find the start or end of an exon.",
|
63
|
+
)
|
64
|
+
genomic_location: SequenceLocation = Field(
|
65
|
+
..., description="The genomic position of a transcript segment."
|
66
|
+
)
|
67
|
+
|
68
|
+
model_config = ConfigDict(
|
69
|
+
json_schema_extra={
|
70
|
+
"example": {
|
71
|
+
"exon_ord": 0,
|
72
|
+
"offset": 0,
|
73
|
+
"genomic_location": {
|
74
|
+
"type": "SequenceLocation",
|
75
|
+
"sequenceReference": {
|
76
|
+
"type": "SequenceReference",
|
77
|
+
"refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
|
78
|
+
},
|
79
|
+
"end": 154192135,
|
80
|
+
},
|
81
|
+
}
|
82
|
+
}
|
83
|
+
)
|
84
|
+
|
85
|
+
|
86
|
+
class GenomicTxSeg(BaseModelForbidExtra):
|
87
|
+
"""Model for representing a boundary for a transcript segment."""
|
88
|
+
|
89
|
+
seg: TxSegment | None = Field(None, description="Transcript segment.")
|
90
|
+
gene: StrictStr | None = Field(None, description="HGNC gene symbol.")
|
91
|
+
genomic_ac: StrictStr | None = Field(None, description="RefSeq genomic accession.")
|
92
|
+
tx_ac: StrictStr | None = Field(None, description="RefSeq transcript accession.")
|
93
|
+
errors: list[StrictStr] = Field([], description="Error messages.")
|
94
|
+
|
95
|
+
@model_validator(mode="before")
|
96
|
+
def check_errors(cls, values: dict) -> dict: # noqa: N805
|
97
|
+
"""Ensure that fields are (un)set depending on errors
|
98
|
+
|
99
|
+
:param values: Values in model
|
100
|
+
:raises ValueError: If `seg`, `gene`, `genomic_ac` and `tx_ac` are not
|
101
|
+
provided when there are no errors
|
102
|
+
:return: Values in model
|
103
|
+
"""
|
104
|
+
if not values.get("errors") and not all(
|
105
|
+
(
|
106
|
+
values.get("seg"),
|
107
|
+
values.get("gene"),
|
108
|
+
values.get("genomic_ac"),
|
109
|
+
values.get("tx_ac"),
|
110
|
+
)
|
111
|
+
):
|
112
|
+
err_msg = "`seg`, `gene`, `genomic_ac` and `tx_ac` must be provided"
|
113
|
+
raise ValueError(err_msg)
|
114
|
+
return values
|
115
|
+
|
116
|
+
model_config = ConfigDict(
|
117
|
+
json_schema_extra={
|
118
|
+
"example": {
|
119
|
+
"gene": "TPM3",
|
120
|
+
"genomic_ac": "NC_000001.11",
|
121
|
+
"tx_ac": "NM_152263.3",
|
122
|
+
"seg": {
|
123
|
+
"exon_ord": 0,
|
124
|
+
"offset": 0,
|
125
|
+
"genomic_location": {
|
126
|
+
"type": "SequenceLocation",
|
127
|
+
"sequenceReference": {
|
128
|
+
"type": "SequenceReference",
|
129
|
+
"refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
|
130
|
+
},
|
131
|
+
"end": 154192135,
|
132
|
+
},
|
133
|
+
},
|
134
|
+
"errors": [],
|
135
|
+
}
|
136
|
+
}
|
137
|
+
)
|
138
|
+
|
139
|
+
|
140
|
+
class GenomicTxSegService(BaseModelForbidExtra):
|
141
|
+
"""Service model for genomic and transcript data."""
|
142
|
+
|
143
|
+
gene: StrictStr | None = Field(None, description="HGNC gene symbol.")
|
144
|
+
genomic_ac: StrictStr | None = Field(None, description="RefSeq genomic accession.")
|
145
|
+
tx_ac: StrictStr | None = Field(None, description="RefSeq transcript accession.")
|
146
|
+
seg_start: TxSegment | None = Field(None, description="Start transcript segment.")
|
147
|
+
seg_end: TxSegment | None = Field(None, description="End transcript segment.")
|
148
|
+
errors: list[StrictStr] = Field([], description="Error messages.")
|
149
|
+
service_meta: ServiceMeta = Field(..., description="Service metadata.")
|
150
|
+
|
151
|
+
@model_validator(mode="before")
|
152
|
+
def add_meta_check_errors(cls, values: dict) -> dict: # noqa: N805
|
153
|
+
"""Add service metadata to model and ensure that fields are (un)set depending
|
154
|
+
on errors
|
155
|
+
|
156
|
+
:param values: Values in model
|
157
|
+
:raises ValueError: If `gene`, `genomic_ac`, `tx_ac` and `seg_start` or `seg_end`
|
158
|
+
are not provided when there are no errors
|
159
|
+
:return: Values in model, including service metadata
|
160
|
+
"""
|
161
|
+
values["service_meta"] = service_meta()
|
162
|
+
if not values.get("errors") and not all(
|
163
|
+
(
|
164
|
+
values.get("gene"),
|
165
|
+
values.get("genomic_ac"),
|
166
|
+
values.get("tx_ac"),
|
167
|
+
values.get("seg_start") or values.get("seg_end"),
|
168
|
+
)
|
169
|
+
):
|
170
|
+
err_msg = "`gene`, `genomic_ac`, `tx_ac` and `seg_start` or `seg_end` must be provided"
|
171
|
+
raise ValueError(err_msg)
|
172
|
+
|
173
|
+
return values
|
174
|
+
|
175
|
+
model_config = ConfigDict(
|
176
|
+
json_schema_extra={
|
177
|
+
"example": {
|
178
|
+
"gene": "TPM3",
|
179
|
+
"genomic_ac": "NC_000001.11",
|
180
|
+
"tx_ac": "NM_152263.3",
|
181
|
+
"seg_start": {
|
182
|
+
"exon_ord": 0,
|
183
|
+
"offset": 0,
|
184
|
+
"genomic_location": {
|
185
|
+
"type": "SequenceLocation",
|
186
|
+
"sequenceReference": {
|
187
|
+
"type": "SequenceReference",
|
188
|
+
"refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
|
189
|
+
},
|
190
|
+
"end": 154192135,
|
191
|
+
},
|
192
|
+
},
|
193
|
+
"seg_end": {
|
194
|
+
"exon_ord": 7,
|
195
|
+
"offset": 0,
|
196
|
+
"genomic_location": {
|
197
|
+
"type": "SequenceLocation",
|
198
|
+
"sequenceReference": {
|
199
|
+
"type": "SequenceReference",
|
200
|
+
"refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
|
201
|
+
},
|
202
|
+
"start": 154170399,
|
203
|
+
},
|
204
|
+
},
|
205
|
+
}
|
206
|
+
}
|
207
|
+
)
|
208
|
+
|
209
|
+
|
210
|
+
def _return_service_errors(errors: list[str]) -> GenomicTxSegService:
|
211
|
+
"""Log errors and return service object with errors.
|
212
|
+
|
213
|
+
:param errors: Error message(s)
|
214
|
+
:return: Service object with error messages.
|
215
|
+
"""
|
216
|
+
for error in errors:
|
217
|
+
_logger.warning(error)
|
218
|
+
|
219
|
+
return GenomicTxSegService(errors=errors)
|
220
|
+
|
221
|
+
|
30
222
|
class ExonGenomicCoordsMapper:
|
31
223
|
"""Provide capabilities for mapping transcript exon representation to/from genomic
|
32
224
|
coordinate representation.
|
@@ -36,7 +228,6 @@ class ExonGenomicCoordsMapper:
|
|
36
228
|
self,
|
37
229
|
seqrepo_access: SeqRepoAccess,
|
38
230
|
uta_db: UtaDatabase,
|
39
|
-
mane_transcript: ManeTranscript,
|
40
231
|
mane_transcript_mappings: ManeTranscriptMappings,
|
41
232
|
liftover: LiftOver,
|
42
233
|
) -> None:
|
@@ -45,7 +236,7 @@ class ExonGenomicCoordsMapper:
|
|
45
236
|
A lot of resources are required for initialization, so when defaults are enough,
|
46
237
|
it's easiest to let the core CoolSeqTool class handle it for you:
|
47
238
|
|
48
|
-
>>> from cool_seq_tool
|
239
|
+
>>> from cool_seq_tool import CoolSeqTool
|
49
240
|
>>> egc = CoolSeqTool().ex_g_coords_mapper
|
50
241
|
|
51
242
|
Note that this class's public methods are all defined as ``async``, so they will
|
@@ -54,42 +245,22 @@ class ExonGenomicCoordsMapper:
|
|
54
245
|
|
55
246
|
>>> import asyncio
|
56
247
|
>>> result = asyncio.run(
|
57
|
-
... egc.
|
58
|
-
... "NM_002529.3", exon_start=2, exon_end=17
|
59
|
-
... )
|
248
|
+
... egc.tx_segment_to_genomic("NM_002529.3", exon_start=2, exon_end=17)
|
60
249
|
... )
|
61
250
|
>>> result.genomic_data.start, result.genomic_data.end
|
62
251
|
(156864428, 156881456)
|
63
252
|
|
64
253
|
:param seqrepo_access: SeqRepo instance to give access to query SeqRepo database
|
65
254
|
:param uta_db: UtaDatabase instance to give access to query UTA database
|
66
|
-
:param mane_transcript: Instance to align to MANE or compatible representation
|
67
255
|
:param mane_transcript_mappings: Instance to provide access to ManeTranscriptMappings class
|
68
256
|
:param liftover: Instance to provide mapping between human genome assemblies
|
69
257
|
"""
|
70
258
|
self.seqrepo_access = seqrepo_access
|
71
259
|
self.uta_db = uta_db
|
72
|
-
self.mane_transcript = mane_transcript
|
73
260
|
self.mane_transcript_mappings = mane_transcript_mappings
|
74
261
|
self.liftover = liftover
|
75
262
|
|
76
|
-
|
77
|
-
def _return_warnings(
|
78
|
-
resp: CoordinatesResponseType, warning_msg: list[str]
|
79
|
-
) -> CoordinatesResponseType:
|
80
|
-
"""Add warnings to response object
|
81
|
-
|
82
|
-
:param resp: Response object
|
83
|
-
:param warning_msg: Warning message(s) on why ``transcript_exon_data`` or
|
84
|
-
``genomic_data`` field is ``None``
|
85
|
-
:return: Response object with warning message
|
86
|
-
"""
|
87
|
-
for msg in warning_msg:
|
88
|
-
_logger.warning(msg)
|
89
|
-
resp.warnings.append(msg)
|
90
|
-
return resp
|
91
|
-
|
92
|
-
async def transcript_to_genomic_coordinates(
|
263
|
+
async def tx_segment_to_genomic(
|
93
264
|
self,
|
94
265
|
transcript: str,
|
95
266
|
gene: str | None = None,
|
@@ -97,26 +268,30 @@ class ExonGenomicCoordsMapper:
|
|
97
268
|
exon_start_offset: int = 0,
|
98
269
|
exon_end: int | None = None,
|
99
270
|
exon_end_offset: int = 0,
|
100
|
-
) ->
|
101
|
-
"""Get genomic data given transcript data.
|
271
|
+
) -> GenomicTxSegService:
|
272
|
+
"""Get aligned genomic data given transcript segment data.
|
102
273
|
|
103
274
|
By default, transcript data is aligned to the GRCh38 assembly.
|
104
275
|
|
105
276
|
>>> import asyncio
|
106
|
-
>>> from cool_seq_tool
|
277
|
+
>>> from cool_seq_tool import CoolSeqTool
|
107
278
|
>>> egc = CoolSeqTool().ex_g_coords_mapper
|
108
279
|
>>> tpm3 = asyncio.run(
|
109
|
-
... egc.
|
280
|
+
... egc.tx_segment_to_genomic(
|
110
281
|
... "NM_152263.3",
|
111
282
|
... gene="TPM3",
|
112
283
|
... exon_start=1,
|
113
284
|
... exon_end=8,
|
114
285
|
... )
|
115
286
|
... )
|
116
|
-
>>>
|
287
|
+
>>> (
|
288
|
+
... tpm3.genomic_ac,
|
289
|
+
... tpm3.seg_start.genomic_location.end,
|
290
|
+
... tpm3.seg_end.genomic_location.start,
|
291
|
+
... )
|
117
292
|
('NC_000001.11', 154192135, 154170399)
|
118
293
|
|
119
|
-
:param transcript:
|
294
|
+
:param transcript: RefSeq transcript accession
|
120
295
|
:param gene: HGNC gene symbol
|
121
296
|
:param exon_start: Starting transcript exon number (1-based). If not provided,
|
122
297
|
must provide ``exon_end``
|
@@ -126,318 +301,316 @@ class ExonGenomicCoordsMapper:
|
|
126
301
|
:param exon_end_offset: Ending exon offset
|
127
302
|
:return: GRCh38 genomic data (inter-residue coordinates)
|
128
303
|
"""
|
129
|
-
resp = GenomicDataResponse(
|
130
|
-
genomic_data=None, warnings=[], service_meta=service_meta()
|
131
|
-
)
|
132
|
-
|
133
304
|
# Ensure valid inputs
|
134
|
-
|
135
|
-
if not transcript:
|
136
|
-
warnings.append("Must provide `transcript`")
|
137
|
-
else:
|
138
|
-
transcript = transcript.strip()
|
139
|
-
|
305
|
+
errors = []
|
140
306
|
exon_start_exists, exon_end_exists = False, False
|
141
307
|
if exon_start is not None:
|
142
308
|
if exon_start < 1:
|
143
|
-
|
309
|
+
errors.append("`exon_start` cannot be less than 1")
|
144
310
|
exon_start_exists = True
|
145
311
|
|
146
312
|
if exon_end is not None:
|
147
313
|
if exon_end < 1:
|
148
|
-
|
314
|
+
errors.append("`exon_end` cannot be less than 1")
|
149
315
|
exon_end_exists = True
|
150
316
|
|
151
317
|
if not exon_start_exists and not exon_end_exists:
|
152
|
-
|
318
|
+
errors.append("Must provide either `exon_start` or `exon_end`")
|
153
319
|
if exon_start_exists and exon_end_exists and (exon_start > exon_end):
|
154
|
-
|
320
|
+
errors.append(
|
155
321
|
f"Start exon {exon_start} is greater than end exon {exon_end}"
|
156
322
|
)
|
157
323
|
|
158
|
-
if
|
159
|
-
return
|
160
|
-
|
161
|
-
# Get all exons and associated start/end coordinates for transcript
|
162
|
-
tx_exons, warning = await self.uta_db.get_tx_exons(transcript)
|
163
|
-
if not tx_exons:
|
164
|
-
return self._return_warnings(resp, [warning] if warning else [])
|
324
|
+
if errors:
|
325
|
+
return _return_service_errors(errors)
|
165
326
|
|
166
327
|
# Get exon start and exon end coordinates
|
167
|
-
|
168
|
-
|
328
|
+
(
|
329
|
+
tx_exon_start_coords,
|
330
|
+
tx_exon_end_coords,
|
331
|
+
errors,
|
332
|
+
) = await self._get_start_end_exon_coords(
|
333
|
+
transcript, exon_start=exon_start, exon_end=exon_end
|
169
334
|
)
|
170
|
-
if
|
171
|
-
return
|
172
|
-
tx_exon_start_coords, tx_exon_end_coords = tx_exon_coords
|
335
|
+
if errors:
|
336
|
+
return _return_service_errors(errors)
|
173
337
|
|
174
338
|
if gene:
|
175
|
-
gene = gene.upper()
|
339
|
+
gene = gene.upper()
|
176
340
|
|
177
341
|
# Get aligned genomic data (hgnc gene, alt_ac, alt_start_i, alt_end_i, strand)
|
178
342
|
# for exon(s)
|
179
|
-
alt_ac_start_end,
|
343
|
+
alt_ac_start_end, err_msg = await self._get_alt_ac_start_and_end(
|
180
344
|
transcript, tx_exon_start_coords, tx_exon_end_coords, gene=gene
|
181
345
|
)
|
182
346
|
if not alt_ac_start_end:
|
183
|
-
return
|
347
|
+
return _return_service_errors([err_msg] if err_msg else [])
|
184
348
|
alt_ac_start_data, alt_ac_end_data = alt_ac_start_end
|
185
349
|
|
186
350
|
# Get gene and chromosome data, check that at least one was retrieved
|
187
|
-
gene = alt_ac_start_data
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
351
|
+
gene = alt_ac_start_data.hgnc if alt_ac_start_data else alt_ac_end_data.hgnc
|
352
|
+
genomic_ac = (
|
353
|
+
alt_ac_start_data.alt_ac if alt_ac_start_data else alt_ac_end_data.alt_ac
|
354
|
+
)
|
355
|
+
if gene is None or genomic_ac is None:
|
356
|
+
return _return_service_errors(
|
192
357
|
[
|
193
|
-
"Unable to retrieve `gene` or `
|
358
|
+
"Unable to retrieve `gene` or `genomic_ac` from genomic start and genomic end data"
|
194
359
|
],
|
195
360
|
)
|
196
361
|
|
197
|
-
g_start = alt_ac_start_data[3] - 1 if alt_ac_start_data else None
|
198
|
-
g_end = alt_ac_end_data[2] + 1 if alt_ac_end_data else None
|
199
362
|
strand = (
|
200
|
-
Strand(alt_ac_start_data
|
363
|
+
Strand(alt_ac_start_data.alt_strand)
|
201
364
|
if alt_ac_start_data
|
202
|
-
else Strand(alt_ac_end_data
|
365
|
+
else Strand(alt_ac_end_data.alt_strand)
|
203
366
|
)
|
204
367
|
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
368
|
+
if exon_start_exists:
|
369
|
+
seg_start, err_msg = self._get_tx_segment(
|
370
|
+
genomic_ac,
|
371
|
+
strand,
|
372
|
+
exon_start_offset,
|
373
|
+
alt_ac_start_data,
|
374
|
+
is_seg_start=True,
|
375
|
+
)
|
376
|
+
if err_msg:
|
377
|
+
return _return_service_errors([err_msg])
|
213
378
|
else:
|
214
|
-
|
215
|
-
end_offset = exon_end_offset if end_exists else 0
|
379
|
+
seg_start = None
|
216
380
|
|
217
|
-
|
218
|
-
|
219
|
-
|
381
|
+
if exon_end_exists:
|
382
|
+
seg_end, err_msg = self._get_tx_segment(
|
383
|
+
genomic_ac, strand, exon_end_offset, alt_ac_end_data, is_seg_start=False
|
384
|
+
)
|
385
|
+
if err_msg:
|
386
|
+
return _return_service_errors([err_msg])
|
387
|
+
else:
|
388
|
+
seg_end = None
|
220
389
|
|
221
|
-
|
390
|
+
return GenomicTxSegService(
|
222
391
|
gene=gene,
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
exon_start_offset=exon_start_offset,
|
228
|
-
exon_end=exon_end if end_exists else None,
|
229
|
-
exon_end_offset=exon_end_offset,
|
230
|
-
transcript=transcript,
|
231
|
-
strand=strand,
|
392
|
+
genomic_ac=genomic_ac,
|
393
|
+
tx_ac=transcript,
|
394
|
+
seg_start=seg_start,
|
395
|
+
seg_end=seg_end,
|
232
396
|
)
|
233
397
|
|
234
|
-
|
235
|
-
|
236
|
-
async def genomic_to_transcript_exon_coordinates(
|
398
|
+
async def genomic_to_tx_segment(
|
237
399
|
self,
|
238
400
|
chromosome: str | None = None,
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
strand: Strand | None = None,
|
401
|
+
genomic_ac: str | None = None,
|
402
|
+
seg_start_genomic: int | None = None,
|
403
|
+
seg_end_genomic: int | None = None,
|
243
404
|
transcript: str | None = None,
|
244
405
|
get_nearest_transcript_junction: bool = False,
|
245
406
|
gene: str | None = None,
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
407
|
+
) -> GenomicTxSegService:
|
408
|
+
"""Get transcript segment data for genomic data, lifted over to GRCh38.
|
409
|
+
|
410
|
+
If liftover to GRCh38 is unsuccessful, will return errors.
|
411
|
+
|
412
|
+
Must provide inter-residue coordinates.
|
250
413
|
|
251
414
|
MANE Transcript data will be returned if and only if ``transcript`` is not
|
252
415
|
supplied. ``gene`` must be given in order to retrieve MANE Transcript data.
|
253
416
|
|
254
417
|
>>> import asyncio
|
255
|
-
>>> from cool_seq_tool
|
418
|
+
>>> from cool_seq_tool import CoolSeqTool
|
256
419
|
>>> from cool_seq_tool.schemas import Strand
|
257
420
|
>>> egc = CoolSeqTool().ex_g_coords_mapper
|
258
421
|
>>> result = asyncio.run(
|
259
|
-
... egc.
|
260
|
-
...
|
261
|
-
...
|
262
|
-
...
|
263
|
-
... strand=Strand.NEGATIVE,
|
422
|
+
... egc.genomic_to_tx_segment(
|
423
|
+
... genomic_ac="NC_000001.11",
|
424
|
+
... seg_start_genomic=154192135,
|
425
|
+
... seg_end_genomic=154170399,
|
264
426
|
... transcript="NM_152263.3",
|
265
427
|
... )
|
266
428
|
... )
|
267
|
-
>>> result.
|
268
|
-
(
|
429
|
+
>>> result.seg_start.exon_ord, result.seg_end.exon_ord
|
430
|
+
(0, 7)
|
269
431
|
|
270
432
|
:param chromosome: e.g. ``"1"`` or ``"chr1"``. If not provided, must provide
|
271
|
-
``
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
:param
|
277
|
-
:param
|
433
|
+
``genomic_ac``. If ``genomic_ac`` is also provided, ``genomic_ac`` will be
|
434
|
+
used.
|
435
|
+
:param genomic_ac: Genomic accession (e.g. ``NC_000001.11``). If not provided,
|
436
|
+
must provide ``chromosome``. If ``chromosome`` is also provided,
|
437
|
+
``genomic_ac`` will be used.
|
438
|
+
:param seg_start_genomic: Genomic position where the transcript segment starts
|
439
|
+
:param seg_end_genomic: Genomic position where the transcript segment ends
|
278
440
|
:param transcript: The transcript to use. If this is not given, we will try the
|
279
441
|
following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining
|
280
442
|
Compatible Transcript. See the :ref:`Transcript Selection policy <transcript_selection_policy>`
|
281
443
|
page.
|
282
444
|
:param get_nearest_transcript_junction: If ``True``, this will return the
|
283
|
-
adjacent exon if the position specified by``
|
284
|
-
occur on an exon. For the positive strand, adjacent
|
285
|
-
preceding the breakpoint for the 5' end and the exon
|
286
|
-
breakpoint for the 3' end. For the negative strand, adjacent
|
287
|
-
the exon following the breakpoint for the 5' end and the exon
|
288
|
-
breakpoint for the 3' end.
|
445
|
+
adjacent exon if the position specified by ``seg_start_genomic`` or
|
446
|
+
``seg_end_genomic`` does not occur on an exon. For the positive strand, adjacent
|
447
|
+
is defined as the exon preceding the breakpoint for the 5' end and the exon
|
448
|
+
following the breakpoint for the 3' end. For the negative strand, adjacent
|
449
|
+
is defined as the exon following the breakpoint for the 5' end and the exon
|
450
|
+
preceding the breakpoint for the 3' end.
|
289
451
|
:param gene: gene name. Ideally, HGNC symbol. Must be given if no ``transcript``
|
290
452
|
value is provided.
|
291
|
-
:param
|
453
|
+
:param coordinate_type: Coordinate type for ``seg_start_genomic`` and
|
454
|
+
``seg_end_genomic``
|
292
455
|
:return: Genomic data (inter-residue coordinates)
|
293
456
|
"""
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
warnings.append("Must provide either `chromosome` or `alt_ac`")
|
457
|
+
errors = []
|
458
|
+
if seg_start_genomic is None and seg_end_genomic is None:
|
459
|
+
errors.append(
|
460
|
+
"Must provide either `seg_start_genomic` or `seg_end_genomic`"
|
461
|
+
)
|
462
|
+
if chromosome is None and genomic_ac is None:
|
463
|
+
errors.append("Must provide either `chromosome` or `alt_ac`")
|
302
464
|
if transcript is None and gene is None:
|
303
|
-
|
304
|
-
if
|
305
|
-
return
|
465
|
+
errors.append("Must provide either `gene` or `transcript`")
|
466
|
+
if errors:
|
467
|
+
return _return_service_errors(errors)
|
306
468
|
|
307
|
-
params = {key: None for key in GenomicData.model_fields}
|
308
469
|
if gene is not None:
|
309
|
-
gene = gene.upper()
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
start_data = await self._genomic_to_transcript_exon_coordinate(
|
317
|
-
start,
|
470
|
+
gene = gene.upper()
|
471
|
+
|
472
|
+
params = {}
|
473
|
+
|
474
|
+
if seg_start_genomic:
|
475
|
+
start_tx_seg_data = await self._genomic_to_tx_segment(
|
476
|
+
seg_start_genomic,
|
318
477
|
chromosome=chromosome,
|
319
|
-
|
320
|
-
strand=strand,
|
478
|
+
genomic_ac=genomic_ac,
|
321
479
|
transcript=transcript,
|
322
480
|
gene=gene,
|
323
481
|
get_nearest_transcript_junction=get_nearest_transcript_junction,
|
324
482
|
is_start=True,
|
325
483
|
)
|
326
|
-
if
|
327
|
-
|
328
|
-
|
329
|
-
|
484
|
+
if start_tx_seg_data.errors:
|
485
|
+
return _return_service_errors(start_tx_seg_data.errors)
|
486
|
+
|
487
|
+
params["gene"] = start_tx_seg_data.gene
|
488
|
+
params["genomic_ac"] = start_tx_seg_data.genomic_ac
|
489
|
+
params["tx_ac"] = start_tx_seg_data.tx_ac
|
490
|
+
params["seg_start"] = start_tx_seg_data.seg
|
330
491
|
else:
|
331
|
-
|
492
|
+
start_tx_seg_data = None
|
332
493
|
|
333
|
-
if
|
334
|
-
|
335
|
-
|
336
|
-
end_data = await self._genomic_to_transcript_exon_coordinate(
|
337
|
-
end,
|
494
|
+
if seg_end_genomic:
|
495
|
+
end_tx_seg_data = await self._genomic_to_tx_segment(
|
496
|
+
seg_end_genomic,
|
338
497
|
chromosome=chromosome,
|
339
|
-
|
340
|
-
strand=strand,
|
498
|
+
genomic_ac=genomic_ac,
|
341
499
|
transcript=transcript,
|
342
500
|
gene=gene,
|
343
501
|
get_nearest_transcript_junction=get_nearest_transcript_junction,
|
344
502
|
is_start=False,
|
345
503
|
)
|
346
|
-
if
|
347
|
-
|
504
|
+
if end_tx_seg_data.errors:
|
505
|
+
return _return_service_errors(end_tx_seg_data.errors)
|
506
|
+
|
507
|
+
if start_tx_seg_data:
|
508
|
+
# Need to check that gene, genomic_ac, tx_ac all match
|
509
|
+
errors = []
|
510
|
+
for attr in ["gene", "genomic_ac", "tx_ac"]:
|
511
|
+
start_seg_attr = params[attr]
|
512
|
+
end_seg_attr = getattr(end_tx_seg_data, attr)
|
513
|
+
if start_seg_attr != end_seg_attr:
|
514
|
+
errors.append(
|
515
|
+
f"Start end end segment mismatch for `{attr}`. {start_seg_attr} != {end_seg_attr}."
|
516
|
+
)
|
517
|
+
if errors:
|
518
|
+
return _return_service_errors(errors)
|
348
519
|
else:
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
for field in ["transcript", "gene", "chr", "strand"]:
|
354
|
-
if start_data:
|
355
|
-
if end_data and (start_data[field] != end_data[field]):
|
356
|
-
msg = (
|
357
|
-
f"Start `{field}`, {start_data[field]}, does "
|
358
|
-
f"not match End `{field}`, {end_data[field]}"
|
359
|
-
)
|
360
|
-
return self._return_warnings(resp, [msg])
|
361
|
-
params[field] = start_data[field]
|
362
|
-
else:
|
363
|
-
params[field] = end_data[field]
|
520
|
+
params["gene"] = end_tx_seg_data.gene
|
521
|
+
params["genomic_ac"] = end_tx_seg_data.genomic_ac
|
522
|
+
params["tx_ac"] = end_tx_seg_data.tx_ac
|
364
523
|
|
365
|
-
|
366
|
-
msg = (
|
367
|
-
f"Input gene, {gene}, does not match expected output"
|
368
|
-
f"gene, {params['gene']}"
|
369
|
-
)
|
370
|
-
return self._return_warnings(resp, [msg])
|
524
|
+
params["seg_end"] = end_tx_seg_data.seg
|
371
525
|
|
372
|
-
|
373
|
-
if data:
|
374
|
-
params[label] = data["pos"]
|
375
|
-
params[f"exon_{label}"] = data["exon"]
|
376
|
-
params[f"exon_{label}_offset"] = data["exon_offset"]
|
377
|
-
resp.genomic_data = GenomicData(**params)
|
378
|
-
return resp
|
526
|
+
return GenomicTxSegService(**params)
|
379
527
|
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
:param tx_exons: List of transcript's exons and associated coordinates
|
388
|
-
:param exon_number: Exon number to validate
|
389
|
-
:return: Exon coordinates for a given exon number and warnings if found
|
390
|
-
"""
|
391
|
-
msg = f"Exon {exon_number} does not exist on {transcript}"
|
392
|
-
try:
|
393
|
-
if exon_number < 1:
|
394
|
-
return None, msg
|
395
|
-
exon = tx_exons[exon_number - 1]
|
396
|
-
except IndexError:
|
397
|
-
return None, msg
|
398
|
-
return exon, None
|
528
|
+
async def _get_all_exon_coords(
|
529
|
+
self, tx_ac: str, genomic_ac: str | None = None
|
530
|
+
) -> list[ExonCoord]:
|
531
|
+
"""Get all exon coordinate data for a transcript.
|
532
|
+
|
533
|
+
If ``genomic_ac`` is NOT provided, this method will use the GRCh38 accession
|
534
|
+
associated with ``tx_ac``.
|
399
535
|
|
400
|
-
|
536
|
+
:param tx_ac: The RefSeq transcript accession to get exon data for.
|
537
|
+
:param genomic_ac: The RefSeq genomic accession to get exon data for.
|
538
|
+
:return: List of all exon coordinate data for ``tx_ac`` and ``genomic_ac``.
|
539
|
+
The exon coordinate data will include the exon number, transcript and
|
540
|
+
genomic positions for the start and end of the exon, and strand.
|
541
|
+
The list will be ordered by ascending exon number.
|
542
|
+
"""
|
543
|
+
if genomic_ac:
|
544
|
+
query = f"""
|
545
|
+
SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
|
546
|
+
FROM {self.uta_db.schema}.tx_exon_aln_v
|
547
|
+
WHERE tx_ac = '{tx_ac}'
|
548
|
+
AND alt_aln_method = 'splign'
|
549
|
+
AND alt_ac = '{genomic_ac}'
|
550
|
+
ORDER BY ord ASC
|
551
|
+
""" # noqa: S608
|
552
|
+
else:
|
553
|
+
query = f"""
|
554
|
+
SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
|
555
|
+
FROM {self.uta_db.schema}.tx_exon_aln_v as t
|
556
|
+
INNER JOIN {self.uta_db.schema}._seq_anno_most_recent as s
|
557
|
+
ON t.alt_ac = s.ac
|
558
|
+
WHERE s.descr = ''
|
559
|
+
AND t.tx_ac = '{tx_ac}'
|
560
|
+
AND t.alt_aln_method = 'splign'
|
561
|
+
AND t.alt_ac like 'NC_000%'
|
562
|
+
ORDER BY ord ASC
|
563
|
+
""" # noqa: S608
|
564
|
+
|
565
|
+
results = await self.uta_db.execute_query(query)
|
566
|
+
return [ExonCoord(**r) for r in results]
|
567
|
+
|
568
|
+
async def _get_start_end_exon_coords(
|
401
569
|
self,
|
402
|
-
|
403
|
-
tx_exons: list[tuple[int, int]],
|
570
|
+
tx_ac: str,
|
404
571
|
exon_start: int | None = None,
|
405
572
|
exon_end: int | None = None,
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
:param
|
414
|
-
:param exon_start: Start exon number
|
415
|
-
:param exon_end: End exon number
|
416
|
-
:
|
417
|
-
|
573
|
+
genomic_ac: str | None = None,
|
574
|
+
) -> tuple[ExonCoord | None, ExonCoord | None, list[str]]:
|
575
|
+
"""Get exon coordinates for a transcript given exon start and exon end.
|
576
|
+
|
577
|
+
If ``genomic_ac`` is NOT provided, this method will use the GRCh38 accession
|
578
|
+
associated with ``tx_ac``.
|
579
|
+
|
580
|
+
:param tx_ac: The RefSeq transcript accession to get exon data for.
|
581
|
+
:param exon_start: Start exon number to get coordinate data for. 1-based.
|
582
|
+
:param exon_end: End exon number to get coordinate data for. 1-based.
|
583
|
+
:param genomic_ac: The RefSeq genomic accession to get exon data for.
|
584
|
+
:return: Tuple containing start exon coordinate data, end exon coordinate data,
|
585
|
+
and list of errors. The exon coordinate data will include the exon number,
|
586
|
+
transcript and genomic positions for the start and end of the exon, and
|
587
|
+
strand.
|
418
588
|
"""
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
)
|
423
|
-
if not tx_exon_start:
|
424
|
-
return None, warning
|
425
|
-
else:
|
426
|
-
tx_exon_start = None
|
589
|
+
tx_exons = await self._get_all_exon_coords(tx_ac, genomic_ac=genomic_ac)
|
590
|
+
if not tx_exons:
|
591
|
+
return None, None, [f"No exons found given {tx_ac}"]
|
427
592
|
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
593
|
+
errors = []
|
594
|
+
start_end_exons = []
|
595
|
+
for exon_num in [exon_start, exon_end]:
|
596
|
+
if exon_num is not None:
|
597
|
+
try:
|
598
|
+
start_end_exons.append(tx_exons[exon_num - 1])
|
599
|
+
continue
|
600
|
+
except IndexError:
|
601
|
+
errors.append(f"Exon {exon_num} does not exist on {tx_ac}")
|
602
|
+
start_end_exons.append(None)
|
603
|
+
|
604
|
+
if errors:
|
605
|
+
start_end_exons = [None, None]
|
606
|
+
|
607
|
+
return *start_end_exons, errors
|
435
608
|
|
436
609
|
async def _get_alt_ac_start_and_end(
|
437
610
|
self,
|
438
611
|
tx_ac: str,
|
439
|
-
tx_exon_start:
|
440
|
-
tx_exon_end:
|
612
|
+
tx_exon_start: ExonCoord | None = None,
|
613
|
+
tx_exon_end: ExonCoord | None = None,
|
441
614
|
gene: str | None = None,
|
442
615
|
) -> tuple[tuple[tuple[int, int], tuple[int, int]] | None, str | None]:
|
443
616
|
"""Get aligned genomic coordinates for transcript exon start and end.
|
@@ -459,7 +632,7 @@ class ExonGenomicCoordsMapper:
|
|
459
632
|
for exon, key in [(tx_exon_start, "start"), (tx_exon_end, "end")]:
|
460
633
|
if exon:
|
461
634
|
alt_ac_val, warning = await self.uta_db.get_alt_ac_start_or_end(
|
462
|
-
tx_ac, exon
|
635
|
+
tx_ac, exon.tx_start_i, exon.tx_end_i, gene=gene
|
463
636
|
)
|
464
637
|
if alt_ac_val:
|
465
638
|
alt_ac_data[key] = alt_ac_val
|
@@ -470,78 +643,84 @@ class ExonGenomicCoordsMapper:
|
|
470
643
|
# Validate that start and end alignments have matching gene, genomic accession,
|
471
644
|
# and strand
|
472
645
|
if all(alt_ac_data_values):
|
473
|
-
for
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
error = "Genomic accession does not match"
|
479
|
-
else:
|
480
|
-
error = "Strand does not match"
|
646
|
+
for attr in ["hgnc", "alt_ac", "alt_strand"]:
|
647
|
+
start_attr = getattr(alt_ac_data["start"], attr)
|
648
|
+
end_attr = getattr(alt_ac_data["end"], attr)
|
649
|
+
if start_attr != end_attr:
|
650
|
+
error = f"{attr} mismatch. {start_attr} != {end_attr}."
|
481
651
|
_logger.warning(
|
482
652
|
"%s: %s != %s",
|
483
653
|
error,
|
484
|
-
|
485
|
-
|
654
|
+
start_attr,
|
655
|
+
end_attr,
|
486
656
|
)
|
487
657
|
return None, error
|
488
658
|
return tuple(alt_ac_data_values), None
|
489
659
|
|
490
|
-
async def
|
660
|
+
async def _genomic_to_tx_segment(
|
491
661
|
self,
|
492
|
-
|
662
|
+
genomic_pos: int,
|
493
663
|
chromosome: str | None = None,
|
494
|
-
|
495
|
-
strand: Strand | None = None,
|
664
|
+
genomic_ac: str | None = None,
|
496
665
|
transcript: str | None = None,
|
497
666
|
gene: str | None = None,
|
498
667
|
get_nearest_transcript_junction: bool = False,
|
499
668
|
is_start: bool = True,
|
500
|
-
) ->
|
501
|
-
"""
|
669
|
+
) -> GenomicTxSeg:
|
670
|
+
"""Given genomic data, generate a boundary for a transcript segment.
|
671
|
+
|
672
|
+
Will liftover to GRCh38 assembly. If liftover is unsuccessful, will return
|
673
|
+
errors.
|
502
674
|
|
503
|
-
:param
|
675
|
+
:param genomic_pos: Genomic position where the transcript segment starts or ends
|
676
|
+
(inter-residue based)
|
504
677
|
:param chromosome: Chromosome. Must give chromosome without a prefix
|
505
|
-
(i.e. ``1`` or ``X``). If not provided, must provide ``
|
506
|
-
|
507
|
-
|
508
|
-
|
678
|
+
(i.e. ``1`` or ``X``). If not provided, must provide ``genomic_ac``. If
|
679
|
+
position maps to both GRCh37 and GRCh38, GRCh38 assembly will be used.
|
680
|
+
If ``genomic_ac`` is also provided, ``genomic_ac`` will be used.
|
681
|
+
:param genomic_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided,
|
682
|
+
must provide ``chromosome. If ``chromosome`` is also provided, ``genomic_ac``
|
509
683
|
will be used.
|
510
|
-
:param strand: Strand
|
511
684
|
:param transcript: The transcript to use. If this is not given, we will try the
|
512
685
|
following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining
|
513
686
|
Compatible Transcript
|
514
687
|
:param gene: HGNC gene symbol
|
515
688
|
:param get_nearest_transcript_junction: If ``True``, this will return the
|
516
|
-
adjacent exon if the position specified by``
|
517
|
-
occur on an exon. For the positive strand, adjacent
|
518
|
-
preceding the breakpoint for the 5' end and the exon
|
519
|
-
breakpoint for the 3' end. For the negative strand, adjacent
|
520
|
-
the exon following the breakpoint for the 5' end and the exon
|
521
|
-
breakpoint for the 3' end.
|
522
|
-
:param is_start: ``True`` if ``
|
523
|
-
|
524
|
-
:return:
|
689
|
+
adjacent exon if the position specified by``seg_start_genomic`` or
|
690
|
+
``seg_end_genomic`` does not occur on an exon. For the positive strand, adjacent
|
691
|
+
is defined as the exon preceding the breakpoint for the 5' end and the exon
|
692
|
+
following the breakpoint for the 3' end. For the negative strand, adjacent
|
693
|
+
is defined as the exon following the breakpoint for the 5' end and the exon
|
694
|
+
preceding the breakpoint for the 3' end.
|
695
|
+
:param is_start: ``True`` if ``genomic_pos`` is where the transcript segment starts.
|
696
|
+
``False`` if ``genomic_pos`` is where the transcript segment ends.
|
697
|
+
:return: Data for a transcript segment boundary (inter-residue coordinates)
|
525
698
|
"""
|
526
|
-
|
527
|
-
transcript_exon_data=None, warnings=[], service_meta=service_meta()
|
528
|
-
)
|
529
|
-
params = {key: None for key in TranscriptExonData.model_fields}
|
699
|
+
params = {key: None for key in GenomicTxSeg.model_fields}
|
530
700
|
|
531
701
|
if get_nearest_transcript_junction:
|
532
|
-
if not gene
|
533
|
-
return
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
],
|
702
|
+
if not gene:
|
703
|
+
return GenomicTxSeg(
|
704
|
+
errors=[
|
705
|
+
"`gene` must be provided to select the adjacent transcript junction"
|
706
|
+
]
|
538
707
|
)
|
539
|
-
if not alt_ac:
|
540
|
-
alt_acs, w = self.seqrepo_access.chromosome_to_acs(chromosome)
|
541
708
|
|
542
|
-
|
543
|
-
|
544
|
-
|
709
|
+
if not genomic_ac:
|
710
|
+
genomic_acs, err_msg = self.seqrepo_access.chromosome_to_acs(chromosome)
|
711
|
+
|
712
|
+
if not genomic_acs:
|
713
|
+
return GenomicTxSeg(
|
714
|
+
errors=[err_msg],
|
715
|
+
)
|
716
|
+
genomic_ac = genomic_acs[0]
|
717
|
+
|
718
|
+
# Always liftover to GRCh38
|
719
|
+
genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos(
|
720
|
+
genomic_ac, genomic_pos
|
721
|
+
)
|
722
|
+
if err_msg:
|
723
|
+
return GenomicTxSeg(errors=[err_msg])
|
545
724
|
|
546
725
|
if not transcript:
|
547
726
|
# Select a transcript if not provided
|
@@ -555,7 +734,7 @@ class ExonGenomicCoordsMapper:
|
|
555
734
|
# Attempt to find a coding transcript if a MANE transcript
|
556
735
|
# cannot be found
|
557
736
|
results = await self.uta_db.get_transcripts(
|
558
|
-
gene=gene, alt_ac=
|
737
|
+
gene=gene, alt_ac=genomic_ac
|
559
738
|
)
|
560
739
|
|
561
740
|
if not results.is_empty():
|
@@ -566,376 +745,415 @@ class ExonGenomicCoordsMapper:
|
|
566
745
|
SELECT DISTINCT tx_ac
|
567
746
|
FROM {self.uta_db.schema}.tx_exon_aln_v
|
568
747
|
WHERE hgnc = '{gene}'
|
569
|
-
AND alt_ac = '{
|
748
|
+
AND alt_ac = '{genomic_ac}'
|
570
749
|
""" # noqa: S608
|
571
750
|
result = await self.uta_db.execute_query(query)
|
572
751
|
|
573
752
|
if result:
|
574
753
|
transcript = result[0]["tx_ac"]
|
575
754
|
else:
|
576
|
-
return
|
577
|
-
|
578
|
-
|
755
|
+
return GenomicTxSeg(
|
756
|
+
errors=[
|
757
|
+
f"Could not find a transcript for {gene} on {genomic_ac}"
|
758
|
+
]
|
579
759
|
)
|
580
760
|
|
581
|
-
|
582
|
-
tx_ac=transcript,
|
761
|
+
tx_exons = await self._get_all_exon_coords(
|
762
|
+
tx_ac=transcript, genomic_ac=genomic_ac
|
583
763
|
)
|
584
|
-
if not
|
585
|
-
return
|
764
|
+
if not tx_exons:
|
765
|
+
return GenomicTxSeg(errors=[f"No exons found given {transcript}"])
|
766
|
+
|
767
|
+
strand = Strand(tx_exons[0].alt_strand)
|
768
|
+
params["strand"] = strand
|
586
769
|
|
587
770
|
# Check if breakpoint occurs on an exon.
|
588
771
|
# If not, determine the adjacent exon given the selected transcript
|
589
|
-
if not self._is_exonic_breakpoint(
|
590
|
-
|
591
|
-
tx_exons_genomic_coords=
|
772
|
+
if not self._is_exonic_breakpoint(genomic_pos, tx_exons):
|
773
|
+
exon_num = self._get_adjacent_exon(
|
774
|
+
tx_exons_genomic_coords=tx_exons,
|
592
775
|
strand=strand,
|
593
|
-
start=
|
594
|
-
end=
|
776
|
+
start=genomic_pos if is_start else None,
|
777
|
+
end=genomic_pos if not is_start else None,
|
595
778
|
)
|
596
779
|
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
params["pos"] = pos
|
601
|
-
params["chr"] = alt_ac
|
602
|
-
|
603
|
-
self._set_exon_offset(
|
604
|
-
params=params,
|
605
|
-
start=tx_genomic_coords[exon - 1][3], # Start exon coordinate
|
606
|
-
end=tx_genomic_coords[exon - 1][4], # End exon coordinate
|
607
|
-
pos=pos,
|
608
|
-
is_start=is_start,
|
780
|
+
offset = self._get_exon_offset(
|
781
|
+
start_i=tx_exons[exon_num].alt_start_i,
|
782
|
+
end_i=tx_exons[exon_num].alt_end_i,
|
609
783
|
strand=strand,
|
784
|
+
use_start_i=strand == Strand.POSITIVE
|
785
|
+
if is_start
|
786
|
+
else strand != Strand.POSITIVE,
|
787
|
+
is_in_exon=False,
|
788
|
+
start=genomic_pos if is_start else None,
|
789
|
+
end=genomic_pos if not is_start else None,
|
610
790
|
)
|
611
|
-
params["strand"] = strand.value
|
612
|
-
resp.transcript_exon_data = TranscriptExonData(**params)
|
613
|
-
return resp
|
614
791
|
|
615
|
-
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
|
792
|
+
genomic_location, err_msg = self._get_vrs_seq_loc(
|
793
|
+
genomic_ac, genomic_pos, is_start, strand
|
794
|
+
)
|
795
|
+
if err_msg:
|
796
|
+
return GenomicTxSeg(errors=[err_msg])
|
797
|
+
|
798
|
+
return GenomicTxSeg(
|
799
|
+
gene=gene,
|
800
|
+
genomic_ac=genomic_ac,
|
801
|
+
tx_ac=transcript,
|
802
|
+
seg=TxSegment(
|
803
|
+
exon_ord=exon_num,
|
804
|
+
offset=offset,
|
805
|
+
genomic_location=genomic_location,
|
806
|
+
),
|
620
807
|
)
|
621
808
|
|
622
|
-
|
623
|
-
|
624
|
-
)
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
if
|
629
|
-
|
630
|
-
|
631
|
-
|
809
|
+
if genomic_ac:
|
810
|
+
# Check if valid accession is given
|
811
|
+
if not await self.uta_db.validate_genomic_ac(genomic_ac):
|
812
|
+
return GenomicTxSeg(errors=[f"Invalid genomic accession: {genomic_ac}"])
|
813
|
+
|
814
|
+
_gene, err_msg = await self._get_genomic_ac_gene(genomic_pos, genomic_ac)
|
815
|
+
if _gene:
|
816
|
+
if gene and _gene != gene:
|
817
|
+
return GenomicTxSeg(
|
818
|
+
errors=[f"Expected gene, {gene}, but found {_gene}"]
|
819
|
+
)
|
820
|
+
|
821
|
+
gene = _gene
|
632
822
|
else:
|
633
|
-
|
823
|
+
return GenomicTxSeg(errors=[err_msg])
|
824
|
+
elif chromosome:
|
825
|
+
# Try GRCh38 first
|
826
|
+
for assembly in [Assembly.GRCH38.value, Assembly.GRCH37.value]:
|
827
|
+
_genomic_acs, err_msg = self.seqrepo_access.translate_identifier(
|
828
|
+
f"{assembly}:chr{chromosome}", "refseq"
|
829
|
+
)
|
830
|
+
if err_msg:
|
831
|
+
return GenomicTxSeg(errors=[err_msg])
|
832
|
+
_genomic_ac = _genomic_acs[0].split(":")[-1]
|
634
833
|
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
834
|
+
_gene, err_msg = await self._get_genomic_ac_gene(
|
835
|
+
genomic_pos, _genomic_ac
|
836
|
+
)
|
837
|
+
if _gene:
|
838
|
+
if gene and _gene != gene:
|
839
|
+
return GenomicTxSeg(
|
840
|
+
errors=[f"Expected gene, {gene}, but found {_gene}"]
|
841
|
+
)
|
842
|
+
gene = _gene
|
843
|
+
genomic_ac = _genomic_ac
|
844
|
+
break
|
845
|
+
|
846
|
+
if not genomic_ac:
|
847
|
+
return GenomicTxSeg(
|
848
|
+
errors=[
|
849
|
+
f"Unable to get genomic RefSeq accession for chromosome {chromosome} on position {genomic_pos}"
|
850
|
+
]
|
851
|
+
)
|
640
852
|
|
641
|
-
|
642
|
-
|
853
|
+
if not gene:
|
854
|
+
return GenomicTxSeg(
|
855
|
+
errors=[
|
856
|
+
f"Unable to get gene given {genomic_ac} on position {genomic_pos}"
|
857
|
+
]
|
858
|
+
)
|
643
859
|
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
gene, alt_ac = gene_alt_ac
|
860
|
+
return await self._get_tx_seg_genomic_metadata(
|
861
|
+
genomic_ac, genomic_pos, is_start, gene, tx_ac=transcript
|
862
|
+
)
|
648
863
|
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
864
|
+
async def _get_grch38_ac_pos(
|
865
|
+
self, genomic_ac: str, genomic_pos: int, grch38_ac: str | None = None
|
866
|
+
) -> tuple[str | None, int | None, str | None]:
|
867
|
+
"""Get GRCh38 genomic representation for accession and position
|
868
|
+
|
869
|
+
:param genomic_ac: RefSeq genomic accession (GRCh37 or GRCh38 assembly)
|
870
|
+
:param genomic_pos: Genomic position on ``genomic_ac``
|
871
|
+
:param grch38_ac: A valid GRCh38 genomic accession for ``genomic_ac``. If not
|
872
|
+
provided, will attempt to retrieve associated GRCh38 accession from UTA.
|
873
|
+
:return: Tuple containing GRCh38 accession, GRCh38 position, and error message
|
874
|
+
if unable to get GRCh38 representation
|
875
|
+
"""
|
876
|
+
if not grch38_ac:
|
877
|
+
grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
|
878
|
+
if not grch38_ac:
|
879
|
+
return None, None, f"Unrecognized genomic accession: {genomic_ac}."
|
663
880
|
|
664
|
-
|
665
|
-
return resp
|
881
|
+
grch38_ac = grch38_ac[0]
|
666
882
|
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
"""Return gene genomic accession
|
672
|
-
|
673
|
-
:param genes_alt_acs: Dictionary containing genes and genomic accessions
|
674
|
-
:param gene: Gene symbol
|
675
|
-
:return: (Gene, Genomic accession) if both exist
|
676
|
-
"""
|
677
|
-
alt_acs = genes_alt_acs["alt_acs"]
|
678
|
-
len_alt_acs = len(alt_acs)
|
679
|
-
if len_alt_acs > 1:
|
680
|
-
return None, f"Found more than one accessions: {alt_acs}"
|
681
|
-
if len_alt_acs == 0:
|
682
|
-
return None, "No genomic accessions found"
|
683
|
-
alt_ac = next(iter(alt_acs))
|
684
|
-
|
685
|
-
genes = genes_alt_acs["genes"]
|
686
|
-
len_genes = len(genes)
|
687
|
-
input_gene = gene
|
688
|
-
output_gene = None
|
689
|
-
if len_genes == 1:
|
690
|
-
output_gene = next(iter(genes))
|
691
|
-
elif len_genes > 1:
|
692
|
-
return None, f"Found more than one gene: {genes}"
|
693
|
-
elif len_genes == 0:
|
694
|
-
return None, "No genes found"
|
695
|
-
|
696
|
-
if input_gene is not None and output_gene != input_gene.upper():
|
697
|
-
return (
|
698
|
-
None,
|
699
|
-
f"Input gene, {input_gene}, does not match "
|
700
|
-
f"expected output gene, {output_gene}",
|
883
|
+
if grch38_ac != genomic_ac:
|
884
|
+
# Ensure genomic_ac is GRCh37
|
885
|
+
chromosome, _ = self.seqrepo_access.translate_identifier(
|
886
|
+
genomic_ac, Assembly.GRCH37.value
|
701
887
|
)
|
888
|
+
if not chromosome:
|
889
|
+
_logger.warning(
|
890
|
+
"SeqRepo could not find associated %s assembly for genomic accession %s.",
|
891
|
+
Assembly.GRCH37.value,
|
892
|
+
genomic_ac,
|
893
|
+
)
|
894
|
+
return (
|
895
|
+
None,
|
896
|
+
None,
|
897
|
+
f"`genomic_ac` must use {Assembly.GRCH37.value} or {Assembly.GRCH38.value} assembly.",
|
898
|
+
)
|
899
|
+
|
900
|
+
chromosome = chromosome[-1].split(":")[-1]
|
901
|
+
liftover_data = self.liftover.get_liftover(
|
902
|
+
chromosome, genomic_pos, Assembly.GRCH38
|
903
|
+
)
|
904
|
+
if liftover_data is None:
|
905
|
+
return (
|
906
|
+
None,
|
907
|
+
None,
|
908
|
+
f"Lifting over {genomic_pos} on {genomic_ac} from {Assembly.GRCH37.value} to {Assembly.GRCH38.value} was unsuccessful.",
|
909
|
+
)
|
910
|
+
|
911
|
+
genomic_pos = liftover_data[1]
|
912
|
+
genomic_ac = grch38_ac
|
702
913
|
|
703
|
-
|
704
|
-
return (gene, alt_ac), None
|
914
|
+
return genomic_ac, genomic_pos, None
|
705
915
|
|
706
|
-
async def
|
916
|
+
async def _get_genomic_ac_gene(
|
707
917
|
self,
|
708
|
-
params: dict,
|
709
|
-
gene: str,
|
710
|
-
alt_ac: str,
|
711
918
|
pos: int,
|
919
|
+
genomic_ac: str,
|
920
|
+
) -> tuple[str | None, str | None]:
|
921
|
+
"""Get gene given a genomic accession and position.
|
922
|
+
|
923
|
+
If multiple genes are found for a given ``pos`` and ``genomic_ac``, only one
|
924
|
+
gene will be returned.
|
925
|
+
|
926
|
+
:param pos: Genomic position on ``genomic_ac``
|
927
|
+
:param genomic_ac: RefSeq genomic accession, e.g. ``"NC_000007.14"``
|
928
|
+
:return: HGNC gene symbol associated to genomic accession and position and
|
929
|
+
warning
|
930
|
+
"""
|
931
|
+
query = f"""
|
932
|
+
SELECT DISTINCT hgnc
|
933
|
+
FROM {self.uta_db.schema}.tx_exon_aln_v
|
934
|
+
WHERE alt_ac = '{genomic_ac}'
|
935
|
+
AND alt_aln_method = 'splign'
|
936
|
+
AND {pos} BETWEEN alt_start_i AND alt_end_i
|
937
|
+
ORDER BY hgnc
|
938
|
+
LIMIT 1;
|
939
|
+
""" # noqa: S608
|
940
|
+
results = await self.uta_db.execute_query(query)
|
941
|
+
if not results:
|
942
|
+
return None, f"No gene(s) found given {genomic_ac} on position {pos}"
|
943
|
+
|
944
|
+
return results[0]["hgnc"], None
|
945
|
+
|
946
|
+
def _get_tx_segment(
|
947
|
+
self,
|
948
|
+
genomic_ac: str,
|
712
949
|
strand: Strand,
|
713
|
-
|
714
|
-
|
715
|
-
|
950
|
+
offset: int,
|
951
|
+
genomic_ac_data: ExonCoord,
|
952
|
+
is_seg_start: bool = False,
|
953
|
+
) -> tuple[TxSegment | None, str | None]:
|
954
|
+
"""Get transcript segment data given ``genomic_ac`` and offset data
|
716
955
|
|
717
|
-
:param
|
718
|
-
:param gene: Gene symbol
|
719
|
-
:param alt_ac: Genomic accession
|
720
|
-
:param pos: Genomic position
|
956
|
+
:param genomic_ac: Genomic RefSeq accession
|
721
957
|
:param strand: Strand
|
722
|
-
:param
|
723
|
-
|
724
|
-
:
|
958
|
+
:param offset: Exon offset
|
959
|
+
:param genomic_ac_data: Exon coordinate data for ``genomic_ac``
|
960
|
+
:param is_seg_start: ``True`` if retrieving genomic data where the transcript
|
961
|
+
segment starts, defaults to ``False``
|
962
|
+
:return: Transcript segment data
|
725
963
|
"""
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
|
730
|
-
|
731
|
-
|
732
|
-
|
733
|
-
|
734
|
-
|
735
|
-
|
736
|
-
residue_mode=ResidueMode.INTER_RESIDUE,
|
737
|
-
)
|
738
|
-
if not mane_data:
|
739
|
-
msg = f"Unable to find mane data for {alt_ac} with position {pos}"
|
740
|
-
if gene:
|
741
|
-
msg += f" on gene {gene}"
|
742
|
-
_logger.warning(msg)
|
743
|
-
return msg
|
744
|
-
|
745
|
-
params["gene"] = mane_data.gene
|
746
|
-
params["transcript"] = (
|
747
|
-
mane_data.refseq
|
748
|
-
if mane_data.refseq
|
749
|
-
else mane_data.ensembl
|
750
|
-
if mane_data.ensembl
|
751
|
-
else None
|
752
|
-
)
|
753
|
-
tx_exons = await self._structure_exons(params["transcript"], alt_ac=alt_ac)
|
754
|
-
if not tx_exons:
|
755
|
-
return f"Unable to get exons for {params['transcript']}"
|
756
|
-
tx_pos = mane_data.pos[0] + mane_data.coding_start_site
|
757
|
-
params["exon"] = self._get_exon_number(tx_exons, tx_pos)
|
758
|
-
|
759
|
-
try:
|
760
|
-
tx_exon = tx_exons[params["exon"] - 1]
|
761
|
-
except IndexError:
|
762
|
-
msg = (
|
763
|
-
f"{params['transcript']} with position {tx_pos} "
|
764
|
-
f"does not exist on exons: {tx_exons}"
|
765
|
-
)
|
766
|
-
_logger.warning(msg)
|
767
|
-
return msg
|
768
|
-
|
769
|
-
strand_to_use = strand if strand is not None else mane_data.strand
|
770
|
-
params["strand"] = strand_to_use
|
771
|
-
self._set_exon_offset(
|
772
|
-
params,
|
773
|
-
tx_exon[0],
|
774
|
-
tx_exon[1],
|
775
|
-
tx_pos,
|
776
|
-
is_start=is_start,
|
777
|
-
strand=strand_to_use,
|
778
|
-
)
|
964
|
+
if is_seg_start:
|
965
|
+
if strand == Strand.POSITIVE:
|
966
|
+
seg_genomic_pos = offset + genomic_ac_data.alt_start_i
|
967
|
+
else:
|
968
|
+
seg_genomic_pos = genomic_ac_data.alt_end_i - offset
|
969
|
+
else:
|
970
|
+
if strand == Strand.POSITIVE:
|
971
|
+
seg_genomic_pos = offset + genomic_ac_data.alt_end_i
|
972
|
+
else:
|
973
|
+
seg_genomic_pos = genomic_ac_data.alt_start_i - offset
|
779
974
|
|
780
|
-
|
781
|
-
|
782
|
-
|
975
|
+
genomic_loc, err_msg = self._get_vrs_seq_loc(
|
976
|
+
genomic_ac,
|
977
|
+
seg_genomic_pos,
|
978
|
+
is_start=is_seg_start,
|
979
|
+
strand=strand,
|
783
980
|
)
|
784
|
-
if
|
785
|
-
return
|
786
|
-
|
787
|
-
|
788
|
-
|
789
|
-
|
790
|
-
|
791
|
-
|
792
|
-
|
793
|
-
|
981
|
+
if err_msg:
|
982
|
+
return None, err_msg
|
983
|
+
|
984
|
+
return TxSegment(
|
985
|
+
exon_ord=genomic_ac_data.ord,
|
986
|
+
genomic_location=genomic_loc,
|
987
|
+
offset=offset,
|
988
|
+
), None
|
989
|
+
|
990
|
+
def _get_vrs_seq_loc(
|
991
|
+
self, genomic_ac: str, genomic_pos: int, is_start: bool, strand: Strand
|
992
|
+
) -> tuple[SequenceLocation | None, str | None]:
|
993
|
+
"""Create VRS Sequence Location for genomic position where transcript segment
|
994
|
+
occurs
|
995
|
+
|
996
|
+
:param genomic_ac: RefSeq genomic accession
|
997
|
+
:param genomic_pos: Genomic position where the transcript segment occurs
|
998
|
+
:param is_start: ``True`` if ``genomic_pos`` is where the transcript segment
|
999
|
+
starts. ``False`` if ``genomic_pos`` is where the transcript segment ends.
|
1000
|
+
:param strand: Strand
|
1001
|
+
:return: Tuple containing VRS location (if successful) and error message (if
|
1002
|
+
unable to get GA4GH identifier for ``genomic_ac``).
|
1003
|
+
"""
|
1004
|
+
ga4gh_seq_id, err_msg = self.seqrepo_access.translate_identifier(
|
1005
|
+
genomic_ac, "ga4gh"
|
794
1006
|
)
|
795
|
-
|
1007
|
+
if err_msg:
|
1008
|
+
return None, err_msg
|
796
1009
|
|
797
|
-
|
798
|
-
self, params: dict, strand: Strand, is_start: bool
|
799
|
-
) -> str | None:
|
800
|
-
"""Set genomic data in ``params``
|
1010
|
+
use_start = strand == Strand.POSITIVE if is_start else strand != Strand.POSITIVE
|
801
1011
|
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
806
|
-
|
807
|
-
|
808
|
-
|
809
|
-
grch38_ac = await self.uta_db.get_newest_assembly_ac(params["chr"])
|
810
|
-
if not grch38_ac:
|
811
|
-
return f"Invalid genomic accession: {params['chr']}"
|
1012
|
+
return SequenceLocation(
|
1013
|
+
sequenceReference=SequenceReference(
|
1014
|
+
refgetAccession=ga4gh_seq_id[0].split("ga4gh:")[-1]
|
1015
|
+
),
|
1016
|
+
start=genomic_pos if use_start else None,
|
1017
|
+
end=genomic_pos if not use_start else None,
|
1018
|
+
), None
|
812
1019
|
|
813
|
-
|
814
|
-
|
815
|
-
|
816
|
-
|
817
|
-
|
818
|
-
|
1020
|
+
async def _get_tx_seg_genomic_metadata(
|
1021
|
+
self,
|
1022
|
+
genomic_ac: str,
|
1023
|
+
genomic_pos: int,
|
1024
|
+
is_start: bool,
|
1025
|
+
gene: str,
|
1026
|
+
tx_ac: str | None,
|
1027
|
+
) -> GenomicTxSeg:
|
1028
|
+
"""Get transcript segment data and associated genomic metadata.
|
819
1029
|
|
820
|
-
|
821
|
-
|
822
|
-
chromosome_number, params["pos"], Assembly.GRCH38
|
823
|
-
)
|
824
|
-
if liftover_data is None:
|
825
|
-
return (
|
826
|
-
f"Position {params['pos']} does not exist on "
|
827
|
-
f"chromosome {chromosome_number}"
|
828
|
-
)
|
1030
|
+
Will liftover to GRCh38 assembly. If liftover is unsuccessful, will return
|
1031
|
+
errors.
|
829
1032
|
|
830
|
-
|
831
|
-
params["chr"] = grch38_ac
|
1033
|
+
If ``tx_ac`` is not provided, will attempt to retrieve MANE transcript.
|
832
1034
|
|
833
|
-
|
1035
|
+
:param genomic_ac: Genomic RefSeq accession
|
1036
|
+
:param genomic_pos: Genomic position where the transcript segment occurs
|
1037
|
+
:param is_start: Whether or not ``genomic_pos`` represents the start position.
|
1038
|
+
:param gene: HGNC gene symbol
|
1039
|
+
:param tx_ac: Transcript RefSeq accession. If not provided, will use MANE
|
1040
|
+
transcript
|
1041
|
+
:return: Transcript segment data and associated genomic metadata
|
1042
|
+
"""
|
1043
|
+
if tx_ac:
|
1044
|
+
# We should always try to liftover
|
1045
|
+
grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
|
1046
|
+
if not grch38_ac:
|
1047
|
+
return GenomicTxSeg(errors=[f"Invalid genomic accession: {genomic_ac}"])
|
1048
|
+
grch38_ac = grch38_ac[0]
|
1049
|
+
else:
|
1050
|
+
mane_data = self.mane_transcript_mappings.get_gene_mane_data(gene)
|
1051
|
+
if not mane_data:
|
1052
|
+
err_msg = f"Unable to find mane data for {genomic_ac} with position {genomic_pos}"
|
1053
|
+
if gene:
|
1054
|
+
err_msg += f" on gene {gene}"
|
1055
|
+
_logger.warning(err_msg)
|
1056
|
+
return GenomicTxSeg(errors=[err_msg])
|
1057
|
+
|
1058
|
+
mane_data = mane_data[0]
|
1059
|
+
tx_ac = mane_data["RefSeq_nuc"]
|
1060
|
+
grch38_ac = mane_data["GRCh38_chr"]
|
1061
|
+
|
1062
|
+
# Always liftover to GRCh38
|
1063
|
+
genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos(
|
1064
|
+
genomic_ac, genomic_pos, grch38_ac=grch38_ac
|
1065
|
+
)
|
1066
|
+
if err_msg:
|
1067
|
+
return GenomicTxSeg(errors=[err_msg])
|
1068
|
+
|
1069
|
+
tx_exons = await self._get_all_exon_coords(tx_ac, genomic_ac=grch38_ac)
|
834
1070
|
if not tx_exons:
|
835
|
-
return f"
|
1071
|
+
return GenomicTxSeg(errors=[f"No exons found given {tx_ac}"])
|
836
1072
|
|
837
|
-
|
838
|
-
|
839
|
-
|
840
|
-
|
841
|
-
alt_ac=
|
1073
|
+
tx_exon_aln_data = await self.uta_db.get_tx_exon_aln_v_data(
|
1074
|
+
tx_ac,
|
1075
|
+
genomic_pos,
|
1076
|
+
genomic_pos,
|
1077
|
+
alt_ac=genomic_ac,
|
842
1078
|
use_tx_pos=False,
|
843
1079
|
)
|
844
|
-
if len(
|
845
|
-
return (
|
846
|
-
|
847
|
-
|
1080
|
+
if len(tx_exon_aln_data) != 1:
|
1081
|
+
return GenomicTxSeg(
|
1082
|
+
errors=[
|
1083
|
+
f"Must find exactly one row for genomic data, but found: {len(tx_exon_aln_data)}"
|
1084
|
+
]
|
848
1085
|
)
|
849
1086
|
|
850
|
-
|
851
|
-
|
852
|
-
|
853
|
-
|
854
|
-
|
855
|
-
|
856
|
-
|
857
|
-
|
858
|
-
|
859
|
-
|
860
|
-
|
861
|
-
|
862
|
-
|
863
|
-
|
864
|
-
|
865
|
-
|
866
|
-
|
867
|
-
|
868
|
-
|
869
|
-
|
870
|
-
|
871
|
-
|
872
|
-
|
873
|
-
|
874
|
-
|
875
|
-
|
876
|
-
|
1087
|
+
tx_exon_aln_data = tx_exon_aln_data[0]
|
1088
|
+
|
1089
|
+
offset = self._get_exon_offset(
|
1090
|
+
start_i=tx_exon_aln_data.alt_start_i,
|
1091
|
+
end_i=tx_exon_aln_data.alt_end_i,
|
1092
|
+
strand=Strand(tx_exon_aln_data.alt_strand),
|
1093
|
+
use_start_i=False, # This doesn't impact anything since we're on the exon
|
1094
|
+
is_in_exon=True,
|
1095
|
+
start=genomic_pos if is_start else None,
|
1096
|
+
end=genomic_pos if not is_start else None,
|
1097
|
+
)
|
1098
|
+
|
1099
|
+
genomic_location, err_msg = self._get_vrs_seq_loc(
|
1100
|
+
genomic_ac, genomic_pos, is_start, tx_exon_aln_data.alt_strand
|
1101
|
+
)
|
1102
|
+
if err_msg:
|
1103
|
+
return GenomicTxSeg(errors=[err_msg])
|
1104
|
+
|
1105
|
+
return GenomicTxSeg(
|
1106
|
+
gene=tx_exon_aln_data.hgnc,
|
1107
|
+
genomic_ac=genomic_ac,
|
1108
|
+
tx_ac=tx_exon_aln_data.tx_ac,
|
1109
|
+
seg=TxSegment(
|
1110
|
+
exon_ord=tx_exon_aln_data.ord,
|
1111
|
+
offset=offset,
|
1112
|
+
genomic_location=genomic_location,
|
1113
|
+
),
|
877
1114
|
)
|
878
|
-
return None
|
879
1115
|
|
880
1116
|
@staticmethod
|
881
|
-
def
|
882
|
-
|
883
|
-
|
884
|
-
|
885
|
-
|
886
|
-
:
|
887
|
-
|
888
|
-
|
889
|
-
|
890
|
-
|
891
|
-
|
1117
|
+
def _get_exon_offset(
|
1118
|
+
start_i: int,
|
1119
|
+
end_i: int,
|
1120
|
+
strand: Strand,
|
1121
|
+
use_start_i: bool = True,
|
1122
|
+
is_in_exon: bool = True,
|
1123
|
+
start: int | None = None,
|
1124
|
+
end: int | None = None,
|
1125
|
+
) -> int:
|
1126
|
+
"""Compute offset from exon start or end index
|
1127
|
+
|
1128
|
+
:param start_i: Exon start index (inter-residue)
|
1129
|
+
:param end_i: Exon end index (inter-residue)
|
892
1130
|
:param strand: Strand
|
1131
|
+
:param use_start_i: Whether or not ``start_i`` should be used to compute the
|
1132
|
+
offset, defaults to ``True``. This is only used when ``is_in_exon`` is
|
1133
|
+
``False``.
|
1134
|
+
:param is_in_exon: Whether or not the position occurs in an exon, defaults to
|
1135
|
+
``True``
|
1136
|
+
:param start: Provided start position, defaults to ``None``. Must provide
|
1137
|
+
``start`` or ``end``, not both.
|
1138
|
+
:param end: Provided end position, defaults to ``None``. Must provide ``start``
|
1139
|
+
or ``end``, not both
|
1140
|
+
:return: Offset from exon start or end index
|
893
1141
|
"""
|
894
|
-
if
|
895
|
-
if
|
896
|
-
|
1142
|
+
if is_in_exon:
|
1143
|
+
if start is not None:
|
1144
|
+
offset = start - start_i if strand == Strand.POSITIVE else end_i - start
|
897
1145
|
else:
|
898
|
-
|
1146
|
+
offset = end - end_i if strand == Strand.POSITIVE else start_i - end
|
899
1147
|
else:
|
900
|
-
if strand == Strand.
|
901
|
-
|
1148
|
+
if strand == Strand.POSITIVE:
|
1149
|
+
offset = start - start_i if use_start_i else end - end_i
|
902
1150
|
else:
|
903
|
-
|
904
|
-
|
905
|
-
async def _structure_exons(
|
906
|
-
self, transcript: str, alt_ac: str | None = None
|
907
|
-
) -> list[tuple[int, int]]:
|
908
|
-
"""Structure exons as list of tuples.
|
909
|
-
|
910
|
-
:param transcript: Transcript accession
|
911
|
-
:param alt_ac: Genomic accession
|
912
|
-
:return: List of tuples containing transcript exon coordinates
|
913
|
-
"""
|
914
|
-
tx_exons, _ = await self.uta_db.get_tx_exons(transcript, alt_ac=alt_ac)
|
915
|
-
|
916
|
-
if not tx_exons:
|
917
|
-
return []
|
918
|
-
|
919
|
-
return [(coords[0], coords[1]) for coords in tx_exons]
|
920
|
-
|
921
|
-
@staticmethod
|
922
|
-
def _get_exon_number(tx_exons: list, tx_pos: int) -> int:
|
923
|
-
"""Find related exon number for a position
|
924
|
-
|
925
|
-
:param tx_exons: List of exon coordinates for a transcript
|
926
|
-
:param tx_pos: Transcript position change
|
927
|
-
:return: Exon number associated to transcript position change. Will be 1-based
|
928
|
-
"""
|
929
|
-
i = 1
|
930
|
-
for coords in tx_exons:
|
931
|
-
if coords[0] <= tx_pos <= coords[1]:
|
932
|
-
break
|
933
|
-
i += 1
|
934
|
-
return i
|
1151
|
+
offset = start_i - end if use_start_i else end_i - start
|
1152
|
+
return offset
|
935
1153
|
|
936
1154
|
@staticmethod
|
937
1155
|
def _get_adjacent_exon(
|
938
|
-
tx_exons_genomic_coords: list[
|
1156
|
+
tx_exons_genomic_coords: list[ExonCoord],
|
939
1157
|
strand: Strand,
|
940
1158
|
start: int | None = None,
|
941
1159
|
end: int | None = None,
|
@@ -946,20 +1164,18 @@ class ExonGenomicCoordsMapper:
|
|
946
1164
|
adjacent is defined as the exon following the breakpoint for the 5' end and the
|
947
1165
|
exon preceding the breakpoint for the 3' end.
|
948
1166
|
|
949
|
-
:param
|
950
|
-
coordinates for a transcript. Each tuple contains the transcript number
|
951
|
-
(0-indexed), the transcript coordinates for the exon, and the genomic
|
952
|
-
coordinates for the exon. Pos 0 in the tuple corresponds to the exon
|
953
|
-
number, pos 1 and pos 2 refer to the start and end transcript coordinates,
|
954
|
-
respectively, and pos 3 and 4 refer to the start and end genomic
|
955
|
-
coordinates, respectively.
|
1167
|
+
:param tx_exons_genomic_coords: Transcript exon coordinate data
|
956
1168
|
:param strand: Strand
|
957
|
-
:param
|
958
|
-
:param
|
959
|
-
:return: Exon number corresponding to adjacent exon. Will be
|
1169
|
+
:param start: Genomic coordinate of breakpoint
|
1170
|
+
:param end: Genomic coordinate of breakpoint
|
1171
|
+
:return: Exon number corresponding to adjacent exon. Will be 0-based
|
960
1172
|
"""
|
961
1173
|
for i in range(len(tx_exons_genomic_coords) - 1):
|
962
1174
|
exon = tx_exons_genomic_coords[i]
|
1175
|
+
if start == exon.alt_start_i:
|
1176
|
+
break
|
1177
|
+
if end == exon.alt_end_i:
|
1178
|
+
break
|
963
1179
|
next_exon = tx_exons_genomic_coords[i + 1]
|
964
1180
|
bp = start if start else end
|
965
1181
|
if strand == Strand.POSITIVE:
|
@@ -968,19 +1184,20 @@ class ExonGenomicCoordsMapper:
|
|
968
1184
|
else:
|
969
1185
|
lte_exon = next_exon
|
970
1186
|
gte_exon = exon
|
971
|
-
if bp >= lte_exon
|
1187
|
+
if bp >= lte_exon.alt_end_i and bp <= gte_exon.alt_start_i:
|
972
1188
|
break
|
973
1189
|
# Return current exon if end position is provided, next exon if start position
|
974
|
-
# is provided.
|
975
|
-
|
976
|
-
return exon[0] + 1 if end else exon[0] + 2
|
1190
|
+
# is provided.
|
1191
|
+
return exon.ord if end else exon.ord + 1
|
977
1192
|
|
978
1193
|
@staticmethod
|
979
|
-
def _is_exonic_breakpoint(pos: int, tx_genomic_coords: list) -> bool:
|
1194
|
+
def _is_exonic_breakpoint(pos: int, tx_genomic_coords: list[ExonCoord]) -> bool:
|
980
1195
|
"""Check if a breakpoint occurs on an exon
|
981
1196
|
|
982
1197
|
:param pos: Genomic breakpoint
|
983
|
-
:param tx_genomic_coords: A list of
|
984
|
-
:return: True if the breakpoint occurs on an exon
|
1198
|
+
:param tx_genomic_coords: A list of transcript exon coordinate data
|
1199
|
+
:return: ``True`` if the breakpoint occurs on an exon
|
985
1200
|
"""
|
986
|
-
return any(
|
1201
|
+
return any(
|
1202
|
+
exon.alt_start_i <= pos <= exon.alt_end_i for exon in tx_genomic_coords
|
1203
|
+
)
|