cool-seq-tool 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,32 +1,224 @@
1
1
  """Provide mapping capabilities between transcript exon and genomic coordinates."""
2
2
 
3
3
  import logging
4
- from typing import Literal, TypeVar
4
+
5
+ from ga4gh.vrs.models import SequenceLocation, SequenceReference
6
+ from pydantic import ConfigDict, Field, StrictInt, StrictStr, model_validator
5
7
 
6
8
  from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess
7
9
  from cool_seq_tool.mappers.liftover import LiftOver
8
- from cool_seq_tool.mappers.mane_transcript import CdnaRepresentation, ManeTranscript
9
10
  from cool_seq_tool.schemas import (
10
- AnnotationLayer,
11
11
  Assembly,
12
- GenomicData,
13
- GenomicDataResponse,
14
- ResidueMode,
12
+ BaseModelForbidExtra,
13
+ ServiceMeta,
15
14
  Strand,
16
- TranscriptExonData,
17
- TranscriptExonDataResponse,
18
15
  )
19
16
  from cool_seq_tool.sources.mane_transcript_mappings import ManeTranscriptMappings
20
17
  from cool_seq_tool.sources.uta_database import UtaDatabase
21
- from cool_seq_tool.utils import get_inter_residue_pos, service_meta
22
-
23
- CoordinatesResponseType = TypeVar(
24
- "CoordinatesResponseType", GenomicDataResponse, TranscriptExonDataResponse
25
- )
18
+ from cool_seq_tool.utils import service_meta
26
19
 
27
20
  _logger = logging.getLogger(__name__)
28
21
 
29
22
 
23
class ExonCoord(BaseModelForbidExtra):
    """Model for representing exon coordinate data"""

    # Example payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "ord": 0,
                "tx_start_i": 0,
                "tx_end_i": 234,
                "alt_start_i": 154191901,
                "alt_end_i": 154192135,
                "alt_strand": Strand.NEGATIVE,
            }
        }
    )

    # All fields are required: no default is handed to ``Field``.
    ord: StrictInt = Field(description="Exon number. 0-based.")

    # Exon boundaries on the transcript.
    tx_start_i: StrictInt = Field(
        description="Transcript start index of the exon. Inter-residue coordinates."
    )
    tx_end_i: StrictInt = Field(
        description="Transcript end index of the exon. Inter-residue coordinates."
    )

    # Exon boundaries on the aligned genomic sequence.
    alt_start_i: StrictInt = Field(
        description="Genomic start index of the exon. Inter-residue coordinates."
    )
    alt_end_i: StrictInt = Field(
        description="Genomic end index of the exon. Inter-residue coordinates."
    )

    alt_strand: Strand = Field(description="Strand.")
54
+
55
+
56
class TxSegment(BaseModelForbidExtra):
    """Model for representing transcript segment data."""

    # Example payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "exon_ord": 0,
                "offset": 0,
                "genomic_location": {
                    "type": "SequenceLocation",
                    "sequenceReference": {
                        "type": "SequenceReference",
                        "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
                    },
                    "end": 154192135,
                },
            }
        }
    )

    # Required.
    exon_ord: StrictInt = Field(description="Exon number. 0-based.")

    # Optional; a segment lying exactly on the exon boundary has offset 0.
    offset: StrictInt = Field(
        default=0,
        description="The value added to or subtracted from the `genomic_location` to find the start or end of an exon.",
    )

    # Required.
    genomic_location: SequenceLocation = Field(
        description="The genomic position of a transcript segment."
    )
84
+
85
+
86
class GenomicTxSeg(BaseModelForbidExtra):
    """Model for representing a boundary for a transcript segment."""

    # Data fields default to ``None`` so that an error-only instance can be built;
    # ``check_errors`` enforces that they are all present when ``errors`` is empty.
    seg: TxSegment | None = Field(default=None, description="Transcript segment.")
    gene: StrictStr | None = Field(default=None, description="HGNC gene symbol.")
    genomic_ac: StrictStr | None = Field(
        default=None, description="RefSeq genomic accession."
    )
    tx_ac: StrictStr | None = Field(
        default=None, description="RefSeq transcript accession."
    )
    errors: list[StrictStr] = Field(default=[], description="Error messages.")

    @model_validator(mode="before")
    def check_errors(cls, values: dict) -> dict:  # noqa: N805
        """Require the data fields whenever no errors are reported.

        :param values: Raw values the model is being built from
        :raises ValueError: If there are no errors and any of `seg`, `gene`,
            `genomic_ac` or `tx_ac` is missing or falsy
        :return: ``values``, unchanged
        """
        if values.get("errors"):
            # An error response is allowed to leave every data field unset.
            return values

        required_fields = ("seg", "gene", "genomic_ac", "tx_ac")
        if any(not values.get(field_name) for field_name in required_fields):
            err_msg = "`seg`, `gene`, `genomic_ac` and `tx_ac` must be provided"
            raise ValueError(err_msg)
        return values

    # Example payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "gene": "TPM3",
                "genomic_ac": "NC_000001.11",
                "tx_ac": "NM_152263.3",
                "seg": {
                    "exon_ord": 0,
                    "offset": 0,
                    "genomic_location": {
                        "type": "SequenceLocation",
                        "sequenceReference": {
                            "type": "SequenceReference",
                            "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
                        },
                        "end": 154192135,
                    },
                },
                "errors": [],
            }
        }
    )
138
+
139
+
140
class GenomicTxSegService(BaseModelForbidExtra):
    """Service model for genomic and transcript data."""

    # Data fields default to ``None`` so that an error-only response can be built;
    # ``add_meta_check_errors`` enforces them when ``errors`` is empty.
    gene: StrictStr | None = Field(None, description="HGNC gene symbol.")
    genomic_ac: StrictStr | None = Field(None, description="RefSeq genomic accession.")
    tx_ac: StrictStr | None = Field(None, description="RefSeq transcript accession.")
    seg_start: TxSegment | None = Field(None, description="Start transcript segment.")
    seg_end: TxSegment | None = Field(None, description="End transcript segment.")
    errors: list[StrictStr] = Field([], description="Error messages.")
    # Always populated by the validator below; callers do not need to supply it.
    service_meta: ServiceMeta = Field(..., description="Service metadata.")

    @model_validator(mode="before")
    def add_meta_check_errors(cls, values: dict) -> dict:  # noqa: N805
        """Add service metadata to model and ensure that fields are (un)set depending
        on errors

        The metadata is written to a copy of ``values`` so that a dict passed in by
        the caller (e.g. through ``model_validate``) is not modified as a side
        effect of validation.

        :param values: Values in model
        :raises ValueError: If `gene`, `genomic_ac`, `tx_ac` and `seg_start` or `seg_end`
            not provided when there are no errors
        :return: Copy of the values in model, including service metadata
        """
        # ``service_meta`` here is the module-level helper, not the field above:
        # function bodies do not see names from the enclosing class scope.
        values = {**values, "service_meta": service_meta()}

        if values.get("errors"):
            # An error response is allowed to leave every data field unset.
            return values

        has_required_data = all(
            (
                values.get("gene"),
                values.get("genomic_ac"),
                values.get("tx_ac"),
                # Only one side of the segment is required.
                values.get("seg_start") or values.get("seg_end"),
            )
        )
        if not has_required_data:
            err_msg = "`gene`, `genomic_ac`, `tx_ac` and `seg_start` or `seg_end` must be provided"
            raise ValueError(err_msg)

        return values

    # Example payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "gene": "TPM3",
                "genomic_ac": "NC_000001.11",
                "tx_ac": "NM_152263.3",
                "seg_start": {
                    "exon_ord": 0,
                    "offset": 0,
                    "genomic_location": {
                        "type": "SequenceLocation",
                        "sequenceReference": {
                            "type": "SequenceReference",
                            "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
                        },
                        "end": 154192135,
                    },
                },
                "seg_end": {
                    "exon_ord": 7,
                    "offset": 0,
                    "genomic_location": {
                        "type": "SequenceLocation",
                        "sequenceReference": {
                            "type": "SequenceReference",
                            "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
                        },
                        "start": 154170399,
                    },
                },
            }
        }
    )
208
+
209
+
210
def _return_service_errors(errors: list[str]) -> GenomicTxSegService:
    """Log errors and return service object with errors.

    ``GenomicTxSegService`` raises during validation when ``errors`` is empty and
    its data fields are unset. An empty ``errors`` list is therefore replaced with
    a generic message so that this helper always returns an error response instead
    of raising.

    :param errors: Error message(s)
    :return: Service object with error messages.
    """
    if not errors:
        # Rebind rather than append so the caller's list is left untouched.
        errors = ["Unable to complete request: no error details were provided"]

    for error in errors:
        _logger.warning(error)

    return GenomicTxSegService(errors=errors)
220
+
221
+
30
222
  class ExonGenomicCoordsMapper:
31
223
  """Provide capabilities for mapping transcript exon representation to/from genomic
32
224
  coordinate representation.
@@ -36,7 +228,6 @@ class ExonGenomicCoordsMapper:
36
228
  self,
37
229
  seqrepo_access: SeqRepoAccess,
38
230
  uta_db: UtaDatabase,
39
- mane_transcript: ManeTranscript,
40
231
  mane_transcript_mappings: ManeTranscriptMappings,
41
232
  liftover: LiftOver,
42
233
  ) -> None:
@@ -45,7 +236,7 @@ class ExonGenomicCoordsMapper:
45
236
  A lot of resources are required for initialization, so when defaults are enough,
46
237
  it's easiest to let the core CoolSeqTool class handle it for you:
47
238
 
48
- >>> from cool_seq_tool.app import CoolSeqTool
239
+ >>> from cool_seq_tool import CoolSeqTool
49
240
  >>> egc = CoolSeqTool().ex_g_coords_mapper
50
241
 
51
242
  Note that this class's public methods are all defined as ``async``, so they will
@@ -54,42 +245,22 @@ class ExonGenomicCoordsMapper:
54
245
 
55
246
  >>> import asyncio
56
247
  >>> result = asyncio.run(
57
- ... egc.transcript_to_genomic_coordinates(
58
- ... "NM_002529.3", exon_start=2, exon_end=17
59
- ... )
248
+ ... egc.tx_segment_to_genomic("NM_002529.3", exon_start=2, exon_end=17)
60
249
  ... )
61
250
  >>> result.genomic_data.start, result.genomic_data.end
62
251
  (156864428, 156881456)
63
252
 
64
253
  :param seqrepo_access: SeqRepo instance to give access to query SeqRepo database
65
254
  :param uta_db: UtaDatabase instance to give access to query UTA database
66
- :param mane_transcript: Instance to align to MANE or compatible representation
67
255
  :param mane_transcript_mappings: Instance to provide access to ManeTranscriptMappings class
68
256
  :param liftover: Instance to provide mapping between human genome assemblies
69
257
  """
70
258
  self.seqrepo_access = seqrepo_access
71
259
  self.uta_db = uta_db
72
- self.mane_transcript = mane_transcript
73
260
  self.mane_transcript_mappings = mane_transcript_mappings
74
261
  self.liftover = liftover
75
262
 
76
- @staticmethod
77
- def _return_warnings(
78
- resp: CoordinatesResponseType, warning_msg: list[str]
79
- ) -> CoordinatesResponseType:
80
- """Add warnings to response object
81
-
82
- :param resp: Response object
83
- :param warning_msg: Warning message(s) on why ``transcript_exon_data`` or
84
- ``genomic_data`` field is ``None``
85
- :return: Response object with warning message
86
- """
87
- for msg in warning_msg:
88
- _logger.warning(msg)
89
- resp.warnings.append(msg)
90
- return resp
91
-
92
- async def transcript_to_genomic_coordinates(
263
+ async def tx_segment_to_genomic(
93
264
  self,
94
265
  transcript: str,
95
266
  gene: str | None = None,
@@ -97,26 +268,30 @@ class ExonGenomicCoordsMapper:
97
268
  exon_start_offset: int = 0,
98
269
  exon_end: int | None = None,
99
270
  exon_end_offset: int = 0,
100
- ) -> GenomicDataResponse:
101
- """Get genomic data given transcript data.
271
+ ) -> GenomicTxSegService:
272
+ """Get aligned genomic data given transcript segment data.
102
273
 
103
274
  By default, transcript data is aligned to the GRCh38 assembly.
104
275
 
105
276
  >>> import asyncio
106
- >>> from cool_seq_tool.app import CoolSeqTool
277
+ >>> from cool_seq_tool import CoolSeqTool
107
278
  >>> egc = CoolSeqTool().ex_g_coords_mapper
108
279
  >>> tpm3 = asyncio.run(
109
- ... egc.transcript_to_genomic_coordinates(
280
+ ... egc.tx_segment_to_genomic(
110
281
  ... "NM_152263.3",
111
282
  ... gene="TPM3",
112
283
  ... exon_start=1,
113
284
  ... exon_end=8,
114
285
  ... )
115
286
  ... )
116
- >>> tpm3.genomic_data.chr, tpm3.genomic_data.start, tpm3.genomic_data.end
287
+ >>> (
288
+ ... tpm3.genomic_ac,
289
+ ... tpm3.seg_start.genomic_location.end,
290
+ ... tpm3.seg_end.genomic_location.start,
291
+ ... )
117
292
  ('NC_000001.11', 154192135, 154170399)
118
293
 
119
- :param transcript: Transcript accession
294
+ :param transcript: RefSeq transcript accession
120
295
  :param gene: HGNC gene symbol
121
296
  :param exon_start: Starting transcript exon number (1-based). If not provided,
122
297
  must provide ``exon_end``
@@ -126,318 +301,316 @@ class ExonGenomicCoordsMapper:
126
301
  :param exon_end_offset: Ending exon offset
127
302
  :return: GRCh38 genomic data (inter-residue coordinates)
128
303
  """
129
- resp = GenomicDataResponse(
130
- genomic_data=None, warnings=[], service_meta=service_meta()
131
- )
132
-
133
304
  # Ensure valid inputs
134
- warnings = []
135
- if not transcript:
136
- warnings.append("Must provide `transcript`")
137
- else:
138
- transcript = transcript.strip()
139
-
305
+ errors = []
140
306
  exon_start_exists, exon_end_exists = False, False
141
307
  if exon_start is not None:
142
308
  if exon_start < 1:
143
- warnings.append("`exon_start` cannot be less than 1")
309
+ errors.append("`exon_start` cannot be less than 1")
144
310
  exon_start_exists = True
145
311
 
146
312
  if exon_end is not None:
147
313
  if exon_end < 1:
148
- warnings.append("`exon_end` cannot be less than 1")
314
+ errors.append("`exon_end` cannot be less than 1")
149
315
  exon_end_exists = True
150
316
 
151
317
  if not exon_start_exists and not exon_end_exists:
152
- warnings.append("Must provide either `exon_start` or `exon_end`")
318
+ errors.append("Must provide either `exon_start` or `exon_end`")
153
319
  if exon_start_exists and exon_end_exists and (exon_start > exon_end):
154
- warnings.append(
320
+ errors.append(
155
321
  f"Start exon {exon_start} is greater than end exon {exon_end}"
156
322
  )
157
323
 
158
- if warnings:
159
- return self._return_warnings(resp, warnings)
160
-
161
- # Get all exons and associated start/end coordinates for transcript
162
- tx_exons, warning = await self.uta_db.get_tx_exons(transcript)
163
- if not tx_exons:
164
- return self._return_warnings(resp, [warning] if warning else [])
324
+ if errors:
325
+ return _return_service_errors(errors)
165
326
 
166
327
  # Get exon start and exon end coordinates
167
- tx_exon_coords, warning = self.get_tx_exon_coords(
168
- transcript, tx_exons, exon_start, exon_end
328
+ (
329
+ tx_exon_start_coords,
330
+ tx_exon_end_coords,
331
+ errors,
332
+ ) = await self._get_start_end_exon_coords(
333
+ transcript, exon_start=exon_start, exon_end=exon_end
169
334
  )
170
- if not tx_exon_coords:
171
- return self._return_warnings(resp, [warning] if warning else [])
172
- tx_exon_start_coords, tx_exon_end_coords = tx_exon_coords
335
+ if errors:
336
+ return _return_service_errors(errors)
173
337
 
174
338
  if gene:
175
- gene = gene.upper().strip()
339
+ gene = gene.upper()
176
340
 
177
341
  # Get aligned genomic data (hgnc gene, alt_ac, alt_start_i, alt_end_i, strand)
178
342
  # for exon(s)
179
- alt_ac_start_end, warning = await self._get_alt_ac_start_and_end(
343
+ alt_ac_start_end, err_msg = await self._get_alt_ac_start_and_end(
180
344
  transcript, tx_exon_start_coords, tx_exon_end_coords, gene=gene
181
345
  )
182
346
  if not alt_ac_start_end:
183
- return self._return_warnings(resp, [warning] if warning else [])
347
+ return _return_service_errors([err_msg] if err_msg else [])
184
348
  alt_ac_start_data, alt_ac_end_data = alt_ac_start_end
185
349
 
186
350
  # Get gene and chromosome data, check that at least one was retrieved
187
- gene = alt_ac_start_data[0] if alt_ac_start_data else alt_ac_end_data[0]
188
- chromosome = alt_ac_start_data[1] if alt_ac_start_data else alt_ac_end_data[1]
189
- if gene is None or chromosome is None:
190
- return self._return_warnings(
191
- resp,
351
+ gene = alt_ac_start_data.hgnc if alt_ac_start_data else alt_ac_end_data.hgnc
352
+ genomic_ac = (
353
+ alt_ac_start_data.alt_ac if alt_ac_start_data else alt_ac_end_data.alt_ac
354
+ )
355
+ if gene is None or genomic_ac is None:
356
+ return _return_service_errors(
192
357
  [
193
- "Unable to retrieve `gene` or `chromosome` from genomic start and genomic end data"
358
+ "Unable to retrieve `gene` or `genomic_ac` from genomic start and genomic end data"
194
359
  ],
195
360
  )
196
361
 
197
- g_start = alt_ac_start_data[3] - 1 if alt_ac_start_data else None
198
- g_end = alt_ac_end_data[2] + 1 if alt_ac_end_data else None
199
362
  strand = (
200
- Strand(alt_ac_start_data[4])
363
+ Strand(alt_ac_start_data.alt_strand)
201
364
  if alt_ac_start_data
202
- else Strand(alt_ac_end_data[4])
365
+ else Strand(alt_ac_end_data.alt_strand)
203
366
  )
204
367
 
205
- # Using none since could set to 0
206
- start_exits = g_start is not None
207
- end_exists = g_end is not None
208
-
209
- # Calculate offsets
210
- if strand == Strand.NEGATIVE:
211
- start_offset = exon_start_offset * -1 if start_exits else None
212
- end_offset = exon_end_offset * -1 if end_exists else 0
368
+ if exon_start_exists:
369
+ seg_start, err_msg = self._get_tx_segment(
370
+ genomic_ac,
371
+ strand,
372
+ exon_start_offset,
373
+ alt_ac_start_data,
374
+ is_seg_start=True,
375
+ )
376
+ if err_msg:
377
+ return _return_service_errors([err_msg])
213
378
  else:
214
- start_offset = exon_start_offset if start_exits else 0
215
- end_offset = exon_end_offset if end_exists else 0
379
+ seg_start = None
216
380
 
217
- # Get genomic coordinates with offsets included
218
- g_start = g_start + start_offset if start_exits else None
219
- g_end = g_end + end_offset if end_exists else None
381
+ if exon_end_exists:
382
+ seg_end, err_msg = self._get_tx_segment(
383
+ genomic_ac, strand, exon_end_offset, alt_ac_end_data, is_seg_start=False
384
+ )
385
+ if err_msg:
386
+ return _return_service_errors([err_msg])
387
+ else:
388
+ seg_end = None
220
389
 
221
- resp.genomic_data = GenomicData(
390
+ return GenomicTxSegService(
222
391
  gene=gene,
223
- chr=chromosome,
224
- start=g_start,
225
- end=g_end,
226
- exon_start=exon_start if start_exits else None,
227
- exon_start_offset=exon_start_offset,
228
- exon_end=exon_end if end_exists else None,
229
- exon_end_offset=exon_end_offset,
230
- transcript=transcript,
231
- strand=strand,
392
+ genomic_ac=genomic_ac,
393
+ tx_ac=transcript,
394
+ seg_start=seg_start,
395
+ seg_end=seg_end,
232
396
  )
233
397
 
234
- return resp
235
-
236
- async def genomic_to_transcript_exon_coordinates(
398
async def genomic_to_tx_segment(
    self,
    chromosome: str | None = None,
    genomic_ac: str | None = None,
    seg_start_genomic: int | None = None,
    seg_end_genomic: int | None = None,
    transcript: str | None = None,
    get_nearest_transcript_junction: bool = False,
    gene: str | None = None,
) -> GenomicTxSegService:
    """Get transcript segment data for genomic data, lifted over to GRCh38.

    If liftover to GRCh38 is unsuccessful, will return errors.

    Must provide inter-residue coordinates.

    MANE Transcript data will be returned if and only if ``transcript`` is not
    supplied. ``gene`` must be given in order to retrieve MANE Transcript data.

    >>> import asyncio
    >>> from cool_seq_tool import CoolSeqTool
    >>> egc = CoolSeqTool().ex_g_coords_mapper
    >>> result = asyncio.run(
    ...     egc.genomic_to_tx_segment(
    ...         genomic_ac="NC_000001.11",
    ...         seg_start_genomic=154192135,
    ...         seg_end_genomic=154170399,
    ...         transcript="NM_152263.3",
    ...     )
    ... )
    >>> result.seg_start.exon_ord, result.seg_end.exon_ord
    (0, 7)

    :param chromosome: e.g. ``"1"`` or ``"chr1"``. If not provided, must provide
        ``genomic_ac``. If ``genomic_ac`` is also provided, ``genomic_ac`` will be
        used.
    :param genomic_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided,
        must provide ``chromosome``. If ``chromosome`` is also provided,
        ``genomic_ac`` will be used.
    :param seg_start_genomic: Genomic position where the transcript segment starts
    :param seg_end_genomic: Genomic position where the transcript segment ends
    :param transcript: The transcript to use. If this is not given, we will try the
        following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining
        Compatible Transcript. See the :ref:`Transcript Selection policy <transcript_selection_policy>`
        page.
    :param get_nearest_transcript_junction: If ``True``, this will return the
        adjacent exon if the position specified by ``seg_start_genomic`` or
        ``seg_end_genomic`` does not occur on an exon. For the positive strand, adjacent
        is defined as the exon preceding the breakpoint for the 5' end and the exon
        following the breakpoint for the 3' end. For the negative strand, adjacent
        is defined as the exon following the breakpoint for the 5' end and the exon
        preceding the breakpoint for the 3' end.
    :param gene: gene name. Ideally, HGNC symbol. Must be given if no ``transcript``
        value is provided.
    :return: Genomic data (inter-residue coordinates)
    """
    errors = []
    if seg_start_genomic is None and seg_end_genomic is None:
        errors.append("Must provide either `seg_start_genomic` or `seg_end_genomic`")
    if chromosome is None and genomic_ac is None:
        errors.append("Must provide either `chromosome` or `genomic_ac`")
    if transcript is None and gene is None:
        errors.append("Must provide either `gene` or `transcript`")
    if errors:
        return _return_service_errors(errors)

    if gene is not None:
        gene = gene.upper()

    params = {}
    start_tx_seg_data = None

    # Compare against ``None`` rather than relying on truthiness: 0 is a valid
    # inter-residue position and must not be treated as "not provided".
    if seg_start_genomic is not None:
        start_tx_seg_data = await self._genomic_to_tx_segment(
            seg_start_genomic,
            chromosome=chromosome,
            genomic_ac=genomic_ac,
            transcript=transcript,
            gene=gene,
            get_nearest_transcript_junction=get_nearest_transcript_junction,
            is_start=True,
        )
        if start_tx_seg_data.errors:
            return _return_service_errors(start_tx_seg_data.errors)

        params["gene"] = start_tx_seg_data.gene
        params["genomic_ac"] = start_tx_seg_data.genomic_ac
        params["tx_ac"] = start_tx_seg_data.tx_ac
        params["seg_start"] = start_tx_seg_data.seg

    if seg_end_genomic is not None:
        end_tx_seg_data = await self._genomic_to_tx_segment(
            seg_end_genomic,
            chromosome=chromosome,
            genomic_ac=genomic_ac,
            transcript=transcript,
            gene=gene,
            get_nearest_transcript_junction=get_nearest_transcript_junction,
            is_start=False,
        )
        if end_tx_seg_data.errors:
            return _return_service_errors(end_tx_seg_data.errors)

        if start_tx_seg_data is not None:
            # Both ends were resolved independently, so check that gene,
            # genomic_ac and tx_ac all match before combining them.
            errors = []
            for attr in ["gene", "genomic_ac", "tx_ac"]:
                start_seg_attr = params[attr]
                end_seg_attr = getattr(end_tx_seg_data, attr)
                if start_seg_attr != end_seg_attr:
                    errors.append(
                        f"Start end end segment mismatch for `{attr}`. {start_seg_attr} != {end_seg_attr}."
                    )
            if errors:
                return _return_service_errors(errors)
        else:
            # Only the end was requested; it is the sole source of shared fields.
            params["gene"] = end_tx_seg_data.gene
            params["genomic_ac"] = end_tx_seg_data.genomic_ac
            params["tx_ac"] = end_tx_seg_data.tx_ac

        params["seg_end"] = end_tx_seg_data.seg

    return GenomicTxSegService(**params)
379
527
 
380
- @staticmethod
381
- def _validate_exon(
382
- transcript: str, tx_exons: list[tuple[int, int]], exon_number: int
383
- ) -> tuple[tuple[int, int] | None, str | None]:
384
- """Validate that exon number exists on a given transcript
385
-
386
- :param transcript: Transcript accession
387
- :param tx_exons: List of transcript's exons and associated coordinates
388
- :param exon_number: Exon number to validate
389
- :return: Exon coordinates for a given exon number and warnings if found
390
- """
391
- msg = f"Exon {exon_number} does not exist on {transcript}"
392
- try:
393
- if exon_number < 1:
394
- return None, msg
395
- exon = tx_exons[exon_number - 1]
396
- except IndexError:
397
- return None, msg
398
- return exon, None
528
async def _get_all_exon_coords(
    self, tx_ac: str, genomic_ac: str | None = None
) -> list[ExonCoord]:
    """Get all exon coordinate data for a transcript.

    If ``genomic_ac`` is NOT provided, this method will use the GRCh38 accession
    associated to ``tx_ac``.

    Only alignments whose ``alt_aln_method`` is ``splign`` are considered.

    :param tx_ac: The RefSeq transcript accession to get exon data for.
    :param genomic_ac: The RefSeq genomic accession to get exon data for.
    :return: List of all exon coordinate data for ``tx_ac`` and ``genomic_ac``.
        The exon coordinate data will include the exon number, transcript and
        genomic positions for the start and end of the exon, and strand.
        The list will be ordered by ascending exon number. Empty when the query
        returns no rows.
    """
    # NOTE(review): ``tx_ac`` and ``genomic_ac`` are interpolated directly into the
    # SQL text (hence the S608 suppressions). If these can carry untrusted input,
    # switch to bound parameters -- confirm what ``uta_db.execute_query`` supports.
    if genomic_ac:
        # Caller named the genomic accession: restrict to that alignment.
        query = f"""
            SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
            FROM {self.uta_db.schema}.tx_exon_aln_v
            WHERE tx_ac = '{tx_ac}'
            AND alt_aln_method = 'splign'
            AND alt_ac = '{genomic_ac}'
            ORDER BY ord ASC
            """  # noqa: S608
    else:
        # No accession given: join against ``_seq_anno_most_recent`` and keep
        # ``NC_000%`` accessions with an empty ``descr``.
        # NOTE(review): presumably this is what narrows the result to the GRCh38
        # chromosome accession -- confirm against the UTA schema.
        query = f"""
            SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
            FROM {self.uta_db.schema}.tx_exon_aln_v as t
            INNER JOIN {self.uta_db.schema}._seq_anno_most_recent as s
            ON t.alt_ac = s.ac
            WHERE s.descr = ''
            AND t.tx_ac = '{tx_ac}'
            AND t.alt_aln_method = 'splign'
            AND t.alt_ac like 'NC_000%'
            ORDER BY ord ASC
            """  # noqa: S608

    results = await self.uta_db.execute_query(query)
    # Each row is unpacked as keyword arguments, so rows must be mapping-like with
    # keys matching the ``ExonCoord`` field names selected above.
    return [ExonCoord(**r) for r in results]
567
+
568
async def _get_start_end_exon_coords(
    self,
    tx_ac: str,
    exon_start: int | None = None,
    exon_end: int | None = None,
    genomic_ac: str | None = None,
) -> tuple[ExonCoord | None, ExonCoord | None, list[str]]:
    """Get exon coordinates for a transcript given exon start and exon end.

    If ``genomic_ac`` is NOT provided, this method will use the GRCh38 accession
    associated to ``tx_ac``.

    :param tx_ac: The RefSeq transcript accession to get exon data for.
    :param exon_start: Start exon number to get coordinate data for. 1-based.
    :param exon_end: End exon number to get coordinate data for. 1-based.
    :param genomic_ac: The RefSeq genomic accession to get exon data for.
    :return: Tuple containing start exon coordinate data, end exon coordinate data,
        and list of errors. The exon coordinate data will include the exon number,
        transcript and genomic positions for the start and end of the exon, and
        strand. Both exon entries are ``None`` whenever the error list is
        non-empty; an entry is also ``None`` when its exon number was not given.
    """
    tx_exons = await self._get_all_exon_coords(tx_ac, genomic_ac=genomic_ac)
    if not tx_exons:
        return None, None, [f"No exons found given {tx_ac}"]

    errors = []
    start_end_exons = []
    for exon_num in (exon_start, exon_end):
        if exon_num is None:
            start_end_exons.append(None)
        elif 1 <= exon_num <= len(tx_exons):
            # Exon numbers are 1-based; the list is ordered by 0-based ``ord``.
            start_end_exons.append(tx_exons[exon_num - 1])
        else:
            # Explicit bounds check: indexing with ``exon_num - 1`` alone would let
            # 0 or a negative number wrap around and return the wrong exon.
            errors.append(f"Exon {exon_num} does not exist on {tx_ac}")
            start_end_exons.append(None)

    if errors:
        # Never hand back partial data alongside errors.
        start_end_exons = [None, None]

    return *start_end_exons, errors
435
608
 
436
609
  async def _get_alt_ac_start_and_end(
437
610
  self,
438
611
  tx_ac: str,
439
- tx_exon_start: tuple[int, int] | None = None,
440
- tx_exon_end: tuple[int, int] | None = None,
612
+ tx_exon_start: ExonCoord | None = None,
613
+ tx_exon_end: ExonCoord | None = None,
441
614
  gene: str | None = None,
442
615
  ) -> tuple[tuple[tuple[int, int], tuple[int, int]] | None, str | None]:
443
616
  """Get aligned genomic coordinates for transcript exon start and end.
@@ -459,7 +632,7 @@ class ExonGenomicCoordsMapper:
459
632
  for exon, key in [(tx_exon_start, "start"), (tx_exon_end, "end")]:
460
633
  if exon:
461
634
  alt_ac_val, warning = await self.uta_db.get_alt_ac_start_or_end(
462
- tx_ac, exon[0], exon[1], gene=gene
635
+ tx_ac, exon.tx_start_i, exon.tx_end_i, gene=gene
463
636
  )
464
637
  if alt_ac_val:
465
638
  alt_ac_data[key] = alt_ac_val
@@ -470,78 +643,84 @@ class ExonGenomicCoordsMapper:
470
643
  # Validate that start and end alignments have matching gene, genomic accession,
471
644
  # and strand
472
645
  if all(alt_ac_data_values):
473
- for i in (0, 1, 4):
474
- if alt_ac_data["start"][i] != alt_ac_data["end"][i]:
475
- if i == 0:
476
- error = "HGNC gene symbol does not match"
477
- elif i == 1:
478
- error = "Genomic accession does not match"
479
- else:
480
- error = "Strand does not match"
646
+ for attr in ["hgnc", "alt_ac", "alt_strand"]:
647
+ start_attr = getattr(alt_ac_data["start"], attr)
648
+ end_attr = getattr(alt_ac_data["end"], attr)
649
+ if start_attr != end_attr:
650
+ error = f"{attr} mismatch. {start_attr} != {end_attr}."
481
651
  _logger.warning(
482
652
  "%s: %s != %s",
483
653
  error,
484
- alt_ac_data["start"][i],
485
- alt_ac_data["end"][i],
654
+ start_attr,
655
+ end_attr,
486
656
  )
487
657
  return None, error
488
658
  return tuple(alt_ac_data_values), None
489
659
 
490
- async def _genomic_to_transcript_exon_coordinate(
660
+ async def _genomic_to_tx_segment(
491
661
  self,
492
- pos: int,
662
+ genomic_pos: int,
493
663
  chromosome: str | None = None,
494
- alt_ac: str | None = None,
495
- strand: Strand | None = None,
664
+ genomic_ac: str | None = None,
496
665
  transcript: str | None = None,
497
666
  gene: str | None = None,
498
667
  get_nearest_transcript_junction: bool = False,
499
668
  is_start: bool = True,
500
- ) -> TranscriptExonDataResponse:
501
- """Convert individual genomic data to transcript data
669
+ ) -> GenomicTxSeg:
670
+ """Given genomic data, generate a boundary for a transcript segment.
671
+
672
+ Will liftover to GRCh38 assembly. If liftover is unsuccessful, will return
673
+ errors.
502
674
 
503
- :param pos: Genomic position (zero-based)
675
+ :param genomic_pos: Genomic position where the transcript segment starts or ends
676
+ (inter-residue based)
504
677
  :param chromosome: Chromosome. Must give chromosome without a prefix
505
- (i.e. ``1`` or ``X``). If not provided, must provide ``alt_ac``.
506
- If ``alt_ac`` is also provided, ``alt_ac`` will be used.
507
- :param alt_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided,
508
- must provide ``chromosome. If ``chromosome`` is also provided, ``alt_ac``
678
+ (i.e. ``1`` or ``X``). If not provided, must provide ``genomic_ac``. If
679
+ position maps to both GRCh37 and GRCh38, GRCh38 assembly will be used.
680
+ If ``genomic_ac`` is also provided, ``genomic_ac`` will be used.
681
+ :param genomic_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided,
682
+ must provide ``chromosome``. If ``chromosome`` is also provided, ``genomic_ac``
509
683
  will be used.
510
- :param strand: Strand
511
684
  :param transcript: The transcript to use. If this is not given, we will try the
512
685
  following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining
513
686
  Compatible Transcript
514
687
  :param gene: HGNC gene symbol
515
688
  :param get_nearest_transcript_junction: If ``True``, this will return the
516
- adjacent exon if the position specified by``start`` or ``end`` does not
517
- occur on an exon. For the positive strand, adjacent is defined as the exon
518
- preceding the breakpoint for the 5' end and the exon following the
519
- breakpoint for the 3' end. For the negative strand, adjacent is defined as
520
- the exon following the breakpoint for the 5' end and the exon preceding the
521
- breakpoint for the 3' end.
522
- :param is_start: ``True`` if ``pos`` is start position. ``False`` if ``pos`` is
523
- end position.
524
- :return: Transcript data (inter-residue coordinates)
689
+ adjacent exon if the position specified by ``seg_start_genomic`` or
690
+ ``seg_end_genomic`` does not occur on an exon. For the positive strand, adjacent
691
+ is defined as the exon preceding the breakpoint for the 5' end and the exon
692
+ following the breakpoint for the 3' end. For the negative strand, adjacent
693
+ is defined as the exon following the breakpoint for the 5' end and the exon
694
+ preceding the breakpoint for the 3' end.
695
+ :param is_start: ``True`` if ``genomic_pos`` is where the transcript segment starts.
696
+ ``False`` if ``genomic_pos`` is where the transcript segment ends.
697
+ :return: Data for a transcript segment boundary (inter-residue coordinates)
525
698
  """
526
- resp = TranscriptExonDataResponse(
527
- transcript_exon_data=None, warnings=[], service_meta=service_meta()
528
- )
529
- params = {key: None for key in TranscriptExonData.model_fields}
699
+ params = {key: None for key in GenomicTxSeg.model_fields}
530
700
 
531
701
  if get_nearest_transcript_junction:
532
- if not gene or not strand:
533
- return self._return_warnings(
534
- resp,
535
- [
536
- "Gene or strand must be provided to select the adjacent transcript junction"
537
- ],
702
+ if not gene:
703
+ return GenomicTxSeg(
704
+ errors=[
705
+ "`gene` must be provided to select the adjacent transcript junction"
706
+ ]
538
707
  )
539
- if not alt_ac:
540
- alt_acs, w = self.seqrepo_access.chromosome_to_acs(chromosome)
541
708
 
542
- if not alt_acs:
543
- return self._return_warnings(resp, [w])
544
- alt_ac = alt_acs[0]
709
+ if not genomic_ac:
710
+ genomic_acs, err_msg = self.seqrepo_access.chromosome_to_acs(chromosome)
711
+
712
+ if not genomic_acs:
713
+ return GenomicTxSeg(
714
+ errors=[err_msg],
715
+ )
716
+ genomic_ac = genomic_acs[0]
717
+
718
+ # Always liftover to GRCh38
719
+ genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos(
720
+ genomic_ac, genomic_pos
721
+ )
722
+ if err_msg:
723
+ return GenomicTxSeg(errors=[err_msg])
545
724
 
546
725
  if not transcript:
547
726
  # Select a transcript if not provided
@@ -555,7 +734,7 @@ class ExonGenomicCoordsMapper:
555
734
  # Attempt to find a coding transcript if a MANE transcript
556
735
  # cannot be found
557
736
  results = await self.uta_db.get_transcripts(
558
- gene=gene, alt_ac=alt_ac
737
+ gene=gene, alt_ac=genomic_ac
559
738
  )
560
739
 
561
740
  if not results.is_empty():
@@ -566,376 +745,415 @@ class ExonGenomicCoordsMapper:
566
745
  SELECT DISTINCT tx_ac
567
746
  FROM {self.uta_db.schema}.tx_exon_aln_v
568
747
  WHERE hgnc = '{gene}'
569
- AND alt_ac = '{alt_ac}'
748
+ AND alt_ac = '{genomic_ac}'
570
749
  """ # noqa: S608
571
750
  result = await self.uta_db.execute_query(query)
572
751
 
573
752
  if result:
574
753
  transcript = result[0]["tx_ac"]
575
754
  else:
576
- return self._return_warnings(
577
- resp,
578
- [f"Could not find a transcript for {gene} on {alt_ac}"],
755
+ return GenomicTxSeg(
756
+ errors=[
757
+ f"Could not find a transcript for {gene} on {genomic_ac}"
758
+ ]
579
759
  )
580
760
 
581
- tx_genomic_coords, w = await self.uta_db.get_tx_exons_genomic_coords(
582
- tx_ac=transcript, alt_ac=alt_ac
761
+ tx_exons = await self._get_all_exon_coords(
762
+ tx_ac=transcript, genomic_ac=genomic_ac
583
763
  )
584
- if not tx_genomic_coords:
585
- return self._return_warnings(resp, [w])
764
+ if not tx_exons:
765
+ return GenomicTxSeg(errors=[f"No exons found given {transcript}"])
766
+
767
+ strand = Strand(tx_exons[0].alt_strand)
768
+ params["strand"] = strand
586
769
 
587
770
  # Check if breakpoint occurs on an exon.
588
771
  # If not, determine the adjacent exon given the selected transcript
589
- if not self._is_exonic_breakpoint(pos, tx_genomic_coords):
590
- exon = self._get_adjacent_exon(
591
- tx_exons_genomic_coords=tx_genomic_coords,
772
+ if not self._is_exonic_breakpoint(genomic_pos, tx_exons):
773
+ exon_num = self._get_adjacent_exon(
774
+ tx_exons_genomic_coords=tx_exons,
592
775
  strand=strand,
593
- start=pos if is_start else None,
594
- end=pos if not is_start else None,
776
+ start=genomic_pos if is_start else None,
777
+ end=genomic_pos if not is_start else None,
595
778
  )
596
779
 
597
- params["exon"] = exon
598
- params["transcript"] = transcript
599
- params["gene"] = gene
600
- params["pos"] = pos
601
- params["chr"] = alt_ac
602
-
603
- self._set_exon_offset(
604
- params=params,
605
- start=tx_genomic_coords[exon - 1][3], # Start exon coordinate
606
- end=tx_genomic_coords[exon - 1][4], # End exon coordinate
607
- pos=pos,
608
- is_start=is_start,
780
+ offset = self._get_exon_offset(
781
+ start_i=tx_exons[exon_num].alt_start_i,
782
+ end_i=tx_exons[exon_num].alt_end_i,
609
783
  strand=strand,
784
+ use_start_i=strand == Strand.POSITIVE
785
+ if is_start
786
+ else strand != Strand.POSITIVE,
787
+ is_in_exon=False,
788
+ start=genomic_pos if is_start else None,
789
+ end=genomic_pos if not is_start else None,
610
790
  )
611
- params["strand"] = strand.value
612
- resp.transcript_exon_data = TranscriptExonData(**params)
613
- return resp
614
791
 
615
- if alt_ac:
616
- # Check if valid accession is given
617
- if not await self.uta_db.validate_genomic_ac(alt_ac):
618
- return self._return_warnings(
619
- resp, [f"Invalid genomic accession: {alt_ac}"]
792
+ genomic_location, err_msg = self._get_vrs_seq_loc(
793
+ genomic_ac, genomic_pos, is_start, strand
794
+ )
795
+ if err_msg:
796
+ return GenomicTxSeg(errors=[err_msg])
797
+
798
+ return GenomicTxSeg(
799
+ gene=gene,
800
+ genomic_ac=genomic_ac,
801
+ tx_ac=transcript,
802
+ seg=TxSegment(
803
+ exon_ord=exon_num,
804
+ offset=offset,
805
+ genomic_location=genomic_location,
806
+ ),
620
807
  )
621
808
 
622
- genes_alt_acs, warning = await self.uta_db.get_genes_and_alt_acs(
623
- pos, strand=strand, alt_ac=alt_ac, gene=gene
624
- )
625
- elif chromosome:
626
- # Check if just chromosome is given. If it is, we should
627
- # convert this to the correct accession version
628
- if chromosome == "X":
629
- chromosome = 23
630
- elif chromosome == "Y":
631
- chromosome = 24
809
+ if genomic_ac:
810
+ # Check if valid accession is given
811
+ if not await self.uta_db.validate_genomic_ac(genomic_ac):
812
+ return GenomicTxSeg(errors=[f"Invalid genomic accession: {genomic_ac}"])
813
+
814
+ _gene, err_msg = await self._get_genomic_ac_gene(genomic_pos, genomic_ac)
815
+ if _gene:
816
+ if gene and _gene != gene:
817
+ return GenomicTxSeg(
818
+ errors=[f"Expected gene, {gene}, but found {_gene}"]
819
+ )
820
+
821
+ gene = _gene
632
822
  else:
633
- chromosome = int(chromosome)
823
+ return GenomicTxSeg(errors=[err_msg])
824
+ elif chromosome:
825
+ # Try GRCh38 first
826
+ for assembly in [Assembly.GRCH38.value, Assembly.GRCH37.value]:
827
+ _genomic_acs, err_msg = self.seqrepo_access.translate_identifier(
828
+ f"{assembly}:chr{chromosome}", "refseq"
829
+ )
830
+ if err_msg:
831
+ return GenomicTxSeg(errors=[err_msg])
832
+ _genomic_ac = _genomic_acs[0].split(":")[-1]
634
833
 
635
- genes_alt_acs, warning = await self.uta_db.get_genes_and_alt_acs(
636
- pos, strand=strand, chromosome=chromosome, gene=gene
637
- )
638
- else:
639
- genes_alt_acs = None
834
+ _gene, err_msg = await self._get_genomic_ac_gene(
835
+ genomic_pos, _genomic_ac
836
+ )
837
+ if _gene:
838
+ if gene and _gene != gene:
839
+ return GenomicTxSeg(
840
+ errors=[f"Expected gene, {gene}, but found {_gene}"]
841
+ )
842
+ gene = _gene
843
+ genomic_ac = _genomic_ac
844
+ break
845
+
846
+ if not genomic_ac:
847
+ return GenomicTxSeg(
848
+ errors=[
849
+ f"Unable to get genomic RefSeq accession for chromosome {chromosome} on position {genomic_pos}"
850
+ ]
851
+ )
640
852
 
641
- if not genes_alt_acs:
642
- return self._return_warnings(resp, [warning])
853
+ if not gene:
854
+ return GenomicTxSeg(
855
+ errors=[
856
+ f"Unable to get gene given {genomic_ac} on position {genomic_pos}"
857
+ ]
858
+ )
643
859
 
644
- gene_alt_ac, warning = self._get_gene_and_alt_ac(genes_alt_acs, gene)
645
- if not gene_alt_ac:
646
- return self._return_warnings(resp, [warning])
647
- gene, alt_ac = gene_alt_ac
860
+ return await self._get_tx_seg_genomic_metadata(
861
+ genomic_ac, genomic_pos, is_start, gene, tx_ac=transcript
862
+ )
648
863
 
649
- if transcript is None:
650
- warnings = await self._set_mane_genomic_data(
651
- params, gene, alt_ac, pos, strand, is_start
652
- )
653
- if warnings:
654
- return self._return_warnings(resp, [warnings])
655
- else:
656
- params["transcript"] = transcript
657
- params["gene"] = gene
658
- params["pos"] = pos
659
- params["chr"] = alt_ac
660
- warning = await self._set_genomic_data(params, strand, is_start)
661
- if warning:
662
- return self._return_warnings(resp, [warning])
864
+ async def _get_grch38_ac_pos(
865
+ self, genomic_ac: str, genomic_pos: int, grch38_ac: str | None = None
866
+ ) -> tuple[str | None, int | None, str | None]:
867
+ """Get GRCh38 genomic representation for accession and position
868
+
869
+ :param genomic_ac: RefSeq genomic accession (GRCh37 or GRCh38 assembly)
870
+ :param genomic_pos: Genomic position on ``genomic_ac``
871
+ :param grch38_ac: A valid GRCh38 genomic accession for ``genomic_ac``. If not
872
+ provided, will attempt to retrieve associated GRCh38 accession from UTA.
873
+ :return: Tuple containing GRCh38 accession, GRCh38 position, and error message
874
+ if unable to get GRCh38 representation
875
+ """
876
+ if not grch38_ac:
877
+ grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
878
+ if not grch38_ac:
879
+ return None, None, f"Unrecognized genomic accession: {genomic_ac}."
663
880
 
664
- resp.transcript_exon_data = TranscriptExonData(**params)
665
- return resp
881
+ grch38_ac = grch38_ac[0]
666
882
 
667
- @staticmethod
668
- def _get_gene_and_alt_ac(
669
- genes_alt_acs: dict, gene: str | None
670
- ) -> tuple[tuple[str, str] | None, str | None]:
671
- """Return gene genomic accession
672
-
673
- :param genes_alt_acs: Dictionary containing genes and genomic accessions
674
- :param gene: Gene symbol
675
- :return: (Gene, Genomic accession) if both exist
676
- """
677
- alt_acs = genes_alt_acs["alt_acs"]
678
- len_alt_acs = len(alt_acs)
679
- if len_alt_acs > 1:
680
- return None, f"Found more than one accessions: {alt_acs}"
681
- if len_alt_acs == 0:
682
- return None, "No genomic accessions found"
683
- alt_ac = next(iter(alt_acs))
684
-
685
- genes = genes_alt_acs["genes"]
686
- len_genes = len(genes)
687
- input_gene = gene
688
- output_gene = None
689
- if len_genes == 1:
690
- output_gene = next(iter(genes))
691
- elif len_genes > 1:
692
- return None, f"Found more than one gene: {genes}"
693
- elif len_genes == 0:
694
- return None, "No genes found"
695
-
696
- if input_gene is not None and output_gene != input_gene.upper():
697
- return (
698
- None,
699
- f"Input gene, {input_gene}, does not match "
700
- f"expected output gene, {output_gene}",
883
+ if grch38_ac != genomic_ac:
884
+ # Ensure genomic_ac is GRCh37
885
+ chromosome, _ = self.seqrepo_access.translate_identifier(
886
+ genomic_ac, Assembly.GRCH37.value
701
887
  )
888
+ if not chromosome:
889
+ _logger.warning(
890
+ "SeqRepo could not find associated %s assembly for genomic accession %s.",
891
+ Assembly.GRCH37.value,
892
+ genomic_ac,
893
+ )
894
+ return (
895
+ None,
896
+ None,
897
+ f"`genomic_ac` must use {Assembly.GRCH37.value} or {Assembly.GRCH38.value} assembly.",
898
+ )
899
+
900
+ chromosome = chromosome[-1].split(":")[-1]
901
+ liftover_data = self.liftover.get_liftover(
902
+ chromosome, genomic_pos, Assembly.GRCH38
903
+ )
904
+ if liftover_data is None:
905
+ return (
906
+ None,
907
+ None,
908
+ f"Lifting over {genomic_pos} on {genomic_ac} from {Assembly.GRCH37.value} to {Assembly.GRCH38.value} was unsuccessful.",
909
+ )
910
+
911
+ genomic_pos = liftover_data[1]
912
+ genomic_ac = grch38_ac
702
913
 
703
- gene = output_gene if output_gene else input_gene
704
- return (gene, alt_ac), None
914
+ return genomic_ac, genomic_pos, None
705
915
 
706
- async def _set_mane_genomic_data(
916
+ async def _get_genomic_ac_gene(
707
917
  self,
708
- params: dict,
709
- gene: str,
710
- alt_ac: str,
711
918
  pos: int,
919
+ genomic_ac: str,
920
+ ) -> tuple[str | None, str | None]:
921
+ """Get gene given a genomic accession and position.
922
+
923
+ If multiple genes are found for a given ``pos`` and ``genomic_ac``, only one
924
+ gene will be returned.
925
+
926
+ :param pos: Genomic position on ``genomic_ac``
927
+ :param genomic_ac: RefSeq genomic accession, e.g. ``"NC_000007.14"``
928
+ :return: HGNC gene symbol associated to genomic accession and position and
929
+ warning
930
+ """
931
+ query = f"""
932
+ SELECT DISTINCT hgnc
933
+ FROM {self.uta_db.schema}.tx_exon_aln_v
934
+ WHERE alt_ac = '{genomic_ac}'
935
+ AND alt_aln_method = 'splign'
936
+ AND {pos} BETWEEN alt_start_i AND alt_end_i
937
+ ORDER BY hgnc
938
+ LIMIT 1;
939
+ """ # noqa: S608
940
+ results = await self.uta_db.execute_query(query)
941
+ if not results:
942
+ return None, f"No gene(s) found given {genomic_ac} on position {pos}"
943
+
944
+ return results[0]["hgnc"], None
945
+
946
+ def _get_tx_segment(
947
+ self,
948
+ genomic_ac: str,
712
949
  strand: Strand,
713
- is_start: bool,
714
- ) -> str | None:
715
- """Set genomic data in `params` found from MANE.
950
+ offset: int,
951
+ genomic_ac_data: ExonCoord,
952
+ is_seg_start: bool = False,
953
+ ) -> tuple[TxSegment | None, str | None]:
954
+ """Get transcript segment data given ``genomic_ac`` and offset data
716
955
 
717
- :param params: Parameters for response
718
- :param gene: Gene symbol
719
- :param alt_ac: Genomic accession
720
- :param pos: Genomic position
956
+ :param genomic_ac: Genomic RefSeq accession
721
957
  :param strand: Strand
722
- :param is_start: `True` if `pos` is start position. `False` if `pos` is end
723
- position.
724
- :return: Warnings if found
958
+ :param offset: Exon offset
959
+ :param genomic_ac_data: Exon coordinate data for ``genomic_ac``
960
+ :param is_seg_start: ``True`` if retrieving genomic data where the transcript
961
+ segment starts, defaults to ``False``
962
+ :return: Tuple containing transcript segment data (if successful) and error message (if unsuccessful)
725
963
  """
726
- start, end = get_inter_residue_pos(pos, pos, residue_mode=ResidueMode.ZERO)
727
- mane_data: (
728
- CdnaRepresentation | None
729
- ) = await self.mane_transcript.get_mane_transcript(
730
- alt_ac,
731
- start,
732
- end,
733
- AnnotationLayer.GENOMIC,
734
- gene=gene,
735
- try_longest_compatible=True,
736
- residue_mode=ResidueMode.INTER_RESIDUE,
737
- )
738
- if not mane_data:
739
- msg = f"Unable to find mane data for {alt_ac} with position {pos}"
740
- if gene:
741
- msg += f" on gene {gene}"
742
- _logger.warning(msg)
743
- return msg
744
-
745
- params["gene"] = mane_data.gene
746
- params["transcript"] = (
747
- mane_data.refseq
748
- if mane_data.refseq
749
- else mane_data.ensembl
750
- if mane_data.ensembl
751
- else None
752
- )
753
- tx_exons = await self._structure_exons(params["transcript"], alt_ac=alt_ac)
754
- if not tx_exons:
755
- return f"Unable to get exons for {params['transcript']}"
756
- tx_pos = mane_data.pos[0] + mane_data.coding_start_site
757
- params["exon"] = self._get_exon_number(tx_exons, tx_pos)
758
-
759
- try:
760
- tx_exon = tx_exons[params["exon"] - 1]
761
- except IndexError:
762
- msg = (
763
- f"{params['transcript']} with position {tx_pos} "
764
- f"does not exist on exons: {tx_exons}"
765
- )
766
- _logger.warning(msg)
767
- return msg
768
-
769
- strand_to_use = strand if strand is not None else mane_data.strand
770
- params["strand"] = strand_to_use
771
- self._set_exon_offset(
772
- params,
773
- tx_exon[0],
774
- tx_exon[1],
775
- tx_pos,
776
- is_start=is_start,
777
- strand=strand_to_use,
778
- )
964
+ if is_seg_start:
965
+ if strand == Strand.POSITIVE:
966
+ seg_genomic_pos = offset + genomic_ac_data.alt_start_i
967
+ else:
968
+ seg_genomic_pos = genomic_ac_data.alt_end_i - offset
969
+ else:
970
+ if strand == Strand.POSITIVE:
971
+ seg_genomic_pos = offset + genomic_ac_data.alt_end_i
972
+ else:
973
+ seg_genomic_pos = genomic_ac_data.alt_start_i - offset
779
974
 
780
- # Need to check if we need to change pos for liftover
781
- genomic_data, warnings = await self.uta_db.get_alt_ac_start_or_end(
782
- params["transcript"], tx_pos, tx_pos, gene
975
+ genomic_loc, err_msg = self._get_vrs_seq_loc(
976
+ genomic_ac,
977
+ seg_genomic_pos,
978
+ is_start=is_seg_start,
979
+ strand=strand,
783
980
  )
784
- if genomic_data is None:
785
- return warnings
786
-
787
- params["chr"] = genomic_data[1]
788
- genomic_coords = genomic_data[2], genomic_data[3]
789
- genomic_pos = genomic_coords[1] - 1 if is_start else genomic_coords[0] + 1
790
- params["pos"] = (
791
- genomic_pos - params["exon_offset"]
792
- if strand_to_use == -1
793
- else genomic_pos + params["exon_offset"]
981
+ if err_msg:
982
+ return None, err_msg
983
+
984
+ return TxSegment(
985
+ exon_ord=genomic_ac_data.ord,
986
+ genomic_location=genomic_loc,
987
+ offset=offset,
988
+ ), None
989
+
990
+ def _get_vrs_seq_loc(
991
+ self, genomic_ac: str, genomic_pos: int, is_start: bool, strand: Strand
992
+ ) -> tuple[SequenceLocation | None, str | None]:
993
+ """Create VRS Sequence Location for genomic position where transcript segment
994
+ occurs
995
+
996
+ :param genomic_ac: RefSeq genomic accession
997
+ :param genomic_pos: Genomic position where the transcript segment occurs
998
+ :param is_start: ``True`` if ``genomic_pos`` is where the transcript segment
999
+ starts. ``False`` if ``genomic_pos`` is where the transcript segment ends.
1000
+ :param strand: Strand
1001
+ :return: Tuple containing VRS location (if successful) and error message (if
1002
+ unable to get GA4GH identifier for ``genomic_ac``).
1003
+ """
1004
+ ga4gh_seq_id, err_msg = self.seqrepo_access.translate_identifier(
1005
+ genomic_ac, "ga4gh"
794
1006
  )
795
- return None
1007
+ if err_msg:
1008
+ return None, err_msg
796
1009
 
797
- async def _set_genomic_data(
798
- self, params: dict, strand: Strand, is_start: bool
799
- ) -> str | None:
800
- """Set genomic data in ``params``
1010
+ use_start = strand == Strand.POSITIVE if is_start else strand != Strand.POSITIVE
801
1011
 
802
- :param params: Parameters for response
803
- :param strand: Strand
804
- :param is_start: ``True`` if ``pos`` is start position. ``False`` if ``pos`` is
805
- end position.
806
- :return: Warnings if found
807
- """
808
- # We should always try to liftover
809
- grch38_ac = await self.uta_db.get_newest_assembly_ac(params["chr"])
810
- if not grch38_ac:
811
- return f"Invalid genomic accession: {params['chr']}"
1012
+ return SequenceLocation(
1013
+ sequenceReference=SequenceReference(
1014
+ refgetAccession=ga4gh_seq_id[0].split("ga4gh:")[-1]
1015
+ ),
1016
+ start=genomic_pos if use_start else None,
1017
+ end=genomic_pos if not use_start else None,
1018
+ ), None
812
1019
 
813
- grch38_ac = grch38_ac[0]
814
- if grch38_ac != params["chr"]: # params["chr"] is genomic accession
815
- # Liftover to 38
816
- descr = await self.uta_db.get_chr_assembly(params["chr"])
817
- if descr is None:
818
- return f"Unable to get chromosome and assembly for " f"{params['chr']}"
1020
+ async def _get_tx_seg_genomic_metadata(
1021
+ self,
1022
+ genomic_ac: str,
1023
+ genomic_pos: int,
1024
+ is_start: bool,
1025
+ gene: str,
1026
+ tx_ac: str | None,
1027
+ ) -> GenomicTxSeg:
1028
+ """Get transcript segment data and associated genomic metadata.
819
1029
 
820
- chromosome_number, assembly = descr
821
- liftover_data = self.liftover.get_liftover(
822
- chromosome_number, params["pos"], Assembly.GRCH38
823
- )
824
- if liftover_data is None:
825
- return (
826
- f"Position {params['pos']} does not exist on "
827
- f"chromosome {chromosome_number}"
828
- )
1030
+ Will liftover to GRCh38 assembly. If liftover is unsuccessful, will return
1031
+ errors.
829
1032
 
830
- params["pos"] = liftover_data[1]
831
- params["chr"] = grch38_ac
1033
+ If ``tx_ac`` is not provided, will attempt to retrieve MANE transcript.
832
1034
 
833
- tx_exons = await self._structure_exons(params["transcript"], alt_ac=grch38_ac)
1035
+ :param genomic_ac: Genomic RefSeq accession
1036
+ :param genomic_pos: Genomic position where the transcript segment occurs
1037
+ :param is_start: Whether or not ``genomic_pos`` represents the start position.
1038
+ :param gene: HGNC gene symbol
1039
+ :param tx_ac: Transcript RefSeq accession. If not provided, will use MANE
1040
+ transcript
1041
+ :return: Transcript segment data and associated genomic metadata
1042
+ """
1043
+ if tx_ac:
1044
+ # We should always try to liftover
1045
+ grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
1046
+ if not grch38_ac:
1047
+ return GenomicTxSeg(errors=[f"Invalid genomic accession: {genomic_ac}"])
1048
+ grch38_ac = grch38_ac[0]
1049
+ else:
1050
+ mane_data = self.mane_transcript_mappings.get_gene_mane_data(gene)
1051
+ if not mane_data:
1052
+ err_msg = f"Unable to find mane data for {genomic_ac} with position {genomic_pos}"
1053
+ if gene:
1054
+ err_msg += f" on gene {gene}"
1055
+ _logger.warning(err_msg)
1056
+ return GenomicTxSeg(errors=[err_msg])
1057
+
1058
+ mane_data = mane_data[0]
1059
+ tx_ac = mane_data["RefSeq_nuc"]
1060
+ grch38_ac = mane_data["GRCh38_chr"]
1061
+
1062
+ # Always liftover to GRCh38
1063
+ genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos(
1064
+ genomic_ac, genomic_pos, grch38_ac=grch38_ac
1065
+ )
1066
+ if err_msg:
1067
+ return GenomicTxSeg(errors=[err_msg])
1068
+
1069
+ tx_exons = await self._get_all_exon_coords(tx_ac, genomic_ac=grch38_ac)
834
1070
  if not tx_exons:
835
- return f"Unable to get exons for {params['transcript']}"
1071
+ return GenomicTxSeg(errors=[f"No exons found given {tx_ac}"])
836
1072
 
837
- data = await self.uta_db.get_tx_exon_aln_v_data(
838
- params["transcript"],
839
- params["pos"],
840
- params["pos"],
841
- alt_ac=params["chr"],
1073
+ tx_exon_aln_data = await self.uta_db.get_tx_exon_aln_v_data(
1074
+ tx_ac,
1075
+ genomic_pos,
1076
+ genomic_pos,
1077
+ alt_ac=genomic_ac,
842
1078
  use_tx_pos=False,
843
1079
  )
844
- if len(data) != 1:
845
- return (
846
- f"Must find exactly one row for genomic data, "
847
- f"but found: {len(data)}"
1080
+ if len(tx_exon_aln_data) != 1:
1081
+ return GenomicTxSeg(
1082
+ errors=[
1083
+ f"Must find exactly one row for genomic data, but found: {len(tx_exon_aln_data)}"
1084
+ ]
848
1085
  )
849
1086
 
850
- # Find exon number
851
- data = data[0]
852
- data_exons = data[2], data[3]
853
- i = 1
854
- found_tx_exon = False
855
- for exon in tx_exons:
856
- if data_exons == exon:
857
- found_tx_exon = True
858
- break
859
- i += 1
860
- if not found_tx_exon:
861
- # Either first or last
862
- i = 1 if data_exons == (0, tx_exons[0][1]) else i - 1
863
- params["exon"] = i
864
-
865
- strand_to_use = strand if strand is not None else Strand(data[7])
866
- params["strand"] = strand_to_use
867
- if not is_start:
868
- # convert back to inter-residue for end position
869
- params["pos"] += 1
870
- self._set_exon_offset(
871
- params,
872
- data[5] if is_start else data[5] + 1, # need to convert to inter-residue
873
- data[6] - 1 if is_start else data[6], # need to convert to inter-residue
874
- params["pos"],
875
- is_start=is_start,
876
- strand=strand_to_use,
1087
+ tx_exon_aln_data = tx_exon_aln_data[0]
1088
+
1089
+ offset = self._get_exon_offset(
1090
+ start_i=tx_exon_aln_data.alt_start_i,
1091
+ end_i=tx_exon_aln_data.alt_end_i,
1092
+ strand=Strand(tx_exon_aln_data.alt_strand),
1093
+ use_start_i=False, # This doesn't impact anything since we're on the exon
1094
+ is_in_exon=True,
1095
+ start=genomic_pos if is_start else None,
1096
+ end=genomic_pos if not is_start else None,
1097
+ )
1098
+
1099
+ genomic_location, err_msg = self._get_vrs_seq_loc(
1100
+ genomic_ac, genomic_pos, is_start, tx_exon_aln_data.alt_strand
1101
+ )
1102
+ if err_msg:
1103
+ return GenomicTxSeg(errors=[err_msg])
1104
+
1105
+ return GenomicTxSeg(
1106
+ gene=tx_exon_aln_data.hgnc,
1107
+ genomic_ac=genomic_ac,
1108
+ tx_ac=tx_exon_aln_data.tx_ac,
1109
+ seg=TxSegment(
1110
+ exon_ord=tx_exon_aln_data.ord,
1111
+ offset=offset,
1112
+ genomic_location=genomic_location,
1113
+ ),
877
1114
  )
878
- return None
879
1115
 
880
1116
  @staticmethod
881
- def _set_exon_offset(
882
- params: dict, start: int, end: int, pos: int, is_start: bool, strand: Strand
883
- ) -> None:
884
- """Set value for ``exon_offset`` in ``params``.
885
-
886
- :param params: Parameters for response
887
- :param start: Start exon coord (can be transcript or aligned genomic)
888
- :param end: End exon coord (can be transcript or aligned genomic)
889
- :param pos: Position change (can be transcript or genomic)
890
- :param is_start: ``True`` if ``pos`` is start position. ``False`` if ``pos`` is
891
- end position
1117
+ def _get_exon_offset(
1118
+ start_i: int,
1119
+ end_i: int,
1120
+ strand: Strand,
1121
+ use_start_i: bool = True,
1122
+ is_in_exon: bool = True,
1123
+ start: int | None = None,
1124
+ end: int | None = None,
1125
+ ) -> int:
1126
+ """Compute offset from exon start or end index
1127
+
1128
+ :param start_i: Exon start index (inter-residue)
1129
+ :param end_i: Exon end index (inter-residue)
892
1130
  :param strand: Strand
1131
+ :param use_start_i: Whether or not ``start_i`` should be used to compute the
1132
+ offset, defaults to ``True``. This is only used when ``is_in_exon`` is
1133
+ ``False``.
1134
+ :param is_in_exon: Whether or not the position occurs in an exon, defaults to
1135
+ ``True``
1136
+ :param start: Provided start position, defaults to ``None``. Must provide
1137
+ ``start`` or ``end``, not both.
1138
+ :param end: Provided end position, defaults to ``None``. Must provide ``start``
1139
+ or ``end``, not both
1140
+ :return: Offset from exon start or end index
893
1141
  """
894
- if is_start:
895
- if strand == Strand.NEGATIVE:
896
- params["exon_offset"] = end - pos
1142
+ if is_in_exon:
1143
+ if start is not None:
1144
+ offset = start - start_i if strand == Strand.POSITIVE else end_i - start
897
1145
  else:
898
- params["exon_offset"] = pos - end
1146
+ offset = end - end_i if strand == Strand.POSITIVE else start_i - end
899
1147
  else:
900
- if strand == Strand.NEGATIVE:
901
- params["exon_offset"] = start - pos
1148
+ if strand == Strand.POSITIVE:
1149
+ offset = start - start_i if use_start_i else end - end_i
902
1150
  else:
903
- params["exon_offset"] = pos - start
904
-
905
- async def _structure_exons(
906
- self, transcript: str, alt_ac: str | None = None
907
- ) -> list[tuple[int, int]]:
908
- """Structure exons as list of tuples.
909
-
910
- :param transcript: Transcript accession
911
- :param alt_ac: Genomic accession
912
- :return: List of tuples containing transcript exon coordinates
913
- """
914
- tx_exons, _ = await self.uta_db.get_tx_exons(transcript, alt_ac=alt_ac)
915
-
916
- if not tx_exons:
917
- return []
918
-
919
- return [(coords[0], coords[1]) for coords in tx_exons]
920
-
921
- @staticmethod
922
- def _get_exon_number(tx_exons: list, tx_pos: int) -> int:
923
- """Find related exon number for a position
924
-
925
- :param tx_exons: List of exon coordinates for a transcript
926
- :param tx_pos: Transcript position change
927
- :return: Exon number associated to transcript position change. Will be 1-based
928
- """
929
- i = 1
930
- for coords in tx_exons:
931
- if coords[0] <= tx_pos <= coords[1]:
932
- break
933
- i += 1
934
- return i
1151
+ offset = start_i - end if use_start_i else end_i - start
1152
+ return offset
935
1153
 
936
1154
  @staticmethod
937
1155
  def _get_adjacent_exon(
938
- tx_exons_genomic_coords: list[tuple[int, int, int, int, int]],
1156
+ tx_exons_genomic_coords: list[ExonCoord],
939
1157
  strand: Strand,
940
1158
  start: int | None = None,
941
1159
  end: int | None = None,
@@ -946,20 +1164,18 @@ class ExonGenomicCoordsMapper:
946
1164
  adjacent is defined as the exon following the breakpoint for the 5' end and the
947
1165
  exon preceding the breakpoint for the 3' end.
948
1166
 
949
- :param: tx_exons_genomic_coords: List of tuples describing exons and genomic
950
- coordinates for a transcript. Each tuple contains the transcript number
951
- (0-indexed), the transcript coordinates for the exon, and the genomic
952
- coordinates for the exon. Pos 0 in the tuple corresponds to the exon
953
- number, pos 1 and pos 2 refer to the start and end transcript coordinates,
954
- respectively, and pos 3 and 4 refer to the start and end genomic
955
- coordinates, respectively.
1167
+ :param tx_exons_genomic_coords: Transcript exon coordinate data
956
1168
  :param strand: Strand
957
- :param: start: Genomic coordinate of breakpoint
958
- :param: end: Genomic coordinate of breakpoint
959
- :return: Exon number corresponding to adjacent exon. Will be 1-based
1169
+ :param start: Genomic coordinate of breakpoint
1170
+ :param end: Genomic coordinate of breakpoint
1171
+ :return: Exon number corresponding to adjacent exon. Will be 0-based
960
1172
  """
961
1173
  for i in range(len(tx_exons_genomic_coords) - 1):
962
1174
  exon = tx_exons_genomic_coords[i]
1175
+ if start == exon.alt_start_i:
1176
+ break
1177
+ if end == exon.alt_end_i:
1178
+ break
963
1179
  next_exon = tx_exons_genomic_coords[i + 1]
964
1180
  bp = start if start else end
965
1181
  if strand == Strand.POSITIVE:
@@ -968,19 +1184,20 @@ class ExonGenomicCoordsMapper:
968
1184
  else:
969
1185
  lte_exon = next_exon
970
1186
  gte_exon = exon
971
- if bp >= lte_exon[4] and bp <= gte_exon[3]:
1187
+ if bp >= lte_exon.alt_end_i and bp <= gte_exon.alt_start_i:
972
1188
  break
973
1189
  # Return current exon if end position is provided, next exon if start position
974
- # is provided. exon[0] needs to be incremented by 1 in both cases as exons are
975
- # 0-based in UTA
976
- return exon[0] + 1 if end else exon[0] + 2
1190
+ # is provided.
1191
+ return exon.ord if end else exon.ord + 1
977
1192
 
978
1193
  @staticmethod
979
- def _is_exonic_breakpoint(pos: int, tx_genomic_coords: list) -> bool:
1194
+ def _is_exonic_breakpoint(pos: int, tx_genomic_coords: list[ExonCoord]) -> bool:
980
1195
  """Check if a breakpoint occurs on an exon
981
1196
 
982
1197
  :param pos: Genomic breakpoint
983
- :param tx_genomic_coords: A list of genomic coordinates for a transcript
984
- :return: True if the breakpoint occurs on an exon
1198
+ :param tx_genomic_coords: A list of transcript exon coordinate data
1199
+ :return: ``True`` if the breakpoint occurs on an exon
985
1200
  """
986
- return any(pos >= exon[3] and pos <= exon[4] for exon in tx_genomic_coords)
1201
+ return any(
1202
+ exon.alt_start_i <= pos <= exon.alt_end_i for exon in tx_genomic_coords
1203
+ )