cool-seq-tool 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,32 +1,224 @@
1
1
  """Provide mapping capabilities between transcript exon and genomic coordinates."""
2
2
 
3
3
  import logging
4
- from typing import Literal, TypeVar
4
+
5
+ from ga4gh.vrs.models import SequenceLocation, SequenceReference
6
+ from pydantic import ConfigDict, Field, StrictInt, StrictStr, model_validator
5
7
 
6
8
  from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess
7
9
  from cool_seq_tool.mappers.liftover import LiftOver
8
- from cool_seq_tool.mappers.mane_transcript import CdnaRepresentation, ManeTranscript
9
10
  from cool_seq_tool.schemas import (
10
- AnnotationLayer,
11
11
  Assembly,
12
- GenomicData,
13
- GenomicDataResponse,
14
- ResidueMode,
12
+ BaseModelForbidExtra,
13
+ ServiceMeta,
15
14
  Strand,
16
- TranscriptExonData,
17
- TranscriptExonDataResponse,
18
15
  )
19
16
  from cool_seq_tool.sources.mane_transcript_mappings import ManeTranscriptMappings
20
- from cool_seq_tool.sources.uta_database import UtaDatabase
21
- from cool_seq_tool.utils import get_inter_residue_pos, service_meta
22
-
23
- CoordinatesResponseType = TypeVar(
24
- "CoordinatesResponseType", GenomicDataResponse, TranscriptExonDataResponse
25
- )
17
+ from cool_seq_tool.sources.uta_database import GenomicAlnData, UtaDatabase
18
+ from cool_seq_tool.utils import service_meta
26
19
 
27
20
  _logger = logging.getLogger(__name__)
28
21
 
29
22
 
23
+ class _ExonCoord(BaseModelForbidExtra):
24
+ """Model for representing exon coordinate data"""
25
+
26
+ ord: StrictInt = Field(..., description="Exon number. 0-based.")
27
+ tx_start_i: StrictInt = Field(
28
+ ...,
29
+ description="Transcript start index of the exon. Inter-residue coordinates.",
30
+ )
31
+ tx_end_i: StrictInt = Field(
32
+ ..., description="Transcript end index of the exon. Inter-residue coordinates."
33
+ )
34
+ alt_start_i: StrictInt = Field(
35
+ ..., description="Genomic start index of the exon. Inter-residue coordinates."
36
+ )
37
+ alt_end_i: StrictInt = Field(
38
+ ..., description="Genomic end index of the exon. Inter-residue coordinates."
39
+ )
40
+ alt_strand: Strand = Field(..., description="Strand.")
41
+
42
+ model_config = ConfigDict(
43
+ json_schema_extra={
44
+ "example": {
45
+ "ord": 0,
46
+ "tx_start_i": 0,
47
+ "tx_end_i": 234,
48
+ "alt_start_i": 154191901,
49
+ "alt_end_i": 154192135,
50
+ "alt_strand": Strand.NEGATIVE,
51
+ }
52
+ }
53
+ )
54
+
55
+
56
+ class TxSegment(BaseModelForbidExtra):
57
+ """Model for representing transcript segment data."""
58
+
59
+ exon_ord: StrictInt = Field(..., description="Exon number. 0-based.")
60
+ offset: StrictInt = Field(
61
+ 0,
62
+ description="The value added to or subtracted from the `genomic_location` to find the start or end of an exon.",
63
+ )
64
+ genomic_location: SequenceLocation = Field(
65
+ ..., description="The genomic position of a transcript segment."
66
+ )
67
+
68
+ model_config = ConfigDict(
69
+ json_schema_extra={
70
+ "example": {
71
+ "exon_ord": 0,
72
+ "offset": 0,
73
+ "genomic_location": {
74
+ "type": "SequenceLocation",
75
+ "sequenceReference": {
76
+ "type": "SequenceReference",
77
+ "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
78
+ },
79
+ "end": 154192135,
80
+ },
81
+ }
82
+ }
83
+ )
84
+
85
+
86
+ class GenomicTxSeg(BaseModelForbidExtra):
87
+ """Model for representing a boundary for a transcript segment."""
88
+
89
+ seg: TxSegment | None = Field(None, description="Transcript segment.")
90
+ gene: StrictStr | None = Field(None, description="HGNC gene symbol.")
91
+ genomic_ac: StrictStr | None = Field(None, description="RefSeq genomic accession.")
92
+ tx_ac: StrictStr | None = Field(None, description="RefSeq transcript accession.")
93
+ errors: list[StrictStr] = Field([], description="Error messages.")
94
+
95
+ @model_validator(mode="before")
96
+ def check_errors(cls, values: dict) -> dict: # noqa: N805
97
+ """Ensure that fields are (un)set depending on errors
98
+
99
+ :param values: Values in model
100
+ :raises ValueError: If `seg`, `genomic_ac` and `tx_ac` are not
101
+ provided when there are no errors
102
+ :return: Values in model
103
+ """
104
+ if not values.get("errors") and not all(
105
+ (
106
+ values.get("seg"),
107
+ values.get("genomic_ac"),
108
+ values.get("tx_ac"),
109
+ )
110
+ ):
111
+ err_msg = "`seg`, `genomic_ac` and `tx_ac` must be provided"
112
+ raise ValueError(err_msg)
113
+ return values
114
+
115
+ model_config = ConfigDict(
116
+ json_schema_extra={
117
+ "example": {
118
+ "gene": "TPM3",
119
+ "genomic_ac": "NC_000001.11",
120
+ "tx_ac": "NM_152263.3",
121
+ "seg": {
122
+ "exon_ord": 0,
123
+ "offset": 0,
124
+ "genomic_location": {
125
+ "type": "SequenceLocation",
126
+ "sequenceReference": {
127
+ "type": "SequenceReference",
128
+ "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
129
+ },
130
+ "end": 154192135,
131
+ },
132
+ },
133
+ "errors": [],
134
+ }
135
+ }
136
+ )
137
+
138
+
139
+ class GenomicTxSegService(BaseModelForbidExtra):
140
+ """Service model for genomic and transcript data."""
141
+
142
+ gene: StrictStr | None = Field(None, description="HGNC gene symbol.")
143
+ genomic_ac: StrictStr | None = Field(None, description="RefSeq genomic accession.")
144
+ tx_ac: StrictStr | None = Field(None, description="RefSeq transcript accession.")
145
+ seg_start: TxSegment | None = Field(None, description="Start transcript segment.")
146
+ seg_end: TxSegment | None = Field(None, description="End transcript segment.")
147
+ errors: list[StrictStr] = Field([], description="Error messages.")
148
+ service_meta: ServiceMeta = Field(..., description="Service metadata.")
149
+
150
+ @model_validator(mode="before")
151
+ def add_meta_check_errors(cls, values: dict) -> dict: # noqa: N805
152
+ """Add service metadata to model and ensure that fields are (un)set depending
153
+ on errors
154
+
155
+ :param values: Values in model
156
+ :raises ValueError: If `genomic_ac`, `tx_ac` and `seg_start` or `seg_end`
157
+ not provided when there are no errors
158
+ :return: Values in model, including service metadata
159
+ """
160
+ values["service_meta"] = service_meta()
161
+ if not values.get("errors") and not all(
162
+ (
163
+ values.get("genomic_ac"),
164
+ values.get("tx_ac"),
165
+ values.get("seg_start") or values.get("seg_end"),
166
+ )
167
+ ):
168
+ err_msg = (
169
+ "`genomic_ac`, `tx_ac` and `seg_start` or `seg_end` must be provided"
170
+ )
171
+ raise ValueError(err_msg)
172
+
173
+ return values
174
+
175
+ model_config = ConfigDict(
176
+ json_schema_extra={
177
+ "example": {
178
+ "gene": "TPM3",
179
+ "genomic_ac": "NC_000001.11",
180
+ "tx_ac": "NM_152263.3",
181
+ "seg_start": {
182
+ "exon_ord": 0,
183
+ "offset": 0,
184
+ "genomic_location": {
185
+ "type": "SequenceLocation",
186
+ "sequenceReference": {
187
+ "type": "SequenceReference",
188
+ "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
189
+ },
190
+ "end": 154192135,
191
+ },
192
+ },
193
+ "seg_end": {
194
+ "exon_ord": 7,
195
+ "offset": 0,
196
+ "genomic_location": {
197
+ "type": "SequenceLocation",
198
+ "sequenceReference": {
199
+ "type": "SequenceReference",
200
+ "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
201
+ },
202
+ "start": 154170399,
203
+ },
204
+ },
205
+ }
206
+ }
207
+ )
208
+
209
+
210
+ def _return_service_errors(errors: list[str]) -> GenomicTxSegService:
211
+ """Log errors and return service object with errors.
212
+
213
+ :param errors: Error message(s)
214
+ :return: Service object with error messages.
215
+ """
216
+ for error in errors:
217
+ _logger.warning(error)
218
+
219
+ return GenomicTxSegService(errors=errors)
220
+
221
+
30
222
  class ExonGenomicCoordsMapper:
31
223
  """Provide capabilities for mapping transcript exon representation to/from genomic
32
224
  coordinate representation.
@@ -36,7 +228,6 @@ class ExonGenomicCoordsMapper:
36
228
  self,
37
229
  seqrepo_access: SeqRepoAccess,
38
230
  uta_db: UtaDatabase,
39
- mane_transcript: ManeTranscript,
40
231
  mane_transcript_mappings: ManeTranscriptMappings,
41
232
  liftover: LiftOver,
42
233
  ) -> None:
@@ -45,7 +236,7 @@ class ExonGenomicCoordsMapper:
45
236
  A lot of resources are required for initialization, so when defaults are enough,
46
237
  it's easiest to let the core CoolSeqTool class handle it for you:
47
238
 
48
- >>> from cool_seq_tool.app import CoolSeqTool
239
+ >>> from cool_seq_tool import CoolSeqTool
49
240
  >>> egc = CoolSeqTool().ex_g_coords_mapper
50
241
 
51
242
  Note that this class's public methods are all defined as ``async``, so they will
@@ -54,42 +245,22 @@ class ExonGenomicCoordsMapper:
54
245
 
55
246
  >>> import asyncio
56
247
  >>> result = asyncio.run(
57
- ... egc.transcript_to_genomic_coordinates(
58
- ... "NM_002529.3", exon_start=2, exon_end=17
59
- ... )
248
+ ... egc.tx_segment_to_genomic("NM_002529.3", exon_start=2, exon_end=17)
60
249
  ... )
61
250
  >>> result.genomic_data.start, result.genomic_data.end
62
251
  (156864428, 156881456)
63
252
 
64
253
  :param seqrepo_access: SeqRepo instance to give access to query SeqRepo database
65
254
  :param uta_db: UtaDatabase instance to give access to query UTA database
66
- :param mane_transcript: Instance to align to MANE or compatible representation
67
255
  :param mane_transcript_mappings: Instance to provide access to ManeTranscriptMappings class
68
256
  :param liftover: Instance to provide mapping between human genome assemblies
69
257
  """
70
258
  self.seqrepo_access = seqrepo_access
71
259
  self.uta_db = uta_db
72
- self.mane_transcript = mane_transcript
73
260
  self.mane_transcript_mappings = mane_transcript_mappings
74
261
  self.liftover = liftover
75
262
 
76
- @staticmethod
77
- def _return_warnings(
78
- resp: CoordinatesResponseType, warning_msg: list[str]
79
- ) -> CoordinatesResponseType:
80
- """Add warnings to response object
81
-
82
- :param resp: Response object
83
- :param warning_msg: Warning message(s) on why ``transcript_exon_data`` or
84
- ``genomic_data`` field is ``None``
85
- :return: Response object with warning message
86
- """
87
- for msg in warning_msg:
88
- _logger.warning(msg)
89
- resp.warnings.append(msg)
90
- return resp
91
-
92
- async def transcript_to_genomic_coordinates(
263
+ async def tx_segment_to_genomic(
93
264
  self,
94
265
  transcript: str,
95
266
  gene: str | None = None,
@@ -97,26 +268,30 @@ class ExonGenomicCoordsMapper:
97
268
  exon_start_offset: int = 0,
98
269
  exon_end: int | None = None,
99
270
  exon_end_offset: int = 0,
100
- ) -> GenomicDataResponse:
101
- """Get genomic data given transcript data.
271
+ ) -> GenomicTxSegService:
272
+ """Get aligned genomic data given transcript segment data.
102
273
 
103
274
  By default, transcript data is aligned to the GRCh38 assembly.
104
275
 
105
276
  >>> import asyncio
106
- >>> from cool_seq_tool.app import CoolSeqTool
277
+ >>> from cool_seq_tool import CoolSeqTool
107
278
  >>> egc = CoolSeqTool().ex_g_coords_mapper
108
279
  >>> tpm3 = asyncio.run(
109
- ... egc.transcript_to_genomic_coordinates(
280
+ ... egc.tx_segment_to_genomic(
110
281
  ... "NM_152263.3",
111
282
  ... gene="TPM3",
112
283
  ... exon_start=1,
113
284
  ... exon_end=8,
114
285
  ... )
115
286
  ... )
116
- >>> tpm3.genomic_data.chr, tpm3.genomic_data.start, tpm3.genomic_data.end
287
+ >>> (
288
+ ... tpm3.genomic_ac,
289
+ ... tpm3.seg_start.genomic_location.end,
290
+ ... tpm3.seg_end.genomic_location.start,
291
+ ... )
117
292
  ('NC_000001.11', 154192135, 154170399)
118
293
 
119
- :param transcript: Transcript accession
294
+ :param transcript: RefSeq transcript accession
120
295
  :param gene: HGNC gene symbol
121
296
  :param exon_start: Starting transcript exon number (1-based). If not provided,
122
297
  must provide ``exon_end``
@@ -126,422 +301,497 @@ class ExonGenomicCoordsMapper:
126
301
  :param exon_end_offset: Ending exon offset
127
302
  :return: GRCh38 genomic data (inter-residue coordinates)
128
303
  """
129
- resp = GenomicDataResponse(
130
- genomic_data=None, warnings=[], service_meta=service_meta()
131
- )
132
-
133
304
  # Ensure valid inputs
134
- warnings = []
135
- if not transcript:
136
- warnings.append("Must provide `transcript`")
137
- else:
138
- transcript = transcript.strip()
139
-
305
+ errors = []
140
306
  exon_start_exists, exon_end_exists = False, False
141
307
  if exon_start is not None:
142
308
  if exon_start < 1:
143
- warnings.append("`exon_start` cannot be less than 1")
309
+ errors.append("`exon_start` cannot be less than 1")
144
310
  exon_start_exists = True
145
311
 
146
312
  if exon_end is not None:
147
313
  if exon_end < 1:
148
- warnings.append("`exon_end` cannot be less than 1")
314
+ errors.append("`exon_end` cannot be less than 1")
149
315
  exon_end_exists = True
150
316
 
151
317
  if not exon_start_exists and not exon_end_exists:
152
- warnings.append("Must provide either `exon_start` or `exon_end`")
318
+ errors.append("Must provide either `exon_start` or `exon_end`")
153
319
  if exon_start_exists and exon_end_exists and (exon_start > exon_end):
154
- warnings.append(
320
+ errors.append(
155
321
  f"Start exon {exon_start} is greater than end exon {exon_end}"
156
322
  )
157
323
 
158
- if warnings:
159
- return self._return_warnings(resp, warnings)
160
-
161
- # Get all exons and associated start/end coordinates for transcript
162
- tx_exons, warning = await self.uta_db.get_tx_exons(transcript)
163
- if not tx_exons:
164
- return self._return_warnings(resp, [warning] if warning else [])
324
+ if errors:
325
+ return _return_service_errors(errors)
165
326
 
166
327
  # Get exon start and exon end coordinates
167
- tx_exon_coords, warning = self.get_tx_exon_coords(
168
- transcript, tx_exons, exon_start, exon_end
328
+ (
329
+ tx_exon_start_coords,
330
+ tx_exon_end_coords,
331
+ errors,
332
+ ) = await self._get_start_end_exon_coords(
333
+ transcript, exon_start=exon_start, exon_end=exon_end
169
334
  )
170
- if not tx_exon_coords:
171
- return self._return_warnings(resp, [warning] if warning else [])
172
- tx_exon_start_coords, tx_exon_end_coords = tx_exon_coords
335
+ if errors:
336
+ return _return_service_errors(errors)
173
337
 
174
338
  if gene:
175
- gene = gene.upper().strip()
339
+ gene = gene.upper()
176
340
 
177
341
  # Get aligned genomic data (hgnc gene, alt_ac, alt_start_i, alt_end_i, strand)
178
342
  # for exon(s)
179
- alt_ac_start_end, warning = await self._get_alt_ac_start_and_end(
343
+ (
344
+ genomic_aln_start,
345
+ genomic_aln_end,
346
+ err_msg,
347
+ ) = await self._get_genomic_aln_coords(
180
348
  transcript, tx_exon_start_coords, tx_exon_end_coords, gene=gene
181
349
  )
182
- if not alt_ac_start_end:
183
- return self._return_warnings(resp, [warning] if warning else [])
184
- alt_ac_start_data, alt_ac_end_data = alt_ac_start_end
350
+ if err_msg:
351
+ return _return_service_errors([err_msg])
185
352
 
186
353
  # Get gene and chromosome data, check that at least one was retrieved
187
- gene = alt_ac_start_data[0] if alt_ac_start_data else alt_ac_end_data[0]
188
- chromosome = alt_ac_start_data[1] if alt_ac_start_data else alt_ac_end_data[1]
189
- if gene is None or chromosome is None:
190
- return self._return_warnings(
191
- resp,
354
+ gene = genomic_aln_start.hgnc if genomic_aln_start else genomic_aln_end.hgnc
355
+ genomic_ac = (
356
+ genomic_aln_start.alt_ac if genomic_aln_start else genomic_aln_end.alt_ac
357
+ )
358
+ if gene is None or genomic_ac is None:
359
+ return _return_service_errors(
192
360
  [
193
- "Unable to retrieve `gene` or `chromosome` from genomic start and genomic end data"
361
+ "Unable to retrieve `gene` or `genomic_ac` from genomic start and genomic end data"
194
362
  ],
195
363
  )
196
364
 
197
- g_start = alt_ac_start_data[3] - 1 if alt_ac_start_data else None
198
- g_end = alt_ac_end_data[2] + 1 if alt_ac_end_data else None
199
365
  strand = (
200
- Strand(alt_ac_start_data[4])
201
- if alt_ac_start_data
202
- else Strand(alt_ac_end_data[4])
366
+ Strand(genomic_aln_start.alt_strand)
367
+ if genomic_aln_start
368
+ else Strand(genomic_aln_end.alt_strand)
203
369
  )
204
370
 
205
- # Using none since could set to 0
206
- start_exits = g_start is not None
207
- end_exists = g_end is not None
208
-
209
- # Calculate offsets
210
- if strand == Strand.NEGATIVE:
211
- start_offset = exon_start_offset * -1 if start_exits else None
212
- end_offset = exon_end_offset * -1 if end_exists else 0
371
+ if exon_start_exists:
372
+ seg_start, err_msg = self._get_tx_segment(
373
+ genomic_ac,
374
+ strand,
375
+ exon_start_offset,
376
+ genomic_aln_start,
377
+ is_seg_start=True,
378
+ )
379
+ if err_msg:
380
+ return _return_service_errors([err_msg])
213
381
  else:
214
- start_offset = exon_start_offset if start_exits else 0
215
- end_offset = exon_end_offset if end_exists else 0
216
-
217
- # Get genomic coordinates with offsets included
218
- g_start = g_start + start_offset if start_exits else None
219
- g_end = g_end + end_offset if end_exists else None
382
+ seg_start = None
383
+
384
+ if exon_end_exists:
385
+ seg_end, err_msg = self._get_tx_segment(
386
+ genomic_ac,
387
+ strand,
388
+ exon_end_offset,
389
+ genomic_aln_end,
390
+ is_seg_start=False,
391
+ )
392
+ if err_msg:
393
+ return _return_service_errors([err_msg])
394
+ else:
395
+ seg_end = None
220
396
 
221
- resp.genomic_data = GenomicData(
397
+ return GenomicTxSegService(
222
398
  gene=gene,
223
- chr=chromosome,
224
- start=g_start,
225
- end=g_end,
226
- exon_start=exon_start if start_exits else None,
227
- exon_start_offset=exon_start_offset,
228
- exon_end=exon_end if end_exists else None,
229
- exon_end_offset=exon_end_offset,
230
- transcript=transcript,
231
- strand=strand,
399
+ genomic_ac=genomic_ac,
400
+ tx_ac=transcript,
401
+ seg_start=seg_start,
402
+ seg_end=seg_end,
232
403
  )
233
404
 
234
- return resp
235
-
236
- async def genomic_to_transcript_exon_coordinates(
405
+ async def genomic_to_tx_segment(
237
406
  self,
238
407
  chromosome: str | None = None,
239
- alt_ac: str | None = None,
240
- start: int | None = None,
241
- end: int | None = None,
242
- strand: Strand | None = None,
408
+ genomic_ac: str | None = None,
409
+ seg_start_genomic: int | None = None,
410
+ seg_end_genomic: int | None = None,
243
411
  transcript: str | None = None,
244
412
  get_nearest_transcript_junction: bool = False,
245
413
  gene: str | None = None,
246
- residue_mode: Literal[ResidueMode.INTER_RESIDUE]
247
- | Literal[ResidueMode.RESIDUE] = ResidueMode.RESIDUE,
248
- ) -> GenomicDataResponse:
249
- """Get transcript data for genomic data, lifted over to GRCh38.
414
+ ) -> GenomicTxSegService:
415
+ """Get transcript segment data for genomic data, lifted over to GRCh38.
416
+
417
+ If liftover to GRCh38 is unsuccessful, will return errors.
418
+
419
+ Must provide inter-residue coordinates.
250
420
 
251
421
  MANE Transcript data will be returned if and only if ``transcript`` is not
252
422
  supplied. ``gene`` must be given in order to retrieve MANE Transcript data.
253
423
 
254
424
  >>> import asyncio
255
- >>> from cool_seq_tool.app import CoolSeqTool
425
+ >>> from cool_seq_tool import CoolSeqTool
256
426
  >>> from cool_seq_tool.schemas import Strand
257
427
  >>> egc = CoolSeqTool().ex_g_coords_mapper
258
428
  >>> result = asyncio.run(
259
- ... egc.genomic_to_transcript_exon_coordinates(
260
- ... alt_ac="NC_000001.11",
261
- ... start=154192136,
262
- ... end=154170400,
263
- ... strand=Strand.NEGATIVE,
429
+ ... egc.genomic_to_tx_segment(
430
+ ... genomic_ac="NC_000001.11",
431
+ ... seg_start_genomic=154192135,
432
+ ... seg_end_genomic=154170399,
264
433
  ... transcript="NM_152263.3",
265
434
  ... )
266
435
  ... )
267
- >>> result.genomic_data.exon_start, result.genomic_data.exon_end
268
- (1, 8)
436
+ >>> result.seg_start.exon_ord, result.seg_end.exon_ord
437
+ (0, 7)
269
438
 
270
439
  :param chromosome: e.g. ``"1"`` or ``"chr1"``. If not provided, must provide
271
- ``alt_ac``. If ``alt_ac`` is also provided, ``alt_ac`` will be used.
272
- :param alt_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided,
273
- must provide ``chromosome. If ``chromosome`` is also provided, ``alt_ac``
274
- will be used.
275
- :param start: Start genomic position
276
- :param end: End genomic position
277
- :param strand: Strand
440
+ ``genomic_ac``. If ``genomic_ac`` is also provided, ``genomic_ac`` will be
441
+ used.
442
+ :param genomic_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided,
443
+ must provide ``chromosome``. If ``chromosome`` is also provided,
444
+ ``genomic_ac`` will be used.
445
+ :param seg_start_genomic: Genomic position where the transcript segment starts
446
+ :param seg_end_genomic: Genomic position where the transcript segment ends
278
447
  :param transcript: The transcript to use. If this is not given, we will try the
279
448
  following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining
280
449
  Compatible Transcript. See the :ref:`Transcript Selection policy <transcript_selection_policy>`
281
450
  page.
282
451
  :param get_nearest_transcript_junction: If ``True``, this will return the
283
- adjacent exon if the position specified by``start`` or ``end`` does not
284
- occur on an exon. For the positive strand, adjacent is defined as the exon
285
- preceding the breakpoint for the 5' end and the exon following the
286
- breakpoint for the 3' end. For the negative strand, adjacent is defined as
287
- the exon following the breakpoint for the 5' end and the exon preceding the
288
- breakpoint for the 3' end.
452
+ adjacent exon if the position specified by ``seg_start_genomic`` or
453
+ ``seg_end_genomic`` does not occur on an exon. For the positive strand, adjacent
454
+ is defined as the exon preceding the breakpoint for the 5' end and the exon
455
+ following the breakpoint for the 3' end. For the negative strand, adjacent
456
+ is defined as the exon following the breakpoint for the 5' end and the exon
457
+ preceding the breakpoint for the 3' end.
289
458
  :param gene: gene name. Ideally, HGNC symbol. Must be given if no ``transcript``
290
459
  value is provided.
291
- :param residue_mode: Residue mode for ``start`` and ``end``
460
+ :param coordinate_type: Coordinate type for ``seg_start_genomic`` and
461
+ ``seg_end_genomic``
292
462
  :return: Genomic data (inter-residue coordinates)
293
463
  """
294
- resp = GenomicDataResponse(
295
- genomic_data=None, warnings=[], service_meta=service_meta()
296
- )
297
- warnings = []
298
- if start is None and end is None:
299
- warnings.append("Must provide either `start` or `end`")
300
- if chromosome is None and alt_ac is None:
301
- warnings.append("Must provide either `chromosome` or `alt_ac`")
464
+ errors = []
465
+ if seg_start_genomic is None and seg_end_genomic is None:
466
+ errors.append(
467
+ "Must provide either `seg_start_genomic` or `seg_end_genomic`"
468
+ )
469
+ if chromosome is None and genomic_ac is None:
470
+ errors.append("Must provide either `chromosome` or `alt_ac`")
302
471
  if transcript is None and gene is None:
303
- warnings.append("Must provide either `gene` or `transcript`")
304
- if warnings:
305
- return self._return_warnings(resp, warnings)
472
+ errors.append("Must provide either `gene` or `transcript`")
473
+ if errors:
474
+ return _return_service_errors(errors)
306
475
 
307
- params = {key: None for key in GenomicData.model_fields}
308
476
  if gene is not None:
309
- gene = gene.upper().strip()
310
-
311
- if start:
312
- if residue_mode == ResidueMode.RESIDUE:
313
- # zero-based for UTA
314
- start -= 1
315
- residue_mode = ResidueMode.ZERO
316
- start_data = await self._genomic_to_transcript_exon_coordinate(
317
- start,
477
+ gene = gene.upper()
478
+
479
+ params = {}
480
+
481
+ if seg_start_genomic:
482
+ start_tx_seg_data = await self._genomic_to_tx_segment(
483
+ seg_start_genomic,
318
484
  chromosome=chromosome,
319
- alt_ac=alt_ac,
320
- strand=strand,
485
+ genomic_ac=genomic_ac,
321
486
  transcript=transcript,
322
487
  gene=gene,
323
488
  get_nearest_transcript_junction=get_nearest_transcript_junction,
324
- is_start=True,
489
+ is_seg_start=True,
325
490
  )
326
- if start_data.transcript_exon_data:
327
- start_data = start_data.transcript_exon_data.model_dump()
328
- else:
329
- return self._return_warnings(resp, [start_data.warnings[0]])
491
+ if start_tx_seg_data.errors:
492
+ return _return_service_errors(start_tx_seg_data.errors)
493
+
494
+ params["gene"] = start_tx_seg_data.gene
495
+ params["genomic_ac"] = start_tx_seg_data.genomic_ac
496
+ params["tx_ac"] = start_tx_seg_data.tx_ac
497
+ params["seg_start"] = start_tx_seg_data.seg
330
498
  else:
331
- start_data = None
499
+ start_tx_seg_data = None
332
500
 
333
- if end:
334
- end -= 1
335
- residue_mode = ResidueMode.ZERO
336
- end_data = await self._genomic_to_transcript_exon_coordinate(
337
- end,
501
+ if seg_end_genomic:
502
+ end_tx_seg_data = await self._genomic_to_tx_segment(
503
+ seg_end_genomic,
338
504
  chromosome=chromosome,
339
- alt_ac=alt_ac,
340
- strand=strand,
505
+ genomic_ac=genomic_ac,
341
506
  transcript=transcript,
342
507
  gene=gene,
343
508
  get_nearest_transcript_junction=get_nearest_transcript_junction,
344
- is_start=False,
509
+ is_seg_start=False,
345
510
  )
346
- if end_data.transcript_exon_data:
347
- end_data = end_data.transcript_exon_data.model_dump()
348
- else:
349
- return self._return_warnings(resp, [end_data.warnings[0]])
350
- else:
351
- end_data = None
352
-
353
- for field in ["transcript", "gene", "chr", "strand"]:
354
- if start_data:
355
- if end_data and (start_data[field] != end_data[field]):
356
- msg = (
357
- f"Start `{field}`, {start_data[field]}, does "
358
- f"not match End `{field}`, {end_data[field]}"
359
- )
360
- return self._return_warnings(resp, [msg])
361
- params[field] = start_data[field]
511
+ if end_tx_seg_data.errors:
512
+ return _return_service_errors(end_tx_seg_data.errors)
513
+
514
+ if start_tx_seg_data:
515
+ # Need to check that gene, genomic_ac, tx_ac all match
516
+ errors = []
517
+ for attr in ["gene", "genomic_ac", "tx_ac"]:
518
+ start_seg_attr = params[attr]
519
+ end_seg_attr = getattr(end_tx_seg_data, attr)
520
+ if start_seg_attr != end_seg_attr:
521
+ errors.append(
522
+ f"Start end end segment mismatch for `{attr}`. {start_seg_attr} != {end_seg_attr}."
523
+ )
524
+ if errors:
525
+ return _return_service_errors(errors)
362
526
  else:
363
- params[field] = end_data[field]
527
+ params["gene"] = end_tx_seg_data.gene
528
+ params["genomic_ac"] = end_tx_seg_data.genomic_ac
529
+ params["tx_ac"] = end_tx_seg_data.tx_ac
364
530
 
365
- if gene and gene != params["gene"]:
366
- msg = (
367
- f"Input gene, {gene}, does not match expected output"
368
- f"gene, {params['gene']}"
369
- )
370
- return self._return_warnings(resp, [msg])
531
+ params["seg_end"] = end_tx_seg_data.seg
371
532
 
372
- for label, data in [("start", start_data), ("end", end_data)]:
373
- if data:
374
- params[label] = data["pos"]
375
- params[f"exon_{label}"] = data["exon"]
376
- params[f"exon_{label}_offset"] = data["exon_offset"]
377
- resp.genomic_data = GenomicData(**params)
378
- return resp
533
+ return GenomicTxSegService(**params)
379
534
 
380
- @staticmethod
381
- def _validate_exon(
382
- transcript: str, tx_exons: list[tuple[int, int]], exon_number: int
383
- ) -> tuple[tuple[int, int] | None, str | None]:
384
- """Validate that exon number exists on a given transcript
385
-
386
- :param transcript: Transcript accession
387
- :param tx_exons: List of transcript's exons and associated coordinates
388
- :param exon_number: Exon number to validate
389
- :return: Exon coordinates for a given exon number and warnings if found
390
- """
391
- msg = f"Exon {exon_number} does not exist on {transcript}"
392
- try:
393
- if exon_number < 1:
394
- return None, msg
395
- exon = tx_exons[exon_number - 1]
396
- except IndexError:
397
- return None, msg
398
- return exon, None
399
-
400
- def get_tx_exon_coords(
535
+ async def _get_start_end_exon_coords(
401
536
  self,
402
- transcript: str,
403
- tx_exons: list[tuple[int, int]],
537
+ tx_ac: str,
404
538
  exon_start: int | None = None,
405
539
  exon_end: int | None = None,
406
- ) -> tuple[
407
- tuple[tuple[int, int] | None, tuple[int, int] | None] | None,
408
- str | None,
409
- ]:
410
- """Get exon coordinates for ``exon_start`` and ``exon_end``
411
-
412
- :param transcript: Transcript accession
413
- :param tx_exons: List of all transcript exons and coordinates
414
- :param exon_start: Start exon number
415
- :param exon_end: End exon number
416
- :return: [Transcript start exon coords, Transcript end exon coords],
417
- and warnings if found
540
+ genomic_ac: str | None = None,
541
+ ) -> tuple[_ExonCoord | None, _ExonCoord | None, list[str]]:
542
+ """Get exon coordinates for a transcript given exon start and exon end.
543
+
544
+ If ``genomic_ac`` is NOT provided, this method will use the GRCh38 accession
545
+ associated to ``tx_ac``.
546
+
547
+ :param tx_ac: The RefSeq transcript accession to get exon data for.
548
+ :param exon_start: Start exon number to get coordinate data for. 1-based.
549
+ :param exon_end: End exon number to get coordinate data for. 1-based.
550
+ :param genomic_ac: The RefSeq genomic accession to get exon data for.
551
+ :return: Tuple containing start exon coordinate data, end exon coordinate data,
552
+ and list of errors. The exon coordinate data will include the exon number,
553
+ transcript and genomic positions for the start and end of the exon, and
554
+ strand.
418
555
  """
419
- if exon_start is not None:
420
- tx_exon_start, warning = self._validate_exon(
421
- transcript, tx_exons, exon_start
422
- )
423
- if not tx_exon_start:
424
- return None, warning
425
- else:
426
- tx_exon_start = None
427
-
428
- if exon_end is not None:
429
- tx_exon_end, warning = self._validate_exon(transcript, tx_exons, exon_end)
430
- if not tx_exon_end:
431
- return None, warning
556
+ tx_exons = await self._get_all_exon_coords(tx_ac, genomic_ac=genomic_ac)
557
+ if not tx_exons:
558
+ return None, None, [f"No exons found given {tx_ac}"]
559
+
560
+ errors = []
561
+ start_end_exons = []
562
+ for exon_num in [exon_start, exon_end]:
563
+ if exon_num is not None:
564
+ try:
565
+ start_end_exons.append(tx_exons[exon_num - 1])
566
+ continue
567
+ except IndexError:
568
+ errors.append(f"Exon {exon_num} does not exist on {tx_ac}")
569
+ start_end_exons.append(None)
570
+
571
+ if errors:
572
+ start_end_exons = [None, None]
573
+
574
+ return *start_end_exons, errors
575
+
576
+ async def _get_all_exon_coords(
577
+ self, tx_ac: str, genomic_ac: str | None = None
578
+ ) -> list[_ExonCoord]:
579
+ """Get all exon coordinate data for a transcript.
580
+
581
+ If ``genomic_ac`` is NOT provided, this method will use the GRCh38 accession
582
+ associated to ``tx_ac``.
583
+
584
+ :param tx_ac: The RefSeq transcript accession to get exon data for.
585
+ :param genomic_ac: The RefSeq genomic accession to get exon data for.
586
+ :return: List of all exon coordinate data for ``tx_ac`` and ``genomic_ac``.
587
+ The exon coordinate data will include the exon number, transcript and
588
+ genomic positions for the start and end of the exon, and strand.
589
+ The list will be ordered by ascending exon number.
590
+ """
591
+ if genomic_ac:
592
+ query = f"""
593
+ SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
594
+ FROM {self.uta_db.schema}.tx_exon_aln_v
595
+ WHERE tx_ac = '{tx_ac}'
596
+ AND alt_aln_method = 'splign'
597
+ AND alt_ac = '{genomic_ac}'
598
+ ORDER BY ord ASC
599
+ """ # noqa: S608
432
600
  else:
433
- tx_exon_end = None
434
- return (tx_exon_start, tx_exon_end), None
435
-
436
- async def _get_alt_ac_start_and_end(
601
+ query = f"""
602
+ SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
603
+ FROM {self.uta_db.schema}.tx_exon_aln_v as t
604
+ INNER JOIN {self.uta_db.schema}._seq_anno_most_recent as s
605
+ ON t.alt_ac = s.ac
606
+ WHERE s.descr = ''
607
+ AND t.tx_ac = '{tx_ac}'
608
+ AND t.alt_aln_method = 'splign'
609
+ AND t.alt_ac like 'NC_000%'
610
+ ORDER BY ord ASC
611
+ """ # noqa: S608
612
+
613
+ results = await self.uta_db.execute_query(query)
614
+ return [_ExonCoord(**r) for r in results]
615
+
616
+ async def _get_genomic_aln_coords(
437
617
  self,
438
618
  tx_ac: str,
439
- tx_exon_start: tuple[int, int] | None = None,
440
- tx_exon_end: tuple[int, int] | None = None,
619
+ tx_exon_start: _ExonCoord | None = None,
620
+ tx_exon_end: _ExonCoord | None = None,
441
621
  gene: str | None = None,
442
- ) -> tuple[tuple[tuple[int, int], tuple[int, int]] | None, str | None]:
622
+ ) -> tuple[GenomicAlnData | None, GenomicAlnData | None, str | None]:
443
623
  """Get aligned genomic coordinates for transcript exon start and end.
444
624
 
625
+ ``tx_exon_start`` and ``tx_exon_end`` is expected to reference the same
626
+ transcript and genomic accession.
627
+
445
628
  :param tx_ac: Transcript accession
446
629
  :param tx_exon_start: Transcript's exon start coordinates. If not provided,
447
630
  must provide ``tx_exon_end``
448
631
  :param tx_exon_end: Transcript's exon end coordinates. If not provided, must
449
632
  provide ``tx_exon_start``
450
633
  :param gene: HGNC gene symbol
451
- :return: Aligned genomic data, and warnings if found
634
+ :return: Tuple containing aligned genomic data for start and end exon and
635
+ warnings if found
452
636
  """
453
637
  if tx_exon_start is None and tx_exon_end is None:
454
638
  msg = "Must provide either `tx_exon_start` or `tx_exon_end` or both"
455
639
  _logger.warning(msg)
456
- return None, msg
640
+ return None, None, msg
457
641
 
458
- alt_ac_data = {"start": None, "end": None}
642
+ aligned_coords = {"start": None, "end": None}
459
643
  for exon, key in [(tx_exon_start, "start"), (tx_exon_end, "end")]:
460
644
  if exon:
461
- alt_ac_val, warning = await self.uta_db.get_alt_ac_start_or_end(
462
- tx_ac, exon[0], exon[1], gene=gene
645
+ aligned_coord, warning = await self.uta_db.get_alt_ac_start_or_end(
646
+ tx_ac, exon.tx_start_i, exon.tx_end_i, gene=gene
463
647
  )
464
- if alt_ac_val:
465
- alt_ac_data[key] = alt_ac_val
648
+ if aligned_coord:
649
+ aligned_coords[key] = aligned_coord
466
650
  else:
467
- return None, warning
468
-
469
- alt_ac_data_values = alt_ac_data.values()
470
- # Validate that start and end alignments have matching gene, genomic accession,
471
- # and strand
472
- if all(alt_ac_data_values):
473
- for i in (0, 1, 4):
474
- if alt_ac_data["start"][i] != alt_ac_data["end"][i]:
475
- if i == 0:
476
- error = "HGNC gene symbol does not match"
477
- elif i == 1:
478
- error = "Genomic accession does not match"
479
- else:
480
- error = "Strand does not match"
481
- _logger.warning(
482
- "%s: %s != %s",
483
- error,
484
- alt_ac_data["start"][i],
485
- alt_ac_data["end"][i],
486
- )
487
- return None, error
488
- return tuple(alt_ac_data_values), None
651
+ return None, None, warning
652
+
653
+ return *aligned_coords.values(), None
489
654
 
490
- async def _genomic_to_transcript_exon_coordinate(
655
+ def _get_tx_segment(
491
656
  self,
492
- pos: int,
657
+ genomic_ac: str,
658
+ strand: Strand,
659
+ offset: int,
660
+ genomic_ac_data: _ExonCoord,
661
+ is_seg_start: bool = False,
662
+ ) -> tuple[TxSegment | None, str | None]:
663
+ """Get transcript segment data given ``genomic_ac`` and offset data
664
+
665
+ :param genomic_ac: Genomic RefSeq accession
666
+ :param strand: Strand
667
+ :param offset: Exon offset
668
+ :param genomic_ac_data: Exon coordinate data for ``genomic_ac``
669
+ :param is_seg_start: ``True`` if retrieving genomic data where the transcript
670
+ segment starts, defaults to ``False``
671
+ :return: Transcript segment data
672
+ """
673
+ if is_seg_start:
674
+ if strand == Strand.POSITIVE:
675
+ seg_genomic_pos = offset + genomic_ac_data.alt_start_i
676
+ else:
677
+ seg_genomic_pos = genomic_ac_data.alt_end_i - offset
678
+ else:
679
+ if strand == Strand.POSITIVE:
680
+ seg_genomic_pos = offset + genomic_ac_data.alt_end_i
681
+ else:
682
+ seg_genomic_pos = genomic_ac_data.alt_start_i - offset
683
+
684
+ genomic_loc, err_msg = self._get_vrs_seq_loc(
685
+ genomic_ac,
686
+ seg_genomic_pos,
687
+ is_seg_start=is_seg_start,
688
+ strand=strand,
689
+ )
690
+ if err_msg:
691
+ return None, err_msg
692
+
693
+ return TxSegment(
694
+ exon_ord=genomic_ac_data.ord,
695
+ genomic_location=genomic_loc,
696
+ offset=offset,
697
+ ), None
698
+
699
+ def _get_vrs_seq_loc(
700
+ self, genomic_ac: str, genomic_pos: int, is_seg_start: bool, strand: Strand
701
+ ) -> tuple[SequenceLocation | None, str | None]:
702
+ """Create VRS Sequence Location for genomic position where transcript segment
703
+ occurs
704
+
705
+ :param genomic_ac: RefSeq genomic accession
706
+ :param genomic_pos: Genomic position where the transcript segment occurs
707
+ :param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment
708
+ starts. ``False`` if ``genomic_pos`` is where the transcript segment ends.
709
+ :param strand: Strand
710
+ :return: Tuple containing VRS location (if successful) and error message (if
711
+ unable to get GA4GH identifier for ``genomic_ac``).
712
+ """
713
+ ga4gh_seq_id, err_msg = self.seqrepo_access.translate_identifier(
714
+ genomic_ac, "ga4gh"
715
+ )
716
+ if err_msg:
717
+ return None, err_msg
718
+
719
+ use_start = (
720
+ strand == Strand.POSITIVE if is_seg_start else strand != Strand.POSITIVE
721
+ )
722
+
723
+ return SequenceLocation(
724
+ sequenceReference=SequenceReference(
725
+ refgetAccession=ga4gh_seq_id[0].split("ga4gh:")[-1]
726
+ ),
727
+ start=genomic_pos if use_start else None,
728
+ end=genomic_pos if not use_start else None,
729
+ ), None
730
+
731
+ async def _genomic_to_tx_segment(
732
+ self,
733
+ genomic_pos: int,
493
734
  chromosome: str | None = None,
494
- alt_ac: str | None = None,
495
- strand: Strand | None = None,
735
+ genomic_ac: str | None = None,
496
736
  transcript: str | None = None,
497
737
  gene: str | None = None,
498
738
  get_nearest_transcript_junction: bool = False,
499
- is_start: bool = True,
500
- ) -> TranscriptExonDataResponse:
501
- """Convert individual genomic data to transcript data
739
+ is_seg_start: bool = True,
740
+ ) -> GenomicTxSeg:
741
+ """Given genomic data, generate a boundary for a transcript segment.
742
+
743
+ Will liftover to GRCh38 assembly. If liftover is unsuccessful, will return
744
+ errors.
502
745
 
503
- :param pos: Genomic position (zero-based)
746
+ :param genomic_pos: Genomic position where the transcript segment starts or ends
747
+ (inter-residue based)
504
748
  :param chromosome: Chromosome. Must give chromosome without a prefix
505
- (i.e. ``1`` or ``X``). If not provided, must provide ``alt_ac``.
506
- If ``alt_ac`` is also provided, ``alt_ac`` will be used.
507
- :param alt_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided,
508
- must provide ``chromosome. If ``chromosome`` is also provided, ``alt_ac``
749
+ (i.e. ``1`` or ``X``). If not provided, must provide ``genomic_ac``. If
750
+ position maps to both GRCh37 and GRCh38, GRCh38 assembly will be used.
751
+ If ``genomic_ac`` is also provided, ``genomic_ac`` will be used.
752
+ :param genomic_ac: Genomic accession (i.e. ``NC_000001.11``). If not provided,
753
+ must provide ``chromosome. If ``chromosome`` is also provided, ``genomic_ac``
509
754
  will be used.
510
- :param strand: Strand
511
755
  :param transcript: The transcript to use. If this is not given, we will try the
512
756
  following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining
513
757
  Compatible Transcript
514
758
  :param gene: HGNC gene symbol
515
759
  :param get_nearest_transcript_junction: If ``True``, this will return the
516
- adjacent exon if the position specified by``start`` or ``end`` does not
517
- occur on an exon. For the positive strand, adjacent is defined as the exon
518
- preceding the breakpoint for the 5' end and the exon following the
519
- breakpoint for the 3' end. For the negative strand, adjacent is defined as
520
- the exon following the breakpoint for the 5' end and the exon preceding the
521
- breakpoint for the 3' end.
522
- :param is_start: ``True`` if ``pos`` is start position. ``False`` if ``pos`` is
523
- end position.
524
- :return: Transcript data (inter-residue coordinates)
760
+ adjacent exon if the position specified by``seg_start_genomic`` or
761
+ ``seg_end_genomic`` does not occur on an exon. For the positive strand, adjacent
762
+ is defined as the exon preceding the breakpoint for the 5' end and the exon
763
+ following the breakpoint for the 3' end. For the negative strand, adjacent
764
+ is defined as the exon following the breakpoint for the 5' end and the exon
765
+ preceding the breakpoint for the 3' end.
766
+ :param is_seg_start: ``True`` if ``genomic_pos`` is where the transcript segment starts.
767
+ ``False`` if ``genomic_pos`` is where the transcript segment ends.
768
+ :return: Data for a transcript segment boundary (inter-residue coordinates)
525
769
  """
526
- resp = TranscriptExonDataResponse(
527
- transcript_exon_data=None, warnings=[], service_meta=service_meta()
528
- )
529
- params = {key: None for key in TranscriptExonData.model_fields}
770
+ params = {key: None for key in GenomicTxSeg.model_fields}
530
771
 
531
772
  if get_nearest_transcript_junction:
532
- if not gene or not strand:
533
- return self._return_warnings(
534
- resp,
535
- [
536
- "Gene or strand must be provided to select the adjacent transcript junction"
537
- ],
773
+ if not gene and not transcript:
774
+ return GenomicTxSeg(
775
+ errors=[
776
+ "`gene` or `transcript` must be provided to select the adjacent transcript junction"
777
+ ]
538
778
  )
539
- if not alt_ac:
540
- alt_acs, w = self.seqrepo_access.chromosome_to_acs(chromosome)
541
779
 
542
- if not alt_acs:
543
- return self._return_warnings(resp, [w])
544
- alt_ac = alt_acs[0]
780
+ if not genomic_ac:
781
+ genomic_acs, err_msg = self.seqrepo_access.chromosome_to_acs(chromosome)
782
+
783
+ if not genomic_acs:
784
+ return GenomicTxSeg(
785
+ errors=[err_msg],
786
+ )
787
+ genomic_ac = genomic_acs[0]
788
+
789
+ # Always liftover to GRCh38
790
+ genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos(
791
+ genomic_ac, genomic_pos
792
+ )
793
+ if err_msg:
794
+ return GenomicTxSeg(errors=[err_msg])
545
795
 
546
796
  if not transcript:
547
797
  # Select a transcript if not provided
@@ -555,7 +805,7 @@ class ExonGenomicCoordsMapper:
555
805
  # Attempt to find a coding transcript if a MANE transcript
556
806
  # cannot be found
557
807
  results = await self.uta_db.get_transcripts(
558
- gene=gene, alt_ac=alt_ac
808
+ gene=gene, alt_ac=genomic_ac
559
809
  )
560
810
 
561
811
  if not results.is_empty():
@@ -566,376 +816,345 @@ class ExonGenomicCoordsMapper:
566
816
  SELECT DISTINCT tx_ac
567
817
  FROM {self.uta_db.schema}.tx_exon_aln_v
568
818
  WHERE hgnc = '{gene}'
569
- AND alt_ac = '{alt_ac}'
819
+ AND alt_ac = '{genomic_ac}'
570
820
  """ # noqa: S608
571
821
  result = await self.uta_db.execute_query(query)
572
822
 
573
823
  if result:
574
824
  transcript = result[0]["tx_ac"]
575
825
  else:
576
- return self._return_warnings(
577
- resp,
578
- [f"Could not find a transcript for {gene} on {alt_ac}"],
826
+ return GenomicTxSeg(
827
+ errors=[
828
+ f"Could not find a transcript for {gene} on {genomic_ac}"
829
+ ]
579
830
  )
580
831
 
581
- tx_genomic_coords, w = await self.uta_db.get_tx_exons_genomic_coords(
582
- tx_ac=transcript, alt_ac=alt_ac
832
+ tx_exons = await self._get_all_exon_coords(
833
+ tx_ac=transcript, genomic_ac=genomic_ac
583
834
  )
584
- if not tx_genomic_coords:
585
- return self._return_warnings(resp, [w])
835
+ if not tx_exons:
836
+ return GenomicTxSeg(errors=[f"No exons found given {transcript}"])
837
+
838
+ strand = Strand(tx_exons[0].alt_strand)
839
+ params["strand"] = strand
586
840
 
587
841
  # Check if breakpoint occurs on an exon.
588
842
  # If not, determine the adjacent exon given the selected transcript
589
- if not self._is_exonic_breakpoint(pos, tx_genomic_coords):
590
- exon = self._get_adjacent_exon(
591
- tx_exons_genomic_coords=tx_genomic_coords,
843
+ if not self._is_exonic_breakpoint(genomic_pos, tx_exons):
844
+ exon_num = self._get_adjacent_exon(
845
+ tx_exons_genomic_coords=tx_exons,
592
846
  strand=strand,
593
- start=pos if is_start else None,
594
- end=pos if not is_start else None,
847
+ start=genomic_pos if is_seg_start else None,
848
+ end=genomic_pos if not is_seg_start else None,
595
849
  )
596
850
 
597
- params["exon"] = exon
598
- params["transcript"] = transcript
599
- params["gene"] = gene
600
- params["pos"] = pos
601
- params["chr"] = alt_ac
602
-
603
- self._set_exon_offset(
604
- params=params,
605
- start=tx_genomic_coords[exon - 1][3], # Start exon coordinate
606
- end=tx_genomic_coords[exon - 1][4], # End exon coordinate
607
- pos=pos,
608
- is_start=is_start,
851
+ offset = self._get_exon_offset(
852
+ start_i=tx_exons[exon_num].alt_start_i,
853
+ end_i=tx_exons[exon_num].alt_end_i,
609
854
  strand=strand,
610
- )
611
- params["strand"] = strand.value
612
- resp.transcript_exon_data = TranscriptExonData(**params)
613
- return resp
614
-
615
- if alt_ac:
616
- # Check if valid accession is given
617
- if not await self.uta_db.validate_genomic_ac(alt_ac):
618
- return self._return_warnings(
619
- resp, [f"Invalid genomic accession: {alt_ac}"]
855
+ use_start_i=strand == Strand.POSITIVE
856
+ if is_seg_start
857
+ else strand != Strand.POSITIVE,
858
+ is_in_exon=False,
859
+ start=genomic_pos if is_seg_start else None,
860
+ end=genomic_pos if not is_seg_start else None,
620
861
  )
621
862
 
622
- genes_alt_acs, warning = await self.uta_db.get_genes_and_alt_acs(
623
- pos, strand=strand, alt_ac=alt_ac, gene=gene
624
- )
625
- elif chromosome:
626
- # Check if just chromosome is given. If it is, we should
627
- # convert this to the correct accession version
628
- if chromosome == "X":
629
- chromosome = 23
630
- elif chromosome == "Y":
631
- chromosome = 24
632
- else:
633
- chromosome = int(chromosome)
634
-
635
- genes_alt_acs, warning = await self.uta_db.get_genes_and_alt_acs(
636
- pos, strand=strand, chromosome=chromosome, gene=gene
637
- )
638
- else:
639
- genes_alt_acs = None
640
-
641
- if not genes_alt_acs:
642
- return self._return_warnings(resp, [warning])
863
+ genomic_location, err_msg = self._get_vrs_seq_loc(
864
+ genomic_ac, genomic_pos, is_seg_start, strand
865
+ )
866
+ if err_msg:
867
+ return GenomicTxSeg(errors=[err_msg])
868
+
869
+ # gene is not required to liftover coordinates if tx_ac and genomic_ac are given, but we should set the associated gene
870
+ if not gene:
871
+ _gene, err_msg = await self._get_tx_ac_gene(transcript)
872
+ if err_msg:
873
+ return GenomicTxSeg(errors=[err_msg])
874
+ gene = _gene
875
+
876
+ return GenomicTxSeg(
877
+ gene=gene,
878
+ genomic_ac=genomic_ac,
879
+ tx_ac=transcript,
880
+ seg=TxSegment(
881
+ exon_ord=exon_num,
882
+ offset=offset,
883
+ genomic_location=genomic_location,
884
+ ),
885
+ )
643
886
 
644
- gene_alt_ac, warning = self._get_gene_and_alt_ac(genes_alt_acs, gene)
645
- if not gene_alt_ac:
646
- return self._return_warnings(resp, [warning])
647
- gene, alt_ac = gene_alt_ac
887
+ if genomic_ac:
888
+ _gene, err_msg = await self._get_genomic_ac_gene(genomic_pos, genomic_ac)
648
889
 
649
- if transcript is None:
650
- warnings = await self._set_mane_genomic_data(
651
- params, gene, alt_ac, pos, strand, is_start
652
- )
653
- if warnings:
654
- return self._return_warnings(resp, [warnings])
655
- else:
656
- params["transcript"] = transcript
657
- params["gene"] = gene
658
- params["pos"] = pos
659
- params["chr"] = alt_ac
660
- warning = await self._set_genomic_data(params, strand, is_start)
661
- if warning:
662
- return self._return_warnings(resp, [warning])
890
+ if err_msg:
891
+ return GenomicTxSeg(errors=[err_msg])
663
892
 
664
- resp.transcript_exon_data = TranscriptExonData(**params)
665
- return resp
893
+ if gene and _gene != gene:
894
+ return GenomicTxSeg(
895
+ errors=[f"Expected gene, {gene}, but found {_gene}"]
896
+ )
666
897
 
667
- @staticmethod
668
- def _get_gene_and_alt_ac(
669
- genes_alt_acs: dict, gene: str | None
670
- ) -> tuple[tuple[str, str] | None, str | None]:
671
- """Return gene genomic accession
672
-
673
- :param genes_alt_acs: Dictionary containing genes and genomic accessions
674
- :param gene: Gene symbol
675
- :return: (Gene, Genomic accession) if both exist
676
- """
677
- alt_acs = genes_alt_acs["alt_acs"]
678
- len_alt_acs = len(alt_acs)
679
- if len_alt_acs > 1:
680
- return None, f"Found more than one accessions: {alt_acs}"
681
- if len_alt_acs == 0:
682
- return None, "No genomic accessions found"
683
- alt_ac = next(iter(alt_acs))
684
-
685
- genes = genes_alt_acs["genes"]
686
- len_genes = len(genes)
687
- input_gene = gene
688
- output_gene = None
689
- if len_genes == 1:
690
- output_gene = next(iter(genes))
691
- elif len_genes > 1:
692
- return None, f"Found more than one gene: {genes}"
693
- elif len_genes == 0:
694
- return None, "No genes found"
695
-
696
- if input_gene is not None and output_gene != input_gene.upper():
697
- return (
698
- None,
699
- f"Input gene, {input_gene}, does not match "
700
- f"expected output gene, {output_gene}",
701
- )
898
+ gene = _gene
899
+ elif chromosome:
900
+ # Try GRCh38 first
901
+ for assembly in [Assembly.GRCH38.value, Assembly.GRCH37.value]:
902
+ _genomic_acs, err_msg = self.seqrepo_access.translate_identifier(
903
+ f"{assembly}:chr{chromosome}", "refseq"
904
+ )
905
+ if err_msg:
906
+ return GenomicTxSeg(errors=[err_msg])
907
+ _genomic_ac = _genomic_acs[0].split(":")[-1]
702
908
 
703
- gene = output_gene if output_gene else input_gene
704
- return (gene, alt_ac), None
909
+ _gene, err_msg = await self._get_genomic_ac_gene(
910
+ genomic_pos, _genomic_ac
911
+ )
912
+ if _gene:
913
+ if gene and _gene != gene:
914
+ return GenomicTxSeg(
915
+ errors=[f"Expected gene, {gene}, but found {_gene}"]
916
+ )
917
+ gene = _gene
918
+ genomic_ac = _genomic_ac
919
+ break
920
+
921
+ if not genomic_ac:
922
+ return GenomicTxSeg(
923
+ errors=[
924
+ f"Unable to get genomic RefSeq accession for chromosome {chromosome} on position {genomic_pos}"
925
+ ]
926
+ )
705
927
 
706
- async def _set_mane_genomic_data(
707
- self,
708
- params: dict,
709
- gene: str,
710
- alt_ac: str,
711
- pos: int,
712
- strand: Strand,
713
- is_start: bool,
714
- ) -> str | None:
715
- """Set genomic data in `params` found from MANE.
716
-
717
- :param params: Parameters for response
718
- :param gene: Gene symbol
719
- :param alt_ac: Genomic accession
720
- :param pos: Genomic position
721
- :param strand: Strand
722
- :param is_start: `True` if `pos` is start position. `False` if `pos` is end
723
- position.
724
- :return: Warnings if found
725
- """
726
- start, end = get_inter_residue_pos(pos, pos, residue_mode=ResidueMode.ZERO)
727
- mane_data: (
728
- CdnaRepresentation | None
729
- ) = await self.mane_transcript.get_mane_transcript(
730
- alt_ac,
731
- start,
732
- end,
733
- AnnotationLayer.GENOMIC,
734
- gene=gene,
735
- try_longest_compatible=True,
736
- residue_mode=ResidueMode.INTER_RESIDUE,
737
- )
738
- if not mane_data:
739
- msg = f"Unable to find mane data for {alt_ac} with position {pos}"
740
- if gene:
741
- msg += f" on gene {gene}"
742
- _logger.warning(msg)
743
- return msg
744
-
745
- params["gene"] = mane_data.gene
746
- params["transcript"] = (
747
- mane_data.refseq
748
- if mane_data.refseq
749
- else mane_data.ensembl
750
- if mane_data.ensembl
751
- else None
752
- )
753
- tx_exons = await self._structure_exons(params["transcript"], alt_ac=alt_ac)
754
- if not tx_exons:
755
- return f"Unable to get exons for {params['transcript']}"
756
- tx_pos = mane_data.pos[0] + mane_data.coding_start_site
757
- params["exon"] = self._get_exon_number(tx_exons, tx_pos)
758
-
759
- try:
760
- tx_exon = tx_exons[params["exon"] - 1]
761
- except IndexError:
762
- msg = (
763
- f"{params['transcript']} with position {tx_pos} "
764
- f"does not exist on exons: {tx_exons}"
765
- )
766
- _logger.warning(msg)
767
- return msg
768
-
769
- strand_to_use = strand if strand is not None else mane_data.strand
770
- params["strand"] = strand_to_use
771
- self._set_exon_offset(
772
- params,
773
- tx_exon[0],
774
- tx_exon[1],
775
- tx_pos,
776
- is_start=is_start,
777
- strand=strand_to_use,
778
- )
928
+ if not gene:
929
+ return GenomicTxSeg(
930
+ errors=[
931
+ f"Unable to get gene given {genomic_ac} on position {genomic_pos}"
932
+ ]
933
+ )
779
934
 
780
- # Need to check if we need to change pos for liftover
781
- genomic_data, warnings = await self.uta_db.get_alt_ac_start_or_end(
782
- params["transcript"], tx_pos, tx_pos, gene
783
- )
784
- if genomic_data is None:
785
- return warnings
786
-
787
- params["chr"] = genomic_data[1]
788
- genomic_coords = genomic_data[2], genomic_data[3]
789
- genomic_pos = genomic_coords[1] - 1 if is_start else genomic_coords[0] + 1
790
- params["pos"] = (
791
- genomic_pos - params["exon_offset"]
792
- if strand_to_use == -1
793
- else genomic_pos + params["exon_offset"]
935
+ return await self._get_tx_seg_genomic_metadata(
936
+ genomic_ac, genomic_pos, is_seg_start, gene, tx_ac=transcript
794
937
  )
795
- return None
796
-
797
- async def _set_genomic_data(
798
- self, params: dict, strand: Strand, is_start: bool
799
- ) -> str | None:
800
- """Set genomic data in ``params``
801
938
 
802
- :param params: Parameters for response
803
- :param strand: Strand
804
- :param is_start: ``True`` if ``pos`` is start position. ``False`` if ``pos`` is
805
- end position.
806
- :return: Warnings if found
939
+ async def _get_grch38_ac_pos(
940
+ self, genomic_ac: str, genomic_pos: int, grch38_ac: str | None = None
941
+ ) -> tuple[str | None, int | None, str | None]:
942
+ """Get GRCh38 genomic representation for accession and position
943
+
944
+ :param genomic_ac: RefSeq genomic accession (GRCh37 or GRCh38 assembly)
945
+ :param genomic_pos: Genomic position on ``genomic_ac``
946
+ :param grch38_ac: A valid GRCh38 genomic accession for ``genomic_ac``. If not
947
+ provided, will attempt to retrieve associated GRCh38 accession from UTA.
948
+ :return: Tuple containing GRCh38 accession, GRCh38 position, and error message
949
+ if unable to get GRCh38 representation
807
950
  """
808
- # We should always try to liftover
809
- grch38_ac = await self.uta_db.get_newest_assembly_ac(params["chr"])
810
951
  if not grch38_ac:
811
- return f"Invalid genomic accession: {params['chr']}"
952
+ grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
953
+ if not grch38_ac:
954
+ return None, None, f"Unrecognized genomic accession: {genomic_ac}."
955
+
956
+ grch38_ac = grch38_ac[0]
812
957
 
813
- grch38_ac = grch38_ac[0]
814
- if grch38_ac != params["chr"]: # params["chr"] is genomic accession
815
- # Liftover to 38
816
- descr = await self.uta_db.get_chr_assembly(params["chr"])
817
- if descr is None:
818
- return f"Unable to get chromosome and assembly for " f"{params['chr']}"
958
+ if grch38_ac != genomic_ac:
959
+ # Ensure genomic_ac is GRCh37
960
+ chromosome, _ = self.seqrepo_access.translate_identifier(
961
+ genomic_ac, Assembly.GRCH37.value
962
+ )
963
+ if not chromosome:
964
+ _logger.warning(
965
+ "SeqRepo could not find associated %s assembly for genomic accession %s.",
966
+ Assembly.GRCH37.value,
967
+ genomic_ac,
968
+ )
969
+ return (
970
+ None,
971
+ None,
972
+ f"`genomic_ac` must use {Assembly.GRCH37.value} or {Assembly.GRCH38.value} assembly.",
973
+ )
819
974
 
820
- chromosome_number, assembly = descr
975
+ chromosome = chromosome[-1].split(":")[-1]
821
976
  liftover_data = self.liftover.get_liftover(
822
- chromosome_number, params["pos"], Assembly.GRCH38
977
+ chromosome, genomic_pos, Assembly.GRCH38
823
978
  )
824
979
  if liftover_data is None:
825
980
  return (
826
- f"Position {params['pos']} does not exist on "
827
- f"chromosome {chromosome_number}"
981
+ None,
982
+ None,
983
+ f"Lifting over {genomic_pos} on {genomic_ac} from {Assembly.GRCH37.value} to {Assembly.GRCH38.value} was unsuccessful.",
828
984
  )
829
985
 
830
- params["pos"] = liftover_data[1]
831
- params["chr"] = grch38_ac
986
+ genomic_pos = liftover_data[1]
987
+ genomic_ac = grch38_ac
832
988
 
833
- tx_exons = await self._structure_exons(params["transcript"], alt_ac=grch38_ac)
834
- if not tx_exons:
835
- return f"Unable to get exons for {params['transcript']}"
989
+ return genomic_ac, genomic_pos, None
836
990
 
837
- data = await self.uta_db.get_tx_exon_aln_v_data(
838
- params["transcript"],
839
- params["pos"],
840
- params["pos"],
841
- alt_ac=params["chr"],
842
- use_tx_pos=False,
843
- )
844
- if len(data) != 1:
845
- return (
846
- f"Must find exactly one row for genomic data, "
847
- f"but found: {len(data)}"
848
- )
991
+ async def _get_genomic_ac_gene(
992
+ self,
993
+ pos: int,
994
+ genomic_ac: str,
995
+ ) -> tuple[str | None, str | None]:
996
+ """Get gene given a genomic accession and position.
849
997
 
850
- # Find exon number
851
- data = data[0]
852
- data_exons = data[2], data[3]
853
- i = 1
854
- found_tx_exon = False
855
- for exon in tx_exons:
856
- if data_exons == exon:
857
- found_tx_exon = True
858
- break
859
- i += 1
860
- if not found_tx_exon:
861
- # Either first or last
862
- i = 1 if data_exons == (0, tx_exons[0][1]) else i - 1
863
- params["exon"] = i
864
-
865
- strand_to_use = strand if strand is not None else Strand(data[7])
866
- params["strand"] = strand_to_use
867
- if not is_start:
868
- # convert back to inter-residue for end position
869
- params["pos"] += 1
870
- self._set_exon_offset(
871
- params,
872
- data[5] if is_start else data[5] + 1, # need to convert to inter-residue
873
- data[6] - 1 if is_start else data[6], # need to convert to inter-residue
874
- params["pos"],
875
- is_start=is_start,
876
- strand=strand_to_use,
877
- )
878
- return None
998
+ If multiple genes are found for a given ``pos`` and ``genomic_ac``, only one
999
+ gene will be returned.
879
1000
 
880
- @staticmethod
881
- def _set_exon_offset(
882
- params: dict, start: int, end: int, pos: int, is_start: bool, strand: Strand
883
- ) -> None:
884
- """Set value for ``exon_offset`` in ``params``.
885
-
886
- :param params: Parameters for response
887
- :param start: Start exon coord (can be transcript or aligned genomic)
888
- :param end: End exon coord (can be transcript or aligned genomic)
889
- :param pos: Position change (can be transcript or genomic)
890
- :param is_start: ``True`` if ``pos`` is start position. ``False`` if ``pos`` is
891
- end position
892
- :param strand: Strand
1001
+ :param pos: Genomic position on ``genomic_ac``
1002
+ :param genomic_ac: RefSeq genomic accession, e.g. ``"NC_000007.14"``
1003
+ :return: HGNC gene symbol associated to genomic accession and position and
1004
+ warning
893
1005
  """
894
- if is_start:
895
- if strand == Strand.NEGATIVE:
896
- params["exon_offset"] = end - pos
897
- else:
898
- params["exon_offset"] = pos - end
899
- else:
900
- if strand == Strand.NEGATIVE:
901
- params["exon_offset"] = start - pos
902
- else:
903
- params["exon_offset"] = pos - start
1006
+ query = f"""
1007
+ SELECT DISTINCT hgnc
1008
+ FROM {self.uta_db.schema}.tx_exon_aln_v
1009
+ WHERE alt_ac = '{genomic_ac}'
1010
+ AND alt_aln_method = 'splign'
1011
+ AND {pos} BETWEEN alt_start_i AND alt_end_i
1012
+ ORDER BY hgnc
1013
+ LIMIT 1;
1014
+ """ # noqa: S608
1015
+ results = await self.uta_db.execute_query(query)
1016
+ if not results:
1017
+ return None, f"No gene(s) found given {genomic_ac} on position {pos}"
1018
+
1019
+ return results[0]["hgnc"], None
1020
+
1021
+ async def _get_tx_ac_gene(
1022
+ self,
1023
+ tx_ac: str,
1024
+ ) -> tuple[str | None, str | None]:
1025
+ """Get gene given a transcript.
904
1026
 
905
- async def _structure_exons(
906
- self, transcript: str, alt_ac: str | None = None
907
- ) -> list[tuple[int, int]]:
908
- """Structure exons as list of tuples.
1027
+ If multiple genes are found for a given ``tx_ac``, only one
1028
+ gene will be returned.
909
1029
 
910
- :param transcript: Transcript accession
911
- :param alt_ac: Genomic accession
912
- :return: List of tuples containing transcript exon coordinates
1030
+ :param tx_ac: RefSeq transcript, e.g. ``"NM_004333.6"``
1031
+ :return: HGNC gene symbol associated to transcript and
1032
+ warning
913
1033
  """
914
- tx_exons, _ = await self.uta_db.get_tx_exons(transcript, alt_ac=alt_ac)
1034
+ query = f"""
1035
+ SELECT DISTINCT hgnc
1036
+ FROM {self.uta_db.schema}.tx_exon_aln_v
1037
+ WHERE tx_ac = '{tx_ac}'
1038
+ ORDER BY hgnc
1039
+ LIMIT 1;
1040
+ """ # noqa: S608
1041
+ results = await self.uta_db.execute_query(query)
1042
+ if not results:
1043
+ return None, f"No gene(s) found given {tx_ac}"
1044
+
1045
+ return results[0]["hgnc"], None
1046
+
1047
+ async def _get_tx_seg_genomic_metadata(
1048
+ self,
1049
+ genomic_ac: str,
1050
+ genomic_pos: int,
1051
+ is_seg_start: bool,
1052
+ gene: str,
1053
+ tx_ac: str | None,
1054
+ ) -> GenomicTxSeg:
1055
+ """Get transcript segment data and associated genomic metadata.
1056
+
1057
+ Will liftover to GRCh38 assembly. If liftover is unsuccessful, will return
1058
+ errors.
1059
+
1060
+ If ``tx_ac`` is not provided, will attempt to retrieve MANE transcript.
915
1061
 
1062
+ :param genomic_ac: Genomic RefSeq accession
1063
+ :param genomic_pos: Genomic position where the transcript segment occurs
1064
+ :param is_seg_start: Whether or not ``genomic_pos`` represents the start position.
1065
+ :param gene: HGNC gene symbol
1066
+ :param tx_ac: Transcript RefSeq accession. If not provided, will use MANE
1067
+ transcript
1068
+ :return: Transcript segment data and associated genomic metadata
1069
+ """
1070
+ if tx_ac:
1071
+ # We should always try to liftover
1072
+ grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
1073
+ if not grch38_ac:
1074
+ return GenomicTxSeg(errors=[f"Invalid genomic accession: {genomic_ac}"])
1075
+ grch38_ac = grch38_ac[0]
1076
+ else:
1077
+ mane_data = self.mane_transcript_mappings.get_gene_mane_data(gene)
1078
+ if not mane_data:
1079
+ err_msg = f"Unable to find mane data for {genomic_ac} with position {genomic_pos}"
1080
+ if gene:
1081
+ err_msg += f" on gene {gene}"
1082
+ _logger.warning(err_msg)
1083
+ return GenomicTxSeg(errors=[err_msg])
1084
+
1085
+ mane_data = mane_data[0]
1086
+ tx_ac = mane_data["RefSeq_nuc"]
1087
+ grch38_ac = mane_data["GRCh38_chr"]
1088
+
1089
+ # Always liftover to GRCh38
1090
+ genomic_ac, genomic_pos, err_msg = await self._get_grch38_ac_pos(
1091
+ genomic_ac, genomic_pos, grch38_ac=grch38_ac
1092
+ )
1093
+ if err_msg:
1094
+ return GenomicTxSeg(errors=[err_msg])
1095
+
1096
+ tx_exons = await self._get_all_exon_coords(tx_ac, genomic_ac=grch38_ac)
916
1097
  if not tx_exons:
917
- return []
1098
+ return GenomicTxSeg(errors=[f"No exons found given {tx_ac}"])
1099
+
1100
+ tx_exon_aln_data = await self.uta_db.get_tx_exon_aln_v_data(
1101
+ tx_ac,
1102
+ genomic_pos,
1103
+ genomic_pos,
1104
+ alt_ac=genomic_ac,
1105
+ use_tx_pos=False,
1106
+ )
1107
+ if len(tx_exon_aln_data) != 1:
1108
+ return GenomicTxSeg(
1109
+ errors=[
1110
+ f"Must find exactly one row for genomic data, but found: {len(tx_exon_aln_data)}"
1111
+ ]
1112
+ )
1113
+
1114
+ tx_exon_aln_data = tx_exon_aln_data[0]
1115
+
1116
+ offset = self._get_exon_offset(
1117
+ start_i=tx_exon_aln_data.alt_start_i,
1118
+ end_i=tx_exon_aln_data.alt_end_i,
1119
+ strand=Strand(tx_exon_aln_data.alt_strand),
1120
+ use_start_i=False, # This doesn't impact anything since we're on the exon
1121
+ is_in_exon=True,
1122
+ start=genomic_pos if is_seg_start else None,
1123
+ end=genomic_pos if not is_seg_start else None,
1124
+ )
918
1125
 
919
- return [(coords[0], coords[1]) for coords in tx_exons]
1126
+ genomic_location, err_msg = self._get_vrs_seq_loc(
1127
+ genomic_ac, genomic_pos, is_seg_start, tx_exon_aln_data.alt_strand
1128
+ )
1129
+ if err_msg:
1130
+ return GenomicTxSeg(errors=[err_msg])
1131
+
1132
+ return GenomicTxSeg(
1133
+ gene=tx_exon_aln_data.hgnc,
1134
+ genomic_ac=genomic_ac,
1135
+ tx_ac=tx_exon_aln_data.tx_ac,
1136
+ seg=TxSegment(
1137
+ exon_ord=tx_exon_aln_data.ord,
1138
+ offset=offset,
1139
+ genomic_location=genomic_location,
1140
+ ),
1141
+ )
920
1142
 
921
1143
  @staticmethod
922
- def _get_exon_number(tx_exons: list, tx_pos: int) -> int:
923
- """Find related exon number for a position
1144
+ def _is_exonic_breakpoint(pos: int, tx_genomic_coords: list[_ExonCoord]) -> bool:
1145
+ """Check if a breakpoint occurs on an exon
924
1146
 
925
- :param tx_exons: List of exon coordinates for a transcript
926
- :param tx_pos: Transcript position change
927
- :return: Exon number associated to transcript position change. Will be 1-based
1147
+ :param pos: Genomic breakpoint
1148
+ :param tx_genomic_coords: A list of transcript exon coordinate data
1149
+ :return: ``True`` if the breakpoint occurs on an exon
928
1150
  """
929
- i = 1
930
- for coords in tx_exons:
931
- if coords[0] <= tx_pos <= coords[1]:
932
- break
933
- i += 1
934
- return i
1151
+ return any(
1152
+ exon.alt_start_i <= pos <= exon.alt_end_i for exon in tx_genomic_coords
1153
+ )
935
1154
 
936
1155
  @staticmethod
937
1156
  def _get_adjacent_exon(
938
- tx_exons_genomic_coords: list[tuple[int, int, int, int, int]],
1157
+ tx_exons_genomic_coords: list[_ExonCoord],
939
1158
  strand: Strand,
940
1159
  start: int | None = None,
941
1160
  end: int | None = None,
@@ -946,20 +1165,18 @@ class ExonGenomicCoordsMapper:
946
1165
  adjacent is defined as the exon following the breakpoint for the 5' end and the
947
1166
  exon preceding the breakpoint for the 3' end.
948
1167
 
949
- :param: tx_exons_genomic_coords: List of tuples describing exons and genomic
950
- coordinates for a transcript. Each tuple contains the transcript number
951
- (0-indexed), the transcript coordinates for the exon, and the genomic
952
- coordinates for the exon. Pos 0 in the tuple corresponds to the exon
953
- number, pos 1 and pos 2 refer to the start and end transcript coordinates,
954
- respectively, and pos 3 and 4 refer to the start and end genomic
955
- coordinates, respectively.
1168
+ :param tx_exons_genomic_coords: Transcript exon coordinate data
956
1169
  :param strand: Strand
957
- :param: start: Genomic coordinate of breakpoint
958
- :param: end: Genomic coordinate of breakpoint
959
- :return: Exon number corresponding to adjacent exon. Will be 1-based
1170
+ :param start: Genomic coordinate of breakpoint
1171
+ :param end: Genomic coordinate of breakpoint
1172
+ :return: Exon number corresponding to adjacent exon. Will be 0-based
960
1173
  """
961
1174
  for i in range(len(tx_exons_genomic_coords) - 1):
962
1175
  exon = tx_exons_genomic_coords[i]
1176
+ if start == exon.alt_start_i:
1177
+ break
1178
+ if end == exon.alt_end_i:
1179
+ break
963
1180
  next_exon = tx_exons_genomic_coords[i + 1]
964
1181
  bp = start if start else end
965
1182
  if strand == Strand.POSITIVE:
@@ -968,19 +1185,46 @@ class ExonGenomicCoordsMapper:
968
1185
  else:
969
1186
  lte_exon = next_exon
970
1187
  gte_exon = exon
971
- if bp >= lte_exon[4] and bp <= gte_exon[3]:
1188
+ if bp >= lte_exon.alt_end_i and bp <= gte_exon.alt_start_i:
972
1189
  break
973
1190
  # Return current exon if end position is provided, next exon if start position
974
- # is provided. exon[0] needs to be incremented by 1 in both cases as exons are
975
- # 0-based in UTA
976
- return exon[0] + 1 if end else exon[0] + 2
1191
+ # is provided.
1192
+ return exon.ord if end else exon.ord + 1
977
1193
 
978
1194
  @staticmethod
979
- def _is_exonic_breakpoint(pos: int, tx_genomic_coords: list) -> bool:
980
- """Check if a breakpoint occurs on an exon
1195
+ def _get_exon_offset(
1196
+ start_i: int,
1197
+ end_i: int,
1198
+ strand: Strand,
1199
+ use_start_i: bool = True,
1200
+ is_in_exon: bool = True,
1201
+ start: int | None = None,
1202
+ end: int | None = None,
1203
+ ) -> int:
1204
+ """Compute offset from exon start or end index
981
1205
 
982
- :param pos: Genomic breakpoint
983
- :param tx_genomic_coords: A list of genomic coordinates for a transcript
984
- :return: True if the breakpoint occurs on an exon
1206
+ :param start_i: Exon start index (inter-residue)
1207
+ :param end_i: Exon end index (inter-residue)
1208
+ :param strand: Strand
1209
+ :param use_start_i: Whether or not ``start_i`` should be used to compute the
1210
+ offset, defaults to ``True``. This is only used when ``is_in_exon`` is
1211
+ ``False``.
1212
+ :param is_in_exon: Whether or not the position occurs in an exon, defaults to
1213
+ ``True``
1214
+ :param start: Provided start position, defaults to ``None``. Must provide
1215
+ ``start`` or ``end``, not both.
1216
+ :param end: Provided end position, defaults to ``None``. Must provide ``start``
1217
+ or ``end``, not both
1218
+ :return: Offset from exon start or end index
985
1219
  """
986
- return any(pos >= exon[3] and pos <= exon[4] for exon in tx_genomic_coords)
1220
+ if is_in_exon:
1221
+ if start is not None:
1222
+ offset = start - start_i if strand == Strand.POSITIVE else end_i - start
1223
+ else:
1224
+ offset = end - end_i if strand == Strand.POSITIVE else start_i - end
1225
+ else:
1226
+ if strand == Strand.POSITIVE:
1227
+ offset = start - start_i if use_start_i else end - end_i
1228
+ else:
1229
+ offset = start_i - end if use_start_i else end_i - start
1230
+ return offset