cool-seq-tool 0.4.1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,6 +21,7 @@ import polars as pl
21
21
  from pydantic import BaseModel
22
22
 
23
23
  from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess
24
+ from cool_seq_tool.mappers.liftover import LiftOver
24
25
  from cool_seq_tool.schemas import (
25
26
  AnnotationLayer,
26
27
  Assembly,
@@ -35,7 +36,7 @@ from cool_seq_tool.sources import (
35
36
  )
36
37
  from cool_seq_tool.utils import get_inter_residue_pos
37
38
 
38
- logger = logging.getLogger(__name__)
39
+ _logger = logging.getLogger(__name__)
39
40
 
40
41
 
41
42
  class EndAnnotationLayer(str, Enum):
@@ -92,6 +93,7 @@ class ManeTranscript:
92
93
  transcript_mappings: TranscriptMappings,
93
94
  mane_transcript_mappings: ManeTranscriptMappings,
94
95
  uta_db: UtaDatabase,
96
+ liftover: LiftOver,
95
97
  ) -> None:
96
98
  """Initialize the ManeTranscript class.
97
99
 
@@ -117,11 +119,13 @@ class ManeTranscript:
117
119
  :param mane_transcript_mappings: Access to MANE Transcript accession mapping
118
120
  data
119
121
  :param uta_db: UtaDatabase instance to give access to query UTA database
122
+ :param liftover: Instance to provide mapping between human genome assemblies
120
123
  """
121
124
  self.seqrepo_access = seqrepo_access
122
125
  self.transcript_mappings = transcript_mappings
123
126
  self.mane_transcript_mappings = mane_transcript_mappings
124
127
  self.uta_db = uta_db
128
+ self.liftover = liftover
125
129
 
126
130
  @staticmethod
127
131
  def _get_reading_frame(pos: int) -> int:
@@ -168,10 +172,10 @@ class ManeTranscript:
168
172
  elif ac.startswith("ENSP"):
169
173
  ac = self.transcript_mappings.ensp_to_enst[ac]
170
174
  else:
171
- logger.warning("Unable to find accession: %s", ac)
175
+ _logger.warning("Unable to find accession: %s", ac)
172
176
  return None
173
177
  except KeyError:
174
- logger.warning("%s not found in transcript_mappings", ac)
178
+ _logger.warning("%s not found in transcript_mappings", ac)
175
179
  return None
176
180
 
177
181
  pos = self._p_to_c_pos(start_pos, end_pos)
@@ -196,7 +200,7 @@ class ManeTranscript:
196
200
  0
197
201
  ]
198
202
  ):
199
- logger.warning("Ensembl transcript not found: %s", ac)
203
+ _logger.warning("Ensembl transcript not found: %s", ac)
200
204
  return None
201
205
 
202
206
  temp_ac = ac.split(".")[0]
@@ -206,7 +210,7 @@ class ManeTranscript:
206
210
  # c. coordinate does not contain cds start, so we need to add it
207
211
  cds_start_end = await self.uta_db.get_cds_start_end(temp_ac)
208
212
  if not cds_start_end:
209
- logger.warning("Accession %s not found in UTA", temp_ac)
213
+ _logger.warning("Accession %s not found in UTA", temp_ac)
210
214
  return None
211
215
  coding_start_site = cds_start_end[0]
212
216
  pos = pos[0] + coding_start_site, pos[1] + coding_start_site
@@ -215,6 +219,99 @@ class ManeTranscript:
215
219
  ac, pos, AnnotationLayer.CDNA, coding_start_site=coding_start_site
216
220
  )
217
221
 
222
+ async def _liftover_to_38(self, genomic_tx_data: dict) -> None:
223
+ """Liftover genomic_tx_data to hg38 assembly.
224
+
225
+ :param genomic_tx_data: Dictionary containing gene, nc_accession, alt_pos, and
226
+ strand. This will be mutated in-place if not GRCh38 assembly.
227
+ """
228
+ descr = await self.uta_db.get_chr_assembly(genomic_tx_data["alt_ac"])
229
+ if descr is None:
230
+ # already grch38
231
+ return
232
+ chromosome, _ = descr
233
+
234
+ query = f"""
235
+ SELECT DISTINCT alt_ac
236
+ FROM {self.uta_db.schema}.tx_exon_aln_v
237
+ WHERE tx_ac = '{genomic_tx_data['tx_ac']}';
238
+ """ # noqa: S608
239
+ nc_acs = await self.uta_db.execute_query(query)
240
+ nc_acs = [nc_ac[0] for nc_ac in nc_acs]
241
+ if nc_acs == [genomic_tx_data["alt_ac"]]:
242
+ _logger.warning(
243
+ "UTA does not have GRCh38 assembly for %s",
244
+ genomic_tx_data["alt_ac"].split(".")[0],
245
+ )
246
+ return
247
+
248
+ # Get most recent assembly version position
249
+ # Liftover range
250
+ self._set_liftover(
251
+ genomic_tx_data, "alt_pos_range", chromosome, Assembly.GRCH38
252
+ )
253
+
254
+ # Liftover changes range
255
+ self._set_liftover(
256
+ genomic_tx_data, "alt_pos_change_range", chromosome, Assembly.GRCH38
257
+ )
258
+
259
+ # Change alt_ac to most recent
260
+ if genomic_tx_data["alt_ac"].startswith("EN"):
261
+ order_by_cond = "ORDER BY alt_ac DESC;"
262
+ else:
263
+ order_by_cond = """
264
+ ORDER BY CAST(SUBSTR(alt_ac, position('.' in alt_ac) + 1,
265
+ LENGTH(alt_ac)) AS INT) DESC;
266
+ """
267
+ query = f"""
268
+ SELECT alt_ac
269
+ FROM {self.uta_db.schema}.genomic
270
+ WHERE alt_ac LIKE '{genomic_tx_data['alt_ac'].split('.')[0]}%'
271
+ {order_by_cond}
272
+ """ # noqa: S608
273
+ nc_acs = await self.uta_db.execute_query(query)
274
+ genomic_tx_data["alt_ac"] = nc_acs[0][0]
275
+
276
+ def _set_liftover(
277
+ self,
278
+ genomic_tx_data: dict,
279
+ key: str,
280
+ chromosome: str,
281
+ liftover_to_assembly: Assembly,
282
+ ) -> None:
283
+ """Update genomic_tx_data to have coordinates for given assembly.
284
+
285
+ :param genomic_tx_data: Dictionary containing gene, nc_accession, alt_pos, and
286
+ strand
287
+ :param key: Key to access coordinate positions
288
+ :param chromosome: Chromosome, must be prefixed with ``chr``
289
+ :param liftover_to_assembly: Assembly to liftover to
290
+ """
291
+ liftover_start_i = self.liftover.get_liftover(
292
+ chromosome, genomic_tx_data[key][0], liftover_to_assembly
293
+ )
294
+ if liftover_start_i is None:
295
+ _logger.warning(
296
+ "Unable to liftover position %s on %s",
297
+ genomic_tx_data[key][0],
298
+ chromosome,
299
+ )
300
+ return
301
+
302
+ liftover_end_i = self.liftover.get_liftover(
303
+ chromosome, genomic_tx_data[key][1], liftover_to_assembly
304
+ )
305
+ if liftover_end_i is None:
306
+ _logger.warning(
307
+ "Unable to liftover position %s on %s",
308
+ genomic_tx_data[key][1],
309
+ chromosome,
310
+ )
311
+ return
312
+
313
+ genomic_tx_data[key] = liftover_start_i[1], liftover_end_i[1]
314
+
218
315
  async def _get_and_validate_genomic_tx_data(
219
316
  self,
220
317
  tx_ac: str,
@@ -237,7 +334,7 @@ class ManeTranscript:
237
334
  tx_ac, pos, annotation_layer, alt_ac=alt_ac
238
335
  )
239
336
  if not genomic_tx_data:
240
- logger.warning(
337
+ _logger.warning(
241
338
  "Unable to find genomic_tx_data for %s at position %s on annotation layer %s",
242
339
  alt_ac,
243
340
  pos,
@@ -250,12 +347,12 @@ class ManeTranscript:
250
347
  # Only want to liftover if alt_ac not provided. If alt_ac is provided,
251
348
  # it means user wants to stick with the queried assembly
252
349
  og_alt_exon_id = genomic_tx_data["alt_exon_id"]
253
- await self.uta_db.liftover_to_38(genomic_tx_data)
350
+ await self._liftover_to_38(genomic_tx_data)
254
351
  liftover_alt_exon_id = genomic_tx_data["alt_exon_id"]
255
352
 
256
353
  # Validation check: Exon structure
257
354
  if og_alt_exon_id != liftover_alt_exon_id:
258
- logger.warning(
355
+ _logger.warning(
259
356
  "Original alt_exon_id %s does not match liftover alt_exon_id %s",
260
357
  og_alt_exon_id,
261
358
  liftover_alt_exon_id,
@@ -293,7 +390,7 @@ class ManeTranscript:
293
390
  gt_cds_end = c_pos_change[1] > cds_end and c_pos_change[1] > cds_end
294
391
 
295
392
  if lt_cds_start or gt_cds_end:
296
- logger.info(
393
+ _logger.info(
297
394
  "%s with position %s is not within CDS start/end",
298
395
  refseq_c_ac,
299
396
  c_pos_change,
@@ -381,7 +478,7 @@ class ManeTranscript:
381
478
  )
382
479
 
383
480
  if not result:
384
- logger.warning(
481
+ _logger.warning(
385
482
  "Unable to find transcript, %s, position change", refseq_c_ac
386
483
  )
387
484
  return None
@@ -438,7 +535,7 @@ class ManeTranscript:
438
535
  new_rf = self._get_reading_frame(transcript_data.pos[pos_index])
439
536
 
440
537
  if og_rf != new_rf:
441
- logger.warning(
538
+ _logger.warning(
442
539
  "%s original reading frame (%s) does not match new %s, %s reading frame (%s)",
443
540
  ac,
444
541
  og_rf,
@@ -449,7 +546,7 @@ class ManeTranscript:
449
546
  return False
450
547
  else:
451
548
  if pos_index == 0:
452
- logger.warning("%s must having start position", ac)
549
+ _logger.warning("%s must having start position", ac)
453
550
  return False
454
551
  return True
455
552
 
@@ -503,10 +600,10 @@ class ManeTranscript:
503
600
  residue_mode=residue_mode,
504
601
  )
505
602
  if not mane_ref:
506
- logger.info("Unable to validate reference for MANE Transcript")
603
+ _logger.info("Unable to validate reference for MANE Transcript")
507
604
 
508
605
  if expected_ref != mane_ref:
509
- logger.info(
606
+ _logger.info(
510
607
  "Expected ref, %s, but got %s on MANE accession, %s",
511
608
  expected_ref,
512
609
  mane_ref,
@@ -514,7 +611,7 @@ class ManeTranscript:
514
611
  )
515
612
 
516
613
  if expected_ref != ref:
517
- logger.warning(
614
+ _logger.warning(
518
615
  "Expected ref, %s, but got %s on accession, %s", expected_ref, ref, ac
519
616
  )
520
617
  return False
@@ -692,7 +789,7 @@ class ManeTranscript:
692
789
  )
693
790
 
694
791
  if df.is_empty():
695
- logger.warning("Unable to get transcripts from gene %s", gene)
792
+ _logger.warning("Unable to get transcripts from gene %s", gene)
696
793
  return lcr_result
697
794
 
698
795
  prioritized_tx_acs = self._get_prioritized_transcripts_from_gene(df)
@@ -814,7 +911,7 @@ class ManeTranscript:
814
911
  pos = lcr_result.pos
815
912
 
816
913
  if not self._validate_index(ac, pos, coding_start_site):
817
- logger.warning(
914
+ _logger.warning(
818
915
  "%s are not valid positions on %s with coding start site %s",
819
916
  pos,
820
917
  ac,
@@ -841,7 +938,7 @@ class ManeTranscript:
841
938
  pos = lcr_result_dict[k]["pos"]
842
939
  if not self._validate_index(ac, pos, cds):
843
940
  valid = False
844
- logger.warning(
941
+ _logger.warning(
845
942
  "%s are not valid positions on %s with coding start site %s",
846
943
  pos,
847
944
  ac,
@@ -999,7 +1096,7 @@ class ManeTranscript:
999
1096
  return await self.g_to_mane_c(
1000
1097
  ac, start_pos, end_pos, gene=gene, residue_mode=residue_mode
1001
1098
  )
1002
- logger.warning("Annotation layer not supported: %s", start_annotation_layer)
1099
+ _logger.warning("Annotation layer not supported: %s", start_annotation_layer)
1003
1100
  return None
1004
1101
 
1005
1102
  async def g_to_grch38(self, ac: str, start_pos: int, end_pos: int) -> dict | None:
@@ -1024,11 +1121,11 @@ class ManeTranscript:
1024
1121
  is_same_pos = start_pos == end_pos
1025
1122
 
1026
1123
  # Coordinate liftover
1027
- if assembly < "GRCh37":
1028
- logger.warning("Liftover only supported for GRCh37")
1124
+ if assembly < Assembly.GRCH37:
1125
+ _logger.warning("Liftover only supported for GRCh37")
1029
1126
  return None
1030
1127
 
1031
- liftover_start_i = self.uta_db.get_liftover(
1128
+ liftover_start_i = self.liftover.get_liftover(
1032
1129
  chromosome, start_pos, Assembly.GRCH38
1033
1130
  )
1034
1131
  if liftover_start_i is None:
@@ -1036,7 +1133,7 @@ class ManeTranscript:
1036
1133
  start_pos = liftover_start_i[1]
1037
1134
 
1038
1135
  if not is_same_pos:
1039
- liftover_end_i = self.uta_db.get_liftover(
1136
+ liftover_end_i = self.liftover.get_liftover(
1040
1137
  chromosome, end_pos, Assembly.GRCH38
1041
1138
  )
1042
1139
  if liftover_end_i is None:
@@ -1130,7 +1227,7 @@ class ManeTranscript:
1130
1227
  )
1131
1228
 
1132
1229
  if not await self.uta_db.validate_genomic_ac(ac):
1133
- logger.warning("Genomic accession does not exist: %s", ac)
1230
+ _logger.warning("Genomic accession does not exist: %s", ac)
1134
1231
  return None
1135
1232
 
1136
1233
  mane_data = self.mane_transcript_mappings.get_gene_mane_data(gene)
@@ -1156,7 +1253,7 @@ class ManeTranscript:
1156
1253
  )
1157
1254
  if not mane_tx_genomic_data:
1158
1255
  continue
1159
- logger.info("Not using most recent assembly")
1256
+ _logger.info("Not using most recent assembly")
1160
1257
 
1161
1258
  coding_start_site = mane_tx_genomic_data["coding_start_site"]
1162
1259
  coding_end_site = mane_tx_genomic_data["coding_end_site"]
@@ -1167,7 +1264,7 @@ class ManeTranscript:
1167
1264
  if not self._validate_index(
1168
1265
  mane_c_ac, mane_c_pos_change, coding_start_site
1169
1266
  ):
1170
- logger.warning(
1267
+ _logger.warning(
1171
1268
  "%s are not valid positions on %s with coding start site %s",
1172
1269
  mane_c_pos_change,
1173
1270
  mane_c_ac,
@@ -1257,7 +1354,7 @@ class ManeTranscript:
1257
1354
  if not self._validate_index(
1258
1355
  mane_c_ac, mane_c_pos_change, coding_start_site
1259
1356
  ):
1260
- logger.warning(
1357
+ _logger.warning(
1261
1358
  "%s are not valid positions on %s with coding start site %s",
1262
1359
  mane_c_pos_change,
1263
1360
  mane_c_ac,
@@ -9,8 +9,9 @@ from asyncpg import InvalidCatalogNameError, UndefinedTableError
9
9
  from biocommons.seqrepo import SeqRepo
10
10
 
11
11
  from cool_seq_tool.handlers.seqrepo_access import SEQREPO_ROOT_DIR, SeqRepoAccess
12
+ from cool_seq_tool.mappers.liftover import LiftOver
12
13
  from cool_seq_tool.resources.data_files import DataFile, get_data_file
13
- from cool_seq_tool.sources.uta_database import UTA_DB_URL, UtaDatabase, get_liftover
14
+ from cool_seq_tool.sources.uta_database import UTA_DB_URL, UtaDatabase
14
15
 
15
16
  _logger = logging.getLogger(__name__)
16
17
 
@@ -42,9 +43,7 @@ async def check_status(
42
43
  Arguments are intended to mirror arguments to :py:meth:`cool_seq_tool.app.CoolSeqTool.__init__`.
43
44
 
44
45
  Additional arguments are available for testing paths to specific chainfiles (same
45
- signature as :py:meth:`cool_seq_tool.sources.uta_database.UtaDatabase.__init__`).
46
- Note that chainfile failures also entail UTA initialization failure; this status is
47
- reported separately to enable more precise debugging.
46
+ signature as :py:meth:`cool_seq_tool.mappers.liftover.LiftOver.__init__`).
48
47
 
49
48
  >>> from cool_seq_tool.resources.status import check_status
50
49
  >>> await check_status()
@@ -104,7 +103,10 @@ async def check_status(
104
103
  status[name_lower] = True
105
104
 
106
105
  try:
107
- get_liftover(chain_file_37_to_38, chain_file_38_to_37)
106
+ LiftOver(
107
+ chain_file_37_to_38=chain_file_37_to_38,
108
+ chain_file_38_to_37=chain_file_38_to_37,
109
+ )
108
110
  except (FileNotFoundError, ChainfileError) as e:
109
111
  _logger.error("agct converter setup failed: %s", e)
110
112
  except Exception as e:
cool_seq_tool/schemas.py CHANGED
@@ -1,7 +1,6 @@
1
1
  """Defines attribute constants, useful object structures, and API response schemas."""
2
2
 
3
3
  import datetime
4
- import re
5
4
  from enum import Enum, IntEnum
6
5
  from typing import Literal
7
6
 
@@ -10,11 +9,10 @@ from pydantic import (
10
9
  ConfigDict,
11
10
  StrictInt,
12
11
  StrictStr,
13
- field_validator,
14
12
  model_validator,
15
13
  )
16
14
 
17
- from cool_seq_tool.version import __version__
15
+ from cool_seq_tool import __version__
18
16
 
19
17
  _now = str(datetime.datetime.now(tz=datetime.timezone.utc))
20
18
 
@@ -35,11 +33,16 @@ class Strand(IntEnum):
35
33
 
36
34
 
37
35
  class Assembly(str, Enum):
38
- """Create Enum for supported genomic assemblies"""
36
+ """Define supported genomic assemblies. Must be defined in ascending order"""
39
37
 
40
38
  GRCH37 = "GRCh37"
41
39
  GRCH38 = "GRCh38"
42
40
 
41
+ @classmethod
42
+ def values(cls) -> list[str]:
43
+ """Return list of values in enum (ascending assembly order)"""
44
+ return [item.value for item in cls]
45
+
43
46
 
44
47
  class TranscriptPriority(str, Enum):
45
48
  """Create Enum for Transcript Priority labels"""
@@ -276,17 +279,6 @@ class ServiceMeta(BaseModelForbidExtra):
276
279
  "https://github.com/GenomicMedLab/cool-seq-tool"
277
280
  )
278
281
 
279
- @field_validator("version")
280
- def validate_version(cls, v):
281
- """Check version matches semantic versioning regex pattern.
282
- https://semver.org/#is-there-a-suggested-regular-expression-regex-to-check-a-semver-string
283
- """
284
- version_regex = r"^(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)(?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$"
285
- if not re.match(version_regex, v):
286
- msg = f"Invalid version {v}"
287
- raise ValueError(msg)
288
- return v
289
-
290
282
  model_config = ConfigDict(
291
283
  json_schema_extra={
292
284
  "example": {
@@ -384,7 +376,7 @@ class MappedManeData(BaseModel):
384
376
  "strand": Strand.NEGATIVE,
385
377
  "status": TranscriptPriority.MANE_PLUS_CLINICAL,
386
378
  "alt_ac": "NC_000007.13",
387
- "assembly": "GRCh37",
379
+ "assembly": Assembly.GRCH37,
388
380
  }
389
381
  }
390
382
  )
@@ -407,7 +399,7 @@ class MappedManeDataService(BaseModelForbidExtra):
407
399
  "strand": Strand.NEGATIVE,
408
400
  "status": TranscriptPriority.MANE_PLUS_CLINICAL,
409
401
  "alt_ac": "NC_000007.13",
410
- "assembly": "GRCh37",
402
+ "assembly": Assembly.GRCH37,
411
403
  },
412
404
  "warnings": [],
413
405
  "service_meta": {
@@ -9,7 +9,7 @@ import polars as pl
9
9
 
10
10
  from cool_seq_tool.resources.data_files import DataFile, get_data_file
11
11
 
12
- logger = logging.getLogger(__name__)
12
+ _logger = logging.getLogger(__name__)
13
13
 
14
14
 
15
15
  class ManeTranscriptMappings:
@@ -63,7 +63,7 @@ class ManeTranscriptMappings:
63
63
  data = self.df.filter(pl.col("symbol") == gene_symbol.upper())
64
64
 
65
65
  if len(data) == 0:
66
- logger.warning(
66
+ _logger.warning(
67
67
  "Unable to get MANE Transcript data for gene: %s", gene_symbol
68
68
  )
69
69
  return []