cool-seq-tool 0.4.0.dev2__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. cool_seq_tool/__init__.py +1 -3
  2. cool_seq_tool/api.py +1 -2
  3. cool_seq_tool/app.py +42 -24
  4. cool_seq_tool/handlers/__init__.py +1 -0
  5. cool_seq_tool/handlers/seqrepo_access.py +13 -15
  6. cool_seq_tool/mappers/__init__.py +1 -0
  7. cool_seq_tool/mappers/alignment.py +5 -6
  8. cool_seq_tool/mappers/exon_genomic_coords.py +232 -68
  9. cool_seq_tool/mappers/mane_transcript.py +84 -86
  10. cool_seq_tool/resources/__init__.py +1 -0
  11. cool_seq_tool/resources/data_files.py +93 -0
  12. cool_seq_tool/resources/status.py +151 -0
  13. cool_seq_tool/routers/__init__.py +1 -0
  14. cool_seq_tool/routers/default.py +1 -0
  15. cool_seq_tool/routers/mane.py +4 -4
  16. cool_seq_tool/routers/mappings.py +2 -2
  17. cool_seq_tool/schemas.py +83 -37
  18. cool_seq_tool/sources/__init__.py +1 -0
  19. cool_seq_tool/sources/mane_transcript_mappings.py +14 -7
  20. cool_seq_tool/sources/transcript_mappings.py +41 -32
  21. cool_seq_tool/sources/uta_database.py +120 -69
  22. cool_seq_tool/utils.py +2 -2
  23. cool_seq_tool/version.py +2 -1
  24. {cool_seq_tool-0.4.0.dev2.dist-info → cool_seq_tool-0.4.1.dist-info}/LICENSE +1 -1
  25. {cool_seq_tool-0.4.0.dev2.dist-info → cool_seq_tool-0.4.1.dist-info}/METADATA +15 -8
  26. cool_seq_tool-0.4.1.dist-info/RECORD +29 -0
  27. {cool_seq_tool-0.4.0.dev2.dist-info → cool_seq_tool-0.4.1.dist-info}/WHEEL +1 -1
  28. cool_seq_tool/data/__init__.py +0 -2
  29. cool_seq_tool/data/data_downloads.py +0 -89
  30. cool_seq_tool/paths.py +0 -28
  31. cool_seq_tool-0.4.0.dev2.dist-info/RECORD +0 -29
  32. /cool_seq_tool/{data → resources}/transcript_mapping.tsv +0 -0
  33. {cool_seq_tool-0.4.0.dev2.dist-info → cool_seq_tool-0.4.1.dist-info}/top_level.txt +0 -0
@@ -11,10 +11,11 @@ Steps:
11
11
  In addition to a mapper utility class, this module also defines several vocabulary
12
12
  constraints and data models for coordinate representation.
13
13
  """
14
+
14
15
  import logging
15
16
  import math
16
17
  from enum import Enum
17
- from typing import Dict, List, Optional, Set, Tuple, Union
18
+ from typing import Literal
18
19
 
19
20
  import polars as pl
20
21
  from pydantic import BaseModel
@@ -50,10 +51,10 @@ class EndAnnotationLayer(str, Enum):
50
51
  class DataRepresentation(BaseModel):
51
52
  """Define object model for final output representation"""
52
53
 
53
- gene: Optional[str] = None
54
+ gene: str | None = None
54
55
  refseq: str
55
- ensembl: Optional[str] = None
56
- pos: Tuple[int, int]
56
+ ensembl: str | None = None
57
+ pos: tuple[int, int]
57
58
  strand: Strand
58
59
  status: TranscriptPriority
59
60
 
@@ -63,14 +64,14 @@ class CdnaRepresentation(DataRepresentation):
63
64
 
64
65
  coding_start_site: int
65
66
  coding_end_site: int
66
- alt_ac: Optional[str] = None
67
+ alt_ac: str | None = None
67
68
 
68
69
 
69
70
  class GenomicRepresentation(BaseModel):
70
71
  """Define object model for genomic representation"""
71
72
 
72
73
  refseq: str
73
- pos: Tuple[int, int]
74
+ pos: tuple[int, int]
74
75
  status: TranscriptPriority
75
76
  alt_ac: str
76
77
 
@@ -105,7 +106,7 @@ class ManeTranscript:
105
106
 
106
107
  >>> import asyncio
107
108
  >>> result = asyncio.run(mane_mapper.g_to_grch38("NC_000001.11", 100, 200))
108
- >>> result['ac']
109
+ >>> result["ac"]
109
110
  'NC_000001.11'
110
111
 
111
112
  See the :ref:`Usage section <async_note>` for more information.
@@ -135,7 +136,7 @@ class ManeTranscript:
135
136
  return pos_mod_3
136
137
 
137
138
  @staticmethod
138
- def _p_to_c_pos(start: int, end: int) -> Tuple[int, int]:
139
+ def _p_to_c_pos(start: int, end: int) -> tuple[int, int]:
139
140
  """Return cDNA position given a protein position.
140
141
 
141
142
  :param start: Start protein position. Inter-residue coordinates
@@ -148,7 +149,7 @@ class ManeTranscript:
148
149
 
149
150
  async def _p_to_c(
150
151
  self, ac: str, start_pos: int, end_pos: int
151
- ) -> Optional[Tuple[str, Tuple[int, int]]]:
152
+ ) -> tuple[str, tuple[int, int]] | None:
152
153
  """Convert protein (p.) annotation to cDNA (c.) annotation.
153
154
 
154
155
  :param ac: Protein accession
@@ -176,7 +177,7 @@ class ManeTranscript:
176
177
  pos = self._p_to_c_pos(start_pos, end_pos)
177
178
  return ac, pos
178
179
 
179
- async def _c_to_g(self, ac: str, pos: Tuple[int, int]) -> Optional[Dict]:
180
+ async def _c_to_g(self, ac: str, pos: tuple[int, int]) -> dict | None:
180
181
  """Get g. annotation from c. annotation.
181
182
 
182
183
  :param ac: cDNA accession
@@ -217,13 +218,12 @@ class ManeTranscript:
217
218
  async def _get_and_validate_genomic_tx_data(
218
219
  self,
219
220
  tx_ac: str,
220
- pos: Tuple[int, int],
221
- annotation_layer: Union[
222
- AnnotationLayer.CDNA, AnnotationLayer.GENOMIC
223
- ] = AnnotationLayer.CDNA,
224
- coding_start_site: Optional[int] = None,
225
- alt_ac: Optional[str] = None,
226
- ) -> Optional[Dict]:
221
+ pos: tuple[int, int],
222
+ annotation_layer: Literal[AnnotationLayer.CDNA]
223
+ | Literal[AnnotationLayer.GENOMIC] = AnnotationLayer.CDNA,
224
+ coding_start_site: int | None = None,
225
+ alt_ac: str | None = None,
226
+ ) -> dict | None:
227
227
  """Get and validate genomic_tx_data
228
228
 
229
229
  :param tx_ac: Accession on c. coordinate
@@ -266,14 +266,14 @@ class ManeTranscript:
266
266
 
267
267
  @staticmethod
268
268
  def _get_c_data(
269
- cds_start_end: Tuple[int, int],
270
- c_pos_change: Tuple[int, int],
269
+ cds_start_end: tuple[int, int],
270
+ c_pos_change: tuple[int, int],
271
271
  strand: Strand,
272
272
  status: TranscriptPriority,
273
273
  refseq_c_ac: str,
274
- gene: Optional[str] = None,
275
- ensembl_c_ac: Optional[str] = None,
276
- alt_ac: Optional[str] = None,
274
+ gene: str | None = None,
275
+ ensembl_c_ac: str | None = None,
276
+ alt_ac: str | None = None,
277
277
  ) -> CdnaRepresentation:
278
278
  """Return transcript data on c. coordinate.
279
279
 
@@ -311,7 +311,7 @@ class ManeTranscript:
311
311
  alt_ac=alt_ac,
312
312
  )
313
313
 
314
- def _c_to_p_pos(self, c_pos: Tuple[int, int]) -> Tuple[int, int]:
314
+ def _c_to_p_pos(self, c_pos: tuple[int, int]) -> tuple[int, int]:
315
315
  """Get protein position from cdna position
316
316
 
317
317
  :param c_pos: cdna position. inter-residue coordinates
@@ -325,7 +325,7 @@ class ManeTranscript:
325
325
  return start, end
326
326
 
327
327
  def _get_mane_p(
328
- self, mane_data: Dict, mane_c_pos_range: Tuple[int, int]
328
+ self, mane_data: dict, mane_c_pos_range: tuple[int, int]
329
329
  ) -> DataRepresentation:
330
330
  """Translate MANE Transcript c. annotation to p. annotation
331
331
 
@@ -349,13 +349,13 @@ class ManeTranscript:
349
349
 
350
350
  async def _g_to_c(
351
351
  self,
352
- g: Dict,
352
+ g: dict,
353
353
  refseq_c_ac: str,
354
354
  status: TranscriptPriority,
355
- ensembl_c_ac: Optional[str] = None,
356
- alt_ac: Optional[str] = None,
355
+ ensembl_c_ac: str | None = None,
356
+ alt_ac: str | None = None,
357
357
  found_result: bool = False,
358
- ) -> Optional[CdnaRepresentation]:
358
+ ) -> CdnaRepresentation | None:
359
359
  """Get transcript c. annotation data from g. annotation.
360
360
 
361
361
  :param g: Genomic data
@@ -459,9 +459,9 @@ class ManeTranscript:
459
459
  coding_start_site: int,
460
460
  start_pos: int,
461
461
  end_pos: int,
462
- mane_transcript: Union[
463
- DataRepresentation, CdnaRepresentation, GenomicRepresentation
464
- ],
462
+ mane_transcript: DataRepresentation
463
+ | CdnaRepresentation
464
+ | GenomicRepresentation,
465
465
  expected_ref: str,
466
466
  anno: AnnotationLayer,
467
467
  residue_mode: ResidueMode,
@@ -522,7 +522,7 @@ class ManeTranscript:
522
522
  return True
523
523
 
524
524
  def _validate_index(
525
- self, ac: str, pos: Tuple[int, int], coding_start_site: int
525
+ self, ac: str, pos: tuple[int, int], coding_start_site: int
526
526
  ) -> bool:
527
527
  """Validate that positions actually exist on accession
528
528
 
@@ -533,13 +533,13 @@ class ManeTranscript:
533
533
  """
534
534
  start_pos = pos[0] + coding_start_site
535
535
  end_pos = pos[1] + coding_start_site
536
- if self.seqrepo_access.get_reference_sequence(
537
- ac, start=start_pos, end=end_pos, residue_mode=ResidueMode.INTER_RESIDUE
538
- )[0]:
539
- return True
540
- return False
536
+ return bool(
537
+ self.seqrepo_access.get_reference_sequence(
538
+ ac, start=start_pos, end=end_pos, residue_mode=ResidueMode.INTER_RESIDUE
539
+ )[0]
540
+ )
541
541
 
542
- def _get_prioritized_transcripts_from_gene(self, df: pl.DataFrame) -> List:
542
+ def _get_prioritized_transcripts_from_gene(self, df: pl.DataFrame) -> list:
543
543
  """Sort and filter transcripts from gene to get priority list
544
544
 
545
545
  :param df: Data frame containing transcripts from gene
@@ -550,7 +550,7 @@ class ManeTranscript:
550
550
  most recent version of a transcript associated with an assembly will be kept
551
551
  """
552
552
  copy_df = df.clone()
553
- copy_df = copy_df.drop(columns="alt_ac").unique()
553
+ copy_df = copy_df.drop("alt_ac").unique()
554
554
  copy_df = copy_df.with_columns(
555
555
  [
556
556
  pl.col("tx_ac")
@@ -590,15 +590,13 @@ class ManeTranscript:
590
590
  start_pos: int,
591
591
  end_pos: int,
592
592
  start_annotation_layer: AnnotationLayer,
593
- gene: Optional[str] = None,
594
- ref: Optional[str] = None,
593
+ gene: str | None = None,
594
+ ref: str | None = None,
595
595
  residue_mode: ResidueMode = ResidueMode.RESIDUE,
596
- mane_transcripts: Optional[Set] = None,
597
- alt_ac: Optional[str] = None,
598
- end_annotation_layer: Optional[EndAnnotationLayer] = None,
599
- ) -> Optional[
600
- Union[DataRepresentation, CdnaRepresentation, ProteinAndCdnaRepresentation]
601
- ]:
596
+ mane_transcripts: set | None = None,
597
+ alt_ac: str | None = None,
598
+ end_annotation_layer: EndAnnotationLayer | None = None,
599
+ ) -> DataRepresentation | CdnaRepresentation | ProteinAndCdnaRepresentation | None:
602
600
  """Get longest compatible transcript from a gene. See the documentation for
603
601
  the :ref:`transcript compatibility policy <transcript_compatibility>` for more
604
602
  information.
@@ -613,14 +611,16 @@ class ManeTranscript:
613
611
  ... "NM_004333.6",
614
612
  ... "ENST00000644969.2",
615
613
  ... }
616
- >>> result = asyncio.run(mane_mapper.get_longest_compatible_transcript(
617
- ... 599,
618
- ... 599,
619
- ... gene="BRAF",
620
- ... start_annotation_layer=AnnotationLayer.PROTEIN,
621
- ... residue_mode=ResidueMode.INTER_RESIDUE,
622
- ... mane_transcripts=mane_transcripts,
623
- ... ))
614
+ >>> result = asyncio.run(
615
+ ... mane_mapper.get_longest_compatible_transcript(
616
+ ... 599,
617
+ ... 599,
618
+ ... gene="BRAF",
619
+ ... start_annotation_layer=AnnotationLayer.PROTEIN,
620
+ ... residue_mode=ResidueMode.INTER_RESIDUE,
621
+ ... mane_transcripts=mane_transcripts,
622
+ ... )
623
+ ... )
624
624
  >>> result.refseq
625
625
  'NP_001365396.1'
626
626
 
@@ -645,9 +645,9 @@ class ManeTranscript:
645
645
  """
646
646
 
647
647
  def _get_protein_rep(
648
- gene: Optional[str],
648
+ gene: str | None,
649
649
  pro_ac: str,
650
- lcr_c_data_pos: Tuple[int, int],
650
+ lcr_c_data_pos: tuple[int, int],
651
651
  strand: Strand,
652
652
  status: TranscriptPriority,
653
653
  ) -> DataRepresentation:
@@ -731,7 +731,7 @@ class ManeTranscript:
731
731
 
732
732
  # Get prioritized transcript data for gene
733
733
  # grch38 -> c
734
- lcr_c_data: Optional[CdnaRepresentation] = await self._g_to_c(
734
+ lcr_c_data: CdnaRepresentation | None = await self._g_to_c(
735
735
  g=g,
736
736
  refseq_c_ac=tx_ac,
737
737
  status=TranscriptPriority.LONGEST_COMPATIBLE_REMAINING,
@@ -859,25 +859,26 @@ class ManeTranscript:
859
859
  start_pos: int,
860
860
  end_pos: int,
861
861
  start_annotation_layer: AnnotationLayer,
862
- gene: Optional[str] = None,
863
- ref: Optional[str] = None,
862
+ gene: str | None = None,
863
+ ref: str | None = None,
864
864
  try_longest_compatible: bool = False,
865
- residue_mode: Union[
866
- ResidueMode.RESIDUE, ResidueMode.INTER_RESIDUE
867
- ] = ResidueMode.RESIDUE,
868
- ) -> Optional[Union[DataRepresentation, CdnaRepresentation]]:
865
+ residue_mode: Literal[ResidueMode.RESIDUE]
866
+ | Literal[ResidueMode.INTER_RESIDUE] = ResidueMode.RESIDUE,
867
+ ) -> DataRepresentation | CdnaRepresentation | None:
869
868
  """Return MANE transcript.
870
869
 
871
870
  >>> from cool_seq_tool.app import CoolSeqTool
872
871
  >>> from cool_seq_tool.schemas import AnnotationLayer, ResidueMode
873
872
  >>> import asyncio
874
873
  >>> mane_mapper = CoolSeqTool().mane_transcript
875
- >>> result = asyncio.run(mane_mapper.get_mane_transcript(
876
- ... "NP_004324.2",
877
- ... 599,
878
- ... AnnotationLayer.PROTEIN,
879
- ... residue_mode=ResidueMode.INTER_RESIDUE,
880
- ... ))
874
+ >>> result = asyncio.run(
875
+ ... mane_mapper.get_mane_transcript(
876
+ ... "NP_004324.2",
877
+ ... 599,
878
+ ... AnnotationLayer.PROTEIN,
879
+ ... residue_mode=ResidueMode.INTER_RESIDUE,
880
+ ... )
881
+ ... )
881
882
  >>> result.gene, result.refseq, result.status
882
883
  ('BRAF', 'NP_004324.2', <TranscriptPriority.MANE_SELECT: 'mane_select'>)
883
884
 
@@ -930,7 +931,7 @@ class ManeTranscript:
930
931
  current_mane_data["RefSeq_nuc"],
931
932
  current_mane_data["Ensembl_nuc"],
932
933
  }
933
- mane: Optional[CdnaRepresentation] = await self._g_to_c(
934
+ mane: CdnaRepresentation | None = await self._g_to_c(
934
935
  g=g,
935
936
  refseq_c_ac=current_mane_data["RefSeq_nuc"],
936
937
  status=TranscriptPriority(
@@ -1001,9 +1002,7 @@ class ManeTranscript:
1001
1002
  logger.warning("Annotation layer not supported: %s", start_annotation_layer)
1002
1003
  return None
1003
1004
 
1004
- async def g_to_grch38(
1005
- self, ac: str, start_pos: int, end_pos: int
1006
- ) -> Optional[Dict]:
1005
+ async def g_to_grch38(self, ac: str, start_pos: int, end_pos: int) -> dict | None:
1007
1006
  """Return genomic coordinate on GRCh38 when not given gene context.
1008
1007
 
1009
1008
  :param ac: Genomic accession
@@ -1055,8 +1054,8 @@ class ManeTranscript:
1055
1054
 
1056
1055
  @staticmethod
1057
1056
  def get_mane_c_pos_change(
1058
- mane_tx_genomic_data: Dict, coding_start_site: int
1059
- ) -> Tuple[int, int]:
1057
+ mane_tx_genomic_data: dict, coding_start_site: int
1058
+ ) -> tuple[int, int]:
1060
1059
  """Get mane c position change
1061
1060
 
1062
1061
  :param mane_tx_genomic_data: MANE transcript and genomic data
@@ -1080,9 +1079,9 @@ class ManeTranscript:
1080
1079
  ac: str,
1081
1080
  start_pos: int,
1082
1081
  end_pos: int,
1083
- gene: Optional[str] = None,
1082
+ gene: str | None = None,
1084
1083
  residue_mode: ResidueMode = ResidueMode.RESIDUE,
1085
- ) -> Optional[Union[GenomicRepresentation, CdnaRepresentation]]:
1084
+ ) -> GenomicRepresentation | CdnaRepresentation | None:
1086
1085
  """Return MANE Transcript on the c. coordinate.
1087
1086
 
1088
1087
  If an arg for ``gene`` is provided, lifts to GRCh38, then gets MANE cDNA
@@ -1091,12 +1090,11 @@ class ManeTranscript:
1091
1090
  >>> import asyncio
1092
1091
  >>> from cool_seq_tool.app import CoolSeqTool
1093
1092
  >>> cst = CoolSeqTool()
1094
- >>> result = asyncio.run(cst.mane_transcript.g_to_mane_c(
1095
- ... "NC_000007.13",
1096
- ... 55259515,
1097
- ... None,
1098
- ... gene="EGFR"
1099
- ... ))
1093
+ >>> result = asyncio.run(
1094
+ ... cst.mane_transcript.g_to_mane_c(
1095
+ ... "NC_000007.13", 55259515, None, gene="EGFR"
1096
+ ... )
1097
+ ... )
1100
1098
  >>> type(result)
1101
1099
  <class 'cool_seq_tool.mappers.mane_transcript.CdnaRepresentation'>
1102
1100
  >>> result.status
@@ -1198,10 +1196,10 @@ class ManeTranscript:
1198
1196
  alt_ac: str,
1199
1197
  start_pos: int,
1200
1198
  end_pos: int,
1201
- gene: Optional[str] = None,
1199
+ gene: str | None = None,
1202
1200
  residue_mode: ResidueMode = ResidueMode.RESIDUE,
1203
1201
  try_longest_compatible: bool = False,
1204
- ) -> Optional[Dict]:
1202
+ ) -> dict | None:
1205
1203
  """Given GRCh38 genomic representation, return protein representation.
1206
1204
 
1207
1205
  Will try MANE Select and then MANE Plus Clinical. If neither is found and
@@ -0,0 +1 @@
1
+ """Provide tools for acquiring and managing Cool-Seq-Tool data resources."""
@@ -0,0 +1,93 @@
1
+ """Fetch data files regarding transcript mapping and annotation."""
2
+
3
+ import logging
4
+ from enum import Enum
5
+ from importlib import resources
6
+ from os import environ
7
+ from pathlib import Path
8
+
9
+ from wags_tails import NcbiLrgRefSeqGeneData, NcbiManeSummaryData
10
+
11
+ _logger = logging.getLogger(__name__)
12
+
13
+
14
class DataFile(str, Enum):
    """Enumerate the file resources that can be requested from
    :py:meth:`get_data_file() <cool_seq_tool.resources.data_files.get_data_file>`.

    Members are also ``str`` instances, so each one compares equal to its value.
    """

    TRANSCRIPT_MAPPINGS = "transcript_mappings"
    MANE_SUMMARY = "mane_summary"
    LRG_REFSEQGENE = "lrg_refseqgene"

    def lower(self) -> str:
        """Provide the member's value in lower case.

        :return: the lower-cased value, as a plain string
        """
        return str.lower(self.value)
27
+
28
+
29
# Lookup table consumed by ``get_data_file``. Each resource maps to a pair:
#   [0] name of the environment variable that may hold a user-configured path
#   [1] fallback fetcher, called with the ``from_local`` flag, used when that
#       environment variable is unset
_resource_acquisition_params = {
    DataFile.TRANSCRIPT_MAPPINGS: (
        "TRANSCRIPT_MAPPINGS_PATH",
        # The TSV ships inside this package, so the ``from_local`` flag is ignored.
        lambda _unused: resources.files(__package__).joinpath(
            "transcript_mapping.tsv"
        ),
    ),
    DataFile.MANE_SUMMARY: (
        "MANE_SUMMARY_PATH",
        # Only the first element of the ``get_latest`` result is kept
        # (presumably the file path -- confirm against the wags-tails docs).
        lambda local_only: NcbiManeSummaryData(silent=True).get_latest(
            from_local=local_only
        )[0],
    ),
    DataFile.LRG_REFSEQGENE: (
        "LRG_REFSEQGENE_PATH",
        # Same shape as the MANE summary fetcher above.
        lambda local_only: NcbiLrgRefSeqGeneData(silent=True).get_latest(
            from_local=local_only
        )[0],
    ),
}
47
+
48
+
49
def get_data_file(resource: DataFile, from_local: bool = False) -> Path:
    """Acquire Cool-Seq-Tool file dependency.

    Each resource can be defined using an environment variable:

    * ``DataFile.TRANSCRIPT_MAPPINGS`` -> ``TRANSCRIPT_MAPPINGS_PATH``
    * ``DataFile.MANE_SUMMARY`` -> ``MANE_SUMMARY_PATH``
    * ``DataFile.LRG_REFSEQGENE`` -> ``LRG_REFSEQGENE_PATH``

    Otherwise, this function falls back on default expected locations:

    * ``transcript_mapping.tsv`` is bundled with this library.
    * LRG RefseqGene and MANE summary files are acquired from NCBI using the
      `wags-tails <https://wags-tails.readthedocs.io/stable/>`_ package if
      unavailable locally, or out of date.

    :param resource: resource to fetch
    :param from_local: if ``True``, don't check for or acquire latest version -- just
        provide most recent locally available file and raise FileNotFoundError otherwise
    :return: path to file. Consuming functions can assume that it exists and is a file.
    :raise FileNotFoundError: if file location configured by env var doesn't exist
    :raise ValueError: if file location configured by env var isn't a file
    """
    env_var, fetch_default = _resource_acquisition_params[resource]
    configured_path = environ.get(env_var)
    if configured_path:
        _logger.debug(
            "Acquiring %s via env var %s:%s", resource, env_var, configured_path
        )
        path = Path(configured_path)
        path_exists = path.exists()
        if not (path_exists and path.is_file()):
            # Only build the (long) explanation when the configured path is unusable.
            loc_descr = (
                "the default file bundled with Cool-Seq-Tool"
                if resource == DataFile.TRANSCRIPT_MAPPINGS
                else "the default file pattern and possibly acquire from source via the `wags-tails` package"
            )
            resource_label = env_var.replace("_", " ").title()
            msg = (
                f"No {resource_label} file exists at path {configured_path} "
                f"defined under env var {env_var}. Either unset to use {loc_descr}, "
                "or ensure that it is available at this location. See the "
                '"Environment configuration" section under the Usage page within '
                "the documentation for more: "
                "https://coolseqtool.readthedocs.io/stable/usage.html#environment-configuration"
            )
            # A missing path and a non-file path are reported with different
            # exception types so callers can tell them apart.
            if not path_exists:
                raise FileNotFoundError(msg)
            raise ValueError(msg)
    else:
        _logger.debug("Acquiring %s from default location/method.", resource)
        # params[1] is the resource fetcher function -- use `from_local` param to
        # optionally avoid unnecessary fetches
        path = fetch_default(from_local)
    _logger.debug("Acquired %s at %s", resource, path)
    return path
@@ -0,0 +1,151 @@
1
+ """Enable quick status check of Cool-Seq-Tool resources."""
2
+
3
+ import logging
4
+ from collections import namedtuple
5
+ from pathlib import Path
6
+
7
+ from agct._core import ChainfileError
8
+ from asyncpg import InvalidCatalogNameError, UndefinedTableError
9
+ from biocommons.seqrepo import SeqRepo
10
+
11
+ from cool_seq_tool.handlers.seqrepo_access import SEQREPO_ROOT_DIR, SeqRepoAccess
12
+ from cool_seq_tool.resources.data_files import DataFile, get_data_file
13
+ from cool_seq_tool.sources.uta_database import UTA_DB_URL, UtaDatabase, get_liftover
14
+
15
+ _logger = logging.getLogger(__name__)
16
+
17
+
18
# Result record for ``check_status``: one field per checked resource. The three
# data-file fields are named after the lower-cased ``DataFile`` values so that
# they line up with the keys ``check_status`` builds from the same enum.
ResourceStatus = namedtuple(
    "ResourceStatus",
    field_names=[
        "uta",
        "seqrepo",
        DataFile.TRANSCRIPT_MAPPINGS.lower(),
        DataFile.MANE_SUMMARY.lower(),
        DataFile.LRG_REFSEQGENE.lower(),
        "liftover",
    ],
)
29
+
30
+
31
async def check_status(
    transcript_file_path: Path | None = None,
    lrg_refseqgene_path: Path | None = None,
    mane_data_path: Path | None = None,
    db_url: str = UTA_DB_URL,
    sr: SeqRepo | None = None,
    chain_file_37_to_38: str | None = None,
    chain_file_38_to_37: str | None = None,
) -> ResourceStatus:
    """Perform basic status checks on availability of required data resources.

    Arguments are intended to mirror arguments to :py:meth:`cool_seq_tool.app.CoolSeqTool.__init__`.

    Additional arguments are available for testing paths to specific chainfiles (same
    signature as :py:meth:`cool_seq_tool.sources.uta_database.UtaDatabase.__init__`).
    Note that chainfile failures also entail UTA initialization failure; this status is
    reported separately to enable more precise debugging.

    Failures are logged rather than raised: every check runs, and the outcome of each
    is reported as a boolean in the returned tuple.

    >>> import asyncio
    >>> from cool_seq_tool.resources.status import check_status
    >>> asyncio.run(check_status())
    ResourceStatus(uta=True, seqrepo=True, transcript_mappings=True, mane_summary=True, lrg_refseqgene=True, liftover=True)

    :param transcript_file_path: The path to ``transcript_mapping.tsv``
    :param lrg_refseqgene_path: The path to the LRG_RefSeqGene file
    :param mane_data_path: Path to RefSeq MANE summary data
    :param db_url: PostgreSQL connection URL
        Format: ``driver://user:password@host/database/schema``
    :param sr: SeqRepo instance to test. If not provided, one is created from
        ``SEQREPO_ROOT_DIR``
    :param chain_file_37_to_38: Optional path to chain file for 37 to 38 assembly. This
        is used for ``agct``. If this is not provided, will check to see if
        ``LIFTOVER_CHAIN_37_TO_38`` env var is set. If neither is provided, will allow
        ``agct`` to download a chain file from UCSC
    :param chain_file_38_to_37: Optional path to chain file for 38 to 37 assembly. This
        is used for ``agct``. If this is not provided, will check to see if
        ``LIFTOVER_CHAIN_38_TO_37`` env var is set. If neither is provided, will allow
        ``agct`` to download a chain file from UCSC
    :return: boolean description of availability of each resource, given current
        environment configurations
    """
    file_path_params = {
        DataFile.TRANSCRIPT_MAPPINGS.lower(): transcript_file_path,
        DataFile.LRG_REFSEQGENE.lower(): lrg_refseqgene_path,
        DataFile.MANE_SUMMARY.lower(): mane_data_path,
    }

    # Every resource starts as unavailable and is flipped only by a passing check.
    status = {
        DataFile.TRANSCRIPT_MAPPINGS.lower(): False,
        DataFile.LRG_REFSEQGENE.lower(): False,
        DataFile.MANE_SUMMARY.lower(): False,
        "liftover": False,
        "uta": False,
        "seqrepo": False,
    }

    # --- data files ---
    for r in DataFile:
        name_lower = r.lower()
        declared_path = file_path_params[name_lower]
        # ``is_file()`` is already False for a path that does not exist.
        if declared_path and declared_path.is_file():
            status[name_lower] = True
            continue
        # No usable explicit path: fall back on env var / default acquisition.
        # ``declared_path`` is not what ``get_data_file`` inspects, so failures are
        # reported using the exception text, which names the location that failed.
        try:
            get_data_file(r)
        except FileNotFoundError as e:
            _logger.error(
                "%s does not exist at configured location: %s", name_lower, e
            )
        except ValueError as e:
            _logger.error(
                "%s is not configured with a valid file: %s", name_lower, e
            )
        except Exception as e:
            # Deliberately broad: a status check should report, not crash.
            _logger.critical(
                "Encountered unexpected error fetching %s: %s", name_lower, e
            )
        else:
            status[name_lower] = True

    # --- liftover chainfiles ---
    try:
        get_liftover(chain_file_37_to_38, chain_file_38_to_37)
    except (FileNotFoundError, ChainfileError) as e:
        _logger.error("agct converter setup failed: %s", e)
    except Exception as e:
        _logger.critical("Encountered unexpected error setting up agct: %s", e)
    else:
        status["liftover"] = True

    # --- UTA database ---
    try:
        await UtaDatabase.create(db_url)
    except (OSError, InvalidCatalogNameError, UndefinedTableError) as e:
        # Report the URL that was actually used, not the module default.
        _logger.error("Encountered error instantiating UTA at URI %s: %s", db_url, e)
    except Exception as e:
        _logger.critical(
            "Encountered unexpected error instantiating UTA from URI %s: %s",
            db_url,
            e,
        )
    else:
        status["uta"] = True

    # --- SeqRepo ---
    try:
        if not sr:
            sr = SeqRepo(root_dir=SEQREPO_ROOT_DIR)
        sra = SeqRepoAccess(sr)
        # Result is discarded: this fetch only probes that sequence data is present.
        sra.sr["NC_000001.11"][1000:1001]
    except OSError as e:
        _logger.error("Encountered error while instantiating SeqRepo: %s", e)
    except KeyError:
        _logger.error("SeqRepo data fetch test failed -- is it populated?")
    except Exception as e:
        _logger.critical("Encountered unexpected error setting up SeqRepo: %s", e)
    else:
        status["seqrepo"] = True

    structured_status = ResourceStatus(**status)
    if all(status.values()):
        _logger.info("Cool-Seq-Tool resource status passed")
    else:
        _logger.error(
            "Cool-Seq-Tool resource check failed. Result: %s", structured_status
        )
    return structured_status
@@ -1,4 +1,5 @@
1
1
  """Module for routers"""
2
+
2
3
  from enum import Enum
3
4
 
4
5
  from cool_seq_tool.app import CoolSeqTool
@@ -1,4 +1,5 @@
1
1
  """Module containing default routes"""
2
+
2
3
  import logging
3
4
  import os
4
5
  import tempfile
@@ -1,6 +1,6 @@
1
1
  """Module containing routes related to MANE data"""
2
+
2
3
  import logging
3
- from typing import Optional
4
4
 
5
5
  from fastapi import APIRouter, Query
6
6
 
@@ -45,11 +45,11 @@ async def get_mane_data(
45
45
  start_annotation_layer: AnnotationLayer = Query(
46
46
  ..., description="Starting annotation layer for query"
47
47
  ),
48
- end_pos: Optional[int] = Query(
48
+ end_pos: int | None = Query(
49
49
  None, description="End position. If not set, will set to `start_pos`."
50
50
  ),
51
- gene: Optional[str] = Query(None, description="HGNC gene symbol"),
52
- ref: Optional[str] = Query(None, description=ref_descr),
51
+ gene: str | None = Query(None, description="HGNC gene symbol"),
52
+ ref: str | None = Query(None, description=ref_descr),
53
53
  try_longest_compatible: bool = Query(
54
54
  True, description=try_longest_compatible_descr
55
55
  ),