cool-seq-tool 0.3.0.dev0__py3-none-any.whl → 0.4.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cool_seq_tool/api.py +3 -3
- cool_seq_tool/app.py +32 -11
- cool_seq_tool/data/data_downloads.py +8 -5
- cool_seq_tool/handlers/seqrepo_access.py +55 -27
- cool_seq_tool/mappers/__init__.py +4 -1
- cool_seq_tool/mappers/alignment.py +40 -37
- cool_seq_tool/mappers/exon_genomic_coords.py +329 -138
- cool_seq_tool/mappers/mane_transcript.py +402 -227
- cool_seq_tool/routers/mane.py +1 -1
- cool_seq_tool/routers/mappings.py +1 -1
- cool_seq_tool/schemas.py +31 -24
- cool_seq_tool/sources/__init__.py +4 -2
- cool_seq_tool/sources/mane_transcript_mappings.py +28 -7
- cool_seq_tool/sources/transcript_mappings.py +27 -11
- cool_seq_tool/sources/uta_database.py +179 -232
- cool_seq_tool/utils.py +22 -24
- cool_seq_tool/version.py +1 -1
- {cool_seq_tool-0.3.0.dev0.dist-info → cool_seq_tool-0.4.0.dev0.dist-info}/LICENSE +1 -1
- cool_seq_tool-0.4.0.dev0.dist-info/METADATA +130 -0
- cool_seq_tool-0.4.0.dev0.dist-info/RECORD +28 -0
- {cool_seq_tool-0.3.0.dev0.dist-info → cool_seq_tool-0.4.0.dev0.dist-info}/WHEEL +1 -1
- cool_seq_tool/data/transcript_mapping.tsv +0 -256226
- cool_seq_tool-0.3.0.dev0.dist-info/METADATA +0 -187
- cool_seq_tool-0.3.0.dev0.dist-info/RECORD +0 -29
- {cool_seq_tool-0.3.0.dev0.dist-info → cool_seq_tool-0.4.0.dev0.dist-info}/top_level.txt +0 -0
cool_seq_tool/routers/mane.py
CHANGED
@@ -79,8 +79,8 @@ async def get_mane_data(
|
|
79
79
|
mane_data = await cool_seq_tool.mane_transcript.get_mane_transcript(
|
80
80
|
ac=ac,
|
81
81
|
start_pos=start_pos,
|
82
|
-
start_annotation_layer=start_annotation_layer,
|
83
82
|
end_pos=end_pos,
|
83
|
+
start_annotation_layer=start_annotation_layer,
|
84
84
|
gene=gene,
|
85
85
|
ref=ref,
|
86
86
|
try_longest_compatible=try_longest_compatible,
|
cool_seq_tool/routers/mappings.py
CHANGED
@@ -57,7 +57,7 @@ async def p_to_c(
|
|
57
57
|
"/c_to_g",
|
58
58
|
summary="Translate cDNA representation to genomic representation",
|
59
59
|
response_description=RESP_DESCR,
|
60
|
-
description="Given cDNA accession and positions for codon(s), return associated genomic"
|
60
|
+
description="Given cDNA accession and positions for codon(s), return associated genomic"
|
61
61
|
" accession and positions for a given target genome assembly",
|
62
62
|
response_model=ToGenomicService,
|
63
63
|
tags=[Tags.ALIGNMENT_MAPPER],
|
cool_seq_tool/schemas.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
"""
|
1
|
+
"""Defines attribute constants, useful object structures, and API response schemas."""
|
2
2
|
import re
|
3
3
|
from datetime import datetime
|
4
|
-
from enum import Enum
|
4
|
+
from enum import Enum, IntEnum
|
5
5
|
from typing import List, Literal, Optional, Tuple, Union
|
6
6
|
|
7
7
|
from pydantic import (
|
@@ -24,11 +24,11 @@ class AnnotationLayer(str, Enum):
|
|
24
24
|
GENOMIC: Literal["g"] = "g"
|
25
25
|
|
26
26
|
|
27
|
-
class Strand(
|
27
|
+
class Strand(IntEnum):
|
28
28
|
"""Create enum for positive and negative strand"""
|
29
29
|
|
30
|
-
POSITIVE =
|
31
|
-
NEGATIVE =
|
30
|
+
POSITIVE = 1
|
31
|
+
NEGATIVE = -1
|
32
32
|
|
33
33
|
|
34
34
|
class Assembly(str, Enum):
|
@@ -48,8 +48,15 @@ class TranscriptPriority(str, Enum):
|
|
48
48
|
|
49
49
|
|
50
50
|
class ResidueMode(str, Enum):
|
51
|
-
"""Create Enum for residue modes.
|
51
|
+
"""Create Enum for residue modes.
|
52
52
|
|
53
|
+
| | C | | T | | G | |
|
54
|
+
ZERO | | 0 | | 1 | | 2 | |
|
55
|
+
RESIDUE | | 1 | | 2 | | 3 | |
|
56
|
+
INTER_RESIDUE | 0 | | 1 | | 2 | | 3 |
|
57
|
+
"""
|
58
|
+
|
59
|
+
ZERO = "zero"
|
53
60
|
RESIDUE = "residue"
|
54
61
|
INTER_RESIDUE = "inter-residue"
|
55
62
|
|
@@ -64,14 +71,14 @@ class GenomicRequestBody(BaseModelForbidExtra):
|
|
64
71
|
chromosome: Union[StrictStr, StrictInt]
|
65
72
|
start: Optional[StrictInt] = None
|
66
73
|
end: Optional[StrictInt] = None
|
67
|
-
strand: Optional[
|
74
|
+
strand: Optional[Strand] = None
|
68
75
|
transcript: Optional[StrictStr] = None
|
69
76
|
gene: Optional[StrictStr] = None
|
70
77
|
residue_mode: ResidueMode = ResidueMode.RESIDUE
|
71
78
|
|
72
79
|
@model_validator(mode="after")
|
73
80
|
def check_start_and_end(cls, values):
|
74
|
-
"""Check that at least one of {
|
81
|
+
"""Check that at least one of {``start``, ``end``} is set"""
|
75
82
|
msg = "Must provide either `start` or `end`"
|
76
83
|
start, end = values.start, values.end
|
77
84
|
assert start or end, msg
|
@@ -83,7 +90,7 @@ class GenomicRequestBody(BaseModelForbidExtra):
|
|
83
90
|
"chromosome": "NC_000001.11",
|
84
91
|
"start": 154192135,
|
85
92
|
"end": None,
|
86
|
-
"strand":
|
93
|
+
"strand": Strand.NEGATIVE,
|
87
94
|
"transcript": "NM_152263.3",
|
88
95
|
"gene": "TPM3",
|
89
96
|
"residue_mode": "residue",
|
@@ -95,8 +102,8 @@ class GenomicRequestBody(BaseModelForbidExtra):
|
|
95
102
|
class TranscriptRequestBody(BaseModelForbidExtra):
|
96
103
|
"""Define constraints for transcript exon to genomic coordinates request body"""
|
97
104
|
|
105
|
+
transcript: StrictStr
|
98
106
|
gene: Optional[StrictStr] = None
|
99
|
-
transcript: Optional[StrictStr] = None
|
100
107
|
exon_start: Optional[StrictInt] = None
|
101
108
|
exon_start_offset: Optional[StrictInt] = 0
|
102
109
|
exon_end: Optional[StrictInt] = None
|
@@ -104,7 +111,7 @@ class TranscriptRequestBody(BaseModelForbidExtra):
|
|
104
111
|
|
105
112
|
@model_validator(mode="after")
|
106
113
|
def check_exon_start_and_exon_end(cls, values):
|
107
|
-
"""Check that at least one of {
|
114
|
+
"""Check that at least one of {``exon_start``, ``exon_end``} is set"""
|
108
115
|
msg = "Must provide either `exon_start` or `exon_end`"
|
109
116
|
exon_start, exon_end = values.exon_start, values.exon_end
|
110
117
|
assert exon_start or exon_end, msg
|
@@ -133,7 +140,7 @@ class TranscriptExonData(BaseModelForbidExtra):
|
|
133
140
|
exon_offset: StrictInt = 0
|
134
141
|
gene: StrictStr
|
135
142
|
chr: StrictStr
|
136
|
-
strand:
|
143
|
+
strand: Strand
|
137
144
|
|
138
145
|
model_config = ConfigDict(
|
139
146
|
json_schema_extra={
|
@@ -144,7 +151,7 @@ class TranscriptExonData(BaseModelForbidExtra):
|
|
144
151
|
"exon": 1,
|
145
152
|
"exon_offset": 0,
|
146
153
|
"transcript": "NM_152263.3",
|
147
|
-
"strand":
|
154
|
+
"strand": Strand.NEGATIVE,
|
148
155
|
}
|
149
156
|
}
|
150
157
|
)
|
@@ -162,13 +169,13 @@ class GenomicData(BaseModelForbidExtra):
|
|
162
169
|
exon_end: Optional[StrictInt] = None
|
163
170
|
exon_end_offset: Optional[StrictInt] = 0
|
164
171
|
transcript: StrictStr
|
165
|
-
strand:
|
172
|
+
strand: Strand
|
166
173
|
|
167
174
|
@model_validator(mode="after")
|
168
175
|
def check_start_end(cls, values):
|
169
|
-
"""Check that at least one of {
|
170
|
-
Check that at least one of {
|
171
|
-
If not set, set corresponding offset to
|
176
|
+
"""Check that at least one of {``start``, ``end``} is set.
|
177
|
+
Check that at least one of {``exon_start``, ``exon_end``} is set.
|
178
|
+
If not set, set corresponding offset to ``None``
|
172
179
|
"""
|
173
180
|
msg = "Missing values for `start` or `end`"
|
174
181
|
start = values.start
|
@@ -200,7 +207,7 @@ class GenomicData(BaseModelForbidExtra):
|
|
200
207
|
"exon_start_offset": 0,
|
201
208
|
"exon_end_offset": None,
|
202
209
|
"transcript": "NM_152263.3",
|
203
|
-
"strand":
|
210
|
+
"strand": Strand.NEGATIVE,
|
204
211
|
}
|
205
212
|
}
|
206
213
|
)
|
@@ -254,7 +261,7 @@ class TranscriptExonDataResponse(BaseModelForbidExtra):
|
|
254
261
|
"exon": 1,
|
255
262
|
"exon_offset": 0,
|
256
263
|
"transcript": "NM_152263.3",
|
257
|
-
"strand":
|
264
|
+
"strand": Strand.NEGATIVE,
|
258
265
|
},
|
259
266
|
"warnings": [],
|
260
267
|
"service_meta": {
|
@@ -288,7 +295,7 @@ class GenomicDataResponse(BaseModelForbidExtra):
|
|
288
295
|
"exon_start_offset": 0,
|
289
296
|
"exon_end_offset": None,
|
290
297
|
"transcript": "NM_152263.3",
|
291
|
-
"strand":
|
298
|
+
"strand": Strand.NEGATIVE,
|
292
299
|
},
|
293
300
|
"warnings": [],
|
294
301
|
"service_meta": {
|
@@ -319,7 +326,7 @@ class MappedManeData(BaseModel):
|
|
319
326
|
"gene": "BRAF",
|
320
327
|
"refseq": "NM_001374258.1",
|
321
328
|
"ensembl": "ENST00000644969.2",
|
322
|
-
"strand":
|
329
|
+
"strand": Strand.NEGATIVE,
|
323
330
|
"status": TranscriptPriority.MANE_PLUS_CLINICAL,
|
324
331
|
"alt_ac": "NC_000007.13",
|
325
332
|
"assembly": "GRCh37",
|
@@ -342,7 +349,7 @@ class MappedManeDataService(BaseModelForbidExtra):
|
|
342
349
|
"gene": "BRAF",
|
343
350
|
"refseq": "NM_001374258.1",
|
344
351
|
"ensembl": "ENST00000644969.2",
|
345
|
-
"strand":
|
352
|
+
"strand": Strand.NEGATIVE,
|
346
353
|
"status": TranscriptPriority.MANE_PLUS_CLINICAL,
|
347
354
|
"alt_ac": "NC_000007.13",
|
348
355
|
"assembly": "GRCh37",
|
@@ -376,7 +383,7 @@ class ManeData(BaseModel):
|
|
376
383
|
"refseq": "NP_004324.2",
|
377
384
|
"ensembl": "ENSP00000493543.1",
|
378
385
|
"pos": (598, 598),
|
379
|
-
"strand":
|
386
|
+
"strand": Strand.NEGATIVE,
|
380
387
|
"status": TranscriptPriority.MANE_SELECT,
|
381
388
|
}
|
382
389
|
}
|
@@ -398,7 +405,7 @@ class ManeDataService(BaseModelForbidExtra):
|
|
398
405
|
"refseq": "NP_004324.2",
|
399
406
|
"ensembl": "ENSP00000493543.1",
|
400
407
|
"pos": (598, 598),
|
401
|
-
"strand":
|
408
|
+
"strand": Strand.NEGATIVE,
|
402
409
|
"status": TranscriptPriority.MANE_SELECT,
|
403
410
|
},
|
404
411
|
"warnings": [],
|
cool_seq_tool/sources/__init__.py
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
"""Module for providing basic acquisition/setup for the various resources"""
|
2
|
-
from .mane_transcript_mappings import
|
2
|
+
from .mane_transcript_mappings import ManeTranscriptMappings
|
3
3
|
from .transcript_mappings import TranscriptMappings
|
4
|
-
from .uta_database import
|
4
|
+
from .uta_database import UtaDatabase
|
5
|
+
|
6
|
+
__all__ = ["ManeTranscriptMappings", "TranscriptMappings", "UtaDatabase"]
|
cool_seq_tool/sources/mane_transcript_mappings.py
CHANGED
@@ -1,4 +1,6 @@
|
|
1
|
-
"""
|
1
|
+
"""Provide fast tabular access to MANE summary file. Enables retrieval of associated
|
2
|
+
MANE transcripts for gene symbols, genomic positions, or transcript accessions.
|
3
|
+
"""
|
2
4
|
import logging
|
3
5
|
from pathlib import Path
|
4
6
|
from typing import Dict, List
|
@@ -10,11 +12,19 @@ from cool_seq_tool.paths import MANE_SUMMARY_PATH
|
|
10
12
|
logger = logging.getLogger(__name__)
|
11
13
|
|
12
14
|
|
13
|
-
class
|
14
|
-
"""
|
15
|
+
class ManeTranscriptMappings:
|
16
|
+
"""Provide fast tabular access to MANE summary file.
|
17
|
+
|
18
|
+
By default, acquires data from `NCBI FTP server <ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/current/>`_
|
19
|
+
if unavailable locally. The local data location can be passed as an argument or
|
20
|
+
given under the environment variable ``MANE_SUMMARY_PATH``.
|
21
|
+
|
22
|
+
See the `NCBI MANE page <https://www.ncbi.nlm.nih.gov/refseq/MANE/>`_ for more information.
|
23
|
+
"""
|
15
24
|
|
16
25
|
def __init__(self, mane_data_path: Path = MANE_SUMMARY_PATH) -> None:
|
17
26
|
"""Initialize the MANE Transcript mappings class.
|
27
|
+
|
18
28
|
:param Path mane_data_path: Path to RefSeq MANE summary data
|
19
29
|
"""
|
20
30
|
self.mane_data_path = mane_data_path
|
@@ -22,16 +32,26 @@ class MANETranscriptMappings:
|
|
22
32
|
|
23
33
|
def _load_mane_transcript_data(self) -> pl.DataFrame:
|
24
34
|
"""Load RefSeq MANE data file into DataFrame.
|
35
|
+
|
25
36
|
:return: DataFrame containing RefSeq MANE Transcript data
|
26
37
|
"""
|
27
38
|
return pl.read_csv(self.mane_data_path, separator="\t")
|
28
39
|
|
29
40
|
def get_gene_mane_data(self, gene_symbol: str) -> List[Dict]:
|
30
41
|
"""Return MANE Transcript data for a gene.
|
42
|
+
|
43
|
+
>>> from cool_seq_tool.sources import ManeTranscriptMappings
|
44
|
+
>>> m = ManeTranscriptMappings()
|
45
|
+
>>> braf_mane = m.get_gene_mane_data("BRAF")
|
46
|
+
>>> braf_mane[0]["RefSeq_nuc"], braf_mane[0]["MANE_status"]
|
47
|
+
('NM_004333.6', 'MANE Select')
|
48
|
+
>>> braf_mane[1]["RefSeq_nuc"], braf_mane[1]["MANE_status"]
|
49
|
+
('NM_001374258.1', 'MANE Plus Clinical')
|
50
|
+
|
31
51
|
:param str gene_symbol: HGNC Gene Symbol
|
32
|
-
:return: List of MANE Transcript data (Transcript accessions,
|
33
|
-
|
34
|
-
Clinical
|
52
|
+
:return: List of MANE Transcript data (Transcript accessions, gene, and
|
53
|
+
location information). The list is sorted so that a MANE Select entry comes
|
54
|
+
first, followed by a MANE Plus Clinical entry, if available.
|
35
55
|
"""
|
36
56
|
data = self.df.filter(pl.col("symbol") == gene_symbol.upper())
|
37
57
|
|
@@ -58,7 +78,8 @@ class MANETranscriptMappings:
|
|
58
78
|
def get_mane_data_from_chr_pos(
|
59
79
|
self, alt_ac: str, start: int, end: int
|
60
80
|
) -> List[Dict]:
|
61
|
-
"""Get MANE data given
|
81
|
+
"""Get MANE data given a GRCh38 genomic position.
|
82
|
+
|
62
83
|
:param str alt_ac: NC Accession
|
63
84
|
:param int start: Start genomic position. Assumes residue coordinates.
|
64
85
|
:param int end: End genomic position. Assumes residue coordinates.
|
cool_seq_tool/sources/transcript_mappings.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
"""
|
1
|
+
"""Provide mappings between gene symbols and RefSeq + Ensembl transcript accessions."""
|
2
2
|
import csv
|
3
3
|
from pathlib import Path
|
4
4
|
from typing import Dict, List, Optional
|
@@ -7,7 +7,17 @@ from cool_seq_tool.paths import LRG_REFSEQGENE_PATH, TRANSCRIPT_MAPPINGS_PATH
|
|
7
7
|
|
8
8
|
|
9
9
|
class TranscriptMappings:
|
10
|
-
"""
|
10
|
+
"""Provide mappings between gene symbols and RefSeq + Ensembl transcript accessions.
|
11
|
+
|
12
|
+
Uses ``LRG_RefSeqGene`` and ``transcript_mappings.csv``, which will automatically
|
13
|
+
be acquired if they aren't already available. See the
|
14
|
+
:ref:`configuration <configuration>` section in the documentation for information
|
15
|
+
about manual acquisition of data.
|
16
|
+
|
17
|
+
In general, this class's methods expect to receive NCBI gene symbols, so users
|
18
|
+
should be careful about the sourcing of their input in cases where terms are
|
19
|
+
conflicted or ambiguous (which, to be fair, should be relatively rare).
|
20
|
+
"""
|
11
21
|
|
12
22
|
def __init__(
|
13
23
|
self,
|
@@ -16,8 +26,8 @@ class TranscriptMappings:
|
|
16
26
|
) -> None:
|
17
27
|
"""Initialize the transcript mappings class.
|
18
28
|
|
19
|
-
:param
|
20
|
-
:param
|
29
|
+
:param transcript_file_path: Path to transcript mappings file
|
30
|
+
:param lrg_refseqgene_path: Path to LRG RefSeqGene file
|
21
31
|
"""
|
22
32
|
# ENSP <-> Gene Symbol
|
23
33
|
self.ensembl_protein_version_for_gene_symbol: Dict[str, List[str]] = {}
|
@@ -53,7 +63,7 @@ class TranscriptMappings:
|
|
53
63
|
def _load_transcript_mappings_data(self, transcript_file_path: Path) -> None:
|
54
64
|
"""Load transcript mappings file to dictionaries.
|
55
65
|
|
56
|
-
:param
|
66
|
+
:param transcript_file_path: Path to transcript mappings file
|
57
67
|
"""
|
58
68
|
with open(transcript_file_path) as file:
|
59
69
|
reader = csv.DictReader(file, delimiter="\t")
|
@@ -127,7 +137,13 @@ class TranscriptMappings:
|
|
127
137
|
def protein_transcripts(self, identifier: str) -> List[str]:
|
128
138
|
"""Return a list of protein transcripts for a gene symbol.
|
129
139
|
|
130
|
-
|
140
|
+
>>> from cool_seq_tool.sources import TranscriptMappings
|
141
|
+
>>> braf_txs = TranscriptMappings().protein_transcripts("BRAF")
|
142
|
+
>>> braf_txs.sort()
|
143
|
+
>>> braf_txs[-1]
|
144
|
+
'NP_004324.2'
|
145
|
+
|
146
|
+
:param identifier: Gene identifier to get protein transcripts for
|
131
147
|
:return: Protein transcripts for a gene symbol
|
132
148
|
"""
|
133
149
|
protein_transcripts = list()
|
@@ -141,7 +157,7 @@ class TranscriptMappings:
|
|
141
157
|
def coding_dna_transcripts(self, identifier: str) -> List[str]:
|
142
158
|
"""Return transcripts from a coding dna refseq for a gene symbol.
|
143
159
|
|
144
|
-
:param
|
160
|
+
:param identifier: Gene identifier to find transcripts for
|
145
161
|
:return: cDNA transcripts for a gene symbol
|
146
162
|
"""
|
147
163
|
genomic_transcripts = list()
|
@@ -159,7 +175,7 @@ class TranscriptMappings:
|
|
159
175
|
def get_gene_symbol_from_ensembl_protein(self, q: str) -> Optional[str]:
|
160
176
|
"""Return the gene symbol for a Ensembl Protein.
|
161
177
|
|
162
|
-
:param
|
178
|
+
:param q: ensembl protein accession
|
163
179
|
:return: Gene symbol
|
164
180
|
"""
|
165
181
|
gene_symbol = self.ensembl_protein_version_to_gene_symbol.get(q)
|
@@ -172,7 +188,7 @@ class TranscriptMappings:
|
|
172
188
|
def get_gene_symbol_from_refeq_protein(self, q: str) -> Optional[str]:
|
173
189
|
"""Return the gene symbol for a Refseq Protein.
|
174
190
|
|
175
|
-
:param
|
191
|
+
:param q: RefSeq protein accession
|
176
192
|
:return: Gene symbol
|
177
193
|
"""
|
178
194
|
return self.refseq_protein_to_gene_symbol.get(q)
|
@@ -180,7 +196,7 @@ class TranscriptMappings:
|
|
180
196
|
def get_gene_symbol_from_refseq_rna(self, q: str) -> Optional[str]:
|
181
197
|
"""Return gene symbol for a Refseq RNA Transcript.
|
182
198
|
|
183
|
-
:param
|
199
|
+
:param q: RefSeq RNA transcript accession
|
184
200
|
:return: Gene symbol
|
185
201
|
"""
|
186
202
|
gene_symbol = self.refseq_rna_version_to_gene_symbol.get(q)
|
@@ -193,7 +209,7 @@ class TranscriptMappings:
|
|
193
209
|
def get_gene_symbol_from_ensembl_transcript(self, q: str) -> Optional[str]:
|
194
210
|
"""Return gene symbol for an Ensembl Transcript.
|
195
211
|
|
196
|
-
:param
|
212
|
+
:param q: Ensembl transcript accession
|
197
213
|
:return: Gene symbol
|
198
214
|
"""
|
199
215
|
gene_symbol = self.ensembl_transcript_version_to_gene_symbol.get(q)
|