cool-seq-tool 0.4.0.dev3__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- cool_seq_tool/__init__.py +7 -11
- cool_seq_tool/app.py +44 -24
- cool_seq_tool/handlers/__init__.py +1 -0
- cool_seq_tool/handlers/seqrepo_access.py +27 -25
- cool_seq_tool/mappers/__init__.py +3 -1
- cool_seq_tool/mappers/alignment.py +5 -6
- cool_seq_tool/mappers/exon_genomic_coords.py +139 -124
- cool_seq_tool/mappers/liftover.py +90 -0
- cool_seq_tool/mappers/mane_transcript.py +208 -113
- cool_seq_tool/resources/__init__.py +1 -0
- cool_seq_tool/resources/data_files.py +93 -0
- cool_seq_tool/resources/status.py +153 -0
- cool_seq_tool/schemas.py +92 -54
- cool_seq_tool/sources/__init__.py +1 -0
- cool_seq_tool/sources/mane_transcript_mappings.py +16 -9
- cool_seq_tool/sources/transcript_mappings.py +41 -32
- cool_seq_tool/sources/uta_database.py +96 -249
- cool_seq_tool/utils.py +44 -4
- {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.5.0.dist-info}/LICENSE +1 -1
- {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.5.0.dist-info}/METADATA +16 -11
- cool_seq_tool-0.5.0.dist-info/RECORD +24 -0
- {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.5.0.dist-info}/WHEEL +1 -1
- cool_seq_tool/api.py +0 -42
- cool_seq_tool/data/__init__.py +0 -2
- cool_seq_tool/data/data_downloads.py +0 -89
- cool_seq_tool/paths.py +0 -28
- cool_seq_tool/routers/__init__.py +0 -16
- cool_seq_tool/routers/default.py +0 -125
- cool_seq_tool/routers/mane.py +0 -98
- cool_seq_tool/routers/mappings.py +0 -155
- cool_seq_tool/version.py +0 -2
- cool_seq_tool-0.4.0.dev3.dist-info/RECORD +0 -29
- /cool_seq_tool/{data → resources}/transcript_mapping.tsv +0 -0
- {cool_seq_tool-0.4.0.dev3.dist-info → cool_seq_tool-0.5.0.dist-info}/top_level.txt +0 -0
@@ -1,15 +1,15 @@
|
|
1
1
|
"""Provide mappings between gene symbols and RefSeq + Ensembl transcript accessions."""
|
2
|
+
|
2
3
|
import csv
|
3
4
|
from pathlib import Path
|
4
|
-
from typing import Dict, List, Optional
|
5
5
|
|
6
|
-
from cool_seq_tool.
|
6
|
+
from cool_seq_tool.resources.data_files import DataFile, get_data_file
|
7
7
|
|
8
8
|
|
9
9
|
class TranscriptMappings:
|
10
10
|
"""Provide mappings between gene symbols and RefSeq + Ensembl transcript accessions.
|
11
11
|
|
12
|
-
Uses ``LRG_RefSeqGene`` and ``transcript_mappings.
|
12
|
+
Uses ``LRG_RefSeqGene`` and ``transcript_mappings.tsv``, which will automatically
|
13
13
|
be acquired if they aren't already available. See the
|
14
14
|
:ref:`configuration <configuration>` section in the documentation for information
|
15
15
|
about manual acquisition of data.
|
@@ -21,44 +21,53 @@ class TranscriptMappings:
|
|
21
21
|
|
22
22
|
def __init__(
|
23
23
|
self,
|
24
|
-
transcript_file_path: Path =
|
25
|
-
lrg_refseqgene_path: Path =
|
24
|
+
transcript_file_path: Path | None = None,
|
25
|
+
lrg_refseqgene_path: Path | None = None,
|
26
|
+
from_local: bool = False,
|
26
27
|
) -> None:
|
27
28
|
"""Initialize the transcript mappings class.
|
28
29
|
|
29
30
|
:param transcript_file_path: Path to transcript mappings file
|
30
31
|
:param lrg_refseqgene_path: Path to LRG RefSeqGene file
|
32
|
+
:param from_local: if ``True``, don't check for or acquire latest version --
|
33
|
+
just provide most recent locally available file, if possible, and raise
|
34
|
+
error otherwise
|
31
35
|
"""
|
32
36
|
# ENSP <-> Gene Symbol
|
33
|
-
self.ensembl_protein_version_for_gene_symbol:
|
34
|
-
self.ensembl_protein_version_to_gene_symbol:
|
35
|
-
self.ensembl_protein_for_gene_symbol:
|
36
|
-
self.ensembl_protein_to_gene_symbol:
|
37
|
+
self.ensembl_protein_version_for_gene_symbol: dict[str, list[str]] = {}
|
38
|
+
self.ensembl_protein_version_to_gene_symbol: dict[str, str] = {}
|
39
|
+
self.ensembl_protein_for_gene_symbol: dict[str, list[str]] = {}
|
40
|
+
self.ensembl_protein_to_gene_symbol: dict[str, str] = {}
|
37
41
|
|
38
42
|
# Gene Symbol <-> ENST
|
39
|
-
self.ensembl_transcript_version_for_gene_symbol:
|
40
|
-
self.ensembl_transcript_version_to_gene_symbol:
|
41
|
-
self.ensembl_transcript_for_gene_symbol:
|
42
|
-
self.ensembl_transcript_to_gene_symbol:
|
43
|
+
self.ensembl_transcript_version_for_gene_symbol: dict[str, list[str]] = {}
|
44
|
+
self.ensembl_transcript_version_to_gene_symbol: dict[str, str] = {}
|
45
|
+
self.ensembl_transcript_for_gene_symbol: dict[str, list[str]] = {}
|
46
|
+
self.ensembl_transcript_to_gene_symbol: dict[str, str] = {}
|
43
47
|
|
44
48
|
# NP_ <-> Gene Symbol
|
45
|
-
self.refseq_protein_for_gene_symbol:
|
46
|
-
self.refseq_protein_to_gene_symbol:
|
49
|
+
self.refseq_protein_for_gene_symbol: dict[str, list[str]] = {}
|
50
|
+
self.refseq_protein_to_gene_symbol: dict[str, str] = {}
|
47
51
|
|
48
52
|
# NM_ <-> Gene Symbol
|
49
|
-
self.refseq_rna_version_for_gene_symbol:
|
50
|
-
self.refseq_rna_version_to_gene_symbol:
|
51
|
-
self.refseq_rna_for_gene_symbol:
|
52
|
-
self.refseq_rna_to_gene_symbol:
|
53
|
+
self.refseq_rna_version_for_gene_symbol: dict[str, list[str]] = {}
|
54
|
+
self.refseq_rna_version_to_gene_symbol: dict[str, str] = {}
|
55
|
+
self.refseq_rna_for_gene_symbol: dict[str, list[str]] = {}
|
56
|
+
self.refseq_rna_to_gene_symbol: dict[str, str] = {}
|
53
57
|
|
54
58
|
# NP -> NM
|
55
|
-
self.np_to_nm:
|
59
|
+
self.np_to_nm: dict[str, str] = {}
|
56
60
|
|
57
61
|
# ENSP -> ENST
|
58
|
-
self.ensp_to_enst:
|
62
|
+
self.ensp_to_enst: dict[str, str] = {}
|
59
63
|
|
60
|
-
self._load_transcript_mappings_data(
|
61
|
-
|
64
|
+
self._load_transcript_mappings_data(
|
65
|
+
transcript_file_path
|
66
|
+
or get_data_file(DataFile.TRANSCRIPT_MAPPINGS, from_local)
|
67
|
+
)
|
68
|
+
self._load_refseq_gene_symbol_data(
|
69
|
+
lrg_refseqgene_path or get_data_file(DataFile.LRG_REFSEQGENE, from_local)
|
70
|
+
)
|
62
71
|
|
63
72
|
def _load_transcript_mappings_data(self, transcript_file_path: Path) -> None:
|
64
73
|
"""Load transcript mappings file to dictionaries.
|
@@ -99,9 +108,9 @@ class TranscriptMappings:
|
|
99
108
|
).append(transcript)
|
100
109
|
self.ensembl_transcript_to_gene_symbol[transcript] = gene
|
101
110
|
if versioned_transcript and versioned_protein_transcript:
|
102
|
-
self.ensp_to_enst[
|
103
|
-
|
104
|
-
|
111
|
+
self.ensp_to_enst[versioned_protein_transcript] = (
|
112
|
+
versioned_transcript
|
113
|
+
)
|
105
114
|
|
106
115
|
def _load_refseq_gene_symbol_data(self, lrg_refseqgene_path: Path) -> None:
|
107
116
|
"""Load data from RefSeq Gene Symbol file to dictionaries.
|
@@ -134,7 +143,7 @@ class TranscriptMappings:
|
|
134
143
|
if refseq_transcript and rna_transcript:
|
135
144
|
self.np_to_nm[refseq_transcript] = rna_transcript
|
136
145
|
|
137
|
-
def protein_transcripts(self, identifier: str) ->
|
146
|
+
def protein_transcripts(self, identifier: str) -> list[str]:
|
138
147
|
"""Return a list of protein transcripts for a gene symbol.
|
139
148
|
|
140
149
|
>>> from cool_seq_tool.sources import TranscriptMappings
|
@@ -154,7 +163,7 @@ class TranscriptMappings:
|
|
154
163
|
protein_transcripts += self.refseq_protein_for_gene_symbol.get(identifier, "")
|
155
164
|
return list(set(protein_transcripts))
|
156
165
|
|
157
|
-
def coding_dna_transcripts(self, identifier: str) ->
|
166
|
+
def coding_dna_transcripts(self, identifier: str) -> list[str]:
|
158
167
|
"""Return transcripts from a coding dna refseq for a gene symbol.
|
159
168
|
|
160
169
|
:param identifier: Gene identifier to find transcripts for
|
@@ -172,7 +181,7 @@ class TranscriptMappings:
|
|
172
181
|
)
|
173
182
|
return list(set(genomic_transcripts))
|
174
183
|
|
175
|
-
def get_gene_symbol_from_ensembl_protein(self, q: str) ->
|
184
|
+
def get_gene_symbol_from_ensembl_protein(self, q: str) -> str | None:
|
176
185
|
"""Return the gene symbol for a Ensembl Protein.
|
177
186
|
|
178
187
|
:param q: ensembl protein accession
|
@@ -184,7 +193,7 @@ class TranscriptMappings:
|
|
184
193
|
gene_symbol = self.ensembl_protein_to_gene_symbol.get(q)
|
185
194
|
return gene_symbol
|
186
195
|
|
187
|
-
def get_gene_symbol_from_refeq_protein(self, q: str) ->
|
196
|
+
def get_gene_symbol_from_refeq_protein(self, q: str) -> str | None:
|
188
197
|
"""Return the gene symbol for a Refseq Protein.
|
189
198
|
|
190
199
|
:param q: RefSeq protein accession
|
@@ -192,7 +201,7 @@ class TranscriptMappings:
|
|
192
201
|
"""
|
193
202
|
return self.refseq_protein_to_gene_symbol.get(q)
|
194
203
|
|
195
|
-
def get_gene_symbol_from_refseq_rna(self, q: str) ->
|
204
|
+
def get_gene_symbol_from_refseq_rna(self, q: str) -> str | None:
|
196
205
|
"""Return gene symbol for a Refseq RNA Transcript.
|
197
206
|
|
198
207
|
:param q: RefSeq RNA transcript accession
|
@@ -204,7 +213,7 @@ class TranscriptMappings:
|
|
204
213
|
gene_symbol = self.refseq_rna_to_gene_symbol.get(q)
|
205
214
|
return gene_symbol
|
206
215
|
|
207
|
-
def get_gene_symbol_from_ensembl_transcript(self, q: str) ->
|
216
|
+
def get_gene_symbol_from_ensembl_transcript(self, q: str) -> str | None:
|
208
217
|
"""Return gene symbol for an Ensembl Transcript.
|
209
218
|
|
210
219
|
:param q: Ensembl transcript accession
|