cool-seq-tool 0.4.0.dev1__py3-none-any.whl → 0.4.0.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cool_seq_tool/data/__init__.py +1 -1
- cool_seq_tool/data/data_downloads.py +19 -13
- cool_seq_tool/handlers/seqrepo_access.py +2 -4
- cool_seq_tool/mappers/exon_genomic_coords.py +25 -26
- cool_seq_tool/mappers/mane_transcript.py +112 -106
- cool_seq_tool/routers/default.py +7 -9
- cool_seq_tool/routers/mane.py +2 -2
- cool_seq_tool/schemas.py +30 -21
- cool_seq_tool/sources/mane_transcript_mappings.py +1 -1
- cool_seq_tool/sources/transcript_mappings.py +13 -16
- cool_seq_tool/sources/uta_database.py +134 -153
- cool_seq_tool/utils.py +5 -2
- cool_seq_tool/version.py +1 -1
- {cool_seq_tool-0.4.0.dev1.dist-info → cool_seq_tool-0.4.0.dev2.dist-info}/METADATA +7 -6
- cool_seq_tool-0.4.0.dev2.dist-info/RECORD +29 -0
- cool_seq_tool-0.4.0.dev1.dist-info/RECORD +0 -29
- {cool_seq_tool-0.4.0.dev1.dist-info → cool_seq_tool-0.4.0.dev2.dist-info}/LICENSE +0 -0
- {cool_seq_tool-0.4.0.dev1.dist-info → cool_seq_tool-0.4.0.dev2.dist-info}/WHEEL +0 -0
- {cool_seq_tool-0.4.0.dev1.dist-info → cool_seq_tool-0.4.0.dev2.dist-info}/top_level.txt +0 -0
cool_seq_tool/data/__init__.py
CHANGED
@@ -1,2 +1,2 @@
|
|
1
1
|
"""Module for data"""
|
2
|
-
from .data_downloads import DataDownload
|
2
|
+
from .data_downloads import DataDownload
|
@@ -4,7 +4,6 @@ import gzip
|
|
4
4
|
import logging
|
5
5
|
import shutil
|
6
6
|
from ftplib import FTP
|
7
|
-
from os import remove
|
8
7
|
from pathlib import Path
|
9
8
|
|
10
9
|
from dateutil import parser
|
@@ -38,18 +37,20 @@ class DataDownload:
|
|
38
37
|
files = ftp.nlst()
|
39
38
|
mane_summary_file = [f for f in files if f.endswith(".summary.txt.gz")]
|
40
39
|
if not mane_summary_file:
|
41
|
-
|
40
|
+
msg = "Unable to download MANE summary data"
|
41
|
+
raise Exception(msg)
|
42
42
|
mane_summary_file = mane_summary_file[0]
|
43
43
|
self._mane_summary_path = self._data_dir / mane_summary_file[:-3]
|
44
44
|
mane_data_path = self._data_dir / mane_summary_file
|
45
45
|
if not self._mane_summary_path.exists():
|
46
46
|
logger.info("Downloading MANE summary file from NCBI.")
|
47
|
-
with open(
|
47
|
+
with mane_data_path.open("wb") as fp:
|
48
48
|
ftp.retrbinary(f"RETR {mane_summary_file}", fp.write)
|
49
|
-
with gzip.open(
|
50
|
-
|
51
|
-
|
52
|
-
|
49
|
+
with gzip.open(
|
50
|
+
mane_data_path, "rb"
|
51
|
+
) as f_in, self._mane_summary_path.open("wb") as f_out:
|
52
|
+
shutil.copyfileobj(f_in, f_out)
|
53
|
+
mane_data_path.unlink()
|
53
54
|
logger.info("MANE summary file download complete.")
|
54
55
|
return self._mane_summary_path
|
55
56
|
|
@@ -66,18 +67,23 @@ class DataDownload:
|
|
66
67
|
ftp_file_path = f"{ftp_dir_path}{lrg_refseqgene_file}"
|
67
68
|
timestamp = ftp.voidcmd(f"MDTM {ftp_file_path}")[4:].strip()
|
68
69
|
date = str(parser.parse(timestamp)).split()[0]
|
69
|
-
version =
|
70
|
+
version = (
|
71
|
+
datetime.datetime.strptime(date, "%Y-%m-%d")
|
72
|
+
.astimezone(tz=datetime.timezone.utc)
|
73
|
+
.strftime("%Y%m%d")
|
74
|
+
)
|
70
75
|
fn_versioned = f"{lrg_refseqgene_file}_{version}"
|
71
76
|
lrg_refseqgene_path = self._data_dir / lrg_refseqgene_file
|
72
77
|
self._lrg_refseqgene_path = self._data_dir / fn_versioned
|
73
78
|
if not self._lrg_refseqgene_path.exists():
|
74
79
|
logger.info("Downloading LRG RefSeq data from NCBI.")
|
75
80
|
ftp.cwd(ftp_dir_path)
|
76
|
-
with open(
|
81
|
+
with lrg_refseqgene_path.open("wb") as fp:
|
77
82
|
ftp.retrbinary(f"RETR {lrg_refseqgene_file}", fp.write)
|
78
|
-
with open(
|
79
|
-
|
80
|
-
|
81
|
-
|
83
|
+
with lrg_refseqgene_path.open(
|
84
|
+
"rb"
|
85
|
+
) as f_in, self._lrg_refseqgene_path.open("wb") as f_out:
|
86
|
+
shutil.copyfileobj(f_in, f_out)
|
87
|
+
lrg_refseqgene_path.unlink()
|
82
88
|
logger.info("LRG RefSeq data download complete.")
|
83
89
|
return self._lrg_refseqgene_path
|
@@ -152,8 +152,7 @@ class SeqRepoAccess(SeqRepoDataProxy):
|
|
152
152
|
acs.append(ac.split("refseq:")[-1])
|
153
153
|
if acs:
|
154
154
|
return acs, None
|
155
|
-
|
156
|
-
return None, f"{chromosome} is not a valid chromosome"
|
155
|
+
return None, f"{chromosome} is not a valid chromosome"
|
157
156
|
|
158
157
|
def ac_to_chromosome(self, ac: str) -> Tuple[Optional[str], Optional[str]]:
|
159
158
|
"""Get chromosome for accession.
|
@@ -172,8 +171,7 @@ class SeqRepoAccess(SeqRepoDataProxy):
|
|
172
171
|
)[0]
|
173
172
|
if aliases is None:
|
174
173
|
return None, f"Unable to get chromosome for {ac}"
|
175
|
-
|
176
|
-
return aliases, None
|
174
|
+
return aliases, None
|
177
175
|
|
178
176
|
def get_fasta_file(self, sequence_id: str, outfile_path: Path) -> None:
|
179
177
|
"""Retrieve FASTA file containing sequence for requested sequence ID.
|
@@ -112,8 +112,7 @@ class ExonGenomicCoordsMapper:
|
|
112
112
|
# Ensure valid inputs
|
113
113
|
if not transcript:
|
114
114
|
return self._return_warnings(resp, "Must provide `transcript`")
|
115
|
-
|
116
|
-
transcript = transcript.strip()
|
115
|
+
transcript = transcript.strip()
|
117
116
|
|
118
117
|
exon_start_exists, exon_end_exists = False, False
|
119
118
|
if exon_start is not None:
|
@@ -130,12 +129,11 @@ class ExonGenomicCoordsMapper:
|
|
130
129
|
return self._return_warnings(
|
131
130
|
resp, "Must provide either `exon_start` or `exon_end`"
|
132
131
|
)
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
)
|
132
|
+
if exon_start_exists and exon_end_exists and (exon_start > exon_end):
|
133
|
+
return self._return_warnings(
|
134
|
+
resp,
|
135
|
+
f"Start exon {exon_start} is greater than end exon {exon_end}",
|
136
|
+
)
|
139
137
|
|
140
138
|
# Get all exons and associated start/end coordinates for transcript
|
141
139
|
tx_exons, warning = await self.uta_db.get_tx_exons(transcript)
|
@@ -266,7 +264,7 @@ class ExonGenomicCoordsMapper:
|
|
266
264
|
if start is None and end is None:
|
267
265
|
return self._return_warnings(resp, "Must provide either `start` or `end`")
|
268
266
|
|
269
|
-
params = {key: None for key in GenomicData.model_fields
|
267
|
+
params = {key: None for key in GenomicData.model_fields}
|
270
268
|
if gene is not None:
|
271
269
|
gene = gene.upper().strip()
|
272
270
|
|
@@ -312,13 +310,12 @@ class ExonGenomicCoordsMapper:
|
|
312
310
|
|
313
311
|
for field in ["transcript", "gene", "chr", "strand"]:
|
314
312
|
if start_data:
|
315
|
-
if end_data:
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
return self._return_warnings(resp, msg)
|
313
|
+
if end_data and (start_data[field] != end_data[field]):
|
314
|
+
msg = (
|
315
|
+
f"Start `{field}`, {start_data[field]}, does "
|
316
|
+
f"not match End `{field}`, {end_data[field]}"
|
317
|
+
)
|
318
|
+
return self._return_warnings(resp, msg)
|
322
319
|
params[field] = start_data[field]
|
323
320
|
else:
|
324
321
|
params[field] = end_data[field]
|
@@ -440,7 +437,10 @@ class ExonGenomicCoordsMapper:
|
|
440
437
|
else:
|
441
438
|
error = "Strand does not match"
|
442
439
|
logger.warning(
|
443
|
-
|
440
|
+
"%s: %s != %s",
|
441
|
+
error,
|
442
|
+
alt_ac_data["start"][i],
|
443
|
+
alt_ac_data["end"][i],
|
444
444
|
)
|
445
445
|
return None, error
|
446
446
|
return tuple(alt_ac_data_values), None
|
@@ -482,7 +482,7 @@ class ExonGenomicCoordsMapper:
|
|
482
482
|
resp, "Must provide either `gene` or `transcript`"
|
483
483
|
)
|
484
484
|
|
485
|
-
params = {key: None for key in TranscriptExonData.model_fields
|
485
|
+
params = {key: None for key in TranscriptExonData.model_fields}
|
486
486
|
|
487
487
|
if alt_ac:
|
488
488
|
# Check if valid accession is given
|
@@ -550,7 +550,7 @@ class ExonGenomicCoordsMapper:
|
|
550
550
|
len_alt_acs = len(alt_acs)
|
551
551
|
if len_alt_acs > 1:
|
552
552
|
return None, f"Found more than one accessions: {alt_acs}"
|
553
|
-
|
553
|
+
if len_alt_acs == 0:
|
554
554
|
return None, "No genomic accessions found"
|
555
555
|
alt_ac = next(iter(alt_acs))
|
556
556
|
|
@@ -565,13 +565,12 @@ class ExonGenomicCoordsMapper:
|
|
565
565
|
elif len_genes == 0:
|
566
566
|
return None, "No genes found"
|
567
567
|
|
568
|
-
if input_gene is not None:
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
)
|
568
|
+
if input_gene is not None and output_gene != input_gene.upper():
|
569
|
+
return (
|
570
|
+
None,
|
571
|
+
f"Input gene, {input_gene}, does not match "
|
572
|
+
f"expected output gene, {output_gene}",
|
573
|
+
)
|
575
574
|
|
576
575
|
gene = output_gene if output_gene else input_gene
|
577
576
|
return (gene, alt_ac), None
|
@@ -13,7 +13,7 @@ constraints and data models for coordinate representation.
|
|
13
13
|
"""
|
14
14
|
import logging
|
15
15
|
import math
|
16
|
-
from enum import
|
16
|
+
from enum import Enum
|
17
17
|
from typing import Dict, List, Optional, Set, Tuple, Union
|
18
18
|
|
19
19
|
import polars as pl
|
@@ -37,7 +37,7 @@ from cool_seq_tool.utils import get_inter_residue_pos
|
|
37
37
|
logger = logging.getLogger(__name__)
|
38
38
|
|
39
39
|
|
40
|
-
class EndAnnotationLayer(
|
40
|
+
class EndAnnotationLayer(str, Enum):
|
41
41
|
"""Define constraints for end annotation layer. This is used for determining the
|
42
42
|
end annotation layer when getting the longest compatible remaining representation
|
43
43
|
"""
|
@@ -143,10 +143,7 @@ class ManeTranscript:
|
|
143
143
|
:return: cDNA position start, cDNA position end
|
144
144
|
"""
|
145
145
|
start_pos = start * 3
|
146
|
-
if end != start
|
147
|
-
end_pos = end * 3
|
148
|
-
else:
|
149
|
-
end_pos = start_pos
|
146
|
+
end_pos = end * 3 if end != start else start_pos
|
150
147
|
return start_pos, end_pos - 1
|
151
148
|
|
152
149
|
async def _p_to_c(
|
@@ -170,10 +167,10 @@ class ManeTranscript:
|
|
170
167
|
elif ac.startswith("ENSP"):
|
171
168
|
ac = self.transcript_mappings.ensp_to_enst[ac]
|
172
169
|
else:
|
173
|
-
logger.warning(
|
170
|
+
logger.warning("Unable to find accession: %s", ac)
|
174
171
|
return None
|
175
172
|
except KeyError:
|
176
|
-
logger.warning(
|
173
|
+
logger.warning("%s not found in transcript_mappings", ac)
|
177
174
|
return None
|
178
175
|
|
179
176
|
pos = self._p_to_c_pos(start_pos, end_pos)
|
@@ -190,14 +187,16 @@ class ManeTranscript:
|
|
190
187
|
# UTA does not store ENST versions
|
191
188
|
# So we want to make sure version is valid
|
192
189
|
if ac.startswith("ENST"):
|
193
|
-
if
|
194
|
-
|
195
|
-
|
196
|
-
|
190
|
+
if (
|
191
|
+
not self.transcript_mappings.ensembl_transcript_version_to_gene_symbol.get(
|
192
|
+
ac
|
193
|
+
)
|
194
|
+
and not self.seqrepo_access.get_reference_sequence(ac, start=1, end=1)[
|
197
195
|
0
|
198
|
-
]
|
199
|
-
|
200
|
-
|
196
|
+
]
|
197
|
+
):
|
198
|
+
logger.warning("Ensembl transcript not found: %s", ac)
|
199
|
+
return None
|
201
200
|
|
202
201
|
temp_ac = ac.split(".")[0]
|
203
202
|
else:
|
@@ -206,15 +205,14 @@ class ManeTranscript:
|
|
206
205
|
# c. coordinate does not contain cds start, so we need to add it
|
207
206
|
cds_start_end = await self.uta_db.get_cds_start_end(temp_ac)
|
208
207
|
if not cds_start_end:
|
209
|
-
logger.warning(
|
208
|
+
logger.warning("Accession %s not found in UTA", temp_ac)
|
210
209
|
return None
|
211
210
|
coding_start_site = cds_start_end[0]
|
212
211
|
pos = pos[0] + coding_start_site, pos[1] + coding_start_site
|
213
212
|
|
214
|
-
|
213
|
+
return await self._get_and_validate_genomic_tx_data(
|
215
214
|
ac, pos, AnnotationLayer.CDNA, coding_start_site=coding_start_site
|
216
215
|
)
|
217
|
-
return genomic_tx_data
|
218
216
|
|
219
217
|
async def _get_and_validate_genomic_tx_data(
|
220
218
|
self,
|
@@ -240,8 +238,10 @@ class ManeTranscript:
|
|
240
238
|
)
|
241
239
|
if not genomic_tx_data:
|
242
240
|
logger.warning(
|
243
|
-
|
244
|
-
|
241
|
+
"Unable to find genomic_tx_data for %s at position %s on annotation layer %s",
|
242
|
+
alt_ac,
|
243
|
+
pos,
|
244
|
+
annotation_layer,
|
245
245
|
)
|
246
246
|
return None
|
247
247
|
genomic_tx_data["coding_start_site"] = coding_start_site
|
@@ -256,9 +256,9 @@ class ManeTranscript:
|
|
256
256
|
# Validation check: Exon structure
|
257
257
|
if og_alt_exon_id != liftover_alt_exon_id:
|
258
258
|
logger.warning(
|
259
|
-
|
260
|
-
|
261
|
-
|
259
|
+
"Original alt_exon_id %s does not match liftover alt_exon_id %s",
|
260
|
+
og_alt_exon_id,
|
261
|
+
liftover_alt_exon_id,
|
262
262
|
)
|
263
263
|
return None
|
264
264
|
|
@@ -294,7 +294,9 @@ class ManeTranscript:
|
|
294
294
|
|
295
295
|
if lt_cds_start or gt_cds_end:
|
296
296
|
logger.info(
|
297
|
-
|
297
|
+
"%s with position %s is not within CDS start/end",
|
298
|
+
refseq_c_ac,
|
299
|
+
c_pos_change,
|
298
300
|
)
|
299
301
|
|
300
302
|
return CdnaRepresentation(
|
@@ -380,13 +382,12 @@ class ManeTranscript:
|
|
380
382
|
|
381
383
|
if not result:
|
382
384
|
logger.warning(
|
383
|
-
|
385
|
+
"Unable to find transcript, %s, position change", refseq_c_ac
|
384
386
|
)
|
385
387
|
return None
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
tx_pos_range = result[2], result[3] # tx_start_i, tx_end_i
|
388
|
+
result = result[-1]
|
389
|
+
tx_g_pos = result[5], result[6] # alt_start_i, alt_end_i
|
390
|
+
tx_pos_range = result[2], result[3] # tx_start_i, tx_end_i
|
390
391
|
|
391
392
|
cds_start_end = await self.uta_db.get_cds_start_end(refseq_c_ac)
|
392
393
|
if not cds_start_end:
|
@@ -438,14 +439,17 @@ class ManeTranscript:
|
|
438
439
|
|
439
440
|
if og_rf != new_rf:
|
440
441
|
logger.warning(
|
441
|
-
|
442
|
-
|
443
|
-
|
442
|
+
"%s original reading frame (%s) does not match new %s, %s reading frame (%s)",
|
443
|
+
ac,
|
444
|
+
og_rf,
|
445
|
+
transcript_data.ensembl,
|
446
|
+
transcript_data.refseq,
|
447
|
+
new_rf,
|
444
448
|
)
|
445
449
|
return False
|
446
450
|
else:
|
447
451
|
if pos_index == 0:
|
448
|
-
logger.warning(
|
452
|
+
logger.warning("%s must having start position", ac)
|
449
453
|
return False
|
450
454
|
return True
|
451
455
|
|
@@ -503,13 +507,15 @@ class ManeTranscript:
|
|
503
507
|
|
504
508
|
if expected_ref != mane_ref:
|
505
509
|
logger.info(
|
506
|
-
|
507
|
-
|
510
|
+
"Expected ref, %s, but got %s on MANE accession, %s",
|
511
|
+
expected_ref,
|
512
|
+
mane_ref,
|
513
|
+
mane_transcript.refseq,
|
508
514
|
)
|
509
515
|
|
510
516
|
if expected_ref != ref:
|
511
517
|
logger.warning(
|
512
|
-
|
518
|
+
"Expected ref, %s, but got %s on accession, %s", expected_ref, ref, ac
|
513
519
|
)
|
514
520
|
return False
|
515
521
|
|
@@ -531,8 +537,7 @@ class ManeTranscript:
|
|
531
537
|
ac, start=start_pos, end=end_pos, residue_mode=ResidueMode.INTER_RESIDUE
|
532
538
|
)[0]:
|
533
539
|
return True
|
534
|
-
|
535
|
-
return False
|
540
|
+
return False
|
536
541
|
|
537
542
|
def _get_prioritized_transcripts_from_gene(self, df: pl.DataFrame) -> List:
|
538
543
|
"""Sort and filter transcripts from gene to get priority list
|
@@ -687,7 +692,7 @@ class ManeTranscript:
|
|
687
692
|
)
|
688
693
|
|
689
694
|
if df.is_empty():
|
690
|
-
logger.warning(
|
695
|
+
logger.warning("Unable to get transcripts from gene %s", gene)
|
691
696
|
return lcr_result
|
692
697
|
|
693
698
|
prioritized_tx_acs = self._get_prioritized_transcripts_from_gene(df)
|
@@ -810,38 +815,42 @@ class ManeTranscript:
|
|
810
815
|
|
811
816
|
if not self._validate_index(ac, pos, coding_start_site):
|
812
817
|
logger.warning(
|
813
|
-
|
814
|
-
|
818
|
+
"%s are not valid positions on %s with coding start site %s",
|
819
|
+
pos,
|
820
|
+
ac,
|
821
|
+
coding_start_site,
|
815
822
|
)
|
816
823
|
continue
|
817
824
|
return lcr_result
|
818
|
-
|
819
|
-
|
820
|
-
|
821
|
-
|
822
|
-
|
823
|
-
|
824
|
-
|
825
|
-
|
826
|
-
|
827
|
-
|
828
|
-
|
829
|
-
|
830
|
-
|
831
|
-
|
832
|
-
|
833
|
-
|
834
|
-
|
835
|
-
|
836
|
-
|
837
|
-
|
838
|
-
|
839
|
-
|
840
|
-
|
841
|
-
|
842
|
-
|
843
|
-
|
844
|
-
|
825
|
+
lcr_result = ProteinAndCdnaRepresentation(
|
826
|
+
protein=_get_protein_rep(
|
827
|
+
gene,
|
828
|
+
row["pro_ac"],
|
829
|
+
lcr_c_data.pos,
|
830
|
+
g["strand"],
|
831
|
+
lcr_c_data.status,
|
832
|
+
),
|
833
|
+
cdna=lcr_c_data,
|
834
|
+
)
|
835
|
+
lcr_result_dict = lcr_result.model_dump()
|
836
|
+
|
837
|
+
valid = True
|
838
|
+
for k in lcr_result_dict:
|
839
|
+
cds = lcr_result_dict[k].get("coding_start_site", 0)
|
840
|
+
ac = lcr_result_dict[k]["refseq"] or lcr_result_dict[k]["ensembl"]
|
841
|
+
pos = lcr_result_dict[k]["pos"]
|
842
|
+
if not self._validate_index(ac, pos, cds):
|
843
|
+
valid = False
|
844
|
+
logger.warning(
|
845
|
+
"%s are not valid positions on %s with coding start site %s",
|
846
|
+
pos,
|
847
|
+
ac,
|
848
|
+
cds,
|
849
|
+
)
|
850
|
+
break
|
851
|
+
|
852
|
+
if valid:
|
853
|
+
return lcr_result
|
845
854
|
return lcr_result
|
846
855
|
|
847
856
|
async def get_mane_transcript(
|
@@ -917,9 +926,10 @@ class ManeTranscript:
|
|
917
926
|
# those transcripts meeting criterion
|
918
927
|
mane_transcripts = set()
|
919
928
|
for current_mane_data in mane_data:
|
920
|
-
mane_transcripts |=
|
921
|
-
|
922
|
-
|
929
|
+
mane_transcripts |= {
|
930
|
+
current_mane_data["RefSeq_nuc"],
|
931
|
+
current_mane_data["Ensembl_nuc"],
|
932
|
+
}
|
923
933
|
mane: Optional[CdnaRepresentation] = await self._g_to_c(
|
924
934
|
g=g,
|
925
935
|
refseq_c_ac=current_mane_data["RefSeq_nuc"],
|
@@ -974,24 +984,22 @@ class ManeTranscript:
|
|
974
984
|
residue_mode=residue_mode,
|
975
985
|
mane_transcripts=mane_transcripts,
|
976
986
|
)
|
977
|
-
|
978
|
-
|
979
|
-
|
980
|
-
|
981
|
-
|
982
|
-
|
983
|
-
|
984
|
-
|
985
|
-
|
986
|
-
|
987
|
-
|
988
|
-
return None
|
989
|
-
elif start_annotation_layer == AnnotationLayer.GENOMIC:
|
987
|
+
return await self.get_longest_compatible_transcript(
|
988
|
+
c_pos[0],
|
989
|
+
c_pos[1],
|
990
|
+
AnnotationLayer.CDNA,
|
991
|
+
ref=ref,
|
992
|
+
gene=g["gene"],
|
993
|
+
residue_mode=residue_mode,
|
994
|
+
mane_transcripts=mane_transcripts,
|
995
|
+
)
|
996
|
+
return None
|
997
|
+
if start_annotation_layer == AnnotationLayer.GENOMIC:
|
990
998
|
return await self.g_to_mane_c(
|
991
999
|
ac, start_pos, end_pos, gene=gene, residue_mode=residue_mode
|
992
1000
|
)
|
993
|
-
|
994
|
-
|
1001
|
+
logger.warning("Annotation layer not supported: %s", start_annotation_layer)
|
1002
|
+
return None
|
995
1003
|
|
996
1004
|
async def g_to_grch38(
|
997
1005
|
self, ac: str, start_pos: int, end_pos: int
|
@@ -1011,9 +1019,8 @@ class ManeTranscript:
|
|
1011
1019
|
if not descr:
|
1012
1020
|
# Already GRCh38 assembly
|
1013
1021
|
if self._validate_index(ac, (start_pos, end_pos), 0):
|
1014
|
-
return
|
1015
|
-
|
1016
|
-
return None
|
1022
|
+
return {"ac": ac, "pos": (start_pos, end_pos)}
|
1023
|
+
return None
|
1017
1024
|
chromosome, assembly = descr
|
1018
1025
|
is_same_pos = start_pos == end_pos
|
1019
1026
|
|
@@ -1027,8 +1034,7 @@ class ManeTranscript:
|
|
1027
1034
|
)
|
1028
1035
|
if liftover_start_i is None:
|
1029
1036
|
return None
|
1030
|
-
|
1031
|
-
start_pos = liftover_start_i[1]
|
1037
|
+
start_pos = liftover_start_i[1]
|
1032
1038
|
|
1033
1039
|
if not is_same_pos:
|
1034
1040
|
liftover_end_i = self.uta_db.get_liftover(
|
@@ -1036,8 +1042,7 @@ class ManeTranscript:
|
|
1036
1042
|
)
|
1037
1043
|
if liftover_end_i is None:
|
1038
1044
|
return None
|
1039
|
-
|
1040
|
-
end_pos = liftover_end_i[1]
|
1045
|
+
end_pos = liftover_end_i[1]
|
1041
1046
|
else:
|
1042
1047
|
end_pos = start_pos
|
1043
1048
|
|
@@ -1045,8 +1050,7 @@ class ManeTranscript:
|
|
1045
1050
|
if newest_ac:
|
1046
1051
|
ac = newest_ac[0]
|
1047
1052
|
if self._validate_index(ac, (start_pos, end_pos), 0):
|
1048
|
-
return
|
1049
|
-
|
1053
|
+
return {"ac": ac, "pos": (start_pos, end_pos)}
|
1050
1054
|
return None
|
1051
1055
|
|
1052
1056
|
@staticmethod
|
@@ -1128,7 +1132,7 @@ class ManeTranscript:
|
|
1128
1132
|
)
|
1129
1133
|
|
1130
1134
|
if not await self.uta_db.validate_genomic_ac(ac):
|
1131
|
-
logger.warning(
|
1135
|
+
logger.warning("Genomic accession does not exist: %s", ac)
|
1132
1136
|
return None
|
1133
1137
|
|
1134
1138
|
mane_data = self.mane_transcript_mappings.get_gene_mane_data(gene)
|
@@ -1154,8 +1158,7 @@ class ManeTranscript:
|
|
1154
1158
|
)
|
1155
1159
|
if not mane_tx_genomic_data:
|
1156
1160
|
continue
|
1157
|
-
|
1158
|
-
logger.info("Not using most recent assembly")
|
1161
|
+
logger.info("Not using most recent assembly")
|
1159
1162
|
|
1160
1163
|
coding_start_site = mane_tx_genomic_data["coding_start_site"]
|
1161
1164
|
coding_end_site = mane_tx_genomic_data["coding_end_site"]
|
@@ -1167,9 +1170,10 @@ class ManeTranscript:
|
|
1167
1170
|
mane_c_ac, mane_c_pos_change, coding_start_site
|
1168
1171
|
):
|
1169
1172
|
logger.warning(
|
1170
|
-
|
1171
|
-
|
1172
|
-
|
1173
|
+
"%s are not valid positions on %s with coding start site %s",
|
1174
|
+
mane_c_pos_change,
|
1175
|
+
mane_c_ac,
|
1176
|
+
coding_start_site,
|
1173
1177
|
)
|
1174
1178
|
continue
|
1175
1179
|
|
@@ -1187,6 +1191,7 @@ class ManeTranscript:
|
|
1187
1191
|
ensembl_c_ac=current_mane_data["Ensembl_nuc"],
|
1188
1192
|
alt_ac=grch38["ac"] if grch38 else None,
|
1189
1193
|
)
|
1194
|
+
return None
|
1190
1195
|
|
1191
1196
|
async def grch38_to_mane_c_p(
|
1192
1197
|
self,
|
@@ -1234,7 +1239,7 @@ class ManeTranscript:
|
|
1234
1239
|
mane_transcripts = set() # Used if getting longest compatible remaining
|
1235
1240
|
for current_mane_data in mane_data:
|
1236
1241
|
mane_c_ac = current_mane_data["RefSeq_nuc"]
|
1237
|
-
mane_transcripts |=
|
1242
|
+
mane_transcripts |= {mane_c_ac, current_mane_data["Ensembl_nuc"]}
|
1238
1243
|
|
1239
1244
|
# GRCh38 -> MANE C
|
1240
1245
|
mane_tx_genomic_data = await self.uta_db.get_mane_c_genomic_data(
|
@@ -1255,8 +1260,10 @@ class ManeTranscript:
|
|
1255
1260
|
mane_c_ac, mane_c_pos_change, coding_start_site
|
1256
1261
|
):
|
1257
1262
|
logger.warning(
|
1258
|
-
|
1259
|
-
|
1263
|
+
"%s are not valid positions on %s with coding start site %s",
|
1264
|
+
mane_c_pos_change,
|
1265
|
+
mane_c_ac,
|
1266
|
+
coding_start_site,
|
1260
1267
|
)
|
1261
1268
|
continue
|
1262
1269
|
|
@@ -1286,5 +1293,4 @@ class ManeTranscript:
|
|
1286
1293
|
end_annotation_layer=EndAnnotationLayer.PROTEIN_AND_CDNA,
|
1287
1294
|
mane_transcripts=mane_transcripts,
|
1288
1295
|
)
|
1289
|
-
|
1290
|
-
return None
|
1296
|
+
return None
|