cool-seq-tool 0.5.1__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/PKG-INFO +1 -1
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool/mappers/mane_transcript.py +80 -53
- cool_seq_tool-0.6.0/src/cool_seq_tool/schemas.py +296 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool/sources/mane_transcript_mappings.py +35 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool.egg-info/PKG-INFO +1 -1
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/tests/conftest.py +13 -1
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/tests/mappers/test_mane_transcript.py +82 -64
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/tests/sources/test_mane_transcript_mappings.py +51 -0
- cool_seq_tool-0.5.1/src/cool_seq_tool/schemas.py +0 -569
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/.coveragerc +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/.github/ISSUE_TEMPLATE/bug-report.yaml +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/.github/ISSUE_TEMPLATE/feature-request.yaml +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/.github/workflows/checks.yaml +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/.github/workflows/close_issue.yml +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/.github/workflows/pr-priority-label.yaml +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/.github/workflows/release.yml +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/.github/workflows/stale.yaml +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/.gitignore +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/.pre-commit-config.yaml +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/.readthedocs.yaml +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/CITATION.cff +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/LICENSE +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/README.md +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/docs/Makefile +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/docs/make.bat +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/docs/source/_static/img/biomart.png +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/docs/source/_templates/module_summary.rst +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/docs/source/changelog.rst +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/docs/source/conf.py +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/docs/source/contributing.rst +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/docs/source/index.rst +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/docs/source/install.rst +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/docs/source/license.rst +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/docs/source/reference/index.rst +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/docs/source/transcript_selection.rst +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/docs/source/usage.rst +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/pyproject.toml +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/setup.cfg +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool/__init__.py +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool/app.py +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool/handlers/__init__.py +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool/handlers/seqrepo_access.py +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool/mappers/__init__.py +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool/mappers/alignment.py +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool/mappers/exon_genomic_coords.py +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool/mappers/liftover.py +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool/resources/__init__.py +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool/resources/data_files.py +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool/resources/status.py +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool/resources/transcript_mapping.tsv +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool/sources/__init__.py +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool/sources/transcript_mappings.py +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool/sources/uta_database.py +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool/utils.py +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool.egg-info/SOURCES.txt +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool.egg-info/dependency_links.txt +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool.egg-info/requires.txt +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool.egg-info/top_level.txt +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/tests/handlers/test_seqrepo_access.py +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/tests/mappers/test_alignment.py +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/tests/mappers/test_exon_genomic_coords.py +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/tests/mappers/test_liftover.py +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/tests/sources/test_uta_database.py +0 -0
- {cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/tests/test_utils.py +0 -0
@@ -25,6 +25,7 @@ from cool_seq_tool.mappers.liftover import LiftOver
|
|
25
25
|
from cool_seq_tool.schemas import (
|
26
26
|
AnnotationLayer,
|
27
27
|
Assembly,
|
28
|
+
ManeGeneData,
|
28
29
|
ResidueMode,
|
29
30
|
Strand,
|
30
31
|
TranscriptPriority,
|
@@ -71,10 +72,10 @@ class CdnaRepresentation(DataRepresentation):
|
|
71
72
|
class GenomicRepresentation(BaseModel):
|
72
73
|
"""Define object model for genomic representation"""
|
73
74
|
|
74
|
-
refseq: str
|
75
75
|
pos: tuple[int, int]
|
76
|
-
|
77
|
-
|
76
|
+
mane_genes: list[ManeGeneData] = []
|
77
|
+
status: Literal["grch38"] = TranscriptPriority.GRCH38.value
|
78
|
+
ac: str
|
78
79
|
|
79
80
|
|
80
81
|
class ProteinAndCdnaRepresentation(BaseModel):
|
@@ -108,7 +109,7 @@ class ManeTranscript:
|
|
108
109
|
|
109
110
|
>>> import asyncio
|
110
111
|
>>> result = asyncio.run(mane_mapper.g_to_grch38("NC_000001.11", 100, 200))
|
111
|
-
>>> result
|
112
|
+
>>> result.ac
|
112
113
|
'NC_000001.11'
|
113
114
|
|
114
115
|
See the :ref:`Usage section <async_note>` for more information.
|
@@ -128,7 +129,7 @@ class ManeTranscript:
|
|
128
129
|
self.liftover = liftover
|
129
130
|
|
130
131
|
@staticmethod
|
131
|
-
def
|
132
|
+
def get_reading_frame(pos: int) -> int:
|
132
133
|
"""Return reading frame number. Only used on c. coordinate.
|
133
134
|
|
134
135
|
:param pos: cDNA position
|
@@ -531,8 +532,8 @@ class ManeTranscript:
|
|
531
532
|
"""
|
532
533
|
for pos, pos_index in [(start_pos, 0), (end_pos, 1)]:
|
533
534
|
if pos is not None:
|
534
|
-
og_rf = self.
|
535
|
-
new_rf = self.
|
535
|
+
og_rf = self.get_reading_frame(pos)
|
536
|
+
new_rf = self.get_reading_frame(transcript_data.pos[pos_index])
|
536
537
|
|
537
538
|
if og_rf != new_rf:
|
538
539
|
_logger.warning(
|
@@ -618,7 +619,7 @@ class ManeTranscript:
|
|
618
619
|
|
619
620
|
return True
|
620
621
|
|
621
|
-
def
|
622
|
+
def validate_index(
|
622
623
|
self, ac: str, pos: tuple[int, int], coding_start_site: int
|
623
624
|
) -> bool:
|
624
625
|
"""Validate that positions actually exist on accession
|
@@ -910,7 +911,7 @@ class ManeTranscript:
|
|
910
911
|
ac = lcr_result.refseq or lcr_result.ensembl
|
911
912
|
pos = lcr_result.pos
|
912
913
|
|
913
|
-
if not self.
|
914
|
+
if not self.validate_index(ac, pos, coding_start_site):
|
914
915
|
_logger.warning(
|
915
916
|
"%s are not valid positions on %s with coding start site %s",
|
916
917
|
pos,
|
@@ -936,7 +937,7 @@ class ManeTranscript:
|
|
936
937
|
cds = lcr_result_dict[k].get("coding_start_site", 0)
|
937
938
|
ac = lcr_result_dict[k]["refseq"] or lcr_result_dict[k]["ensembl"]
|
938
939
|
pos = lcr_result_dict[k]["pos"]
|
939
|
-
if not self.
|
940
|
+
if not self.validate_index(ac, pos, cds):
|
940
941
|
valid = False
|
941
942
|
_logger.warning(
|
942
943
|
"%s are not valid positions on %s with coding start site %s",
|
@@ -962,7 +963,16 @@ class ManeTranscript:
|
|
962
963
|
residue_mode: Literal[ResidueMode.RESIDUE]
|
963
964
|
| Literal[ResidueMode.INTER_RESIDUE] = ResidueMode.RESIDUE,
|
964
965
|
) -> DataRepresentation | CdnaRepresentation | None:
|
965
|
-
"""Return MANE
|
966
|
+
"""Return MANE representation
|
967
|
+
|
968
|
+
If ``start_annotation_layer`` is ``AnnotationLayer.PROTEIN``, will return
|
969
|
+
``AnnotationLayer.PROTEIN`` representation.
|
970
|
+
If ``start_annotation_layer`` is ``AnnotationLayer.CDNA``, will return
|
971
|
+
``AnnotationLayer.CDNA`` representation.
|
972
|
+
If ``start_annotation_layer`` is ``AnnotationLayer.GENOMIC`` will return
|
973
|
+
``AnnotationLayer.CDNA`` representation if ``gene`` is provided and
|
974
|
+
``AnnotationLayer.GENOMIC`` GRCh38 representation if ``gene`` is NOT
|
975
|
+
provided.
|
966
976
|
|
967
977
|
>>> from cool_seq_tool.app import CoolSeqTool
|
968
978
|
>>> from cool_seq_tool.schemas import AnnotationLayer, ResidueMode
|
@@ -983,7 +993,11 @@ class ManeTranscript:
|
|
983
993
|
:param start_pos: Start position change
|
984
994
|
:param end_pos: End position change
|
985
995
|
:param start_annotation_layer: Starting annotation layer.
|
986
|
-
:param gene: HGNC gene symbol
|
996
|
+
:param gene: HGNC gene symbol.
|
997
|
+
If ``gene`` is not provided and ``start_annotation_layer`` is
|
998
|
+
``AnnotationLayer.GENOMIC``, will return GRCh38 representation.
|
999
|
+
If ``gene`` is provided and ``start_annotation_layer`` is
|
1000
|
+
``AnnotationLayer.GENOMIC``, will return cDNA representation.
|
987
1001
|
:param ref: Reference at position given during input
|
988
1002
|
:param try_longest_compatible: ``True`` if should try longest compatible remaining
|
989
1003
|
if mane transcript was not compatible. ``False`` otherwise.
|
@@ -1093,29 +1107,56 @@ class ManeTranscript:
|
|
1093
1107
|
)
|
1094
1108
|
return None
|
1095
1109
|
if start_annotation_layer == AnnotationLayer.GENOMIC:
|
1110
|
+
if not gene:
|
1111
|
+
return await self.g_to_grch38(
|
1112
|
+
ac,
|
1113
|
+
start_pos,
|
1114
|
+
end_pos,
|
1115
|
+
get_mane_genes=True,
|
1116
|
+
residue_mode=residue_mode,
|
1117
|
+
)
|
1118
|
+
|
1096
1119
|
return await self.g_to_mane_c(
|
1097
|
-
ac, start_pos, end_pos, gene
|
1120
|
+
ac, start_pos, end_pos, gene, residue_mode=residue_mode
|
1098
1121
|
)
|
1099
1122
|
_logger.warning("Annotation layer not supported: %s", start_annotation_layer)
|
1100
1123
|
return None
|
1101
1124
|
|
1102
|
-
async def g_to_grch38(
|
1125
|
+
async def g_to_grch38(
|
1126
|
+
self,
|
1127
|
+
ac: str,
|
1128
|
+
start_pos: int,
|
1129
|
+
end_pos: int,
|
1130
|
+
get_mane_genes: bool = False,
|
1131
|
+
residue_mode: ResidueMode = ResidueMode.RESIDUE,
|
1132
|
+
) -> GenomicRepresentation | None:
|
1103
1133
|
"""Return genomic coordinate on GRCh38 when not given gene context.
|
1104
1134
|
|
1105
1135
|
:param ac: Genomic accession
|
1106
1136
|
:param start_pos: Genomic start position
|
1107
1137
|
:param end_pos: Genomic end position
|
1108
|
-
:
|
1138
|
+
:param get_mane_genes: ``True`` if mane genes for genomic position should be
|
1139
|
+
included in response. ``False``, otherwise.
|
1140
|
+
:param residue_mode: Residue mode for ``start_pos`` and ``end_pos``
|
1141
|
+
:return: GRCh38 genomic representation (accession and start/end inter-residue
|
1142
|
+
position)
|
1109
1143
|
"""
|
1110
|
-
|
1111
|
-
end_pos = start_pos
|
1144
|
+
start_pos, end_pos = get_inter_residue_pos(start_pos, end_pos, residue_mode)
|
1112
1145
|
|
1113
1146
|
# Checking to see what chromosome and assembly we're on
|
1114
1147
|
descr = await self.uta_db.get_chr_assembly(ac)
|
1115
1148
|
if not descr:
|
1116
1149
|
# Already GRCh38 assembly
|
1117
|
-
if self.
|
1118
|
-
return
|
1150
|
+
if self.validate_index(ac, (start_pos, end_pos), 0):
|
1151
|
+
return GenomicRepresentation(
|
1152
|
+
ac=ac,
|
1153
|
+
pos=(start_pos, end_pos),
|
1154
|
+
mane_genes=self.mane_transcript_mappings.get_genomic_mane_genes(
|
1155
|
+
ac, start_pos + 1, end_pos
|
1156
|
+
)
|
1157
|
+
if get_mane_genes
|
1158
|
+
else [],
|
1159
|
+
)
|
1119
1160
|
return None
|
1120
1161
|
chromosome, assembly = descr
|
1121
1162
|
is_same_pos = start_pos == end_pos
|
@@ -1145,8 +1186,16 @@ class ManeTranscript:
|
|
1145
1186
|
newest_ac = await self.uta_db.get_newest_assembly_ac(ac)
|
1146
1187
|
if newest_ac:
|
1147
1188
|
ac = newest_ac[0]
|
1148
|
-
if self.
|
1149
|
-
return
|
1189
|
+
if self.validate_index(ac, (start_pos, end_pos), 0):
|
1190
|
+
return GenomicRepresentation(
|
1191
|
+
ac=ac,
|
1192
|
+
pos=(start_pos, end_pos),
|
1193
|
+
mane_genes=self.mane_transcript_mappings.get_genomic_mane_genes(
|
1194
|
+
ac, start_pos + 1, end_pos
|
1195
|
+
)
|
1196
|
+
if get_mane_genes
|
1197
|
+
else [],
|
1198
|
+
)
|
1150
1199
|
return None
|
1151
1200
|
|
1152
1201
|
@staticmethod
|
@@ -1176,14 +1225,11 @@ class ManeTranscript:
|
|
1176
1225
|
ac: str,
|
1177
1226
|
start_pos: int,
|
1178
1227
|
end_pos: int,
|
1179
|
-
gene: str
|
1228
|
+
gene: str,
|
1180
1229
|
residue_mode: ResidueMode = ResidueMode.RESIDUE,
|
1181
|
-
) ->
|
1230
|
+
) -> CdnaRepresentation | None:
|
1182
1231
|
"""Return MANE Transcript on the c. coordinate.
|
1183
1232
|
|
1184
|
-
If an arg for ``gene`` is provided, lifts to GRCh38, then gets MANE cDNA
|
1185
|
-
representation.
|
1186
|
-
|
1187
1233
|
>>> import asyncio
|
1188
1234
|
>>> from cool_seq_tool.app import CoolSeqTool
|
1189
1235
|
>>> cst = CoolSeqTool()
|
@@ -1198,34 +1244,17 @@ class ManeTranscript:
|
|
1198
1244
|
<TranscriptPriority.MANE_SELECT: 'mane_select'>
|
1199
1245
|
>>> del cst
|
1200
1246
|
|
1201
|
-
Locating a MANE transcript requires a ``gene`` symbol argument -- if none is
|
1202
|
-
given, this method will only lift over to genomic coordinates on GRCh38.
|
1203
|
-
|
1204
1247
|
:param ac: Transcript accession on g. coordinate
|
1205
1248
|
:param start_pos: genomic start position
|
1206
1249
|
:param end_pos: genomic end position
|
1207
1250
|
:param gene: HGNC gene symbol
|
1208
1251
|
:param residue_mode: Starting residue mode for ``start_pos`` and ``end_pos``.
|
1209
1252
|
Will always return coordinates in inter-residue.
|
1210
|
-
:return: MANE Transcripts with cDNA change on c. coordinate
|
1211
|
-
is provided. Else, GRCh38 data
|
1253
|
+
:return: MANE Transcripts with cDNA change on c. coordinate
|
1212
1254
|
"""
|
1213
1255
|
start_pos, end_pos = get_inter_residue_pos(start_pos, end_pos, residue_mode)
|
1214
1256
|
residue_mode = ResidueMode.INTER_RESIDUE
|
1215
1257
|
|
1216
|
-
# If gene not provided, return GRCh38
|
1217
|
-
if not gene:
|
1218
|
-
grch38 = await self.g_to_grch38(ac, start_pos, end_pos)
|
1219
|
-
if not grch38:
|
1220
|
-
return None
|
1221
|
-
|
1222
|
-
return GenomicRepresentation(
|
1223
|
-
refseq=grch38["ac"],
|
1224
|
-
pos=grch38["pos"],
|
1225
|
-
status=TranscriptPriority.GRCH38,
|
1226
|
-
alt_ac=grch38["ac"],
|
1227
|
-
)
|
1228
|
-
|
1229
1258
|
if not await self.uta_db.validate_genomic_ac(ac):
|
1230
1259
|
_logger.warning("Genomic accession does not exist: %s", ac)
|
1231
1260
|
return None
|
@@ -1238,12 +1267,14 @@ class ManeTranscript:
|
|
1238
1267
|
mane_c_ac = current_mane_data["RefSeq_nuc"]
|
1239
1268
|
|
1240
1269
|
# Liftover to GRCh38
|
1241
|
-
grch38 = await self.g_to_grch38(
|
1270
|
+
grch38 = await self.g_to_grch38(
|
1271
|
+
ac, start_pos, end_pos, get_mane_genes=False, residue_mode=residue_mode
|
1272
|
+
)
|
1242
1273
|
mane_tx_genomic_data = None
|
1243
1274
|
if grch38:
|
1244
1275
|
# GRCh38 -> MANE C
|
1245
1276
|
mane_tx_genomic_data = await self.uta_db.get_mane_c_genomic_data(
|
1246
|
-
mane_c_ac, grch38
|
1277
|
+
mane_c_ac, grch38.ac, grch38.pos[0], grch38.pos[1]
|
1247
1278
|
)
|
1248
1279
|
|
1249
1280
|
if not grch38 or not mane_tx_genomic_data:
|
@@ -1261,9 +1292,7 @@ class ManeTranscript:
|
|
1261
1292
|
mane_tx_genomic_data, coding_start_site
|
1262
1293
|
)
|
1263
1294
|
|
1264
|
-
if not self.
|
1265
|
-
mane_c_ac, mane_c_pos_change, coding_start_site
|
1266
|
-
):
|
1295
|
+
if not self.validate_index(mane_c_ac, mane_c_pos_change, coding_start_site):
|
1267
1296
|
_logger.warning(
|
1268
1297
|
"%s are not valid positions on %s with coding start site %s",
|
1269
1298
|
mane_c_pos_change,
|
@@ -1284,7 +1313,7 @@ class ManeTranscript:
|
|
1284
1313
|
),
|
1285
1314
|
refseq_c_ac=current_mane_data["RefSeq_nuc"],
|
1286
1315
|
ensembl_c_ac=current_mane_data["Ensembl_nuc"],
|
1287
|
-
alt_ac=grch38
|
1316
|
+
alt_ac=grch38.ac if grch38 else None,
|
1288
1317
|
)
|
1289
1318
|
return None
|
1290
1319
|
|
@@ -1351,9 +1380,7 @@ class ManeTranscript:
|
|
1351
1380
|
)
|
1352
1381
|
|
1353
1382
|
# Validate MANE C positions
|
1354
|
-
if not self.
|
1355
|
-
mane_c_ac, mane_c_pos_change, coding_start_site
|
1356
|
-
):
|
1383
|
+
if not self.validate_index(mane_c_ac, mane_c_pos_change, coding_start_site):
|
1357
1384
|
_logger.warning(
|
1358
1385
|
"%s are not valid positions on %s with coding start site %s",
|
1359
1386
|
mane_c_pos_change,
|
@@ -0,0 +1,296 @@
|
|
1
|
+
"""Defines attribute constants, useful object structures, and API response schemas."""
|
2
|
+
|
3
|
+
import datetime
|
4
|
+
from enum import Enum, IntEnum
|
5
|
+
from typing import Literal
|
6
|
+
|
7
|
+
from pydantic import (
|
8
|
+
BaseModel,
|
9
|
+
ConfigDict,
|
10
|
+
StrictInt,
|
11
|
+
StrictStr,
|
12
|
+
model_validator,
|
13
|
+
)
|
14
|
+
|
15
|
+
from cool_seq_tool import __version__
|
16
|
+
|
17
|
+
_now = str(datetime.datetime.now(tz=datetime.timezone.utc))
|
18
|
+
|
19
|
+
|
20
|
+
class AnnotationLayer(str, Enum):
|
21
|
+
"""Create enum for supported annotation layers"""
|
22
|
+
|
23
|
+
PROTEIN: Literal["p"] = "p"
|
24
|
+
CDNA: Literal["c"] = "c"
|
25
|
+
GENOMIC: Literal["g"] = "g"
|
26
|
+
|
27
|
+
|
28
|
+
class Strand(IntEnum):
|
29
|
+
"""Create enum for positive and negative strand"""
|
30
|
+
|
31
|
+
POSITIVE = 1
|
32
|
+
NEGATIVE = -1
|
33
|
+
|
34
|
+
|
35
|
+
class Assembly(str, Enum):
|
36
|
+
"""Define supported genomic assemblies. Must be defined in ascending order"""
|
37
|
+
|
38
|
+
GRCH37 = "GRCh37"
|
39
|
+
GRCH38 = "GRCh38"
|
40
|
+
|
41
|
+
@classmethod
|
42
|
+
def values(cls) -> list[str]:
|
43
|
+
"""Return list of values in enum (ascending assembly order)"""
|
44
|
+
return [item.value for item in cls]
|
45
|
+
|
46
|
+
|
47
|
+
class TranscriptPriority(str, Enum):
|
48
|
+
"""Create Enum for Transcript Priority labels"""
|
49
|
+
|
50
|
+
MANE_SELECT = "mane_select"
|
51
|
+
MANE_PLUS_CLINICAL = "mane_plus_clinical"
|
52
|
+
LONGEST_COMPATIBLE_REMAINING = "longest_compatible_remaining"
|
53
|
+
GRCH38 = "grch38"
|
54
|
+
|
55
|
+
|
56
|
+
class ResidueMode(str, Enum):
|
57
|
+
"""Create Enum for residue modes.
|
58
|
+
|
59
|
+
We typically prefer to operate in inter-residue coordinates, but users should be
|
60
|
+
careful to define the coordinate mode of their data when calling ``cool-seq-tool``
|
61
|
+
functions.
|
62
|
+
|
63
|
+
| | C | | T | | G | |
|
64
|
+
ZERO | | 0 | | 1 | | 2 | |
|
65
|
+
RESIDUE | | 1 | | 2 | | 3 | |
|
66
|
+
INTER_RESIDUE | 0 | | 1 | | 2 | | 3 |
|
67
|
+
|
68
|
+
.. tabularcolumns:: |L|C|C|C|C|C|C|C|
|
69
|
+
.. list-table::
|
70
|
+
:header-rows: 1
|
71
|
+
|
72
|
+
* -
|
73
|
+
-
|
74
|
+
- C
|
75
|
+
-
|
76
|
+
- T
|
77
|
+
-
|
78
|
+
- G
|
79
|
+
-
|
80
|
+
* - ``ZERO``
|
81
|
+
-
|
82
|
+
- 0
|
83
|
+
-
|
84
|
+
- 1
|
85
|
+
-
|
86
|
+
- 2
|
87
|
+
-
|
88
|
+
* - ``RESIDUE``
|
89
|
+
-
|
90
|
+
- 1
|
91
|
+
-
|
92
|
+
- 2
|
93
|
+
-
|
94
|
+
- 3
|
95
|
+
-
|
96
|
+
* - ``INTER_RESIDUE``
|
97
|
+
- 0
|
98
|
+
-
|
99
|
+
- 1
|
100
|
+
-
|
101
|
+
- 2
|
102
|
+
-
|
103
|
+
- 3
|
104
|
+
|
105
|
+
|
106
|
+
See "Conventions that promote reliable data sharing" and figure 3 within the
|
107
|
+
`Variation Representation Schema (VRS) paper <https://www.ncbi.nlm.nih.gov/pmc/articles/pmid/35311178/>`_ for further discussion.
|
108
|
+
"""
|
109
|
+
|
110
|
+
ZERO = "zero"
|
111
|
+
RESIDUE = "residue"
|
112
|
+
INTER_RESIDUE = "inter-residue"
|
113
|
+
|
114
|
+
|
115
|
+
class BaseModelForbidExtra(BaseModel, extra="forbid"):
|
116
|
+
"""Base Pydantic model class with extra values forbidden."""
|
117
|
+
|
118
|
+
|
119
|
+
class ManeGeneData(BaseModel, extra="forbid"):
|
120
|
+
"""Define minimal object model for representing a MANE gene"""
|
121
|
+
|
122
|
+
ncbi_gene_id: StrictInt
|
123
|
+
hgnc_id: StrictInt | None
|
124
|
+
symbol: StrictStr
|
125
|
+
|
126
|
+
|
127
|
+
class TranscriptExonData(BaseModelForbidExtra):
|
128
|
+
"""Model containing transcript exon data."""
|
129
|
+
|
130
|
+
transcript: StrictStr
|
131
|
+
pos: StrictInt
|
132
|
+
exon: StrictInt
|
133
|
+
exon_offset: StrictInt = 0
|
134
|
+
gene: StrictStr
|
135
|
+
chr: StrictStr
|
136
|
+
strand: Strand
|
137
|
+
|
138
|
+
model_config = ConfigDict(
|
139
|
+
json_schema_extra={
|
140
|
+
"example": {
|
141
|
+
"chr": "NC_000001.11",
|
142
|
+
"gene": "TPM3",
|
143
|
+
"pos": 154192135,
|
144
|
+
"exon": 1,
|
145
|
+
"exon_offset": 0,
|
146
|
+
"transcript": "NM_152263.3",
|
147
|
+
"strand": Strand.NEGATIVE,
|
148
|
+
}
|
149
|
+
}
|
150
|
+
)
|
151
|
+
|
152
|
+
|
153
|
+
class GenomicData(BaseModelForbidExtra):
|
154
|
+
"""Model containing genomic and transcript exon data."""
|
155
|
+
|
156
|
+
gene: StrictStr
|
157
|
+
chr: StrictStr
|
158
|
+
start: StrictInt | None = None # Genomic start position
|
159
|
+
end: StrictInt | None = None # Genomic end position
|
160
|
+
exon_start: StrictInt | None = None
|
161
|
+
exon_start_offset: StrictInt | None = 0
|
162
|
+
exon_end: StrictInt | None = None
|
163
|
+
exon_end_offset: StrictInt | None = 0
|
164
|
+
transcript: StrictStr
|
165
|
+
strand: Strand
|
166
|
+
|
167
|
+
@model_validator(mode="after")
|
168
|
+
def check_start_end(cls, values):
|
169
|
+
"""Check that at least one of {``start``, ``end``} is set.
|
170
|
+
Check that at least one of {``exon_start``, ``exon_end``} is set.
|
171
|
+
If not set, set corresponding offset to ``None``
|
172
|
+
"""
|
173
|
+
start = values.start
|
174
|
+
end = values.end
|
175
|
+
if not start and not end:
|
176
|
+
msg = "Missing values for `start` or `end`"
|
177
|
+
raise ValueError(msg)
|
178
|
+
|
179
|
+
if start:
|
180
|
+
if not values.exon_start:
|
181
|
+
msg = "Missing value `exon_start`"
|
182
|
+
raise ValueError(msg)
|
183
|
+
else:
|
184
|
+
values.exon_start_offset = None
|
185
|
+
|
186
|
+
if end:
|
187
|
+
if not values.exon_end:
|
188
|
+
msg = "Missing value `exon_end`"
|
189
|
+
raise ValueError(msg)
|
190
|
+
else:
|
191
|
+
values.exon_end_offset = None
|
192
|
+
return values
|
193
|
+
|
194
|
+
model_config = ConfigDict(
|
195
|
+
json_schema_extra={
|
196
|
+
"example": {
|
197
|
+
"gene": "TPM3",
|
198
|
+
"chr": "NC_000001.11",
|
199
|
+
"start": 154192135,
|
200
|
+
"end": None,
|
201
|
+
"exon_start": 1,
|
202
|
+
"exon_end": None,
|
203
|
+
"exon_start_offset": 0,
|
204
|
+
"exon_end_offset": None,
|
205
|
+
"transcript": "NM_152263.3",
|
206
|
+
"strand": Strand.NEGATIVE,
|
207
|
+
}
|
208
|
+
}
|
209
|
+
)
|
210
|
+
|
211
|
+
|
212
|
+
class ServiceMeta(BaseModelForbidExtra):
|
213
|
+
"""Metadata for cool_seq_tool service"""
|
214
|
+
|
215
|
+
name: Literal["cool_seq_tool"] = "cool_seq_tool"
|
216
|
+
version: StrictStr
|
217
|
+
response_datetime: datetime.datetime
|
218
|
+
url: Literal["https://github.com/GenomicMedLab/cool-seq-tool"] = (
|
219
|
+
"https://github.com/GenomicMedLab/cool-seq-tool"
|
220
|
+
)
|
221
|
+
|
222
|
+
model_config = ConfigDict(
|
223
|
+
json_schema_extra={
|
224
|
+
"example": {
|
225
|
+
"name": "cool_seq_tool",
|
226
|
+
"version": __version__,
|
227
|
+
"response_datetime": _now,
|
228
|
+
"url": "https://github.com/GenomicMedLab/cool-seq-tool",
|
229
|
+
}
|
230
|
+
}
|
231
|
+
)
|
232
|
+
|
233
|
+
|
234
|
+
class TranscriptExonDataResponse(BaseModelForbidExtra):
|
235
|
+
"""Response model for Transcript Exon Data"""
|
236
|
+
|
237
|
+
transcript_exon_data: TranscriptExonData | None = None
|
238
|
+
warnings: list[StrictStr] = []
|
239
|
+
service_meta: ServiceMeta
|
240
|
+
|
241
|
+
model_config = ConfigDict(
|
242
|
+
json_schema_extra={
|
243
|
+
"example": {
|
244
|
+
"transcript_exon_data": {
|
245
|
+
"chr": "NC_000001.11",
|
246
|
+
"gene": "TPM3",
|
247
|
+
"pos": 154192135,
|
248
|
+
"exon": 1,
|
249
|
+
"exon_offset": 0,
|
250
|
+
"transcript": "NM_152263.3",
|
251
|
+
"strand": Strand.NEGATIVE,
|
252
|
+
},
|
253
|
+
"warnings": [],
|
254
|
+
"service_meta": {
|
255
|
+
"name": "cool_seq_tool",
|
256
|
+
"version": __version__,
|
257
|
+
"response_datetime": _now,
|
258
|
+
"url": "https://github.com/GenomicMedLab/cool-seq-tool",
|
259
|
+
},
|
260
|
+
}
|
261
|
+
}
|
262
|
+
)
|
263
|
+
|
264
|
+
|
265
|
+
class GenomicDataResponse(BaseModelForbidExtra):
|
266
|
+
"""Response model for Genomic Data"""
|
267
|
+
|
268
|
+
genomic_data: GenomicData | None = None
|
269
|
+
warnings: list[StrictStr] = []
|
270
|
+
service_meta: ServiceMeta
|
271
|
+
|
272
|
+
model_config = ConfigDict(
|
273
|
+
json_schema_extra={
|
274
|
+
"example": {
|
275
|
+
"genomic_data": {
|
276
|
+
"gene": "TPM3",
|
277
|
+
"chr": "NC_000001.11",
|
278
|
+
"start": 154192135,
|
279
|
+
"end": None,
|
280
|
+
"exon_start": 1,
|
281
|
+
"exon_end": None,
|
282
|
+
"exon_start_offset": 0,
|
283
|
+
"exon_end_offset": None,
|
284
|
+
"transcript": "NM_152263.3",
|
285
|
+
"strand": Strand.NEGATIVE,
|
286
|
+
},
|
287
|
+
"warnings": [],
|
288
|
+
"service_meta": {
|
289
|
+
"name": "cool_seq_tool",
|
290
|
+
"version": __version__,
|
291
|
+
"response_datetime": _now,
|
292
|
+
"url": "https://github.com/GenomicMedLab/cool-seq-tool",
|
293
|
+
},
|
294
|
+
}
|
295
|
+
}
|
296
|
+
)
|
{cool_seq_tool-0.5.1 → cool_seq_tool-0.6.0}/src/cool_seq_tool/sources/mane_transcript_mappings.py
RENAMED
@@ -8,6 +8,7 @@ from pathlib import Path
|
|
8
8
|
import polars as pl
|
9
9
|
|
10
10
|
from cool_seq_tool.resources.data_files import DataFile, get_data_file
|
11
|
+
from cool_seq_tool.schemas import ManeGeneData
|
11
12
|
|
12
13
|
_logger = logging.getLogger(__name__)
|
13
14
|
|
@@ -103,3 +104,37 @@ class ManeTranscriptMappings:
|
|
103
104
|
|
104
105
|
mane_rows = mane_rows.sort(by="MANE_status", descending=True)
|
105
106
|
return mane_rows.to_dicts()
|
107
|
+
|
108
|
+
def get_genomic_mane_genes(
|
109
|
+
self, ac: str, start: int, end: int
|
110
|
+
) -> list[ManeGeneData]:
|
111
|
+
"""Get MANE gene(s) for genomic location
|
112
|
+
|
113
|
+
:param ac: RefSeq genomic accession
|
114
|
+
:param start: Genomic start position. Assumes residue coordinates.
|
115
|
+
:param end: Genomic end position. Assumes residue coordinates.
|
116
|
+
:return: Unique MANE gene(s) found for a genomic location
|
117
|
+
"""
|
118
|
+
mane_rows = self.df.filter(
|
119
|
+
(start >= pl.col("chr_start"))
|
120
|
+
& (end <= pl.col("chr_end"))
|
121
|
+
& (pl.col("GRCh38_chr") == ac)
|
122
|
+
).unique(subset=["#NCBI_GeneID"])
|
123
|
+
|
124
|
+
if len(mane_rows) == 0:
|
125
|
+
return []
|
126
|
+
|
127
|
+
mane_rows = mane_rows.with_columns(
|
128
|
+
pl.col("#NCBI_GeneID")
|
129
|
+
.str.split_exact(":", 1)
|
130
|
+
.struct.field("field_1")
|
131
|
+
.cast(pl.Int32)
|
132
|
+
.alias("ncbi_gene_id"),
|
133
|
+
pl.col("HGNC_ID")
|
134
|
+
.str.split_exact(":", 1)
|
135
|
+
.struct.field("field_1")
|
136
|
+
.cast(pl.Int32)
|
137
|
+
.alias("hgnc_id"),
|
138
|
+
)
|
139
|
+
mane_rows = mane_rows.select(["ncbi_gene_id", "hgnc_id", "symbol"])
|
140
|
+
return [ManeGeneData(**mane_gene) for mane_gene in mane_rows.to_dicts()]
|
@@ -5,7 +5,7 @@ import asyncio
|
|
5
5
|
import pytest
|
6
6
|
|
7
7
|
from cool_seq_tool.app import CoolSeqTool
|
8
|
-
from cool_seq_tool.schemas import Strand
|
8
|
+
from cool_seq_tool.schemas import ManeGeneData, Strand
|
9
9
|
|
10
10
|
|
11
11
|
@pytest.fixture(scope="session")
|
@@ -121,3 +121,15 @@ def genomic_tx_data():
|
|
121
121
|
"tx_ac": "NM_004333.4",
|
122
122
|
"alt_ac": "NC_000007.13",
|
123
123
|
}
|
124
|
+
|
125
|
+
|
126
|
+
@pytest.fixture(scope="session")
|
127
|
+
def egfr_mane_gene():
|
128
|
+
"""Create test fixture for EGFR MANE gene"""
|
129
|
+
return ManeGeneData(ncbi_gene_id=1956, hgnc_id=3236, symbol="EGFR")
|
130
|
+
|
131
|
+
|
132
|
+
@pytest.fixture(scope="session")
|
133
|
+
def braf_mane_gene():
|
134
|
+
"""Create test fixture for BRAF MANE gene"""
|
135
|
+
return ManeGeneData(ncbi_gene_id=673, hgnc_id=1097, symbol="BRAF")
|