cool-seq-tool 0.5.1__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cool_seq_tool/__init__.py +6 -0
- cool_seq_tool/app.py +1 -2
- cool_seq_tool/handlers/seqrepo_access.py +5 -5
- cool_seq_tool/mappers/alignment.py +16 -16
- cool_seq_tool/mappers/exon_genomic_coords.py +845 -628
- cool_seq_tool/mappers/mane_transcript.py +184 -152
- cool_seq_tool/schemas.py +30 -438
- cool_seq_tool/sources/mane_transcript_mappings.py +35 -0
- cool_seq_tool/sources/uta_database.py +149 -229
- cool_seq_tool/utils.py +9 -9
- {cool_seq_tool-0.5.1.dist-info → cool_seq_tool-0.7.0.dist-info}/METADATA +8 -8
- cool_seq_tool-0.7.0.dist-info/RECORD +24 -0
- {cool_seq_tool-0.5.1.dist-info → cool_seq_tool-0.7.0.dist-info}/WHEEL +1 -1
- cool_seq_tool-0.5.1.dist-info/RECORD +0 -24
- {cool_seq_tool-0.5.1.dist-info → cool_seq_tool-0.7.0.dist-info}/LICENSE +0 -0
- {cool_seq_tool-0.5.1.dist-info → cool_seq_tool-0.7.0.dist-info}/top_level.txt +0 -0
cool_seq_tool/schemas.py
CHANGED
@@ -9,7 +9,6 @@ from pydantic import (
|
|
9
9
|
ConfigDict,
|
10
10
|
StrictInt,
|
11
11
|
StrictStr,
|
12
|
-
model_validator,
|
13
12
|
)
|
14
13
|
|
15
14
|
from cool_seq_tool import __version__
|
@@ -20,9 +19,9 @@ _now = str(datetime.datetime.now(tz=datetime.timezone.utc))
|
|
20
19
|
class AnnotationLayer(str, Enum):
|
21
20
|
"""Create enum for supported annotation layers"""
|
22
21
|
|
23
|
-
PROTEIN
|
24
|
-
CDNA
|
25
|
-
GENOMIC
|
22
|
+
PROTEIN = "p"
|
23
|
+
CDNA = "c"
|
24
|
+
GENOMIC = "g"
|
26
25
|
|
27
26
|
|
28
27
|
class Strand(IntEnum):
|
@@ -53,15 +52,17 @@ class TranscriptPriority(str, Enum):
|
|
53
52
|
GRCH38 = "grch38"
|
54
53
|
|
55
54
|
|
56
|
-
class
|
57
|
-
"""Create Enum for
|
55
|
+
class CoordinateType(str, Enum):
|
56
|
+
"""Create Enum for coordinate types.
|
58
57
|
|
59
|
-
|
58
|
+
It is preferred to operate in inter-residue coordinates, but users should be
|
60
59
|
careful to define the coordinate mode of their data when calling ``cool-seq-tool``
|
61
60
|
functions.
|
62
61
|
|
62
|
+
``RESIDUE`` means 1-indexed, residue coordinates and ``INTER_RESIDUE`` means
|
63
|
+
0-indexed, inter-residue coordinates.
|
64
|
+
|
63
65
|
| | C | | T | | G | |
|
64
|
-
ZERO | | 0 | | 1 | | 2 | |
|
65
66
|
RESIDUE | | 1 | | 2 | | 3 | |
|
66
67
|
INTER_RESIDUE | 0 | | 1 | | 2 | | 3 |
|
67
68
|
|
@@ -77,14 +78,6 @@ class ResidueMode(str, Enum):
|
|
77
78
|
-
|
78
79
|
- G
|
79
80
|
-
|
80
|
-
* - ``ZERO``
|
81
|
-
-
|
82
|
-
- 0
|
83
|
-
-
|
84
|
-
- 1
|
85
|
-
-
|
86
|
-
- 2
|
87
|
-
-
|
88
81
|
* - ``RESIDUE``
|
89
82
|
-
|
90
83
|
- 1
|
@@ -107,7 +100,6 @@ class ResidueMode(str, Enum):
|
|
107
100
|
`Variation Representation Schema (VRS) paper <https://www.ncbi.nlm.nih.gov/pmc/articles/pmid/35311178/>`_ for further discussion.
|
108
101
|
"""
|
109
102
|
|
110
|
-
ZERO = "zero"
|
111
103
|
RESIDUE = "residue"
|
112
104
|
INTER_RESIDUE = "inter-residue"
|
113
105
|
|
@@ -116,157 +108,35 @@ class BaseModelForbidExtra(BaseModel, extra="forbid"):
|
|
116
108
|
"""Base Pydantic model class with extra values forbidden."""
|
117
109
|
|
118
110
|
|
119
|
-
class
|
120
|
-
"""
|
121
|
-
|
122
|
-
chromosome: StrictStr | StrictInt
|
123
|
-
start: StrictInt | None = None
|
124
|
-
end: StrictInt | None = None
|
125
|
-
strand: Strand | None = None
|
126
|
-
transcript: StrictStr | None = None
|
127
|
-
gene: StrictStr | None = None
|
128
|
-
residue_mode: ResidueMode = ResidueMode.RESIDUE
|
129
|
-
|
130
|
-
@model_validator(mode="after")
|
131
|
-
def check_start_and_end(cls, values):
|
132
|
-
"""Check that at least one of {``start``, ``end``} is set"""
|
133
|
-
start, end = values.start, values.end
|
134
|
-
if not start or end:
|
135
|
-
msg = "Must provide either `start` or `end`"
|
136
|
-
raise ValueError(msg)
|
137
|
-
return values
|
138
|
-
|
139
|
-
model_config = ConfigDict(
|
140
|
-
json_schema_extra={
|
141
|
-
"example": {
|
142
|
-
"chromosome": "NC_000001.11",
|
143
|
-
"start": 154192135,
|
144
|
-
"end": None,
|
145
|
-
"strand": Strand.NEGATIVE,
|
146
|
-
"transcript": "NM_152263.3",
|
147
|
-
"gene": "TPM3",
|
148
|
-
"residue_mode": "residue",
|
149
|
-
}
|
150
|
-
}
|
151
|
-
)
|
152
|
-
|
153
|
-
|
154
|
-
class TranscriptRequestBody(BaseModelForbidExtra):
|
155
|
-
"""Define constraints for transcript exon to genomic coordinates request body"""
|
111
|
+
class GenomicTxData(BaseModelForbidExtra):
|
112
|
+
"""Represent aligned genomic/transcript exon data"""
|
156
113
|
|
157
|
-
|
158
|
-
gene: StrictStr | None = None
|
159
|
-
exon_start: StrictInt | None = None
|
160
|
-
exon_start_offset: StrictInt | None = 0
|
161
|
-
exon_end: StrictInt | None = None
|
162
|
-
exon_end_offset: StrictInt | None = 0
|
163
|
-
|
164
|
-
@model_validator(mode="after")
|
165
|
-
def check_exon_start_and_exon_end(cls, values):
|
166
|
-
"""Check that at least one of {``exon_start``, ``exon_end``} is set"""
|
167
|
-
exon_start, exon_end = values.exon_start, values.exon_end
|
168
|
-
if not exon_start or exon_end:
|
169
|
-
msg = "Must provide either `exon_start` or `exon_end`"
|
170
|
-
raise ValueError(msg)
|
171
|
-
return values
|
172
|
-
|
173
|
-
model_config = ConfigDict(
|
174
|
-
json_schema_extra={
|
175
|
-
"example": {
|
176
|
-
"gene": "TPM3",
|
177
|
-
"transcript": "NM_152263.3",
|
178
|
-
"exon_start": 1,
|
179
|
-
"exon_start_offset": 1,
|
180
|
-
"exon_end": None,
|
181
|
-
"exon_end_offset": None,
|
182
|
-
}
|
183
|
-
}
|
184
|
-
)
|
185
|
-
|
186
|
-
|
187
|
-
class TranscriptExonData(BaseModelForbidExtra):
|
188
|
-
"""Model containing transcript exon data."""
|
189
|
-
|
190
|
-
transcript: StrictStr
|
191
|
-
pos: StrictInt
|
192
|
-
exon: StrictInt
|
193
|
-
exon_offset: StrictInt = 0
|
194
|
-
gene: StrictStr
|
195
|
-
chr: StrictStr
|
114
|
+
gene: str
|
196
115
|
strand: Strand
|
116
|
+
tx_pos_range: tuple[int, int]
|
117
|
+
alt_pos_range: tuple[int, int]
|
118
|
+
alt_aln_method: str
|
119
|
+
tx_exon_id: int
|
120
|
+
alt_exon_id: int
|
197
121
|
|
198
|
-
model_config = ConfigDict(
|
199
|
-
json_schema_extra={
|
200
|
-
"example": {
|
201
|
-
"chr": "NC_000001.11",
|
202
|
-
"gene": "TPM3",
|
203
|
-
"pos": 154192135,
|
204
|
-
"exon": 1,
|
205
|
-
"exon_offset": 0,
|
206
|
-
"transcript": "NM_152263.3",
|
207
|
-
"strand": Strand.NEGATIVE,
|
208
|
-
}
|
209
|
-
}
|
210
|
-
)
|
211
122
|
|
123
|
+
class GenomicTxMetadata(GenomicTxData):
|
124
|
+
"""Store relevant metadata for genomic and transcript accessions"""
|
212
125
|
|
213
|
-
|
214
|
-
|
126
|
+
tx_ac: str
|
127
|
+
alt_ac: str
|
128
|
+
coding_start_site: int = 0
|
129
|
+
coding_end_site: int = 0
|
130
|
+
alt_pos_change_range: tuple[int, int]
|
131
|
+
pos_change: tuple[int, int] | None
|
215
132
|
|
216
|
-
gene: StrictStr
|
217
|
-
chr: StrictStr
|
218
|
-
start: StrictInt | None = None # Genomic start position
|
219
|
-
end: StrictInt | None = None # Genomic end position
|
220
|
-
exon_start: StrictInt | None = None
|
221
|
-
exon_start_offset: StrictInt | None = 0
|
222
|
-
exon_end: StrictInt | None = None
|
223
|
-
exon_end_offset: StrictInt | None = 0
|
224
|
-
transcript: StrictStr
|
225
|
-
strand: Strand
|
226
133
|
|
227
|
-
|
228
|
-
|
229
|
-
"""Check that at least one of {``start``, ``end``} is set.
|
230
|
-
Check that at least one of {``exon_start``, ``exon_end``} is set.
|
231
|
-
If not set, set corresponding offset to ``None``
|
232
|
-
"""
|
233
|
-
start = values.start
|
234
|
-
end = values.end
|
235
|
-
if not start and not end:
|
236
|
-
msg = "Missing values for `start` or `end`"
|
237
|
-
raise ValueError(msg)
|
238
|
-
|
239
|
-
if start:
|
240
|
-
if not values.exon_start:
|
241
|
-
msg = "Missing value `exon_start`"
|
242
|
-
raise ValueError(msg)
|
243
|
-
else:
|
244
|
-
values.exon_start_offset = None
|
245
|
-
|
246
|
-
if end:
|
247
|
-
if not values.exon_end:
|
248
|
-
msg = "Missing value `exon_end`"
|
249
|
-
raise ValueError(msg)
|
250
|
-
else:
|
251
|
-
values.exon_end_offset = None
|
252
|
-
return values
|
134
|
+
class ManeGeneData(BaseModel, extra="forbid"):
|
135
|
+
"""Define minimal object model for representing a MANE gene"""
|
253
136
|
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
"gene": "TPM3",
|
258
|
-
"chr": "NC_000001.11",
|
259
|
-
"start": 154192135,
|
260
|
-
"end": None,
|
261
|
-
"exon_start": 1,
|
262
|
-
"exon_end": None,
|
263
|
-
"exon_start_offset": 0,
|
264
|
-
"exon_end_offset": None,
|
265
|
-
"transcript": "NM_152263.3",
|
266
|
-
"strand": Strand.NEGATIVE,
|
267
|
-
}
|
268
|
-
}
|
269
|
-
)
|
137
|
+
ncbi_gene_id: StrictInt
|
138
|
+
hgnc_id: StrictInt | None
|
139
|
+
symbol: StrictStr
|
270
140
|
|
271
141
|
|
272
142
|
class ServiceMeta(BaseModelForbidExtra):
|
@@ -289,281 +159,3 @@ class ServiceMeta(BaseModelForbidExtra):
|
|
289
159
|
}
|
290
160
|
}
|
291
161
|
)
|
292
|
-
|
293
|
-
|
294
|
-
class TranscriptExonDataResponse(BaseModelForbidExtra):
|
295
|
-
"""Response model for Transcript Exon Data"""
|
296
|
-
|
297
|
-
transcript_exon_data: TranscriptExonData | None = None
|
298
|
-
warnings: list[StrictStr] = []
|
299
|
-
service_meta: ServiceMeta
|
300
|
-
|
301
|
-
model_config = ConfigDict(
|
302
|
-
json_schema_extra={
|
303
|
-
"example": {
|
304
|
-
"transcript_exon_data": {
|
305
|
-
"chr": "NC_000001.11",
|
306
|
-
"gene": "TPM3",
|
307
|
-
"pos": 154192135,
|
308
|
-
"exon": 1,
|
309
|
-
"exon_offset": 0,
|
310
|
-
"transcript": "NM_152263.3",
|
311
|
-
"strand": Strand.NEGATIVE,
|
312
|
-
},
|
313
|
-
"warnings": [],
|
314
|
-
"service_meta": {
|
315
|
-
"name": "cool_seq_tool",
|
316
|
-
"version": __version__,
|
317
|
-
"response_datetime": _now,
|
318
|
-
"url": "https://github.com/GenomicMedLab/cool-seq-tool",
|
319
|
-
},
|
320
|
-
}
|
321
|
-
}
|
322
|
-
)
|
323
|
-
|
324
|
-
|
325
|
-
class GenomicDataResponse(BaseModelForbidExtra):
|
326
|
-
"""Response model for Genomic Data"""
|
327
|
-
|
328
|
-
genomic_data: GenomicData | None = None
|
329
|
-
warnings: list[StrictStr] = []
|
330
|
-
service_meta: ServiceMeta
|
331
|
-
|
332
|
-
model_config = ConfigDict(
|
333
|
-
json_schema_extra={
|
334
|
-
"example": {
|
335
|
-
"genomic_data": {
|
336
|
-
"gene": "TPM3",
|
337
|
-
"chr": "NC_000001.11",
|
338
|
-
"start": 154192135,
|
339
|
-
"end": None,
|
340
|
-
"exon_start": 1,
|
341
|
-
"exon_end": None,
|
342
|
-
"exon_start_offset": 0,
|
343
|
-
"exon_end_offset": None,
|
344
|
-
"transcript": "NM_152263.3",
|
345
|
-
"strand": Strand.NEGATIVE,
|
346
|
-
},
|
347
|
-
"warnings": [],
|
348
|
-
"service_meta": {
|
349
|
-
"name": "cool_seq_tool",
|
350
|
-
"version": __version__,
|
351
|
-
"response_datetime": _now,
|
352
|
-
"url": "https://github.com/GenomicMedLab/cool-seq-tool",
|
353
|
-
},
|
354
|
-
}
|
355
|
-
}
|
356
|
-
)
|
357
|
-
|
358
|
-
|
359
|
-
class MappedManeData(BaseModel):
|
360
|
-
"""Define mapped mane data fields"""
|
361
|
-
|
362
|
-
gene: StrictStr
|
363
|
-
refseq: StrictStr
|
364
|
-
ensembl: StrictStr | None = None
|
365
|
-
strand: Strand
|
366
|
-
status: TranscriptPriority
|
367
|
-
alt_ac: StrictStr
|
368
|
-
assembly: Assembly
|
369
|
-
|
370
|
-
model_config = ConfigDict(
|
371
|
-
json_schema_extra={
|
372
|
-
"example": {
|
373
|
-
"gene": "BRAF",
|
374
|
-
"refseq": "NM_001374258.1",
|
375
|
-
"ensembl": "ENST00000644969.2",
|
376
|
-
"strand": Strand.NEGATIVE,
|
377
|
-
"status": TranscriptPriority.MANE_PLUS_CLINICAL,
|
378
|
-
"alt_ac": "NC_000007.13",
|
379
|
-
"assembly": Assembly.GRCH37,
|
380
|
-
}
|
381
|
-
}
|
382
|
-
)
|
383
|
-
|
384
|
-
|
385
|
-
class MappedManeDataService(BaseModelForbidExtra):
|
386
|
-
"""Service model response for mapped mane data"""
|
387
|
-
|
388
|
-
mapped_mane_data: MappedManeData | None = None
|
389
|
-
warnings: list[StrictStr] = []
|
390
|
-
service_meta: ServiceMeta
|
391
|
-
|
392
|
-
model_config = ConfigDict(
|
393
|
-
json_schema_extra={
|
394
|
-
"example": {
|
395
|
-
"mapped_mane_data": {
|
396
|
-
"gene": "BRAF",
|
397
|
-
"refseq": "NM_001374258.1",
|
398
|
-
"ensembl": "ENST00000644969.2",
|
399
|
-
"strand": Strand.NEGATIVE,
|
400
|
-
"status": TranscriptPriority.MANE_PLUS_CLINICAL,
|
401
|
-
"alt_ac": "NC_000007.13",
|
402
|
-
"assembly": Assembly.GRCH37,
|
403
|
-
},
|
404
|
-
"warnings": [],
|
405
|
-
"service_meta": {
|
406
|
-
"name": "cool_seq_tool",
|
407
|
-
"version": __version__,
|
408
|
-
"response_datetime": _now,
|
409
|
-
"url": "https://github.com/GenomicMedLab/cool-seq-tool",
|
410
|
-
},
|
411
|
-
}
|
412
|
-
}
|
413
|
-
)
|
414
|
-
|
415
|
-
|
416
|
-
class ManeData(BaseModel):
|
417
|
-
"""Define mane data fields"""
|
418
|
-
|
419
|
-
gene: StrictStr | None = None
|
420
|
-
refseq: StrictStr | None = None
|
421
|
-
ensembl: StrictStr | None = None
|
422
|
-
pos: tuple[int, int]
|
423
|
-
strand: Strand
|
424
|
-
status: TranscriptPriority
|
425
|
-
|
426
|
-
model_config = ConfigDict(
|
427
|
-
json_schema_extra={
|
428
|
-
"example": {
|
429
|
-
"gene": "BRAF",
|
430
|
-
"refseq": "NP_004324.2",
|
431
|
-
"ensembl": "ENSP00000493543.1",
|
432
|
-
"pos": (598, 598),
|
433
|
-
"strand": Strand.NEGATIVE,
|
434
|
-
"status": TranscriptPriority.MANE_SELECT,
|
435
|
-
}
|
436
|
-
}
|
437
|
-
)
|
438
|
-
|
439
|
-
|
440
|
-
class ManeDataService(BaseModelForbidExtra):
|
441
|
-
"""Service model response for getting mane data"""
|
442
|
-
|
443
|
-
mane_data: ManeData | None = None
|
444
|
-
warnings: list[StrictStr] = []
|
445
|
-
service_meta: ServiceMeta
|
446
|
-
|
447
|
-
model_config = ConfigDict(
|
448
|
-
json_schema_extra={
|
449
|
-
"example": {
|
450
|
-
"mane_data": {
|
451
|
-
"gene": "BRAF",
|
452
|
-
"refseq": "NP_004324.2",
|
453
|
-
"ensembl": "ENSP00000493543.1",
|
454
|
-
"pos": (598, 598),
|
455
|
-
"strand": Strand.NEGATIVE,
|
456
|
-
"status": TranscriptPriority.MANE_SELECT,
|
457
|
-
},
|
458
|
-
"warnings": [],
|
459
|
-
"service_meta": {
|
460
|
-
"name": "cool_seq_tool",
|
461
|
-
"version": __version__,
|
462
|
-
"response_datetime": _now,
|
463
|
-
"url": "https://github.com/GenomicMedLab/cool-seq-tool",
|
464
|
-
},
|
465
|
-
}
|
466
|
-
}
|
467
|
-
)
|
468
|
-
|
469
|
-
|
470
|
-
# ALIGNMENT MAPPER SERVICE SCHEMAS
|
471
|
-
|
472
|
-
|
473
|
-
class CdnaRepresentation(BaseModelForbidExtra):
|
474
|
-
"""Model response for cDNA representation"""
|
475
|
-
|
476
|
-
c_ac: StrictStr
|
477
|
-
c_start_pos: StrictInt
|
478
|
-
c_end_pos: StrictInt
|
479
|
-
cds_start: StrictInt
|
480
|
-
residue_mode: Literal[ResidueMode.INTER_RESIDUE] = ResidueMode.INTER_RESIDUE.value
|
481
|
-
|
482
|
-
model_config = ConfigDict(
|
483
|
-
json_schema_extra={
|
484
|
-
"example": {
|
485
|
-
"c_ac": "NM_004333.6",
|
486
|
-
"c_start_pos": 1797,
|
487
|
-
"c_end_pos": 1800,
|
488
|
-
"cds_start": 226,
|
489
|
-
"residue_mode": ResidueMode.INTER_RESIDUE,
|
490
|
-
}
|
491
|
-
}
|
492
|
-
)
|
493
|
-
|
494
|
-
|
495
|
-
class ToCdnaService(BaseModelForbidExtra):
|
496
|
-
"""Service model response for protein -> cDNA"""
|
497
|
-
|
498
|
-
c_data: CdnaRepresentation | None = None
|
499
|
-
warnings: list[StrictStr] = []
|
500
|
-
service_meta: ServiceMeta
|
501
|
-
|
502
|
-
model_config = ConfigDict(
|
503
|
-
json_schema_extra={
|
504
|
-
"example": {
|
505
|
-
"c_data": {
|
506
|
-
"c_ac": "NM_004333.6",
|
507
|
-
"c_start_pos": 1797,
|
508
|
-
"c_end_pos": 1800,
|
509
|
-
"cds_start": 226,
|
510
|
-
"residue_mode": ResidueMode.INTER_RESIDUE,
|
511
|
-
},
|
512
|
-
"warnings": [],
|
513
|
-
"service_meta": {
|
514
|
-
"name": "cool_seq_tool",
|
515
|
-
"version": __version__,
|
516
|
-
"response_datetime": _now,
|
517
|
-
"url": "https://github.com/GenomicMedLab/cool-seq-tool",
|
518
|
-
},
|
519
|
-
}
|
520
|
-
}
|
521
|
-
)
|
522
|
-
|
523
|
-
|
524
|
-
class GenomicRepresentation(BaseModelForbidExtra):
|
525
|
-
"""Model response for genomic representation"""
|
526
|
-
|
527
|
-
g_ac: str
|
528
|
-
g_start_pos: int
|
529
|
-
g_end_pos: int
|
530
|
-
residue_mode: Literal[ResidueMode.INTER_RESIDUE] = ResidueMode.INTER_RESIDUE.value
|
531
|
-
|
532
|
-
model_config = ConfigDict(
|
533
|
-
json_schema_extra={
|
534
|
-
"example": {
|
535
|
-
"g_ac": "NC_000007.13",
|
536
|
-
"g_start_pos": 140453134,
|
537
|
-
"g_end_pos": 140453137,
|
538
|
-
"residue_mode": ResidueMode.INTER_RESIDUE,
|
539
|
-
}
|
540
|
-
}
|
541
|
-
)
|
542
|
-
|
543
|
-
|
544
|
-
class ToGenomicService(BaseModelForbidExtra):
|
545
|
-
"""Service model response for cDNA -> genomic"""
|
546
|
-
|
547
|
-
g_data: GenomicRepresentation | None = None
|
548
|
-
warnings: list[StrictStr] = []
|
549
|
-
service_meta: ServiceMeta
|
550
|
-
|
551
|
-
model_config = ConfigDict(
|
552
|
-
json_schema_extra={
|
553
|
-
"example": {
|
554
|
-
"g_data": {
|
555
|
-
"g_ac": "NC_000007.13",
|
556
|
-
"g_start_pos": 140453134,
|
557
|
-
"g_end_pos": 140453137,
|
558
|
-
"residue_mode": ResidueMode.INTER_RESIDUE,
|
559
|
-
},
|
560
|
-
"warnings": [],
|
561
|
-
"service_meta": {
|
562
|
-
"name": "cool_seq_tool",
|
563
|
-
"version": __version__,
|
564
|
-
"response_datetime": _now,
|
565
|
-
"url": "https://github.com/GenomicMedLab/cool-seq-tool",
|
566
|
-
},
|
567
|
-
}
|
568
|
-
}
|
569
|
-
)
|
@@ -8,6 +8,7 @@ from pathlib import Path
|
|
8
8
|
import polars as pl
|
9
9
|
|
10
10
|
from cool_seq_tool.resources.data_files import DataFile, get_data_file
|
11
|
+
from cool_seq_tool.schemas import ManeGeneData
|
11
12
|
|
12
13
|
_logger = logging.getLogger(__name__)
|
13
14
|
|
@@ -103,3 +104,37 @@ class ManeTranscriptMappings:
|
|
103
104
|
|
104
105
|
mane_rows = mane_rows.sort(by="MANE_status", descending=True)
|
105
106
|
return mane_rows.to_dicts()
|
107
|
+
|
108
|
+
def get_genomic_mane_genes(
|
109
|
+
self, ac: str, start: int, end: int
|
110
|
+
) -> list[ManeGeneData]:
|
111
|
+
"""Get MANE gene(s) for genomic location
|
112
|
+
|
113
|
+
:param ac: RefSeq genomic accession
|
114
|
+
:param start: Genomic start position. Assumes residue coordinates.
|
115
|
+
:param end: Genomic end position. Assumes residue coordinates.
|
116
|
+
:return: Unique MANE gene(s) found for a genomic location
|
117
|
+
"""
|
118
|
+
mane_rows = self.df.filter(
|
119
|
+
(start >= pl.col("chr_start"))
|
120
|
+
& (end <= pl.col("chr_end"))
|
121
|
+
& (pl.col("GRCh38_chr") == ac)
|
122
|
+
).unique(subset=["#NCBI_GeneID"])
|
123
|
+
|
124
|
+
if len(mane_rows) == 0:
|
125
|
+
return []
|
126
|
+
|
127
|
+
mane_rows = mane_rows.with_columns(
|
128
|
+
pl.col("#NCBI_GeneID")
|
129
|
+
.str.split_exact(":", 1)
|
130
|
+
.struct.field("field_1")
|
131
|
+
.cast(pl.Int32)
|
132
|
+
.alias("ncbi_gene_id"),
|
133
|
+
pl.col("HGNC_ID")
|
134
|
+
.str.split_exact(":", 1)
|
135
|
+
.struct.field("field_1")
|
136
|
+
.cast(pl.Int32)
|
137
|
+
.alias("hgnc_id"),
|
138
|
+
)
|
139
|
+
mane_rows = mane_rows.select(["ncbi_gene_id", "hgnc_id", "symbol"])
|
140
|
+
return [ManeGeneData(**mane_gene) for mane_gene in mane_rows.to_dicts()]
|