cool-seq-tool: cool_seq_tool-0.5.1-py3-none-any.whl → cool_seq_tool-0.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cool_seq_tool/schemas.py CHANGED
@@ -9,7 +9,6 @@ from pydantic import (
9
9
  ConfigDict,
10
10
  StrictInt,
11
11
  StrictStr,
12
- model_validator,
13
12
  )
14
13
 
15
14
  from cool_seq_tool import __version__
@@ -20,9 +19,9 @@ _now = str(datetime.datetime.now(tz=datetime.timezone.utc))
20
19
  class AnnotationLayer(str, Enum):
21
20
  """Create enum for supported annotation layers"""
22
21
 
23
- PROTEIN: Literal["p"] = "p"
24
- CDNA: Literal["c"] = "c"
25
- GENOMIC: Literal["g"] = "g"
22
+ PROTEIN = "p"
23
+ CDNA = "c"
24
+ GENOMIC = "g"
26
25
 
27
26
 
28
27
  class Strand(IntEnum):
@@ -53,15 +52,17 @@ class TranscriptPriority(str, Enum):
53
52
  GRCH38 = "grch38"
54
53
 
55
54
 
56
- class ResidueMode(str, Enum):
57
- """Create Enum for residue modes.
55
+ class CoordinateType(str, Enum):
56
+ """Create Enum for coordinate types.
58
57
 
59
- We typically prefer to operate in inter-residue coordinates, but users should be
58
+ It is preferred to operate in inter-residue coordinates, but users should be
60
59
  careful to define the coordinate mode of their data when calling ``cool-seq-tool``
61
60
  functions.
62
61
 
62
+ ``RESIDUE`` means 1-indexed, residue coordinates and ``INTER_RESIDUE`` means
63
+ 0-indexed, inter-residue coordinates.
64
+
63
65
  | | C | | T | | G | |
64
- ZERO | | 0 | | 1 | | 2 | |
65
66
  RESIDUE | | 1 | | 2 | | 3 | |
66
67
  INTER_RESIDUE | 0 | | 1 | | 2 | | 3 |
67
68
 
@@ -77,14 +78,6 @@ class ResidueMode(str, Enum):
77
78
  -
78
79
  - G
79
80
  -
80
- * - ``ZERO``
81
- -
82
- - 0
83
- -
84
- - 1
85
- -
86
- - 2
87
- -
88
81
  * - ``RESIDUE``
89
82
  -
90
83
  - 1
@@ -107,7 +100,6 @@ class ResidueMode(str, Enum):
107
100
  `Variation Representation Schema (VRS) paper <https://www.ncbi.nlm.nih.gov/pmc/articles/pmid/35311178/>`_ for further discussion.
108
101
  """
109
102
 
110
- ZERO = "zero"
111
103
  RESIDUE = "residue"
112
104
  INTER_RESIDUE = "inter-residue"
113
105
 
@@ -116,157 +108,35 @@ class BaseModelForbidExtra(BaseModel, extra="forbid"):
116
108
  """Base Pydantic model class with extra values forbidden."""
117
109
 
118
110
 
119
- class GenomicRequestBody(BaseModelForbidExtra):
120
- """Define constraints for genomic to transcript exon coordinates request body"""
121
-
122
- chromosome: StrictStr | StrictInt
123
- start: StrictInt | None = None
124
- end: StrictInt | None = None
125
- strand: Strand | None = None
126
- transcript: StrictStr | None = None
127
- gene: StrictStr | None = None
128
- residue_mode: ResidueMode = ResidueMode.RESIDUE
129
-
130
- @model_validator(mode="after")
131
- def check_start_and_end(cls, values):
132
- """Check that at least one of {``start``, ``end``} is set"""
133
- start, end = values.start, values.end
134
- if not start or end:
135
- msg = "Must provide either `start` or `end`"
136
- raise ValueError(msg)
137
- return values
138
-
139
- model_config = ConfigDict(
140
- json_schema_extra={
141
- "example": {
142
- "chromosome": "NC_000001.11",
143
- "start": 154192135,
144
- "end": None,
145
- "strand": Strand.NEGATIVE,
146
- "transcript": "NM_152263.3",
147
- "gene": "TPM3",
148
- "residue_mode": "residue",
149
- }
150
- }
151
- )
152
-
153
-
154
- class TranscriptRequestBody(BaseModelForbidExtra):
155
- """Define constraints for transcript exon to genomic coordinates request body"""
111
+ class GenomicTxData(BaseModelForbidExtra):
112
+ """Represent aligned genomic/transcript exon data"""
156
113
 
157
- transcript: StrictStr
158
- gene: StrictStr | None = None
159
- exon_start: StrictInt | None = None
160
- exon_start_offset: StrictInt | None = 0
161
- exon_end: StrictInt | None = None
162
- exon_end_offset: StrictInt | None = 0
163
-
164
- @model_validator(mode="after")
165
- def check_exon_start_and_exon_end(cls, values):
166
- """Check that at least one of {``exon_start``, ``exon_end``} is set"""
167
- exon_start, exon_end = values.exon_start, values.exon_end
168
- if not exon_start or exon_end:
169
- msg = "Must provide either `exon_start` or `exon_end`"
170
- raise ValueError(msg)
171
- return values
172
-
173
- model_config = ConfigDict(
174
- json_schema_extra={
175
- "example": {
176
- "gene": "TPM3",
177
- "transcript": "NM_152263.3",
178
- "exon_start": 1,
179
- "exon_start_offset": 1,
180
- "exon_end": None,
181
- "exon_end_offset": None,
182
- }
183
- }
184
- )
185
-
186
-
187
- class TranscriptExonData(BaseModelForbidExtra):
188
- """Model containing transcript exon data."""
189
-
190
- transcript: StrictStr
191
- pos: StrictInt
192
- exon: StrictInt
193
- exon_offset: StrictInt = 0
194
- gene: StrictStr
195
- chr: StrictStr
114
+ gene: str
196
115
  strand: Strand
116
+ tx_pos_range: tuple[int, int]
117
+ alt_pos_range: tuple[int, int]
118
+ alt_aln_method: str
119
+ tx_exon_id: int
120
+ alt_exon_id: int
197
121
 
198
- model_config = ConfigDict(
199
- json_schema_extra={
200
- "example": {
201
- "chr": "NC_000001.11",
202
- "gene": "TPM3",
203
- "pos": 154192135,
204
- "exon": 1,
205
- "exon_offset": 0,
206
- "transcript": "NM_152263.3",
207
- "strand": Strand.NEGATIVE,
208
- }
209
- }
210
- )
211
122
 
123
+ class GenomicTxMetadata(GenomicTxData):
124
+ """Store relevant metadata for genomic and transcript accessions"""
212
125
 
213
- class GenomicData(BaseModelForbidExtra):
214
- """Model containing genomic and transcript exon data."""
126
+ tx_ac: str
127
+ alt_ac: str
128
+ coding_start_site: int = 0
129
+ coding_end_site: int = 0
130
+ alt_pos_change_range: tuple[int, int]
131
+ pos_change: tuple[int, int] | None
215
132
 
216
- gene: StrictStr
217
- chr: StrictStr
218
- start: StrictInt | None = None # Genomic start position
219
- end: StrictInt | None = None # Genomic end position
220
- exon_start: StrictInt | None = None
221
- exon_start_offset: StrictInt | None = 0
222
- exon_end: StrictInt | None = None
223
- exon_end_offset: StrictInt | None = 0
224
- transcript: StrictStr
225
- strand: Strand
226
133
 
227
- @model_validator(mode="after")
228
- def check_start_end(cls, values):
229
- """Check that at least one of {``start``, ``end``} is set.
230
- Check that at least one of {``exon_start``, ``exon_end``} is set.
231
- If not set, set corresponding offset to ``None``
232
- """
233
- start = values.start
234
- end = values.end
235
- if not start and not end:
236
- msg = "Missing values for `start` or `end`"
237
- raise ValueError(msg)
238
-
239
- if start:
240
- if not values.exon_start:
241
- msg = "Missing value `exon_start`"
242
- raise ValueError(msg)
243
- else:
244
- values.exon_start_offset = None
245
-
246
- if end:
247
- if not values.exon_end:
248
- msg = "Missing value `exon_end`"
249
- raise ValueError(msg)
250
- else:
251
- values.exon_end_offset = None
252
- return values
134
+ class ManeGeneData(BaseModel, extra="forbid"):
135
+ """Define minimal object model for representing a MANE gene"""
253
136
 
254
- model_config = ConfigDict(
255
- json_schema_extra={
256
- "example": {
257
- "gene": "TPM3",
258
- "chr": "NC_000001.11",
259
- "start": 154192135,
260
- "end": None,
261
- "exon_start": 1,
262
- "exon_end": None,
263
- "exon_start_offset": 0,
264
- "exon_end_offset": None,
265
- "transcript": "NM_152263.3",
266
- "strand": Strand.NEGATIVE,
267
- }
268
- }
269
- )
137
+ ncbi_gene_id: StrictInt
138
+ hgnc_id: StrictInt | None
139
+ symbol: StrictStr
270
140
 
271
141
 
272
142
  class ServiceMeta(BaseModelForbidExtra):
@@ -289,281 +159,3 @@ class ServiceMeta(BaseModelForbidExtra):
289
159
  }
290
160
  }
291
161
  )
292
-
293
-
294
- class TranscriptExonDataResponse(BaseModelForbidExtra):
295
- """Response model for Transcript Exon Data"""
296
-
297
- transcript_exon_data: TranscriptExonData | None = None
298
- warnings: list[StrictStr] = []
299
- service_meta: ServiceMeta
300
-
301
- model_config = ConfigDict(
302
- json_schema_extra={
303
- "example": {
304
- "transcript_exon_data": {
305
- "chr": "NC_000001.11",
306
- "gene": "TPM3",
307
- "pos": 154192135,
308
- "exon": 1,
309
- "exon_offset": 0,
310
- "transcript": "NM_152263.3",
311
- "strand": Strand.NEGATIVE,
312
- },
313
- "warnings": [],
314
- "service_meta": {
315
- "name": "cool_seq_tool",
316
- "version": __version__,
317
- "response_datetime": _now,
318
- "url": "https://github.com/GenomicMedLab/cool-seq-tool",
319
- },
320
- }
321
- }
322
- )
323
-
324
-
325
- class GenomicDataResponse(BaseModelForbidExtra):
326
- """Response model for Genomic Data"""
327
-
328
- genomic_data: GenomicData | None = None
329
- warnings: list[StrictStr] = []
330
- service_meta: ServiceMeta
331
-
332
- model_config = ConfigDict(
333
- json_schema_extra={
334
- "example": {
335
- "genomic_data": {
336
- "gene": "TPM3",
337
- "chr": "NC_000001.11",
338
- "start": 154192135,
339
- "end": None,
340
- "exon_start": 1,
341
- "exon_end": None,
342
- "exon_start_offset": 0,
343
- "exon_end_offset": None,
344
- "transcript": "NM_152263.3",
345
- "strand": Strand.NEGATIVE,
346
- },
347
- "warnings": [],
348
- "service_meta": {
349
- "name": "cool_seq_tool",
350
- "version": __version__,
351
- "response_datetime": _now,
352
- "url": "https://github.com/GenomicMedLab/cool-seq-tool",
353
- },
354
- }
355
- }
356
- )
357
-
358
-
359
- class MappedManeData(BaseModel):
360
- """Define mapped mane data fields"""
361
-
362
- gene: StrictStr
363
- refseq: StrictStr
364
- ensembl: StrictStr | None = None
365
- strand: Strand
366
- status: TranscriptPriority
367
- alt_ac: StrictStr
368
- assembly: Assembly
369
-
370
- model_config = ConfigDict(
371
- json_schema_extra={
372
- "example": {
373
- "gene": "BRAF",
374
- "refseq": "NM_001374258.1",
375
- "ensembl": "ENST00000644969.2",
376
- "strand": Strand.NEGATIVE,
377
- "status": TranscriptPriority.MANE_PLUS_CLINICAL,
378
- "alt_ac": "NC_000007.13",
379
- "assembly": Assembly.GRCH37,
380
- }
381
- }
382
- )
383
-
384
-
385
- class MappedManeDataService(BaseModelForbidExtra):
386
- """Service model response for mapped mane data"""
387
-
388
- mapped_mane_data: MappedManeData | None = None
389
- warnings: list[StrictStr] = []
390
- service_meta: ServiceMeta
391
-
392
- model_config = ConfigDict(
393
- json_schema_extra={
394
- "example": {
395
- "mapped_mane_data": {
396
- "gene": "BRAF",
397
- "refseq": "NM_001374258.1",
398
- "ensembl": "ENST00000644969.2",
399
- "strand": Strand.NEGATIVE,
400
- "status": TranscriptPriority.MANE_PLUS_CLINICAL,
401
- "alt_ac": "NC_000007.13",
402
- "assembly": Assembly.GRCH37,
403
- },
404
- "warnings": [],
405
- "service_meta": {
406
- "name": "cool_seq_tool",
407
- "version": __version__,
408
- "response_datetime": _now,
409
- "url": "https://github.com/GenomicMedLab/cool-seq-tool",
410
- },
411
- }
412
- }
413
- )
414
-
415
-
416
- class ManeData(BaseModel):
417
- """Define mane data fields"""
418
-
419
- gene: StrictStr | None = None
420
- refseq: StrictStr | None = None
421
- ensembl: StrictStr | None = None
422
- pos: tuple[int, int]
423
- strand: Strand
424
- status: TranscriptPriority
425
-
426
- model_config = ConfigDict(
427
- json_schema_extra={
428
- "example": {
429
- "gene": "BRAF",
430
- "refseq": "NP_004324.2",
431
- "ensembl": "ENSP00000493543.1",
432
- "pos": (598, 598),
433
- "strand": Strand.NEGATIVE,
434
- "status": TranscriptPriority.MANE_SELECT,
435
- }
436
- }
437
- )
438
-
439
-
440
- class ManeDataService(BaseModelForbidExtra):
441
- """Service model response for getting mane data"""
442
-
443
- mane_data: ManeData | None = None
444
- warnings: list[StrictStr] = []
445
- service_meta: ServiceMeta
446
-
447
- model_config = ConfigDict(
448
- json_schema_extra={
449
- "example": {
450
- "mane_data": {
451
- "gene": "BRAF",
452
- "refseq": "NP_004324.2",
453
- "ensembl": "ENSP00000493543.1",
454
- "pos": (598, 598),
455
- "strand": Strand.NEGATIVE,
456
- "status": TranscriptPriority.MANE_SELECT,
457
- },
458
- "warnings": [],
459
- "service_meta": {
460
- "name": "cool_seq_tool",
461
- "version": __version__,
462
- "response_datetime": _now,
463
- "url": "https://github.com/GenomicMedLab/cool-seq-tool",
464
- },
465
- }
466
- }
467
- )
468
-
469
-
470
- # ALIGNMENT MAPPER SERVICE SCHEMAS
471
-
472
-
473
- class CdnaRepresentation(BaseModelForbidExtra):
474
- """Model response for cDNA representation"""
475
-
476
- c_ac: StrictStr
477
- c_start_pos: StrictInt
478
- c_end_pos: StrictInt
479
- cds_start: StrictInt
480
- residue_mode: Literal[ResidueMode.INTER_RESIDUE] = ResidueMode.INTER_RESIDUE.value
481
-
482
- model_config = ConfigDict(
483
- json_schema_extra={
484
- "example": {
485
- "c_ac": "NM_004333.6",
486
- "c_start_pos": 1797,
487
- "c_end_pos": 1800,
488
- "cds_start": 226,
489
- "residue_mode": ResidueMode.INTER_RESIDUE,
490
- }
491
- }
492
- )
493
-
494
-
495
- class ToCdnaService(BaseModelForbidExtra):
496
- """Service model response for protein -> cDNA"""
497
-
498
- c_data: CdnaRepresentation | None = None
499
- warnings: list[StrictStr] = []
500
- service_meta: ServiceMeta
501
-
502
- model_config = ConfigDict(
503
- json_schema_extra={
504
- "example": {
505
- "c_data": {
506
- "c_ac": "NM_004333.6",
507
- "c_start_pos": 1797,
508
- "c_end_pos": 1800,
509
- "cds_start": 226,
510
- "residue_mode": ResidueMode.INTER_RESIDUE,
511
- },
512
- "warnings": [],
513
- "service_meta": {
514
- "name": "cool_seq_tool",
515
- "version": __version__,
516
- "response_datetime": _now,
517
- "url": "https://github.com/GenomicMedLab/cool-seq-tool",
518
- },
519
- }
520
- }
521
- )
522
-
523
-
524
- class GenomicRepresentation(BaseModelForbidExtra):
525
- """Model response for genomic representation"""
526
-
527
- g_ac: str
528
- g_start_pos: int
529
- g_end_pos: int
530
- residue_mode: Literal[ResidueMode.INTER_RESIDUE] = ResidueMode.INTER_RESIDUE.value
531
-
532
- model_config = ConfigDict(
533
- json_schema_extra={
534
- "example": {
535
- "g_ac": "NC_000007.13",
536
- "g_start_pos": 140453134,
537
- "g_end_pos": 140453137,
538
- "residue_mode": ResidueMode.INTER_RESIDUE,
539
- }
540
- }
541
- )
542
-
543
-
544
- class ToGenomicService(BaseModelForbidExtra):
545
- """Service model response for cDNA -> genomic"""
546
-
547
- g_data: GenomicRepresentation | None = None
548
- warnings: list[StrictStr] = []
549
- service_meta: ServiceMeta
550
-
551
- model_config = ConfigDict(
552
- json_schema_extra={
553
- "example": {
554
- "g_data": {
555
- "g_ac": "NC_000007.13",
556
- "g_start_pos": 140453134,
557
- "g_end_pos": 140453137,
558
- "residue_mode": ResidueMode.INTER_RESIDUE,
559
- },
560
- "warnings": [],
561
- "service_meta": {
562
- "name": "cool_seq_tool",
563
- "version": __version__,
564
- "response_datetime": _now,
565
- "url": "https://github.com/GenomicMedLab/cool-seq-tool",
566
- },
567
- }
568
- }
569
- )
cool_seq_tool/sources/mane_transcript_mappings.py CHANGED
@@ -8,6 +8,7 @@ from pathlib import Path
8
8
  import polars as pl
9
9
 
10
10
  from cool_seq_tool.resources.data_files import DataFile, get_data_file
11
+ from cool_seq_tool.schemas import ManeGeneData
11
12
 
12
13
  _logger = logging.getLogger(__name__)
13
14
 
@@ -103,3 +104,37 @@ class ManeTranscriptMappings:
103
104
 
104
105
  mane_rows = mane_rows.sort(by="MANE_status", descending=True)
105
106
  return mane_rows.to_dicts()
107
+
108
+ def get_genomic_mane_genes(
109
+ self, ac: str, start: int, end: int
110
+ ) -> list[ManeGeneData]:
111
+ """Get MANE gene(s) for genomic location
112
+
113
+ :param ac: RefSeq genomic accession
114
+ :param start: Genomic start position. Assumes residue coordinates.
115
+ :param end: Genomic end position. Assumes residue coordinates.
116
+ :return: Unique MANE gene(s) found for a genomic location
117
+ """
118
+ mane_rows = self.df.filter(
119
+ (start >= pl.col("chr_start"))
120
+ & (end <= pl.col("chr_end"))
121
+ & (pl.col("GRCh38_chr") == ac)
122
+ ).unique(subset=["#NCBI_GeneID"])
123
+
124
+ if len(mane_rows) == 0:
125
+ return []
126
+
127
+ mane_rows = mane_rows.with_columns(
128
+ pl.col("#NCBI_GeneID")
129
+ .str.split_exact(":", 1)
130
+ .struct.field("field_1")
131
+ .cast(pl.Int32)
132
+ .alias("ncbi_gene_id"),
133
+ pl.col("HGNC_ID")
134
+ .str.split_exact(":", 1)
135
+ .struct.field("field_1")
136
+ .cast(pl.Int32)
137
+ .alias("hgnc_id"),
138
+ )
139
+ mane_rows = mane_rows.select(["ncbi_gene_id", "hgnc_id", "symbol"])
140
+ return [ManeGeneData(**mane_gene) for mane_gene in mane_rows.to_dicts()]