codeine 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codeine/__init__.py +15 -0
- codeine/constraints/banned.py +444 -0
- codeine/constraints/base.py +39 -0
- codeine/constraints/mutations.py +115 -0
- codeine/graph/base.py +267 -0
- codeine/graph/compile.py +489 -0
- codeine/graph/nodes.py +111 -0
- codeine/graph/view.py +781 -0
- codeine/motifs/restriction.py +105 -0
- codeine/motifs/validate.py +117 -0
- codeine/space/__init__.py +0 -0
- codeine/space/coding.py +490 -0
- codeine/space/mutation.py +512 -0
- codeine/translation/__init__.py +0 -0
- codeine/translation/data/__init__.py +0 -0
- codeine/translation/data/tables.json +2252 -0
- codeine/translation/data/weights.py +232 -0
- codeine/translation/tables.py +200 -0
- codeine/translation/weights.py +323 -0
- codeine/utils/__init__.py +0 -0
- codeine/utils/dict.py +23 -0
- codeine/utils/display.py +124 -0
- codeine/utils/sampling.py +90 -0
- codeine-0.1.0.dist-info/METADATA +162 -0
- codeine-0.1.0.dist-info/RECORD +28 -0
- codeine-0.1.0.dist-info/WHEEL +5 -0
- codeine-0.1.0.dist-info/licenses/LICENSE +21 -0
- codeine-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Codon usage counts obtained from the GenScript Codon Frequency Table.
|
|
3
|
+
|
|
4
|
+
Source:
|
|
5
|
+
https://www.genscript.com/tools/codon-frequency-table
|
|
6
|
+
|
|
7
|
+
Retrieved: 2026-06-17
|
|
8
|
+
|
|
9
|
+
Values correspond to the "Number" column.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
# Raw codon counts for E. coli (organism per the constant's name), keyed first
# by one-letter amino-acid symbol and then by DNA codon. The '*' entry holds the
# TAA/TAG/TGA codons. Values are unnormalised counts (the "Number" column
# referenced in the module docstring), not frequencies.
ECOLI_WEIGHTS = {'F': {'TTT': 80995, 'TTC': 58774},
                 'L': {'TTA': 52382,
                       'TTG': 47500,
                       'CTT': 43449,
                       'CTC': 37347,
                       'CTA': 15409,
                       'CTG': 177210},
                 'Y': {'TAT': 63937, 'TAC': 44631},
                 '*': {'TAA': 7356, 'TAG': 989, 'TGA': 3623},
                 'H': {'CAT': 45879, 'CAC': 34078},
                 'Q': {'CAA': 53394, 'CAG': 104171},
                 'I': {'ATT': 109072, 'ATC': 86796, 'ATA': 24984},
                 'M': {'ATG': 96695},
                 'N': {'AAT': 75436, 'AAC': 78443},
                 'K': {'AAA': 129137, 'AAG': 45459},
                 'V': {'GTT': 72584, 'GTC': 52439, 'GTA': 42420, 'GTG': 89265},
                 'D': {'GAT': 119939, 'GAC': 70394},
                 'E': {'GAA': 143353, 'GAG': 68609},
                 'S': {'TCT': 38027,
                       'TCC': 33430,
                       'TCA': 32715,
                       'TCG': 31146,
                       'AGT': 36097,
                       'AGC': 55551},
                 'C': {'TGT': 19138, 'TGC': 22188},
                 'W': {'TGG': 50991},
                 'P': {'CCT': 27340, 'CCC': 19666, 'CCA': 31534, 'CCG': 76644},
                 'R': {'CGT': 73197,
                       'CGC': 72212,
                       'CGA': 13844,
                       'CGG': 21552,
                       'AGA': 13152,
                       'AGG': 7607},
                 'T': {'ACT': 37842, 'ACC': 80547, 'ACA': 33910, 'ACG': 50269},
                 'A': {'GCT': 62479, 'GCC': 88721, 'GCA': 77547, 'GCG': 110308},
                 'G': {'GGT': 93325, 'GGC': 99390, 'GGA': 34799, 'GGG': 41277}}
|
|
48
|
+
|
|
49
|
+
# Raw codon counts for yeast (organism per the constant's name; the exact
# species is not recorded here), keyed first by one-letter amino-acid symbol
# and then by DNA codon. The '*' entry holds the TAA/TAG/TGA codons. Values are
# unnormalised counts, not frequencies.
YEAST_WEIGHTS = {'F': {'TTT': 147662, 'TTC': 103110},
                 'L': {'TTA': 149466,
                       'TTG': 153307,
                       'CTT': 68959,
                       'CTC': 30501,
                       'CTA': 75775,
                       'CTG': 59027},
                 'Y': {'TAT': 106420, 'TAC': 83100},
                 '*': {'TAA': 5596, 'TAG': 2681, 'TGA': 3516},
                 'H': {'CAT': 77724, 'CAC': 44082},
                 'Q': {'CAA': 155751, 'CAG': 68932},
                 'I': {'ATT': 171272, 'ATC': 96735, 'ATA': 100930},
                 'M': {'ATG': 118274},
                 'N': {'AAT': 204015, 'AAC': 141103},
                 'K': {'AAA': 239186, 'AAG': 174250},
                 'V': {'GTT': 124665, 'GTC': 65664, 'GTA': 66649, 'GTG': 60286},
                 'D': {'GAT': 214363, 'GAC': 114918},
                 'E': {'GAA': 260082, 'GAG': 108411},
                 'S': {'TCT': 133502,
                       'TCC': 80612,
                       'TCA': 106266,
                       'TCG': 48524,
                       'AGT': 80293,
                       'AGC': 54825},
                 'C': {'TGT': 45222, 'TGC': 26520},
                 'W': {'TGG': 58416},
                 'P': {'CCT': 77061, 'CCC': 38414, 'CCA': 103062, 'CCG': 29916},
                 'R': {'CGT': 36693,
                       'CGC': 14645,
                       'CGA': 17111,
                       'CGG': 9872,
                       'AGA': 120587,
                       'AGG': 52322},
                 'T': {'ACT': 114746, 'ACC': 71204, 'ACA': 100680, 'ACG': 44995},
                 'A': {'GCT': 119402, 'GCC': 71065, 'GCA': 91502, 'GCG': 34754},
                 'G': {'GGT': 135417, 'GGC': 54977, 'GGA': 61825, 'GGG': 33885}}
|
|
85
|
+
|
|
86
|
+
# Raw codon counts for human (organism per the constant's name), keyed first by
# one-letter amino-acid symbol and then by DNA codon. The '*' entry holds the
# TAA/TAG/TGA codons. Values are unnormalised counts, not frequencies.
HUMAN_WEIGHTS = {'F': {'TTT': 336562, 'TTC': 406571},
                 'L': {'TTA': 143715,
                       'TTG': 249879,
                       'CTT': 253795,
                       'CTC': 386182,
                       'CTA': 138154,
                       'CTG': 800774},
                 'Y': {'TAT': 239268, 'TAC': 310695},
                 '*': {'TAA': 14322, 'TAG': 10915, 'TGA': 25383},
                 'H': {'CAT': 207826, 'CAC': 297048},
                 'Q': {'CAA': 234785, 'CAG': 688316},
                 'I': {'ATT': 313225, 'ATC': 426570, 'ATA': 140652},
                 'M': {'ATG': 443795},
                 'N': {'AAT': 331714, 'AAC': 387148},
                 'K': {'AAA': 476554, 'AAG': 654280},
                 'V': {'GTT': 216818, 'GTC': 290874, 'GTA': 139156, 'GTG': 575438},
                 'D': {'GAT': 443369, 'GAC': 517579},
                 'E': {'GAA': 577846, 'GAG': 810842},
                 'S': {'TCT': 291040,
                       'TCC': 346943,
                       'TCA': 233110,
                       'TCG': 89429,
                       'AGT': 237404,
                       'AGC': 385113},
                 'C': {'TGT': 197293, 'TGC': 243685},
                 'W': {'TGG': 255512},
                 'P': {'CCT': 343793, 'CCC': 397790, 'CCA': 331944, 'CCG': 139414},
                 'R': {'CGT': 93458,
                       'CGC': 217130,
                       'CGA': 126113,
                       'CGG': 235938,
                       'AGA': 228151,
                       'AGG': 227281},
                 'T': {'ACT': 255582, 'ACC': 382050, 'ACA': 294223, 'ACG': 123533},
                 'A': {'GCT': 370873, 'GCC': 567930, 'GCA': 317338, 'GCG': 150708},
                 'G': {'GGT': 215544, 'GGC': 453917, 'GGA': 325243, 'GGG': 326879}}
|
|
122
|
+
|
|
123
|
+
# Raw codon counts for mouse (organism per the constant's name), keyed first by
# one-letter amino-acid symbol and then by DNA codon. The '*' entry holds the
# TAA/TAG/TGA codons. Values are unnormalised counts, not frequencies.
MOUSE_WEIGHTS = {'F': {'TTT': 244935, 'TTC': 319002},
                 'L': {'TTA': 92974,
                       'TTG': 189840,
                       'CTT': 188236,
                       'CTC': 290198,
                       'CTA': 114707,
                       'CTG': 571592},
                 'S': {'TCT': 228046,
                       'TCC': 258787,
                       'TCA': 165371,
                       'TCG': 61116,
                       'AGT': 176056,
                       'AGC': 279098},
                 'Y': {'TAT': 175198, 'TAC': 236579},
                 '*': {'TAA': 8861, 'TAG': 7491, 'TGA': 17004},
                 'C': {'TGT': 158573, 'TGC': 173509},
                 'W': {'TGG': 177261},
                 'P': {'CCT': 261478, 'CCC': 262874, 'CCA': 244064, 'CCG': 89477},
                 'H': {'CAT': 145279, 'CAC': 217261},
                 'Q': {'CAA': 163794, 'CAG': 486041},
                 'R': {'CGT': 66962,
                       'CGC': 136330,
                       'CGA': 94626,
                       'CGG': 147905,
                       'AGA': 163062,
                       'AGG': 167166},
                 'I': {'ATT': 223528, 'ATC': 332072, 'ATA': 103441},
                 'M': {'ATG': 330431},
                 'T': {'ACT': 194120, 'ACC': 274588, 'ACA': 227958, 'ACG': 82653},
                 'N': {'AAT': 221760, 'AAC': 296083},
                 'K': {'AAA': 305128, 'AAG': 482643},
                 'V': {'GTT': 152706, 'GTC': 224465, 'GTA': 105532, 'GTG': 416428},
                 'A': {'GCT': 288132, 'GCC': 378142, 'GCA': 226309, 'GCG': 94463},
                 'D': {'GAT': 301359, 'GAC': 380355},
                 'E': {'GAA': 379216, 'GAG': 563999},
                 'G': {'GGT': 164971, 'GGC': 312846, 'GGA': 241435, 'GGG': 220474}}
|
|
159
|
+
|
|
160
|
+
# Raw codon counts for Arabidopsis (organism per the constant's name), keyed
# first by one-letter amino-acid symbol and then by DNA codon. The '*' entry
# holds the TAA/TAG/TGA codons. Values are unnormalised counts, not frequencies.
ARABIDOPSIS_WEIGHTS = {'F': {'TTT': 534456, 'TTC': 505253},
                       'L': {'TTA': 308336,
                             'TTG': 508909,
                             'CTT': 589767,
                             'CTC': 391502,
                             'CTA': 240784,
                             'CTG': 240472},
                       'S': {'TCT': 610403,
                             'TCC': 270545,
                             'TCA': 440264,
                             'TCG': 223500,
                             'AGT': 339903,
                             'AGC': 273815},
                       'Y': {'TAT': 359583, 'TAC': 337738},
                       '*': {'TAA': 22128, 'TAG': 12250, 'TGA': 26304},
                       'C': {'TGT': 254179, 'TGC': 171924},
                       'W': {'TGG': 304101},
                       'P': {'CCT': 455638, 'CCC': 129838, 'CCA': 393957, 'CCG': 207684},
                       'H': {'CAT': 334012, 'CAC': 211635},
                       'Q': {'CAA': 470475, 'CAG': 371091},
                       'R': {'CGT': 220192,
                             'CGC': 91869,
                             'CGA': 152537,
                             'CGG': 117195,
                             'AGA': 457927,
                             'AGG': 266406},
                       'I': {'ATT': 528593, 'ATC': 452968, 'ATA': 305594},
                       'M': {'ATG': 594807},
                       'T': {'ACT': 431031, 'ACC': 252577, 'ACA': 381774, 'ACG': 186589},
                       'N': {'AAT': 543539, 'AAC': 508277},
                       'K': {'AAA': 749448, 'AAG': 798966},
                       'V': {'GTT': 668037, 'GTC': 312848, 'GTA': 242644, 'GTG': 423929},
                       'A': {'GCT': 697575, 'GCC': 253877, 'GCA': 427842, 'GCG': 217886},
                       'D': {'GAT': 895566, 'GAC': 420193},
                       'E': {'GAA': 836077, 'GAG': 786970},
                       'G': {'GGT': 546544, 'GGC': 223041, 'GGA': 592219, 'GGG': 249165}}
|
|
196
|
+
|
|
197
|
+
# Raw codon counts for Drosophila (organism per the constant's name), keyed
# first by one-letter amino-acid symbol and then by DNA codon. The '*' entry
# holds the TAA/TAG/TGA codons. Values are unnormalised counts, not frequencies.
DROSOPHILA_WEIGHTS = {'F': {'TTT': 241160, 'TTC': 402987},
                      'L': {'TTA': 80654,
                            'TTG': 294929,
                            'CTT': 164079,
                            'CTC': 255170,
                            'CTA': 150674,
                            'CTG': 709057},
                      'S': {'TCT': 127758,
                            'TCC': 359352,
                            'TCA': 142775,
                            'TCG': 307642,
                            'AGT': 211452,
                            'AGC': 377750},
                      'Y': {'TAT': 196987, 'TAC': 338493},
                      '*': {'TAA': 14550, 'TAG': 11721, 'TGA': 9358},
                      'C': {'TGT': 98056, 'TGC': 241823},
                      'W': {'TGG': 181191},
                      'P': {'CCT': 127584, 'CCC': 331799, 'CCA': 248604, 'CCG': 293998},
                      'H': {'CAT': 194039, 'CAC': 294555},
                      'Q': {'CAA': 287398, 'CAG': 674575},
                      'R': {'CGT': 161800,
                            'CGC': 334733,
                            'CGA': 155637,
                            'CGG': 151728,
                            'AGA': 93738,
                            'AGG': 115430},
                      'I': {'ATT': 302063, 'ATC': 421851, 'ATA': 171610},
                      'M': {'ATG': 431822},
                      'T': {'ACT': 175261, 'ACC': 393516, 'ACA': 201178, 'ACG': 267095},
                      'N': {'AAT': 385441, 'AAC': 483430},
                      'K': {'AAA': 304266, 'AAG': 729428},
                      'V': {'GTT': 200189, 'GTC': 254974, 'GTA': 116035, 'GTG': 514471},
                      'A': {'GCT': 265295, 'GCC': 620889, 'GCA': 234549, 'GCG': 259119},
                      'D': {'GAT': 504880, 'GAC': 454677},
                      'E': {'GAA': 383850, 'GAG': 789807},
                      'G': {'GGT': 243460, 'GGC': 492283, 'GGA': 326983, 'GGG': 85472}}
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, Dict
|
|
6
|
+
|
|
7
|
+
from codeine.utils.dict import FrozenDict
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TranslationTable:
    """
    Translation table. Here we use the NCBI translation table IDs
    (see https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi)
    Uses DNA by default, but can switch to RNA by setting ``rna=True``.

    Instances are locked at the end of construction: once ``_locked`` is set,
    any further attribute assignment raises ``AttributeError``.

    Attributes
    ----------
    table_id
        The NCBI table ID, or ``None`` for tables built with :meth:`custom`.
    rna
        Whether codons are expressed in RNA (``U``) rather than DNA (``T``).
    name, names
        Primary name and tuple of all names of the table.
    stop_codons
        Tuple of normalised stop codons.
    codons_to_aa
        Mapping from normalised codon to amino-acid symbol.
    aa_to_codons
        Mapping from amino-acid symbol to a tuple of normalised codons.
    """

    def __init__(self, table_id: int = 1, rna: bool = False) -> None:
        """
        Parameters
        ----------
        table_id
            Which translation table to use. Default is 1, which is the standard genetic code.
        rna
            Whether to use RNA. Default is no (False), i.e. use DNA.

        Raises
        ------
        ValueError
            If ``table_id`` is not present in the stored tables.
        """
        self._locked = False

        self.table_id = table_id
        self.rna = rna

        tables = self._load_tables()

        try:
            table = tables[str(table_id)]
        except KeyError:
            # The internal KeyError adds nothing for the caller; suppress it.
            raise ValueError(f'Unknown NCBI translation table ID: {table_id}.') from None

        self.name = table['name']
        self.names = tuple(table.get('names', (self.name,)))

        self.stop_codons = tuple(
            self.normalise_sequence(codon)
            for codon in table['stop_codons']
        )

        # Normalise once (DNA/RNA alphabet, upper case), then derive the
        # reverse mapping from the normalised codons.
        codons_to_aa = {
            self.normalise_sequence(codon): aa
            for codon, aa in table['codon_to_aa'].items()
        }

        self.codons_to_aa = FrozenDict(codons_to_aa)
        self.aa_to_codons = FrozenDict(self._group_codons(codons_to_aa))

        self._locked = True

    @classmethod
    def custom(
        cls,
        codons_to_aa: Dict[str, str],
        name: str = 'Custom',
        rna: bool = False,
    ) -> 'TranslationTable':
        """
        Create a custom translation table.

        This is useful for synthetic, partial, experimental, or externally
        defined translation tables, or for tables that have been discovered
        but not yet added to Codeine. Codons are normalised to DNA or RNA
        according to the ``rna`` argument. If two input codons normalise to
        the same codon, the later entry wins.

        Parameters
        ----------
        codons_to_aa
            Mapping from codons to amino-acid symbols. Codons mapped to
            ``'*'`` become the table's stop codons.
        name
            Name of the custom translation table. Default is ``'Custom'``.
        rna
            Whether the table should use RNA codons.

        Returns
        -------
        TranslationTable
            A custom translation table with ``table_id`` set to ``None``.

        Raises
        ------
        ValueError
            If a codon is not a nucleotide sequence of length 3, or an
            amino-acid symbol is not a single character.
        """
        # Bypass __init__, which would load a stored NCBI table.
        obj = cls.__new__(cls)
        obj._locked = False

        obj.table_id = None
        obj.rna = rna
        obj.name = name
        obj.names = (name,)

        normalised = {
            obj.normalise_sequence(codon): aa
            for codon, aa in codons_to_aa.items()
        }

        if any(len(codon) != 3 for codon in normalised):
            raise ValueError('All codons must have length 3.')

        if any(len(aa) != 1 for aa in normalised.values()):
            raise ValueError('Amino-acid symbols must be single characters.')

        obj.stop_codons = tuple(codon for codon, aa in normalised.items() if aa == '*')

        obj.codons_to_aa = FrozenDict(normalised)
        obj.aa_to_codons = FrozenDict(cls._group_codons(normalised))

        obj._locked = True
        return obj

    @staticmethod
    def _group_codons(codons_to_aa: Dict[str, str]) -> Dict[str, tuple]:
        """
        Invert a codon -> amino-acid mapping into amino-acid -> tuple of codons,
        preserving the input's iteration order within each group.
        """
        grouped = {}
        for codon, aa in codons_to_aa.items():
            grouped.setdefault(aa, []).append(codon)
        return {aa: tuple(codons) for aa, codons in grouped.items()}

    def __setattr__(self, name: str, value: Any) -> None:
        """
        Set an attribute, refusing once the instance is locked.

        ``_locked`` itself stays assignable so construction can toggle it.
        """
        if getattr(self, '_locked', False) and name != '_locked':
            raise AttributeError(f'{type(self).__name__} is immutable')
        object.__setattr__(self, name, value)

    def __repr__(self) -> str:
        """
        Return a multi-line summary: table ID and name, molecule type, and the
        codons of each amino acid (amino acids in sorted order).
        """
        molecule = 'RNA' if self.rna else 'DNA'

        lines = [
            'TranslationTable',
            f'Table ID: {self.table_id} ({self.name})',
            f'Molecule type: {molecule}',
            '',
            'Table:',
        ]

        for aa in sorted(self.aa_to_codons):
            codons = ' '.join(self.aa_to_codons[aa])
            lines.append(f'  {aa}: {codons}')

        return '\n'.join(lines)

    def __getitem__(self, codon: str) -> str:
        """
        Look up the amino-acid symbol for a codon.

        The codon is normalised first, so case, spaces and T/U differences
        are tolerated.

        Raises
        ------
        KeyError
            If the codon is not a valid nucleotide string or is not in the
            table. All lookup failures are reported uniformly as ``KeyError``.
        """
        try:
            codon = self.normalise_sequence(codon)
            return self.codons_to_aa[codon]

        except (AttributeError, TypeError, ValueError, KeyError) as e:
            raise KeyError(f'Invalid codon: {codon}.') from e

    def normalise_sequence(self, seq: str) -> str:
        """
        Format a sequence in the format specified by this codon table, i.e. convert to
        RNA/DNA and cast to upper case. Spaces are removed.

        Parameters
        ----------
        seq
            The inputted nucleotide sequence (a single codon or longer).

        Returns
        -------
        The normalised sequence.

        Raises
        ------
        ValueError
            If the sequence contains anything other than nucleotide letters
            (after removing spaces).
        """
        seq = seq.upper().replace(' ', '')
        seq = seq.replace('T', 'U') if self.rna else seq.replace('U', 'T')
        pattern = r'[ACGU]*' if self.rna else r'[ACGT]*'
        # fullmatch, not match + '$': '$' also matches just before a trailing
        # newline, which would let 'ATG\n' through with the newline attached.
        if not re.fullmatch(pattern, seq):
            raise ValueError('Sequence to normalise must be a nucleotide sequence.')
        return seq

    def translate(self, seq: str) -> str:
        """
        Translate a DNA/RNA coding sequence into its amino-acid sequence.

        Parameters
        ----------
        seq
            The coding sequence. It is normalised first, so case, spaces and
            T/U differences are tolerated.

        Returns
        -------
        The amino-acid sequence, one symbol per codon.

        Raises
        ------
        ValueError
            If the sequence is not a nucleotide sequence, or its normalised
            length is not a multiple of 3.

        Notes
        -----
        A codon absent from the table propagates the lookup error raised by
        ``codons_to_aa`` (a ``KeyError`` for dict-like mappings).
        """
        # Normalise before checking the length: normalisation strips spaces,
        # so the raw length says nothing about the number of codons.
        seq = self.normalise_sequence(seq)

        if len(seq) % 3 != 0:
            raise ValueError('Sequence length must be a multiple of 3')

        return ''.join(self.codons_to_aa[seq[i:i + 3]] for i in range(0, len(seq), 3))

    @staticmethod
    def _load_tables() -> dict:
        """
        Load tables from stored JSON entries.

        Reads ``data/tables.json`` next to this module and returns the parsed
        content.
        """
        path = Path(__file__).parent / 'data' / 'tables.json'
        # Explicit encoding: do not depend on the platform's default locale.
        with path.open(encoding='utf-8') as f:
            return json.load(f)
|