codeine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,232 @@
1
+ """
2
+ Codon usage counts obtained from the GenScript Codon Frequency Table.
3
+
4
+ Source:
5
+ https://www.genscript.com/tools/codon-frequency-table
6
+
7
+ Retrieved: 2026-06-17
8
+
9
+ Values correspond to the "Number" column.
10
+ """
11
+
12
# Codon usage counts for the ECOLI table.
# Outer keys are one-letter amino-acid symbols ('*' groups the stop codons);
# inner keys are DNA codons mapped to their integer count.
ECOLI_WEIGHTS = {
    'F': {'TTT': 80995, 'TTC': 58774},
    'L': {'TTA': 52382, 'TTG': 47500, 'CTT': 43449, 'CTC': 37347, 'CTA': 15409, 'CTG': 177210},
    'Y': {'TAT': 63937, 'TAC': 44631},
    '*': {'TAA': 7356, 'TAG': 989, 'TGA': 3623},
    'H': {'CAT': 45879, 'CAC': 34078},
    'Q': {'CAA': 53394, 'CAG': 104171},
    'I': {'ATT': 109072, 'ATC': 86796, 'ATA': 24984},
    'M': {'ATG': 96695},
    'N': {'AAT': 75436, 'AAC': 78443},
    'K': {'AAA': 129137, 'AAG': 45459},
    'V': {'GTT': 72584, 'GTC': 52439, 'GTA': 42420, 'GTG': 89265},
    'D': {'GAT': 119939, 'GAC': 70394},
    'E': {'GAA': 143353, 'GAG': 68609},
    'S': {'TCT': 38027, 'TCC': 33430, 'TCA': 32715, 'TCG': 31146, 'AGT': 36097, 'AGC': 55551},
    'C': {'TGT': 19138, 'TGC': 22188},
    'W': {'TGG': 50991},
    'P': {'CCT': 27340, 'CCC': 19666, 'CCA': 31534, 'CCG': 76644},
    'R': {'CGT': 73197, 'CGC': 72212, 'CGA': 13844, 'CGG': 21552, 'AGA': 13152, 'AGG': 7607},
    'T': {'ACT': 37842, 'ACC': 80547, 'ACA': 33910, 'ACG': 50269},
    'A': {'GCT': 62479, 'GCC': 88721, 'GCA': 77547, 'GCG': 110308},
    'G': {'GGT': 93325, 'GGC': 99390, 'GGA': 34799, 'GGG': 41277},
}
48
+
49
# Codon usage counts for the YEAST table.
# Outer keys are one-letter amino-acid symbols ('*' groups the stop codons);
# inner keys are DNA codons mapped to their integer count.
YEAST_WEIGHTS = {
    'F': {'TTT': 147662, 'TTC': 103110},
    'L': {'TTA': 149466, 'TTG': 153307, 'CTT': 68959, 'CTC': 30501, 'CTA': 75775, 'CTG': 59027},
    'Y': {'TAT': 106420, 'TAC': 83100},
    '*': {'TAA': 5596, 'TAG': 2681, 'TGA': 3516},
    'H': {'CAT': 77724, 'CAC': 44082},
    'Q': {'CAA': 155751, 'CAG': 68932},
    'I': {'ATT': 171272, 'ATC': 96735, 'ATA': 100930},
    'M': {'ATG': 118274},
    'N': {'AAT': 204015, 'AAC': 141103},
    'K': {'AAA': 239186, 'AAG': 174250},
    'V': {'GTT': 124665, 'GTC': 65664, 'GTA': 66649, 'GTG': 60286},
    'D': {'GAT': 214363, 'GAC': 114918},
    'E': {'GAA': 260082, 'GAG': 108411},
    'S': {'TCT': 133502, 'TCC': 80612, 'TCA': 106266, 'TCG': 48524, 'AGT': 80293, 'AGC': 54825},
    'C': {'TGT': 45222, 'TGC': 26520},
    'W': {'TGG': 58416},
    'P': {'CCT': 77061, 'CCC': 38414, 'CCA': 103062, 'CCG': 29916},
    'R': {'CGT': 36693, 'CGC': 14645, 'CGA': 17111, 'CGG': 9872, 'AGA': 120587, 'AGG': 52322},
    'T': {'ACT': 114746, 'ACC': 71204, 'ACA': 100680, 'ACG': 44995},
    'A': {'GCT': 119402, 'GCC': 71065, 'GCA': 91502, 'GCG': 34754},
    'G': {'GGT': 135417, 'GGC': 54977, 'GGA': 61825, 'GGG': 33885},
}
85
+
86
# Codon usage counts for the HUMAN table.
# Outer keys are one-letter amino-acid symbols ('*' groups the stop codons);
# inner keys are DNA codons mapped to their integer count.
HUMAN_WEIGHTS = {
    'F': {'TTT': 336562, 'TTC': 406571},
    'L': {'TTA': 143715, 'TTG': 249879, 'CTT': 253795, 'CTC': 386182, 'CTA': 138154, 'CTG': 800774},
    'Y': {'TAT': 239268, 'TAC': 310695},
    '*': {'TAA': 14322, 'TAG': 10915, 'TGA': 25383},
    'H': {'CAT': 207826, 'CAC': 297048},
    'Q': {'CAA': 234785, 'CAG': 688316},
    'I': {'ATT': 313225, 'ATC': 426570, 'ATA': 140652},
    'M': {'ATG': 443795},
    'N': {'AAT': 331714, 'AAC': 387148},
    'K': {'AAA': 476554, 'AAG': 654280},
    'V': {'GTT': 216818, 'GTC': 290874, 'GTA': 139156, 'GTG': 575438},
    'D': {'GAT': 443369, 'GAC': 517579},
    'E': {'GAA': 577846, 'GAG': 810842},
    'S': {'TCT': 291040, 'TCC': 346943, 'TCA': 233110, 'TCG': 89429, 'AGT': 237404, 'AGC': 385113},
    'C': {'TGT': 197293, 'TGC': 243685},
    'W': {'TGG': 255512},
    'P': {'CCT': 343793, 'CCC': 397790, 'CCA': 331944, 'CCG': 139414},
    'R': {'CGT': 93458, 'CGC': 217130, 'CGA': 126113, 'CGG': 235938, 'AGA': 228151, 'AGG': 227281},
    'T': {'ACT': 255582, 'ACC': 382050, 'ACA': 294223, 'ACG': 123533},
    'A': {'GCT': 370873, 'GCC': 567930, 'GCA': 317338, 'GCG': 150708},
    'G': {'GGT': 215544, 'GGC': 453917, 'GGA': 325243, 'GGG': 326879},
}
122
+
123
# Codon usage counts for the MOUSE table.
# Outer keys are one-letter amino-acid symbols ('*' groups the stop codons);
# inner keys are DNA codons mapped to their integer count.
MOUSE_WEIGHTS = {
    'F': {'TTT': 244935, 'TTC': 319002},
    'L': {'TTA': 92974, 'TTG': 189840, 'CTT': 188236, 'CTC': 290198, 'CTA': 114707, 'CTG': 571592},
    'S': {'TCT': 228046, 'TCC': 258787, 'TCA': 165371, 'TCG': 61116, 'AGT': 176056, 'AGC': 279098},
    'Y': {'TAT': 175198, 'TAC': 236579},
    '*': {'TAA': 8861, 'TAG': 7491, 'TGA': 17004},
    'C': {'TGT': 158573, 'TGC': 173509},
    'W': {'TGG': 177261},
    'P': {'CCT': 261478, 'CCC': 262874, 'CCA': 244064, 'CCG': 89477},
    'H': {'CAT': 145279, 'CAC': 217261},
    'Q': {'CAA': 163794, 'CAG': 486041},
    'R': {'CGT': 66962, 'CGC': 136330, 'CGA': 94626, 'CGG': 147905, 'AGA': 163062, 'AGG': 167166},
    'I': {'ATT': 223528, 'ATC': 332072, 'ATA': 103441},
    'M': {'ATG': 330431},
    'T': {'ACT': 194120, 'ACC': 274588, 'ACA': 227958, 'ACG': 82653},
    'N': {'AAT': 221760, 'AAC': 296083},
    'K': {'AAA': 305128, 'AAG': 482643},
    'V': {'GTT': 152706, 'GTC': 224465, 'GTA': 105532, 'GTG': 416428},
    'A': {'GCT': 288132, 'GCC': 378142, 'GCA': 226309, 'GCG': 94463},
    'D': {'GAT': 301359, 'GAC': 380355},
    'E': {'GAA': 379216, 'GAG': 563999},
    'G': {'GGT': 164971, 'GGC': 312846, 'GGA': 241435, 'GGG': 220474},
}
159
+
160
# Codon usage counts for the ARABIDOPSIS table.
# Outer keys are one-letter amino-acid symbols ('*' groups the stop codons);
# inner keys are DNA codons mapped to their integer count.
ARABIDOPSIS_WEIGHTS = {
    'F': {'TTT': 534456, 'TTC': 505253},
    'L': {'TTA': 308336, 'TTG': 508909, 'CTT': 589767, 'CTC': 391502, 'CTA': 240784, 'CTG': 240472},
    'S': {'TCT': 610403, 'TCC': 270545, 'TCA': 440264, 'TCG': 223500, 'AGT': 339903, 'AGC': 273815},
    'Y': {'TAT': 359583, 'TAC': 337738},
    '*': {'TAA': 22128, 'TAG': 12250, 'TGA': 26304},
    'C': {'TGT': 254179, 'TGC': 171924},
    'W': {'TGG': 304101},
    'P': {'CCT': 455638, 'CCC': 129838, 'CCA': 393957, 'CCG': 207684},
    'H': {'CAT': 334012, 'CAC': 211635},
    'Q': {'CAA': 470475, 'CAG': 371091},
    'R': {'CGT': 220192, 'CGC': 91869, 'CGA': 152537, 'CGG': 117195, 'AGA': 457927, 'AGG': 266406},
    'I': {'ATT': 528593, 'ATC': 452968, 'ATA': 305594},
    'M': {'ATG': 594807},
    'T': {'ACT': 431031, 'ACC': 252577, 'ACA': 381774, 'ACG': 186589},
    'N': {'AAT': 543539, 'AAC': 508277},
    'K': {'AAA': 749448, 'AAG': 798966},
    'V': {'GTT': 668037, 'GTC': 312848, 'GTA': 242644, 'GTG': 423929},
    'A': {'GCT': 697575, 'GCC': 253877, 'GCA': 427842, 'GCG': 217886},
    'D': {'GAT': 895566, 'GAC': 420193},
    'E': {'GAA': 836077, 'GAG': 786970},
    'G': {'GGT': 546544, 'GGC': 223041, 'GGA': 592219, 'GGG': 249165},
}
196
+
197
# Codon usage counts for the DROSOPHILA table.
# Outer keys are one-letter amino-acid symbols ('*' groups the stop codons);
# inner keys are DNA codons mapped to their integer count.
DROSOPHILA_WEIGHTS = {
    'F': {'TTT': 241160, 'TTC': 402987},
    'L': {'TTA': 80654, 'TTG': 294929, 'CTT': 164079, 'CTC': 255170, 'CTA': 150674, 'CTG': 709057},
    'S': {'TCT': 127758, 'TCC': 359352, 'TCA': 142775, 'TCG': 307642, 'AGT': 211452, 'AGC': 377750},
    'Y': {'TAT': 196987, 'TAC': 338493},
    '*': {'TAA': 14550, 'TAG': 11721, 'TGA': 9358},
    'C': {'TGT': 98056, 'TGC': 241823},
    'W': {'TGG': 181191},
    'P': {'CCT': 127584, 'CCC': 331799, 'CCA': 248604, 'CCG': 293998},
    'H': {'CAT': 194039, 'CAC': 294555},
    'Q': {'CAA': 287398, 'CAG': 674575},
    'R': {'CGT': 161800, 'CGC': 334733, 'CGA': 155637, 'CGG': 151728, 'AGA': 93738, 'AGG': 115430},
    'I': {'ATT': 302063, 'ATC': 421851, 'ATA': 171610},
    'M': {'ATG': 431822},
    'T': {'ACT': 175261, 'ACC': 393516, 'ACA': 201178, 'ACG': 267095},
    'N': {'AAT': 385441, 'AAC': 483430},
    'K': {'AAA': 304266, 'AAG': 729428},
    'V': {'GTT': 200189, 'GTC': 254974, 'GTA': 116035, 'GTG': 514471},
    'A': {'GCT': 265295, 'GCC': 620889, 'GCA': 234549, 'GCG': 259119},
    'D': {'GAT': 504880, 'GAC': 454677},
    'E': {'GAA': 383850, 'GAG': 789807},
    'G': {'GGT': 243460, 'GGC': 492283, 'GGA': 326983, 'GGG': 85472},
}
@@ -0,0 +1,200 @@
1
+ import json
2
+ import re
3
+
4
+ from pathlib import Path
5
+ from typing import Any, Dict
6
+
7
+ from codeine.utils.dict import FrozenDict
8
+
9
+
10
class TranslationTable:
    """
    Immutable codon <-> amino-acid translation table.

    Built-in tables are looked up by their NCBI translation table IDs
    (see https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi).
    Codons are stored as DNA by default, but can be stored as RNA by
    setting ``rna=True``.

    Attributes
    ----------
    table_id
        The NCBI table ID, or ``None`` for tables built with :meth:`custom`.
    rna
        Whether codons are stored/reported as RNA (``U``) instead of DNA (``T``).
    name
        Name of the table.
    names
        Tuple of all names of the table.
    stop_codons
        Tuple of normalised stop codons.
    codons_to_aa
        Mapping from normalised codon to amino-acid symbol.
    aa_to_codons
        Mapping from amino-acid symbol to a tuple of normalised codons.
    """

    # Compiled once; used with ``fullmatch`` so that nothing (not even a
    # trailing newline) may follow the nucleotide characters.
    _DNA_PATTERN = re.compile(r'[ACGT]*')
    _RNA_PATTERN = re.compile(r'[ACGU]*')

    def __init__(self, table_id: int = 1, rna: bool = False) -> None:
        """
        Parameters
        ----------
        table_id
            Which translation table to use. Default is 1, which is the standard genetic code.
        rna
            Whether to use RNA. Default is no (False), i.e. use DNA.

        Raises
        ------
        ValueError
            If ``table_id`` is not present in the stored tables.
        """
        self._locked = False

        self.table_id = table_id
        self.rna = rna

        tables = self._load_tables()

        try:
            table = tables[str(table_id)]
        except KeyError:
            # The KeyError itself carries no extra information for the caller.
            raise ValueError(f'Unknown NCBI translation table ID: {table_id}.') from None

        self.name = table['name']
        self.names = tuple(table.get('names', (self.name,)))

        self.stop_codons = tuple(
            self.normalise_sequence(codon)
            for codon in table['stop_codons']
        )

        codons_to_aa = {
            self.normalise_sequence(codon): aa
            for codon, aa in table['codon_to_aa'].items()
        }

        self.codons_to_aa = FrozenDict(codons_to_aa)
        self.aa_to_codons = FrozenDict(self._group_codons(codons_to_aa))

        self._locked = True

    @classmethod
    def custom(
        cls,
        codons_to_aa: Dict[str, str],
        name: str = 'Custom',
        rna: bool = False,
    ) -> 'TranslationTable':
        """
        Create a custom translation table.

        This is useful for synthetic, partial, experimental, or externally
        defined translation tables, or for tables that have been discovered
        but not yet added to Codeine. Codons are normalised to DNA or RNA
        according to the ``rna`` argument. Codons mapped to ``'*'`` become
        the table's stop codons.

        Parameters
        ----------
        codons_to_aa
            Mapping from codons to amino-acid symbols.
        name
            Name of the custom translation table. Default is ``'Custom'``.
        rna
            Whether the table should use RNA codons.

        Returns
        -------
        TranslationTable
            A custom translation table.

        Raises
        ------
        ValueError
            If a codon is not a nucleotide triplet, an amino-acid symbol is
            not a single character, or two spellings of the same codon
            (e.g. ``'ATG'`` and ``'AUG'``) map to different amino acids.
        """
        # Bypass __init__, which would load a built-in NCBI table.
        obj = cls.__new__(cls)
        obj._locked = False

        obj.table_id = None
        obj.rna = rna
        obj.name = name
        obj.names = (name,)

        normalised: Dict[str, str] = {}
        for codon, aa in codons_to_aa.items():
            key = obj.normalise_sequence(codon)
            # Different spellings may collapse onto one codon; refuse to
            # silently keep whichever happened to come last.
            if normalised.setdefault(key, aa) != aa:
                raise ValueError(f'Codon {key} is mapped to more than one amino acid.')

        if any(len(codon) != 3 for codon in normalised):
            raise ValueError('All codons must have length 3.')

        if any(len(aa) != 1 for aa in normalised.values()):
            raise ValueError('Amino-acid symbols must be single characters.')

        obj.stop_codons = tuple(codon for codon, aa in normalised.items() if aa == '*')

        obj.codons_to_aa = FrozenDict(normalised)
        obj.aa_to_codons = FrozenDict(cls._group_codons(normalised))

        obj._locked = True
        return obj

    @staticmethod
    def _group_codons(codons_to_aa: Dict[str, str]) -> dict:
        """
        Invert a codon -> amino-acid mapping into amino-acid -> tuple of codons,
        keeping the codons in their original insertion order.
        """
        grouped: Dict[str, list] = {}
        for codon, aa in codons_to_aa.items():
            grouped.setdefault(aa, []).append(codon)
        return {aa: tuple(codons) for aa, codons in grouped.items()}

    def __setattr__(self, name: str, value: Any) -> None:
        # Once locked, nothing may be rebound -- including ``_locked`` itself,
        # otherwise the lock could be trivially switched off again.
        if getattr(self, '_locked', False):
            raise AttributeError(f'{type(self).__name__} is immutable')
        object.__setattr__(self, name, value)

    def __delattr__(self, name: str) -> None:
        # Deleting attributes would mutate the table just as assignment does.
        if getattr(self, '_locked', False):
            raise AttributeError(f'{type(self).__name__} is immutable')
        object.__delattr__(self, name)

    def __repr__(self) -> str:
        molecule = 'RNA' if self.rna else 'DNA'

        lines = [
            'TranslationTable',
            f'Table ID: {self.table_id} ({self.name})',
            f'Molecule type: {molecule}',
            '',
            'Table:',
        ]

        for aa in sorted(self.aa_to_codons):
            codons = ' '.join(self.aa_to_codons[aa])
            lines.append(f'  {aa}: {codons}')

        return '\n'.join(lines)

    def __getitem__(self, codon: str) -> str:
        """
        Return the amino-acid symbol for ``codon`` (DNA or RNA, any case).

        Raises
        ------
        KeyError
            If ``codon`` is not a string, is not a nucleotide sequence, or is
            not present in the table.
        """
        try:
            codon = self.normalise_sequence(codon)
            return self.codons_to_aa[codon]

        except (AttributeError, TypeError, ValueError, KeyError) as e:
            raise KeyError(f'Invalid codon: {codon}.') from e

    def normalise_sequence(self, seq: str) -> str:
        """
        Format a sequence in the format specified by this codon table, i.e. convert to
        RNA/DNA, cast to upper case and remove spaces.

        Parameters
        ----------
        seq
            The inputted nucleotide sequence (a single codon or longer).

        Returns
        -------
        The normalised sequence.

        Raises
        ------
        ValueError
            If the sequence contains anything other than nucleotide
            characters (and spaces, which are removed).
        """
        seq = seq.upper().replace(' ', '')
        seq = seq.replace('T', 'U') if self.rna else seq.replace('U', 'T')
        pattern = self._RNA_PATTERN if self.rna else self._DNA_PATTERN
        # fullmatch, not match + '$': '$' also matches just before a trailing
        # newline, which would let e.g. 'ATG\n' through unvalidated.
        if not pattern.fullmatch(seq):
            raise ValueError('Sequence to normalise must be a nucleotide sequence.')
        return seq

    def translate(self, seq: str) -> str:
        """
        Translate a DNA/RNA coding sequence into its amino-acid sequence.

        Parameters
        ----------
        seq
            The coding sequence. Case, spaces and T/U are normalised first.

        Returns
        -------
        The amino-acid sequence, one symbol per codon.

        Raises
        ------
        ValueError
            If the sequence is not a nucleotide sequence, or its normalised
            length is not a multiple of 3.
        """
        # Normalise first: spaces are stripped, so the raw length says nothing
        # about whether the sequence splits into whole codons.
        seq = self.normalise_sequence(seq)

        if len(seq) % 3 != 0:
            raise ValueError('Sequence length must be a multiple of 3')

        return ''.join(self.codons_to_aa[seq[i:i + 3]] for i in range(0, len(seq), 3))

    @staticmethod
    def _load_tables() -> dict:
        """
        Load tables from stored JSON entries.
        """
        path = Path(__file__).parent / 'data' / 'tables.json'
        # Explicit encoding: do not depend on the platform's locale default.
        with path.open(encoding='utf-8') as f:
            return json.load(f)