codeine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,105 @@
1
+ """
2
+ Common restriction site motifs, obtained from New England BioLabs here:
3
+
4
+ https://www.neb.com/en/tools-and-resources/selection-charts/alphabetized-list-of-recognition-specificities
5
+
6
+ Retrieved: 2026-06-09
7
+ """
8
+
9
+ from enum import Enum
10
+ from typing import Tuple
11
+
12
# Character map pairing each DNA base with its complement (A<->T, C<->G).
# Upper- and lower-case letters are mapped separately, so case is preserved.
_COMPLEMENT = str.maketrans('ACGTacgt', 'TGCAtgca')


def reverse_complement(seq: str) -> str:
    """
    Return the reverse complement of a DNA sequence.

    Each base is swapped for its partner and the result is read backwards.
    Characters that are not A, C, G or T (in either case) have no entry in
    the map, so they are carried over unchanged (only their position flips).
    """
    complemented = seq.translate(_COMPLEMENT)
    return ''.join(reversed(complemented))
20
+
21
+
22
class RestrictionSite(Enum):
    """
    Recognition sequences of commonly used restriction enzymes.

    Members can be handed straight to ``CodingSpace`` as forbidden motifs, so
    the DNA motif does not have to be typed out by hand. Each member exposes
    its recognition sequence on both strands (``forward`` and ``reverse``).

    Examples
    --------
    >>> CodingSpace(
    ...     aa_seq,
    ...     forbidden_motifs=[
    ...         RestrictionSite.EcoRI,
    ...         RestrictionSite.BsaI,
    ...     ],
    ... )

    >>> RestrictionSite.EcoRI.forward
    'GAATTC'
    >>> RestrictionSite.EcoRI.reverse
    'GAATTC'
    """

    # BioBricks
    EcoRI = 'GAATTC'
    XbaI = 'TCTAGA'
    SpeI = 'ACTAGT'
    PstI = 'CTGCAG'

    # Cloning
    BamHI = 'GGATCC'
    HindIII = 'AAGCTT'
    XhoI = 'CTCGAG'
    SalI = 'GTCGAC'
    KpnI = 'GGTACC'
    SacI = 'GAGCTC'
    NcoI = 'CCATGG'
    NdeI = 'CATATG'
    NotI = 'GCGGCCGC'
    MluI = 'ACGCGT'
    AgeI = 'ACCGGT'
    AvrII = 'CCTAGG'
    BglII = 'AGATCT'

    # Golden Gate
    BsaI = 'GGTCTC'
    BsmBI = 'CGTCTC'
    BbsI = 'GAAGAC'
    SapI = 'GCTCTTC'

    def __repr__(self):
        # Compact form without the member value.
        return 'RestrictionSite.' + self.name

    def __str__(self):
        # ``motifs`` already collapses palindromic sites to a single entry, so
        # joining it yields "Name (FWD)" or "Name (FWD / REV)" as appropriate.
        return f'{self.name} ({" / ".join(self.motifs)})'

    @property
    def forward(self) -> str:
        """
        The recognition sequence exactly as stored on the member.
        """
        return self.value

    @property
    def reverse(self) -> str:
        """
        The reverse complement of the recognition sequence.
        """
        return reverse_complement(self.value)

    @property
    def motifs(self) -> Tuple[str, ...]:
        """
        Every distinct motif for this site: the forward sequence, plus the
        reverse complement when it differs. Palindromic sites therefore give
        a one-element tuple.
        """
        fwd = self.forward
        rev = self.reverse

        return (fwd,) if fwd == rev else (fwd, rev)
@@ -0,0 +1,117 @@
1
+
2
+ from typing import List, Optional, Sequence, Union
3
+
4
+ from codeine.motifs.restriction import RestrictionSite
5
+
6
+ ForbiddenMotif = Union[str, RestrictionSite]
7
+ ForbiddenMotifs = Union[ForbiddenMotif, Sequence[ForbiddenMotif]]
8
+
9
+
10
def expand_and_validate_forbidden_motifs(
    forbidden_motifs: ForbiddenMotifs,
    rna: bool
) -> List[str]:
    """
    Turn user-supplied forbidden motifs into concrete sequences to ban.

    Parameters
    ----------
    forbidden_motifs
        A single motif or a sequence of motifs, each either a dna/rna string
        or a RestrictionSite object.
    rna
        Whether to use RNA.

    Returns
    -------
    A sorted, de-duplicated list of forbidden nucleotide sequences.

    Raises
    ------
    TypeError
        If a motif is neither a string nor a RestrictionSite.
    ValueError
        If a string motif is empty, or a motif contains characters outside
        the nucleotide alphabet of the selected molecule type.
    """
    # A lone motif is treated as a one-element collection.
    if isinstance(forbidden_motifs, (str, RestrictionSite)):
        forbidden_motifs = [forbidden_motifs]

    alphabet = set('ACGU' if rna else 'ACGT')

    # Base belonging to the other molecule type, and the base it maps to here
    # (T -> U when working in RNA, U -> T when working in DNA).
    foreign, native = ('T', 'U') if rna else ('U', 'T')

    banned = set()

    for motif in forbidden_motifs:
        if isinstance(motif, RestrictionSite):
            # Restriction sites contribute one motif per distinct strand.
            candidates = motif.motifs
        elif not isinstance(motif, str):
            raise TypeError('Forbidden motifs must be strings or codeine.RestrictionSite.')
        elif not motif:
            raise ValueError('Forbidden motifs cannot be empty.')
        else:
            candidates = (motif,)

        converted = [candidate.upper().replace(foreign, native) for candidate in candidates]

        if any(not alphabet.issuperset(candidate) for candidate in converted):
            raise ValueError('Forbidden motifs must be nucleotide sequences.')

        banned.update(converted)

    return sorted(banned)
57
+
58
+
59
def expand_and_validate_max_homopolymer(
    max_homopolymer: int,
    rna: bool = False
) -> List[str]:
    """
    Convert a max homopolymer constraint into a set of banned sequences.

    A run is too long as soon as it reaches ``max_homopolymer + 1`` identical
    nucleotides, so banning that run for each nucleotide enforces the limit.

    Parameters
    ----------
    max_homopolymer
        The max length of homopolymer
    rna
        Whether to use RNA

    Returns
    -------
    A list of forbidden nucleotide sequences, one per nucleotide, in the
    order A, C, G, then U (RNA) or T (DNA).

    Raises
    ------
    TypeError
        If ``max_homopolymer`` is not an integer. Booleans are rejected too.
    ValueError
        If ``max_homopolymer`` is less than 1.
    """

    # bool is a subclass of int, so it has to be excluded explicitly; otherwise
    # True would silently be treated as a limit of 1.
    if isinstance(max_homopolymer, bool) or not isinstance(max_homopolymer, int):
        raise TypeError('max_homopolymer must be an integer.')

    if max_homopolymer < 1:
        raise ValueError('max_homopolymer must be at least 1.')

    nts = 'ACGU' if rna else 'ACGT'
    return [nt * (max_homopolymer + 1) for nt in nts]
86
+
87
+
88
def expand_and_validate_sequence_constraints(
    forbidden_motifs: Optional[ForbiddenMotifs] = None,
    max_homopolymer: Optional[int] = None,
    rna: bool = False,
):
    """
    Merge forbidden motifs and/or a max homopolymer limit into one list of banned sequences.

    Parameters
    ----------
    forbidden_motifs
        A sequence of either dna/rna strings or RestrictionSite objects.
        ``None`` contributes nothing.
    max_homopolymer
        The max allowed homopolymer. ``None`` contributes nothing.
    rna
        Whether to use RNA.

    Returns
    -------
    A sorted, de-duplicated list of forbidden nucleotide sequences.
    """
    banned = set()

    # Motifs are expanded (and validated) before the homopolymer limit.
    if forbidden_motifs is not None:
        banned.update(expand_and_validate_forbidden_motifs(forbidden_motifs, rna=rna))

    if max_homopolymer is not None:
        banned.update(expand_and_validate_max_homopolymer(max_homopolymer, rna=rna))

    return sorted(banned)
File without changes
@@ -0,0 +1,490 @@
1
+ import pickle
2
+ import random
3
+
4
+ from pathlib import Path
5
+ from typing import Dict, Generator, List, Optional, Sequence, Tuple, Union, TYPE_CHECKING
6
+
7
+ if TYPE_CHECKING:
8
+ from codeine.space.mutation import MutationSpace
9
+
10
+ from codeine.graph.base import CodonGraph, CodonRestriction
11
+ from codeine.motifs.validate import expand_and_validate_sequence_constraints, ForbiddenMotifs
12
+ from codeine.motifs.restriction import RestrictionSite
13
+ from codeine.translation.tables import TranslationTable
14
+ from codeine.translation.weights import CodonWeights
15
+ from codeine.utils.display import format_forbidden_motifs, format_forbidden_motif,\
16
+ format_count, format_restrictions
17
+ from codeine.utils.sampling import Seedable
18
+
19
+
20
class CodingSpace:
    """
    Represents a space of valid coding sequences for a protein under constraints.

    The heavy lifting is delegated to ``self.view`` (obtained from a
    ``CodonGraph``). This class keeps the user-facing constraint settings
    (``forbidden_motifs`` and ``max_homopolymer``) and pushes the resulting
    banned sequences to the view whenever they change.
    """
    def __init__(
        self,
        aa_seq: str,
        *,
        translation_table: Optional[TranslationTable] = None,
        rna: Optional[bool] = None,
        codon_restrictions: Optional[Dict[int, CodonRestriction]] = None,
        forbidden_motifs: Optional[ForbiddenMotifs] = None,
        max_homopolymer: Optional[int] = None,
        context_l: str = '',
        context_r: str = '',
        codon_weights: Optional[CodonWeights] = None,
        seed: Optional[Seedable] = None,
    ) -> None:
        """
        Parameters
        ----------
        aa_seq
            The amino acid sequence.
        translation_table
            The translation table to use. Leave blank to use standard table.
        rna
            Whether to use RNA. If false or blank, use DNA.
        codon_restrictions
            Any codon restrictions in the format e.g. ``{4: 'TCC'}`` or ``{5: ['AGT', 'AGC']}``. Positions are 1-based.
        forbidden_motifs
            Forbidden motifs, either as strings or as ``codeine.RestrictionSite``.
        max_homopolymer
            The maximum allowed length of nucleotide homopolymer
        context_l
            The context sequence to the left of the coding sequence.
        context_r
            The context sequence to the right of the coding sequence.
        codon_weights
            The codon weights to use. Leave blank to sample uniformly.
        seed
            Seed used to initialise the random number generator for sampling.

        Raises
        ------
        ValueError
            If ``rna``, the translation table and the codon weights disagree
            on the molecule type.
        """

        translation_table, codon_weights = self._resolve_tables(translation_table, codon_weights, rna)

        graph = CodonGraph(
            aa_seq,
            codon_restrictions=codon_restrictions,
            translation_table=translation_table,
            weights=codon_weights,
            context_l=context_l,
            context_r=context_r,
        )

        view = graph.view(seed=seed)
        self.view = view

        # The view must exist first: motif normalisation reads the translation
        # table through it.
        self.forbidden_motifs = self._normalise_forbidden_motifs(forbidden_motifs)
        self.max_homopolymer = max_homopolymer

        self._update_forbidden_sequences()

    @classmethod
    def load(cls, path: Union[str, Path]) -> 'CodingSpace':
        """
        Load a coding space from disc.

        Parameters
        ----------
        path
            Location of a file previously written by ``save``.
        """
        # SECURITY: unpickling executes arbitrary code embedded in the file.
        # Only load files that come from a trusted source.
        with Path(path).open('rb') as f:
            return pickle.load(f)

    def save(self, path: Union[str, Path]) -> None:
        """
        Save this coding space to disc.

        Parameters
        ----------
        path
            Destination file; the whole object is pickled into it.
        """
        with Path(path).open('wb') as f:
            pickle.dump(self, f)

    def __getitem__(self, index: Union[int, slice]) -> Union[str, List[str]]:
        """
        Return one or more valid sequences.

        Parameters
        ----------
        index
            Zero-based sequence index or slice.

        Returns
        -------
        str or List[str]
            The indexed sequence, or a list of sequences for a slice.
        """
        return self.view[index]

    def __iter__(self) -> Generator[str, None, None]:
        """
        Iterate over all valid sequences in this coding space.
        Be aware that "all valid sequences" can be astronomically many!

        Yields
        ------
        All valid sequences in the coding space, in order.
        """

        yield from self.view

    def __contains__(self, seq: str) -> bool:
        """
        Does the given seq exist in this space?

        Returns
        -------
        True if and only if this is a valid sequence in this space.
        """

        return seq in self.view

    def __repr__(self) -> str:
        """
        Build a multi-line, human-readable summary of this coding space.

        Optional sections (codon restrictions, forbidden motifs, homopolymer
        limit, temporary pins) only appear when the matching setting is in use.
        """
        molecule = 'RNA' if self.translation_table.rna else 'DNA'

        lines = [
            f'{type(self).__name__}',
            '',
            f'Translation table: {self.translation_table.table_id} ({self.translation_table.name})',
            f'Molecule type: {molecule}',
            '',
            f'Amino acid sequence ({len(self.aa_seq)} aa):',
            f'{self.aa_seq}',
            '',
        ]

        if self.codon_restrictions:
            lines += [
                'Codon restrictions:',
                *format_restrictions(
                    self.codon_restrictions,
                    label='restricted positions',
                    max_lines=4,
                ),
                '',
            ]

        if self.forbidden_motifs:
            motifs = self.forbidden_motifs

            # A single motif may be stored bare; wrap it so it can be iterated.
            if isinstance(motifs, (str, RestrictionSite)):
                motifs = [motifs]

            lines += [
                'Forbidden motifs:',
                *format_forbidden_motifs(
                    [
                        format_forbidden_motif(motif, rna=self.translation_table.rna)
                        for motif in motifs
                    ],
                    max_lines=4,
                ),
                '',
            ]

        if self.max_homopolymer is not None:
            lines += [
                'Maximum homopolymer length:',
                f'  {self.max_homopolymer}',
                '',
            ]

        if self.pinned_codons:
            lines += [
                'Temporary pins:',
                *format_restrictions(
                    self.pinned_codons,
                    label='pinned positions',
                    max_lines=4,
                ),
                '',
            ]

        lines.append(f'Num. valid coding sequences: {format_count(self.n_valid_sequences)}')

        return '\n'.join(lines)

    def sample(self, n: Optional[int] = None) -> Union[str, List[str]]:
        """
        Sample one or more variants from this coding space.

        Parameters
        ----------
        n
            Number of sequences to sample. If omitted, return a single sequence.

        Returns
        -------
        Whatever the underlying view's ``sample`` returns: a single sampled
        sequence when ``n`` is omitted.
        NOTE(review): presumably a list of ``n`` sequences when ``n`` is
        given -- confirm against the view implementation.
        """
        return self.view.sample(n=n)

    def enumerate(self) -> Generator[str, None, None]:
        """
        Generate all sequences in this space. If there are many (and often there are
        astronomically many), one would not expect to reach the 'end'. However for smaller
        sequence spaces, such as mutation spaces, it's quite possible to get there.

        Yields
        ------
        str
            A valid coding sequence.
        """
        yield from self.view.enumerate()

    def contains(self, seq: str) -> bool:
        """
        Check whether a coding sequence is contained in this coding space.

        Parameters
        ----------
        seq
            The sequence to check.

        Returns
        -------
        True if and only if the sequence is contained in this coding space.
        """
        return self.view.contains(seq)

    def mutants(
        self,
        cds: str,
        free_positions: Optional[Sequence[int]] = None,
        min_nts: Optional[int] = None,
        max_nts: Optional[int] = None,
        min_codons: Optional[int] = None,
        max_codons: Optional[int] = None,
    ) -> 'MutationSpace':
        """
        Return a space of mutants relative to a given coding sequence, i.e. a space derived
        from this one but which fixes the sequence on all but the specified positions.

        Parameters
        ----------
        cds
            The sequence to mutate.
        free_positions
            The positions that are allowed to vary.
        min_nts
            The min nucleotide (Hamming) distance relative to the reference sequence.
        max_nts
            The max nucleotide (Hamming) distance relative to the reference sequence.
        min_codons
            The min number of changed codons relative to the reference sequence.
        max_codons
            The max number of changed codons relative to the reference sequence.

        Raises
        ------
        ValueError
            If the (normalised) reference sequence is not in this coding space.
        """
        cds = self.translation_table.normalise_sequence(cds)

        if not self.contains(cds):
            raise ValueError('CDS is not contained in this coding space.')

        # Imported here rather than at module level; the module-level import
        # is only performed under TYPE_CHECKING.
        from codeine.space.mutation import MutationSpace

        return MutationSpace(
            space=self,
            cds=cds,
            free_positions=free_positions,
            min_nts=min_nts,
            max_nts=max_nts,
            min_codons=min_codons,
            max_codons=max_codons,
        )

    def pin_codons(self, pinned_codons: Dict[int, str]) -> None:
        """
        Pin temporary codons in this coding space.

        Parameters
        ----------
        pinned_codons
            A dict specifying which codons to pin, by position.
        """
        self.view.pin_codons(pinned_codons)

    def unpin_codons(self, positions: Sequence[int]) -> None:
        """
        Remove temporary codon pins by position.

        Parameters
        ----------
        positions
            Positions to unpin.
        """
        self.view.unpin_codons(positions)

    def set_pinned_codons(self, pinned_codons: Dict[int, str]) -> None:
        """
        Replace all temporary codon pins on this coding space.

        Parameters
        ----------
        pinned_codons
            A dict specifying which codons to pin, by position.
        """
        self.view.set_pinned_codons(pinned_codons)

    def clear_pins(self) -> None:
        """
        Remove all temporary codon pins from this coding space.
        """
        self.view.clear_pins()

    def set_forbidden_motifs(self, forbidden_motifs: Optional[ForbiddenMotifs]) -> None:
        """
        Set the forbidden motifs for this coding space.

        If the new motifs are rejected, the previous motifs stay in place and
        the error is re-raised.

        Parameters
        ----------
        forbidden_motifs
            Motifs that should be forbidden in generated sequences, or None
            to forbid none.
        """
        # Normalise before touching any state, so a failure here changes nothing.
        normalised = self._normalise_forbidden_motifs(forbidden_motifs)

        self._replace_constraints(
            forbidden_motifs=normalised,
            max_homopolymer=self.max_homopolymer,
        )

    def clear_forbidden_motifs(self) -> None:
        """
        Remove all forbidden motifs from this coding space.
        """
        self.set_forbidden_motifs(None)

    def set_max_homopolymer(self, max_homopolymer: Optional[int]) -> None:
        """
        Set the maximum allowed homopolymer length.

        If the new limit is rejected, the previous limit stays in place and
        the error is re-raised.

        Parameters
        ----------
        max_homopolymer
            The longest allowed repeated run of one nucleotide, or None for no limit.
        """
        self._replace_constraints(
            forbidden_motifs=self.forbidden_motifs,
            max_homopolymer=max_homopolymer,
        )

    def clear_max_homopolymer(self) -> None:
        """
        Remove the maximum homopolymer constraint from this coding space.
        """
        self.set_max_homopolymer(None)

    @property
    def n_valid_sequences(self) -> int:
        """
        The number of valid sequences in this space.
        """
        return self.view.n_valid_sequences

    @property
    def aa_seq(self) -> str:
        """
        The amino acid sequence for this coding space.
        """
        return self.view.aa_seq

    @property
    def translation_table(self) -> TranslationTable:
        """
        The translation table being used in this space.
        """
        return self.view.translation_table

    @property
    def codon_weights(self) -> CodonWeights:
        """
        The codon weights being used in this space.
        """
        return self.view.codon_weights

    @property
    def codon_restrictions(self) -> Dict[int, CodonRestriction]:
        """
        The fixed codon restrictions from the underlying graph.
        """
        return self.view.codon_restrictions

    @property
    def context_l(self) -> str:
        """
        The left context sequence from the underlying graph.
        """
        return self.view.context_l

    @property
    def context_r(self) -> str:
        """
        The right context sequence from the underlying graph.
        """
        return self.view.context_r

    @property
    def pinned_codons(self) -> Dict[int, List[str]]:
        """
        Temporary codon pins currently applied to this coding space.
        """
        return self.view.pinned_codons

    def _replace_constraints(
        self,
        *,
        forbidden_motifs: Optional[ForbiddenMotifs],
        max_homopolymer: Optional[int],
    ) -> None:
        """
        Install new constraint settings and apply them, restoring the old
        settings if applying them fails.

        Without the rollback, a rejected value would remain stored on the
        object even though the view still enforces the previous constraints.
        """
        previous = (self.forbidden_motifs, self.max_homopolymer)

        self.forbidden_motifs = forbidden_motifs
        self.max_homopolymer = max_homopolymer

        try:
            self._update_forbidden_sequences()
        except Exception:
            # Put the old settings back, then let the caller see the error.
            self.forbidden_motifs, self.max_homopolymer = previous
            raise

    def _update_forbidden_sequences(self) -> None:
        """
        Rebuild concrete forbidden sequences and apply them to the view.
        """
        forbidden_sequences = expand_and_validate_sequence_constraints(
            forbidden_motifs=self.forbidden_motifs,
            max_homopolymer=self.max_homopolymer,
            rna=self.translation_table.rna,
        )
        self.view.set_banned_sequences(forbidden_sequences)

    @staticmethod
    def _resolve_tables(
        translation_table: Optional[TranslationTable],
        codon_weights: Optional[CodonWeights],
        rna: Optional[bool],
    ) -> Tuple[TranslationTable, CodonWeights]:
        """
        Resolve user-submitted (or not) translation table, codon weights, and RNA flag.

        When ``rna`` is not given it is taken from the translation table, then
        from the codon weights, and otherwise defaults to DNA. Missing objects
        are then filled in: table 1 for the translation table, and uniform
        weights over the resolved table for the codon weights.

        Raises
        ------
        ValueError
            If the supplied values disagree on the molecule type.
        """

        if rna is None:
            if translation_table is not None and codon_weights is not None \
                    and translation_table.rna != codon_weights.rna:
                raise ValueError('Provided translation table and codon weights must have the same molecule type.')

            if translation_table is not None:
                rna = translation_table.rna
            elif codon_weights is not None:
                rna = codon_weights.rna
            else:
                rna = False

        else:
            if translation_table is not None and translation_table.rna != rna:
                raise ValueError('Value for rna is inconsistent with the provided translation table.')

            if codon_weights is not None and codon_weights.rna != rna:
                raise ValueError('Value for rna is inconsistent with the provided codon weights.')

        if translation_table is None:
            translation_table = TranslationTable(table_id=1, rna=rna)

        if codon_weights is None:
            codon_weights = CodonWeights.uniform(table=translation_table)

        return translation_table, codon_weights

    def _normalise_forbidden_motifs(
        self,
        forbidden_motifs: Optional[ForbiddenMotifs],
    ) -> Optional[ForbiddenMotifs]:
        """
        Normalise string forbidden motifs to the molecule type used by this coding
        space. RestrictionSite objects are left unchanged.

        A single motif is returned as a single motif; any other input is
        returned as a new list with its string entries normalised.
        """
        if forbidden_motifs is None:
            return None

        if isinstance(forbidden_motifs, str):
            return self.translation_table.normalise_sequence(forbidden_motifs)

        if isinstance(forbidden_motifs, RestrictionSite):
            return forbidden_motifs

        return [
            self.translation_table.normalise_sequence(motif)
            if isinstance(motif, str)
            else motif
            for motif in forbidden_motifs
        ]